001/* 002 * Copyright 2021 Global Biodiversity Information Facility (GBIF) 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016package org.gbif.utils.file.csv; 017 018import org.gbif.utils.file.CharsetDetection; 019import org.gbif.utils.file.UnknownCharsetException; 020import org.gbif.utils.file.tabular.TabularFileMetadataExtractor; 021 022import java.io.File; 023import java.io.IOException; 024import java.io.InputStream; 025import java.nio.charset.Charset; 026import java.util.ArrayList; 027import java.util.List; 028 029import org.slf4j.Logger; 030import org.slf4j.LoggerFactory; 031 032/** 033 * 034 */ 035public class CSVReaderFactory { 036 037 private static final Logger LOG = LoggerFactory.getLogger(CSVReaderFactory.class); 038 private static final String[] POTENTIAL_DELIMITERS = {",", "\t", ";", "|"}; 039 private static final int ROWS_TO_INSPECT = 10; 040 041 /** 042 * Data about the CSV file 043 */ 044 public static class CSVMetadata { 045 String delimiter; 046 Character quotedBy; 047 048 public String getDelimiter() { 049 return delimiter; 050 } 051 052 public void setDelimiter(String delimiter) { 053 this.delimiter = delimiter; 054 } 055 056 public Character getQuotedBy() { 057 return quotedBy; 058 } 059 060 public void setQuotedBy(Character quotedBy) { 061 this.quotedBy = quotedBy; 062 } 063 } 064 065 /** 066 * Build a CSVReader with specific delimiter and presence or not of a header line. 067 * Encoding detection will be attempted. 068 * @param source 069 * @param delimiter 070 * @param header Does the file include a header line ? 071 * @return 072 * @throws IOException 073 */ 074 public static CSVReader build(File source, String delimiter, boolean header) throws IOException { 075 return new CSVReader(source, detectEncoding(source), delimiter, null, header ? 1 : 0); 076 } 077 078 /** 079 * Build a CSVReader with specific encoding, delimiter, quotes character and number of header row(s). 080 * 081 * @param source 082 * @param encoding 083 * @param delimiter 084 * @param quotes 085 * @param headerRows 086 * @return 087 * @throws IOException 088 */ 089 public static CSVReader build(File source, String encoding, String delimiter, Character quotes, Integer headerRows) 090 throws IOException { 091 return new CSVReader(source, encoding, delimiter, quotes, headerRows); 092 } 093 094 /** 095 * Build a CSVReader with specific encoding, delimiter and number of header row(s) but default quote character 096 * (quotation marks) 097 * @param source 098 * @param encoding 099 * @param delimiter 100 * @param headerRows 101 * @return 102 * @throws IOException 103 */ 104 public static CSVReader build(File source, String encoding, String delimiter, Integer headerRows) throws IOException { 105 return new CSVReader(source, encoding, delimiter, '"', headerRows); 106 } 107 108 /** 109 * Build a CSVReader with specific encoding, delimiter quotes and number of header row(s) 110 * 111 * @param stream 112 * @param encoding 113 * @param delimiter 114 * @param quotes 115 * @param headerRows 116 * @return 117 * @throws IOException 118 */ 119 public static CSVReader build(InputStream stream, String encoding, String delimiter, Character quotes, 120 Integer headerRows) throws IOException { 121 return new CSVReader(stream, encoding, delimiter, quotes, headerRows); 122 } 123 124 /** 125 * Build a CSVReader and try to detect the encoding, delimiter and quotes. 126 * 127 * @param source 128 * @param headerRows 129 * @return 130 * @throws IOException 131 */ 132 public static CSVReader build(File source, Integer headerRows) throws IOException { 133 String encoding = detectEncoding(source); 134 CSVMetadata csvMeta = extractCsvMetadata(source, encoding); 135 return new CSVReader(source, encoding, csvMeta.getDelimiter(), csvMeta.getQuotedBy(), headerRows); 136 } 137 138 /** 139 * Assumes 1 header row 140 * 141 * @param source 142 * @return 143 * @throws IOException 144 */ 145 public static CSVReader build(File source) throws IOException { 146 return build(source, 1); 147 } 148 149 public static CSVReader buildTabReader(InputStream stream, String encoding, Integer headerRows) throws IOException { 150 return new CSVReader(stream, encoding, "\t", null, headerRows); 151 } 152 153 public static CSVReader buildUtf8TabReader(InputStream stream) throws IOException { 154 return buildTabReader(stream, "utf8", 0); 155 } 156 157 /** 158 * Replaced by {@link TabularFileMetadataExtractor} 159 * 160 * Extract metadata from a CSV file. 161 * Metadata includes delimiter and quotes character. 162 * 163 * @param source 164 * @param encoding 165 * @return 166 * @throws UnknownDelimitersException 167 */ 168 public static CSVMetadata extractCsvMetadata(File source, String encoding) throws UnknownDelimitersException { 169 CSVMetadata csvMetadata = new CSVMetadata(); 170 // try csv, tab and then other popular delimiters 171 // keep number of resulting columns for comparisons 172 int maxColumns = 0; 173 174 for (String delim : POTENTIAL_DELIMITERS) { 175 // test with various quotes including a dynamic one if the first char in each field is consistently the same 176 List<Character> potentialQuotes = new ArrayList<Character>(); 177 178 CSVReader reader; 179 try { 180 reader = build(source, encoding, delim, null, 1); 181 Character firstChar = likelyQuoteChar(reader); 182 reader.close(); 183 if (firstChar != null) { 184 potentialQuotes.add(firstChar); 185 } 186 } catch (IOException ignored) { 187 } 188 // prefer quotes for CSVs 189 if (delim.equals(",")) { 190 potentialQuotes.add('"'); 191 potentialQuotes.add('\''); 192 potentialQuotes.add(null); 193 } else { 194 potentialQuotes.add(null); 195 potentialQuotes.add('"'); 196 potentialQuotes.add('\''); 197 } 198 199 for (Character quote : potentialQuotes) { 200 try { 201 reader = build(source, encoding, delim, quote, 0); 202 int x = consistentRowSize(reader); 203 // try to find the delimiter and quote that will give us the maximum number of rows 204 if (x > maxColumns) { 205 csvMetadata.setDelimiter(delim); 206 csvMetadata.setQuotedBy(quote); 207 maxColumns = x; 208 } 209 reader.close(); 210 } catch (IOException ignored) { 211 // swallow, maybe different delimiters work 212 // if all fail we will throw an exception at the end 213 } 214 } 215 } 216 217 if (maxColumns < 1) { 218 throw new UnknownDelimitersException("Unable to detect field delimiter"); 219 } 220 221 return csvMetadata; 222 } 223 224 /** 225 * @return the number of consistent columns, -1 if non consistent or column numbers-2 in case the column numbers only 226 * differ by 1 at max. 227 */ 228 private static int consistentRowSize(CSVReader reader) { 229 int rowNum = 0; 230 int columns = 0; 231 boolean plusMinusOne = false; 232 while (reader.hasNext() && rowNum < ROWS_TO_INSPECT) { 233 String[] row = reader.next(); 234 if (rowNum == 0) { 235 columns = row.length; 236 } 237 if (Math.abs(columns - row.length) > 1) { 238 return -1; 239 } 240 if (columns != row.length) { 241 plusMinusOne = true; 242 } 243 rowNum++; 244 } 245 if (plusMinusOne) { 246 return columns - 2; 247 } 248 return columns; 249 } 250 251 private static String detectEncoding(File source) throws UnknownCharsetException { 252 Charset encoding; 253 try { 254 encoding = CharsetDetection.detectEncoding(source, 16384); 255 if (encoding == null) { 256 throw new UnknownCharsetException("Unable to detect the files character encoding"); 257 } 258 } catch (IOException e) { 259 throw new UnknownCharsetException(e); 260 } 261 return encoding.displayName(); 262 } 263 264 /** 265 * Checks if all non empty/null fields start with the same character. 266 * 267 * @return the first character if consistent, otherwise null 268 */ 269 private static Character likelyQuoteChar(CSVReader reader) { 270 Character quote = null; 271 int line = 0; 272 while (reader.hasNext() && line < 10) { 273 line++; 274 String[] row = reader.next(); 275 if (row != null) { 276 for (String col : row) { 277 if (col != null && col.length() > 0) { 278 // same char at start & end? 279 if (col.length() > 1 && col.charAt(0) == col.charAt(col.length() - 1)) { 280 // only consider non alphanumerics 281 char potQuote = col.charAt(0); 282 if (Character.isLetterOrDigit(potQuote)) { 283 break; 284 } 285 if (quote == null) { 286 quote = potQuote; 287 } else { 288 if (!quote.equals(potQuote)) { 289 quote = null; 290 break; 291 } 292 } 293 } 294 } 295 } 296 } 297 } 298 return quote; 299 } 300 301}