/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.gbif.utils.file.csv;

import org.gbif.utils.file.CharsetDetection;
import org.gbif.utils.file.UnknownCharsetException;
import org.gbif.utils.file.tabular.TabularFileMetadataExtractor;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Static factory methods that create {@link CSVReader} instances from files or streams.
 * Depending on the method used, the encoding, the field delimiter and the quote character are
 * either supplied by the caller or detected by inspecting the beginning of the file.
 */
public class CSVReaderFactory {

  private static final Logger LOG = LoggerFactory.getLogger(CSVReaderFactory.class);

  /** Delimiters tried, in this order, when detecting the delimiter of a file. */
  private static final String[] POTENTIAL_DELIMITERS = {",", "\t", ";", "|"};

  /** Maximum number of rows read from a file by each detection pass. */
  private static final int ROWS_TO_INSPECT = 10;

  /**
   * Data about the CSV file
   */
  public static class CSVMetadata {
    String delimiter;
    Character quotedBy;

    /**
     * @return the field delimiter, or null if none has been set
     */
    public String getDelimiter() {
      return delimiter;
    }

    /**
     * @param delimiter the field delimiter
     */
    public void setDelimiter(String delimiter) {
      this.delimiter = delimiter;
    }

    /**
     * @return the quote character, or null if none has been set
     */
    public Character getQuotedBy() {
      return quotedBy;
    }

    /**
     * @param quotedBy the quote character, may be null
     */
    public void setQuotedBy(Character quotedBy) {
      this.quotedBy = quotedBy;
    }
  }

  /**
   * Build a CSVReader with specific delimiter and presence or not of a header line.
   * Encoding detection will be attempted. A null quote character is passed to the reader.
   *
   * @param source the file to read
   * @param delimiter the field delimiter
   * @param header Does the file include a header line ?
   * @return a new reader on the source file, using 1 header row if header is true, 0 otherwise
   * @throws IOException if the reader cannot be created; encoding detection failures are
   *         propagated to the caller as well
   */
  public static CSVReader build(File source, String delimiter, boolean header) throws IOException {
    return new CSVReader(source, detectEncoding(source), delimiter, null, header ? 1 : 0);
  }

  /**
   * Build a CSVReader with specific encoding, delimiter, quotes character and number of header row(s).
   *
   * @param source the file to read
   * @param encoding the name of the character encoding of the file
   * @param delimiter the field delimiter
   * @param quotes the quote character, may be null
   * @param headerRows the number of header rows
   * @return a new reader on the source file
   * @throws IOException if the reader cannot be created
   */
  public static CSVReader build(
      File source, String encoding, String delimiter, Character quotes, Integer headerRows)
      throws IOException {
    return new CSVReader(source, encoding, delimiter, quotes, headerRows);
  }

  /**
   * Build a CSVReader with specific encoding, delimiter and number of header row(s) but default quote character
   * (quotation marks)
   *
   * @param source the file to read
   * @param encoding the name of the character encoding of the file
   * @param delimiter the field delimiter
   * @param headerRows the number of header rows
   * @return a new reader on the source file, using {@code "} as quote character
   * @throws IOException if the reader cannot be created
   */
  public static CSVReader build(File source, String encoding, String delimiter, Integer headerRows)
      throws IOException {
    return new CSVReader(source, encoding, delimiter, '"', headerRows);
  }

  /**
   * Build a CSVReader with specific encoding, delimiter quotes and number of header row(s)
   *
   * @param stream the stream to read
   * @param encoding the name of the character encoding of the stream
   * @param delimiter the field delimiter
   * @param quotes the quote character, may be null
   * @param headerRows the number of header rows
   * @return a new reader on the stream
   * @throws IOException if the reader cannot be created
   */
  public static CSVReader build(
      InputStream stream, String encoding, String delimiter, Character quotes, Integer headerRows)
      throws IOException {
    return new CSVReader(stream, encoding, delimiter, quotes, headerRows);
  }

  /**
   * Build a CSVReader and try to detect the encoding, delimiter and quotes.
   *
   * @param source the file to read
   * @param headerRows the number of header rows
   * @return a new reader on the source file, configured with the detected settings
   * @throws IOException if the reader cannot be created; failures to detect the encoding or the
   *         delimiter are propagated to the caller as well
   */
  public static CSVReader build(File source, Integer headerRows) throws IOException {
    String encoding = detectEncoding(source);
    CSVMetadata csvMeta = extractCsvMetadata(source, encoding);
    return new CSVReader(
        source, encoding, csvMeta.getDelimiter(), csvMeta.getQuotedBy(), headerRows);
  }

  /**
   * Build a CSVReader and try to detect the encoding, delimiter and quotes.
   * Assumes 1 header row
   *
   * @param source the file to read
   * @return a new reader on the source file, configured with the detected settings
   * @throws IOException if the reader cannot be created; failures to detect the encoding or the
   *         delimiter are propagated to the caller as well
   */
  public static CSVReader build(File source) throws IOException {
    return build(source, 1);
  }

  /**
   * Build a CSVReader for tab delimited content. A null quote character is passed to the reader.
   *
   * @param stream the stream to read
   * @param encoding the name of the character encoding of the stream
   * @param headerRows the number of header rows
   * @return a new reader on the stream using the tab character as delimiter
   * @throws IOException if the reader cannot be created
   */
  public static CSVReader buildTabReader(InputStream stream, String encoding, Integer headerRows)
      throws IOException {
    return new CSVReader(stream, encoding, "\t", null, headerRows);
  }

  /**
   * Build a CSVReader for tab delimited content using the encoding name "utf8" and 0 header rows.
   *
   * @param stream the stream to read
   * @return a new reader on the stream using the tab character as delimiter
   * @throws IOException if the reader cannot be created
   */
  public static CSVReader buildUtf8TabReader(InputStream stream) throws IOException {
    return buildTabReader(stream, "utf8", 0);
  }

  /**
   * Replaced by {@link TabularFileMetadataExtractor}
   *
   * Extract metadata from a CSV file.
   * Metadata includes delimiter and quotes character.
   * <p>
   * Every combination of potential delimiter and potential quote character is used to read the
   * first rows of the file. The first combination yielding the highest consistent number of
   * columns wins.
   *
   * @param source the file to inspect
   * @param encoding the name of the character encoding of the file
   * @return the detected delimiter and quote character (the quote character may be null)
   * @throws UnknownDelimitersException if no combination produced at least one consistent column
   */
  public static CSVMetadata extractCsvMetadata(File source, String encoding)
      throws UnknownDelimitersException {
    CSVMetadata csvMetadata = new CSVMetadata();
    // try csv, tab and then other popular delimiters
    // keep number of resulting columns for comparisons
    int maxColumns = 0;

    for (String delim : POTENTIAL_DELIMITERS) {
      for (Character quote : candidateQuotes(source, encoding, delim)) {
        try {
          CSVReader reader = build(source, encoding, delim, quote, 0);
          try {
            int columns = consistentRowSize(reader);
            // try to find the delimiter and quote that will give us the maximum number of columns
            if (columns > maxColumns) {
              csvMetadata.setDelimiter(delim);
              csvMetadata.setQuotedBy(quote);
              maxColumns = columns;
            }
          } finally {
            // release the reader even if reading the rows failed unexpectedly
            reader.close();
          }
        } catch (IOException e) {
          // not fatal, maybe different delimiters work
          // if all fail we will throw an exception at the end
          LOG.debug("Cannot read {} with delimiter [{}] and quote [{}]", source, delim, quote, e);
        }
      }
    }

    if (maxColumns < 1) {
      throw new UnknownDelimitersException("Unable to detect field delimiter");
    }

    return csvMetadata;
  }

  /**
   * Lists the quote characters worth testing together with the given delimiter, in order of
   * preference. A character that consistently wraps the fields of the file, if any, comes first,
   * followed by the standard candidates. Null stands for "no quote character".
   */
  private static List<Character> candidateQuotes(File source, String encoding, String delim) {
    // test with various quotes including a dynamic one if the first char in each field is
    // consistently the same
    List<Character> potentialQuotes = new ArrayList<Character>();

    try {
      CSVReader reader = build(source, encoding, delim, null, 1);
      try {
        Character firstChar = likelyQuoteChar(reader);
        if (firstChar != null) {
          potentialQuotes.add(firstChar);
        }
      } finally {
        // release the reader even if reading the rows failed unexpectedly
        reader.close();
      }
    } catch (IOException e) {
      // not fatal, the standard quote candidates below are still tested
      LOG.debug("Cannot inspect {} for quote characters with delimiter [{}]", source, delim, e);
    }

    // prefer quotes for CSVs
    if (delim.equals(",")) {
      potentialQuotes.add('"');
      potentialQuotes.add('\'');
      potentialQuotes.add(null);
    } else {
      potentialQuotes.add(null);
      potentialQuotes.add('"');
      potentialQuotes.add('\'');
    }
    return potentialQuotes;
  }

  /**
   * Reads up to {@link #ROWS_TO_INSPECT} rows and compares their number of columns with the number
   * of columns of the first row.
   *
   * @return the number of columns of the first row if all inspected rows have that many columns
   *         (0 if there are no rows), -1 if any row differs from the first one by more than one
   *         column, or the number of columns of the first row minus 2 if rows only differ from
   *         the first one by 1 column at max.
   */
  private static int consistentRowSize(CSVReader reader) {
    int rowNum = 0;
    int columns = 0;
    boolean plusMinusOne = false;
    while (reader.hasNext() && rowNum < ROWS_TO_INSPECT) {
      String[] row = reader.next();
      if (rowNum == 0) {
        columns = row.length;
      }
      if (Math.abs(columns - row.length) > 1) {
        return -1;
      }
      if (columns != row.length) {
        plusMinusOne = true;
      }
      rowNum++;
    }
    // slightly inconsistent rows are penalised so that a fully consistent alternative wins
    return plusMinusOne ? columns - 2 : columns;
  }

  /**
   * Detects the character encoding of a file.
   *
   * @param source the file to inspect
   * @return the canonical name of the detected charset
   * @throws UnknownCharsetException if no encoding could be detected or the file could not be read
   */
  private static String detectEncoding(File source) throws UnknownCharsetException {
    Charset encoding;
    try {
      // NOTE(review): 16384 is presumably the number of bytes inspected - confirm against
      // CharsetDetection
      encoding = CharsetDetection.detectEncoding(source, 16384);
    } catch (IOException e) {
      throw new UnknownCharsetException(e);
    }
    // checked outside of the try block so this exception is never caught and wrapped again by the
    // IOException handler above
    if (encoding == null) {
      throw new UnknownCharsetException("Unable to detect the files character encoding");
    }
    // name() is the canonical charset name, displayName() may be localized
    return encoding.name();
  }

  /**
   * Checks if the fields that start and end with the same character consistently use the same,
   * non alphanumeric, character for that. Up to {@link #ROWS_TO_INSPECT} rows are read. Null
   * fields and fields shorter than 2 characters are ignored.
   *
   * @return the wrapping character if consistent, otherwise null
   */
  private static Character likelyQuoteChar(CSVReader reader) {
    Character quote = null;
    int line = 0;
    while (reader.hasNext() && line < ROWS_TO_INSPECT) {
      line++;
      String[] row = reader.next();
      if (row == null) {
        continue;
      }
      for (String col : row) {
        // same char at start & end?
        if (col == null || col.length() < 2 || col.charAt(0) != col.charAt(col.length() - 1)) {
          continue;
        }
        // only consider non alphanumerics
        char potQuote = col.charAt(0);
        if (Character.isLetterOrDigit(potQuote)) {
          // the remaining fields of this row are not examined
          break;
        }
        if (quote == null) {
          quote = potQuote;
        } else if (!quote.equals(potQuote)) {
          // two different wrapping characters: no consistent quote, so stop right away instead of
          // letting later rows establish a new candidate
          return null;
        }
      }
    }
    return quote;
  }
}