001/***************************************************************************
002 * Copyright 2010-2015 Global Biodiversity Information Facility Secretariat
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
005 * use this file except in compliance with the License. You may obtain a copy of
006 * the License at
007 *
008 * http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
012 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
013 * License for the specific language governing permissions and limitations under
014 * the License.
015 ***************************************************************************/
016package org.gbif.utils.file.csv;
017
018import org.gbif.utils.file.CharsetDetection;
019import org.gbif.utils.file.UnkownCharsetException;
020import org.gbif.utils.file.tabular.TabularFileMetadataExtractor;
021
022import java.io.File;
023import java.io.IOException;
024import java.io.InputStream;
025import java.nio.charset.Charset;
026import java.util.ArrayList;
027import java.util.List;
028
029import org.slf4j.Logger;
030import org.slf4j.LoggerFactory;
031
032/**
033 *
034 */
035public class CSVReaderFactory {
036
037  private static final Logger LOG = LoggerFactory.getLogger(CSVReaderFactory.class);
038  private static final String[] POTENTIAL_DELIMITERS = {",", "\t", ";", "|"};
039  private static final int ROWS_TO_INSPECT = 10;
040
041  /**
042   * Data about the CSV file
043   */
044  public static class CSVMetadata {
045    String delimiter;
046    Character quotedBy;
047
048    public String getDelimiter() {
049      return delimiter;
050    }
051
052    public void setDelimiter(String delimiter) {
053      this.delimiter = delimiter;
054    }
055
056    public Character getQuotedBy() {
057      return quotedBy;
058    }
059
060    public void setQuotedBy(Character quotedBy) {
061      this.quotedBy = quotedBy;
062    }
063  }
064
065  /**
066   * Build a CSVReader with specific delimiter and presence or not of a header line.
067   * Encoding detection will be attempted.
068   * @param source
069   * @param delimiter
070   * @param header Does the file include a header line ?
071   * @return
072   * @throws IOException
073   */
074  public static CSVReader build(File source, String delimiter, boolean header) throws IOException {
075    return new CSVReader(source, detectEncoding(source), delimiter, null, header ? 1 : 0);
076  }
077
078  /**
079   * Build a CSVReader with specific encoding, delimiter, quotes character and number of header row(s).
080   *
081   * @param source
082   * @param encoding
083   * @param delimiter
084   * @param quotes
085   * @param headerRows
086   * @return
087   * @throws IOException
088   */
089  public static CSVReader build(File source, String encoding, String delimiter, Character quotes, Integer headerRows)
090          throws IOException {
091    return new CSVReader(source, encoding, delimiter, quotes, headerRows);
092  }
093
094  /**
095   * Build a CSVReader with specific encoding, delimiter and number of header row(s) but default quote character
096   * (quotation marks)
097   * @param source
098   * @param encoding
099   * @param delimiter
100   * @param headerRows
101   * @return
102   * @throws IOException
103   */
104  public static CSVReader build(File source, String encoding, String delimiter, Integer headerRows) throws IOException {
105    return new CSVReader(source, encoding, delimiter, '"', headerRows);
106  }
107
108  /**
109   * Build a CSVReader with specific encoding, delimiter quotes and number of header row(s)
110   *
111   * @param stream
112   * @param encoding
113   * @param delimiter
114   * @param quotes
115   * @param headerRows
116   * @return
117   * @throws IOException
118   */
119  public static CSVReader build(InputStream stream, String encoding, String delimiter, Character quotes,
120                                Integer headerRows) throws IOException {
121    return new CSVReader(stream, encoding, delimiter, quotes, headerRows);
122  }
123
124  /**
125   * Build a CSVReader and try to detect the encoding, delimiter and quotes.
126   *
127   * @param source
128   * @param headerRows
129   * @return
130   * @throws IOException
131   */
132  public static CSVReader build(File source, Integer headerRows) throws IOException {
133    String encoding = detectEncoding(source);
134    CSVMetadata csvMeta  = extractCsvMetadata(source, encoding);
135    return new CSVReader(source, encoding, csvMeta.getDelimiter(), csvMeta.getQuotedBy(), headerRows);
136  }
137
138  /**
139   * Assumes 1 header row
140   *
141   * @param source
142   * @return
143   * @throws IOException
144   */
145  public static CSVReader build(File source) throws IOException {
146    return build(source, 1);
147  }
148
149  public static CSVReader buildTabReader(InputStream stream, String encoding, Integer headerRows) throws IOException {
150    return new CSVReader(stream, encoding, "\t", null, headerRows);
151  }
152
153  public static CSVReader buildUtf8TabReader(InputStream stream) throws IOException {
154    return buildTabReader(stream, "utf8", 0);
155  }
156
157  /**
158   * Replaced by {@link TabularFileMetadataExtractor}
159   *
160   * Extract metadata from a CSV file.
161   * Metadata includes delimiter and quotes character.
162   * 
163   * @param source
164   * @param encoding
165   * @return
166   * @throws UnkownDelimitersException
167   */
168  public static CSVMetadata extractCsvMetadata(File source, String encoding) throws UnkownDelimitersException {
169    CSVMetadata csvMetadata = new CSVMetadata();
170    // try csv, tab and then other popular delimiters
171    // keep number of resulting columns for comparisons
172    int maxColumns = 0;
173
174    for (String delim : POTENTIAL_DELIMITERS) {
175      // test with various quotes including a dynamic one if the first char in each field is consistently the same
176      List<Character> potentialQuotes = new ArrayList<Character>();
177
178      CSVReader reader;
179      try {
180        reader = build(source, encoding, delim, null, 1);
181        Character firstChar = likelyQuoteChar(reader);
182        reader.close();
183        if (firstChar != null) {
184          potentialQuotes.add(firstChar);
185        }
186      } catch (IOException ignored) {
187      }
188      // prefer quotes for CSVs
189      if (delim.equals(",")) {
190        potentialQuotes.add('"');
191        potentialQuotes.add('\'');
192        potentialQuotes.add(null);
193      } else {
194        potentialQuotes.add(null);
195        potentialQuotes.add('"');
196        potentialQuotes.add('\'');
197      }
198
199      for (Character quote : potentialQuotes) {
200        try {
201          reader = build(source, encoding, delim, quote, 0);
202          int x = consistentRowSize(reader);
203          // try to find the delimiter and quote that will give us the maximum number of rows
204          if (x > maxColumns) {
205            csvMetadata.setDelimiter(delim);
206            csvMetadata.setQuotedBy(quote);
207            maxColumns = x;
208          }
209          reader.close();
210        } catch (IOException ignored) {
211          // swallow, maybe different delimiters work
212          // if all fail we will throw an exception at the end
213        }
214      }
215    }
216
217    if (maxColumns < 1) {
218      throw new UnkownDelimitersException("Unable to detect field delimiter");
219    }
220
221    return csvMetadata;
222  }
223
224  /**
225   * @return the number of consistent columns, -1 if non consistent or column numbers-2 in case the column numbers only
226   * differ by 1 at max.
227   */
228  private static int consistentRowSize(CSVReader reader) {
229    int rowNum = 0;
230    int columns = 0;
231    boolean plusMinusOne = false;
232    while (reader.hasNext() && rowNum < ROWS_TO_INSPECT) {
233      String[] row = reader.next();
234      if (rowNum == 0) {
235        columns = row.length;
236      }
237      if (Math.abs(columns - row.length) > 1) {
238        return -1;
239      }
240      if (columns != row.length) {
241        plusMinusOne = true;
242      }
243      rowNum++;
244    }
245    if (plusMinusOne) {
246      return columns - 2;
247    }
248    return columns;
249  }
250
251  private static String detectEncoding(File source) throws UnkownCharsetException {
252    Charset encoding;
253    try {
254      encoding = CharsetDetection.detectEncoding(source, 16384);
255      if (encoding == null) {
256        throw new UnkownCharsetException("Unable to detect the files character encoding");
257      }
258    } catch (IOException e) {
259      throw new UnkownCharsetException(e);
260    }
261    return encoding.displayName();
262  }
263
264  /**
265   * Checks if all non empty/null fields start with the same character.
266   *
267   * @return the first character if consistent, otherwise null
268   */
269  private static Character likelyQuoteChar(CSVReader reader) {
270    Character quote = null;
271    int line = 0;
272    while (reader.hasNext() && line < 10) {
273      line++;
274      String[] row = reader.next();
275      if (row != null) {
276        for (String col : row) {
277          if (col != null && col.length() > 0) {
278            // same char at start & end?
279            if (col.length() > 1 && col.charAt(0) == col.charAt(col.length() - 1)) {
280              // only consider non alphanumerics
281              char potQuote = col.charAt(0);
282              if (Character.isLetterOrDigit(potQuote)) {
283                break;
284              }
285              if (quote == null) {
286                quote = potQuote;
287              } else {
288                if (!quote.equals(potQuote)) {
289                  quote = null;
290                  break;
291                }
292              }
293            }
294          }
295        }
296      }
297    }
298    return quote;
299  }
300
301}
302