001/*
002 * Licensed under the Apache License, Version 2.0 (the "License");
003 * you may not use this file except in compliance with the License.
004 * You may obtain a copy of the License at
005 *
006 *     http://www.apache.org/licenses/LICENSE-2.0
007 *
008 * Unless required by applicable law or agreed to in writing, software
009 * distributed under the License is distributed on an "AS IS" BASIS,
010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
011 * See the License for the specific language governing permissions and
012 * limitations under the License.
013 */
014package org.gbif.utils.file.csv;
015
016import org.gbif.utils.file.CharsetDetection;
017import org.gbif.utils.file.UnknownCharsetException;
018import org.gbif.utils.file.tabular.TabularFileMetadataExtractor;
019
020import java.io.File;
021import java.io.IOException;
022import java.io.InputStream;
023import java.nio.charset.Charset;
024import java.util.ArrayList;
025import java.util.List;
026
027import org.slf4j.Logger;
028import org.slf4j.LoggerFactory;
029
030/**
031 *
032 */
033public class CSVReaderFactory {
034
035  private static final Logger LOG = LoggerFactory.getLogger(CSVReaderFactory.class);
036  private static final String[] POTENTIAL_DELIMITERS = {",", "\t", ";", "|"};
037  private static final int ROWS_TO_INSPECT = 10;
038
039  /**
040   * Data about the CSV file
041   */
042  public static class CSVMetadata {
043    String delimiter;
044    Character quotedBy;
045
046    public String getDelimiter() {
047      return delimiter;
048    }
049
050    public void setDelimiter(String delimiter) {
051      this.delimiter = delimiter;
052    }
053
054    public Character getQuotedBy() {
055      return quotedBy;
056    }
057
058    public void setQuotedBy(Character quotedBy) {
059      this.quotedBy = quotedBy;
060    }
061  }
062
063  /**
064   * Build a CSVReader with specific delimiter and presence or not of a header line.
065   * Encoding detection will be attempted.
066   * @param source
067   * @param delimiter
068   * @param header Does the file include a header line ?
069   * @return
070   * @throws IOException
071   */
072  public static CSVReader build(File source, String delimiter, boolean header) throws IOException {
073    return new CSVReader(source, detectEncoding(source), delimiter, null, header ? 1 : 0);
074  }
075
076  /**
077   * Build a CSVReader with specific encoding, delimiter, quotes character and number of header row(s).
078   *
079   * @param source
080   * @param encoding
081   * @param delimiter
082   * @param quotes
083   * @param headerRows
084   * @return
085   * @throws IOException
086   */
087  public static CSVReader build(
088      File source, String encoding, String delimiter, Character quotes, Integer headerRows)
089      throws IOException {
090    return new CSVReader(source, encoding, delimiter, quotes, headerRows);
091  }
092
093  /**
094   * Build a CSVReader with specific encoding, delimiter and number of header row(s) but default quote character
095   * (quotation marks)
096   * @param source
097   * @param encoding
098   * @param delimiter
099   * @param headerRows
100   * @return
101   * @throws IOException
102   */
103  public static CSVReader build(File source, String encoding, String delimiter, Integer headerRows)
104      throws IOException {
105    return new CSVReader(source, encoding, delimiter, '"', headerRows);
106  }
107
108  /**
109   * Build a CSVReader with specific encoding, delimiter quotes and number of header row(s)
110   *
111   * @param stream
112   * @param encoding
113   * @param delimiter
114   * @param quotes
115   * @param headerRows
116   * @return
117   * @throws IOException
118   */
119  public static CSVReader build(
120      InputStream stream, String encoding, String delimiter, Character quotes, Integer headerRows)
121      throws IOException {
122    return new CSVReader(stream, encoding, delimiter, quotes, headerRows);
123  }
124
125  /**
126   * Build a CSVReader and try to detect the encoding, delimiter and quotes.
127   *
128   * @param source
129   * @param headerRows
130   * @return
131   * @throws IOException
132   */
133  public static CSVReader build(File source, Integer headerRows) throws IOException {
134    String encoding = detectEncoding(source);
135    CSVMetadata csvMeta = extractCsvMetadata(source, encoding);
136    return new CSVReader(
137        source, encoding, csvMeta.getDelimiter(), csvMeta.getQuotedBy(), headerRows);
138  }
139
140  /**
141   * Assumes 1 header row
142   *
143   * @param source
144   * @return
145   * @throws IOException
146   */
147  public static CSVReader build(File source) throws IOException {
148    return build(source, 1);
149  }
150
151  public static CSVReader buildTabReader(InputStream stream, String encoding, Integer headerRows)
152      throws IOException {
153    return new CSVReader(stream, encoding, "\t", null, headerRows);
154  }
155
156  public static CSVReader buildUtf8TabReader(InputStream stream) throws IOException {
157    return buildTabReader(stream, "utf8", 0);
158  }
159
160  /**
161   * Replaced by {@link TabularFileMetadataExtractor}
162   *
163   * Extract metadata from a CSV file.
164   * Metadata includes delimiter and quotes character.
165   *
166   * @param source
167   * @param encoding
168   * @return
169   * @throws UnknownDelimitersException
170   */
171  public static CSVMetadata extractCsvMetadata(File source, String encoding)
172      throws UnknownDelimitersException {
173    CSVMetadata csvMetadata = new CSVMetadata();
174    // try csv, tab and then other popular delimiters
175    // keep number of resulting columns for comparisons
176    int maxColumns = 0;
177
178    for (String delim : POTENTIAL_DELIMITERS) {
179      // test with various quotes including a dynamic one if the first char in each field is
180      // consistently the same
181      List<Character> potentialQuotes = new ArrayList<Character>();
182
183      CSVReader reader;
184      try {
185        reader = build(source, encoding, delim, null, 1);
186        Character firstChar = likelyQuoteChar(reader);
187        reader.close();
188        if (firstChar != null) {
189          potentialQuotes.add(firstChar);
190        }
191      } catch (IOException ignored) {
192      }
193      // prefer quotes for CSVs
194      if (delim.equals(",")) {
195        potentialQuotes.add('"');
196        potentialQuotes.add('\'');
197        potentialQuotes.add(null);
198      } else {
199        potentialQuotes.add(null);
200        potentialQuotes.add('"');
201        potentialQuotes.add('\'');
202      }
203
204      for (Character quote : potentialQuotes) {
205        try {
206          reader = build(source, encoding, delim, quote, 0);
207          int x = consistentRowSize(reader);
208          // try to find the delimiter and quote that will give us the maximum number of rows
209          if (x > maxColumns) {
210            csvMetadata.setDelimiter(delim);
211            csvMetadata.setQuotedBy(quote);
212            maxColumns = x;
213          }
214          reader.close();
215        } catch (IOException ignored) {
216          // swallow, maybe different delimiters work
217          // if all fail we will throw an exception at the end
218        }
219      }
220    }
221
222    if (maxColumns < 1) {
223      throw new UnknownDelimitersException("Unable to detect field delimiter");
224    }
225
226    return csvMetadata;
227  }
228
229  /**
230   * @return the number of consistent columns, -1 if non consistent or column numbers-2 in case the column numbers only
231   * differ by 1 at max.
232   */
233  private static int consistentRowSize(CSVReader reader) {
234    int rowNum = 0;
235    int columns = 0;
236    boolean plusMinusOne = false;
237    while (reader.hasNext() && rowNum < ROWS_TO_INSPECT) {
238      String[] row = reader.next();
239      if (rowNum == 0) {
240        columns = row.length;
241      }
242      if (Math.abs(columns - row.length) > 1) {
243        return -1;
244      }
245      if (columns != row.length) {
246        plusMinusOne = true;
247      }
248      rowNum++;
249    }
250    if (plusMinusOne) {
251      return columns - 2;
252    }
253    return columns;
254  }
255
256  private static String detectEncoding(File source) throws UnknownCharsetException {
257    Charset encoding;
258    try {
259      encoding = CharsetDetection.detectEncoding(source, 16384);
260      if (encoding == null) {
261        throw new UnknownCharsetException("Unable to detect the files character encoding");
262      }
263    } catch (IOException e) {
264      throw new UnknownCharsetException(e);
265    }
266    return encoding.displayName();
267  }
268
269  /**
270   * Checks if all non empty/null fields start with the same character.
271   *
272   * @return the first character if consistent, otherwise null
273   */
274  private static Character likelyQuoteChar(CSVReader reader) {
275    Character quote = null;
276    int line = 0;
277    while (reader.hasNext() && line < 10) {
278      line++;
279      String[] row = reader.next();
280      if (row != null) {
281        for (String col : row) {
282          if (col != null && col.length() > 0) {
283            // same char at start & end?
284            if (col.length() > 1 && col.charAt(0) == col.charAt(col.length() - 1)) {
285              // only consider non alphanumerics
286              char potQuote = col.charAt(0);
287              if (Character.isLetterOrDigit(potQuote)) {
288                break;
289              }
290              if (quote == null) {
291                quote = potQuote;
292              } else {
293                if (!quote.equals(potQuote)) {
294                  quote = null;
295                  break;
296                }
297              }
298            }
299          }
300        }
301      }
302    }
303    return quote;
304  }
305}