001/*
002 * Copyright 2021 Global Biodiversity Information Facility (GBIF)
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016package org.gbif.utils.file.csv;
017
018import org.gbif.utils.file.CharsetDetection;
019import org.gbif.utils.file.UnknownCharsetException;
020import org.gbif.utils.file.tabular.TabularFileMetadataExtractor;
021
022import java.io.File;
023import java.io.IOException;
024import java.io.InputStream;
025import java.nio.charset.Charset;
026import java.util.ArrayList;
027import java.util.List;
028
029import org.slf4j.Logger;
030import org.slf4j.LoggerFactory;
031
032/**
033 *
034 */
035public class CSVReaderFactory {
036
037  private static final Logger LOG = LoggerFactory.getLogger(CSVReaderFactory.class);
038  private static final String[] POTENTIAL_DELIMITERS = {",", "\t", ";", "|"};
039  private static final int ROWS_TO_INSPECT = 10;
040
041  /**
042   * Data about the CSV file
043   */
044  public static class CSVMetadata {
045    String delimiter;
046    Character quotedBy;
047
048    public String getDelimiter() {
049      return delimiter;
050    }
051
052    public void setDelimiter(String delimiter) {
053      this.delimiter = delimiter;
054    }
055
056    public Character getQuotedBy() {
057      return quotedBy;
058    }
059
060    public void setQuotedBy(Character quotedBy) {
061      this.quotedBy = quotedBy;
062    }
063  }
064
065  /**
066   * Build a CSVReader with specific delimiter and presence or not of a header line.
067   * Encoding detection will be attempted.
068   * @param source
069   * @param delimiter
070   * @param header Does the file include a header line ?
071   * @return
072   * @throws IOException
073   */
074  public static CSVReader build(File source, String delimiter, boolean header) throws IOException {
075    return new CSVReader(source, detectEncoding(source), delimiter, null, header ? 1 : 0);
076  }
077
078  /**
079   * Build a CSVReader with specific encoding, delimiter, quotes character and number of header row(s).
080   *
081   * @param source
082   * @param encoding
083   * @param delimiter
084   * @param quotes
085   * @param headerRows
086   * @return
087   * @throws IOException
088   */
089  public static CSVReader build(File source, String encoding, String delimiter, Character quotes, Integer headerRows)
090          throws IOException {
091    return new CSVReader(source, encoding, delimiter, quotes, headerRows);
092  }
093
094  /**
095   * Build a CSVReader with specific encoding, delimiter and number of header row(s) but default quote character
096   * (quotation marks)
097   * @param source
098   * @param encoding
099   * @param delimiter
100   * @param headerRows
101   * @return
102   * @throws IOException
103   */
104  public static CSVReader build(File source, String encoding, String delimiter, Integer headerRows) throws IOException {
105    return new CSVReader(source, encoding, delimiter, '"', headerRows);
106  }
107
108  /**
109   * Build a CSVReader with specific encoding, delimiter quotes and number of header row(s)
110   *
111   * @param stream
112   * @param encoding
113   * @param delimiter
114   * @param quotes
115   * @param headerRows
116   * @return
117   * @throws IOException
118   */
119  public static CSVReader build(InputStream stream, String encoding, String delimiter, Character quotes,
120                                Integer headerRows) throws IOException {
121    return new CSVReader(stream, encoding, delimiter, quotes, headerRows);
122  }
123
124  /**
125   * Build a CSVReader and try to detect the encoding, delimiter and quotes.
126   *
127   * @param source
128   * @param headerRows
129   * @return
130   * @throws IOException
131   */
132  public static CSVReader build(File source, Integer headerRows) throws IOException {
133    String encoding = detectEncoding(source);
134    CSVMetadata csvMeta  = extractCsvMetadata(source, encoding);
135    return new CSVReader(source, encoding, csvMeta.getDelimiter(), csvMeta.getQuotedBy(), headerRows);
136  }
137
138  /**
139   * Assumes 1 header row
140   *
141   * @param source
142   * @return
143   * @throws IOException
144   */
145  public static CSVReader build(File source) throws IOException {
146    return build(source, 1);
147  }
148
149  public static CSVReader buildTabReader(InputStream stream, String encoding, Integer headerRows) throws IOException {
150    return new CSVReader(stream, encoding, "\t", null, headerRows);
151  }
152
153  public static CSVReader buildUtf8TabReader(InputStream stream) throws IOException {
154    return buildTabReader(stream, "utf8", 0);
155  }
156
157  /**
158   * Replaced by {@link TabularFileMetadataExtractor}
159   *
160   * Extract metadata from a CSV file.
161   * Metadata includes delimiter and quotes character.
162   *
163   * @param source
164   * @param encoding
165   * @return
166   * @throws UnknownDelimitersException
167   */
168  public static CSVMetadata extractCsvMetadata(File source, String encoding) throws UnknownDelimitersException {
169    CSVMetadata csvMetadata = new CSVMetadata();
170    // try csv, tab and then other popular delimiters
171    // keep number of resulting columns for comparisons
172    int maxColumns = 0;
173
174    for (String delim : POTENTIAL_DELIMITERS) {
175      // test with various quotes including a dynamic one if the first char in each field is consistently the same
176      List<Character> potentialQuotes = new ArrayList<Character>();
177
178      CSVReader reader;
179      try {
180        reader = build(source, encoding, delim, null, 1);
181        Character firstChar = likelyQuoteChar(reader);
182        reader.close();
183        if (firstChar != null) {
184          potentialQuotes.add(firstChar);
185        }
186      } catch (IOException ignored) {
187      }
188      // prefer quotes for CSVs
189      if (delim.equals(",")) {
190        potentialQuotes.add('"');
191        potentialQuotes.add('\'');
192        potentialQuotes.add(null);
193      } else {
194        potentialQuotes.add(null);
195        potentialQuotes.add('"');
196        potentialQuotes.add('\'');
197      }
198
199      for (Character quote : potentialQuotes) {
200        try {
201          reader = build(source, encoding, delim, quote, 0);
202          int x = consistentRowSize(reader);
203          // try to find the delimiter and quote that will give us the maximum number of rows
204          if (x > maxColumns) {
205            csvMetadata.setDelimiter(delim);
206            csvMetadata.setQuotedBy(quote);
207            maxColumns = x;
208          }
209          reader.close();
210        } catch (IOException ignored) {
211          // swallow, maybe different delimiters work
212          // if all fail we will throw an exception at the end
213        }
214      }
215    }
216
217    if (maxColumns < 1) {
218      throw new UnknownDelimitersException("Unable to detect field delimiter");
219    }
220
221    return csvMetadata;
222  }
223
224  /**
225   * @return the number of consistent columns, -1 if non consistent or column numbers-2 in case the column numbers only
226   * differ by 1 at max.
227   */
228  private static int consistentRowSize(CSVReader reader) {
229    int rowNum = 0;
230    int columns = 0;
231    boolean plusMinusOne = false;
232    while (reader.hasNext() && rowNum < ROWS_TO_INSPECT) {
233      String[] row = reader.next();
234      if (rowNum == 0) {
235        columns = row.length;
236      }
237      if (Math.abs(columns - row.length) > 1) {
238        return -1;
239      }
240      if (columns != row.length) {
241        plusMinusOne = true;
242      }
243      rowNum++;
244    }
245    if (plusMinusOne) {
246      return columns - 2;
247    }
248    return columns;
249  }
250
251  private static String detectEncoding(File source) throws UnknownCharsetException {
252    Charset encoding;
253    try {
254      encoding = CharsetDetection.detectEncoding(source, 16384);
255      if (encoding == null) {
256        throw new UnknownCharsetException("Unable to detect the files character encoding");
257      }
258    } catch (IOException e) {
259      throw new UnknownCharsetException(e);
260    }
261    return encoding.displayName();
262  }
263
264  /**
265   * Checks if all non empty/null fields start with the same character.
266   *
267   * @return the first character if consistent, otherwise null
268   */
269  private static Character likelyQuoteChar(CSVReader reader) {
270    Character quote = null;
271    int line = 0;
272    while (reader.hasNext() && line < 10) {
273      line++;
274      String[] row = reader.next();
275      if (row != null) {
276        for (String col : row) {
277          if (col != null && col.length() > 0) {
278            // same char at start & end?
279            if (col.length() > 1 && col.charAt(0) == col.charAt(col.length() - 1)) {
280              // only consider non alphanumerics
281              char potQuote = col.charAt(0);
282              if (Character.isLetterOrDigit(potQuote)) {
283                break;
284              }
285              if (quote == null) {
286                quote = potQuote;
287              } else {
288                if (!quote.equals(potQuote)) {
289                  quote = null;
290                  break;
291                }
292              }
293            }
294          }
295        }
296      }
297    }
298    return quote;
299  }
300
301}