001/*
002 * Copyright 2021 Global Biodiversity Information Facility (GBIF)
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016package org.gbif.utils.file.csv;
017
018import org.gbif.utils.file.ClosableReportingIterator;
019
020import java.io.BufferedReader;
021import java.io.File;
022import java.io.FileInputStream;
023import java.io.IOException;
024import java.io.InputStream;
025import java.io.InputStreamReader;
026import java.util.HashSet;
027import java.util.Map;
028import java.util.Set;
029import java.util.concurrent.ConcurrentHashMap;
030
031import org.apache.commons.lang3.StringUtils;
032import org.apache.commons.lang3.text.StrTokenizer;
033import org.slf4j.Logger;
034import org.slf4j.LoggerFactory;
035
036public class CSVReader implements ClosableReportingIterator<String[]> {
037
038  private static final Logger LOG = LoggerFactory.getLogger(CSVReader.class);
039  public final int headerRows;
040  public final String encoding;
041  public final String delimiter;
042  public final Character quoteChar;
043  public final String[] header;
044  private final StrTokenizer tokenizer;
045  private String row;
046  private int rows;
047  private int readRows;
048  private final int emptyLinesCacheLimit;
049  private final Map<Integer, String> emptyLines;
050  private final BufferedReader br;
051  private boolean rowError;
052  private String errorMessage;
053  private Exception exception;
054
055  public CSVReader(File source, String encoding, String delimiter, Character quotes, Integer headerRows)
056          throws IOException {
057    this(new FileInputStream(source), encoding, delimiter, quotes, headerRows);
058  }
059
060  public CSVReader(InputStream stream, String encoding, String delimiter, Character quotes, Integer headerRows)
061          throws IOException {
062    this(stream, encoding, delimiter, quotes, headerRows, 1000);
063  }
064
065  public CSVReader(InputStream stream, String encoding, String delimiter, Character quotes, Integer headerRows, int emptyLineCache)
066          throws IOException {
067    this.emptyLinesCacheLimit = emptyLineCache;
068    if (emptyLineCache > 0) {
069      this.emptyLines = new ConcurrentHashMap<>(emptyLineCache);
070    } else {
071      emptyLines = null;
072    }
073    this.rows = 0;
074    this.readRows = 0;
075    this.delimiter = delimiter;
076    this.encoding = encoding;
077    this.quoteChar = quotes;
078    this.headerRows = headerRows == null || headerRows < 0 ? 0 : headerRows;
079    tokenizer = new StrTokenizer();
080    tokenizer.setDelimiterString(delimiter);
081    if (quotes != null) {
082      tokenizer.setQuoteChar(quotes);
083    }
084    tokenizer.setIgnoreEmptyTokens(false);
085    tokenizer.reset();
086    InputStreamReader reader = new InputStreamReader(stream, encoding);
087    br = new BufferedReader(reader);
088    row = br.readLine();
089    // parse header row
090    if (row == null) {
091      header = null;
092    } else {
093      tokenizer.reset(row);
094      header = tokenizer.getTokenArray();
095    }
096    // skip initial header rows?
097    while (headerRows != null && headerRows > 0) {
098      headerRows--;
099      row = br.readLine();
100    }
101  }
102
103  /**
104   * Get the header, or null if none
105   */
106  public String[] getHeader() {
107    return header;
108  }
109
110  @Override
111  public void close() {
112    try {
113      br.close();
114    } catch (IOException e) {
115      LOG.debug("Exception caught", e);
116    }
117  }
118
119  /**
120   * @return the current line number of the String[] iterator
121   */
122  public int currLineNumber() {
123    return rows;
124  }
125
126  /**
127   * @return a set of the line numbers of the firsts empty rows found in the file
128   */
129  public Set<Integer> getEmptyLines() {
130    return emptyLines == null ? new HashSet<>() : emptyLines.keySet();
131  }
132
133  /**
134   * @return the number of rows of data that were correctly read from the file
135   */
136  public int getReadRows() {
137    return readRows;
138  }
139
140  /*
141   * (non-Javadoc)
142   * @see java.util.Iterator#hasNext()
143   */
144  @Override
145  public boolean hasNext() {
146    return row != null;
147  }
148
149  /*
150   * (non-Javadoc)
151   * @see java.util.Iterator#next()
152   */
153  @Override
154  public String[] next() {
155    if (row == null) {
156      return null;
157    }
158    tokenizer.reset(row);
159    resetReportingIterator();
160    try {
161      row = br.readLine();
162      rows++;
163      // skip empty lines
164      while (row != null && row.length() == 0) {
165        // save line number of empty line
166        if (emptyLines != null && emptyLines.size() < emptyLinesCacheLimit) {
167          emptyLines.put(rows + headerRows + 1, "");
168        }
169        row = br.readLine();
170        rows++;
171      }
172      readRows++;
173    } catch (IOException e) {
174      LOG.debug("Exception caught", e);
175      rowError = true;
176      exception = e;
177
178      // construct error message showing exception and problem row
179      StringBuilder msg = new StringBuilder();
180      msg.append("Exception caught: ");
181      msg.append(e.getMessage());
182      if (StringUtils.isNotBlank(row)) {
183        msg.append("\n");
184        msg.append("Row: ");
185        msg.append(row);
186      }
187      errorMessage = msg.toString();
188
189      // ensure iteration terminates
190      row = null;
191    }
192    return tokenizer.getTokenArray();
193  }
194
195  /**
196   * Reset all reporting parameters.
197   */
198  private void resetReportingIterator() {
199    rowError = false;
200    exception = null;
201    errorMessage = null;
202  }
203
204  @Override
205  public void remove() {
206    throw new UnsupportedOperationException("Remove not supported");
207  }
208
209  @Override
210  public boolean hasRowError() {
211    return rowError;
212  }
213
214  @Override
215  public String getErrorMessage() {
216    return errorMessage;
217  }
218
219  @Override
220  public Exception getException() {
221    return exception;
222  }
223}