001/*
002 * Licensed under the Apache License, Version 2.0 (the "License");
003 * you may not use this file except in compliance with the License.
004 * You may obtain a copy of the License at
005 *
006 *     http://www.apache.org/licenses/LICENSE-2.0
007 *
008 * Unless required by applicable law or agreed to in writing, software
009 * distributed under the License is distributed on an "AS IS" BASIS,
010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
011 * See the License for the specific language governing permissions and
012 * limitations under the License.
013 */
014package org.gbif.utils.file.csv;
015
016import org.gbif.utils.file.ClosableReportingIterator;
017
018import java.io.BufferedReader;
019import java.io.File;
020import java.io.FileInputStream;
021import java.io.IOException;
022import java.io.InputStream;
023import java.io.InputStreamReader;
024import java.util.HashSet;
025import java.util.Map;
026import java.util.Set;
027import java.util.concurrent.ConcurrentHashMap;
028
029import org.apache.commons.lang3.StringUtils;
030import org.apache.commons.lang3.text.StrTokenizer;
031import org.slf4j.Logger;
032import org.slf4j.LoggerFactory;
033
034public class CSVReader implements ClosableReportingIterator<String[]> {
035
036  private static final Logger LOG = LoggerFactory.getLogger(CSVReader.class);
037  public final int headerRows;
038  public final String encoding;
039  public final String delimiter;
040  public final Character quoteChar;
041  public final String[] header;
042  private final StrTokenizer tokenizer;
043  private String row;
044  private int rows;
045  private int readRows;
046  private final int emptyLinesCacheLimit;
047  private final Map<Integer, String> emptyLines;
048  private final BufferedReader br;
049  private boolean rowError;
050  private String errorMessage;
051  private Exception exception;
052
053  public CSVReader(
054      File source, String encoding, String delimiter, Character quotes, Integer headerRows)
055      throws IOException {
056    this(new FileInputStream(source), encoding, delimiter, quotes, headerRows);
057  }
058
059  public CSVReader(
060      InputStream stream, String encoding, String delimiter, Character quotes, Integer headerRows)
061      throws IOException {
062    this(stream, encoding, delimiter, quotes, headerRows, 1000);
063  }
064
065  public CSVReader(
066      InputStream stream,
067      String encoding,
068      String delimiter,
069      Character quotes,
070      Integer headerRows,
071      int emptyLineCache)
072      throws IOException {
073    this.emptyLinesCacheLimit = emptyLineCache;
074    if (emptyLineCache > 0) {
075      this.emptyLines = new ConcurrentHashMap<>(emptyLineCache);
076    } else {
077      emptyLines = null;
078    }
079    this.rows = 0;
080    this.readRows = 0;
081    this.delimiter = delimiter;
082    this.encoding = encoding;
083    this.quoteChar = quotes;
084    this.headerRows = headerRows == null || headerRows < 0 ? 0 : headerRows;
085    tokenizer = new StrTokenizer();
086    tokenizer.setDelimiterString(delimiter);
087    if (quotes != null) {
088      tokenizer.setQuoteChar(quotes);
089    }
090    tokenizer.setIgnoreEmptyTokens(false);
091    tokenizer.reset();
092    InputStreamReader reader = new InputStreamReader(stream, encoding);
093    br = new BufferedReader(reader);
094    row = br.readLine();
095    // parse header row
096    if (row == null) {
097      header = null;
098    } else {
099      tokenizer.reset(row);
100      header = tokenizer.getTokenArray();
101    }
102    // skip initial header rows?
103    while (headerRows != null && headerRows > 0) {
104      headerRows--;
105      row = br.readLine();
106    }
107  }
108
109  /**
110   * Get the header, or null if none
111   */
112  public String[] getHeader() {
113    return header;
114  }
115
116  @Override
117  public void close() {
118    try {
119      br.close();
120    } catch (IOException e) {
121      LOG.debug("Exception caught", e);
122    }
123  }
124
125  /**
126   * @return the current line number of the String[] iterator
127   */
128  public int currLineNumber() {
129    return rows;
130  }
131
132  /**
133   * @return a set of the line numbers of the firsts empty rows found in the file
134   */
135  public Set<Integer> getEmptyLines() {
136    return emptyLines == null ? new HashSet<>() : emptyLines.keySet();
137  }
138
139  /**
140   * @return the number of rows of data that were correctly read from the file
141   */
142  public int getReadRows() {
143    return readRows;
144  }
145
146  /*
147   * (non-Javadoc)
148   * @see java.util.Iterator#hasNext()
149   */
150  @Override
151  public boolean hasNext() {
152    return row != null;
153  }
154
155  /*
156   * (non-Javadoc)
157   * @see java.util.Iterator#next()
158   */
159  @Override
160  public String[] next() {
161    if (row == null) {
162      return null;
163    }
164    tokenizer.reset(row);
165    resetReportingIterator();
166    try {
167      row = br.readLine();
168      rows++;
169      // skip empty lines
170      while (row != null && row.length() == 0) {
171        // save line number of empty line
172        if (emptyLines != null && emptyLines.size() < emptyLinesCacheLimit) {
173          emptyLines.put(rows + headerRows + 1, "");
174        }
175        row = br.readLine();
176        rows++;
177      }
178      readRows++;
179    } catch (IOException e) {
180      LOG.debug("Exception caught", e);
181      rowError = true;
182      exception = e;
183
184      // construct error message showing exception and problem row
185      StringBuilder msg = new StringBuilder();
186      msg.append("Exception caught: ");
187      msg.append(e.getMessage());
188      if (StringUtils.isNotBlank(row)) {
189        msg.append("\n");
190        msg.append("Row: ");
191        msg.append(row);
192      }
193      errorMessage = msg.toString();
194
195      // ensure iteration terminates
196      row = null;
197    }
198    return tokenizer.getTokenArray();
199  }
200
201  /**
202   * Reset all reporting parameters.
203   */
204  private void resetReportingIterator() {
205    rowError = false;
206    exception = null;
207    errorMessage = null;
208  }
209
210  @Override
211  public void remove() {
212    throw new UnsupportedOperationException("Remove not supported");
213  }
214
215  @Override
216  public boolean hasRowError() {
217    return rowError;
218  }
219
220  @Override
221  public String getErrorMessage() {
222    return errorMessage;
223  }
224
225  @Override
226  public Exception getException() {
227    return exception;
228  }
229}