001/***************************************************************************
002 * Copyright 2010-2015 Global Biodiversity Information Facility Secretariat
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
005 * use this file except in compliance with the License. You may obtain a copy of
006 * the License at
007 *
008 * http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
012 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
013 * License for the specific language governing permissions and limitations under
014 * the License.
015 ***************************************************************************/
016package org.gbif.utils.file.csv;
017
018import com.google.common.base.Preconditions;
019import com.google.common.collect.Sets;
020import org.gbif.utils.file.ClosableReportingIterator;
021
022import java.io.BufferedReader;
023import java.io.File;
024import java.io.FileInputStream;
025import java.io.IOException;
026import java.io.InputStream;
027import java.io.InputStreamReader;
028import java.util.Map;
029import java.util.Set;
030
031import com.google.common.base.Strings;
032import com.google.common.cache.Cache;
033import com.google.common.cache.CacheBuilder;
034import org.apache.commons.lang3.text.StrTokenizer;
035import org.slf4j.Logger;
036import org.slf4j.LoggerFactory;
037
038/**
039 *
040 */
041public class CSVReader implements ClosableReportingIterator<String[]> {
042
043  private static final Logger LOG = LoggerFactory.getLogger(CSVReader.class);
044  public final int headerRows;
045  public final String encoding;
046  public final String delimiter;
047  public final Character quoteChar;
048  public final String[] header;
049  private final StrTokenizer tokenizer;
050  private String row;
051  private int rows;
052  private int readRows;
053  private final Map<Integer, String> emptyLines;
054  private final BufferedReader br;
055  private boolean rowError;
056  private String errorMessage;
057  private Exception exception;
058
059  public CSVReader(File source, String encoding, String delimiter, Character quotes, Integer headerRows)
060          throws IOException {
061    this(new FileInputStream(source), encoding, delimiter, quotes, headerRows);
062  }
063
064  public CSVReader(InputStream stream, String encoding, String delimiter, Character quotes, Integer headerRows)
065          throws IOException {
066    this(stream, encoding, delimiter, quotes, headerRows, 1000);
067  }
068
069  public CSVReader(InputStream stream, String encoding, String delimiter, Character quotes, Integer headerRows, int emptyLineCache)
070          throws IOException {
071    if (emptyLineCache > 0) {
072      Cache<Integer, String> cache = CacheBuilder.newBuilder().maximumSize(emptyLineCache).build();
073      this.emptyLines = cache.asMap();
074    } else {
075      emptyLines = null;
076    }
077    this.rows = 0;
078    this.readRows = 0;
079    this.delimiter = delimiter;
080    this.encoding = encoding;
081    this.quoteChar = quotes;
082    this.headerRows = headerRows == null || headerRows < 0 ? 0 : headerRows;
083    tokenizer = new StrTokenizer();
084    tokenizer.setDelimiterString(delimiter);
085    if (quotes != null) {
086      tokenizer.setQuoteChar(quotes);
087    }
088    tokenizer.setIgnoreEmptyTokens(false);
089    tokenizer.reset();
090    InputStreamReader reader = new InputStreamReader(stream, encoding);
091    br = new BufferedReader(reader);
092    row = br.readLine();
093    // parse header row
094    if (row == null) {
095      header = null;
096    } else {
097      tokenizer.reset(row);
098      header = tokenizer.getTokenArray();
099    }
100    // skip initial header rows?
101    while (headerRows > 0) {
102      headerRows--;
103      row = br.readLine();
104    }
105  }
106
107  /**
108   * Get the header, or null if none
109   * @return
110   */
111  public String[] getHeader() {
112    return header;
113  }
114
115  @Override
116  public void close() {
117    try {
118      br.close();
119    } catch (IOException e) {
120      LOG.debug("Exception caught", e);
121    }
122  }
123
124  /**
125   * @return the current line number of the String[] iterator
126   */
127  public int currLineNumber() {
128    return rows;
129  }
130
131  /**
132   * @return a set of the line numbers of the firsts empty rows found in the file
133   */
134  public Set<Integer> getEmptyLines() {
135    return emptyLines == null ? Sets.newHashSet() : emptyLines.keySet();
136  }
137
138  /**
139   * @return the number of rows of data that were correctly read from the file
140   */
141  public int getReadRows() {
142    return readRows;
143  }
144
145  /*
146   * (non-Javadoc)
147   * @see java.util.Iterator#hasNext()
148   */
149  @Override
150  public boolean hasNext() {
151    return row != null;
152  }
153
154  /*
155   * (non-Javadoc)
156   * @see java.util.Iterator#next()
157   */
158  @Override
159  public String[] next() {
160    if (row == null) {
161      return null;
162    }
163    tokenizer.reset(row);
164    resetReportingIterator();
165    try {
166      row = br.readLine();
167      rows++;
168      // skip empty lines
169      while (row != null && row.length() == 0) {
170        // save line number of empty line
171        if (emptyLines != null) {
172          emptyLines.put(rows + headerRows + 1, "");
173        }
174        row = br.readLine();
175        rows++;
176      }
177      readRows++;
178    } catch (IOException e) {
179      LOG.debug("Exception caught", e);
180      rowError = true;
181      exception = e;
182
183      // construct error message showing exception and problem row
184      StringBuilder msg = new StringBuilder();
185      msg.append("Exception caught: ");
186      msg.append(e.getMessage());
187      if (!Strings.isNullOrEmpty(row)) {
188        msg.append("\n");
189        msg.append("Row: ");
190        msg.append(row);
191      }
192      errorMessage = msg.toString();
193
194      // ensure iteration terminates
195      row = null;
196    }
197    return tokenizer.getTokenArray();
198  }
199
200  /**
201   * Reset all reporting parameters.
202   */
203  private void resetReportingIterator() {
204    rowError = false;
205    exception = null;
206    errorMessage = null;
207  }
208
209  @Override
210  public void remove() {
211    throw new UnsupportedOperationException("Remove not supported");
212  }
213
214  @Override
215  public boolean hasRowError() {
216    return rowError;
217  }
218
219  @Override
220  public String getErrorMessage() {
221    return errorMessage;
222  }
223
224  @Override
225  public Exception getException() {
226    return exception;
227  }
228}