/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.gbif.utils.file.csv;

import org.gbif.utils.file.ClosableReportingIterator;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.text.StrTokenizer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Line-based reader for delimited text that iterates over the rows of a stream as
 * {@code String[]} records.
 *
 * <p>The first line of the input is always tokenized and exposed as the {@link #header},
 * independent of how many header rows are skipped. Empty lines between records are skipped and
 * the line numbers of the first few of them are remembered (see {@link #getEmptyLines()}).
 *
 * <p>An {@link IOException} raised while reading ahead does not propagate out of
 * {@link #next()}; it is reported through {@link #hasRowError()}, {@link #getErrorMessage()}
 * and {@link #getException()} and terminates the iteration.
 */
public class CSVReader implements ClosableReportingIterator<String[]> {

  private static final Logger LOG = LoggerFactory.getLogger(CSVReader.class);

  /** Number of leading rows skipped before the first record; never negative. */
  public final int headerRows;
  /** Name of the character encoding used to decode the stream. */
  public final String encoding;
  /** Delimiter string separating the columns of a row. */
  public final String delimiter;
  /** Optional quote character; {@code null} if no quote character was configured. */
  public final Character quoteChar;
  /** Tokens of the very first line of the input, or {@code null} if the input was empty. */
  public final String[] header;

  private final StrTokenizer tokenizer;
  /** The line that the following call to {@link #next()} will tokenize; null when exhausted. */
  private String row;
  /** Number of lines read past the first record line, empty lines included. */
  private int rows;
  /** Number of successful read-ahead operations performed by {@link #next()}. */
  private int readRows;
  private final int emptyLinesCacheLimit;
  /** Line numbers of empty lines (keys only are meaningful); null when the cache is disabled. */
  private final Map<Integer, String> emptyLines;
  private final BufferedReader br;
  private boolean rowError;
  private String errorMessage;
  private Exception exception;

  /**
   * Creates a reader over a file, remembering up to 1000 empty line numbers.
   *
   * @param source the file to read
   * @param encoding name of the character encoding of the file
   * @param delimiter the column delimiter
   * @param quotes the quote character, or null for none
   * @param headerRows number of leading rows to skip; null or negative is treated as 0
   * @throws IOException if the file cannot be opened, the encoding is unsupported or the first
   *     rows cannot be read
   */
  public CSVReader(
      File source, String encoding, String delimiter, Character quotes, Integer headerRows)
      throws IOException {
    this(new FileInputStream(source), encoding, delimiter, quotes, headerRows);
  }

  /**
   * Creates a reader over a stream, remembering up to 1000 empty line numbers.
   *
   * @param stream the stream to read; it is closed by {@link #close()}
   * @param encoding name of the character encoding of the stream
   * @param delimiter the column delimiter
   * @param quotes the quote character, or null for none
   * @param headerRows number of leading rows to skip; null or negative is treated as 0
   * @throws IOException if the encoding is unsupported or the first rows cannot be read
   */
  public CSVReader(
      InputStream stream, String encoding, String delimiter, Character quotes, Integer headerRows)
      throws IOException {
    this(stream, encoding, delimiter, quotes, headerRows, 1000);
  }

  /**
   * Creates a reader over a stream.
   *
   * <p>The reader takes ownership of the stream: it is closed by {@link #close()}, and it is
   * also closed if this constructor fails, so that a stream opened on behalf of the caller
   * (e.g. by the {@code File} based constructor) is not leaked.
   *
   * <p>NOTE(review): the first record line is not checked for emptiness here; only lines read
   * ahead by {@link #next()} are subject to empty-line skipping.
   *
   * @param stream the stream to read
   * @param encoding name of the character encoding of the stream
   * @param delimiter the column delimiter
   * @param quotes the quote character, or null for none
   * @param headerRows number of leading rows to skip; null or negative is treated as 0
   * @param emptyLineCache maximum number of empty line numbers to remember; values of 0 or
   *     less disable the bookkeeping
   * @throws IOException if the encoding is unsupported or the first rows cannot be read
   */
  public CSVReader(
      InputStream stream,
      String encoding,
      String delimiter,
      Character quotes,
      Integer headerRows,
      int emptyLineCache)
      throws IOException {
    this.emptyLinesCacheLimit = emptyLineCache;
    if (emptyLineCache > 0) {
      this.emptyLines = new ConcurrentHashMap<>(emptyLineCache);
    } else {
      this.emptyLines = null;
    }
    this.rows = 0;
    this.readRows = 0;
    this.delimiter = delimiter;
    this.encoding = encoding;
    this.quoteChar = quotes;
    this.headerRows = headerRows == null || headerRows < 0 ? 0 : headerRows;

    tokenizer = new StrTokenizer();
    tokenizer.setDelimiterString(delimiter);
    if (quotes != null) {
      tokenizer.setQuoteChar(quotes);
    }
    tokenizer.setIgnoreEmptyTokens(false);
    tokenizer.reset();

    BufferedReader reader;
    String[] headerTokens;
    String firstRow;
    try {
      reader = new BufferedReader(new InputStreamReader(stream, encoding));
      String line = reader.readLine();
      // the header is always taken from the very first line, even if no rows are skipped
      if (line == null) {
        headerTokens = null;
      } else {
        tokenizer.reset(line);
        headerTokens = tokenizer.getTokenArray();
      }
      // skip the configured number of header rows; the first line has been consumed already,
      // so every iteration moves one line further
      for (int skipped = 0; skipped < this.headerRows; skipped++) {
        line = reader.readLine();
      }
      firstRow = line;
    } catch (IOException | RuntimeException e) {
      // construction failed: nobody will ever call close() on this instance, release the stream
      if (stream != null) {
        try {
          stream.close();
        } catch (IOException closeError) {
          e.addSuppressed(closeError);
        }
      }
      throw e;
    }
    this.br = reader;
    this.header = headerTokens;
    this.row = firstRow;
  }

  /**
   * Get the header, or null if none
   */
  public String[] getHeader() {
    return header;
  }

  /**
   * Closes the underlying reader. A failure to close is logged at debug level and otherwise
   * ignored.
   */
  @Override
  public void close() {
    try {
      br.close();
    } catch (IOException e) {
      LOG.debug("Exception caught", e);
    }
  }

  /**
   * @return the current line number of the String[] iterator
   */
  public int currLineNumber() {
    return rows;
  }

  /**
   * @return a new set with the line numbers of the first empty rows found in the file so far;
   *     empty if the empty line bookkeeping is disabled. The set is detached from the reader,
   *     modifying it does not affect the reader.
   */
  public Set<Integer> getEmptyLines() {
    if (emptyLines == null) {
      return new HashSet<>();
    }
    // hand out a copy instead of the live key view of the internal map
    return new HashSet<>(emptyLines.keySet());
  }

  /**
   * @return the number of rows of data that were correctly read from the file
   */
  public int getReadRows() {
    return readRows;
  }

  /*
   * (non-Javadoc)
   * @see java.util.Iterator#hasNext()
   */
  @Override
  public boolean hasNext() {
    return row != null;
  }

  /**
   * Tokenizes the current row and reads ahead to the next non-empty line.
   *
   * <p>If reading ahead fails, the current row is still returned, the failure is made available
   * through the reporting methods and the iteration ends.
   *
   * @return the tokens of the current row, or {@code null} if there are no more rows (this
   *     reader does not throw {@link java.util.NoSuchElementException})
   * @see java.util.Iterator#next()
   */
  @Override
  public String[] next() {
    if (row == null) {
      return null;
    }
    tokenizer.reset(row);
    resetReportingIterator();
    try {
      row = br.readLine();
      rows++;
      // skip empty lines
      while (row != null && row.length() == 0) {
        // save line number of empty line, as long as the cache has room left
        if (emptyLines != null && emptyLines.size() < emptyLinesCacheLimit) {
          emptyLines.put(rows + headerRows + 1, "");
        }
        row = br.readLine();
        rows++;
      }
      readRows++;
    } catch (IOException e) {
      LOG.debug("Exception caught", e);
      rowError = true;
      exception = e;

      // construct error message showing exception and problem row
      StringBuilder msg = new StringBuilder();
      msg.append("Exception caught: ");
      msg.append(e.getMessage());
      if (StringUtils.isNotBlank(row)) {
        msg.append("\n");
        msg.append("Row: ");
        msg.append(row);
      }
      errorMessage = msg.toString();

      // ensure iteration terminates
      row = null;
    }
    return tokenizer.getTokenArray();
  }

  /**
   * Reset all reporting parameters.
   */
  private void resetReportingIterator() {
    rowError = false;
    exception = null;
    errorMessage = null;
  }

  /**
   * Not supported.
   *
   * @throws UnsupportedOperationException always
   */
  @Override
  public void remove() {
    throw new UnsupportedOperationException("Remove not supported");
  }

  /**
   * @return true if the read-ahead of the last call to {@link #next()} failed
   */
  @Override
  public boolean hasRowError() {
    return rowError;
  }

  /**
   * @return the error message of the last call to {@link #next()}, or null if it succeeded
   */
  @Override
  public String getErrorMessage() {
    return errorMessage;
  }

  /**
   * @return the exception caught by the last call to {@link #next()}, or null if it succeeded
   */
  @Override
  public Exception getException() {
    return exception;
  }
}