001/* 002 * Copyright 2021 Global Biodiversity Information Facility (GBIF) 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016package org.gbif.utils.file.csv; 017 018import org.gbif.utils.file.ClosableReportingIterator; 019 020import java.io.BufferedReader; 021import java.io.File; 022import java.io.FileInputStream; 023import java.io.IOException; 024import java.io.InputStream; 025import java.io.InputStreamReader; 026import java.util.HashSet; 027import java.util.Map; 028import java.util.Set; 029import java.util.concurrent.ConcurrentHashMap; 030 031import org.apache.commons.lang3.StringUtils; 032import org.apache.commons.lang3.text.StrTokenizer; 033import org.slf4j.Logger; 034import org.slf4j.LoggerFactory; 035 036public class CSVReader implements ClosableReportingIterator<String[]> { 037 038 private static final Logger LOG = LoggerFactory.getLogger(CSVReader.class); 039 public final int headerRows; 040 public final String encoding; 041 public final String delimiter; 042 public final Character quoteChar; 043 public final String[] header; 044 private final StrTokenizer tokenizer; 045 private String row; 046 private int rows; 047 private int readRows; 048 private final int emptyLinesCacheLimit; 049 private final Map<Integer, String> emptyLines; 050 private final BufferedReader br; 051 private boolean rowError; 052 private String errorMessage; 053 private Exception exception; 054 055 public CSVReader(File source, String encoding, String delimiter, Character quotes, Integer headerRows) 056 throws IOException { 057 this(new FileInputStream(source), encoding, delimiter, quotes, headerRows); 058 } 059 060 public CSVReader(InputStream stream, String encoding, String delimiter, Character quotes, Integer headerRows) 061 throws IOException { 062 this(stream, encoding, delimiter, quotes, headerRows, 1000); 063 } 064 065 public CSVReader(InputStream stream, String encoding, String delimiter, Character quotes, Integer headerRows, int emptyLineCache) 066 throws IOException { 067 this.emptyLinesCacheLimit = emptyLineCache; 068 if (emptyLineCache > 0) { 069 this.emptyLines = new ConcurrentHashMap<>(emptyLineCache); 070 } else { 071 emptyLines = null; 072 } 073 this.rows = 0; 074 this.readRows = 0; 075 this.delimiter = delimiter; 076 this.encoding = encoding; 077 this.quoteChar = quotes; 078 this.headerRows = headerRows == null || headerRows < 0 ? 0 : headerRows; 079 tokenizer = new StrTokenizer(); 080 tokenizer.setDelimiterString(delimiter); 081 if (quotes != null) { 082 tokenizer.setQuoteChar(quotes); 083 } 084 tokenizer.setIgnoreEmptyTokens(false); 085 tokenizer.reset(); 086 InputStreamReader reader = new InputStreamReader(stream, encoding); 087 br = new BufferedReader(reader); 088 row = br.readLine(); 089 // parse header row 090 if (row == null) { 091 header = null; 092 } else { 093 tokenizer.reset(row); 094 header = tokenizer.getTokenArray(); 095 } 096 // skip initial header rows? 097 while (headerRows != null && headerRows > 0) { 098 headerRows--; 099 row = br.readLine(); 100 } 101 } 102 103 /** 104 * Get the header, or null if none 105 */ 106 public String[] getHeader() { 107 return header; 108 } 109 110 @Override 111 public void close() { 112 try { 113 br.close(); 114 } catch (IOException e) { 115 LOG.debug("Exception caught", e); 116 } 117 } 118 119 /** 120 * @return the current line number of the String[] iterator 121 */ 122 public int currLineNumber() { 123 return rows; 124 } 125 126 /** 127 * @return a set of the line numbers of the firsts empty rows found in the file 128 */ 129 public Set<Integer> getEmptyLines() { 130 return emptyLines == null ? new HashSet<>() : emptyLines.keySet(); 131 } 132 133 /** 134 * @return the number of rows of data that were correctly read from the file 135 */ 136 public int getReadRows() { 137 return readRows; 138 } 139 140 /* 141 * (non-Javadoc) 142 * @see java.util.Iterator#hasNext() 143 */ 144 @Override 145 public boolean hasNext() { 146 return row != null; 147 } 148 149 /* 150 * (non-Javadoc) 151 * @see java.util.Iterator#next() 152 */ 153 @Override 154 public String[] next() { 155 if (row == null) { 156 return null; 157 } 158 tokenizer.reset(row); 159 resetReportingIterator(); 160 try { 161 row = br.readLine(); 162 rows++; 163 // skip empty lines 164 while (row != null && row.length() == 0) { 165 // save line number of empty line 166 if (emptyLines != null && emptyLines.size() < emptyLinesCacheLimit) { 167 emptyLines.put(rows + headerRows + 1, ""); 168 } 169 row = br.readLine(); 170 rows++; 171 } 172 readRows++; 173 } catch (IOException e) { 174 LOG.debug("Exception caught", e); 175 rowError = true; 176 exception = e; 177 178 // construct error message showing exception and problem row 179 StringBuilder msg = new StringBuilder(); 180 msg.append("Exception caught: "); 181 msg.append(e.getMessage()); 182 if (StringUtils.isNotBlank(row)) { 183 msg.append("\n"); 184 msg.append("Row: "); 185 msg.append(row); 186 } 187 errorMessage = msg.toString(); 188 189 // ensure iteration terminates 190 row = null; 191 } 192 return tokenizer.getTokenArray(); 193 } 194 195 /** 196 * Reset all reporting parameters. 197 */ 198 private void resetReportingIterator() { 199 rowError = false; 200 exception = null; 201 errorMessage = null; 202 } 203 204 @Override 205 public void remove() { 206 throw new UnsupportedOperationException("Remove not supported"); 207 } 208 209 @Override 210 public boolean hasRowError() { 211 return rowError; 212 } 213 214 @Override 215 public String getErrorMessage() { 216 return errorMessage; 217 } 218 219 @Override 220 public Exception getException() { 221 return exception; 222 } 223}