/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.gbif.utils.file.tabular;

import org.gbif.utils.file.FileUtils;

import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.text.ParseException;
import java.util.List;

import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertThrows;

/**
 * Unit tests for {@link TabularDataFileReader}.
 *
 * <p>Every test loads a fixture from the classpath, opens it as UTF-8 and reads it through a
 * reader created by {@link TabularFiles}.
 */
public class TabularDataFileReaderTest {

  /**
   * Checks the first two records of the fixture: a cell containing the delimiter and a cell
   * containing quote characters must both be returned as plain values. The rest of the file is
   * then read to the end.
   */
  @Test
  public void testCsvOptionalQuotes() throws IOException, ParseException {
    File fixture = FileUtils.getClasspathFile("csv/csv_optional_quotes_excel2008.csv");

    try (TabularDataFileReader<List<String>> reader =
        TabularFiles.newTabularFileReader(
            Files.newBufferedReader(fixture.toPath(), StandardCharsets.UTF_8), ',', true)) {
      List<String> first = reader.read();
      assertEquals(3, first.size());
      assertEquals("1", first.get(0));
      assertEquals("This has a, comma", first.get(2));

      List<String> second = reader.read();
      assertEquals("I say this is only a \"quote\"", second.get(2));

      // the remaining records only have to be readable
      drain(reader);
    }
  }

  /**
   * Ensures that quote characters escaped in the fixture (with a backslash, according to the
   * original description of this test) are returned as plain quotes inside the cell value.
   */
  @Test
  public void testEscapedQuotes() throws IOException, ParseException {
    File fixture = FileUtils.getClasspathFile("csv/csv_escaped_quotes.csv");
    String expectedCitation =
        "Danish Mycological Society (2017-09-04). Fungal records database (http://svampe.databasen.org), contributed by Frøslev, T., Heilmann-Clausen, J., Jeppesen, T.S., Lange, C., Læssøe, T., Petersen, J.H., Søchting, U., \"Vesterholt\", J.";

    try (TabularDataFileReader<List<String>> reader =
        TabularFiles.newTabularFileReader(
            Files.newBufferedReader(fixture.toPath(), StandardCharsets.UTF_8),
            ',',
            "\n",
            '"',
            false,
            1)) {
      List<String> cells = reader.read();
      assertEquals(12, cells.size());
      assertEquals(expectedCitation, cells.get(5));
      assertEquals("{\"Substrate\":\"wood\"}", cells.get(10));
    }
  }

  /** The first read of the first wrongly escaped fixture must fail with a ParseException. */
  @Test
  public void testWrongEscapedQuotes1() throws IOException {
    assertFirstReadIsRejected("csv/csv_wrong_escaped_quotes_1.csv");
  }

  /** The first read of the second wrongly escaped fixture must fail with a ParseException. */
  @Test
  public void testWrongEscapedQuotes2() throws IOException {
    assertFirstReadIsRejected("csv/csv_wrong_escaped_quotes_2.csv");
  }

  /**
   * Test a CSV with all cells quoted: the returned value must not contain the quotes, and the
   * record and line counters must match the fixture once everything has been read.
   */
  @Test
  public void testCsvAlwaysQuotes() throws IOException, ParseException {
    File fixture = FileUtils.getClasspathFile("csv/csv_always_quoted.csv");

    try (TabularDataFileReader<List<String>> reader =
        TabularFiles.newTabularFileReader(
            Files.newBufferedReader(fixture.toPath(), StandardCharsets.UTF_8), ',', true)) {
      List<String> first = reader.read();
      // quotes are delimiters only, they are not part of the value
      assertEquals("8728372", first.get(0));

      // consume everything that is left before looking at the counters
      drain(reader);
      assertEquals(2, reader.getLastRecordNumber());
      assertEquals(3, reader.getLastRecordLineNumber());
    }
  }

  /**
   * Test a CSV file that includes a newline character (\n) inside a properly quoted cell: three
   * records are expected although the last one ends on line 7.
   */
  @Test
  public void testCsvMultiline() throws IOException, ParseException {
    File fixture = FileUtils.getClasspathFile("csv/csv_quote_endline.csv");

    try (TabularDataFileReader<List<String>> reader =
        TabularFiles.newTabularFileReader(
            Files.newBufferedReader(fixture.toPath(), StandardCharsets.UTF_8), ',', true)) {

      // nothing has been read yet, so both counters are expected to be 0
      assertEquals(0, reader.getLastRecordNumber());
      assertEquals(0, reader.getLastRecordLineNumber());

      int rowsRead = drain(reader);

      assertEquals(3, rowsRead);
      assertEquals(3, reader.getLastRecordNumber());
      assertEquals(7, reader.getLastRecordLineNumber());
    }
  }

  /**
   * Testing classic non quoted tab files with escaped \t tabs.
   */
  @Test
  public void testTab() throws IOException, ParseException {
    File fixture = FileUtils.getClasspathFile("csv/escapedTab.tab");

    try (TabularDataFileReader<List<String>> reader =
        TabularFiles.newTabularFileReader(
            Files.newBufferedReader(fixture.toPath(), StandardCharsets.UTF_8), '\t', true)) {

      int recordsRead = drain(reader);

      assertEquals(8, recordsRead);
      assertEquals(8, reader.getLastRecordNumber());
      assertEquals(9, reader.getLastRecordLineNumber());
    }
  }

  /**
   * Reads a tab-separated fixture and expects four records, the last one ending on line 7.
   *
   * <p>NOTE(review): the trailing argument {@code 2} presumably makes the reader skip leading
   * lines of the fixture before the header; confirm against {@link TabularFiles}.
   */
  @Test
  public void testCsvWithComment() throws IOException, ParseException {
    File fixture = FileUtils.getClasspathFile("csv/tab_separated_generic_comments.txt");

    try (TabularDataFileReader<List<String>> reader =
        TabularFiles.newTabularFileReader(
            Files.newBufferedReader(fixture.toPath(), StandardCharsets.UTF_8),
            '\t',
            "\n",
            null,
            true,
            2)) {

      int recordsRead = drain(reader);

      assertEquals(4, recordsRead);
      assertEquals(4, reader.getLastRecordNumber());
      assertEquals(7, reader.getLastRecordLineNumber());
    }
  }

  /**
   * Compares the first cell of every record with the expected ids, in order. Nine records are
   * expected while the last of them ends on line 12 of the fixture.
   */
  @Test
  public void testIgnoreEmptyLines() throws IOException, ParseException {
    File fixture = FileUtils.getClasspathFile("csv/empty_line.tab");

    try (TabularDataFileReader<List<String>> reader =
        TabularFiles.newTabularFileReader(
            Files.newBufferedReader(fixture.toPath(), StandardCharsets.UTF_8),
            '\t',
            "\n",
            null,
            true)) {
      String[] expectedIds = {"1", "5", "10", "12", "14", "20", "21", "", "30"};
      int index = 0;
      for (List<String> cells = reader.read(); cells != null; cells = reader.read()) {
        assertEquals(expectedIds[index], cells.get(0));
        index++;
      }
      assertEquals(9, reader.getLastRecordNumber());
      assertEquals(12, reader.getLastRecordLineNumber());
    }
  }

  /**
   * Test extracting a CSV file containing embedded JSON, which itself contains escaped quotes.
   *
   * <p>JSON value like: { "test": "value, \"like\" this" }
   *
   * <p>Would become in CSV: "{ ""test"": ""value, \""like\"" this"" }"
   */
  @Test
  public void testCsvJsonEscapedQuotes() throws IOException, ParseException {
    File fixture = FileUtils.getClasspathFile("csv/csv_json_escaped_quotes2.csv");

    try (TabularDataFileReader<List<String>> reader =
        TabularFiles.newTabularFileReader(
            Files.newBufferedReader(fixture.toPath(), StandardCharsets.UTF_8),
            ',',
            "\n",
            '"',
            true)) {

      List<String> cells = reader.read();
      assertEquals(3, cells.size());
      assertEquals("779", cells.get(0));
      // Without the Java escapes: {"jsonKey": "jsonValue"}
      assertEquals("{\"jsonKey\": \"jsonValue\"}", cells.get(1));
      assertEquals("Cambridge, Cambridge", cells.get(2));

      // second record: the inner quotes come back bare, which is not valid JSON
      cells = reader.read();
      assertEquals(
          "{\"jsonKey\": \"jsonValue with a \"quote\" in the middle (invalid JSON)\"}",
          cells.get(1));

      // third record: the backslashes in front of the inner quotes are kept in the value
      cells = reader.read();
      assertEquals(
          "{\"jsonKey\": \"jsonValue with a \\\"quote\\\" in the middle (valid JSON)\"}",
          cells.get(1));
    }
  }

  /**
   * TSV cannot encode tabs or newlines, so there is no escape character: backslashes in the
   * fixture are expected to come back unchanged.
   */
  @Test
  public void testTsvBackslashes() throws IOException, ParseException {
    File fixture = FileUtils.getClasspathFile("tabular/with_backslashes.tsv");

    try (TabularDataFileReader<List<String>> reader =
        TabularFiles.newTabularFileReader(
            Files.newBufferedReader(fixture.toPath(), StandardCharsets.UTF_8),
            '\t',
            "\n",
            null,
            true)) {

      List<String> cells = reader.read();
      assertEquals(2, cells.size());
      assertEquals("key", cells.get(0));
      assertEquals("value", cells.get(1));

      cells = reader.read();
      assertEquals("Around 1\\4 mile along the road", cells.get(1));

      cells = reader.read();
      assertEquals("Near the Cloud\\Mitchell county line", cells.get(1));

      cells = reader.read();
      assertEquals("{\"jKey\": \"jValue with \\\"quotes\\\"\"}", cells.get(1));
    }
  }

  /**
   * Keeps calling {@code read()} on the given reader until it returns {@code null}.
   *
   * @param reader the reader to consume
   * @return the number of non-null records returned by the reader during this call
   */
  private static int drain(TabularDataFileReader<List<String>> reader)
      throws IOException, ParseException {
    int count = 0;
    while (reader.read() != null) {
      count++;
    }
    return count;
  }

  /**
   * Opens the given classpath fixture as a comma-separated file and asserts that the first call
   * to {@code read()} throws a {@link ParseException}.
   *
   * @param resource classpath location of the fixture
   */
  private static void assertFirstReadIsRejected(String resource) throws IOException {
    File fixture = FileUtils.getClasspathFile(resource);

    try (TabularDataFileReader<List<String>> reader =
        TabularFiles.newTabularFileReader(
            Files.newBufferedReader(fixture.toPath(), StandardCharsets.UTF_8),
            ',',
            "\n",
            '"',
            false,
            1)) {
      assertThrows(ParseException.class, reader::read);
    }
  }
}