001/*
002 * Licensed under the Apache License, Version 2.0 (the "License");
003 * you may not use this file except in compliance with the License.
004 * You may obtain a copy of the License at
005 *
006 *     http://www.apache.org/licenses/LICENSE-2.0
007 *
008 * Unless required by applicable law or agreed to in writing, software
009 * distributed under the License is distributed on an "AS IS" BASIS,
010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
011 * See the License for the specific language governing permissions and
012 * limitations under the License.
013 */
014package org.gbif.utils.file.tabular;
015
016import org.gbif.utils.file.FileUtils;
017
018import java.io.File;
019import java.io.IOException;
020import java.nio.charset.StandardCharsets;
021import java.nio.file.Files;
022import java.text.ParseException;
023import java.util.List;
024
025import org.junit.jupiter.api.Test;
026
027import static org.junit.jupiter.api.Assertions.assertEquals;
028import static org.junit.jupiter.api.Assertions.assertThrows;
029
030/**
031 * Unit tests for {@link TabularDataFileReader}.
032 */
033public class TabularDataFileReaderTest {
034
035  @Test
036  public void testCsvOptionalQuotes() throws IOException, ParseException {
037    File csv = FileUtils.getClasspathFile("csv/csv_optional_quotes_excel2008.csv");
038
039    try (TabularDataFileReader<List<String>> reader =
040        TabularFiles.newTabularFileReader(
041            Files.newBufferedReader(csv.toPath(), StandardCharsets.UTF_8), ',', true)) {
042      List<String> rec = reader.read();
043      assertEquals(3, rec.size());
044      assertEquals("1", rec.get(0));
045      assertEquals("This has a, comma", rec.get(2));
046
047      rec = reader.read();
048      assertEquals("I say this is only a \"quote\"", rec.get(2));
049
050      while (rec != null) {
051        rec = reader.read();
052      }
053    }
054  }
055
056  /**
057   * Ensure if we can escape a quote character with a backslash
058   */
059  @Test
060  public void testEscapedQuotes() throws IOException, ParseException {
061    File tsv = FileUtils.getClasspathFile("csv/csv_escaped_quotes.csv");
062    try (TabularDataFileReader<List<String>> reader =
063        TabularFiles.newTabularFileReader(
064            Files.newBufferedReader(tsv.toPath(), StandardCharsets.UTF_8),
065            ',',
066            "\n",
067            '"',
068            false,
069            1)) {
070
071      List<String> rec = reader.read();
072      assertEquals(12, rec.size());
073      assertEquals(
074          "Danish Mycological Society (2017-09-04). Fungal records database (http://svampe.databasen.org), contributed by Frøslev, T., Heilmann-Clausen, J., Jeppesen, T.S., Lange, C., Læssøe, T., Petersen, J.H., Søchting, U., \"Vesterholt\", J.",
075          rec.get(5));
076      assertEquals("{\"Substrate\":\"wood\"}", rec.get(10));
077    }
078  }
079
080  @Test
081  public void testWrongEscapedQuotes1() throws IOException {
082    File tsv = FileUtils.getClasspathFile("csv/csv_wrong_escaped_quotes_1.csv");
083    try (TabularDataFileReader<List<String>> reader =
084        TabularFiles.newTabularFileReader(
085            Files.newBufferedReader(tsv.toPath(), StandardCharsets.UTF_8),
086            ',',
087            "\n",
088            '"',
089            false,
090            1)) {
091      assertThrows(ParseException.class, reader::read);
092    }
093  }
094
095  @Test
096  public void testWrongEscapedQuotes2() throws IOException {
097    File tsv = FileUtils.getClasspathFile("csv/csv_wrong_escaped_quotes_2.csv");
098    try (TabularDataFileReader<List<String>> reader =
099        TabularFiles.newTabularFileReader(
100            Files.newBufferedReader(tsv.toPath(), StandardCharsets.UTF_8),
101            ',',
102            "\n",
103            '"',
104            false,
105            1)) {
106      assertThrows(ParseException.class, reader::read);
107    }
108  }
109
110  /**
111   * Test a CSV with all cells quoted.
112   */
113  @Test
114  public void testCsvAlwaysQuotes() throws IOException, ParseException {
115    File csv = FileUtils.getClasspathFile("csv/csv_always_quoted.csv");
116
117    try (TabularDataFileReader<List<String>> reader =
118        TabularFiles.newTabularFileReader(
119            Files.newBufferedReader(csv.toPath(), StandardCharsets.UTF_8), ',', true)) {
120      List<String> rec = reader.read();
121      // the value we retrieve should not include the quotes
122      assertEquals("8728372", rec.get(0));
123
124      // read all records
125      while (rec != null) {
126        rec = reader.read();
127      }
128      assertEquals(2, reader.getLastRecordNumber());
129      assertEquals(3, reader.getLastRecordLineNumber());
130    }
131  }
132
133  /**
134   * Test a CSV file that includes a newline character (\n) inside a properly quoted cell.
135   */
136  @Test
137  public void testCsvMultiline() throws IOException, ParseException {
138    File csv = FileUtils.getClasspathFile("csv/csv_quote_endline.csv");
139
140    try (TabularDataFileReader<List<String>> reader =
141        TabularFiles.newTabularFileReader(
142            Files.newBufferedReader(csv.toPath(), StandardCharsets.UTF_8), ',', true)) {
143
144      // before we start reading, those methods are expected to return 0
145      assertEquals(0, reader.getLastRecordNumber());
146      assertEquals(0, reader.getLastRecordLineNumber());
147
148      int numberOfRows = 0;
149      List<String> rec = reader.read();
150      while (rec != null) {
151        numberOfRows++;
152        rec = reader.read();
153      }
154
155      assertEquals(3, numberOfRows);
156      assertEquals(3, reader.getLastRecordNumber());
157      assertEquals(7, reader.getLastRecordLineNumber());
158    }
159  }
160
161  /**
162   * Testing classic non quoted tab files with escaped \t tabs.
163   */
164  @Test
165  public void testTab() throws IOException, ParseException {
166    File csv = FileUtils.getClasspathFile("csv/escapedTab.tab");
167    try (TabularDataFileReader<List<String>> reader =
168        TabularFiles.newTabularFileReader(
169            Files.newBufferedReader(csv.toPath(), StandardCharsets.UTF_8), '\t', true)) {
170
171      int numberOfRecords = 0;
172      List<String> rec = reader.read();
173      while (rec != null) {
174        numberOfRecords++;
175        rec = reader.read();
176      }
177
178      assertEquals(8, numberOfRecords);
179      assertEquals(8, reader.getLastRecordNumber());
180      assertEquals(9, reader.getLastRecordLineNumber());
181    }
182  }
183
184  @Test
185  public void testCsvWithComment() throws IOException, ParseException {
186    File csv = FileUtils.getClasspathFile("csv/tab_separated_generic_comments.txt");
187    try (TabularDataFileReader<List<String>> reader =
188        TabularFiles.newTabularFileReader(
189            Files.newBufferedReader(csv.toPath(), StandardCharsets.UTF_8),
190            '\t',
191            "\n",
192            null,
193            true,
194            2)) {
195
196      int numberOfRecords = 0;
197      List<String> rec = reader.read();
198      while (rec != null) {
199        numberOfRecords++;
200        rec = reader.read();
201      }
202
203      assertEquals(4, numberOfRecords);
204      assertEquals(4, reader.getLastRecordNumber());
205      assertEquals(7, reader.getLastRecordLineNumber());
206    }
207  }
208
209  @Test
210  public void testIgnoreEmptyLines() throws IOException, ParseException {
211    File csv = FileUtils.getClasspathFile("csv/empty_line.tab");
212    try (TabularDataFileReader<List<String>> reader =
213        TabularFiles.newTabularFileReader(
214            Files.newBufferedReader(csv.toPath(), StandardCharsets.UTF_8),
215            '\t',
216            "\n",
217            null,
218            true)) {
219      String[] ids = {"1", "5", "10", "12", "14", "20", "21", "", "30"};
220      int row = 0;
221      List<String> line = reader.read();
222      while (line != null) {
223        assertEquals(ids[row], line.get(0));
224        row++;
225        line = reader.read();
226      }
227      assertEquals(9, reader.getLastRecordNumber());
228      assertEquals(12, reader.getLastRecordLineNumber());
229    }
230  }
231
232  /**
233   * Test extracting a CSV file containing embedded JSON, which itself contains escaped quotes.
234   *
235   * JSON value like: { "test": "value, \"like\" this" }
236   *
237   * Would become in CSV: "{ ""test"": ""value, \""like\"" this"" }"
238   */
239  @Test
240  public void testCsvJsonEscapedQuotes() throws IOException, ParseException {
241    File csv = FileUtils.getClasspathFile("csv/csv_json_escaped_quotes2.csv");
242    try (TabularDataFileReader<List<String>> reader =
243        TabularFiles.newTabularFileReader(
244            Files.newBufferedReader(csv.toPath(), StandardCharsets.UTF_8), ',', "\n", '"', true)) {
245
246      List<String> atom = reader.read();
247      assertEquals(3, atom.size());
248      assertEquals("779", atom.get(0));
249      // Without the Java escapes: {"chronostratigraphy": "Cretaceous, Early Cretaceous, Albian -
250      // Late Cretaceous, Cenomanian", "cataloguedescription": "Very worn vertebra. Old catalogue
251      // says \"fragments of bone\".", "created": "2009-05-13", "barcode": "010039076", "project":
252      // "eMesozoic", "determinationnames": "Ornithocheirus", "subdepartment": "Vertebrates",
253      // "lithostratigraphy": "Selborne Group, Upper Greensand Formation, Cambridge Greensand
254      // Member", "imagecategory": ["Register;Specimen"]}
255      assertEquals("{\"jsonKey\": \"jsonValue\"}", atom.get(1));
256      assertEquals("Cambridge, Cambridge", atom.get(2));
257
258      atom = reader.read();
259      assertEquals(
260          "{\"jsonKey\": \"jsonValue with a \"quote\" in the middle (invalid JSON)\"}",
261          atom.get(1));
262
263      atom = reader.read();
264      assertEquals(
265          "{\"jsonKey\": \"jsonValue with a \\\"quote\\\" in the middle (valid JSON)\"}",
266          atom.get(1));
267    }
268  }
269
270  /**
271   * TSV cannot encode tabs or newlines, so there is no escape character.
272   */
273  @Test
274  public void testTsvBackslashes() throws IOException, ParseException {
275    File csv = FileUtils.getClasspathFile("tabular/with_backslashes.tsv");
276    try (TabularDataFileReader<List<String>> reader =
277        TabularFiles.newTabularFileReader(
278            Files.newBufferedReader(csv.toPath(), StandardCharsets.UTF_8),
279            '\t',
280            "\n",
281            null,
282            true)) {
283
284      List<String> atom = reader.read();
285      assertEquals(2, atom.size());
286      assertEquals("key", atom.get(0));
287      assertEquals("value", atom.get(1));
288      atom = reader.read();
289      assertEquals("Around 1\\4 mile along the road", atom.get(1));
290      atom = reader.read();
291      assertEquals("Near the Cloud\\Mitchell county line", atom.get(1));
292      atom = reader.read();
293      assertEquals("{\"jKey\": \"jValue with \\\"quotes\\\"\"}", atom.get(1));
294    }
295  }
296}