001/*
002 * Licensed under the Apache License, Version 2.0 (the "License");
003 * you may not use this file except in compliance with the License.
004 * You may obtain a copy of the License at
005 *
006 *     http://www.apache.org/licenses/LICENSE-2.0
007 *
008 * Unless required by applicable law or agreed to in writing, software
009 * distributed under the License is distributed on an "AS IS" BASIS,
010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
011 * See the License for the specific language governing permissions and
012 * limitations under the License.
013 */
014package org.gbif.utils.file.csv;
015
016import org.gbif.utils.collection.IterableUtils;
017import org.gbif.utils.file.FileUtils;
018
019import java.io.ByteArrayInputStream;
020import java.io.File;
021import java.io.IOException;
022import java.io.InputStream;
023import java.nio.charset.StandardCharsets;
024
025import org.junit.jupiter.api.Test;
026
027import static org.junit.jupiter.api.Assertions.assertEquals;
028import static org.junit.jupiter.api.Assertions.assertFalse;
029import static org.junit.jupiter.api.Assertions.assertTrue;
030
031public class CSVReaderTest {
032
033  private static final String UTF8 = StandardCharsets.UTF_8.name();
034
035  @Test
036  public void testCsvAlwaysQuotes() throws IOException {
037    File csv = FileUtils.getClasspathFile("csv/csv_always_quoted.csv");
038    try (CSVReader reader = new CSVReader(csv, "utf8", ",", '"', 1)) {
039      String[] rec = reader.next();
040      rec = reader.next();
041      assertEquals("18728553", rec[0]);
042      assertEquals("-0.25864171259110291", rec[6]);
043      assertEquals("Martins Wood, Ightham", rec[10]);
044    }
045  }
046
047  @Test
048  public void testCsvQuotedDelimiter() throws IOException {
049    String rows =
050        "12,\"not real\"\n"
051            + "13,not \"real\"\n"
052            + "\"14\",noting\n"
053            + "15,\"not \"\"real\"\"\"\n"
054            + "16,\"no, this is \"\"real\"\"\"\n";
055
056    InputStream stream = new ByteArrayInputStream(rows.getBytes(StandardCharsets.UTF_8));
057    CSVReader reader = new CSVReader(stream, UTF8, ",", '"', 0);
058
059    String[] rec = reader.next();
060    assertEquals(2, rec.length);
061    assertEquals("12", rec[0]);
062    assertEquals("not real", rec[1]);
063
064    rec = reader.next();
065    assertEquals(2, rec.length);
066    assertEquals("13", rec[0]);
067    assertEquals("not \"real\"", rec[1]);
068
069    rec = reader.next();
070    assertEquals(2, rec.length);
071    assertEquals("14", rec[0]);
072    assertEquals("noting", rec[1]);
073
074    rec = reader.next();
075    assertEquals(2, rec.length);
076    assertEquals("15", rec[0]);
077    assertEquals("not \"real\"", rec[1]);
078
079    rec = reader.next();
080    assertEquals(2, rec.length);
081    assertEquals("16", rec[0]);
082    assertEquals("no, this is \"real\"", rec[1]);
083
084    assertFalse(reader.hasNext());
085  }
086
087  /**
088   * csv file with optional quotes generated by excel.
089   * single, double quotes and comma within a field are tested.
090   */
091  @Test
092  public void testCsvOptionalQuotes() throws IOException {
093    File csv = FileUtils.getClasspathFile("csv/csv_optional_quotes_excel2008.csv");
094    CSVReader reader = new CSVReader(csv, UTF8, ",", '"', 1);
095
096    String[] atom = reader.next();
097    assertEquals(3, atom.length);
098    assertEquals("1", atom[0]);
099    assertEquals("This has a, comma", atom[2]);
100
101    atom = reader.next();
102    assertEquals("I say this is only a \"quote\"", atom[2]);
103
104    atom = reader.next();
105    assertEquals("What though, \"if you have a quote\" and a comma", atom[2]);
106
107    atom = reader.next();
108    assertEquals("What, if we have a \"quote, which has a comma, or 2\"", atom[2]);
109
110    reader.close();
111  }
112
113  /**
114   * tests the csv reader with different number of header rows on the same file and compares the 4th line in the text
115   * file for each of them
116   */
117  @Test
118  public void testHeaderRows() throws IOException {
119    File source = FileUtils.getClasspathFile("csv/iucn100.csv");
120    // assert the headers are the same, no matter how many rows we skip for the iterator
121    CSVReader reader = new CSVReader(source, UTF8, ",", '"', 1);
122    reader.next();
123    reader.next();
124    String[] row4h1 = reader.next();
125    reader.close();
126
127    reader = new CSVReader(source, UTF8, ",", '"', 0);
128    reader.next();
129    reader.next();
130    reader.next();
131    String[] row4h0 = reader.next();
132    reader.close();
133
134    reader = new CSVReader(source, UTF8, ",", '"', 3);
135    String[] row4h3 = reader.next();
136    reader.close();
137
138    assertTrue(row4h0.length == row4h1.length);
139    assertTrue(row4h0.length == row4h3.length);
140    int idx = row4h0.length;
141    while (idx > 0) {
142      idx--;
143      assertEquals(row4h0[idx], row4h1[idx]);
144      assertEquals(row4h0[idx], row4h3[idx]);
145    }
146  }
147
148  /**
149   * Test if skip header rows is working with larger settings.
150   */
151  @Test
152  public void testHeaderRows2() throws IOException {
153    File source = FileUtils.getClasspathFile("csv/iucn100.csv");
154    CSVReader reader = new CSVReader(source, UTF8, ",", '"', 7);
155    for (String[] row : IterableUtils.iterable(reader)) {
156      assertEquals("9", row[0]);
157      assertEquals("Aaptosyax grypus Rainboth, 1991", row[1]);
158      assertEquals("Actinopterygii", row[4]);
159      break;
160    }
161  }
162
163  @Test
164  public void testIgnoreEmptyLines() throws IOException {
165    File csv = FileUtils.getClasspathFile("csv/empty_line.tab");
166    CSVReader reader = new CSVReader(csv, UTF8, "\t", null, 1);
167    String[] ids = {"1", "5", "10", "12", "14", "20", "21", "", "30"};
168    int row = 0;
169    while (reader.hasNext()) {
170      String[] rec = reader.next();
171      assertEquals(ids[row], rec[0]);
172      row++;
173    }
174    assertTrue(reader.getEmptyLines().size() > 1);
175    assertTrue(reader.getEmptyLines().contains(6));
176    assertTrue(reader.getEmptyLines().contains(12));
177  }
178
179  /**
180   * Testing classic non quoted tab files with escaped \t tabs.
181   */
182  @Test
183  public void testTab() throws IOException {
184    // build archive from single tab file
185    File source = FileUtils.getClasspathFile("csv/escapedTab.tab");
186    CSVReader reader = new CSVReader(source, UTF8, "\t", null, 1);
187
188    // there should be 8 rows, each with 58 columns
189    String[] line;
190    int lineCount = 0;
191    while ((line = reader.next()) != null) {
192      lineCount++;
193    }
194    assertEquals(8, lineCount);
195  }
196
197  /**
198   * Test extracting a CSV file containing embedded JSON, which itself contains escaped quotes.
199   *
200   * JSON value like: { "test": "value, \"like\" this" }
201   *
202   * Would become in CSV: "{ ""test"": ""value, \""like\"" this"" }"
203   */
204  @Test
205  public void testCsvJsonEscapedQuotes() throws IOException {
206    File csv = FileUtils.getClasspathFile("csv/csv_json_escaped_quotes.csv");
207    CSVReader reader = new CSVReader(csv, UTF8, ",", '"', 1);
208
209    String[] atom = reader.next();
210    assertEquals(71, atom.length);
211    assertEquals("779", atom[0]);
212    assertEquals("Cambridge, Cambridge", atom[62]);
213    // Without the Java escapes: {"chronostratigraphy": "Cretaceous, Early Cretaceous, Albian - Late
214    // Cretaceous, Cenomanian", "cataloguedescription": "Very worn vertebra. Old catalogue says
215    // \"fragments of bone\".", "created": "2009-05-13", "barcode": "010039076", "project":
216    // "eMesozoic", "determinationnames": "Ornithocheirus", "subdepartment": "Vertebrates",
217    // "lithostratigraphy": "Selborne Group, Upper Greensand Formation, Cambridge Greensand Member",
218    // "imagecategory": ["Register;Specimen"]}
219    assertEquals(
220        "{\"chronostratigraphy\": \"Cretaceous, Early Cretaceous, Albian - Late Cretaceous, Cenomanian\", \"cataloguedescription\": \"Very worn vertebra. Old catalogue says \\\"fragments of bone\\\".\", \"created\": \"2009-05-13\", \"barcode\": \"010039076\", \"project\": \"eMesozoic\", \"determinationnames\": \"Ornithocheirus\", \"subdepartment\": \"Vertebrates\", \"lithostratigraphy\": \"Selborne Group, Upper Greensand Formation, Cambridge Greensand Member\", \"imagecategory\": [\"Register;Specimen\"]}",
221        atom[2]);
222
223    reader.close();
224  }
225}