/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.gbif.utils.file.csv;

import org.gbif.utils.collection.IterableUtils;
import org.gbif.utils.file.FileUtils;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;

import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;

/**
 * Tests for {@link CSVReader} covering quoted and unquoted fields, header row skipping,
 * empty line tracking and tab delimited input.
 *
 * <p>Every reader is opened in a try-with-resources block so that it is closed even when an
 * assertion fails halfway through a test.
 */
public class CSVReaderTest {

  private static final String UTF8 = StandardCharsets.UTF_8.name();

  /**
   * Reads a file in which every field is quoted and checks selected columns of the second row
   * returned by the reader.
   */
  @Test
  public void testCsvAlwaysQuotes() throws IOException {
    File csv = FileUtils.getClasspathFile("csv/csv_always_quoted.csv");
    try (CSVReader reader = new CSVReader(csv, "utf8", ",", '"', 1)) {
      // the first returned row is not inspected, only the one after it
      reader.next();
      String[] rec = reader.next();
      assertEquals("18728553", rec[0]);
      assertEquals("-0.25864171259110291", rec[6]);
      assertEquals("Martins Wood, Ightham", rec[10]);
    }
  }

  /**
   * Reads in-memory rows that mix quoted fields, unquoted fields, doubled quotes and a
   * delimiter inside a quoted field, and checks the parsed value of every row.
   */
  @Test
  public void testCsvQuotedDelimiter() throws IOException {
    String rows =
        "12,\"not real\"\n"
            + "13,not \"real\"\n"
            + "\"14\",noting\n"
            + "15,\"not \"\"real\"\"\"\n"
            + "16,\"no, this is \"\"real\"\"\"\n";

    InputStream stream = new ByteArrayInputStream(rows.getBytes(StandardCharsets.UTF_8));
    try (CSVReader reader = new CSVReader(stream, UTF8, ",", '"', 0)) {
      String[] rec = reader.next();
      assertEquals(2, rec.length);
      assertEquals("12", rec[0]);
      assertEquals("not real", rec[1]);

      rec = reader.next();
      assertEquals(2, rec.length);
      assertEquals("13", rec[0]);
      assertEquals("not \"real\"", rec[1]);

      rec = reader.next();
      assertEquals(2, rec.length);
      assertEquals("14", rec[0]);
      assertEquals("noting", rec[1]);

      rec = reader.next();
      assertEquals(2, rec.length);
      assertEquals("15", rec[0]);
      assertEquals("not \"real\"", rec[1]);

      rec = reader.next();
      assertEquals(2, rec.length);
      assertEquals("16", rec[0]);
      assertEquals("no, this is \"real\"", rec[1]);

      assertFalse(reader.hasNext());
    }
  }

  /**
   * csv file with optional quotes generated by excel.
   * single, double quotes and comma within a field are tested.
   */
  @Test
  public void testCsvOptionalQuotes() throws IOException {
    File csv = FileUtils.getClasspathFile("csv/csv_optional_quotes_excel2008.csv");
    try (CSVReader reader = new CSVReader(csv, UTF8, ",", '"', 1)) {
      String[] atom = reader.next();
      assertEquals(3, atom.length);
      assertEquals("1", atom[0]);
      assertEquals("This has a, comma", atom[2]);

      atom = reader.next();
      assertEquals("I say this is only a \"quote\"", atom[2]);

      atom = reader.next();
      assertEquals("What though, \"if you have a quote\" and a comma", atom[2]);

      atom = reader.next();
      assertEquals("What, if we have a \"quote, which has a comma, or 2\"", atom[2]);
    }
  }

  /**
   * Tests the csv reader with different numbers of header rows on the same file and compares
   * the 4th line in the text file for each of them.
   */
  @Test
  public void testHeaderRows() throws IOException {
    File source = FileUtils.getClasspathFile("csv/iucn100.csv");

    // the same line must be returned no matter how many rows are skipped as headers
    String[] row4h1;
    try (CSVReader reader = new CSVReader(source, UTF8, ",", '"', 1)) {
      reader.next();
      reader.next();
      row4h1 = reader.next();
    }

    String[] row4h0;
    try (CSVReader reader = new CSVReader(source, UTF8, ",", '"', 0)) {
      reader.next();
      reader.next();
      reader.next();
      row4h0 = reader.next();
    }

    String[] row4h3;
    try (CSVReader reader = new CSVReader(source, UTF8, ",", '"', 3)) {
      row4h3 = reader.next();
    }

    // assertEquals reports both lengths on failure, unlike assertTrue(a == b)
    assertEquals(row4h0.length, row4h1.length, "column count with 1 header row");
    assertEquals(row4h0.length, row4h3.length, "column count with 3 header rows");
    for (int idx = 0; idx < row4h0.length; idx++) {
      assertEquals(row4h0[idx], row4h1[idx], "column " + idx + " with 1 header row");
      assertEquals(row4h0[idx], row4h3[idx], "column " + idx + " with 3 header rows");
    }
  }

  /**
   * Test if skip header rows is working with larger settings.
   */
  @Test
  public void testHeaderRows2() throws IOException {
    File source = FileUtils.getClasspathFile("csv/iucn100.csv");
    try (CSVReader reader = new CSVReader(source, UTF8, ",", '"', 7)) {
      // only the first row handed out by the iterable is inspected
      boolean rowChecked = false;
      for (String[] row : IterableUtils.iterable(reader)) {
        assertEquals("9", row[0]);
        assertEquals("Aaptosyax grypus Rainboth, 1991", row[1]);
        assertEquals("Actinopterygii", row[4]);
        rowChecked = true;
        break;
      }
      // without this guard the test would pass silently if the reader yielded no rows at all
      assertTrue(rowChecked, "expected at least one row after skipping 7 header rows");
    }
  }

  /**
   * Reads a tab delimited file containing empty lines, checks the first column of every
   * returned row against the expected ids and verifies the empty lines the reader reports.
   */
  @Test
  public void testIgnoreEmptyLines() throws IOException {
    File csv = FileUtils.getClasspathFile("csv/empty_line.tab");
    try (CSVReader reader = new CSVReader(csv, UTF8, "\t", null, 1)) {
      String[] ids = {"1", "5", "10", "12", "14", "20", "21", "", "30"};
      int row = 0;
      while (reader.hasNext()) {
        String[] rec = reader.next();
        assertEquals(ids[row], rec[0]);
        row++;
      }
      assertTrue(reader.getEmptyLines().size() > 1);
      assertTrue(reader.getEmptyLines().contains(6));
      assertTrue(reader.getEmptyLines().contains(12));
    }
  }

  /**
   * Testing classic non quoted tab files with escaped \t tabs.
   */
  @Test
  public void testTab() throws IOException {
    // build archive from single tab file
    File source = FileUtils.getClasspathFile("csv/escapedTab.tab");
    try (CSVReader reader = new CSVReader(source, UTF8, "\t", null, 1)) {
      // there should be 8 rows; only the row count is verified here
      int lineCount = 0;
      while (reader.next() != null) {
        lineCount++;
      }
      assertEquals(8, lineCount);
    }
  }

  /**
   * Test extracting a CSV file containing embedded JSON, which itself contains escaped quotes.
   *
   * JSON value like: { "test": "value, \"like\" this" }
   *
   * Would become in CSV: "{ ""test"": ""value, \""like\"" this"" }"
   */
  @Test
  public void testCsvJsonEscapedQuotes() throws IOException {
    File csv = FileUtils.getClasspathFile("csv/csv_json_escaped_quotes.csv");
    try (CSVReader reader = new CSVReader(csv, UTF8, ",", '"', 1)) {
      String[] atom = reader.next();
      assertEquals(71, atom.length);
      assertEquals("779", atom[0]);
      assertEquals("Cambridge, Cambridge", atom[62]);
      // Without the Java escapes: {"chronostratigraphy": "Cretaceous, Early Cretaceous, Albian - Late
      // Cretaceous, Cenomanian", "cataloguedescription": "Very worn vertebra. Old catalogue says
      // \"fragments of bone\".", "created": "2009-05-13", "barcode": "010039076", "project":
      // "eMesozoic", "determinationnames": "Ornithocheirus", "subdepartment": "Vertebrates",
      // "lithostratigraphy": "Selborne Group, Upper Greensand Formation, Cambridge Greensand Member",
      // "imagecategory": ["Register;Specimen"]}
      assertEquals(
          "{\"chronostratigraphy\": \"Cretaceous, Early Cretaceous, Albian - Late Cretaceous, Cenomanian\", \"cataloguedescription\": \"Very worn vertebra. Old catalogue says \\\"fragments of bone\\\".\", \"created\": \"2009-05-13\", \"barcode\": \"010039076\", \"project\": \"eMesozoic\", \"determinationnames\": \"Ornithocheirus\", \"subdepartment\": \"Vertebrates\", \"lithostratigraphy\": \"Selborne Group, Upper Greensand Formation, Cambridge Greensand Member\", \"imagecategory\": [\"Register;Specimen\"]}",
          atom[2]);
    }
  }
}