Source code

001/*
002 * Licensed under the Apache License, Version 2.0 (the "License");
003 * you may not use this file except in compliance with the License.
004 * You may obtain a copy of the License at
005 *
006 *     http://www.apache.org/licenses/LICENSE-2.0
007 *
008 * Unless required by applicable law or agreed to in writing, software
009 * distributed under the License is distributed on an "AS IS" BASIS,
010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
011 * See the License for the specific language governing permissions and
012 * limitations under the License.
013 */
014package org.gbif.utils.file.tabular;
015
016import org.gbif.utils.file.FileUtils;
017
018import java.io.IOException;
019import java.nio.charset.Charset;
020import java.nio.charset.StandardCharsets;
021import java.nio.file.Path;
022import java.util.ArrayList;
023import java.util.List;
024import java.util.Map;
025import java.util.Set;
026
027import org.junit.jupiter.api.Test;
028
029import static org.gbif.utils.file.tabular.TabularFileMetadataExtractor.computeLineDelimiterStats;
030import static org.gbif.utils.file.tabular.TabularFileMetadataExtractor.extractTabularFileMetadata;
031import static org.gbif.utils.file.tabular.TabularFileMetadataExtractor.getDelimiterWithHighestCount;
032import static org.gbif.utils.file.tabular.TabularFileMetadataExtractor.getQuoteCharWithHighestCount;
033import static org.junit.jupiter.api.Assertions.assertEquals;
034import static org.junit.jupiter.api.Assertions.assertFalse;
035import static org.junit.jupiter.api.Assertions.assertNotNull;
036import static org.junit.jupiter.api.Assertions.assertNull;
037
038/**
039 * Unit tests related to {@link TabularFileMetadataExtractor}
040 */
041public class TabularFileMetadataExtractorTest {
042
043  @Test
044  public void testComputeDelimiterFrequencySums() {
045    List<String> sample = new ArrayList<>();
046    sample.add("ID\tName\tName2\tName3");
047    sample.add("1\ta\tb\tc,1");
048    sample.add("2\tc\td\te,2");
049    sample.add("3\tf\tg\th,3");
050
051    List<TabularFileMetadataExtractor.LineDelimiterStats> linesStats =
052        computeLineDelimiterStats(sample);
053    Map<Character, Integer> delimiterFrequencySums =
054        TabularFileMetadataExtractor.computeDelimiterFrequencySums(linesStats);
055    // here, the delimiter that is used the most often is in fact the correct one
056    assertEquals(12, delimiterFrequencySums.get('\t').intValue());
057    assertEquals(3, delimiterFrequencySums.get(',').intValue());
058
059    // add a "noise" line to demonstrate the impact on this function
060    sample.add("4\ti\tj\tk,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4");
061    linesStats = computeLineDelimiterStats(sample);
062    delimiterFrequencySums = TabularFileMetadataExtractor.computeDelimiterFrequencySums(linesStats);
063    // here, the delimiter that is used the most often is the wrong one
064    assertEquals(15, delimiterFrequencySums.get('\t').intValue());
065    assertEquals(34, delimiterFrequencySums.get(',').intValue());
066  }
067
068  @Test
069  public void testComputeDelimiterDistinctFrequency() {
070    List<String> sample = new ArrayList<>();
071    sample.add("ID\tName\tName2\tName3");
072    sample.add("1\ta\tb\tc,1");
073    sample.add("2\tc\td\te,2");
074    sample.add("3\tf\tg\th,3");
075
076    List<TabularFileMetadataExtractor.LineDelimiterStats> linesStats =
077        computeLineDelimiterStats(sample);
078    Map<Character, Set<Integer>> delimiterDistinctFrequency =
079        TabularFileMetadataExtractor.computeDelimiterDistinctFrequency(linesStats);
080
081    // here, the delimiter with the most stable frequency is the correct one
082    assertEquals(1, delimiterDistinctFrequency.get('\t').size());
083    assertEquals(2, delimiterDistinctFrequency.get(',').size());
084
085    sample.add("4\ti\t\"j\t\"\tk,4");
086    sample.add("5\tl\t\"m\t\t\"\tn,5");
087    linesStats = computeLineDelimiterStats(sample);
088    delimiterDistinctFrequency =
089        TabularFileMetadataExtractor.computeDelimiterDistinctFrequency(linesStats);
090    // here, the delimiter that is the most stable is now the wrong one (because of the delimiter
091    // inside the quoted text)
092    assertEquals(3, delimiterDistinctFrequency.get('\t').size());
093    assertEquals(2, delimiterDistinctFrequency.get(',').size());
094  }
095
096  @Test
097  public void testComputeDelimiterHighestFrequencyPerLine() {
098    List<String> sample = new ArrayList<>();
099    sample.add("ID\tName\tName2\tName3");
100    sample.add("1\ta\tb\tc,1");
101    sample.add("2\tc\td\te,2");
102    sample.add("3\tf\tg\th,3");
103
104    Map<Character, Long> delimiterDistinctFrequency =
105        TabularFileMetadataExtractor.computeDelimiterHighestFrequencyPerLine(sample);
106
107    assertEquals(4, delimiterDistinctFrequency.get('\t').intValue());
108    assertNull(delimiterDistinctFrequency.get(','));
109
110    // this line alone won't have an impact on computeDelimiterHighestFrequencyPerLine result
111    sample.add("4\ti\tj\tk,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4");
112    delimiterDistinctFrequency =
113        TabularFileMetadataExtractor.computeDelimiterHighestFrequencyPerLine(sample);
114    assertEquals(4, delimiterDistinctFrequency.get('\t').intValue());
115    assertEquals(1, delimiterDistinctFrequency.get(',').intValue());
116  }
117
118  @Test
119  public void testExtractTabularMetadata() {
120    List<String> sample = new ArrayList<>();
121    sample.add("OccurrenceID,ScientificName,Locality");
122    sample.add("1,Gadus morhua,\"This has a, comma\"");
123    sample.add("2,Abies alba,\"I say this is only a \"\"quote\"\"\"");
124    sample.add("3,Pomatoma saltatrix,\"What though, \"\"if you have a quote\"\" and a comma\"");
125    sample.add("4,Yikes ofcourses,\"What, if we have a \"\"quote, which has a comma, or 2\"\"\"");
126
127    TabularFileMetadata metadata = TabularFileMetadataExtractor.extractTabularMetadata(sample);
128    assertEquals(Character.valueOf(',').charValue(), metadata.getDelimiter().charValue());
129    assertEquals(Character.valueOf('\"'), metadata.getQuotedBy());
130  }
131
132  @Test
133  public void testSingleLineWithSeparatorAsValue() {
134    List<String> sample = new ArrayList<>();
135    sample.add("ID\tName\tName1\tName2");
136    sample.add("1\ta\tb\t,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,");
137    sample.add("2\tc\td\te");
138    sample.add("3\tf\tg\th");
139
140    TabularFileMetadata metadata = TabularFileMetadataExtractor.extractTabularMetadata(sample);
141    assertEquals(Character.valueOf('\t').charValue(), metadata.getDelimiter().charValue());
142    assertNull(metadata.getQuotedBy());
143
144    // try another version
145    sample.clear();
146    sample.add("1\tCarlos");
147    sample.add("2\tPeter, Karl & Inge");
148    sample.add("3\tCarla, Klara, Berit, Susanna");
149    sample.add("4\tFoo & Bar");
150    metadata = TabularFileMetadataExtractor.extractTabularMetadata(sample);
151    assertEquals(Character.valueOf('\t').charValue(), metadata.getDelimiter().charValue());
152    assertNull(metadata.getQuotedBy());
153  }
154
155  @Test
156  public void testGetDelimiterWithHighestCount() {
157    // no delimiter
158    assertFalse(getDelimiterWithHighestCount("there is no delimiter here").isPresent());
159
160    assertEquals(
161        Character.valueOf(',').charValue(),
162        getDelimiterWithHighestCount("a,b,c,d,e").get().charValue());
163    assertEquals(
164        Character.valueOf('|').charValue(),
165        getDelimiterWithHighestCount("a|b,c|d|e").get().charValue());
166    assertEquals(
167        Character.valueOf('\t').charValue(),
168        getDelimiterWithHighestCount("a\tb\tc\td\te").get().charValue());
169    assertEquals(
170        Character.valueOf(';').charValue(),
171        getDelimiterWithHighestCount("a; b; c; d; e").get().charValue());
172  }
173
174  @Test
175  public void testGetQuoteCharWithHighestCount() {
176    // no quote character
177    assertFalse(getQuoteCharWithHighestCount("a,b,c,d", ',').isPresent());
178
179    // test double quote character and ensure the result is not affected by another quote character
180    // that is not used for quoting
181    assertEquals(
182        Character.valueOf('\"').charValue(),
183        getQuoteCharWithHighestCount("a,\"b,8\",c\'\'\'\'\'\'\'\'\'\'\',d", ',').get().charValue());
184
185    // test single quote character
186    assertEquals(
187        Character.valueOf('\'').charValue(),
188        getQuoteCharWithHighestCount("a,\'b,8\',c,d", ',').get().charValue());
189  }
190
191  @Test
192  public void detectCsvAlwaysQuoted() throws IOException {
193    TabularFileMetadata tabFileMetadata =
194        extractTabularFileMetadata(
195            FileUtils.getClasspathFile("csv/csv_always_quoted.csv").toPath());
196    assertEquals(',', tabFileMetadata.getDelimiter().charValue());
197    assertEquals('"', tabFileMetadata.getQuotedBy().charValue());
198  }
199
200  @Test
201  public void detectPipe() throws IOException {
202    runExtractTabularFileMetadata("csv/pipe_separator.txt", '|', null, StandardCharsets.UTF_8);
203  }
204
205  @Test
206  public void detectSemicolon() throws IOException {
207    runExtractTabularFileMetadata("csv/semicolon_separator.csv", ';', null, StandardCharsets.UTF_8);
208  }
209
210  @Test
211  public void detectTab() throws IOException {
212    String[] files = {
213      "csv/ipni.tab.txt",
214      "csv/tab_separated_generic.txt",
215      "csv/iucn100.tab.txt",
216      "csv/ebird.tab.txt",
217      "csv/empty_line.tab",
218      "csv/irmng.tail",
219      "csv/MOBOT.tab.csv"
220    };
221    for (String fn : files) {
222      runExtractTabularFileMetadata(fn, '\t', null, StandardCharsets.UTF_8);
223    }
224  }
225
226  @Test
227  public void detectTabQuoted() throws IOException {
228    String[] files = {
229      "csv/eol/my_darwincore_tab_separated_quoted.txt",
230      "csv/eol/my_dataobject_tab_separated_quoted.txt",
231      "csv/borza_tab_separated_quoted.txt"
232    };
233    for (String fn : files) {
234      runExtractTabularFileMetadata(fn, '\t', '"', StandardCharsets.UTF_8);
235    }
236  }
237
238  private static void runExtractTabularFileMetadata(
239      String classPathFile,
240      Character expectedDelimiter,
241      Character expectedQuoteChar,
242      Charset expectedCharset)
243      throws IOException {
244    Path source = FileUtils.getClasspathFile(classPathFile).toPath();
245    TabularFileMetadata tabFileMetadata = extractTabularFileMetadata(source);
246    assertEquals(expectedDelimiter.charValue(), tabFileMetadata.getDelimiter().charValue());
247
248    if (expectedQuoteChar == null) {
249      assertNull(tabFileMetadata.getQuotedBy());
250    } else {
251      assertNotNull(tabFileMetadata.getQuotedBy(), "Expect a quote character -> " + source);
252      assertEquals(expectedQuoteChar, tabFileMetadata.getQuotedBy(), "Source file -> " + source);
253    }
254
255    assertEquals(expectedCharset, tabFileMetadata.getEncoding());
256  }
257
258  @Test
259  public void detectEncoding() throws IOException {
260    runExtractTabularFileMetadata(
261        "tabular/test_encoding_detection.iso-8859-1.csv", ',', null, StandardCharsets.ISO_8859_1);
262    runExtractTabularFileMetadata(
263        "tabular/test_encoding_detection.utf-8.csv", ',', null, StandardCharsets.UTF_8);
264  }
265}