001/* 002 * Licensed under the Apache License, Version 2.0 (the "License"); 003 * you may not use this file except in compliance with the License. 004 * You may obtain a copy of the License at 005 * 006 * http://www.apache.org/licenses/LICENSE-2.0 007 * 008 * Unless required by applicable law or agreed to in writing, software 009 * distributed under the License is distributed on an "AS IS" BASIS, 010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 011 * See the License for the specific language governing permissions and 012 * limitations under the License. 013 */ 014package org.gbif.utils.file.tabular; 015 016import org.gbif.utils.file.FileUtils; 017 018import java.io.IOException; 019import java.nio.charset.Charset; 020import java.nio.charset.StandardCharsets; 021import java.nio.file.Path; 022import java.util.ArrayList; 023import java.util.List; 024import java.util.Map; 025import java.util.Set; 026 027import org.junit.jupiter.api.Test; 028 029import static org.gbif.utils.file.tabular.TabularFileMetadataExtractor.computeLineDelimiterStats; 030import static org.gbif.utils.file.tabular.TabularFileMetadataExtractor.extractTabularFileMetadata; 031import static org.gbif.utils.file.tabular.TabularFileMetadataExtractor.getDelimiterWithHighestCount; 032import static org.gbif.utils.file.tabular.TabularFileMetadataExtractor.getQuoteCharWithHighestCount; 033import static org.junit.jupiter.api.Assertions.assertEquals; 034import static org.junit.jupiter.api.Assertions.assertFalse; 035import static org.junit.jupiter.api.Assertions.assertNotNull; 036import static org.junit.jupiter.api.Assertions.assertNull; 037 038/** 039 * Unit tests related to {@link TabularFileMetadataExtractor} 040 */ 041public class TabularFileMetadataExtractorTest { 042 043 @Test 044 public void testComputeDelimiterFrequencySums() { 045 List<String> sample = new ArrayList<>(); 046 sample.add("ID\tName\tName2\tName3"); 047 sample.add("1\ta\tb\tc,1"); 048 sample.add("2\tc\td\te,2"); 049 sample.add("3\tf\tg\th,3"); 050 051 List<TabularFileMetadataExtractor.LineDelimiterStats> linesStats = 052 computeLineDelimiterStats(sample); 053 Map<Character, Integer> delimiterFrequencySums = 054 TabularFileMetadataExtractor.computeDelimiterFrequencySums(linesStats); 055 // here, the delimiter that is used the most often is in fact the correct one 056 assertEquals(12, delimiterFrequencySums.get('\t').intValue()); 057 assertEquals(3, delimiterFrequencySums.get(',').intValue()); 058 059 // add a "noise" line to demonstrate the impact on this function 060 sample.add("4\ti\tj\tk,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4"); 061 linesStats = computeLineDelimiterStats(sample); 062 delimiterFrequencySums = TabularFileMetadataExtractor.computeDelimiterFrequencySums(linesStats); 063 // here, the delimiter that is used the most often is the wrong one 064 assertEquals(15, delimiterFrequencySums.get('\t').intValue()); 065 assertEquals(34, delimiterFrequencySums.get(',').intValue()); 066 } 067 068 @Test 069 public void testComputeDelimiterDistinctFrequency() { 070 List<String> sample = new ArrayList<>(); 071 sample.add("ID\tName\tName2\tName3"); 072 sample.add("1\ta\tb\tc,1"); 073 sample.add("2\tc\td\te,2"); 074 sample.add("3\tf\tg\th,3"); 075 076 List<TabularFileMetadataExtractor.LineDelimiterStats> linesStats = 077 computeLineDelimiterStats(sample); 078 Map<Character, Set<Integer>> delimiterDistinctFrequency = 079 TabularFileMetadataExtractor.computeDelimiterDistinctFrequency(linesStats); 080 081 // here, the delimiter with the most stable frequency is the correct one 082 assertEquals(1, delimiterDistinctFrequency.get('\t').size()); 083 assertEquals(2, delimiterDistinctFrequency.get(',').size()); 084 085 sample.add("4\ti\t\"j\t\"\tk,4"); 086 sample.add("5\tl\t\"m\t\t\"\tn,5"); 087 linesStats = computeLineDelimiterStats(sample); 088 delimiterDistinctFrequency = 089 TabularFileMetadataExtractor.computeDelimiterDistinctFrequency(linesStats); 090 // here, the delimiter that is the most stable is now the wrong one (because of the delimiter 091 // inside the quoted text) 092 assertEquals(3, delimiterDistinctFrequency.get('\t').size()); 093 assertEquals(2, delimiterDistinctFrequency.get(',').size()); 094 } 095 096 @Test 097 public void testComputeDelimiterHighestFrequencyPerLine() { 098 List<String> sample = new ArrayList<>(); 099 sample.add("ID\tName\tName2\tName3"); 100 sample.add("1\ta\tb\tc,1"); 101 sample.add("2\tc\td\te,2"); 102 sample.add("3\tf\tg\th,3"); 103 104 Map<Character, Long> delimiterDistinctFrequency = 105 TabularFileMetadataExtractor.computeDelimiterHighestFrequencyPerLine(sample); 106 107 assertEquals(4, delimiterDistinctFrequency.get('\t').intValue()); 108 assertNull(delimiterDistinctFrequency.get(',')); 109 110 // this line alone won't have an impact on computeDelimiterHighestFrequencyPerLine result 111 sample.add("4\ti\tj\tk,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4"); 112 delimiterDistinctFrequency = 113 TabularFileMetadataExtractor.computeDelimiterHighestFrequencyPerLine(sample); 114 assertEquals(4, delimiterDistinctFrequency.get('\t').intValue()); 115 assertEquals(1, delimiterDistinctFrequency.get(',').intValue()); 116 } 117 118 @Test 119 public void testExtractTabularMetadata() { 120 List<String> sample = new ArrayList<>(); 121 sample.add("OccurrenceID,ScientificName,Locality"); 122 sample.add("1,Gadus morhua,\"This has a, comma\""); 123 sample.add("2,Abies alba,\"I say this is only a \"\"quote\"\"\""); 124 sample.add("3,Pomatoma saltatrix,\"What though, \"\"if you have a quote\"\" and a comma\""); 125 sample.add("4,Yikes ofcourses,\"What, if we have a \"\"quote, which has a comma, or 2\"\"\""); 126 127 TabularFileMetadata metadata = TabularFileMetadataExtractor.extractTabularMetadata(sample); 128 assertEquals(Character.valueOf(',').charValue(), metadata.getDelimiter().charValue()); 129 assertEquals(Character.valueOf('\"'), metadata.getQuotedBy()); 130 } 131 132 @Test 133 public void testSingleLineWithSeparatorAsValue() { 134 List<String> sample = new ArrayList<>(); 135 sample.add("ID\tName\tName1\tName2"); 136 sample.add("1\ta\tb\t,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"); 137 sample.add("2\tc\td\te"); 138 sample.add("3\tf\tg\th"); 139 140 TabularFileMetadata metadata = TabularFileMetadataExtractor.extractTabularMetadata(sample); 141 assertEquals(Character.valueOf('\t').charValue(), metadata.getDelimiter().charValue()); 142 assertNull(metadata.getQuotedBy()); 143 144 // try another version 145 sample.clear(); 146 sample.add("1\tCarlos"); 147 sample.add("2\tPeter, Karl & Inge"); 148 sample.add("3\tCarla, Klara, Berit, Susanna"); 149 sample.add("4\tFoo & Bar"); 150 metadata = TabularFileMetadataExtractor.extractTabularMetadata(sample); 151 assertEquals(Character.valueOf('\t').charValue(), metadata.getDelimiter().charValue()); 152 assertNull(metadata.getQuotedBy()); 153 } 154 155 @Test 156 public void testGetDelimiterWithHighestCount() { 157 // no delimiter 158 assertFalse(getDelimiterWithHighestCount("there is no delimiter here").isPresent()); 159 160 assertEquals( 161 Character.valueOf(',').charValue(), 162 getDelimiterWithHighestCount("a,b,c,d,e").get().charValue()); 163 assertEquals( 164 Character.valueOf('|').charValue(), 165 getDelimiterWithHighestCount("a|b,c|d|e").get().charValue()); 166 assertEquals( 167 Character.valueOf('\t').charValue(), 168 getDelimiterWithHighestCount("a\tb\tc\td\te").get().charValue()); 169 assertEquals( 170 Character.valueOf(';').charValue(), 171 getDelimiterWithHighestCount("a; b; c; d; e").get().charValue()); 172 } 173 174 @Test 175 public void testGetQuoteCharWithHighestCount() { 176 // no quote character 177 assertFalse(getQuoteCharWithHighestCount("a,b,c,d", ',').isPresent()); 178 179 // test double quote character and ensure the result is not affected by another quote character 180 // that is not used for quoting 181 assertEquals( 182 Character.valueOf('\"').charValue(), 183 getQuoteCharWithHighestCount("a,\"b,8\",c\'\'\'\'\'\'\'\'\'\'\',d", ',').get().charValue()); 184 185 // test single quote character 186 assertEquals( 187 Character.valueOf('\'').charValue(), 188 getQuoteCharWithHighestCount("a,\'b,8\',c,d", ',').get().charValue()); 189 } 190 191 @Test 192 public void detectCsvAlwaysQuoted() throws IOException { 193 TabularFileMetadata tabFileMetadata = 194 extractTabularFileMetadata( 195 FileUtils.getClasspathFile("csv/csv_always_quoted.csv").toPath()); 196 assertEquals(',', tabFileMetadata.getDelimiter().charValue()); 197 assertEquals('"', tabFileMetadata.getQuotedBy().charValue()); 198 } 199 200 @Test 201 public void detectPipe() throws IOException { 202 runExtractTabularFileMetadata("csv/pipe_separator.txt", '|', null, StandardCharsets.UTF_8); 203 } 204 205 @Test 206 public void detectSemicolon() throws IOException { 207 runExtractTabularFileMetadata("csv/semicolon_separator.csv", ';', null, StandardCharsets.UTF_8); 208 } 209 210 @Test 211 public void detectTab() throws IOException { 212 String[] files = { 213 "csv/ipni.tab.txt", 214 "csv/tab_separated_generic.txt", 215 "csv/iucn100.tab.txt", 216 "csv/ebird.tab.txt", 217 "csv/empty_line.tab", 218 "csv/irmng.tail", 219 "csv/MOBOT.tab.csv" 220 }; 221 for (String fn : files) { 222 runExtractTabularFileMetadata(fn, '\t', null, StandardCharsets.UTF_8); 223 } 224 } 225 226 @Test 227 public void detectTabQuoted() throws IOException { 228 String[] files = { 229 "csv/eol/my_darwincore_tab_separated_quoted.txt", 230 "csv/eol/my_dataobject_tab_separated_quoted.txt", 231 "csv/borza_tab_separated_quoted.txt" 232 }; 233 for (String fn : files) { 234 runExtractTabularFileMetadata(fn, '\t', '"', StandardCharsets.UTF_8); 235 } 236 } 237 238 private static void runExtractTabularFileMetadata( 239 String classPathFile, 240 Character expectedDelimiter, 241 Character expectedQuoteChar, 242 Charset expectedCharset) 243 throws IOException { 244 Path source = FileUtils.getClasspathFile(classPathFile).toPath(); 245 TabularFileMetadata tabFileMetadata = extractTabularFileMetadata(source); 246 assertEquals(expectedDelimiter.charValue(), tabFileMetadata.getDelimiter().charValue()); 247 248 if (expectedQuoteChar == null) { 249 assertNull(tabFileMetadata.getQuotedBy()); 250 } else { 251 assertNotNull(tabFileMetadata.getQuotedBy(), "Expect a quote character -> " + source); 252 assertEquals(expectedQuoteChar, tabFileMetadata.getQuotedBy(), "Source file -> " + source); 253 } 254 255 assertEquals(expectedCharset, tabFileMetadata.getEncoding()); 256 } 257 258 @Test 259 public void detectEncoding() throws IOException { 260 runExtractTabularFileMetadata( 261 "tabular/test_encoding_detection.iso-8859-1.csv", ',', null, StandardCharsets.ISO_8859_1); 262 runExtractTabularFileMetadata( 263 "tabular/test_encoding_detection.utf-8.csv", ',', null, StandardCharsets.UTF_8); 264 } 265}