001/* 002 * Copyright 2021 Global Biodiversity Information Facility (GBIF) 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016package org.gbif.utils.file.tabular; 017 018import org.gbif.utils.PreconditionUtils; 019import org.gbif.utils.file.CharsetDetection; 020import org.gbif.utils.file.UnknownCharsetException; 021 022import java.io.BufferedReader; 023import java.io.IOException; 024import java.nio.charset.Charset; 025import java.nio.file.Files; 026import java.nio.file.Path; 027import java.util.ArrayList; 028import java.util.Arrays; 029import java.util.Collections; 030import java.util.Comparator; 031import java.util.HashSet; 032import java.util.LinkedHashMap; 033import java.util.List; 034import java.util.Map; 035import java.util.Objects; 036import java.util.Optional; 037import java.util.Set; 038import java.util.function.BiFunction; 039import java.util.function.Function; 040import java.util.function.Predicate; 041import java.util.regex.Matcher; 042import java.util.regex.Pattern; 043import java.util.stream.Collectors; 044import java.util.stream.Stream; 045 046import org.apache.commons.lang3.StringUtils; 047import org.slf4j.Logger; 048import org.slf4j.LoggerFactory; 049 050import static java.util.Collections.reverseOrder; 051import static java.util.stream.Collectors.counting; 052import static java.util.stream.Collectors.toSet; 053 054/** 055 * Utility class to extract metadata {@link TabularFileMetadata} from a tabular file. 056 */ 057public class TabularFileMetadataExtractor { 058 059 private static final Logger LOG = LoggerFactory.getLogger(TabularFileMetadataExtractor.class); 060 private static final int MAX_SAMPLE_SIZE = 15; 061 062 // This needs to be large enough to stumble upon a non-ASCII character. 063 private static final int CHARSET_DETECTION_BUFFER_LENGTH = 1024*1024; 064 065 private TabularFileMetadataExtractor() { 066 } 067 068 private static final Character[] POTENTIAL_DELIMITER_CHAR = {',', '\t', ';', '|'}; 069 private static final Character[] POTENTIAL_QUOTES_CHAR = {'"', '\''}; 070 071 private static final Predicate<LineDelimiterStats> CONTAINS_FREQUENCY = lineStats -> lineStats.getFrequency() > 0; 072 private static final Comparator<Map.Entry<Character, Long>> BY_VALUE_LONG_DESC = Map.Entry.comparingByValue(reverseOrder()); 073 private static final BiFunction<Character, Character, Pattern> COMPILE_QUOTE_PATTERN_FCT = (delimiter, quoteChar) 074 -> Pattern.compile("[" + delimiter + "][ ]*[" + quoteChar + "][ ]*[^" + delimiter + "]"); 075 076 /** 077 * Extract metadata from a tabular file using a sample (defined by {@link #MAX_SAMPLE_SIZE}) of the file. 078 * The extraction process is based on the frequency of character in the sample using 3 different approaches. 079 * The method will not return any default value if no delimiter and/or quote character can be found in the sample. 080 * The caller should decide which default values should be used to read the file. 081 * 082 * @param filePath a {@link Path} pointing to a file (not a folder). 083 * @return new {@link TabularFileMetadata}, never null (but the content can be null). 084 * @throws IOException 085 * @throws UnknownCharsetException 086 */ 087 public static TabularFileMetadata extractTabularFileMetadata(Path filePath) throws IOException, UnknownCharsetException { 088 Objects.requireNonNull(filePath, "filePath shall be provided"); 089 PreconditionUtils.checkArgument(!Files.isDirectory(filePath), "filePath should point to a file, not a directory"); 090 091 Charset encoding; 092 try { 093 encoding = CharsetDetection.detectEncoding(filePath.toFile(), CHARSET_DETECTION_BUFFER_LENGTH); 094 if (encoding == null) { 095 throw new UnknownCharsetException("Unable to detect the file's character encoding"); 096 } 097 } catch (IOException e) { 098 throw new UnknownCharsetException(e); 099 } 100 101 // open a first stream to read a sample of the file 102 List<String> lines = new ArrayList<>(); 103 try (BufferedReader bf = Files.newBufferedReader(filePath, encoding)) { 104 String line; 105 do { 106 line = bf.readLine(); 107 if(line != null) { 108 lines.add(line); 109 } 110 } 111 while(line != null && lines.size() < MAX_SAMPLE_SIZE); 112 } 113 TabularFileMetadata tabularFileMetadata = extractTabularMetadata(lines); 114 tabularFileMetadata.setEncoding(encoding); 115 return tabularFileMetadata; 116 } 117 118 /** 119 * Tries to extract the {@link TabularFileMetadata} from a sample of lines of a tabular file. 120 * 121 * @param sample 122 * @return new {@link TabularFileMetadata}, never null (but the content can be null). 123 */ 124 static TabularFileMetadata extractTabularMetadata(final List<String> sample) { 125 Objects.requireNonNull(sample, "sample shall be provided"); 126 TabularFileMetadata tabularFileMetadata = new TabularFileMetadata(); 127 128 Optional<Character> delimiterFound = getDelimiterChar(sample); 129 final Character delimiter = delimiterFound.orElse(null); 130 if(delimiter == null) { 131 return tabularFileMetadata; 132 } 133 134 Optional<Character> quoteFound = getHighestCountOf(sample, line -> getQuoteCharWithHighestCount(line, delimiter)); 135 final Character quote = quoteFound.orElse(null); 136 137 tabularFileMetadata.setDelimiter(delimiter); 138 tabularFileMetadata.setQuotedBy(quote); 139 140 return tabularFileMetadata; 141 } 142 143 /** 144 * Extract a character from a line using the given function. 145 * Return the character with the highest counts. 146 * 147 * @param sample 148 * @param characterExtractor function to apply on each line to extract a character 149 * 150 * @return 151 */ 152 private static Optional<Character> getHighestCountOf(final List<String> sample, final Function<String, 153 Optional<Character>> characterExtractor) { 154 155 //remove Optional wrapper and ignore Optional.empty 156 return sample.stream() 157 .map(characterExtractor) 158 .flatMap(o -> o.map(Stream::of).orElseGet(Stream::empty)) //remove Optional wrapper and ignore Optional.empty 159 .collect(Collectors.groupingBy(Function.identity(), counting())) 160 .entrySet() 161 .stream() 162 .min(BY_VALUE_LONG_DESC) 163 .map(Map.Entry::getKey); 164 } 165 166 /** 167 * Given a sample of line, this method tries to determine the delimiter char used. 168 * 169 * @param sample 170 * 171 * @return the determined delimiter or Optional.empty if it can not be determined. 172 */ 173 public static Optional<Character> getDelimiterChar(final List<String> sample) { 174 175 // count the frequency of all possible delimiter for each lines 176 List<LineDelimiterStats> linesStats = 177 computeLineDelimiterStats(sample); 178 179 // get the distinct set of frequency for each delimiters to check the "stability" 180 Map<Character, Set<Integer>> delimiterDistinctFrequency = 181 computeDelimiterDistinctFrequency(linesStats) 182 .entrySet().stream() 183 // filter out delimiter that we never saw 184 .filter(entry -> entry.getValue().size() > 1 || !entry.getValue().contains(0)) 185 .sorted(Comparator.comparing(e -> e.getValue().size())) 186 .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, 187 (e1, e2) -> e2, LinkedHashMap::new)); 188 189 // we can have more than one 190 Set<Character> mostStableDelimiter = 191 getAllEqualsToFirst(delimiterDistinctFrequency, (s1,s2) -> s1.size() == s2.size()); 192 193 // get the most used delimiter to check the "overall usage" 194 Map<Character, Integer> delimiterFrequencySums = computeDelimiterFrequencySums(linesStats) 195 .entrySet().stream() 196 .sorted(Map.Entry.<Character, Integer>comparingByValue().reversed()) 197 .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, 198 (e1, e2) -> e2, LinkedHashMap::new)); 199 200 Set<Character> mostFrequentDelimiter = getAllEqualsToFirst(delimiterFrequencySums, Integer::equals); 201 202 //get the highest frequency per line to check for "usage per line" 203 Map<Character, Long> delimiterHighestFrequencyPerLine = computeDelimiterHighestFrequencyPerLine(sample) 204 .entrySet().stream() 205 .sorted(Map.Entry.<Character, Long>comparingByValue().reversed()) 206 .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, 207 (e1, e2) -> e2, LinkedHashMap::new)); 208 Set<Character> mostFrequentDelimiterPerLine = getAllEqualsToFirst(delimiterHighestFrequencyPerLine, 209 Long::equals); 210 211 //summary 212 if (LOG.isDebugEnabled()) { 213 LOG.debug("delimiterDistinctFrequency -> " + delimiterDistinctFrequency); 214 LOG.debug("mostStableDelimiter -> " + mostStableDelimiter); 215 LOG.debug("delimiterFrequencySums -> " + delimiterFrequencySums); 216 LOG.debug("mostFrequentDelimiter -> " + mostFrequentDelimiter); 217 LOG.debug("delimiterHighestFrequencyPerLine->" + delimiterHighestFrequencyPerLine); 218 LOG.debug("mostFrequentDelimiterPerLine ->" + mostFrequentDelimiterPerLine); 219 } 220 221 //if the most stable is also the one that is used to most within the sample 222 Optional<Character> resultCharacter = intersectSingle(mostStableDelimiter, mostFrequentDelimiter); 223 if(resultCharacter.isPresent()) { 224 return resultCharacter; 225 } 226 227 //otherwise, if the most stable is also the most used based on lines 228 resultCharacter = intersectSingle(mostStableDelimiter, mostFrequentDelimiterPerLine); 229 if(resultCharacter.isPresent()) { 230 return resultCharacter; 231 } 232 233 //as last resort if the most frequent delimiter overall and by line is the same 234 resultCharacter = intersectSingle(mostFrequentDelimiter, mostFrequentDelimiterPerLine); 235 236 return resultCharacter; 237 } 238 239 /** 240 * Return the {@link Character} represents the intersection between 2 sets only if the resulting set represents 241 * a single element. 242 * @param set1 243 * @param set2 244 * @return 245 */ 246 private static Optional<Character> intersectSingle(Set<Character> set1, Set<Character> set2) { 247 Set<Character> intersection = new HashSet<>(set1); 248 intersection.retainAll(set2); 249 250 return intersection.size() == 1 ? intersection.stream().findFirst() : Optional.empty(); 251 } 252 253 /** 254 * Given a {@link Map}, return all elements that are equals to the first element (including itself) 255 * based on the provided equals function. 256 * @param map 257 * @param equalsPredicate 258 * @param <T> 259 * @return all elements that are equals to the first one or an empty set if the map is empty 260 */ 261 private static <T> Set<Character> getAllEqualsToFirst(Map<Character, T> map, BiFunction<T,T, Boolean> equalsPredicate) { 262 263 Optional<Map.Entry<Character, T>> firstMapEntry = map.entrySet().stream().findFirst(); 264 if(!firstMapEntry.isPresent()){ 265 return Collections.EMPTY_SET; 266 } 267 268 final T firstValue = firstMapEntry.get().getValue(); 269 return map.entrySet().stream().filter( e -> equalsPredicate.apply(firstValue,e.getValue())) 270 .map(Map.Entry::getKey).collect(Collectors.toSet()); 271 } 272 273 /** 274 * For each element(line) of the sample, compute a {@link LineDelimiterStats} for each delimiter. 275 * Note: delimiter that are not used within a line will be included with the frequency 0. 276 * @param sample 277 * @return new List, never null 278 */ 279 static List<LineDelimiterStats> computeLineDelimiterStats(List<String> sample){ 280 return sample.stream() 281 .map(TabularFileMetadataExtractor::lineToLineDelimiterStats) 282 .flatMap(List::stream) 283 .collect(Collectors.toList()); 284 } 285 286 /** 287 * Compute the stats for each potential delimiters on a line. 288 * @param line 289 * @return 290 */ 291 private static List<LineDelimiterStats> lineToLineDelimiterStats(String line) { 292 return Arrays.stream(POTENTIAL_DELIMITER_CHAR) 293 .map(delimiter -> new LineDelimiterStats(delimiter, StringUtils.countMatches(line, delimiter))) 294 .collect(Collectors.toList()); 295 } 296 297 /** 298 * For each {@link LineDelimiterStats}, collect the distinct frequency (count) of each delimiter. 299 * This gives us an idea of the "stability" of each delimiter across the sample. 300 * Note that since quotes are not handled, noise can be introduced if a quoted cells use the delimiter. 301 * 302 * See unit test for examples of when this method will be affected by noise. 303 * 304 * The most stable delimiter is normally defined by the {@link Character} returned by the methods where 305 * the list of distinct frequency is the smallest in size excluding cases where the list contains only the element 306 * representing 0 as Integer (which means the delimiter was never used). 307 * 308 * @param linesStats 309 * 310 * @return 311 */ 312 static Map<Character, Set<Integer>> computeDelimiterDistinctFrequency(List<LineDelimiterStats> linesStats) { 313 return linesStats.stream() 314 .collect( 315 Collectors.groupingBy(LineDelimiterStats::getDelimiter, 316 Collectors.mapping(LineDelimiterStats::getFrequency, toSet()))); 317 } 318 319 /** 320 * For each line, check the delimiter that is used the most. 321 * Return the count of each delimiter. 322 * @param lines 323 * @return 324 */ 325 static Map<Character, Long> computeDelimiterHighestFrequencyPerLine(List<String> lines) { 326 return lines.stream() 327 .map(TabularFileMetadataExtractor::getDelimiterWithHighestCount2) 328 .flatMap(o -> o.map(Stream::of).orElseGet(Stream::empty)) //remove Optional wrapper and ignore Optional.empty 329 .collect(Collectors.groupingBy(LineDelimiterStats::getDelimiter, counting())); 330 } 331 332 /** 333 * For {@link LineDelimiterStats}, sum the frequency (count) of each delimiter. 334 * This gives us an idea of the overall usage of each delimiter across the sample. 335 * Note that since quotes are not handled, noise can be introduced if a quoted cell uses the delimiter. 336 * 337 * See unit test for examples of when this method will be affected by noise. 338 * 339 * @param linesStats 340 * 341 * @return 342 */ 343 static Map<Character, Integer> computeDelimiterFrequencySums(List<LineDelimiterStats> linesStats) { 344 return linesStats.stream() 345 .filter(CONTAINS_FREQUENCY) 346 .collect( 347 Collectors.groupingBy(LineDelimiterStats::getDelimiter, 348 Collectors.summingInt(LineDelimiterStats::getFrequency))); 349 } 350 351 /** 352 * Given a line, get the delimiter with the highest count if any can be found. 353 * Note: quotes are ignored in the count so a delimiter used inside quotes will be counted. 354 * 355 * @param line line of text to analyse 356 * 357 * @return 358 */ 359 static Optional<Character> getDelimiterWithHighestCount(String line) { 360 int highestCount = 0; 361 Character highestCountDelimiter = null; 362 for (Character delimiter : POTENTIAL_DELIMITER_CHAR) { 363 int currentCount = StringUtils.countMatches(line, delimiter); 364 if (currentCount > highestCount) { 365 highestCount = currentCount; 366 highestCountDelimiter = delimiter; 367 } 368 } 369 return Optional.ofNullable(highestCountDelimiter); 370 } 371 372 static Optional<LineDelimiterStats> getDelimiterWithHighestCount2(String line) { 373 int highestCount = 0; 374 // Character highestCountDelimiter = null; 375 LineDelimiterStats lineDelimiterStats = null; 376 for (Character delimiter : POTENTIAL_DELIMITER_CHAR) { 377 int currentCount = StringUtils.countMatches(line, delimiter); 378 if (currentCount > highestCount) { 379 highestCount = currentCount; 380 lineDelimiterStats = new LineDelimiterStats(delimiter,highestCount); 381 } 382 } 383 return Optional.ofNullable(lineDelimiterStats); 384 } 385 386 /** 387 * Given a line and a delimiter, try to determine the quoting character if any can be found. 388 * To check if a quote character is used we run a regex to check for a delimiter followed by a quoting character. 389 * 390 * @param line line of text to analyse 391 * @param delimiter delimiter used in the line of text 392 * 393 * @return 394 */ 395 static Optional<Character> getQuoteCharWithHighestCount(String line, Character delimiter) { 396 int highestCount = 0; 397 Character highestCountQuoteChar = null; 398 for (Character quoteChar : POTENTIAL_QUOTES_CHAR) { 399 int currentCount = 0; 400 Matcher m = COMPILE_QUOTE_PATTERN_FCT.apply(delimiter, quoteChar).matcher(line); 401 while (m.find()) { 402 currentCount++; 403 } 404 if (currentCount > highestCount) { 405 highestCount = currentCount; 406 highestCountQuoteChar = quoteChar; 407 } 408 } 409 return Optional.ofNullable(highestCountQuoteChar); 410 } 411 412 /** 413 * Inner representation of stats (frequency) of a delimiter 414 */ 415 static class LineDelimiterStats { 416 private Character delimiter; 417 private int frequency; 418 LineDelimiterStats(Character delimiter, int frequency) { 419 this.delimiter = delimiter; 420 this.frequency = frequency; 421 } 422 423 Character getDelimiter() { 424 return delimiter; 425 } 426 427 int getFrequency() { 428 return frequency; 429 } 430 } 431 432}