001/* 002 * Licensed under the Apache License, Version 2.0 (the "License"); 003 * you may not use this file except in compliance with the License. 004 * You may obtain a copy of the License at 005 * 006 * http://www.apache.org/licenses/LICENSE-2.0 007 * 008 * Unless required by applicable law or agreed to in writing, software 009 * distributed under the License is distributed on an "AS IS" BASIS, 010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 011 * See the License for the specific language governing permissions and 012 * limitations under the License. 013 */ 014package org.gbif.utils.file.tabular; 015 016import org.gbif.utils.PreconditionUtils; 017import org.gbif.utils.file.CharsetDetection; 018import org.gbif.utils.file.UnknownCharsetException; 019 020import java.io.BufferedReader; 021import java.io.IOException; 022import java.nio.charset.Charset; 023import java.nio.file.Files; 024import java.nio.file.Path; 025import java.util.ArrayList; 026import java.util.Arrays; 027import java.util.Collections; 028import java.util.Comparator; 029import java.util.HashSet; 030import java.util.LinkedHashMap; 031import java.util.List; 032import java.util.Map; 033import java.util.Objects; 034import java.util.Optional; 035import java.util.Set; 036import java.util.function.BiFunction; 037import java.util.function.Function; 038import java.util.function.Predicate; 039import java.util.regex.Matcher; 040import java.util.regex.Pattern; 041import java.util.stream.Collectors; 042import java.util.stream.Stream; 043 044import org.apache.commons.lang3.StringUtils; 045import org.slf4j.Logger; 046import org.slf4j.LoggerFactory; 047 048import static java.util.Collections.reverseOrder; 049import static java.util.stream.Collectors.counting; 050import static java.util.stream.Collectors.toSet; 051 052/** 053 * Utility class to extract metadata {@link TabularFileMetadata} from a tabular file. 054 */ 055public class TabularFileMetadataExtractor { 056 057 private static final Logger LOG = LoggerFactory.getLogger(TabularFileMetadataExtractor.class); 058 private static final int MAX_SAMPLE_SIZE = 15; 059 060 // This needs to be large enough to stumble upon a non-ASCII character. 061 private static final int CHARSET_DETECTION_BUFFER_LENGTH = 1024 * 1024; 062 063 private TabularFileMetadataExtractor() {} 064 065 private static final Character[] POTENTIAL_DELIMITER_CHAR = {',', '\t', ';', '|'}; 066 private static final Character[] POTENTIAL_QUOTES_CHAR = {'"', '\''}; 067 068 private static final Predicate<LineDelimiterStats> CONTAINS_FREQUENCY = 069 lineStats -> lineStats.getFrequency() > 0; 070 private static final Comparator<Map.Entry<Character, Long>> BY_VALUE_LONG_DESC = 071 Map.Entry.comparingByValue(reverseOrder()); 072 private static final BiFunction<Character, Character, Pattern> COMPILE_QUOTE_PATTERN_FCT = 073 (delimiter, quoteChar) -> 074 Pattern.compile("[" + delimiter + "][ ]*[" + quoteChar + "][ ]*[^" + delimiter + "]"); 075 076 /** 077 * Extract metadata from a tabular file using a sample (defined by {@link #MAX_SAMPLE_SIZE}) of the file. 078 * The extraction process is based on the frequency of character in the sample using 3 different approaches. 079 * The method will not return any default value if no delimiter and/or quote character can be found in the sample. 080 * The caller should decide which default values should be used to read the file. 081 * 082 * @param filePath a {@link Path} pointing to a file (not a folder). 083 * @return new {@link TabularFileMetadata}, never null (but the content can be null). 084 * @throws IOException 085 * @throws UnknownCharsetException 086 */ 087 public static TabularFileMetadata extractTabularFileMetadata(Path filePath) 088 throws IOException, UnknownCharsetException { 089 Objects.requireNonNull(filePath, "filePath shall be provided"); 090 PreconditionUtils.checkArgument( 091 !Files.isDirectory(filePath), "filePath should point to a file, not a directory"); 092 093 Charset encoding; 094 try { 095 encoding = 096 CharsetDetection.detectEncoding(filePath.toFile(), CHARSET_DETECTION_BUFFER_LENGTH); 097 if (encoding == null) { 098 throw new UnknownCharsetException("Unable to detect the file's character encoding"); 099 } 100 } catch (IOException e) { 101 throw new UnknownCharsetException(e); 102 } 103 104 // open a first stream to read a sample of the file 105 List<String> lines = new ArrayList<>(); 106 try (BufferedReader bf = Files.newBufferedReader(filePath, encoding)) { 107 String line; 108 do { 109 line = bf.readLine(); 110 if (line != null) { 111 lines.add(line); 112 } 113 } while (line != null && lines.size() < MAX_SAMPLE_SIZE); 114 } 115 TabularFileMetadata tabularFileMetadata = extractTabularMetadata(lines); 116 tabularFileMetadata.setEncoding(encoding); 117 return tabularFileMetadata; 118 } 119 120 /** 121 * Tries to extract the {@link TabularFileMetadata} from a sample of lines of a tabular file. 122 * 123 * @param sample 124 * @return new {@link TabularFileMetadata}, never null (but the content can be null). 125 */ 126 static TabularFileMetadata extractTabularMetadata(final List<String> sample) { 127 Objects.requireNonNull(sample, "sample shall be provided"); 128 TabularFileMetadata tabularFileMetadata = new TabularFileMetadata(); 129 130 Optional<Character> delimiterFound = getDelimiterChar(sample); 131 final Character delimiter = delimiterFound.orElse(null); 132 if (delimiter == null) { 133 return tabularFileMetadata; 134 } 135 136 Optional<Character> quoteFound = 137 getHighestCountOf(sample, line -> getQuoteCharWithHighestCount(line, delimiter)); 138 final Character quote = quoteFound.orElse(null); 139 140 tabularFileMetadata.setDelimiter(delimiter); 141 tabularFileMetadata.setQuotedBy(quote); 142 143 return tabularFileMetadata; 144 } 145 146 /** 147 * Extract a character from a line using the given function. 148 * Return the character with the highest counts. 149 * 150 * @param sample 151 * @param characterExtractor function to apply on each line to extract a character 152 * 153 * @return 154 */ 155 private static Optional<Character> getHighestCountOf( 156 final List<String> sample, final Function<String, Optional<Character>> characterExtractor) { 157 158 // remove Optional wrapper and ignore Optional.empty 159 return sample.stream() 160 .map(characterExtractor) 161 .flatMap( 162 o -> 163 o.map(Stream::of) 164 .orElseGet(Stream::empty)) // remove Optional wrapper and ignore Optional.empty 165 .collect(Collectors.groupingBy(Function.identity(), counting())) 166 .entrySet() 167 .stream() 168 .min(BY_VALUE_LONG_DESC) 169 .map(Map.Entry::getKey); 170 } 171 172 /** 173 * Given a sample of line, this method tries to determine the delimiter char used. 174 * 175 * @param sample 176 * 177 * @return the determined delimiter or Optional.empty if it can not be determined. 178 */ 179 public static Optional<Character> getDelimiterChar(final List<String> sample) { 180 181 // count the frequency of all possible delimiter for each lines 182 List<LineDelimiterStats> linesStats = computeLineDelimiterStats(sample); 183 184 // get the distinct set of frequency for each delimiters to check the "stability" 185 Map<Character, Set<Integer>> delimiterDistinctFrequency = 186 computeDelimiterDistinctFrequency(linesStats).entrySet().stream() 187 // filter out delimiter that we never saw 188 .filter(entry -> entry.getValue().size() > 1 || !entry.getValue().contains(0)) 189 .sorted(Comparator.comparing(e -> e.getValue().size())) 190 .collect( 191 Collectors.toMap( 192 Map.Entry::getKey, Map.Entry::getValue, (e1, e2) -> e2, LinkedHashMap::new)); 193 194 // we can have more than one 195 Set<Character> mostStableDelimiter = 196 getAllEqualsToFirst(delimiterDistinctFrequency, (s1, s2) -> s1.size() == s2.size()); 197 198 // get the most used delimiter to check the "overall usage" 199 Map<Character, Integer> delimiterFrequencySums = 200 computeDelimiterFrequencySums(linesStats).entrySet().stream() 201 .sorted(Map.Entry.<Character, Integer>comparingByValue().reversed()) 202 .collect( 203 Collectors.toMap( 204 Map.Entry::getKey, Map.Entry::getValue, (e1, e2) -> e2, LinkedHashMap::new)); 205 206 Set<Character> mostFrequentDelimiter = 207 getAllEqualsToFirst(delimiterFrequencySums, Integer::equals); 208 209 // get the highest frequency per line to check for "usage per line" 210 Map<Character, Long> delimiterHighestFrequencyPerLine = 211 computeDelimiterHighestFrequencyPerLine(sample).entrySet().stream() 212 .sorted(Map.Entry.<Character, Long>comparingByValue().reversed()) 213 .collect( 214 Collectors.toMap( 215 Map.Entry::getKey, Map.Entry::getValue, (e1, e2) -> e2, LinkedHashMap::new)); 216 Set<Character> mostFrequentDelimiterPerLine = 217 getAllEqualsToFirst(delimiterHighestFrequencyPerLine, Long::equals); 218 219 // summary 220 if (LOG.isDebugEnabled()) { 221 LOG.debug("delimiterDistinctFrequency -> " + delimiterDistinctFrequency); 222 LOG.debug("mostStableDelimiter -> " + mostStableDelimiter); 223 LOG.debug("delimiterFrequencySums -> " + delimiterFrequencySums); 224 LOG.debug("mostFrequentDelimiter -> " + mostFrequentDelimiter); 225 LOG.debug("delimiterHighestFrequencyPerLine->" + delimiterHighestFrequencyPerLine); 226 LOG.debug("mostFrequentDelimiterPerLine ->" + mostFrequentDelimiterPerLine); 227 } 228 229 // if the most stable is also the one that is used to most within the sample 230 Optional<Character> resultCharacter = 231 intersectSingle(mostStableDelimiter, mostFrequentDelimiter); 232 if (resultCharacter.isPresent()) { 233 return resultCharacter; 234 } 235 236 // otherwise, if the most stable is also the most used based on lines 237 resultCharacter = intersectSingle(mostStableDelimiter, mostFrequentDelimiterPerLine); 238 if (resultCharacter.isPresent()) { 239 return resultCharacter; 240 } 241 242 // as last resort if the most frequent delimiter overall and by line is the same 243 resultCharacter = intersectSingle(mostFrequentDelimiter, mostFrequentDelimiterPerLine); 244 245 return resultCharacter; 246 } 247 248 /** 249 * Return the {@link Character} represents the intersection between 2 sets only if the resulting set represents 250 * a single element. 251 * @param set1 252 * @param set2 253 * @return 254 */ 255 private static Optional<Character> intersectSingle(Set<Character> set1, Set<Character> set2) { 256 Set<Character> intersection = new HashSet<>(set1); 257 intersection.retainAll(set2); 258 259 return intersection.size() == 1 ? intersection.stream().findFirst() : Optional.empty(); 260 } 261 262 /** 263 * Given a {@link Map}, return all elements that are equals to the first element (including itself) 264 * based on the provided equals function. 265 * @param map 266 * @param equalsPredicate 267 * @param <T> 268 * @return all elements that are equals to the first one or an empty set if the map is empty 269 */ 270 private static <T> Set<Character> getAllEqualsToFirst( 271 Map<Character, T> map, BiFunction<T, T, Boolean> equalsPredicate) { 272 273 Optional<Map.Entry<Character, T>> firstMapEntry = map.entrySet().stream().findFirst(); 274 if (!firstMapEntry.isPresent()) { 275 return Collections.EMPTY_SET; 276 } 277 278 final T firstValue = firstMapEntry.get().getValue(); 279 return map.entrySet().stream() 280 .filter(e -> equalsPredicate.apply(firstValue, e.getValue())) 281 .map(Map.Entry::getKey) 282 .collect(Collectors.toSet()); 283 } 284 285 /** 286 * For each element(line) of the sample, compute a {@link LineDelimiterStats} for each delimiter. 287 * Note: delimiter that are not used within a line will be included with the frequency 0. 288 * @param sample 289 * @return new List, never null 290 */ 291 static List<LineDelimiterStats> computeLineDelimiterStats(List<String> sample) { 292 return sample.stream() 293 .map(TabularFileMetadataExtractor::lineToLineDelimiterStats) 294 .flatMap(List::stream) 295 .collect(Collectors.toList()); 296 } 297 298 /** 299 * Compute the stats for each potential delimiters on a line. 300 * @param line 301 * @return 302 */ 303 private static List<LineDelimiterStats> lineToLineDelimiterStats(String line) { 304 return Arrays.stream(POTENTIAL_DELIMITER_CHAR) 305 .map( 306 delimiter -> 307 new LineDelimiterStats(delimiter, StringUtils.countMatches(line, delimiter))) 308 .collect(Collectors.toList()); 309 } 310 311 /** 312 * For each {@link LineDelimiterStats}, collect the distinct frequency (count) of each delimiter. 313 * This gives us an idea of the "stability" of each delimiter across the sample. 314 * Note that since quotes are not handled, noise can be introduced if a quoted cells use the delimiter. 315 * 316 * See unit test for examples of when this method will be affected by noise. 317 * 318 * The most stable delimiter is normally defined by the {@link Character} returned by the methods where 319 * the list of distinct frequency is the smallest in size excluding cases where the list contains only the element 320 * representing 0 as Integer (which means the delimiter was never used). 321 * 322 * @param linesStats 323 * 324 * @return 325 */ 326 static Map<Character, Set<Integer>> computeDelimiterDistinctFrequency( 327 List<LineDelimiterStats> linesStats) { 328 return linesStats.stream() 329 .collect( 330 Collectors.groupingBy( 331 LineDelimiterStats::getDelimiter, 332 Collectors.mapping(LineDelimiterStats::getFrequency, toSet()))); 333 } 334 335 /** 336 * For each line, check the delimiter that is used the most. 337 * Return the count of each delimiter. 338 * @param lines 339 * @return 340 */ 341 static Map<Character, Long> computeDelimiterHighestFrequencyPerLine(List<String> lines) { 342 return lines.stream() 343 .map(TabularFileMetadataExtractor::getDelimiterWithHighestCount2) 344 .flatMap( 345 o -> 346 o.map(Stream::of) 347 .orElseGet(Stream::empty)) // remove Optional wrapper and ignore Optional.empty 348 .collect(Collectors.groupingBy(LineDelimiterStats::getDelimiter, counting())); 349 } 350 351 /** 352 * For {@link LineDelimiterStats}, sum the frequency (count) of each delimiter. 353 * This gives us an idea of the overall usage of each delimiter across the sample. 354 * Note that since quotes are not handled, noise can be introduced if a quoted cell uses the delimiter. 355 * 356 * See unit test for examples of when this method will be affected by noise. 357 * 358 * @param linesStats 359 * 360 * @return 361 */ 362 static Map<Character, Integer> computeDelimiterFrequencySums( 363 List<LineDelimiterStats> linesStats) { 364 return linesStats.stream() 365 .filter(CONTAINS_FREQUENCY) 366 .collect( 367 Collectors.groupingBy( 368 LineDelimiterStats::getDelimiter, 369 Collectors.summingInt(LineDelimiterStats::getFrequency))); 370 } 371 372 /** 373 * Given a line, get the delimiter with the highest count if any can be found. 374 * Note: quotes are ignored in the count so a delimiter used inside quotes will be counted. 375 * 376 * @param line line of text to analyse 377 * 378 * @return 379 */ 380 static Optional<Character> getDelimiterWithHighestCount(String line) { 381 int highestCount = 0; 382 Character highestCountDelimiter = null; 383 for (Character delimiter : POTENTIAL_DELIMITER_CHAR) { 384 int currentCount = StringUtils.countMatches(line, delimiter); 385 if (currentCount > highestCount) { 386 highestCount = currentCount; 387 highestCountDelimiter = delimiter; 388 } 389 } 390 return Optional.ofNullable(highestCountDelimiter); 391 } 392 393 static Optional<LineDelimiterStats> getDelimiterWithHighestCount2(String line) { 394 int highestCount = 0; 395 // Character highestCountDelimiter = null; 396 LineDelimiterStats lineDelimiterStats = null; 397 for (Character delimiter : POTENTIAL_DELIMITER_CHAR) { 398 int currentCount = StringUtils.countMatches(line, delimiter); 399 if (currentCount > highestCount) { 400 highestCount = currentCount; 401 lineDelimiterStats = new LineDelimiterStats(delimiter, highestCount); 402 } 403 } 404 return Optional.ofNullable(lineDelimiterStats); 405 } 406 407 /** 408 * Given a line and a delimiter, try to determine the quoting character if any can be found. 409 * To check if a quote character is used we run a regex to check for a delimiter followed by a quoting character. 410 * 411 * @param line line of text to analyse 412 * @param delimiter delimiter used in the line of text 413 * 414 * @return 415 */ 416 static Optional<Character> getQuoteCharWithHighestCount(String line, Character delimiter) { 417 int highestCount = 0; 418 Character highestCountQuoteChar = null; 419 for (Character quoteChar : POTENTIAL_QUOTES_CHAR) { 420 int currentCount = 0; 421 Matcher m = COMPILE_QUOTE_PATTERN_FCT.apply(delimiter, quoteChar).matcher(line); 422 while (m.find()) { 423 currentCount++; 424 } 425 if (currentCount > highestCount) { 426 highestCount = currentCount; 427 highestCountQuoteChar = quoteChar; 428 } 429 } 430 return Optional.ofNullable(highestCountQuoteChar); 431 } 432 433 /** 434 * Inner representation of stats (frequency) of a delimiter 435 */ 436 static class LineDelimiterStats { 437 private Character delimiter; 438 private int frequency; 439 440 LineDelimiterStats(Character delimiter, int frequency) { 441 this.delimiter = delimiter; 442 this.frequency = frequency; 443 } 444 445 Character getDelimiter() { 446 return delimiter; 447 } 448 449 int getFrequency() { 450 return frequency; 451 } 452 } 453}