Source code

001/*
002 * Copyright 2021 Global Biodiversity Information Facility (GBIF)
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016package org.gbif.utils.file.tabular;
017
018import org.gbif.utils.PreconditionUtils;
019import org.gbif.utils.file.CharsetDetection;
020import org.gbif.utils.file.UnknownCharsetException;
021
022import java.io.BufferedReader;
023import java.io.IOException;
024import java.nio.charset.Charset;
025import java.nio.file.Files;
026import java.nio.file.Path;
027import java.util.ArrayList;
028import java.util.Arrays;
029import java.util.Collections;
030import java.util.Comparator;
031import java.util.HashSet;
032import java.util.LinkedHashMap;
033import java.util.List;
034import java.util.Map;
035import java.util.Objects;
036import java.util.Optional;
037import java.util.Set;
038import java.util.function.BiFunction;
039import java.util.function.Function;
040import java.util.function.Predicate;
041import java.util.regex.Matcher;
042import java.util.regex.Pattern;
043import java.util.stream.Collectors;
044import java.util.stream.Stream;
045
046import org.apache.commons.lang3.StringUtils;
047import org.slf4j.Logger;
048import org.slf4j.LoggerFactory;
049
050import static java.util.Collections.reverseOrder;
051import static java.util.stream.Collectors.counting;
052import static java.util.stream.Collectors.toSet;
053
054/**
055 * Utility class to extract metadata {@link TabularFileMetadata} from a tabular file.
056 */
057public class TabularFileMetadataExtractor {
058
059  private static final Logger LOG = LoggerFactory.getLogger(TabularFileMetadataExtractor.class);
060  private static final int MAX_SAMPLE_SIZE = 15;
061
062  // This needs to be large enough to stumble upon a non-ASCII character.
063  private static final int CHARSET_DETECTION_BUFFER_LENGTH = 1024*1024;
064
065  private TabularFileMetadataExtractor() {
066  }
067
068  private static final Character[] POTENTIAL_DELIMITER_CHAR = {',', '\t', ';', '|'};
069  private static final Character[] POTENTIAL_QUOTES_CHAR = {'"', '\''};
070
071  private static final Predicate<LineDelimiterStats> CONTAINS_FREQUENCY = lineStats -> lineStats.getFrequency() > 0;
072  private static final Comparator<Map.Entry<Character, Long>> BY_VALUE_LONG_DESC = Map.Entry.comparingByValue(reverseOrder());
073  private static final BiFunction<Character, Character, Pattern> COMPILE_QUOTE_PATTERN_FCT = (delimiter, quoteChar)
074          -> Pattern.compile("[" + delimiter + "][ ]*[" + quoteChar + "][ ]*[^" + delimiter + "]");
075
076  /**
077   * Extract metadata from a tabular file using a sample (defined by {@link #MAX_SAMPLE_SIZE}) of the file.
078   * The extraction process is based on the frequency of character in the sample using 3 different approaches.
079   * The method will not return any default value if no delimiter and/or quote character can be found in the sample.
080   * The caller should decide which default values should be used to read the file.
081   *
082   * @param filePath a {@link Path} pointing to a file (not a folder).
083   * @return new {@link TabularFileMetadata}, never null (but the content can be null).
084   * @throws IOException
085   * @throws UnknownCharsetException
086   */
087  public static TabularFileMetadata extractTabularFileMetadata(Path filePath) throws IOException, UnknownCharsetException {
088    Objects.requireNonNull(filePath, "filePath shall be provided");
089    PreconditionUtils.checkArgument(!Files.isDirectory(filePath), "filePath should point to a file, not a directory");
090
091    Charset encoding;
092    try {
093      encoding = CharsetDetection.detectEncoding(filePath.toFile(), CHARSET_DETECTION_BUFFER_LENGTH);
094      if (encoding == null) {
095        throw new UnknownCharsetException("Unable to detect the file's character encoding");
096      }
097    } catch (IOException e) {
098      throw new UnknownCharsetException(e);
099    }
100
101    // open a first stream to read a sample of the file
102    List<String> lines = new ArrayList<>();
103    try (BufferedReader bf = Files.newBufferedReader(filePath, encoding)) {
104      String line;
105      do {
106        line = bf.readLine();
107        if(line != null) {
108          lines.add(line);
109        }
110      }
111      while(line != null && lines.size() < MAX_SAMPLE_SIZE);
112    }
113    TabularFileMetadata tabularFileMetadata = extractTabularMetadata(lines);
114    tabularFileMetadata.setEncoding(encoding);
115    return tabularFileMetadata;
116  }
117
118  /**
119   * Tries to extract the {@link TabularFileMetadata} from a sample of lines of a tabular file.
120   *
121   * @param sample
122   * @return new {@link TabularFileMetadata}, never null (but the content can be null).
123   */
124  static TabularFileMetadata extractTabularMetadata(final List<String> sample) {
125    Objects.requireNonNull(sample, "sample shall be provided");
126    TabularFileMetadata tabularFileMetadata = new TabularFileMetadata();
127
128    Optional<Character> delimiterFound = getDelimiterChar(sample);
129    final Character delimiter = delimiterFound.orElse(null);
130    if(delimiter == null) {
131      return tabularFileMetadata;
132    }
133
134    Optional<Character> quoteFound = getHighestCountOf(sample, line -> getQuoteCharWithHighestCount(line, delimiter));
135    final Character quote = quoteFound.orElse(null);
136
137    tabularFileMetadata.setDelimiter(delimiter);
138    tabularFileMetadata.setQuotedBy(quote);
139
140    return tabularFileMetadata;
141  }
142
143  /**
144   * Extract a character from a line using the given function.
145   * Return the character with the highest counts.
146   *
147   * @param sample
148   * @param characterExtractor function to apply on each line to extract a character
149   *
150   * @return
151   */
152  private static Optional<Character> getHighestCountOf(final List<String> sample, final Function<String,
153          Optional<Character>> characterExtractor) {
154
155    //remove Optional wrapper and ignore Optional.empty
156    return sample.stream()
157        .map(characterExtractor)
158        .flatMap(o -> o.map(Stream::of).orElseGet(Stream::empty)) //remove Optional wrapper and ignore Optional.empty
159        .collect(Collectors.groupingBy(Function.identity(), counting()))
160        .entrySet()
161        .stream()
162        .min(BY_VALUE_LONG_DESC)
163        .map(Map.Entry::getKey);
164  }
165
166  /**
167   * Given a sample of line, this method tries to determine the delimiter char used.
168   *
169   * @param sample
170   *
171   * @return the determined delimiter or Optional.empty if it can not be determined.
172   */
173  public static Optional<Character> getDelimiterChar(final List<String> sample) {
174
175    // count the frequency of all possible delimiter for each lines
176    List<LineDelimiterStats> linesStats =
177            computeLineDelimiterStats(sample);
178
179    // get the distinct set of frequency for each delimiters to check the "stability"
180    Map<Character, Set<Integer>> delimiterDistinctFrequency =
181            computeDelimiterDistinctFrequency(linesStats)
182                    .entrySet().stream()
183                    // filter out delimiter that we never saw
184                    .filter(entry -> entry.getValue().size() > 1 || !entry.getValue().contains(0))
185                    .sorted(Comparator.comparing(e -> e.getValue().size()))
186                    .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue,
187                            (e1, e2) -> e2, LinkedHashMap::new));
188
189    // we can have more than one
190    Set<Character> mostStableDelimiter =
191            getAllEqualsToFirst(delimiterDistinctFrequency, (s1,s2) -> s1.size() == s2.size());
192
193    // get the most used delimiter to check the "overall usage"
194    Map<Character, Integer> delimiterFrequencySums = computeDelimiterFrequencySums(linesStats)
195            .entrySet().stream()
196            .sorted(Map.Entry.<Character, Integer>comparingByValue().reversed())
197            .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue,
198                    (e1, e2) -> e2, LinkedHashMap::new));
199
200    Set<Character> mostFrequentDelimiter = getAllEqualsToFirst(delimiterFrequencySums, Integer::equals);
201
202    //get the highest frequency per line to check for "usage per line"
203    Map<Character, Long> delimiterHighestFrequencyPerLine = computeDelimiterHighestFrequencyPerLine(sample)
204            .entrySet().stream()
205            .sorted(Map.Entry.<Character, Long>comparingByValue().reversed())
206            .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue,
207                    (e1, e2) -> e2, LinkedHashMap::new));
208    Set<Character> mostFrequentDelimiterPerLine = getAllEqualsToFirst(delimiterHighestFrequencyPerLine,
209           Long::equals);
210
211    //summary
212    if (LOG.isDebugEnabled()) {
213      LOG.debug("delimiterDistinctFrequency -> " + delimiterDistinctFrequency);
214      LOG.debug("mostStableDelimiter -> " + mostStableDelimiter);
215      LOG.debug("delimiterFrequencySums -> " + delimiterFrequencySums);
216      LOG.debug("mostFrequentDelimiter -> " + mostFrequentDelimiter);
217      LOG.debug("delimiterHighestFrequencyPerLine->" + delimiterHighestFrequencyPerLine);
218      LOG.debug("mostFrequentDelimiterPerLine ->" + mostFrequentDelimiterPerLine);
219    }
220
221    //if the most stable is also the one that is used to most within the sample
222    Optional<Character> resultCharacter = intersectSingle(mostStableDelimiter, mostFrequentDelimiter);
223    if(resultCharacter.isPresent()) {
224      return resultCharacter;
225    }
226
227    //otherwise, if the most stable is also the most used based on lines
228    resultCharacter = intersectSingle(mostStableDelimiter, mostFrequentDelimiterPerLine);
229    if(resultCharacter.isPresent()) {
230      return resultCharacter;
231    }
232
233    //as last resort if the most frequent delimiter overall and by line is the same
234    resultCharacter = intersectSingle(mostFrequentDelimiter, mostFrequentDelimiterPerLine);
235
236    return resultCharacter;
237  }
238
239  /**
240   * Return the {@link Character} represents the intersection between 2 sets only if the resulting set represents
241   * a single element.
242   * @param set1
243   * @param set2
244   * @return
245   */
246  private static Optional<Character> intersectSingle(Set<Character> set1, Set<Character> set2) {
247    Set<Character> intersection = new HashSet<>(set1);
248    intersection.retainAll(set2);
249
250    return intersection.size() == 1 ? intersection.stream().findFirst() : Optional.empty();
251  }
252
253  /**
254   * Given a {@link Map}, return all elements that are equals to the first element (including itself)
255   * based on the provided equals function.
256   * @param map
257   * @param equalsPredicate
258   * @param <T>
259   * @return all elements that are equals to the first one or an empty set if the map is empty
260   */
261  private static <T> Set<Character> getAllEqualsToFirst(Map<Character, T> map, BiFunction<T,T, Boolean> equalsPredicate) {
262
263    Optional<Map.Entry<Character, T>> firstMapEntry = map.entrySet().stream().findFirst();
264    if(!firstMapEntry.isPresent()){
265      return Collections.EMPTY_SET;
266    }
267
268    final T firstValue = firstMapEntry.get().getValue();
269    return map.entrySet().stream().filter( e -> equalsPredicate.apply(firstValue,e.getValue()))
270            .map(Map.Entry::getKey).collect(Collectors.toSet());
271  }
272
273  /**
274   * For each element(line) of the sample, compute a {@link LineDelimiterStats} for each delimiter.
275   * Note: delimiter that are not used within a line will be included with the frequency 0.
276   * @param sample
277   * @return new List, never null
278   */
279  static List<LineDelimiterStats> computeLineDelimiterStats(List<String> sample){
280    return sample.stream()
281            .map(TabularFileMetadataExtractor::lineToLineDelimiterStats)
282            .flatMap(List::stream)
283            .collect(Collectors.toList());
284  }
285
286  /**
287   * Compute the stats for each potential delimiters on a line.
288   * @param line
289   * @return
290   */
291  private static List<LineDelimiterStats> lineToLineDelimiterStats(String line) {
292    return Arrays.stream(POTENTIAL_DELIMITER_CHAR)
293            .map(delimiter -> new LineDelimiterStats(delimiter, StringUtils.countMatches(line, delimiter)))
294            .collect(Collectors.toList());
295  }
296
297  /**
298   * For each {@link LineDelimiterStats}, collect the distinct frequency (count) of each delimiter.
299   * This gives us an idea of the "stability" of each delimiter across the sample.
300   * Note that since quotes are not handled, noise can be introduced if a quoted cells use the delimiter.
301   *
302   * See unit test for examples of when this method will be affected by noise.
303   *
304   * The most stable delimiter is normally defined by the {@link Character} returned by the methods where
305   * the list of distinct frequency is the smallest in size excluding cases where the list contains only the element
306   * representing 0 as Integer (which means the delimiter was never used).
307   *
308   * @param linesStats
309   *
310   * @return
311   */
312  static Map<Character, Set<Integer>> computeDelimiterDistinctFrequency(List<LineDelimiterStats> linesStats) {
313    return linesStats.stream()
314            .collect(
315                    Collectors.groupingBy(LineDelimiterStats::getDelimiter,
316                            Collectors.mapping(LineDelimiterStats::getFrequency, toSet())));
317  }
318
319  /**
320   * For each line, check the delimiter that is used the most.
321   * Return the count of each delimiter.
322   * @param lines
323   * @return
324   */
325  static Map<Character, Long> computeDelimiterHighestFrequencyPerLine(List<String> lines) {
326    return lines.stream()
327            .map(TabularFileMetadataExtractor::getDelimiterWithHighestCount2)
328            .flatMap(o -> o.map(Stream::of).orElseGet(Stream::empty)) //remove Optional wrapper and ignore Optional.empty
329            .collect(Collectors.groupingBy(LineDelimiterStats::getDelimiter, counting()));
330  }
331
332  /**
333   * For {@link LineDelimiterStats}, sum the frequency (count) of each delimiter.
334   * This gives us an idea of the overall usage of each delimiter across the sample.
335   * Note that since quotes are not handled, noise can be introduced if a quoted cell uses the delimiter.
336   *
337   * See unit test for examples of when this method will be affected by noise.
338   *
339   * @param linesStats
340   *
341   * @return
342   */
343  static Map<Character, Integer> computeDelimiterFrequencySums(List<LineDelimiterStats> linesStats) {
344    return linesStats.stream()
345            .filter(CONTAINS_FREQUENCY)
346            .collect(
347                    Collectors.groupingBy(LineDelimiterStats::getDelimiter,
348                            Collectors.summingInt(LineDelimiterStats::getFrequency)));
349  }
350
351  /**
352   * Given a line, get the delimiter with the highest count if any can be found.
353   * Note: quotes are ignored in the count so a delimiter used inside quotes will be counted.
354   *
355   * @param line line of text to analyse
356   *
357   * @return
358   */
359  static Optional<Character> getDelimiterWithHighestCount(String line) {
360    int highestCount = 0;
361    Character highestCountDelimiter = null;
362    for (Character delimiter : POTENTIAL_DELIMITER_CHAR) {
363      int currentCount = StringUtils.countMatches(line, delimiter);
364      if (currentCount > highestCount) {
365        highestCount = currentCount;
366        highestCountDelimiter = delimiter;
367      }
368    }
369    return Optional.ofNullable(highestCountDelimiter);
370  }
371
372  static Optional<LineDelimiterStats> getDelimiterWithHighestCount2(String line) {
373    int highestCount = 0;
374   // Character highestCountDelimiter = null;
375    LineDelimiterStats lineDelimiterStats = null;
376    for (Character delimiter : POTENTIAL_DELIMITER_CHAR) {
377      int currentCount = StringUtils.countMatches(line, delimiter);
378      if (currentCount > highestCount) {
379        highestCount = currentCount;
380        lineDelimiterStats = new LineDelimiterStats(delimiter,highestCount);
381      }
382    }
383    return Optional.ofNullable(lineDelimiterStats);
384  }
385
386  /**
387   * Given a line and a delimiter, try to determine the quoting character if any can be found.
388   * To check if a quote character is used we run a regex to check for a delimiter followed by a quoting character.
389   *
390   * @param line line of text to analyse
391   * @param delimiter delimiter used in the line of text
392   *
393   * @return
394   */
395  static Optional<Character> getQuoteCharWithHighestCount(String line, Character delimiter) {
396    int highestCount = 0;
397    Character highestCountQuoteChar = null;
398    for (Character quoteChar : POTENTIAL_QUOTES_CHAR) {
399      int currentCount = 0;
400      Matcher m = COMPILE_QUOTE_PATTERN_FCT.apply(delimiter, quoteChar).matcher(line);
401      while (m.find()) {
402        currentCount++;
403      }
404      if (currentCount > highestCount) {
405        highestCount = currentCount;
406        highestCountQuoteChar = quoteChar;
407      }
408    }
409    return Optional.ofNullable(highestCountQuoteChar);
410  }
411
412  /**
413   * Inner representation of stats (frequency) of a delimiter
414   */
415  static class LineDelimiterStats {
416    private Character delimiter;
417    private int frequency;
418    LineDelimiterStats(Character delimiter, int frequency) {
419      this.delimiter = delimiter;
420      this.frequency = frequency;
421    }
422
423    Character getDelimiter() {
424      return delimiter;
425    }
426
427    int getFrequency() {
428      return frequency;
429    }
430  }
431
432}