001package org.gbif.utils.file.tabular;
002
003import org.gbif.utils.file.CharsetDetection;
004import org.gbif.utils.file.UnkownCharsetException;
005
006import java.io.BufferedReader;
007import java.io.IOException;
008import java.nio.charset.Charset;
009import java.nio.file.Files;
010import java.nio.file.Path;
011import java.util.ArrayList;
012import java.util.Arrays;
013import java.util.Collections;
014import java.util.Comparator;
015import java.util.LinkedHashMap;
016import java.util.List;
017import java.util.Map;
018import java.util.Objects;
019import java.util.Optional;
020import java.util.Set;
021import java.util.function.BiFunction;
022import java.util.function.Function;
023import java.util.function.Predicate;
024import java.util.regex.Matcher;
025import java.util.regex.Pattern;
026import java.util.stream.Collectors;
027import java.util.stream.Stream;
028
029import com.google.common.base.Preconditions;
030import com.google.common.collect.Sets;
031import org.apache.commons.lang3.StringUtils;
032import org.slf4j.Logger;
033import org.slf4j.LoggerFactory;
034
035import static java.util.Collections.reverseOrder;
036import static java.util.stream.Collectors.counting;
037import static java.util.stream.Collectors.toSet;
038
039/**
040 * Utility class to extract metadata {@link TabularFileMetadata} from a tabular file.
041 */
042public class TabularFileMetadataExtractor {
043
044  private static final Logger LOG = LoggerFactory.getLogger(TabularFileMetadataExtractor.class);
045  private static final int MAX_SAMPLE_SIZE = 15;
046  private static final int CHARSET_DETECTION_BUFFER_LENGTH = 16384;
047
048  private TabularFileMetadataExtractor() {
049  }
050
051  private static final Character[] POTENTIAL_DELIMITER_CHAR = {',', '\t', ';', '|'};
052  private static final Character[] POTENTIAL_QUOTES_CHAR = {'"', '\''};
053
054  private static final Predicate<LineDelimiterStats> CONTAINS_FREQUENCY = lineStats -> lineStats.getFrequency() > 0;
055  private static final Comparator<Map.Entry<Character, Long>> BY_VALUE_LONG_DESC = Comparator.comparing(Map.Entry::getValue, reverseOrder());
056  private static final BiFunction<Character, Character, Pattern> COMPILE_QUOTE_PATTERN_FCT = (delimiter, quoteChar)
057          -> Pattern.compile("[" + delimiter + "][ ]*[" + quoteChar + "][ ]*[^" + delimiter + "]");
058
059  /**
060   * Extract metadata from a tabular file using a sample (defined by {@link #MAX_SAMPLE_SIZE}) of the file.
061   * The extraction process is based on the frequency of character in the sample using 3 different approaches.
062   * The method will not return any default value if no delimiter and/or quote character can be found in the sample.
063   * The caller should decide which default values should be used to read the file.
064   *
065   * @param filePath a {@link Path} pointing to a file (not a folder).
066   * @return new {@link TabularFileMetadata}, never null (but the content can be null).
067   * @throws IOException
068   * @throws UnkownCharsetException
069   */
070  public static TabularFileMetadata extractTabularFileMetadata(Path filePath) throws IOException, UnkownCharsetException {
071    Objects.requireNonNull(filePath, "filePath shall be provided");
072    Preconditions.checkArgument(!Files.isDirectory(filePath), "filePath should point to a file, not a directory");
073
074    Charset encoding;
075    try {
076      encoding = CharsetDetection.detectEncoding(filePath.toFile(), CHARSET_DETECTION_BUFFER_LENGTH);
077      if (encoding == null) {
078        throw new UnkownCharsetException("Unable to detect the files character encoding");
079      }
080    } catch (IOException e) {
081      throw new UnkownCharsetException(e);
082    }
083
084    // open a first stream to read a sample of the file
085    List<String> lines = new ArrayList<>();
086    try (BufferedReader bf = Files.newBufferedReader(filePath, encoding)) {
087      String line;
088      do {
089        line = bf.readLine();
090        if(line != null) {
091          lines.add(line);
092        }
093      }
094      while(line != null && lines.size() < MAX_SAMPLE_SIZE);
095    }
096    TabularFileMetadata tabularFileMetadata = extractTabularMetadata(lines);
097    tabularFileMetadata.setEncoding(encoding);
098    return tabularFileMetadata;
099  }
100
101  /**
102   * Tries to extract the {@link TabularFileMetadata} from a sample of lines of a tabular file.
103   *
104   * @param sample
105   * @return new {@link TabularFileMetadata}, never null (but the content can be null).
106   */
107  static TabularFileMetadata extractTabularMetadata(final List<String> sample) {
108    Objects.requireNonNull(sample, "sample shall be provided");
109    TabularFileMetadata tabularFileMetadata = new TabularFileMetadata();
110
111    Optional<Character> delimiterFound = getDelimiterChar(sample);
112    final Character delimiter = delimiterFound.orElse(null);
113    if(delimiter == null) {
114      return tabularFileMetadata;
115    }
116
117    Optional<Character> quoteFound = getHighestCountOf(sample, line -> getQuoteCharWithHighestCount(line, delimiter));
118    final Character quote = quoteFound.orElse(null);
119
120    tabularFileMetadata.setDelimiter(delimiter);
121    tabularFileMetadata.setQuotedBy(quote);
122
123    return tabularFileMetadata;
124  }
125
126  /**
127   * Extract a character from a line using the given function.
128   * Return the character with the highest counts.
129   *
130   * @param sample
131   * @param characterExtractor function to apply on each line to extract a character
132   *
133   * @return
134   */
135  private static Optional<Character> getHighestCountOf(final List<String> sample, final Function<String,
136          Optional<Character>> characterExtractor) {
137
138    return sample.stream()
139            .map(characterExtractor)
140            .flatMap(o -> o.map(Stream::of).orElseGet(Stream::empty)) //remove Optional wrapper and ignore Optional.empty
141            .collect(Collectors.groupingBy(Function.identity(), counting()))
142            .entrySet()
143            .stream()
144            .sorted(BY_VALUE_LONG_DESC)
145            .findFirst()
146            .map(Map.Entry::getKey);
147  }
148
149  /**
150   * Given a sample of line, this method tries to determine the delimiter char used.
151   *
152   * @param sample
153   *
154   * @return the determined delimiter or Optional.empty if it can not be determined.
155   */
156  public static Optional<Character> getDelimiterChar(final List<String> sample) {
157
158    // count the frequency of all possible delimiter for each lines
159    List<LineDelimiterStats> linesStats =
160            computeLineDelimiterStats(sample);
161
162    // get the distinct set of frequency for each delimiters to check the "stability"
163    Map<Character, Set<Integer>> delimiterDistinctFrequency =
164            computeDelimiterDistinctFrequency(linesStats)
165                    .entrySet().stream()
166                    // filter out delimiter that we never saw
167                    .filter(entry -> entry.getValue().size() > 1 || !entry.getValue().contains(Integer.valueOf(0)))
168                    .sorted(Comparator.comparing(e -> e.getValue().size()))
169                    .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue,
170                            (e1, e2) -> e2, LinkedHashMap::new));
171
172    // we can have more than one
173    Set<Character> mostStableDelimiter =
174            getAllEqualsToFirst(delimiterDistinctFrequency, (s1,s2) -> s1.size() == s2.size());
175
176    // get the most used delimiter to check the "overall usage"
177    Map<Character, Integer> delimiterFrequencySums = computeDelimiterFrequencySums(linesStats)
178            .entrySet().stream()
179            .sorted( Comparator.comparing( (Map.Entry<Character, Integer> e) -> e.getValue()).reversed())
180            .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue,
181                    (e1, e2) -> e2, LinkedHashMap::new));
182
183    Set<Character> mostFrequentDelimiter = getAllEqualsToFirst(delimiterFrequencySums, Integer::equals);
184
185    //get the highest frequency per line to check for "usage per line"
186    Map<Character, Long> delimiterHighestFrequencyPerLine = computeDelimiterHighestFrequencyPerLine(sample)
187            .entrySet().stream()
188            .sorted( Comparator.comparing( (Map.Entry<Character, Long> e) -> e.getValue()).reversed())
189            .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue,
190                    (e1, e2) -> e2, LinkedHashMap::new));
191    Set<Character> mostFrequentDelimiterPerLine = getAllEqualsToFirst(delimiterHighestFrequencyPerLine,
192           Long::equals);
193
194    //summary
195    if (LOG.isDebugEnabled()) {
196      LOG.debug("delimiterDistinctFrequency -> " + delimiterDistinctFrequency);
197      LOG.debug("mostStableDelimiter -> " + mostStableDelimiter);
198      LOG.debug("delimiterFrequencySums -> " + delimiterFrequencySums);
199      LOG.debug("mostFrequentDelimiter -> " + mostFrequentDelimiter);
200      LOG.debug("delimiterHighestFrequencyPerLine->" + delimiterHighestFrequencyPerLine);
201      LOG.debug("mostFrequentDelimiterPerLine ->" + mostFrequentDelimiterPerLine);
202    }
203
204    //if the most stable is also the one that is used to most within the sample
205    Optional<Character> resultCharacter = intersectSingle(mostStableDelimiter, mostFrequentDelimiter);
206    if(resultCharacter.isPresent()) {
207      return resultCharacter;
208    }
209
210    //otherwise, if the most stable is also the most used based on lines
211    resultCharacter = intersectSingle(mostStableDelimiter, mostFrequentDelimiterPerLine);
212    if(resultCharacter.isPresent()) {
213      return resultCharacter;
214    }
215
216    //as last resort if the most frequent delimiter overall and by line is the same
217    resultCharacter = intersectSingle(mostFrequentDelimiter, mostFrequentDelimiterPerLine);
218    if(resultCharacter.isPresent()) {
219      return resultCharacter;
220    }
221
222    // other idea
223    // give more weight to the first line (it is probably the header line where normally no quotes are used)
224    // give more weight to the first "column" (it is probably the "id" line where normally no quotes are used)
225    return Optional.empty();
226  }
227
228  /**
229   * Return the {@link Character} represents the intersection between 2 sets only if the resulting set represents
230   * a single element.
231   * @param set1
232   * @param set2
233   * @return
234   */
235  private static Optional<Character> intersectSingle(Set<Character> set1, Set<Character> set2) {
236    Sets.SetView<Character> intersection = Sets.intersection(set1, set2);
237    return intersection.size() == 1 ? intersection.stream().findFirst() : Optional.empty();
238  }
239
240  /**
241   * Given a {@link Map}, return all elements that are equals to the first element (including itself)
242   * based on the provided equals function.
243   * @param map
244   * @param equalsPredicate
245   * @param <T>
246   * @return all elements that are equals to the first one or an empty set if the map is empty
247   */
248  private static <T> Set<Character> getAllEqualsToFirst(Map<Character, T> map, BiFunction<T,T, Boolean> equalsPredicate) {
249
250    Optional<Map.Entry<Character, T>> firstMapEntry = map.entrySet().stream().findFirst();
251    if(!firstMapEntry.isPresent()){
252      return Collections.EMPTY_SET;
253    }
254
255    final T firstValue = firstMapEntry.get().getValue();
256    return map.entrySet().stream().filter( e -> equalsPredicate.apply(firstValue,e.getValue()))
257            .map(Map.Entry::getKey).collect(Collectors.toSet());
258  }
259
260  /**
261   * For each element(line) of the sample, compute a {@link LineDelimiterStats} for each delimiter.
262   * Note: delimiter that are not used within a line will be included with the frequency 0.
263   * @param sample
264   * @return new List, never null
265   */
266  static List<LineDelimiterStats> computeLineDelimiterStats(List<String> sample){
267    return sample.stream()
268            .map(TabularFileMetadataExtractor::lineToLineDelimiterStats)
269            .flatMap(List::stream)
270            .collect(Collectors.toList());
271  }
272
273  /**
274   * Compute the stats for each potential delimiters on a line.
275   * @param line
276   * @return
277   */
278  private static List<LineDelimiterStats> lineToLineDelimiterStats(String line) {
279    return Arrays.stream(POTENTIAL_DELIMITER_CHAR)
280            .map( delimiter -> new LineDelimiterStats(delimiter, StringUtils.countMatches(line, delimiter)))
281            .collect(Collectors.toList());
282  }
283
284  /**
285   * For each {@link LineDelimiterStats}, collect the distinct frequency (count) of each delimiter.
286   * This gives us an idea of the "stability" of each delimiter across the sample.
287   * Note that since quotes are not handled, noise can be introduced if a quoted cells use the delimiter.
288   *
289   * See unit test for examples of when this method will be affected by noise.
290   *
291   * The most stable delimiter is normally defined by the {@link Character} returned by the methods where
292   * the list of distinct frequency is the smallest in size excluding cases where the list contains only the element
293   * representing 0 as Integer (which means the delimiter was never used).
294   *
295   * @param linesStats
296   *
297   * @return
298   */
299  static Map<Character, Set<Integer>> computeDelimiterDistinctFrequency(List<LineDelimiterStats> linesStats) {
300    return linesStats.stream()
301            .collect(
302                    Collectors.groupingBy(LineDelimiterStats::getDelimiter,
303                            Collectors.mapping(LineDelimiterStats::getFrequency, toSet())));
304  }
305
306  /**
307   * For each line, check the delimiter that is used the most.
308   * Return the count of each delimiter.
309   * @param lines
310   * @return
311   */
312  static Map<Character, Long> computeDelimiterHighestFrequencyPerLine(List<String> lines) {
313    return lines.stream()
314            .map(TabularFileMetadataExtractor::getDelimiterWithHighestCount2)
315            .flatMap(o -> o.map(Stream::of).orElseGet(Stream::empty)) //remove Optional wrapper and ignore Optional.empty
316            .collect(Collectors.groupingBy(LineDelimiterStats::getDelimiter, counting()));
317  }
318
319  /**
320   * For {@link LineDelimiterStats}, sum the frequency (count) of each delimiter.
321   * This gives us an idea of the overall usage of each delimiter across the sample.
322   * Note that since quotes are not handled, noise can be introduced if a quoted cell uses the delimiter.
323   *
324   * See unit test for examples of when this method will be affected by noise.
325   *
326   * @param linesStats
327   *
328   * @return
329   */
330  static Map<Character, Integer> computeDelimiterFrequencySums(List<LineDelimiterStats> linesStats) {
331    return linesStats.stream()
332            .filter(CONTAINS_FREQUENCY)
333            .collect(
334                    Collectors.groupingBy(LineDelimiterStats::getDelimiter,
335                            Collectors.summingInt(LineDelimiterStats::getFrequency)));
336  }
337
338  /**
339   * Given a line, get the delimiter with the highest count if any can be found.
340   * Note: quotes are ignored in the count so a delimiter used inside quotes will be counted.
341   *
342   * @param line line of text to analyse
343   *
344   * @return
345   */
346  static Optional<Character> getDelimiterWithHighestCount(String line) {
347    int highestCount = 0;
348    Character highestCountDelimiter = null;
349    for (Character delimiter : POTENTIAL_DELIMITER_CHAR) {
350      int currentCount = StringUtils.countMatches(line, delimiter);
351      if (currentCount > highestCount) {
352        highestCount = currentCount;
353        highestCountDelimiter = delimiter;
354      }
355    }
356    return Optional.ofNullable(highestCountDelimiter);
357  }
358
359  static Optional<LineDelimiterStats> getDelimiterWithHighestCount2(String line) {
360    int highestCount = 0;
361   // Character highestCountDelimiter = null;
362    LineDelimiterStats lineDelimiterStats = null;
363    for (Character delimiter : POTENTIAL_DELIMITER_CHAR) {
364      int currentCount = StringUtils.countMatches(line, delimiter);
365      if (currentCount > highestCount) {
366        highestCount = currentCount;
367        lineDelimiterStats = new LineDelimiterStats(delimiter,highestCount);
368      }
369    }
370    return Optional.ofNullable(lineDelimiterStats);
371  }
372
373  /**
374   * Given a line and a delimiter, try to determine the quoting character if any can be found.
375   * To check if a quote character is used we run a regex to check for a delimiter followed by a quoting character.
376   *
377   * @param line line of text to analyse
378   * @param delimiter delimiter used in the line of text
379   *
380   * @return
381   */
382  static Optional<Character> getQuoteCharWithHighestCount(String line, Character delimiter) {
383    int highestCount = 0;
384    Character highestCountQuoteChar = null;
385    for (Character quoteChar : POTENTIAL_QUOTES_CHAR) {
386      int currentCount = 0;
387      Matcher m = COMPILE_QUOTE_PATTERN_FCT.apply(delimiter, quoteChar).matcher(line);
388      while (m.find()) {
389        currentCount++;
390      }
391      if (currentCount > highestCount) {
392        highestCount = currentCount;
393        highestCountQuoteChar = quoteChar;
394      }
395    }
396    return Optional.ofNullable(highestCountQuoteChar);
397  }
398
399  /**
400   * Inner representation of stats (frequency) of a delimiter
401   */
402  static class LineDelimiterStats {
403    private Character delimiter;
404    private int frequency;
405    LineDelimiterStats(Character delimiter, int frequency) {
406      this.delimiter = delimiter;
407      this.frequency = frequency;
408    }
409
410    Character getDelimiter() {
411      return delimiter;
412    }
413
414    int getFrequency() {
415      return frequency;
416    }
417  }
418
419}