Source code

001/*
002 * Licensed under the Apache License, Version 2.0 (the "License");
003 * you may not use this file except in compliance with the License.
004 * You may obtain a copy of the License at
005 *
006 *     http://www.apache.org/licenses/LICENSE-2.0
007 *
008 * Unless required by applicable law or agreed to in writing, software
009 * distributed under the License is distributed on an "AS IS" BASIS,
010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
011 * See the License for the specific language governing permissions and
012 * limitations under the License.
013 */
014package org.gbif.utils.file.tabular;
015
016import org.gbif.utils.PreconditionUtils;
017import org.gbif.utils.file.CharsetDetection;
018import org.gbif.utils.file.UnknownCharsetException;
019
020import java.io.BufferedReader;
021import java.io.IOException;
022import java.nio.charset.Charset;
023import java.nio.file.Files;
024import java.nio.file.Path;
025import java.util.ArrayList;
026import java.util.Arrays;
027import java.util.Collections;
028import java.util.Comparator;
029import java.util.HashSet;
030import java.util.LinkedHashMap;
031import java.util.List;
032import java.util.Map;
033import java.util.Objects;
034import java.util.Optional;
035import java.util.Set;
036import java.util.function.BiFunction;
037import java.util.function.Function;
038import java.util.function.Predicate;
039import java.util.regex.Matcher;
040import java.util.regex.Pattern;
041import java.util.stream.Collectors;
042import java.util.stream.Stream;
043
044import org.apache.commons.lang3.StringUtils;
045import org.slf4j.Logger;
046import org.slf4j.LoggerFactory;
047
048import static java.util.Collections.reverseOrder;
049import static java.util.stream.Collectors.counting;
050import static java.util.stream.Collectors.toSet;
051
052/**
053 * Utility class to extract metadata {@link TabularFileMetadata} from a tabular file.
054 */
055public class TabularFileMetadataExtractor {
056
057  private static final Logger LOG = LoggerFactory.getLogger(TabularFileMetadataExtractor.class);
058  private static final int MAX_SAMPLE_SIZE = 15;
059
060  // This needs to be large enough to stumble upon a non-ASCII character.
061  private static final int CHARSET_DETECTION_BUFFER_LENGTH = 1024 * 1024;
062
063  private TabularFileMetadataExtractor() {}
064
065  private static final Character[] POTENTIAL_DELIMITER_CHAR = {',', '\t', ';', '|'};
066  private static final Character[] POTENTIAL_QUOTES_CHAR = {'"', '\''};
067
068  private static final Predicate<LineDelimiterStats> CONTAINS_FREQUENCY =
069      lineStats -> lineStats.getFrequency() > 0;
070  private static final Comparator<Map.Entry<Character, Long>> BY_VALUE_LONG_DESC =
071      Map.Entry.comparingByValue(reverseOrder());
072  private static final BiFunction<Character, Character, Pattern> COMPILE_QUOTE_PATTERN_FCT =
073      (delimiter, quoteChar) ->
074          Pattern.compile("[" + delimiter + "][ ]*[" + quoteChar + "][ ]*[^" + delimiter + "]");
075
076  /**
077   * Extract metadata from a tabular file using a sample (defined by {@link #MAX_SAMPLE_SIZE}) of the file.
078   * The extraction process is based on the frequency of character in the sample using 3 different approaches.
079   * The method will not return any default value if no delimiter and/or quote character can be found in the sample.
080   * The caller should decide which default values should be used to read the file.
081   *
082   * @param filePath a {@link Path} pointing to a file (not a folder).
083   * @return new {@link TabularFileMetadata}, never null (but the content can be null).
084   * @throws IOException
085   * @throws UnknownCharsetException
086   */
087  public static TabularFileMetadata extractTabularFileMetadata(Path filePath)
088      throws IOException, UnknownCharsetException {
089    Objects.requireNonNull(filePath, "filePath shall be provided");
090    PreconditionUtils.checkArgument(
091        !Files.isDirectory(filePath), "filePath should point to a file, not a directory");
092
093    Charset encoding;
094    try {
095      encoding =
096          CharsetDetection.detectEncoding(filePath.toFile(), CHARSET_DETECTION_BUFFER_LENGTH);
097      if (encoding == null) {
098        throw new UnknownCharsetException("Unable to detect the file's character encoding");
099      }
100    } catch (IOException e) {
101      throw new UnknownCharsetException(e);
102    }
103
104    // open a first stream to read a sample of the file
105    List<String> lines = new ArrayList<>();
106    try (BufferedReader bf = Files.newBufferedReader(filePath, encoding)) {
107      String line;
108      do {
109        line = bf.readLine();
110        if (line != null) {
111          lines.add(line);
112        }
113      } while (line != null && lines.size() < MAX_SAMPLE_SIZE);
114    }
115    TabularFileMetadata tabularFileMetadata = extractTabularMetadata(lines);
116    tabularFileMetadata.setEncoding(encoding);
117    return tabularFileMetadata;
118  }
119
120  /**
121   * Tries to extract the {@link TabularFileMetadata} from a sample of lines of a tabular file.
122   *
123   * @param sample
124   * @return new {@link TabularFileMetadata}, never null (but the content can be null).
125   */
126  static TabularFileMetadata extractTabularMetadata(final List<String> sample) {
127    Objects.requireNonNull(sample, "sample shall be provided");
128    TabularFileMetadata tabularFileMetadata = new TabularFileMetadata();
129
130    Optional<Character> delimiterFound = getDelimiterChar(sample);
131    final Character delimiter = delimiterFound.orElse(null);
132    if (delimiter == null) {
133      return tabularFileMetadata;
134    }
135
136    Optional<Character> quoteFound =
137        getHighestCountOf(sample, line -> getQuoteCharWithHighestCount(line, delimiter));
138    final Character quote = quoteFound.orElse(null);
139
140    tabularFileMetadata.setDelimiter(delimiter);
141    tabularFileMetadata.setQuotedBy(quote);
142
143    return tabularFileMetadata;
144  }
145
146  /**
147   * Extract a character from a line using the given function.
148   * Return the character with the highest counts.
149   *
150   * @param sample
151   * @param characterExtractor function to apply on each line to extract a character
152   *
153   * @return
154   */
155  private static Optional<Character> getHighestCountOf(
156      final List<String> sample, final Function<String, Optional<Character>> characterExtractor) {
157
158    // remove Optional wrapper and ignore Optional.empty
159    return sample.stream()
160        .map(characterExtractor)
161        .flatMap(
162            o ->
163                o.map(Stream::of)
164                    .orElseGet(Stream::empty)) // remove Optional wrapper and ignore Optional.empty
165        .collect(Collectors.groupingBy(Function.identity(), counting()))
166        .entrySet()
167        .stream()
168        .min(BY_VALUE_LONG_DESC)
169        .map(Map.Entry::getKey);
170  }
171
172  /**
173   * Given a sample of line, this method tries to determine the delimiter char used.
174   *
175   * @param sample
176   *
177   * @return the determined delimiter or Optional.empty if it can not be determined.
178   */
179  public static Optional<Character> getDelimiterChar(final List<String> sample) {
180
181    // count the frequency of all possible delimiter for each lines
182    List<LineDelimiterStats> linesStats = computeLineDelimiterStats(sample);
183
184    // get the distinct set of frequency for each delimiters to check the "stability"
185    Map<Character, Set<Integer>> delimiterDistinctFrequency =
186        computeDelimiterDistinctFrequency(linesStats).entrySet().stream()
187            // filter out delimiter that we never saw
188            .filter(entry -> entry.getValue().size() > 1 || !entry.getValue().contains(0))
189            .sorted(Comparator.comparing(e -> e.getValue().size()))
190            .collect(
191                Collectors.toMap(
192                    Map.Entry::getKey, Map.Entry::getValue, (e1, e2) -> e2, LinkedHashMap::new));
193
194    // we can have more than one
195    Set<Character> mostStableDelimiter =
196        getAllEqualsToFirst(delimiterDistinctFrequency, (s1, s2) -> s1.size() == s2.size());
197
198    // get the most used delimiter to check the "overall usage"
199    Map<Character, Integer> delimiterFrequencySums =
200        computeDelimiterFrequencySums(linesStats).entrySet().stream()
201            .sorted(Map.Entry.<Character, Integer>comparingByValue().reversed())
202            .collect(
203                Collectors.toMap(
204                    Map.Entry::getKey, Map.Entry::getValue, (e1, e2) -> e2, LinkedHashMap::new));
205
206    Set<Character> mostFrequentDelimiter =
207        getAllEqualsToFirst(delimiterFrequencySums, Integer::equals);
208
209    // get the highest frequency per line to check for "usage per line"
210    Map<Character, Long> delimiterHighestFrequencyPerLine =
211        computeDelimiterHighestFrequencyPerLine(sample).entrySet().stream()
212            .sorted(Map.Entry.<Character, Long>comparingByValue().reversed())
213            .collect(
214                Collectors.toMap(
215                    Map.Entry::getKey, Map.Entry::getValue, (e1, e2) -> e2, LinkedHashMap::new));
216    Set<Character> mostFrequentDelimiterPerLine =
217        getAllEqualsToFirst(delimiterHighestFrequencyPerLine, Long::equals);
218
219    // summary
220    if (LOG.isDebugEnabled()) {
221      LOG.debug("delimiterDistinctFrequency -> " + delimiterDistinctFrequency);
222      LOG.debug("mostStableDelimiter -> " + mostStableDelimiter);
223      LOG.debug("delimiterFrequencySums -> " + delimiterFrequencySums);
224      LOG.debug("mostFrequentDelimiter -> " + mostFrequentDelimiter);
225      LOG.debug("delimiterHighestFrequencyPerLine->" + delimiterHighestFrequencyPerLine);
226      LOG.debug("mostFrequentDelimiterPerLine ->" + mostFrequentDelimiterPerLine);
227    }
228
229    // if the most stable is also the one that is used to most within the sample
230    Optional<Character> resultCharacter =
231        intersectSingle(mostStableDelimiter, mostFrequentDelimiter);
232    if (resultCharacter.isPresent()) {
233      return resultCharacter;
234    }
235
236    // otherwise, if the most stable is also the most used based on lines
237    resultCharacter = intersectSingle(mostStableDelimiter, mostFrequentDelimiterPerLine);
238    if (resultCharacter.isPresent()) {
239      return resultCharacter;
240    }
241
242    // as last resort if the most frequent delimiter overall and by line is the same
243    resultCharacter = intersectSingle(mostFrequentDelimiter, mostFrequentDelimiterPerLine);
244
245    return resultCharacter;
246  }
247
248  /**
249   * Return the {@link Character} represents the intersection between 2 sets only if the resulting set represents
250   * a single element.
251   * @param set1
252   * @param set2
253   * @return
254   */
255  private static Optional<Character> intersectSingle(Set<Character> set1, Set<Character> set2) {
256    Set<Character> intersection = new HashSet<>(set1);
257    intersection.retainAll(set2);
258
259    return intersection.size() == 1 ? intersection.stream().findFirst() : Optional.empty();
260  }
261
262  /**
263   * Given a {@link Map}, return all elements that are equals to the first element (including itself)
264   * based on the provided equals function.
265   * @param map
266   * @param equalsPredicate
267   * @param <T>
268   * @return all elements that are equals to the first one or an empty set if the map is empty
269   */
270  private static <T> Set<Character> getAllEqualsToFirst(
271      Map<Character, T> map, BiFunction<T, T, Boolean> equalsPredicate) {
272
273    Optional<Map.Entry<Character, T>> firstMapEntry = map.entrySet().stream().findFirst();
274    if (!firstMapEntry.isPresent()) {
275      return Collections.EMPTY_SET;
276    }
277
278    final T firstValue = firstMapEntry.get().getValue();
279    return map.entrySet().stream()
280        .filter(e -> equalsPredicate.apply(firstValue, e.getValue()))
281        .map(Map.Entry::getKey)
282        .collect(Collectors.toSet());
283  }
284
285  /**
286   * For each element(line) of the sample, compute a {@link LineDelimiterStats} for each delimiter.
287   * Note: delimiter that are not used within a line will be included with the frequency 0.
288   * @param sample
289   * @return new List, never null
290   */
291  static List<LineDelimiterStats> computeLineDelimiterStats(List<String> sample) {
292    return sample.stream()
293        .map(TabularFileMetadataExtractor::lineToLineDelimiterStats)
294        .flatMap(List::stream)
295        .collect(Collectors.toList());
296  }
297
298  /**
299   * Compute the stats for each potential delimiters on a line.
300   * @param line
301   * @return
302   */
303  private static List<LineDelimiterStats> lineToLineDelimiterStats(String line) {
304    return Arrays.stream(POTENTIAL_DELIMITER_CHAR)
305        .map(
306            delimiter ->
307                new LineDelimiterStats(delimiter, StringUtils.countMatches(line, delimiter)))
308        .collect(Collectors.toList());
309  }
310
311  /**
312   * For each {@link LineDelimiterStats}, collect the distinct frequency (count) of each delimiter.
313   * This gives us an idea of the "stability" of each delimiter across the sample.
314   * Note that since quotes are not handled, noise can be introduced if a quoted cells use the delimiter.
315   *
316   * See unit test for examples of when this method will be affected by noise.
317   *
318   * The most stable delimiter is normally defined by the {@link Character} returned by the methods where
319   * the list of distinct frequency is the smallest in size excluding cases where the list contains only the element
320   * representing 0 as Integer (which means the delimiter was never used).
321   *
322   * @param linesStats
323   *
324   * @return
325   */
326  static Map<Character, Set<Integer>> computeDelimiterDistinctFrequency(
327      List<LineDelimiterStats> linesStats) {
328    return linesStats.stream()
329        .collect(
330            Collectors.groupingBy(
331                LineDelimiterStats::getDelimiter,
332                Collectors.mapping(LineDelimiterStats::getFrequency, toSet())));
333  }
334
335  /**
336   * For each line, check the delimiter that is used the most.
337   * Return the count of each delimiter.
338   * @param lines
339   * @return
340   */
341  static Map<Character, Long> computeDelimiterHighestFrequencyPerLine(List<String> lines) {
342    return lines.stream()
343        .map(TabularFileMetadataExtractor::getDelimiterWithHighestCount2)
344        .flatMap(
345            o ->
346                o.map(Stream::of)
347                    .orElseGet(Stream::empty)) // remove Optional wrapper and ignore Optional.empty
348        .collect(Collectors.groupingBy(LineDelimiterStats::getDelimiter, counting()));
349  }
350
351  /**
352   * For {@link LineDelimiterStats}, sum the frequency (count) of each delimiter.
353   * This gives us an idea of the overall usage of each delimiter across the sample.
354   * Note that since quotes are not handled, noise can be introduced if a quoted cell uses the delimiter.
355   *
356   * See unit test for examples of when this method will be affected by noise.
357   *
358   * @param linesStats
359   *
360   * @return
361   */
362  static Map<Character, Integer> computeDelimiterFrequencySums(
363      List<LineDelimiterStats> linesStats) {
364    return linesStats.stream()
365        .filter(CONTAINS_FREQUENCY)
366        .collect(
367            Collectors.groupingBy(
368                LineDelimiterStats::getDelimiter,
369                Collectors.summingInt(LineDelimiterStats::getFrequency)));
370  }
371
372  /**
373   * Given a line, get the delimiter with the highest count if any can be found.
374   * Note: quotes are ignored in the count so a delimiter used inside quotes will be counted.
375   *
376   * @param line line of text to analyse
377   *
378   * @return
379   */
380  static Optional<Character> getDelimiterWithHighestCount(String line) {
381    int highestCount = 0;
382    Character highestCountDelimiter = null;
383    for (Character delimiter : POTENTIAL_DELIMITER_CHAR) {
384      int currentCount = StringUtils.countMatches(line, delimiter);
385      if (currentCount > highestCount) {
386        highestCount = currentCount;
387        highestCountDelimiter = delimiter;
388      }
389    }
390    return Optional.ofNullable(highestCountDelimiter);
391  }
392
393  static Optional<LineDelimiterStats> getDelimiterWithHighestCount2(String line) {
394    int highestCount = 0;
395    // Character highestCountDelimiter = null;
396    LineDelimiterStats lineDelimiterStats = null;
397    for (Character delimiter : POTENTIAL_DELIMITER_CHAR) {
398      int currentCount = StringUtils.countMatches(line, delimiter);
399      if (currentCount > highestCount) {
400        highestCount = currentCount;
401        lineDelimiterStats = new LineDelimiterStats(delimiter, highestCount);
402      }
403    }
404    return Optional.ofNullable(lineDelimiterStats);
405  }
406
407  /**
408   * Given a line and a delimiter, try to determine the quoting character if any can be found.
409   * To check if a quote character is used we run a regex to check for a delimiter followed by a quoting character.
410   *
411   * @param line line of text to analyse
412   * @param delimiter delimiter used in the line of text
413   *
414   * @return
415   */
416  static Optional<Character> getQuoteCharWithHighestCount(String line, Character delimiter) {
417    int highestCount = 0;
418    Character highestCountQuoteChar = null;
419    for (Character quoteChar : POTENTIAL_QUOTES_CHAR) {
420      int currentCount = 0;
421      Matcher m = COMPILE_QUOTE_PATTERN_FCT.apply(delimiter, quoteChar).matcher(line);
422      while (m.find()) {
423        currentCount++;
424      }
425      if (currentCount > highestCount) {
426        highestCount = currentCount;
427        highestCountQuoteChar = quoteChar;
428      }
429    }
430    return Optional.ofNullable(highestCountQuoteChar);
431  }
432
433  /**
434   * Inner representation of stats (frequency) of a delimiter
435   */
436  static class LineDelimiterStats {
437    private Character delimiter;
438    private int frequency;
439
440    LineDelimiterStats(Character delimiter, int frequency) {
441      this.delimiter = delimiter;
442      this.frequency = frequency;
443    }
444
445    Character getDelimiter() {
446      return delimiter;
447    }
448
449    int getFrequency() {
450      return frequency;
451    }
452  }
453}