001package org.gbif.utils.file.tabular;
002
003import java.io.IOException;
004import java.io.Reader;
005import java.io.Writer;
006import java.nio.charset.Charset;
007import java.nio.charset.StandardCharsets;
008import java.nio.file.Files;
009import java.nio.file.Path;
010import java.util.List;
011import java.util.Objects;
012import java.util.Optional;
013import java.util.stream.Collectors;
014
015import com.fasterxml.jackson.databind.MappingIterator;
016import com.fasterxml.jackson.databind.SequenceWriter;
017import com.fasterxml.jackson.dataformat.csv.CsvMapper;
018import com.fasterxml.jackson.dataformat.csv.CsvParser;
019import com.fasterxml.jackson.dataformat.csv.CsvSchema;
020import com.google.common.base.CharMatcher;
021
022import static org.gbif.utils.file.tabular.JacksonUtils.buildCsvSchema;
023
024/**
025 * Utility class to rewrite a tabular file (e.g. CSV) into a normalized format.
026 * The regular use case is to allow external tools to work as expected (e.g. unix split, unix sort).
027 */
028public class TabularFileNormalizer {
029
030  // A character is considered to be an ISO control character if its code is in
031  // the range '\u0000' through '\u001F' or in the range '\u007F' through '\u009F'.
032  private static CharMatcher CONTROL_CHAR_MATCHER = CharMatcher.JAVA_ISO_CONTROL;
033
034  public static String NORMALIZED_END_OF_LINE = "\n";
035
036  /**
037   * Normalizes the provided tabular "file" (provided as {@link Reader} to let the caller deal with charset).
038   * Normalization includes: striping of Control Characters (see {@link #CONTROL_CHAR_MATCHER}),
039   * usage of \n as end-line-character, ensuring there is an end-of-line character on the last line and
040   * removing empty (completely empty) lines.
041   * The normalized content will have unnecessary quotes removed.
042   *
043   * @param source           {@link Path} representing the source
044   * @param destination      {@link Path} representing the destination. If the file already exists it will be overwritten.
045   * @param sourceCharset    optionally, the {@link Charset} of the source. If null UTF-8 will be used.
046   * @param delimiterChar
047   * @param endOfLineSymbols
048   * @param quoteChar        optional
049   *
050   * @return number of lines written
051   *
052   * @throws IOException
053   */
054  public static int normalizeFile(Path source, Path destination, Charset sourceCharset,
055                                  char delimiterChar, String endOfLineSymbols, Character quoteChar) throws IOException {
056    Objects.requireNonNull(source, "source path shall be provided");
057    Objects.requireNonNull(destination, "normalizedWriter shall be provided");
058
059    int numberOfLine = 0;
060    CsvMapper mapper = new CsvMapper();
061    mapper.enable(CsvParser.Feature.WRAP_AS_ARRAY);
062
063    CsvSchema readerSchema = buildCsvSchema(delimiterChar, endOfLineSymbols, quoteChar);
064    CsvSchema writerSchema = buildCsvSchema(delimiterChar, NORMALIZED_END_OF_LINE, quoteChar);
065
066    Charset charset = Optional.ofNullable(sourceCharset).orElse(StandardCharsets.UTF_8);
067    try (Reader sourceReader = Files.newBufferedReader(source, charset);
068         Writer writer = Files.newBufferedWriter(destination, charset);
069         MappingIterator<List<String>> it = mapper.readerFor(List.class)
070                 .with(readerSchema)
071                 .readValues(sourceReader);
072         SequenceWriter csvWriter = mapper.writerFor(List.class).with(writerSchema).writeValues(writer)) {
073      Optional<List<String>> line;
074      while (it.hasNext()) {
075        line = normalizeLine(it.next());
076        if (line.isPresent()) {
077          csvWriter.write(line.get());
078          numberOfLine++;
079        }
080      }
081    }
082    catch (IOException ioEx) {
083      //avoid keeping incomplete file
084      Files.deleteIfExists(destination);
085      throw ioEx;
086    }
087    return numberOfLine;
088  }
089
090  /**
091   * For a given line in a tabular file, normalize it if it contains something.
092   *
093   * @param line
094   *
095   * @return normalized line as String or {@code Optional.empty()} is the line was empty
096   */
097  private static Optional<List<String>> normalizeLine(List<String> line) {
098    if (line == null || line.isEmpty()) {
099      return Optional.empty();
100    }
101
102    return Optional.of(
103            line.stream()
104                    .map(s -> s == null ? "" : s)
105                    .map(CONTROL_CHAR_MATCHER::removeFrom)
106                    .collect(Collectors.toList()));
107  }
108}