/*
 * Copyright 2021 Global Biodiversity Information Facility (GBIF)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.gbif.utils.file.tabular;

import java.io.IOException;
import java.io.Reader;
import java.io.Writer;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors;

import com.fasterxml.jackson.databind.MappingIterator;
import com.fasterxml.jackson.databind.SequenceWriter;
import com.fasterxml.jackson.dataformat.csv.CsvGenerator;
import com.fasterxml.jackson.dataformat.csv.CsvMapper;
import com.fasterxml.jackson.dataformat.csv.CsvParser;
import com.fasterxml.jackson.dataformat.csv.CsvSchema;

import static org.gbif.utils.file.tabular.JacksonUtils.buildCsvSchema;

/**
 * Utility class to rewrite a tabular file (e.g. CSV) into a normalized format.
 * The regular use case is to allow external tools to work as expected (e.g. unix split, unix sort).
 */
public class TabularFileNormalizer {

  // A character is considered to be an ISO control character if its code is in
  // the range '\u0000' through '\u001F' or in the range '\u007F' through '\u009F'.
  private static final String CONTROL_CHAR_REGEX = "\\p{Cntrl}";

  public static final String NORMALIZED_END_OF_LINE = "\n";

  /**
   * Normalizes the tabular file (e.g. CSV) located at {@code source} and writes the result to
   * {@code destination}. Normalization includes: stripping of control characters
   * (see {@link #CONTROL_CHAR_REGEX}), use of \n as the end-of-line character, ensuring the last
   * line ends with an end-of-line character, and removal of completely empty lines.
   * Unnecessary quotes are also removed from the normalized content.
   *
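   * <p>A minimal usage sketch (the file names, delimiter, end-of-line symbols and quote character
   * below are illustrative assumptions only, not requirements of this class):
   * <pre>{@code
   * int written = TabularFileNormalizer.normalizeFile(
   *     Paths.get("raw.csv"), Paths.get("normalized.csv"),
   *     StandardCharsets.UTF_8, ',', "\r\n", '"');
   * }</pre>
   *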
   * @param source {@link Path} of the source file
   * @param destination {@link Path} of the destination file. If the file already exists it will be overwritten.
   * @param sourceCharset optionally, the {@link Charset} of the source; if null, UTF-8 will be used
   * @param delimiterChar delimiter character used in the source file
   * @param endOfLineSymbols end-of-line symbols used in the source file
   * @param quoteChar optionally, the quote character used in the source file; can be null
   *
   * @return number of lines written to the destination
   *
   * @throws IOException if the source cannot be read or the destination cannot be written;
   *                     in that case the (possibly incomplete) destination file is deleted
   */
  public static int normalizeFile(Path source, Path destination, Charset sourceCharset,
                                  char delimiterChar, String endOfLineSymbols, Character quoteChar) throws IOException {
    Objects.requireNonNull(source, "source path shall be provided");
    Objects.requireNonNull(destination, "destination path shall be provided");

    int numberOfLine = 0;
    CsvMapper mapper = new CsvMapper();
    mapper.enable(CsvParser.Feature.WRAP_AS_ARRAY);
    mapper.configure(CsvGenerator.Feature.STRICT_CHECK_FOR_QUOTING, true);

    CsvSchema readerSchema = buildCsvSchema(delimiterChar, endOfLineSymbols, quoteChar);
    CsvSchema writerSchema = buildCsvSchema(delimiterChar, NORMALIZED_END_OF_LINE, quoteChar);

    Charset charset = Optional.ofNullable(sourceCharset).orElse(StandardCharsets.UTF_8);
    try (Reader sourceReader = Files.newBufferedReader(source, charset);
        Writer writer = Files.newBufferedWriter(destination, charset);
        MappingIterator<List<String>> it = mapper.readerFor(List.class)
            .with(readerSchema)
            .readValues(sourceReader);
        SequenceWriter csvWriter = mapper.writerFor(List.class).with(writerSchema).writeValues(writer)) {
      List<String> line;
      while (it.hasNext()) {
        line = normalizeLine(it.next());
        if (!line.isEmpty()) {
          csvWriter.write(line);
          numberOfLine++;
        }
      }
    } catch (IOException ioEx) {
      // avoid keeping an incomplete file
      Files.deleteIfExists(destination);
      throw ioEx;
    }
    return numberOfLine;
  }

  /**
   * For a given line in a tabular file, normalize it if it contains something.
   *
   * @param line the parsed line as a list of column values; can be null
   *
   * @return the normalized line, or an empty list if the provided line is null or empty
   */
  private static List<String> normalizeLine(List<String> line) {
    if (line == null || line.isEmpty()) {
      return new ArrayList<>();
    }

    return line.stream()
        .map(s -> s == null ? "" : s)
        .map(str -> str.replaceAll(CONTROL_CHAR_REGEX, ""))
        .collect(Collectors.toList());
  }
}