Source code

001/*
002 * Licensed under the Apache License, Version 2.0 (the "License");
003 * you may not use this file except in compliance with the License.
004 * You may obtain a copy of the License at
005 *
006 *     http://www.apache.org/licenses/LICENSE-2.0
007 *
008 * Unless required by applicable law or agreed to in writing, software
009 * distributed under the License is distributed on an "AS IS" BASIS,
010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
011 * See the License for the specific language governing permissions and
012 * limitations under the License.
013 */
014package org.gbif.utils.file.tabular;
015
016import java.io.IOException;
017import java.io.Reader;
018import java.io.Writer;
019import java.nio.charset.Charset;
020import java.nio.charset.StandardCharsets;
021import java.nio.file.Files;
022import java.nio.file.Path;
023import java.util.ArrayList;
024import java.util.List;
025import java.util.Objects;
026import java.util.Optional;
027import java.util.stream.Collectors;
028
029import com.fasterxml.jackson.databind.MappingIterator;
030import com.fasterxml.jackson.databind.SequenceWriter;
031import com.fasterxml.jackson.dataformat.csv.CsvGenerator;
032import com.fasterxml.jackson.dataformat.csv.CsvMapper;
033import com.fasterxml.jackson.dataformat.csv.CsvParser;
034import com.fasterxml.jackson.dataformat.csv.CsvSchema;
035
036import static org.gbif.utils.file.tabular.JacksonUtils.buildCsvSchema;
037
038/**
039 * Utility class to rewrite a tabular file (e.g. CSV) into a normalized format.
040 * The regular use case is to allow external tools to work as expected (e.g. unix split, unix sort).
041 */
042public class TabularFileNormalizer {
043
044  // A character is considered to be an ISO control character if its code is in
045  // the range '\u0000' through '\u001F' or in the range '\u007F' through '\u009F'.
046  private static final String CONTROL_CHAR_REGEX = "\\p{Cntrl}";
047
048  public static final String NORMALIZED_END_OF_LINE = "\n";
049
050  /**
051   * Normalizes the provided tabular "file" (provided as {@link Reader} to let the caller deal with charset).
052   * Normalization includes: stripping of Control Characters (see {@link #CONTROL_CHAR_REGEX}),
053   * usage of \n as end-line-character, ensuring there is an end-of-line character on the last line and
054   * removing empty (completely empty) lines.
055   * The normalized content will have unnecessary quotes removed.
056   *
057   * @param source           {@link Path} representing the source
058   * @param destination      {@link Path} representing the destination. If the file already exists it will be overwritten.
059   * @param sourceCharset    optionally, the {@link Charset} of the source. If null UTF-8 will be used.
060   * @param delimiterChar
061   * @param endOfLineSymbols
062   * @param quoteChar        optional
063   *
064   * @return number of lines written
065   *
066   * @throws IOException
067   */
068  public static int normalizeFile(
069      Path source,
070      Path destination,
071      Charset sourceCharset,
072      char delimiterChar,
073      String endOfLineSymbols,
074      Character quoteChar)
075      throws IOException {
076    Objects.requireNonNull(source, "source path shall be provided");
077    Objects.requireNonNull(destination, "normalizedWriter shall be provided");
078
079    int numberOfLine = 0;
080    CsvMapper mapper = new CsvMapper();
081    mapper.enable(CsvParser.Feature.WRAP_AS_ARRAY);
082
083    mapper.configure(CsvGenerator.Feature.STRICT_CHECK_FOR_QUOTING, true);
084
085    CsvSchema readerSchema = buildCsvSchema(delimiterChar, endOfLineSymbols, quoteChar);
086    CsvSchema writerSchema = buildCsvSchema(delimiterChar, NORMALIZED_END_OF_LINE, quoteChar);
087
088    Charset charset = Optional.ofNullable(sourceCharset).orElse(StandardCharsets.UTF_8);
089    try (Reader sourceReader = Files.newBufferedReader(source, charset);
090        Writer writer = Files.newBufferedWriter(destination, charset);
091        MappingIterator<List<String>> it =
092            mapper.readerFor(List.class).with(readerSchema).readValues(sourceReader);
093        SequenceWriter csvWriter =
094            mapper.writerFor(List.class).with(writerSchema).writeValues(writer)) {
095      List<String> line;
096      while (it.hasNext()) {
097        line = normalizeLine(it.next());
098        if (!line.isEmpty()) {
099          csvWriter.write(line);
100          numberOfLine++;
101        }
102      }
103    } catch (IOException ioEx) {
104      // avoid keeping incomplete file
105      Files.deleteIfExists(destination);
106      throw ioEx;
107    }
108    return numberOfLine;
109  }
110
111  /**
112   * For a given line in a tabular file, normalize it if it contains something.
113   *
114   * @param line
115   *
116   * @return normalized line or an empty list if source is null or empty
117   */
118  private static List<String> normalizeLine(List<String> line) {
119    if (line == null || line.isEmpty()) {
120      return new ArrayList<>();
121    }
122
123    return line.stream()
124        .map(s -> s == null ? "" : s)
125        .map(str -> str.replaceAll(CONTROL_CHAR_REGEX, ""))
126        .collect(Collectors.toList());
127  }
128}