/*
 * Copyright 2021 Global Biodiversity Information Facility (GBIF)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.gbif.utils.file.tabular;

import java.io.IOException;
import java.io.Reader;
import java.io.Writer;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors;

import com.fasterxml.jackson.databind.MappingIterator;
import com.fasterxml.jackson.databind.SequenceWriter;
import com.fasterxml.jackson.dataformat.csv.CsvGenerator;
import com.fasterxml.jackson.dataformat.csv.CsvMapper;
import com.fasterxml.jackson.dataformat.csv.CsvParser;
import com.fasterxml.jackson.dataformat.csv.CsvSchema;

import static org.gbif.utils.file.tabular.JacksonUtils.buildCsvSchema;

/**
 * Utility class to rewrite a tabular file (e.g. CSV) into a normalized format.
 * The regular use case is to allow external tools to work as expected (e.g. unix split, unix sort).
 */
public class TabularFileNormalizer {

  // A character is considered to be an ISO control character if its code is in
  // the range '\u0000' through '\u001F' or in the range '\u007F' through '\u009F'.
  private static final String CONTROL_CHAR_REGEX = "\\p{Cntrl}";

  public static final String NORMALIZED_END_OF_LINE = "\n";

  /**
   * Normalizes the tabular file (e.g. CSV) located at {@code source} and writes the result to
   * {@code destination}. Normalization includes: stripping of control characters
   * (see {@link #CONTROL_CHAR_REGEX}), use of \n as the end-of-line character, ensuring the last
   * line ends with an end-of-line character, and removal of completely empty lines.
   * Unnecessary quotes are also removed from the normalized content.
   *
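   * <p>A minimal usage sketch (the file names, delimiter, end-of-line symbols and quote character
   * below are illustrative assumptions only, not requirements of this class):
   * <pre>{@code
   * int written = TabularFileNormalizer.normalizeFile(
   *     Paths.get("raw.csv"), Paths.get("normalized.csv"),
   *     StandardCharsets.UTF_8, ',', "\r\n", '"');
   * }</pre>
   *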
   * @param source {@link Path} of the source file
   * @param destination {@link Path} of the destination file. If the file already exists it will be overwritten.
   * @param sourceCharset optionally, the {@link Charset} of the source; if null, UTF-8 will be used
   * @param delimiterChar delimiter character used in the source file
   * @param endOfLineSymbols end-of-line symbols used in the source file
   * @param quoteChar optionally, the quote character used in the source file; can be null
   *
   * @return number of lines written to the destination
   *
   * @throws IOException if the source cannot be read or the destination cannot be written;
   *                     in that case the (possibly incomplete) destination file is deleted
   */
  public static int normalizeFile(Path source, Path destination, Charset sourceCharset,
                                  char delimiterChar, String endOfLineSymbols, Character quoteChar) throws IOException {
    Objects.requireNonNull(source, "source path shall be provided");
    Objects.requireNonNull(destination, "destination path shall be provided");

    int numberOfLine = 0;
    CsvMapper mapper = new CsvMapper();
    mapper.enable(CsvParser.Feature.WRAP_AS_ARRAY);
    mapper.configure(CsvGenerator.Feature.STRICT_CHECK_FOR_QUOTING, true);

    CsvSchema readerSchema = buildCsvSchema(delimiterChar, endOfLineSymbols, quoteChar);
    CsvSchema writerSchema = buildCsvSchema(delimiterChar, NORMALIZED_END_OF_LINE, quoteChar);

    Charset charset = Optional.ofNullable(sourceCharset).orElse(StandardCharsets.UTF_8);
    try (Reader sourceReader = Files.newBufferedReader(source, charset);
        Writer writer = Files.newBufferedWriter(destination, charset);
        MappingIterator<List<String>> it = mapper.readerFor(List.class)
            .with(readerSchema)
            .readValues(sourceReader);
        SequenceWriter csvWriter = mapper.writerFor(List.class).with(writerSchema).writeValues(writer)) {
      List<String> line;
      while (it.hasNext()) {
        line = normalizeLine(it.next());
        if (!line.isEmpty()) {
          csvWriter.write(line);
          numberOfLine++;
        }
      }
    } catch (IOException ioEx) {
      // avoid keeping an incomplete file
      Files.deleteIfExists(destination);
      throw ioEx;
    }
    return numberOfLine;
  }

  /**
   * For a given line in a tabular file, normalize it if it contains something.
   *
   * @param line the parsed line as a list of column values; can be null
   *
   * @return the normalized line, or an empty list if the provided line is null or empty
   */
  private static List<String> normalizeLine(List<String> line) {
    if (line == null || line.isEmpty()) {
      return new ArrayList<>();
    }

    return line.stream()
        .map(s -> s == null ? "" : s)
        .map(str -> str.replaceAll(CONTROL_CHAR_REGEX, ""))
        .collect(Collectors.toList());
  }
}