001/* 002 * Licensed under the Apache License, Version 2.0 (the "License"); 003 * you may not use this file except in compliance with the License. 004 * You may obtain a copy of the License at 005 * 006 * http://www.apache.org/licenses/LICENSE-2.0 007 * 008 * Unless required by applicable law or agreed to in writing, software 009 * distributed under the License is distributed on an "AS IS" BASIS, 010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 011 * See the License for the specific language governing permissions and 012 * limitations under the License. 013 */ 014package org.gbif.utils.file.tabular; 015 016import java.io.IOException; 017import java.io.Reader; 018import java.io.Writer; 019import java.nio.charset.Charset; 020import java.nio.charset.StandardCharsets; 021import java.nio.file.Files; 022import java.nio.file.Path; 023import java.util.ArrayList; 024import java.util.List; 025import java.util.Objects; 026import java.util.Optional; 027import java.util.stream.Collectors; 028 029import com.fasterxml.jackson.databind.MappingIterator; 030import com.fasterxml.jackson.databind.SequenceWriter; 031import com.fasterxml.jackson.dataformat.csv.CsvGenerator; 032import com.fasterxml.jackson.dataformat.csv.CsvMapper; 033import com.fasterxml.jackson.dataformat.csv.CsvParser; 034import com.fasterxml.jackson.dataformat.csv.CsvSchema; 035 036import static org.gbif.utils.file.tabular.JacksonUtils.buildCsvSchema; 037 038/** 039 * Utility class to rewrite a tabular file (e.g. CSV) into a normalized format. 040 * The regular use case is to allow external tools to work as expected (e.g. unix split, unix sort). 041 */ 042public class TabularFileNormalizer { 043 044 // A character is considered to be an ISO control character if its code is in 045 // the range '\u0000' through '\u001F' or in the range '\u007F' through '\u009F'. 046 private static final String CONTROL_CHAR_REGEX = "\\p{Cntrl}"; 047 048 public static final String NORMALIZED_END_OF_LINE = "\n"; 049 050 /** 051 * Normalizes the provided tabular "file" (provided as {@link Reader} to let the caller deal with charset). 052 * Normalization includes: stripping of Control Characters (see {@link #CONTROL_CHAR_REGEX}), 053 * usage of \n as end-line-character, ensuring there is an end-of-line character on the last line and 054 * removing empty (completely empty) lines. 055 * The normalized content will have unnecessary quotes removed. 056 * 057 * @param source {@link Path} representing the source 058 * @param destination {@link Path} representing the destination. If the file already exists it will be overwritten. 059 * @param sourceCharset optionally, the {@link Charset} of the source. If null UTF-8 will be used. 060 * @param delimiterChar 061 * @param endOfLineSymbols 062 * @param quoteChar optional 063 * 064 * @return number of lines written 065 * 066 * @throws IOException 067 */ 068 public static int normalizeFile( 069 Path source, 070 Path destination, 071 Charset sourceCharset, 072 char delimiterChar, 073 String endOfLineSymbols, 074 Character quoteChar) 075 throws IOException { 076 Objects.requireNonNull(source, "source path shall be provided"); 077 Objects.requireNonNull(destination, "normalizedWriter shall be provided"); 078 079 int numberOfLine = 0; 080 CsvMapper mapper = new CsvMapper(); 081 mapper.enable(CsvParser.Feature.WRAP_AS_ARRAY); 082 083 mapper.configure(CsvGenerator.Feature.STRICT_CHECK_FOR_QUOTING, true); 084 085 CsvSchema readerSchema = buildCsvSchema(delimiterChar, endOfLineSymbols, quoteChar); 086 CsvSchema writerSchema = buildCsvSchema(delimiterChar, NORMALIZED_END_OF_LINE, quoteChar); 087 088 Charset charset = Optional.ofNullable(sourceCharset).orElse(StandardCharsets.UTF_8); 089 try (Reader sourceReader = Files.newBufferedReader(source, charset); 090 Writer writer = Files.newBufferedWriter(destination, charset); 091 MappingIterator<List<String>> it = 092 mapper.readerFor(List.class).with(readerSchema).readValues(sourceReader); 093 SequenceWriter csvWriter = 094 mapper.writerFor(List.class).with(writerSchema).writeValues(writer)) { 095 List<String> line; 096 while (it.hasNext()) { 097 line = normalizeLine(it.next()); 098 if (!line.isEmpty()) { 099 csvWriter.write(line); 100 numberOfLine++; 101 } 102 } 103 } catch (IOException ioEx) { 104 // avoid keeping incomplete file 105 Files.deleteIfExists(destination); 106 throw ioEx; 107 } 108 return numberOfLine; 109 } 110 111 /** 112 * For a given line in a tabular file, normalize it if it contains something. 113 * 114 * @param line 115 * 116 * @return normalized line or an empty list if source is null or empty 117 */ 118 private static List<String> normalizeLine(List<String> line) { 119 if (line == null || line.isEmpty()) { 120 return new ArrayList<>(); 121 } 122 123 return line.stream() 124 .map(s -> s == null ? "" : s) 125 .map(str -> str.replaceAll(CONTROL_CHAR_REGEX, "")) 126 .collect(Collectors.toList()); 127 } 128}