Source code

001/*
002 * Licensed under the Apache License, Version 2.0 (the "License");
003 * you may not use this file except in compliance with the License.
004 * You may obtain a copy of the License at
005 *
006 *     http://www.apache.org/licenses/LICENSE-2.0
007 *
008 * Unless required by applicable law or agreed to in writing, software
009 * distributed under the License is distributed on an "AS IS" BASIS,
010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
011 * See the License for the specific language governing permissions and
012 * limitations under the License.
013 */
014package org.gbif.utils.file;
015
016import org.gbif.utils.collection.CompactHashSet;
017import org.gbif.utils.text.LineComparator;
018
019import java.io.BufferedInputStream;
020import java.io.BufferedReader;
021import java.io.BufferedWriter;
022import java.io.File;
023import java.io.FileInputStream;
024import java.io.FileNotFoundException;
025import java.io.FileOutputStream;
026import java.io.FileWriter;
027import java.io.IOException;
028import java.io.InputStream;
029import java.io.InputStreamReader;
030import java.io.OutputStream;
031import java.io.OutputStreamWriter;
032import java.io.UnsupportedEncodingException;
033import java.io.Writer;
034import java.net.URISyntaxException;
035import java.net.URL;
036import java.nio.ByteBuffer;
037import java.nio.charset.Charset;
038import java.nio.charset.StandardCharsets;
039import java.util.Collections;
040import java.util.Comparator;
041import java.util.HashMap;
042import java.util.LinkedList;
043import java.util.List;
044import java.util.Map;
045import java.util.Objects;
046import java.util.Set;
047import java.util.regex.Pattern;
048import java.util.stream.Collectors;
049
050import org.apache.commons.io.FilenameUtils;
051import org.apache.commons.io.LineIterator;
052import org.apache.commons.lang3.StringUtils;
053import org.apache.commons.lang3.tuple.Pair;
054import org.slf4j.Logger;
055import org.slf4j.LoggerFactory;
056
057/**
058 * Collection of file utils.
059 * <br>
060 * This class has only been tested for use with a UTF-8 system encoding.
061 */
062public final class FileUtils {
063
064  private static final Logger LOG = LoggerFactory.getLogger(FileUtils.class);
065
066  public static final String UTF8 = StandardCharsets.UTF_8.name();
067  public static final Pattern TAB_DELIMITED = Pattern.compile("\t");
068  private static int linesPerMemorySort = 100000;
069  private static Boolean gnuSortAvailable = null;
070  private static final Object sortLock = new Object();
071
072  static {
073    /* Warn when the software is not run in a Unicode environment.  This library has not been
074     * tested to run in a non-Unicode environment, and may cause data corruption.
075     */
076    if (Charset.defaultCharset().equals(StandardCharsets.US_ASCII)) {
077      System.err.println(
078          "The default character set is US ASCII.  It is strongly recommended to "
079              + "run this software in a Unicode environment.");
080    }
081  }
082
083  public static String classpath2Filepath(String path) {
084    return new File(ClassLoader.getSystemResource(path).getFile()).getAbsolutePath();
085  }
086
087  public static InputStream classpathStream(String path) throws IOException {
088    InputStream in = null;
089    // relative path. Use classpath instead
090    URL url = FileUtils.class.getClassLoader().getResource(path);
091    if (url != null) {
092      in = url.openStream();
093    }
094    return in;
095  }
096
097  public static Set<String> columnsToSet(InputStream source, int... column) throws IOException {
098    return columnsToSet(source, new CompactHashSet<String>(), column);
099  }
100
101  /**
102   * Reads a file and returns a unique set of multiple columns from lines which are no comments (starting with #) and
103   * trims whitespace.
104   *
105   * @param source the UTF-8 encoded text file with tab delimited columns
106   * @param resultSet the set implementation to be used. Will not be cleared before reading!
107   * @param column variable length argument of column indices to process
108   * @return set of column rows
109   */
110  public static Set<String> columnsToSet(InputStream source, Set<String> resultSet, int... column)
111      throws IOException {
112    LineIterator lines = getLineIterator(source);
113    int maxCols = 0;
114    for (int c : column) {
115      if (c > maxCols) {
116        maxCols = c;
117      }
118    }
119    while (lines.hasNext()) {
120      String line = lines.nextLine().trim();
121      // ignore comments
122      if (!ignore(line)) {
123        String[] parts = TAB_DELIMITED.split(line);
124        if (maxCols <= parts.length) {
125          for (int c : column) {
126            String cell = parts[c].trim();
127            resultSet.add(cell);
128          }
129        }
130      }
131    }
132    return resultSet;
133  }
134
135  public static void copyStreams(InputStream in, OutputStream out) throws IOException {
136    // write the file to the file specified
137    int bytesRead;
138    byte[] buffer = new byte[8192];
139
140    while ((bytesRead = in.read(buffer, 0, 8192)) != -1) {
141      out.write(buffer, 0, bytesRead);
142    }
143
144    out.close();
145    in.close();
146  }
147
148  public static void copyStreamToFile(InputStream in, File out) throws IOException {
149    copyStreams(in, new FileOutputStream(out));
150  }
151
152  public static File createTempDir() throws IOException {
153    return createTempDir("gbif-futil", ".tmp");
154  }
155
156  /**
157   * @param prefix The prefix string to be used in generating the file's name; must be at least three characters long
158   * @param suffix The suffix string to be used in generating the file's name; may be null, in which case the suffix
159   *        ".tmp" will be used
160   */
161  public static File createTempDir(String prefix, String suffix) throws IOException {
162    File dir = File.createTempFile(prefix, suffix);
163    if (!dir.delete()) {
164      throw new IOException("Could not delete temp file: " + dir.getAbsolutePath());
165    }
166    if (!dir.mkdir()) {
167      throw new IOException("Could not create temp directory: " + dir.getAbsolutePath());
168    }
169    return dir;
170  }
171
172  /**
173   * Delete directory recursively, including all its files, sub-folders, and sub-folder's files.
174   *
175   * @param directory directory to delete recursively
176   */
177  public static void deleteDirectoryRecursively(File directory) {
178    File[] list = directory.listFiles();
179    for (File file : list) {
180      if (file.isDirectory()) {
181        deleteDirectoryRecursively(file);
182        file.delete();
183      } else {
184        file.delete();
185      }
186    }
187    directory.delete();
188  }
189
190  /**
191   * Escapes a filename so it is a valid filename on all systems, replacing /. .. \t\r\n.
192   *
193   * @param filename to be escaped
194   */
195  public static String escapeFilename(String filename) {
196    return filename.replaceAll("[\\s./&]", "_");
197  }
198
199  public static File getClasspathFile(String path) {
200    return new File(ClassLoader.getSystemResource(path).getFile());
201  }
202
203  public static InputStream getInputStream(File source) throws FileNotFoundException {
204    return new FileInputStream(source);
205  }
206
207  public static BufferedReader getInputStreamReader(InputStream input)
208      throws FileNotFoundException {
209    return getInputStreamReader(input, UTF8);
210  }
211
212  public static BufferedReader getInputStreamReader(InputStream input, String encoding)
213      throws FileNotFoundException {
214    BufferedReader reader = null;
215    try {
216      reader = new BufferedReader(new InputStreamReader(input, encoding));
217    } catch (UnsupportedEncodingException e) {
218      LOG.warn("Caught Exception", e);
219    }
220    return reader;
221  }
222
223  /**
224   * @param source the source input stream encoded in UTF-8
225   */
226  public static LineIterator getLineIterator(InputStream source) {
227    return getLineIterator(source, UTF8);
228  }
229
230  /**
231   * @param source the source input stream
232   * @param encoding the encoding used by the input stream
233   */
234  public static LineIterator getLineIterator(InputStream source, String encoding) {
235    try {
236      return new LineIterator(new BufferedReader(new InputStreamReader(source, encoding)));
237    } catch (UnsupportedEncodingException e) {
238      throw new IllegalArgumentException("Unsupported encoding" + encoding, e);
239    }
240  }
241
242  public static BufferedReader getUtf8Reader(File file) throws FileNotFoundException {
243    BufferedReader reader = null;
244    try {
245      reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), UTF8));
246    } catch (UnsupportedEncodingException e) {
247      LOG.warn("Caught Exception", e);
248    }
249    return reader;
250  }
251
252  /**
253   * Converts the byte size into human-readable format.
254   * Support both SI and byte format.
255   */
256  public static String humanReadableByteCount(long bytes, boolean si) {
257    int unit = si ? 1000 : 1024;
258    if (bytes < unit) {
259      return bytes + " B";
260    }
261    int exp = (int) (Math.log(bytes) / Math.log(unit));
262    String pre = (si ? "kMGTPE" : "KMGTPE").charAt(exp - 1) + (si ? "" : "i");
263    return String.format("%.1f %sB", bytes / Math.pow(unit, exp), pre);
264  }
265
266  public static boolean isCompressedFile(File source) {
267    String suffix = source.getName().substring(source.getName().lastIndexOf('.') + 1);
268    return suffix != null
269        && suffix.length() > 0
270        && ("zip".equalsIgnoreCase(suffix)
271            || "tgz".equalsIgnoreCase(suffix)
272            || "gz".equalsIgnoreCase(suffix));
273  }
274
275  /**
276   * Reads a complete file into a byte buffer.
277   */
278  public static ByteBuffer readByteBuffer(File file) throws IOException {
279    byte[] content = org.apache.commons.io.FileUtils.readFileToByteArray(file);
280    return ByteBuffer.wrap(content);
281  }
282
283  /**
284   * Reads the first bytes of a file into a byte buffer.
285   *
286   * @param bufferSize the number of bytes to read from the file
287   */
288  public static ByteBuffer readByteBuffer(File file, int bufferSize) throws IOException {
289    ByteBuffer bbuf = ByteBuffer.allocate(bufferSize);
290    BufferedInputStream f = new BufferedInputStream(new FileInputStream(file), bufferSize);
291
292    int b;
293    while ((b = f.read()) != -1) {
294      if (!bbuf.hasRemaining()) {
295        break;
296      }
297      bbuf.put((byte) b);
298    }
299    f.close();
300
301    return bbuf;
302  }
303
304  /**
305   * @param linesPerMemorySort are the number of lines that should be sorted in memory, determining the number of file
306   *        segments to be sorted when doing a Java file sort. Defaults to 100000, if you have
307   *        memory available a higher value increases performance.
308   */
309  public static void setLinesPerMemorySort(int linesPerMemorySort) {
310    FileUtils.linesPerMemorySort = linesPerMemorySort;
311  }
312
313  public static Writer startNewUtf8File(File file) throws IOException {
314    touch(file);
315    return new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file, false), UTF8));
316  }
317
318  public static Writer startNewUtf8XmlFile(File file) throws IOException {
319    Writer writer = startNewUtf8File(file);
320    writer.write("<?xml version='1.0' encoding='utf-8'?>\n");
321    return writer;
322  }
323
324  /**
325   * Takes a utf8 encoded input stream and reads in every line/row into a list.
326   *
327   * @return list of rows
328   */
329  public static LinkedList<String> streamToList(InputStream source) throws IOException {
330    return streamToList(source, FileUtils.UTF8);
331  }
332
333  /**
334   * Reads a file and returns a list of all lines which are no comments (starting with #) and trims whitespace.
335   *
336   * @param source the UTF-8 encoded text file to read
337   * @param resultList the list implementation to be used. Will not be cleared before reading!
338   * @return list of lines
339   */
340  public static List<String> streamToList(InputStream source, List<String> resultList)
341      throws IOException {
342    LineIterator lines = getLineIterator(source);
343    while (lines.hasNext()) {
344      String line = lines.nextLine().trim();
345      // ignore comments
346      if (!ignore(line)) {
347        resultList.add(line);
348      }
349    }
350    return resultList;
351  }
352
353  public static LinkedList<String> streamToList(InputStream source, String encoding)
354      throws IOException {
355    LinkedList<String> resultList = new LinkedList<>();
356    try {
357      LineIterator lines =
358          new LineIterator(new BufferedReader(new InputStreamReader(source, encoding)));
359      while (lines.hasNext()) {
360        String line = lines.nextLine();
361        resultList.add(line);
362      }
363    } catch (UnsupportedEncodingException e) {
364      throw new IllegalArgumentException("Unsupported encoding " + encoding, e);
365    }
366    return resultList;
367  }
368
369  /**
370   * Reads a utf8 encoded inut stream, splits
371   */
372  public static Map<String, String> streamToMap(InputStream source) throws IOException {
373    return streamToMap(source, new HashMap<>());
374  }
375
376  public static Map<String, String> streamToMap(
377      InputStream source, int key, int value, boolean trimToNull) throws IOException {
378    return streamToMap(source, new HashMap<>(), key, value, trimToNull);
379  }
380
381  /**
382   * Read a hashmap from a tab delimited utf8 input stream using the row number as an integer value and the entire row
383   * as the value. Ignores commented rows starting with #.
384   *
385   * @param source tab delimited text file to read
386   */
387  public static Map<String, String> streamToMap(InputStream source, Map<String, String> result)
388      throws IOException {
389    LineIterator lines = getLineIterator(source);
390    Integer row = 0;
391    while (lines.hasNext()) {
392      row++;
393      String line = lines.nextLine().trim();
394      // ignore comments
395      if (!ignore(line)) {
396        result.put(line, row.toString());
397      }
398    }
399    return result;
400  }
401
402  /**
403   * Read a hashmap from a tab delimited utf8 file, ignoring commented rows starting with #.
404   *
405   * @param source tab delimited input stream to read
406   * @param key column number to use as key
407   * @param value column number to use as value
408   * @param trimToNull if true trims map entries to null
409   */
410  public static Map<String, String> streamToMap(
411      InputStream source, Map<String, String> result, int key, int value, boolean trimToNull)
412      throws IOException {
413    LineIterator lines = getLineIterator(source);
414    int maxCols = key > value ? key : value + 1;
415    while (lines.hasNext()) {
416      String line = lines.nextLine();
417      // ignore comments
418      if (!ignore(line)) {
419        String[] parts = TAB_DELIMITED.split(line);
420        if (maxCols <= parts.length) {
421          if (trimToNull) {
422            result.put(StringUtils.trimToNull(parts[key]), StringUtils.trimToNull(parts[value]));
423          } else {
424            result.put(parts[key], parts[value]);
425          }
426        }
427      }
428    }
429    return result;
430  }
431
432  public static Set<String> streamToSet(InputStream source) throws IOException {
433    return streamToSet(source, new CompactHashSet<>());
434  }
435
436  /**
437   * Reads a file and returns a unique set of all lines which are no comments (starting with #) and trims whitespace.
438   *
439   * @param source the UTF-8 encoded text file to read
440   * @param resultSet the set implementation to be used. Will not be cleared before reading!
441   * @return set of unique lines
442   */
443  public static Set<String> streamToSet(InputStream source, Set<String> resultSet)
444      throws IOException {
445    LineIterator lines = getLineIterator(source);
446    while (lines.hasNext()) {
447      String line = lines.nextLine().trim();
448      // ignore comments
449      if (!ignore(line)) {
450        resultSet.add(line);
451      }
452    }
453    return resultSet;
454  }
455
456  public static String toFilePath(URL url) {
457    String protocol =
458        url.getProtocol() == null || "http".equalsIgnoreCase(url.getProtocol())
459            ? ""
460            : "/__" + url.getProtocol() + "__";
461    String domain = url.getAuthority() == null ? "__domainless" : url.getAuthority();
462    return domain + protocol + url.getFile();
463  }
464
465  public static File url2file(URL url) {
466    File f = null;
467    try {
468      f = new File(url.toURI());
469    } catch (URISyntaxException e) {
470      f = new File(url.getPath());
471    }
472    return f;
473  }
474
475  /**
476   * For the given list, finds the index of the lowest value using the given comparator.
477   *
478   * @param values To compare
479   * @param comparator To use
480   * @return The index of the lowest value, or -1 if they are all null
481   */
482  static int lowestValueIndex(List<String> values, Comparator<String> comparator) {
483    int index = 0;
484    String lowestValue = null;
485    for (int i = 0; i < values.size(); i++) {
486      String value = values.get(i);
487      if (lowestValue == null) {
488        lowestValue = value;
489        index = i;
490      } else if (comparator.compare(lowestValue, value) > 0) {
491        lowestValue = value;
492        index = i;
493      }
494    }
495
496    return lowestValue == null ? -1 : index;
497  }
498
499  /**
500   * For the given file's path, returns a proposed new filename (including path) with the extension
501   * index and suffix. So a file of "/tmp/input.txt" -> "/tmp/input_part_10.txt".
502   *
503   * @param original File
504   * @param index E.g. 10
505   * @return The proposed name
506   */
507  private static File getChunkFile(File original, int index) {
508    return new File(
509        original.getParentFile(),
510        FilenameUtils.getBaseName(original.getName())
511            + '_'
512            + index
513            + getFileExtension(original.getName()));
514  }
515
516  private static boolean ignore(String line) {
517    return StringUtils.trimToNull(line) == null || line.startsWith("#");
518  }
519
520  public int getLinesPerMemorySort() {
521    return linesPerMemorySort;
522  }
523
524  /**
525   * Merges a list of intermediary sort chunk files into a single sorted file. On completion, the intermediary sort
526   * chunk files are deleted.
527   *
528   * @param sortFiles sort chunk files to merge
529   * @param sortedFileWriter writer to merge to. Can already be open and contain data
530   * @param lineComparator To use when determining the order (reuse the one that was used to sort the individual
531   *        files)
532   */
533  public void mergeSortedFiles(
534      List<File> sortFiles, Writer sortedFileWriter, Comparator<String> lineComparator)
535      throws IOException {
536    LinkedList<Pair<String, BufferedReader>> partReaders = new LinkedList<>();
537    try {
538      for (File f : sortFiles) {
539        // Use UTF-8 sort order.
540        BufferedReader partReader =
541            new BufferedReader(
542                new InputStreamReader(new FileInputStream(f), StandardCharsets.UTF_8));
543        // Load first lines
544        String partLine = partReader.readLine();
545        if (partLine != null) {
546          partReaders.add(Pair.of(partLine, partReader));
547        }
548      }
549      // Sort the first lines
550      Collections.sort(partReaders, (Comparator.comparing(Pair::getLeft, lineComparator)));
551
552      // Start with the first reader
553      while (partReaders.size() > 1) {
554        BufferedReader currentBuffer = partReaders.get(0).getRight();
555        String currentLine = partReaders.get(0).getLeft();
556        String nextFilesFirstLine = partReaders.get(1).getLeft();
557
558        // Read from it, until its value is greater than the second reader.
559        while (currentLine != null
560            && lineComparator.compare(currentLine, nextFilesFirstLine) <= 0) {
561          sortedFileWriter.write(currentLine);
562          sortedFileWriter.write('\n');
563
564          currentLine = currentBuffer.readLine();
565        }
566        partReaders.remove(0);
567
568        if (currentLine == null) {
569          // If it's completed, close and remove it.
570          currentBuffer.close();
571        } else {
572          // Otherwise, insert it into the list, maintaining the order
573          Pair<String, BufferedReader> currentReaderPair = Pair.of(currentLine, currentBuffer);
574
575          // Start at 1, as we are always larger than the first (was the second) entry
576          for (int i = 1; i <= partReaders.size(); i++) {
577            // If we get here, it goes at the end of the list.
578            if (i == partReaders.size()) {
579              partReaders.add(i, currentReaderPair);
580              break;
581            }
582
583            if (lineComparator.compare(partReaders.get(i).getLeft(), currentLine) >= 0) {
584              partReaders.add(i, currentReaderPair);
585              break;
586            }
587          }
588        }
589        // mergeFileStatus("Loop "+currentLine, partReaders);
590      }
591
592      // Read the remainder of the final buffer
593      // mergeFileStatus("Final", partReaders);
594      if (partReaders.size() > 0) {
595        BufferedReader currentBuffer = partReaders.get(0).getRight();
596        String current = partReaders.get(0).getLeft();
597        while (current != null) {
598          sortedFileWriter.write(current);
599          sortedFileWriter.write('\n');
600
601          current = currentBuffer.readLine();
602        }
603        currentBuffer.close();
604      }
605    } finally {
606      for (Pair<String, BufferedReader> pair : partReaders) {
607        try {
608          pair.getRight().close();
609        } catch (RuntimeException e) {
610        }
611      }
612      // I assume it periodically flushes anyway, so only need to do once at end...
613      sortedFileWriter.flush();
614      sortedFileWriter.close();
615      // delete (intermediary) sort chunk files, only the sorted file remains
616      for (File f : sortFiles) {
617        f.delete();
618      }
619    }
620  }
621
622  // Just for debugging
623  private void mergeFileStatus(String note, List<Pair<String, BufferedReader>> partReaders) {
624    LOG.trace(note);
625    for (int i = 0; i < partReaders.size(); i++) {
626      LOG.trace(i + ": " + partReaders.get(i).getLeft());
627    }
628    LOG.trace("-");
629  }
630
631  /**
632   * Sorts the input file into the output file using the supplied delimited line parameters.
633   *
634   * This method is not reliable when the sort field may contain Unicode codepoints outside the Basic Multilingual Plane,
635   * i.e. above \uFFFF. In that case, the sort order differs from Java's String sort order.  This should not be a problem
636   * for most usage; the Supplementary Multilingual Planes contain ancient scripts, emojis, arrows and so on.
637   *
638   * @param input To sort
639   * @param sorted The sorted version of the input excluding ignored header lines (see ignoreHeaderLines)
640   * @param column the column that keeps the values to sort on
641   * @param columnDelimiter the delimiter that separates columns in a row
642   * @param enclosedBy optional column enclosing character, e.g. a double quote for CSVs
643   * @param newlineDelimiter the chars used for new lines, usually \n, \n\r or \r
644   * @param ignoreHeaderLines number of beginning lines to ignore, e.g. headers
645   */
646  public void sort(
647      File input,
648      File sorted,
649      String encoding,
650      int column,
651      String columnDelimiter,
652      Character enclosedBy,
653      String newlineDelimiter,
654      int ignoreHeaderLines)
655      throws IOException {
656    sort(
657        Collections.singletonList(input),
658        sorted,
659        encoding,
660        column,
661        columnDelimiter,
662        enclosedBy,
663        newlineDelimiter,
664        ignoreHeaderLines);
665  }
666
667  /**
668   * Sorts the input file into the output file using the supplied delimited line parameters.
669   *
670   * This method is not reliable when the sort field may contain Unicode codepoints outside the Basic Multilingual Plane,
671   * i.e. above \uFFFF. In that case, the sort order differs from Java's String sort order.  This should not be a problem
672   * for most usage; the Supplementary Multilingual Planes contain ancient scripts, emojis, arrows and so on.
673   *
674   * @param inputs To sort
675   * @param sorted The sorted version of the input excluding ignored header lines (see ignoreHeaderLines)
676   * @param column the column that keeps the values to sort on
677   * @param columnDelimiter the delimiter that separates columns in a row
678   * @param enclosedBy optional column enclosing character, e.g. a double quote for CSVs
679   * @param newlineDelimiter the chars used for new lines, usually \n, \n\r or \r
680   * @param ignoreHeaderLines number of beginning lines to ignore, e.g. headers
681   */
682  public void sort(
683      List<File> inputs,
684      File sorted,
685      String encoding,
686      int column,
687      String columnDelimiter,
688      Character enclosedBy,
689      String newlineDelimiter,
690      int ignoreHeaderLines)
691      throws IOException {
692    Comparator<String> lineComparator;
693    if (enclosedBy == null) {
694      lineComparator = new LineComparator(column, columnDelimiter);
695    } else {
696      lineComparator = new LineComparator(column, columnDelimiter, enclosedBy);
697    }
698    sort(
699        inputs,
700        sorted,
701        encoding,
702        column,
703        columnDelimiter,
704        enclosedBy,
705        newlineDelimiter,
706        ignoreHeaderLines,
707        lineComparator,
708        false);
709  }
710
711  /**
712   * Sorts the input file into the output file using the supplied delimited line parameters.
713   *
714   * This method is not reliable when the sort field may contain Unicode codepoints outside the Basic Multilingual Plane,
715   * i.e. above \uFFFF. In that case, the sort order differs from Java's String sort order.  This should not be a problem
716   * for most usage; the Supplementary Multilingual Planes contain ancient scripts, emojis, arrows and so on.
717   *
718   * This method is globally synchronized, in case multiple sorts are attempted to the same file simultaneously.
719   * This could be improved to allow synchronizing against the destination file, rather than for all sorts.
720   *
721   * @param input To sort
722   * @param sorted The sorted version of the input excluding ignored header lines (see ignoreHeaderLines)
723   * @param column the column that keeps the values to sort on
724   * @param columnDelimiter the delimiter that separates columns in a row
725   * @param enclosedBy optional column enclosing character, e.g. a double quote for CSVs
726   * @param newlineDelimiter the chars used for new lines, usually \n, \r\n or \r
727   * @param ignoreHeaderLines number of beginning lines to ignore, e.g. headers
728   * @param lineComparator used to sort the output
729   * @param ignoreCase ignore case order, this parameter couldn't have any effect if the LineComparator is used
730   */
731  public void sort(
732      File input,
733      File sorted,
734      String encoding,
735      int column,
736      String columnDelimiter,
737      Character enclosedBy,
738      String newlineDelimiter,
739      int ignoreHeaderLines,
740      Comparator<String> lineComparator,
741      boolean ignoreCase)
742      throws IOException {
743    sort(
744        Collections.singletonList(input),
745        sorted,
746        encoding,
747        column,
748        columnDelimiter,
749        enclosedBy,
750        newlineDelimiter,
751        ignoreHeaderLines,
752        lineComparator,
753        ignoreCase);
754  }
755
756  /**
757   * Sorts the input file into the output file using the supplied delimited line parameters.
758   *
759   * This method is not reliable when the sort field may contain Unicode codepoints outside the Basic Multilingual Plane,
760   * i.e. above \uFFFF. In that case, the sort order differs from Java's String sort order.  This should not be a problem
761   * for most usage; the Supplementary Multilingual Planes contain ancient scripts, emojis, arrows and so on.
762   *
763   * This method is globally synchronized, in case multiple sorts are attempted to the same file simultaneously.
764   * This could be improved to allow synchronizing against the destination file, rather than for all sorts.
765   *
766   * @param inputs To sort
767   * @param sorted The sorted version of the input excluding ignored header lines (see ignoreHeaderLines)
768   * @param column the column that keeps the values to sort on
769   * @param columnDelimiter the delimiter that separates columns in a row
770   * @param enclosedBy optional column enclosing character, e.g. a double quote for CSVs
771   * @param newlineDelimiter the chars used for new lines, usually \n, \r\n or \r
772   * @param ignoreHeaderLines number of beginning lines to ignore, e.g. headers
773   * @param lineComparator used to sort the output
774   * @param ignoreCase ignore case order, this parameter couldn't have any effect if the LineComparator is used
775   */
776  public void sort(
777      List<File> inputs,
778      File sorted,
779      String encoding,
780      int column,
781      String columnDelimiter,
782      Character enclosedBy,
783      String newlineDelimiter,
784      int ignoreHeaderLines,
785      Comparator<String> lineComparator,
786      boolean ignoreCase)
787      throws IOException {
788    LOG.debug(
789        "Sorting file(s) {} as new file {}",
790        inputs.stream().map(File::getAbsolutePath).toArray(),
791        sorted.getAbsolutePath());
792    if (encoding == null) {
793      LOG.warn("No encoding specified, assume UTF-8");
794      encoding = FileUtils.UTF8;
795    }
796    synchronized (sortLock) {
797      if (sorted.exists()) {
798        // Delete a file, which will allow processes with it open to continue reading it.
799        // The GNU sort truncates and appends, which would mean a partial read otherwise.
800        LOG.warn("Deleting existed sorted file {}", sorted.getAbsoluteFile());
801        sorted.delete();
802      }
803      // if the id is in the first column, first try sorting via shell as its the fastest we can get
804      if (!sortInGnu(
805          inputs,
806          sorted,
807          encoding,
808          ignoreHeaderLines,
809          column,
810          columnDelimiter,
811          enclosedBy,
812          newlineDelimiter,
813          ignoreCase)) {
814        LOG.debug("No GNU sort available, using native Java sorting");
815        sortInJava(inputs, sorted, encoding, lineComparator, ignoreHeaderLines);
816      }
817    }
818  }
819
820  /**
821   * Sorts the input file into the output file using the supplied lineComparator.
822   *
823   * @param input To sort
824   * @param sorted The sorted version of the input excluding ignored header lines (see ignoreHeaderLines)
825   * @param lineComparator To use during comparison
826   * @param ignoreHeaderLines number of beginning lines to ignore, e.g. headers
827   */
828  public void sortInJava(
829      File input,
830      File sorted,
831      String encoding,
832      Comparator<String> lineComparator,
833      int ignoreHeaderLines)
834      throws IOException {
835    sortInJava(
836        Collections.singletonList(input), sorted, encoding, lineComparator, ignoreHeaderLines);
837  }
838
839  /**
840   * Sorts the input file into the output file using the supplied lineComparator.
841   *
842   * @param inputs To sort
843   * @param sorted The sorted version of the input excluding ignored header lines (see ignoreHeaderLines)
844   * @param lineComparator To use during comparison
845   * @param ignoreHeaderLines number of beginning lines to ignore, e.g. headers
846   */
847  public void sortInJava(
848      List<File> inputs,
849      File sorted,
850      String encoding,
851      Comparator<String> lineComparator,
852      int ignoreHeaderLines)
853      throws IOException {
854    LOG.debug("Sorting file(s) {}", inputs);
855    long start = System.currentTimeMillis();
856
857    List<File> sortFiles = new LinkedList<>();
858    List<String> headerLines = new LinkedList<>();
859    for (File input : inputs) {
860      BufferedReader br =
861          new BufferedReader(new InputStreamReader(new FileInputStream(input), encoding));
862      int skipHeaderLines = ignoreHeaderLines;
863      try {
864        String line = br.readLine();
865        int fileCount = 0;
866
867        List<String> linesToSort = new LinkedList<>();
868        while (line != null) {
869          if (skipHeaderLines > 0) {
870            // Only add the header lines for the first file
871            if (headerLines.size() < ignoreHeaderLines) {
872              headerLines.add(line);
873            }
874            skipHeaderLines--;
875          } else {
876            linesToSort.add(line);
877
878            // if buffer is full, then sort and write to file
879            if (linesToSort.size() == linesPerMemorySort) {
880              sortFiles.add(sortAndWrite(input, encoding, lineComparator, fileCount, linesToSort));
881              linesToSort = new LinkedList<>();
882              fileCount++;
883            }
884          }
885          line = br.readLine();
886        }
887        // catch the last lot
888        if (!linesToSort.isEmpty()) {
889          sortFiles.add(sortAndWrite(input, encoding, lineComparator, fileCount, linesToSort));
890        }
891      } finally {
892        br.close();
893      }
894    }
895    LOG.debug(
896        sortFiles.size()
897            + " sorted file chunks created in "
898            + (System.currentTimeMillis() - start) / 1000
899            + " secs");
900
901    // now merge the sorted files into one single sorted file
902    Writer sortedFileWriter = new BufferedWriter(new FileWriter(sorted));
903    // first write the old header lines if existing
904    for (String h : headerLines) {
905      sortedFileWriter.write(h);
906      sortedFileWriter.write("\n");
907    }
908    mergeSortedFiles(sortFiles, sortedFileWriter, lineComparator);
909
910    LOG.debug(
911        "File(s) {} sorted successfully using {} parts to do sorting in {}s",
912        inputs.stream().map(File::getAbsolutePath).toArray(),
913        sortFiles.size(),
914        (System.currentTimeMillis() - start) / 1000);
915  }
916
917  /**
918   * Splits the supplied file into files of set line size and with a suffix.
919   *
920   * @param input To split up
921   * @param linesPerOutput Lines per split file
922   * @param extension The file extension to use - e.g. ".txt"
923   * @return The split files
924   */
925  public List<File> split(File input, int linesPerOutput, String extension) throws IOException {
926    LOG.debug("Splitting File[" + input.getAbsolutePath() + ']');
927    long timer = System.currentTimeMillis();
928    List<File> splitFiles = new LinkedList<>();
929    // Use ISO-8859-1 as a binary-safe encoding.
930    BufferedReader br =
931        new BufferedReader(
932            new InputStreamReader(new FileInputStream(input), StandardCharsets.ISO_8859_1));
933    String line = br.readLine();
934    int fileCount = 0;
935    File splitFile = getChunkFile(input, fileCount);
936    fileCount++;
937    splitFiles.add(splitFile);
938    OutputStreamWriter fw =
939        new OutputStreamWriter(new FileOutputStream(splitFile), StandardCharsets.ISO_8859_1);
940    try {
941      int lineCount = 0;
942      while (line != null) {
943        if (lineCount == linesPerOutput) {
944          fw.flush();
945          fw.close();
946          splitFile = getChunkFile(input, fileCount);
947          splitFiles.add(splitFile);
948          // is ok to reuse, as last one is closed, and this will always get closed - see finally
949          // below
950          fw = new OutputStreamWriter(new FileOutputStream(splitFile), StandardCharsets.ISO_8859_1);
951          fileCount++;
952          lineCount = 0;
953        }
954        fw.write(line);
955        fw.write("\n");
956        line = br.readLine();
957        lineCount++;
958      }
959      fw.flush();
960    } finally {
961      fw.close();
962    }
963    LOG.debug(
964        "File["
965            + input.getAbsolutePath()
966            + "] split successfully into["
967            + splitFiles.size()
968            + "] parts in secs["
969            + (1 + System.currentTimeMillis() - timer) / 1000
970            + "]");
971    return splitFiles;
972  }
973
974  /**
975   * Test whether we have a new enough version of GNU Sort that supports (primarily) the -k option with a start and end
976   * column.
977   *
978   * Mac OS only includes an old version of GNU sort, and will fail this test.
979   */
980  private boolean gnuSortAvailable() {
981    if (gnuSortAvailable != null) {
982      return gnuSortAvailable;
983    }
984
985    try {
986      String command = "sort -k1,1 -t',' --ignore-case /dev/null";
987      LOG.debug("Testing capability of 'sort' with command: {}", command);
988
989      Process process = new ProcessBuilder("/bin/sh", "-c", command).start();
990      int exitValue = process.waitFor();
991
992      if (exitValue == 0) {
993        LOG.debug("GNU sort is capable");
994        gnuSortAvailable = true;
995      } else {
996        LOG.warn(
997            "GNU sort does not exist or is too old, and will not be used.  Sorting large files will be slow.",
998            new InputStreamUtils().readEntireStream(process.getErrorStream()).replace('\n', ' '));
999        gnuSortAvailable = false;
1000      }
1001    } catch (Exception e) {
1002      LOG.warn(
1003          "GNU sort does not exist or is too old, and will not be used.  Sorting large files will be slow.",
1004          e);
1005      gnuSortAvailable = false;
1006    }
1007
1008    return gnuSortAvailable;
1009  }
1010
1011  /**
1012   * sort a text file via an external GNU sort command:
1013   * sorting tabs at 3rd column, numerical reverse order
1014   * sort -t$'\t' -k3 -o sorted.txt col2007.txt
1015   * <p/>
1016   * The GNU sort based sorting is extremely efficient and much, much faster than the current sortInJava method. It is
1017   * locale aware though and we only want the native C sorting locale. See
1018   * http://www.gnu.org/software/coreutils/faq/coreutils-faq.html#Sort-does-not-sort-in-normal-order_0021
1019   * <p/>
1020   * Example C sort order:
1021   * <p/>
1022   * <pre>
1023   * 1 oOdontoceti
1024   * 10 gGlobicephala melaena melaena Traill
1025   * 100 gGlobicephala melaena melaena Traill
1026   * 101 gGlobicephala melaena melaena Traill
1027   * 11 pPontoporia Gray
1028   * 12 pPontoporia blainvillei Gervais and d'Orbigny
1029   * 120 iInia d'Orbigny
1030   * 121 iInia geoffrensis Blainville
1031   * 2 sSusuidae
1032   * 20 cCetacea
1033   * Amphiptera
1034   * Amphiptera pacifica Giglioli
1035   * Anarnak Lacépède
1036   * Balaena mangidach Chamisso
1037   * amphiptera
1038   * amphiptera pacifica Giglioli
1039   * anarnak Lacépède
1040   * balaena mangidach Chamisso
1041   * </pre>
1042   */
1043  protected boolean sortInGnu(
1044      List<File> inputs,
1045      File sorted,
1046      String encoding,
1047      int ignoreHeaderLines,
1048      int column,
1049      String columnDelimiter,
1050      Character enclosedBy,
1051      String lineDelimiter,
1052      boolean ignoreCase)
1053      throws IOException {
1054    String command;
1055    // GNU sort is available for use when:
1056    // • line delimiter is \n
1057    // • no enclosed by/quote character is in use
1058    // • sorting is using the first column
1059    // • sort version is sufficient to include start and end column (-k 1,1).
1060    // Use the --debug option to sort if working on this code.
1061    if (lineDelimiter == null || !lineDelimiter.contains("\n")) {
1062      LOG.debug("Cannot use GNU sort on this file: line delimiter does not contain newline.");
1063      return false;
1064    } else if (columnDelimiter != null && column > 0) {
1065      LOG.debug("Cannot use GNU sort on this file: sort column is not the first.");
1066      return false;
1067    } else if (enclosedBy != null) {
1068      LOG.debug("Cannot use GNU sort on this file: enclosed by character set.");
1069      return false;
1070    } else if (!gnuSortAvailable()) {
1071      LOG.debug("Cannot use GNU sort on this file: command unavailable.");
1072      return false;
1073    }
1074
1075    // keep header rows
1076    boolean success = false;
1077    try {
1078      LinkedList<String> cmds = new LinkedList<>();
1079      cmds.add("/bin/sh");
1080      cmds.add("-c");
1081      cmds.add("");
1082      ProcessBuilder pb = new ProcessBuilder(cmds);
1083      Map<String, String> env = pb.environment();
1084
1085      // clear the environment, but keep specified temp working directory
1086      env.keySet().removeIf(key -> !(key.equals("TMPDIR")));
1087      if (System.getProperty("java.io.tmpdir") != null) {
1088        env.put("TMPDIR", System.getProperty("java.io.tmpdir"));
1089      }
1090      // make sure we use the C locale for sorting
1091      env.put("LC_ALL", "C");
1092
1093      String sortArgs =
1094          String.format(
1095              " %s -k%d,%d -t'%s'",
1096              ignoreCase ? "--ignore-case" : "", column + 1, column + 1, columnDelimiter);
1097
1098      String fileList = inputs.stream().map(File::getAbsolutePath).collect(Collectors.joining(" "));
1099      if (ignoreHeaderLines > 0) {
1100        // copy header lines
1101        command =
1102            "head -n "
1103                + ignoreHeaderLines
1104                + ' '
1105                + inputs.get(0).getAbsolutePath()
1106                + " > "
1107                + sorted.getAbsolutePath();
1108        LOG.debug("Issue external command: {}", command);
1109        cmds.removeLast();
1110        cmds.add(command);
1111        Process process = pb.start();
1112        int exitValue = process.waitFor();
1113        if (exitValue != 0) {
1114          LOG.warn("Error sorting file (copying header lines) with GNU head");
1115          return false;
1116        }
1117
1118        // do the sorting ignoring the header rows
1119        command =
1120            "tail -q -n +"
1121                + (ignoreHeaderLines + 1)
1122                + " "
1123                + fileList
1124                + " | "
1125                + "sort "
1126                + sortArgs
1127                + " >> "
1128                + sorted.getAbsolutePath();
1129      } else {
1130        // do sorting directly, we don't have header rows
1131        command = "sort " + sortArgs + " -o " + sorted.getAbsolutePath() + ' ' + fileList;
1132      }
1133
1134      LOG.debug("Issue external command: {}", command);
1135      cmds.removeLast();
1136      cmds.add(command);
1137      Process process = pb.start();
1138      // get the stdout and stderr from the command that was run
1139      InputStream err = process.getErrorStream();
1140      int exitValue = process.waitFor();
1141      if (exitValue == 0) {
1142        LOG.debug("Successfully sorted file with GNU sort");
1143        success = true;
1144      } else {
1145        LOG.warn("Error sorting file with GNU sort");
1146        InputStreamUtils isu = new InputStreamUtils();
1147        System.err.append(isu.readEntireStream(err));
1148      }
1149    } catch (Exception e) {
1150      LOG.warn("Caught Exception using GNU sort", e);
1151    }
1152    return success;
1153  }
1154
1155  /**
1156   * Sorts the lines and writes to file using the
1157   *
1158   * @param input File to base the name on
1159   * @param lineComparator To compare the lines for sorting
1160   * @param fileCount Used for the file name
1161   * @param linesToSort To actually sort
1162   * @return The written file
1163   */
1164  private File sortAndWrite(
1165      File input,
1166      String encoding,
1167      Comparator<String> lineComparator,
1168      int fileCount,
1169      List<String> linesToSort)
1170      throws IOException {
1171    long start = System.currentTimeMillis();
1172    linesToSort.sort(lineComparator);
1173    // When implementing a comparator, make it SUPER quick!!!
1174    LOG.debug(
1175        "Collections.sort took msec["
1176            + (System.currentTimeMillis() - start)
1177            + "] to sort records["
1178            + linesToSort.size()
1179            + ']');
1180    File sortFile = getChunkFile(input, fileCount);
1181    try (Writer fw = new OutputStreamWriter(new FileOutputStream(sortFile), encoding)) {
1182      for (String s : linesToSort) {
1183        fw.write(s);
1184        fw.write("\n");
1185      }
1186    }
1187    return sortFile;
1188  }
1189
1190  /**
1191   * Creates an empty file or updates the last updated timestamp on the same as the unix command of
1192   * the same name.
1193   *
1194   * <p>From Guava.
1195   *
1196   * @param file the file to create or update
1197   * @throws IOException if an I/O error occurs
1198   */
1199  public static void touch(File file) throws IOException {
1200    Objects.requireNonNull(file);
1201    if (!file.createNewFile() && !file.setLastModified(System.currentTimeMillis())) {
1202      throw new IOException("Unable to update modification time of " + file);
1203    }
1204  }
1205
1206  /**
1207   * Returns the <a href="http://en.wikipedia.org/wiki/Filename_extension">file extension</a> for
1208   * the given file name, or the empty string if the file has no extension. The result does not
1209   * include the '{@code .}'.
1210   *
1211   * <p><b>Note:</b> This method simply returns everything after the last '{@code .}' in the file's
1212   * name as determined by {@link File#getName}. It does not account for any filesystem-specific
1213   * behavior that the {@link File} API does not already account for. For example, on NTFS it will
1214   * report {@code "txt"} as the extension for the filename {@code "foo.exe:.txt"} even though NTFS
1215   * will drop the {@code ":.txt"} part of the name when the file is actually created on the
1216   * filesystem due to NTFS's <a href="https://goo.gl/vTpJi4">Alternate Data Streams</a>.
1217   *
1218   * <p>From Guava.
1219   */
1220  public static String getFileExtension(String fullName) {
1221    Objects.requireNonNull(fullName);
1222    String fileName = new File(fullName).getName();
1223    int dotIndex = fileName.lastIndexOf('.');
1224    return (dotIndex == -1) ? "" : fileName.substring(dotIndex + 1);
1225  }
1226
1227  /**
1228   * Creates any necessary but nonexistent parent directories of the specified file. Note that if
1229   * this operation fails it may have succeeded in creating some (but not all) of the necessary
1230   * parent directories.
1231   *
1232   * <p>From Guava.
1233   *
1234   * @throws IOException if an I/O error occurs, or if any necessary but nonexistent parent
1235   *     directories of the specified file could not be created.
1236   */
1237  public static void createParentDirs(File file) throws IOException {
1238    Objects.requireNonNull(file);
1239    File parent = file.getCanonicalFile().getParentFile();
1240    if (parent == null) {
1241      /*
1242       * The given directory is a filesystem root. All zero of its ancestors exist. This doesn't
1243       * mean that the root itself exists -- consider x:\ on a Windows machine without such a drive
1244       * -- or even that the caller can create it, but this method makes no such guarantees even for
1245       * non-root files.
1246       */
1247      return;
1248    }
1249    //noinspection ResultOfMethodCallIgnored
1250    parent.mkdirs();
1251    if (!parent.isDirectory()) {
1252      throw new IOException("Unable to create parent directories of " + file);
1253    }
1254  }
1255}