Source code

001/*
002 * Copyright 2021 Global Biodiversity Information Facility (GBIF)
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016package org.gbif.utils.file;
017
018import org.gbif.utils.collection.CompactHashSet;
019import org.gbif.utils.text.LineComparator;
020
021import java.io.BufferedInputStream;
022import java.io.BufferedReader;
023import java.io.BufferedWriter;
024import java.io.File;
025import java.io.FileInputStream;
026import java.io.FileNotFoundException;
027import java.io.FileOutputStream;
028import java.io.FileWriter;
029import java.io.IOException;
030import java.io.InputStream;
031import java.io.InputStreamReader;
032import java.io.OutputStream;
033import java.io.OutputStreamWriter;
034import java.io.UnsupportedEncodingException;
035import java.io.Writer;
036import java.net.URISyntaxException;
037import java.net.URL;
038import java.nio.ByteBuffer;
039import java.nio.charset.Charset;
040import java.nio.charset.StandardCharsets;
041import java.util.Comparator;
042import java.util.HashMap;
043import java.util.LinkedList;
044import java.util.List;
045import java.util.Map;
046import java.util.Objects;
047import java.util.Set;
048import java.util.regex.Pattern;
049
050import org.apache.commons.io.FilenameUtils;
051import org.apache.commons.io.LineIterator;
052import org.apache.commons.lang3.StringUtils;
053import org.slf4j.Logger;
054import org.slf4j.LoggerFactory;
055
056/**
057 * Collection of file utils.
058 * <br>
059 * This class has only been tested for use with a UTF-8 system encoding.
060 */
061public final class FileUtils {
062
/** Logger shared by all methods of this class. */
private static final Logger LOG = LoggerFactory.getLogger(FileUtils.class);

/** Name of the UTF-8 charset; the default encoding of the readers and line iterators in this class. */
public static final String UTF8 = StandardCharsets.UTF_8.name();
/** Matches a single tab character; used to split tab delimited rows into cells. */
public static final Pattern TAB_DELIMITED = Pattern.compile("\t");
/** Number of lines collected in memory before a chunk is sorted and written by the Java sort. */
private static int linesPerMemorySort = 100000;
/** Cached result of the GNU sort capability check; {@code null} until the check has run. */
private static Boolean gnuSortAvailable = null;
/** Monitor the sort method synchronizes on. */
private static final Object sortLock = new Object();

static {
  // This library has not been tested in a non-Unicode environment and may corrupt data there,
  // so complain loudly when the JVM default charset is plain US ASCII.
  final boolean asciiDefault = StandardCharsets.US_ASCII.equals(Charset.defaultCharset());
  if (asciiDefault) {
    System.err.println("The default character set is US ASCII.  It is strongly recommended to " +
      "run this software in a Unicode environment.");
  }
}
080
081  public static String classpath2Filepath(String path) {
082    return new File(ClassLoader.getSystemResource(path).getFile()).getAbsolutePath();
083  }
084
085  public static InputStream classpathStream(String path) throws IOException {
086    InputStream in = null;
087    // relative path. Use classpath instead
088    URL url = FileUtils.class.getClassLoader().getResource(path);
089    if (url != null) {
090      in = url.openStream();
091    }
092    return in;
093  }
094
095  public static Set<String> columnsToSet(InputStream source, int... column) throws IOException {
096    return columnsToSet(source, new CompactHashSet<String>(), column);
097  }
098
099  /**
100   * Reads a file and returns a unique set of multiple columns from lines which are no comments (starting with #) and
101   * trims whitespace.
102   *
103   * @param source the UTF-8 encoded text file with tab delimited columns
104   * @param resultSet the set implementation to be used. Will not be cleared before reading!
105   * @param column variable length argument of column indices to process
106   * @return set of column rows
107   */
108  public static Set<String> columnsToSet(InputStream source, Set<String> resultSet, int... column) throws IOException {
109    LineIterator lines = getLineIterator(source);
110    int maxCols = 0;
111    for (int c : column) {
112      if (c > maxCols) {
113        maxCols = c;
114      }
115    }
116    while (lines.hasNext()) {
117      String line = lines.nextLine().trim();
118      // ignore comments
119      if (!ignore(line)) {
120        String[] parts = TAB_DELIMITED.split(line);
121        if (maxCols <= parts.length) {
122          for (int c : column) {
123            String cell = parts[c].trim();
124            resultSet.add(cell);
125          }
126        }
127      }
128    }
129    return resultSet;
130  }
131
132  public static void copyStreams(InputStream in, OutputStream out) throws IOException {
133    // write the file to the file specified
134    int bytesRead;
135    byte[] buffer = new byte[8192];
136
137    while ((bytesRead = in.read(buffer, 0, 8192)) != -1) {
138      out.write(buffer, 0, bytesRead);
139    }
140
141    out.close();
142    in.close();
143  }
144
145  public static void copyStreamToFile(InputStream in, File out) throws IOException {
146    copyStreams(in, new FileOutputStream(out));
147  }
148
149  public static File createTempDir() throws IOException {
150    return createTempDir("gbif-futil", ".tmp");
151  }
152
153  /**
154   * @param prefix The prefix string to be used in generating the file's name; must be at least three characters long
155   * @param suffix The suffix string to be used in generating the file's name; may be null, in which case the suffix
156   *        ".tmp" will be used
157   */
158  public static File createTempDir(String prefix, String suffix) throws IOException {
159    File dir = File.createTempFile(prefix, suffix);
160    if (!dir.delete()) {
161      throw new IOException("Could not delete temp file: " + dir.getAbsolutePath());
162    }
163    if (!dir.mkdir()) {
164      throw new IOException("Could not create temp directory: " + dir.getAbsolutePath());
165    }
166    return dir;
167  }
168
169  /**
170   * Delete directory recursively, including all its files, sub-folders, and sub-folder's files.
171   *
172   * @param directory directory to delete recursively
173   */
174  public static void deleteDirectoryRecursively(File directory) {
175    File[] list = directory.listFiles();
176    for (File file : list) {
177      if (file.isDirectory()) {
178        deleteDirectoryRecursively(file);
179        file.delete();
180      } else {
181        file.delete();
182      }
183    }
184    directory.delete();
185  }
186
187  /**
188   * Escapes a filename so it is a valid filename on all systems, replacing /. .. \t\r\n.
189   *
190   * @param filename to be escaped
191   */
192  public static String escapeFilename(String filename) {
193    return filename.replaceAll("[\\s./&]", "_");
194  }
195
196  public static File getClasspathFile(String path) {
197    return new File(ClassLoader.getSystemResource(path).getFile());
198  }
199
200  public static InputStream getInputStream(File source) throws FileNotFoundException {
201    return new FileInputStream(source);
202  }
203
204  public static BufferedReader getInputStreamReader(InputStream input) throws FileNotFoundException {
205    return getInputStreamReader(input, UTF8);
206  }
207
208  public static BufferedReader getInputStreamReader(InputStream input, String encoding) throws FileNotFoundException {
209    BufferedReader reader = null;
210    try {
211      reader = new BufferedReader(new InputStreamReader(input, encoding));
212    } catch (UnsupportedEncodingException e) {
213      LOG.warn("Caught Exception", e);
214    }
215    return reader;
216  }
217
218  /**
219   * @param source the source input stream encoded in UTF-8
220   */
221  public static LineIterator getLineIterator(InputStream source) {
222    return getLineIterator(source, UTF8);
223  }
224
225  /**
226   * @param source the source input stream
227   * @param encoding the encoding used by the input stream
228   */
229  public static LineIterator getLineIterator(InputStream source, String encoding) {
230    try {
231      return new LineIterator(new BufferedReader(new InputStreamReader(source, encoding)));
232    } catch (UnsupportedEncodingException e) {
233      throw new IllegalArgumentException("Unsupported encoding" + encoding, e);
234    }
235  }
236
237  public static BufferedReader getUtf8Reader(File file) throws FileNotFoundException {
238    BufferedReader reader = null;
239    try {
240      reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), UTF8));
241    } catch (UnsupportedEncodingException e) {
242      LOG.warn("Caught Exception", e);
243    }
244    return reader;
245  }
246
247  /**
248   * Converts the byte size into human-readable format.
249   * Support both SI and byte format.
250   */
251  public static String humanReadableByteCount(long bytes, boolean si) {
252    int unit = si ? 1000 : 1024;
253    if (bytes < unit) {
254      return bytes + " B";
255    }
256    int exp = (int) (Math.log(bytes) / Math.log(unit));
257    String pre = (si ? "kMGTPE" : "KMGTPE").charAt(exp - 1) + (si ? "" : "i");
258    return String.format("%.1f %sB", bytes / Math.pow(unit, exp), pre);
259  }
260
261  public static boolean isCompressedFile(File source) {
262    String suffix = source.getName().substring(source.getName().lastIndexOf('.') + 1);
263    return suffix != null && suffix.length() > 0 && ("zip".equalsIgnoreCase(suffix) || "tgz".equalsIgnoreCase(suffix)
264      || "gz".equalsIgnoreCase(suffix));
265  }
266
267  /**
268   * Reads a complete file into a byte buffer.
269   */
270  public static ByteBuffer readByteBuffer(File file) throws IOException {
271    byte[] content = org.apache.commons.io.FileUtils.readFileToByteArray(file);
272    return ByteBuffer.wrap(content);
273  }
274
275  /**
276   * Reads the first bytes of a file into a byte buffer.
277   *
278   * @param bufferSize the number of bytes to read from the file
279   */
280  public static ByteBuffer readByteBuffer(File file, int bufferSize) throws IOException {
281    ByteBuffer bbuf = ByteBuffer.allocate(bufferSize);
282    BufferedInputStream f = new BufferedInputStream(new FileInputStream(file), bufferSize);
283
284    int b;
285    while ((b = f.read()) != -1) {
286      if (!bbuf.hasRemaining()) {
287        break;
288      }
289      bbuf.put((byte) b);
290    }
291    f.close();
292
293    return bbuf;
294  }
295
296  /**
297   * @param linesPerMemorySort are the number of lines that should be sorted in memory, determining the number of file
298   *        segments to be sorted when doing a Java file sort. Defaults to 100000, if you have
299   *        memory available a higher value increases performance.
300   */
301  public static void setLinesPerMemorySort(int linesPerMemorySort) {
302    FileUtils.linesPerMemorySort = linesPerMemorySort;
303  }
304
305  public static Writer startNewUtf8File(File file) throws IOException {
306    touch(file);
307    return new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file, false), UTF8));
308  }
309
310  public static Writer startNewUtf8XmlFile(File file) throws IOException {
311    Writer writer = startNewUtf8File(file);
312    writer.write("<?xml version='1.0' encoding='utf-8'?>\n");
313    return writer;
314  }
315
316  /**
317   * Takes a utf8 encoded input stream and reads in every line/row into a list.
318   *
319   * @return list of rows
320   */
321  public static LinkedList<String> streamToList(InputStream source) throws IOException {
322    return streamToList(source, FileUtils.UTF8);
323  }
324
325  /**
326   * Reads a file and returns a list of all lines which are no comments (starting with #) and trims whitespace.
327   *
328   * @param source the UTF-8 encoded text file to read
329   * @param resultList the list implementation to be used. Will not be cleared before reading!
330   * @return list of lines
331   */
332  public static List<String> streamToList(InputStream source, List<String> resultList) throws IOException {
333    LineIterator lines = getLineIterator(source);
334    while (lines.hasNext()) {
335      String line = lines.nextLine().trim();
336      // ignore comments
337      if (!ignore(line)) {
338        resultList.add(line);
339      }
340    }
341    return resultList;
342  }
343
344  public static LinkedList<String> streamToList(InputStream source, String encoding) throws IOException {
345    LinkedList<String> resultList = new LinkedList<>();
346    try {
347      LineIterator lines = new LineIterator(new BufferedReader(new InputStreamReader(source, encoding)));
348      while (lines.hasNext()) {
349        String line = lines.nextLine();
350        resultList.add(line);
351      }
352    } catch (UnsupportedEncodingException e) {
353      throw new IllegalArgumentException("Unsupported encoding " + encoding, e);
354    }
355    return resultList;
356  }
357
358  /**
359   * Reads a utf8 encoded inut stream, splits
360   */
361  public static Map<String, String> streamToMap(InputStream source) throws IOException {
362    return streamToMap(source, new HashMap<>());
363  }
364
365  public static Map<String, String> streamToMap(InputStream source, int key, int value, boolean trimToNull)
366    throws IOException {
367    return streamToMap(source, new HashMap<>(), key, value, trimToNull);
368  }
369
370  /**
371   * Read a hashmap from a tab delimited utf8 input stream using the row number as an integer value and the entire row
372   * as the value. Ignores commented rows starting with #.
373   *
374   * @param source tab delimited text file to read
375   */
376  public static Map<String, String> streamToMap(InputStream source, Map<String, String> result) throws IOException {
377    LineIterator lines = getLineIterator(source);
378    Integer row = 0;
379    while (lines.hasNext()) {
380      row++;
381      String line = lines.nextLine().trim();
382      // ignore comments
383      if (!ignore(line)) {
384        result.put(line, row.toString());
385      }
386    }
387    return result;
388  }
389
390  /**
391   * Read a hashmap from a tab delimited utf8 file, ignoring commented rows starting with #.
392   *
393   * @param source tab delimited input stream to read
394   * @param key column number to use as key
395   * @param value column number to use as value
396   * @param trimToNull if true trims map entries to null
397   */
398  public static Map<String, String> streamToMap(InputStream source, Map<String, String> result, int key, int value,
399    boolean trimToNull) throws IOException {
400    LineIterator lines = getLineIterator(source);
401    int maxCols = key > value ? key : value + 1;
402    while (lines.hasNext()) {
403      String line = lines.nextLine();
404      // ignore comments
405      if (!ignore(line)) {
406        String[] parts = TAB_DELIMITED.split(line);
407        if (maxCols <= parts.length) {
408          if (trimToNull) {
409            result.put(StringUtils.trimToNull(parts[key]), StringUtils.trimToNull(parts[value]));
410          } else {
411            result.put(parts[key], parts[value]);
412          }
413        }
414      }
415    }
416    return result;
417  }
418
419  public static Set<String> streamToSet(InputStream source) throws IOException {
420    return streamToSet(source, new CompactHashSet<>());
421  }
422
423  /**
424   * Reads a file and returns a unique set of all lines which are no comments (starting with #) and trims whitespace.
425   *
426   * @param source the UTF-8 encoded text file to read
427   * @param resultSet the set implementation to be used. Will not be cleared before reading!
428   * @return set of unique lines
429   */
430  public static Set<String> streamToSet(InputStream source, Set<String> resultSet) throws IOException {
431    LineIterator lines = getLineIterator(source);
432    while (lines.hasNext()) {
433      String line = lines.nextLine().trim();
434      // ignore comments
435      if (!ignore(line)) {
436        resultSet.add(line);
437      }
438    }
439    return resultSet;
440  }
441
442  public static String toFilePath(URL url) {
443    String protocol =
444      url.getProtocol() == null || "http".equalsIgnoreCase(url.getProtocol()) ? "" : "/__" + url.getProtocol() + "__";
445    String domain = url.getAuthority() == null ? "__domainless" : url.getAuthority();
446    return domain + protocol + url.getFile();
447  }
448
449  public static File url2file(URL url) {
450    File f = null;
451    try {
452      f = new File(url.toURI());
453    } catch (URISyntaxException e) {
454      f = new File(url.getPath());
455    }
456    return f;
457  }
458
459  /**
460   * For the given list, finds the index of the lowest value using the given comparator.
461   *
462   * @param values To compare
463   * @param comparator To use
464   * @return The index of the lowest value, or -1 if they are all null
465   */
466  static int lowestValueIndex(List<String> values, Comparator<String> comparator) {
467    int index = 0;
468    String lowestValue = null;
469    for (int i = 0; i < values.size(); i++) {
470      String value = values.get(i);
471      if (lowestValue == null) {
472        lowestValue = value;
473        index = i;
474      } else if (comparator.compare(lowestValue, value) > 0) {
475        lowestValue = value;
476        index = i;
477      }
478    }
479
480    return lowestValue == null ? -1 : index;
481  }
482
483  /**
484   * For the given file's path, returns a proposed new filename (including path) with the extension
485   * index and suffix. So a file of "/tmp/input.txt" -> "/tmp/input_part_10.txt".
486   *
487   * @param original File
488   * @param index E.g. 10
489   * @return The proposed name
490   */
491  private static File getChunkFile(File original, int index) {
492    return new File(original.getParentFile(),
493      FilenameUtils.getBaseName(original.getName()) + '_' + index + getFileExtension(original.getName()));
494  }
495
496  private static boolean ignore(String line) {
497    return StringUtils.trimToNull(line) == null || line.startsWith("#");
498  }
499
500  public int getLinesPerMemorySort() {
501    return linesPerMemorySort;
502  }
503
504  /**
505   * Merges a list of intermediary sort chunk files into a single sorted file. On completion, the intermediary sort
506   * chunk files are deleted.
507   *
508   * @param sortFiles sort chunk files to merge
509   * @param sortedFileWriter writer to merge to. Can already be open and contain data
510   * @param lineComparator To use when determining the order (reuse the one that was used to sort the individual
511   *        files)
512   */
513  public void mergedSortedFiles(List<File> sortFiles, OutputStreamWriter sortedFileWriter, Comparator<String> lineComparator)
514    throws IOException {
515    List<BufferedReader> partReaders = new LinkedList<>();
516    try {
517      List<String> partReaderLine = new LinkedList<>();
518      for (File f : sortFiles) {
519        // Use UTF-8 sort order.
520        partReaders.add(new BufferedReader(
521          new InputStreamReader(new FileInputStream(f), StandardCharsets.UTF_8)));
522      }
523      boolean moreData = false;
524      // load first line in
525      for (BufferedReader partReader : partReaders) {
526        String partLine = partReader.readLine();
527        if (partLine != null) {
528          moreData = true;
529        }
530        // we still add the "null" to keep the partReaders and partLineReader indexes in sync - ALWAYS
531        partReaderLine.add(partLine);
532      }
533      // keep going until all readers are exhausted
534      while (moreData) {
535        int index = lowestValueIndex(partReaderLine, lineComparator);
536        if (index >= 0) {
537          sortedFileWriter.write(partReaderLine.get(index));
538          sortedFileWriter.write("\n");
539          BufferedReader r = partReaders.get(index);
540          String partLine = r.readLine();
541          // TODO: Synchronization on local variable?
542          synchronized (partReaderLine) {
543            partReaderLine.add(index, partLine);
544            partReaderLine.remove(index + 1);
545          }
546        } else {
547          moreData = false;
548        }
549      }
550    } finally {
551      for (BufferedReader b : partReaders) {
552        try {
553          b.close();
554        } catch (RuntimeException e) {
555        }
556      }
557      // I assume it periodically flushes anyway, so only need to do once at end...
558      sortedFileWriter.flush();
559      sortedFileWriter.close();
560      // delete (intermediary) sort chunk files, only the sorted file remains
561      for (File f : sortFiles) {
562        f.delete();
563      }
564    }
565  }
566
567  /**
568   * Sorts the input file into the output file using the supplied delimited line parameters.
569   *
570   * This method is not reliable when the sort field may contain Unicode codepoints outside the Basic Multilingual Plane,
571   * i.e. above \uFFFF. In that case, the sort order differs from Java's String sort order.  This should not be a problem
572   * for most usage; the Supplementary Multilingual Planes contain ancient scripts, emojis, arrows and so on.
573   *
574   * @param input To sort
575   * @param sorted The sorted version of the input excluding ignored header lines (see ignoreHeaderLines)
576   * @param column the column that keeps the values to sort on
577   * @param columnDelimiter the delimiter that separates columns in a row
578   * @param enclosedBy optional column enclosing character, e.g. a double quote for CSVs
579   * @param newlineDelimiter the chars used for new lines, usually \n, \n\r or \r
580   * @param ignoreHeaderLines number of beginning lines to ignore, e.g. headers
581   */
582  public void sort(File input, File sorted, String encoding, int column, String columnDelimiter, Character enclosedBy,
583    String newlineDelimiter, int ignoreHeaderLines) throws IOException {
584    Comparator<String> lineComparator;
585    if (enclosedBy == null) {
586      lineComparator = new LineComparator(column, columnDelimiter);
587    } else {
588      lineComparator = new LineComparator(column, columnDelimiter, enclosedBy);
589    }
590    sort(input, sorted, encoding, column, columnDelimiter, enclosedBy, newlineDelimiter, ignoreHeaderLines,
591      lineComparator, false);
592  }
593
594  /**
595   * Sorts the input file into the output file using the supplied delimited line parameters.
596   *
597   * This method is not reliable when the sort field may contain Unicode codepoints outside the Basic Multilingual Plane,
598   * i.e. above \uFFFF. In that case, the sort order differs from Java's String sort order.  This should not be a problem
599   * for most usage; the Supplementary Multilingual Planes contain ancient scripts, emojis, arrows and so on.
600   *
601   * TODO: This method is globally synchronized, in case multiple sorts are attempted to the same file simultaneously.
602   * This could be improved to allow synchronizing against the destination file, rather than for all sorts.
603   *
604   * @param input To sort
605   * @param sorted The sorted version of the input excluding ignored header lines (see ignoreHeaderLines)
606   * @param column the column that keeps the values to sort on
607   * @param columnDelimiter the delimiter that separates columns in a row
608   * @param enclosedBy optional column enclosing character, e.g. a double quote for CSVs
609   * @param newlineDelimiter the chars used for new lines, usually \n, \r\n or \r
610   * @param ignoreHeaderLines number of beginning lines to ignore, e.g. headers
611   * @param lineComparator used to sort the output
612   * @param ignoreCase ignore case order, this parameter couldn't have any effect if the LineComparator is used
613   */
614  public void sort(File input, File sorted, String encoding, int column, String columnDelimiter, Character enclosedBy,
615    String newlineDelimiter, int ignoreHeaderLines, Comparator<String> lineComparator, boolean ignoreCase)
616    throws IOException {
617    LOG.debug("Sorting " + input.getAbsolutePath() + " as new file " + sorted.getAbsolutePath());
618    if (encoding == null) {
619      LOG.warn("No encoding specified, assume UTF-8");
620      encoding = FileUtils.UTF8;
621    }
622    synchronized (sortLock) {
623      if (sorted.exists()) {
624        // Delete a file, which will allow processes with it open to continue reading it.
625        // The GNU sort truncates and appends, which would mean a partial read otherwise.
626        LOG.warn("Deleting existed sorted file {}", sorted.getAbsoluteFile());
627        sorted.delete();
628      }
629      // if the id is in the first column, first try sorting via shell as its the fastest we can get
630      if (!sortInGnu(input, sorted, encoding, ignoreHeaderLines, column, columnDelimiter, newlineDelimiter, ignoreCase)) {
631        LOG.debug("No GNU sort available, using native Java sorting");
632        sortInJava(input, sorted, encoding, lineComparator, ignoreHeaderLines);
633      }
634    }
635  }
636
637  /**
638   * Sorts the input file into the output file using the supplied lineComparator.
639   *
640   * @param input To sort
641   * @param sorted The sorted version of the input excluding ignored header lines (see ignoreHeaderLines)
642   * @param lineComparator To use during comparison
643   * @param ignoreHeaderLines number of beginning lines to ignore, e.g. headers
644   */
645  public void sortInJava(File input, File sorted, String encoding, Comparator<String> lineComparator,
646    int ignoreHeaderLines) throws IOException {
647    LOG.debug("Sorting File[" + input.getAbsolutePath() + ']');
648    long start = System.currentTimeMillis();
649    List<File> sortFiles = new LinkedList<>();
650    BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(input), encoding));
651    List<String> headerLines = new LinkedList<>();
652    try {
653      String line = br.readLine();
654      int fileCount = 0;
655
656      List<String> linesToSort = new LinkedList<>();
657      while (line != null) {
658        if (ignoreHeaderLines > 0) {
659          headerLines.add(line);
660          ignoreHeaderLines--;
661        } else {
662          linesToSort.add(line);
663
664          // if buffer is full, then sort and write to file
665          if (linesToSort.size() == linesPerMemorySort) {
666            sortFiles.add(sortAndWrite(input, encoding, lineComparator, fileCount, linesToSort));
667            linesToSort = new LinkedList<>();
668            fileCount++;
669          }
670        }
671        line = br.readLine();
672      }
673      // catch the last lot
674      if (!linesToSort.isEmpty()) {
675        sortFiles.add(sortAndWrite(input, encoding, lineComparator, fileCount, linesToSort));
676      }
677    } finally {
678      br.close();
679    }
680    LOG.debug(
681      sortFiles.size() + " sorted file chunks created in " + (System.currentTimeMillis() - start) / 1000 + " secs");
682
683    // now merge the sorted files into one single sorted file
684    FileWriter sortedFileWriter = new FileWriter(sorted);
685    // first write the old header lines if existing
686    for (String h : headerLines) {
687      sortedFileWriter.write(h);
688      sortedFileWriter.write("\n");
689    }
690    mergedSortedFiles(sortFiles, sortedFileWriter, lineComparator);
691
692    LOG.debug(
693      "File " + input.getAbsolutePath() + " sorted successfully using " + sortFiles.size() + " parts to do sorting in "
694        + (System.currentTimeMillis() - start) / 1000 + " secs");
695  }
696
697
698  /**
699   * Splits the supplied file into files of set line size and with a suffix.
700   *
701   * @param input To split up
702   * @param linesPerOutput Lines per split file
703   * @param extension The file extension to use - e.g. ".txt"
704   * @return The split files
705   */
706  public List<File> split(File input, int linesPerOutput, String extension) throws IOException {
707    LOG.debug("Splitting File[" + input.getAbsolutePath() + ']');
708    long timer = System.currentTimeMillis();
709    List<File> splitFiles = new LinkedList<>();
710    // Use ISO-8859-1 as a binary-safe encoding.
711    BufferedReader br = new BufferedReader(
712        new InputStreamReader(new FileInputStream(input), StandardCharsets.ISO_8859_1));
713    String line = br.readLine();
714    int fileCount = 0;
715    File splitFile = getChunkFile(input, fileCount);
716    fileCount++;
717    splitFiles.add(splitFile);
718    OutputStreamWriter fw =
719      new OutputStreamWriter(new FileOutputStream(splitFile), StandardCharsets.ISO_8859_1);
720    try {
721      int lineCount = 0;
722      while (line != null) {
723        if (lineCount == linesPerOutput) {
724          fw.flush();
725          fw.close();
726          splitFile = getChunkFile(input, fileCount);
727          splitFiles.add(splitFile);
728          // is ok to reuse, as last one is closed, and this will always get closed - see finally below
729          fw = new OutputStreamWriter(new FileOutputStream(splitFile), StandardCharsets.ISO_8859_1);
730          fileCount++;
731          lineCount = 0;
732        }
733        fw.write(line);
734        fw.write("\n");
735        line = br.readLine();
736        lineCount++;
737      }
738      fw.flush();
739    } finally {
740      fw.close();
741    }
742    LOG.debug("File[" + input.getAbsolutePath() + "] split successfully into[" + splitFiles.size() + "] parts in secs["
743      + (1 + System.currentTimeMillis() - timer) / 1000 + "]");
744    return splitFiles;
745  }
746
747  /**
748   * Test whether we have a new enough version of GNU Sort that supports (primarily) the -k option with a start and end
749   * column.
750   *
751   * Mac OS only includes an old version of GNU sort, and will fail this test.
752   */
753  private boolean gnuSortAvailable() {
754    if (gnuSortAvailable != null) {
755      return gnuSortAvailable;
756    }
757
758    try {
759      String command = "sort -k1,1 -t',' --ignore-case /dev/null";
760      LOG.debug("Testing capability of GNU sort with command: {}", command);
761
762      Process process = new ProcessBuilder("/bin/sh", "-c", command).start();
763      int exitValue = process.waitFor();
764
765      if (exitValue == 0) {
766        LOG.debug("GNU sort is capable");
767        gnuSortAvailable = true;
768      } else {
769        LOG.warn("GNU sort does not exist or is too old, and will not be used.  Sorting large files will be slow.",
770            new InputStreamUtils().readEntireStream(process.getErrorStream()).replace('\n', ' '));
771        gnuSortAvailable = false;
772      }
773    } catch (Exception e) {
774      LOG.warn("GNU sort does not exist or is too old, and will not be used.  Sorting large files will be slow.", e);
775      gnuSortAvailable = false;
776    }
777
778    return gnuSortAvailable;
779  }
780
781  /**
782   * sort a text file via an external GNU sort command:
783   * sorting tabs at 3rd column, numerical reverse order
784   * sort -t$'\t' -k3 -o sorted.txt col2007.txt
785   * <p/>
786   * The GNU sort based sorting is extremely efficient and much, much faster than the current sortInJava method. It is
787   * locale aware though and we only want the native C sorting locale. See
788   * http://www.gnu.org/software/coreutils/faq/coreutils-faq.html#Sort-does-not-sort-in-normal-order_0021
789   * <p/>
790   * Example C sort order:
791   * <p/>
792   * <pre>
793   * 1 oOdontoceti
794   * 10 gGlobicephala melaena melaena Traill
795   * 100 gGlobicephala melaena melaena Traill
796   * 101 gGlobicephala melaena melaena Traill
797   * 11 pPontoporia Gray
798   * 12 pPontoporia blainvillei Gervais and d'Orbigny
799   * 120 iInia d'Orbigny
800   * 121 iInia geoffrensis Blainville
801   * 2 sSusuidae
802   * 20 cCetacea
803   * Amphiptera
804   * Amphiptera pacifica Giglioli
805   * Anarnak Lacépède
806   * Balaena mangidach Chamisso
807   * amphiptera
808   * amphiptera pacifica Giglioli
809   * anarnak Lacépède
810   * balaena mangidach Chamisso
811   * </pre>
812   */
813  protected boolean sortInGnu(File input, File sorted, String encoding, int ignoreHeaderLines, int column,
814    String columnDelimiter, String lineDelimiter, boolean ignoreCase) throws IOException {
815    String command;
816    // GNU sort is checked for use when:
817    // • line delimiter is \n
818    // • column delimiter is set and we're not using the first column
819    // • sort version is sufficient to include start and end column (-k 1,1).
820    // Use the --debug option to sort if working on this code.
821    if (lineDelimiter == null || !lineDelimiter.contains("\n") || (columnDelimiter != null && column > 0) ||
822        !gnuSortAvailable()) {
823      LOG.debug("Cannot use GNU sort on this file");
824      return false;
825    }
826
827    // keep header rows
828    boolean success = false;
829    try {
830      LinkedList<String> cmds = new LinkedList<>();
831      cmds.add("/bin/sh");
832      cmds.add("-c");
833      cmds.add("");
834      ProcessBuilder pb = new ProcessBuilder(cmds);
835      Map<String, String> env = pb.environment();
836      
837      //clear the environment, but keep specified temp working directory 
838      env.keySet().removeIf(key -> !(key.equals("TMPDIR")));
839      if (System.getProperty("java.io.tmpdir") != null) {
840        env.put("TMPDIR", System.getProperty("java.io.tmpdir"));
841      }
842      // make sure we use the C locale for sorting
843      env.put("LC_ALL", "C");
844
845      String sortArgs = String.format(" %s -k%d,%d -t'%s'",
846        ignoreCase ? "--ignore-case" : "", column+1, column+1, columnDelimiter);
847
848      if (ignoreHeaderLines > 0) {
849        // copy header lines
850        command = "head -n " + ignoreHeaderLines + ' ' + input.getAbsolutePath() + " > " + sorted.getAbsolutePath();
851        LOG.debug("Issue external command: {}", command);
852        cmds.removeLast();
853        cmds.add(command);
854        Process process = pb.start();
855        int exitValue = process.waitFor();
856        if (exitValue != 0) {
857          LOG.warn("Error sorting file (copying header lines) with GNU head");
858          return false;
859        }
860
861        // do the sorting ignoring the header rows
862        command = "sed " + ignoreHeaderLines + "d " + input.getAbsolutePath() + " | "
863            + "sort " + sortArgs
864            + " >> " + sorted.getAbsolutePath();
865      } else {
866        // do sorting directly, we don't have header rows
867        command = "sort " + sortArgs + " -o " + sorted.getAbsolutePath() + ' ' + input.getAbsolutePath();
868      }
869
870      LOG.debug("Issue external command: {}", command);
871      cmds.removeLast();
872      cmds.add(command);
873      Process process = pb.start();
874      // get the stdout and stderr from the command that was run
875      InputStream err = process.getErrorStream();
876      int exitValue = process.waitFor();
877      if (exitValue == 0) {
878        LOG.debug("Successfully sorted file with GNU sort");
879        success = true;
880      } else {
881        LOG.warn("Error sorting file with GNU sort");
882        InputStreamUtils isu = new InputStreamUtils();
883        System.err.append(isu.readEntireStream(err));
884      }
885    } catch (Exception e) {
886      LOG.warn("Caught Exception using GNU sort", e);
887    }
888    return success;
889  }
890
891  /**
892   * Sorts the lines and writes to file using the
893   *
894   * @param input File to base the name on
895   * @param lineComparator To compare the lines for sorting
896   * @param fileCount Used for the file name
897   * @param linesToSort To actually sort
898   * @return The written file
899   */
900  private File sortAndWrite(File input, String encoding, Comparator<String> lineComparator, int fileCount,
901    List<String> linesToSort) throws IOException {
902    long start = System.currentTimeMillis();
903    linesToSort.sort(lineComparator);
904    // When implementing a comparator, make it SUPER quick!!!
905    LOG.debug(
906      "Collections.sort took msec[" + (System.currentTimeMillis() - start) + "] to sort records[" + linesToSort.size()
907        + ']');
908    File sortFile = getChunkFile(input, fileCount);
909    try (Writer fw = new OutputStreamWriter(new FileOutputStream(sortFile), encoding)) {
910      for (String s : linesToSort) {
911        fw.write(s);
912        fw.write("\n");
913      }
914    }
915    return sortFile;
916  }
917
918  /**
919   * Creates an empty file or updates the last updated timestamp on the same as the unix command of
920   * the same name.
921   *
922   * <p>From Guava.
923   *
924   * @param file the file to create or update
925   * @throws IOException if an I/O error occurs
926   */
927  public static void touch(File file) throws IOException {
928    Objects.requireNonNull(file);
929    if (!file.createNewFile() && !file.setLastModified(System.currentTimeMillis())) {
930      throw new IOException("Unable to update modification time of " + file);
931    }
932  }
933
934  /**
935   * Returns the <a href="http://en.wikipedia.org/wiki/Filename_extension">file extension</a> for
936   * the given file name, or the empty string if the file has no extension. The result does not
937   * include the '{@code .}'.
938   *
939   * <p><b>Note:</b> This method simply returns everything after the last '{@code .}' in the file's
940   * name as determined by {@link File#getName}. It does not account for any filesystem-specific
941   * behavior that the {@link File} API does not already account for. For example, on NTFS it will
942   * report {@code "txt"} as the extension for the filename {@code "foo.exe:.txt"} even though NTFS
943   * will drop the {@code ":.txt"} part of the name when the file is actually created on the
944   * filesystem due to NTFS's <a href="https://goo.gl/vTpJi4">Alternate Data Streams</a>.
945   *
946   * <p>From Guava.
947   */
948  public static String getFileExtension(String fullName) {
949    Objects.requireNonNull(fullName);
950    String fileName = new File(fullName).getName();
951    int dotIndex = fileName.lastIndexOf('.');
952    return (dotIndex == -1) ? "" : fileName.substring(dotIndex + 1);
953  }
954
955  /**
956   * Creates any necessary but nonexistent parent directories of the specified file. Note that if
957   * this operation fails it may have succeeded in creating some (but not all) of the necessary
958   * parent directories.
959   *
960   * <p>From Guava.
961   *
962   * @throws IOException if an I/O error occurs, or if any necessary but nonexistent parent
963   *     directories of the specified file could not be created.
964   */
965  public static void createParentDirs(File file) throws IOException {
966    Objects.requireNonNull(file);
967    File parent = file.getCanonicalFile().getParentFile();
968    if (parent == null) {
969      /*
970       * The given directory is a filesystem root. All zero of its ancestors exist. This doesn't
971       * mean that the root itself exists -- consider x:\ on a Windows machine without such a drive
972       * -- or even that the caller can create it, but this method makes no such guarantees even for
973       * non-root files.
974       */
975      return;
976    }
977    //noinspection ResultOfMethodCallIgnored
978    parent.mkdirs();
979    if (!parent.isDirectory()) {
980      throw new IOException("Unable to create parent directories of " + file);
981    }
982  }
983}