001package org.gbif.utils.file;
002
003import org.gbif.utils.collection.CompactHashSet;
004import org.gbif.utils.text.LineComparator;
005
006import java.io.BufferedInputStream;
007import java.io.BufferedReader;
008import java.io.BufferedWriter;
009import java.io.File;
010import java.io.FileInputStream;
011import java.io.FileNotFoundException;
012import java.io.FileOutputStream;
013import java.io.FileReader;
014import java.io.FileWriter;
015import java.io.IOException;
016import java.io.InputStream;
017import java.io.InputStreamReader;
018import java.io.OutputStream;
019import java.io.OutputStreamWriter;
020import java.io.UnsupportedEncodingException;
021import java.io.Writer;
022import java.net.URISyntaxException;
023import java.net.URL;
024import java.nio.ByteBuffer;
025import java.util.Collections;
026import java.util.Comparator;
027import java.util.HashMap;
028import java.util.LinkedList;
029import java.util.List;
030import java.util.Map;
031import java.util.Set;
032import java.util.regex.Pattern;
033
034import com.google.common.io.Files;
035import org.apache.commons.io.FilenameUtils;
036import org.apache.commons.io.LineIterator;
037import org.apache.commons.lang3.StringUtils;
038import org.slf4j.Logger;
039import org.slf4j.LoggerFactory;
040
041/**
042 * Collection of file utils.
043 */
044public class FileUtils {
045
046  public static final String UTF8 = "UTF8";
047
048  public static final Pattern TAB_DELIMITED = Pattern.compile("\t");
049
050
051  private static int linesPerMemorySort = 100000;
052  private static final Logger LOG = LoggerFactory.getLogger(FileUtils.class);
053
054
055  public static String classpath2Filepath(String path) {
056    return new File(ClassLoader.getSystemResource(path).getFile()).getAbsolutePath();
057  }
058
059  public static InputStream classpathStream(String path) throws IOException {
060    InputStream in = null;
061    // relative path. Use classpath instead
062    URL url = FileUtils.class.getClassLoader().getResource(path);
063    if (url != null) {
064      in = url.openStream();
065    }
066    return in;
067  }
068
069  public static Set<String> columnsToSet(InputStream source, int... column) throws IOException {
070    return columnsToSet(source, new CompactHashSet<String>(), column);
071  }
072
073  /**
074   * Reads a file and returns a unique set of multiple columns from lines which are no comments (starting with #) and
075   * trims whitespace.
076   * 
077   * @param source the UTF-8 encoded text file with tab delimited columns
078   * @param resultSet the set implementation to be used. Will not be cleared before reading!
079   * @param column variable length argument of column indices to process
080   * @return set of column rows
081   */
082  public static Set<String> columnsToSet(InputStream source, Set<String> resultSet, int... column) throws IOException {
083    LineIterator lines = getLineIterator(source);
084    int maxCols = 0;
085    for (int c : column) {
086      if (c > maxCols) {
087        maxCols = c;
088      }
089    }
090    while (lines.hasNext()) {
091      String line = lines.nextLine().trim();
092      // ignore comments
093      if (!ignore(line)) {
094        String[] parts = TAB_DELIMITED.split(line);
095        if (maxCols <= parts.length) {
096          for (int c : column) {
097            String cell = parts[c].trim();
098            resultSet.add(cell);
099          }
100        }
101      }
102    }
103    return resultSet;
104  }
105
106  public static void copyStreams(InputStream in, OutputStream out) throws IOException {
107    // write the file to the file specified
108    int bytesRead;
109    byte[] buffer = new byte[8192];
110
111    while ((bytesRead = in.read(buffer, 0, 8192)) != -1) {
112      out.write(buffer, 0, bytesRead);
113    }
114
115    out.close();
116    in.close();
117  }
118
119  public static void copyStreamToFile(InputStream in, File out) throws IOException {
120    copyStreams(in, new FileOutputStream(out));
121  }
122
123  public static File createTempDir() throws IOException {
124    return createTempDir("gbif-futil", ".tmp");
125  }
126
127  /**
128   * @param prefix The prefix string to be used in generating the file's name; must be at least three characters long
129   * @param suffix The suffix string to be used in generating the file's name; may be null, in which case the suffix
130   *        ".tmp" will be used
131   */
132  public static File createTempDir(String prefix, String suffix) throws IOException {
133    File dir = File.createTempFile(prefix, suffix);
134    if (!dir.delete()) {
135      throw new IOException("Could not delete temp file: " + dir.getAbsolutePath());
136    }
137    if (!dir.mkdir()) {
138      throw new IOException("Could not create temp directory: " + dir.getAbsolutePath());
139    }
140    return dir;
141  }
142
143  /**
144   * Delete directory recursively, including all its files, sub-folders, and sub-folder's files.
145   * 
146   * @param directory directory to delete recursively
147   */
148  public static void deleteDirectoryRecursively(File directory) {
149    File[] list = directory.listFiles();
150    for (File file : list) {
151      if (file.isDirectory()) {
152        deleteDirectoryRecursively(file);
153        file.delete();
154      } else {
155        file.delete();
156      }
157    }
158    directory.delete();
159  }
160
161  /**
162   * Escapes a filename so it is a valid filename on all systems, replacing /. .. \t\r\n.
163   * 
164   * @param filename to be escaped
165   */
166  public static String escapeFilename(String filename) {
167    return filename.replaceAll("[\\s./&]", "_");
168  }
169
170  public static File getClasspathFile(String path) {
171    return new File(ClassLoader.getSystemResource(path).getFile());
172  }
173
174  public static InputStream getInputStream(File source) throws FileNotFoundException {
175    return new FileInputStream(source);
176  }
177
178  public static BufferedReader getInputStreamReader(InputStream input) throws FileNotFoundException {
179    return getInputStreamReader(input, UTF8);
180  }
181
182  public static BufferedReader getInputStreamReader(InputStream input, String encoding) throws FileNotFoundException {
183    BufferedReader reader = null;
184    try {
185      reader = new BufferedReader(new InputStreamReader(input, encoding));
186    } catch (UnsupportedEncodingException e) {
187      LOG.warn("Caught Exception", e);
188    }
189    return reader;
190  }
191
192  /**
193   * @param source the source input stream encoded in UTF8
194   */
195  public static LineIterator getLineIterator(InputStream source) {
196    return getLineIterator(source, UTF8);
197  }
198
199  /**
200   * @param source the source input stream
201   * @param encoding the encoding used by the input stream
202   */
203  public static LineIterator getLineIterator(InputStream source, String encoding) {
204    try {
205      return new LineIterator(new BufferedReader(new InputStreamReader(source, encoding)));
206    } catch (UnsupportedEncodingException e) {
207      throw new IllegalArgumentException("Unsupported encoding" + encoding, e);
208    }
209  }
210
211  public static BufferedReader getUtf8Reader(File file) throws FileNotFoundException {
212    BufferedReader reader = null;
213    try {
214      reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), UTF8));
215    } catch (UnsupportedEncodingException e) {
216      LOG.warn("Caught Exception", e);
217    }
218    return reader;
219  }
220
221  /**
222   * Converts the byte size into human-readable format.
223   * Support both SI and byte format.
224   */
225  public static String humanReadableByteCount(long bytes, boolean si) {
226    int unit = si ? 1000 : 1024;
227    if (bytes < unit) {
228      return bytes + " B";
229    }
230    int exp = (int) (Math.log(bytes) / Math.log(unit));
231    String pre = (si ? "kMGTPE" : "KMGTPE").charAt(exp - 1) + (si ? "" : "i");
232    return String.format("%.1f %sB", bytes / Math.pow(unit, exp), pre);
233  }
234
235  public static boolean isCompressedFile(File source) {
236    String suffix = source.getName().substring(source.getName().lastIndexOf('.') + 1);
237    return suffix != null && suffix.length() > 0 && ("zip".equalsIgnoreCase(suffix) || "tgz".equalsIgnoreCase(suffix)
238      || "gz".equalsIgnoreCase(suffix));
239  }
240
241  /**
242   * Reads a complete file into a byte buffer.
243   */
244  public static ByteBuffer readByteBuffer(File file) throws IOException {
245    byte[] content = org.apache.commons.io.FileUtils.readFileToByteArray(file);
246    return ByteBuffer.wrap(content);
247  }
248
249  /**
250   * Reads the first bytes of a file into a byte buffer.
251   * 
252   * @param bufferSize the number of bytes to read from the file
253   */
254  public static ByteBuffer readByteBuffer(File file, int bufferSize) throws IOException {
255    ByteBuffer bbuf = ByteBuffer.allocate(bufferSize);
256    BufferedInputStream f = new BufferedInputStream(new FileInputStream(file), bufferSize);
257
258    int b;
259    while ((b = f.read()) != -1) {
260      if (!bbuf.hasRemaining()) {
261        break;
262      }
263      bbuf.put((byte) b);
264    }
265    f.close();
266
267    return bbuf;
268  }
269
270  /**
271   * @param linesPerMemorySort are the number of lines that should be sorted in memory, determining the number of file
272   *        segments to be sorted when doing a java file sort. Defaults to 100000, if you have
273   *        memory available a higher value increases performance.
274   */
275  public static void setLinesPerMemorySort(int linesPerMemorySort) {
276    FileUtils.linesPerMemorySort = linesPerMemorySort;
277  }
278
279  public static Writer startNewUtf8File(File file) throws IOException {
280    Files.touch(file);
281    return new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file, false), UTF8));
282  }
283
284  public static Writer startNewUtf8XmlFile(File file) throws IOException {
285    Writer writer = startNewUtf8File(file);
286    writer.write("<?xml version='1.0' encoding='utf-8'?>\n");
287    return writer;
288  }
289
290  /**
291   * Takes a utf8 encoded input stream and reads in every line/row into a list.
292   * 
293   * @return list of rows
294   */
295  public static LinkedList<String> streamToList(InputStream source) throws IOException {
296    return streamToList(source, "UTF-8");
297  }
298
299  /**
300   * Reads a file and returns a list of all lines which are no comments (starting with #) and trims whitespace.
301   * 
302   * @param source the UTF-8 encoded text file to read
303   * @param resultList the list implementation to be used. Will not be cleared before reading!
304   * @return list of lines
305   */
306  public static List<String> streamToList(InputStream source, List<String> resultList) throws IOException {
307    LineIterator lines = getLineIterator(source);
308    while (lines.hasNext()) {
309      String line = lines.nextLine().trim();
310      // ignore comments
311      if (!ignore(line)) {
312        resultList.add(line);
313      }
314    }
315    return resultList;
316  }
317
318  public static LinkedList<String> streamToList(InputStream source, String encoding) throws IOException {
319    LinkedList<String> resultList = new LinkedList<String>();
320    try {
321      LineIterator lines = new LineIterator(new BufferedReader(new InputStreamReader(source, encoding)));
322      while (lines.hasNext()) {
323        String line = lines.nextLine();
324        resultList.add(line);
325      }
326    } catch (UnsupportedEncodingException e) {
327      throw new IllegalArgumentException("Unsupported encoding " + encoding, e);
328    }
329    return resultList;
330  }
331
332  /**
333   * Reads a utf8 encoded inut stream, splits
334   */
335  public static Map<String, String> streamToMap(InputStream source) throws IOException {
336    return streamToMap(source, new HashMap<String, String>());
337  }
338
339  public static Map<String, String> streamToMap(InputStream source, int key, int value, boolean trimToNull)
340    throws IOException {
341    return streamToMap(source, new HashMap<String, String>(), key, value, trimToNull);
342  }
343
344  /**
345   * Read a hashmap from a tab delimited utf8 input stream using the row number as an integer value and the entire row
346   * as the value. Ignores commented rows starting with #.
347   * 
348   * @param source tab delimited text file to read
349   */
350  public static Map<String, String> streamToMap(InputStream source, Map<String, String> result) throws IOException {
351    LineIterator lines = getLineIterator(source);
352    Integer row = 0;
353    while (lines.hasNext()) {
354      row++;
355      String line = lines.nextLine().trim();
356      // ignore comments
357      if (!ignore(line)) {
358        result.put(line, row.toString());
359      }
360    }
361    return result;
362  }
363
364  /**
365   * Read a hashmap from a tab delimited utf8 file, ignoring commented rows starting with #.
366   * 
367   * @param source tab delimited input stream to read
368   * @param key column number to use as key
369   * @param value column number to use as value
370   * @param trimToNull if true trims map entries to null
371   */
372  public static Map<String, String> streamToMap(InputStream source, Map<String, String> result, int key, int value,
373    boolean trimToNull) throws IOException {
374    LineIterator lines = getLineIterator(source);
375    int maxCols = key > value ? key : value + 1;
376    while (lines.hasNext()) {
377      String line = lines.nextLine();
378      // ignore comments
379      if (!ignore(line)) {
380        String[] parts = TAB_DELIMITED.split(line);
381        if (maxCols <= parts.length) {
382          if (trimToNull) {
383            result.put(StringUtils.trimToNull(parts[key]), StringUtils.trimToNull(parts[value]));
384          } else {
385            result.put(parts[key], parts[value]);
386          }
387        }
388      }
389    }
390    return result;
391  }
392
393  public static Set<String> streamToSet(InputStream source) throws IOException {
394    return streamToSet(source, new CompactHashSet<String>());
395  }
396
397  /**
398   * Reads a file and returns a unique set of all lines which are no comments (starting with #) and trims whitespace.
399   * 
400   * @param source the UTF-8 encoded text file to read
401   * @param resultSet the set implementation to be used. Will not be cleared before reading!
402   * @return set of unique lines
403   */
404  public static Set<String> streamToSet(InputStream source, Set<String> resultSet) throws IOException {
405    LineIterator lines = getLineIterator(source);
406    while (lines.hasNext()) {
407      String line = lines.nextLine().trim();
408      // ignore comments
409      if (!ignore(line)) {
410        resultSet.add(line);
411      }
412    }
413    return resultSet;
414  }
415
416  public static String toFilePath(URL url) {
417    String protocol =
418      url.getProtocol() == null || "http".equalsIgnoreCase(url.getProtocol()) ? "" : "/__" + url.getProtocol() + "__";
419    String domain = url.getAuthority() == null ? "__domainless" : url.getAuthority();
420    return domain + protocol + url.getFile();
421  }
422
423  public static File url2file(URL url) {
424    File f = null;
425    try {
426      f = new File(url.toURI());
427    } catch (URISyntaxException e) {
428      f = new File(url.getPath());
429    }
430    return f;
431  }
432
433  /**
434   * For the given list, finds the index of the lowest value using the given comparator.
435   * 
436   * @param values To compare
437   * @param comparator To use
438   * @return The index of the lowest value, or -1 if they are all null
439   */
440  static int lowestValueIndex(List<String> values, Comparator<String> comparator) {
441    int index = 0;
442    String lowestValue = null;
443    for (int i = 0; i < values.size(); i++) {
444      String value = values.get(i);
445      if (lowestValue == null) {
446        lowestValue = value;
447        index = i;
448      } else if (comparator.compare(lowestValue, value) > 0) {
449        lowestValue = value;
450        index = i;
451      }
452    }
453
454    return lowestValue == null ? -1 : index;
455  }
456
457  /**
458   * For the given file's path, returns a proposed new filename (including path) with the extension
459   * index and suffix. So a file of "/tmp/input.txt" -> "/tmp/input_part_10.txt".
460   * 
461   * @param original File
462   * @param index E.g. 10
463   * @return The proposed name
464   */
465  private static File getChunkFile(File original, int index) {
466    return new File(original.getParentFile(),
467      FilenameUtils.getBaseName(original.getName()) + '_' + index + Files.getFileExtension(original.getName()));
468  }
469
470  private static boolean ignore(String line) {
471    return StringUtils.trimToNull(line) == null || line.startsWith("#");
472  }
473
  /**
   * @return the number of lines sorted in memory per chunk during a java file sort
   */
  // NOTE(review): instance getter exposing a static field, while the matching setter is static
  public int getLinesPerMemorySort() {
    return linesPerMemorySort;
  }
477
478  /**
479   * Merges a list of intermediary sort chunk files into a single sorted file. On completion, the intermediary sort
480   * chunk files are deleted.
481   * 
482   * @param sortFiles sort chunk files to merge
483   * @param sortedFileWriter writer to merge to. Can already be open and contain data
484   * @param lineComparator To use when determining the order (reuse the one that was used to sort the individual
485   *        files)
486   */
487  public void mergedSortedFiles(List<File> sortFiles, FileWriter sortedFileWriter, Comparator<String> lineComparator)
488    throws IOException {
489    List<BufferedReader> partReaders = new LinkedList<BufferedReader>();
490    try {
491      List<String> partReaderLine = new LinkedList<String>();
492      for (File f : sortFiles) {
493        partReaders.add(new BufferedReader(new FileReader(f)));
494      }
495      boolean moreData = false;
496      // load first line in
497      for (BufferedReader partReader : partReaders) {
498        String partLine = partReader.readLine();
499        if (partLine != null) {
500          moreData = true;
501        }
502        // we still add the "null" to keep the partReaders and partLineReader indexes in sync - ALWAYS
503        partReaderLine.add(partLine);
504      }
505      // keep going until all readers are exhausted
506      while (moreData) {
507        int index = lowestValueIndex(partReaderLine, lineComparator);
508        if (index >= 0) {
509          sortedFileWriter.write(partReaderLine.get(index));
510          sortedFileWriter.write("\n");
511          BufferedReader r = partReaders.get(index);
512          String partLine = r.readLine();
513          // TODO: Synchronization on local variable?
514          synchronized (partReaderLine) {
515            partReaderLine.add(index, partLine);
516            partReaderLine.remove(index + 1);
517          }
518        } else {
519          moreData = false;
520        }
521      }
522    } finally {
523      for (BufferedReader b : partReaders) {
524        try {
525          b.close();
526        } catch (RuntimeException e) {
527        }
528      }
529      // I assume it periodically flushes anyway, so only need to do once at end...
530      sortedFileWriter.flush();
531      sortedFileWriter.close();
532      // delete (intermediary) sort chunk files, only the sorted file remains
533      for (File f : sortFiles) {
534        f.delete();
535      }
536    }
537  }
538
539  /**
540   * Sorts the input file into the output file using the supplied delimited line parameters.
541   * The resulting rows will be sorted according to the @See UnixSortComparator with values taken from the specified
542   * column.
543   * 
544   * @param input To sort
545   * @param sorted The sorted version of the input excluding ignored header lines (see ignoreHeaderLines)
546   * @param column the column that keeps the values to sort on
547   * @param columnDelimiter the delimiter that seperates columns in a row
548   * @param enclosedBy optional column enclosing character, e.g. a double quote for CSVs
549   * @param newlineDelimiter the chars used for new lines, usually \n, \n\r or \r
550   * @param ignoreHeaderLines number of beginning lines to ignore, e.g. headers
551   */
552  public void sort(File input, File sorted, String encoding, int column, String columnDelimiter, Character enclosedBy,
553    String newlineDelimiter, int ignoreHeaderLines) throws IOException {
554    Comparator<String> lineComparator;
555    if (enclosedBy == null) {
556      lineComparator = new LineComparator(column, columnDelimiter);
557    } else {
558      lineComparator = new LineComparator(column, columnDelimiter, enclosedBy);
559    }
560    sort(input, sorted, encoding, column, columnDelimiter, enclosedBy, newlineDelimiter, ignoreHeaderLines,
561      lineComparator, false);
562  }
563
564
565  /**
566   * Sorts the input file into the output file using the supplied delimited line parameters.
567   * The resulting rows will be sorted according to the @See UnixSortComparator with values taken from the specified
568   * column.
569   * 
570   * @param input To sort
571   * @param sorted The sorted version of the input excluding ignored header lines (see ignoreHeaderLines)
572   * @param column the column that keeps the values to sort on
573   * @param columnDelimiter the delimiter that seperates columns in a row
574   * @param enclosedBy optional column enclosing character, e.g. a double quote for CSVs
575   * @param newlineDelimiter the chars used for new lines, usually \n, \n\r or \r
576   * @param ignoreHeaderLines number of beginning lines to ignore, e.g. headers
577   * @param lineComparator used to sort the output
578   * @param ignoreCase ignore case order, this parameter couldn't have any effect if the LineComparator is used
579   */
580  public void sort(File input, File sorted, String encoding, int column, String columnDelimiter, Character enclosedBy,
581    String newlineDelimiter, int ignoreHeaderLines, Comparator<String> lineComparator, boolean ignoreCase)
582    throws IOException {
583    LOG.debug("sorting " + input.getAbsolutePath() + " as new file " + sorted.getAbsolutePath());
584    if (encoding == null) {
585      LOG.warn("No encoding specified, assume UTF-8");
586      encoding = "UTF-8";
587    }
588    // if the id is in the first column, first try sorting via unix shell as its the fastest we can get
589    if (!sortInUnix(input, sorted, encoding, ignoreHeaderLines, column, columnDelimiter, newlineDelimiter, ignoreCase)) {
590      // not first column or doesnt work - maybe running on windows. Do native java sorting
591      LOG.debug("No unix sort available, using native java sorting");
592      sortInJava(input, sorted, encoding, lineComparator, ignoreHeaderLines);
593    }
594  }
595
596  /**
597   * Sorts the input file into the output file using the supplied lineComparator.
598   * 
599   * @param input To sort
600   * @param sorted The sorted version of the input excluding ignored header lines (see ignoreHeaderLines)
601   * @param lineComparator To use during comparison
602   * @param ignoreHeaderLines number of beginning lines to ignore, e.g. headers
603   */
604  public void sortInJava(File input, File sorted, String encoding, Comparator<String> lineComparator,
605    int ignoreHeaderLines) throws IOException {
606    LOG.debug("Sorting File[" + input.getAbsolutePath() + ']');
607    long start = System.currentTimeMillis();
608    List<File> sortFiles = new LinkedList<File>();
609    BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(input), encoding));
610    List<String> headerLines = new LinkedList<String>();
611    try {
612      String line = br.readLine();
613      int fileCount = 0;
614
615      List<String> linesToSort = new LinkedList<String>();
616      while (line != null) {
617        if (ignoreHeaderLines > 0) {
618          headerLines.add(line);
619          ignoreHeaderLines--;
620        } else {
621          linesToSort.add(line);
622
623          // if buffer is full, then sort and write to file
624          if (linesToSort.size() == linesPerMemorySort) {
625            sortFiles.add(sortAndWrite(input, encoding, lineComparator, fileCount, linesToSort));
626            linesToSort = new LinkedList<String>();
627            fileCount++;
628          }
629        }
630        line = br.readLine();
631      }
632      // catch the last lot
633      if (!linesToSort.isEmpty()) {
634        sortFiles.add(sortAndWrite(input, encoding, lineComparator, fileCount, linesToSort));
635      }
636    } finally {
637      br.close();
638    }
639    LOG.debug(
640      sortFiles.size() + " sorted file chunks created in " + (System.currentTimeMillis() - start) / 1000 + " secs");
641
642    // now merge the sorted files into one single sorted file
643    FileWriter sortedFileWriter = new FileWriter(sorted);
644    // first write the old header lines if existing
645    for (String h : headerLines) {
646      sortedFileWriter.write(h);
647      sortedFileWriter.write("\n");
648    }
649    mergedSortedFiles(sortFiles, sortedFileWriter, lineComparator);
650
651    LOG.debug(
652      "File " + input.getAbsolutePath() + " sorted successfully using " + sortFiles.size() + " parts to do sorting in "
653        + (System.currentTimeMillis() - start) / 1000 + " secs");
654  }
655
656
657  /**
658   * Splits the supplied file into files of set line size and with a suffix.
659   * 
660   * @param input To split up
661   * @param linesPerOutput Lines per split file
662   * @param extension The file extension to use - e.g. ".txt"
663   * @return The split files
664   */
665  public List<File> split(File input, int linesPerOutput, String extension) throws IOException {
666    LOG.debug("Splitting File[" + input.getAbsolutePath() + ']');
667    long timer = System.currentTimeMillis();
668    List<File> splitFiles = new LinkedList<File>();
669    BufferedReader br = new BufferedReader(new FileReader(input));
670    String line = br.readLine();
671    int fileCount = 0;
672    File splitFile = getChunkFile(input, fileCount);
673    fileCount++;
674    splitFiles.add(splitFile);
675    FileWriter fw = new FileWriter(splitFile);
676    try {
677      int lineCount = 0;
678      while (line != null) {
679        if (lineCount == linesPerOutput) {
680          fw.flush();
681          fw.close();
682          splitFile = getChunkFile(input, fileCount);
683          splitFiles.add(splitFile);
684          // is ok to reuse, as last one is closed, and this will always get closed - see finally below
685          fw = new FileWriter(splitFile);
686          fileCount++;
687          lineCount = 0;
688        }
689        fw.write(line);
690        fw.write("\n");
691        line = br.readLine();
692        lineCount++;
693      }
694      fw.flush();
695    } finally {
696      fw.close();
697    }
698    LOG.debug("File[" + input.getAbsolutePath() + "] split successfully into[" + splitFiles.size() + "] parts in secs["
699      + (1 + System.currentTimeMillis() - timer) / 1000 + "]");
700    return splitFiles;
701  }
702
703  /**
704   * sort a text file via an external unix sort command:
705   * sorting tabs at 3rd column, numerical reverse order
706   * sort -t$'\t' -k3 -o sorted.txt col2007.txt
707   * <p/>
708   * The unix based sorting is extremely efficient and much, much faster than the current sortInJava method. It is
709   * locale aware though and we only want the native C sorting locale. See
710   * http://www.gnu.org/software/coreutils/faq/coreutils-faq.html#Sort-does-not-sort-in-normal-order_0021
711   * <p/>
712   * Example C sort oder:
713   * <p/>
714   * 1 oOdontoceti 10 gGlobicephala melaena melaena Traill 100 gGlobicephala melaena melaena Traill 101 gGlobicephala
715   * melaena melaena Traill 11 pPontoporia Gray 12 pPontoporia blainvillei Gervais and d'Orbigny 120 iInia d'Orbigny 121
716   * iInia geoffrensis Blainville 2 sSusuidae 20 cCetacea Amphiptera Amphiptera pacifica Giglioli Anarnak Lacépède
717   * Balaena mangidach Chamisso amphiptera amphiptera pacifica Giglioli anarnak Lacépède balaena mangidach Chamisso
718   * ånarnak Lacépède
719   */
720  protected boolean sortInUnix(File input, File sorted, String encoding, int ignoreHeaderLines, int column,
721    String columnDelimiter, String lineDelimiter, boolean ignoreCase) throws IOException {
722    String command;
723    // disable unix sorting for now - behaves differently on various OSes
724    if (column != 0 || lineDelimiter == null || !lineDelimiter.contains("\n") || (columnDelimiter != null
725      && columnDelimiter.contains("\n"))) {
726      LOG.debug("Cannot use unix sort on this file");
727      return false;
728    }
729    // keep header rows
730    boolean success = false;
731    try {
732      LinkedList<String> cmds = new LinkedList<String>();
733      cmds.add("/bin/sh");
734      cmds.add("-c");
735      cmds.add("");
736      ProcessBuilder pb = new ProcessBuilder(cmds);
737      Map<String, String> env = pb.environment();
738      env.clear();
739      // make sure we use the C locale for sorting
740      env.put("LC_ALL", "C");
741      if (ignoreHeaderLines > 0) {
742        // use
743        command = "head -n " + ignoreHeaderLines + ' ' + input.getAbsolutePath() + " > " + sorted.getAbsolutePath();
744        LOG.debug("Issue unix sort cmd: " + command);
745        cmds.removeLast();
746        cmds.add(command);
747        Process process = pb.start();
748        process.waitFor();
749
750        // do the sorting ignoring the header rows
751        command =
752          "sed " + ignoreHeaderLines + "d " + input.getAbsolutePath() + " | sort "
753            + (ignoreCase ? "--ignore-case" : "") + " >> "
754            + sorted.getAbsolutePath();
755      } else {
756        // do sorting directly, we dont have header rows
757        command = "sort -o " + sorted.getAbsolutePath() + ' ' + input.getAbsolutePath();
758      }
759
760      LOG.debug("Issue unix sort cmd: " + command);
761      cmds.removeLast();
762      cmds.add(command);
763      Process process = pb.start();
764      // get the stdout and stderr from the command that was run
765      InputStream err = process.getErrorStream();
766      int exitValue = process.waitFor();
767      if (exitValue == 0) {
768        LOG.debug("Successfully sorted file with unix sort");
769        success = true;
770      } else {
771        LOG.warn("Error sorting file with unix sort");
772        InputStreamUtils isu = new InputStreamUtils();
773        System.err.append(isu.readEntireStream(err));
774      }
775    } catch (Exception e) {
776      LOG.warn("Caught Exception", e);
777    }
778    return success;
779  }
780
781  /**
782   * Sorts the lines and writes to file using the
783   * 
784   * @param input File to base the name on
785   * @param lineComparator To compare the lines for sorting
786   * @param fileCount Used for the file name
787   * @param linesToSort To actually sort
788   * @return The written file
789   */
790  private File sortAndWrite(File input, String encoding, Comparator<String> lineComparator, int fileCount,
791    List<String> linesToSort) throws IOException {
792    long start = System.currentTimeMillis();
793    Collections.sort(linesToSort, lineComparator);
794    // When implementing a comparator, make it SUPER quick!!!
795    LOG.debug(
796      "Collections.sort took msec[" + (System.currentTimeMillis() - start) + "] to sort records[" + linesToSort.size()
797        + ']');
798    File sortFile = getChunkFile(input, fileCount);
799    Writer fw = new OutputStreamWriter(new FileOutputStream(sortFile), encoding);
800    try {
801      for (String s : linesToSort) {
802        fw.write(s);
803        fw.write("\n");
804      }
805    } finally {
806      fw.close();
807    }
808    return sortFile;
809  }
810
811}