001/* 002 * Licensed under the Apache License, Version 2.0 (the "License"); 003 * you may not use this file except in compliance with the License. 004 * You may obtain a copy of the License at 005 * 006 * http://www.apache.org/licenses/LICENSE-2.0 007 * 008 * Unless required by applicable law or agreed to in writing, software 009 * distributed under the License is distributed on an "AS IS" BASIS, 010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 011 * See the License for the specific language governing permissions and 012 * limitations under the License. 013 */ 014package org.gbif.utils.file; 015 016import org.gbif.utils.collection.CompactHashSet; 017import org.gbif.utils.text.LineComparator; 018 019import java.io.BufferedInputStream; 020import java.io.BufferedReader; 021import java.io.BufferedWriter; 022import java.io.File; 023import java.io.FileInputStream; 024import java.io.FileNotFoundException; 025import java.io.FileOutputStream; 026import java.io.FileWriter; 027import java.io.IOException; 028import java.io.InputStream; 029import java.io.InputStreamReader; 030import java.io.OutputStream; 031import java.io.OutputStreamWriter; 032import java.io.UnsupportedEncodingException; 033import java.io.Writer; 034import java.net.URISyntaxException; 035import java.net.URL; 036import java.nio.ByteBuffer; 037import java.nio.charset.Charset; 038import java.nio.charset.StandardCharsets; 039import java.util.Collections; 040import java.util.Comparator; 041import java.util.HashMap; 042import java.util.LinkedList; 043import java.util.List; 044import java.util.Map; 045import java.util.Objects; 046import java.util.Set; 047import java.util.regex.Pattern; 048import java.util.stream.Collectors; 049 050import org.apache.commons.io.FilenameUtils; 051import org.apache.commons.io.LineIterator; 052import org.apache.commons.lang3.StringUtils; 053import org.apache.commons.lang3.tuple.Pair; 054import org.slf4j.Logger; 055import org.slf4j.LoggerFactory; 056 057/** 058 * Collection of file utils. 059 * <br> 060 * This class has only been tested for use with a UTF-8 system encoding. 061 */ 062public final class FileUtils { 063 064 private static final Logger LOG = LoggerFactory.getLogger(FileUtils.class); 065 066 public static final String UTF8 = StandardCharsets.UTF_8.name(); 067 public static final Pattern TAB_DELIMITED = Pattern.compile("\t"); 068 private static int linesPerMemorySort = 100000; 069 private static Boolean gnuSortAvailable = null; 070 private static final Object sortLock = new Object(); 071 072 static { 073 /* Warn when the software is not run in a Unicode environment. This library has not been 074 * tested to run in a non-Unicode environment, and may cause data corruption. 075 */ 076 if (Charset.defaultCharset().equals(StandardCharsets.US_ASCII)) { 077 System.err.println( 078 "The default character set is US ASCII. It is strongly recommended to " 079 + "run this software in a Unicode environment."); 080 } 081 } 082 083 public static String classpath2Filepath(String path) { 084 return new File(ClassLoader.getSystemResource(path).getFile()).getAbsolutePath(); 085 } 086 087 public static InputStream classpathStream(String path) throws IOException { 088 InputStream in = null; 089 // relative path. Use classpath instead 090 URL url = FileUtils.class.getClassLoader().getResource(path); 091 if (url != null) { 092 in = url.openStream(); 093 } 094 return in; 095 } 096 097 public static Set<String> columnsToSet(InputStream source, int... column) throws IOException { 098 return columnsToSet(source, new CompactHashSet<String>(), column); 099 } 100 101 /** 102 * Reads a file and returns a unique set of multiple columns from lines which are no comments (starting with #) and 103 * trims whitespace. 104 * 105 * @param source the UTF-8 encoded text file with tab delimited columns 106 * @param resultSet the set implementation to be used. Will not be cleared before reading! 107 * @param column variable length argument of column indices to process 108 * @return set of column rows 109 */ 110 public static Set<String> columnsToSet(InputStream source, Set<String> resultSet, int... column) 111 throws IOException { 112 LineIterator lines = getLineIterator(source); 113 int maxCols = 0; 114 for (int c : column) { 115 if (c > maxCols) { 116 maxCols = c; 117 } 118 } 119 while (lines.hasNext()) { 120 String line = lines.nextLine().trim(); 121 // ignore comments 122 if (!ignore(line)) { 123 String[] parts = TAB_DELIMITED.split(line); 124 if (maxCols <= parts.length) { 125 for (int c : column) { 126 String cell = parts[c].trim(); 127 resultSet.add(cell); 128 } 129 } 130 } 131 } 132 return resultSet; 133 } 134 135 public static void copyStreams(InputStream in, OutputStream out) throws IOException { 136 // write the file to the file specified 137 int bytesRead; 138 byte[] buffer = new byte[8192]; 139 140 while ((bytesRead = in.read(buffer, 0, 8192)) != -1) { 141 out.write(buffer, 0, bytesRead); 142 } 143 144 out.close(); 145 in.close(); 146 } 147 148 public static void copyStreamToFile(InputStream in, File out) throws IOException { 149 copyStreams(in, new FileOutputStream(out)); 150 } 151 152 public static File createTempDir() throws IOException { 153 return createTempDir("gbif-futil", ".tmp"); 154 } 155 156 /** 157 * @param prefix The prefix string to be used in generating the file's name; must be at least three characters long 158 * @param suffix The suffix string to be used in generating the file's name; may be null, in which case the suffix 159 * ".tmp" will be used 160 */ 161 public static File createTempDir(String prefix, String suffix) throws IOException { 162 File dir = File.createTempFile(prefix, suffix); 163 if (!dir.delete()) { 164 throw new IOException("Could not delete temp file: " + dir.getAbsolutePath()); 165 } 166 if (!dir.mkdir()) { 167 throw new IOException("Could not create temp directory: " + dir.getAbsolutePath()); 168 } 169 return dir; 170 } 171 172 /** 173 * Delete directory recursively, including all its files, sub-folders, and sub-folder's files. 174 * 175 * @param directory directory to delete recursively 176 */ 177 public static void deleteDirectoryRecursively(File directory) { 178 File[] list = directory.listFiles(); 179 for (File file : list) { 180 if (file.isDirectory()) { 181 deleteDirectoryRecursively(file); 182 file.delete(); 183 } else { 184 file.delete(); 185 } 186 } 187 directory.delete(); 188 } 189 190 /** 191 * Escapes a filename so it is a valid filename on all systems, replacing /. .. \t\r\n. 192 * 193 * @param filename to be escaped 194 */ 195 public static String escapeFilename(String filename) { 196 return filename.replaceAll("[\\s./&]", "_"); 197 } 198 199 public static File getClasspathFile(String path) { 200 return new File(ClassLoader.getSystemResource(path).getFile()); 201 } 202 203 public static InputStream getInputStream(File source) throws FileNotFoundException { 204 return new FileInputStream(source); 205 } 206 207 public static BufferedReader getInputStreamReader(InputStream input) 208 throws FileNotFoundException { 209 return getInputStreamReader(input, UTF8); 210 } 211 212 public static BufferedReader getInputStreamReader(InputStream input, String encoding) 213 throws FileNotFoundException { 214 BufferedReader reader = null; 215 try { 216 reader = new BufferedReader(new InputStreamReader(input, encoding)); 217 } catch (UnsupportedEncodingException e) { 218 LOG.warn("Caught Exception", e); 219 } 220 return reader; 221 } 222 223 /** 224 * @param source the source input stream encoded in UTF-8 225 */ 226 public static LineIterator getLineIterator(InputStream source) { 227 return getLineIterator(source, UTF8); 228 } 229 230 /** 231 * @param source the source input stream 232 * @param encoding the encoding used by the input stream 233 */ 234 public static LineIterator getLineIterator(InputStream source, String encoding) { 235 try { 236 return new LineIterator(new BufferedReader(new InputStreamReader(source, encoding))); 237 } catch (UnsupportedEncodingException e) { 238 throw new IllegalArgumentException("Unsupported encoding" + encoding, e); 239 } 240 } 241 242 public static BufferedReader getUtf8Reader(File file) throws FileNotFoundException { 243 BufferedReader reader = null; 244 try { 245 reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), UTF8)); 246 } catch (UnsupportedEncodingException e) { 247 LOG.warn("Caught Exception", e); 248 } 249 return reader; 250 } 251 252 /** 253 * Converts the byte size into human-readable format. 254 * Support both SI and byte format. 255 */ 256 public static String humanReadableByteCount(long bytes, boolean si) { 257 int unit = si ? 1000 : 1024; 258 if (bytes < unit) { 259 return bytes + " B"; 260 } 261 int exp = (int) (Math.log(bytes) / Math.log(unit)); 262 String pre = (si ? "kMGTPE" : "KMGTPE").charAt(exp - 1) + (si ? "" : "i"); 263 return String.format("%.1f %sB", bytes / Math.pow(unit, exp), pre); 264 } 265 266 public static boolean isCompressedFile(File source) { 267 String suffix = source.getName().substring(source.getName().lastIndexOf('.') + 1); 268 return suffix != null 269 && suffix.length() > 0 270 && ("zip".equalsIgnoreCase(suffix) 271 || "tgz".equalsIgnoreCase(suffix) 272 || "gz".equalsIgnoreCase(suffix)); 273 } 274 275 /** 276 * Reads a complete file into a byte buffer. 277 */ 278 public static ByteBuffer readByteBuffer(File file) throws IOException { 279 byte[] content = org.apache.commons.io.FileUtils.readFileToByteArray(file); 280 return ByteBuffer.wrap(content); 281 } 282 283 /** 284 * Reads the first bytes of a file into a byte buffer. 285 * 286 * @param bufferSize the number of bytes to read from the file 287 */ 288 public static ByteBuffer readByteBuffer(File file, int bufferSize) throws IOException { 289 ByteBuffer bbuf = ByteBuffer.allocate(bufferSize); 290 BufferedInputStream f = new BufferedInputStream(new FileInputStream(file), bufferSize); 291 292 int b; 293 while ((b = f.read()) != -1) { 294 if (!bbuf.hasRemaining()) { 295 break; 296 } 297 bbuf.put((byte) b); 298 } 299 f.close(); 300 301 return bbuf; 302 } 303 304 /** 305 * @param linesPerMemorySort are the number of lines that should be sorted in memory, determining the number of file 306 * segments to be sorted when doing a Java file sort. Defaults to 100000, if you have 307 * memory available a higher value increases performance. 308 */ 309 public static void setLinesPerMemorySort(int linesPerMemorySort) { 310 FileUtils.linesPerMemorySort = linesPerMemorySort; 311 } 312 313 public static Writer startNewUtf8File(File file) throws IOException { 314 touch(file); 315 return new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file, false), UTF8)); 316 } 317 318 public static Writer startNewUtf8XmlFile(File file) throws IOException { 319 Writer writer = startNewUtf8File(file); 320 writer.write("<?xml version='1.0' encoding='utf-8'?>\n"); 321 return writer; 322 } 323 324 /** 325 * Takes a utf8 encoded input stream and reads in every line/row into a list. 326 * 327 * @return list of rows 328 */ 329 public static LinkedList<String> streamToList(InputStream source) throws IOException { 330 return streamToList(source, FileUtils.UTF8); 331 } 332 333 /** 334 * Reads a file and returns a list of all lines which are no comments (starting with #) and trims whitespace. 335 * 336 * @param source the UTF-8 encoded text file to read 337 * @param resultList the list implementation to be used. Will not be cleared before reading! 338 * @return list of lines 339 */ 340 public static List<String> streamToList(InputStream source, List<String> resultList) 341 throws IOException { 342 LineIterator lines = getLineIterator(source); 343 while (lines.hasNext()) { 344 String line = lines.nextLine().trim(); 345 // ignore comments 346 if (!ignore(line)) { 347 resultList.add(line); 348 } 349 } 350 return resultList; 351 } 352 353 public static LinkedList<String> streamToList(InputStream source, String encoding) 354 throws IOException { 355 LinkedList<String> resultList = new LinkedList<>(); 356 try { 357 LineIterator lines = 358 new LineIterator(new BufferedReader(new InputStreamReader(source, encoding))); 359 while (lines.hasNext()) { 360 String line = lines.nextLine(); 361 resultList.add(line); 362 } 363 } catch (UnsupportedEncodingException e) { 364 throw new IllegalArgumentException("Unsupported encoding " + encoding, e); 365 } 366 return resultList; 367 } 368 369 /** 370 * Reads a utf8 encoded inut stream, splits 371 */ 372 public static Map<String, String> streamToMap(InputStream source) throws IOException { 373 return streamToMap(source, new HashMap<>()); 374 } 375 376 public static Map<String, String> streamToMap( 377 InputStream source, int key, int value, boolean trimToNull) throws IOException { 378 return streamToMap(source, new HashMap<>(), key, value, trimToNull); 379 } 380 381 /** 382 * Read a hashmap from a tab delimited utf8 input stream using the row number as an integer value and the entire row 383 * as the value. Ignores commented rows starting with #. 384 * 385 * @param source tab delimited text file to read 386 */ 387 public static Map<String, String> streamToMap(InputStream source, Map<String, String> result) 388 throws IOException { 389 LineIterator lines = getLineIterator(source); 390 Integer row = 0; 391 while (lines.hasNext()) { 392 row++; 393 String line = lines.nextLine().trim(); 394 // ignore comments 395 if (!ignore(line)) { 396 result.put(line, row.toString()); 397 } 398 } 399 return result; 400 } 401 402 /** 403 * Read a hashmap from a tab delimited utf8 file, ignoring commented rows starting with #. 404 * 405 * @param source tab delimited input stream to read 406 * @param key column number to use as key 407 * @param value column number to use as value 408 * @param trimToNull if true trims map entries to null 409 */ 410 public static Map<String, String> streamToMap( 411 InputStream source, Map<String, String> result, int key, int value, boolean trimToNull) 412 throws IOException { 413 LineIterator lines = getLineIterator(source); 414 int maxCols = key > value ? key : value + 1; 415 while (lines.hasNext()) { 416 String line = lines.nextLine(); 417 // ignore comments 418 if (!ignore(line)) { 419 String[] parts = TAB_DELIMITED.split(line); 420 if (maxCols <= parts.length) { 421 if (trimToNull) { 422 result.put(StringUtils.trimToNull(parts[key]), StringUtils.trimToNull(parts[value])); 423 } else { 424 result.put(parts[key], parts[value]); 425 } 426 } 427 } 428 } 429 return result; 430 } 431 432 public static Set<String> streamToSet(InputStream source) throws IOException { 433 return streamToSet(source, new CompactHashSet<>()); 434 } 435 436 /** 437 * Reads a file and returns a unique set of all lines which are no comments (starting with #) and trims whitespace. 438 * 439 * @param source the UTF-8 encoded text file to read 440 * @param resultSet the set implementation to be used. Will not be cleared before reading! 441 * @return set of unique lines 442 */ 443 public static Set<String> streamToSet(InputStream source, Set<String> resultSet) 444 throws IOException { 445 LineIterator lines = getLineIterator(source); 446 while (lines.hasNext()) { 447 String line = lines.nextLine().trim(); 448 // ignore comments 449 if (!ignore(line)) { 450 resultSet.add(line); 451 } 452 } 453 return resultSet; 454 } 455 456 public static String toFilePath(URL url) { 457 String protocol = 458 url.getProtocol() == null || "http".equalsIgnoreCase(url.getProtocol()) 459 ? "" 460 : "/__" + url.getProtocol() + "__"; 461 String domain = url.getAuthority() == null ? "__domainless" : url.getAuthority(); 462 return domain + protocol + url.getFile(); 463 } 464 465 public static File url2file(URL url) { 466 File f = null; 467 try { 468 f = new File(url.toURI()); 469 } catch (URISyntaxException e) { 470 f = new File(url.getPath()); 471 } 472 return f; 473 } 474 475 /** 476 * For the given list, finds the index of the lowest value using the given comparator. 477 * 478 * @param values To compare 479 * @param comparator To use 480 * @return The index of the lowest value, or -1 if they are all null 481 */ 482 static int lowestValueIndex(List<String> values, Comparator<String> comparator) { 483 int index = 0; 484 String lowestValue = null; 485 for (int i = 0; i < values.size(); i++) { 486 String value = values.get(i); 487 if (lowestValue == null) { 488 lowestValue = value; 489 index = i; 490 } else if (comparator.compare(lowestValue, value) > 0) { 491 lowestValue = value; 492 index = i; 493 } 494 } 495 496 return lowestValue == null ? -1 : index; 497 } 498 499 /** 500 * For the given file's path, returns a proposed new filename (including path) with the extension 501 * index and suffix. So a file of "/tmp/input.txt" -> "/tmp/input_part_10.txt". 502 * 503 * @param original File 504 * @param index E.g. 10 505 * @return The proposed name 506 */ 507 private static File getChunkFile(File original, int index) { 508 return new File( 509 original.getParentFile(), 510 FilenameUtils.getBaseName(original.getName()) 511 + '_' 512 + index 513 + getFileExtension(original.getName())); 514 } 515 516 private static boolean ignore(String line) { 517 return StringUtils.trimToNull(line) == null || line.startsWith("#"); 518 } 519 520 public int getLinesPerMemorySort() { 521 return linesPerMemorySort; 522 } 523 524 /** 525 * Merges a list of intermediary sort chunk files into a single sorted file. On completion, the intermediary sort 526 * chunk files are deleted. 527 * 528 * @param sortFiles sort chunk files to merge 529 * @param sortedFileWriter writer to merge to. Can already be open and contain data 530 * @param lineComparator To use when determining the order (reuse the one that was used to sort the individual 531 * files) 532 */ 533 public void mergeSortedFiles( 534 List<File> sortFiles, Writer sortedFileWriter, Comparator<String> lineComparator) 535 throws IOException { 536 LinkedList<Pair<String, BufferedReader>> partReaders = new LinkedList<>(); 537 try { 538 for (File f : sortFiles) { 539 // Use UTF-8 sort order. 540 BufferedReader partReader = 541 new BufferedReader( 542 new InputStreamReader(new FileInputStream(f), StandardCharsets.UTF_8)); 543 // Load first lines 544 String partLine = partReader.readLine(); 545 if (partLine != null) { 546 partReaders.add(Pair.of(partLine, partReader)); 547 } 548 } 549 // Sort the first lines 550 Collections.sort(partReaders, (Comparator.comparing(Pair::getLeft, lineComparator))); 551 552 // Start with the first reader 553 while (partReaders.size() > 1) { 554 BufferedReader currentBuffer = partReaders.get(0).getRight(); 555 String currentLine = partReaders.get(0).getLeft(); 556 String nextFilesFirstLine = partReaders.get(1).getLeft(); 557 558 // Read from it, until its value is greater than the second reader. 559 while (currentLine != null 560 && lineComparator.compare(currentLine, nextFilesFirstLine) <= 0) { 561 sortedFileWriter.write(currentLine); 562 sortedFileWriter.write('\n'); 563 564 currentLine = currentBuffer.readLine(); 565 } 566 partReaders.remove(0); 567 568 if (currentLine == null) { 569 // If it's completed, close and remove it. 570 currentBuffer.close(); 571 } else { 572 // Otherwise, insert it into the list, maintaining the order 573 Pair<String, BufferedReader> currentReaderPair = Pair.of(currentLine, currentBuffer); 574 575 // Start at 1, as we are always larger than the first (was the second) entry 576 for (int i = 1; i <= partReaders.size(); i++) { 577 // If we get here, it goes at the end of the list. 578 if (i == partReaders.size()) { 579 partReaders.add(i, currentReaderPair); 580 break; 581 } 582 583 if (lineComparator.compare(partReaders.get(i).getLeft(), currentLine) >= 0) { 584 partReaders.add(i, currentReaderPair); 585 break; 586 } 587 } 588 } 589 // mergeFileStatus("Loop "+currentLine, partReaders); 590 } 591 592 // Read the remainder of the final buffer 593 // mergeFileStatus("Final", partReaders); 594 if (partReaders.size() > 0) { 595 BufferedReader currentBuffer = partReaders.get(0).getRight(); 596 String current = partReaders.get(0).getLeft(); 597 while (current != null) { 598 sortedFileWriter.write(current); 599 sortedFileWriter.write('\n'); 600 601 current = currentBuffer.readLine(); 602 } 603 currentBuffer.close(); 604 } 605 } finally { 606 for (Pair<String, BufferedReader> pair : partReaders) { 607 try { 608 pair.getRight().close(); 609 } catch (RuntimeException e) { 610 } 611 } 612 // I assume it periodically flushes anyway, so only need to do once at end... 613 sortedFileWriter.flush(); 614 sortedFileWriter.close(); 615 // delete (intermediary) sort chunk files, only the sorted file remains 616 for (File f : sortFiles) { 617 f.delete(); 618 } 619 } 620 } 621 622 // Just for debugging 623 private void mergeFileStatus(String note, List<Pair<String, BufferedReader>> partReaders) { 624 LOG.trace(note); 625 for (int i = 0; i < partReaders.size(); i++) { 626 LOG.trace(i + ": " + partReaders.get(i).getLeft()); 627 } 628 LOG.trace("-"); 629 } 630 631 /** 632 * Sorts the input file into the output file using the supplied delimited line parameters. 633 * 634 * This method is not reliable when the sort field may contain Unicode codepoints outside the Basic Multilingual Plane, 635 * i.e. above \uFFFF. In that case, the sort order differs from Java's String sort order. This should not be a problem 636 * for most usage; the Supplementary Multilingual Planes contain ancient scripts, emojis, arrows and so on. 637 * 638 * @param input To sort 639 * @param sorted The sorted version of the input excluding ignored header lines (see ignoreHeaderLines) 640 * @param column the column that keeps the values to sort on 641 * @param columnDelimiter the delimiter that separates columns in a row 642 * @param enclosedBy optional column enclosing character, e.g. a double quote for CSVs 643 * @param newlineDelimiter the chars used for new lines, usually \n, \n\r or \r 644 * @param ignoreHeaderLines number of beginning lines to ignore, e.g. headers 645 */ 646 public void sort( 647 File input, 648 File sorted, 649 String encoding, 650 int column, 651 String columnDelimiter, 652 Character enclosedBy, 653 String newlineDelimiter, 654 int ignoreHeaderLines) 655 throws IOException { 656 sort( 657 Collections.singletonList(input), 658 sorted, 659 encoding, 660 column, 661 columnDelimiter, 662 enclosedBy, 663 newlineDelimiter, 664 ignoreHeaderLines); 665 } 666 667 /** 668 * Sorts the input file into the output file using the supplied delimited line parameters. 669 * 670 * This method is not reliable when the sort field may contain Unicode codepoints outside the Basic Multilingual Plane, 671 * i.e. above \uFFFF. In that case, the sort order differs from Java's String sort order. This should not be a problem 672 * for most usage; the Supplementary Multilingual Planes contain ancient scripts, emojis, arrows and so on. 673 * 674 * @param inputs To sort 675 * @param sorted The sorted version of the input excluding ignored header lines (see ignoreHeaderLines) 676 * @param column the column that keeps the values to sort on 677 * @param columnDelimiter the delimiter that separates columns in a row 678 * @param enclosedBy optional column enclosing character, e.g. a double quote for CSVs 679 * @param newlineDelimiter the chars used for new lines, usually \n, \n\r or \r 680 * @param ignoreHeaderLines number of beginning lines to ignore, e.g. headers 681 */ 682 public void sort( 683 List<File> inputs, 684 File sorted, 685 String encoding, 686 int column, 687 String columnDelimiter, 688 Character enclosedBy, 689 String newlineDelimiter, 690 int ignoreHeaderLines) 691 throws IOException { 692 Comparator<String> lineComparator; 693 if (enclosedBy == null) { 694 lineComparator = new LineComparator(column, columnDelimiter); 695 } else { 696 lineComparator = new LineComparator(column, columnDelimiter, enclosedBy); 697 } 698 sort( 699 inputs, 700 sorted, 701 encoding, 702 column, 703 columnDelimiter, 704 enclosedBy, 705 newlineDelimiter, 706 ignoreHeaderLines, 707 lineComparator, 708 false); 709 } 710 711 /** 712 * Sorts the input file into the output file using the supplied delimited line parameters. 713 * 714 * This method is not reliable when the sort field may contain Unicode codepoints outside the Basic Multilingual Plane, 715 * i.e. above \uFFFF. In that case, the sort order differs from Java's String sort order. This should not be a problem 716 * for most usage; the Supplementary Multilingual Planes contain ancient scripts, emojis, arrows and so on. 717 * 718 * This method is globally synchronized, in case multiple sorts are attempted to the same file simultaneously. 719 * This could be improved to allow synchronizing against the destination file, rather than for all sorts. 720 * 721 * @param input To sort 722 * @param sorted The sorted version of the input excluding ignored header lines (see ignoreHeaderLines) 723 * @param column the column that keeps the values to sort on 724 * @param columnDelimiter the delimiter that separates columns in a row 725 * @param enclosedBy optional column enclosing character, e.g. a double quote for CSVs 726 * @param newlineDelimiter the chars used for new lines, usually \n, \r\n or \r 727 * @param ignoreHeaderLines number of beginning lines to ignore, e.g. headers 728 * @param lineComparator used to sort the output 729 * @param ignoreCase ignore case order, this parameter couldn't have any effect if the LineComparator is used 730 */ 731 public void sort( 732 File input, 733 File sorted, 734 String encoding, 735 int column, 736 String columnDelimiter, 737 Character enclosedBy, 738 String newlineDelimiter, 739 int ignoreHeaderLines, 740 Comparator<String> lineComparator, 741 boolean ignoreCase) 742 throws IOException { 743 sort( 744 Collections.singletonList(input), 745 sorted, 746 encoding, 747 column, 748 columnDelimiter, 749 enclosedBy, 750 newlineDelimiter, 751 ignoreHeaderLines, 752 lineComparator, 753 ignoreCase); 754 } 755 756 /** 757 * Sorts the input file into the output file using the supplied delimited line parameters. 758 * 759 * This method is not reliable when the sort field may contain Unicode codepoints outside the Basic Multilingual Plane, 760 * i.e. above \uFFFF. In that case, the sort order differs from Java's String sort order. This should not be a problem 761 * for most usage; the Supplementary Multilingual Planes contain ancient scripts, emojis, arrows and so on. 762 * 763 * This method is globally synchronized, in case multiple sorts are attempted to the same file simultaneously. 764 * This could be improved to allow synchronizing against the destination file, rather than for all sorts. 765 * 766 * @param inputs To sort 767 * @param sorted The sorted version of the input excluding ignored header lines (see ignoreHeaderLines) 768 * @param column the column that keeps the values to sort on 769 * @param columnDelimiter the delimiter that separates columns in a row 770 * @param enclosedBy optional column enclosing character, e.g. a double quote for CSVs 771 * @param newlineDelimiter the chars used for new lines, usually \n, \r\n or \r 772 * @param ignoreHeaderLines number of beginning lines to ignore, e.g. headers 773 * @param lineComparator used to sort the output 774 * @param ignoreCase ignore case order, this parameter couldn't have any effect if the LineComparator is used 775 */ 776 public void sort( 777 List<File> inputs, 778 File sorted, 779 String encoding, 780 int column, 781 String columnDelimiter, 782 Character enclosedBy, 783 String newlineDelimiter, 784 int ignoreHeaderLines, 785 Comparator<String> lineComparator, 786 boolean ignoreCase) 787 throws IOException { 788 LOG.debug( 789 "Sorting file(s) {} as new file {}", 790 inputs.stream().map(File::getAbsolutePath).toArray(), 791 sorted.getAbsolutePath()); 792 if (encoding == null) { 793 LOG.warn("No encoding specified, assume UTF-8"); 794 encoding = FileUtils.UTF8; 795 } 796 synchronized (sortLock) { 797 if (sorted.exists()) { 798 // Delete a file, which will allow processes with it open to continue reading it. 799 // The GNU sort truncates and appends, which would mean a partial read otherwise. 800 LOG.warn("Deleting existed sorted file {}", sorted.getAbsoluteFile()); 801 sorted.delete(); 802 } 803 // if the id is in the first column, first try sorting via shell as its the fastest we can get 804 if (!sortInGnu( 805 inputs, 806 sorted, 807 encoding, 808 ignoreHeaderLines, 809 column, 810 columnDelimiter, 811 enclosedBy, 812 newlineDelimiter, 813 ignoreCase)) { 814 LOG.debug("No GNU sort available, using native Java sorting"); 815 sortInJava(inputs, sorted, encoding, lineComparator, ignoreHeaderLines); 816 } 817 } 818 } 819 820 /** 821 * Sorts the input file into the output file using the supplied lineComparator. 822 * 823 * @param input To sort 824 * @param sorted The sorted version of the input excluding ignored header lines (see ignoreHeaderLines) 825 * @param lineComparator To use during comparison 826 * @param ignoreHeaderLines number of beginning lines to ignore, e.g. headers 827 */ 828 public void sortInJava( 829 File input, 830 File sorted, 831 String encoding, 832 Comparator<String> lineComparator, 833 int ignoreHeaderLines) 834 throws IOException { 835 sortInJava( 836 Collections.singletonList(input), sorted, encoding, lineComparator, ignoreHeaderLines); 837 } 838 839 /** 840 * Sorts the input file into the output file using the supplied lineComparator. 841 * 842 * @param inputs To sort 843 * @param sorted The sorted version of the input excluding ignored header lines (see ignoreHeaderLines) 844 * @param lineComparator To use during comparison 845 * @param ignoreHeaderLines number of beginning lines to ignore, e.g. headers 846 */ 847 public void sortInJava( 848 List<File> inputs, 849 File sorted, 850 String encoding, 851 Comparator<String> lineComparator, 852 int ignoreHeaderLines) 853 throws IOException { 854 LOG.debug("Sorting file(s) {}", inputs); 855 long start = System.currentTimeMillis(); 856 857 List<File> sortFiles = new LinkedList<>(); 858 List<String> headerLines = new LinkedList<>(); 859 for (File input : inputs) { 860 BufferedReader br = 861 new BufferedReader(new InputStreamReader(new FileInputStream(input), encoding)); 862 int skipHeaderLines = ignoreHeaderLines; 863 try { 864 String line = br.readLine(); 865 int fileCount = 0; 866 867 List<String> linesToSort = new LinkedList<>(); 868 while (line != null) { 869 if (skipHeaderLines > 0) { 870 // Only add the header lines for the first file 871 if (headerLines.size() < ignoreHeaderLines) { 872 headerLines.add(line); 873 } 874 skipHeaderLines--; 875 } else { 876 linesToSort.add(line); 877 878 // if buffer is full, then sort and write to file 879 if (linesToSort.size() == linesPerMemorySort) { 880 sortFiles.add(sortAndWrite(input, encoding, lineComparator, fileCount, linesToSort)); 881 linesToSort = new LinkedList<>(); 882 fileCount++; 883 } 884 } 885 line = br.readLine(); 886 } 887 // catch the last lot 888 if (!linesToSort.isEmpty()) { 889 sortFiles.add(sortAndWrite(input, encoding, lineComparator, fileCount, linesToSort)); 890 } 891 } finally { 892 br.close(); 893 } 894 } 895 LOG.debug( 896 sortFiles.size() 897 + " sorted file chunks created in " 898 + (System.currentTimeMillis() - start) / 1000 899 + " secs"); 900 901 // now merge the sorted files into one single sorted file 902 Writer sortedFileWriter = new BufferedWriter(new FileWriter(sorted)); 903 // first write the old header lines if existing 904 for (String h : headerLines) { 905 sortedFileWriter.write(h); 906 sortedFileWriter.write("\n"); 907 } 908 mergeSortedFiles(sortFiles, sortedFileWriter, lineComparator); 909 910 LOG.debug( 911 "File(s) {} sorted successfully using {} parts to do sorting in {}s", 912 inputs.stream().map(File::getAbsolutePath).toArray(), 913 sortFiles.size(), 914 (System.currentTimeMillis() - start) / 1000); 915 } 916 917 /** 918 * Splits the supplied file into files of set line size and with a suffix. 919 * 920 * @param input To split up 921 * @param linesPerOutput Lines per split file 922 * @param extension The file extension to use - e.g. ".txt" 923 * @return The split files 924 */ 925 public List<File> split(File input, int linesPerOutput, String extension) throws IOException { 926 LOG.debug("Splitting File[" + input.getAbsolutePath() + ']'); 927 long timer = System.currentTimeMillis(); 928 List<File> splitFiles = new LinkedList<>(); 929 // Use ISO-8859-1 as a binary-safe encoding. 930 BufferedReader br = 931 new BufferedReader( 932 new InputStreamReader(new FileInputStream(input), StandardCharsets.ISO_8859_1)); 933 String line = br.readLine(); 934 int fileCount = 0; 935 File splitFile = getChunkFile(input, fileCount); 936 fileCount++; 937 splitFiles.add(splitFile); 938 OutputStreamWriter fw = 939 new OutputStreamWriter(new FileOutputStream(splitFile), StandardCharsets.ISO_8859_1); 940 try { 941 int lineCount = 0; 942 while (line != null) { 943 if (lineCount == linesPerOutput) { 944 fw.flush(); 945 fw.close(); 946 splitFile = getChunkFile(input, fileCount); 947 splitFiles.add(splitFile); 948 // is ok to reuse, as last one is closed, and this will always get closed - see finally 949 // below 950 fw = new OutputStreamWriter(new FileOutputStream(splitFile), StandardCharsets.ISO_8859_1); 951 fileCount++; 952 lineCount = 0; 953 } 954 fw.write(line); 955 fw.write("\n"); 956 line = br.readLine(); 957 lineCount++; 958 } 959 fw.flush(); 960 } finally { 961 fw.close(); 962 } 963 LOG.debug( 964 "File[" 965 + input.getAbsolutePath() 966 + "] split successfully into[" 967 + splitFiles.size() 968 + "] parts in secs[" 969 + (1 + System.currentTimeMillis() - timer) / 1000 970 + "]"); 971 return splitFiles; 972 } 973 974 /** 975 * Test whether we have a new enough version of GNU Sort that supports (primarily) the -k option with a start and end 976 * column. 977 * 978 * Mac OS only includes an old version of GNU sort, and will fail this test. 979 */ 980 private boolean gnuSortAvailable() { 981 if (gnuSortAvailable != null) { 982 return gnuSortAvailable; 983 } 984 985 try { 986 String command = "sort -k1,1 -t',' --ignore-case /dev/null"; 987 LOG.debug("Testing capability of 'sort' with command: {}", command); 988 989 Process process = new ProcessBuilder("/bin/sh", "-c", command).start(); 990 int exitValue = process.waitFor(); 991 992 if (exitValue == 0) { 993 LOG.debug("GNU sort is capable"); 994 gnuSortAvailable = true; 995 } else { 996 LOG.warn( 997 "GNU sort does not exist or is too old, and will not be used. Sorting large files will be slow.", 998 new InputStreamUtils().readEntireStream(process.getErrorStream()).replace('\n', ' ')); 999 gnuSortAvailable = false; 1000 } 1001 } catch (Exception e) { 1002 LOG.warn( 1003 "GNU sort does not exist or is too old, and will not be used. Sorting large files will be slow.", 1004 e); 1005 gnuSortAvailable = false; 1006 } 1007 1008 return gnuSortAvailable; 1009 } 1010 1011 /** 1012 * sort a text file via an external GNU sort command: 1013 * sorting tabs at 3rd column, numerical reverse order 1014 * sort -t$'\t' -k3 -o sorted.txt col2007.txt 1015 * <p/> 1016 * The GNU sort based sorting is extremely efficient and much, much faster than the current sortInJava method. It is 1017 * locale aware though and we only want the native C sorting locale. See 1018 * http://www.gnu.org/software/coreutils/faq/coreutils-faq.html#Sort-does-not-sort-in-normal-order_0021 1019 * <p/> 1020 * Example C sort order: 1021 * <p/> 1022 * <pre> 1023 * 1 oOdontoceti 1024 * 10 gGlobicephala melaena melaena Traill 1025 * 100 gGlobicephala melaena melaena Traill 1026 * 101 gGlobicephala melaena melaena Traill 1027 * 11 pPontoporia Gray 1028 * 12 pPontoporia blainvillei Gervais and d'Orbigny 1029 * 120 iInia d'Orbigny 1030 * 121 iInia geoffrensis Blainville 1031 * 2 sSusuidae 1032 * 20 cCetacea 1033 * Amphiptera 1034 * Amphiptera pacifica Giglioli 1035 * Anarnak Lacépède 1036 * Balaena mangidach Chamisso 1037 * amphiptera 1038 * amphiptera pacifica Giglioli 1039 * anarnak Lacépède 1040 * balaena mangidach Chamisso 1041 * </pre> 1042 */ 1043 protected boolean sortInGnu( 1044 List<File> inputs, 1045 File sorted, 1046 String encoding, 1047 int ignoreHeaderLines, 1048 int column, 1049 String columnDelimiter, 1050 Character enclosedBy, 1051 String lineDelimiter, 1052 boolean ignoreCase) 1053 throws IOException { 1054 String command; 1055 // GNU sort is available for use when: 1056 // • line delimiter is \n 1057 // • no enclosed by/quote character is in use 1058 // • sorting is using the first column 1059 // • sort version is sufficient to include start and end column (-k 1,1). 1060 // Use the --debug option to sort if working on this code. 1061 if (lineDelimiter == null || !lineDelimiter.contains("\n")) { 1062 LOG.debug("Cannot use GNU sort on this file: line delimiter does not contain newline."); 1063 return false; 1064 } else if (columnDelimiter != null && column > 0) { 1065 LOG.debug("Cannot use GNU sort on this file: sort column is not the first."); 1066 return false; 1067 } else if (enclosedBy != null) { 1068 LOG.debug("Cannot use GNU sort on this file: enclosed by character set."); 1069 return false; 1070 } else if (!gnuSortAvailable()) { 1071 LOG.debug("Cannot use GNU sort on this file: command unavailable."); 1072 return false; 1073 } 1074 1075 // keep header rows 1076 boolean success = false; 1077 try { 1078 LinkedList<String> cmds = new LinkedList<>(); 1079 cmds.add("/bin/sh"); 1080 cmds.add("-c"); 1081 cmds.add(""); 1082 ProcessBuilder pb = new ProcessBuilder(cmds); 1083 Map<String, String> env = pb.environment(); 1084 1085 // clear the environment, but keep specified temp working directory 1086 env.keySet().removeIf(key -> !(key.equals("TMPDIR"))); 1087 if (System.getProperty("java.io.tmpdir") != null) { 1088 env.put("TMPDIR", System.getProperty("java.io.tmpdir")); 1089 } 1090 // make sure we use the C locale for sorting 1091 env.put("LC_ALL", "C"); 1092 1093 String sortArgs = 1094 String.format( 1095 " %s -k%d,%d -t'%s'", 1096 ignoreCase ? "--ignore-case" : "", column + 1, column + 1, columnDelimiter); 1097 1098 String fileList = inputs.stream().map(File::getAbsolutePath).collect(Collectors.joining(" ")); 1099 if (ignoreHeaderLines > 0) { 1100 // copy header lines 1101 command = 1102 "head -n " 1103 + ignoreHeaderLines 1104 + ' ' 1105 + inputs.get(0).getAbsolutePath() 1106 + " > " 1107 + sorted.getAbsolutePath(); 1108 LOG.debug("Issue external command: {}", command); 1109 cmds.removeLast(); 1110 cmds.add(command); 1111 Process process = pb.start(); 1112 int exitValue = process.waitFor(); 1113 if (exitValue != 0) { 1114 LOG.warn("Error sorting file (copying header lines) with GNU head"); 1115 return false; 1116 } 1117 1118 // do the sorting ignoring the header rows 1119 command = 1120 "tail -q -n +" 1121 + (ignoreHeaderLines + 1) 1122 + " " 1123 + fileList 1124 + " | " 1125 + "sort " 1126 + sortArgs 1127 + " >> " 1128 + sorted.getAbsolutePath(); 1129 } else { 1130 // do sorting directly, we don't have header rows 1131 command = "sort " + sortArgs + " -o " + sorted.getAbsolutePath() + ' ' + fileList; 1132 } 1133 1134 LOG.debug("Issue external command: {}", command); 1135 cmds.removeLast(); 1136 cmds.add(command); 1137 Process process = pb.start(); 1138 // get the stdout and stderr from the command that was run 1139 InputStream err = process.getErrorStream(); 1140 int exitValue = process.waitFor(); 1141 if (exitValue == 0) { 1142 LOG.debug("Successfully sorted file with GNU sort"); 1143 success = true; 1144 } else { 1145 LOG.warn("Error sorting file with GNU sort"); 1146 InputStreamUtils isu = new InputStreamUtils(); 1147 System.err.append(isu.readEntireStream(err)); 1148 } 1149 } catch (Exception e) { 1150 LOG.warn("Caught Exception using GNU sort", e); 1151 } 1152 return success; 1153 } 1154 1155 /** 1156 * Sorts the lines and writes to file using the 1157 * 1158 * @param input File to base the name on 1159 * @param lineComparator To compare the lines for sorting 1160 * @param fileCount Used for the file name 1161 * @param linesToSort To actually sort 1162 * @return The written file 1163 */ 1164 private File sortAndWrite( 1165 File input, 1166 String encoding, 1167 Comparator<String> lineComparator, 1168 int fileCount, 1169 List<String> linesToSort) 1170 throws IOException { 1171 long start = System.currentTimeMillis(); 1172 linesToSort.sort(lineComparator); 1173 // When implementing a comparator, make it SUPER quick!!! 1174 LOG.debug( 1175 "Collections.sort took msec[" 1176 + (System.currentTimeMillis() - start) 1177 + "] to sort records[" 1178 + linesToSort.size() 1179 + ']'); 1180 File sortFile = getChunkFile(input, fileCount); 1181 try (Writer fw = new OutputStreamWriter(new FileOutputStream(sortFile), encoding)) { 1182 for (String s : linesToSort) { 1183 fw.write(s); 1184 fw.write("\n"); 1185 } 1186 } 1187 return sortFile; 1188 } 1189 1190 /** 1191 * Creates an empty file or updates the last updated timestamp on the same as the unix command of 1192 * the same name. 1193 * 1194 * <p>From Guava. 1195 * 1196 * @param file the file to create or update 1197 * @throws IOException if an I/O error occurs 1198 */ 1199 public static void touch(File file) throws IOException { 1200 Objects.requireNonNull(file); 1201 if (!file.createNewFile() && !file.setLastModified(System.currentTimeMillis())) { 1202 throw new IOException("Unable to update modification time of " + file); 1203 } 1204 } 1205 1206 /** 1207 * Returns the <a href="http://en.wikipedia.org/wiki/Filename_extension">file extension</a> for 1208 * the given file name, or the empty string if the file has no extension. The result does not 1209 * include the '{@code .}'. 1210 * 1211 * <p><b>Note:</b> This method simply returns everything after the last '{@code .}' in the file's 1212 * name as determined by {@link File#getName}. It does not account for any filesystem-specific 1213 * behavior that the {@link File} API does not already account for. For example, on NTFS it will 1214 * report {@code "txt"} as the extension for the filename {@code "foo.exe:.txt"} even though NTFS 1215 * will drop the {@code ":.txt"} part of the name when the file is actually created on the 1216 * filesystem due to NTFS's <a href="https://goo.gl/vTpJi4">Alternate Data Streams</a>. 1217 * 1218 * <p>From Guava. 1219 */ 1220 public static String getFileExtension(String fullName) { 1221 Objects.requireNonNull(fullName); 1222 String fileName = new File(fullName).getName(); 1223 int dotIndex = fileName.lastIndexOf('.'); 1224 return (dotIndex == -1) ? "" : fileName.substring(dotIndex + 1); 1225 } 1226 1227 /** 1228 * Creates any necessary but nonexistent parent directories of the specified file. Note that if 1229 * this operation fails it may have succeeded in creating some (but not all) of the necessary 1230 * parent directories. 1231 * 1232 * <p>From Guava. 1233 * 1234 * @throws IOException if an I/O error occurs, or if any necessary but nonexistent parent 1235 * directories of the specified file could not be created. 1236 */ 1237 public static void createParentDirs(File file) throws IOException { 1238 Objects.requireNonNull(file); 1239 File parent = file.getCanonicalFile().getParentFile(); 1240 if (parent == null) { 1241 /* 1242 * The given directory is a filesystem root. All zero of its ancestors exist. This doesn't 1243 * mean that the root itself exists -- consider x:\ on a Windows machine without such a drive 1244 * -- or even that the caller can create it, but this method makes no such guarantees even for 1245 * non-root files. 1246 */ 1247 return; 1248 } 1249 //noinspection ResultOfMethodCallIgnored 1250 parent.mkdirs(); 1251 if (!parent.isDirectory()) { 1252 throw new IOException("Unable to create parent directories of " + file); 1253 } 1254 } 1255}