001/* 002 * Copyright 2021 Global Biodiversity Information Facility (GBIF) 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016package org.gbif.utils.file; 017 018import org.gbif.utils.collection.CompactHashSet; 019import org.gbif.utils.text.LineComparator; 020 021import java.io.BufferedInputStream; 022import java.io.BufferedReader; 023import java.io.BufferedWriter; 024import java.io.File; 025import java.io.FileInputStream; 026import java.io.FileNotFoundException; 027import java.io.FileOutputStream; 028import java.io.FileWriter; 029import java.io.IOException; 030import java.io.InputStream; 031import java.io.InputStreamReader; 032import java.io.OutputStream; 033import java.io.OutputStreamWriter; 034import java.io.UnsupportedEncodingException; 035import java.io.Writer; 036import java.net.URISyntaxException; 037import java.net.URL; 038import java.nio.ByteBuffer; 039import java.nio.charset.Charset; 040import java.nio.charset.StandardCharsets; 041import java.util.Comparator; 042import java.util.HashMap; 043import java.util.LinkedList; 044import java.util.List; 045import java.util.Map; 046import java.util.Objects; 047import java.util.Set; 048import java.util.regex.Pattern; 049 050import org.apache.commons.io.FilenameUtils; 051import org.apache.commons.io.LineIterator; 052import org.apache.commons.lang3.StringUtils; 053import org.slf4j.Logger; 054import org.slf4j.LoggerFactory; 055 056/** 057 * Collection of file utils. 058 * <br> 059 * This class has only been tested for use with a UTF-8 system encoding. 060 */ 061public final class FileUtils { 062 063 private static final Logger LOG = LoggerFactory.getLogger(FileUtils.class); 064 065 public static final String UTF8 = StandardCharsets.UTF_8.name(); 066 public static final Pattern TAB_DELIMITED = Pattern.compile("\t"); 067 private static int linesPerMemorySort = 100000; 068 private static Boolean gnuSortAvailable = null; 069 private static final Object sortLock = new Object(); 070 071 static { 072 /* Warn when the software is not run in a Unicode environment. This library has not been 073 * tested to run in a non-Unicode environment, and may cause data corruption. 074 */ 075 if (Charset.defaultCharset().equals(StandardCharsets.US_ASCII)) { 076 System.err.println("The default character set is US ASCII. It is strongly recommended to " + 077 "run this software in a Unicode environment."); 078 } 079 } 080 081 public static String classpath2Filepath(String path) { 082 return new File(ClassLoader.getSystemResource(path).getFile()).getAbsolutePath(); 083 } 084 085 public static InputStream classpathStream(String path) throws IOException { 086 InputStream in = null; 087 // relative path. Use classpath instead 088 URL url = FileUtils.class.getClassLoader().getResource(path); 089 if (url != null) { 090 in = url.openStream(); 091 } 092 return in; 093 } 094 095 public static Set<String> columnsToSet(InputStream source, int... column) throws IOException { 096 return columnsToSet(source, new CompactHashSet<String>(), column); 097 } 098 099 /** 100 * Reads a file and returns a unique set of multiple columns from lines which are no comments (starting with #) and 101 * trims whitespace. 102 * 103 * @param source the UTF-8 encoded text file with tab delimited columns 104 * @param resultSet the set implementation to be used. Will not be cleared before reading! 105 * @param column variable length argument of column indices to process 106 * @return set of column rows 107 */ 108 public static Set<String> columnsToSet(InputStream source, Set<String> resultSet, int... column) throws IOException { 109 LineIterator lines = getLineIterator(source); 110 int maxCols = 0; 111 for (int c : column) { 112 if (c > maxCols) { 113 maxCols = c; 114 } 115 } 116 while (lines.hasNext()) { 117 String line = lines.nextLine().trim(); 118 // ignore comments 119 if (!ignore(line)) { 120 String[] parts = TAB_DELIMITED.split(line); 121 if (maxCols <= parts.length) { 122 for (int c : column) { 123 String cell = parts[c].trim(); 124 resultSet.add(cell); 125 } 126 } 127 } 128 } 129 return resultSet; 130 } 131 132 public static void copyStreams(InputStream in, OutputStream out) throws IOException { 133 // write the file to the file specified 134 int bytesRead; 135 byte[] buffer = new byte[8192]; 136 137 while ((bytesRead = in.read(buffer, 0, 8192)) != -1) { 138 out.write(buffer, 0, bytesRead); 139 } 140 141 out.close(); 142 in.close(); 143 } 144 145 public static void copyStreamToFile(InputStream in, File out) throws IOException { 146 copyStreams(in, new FileOutputStream(out)); 147 } 148 149 public static File createTempDir() throws IOException { 150 return createTempDir("gbif-futil", ".tmp"); 151 } 152 153 /** 154 * @param prefix The prefix string to be used in generating the file's name; must be at least three characters long 155 * @param suffix The suffix string to be used in generating the file's name; may be null, in which case the suffix 156 * ".tmp" will be used 157 */ 158 public static File createTempDir(String prefix, String suffix) throws IOException { 159 File dir = File.createTempFile(prefix, suffix); 160 if (!dir.delete()) { 161 throw new IOException("Could not delete temp file: " + dir.getAbsolutePath()); 162 } 163 if (!dir.mkdir()) { 164 throw new IOException("Could not create temp directory: " + dir.getAbsolutePath()); 165 } 166 return dir; 167 } 168 169 /** 170 * Delete directory recursively, including all its files, sub-folders, and sub-folder's files. 171 * 172 * @param directory directory to delete recursively 173 */ 174 public static void deleteDirectoryRecursively(File directory) { 175 File[] list = directory.listFiles(); 176 for (File file : list) { 177 if (file.isDirectory()) { 178 deleteDirectoryRecursively(file); 179 file.delete(); 180 } else { 181 file.delete(); 182 } 183 } 184 directory.delete(); 185 } 186 187 /** 188 * Escapes a filename so it is a valid filename on all systems, replacing /. .. \t\r\n. 189 * 190 * @param filename to be escaped 191 */ 192 public static String escapeFilename(String filename) { 193 return filename.replaceAll("[\\s./&]", "_"); 194 } 195 196 public static File getClasspathFile(String path) { 197 return new File(ClassLoader.getSystemResource(path).getFile()); 198 } 199 200 public static InputStream getInputStream(File source) throws FileNotFoundException { 201 return new FileInputStream(source); 202 } 203 204 public static BufferedReader getInputStreamReader(InputStream input) throws FileNotFoundException { 205 return getInputStreamReader(input, UTF8); 206 } 207 208 public static BufferedReader getInputStreamReader(InputStream input, String encoding) throws FileNotFoundException { 209 BufferedReader reader = null; 210 try { 211 reader = new BufferedReader(new InputStreamReader(input, encoding)); 212 } catch (UnsupportedEncodingException e) { 213 LOG.warn("Caught Exception", e); 214 } 215 return reader; 216 } 217 218 /** 219 * @param source the source input stream encoded in UTF-8 220 */ 221 public static LineIterator getLineIterator(InputStream source) { 222 return getLineIterator(source, UTF8); 223 } 224 225 /** 226 * @param source the source input stream 227 * @param encoding the encoding used by the input stream 228 */ 229 public static LineIterator getLineIterator(InputStream source, String encoding) { 230 try { 231 return new LineIterator(new BufferedReader(new InputStreamReader(source, encoding))); 232 } catch (UnsupportedEncodingException e) { 233 throw new IllegalArgumentException("Unsupported encoding" + encoding, e); 234 } 235 } 236 237 public static BufferedReader getUtf8Reader(File file) throws FileNotFoundException { 238 BufferedReader reader = null; 239 try { 240 reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), UTF8)); 241 } catch (UnsupportedEncodingException e) { 242 LOG.warn("Caught Exception", e); 243 } 244 return reader; 245 } 246 247 /** 248 * Converts the byte size into human-readable format. 249 * Support both SI and byte format. 250 */ 251 public static String humanReadableByteCount(long bytes, boolean si) { 252 int unit = si ? 1000 : 1024; 253 if (bytes < unit) { 254 return bytes + " B"; 255 } 256 int exp = (int) (Math.log(bytes) / Math.log(unit)); 257 String pre = (si ? "kMGTPE" : "KMGTPE").charAt(exp - 1) + (si ? "" : "i"); 258 return String.format("%.1f %sB", bytes / Math.pow(unit, exp), pre); 259 } 260 261 public static boolean isCompressedFile(File source) { 262 String suffix = source.getName().substring(source.getName().lastIndexOf('.') + 1); 263 return suffix != null && suffix.length() > 0 && ("zip".equalsIgnoreCase(suffix) || "tgz".equalsIgnoreCase(suffix) 264 || "gz".equalsIgnoreCase(suffix)); 265 } 266 267 /** 268 * Reads a complete file into a byte buffer. 269 */ 270 public static ByteBuffer readByteBuffer(File file) throws IOException { 271 byte[] content = org.apache.commons.io.FileUtils.readFileToByteArray(file); 272 return ByteBuffer.wrap(content); 273 } 274 275 /** 276 * Reads the first bytes of a file into a byte buffer. 277 * 278 * @param bufferSize the number of bytes to read from the file 279 */ 280 public static ByteBuffer readByteBuffer(File file, int bufferSize) throws IOException { 281 ByteBuffer bbuf = ByteBuffer.allocate(bufferSize); 282 BufferedInputStream f = new BufferedInputStream(new FileInputStream(file), bufferSize); 283 284 int b; 285 while ((b = f.read()) != -1) { 286 if (!bbuf.hasRemaining()) { 287 break; 288 } 289 bbuf.put((byte) b); 290 } 291 f.close(); 292 293 return bbuf; 294 } 295 296 /** 297 * @param linesPerMemorySort are the number of lines that should be sorted in memory, determining the number of file 298 * segments to be sorted when doing a Java file sort. Defaults to 100000, if you have 299 * memory available a higher value increases performance. 300 */ 301 public static void setLinesPerMemorySort(int linesPerMemorySort) { 302 FileUtils.linesPerMemorySort = linesPerMemorySort; 303 } 304 305 public static Writer startNewUtf8File(File file) throws IOException { 306 touch(file); 307 return new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file, false), UTF8)); 308 } 309 310 public static Writer startNewUtf8XmlFile(File file) throws IOException { 311 Writer writer = startNewUtf8File(file); 312 writer.write("<?xml version='1.0' encoding='utf-8'?>\n"); 313 return writer; 314 } 315 316 /** 317 * Takes a utf8 encoded input stream and reads in every line/row into a list. 318 * 319 * @return list of rows 320 */ 321 public static LinkedList<String> streamToList(InputStream source) throws IOException { 322 return streamToList(source, FileUtils.UTF8); 323 } 324 325 /** 326 * Reads a file and returns a list of all lines which are no comments (starting with #) and trims whitespace. 327 * 328 * @param source the UTF-8 encoded text file to read 329 * @param resultList the list implementation to be used. Will not be cleared before reading! 330 * @return list of lines 331 */ 332 public static List<String> streamToList(InputStream source, List<String> resultList) throws IOException { 333 LineIterator lines = getLineIterator(source); 334 while (lines.hasNext()) { 335 String line = lines.nextLine().trim(); 336 // ignore comments 337 if (!ignore(line)) { 338 resultList.add(line); 339 } 340 } 341 return resultList; 342 } 343 344 public static LinkedList<String> streamToList(InputStream source, String encoding) throws IOException { 345 LinkedList<String> resultList = new LinkedList<>(); 346 try { 347 LineIterator lines = new LineIterator(new BufferedReader(new InputStreamReader(source, encoding))); 348 while (lines.hasNext()) { 349 String line = lines.nextLine(); 350 resultList.add(line); 351 } 352 } catch (UnsupportedEncodingException e) { 353 throw new IllegalArgumentException("Unsupported encoding " + encoding, e); 354 } 355 return resultList; 356 } 357 358 /** 359 * Reads a utf8 encoded inut stream, splits 360 */ 361 public static Map<String, String> streamToMap(InputStream source) throws IOException { 362 return streamToMap(source, new HashMap<>()); 363 } 364 365 public static Map<String, String> streamToMap(InputStream source, int key, int value, boolean trimToNull) 366 throws IOException { 367 return streamToMap(source, new HashMap<>(), key, value, trimToNull); 368 } 369 370 /** 371 * Read a hashmap from a tab delimited utf8 input stream using the row number as an integer value and the entire row 372 * as the value. Ignores commented rows starting with #. 373 * 374 * @param source tab delimited text file to read 375 */ 376 public static Map<String, String> streamToMap(InputStream source, Map<String, String> result) throws IOException { 377 LineIterator lines = getLineIterator(source); 378 Integer row = 0; 379 while (lines.hasNext()) { 380 row++; 381 String line = lines.nextLine().trim(); 382 // ignore comments 383 if (!ignore(line)) { 384 result.put(line, row.toString()); 385 } 386 } 387 return result; 388 } 389 390 /** 391 * Read a hashmap from a tab delimited utf8 file, ignoring commented rows starting with #. 392 * 393 * @param source tab delimited input stream to read 394 * @param key column number to use as key 395 * @param value column number to use as value 396 * @param trimToNull if true trims map entries to null 397 */ 398 public static Map<String, String> streamToMap(InputStream source, Map<String, String> result, int key, int value, 399 boolean trimToNull) throws IOException { 400 LineIterator lines = getLineIterator(source); 401 int maxCols = key > value ? key : value + 1; 402 while (lines.hasNext()) { 403 String line = lines.nextLine(); 404 // ignore comments 405 if (!ignore(line)) { 406 String[] parts = TAB_DELIMITED.split(line); 407 if (maxCols <= parts.length) { 408 if (trimToNull) { 409 result.put(StringUtils.trimToNull(parts[key]), StringUtils.trimToNull(parts[value])); 410 } else { 411 result.put(parts[key], parts[value]); 412 } 413 } 414 } 415 } 416 return result; 417 } 418 419 public static Set<String> streamToSet(InputStream source) throws IOException { 420 return streamToSet(source, new CompactHashSet<>()); 421 } 422 423 /** 424 * Reads a file and returns a unique set of all lines which are no comments (starting with #) and trims whitespace. 425 * 426 * @param source the UTF-8 encoded text file to read 427 * @param resultSet the set implementation to be used. Will not be cleared before reading! 428 * @return set of unique lines 429 */ 430 public static Set<String> streamToSet(InputStream source, Set<String> resultSet) throws IOException { 431 LineIterator lines = getLineIterator(source); 432 while (lines.hasNext()) { 433 String line = lines.nextLine().trim(); 434 // ignore comments 435 if (!ignore(line)) { 436 resultSet.add(line); 437 } 438 } 439 return resultSet; 440 } 441 442 public static String toFilePath(URL url) { 443 String protocol = 444 url.getProtocol() == null || "http".equalsIgnoreCase(url.getProtocol()) ? "" : "/__" + url.getProtocol() + "__"; 445 String domain = url.getAuthority() == null ? "__domainless" : url.getAuthority(); 446 return domain + protocol + url.getFile(); 447 } 448 449 public static File url2file(URL url) { 450 File f = null; 451 try { 452 f = new File(url.toURI()); 453 } catch (URISyntaxException e) { 454 f = new File(url.getPath()); 455 } 456 return f; 457 } 458 459 /** 460 * For the given list, finds the index of the lowest value using the given comparator. 461 * 462 * @param values To compare 463 * @param comparator To use 464 * @return The index of the lowest value, or -1 if they are all null 465 */ 466 static int lowestValueIndex(List<String> values, Comparator<String> comparator) { 467 int index = 0; 468 String lowestValue = null; 469 for (int i = 0; i < values.size(); i++) { 470 String value = values.get(i); 471 if (lowestValue == null) { 472 lowestValue = value; 473 index = i; 474 } else if (comparator.compare(lowestValue, value) > 0) { 475 lowestValue = value; 476 index = i; 477 } 478 } 479 480 return lowestValue == null ? -1 : index; 481 } 482 483 /** 484 * For the given file's path, returns a proposed new filename (including path) with the extension 485 * index and suffix. So a file of "/tmp/input.txt" -> "/tmp/input_part_10.txt". 486 * 487 * @param original File 488 * @param index E.g. 10 489 * @return The proposed name 490 */ 491 private static File getChunkFile(File original, int index) { 492 return new File(original.getParentFile(), 493 FilenameUtils.getBaseName(original.getName()) + '_' + index + getFileExtension(original.getName())); 494 } 495 496 private static boolean ignore(String line) { 497 return StringUtils.trimToNull(line) == null || line.startsWith("#"); 498 } 499 500 public int getLinesPerMemorySort() { 501 return linesPerMemorySort; 502 } 503 504 /** 505 * Merges a list of intermediary sort chunk files into a single sorted file. On completion, the intermediary sort 506 * chunk files are deleted. 507 * 508 * @param sortFiles sort chunk files to merge 509 * @param sortedFileWriter writer to merge to. Can already be open and contain data 510 * @param lineComparator To use when determining the order (reuse the one that was used to sort the individual 511 * files) 512 */ 513 public void mergedSortedFiles(List<File> sortFiles, OutputStreamWriter sortedFileWriter, Comparator<String> lineComparator) 514 throws IOException { 515 List<BufferedReader> partReaders = new LinkedList<>(); 516 try { 517 List<String> partReaderLine = new LinkedList<>(); 518 for (File f : sortFiles) { 519 // Use UTF-8 sort order. 520 partReaders.add(new BufferedReader( 521 new InputStreamReader(new FileInputStream(f), StandardCharsets.UTF_8))); 522 } 523 boolean moreData = false; 524 // load first line in 525 for (BufferedReader partReader : partReaders) { 526 String partLine = partReader.readLine(); 527 if (partLine != null) { 528 moreData = true; 529 } 530 // we still add the "null" to keep the partReaders and partLineReader indexes in sync - ALWAYS 531 partReaderLine.add(partLine); 532 } 533 // keep going until all readers are exhausted 534 while (moreData) { 535 int index = lowestValueIndex(partReaderLine, lineComparator); 536 if (index >= 0) { 537 sortedFileWriter.write(partReaderLine.get(index)); 538 sortedFileWriter.write("\n"); 539 BufferedReader r = partReaders.get(index); 540 String partLine = r.readLine(); 541 // TODO: Synchronization on local variable? 542 synchronized (partReaderLine) { 543 partReaderLine.add(index, partLine); 544 partReaderLine.remove(index + 1); 545 } 546 } else { 547 moreData = false; 548 } 549 } 550 } finally { 551 for (BufferedReader b : partReaders) { 552 try { 553 b.close(); 554 } catch (RuntimeException e) { 555 } 556 } 557 // I assume it periodically flushes anyway, so only need to do once at end... 558 sortedFileWriter.flush(); 559 sortedFileWriter.close(); 560 // delete (intermediary) sort chunk files, only the sorted file remains 561 for (File f : sortFiles) { 562 f.delete(); 563 } 564 } 565 } 566 567 /** 568 * Sorts the input file into the output file using the supplied delimited line parameters. 569 * 570 * This method is not reliable when the sort field may contain Unicode codepoints outside the Basic Multilingual Plane, 571 * i.e. above \uFFFF. In that case, the sort order differs from Java's String sort order. This should not be a problem 572 * for most usage; the Supplementary Multilingual Planes contain ancient scripts, emojis, arrows and so on. 573 * 574 * @param input To sort 575 * @param sorted The sorted version of the input excluding ignored header lines (see ignoreHeaderLines) 576 * @param column the column that keeps the values to sort on 577 * @param columnDelimiter the delimiter that separates columns in a row 578 * @param enclosedBy optional column enclosing character, e.g. a double quote for CSVs 579 * @param newlineDelimiter the chars used for new lines, usually \n, \n\r or \r 580 * @param ignoreHeaderLines number of beginning lines to ignore, e.g. headers 581 */ 582 public void sort(File input, File sorted, String encoding, int column, String columnDelimiter, Character enclosedBy, 583 String newlineDelimiter, int ignoreHeaderLines) throws IOException { 584 Comparator<String> lineComparator; 585 if (enclosedBy == null) { 586 lineComparator = new LineComparator(column, columnDelimiter); 587 } else { 588 lineComparator = new LineComparator(column, columnDelimiter, enclosedBy); 589 } 590 sort(input, sorted, encoding, column, columnDelimiter, enclosedBy, newlineDelimiter, ignoreHeaderLines, 591 lineComparator, false); 592 } 593 594 /** 595 * Sorts the input file into the output file using the supplied delimited line parameters. 596 * 597 * This method is not reliable when the sort field may contain Unicode codepoints outside the Basic Multilingual Plane, 598 * i.e. above \uFFFF. In that case, the sort order differs from Java's String sort order. This should not be a problem 599 * for most usage; the Supplementary Multilingual Planes contain ancient scripts, emojis, arrows and so on. 600 * 601 * TODO: This method is globally synchronized, in case multiple sorts are attempted to the same file simultaneously. 602 * This could be improved to allow synchronizing against the destination file, rather than for all sorts. 603 * 604 * @param input To sort 605 * @param sorted The sorted version of the input excluding ignored header lines (see ignoreHeaderLines) 606 * @param column the column that keeps the values to sort on 607 * @param columnDelimiter the delimiter that separates columns in a row 608 * @param enclosedBy optional column enclosing character, e.g. a double quote for CSVs 609 * @param newlineDelimiter the chars used for new lines, usually \n, \r\n or \r 610 * @param ignoreHeaderLines number of beginning lines to ignore, e.g. headers 611 * @param lineComparator used to sort the output 612 * @param ignoreCase ignore case order, this parameter couldn't have any effect if the LineComparator is used 613 */ 614 public void sort(File input, File sorted, String encoding, int column, String columnDelimiter, Character enclosedBy, 615 String newlineDelimiter, int ignoreHeaderLines, Comparator<String> lineComparator, boolean ignoreCase) 616 throws IOException { 617 LOG.debug("Sorting " + input.getAbsolutePath() + " as new file " + sorted.getAbsolutePath()); 618 if (encoding == null) { 619 LOG.warn("No encoding specified, assume UTF-8"); 620 encoding = FileUtils.UTF8; 621 } 622 synchronized (sortLock) { 623 if (sorted.exists()) { 624 // Delete a file, which will allow processes with it open to continue reading it. 625 // The GNU sort truncates and appends, which would mean a partial read otherwise. 626 LOG.warn("Deleting existed sorted file {}", sorted.getAbsoluteFile()); 627 sorted.delete(); 628 } 629 // if the id is in the first column, first try sorting via shell as its the fastest we can get 630 if (!sortInGnu(input, sorted, encoding, ignoreHeaderLines, column, columnDelimiter, newlineDelimiter, ignoreCase)) { 631 LOG.debug("No GNU sort available, using native Java sorting"); 632 sortInJava(input, sorted, encoding, lineComparator, ignoreHeaderLines); 633 } 634 } 635 } 636 637 /** 638 * Sorts the input file into the output file using the supplied lineComparator. 639 * 640 * @param input To sort 641 * @param sorted The sorted version of the input excluding ignored header lines (see ignoreHeaderLines) 642 * @param lineComparator To use during comparison 643 * @param ignoreHeaderLines number of beginning lines to ignore, e.g. headers 644 */ 645 public void sortInJava(File input, File sorted, String encoding, Comparator<String> lineComparator, 646 int ignoreHeaderLines) throws IOException { 647 LOG.debug("Sorting File[" + input.getAbsolutePath() + ']'); 648 long start = System.currentTimeMillis(); 649 List<File> sortFiles = new LinkedList<>(); 650 BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(input), encoding)); 651 List<String> headerLines = new LinkedList<>(); 652 try { 653 String line = br.readLine(); 654 int fileCount = 0; 655 656 List<String> linesToSort = new LinkedList<>(); 657 while (line != null) { 658 if (ignoreHeaderLines > 0) { 659 headerLines.add(line); 660 ignoreHeaderLines--; 661 } else { 662 linesToSort.add(line); 663 664 // if buffer is full, then sort and write to file 665 if (linesToSort.size() == linesPerMemorySort) { 666 sortFiles.add(sortAndWrite(input, encoding, lineComparator, fileCount, linesToSort)); 667 linesToSort = new LinkedList<>(); 668 fileCount++; 669 } 670 } 671 line = br.readLine(); 672 } 673 // catch the last lot 674 if (!linesToSort.isEmpty()) { 675 sortFiles.add(sortAndWrite(input, encoding, lineComparator, fileCount, linesToSort)); 676 } 677 } finally { 678 br.close(); 679 } 680 LOG.debug( 681 sortFiles.size() + " sorted file chunks created in " + (System.currentTimeMillis() - start) / 1000 + " secs"); 682 683 // now merge the sorted files into one single sorted file 684 FileWriter sortedFileWriter = new FileWriter(sorted); 685 // first write the old header lines if existing 686 for (String h : headerLines) { 687 sortedFileWriter.write(h); 688 sortedFileWriter.write("\n"); 689 } 690 mergedSortedFiles(sortFiles, sortedFileWriter, lineComparator); 691 692 LOG.debug( 693 "File " + input.getAbsolutePath() + " sorted successfully using " + sortFiles.size() + " parts to do sorting in " 694 + (System.currentTimeMillis() - start) / 1000 + " secs"); 695 } 696 697 698 /** 699 * Splits the supplied file into files of set line size and with a suffix. 700 * 701 * @param input To split up 702 * @param linesPerOutput Lines per split file 703 * @param extension The file extension to use - e.g. ".txt" 704 * @return The split files 705 */ 706 public List<File> split(File input, int linesPerOutput, String extension) throws IOException { 707 LOG.debug("Splitting File[" + input.getAbsolutePath() + ']'); 708 long timer = System.currentTimeMillis(); 709 List<File> splitFiles = new LinkedList<>(); 710 // Use ISO-8859-1 as a binary-safe encoding. 711 BufferedReader br = new BufferedReader( 712 new InputStreamReader(new FileInputStream(input), StandardCharsets.ISO_8859_1)); 713 String line = br.readLine(); 714 int fileCount = 0; 715 File splitFile = getChunkFile(input, fileCount); 716 fileCount++; 717 splitFiles.add(splitFile); 718 OutputStreamWriter fw = 719 new OutputStreamWriter(new FileOutputStream(splitFile), StandardCharsets.ISO_8859_1); 720 try { 721 int lineCount = 0; 722 while (line != null) { 723 if (lineCount == linesPerOutput) { 724 fw.flush(); 725 fw.close(); 726 splitFile = getChunkFile(input, fileCount); 727 splitFiles.add(splitFile); 728 // is ok to reuse, as last one is closed, and this will always get closed - see finally below 729 fw = new OutputStreamWriter(new FileOutputStream(splitFile), StandardCharsets.ISO_8859_1); 730 fileCount++; 731 lineCount = 0; 732 } 733 fw.write(line); 734 fw.write("\n"); 735 line = br.readLine(); 736 lineCount++; 737 } 738 fw.flush(); 739 } finally { 740 fw.close(); 741 } 742 LOG.debug("File[" + input.getAbsolutePath() + "] split successfully into[" + splitFiles.size() + "] parts in secs[" 743 + (1 + System.currentTimeMillis() - timer) / 1000 + "]"); 744 return splitFiles; 745 } 746 747 /** 748 * Test whether we have a new enough version of GNU Sort that supports (primarily) the -k option with a start and end 749 * column. 750 * 751 * Mac OS only includes an old version of GNU sort, and will fail this test. 752 */ 753 private boolean gnuSortAvailable() { 754 if (gnuSortAvailable != null) { 755 return gnuSortAvailable; 756 } 757 758 try { 759 String command = "sort -k1,1 -t',' --ignore-case /dev/null"; 760 LOG.debug("Testing capability of GNU sort with command: {}", command); 761 762 Process process = new ProcessBuilder("/bin/sh", "-c", command).start(); 763 int exitValue = process.waitFor(); 764 765 if (exitValue == 0) { 766 LOG.debug("GNU sort is capable"); 767 gnuSortAvailable = true; 768 } else { 769 LOG.warn("GNU sort does not exist or is too old, and will not be used. Sorting large files will be slow.", 770 new InputStreamUtils().readEntireStream(process.getErrorStream()).replace('\n', ' ')); 771 gnuSortAvailable = false; 772 } 773 } catch (Exception e) { 774 LOG.warn("GNU sort does not exist or is too old, and will not be used. Sorting large files will be slow.", e); 775 gnuSortAvailable = false; 776 } 777 778 return gnuSortAvailable; 779 } 780 781 /** 782 * sort a text file via an external GNU sort command: 783 * sorting tabs at 3rd column, numerical reverse order 784 * sort -t$'\t' -k3 -o sorted.txt col2007.txt 785 * <p/> 786 * The GNU sort based sorting is extremely efficient and much, much faster than the current sortInJava method. It is 787 * locale aware though and we only want the native C sorting locale. See 788 * http://www.gnu.org/software/coreutils/faq/coreutils-faq.html#Sort-does-not-sort-in-normal-order_0021 789 * <p/> 790 * Example C sort order: 791 * <p/> 792 * <pre> 793 * 1 oOdontoceti 794 * 10 gGlobicephala melaena melaena Traill 795 * 100 gGlobicephala melaena melaena Traill 796 * 101 gGlobicephala melaena melaena Traill 797 * 11 pPontoporia Gray 798 * 12 pPontoporia blainvillei Gervais and d'Orbigny 799 * 120 iInia d'Orbigny 800 * 121 iInia geoffrensis Blainville 801 * 2 sSusuidae 802 * 20 cCetacea 803 * Amphiptera 804 * Amphiptera pacifica Giglioli 805 * Anarnak Lacépède 806 * Balaena mangidach Chamisso 807 * amphiptera 808 * amphiptera pacifica Giglioli 809 * anarnak Lacépède 810 * balaena mangidach Chamisso 811 * </pre> 812 */ 813 protected boolean sortInGnu(File input, File sorted, String encoding, int ignoreHeaderLines, int column, 814 String columnDelimiter, String lineDelimiter, boolean ignoreCase) throws IOException { 815 String command; 816 // GNU sort is checked for use when: 817 // • line delimiter is \n 818 // • column delimiter is set and we're not using the first column 819 // • sort version is sufficient to include start and end column (-k 1,1). 820 // Use the --debug option to sort if working on this code. 821 if (lineDelimiter == null || !lineDelimiter.contains("\n") || (columnDelimiter != null && column > 0) || 822 !gnuSortAvailable()) { 823 LOG.debug("Cannot use GNU sort on this file"); 824 return false; 825 } 826 827 // keep header rows 828 boolean success = false; 829 try { 830 LinkedList<String> cmds = new LinkedList<>(); 831 cmds.add("/bin/sh"); 832 cmds.add("-c"); 833 cmds.add(""); 834 ProcessBuilder pb = new ProcessBuilder(cmds); 835 Map<String, String> env = pb.environment(); 836 837 //clear the environment, but keep specified temp working directory 838 env.keySet().removeIf(key -> !(key.equals("TMPDIR"))); 839 if (System.getProperty("java.io.tmpdir") != null) { 840 env.put("TMPDIR", System.getProperty("java.io.tmpdir")); 841 } 842 // make sure we use the C locale for sorting 843 env.put("LC_ALL", "C"); 844 845 String sortArgs = String.format(" %s -k%d,%d -t'%s'", 846 ignoreCase ? "--ignore-case" : "", column+1, column+1, columnDelimiter); 847 848 if (ignoreHeaderLines > 0) { 849 // copy header lines 850 command = "head -n " + ignoreHeaderLines + ' ' + input.getAbsolutePath() + " > " + sorted.getAbsolutePath(); 851 LOG.debug("Issue external command: {}", command); 852 cmds.removeLast(); 853 cmds.add(command); 854 Process process = pb.start(); 855 int exitValue = process.waitFor(); 856 if (exitValue != 0) { 857 LOG.warn("Error sorting file (copying header lines) with GNU head"); 858 return false; 859 } 860 861 // do the sorting ignoring the header rows 862 command = "sed " + ignoreHeaderLines + "d " + input.getAbsolutePath() + " | " 863 + "sort " + sortArgs 864 + " >> " + sorted.getAbsolutePath(); 865 } else { 866 // do sorting directly, we don't have header rows 867 command = "sort " + sortArgs + " -o " + sorted.getAbsolutePath() + ' ' + input.getAbsolutePath(); 868 } 869 870 LOG.debug("Issue external command: {}", command); 871 cmds.removeLast(); 872 cmds.add(command); 873 Process process = pb.start(); 874 // get the stdout and stderr from the command that was run 875 InputStream err = process.getErrorStream(); 876 int exitValue = process.waitFor(); 877 if (exitValue == 0) { 878 LOG.debug("Successfully sorted file with GNU sort"); 879 success = true; 880 } else { 881 LOG.warn("Error sorting file with GNU sort"); 882 InputStreamUtils isu = new InputStreamUtils(); 883 System.err.append(isu.readEntireStream(err)); 884 } 885 } catch (Exception e) { 886 LOG.warn("Caught Exception using GNU sort", e); 887 } 888 return success; 889 } 890 891 /** 892 * Sorts the lines and writes to file using the 893 * 894 * @param input File to base the name on 895 * @param lineComparator To compare the lines for sorting 896 * @param fileCount Used for the file name 897 * @param linesToSort To actually sort 898 * @return The written file 899 */ 900 private File sortAndWrite(File input, String encoding, Comparator<String> lineComparator, int fileCount, 901 List<String> linesToSort) throws IOException { 902 long start = System.currentTimeMillis(); 903 linesToSort.sort(lineComparator); 904 // When implementing a comparator, make it SUPER quick!!! 905 LOG.debug( 906 "Collections.sort took msec[" + (System.currentTimeMillis() - start) + "] to sort records[" + linesToSort.size() 907 + ']'); 908 File sortFile = getChunkFile(input, fileCount); 909 try (Writer fw = new OutputStreamWriter(new FileOutputStream(sortFile), encoding)) { 910 for (String s : linesToSort) { 911 fw.write(s); 912 fw.write("\n"); 913 } 914 } 915 return sortFile; 916 } 917 918 /** 919 * Creates an empty file or updates the last updated timestamp on the same as the unix command of 920 * the same name. 921 * 922 * <p>From Guava. 923 * 924 * @param file the file to create or update 925 * @throws IOException if an I/O error occurs 926 */ 927 public static void touch(File file) throws IOException { 928 Objects.requireNonNull(file); 929 if (!file.createNewFile() && !file.setLastModified(System.currentTimeMillis())) { 930 throw new IOException("Unable to update modification time of " + file); 931 } 932 } 933 934 /** 935 * Returns the <a href="http://en.wikipedia.org/wiki/Filename_extension">file extension</a> for 936 * the given file name, or the empty string if the file has no extension. The result does not 937 * include the '{@code .}'. 938 * 939 * <p><b>Note:</b> This method simply returns everything after the last '{@code .}' in the file's 940 * name as determined by {@link File#getName}. It does not account for any filesystem-specific 941 * behavior that the {@link File} API does not already account for. For example, on NTFS it will 942 * report {@code "txt"} as the extension for the filename {@code "foo.exe:.txt"} even though NTFS 943 * will drop the {@code ":.txt"} part of the name when the file is actually created on the 944 * filesystem due to NTFS's <a href="https://goo.gl/vTpJi4">Alternate Data Streams</a>. 945 * 946 * <p>From Guava. 947 */ 948 public static String getFileExtension(String fullName) { 949 Objects.requireNonNull(fullName); 950 String fileName = new File(fullName).getName(); 951 int dotIndex = fileName.lastIndexOf('.'); 952 return (dotIndex == -1) ? "" : fileName.substring(dotIndex + 1); 953 } 954 955 /** 956 * Creates any necessary but nonexistent parent directories of the specified file. Note that if 957 * this operation fails it may have succeeded in creating some (but not all) of the necessary 958 * parent directories. 959 * 960 * <p>From Guava. 961 * 962 * @throws IOException if an I/O error occurs, or if any necessary but nonexistent parent 963 * directories of the specified file could not be created. 964 */ 965 public static void createParentDirs(File file) throws IOException { 966 Objects.requireNonNull(file); 967 File parent = file.getCanonicalFile().getParentFile(); 968 if (parent == null) { 969 /* 970 * The given directory is a filesystem root. All zero of its ancestors exist. This doesn't 971 * mean that the root itself exists -- consider x:\ on a Windows machine without such a drive 972 * -- or even that the caller can create it, but this method makes no such guarantees even for 973 * non-root files. 974 */ 975 return; 976 } 977 //noinspection ResultOfMethodCallIgnored 978 parent.mkdirs(); 979 if (!parent.isDirectory()) { 980 throw new IOException("Unable to create parent directories of " + file); 981 } 982 } 983}