/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.gbif.utils.file;

import org.gbif.utils.text.LineComparator;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.TimeUnit;

import org.apache.commons.io.LineIterator;
import org.apache.commons.lang3.time.StopWatch;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;

/**
 * Tests for {@link FileUtils}: byte-count formatting, recursive deletion, merging of sorted files
 * and the various file sorting entry points.
 *
 * @author markus
 */
public class FileUtilsTest {

  /** Name of the character encoding handed to the {@code FileUtils} sort methods. */
  private static final String ENCODING = StandardCharsets.UTF_8.name();

  /**
   * Opens the given file for buffered reading, decoding it as UTF-8.
   *
   * @param file the file to read
   * @return a new reader; the caller is responsible for closing it
   * @throws IOException if the file cannot be opened
   */
  private static BufferedReader newUtf8Reader(File file) throws IOException {
    return new BufferedReader(
        new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8));
  }

  /**
   * Asserts that the lines of the given file match, in order, the lines of the classpath resource
   * {@code sorting/LF_sorted.txt}.
   *
   * @param sorted the file to verify, read as UTF-8
   * @throws IOException if the file or the reference resource cannot be read
   */
  public static void assertUnixSortOrder(File sorted) throws IOException {
    // try-with-resources so the file handle is released even when an assertion fails
    try (BufferedReader br = newUtf8Reader(sorted)) {
      assertUnixSortOrder(new LineIterator(br));
    }
  }

  /**
   * Asserts that every line supplied by the iterator equals the line at the same position in the
   * classpath resource {@code sorting/LF_sorted.txt}. Each line is also printed to standard out.
   *
   * @param it the lines to verify
   * @throws IOException if the reference resource cannot be read
   */
  public static void assertUnixSortOrder(Iterator<String> it) throws IOException {
    LinkedList<String> sorted =
        FileUtils.streamToList(FileUtils.classpathStream("sorting/LF_sorted.txt"));
    while (it.hasNext()) {
      String x = it.next();
      System.out.println(x);
      assertEquals(sorted.poll(), x);
    }
  }

  /** Checks decimal (SI) and binary unit formatting of byte counts. */
  @Test
  public void humanReadableByteCountTest() {
    assertEquals("11 B", FileUtils.humanReadableByteCount(11, true));
    assertEquals("1.0 kB", FileUtils.humanReadableByteCount(1_000, true));
    assertEquals("1.0 MB", FileUtils.humanReadableByteCount(1_000_000, true));
    assertEquals("1.0 GB", FileUtils.humanReadableByteCount(1_000_000_000, true));
    assertEquals("1.0 TB", FileUtils.humanReadableByteCount(1_000_000_000_000L, true));

    assertEquals("11 B", FileUtils.humanReadableByteCount(11, false));
    assertEquals("1.0 KiB", FileUtils.humanReadableByteCount(1024, false));
    assertEquals("1.0 MiB", FileUtils.humanReadableByteCount(1024 * 1024, false));
    assertEquals("1.0 GiB", FileUtils.humanReadableByteCount(1024 * 1024 * 1024, false));
    assertEquals("1.0 TiB", FileUtils.humanReadableByteCount(1024 * 1024 * 1024 * 1024L, false));
  }

  /**
   * Tests deleting a directory recursively.
   */
  @Test
  public void testDeleteRecursive() throws IOException {
    File topDirectory = Files.createTempDirectory("top").toFile();
    File middleDirectory = new File(topDirectory, "middle");
    assertTrue(middleDirectory.mkdir());
    File bottomDirectory = new File(middleDirectory, "bottom");
    assertTrue(bottomDirectory.mkdir());
    File bottomFile = new File(bottomDirectory, "bottom");
    FileUtils.touch(bottomFile);

    assertTrue(topDirectory.getParentFile().exists());
    assertTrue(topDirectory.exists());
    assertTrue(middleDirectory.getParentFile().exists());
    assertTrue(middleDirectory.exists());
    assertTrue(bottomDirectory.getParentFile().exists());
    assertTrue(bottomDirectory.exists());
    assertTrue(bottomFile.exists());

    FileUtils.deleteDirectoryRecursively(topDirectory);

    // Only the parent of the top directory may survive.
    assertTrue(topDirectory.getParentFile().exists());
    assertFalse(topDirectory.exists());
    assertFalse(middleDirectory.getParentFile().exists());
    assertFalse(middleDirectory.exists());
    assertFalse(bottomDirectory.getParentFile().exists());
    assertFalse(bottomDirectory.exists());
    assertFalse(bottomFile.exists());
  }

  /**
   * Manual benchmark: merges pre-sorted chunk files, first into a line-counting sink that aborts
   * after ten million lines, then into a real output file, printing timings to standard out.
   */
  @Test
  @Disabled("Run manually to check the performance of merging sorted files.")
  public void testMergeSortedFilesSpeed() throws IOException {
    // This is the chunks produced from the first part of Java-sorting iNaturalist's media file.
    String prefix =
        "/4TB/Matt/unpacked_50c9509d-22c7-4a22-a47d-8c48425ef4a7/media_%dcsv-normalized";
    List<File> sortFiles = new ArrayList<>();
    for (int i = 0; i <= 825; i++) {
      sortFiles.add(new File(String.format(prefix, i)));
    }

    LineComparator lineComparator = new LineComparator(0, ",", '"');

    // Sink that discards the data, counts line feeds and reports throughput.
    OutputStream lineCounter =
        new OutputStream() {
          private final StopWatch sw = StopWatch.createStarted();
          private int count = 0;

          @Override
          public void write(int b) throws IOException {
            if (b != '\n') {
              return;
            }
            count++;
            long seconds = sw.getTime(TimeUnit.SECONDS);

            if (count % 100_000 == 0 && seconds > 0) {
              System.out.println(
                  "Done " + count + " at " + (count / seconds) + " lines per second");
            }

            if (count == 10_000_000) {
              // Math.max guards against a division by zero when less than a second has elapsed.
              System.out.println(
                  "Done "
                      + count
                      + " at "
                      + (count / Math.max(1, seconds))
                      + " lines per second");
              System.out.println("Took " + seconds);
              throw new IOException("Did enough");
            }
          }
        };

    try (OutputStreamWriter performanceWriter =
        new OutputStreamWriter(lineCounter, StandardCharsets.UTF_8)) {
      new FileUtils().mergeSortedFiles(sortFiles, performanceWriter, lineComparator);
    } catch (IOException expected) {
      // Deliberately ignored: the sink above throws "Did enough" to stop the benchmark early.
    }

    StopWatch sw = StopWatch.createStarted();
    try (OutputStreamWriter fileWriter =
        new OutputStreamWriter(
            new FileOutputStream(prefix.replace("%d", "OUTPUT")), StandardCharsets.UTF_8)) {
      new FileUtils().mergeSortedFiles(sortFiles, fileWriter, lineComparator);
    }
    System.out.println("Took " + sw.getTime(TimeUnit.SECONDS) + " seconds.");
  }

  /**
   * Merges five sorted split files plus one empty file and checks that the output is ordered and
   * contains 100 lines.
   */
  @Test
  public void testMergeSortedFiles() throws IOException {
    List<File> sortedSplitFiles = new ArrayList<>();
    for (int i = 0; i <= 4; i++) {
      sortedSplitFiles.add(FileUtils.getClasspathFile("merging/split_" + i + ".txt"));
    }
    // Also add an empty file
    File empty = File.createTempFile("gbif-common-file-merge", "empty.txt");
    empty.deleteOnExit();
    sortedSplitFiles.add(empty);

    ByteArrayOutputStream output = new ByteArrayOutputStream();
    LineComparator lineComparator = new LineComparator(0, ",", '"');

    // Closing an already closed writer is a no-op, so this is safe whether or not
    // mergeSortedFiles closes the writer itself.
    try (OutputStreamWriter writer = new OutputStreamWriter(output, StandardCharsets.UTF_8)) {
      new FileUtils().mergeSortedFiles(sortedSplitFiles, writer, lineComparator);
    }

    String[] sorted = new String(output.toByteArray(), StandardCharsets.UTF_8).split("\n");
    for (int i = 1; i < sorted.length; i++) {
      assertTrue(sorted[i - 1].compareTo(sorted[i]) <= 0);
    }
    assertEquals(100, sorted.length);
  }

  /** Merging only empty files must produce empty output. */
  @Test
  public void testMergeEmptyFiles() throws IOException {
    List<File> sortedSplitFiles = new ArrayList<>();
    for (int i = 0; i < 2; i++) {
      File empty = File.createTempFile("gbif-common-file-merge", "empty.txt");
      empty.deleteOnExit();
      sortedSplitFiles.add(empty);
    }

    ByteArrayOutputStream output = new ByteArrayOutputStream();
    LineComparator lineComparator = new LineComparator(0, ",", '"');

    try (OutputStreamWriter writer = new OutputStreamWriter(output, StandardCharsets.UTF_8)) {
      new FileUtils().mergeSortedFiles(sortedSplitFiles, writer, lineComparator);
    }

    assertEquals("", new String(output.toByteArray(), StandardCharsets.UTF_8));
  }

  /** Sorts a tab-delimited file without header lines and spot-checks selected rows. */
  @Test
  public void testSortingHeaderlessFile() throws IOException {
    final int idColumn = 0;
    File source = FileUtils.getClasspathFile("sorting/irmng.tail");
    File sorted = File.createTempFile("gbif-common-file-sort", "sorted.txt");
    sorted.deleteOnExit();
    FileUtils futils = new FileUtils();
    futils.sort(source, sorted, ENCODING, idColumn, "\t", null, "\n", 0);

    // read file
    try (BufferedReader br = newUtf8Reader(sorted)) {
      int line = 0;
      for (String row = br.readLine(); row != null; row = br.readLine()) {
        line++;
        if (line == 1) {
          assertTrue(row.startsWith("aca10000053"));
        } else if (line == 2) {
          assertTrue(row.startsWith("aca1000012"));
        } else if (line == 100) {
          assertTrue(row.startsWith("acr10001387"));
        } else if (line == 100000) {
          assertTrue(row.startsWith("vir10000981"));
        }
      }
    }
  }

  /**
   * Sorting strings containing characters which are surrogate pairs, meaning Unicode characters beyond U+FFFF, will
   * give different results between GNU Sort and a Java String comparator.
   *
   * "ﬂ LATIN SMALL LIGATURE FL" is U+FB02.
   * "𐃍 LINEAR B IDEOGRAM B241 CHARIOT" is U+100CD.
   *
   * GNU sort will use this order, based on the value of the whole character.
   *
   * Java represents 𐃍 as a surrogate pair (U+D800 U+DCCD) in UTF-16, and sorts based on parts of pairs. Therefore, it
   * gives the wrong order.
   */
  @Disabled("Expected to fail")
  @Test
  public void testSortingUnicodeFile() throws IOException {
    FileUtils futils = new FileUtils();
    final int idColumn = 0;

    File source =
        FileUtils.getClasspathFile("sorting/unicode-supplementary-multilingual-plane.txt");
    File gnuSorted = File.createTempFile("gbif-common-file-sort", "sorted-gnu.txt");
    File javaSorted = File.createTempFile("gbif-common-file-sort", "sorted-java.txt");
    gnuSorted.deleteOnExit();
    javaSorted.deleteOnExit();

    futils.sort(source, gnuSorted, ENCODING, idColumn, "\t", null, "\n", 0);
    // The columnDelimiter of ' prevents GNU Sort from being used.
    futils.sort(source, javaSorted, ENCODING, idColumn, "'", null, "\n", 0);

    // read both files side by side
    try (BufferedReader gnuBr = newUtf8Reader(gnuSorted);
        BufferedReader javaBr = newUtf8Reader(javaSorted)) {
      int line = 0;
      String gnuRow;
      while ((gnuRow = gnuBr.readLine()) != null) {
        String javaRow = javaBr.readLine();
        line++;

        System.out.println(gnuRow + "\t\t\t\t" + javaRow);

        // JUnit 5 takes the failure message as the LAST argument.
        assertEquals(gnuRow, javaRow, "Line " + line);
      }
    }
  }

  /**
   * Tests sorting mac line endings \r which don't work with unix sort.
   */
  @Test
  public void testSortingMac() throws IOException {
    File source = FileUtils.getClasspathFile("sorting/LF_mac.txt");
    File sorted = File.createTempFile("sort-test", "mac.txt");
    sorted.deleteOnExit();
    FileUtils futils = new FileUtils();
    futils.sort(source, sorted, ENCODING, 0, "×", null, "\r", 0);

    assertUnixSortOrder(sorted);
  }

  /**
   * Tests sorting unix line endings \n which work with unix sort.
   */
  @Test
  public void testSortingUnix() throws IOException {
    File source = FileUtils.getClasspathFile("sorting/LF_unix.txt");
    File sorted = File.createTempFile("sort-test", "unix.txt");
    sorted.deleteOnExit();
    FileUtils futils = new FileUtils();
    futils.sort(source, sorted, ENCODING, 0, "×", null, "\n", 0);

    assertUnixSortOrder(sorted);
  }

  /**
   * Tests sorting windows line endings \r\n which work with unix sort.
   */
  @Test
  public void testSortingWindows() throws IOException {
    File source = FileUtils.getClasspathFile("sorting/LF_win.txt");
    File sorted = File.createTempFile("sort-test", "windows.txt");
    sorted.deleteOnExit();
    FileUtils futils = new FileUtils();
    futils.sort(source, sorted, ENCODING, 0, "×", null, "\r\n", 0);

    assertUnixSortOrder(sorted);
  }

  /**
   * Tests sorting by a column with uneven length strings as the sort column.
   *
   * The order mustn't be different depending whether the column is last or not.
   *
   * The "-k×,×" argument to sort is essential here, otherwise the delimiter from the following column is part of the sort order.
   */
  @Test
  public void testSortingUnevenLengths() throws IOException {
    FileUtils futils = new FileUtils();

    File source = FileUtils.getClasspathFile("sorting/uneven_lengths_col1.txt");
    File sorted = File.createTempFile("sort-test", "uneven_lengths_col1.txt");
    sorted.deleteOnExit();
    futils.sort(source, sorted, ENCODING, 0, ";", null, "\n", 0);

    List<String> sortedStrings;
    try (FileInputStream in = new FileInputStream(sorted)) {
      sortedStrings = FileUtils.streamToList(in, ENCODING);
    }
    assertEquals("980-sp10;x", sortedStrings.get(0));
    assertEquals("980-sp100;x", sortedStrings.get(1));
    assertEquals("980-sp101;x", sortedStrings.get(2));

    File source2 = FileUtils.getClasspathFile("sorting/uneven_lengths_col2.txt");
    File sorted2 = File.createTempFile("sort-test", "uneven_lengths_col2.txt");
    // Register the SECOND temp file for cleanup (previously the first was registered twice).
    sorted2.deleteOnExit();
    futils.sort(source2, sorted2, ENCODING, 1, ";", null, "\n", 0);

    List<String> sortedStrings2;
    try (FileInputStream in = new FileInputStream(sorted2)) {
      sortedStrings2 = FileUtils.streamToList(in, ENCODING);
    }
    assertEquals("x;980-sp10", sortedStrings2.get(0));
    assertEquals("x;980-sp100", sortedStrings2.get(1));
    assertEquals("x;980-sp101", sortedStrings2.get(2));
  }

  /** Sorts an always-quoted CSV file with one header line and checks the first three rows. */
  @Test
  public void testSortingWithHeaders() throws IOException {
    final int idColumn = 0;
    File source = FileUtils.getClasspathFile("sorting/csv_always_quoted.csv");
    File sorted = File.createTempFile("gbif-common-file-sort", "sorted.txt");
    sorted.deleteOnExit();
    FileUtils futils = new FileUtils();
    futils.sort(source, sorted, ENCODING, idColumn, ",", '"', "\n", 1);

    // read file
    try (BufferedReader br = newUtf8Reader(sorted)) {
      int line = 0;
      for (String row = br.readLine(); row != null; row = br.readLine()) {
        line++;
        if (line == 1) {
          assertTrue(row.startsWith("\"ID\",\"catalogNumber\""));
        } else if (line == 2) {
          assertTrue(
              row.startsWith(
                  "\"18728553\",\"18728553\",\"Event\",\"18728553\",\"Muscardinus avellanarius\""));
        } else if (line == 3) {
          assertTrue(
              row.startsWith(
                  "\"8728372\",\"18728372\",\"Event\",\"18728372\",\"Muscardinus avellanarius\",\"52.31635664254722\""));
        }
      }
    }
  }

  /** Sorts a semicolon-delimited file on its fourth column, keeping one header line. */
  @Test
  public void testSortingWithNonFirstIdColumn() throws IOException {
    File source = FileUtils.getClasspathFile("sorting/TDB_104.csv");
    File sorted = File.createTempFile("gbif-common-file-sort", "sorted.txt");
    sorted.deleteOnExit();
    FileUtils futils = new FileUtils();
    futils.sort(source, sorted, ENCODING, 3, ";", null, "\n", 1);

    // read file
    try (BufferedReader br = newUtf8Reader(sorted)) {
      int line = 0;
      for (String row = br.readLine(); row != null; row = br.readLine()) {
        line++;
        if (line == 1) {
          assertEquals(
              "taxonRank;scientificName;scientificNameAuthorship;taxonID;parentNameUsageID;vernacularName;taxonomicStatus",
              row);
        } else if (line == 2) {
          // row 2 and 3 have the same ids - only test if the id is correct (actual order of those 2
          // records can differ)
          Iterator<String> columns = Arrays.stream(row.split(";", -1)).iterator();

          columns.next();
          columns.next();
          columns.next();
          assertEquals(
              "urn:lsid:luomus.fi:taxonconcept:0071b855-3d23-4fdc-b2e0-8464c22d752a:1",
              columns.next());

        } else if (line == 100) {
          assertEquals(
              "species;Ctenochira angulata;(Thomson, 1883) ;urn:lsid:luomus.fi:taxonconcept:4adcf436-a0d2-4940-9155-220ffc6f5859:1;urn:lsid:luomus.fi:taxonconcept:817994ea-b58b-4deb-973f-9fa99c537f8a:1;;valid",
              row);
        }
      }
    }
  }

  /**
   * If only columns containing delimiters are quoted in CSV, we can't use GNU sort.
   * X,"Look, now!",1
   * X,Why should I,2
   */
  @Test
  public void testSortingWithQuotedDelimiters() throws IOException {
    File source = FileUtils.getClasspathFile("sorting/csv_quoted_delimiters.csv");
    File sorted = File.createTempFile("gbif-common-file-sort", "sorted.txt");
    sorted.deleteOnExit();
    FileUtils futils = new FileUtils();
    futils.sort(source, sorted, ENCODING, 0, ",", '"', "\n", 1);

    // read file
    try (BufferedReader br = newUtf8Reader(sorted)) {
      // The counter starts at 30950 so that, after the header row, it matches the catalog number.
      int line = 30950;
      for (String row = br.readLine(); row != null; row = br.readLine()) {
        if (line == 30950) {
          assertEquals("catalogNumber", row.substring(0, 13));
        } else {
          // Catalog number ends in 30951 to 30961.
          assertEquals(
              "ZMA.COL.P." + line, row.replace("\"", "").replace(",", ".").substring(0, 15));
        }
        line++;
      }
    }
  }

  /**
   * Test that ensures the chunk file is deleted at the end of sortInJava method. Otherwise, unwanted chunk files
   * will be left over.
   */
  @Test
  public void testSortInJava() throws IOException {
    File source = FileUtils.getClasspathFile("sorting/taxon.txt");
    File sorted = File.createTempFile("gbif-common-file-sort", "taxon_sorted.txt");
    sorted.deleteOnExit();
    FileUtils futils = new FileUtils();
    Comparator<String> lineComparator = new LineComparator(0, "\t");
    futils.sortInJava(source, sorted, ENCODING, lineComparator, 3);

    // the chunk file should NOT exist
    File chunkFile = new File(source.getParent(), "taxon_0txt");
    assertFalse(chunkFile.exists());

    // the sorted file should exist
    System.out.println(sorted.getAbsolutePath());
    assertTrue(sorted.exists());

    // read file
    try (BufferedReader br = newUtf8Reader(sorted)) {
      int line = 0;
      for (String row = br.readLine(); row != null; row = br.readLine()) {
        line++;
        if (line == 4) {
          // first line (smallest ID)
          assertTrue(row.startsWith("118701359"));
        } else if (line == 10) {
          // last line (largest ID)
          assertTrue(row.startsWith("120320038"));
        }
      }
    }
  }

  /**
   * Test using GNU sort (if available on this platform).
   */
  @Test
  public void testSort() throws IOException {
    File source = FileUtils.getClasspathFile("sorting/taxon.txt");
    File sorted = File.createTempFile("gbif-common-file-sort", "taxon_sorted.txt");
    sorted.deleteOnExit();
    FileUtils futils = new FileUtils();
    futils.sort(source, sorted, ENCODING, 0, "\t", null, "\n", 3);

    // the sorted file should exist
    System.out.println(sorted.getAbsolutePath());
    assertTrue(sorted.exists());

    // read file
    try (BufferedReader br = newUtf8Reader(sorted)) {
      int line = 0;
      for (String row = br.readLine(); row != null; row = br.readLine()) {
        line++;
        if (line == 4) {
          // first line (smallest ID)
          assertTrue(row.startsWith("118701359"));
        } else if (line == 10) {
          // last line (largest ID)
          assertTrue(row.startsWith("120320038"));
        }
      }
    }
  }

  /**
   * Test sorting multiple files into a single file. First column, so GNU sort.
   */
  @Test
  public void testMultiFileSort() throws IOException {
    final int idColumn = 0;
    File source1 = FileUtils.getClasspathFile("sorting/multi/VernacularNames-adai.csv");
    File source2 = FileUtils.getClasspathFile("sorting/multi/VernacularNames-choctaw.csv");
    File source3 = FileUtils.getClasspathFile("sorting/multi/VernacularNames-nahya.csv");
    List<File> sources = Arrays.asList(source1, source2, source3);
    File sorted = File.createTempFile("gbif-common-file-sort", "sorted.txt");
    sorted.deleteOnExit();
    FileUtils futils = new FileUtils();
    futils.sort(sources, sorted, ENCODING, idColumn, ",", '"', "\n", 1);

    // read file
    try (BufferedReader br = newUtf8Reader(sorted)) {
      int line = 0;
      for (String row = br.readLine(); row != null; row = br.readLine()) {
        line++;
        if (line == 1) {
          assertTrue(row.startsWith("id,vernacularName,language"));
        } else if (line == 2) {
          assertTrue(row.startsWith("122860,xoyamet,und,\"\",\"\",,nahya,,2013-05-16T08:27:53Z"));
        } else if (line == 3) {
          assertTrue(row.startsWith("49662,heohè,und,\"\",\"\",,Adai,Ben,2021-01-26T16:07:11Z"));
        } else if (line == 4) {
          assertTrue(row.startsWith("50897,Umbi,und,\"\",\"\",,Choctaw,Ben,2021-01-13T02:14:34Z"));
        } else {
          fail("Too many lines.");
        }
      }
    }
  }

  /**
   * Test sorting multiple files into a single file. Second column, so Java sort.
   */
  @Test
  public void testMultiFileSort2ndColumn() throws IOException {
    final int idColumn = 1;
    File source1 = FileUtils.getClasspathFile("sorting/multi/VernacularNames-adai.csv");
    File source2 = FileUtils.getClasspathFile("sorting/multi/VernacularNames-choctaw.csv");
    File source3 = FileUtils.getClasspathFile("sorting/multi/VernacularNames-nahya.csv");
    List<File> sources = Arrays.asList(source1, source2, source3);
    File sorted = File.createTempFile("gbif-common-file-sort", "sorted.txt");
    sorted.deleteOnExit();
    FileUtils futils = new FileUtils();
    futils.sort(sources, sorted, ENCODING, idColumn, ",", '"', "\n", 1);

    // read file
    try (BufferedReader br = newUtf8Reader(sorted)) {
      int line = 0;
      for (String row = br.readLine(); row != null; row = br.readLine()) {
        line++;
        if (line == 1) {
          assertTrue(row.startsWith("id,vernacularName,language"));
        } else if (line == 2) {
          assertTrue(row.startsWith("50897,Umbi,und,\"\",\"\",,Choctaw,Ben,2021-01-13T02:14:34Z"));
        } else if (line == 3) {
          assertTrue(row.startsWith("49662,heohè,und,\"\",\"\",,Adai,Ben,2021-01-26T16:07:11Z"));
        } else if (line == 4) {
          assertTrue(row.startsWith("122860,xoyamet,und,\"\",\"\",,nahya,,2013-05-16T08:27:53Z"));
        } else {
          fail("Too many lines.");
        }
      }
    }
  }
}