001/*
002 * Licensed under the Apache License, Version 2.0 (the "License");
003 * you may not use this file except in compliance with the License.
004 * You may obtain a copy of the License at
005 *
006 *     http://www.apache.org/licenses/LICENSE-2.0
007 *
008 * Unless required by applicable law or agreed to in writing, software
009 * distributed under the License is distributed on an "AS IS" BASIS,
010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
011 * See the License for the specific language governing permissions and
012 * limitations under the License.
013 */
014package org.gbif.utils.file;
015
016import org.gbif.utils.text.LineComparator;
017
018import java.io.BufferedReader;
019import java.io.ByteArrayOutputStream;
020import java.io.File;
021import java.io.FileInputStream;
022import java.io.FileOutputStream;
023import java.io.IOException;
024import java.io.InputStreamReader;
025import java.io.OutputStream;
026import java.io.OutputStreamWriter;
027import java.nio.charset.StandardCharsets;
028import java.nio.file.Files;
029import java.util.ArrayList;
030import java.util.Arrays;
031import java.util.Comparator;
032import java.util.Iterator;
033import java.util.LinkedList;
034import java.util.List;
035import java.util.concurrent.TimeUnit;
036
037import org.apache.commons.io.LineIterator;
038import org.apache.commons.lang3.time.StopWatch;
039import org.junit.jupiter.api.Disabled;
040import org.junit.jupiter.api.Test;
041
042import static org.junit.jupiter.api.Assertions.assertEquals;
043import static org.junit.jupiter.api.Assertions.assertFalse;
044import static org.junit.jupiter.api.Assertions.assertTrue;
045import static org.junit.jupiter.api.Assertions.fail;
046
047/**
048 * @author markus
049 */
050public class FileUtilsTest {
051
052  private final String ENCODING = StandardCharsets.UTF_8.displayName();
053
054  public static void assertUnixSortOrder(File sorted) throws IOException {
055    // read file
056    BufferedReader br =
057        new BufferedReader(
058            new InputStreamReader(new FileInputStream(sorted), StandardCharsets.UTF_8));
059    LineIterator liter = new LineIterator(br);
060    assertUnixSortOrder(liter);
061  }
062
063  public static void assertUnixSortOrder(Iterator<String> it) throws IOException {
064    LinkedList<String> sorted =
065        FileUtils.streamToList(FileUtils.classpathStream("sorting/LF_sorted.txt"));
066    while (it.hasNext()) {
067      String x = it.next();
068      System.out.println(x);
069      assertEquals(sorted.poll(), x);
070    }
071  }
072
073  @Test
074  public void humanReadableByteCountTest() {
075    assertEquals("11 B", FileUtils.humanReadableByteCount(11, true));
076    assertEquals("1.0 kB", FileUtils.humanReadableByteCount(1_000, true));
077    assertEquals("1.0 MB", FileUtils.humanReadableByteCount(1_000_000, true));
078    assertEquals("1.0 GB", FileUtils.humanReadableByteCount(1_000_000_000, true));
079    assertEquals("1.0 TB", FileUtils.humanReadableByteCount(1_000_000_000_000L, true));
080
081    assertEquals("11 B", FileUtils.humanReadableByteCount(11, false));
082    assertEquals("1.0 KiB", FileUtils.humanReadableByteCount(1024, false));
083    assertEquals("1.0 MiB", FileUtils.humanReadableByteCount(1024 * 1024, false));
084    assertEquals("1.0 GiB", FileUtils.humanReadableByteCount(1024 * 1024 * 1024, false));
085    assertEquals("1.0 TiB", FileUtils.humanReadableByteCount(1024 * 1024 * 1024 * 1024L, false));
086  }
087
088  /**
089   * tests deleting directory recursively.
090   */
091  @Test
092  public void testDeleteRecursive() throws IOException {
093    File topDirectory = Files.createTempDirectory("top").toFile();
094    File middleDirectory = new File(topDirectory, "middle");
095    middleDirectory.mkdir();
096    File bottomDirectory = new File(middleDirectory, "bottom");
097    bottomDirectory.mkdir();
098    File bottomFile = new File(bottomDirectory, "bottom");
099    FileUtils.touch(bottomFile);
100
101    assertTrue(topDirectory.getParentFile().exists());
102    assertTrue(topDirectory.exists());
103    assertTrue(middleDirectory.getParentFile().exists());
104    assertTrue(middleDirectory.exists());
105    assertTrue(bottomDirectory.getParentFile().exists());
106    assertTrue(bottomDirectory.exists());
107    assertTrue(bottomFile.exists());
108
109    FileUtils.deleteDirectoryRecursively(topDirectory);
110
111    assertTrue(topDirectory.getParentFile().exists());
112    assertFalse(topDirectory.exists());
113    assertFalse(middleDirectory.getParentFile().exists());
114    assertFalse(middleDirectory.exists());
115    assertFalse(bottomDirectory.getParentFile().exists());
116    assertFalse(bottomDirectory.exists());
117    assertFalse(bottomFile.exists());
118  }
119
120  @Test
121  @Disabled("Run manually to check the performance of merging sorted files.")
122  public void testMergeSortedFilesSpeed() throws IOException {
123    // This is the chunks produced from the first part of Java-sorting iNaturalist's media file.
124    String prefix =
125        "/4TB/Matt/unpacked_50c9509d-22c7-4a22-a47d-8c48425ef4a7/media_%dcsv-normalized";
126    List<File> sortFiles = new ArrayList<>();
127    for (int i = 0; i <= 825; i++) {
128      sortFiles.add(new File(String.format(prefix, i)));
129    }
130
131    LineComparator lineComparator = new LineComparator(0, ",", '"');
132
133    OutputStreamWriter performanceWriter =
134        new OutputStreamWriter(
135            new OutputStream() {
136              int count = 0;
137              StopWatch sw = StopWatch.createStarted();
138
139              @Override
140              public void write(int b) throws IOException {
141                if (b == '\n') {
142                  count++;
143
144                  if (count % 100_000 == 0 && sw.getTime(TimeUnit.SECONDS) > 0) {
145                    System.out.println(
146                        "Done "
147                            + count
148                            + " at "
149                            + (count / sw.getTime(TimeUnit.SECONDS))
150                            + " lines per second");
151                  }
152
153                  if (count == 10_000_000) {
154                    System.out.println(
155                        "Done "
156                            + count
157                            + " at "
158                            + (count / sw.getTime(TimeUnit.SECONDS))
159                            + " lines per second");
160                    System.out.println("Took " + sw.getTime(TimeUnit.SECONDS));
161                    throw new IOException("Did enough");
162                  }
163                }
164              }
165            });
166    try {
167      new FileUtils().mergeSortedFiles(sortFiles, performanceWriter, lineComparator);
168    } catch (IOException e) {
169    }
170
171    StopWatch sw = StopWatch.createStarted();
172    OutputStreamWriter fileWriter =
173        new OutputStreamWriter(new FileOutputStream(prefix.replace("%d", "OUTPUT")));
174    new FileUtils().mergeSortedFiles(sortFiles, fileWriter, lineComparator);
175    System.out.println("Took " + sw.getTime(TimeUnit.SECONDS) + " seconds.");
176  }
177
178  @Test
179  public void testMergeSortedFiles() throws IOException {
180    List<File> sortedSplitFiles = new ArrayList<>();
181    for (int i = 0; i <= 4; i++) {
182      sortedSplitFiles.add(FileUtils.getClasspathFile("merging/split_" + i + ".txt"));
183    }
184    // Also add an empty file
185    sortedSplitFiles.add(File.createTempFile("gbif-common-file-merge", "empty.txt"));
186
187    ByteArrayOutputStream output = new ByteArrayOutputStream();
188    OutputStreamWriter writer = new OutputStreamWriter(output);
189
190    LineComparator lineComparator = new LineComparator(0, ",", '"');
191
192    new FileUtils().mergeSortedFiles(sortedSplitFiles, writer, lineComparator);
193
194    String[] sorted = output.toString().split("\n");
195    for (int i = 1; i < sorted.length; i++) {
196      assertTrue(sorted[i - 1].compareTo(sorted[i]) <= 0);
197    }
198    assertEquals(100, sorted.length);
199  }
200
201  @Test
202  public void testMergeEmptyFiles() throws IOException {
203    List<File> sortedSplitFiles = new ArrayList<>();
204    sortedSplitFiles.add(File.createTempFile("gbif-common-file-merge", "empty.txt"));
205    sortedSplitFiles.add(File.createTempFile("gbif-common-file-merge", "empty.txt"));
206
207    ByteArrayOutputStream output = new ByteArrayOutputStream();
208    OutputStreamWriter writer = new OutputStreamWriter(output);
209
210    LineComparator lineComparator = new LineComparator(0, ",", '"');
211
212    new FileUtils().mergeSortedFiles(sortedSplitFiles, writer, lineComparator);
213
214    assertEquals("", output.toString());
215  }
216
217  @Test
218  public void testSortingHeaderlessFile() throws IOException {
219    final int IDCOLUMN = 0;
220    File source = FileUtils.getClasspathFile("sorting/irmng.tail");
221    File sorted = File.createTempFile("gbif-common-file-sort", "sorted.txt");
222    sorted.deleteOnExit();
223    FileUtils futils = new FileUtils();
224    futils.sort(source, sorted, ENCODING, IDCOLUMN, "\t", null, "\n", 0);
225
226    // read file
227    BufferedReader br =
228        new BufferedReader(
229            new InputStreamReader(new FileInputStream(sorted), StandardCharsets.UTF_8));
230    int line = 0;
231    while (true) {
232      line++;
233      String row = br.readLine();
234      if (row == null) {
235        break;
236      }
237
238      if (line == 1) {
239        assertTrue(row.startsWith("aca10000053"));
240      } else if (line == 2) {
241        assertTrue(row.startsWith("aca1000012"));
242      } else if (line == 100) {
243        assertTrue(row.startsWith("acr10001387"));
244      } else if (line == 100000) {
245        assertTrue(row.startsWith("vir10000981"));
246      }
247    }
248  }
249
250  /**
251   * Sorting strings containing characters which are surrogate pairs, meaning Unicode characters beyond U+FFFF, will
252   * give different results between GNU Sort and a Java String comparator.
253   *
254   * "fl LATIN SMALL LIGATURE FL" is U+FB02.
255   * "𐃍 LINEAR B IDEOGRAM B241 CHARIOT" is U+100CD.
256   *
257   * GNU sort will use this order, based on the value of the whole character.
258   *
259   * Java represents 𐃍 as a surrogate pair \ud800\udccd in UTF-16, and sorts based on parts of pairs. Therefore, it
260   * gives the wrong order.
261   */
262  @Disabled("Expected to fail")
263  @Test
264  public void testSortingUnicodeFile() throws IOException {
265    FileUtils futils = new FileUtils();
266    final int IDCOLUMN = 0;
267
268    File source =
269        FileUtils.getClasspathFile("sorting/unicode-supplementary-multilingual-plane.txt");
270    File gnuSorted = File.createTempFile("gbif-common-file-sort", "sorted-gnu.txt");
271    File javaSorted = File.createTempFile("gbif-common-file-sort", "sorted-java.txt");
272    gnuSorted.deleteOnExit();
273    javaSorted.deleteOnExit();
274
275    futils.sort(source, gnuSorted, ENCODING, IDCOLUMN, "\t", null, "\n", 0);
276    // The columnDelimiter of ' prevents GNU Sort from being used.
277    futils.sort(source, javaSorted, ENCODING, IDCOLUMN, "'", null, "\n", 0);
278
279    // read file
280    BufferedReader gnuBr =
281        new BufferedReader(
282            new InputStreamReader(new FileInputStream(gnuSorted), StandardCharsets.UTF_8));
283    BufferedReader javaBr =
284        new BufferedReader(
285            new InputStreamReader(new FileInputStream(javaSorted), StandardCharsets.UTF_8));
286
287    int line = 0;
288    String gnuRow, javaRow;
289    while ((gnuRow = gnuBr.readLine()) != null) {
290      javaRow = javaBr.readLine();
291
292      line++;
293
294      System.out.println(gnuRow + "\t\t\t\t" + javaRow);
295
296      assertEquals("Line " + line, gnuRow, javaRow);
297    }
298  }
299
300  /**
301   * tests sorting mac line endings \r which don't work with unix sort
302   */
303  @Test
304  public void testSortingMac() throws IOException {
305    File source = FileUtils.getClasspathFile("sorting/LF_mac.txt");
306    File sorted = File.createTempFile("sort-test", "mac.txt");
307    sorted.deleteOnExit();
308    FileUtils futils = new FileUtils();
309    futils.sort(source, sorted, ENCODING, 0, "×", null, "\r", 0);
310
311    assertUnixSortOrder(sorted);
312  }
313
314  /**
315   * tests sorting unix line endings \n which work with unix sort
316   */
317  @Test
318  public void testSortingUnix() throws IOException {
319    File source = FileUtils.getClasspathFile("sorting/LF_unix.txt");
320    File sorted = File.createTempFile("sort-test", "unix.txt");
321    sorted.deleteOnExit();
322    FileUtils futils = new FileUtils();
323    futils.sort(source, sorted, ENCODING, 0, "×", null, "\n", 0);
324
325    assertUnixSortOrder(sorted);
326  }
327
328  /**
329   * tests sorting windows line endings \r\n which work with unix sort
330   */
331  @Test
332  public void testSortingWindows() throws IOException {
333    File source = FileUtils.getClasspathFile("sorting/LF_win.txt");
334    File sorted = File.createTempFile("sort-test", "windows.txt");
335    sorted.deleteOnExit();
336    FileUtils futils = new FileUtils();
337    futils.sort(source, sorted, ENCODING, 0, "×", null, "\r\n", 0);
338
339    assertUnixSortOrder(sorted);
340  }
341
342  /**
343   * Tests sorting by a column with uneven length strings as the sort column.
344   *
345   * The order musn't be different depending whether the column is last or not.
346   *
347   * The "-k×,×" argument to sort is essential here, otherwise the delimiter from the following column is part of the sort order.
348   */
349  @Test
350  public void testSortingUnevenLengths() throws IOException {
351    FileUtils futils = new FileUtils();
352
353    File source = FileUtils.getClasspathFile("sorting/uneven_lengths_col1.txt");
354    File sorted = File.createTempFile("sort-test", "uneven_lengths_col1.txt");
355    sorted.deleteOnExit();
356    futils.sort(source, sorted, ENCODING, 0, ";", null, "\n", 0);
357
358    List<String> sortedStrings = FileUtils.streamToList(new FileInputStream(sorted), ENCODING);
359    assertEquals("980-sp10;x", sortedStrings.get(0));
360    assertEquals("980-sp100;x", sortedStrings.get(1));
361    assertEquals("980-sp101;x", sortedStrings.get(2));
362
363    File source2 = FileUtils.getClasspathFile("sorting/uneven_lengths_col2.txt");
364    File sorted2 = File.createTempFile("sort-test", "uneven_lengths_col2.txt");
365    sorted.deleteOnExit();
366    futils.sort(source2, sorted2, ENCODING, 1, ";", null, "\n", 0);
367
368    List<String> sortedStrings2 = FileUtils.streamToList(new FileInputStream(sorted2), ENCODING);
369    assertEquals("x;980-sp10", sortedStrings2.get(0));
370    assertEquals("x;980-sp100", sortedStrings2.get(1));
371    assertEquals("x;980-sp101", sortedStrings2.get(2));
372  }
373
374  @Test
375  public void testSortingWithHeaders() throws IOException {
376    final int IDCOLUMN = 0;
377    File source = FileUtils.getClasspathFile("sorting/csv_always_quoted.csv");
378    File sorted = File.createTempFile("gbif-common-file-sort", "sorted.txt");
379    sorted.deleteOnExit();
380    FileUtils futils = new FileUtils();
381    futils.sort(source, sorted, ENCODING, IDCOLUMN, ",", '"', "\n", 1);
382
383    // read file
384    BufferedReader br =
385        new BufferedReader(
386            new InputStreamReader(new FileInputStream(sorted), StandardCharsets.UTF_8));
387    int line = 0;
388    while (true) {
389      line++;
390      String row = br.readLine();
391      if (row == null) {
392        break;
393      }
394
395      if (line == 1) {
396        assertTrue(row.startsWith("\"ID\",\"catalogNumber\""));
397      } else if (line == 2) {
398        assertTrue(
399            row.startsWith(
400                "\"18728553\",\"18728553\",\"Event\",\"18728553\",\"Muscardinus avellanarius\""));
401      } else if (line == 3) {
402        assertTrue(
403            row.startsWith(
404                "\"8728372\",\"18728372\",\"Event\",\"18728372\",\"Muscardinus avellanarius\",\"52.31635664254722\""));
405      }
406    }
407  }
408
409  @Test
410  public void testSortingWithNonFirstIdColumn() throws IOException {
411    File source = FileUtils.getClasspathFile("sorting/TDB_104.csv");
412    File sorted = File.createTempFile("gbif-common-file-sort", "sorted.txt");
413    sorted.deleteOnExit();
414    FileUtils futils = new FileUtils();
415    futils.sort(source, sorted, ENCODING, 3, ";", null, "\n", 1);
416
417    // read file
418    BufferedReader br =
419        new BufferedReader(
420            new InputStreamReader(new FileInputStream(sorted), StandardCharsets.UTF_8));
421    int line = 0;
422    while (true) {
423      line++;
424      String row = br.readLine();
425      if (row == null) {
426        break;
427      }
428
429      if (line == 1) {
430        assertEquals(
431            "taxonRank;scientificName;scientificNameAuthorship;taxonID;parentNameUsageID;vernacularName;taxonomicStatus",
432            row);
433      } else if (line == 2) {
434        // row 2 and 3 have the same ids - only test if the id is correct (actual order of those 2
435        // records can differ)
436        Iterator<String> columns = Arrays.stream(row.split(";", -1)).iterator();
437
438        columns.next();
439        columns.next();
440        columns.next();
441        assertEquals(
442            "urn:lsid:luomus.fi:taxonconcept:0071b855-3d23-4fdc-b2e0-8464c22d752a:1",
443            columns.next());
444
445      } else if (line == 100) {
446        assertEquals(
447            "species;Ctenochira angulata;(Thomson, 1883) ;urn:lsid:luomus.fi:taxonconcept:4adcf436-a0d2-4940-9155-220ffc6f5859:1;urn:lsid:luomus.fi:taxonconcept:817994ea-b58b-4deb-973f-9fa99c537f8a:1;;valid",
448            row);
449      }
450    }
451  }
452
453  /**
454   * If only columns containing delimiters are quoted in CSV, we can't use GNU sort.
455   *   X,"Look, now!",1
456   *   X,Why should I,2
457   */
458  @Test
459  public void testSortingWithQuotedDelimiters() throws IOException {
460    File source = FileUtils.getClasspathFile("sorting/csv_quoted_delimiters.csv");
461    File sorted = File.createTempFile("gbif-common-file-sort", "sorted.txt");
462    sorted.deleteOnExit();
463    FileUtils futils = new FileUtils();
464    futils.sort(source, sorted, ENCODING, 0, ",", '"', "\n", 1);
465
466    // read file
467    BufferedReader br =
468        new BufferedReader(
469            new InputStreamReader(new FileInputStream(sorted), StandardCharsets.UTF_8));
470    int line = 30950;
471    while (true) {
472      String row = br.readLine();
473      if (row == null) {
474        break;
475      }
476
477      if (line == 30950) {
478        assertEquals("catalogNumber", row.substring(0, 13));
479      } else {
480        // Catalog number ends in 30951 to 30961.
481        assertEquals("ZMA.COL.P." + line, row.replace("\"", "").replace(",", ".").substring(0, 15));
482      }
483      line++;
484    }
485  }
486
487  /**
488   * Test that ensures the chunk file is deleted at the end of sortInJava method. Otherwise, unwanted chunk files
489   * will be left over.
490   */
491  @Test
492  public void testSortInJava() throws IOException {
493    File source = FileUtils.getClasspathFile("sorting/taxon.txt");
494    File sorted = File.createTempFile("gbif-common-file-sort", "taxon_sorted.txt");
495    sorted.deleteOnExit();
496    FileUtils futils = new FileUtils();
497    Comparator<String> lineComparator = new LineComparator(0, "\t");
498    futils.sortInJava(source, sorted, ENCODING, lineComparator, 3);
499
500    // the chunk file should NOT exist
501    File chunkFile = new File(source.getParent(), "taxon_0txt");
502    assertFalse(chunkFile.exists());
503
504    // the sorted file should exist
505    System.out.println(sorted.getAbsolutePath());
506    assertTrue(sorted.exists());
507
508    // read file
509    BufferedReader br =
510        new BufferedReader(
511            new InputStreamReader(new FileInputStream(sorted), StandardCharsets.UTF_8));
512    int line = 0;
513    while (true) {
514      line++;
515      String row = br.readLine();
516      if (row == null) {
517        break;
518      }
519      // first line (smallest ID)
520      if (line == 4) {
521        assertTrue(row.startsWith("118701359"));
522      }
523      // last line (largest ID)
524      else if (line == 10) {
525        assertTrue(row.startsWith("120320038"));
526      }
527    }
528  }
529
530  /**
531   * Test using GNU sort (if available on this platform).
532   */
533  @Test
534  public void testSort() throws IOException {
535    File source = FileUtils.getClasspathFile("sorting/taxon.txt");
536    File sorted = File.createTempFile("gbif-common-file-sort", "taxon_sorted.txt");
537    sorted.deleteOnExit();
538    FileUtils futils = new FileUtils();
539    futils.sort(source, sorted, ENCODING, 0, "\t", null, "\n", 3);
540
541    // the sorted file should exist
542    System.out.println(sorted.getAbsolutePath());
543    assertTrue(sorted.exists());
544
545    // read file
546    BufferedReader br =
547        new BufferedReader(
548            new InputStreamReader(new FileInputStream(sorted), StandardCharsets.UTF_8));
549    int line = 0;
550    while (true) {
551      line++;
552      String row = br.readLine();
553      if (row == null) {
554        break;
555      }
556      // first line (smallest ID)
557      if (line == 4) {
558        assertTrue(row.startsWith("118701359"));
559      }
560      // last line (largest ID)
561      else if (line == 10) {
562        assertTrue(row.startsWith("120320038"));
563      }
564    }
565  }
566
567  /**
568   * Test sorting multiple fils into a single file. First column, so GNU sort.
569   */
570  @Test
571  public void testMultiFileSort() throws IOException {
572    final int IDCOLUMN = 0;
573    File source1 = FileUtils.getClasspathFile("sorting/multi/VernacularNames-adai.csv");
574    File source2 = FileUtils.getClasspathFile("sorting/multi/VernacularNames-choctaw.csv");
575    File source3 = FileUtils.getClasspathFile("sorting/multi/VernacularNames-nahya.csv");
576    List<File> sources = Arrays.asList(source1, source2, source3);
577    File sorted = File.createTempFile("gbif-common-file-sort", "sorted.txt");
578    sorted.deleteOnExit();
579    FileUtils futils = new FileUtils();
580    futils.sort(sources, sorted, ENCODING, IDCOLUMN, ",", '"', "\n", 1);
581
582    // read file
583    BufferedReader br =
584        new BufferedReader(
585            new InputStreamReader(new FileInputStream(sorted), StandardCharsets.UTF_8));
586    int line = 0;
587    while (true) {
588      line++;
589      String row = br.readLine();
590      if (row == null) {
591        break;
592      }
593
594      if (line == 1) {
595        assertTrue(row.startsWith("id,vernacularName,language"));
596      } else if (line == 2) {
597        assertTrue(row.startsWith("122860,xoyamet,und,\"\",\"\",,nahya,,2013-05-16T08:27:53Z"));
598      } else if (line == 3) {
599        assertTrue(row.startsWith("49662,heohè,und,\"\",\"\",,Adai,Ben,2021-01-26T16:07:11Z"));
600      } else if (line == 4) {
601        assertTrue(row.startsWith("50897,Umbi,und,\"\",\"\",,Choctaw,Ben,2021-01-13T02:14:34Z"));
602      } else {
603        fail("Too many lines.");
604      }
605    }
606  }
607
608  /**
609   * Test sorting multiple files into a single file. Second column, so Java sort.
610   */
611  @Test
612  public void testMultiFileSort2ndColumn() throws IOException {
613    final int IDCOLUMN = 1;
614    File source1 = FileUtils.getClasspathFile("sorting/multi/VernacularNames-adai.csv");
615    File source2 = FileUtils.getClasspathFile("sorting/multi/VernacularNames-choctaw.csv");
616    File source3 = FileUtils.getClasspathFile("sorting/multi/VernacularNames-nahya.csv");
617    List<File> sources = Arrays.asList(source1, source2, source3);
618    File sorted = File.createTempFile("gbif-common-file-sort", "sorted.txt");
619    sorted.deleteOnExit();
620    FileUtils futils = new FileUtils();
621    futils.sort(sources, sorted, ENCODING, IDCOLUMN, ",", '"', "\n", 1);
622
623    // read file
624    BufferedReader br =
625        new BufferedReader(
626            new InputStreamReader(new FileInputStream(sorted), StandardCharsets.UTF_8));
627    int line = 0;
628    while (true) {
629      line++;
630      String row = br.readLine();
631      if (row == null) {
632        break;
633      }
634
635      if (line == 1) {
636        assertTrue(row.startsWith("id,vernacularName,language"));
637      } else if (line == 2) {
638        assertTrue(row.startsWith("50897,Umbi,und,\"\",\"\",,Choctaw,Ben,2021-01-13T02:14:34Z"));
639      } else if (line == 3) {
640        assertTrue(row.startsWith("49662,heohè,und,\"\",\"\",,Adai,Ben,2021-01-26T16:07:11Z"));
641      } else if (line == 4) {
642        assertTrue(row.startsWith("122860,xoyamet,und,\"\",\"\",,nahya,,2013-05-16T08:27:53Z"));
643      } else {
644        fail("Too many lines.");
645      }
646    }
647  }
648}