001/*
002 * Licensed under the Apache License, Version 2.0 (the "License");
003 * you may not use this file except in compliance with the License.
004 * You may obtain a copy of the License at
005 *
006 *     http://www.apache.org/licenses/LICENSE-2.0
007 *
008 * Unless required by applicable law or agreed to in writing, software
009 * distributed under the License is distributed on an "AS IS" BASIS,
010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
011 * See the License for the specific language governing permissions and
012 * limitations under the License.
013 */
014package org.gbif.dwca.io;
015
016import org.gbif.dwc.Archive;
017import org.gbif.dwc.DwcFiles;
018import org.gbif.dwc.DwcLayout;
019import org.gbif.dwc.UnsupportedArchiveException;
020import org.gbif.dwc.meta.DwcMetaFiles;
021import org.gbif.dwc.record.Record;
022import org.gbif.dwc.record.StarRecord;
023import org.gbif.dwc.terms.DcTerm;
024import org.gbif.dwc.terms.DwcTerm;
025import org.gbif.dwc.terms.GbifTerm;
026import org.gbif.utils.file.CompressionUtil;
027import org.gbif.utils.file.FileUtils;
028import org.gbif.utils.file.tabular.TabularDataFileReader;
029import org.gbif.utils.file.tabular.TabularFiles;
030
031import java.io.File;
032import java.io.IOException;
033import java.io.InputStream;
034import java.nio.charset.StandardCharsets;
035import java.nio.file.Files;
036import java.util.HashSet;
037import java.util.Iterator;
038import java.util.List;
039import java.util.Set;
040
041import org.junit.jupiter.api.Test;
042
043import static org.junit.jupiter.api.Assertions.assertEquals;
044import static org.junit.jupiter.api.Assertions.assertNotNull;
045import static org.junit.jupiter.api.Assertions.assertThrows;
046import static org.junit.jupiter.api.Assertions.assertTrue;
047import static org.junit.jupiter.api.Assertions.fail;
048
049public class ArchiveFactoryTest {
050
051  private void assertNumberOfCoreRecords(Archive arch, int expectedRecords) {
052    int rows = 0;
053    for (Record rec : arch.getCore()) {
054      assertNotNull(rec);
055      rows++;
056    }
057    assertEquals(expectedRecords, rows);
058  }
059
060  @Test
061  public void testMetaHandlerUtf16le() throws Exception {
062    for (String fn : new String[]{"/meta/meta.xml", "/meta-utf16le.xml","/xml-entity-meta/meta.xml"}) {
063      InputStream is = getClass().getResourceAsStream(fn);
064      DwcMetaFiles.fromMetaDescriptor(is);
065    }
066  }
067
068  @Test
069  public void testCoreRecords() throws IOException {
070    // note that we don't read a DWC archive, but only test the csvreader!
071    // we therefore do not detect header rows and count *all* rows instead
072
073    assertNumberOfCoreRecords(DwcFiles.fromLocation(FileUtils.getClasspathFile("iucn100.tab.txt").toPath()), 99);
074    assertNumberOfCoreRecords(DwcFiles.fromLocation(FileUtils.getClasspathFile("iucn100.pipe.txt").toPath()), 99);
075    assertNumberOfCoreRecords(DwcFiles.fromLocation(FileUtils.getClasspathFile("iucn100.csv").toPath()), 99);
076    assertNumberOfCoreRecords(DwcFiles.fromLocation(FileUtils.getClasspathFile("csv_quoted-unquoted_headers.csv").toPath()),
077      3);
078    assertNumberOfCoreRecords(DwcFiles.fromLocation(FileUtils.getClasspathFile("csv_incl_single_quotes.csv").toPath()), 3);
079  }
080
081  /**
082   * Test dwca-reader bug 83
083   *
084   * @see <a href="http://code.google.com/p/darwincore/issues/detail?id=83">Issue 83</a>
085   */
086  @Test
087  public void testCsv() throws UnsupportedArchiveException, IOException {
088    File csv = FileUtils.getClasspathFile("csv_always_quoted.csv");
089    // read archive from this tmp dir
090    Archive arch = DwcFiles.fromLocation(csv.toPath());
091
092    boolean found = false;
093    for (Record rec : arch.getCore()) {
094      if ("ENNH0192".equals(rec.id())) {
095        found = true;
096        assertEquals("Martins Wood, Ightham", rec.value(DwcTerm.locality));
097      }
098    }
099    assertTrue(found);
100  }
101
102  /**
103   * Test GNUB style dwca with a single tab delimited file that has a .tab suffix.
104   */
105  @Test
106  public void testGnubTab() throws UnsupportedArchiveException, IOException {
107    File tab = FileUtils.getClasspathFile("gnub.tab");
108    // read archive from this tmp dir
109    Archive arch = DwcFiles.fromLocation(tab.toPath());
110
111    Record rec = arch.getCore().iterator().next();
112    assertEquals("246daa62-6fce-448f-88b4-94b0ccc89cf1", rec.id());
113  }
114
115  /**
116   * Test GNUB style dwca with a single tab delimited file that has a .tab suffix.
117   */
118  @Test
119  public void testGnubTabZip() throws UnsupportedArchiveException, IOException {
120    // test GNUB zip with 1 data file
121    File tmpDir = Files.createTempDirectory("dwca-io-test").toFile();
122    tmpDir.deleteOnExit();
123    File zip = FileUtils.getClasspathFile("gnub.tab.zip");
124    CompressionUtil.decompressFile(tmpDir, zip);
125
126    // read archive from this tmp dir
127    Archive arch = DwcFiles.fromLocation(tmpDir.toPath());
128
129    Record rec = arch.getCore().iterator().next();
130    assertEquals("246daa62-6fce-448f-88b4-94b0ccc89cf1", rec.id());
131  }
132
133  /**
134   * Testing CSV with optional quotes
135   */
136  @Test
137  public void testCsvOptionalQuotes() throws UnsupportedArchiveException, IOException {
138    File csv = FileUtils.getClasspathFile("csv_optional_quotes_excel2008CSV.csv");
139    Archive arch = DwcFiles.fromLocation(csv.toPath());
140    String[] ids = {"1", "2", "3", "4"};
141    String[] scinames = {"Gadus morhua", "Abies alba", "Pomatoma saltatrix", "Yikes ofcourses"};
142    String[] localities =
143      {"This has a, comma", "I say this is only a \"quote\"", "What though, \"if you have a quote\" and a comma",
144        "What, if we have a \"quote, which has a comma, or 2\""};
145    int row = 0;
146    for (Record rec : arch.getCore()) {
147      assertEquals(ids[row], rec.id());
148      assertEquals(scinames[row], rec.value(DwcTerm.scientificName));
149      assertEquals(localities[row], rec.value(DwcTerm.locality));
150      row++;
151    }
152  }
153
154  /**
155   * Test IPT bug 2158
156   *
157   * @see <a href="http://code.google.com/p/gbif-providertoolkit/source/detail?r=2158">IPT revision 2158</a>
158   */
159  @Test
160  public void testIssue2158() throws UnsupportedArchiveException, IOException {
161    // test zip with 1 extension file
162    File zip = FileUtils.getClasspathFile("archive-tax.zip");
163    File tmpDir = Files.createTempDirectory("dwca-io-test").toFile();
164    CompressionUtil.decompressFile(tmpDir, zip);
165    // read archive from this tmp dir
166    Archive arch = DwcFiles.fromLocation(tmpDir.toPath());
167    assertNotNull(arch.getCore().getId());
168    assertEquals(1, arch.getExtensions().size());
169
170    boolean found = false;
171    for (Record rec : arch.getCore()) {
172      if ("113775".equals(rec.id())) {
173        found = true;
174        assertEquals(
175          "Ehrenberg, 1832, in Hemprich and Ehrenberg, Symbolæ Phisicæ Mammalia, 2: ftn. 1 (last page of fascicle headed \"Herpestes leucurus H. E.\").",
176          rec.value(DwcTerm.originalNameUsageID));
177      }
178    }
179    assertTrue(found);
180  }
181
182  /**
183   * The pensoft archive http://pensoft.net/dwc/bdj/checklist_980.zip
184   * contains empty extension files which caused NPE in the dwca reader.
185   */
186  @Test
187  public void testExtensionNPE() throws UnsupportedArchiveException, IOException {
188    File zip = FileUtils.getClasspathFile("checklist_980.zip");
189    File tmpDir = Files.createTempDirectory("dwca-io-test").toFile();
190    CompressionUtil.decompressFile(tmpDir, zip);
191    // read archive from this tmp dir
192    Archive arch = DwcFiles.fromLocation(tmpDir.toPath());
193    assertNotNull(arch.getCore().getId());
194    assertEquals(3, arch.getExtensions().size());
195
196    boolean found = false;
197    for (StarRecord rec : arch) {
198      if ("980-sp10".equals(rec.core().id())) {
199        found = true;
200      }
201    }
202    assertTrue(found);
203  }
204
205  /**
206   * Test extension sorting verifying that all core records do have the right number of extension records attached
207   * when using the star record iterator.
208   */
209  @Test
210  public void testStarIteratorExtRecords() throws Exception {
211    File zip = FileUtils.getClasspathFile("checklist_980.zip");
212    File tmpDir = Files.createTempDirectory("dwca-io-test").toFile();
213    // read archive from this tmp dir
214    Archive arch = DwcFiles.fromCompressed(zip.toPath(), tmpDir.toPath());
215    int counter = 0;
216    int occCounter = 0;
217    Set<String> ids = new HashSet<>();
218    for (StarRecord rec : arch) {
219      counter++;
220      ids.add(rec.core().id());
221      List<Record> occs = rec.extension(DwcTerm.Occurrence);
222      occCounter += occs.size();
223    }
224    assertEquals(356, counter, "Core taxon file has 356 records");
225    assertEquals(356, ids.size(), "Core taxon file has 356 unique ids");
226
227    // read extension file on its own and extract core ids to be cross checked with core id set
228    File file = arch.getExtension(DwcTerm.Occurrence).getFirstLocationFile();
229
230    TabularDataFileReader<List<String>> occReader = TabularFiles.newTabularFileReader(
231        Files.newBufferedReader(file.toPath(), StandardCharsets.UTF_8),
232          ';', "\n", null,true, 0);
233
234    int occCounter2 = 0;
235    List<String> rec;
236    while ((rec = occReader.read()) != null) {
237      String id = rec.get(1);
238      occCounter2++;
239      assertTrue(ids.contains(id), "Occurrence coreid " + id + " not existing");
240    }
241    assertEquals(740, occCounter2, "Occurrence extension file has 740 records");
242    assertEquals(740, occCounter, "Occurrence start extensions should be 740 records");
243  }
244
245  /**
246   * Identifier not set properly when reading single csv file
247   * the csv file attached is a utf16 little endian encoded file.
248   * This encoding is known to cause problems and not supported.
249   * If you look at the detected concept terms you will find that there is NO concept at all detected because of the
250   * wrong character encoding used (the factory tries it with utf8).
251   *
252   * @see <a href="http://code.google.com/p/darwincore/issues/detail?id=78">Issue 78</a>
253   */
254  @Test
255  public void testIssue78() throws IOException, UnsupportedArchiveException {
256    // test folder with single text file in
257    Archive arch = DwcFiles.fromLocation(FileUtils.getClasspathFile("MOBOTDarwinCore.csv").toPath());
258    assertNotNull(arch.getCore());
259    assertNotNull(arch.getCore().getId());
260    assertEquals(DwcTerm.occurrenceID, arch.getCore().getId().getTerm());
261    assertNotNull(arch.getCore().getRowType());
262    assertEquals(DwcTerm.Occurrence, arch.getCore().getRowType());
263    assertTrue(arch.getCore().hasTerm(DwcTerm.occurrenceID));
264    assertTrue(arch.getCore().hasTerm(DwcTerm.scientificName));
265    assertEquals("UTF-8", arch.getCore().getEncoding());
266
267    int i = 0;
268    for (Record rec : arch.getCore()) {
269      i++;
270      assertEquals(rec.id(), "MO:Tropicos:" + i);
271    }
272    assertEquals(3, i);
273  }
274
275  @Test
276  public void testOpenArchive() throws IOException, UnsupportedArchiveException {
277    // test proper archive
278    Archive arch = DwcFiles.fromLocation(FileUtils.getClasspathFile("archive-dwc").toPath());
279    assertNotNull(arch.getCore());
280    assertNotNull(arch.getCore().getId());
281    assertNotNull(arch.getCore().getId());
282    assertTrue(arch.getCore().hasTerm(DwcTerm.scientificName));
283    assertEquals(2, arch.getExtensions().size());
284    assertEquals("DarwinCore.txt", arch.getCore().getFirstLocation());
285
286    // test meta.xml with xml entities as attribute values
287    arch = DwcFiles.fromLocation(FileUtils.getClasspathFile("xml-entity-meta").toPath());
288    assertNotNull(arch.getCore());
289    assertNotNull(arch.getCore().getId());
290    assertEquals(new Character('"'), arch.getCore().getFieldsEnclosedBy());
291    assertEquals("test", arch.getCore().getFirstLocation());
292
293    // test direct pointer to core data file (with taxonID, meaning it has dwc:Taxon rowType)
294    arch = DwcFiles.fromLocation(FileUtils.getClasspathFile("archive-dwc/DarwinCore.txt").toPath());
295    assertNotNull(arch.getCore());
296    assertNotNull(arch.getCore().getId());
297    assertEquals(DwcTerm.taxonID, arch.getCore().getId().getTerm());
298    assertNotNull(arch.getCore().getRowType());
299    assertEquals(DwcTerm.Taxon, arch.getCore().getRowType());
300    assertTrue(arch.getCore().hasTerm(DwcTerm.scientificName));
301    assertEquals(0, arch.getExtensions().size());
302    Iterator<StarRecord> dwci = arch.iterator();
303    StarRecord star = dwci.next();
304    assertEquals("Globicephala melaena melaena Traill", star.core().value(DwcTerm.scientificName));
305    assertEquals(DwcLayout.FILE_ROOT, arch.getDwcLayout());
306
307    // test folder with single text file in (with taxonID, meaning it has dwc:Taxon rowType)
308    arch = DwcFiles.fromLocation(FileUtils.getClasspathFile("dwca").toPath());
309    assertNotNull(arch.getCore());
310    assertNotNull(arch.getCore().getId());
311    assertEquals(DwcTerm.taxonID, arch.getCore().getId().getTerm());
312    assertNotNull(arch.getCore().getRowType());
313    assertEquals(DwcTerm.Taxon, arch.getCore().getRowType());
314    assertTrue(arch.getCore().hasTerm(DwcTerm.scientificName));
315    assertTrue(arch.getCore().hasTerm(DwcTerm.taxonID));
316    assertEquals(0, arch.getExtensions().size());
317    dwci = arch.iterator();
318    star = dwci.next();
319    assertEquals("Globicephala melaena melaena Traill", star.core().value(DwcTerm.scientificName));
320    assertEquals("1559060", star.core().value(DwcTerm.taxonID));
321    assertEquals("DarwinCore.txt", arch.getCore().getFirstLocation());
322    assertEquals(DwcLayout.DIRECTORY_ROOT, arch.getDwcLayout());
323  }
324
325  @Test
326  public void testOpenSmallArchiveWithEmptyLines() throws IOException, UnsupportedArchiveException {
327    // test folder with single text file in
328    Archive arch = DwcFiles.fromLocation(FileUtils.getClasspathFile("empty_line.tab").toPath());
329    assertNotNull(arch.getCore());
330    assertNotNull(arch.getCore().getId());
331    assertTrue(arch.getCore().hasTerm(DwcTerm.scientificName));
332    assertEquals(0, arch.getExtensions().size());
333    Iterator<StarRecord> dwci = arch.iterator();
334    StarRecord star = dwci.next();
335    dwci.next();
336    dwci.next();
337    dwci.next();
338    star = dwci.next();
339    assertEquals("Delphinus delphis var. delphis", star.core().value(DwcTerm.scientificName));
340    int i = 0;
341    for (StarRecord rec : arch) {
342      i++;
343      if (i > 20) {
344        break;
345      }
346    }
347    assertEquals(6, i);
348  }
349
350  /**
351   * Test bug 77.
352   *
353   * @see <a href="http://code.google.com/p/darwincore/issues/detail?id=77">Issue 77</a>
354   */
355  @Test
356  public void testQuotedHeaders() throws IOException, UnsupportedArchiveException {
357    // test folder with single text file in
358    Archive arch = DwcFiles.fromLocation(FileUtils.getClasspathFile("quoted_headers_MOBOTDarwinCore.csv").toPath());
359    assertNotNull(arch.getCore());
360    assertNotNull(arch.getCore().getId());
361    assertTrue(arch.getCore().hasTerm(DwcTerm.occurrenceID));
362    assertTrue(arch.getCore().hasTerm(DwcTerm.catalogNumber));
363    assertTrue(arch.getCore().hasTerm(DwcTerm.institutionCode));
364    assertTrue(arch.getCore().hasTerm(DwcTerm.basisOfRecord));
365    assertTrue(arch.getCore().hasTerm(DwcTerm.scientificName));
366    assertTrue(arch.getCore().hasTerm(DwcTerm.maximumElevationInMeters));
367    assertTrue(arch.getCore().hasTerm(DcTerm.references));
368
369    int i = 0;
370    for (Record rec : arch.getCore()) {
371      i++;
372      assertEquals(rec.id(), "MO:Tropicos:" + i);
373    }
374    assertEquals(2, i);
375  }
376
377
378  @Test
379  public void testTab() throws UnsupportedArchiveException, IOException {
380    File tab = FileUtils.getClasspathFile("dwca/DarwinCore.txt");
381    // read archive from this tmp dir
382    Archive arch = DwcFiles.fromLocation(tab.toPath());
383
384    boolean found = false;
385    int count = 0;
386    for (Record rec : arch.getCore()) {
387      count++;
388      if ("1559060".equals(rec.id())) {
389        found = true;
390        assertEquals("Globicephala melaena melaena Traill", rec.value(DwcTerm.scientificName));
391        assertEquals("Hershkovitz, P., Catalog of Living Whales, Smithsonian Institution, Bulletin 246, 1966, p. 91",
392          rec.value(DwcTerm.nameAccordingTo));
393        assertEquals("105849", rec.value(DwcTerm.parentNameUsageID));
394      }
395    }
396    assertEquals(1, arch.getCore().getIgnoreHeaderLines());
397    assertEquals(0, arch.getExtensions().size());
398    assertEquals(24, count);
399    assertTrue(found);
400  }
401
402  /**
403   * Test reading a single-file Simple Darwin Core Archive.
404   */
405  @Test
406  public void testSimpleDWCA() throws UnsupportedArchiveException, IOException {
407    File tab = FileUtils.getClasspathFile("issues/Borza.txt");
408    // Read single-file DWC "archive".
409    Archive arch = DwcFiles.fromLocation(tab.toPath());
410
411    // File is not in default encoding.
412    arch.getCore().setEncoding(StandardCharsets.ISO_8859_1.name());
413
414    boolean found = false;
415    int count = 0;
416    for (Record rec : arch.getCore()) {
417      count++;
418      if (count == 1) {
419        //1 Borza:Corophiidae:1 Borza Borza:Corophiidae Corophiidae 1   Animalia  Arthropoda  Malacostraca  Amphipoda Corophiidae Chelicorophium    sowinskyi   Chelicorophium sowinskyi  "(Martynov, 1924)"  species   Péter Borza   Europe  Danube  Hungary     47.788111 18.960944           Preserved Specimen  1917-07-17  1917  7 17      Unger E
420        assertEquals("1", rec.id());
421        assertEquals("Chelicorophium sowinskyi", rec.value(DwcTerm.scientificName));
422        // we do detect optional quotation in tab files...
423        assertEquals("(Martynov, 1924)", rec.value(DwcTerm.scientificNameAuthorship));
424        assertEquals("47.788111", rec.value(DwcTerm.decimalLatitude));
425        assertEquals("18.960944", rec.value(DwcTerm.decimalLongitude));
426      }
427      if ("173".equals(rec.id())) {
428        found = true;
429        assertEquals("Chelicorophium curvispinum", rec.value(DwcTerm.scientificName));
430        assertEquals("(G. O. Sars, 1895)", rec.value(DwcTerm.scientificNameAuthorship));
431        assertEquals("47.965166", rec.value(DwcTerm.decimalLatitude));
432        assertEquals("17.304666", rec.value(DwcTerm.decimalLongitude));
433      }
434    }
435    assertEquals(1, arch.getCore().getIgnoreHeaderLines());
436    assertEquals(435, count);
437    assertTrue(found);
438  }
439
440  @Test
441  public void testTabEol() throws UnsupportedArchiveException, IOException {
442    File tab = FileUtils.getClasspathFile("issues/eol/my_darwincore.txt");
443    // read archive from this tmp dir
444    Archive arch = DwcFiles.fromLocation(tab.toPath());
445
446    boolean found = false;
447    int count = 0;
448    for (Record rec : arch.getCore()) {
449      count++;
450      if (count == 1) {
451        //1 Borza:Corophiidae:1 Borza Borza:Corophiidae Corophiidae 1   Animalia  Arthropoda  Malacostraca  Amphipoda Corophiidae Chelicorophium    sowinskyi   Chelicorophium sowinskyi  "(Martynov, 1924)"  species   Péter Borza   Europe  Danube  Hungary     47.788111 18.960944           Preserved Specimen  1917-07-17  1917  7 17      Unger E
452        assertEquals("1", rec.id());
453        assertEquals("gadus morhua", rec.value(DwcTerm.scientificName));
454        assertEquals("gadidae", rec.value(DwcTerm.family));
455      } else if ("2".equals(rec.id())) {
456        found = true;
457        assertEquals("chanos chanos", rec.value(DwcTerm.scientificName));
458        assertEquals("channidae", rec.value(DwcTerm.family));
459      } else {
460        assertEquals("3", rec.id());
461        assertEquals("mola mola", rec.value(DwcTerm.scientificName));
462        assertEquals("familyx", rec.value(DwcTerm.family));
463      }
464    }
465    assertEquals(1, arch.getCore().getIgnoreHeaderLines());
466    assertEquals(3, count);
467    assertTrue(found);
468  }
469
470  /**
471   * Test extracting a CSV file containing embedded JSON, which itself contains escaped quotes.
472   *
473   * JSON value: { "test": "value, \"like\" this" }
474   *
475   * As a column in CSV: "{ ""test"": ""value, \""like\"" this"" }"
476   */
477  @Test
478  public void testCsvJsonEscapedQuotes() throws UnsupportedArchiveException, IOException {
479    Archive arch = DwcFiles.fromLocation(FileUtils.getClasspathFile("issues/csv-json-escaped-quotes").toPath());
480
481    arch.initialize();
482
483    arch.validate();
484
485    // Archive only has one record.
486    Record rec = arch.getCore().iterator().next();
487
488    assertEquals("779", rec.id());
489    assertEquals("Cambridge, Cambridge", rec.value(DwcTerm.locality));
490    // Without the Java escapes: {"chronostratigraphy": "Cretaceous, Early Cretaceous, Albian - Late Cretaceous, Cenomanian", "cataloguedescription": "Very worn vertebra. Old catalogue says \"fragments of bone\".", "created": "2009-05-13", "barcode": "010039076", "project": "eMesozoic", "determinationnames": "Ornithocheirus", "subdepartment": "Vertebrates", "lithostratigraphy": "Selborne Group, Upper Greensand Formation, Cambridge Greensand Member", "imagecategory": ["Register;Specimen"]}
491    assertEquals("{\"chronostratigraphy\": \"Cretaceous, Early Cretaceous, Albian - Late Cretaceous, Cenomanian\", \"cataloguedescription\": \"Very worn vertebra. Old catalogue says \\\"fragments of bone\\\".\", \"created\": \"2009-05-13\", \"barcode\": \"010039076\", \"project\": \"eMesozoic\", \"determinationnames\": \"Ornithocheirus\", \"subdepartment\": \"Vertebrates\", \"lithostratigraphy\": \"Selborne Group, Upper Greensand Formation, Cambridge Greensand Member\", \"imagecategory\": [\"Register;Specimen\"]}", rec.value(DwcTerm.dynamicProperties));
492  }
493
494  /**
495   * Ensure that extensions are just skipped for archives that do not have the core id in the mapped extension.
496   * https://code.google.com/p/darwincore/issues/detail?id=232
497   */
498  @Test
499  public void testNullCoreID() throws IOException {
500    try {
501      File tmpDir = Files.createTempDirectory("dwca-io-test").toFile();
502      tmpDir.deleteOnExit();
503
504      Archive archive = DwcFiles.fromCompressed(FileUtils.getClasspathFile("nullCoreID.zip").toPath(), tmpDir.toPath());
505      Iterator<StarRecord> iter = archive.iterator();
506      while (iter.hasNext()) {
507        iter.next();
508      }
509    } catch (UnsupportedArchiveException e) {
510      fail("Extensions with no core IDs should be ignored");
511    }
512  }
513
514  /**
515   * Test opening a single data file with both eventID column, meaning it has dwc:Event rowType.
516   */
517  @Test
518  public void testOpenArchiveForEventCore() throws IOException, UnsupportedArchiveException {
519    Archive arch = DwcFiles.fromLocation(FileUtils.getClasspathFile("event.txt").toPath());
520    assertNotNull(arch.getCore());
521    assertNotNull(arch.getCore().getId());
522    assertEquals(DwcTerm.eventID, arch.getCore().getId().getTerm());
523    assertNotNull(arch.getCore().getRowType());
524    assertEquals(DwcTerm.Event, arch.getCore().getRowType());
525    assertTrue(arch.getCore().hasTerm(DwcTerm.samplingProtocol));
526    assertEquals(0, arch.getExtensions().size());
527    Iterator<StarRecord> dwci = arch.iterator();
528    StarRecord star = dwci.next();
529    assertEquals("Aubach above Wiesthal", star.core().value(DwcTerm.locality));
530    assertEquals(DwcLayout.FILE_ROOT, arch.getDwcLayout());
531  }
532
533  /**
534   * Test opening a single data file with a generic ID column and an eventID column meaning the Archive's ID-term
535   * gets set to (DwcTerm.eventID and its rowType gets set to DwcTerm.Event.
536   */
537  @Test
538  public void testOpenArchiveForGenericCore() throws IOException, UnsupportedArchiveException {
539    Archive arch = DwcFiles.fromLocation(FileUtils.getClasspathFile("event-plus-id.txt").toPath());
540    assertNotNull(arch.getCore());
541    assertNotNull(arch.getCore().getId());
542    assertEquals(DwcTerm.eventID, arch.getCore().getId().getTerm());
543    assertEquals(DwcTerm.Event, arch.getCore().getRowType());
544    assertTrue(arch.getCore().hasTerm(DwcTerm.samplingProtocol));
545    assertEquals(0, arch.getExtensions().size());
546    Iterator<StarRecord> dwci = arch.iterator();
547    StarRecord star = dwci.next();
548    assertEquals("Aubach above Wiesthal", star.core().value(DwcTerm.locality));
549    assertEquals(DwcLayout.FILE_ROOT, arch.getDwcLayout());
550  }
551
552  /**
553   * Basic validation of archives, where we rely on falling back to defaults from the DWC-A metadata schema.
554   */
555  @Test
556  public void testFallbackToDefaultsArchives() throws IOException {
557    try {
558      Archive arch = DwcFiles.fromLocation(FileUtils.getClasspathFile("defaults/meta-file-encoding-missing").toPath());
559      assertEquals("UTF-8", arch.getCore().getEncoding());
560    } catch (UnsupportedArchiveException e) {
561      fail("Core file encoding defaults to UTF-8 if missing in meta.xml.");
562    }
563
564    try {
565      Archive arch = DwcFiles.fromLocation(FileUtils.getClasspathFile("defaults/extension-encoding-missing").toPath());
566      assertEquals("UTF-8", arch.getExtension(GbifTerm.Multimedia).getEncoding());
567    } catch (UnsupportedArchiveException e) {
568      fail("Extension file encoding defaults to UTF-8 if missing in meta.xml.");
569    }
570  }
571
572  /**
573   * Basic validation of archives, that the declared files exist and have basic, valid structure.
574   */
575  @Test
576  public void testInvalidArchives() {
577    // Simple archive problems
578    assertThrows(
579        UnsupportedArchiveException.class,
580        () -> DwcFiles.fromLocation(FileUtils.getClasspathFile("invalid/empty").toPath()),
581        "Empty archive should not be opened.");
582
583    assertThrows(
584        UnsupportedArchiveException.class,
585        () -> DwcFiles.fromLocation(FileUtils.getClasspathFile("invalid/meta-file-location-missing").toPath()),
586        "Archive with missing file location in meta.xml should not be opened.");
587
588    // Extension archive problems
589    assertThrows(
590        UnsupportedArchiveException.class,
591        () -> DwcFiles.fromLocation(FileUtils.getClasspathFile("invalid/extension-missing").toPath()),
592        "Archive with missing extension file should not be opened.");
593
594    assertThrows(
595        UnsupportedArchiveException.class,
596        () -> DwcFiles.fromLocation(FileUtils.getClasspathFile("invalid/extension-location-missing").toPath()),
597        "Archive with missing extension file location in meta.xml should not be opened."
598    );
599
600    assertThrows(
601        UnsupportedArchiveException.class,
602        () -> DwcFiles.fromLocation(FileUtils.getClasspathFile("invalid/extension-core-id-missing").toPath()),
603        "Archive with extension lacking coreid in meta.xml should not be opened."
604    );
605
606    assertThrows(
607        UnsupportedArchiveException.class,
608        () -> DwcFiles.fromLocation(FileUtils.getClasspathFile("invalid/extension-id-missing").toPath()),
609        "Archive with extension and core missing id in meta.xml should not be opened."
610    );
611  }
612}