/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.gbif.dwca.io;

import org.gbif.dwc.Archive;
import org.gbif.dwc.DwcFiles;
import org.gbif.dwc.DwcLayout;
import org.gbif.dwc.UnsupportedArchiveException;
import org.gbif.dwc.meta.DwcMetaFiles;
import org.gbif.dwc.record.Record;
import org.gbif.dwc.record.StarRecord;
import org.gbif.dwc.terms.DcTerm;
import org.gbif.dwc.terms.DwcTerm;
import org.gbif.dwc.terms.GbifTerm;
import org.gbif.utils.file.CompressionUtil;
import org.gbif.utils.file.FileUtils;
import org.gbif.utils.file.tabular.TabularDataFileReader;
import org.gbif.utils.file.tabular.TabularFiles;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;

/**
 * Tests opening Darwin Core archives (folders, zips and single data files) through {@link DwcFiles}
 * and reading meta descriptors through {@link DwcMetaFiles}.
 */
public class ArchiveFactoryTest {

  /**
   * Creates a fresh temporary directory for a test to decompress an archive into.
   * <p>
   * The directory is registered with {@link File#deleteOnExit()}; that is best effort only, as the JVM
   * can only remove the directory at exit if it is empty by then.
   *
   * @return the newly created temporary directory
   * @throws IOException if the directory cannot be created
   */
  private static File newTempDir() throws IOException {
    File tmpDir = Files.createTempDirectory("dwca-io-test").toFile();
    tmpDir.deleteOnExit();
    return tmpDir;
  }

  /**
   * Iterates over all core records of the archive, asserting each one is non-null and that the total
   * number of records matches.
   *
   * @param arch            archive whose core file is read
   * @param expectedRecords number of core records the iteration must yield
   */
  private void assertNumberOfCoreRecords(Archive arch, int expectedRecords) {
    int rows = 0;
    for (Record rec : arch.getCore()) {
      assertNotNull(rec);
      rows++;
    }
    assertEquals(expectedRecords, rows);
  }

  /**
   * Verifies that several meta.xml descriptors (including a UTF-16LE encoded one and one using XML
   * entities) can be parsed without error.
   */
  @Test
  public void testMetaHandlerUtf16le() throws Exception {
    for (String fn : new String[]{"/meta/meta.xml", "/meta-utf16le.xml", "/xml-entity-meta/meta.xml"}) {
      // close every descriptor stream, and fail clearly when a test resource is missing
      try (InputStream is = getClass().getResourceAsStream(fn)) {
        assertNotNull(is, "Missing test resource " + fn);
        assertNotNull(DwcMetaFiles.fromMetaDescriptor(is), "No archive parsed from " + fn);
      }
    }
  }

  @Test
  public void testCoreRecords() throws IOException {
    // note that we don't read a DWC archive, but only test the csvreader!
    // we therefore do not detect header rows and count *all* rows instead

    assertNumberOfCoreRecords(DwcFiles.fromLocation(FileUtils.getClasspathFile("iucn100.tab.txt").toPath()), 99);
    assertNumberOfCoreRecords(DwcFiles.fromLocation(FileUtils.getClasspathFile("iucn100.pipe.txt").toPath()), 99);
    assertNumberOfCoreRecords(DwcFiles.fromLocation(FileUtils.getClasspathFile("iucn100.csv").toPath()), 99);
    assertNumberOfCoreRecords(DwcFiles.fromLocation(FileUtils.getClasspathFile("csv_quoted-unquoted_headers.csv").toPath()),
      3);
    assertNumberOfCoreRecords(DwcFiles.fromLocation(FileUtils.getClasspathFile("csv_incl_single_quotes.csv").toPath()), 3);
  }

  /**
   * Test dwca-reader bug 83
   *
   * @see <a href="http://code.google.com/p/darwincore/issues/detail?id=83">Issue 83</a>
   */
  @Test
  public void testCsv() throws UnsupportedArchiveException, IOException {
    File csv = FileUtils.getClasspathFile("csv_always_quoted.csv");
    // open the single CSV file directly as an archive
    Archive arch = DwcFiles.fromLocation(csv.toPath());

    boolean found = false;
    for (Record rec : arch.getCore()) {
      if ("ENNH0192".equals(rec.id())) {
        found = true;
        assertEquals("Martins Wood, Ightham", rec.value(DwcTerm.locality));
      }
    }
    assertTrue(found);
  }

  /**
   * Test GNUB style dwca with a single tab delimited file that has a .tab suffix.
   */
  @Test
  public void testGnubTab() throws UnsupportedArchiveException, IOException {
    File tab = FileUtils.getClasspathFile("gnub.tab");
    // open the single tab file directly as an archive
    Archive arch = DwcFiles.fromLocation(tab.toPath());

    Record rec = arch.getCore().iterator().next();
    assertEquals("246daa62-6fce-448f-88b4-94b0ccc89cf1", rec.id());
  }

  /**
   * Test GNUB style dwca with a single tab delimited file that has a .tab suffix,
   * delivered inside a zip that is decompressed into a temporary directory first.
   */
  @Test
  public void testGnubTabZip() throws UnsupportedArchiveException, IOException {
    // test GNUB zip with 1 data file
    File tmpDir = newTempDir();
    File zip = FileUtils.getClasspathFile("gnub.tab.zip");
    CompressionUtil.decompressFile(tmpDir, zip);

    // read archive from this tmp dir
    Archive arch = DwcFiles.fromLocation(tmpDir.toPath());

    Record rec = arch.getCore().iterator().next();
    assertEquals("246daa62-6fce-448f-88b4-94b0ccc89cf1", rec.id());
  }

  /**
   * Testing CSV with optional quotes
   */
  @Test
  public void testCsvOptionalQuotes() throws UnsupportedArchiveException, IOException {
    File csv = FileUtils.getClasspathFile("csv_optional_quotes_excel2008CSV.csv");
    Archive arch = DwcFiles.fromLocation(csv.toPath());
    String[] ids = {"1", "2", "3", "4"};
    String[] scinames = {"Gadus morhua", "Abies alba", "Pomatoma saltatrix", "Yikes ofcourses"};
    String[] localities =
      {"This has a, comma", "I say this is only a \"quote\"", "What though, \"if you have a quote\" and a comma",
        "What, if we have a \"quote, which has a comma, or 2\""};
    int row = 0;
    for (Record rec : arch.getCore()) {
      assertEquals(ids[row], rec.id());
      assertEquals(scinames[row], rec.value(DwcTerm.scientificName));
      assertEquals(localities[row], rec.value(DwcTerm.locality));
      row++;
    }
    // without this the test would pass vacuously if no (or too few) records were read
    assertEquals(ids.length, row, "Unexpected number of records read");
  }

  /**
   * Test IPT bug 2158
   *
   * @see <a href="http://code.google.com/p/gbif-providertoolkit/source/detail?r=2158">IPT revision 2158</a>
   */
  @Test
  public void testIssue2158() throws UnsupportedArchiveException, IOException {
    // test zip with 1 extension file
    File zip = FileUtils.getClasspathFile("archive-tax.zip");
    File tmpDir = newTempDir();
    CompressionUtil.decompressFile(tmpDir, zip);
    // read archive from this tmp dir
    Archive arch = DwcFiles.fromLocation(tmpDir.toPath());
    assertNotNull(arch.getCore().getId());
    assertEquals(1, arch.getExtensions().size());

    boolean found = false;
    for (Record rec : arch.getCore()) {
      if ("113775".equals(rec.id())) {
        found = true;
        assertEquals(
          "Ehrenberg, 1832, in Hemprich and Ehrenberg, Symbolæ Phisicæ Mammalia, 2: ftn. 1 (last page of fascicle headed \"Herpestes leucurus H. E.\").",
          rec.value(DwcTerm.originalNameUsageID));
      }
    }
    assertTrue(found);
  }

  /**
   * The pensoft archive http://pensoft.net/dwc/bdj/checklist_980.zip
   * contains empty extension files which caused NPE in the dwca reader.
   */
  @Test
  public void testExtensionNPE() throws UnsupportedArchiveException, IOException {
    File zip = FileUtils.getClasspathFile("checklist_980.zip");
    File tmpDir = newTempDir();
    CompressionUtil.decompressFile(tmpDir, zip);
    // read archive from this tmp dir
    Archive arch = DwcFiles.fromLocation(tmpDir.toPath());
    assertNotNull(arch.getCore().getId());
    assertEquals(3, arch.getExtensions().size());

    boolean found = false;
    for (StarRecord rec : arch) {
      if ("980-sp10".equals(rec.core().id())) {
        found = true;
      }
    }
    assertTrue(found);
  }

  /**
   * Test extension sorting verifying that all core records do have the right number of extension records attached
   * when using the star record iterator.
   */
  @Test
  public void testStarIteratorExtRecords() throws Exception {
    File zip = FileUtils.getClasspathFile("checklist_980.zip");
    File tmpDir = newTempDir();
    // decompress the zip into the tmp dir and read the archive from there
    Archive arch = DwcFiles.fromCompressed(zip.toPath(), tmpDir.toPath());
    int counter = 0;
    int occCounter = 0;
    Set<String> ids = new HashSet<>();
    for (StarRecord rec : arch) {
      counter++;
      ids.add(rec.core().id());
      List<Record> occs = rec.extension(DwcTerm.Occurrence);
      occCounter += occs.size();
    }
    assertEquals(356, counter, "Core taxon file has 356 records");
    assertEquals(356, ids.size(), "Core taxon file has 356 unique ids");

    // read extension file on its own and extract core ids to be cross checked with core id set
    File file = arch.getExtension(DwcTerm.Occurrence).getFirstLocationFile();

    int occCounter2 = 0;
    // the reader (and the buffered reader it wraps) is closed once the file has been consumed
    try (TabularDataFileReader<List<String>> occReader = TabularFiles.newTabularFileReader(
      Files.newBufferedReader(file.toPath(), StandardCharsets.UTF_8),
      ';', "\n", null, true, 0)) {
      List<String> row;
      while ((row = occReader.read()) != null) {
        // the second column is checked against the set of core ids collected above
        String id = row.get(1);
        occCounter2++;
        assertTrue(ids.contains(id), "Occurrence coreid " + id + " not existing");
      }
    }
    assertEquals(740, occCounter2, "Occurrence extension file has 740 records");
    assertEquals(740, occCounter, "Occurrence start extensions should be 740 records");
  }

  /**
   * Identifier not set properly when reading single csv file
   * the csv file attached is a utf16 little endian encoded file.
   * This encoding is known to cause problems and not supported.
   * If you look at the detected concept terms you will find that there is NO concept at all detected because of the
   * wrong character encoding used (the factory tries it with utf8).
   *
   * @see <a href="http://code.google.com/p/darwincore/issues/detail?id=78">Issue 78</a>
   */
  @Test
  public void testIssue78() throws IOException, UnsupportedArchiveException {
    // test single text file opened directly
    Archive arch = DwcFiles.fromLocation(FileUtils.getClasspathFile("MOBOTDarwinCore.csv").toPath());
    assertNotNull(arch.getCore());
    assertNotNull(arch.getCore().getId());
    assertEquals(DwcTerm.occurrenceID, arch.getCore().getId().getTerm());
    assertNotNull(arch.getCore().getRowType());
    assertEquals(DwcTerm.Occurrence, arch.getCore().getRowType());
    assertTrue(arch.getCore().hasTerm(DwcTerm.occurrenceID));
    assertTrue(arch.getCore().hasTerm(DwcTerm.scientificName));
    assertEquals("UTF-8", arch.getCore().getEncoding());

    int i = 0;
    for (Record rec : arch.getCore()) {
      i++;
      // expected value first, so a failure message reports expected/actual the right way round
      assertEquals("MO:Tropicos:" + i, rec.id());
    }
    assertEquals(3, i);
  }

  /**
   * Opens the same kind of data in four layouts (archive folder with meta.xml, meta.xml with XML entities,
   * direct pointer to a data file, folder with a single data file) and checks core id, row type, terms,
   * extensions and detected layout.
   */
  @Test
  public void testOpenArchive() throws IOException, UnsupportedArchiveException {
    // test proper archive
    Archive arch = DwcFiles.fromLocation(FileUtils.getClasspathFile("archive-dwc").toPath());
    assertNotNull(arch.getCore());
    assertNotNull(arch.getCore().getId());
    assertTrue(arch.getCore().hasTerm(DwcTerm.scientificName));
    assertEquals(2, arch.getExtensions().size());
    assertEquals("DarwinCore.txt", arch.getCore().getFirstLocation());

    // test meta.xml with xml entities as attribute values
    arch = DwcFiles.fromLocation(FileUtils.getClasspathFile("xml-entity-meta").toPath());
    assertNotNull(arch.getCore());
    assertNotNull(arch.getCore().getId());
    // Character.valueOf replaces the deprecated Character(char) constructor
    assertEquals(Character.valueOf('"'), arch.getCore().getFieldsEnclosedBy());
    assertEquals("test", arch.getCore().getFirstLocation());

    // test direct pointer to core data file (with taxonID, meaning it has dwc:Taxon rowType)
    arch = DwcFiles.fromLocation(FileUtils.getClasspathFile("archive-dwc/DarwinCore.txt").toPath());
    assertNotNull(arch.getCore());
    assertNotNull(arch.getCore().getId());
    assertEquals(DwcTerm.taxonID, arch.getCore().getId().getTerm());
    assertNotNull(arch.getCore().getRowType());
    assertEquals(DwcTerm.Taxon, arch.getCore().getRowType());
    assertTrue(arch.getCore().hasTerm(DwcTerm.scientificName));
    assertEquals(0, arch.getExtensions().size());
    Iterator<StarRecord> dwci = arch.iterator();
    StarRecord star = dwci.next();
    assertEquals("Globicephala melaena melaena Traill", star.core().value(DwcTerm.scientificName));
    assertEquals(DwcLayout.FILE_ROOT, arch.getDwcLayout());

    // test folder with single text file in (with taxonID, meaning it has dwc:Taxon rowType)
    arch = DwcFiles.fromLocation(FileUtils.getClasspathFile("dwca").toPath());
    assertNotNull(arch.getCore());
    assertNotNull(arch.getCore().getId());
    assertEquals(DwcTerm.taxonID, arch.getCore().getId().getTerm());
    assertNotNull(arch.getCore().getRowType());
    assertEquals(DwcTerm.Taxon, arch.getCore().getRowType());
    assertTrue(arch.getCore().hasTerm(DwcTerm.scientificName));
    assertTrue(arch.getCore().hasTerm(DwcTerm.taxonID));
    assertEquals(0, arch.getExtensions().size());
    dwci = arch.iterator();
    star = dwci.next();
    assertEquals("Globicephala melaena melaena Traill", star.core().value(DwcTerm.scientificName));
    assertEquals("1559060", star.core().value(DwcTerm.taxonID));
    assertEquals("DarwinCore.txt", arch.getCore().getFirstLocation());
    assertEquals(DwcLayout.DIRECTORY_ROOT, arch.getDwcLayout());
  }

  /**
   * Opens a small tab file and checks the fifth star record as well as the total number of records
   * the archive iterator yields.
   */
  @Test
  public void testOpenSmallArchiveWithEmptyLines() throws IOException, UnsupportedArchiveException {
    // test single text file opened directly
    Archive arch = DwcFiles.fromLocation(FileUtils.getClasspathFile("empty_line.tab").toPath());
    assertNotNull(arch.getCore());
    assertNotNull(arch.getCore().getId());
    assertTrue(arch.getCore().hasTerm(DwcTerm.scientificName));
    assertEquals(0, arch.getExtensions().size());
    Iterator<StarRecord> dwci = arch.iterator();
    // skip the first four records, the fifth one is the record under test
    dwci.next();
    dwci.next();
    dwci.next();
    dwci.next();
    StarRecord star = dwci.next();
    assertEquals("Delphinus delphis var. delphis", star.core().value(DwcTerm.scientificName));
    int i = 0;
    for (StarRecord rec : arch) {
      i++;
      // guard against a runaway iterator
      if (i > 20) {
        break;
      }
    }
    assertEquals(6, i);
  }

  /**
   * Test bug 77.
   *
   * @see <a href="http://code.google.com/p/darwincore/issues/detail?id=77">Issue 77</a>
   */
  @Test
  public void testQuotedHeaders() throws IOException, UnsupportedArchiveException {
    // test single text file opened directly
    Archive arch = DwcFiles.fromLocation(FileUtils.getClasspathFile("quoted_headers_MOBOTDarwinCore.csv").toPath());
    assertNotNull(arch.getCore());
    assertNotNull(arch.getCore().getId());
    assertTrue(arch.getCore().hasTerm(DwcTerm.occurrenceID));
    assertTrue(arch.getCore().hasTerm(DwcTerm.catalogNumber));
    assertTrue(arch.getCore().hasTerm(DwcTerm.institutionCode));
    assertTrue(arch.getCore().hasTerm(DwcTerm.basisOfRecord));
    assertTrue(arch.getCore().hasTerm(DwcTerm.scientificName));
    assertTrue(arch.getCore().hasTerm(DwcTerm.maximumElevationInMeters));
    assertTrue(arch.getCore().hasTerm(DcTerm.references));

    int i = 0;
    for (Record rec : arch.getCore()) {
      i++;
      // expected value first, so a failure message reports expected/actual the right way round
      assertEquals("MO:Tropicos:" + i, rec.id());
    }
    assertEquals(2, i);
  }


  /**
   * Opens a single tab delimited data file directly and checks record count, header handling and the
   * values of one known record.
   */
  @Test
  public void testTab() throws UnsupportedArchiveException, IOException {
    File tab = FileUtils.getClasspathFile("dwca/DarwinCore.txt");
    // open the single tab file directly as an archive
    Archive arch = DwcFiles.fromLocation(tab.toPath());

    boolean found = false;
    int count = 0;
    for (Record rec : arch.getCore()) {
      count++;
      if ("1559060".equals(rec.id())) {
        found = true;
        assertEquals("Globicephala melaena melaena Traill", rec.value(DwcTerm.scientificName));
        assertEquals("Hershkovitz, P., Catalog of Living Whales, Smithsonian Institution, Bulletin 246, 1966, p. 91",
          rec.value(DwcTerm.nameAccordingTo));
        assertEquals("105849", rec.value(DwcTerm.parentNameUsageID));
      }
    }
    assertEquals(1, arch.getCore().getIgnoreHeaderLines());
    assertEquals(0, arch.getExtensions().size());
    assertEquals(24, count);
    assertTrue(found);
  }

  /**
   * Test reading a single-file Simple Darwin Core Archive.
   */
  @Test
  public void testSimpleDWCA() throws UnsupportedArchiveException, IOException {
    File tab = FileUtils.getClasspathFile("issues/Borza.txt");
    // Read single-file DWC "archive".
    Archive arch = DwcFiles.fromLocation(tab.toPath());

    // File is not in default encoding.
    arch.getCore().setEncoding(StandardCharsets.ISO_8859_1.name());

    boolean found = false;
    int count = 0;
    for (Record rec : arch.getCore()) {
      count++;
      if (count == 1) {
        //1 Borza:Corophiidae:1 Borza Borza:Corophiidae Corophiidae 1 Animalia Arthropoda Malacostraca Amphipoda Corophiidae Chelicorophium sowinskyi Chelicorophium sowinskyi "(Martynov, 1924)" species Péter Borza Europe Danube Hungary 47.788111 18.960944 Preserved Specimen 1917-07-17 1917 7 17 Unger E
        assertEquals("1", rec.id());
        assertEquals("Chelicorophium sowinskyi", rec.value(DwcTerm.scientificName));
        // we do detect optional quotation in tab files...
        assertEquals("(Martynov, 1924)", rec.value(DwcTerm.scientificNameAuthorship));
        assertEquals("47.788111", rec.value(DwcTerm.decimalLatitude));
        assertEquals("18.960944", rec.value(DwcTerm.decimalLongitude));
      }
      if ("173".equals(rec.id())) {
        found = true;
        assertEquals("Chelicorophium curvispinum", rec.value(DwcTerm.scientificName));
        assertEquals("(G. O. Sars, 1895)", rec.value(DwcTerm.scientificNameAuthorship));
        assertEquals("47.965166", rec.value(DwcTerm.decimalLatitude));
        assertEquals("17.304666", rec.value(DwcTerm.decimalLongitude));
      }
    }
    assertEquals(1, arch.getCore().getIgnoreHeaderLines());
    assertEquals(435, count);
    assertTrue(found);
  }

  /**
   * Opens a single tab delimited data file with three records and checks every record's id,
   * scientific name and family.
   */
  @Test
  public void testTabEol() throws UnsupportedArchiveException, IOException {
    File tab = FileUtils.getClasspathFile("issues/eol/my_darwincore.txt");
    // open the single tab file directly as an archive
    Archive arch = DwcFiles.fromLocation(tab.toPath());

    boolean found = false;
    int count = 0;
    for (Record rec : arch.getCore()) {
      count++;
      if (count == 1) {
        assertEquals("1", rec.id());
        assertEquals("gadus morhua", rec.value(DwcTerm.scientificName));
        assertEquals("gadidae", rec.value(DwcTerm.family));
      } else if ("2".equals(rec.id())) {
        found = true;
        assertEquals("chanos chanos", rec.value(DwcTerm.scientificName));
        assertEquals("channidae", rec.value(DwcTerm.family));
      } else {
        assertEquals("3", rec.id());
        assertEquals("mola mola", rec.value(DwcTerm.scientificName));
        assertEquals("familyx", rec.value(DwcTerm.family));
      }
    }
    assertEquals(1, arch.getCore().getIgnoreHeaderLines());
    assertEquals(3, count);
    assertTrue(found);
  }

  /**
   * Test extracting a CSV file containing embedded JSON, which itself contains escaped quotes.
   *
   * JSON value: { "test": "value, \"like\" this" }
   *
   * As a column in CSV: "{ ""test"": ""value, \""like\"" this"" }"
   */
  @Test
  public void testCsvJsonEscapedQuotes() throws UnsupportedArchiveException, IOException {
    Archive arch = DwcFiles.fromLocation(FileUtils.getClasspathFile("issues/csv-json-escaped-quotes").toPath());

    arch.initialize();

    arch.validate();

    // Archive only has one record.
    Record rec = arch.getCore().iterator().next();

    assertEquals("779", rec.id());
    assertEquals("Cambridge, Cambridge", rec.value(DwcTerm.locality));
    // Without the Java escapes: {"chronostratigraphy": "Cretaceous, Early Cretaceous, Albian - Late Cretaceous, Cenomanian", "cataloguedescription": "Very worn vertebra. Old catalogue says \"fragments of bone\".", "created": "2009-05-13", "barcode": "010039076", "project": "eMesozoic", "determinationnames": "Ornithocheirus", "subdepartment": "Vertebrates", "lithostratigraphy": "Selborne Group, Upper Greensand Formation, Cambridge Greensand Member", "imagecategory": ["Register;Specimen"]}
    assertEquals("{\"chronostratigraphy\": \"Cretaceous, Early Cretaceous, Albian - Late Cretaceous, Cenomanian\", \"cataloguedescription\": \"Very worn vertebra. Old catalogue says \\\"fragments of bone\\\".\", \"created\": \"2009-05-13\", \"barcode\": \"010039076\", \"project\": \"eMesozoic\", \"determinationnames\": \"Ornithocheirus\", \"subdepartment\": \"Vertebrates\", \"lithostratigraphy\": \"Selborne Group, Upper Greensand Formation, Cambridge Greensand Member\", \"imagecategory\": [\"Register;Specimen\"]}", rec.value(DwcTerm.dynamicProperties));
  }

  /**
   * Ensure that extensions are just skipped for archives that do not have the core id in the mapped extension.
   * https://code.google.com/p/darwincore/issues/detail?id=232
   */
  @Test
  public void testNullCoreID() throws IOException {
    try {
      File tmpDir = newTempDir();

      Archive archive = DwcFiles.fromCompressed(FileUtils.getClasspathFile("nullCoreID.zip").toPath(), tmpDir.toPath());
      Iterator<StarRecord> iter = archive.iterator();
      while (iter.hasNext()) {
        iter.next();
      }
    } catch (UnsupportedArchiveException e) {
      // keep the original exception as cause so the failure report shows why the archive was rejected
      fail("Extensions with no core IDs should be ignored", e);
    }
  }

  /**
   * Test opening a single data file with an eventID column, meaning it has dwc:Event rowType.
   */
  @Test
  public void testOpenArchiveForEventCore() throws IOException, UnsupportedArchiveException {
    Archive arch = DwcFiles.fromLocation(FileUtils.getClasspathFile("event.txt").toPath());
    assertNotNull(arch.getCore());
    assertNotNull(arch.getCore().getId());
    assertEquals(DwcTerm.eventID, arch.getCore().getId().getTerm());
    assertNotNull(arch.getCore().getRowType());
    assertEquals(DwcTerm.Event, arch.getCore().getRowType());
    assertTrue(arch.getCore().hasTerm(DwcTerm.samplingProtocol));
    assertEquals(0, arch.getExtensions().size());
    Iterator<StarRecord> dwci = arch.iterator();
    StarRecord star = dwci.next();
    assertEquals("Aubach above Wiesthal", star.core().value(DwcTerm.locality));
    assertEquals(DwcLayout.FILE_ROOT, arch.getDwcLayout());
  }

  /**
   * Test opening a single data file with a generic ID column and an eventID column, meaning the Archive's ID-term
   * gets set to DwcTerm.eventID and its rowType gets set to DwcTerm.Event.
   */
  @Test
  public void testOpenArchiveForGenericCore() throws IOException, UnsupportedArchiveException {
    Archive arch = DwcFiles.fromLocation(FileUtils.getClasspathFile("event-plus-id.txt").toPath());
    assertNotNull(arch.getCore());
    assertNotNull(arch.getCore().getId());
    assertEquals(DwcTerm.eventID, arch.getCore().getId().getTerm());
    assertEquals(DwcTerm.Event, arch.getCore().getRowType());
    assertTrue(arch.getCore().hasTerm(DwcTerm.samplingProtocol));
    assertEquals(0, arch.getExtensions().size());
    Iterator<StarRecord> dwci = arch.iterator();
    StarRecord star = dwci.next();
    assertEquals("Aubach above Wiesthal", star.core().value(DwcTerm.locality));
    assertEquals(DwcLayout.FILE_ROOT, arch.getDwcLayout());
  }

  /**
   * Basic validation of archives, where we rely on falling back to defaults from the DWC-A metadata schema.
   */
  @Test
  public void testFallbackToDefaultsArchives() throws IOException {
    try {
      Archive arch = DwcFiles.fromLocation(FileUtils.getClasspathFile("defaults/meta-file-encoding-missing").toPath());
      assertEquals("UTF-8", arch.getCore().getEncoding());
    } catch (UnsupportedArchiveException e) {
      fail("Core file encoding defaults to UTF-8 if missing in meta.xml.", e);
    }

    try {
      Archive arch = DwcFiles.fromLocation(FileUtils.getClasspathFile("defaults/extension-encoding-missing").toPath());
      assertEquals("UTF-8", arch.getExtension(GbifTerm.Multimedia).getEncoding());
    } catch (UnsupportedArchiveException e) {
      fail("Extension file encoding defaults to UTF-8 if missing in meta.xml.", e);
    }
  }

  /**
   * Basic validation of archives, that the declared files exist and have basic, valid structure.
   */
  @Test
  public void testInvalidArchives() {
    // Simple archive problems
    assertThrows(
      UnsupportedArchiveException.class,
      () -> DwcFiles.fromLocation(FileUtils.getClasspathFile("invalid/empty").toPath()),
      "Empty archive should not be opened.");

    assertThrows(
      UnsupportedArchiveException.class,
      () -> DwcFiles.fromLocation(FileUtils.getClasspathFile("invalid/meta-file-location-missing").toPath()),
      "Archive with missing file location in meta.xml should not be opened.");

    // Extension archive problems
    assertThrows(
      UnsupportedArchiveException.class,
      () -> DwcFiles.fromLocation(FileUtils.getClasspathFile("invalid/extension-missing").toPath()),
      "Archive with missing extension file should not be opened.");

    assertThrows(
      UnsupportedArchiveException.class,
      () -> DwcFiles.fromLocation(FileUtils.getClasspathFile("invalid/extension-location-missing").toPath()),
      "Archive with missing extension file location in meta.xml should not be opened."
    );

    assertThrows(
      UnsupportedArchiveException.class,
      () -> DwcFiles.fromLocation(FileUtils.getClasspathFile("invalid/extension-core-id-missing").toPath()),
      "Archive with extension lacking coreid in meta.xml should not be opened."
    );

    assertThrows(
      UnsupportedArchiveException.class,
      () -> DwcFiles.fromLocation(FileUtils.getClasspathFile("invalid/extension-id-missing").toPath()),
      "Archive with extension and core missing id in meta.xml should not be opened."
    );
  }
}