001/* 002 * Copyright 2020-2021 Global Biodiversity Information Facility (GBIF) 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016package org.gbif.api.model.crawler; 017 018import java.util.ArrayList; 019import java.util.List; 020import java.util.Objects; 021import java.util.StringJoiner; 022 023import com.fasterxml.jackson.annotation.JsonCreator; 024import com.fasterxml.jackson.annotation.JsonProperty; 025 026/** 027 * The rules followed here should match the document at: 028 * http://dev.gbif.org/wiki/display/INT/Identifier+problems+and+how+to+solve+them. 029 */ 030public class OccurrenceValidationReport { 031 // if the percentage of invalid triplets (eg missing catalog number) is greater than this, the archive is invalid 032 private static final double INVALID_TRIPLET_THRESHOLD = 0.25; 033 034 // the number of occurrence records checked in the validation 035 private final int checkedRecords; 036 037 // the number of triplets that were unique 038 private final int uniqueTriplets; 039 040 /** 041 * the number of triplets that were invalid (because one or more of institutionCode, collectionCode or catalogNumber 042 * were null or empty) 043 */ 044 private final int recordsWithInvalidTriplets; 045 046 // the number of occurrenceIds that were unique (therefore also == the number of records with unique occurrenceId) 047 private final int uniqueOccurrenceIds; 048 049 // records that had no occurrenceId 050 private final int recordsMissingOccurrenceId; 051 052 // false if we had to stop at our memory-saving limit 053 private final boolean allRecordsChecked; 054 055 // if the archive is not valid this will hold a readable reason 056 private String invalidationReason; 057 058 // is this archive valid 059 private final boolean valid; 060 061 @JsonCreator 062 public OccurrenceValidationReport(@JsonProperty("checkedRecords") int checkedRecords, 063 @JsonProperty("uniqueTriplets") int uniqueTriplets, 064 @JsonProperty("invalidTriplets") int recordsWithInvalidTriplets, 065 @JsonProperty("uniqueOccIds") int uniqueOccurrenceIds, 066 @JsonProperty("missingOccIds") int recordsMissingOccurrenceId, 067 @JsonProperty("allRecordsChecked") boolean allRecordsChecked) { 068 this.checkedRecords = checkedRecords; 069 this.uniqueTriplets = uniqueTriplets; 070 this.recordsWithInvalidTriplets = recordsWithInvalidTriplets; 071 this.uniqueOccurrenceIds = uniqueOccurrenceIds; 072 this.recordsMissingOccurrenceId = recordsMissingOccurrenceId; 073 this.allRecordsChecked = allRecordsChecked; 074 this.valid = validate(); 075 } 076 077 /** 078 * At the moment the only truly fatal conditions are: 079 * - whole archive is empty or unreadable 080 * - triplets are invalid (% invalid > than our threshold) && occIds are invalid (must be 100% coverage and unique) 081 * - any duplicate triplets && occIds are invalid 082 */ 083 private boolean validate() { 084 boolean hasRecords = checkedRecords > 0; 085 double invalidRatio = hasRecords ? (double) recordsWithInvalidTriplets / checkedRecords : 0; 086 boolean invalidTripletsBelowLimit = invalidRatio <= INVALID_TRIPLET_THRESHOLD; 087 boolean hasUniqueTriplets = uniqueTriplets == checkedRecords - recordsWithInvalidTriplets; 088 boolean hasUniqueOccIds = uniqueOccurrenceIds == checkedRecords - recordsMissingOccurrenceId; 089 boolean hasGoodOccIds = uniqueOccurrenceIds == checkedRecords; 090 boolean looksValid = invalidTripletsBelowLimit && hasUniqueTriplets || hasGoodOccIds; 091 092 if (!looksValid) { 093 List<String> reasons = new ArrayList<>(); 094 if (!invalidTripletsBelowLimit) { 095 reasons.add(Math.round(100 * invalidRatio) + "% invalid triplets is > than threshold of " + Math 096 .round(100 * INVALID_TRIPLET_THRESHOLD) + '%'); 097 } 098 if (!hasUniqueTriplets) { 099 reasons.add((checkedRecords - recordsWithInvalidTriplets - uniqueTriplets) + " duplicate triplets detected"); 100 } 101 if (!hasGoodOccIds) { 102 if (recordsMissingOccurrenceId != 0) { 103 reasons.add(recordsMissingOccurrenceId + " records without an occurrence id (should be 0)"); 104 } 105 if (!hasUniqueOccIds) { 106 reasons.add( 107 (checkedRecords - recordsMissingOccurrenceId - uniqueOccurrenceIds) + " duplicate occurrence ids detected"); 108 } 109 } 110 111 String reason = String.join("; ", reasons); 112 invalidationReason = "Archive invalid because [" + reason + ']'; 113 } 114 115 return looksValid; 116 } 117 118 public int getCheckedRecords() { 119 return checkedRecords; 120 } 121 122 public int getUniqueTriplets() { 123 return uniqueTriplets; 124 } 125 126 public int getRecordsWithInvalidTriplets() { 127 return recordsWithInvalidTriplets; 128 } 129 130 public int getUniqueOccurrenceIds() { 131 return uniqueOccurrenceIds; 132 } 133 134 public int getRecordsMissingOccurrenceId() { 135 return recordsMissingOccurrenceId; 136 } 137 138 public boolean isAllRecordsChecked() { 139 return allRecordsChecked; 140 } 141 142 public String getInvalidationReason() { 143 return invalidationReason; 144 } 145 146 public boolean isValid() { 147 return valid; 148 } 149 150 @Override 151 public boolean equals(Object o) { 152 if (this == o) { 153 return true; 154 } 155 if (o == null || getClass() != o.getClass()) { 156 return false; 157 } 158 OccurrenceValidationReport that = (OccurrenceValidationReport) o; 159 return checkedRecords == that.checkedRecords && 160 uniqueTriplets == that.uniqueTriplets && 161 recordsWithInvalidTriplets == that.recordsWithInvalidTriplets && 162 uniqueOccurrenceIds == that.uniqueOccurrenceIds && 163 recordsMissingOccurrenceId == that.recordsMissingOccurrenceId && 164 allRecordsChecked == that.allRecordsChecked && 165 valid == that.valid; 166 } 167 168 @Override 169 public int hashCode() { 170 return Objects 171 .hash(checkedRecords, uniqueTriplets, recordsWithInvalidTriplets, uniqueOccurrenceIds, 172 recordsMissingOccurrenceId, allRecordsChecked, valid); 173 } 174 175 @Override 176 public String toString() { 177 return new StringJoiner(", ", OccurrenceValidationReport.class.getSimpleName() + "[", 178 "]") 179 .add("checkedRecords=" + checkedRecords) 180 .add("uniqueTriplets=" + uniqueTriplets) 181 .add("recordsWithInvalidTriplets=" + recordsWithInvalidTriplets) 182 .add("uniqueOccurrenceIds=" + uniqueOccurrenceIds) 183 .add("recordsMissingOccurrenceId=" + recordsMissingOccurrenceId) 184 .add("allRecordsChecked=" + allRecordsChecked) 185 .add("invalidationReason='" + invalidationReason + "'") 186 .add("valid=" + valid) 187 .toString(); 188 } 189}