001/*
002 * Copyright 2020-2021 Global Biodiversity Information Facility (GBIF)
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016package org.gbif.api.model.crawler;
017
018import java.util.ArrayList;
019import java.util.List;
020import java.util.Objects;
021import java.util.StringJoiner;
022
023import com.fasterxml.jackson.annotation.JsonCreator;
024import com.fasterxml.jackson.annotation.JsonProperty;
025
026/**
027 * The rules followed here should match the document at:
028 * http://dev.gbif.org/wiki/display/INT/Identifier+problems+and+how+to+solve+them.
029 */
030public class OccurrenceValidationReport {
031  // if the percentage of invalid triplets (eg missing catalog number) is greater than this, the archive is invalid
032  private static final double INVALID_TRIPLET_THRESHOLD = 0.25;
033
034  // the number of occurrence records checked in the validation
035  private final int checkedRecords;
036
037  // the number of triplets that were unique
038  private final int uniqueTriplets;
039
040  /**
041   * the number of triplets that were invalid (because one or more of institutionCode, collectionCode or catalogNumber
042   * were null or empty)
043   */
044  private final int recordsWithInvalidTriplets;
045
046  // the number of occurrenceIds that were unique (therefore also == the number of records with unique occurrenceId)
047  private final int uniqueOccurrenceIds;
048
049  // records that had no occurrenceId
050  private final int recordsMissingOccurrenceId;
051
052  // false if we had to stop at our memory-saving limit
053  private final boolean allRecordsChecked;
054
055  // if the archive is not valid this will hold a readable reason
056  private String invalidationReason;
057
058  // is this archive valid
059  private final boolean valid;
060
061  @JsonCreator
062  public OccurrenceValidationReport(@JsonProperty("checkedRecords") int checkedRecords,
063                                    @JsonProperty("uniqueTriplets") int uniqueTriplets,
064                                    @JsonProperty("invalidTriplets") int recordsWithInvalidTriplets,
065                                    @JsonProperty("uniqueOccIds") int uniqueOccurrenceIds,
066                                    @JsonProperty("missingOccIds") int recordsMissingOccurrenceId,
067                                    @JsonProperty("allRecordsChecked") boolean allRecordsChecked) {
068    this.checkedRecords = checkedRecords;
069    this.uniqueTriplets = uniqueTriplets;
070    this.recordsWithInvalidTriplets = recordsWithInvalidTriplets;
071    this.uniqueOccurrenceIds = uniqueOccurrenceIds;
072    this.recordsMissingOccurrenceId = recordsMissingOccurrenceId;
073    this.allRecordsChecked = allRecordsChecked;
074    this.valid = validate();
075  }
076
077  /**
078   * At the moment the only truly fatal conditions are:
079   * - whole archive is empty or unreadable
080   * - triplets are invalid (% invalid > than our threshold) && occIds are invalid (must be 100% coverage and unique)
081   * - any duplicate triplets && occIds are invalid
082   */
083  private boolean validate() {
084    boolean hasRecords = checkedRecords > 0;
085    double invalidRatio = hasRecords ? (double) recordsWithInvalidTriplets / checkedRecords : 0;
086    boolean invalidTripletsBelowLimit = invalidRatio <= INVALID_TRIPLET_THRESHOLD;
087    boolean hasUniqueTriplets = uniqueTriplets == checkedRecords - recordsWithInvalidTriplets;
088    boolean hasUniqueOccIds = uniqueOccurrenceIds == checkedRecords - recordsMissingOccurrenceId;
089    boolean hasGoodOccIds = uniqueOccurrenceIds == checkedRecords;
090    boolean looksValid = invalidTripletsBelowLimit && hasUniqueTriplets || hasGoodOccIds;
091
092    if (!looksValid) {
093      List<String> reasons = new ArrayList<>();
094      if (!invalidTripletsBelowLimit) {
095        reasons.add(Math.round(100 * invalidRatio) + "% invalid triplets is > than threshold of " + Math
096          .round(100 * INVALID_TRIPLET_THRESHOLD) + '%');
097      }
098      if (!hasUniqueTriplets) {
099        reasons.add((checkedRecords - recordsWithInvalidTriplets - uniqueTriplets) + " duplicate triplets detected");
100      }
101      if (!hasGoodOccIds) {
102        if (recordsMissingOccurrenceId != 0) {
103          reasons.add(recordsMissingOccurrenceId + " records without an occurrence id (should be 0)");
104        }
105        if (!hasUniqueOccIds) {
106          reasons.add(
107            (checkedRecords - recordsMissingOccurrenceId - uniqueOccurrenceIds) + " duplicate occurrence ids detected");
108        }
109      }
110
111      String reason = String.join("; ", reasons);
112      invalidationReason = "Archive invalid because [" + reason + ']';
113    }
114
115    return looksValid;
116  }
117
118  public int getCheckedRecords() {
119    return checkedRecords;
120  }
121
122  public int getUniqueTriplets() {
123    return uniqueTriplets;
124  }
125
126  public int getRecordsWithInvalidTriplets() {
127    return recordsWithInvalidTriplets;
128  }
129
130  public int getUniqueOccurrenceIds() {
131    return uniqueOccurrenceIds;
132  }
133
134  public int getRecordsMissingOccurrenceId() {
135    return recordsMissingOccurrenceId;
136  }
137
138  public boolean isAllRecordsChecked() {
139    return allRecordsChecked;
140  }
141
142  public String getInvalidationReason() {
143    return invalidationReason;
144  }
145
146  public boolean isValid() {
147    return valid;
148  }
149
150  @Override
151  public boolean equals(Object o) {
152    if (this == o) {
153      return true;
154    }
155    if (o == null || getClass() != o.getClass()) {
156      return false;
157    }
158    OccurrenceValidationReport that = (OccurrenceValidationReport) o;
159    return checkedRecords == that.checkedRecords &&
160      uniqueTriplets == that.uniqueTriplets &&
161      recordsWithInvalidTriplets == that.recordsWithInvalidTriplets &&
162      uniqueOccurrenceIds == that.uniqueOccurrenceIds &&
163      recordsMissingOccurrenceId == that.recordsMissingOccurrenceId &&
164      allRecordsChecked == that.allRecordsChecked &&
165      valid == that.valid;
166  }
167
168  @Override
169  public int hashCode() {
170    return Objects
171      .hash(checkedRecords, uniqueTriplets, recordsWithInvalidTriplets, uniqueOccurrenceIds,
172        recordsMissingOccurrenceId, allRecordsChecked, valid);
173  }
174
175  @Override
176  public String toString() {
177    return new StringJoiner(", ", OccurrenceValidationReport.class.getSimpleName() + "[",
178      "]")
179      .add("checkedRecords=" + checkedRecords)
180      .add("uniqueTriplets=" + uniqueTriplets)
181      .add("recordsWithInvalidTriplets=" + recordsWithInvalidTriplets)
182      .add("uniqueOccurrenceIds=" + uniqueOccurrenceIds)
183      .add("recordsMissingOccurrenceId=" + recordsMissingOccurrenceId)
184      .add("allRecordsChecked=" + allRecordsChecked)
185      .add("invalidationReason='" + invalidationReason + "'")
186      .add("valid=" + valid)
187      .toString();
188  }
189}