001package org.gbif.api.model.crawler;
002
003import java.util.List;
004
005import com.google.common.base.Joiner;
006import com.google.common.base.Objects;
007import com.google.common.collect.Lists;
008import org.codehaus.jackson.annotate.JsonCreator;
009import org.codehaus.jackson.annotate.JsonProperty;
010
011/**
012 * The rules followed here should match the document at:
013 * http://dev.gbif.org/wiki/display/INT/Identifier+problems+and+how+to+solve+them.
014 */
015public class OccurrenceValidationReport {
016  // if the percentage of invalid triplets (eg missing catalog number) is greater than this, the archive is invalid
017  private static final double INVALID_TRIPLET_THRESHOLD = 0.25;
018
019  // the number of occurrence records checked in the validation
020  private final int checkedRecords;
021
022  // the number of triplets that were unique
023  private final int uniqueTriplets;
024
025  /**
026   * the number of triplets that were invalid (because one or more of institutionCode, collectionCode or catalogNumber
027   * were null or empty)
028   */
029  private final int recordsWithInvalidTriplets;
030
031  // the number of occurrenceIds that were unique (therefore also == the number of records with unique occurrenceId)
032  private final int uniqueOccurrenceIds;
033
034  // records that had no occurrenceId
035  private final int recordsMissingOccurrenceId;
036
037  // false if we had to stop at our memory-saving limit
038  private final boolean allRecordsChecked;
039
040  // if the archive is not valid this will hold a readable reason
041  private String invalidationReason;
042
043  // is this archive valid
044  private final boolean valid;
045
046  @JsonCreator
047  public OccurrenceValidationReport(@JsonProperty("checkedRecords") int checkedRecords,
048    @JsonProperty("uniqueTriplets") int uniqueTriplets,
049    @JsonProperty("invalidTriplets") int recordsWithInvalidTriplets,
050    @JsonProperty("uniqueOccIds") int uniqueOccurrenceIds,
051    @JsonProperty("missingOccIds") int recordsMissingOccurrenceId,
052    @JsonProperty("allRecordsChecked") boolean allRecordsChecked) {
053    this.checkedRecords = checkedRecords;
054    this.uniqueTriplets = uniqueTriplets;
055    this.recordsWithInvalidTriplets = recordsWithInvalidTriplets;
056    this.uniqueOccurrenceIds = uniqueOccurrenceIds;
057    this.recordsMissingOccurrenceId = recordsMissingOccurrenceId;
058    this.allRecordsChecked = allRecordsChecked;
059    this.valid = validate();
060  }
061
062  /**
063   * At the moment the only truly fatal conditions are:
064   * - whole archive is empty or unreadable
065   * - triplets are invalid (% invalid > than our threshold) && occIds are invalid (must be 100% coverage and unique)
066   * - any duplicate triplets && occIds are invalid
067   */
068  private boolean validate() {
069    boolean hasRecords = checkedRecords > 0;
070    double invalidRatio = hasRecords ? (double) recordsWithInvalidTriplets / checkedRecords : 0;
071    boolean invalidTripletsBelowLimit = invalidRatio <= INVALID_TRIPLET_THRESHOLD;
072    boolean hasUniqueTriplets = uniqueTriplets == checkedRecords - recordsWithInvalidTriplets;
073    boolean hasUniqueOccIds = uniqueOccurrenceIds == checkedRecords - recordsMissingOccurrenceId;
074    boolean hasGoodOccIds = uniqueOccurrenceIds == checkedRecords;
075    boolean looksValid = invalidTripletsBelowLimit && hasUniqueTriplets || hasGoodOccIds;
076
077    if (!looksValid) {
078      List<String> reasons = Lists.newArrayList();
079      if (!invalidTripletsBelowLimit) {
080        reasons.add(Math.round(100 * invalidRatio) + "% invalid triplets is > than threshold of " + Math
081          .round(100 * INVALID_TRIPLET_THRESHOLD) + '%');
082      }
083      if (!hasUniqueTriplets) {
084        reasons.add((checkedRecords - recordsWithInvalidTriplets - uniqueTriplets) + " duplicate triplets detected");
085      }
086      if (!hasGoodOccIds) {
087        if (recordsMissingOccurrenceId != 0) {
088          reasons.add(recordsMissingOccurrenceId + " records without an occurrence id (should be 0)");
089        }
090        if (!hasUniqueOccIds) {
091          reasons.add(
092            (checkedRecords - recordsMissingOccurrenceId - uniqueOccurrenceIds) + " duplicate occurrence ids detected");
093        }
094      }
095      String reason = Joiner.on("; ").join(reasons);
096      invalidationReason = "Archive invalid because [" + reason + ']';
097    }
098
099    return looksValid;
100  }
101
102  public int getCheckedRecords() {
103    return checkedRecords;
104  }
105
106  public int getUniqueTriplets() {
107    return uniqueTriplets;
108  }
109
110  public int getRecordsWithInvalidTriplets() {
111    return recordsWithInvalidTriplets;
112  }
113
114  public int getUniqueOccurrenceIds() {
115    return uniqueOccurrenceIds;
116  }
117
118  public int getRecordsMissingOccurrenceId() {
119    return recordsMissingOccurrenceId;
120  }
121
122  public boolean isAllRecordsChecked() {
123    return allRecordsChecked;
124  }
125
126  public String getInvalidationReason() {
127    return invalidationReason;
128  }
129
130  public boolean isValid() {
131    return valid;
132  }
133
134  @Override
135  public int hashCode() {
136    return Objects.hashCode(checkedRecords, uniqueTriplets, recordsWithInvalidTriplets, uniqueOccurrenceIds,
137      recordsMissingOccurrenceId, allRecordsChecked, valid);
138  }
139
140  @Override
141  public boolean equals(Object obj) {
142    if (this == obj) {
143      return true;
144    }
145    if (obj == null || getClass() != obj.getClass()) {
146      return false;
147    }
148    final OccurrenceValidationReport other = (OccurrenceValidationReport) obj;
149    return Objects.equal(this.checkedRecords, other.checkedRecords)
150           && Objects.equal(this.uniqueTriplets, other.uniqueTriplets)
151           && Objects.equal(this.recordsWithInvalidTriplets, other.recordsWithInvalidTriplets)
152           && Objects.equal(this.uniqueOccurrenceIds, other.uniqueOccurrenceIds)
153           && Objects.equal(this.recordsMissingOccurrenceId, other.recordsMissingOccurrenceId)
154           && Objects.equal(this.allRecordsChecked, other.allRecordsChecked)
155           && Objects.equal(this.valid, other.valid);
156  }
157
158  @Override
159  public String toString() {
160    return Objects.toStringHelper(this).add("checkedRecords", checkedRecords)
161      .add("uniqueTriplets", uniqueTriplets).add("recordsWithInvalidTriplets", recordsWithInvalidTriplets)
162      .add("uniqueOccurrenceIds", uniqueOccurrenceIds).add("recordsMissingOccurrenceId", recordsMissingOccurrenceId)
163      .add("allRecordsChecked", allRecordsChecked).add("invalidationReason", invalidationReason).add("valid", valid)
164      .toString();
165  }
166}