001/*
002 * Copyright 2020 Global Biodiversity Information Facility (GBIF)
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016package org.gbif.api.model.crawler;
017
import org.gbif.api.vocabulary.EndpointType;

import java.net.URI;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Objects;
import java.util.StringJoiner;
import java.util.UUID;

import javax.annotation.Nullable;
import javax.annotation.concurrent.Immutable;
import javax.annotation.concurrent.ThreadSafe;

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonProperty;

import static org.gbif.api.util.PreconditionUtils.checkArgument;
036
037/**
038 * This class represents a job to be worked on by a crawler. That can be either one of the XML based protocols
039 * (BioCASe, DiGIR, TAPIR) or a DwC-Archive.
040 * <p/>
041 * For now this object will be used in JSON serialized form in ZooKeeper.
042 */
043@Immutable
044@ThreadSafe
045@SuppressWarnings("unused")
046public class CrawlJob {
047
048  private final UUID datasetKey;
049  private final EndpointType endpointType;
050  private final URI targetUrl;
051  private final int attempt;
052  private final Map<String, String> properties;
053
054  /**
055   * Creates a new crawl job.
056   *
057   * @param datasetKey   of the dataset to crawl
058   * @param endpointType of the dataset
059   * @param targetUrl    of the dataset
060   * @param attempt      a monotonously increasing counter, increased every time we try to crawl a dataset whether that
061   *                     attempt is successful or not
062   * @param properties   a way to provide protocol or crawl specific options
063   */
064  @JsonCreator
065  public CrawlJob(
066    @JsonProperty("datasetKey") UUID datasetKey,
067    @JsonProperty("endpointType") EndpointType endpointType,
068    @JsonProperty("targetUrl") URI targetUrl,
069    @JsonProperty("attempt") int attempt,
070    @Nullable @JsonProperty("properties") Map<String, String> properties
071  ) {
072    this.datasetKey = Objects.requireNonNull(datasetKey);
073    this.endpointType = Objects.requireNonNull(endpointType);
074    this.targetUrl = Objects.requireNonNull(targetUrl);
075    checkArgument(attempt > 0, "attempt has to be greater than 0");
076    this.attempt = attempt;
077
078    if (properties == null) {
079      this.properties = Collections.emptyMap();
080    } else {
081      this.properties = Collections.unmodifiableMap(properties);
082    }
083  }
084
085  /**
086   * Constructor with mandatory fields.
087   * Properties field is set to null.
088   *
089   * @param datasetKey   of the dataset to crawl
090   * @param endpointType of the dataset
091   * @param targetUrl    of the dataset
092   * @param attempt      a monotonously increasing counter, increased every time we try to crawl a dataset whether that
093   *                     attempt is successful or not
094   */
095  public CrawlJob(UUID datasetKey, Integer attempt, EndpointType endpointType, URI targetUrl) {
096    //This constructor is used for the MyBatis persistence layer.
097    this.datasetKey = datasetKey;
098    this.attempt = attempt;
099    this.endpointType = endpointType;
100    this.targetUrl = targetUrl;
101    this.properties = Collections.emptyMap();
102  }
103
104  public UUID getDatasetKey() {
105    return datasetKey;
106  }
107
108  public EndpointType getEndpointType() {
109    return endpointType;
110  }
111
112  /**
113   * Used to save protocol specific information (e.g. contentNamespace for TAPIR and BioCASe).
114   *
115   * @return an immutable map of all the properties
116   */
117  // NOTE: This should be an ImmutableMap but Jackson 1.x can't easily deserialize that
118  public Map<String, String> getProperties() {
119    return properties;
120  }
121
122  public URI getTargetUrl() {
123    return targetUrl;
124  }
125
126  public int getAttempt() {
127    return attempt;
128  }
129
130  @JsonIgnore
131  public String getProperty(String name) {
132    return properties.get(name);
133  }
134
135  @Override
136  public boolean equals(Object o) {
137    if (this == o) {
138      return true;
139    }
140    if (o == null || getClass() != o.getClass()) {
141      return false;
142    }
143    CrawlJob crawlJob = (CrawlJob) o;
144    return attempt == crawlJob.attempt &&
145      Objects.equals(datasetKey, crawlJob.datasetKey) &&
146      endpointType == crawlJob.endpointType &&
147      Objects.equals(targetUrl, crawlJob.targetUrl) &&
148      Objects.equals(properties, crawlJob.properties);
149  }
150
151  @Override
152  public int hashCode() {
153    return Objects.hash(datasetKey, endpointType, targetUrl, attempt, properties);
154  }
155
156  @Override
157  public String toString() {
158    return new StringJoiner(", ", CrawlJob.class.getSimpleName() + "[", "]")
159      .add("datasetKey=" + datasetKey)
160      .add("endpointType=" + endpointType)
161      .add("targetUrl=" + targetUrl)
162      .add("attempt=" + attempt)
163      .add("properties=" + properties)
164      .toString();
165  }
166}