001/*
002 * Licensed under the Apache License, Version 2.0 (the "License");
003 * you may not use this file except in compliance with the License.
004 * You may obtain a copy of the License at
005 *
006 *     http://www.apache.org/licenses/LICENSE-2.0
007 *
008 * Unless required by applicable law or agreed to in writing, software
009 * distributed under the License is distributed on an "AS IS" BASIS,
010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
011 * See the License for the specific language governing permissions and
012 * limitations under the License.
013 */
014package org.gbif.api.service.crawler;
015
016import org.gbif.api.model.crawler.DatasetProcessStatus;
017
018import java.util.List;
019import java.util.Set;
020import java.util.UUID;
021
022import javax.annotation.Nullable;
023
024/**
025 * This service exposes information regarding current crawling process and is not intended to provide historical
026 * information. Only information about currently queued and running crawl jobs is exposed.
027 * <p/>
028 * We distinguish between XML based (BioCASe, DiGIR, TAPIR), Darwin Core archive, ABCD archive and Camtrap Data Package
029 * datasets. These don't share the same work queues because their processing is different in the beginning
030 * (downloading an archive vs. request-response type iterating over the endpoint. They do however share the same
031 * pipeline for processing the gathered data.
032 */
033public interface DatasetProcessService {
034
035  /**
036   * Returns the processing status for a particular dataset identified by a {@link UUID} key.
037   *
038   * @param datasetKey the dataset key
039   *
040   * @return a consolidated object populated with the crawl status for the specific dataset. Returns null if the
041   *         dataset is not currently being processed
042   */
043  @Nullable
044  DatasetProcessStatus getDatasetProcessStatus(UUID datasetKey);
045
046  /**
047   * @return the processing status for all datasets that are currently being worked on (XML and DwC-A). These might be
048   *         in different states, some can still be crawled on a page by page basis, some may be downloaded in the case
049   *         of DwC-A and for some only the interpretation is still running.
050   *         <p/>
051   *         There is a chance that some processes will be returned that are already finished.
052   */
053  Set<DatasetProcessStatus> getRunningDatasetProcesses();
054
055  /**
056   * @return an ordered list of dataset processing statuses for all XML based datasets that are currently waiting to be
057   *         crawled
058   */
059  List<DatasetProcessStatus> getPendingXmlDatasetProcesses();
060
061  /**
062   * @return an ordered list of dataset processing statuses for all DwC-A based datasets that are currently waiting to
063   *         be crawled
064   */
065  List<DatasetProcessStatus> getPendingDwcaDatasetProcesses();
066
067  /**
068   * @return an ordered list of dataset processing statuses for all ABCD-A based datasets that are currently waiting to
069   *         be crawled
070   */
071  List<DatasetProcessStatus> getPendingAbcdaDatasetProcesses();
072
073  /**
074   * @return an ordered list of dataset processing statuses for all CamtrapDP based datasets that are currently waiting to
075   *         be crawled
076   */
077  List<DatasetProcessStatus> getPendingCamtrapDpDatasetProcesses();
078}