/*
 * Copyright 2014 Global Biodiversity Information Facility (GBIF)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
016package org.gbif.api.service.crawler;
017
018import org.gbif.api.model.crawler.DatasetProcessStatus;
019
020import java.util.List;
021import java.util.Set;
022import java.util.UUID;
023import javax.annotation.Nullable;
024
025/**
026 * This service exposes information regarding the crawling process and is not intended to provide historical
027 * information at the moment. Only information about currently queued and running crawl jobs is exposed.
028 * <p/>
029 * We distinguish between XML based (BioCASe, DiGIR, TAPIR) and DwC-A datasets. These don't share the same work queues
030 * because their processing is different in the beginning (downloading an archive vs. request-response type iterating
031 * over the endpoint. They do however share the same pipeline for processing the gathered data.
032 */
033public interface DatasetProcessService {
034
035  /**
036   * Returns the processing status for a particular dataset identified by a {@link UUID} key.
037   *
038   * @param datasetKey the dataset key
039   *
040   * @return a consolidated object populated with the crawl status for the specific dataset. Returns null if the
041   *         dataset is not currently being processed
042   */
043  @Nullable
044  DatasetProcessStatus getDatasetProcessStatus(UUID datasetKey);
045
046  /**
047   * @return the processing status for all datasets that are currently being worked on (XML and DwC-A). These might be
048   *         in different states, some can still be crawled on a page by page basis, some may be downloaded in the case
049   *         of DwC-A and for some only the interpretation is still running.
050   *         <p/>
051   *         There is a chance that some processes will be returned that are already finished.
052   */
053  Set<DatasetProcessStatus> getRunningDatasetProcesses();
054
055  /**
056   * @return an ordered list of dataset processing statuses for all XML based datasets that are currently waiting to be
057   *         crawled
058   */
059  List<DatasetProcessStatus> getPendingXmlDatasetProcesses();
060
061  /**
062   * @return an ordered list of dataset processing statuses for all DwC-A based datasets that are currently waiting to
063   *         be crawled
064   */
065  List<DatasetProcessStatus> getPendingDwcaDatasetProcesses();
066}