001/* 002 * Licensed under the Apache License, Version 2.0 (the "License"); 003 * you may not use this file except in compliance with the License. 004 * You may obtain a copy of the License at 005 * 006 * http://www.apache.org/licenses/LICENSE-2.0 007 * 008 * Unless required by applicable law or agreed to in writing, software 009 * distributed under the License is distributed on an "AS IS" BASIS, 010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 011 * See the License for the specific language governing permissions and 012 * limitations under the License. 013 */ 014package org.gbif.api.service.crawler; 015 016import org.gbif.api.model.crawler.DatasetProcessStatus; 017 018import java.util.List; 019import java.util.Set; 020import java.util.UUID; 021 022import javax.annotation.Nullable; 023 024/** 025 * This service exposes information regarding current crawling process and is not intended to provide historical 026 * information. Only information about currently queued and running crawl jobs is exposed. 027 * <p/> 028 * We distinguish between XML based (BioCASe, DiGIR, TAPIR), Darwin Core archive, ABCD archive and Camtrap Data Package 029 * datasets. These don't share the same work queues because their processing is different in the beginning 030 * (downloading an archive vs. request-response type iterating over the endpoint. They do however share the same 031 * pipeline for processing the gathered data. 032 */ 033public interface DatasetProcessService { 034 035 /** 036 * Returns the processing status for a particular dataset identified by a {@link UUID} key. 037 * 038 * @param datasetKey the dataset key 039 * 040 * @return a consolidated object populated with the crawl status for the specific dataset. Returns null if the 041 * dataset is not currently being processed 042 */ 043 @Nullable 044 DatasetProcessStatus getDatasetProcessStatus(UUID datasetKey); 045 046 /** 047 * @return the processing status for all datasets that are currently being worked on (XML and DwC-A). These might be 048 * in different states, some can still be crawled on a page by page basis, some may be downloaded in the case 049 * of DwC-A and for some only the interpretation is still running. 050 * <p/> 051 * There is a chance that some processes will be returned that are already finished. 052 */ 053 Set<DatasetProcessStatus> getRunningDatasetProcesses(); 054 055 /** 056 * @return an ordered list of dataset processing statuses for all XML based datasets that are currently waiting to be 057 * crawled 058 */ 059 List<DatasetProcessStatus> getPendingXmlDatasetProcesses(); 060 061 /** 062 * @return an ordered list of dataset processing statuses for all DwC-A based datasets that are currently waiting to 063 * be crawled 064 */ 065 List<DatasetProcessStatus> getPendingDwcaDatasetProcesses(); 066 067 /** 068 * @return an ordered list of dataset processing statuses for all ABCD-A based datasets that are currently waiting to 069 * be crawled 070 */ 071 List<DatasetProcessStatus> getPendingAbcdaDatasetProcesses(); 072 073 /** 074 * @return an ordered list of dataset processing statuses for all CamtrapDP based datasets that are currently waiting to 075 * be crawled 076 */ 077 List<DatasetProcessStatus> getPendingCamtrapDpDatasetProcesses(); 078}