001/*
002 * Licensed under the Apache License, Version 2.0 (the "License");
003 * you may not use this file except in compliance with the License.
004 * You may obtain a copy of the License at
005 *
006 *     http://www.apache.org/licenses/LICENSE-2.0
007 *
008 * Unless required by applicable law or agreed to in writing, software
009 * distributed under the License is distributed on an "AS IS" BASIS,
010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
011 * See the License for the specific language governing permissions and
012 * limitations under the License.
013 */
014package org.gbif.common.parsers;
015
016import org.gbif.api.exception.UnparsableException;
017import org.gbif.api.model.checklistbank.ParsedName;
018import org.gbif.common.parsers.core.Parsable;
019import org.gbif.common.parsers.core.ParseResult;
020
021import java.util.regex.Matcher;
022import java.util.regex.Pattern;
023
024import org.apache.commons.lang3.Range;
025import org.apache.commons.lang3.StringUtils;
026import org.slf4j.Logger;
027import org.slf4j.LoggerFactory;
028
029import static org.gbif.common.parsers.utils.NameParserUtils.PARSER;
030
031/**
032 * Singleton implementation using regex to extract a scientific name after a typestatus from a string.
033 * For example given the input "Holotype of Dianthus fruticosus ssp. amorginus Runemark"
034 * the parser will extract Dianthus fruticosus ssp. amorginus Runemark.
035 */
036public class TypifiedNameParser implements Parsable<String> {
037  protected final Logger log = LoggerFactory.getLogger(getClass());
038  private static TypifiedNameParser singletonObject = null;
039
040  private static final Range<Integer> REASONABLE_NAME_SIZE_RANGE = Range.between(4, 40);
041  private static final Pattern NAME_SEPARATOR = Pattern.compile("\\sOF\\W*\\s+\\W*(.+)\\W*\\s*$", Pattern.CASE_INSENSITIVE);
042  private static final Pattern CLEAN_WHITESPACE = Pattern.compile("\\s+");
043
044  private TypifiedNameParser() {
045  }
046
047  @Override
048  public ParseResult<String> parse(String input) {
049    if (StringUtils.isNotEmpty(input)) {
050      Matcher m = NAME_SEPARATOR.matcher(input);
051      if (m.find()) {
052        String name = m.group(1);
053        // make sure the name does not end with "type", see http://dev.gbif.org/issues/browse/POR-2703
054        if (!name.endsWith("type")) {
055          try {
056            ParsedName pn = PARSER.parse(name,null);
057            return ParseResult.success(ParseResult.CONFIDENCE.PROBABLE, pn.canonicalNameComplete());
058
059          } catch (UnparsableException e) {
060            log.debug("Cannot parse typified name: [{}] from input [{}]", name, input);
061            name = CLEAN_WHITESPACE.matcher(name).replaceAll(" ").trim();
062            if (REASONABLE_NAME_SIZE_RANGE.contains(name.length())) {
063              return ParseResult.success(ParseResult.CONFIDENCE.POSSIBLE, name);
064            }
065          }
066        }
067      }
068    }
069    return ParseResult.fail();
070  }
071
072  public static TypifiedNameParser getInstance() {
073    synchronized (TypifiedNameParser.class) {
074      if (singletonObject == null) {
075        singletonObject = new TypifiedNameParser();
076      }
077    }
078    return singletonObject;
079  }
080}