001/* 002 * Licensed under the Apache License, Version 2.0 (the "License"); 003 * you may not use this file except in compliance with the License. 004 * You may obtain a copy of the License at 005 * 006 * http://www.apache.org/licenses/LICENSE-2.0 007 * 008 * Unless required by applicable law or agreed to in writing, software 009 * distributed under the License is distributed on an "AS IS" BASIS, 010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 011 * See the License for the specific language governing permissions and 012 * limitations under the License. 013 */ 014package org.gbif.common.parsers; 015 016import org.gbif.api.exception.UnparsableException; 017import org.gbif.api.model.checklistbank.ParsedName; 018import org.gbif.common.parsers.core.Parsable; 019import org.gbif.common.parsers.core.ParseResult; 020 021import java.util.regex.Matcher; 022import java.util.regex.Pattern; 023 024import org.apache.commons.lang3.Range; 025import org.apache.commons.lang3.StringUtils; 026import org.slf4j.Logger; 027import org.slf4j.LoggerFactory; 028 029import static org.gbif.common.parsers.utils.NameParserUtils.PARSER; 030 031/** 032 * Singleton implementation using regex to extract a scientific name after a typestatus from a string. 033 * For example given the input "Holotype of Dianthus fruticosus ssp. amorginus Runemark" 034 * the parser will extract Dianthus fruticosus ssp. amorginus Runemark. 035 */ 036public class TypifiedNameParser implements Parsable<String> { 037 protected final Logger log = LoggerFactory.getLogger(getClass()); 038 private static TypifiedNameParser singletonObject = null; 039 040 private static final Range<Integer> REASONABLE_NAME_SIZE_RANGE = Range.between(4, 40); 041 private static final Pattern NAME_SEPARATOR = Pattern.compile("\\sOF\\W*\\s+\\W*(.+)\\W*\\s*$", Pattern.CASE_INSENSITIVE); 042 private static final Pattern CLEAN_WHITESPACE = Pattern.compile("\\s+"); 043 044 private TypifiedNameParser() { 045 } 046 047 @Override 048 public ParseResult<String> parse(String input) { 049 if (StringUtils.isNotEmpty(input)) { 050 Matcher m = NAME_SEPARATOR.matcher(input); 051 if (m.find()) { 052 String name = m.group(1); 053 // make sure the name does not end with "type", see http://dev.gbif.org/issues/browse/POR-2703 054 if (!name.endsWith("type")) { 055 try { 056 ParsedName pn = PARSER.parse(name,null); 057 return ParseResult.success(ParseResult.CONFIDENCE.PROBABLE, pn.canonicalNameComplete()); 058 059 } catch (UnparsableException e) { 060 log.debug("Cannot parse typified name: [{}] from input [{}]", name, input); 061 name = CLEAN_WHITESPACE.matcher(name).replaceAll(" ").trim(); 062 if (REASONABLE_NAME_SIZE_RANGE.contains(name.length())) { 063 return ParseResult.success(ParseResult.CONFIDENCE.POSSIBLE, name); 064 } 065 } 066 } 067 } 068 } 069 return ParseResult.fail(); 070 } 071 072 public static TypifiedNameParser getInstance() { 073 synchronized (TypifiedNameParser.class) { 074 if (singletonObject == null) { 075 singletonObject = new TypifiedNameParser(); 076 } 077 } 078 return singletonObject; 079 } 080}