/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.gbif.common.parsers.utils;

import org.gbif.api.exception.UnparsableException;
import org.gbif.api.model.checklistbank.ParsedName;
import org.gbif.api.vocabulary.Rank;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;

import static org.gbif.common.parsers.utils.NameParserUtils.PARSER;

/**
 * Utilities to work on classifications.
 */
public final class ClassificationUtils {

  // the double quote character that is stripped from the edges of names and authors
  private static final char QUOTE = '"';

  // used to clean up bad characters.
  // The section sign is written as a unicode escape so the pattern does not depend on the source file encoding.
  private static final Pattern CLEAN_REG_EX = Pattern.compile("[{}\u00a7';_|$%!?]+");

  // common null strings to ignore for fast performance.
  // Less frequent ones are kept in the blacklisted names dictionary!
  public static final Set<String> NULL_STRINGS =
    new HashSet<String>(Arrays.asList("/N", "\\", "\\\\", "\\N", "\\\\N", "null", "NULL", "Null"));

  private ClassificationUtils() {
    throw new UnsupportedOperationException("Can't initialize class");
  }

  /**
   * Parses a canonical name at a specific Rank.
   *
   * @param scientificName the name string handed to the name parser
   * @param rank the rank handed to the name parser, may be null
   *
   * @return the canonical name with markers, or null if the parser rejected the name as unparsable
   */
  public static String canonicalName(String scientificName, Rank rank) {
    try {
      ParsedName pn = PARSER.parse(scientificName, rank);
      return pn == null ? null : pn.canonicalNameWithMarker();
    } catch (UnparsableException e) {
      // there is no canonical form for an unparsable name; report that as null instead of failing with an NPE
      return null;
    }
  }

  /**
   * Parses a scientific name and creates the canonical name including a potential hybrid and rank marker
   * plus the cultivar and strain names if existing.
   * Note: This method once used to only include the hybrid marker - if that is still needed revert to buildName
   * method.
   *
   * @param scientificName the name string handed to the name parser
   *
   * @return the canonical name with markers, or null if the parser rejected the name as unparsable
   */
  public static String canonicalName(String scientificName) {
    return canonicalName(scientificName, null);
  }

  /**
   * Cleans up a taxon as far as possible by removing erroneous chars etc.
   * This does not do any parsing.
   *
   * @param taxon to check
   *
   * @return the cleaned taxon, or null if the input is empty, one of the {@link #NULL_STRINGS}, blacklisted,
   *         or has nothing left after cleaning
   */
  public static String clean(String taxon) {
    if (StringUtils.isEmpty(taxon) || NULL_STRINGS.contains(taxon)) {
      return null;
    }

    // remove the " from the beginning and end first, so the capitalisation below looks at the real first letter
    String cleanedTaxon = stripQuotes(taxon);
    if (cleanedTaxon.isEmpty()) {
      // the value consisted of nothing but quotes
      return null;
    }

    // if it is a single word and ALL "UPPERCASE", turn it into a Capitalised word
    // Note: if we lowercase names with multiple words we might accidentally create valid looking names by
    // lowercasing the author
    // for example ABIES ALBA REMSEN will become an Abies alba remsen which will then be interpreted badly
    // ABIES ALBA LINNEAUS 1771 will even be Abies alba linneaus 1771, a perfectly formed zoological name
    // Locale.ROOT keeps the case mapping independent of the JVM default locale.
    if (!cleanedTaxon.contains(" ") && cleanedTaxon.equals(cleanedTaxon.toUpperCase(Locale.ROOT))) {
      cleanedTaxon = cleanedTaxon.substring(0, 1) + cleanedTaxon.substring(1).toLowerCase(Locale.ROOT);
    }

    // remove noise
    cleanedTaxon = CLEAN_REG_EX.matcher(cleanedTaxon).replaceAll("").trim();

    // don't let any blacklisted names through; the raw input is only looked up when cleaning changed it
    if (BlacklistedNames.contains(cleanedTaxon.toUpperCase(Locale.ROOT))
        || (!cleanedTaxon.equals(taxon) && BlacklistedNames.contains(taxon.toUpperCase(Locale.ROOT)))) {
      return null;
    }

    return StringUtils.trimToNull(cleanedTaxon);
  }

  /**
   * Clean some noise from the author. A large proportion are "\N" for example.
   *
   * @param author to clean
   *
   * @return cleaned author, or null if the input is empty, one of the {@link #NULL_STRINGS},
   *         or has nothing left after cleaning
   */
  public static String cleanAuthor(String author) {
    if (StringUtils.isEmpty(author) || NULL_STRINGS.contains(author)) {
      return null;
    }

    // remove the " from names with it at the beginning and end
    String cleanedAuthor = stripEnclosingQuotes(author);
    if (cleanedAuthor == null) {
      return null;
    }

    // remove noise
    cleanedAuthor = CLEAN_REG_EX.matcher(cleanedAuthor).replaceAll("").trim();

    return StringUtils.trimToNull(cleanedAuthor);
  }

  /**
   * Parses a scientific name without knowing its Rank.
   *
   * @param scientificName the name string handed to the name parser
   *
   * @return see {@link #parseName(String, Rank)}
   *
   * @deprecated insufficiently documented; should be in NameParser project if it is needed at all
   */
  @Deprecated
  public static String parseName(String scientificName) {
    return parseName(scientificName, null);
  }

  /**
   * Parses a scientific name of a specific rank.
   *
   * @param scientificName the name string handed to the name parser
   * @param rank the rank handed to the name parser, may be null
   *
   * @return the full name of the parsed name; if the name is unparsable the canonical form produced by the parser,
   *         and if that is null too the unchanged input
   *
   * @deprecated insufficiently documented; should be in NameParser project if it is needed at all
   */
  @Deprecated
  public static String parseName(String scientificName, Rank rank) {
    try {
      ParsedName pn = PARSER.parse(scientificName, rank);
      // Handle Aus sp. and Aus bus spp.
      if (pn.isIndetermined()) {
        pn.setRank(null);
      }
      return pn.fullName();

    } catch (UnparsableException e) {
      // TODO: logging
      // deliberately not rethrown: fall through to the canonical fallback below
    }

    // looks dirty, so try and normalize it as best we can and get a canonical at least
    String canon = PARSER.parseToCanonical(scientificName, rank);
    return canon != null ? canon : scientificName;
  }

  /**
   * Removes every leading and every trailing double quote from the given value.
   *
   * @param value non null string
   *
   * @return the value without quotes at its edges, empty if it consisted of quotes only
   */
  private static String stripQuotes(String value) {
    int start = 0;
    int end = value.length();
    while (start < end && value.charAt(start) == QUOTE) {
      start++;
    }
    while (end > start && value.charAt(end - 1) == QUOTE) {
      end--;
    }
    return value.substring(start, end);
  }

  /**
   * Removes double quotes only as long as the value both starts and ends with one.
   *
   * @param value non null string
   *
   * @return the unwrapped value, or null if nothing but a single lone quote is left
   */
  private static String stripEnclosingQuotes(String value) {
    String result = value;
    while (result.length() > 1 && result.charAt(0) == QUOTE && result.charAt(result.length() - 1) == QUOTE) {
      result = result.substring(1, result.length() - 1);
    }
    if (result.length() == 1 && result.charAt(0) == QUOTE) {
      return null;
    }
    return result;
  }
}