Source code

001/*
002 * Licensed under the Apache License, Version 2.0 (the "License");
003 * you may not use this file except in compliance with the License.
004 * You may obtain a copy of the License at
005 *
006 *     http://www.apache.org/licenses/LICENSE-2.0
007 *
008 * Unless required by applicable law or agreed to in writing, software
009 * distributed under the License is distributed on an "AS IS" BASIS,
010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
011 * See the License for the specific language governing permissions and
012 * limitations under the License.
013 */
014package org.gbif.common.parsers.utils;
015
import org.gbif.api.exception.UnparsableException;
import org.gbif.api.model.checklistbank.ParsedName;
import org.gbif.api.vocabulary.Rank;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;

import static org.gbif.common.parsers.utils.NameParserUtils.PARSER;
028
029/**
030 * Utilities to work on classifications.
031 */
032public final class ClassificationUtils {
033
034  // used to clean up bad characters
035  private static final Pattern CLEAN_REG_EX = Pattern.compile("[{}§';_|$%!?]+");
036
037  // common null strings to ignore for fast performance.
038  // Less frequent ones are kept in the blacklisted names dictionary!
039  public static final Set<String> NULL_STRINGS =
040    new HashSet<String>(Arrays.asList("/N", "\\", "\\\\", "\\N", "\\\\N", "null", "NULL", "Null"));
041
042  private ClassificationUtils() {
043    throw new UnsupportedOperationException("Can't initialize class");
044  }
045
046  /**
047   * Parses a canonical name at a specific Rank.
048   */
049  public static String canonicalName(String scientificName, Rank rank) {
050    ParsedName pn = null;
051    try {
052      pn = PARSER.parse(scientificName, rank);
053    } catch (UnparsableException e) {
054    }
055    return pn.canonicalNameWithMarker();
056  }
057
058  /**
059   * Parses a scientific name and creates the canonical name including a potential hybrid and rank marker
060   * plus the cultivar and strain names if existing.
061   * Note: This method once used to only include the hybrid marker - if that is still needed revert to buildName
062   * method.
063   */
064  public static String canonicalName(String scientificName) {
065    return canonicalName(scientificName,null);
066  }
067
068  /**
069   * Cleans up a taxon as far as possible by removing erroneous chars etc.
070   * This does not do any parsing.
071   *
072   * @param taxon to check
073   */
074  public static String clean(String taxon) {
075    if (StringUtils.isEmpty(taxon) || NULL_STRINGS.contains(taxon)) {
076      return null;
077    }
078
079    String cleanedTaxon = taxon;
080
081    // if it is a single word and ALL "UPPERCASE", turn it into a Capitalised word
082    // Note: if we lowercase names with multiple words we might accidently create valid looking names by lowercasing the
083    // author
084    // for example ABIES ALBA REMSEN will become an Abies alba remsen which will then be interpreted badly
085    // ABIES ALBA LINNEAUS 1771 will even be Abies alba linneaus 1771, a perfectly formed zoological name
086    if (!cleanedTaxon.contains(" ") && cleanedTaxon.equals(cleanedTaxon.toUpperCase())) {
087      cleanedTaxon = cleanedTaxon.substring(0, 1) + cleanedTaxon.substring(1).toLowerCase();
088    }
089
090    // remove the " from names with it at the beginning and end
091    while (cleanedTaxon.length()> 0 && cleanedTaxon.charAt(0) == '\"' && cleanedTaxon.charAt(cleanedTaxon.length() - 1) == '\"') {
092      if (cleanedTaxon.length() == 1) {
093        return null;
094      }
095      cleanedTaxon = cleanedTaxon.substring(1, cleanedTaxon.length() - 1);
096    }
097
098    // remove the " from names with it just at the beginning
099    while (cleanedTaxon.length()> 0 && cleanedTaxon.charAt(0) == '\"') {
100      if (cleanedTaxon.length() == 1) {
101        return null;
102      }
103      cleanedTaxon = cleanedTaxon.substring(1);
104    }
105
106    // remove the " from names with it just at the end
107    while (cleanedTaxon.length()> 0 && cleanedTaxon.charAt(cleanedTaxon.length() - 1) == '\"') {
108      if (cleanedTaxon.length() == 1) {
109        return null;
110      }
111      cleanedTaxon = cleanedTaxon.substring(0, cleanedTaxon.length() - 1);
112    }
113
114
115    // remove noise
116    cleanedTaxon = CLEAN_REG_EX.matcher(cleanedTaxon).replaceAll("");
117    cleanedTaxon = cleanedTaxon.trim();
118
119    // don't let any blacklisted names through
120    if (BlacklistedNames.contains(cleanedTaxon.toUpperCase()) || (!cleanedTaxon.equals(taxon) && BlacklistedNames
121      .contains(taxon.toUpperCase()))) {
122      // blacklisted name
123      return null;
124    }
125
126    return StringUtils.trimToNull(cleanedTaxon);
127  }
128
129  /**
130   * Clean some noise from the author. A large proportion are "\N" for example.
131   *
132   * @param author to clean
133   *
134   * @return cleaned author
135   */
136  public static String cleanAuthor(String author) {
137    if (StringUtils.isEmpty(author) || NULL_STRINGS.contains(author)) {
138      return null;
139    }
140
141    String cleanedAuthor = author;
142
143    // remove the " from names with it at the beginning and end
144    while (cleanedAuthor.length() >0 && cleanedAuthor.charAt(0) == '\"' && cleanedAuthor.charAt(cleanedAuthor.length() - 1) == '\"') {
145      if (cleanedAuthor.length() == 1) {
146        return null;
147      }
148      cleanedAuthor = cleanedAuthor.substring(1, cleanedAuthor.length() - 1);
149    }
150
151    // remove noise
152    cleanedAuthor = CLEAN_REG_EX.matcher(cleanedAuthor).replaceAll("");
153    cleanedAuthor = cleanedAuthor.trim();
154
155    return StringUtils.trimToNull(cleanedAuthor);
156  }
157
158  /**
159   * Parses a scientific name without knowing its Rank.
160   * @deprecated unsufficiently documented; should be in NameParser project if it is needed at all
161   */
162  @Deprecated
163  public static String parseName(String scientificName) {
164    return parseName(scientificName, null);
165  }
166
167  /**
168   * Parses a scientific name of a specific rank.
169   * @deprecated unsufficiently documented; should be in NameParser project if it is needed at all
170   */
171  @Deprecated
172  public static String parseName(String scientificName, Rank rank) {
173
174    try {
175      ParsedName pn = PARSER.parse(scientificName, rank);
176      // Handle Aus sp. and Aus bus spp.
177      if (pn.isIndetermined()) {
178          pn.setRank(null);
179      }
180      return pn.fullName();
181
182    } catch (UnparsableException e) {
183      // TODO: logging
184    }
185
186    // looks dirty, so try and normalize it as best we can and get a canonical at least
187    String canon = PARSER.parseToCanonical(scientificName, rank);
188    if (canon != null) {
189      return canon;
190    }
191
192    return scientificName;
193  }
194}