001/*
002 * Licensed under the Apache License, Version 2.0 (the "License");
003 * you may not use this file except in compliance with the License.
004 * You may obtain a copy of the License at
005 *
006 *     http://www.apache.org/licenses/LICENSE-2.0
007 *
008 * Unless required by applicable law or agreed to in writing, software
009 * distributed under the License is distributed on an "AS IS" BASIS,
010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
011 * See the License for the specific language governing permissions and
012 * limitations under the License.
013 */
014package org.gbif.common.parsers;
015
016import org.gbif.api.util.VocabularyUtils;
017import org.gbif.api.vocabulary.Country;
018import org.gbif.common.parsers.core.EnumParser;
019
020import java.util.regex.Pattern;
021
022import org.apache.commons.lang3.StringUtils;
023
024/**
025 * Singleton implementation of the dictionary that uses the file /dictionaries/parse/countryName.txt.
026 */
027public class CountryParser extends EnumParser<Country> {
028
029  private static CountryParser singletonObject = null;
030
031  // "off Australia"
032  private static final Pattern REMOVE_OFF_PATTERN = Pattern.compile("off ", Pattern.CASE_INSENSITIVE);
033  private static final Pattern REMOVE_ISO3166_PATTERN = Pattern.compile("\\b(ISO.?3166.?[123]?:?)", Pattern.CASE_INSENSITIVE);
034
035  private CountryParser() {
036    super(Country.class, false);
037    // also make sure we have all official iso countries mapped
038    for (Country c : Country.OFFICIAL_COUNTRIES) {
039      add(c.name(), c);
040      add(c.getTitle(), c);
041      add(c.getIso2LetterCode(), c);
042      add(c.getIso3LetterCode(), c);
043    }
044    // and Kosovo (which is not an official code, but should be treated as such by GBIF)
045    add(Country.KOSOVO.name(), Country.KOSOVO);
046    add(Country.KOSOVO.getTitle(), Country.KOSOVO);
047    add(Country.KOSOVO.getIso2LetterCode(), Country.KOSOVO);
048    add(Country.KOSOVO.getIso3LetterCode(), Country.KOSOVO);
049    // use dict file last
050    init(CountryParser.class.getResourceAsStream("/dictionaries/parse/countryName.tsv"));
051  }
052
053  @Override
054  protected String normalize(String value) {
055    value = handleNotAvailable(value);
056    if (value != null) {
057      String cleanedCountry = REMOVE_ISO3166_PATTERN.matcher(value).replaceAll("");
058      // step 1: remove all non-letter and not-whitespace characters
059      cleanedCountry = cleanedCountry.chars()
060          .filter(p -> Character.isLetter((char) p) || Character.isWhitespace(p))
061          .collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append)
062          .toString();
063      // step 2: remove 'off '
064      cleanedCountry = REMOVE_OFF_PATTERN.matcher(cleanedCountry).replaceFirst("");
065      // step 3: normalize whitespaces
066      cleanedCountry = StringUtils.normalizeSpace(cleanedCountry);
067      // step 4: trim to null
068      cleanedCountry = StringUtils.trimToNull(cleanedCountry);
069      return super.normalize(cleanedCountry);
070    }
071    return null;
072  }
073
074  @Override
075  protected Country fromDictFile(String value) {
076    Country c = Country.fromIsoCode(value);
077    if (c == null) {
078      try {
079        c = VocabularyUtils.lookupEnum(value, Country.class);
080      } catch (RuntimeException e) {
081      }
082    }
083    return c;
084  }
085
086  public static CountryParser getInstance() {
087    synchronized (CountryParser.class) {
088      if (singletonObject == null) {
089        singletonObject = new CountryParser();
090      }
091    }
092    return singletonObject;
093  }
094}