001/* 002 * Licensed under the Apache License, Version 2.0 (the "License"); 003 * you may not use this file except in compliance with the License. 004 * You may obtain a copy of the License at 005 * 006 * http://www.apache.org/licenses/LICENSE-2.0 007 * 008 * Unless required by applicable law or agreed to in writing, software 009 * distributed under the License is distributed on an "AS IS" BASIS, 010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 011 * See the License for the specific language governing permissions and 012 * limitations under the License. 013 */ 014package org.gbif.common.parsers; 015 016import org.gbif.api.util.VocabularyUtils; 017import org.gbif.api.vocabulary.Country; 018import org.gbif.common.parsers.core.EnumParser; 019 020import java.util.regex.Pattern; 021 022import org.apache.commons.lang3.StringUtils; 023 024/** 025 * Singleton implementation of the dictionary that uses the file /dictionaries/parse/countryName.txt. 026 */ 027public class CountryParser extends EnumParser<Country> { 028 029 private static CountryParser singletonObject = null; 030 031 // "off Australia" 032 private static final Pattern REMOVE_OFF_PATTERN = Pattern.compile("off ", Pattern.CASE_INSENSITIVE); 033 private static final Pattern REMOVE_ISO3166_PATTERN = Pattern.compile("\\b(ISO.?3166.?[123]?:?)", Pattern.CASE_INSENSITIVE); 034 035 private CountryParser() { 036 super(Country.class, false); 037 // also make sure we have all official iso countries mapped 038 for (Country c : Country.OFFICIAL_COUNTRIES) { 039 add(c.name(), c); 040 add(c.getTitle(), c); 041 add(c.getIso2LetterCode(), c); 042 add(c.getIso3LetterCode(), c); 043 } 044 // and Kosovo (which is not an official code, but should be treated as such by GBIF) 045 add(Country.KOSOVO.name(), Country.KOSOVO); 046 add(Country.KOSOVO.getTitle(), Country.KOSOVO); 047 add(Country.KOSOVO.getIso2LetterCode(), Country.KOSOVO); 048 add(Country.KOSOVO.getIso3LetterCode(), Country.KOSOVO); 049 // use dict file last 050 init(CountryParser.class.getResourceAsStream("/dictionaries/parse/countryName.tsv")); 051 } 052 053 @Override 054 protected String normalize(String value) { 055 value = handleNotAvailable(value); 056 if (value != null) { 057 String cleanedCountry = REMOVE_ISO3166_PATTERN.matcher(value).replaceAll(""); 058 // step 1: remove all non-letter and not-whitespace characters 059 cleanedCountry = cleanedCountry.chars() 060 .filter(p -> Character.isLetter((char) p) || Character.isWhitespace(p)) 061 .collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append) 062 .toString(); 063 // step 2: remove 'off ' 064 cleanedCountry = REMOVE_OFF_PATTERN.matcher(cleanedCountry).replaceFirst(""); 065 // step 3: normalize whitespaces 066 cleanedCountry = StringUtils.normalizeSpace(cleanedCountry); 067 // step 4: trim to null 068 cleanedCountry = StringUtils.trimToNull(cleanedCountry); 069 return super.normalize(cleanedCountry); 070 } 071 return null; 072 } 073 074 @Override 075 protected Country fromDictFile(String value) { 076 Country c = Country.fromIsoCode(value); 077 if (c == null) { 078 try { 079 c = VocabularyUtils.lookupEnum(value, Country.class); 080 } catch (RuntimeException e) { 081 } 082 } 083 return c; 084 } 085 086 public static CountryParser getInstance() { 087 synchronized (CountryParser.class) { 088 if (singletonObject == null) { 089 singletonObject = new CountryParser(); 090 } 091 } 092 return singletonObject; 093 } 094}