001/* 002 * Licensed under the Apache License, Version 2.0 (the "License"); 003 * you may not use this file except in compliance with the License. 004 * You may obtain a copy of the License at 005 * 006 * http://www.apache.org/licenses/LICENSE-2.0 007 * 008 * Unless required by applicable law or agreed to in writing, software 009 * distributed under the License is distributed on an "AS IS" BASIS, 010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 011 * See the License for the specific language governing permissions and 012 * limitations under the License. 013 */ 014package org.gbif.common.parsers.core; 015 016import org.gbif.api.util.VocabularyUtils; 017 018import java.io.InputStream; 019import java.util.Arrays; 020import java.util.HashSet; 021import java.util.Locale; 022import java.util.regex.Pattern; 023 024import org.apache.commons.lang3.StringUtils; 025 026/** 027 * Generic parser for enumerations making use of our vocabulary util to lookup an enum value from a string. 028 * Also adds stronger normalization removing any non-ASCII-alphanumeric characters. It is still ok to use hyphens or 029 * underscores in the enumeration values. 030 */ 031public class EnumParser<T extends Enum<T>> extends FileBasedDictionaryParser<T> { 032 033 private final Class<T> clazz; 034 private final Pattern NORMALIZER; 035 protected final ASCIIParser asciiParser = ASCIIParser.getInstance(); 036 037 // These become null, as after removing non-letters "N/A" might mean something like "Namibia". 038 // Only values that could conflict with an enum value need be included. 039 // Values are used in uppercase. 040 private final HashSet<String> notAvailable = new HashSet<>( 041 Arrays.asList( 042 "N/A", "N.A.", "N.A", "N / A", "#N/A", "[N/A]", "(N/A)", // Not available 043 "N/K", "N.K.", "N.K", "N / K", "#N/K", "[N/K]", "(N/K)", // Not known 044 "UNK.", "UNK" // Unknown 045 )); 046 047 protected EnumParser(Class<T> clazz, boolean allowDigits, final InputStream... inputs) { 048 super(false); 049 050 if (allowDigits) { 051 NORMALIZER = Pattern.compile("[^\\p{IsAlphabetic}♀♂\\p{N}]+"); 052 } else { 053 NORMALIZER = Pattern.compile("[^\\p{IsAlphabetic}♀♂]+"); 054 } 055 this.clazz = clazz; 056 // init dicts 057 addEnumValues(); 058 059 if (inputs != null) { 060 for (InputStream input : inputs) { 061 init(input); 062 } 063 } 064 } 065 066 private void addEnumValues() { 067 T[] values = clazz.getEnumConstants(); 068 if (values != null) { 069 for (T val : values) { 070 add(val.name(), val); 071 } 072 } 073 } 074 075 @Override 076 protected String normalize(String value) { 077 if (StringUtils.isEmpty(handleNotAvailable(value))) return null; 078 079 // convert to ascii 080 ParseResult<String> asci = asciiParser.parse(value); 081 return NORMALIZER.matcher(asci.getPayload()).replaceAll("").toUpperCase(); 082 } 083 084 /** 085 * Treat "n/a" etc as null. 086 * A separate method so it can be called before stripping slash characters etc. 087 */ 088 protected String handleNotAvailable(String value) { 089 if (value == null) { 090 return null; 091 } 092 return notAvailable.contains(value.toUpperCase(Locale.ENGLISH)) ? null : value; 093 } 094 095 @Override 096 protected T fromDictFile(String value) { 097 try { 098 return (T) VocabularyUtils.lookupEnum(value, clazz); 099 } catch (RuntimeException e) { 100 return null; 101 } 102 } 103 104}