Source code

001/*
002 * Licensed under the Apache License, Version 2.0 (the "License");
003 * you may not use this file except in compliance with the License.
004 * You may obtain a copy of the License at
005 *
006 *     http://www.apache.org/licenses/LICENSE-2.0
007 *
008 * Unless required by applicable law or agreed to in writing, software
009 * distributed under the License is distributed on an "AS IS" BASIS,
010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
011 * See the License for the specific language governing permissions and
012 * limitations under the License.
013 */
014package org.gbif.common.parsers.core;
015
016import org.gbif.api.util.VocabularyUtils;
017
018import java.io.InputStream;
019import java.util.Arrays;
020import java.util.HashSet;
021import java.util.Locale;
022import java.util.regex.Pattern;
023
024import org.apache.commons.lang3.StringUtils;
025
026/**
027 * Generic parser for enumerations making use of our vocabulary util to lookup an enum value from a string.
028 * Also adds stronger normalization removing any non-ASCII-alphanumeric characters. It is still ok to use hyphens or
029 * underscores in the enumeration values.
030 */
031public class EnumParser<T extends Enum<T>> extends FileBasedDictionaryParser<T> {
032
033  private final Class<T> clazz;
034  private final Pattern NORMALIZER;
035  protected final ASCIIParser asciiParser = ASCIIParser.getInstance();
036
037  // These become null, as after removing non-letters "N/A" might mean something like "Namibia".
038  // Only values that could conflict with an enum value need be included.
039  // Values are used in uppercase.
040  private final HashSet<String> notAvailable = new HashSet<>(
041      Arrays.asList(
042          "N/A", "N.A.", "N.A", "N / A", "#N/A", "[N/A]", "(N/A)", // Not available
043          "N/K", "N.K.", "N.K", "N / K", "#N/K", "[N/K]", "(N/K)", // Not known
044          "UNK.", "UNK" // Unknown
045      ));
046
047  protected EnumParser(Class<T> clazz, boolean allowDigits, final InputStream... inputs) {
048    super(false);
049
050    if (allowDigits) {
051      NORMALIZER = Pattern.compile("[^\\p{IsAlphabetic}♀♂\\p{N}]+");
052    } else {
053      NORMALIZER = Pattern.compile("[^\\p{IsAlphabetic}♀♂]+");
054    }
055    this.clazz = clazz;
056    // init dicts
057    addEnumValues();
058
059    if (inputs != null) {
060      for (InputStream input : inputs) {
061        init(input);
062      }
063    }
064  }
065
066  private void addEnumValues() {
067    T[] values = clazz.getEnumConstants();
068    if (values != null) {
069      for (T val : values) {
070        add(val.name(), val);
071      }
072    }
073  }
074
075  @Override
076  protected String normalize(String value) {
077    if (StringUtils.isEmpty(handleNotAvailable(value))) return null;
078
079    // convert to ascii
080    ParseResult<String> asci = asciiParser.parse(value);
081    return NORMALIZER.matcher(asci.getPayload()).replaceAll("").toUpperCase();
082  }
083
084  /**
085   * Treat "n/a" etc as null.
086   * A separate method so it can be called before stripping slash characters etc.
087   */
088  protected String handleNotAvailable(String value) {
089    if (value == null) {
090      return null;
091    }
092    return notAvailable.contains(value.toUpperCase(Locale.ENGLISH)) ? null : value;
093  }
094
095  @Override
096  protected T fromDictFile(String value) {
097    try {
098      return (T) VocabularyUtils.lookupEnum(value, clazz);
099    } catch (RuntimeException e) {
100      return null;
101    }
102  }
103
104}