Source code

001/*
002 * Licensed under the Apache License, Version 2.0 (the "License");
003 * you may not use this file except in compliance with the License.
004 * You may obtain a copy of the License at
005 *
006 *     http://www.apache.org/licenses/LICENSE-2.0
007 *
008 * Unless required by applicable law or agreed to in writing, software
009 * distributed under the License is distributed on an "AS IS" BASIS,
010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
011 * See the License for the specific language governing permissions and
012 * limitations under the License.
013 */
014package org.gbif.common.parsers;
015
016import org.gbif.api.vocabulary.Language;
017import org.gbif.common.parsers.core.EnumParser;
018import org.gbif.utils.file.csv.CSVReader;
019import org.gbif.utils.file.csv.CSVReaderFactory;
020
021import java.util.Arrays;
022import java.util.HashSet;
023import java.util.List;
024import java.util.Locale;
025import java.util.Set;
026import java.util.regex.Pattern;
027
028import org.apache.commons.lang3.StringUtils;
029
030/**
031 * Singleton implementation of the case insensitive ISO 639-1 language dictionary
032 * that uses the org.gbif.common.api.Language enumeration.
033 * Parsing results will be upper cased 2 letter codes.
034 * Understood inputs are 2 or 3 letter (both terminological and bibliographical) ISO codes
035 * and natural language names given in any of the ISO languages.
036 */
037public class LanguageParser extends EnumParser<Language> {
038
039  private static LanguageParser singletonObject = null;
040  private static final Pattern LOCALE = Pattern.compile("^[a-zA-Z]{2,3}[_-][a-zA-Z0-9]");
041  private static final List<Pattern> REMOVE_FROM_NAME_PATTERNS = Arrays.asList(
042      // remove brackets
043      Pattern.compile("\\(.\\)"),
044      // remove French ", langues"
045      Pattern.compile(", ?langues"),
046      // remove English " languages"
047      Pattern.compile(" languages")
048  );
049
050
051  private LanguageParser() {
052    super(Language.class, false, LanguageParser.class.getResourceAsStream("/dictionaries/parse/language.tsv"));
053
054    // make sure we have all enum values mapped
055    for (Language r : Language.values()) {
056      add(r.name(), r);
057      add(r.getTitleEnglish(), r);
058      add(r.getTitleNative(), r);
059      add(r.getIso2LetterCode(), r);
060      add(r.getIso3LetterCode(), r);
061    }
062
063    // make sure we have all enum values mapped
064    for (Locale l : Locale.getAvailableLocales()) {
065      Language lang = Language.fromIsoCode(l.getISO3Language());
066      if (lang == null) {
067        log.warn("ISO code {} not part of our language enumeration", lang);
068        continue;
069      }
070      add(l.getISO3Language(), lang);
071      add(l.getDisplayLanguage(), lang);
072      add(l.getLanguage(), lang);
073      for (Locale l2 : Locale.getAvailableLocales()) {
074        add(l.getDisplayLanguage(l2), lang);
075      }
076    }
077
078    // OFFICIAL LIST, downloaded from
079    // http://www.loc.gov/standards/iso639-2/ascii_8bits.html
080
081    // An alpha-3 (bibliographic) code
082    // an alpha-3 (terminologic) code (when given)
083    // an alpha-2 code (when given)
084    // an English name
085    // a French name
086    try {
087      CSVReader r = CSVReaderFactory.build(LanguageParser.class.getResourceAsStream("/dictionaries/parse/ISO-639-2_utf-8.txt"), "UTF8", "|", null, 0);
088      while(r.hasNext()) {
089        String[] row = r.next();
090        if (row.length>2) {
091          // ISO 2 letter code
092          String alpha2 = row[2];
093          if (!StringUtils.isBlank(alpha2)) {
094            Language lang = Language.fromIsoCode(alpha2);
095            if (lang == null || lang == Language.UNKNOWN) {
096              log.warn("ISO code {} not part of our language enumeration", alpha2);
097              continue;
098            }
099            // alpha-3 (bibliographic)
100            add(row[0], lang);
101            // alpha-3 (terminologic)
102            add(row[1], lang);
103            // English
104            for (String l : mutateLanguageName(row[3])) {
105              add(l, lang);
106            }
107            // French
108            for (String l : mutateLanguageName(row[4])) {
109              add(l, lang);
110            }
111          }
112        }
113      }
114    } catch (Exception e) {
115      log.error("Cannot initiate language parser: {}", e.getMessage());
116      throw new IllegalStateException("Cannot initiate language parser", e);
117    }
118
119
120    // ISO 639-3 list from SIL:
121    // http://www-01.sil.org/iso639-3/download.asp
122    try {
123      // Id     Part2B  Part2T  Part1   Scope   Language_Type   Ref_Name        Comment
124      CSVReader r = CSVReaderFactory.buildTabReader(LanguageParser.class.getResourceAsStream("/dictionaries/parse/iso-639-3-sil.tab"), "UTF8", 1);
125      while(r.hasNext()) {
126        String[] row = r.next();
127        if (row.length>2) {
128          String alpha2 = row[3];
129          if (!StringUtils.isBlank(alpha2)) {
130            Language lang = Language.fromIsoCode(alpha2);
131            if (lang == null || lang == Language.UNKNOWN) {
132              log.warn("ISO code {} not part of our language enumeration", alpha2);
133              continue;
134            }
135            // 3-letter code
136            add(row[0], lang);
137            // 3-letter code part2B
138            add(row[1], lang);
139            // 3-letter code part2T
140            add(row[2], lang);
141            // name
142            add(row[6], lang);
143          }
144        }
145      }
146    } catch (Exception e) {
147      log.error("Cannot initiate language parser: {}", e.getMessage());
148      throw new IllegalStateException("Cannot initiate language parser", e);
149    }
150  }
151
152  private Set<String> mutateLanguageName(String lang) {
153    Set<String> langs = new HashSet<>();
154    for (String l :  lang.split(";")) {
155      langs.add(l);
156      // also remove common patterns
157      for (Pattern p : REMOVE_FROM_NAME_PATTERNS) {
158        langs.add(p.matcher(l).replaceAll(""));
159      }
160    }
161    return langs;
162  }
163
164  @Override
165  protected String normalize(String value) {
166    if (value != null) {
167      /**
168       * A language string could come in as a locale like "en-US", "eng-US" or "es-419", or "en_US" in some systems, so
169       * extract only the part before the hyphen/underscore. Only if it contains "-" or "_" is parsing attempted.
170       * Whether it actually represents an ISO 369 language code is left for the language parser to determine.
171       *
172       * https://en.wikipedia.org/wiki/IETF_language_tag
173       *
174       * (Note the form en_US is used by Java and many Unix systems.)
175       */
176      if (LOCALE.matcher(value).find()) {
177        int index = value.indexOf("-");
178        if (index == -1) {
179          index = value.indexOf("_");
180        }
181        // only allow underscore
182        if (index > 1 && index < 4 ) {
183          return super.normalize(value.substring(0, index));
184        }
185      }
186      return super.normalize(value);
187    }
188    return null;
189  }
190
191  public static LanguageParser getInstance() {
192    synchronized (LanguageParser.class) {
193      if (singletonObject == null) {
194        singletonObject = new LanguageParser();
195      }
196    }
197    return singletonObject;
198  }
199}