001/* 002 * Licensed under the Apache License, Version 2.0 (the "License"); 003 * you may not use this file except in compliance with the License. 004 * You may obtain a copy of the License at 005 * 006 * http://www.apache.org/licenses/LICENSE-2.0 007 * 008 * Unless required by applicable law or agreed to in writing, software 009 * distributed under the License is distributed on an "AS IS" BASIS, 010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 011 * See the License for the specific language governing permissions and 012 * limitations under the License. 013 */ 014package org.gbif.common.parsers; 015 016import org.gbif.api.vocabulary.Language; 017import org.gbif.common.parsers.core.EnumParser; 018import org.gbif.utils.file.csv.CSVReader; 019import org.gbif.utils.file.csv.CSVReaderFactory; 020 021import java.util.Arrays; 022import java.util.HashSet; 023import java.util.List; 024import java.util.Locale; 025import java.util.Set; 026import java.util.regex.Pattern; 027 028import org.apache.commons.lang3.StringUtils; 029 030/** 031 * Singleton implementation of the case insensitive ISO 639-1 language dictionary 032 * that uses the org.gbif.common.api.Language enumeration. 033 * Parsing results will be upper cased 2 letter codes. 034 * Understood inputs are 2 or 3 letter (both terminological and bibliographical) ISO codes 035 * and natural language names given in any of the ISO languages. 036 */ 037public class LanguageParser extends EnumParser<Language> { 038 039 private static LanguageParser singletonObject = null; 040 private static final Pattern LOCALE = Pattern.compile("^[a-zA-Z]{2,3}[_-][a-zA-Z0-9]"); 041 private static final List<Pattern> REMOVE_FROM_NAME_PATTERNS = Arrays.asList( 042 // remove brackets 043 Pattern.compile("\\(.\\)"), 044 // remove French ", langues" 045 Pattern.compile(", ?langues"), 046 // remove English " languages" 047 Pattern.compile(" languages") 048 ); 049 050 051 private LanguageParser() { 052 super(Language.class, false, LanguageParser.class.getResourceAsStream("/dictionaries/parse/language.tsv")); 053 054 // make sure we have all enum values mapped 055 for (Language r : Language.values()) { 056 add(r.name(), r); 057 add(r.getTitleEnglish(), r); 058 add(r.getTitleNative(), r); 059 add(r.getIso2LetterCode(), r); 060 add(r.getIso3LetterCode(), r); 061 } 062 063 // make sure we have all enum values mapped 064 for (Locale l : Locale.getAvailableLocales()) { 065 Language lang = Language.fromIsoCode(l.getISO3Language()); 066 if (lang == null) { 067 log.warn("ISO code {} not part of our language enumeration", lang); 068 continue; 069 } 070 add(l.getISO3Language(), lang); 071 add(l.getDisplayLanguage(), lang); 072 add(l.getLanguage(), lang); 073 for (Locale l2 : Locale.getAvailableLocales()) { 074 add(l.getDisplayLanguage(l2), lang); 075 } 076 } 077 078 // OFFICIAL LIST, downloaded from 079 // http://www.loc.gov/standards/iso639-2/ascii_8bits.html 080 081 // An alpha-3 (bibliographic) code 082 // an alpha-3 (terminologic) code (when given) 083 // an alpha-2 code (when given) 084 // an English name 085 // a French name 086 try { 087 CSVReader r = CSVReaderFactory.build(LanguageParser.class.getResourceAsStream("/dictionaries/parse/ISO-639-2_utf-8.txt"), "UTF8", "|", null, 0); 088 while(r.hasNext()) { 089 String[] row = r.next(); 090 if (row.length>2) { 091 // ISO 2 letter code 092 String alpha2 = row[2]; 093 if (!StringUtils.isBlank(alpha2)) { 094 Language lang = Language.fromIsoCode(alpha2); 095 if (lang == null || lang == Language.UNKNOWN) { 096 log.warn("ISO code {} not part of our language enumeration", alpha2); 097 continue; 098 } 099 // alpha-3 (bibliographic) 100 add(row[0], lang); 101 // alpha-3 (terminologic) 102 add(row[1], lang); 103 // English 104 for (String l : mutateLanguageName(row[3])) { 105 add(l, lang); 106 } 107 // French 108 for (String l : mutateLanguageName(row[4])) { 109 add(l, lang); 110 } 111 } 112 } 113 } 114 } catch (Exception e) { 115 log.error("Cannot initiate language parser: {}", e.getMessage()); 116 throw new IllegalStateException("Cannot initiate language parser", e); 117 } 118 119 120 // ISO 639-3 list from SIL: 121 // http://www-01.sil.org/iso639-3/download.asp 122 try { 123 // Id Part2B Part2T Part1 Scope Language_Type Ref_Name Comment 124 CSVReader r = CSVReaderFactory.buildTabReader(LanguageParser.class.getResourceAsStream("/dictionaries/parse/iso-639-3-sil.tab"), "UTF8", 1); 125 while(r.hasNext()) { 126 String[] row = r.next(); 127 if (row.length>2) { 128 String alpha2 = row[3]; 129 if (!StringUtils.isBlank(alpha2)) { 130 Language lang = Language.fromIsoCode(alpha2); 131 if (lang == null || lang == Language.UNKNOWN) { 132 log.warn("ISO code {} not part of our language enumeration", alpha2); 133 continue; 134 } 135 // 3-letter code 136 add(row[0], lang); 137 // 3-letter code part2B 138 add(row[1], lang); 139 // 3-letter code part2T 140 add(row[2], lang); 141 // name 142 add(row[6], lang); 143 } 144 } 145 } 146 } catch (Exception e) { 147 log.error("Cannot initiate language parser: {}", e.getMessage()); 148 throw new IllegalStateException("Cannot initiate language parser", e); 149 } 150 } 151 152 private Set<String> mutateLanguageName(String lang) { 153 Set<String> langs = new HashSet<>(); 154 for (String l : lang.split(";")) { 155 langs.add(l); 156 // also remove common patterns 157 for (Pattern p : REMOVE_FROM_NAME_PATTERNS) { 158 langs.add(p.matcher(l).replaceAll("")); 159 } 160 } 161 return langs; 162 } 163 164 @Override 165 protected String normalize(String value) { 166 if (value != null) { 167 /** 168 * A language string could come in as a locale like "en-US", "eng-US" or "es-419", or "en_US" in some systems, so 169 * extract only the part before the hyphen/underscore. Only if it contains "-" or "_" is parsing attempted. 170 * Whether it actually represents an ISO 369 language code is left for the language parser to determine. 171 * 172 * https://en.wikipedia.org/wiki/IETF_language_tag 173 * 174 * (Note the form en_US is used by Java and many Unix systems.) 175 */ 176 if (LOCALE.matcher(value).find()) { 177 int index = value.indexOf("-"); 178 if (index == -1) { 179 index = value.indexOf("_"); 180 } 181 // only allow underscore 182 if (index > 1 && index < 4 ) { 183 return super.normalize(value.substring(0, index)); 184 } 185 } 186 return super.normalize(value); 187 } 188 return null; 189 } 190 191 public static LanguageParser getInstance() { 192 synchronized (LanguageParser.class) { 193 if (singletonObject == null) { 194 singletonObject = new LanguageParser(); 195 } 196 } 197 return singletonObject; 198 } 199}