001/* 002 * Licensed under the Apache License, Version 2.0 (the "License"); 003 * you may not use this file except in compliance with the License. 004 * You may obtain a copy of the License at 005 * 006 * http://www.apache.org/licenses/LICENSE-2.0 007 * 008 * Unless required by applicable law or agreed to in writing, software 009 * distributed under the License is distributed on an "AS IS" BASIS, 010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 011 * See the License for the specific language governing permissions and 012 * limitations under the License. 013 */ 014package org.gbif.common.parsers; 015 016import org.gbif.api.vocabulary.License; 017import org.gbif.common.parsers.core.ASCIIParser; 018import org.gbif.common.parsers.core.FileBasedDictionaryParser; 019import org.gbif.common.parsers.core.KeyValue; 020import org.gbif.common.parsers.core.ParseResult; 021 022import java.net.URI; 023import java.util.Iterator; 024import java.util.regex.Pattern; 025 026import org.apache.commons.lang3.StringUtils; 027 028/** 029 * A license parser giving URIs, compared to {@link License} which is for our GBIF enumeration. 030 * 031 * Singleton implementation of the dictionary that uses the file /dictionaries/parse/license_uri.txt to lookup a 032 * License by its URI or its acronym/title, e.g. a lookup by "CC-BY 4.0" returns http://creativecommons.org/licenses/by/4.0/. 033 * <br/> 034 * Note a lookup by license acronym/title without a version number defaults to the latest version of that license, 035 * e.g. a lookup by "CC-BY" returns http://creativecommons.org/licenses/by/4.0/. 036 */ 037public class LicenseUriParser extends FileBasedDictionaryParser<URI> { 038 039 private static final String COMMENT_MARKER = "#"; 040 private static final String LICENSE_FILEPATH = "/dictionaries/parse/license_uri.tsv"; 041 // allows us to remove the protocol part for http:// and https:// 042 private static final Pattern REMOVE_HTTP_PATTERN = Pattern.compile("^https?:\\/\\/", Pattern.CASE_INSENSITIVE); 043 private static final Pattern NORMALIZER = Pattern.compile("[^\\p{IsAlphabetic}\\p{N}©]+"); 044 protected final ASCIIParser asciiParser = ASCIIParser.getInstance(); 045 private static LicenseUriParser singletonObject = null; 046 047 private LicenseUriParser() { 048 super(false); 049 init(LicenseUriParser.class.getResourceAsStream(LICENSE_FILEPATH), COMMENT_MARKER); 050 } 051 052 /** 053 * @param source To build the dictionary from 054 */ 055 @Override 056 public void init(Iterator<KeyValue<String, URI>> source) { 057 while (source.hasNext()) { 058 KeyValue<String, URI> kvp = source.next(); 059 add(kvp.getKey(), kvp.getValue()); 060 // Also adds the value, to save defining all of them. 061 add(kvp.getValue().toString(), kvp.getValue()); 062 } 063 } 064 065 @Override 066 protected String normalize(String value) { 067 if (StringUtils.isEmpty(value)) { 068 return null; 069 } 070 ParseResult<String> ascii = asciiParser.parse(value); 071 String noHttp = REMOVE_HTTP_PATTERN.matcher(ascii.getPayload()).replaceAll("").toLowerCase(); 072 return super.normalize(NORMALIZER.matcher(noHttp).replaceAll("")); 073 } 074 075 public static LicenseUriParser getInstance() { 076 synchronized (LicenseUriParser.class) { 077 if (singletonObject == null) { 078 singletonObject = new LicenseUriParser(); 079 } 080 } 081 return singletonObject; 082 } 083 084 @Override 085 protected URI fromDictFile(String value) { 086 try { 087 return URI.create(value); 088 } catch (RuntimeException e) { 089 return null; 090 } 091 } 092}