001/*
002 * Licensed under the Apache License, Version 2.0 (the "License");
003 * you may not use this file except in compliance with the License.
004 * You may obtain a copy of the License at
005 *
006 *     http://www.apache.org/licenses/LICENSE-2.0
007 *
008 * Unless required by applicable law or agreed to in writing, software
009 * distributed under the License is distributed on an "AS IS" BASIS,
010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
011 * See the License for the specific language governing permissions and
012 * limitations under the License.
013 */
014package org.gbif.common.parsers;
015
import org.gbif.api.vocabulary.License;
import org.gbif.common.parsers.core.ASCIIParser;
import org.gbif.common.parsers.core.FileBasedDictionaryParser;
import org.gbif.common.parsers.core.KeyValue;
import org.gbif.common.parsers.core.ParseResult;

import java.net.URI;
import java.util.Iterator;
import java.util.Locale;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
027
028/**
029 * A license parser giving URIs, compared to {@link License} which is for our GBIF enumeration.
030 *
031 * Singleton implementation of the dictionary that uses the file /dictionaries/parse/license_uri.txt to lookup a
032 * License by its URI or its acronym/title, e.g. a lookup by "CC-BY 4.0" returns http://creativecommons.org/licenses/by/4.0/.
033 * <br/>
034 * Note a lookup by license acronym/title without a version number defaults to the latest version of that license,
035 * e.g. a lookup by "CC-BY" returns http://creativecommons.org/licenses/by/4.0/.
036 */
037public class LicenseUriParser extends FileBasedDictionaryParser<URI> {
038
039  private static final String COMMENT_MARKER = "#";
040  private static final String LICENSE_FILEPATH = "/dictionaries/parse/license_uri.tsv";
041  // allows us to remove the protocol part for http:// and https://
042  private static final Pattern REMOVE_HTTP_PATTERN = Pattern.compile("^https?:\\/\\/", Pattern.CASE_INSENSITIVE);
043  private static final Pattern NORMALIZER = Pattern.compile("[^\\p{IsAlphabetic}\\p{N}©]+");
044  protected final ASCIIParser asciiParser = ASCIIParser.getInstance();
045  private static LicenseUriParser singletonObject = null;
046
047  private LicenseUriParser() {
048    super(false);
049    init(LicenseUriParser.class.getResourceAsStream(LICENSE_FILEPATH), COMMENT_MARKER);
050  }
051
052  /**
053   * @param source To build the dictionary from
054   */
055  @Override
056  public void init(Iterator<KeyValue<String, URI>> source) {
057    while (source.hasNext()) {
058      KeyValue<String, URI> kvp = source.next();
059      add(kvp.getKey(), kvp.getValue());
060      // Also adds the value, to save defining all of them.
061      add(kvp.getValue().toString(), kvp.getValue());
062    }
063  }
064
065  @Override
066  protected String normalize(String value) {
067    if (StringUtils.isEmpty(value)) {
068      return null;
069    }
070    ParseResult<String> ascii = asciiParser.parse(value);
071    String noHttp = REMOVE_HTTP_PATTERN.matcher(ascii.getPayload()).replaceAll("").toLowerCase();
072    return super.normalize(NORMALIZER.matcher(noHttp).replaceAll(""));
073  }
074
075  public static LicenseUriParser getInstance() {
076    synchronized (LicenseUriParser.class) {
077      if (singletonObject == null) {
078        singletonObject = new LicenseUriParser();
079      }
080    }
081    return singletonObject;
082  }
083
084  @Override
085  protected URI fromDictFile(String value) {
086    try {
087      return URI.create(value);
088    } catch (RuntimeException e) {
089      return null;
090    }
091  }
092}