001/*
002 * Licensed under the Apache License, Version 2.0 (the "License");
003 * you may not use this file except in compliance with the License.
004 * You may obtain a copy of the License at
005 *
006 *     http://www.apache.org/licenses/LICENSE-2.0
007 *
008 * Unless required by applicable law or agreed to in writing, software
009 * distributed under the License is distributed on an "AS IS" BASIS,
010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
011 * See the License for the specific language governing permissions and
012 * limitations under the License.
013 */
014package org.gbif.common.parsers;
015
016import java.net.URI;
017import java.util.ArrayList;
018import java.util.Arrays;
019import java.util.List;
020import java.util.regex.Pattern;
021import java.util.stream.Collectors;
022
023import org.apache.commons.lang3.StringUtils;
024import org.slf4j.Logger;
025import org.slf4j.LoggerFactory;
026
027/**
028 * Greedy URL parser assuming HTTP URIs in case no schema was given.
029 * Modified version of the registry-metadata GreedyUriConverter.
030 */
031public class UrlParser {
032  private static final Logger LOG = LoggerFactory.getLogger(UrlParser.class);
033  private static final String[] MULTI_VALUE_DELIMITERS = {"|#DELIMITER#|", "|", ",", ";"};
034  private static final String HTTP_SCHEME = "http://";
035
036  // Pattern for things that are probably domains followed by a slash, without a protocol.
037  // Doesn't match IDNs etc, but this is just for people who forgot the http:// anyway.
038  // The longest TLD currently in existence is 24 characters long, but can be up to 63 according to specs.
039  private static final Pattern DOMAIN_ISH = Pattern.compile("^[A-Za-z0-9.-]{1,60}\\.[A-Za-z]{2,10}(?:/.*)?");
040
041  private UrlParser() {
042  }
043
044  /**
045   * Convert a String into a java.net.URI.
046   * In case its missing the protocol prefix, it is prefixed with the default protocol.
047   *
048   * @param value The input value to be converted
049   *
050   * @return The converted value, or null if not parsable or exception occurred
051   */
052  public static URI parse(String value) {
053    value = StringUtils.trimToEmpty(value);
054    if (StringUtils.isEmpty(value)) {
055      return null;
056    }
057
058    URI uri = null;
059    try {
060      uri = URI.create(value);
061      if (!uri.isAbsolute() && DOMAIN_ISH.matcher(value).matches()) {
062        // make into an HTTP address
063        try {
064          uri = URI.create(HTTP_SCHEME + value);
065        } catch (IllegalArgumentException e) {
066          // keep the previous scheme-less result
067        }
068      }
069
070      // verify that we have a domain
071      if (StringUtils.isEmpty(uri.getHost())) {
072        return null;
073      }
074
075    } catch (IllegalArgumentException ignored) {
076    }
077
078    return uri;
079  }
080
081
082  /**
083   * Parses a single string with null, one or many URIs concatenated together as found in dwc:associatedMedia.
084   */
085  public static List<URI> parseUriList(String uris) {
086    List<URI> result = new ArrayList<>();
087
088    if (StringUtils.isNotEmpty(uris)) {
089      // first try to use the entire string
090      URI uri = UrlParser.parse(uris);
091      if (uri != null) {
092        result.add(uri);
093
094      } else {
095        // try common delimiters
096        int maxValidUrls = 0;
097        for (String delimiter : MULTI_VALUE_DELIMITERS) {
098          List<String> urls = Arrays.stream(StringUtils.splitByWholeSeparator(uris, delimiter))
099              .map(org.gbif.utils.text.StringUtils::trim)
100              .filter(StringUtils::isNotEmpty)
101              .collect(Collectors.toList());
102
103          // avoid parsing if we haven't actually split anything
104          if (urls.size() > 1) {
105            List<URI> tmp = new ArrayList<>();
106            for (String url : urls) {
107              uri = UrlParser.parse(url);
108              if (uri != null) {
109                tmp.add(uri);
110              }
111            }
112            if (tmp.size() > maxValidUrls) {
113              result = tmp;
114              maxValidUrls = tmp.size();
115            } else if (maxValidUrls > 0 && tmp.size() == maxValidUrls) {
116              LOG.info("Unclear what delimiter is being used for concatenated URIs = {}", uris);
117            }
118          }
119        }
120      }
121    }
122    return result;
123  }
124
125}