/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.gbif.common.parsers;

import java.net.URI;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Greedy URL parser assuming HTTP URIs in case no scheme was given.
 * Modified version of the registry-metadata GreedyUriConverter.
 *
 * <p>This is a static utility class and cannot be instantiated.
 */
public class UrlParser {
  private static final Logger LOG = LoggerFactory.getLogger(UrlParser.class);

  // Delimiters tried, in this order, when a string may hold several concatenated URIs.
  private static final String[] MULTI_VALUE_DELIMITERS = {"|#DELIMITER#|", "|", ",", ";"};
  private static final String HTTP_SCHEME = "http://";

  // Pattern for things that are probably domains, optionally followed by a slash and a path,
  // without a protocol.
  // Doesn't match IDNs etc, but this is just for people who forgot the http:// anyway.
  // The longest TLD currently in existence is 24 characters long, but a label can be up to 63
  // characters according to the DNS specs, so that is the upper bound accepted for the TLD here.
  private static final Pattern DOMAIN_ISH = Pattern.compile("^[A-Za-z0-9.-]{1,60}\\.[A-Za-z]{2,63}(?:/.*)?");

  private UrlParser() {
  }

  /**
   * Convert a String into a java.net.URI.
   * In case it is missing the protocol prefix and looks like a domain name
   * (optionally followed by a path), it is prefixed with {@code http://}.
   *
   * @param value The input value to be converted, may be null
   *
   * @return The converted value, or null if the value is null, blank, not a syntactically
   *         valid URI, or a URI without a host component
   */
  public static URI parse(String value) {
    value = StringUtils.trimToEmpty(value);
    if (value.isEmpty()) {
      return null;
    }

    URI uri;
    try {
      uri = URI.create(value);
    } catch (IllegalArgumentException ignored) {
      // the value violates URI syntax, there is nothing to salvage
      return null;
    }

    if (!uri.isAbsolute() && DOMAIN_ISH.matcher(value).matches()) {
      // no scheme but it looks like a domain: make it into an HTTP address
      try {
        uri = URI.create(HTTP_SCHEME + value);
      } catch (IllegalArgumentException ignored) {
        // keep the previous scheme-less result; it has no host and is rejected below
      }
    }

    // verify that we have a domain
    return StringUtils.isEmpty(uri.getHost()) ? null : uri;
  }


  /**
   * Parses a single string with null, one or many URIs concatenated together as found in dwc:associatedMedia.
   *
   * <p>The entire string is tried as one URI first. Only if that fails, the string is split by each
   * of the known delimiters in turn and the split yielding the highest number of parsable URIs wins.
   * Fragments that cannot be parsed are dropped. If a later delimiter yields the same non-zero number
   * of URIs as the best one so far, the earlier result is kept and the ambiguity is logged.
   *
   * @param uris the raw, possibly concatenated value, may be null
   *
   * @return the parsed URIs, never null but possibly empty
   */
  public static List<URI> parseUriList(String uris) {
    List<URI> result = new ArrayList<>();
    if (StringUtils.isEmpty(uris)) {
      return result;
    }

    // first try to use the entire string
    URI whole = UrlParser.parse(uris);
    if (whole != null) {
      result.add(whole);
      return result;
    }

    // try common delimiters
    int maxValidUrls = 0;
    for (String delimiter : MULTI_VALUE_DELIMITERS) {
      List<String> urls = splitAndTrim(uris, delimiter);

      // avoid parsing if we haven't actually split anything
      if (urls.size() < 2) {
        continue;
      }

      List<URI> parsed = new ArrayList<>();
      for (String url : urls) {
        URI uri = UrlParser.parse(url);
        if (uri != null) {
          parsed.add(uri);
        }
      }

      if (parsed.size() > maxValidUrls) {
        result = parsed;
        maxValidUrls = parsed.size();
      } else if (maxValidUrls > 0 && parsed.size() == maxValidUrls) {
        LOG.info("Unclear what delimiter is being used for concatenated URIs = {}", uris);
      }
    }
    return result;
  }

  /**
   * Splits the value by the whole delimiter string, trims every fragment and drops empty ones.
   */
  private static List<String> splitAndTrim(String value, String delimiter) {
    return Arrays.stream(StringUtils.splitByWholeSeparator(value, delimiter))
      .map(org.gbif.utils.text.StringUtils::trim)
      .filter(StringUtils::isNotEmpty)
      .collect(Collectors.toList());
  }

}