001/* 002 * Licensed under the Apache License, Version 2.0 (the "License"); 003 * you may not use this file except in compliance with the License. 004 * You may obtain a copy of the License at 005 * 006 * http://www.apache.org/licenses/LICENSE-2.0 007 * 008 * Unless required by applicable law or agreed to in writing, software 009 * distributed under the License is distributed on an "AS IS" BASIS, 010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 011 * See the License for the specific language governing permissions and 012 * limitations under the License. 013 */ 014package org.gbif.common.parsers; 015 016import org.gbif.api.model.common.MediaObject; 017import org.gbif.api.vocabulary.MediaType; 018 019import java.net.URI; 020import java.util.Arrays; 021import java.util.Collections; 022import java.util.HashMap; 023import java.util.HashSet; 024import java.util.Map; 025import java.util.Set; 026import java.util.regex.Pattern; 027 028import javax.annotation.Nullable; 029 030import org.apache.commons.lang3.StringUtils; 031import org.apache.tika.Tika; 032import org.apache.tika.mime.MediaTypeRegistry; 033import org.apache.tika.mime.MimeType; 034import org.apache.tika.mime.MimeTypeException; 035import org.apache.tika.mime.MimeTypes; 036import org.slf4j.Logger; 037import org.slf4j.LoggerFactory; 038 039public class MediaParser { 040 private static final Logger LOG = LoggerFactory.getLogger(MediaParser.class); 041 private static final Tika TIKA = new Tika(); 042 private static final MimeTypes MIME_TYPES = MimeTypes.getDefaultMimeTypes(); 043 private static final String HTML_TYPE = "text/html"; 044 // MIME types which we consider as HTML links instead of real media file URIs 045 private static final Set<String> HTML_MIME_TYPES = Collections.unmodifiableSet( 046 new HashSet<>( 047 Arrays.asList("text/x-coldfusion", "text/x-php", "text/asp", "text/aspdotnet", "text/x-cgi", 048 "text/x-jsp", "text/x-perl", HTML_TYPE, MimeTypes.OCTET_STREAM))); 049 050 // List of exceptions, could be read from a file if it grows. URLs matching this return a media file. 051 private static final Map<Pattern, String> knownUrlPatterns; 052 053 // Add missing alias types. 054 static { 055 knownUrlPatterns = new HashMap<>(); 056 knownUrlPatterns.put(Pattern.compile("http://www\\.unimus\\.no/felles/bilder/web_hent_bilde\\.php\\?id=\\d+&type=jpeg"), "image/jpeg"); 057 knownUrlPatterns.put(Pattern.compile("http://www\\.jacq\\.org/image\\.php\\?filename=\\d+&method=europeana"), "image/jpeg"); 058 knownUrlPatterns.put(Pattern.compile("https://images\\.ala\\.org\\.au/image/proxyImageThumbnailLarge\\?imageId=[0-9a-f-]{36}"), "image/jpeg"); 059 knownUrlPatterns.put(Pattern.compile("http://[a-zA-Z0-9-]+\\.wildlifemonitoring\\.ru/get_photo\\.php\\?id=\\d+"), "image/jpeg"); 060 knownUrlPatterns.put(Pattern.compile("http://procyon\\.acadiau\\.ca/ecsmith/cgi-bin/image\\.cgi\\?[0-9A-Z]+,jpeg"), "image/jpeg"); 061 knownUrlPatterns.put(Pattern.compile("http://www\\.biologie\\.uni-ulm\\.de/cgi-bin/perl/sound\\.pl\\?sid=T&objid=\\d+"), "audio/vnd.wave"); 062 knownUrlPatterns.put(Pattern.compile("https://dofbasen\\.dk/sound_proxy\\.php\\?referer=gbif&mode=o&snd=[0-9_]+.mp3&raw=1"), "audio/mpeg"); 063 064 MediaTypeRegistry mediaTypeRegistry = MIME_TYPES.getMediaTypeRegistry(); 065 mediaTypeRegistry.addAlias(org.apache.tika.mime.MediaType.audio("mpeg"), org.apache.tika.mime.MediaType.audio("mp3")); 066 mediaTypeRegistry.addAlias(org.apache.tika.mime.MediaType.audio("mpeg"), org.apache.tika.mime.MediaType.audio("mpeg3")); 067 } 068 069 private static MediaParser instance = null; 070 071 public static MediaParser getInstance() { 072 synchronized (MediaParser.class) { 073 if (instance == null) { 074 instance = new MediaParser(); 075 } 076 } 077 return instance; 078 } 079 080 public MediaObject detectType(MediaObject mo) { 081 if (StringUtils.isEmpty(mo.getFormat())) { 082 // derive from URI 083 mo.setFormat(parseMimeType(mo.getIdentifier())); 084 } 085 086 // if MIME type is text/html make it a references link instead 087 if (HTML_TYPE.equalsIgnoreCase(mo.getFormat()) && mo.getIdentifier() != null) { 088 // make file URI the references link URL instead 089 mo.setReferences(mo.getIdentifier()); 090 mo.setIdentifier(null); 091 mo.setFormat(null); 092 } 093 094 if (StringUtils.isNotEmpty(mo.getFormat())) { 095 if (mo.getFormat().startsWith("image")) { 096 mo.setType(MediaType.StillImage); 097 } else if (mo.getFormat().startsWith("audio")) { 098 mo.setType(MediaType.Sound); 099 } else if (mo.getFormat().startsWith("video")) { 100 mo.setType(MediaType.MovingImage); 101 } else { 102 LOG.debug("Unsupported media format {}", mo.getFormat()); 103 } 104 } 105 return mo; 106 } 107 108 /** 109 * Parses a MIME type using Apache Tika which can handle the following: 110 * https://github.com/apache/tika/blob/master/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 111 * https://tika.apache.org/1.19.1/formats.html#Full_list_of_Supported_Formats 112 */ 113 public String parseMimeType(@Nullable String format) { 114 if (format != null) { 115 format = StringUtils.trimToNull(format.trim().toLowerCase()); 116 } 117 if (format == null) { 118 return null; 119 } 120 121 try { 122 MimeType mime = MIME_TYPES.getRegisteredMimeType(format); 123 if (mime != null) { 124 return mime.getName(); 125 } 126 127 } catch (MimeTypeException e) { 128 } 129 130 // Failed, but return the input if it's a reasonable MIME type 131 return MimeType.isValid(format) ? format : null; 132 } 133 134 /** 135 * Parses a MIME type using Apache Tika which can handle the following: 136 * https://github.com/apache/tika/blob/master/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 137 * https://tika.apache.org/1.19.1/formats.html#Full_list_of_Supported_Formats 138 */ 139 public String parseMimeType(@Nullable URI uri) { 140 if (uri != null) { 141 String uriString = uri.toString(); 142 String mime = TIKA.detect(uriString); 143 if (mime != null && HTML_MIME_TYPES.contains(mime.toLowerCase())) { 144 // We may have something like http://example.org/imageServer?img=test.jpg, so re-run the detection on the last 145 // part of the URL query string. 146 if (uri.getQuery() != null) { 147 mime = TIKA.detect(uri.getQuery()); 148 if (mime != null && !HTML_MIME_TYPES.contains(mime.toLowerCase())) { 149 return mime; 150 } 151 } 152 153 // First check the dictionary for known exceptions. 154 for (Map.Entry<Pattern, String> p : knownUrlPatterns.entrySet()) { 155 if (p.getKey().matcher(uriString).matches()) { 156 return p.getValue(); 157 } 158 } 159 160 // links without any suffix default to OCTET STREAM, see: 161 // http://dev.gbif.org/issues/browse/POR-2066 162 return HTML_TYPE; 163 } 164 return mime; 165 } 166 return null; 167 } 168}