001/*
002 * Licensed under the Apache License, Version 2.0 (the "License");
003 * you may not use this file except in compliance with the License.
004 * You may obtain a copy of the License at
005 *
006 *     http://www.apache.org/licenses/LICENSE-2.0
007 *
008 * Unless required by applicable law or agreed to in writing, software
009 * distributed under the License is distributed on an "AS IS" BASIS,
010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
011 * See the License for the specific language governing permissions and
012 * limitations under the License.
013 */
014package org.gbif.common.parsers;
015
016import org.gbif.api.model.common.MediaObject;
017import org.gbif.api.vocabulary.MediaType;
018
import java.net.URI;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
027
028import javax.annotation.Nullable;
029
030import org.apache.commons.lang3.StringUtils;
031import org.apache.tika.Tika;
032import org.apache.tika.mime.MediaTypeRegistry;
033import org.apache.tika.mime.MimeType;
034import org.apache.tika.mime.MimeTypeException;
035import org.apache.tika.mime.MimeTypes;
036import org.slf4j.Logger;
037import org.slf4j.LoggerFactory;
038
039public class MediaParser {
040  private static final Logger LOG = LoggerFactory.getLogger(MediaParser.class);
041  private static final Tika TIKA = new Tika();
042  private static final MimeTypes MIME_TYPES = MimeTypes.getDefaultMimeTypes();
043  private static final String HTML_TYPE = "text/html";
044  // MIME types which we consider as HTML links instead of real media file URIs
045  private static final Set<String> HTML_MIME_TYPES = Collections.unmodifiableSet(
046      new HashSet<>(
047          Arrays.asList("text/x-coldfusion", "text/x-php", "text/asp", "text/aspdotnet", "text/x-cgi",
048              "text/x-jsp", "text/x-perl", HTML_TYPE, MimeTypes.OCTET_STREAM)));
049
050  // List of exceptions, could be read from a file if it grows. URLs matching this return a media file.
051  private static final Map<Pattern, String> knownUrlPatterns;
052
053  // Add missing alias types.
054  static {
055    knownUrlPatterns = new HashMap<>();
056    knownUrlPatterns.put(Pattern.compile("http://www\\.unimus\\.no/felles/bilder/web_hent_bilde\\.php\\?id=\\d+&type=jpeg"), "image/jpeg");
057    knownUrlPatterns.put(Pattern.compile("http://www\\.jacq\\.org/image\\.php\\?filename=\\d+&method=europeana"), "image/jpeg");
058    knownUrlPatterns.put(Pattern.compile("https://images\\.ala\\.org\\.au/image/proxyImageThumbnailLarge\\?imageId=[0-9a-f-]{36}"), "image/jpeg");
059    knownUrlPatterns.put(Pattern.compile("http://[a-zA-Z0-9-]+\\.wildlifemonitoring\\.ru/get_photo\\.php\\?id=\\d+"), "image/jpeg");
060    knownUrlPatterns.put(Pattern.compile("http://procyon\\.acadiau\\.ca/ecsmith/cgi-bin/image\\.cgi\\?[0-9A-Z]+,jpeg"), "image/jpeg");
061    knownUrlPatterns.put(Pattern.compile("http://www\\.biologie\\.uni-ulm\\.de/cgi-bin/perl/sound\\.pl\\?sid=T&objid=\\d+"), "audio/vnd.wave");
062    knownUrlPatterns.put(Pattern.compile("https://dofbasen\\.dk/sound_proxy\\.php\\?referer=gbif&mode=o&snd=[0-9_]+.mp3&raw=1"), "audio/mpeg");
063
064    MediaTypeRegistry mediaTypeRegistry = MIME_TYPES.getMediaTypeRegistry();
065    mediaTypeRegistry.addAlias(org.apache.tika.mime.MediaType.audio("mpeg"), org.apache.tika.mime.MediaType.audio("mp3"));
066    mediaTypeRegistry.addAlias(org.apache.tika.mime.MediaType.audio("mpeg"), org.apache.tika.mime.MediaType.audio("mpeg3"));
067  }
068
069  private static MediaParser instance = null;
070
071  public static MediaParser getInstance() {
072    synchronized (MediaParser.class) {
073      if (instance == null) {
074        instance = new MediaParser();
075      }
076    }
077    return instance;
078  }
079
080  public MediaObject detectType(MediaObject mo) {
081    if (StringUtils.isEmpty(mo.getFormat())) {
082      // derive from URI
083      mo.setFormat(parseMimeType(mo.getIdentifier()));
084    }
085
086    // if MIME type is text/html make it a references link instead
087    if (HTML_TYPE.equalsIgnoreCase(mo.getFormat()) && mo.getIdentifier() != null) {
088      // make file URI the references link URL instead
089      mo.setReferences(mo.getIdentifier());
090      mo.setIdentifier(null);
091      mo.setFormat(null);
092    }
093
094    if (StringUtils.isNotEmpty(mo.getFormat())) {
095      if (mo.getFormat().startsWith("image")) {
096        mo.setType(MediaType.StillImage);
097      } else if (mo.getFormat().startsWith("audio")) {
098        mo.setType(MediaType.Sound);
099      } else if (mo.getFormat().startsWith("video")) {
100        mo.setType(MediaType.MovingImage);
101      } else {
102        LOG.debug("Unsupported media format {}", mo.getFormat());
103      }
104    }
105    return mo;
106  }
107
108  /**
109   * Parses a MIME type using Apache Tika which can handle the following:
110   * https://github.com/apache/tika/blob/master/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
111   * https://tika.apache.org/1.19.1/formats.html#Full_list_of_Supported_Formats
112   */
113  public String parseMimeType(@Nullable String format) {
114    if (format != null) {
115      format = StringUtils.trimToNull(format.trim().toLowerCase());
116    }
117    if (format == null) {
118      return null;
119    }
120
121    try {
122      MimeType mime = MIME_TYPES.getRegisteredMimeType(format);
123      if (mime != null) {
124        return mime.getName();
125      }
126
127    } catch (MimeTypeException e) {
128    }
129
130    // Failed, but return the input if it's a reasonable MIME type
131    return MimeType.isValid(format) ? format : null;
132  }
133
134  /**
135   * Parses a MIME type using Apache Tika which can handle the following:
136   * https://github.com/apache/tika/blob/master/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
137   * https://tika.apache.org/1.19.1/formats.html#Full_list_of_Supported_Formats
138   */
139  public String parseMimeType(@Nullable URI uri) {
140    if (uri != null) {
141      String uriString = uri.toString();
142      String mime = TIKA.detect(uriString);
143      if (mime != null && HTML_MIME_TYPES.contains(mime.toLowerCase())) {
144        // We may have something like http://example.org/imageServer?img=test.jpg, so re-run the detection on the last
145        // part of the URL query string.
146        if (uri.getQuery() != null) {
147          mime = TIKA.detect(uri.getQuery());
148          if (mime != null && !HTML_MIME_TYPES.contains(mime.toLowerCase())) {
149            return mime;
150          }
151        }
152
153        // First check the dictionary for known exceptions.
154        for (Map.Entry<Pattern, String> p : knownUrlPatterns.entrySet()) {
155          if (p.getKey().matcher(uriString).matches()) {
156            return p.getValue();
157          }
158        }
159
160        // links without any suffix default to OCTET STREAM, see:
161        // http://dev.gbif.org/issues/browse/POR-2066
162        return HTML_TYPE;
163      }
164      return mime;
165    }
166    return null;
167  }
168}