001package org.gbif.utils.file;
002
003import java.io.BufferedInputStream;
004import java.io.BufferedOutputStream;
005import java.io.File;
006import java.io.FileFilter;
007import java.io.FileInputStream;
008import java.io.FileOutputStream;
009import java.io.IOException;
010import java.io.InputStream;
011import java.io.OutputStream;
012import java.util.ArrayList;
013import java.util.Arrays;
014import java.util.Collection;
015import java.util.Enumeration;
016import java.util.HashSet;
017import java.util.List;
018import java.util.Set;
019import java.util.zip.GZIPInputStream;
020import java.util.zip.ZipEntry;
021import java.util.zip.ZipException;
022import java.util.zip.ZipFile;
023import java.util.zip.ZipOutputStream;
024
025import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
026import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
027import org.apache.commons.io.IOUtils;
028import org.apache.commons.io.filefilter.HiddenFileFilter;
029import org.apache.commons.io.filefilter.TrueFileFilter;
030import org.apache.commons.lang3.StringUtils;
031import org.slf4j.Logger;
032import org.slf4j.LoggerFactory;
033
034public class CompressionUtil {
035
036  private CompressionUtil() {
037    throw new UnsupportedOperationException("Can't initialize class");
038  }
039
040  public static class UnsupportedCompressionType extends RuntimeException {
041
042    public UnsupportedCompressionType() {
043    }
044
045    public UnsupportedCompressionType(String message) {
046      super(message);
047    }
048
049    public UnsupportedCompressionType(String message, Throwable cause) {
050      super(message, cause);
051    }
052
053  }
054
055  private static final Logger LOG = LoggerFactory.getLogger(CompressionUtil.class);
056  private static final int BUFFER = 2048;
057  private static final String APPLE_RESOURCE_FORK = "__MACOSX";
058
059  /**
060   * Tries to decompress a file into a newly created temporary directory, trying gzip or zip regardless of the filename
061   * or its suffix.
062   *
063   * @return folder containing all decompressed files
064   */
065  public static File decompressFile(File compressedFile) throws IOException, UnsupportedCompressionType {
066    // create empty tmp dir
067    File dir = File.createTempFile("gbif-", null);
068    if (dir.exists() && !dir.delete()) {
069      throw new IOException("Couldn't delete temporary directory");
070    }
071
072    if (!dir.mkdirs()) {
073      throw new IOException("Couldn't create temporary directory for decompression");
074    }
075
076    // decompress
077    decompressFile(dir, compressedFile);
078
079    return dir;
080  }
081
082  /**
083   * Defaults keeping subDirectories to false.
084   *
085   * @see org.gbif.utils.file.CompressionUtil#decompressFile(java.io.File, java.io.File, boolean)
086   */
087  public static List<File> decompressFile(File directory, File compressedFile)
088    throws IOException, UnsupportedCompressionType {
089    return decompressFile(directory, compressedFile, false);
090  }
091
092  /**
093   * Tries to decompress a file trying gzip or zip regardless of the filename or its suffix.
094   *
095   * @param directory      directory where archive's contents will be decompressed to
096   * @param compressedFile compressed file
097   *
098   * @return list of files that have been extracted or null an empty list if archive couldn't be decompressed
099   *
100   * @throws IOException                if problem occurred reading compressed file, or directory couldn't be written
101   *                                    to
102   * @throws UnsupportedCompressionType if the compression type wasn't recognized
103   */
104  public static List<File> decompressFile(File directory, File compressedFile, boolean keepSubdirectories)
105    throws IOException, UnsupportedCompressionType {
106    List<File> files = null;
107    // first try zip
108    try {
109      files = unzipFile(directory, compressedFile, keepSubdirectories);
110    } catch (ZipException e) {
111      LOG.debug("No zip compression");
112    }
113
114    // Try gzip if needed
115    if (files == null) {
116      try {
117        files = ungzipFile(directory, compressedFile);
118      } catch (Exception e) {
119        LOG.debug("No gzip compression");
120        throw new UnsupportedCompressionType("Unknown compression type. Neither zip nor gzip", e);
121      }
122    }
123
124    return files;
125  }
126
127  /**
128   * Extracts a gzipped file. Subdirectories or hidden files (i.e. files starting with a dot) are being ignored.
129   *
130   * @param directory where the file should be extracted to
131   * @param zipFile   to extract
132   *
133   * @return a list of all created files
134   */
135  public static List<File> ungzipFile(File directory, File zipFile) throws IOException {
136    List<File> files = new ArrayList<File>();
137    TarArchiveInputStream in = new TarArchiveInputStream(new GZIPInputStream(new FileInputStream(zipFile)));
138    try {
139      TarArchiveEntry entry = in.getNextTarEntry();
140      while (entry != null) {
141        if (entry.isDirectory()) {
142          LOG.debug("TAR archive contains directories which are being ignored");
143          entry = in.getNextTarEntry();
144          continue;
145        }
146        String fn = new File(entry.getName()).getName();
147        if (fn.startsWith(".")) {
148          LOG.debug("TAR archive contains a hidden file which is being ignored");
149          entry = in.getNextTarEntry();
150          continue;
151        }
152        File targetFile = new File(directory, fn);
153        if (targetFile.exists()) {
154          LOG.warn("TAR archive contains duplicate filenames, only the first is being extracted");
155          entry = in.getNextTarEntry();
156          continue;
157        }
158        LOG.debug("Extracting file: {} to: {}", entry.getName(), targetFile.getAbsolutePath());
159        FileOutputStream out = new FileOutputStream(targetFile);
160        try {
161          IOUtils.copy(in, out);
162          out.close();
163        } finally {
164          IOUtils.closeQuietly(out);
165        }
166        files.add(targetFile);
167      }
168    } finally {
169      in.close();
170    }
171    return files;
172  }
173
174  /**
175   * Gunzip a file.  Use this method with isTarred false if the gzip contains a single file.  If it's a gzip
176   * of a tar archive pass true to isTarred (or call @ungzipFile(directory, zipFile) which is what this method
177   * just redirects to for isTarred).
178   *
179   * @param directory the output directory for the uncompressed file(s)
180   * @param zipFile   the gzip file
181   * @param isTarred  true if the gzip contains a tar archive
182   *
183   * @return a List of the uncompressed file name(s)
184   *
185   * @throws IOException if reading or writing fails
186   */
187  public static List<File> ungzipFile(File directory, File zipFile, boolean isTarred) throws IOException {
188    if (isTarred) return ungzipFile(directory, zipFile);
189
190    List<File> files = new ArrayList<File>();
191    GZIPInputStream in = null;
192    BufferedOutputStream dest = null;
193    try {
194      in = new GZIPInputStream(new FileInputStream(zipFile));
195
196      // assume that the gzip filename is the filename + .gz
197      String unzippedName = zipFile.getName().substring(0, zipFile.getName().lastIndexOf("."));
198      File outputFile = new File(directory, unzippedName);
199      LOG.debug("Extracting file: {} to: {}", unzippedName, outputFile.getAbsolutePath());
200      FileOutputStream fos = new FileOutputStream(outputFile);
201
202      dest = new BufferedOutputStream(fos, BUFFER);
203      int count;
204      byte[] data = new byte[BUFFER];
205      while ((count = in.read(data, 0, BUFFER)) != -1) {
206        dest.write(data, 0, count);
207      }
208      files.add(outputFile);
209    } finally {
210      if (in != null) in.close();
211      if (dest != null) {
212        dest.flush();
213        dest.close();
214      }
215    }
216
217    return files;
218  }
219
220  /**
221   * Defaults keepSubdirectories to false.
222   *
223   * @see org.gbif.utils.file.CompressionUtil#unzipFile(java.io.File, java.io.File, boolean)
224   */
225  public static List<File> unzipFile(File directory, File zipFile) throws IOException {
226     return unzipFile(directory, zipFile, false);
227  }
228
229  /**
230   * Zip a directory with all files but skipping included subdirectories.
231   * Only files directly within the directory are added to the archive.
232   *
233   * @param dir     the directory to zip
234   * @param zipFile the zipped file
235   */
236  public static void zipDir(File dir, File zipFile) throws IOException {
237    zipDir(dir, zipFile, false);
238  }
239
240  /**
241   * Zip a directory with all files. Files in Subdirectories will be included if the inclSubdirs is true.
242   *
243   * @param dir     the directory to zip
244   * @param zipFile the zipped file
245   * @param inclSubdirs if true includes all subdirectories recursively
246   */
247  public static void zipDir(File dir, File zipFile, boolean inclSubdirs) throws IOException {
248    Collection<File> files = org.apache.commons.io.FileUtils.listFiles(dir, null, inclSubdirs);
249    zipFiles(files, dir, zipFile);
250  }
251
252  public static void zipFile(File file, File zipFile) throws IOException {
253    Set<File> files = new HashSet<File>();
254    files.add(file);
255    zipFiles(files, file.getParentFile(), zipFile);
256  }
257
258  /**
259   * Creates a zip archive from a given collection of files.
260   * In order to preserve paths in the archive a rootContext can be specified which will be removed from the individual
261   * zip entries. For example a rootContext of /home/freak with a file /home/freak/photo/birthday.jpg to be zipped
262   * will result in a zip entry with a path photo/birthday.jpg.
263   *
264   * @param files to be included in the zip archive
265   * @param rootContext optional path to be removed from each file
266   * @param zipFile the zip file to be created
267   * @throws IOException
268   */
269  public static void zipFiles(Collection<File> files, File rootContext, File zipFile) throws IOException {
270    if (files.isEmpty()) {
271      LOG.info("no files to zip.");
272    } else {
273      try {
274        BufferedInputStream origin = null;
275        FileOutputStream dest = new FileOutputStream(zipFile);
276        ZipOutputStream out = new ZipOutputStream(new BufferedOutputStream(dest));
277        // out.setMethod(ZipOutputStream.DEFLATED);
278        byte[] data = new byte[BUFFER];
279        for (File f : files) {
280          LOG.debug("Adding file {} to archive", f);
281          FileInputStream fi = new FileInputStream(f);
282          origin = new BufferedInputStream(fi, BUFFER);
283
284          String zipPath = StringUtils.removeStart(f.getAbsolutePath(), rootContext.getAbsolutePath() + File.separator);
285          ZipEntry entry = new ZipEntry(zipPath);
286          out.putNextEntry(entry);
287          int count;
288          while ((count = origin.read(data, 0, BUFFER)) != -1) {
289            out.write(data, 0, count);
290          }
291          origin.close();
292        }
293        out.finish();
294        out.close();
295      } catch (IOException e) {
296        LOG.error("IOException while zipping files: {}", files);
297        throw e;
298      }
299    }
300  }
301
302  /**
303   * Extracts a zipped file into a target directory. If the file is wrapped in a root directory, this is removed by
304   * default. Other subdirectories are ignored according to the parameter keepSubdirectories.
305   * </br>
306   * The following types of files are also ignored by default:
307   * i) hidden files (i.e. files starting with a dot)
308   * ii) Apple resource fork (__MACOSX), including its subdirectories and subfiles
309   *
310   * @param directory          where the zipped file and its subdirectories should be extracted to
311   * @param zipFile            to extract
312   * @param keepSubdirectories whether to preserve subdirectories or not
313   *
314   * @return a list of all created files and directories extracted to target directory
315   */
316  public static List<File> unzipFile(File directory, File zipFile, boolean keepSubdirectories) throws IOException {
317    LOG.debug("Unzipping archive " + zipFile.getName() + " into directory: " + directory.getAbsolutePath());
318    ZipFile zf = new ZipFile(zipFile);
319    Enumeration<? extends ZipEntry> entries = zf.entries();
320    while (entries.hasMoreElements()) {
321      ZipEntry entry = entries.nextElement();
322      // ignore resource fork directories and subfiles
323      if (entry.getName().toUpperCase().contains(APPLE_RESOURCE_FORK)) {
324        LOG.debug("Ignoring resource fork file: " + entry.getName());
325      }
326      // ignore directories and hidden directories (e.g. .svn) (based on flag)
327      else if (entry.isDirectory()) {
328        if (isHiddenFile(new File(entry.getName()))) {
329          LOG.debug("Ignoring hidden directory: " + entry.getName());
330        } else if (keepSubdirectories) {
331          new File(directory, entry.getName()).mkdir();
332        } else {
333          LOG.debug("Ignoring (sub)directory: " + entry.getName());
334        }
335      }
336      // ignore hidden files
337      else {
338        if (isHiddenFile(new File(entry.getName()))) {
339          LOG.debug("Ignoring hidden file: " + entry.getName());
340        } else {
341          File targetFile = (keepSubdirectories) ? new File(directory, entry.getName())
342            : new File(directory, new File(entry.getName()).getName());
343          // ensure parent folder always exists, and extract file
344          createParentFolder(targetFile);
345          extractFile(zf, entry, targetFile);
346        }
347      }
348    }
349    zf.close();
350    // remove the wrapping root directory and flatten structure
351    if (keepSubdirectories) {
352      removeRootDirectory(directory);
353    }
354    return (directory.listFiles() == null) ? new ArrayList<File>() : Arrays.asList(directory.listFiles());
355  }
356
357  /**
358   * @return true if file is a hidden file or directory, or if any of its parent directories are hidden checking
359   * recursively
360   */
361  private static boolean isHiddenFile(File f) {
362    if (f.getName().startsWith(".")) {
363      return true;
364    } else if (f.getParentFile() != null) {
365      return isHiddenFile(f.getParentFile());
366    }
367    return false;
368  }
369
370  /**
371   * Removes a wrapping root directory and flatten its structure by moving all that root directory's files and
372   * subdirectories up to the same level as the root directory.
373   */
374  private static void removeRootDirectory(File directory) {
375    File[] rootFiles = directory.listFiles((FileFilter) HiddenFileFilter.VISIBLE);
376    if (rootFiles.length == 1) {
377      File root = rootFiles[0];
378      if (root.isDirectory()) {
379        LOG.debug("Removing single root folder {} found in decompressed archive", root.getAbsoluteFile());
380        for (File f : org.apache.commons.io.FileUtils.listFilesAndDirs(root, TrueFileFilter.TRUE, TrueFileFilter.TRUE)) {
381          File f2 = new File(directory, f.getName());
382          f.renameTo(f2);
383        }
384        root.delete();
385      }
386    }
387  }
388
389  /**
390   * Extract an entry from a zipped file into a target file.
391   *
392   * @param zf         .zip file being unzipped (ZipFile)
393   * @param entry      entry in .zip file currently being examined (ZipEntry)
394   * @param targetFile destination file to extract to
395   */
396  private static void extractFile(ZipFile zf, ZipEntry entry, File targetFile) {
397    try {
398      LOG.debug("Extracting file: {} to: {}", entry.getName(), targetFile.getAbsolutePath());
399      InputStream in = zf.getInputStream(entry);
400      OutputStream out = new BufferedOutputStream(new FileOutputStream(targetFile));
401      try {
402        IOUtils.copy(zf.getInputStream(entry), out);
403      } finally {
404        in.close();
405        out.close();
406      }
407    } catch (IOException e) {
408      LOG.error("File could not be extraced: " + e.getMessage(), e);
409    }
410  }
411
412  /**
413   * Make parent folder.
414   *
415   * @param file destination file
416   */
417  private static void createParentFolder(File file) {
418    File parent = new File(file.getParent());
419    if (!parent.exists()) {
420      LOG.debug((parent.mkdirs()) ? "Created parent directory: " + parent.getAbsolutePath()
421        : "Failed to create parent directory: " + parent.getAbsolutePath());
422    }
423  }
424}