001/*
002 * Licensed under the Apache License, Version 2.0 (the "License");
003 * you may not use this file except in compliance with the License.
004 * You may obtain a copy of the License at
005 *
006 *     http://www.apache.org/licenses/LICENSE-2.0
007 *
008 * Unless required by applicable law or agreed to in writing, software
009 * distributed under the License is distributed on an "AS IS" BASIS,
010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
011 * See the License for the specific language governing permissions and
012 * limitations under the License.
013 */
014package org.gbif.utils.file;
015
016import java.io.BufferedInputStream;
017import java.io.BufferedOutputStream;
018import java.io.File;
019import java.io.FileFilter;
020import java.io.FileInputStream;
021import java.io.FileOutputStream;
022import java.io.IOException;
023import java.io.InputStream;
024import java.io.OutputStream;
025import java.io.RandomAccessFile;
026import java.nio.file.Files;
027import java.util.ArrayList;
028import java.util.Arrays;
029import java.util.Collection;
030import java.util.Comparator;
031import java.util.HashSet;
032import java.util.List;
033import java.util.Set;
034import java.util.stream.Collectors;
035import java.util.zip.GZIPInputStream;
036import java.util.zip.ZipEntry;
037import java.util.zip.ZipException;
038import java.util.zip.ZipInputStream;
039import java.util.zip.ZipOutputStream;
040
041import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
042import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
043import org.apache.commons.io.FileUtils;
044import org.apache.commons.io.IOUtils;
045import org.apache.commons.io.filefilter.HiddenFileFilter;
046import org.apache.commons.io.filefilter.TrueFileFilter;
047import org.apache.commons.lang3.StringUtils;
048import org.slf4j.Logger;
049import org.slf4j.LoggerFactory;
050
051public class CompressionUtil {
052
053  private CompressionUtil() {
054    throw new UnsupportedOperationException("Can't initialize class");
055  }
056
057  public static class UnsupportedCompressionType extends RuntimeException {
058
059    public UnsupportedCompressionType() {}
060
061    public UnsupportedCompressionType(String message) {
062      super(message);
063    }
064
065    public UnsupportedCompressionType(String message, Throwable cause) {
066      super(message, cause);
067    }
068  }
069
070  private static final Logger LOG = LoggerFactory.getLogger(CompressionUtil.class);
071  private static final int BUFFER = 2048;
072  private static final String APPLE_RESOURCE_FORK = "__MACOSX";
073  private static final byte[] TAR_MAGIC_BYTES = new byte[] {'u', 's', 't', 'a', 'r'};
074
075  /**
076   * Tries to decompress a file into a newly created temporary directory, trying gzip or zip regardless of the filename
077   * or its suffix.
078   *
079   * @return folder containing all decompressed files
080   */
081  public static File decompressFile(File compressedFile)
082      throws IOException, UnsupportedCompressionType {
083    // create empty tmp dir
084    File dir = File.createTempFile("gbif-", null);
085    if (dir.exists() && !dir.delete()) {
086      throw new IOException("Couldn't delete temporary directory");
087    }
088
089    if (!dir.mkdirs()) {
090      throw new IOException("Couldn't create temporary directory for decompression");
091    }
092
093    // decompress
094    decompressFile(dir, compressedFile);
095
096    return dir;
097  }
098
099  /**
100   * Defaults keeping subDirectories to false.
101   *
102   * @see org.gbif.utils.file.CompressionUtil#decompressFile(java.io.File, java.io.File, boolean)
103   */
104  public static List<File> decompressFile(File directory, File compressedFile)
105      throws IOException, UnsupportedCompressionType {
106    return decompressFile(directory, compressedFile, false);
107  }
108
109  /**
110   * Tries to decompress a file using TAR+gzip, TAR or Zip regardless of the filename or its suffix.
111   *
112   * @param directory      directory where archive's contents will be decompressed to
113   * @param compressedFile compressed file
114   *
115   * @return list of files that have been extracted or null an empty list if archive couldn't be decompressed
116   *
117   * @throws IOException                if problem occurred reading compressed file, or directory couldn't be written
118   *                                    to
119   * @throws UnsupportedCompressionType if the compression type wasn't recognized
120   */
121  public static List<File> decompressFile(
122      File directory, File compressedFile, boolean keepSubdirectories)
123      throws IOException, UnsupportedCompressionType {
124    List<File> files = null;
125
126    // Test before trying gzip format
127    if (isGzipFormat(compressedFile)) {
128      try {
129        LOG.debug("Uncompressing {} with gzip compression to {}", compressedFile, directory);
130        files = untgzFile(directory, compressedFile);
131      } catch (Exception e) {
132        LOG.debug("Not gzip compression");
133      }
134    }
135
136    // Test before trying TAR format
137    if (isTarFormat(compressedFile)) {
138      try {
139        LOG.debug("Uncompressing {} with TAR compression to {}", compressedFile, directory);
140        files = untarFile(directory, compressedFile);
141      } catch (Exception e) {
142        LOG.debug("Not TAR compression");
143      }
144    }
145
146    // Then try zip
147    if (files == null) {
148      try {
149        LOG.debug("Uncompressing {} with Zip compression to {}", compressedFile, directory);
150        files = unzipFile(directory, compressedFile, keepSubdirectories);
151      } catch (ZipException e) {
152        LOG.debug("Not Zip compression");
153        throw new UnsupportedCompressionType("Unknown compression type. Neither gzip nor Zip", e);
154      }
155    }
156
157    if (files.isEmpty()) {
158      LOG.warn("No files extracted from {}, tried TGZ, TAR and Zip compression.", compressedFile);
159    }
160
161    return files;
162  }
163
164  /**
165   * Check the file's first two bytes, to see if they are the gzip magic number.
166   * @param compressedFile compressed file
167   * @return               true if the file is in gzip format
168   * @throws IOException   if a problem occurred reading compressed file
169   */
170  private static boolean isGzipFormat(File compressedFile) throws IOException {
171    try (RandomAccessFile file = new RandomAccessFile(compressedFile, "r")) {
172      return GZIPInputStream.GZIP_MAGIC == (file.read() & 0xff | ((file.read() << 8) & 0xff00));
173    }
174  }
175
176  /**
177   * Check the file is a Tape ARchive (TAR).
178   * @param compressedFile compressed file
179   * @return               true if the file is a TAR
180   * @throws IOException   if a problem occurred reading compressed file
181   */
182  private static boolean isTarFormat(File compressedFile) throws IOException {
183    try (RandomAccessFile file = new RandomAccessFile(compressedFile, "r")) {
184      // TAR files contain "ustar\0" or "ustar " at byte 257.
185      // https://www.gnu.org/software/tar/manual/html_node/Standard.html
186      byte[] at257 = new byte[5];
187      file.seek(257);
188      file.read(at257, 0, 5);
189      return Arrays.equals(at257, TAR_MAGIC_BYTES);
190    } catch (Exception e) {
191      LOG.debug("Exc", e);
192    }
193    return false;
194  }
195
196  /**
197   * Extracts a gzipped TAR file. Directory structure and hidden files (i.e. files starting with a dot) are ignored.
198   *
199   * @param directory where the file should be extracted to
200   * @param tgzFile   to extract
201   *
202   * @return a list of all created files
203   */
204  public static List<File> untgzFile(File directory, File tgzFile) throws IOException {
205    return untarStream(directory, new GZIPInputStream(new FileInputStream(tgzFile)));
206  }
207
208  /**
209   * Extracts a plain TAR file. Directory structure and hidden files (i.e. files starting with a dot) are ignored.
210   *
211   * @param directory where the file should be extracted to
212   * @param tarFile   to extract
213   *
214   * @return a list of all created files
215   */
216  public static List<File> untarFile(File directory, File tarFile) throws IOException {
217    return untarStream(directory, new FileInputStream(tarFile));
218  }
219
220  /**
221   * Extracts a TAR stream. Directory structure and hidden files (i.e. files starting with a dot) are ignored.
222   *
223   * @param directory where the file should be extracted to
224   * @param tarStream to extract
225   *
226   * @return a list of all created files
227   */
228  private static List<File> untarStream(File directory, InputStream tarStream) throws IOException {
229    List<File> files = new ArrayList<File>();
230    try (TarArchiveInputStream in = new TarArchiveInputStream(tarStream)) {
231      TarArchiveEntry entry;
232      while ((entry = in.getNextTarEntry()) != null) {
233        if (entry.isDirectory()) {
234          LOG.debug("TAR archive contains directories which are being ignored");
235          continue;
236        }
237        String fn = new File(entry.getName()).getName();
238        if (fn.startsWith(".")) {
239          LOG.debug("TAR archive contains a hidden file {} which is being ignored", fn);
240          continue;
241        }
242        File targetFile = new File(directory, fn);
243        if (targetFile.exists()) {
244          LOG.warn("TAR archive contains duplicate filename {}, only the first was extracted", fn);
245          continue;
246        }
247        LOG.debug("Extracting file: {} to: {}", entry.getName(), targetFile.getAbsolutePath());
248        try (FileOutputStream out = new FileOutputStream(targetFile)) {
249          IOUtils.copy(in, out);
250        }
251        files.add(targetFile);
252      }
253    }
254    return files;
255  }
256
257  /**
258   * Gunzip a file.  Use this method with isTarred false if the gzip contains a single file.  If it's a gzip
259   * of a TAR pass true to isTarred (or call @untgzFile(directory, tgzFile) which is what this method
260   * just redirects to for isTarred).
261   *
262   * @param directory the output directory for the uncompressed file(s)
263   * @param gzipFile  the gzip file
264   * @param isTarred  true if the gzip contains a TAR
265   *
266   * @return a List of the uncompressed file name(s)
267   *
268   * @throws IOException if reading or writing fails
269   */
270  public static List<File> ungzipFile(File directory, File gzipFile, boolean isTarred)
271      throws IOException {
272    if (isTarred) return untgzFile(directory, gzipFile);
273
274    List<File> files = new ArrayList<File>();
275    GZIPInputStream in = null;
276    BufferedOutputStream dest = null;
277    try {
278      in = new GZIPInputStream(new FileInputStream(gzipFile));
279
280      // assume that the gzip filename is the filename + .gz
281      String unzippedName = gzipFile.getName().substring(0, gzipFile.getName().lastIndexOf("."));
282      File outputFile = new File(directory, unzippedName);
283      LOG.debug("Extracting file: {} to: {}", unzippedName, outputFile.getAbsolutePath());
284      FileOutputStream fos = new FileOutputStream(outputFile);
285
286      dest = new BufferedOutputStream(fos, BUFFER);
287      int count;
288      byte[] data = new byte[BUFFER];
289      while ((count = in.read(data, 0, BUFFER)) != -1) {
290        dest.write(data, 0, count);
291      }
292      files.add(outputFile);
293    } finally {
294      if (in != null) in.close();
295      if (dest != null) {
296        dest.flush();
297        dest.close();
298      }
299    }
300
301    return files;
302  }
303
304  /**
305   * Defaults keepSubdirectories to false.
306   *
307   * @see org.gbif.utils.file.CompressionUtil#unzipFile(java.io.File, java.io.File, boolean)
308   */
309  public static List<File> unzipFile(File directory, File zipFile) throws IOException {
310    return unzipFile(directory, zipFile, false);
311  }
312
313  /**
314   * Zip a directory with all files but skipping included subdirectories.
315   * Only files directly within the directory are added to the archive.
316   *
317   * @param dir     the directory to zip
318   * @param zipFile the zipped file
319   */
320  public static void zipDir(File dir, File zipFile) throws IOException {
321    zipDir(dir, zipFile, false);
322  }
323
324  /**
325   * Zip a directory with all files. Files in Subdirectories will be included if the inclSubdirs is true.
326   *
327   * @param dir     the directory to zip
328   * @param zipFile the zipped file
329   * @param inclSubdirs if true includes all subdirectories recursively
330   */
331  public static void zipDir(File dir, File zipFile, boolean inclSubdirs) throws IOException {
332    Collection<File> files = org.apache.commons.io.FileUtils.listFiles(dir, null, inclSubdirs);
333    zipFiles(files, dir, zipFile);
334  }
335
336  public static void zipFile(File file, File zipFile) throws IOException {
337    Set<File> files = new HashSet<File>();
338    files.add(file);
339    zipFiles(files, file.getParentFile(), zipFile);
340  }
341
342  /**
343   * Creates a zip archive from a given collection of files.
344   * In order to preserve paths in the archive a rootContext can be specified which will be removed from the individual
345   * zip entries. For example a rootContext of /home/freak with a file /home/freak/photo/birthday.jpg to be zipped
346   * will result in a zip entry with a path photo/birthday.jpg.
347   *
348   * @param files to be included in the zip archive
349   * @param rootContext optional path to be removed from each file
350   * @param zipFile the zip file to be created
351   * @throws IOException
352   */
353  public static void zipFiles(Collection<File> files, File rootContext, File zipFile)
354      throws IOException {
355    if (files.isEmpty()) {
356      LOG.info("no files to zip.");
357    } else {
358      try {
359        BufferedInputStream origin = null;
360        FileOutputStream dest = new FileOutputStream(zipFile);
361        ZipOutputStream out = new ZipOutputStream(new BufferedOutputStream(dest));
362        // out.setMethod(ZipOutputStream.DEFLATED);
363        byte[] data = new byte[BUFFER];
364        for (File f : files) {
365          LOG.debug("Adding file {} to archive", f);
366          FileInputStream fi = new FileInputStream(f);
367          origin = new BufferedInputStream(fi, BUFFER);
368
369          String zipPath =
370              StringUtils.removeStart(
371                  f.getAbsolutePath(), rootContext.getAbsolutePath() + File.separator);
372          ZipEntry entry = new ZipEntry(zipPath);
373          out.putNextEntry(entry);
374          int count;
375          while ((count = origin.read(data, 0, BUFFER)) != -1) {
376            out.write(data, 0, count);
377          }
378          origin.close();
379        }
380        out.finish();
381        out.close();
382      } catch (IOException e) {
383        LOG.error("IOException while zipping files: {}", files);
384        throw e;
385      }
386    }
387  }
388
389  /**
390   * Extracts a zipped file into a target directory. If the file is wrapped in a root directory, this is removed by
391   * default. Other subdirectories are ignored according to the parameter keepSubdirectories.
392   * </br>
393   * The following types of files are also ignored by default:
394   * i) hidden files (i.e. files starting with a dot)
395   * ii) Apple resource fork (__MACOSX), including its subdirectories and subfiles
396   *
397   * @param directory          where the zipped file and its subdirectories should be extracted to
398   * @param zipFile            to extract
399   * @param keepSubdirectories whether to preserve subdirectories or not
400   *
401   * @return a list of all created files and directories extracted to target directory
402   */
403  public static List<File> unzipFile(File directory, File zipFile, boolean keepSubdirectories)
404      throws IOException {
405    LOG.debug(
406        "Unzipping archive "
407            + zipFile.getName()
408            + " into directory: "
409            + directory.getAbsolutePath());
410
411    // This is changed from using ZipFile to a ZipInputStream since Java 8u192 can't open certain
412    // Zip64 files.
413    // https://bugs.openjdk.java.net/browse/JDK-8186464
414    try (FileInputStream fInput = new FileInputStream(zipFile);
415        ZipInputStream zipInput = new ZipInputStream(fInput)) {
416      ZipEntry entry;
417
418      while ((entry = zipInput.getNextEntry()) != null) {
419        // ignore resource fork directories and subfiles
420        if (entry.getName().toUpperCase().contains(APPLE_RESOURCE_FORK)) {
421          LOG.debug("Ignoring resource fork file: " + entry.getName());
422        }
423        // ignore directories and hidden directories (e.g. .svn) (based on flag)
424        else if (entry.isDirectory()) {
425          if (isHiddenFile(new File(entry.getName()))) {
426            LOG.debug("Ignoring hidden directory: " + entry.getName());
427          } else if (keepSubdirectories) {
428            new File(directory, entry.getName()).mkdir();
429          } else {
430            LOG.debug("Ignoring (sub)directory: " + entry.getName());
431          }
432        }
433        // ignore hidden files
434        else {
435          if (isHiddenFile(new File(entry.getName()))) {
436            LOG.debug("Ignoring hidden file: " + entry.getName());
437          } else {
438            File targetFile =
439                (keepSubdirectories)
440                    ? new File(directory, entry.getName())
441                    : new File(directory, new File(entry.getName()).getName());
442            // ensure parent folder always exists, and extract file
443            createParentFolder(targetFile);
444
445            LOG.debug("Extracting file: {} to: {}", entry.getName(), targetFile.getAbsolutePath());
446            try (OutputStream out =
447                new BufferedOutputStream(Files.newOutputStream(targetFile.toPath()))) {
448              IOUtils.copy(zipInput, out);
449            }
450          }
451        }
452      }
453    }
454    // remove the wrapping root directory and flatten structure
455    if (keepSubdirectories) {
456      removeRootDirectory(directory);
457    }
458
459    File[] files = directory.listFiles();
460
461    return (files == null) ? new ArrayList<>() : Arrays.asList(files);
462  }
463
464  /**
465   * @return true if file is a hidden file or directory, or if any of its parent directories are hidden checking
466   * recursively
467   */
468  private static boolean isHiddenFile(File f) {
469    if (f.getName().startsWith(".")) {
470      return true;
471    } else if (f.getParentFile() != null) {
472      return isHiddenFile(f.getParentFile());
473    }
474    return false;
475  }
476
477  /**
478   * Removes a wrapping root directory and flatten its structure by moving all that root directory's files and
479   * subdirectories up to the same level as the root directory.
480   */
481  @SuppressWarnings("ResultOfMethodCallIgnored")
482  private static void removeRootDirectory(File directory) {
483    File[] rootFiles = directory.listFiles((FileFilter) HiddenFileFilter.VISIBLE);
484    if (rootFiles == null) {
485      LOG.error("Failed to retrieve root directory from {}", directory.getAbsolutePath());
486      return;
487    }
488
489    if (rootFiles.length != 1) {
490      LOG.error("More than one root directory at {}", directory.getAbsolutePath());
491      return;
492    }
493
494    File root = rootFiles[0];
495    if (root.isDirectory()) {
496      LOG.debug(
497          "Removing single root folder {} found in decompressed archive", root.getAbsoluteFile());
498      Collection<File> filesAndDirs =
499          FileUtils.listFilesAndDirs(root, TrueFileFilter.TRUE, TrueFileFilter.TRUE);
500      // directories shouldn't be in the end, sort
501      List<File> sortedFilesAndDirs =
502          filesAndDirs.stream()
503              .sorted(Comparator.comparing(File::getAbsolutePath))
504              .collect(Collectors.toList());
505
506      for (File f : sortedFilesAndDirs) {
507        File f2 = new File(directory, f.getName());
508        f.renameTo(f2);
509      }
510      root.delete();
511    }
512  }
513
514  /**
515   * Make parent folder.
516   *
517   * @param file destination file
518   */
519  private static void createParentFolder(File file) {
520    File parent = new File(file.getParent());
521    if (!parent.exists()) {
522      LOG.debug(
523          (parent.mkdirs())
524              ? "Created parent directory: " + parent.getAbsolutePath()
525              : "Failed to create parent directory: " + parent.getAbsolutePath());
526    }
527  }
528}