/*
 * Copyright 2021 Global Biodiversity Information Facility (GBIF)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.gbif.utils.file;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.RandomAccessFile;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.zip.GZIPInputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipException;
import java.util.zip.ZipInputStream;
import java.util.zip.ZipOutputStream;

import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.filefilter.HiddenFileFilter;
import org.apache.commons.io.filefilter.TrueFileFilter;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

049public class CompressionUtil {
050
051  private CompressionUtil() {
052    throw new UnsupportedOperationException("Can't initialize class");
053  }
054
055  public static class UnsupportedCompressionType extends RuntimeException {
056
057    public UnsupportedCompressionType() {
058    }
059
060    public UnsupportedCompressionType(String message) {
061      super(message);
062    }
063
064    public UnsupportedCompressionType(String message, Throwable cause) {
065      super(message, cause);
066    }
067
068  }
069
070  private static final Logger LOG = LoggerFactory.getLogger(CompressionUtil.class);
071  private static final int BUFFER = 2048;
072  private static final String APPLE_RESOURCE_FORK = "__MACOSX";
073  private static final byte[] TAR_MAGIC_BYTES = new byte[]{'u', 's', 't', 'a', 'r'};
074
075  /**
076   * Tries to decompress a file into a newly created temporary directory, trying gzip or zip regardless of the filename
077   * or its suffix.
078   *
079   * @return folder containing all decompressed files
080   */
081  public static File decompressFile(File compressedFile) throws IOException, UnsupportedCompressionType {
082    // create empty tmp dir
083    File dir = File.createTempFile("gbif-", null);
084    if (dir.exists() && !dir.delete()) {
085      throw new IOException("Couldn't delete temporary directory");
086    }
087
088    if (!dir.mkdirs()) {
089      throw new IOException("Couldn't create temporary directory for decompression");
090    }
091
092    // decompress
093    decompressFile(dir, compressedFile);
094
095    return dir;
096  }
097
098  /**
099   * Defaults keeping subDirectories to false.
100   *
101   * @see org.gbif.utils.file.CompressionUtil#decompressFile(java.io.File, java.io.File, boolean)
102   */
103  public static List<File> decompressFile(File directory, File compressedFile)
104    throws IOException, UnsupportedCompressionType {
105    return decompressFile(directory, compressedFile, false);
106  }
107
108  /**
109   * Tries to decompress a file using TAR+gzip, TAR or Zip regardless of the filename or its suffix.
110   *
111   * @param directory      directory where archive's contents will be decompressed to
112   * @param compressedFile compressed file
113   *
114   * @return list of files that have been extracted or null an empty list if archive couldn't be decompressed
115   *
116   * @throws IOException                if problem occurred reading compressed file, or directory couldn't be written
117   *                                    to
118   * @throws UnsupportedCompressionType if the compression type wasn't recognized
119   */
120  public static List<File> decompressFile(File directory, File compressedFile, boolean keepSubdirectories)
121    throws IOException, UnsupportedCompressionType {
122    List<File> files = null;
123
124    // Test before trying gzip format
125    if (isGzipFormat(compressedFile)) {
126      try {
127        LOG.debug("Uncompressing {} with gzip compression to {}", compressedFile, directory);
128        files = untgzFile(directory, compressedFile);
129      } catch (Exception e) {
130        LOG.debug("Not gzip compression");
131      }
132    }
133
134    // Test before trying TAR format
135    if (isTarFormat(compressedFile)) {
136      try {
137        LOG.debug("Uncompressing {} with TAR compression to {}", compressedFile, directory);
138        files = untarFile(directory, compressedFile);
139      } catch (Exception e) {
140        LOG.debug("Not TAR compression");
141      }
142    }
143
144    // Then try zip
145    if (files == null) {
146      try {
147        LOG.debug("Uncompressing {} with Zip compression to {}", compressedFile, directory);
148        files = unzipFile(directory, compressedFile, keepSubdirectories);
149      } catch (ZipException e) {
150        LOG.debug("Not Zip compression");
151        throw new UnsupportedCompressionType("Unknown compression type. Neither gzip nor Zip", e);
152      }
153    }
154
155    if (files.isEmpty()) {
156      LOG.warn("No files extracted from {}, tried TGZ, TAR and Zip compression.", compressedFile);
157    }
158
159    return files;
160  }
161
162  /**
163   * Check the file's first two bytes, to see if they are the gzip magic number.
164   * @param compressedFile compressed file
165   * @return               true if the file is in gzip format
166   * @throws IOException   if a problem occurred reading compressed file
167   */
168  private static boolean isGzipFormat(File compressedFile) throws IOException {
169    try (RandomAccessFile file = new RandomAccessFile(compressedFile, "r")) {
170      return GZIPInputStream.GZIP_MAGIC == (file.read() & 0xff | ((file.read() << 8) & 0xff00));
171    }
172  }
173
174  /**
175   * Check the file is a Tape ARchive (TAR).
176   * @param compressedFile compressed file
177   * @return               true if the file is a TAR
178   * @throws IOException   if a problem occurred reading compressed file
179   */
180  private static boolean isTarFormat(File compressedFile) throws IOException {
181    try (RandomAccessFile file = new RandomAccessFile(compressedFile, "r")) {
182      // TAR files contain "ustar\0" or "ustar " at byte 257.
183      // https://www.gnu.org/software/tar/manual/html_node/Standard.html
184      byte[] at257 = new byte[5];
185      file.seek(257);
186      file.read(at257, 0, 5);
187      return Arrays.equals(at257, TAR_MAGIC_BYTES);
188    } catch (Exception e) {
189      LOG.debug("Exc", e);
190    }
191    return false;
192  }
193
194  /**
195   * Extracts a gzipped TAR file. Directory structure and hidden files (i.e. files starting with a dot) are ignored.
196   *
197   * @param directory where the file should be extracted to
198   * @param tgzFile   to extract
199   *
200   * @return a list of all created files
201   */
202  public static List<File> untgzFile(File directory, File tgzFile) throws IOException {
203    return untarStream(directory, new GZIPInputStream(new FileInputStream(tgzFile)));
204  }
205
206  /**
207   * Extracts a plain TAR file. Directory structure and hidden files (i.e. files starting with a dot) are ignored.
208   *
209   * @param directory where the file should be extracted to
210   * @param tarFile   to extract
211   *
212   * @return a list of all created files
213   */
214  public static List<File> untarFile(File directory, File tarFile) throws IOException {
215    return untarStream(directory, new FileInputStream(tarFile));
216  }
217
218  /**
219   * Extracts a TAR stream. Directory structure and hidden files (i.e. files starting with a dot) are ignored.
220   *
221   * @param directory where the file should be extracted to
222   * @param tarStream to extract
223   *
224   * @return a list of all created files
225   */
226  private static List<File> untarStream(File directory, InputStream tarStream) throws IOException {
227    List<File> files = new ArrayList<File>();
228    try (TarArchiveInputStream in = new TarArchiveInputStream(tarStream)) {
229      TarArchiveEntry entry;
230      while ((entry = in.getNextTarEntry()) != null) {
231        if (entry.isDirectory()) {
232          LOG.debug("TAR archive contains directories which are being ignored");
233          continue;
234        }
235        String fn = new File(entry.getName()).getName();
236        if (fn.startsWith(".")) {
237          LOG.debug("TAR archive contains a hidden file {} which is being ignored", fn);
238          continue;
239        }
240        File targetFile = new File(directory, fn);
241        if (targetFile.exists()) {
242          LOG.warn("TAR archive contains duplicate filename {}, only the first was extracted", fn);
243          continue;
244        }
245        LOG.debug("Extracting file: {} to: {}", entry.getName(), targetFile.getAbsolutePath());
246        try (FileOutputStream out = new FileOutputStream(targetFile)) {
247          IOUtils.copy(in, out);
248        }
249        files.add(targetFile);
250      }
251    }
252    return files;
253  }
254
255  /**
256   * Gunzip a file.  Use this method with isTarred false if the gzip contains a single file.  If it's a gzip
257   * of a TAR pass true to isTarred (or call @untgzFile(directory, tgzFile) which is what this method
258   * just redirects to for isTarred).
259   *
260   * @param directory the output directory for the uncompressed file(s)
261   * @param gzipFile  the gzip file
262   * @param isTarred  true if the gzip contains a TAR
263   *
264   * @return a List of the uncompressed file name(s)
265   *
266   * @throws IOException if reading or writing fails
267   */
268  public static List<File> ungzipFile(File directory, File gzipFile, boolean isTarred) throws IOException {
269    if (isTarred) return untgzFile(directory, gzipFile);
270
271    List<File> files = new ArrayList<File>();
272    GZIPInputStream in = null;
273    BufferedOutputStream dest = null;
274    try {
275      in = new GZIPInputStream(new FileInputStream(gzipFile));
276
277      // assume that the gzip filename is the filename + .gz
278      String unzippedName = gzipFile.getName().substring(0, gzipFile.getName().lastIndexOf("."));
279      File outputFile = new File(directory, unzippedName);
280      LOG.debug("Extracting file: {} to: {}", unzippedName, outputFile.getAbsolutePath());
281      FileOutputStream fos = new FileOutputStream(outputFile);
282
283      dest = new BufferedOutputStream(fos, BUFFER);
284      int count;
285      byte[] data = new byte[BUFFER];
286      while ((count = in.read(data, 0, BUFFER)) != -1) {
287        dest.write(data, 0, count);
288      }
289      files.add(outputFile);
290    } finally {
291      if (in != null) in.close();
292      if (dest != null) {
293        dest.flush();
294        dest.close();
295      }
296    }
297
298    return files;
299  }
300
301  /**
302   * Defaults keepSubdirectories to false.
303   *
304   * @see org.gbif.utils.file.CompressionUtil#unzipFile(java.io.File, java.io.File, boolean)
305   */
306  public static List<File> unzipFile(File directory, File zipFile) throws IOException {
307     return unzipFile(directory, zipFile, false);
308  }
309
310  /**
311   * Zip a directory with all files but skipping included subdirectories.
312   * Only files directly within the directory are added to the archive.
313   *
314   * @param dir     the directory to zip
315   * @param zipFile the zipped file
316   */
317  public static void zipDir(File dir, File zipFile) throws IOException {
318    zipDir(dir, zipFile, false);
319  }
320
321  /**
322   * Zip a directory with all files. Files in Subdirectories will be included if the inclSubdirs is true.
323   *
324   * @param dir     the directory to zip
325   * @param zipFile the zipped file
326   * @param inclSubdirs if true includes all subdirectories recursively
327   */
328  public static void zipDir(File dir, File zipFile, boolean inclSubdirs) throws IOException {
329    Collection<File> files = org.apache.commons.io.FileUtils.listFiles(dir, null, inclSubdirs);
330    zipFiles(files, dir, zipFile);
331  }
332
333  public static void zipFile(File file, File zipFile) throws IOException {
334    Set<File> files = new HashSet<File>();
335    files.add(file);
336    zipFiles(files, file.getParentFile(), zipFile);
337  }
338
339  /**
340   * Creates a zip archive from a given collection of files.
341   * In order to preserve paths in the archive a rootContext can be specified which will be removed from the individual
342   * zip entries. For example a rootContext of /home/freak with a file /home/freak/photo/birthday.jpg to be zipped
343   * will result in a zip entry with a path photo/birthday.jpg.
344   *
345   * @param files to be included in the zip archive
346   * @param rootContext optional path to be removed from each file
347   * @param zipFile the zip file to be created
348   * @throws IOException
349   */
350  public static void zipFiles(Collection<File> files, File rootContext, File zipFile) throws IOException {
351    if (files.isEmpty()) {
352      LOG.info("no files to zip.");
353    } else {
354      try {
355        BufferedInputStream origin = null;
356        FileOutputStream dest = new FileOutputStream(zipFile);
357        ZipOutputStream out = new ZipOutputStream(new BufferedOutputStream(dest));
358        // out.setMethod(ZipOutputStream.DEFLATED);
359        byte[] data = new byte[BUFFER];
360        for (File f : files) {
361          LOG.debug("Adding file {} to archive", f);
362          FileInputStream fi = new FileInputStream(f);
363          origin = new BufferedInputStream(fi, BUFFER);
364
365          String zipPath = StringUtils.removeStart(f.getAbsolutePath(), rootContext.getAbsolutePath() + File.separator);
366          ZipEntry entry = new ZipEntry(zipPath);
367          out.putNextEntry(entry);
368          int count;
369          while ((count = origin.read(data, 0, BUFFER)) != -1) {
370            out.write(data, 0, count);
371          }
372          origin.close();
373        }
374        out.finish();
375        out.close();
376      } catch (IOException e) {
377        LOG.error("IOException while zipping files: {}", files);
378        throw e;
379      }
380    }
381  }
382
383  /**
384   * Extracts a zipped file into a target directory. If the file is wrapped in a root directory, this is removed by
385   * default. Other subdirectories are ignored according to the parameter keepSubdirectories.
386   * </br>
387   * The following types of files are also ignored by default:
388   * i) hidden files (i.e. files starting with a dot)
389   * ii) Apple resource fork (__MACOSX), including its subdirectories and subfiles
390   *
391   * @param directory          where the zipped file and its subdirectories should be extracted to
392   * @param zipFile            to extract
393   * @param keepSubdirectories whether to preserve subdirectories or not
394   *
395   * @return a list of all created files and directories extracted to target directory
396   */
397  public static List<File> unzipFile(File directory, File zipFile, boolean keepSubdirectories) throws IOException {
398    LOG.debug("Unzipping archive " + zipFile.getName() + " into directory: " + directory.getAbsolutePath());
399
400    // This is changed from using ZipFile to a ZipInputStream since Java 8u192 can't open certain Zip64 files.
401    // https://bugs.openjdk.java.net/browse/JDK-8186464
402    try (FileInputStream fInput = new FileInputStream(zipFile);
403         ZipInputStream zipInput = new ZipInputStream(fInput)) {
404      ZipEntry entry;
405
406      while ((entry = zipInput.getNextEntry()) != null) {
407        // ignore resource fork directories and subfiles
408        if (entry.getName().toUpperCase().contains(APPLE_RESOURCE_FORK)) {
409          LOG.debug("Ignoring resource fork file: " + entry.getName());
410        }
411        // ignore directories and hidden directories (e.g. .svn) (based on flag)
412        else if (entry.isDirectory()) {
413          if (isHiddenFile(new File(entry.getName()))) {
414            LOG.debug("Ignoring hidden directory: " + entry.getName());
415          } else if (keepSubdirectories) {
416            new File(directory, entry.getName()).mkdir();
417          } else {
418            LOG.debug("Ignoring (sub)directory: " + entry.getName());
419          }
420        }
421        // ignore hidden files
422        else {
423          if (isHiddenFile(new File(entry.getName()))) {
424            LOG.debug("Ignoring hidden file: " + entry.getName());
425          } else {
426            File targetFile = (keepSubdirectories) ? new File(directory, entry.getName())
427              : new File(directory, new File(entry.getName()).getName());
428            // ensure parent folder always exists, and extract file
429            createParentFolder(targetFile);
430
431            LOG.debug("Extracting file: {} to: {}", entry.getName(), targetFile.getAbsolutePath());
432            try (OutputStream out = new BufferedOutputStream(new FileOutputStream(targetFile))) {
433              IOUtils.copy(zipInput, out);
434            }
435          }
436        }
437      }
438    }
439    // remove the wrapping root directory and flatten structure
440    if (keepSubdirectories) {
441      removeRootDirectory(directory);
442    }
443    return (directory.listFiles() == null) ? new ArrayList<File>() : Arrays.asList(directory.listFiles());
444  }
445
446  /**
447   * @return true if file is a hidden file or directory, or if any of its parent directories are hidden checking
448   * recursively
449   */
450  private static boolean isHiddenFile(File f) {
451    if (f.getName().startsWith(".")) {
452      return true;
453    } else if (f.getParentFile() != null) {
454      return isHiddenFile(f.getParentFile());
455    }
456    return false;
457  }
458
459  /**
460   * Removes a wrapping root directory and flatten its structure by moving all that root directory's files and
461   * subdirectories up to the same level as the root directory.
462   */
463  private static void removeRootDirectory(File directory) {
464    File[] rootFiles = directory.listFiles((FileFilter) HiddenFileFilter.VISIBLE);
465    if (rootFiles.length == 1) {
466      File root = rootFiles[0];
467      if (root.isDirectory()) {
468        LOG.debug("Removing single root folder {} found in decompressed archive", root.getAbsoluteFile());
469        for (File f : org.apache.commons.io.FileUtils.listFilesAndDirs(root, TrueFileFilter.TRUE, TrueFileFilter.TRUE)) {
470          File f2 = new File(directory, f.getName());
471          f.renameTo(f2);
472        }
473        root.delete();
474      }
475    }
476  }
477
478  /**
479   * Make parent folder.
480   *
481   * @param file destination file
482   */
483  private static void createParentFolder(File file) {
484    File parent = new File(file.getParent());
485    if (!parent.exists()) {
486      LOG.debug((parent.mkdirs()) ? "Created parent directory: " + parent.getAbsolutePath()
487        : "Failed to create parent directory: " + parent.getAbsolutePath());
488    }
489  }
490}