/*
 * Copyright 2021 Global Biodiversity Information Facility (GBIF)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.gbif.utils.file;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.RandomAccessFile;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.zip.GZIPInputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipException;
import java.util.zip.ZipInputStream;
import java.util.zip.ZipOutputStream;

import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.filefilter.HiddenFileFilter;
import org.apache.commons.io.filefilter.TrueFileFilter;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Static helpers to create Zip archives and to extract Zip, gzip, TAR and gzipped TAR archives.
 */
public class CompressionUtil {

  private CompressionUtil() {
    throw new UnsupportedOperationException("Can't initialize class");
  }

  /**
   * Thrown when a file could not be recognised as any of the supported archive formats.
   */
  public static class UnsupportedCompressionType extends RuntimeException {

    public UnsupportedCompressionType() {
    }

    public UnsupportedCompressionType(String message) {
      super(message);
    }

    public UnsupportedCompressionType(String message, Throwable cause) {
      super(message, cause);
    }

  }

  private static final Logger LOG = LoggerFactory.getLogger(CompressionUtil.class);
  private static final int BUFFER = 2048;
  private static final String APPLE_RESOURCE_FORK = "__MACOSX";
  private static final byte[] TAR_MAGIC_BYTES = new byte[]{'u', 's', 't', 'a', 'r'};
  // Offset of the "ustar" magic within the first TAR header block.
  private static final int TAR_MAGIC_OFFSET = 257;

  /**
   * Tries to decompress a file into a newly created temporary directory, trying gzip, TAR or zip regardless of the
   * filename or its suffix. If decompression fails the temporary directory is removed again before the exception
   * is propagated.
   *
   * @param compressedFile compressed file
   *
   * @return folder containing all decompressed files
   *
   * @throws IOException                if the temporary directory couldn't be created or the archive couldn't be read
   * @throws UnsupportedCompressionType if the compression type wasn't recognized
   */
  public static File decompressFile(File compressedFile) throws IOException, UnsupportedCompressionType {
    // Creates the directory in a single step instead of the create-file/delete/mkdirs sequence, which left a
    // window in which another process could claim the same path.
    File dir = Files.createTempDirectory("gbif-").toFile();
    try {
      decompressFile(dir, compressedFile);
    } catch (IOException | RuntimeException e) {
      // don't leave a half-filled temporary directory behind
      org.apache.commons.io.FileUtils.deleteQuietly(dir);
      throw e;
    }
    return dir;
  }

  /**
   * Defaults keeping subDirectories to false.
   *
   * @see org.gbif.utils.file.CompressionUtil#decompressFile(java.io.File, java.io.File, boolean)
   */
  public static List<File> decompressFile(File directory, File compressedFile)
    throws IOException, UnsupportedCompressionType {
    return decompressFile(directory, compressedFile, false);
  }

  /**
   * Tries to decompress a file using TAR+gzip, TAR or Zip regardless of the filename or its suffix.
   *
   * @param directory          directory where archive's contents will be decompressed to
   * @param compressedFile     compressed file
   * @param keepSubdirectories whether to preserve subdirectories when the archive turns out to be a Zip file
   *
   * @return list of files that have been extracted, or an empty list if nothing could be extracted
   *
   * @throws IOException                if problem occurred reading compressed file, or directory couldn't be written
   *                                    to
   * @throws UnsupportedCompressionType if the compression type wasn't recognized
   */
  public static List<File> decompressFile(File directory, File compressedFile, boolean keepSubdirectories)
    throws IOException, UnsupportedCompressionType {
    List<File> files = null;

    // Test before trying gzip format
    if (isGzipFormat(compressedFile)) {
      try {
        LOG.debug("Uncompressing {} with gzip compression to {}", compressedFile, directory);
        files = untgzFile(directory, compressedFile);
      } catch (Exception e) {
        // deliberately best-effort: fall through to the next format, but keep the cause visible
        LOG.debug("Not gzip compression", e);
      }
    }

    // Test before trying TAR format (pointless once the gzip attempt succeeded)
    if (files == null && isTarFormat(compressedFile)) {
      try {
        LOG.debug("Uncompressing {} with TAR compression to {}", compressedFile, directory);
        files = untarFile(directory, compressedFile);
      } catch (Exception e) {
        LOG.debug("Not TAR compression", e);
      }
    }

    // Then try zip
    if (files == null) {
      try {
        LOG.debug("Uncompressing {} with Zip compression to {}", compressedFile, directory);
        files = unzipFile(directory, compressedFile, keepSubdirectories);
      } catch (ZipException e) {
        LOG.debug("Not Zip compression", e);
        throw new UnsupportedCompressionType("Unknown compression type. Neither gzip nor Zip", e);
      }
    }

    if (files.isEmpty()) {
      LOG.warn("No files extracted from {}, tried TGZ, TAR and Zip compression.", compressedFile);
    }

    return files;
  }

  /**
   * Check the file's first two bytes, to see if they are the gzip magic number.
   *
   * @param compressedFile compressed file
   * @return true if the file is in gzip format
   * @throws IOException if a problem occurred reading compressed file
   */
  private static boolean isGzipFormat(File compressedFile) throws IOException {
    try (RandomAccessFile file = new RandomAccessFile(compressedFile, "r")) {
      int low = file.read();
      int high = file.read();
      if (low < 0 || high < 0) {
        // fewer than two bytes available
        return false;
      }
      // the magic number is stored little-endian
      return GZIPInputStream.GZIP_MAGIC == ((low & 0xff) | ((high & 0xff) << 8));
    }
  }

  /**
   * Check the file is a Tape ARchive (TAR).
   *
   * @param compressedFile compressed file
   * @return true if the file is a TAR; false if it is not, or if it could not be read
   * @throws IOException declared for compatibility; read problems are logged and reported as "not a TAR"
   */
  private static boolean isTarFormat(File compressedFile) throws IOException {
    try (RandomAccessFile file = new RandomAccessFile(compressedFile, "r")) {
      // TAR files contain "ustar\0" or "ustar " at byte 257.
      // https://www.gnu.org/software/tar/manual/html_node/Standard.html
      if (file.length() < TAR_MAGIC_OFFSET + TAR_MAGIC_BYTES.length) {
        return false;
      }
      byte[] magic = new byte[TAR_MAGIC_BYTES.length];
      file.seek(TAR_MAGIC_OFFSET);
      // readFully, as a plain read may legally return fewer bytes than requested
      file.readFully(magic);
      return Arrays.equals(magic, TAR_MAGIC_BYTES);
    } catch (IOException e) {
      LOG.debug("Could not check {} for the TAR magic bytes", compressedFile, e);
      return false;
    }
  }

  /**
   * Extracts a gzipped TAR file. Directory structure and hidden files (i.e. files starting with a dot) are ignored.
   *
   * @param directory where the file should be extracted to
   * @param tgzFile   to extract
   *
   * @return a list of all created files
   *
   * @throws IOException if the file is not gzip compressed, or reading or writing fails
   */
  public static List<File> untgzFile(File directory, File tgzFile) throws IOException {
    // The file stream is a resource of its own so it is closed even when the GZIPInputStream constructor
    // rejects the header.
    try (InputStream fileIn = new FileInputStream(tgzFile);
         InputStream gzipIn = new GZIPInputStream(fileIn)) {
      return untarStream(directory, gzipIn);
    }
  }

  /**
   * Extracts a plain TAR file. Directory structure and hidden files (i.e. files starting with a dot) are ignored.
   *
   * @param directory where the file should be extracted to
   * @param tarFile   to extract
   *
   * @return a list of all created files
   *
   * @throws IOException if reading or writing fails
   */
  public static List<File> untarFile(File directory, File tarFile) throws IOException {
    try (InputStream fileIn = new FileInputStream(tarFile)) {
      return untarStream(directory, fileIn);
    }
  }

  /**
   * Extracts a TAR stream. Directory structure and hidden files (i.e. files starting with a dot) are ignored.
   * The given stream is closed when this method returns.
   *
   * @param directory where the file should be extracted to
   * @param tarStream to extract
   *
   * @return a list of all created files
   */
  private static List<File> untarStream(File directory, InputStream tarStream) throws IOException {
    List<File> files = new ArrayList<>();
    try (TarArchiveInputStream in = new TarArchiveInputStream(tarStream)) {
      TarArchiveEntry entry;
      while ((entry = in.getNextTarEntry()) != null) {
        if (entry.isDirectory()) {
          LOG.debug("TAR archive contains directories which are being ignored");
          continue;
        }
        // Only the last path segment is used, so entries can never be written outside the target directory.
        String fn = new File(entry.getName()).getName();
        if (fn.startsWith(".")) {
          LOG.debug("TAR archive contains a hidden file {} which is being ignored", fn);
          continue;
        }
        File targetFile = new File(directory, fn);
        if (targetFile.exists()) {
          LOG.warn("TAR archive contains duplicate filename {}, only the first was extracted", fn);
          continue;
        }
        LOG.debug("Extracting file: {} to: {}", entry.getName(), targetFile.getAbsolutePath());
        try (FileOutputStream out = new FileOutputStream(targetFile)) {
          IOUtils.copy(in, out);
        }
        files.add(targetFile);
      }
    }
    return files;
  }

  /**
   * Gunzip a file. Use this method with isTarred false if the gzip contains a single file. If it's a gzip
   * of a TAR pass true to isTarred (or call {@link #untgzFile(File, File)} which is what this method
   * just redirects to for isTarred).
   * <p>
   * The uncompressed file is named after the gzip file without its last suffix; a gzip file whose name has no
   * suffix keeps its name.
   *
   * @param directory the output directory for the uncompressed file(s)
   * @param gzipFile  the gzip file
   * @param isTarred  true if the gzip contains a TAR
   *
   * @return a List of the uncompressed file name(s)
   *
   * @throws IOException if reading or writing fails, or if the output would overwrite the gzip file itself
   */
  public static List<File> ungzipFile(File directory, File gzipFile, boolean isTarred) throws IOException {
    if (isTarred) {
      return untgzFile(directory, gzipFile);
    }

    // assume that the gzip filename is the filename + .gz
    String gzipName = gzipFile.getName();
    int lastDot = gzipName.lastIndexOf('.');
    // lastDot <= 0: no suffix at all, or a name such as ".gz" that would otherwise become empty
    String unzippedName = lastDot > 0 ? gzipName.substring(0, lastDot) : gzipName;
    File outputFile = new File(directory, unzippedName);
    if (outputFile.getCanonicalFile().equals(gzipFile.getCanonicalFile())) {
      throw new IOException("Refusing to overwrite gzip file " + gzipFile.getAbsolutePath()
                            + " with its own uncompressed content");
    }

    List<File> files = new ArrayList<>();
    // Resources are opened in order, so the output file is only created once the gzip header was accepted.
    try (InputStream fileIn = new FileInputStream(gzipFile);
         InputStream in = new GZIPInputStream(fileIn);
         OutputStream dest = new BufferedOutputStream(new FileOutputStream(outputFile), BUFFER)) {
      LOG.debug("Extracting file: {} to: {}", unzippedName, outputFile.getAbsolutePath());
      byte[] data = new byte[BUFFER];
      int count;
      while ((count = in.read(data, 0, BUFFER)) != -1) {
        dest.write(data, 0, count);
      }
      files.add(outputFile);
    }

    return files;
  }

  /**
   * Defaults keepSubdirectories to false.
   *
   * @see org.gbif.utils.file.CompressionUtil#unzipFile(java.io.File, java.io.File, boolean)
   */
  public static List<File> unzipFile(File directory, File zipFile) throws IOException {
    return unzipFile(directory, zipFile, false);
  }

  /**
   * Zip a directory with all files but skipping included subdirectories.
   * Only files directly within the directory are added to the archive.
   *
   * @param dir     the directory to zip
   * @param zipFile the zipped file
   */
  public static void zipDir(File dir, File zipFile) throws IOException {
    zipDir(dir, zipFile, false);
  }

  /**
   * Zip a directory with all files. Files in Subdirectories will be included if the inclSubdirs is true.
   *
   * @param dir         the directory to zip
   * @param zipFile     the zipped file
   * @param inclSubdirs if true includes all subdirectories recursively
   */
  public static void zipDir(File dir, File zipFile, boolean inclSubdirs) throws IOException {
    Collection<File> files = org.apache.commons.io.FileUtils.listFiles(dir, null, inclSubdirs);
    zipFiles(files, dir, zipFile);
  }

  /**
   * Zip a single file. The archive entry is named after the file, without any directory.
   *
   * @param file    the file to zip
   * @param zipFile the zipped file
   */
  public static void zipFile(File file, File zipFile) throws IOException {
    Set<File> files = new HashSet<>();
    files.add(file);
    // Resolve against the absolute path: a relative File without directory part has no parent.
    zipFiles(files, file.getAbsoluteFile().getParentFile(), zipFile);
  }

  /**
   * Creates a zip archive from a given collection of files.
   * In order to preserve paths in the archive a rootContext can be specified which will be removed from the individual
   * zip entries. For example a rootContext of /home/freak with a file /home/freak/photo/birthday.jpg to be zipped
   * will result in a zip entry with a path photo/birthday.jpg.
   * <p>
   * Without a rootContext every entry is named after its file only. Entry names always use '/' as separator.
   * No archive is created if the collection is empty.
   *
   * @param files       to be included in the zip archive
   * @param rootContext optional path to be removed from each file, may be null
   * @param zipFile     the zip file to be created
   * @throws IOException if a file couldn't be read or the archive couldn't be written
   */
  public static void zipFiles(Collection<File> files, File rootContext, File zipFile) throws IOException {
    if (files.isEmpty()) {
      LOG.info("no files to zip.");
      return;
    }

    String rootPrefix = null;
    if (rootContext != null) {
      rootPrefix = rootContext.getAbsolutePath();
      // a filesystem root already ends with the separator
      if (!rootPrefix.endsWith(File.separator)) {
        rootPrefix += File.separator;
      }
    }

    try (ZipOutputStream out = new ZipOutputStream(new BufferedOutputStream(new FileOutputStream(zipFile)))) {
      byte[] data = new byte[BUFFER];
      for (File f : files) {
        LOG.debug("Adding file {} to archive", f);
        // open the source first, so an unreadable file doesn't leave an empty entry behind
        try (InputStream origin = new BufferedInputStream(new FileInputStream(f), BUFFER)) {
          out.putNextEntry(new ZipEntry(zipEntryName(f, rootPrefix)));
          int count;
          while ((count = origin.read(data, 0, BUFFER)) != -1) {
            out.write(data, 0, count);
          }
          out.closeEntry();
        }
      }
    } catch (IOException e) {
      LOG.error("IOException while zipping files: {}", files, e);
      throw e;
    }
  }

  /**
   * Builds the archive entry name of a file: its absolute path without the root prefix, or its plain name if
   * there is no prefix, using '/' as separator on every platform.
   */
  private static String zipEntryName(File file, String rootPrefix) {
    String path = rootPrefix == null
      ? file.getName()
      : StringUtils.removeStart(file.getAbsolutePath(), rootPrefix);
    return path.replace(File.separatorChar, '/');
  }

  /**
   * Extracts a zipped file into a target directory. If the file is wrapped in a root directory, this is removed by
   * default. Other subdirectories are ignored according to the parameter keepSubdirectories.
   * <br/>
   * The following types of files are also ignored by default:
   * i) hidden files (i.e. files starting with a dot)
   * ii) Apple resource fork (__MACOSX), including its subdirectories and subfiles
   *
   * @param directory          where the zipped file and its subdirectories should be extracted to
   * @param zipFile            to extract
   * @param keepSubdirectories whether to preserve subdirectories or not
   *
   * @return a list of all files and directories found directly in the target directory after extraction
   *
   * @throws IOException if reading the archive or writing an extracted file fails
   */
  public static List<File> unzipFile(File directory, File zipFile, boolean keepSubdirectories) throws IOException {
    LOG.debug("Unzipping archive {} into directory: {}", zipFile.getName(), directory.getAbsolutePath());

    // This is changed from using ZipFile to a ZipInputStream since Java 8u192 can't open certain Zip64 files.
    // https://bugs.openjdk.java.net/browse/JDK-8186464
    try (FileInputStream fInput = new FileInputStream(zipFile);
         ZipInputStream zipInput = new ZipInputStream(fInput)) {
      ZipEntry entry;

      while ((entry = zipInput.getNextEntry()) != null) {
        String entryName = entry.getName();

        if (entryName.toUpperCase(Locale.ROOT).contains(APPLE_RESOURCE_FORK)) {
          // ignore resource fork directories and subfiles
          LOG.debug("Ignoring resource fork file: {}", entryName);
        } else if (isHiddenFile(new File(entryName))) {
          // ignore hidden files and directories (e.g. .svn); a ".." path segment starts with a dot as well,
          // so such entries are dropped here too
          LOG.debug("Ignoring hidden {}: {}", entry.isDirectory() ? "directory" : "file", entryName);
        } else if (entry.isDirectory()) {
          if (keepSubdirectories) {
            // mkdirs, since an archive doesn't have to list the parents of a nested directory first
            File subdirectory = new File(directory, entryName);
            if (!subdirectory.mkdirs() && !subdirectory.isDirectory()) {
              LOG.warn("Failed to create directory: {}", subdirectory.getAbsolutePath());
            }
          } else {
            LOG.debug("Ignoring (sub)directory: {}", entryName);
          }
        } else {
          File targetFile = keepSubdirectories
            ? new File(directory, entryName)
            : new File(directory, new File(entryName).getName());
          // ensure parent folder always exists, and extract file
          createParentFolder(targetFile);

          LOG.debug("Extracting file: {} to: {}", entryName, targetFile.getAbsolutePath());
          try (OutputStream out = new BufferedOutputStream(new FileOutputStream(targetFile))) {
            IOUtils.copy(zipInput, out);
          }
        }
      }
    }

    // remove the wrapping root directory and flatten structure
    if (keepSubdirectories) {
      removeRootDirectory(directory);
    }

    // list once: a second call could return null even though the first one didn't
    File[] extracted = directory.listFiles();
    return extracted == null ? new ArrayList<File>() : Arrays.asList(extracted);
  }

  /**
   * @return true if file is a hidden file or directory, or if any of its parent directories are hidden
   */
  private static boolean isHiddenFile(File f) {
    for (File current = f; current != null; current = current.getParentFile()) {
      if (current.getName().startsWith(".")) {
        return true;
      }
    }
    return false;
  }

  /**
   * Removes a wrapping root directory by moving all that root directory's files and subdirectories up to the
   * same level as the root directory. Nothing happens unless the directory contains exactly one visible entry
   * and that entry is a directory.
   */
  private static void removeRootDirectory(File directory) {
    File[] rootFiles = directory.listFiles((FileFilter) HiddenFileFilter.VISIBLE);
    if (rootFiles == null || rootFiles.length != 1 || !rootFiles[0].isDirectory()) {
      return;
    }

    File root = rootFiles[0];
    LOG.debug("Removing single root folder {} found in decompressed archive", root.getAbsoluteFile());

    // Only the direct children are moved: renaming a directory takes its content along, whereas walking the
    // whole tree would make the result depend on the order in which files and directories are visited.
    File[] children = root.listFiles();
    if (children != null) {
      for (File child : children) {
        File target = new File(directory, child.getName());
        if (!child.renameTo(target)) {
          LOG.warn("Couldn't move {} to {}", child.getAbsolutePath(), target.getAbsolutePath());
        }
      }
    }

    if (!root.delete()) {
      LOG.warn("Couldn't remove root folder {}", root.getAbsolutePath());
    }
  }

  /**
   * Make parent folder.
   *
   * @param file destination file
   *
   * @throws IOException if the parent folder doesn't exist and couldn't be created
   */
  private static void createParentFolder(File file) throws IOException {
    File parent = file.getParentFile();
    if (parent == null || parent.exists()) {
      return;
    }
    if (parent.mkdirs()) {
      LOG.debug("Created parent directory: {}", parent.getAbsolutePath());
    } else if (!parent.isDirectory()) {
      // isDirectory covers the case of somebody else creating the folder in the meantime
      throw new IOException("Failed to create parent directory: " + parent.getAbsolutePath());
    }
  }
}