/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.gbif.utils.file;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.RandomAccessFile;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.zip.GZIPInputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipException;
import java.util.zip.ZipInputStream;
import java.util.zip.ZipOutputStream;

import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.filefilter.HiddenFileFilter;
import org.apache.commons.io.filefilter.TrueFileFilter;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Static helpers to create Zip archives and to extract Zip, TAR and gzipped TAR archives.
 */
public class CompressionUtil {

  private CompressionUtil() {
    throw new UnsupportedOperationException("Can't initialize class");
  }

  /**
   * Thrown when an archive cannot be read with any of the supported formats.
   */
  public static class UnsupportedCompressionType extends RuntimeException {

    public UnsupportedCompressionType() {}

    public UnsupportedCompressionType(String message) {
      super(message);
    }

    public UnsupportedCompressionType(String message, Throwable cause) {
      super(message, cause);
    }
  }

  private static final Logger LOG = LoggerFactory.getLogger(CompressionUtil.class);
  private static final int BUFFER = 2048;
  private static final String APPLE_RESOURCE_FORK = "__MACOSX";
  private static final byte[] TAR_MAGIC_BYTES = new byte[] {'u', 's', 't', 'a', 'r'};
  // TAR files contain "ustar\0" or "ustar " at byte 257.
  // https://www.gnu.org/software/tar/manual/html_node/Standard.html
  private static final int TAR_MAGIC_OFFSET = 257;

  /**
   * Tries to decompress a file into a newly created temporary directory, trying gzipped TAR, TAR or
   * Zip regardless of the filename or its suffix.
   *
   * @param compressedFile the archive to extract
   *
   * @return folder containing all decompressed files
   *
   * @throws IOException if the temporary directory couldn't be created or extraction failed
   * @throws UnsupportedCompressionType if the compression type wasn't recognized
   */
  public static File decompressFile(File compressedFile)
      throws IOException, UnsupportedCompressionType {
    // reserve a unique name as a temp file, then swap the file for a directory of the same name
    File dir = File.createTempFile("gbif-", null);
    if (dir.exists() && !dir.delete()) {
      throw new IOException("Couldn't delete temporary directory");
    }

    if (!dir.mkdirs()) {
      throw new IOException("Couldn't create temporary directory for decompression");
    }

    decompressFile(dir, compressedFile);

    return dir;
  }

  /**
   * Defaults keeping subDirectories to false.
   *
   * @see org.gbif.utils.file.CompressionUtil#decompressFile(java.io.File, java.io.File, boolean)
   */
  public static List<File> decompressFile(File directory, File compressedFile)
      throws IOException, UnsupportedCompressionType {
    return decompressFile(directory, compressedFile, false);
  }

  /**
   * Tries to decompress a file using TAR+gzip, TAR or Zip regardless of the filename or its suffix.
   * The formats are tried in that order and the first one that succeeds wins.
   *
   * @param directory directory where archive's contents will be decompressed to
   * @param compressedFile compressed file
   * @param keepSubdirectories whether subdirectories of a Zip archive are preserved; not used for
   *        TAR archives, whose directory structure is always ignored
   *
   * @return list of files that have been extracted, or an empty list if nothing could be extracted
   *
   * @throws IOException if problem occurred reading compressed file, or directory couldn't be
   *         written to
   * @throws UnsupportedCompressionType if the compression type wasn't recognized
   */
  public static List<File> decompressFile(
      File directory, File compressedFile, boolean keepSubdirectories)
      throws IOException, UnsupportedCompressionType {
    List<File> files = null;

    // Test before trying gzip format
    if (isGzipFormat(compressedFile)) {
      try {
        LOG.debug("Uncompressing {} with gzip compression to {}", compressedFile, directory);
        files = untgzFile(directory, compressedFile);
      } catch (Exception e) {
        // deliberate best effort: fall through to the next format, but keep the cause visible
        LOG.debug("Not gzip compression", e);
      }
    }

    // Test before trying TAR format; never overwrite the result of a successful gzip extraction
    if (files == null && isTarFormat(compressedFile)) {
      try {
        LOG.debug("Uncompressing {} with TAR compression to {}", compressedFile, directory);
        files = untarFile(directory, compressedFile);
      } catch (Exception e) {
        // deliberate best effort: fall through to Zip, but keep the cause visible
        LOG.debug("Not TAR compression", e);
      }
    }

    // Then try zip
    if (files == null) {
      try {
        LOG.debug("Uncompressing {} with Zip compression to {}", compressedFile, directory);
        files = unzipFile(directory, compressedFile, keepSubdirectories);
      } catch (ZipException e) {
        LOG.debug("Not Zip compression");
        throw new UnsupportedCompressionType("Unknown compression type. Neither gzip nor Zip", e);
      }
    }

    if (files.isEmpty()) {
      LOG.warn("No files extracted from {}, tried TGZ, TAR and Zip compression.", compressedFile);
    }

    return files;
  }

  /**
   * Check the file's first two bytes, to see if they are the gzip magic number.
   *
   * @param compressedFile compressed file
   * @return true if the file is in gzip format
   * @throws IOException if a problem occurred reading compressed file
   */
  private static boolean isGzipFormat(File compressedFile) throws IOException {
    try (RandomAccessFile file = new RandomAccessFile(compressedFile, "r")) {
      int low = file.read();
      int high = file.read();
      if (low < 0 || high < 0) {
        // fewer than two bytes available
        return false;
      }
      // the magic number is stored little-endian
      return GZIPInputStream.GZIP_MAGIC == ((high << 8) | low);
    }
  }

  /**
   * Check the file is a Tape ARchive (TAR) by looking for the "ustar" marker at byte 257.
   *
   * @param compressedFile compressed file
   * @return true if the file is a TAR, false if it is not or if it could not be read
   * @throws IOException declared for symmetry with {@link #isGzipFormat(File)}; read problems are
   *         logged and reported as "not a TAR"
   */
  private static boolean isTarFormat(File compressedFile) throws IOException {
    try (RandomAccessFile file = new RandomAccessFile(compressedFile, "r")) {
      if (file.length() < TAR_MAGIC_OFFSET + TAR_MAGIC_BYTES.length) {
        return false;
      }
      byte[] marker = new byte[TAR_MAGIC_BYTES.length];
      file.seek(TAR_MAGIC_OFFSET);
      // readFully guarantees the whole marker is read, a plain read() may return fewer bytes
      file.readFully(marker);
      return Arrays.equals(marker, TAR_MAGIC_BYTES);
    } catch (IOException e) {
      LOG.debug("Could not check {} for the TAR marker", compressedFile, e);
      return false;
    }
  }

  /**
   * Extracts a gzipped TAR file. Directory structure and hidden files (i.e. files starting with a
   * dot) are ignored.
   *
   * @param directory where the file should be extracted to
   * @param tgzFile to extract
   *
   * @return a list of all created files
   *
   * @throws IOException if the file isn't gzip data or reading/writing fails
   */
  public static List<File> untgzFile(File directory, File tgzFile) throws IOException {
    // the file stream is a resource of its own so it is closed even if the gzip header is rejected
    try (InputStream fileIn = new FileInputStream(tgzFile);
        InputStream gzipIn = new GZIPInputStream(fileIn)) {
      return untarStream(directory, gzipIn);
    }
  }

  /**
   * Extracts a plain TAR file. Directory structure and hidden files (i.e. files starting with a
   * dot) are ignored.
   *
   * @param directory where the file should be extracted to
   * @param tarFile to extract
   *
   * @return a list of all created files
   *
   * @throws IOException if reading or writing fails
   */
  public static List<File> untarFile(File directory, File tarFile) throws IOException {
    try (InputStream fileIn = new FileInputStream(tarFile)) {
      return untarStream(directory, fileIn);
    }
  }

  /**
   * Extracts a TAR stream. Directory structure and hidden files (i.e. files starting with a dot)
   * are ignored: every entry is written directly into the target directory under its simple file
   * name. The given stream is closed when this method returns.
   *
   * @param directory where the file should be extracted to
   * @param tarStream to extract
   *
   * @return a list of all created files
   */
  private static List<File> untarStream(File directory, InputStream tarStream) throws IOException {
    List<File> files = new ArrayList<>();
    try (TarArchiveInputStream in = new TarArchiveInputStream(tarStream)) {
      TarArchiveEntry entry;
      while ((entry = in.getNextTarEntry()) != null) {
        if (entry.isDirectory()) {
          LOG.debug("TAR archive contains directories which are being ignored");
          continue;
        }
        // only the last path segment is used, so entries can't leave the target directory
        String fn = new File(entry.getName()).getName();
        if (fn.startsWith(".")) {
          LOG.debug("TAR archive contains a hidden file {} which is being ignored", fn);
          continue;
        }
        File targetFile = new File(directory, fn);
        if (targetFile.exists()) {
          LOG.warn("TAR archive contains duplicate filename {}, only the first was extracted", fn);
          continue;
        }
        LOG.debug("Extracting file: {} to: {}", entry.getName(), targetFile.getAbsolutePath());
        try (FileOutputStream out = new FileOutputStream(targetFile)) {
          IOUtils.copy(in, out);
        }
        files.add(targetFile);
      }
    }
    return files;
  }

  /**
   * Gunzip a file. Use this method with isTarred false if the gzip contains a single file. If it's
   * a gzip of a TAR pass true to isTarred (or call {@link #untgzFile(File, File)} which is what
   * this method just redirects to for isTarred).
   *
   * <p>The output file is named after the gzip file with its last suffix removed. If the gzip file
   * name has no suffix to remove (no dot, or only a leading dot), ".uncompressed" is appended
   * instead so the output never shares the input's name.
   *
   * @param directory the output directory for the uncompressed file(s)
   * @param gzipFile the gzip file
   * @param isTarred true if the gzip contains a TAR
   *
   * @return a List of the uncompressed file name(s)
   *
   * @throws IOException if reading or writing fails
   */
  public static List<File> ungzipFile(File directory, File gzipFile, boolean isTarred)
      throws IOException {
    if (isTarred) {
      return untgzFile(directory, gzipFile);
    }

    // assume that the gzip filename is the filename + .gz
    String gzipName = gzipFile.getName();
    int lastDot = gzipName.lastIndexOf('.');
    String unzippedName =
        lastDot > 0 ? gzipName.substring(0, lastDot) : gzipName + ".uncompressed";
    File outputFile = new File(directory, unzippedName);

    // resources open in declaration order: the output file is only created for valid gzip input
    try (InputStream fileIn = new FileInputStream(gzipFile);
        InputStream in = new GZIPInputStream(fileIn);
        OutputStream dest = new BufferedOutputStream(new FileOutputStream(outputFile), BUFFER)) {
      LOG.debug("Extracting file: {} to: {}", unzippedName, outputFile.getAbsolutePath());
      IOUtils.copy(in, dest);
    }

    List<File> files = new ArrayList<>();
    files.add(outputFile);
    return files;
  }

  /**
   * Defaults keepSubdirectories to false.
   *
   * @see org.gbif.utils.file.CompressionUtil#unzipFile(java.io.File, java.io.File, boolean)
   */
  public static List<File> unzipFile(File directory, File zipFile) throws IOException {
    return unzipFile(directory, zipFile, false);
  }

  /**
   * Zip a directory with all files but skipping included subdirectories.
   * Only files directly within the directory are added to the archive.
   *
   * @param dir the directory to zip
   * @param zipFile the zipped file
   */
  public static void zipDir(File dir, File zipFile) throws IOException {
    zipDir(dir, zipFile, false);
  }

  /**
   * Zip a directory with all files. Files in Subdirectories will be included if the inclSubdirs is
   * true.
   *
   * @param dir the directory to zip
   * @param zipFile the zipped file
   * @param inclSubdirs if true includes all subdirectories recursively
   */
  public static void zipDir(File dir, File zipFile, boolean inclSubdirs) throws IOException {
    Collection<File> files = FileUtils.listFiles(dir, null, inclSubdirs);
    zipFiles(files, dir, zipFile);
  }

  /**
   * Zip a single file. The entry is stored under the file's simple name.
   *
   * @param file the file to zip
   * @param zipFile the zipped file
   */
  public static void zipFile(File file, File zipFile) throws IOException {
    Set<File> files = new HashSet<>();
    files.add(file);
    // resolve against the absolute file: getParentFile() is null for a bare relative name
    zipFiles(files, file.getAbsoluteFile().getParentFile(), zipFile);
  }

  /**
   * Creates a zip archive from a given collection of files.
   * In order to preserve paths in the archive a rootContext can be specified which will be removed
   * from the individual zip entries. For example a rootContext of /home/freak with a file
   * /home/freak/photo/birthday.jpg to be zipped will result in a zip entry with a path
   * photo/birthday.jpg. Entry names always use forward slashes, whatever the platform separator.
   * Nothing is written if the collection is empty.
   *
   * @param files to be included in the zip archive
   * @param rootContext optional path to be removed from each file; if null, or if a file is not
   *        located below it, the file's absolute path is used for the entry
   * @param zipFile the zip file to be created
   * @throws IOException if a file can't be read or the archive can't be written
   */
  public static void zipFiles(Collection<File> files, File rootContext, File zipFile)
      throws IOException {
    if (files.isEmpty()) {
      LOG.info("no files to zip.");
      return;
    }

    // StringUtils.removeStart leaves the input untouched for a null prefix
    String rootPrefix =
        rootContext == null ? null : rootContext.getAbsolutePath() + File.separator;

    try (ZipOutputStream out =
        new ZipOutputStream(new BufferedOutputStream(new FileOutputStream(zipFile)))) {
      for (File f : files) {
        LOG.debug("Adding file {} to archive", f);
        // open the source first so an unreadable file doesn't leave an empty entry behind
        try (InputStream origin = new BufferedInputStream(new FileInputStream(f), BUFFER)) {
          String zipPath =
              StringUtils.removeStart(f.getAbsolutePath(), rootPrefix)
                  .replace(File.separatorChar, '/');
          out.putNextEntry(new ZipEntry(zipPath));
          IOUtils.copy(origin, out);
          out.closeEntry();
        }
      }
      out.finish();
    } catch (IOException e) {
      LOG.error("IOException while zipping files: {}", files, e);
      throw e;
    }
  }

  /**
   * Extracts a zipped file into a target directory. Subdirectories are preserved or flattened
   * according to the parameter keepSubdirectories. If subdirectories are kept and the archive
   * content is wrapped in a single root directory, that root directory is removed after extraction.
   *
   * <p>The following types of files are always ignored:
   * i) hidden files (i.e. files starting with a dot)
   * ii) Apple resource fork (__MACOSX), including its subdirectories and subfiles
   *
   * @param directory where the zipped file and its subdirectories should be extracted to
   * @param zipFile to extract
   * @param keepSubdirectories whether to preserve subdirectories or not
   *
   * @return a list of all files and directories found directly in the target directory after
   *         extraction
   *
   * @throws IOException if reading or writing fails, or if an entry would be written outside of
   *         the target directory
   */
  public static List<File> unzipFile(File directory, File zipFile, boolean keepSubdirectories)
      throws IOException {
    LOG.debug(
        "Unzipping archive {} into directory: {}", zipFile.getName(), directory.getAbsolutePath());

    Path targetRoot = directory.toPath().toAbsolutePath().normalize();

    // This is changed from using ZipFile to a ZipInputStream since Java 8u192 can't open certain
    // Zip64 files.
    // https://bugs.openjdk.java.net/browse/JDK-8186464
    try (FileInputStream fInput = new FileInputStream(zipFile);
        ZipInputStream zipInput = new ZipInputStream(fInput)) {
      ZipEntry entry;

      while ((entry = zipInput.getNextEntry()) != null) {
        String entryName = entry.getName();

        // ignore resource fork directories and subfiles
        if (entryName.toUpperCase().contains(APPLE_RESOURCE_FORK)) {
          LOG.debug("Ignoring resource fork file: {}", entryName);
          continue;
        }

        boolean hidden = isHiddenFile(new File(entryName));

        // ignore directories and hidden directories (e.g. .svn) (based on flag)
        if (entry.isDirectory()) {
          if (hidden) {
            LOG.debug("Ignoring hidden directory: {}", entryName);
          } else if (keepSubdirectories) {
            File subdirectory = new File(directory, entryName);
            requireInsideTarget(targetRoot, subdirectory, entryName);
            // mkdirs: an archive may list a nested directory without an entry for its parent
            if (!subdirectory.isDirectory() && !subdirectory.mkdirs()) {
              LOG.warn("Failed to create directory: {}", subdirectory.getAbsolutePath());
            }
          } else {
            LOG.debug("Ignoring (sub)directory: {}", entryName);
          }
          continue;
        }

        // ignore hidden files
        if (hidden) {
          LOG.debug("Ignoring hidden file: {}", entryName);
          continue;
        }

        File targetFile =
            keepSubdirectories
                ? new File(directory, entryName)
                : new File(directory, new File(entryName).getName());
        requireInsideTarget(targetRoot, targetFile, entryName);

        // ensure parent folder always exists, and extract file
        createParentFolder(targetFile);

        LOG.debug("Extracting file: {} to: {}", entryName, targetFile.getAbsolutePath());
        try (OutputStream out =
            new BufferedOutputStream(Files.newOutputStream(targetFile.toPath()))) {
          IOUtils.copy(zipInput, out);
        }
      }
    }

    // remove the wrapping root directory and flatten structure
    if (keepSubdirectories) {
      removeRootDirectory(directory);
    }

    File[] files = directory.listFiles();

    return (files == null) ? new ArrayList<>() : Arrays.asList(files);
  }

  /**
   * Guards against archive entries that resolve to a location outside of the extraction directory.
   * Entries with ".." path segments are already skipped by the hidden file filter because such a
   * segment starts with a dot; this check makes the guarantee explicit instead of incidental.
   *
   * @param targetRoot absolute, normalized extraction directory
   * @param target the file or directory about to be created
   * @param entryName name of the archive entry, used for the error message
   *
   * @throws IOException if target is not located inside targetRoot
   */
  private static void requireInsideTarget(Path targetRoot, File target, String entryName)
      throws IOException {
    if (!target.toPath().toAbsolutePath().normalize().startsWith(targetRoot)) {
      throw new IOException(
          "Archive entry " + entryName + " would be extracted outside of " + targetRoot);
    }
  }

  /**
   * @return true if file is a hidden file or directory, or if any of its parent directories are
   *         hidden, i.e. if any segment of its path starts with a dot
   */
  private static boolean isHiddenFile(File f) {
    for (File current = f; current != null; current = current.getParentFile()) {
      if (current.getName().startsWith(".")) {
        return true;
      }
    }
    return false;
  }

  /**
   * Removes a wrapping root directory and flatten its structure by moving all that root directory's
   * files and subdirectories up to the same level as the root directory. Nothing is changed unless
   * the given directory contains exactly one visible entry and that entry is a directory.
   */
  private static void removeRootDirectory(File directory) {
    File[] rootFiles = directory.listFiles((FileFilter) HiddenFileFilter.VISIBLE);
    if (rootFiles == null) {
      LOG.error("Failed to retrieve root directory from {}", directory.getAbsolutePath());
      return;
    }

    // also reached when the directory has no visible entries at all
    if (rootFiles.length != 1) {
      LOG.error("More than one root directory at {}", directory.getAbsolutePath());
      return;
    }

    File root = rootFiles[0];
    if (!root.isDirectory()) {
      return;
    }

    LOG.debug(
        "Removing single root folder {} found in decompressed archive", root.getAbsoluteFile());
    File[] children = root.listFiles();
    if (children == null) {
      LOG.warn("Failed to list content of root folder {}", root.getAbsolutePath());
      return;
    }

    // moving a direct child takes its whole subtree along, so descendants need no handling
    for (File child : children) {
      File moved = new File(directory, child.getName());
      if (!child.renameTo(moved)) {
        LOG.warn("Failed to move {} to {}", child.getAbsolutePath(), moved.getAbsolutePath());
      }
    }

    if (!root.delete()) {
      LOG.warn("Failed to delete root folder {}", root.getAbsolutePath());
    }
  }

  /**
   * Make parent folder, including any missing ancestors. Does nothing if the file has no parent or
   * the parent already exists.
   *
   * @param file destination file
   */
  private static void createParentFolder(File file) {
    File parent = file.getParentFile();
    if (parent == null || parent.exists()) {
      return;
    }
    if (parent.mkdirs()) {
      LOG.debug("Created parent directory: {}", parent.getAbsolutePath());
    } else {
      LOG.debug("Failed to create parent directory: {}", parent.getAbsolutePath());
    }
  }
}