/*
 * Copyright 2021 Global Biodiversity Information Facility (GBIF)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.gbif.utils.file;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A utility to split files into chunks.
 * This is done on file size, but then extended to read to the end of the current line.
 * Therefore a chunk size of 32 MB will result in split files of slightly more (assuming lines are
 * not very long).
 * This is done using NIO libraries for high performance.
 */
public class FileSplitter {

  private static final Logger LOG = LoggerFactory.getLogger(FileSplitter.class);
  public static final String SEPARATOR = "_";
  /** Size in bytes of the buffer used when scanning forward for the end of a line. */
  public static final int READ_AHEAD_BYTES = 256;

  /**
   * Computes byte markers for reading the file in line-aligned chunks of approximately
   * {@code chunkSizeBytes} each.
   * <p>
   * For every multiple of {@code chunkSizeBytes} that lies before the end of the file, one marker
   * is produced: the offset of the first byte following the next {@code '\n'} found at or after
   * that multiple. If the file ends before a line break is found, the marker is the file size.
   *
   * @param from the file to scan
   * @param chunkSizeBytes the nominal chunk size in bytes, must be positive
   * @return the markers in ascending order; empty if the file is not larger than one chunk
   * @throws IllegalArgumentException if {@code chunkSizeBytes} is not positive
   * @throws IOException if the file cannot be opened or read
   */
  public static List<Long> scanToChunk(File from, long chunkSizeBytes) throws IOException {
    if (chunkSizeBytes <= 0) {
      // a non-positive step would never advance through the file
      throw new IllegalArgumentException("chunkSizeBytes must be positive but was " + chunkSizeBytes);
    }

    List<Long> chunkBytes = new ArrayList<>();
    try (FileInputStream fis = new FileInputStream(from);
        FileChannel fcin = fis.getChannel()) {
      long size = fcin.size();
      long boundary = chunkSizeBytes;
      while (boundary < size) {
        chunkBytes.add(boundary + bytesToEndOfLine(fcin, boundary));
        if (size - boundary <= chunkSizeBytes) {
          // the next boundary would be at or beyond the end of the file (also guards overflow)
          break;
        }
        boundary += chunkSizeBytes;
      }
    }
    return chunkBytes;
  }

  /**
   * Splits a file "pumaConcolor.txt" into the target directory using the suffix ("part") like so:
   * <ul>
   *   <li>pumaConcolor_part_0.txt</li>
   *   <li>pumaConcolor_part_1.txt</li>
   *   <li>pumaConcolor_part_2.txt</li>
   * </ul>
   * Each part holds {@code chunkSizeBytes} bytes of the source, extended up to and including the
   * next {@code '\n'} (or to the end of the file if there is none), so that no line is divided
   * between two parts. The name is divided at the first {@code '.'}; a name without a dot gets no
   * extension on its parts.
   *
   * @param from the file to split
   * @param targetDirectory the directory in which the parts are created
   * @param suffix the label inserted between the base name and the part number
   * @param chunkSizeBytes the nominal part size in bytes, must not be negative
   * @return the file parts in order; empty if the source file is empty
   * @throws IllegalArgumentException if {@code chunkSizeBytes} is negative
   * @throws IOException if the source cannot be read or a part cannot be written
   */
  public static List<File> split(File from, File targetDirectory, String suffix, long chunkSizeBytes)
      throws IOException {
    if (chunkSizeBytes < 0) {
      throw new IllegalArgumentException("chunkSizeBytes must not be negative but was " + chunkSizeBytes);
    }

    String sourceName = from.getName();
    int dot = sourceName.indexOf('.');
    String filePartNamePrefix = dot < 0 ? sourceName : sourceName.substring(0, dot);
    String filePartNameSuffix = dot < 0 ? "" : sourceName.substring(dot);

    List<File> files = new ArrayList<>();
    try (FileInputStream fis = new FileInputStream(from);
        FileChannel fcin = fis.getChannel()) {
      long size = fcin.size();
      long position = 0;
      int filePartCount = 0;

      while (position < size) {
        long time = System.currentTimeMillis();

        // create the output file
        String fileName = filePartNamePrefix + SEPARATOR + suffix + SEPARATOR + filePartCount + filePartNameSuffix;
        File to = new File(targetDirectory, fileName);
        files.add(to);

        // the nominal end of this part, capped at the end of the file (comparison avoids overflow)
        long remaining = size - position;
        long nominalEnd = chunkSizeBytes >= remaining ? size : position + chunkSizeBytes;
        // ...extended so that the part finishes on a line break
        long end = nominalEnd + bytesToEndOfLine(fcin, nominalEnd);

        // copy to the new file
        try (FileOutputStream fos = new FileOutputStream(to);
            FileChannel fcout = fos.getChannel()) {
          transferFully(fcin, position, end - position, fcout);
        }

        position = end;
        filePartCount++;
        LOG.debug("Filepart[{}] created in {} secs", fileName, (1 + System.currentTimeMillis() - time) / 1000);
      }
    }
    return files;
  }

  /**
   * Counts the bytes from {@code position} up to and including the next {@code '\n'}.
   * Only bytes actually read from the channel are inspected, and reading continues in blocks of
   * {@link #READ_AHEAD_BYTES} until a line break or the end of the file is reached.
   *
   * @return the number of bytes to the end of the line, or the number of bytes left in the file if
   *         no line break follows; 0 if {@code position} is at or beyond the end of the file
   */
  private static long bytesToEndOfLine(FileChannel channel, long position) throws IOException {
    ByteBuffer buffer = ByteBuffer.allocate(READ_AHEAD_BYTES);
    long scanned = 0;
    while (true) {
      // reset position and limit, otherwise a full buffer makes the next read a no-op
      buffer.clear();
      int read = channel.read(buffer, position + scanned);
      if (read <= 0) {
        // end of file (or no progress possible): the line runs to the end of the data
        return scanned;
      }
      for (int i = 0; i < read; i++) {
        if (buffer.get(i) == '\n') {
          return scanned + i + 1;
        }
      }
      scanned += read;
    }
  }

  /**
   * Transfers exactly {@code count} bytes starting at {@code position} from source to target.
   * A single {@link FileChannel#transferTo} call may move fewer bytes than requested, so it is
   * repeated until the whole range has been written.
   *
   * @throws IOException if the source yields no more bytes before the range is complete
   */
  private static void transferFully(FileChannel source, long position, long count, FileChannel target)
      throws IOException {
    long done = 0;
    while (done < count) {
      long transferred = source.transferTo(position + done, count - done, target);
      if (transferred <= 0) {
        throw new IOException("Unexpected end of file after " + done + " of " + count
            + " bytes transferred from position " + position);
      }
      done += transferred;
    }
  }

  private FileSplitter() {
    throw new UnsupportedOperationException("Can't initialize class");
  }

}