001/* 002 * Licensed under the Apache License, Version 2.0 (the "License"); 003 * you may not use this file except in compliance with the License. 004 * You may obtain a copy of the License at 005 * 006 * http://www.apache.org/licenses/LICENSE-2.0 007 * 008 * Unless required by applicable law or agreed to in writing, software 009 * distributed under the License is distributed on an "AS IS" BASIS, 010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 011 * See the License for the specific language governing permissions and 012 * limitations under the License. 013 */ 014package org.gbif.utils.file; 015 016import java.io.File; 017import java.io.FileInputStream; 018import java.io.FileOutputStream; 019import java.io.IOException; 020import java.nio.ByteBuffer; 021import java.nio.channels.FileChannel; 022import java.util.ArrayList; 023import java.util.List; 024 025import org.slf4j.Logger; 026import org.slf4j.LoggerFactory; 027 028/** 029 * A utility to split files into chucks. 030 * This is done on file size, but then extended to read to the end of the current line. 031 * Therefore a chunksize of 32meg will result in a split files of slightly more (assuming lines are 032 * not very long). 033 * This is done using NIO libraries for high performance. 034 */ 035public class FileSplitter { 036 037 private static final Logger LOG = LoggerFactory.getLogger(FileSplitter.class); 038 public static final String SEPARATOR = "_"; 039 public static final int READ_AHEAD_BYTES = 256; 040 041 // for the file, gives the Byte markers for reading lines, such that the lines read will 042 // approximately 043 // equate be the chunk size (slightly more as it reads to the end of the row) 044 public static List<Long> scanToChunk(File from, long chunkSizeBytes) throws IOException { 045 List<Long> chunkBytes = new ArrayList<Long>(); 046 FileInputStream fis = new FileInputStream(from); 047 FileChannel fcin = fis.getChannel(); 048 049 long byteCount = chunkSizeBytes; 050 051 // now we need to read and transfer to the end of the line... 052 ByteBuffer bb = ByteBuffer.allocate(READ_AHEAD_BYTES); 053 054 while (byteCount < fcin.size()) { 055 fcin.read(bb, byteCount); 056 int i = 0; 057 for (i = 0; i < bb.limit(); i++) { 058 if ((char) bb.get(i) == '\n') { 059 i++; 060 break; 061 } 062 } 063 // bb.rewind(); 064 chunkBytes.add(i + byteCount); 065 byteCount += chunkSizeBytes; 066 } 067 fcin.close(); 068 fis.close(); 069 return chunkBytes; 070 } 071 072 /** 073 * Splits a file "pumaConcolor.txt" into the target directory using the suffix ("part") like so: 074 * - pumaConcolor_part_0.txt 075 * - pumaConcolor_part_1.txt 076 * - pumaConcolor_part_2.txt 077 * Returns the files parts 078 */ 079 public static List<File> split( 080 File from, File targetDirectory, String suffix, long chunkSizeBytes) throws IOException { 081 List<File> files = new ArrayList<File>(); 082 FileInputStream fis = new FileInputStream(from); 083 FileChannel fcin = fis.getChannel(); 084 085 String filePartNamePrefix = ""; 086 String filePartNameSuffix = ""; 087 if (from.getName().contains(".")) { 088 filePartNamePrefix = from.getName().substring(0, from.getName().indexOf(".")); 089 filePartNameSuffix = from.getName().substring(from.getName().indexOf(".")); 090 } else { 091 filePartNamePrefix = from.getName(); 092 } 093 094 long byteCount = 0; 095 int filePartCount = 0; 096 097 while (byteCount < fcin.size()) { 098 long time = System.currentTimeMillis(); 099 // create the output file 100 String fileName = 101 filePartNamePrefix + SEPARATOR + suffix + SEPARATOR + filePartCount + filePartNameSuffix; 102 File to = new File(targetDirectory, fileName); 103 files.add(to); 104 105 // copy to the new file 106 FileOutputStream fos = new FileOutputStream(to); 107 FileChannel fcout = fos.getChannel(); 108 fcin.transferTo(byteCount, chunkSizeBytes, fcout); 109 byteCount += chunkSizeBytes; 110 111 // now we need to read and transfer to the end of the line... 112 ByteBuffer bb = ByteBuffer.allocate(READ_AHEAD_BYTES); 113 fcin.read(bb, byteCount); 114 int i = 0; 115 for (i = 0; i < bb.limit(); i++) { 116 if ((char) bb.get(i) == '\n') { 117 i++; 118 break; 119 } 120 } 121 bb.rewind(); 122 bb.limit(i); 123 fcout.write(bb); 124 byteCount += i; 125 126 fcout.close(); 127 fos.close(); 128 filePartCount++; 129 LOG.debug( 130 "Filepart[" 131 + fileName 132 + "] created in " 133 + (1 + System.currentTimeMillis() - time) / 1000 134 + " secs"); 135 } 136 137 // TODO - have tested but need to test thoroughly... 138 // what if the file was smaller than 32 meg? 139 // what if the chunk size was exactly the file size? 140 // did the joins line up properly? 141 // what if the read ahead line did not get to the end of the line? 142 143 fcin.close(); 144 fis.close(); 145 return files; 146 } 147 148 private FileSplitter() { 149 throw new UnsupportedOperationException("Can't initialize class"); 150 } 151}