001/*
002 * Licensed under the Apache License, Version 2.0 (the "License");
003 * you may not use this file except in compliance with the License.
004 * You may obtain a copy of the License at
005 *
006 *     http://www.apache.org/licenses/LICENSE-2.0
007 *
008 * Unless required by applicable law or agreed to in writing, software
009 * distributed under the License is distributed on an "AS IS" BASIS,
010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
011 * See the License for the specific language governing permissions and
012 * limitations under the License.
013 */
014package org.gbif.utils.file;
015
016import java.io.File;
017import java.io.FileInputStream;
018import java.io.FileOutputStream;
019import java.io.IOException;
020import java.nio.ByteBuffer;
021import java.nio.channels.FileChannel;
022import java.util.ArrayList;
023import java.util.List;
024
025import org.slf4j.Logger;
026import org.slf4j.LoggerFactory;
027
028/**
029 * A utility to split files into chucks.
030 * This is done on file size, but then extended to read to the end of the current line.
031 * Therefore a chunksize of 32meg will result in a split files of slightly more (assuming lines are
032 * not very long).
033 * This is done using NIO libraries for high performance.
034 */
035public class FileSplitter {
036
037  private static final Logger LOG = LoggerFactory.getLogger(FileSplitter.class);
038  public static final String SEPARATOR = "_";
039  public static final int READ_AHEAD_BYTES = 256;
040
041  // for the file, gives the Byte markers for reading lines, such that the lines read will
042  // approximately
043  // equate be the chunk size (slightly more as it reads to the end of the row)
044  public static List<Long> scanToChunk(File from, long chunkSizeBytes) throws IOException {
045    List<Long> chunkBytes = new ArrayList<Long>();
046    FileInputStream fis = new FileInputStream(from);
047    FileChannel fcin = fis.getChannel();
048
049    long byteCount = chunkSizeBytes;
050
051    // now we need to read and transfer to the end of the line...
052    ByteBuffer bb = ByteBuffer.allocate(READ_AHEAD_BYTES);
053
054    while (byteCount < fcin.size()) {
055      fcin.read(bb, byteCount);
056      int i = 0;
057      for (i = 0; i < bb.limit(); i++) {
058        if ((char) bb.get(i) == '\n') {
059          i++;
060          break;
061        }
062      }
063      // bb.rewind();
064      chunkBytes.add(i + byteCount);
065      byteCount += chunkSizeBytes;
066    }
067    fcin.close();
068    fis.close();
069    return chunkBytes;
070  }
071
072  /**
073   * Splits a file "pumaConcolor.txt" into the target directory using the suffix ("part") like so:
074   * - pumaConcolor_part_0.txt
075   * - pumaConcolor_part_1.txt
076   * - pumaConcolor_part_2.txt
077   * Returns the files parts
078   */
079  public static List<File> split(
080      File from, File targetDirectory, String suffix, long chunkSizeBytes) throws IOException {
081    List<File> files = new ArrayList<File>();
082    FileInputStream fis = new FileInputStream(from);
083    FileChannel fcin = fis.getChannel();
084
085    String filePartNamePrefix = "";
086    String filePartNameSuffix = "";
087    if (from.getName().contains(".")) {
088      filePartNamePrefix = from.getName().substring(0, from.getName().indexOf("."));
089      filePartNameSuffix = from.getName().substring(from.getName().indexOf("."));
090    } else {
091      filePartNamePrefix = from.getName();
092    }
093
094    long byteCount = 0;
095    int filePartCount = 0;
096
097    while (byteCount < fcin.size()) {
098      long time = System.currentTimeMillis();
099      // create the output file
100      String fileName =
101          filePartNamePrefix + SEPARATOR + suffix + SEPARATOR + filePartCount + filePartNameSuffix;
102      File to = new File(targetDirectory, fileName);
103      files.add(to);
104
105      // copy to the new file
106      FileOutputStream fos = new FileOutputStream(to);
107      FileChannel fcout = fos.getChannel();
108      fcin.transferTo(byteCount, chunkSizeBytes, fcout);
109      byteCount += chunkSizeBytes;
110
111      // now we need to read and transfer to the end of the line...
112      ByteBuffer bb = ByteBuffer.allocate(READ_AHEAD_BYTES);
113      fcin.read(bb, byteCount);
114      int i = 0;
115      for (i = 0; i < bb.limit(); i++) {
116        if ((char) bb.get(i) == '\n') {
117          i++;
118          break;
119        }
120      }
121      bb.rewind();
122      bb.limit(i);
123      fcout.write(bb);
124      byteCount += i;
125
126      fcout.close();
127      fos.close();
128      filePartCount++;
129      LOG.debug(
130          "Filepart["
131              + fileName
132              + "] created in "
133              + (1 + System.currentTimeMillis() - time) / 1000
134              + " secs");
135    }
136
137    // TODO - have tested but need to test thoroughly...
138    // what if the file was smaller than 32 meg?
139    // what if the chunk size was exactly the file size?
140    // did the joins line up properly?
141    // what if the read ahead line did not get to the end of the line?
142
143    fcin.close();
144    fis.close();
145    return files;
146  }
147
148  private FileSplitter() {
149    throw new UnsupportedOperationException("Can't initialize class");
150  }
151}