001/**
002 *
003 */
004package org.gbif.utils.file;
005
006import java.io.File;
007import java.io.FileInputStream;
008import java.io.FileOutputStream;
009import java.io.IOException;
010import java.nio.ByteBuffer;
011import java.nio.channels.FileChannel;
012import java.util.ArrayList;
013import java.util.List;
014
015import org.slf4j.Logger;
016import org.slf4j.LoggerFactory;
017
/**
 * A utility to split files into chunks.
 * Splitting is done on file size, but each chunk is then extended to the end of the current line.
 * Therefore a chunk size of 32 MB will result in split files of slightly more than that (assuming
 * lines are not very long).
 * This is done using NIO libraries for high performance.
 */
025public class FileSplitter {
026
027  private static final Logger LOG = LoggerFactory.getLogger(FileSplitter.class);
028  public static final String SEPARATOR = "_";
029  public static final int READ_AHEAD_BYTES = 256;
030
031  // for the file, gives the Byte markers for reading lines, such that the lines read will approximately
032  // equate be the chunk size (slightly more as it reads to the end of the row)
033  public static List<Long> scanToChunk(File from, long chunkSizeBytes) throws IOException {
034    List<Long> chunkBytes = new ArrayList<Long>();
035    FileInputStream fis = new FileInputStream(from);
036    FileChannel fcin = fis.getChannel();
037
038    long byteCount = chunkSizeBytes;
039
040    // now we need to read and transfer to the end of the line...
041    ByteBuffer bb = ByteBuffer.allocate(READ_AHEAD_BYTES);
042
043    while (byteCount < fcin.size()) {
044      fcin.read(bb, byteCount);
045      int i = 0;
046      for (i = 0; i < bb.limit(); i++) {
047        if ((char) bb.get(i) == '\n') {
048          i++;
049          break;
050        }
051      }
052      // bb.rewind();
053      chunkBytes.add(i + byteCount);
054      byteCount += chunkSizeBytes;
055    }
056    fcin.close();
057    fis.close();
058    return chunkBytes;
059  }
060
061  /**
062   * Splits a file "pumaConcolor.txt" into the target directory using the suffix ("part") like so:
063   * - pumaConcolor_part_0.txt
064   * - pumaConcolor_part_1.txt
065   * - pumaConcolor_part_2.txt
066   * Returns the files parts
067   */
068  public static List<File> split(File from, File targetDirectory, String suffix, long chunkSizeBytes)
069    throws IOException {
070    List<File> files = new ArrayList<File>();
071    FileInputStream fis = new FileInputStream(from);
072    FileChannel fcin = fis.getChannel();
073
074    String filePartNamePrefix = "";
075    String filePartNameSuffix = "";
076    if (from.getName().contains(".")) {
077      filePartNamePrefix = from.getName().substring(0, from.getName().indexOf("."));
078      filePartNameSuffix = from.getName().substring(from.getName().indexOf("."));
079    } else {
080      filePartNamePrefix = from.getName();
081    }
082
083    long byteCount = 0;
084    int filePartCount = 0;
085
086    while (byteCount < fcin.size()) {
087      long time = System.currentTimeMillis();
088      // create the output file
089      String fileName = filePartNamePrefix + SEPARATOR + suffix + SEPARATOR + filePartCount + filePartNameSuffix;
090      File to = new File(targetDirectory, fileName);
091      files.add(to);
092
093      // copy to the new file
094      FileOutputStream fos = new FileOutputStream(to);
095      FileChannel fcout = fos.getChannel();
096      fcin.transferTo(byteCount, chunkSizeBytes, fcout);
097      byteCount += chunkSizeBytes;
098
099      // now we need to read and transfer to the end of the line...
100      ByteBuffer bb = ByteBuffer.allocate(READ_AHEAD_BYTES);
101      fcin.read(bb, byteCount);
102      int i = 0;
103      for (i = 0; i < bb.limit(); i++) {
104        if ((char) bb.get(i) == '\n') {
105          i++;
106          break;
107        }
108      }
109      bb.rewind();
110      bb.limit(i);
111      fcout.write(bb);
112      byteCount += i;
113
114      fcout.close();
115      fos.close();
116      filePartCount++;
117      LOG.debug("Filepart[" + fileName + "] created in " + (1 + System.currentTimeMillis() - time) / 1000 + " secs");
118    }
119
120    // TODO - have tested but need to test thoroughly...
121    // what if the file was smaller than 32 meg?
122    // what if the chunk size was exactly the file size?
123    // did the joins line up properly?
124    // what if the read ahead line did not get to the end of the line?
125
126    fcin.close();
127    fis.close();
128    return files;
129  }
130
131  private FileSplitter() {
132    throw new UnsupportedOperationException("Can't initialize class");
133  }
134
135}