Source code

001/*
002 * Copyright 2021 Global Biodiversity Information Facility (GBIF)
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016package org.gbif.utils.file;
017
018import java.io.File;
019import java.io.FileInputStream;
020import java.io.FileOutputStream;
021import java.io.IOException;
022import java.nio.ByteBuffer;
023import java.nio.channels.FileChannel;
024import java.util.ArrayList;
025import java.util.List;
026
027import org.slf4j.Logger;
028import org.slf4j.LoggerFactory;
029
030/**
031 * A utility to split files into chucks.
032 * This is done on file size, but then extended to read to the end of the current line.
033 * Therefore a chunksize of 32meg will result in a split files of slightly more (assuming lines are
034 * not very long).
035 * This is done using NIO libraries for high performance.
036 */
037public class FileSplitter {
038
039  private static final Logger LOG = LoggerFactory.getLogger(FileSplitter.class);
040  public static final String SEPARATOR = "_";
041  public static final int READ_AHEAD_BYTES = 256;
042
043  // for the file, gives the Byte markers for reading lines, such that the lines read will approximately
044  // equate be the chunk size (slightly more as it reads to the end of the row)
045  public static List<Long> scanToChunk(File from, long chunkSizeBytes) throws IOException {
046    List<Long> chunkBytes = new ArrayList<Long>();
047    FileInputStream fis = new FileInputStream(from);
048    FileChannel fcin = fis.getChannel();
049
050    long byteCount = chunkSizeBytes;
051
052    // now we need to read and transfer to the end of the line...
053    ByteBuffer bb = ByteBuffer.allocate(READ_AHEAD_BYTES);
054
055    while (byteCount < fcin.size()) {
056      fcin.read(bb, byteCount);
057      int i = 0;
058      for (i = 0; i < bb.limit(); i++) {
059        if ((char) bb.get(i) == '\n') {
060          i++;
061          break;
062        }
063      }
064      // bb.rewind();
065      chunkBytes.add(i + byteCount);
066      byteCount += chunkSizeBytes;
067    }
068    fcin.close();
069    fis.close();
070    return chunkBytes;
071  }
072
073  /**
074   * Splits a file "pumaConcolor.txt" into the target directory using the suffix ("part") like so:
075   * - pumaConcolor_part_0.txt
076   * - pumaConcolor_part_1.txt
077   * - pumaConcolor_part_2.txt
078   * Returns the files parts
079   */
080  public static List<File> split(File from, File targetDirectory, String suffix, long chunkSizeBytes)
081    throws IOException {
082    List<File> files = new ArrayList<File>();
083    FileInputStream fis = new FileInputStream(from);
084    FileChannel fcin = fis.getChannel();
085
086    String filePartNamePrefix = "";
087    String filePartNameSuffix = "";
088    if (from.getName().contains(".")) {
089      filePartNamePrefix = from.getName().substring(0, from.getName().indexOf("."));
090      filePartNameSuffix = from.getName().substring(from.getName().indexOf("."));
091    } else {
092      filePartNamePrefix = from.getName();
093    }
094
095    long byteCount = 0;
096    int filePartCount = 0;
097
098    while (byteCount < fcin.size()) {
099      long time = System.currentTimeMillis();
100      // create the output file
101      String fileName = filePartNamePrefix + SEPARATOR + suffix + SEPARATOR + filePartCount + filePartNameSuffix;
102      File to = new File(targetDirectory, fileName);
103      files.add(to);
104
105      // copy to the new file
106      FileOutputStream fos = new FileOutputStream(to);
107      FileChannel fcout = fos.getChannel();
108      fcin.transferTo(byteCount, chunkSizeBytes, fcout);
109      byteCount += chunkSizeBytes;
110
111      // now we need to read and transfer to the end of the line...
112      ByteBuffer bb = ByteBuffer.allocate(READ_AHEAD_BYTES);
113      fcin.read(bb, byteCount);
114      int i = 0;
115      for (i = 0; i < bb.limit(); i++) {
116        if ((char) bb.get(i) == '\n') {
117          i++;
118          break;
119        }
120      }
121      bb.rewind();
122      bb.limit(i);
123      fcout.write(bb);
124      byteCount += i;
125
126      fcout.close();
127      fos.close();
128      filePartCount++;
129      LOG.debug("Filepart[" + fileName + "] created in " + (1 + System.currentTimeMillis() - time) / 1000 + " secs");
130    }
131
132    // TODO - have tested but need to test thoroughly...
133    // what if the file was smaller than 32 meg?
134    // what if the chunk size was exactly the file size?
135    // did the joins line up properly?
136    // what if the read ahead line did not get to the end of the line?
137
138    fcin.close();
139    fis.close();
140    return files;
141  }
142
143  private FileSplitter() {
144    throw new UnsupportedOperationException("Can't initialize class");
145  }
146
147}