001/*
002 * Licensed under the Apache License, Version 2.0 (the "License");
003 * you may not use this file except in compliance with the License.
004 * You may obtain a copy of the License at
005 *
006 *     http://www.apache.org/licenses/LICENSE-2.0
007 *
008 * Unless required by applicable law or agreed to in writing, software
009 * distributed under the License is distributed on an "AS IS" BASIS,
010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
011 * See the License for the specific language governing permissions and
012 * limitations under the License.
013 */
014package org.gbif.dwc.io;
015
016import org.gbif.utils.file.FileUtils;
017
018import java.io.BufferedReader;
019import java.io.File;
020import java.io.FileInputStream;
021import java.io.IOException;
022import java.io.InputStreamReader;
023import java.nio.charset.StandardCharsets;
024
025import org.apache.commons.text.StringTokenizer;
026import org.junit.jupiter.api.Test;
027
028public class StrTokenizerPerformance {
029
030  private long test(StringTokenizer tokenizer, File source) throws IOException {
031    FileInputStream fis = new FileInputStream(source);
032    InputStreamReader reader = new InputStreamReader(fis, StandardCharsets.UTF_8);
033    BufferedReader br = new BufferedReader(reader);
034
035    // keep track of time while iterating
036    long start = System.currentTimeMillis();
037    String row = br.readLine();
038    while (row != null) {
039      tokenizer.reset(row);
040      String[] columns = tokenizer.getTokenArray();
041      row = br.readLine();
042    }
043    long dur = System.currentTimeMillis() - start;
044    br.close();
045    return dur;
046  }
047
048  @Test
049  public void testCharVsStringPerformance() throws IOException {
050    File source = FileUtils.getClasspathFile("irmng.tail");
051
052    // test CHAR
053    StringTokenizer tokenizer = new StringTokenizer();
054    tokenizer.setDelimiterChar('\t');
055    tokenizer.setEmptyTokenAsNull(true);
056    tokenizer.setIgnoreEmptyTokens(false);
057    long time = test(tokenizer, source);
058    System.out.println(time + " milliseconds for CHAR based tokenizer.");
059
060    // test STRING
061    tokenizer = new StringTokenizer();
062    tokenizer.setDelimiterString("\t");
063    tokenizer.setEmptyTokenAsNull(true);
064    time = test(tokenizer, source);
065    System.out.println(time + " milliseconds for STRING based tokenizer.");
066  }
067}