/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.gbif.dwc.io;

import org.gbif.utils.file.FileUtils;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

import org.apache.commons.text.StringTokenizer;
import org.junit.jupiter.api.Test;

/**
 * Informal timing comparison of a {@link StringTokenizer} configured with a single-character
 * delimiter versus the same delimiter configured as a string.
 *
 * <p>The timings are only printed to standard output; nothing is asserted.
 */
public class StrTokenizerPerformance {

  /** Number of nanoseconds in one millisecond, used to convert {@link System#nanoTime()} deltas. */
  private static final long NANOS_PER_MILLI = 1_000_000L;

  /**
   * Reads the given file line by line as UTF-8 and tokenizes every line with the supplied
   * tokenizer.
   *
   * @param tokenizer the pre-configured tokenizer; it is {@code reset} with each line read
   * @param source the file to read
   * @return the elapsed time in milliseconds spent reading and tokenizing all lines (opening and
   *     closing the file is not included)
   * @throws IOException if the file cannot be opened or read
   */
  private long test(StringTokenizer tokenizer, File source) throws IOException {
    // try-with-resources guarantees the file handle is released even if readLine() throws
    try (FileInputStream fis = new FileInputStream(source);
        BufferedReader br =
            new BufferedReader(new InputStreamReader(fis, StandardCharsets.UTF_8))) {

      // keep track of time while iterating; nanoTime is monotonic, unlike currentTimeMillis
      long start = System.nanoTime();
      String row;
      while ((row = br.readLine()) != null) {
        tokenizer.reset(row);
        // the resulting tokens are not needed; producing them is the work being measured
        tokenizer.getTokenArray();
      }
      return (System.nanoTime() - start) / NANOS_PER_MILLI;
    }
  }

  /**
   * Tokenizes the classpath file {@code irmng.tail} twice on tab delimiters, once with a char
   * delimiter and once with a string delimiter, and prints the time each run took.
   *
   * @throws IOException if the source file cannot be read
   */
  @Test
  public void testCharVsStringPerformance() throws IOException {
    File source = FileUtils.getClasspathFile("irmng.tail");

    // test CHAR
    StringTokenizer tokenizer = new StringTokenizer();
    tokenizer.setDelimiterChar('\t');
    tokenizer.setEmptyTokenAsNull(true);
    tokenizer.setIgnoreEmptyTokens(false);
    long time = test(tokenizer, source);
    System.out.println(time + " milliseconds for CHAR based tokenizer.");

    // test STRING
    tokenizer = new StringTokenizer();
    tokenizer.setDelimiterString("\t");
    tokenizer.setEmptyTokenAsNull(true);
    // same empty-token handling as the CHAR run so both tokenizers do comparable work
    tokenizer.setIgnoreEmptyTokens(false);
    time = test(tokenizer, source);
    System.out.println(time + " milliseconds for STRING based tokenizer.");
  }
}