001/* 002 * Licensed under the Apache License, Version 2.0 (the "License"); 003 * you may not use this file except in compliance with the License. 004 * You may obtain a copy of the License at 005 * 006 * http://www.apache.org/licenses/LICENSE-2.0 007 * 008 * Unless required by applicable law or agreed to in writing, software 009 * distributed under the License is distributed on an "AS IS" BASIS, 010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 011 * See the License for the specific language governing permissions and 012 * limitations under the License. 013 */ 014package org.gbif.dwc.io; 015 016import org.apache.commons.text.StringTokenizer; 017import org.junit.jupiter.api.Test; 018 019import static org.junit.jupiter.api.Assertions.assertEquals; 020import static org.junit.jupiter.api.Assertions.assertNull; 021 022public class StrTokenizerTest { 023 024 @Test 025 public void testCsvQuoted() { 026 StringTokenizer tokenizer = new StringTokenizer(); 027 tokenizer.setDelimiterString(","); 028 tokenizer.setQuoteChar('"'); 029 tokenizer.setEmptyTokenAsNull(true); 030 tokenizer.setIgnoreEmptyTokens(false); 031 032 tokenizer.reset("121,432423, 9099053,\"Frieda karla L.,DC.\",Ahrens"); 033 String[] columns = tokenizer.getTokenArray(); 034 assertEquals("121", columns[0]); 035 assertEquals("432423", columns[1]); 036 assertEquals(" 9099053", columns[2]); 037 assertEquals("Frieda karla L.,DC.", columns[3]); 038 assertEquals("Ahrens", columns[4]); 039 040 tokenizer.reset(" ,4321"); 041 columns = tokenizer.getTokenArray(); 042 assertEquals(" ", columns[0]); 043 assertEquals("4321", columns[1]); 044 045 tokenizer.reset(" ,,,,zzz "); 046 columns = tokenizer.getTokenArray(); 047 assertEquals(" ", columns[0]); 048 assertNull(columns[1]); 049 assertNull(columns[2]); 050 assertNull(columns[3]); 051 assertEquals("zzz ", columns[4]); 052 053 tokenizer.reset(",,,,zzz "); 054 columns = tokenizer.getTokenArray(); 055 assertNull(columns[0]); 056 assertNull(columns[1]); 057 assertNull(columns[2]); 058 assertNull(columns[3]); 059 assertEquals("zzz ", columns[4]); 060 } 061 062 @Test 063 public void testCsvUnquoted() { 064 StringTokenizer tokenizer = new StringTokenizer(); 065 tokenizer.setDelimiterString(","); 066 tokenizer.setEmptyTokenAsNull(true); 067 tokenizer.setIgnoreEmptyTokens(false); 068 069 tokenizer.reset("121,432423, 9099053,Frieda karla L.,DC.,Ahrens"); 070 String[] columns = tokenizer.getTokenArray(); 071 assertEquals("121", columns[0]); 072 assertEquals("432423", columns[1]); 073 assertEquals(" 9099053", columns[2]); 074 assertEquals("Frieda karla L.", columns[3]); 075 assertEquals("DC.", columns[4]); 076 assertEquals("Ahrens", columns[5]); 077 078 tokenizer.reset(",,,,zzz "); 079 columns = tokenizer.getTokenArray(); 080 assertNull(columns[0]); 081 assertNull(columns[1]); 082 assertNull(columns[2]); 083 assertNull(columns[3]); 084 assertEquals("zzz ", columns[4]); 085 } 086 087 @Test 088 public void testPipes() { 089 StringTokenizer tokenizer = new StringTokenizer(); 090 tokenizer.setDelimiterChar('|'); 091 tokenizer.setQuoteChar('"'); 092 tokenizer.setEmptyTokenAsNull(true); 093 tokenizer.setIgnoreEmptyTokens(false); 094 095 tokenizer.reset("121|432423| 9099053|\"Frieda karla L.|DC.\"|Ahrens"); 096 String[] columns = tokenizer.getTokenArray(); 097 assertEquals("121", columns[0]); 098 assertEquals("432423", columns[1]); 099 assertEquals(" 9099053", columns[2]); 100 assertEquals("Frieda karla L.|DC.", columns[3]); 101 assertEquals("Ahrens", columns[4]); 102 103 tokenizer.reset(" |4321"); 104 columns = tokenizer.getTokenArray(); 105 assertEquals(" ", columns[0]); 106 assertEquals("4321", columns[1]); 107 108 tokenizer.reset(" ||||zzz "); 109 columns = tokenizer.getTokenArray(); 110 assertEquals(" ", columns[0]); 111 assertNull(columns[1]); 112 assertNull(columns[2]); 113 assertNull(columns[3]); 114 assertEquals("zzz ", columns[4]); 115 116 tokenizer.reset("||||zzz "); 117 columns = tokenizer.getTokenArray(); 118 assertNull(columns[0]); 119 assertNull(columns[1]); 120 assertNull(columns[2]); 121 assertNull(columns[3]); 122 assertEquals("zzz ", columns[4]); 123 } 124 125 @Test 126 public void testTabQuoted() { 127 StringTokenizer tokenizer = new StringTokenizer(); 128 tokenizer.setDelimiterString("\t"); 129 tokenizer.setQuoteChar('"'); 130 tokenizer.setEmptyTokenAsNull(true); 131 tokenizer.setIgnoreEmptyTokens(false); 132 133 tokenizer.reset("121\t432423\t 9099053\t\"Frieda karla L.,DC.\"\tAhrens"); 134 String[] columns = tokenizer.getTokenArray(); 135 assertEquals("121", columns[0]); 136 assertEquals("432423", columns[1]); 137 assertEquals(" 9099053", columns[2]); 138 assertEquals("Frieda karla L.,DC.", columns[3]); 139 assertEquals("Ahrens", columns[4]); 140 141 tokenizer.reset(" \t4321"); 142 columns = tokenizer.getTokenArray(); 143 assertEquals(" ", columns[0]); 144 assertEquals("4321", columns[1]); 145 146 tokenizer.reset(" \t\t\t\tzzz "); 147 columns = tokenizer.getTokenArray(); 148 assertEquals(" ", columns[0]); 149 assertNull(columns[1]); 150 assertNull(columns[2]); 151 assertNull(columns[3]); 152 assertEquals("zzz ", columns[4]); 153 154 tokenizer.reset("\t\t\t\tzzz "); 155 columns = tokenizer.getTokenArray(); 156 assertNull(columns[0]); 157 assertNull(columns[1]); 158 assertNull(columns[2]); 159 assertNull(columns[3]); 160 assertEquals("zzz ", columns[4]); 161 } 162 163 @Test 164 public void testTabUnquoted() { 165 StringTokenizer tokenizer = new StringTokenizer(); 166 tokenizer.setDelimiterString("\t"); 167 tokenizer.setEmptyTokenAsNull(true); 168 tokenizer.setIgnoreEmptyTokens(false); 169 170 tokenizer.reset("121\t432423\t 9099053\t\"Frieda karla L.,DC.\"\tAhrens"); 171 String[] columns = tokenizer.getTokenArray(); 172 assertEquals("121", columns[0]); 173 assertEquals("432423", columns[1]); 174 assertEquals(" 9099053", columns[2]); 175 assertEquals("\"Frieda karla L.,DC.\"", columns[3]); 176 assertEquals("Ahrens", columns[4]); 177 178 tokenizer.reset(" \t4321"); 179 columns = tokenizer.getTokenArray(); 180 assertEquals(" ", columns[0]); 181 assertEquals("4321", columns[1]); 182 183 tokenizer.reset(" \t\t\t\tzzz "); 184 columns = tokenizer.getTokenArray(); 185 assertEquals(" ", columns[0]); 186 assertNull(columns[1]); 187 assertNull(columns[2]); 188 assertNull(columns[3]); 189 assertEquals("zzz ", columns[4]); 190 191 tokenizer.reset("\t\t\t\tzzz "); 192 columns = tokenizer.getTokenArray(); 193 assertNull(columns[0]); 194 assertNull(columns[1]); 195 assertNull(columns[2]); 196 assertNull(columns[3]); 197 assertEquals("zzz ", columns[4]); 198 } 199 200}