001/*
002 * Licensed under the Apache License, Version 2.0 (the "License");
003 * you may not use this file except in compliance with the License.
004 * You may obtain a copy of the License at
005 *
006 *     http://www.apache.org/licenses/LICENSE-2.0
007 *
008 * Unless required by applicable law or agreed to in writing, software
009 * distributed under the License is distributed on an "AS IS" BASIS,
010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
011 * See the License for the specific language governing permissions and
012 * limitations under the License.
013 */
014package org.gbif.dwc.io;
015
016import org.apache.commons.text.StringTokenizer;
017import org.junit.jupiter.api.Test;
018
019import static org.junit.jupiter.api.Assertions.assertEquals;
020import static org.junit.jupiter.api.Assertions.assertNull;
021
022public class StrTokenizerTest {
023
024  @Test
025  public void testCsvQuoted() {
026    StringTokenizer tokenizer = new StringTokenizer();
027    tokenizer.setDelimiterString(",");
028    tokenizer.setQuoteChar('"');
029    tokenizer.setEmptyTokenAsNull(true);
030    tokenizer.setIgnoreEmptyTokens(false);
031
032    tokenizer.reset("121,432423, 9099053,\"Frieda karla L.,DC.\",Ahrens");
033    String[] columns = tokenizer.getTokenArray();
034    assertEquals("121", columns[0]);
035    assertEquals("432423", columns[1]);
036    assertEquals(" 9099053", columns[2]);
037    assertEquals("Frieda karla L.,DC.", columns[3]);
038    assertEquals("Ahrens", columns[4]);
039
040    tokenizer.reset("   ,4321");
041    columns = tokenizer.getTokenArray();
042    assertEquals("   ", columns[0]);
043    assertEquals("4321", columns[1]);
044
045    tokenizer.reset(" ,,,,zzz  ");
046    columns = tokenizer.getTokenArray();
047    assertEquals(" ", columns[0]);
048    assertNull(columns[1]);
049    assertNull(columns[2]);
050    assertNull(columns[3]);
051    assertEquals("zzz  ", columns[4]);
052
053    tokenizer.reset(",,,,zzz  ");
054    columns = tokenizer.getTokenArray();
055    assertNull(columns[0]);
056    assertNull(columns[1]);
057    assertNull(columns[2]);
058    assertNull(columns[3]);
059    assertEquals("zzz  ", columns[4]);
060  }
061
062  @Test
063  public void testCsvUnquoted() {
064    StringTokenizer tokenizer = new StringTokenizer();
065    tokenizer.setDelimiterString(",");
066    tokenizer.setEmptyTokenAsNull(true);
067    tokenizer.setIgnoreEmptyTokens(false);
068
069    tokenizer.reset("121,432423, 9099053,Frieda karla L.,DC.,Ahrens");
070    String[] columns = tokenizer.getTokenArray();
071    assertEquals("121", columns[0]);
072    assertEquals("432423", columns[1]);
073    assertEquals(" 9099053", columns[2]);
074    assertEquals("Frieda karla L.", columns[3]);
075    assertEquals("DC.", columns[4]);
076    assertEquals("Ahrens", columns[5]);
077
078    tokenizer.reset(",,,,zzz  ");
079    columns = tokenizer.getTokenArray();
080    assertNull(columns[0]);
081    assertNull(columns[1]);
082    assertNull(columns[2]);
083    assertNull(columns[3]);
084    assertEquals("zzz  ", columns[4]);
085  }
086
087  @Test
088  public void testPipes() {
089    StringTokenizer tokenizer = new StringTokenizer();
090    tokenizer.setDelimiterChar('|');
091    tokenizer.setQuoteChar('"');
092    tokenizer.setEmptyTokenAsNull(true);
093    tokenizer.setIgnoreEmptyTokens(false);
094
095    tokenizer.reset("121|432423| 9099053|\"Frieda karla L.|DC.\"|Ahrens");
096    String[] columns = tokenizer.getTokenArray();
097    assertEquals("121", columns[0]);
098    assertEquals("432423", columns[1]);
099    assertEquals(" 9099053", columns[2]);
100    assertEquals("Frieda karla L.|DC.", columns[3]);
101    assertEquals("Ahrens", columns[4]);
102
103    tokenizer.reset("   |4321");
104    columns = tokenizer.getTokenArray();
105    assertEquals("   ", columns[0]);
106    assertEquals("4321", columns[1]);
107
108    tokenizer.reset(" ||||zzz  ");
109    columns = tokenizer.getTokenArray();
110    assertEquals(" ", columns[0]);
111    assertNull(columns[1]);
112    assertNull(columns[2]);
113    assertNull(columns[3]);
114    assertEquals("zzz  ", columns[4]);
115
116    tokenizer.reset("||||zzz  ");
117    columns = tokenizer.getTokenArray();
118    assertNull(columns[0]);
119    assertNull(columns[1]);
120    assertNull(columns[2]);
121    assertNull(columns[3]);
122    assertEquals("zzz  ", columns[4]);
123  }
124
125  @Test
126  public void testTabQuoted() {
127    StringTokenizer tokenizer = new StringTokenizer();
128    tokenizer.setDelimiterString("\t");
129    tokenizer.setQuoteChar('"');
130    tokenizer.setEmptyTokenAsNull(true);
131    tokenizer.setIgnoreEmptyTokens(false);
132
133    tokenizer.reset("121\t432423\t 9099053\t\"Frieda karla L.,DC.\"\tAhrens");
134    String[] columns = tokenizer.getTokenArray();
135    assertEquals("121", columns[0]);
136    assertEquals("432423", columns[1]);
137    assertEquals(" 9099053", columns[2]);
138    assertEquals("Frieda karla L.,DC.", columns[3]);
139    assertEquals("Ahrens", columns[4]);
140
141    tokenizer.reset("   \t4321");
142    columns = tokenizer.getTokenArray();
143    assertEquals("   ", columns[0]);
144    assertEquals("4321", columns[1]);
145
146    tokenizer.reset(" \t\t\t\tzzz  ");
147    columns = tokenizer.getTokenArray();
148    assertEquals(" ", columns[0]);
149    assertNull(columns[1]);
150    assertNull(columns[2]);
151    assertNull(columns[3]);
152    assertEquals("zzz  ", columns[4]);
153
154    tokenizer.reset("\t\t\t\tzzz  ");
155    columns = tokenizer.getTokenArray();
156    assertNull(columns[0]);
157    assertNull(columns[1]);
158    assertNull(columns[2]);
159    assertNull(columns[3]);
160    assertEquals("zzz  ", columns[4]);
161  }
162
163  @Test
164  public void testTabUnquoted() {
165    StringTokenizer tokenizer = new StringTokenizer();
166    tokenizer.setDelimiterString("\t");
167    tokenizer.setEmptyTokenAsNull(true);
168    tokenizer.setIgnoreEmptyTokens(false);
169
170    tokenizer.reset("121\t432423\t 9099053\t\"Frieda karla L.,DC.\"\tAhrens");
171    String[] columns = tokenizer.getTokenArray();
172    assertEquals("121", columns[0]);
173    assertEquals("432423", columns[1]);
174    assertEquals(" 9099053", columns[2]);
175    assertEquals("\"Frieda karla L.,DC.\"", columns[3]);
176    assertEquals("Ahrens", columns[4]);
177
178    tokenizer.reset("   \t4321");
179    columns = tokenizer.getTokenArray();
180    assertEquals("   ", columns[0]);
181    assertEquals("4321", columns[1]);
182
183    tokenizer.reset(" \t\t\t\tzzz  ");
184    columns = tokenizer.getTokenArray();
185    assertEquals(" ", columns[0]);
186    assertNull(columns[1]);
187    assertNull(columns[2]);
188    assertNull(columns[3]);
189    assertEquals("zzz  ", columns[4]);
190
191    tokenizer.reset("\t\t\t\tzzz  ");
192    columns = tokenizer.getTokenArray();
193    assertNull(columns[0]);
194    assertNull(columns[1]);
195    assertNull(columns[2]);
196    assertNull(columns[3]);
197    assertEquals("zzz  ", columns[4]);
198  }
199
200}