/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.gbif.common.parsers.core;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;

/**
 * A very simple dictionary parser backed by a tab delimited file.
 * <p>
 * Each usable line of the file has the form {@code key<tab>value}; the raw value is converted
 * to an instance of {@code T} by {@link #fromDictFile(String)}.
 *
 * @param <T> the type the dictionary values are converted to
 */
public abstract class FileBasedDictionaryParser<T> extends DictionaryBackedParser<T> {

  /** Column separator of the dictionary file, compiled once and shared by every {@link Source}. */
  private static final Pattern TAB = Pattern.compile("\t");

  /**
   * @param caseSensitive passed through unchanged to the superclass constructor
   */
  public FileBasedDictionaryParser(boolean caseSensitive) {
    super(caseSensitive);
  }

  /**
   * Init the parser to read the InputStream without any comment handling.
   *
   * @param input stream over the tab delimited dictionary, decoded as UTF-8
   */
  protected void init(InputStream input) {
    init(input, null);
  }

  /**
   * Init the parser to read the InputStream and ignore lines starting with the commentMarker.
   * The stream is wrapped in a {@link Source} which is handed to the inherited {@code init} overload.
   *
   * @param input stream over the tab delimited dictionary, decoded as UTF-8
   * @param commentMarker marker identifying a commented line (e.g. #) or null to read all lines
   */
  protected void init(InputStream input, String commentMarker) {
    init(new Source(input, commentMarker));
  }

  /**
   * Returns the value read from the dictionary as an instance of {@code T}.
   *
   * @param value the second column of a dictionary line, trimmed; null when that column is blank
   * @return the converted value
   */
  protected abstract T fromDictFile(String value);

  /**
   * An iterator over the entries of a tab delimited file.
   * <p>
   * Lines that start with the comment marker (when one is configured) and lines that do not split
   * into exactly two tab separated columns are skipped silently. The underlying reader is closed
   * once the end of the stream is reached or reading fails.
   */
  class Source implements Iterator<KeyValue<String, T>> {

    private final BufferedReader r;
    private final String commentMarker;
    // columns of the next entry to hand out; null when nothing is buffered
    private String[] pending;
    // set once the stream has ended or failed, so the closed reader is never read again
    private boolean exhausted;

    Source(InputStream file) {
      this(file, null);
    }

    Source(InputStream file, String commentMarker) {
      r = new BufferedReader(new InputStreamReader(file, StandardCharsets.UTF_8));
      this.commentMarker = commentMarker;
    }

    /**
     * Reads ahead to the next usable line, if one is not already buffered.
     *
     * @return true if {@link #next()} can deliver another entry
     */
    @Override
    public boolean hasNext() {
      if (pending != null) {
        return true;
      }
      if (exhausted) {
        return false;
      }

      try {
        String line;
        while ((line = r.readLine()) != null) {
          // only a validated line is ever buffered, so a read failure further down
          // can never leave a rejected line behind to be served by next()
          String[] atoms = splitIfValid(line);
          if (atoms != null) {
            pending = atoms;
            return true;
          }
        }
      } catch (IOException ignored) {
        // best effort: a read failure ends the iteration with the entries delivered so far
      }

      exhausted = true;
      close();
      return false;
    }

    /**
     * Splits a line into its columns if it is valid.
     * A valid line is not a comment (if configured) and has the form "key&lt;tab&gt;value".
     *
     * @param line a non-null line of the file
     * @return the two columns of the line, or null if the line has to be skipped
     */
    private String[] splitIfValid(String line) {
      if (commentMarker != null && line.startsWith(commentMarker)) {
        return null;
      }
      String[] atoms = TAB.split(line);
      return atoms.length == 2 ? atoms : null;
    }

    /**
     * @return the next entry: trimmed key (null if blank) and the value converted by fromDictFile
     * @throws NoSuchElementException if no further usable line exists
     */
    @Override
    public KeyValue<String, T> next() {
      if (!hasNext()) {
        throw new NoSuchElementException();
      }

      String[] atoms = pending;
      // consume the buffered entry before converting, so a failing conversion does not replay it
      pending = null;
      return new KeyValue<>(StringUtils.trimToNull(atoms[0]), fromDictFile(StringUtils.trimToNull(atoms[1])));
    }

    /**
     * Removal is not supported by this read-only iterator.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void remove() {
      throw new UnsupportedOperationException("remove");
    }

    /**
     * Closes the underlying reader; a failure while closing is ignored.
     */
    public void close() {
      try {
        r.close();
      } catch (IOException ignored) {
        // nothing sensible can be done if closing fails
      }
    }
  }
}