/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.gbif.common.parsers.date;


import java.util.ArrayList;
import java.util.Collections;
import java.util.EnumMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.StringJoiner;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;

/**
 * The contract of the {@link TextualMonthDateTokenizer} is to break a string representing a date with a textual
 * month representation into a list of {@link DateToken}.
 *
 * A {@link DateToken} should be interpreted as a possible candidate for a date part. No validation is performed
 * by the {@link TextualMonthDateTokenizer}, only pattern matching.
 *
 * The class itself holds no instance state and only uses immutable, pre-compiled patterns, so it is Thread-Safe.
 * See the nested classes for specific details about them.
 */
public class TextualMonthDateTokenizer {

  public enum TokenType {
    /** Matches 1 or 2 digits (possibly a day) */
    INT_2,
    /** Matches exactly 4 digits (possibly a year) */
    INT_4,
    /** Matches between 1 and 10 letters including the dot (.) */
    TEXT
  }

  /** Any run of characters that is neither a letter, a digit nor a dot separates two parts. */
  private static final Pattern SEPARATOR_PATTERN = Pattern.compile("[^A-Za-z0-9.]+");

  /** Ordinal suffixes (st, nd, rd, th) or a dot located directly after a digit. */
  private static final Pattern DAY_SUFFIXES_PATTERN = Pattern.compile("(?<=[0-9]{1,2})(st|nd|rd|th|\\.)",
    Pattern.CASE_INSENSITIVE);

  private static final Map<TokenType, Pattern> PATTERNS_BY_TYPE;

  static {
    // EnumMap iterates in enum declaration order, which keeps the matching order deterministic
    Map<TokenType, Pattern> patternsByType = new EnumMap<>(TokenType.class);
    patternsByType.put(TokenType.INT_2, Pattern.compile("[0-9]{1,2}"));
    patternsByType.put(TokenType.INT_4, Pattern.compile("[0-9]{4}"));
    patternsByType.put(TokenType.TEXT, Pattern.compile("[A-Za-z.]{1,10}"));
    PATTERNS_BY_TYPE = Collections.unmodifiableMap(patternsByType);
  }

  /**
   * Private constructor, use static method {@link #newInstance()}
   */
  private TextualMonthDateTokenizer() {}

  public static TextualMonthDateTokenizer newInstance() {
    return new TextualMonthDateTokenizer();
  }

  /**
   * Tokenize a string into a {@link DateTokens}.
   *
   * Day suffixes (st, nd, rd, th or a dot directly following a digit) are removed first, then the string is split
   * on every run of characters that is not a letter, a digit or a dot. Each resulting part is matched as a whole
   * against the pattern of every {@link TokenType}; parts that match none of them are silently ignored.
   *
   * @param str the string to tokenize
   * @return {@link DateTokens} instance (possibly containing no token), or null if str is null, empty or contains
   * only whitespace
   */
  public DateTokens tokenize(String str) {
    if (StringUtils.isBlank(str)) {
      return null;
    }

    String withoutDaySuffixes = DAY_SUFFIXES_PATTERN.matcher(str).replaceAll("");
    DateTokens tokens = new DateTokens();

    for (String part : SEPARATOR_PATTERN.split(withoutDaySuffixes)) {
      for (Map.Entry<TokenType, Pattern> patternByType : PATTERNS_BY_TYPE.entrySet()) {
        if (patternByType.getValue().matcher(part).matches()) {
          tokens.addToken(new DateToken(part, patternByType.getKey()));
          // the patterns are mutually exclusive, a part can match at most one of them
          break;
        }
      }
    }
    return tokens;
  }

  /**
   * Contains the result of the tokenization.
   * DateToken are stored by TokenType on a 1 to 1 assumption.
   * If a DateToken already exists for the same TokenType it will be replaced and the previous one will be moved to the
   * discardedTokens list.
   *
   * This class is NOT Thread-Safe
   *
   */
  public static class DateTokens {
    private final Map<TokenType, DateToken> tokens = new EnumMap<>(TokenType.class);
    // created lazily, stays null as long as no token was discarded
    private List<DateToken> discardedTokens = null;

    private void addToken(DateToken dateToken) {
      DateToken prev = tokens.put(dateToken.type, dateToken);
      if (prev != null) {
        addDiscardedToken(prev);
      }
    }

    private void addDiscardedToken(DateToken dateToken) {
      if (discardedTokens == null) {
        discardedTokens = new ArrayList<>();
      }
      discardedTokens.add(dateToken);
    }

    /**
     * Checks if some DateToken were discarded during the tokenization.
     *
     * @return true if at least one DateToken was replaced by a later token of the same TokenType
     */
    public boolean containsDiscardedTokens() {
      return discardedTokens != null;
    }

    /**
     * Size does NOT include discarded token(s).
     *
     * @return the number of TokenType for which a DateToken is currently stored
     */
    public int size() {
      return tokens.size();
    }

    /**
     * Get the DateToken stored for a TokenType.
     *
     * @param tokenType the type of the token to retrieve
     * @return the DateToken stored for this type, or null if there is none
     */
    public DateToken getToken(TokenType tokenType) {
      return tokens.get(tokenType);
    }

    /**
     * Get the DateToken that were replaced by a later token of the same TokenType.
     *
     * @return unmodifiable list of the discarded tokens, empty (never null) if no token was discarded
     */
    public List<DateToken> getDiscardedTokens() {
      // the list is only created on the first discard, wrapping null would throw a NullPointerException
      if (discardedTokens == null) {
        return Collections.emptyList();
      }
      return Collections.unmodifiableList(discardedTokens);
    }

    @Override
    public String toString() {
      return new StringJoiner(", ", DateTokens.class.getSimpleName() + "[", "]")
        .add("tokens=" + tokens)
        .add("discardedTokens=" + discardedTokens)
        .toString();
    }
  }

  /**
   * Represents a possible candidate for date part. The value of the token represents what was provided and
   * may or may not be valid.
   *
   * This class is immutable and therefore Thread-Safe.
   */
  public static class DateToken {
    private final String token;
    private final TokenType type;

    DateToken(String token, TokenType tokenType) {
      this.token = token;
      this.type = tokenType;
    }

    /**
     * @return the text of the token, exactly as it was matched
     */
    public String getToken() {
      return token;
    }

    @Override
    public String toString() {
      return new StringJoiner(", ", DateToken.class.getSimpleName() + "[", "]")
        .add("token='" + token + "'")
        .add("type=" + type)
        .toString();
    }

    @Override
    public boolean equals(Object o) {
      if (this == o) {
        return true;
      }
      if (!(o instanceof DateToken)) {
        return false;
      }
      DateToken dateToken = (DateToken) o;
      return Objects.equals(token, dateToken.token) && type == dateToken.type;
    }

    @Override
    public int hashCode() {
      return Objects.hash(token, type);
    }
  }
}