Source code

001/*
002 * Licensed under the Apache License, Version 2.0 (the "License");
003 * you may not use this file except in compliance with the License.
004 * You may obtain a copy of the License at
005 *
006 *     http://www.apache.org/licenses/LICENSE-2.0
007 *
008 * Unless required by applicable law or agreed to in writing, software
009 * distributed under the License is distributed on an "AS IS" BASIS,
010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
011 * See the License for the specific language governing permissions and
012 * limitations under the License.
013 */
014package org.gbif.common.parsers.date;
015
016
017import java.util.ArrayList;
018import java.util.Collections;
019import java.util.HashMap;
020import java.util.List;
021import java.util.Map;
022import java.util.Objects;
023import java.util.StringJoiner;
024import java.util.regex.Pattern;
025
026import org.apache.commons.lang3.StringUtils;
027
028/**
029 * The contract of the {@link TextualMonthDateTokenizer} is to break a string representing a date with a textual
030 * month representation into a list of {@link DateToken}.
031 *
032 * A {@link DateToken} should be interpreted as a possible/candidate for date part. No validation will be performed
033 * by the {@link TextualMonthDateTokenizer}, simply patterns matching.
034 *
035 * The class it-self is Thread-Safe, see nested classes for specific details about them.
036 */
037public class TextualMonthDateTokenizer {
038
039  public enum TokenType {
040    /** Matches 1 or 2 integer(s) (possibly a day) */
041    INT_2,
042    /** Matches 4 integers (possibly a year) */
043    INT_4,
044    /** Matches between 1 and 10 letters including the dot (.) */
045    TEXT
046  }
047
048  private static final Pattern SEPARATOR_PATTERN =  Pattern.compile("[^A-Za-z0-9.]+");
049  private static final Pattern DAY_SUFFIXES_PATTERN =  Pattern.compile("(?<=[0-9]{1,2})(st|nd|rd|th|\\.)",
050          Pattern.CASE_INSENSITIVE);
051
052  private static final Map<TokenType, Pattern> PATTERNS_BY_TYPE;
053
054  static {
055    Map<TokenType, Pattern> patternsByType = new HashMap<>();
056    patternsByType.put(TokenType.INT_2, Pattern.compile("[0-9]{1,2}"));
057    patternsByType.put(TokenType.INT_4, Pattern.compile("[0-9]{4}"));
058    patternsByType.put(TokenType.TEXT, Pattern.compile("[A-Za-z.]{1,10}"));
059    PATTERNS_BY_TYPE = Collections.unmodifiableMap(patternsByType);
060  }
061
062  /**
063   * Private constructor use static method {@link #newInstance()}
064   */
065  private TextualMonthDateTokenizer() {}
066
067  public static TextualMonthDateTokenizer newInstance(){
068    return new TextualMonthDateTokenizer();
069  }
070
071  /**
072   * Tokenize a string into a {@link DateTokens}.
073   *
074   * @param str
075   * @return {@link DateTokens} instance, or null if str is null or empty
076   */
077  public DateTokens tokenize(String str) {
078    if (StringUtils.isBlank(str)) {
079      return null;
080    }
081
082    str = DAY_SUFFIXES_PATTERN.matcher(str).replaceAll("");
083    DateTokens tokens = new DateTokens();
084
085    String[] parts = SEPARATOR_PATTERN.split(str);
086    for (String part : parts) {
087      for (TokenType tokenType : PATTERNS_BY_TYPE.keySet()) {
088        if (PATTERNS_BY_TYPE.get(tokenType).matcher(part).matches()) {
089          tokens.addToken(new DateToken(part, tokenType));
090          //should always match only on pattern
091          break;
092        }
093      }
094    }
095    return tokens;
096  }
097
098  /**
099   * Contains the result of the tokenization.
100   * DateToken are stored by TokenType on a 1 to 1 assumption.
101   * If a DateToken already exists for the same TokenType it will be replaced and the previous one will be moved to the
102   * discardedTokens list.
103   *
104   * This class is NOT Thread-Safe
105   *
106   */
107  public static class DateTokens {
108    private final Map<TokenType, DateToken> tokens = new HashMap<>(3);
109    private List<DateToken> discardedTokens = null;
110
111    private void addToken(DateToken dateToken){
112      DateToken prev = tokens.put(dateToken.type, dateToken);
113      if(prev != null){
114        addDiscardedToken(prev);
115      }
116    }
117
118    private void addDiscardedToken(DateToken dateToken){
119      if(discardedTokens == null){
120        discardedTokens = new ArrayList<>();
121      }
122      discardedTokens.add(dateToken);
123    }
124
125    /**
126     * Checks if some DateToken were discarded during the tokenization.
127     *
128     * @return
129     */
130    public boolean containsDiscardedTokens(){
131      return discardedTokens != null;
132    }
133
134    /**
135     * Size does NOT include discarded token(s).
136     *
137     * @return
138     */
139    public int size(){
140      return tokens.size();
141    }
142
143    public DateToken getToken(TokenType tokenType){
144      return tokens.get(tokenType);
145    }
146
147    public List<DateToken> getDiscardedTokens() {
148      return Collections.unmodifiableList(discardedTokens);
149    }
150
151    @Override
152    public String toString() {
153      return new StringJoiner(", ", DateTokens.class.getSimpleName() + "[", "]")
154          .add("tokens=" + tokens)
155          .add("discardedTokens=" + discardedTokens)
156          .toString();
157    }
158  }
159
160  /**
161   * Represents a possible candidate for date part. The value of the token represents what was provided and
162   * may or may not be valid.
163   *
164   * This class is Thread-Safe.
165   */
166  public static class DateToken {
167    private final String token;
168    private final TokenType type;
169
170    DateToken(String token, TokenType tokenType){
171      this.token = token;
172      this.type = tokenType;
173    }
174
175    public String getToken() {
176      return token;
177    }
178
179    @Override
180    public String toString() {
181      return new StringJoiner(", ", DateToken.class.getSimpleName() + "[", "]")
182          .add("token='" + token + "'")
183          .add("type=" + type)
184          .toString();
185    }
186
187    @Override
188    public boolean equals(Object o) {
189      if (this == o) {
190        return true;
191      }
192      if (!(o instanceof DateToken)) {
193        return false;
194      }
195      DateToken dateToken = (DateToken) o;
196      return Objects.equals(token, dateToken.token) && type == dateToken.type;
197    }
198
199    @Override
200    public int hashCode() {
201      return Objects.hash(token, type);
202    }
203  }
204}