001/*
002 * Licensed under the Apache License, Version 2.0 (the "License");
003 * you may not use this file except in compliance with the License.
004 * You may obtain a copy of the License at
005 *
006 *     http://www.apache.org/licenses/LICENSE-2.0
007 *
008 * Unless required by applicable law or agreed to in writing, software
009 * distributed under the License is distributed on an "AS IS" BASIS,
010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
011 * See the License for the specific language governing permissions and
012 * limitations under the License.
013 */
014package org.gbif.common.parsers.date;
015
import org.gbif.utils.file.csv.CSVReader;
import org.gbif.utils.file.csv.CSVReaderFactory;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.time.Month;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.builder.ToStringBuilder;
import org.apache.commons.lang3.builder.ToStringStyle;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
036
/**
 * The contract of DatePartsNormalizer is to take Strings representing a year, month and day (or a year and a
 * day of year) and return the corresponding Integers as a {@link NormalizedYearMonthDay} (or a
 * {@link NormalizedYearDayOfYear}) where possible.
 *
 * No validation is applied to the normalized values.
 *
 * Thread-safe after creation.
 *
 */
046public class DatePartsNormalizer {
047
048  private static final Logger LOG = LoggerFactory.getLogger(DatePartsNormalizer.class);
049
050  private static final String STRING_NULL = "\\N";
051  private static final String COLUMN_SEPARATOR = "\t";
052
053  private static final String COMMENT_MARKER = "#";
054  private static final String MONTH_FILEPATH = "/dictionaries/parse/month.tsv";
055
056  private static final String[][] MONTHS = new String[Month.values().length][];
057
058  // Load all the month names and alternative month names from a file
059  static {
060    Map<String, Set<String>> monthMap = new HashMap<>(Month.values().length);
061    String keyName;
062    for (Month m : Month.values()) {
063      keyName = m.name().toLowerCase();
064      monthMap.put(keyName, new HashSet<>());
065      //add the key itself
066      monthMap.get(keyName).add(keyName);
067    }
068
069    InputStream monthFileStream = DatePartsNormalizer.class.getResourceAsStream(MONTH_FILEPATH);
070
071    if (monthFileStream == null) {
072      LOG.error("Month file can not be loaded. File not found: {}", MONTH_FILEPATH);
073    } else {
074      try (CSVReader csv = CSVReaderFactory.build(monthFileStream, StandardCharsets.UTF_8.name(), COLUMN_SEPARATOR, null, 0)) {
075        while (csv.hasNext()) {
076          String[] row = csv.next();
077          if (row == null || StringUtils.isBlank(row[0]) || row[0].startsWith(COMMENT_MARKER)) {
078            continue;
079          }
080          String monthKey = row[0].toLowerCase();
081          if (monthMap.containsKey(monthKey)) {
082            List<String> split = Arrays.stream(StringUtils.split(row[1], ','))
083                .map(org.gbif.utils.text.StringUtils::trim)
084                .filter(StringUtils::isNotEmpty)
085                .collect(Collectors.toList());
086
087            for (String monthAltName : split) {
088              monthMap.get(monthKey).add(monthAltName.toLowerCase());
089            }
090          } else {
091            LOG.error("Unknown month “{}” found in: {}", monthKey, MONTH_FILEPATH);
092          }
093        }
094      } catch (IOException e) {
095        LOG.error("Error loading month alternative names", e);
096      }
097
098      // keep it in an array
099      int index = 0;
100      for (Month m : Month.values()) {
101        MONTHS[index] = monthMap.get(m.name().toLowerCase()).toArray(new String[0]);
102        index++;
103      }
104    }
105  }
106
107  /**
108   * Private constructor use static method {@link #newInstance()}
109   */
110  private DatePartsNormalizer(){}
111
112  public static DatePartsNormalizer newInstance(){
113    return new DatePartsNormalizer();
114  }
115
116  /**
117   * Normalize date parts value.
118   *
119   * @param year
120   * @param month
121   * @param day
122   * @return result of normalization as NormalizedYearMonthDay
123   */
124  public NormalizedYearMonthDay normalize(String year, String month, String day) {
125    year = normalizeFloat(year);
126    month = normalizeFloat(month);
127    day = normalizeFloat(day);
128
129    Integer monthAsInt = parseOrNull(month);
130    if(monthAsInt == null){
131      monthAsInt = monthNameToNumerical(month);
132    }
133
134    Integer iYear = parseOrNull(year);
135    Integer iMonth = monthAsInt;
136    Integer iDay = parseOrNull(day);
137
138    boolean yearDiscarded = wasDiscarded(year, iYear);
139    boolean monthDiscarded = wasDiscarded(month, iMonth);
140    boolean dayDiscarded = wasDiscarded(day, iDay);
141
142    return new NormalizedYearMonthDay(iYear, iMonth, iDay, yearDiscarded, monthDiscarded, dayDiscarded);
143  }
144
145  /**
146   * Normalize date parts value.
147   *
148   * @param year
149   * @param dayOfYear
150   * @return result of normalization as NormalizedYearMonthDay
151   */
152  public NormalizedYearDayOfYear normalize(String year, String dayOfYear) {
153    year = normalizeFloat(year);
154    dayOfYear = normalizeFloat(dayOfYear);
155
156    Integer iYear = parseOrNull(year);
157    Integer iDayOfYear = parseOrNull(dayOfYear);
158
159    boolean yearDiscarded = wasDiscarded(year, iYear);
160    boolean dayOfYearDiscarded = wasDiscarded(dayOfYear, iDayOfYear);
161
162    return new NormalizedYearDayOfYear(iYear, iDayOfYear, yearDiscarded, dayOfYearDiscarded);
163  }
164
165  /**
166   * Often months come in the form Sept. September etc. This will convert many variations into the numerical version
167   *
168   * @param month name to normalize
169   *
170   * @return the numerical value of the month (January == 1 )
171   */
172  public Integer monthNameToNumerical(String month) {
173    if (StringUtils.isNotBlank(month)) {
174      int m = 1;
175      for (String[] monthValues : MONTHS) {
176        for (String monthVal : monthValues) {
177          if (monthVal.equals(month.toLowerCase()) || (monthVal+ ".").equals(month.toLowerCase())) {
178            return m;
179          }
180        }
181        m++;
182      }
183    }
184    return null;
185  }
186
187
188  /**
189   * Often values are seen as Float rather than int, due to a database export The year "1978" is actually seen as
190   * "1978.0".  Where this is detected, the string is normalized to the INT representation
191   *
192   * @param value To check
193   *
194   * @return the integer value (as String)
195   */
196  public String normalizeFloat(String value) {
197    if (value != null && value.contains(".0")) {
198      try {
199        Double d = Double.valueOf(value);
200        if ((double) d.intValue() == d) {
201          return String.valueOf(d.intValue());
202        }
203      } catch (NumberFormatException e) {
204      }
205    }
206    return value;
207  }
208
209  /**
210   * Try to parse the provided String as Integer. Returns null if not possible.
211   * This function will trim the provided String.
212   * @param integer
213   * @return
214   */
215  private Integer parseOrNull(String integer){
216    if(integer != null){
217      integer = integer.trim();
218    }
219
220    try{
221      return Integer.valueOf(integer);
222    }
223    catch(NumberFormatException nfEx){}
224    return null;
225  }
226
227  /**
228   * Assert if a String value was discarded in the normalization process.
229   *
230   * @param strValue
231   * @param intValue
232   * @return the value should be considered discarded or not
233   */
234  private boolean wasDiscarded(String strValue, Integer intValue) {
235    if (StringUtils.isBlank(strValue) || STRING_NULL.equals(strValue)) {
236      return false;
237    }
238    return intValue == null;
239  }
240
241  /**
242   * Hold result of the normalization process.
243   */
244  public static class NormalizedYearMonthDay {
245
246    private Integer year;
247    private Integer month;
248    private Integer day;
249
250    private boolean yDiscarded;
251    private boolean mDiscarded;
252    private boolean dDiscarded;
253
254    NormalizedYearMonthDay(Integer year, Integer month, Integer day, boolean yDiscarded, boolean mDiscarded,
255                           boolean dDiscarded){
256      this.year = year;
257      this.month = month;
258      this.day = day;
259
260      this.yDiscarded = yDiscarded;
261      this.mDiscarded = mDiscarded;
262      this.dDiscarded = dDiscarded;
263    }
264
265    public Integer getYear() {
266      return year;
267    }
268
269    public Integer getMonth() {
270      return month;
271    }
272
273    public Integer getDay() {
274      return day;
275    }
276
277    public boolean yDiscarded() {
278      return yDiscarded;
279    }
280
281    public boolean mDiscarded() {
282      return mDiscarded;
283    }
284
285    public boolean dDiscarded() {
286      return dDiscarded;
287    }
288
289    /**
290     * The NormalizedYearMonthDay contains at least one discarded part.
291     * @return
292     */
293    public boolean containsDiscardedPart(){
294      return yDiscarded || mDiscarded || dDiscarded;
295    }
296
297    @Override
298    public String toString() {
299      return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE)
300              .append("year", year)
301              .append("month", month)
302              .append("day", day)
303              .append("yDiscarded", yDiscarded)
304              .append("mDiscarded", mDiscarded)
305              .append("dDiscarded", dDiscarded)
306              .toString();
307    }
308  }
309
310  /**
311   * Hold result of the normalization process.
312   */
313  public static class NormalizedYearDayOfYear {
314
315    private Integer year;
316    private Integer dayOfYear;
317
318    private boolean yDiscarded;
319    private boolean dDiscarded;
320
321    NormalizedYearDayOfYear(Integer year, Integer dayOfYear, boolean yDiscarded, boolean dDiscarded){
322      this.year = year;
323      this.dayOfYear = dayOfYear;
324
325      this.yDiscarded = yDiscarded;
326      this.dDiscarded = dDiscarded;
327    }
328
329    public Integer getYear() {
330      return year;
331    }
332
333    public Integer getDayOfYear() {
334      return dayOfYear;
335    }
336
337    public boolean yDiscarded() {
338      return yDiscarded;
339    }
340
341    public boolean dDiscarded() {
342      return dDiscarded;
343    }
344
345    /**
346     * The NormalizedYearDayOfYEar contains at least one discarded part.
347     * @return
348     */
349    public boolean containsDiscardedPart(){
350      return yDiscarded || dDiscarded;
351    }
352
353    @Override
354    public String toString() {
355      return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE)
356        .append("year", year)
357        .append("dayOfYear", dayOfYear)
358        .append("yDiscarded", yDiscarded)
359        .append("dDiscarded", dDiscarded)
360        .toString();
361    }
362  }
363}