001/* 002 * Licensed under the Apache License, Version 2.0 (the "License"); 003 * you may not use this file except in compliance with the License. 004 * You may obtain a copy of the License at 005 * 006 * http://www.apache.org/licenses/LICENSE-2.0 007 * 008 * Unless required by applicable law or agreed to in writing, software 009 * distributed under the License is distributed on an "AS IS" BASIS, 010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 011 * See the License for the specific language governing permissions and 012 * limitations under the License. 013 */ 014package org.gbif.common.parsers.date; 015 016import org.gbif.utils.file.csv.CSVReader; 017import org.gbif.utils.file.csv.CSVReaderFactory; 018 019import java.io.IOException; 020import java.io.InputStream; 021import java.nio.charset.StandardCharsets; 022import java.time.Month; 023import java.util.Arrays; 024import java.util.HashMap; 025import java.util.HashSet; 026import java.util.List; 027import java.util.Map; 028import java.util.Set; 029import java.util.stream.Collectors; 030 031import org.apache.commons.lang3.StringUtils; 032import org.apache.commons.lang3.builder.ToStringBuilder; 033import org.apache.commons.lang3.builder.ToStringStyle; 034import org.slf4j.Logger; 035import org.slf4j.LoggerFactory; 036 037/** 038 * DatePartsNormalizer contract is to take String representing Year, Month and Day and return the corresponding 039 * Integer as {@link NormalizedYearMonthDay} if possible. 040 * 041 * No validation will be applied to the normalized values. 042 * 043 * Thread-Safe after creation. 044 * 045 */ 046public class DatePartsNormalizer { 047 048 private static final Logger LOG = LoggerFactory.getLogger(DatePartsNormalizer.class); 049 050 private static final String STRING_NULL = "\\N"; 051 private static final String COLUMN_SEPARATOR = "\t"; 052 053 private static final String COMMENT_MARKER = "#"; 054 private static final String MONTH_FILEPATH = "/dictionaries/parse/month.tsv"; 055 056 private static final String[][] MONTHS = new String[Month.values().length][]; 057 058 // Load all the month names and alternative month names from a file 059 static { 060 Map<String, Set<String>> monthMap = new HashMap<>(Month.values().length); 061 String keyName; 062 for (Month m : Month.values()) { 063 keyName = m.name().toLowerCase(); 064 monthMap.put(keyName, new HashSet<>()); 065 //add the key itself 066 monthMap.get(keyName).add(keyName); 067 } 068 069 InputStream monthFileStream = DatePartsNormalizer.class.getResourceAsStream(MONTH_FILEPATH); 070 071 if (monthFileStream == null) { 072 LOG.error("Month file can not be loaded. File not found: {}", MONTH_FILEPATH); 073 } else { 074 try (CSVReader csv = CSVReaderFactory.build(monthFileStream, StandardCharsets.UTF_8.name(), COLUMN_SEPARATOR, null, 0)) { 075 while (csv.hasNext()) { 076 String[] row = csv.next(); 077 if (row == null || StringUtils.isBlank(row[0]) || row[0].startsWith(COMMENT_MARKER)) { 078 continue; 079 } 080 String monthKey = row[0].toLowerCase(); 081 if (monthMap.containsKey(monthKey)) { 082 List<String> split = Arrays.stream(StringUtils.split(row[1], ',')) 083 .map(org.gbif.utils.text.StringUtils::trim) 084 .filter(StringUtils::isNotEmpty) 085 .collect(Collectors.toList()); 086 087 for (String monthAltName : split) { 088 monthMap.get(monthKey).add(monthAltName.toLowerCase()); 089 } 090 } else { 091 LOG.error("Unknown month “{}” found in: {}", monthKey, MONTH_FILEPATH); 092 } 093 } 094 } catch (IOException e) { 095 LOG.error("Error loading month alternative names", e); 096 } 097 098 // keep it in an array 099 int index = 0; 100 for (Month m : Month.values()) { 101 MONTHS[index] = monthMap.get(m.name().toLowerCase()).toArray(new String[0]); 102 index++; 103 } 104 } 105 } 106 107 /** 108 * Private constructor use static method {@link #newInstance()} 109 */ 110 private DatePartsNormalizer(){} 111 112 public static DatePartsNormalizer newInstance(){ 113 return new DatePartsNormalizer(); 114 } 115 116 /** 117 * Normalize date parts value. 118 * 119 * @param year 120 * @param month 121 * @param day 122 * @return result of normalization as NormalizedYearMonthDay 123 */ 124 public NormalizedYearMonthDay normalize(String year, String month, String day) { 125 year = normalizeFloat(year); 126 month = normalizeFloat(month); 127 day = normalizeFloat(day); 128 129 Integer monthAsInt = parseOrNull(month); 130 if(monthAsInt == null){ 131 monthAsInt = monthNameToNumerical(month); 132 } 133 134 Integer iYear = parseOrNull(year); 135 Integer iMonth = monthAsInt; 136 Integer iDay = parseOrNull(day); 137 138 boolean yearDiscarded = wasDiscarded(year, iYear); 139 boolean monthDiscarded = wasDiscarded(month, iMonth); 140 boolean dayDiscarded = wasDiscarded(day, iDay); 141 142 return new NormalizedYearMonthDay(iYear, iMonth, iDay, yearDiscarded, monthDiscarded, dayDiscarded); 143 } 144 145 /** 146 * Normalize date parts value. 147 * 148 * @param year 149 * @param dayOfYear 150 * @return result of normalization as NormalizedYearMonthDay 151 */ 152 public NormalizedYearDayOfYear normalize(String year, String dayOfYear) { 153 year = normalizeFloat(year); 154 dayOfYear = normalizeFloat(dayOfYear); 155 156 Integer iYear = parseOrNull(year); 157 Integer iDayOfYear = parseOrNull(dayOfYear); 158 159 boolean yearDiscarded = wasDiscarded(year, iYear); 160 boolean dayOfYearDiscarded = wasDiscarded(dayOfYear, iDayOfYear); 161 162 return new NormalizedYearDayOfYear(iYear, iDayOfYear, yearDiscarded, dayOfYearDiscarded); 163 } 164 165 /** 166 * Often months come in the form Sept. September etc. This will convert many variations into the numerical version 167 * 168 * @param month name to normalize 169 * 170 * @return the numerical value of the month (January == 1 ) 171 */ 172 public Integer monthNameToNumerical(String month) { 173 if (StringUtils.isNotBlank(month)) { 174 int m = 1; 175 for (String[] monthValues : MONTHS) { 176 for (String monthVal : monthValues) { 177 if (monthVal.equals(month.toLowerCase()) || (monthVal+ ".").equals(month.toLowerCase())) { 178 return m; 179 } 180 } 181 m++; 182 } 183 } 184 return null; 185 } 186 187 188 /** 189 * Often values are seen as Float rather than int, due to a database export The year "1978" is actually seen as 190 * "1978.0". Where this is detected, the string is normalized to the INT representation 191 * 192 * @param value To check 193 * 194 * @return the integer value (as String) 195 */ 196 public String normalizeFloat(String value) { 197 if (value != null && value.contains(".0")) { 198 try { 199 Double d = Double.valueOf(value); 200 if ((double) d.intValue() == d) { 201 return String.valueOf(d.intValue()); 202 } 203 } catch (NumberFormatException e) { 204 } 205 } 206 return value; 207 } 208 209 /** 210 * Try to parse the provided String as Integer. Returns null if not possible. 211 * This function will trim the provided String. 212 * @param integer 213 * @return 214 */ 215 private Integer parseOrNull(String integer){ 216 if(integer != null){ 217 integer = integer.trim(); 218 } 219 220 try{ 221 return Integer.valueOf(integer); 222 } 223 catch(NumberFormatException nfEx){} 224 return null; 225 } 226 227 /** 228 * Assert if a String value was discarded in the normalization process. 229 * 230 * @param strValue 231 * @param intValue 232 * @return the value should be considered discarded or not 233 */ 234 private boolean wasDiscarded(String strValue, Integer intValue) { 235 if (StringUtils.isBlank(strValue) || STRING_NULL.equals(strValue)) { 236 return false; 237 } 238 return intValue == null; 239 } 240 241 /** 242 * Hold result of the normalization process. 243 */ 244 public static class NormalizedYearMonthDay { 245 246 private Integer year; 247 private Integer month; 248 private Integer day; 249 250 private boolean yDiscarded; 251 private boolean mDiscarded; 252 private boolean dDiscarded; 253 254 NormalizedYearMonthDay(Integer year, Integer month, Integer day, boolean yDiscarded, boolean mDiscarded, 255 boolean dDiscarded){ 256 this.year = year; 257 this.month = month; 258 this.day = day; 259 260 this.yDiscarded = yDiscarded; 261 this.mDiscarded = mDiscarded; 262 this.dDiscarded = dDiscarded; 263 } 264 265 public Integer getYear() { 266 return year; 267 } 268 269 public Integer getMonth() { 270 return month; 271 } 272 273 public Integer getDay() { 274 return day; 275 } 276 277 public boolean yDiscarded() { 278 return yDiscarded; 279 } 280 281 public boolean mDiscarded() { 282 return mDiscarded; 283 } 284 285 public boolean dDiscarded() { 286 return dDiscarded; 287 } 288 289 /** 290 * The NormalizedYearMonthDay contains at least one discarded part. 291 * @return 292 */ 293 public boolean containsDiscardedPart(){ 294 return yDiscarded || mDiscarded || dDiscarded; 295 } 296 297 @Override 298 public String toString() { 299 return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) 300 .append("year", year) 301 .append("month", month) 302 .append("day", day) 303 .append("yDiscarded", yDiscarded) 304 .append("mDiscarded", mDiscarded) 305 .append("dDiscarded", dDiscarded) 306 .toString(); 307 } 308 } 309 310 /** 311 * Hold result of the normalization process. 312 */ 313 public static class NormalizedYearDayOfYear { 314 315 private Integer year; 316 private Integer dayOfYear; 317 318 private boolean yDiscarded; 319 private boolean dDiscarded; 320 321 NormalizedYearDayOfYear(Integer year, Integer dayOfYear, boolean yDiscarded, boolean dDiscarded){ 322 this.year = year; 323 this.dayOfYear = dayOfYear; 324 325 this.yDiscarded = yDiscarded; 326 this.dDiscarded = dDiscarded; 327 } 328 329 public Integer getYear() { 330 return year; 331 } 332 333 public Integer getDayOfYear() { 334 return dayOfYear; 335 } 336 337 public boolean yDiscarded() { 338 return yDiscarded; 339 } 340 341 public boolean dDiscarded() { 342 return dDiscarded; 343 } 344 345 /** 346 * The NormalizedYearDayOfYEar contains at least one discarded part. 347 * @return 348 */ 349 public boolean containsDiscardedPart(){ 350 return yDiscarded || dDiscarded; 351 } 352 353 @Override 354 public String toString() { 355 return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) 356 .append("year", year) 357 .append("dayOfYear", dayOfYear) 358 .append("yDiscarded", yDiscarded) 359 .append("dDiscarded", dDiscarded) 360 .toString(); 361 } 362 } 363}