001/* 002 * Copyright 2021 Global Biodiversity Information Facility (GBIF) 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016package org.gbif.utils.text; 017 018import java.io.PrintWriter; 019import java.io.StringWriter; 020import java.io.Writer; 021import java.nio.ByteBuffer; 022import java.nio.charset.CharacterCodingException; 023import java.nio.charset.CharsetDecoder; 024import java.nio.charset.StandardCharsets; 025import java.text.Normalizer; 026import java.util.Arrays; 027import java.util.Calendar; 028import java.util.HashMap; 029import java.util.Map; 030import java.util.Objects; 031import java.util.Random; 032import java.util.function.Function; 033import java.util.regex.Pattern; 034import java.util.stream.Collectors; 035 036import org.apache.commons.lang3.text.WordUtils; 037 038/** 039 * Utils class adding specific string methods to existing guava Strings and 040 * commons {@link org.apache.commons.lang3.StringUtils}. 041 */ 042public final class StringUtils { 043 044 private static final Pattern MARKER = Pattern.compile("\\p{M}"); 045 public static final int LINNEAN_YEAR = 1751; 046 private static final String CONS = "BCDFGHJKLMNPQRSTVWXYZ"; 047 private static final Pattern OCT = Pattern.compile("^[0-7]+$"); 048 private static final Pattern HEX = Pattern.compile("^[0-9abcdefABCDEF]+$"); 049 050 private static final String VOC = "AEIOU"; 051 private static final Random RND = new Random(); 052 053 /** 054 * This includes some special whitespaces which not present in standard trim list: 055 * <ul> 056 * <li>U+0085 Next Line (NEL)</li> 057 * <li>U+00A0 No-Break Space (NBSP)</li> 058 * <li>U+000C Form Feed (FF)</li> 059 * <li>U+2007 Figure Space </li> 060 * </ul> 061 */ 062 public static final String WHITESPACES_LIST = "" 063 + "\u2002\u3000\r\u0085\u200A\u2005\u2000\u3000" 064 + "\u2029\u000B\u3000\u2008\u2003\u205F\u3000\u1680" 065 + "\u0009\u0020\u2006\u2001\u202F\u00A0\u000C\u2009" 066 + "\u3000\u2004\u3000\u3000\u2028\n\u2007\u3000"; 067 068 private StringUtils() { 069 } 070 071 /** 072 * Removes accents & diacretics and converts ligatures into several chars 073 * @param x string to fold into ASCII 074 * @return string converted to ASCII equivalent, expanding common ligatures 075 */ 076 public static String foldToAscii(String x) { 077 if (x == null) { 078 return null; 079 } 080 x = replaceSpecialCases(x); 081 // use java unicode normalizer to remove accents 082 x = Normalizer.normalize(x, Normalizer.Form.NFD); 083 return MARKER.matcher(x).replaceAll(""); 084 } 085 086 /** 087 * Apply a function then join the result using a space if not null. 088 * E.g. can be used with apache.commons.lang3.StringUtils::trimToNull to compose a name when some parts are 089 * optionals. 090 * 091 * @param fct the function to apply or Function.identity() if none 092 * @param parts 093 * 094 * @return a String that represents all parts joined by a space or empty String. Never null. 095 */ 096 public static String thenJoin(Function<String, String> fct, String... parts) { 097 Objects.requireNonNull(fct, "fct shall be provided, use Function.identity() is you want to use the String as is"); 098 return Arrays.stream(parts != null ? parts : new String[0]) 099 .map(fct) 100 .filter(Objects::nonNull) 101 .collect(Collectors.joining(" ")); 102 } 103 104 /** 105 * The Normalizer misses a few cases and 2 char ligatures which we deal with here 106 */ 107 private static String replaceSpecialCases(String x) { 108 StringBuilder sb = new StringBuilder(); 109 110 for (int i = 0; i < x.length(); i++) { 111 char c = x.charAt(i); 112 switch (c) { 113 case 'ß': 114 sb.append("ss"); 115 break; 116 case 'Æ': 117 sb.append("AE"); 118 break; 119 case 'æ': 120 sb.append("ae"); 121 break; 122 case 'Ð': 123 sb.append("D"); 124 break; 125 case 'đ': 126 sb.append("d"); 127 break; 128 case 'ð': 129 sb.append("d"); 130 break; 131 case 'Ø': 132 sb.append("O"); 133 break; 134 case 'ø': 135 sb.append("o"); 136 break; 137 case 'Œ': 138 sb.append("OE"); 139 break; 140 case 'œ': 141 sb.append("oe"); 142 break; 143 case 'Ŧ': 144 sb.append("T"); 145 break; 146 case 'ŧ': 147 sb.append("t"); 148 break; 149 case 'Ł': 150 sb.append("L"); 151 break; 152 case 'ł': 153 sb.append("l"); 154 break; 155 default: 156 sb.append(c); 157 } 158 } 159 return sb.toString(); 160 } 161 162 /** 163 * Increase a given string by 1, i.e. increase the last char in that string by one. 164 * If its a z or Z the char before is increased instead and a new char a is appended. 165 * Only true letters are increased, but spaces, punctuation or numbers remain unchanged. 166 * Null values stay null and empty strings empty. 167 * The case of existing characters will be kept and the appended chars will use the case of the last char of the 168 * original string. 169 * 170 * For example "Carlb" becomes "Carla", "Aua" "Atz", "zZz" "aAaa" or "Abies zzz" "Abiet aaa". 171 * 172 * @param x 173 * @return 174 */ 175 public static String increase(String x) { 176 if (x == null) { 177 return null; 178 } 179 if (x.equals("")) { 180 return x; 181 } 182 183 char[] chars = x.toCharArray(); 184 int idx = chars.length - 1; 185 boolean appendingNeeded = false; 186 Character lastOriginalChar = null; 187 188 while (idx >= 0){ 189 char c = chars[idx]; 190 if (!Character.isLetter(c)){ 191 idx--; 192 continue; 193 } 194 195 if (lastOriginalChar == null){ 196 lastOriginalChar = c; 197 } 198 199 if (c == 'z'){ 200 chars[idx] = 'a'; 201 appendingNeeded = true; 202 203 } else if (c == 'Z'){ 204 chars[idx] = 'A'; 205 appendingNeeded = true; 206 207 } else { 208 c++; 209 chars[idx] = c; 210 appendingNeeded = false; 211 break; 212 } 213 idx--; 214 } 215 216 // first char, also append to end 217 if (appendingNeeded){ 218 char append = (lastOriginalChar==null || Character.isLowerCase(lastOriginalChar)) ? 'a' : 'A'; 219 return String.valueOf(chars) + append; 220 221 } else { 222 return String.valueOf(chars); 223 } 224 } 225 226 /** 227 * Creates a random species binomial with no meaning at all, but highly randomized. 228 * 229 * @return a random canonical species name 230 */ 231 public static String randomSpecies() { 232 return randomGenus() + " " + randomEpithet(); 233 } 234 235 public static String randomGenus() { 236 return WordUtils.capitalize(randomString(RND.nextInt(9) + 3).toLowerCase()); 237 } 238 239 public static String randomEpithet() { 240 return randomString(RND.nextInt(12) + 4).toLowerCase(); 241 } 242 public static String randomFamily() { 243 return WordUtils.capitalize(StringUtils.randomString(RND.nextInt(15) + 5).toLowerCase()) + "idae"; 244 } 245 246 public static String randomAuthor() { 247 return WordUtils.capitalize(StringUtils.randomString(RND.nextInt(12) + 1).toLowerCase()); 248 } 249 250 /** 251 * Creates a random string in upper case of given length with purely latin characters only. 252 * Vocals are used much more frequently than consonants 253 * @param len 254 * @return a random string in upper case 255 */ 256 public static String randomString(int len) { 257 StringBuilder sb = new StringBuilder(len); 258 for (int i = 0; i < len; i++) { 259 if (RND.nextInt(3) > 1) { 260 sb.append(CONS.charAt(RND.nextInt(CONS.length()))); 261 } else { 262 sb.append(VOC.charAt(RND.nextInt(VOC.length()))); 263 } 264 } 265 266 return sb.toString(); 267 } 268 269 /** 270 * @return a year since Linnéan times 1751 before now as a 4 character long string 271 */ 272 public static String randomSpeciesYear() { 273 int maxYear = Calendar.getInstance().get(Calendar.YEAR); 274 return String.valueOf(LINNEAN_YEAR + RND.nextInt(maxYear - LINNEAN_YEAR + 1)); 275 } 276 277 /** 278 * Simple integer parsing method that does not throw any exception but 279 * returns null instead. 280 * 281 * @param x 282 * @return the parsed integer or null 283 */ 284 public static Integer parseInteger(String x) { 285 try { 286 return Integer.valueOf(x); 287 } catch (NumberFormatException e) { 288 289 } 290 return null; 291 } 292 293 /** 294 * Simple boolean parsing method that understands yes,y,true,t or 1 as true and respective values for false. 295 * It does not throw any exception but returns null instead. 296 * 297 * @param x 298 * @return the parsed integer or null 299 */ 300 public static Boolean parseBoolean(String x) { 301 x = org.apache.commons.lang3.StringUtils.trimToEmpty(x).toLowerCase(); 302 if (x.equals("true") || x.equals("t") || x.equals("1") || x.equals("yes") || x.equals("y")) { 303 return true; 304 } 305 if (x.equals("false") || x.equals("f") || x.equals("0") || x.equals("no") || x.equals("n")) { 306 return false; 307 } 308 return null; 309 } 310 311 /** 312 * Unescapes various unicode escapes if existing: 313 * 314 * java unicode escape, four hexadecimal digits 315 * \ uhhhh 316 * 317 * octal escape 318 * \nnn 319 * The octal value nnn, where nnn stands for 1 to 3 digits between ‘0’ and ‘7’. For example, the code for the ASCII 320 * ESC (escape) character is ‘\033’. 321 * 322 * hexadecimal escape 323 * \xhh... 324 * The hexadecimal value hh, where hh stands for a sequence of hexadecimal digits (‘0’–‘9’, and either ‘A’–‘F’ or 325 * ‘a’–‘f’).Like the same construct in ISO C, the escape sequence continues until the first nonhexadecimal digit is seen. 326 * However, using more than two hexadecimal digits produces undefined results. (The ‘\x’ escape sequence is not allowed 327 * in POSIX awk.) 328 * 329 * @param text string potentially containing unicode escape chars 330 * @return the unescaped string 331 */ 332 public static String unescapeUnicodeChars(String text) { 333 if (text == null) { 334 return null; 335 } 336 // replace unicode, hexadecimal or octal character encodings by iterating over the chars once 337 // 338 // java unicode escape, four hexadecimal digits 339 // \ uhhhh 340 // 341 // octal escape 342 // \nnn 343 // The octal value nnn, where nnn stands for 1 to 3 digits between ‘0’ and ‘7’. For example, the code for the ASCII 344 // ESC (escape) character is ‘\033’. 345 // 346 // hexadecimal escape 347 // \xhh... 348 // The hexadecimal value hh, where hh stands for a sequence of hexadecimal digits (‘0’–‘9’, and either ‘A’–‘F’ or 349 // ‘a’–‘f’). 350 // Like the same construct in ISO C, the escape sequence continues until the first nonhexadecimal digit is seen. 351 // However, using more than two hexadecimal digits produces undefined results. (The ‘\x’ escape sequence is not allowed 352 // in POSIX awk.) 353 int i = 0, len = text.length(); 354 char c; 355 StringBuffer sb = new StringBuffer(len); 356 while (i < len) { 357 c = text.charAt(i++); 358 if (c == '\\') { 359 if (i < len) { 360 c = text.charAt(i++); 361 try { 362 if (c == 'u' && text.length() >= i + 4) { 363 // make sure we have only hexadecimals 364 String hex = text.substring(i, i + 4); 365 if (HEX.matcher(hex).find()) { 366 c = (char) Integer.parseInt(hex, 16); 367 i += 4; 368 } else { 369 throw new NumberFormatException("No hex value: " + hex); 370 } 371 } else if (c == 'n' && text.length() >= i + 2) { 372 // make sure we have only 0-7 digits 373 String oct = text.substring(i, i + 2); 374 if (OCT.matcher(oct).find()) { 375 c = (char) Integer.parseInt(oct, 8); 376 i += 2; 377 } else { 378 throw new NumberFormatException("No octal value: " + oct); 379 } 380 } else if (c == 'x' && text.length() >= i + 2) { 381 // make sure we have only hexadecimals 382 String hex = text.substring(i, i + 2); 383 if (HEX.matcher(hex).find()) { 384 c = (char) Integer.parseInt(hex, 16); 385 i += 2; 386 } else { 387 throw new NumberFormatException("No hex value: " + hex); 388 } 389 } else if (c == 'r' || c == 'n' || c == 't') { 390 // escaped newline or tab. Replace with simple space 391 c = ' '; 392 } else { 393 throw new NumberFormatException("No char escape"); 394 } 395 } catch (NumberFormatException e) { 396 // keep original characters including \ if escape sequence was invalid 397 // but replace \n with space instead 398 if (c == 'n') { 399 c = ' '; 400 } else { 401 c = '\\'; 402 i--; 403 } 404 } 405 } 406 } // fall through: \ escapes itself, quotes any character but u 407 sb.append(c); 408 } 409 return sb.toString(); 410 } 411 412 /** 413 * Tries to decode a UTF8 string only if common UTF8 character combinations are found which are unlikely to be correctly encoded text. 414 * E.g. ü is the German Umlaut ü and indicates we have encoded utf8 text still. 415 */ 416 public static String decodeUtf8Garbage(String text) { 417 Pattern UTF8_TEST = Pattern.compile("(ä|ü|ö|Ã\u0084|Ã\u009C|Ã\u0096|" + // äüöÄÜÖ 418 "ñ|ø|ç|î|ô|û|Ã\u0091|Ã\u0098|Ã\u0087|Ã\u008E|Ã\u0094|Ã\u009B" + // ñøçîôûÑØÇÎÔÛ 419 "á|é|ó|ú|Ã\u00AD|Ã\u0081|Ã\u0089|Ã\u0093|Ã\u009A|Ã\u008D)" // áéóúíÁÉÓÚÍ 420 , Pattern.CASE_INSENSITIVE); 421 if (text != null && UTF8_TEST.matcher(text).find()) { 422 // typical utf8 combinations found. Try to decode from latin1 to utf8 423 byte[] bytes = text.getBytes(StandardCharsets.ISO_8859_1); 424 final CharsetDecoder utf8Decoder = StandardCharsets.UTF_8.newDecoder(); 425 ByteBuffer buffer = ByteBuffer.wrap(bytes); 426 try { 427 return utf8Decoder.decode(buffer).toString(); 428 } catch (CharacterCodingException e) { 429 // maybe wasnt a good idea, return original 430 } 431 } 432 return text; 433 } 434 435 /** 436 * Joins a list of objects into a string, skipping null values and calling toString on each object. 437 * @param delimiter to join the values with 438 * @param values to be joined 439 * @return 440 */ 441 public static String joinIfNotNull(String delimiter, Object... values) { 442 return Arrays.stream(values) 443 .filter(Objects::nonNull) 444 .map(Object::toString) 445 .collect(Collectors.joining(delimiter)); 446 } 447 448 /** 449 * Uppercases all keys and values in a Map. 450 * If keys clash only one entry will remain, which is not guaranteed. 451 * 452 * @param map 453 * @return new map with keys and values upper cased. 454 */ 455 public static Map<String, String> upper(Map<String, String> map) { 456 Map<String, String> upperMap = new HashMap<>(); 457 for (String k : map.keySet()) { 458 String v = map.get(k); 459 if (v != null) { 460 v = v.trim().toUpperCase(); 461 } 462 upperMap.put(k.toUpperCase(), v); 463 } 464 return upperMap; 465 } 466 467 /** 468 * Returns an empty string or the trimmed lower case version of any input, but never NULL. 469 */ 470 public static String emptyLowerCase(String str) { 471 return org.apache.commons.lang3.StringUtils.trimToEmpty(str).toLowerCase(); 472 } 473 474 /** 475 * Reads a stack trace from an exception and returns it as a String. 476 * @param aThrowable 477 * @return teh full stack trace as a String 478 */ 479 public static String getStackTrace(Throwable aThrowable) { 480 final Writer result = new StringWriter(); 481 final PrintWriter printWriter = new PrintWriter(result); 482 aThrowable.printStackTrace(printWriter); 483 return result.toString(); 484 } 485 486 /** 487 * Strips a set of whitespace characters from the start and end of a String. 488 * This is similar to String.trim() but also includes some specific characters. 489 * 490 * @param str String to be trimmed 491 * @return trimmed String 492 */ 493 public static String trim(String str) { 494 return org.apache.commons.lang3.StringUtils.strip(str, WHITESPACES_LIST); 495 } 496 497 /** 498 * Removes all whitespace characters from the String. 499 * 500 * @param str String to be processed 501 * @return String without whitespaces 502 */ 503 public static String deleteWhitespace(final String str) { 504 if (org.apache.commons.lang3.StringUtils.isEmpty(str)) { 505 return str; 506 } 507 final int sz = str.length(); 508 final char[] chs = new char[sz]; 509 int count = 0; 510 for (int i = 0; i < sz; i++) { 511 if (org.apache.commons.lang3.StringUtils.containsNone(WHITESPACES_LIST, str.charAt(i))) { 512 chs[count++] = str.charAt(i); 513 } 514 } 515 if (count == sz) { 516 return str; 517 } 518 return new String(chs, 0, count); 519 } 520}