001/* 002 * Licensed under the Apache License, Version 2.0 (the "License"); 003 * you may not use this file except in compliance with the License. 004 * You may obtain a copy of the License at 005 * 006 * http://www.apache.org/licenses/LICENSE-2.0 007 * 008 * Unless required by applicable law or agreed to in writing, software 009 * distributed under the License is distributed on an "AS IS" BASIS, 010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 011 * See the License for the specific language governing permissions and 012 * limitations under the License. 013 */ 014package org.gbif.utils.text; 015 016import java.io.PrintWriter; 017import java.io.StringWriter; 018import java.io.Writer; 019import java.nio.ByteBuffer; 020import java.nio.charset.CharacterCodingException; 021import java.nio.charset.CharsetDecoder; 022import java.nio.charset.StandardCharsets; 023import java.text.Normalizer; 024import java.util.Arrays; 025import java.util.Calendar; 026import java.util.HashMap; 027import java.util.Map; 028import java.util.Objects; 029import java.util.Random; 030import java.util.function.Function; 031import java.util.regex.Pattern; 032import java.util.stream.Collectors; 033 034import org.apache.commons.lang3.text.WordUtils; 035 036/** 037 * Utils class adding specific string methods to existing guava Strings and 038 * commons {@link org.apache.commons.lang3.StringUtils}. 039 */ 040public final class StringUtils { 041 042 private static final Pattern MARKER = Pattern.compile("\\p{M}"); 043 public static final int LINNEAN_YEAR = 1751; 044 private static final String CONS = "BCDFGHJKLMNPQRSTVWXYZ"; 045 private static final Pattern OCT = Pattern.compile("^[0-7]+$"); 046 private static final Pattern HEX = Pattern.compile("^[0-9abcdefABCDEF]+$"); 047 048 private static final String VOC = "AEIOU"; 049 private static final Random RND = new Random(); 050 051 /** 052 * This includes some special whitespaces which not present in standard trim list: 053 * <ul> 054 * <li>U+0085 Next Line (NEL)</li> 055 * <li>U+00A0 No-Break Space (NBSP)</li> 056 * <li>U+000C Form Feed (FF)</li> 057 * <li>U+2007 Figure Space </li> 058 * </ul> 059 */ 060 public static final String WHITESPACES_LIST = 061 "" 062 + "\u2002\u3000\r\u0085\u200A\u2005\u2000\u3000" 063 + "\u2029\u000B\u3000\u2008\u2003\u205F\u3000\u1680" 064 + "\u0009\u0020\u2006\u2001\u202F\u00A0\u000C\u2009" 065 + "\u3000\u2004\u3000\u3000\u2028\n\u2007\u3000"; 066 067 private StringUtils() {} 068 069 /** 070 * Removes accents & diacretics and converts ligatures into several chars 071 * @param x string to fold into ASCII 072 * @return string converted to ASCII equivalent, expanding common ligatures 073 */ 074 public static String foldToAscii(String x) { 075 if (x == null) { 076 return null; 077 } 078 x = replaceSpecialCases(x); 079 // use java unicode normalizer to remove accents 080 x = Normalizer.normalize(x, Normalizer.Form.NFD); 081 return MARKER.matcher(x).replaceAll(""); 082 } 083 084 /** 085 * Apply a function then join the result using a space if not null. 086 * E.g. can be used with apache.commons.lang3.StringUtils::trimToNull to compose a name when some parts are 087 * optionals. 088 * 089 * @param fct the function to apply or Function.identity() if none 090 * @param parts 091 * 092 * @return a String that represents all parts joined by a space or empty String. Never null. 093 */ 094 public static String thenJoin(Function<String, String> fct, String... parts) { 095 Objects.requireNonNull( 096 fct, "fct shall be provided, use Function.identity() is you want to use the String as is"); 097 return Arrays.stream(parts != null ? parts : new String[0]) 098 .map(fct) 099 .filter(Objects::nonNull) 100 .collect(Collectors.joining(" ")); 101 } 102 103 /** 104 * The Normalizer misses a few cases and 2 char ligatures which we deal with here 105 */ 106 private static String replaceSpecialCases(String x) { 107 StringBuilder sb = new StringBuilder(); 108 109 for (int i = 0; i < x.length(); i++) { 110 char c = x.charAt(i); 111 switch (c) { 112 case 'ß': 113 sb.append("ss"); 114 break; 115 case 'Æ': 116 sb.append("AE"); 117 break; 118 case 'æ': 119 sb.append("ae"); 120 break; 121 case 'Ð': 122 sb.append("D"); 123 break; 124 case 'đ': 125 sb.append("d"); 126 break; 127 case 'ð': 128 sb.append("d"); 129 break; 130 case 'Ø': 131 sb.append("O"); 132 break; 133 case 'ø': 134 sb.append("o"); 135 break; 136 case 'Œ': 137 sb.append("OE"); 138 break; 139 case 'œ': 140 sb.append("oe"); 141 break; 142 case 'Ŧ': 143 sb.append("T"); 144 break; 145 case 'ŧ': 146 sb.append("t"); 147 break; 148 case 'Ł': 149 sb.append("L"); 150 break; 151 case 'ł': 152 sb.append("l"); 153 break; 154 default: 155 sb.append(c); 156 } 157 } 158 return sb.toString(); 159 } 160 161 /** 162 * Increase a given string by 1, i.e. increase the last char in that string by one. 163 * If its a z or Z the char before is increased instead and a new char a is appended. 164 * Only true letters are increased, but spaces, punctuation or numbers remain unchanged. 165 * Null values stay null and empty strings empty. 166 * The case of existing characters will be kept and the appended chars will use the case of the last char of the 167 * original string. 168 * 169 * For example "Carlb" becomes "Carla", "Aua" "Atz", "zZz" "aAaa" or "Abies zzz" "Abiet aaa". 170 * 171 * @param x 172 * @return 173 */ 174 public static String increase(String x) { 175 if (x == null) { 176 return null; 177 } 178 if (x.equals("")) { 179 return x; 180 } 181 182 char[] chars = x.toCharArray(); 183 int idx = chars.length - 1; 184 boolean appendingNeeded = false; 185 Character lastOriginalChar = null; 186 187 while (idx >= 0) { 188 char c = chars[idx]; 189 if (!Character.isLetter(c)) { 190 idx--; 191 continue; 192 } 193 194 if (lastOriginalChar == null) { 195 lastOriginalChar = c; 196 } 197 198 if (c == 'z') { 199 chars[idx] = 'a'; 200 appendingNeeded = true; 201 202 } else if (c == 'Z') { 203 chars[idx] = 'A'; 204 appendingNeeded = true; 205 206 } else { 207 c++; 208 chars[idx] = c; 209 appendingNeeded = false; 210 break; 211 } 212 idx--; 213 } 214 215 // first char, also append to end 216 if (appendingNeeded) { 217 char append = 218 (lastOriginalChar == null || Character.isLowerCase(lastOriginalChar)) ? 'a' : 'A'; 219 return String.valueOf(chars) + append; 220 221 } else { 222 return String.valueOf(chars); 223 } 224 } 225 226 /** 227 * Creates a random species binomial with no meaning at all, but highly randomized. 228 * 229 * @return a random canonical species name 230 */ 231 public static String randomSpecies() { 232 return randomGenus() + " " + randomEpithet(); 233 } 234 235 public static String randomGenus() { 236 return WordUtils.capitalize(randomString(RND.nextInt(9) + 3).toLowerCase()); 237 } 238 239 public static String randomEpithet() { 240 return randomString(RND.nextInt(12) + 4).toLowerCase(); 241 } 242 243 public static String randomFamily() { 244 return WordUtils.capitalize(StringUtils.randomString(RND.nextInt(15) + 5).toLowerCase()) 245 + "idae"; 246 } 247 248 public static String randomAuthor() { 249 return WordUtils.capitalize(StringUtils.randomString(RND.nextInt(12) + 1).toLowerCase()); 250 } 251 252 /** 253 * Creates a random string in upper case of given length with purely latin characters only. 254 * Vocals are used much more frequently than consonants 255 * @param len 256 * @return a random string in upper case 257 */ 258 public static String randomString(int len) { 259 StringBuilder sb = new StringBuilder(len); 260 for (int i = 0; i < len; i++) { 261 if (RND.nextInt(3) > 1) { 262 sb.append(CONS.charAt(RND.nextInt(CONS.length()))); 263 } else { 264 sb.append(VOC.charAt(RND.nextInt(VOC.length()))); 265 } 266 } 267 268 return sb.toString(); 269 } 270 271 /** 272 * @return a year since Linnéan times 1751 before now as a 4 character long string 273 */ 274 public static String randomSpeciesYear() { 275 int maxYear = Calendar.getInstance().get(Calendar.YEAR); 276 return String.valueOf(LINNEAN_YEAR + RND.nextInt(maxYear - LINNEAN_YEAR + 1)); 277 } 278 279 /** 280 * Simple integer parsing method that does not throw any exception but 281 * returns null instead. 282 * 283 * @param x 284 * @return the parsed integer or null 285 */ 286 public static Integer parseInteger(String x) { 287 try { 288 return Integer.valueOf(x); 289 } catch (NumberFormatException e) { 290 291 } 292 return null; 293 } 294 295 /** 296 * Simple boolean parsing method that understands yes,y,true,t or 1 as true and respective values for false. 297 * It does not throw any exception but returns null instead. 298 * 299 * @param x 300 * @return the parsed integer or null 301 */ 302 public static Boolean parseBoolean(String x) { 303 x = org.apache.commons.lang3.StringUtils.trimToEmpty(x).toLowerCase(); 304 if (x.equals("true") || x.equals("t") || x.equals("1") || x.equals("yes") || x.equals("y")) { 305 return true; 306 } 307 if (x.equals("false") || x.equals("f") || x.equals("0") || x.equals("no") || x.equals("n")) { 308 return false; 309 } 310 return null; 311 } 312 313 /** 314 * Unescapes various unicode escapes if existing: 315 * 316 * java unicode escape, four hexadecimal digits 317 * \ uhhhh 318 * 319 * octal escape 320 * \nnn 321 * The octal value nnn, where nnn stands for 1 to 3 digits between ‘0’ and ‘7’. For example, the code for the ASCII 322 * ESC (escape) character is ‘\033’. 323 * 324 * hexadecimal escape 325 * \xhh... 326 * The hexadecimal value hh, where hh stands for a sequence of hexadecimal digits (‘0’–‘9’, and either ‘A’–‘F’ or 327 * ‘a’–‘f’).Like the same construct in ISO C, the escape sequence continues until the first nonhexadecimal digit is seen. 328 * However, using more than two hexadecimal digits produces undefined results. (The ‘\x’ escape sequence is not allowed 329 * in POSIX awk.) 330 * 331 * @param text string potentially containing unicode escape chars 332 * @return the unescaped string 333 */ 334 public static String unescapeUnicodeChars(String text) { 335 if (text == null) { 336 return null; 337 } 338 // replace unicode, hexadecimal or octal character encodings by iterating over the chars once 339 // 340 // java unicode escape, four hexadecimal digits 341 // \ uhhhh 342 // 343 // octal escape 344 // \nnn 345 // The octal value nnn, where nnn stands for 1 to 3 digits between ‘0’ and ‘7’. For example, the 346 // code for the ASCII 347 // ESC (escape) character is ‘\033’. 348 // 349 // hexadecimal escape 350 // \xhh... 351 // The hexadecimal value hh, where hh stands for a sequence of hexadecimal digits (‘0’–‘9’, and 352 // either ‘A’–‘F’ or 353 // ‘a’–‘f’). 354 // Like the same construct in ISO C, the escape sequence continues until the first 355 // nonhexadecimal digit is seen. 356 // However, using more than two hexadecimal digits produces undefined results. (The ‘\x’ escape 357 // sequence is not allowed 358 // in POSIX awk.) 359 int i = 0, len = text.length(); 360 char c; 361 StringBuffer sb = new StringBuffer(len); 362 while (i < len) { 363 c = text.charAt(i++); 364 if (c == '\\') { 365 if (i < len) { 366 c = text.charAt(i++); 367 try { 368 if (c == 'u' && text.length() >= i + 4) { 369 // make sure we have only hexadecimals 370 String hex = text.substring(i, i + 4); 371 if (HEX.matcher(hex).find()) { 372 c = (char) Integer.parseInt(hex, 16); 373 i += 4; 374 } else { 375 throw new NumberFormatException("No hex value: " + hex); 376 } 377 } else if (c == 'n' && text.length() >= i + 2) { 378 // make sure we have only 0-7 digits 379 String oct = text.substring(i, i + 2); 380 if (OCT.matcher(oct).find()) { 381 c = (char) Integer.parseInt(oct, 8); 382 i += 2; 383 } else { 384 throw new NumberFormatException("No octal value: " + oct); 385 } 386 } else if (c == 'x' && text.length() >= i + 2) { 387 // make sure we have only hexadecimals 388 String hex = text.substring(i, i + 2); 389 if (HEX.matcher(hex).find()) { 390 c = (char) Integer.parseInt(hex, 16); 391 i += 2; 392 } else { 393 throw new NumberFormatException("No hex value: " + hex); 394 } 395 } else if (c == 'r' || c == 'n' || c == 't') { 396 // escaped newline or tab. Replace with simple space 397 c = ' '; 398 } else { 399 throw new NumberFormatException("No char escape"); 400 } 401 } catch (NumberFormatException e) { 402 // keep original characters including \ if escape sequence was invalid 403 // but replace \n with space instead 404 if (c == 'n') { 405 c = ' '; 406 } else { 407 c = '\\'; 408 i--; 409 } 410 } 411 } 412 } // fall through: \ escapes itself, quotes any character but u 413 sb.append(c); 414 } 415 return sb.toString(); 416 } 417 418 /** 419 * Tries to decode a UTF8 string only if common UTF8 character combinations are found which are unlikely to be correctly encoded text. 420 * E.g. ü is the German Umlaut ü and indicates we have encoded utf8 text still. 421 */ 422 public static String decodeUtf8Garbage(String text) { 423 Pattern UTF8_TEST = 424 Pattern.compile( 425 "(ä|ü|ö|Ã\u0084|Ã\u009C|Ã\u0096|" 426 + // äüöÄÜÖ 427 "ñ|ø|ç|î|ô|û|Ã\u0091|Ã\u0098|Ã\u0087|Ã\u008E|Ã\u0094|Ã\u009B" 428 + // ñøçîôûÑØÇÎÔÛ 429 "á|é|ó|ú|Ã\u00AD|Ã\u0081|Ã\u0089|Ã\u0093|Ã\u009A|Ã\u008D)" // áéóúíÁÉÓÚÍ 430 , 431 Pattern.CASE_INSENSITIVE); 432 if (text != null && UTF8_TEST.matcher(text).find()) { 433 // typical utf8 combinations found. Try to decode from latin1 to utf8 434 byte[] bytes = text.getBytes(StandardCharsets.ISO_8859_1); 435 final CharsetDecoder utf8Decoder = StandardCharsets.UTF_8.newDecoder(); 436 ByteBuffer buffer = ByteBuffer.wrap(bytes); 437 try { 438 return utf8Decoder.decode(buffer).toString(); 439 } catch (CharacterCodingException e) { 440 // maybe wasnt a good idea, return original 441 } 442 } 443 return text; 444 } 445 446 /** 447 * Joins a list of objects into a string, skipping null values and calling toString on each object. 448 * @param delimiter to join the values with 449 * @param values to be joined 450 * @return 451 */ 452 public static String joinIfNotNull(String delimiter, Object... values) { 453 return Arrays.stream(values) 454 .filter(Objects::nonNull) 455 .map(Object::toString) 456 .collect(Collectors.joining(delimiter)); 457 } 458 459 /** 460 * Uppercases all keys and values in a Map. 461 * If keys clash only one entry will remain, which is not guaranteed. 462 * 463 * @param map 464 * @return new map with keys and values upper cased. 465 */ 466 public static Map<String, String> upper(Map<String, String> map) { 467 Map<String, String> upperMap = new HashMap<>(); 468 for (String k : map.keySet()) { 469 String v = map.get(k); 470 if (v != null) { 471 v = v.trim().toUpperCase(); 472 } 473 upperMap.put(k.toUpperCase(), v); 474 } 475 return upperMap; 476 } 477 478 /** 479 * Returns an empty string or the trimmed lower case version of any input, but never NULL. 480 */ 481 public static String emptyLowerCase(String str) { 482 return org.apache.commons.lang3.StringUtils.trimToEmpty(str).toLowerCase(); 483 } 484 485 /** 486 * Reads a stack trace from an exception and returns it as a String. 487 * @param aThrowable 488 * @return teh full stack trace as a String 489 */ 490 public static String getStackTrace(Throwable aThrowable) { 491 final Writer result = new StringWriter(); 492 final PrintWriter printWriter = new PrintWriter(result); 493 aThrowable.printStackTrace(printWriter); 494 return result.toString(); 495 } 496 497 /** 498 * Strips a set of whitespace characters from the start and end of a String. 499 * This is similar to String.trim() but also includes some specific characters. 500 * 501 * @param str String to be trimmed 502 * @return trimmed String 503 */ 504 public static String trim(String str) { 505 return org.apache.commons.lang3.StringUtils.strip(str, WHITESPACES_LIST); 506 } 507 508 /** 509 * Removes all whitespace characters from the String. 510 * 511 * @param str String to be processed 512 * @return String without whitespaces 513 */ 514 public static String deleteWhitespace(final String str) { 515 if (org.apache.commons.lang3.StringUtils.isEmpty(str)) { 516 return str; 517 } 518 final int sz = str.length(); 519 final char[] chs = new char[sz]; 520 int count = 0; 521 for (int i = 0; i < sz; i++) { 522 if (org.apache.commons.lang3.StringUtils.containsNone(WHITESPACES_LIST, str.charAt(i))) { 523 chs[count++] = str.charAt(i); 524 } 525 } 526 if (count == sz) { 527 return str; 528 } 529 return new String(chs, 0, count); 530 } 531}