Source code

001/*
002 * Copyright 2021 Global Biodiversity Information Facility (GBIF)
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016package org.gbif.utils.text;
017
import java.io.PrintWriter;
import java.io.StringWriter;
import java.io.Writer;
import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
import java.time.Year;
import java.util.Arrays;
import java.util.Calendar;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Random;
import java.util.function.Function;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import org.apache.commons.lang3.text.WordUtils;
037
038/**
039 * Utils class adding specific string methods to existing guava Strings and
040 * commons {@link org.apache.commons.lang3.StringUtils}.
041 */
042public final class StringUtils {
043
044  private static final Pattern MARKER = Pattern.compile("\\p{M}");
045  public static final int LINNEAN_YEAR = 1751;
046  private static final String CONS = "BCDFGHJKLMNPQRSTVWXYZ";
047  private static final Pattern OCT = Pattern.compile("^[0-7]+$");
048  private static final Pattern HEX = Pattern.compile("^[0-9abcdefABCDEF]+$");
049
050  private static final String VOC = "AEIOU";
051  private static final Random RND = new Random();
052
053  /**
054   * This includes some special whitespaces which not present in standard trim list:
055   * <ul>
056   *  <li>U+0085 Next Line (NEL)</li>
057   *  <li>U+00A0 No-Break Space (NBSP)</li>
058   *  <li>U+000C Form Feed (FF)</li>
059   *  <li>U+2007 Figure Space </li>
060   * </ul>
061   */
062  public static final String WHITESPACES_LIST = ""
063      + "\u2002\u3000\r\u0085\u200A\u2005\u2000\u3000"
064      + "\u2029\u000B\u3000\u2008\u2003\u205F\u3000\u1680"
065      + "\u0009\u0020\u2006\u2001\u202F\u00A0\u000C\u2009"
066      + "\u3000\u2004\u3000\u3000\u2028\n\u2007\u3000";
067
068  private StringUtils() {
069  }
070
071  /**
072   * Removes accents & diacretics and converts ligatures into several chars
073   * @param x string to fold into ASCII
074   * @return string converted to ASCII equivalent, expanding common ligatures
075   */
076  public static String foldToAscii(String x) {
077    if (x == null) {
078      return null;
079    }
080    x = replaceSpecialCases(x);
081    // use java unicode normalizer to remove accents
082    x = Normalizer.normalize(x, Normalizer.Form.NFD);
083    return MARKER.matcher(x).replaceAll("");
084  }
085
086  /**
087   * Apply a function then join the result using a space if not null.
088   * E.g. can be used with apache.commons.lang3.StringUtils::trimToNull to compose a name when some parts are
089   * optionals.
090   *
091   * @param fct   the function to apply or Function.identity() if none
092   * @param parts
093   *
094   * @return a String that represents all parts joined by a space or empty String. Never null.
095   */
096  public static String thenJoin(Function<String, String> fct, String... parts) {
097    Objects.requireNonNull(fct, "fct shall be provided, use Function.identity() is you want to use the String as is");
098    return Arrays.stream(parts != null ? parts : new String[0])
099            .map(fct)
100            .filter(Objects::nonNull)
101            .collect(Collectors.joining(" "));
102  }
103
104  /**
105   * The Normalizer misses a few cases and 2 char ligatures which we deal with here
106   */
107  private static String replaceSpecialCases(String x) {
108    StringBuilder sb = new StringBuilder();
109
110    for (int i = 0; i < x.length(); i++) {
111      char c = x.charAt(i);
112      switch (c) {
113        case 'ß':
114          sb.append("ss");
115          break;
116        case 'Æ':
117          sb.append("AE");
118          break;
119        case 'æ':
120          sb.append("ae");
121          break;
122        case 'Ð':
123          sb.append("D");
124          break;
125        case 'đ':
126          sb.append("d");
127          break;
128        case 'ð':
129          sb.append("d");
130          break;
131        case 'Ø':
132          sb.append("O");
133          break;
134        case 'ø':
135          sb.append("o");
136          break;
137        case 'Œ':
138          sb.append("OE");
139          break;
140        case 'œ':
141          sb.append("oe");
142          break;
143        case 'Ŧ':
144          sb.append("T");
145          break;
146        case 'ŧ':
147          sb.append("t");
148          break;
149        case 'Ł':
150          sb.append("L");
151          break;
152        case 'ł':
153          sb.append("l");
154          break;
155        default:
156          sb.append(c);
157      }
158    }
159    return sb.toString();
160  }
161
162  /**
163   * Increase a given string by 1, i.e. increase the last char in that string by one.
164   * If its a z or Z the char before is increased instead and a new char a is appended.
165   * Only true letters are increased, but spaces, punctuation or numbers remain unchanged.
166   * Null values stay null and empty strings empty.
167   * The case of existing characters will be kept and the appended chars will use the case of the last char of the
168   * original string.
169   *
170   * For example "Carlb" becomes "Carla", "Aua" "Atz", "zZz" "aAaa" or "Abies zzz" "Abiet aaa".
171   *
172   * @param x
173   * @return
174   */
175  public static String increase(String x) {
176    if (x == null) {
177      return null;
178    }
179    if (x.equals("")) {
180      return x;
181    }
182
183    char[] chars = x.toCharArray();
184    int idx = chars.length - 1;
185    boolean appendingNeeded = false;
186    Character lastOriginalChar = null;
187
188    while (idx >= 0){
189      char c = chars[idx];
190      if (!Character.isLetter(c)){
191        idx--;
192        continue;
193      }
194
195      if (lastOriginalChar == null){
196        lastOriginalChar = c;
197      }
198
199      if (c == 'z'){
200        chars[idx] = 'a';
201        appendingNeeded = true;
202
203      } else if (c == 'Z'){
204        chars[idx] = 'A';
205        appendingNeeded = true;
206
207      } else {
208        c++;
209        chars[idx] = c;
210        appendingNeeded = false;
211        break;
212      }
213      idx--;
214    }
215
216    // first char, also append to end
217    if (appendingNeeded){
218      char append = (lastOriginalChar==null || Character.isLowerCase(lastOriginalChar)) ? 'a' : 'A';
219      return String.valueOf(chars) + append;
220
221    } else {
222      return String.valueOf(chars);
223    }
224  }
225
226  /**
227   * Creates a random species binomial with no meaning at all, but highly randomized.
228   *
229   * @return a random canonical species name
230   */
231  public static String randomSpecies() {
232    return randomGenus() + " " + randomEpithet();
233  }
234
235  public static String randomGenus() {
236    return WordUtils.capitalize(randomString(RND.nextInt(9) + 3).toLowerCase());
237  }
238
239  public static String randomEpithet() {
240    return randomString(RND.nextInt(12) + 4).toLowerCase();
241  }
242  public static String randomFamily() {
243      return WordUtils.capitalize(StringUtils.randomString(RND.nextInt(15) + 5).toLowerCase()) + "idae";
244  }
245
246  public static String randomAuthor() {
247    return WordUtils.capitalize(StringUtils.randomString(RND.nextInt(12) + 1).toLowerCase());
248  }
249
250  /**
251   * Creates a random string in upper case of given length with purely latin characters only.
252   * Vocals are used much more frequently than consonants
253   * @param len
254   * @return a random string in upper case
255   */
256  public static String randomString(int len) {
257    StringBuilder sb = new StringBuilder(len);
258    for (int i = 0; i < len; i++) {
259      if (RND.nextInt(3) > 1) {
260        sb.append(CONS.charAt(RND.nextInt(CONS.length())));
261      } else {
262        sb.append(VOC.charAt(RND.nextInt(VOC.length())));
263      }
264    }
265
266    return sb.toString();
267  }
268
269  /**
270   * @return a year since Linnéan times 1751 before now as a 4 character long string
271   */
272  public static String randomSpeciesYear() {
273    int maxYear = Calendar.getInstance().get(Calendar.YEAR);
274    return String.valueOf(LINNEAN_YEAR + RND.nextInt(maxYear - LINNEAN_YEAR + 1));
275  }
276
277  /**
278   * Simple integer parsing method that does not throw any exception but
279   * returns null instead.
280   *
281   * @param x
282   * @return the parsed integer or null
283   */
284  public static Integer parseInteger(String x) {
285    try {
286      return Integer.valueOf(x);
287    } catch (NumberFormatException e) {
288
289    }
290    return null;
291  }
292
293  /**
294   * Simple boolean parsing method that understands yes,y,true,t or 1 as true and respective values for false.
295   * It does not throw any exception but returns null instead.
296   *
297   * @param x
298   * @return the parsed integer or null
299   */
300  public static Boolean parseBoolean(String x) {
301    x = org.apache.commons.lang3.StringUtils.trimToEmpty(x).toLowerCase();
302    if (x.equals("true") || x.equals("t") || x.equals("1") || x.equals("yes") || x.equals("y")) {
303      return true;
304    }
305    if (x.equals("false") || x.equals("f") || x.equals("0") || x.equals("no") || x.equals("n")) {
306      return false;
307    }
308    return null;
309  }
310
311  /**
312   * Unescapes various unicode escapes if existing:
313   *
314   * java unicode escape, four hexadecimal digits
315   * \ uhhhh
316   *
317   * octal escape
318   * \nnn
319   * The octal value nnn, where nnn stands for 1 to 3 digits between ‘0’ and ‘7’. For example, the code for the ASCII
320   * ESC (escape) character is ‘\033’.
321   *
322   * hexadecimal escape
323   * \xhh...
324   * The hexadecimal value hh, where hh stands for a sequence of hexadecimal digits (‘0’–‘9’, and either ‘A’–‘F’ or
325   * ‘a’–‘f’).Like the same construct in ISO C, the escape sequence continues until the first nonhexadecimal digit is seen.
326   * However, using more than two hexadecimal digits produces undefined results. (The ‘\x’ escape sequence is not allowed
327   * in POSIX awk.)
328   *
329   * @param text string potentially containing unicode escape chars
330   * @return the unescaped string
331   */
332  public static String unescapeUnicodeChars(String text) {
333    if (text == null) {
334      return null;
335    }
336    // replace unicode, hexadecimal or octal character encodings by iterating over the chars once
337    //
338    // java unicode escape, four hexadecimal digits
339    // \ uhhhh
340    //
341    // octal escape
342    // \nnn
343    // The octal value nnn, where nnn stands for 1 to 3 digits between ‘0’ and ‘7’. For example, the code for the ASCII
344    // ESC (escape) character is ‘\033’.
345    //
346    // hexadecimal escape
347    // \xhh...
348    // The hexadecimal value hh, where hh stands for a sequence of hexadecimal digits (‘0’–‘9’, and either ‘A’–‘F’ or
349    // ‘a’–‘f’).
350    // Like the same construct in ISO C, the escape sequence continues until the first nonhexadecimal digit is seen.
351    // However, using more than two hexadecimal digits produces undefined results. (The ‘\x’ escape sequence is not allowed
352    // in POSIX awk.)
353    int i = 0, len = text.length();
354    char c;
355    StringBuffer sb = new StringBuffer(len);
356    while (i < len) {
357      c = text.charAt(i++);
358      if (c == '\\') {
359        if (i < len) {
360          c = text.charAt(i++);
361          try {
362            if (c == 'u' && text.length() >= i + 4) {
363              // make sure we have only hexadecimals
364              String hex = text.substring(i, i + 4);
365              if (HEX.matcher(hex).find()) {
366                c = (char) Integer.parseInt(hex, 16);
367                i += 4;
368              } else {
369                throw new NumberFormatException("No hex value: " + hex);
370              }
371            } else if (c == 'n' && text.length() >= i + 2) {
372              // make sure we have only 0-7 digits
373              String oct = text.substring(i, i + 2);
374              if (OCT.matcher(oct).find()) {
375                c = (char) Integer.parseInt(oct, 8);
376                i += 2;
377              } else {
378                throw new NumberFormatException("No octal value: " + oct);
379              }
380            } else if (c == 'x' && text.length() >= i + 2) {
381              // make sure we have only hexadecimals
382              String hex = text.substring(i, i + 2);
383              if (HEX.matcher(hex).find()) {
384                c = (char) Integer.parseInt(hex, 16);
385                i += 2;
386              } else {
387                throw new NumberFormatException("No hex value: " + hex);
388              }
389            } else if (c == 'r' || c == 'n' || c == 't') {
390              // escaped newline or tab. Replace with simple space
391              c = ' ';
392            } else {
393              throw new NumberFormatException("No char escape");
394            }
395          } catch (NumberFormatException e) {
396            // keep original characters including \ if escape sequence was invalid
397            // but replace \n with space instead
398            if (c == 'n') {
399              c = ' ';
400            } else {
401              c = '\\';
402              i--;
403            }
404          }
405        }
406      } // fall through: \ escapes itself, quotes any character but u
407      sb.append(c);
408    }
409    return sb.toString();
410  }
411
412  /**
413   * Tries to decode a UTF8 string only if common UTF8 character combinations are found which are unlikely to be correctly encoded text.
414   * E.g. Ã¼ is the German Umlaut ü and indicates we have encoded utf8 text still.
415   */
416  public static String decodeUtf8Garbage(String text) {
417    Pattern UTF8_TEST = Pattern.compile("(Ã¤|Ã¼|Ã¶|Ã\u0084|Ã\u009C|Ã\u0096|" + // äüöÄÜÖ
418        "Ã±|Ã¸|Ã§|Ã®|Ã´|Ã»|Ã\u0091|Ã\u0098|Ã\u0087|Ã\u008E|Ã\u0094|Ã\u009B"  + // ñøçîôûÑØÇÎÔÛ
419        "Ã¡|Ã©|Ã³|Ãº|Ã\u00AD|Ã\u0081|Ã\u0089|Ã\u0093|Ã\u009A|Ã\u008D)"         // áéóúíÁÉÓÚÍ
420        , Pattern.CASE_INSENSITIVE);
421    if (text != null && UTF8_TEST.matcher(text).find()) {
422      // typical utf8 combinations found. Try to decode from latin1 to utf8
423      byte[] bytes = text.getBytes(StandardCharsets.ISO_8859_1);
424      final CharsetDecoder utf8Decoder = StandardCharsets.UTF_8.newDecoder();
425      ByteBuffer buffer = ByteBuffer.wrap(bytes);
426      try {
427        return utf8Decoder.decode(buffer).toString();
428      } catch (CharacterCodingException e) {
429        // maybe wasnt a good idea, return original
430      }
431    }
432    return text;
433  }
434
435  /**
436   * Joins a list of objects into a string, skipping null values and calling toString on each object.
437   * @param delimiter to join the values with
438   * @param values to be joined
439   * @return
440   */
441  public static String joinIfNotNull(String delimiter, Object... values) {
442    return Arrays.stream(values)
443        .filter(Objects::nonNull)
444        .map(Object::toString)
445        .collect(Collectors.joining(delimiter));
446  }
447
448  /**
449   * Uppercases all keys and values in a Map.
450   * If keys clash only one entry will remain, which is not guaranteed.
451   *
452   * @param map
453   * @return new map with keys and values upper cased.
454   */
455  public static Map<String, String> upper(Map<String, String> map) {
456    Map<String, String> upperMap = new HashMap<>();
457    for (String k : map.keySet()) {
458      String v = map.get(k);
459      if (v != null) {
460        v = v.trim().toUpperCase();
461      }
462      upperMap.put(k.toUpperCase(), v);
463    }
464    return upperMap;
465  }
466
467  /**
468   * Returns an empty string or the trimmed lower case version of any input, but never NULL.
469   */
470  public static String emptyLowerCase(String str) {
471    return org.apache.commons.lang3.StringUtils.trimToEmpty(str).toLowerCase();
472  }
473
474  /**
475   * Reads a stack trace from an exception and returns it as a String.
476   * @param aThrowable
477   * @return teh full stack trace as a String
478   */
479  public static String getStackTrace(Throwable aThrowable) {
480    final Writer result = new StringWriter();
481    final PrintWriter printWriter = new PrintWriter(result);
482    aThrowable.printStackTrace(printWriter);
483    return result.toString();
484  }
485
486  /**
487   * Strips a set of whitespace characters from the start and end of a String.
488   * This is similar to String.trim() but also includes some specific characters.
489   *
490   * @param str String to be trimmed
491   * @return trimmed String
492   */
493  public static String trim(String str) {
494    return org.apache.commons.lang3.StringUtils.strip(str, WHITESPACES_LIST);
495  }
496
497  /**
498   * Removes all whitespace characters from the String.
499   *
500   * @param str String to be processed
501   * @return String without whitespaces
502   */
503  public static String deleteWhitespace(final String str) {
504    if (org.apache.commons.lang3.StringUtils.isEmpty(str)) {
505      return str;
506    }
507    final int sz = str.length();
508    final char[] chs = new char[sz];
509    int count = 0;
510    for (int i = 0; i < sz; i++) {
511      if (org.apache.commons.lang3.StringUtils.containsNone(WHITESPACES_LIST, str.charAt(i))) {
512        chs[count++] = str.charAt(i);
513      }
514    }
515    if (count == sz) {
516      return str;
517    }
518    return new String(chs, 0, count);
519  }
520}