Source code

001/*
002 * Licensed under the Apache License, Version 2.0 (the "License");
003 * you may not use this file except in compliance with the License.
004 * You may obtain a copy of the License at
005 *
006 *     http://www.apache.org/licenses/LICENSE-2.0
007 *
008 * Unless required by applicable law or agreed to in writing, software
009 * distributed under the License is distributed on an "AS IS" BASIS,
010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
011 * See the License for the specific language governing permissions and
012 * limitations under the License.
013 */
014package org.gbif.utils.text;
015
import java.io.PrintWriter;
import java.io.StringWriter;
import java.io.Writer;
import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
import java.time.Year;
import java.util.Arrays;
import java.util.Calendar;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Random;
import java.util.function.Function;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import org.apache.commons.lang3.text.WordUtils;
035
/**
 * Utility class adding specific string methods to the ones already offered by guava Strings and
 * commons-lang {@code org.apache.commons.lang3.StringUtils}.
 *
 * <p>All methods are static; the class cannot be instantiated. Case conversions performed by this
 * class use {@link Locale#ROOT} so results do not depend on the JVM default locale.
 */
public final class StringUtils {

  /** Matches any unicode combining mark, i.e. what remains of an accent after NFD decomposition. */
  private static final Pattern MARKER = Pattern.compile("\\p{M}");

  /** Lower bound (inclusive) of the years produced by {@link #randomSpeciesYear()}. */
  public static final int LINNEAN_YEAR = 1751;

  private static final String CONS = "BCDFGHJKLMNPQRSTVWXYZ";
  private static final Pattern OCT = Pattern.compile("^[0-7]+$");
  private static final Pattern HEX = Pattern.compile("^[0-9abcdefABCDEF]+$");

  private static final String VOC = "AEIOU";
  private static final Random RND = new Random();

  /**
   * Two-character sequences that show up when UTF-8 encoded text was wrongly read as latin1:
   * U+00C3 followed by the second UTF-8 byte of a common accented western european letter.
   * Compiled once instead of on every call of {@link #decodeUtf8Garbage(String)}.
   */
  private static final Pattern UTF8_GARBAGE =
      Pattern.compile(
          "\u00C3["
              // a, u, o with diaeresis in lower and upper case
              + "\u00A4\u00BC\u00B6\u0084\u009C\u0096"
              // n tilde, o slash, c cedilla, i/o/u circumflex in lower and upper case
              + "\u00B1\u00B8\u00A7\u00AE\u00B4\u00BB\u0091\u0098\u0087\u008E\u0094\u009B"
              // a, e, o, u, i with acute in lower and upper case
              + "\u00A1\u00A9\u00B3\u00BA\u00AD\u0081\u0089\u0093\u009A\u008D"
              + "]");

  /**
   * The characters treated as whitespace by {@link #trim(String)} and
   * {@link #deleteWhitespace(String)}. It includes some special whitespaces which are not present
   * in the standard trim list:
   * <ul>
   *  <li>U+0085 Next Line (NEL)</li>
   *  <li>U+00A0 No-Break Space (NBSP)</li>
   *  <li>U+000C Form Feed (FF)</li>
   *  <li>U+2007 Figure Space </li>
   * </ul>
   */
  public static final String WHITESPACES_LIST =
      ""
          + "\u2002\u3000\r\u0085\u200A\u2005\u2000\u3000"
          + "\u2029\u000B\u3000\u2008\u2003\u205F\u3000\u1680"
          + "\u0009\u0020\u2006\u2001\u202F\u00A0\u000C\u2009"
          + "\u3000\u2004\u3000\u3000\u2028\n\u2007\u3000";

  private StringUtils() {}

  /**
   * Removes accents and diacritics and expands ligatures into several chars.
   *
   * <p>A fixed set of letters the unicode normalizer does not decompose is replaced first, then
   * the string is NFD-normalized and all combining marks are dropped. Characters that are neither
   * covered by the special cases nor decomposable are returned unchanged.
   *
   * @param x string to fold into ASCII, may be null
   * @return the folded string, or null if the input was null
   */
  public static String foldToAscii(String x) {
    if (x == null) {
      return null;
    }
    String decomposed = Normalizer.normalize(replaceSpecialCases(x), Normalizer.Form.NFD);
    return MARKER.matcher(decomposed).replaceAll("");
  }

  /**
   * Applies a function to every part and joins the non-null results with a single space.
   * E.g. can be used with a trimToNull function to compose a name when some parts are optional.
   *
   * @param fct   the function to apply, or Function.identity() if none; must not be null
   * @param parts the parts to transform and join; a null array is treated as empty
   *
   * @return a String that represents all parts joined by a space or empty String. Never null.
   * @throws NullPointerException if fct is null
   */
  public static String thenJoin(Function<String, String> fct, String... parts) {
    Objects.requireNonNull(
        fct, "fct shall be provided, use Function.identity() if you want to use the String as is");
    if (parts == null) {
      return "";
    }
    return Arrays.stream(parts)
        .map(fct)
        .filter(Objects::nonNull)
        .collect(Collectors.joining(" "));
  }

  /**
   * Replaces letters and two-char ligatures which the Normalizer does not decompose.
   * The char literals are written as unicode escapes so this source file compiles the same way
   * regardless of the encoding the compiler assumes.
   */
  private static String replaceSpecialCases(String x) {
    StringBuilder sb = new StringBuilder(x.length());
    for (int i = 0; i < x.length(); i++) {
      char c = x.charAt(i);
      switch (c) {
        case '\u00DF': // sharp s
          sb.append("ss");
          break;
        case '\u00C6': // capital AE ligature
          sb.append("AE");
          break;
        case '\u00E6': // small ae ligature
          sb.append("ae");
          break;
        case '\u00D0': // capital eth
        case '\u0110': // capital d with stroke, counterpart of U+0111 below
          sb.append("D");
          break;
        case '\u0111': // small d with stroke
        case '\u00F0': // small eth
          sb.append("d");
          break;
        case '\u00D8': // capital o with stroke
          sb.append("O");
          break;
        case '\u00F8': // small o with stroke
          sb.append("o");
          break;
        case '\u0152': // capital OE ligature
          sb.append("OE");
          break;
        case '\u0153': // small oe ligature
          sb.append("oe");
          break;
        case '\u0166': // capital t with stroke
          sb.append("T");
          break;
        case '\u0167': // small t with stroke
          sb.append("t");
          break;
        case '\u0141': // capital l with stroke
          sb.append("L");
          break;
        case '\u0142': // small l with stroke
          sb.append("l");
          break;
        default:
          sb.append(c);
      }
    }
    return sb.toString();
  }

  /**
   * Increases a given string by 1, i.e. increases the last letter in that string by one.
   * If it is a z or Z it wraps around to a or A and the letter before is increased instead.
   * If every letter wrapped around, an additional a or A is appended, using the case of the last
   * letter of the original string.
   * Only letters (as defined by {@link Character#isLetter(char)}) are touched; spaces,
   * punctuation or numbers remain unchanged. Letters other than z and Z are simply moved to the
   * next char value.
   * Null values stay null and empty strings empty.
   *
   * For example "Carla" becomes "Carlb", "Atz" "Aua", "zZz" "aAaa" or "Abies zzz" "Abiet aaa".
   *
   * @param x the string to increase, may be null
   * @return the increased string, or the unchanged input if it is null, empty or has no letter
   */
  public static String increase(String x) {
    if (x == null || x.isEmpty()) {
      return x;
    }

    char[] chars = x.toCharArray();
    boolean carry = false;
    boolean letterSeen = false;
    char lastLetter = 0;

    for (int idx = chars.length - 1; idx >= 0; idx--) {
      char c = chars[idx];
      if (!Character.isLetter(c)) {
        continue;
      }
      if (!letterSeen) {
        // remember the right-most letter, its case decides the case of an appended char
        letterSeen = true;
        lastLetter = c;
      }
      if (c == 'z') {
        chars[idx] = 'a';
        carry = true;
      } else if (c == 'Z') {
        chars[idx] = 'A';
        carry = true;
      } else {
        chars[idx] = (char) (c + 1);
        carry = false;
        break;
      }
    }

    String increased = String.valueOf(chars);
    if (!carry) {
      return increased;
    }
    // all letters wrapped around, so the string has to grow by one char
    return increased + (Character.isLowerCase(lastLetter) ? 'a' : 'A');
  }

  /**
   * Creates a random species binomial with no meaning at all, but highly randomized.
   *
   * @return a random canonical species name: a random genus, a space and a random epithet
   */
  public static String randomSpecies() {
    return randomGenus() + " " + randomEpithet();
  }

  /**
   * @return a random capitalized word of 3 to 11 latin letters
   */
  public static String randomGenus() {
    return randomCapitalized(RND.nextInt(9) + 3);
  }

  /**
   * @return a random lower case word of 4 to 15 latin letters
   */
  public static String randomEpithet() {
    return randomString(RND.nextInt(12) + 4).toLowerCase(Locale.ROOT);
  }

  /**
   * @return a random capitalized word of 5 to 19 latin letters followed by the suffix "idae"
   */
  public static String randomFamily() {
    return randomCapitalized(RND.nextInt(15) + 5) + "idae";
  }

  /**
   * @return a random capitalized word of 1 to 12 latin letters
   */
  public static String randomAuthor() {
    return randomCapitalized(RND.nextInt(12) + 1);
  }

  /**
   * Creates a random word whose first letter is upper case and all following ones lower case.
   * Lower casing uses Locale.ROOT so the result stays pure ASCII under any default locale.
   */
  private static String randomCapitalized(int len) {
    String upper = randomString(len);
    if (upper.isEmpty()) {
      return upper;
    }
    return upper.charAt(0) + upper.substring(1).toLowerCase(Locale.ROOT);
  }

  /**
   * Creates a random string in upper case of given length with purely latin characters only.
   * Each position is a vowel with probability 2/3 and a consonant otherwise, so vowels are used
   * much more frequently than consonants.
   *
   * @param len number of characters to generate
   * @return a random string in upper case
   */
  public static String randomString(int len) {
    StringBuilder sb = new StringBuilder(len);
    for (int i = 0; i < len; i++) {
      String alphabet = RND.nextInt(3) > 1 ? CONS : VOC;
      sb.append(alphabet.charAt(RND.nextInt(alphabet.length())));
    }
    return sb.toString();
  }

  /**
   * The current year is taken from the ISO calendar, so the result does not depend on the
   * calendar system of the JVM default locale.
   *
   * @return a random year between {@link #LINNEAN_YEAR} and the current year (both inclusive)
   *         as a 4 character long string
   */
  public static String randomSpeciesYear() {
    int maxYear = Year.now().getValue();
    return String.valueOf(LINNEAN_YEAR + RND.nextInt(maxYear - LINNEAN_YEAR + 1));
  }

  /**
   * Simple integer parsing method that does not throw any exception but
   * returns null instead. The input is not trimmed.
   *
   * @param x the string to parse, may be null
   * @return the parsed integer or null if x is null or not a valid integer
   */
  public static Integer parseInteger(String x) {
    if (x == null) {
      return null;
    }
    try {
      return Integer.valueOf(x);
    } catch (NumberFormatException ignored) {
      // by contract unparsable input yields null rather than an exception
      return null;
    }
  }

  /**
   * Simple boolean parsing method that understands yes,y,true,t or 1 as true and no,n,false,f
   * or 0 as false. Surrounding whitespace and case are ignored.
   * It does not throw any exception but returns null instead.
   *
   * @param x the string to parse, may be null
   * @return the parsed boolean or null if x is null or none of the understood values
   */
  public static Boolean parseBoolean(String x) {
    if (x == null) {
      return null;
    }
    switch (x.trim().toLowerCase(Locale.ROOT)) {
      case "true":
      case "t":
      case "1":
      case "yes":
      case "y":
        return Boolean.TRUE;
      case "false":
      case "f":
      case "0":
      case "no":
      case "n":
        return Boolean.FALSE;
      default:
        return null;
    }
  }

  /**
   * Unescapes various escape sequences if existing. A sequence is introduced by a backslash and
   * handled depending on the character following it:
   * <ul>
   *   <li>u and exactly four hexadecimal digits: replaced by the char with that code
   *       (java unicode escape)</li>
   *   <li>x and two hexadecimal digits: replaced by the char with that code</li>
   *   <li>n and two digits between 0 and 7: replaced by the char with that octal code</li>
   *   <li>r, n or t in any other case: replaced by a single space</li>
   *   <li>anything else, including an incomplete u or x sequence: the backslash is kept and
   *       the following characters are processed as regular text</li>
   * </ul>
   * A backslash at the very end of the text is kept.
   *
   * <p>NOTE(review): the octal form requires the literal letter n in front of the two digits;
   * a backslash directly followed by digits (as in C or awk) is not unescaped. This is the
   * historic behaviour of this method and is kept for backwards compatibility.
   *
   * @param text string potentially containing escape sequences, may be null
   * @return the unescaped string, or null if text was null
   */
  public static String unescapeUnicodeChars(String text) {
    if (text == null) {
      return null;
    }
    final int len = text.length();
    final StringBuilder sb = new StringBuilder(len);
    int i = 0;
    while (i < len) {
      char c = text.charAt(i++);
      if (c != '\\' || i >= len) {
        // regular char or a trailing backslash
        sb.append(c);
        continue;
      }
      // i now points to the char right after the backslash
      char type = text.charAt(i);
      if (type == 'u' && hasDigits(HEX, text, i + 1, 4)) {
        sb.append((char) Integer.parseInt(text.substring(i + 1, i + 5), 16));
        i += 5;
      } else if (type == 'x' && hasDigits(HEX, text, i + 1, 2)) {
        sb.append((char) Integer.parseInt(text.substring(i + 1, i + 3), 16));
        i += 3;
      } else if (type == 'n' && hasDigits(OCT, text, i + 1, 2)) {
        sb.append((char) Integer.parseInt(text.substring(i + 1, i + 3), 8));
        i += 3;
      } else if (type == 'r' || type == 'n' || type == 't') {
        // escaped newline or tab. Replace with simple space
        sb.append(' ');
        i++;
      } else {
        // invalid escape: keep the backslash, the next char is read again in the next round
        sb.append('\\');
      }
    }
    return sb.toString();
  }

  /**
   * Tests whether text has count chars starting at start which all match the digit pattern.
   * Uses matches() so that a trailing line terminator is never accepted by the $ anchor.
   */
  private static boolean hasDigits(Pattern digits, String text, int start, int count) {
    return start + count <= text.length()
        && digits.matcher(text.substring(start, start + count)).matches();
  }

  /**
   * Tries to decode a UTF8 string only if common UTF8 character combinations are found which are
   * unlikely to be correctly encoded text, e.g. U+00C3 U+00BC which is what the German umlaut
   * u with diaeresis looks like when its two UTF8 bytes were read as latin1.
   *
   * <p>If such a combination is found the text is converted to latin1 bytes which are then
   * decoded as UTF8. If these bytes are not valid UTF8 the original text is returned.
   *
   * @param text the text to repair, may be null
   * @return the decoded text, or the unchanged input if nothing suspicious was found or decoding
   *         failed
   */
  public static String decodeUtf8Garbage(String text) {
    if (text == null || !UTF8_GARBAGE.matcher(text).find()) {
      return text;
    }
    // typical utf8 combinations found. Try to decode from latin1 to utf8
    byte[] bytes = text.getBytes(StandardCharsets.ISO_8859_1);
    try {
      // a fresh decoder reports malformed input by throwing instead of replacing it
      return StandardCharsets.UTF_8.newDecoder().decode(ByteBuffer.wrap(bytes)).toString();
    } catch (CharacterCodingException ignored) {
      // maybe wasnt a good idea, return original
      return text;
    }
  }

  /**
   * Joins a list of objects into a string, skipping null values and calling toString on each
   * object.
   *
   * @param delimiter to join the values with
   * @param values to be joined; a null array is treated as empty
   * @return the joined string, empty if there are no non-null values. Never null.
   */
  public static String joinIfNotNull(String delimiter, Object... values) {
    if (values == null) {
      return "";
    }
    return Arrays.stream(values)
        .filter(Objects::nonNull)
        .map(Object::toString)
        .collect(Collectors.joining(delimiter));
  }

  /**
   * Uppercases all keys and values in a Map. Values are also trimmed before.
   * If keys clash only one entry will remain, which is not guaranteed.
   * Null values and a null key are kept as they are.
   *
   * @param map the map to convert, must not be null
   * @return new map with keys and values upper cased.
   */
  public static Map<String, String> upper(Map<String, String> map) {
    Map<String, String> upperMap = new HashMap<>();
    for (Map.Entry<String, String> entry : map.entrySet()) {
      String key = entry.getKey();
      String value = entry.getValue();
      upperMap.put(
          key == null ? null : key.toUpperCase(Locale.ROOT),
          value == null ? null : value.trim().toUpperCase(Locale.ROOT));
    }
    return upperMap;
  }

  /**
   * Returns an empty string or the trimmed lower case version of any input, but never NULL.
   *
   * @param str the string to convert, may be null
   * @return the trimmed, lower cased string; empty for null input
   */
  public static String emptyLowerCase(String str) {
    return str == null ? "" : str.trim().toLowerCase(Locale.ROOT);
  }

  /**
   * Reads a stack trace from an exception and returns it as a String.
   *
   * @param aThrowable the throwable to print, must not be null
   * @return the full stack trace as a String
   */
  public static String getStackTrace(Throwable aThrowable) {
    final StringWriter result = new StringWriter();
    try (PrintWriter printWriter = new PrintWriter(result)) {
      aThrowable.printStackTrace(printWriter);
    }
    return result.toString();
  }

  /**
   * Strips a set of whitespace characters from the start and end of a String.
   * This is similar to String.trim() but uses the characters of {@link #WHITESPACES_LIST}.
   *
   * @param str String to be trimmed, may be null
   * @return trimmed String, null if the input was null
   */
  public static String trim(String str) {
    if (str == null || str.isEmpty()) {
      return str;
    }
    int start = 0;
    int end = str.length();
    while (start < end && isListedWhitespace(str.charAt(start))) {
      start++;
    }
    while (end > start && isListedWhitespace(str.charAt(end - 1))) {
      end--;
    }
    return str.substring(start, end);
  }

  /**
   * Removes all whitespace characters, as defined by {@link #WHITESPACES_LIST}, from the String.
   *
   * @param str String to be processed, may be null
   * @return String without whitespaces, the very same instance if there was nothing to remove
   */
  public static String deleteWhitespace(final String str) {
    if (str == null || str.isEmpty()) {
      return str;
    }
    final int sz = str.length();
    final char[] kept = new char[sz];
    int count = 0;
    for (int i = 0; i < sz; i++) {
      char c = str.charAt(i);
      if (!isListedWhitespace(c)) {
        kept[count++] = c;
      }
    }
    if (count == sz) {
      return str;
    }
    return new String(kept, 0, count);
  }

  /** Tests whether the given char is one of the chars in WHITESPACES_LIST. */
  private static boolean isListedWhitespace(char c) {
    return WHITESPACES_LIST.indexOf(c) >= 0;
  }
}