001package org.gbif.utils.text;
002
import java.io.PrintWriter;
import java.io.StringWriter;
import java.io.Writer;
import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
import java.time.Year;
import java.util.Arrays;
import java.util.Calendar;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Random;
import java.util.function.Function;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import com.google.common.base.Charsets;
import com.google.common.base.Joiner;
import com.google.common.base.Strings;
import com.google.common.collect.Maps;
import org.apache.commons.lang3.text.WordUtils;
024
025
/**
 * Utils class adding specific string methods to existing guava {@link Strings} and
 * commons {@link org.apache.commons.lang3.StringUtils}.
 *
 * <p>All methods are static and the class cannot be instantiated. Case conversions are done with
 * {@link Locale#ROOT} so results do not depend on the default locale of the JVM. Non-ASCII
 * characters in this source are written as unicode escapes so the file compiles identically
 * regardless of the source encoding used by the compiler.
 */
public final class StringUtils {
  /** Earliest year {@link #randomSpeciesYear()} can return. */
  public static final int LINNEAN_YEAR = 1751;

  /** Matches any unicode combining mark; used to strip accents after NFD normalization. */
  private static final Pattern MARKER = Pattern.compile("\\p{M}");
  private static final String CONS = "BCDFGHJKLMNPQRSTVWXYZ";
  private static final String VOC = "AEIOU";

  /**
   * Two character sequences that show up when UTF-8 encoded text was wrongly read as Latin-1.
   * Every sequence starts with U+00C3 (the UTF-8 lead byte 0xC3) followed by the second UTF-8 byte
   * of one of the accented letters listed next to each line.
   */
  private static final Pattern UTF8_GARBAGE = Pattern.compile("\u00C3["
      // a/u/o with diaeresis, lower case then upper case
      + "\u00A4\u00BC\u00B6\u0084\u009C\u0096"
      // n-tilde, o-slash, c-cedilla, i/o/u-circumflex, lower case then upper case
      + "\u00B1\u00B8\u00A7\u00AE\u00B4\u00BB\u0091\u0098\u0087\u008E\u0094\u009B"
      // a/e/o/u/i-acute lower case, then A/E/O/U/I-acute upper case
      + "\u00A1\u00A9\u00B3\u00BA\u00AD\u0081\u0089\u0093\u009A\u008D"
      + "]");

  private static final Random RND = new Random();

  private StringUtils() {
  }

  /**
   * Removes accents &amp; diacritics and converts ligatures into several chars.
   * Special letters are replaced first (see {@code replaceSpecialCases}), then the string is
   * decomposed (NFD) and all combining marks are dropped. Characters that have no decomposition
   * and are not special-cased are returned unchanged, so the result is not guaranteed to be pure
   * ASCII for arbitrary input.
   *
   * @param x string to fold into ASCII, may be null
   * @return string converted to ASCII equivalent, expanding common ligatures, or null if x was null
   */
  public static String foldToAscii(String x) {
    if (x == null) {
      return null;
    }
    String decomposed = Normalizer.normalize(replaceSpecialCases(x), Normalizer.Form.NFD);
    return MARKER.matcher(decomposed).replaceAll("");
  }

  /**
   * Apply a function then join the result using a space if not null.
   * E.g. can be used with apache.commons.lang3.StringUtils::trimToNull to compose a name when some parts are
   * optional. The function is applied to every part, including null parts, and results that are null
   * are skipped.
   *
   * @param fct   the function to apply or Function.identity() if none, must not be null
   * @param parts the parts to transform and join; a null array is treated like an empty one
   *
   * @return a String that represents all parts joined by a space or empty String. Never null.
   * @throws NullPointerException if fct is null
   */
  public static String thenJoin(Function<String, String> fct, String... parts) {
    Objects.requireNonNull(fct, "fct shall be provided, use Function.identity() is you want to use the String as is");
    if (parts == null) {
      return "";
    }
    return Arrays.stream(parts)
            .map(fct)
            .filter(Objects::nonNull)
            .collect(Collectors.joining(" "));
  }

  /**
   * The Normalizer misses a few cases and 2 char ligatures which we deal with here.
   * Handles sharp s, the AE and OE ligatures, eth, d/t/l with stroke and o with slash,
   * each in lower and upper case where an upper case form is mapped.
   */
  private static String replaceSpecialCases(String x) {
    StringBuilder sb = new StringBuilder(x.length());

    for (int i = 0; i < x.length(); i++) {
      char c = x.charAt(i);
      switch (c) {
        case '\u00DF': // sharp s
          sb.append("ss");
          break;
        case '\u00C6': // capital AE ligature
          sb.append("AE");
          break;
        case '\u00E6': // small ae ligature
          sb.append("ae");
          break;
        case '\u00D0': // capital eth
        case '\u0110': // capital D with stroke, looks identical to capital eth
          sb.append('D');
          break;
        case '\u00F0': // small eth
        case '\u0111': // small d with stroke
          sb.append('d');
          break;
        case '\u00D8': // capital O with slash
          sb.append('O');
          break;
        case '\u00F8': // small o with slash
          sb.append('o');
          break;
        case '\u0152': // capital OE ligature
          sb.append("OE");
          break;
        case '\u0153': // small oe ligature
          sb.append("oe");
          break;
        case '\u0166': // capital T with stroke
          sb.append('T');
          break;
        case '\u0167': // small t with stroke
          sb.append('t');
          break;
        case '\u0141': // capital L with stroke
          sb.append('L');
          break;
        case '\u0142': // small l with stroke
          sb.append('l');
          break;
        default:
          sb.append(c);
      }
    }
    return sb.toString();
  }

  /**
   * Increase a given string by 1, i.e. increase the last letter in that string by one.
   * If it is a z or Z it wraps around to a or A and the letter before is increased instead.
   * If every letter wrapped around, an additional a or A is appended to the end, using the case
   * of the last letter of the original string.
   * Only true letters are increased, but spaces, punctuation or numbers remain unchanged.
   * Null values stay null and empty strings empty. A string without any letter is returned as is.
   *
   * <p>For example "Carla" becomes "Carlb", "Atz" becomes "Aua", "zZz" becomes "aAaa" and
   * "Abies zzz" becomes "Abiet aaa".
   *
   * @param x the string to increase, may be null
   * @return the increased string, or null if x was null
   */
  public static String increase(String x) {
    if (x == null || x.isEmpty()) {
      return x;
    }

    char[] chars = x.toCharArray();
    boolean appendingNeeded = false;
    boolean letterSeen = false;
    boolean lastLetterIsLowerCase = true;

    // walk backwards over the letters, carrying over as long as we hit a z/Z
    for (int idx = chars.length - 1; idx >= 0; idx--) {
      final char c = chars[idx];
      if (!Character.isLetter(c)) {
        continue;
      }
      if (!letterSeen) {
        letterSeen = true;
        lastLetterIsLowerCase = Character.isLowerCase(c);
      }

      if (c == 'z') {
        chars[idx] = 'a';
        appendingNeeded = true;
      } else if (c == 'Z') {
        chars[idx] = 'A';
        appendingNeeded = true;
      } else {
        chars[idx] = (char) (c + 1);
        appendingNeeded = false;
        break;
      }
    }

    String increased = String.valueOf(chars);
    if (appendingNeeded) {
      // all letters wrapped around, so the string has to grow by one char
      return increased + (lastLetterIsLowerCase ? 'a' : 'A');
    }
    return increased;
  }

  /**
   * Creates a random species binomial with no meaning at all, but highly randomized.
   *
   * @return a random canonical species name, i.e. a random genus and epithet separated by a space
   */
  public static String randomSpecies() {
    return randomGenus() + " " + randomEpithet();
  }

  /**
   * @return a random capitalized word of 3 to 11 latin letters
   */
  public static String randomGenus() {
    return randomCapitalized(RND.nextInt(9) + 3);
  }

  /**
   * @return a random lower case word of 4 to 15 latin letters
   */
  public static String randomEpithet() {
    return randomString(RND.nextInt(12) + 4).toLowerCase(Locale.ROOT);
  }

  /**
   * @return a random capitalized word of 5 to 19 latin letters followed by the suffix "idae"
   */
  public static String randomFamily() {
    return randomCapitalized(RND.nextInt(15) + 5) + "idae";
  }

  /**
   * @return a random capitalized word of 1 to 12 latin letters
   */
  public static String randomAuthor() {
    return randomCapitalized(RND.nextInt(12) + 1);
  }

  /**
   * Creates a random word with an upper case first letter and all other letters in lower case.
   *
   * @param len number of letters, must be at least 1
   */
  private static String randomCapitalized(int len) {
    String upper = randomString(len);
    return upper.charAt(0) + upper.substring(1).toLowerCase(Locale.ROOT);
  }

  /**
   * Creates a random string in upper case of given length with purely latin characters only.
   * Vocals are used much more frequently than consonants: every position is a vocal with a
   * probability of 2/3 and a consonant otherwise.
   *
   * @param len the number of characters to generate
   * @return a random string in upper case
   */
  public static String randomString(int len) {
    StringBuilder sb = new StringBuilder(len);
    for (int i = 0; i < len; i++) {
      final String alphabet = RND.nextInt(3) > 1 ? CONS : VOC;
      sb.append(alphabet.charAt(RND.nextInt(alphabet.length())));
    }
    return sb.toString();
  }

  /**
   * The current year is taken from the ISO calendar, independent of the default locale.
   *
   * @return a year between {@link #LINNEAN_YEAR} and the current year, both inclusive, as a 4 character long string
   */
  public static String randomSpeciesYear() {
    int maxYear = Year.now().getValue();
    return String.valueOf(LINNEAN_YEAR + RND.nextInt(maxYear - LINNEAN_YEAR + 1));
  }

  /**
   * Simple integer parsing method that does not throw any exception but
   * returns null instead. The input is not trimmed, so surrounding whitespace makes it unparsable.
   *
   * @param x the string to parse, may be null
   * @return the parsed integer or null
   */
  public static Integer parseInteger(String x) {
    if (x == null) {
      return null;
    }
    try {
      return Integer.valueOf(x);
    } catch (NumberFormatException ignored) {
      // by contract unparsable input yields null
      return null;
    }
  }

  /**
   * Simple boolean parsing method that understands yes,y,true,t or 1 as true and respective values for false.
   * Surrounding whitespace and case are ignored.
   * It does not throw any exception but returns null instead.
   *
   * @param x the string to parse, may be null
   * @return the parsed boolean or null
   */
  public static Boolean parseBoolean(String x) {
    if (x == null) {
      return null;
    }
    switch (x.trim().toLowerCase(Locale.ROOT)) {
      case "true":
      case "t":
      case "1":
      case "yes":
      case "y":
        return Boolean.TRUE;
      case "false":
      case "f":
      case "0":
      case "no":
      case "n":
        return Boolean.FALSE;
      default:
        return null;
    }
  }

  /**
   * Unescapes various backslash escapes if existing. The string is scanned once and the following
   * sequences, each introduced by a backslash, are replaced:
   * <ul>
   *   <li>u followed by exactly four hexadecimal digits: the java unicode escape, replaced by
   *   that character.</li>
   *   <li>x followed by two hexadecimal digits: replaced by the character with that code.</li>
   *   <li>n followed by two octal digits (0-7): replaced by the character with that octal code.</li>
   *   <li>n not followed by two octal digits, r or t: escaped newline or tab, replaced by a
   *   single space.</li>
   * </ul>
   * Only ASCII digits are accepted. Every other backslash, including incomplete or invalid u and x
   * sequences and a trailing backslash, is kept as it is together with the characters following it.
   *
   * <p>Note that a C style octal escape made of a backslash directly followed by digits, such as
   * backslash 033, is not interpreted.
   *
   * @param text string potentially containing unicode escape chars, may be null
   * @return the unescaped string, or null if text was null
   */
  public static String unescapeUnicodeChars(String text) {
    if (text == null) {
      return null;
    }
    final int len = text.length();
    final StringBuilder sb = new StringBuilder(len);
    int i = 0;
    while (i < len) {
      char c = text.charAt(i++);
      if (c == '\\' && i < len) {
        // i points to the char right behind the backslash. It is only consumed if the escape is valid,
        // otherwise the backslash is appended as is and the following chars are handled as normal text
        final char marker = text.charAt(i);
        int code;
        switch (marker) {
          case 'u':
            code = parseAsciiDigits(text, i + 1, 4, 16);
            if (code >= 0) {
              c = (char) code;
              i += 5;
            }
            break;
          case 'x':
            code = parseAsciiDigits(text, i + 1, 2, 16);
            if (code >= 0) {
              c = (char) code;
              i += 3;
            }
            break;
          case 'n':
            code = parseAsciiDigits(text, i + 1, 2, 8);
            if (code >= 0) {
              c = (char) code;
              i += 3;
            } else {
              // plain escaped newline. Replace with simple space
              c = ' ';
              i++;
            }
            break;
          case 'r':
          case 't':
            // escaped carriage return or tab. Replace with simple space
            c = ' ';
            i++;
            break;
          default:
            // no known escape, keep the backslash
            break;
        }
      }
      sb.append(c);
    }
    return sb.toString();
  }

  /**
   * Parses a fixed number of ASCII digits of the given radix.
   *
   * @return the parsed non negative value or -1 if there are not enough chars or any of them is no valid digit
   */
  private static int parseAsciiDigits(String text, int start, int count, int radix) {
    if (text.length() - start < count) {
      return -1;
    }
    int value = 0;
    for (int k = start; k < start + count; k++) {
      final char ch = text.charAt(k);
      // Character.digit also accepts non ASCII digits which we do not want to treat as escapes
      final int digit = ch < 128 ? Character.digit(ch, radix) : -1;
      if (digit < 0) {
        return -1;
      }
      value = value * radix + digit;
    }
    return value;
  }

  /**
   * Tries to decode a UTF8 string only if common UTF8 character combinations are found which are unlikely to be
   * correctly encoded text. E.g. the two characters U+00C3 U+00BC are what the German umlaut U+00FC (u with
   * diaeresis) looks like when its UTF8 bytes were read as Latin-1, which indicates we have encoded utf8 text still.
   *
   * <p>If such a combination is found the entire text is converted to Latin-1 bytes and decoded as UTF8.
   * If that fails because the bytes are no valid UTF8 the original text is returned.
   * Characters that cannot be represented in Latin-1 are replaced by a question mark during that conversion.
   *
   * @param text the potentially wrongly decoded text, may be null
   * @return the repaired text, or the original text if nothing suspicious was found or decoding failed
   */
  public static String decodeUtf8Garbage(String text) {
    if (text == null || !UTF8_GARBAGE.matcher(text).find()) {
      return text;
    }
    // typical utf8 combinations found. Try to decode from latin1 to utf8
    byte[] bytes = text.getBytes(StandardCharsets.ISO_8859_1);
    try {
      // a fresh decoder reports malformed input instead of silently replacing it
      return StandardCharsets.UTF_8.newDecoder().decode(ByteBuffer.wrap(bytes)).toString();
    } catch (CharacterCodingException ignored) {
      // maybe wasnt a good idea, return original
      return text;
    }
  }

  /**
   * Joins a list of objects into a string, skipping null values and calling toString on each object.
   *
   * @param delimiter to join the values with, must not be null
   * @param values to be joined; a null array is treated like an empty one
   * @return the joined string, empty if there is no non null value. Never null.
   * @throws NullPointerException if delimiter is null
   */
  public static String joinIfNotNull(String delimiter, Object... values) {
    Objects.requireNonNull(delimiter, "delimiter");
    if (values == null) {
      return "";
    }
    return Arrays.stream(values)
            .filter(Objects::nonNull)
            .map(String::valueOf)
            .collect(Collectors.joining(delimiter));
  }

  /**
   * Uppercases all keys and values in a Map. Values are also trimmed, keys are not.
   * Null keys and null values are kept as null.
   * If keys clash only one entry will remain, which is not guaranteed.
   *
   * @param map the map to read, must not be null. It is not modified.
   * @return new map with keys and values upper cased.
   * @throws NullPointerException if map is null
   */
  public static Map<String, String> upper(Map<String, String> map) {
    Objects.requireNonNull(map, "map");
    Map<String, String> upperMap = new HashMap<>();
    for (Map.Entry<String, String> entry : map.entrySet()) {
      final String key = entry.getKey();
      final String value = entry.getValue();
      upperMap.put(
          key == null ? null : key.toUpperCase(Locale.ROOT),
          value == null ? null : value.trim().toUpperCase(Locale.ROOT));
    }
    return upperMap;
  }

  /**
   * Returns an empty string or the trimmed lower case version of any input, but never NULL.
   *
   * @param x the string to convert, may be null
   * @return the trimmed, lower cased string or an empty string for null or empty input
   */
  public static String emptyLowerCase(String x) {
    if (x == null || x.isEmpty()) {
      return "";
    }
    return x.trim().toLowerCase(Locale.ROOT);
  }

  /**
   * Reads a stack trace from an exception and returns it as a String.
   *
   * @param aThrowable the throwable to print, must not be null
   * @return the full stack trace as a String
   */
  public static String getStackTrace(Throwable aThrowable) {
    final StringWriter result = new StringWriter();
    try (PrintWriter printWriter = new PrintWriter(result)) {
      aThrowable.printStackTrace(printWriter);
    }
    return result.toString();
  }

}