001package org.gbif.api.util;
002
003import java.text.Normalizer;
004
005import org.apache.commons.lang3.StringUtils;
006
/**
 * Utilities for simplifying Unicode strings: stripping diacritics and expanding
 * digraphs and ligatures into plain Latin letters.
 *
 * <p>All non-ASCII characters in this class are written as Unicode escapes so that its
 * behavior does not depend on the encoding or normalization of the source file.
 */
public class UnicodeUtils {

    /** Matches a single Unicode combining mark (general category M); compiled once and reused. */
    private static final java.util.regex.Pattern COMBINING_MARKS =
            java.util.regex.Pattern.compile("\\p{M}");

    /**
     * Replaces all diacritics with their ASCII counterpart.
     *
     * <p>The o-with-stroke (U+00F8 / U+00D8) and eth (U+00F0 / U+00D0) letters are mapped to
     * {@code o}/{@code O} and {@code d}/{@code D}. The string is then decomposed to NFD and
     * every combining mark is dropped. Punctuation and any other character without a
     * canonical decomposition are left as they are, so the result is not guaranteed to be
     * pure ASCII.
     *
     * @param x the string to simplify, may be {@code null}
     * @return the string without diacritics, or {@code null} if {@code x} was {@code null}
     */
    public static String ascii(String x) {
        if (x == null) {
            return null;
        }
        // These letters have no canonical decomposition, so the Normalizer would leave
        // them untouched; map them by hand first.
        String folded = x.replace('\u00F8', 'o')   // U+00F8 small o with stroke
                .replace('\u00D8', 'O')            // U+00D8 capital O with stroke
                .replace('\u00F0', 'd')            // U+00F0 small eth
                .replace('\u00D0', 'D');           // U+00D0 capital eth

        // NFD splits accented letters into base letter + combining mark(s);
        // removing the marks leaves only the base letters.
        String decomposed = Normalizer.normalize(folded, Normalizer.Form.NFD);
        return COMBINING_MARKS.matcher(decomposed).replaceAll("");
    }

    /**
     * Replaces digraph and ligature characters with their underlying Latin letters.
     *
     * <p>Only the single code points listed in the method body are expanded; ordinary letter
     * sequences such as {@code "IJ"} or {@code "fi"} are never modified. Capital ligatures
     * are expanded in title case (for example U+00C6 becomes {@code "Ae"}).
     *
     * @param x the string to decompose, may be {@code null}
     * @return the string with ligatures expanded, or {@code null} if {@code x} was {@code null}
     */
    public static String decompose(String x) {
        if (x == null) {
            return null;
        }
        // Literal (non-regex) replacements; none of the outputs contains a character that a
        // later replacement matches, so the order of the chain does not matter.
        return x.replace("\u00E6", "ae")       // U+00E6 small letter ae
                .replace("\u00C6", "Ae")       // U+00C6 capital letter AE
                .replace("\u0153", "oe")       // U+0153 small ligature oe
                .replace("\u0152", "Oe")       // U+0152 capital ligature OE
                .replace("\u0132", "Ij")       // U+0132 capital ligature IJ
                .replace("\u0133", "ij")       // U+0133 small ligature ij
                .replace("\u01C8", "Lj")       // U+01C8 capital L with small j
                .replace("\u01C9", "lj")       // U+01C9 small letter lj
                .replace("\u0238", "db")       // U+0238 small db digraph
                .replace("\u0239", "qp")       // U+0239 small qp digraph
                .replace("\u00DF", "ss")       // U+00DF small sharp s
                .replace("\uFB06", "st")       // U+FB06 small ligature st
                // NOTE(review): the long-s-t ligature is linguistically "st"; "ft" is kept
                // because that is what the existing mapping produces - confirm intent.
                .replace("\uFB05", "ft")       // U+FB05 small ligature long s t
                .replace("\uFB00", "ff")       // U+FB00 small ligature ff
                .replace("\uFB01", "fi")       // U+FB01 small ligature fi
                .replace("\uFB02", "fl")       // U+FB02 small ligature fl
                .replace("\uFB03", "ffi")      // U+FB03 small ligature ffi
                .replace("\uFB04", "ffl");     // U+FB04 small ligature ffl
    }
}