001/*
002 * Copyright 2020 Global Biodiversity Information Facility (GBIF)
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016package org.gbif.api.util;
017
018import java.text.Normalizer;
019
020import org.apache.commons.lang3.StringUtils;
021
/**
 * Utilities dealing with unicode strings.
 *
 * <p>All special characters in this class are written as Unicode escapes so
 * that the behaviour does not depend on the encoding of the source file and
 * cannot be altered by tools that normalise the file's text.</p>
 */
public class UnicodeUtils {

    /**
     * Replaces letters carrying diacritics with their plain base letter.
     *
     * <p>Only combining marks are stripped; punctuation and any other
     * non-ASCII characters without a canonical decomposition (apart from the
     * four handled explicitly below) are returned unchanged.</p>
     *
     * @param x the string to convert, may be null
     * @return the converted string, or null if {@code x} was null
     */
    public static String ascii(String x) {
        if (x == null) {
            return null;
        }
        // Manually map characters the java Normalizer does not decompose.
        x = x.replace('\u00F8', 'o')   // U+00F8 small o with stroke
                .replace('\u00D8', 'O')   // U+00D8 capital O with stroke
                .replace('\u00F0', 'd')   // U+00F0 small eth
                .replace('\u00D0', 'D');  // U+00D0 capital eth

        // NFD splits an accented letter into its base letter followed by
        // combining marks; dropping every mark (category M) leaves the base.
        x = Normalizer.normalize(x, Normalizer.Form.NFD);
        return x.replaceAll("\\p{M}", "");
    }

    /**
     * Replaces all digraphs and ligatures with their underlying latin letters.
     *
     * <p>Upper case digraphs are written with only the first letter
     * capitalised (e.g. U+00C6 becomes "Ae"). Plain ASCII letter sequences
     * such as "IJ" or "ffi" are left untouched; only the single ligature or
     * digraph code points are rewritten.</p>
     *
     * @param x the string to decompose, may be null
     * @return the decomposed string, or null if {@code x} was null
     */
    public static String decompose(String x) {
        if (x == null) {
            return null;
        }
        // String.replace performs literal (non-regex) replacement of every occurrence.
        return x.replace("\u00E6", "ae")    // U+00E6 small ae
                .replace("\u00C6", "Ae")    // U+00C6 capital AE
                .replace("\u0153", "oe")    // U+0153 small oe
                .replace("\u0152", "Oe")    // U+0152 capital OE
                .replace("\u0132", "Ij")    // U+0132 capital IJ ligature
                .replace("\u0133", "ij")    // U+0133 small ij ligature
                .replace("\u01C8", "Lj")    // U+01C8 capital L with small j
                .replace("\u01C9", "lj")    // U+01C9 small lj
                .replace("\u0238", "db")    // U+0238 small db digraph
                .replace("\u0239", "qp")    // U+0239 small qp digraph
                .replace("\u00DF", "ss")    // U+00DF sharp s
                .replace("\uFB06", "st")    // U+FB06 st ligature
                .replace("\uFB05", "st")    // U+FB05 long-s t ligature; long s is an "s", not an "f"
                .replace("\uFB00", "ff")    // U+FB00 ff ligature
                .replace("\uFB01", "fi")    // U+FB01 fi ligature
                .replace("\uFB02", "fl")    // U+FB02 fl ligature
                .replace("\uFB03", "ffi")   // U+FB03 ffi ligature
                .replace("\uFB04", "ffl");  // U+FB04 ffl ligature
    }
}