001/* 002 * Copyright 2020 Global Biodiversity Information Facility (GBIF) 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016package org.gbif.api.util; 017 018import java.text.Normalizer; 019 020import org.apache.commons.lang3.StringUtils; 021 022/** 023 * Utilities dealing with unicode strings 024 */ 025public class UnicodeUtils { 026 027 /** 028 * Replaces all diacretics with their ascii counterpart. 029 */ 030 public static String ascii(String x) { 031 if (x == null) { 032 return null; 033 } 034 // manually normalize characters not dealt with by the java Normalizer 035 x = StringUtils.replaceChars(x, "øØðÐ", "oOdD"); 036 037 // use java unicode normalizer to remove accents and punctuation 038 x = Normalizer.normalize(x, Normalizer.Form.NFD); 039 x = x.replaceAll("\\p{M}", ""); 040 return x; 041 } 042 043 /** 044 * Replaces all digraphs and ligatures with their underlying 2 latin letters. 045 * 046 * @param x the string to decompose 047 */ 048 public static String decompose(String x) { 049 if (x == null) { 050 return null; 051 } 052 return x.replaceAll("æ", "ae") 053 .replaceAll("Æ", "Ae") 054 .replaceAll("œ", "oe") 055 .replaceAll("Œ", "Oe") 056 .replaceAll("IJ", "Ij") 057 .replaceAll("ij", "ij") 058 .replaceAll("Lj", "Lj") 059 .replaceAll("lj", "lj") 060 .replaceAll("ȸ", "db") 061 .replaceAll("ȹ", "qp") 062 .replaceAll("ß", "ss") 063 .replaceAll("st", "st") 064 .replaceAll("ſt", "ft") 065 .replaceAll("ff", "ff") 066 .replaceAll("fi", "fi") 067 .replaceAll("fl", "fl") 068 .replaceAll("ffi", "ffi") 069 .replaceAll("ffl", "ffl"); 070 } 071}