/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.gbif.utils.file;

import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.Locale;

import org.apache.commons.lang3.StringUtils;
import org.junit.jupiter.api.Test;

import static org.gbif.utils.file.CharsetDetection.detectEncoding;
import static org.junit.jupiter.api.Assertions.assertEquals;

/**
 * Tests {@link CharsetDetection#detectEncoding(File)} against fixture files loaded from the
 * classpath.
 *
 * <p>Most fixtures encode the expected charset in their file name: everything before the first
 * underscore is the charset name (for example {@code utf-16BE_names.txt}). Expected and detected
 * names are compared after {@link #normalize(String)} so that differences in case, hyphenation
 * and the {@code x-} alias prefix of MacRoman do not matter.
 *
 * @author markus
 */
public class CharsetDetectionTest {

  /** Checks that a Windows-1252 encoded fixture is detected as such. */
  @Test
  public void testCP1252Encoding() throws IOException {
    String path = "charsets/cp1252-test.txt";
    File test = FileUtils.getClasspathFile(path);

    Charset encoding = detectEncoding(test);

    assertEquals("windows1252", normalize(encoding.name()), "Wrong charset detected for " + path);
  }

  /** Checks detection for the name-list fixtures in {@code charsets/}. */
  @Test
  public void testEncodingDetection() throws IOException {
    assertDetectedFromFileName(
        "charsets/",
        "iso-8859-1_names.txt",
        "macroman_names.txt",
        "utf-16BE_bom_names.txt",
        "utf-16BE_names.txt",
        "utf-16LE_bom_names.txt",
        "utf-16LE_names.txt",
        "utf-8_bom_names.txt",
        "utf-8_names.txt",
        "windows1252_names.txt");
  }

  /** Checks detection for the CSV fixtures in {@code charsets/kyle/}. */
  @Test
  public void testEncodingDetectionKyles() throws IOException {
    assertDetectedFromFileName(
        "charsets/kyle/",
        "utf-8_arabic.csv",
        "utf-8_japanese.csv",
        "utf-8_korean.csv",
        "utf-8_latin.csv",
        "utf-8_no-bom.csv",
        "utf-8_traditional-chinese.csv",
        "utf-8_.csv",
        "utf-16BE_no-bom.csv",
        "utf-16BE_.csv",
        "utf-16LE_little-endian-no-bom.csv",
        "utf-16LE_little-endian.csv");
  }

  /**
   * Runs charset detection on each fixture and asserts that the detected charset matches the
   * charset name encoded in the file name (the part before the first underscore).
   *
   * @param directory classpath directory containing the fixtures, including the trailing slash
   * @param fileNames fixture file names relative to {@code directory}
   * @throws IOException if a fixture cannot be read
   */
  private static void assertDetectedFromFileName(String directory, String... fileNames)
      throws IOException {
    for (String fn : fileNames) {
      String path = directory + fn;
      File test = FileUtils.getClasspathFile(path);

      Charset encoding = detectEncoding(test);
      String expected = StringUtils.substringBefore(fn, "_");

      // The message names the fixture so a failure inside the loop is immediately attributable.
      assertEquals(
          normalize(expected), normalize(encoding.name()), "Wrong charset detected for " + path);
    }
  }

  /**
   * Reduces a charset name to a form suitable for loose comparison: lower-cased, with the
   * {@code x-} prefix of {@code x-Mac...} aliases dropped and all hyphens removed.
   *
   * <p>Lower-casing uses {@link Locale#ROOT}; with the default locale a Turkish or Azeri JVM
   * would turn the {@code I} of {@code ISO-8859-1} into a dotless i and break the comparison.
   *
   * @param charsetName charset name taken from a file name or from {@link Charset#name()}
   * @return the normalized name, e.g. {@code "utf16be"} for {@code "UTF-16BE"}
   */
  private static String normalize(String charsetName) {
    // x-MacRoman is alias for MacRoman used on unix, therefore remove x-
    return charsetName.toLowerCase(Locale.ROOT).replace("x-mac", "mac").replace("-", "");
  }
}