001/*
002 * Licensed under the Apache License, Version 2.0 (the "License");
003 * you may not use this file except in compliance with the License.
004 * You may obtain a copy of the License at
005 *
006 *     http://www.apache.org/licenses/LICENSE-2.0
007 *
008 * Unless required by applicable law or agreed to in writing, software
009 * distributed under the License is distributed on an "AS IS" BASIS,
010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
011 * See the License for the specific language governing permissions and
012 * limitations under the License.
013 */
014package org.gbif.utils.file;
015
import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.Locale;

import org.apache.commons.lang3.StringUtils;
import org.junit.jupiter.api.Test;

import static org.gbif.utils.file.CharsetDetection.detectEncoding;
import static org.junit.jupiter.api.Assertions.assertEquals;
025
026/**
027 * @author markus
028 */
029public class CharsetDetectionTest {
030
031  @Test
032  public void testCP1252Encoding() throws IOException {
033    File test = FileUtils.getClasspathFile("charsets/cp1252-test.txt");
034
035    Charset encoding = detectEncoding(test);
036
037    assertEquals("windows1252", encoding.displayName().replace("-", "").toLowerCase());
038  }
039
040  @Test
041  public void testEncodingDetection() throws IOException {
042    String[] files =
043        new String[] {
044          "iso-8859-1_names.txt",
045          "macroman_names.txt",
046          "utf-16BE_bom_names.txt",
047          "utf-16BE_names.txt",
048          "utf-16LE_bom_names.txt",
049          "utf-16LE_names.txt",
050          "utf-8_bom_names.txt",
051          "utf-8_names.txt",
052          "windows1252_names.txt"
053        };
054    for (String fn : files) {
055      File test = FileUtils.getClasspathFile("charsets/" + fn);
056
057      Charset encoding = detectEncoding(test);
058      String expected = StringUtils.substringBefore(fn, "_");
059
060      // x-MacRoman is alias for MacRoman used on unix, therefore remove x-
061      assertEquals(
062          expected.replace("-", "").toLowerCase(),
063          encoding.displayName().toLowerCase().replace("x-mac", "mac").replace("-", ""));
064    }
065  }
066
067  @Test
068  public void testEncodingDetectionKyles() throws IOException {
069    String[] files =
070        new String[] {
071          "utf-8_arabic.csv",
072          "utf-8_japanese.csv",
073          "utf-8_korean.csv",
074          "utf-8_latin.csv",
075          "utf-8_no-bom.csv",
076          "utf-8_traditional-chinese.csv",
077          "utf-8_.csv",
078          "utf-16BE_no-bom.csv",
079          "utf-16BE_.csv",
080          "utf-16LE_little-endian-no-bom.csv",
081          "utf-16LE_little-endian.csv"
082        };
083    for (String fn : files) {
084      File test = FileUtils.getClasspathFile("charsets/kyle/" + fn);
085
086      Charset encoding = detectEncoding(test);
087      String expected = StringUtils.substringBefore(fn, "_");
088
089      assertEquals(
090          expected.replace("-", "").toLowerCase(),
091          encoding.displayName().toLowerCase().replace("-", ""));
092    }
093  }
094}