001/*
002 * Licensed under the Apache License, Version 2.0 (the "License");
003 * you may not use this file except in compliance with the License.
004 * You may obtain a copy of the License at
005 *
006 *     http://www.apache.org/licenses/LICENSE-2.0
007 *
008 * Unless required by applicable law or agreed to in writing, software
009 * distributed under the License is distributed on an "AS IS" BASIS,
010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
011 * See the License for the specific language governing permissions and
012 * limitations under the License.
013 */
014package org.gbif.utils.file;
015
016import java.io.InputStream;
017
018import javax.xml.parsers.SAXParser;
019import javax.xml.parsers.SAXParserFactory;
020
021import org.apache.commons.io.IOUtils;
022import org.apache.commons.io.input.BOMInputStream;
023import org.junit.jupiter.api.Test;
024import org.xml.sax.ext.DefaultHandler2;
025
026import static org.junit.jupiter.api.Assertions.assertEquals;
027
028/**
029 * @author markus
030 */
031public class BomSafeInputStreamWrapperTest {
032
033  static SAXParserFactory SAX_FACTORY = SAXParserFactory.newInstance();
034
035  static {
036    SAX_FACTORY.setNamespaceAware(true);
037    SAX_FACTORY.setValidating(false);
038  }
039
040  /**
041   * The Java SAX Parser is known to have problems with UTF8 file that contain a proper BOM markup:
042   * http://bugs.java.com/bugdatabase/view_bug.do?bug_id=4508058
043   * https://de.wikipedia.org/wiki/Byte_Order_Mark
044   *
045   *  Make sure the SAX Parser can handle any valid UTF files by using a BomSafeInputStreamWrapper stream.
046   */
047  @Test
048  public void testSaxParser() throws Exception {
049    SAXParser p = SAX_FACTORY.newSAXParser();
050    for (String f : new String[] {"utf8", "utf8bom", "utf16le", "utf16be"}) {
051      String fn = "/sax/" + f + ".xml";
052      System.out.println(fn);
053      InputStream is = getClass().getResourceAsStream(fn);
054      p.parse(is, new DefaultHandler2());
055
056      is = new BOMInputStream(getClass().getResourceAsStream(fn));
057      p.parse(is, new DefaultHandler2());
058    }
059  }
060
061  @Test
062  public void testUTF16Stream() throws Exception {
063    // should be the exact same bytes
064
065    byte[] b1 = IOUtils.toByteArray(getClass().getResourceAsStream("/sax/utf16le.xml"));
066    byte[] b2 =
067        IOUtils.toByteArray(new BOMInputStream(getClass().getResourceAsStream("/sax/utf16le.xml")));
068
069    assertEquals(b1.length, b2.length);
070    int idx = 0;
071    for (byte b : b1) {
072      assertEquals(b, b2[idx++]);
073    }
074  }
075
076  @Test
077  public void testBomSafeInputStreamWrapper() throws Exception {
078    // test no bom
079    InputStream in =
080        new BomSafeInputStreamWrapper(getClass().getResourceAsStream("/charsets/utf-8_names.txt"));
081    int x = in.read();
082    int y = in.read();
083    int z = in.read();
084    in.close();
085    assertEquals(35, x);
086    assertEquals(35, y);
087    assertEquals(35, z);
088
089    in =
090        new BomSafeInputStreamWrapper(
091            getClass().getResourceAsStream("/charsets/utf-8_bom_names.txt"));
092    x = in.read();
093    y = in.read();
094    z = in.read();
095    in.close();
096    assertEquals(35, x);
097    assertEquals(35, y);
098    assertEquals(35, z);
099
100    in =
101        new BomSafeInputStreamWrapper(
102            getClass().getResourceAsStream("/charsets/utf-16LE_names.txt"));
103    x = in.read();
104    y = in.read();
105    z = in.read();
106    in.close();
107    assertEquals(35, x);
108    assertEquals(0, y);
109    assertEquals(35, z);
110
111    in =
112        new BomSafeInputStreamWrapper(
113            getClass().getResourceAsStream("/charsets/utf-16LE_bom_names.txt"));
114    x = in.read();
115    y = in.read();
116    z = in.read();
117    in.close();
118    assertEquals(35, x);
119    assertEquals(0, y);
120    assertEquals(35, z);
121
122    in =
123        new BomSafeInputStreamWrapper(
124            getClass().getResourceAsStream("/charsets/utf-16BE_bom_names.txt"));
125    x = in.read();
126    y = in.read();
127    z = in.read();
128    in.close();
129    assertEquals(0, x);
130    assertEquals(35, y);
131    assertEquals(0, z);
132
133    in =
134        new BomSafeInputStreamWrapper(
135            getClass().getResourceAsStream("/charsets/utf-16BE_bom_names.txt"));
136    x = in.read();
137    y = in.read();
138    z = in.read();
139    in.close();
140    assertEquals(0, x);
141    assertEquals(35, y);
142    assertEquals(0, z);
143  }
144}