/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.gbif.utils.file;

import java.io.InputStream;

import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.BOMInputStream;
import org.junit.jupiter.api.Test;
import org.xml.sax.ext.DefaultHandler2;

import static org.junit.jupiter.api.Assertions.assertEquals;

/**
 * Tests reading classpath fixtures with and without a byte order mark (BOM) through
 * {@link BomSafeInputStreamWrapper} and commons-io's {@link BOMInputStream}.
 *
 * @author markus
 */
public class BomSafeInputStreamWrapperTest {

  /** Shared factory: namespace aware, non-validating (configured in the static initializer). */
  static final SAXParserFactory SAX_FACTORY = SAXParserFactory.newInstance();

  static {
    SAX_FACTORY.setNamespaceAware(true);
    SAX_FACTORY.setValidating(false);
  }

  /**
   * The Java SAX parser is known to have problems with UTF-8 files that contain a proper BOM:
   * http://bugs.java.com/bugdatabase/view_bug.do?bug_id=4508058
   * https://de.wikipedia.org/wiki/Byte_Order_Mark
   *
   * <p>Parses every fixture twice: once from the raw resource stream and once wrapped in a
   * {@link BOMInputStream}. The test passes when no parse throws.
   */
  @Test
  public void testSaxParser() throws Exception {
    SAXParser p = SAX_FACTORY.newSAXParser();
    for (String f : new String[] {"utf8", "utf8bom", "utf16le", "utf16be"}) {
      String fn = "/sax/" + f + ".xml";
      // Printed before parsing so the offending fixture is visible in the output on failure.
      System.out.println(fn);

      try (InputStream is = open(fn)) {
        p.parse(is, new DefaultHandler2());
      }

      try (InputStream is = new BOMInputStream(open(fn))) {
        p.parse(is, new DefaultHandler2());
      }
    }
  }

  /**
   * Wrapping the UTF-16LE fixture in a default {@link BOMInputStream} must yield exactly the same
   * bytes as reading the resource directly.
   */
  @Test
  public void testUTF16Stream() throws Exception {
    final String fn = "/sax/utf16le.xml";

    byte[] raw;
    try (InputStream is = open(fn)) {
      raw = IOUtils.toByteArray(is);
    }

    byte[] wrapped;
    try (InputStream is = new BOMInputStream(open(fn))) {
      wrapped = IOUtils.toByteArray(is);
    }

    assertEquals(raw.length, wrapped.length);
    for (int i = 0; i < raw.length; i++) {
      assertEquals(raw[i], wrapped[i], "byte mismatch at index " + i);
    }
  }

  /**
   * Checks the first three bytes delivered by {@link BomSafeInputStreamWrapper} for each charset
   * fixture. 35 is the ASCII code of {@code '#'}; the expected values below never include BOM
   * bytes, also for the fixtures whose name contains "bom".
   */
  @Test
  public void testBomSafeInputStreamWrapper() throws Exception {
    // test no bom
    assertLeadingBytes("/charsets/utf-8_names.txt", 35, 35, 35);
    assertLeadingBytes("/charsets/utf-8_bom_names.txt", 35, 35, 35);

    // UTF-16 little endian: low byte first
    assertLeadingBytes("/charsets/utf-16LE_names.txt", 35, 0, 35);
    assertLeadingBytes("/charsets/utf-16LE_bom_names.txt", 35, 0, 35);

    // UTF-16 big endian: high byte first
    assertLeadingBytes("/charsets/utf-16BE_bom_names.txt", 0, 35, 0);
    // NOTE(review): the same BOM fixture is checked twice; one of the two was possibly meant to
    // be a BOM-less UTF-16BE fixture - confirm which resources exist before changing it.
    assertLeadingBytes("/charsets/utf-16BE_bom_names.txt", 0, 35, 0);
  }

  /**
   * Reads three bytes from the resource through a {@link BomSafeInputStreamWrapper}, closes the
   * stream, and then asserts the values read.
   *
   * @param resource classpath location of the fixture
   * @param first expected result of the first {@code read()}
   * @param second expected result of the second {@code read()}
   * @param third expected result of the third {@code read()}
   */
  private void assertLeadingBytes(String resource, int first, int second, int third)
      throws Exception {
    int x;
    int y;
    int z;
    try (InputStream in = new BomSafeInputStreamWrapper(open(resource))) {
      x = in.read();
      y = in.read();
      z = in.read();
    }
    assertEquals(first, x);
    assertEquals(second, y);
    assertEquals(third, z);
  }

  /**
   * Opens a classpath resource relative to this test class.
   *
   * @param resource classpath location of the fixture
   * @return the open stream, never null
   * @throws IllegalStateException if the resource cannot be found, so a missing fixture is
   *     reported by name instead of as a null stream further down
   */
  private InputStream open(String resource) {
    InputStream stream = getClass().getResourceAsStream(resource);
    if (stream == null) {
      throw new IllegalStateException("Missing test resource: " + resource);
    }
    return stream;
  }
}