/*
 * Copyright 2021 Global Biodiversity Information Facility (GBIF)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.gbif.utils.file;

import java.io.IOException;
import java.io.InputStream;

/**
 * A wrapper for an input stream that removes UTF8 BOM sequences at the start of the file.
 * UTF8 BOMs can cause XML parser to fall over with a "Content is not allowed in prolog" Exception.
 * A leading two-byte sequence reported as a UTF-16 BOM by {@code CharsetDetection} is skipped
 * as well.
 * See:
 * <ul>
 * <li>http://bugs.java.com/bugdatabase/view_bug.do?bug_id=4508058</li>
 * <li>https://de.wikipedia.org/wiki/Byte_Order_Mark</li>
 * </ul>
 *
 * <p>The constructor eagerly reads up to {@value #BUFFER_SIZE} bytes from the wrapped stream
 * to look for a BOM; those bytes (minus any BOM) are replayed by {@link #read()} before
 * reading continues from the wrapped stream.
 *
 * @deprecated use org.apache.commons.io.input.BOMInputStream instead
 */
@Deprecated
public class BomSafeInputStreamWrapper extends InputStream {

  /** Number of leading bytes inspected for a BOM. */
  private static final int BUFFER_SIZE = 4;

  private final InputStream stream;

  /** Look-ahead bytes taken from the head of the wrapped stream. */
  private final byte[] buffer = new byte[BUFFER_SIZE];

  /** Index of the next look-ahead byte to hand out. */
  private int pointer = 0;

  /** Number of bytes in {@link #buffer} that were actually read from the wrapped stream. */
  private int limit = 0;

  /**
   * Wraps the given stream and immediately consumes its first bytes to detect a BOM.
   *
   * @param stream the stream to wrap
   */
  public BomSafeInputStreamWrapper(InputStream stream) {
    this.stream = stream;
    skipBom();
  }

  /**
   * Returns the next byte, first draining the look-ahead buffer and then delegating to the
   * wrapped stream.
   *
   * @return the next byte as a value in the range 0..255, or -1 at end of stream
   * @throws IOException if the wrapped stream fails
   */
  @Override
  public int read() throws IOException {
    if (pointer < limit) {
      // Mask to an unsigned value: a raw (signed) byte >= 0x80 would be negative, and
      // 0xFF would be indistinguishable from the end-of-stream marker -1.
      return buffer[pointer++] & 0xFF;
    }
    return stream.read();
  }

  /**
   * Closes the wrapped stream.
   *
   * @throws IOException if closing the wrapped stream fails
   */
  @Override
  public void close() throws IOException {
    stream.close();
  }

  /**
   * Fills the look-ahead buffer from the wrapped stream and positions {@link #pointer} past a
   * leading BOM, if one is detected.
   */
  private void skipBom() {
    try {
      // A single read may legally deliver fewer bytes than requested, so keep reading until
      // the buffer is full or the stream ends.
      while (limit < BUFFER_SIZE) {
        int count = stream.read(buffer, limit, BUFFER_SIZE - limit);
        if (count <= 0) {
          break;
        }
        limit += count;
      }
    } catch (IOException ignored) {
      // Best effort: the constructor cannot throw a checked exception without breaking
      // callers. Bytes read so far are still replayed; later reads go to the wrapped stream.
    }

    // Only trust a BOM match when enough bytes were really read; the rest of the buffer is
    // zero padding.
    if (limit >= 2
        && (CharsetDetection.hasUTF16BEBom(buffer) || CharsetDetection.hasUTF16LEBom(buffer))) {
      // Original note said the parser handles UTF16 BOMs fine; the two bytes are skipped anyway.
      pointer = 2;
    } else if (limit >= 3 && CharsetDetection.hasUTF8Bom(buffer)) {
      pointer = 3;
    }
  }

}