001/*
002 * Copyright 2021 Global Biodiversity Information Facility (GBIF)
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016package org.gbif.utils.file;
017
018import java.io.IOException;
019import java.io.InputStream;
020
021/**
022 * A wrapper for an input stream that removes UTF8 BOM sequences at the start of the file.
023 * UTF8 BOMs can cause XML parser to fall over with a "Content is not allowed in prolog" Exception.
024 * See:
025 * <ul>
026 *  <li>http://bugs.java.com/bugdatabase/view_bug.do?bug_id=4508058</li>
027 *  <li>https://de.wikipedia.org/wiki/Byte_Order_Mark</li>
028 * </ul>
029 *
030 * @deprecated use org.apache.commons.io.input.BOMInputStream instead
031 */
032@Deprecated
033public class BomSafeInputStreamWrapper extends InputStream {
034
035  private static final int BUFFER_SIZE = 4;
036  private final InputStream stream;
037  private final byte[] buffer = new byte[BUFFER_SIZE];
038  private int pointer = 0;
039
040  public BomSafeInputStreamWrapper(InputStream stream) {
041    this.stream = stream;
042    skipBom();
043  }
044
045  @Override
046  public int read() throws IOException {
047    if (pointer < BUFFER_SIZE) {
048      pointer++;
049      return buffer[pointer - 1];
050    } else {
051      return stream.read();
052    }
053  }
054
055  private void skipBom() {
056    try {
057      stream.read(buffer, 0, BUFFER_SIZE);
058      if (CharsetDetection.hasUTF16BEBom(buffer) || CharsetDetection.hasUTF16LEBom(buffer)) {
059        // SQX Parser handles UTF16 BOMs fine
060        pointer = 2;
061      } else if (CharsetDetection.hasUTF8Bom(buffer)) {
062        pointer = 3;
063      }
064    } catch (IOException ignored) {
065    }
066  }
067
068}