Source code

001/*
002 * Copyright 2021 Global Biodiversity Information Facility (GBIF)
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016package org.gbif.utils.file;
017
018import java.io.File;
019import java.io.IOException;
020import java.nio.ByteBuffer;
021import java.nio.CharBuffer;
022import java.nio.charset.CharacterCodingException;
023import java.nio.charset.Charset;
024import java.nio.charset.CharsetDecoder;
025import java.nio.charset.StandardCharsets;
026
027import org.slf4j.Logger;
028import org.slf4j.LoggerFactory;
029
030import static org.gbif.utils.file.FileUtils.readByteBuffer;
031
032/**
033 * <p>
034 * Utility class to guess the encoding of a given file or byte array. The guess is unfortunately not 100% reliable,
035 * especially for 8-bit charsets: it's not possible
036 * to know which 8-bit charset is used except through statistical analysis.
037 * </p>
038 * <p>
039 * On the other hand, Unicode files encoded in UTF-16 (little or big endian) or UTF-8 files with a Byte Order Mark are
040 * easy to identify. For UTF-8 files with no BOM,
041 * if the buffer is large enough, it's easy to guess.
042 * </p>
043 * <p>
044 * To determine whether mostly-English text is UTF-8 or ISO-8859-1, a fairly large buffer may be necessary to find an
045 * instance of é, ° etc.
046 * </p>
047 * This class is a heavily modified version of the original written by Guillaume LAFORGE:
048 * com.glaforge.i18n.io.CharsetToolkit
049 * taken from
050 * http://glaforge.free.fr/wiki/index.php?wiki=GuessEncoding
051 *
052 * @author Guillaume LAFORGE
053 * @author Markus Döring
054 */
055public class CharsetDetection {
056
057  private static final Logger LOG = LoggerFactory.getLogger(CharsetDetection.class);
058  // encodings to test and very unlikely chars in that encoding
059  private static final byte LF = 0x0a;
060  private static final byte CR = 0x0d;
061  private static final byte TAB = 0x09;
062
063  private static final int UNDEFINED_PENALTY = 100;
064  private static final char[] COMMON_NON_ASCII_CHARS;
065
066  static {
067    String commonChars = "äåáàæœčéèêëïñøöüßšž°±";
068    CharBuffer cbuf = CharBuffer.allocate(commonChars.length() * 2);
069    for (char c : commonChars.toCharArray()) {
070      cbuf.append(c);
071      cbuf.append(Character.toUpperCase(c));
072    }
073    COMMON_NON_ASCII_CHARS = cbuf.array();
074  }
075
076  private static final Charset LATIN1 = StandardCharsets.ISO_8859_1;
077  private static final Charset WINDOWS1252;
078  private static final Charset MACROMAN;
079
080  static {
081    Charset cs = null;
082    try {
083      cs = Charset.forName("Cp1252");
084    } catch (Exception e) {
085      LOG.warn("Windows 1252 encoding not supported on this Virtual Machine");
086    }
087    WINDOWS1252 = cs;
088
089    cs = null;
090    try {
091      cs = Charset.forName("MacRoman");
092    } catch (Exception e) {
093      LOG.warn("MacRoman encoding not supported on this Virtual Machine");
094    }
095    MACROMAN = cs;
096  }
097
098  private final byte[] buffer;
099
100  /**
101   * Constructor of the <code>com.glaforge.i18n.io.CharsetToolkit</code> utility class.
102   *
103   * @param buffer the byte buffer of which we want to know the encoding.
104   */
105  private CharsetDetection(byte[] buffer) {
106    this.buffer = buffer;
107  }
108
109  public static Charset detectEncoding(File file) throws IOException {
110    byte[] data = readByteBuffer(file).array();
111
112    CharsetDetection detector = new CharsetDetection(data);
113    Charset charset = detector.detectEncoding();
114
115    LOG.debug("Detected character encoding " + charset.displayName());
116    return charset;
117  }
118
119  /**
120   * @param bufferLength number of bytes to read in for the detection. Needs to be long enough to encounter non-ASCII
121   *                     characters, which could be unusual in English text.
122   */
123  public static Charset detectEncoding(File file, int bufferLength) throws IOException {
124    byte[] data = readByteBuffer(file, bufferLength).array();
125
126    CharsetDetection detector = new CharsetDetection(data);
127    Charset charset = detector.detectEncoding();
128
129    LOG.debug("Detected character encoding " + charset.displayName());
130    return charset;
131  }
132
133  /**
134   * Retrieve the default charset of the system.
135   *
136   * @return the default <code>Charset</code>.
137   */
138  public static Charset getDefaultSystemCharset() {
139    return Charset.forName(System.getProperty("file.encoding"));
140  }
141
142  /**
143   * Has a Byte Order Marker for UTF-16 Big Endian
144   * (utf-16 and ucs-2).
145   *
146   * @param bom a buffer.
147   *
148   * @return true if the buffer has a BOM for UTF-16 Big Endian.
149   */
150  protected static boolean hasUTF16BEBom(byte[] bom) {
151    return bom[0] == -2 && bom[1] == -1;
152  }
153
154  /**
155   * Has a Byte Order Marker for UTF-16 Low Endian
156   * (ucs-2le, ucs-4le, and ucs-16le).
157   *
158   * @param bom a buffer.
159   *
160   * @return true if the buffer has a BOM for UTF-16 Low Endian.
161   */
162  protected static boolean hasUTF16LEBom(byte[] bom) {
163    return bom[0] == -1 && bom[1] == -2;
164  }
165
166  /**
167   * Has a Byte Order Marker for UTF-8 (Used by Microsoft's Notepad and other editors).
168   *
169   * @param bom a buffer.
170   *
171   * @return true if the buffer has a BOM for UTF8.
172   */
173  protected static boolean hasUTF8Bom(byte[] bom) {
174    return bom[0] == -17 && bom[1] == -69 && bom[2] == -65;
175  }
176
177  private static boolean isCommonChar(char c) {
178    for (char cc : COMMON_NON_ASCII_CHARS) {
179      if (c == cc) {
180        return true;
181      }
182    }
183    return false;
184  }
185
186  /**
187   * If the byte has the form 10xxxxx, then it's a continuation byte of a multiple byte character;
188   *
189   * @param b a byte.
190   *
191   * @return true if it's a continuation char.
192   */
193  private static boolean isContinuationChar(byte b) {
194    return -128 <= b && b <= -65;
195  }
196
197  /**
198   * If the byte has the form 11110xx, then it's the first byte of a five-bytes sequence character.
199   *
200   * @param b a byte.
201   *
202   * @return true if it's the first byte of a five-bytes sequence.
203   */
204  private static boolean isFiveBytesSequence(byte b) {
205    return -8 <= b && b <= -5;
206  }
207
208  /**
209   * If the byte has the form 11110xx, then it's the first byte of a four-bytes sequence character.
210   *
211   * @param b a byte.
212   *
213   * @return true if it's the first byte of a four-bytes sequence.
214   */
215  private static boolean isFourBytesSequence(byte b) {
216    return -16 <= b && b <= -9;
217  }
218
219  /**
220   * If the byte has the form 1110xxx, then it's the first byte of a six-bytes sequence character.
221   *
222   * @param b a byte.
223   *
224   * @return true if it's the first byte of a six-bytes sequence.
225   */
226  private static boolean isSixBytesSequence(byte b) {
227    return -4 <= b && b <= -3;
228  }
229
230  /**
231   * If the byte has the form 1110xxx, then it's the first byte of a three-bytes sequence character.
232   *
233   * @param b a byte.
234   *
235   * @return true if it's the first byte of a three-bytes sequence.
236   */
237  private static boolean isThreeBytesSequence(byte b) {
238    return -32 <= b && b <= -17;
239  }
240
241  /**
242   * If the byte has the form 110xxxx, then it's the first byte of a two-bytes sequence character.
243   *
244   * @param b a byte.
245   *
246   * @return true if it's the first byte of a two-bytes sequence.
247   */
248  private static boolean isTwoBytesSequence(byte b) {
249    return -64 <= b && b <= -33;
250  }
251
252  private Charset detectCharacterEncoding8bit() {
253
254    // the number of "bad" chars for the best guess. A better guess will have
255    long leastSuspicousChars = testLatin1();
256    long suspicousChars;
257
258    // the best guess so far
259    Charset bestEncoding = LATIN1;
260
261    if (WINDOWS1252 != null) {
262      suspicousChars = testWindows1252();
263      if (suspicousChars < leastSuspicousChars) {
264        leastSuspicousChars = suspicousChars;
265        bestEncoding = WINDOWS1252;
266      }
267    }
268
269    if (MACROMAN != null) {
270      suspicousChars = testMacRoman();
271      if (suspicousChars < leastSuspicousChars) {
272        leastSuspicousChars = suspicousChars;
273        bestEncoding = MACROMAN;
274      }
275    }
276
277    LOG.debug("8bit Encoding guessed: {} with {} rare characters", bestEncoding, leastSuspicousChars);
278    return bestEncoding;
279  }
280
281  /**
282   * <p>
283   * Guess the encoding of the provided buffer.
284   * </p>
285   * If Byte Order Markers are encountered at the beginning of the buffer, we immediately
286   * return the charset implied by this BOM. Otherwise, the file would not be a human
287   * readable text file.
288   * <p/>
289   * <p>
290   * If there is no BOM, this method tries to discern whether the file is UTF-8 or not. If it is not UTF-8, we assume
291   * the encoding is the default system encoding (of course, it might be any 8-bit charset, but usually, an 8-bit
292   * charset is the default one).
293   * </p>
294   * <p/>
295   * <p>
296   * It is possible to discern UTF-8 thanks to the pattern of characters with a multi-byte sequence.
297   * </p>
298   * <p/>
299   * <pre>
300   * UCS-4 range (hex.)        UTF-8 octet sequence (binary)
301   * 0000 0000-0000 007F       0xxxxxxx
302   * 0000 0080-0000 07FF       110xxxxx 10xxxxxx
303   * 0000 0800-0000 FFFF       1110xxxx 10xxxxxx 10xxxxxx
304   * 0001 0000-001F FFFF       11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
305   * 0020 0000-03FF FFFF       111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
306   * 0400 0000-7FFF FFFF       1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
307   * </pre>
308   * <p>
309   * With UTF-8, 0xFE and 0xFF never appear.
310   * </p>
311   *
312   * @return the Charset recognized or the system default.
313   */
314  public Charset detectEncoding() {
315    // if the file has a Byte Order Marker, we can assume the file is in UTF-xx
316    // otherwise, the file would not be human readable
317    if (hasUTF8Bom(buffer)) {
318      return StandardCharsets.UTF_8;
319    }
320    if (hasUTF16LEBom(buffer)) {
321      return StandardCharsets.UTF_16LE;
322    }
323    if (hasUTF16BEBom(buffer)) {
324      return StandardCharsets.UTF_16BE;
325    }
326
327    // if it's not UTF-8 or a BOM present check for UTF16 zeros
328    Charset cs = detectUtf16();
329    if (cs != null) {
330      return cs;
331    }
332
333    // if the file is in UTF-8, high order bytes must have a certain value, in order to be valid
334    // if it's not the case, we can assume the encoding is some 8 bit one
335    boolean validU8Char = true;
336
337    // TODO the buffer is not read up to the end, but up to length - 6
338
339    int length = buffer.length;
340    int i = 0;
341    while (i < length - 6) {
342      byte b0 = buffer[i];
343      byte b1 = buffer[i + 1];
344      byte b2 = buffer[i + 2];
345      byte b3 = buffer[i + 3];
346      byte b4 = buffer[i + 4];
347      byte b5 = buffer[i + 5];
348      if (b0 < 0) {
349        // a high order bit was encountered, thus the encoding is not US-ASCII
350        // a two-byte sequence was encountered
351        if (isTwoBytesSequence(b0)) {
352          // there must be one continuation byte of the form 10xxxxxx,
353          // otherwise the following character is is not a valid UTF-8 construct
354          if (isContinuationChar(b1)) {
355            i++;
356          } else {
357            validU8Char = false;
358          }
359        }
360        // a three-byte sequence was encountered
361        else if (isThreeBytesSequence(b0)) {
362          // there must be two continuation bytes of the form 10xxxxxx,
363          // otherwise the following character is is not a valid UTF-8 construct
364          if (isContinuationChar(b1) && isContinuationChar(b2)) {
365            i += 2;
366          } else {
367            validU8Char = false;
368          }
369        }
370        // a four-byte sequence was encountered
371        else if (isFourBytesSequence(b0)) {
372          // there must be three continuation bytes of the form 10xxxxxx,
373          // otherwise the following character is is not a valid UTF-8 construct
374          if (isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3)) {
375            i += 3;
376          } else {
377            validU8Char = false;
378          }
379        }
380        // a five-byte sequence was encountered
381        else if (isFiveBytesSequence(b0)) {
382          // there must be four continuation bytes of the form 10xxxxxx,
383          // otherwise the following character is is not a valid UTF-8 construct
384          if (isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3) && isContinuationChar(b4)) {
385            i += 4;
386          } else {
387            validU8Char = false;
388          }
389        }
390        // a six-byte sequence was encountered
391        else if (isSixBytesSequence(b0)) {
392          // there must be five continuation bytes of the form 10xxxxxx,
393          // otherwise the following character is is not a valid UTF-8 construct
394          if (isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3) && isContinuationChar(b4)
395              && isContinuationChar(b5)) {
396            i += 5;
397          } else {
398            validU8Char = false;
399          }
400        } else {
401          validU8Char = false;
402        }
403      }
404      if (!validU8Char) {
405        break;
406      }
407      i++;
408    }
409
410    // if no invalid UTF-8 were encountered, we can assume the encoding is UTF-8,
411    // otherwise the file would not be human readable
412    if (validU8Char) {
413      return StandardCharsets.UTF_8;
414    }
415
416    // finally it must be some 8bit encoding we try to detect statistically
417    return detectCharacterEncoding8bit();
418  }
419
420  private Charset detectUtf16() {
421
422    // first try to see if we got a little or big endian, i.e. lots of zeros as the first byte or second byte if we deal
423    // with latin characters at least
424    int zerosLE = 0;
425    int zerosBE = 0;
426    boolean even = true;
427
428    int length = buffer.length;
429    int i = 0;
430    while (i < length) {
431      byte b = buffer[i];
432      i++;
433      even = !even;
434      if (b == 0x00) {
435        // zero occur a lot in utf16 with latin characters
436        if (even) {
437          zerosLE++;
438        } else {
439          zerosBE++;
440        }
441      }
442    }
443
444    // a UTF16 encoding with many latin characters would have either lots of even or uneven bytes as zero - but not both
445    int min = buffer.length / 10;
446    if ((zerosBE > min || zerosLE > min) && Math.abs(zerosBE - zerosLE) > min) {
447      Charset charset = zerosBE > zerosLE ? StandardCharsets.UTF_16BE : StandardCharsets.UTF_16LE;
448
449      // now try to decode the whole lot just to make sure
450      try {
451        CharsetDecoder decoder = charset.newDecoder();
452        decoder.decode(ByteBuffer.wrap(buffer));
453        // that worked without a problem - think we got it!
454        return charset;
455      } catch (CharacterCodingException e) {
456        // finally try with the plain UTF16 encoding
457        charset = StandardCharsets.UTF_16;
458        try {
459          CharsetDecoder decoder = charset.newDecoder();
460          decoder.decode(ByteBuffer.wrap(buffer));
461          // that worked without a problem - think we got it!
462          return charset;
463        } catch (CharacterCodingException e2) {
464        }
465      }
466    }
467
468    return null;
469  }
470
471  private long testLatin1() {
472    Charset charset = StandardCharsets.ISO_8859_1;
473    CharsetDecoder decoder = charset.newDecoder();
474
475    long suspicious = 0;
476    // count the following
477
478    // first try to decode the whole lot and count common non ascii chars
479    try {
480      CharBuffer cbuf = decoder.decode(ByteBuffer.wrap(buffer));
481      while (cbuf.hasRemaining()) {
482        char c = cbuf.get();
483        if (isCommonChar(c)) {
484          suspicious--;
485        }
486      }
487
488      // if that worked without a problem try to count suspicious characters which are rarely used in our texts
489      int length = buffer.length;
490      int i = 0;
491      while (i < length) {
492        byte b = buffer[i];
493        i++;
494        // range 7f-9f undefined, see http://de.wikipedia.org/wiki/ISO_8859-1
495        if (b >= (byte) 0x80 && b <= (byte) 0x9f) {
496          suspicious += UNDEFINED_PENALTY;
497        }
498      }
499    } catch (CharacterCodingException e) {
500      suspicious = Long.MAX_VALUE;
501    }
502
503    return suspicious;
504  }
505
506  private long testMacRoman() {
507    CharsetDecoder decoder = MACROMAN.newDecoder();
508
509    long suspicious = 0;
510
511    // first try to decode the whole lot
512    try {
513      CharBuffer cbuf = decoder.decode(ByteBuffer.wrap(buffer));
514      while (cbuf.hasRemaining()) {
515        char c = cbuf.get();
516        if (isCommonChar(c)) {
517          suspicious--;
518        }
519      }
520      // if that worked without a problem try to count suspicious characters which are rarely used in our texts
521      int length = buffer.length;
522      int i = 0;
523      while (i < length) {
524        byte b = buffer[i];
525        i++;
526        // all ranges defined I am afraid
527      }
528    } catch (CharacterCodingException e) {
529      suspicious = Long.MAX_VALUE;
530    }
531
532    return suspicious;
533  }
534
535  private long testWindows1252() {
536    CharsetDecoder decoder = WINDOWS1252.newDecoder();
537    long suspicious = 0;
538
539    // first try to decode the whole lot
540    try {
541      CharBuffer cbuf = decoder.decode(ByteBuffer.wrap(buffer));
542      while (cbuf.hasRemaining()) {
543        char c = cbuf.get();
544        if (isCommonChar(c)) {
545          suspicious--;
546        }
547      }
548      // if that worked without a problem try to count suspicous characters which are rarely used in our texts
549      // see http://de.wikipedia.org/wiki/ISO_8859-1
550      int length = buffer.length;
551      int i = 0;
552      while (i < length) {
553        byte b = buffer[i];
554        i++;
555        // 5 undefined chars
556        if (b == (byte) 0x81 || b == (byte) 0x8d || b == (byte) 0x8f || b == (byte) 0x90 || b == (byte) 0x9d) {
557          suspicious += UNDEFINED_PENALTY;
558        }
559      }
560    } catch (CharacterCodingException e) {
561      suspicious = Long.MAX_VALUE;
562    }
563
564    return suspicious;
565  }
566
567}