Source code

001/*
002 * Licensed under the Apache License, Version 2.0 (the "License");
003 * you may not use this file except in compliance with the License.
004 * You may obtain a copy of the License at
005 *
006 *     http://www.apache.org/licenses/LICENSE-2.0
007 *
008 * Unless required by applicable law or agreed to in writing, software
009 * distributed under the License is distributed on an "AS IS" BASIS,
010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
011 * See the License for the specific language governing permissions and
012 * limitations under the License.
013 */
014package org.gbif.utils.file;
015
016import java.io.File;
017import java.io.IOException;
018import java.nio.ByteBuffer;
019import java.nio.CharBuffer;
020import java.nio.charset.CharacterCodingException;
021import java.nio.charset.Charset;
022import java.nio.charset.CharsetDecoder;
023import java.nio.charset.StandardCharsets;
024
025import org.slf4j.Logger;
026import org.slf4j.LoggerFactory;
027
028import static org.gbif.utils.file.FileUtils.readByteBuffer;
029
030/**
031 * <p>
032 * Utility class to guess the encoding of a given file or byte array. The guess is unfortunately not 100% sure.
033 * Especially for 8-bit charsets. It's not possible
034 * to know which 8-bit charset is used. Except through statistical analysis.
035 * </p>
036 * <p>
037 * On the other hand, unicode files encoded in UTF-16 (low or big endian) or UTF-8 files with a Byte Order Marker are
038 * easy to find. For UTF-8 files with no BOM,
039 * if the buffer is wide enough, it's easy to guess.
040 * </p>
041 * <p>
042 * To determine whether mostly-English text is UTF-8 or ISO-8859-1, a fairly large buffer may be necessary to find an
043 * instance of é, ° etc.
044 * </p>
045 * This class is a heavily modified version of the original written by Guillaume LAFORGE:
046 * com.glaforge.i18n.io.CharsetToolkit
047 * taken from
048 * http://glaforge.free.fr/wiki/index.php?wiki=GuessEncoding
049 *
050 * @author Guillaume LAFORGE
051 * @author Markus Döring
052 */
053public class CharsetDetection {
054
055  private static final Logger LOG = LoggerFactory.getLogger(CharsetDetection.class);
056  // encodings to test and very unlikely chars in that encoding
057  private static final byte LF = 0x0a;
058  private static final byte CR = 0x0d;
059  private static final byte TAB = 0x09;
060
061  private static final int UNDEFINED_PENALTY = 100;
062  private static final char[] COMMON_NON_ASCII_CHARS;
063
064  static {
065    String commonChars = "äåáàæœčéèêëïñøöüßšž°±";
066    CharBuffer cbuf = CharBuffer.allocate(commonChars.length() * 2);
067    for (char c : commonChars.toCharArray()) {
068      cbuf.append(c);
069      cbuf.append(Character.toUpperCase(c));
070    }
071    COMMON_NON_ASCII_CHARS = cbuf.array();
072  }
073
074  private static final Charset LATIN1 = StandardCharsets.ISO_8859_1;
075  private static final Charset WINDOWS1252;
076  private static final Charset MACROMAN;
077
078  static {
079    Charset cs = null;
080    try {
081      cs = Charset.forName("Cp1252");
082    } catch (Exception e) {
083      LOG.warn("Windows 1252 encoding not supported on this Virtual Machine");
084    }
085    WINDOWS1252 = cs;
086
087    cs = null;
088    try {
089      cs = Charset.forName("MacRoman");
090    } catch (Exception e) {
091      LOG.warn("MacRoman encoding not supported on this Virtual Machine");
092    }
093    MACROMAN = cs;
094  }
095
096  private final byte[] buffer;
097
098  /**
099   * Constructor of the <code>com.glaforge.i18n.io.CharsetToolkit</code> utility class.
100   *
101   * @param buffer the byte buffer of which we want to know the encoding.
102   */
103  private CharsetDetection(byte[] buffer) {
104    this.buffer = buffer;
105  }
106
107  public static Charset detectEncoding(File file) throws IOException {
108    byte[] data = readByteBuffer(file).array();
109
110    CharsetDetection detector = new CharsetDetection(data);
111    Charset charset = detector.detectEncoding();
112
113    LOG.debug("Detected character encoding " + charset.displayName());
114    return charset;
115  }
116
117  /**
118   * @param bufferLength number of bytes to read in for the detection. Needs to be long enough to encounter non-ASCII
119   *                     characters, which could be unusual in English text.
120   */
121  public static Charset detectEncoding(File file, int bufferLength) throws IOException {
122    byte[] data = readByteBuffer(file, bufferLength).array();
123
124    CharsetDetection detector = new CharsetDetection(data);
125    Charset charset = detector.detectEncoding();
126
127    LOG.debug("Detected character encoding " + charset.displayName());
128    return charset;
129  }
130
131  /**
132   * Retrieve the default charset of the system.
133   *
134   * @return the default <code>Charset</code>.
135   */
136  public static Charset getDefaultSystemCharset() {
137    return Charset.forName(System.getProperty("file.encoding"));
138  }
139
140  /**
141   * Has a Byte Order Marker for UTF-16 Big Endian
142   * (utf-16 and ucs-2).
143   *
144   * @param bom a buffer.
145   *
146   * @return true if the buffer has a BOM for UTF-16 Big Endian.
147   */
148  protected static boolean hasUTF16BEBom(byte[] bom) {
149    return bom[0] == -2 && bom[1] == -1;
150  }
151
152  /**
153   * Has a Byte Order Marker for UTF-16 Low Endian
154   * (ucs-2le, ucs-4le, and ucs-16le).
155   *
156   * @param bom a buffer.
157   *
158   * @return true if the buffer has a BOM for UTF-16 Low Endian.
159   */
160  protected static boolean hasUTF16LEBom(byte[] bom) {
161    return bom[0] == -1 && bom[1] == -2;
162  }
163
164  /**
165   * Has a Byte Order Marker for UTF-8 (Used by Microsoft's Notepad and other editors).
166   *
167   * @param bom a buffer.
168   *
169   * @return true if the buffer has a BOM for UTF8.
170   */
171  protected static boolean hasUTF8Bom(byte[] bom) {
172    return bom[0] == -17 && bom[1] == -69 && bom[2] == -65;
173  }
174
175  private static boolean isCommonChar(char c) {
176    for (char cc : COMMON_NON_ASCII_CHARS) {
177      if (c == cc) {
178        return true;
179      }
180    }
181    return false;
182  }
183
184  /**
185   * If the byte has the form 10xxxxx, then it's a continuation byte of a multiple byte character;
186   *
187   * @param b a byte.
188   *
189   * @return true if it's a continuation char.
190   */
191  private static boolean isContinuationChar(byte b) {
192    return -128 <= b && b <= -65;
193  }
194
195  /**
196   * If the byte has the form 11110xx, then it's the first byte of a five-bytes sequence character.
197   *
198   * @param b a byte.
199   *
200   * @return true if it's the first byte of a five-bytes sequence.
201   */
202  private static boolean isFiveBytesSequence(byte b) {
203    return -8 <= b && b <= -5;
204  }
205
206  /**
207   * If the byte has the form 11110xx, then it's the first byte of a four-bytes sequence character.
208   *
209   * @param b a byte.
210   *
211   * @return true if it's the first byte of a four-bytes sequence.
212   */
213  private static boolean isFourBytesSequence(byte b) {
214    return -16 <= b && b <= -9;
215  }
216
217  /**
218   * If the byte has the form 1110xxx, then it's the first byte of a six-bytes sequence character.
219   *
220   * @param b a byte.
221   *
222   * @return true if it's the first byte of a six-bytes sequence.
223   */
224  private static boolean isSixBytesSequence(byte b) {
225    return -4 <= b && b <= -3;
226  }
227
228  /**
229   * If the byte has the form 1110xxx, then it's the first byte of a three-bytes sequence character.
230   *
231   * @param b a byte.
232   *
233   * @return true if it's the first byte of a three-bytes sequence.
234   */
235  private static boolean isThreeBytesSequence(byte b) {
236    return -32 <= b && b <= -17;
237  }
238
239  /**
240   * If the byte has the form 110xxxx, then it's the first byte of a two-bytes sequence character.
241   *
242   * @param b a byte.
243   *
244   * @return true if it's the first byte of a two-bytes sequence.
245   */
246  private static boolean isTwoBytesSequence(byte b) {
247    return -64 <= b && b <= -33;
248  }
249
250  private Charset detectCharacterEncoding8bit() {
251
252    // the number of "bad" chars for the best guess. A better guess will have
253    long leastSuspicousChars = testLatin1();
254    long suspicousChars;
255
256    // the best guess so far
257    Charset bestEncoding = LATIN1;
258
259    if (WINDOWS1252 != null) {
260      suspicousChars = testWindows1252();
261      if (suspicousChars < leastSuspicousChars) {
262        leastSuspicousChars = suspicousChars;
263        bestEncoding = WINDOWS1252;
264      }
265    }
266
267    if (MACROMAN != null) {
268      suspicousChars = testMacRoman();
269      if (suspicousChars < leastSuspicousChars) {
270        leastSuspicousChars = suspicousChars;
271        bestEncoding = MACROMAN;
272      }
273    }
274
275    LOG.debug(
276        "8bit Encoding guessed: {} with {} rare characters", bestEncoding, leastSuspicousChars);
277    return bestEncoding;
278  }
279
280  /**
281   * <p>
282   * Guess the encoding of the provided buffer.
283   * </p>
284   * If Byte Order Markers are encountered at the beginning of the buffer, we immediately
285   * return the charset implied by this BOM. Otherwise, the file would not be a human
286   * readable text file.
287   * <p/>
288   * <p>
289   * If there is no BOM, this method tries to discern whether the file is UTF-8 or not. If it is not UTF-8, we assume
290   * the encoding is the default system encoding (of course, it might be any 8-bit charset, but usually, an 8-bit
291   * charset is the default one).
292   * </p>
293   * <p/>
294   * <p>
295   * It is possible to discern UTF-8 thanks to the pattern of characters with a multi-byte sequence.
296   * </p>
297   * <p/>
298   * <pre>
299   * UCS-4 range (hex.)        UTF-8 octet sequence (binary)
300   * 0000 0000-0000 007F       0xxxxxxx
301   * 0000 0080-0000 07FF       110xxxxx 10xxxxxx
302   * 0000 0800-0000 FFFF       1110xxxx 10xxxxxx 10xxxxxx
303   * 0001 0000-001F FFFF       11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
304   * 0020 0000-03FF FFFF       111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
305   * 0400 0000-7FFF FFFF       1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
306   * </pre>
307   * <p>
308   * With UTF-8, 0xFE and 0xFF never appear.
309   * </p>
310   *
311   * @return the Charset recognized or the system default.
312   */
313  public Charset detectEncoding() {
314    // if the file has a Byte Order Marker, we can assume the file is in UTF-xx
315    // otherwise, the file would not be human readable
316    if (hasUTF8Bom(buffer)) {
317      return StandardCharsets.UTF_8;
318    }
319    if (hasUTF16LEBom(buffer)) {
320      return StandardCharsets.UTF_16LE;
321    }
322    if (hasUTF16BEBom(buffer)) {
323      return StandardCharsets.UTF_16BE;
324    }
325
326    // if it's not UTF-8 or a BOM present check for UTF16 zeros
327    Charset cs = detectUtf16();
328    if (cs != null) {
329      return cs;
330    }
331
332    // if the file is in UTF-8, high order bytes must have a certain value, in order to be valid
333    // if it's not the case, we can assume the encoding is some 8 bit one
334    boolean validU8Char = true;
335
336    // TODO the buffer is not read up to the end, but up to length - 6
337
338    int length = buffer.length;
339    int i = 0;
340    while (i < length - 6) {
341      byte b0 = buffer[i];
342      byte b1 = buffer[i + 1];
343      byte b2 = buffer[i + 2];
344      byte b3 = buffer[i + 3];
345      byte b4 = buffer[i + 4];
346      byte b5 = buffer[i + 5];
347      if (b0 < 0) {
348        // a high order bit was encountered, thus the encoding is not US-ASCII
349        // a two-byte sequence was encountered
350        if (isTwoBytesSequence(b0)) {
351          // there must be one continuation byte of the form 10xxxxxx,
352          // otherwise the following character is is not a valid UTF-8 construct
353          if (isContinuationChar(b1)) {
354            i++;
355          } else {
356            validU8Char = false;
357          }
358        }
359        // a three-byte sequence was encountered
360        else if (isThreeBytesSequence(b0)) {
361          // there must be two continuation bytes of the form 10xxxxxx,
362          // otherwise the following character is is not a valid UTF-8 construct
363          if (isContinuationChar(b1) && isContinuationChar(b2)) {
364            i += 2;
365          } else {
366            validU8Char = false;
367          }
368        }
369        // a four-byte sequence was encountered
370        else if (isFourBytesSequence(b0)) {
371          // there must be three continuation bytes of the form 10xxxxxx,
372          // otherwise the following character is is not a valid UTF-8 construct
373          if (isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3)) {
374            i += 3;
375          } else {
376            validU8Char = false;
377          }
378        }
379        // a five-byte sequence was encountered
380        else if (isFiveBytesSequence(b0)) {
381          // there must be four continuation bytes of the form 10xxxxxx,
382          // otherwise the following character is is not a valid UTF-8 construct
383          if (isContinuationChar(b1)
384              && isContinuationChar(b2)
385              && isContinuationChar(b3)
386              && isContinuationChar(b4)) {
387            i += 4;
388          } else {
389            validU8Char = false;
390          }
391        }
392        // a six-byte sequence was encountered
393        else if (isSixBytesSequence(b0)) {
394          // there must be five continuation bytes of the form 10xxxxxx,
395          // otherwise the following character is is not a valid UTF-8 construct
396          if (isContinuationChar(b1)
397              && isContinuationChar(b2)
398              && isContinuationChar(b3)
399              && isContinuationChar(b4)
400              && isContinuationChar(b5)) {
401            i += 5;
402          } else {
403            validU8Char = false;
404          }
405        } else {
406          validU8Char = false;
407        }
408      }
409      if (!validU8Char) {
410        break;
411      }
412      i++;
413    }
414
415    // if no invalid UTF-8 were encountered, we can assume the encoding is UTF-8,
416    // otherwise the file would not be human readable
417    if (validU8Char) {
418      return StandardCharsets.UTF_8;
419    }
420
421    // finally it must be some 8bit encoding we try to detect statistically
422    return detectCharacterEncoding8bit();
423  }
424
425  private Charset detectUtf16() {
426
427    // first try to see if we got a little or big endian, i.e. lots of zeros as the first byte or
428    // second byte if we deal
429    // with latin characters at least
430    int zerosLE = 0;
431    int zerosBE = 0;
432    boolean even = true;
433
434    int length = buffer.length;
435    int i = 0;
436    while (i < length) {
437      byte b = buffer[i];
438      i++;
439      even = !even;
440      if (b == 0x00) {
441        // zero occur a lot in utf16 with latin characters
442        if (even) {
443          zerosLE++;
444        } else {
445          zerosBE++;
446        }
447      }
448    }
449
450    // a UTF16 encoding with many latin characters would have either lots of even or uneven bytes as
451    // zero - but not both
452    int min = buffer.length / 10;
453    if ((zerosBE > min || zerosLE > min) && Math.abs(zerosBE - zerosLE) > min) {
454      Charset charset = zerosBE > zerosLE ? StandardCharsets.UTF_16BE : StandardCharsets.UTF_16LE;
455
456      // now try to decode the whole lot just to make sure
457      try {
458        CharsetDecoder decoder = charset.newDecoder();
459        decoder.decode(ByteBuffer.wrap(buffer));
460        // that worked without a problem - think we got it!
461        return charset;
462      } catch (CharacterCodingException e) {
463        // finally try with the plain UTF16 encoding
464        charset = StandardCharsets.UTF_16;
465        try {
466          CharsetDecoder decoder = charset.newDecoder();
467          decoder.decode(ByteBuffer.wrap(buffer));
468          // that worked without a problem - think we got it!
469          return charset;
470        } catch (CharacterCodingException e2) {
471        }
472      }
473    }
474
475    return null;
476  }
477
478  private long testLatin1() {
479    Charset charset = StandardCharsets.ISO_8859_1;
480    CharsetDecoder decoder = charset.newDecoder();
481
482    long suspicious = 0;
483    // count the following
484
485    // first try to decode the whole lot and count common non ascii chars
486    try {
487      CharBuffer cbuf = decoder.decode(ByteBuffer.wrap(buffer));
488      while (cbuf.hasRemaining()) {
489        char c = cbuf.get();
490        if (isCommonChar(c)) {
491          suspicious--;
492        }
493      }
494
495      // if that worked without a problem try to count suspicious characters which are rarely used
496      // in our texts
497      int length = buffer.length;
498      int i = 0;
499      while (i < length) {
500        byte b = buffer[i];
501        i++;
502        // range 7f-9f undefined, see http://de.wikipedia.org/wiki/ISO_8859-1
503        if (b >= (byte) 0x80 && b <= (byte) 0x9f) {
504          suspicious += UNDEFINED_PENALTY;
505        }
506      }
507    } catch (CharacterCodingException e) {
508      suspicious = Long.MAX_VALUE;
509    }
510
511    return suspicious;
512  }
513
514  private long testMacRoman() {
515    CharsetDecoder decoder = MACROMAN.newDecoder();
516
517    long suspicious = 0;
518
519    // first try to decode the whole lot
520    try {
521      CharBuffer cbuf = decoder.decode(ByteBuffer.wrap(buffer));
522      while (cbuf.hasRemaining()) {
523        char c = cbuf.get();
524        if (isCommonChar(c)) {
525          suspicious--;
526        }
527      }
528      // if that worked without a problem try to count suspicious characters which are rarely used
529      // in our texts
530      int length = buffer.length;
531      int i = 0;
532      while (i < length) {
533        byte b = buffer[i];
534        i++;
535        // all ranges defined I am afraid
536      }
537    } catch (CharacterCodingException e) {
538      suspicious = Long.MAX_VALUE;
539    }
540
541    return suspicious;
542  }
543
544  private long testWindows1252() {
545    CharsetDecoder decoder = WINDOWS1252.newDecoder();
546    long suspicious = 0;
547
548    // first try to decode the whole lot
549    try {
550      CharBuffer cbuf = decoder.decode(ByteBuffer.wrap(buffer));
551      while (cbuf.hasRemaining()) {
552        char c = cbuf.get();
553        if (isCommonChar(c)) {
554          suspicious--;
555        }
556      }
557      // if that worked without a problem try to count suspicous characters which are rarely used in
558      // our texts
559      // see http://de.wikipedia.org/wiki/ISO_8859-1
560      int length = buffer.length;
561      int i = 0;
562      while (i < length) {
563        byte b = buffer[i];
564        i++;
565        // 5 undefined chars
566        if (b == (byte) 0x81
567            || b == (byte) 0x8d
568            || b == (byte) 0x8f
569            || b == (byte) 0x90
570            || b == (byte) 0x9d) {
571          suspicious += UNDEFINED_PENALTY;
572        }
573      }
574    } catch (CharacterCodingException e) {
575      suspicious = Long.MAX_VALUE;
576    }
577
578    return suspicious;
579  }
580}