001package org.gbif.utils.file;
002
003import java.io.File;
004import java.io.IOException;
005import java.nio.ByteBuffer;
006import java.nio.CharBuffer;
007import java.nio.charset.CharacterCodingException;
008import java.nio.charset.Charset;
009import java.nio.charset.CharsetDecoder;
010
011import com.google.common.base.Charsets;
012import org.slf4j.Logger;
013import org.slf4j.LoggerFactory;
014
015import static org.gbif.utils.file.FileUtils.readByteBuffer;
016
017/**
018 * <p>
019 * Utility class to guess the encoding of a given file or byte array. The guess is unfortunately not 100% sure.
020 * Especially for 8-bit charsets. It's not possible
021 * to know which 8-bit charset is used. Except through statistical analysis.
022 * </p>
023 * <p/>
024 * <p>
025 * On the other hand, unicode files encoded in UTF-16 (low or big endian) or UTF-8 files with a Byte Order Marker are
026 * easy to find. For UTF-8 files with no BOM,
027 * if the buffer is wide enough, it's easy to guess.
028 * </p>
029 * <p/>
030 * <p>
031 * A byte buffer of 4KB or 8KB is sufficient to be able to guess the encoding.
032 * </p>
033 * This class is a heavily modified version of the original written by Guillaume LAFORGE:
034 * com.glaforge.i18n.io.CharsetToolkit
035 * taken from
036 * http://glaforge.free.fr/wiki/index.php?wiki=GuessEncoding
037 *
038 * @author Guillaume LAFORGE
039 * @author Markus Döring
040 */
041public class CharsetDetection {
042
043  private static final Logger LOG = LoggerFactory.getLogger(CharsetDetection.class);
044  // encodings to test and very unlikely chars in that encoding
045  private static final byte LF = 0x0a;
046  private static final byte CR = 0x0d;
047  private static final byte TAB = 0x09;
048
049  private static final int UNDEFINED_PENALTY = 100;
050  private static final char[] COMMON_NON_ASCII_CHARS;
051
052  static {
053    String commonChars = "äåáàæœčéèêëïñøöüßšž";
054    CharBuffer cbuf = CharBuffer.allocate(commonChars.length() * 2);
055    for (char c : commonChars.toCharArray()) {
056      cbuf.append(c);
057      cbuf.append(Character.toUpperCase(c));
058    }
059    COMMON_NON_ASCII_CHARS = cbuf.array();
060  }
061
062  private static final Charset LATIN1 = Charsets.ISO_8859_1;
063  private static final Charset WINDOWS1252;
064  private static final Charset MACROMAN;
065
066  static {
067    Charset cs = null;
068    try {
069      cs = Charset.forName("Cp1252");
070    } catch (Exception e) {
071      LOG.warn("Windows 1252 encoding not supported on this Virtual Machine");
072    }
073    WINDOWS1252 = cs;
074
075    cs = null;
076    try {
077      cs = Charset.forName("MacRoman");
078    } catch (Exception e) {
079      LOG.warn("MacRoman encoding not supported on this Virtual Machine");
080    }
081    MACROMAN = cs;
082  }
083
084  private final byte[] buffer;
085
086  /**
087   * Constructor of the <code>com.glaforge.i18n.io.CharsetToolkit</code> utility class.
088   *
089   * @param buffer the byte buffer of which we want to know the encoding.
090   */
091  private CharsetDetection(byte[] buffer) {
092    this.buffer = buffer;
093  }
094
095  public static Charset detectEncoding(File file) throws IOException {
096    byte[] data = readByteBuffer(file).array();
097
098    CharsetDetection detector = new CharsetDetection(data);
099    Charset charset = detector.detectEncoding();
100
101    LOG.debug("Detected character encoding " + charset.displayName());
102    return charset;
103  }
104
105  /**
106   * @param bufferLength number of bytes to read in for the detection. 8192 is a reasonable value
107   */
108  public static Charset detectEncoding(File file, int bufferLength) throws IOException {
109    byte[] data = readByteBuffer(file, bufferLength).array();
110
111    CharsetDetection detector = new CharsetDetection(data);
112    Charset charset = detector.detectEncoding();
113
114    LOG.debug("Detected character encoding " + charset.displayName());
115    return charset;
116  }
117
118  /**
119   * Retrieve the default charset of the system.
120   *
121   * @return the default <code>Charset</code>.
122   */
123  public static Charset getDefaultSystemCharset() {
124    return Charset.forName(System.getProperty("file.encoding"));
125  }
126
127  /**
128   * Has a Byte Order Marker for UTF-16 Big Endian
129   * (utf-16 and ucs-2).
130   *
131   * @param bom a buffer.
132   *
133   * @return true if the buffer has a BOM for UTF-16 Big Endian.
134   */
135  protected static boolean hasUTF16BEBom(byte[] bom) {
136    return bom[0] == -2 && bom[1] == -1;
137  }
138
139  /**
140   * Has a Byte Order Marker for UTF-16 Low Endian
141   * (ucs-2le, ucs-4le, and ucs-16le).
142   *
143   * @param bom a buffer.
144   *
145   * @return true if the buffer has a BOM for UTF-16 Low Endian.
146   */
147  protected static boolean hasUTF16LEBom(byte[] bom) {
148    return bom[0] == -1 && bom[1] == -2;
149  }
150
151  /**
152   * Has a Byte Order Marker for UTF-8 (Used by Microsoft's Notepad and other editors).
153   *
154   * @param bom a buffer.
155   *
156   * @return true if the buffer has a BOM for UTF8.
157   */
158  protected static boolean hasUTF8Bom(byte[] bom) {
159    return bom[0] == -17 && bom[1] == -69 && bom[2] == -65;
160  }
161
162  private static boolean isCommonChar(char c) {
163    for (char cc : COMMON_NON_ASCII_CHARS) {
164      if (c == cc) {
165        return true;
166      }
167    }
168    return false;
169  }
170
171  /**
172   * If the byte has the form 10xxxxx, then it's a continuation byte of a multiple byte character;
173   *
174   * @param b a byte.
175   *
176   * @return true if it's a continuation char.
177   */
178  private static boolean isContinuationChar(byte b) {
179    return -128 <= b && b <= -65;
180  }
181
182  /**
183   * If the byte has the form 11110xx, then it's the first byte of a five-bytes sequence character.
184   *
185   * @param b a byte.
186   *
187   * @return true if it's the first byte of a five-bytes sequence.
188   */
189  private static boolean isFiveBytesSequence(byte b) {
190    return -8 <= b && b <= -5;
191  }
192
193  /**
194   * If the byte has the form 11110xx, then it's the first byte of a four-bytes sequence character.
195   *
196   * @param b a byte.
197   *
198   * @return true if it's the first byte of a four-bytes sequence.
199   */
200  private static boolean isFourBytesSequence(byte b) {
201    return -16 <= b && b <= -9;
202  }
203
204  /**
205   * If the byte has the form 1110xxx, then it's the first byte of a six-bytes sequence character.
206   *
207   * @param b a byte.
208   *
209   * @return true if it's the first byte of a six-bytes sequence.
210   */
211  private static boolean isSixBytesSequence(byte b) {
212    return -4 <= b && b <= -3;
213  }
214
215  /**
216   * If the byte has the form 1110xxx, then it's the first byte of a three-bytes sequence character.
217   *
218   * @param b a byte.
219   *
220   * @return true if it's the first byte of a three-bytes sequence.
221   */
222  private static boolean isThreeBytesSequence(byte b) {
223    return -32 <= b && b <= -17;
224  }
225
226  /**
227   * If the byte has the form 110xxxx, then it's the first byte of a two-bytes sequence character.
228   *
229   * @param b a byte.
230   *
231   * @return true if it's the first byte of a two-bytes sequence.
232   */
233  private static boolean isTwoBytesSequence(byte b) {
234    return -64 <= b && b <= -33;
235  }
236
237  private Charset detectCharacterEncoding8bit() {
238
239    // the number of "bad" chars for the best guess. A better guess will have
240    long leastSuspicousChars = testLatin1();
241    long suspicousChars;
242
243    // the best guess so far
244    Charset bestEncoding = LATIN1;
245
246    if (WINDOWS1252 != null) {
247      suspicousChars = testWindows1252();
248      if (suspicousChars < leastSuspicousChars) {
249        leastSuspicousChars = suspicousChars;
250        bestEncoding = WINDOWS1252;
251      }
252    }
253
254    if (MACROMAN != null) {
255      suspicousChars = testMacRoman();
256      if (suspicousChars < leastSuspicousChars) {
257        leastSuspicousChars = suspicousChars;
258        bestEncoding = MACROMAN;
259      }
260    }
261
262    LOG.debug("8bit Encoding guessed: {} with {} rare characters", bestEncoding, leastSuspicousChars);
263    return bestEncoding;
264  }
265
266  /**
267   * <p>
268   * Guess the encoding of the provided buffer.
269   * </p>
270   * If Byte Order Markers are encountered at the beginning of the buffer, we immidiately
271   * return the charset implied by this BOM. Otherwise, the file would not be a human
272   * readable text file.</p>
273   * <p/>
274   * <p>
275   * If there is no BOM, this method tries to discern whether the file is UTF-8 or not. If it is not UTF-8, we assume
276   * the encoding is the default system
277   * encoding (of course, it might be any 8-bit charset, but usually, an 8-bit charset is the default one).
278   * </p>
279   * <p/>
280   * <p>
281   * It is possible to discern UTF-8 thanks to the pattern of characters with a multi-byte sequence.
282   * </p>
283   * <p/>
284   * <pre>
285   * UCS-4 range (hex.)        UTF-8 octet sequence (binary)
286   * 0000 0000-0000 007F       0xxxxxxx
287   * 0000 0080-0000 07FF       110xxxxx 10xxxxxx
288   * 0000 0800-0000 FFFF       1110xxxx 10xxxxxx 10xxxxxx
289   * 0001 0000-001F FFFF       11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
290   * 0020 0000-03FF FFFF       111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
291   * 0400 0000-7FFF FFFF       1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
292   * </pre>
293   * <p>
294   * With UTF-8, 0xFE and 0xFF never appear.
295   * </p>
296   *
297   * @return the Charset recognized or the system default.
298   */
299  public Charset detectEncoding() {
300    // if the file has a Byte Order Marker, we can assume the file is in UTF-xx
301    // otherwise, the file would not be human readable
302    if (hasUTF8Bom(buffer)) {
303      return Charsets.UTF_8;
304    }
305    if (hasUTF16LEBom(buffer)) {
306      return Charsets.UTF_16LE;
307    }
308    if (hasUTF16BEBom(buffer)) {
309      return Charsets.UTF_16BE;
310    }
311
312    // if it's not UTF-8 or a BOM present check for UTF16 zeros
313    Charset cs = detectUtf16();
314    if (cs != null) {
315      return cs;
316    }
317
318    // if the file is in UTF-8, high order bytes must have a certain value, in order to be valid
319    // if it's not the case, we can assume the encoding is some 8 bit one
320    boolean validU8Char = true;
321
322    // TODO the buffer is not read up to the end, but up to length - 6
323
324    int length = buffer.length;
325    int i = 0;
326    while (i < length - 6) {
327      byte b0 = buffer[i];
328      byte b1 = buffer[i + 1];
329      byte b2 = buffer[i + 2];
330      byte b3 = buffer[i + 3];
331      byte b4 = buffer[i + 4];
332      byte b5 = buffer[i + 5];
333      if (b0 < 0) {
334        // a high order bit was encountered, thus the encoding is not US-ASCII
335        // a two-bytes sequence was encoutered
336        if (isTwoBytesSequence(b0)) {
337          // there must be one continuation byte of the form 10xxxxxx,
338          // otherwise the following characteris is not a valid UTF-8 construct
339          if (isContinuationChar(b1)) {
340            i++;
341          } else {
342            validU8Char = false;
343          }
344        }
345        // a three-bytes sequence was encoutered
346        else if (isThreeBytesSequence(b0)) {
347          // there must be two continuation bytes of the form 10xxxxxx,
348          // otherwise the following characteris is not a valid UTF-8 construct
349          if (isContinuationChar(b1) && isContinuationChar(b2)) {
350            i += 2;
351          } else {
352            validU8Char = false;
353          }
354        }
355        // a four-bytes sequence was encoutered
356        else if (isFourBytesSequence(b0)) {
357          // there must be three continuation bytes of the form 10xxxxxx,
358          // otherwise the following characteris is not a valid UTF-8 construct
359          if (isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3)) {
360            i += 3;
361          } else {
362            validU8Char = false;
363          }
364        }
365        // a five-bytes sequence was encoutered
366        else if (isFiveBytesSequence(b0)) {
367          // there must be four continuation bytes of the form 10xxxxxx,
368          // otherwise the following characteris is not a valid UTF-8 construct
369          if (isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3) && isContinuationChar(b4)) {
370            i += 4;
371          } else {
372            validU8Char = false;
373          }
374        }
375        // a six-bytes sequence was encoutered
376        else if (isSixBytesSequence(b0)) {
377          // there must be five continuation bytes of the form 10xxxxxx,
378          // otherwise the following characteris is not a valid UTF-8 construct
379          if (isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3) && isContinuationChar(b4)
380              && isContinuationChar(b5)) {
381            i += 5;
382          } else {
383            validU8Char = false;
384          }
385        } else {
386          validU8Char = false;
387        }
388      }
389      if (!validU8Char) {
390        break;
391      }
392      i++;
393    }
394
395    // if no invalid UTF-8 were encountered, we can assume the encoding is UTF-8,
396    // otherwise the file would not be human readable
397    if (validU8Char) {
398      return Charsets.UTF_8;
399    }
400
401    // finally it must be some 8bit encoding we try to detect statistically
402    return detectCharacterEncoding8bit();
403  }
404
405  private Charset detectUtf16() {
406
407    // first try to see if we got a little or big endian, i.e. lots of zeros as the first byte or second byte if we deal
408    // with latin characters at least
409    int zerosLE = 0;
410    int zerosBE = 0;
411    boolean even = true;
412
413    int length = buffer.length;
414    int i = 0;
415    while (i < length) {
416      byte b = buffer[i];
417      i++;
418      even = !even;
419      if (b == 0x00) {
420        // zero occurr a lot in utf16 with latin characters
421        if (even) {
422          zerosLE++;
423        } else {
424          zerosBE++;
425        }
426      }
427    }
428
429    // a UTF16 encoding with many latin characters would have either lots of even or uneven bytes as zero - but not both
430    int min = buffer.length / 10;
431    if ((zerosBE > min || zerosLE > min) && Math.abs(zerosBE - zerosLE) > min) {
432      Charset charset = zerosBE > zerosLE ? Charsets.UTF_16BE : Charsets.UTF_16LE;
433
434      // now try to decode the whole lot just to make sure
435      try {
436        CharsetDecoder decoder = charset.newDecoder();
437        decoder.decode(ByteBuffer.wrap(buffer));
438        // that worked without a problem - think we got it!
439        return charset;
440      } catch (CharacterCodingException e) {
441        // finally try with the plain UTF16 encoding
442        charset = Charsets.UTF_16;
443        try {
444          CharsetDecoder decoder = charset.newDecoder();
445          decoder.decode(ByteBuffer.wrap(buffer));
446          // that worked without a problem - think we got it!
447          return charset;
448        } catch (CharacterCodingException e2) {
449        }
450      }
451    }
452
453    return null;
454  }
455
456  private long testLatin1() {
457    Charset charset = Charsets.ISO_8859_1;
458    CharsetDecoder decoder = charset.newDecoder();
459
460    long suspicous = 0;
461    // count the following
462
463    // first try to decode the whole lot and count common non ascii chars
464    try {
465      CharBuffer cbuf = decoder.decode(ByteBuffer.wrap(buffer));
466      while (cbuf.hasRemaining()) {
467        char c = cbuf.get();
468        if (isCommonChar(c)) {
469          suspicous--;
470        }
471      }
472
473      // if that worked without a problem try to count suspicous characters which are rarely used in our texts
474      int length = buffer.length;
475      int i = 0;
476      while (i < length) {
477        byte b = buffer[i];
478        i++;
479        // range 7f-9f undefined, see http://de.wikipedia.org/wiki/ISO_8859-1
480        if (b >= (byte) 0x80 && b <= (byte) 0x9f) {
481          suspicous += UNDEFINED_PENALTY;
482        }
483      }
484    } catch (CharacterCodingException e) {
485      suspicous = Long.MAX_VALUE;
486    }
487
488    return suspicous;
489  }
490
491  private long testMacRoman() {
492    CharsetDecoder decoder = MACROMAN.newDecoder();
493
494    long suspicous = 0;
495
496    // first try to decode the whole lot
497    try {
498      CharBuffer cbuf = decoder.decode(ByteBuffer.wrap(buffer));
499      while (cbuf.hasRemaining()) {
500        char c = cbuf.get();
501        if (isCommonChar(c)) {
502          suspicous--;
503        }
504      }
505      // if that worked without a problem try to count suspicious characters which are rarely used in our texts
506      int length = buffer.length;
507      int i = 0;
508      while (i < length) {
509        byte b = buffer[i];
510        i++;
511        // all ranges defined I am afraid
512      }
513    } catch (CharacterCodingException e) {
514      suspicous = Long.MAX_VALUE;
515    }
516
517    return suspicous;
518  }
519
520  private long testWindows1252() {
521    CharsetDecoder decoder = WINDOWS1252.newDecoder();
522    long suspicous = 0;
523
524    // first try to decode the whole lot
525    try {
526      CharBuffer cbuf = decoder.decode(ByteBuffer.wrap(buffer));
527      while (cbuf.hasRemaining()) {
528        char c = cbuf.get();
529        if (isCommonChar(c)) {
530          suspicous--;
531        }
532      }
533      // if that worked without a problem try to count suspicous characters which are rarely used in our texts
534      // see http://de.wikipedia.org/wiki/ISO_8859-1
535      int length = buffer.length;
536      int i = 0;
537      while (i < length) {
538        byte b = buffer[i];
539        i++;
540        // 5 undefined chars
541        if (b == (byte) 0x81 || b == (byte) 0x8d || b == (byte) 0x8f || b == (byte) 0x90 || b == (byte) 0x9d) {
542          suspicous += UNDEFINED_PENALTY;
543        }
544      }
545    } catch (CharacterCodingException e) {
546      suspicous = Long.MAX_VALUE;
547    }
548
549    return suspicous;
550  }
551
552}