001/* 002 * Licensed under the Apache License, Version 2.0 (the "License"); 003 * you may not use this file except in compliance with the License. 004 * You may obtain a copy of the License at 005 * 006 * http://www.apache.org/licenses/LICENSE-2.0 007 * 008 * Unless required by applicable law or agreed to in writing, software 009 * distributed under the License is distributed on an "AS IS" BASIS, 010 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 011 * See the License for the specific language governing permissions and 012 * limitations under the License. 013 */ 014package org.gbif.utils.file; 015 016import java.io.File; 017import java.io.IOException; 018import java.nio.ByteBuffer; 019import java.nio.CharBuffer; 020import java.nio.charset.CharacterCodingException; 021import java.nio.charset.Charset; 022import java.nio.charset.CharsetDecoder; 023import java.nio.charset.StandardCharsets; 024 025import org.slf4j.Logger; 026import org.slf4j.LoggerFactory; 027 028import static org.gbif.utils.file.FileUtils.readByteBuffer; 029 030/** 031 * <p> 032 * Utility class to guess the encoding of a given file or byte array. The guess is unfortunately not 100% sure. 033 * Especially for 8-bit charsets. It's not possible 034 * to know which 8-bit charset is used. Except through statistical analysis. 035 * </p> 036 * <p> 037 * On the other hand, unicode files encoded in UTF-16 (low or big endian) or UTF-8 files with a Byte Order Marker are 038 * easy to find. For UTF-8 files with no BOM, 039 * if the buffer is wide enough, it's easy to guess. 040 * </p> 041 * <p> 042 * To determine whether mostly-English text is UTF-8 or ISO-8859-1, a fairly large buffer may be necessary to find an 043 * instance of é, ° etc. 044 * </p> 045 * This class is a heavily modified version of the original written by Guillaume LAFORGE: 046 * com.glaforge.i18n.io.CharsetToolkit 047 * taken from 048 * http://glaforge.free.fr/wiki/index.php?wiki=GuessEncoding 049 * 050 * @author Guillaume LAFORGE 051 * @author Markus Döring 052 */ 053public class CharsetDetection { 054 055 private static final Logger LOG = LoggerFactory.getLogger(CharsetDetection.class); 056 // encodings to test and very unlikely chars in that encoding 057 private static final byte LF = 0x0a; 058 private static final byte CR = 0x0d; 059 private static final byte TAB = 0x09; 060 061 private static final int UNDEFINED_PENALTY = 100; 062 private static final char[] COMMON_NON_ASCII_CHARS; 063 064 static { 065 String commonChars = "äåáàæœčéèêëïñøöüßšž°±"; 066 CharBuffer cbuf = CharBuffer.allocate(commonChars.length() * 2); 067 for (char c : commonChars.toCharArray()) { 068 cbuf.append(c); 069 cbuf.append(Character.toUpperCase(c)); 070 } 071 COMMON_NON_ASCII_CHARS = cbuf.array(); 072 } 073 074 private static final Charset LATIN1 = StandardCharsets.ISO_8859_1; 075 private static final Charset WINDOWS1252; 076 private static final Charset MACROMAN; 077 078 static { 079 Charset cs = null; 080 try { 081 cs = Charset.forName("Cp1252"); 082 } catch (Exception e) { 083 LOG.warn("Windows 1252 encoding not supported on this Virtual Machine"); 084 } 085 WINDOWS1252 = cs; 086 087 cs = null; 088 try { 089 cs = Charset.forName("MacRoman"); 090 } catch (Exception e) { 091 LOG.warn("MacRoman encoding not supported on this Virtual Machine"); 092 } 093 MACROMAN = cs; 094 } 095 096 private final byte[] buffer; 097 098 /** 099 * Constructor of the <code>com.glaforge.i18n.io.CharsetToolkit</code> utility class. 100 * 101 * @param buffer the byte buffer of which we want to know the encoding. 102 */ 103 private CharsetDetection(byte[] buffer) { 104 this.buffer = buffer; 105 } 106 107 public static Charset detectEncoding(File file) throws IOException { 108 byte[] data = readByteBuffer(file).array(); 109 110 CharsetDetection detector = new CharsetDetection(data); 111 Charset charset = detector.detectEncoding(); 112 113 LOG.debug("Detected character encoding " + charset.displayName()); 114 return charset; 115 } 116 117 /** 118 * @param bufferLength number of bytes to read in for the detection. Needs to be long enough to encounter non-ASCII 119 * characters, which could be unusual in English text. 120 */ 121 public static Charset detectEncoding(File file, int bufferLength) throws IOException { 122 byte[] data = readByteBuffer(file, bufferLength).array(); 123 124 CharsetDetection detector = new CharsetDetection(data); 125 Charset charset = detector.detectEncoding(); 126 127 LOG.debug("Detected character encoding " + charset.displayName()); 128 return charset; 129 } 130 131 /** 132 * Retrieve the default charset of the system. 133 * 134 * @return the default <code>Charset</code>. 135 */ 136 public static Charset getDefaultSystemCharset() { 137 return Charset.forName(System.getProperty("file.encoding")); 138 } 139 140 /** 141 * Has a Byte Order Marker for UTF-16 Big Endian 142 * (utf-16 and ucs-2). 143 * 144 * @param bom a buffer. 145 * 146 * @return true if the buffer has a BOM for UTF-16 Big Endian. 147 */ 148 protected static boolean hasUTF16BEBom(byte[] bom) { 149 return bom[0] == -2 && bom[1] == -1; 150 } 151 152 /** 153 * Has a Byte Order Marker for UTF-16 Low Endian 154 * (ucs-2le, ucs-4le, and ucs-16le). 155 * 156 * @param bom a buffer. 157 * 158 * @return true if the buffer has a BOM for UTF-16 Low Endian. 159 */ 160 protected static boolean hasUTF16LEBom(byte[] bom) { 161 return bom[0] == -1 && bom[1] == -2; 162 } 163 164 /** 165 * Has a Byte Order Marker for UTF-8 (Used by Microsoft's Notepad and other editors). 166 * 167 * @param bom a buffer. 168 * 169 * @return true if the buffer has a BOM for UTF8. 170 */ 171 protected static boolean hasUTF8Bom(byte[] bom) { 172 return bom[0] == -17 && bom[1] == -69 && bom[2] == -65; 173 } 174 175 private static boolean isCommonChar(char c) { 176 for (char cc : COMMON_NON_ASCII_CHARS) { 177 if (c == cc) { 178 return true; 179 } 180 } 181 return false; 182 } 183 184 /** 185 * If the byte has the form 10xxxxx, then it's a continuation byte of a multiple byte character; 186 * 187 * @param b a byte. 188 * 189 * @return true if it's a continuation char. 190 */ 191 private static boolean isContinuationChar(byte b) { 192 return -128 <= b && b <= -65; 193 } 194 195 /** 196 * If the byte has the form 11110xx, then it's the first byte of a five-bytes sequence character. 197 * 198 * @param b a byte. 199 * 200 * @return true if it's the first byte of a five-bytes sequence. 201 */ 202 private static boolean isFiveBytesSequence(byte b) { 203 return -8 <= b && b <= -5; 204 } 205 206 /** 207 * If the byte has the form 11110xx, then it's the first byte of a four-bytes sequence character. 208 * 209 * @param b a byte. 210 * 211 * @return true if it's the first byte of a four-bytes sequence. 212 */ 213 private static boolean isFourBytesSequence(byte b) { 214 return -16 <= b && b <= -9; 215 } 216 217 /** 218 * If the byte has the form 1110xxx, then it's the first byte of a six-bytes sequence character. 219 * 220 * @param b a byte. 221 * 222 * @return true if it's the first byte of a six-bytes sequence. 223 */ 224 private static boolean isSixBytesSequence(byte b) { 225 return -4 <= b && b <= -3; 226 } 227 228 /** 229 * If the byte has the form 1110xxx, then it's the first byte of a three-bytes sequence character. 230 * 231 * @param b a byte. 232 * 233 * @return true if it's the first byte of a three-bytes sequence. 234 */ 235 private static boolean isThreeBytesSequence(byte b) { 236 return -32 <= b && b <= -17; 237 } 238 239 /** 240 * If the byte has the form 110xxxx, then it's the first byte of a two-bytes sequence character. 241 * 242 * @param b a byte. 243 * 244 * @return true if it's the first byte of a two-bytes sequence. 245 */ 246 private static boolean isTwoBytesSequence(byte b) { 247 return -64 <= b && b <= -33; 248 } 249 250 private Charset detectCharacterEncoding8bit() { 251 252 // the number of "bad" chars for the best guess. A better guess will have 253 long leastSuspicousChars = testLatin1(); 254 long suspicousChars; 255 256 // the best guess so far 257 Charset bestEncoding = LATIN1; 258 259 if (WINDOWS1252 != null) { 260 suspicousChars = testWindows1252(); 261 if (suspicousChars < leastSuspicousChars) { 262 leastSuspicousChars = suspicousChars; 263 bestEncoding = WINDOWS1252; 264 } 265 } 266 267 if (MACROMAN != null) { 268 suspicousChars = testMacRoman(); 269 if (suspicousChars < leastSuspicousChars) { 270 leastSuspicousChars = suspicousChars; 271 bestEncoding = MACROMAN; 272 } 273 } 274 275 LOG.debug( 276 "8bit Encoding guessed: {} with {} rare characters", bestEncoding, leastSuspicousChars); 277 return bestEncoding; 278 } 279 280 /** 281 * <p> 282 * Guess the encoding of the provided buffer. 283 * </p> 284 * If Byte Order Markers are encountered at the beginning of the buffer, we immediately 285 * return the charset implied by this BOM. Otherwise, the file would not be a human 286 * readable text file. 287 * <p/> 288 * <p> 289 * If there is no BOM, this method tries to discern whether the file is UTF-8 or not. If it is not UTF-8, we assume 290 * the encoding is the default system encoding (of course, it might be any 8-bit charset, but usually, an 8-bit 291 * charset is the default one). 292 * </p> 293 * <p/> 294 * <p> 295 * It is possible to discern UTF-8 thanks to the pattern of characters with a multi-byte sequence. 296 * </p> 297 * <p/> 298 * <pre> 299 * UCS-4 range (hex.) UTF-8 octet sequence (binary) 300 * 0000 0000-0000 007F 0xxxxxxx 301 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx 302 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx 303 * 0001 0000-001F FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 304 * 0020 0000-03FF FFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 305 * 0400 0000-7FFF FFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 306 * </pre> 307 * <p> 308 * With UTF-8, 0xFE and 0xFF never appear. 309 * </p> 310 * 311 * @return the Charset recognized or the system default. 312 */ 313 public Charset detectEncoding() { 314 // if the file has a Byte Order Marker, we can assume the file is in UTF-xx 315 // otherwise, the file would not be human readable 316 if (hasUTF8Bom(buffer)) { 317 return StandardCharsets.UTF_8; 318 } 319 if (hasUTF16LEBom(buffer)) { 320 return StandardCharsets.UTF_16LE; 321 } 322 if (hasUTF16BEBom(buffer)) { 323 return StandardCharsets.UTF_16BE; 324 } 325 326 // if it's not UTF-8 or a BOM present check for UTF16 zeros 327 Charset cs = detectUtf16(); 328 if (cs != null) { 329 return cs; 330 } 331 332 // if the file is in UTF-8, high order bytes must have a certain value, in order to be valid 333 // if it's not the case, we can assume the encoding is some 8 bit one 334 boolean validU8Char = true; 335 336 // TODO the buffer is not read up to the end, but up to length - 6 337 338 int length = buffer.length; 339 int i = 0; 340 while (i < length - 6) { 341 byte b0 = buffer[i]; 342 byte b1 = buffer[i + 1]; 343 byte b2 = buffer[i + 2]; 344 byte b3 = buffer[i + 3]; 345 byte b4 = buffer[i + 4]; 346 byte b5 = buffer[i + 5]; 347 if (b0 < 0) { 348 // a high order bit was encountered, thus the encoding is not US-ASCII 349 // a two-byte sequence was encountered 350 if (isTwoBytesSequence(b0)) { 351 // there must be one continuation byte of the form 10xxxxxx, 352 // otherwise the following character is is not a valid UTF-8 construct 353 if (isContinuationChar(b1)) { 354 i++; 355 } else { 356 validU8Char = false; 357 } 358 } 359 // a three-byte sequence was encountered 360 else if (isThreeBytesSequence(b0)) { 361 // there must be two continuation bytes of the form 10xxxxxx, 362 // otherwise the following character is is not a valid UTF-8 construct 363 if (isContinuationChar(b1) && isContinuationChar(b2)) { 364 i += 2; 365 } else { 366 validU8Char = false; 367 } 368 } 369 // a four-byte sequence was encountered 370 else if (isFourBytesSequence(b0)) { 371 // there must be three continuation bytes of the form 10xxxxxx, 372 // otherwise the following character is is not a valid UTF-8 construct 373 if (isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3)) { 374 i += 3; 375 } else { 376 validU8Char = false; 377 } 378 } 379 // a five-byte sequence was encountered 380 else if (isFiveBytesSequence(b0)) { 381 // there must be four continuation bytes of the form 10xxxxxx, 382 // otherwise the following character is is not a valid UTF-8 construct 383 if (isContinuationChar(b1) 384 && isContinuationChar(b2) 385 && isContinuationChar(b3) 386 && isContinuationChar(b4)) { 387 i += 4; 388 } else { 389 validU8Char = false; 390 } 391 } 392 // a six-byte sequence was encountered 393 else if (isSixBytesSequence(b0)) { 394 // there must be five continuation bytes of the form 10xxxxxx, 395 // otherwise the following character is is not a valid UTF-8 construct 396 if (isContinuationChar(b1) 397 && isContinuationChar(b2) 398 && isContinuationChar(b3) 399 && isContinuationChar(b4) 400 && isContinuationChar(b5)) { 401 i += 5; 402 } else { 403 validU8Char = false; 404 } 405 } else { 406 validU8Char = false; 407 } 408 } 409 if (!validU8Char) { 410 break; 411 } 412 i++; 413 } 414 415 // if no invalid UTF-8 were encountered, we can assume the encoding is UTF-8, 416 // otherwise the file would not be human readable 417 if (validU8Char) { 418 return StandardCharsets.UTF_8; 419 } 420 421 // finally it must be some 8bit encoding we try to detect statistically 422 return detectCharacterEncoding8bit(); 423 } 424 425 private Charset detectUtf16() { 426 427 // first try to see if we got a little or big endian, i.e. lots of zeros as the first byte or 428 // second byte if we deal 429 // with latin characters at least 430 int zerosLE = 0; 431 int zerosBE = 0; 432 boolean even = true; 433 434 int length = buffer.length; 435 int i = 0; 436 while (i < length) { 437 byte b = buffer[i]; 438 i++; 439 even = !even; 440 if (b == 0x00) { 441 // zero occur a lot in utf16 with latin characters 442 if (even) { 443 zerosLE++; 444 } else { 445 zerosBE++; 446 } 447 } 448 } 449 450 // a UTF16 encoding with many latin characters would have either lots of even or uneven bytes as 451 // zero - but not both 452 int min = buffer.length / 10; 453 if ((zerosBE > min || zerosLE > min) && Math.abs(zerosBE - zerosLE) > min) { 454 Charset charset = zerosBE > zerosLE ? StandardCharsets.UTF_16BE : StandardCharsets.UTF_16LE; 455 456 // now try to decode the whole lot just to make sure 457 try { 458 CharsetDecoder decoder = charset.newDecoder(); 459 decoder.decode(ByteBuffer.wrap(buffer)); 460 // that worked without a problem - think we got it! 461 return charset; 462 } catch (CharacterCodingException e) { 463 // finally try with the plain UTF16 encoding 464 charset = StandardCharsets.UTF_16; 465 try { 466 CharsetDecoder decoder = charset.newDecoder(); 467 decoder.decode(ByteBuffer.wrap(buffer)); 468 // that worked without a problem - think we got it! 469 return charset; 470 } catch (CharacterCodingException e2) { 471 } 472 } 473 } 474 475 return null; 476 } 477 478 private long testLatin1() { 479 Charset charset = StandardCharsets.ISO_8859_1; 480 CharsetDecoder decoder = charset.newDecoder(); 481 482 long suspicious = 0; 483 // count the following 484 485 // first try to decode the whole lot and count common non ascii chars 486 try { 487 CharBuffer cbuf = decoder.decode(ByteBuffer.wrap(buffer)); 488 while (cbuf.hasRemaining()) { 489 char c = cbuf.get(); 490 if (isCommonChar(c)) { 491 suspicious--; 492 } 493 } 494 495 // if that worked without a problem try to count suspicious characters which are rarely used 496 // in our texts 497 int length = buffer.length; 498 int i = 0; 499 while (i < length) { 500 byte b = buffer[i]; 501 i++; 502 // range 7f-9f undefined, see http://de.wikipedia.org/wiki/ISO_8859-1 503 if (b >= (byte) 0x80 && b <= (byte) 0x9f) { 504 suspicious += UNDEFINED_PENALTY; 505 } 506 } 507 } catch (CharacterCodingException e) { 508 suspicious = Long.MAX_VALUE; 509 } 510 511 return suspicious; 512 } 513 514 private long testMacRoman() { 515 CharsetDecoder decoder = MACROMAN.newDecoder(); 516 517 long suspicious = 0; 518 519 // first try to decode the whole lot 520 try { 521 CharBuffer cbuf = decoder.decode(ByteBuffer.wrap(buffer)); 522 while (cbuf.hasRemaining()) { 523 char c = cbuf.get(); 524 if (isCommonChar(c)) { 525 suspicious--; 526 } 527 } 528 // if that worked without a problem try to count suspicious characters which are rarely used 529 // in our texts 530 int length = buffer.length; 531 int i = 0; 532 while (i < length) { 533 byte b = buffer[i]; 534 i++; 535 // all ranges defined I am afraid 536 } 537 } catch (CharacterCodingException e) { 538 suspicious = Long.MAX_VALUE; 539 } 540 541 return suspicious; 542 } 543 544 private long testWindows1252() { 545 CharsetDecoder decoder = WINDOWS1252.newDecoder(); 546 long suspicious = 0; 547 548 // first try to decode the whole lot 549 try { 550 CharBuffer cbuf = decoder.decode(ByteBuffer.wrap(buffer)); 551 while (cbuf.hasRemaining()) { 552 char c = cbuf.get(); 553 if (isCommonChar(c)) { 554 suspicious--; 555 } 556 } 557 // if that worked without a problem try to count suspicous characters which are rarely used in 558 // our texts 559 // see http://de.wikipedia.org/wiki/ISO_8859-1 560 int length = buffer.length; 561 int i = 0; 562 while (i < length) { 563 byte b = buffer[i]; 564 i++; 565 // 5 undefined chars 566 if (b == (byte) 0x81 567 || b == (byte) 0x8d 568 || b == (byte) 0x8f 569 || b == (byte) 0x90 570 || b == (byte) 0x9d) { 571 suspicious += UNDEFINED_PENALTY; 572 } 573 } 574 } catch (CharacterCodingException e) { 575 suspicious = Long.MAX_VALUE; 576 } 577 578 return suspicious; 579 } 580}