001/* 002 * Copyright 2021 Global Biodiversity Information Facility (GBIF) 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016package org.gbif.utils.file; 017 018import java.io.File; 019import java.io.IOException; 020import java.nio.ByteBuffer; 021import java.nio.CharBuffer; 022import java.nio.charset.CharacterCodingException; 023import java.nio.charset.Charset; 024import java.nio.charset.CharsetDecoder; 025import java.nio.charset.StandardCharsets; 026 027import org.slf4j.Logger; 028import org.slf4j.LoggerFactory; 029 030import static org.gbif.utils.file.FileUtils.readByteBuffer; 031 032/** 033 * <p> 034 * Utility class to guess the encoding of a given file or byte array. The guess is unfortunately not 100% sure. 035 * Especially for 8-bit charsets. It's not possible 036 * to know which 8-bit charset is used. Except through statistical analysis. 037 * </p> 038 * <p> 039 * On the other hand, unicode files encoded in UTF-16 (low or big endian) or UTF-8 files with a Byte Order Marker are 040 * easy to find. For UTF-8 files with no BOM, 041 * if the buffer is wide enough, it's easy to guess. 042 * </p> 043 * <p> 044 * To determine whether mostly-English text is UTF-8 or ISO-8859-1, a fairly large buffer may be necessary to find an 045 * instance of é, ° etc. 046 * </p> 047 * This class is a heavily modified version of the original written by Guillaume LAFORGE: 048 * com.glaforge.i18n.io.CharsetToolkit 049 * taken from 050 * http://glaforge.free.fr/wiki/index.php?wiki=GuessEncoding 051 * 052 * @author Guillaume LAFORGE 053 * @author Markus Döring 054 */ 055public class CharsetDetection { 056 057 private static final Logger LOG = LoggerFactory.getLogger(CharsetDetection.class); 058 // encodings to test and very unlikely chars in that encoding 059 private static final byte LF = 0x0a; 060 private static final byte CR = 0x0d; 061 private static final byte TAB = 0x09; 062 063 private static final int UNDEFINED_PENALTY = 100; 064 private static final char[] COMMON_NON_ASCII_CHARS; 065 066 static { 067 String commonChars = "äåáàæœčéèêëïñøöüßšž°±"; 068 CharBuffer cbuf = CharBuffer.allocate(commonChars.length() * 2); 069 for (char c : commonChars.toCharArray()) { 070 cbuf.append(c); 071 cbuf.append(Character.toUpperCase(c)); 072 } 073 COMMON_NON_ASCII_CHARS = cbuf.array(); 074 } 075 076 private static final Charset LATIN1 = StandardCharsets.ISO_8859_1; 077 private static final Charset WINDOWS1252; 078 private static final Charset MACROMAN; 079 080 static { 081 Charset cs = null; 082 try { 083 cs = Charset.forName("Cp1252"); 084 } catch (Exception e) { 085 LOG.warn("Windows 1252 encoding not supported on this Virtual Machine"); 086 } 087 WINDOWS1252 = cs; 088 089 cs = null; 090 try { 091 cs = Charset.forName("MacRoman"); 092 } catch (Exception e) { 093 LOG.warn("MacRoman encoding not supported on this Virtual Machine"); 094 } 095 MACROMAN = cs; 096 } 097 098 private final byte[] buffer; 099 100 /** 101 * Constructor of the <code>com.glaforge.i18n.io.CharsetToolkit</code> utility class. 102 * 103 * @param buffer the byte buffer of which we want to know the encoding. 104 */ 105 private CharsetDetection(byte[] buffer) { 106 this.buffer = buffer; 107 } 108 109 public static Charset detectEncoding(File file) throws IOException { 110 byte[] data = readByteBuffer(file).array(); 111 112 CharsetDetection detector = new CharsetDetection(data); 113 Charset charset = detector.detectEncoding(); 114 115 LOG.debug("Detected character encoding " + charset.displayName()); 116 return charset; 117 } 118 119 /** 120 * @param bufferLength number of bytes to read in for the detection. Needs to be long enough to encounter non-ASCII 121 * characters, which could be unusual in English text. 122 */ 123 public static Charset detectEncoding(File file, int bufferLength) throws IOException { 124 byte[] data = readByteBuffer(file, bufferLength).array(); 125 126 CharsetDetection detector = new CharsetDetection(data); 127 Charset charset = detector.detectEncoding(); 128 129 LOG.debug("Detected character encoding " + charset.displayName()); 130 return charset; 131 } 132 133 /** 134 * Retrieve the default charset of the system. 135 * 136 * @return the default <code>Charset</code>. 137 */ 138 public static Charset getDefaultSystemCharset() { 139 return Charset.forName(System.getProperty("file.encoding")); 140 } 141 142 /** 143 * Has a Byte Order Marker for UTF-16 Big Endian 144 * (utf-16 and ucs-2). 145 * 146 * @param bom a buffer. 147 * 148 * @return true if the buffer has a BOM for UTF-16 Big Endian. 149 */ 150 protected static boolean hasUTF16BEBom(byte[] bom) { 151 return bom[0] == -2 && bom[1] == -1; 152 } 153 154 /** 155 * Has a Byte Order Marker for UTF-16 Low Endian 156 * (ucs-2le, ucs-4le, and ucs-16le). 157 * 158 * @param bom a buffer. 159 * 160 * @return true if the buffer has a BOM for UTF-16 Low Endian. 161 */ 162 protected static boolean hasUTF16LEBom(byte[] bom) { 163 return bom[0] == -1 && bom[1] == -2; 164 } 165 166 /** 167 * Has a Byte Order Marker for UTF-8 (Used by Microsoft's Notepad and other editors). 168 * 169 * @param bom a buffer. 170 * 171 * @return true if the buffer has a BOM for UTF8. 172 */ 173 protected static boolean hasUTF8Bom(byte[] bom) { 174 return bom[0] == -17 && bom[1] == -69 && bom[2] == -65; 175 } 176 177 private static boolean isCommonChar(char c) { 178 for (char cc : COMMON_NON_ASCII_CHARS) { 179 if (c == cc) { 180 return true; 181 } 182 } 183 return false; 184 } 185 186 /** 187 * If the byte has the form 10xxxxx, then it's a continuation byte of a multiple byte character; 188 * 189 * @param b a byte. 190 * 191 * @return true if it's a continuation char. 192 */ 193 private static boolean isContinuationChar(byte b) { 194 return -128 <= b && b <= -65; 195 } 196 197 /** 198 * If the byte has the form 11110xx, then it's the first byte of a five-bytes sequence character. 199 * 200 * @param b a byte. 201 * 202 * @return true if it's the first byte of a five-bytes sequence. 203 */ 204 private static boolean isFiveBytesSequence(byte b) { 205 return -8 <= b && b <= -5; 206 } 207 208 /** 209 * If the byte has the form 11110xx, then it's the first byte of a four-bytes sequence character. 210 * 211 * @param b a byte. 212 * 213 * @return true if it's the first byte of a four-bytes sequence. 214 */ 215 private static boolean isFourBytesSequence(byte b) { 216 return -16 <= b && b <= -9; 217 } 218 219 /** 220 * If the byte has the form 1110xxx, then it's the first byte of a six-bytes sequence character. 221 * 222 * @param b a byte. 223 * 224 * @return true if it's the first byte of a six-bytes sequence. 225 */ 226 private static boolean isSixBytesSequence(byte b) { 227 return -4 <= b && b <= -3; 228 } 229 230 /** 231 * If the byte has the form 1110xxx, then it's the first byte of a three-bytes sequence character. 232 * 233 * @param b a byte. 234 * 235 * @return true if it's the first byte of a three-bytes sequence. 236 */ 237 private static boolean isThreeBytesSequence(byte b) { 238 return -32 <= b && b <= -17; 239 } 240 241 /** 242 * If the byte has the form 110xxxx, then it's the first byte of a two-bytes sequence character. 243 * 244 * @param b a byte. 245 * 246 * @return true if it's the first byte of a two-bytes sequence. 247 */ 248 private static boolean isTwoBytesSequence(byte b) { 249 return -64 <= b && b <= -33; 250 } 251 252 private Charset detectCharacterEncoding8bit() { 253 254 // the number of "bad" chars for the best guess. A better guess will have 255 long leastSuspicousChars = testLatin1(); 256 long suspicousChars; 257 258 // the best guess so far 259 Charset bestEncoding = LATIN1; 260 261 if (WINDOWS1252 != null) { 262 suspicousChars = testWindows1252(); 263 if (suspicousChars < leastSuspicousChars) { 264 leastSuspicousChars = suspicousChars; 265 bestEncoding = WINDOWS1252; 266 } 267 } 268 269 if (MACROMAN != null) { 270 suspicousChars = testMacRoman(); 271 if (suspicousChars < leastSuspicousChars) { 272 leastSuspicousChars = suspicousChars; 273 bestEncoding = MACROMAN; 274 } 275 } 276 277 LOG.debug("8bit Encoding guessed: {} with {} rare characters", bestEncoding, leastSuspicousChars); 278 return bestEncoding; 279 } 280 281 /** 282 * <p> 283 * Guess the encoding of the provided buffer. 284 * </p> 285 * If Byte Order Markers are encountered at the beginning of the buffer, we immediately 286 * return the charset implied by this BOM. Otherwise, the file would not be a human 287 * readable text file. 288 * <p/> 289 * <p> 290 * If there is no BOM, this method tries to discern whether the file is UTF-8 or not. If it is not UTF-8, we assume 291 * the encoding is the default system encoding (of course, it might be any 8-bit charset, but usually, an 8-bit 292 * charset is the default one). 293 * </p> 294 * <p/> 295 * <p> 296 * It is possible to discern UTF-8 thanks to the pattern of characters with a multi-byte sequence. 297 * </p> 298 * <p/> 299 * <pre> 300 * UCS-4 range (hex.) UTF-8 octet sequence (binary) 301 * 0000 0000-0000 007F 0xxxxxxx 302 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx 303 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx 304 * 0001 0000-001F FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 305 * 0020 0000-03FF FFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 306 * 0400 0000-7FFF FFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 307 * </pre> 308 * <p> 309 * With UTF-8, 0xFE and 0xFF never appear. 310 * </p> 311 * 312 * @return the Charset recognized or the system default. 313 */ 314 public Charset detectEncoding() { 315 // if the file has a Byte Order Marker, we can assume the file is in UTF-xx 316 // otherwise, the file would not be human readable 317 if (hasUTF8Bom(buffer)) { 318 return StandardCharsets.UTF_8; 319 } 320 if (hasUTF16LEBom(buffer)) { 321 return StandardCharsets.UTF_16LE; 322 } 323 if (hasUTF16BEBom(buffer)) { 324 return StandardCharsets.UTF_16BE; 325 } 326 327 // if it's not UTF-8 or a BOM present check for UTF16 zeros 328 Charset cs = detectUtf16(); 329 if (cs != null) { 330 return cs; 331 } 332 333 // if the file is in UTF-8, high order bytes must have a certain value, in order to be valid 334 // if it's not the case, we can assume the encoding is some 8 bit one 335 boolean validU8Char = true; 336 337 // TODO the buffer is not read up to the end, but up to length - 6 338 339 int length = buffer.length; 340 int i = 0; 341 while (i < length - 6) { 342 byte b0 = buffer[i]; 343 byte b1 = buffer[i + 1]; 344 byte b2 = buffer[i + 2]; 345 byte b3 = buffer[i + 3]; 346 byte b4 = buffer[i + 4]; 347 byte b5 = buffer[i + 5]; 348 if (b0 < 0) { 349 // a high order bit was encountered, thus the encoding is not US-ASCII 350 // a two-byte sequence was encountered 351 if (isTwoBytesSequence(b0)) { 352 // there must be one continuation byte of the form 10xxxxxx, 353 // otherwise the following character is is not a valid UTF-8 construct 354 if (isContinuationChar(b1)) { 355 i++; 356 } else { 357 validU8Char = false; 358 } 359 } 360 // a three-byte sequence was encountered 361 else if (isThreeBytesSequence(b0)) { 362 // there must be two continuation bytes of the form 10xxxxxx, 363 // otherwise the following character is is not a valid UTF-8 construct 364 if (isContinuationChar(b1) && isContinuationChar(b2)) { 365 i += 2; 366 } else { 367 validU8Char = false; 368 } 369 } 370 // a four-byte sequence was encountered 371 else if (isFourBytesSequence(b0)) { 372 // there must be three continuation bytes of the form 10xxxxxx, 373 // otherwise the following character is is not a valid UTF-8 construct 374 if (isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3)) { 375 i += 3; 376 } else { 377 validU8Char = false; 378 } 379 } 380 // a five-byte sequence was encountered 381 else if (isFiveBytesSequence(b0)) { 382 // there must be four continuation bytes of the form 10xxxxxx, 383 // otherwise the following character is is not a valid UTF-8 construct 384 if (isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3) && isContinuationChar(b4)) { 385 i += 4; 386 } else { 387 validU8Char = false; 388 } 389 } 390 // a six-byte sequence was encountered 391 else if (isSixBytesSequence(b0)) { 392 // there must be five continuation bytes of the form 10xxxxxx, 393 // otherwise the following character is is not a valid UTF-8 construct 394 if (isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3) && isContinuationChar(b4) 395 && isContinuationChar(b5)) { 396 i += 5; 397 } else { 398 validU8Char = false; 399 } 400 } else { 401 validU8Char = false; 402 } 403 } 404 if (!validU8Char) { 405 break; 406 } 407 i++; 408 } 409 410 // if no invalid UTF-8 were encountered, we can assume the encoding is UTF-8, 411 // otherwise the file would not be human readable 412 if (validU8Char) { 413 return StandardCharsets.UTF_8; 414 } 415 416 // finally it must be some 8bit encoding we try to detect statistically 417 return detectCharacterEncoding8bit(); 418 } 419 420 private Charset detectUtf16() { 421 422 // first try to see if we got a little or big endian, i.e. lots of zeros as the first byte or second byte if we deal 423 // with latin characters at least 424 int zerosLE = 0; 425 int zerosBE = 0; 426 boolean even = true; 427 428 int length = buffer.length; 429 int i = 0; 430 while (i < length) { 431 byte b = buffer[i]; 432 i++; 433 even = !even; 434 if (b == 0x00) { 435 // zero occur a lot in utf16 with latin characters 436 if (even) { 437 zerosLE++; 438 } else { 439 zerosBE++; 440 } 441 } 442 } 443 444 // a UTF16 encoding with many latin characters would have either lots of even or uneven bytes as zero - but not both 445 int min = buffer.length / 10; 446 if ((zerosBE > min || zerosLE > min) && Math.abs(zerosBE - zerosLE) > min) { 447 Charset charset = zerosBE > zerosLE ? StandardCharsets.UTF_16BE : StandardCharsets.UTF_16LE; 448 449 // now try to decode the whole lot just to make sure 450 try { 451 CharsetDecoder decoder = charset.newDecoder(); 452 decoder.decode(ByteBuffer.wrap(buffer)); 453 // that worked without a problem - think we got it! 454 return charset; 455 } catch (CharacterCodingException e) { 456 // finally try with the plain UTF16 encoding 457 charset = StandardCharsets.UTF_16; 458 try { 459 CharsetDecoder decoder = charset.newDecoder(); 460 decoder.decode(ByteBuffer.wrap(buffer)); 461 // that worked without a problem - think we got it! 462 return charset; 463 } catch (CharacterCodingException e2) { 464 } 465 } 466 } 467 468 return null; 469 } 470 471 private long testLatin1() { 472 Charset charset = StandardCharsets.ISO_8859_1; 473 CharsetDecoder decoder = charset.newDecoder(); 474 475 long suspicious = 0; 476 // count the following 477 478 // first try to decode the whole lot and count common non ascii chars 479 try { 480 CharBuffer cbuf = decoder.decode(ByteBuffer.wrap(buffer)); 481 while (cbuf.hasRemaining()) { 482 char c = cbuf.get(); 483 if (isCommonChar(c)) { 484 suspicious--; 485 } 486 } 487 488 // if that worked without a problem try to count suspicious characters which are rarely used in our texts 489 int length = buffer.length; 490 int i = 0; 491 while (i < length) { 492 byte b = buffer[i]; 493 i++; 494 // range 7f-9f undefined, see http://de.wikipedia.org/wiki/ISO_8859-1 495 if (b >= (byte) 0x80 && b <= (byte) 0x9f) { 496 suspicious += UNDEFINED_PENALTY; 497 } 498 } 499 } catch (CharacterCodingException e) { 500 suspicious = Long.MAX_VALUE; 501 } 502 503 return suspicious; 504 } 505 506 private long testMacRoman() { 507 CharsetDecoder decoder = MACROMAN.newDecoder(); 508 509 long suspicious = 0; 510 511 // first try to decode the whole lot 512 try { 513 CharBuffer cbuf = decoder.decode(ByteBuffer.wrap(buffer)); 514 while (cbuf.hasRemaining()) { 515 char c = cbuf.get(); 516 if (isCommonChar(c)) { 517 suspicious--; 518 } 519 } 520 // if that worked without a problem try to count suspicious characters which are rarely used in our texts 521 int length = buffer.length; 522 int i = 0; 523 while (i < length) { 524 byte b = buffer[i]; 525 i++; 526 // all ranges defined I am afraid 527 } 528 } catch (CharacterCodingException e) { 529 suspicious = Long.MAX_VALUE; 530 } 531 532 return suspicious; 533 } 534 535 private long testWindows1252() { 536 CharsetDecoder decoder = WINDOWS1252.newDecoder(); 537 long suspicious = 0; 538 539 // first try to decode the whole lot 540 try { 541 CharBuffer cbuf = decoder.decode(ByteBuffer.wrap(buffer)); 542 while (cbuf.hasRemaining()) { 543 char c = cbuf.get(); 544 if (isCommonChar(c)) { 545 suspicious--; 546 } 547 } 548 // if that worked without a problem try to count suspicous characters which are rarely used in our texts 549 // see http://de.wikipedia.org/wiki/ISO_8859-1 550 int length = buffer.length; 551 int i = 0; 552 while (i < length) { 553 byte b = buffer[i]; 554 i++; 555 // 5 undefined chars 556 if (b == (byte) 0x81 || b == (byte) 0x8d || b == (byte) 0x8f || b == (byte) 0x90 || b == (byte) 0x9d) { 557 suspicious += UNDEFINED_PENALTY; 558 } 559 } 560 } catch (CharacterCodingException e) { 561 suspicious = Long.MAX_VALUE; 562 } 563 564 return suspicious; 565 } 566 567}