1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36 package groovy.util;
37
38 import java.io.*;
39 import java.nio.charset.Charset;
40 import java.util.*;
41
/**
 * <p>Utility class to guess the encoding of a given text file.</p>
 *
 * <p>Unicode files encoded in UTF-16 (little or big endian) or UTF-8 files
 * with a Byte Order Marker are correctly discovered. For UTF-8 files with no BOM, if the buffer
 * is wide enough, the charset should also be discovered.</p>
 *
 * <p>Only the first 4KB of the file are sampled, which is usually sufficient
 * to be able to guess the encoding.</p>
 *
 * <p>Usage:</p>
 * <pre>
 * // guess the encoding
 * CharsetToolkit toolkit = new CharsetToolkit(file);
 * Charset guessedCharset = toolkit.getCharset();
 *
 * // create a reader with the correct charset
 * BufferedReader reader = toolkit.getReader();
 *
 * // read the file content
 * String line;
 * while ((line = reader.readLine()) != null) {
 *     System.out.println(line);
 * }
 * </pre>
 *
 * @author Guillaume Laforge
 */
public class CharsetToolkit {
    /** Maximum number of bytes sampled from the beginning of the file. */
    private static final int BUFFER_SIZE = 4096;

    private byte[] buffer;
    private Charset defaultCharset;
    private Charset charset;
    private boolean enforce8Bit = true;
    private File file;

    /**
     * Constructor of the <code>CharsetToolkit</code> utility class.
     * Reads up to the first 4096 bytes of the file into an internal buffer;
     * the guess itself is only computed on the first call to {@link #getCharset()}.
     *
     * @param file of which we want to know the encoding.
     * @throws IOException if the file cannot be opened or read.
     */
    public CharsetToolkit(File file) throws IOException {
        this.file = file;
        this.defaultCharset = getDefaultSystemCharset();
        this.charset = null;

        byte[] sample = new byte[BUFFER_SIZE];
        int total = 0;
        InputStream input = new FileInputStream(file);
        try {
            // A single read() may legally return fewer bytes than are available,
            // so keep reading until the sample is full or the end of file is hit.
            int count;
            while (total < sample.length
                    && (count = input.read(sample, total, sample.length - total)) != -1) {
                total += count;
            }
        } finally {
            try {
                input.close();
            } catch (IOException ignored) {
                // Best effort: a failure to close must not mask the outcome of the read.
            }
        }

        if (total == sample.length) {
            this.buffer = sample;
        } else {
            // Trim the buffer so that buffer.length is the number of bytes actually read.
            byte[] trimmed = new byte[total];
            System.arraycopy(sample, 0, trimmed, 0, total);
            this.buffer = trimmed;
        }
    }

    /**
     * Defines the default <code>Charset</code> used in case the buffer represents
     * an 8-bit <code>Charset</code>.
     *
     * <p>The guessed charset is cached by {@link #getCharset()}, so this setter only
     * influences the result when it is called before the first guess.</p>
     *
     * @param defaultCharset the default <code>Charset</code> to be returned by <code>guessEncoding()</code>
     *                       if an 8-bit <code>Charset</code> is encountered; <code>null</code> restores
     *                       the default system charset.
     */
    public void setDefaultCharset(Charset defaultCharset) {
        if (defaultCharset != null) {
            this.defaultCharset = defaultCharset;
        } else {
            this.defaultCharset = getDefaultSystemCharset();
        }
    }

    /**
     * Returns the charset guessed for the file. The guess is computed on the first
     * call and cached for subsequent calls.
     *
     * @return the guessed <code>Charset</code>.
     */
    public Charset getCharset() {
        if (this.charset == null) {
            this.charset = guessEncoding();
        }
        return charset;
    }

    /**
     * If US-ASCII is recognized, enforce to return the default encoding, rather than US-ASCII.
     * It might be a file without any special character in the range 128-255, but that may be or become
     * a file encoded with the default <code>charset</code> rather than US-ASCII.
     *
     * <p>As the guess is cached by {@link #getCharset()}, this flag must be set before the first guess
     * to have an effect.</p>
     *
     * @param enforce a boolean specifying the use or not of US-ASCII.
     */
    public void setEnforce8Bit(boolean enforce) {
        this.enforce8Bit = enforce;
    }

    /**
     * Gets the enforce8Bit flag, in case we do not want to ever get a US-ASCII encoding.
     *
     * @return a boolean representing the flag of use of US-ASCII.
     */
    public boolean getEnforce8Bit() {
        return this.enforce8Bit;
    }

    /**
     * Retrieves the default Charset.
     *
     * @return the <code>Charset</code> returned when an 8-bit charset is encountered.
     */
    public Charset getDefaultCharset() {
        return defaultCharset;
    }

    /**
     * <p>Guess the encoding of the sampled buffer.</p>
     *
     * <p>If Byte Order Markers are encountered at the beginning of the buffer, we immediately
     * return the charset implied by this BOM.</p>
     *
     * <p>If there is no BOM, this method tries to discern whether the file is UTF-8 or not.
     * If it is not UTF-8, we assume the encoding is the default charset
     * (of course, it might be any 8-bit charset, but usually, an 8-bit charset is the default one).</p>
     *
     * <p>It is possible to discern UTF-8 thanks to the pattern of characters with a multi-byte sequence.</p>
     * <pre>
     * UCS-4 range (hex.)        UTF-8 octet sequence (binary)
     * 0000 0000-0000 007F       0xxxxxxx
     * 0000 0080-0000 07FF       110xxxxx 10xxxxxx
     * 0000 0800-0000 FFFF       1110xxxx 10xxxxxx 10xxxxxx
     * 0001 0000-001F FFFF       11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 0020 0000-03FF FFFF       111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 0400 0000-7FFF FFFF       1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     * </pre>
     * <p>With UTF-8, 0xFE and 0xFF never appear.</p>
     *
     * <p>The whole buffer is scanned, including its last bytes. A multi-byte sequence that is
     * cut off by the end of a completely filled buffer is tolerated, because the missing bytes
     * may simply lie beyond the sampled part of the file; in a shorter buffer (the whole file
     * was sampled) such a sequence is invalid.</p>
     *
     * @return the Charset recognized.
     */
    private Charset guessEncoding() {
        // A BOM settles the question immediately.
        if (hasUTF8Bom()) {
            return Charset.forName("UTF-8");
        }
        if (hasUTF16LEBom()) {
            return Charset.forName("UTF-16LE");
        }
        if (hasUTF16BEBom()) {
            return Charset.forName("UTF-16BE");
        }

        // Set as soon as a byte outside the 7-bit range is seen.
        boolean highOrderBit = false;
        // Cleared as soon as a byte pattern that cannot be UTF-8 is seen.
        boolean validU8Char = true;

        final int length = buffer.length;
        // A completely filled buffer may end in the middle of a multi-byte sequence.
        final boolean sampleMayBeCut = length >= BUFFER_SIZE;

        int i = 0;
        while (i < length) {
            byte lead = buffer[i];
            if (lead >= 0) {
                // Plain 7-bit character.
                i++;
                continue;
            }
            highOrderBit = true;

            int expected = continuationCount(lead);
            if (expected < 0) {
                // Stray continuation byte, 0xFE or 0xFF: not a legal lead byte.
                validU8Char = false;
                break;
            }

            // Check the continuation bytes that are actually present in the buffer.
            int available = Math.min(expected, length - i - 1);
            for (int k = 1; k <= available; k++) {
                if (!isContinuationChar(buffer[i + k])) {
                    validU8Char = false;
                    break;
                }
            }
            if (!validU8Char) {
                break;
            }
            if (available < expected) {
                // The sequence runs past the end of the buffer.
                if (!sampleMayBeCut) {
                    validU8Char = false;
                }
                break;
            }
            i += expected + 1;
        }

        if (!highOrderBit) {
            // Only 7-bit characters: US-ASCII, unless an 8-bit charset is enforced.
            if (this.enforce8Bit) {
                return this.defaultCharset;
            }
            return Charset.forName("US-ASCII");
        }

        if (validU8Char) {
            return Charset.forName("UTF-8");
        }
        // High-order bytes that do not form valid UTF-8: assume the default 8-bit charset.
        return this.defaultCharset;
    }

    /**
     * Number of continuation bytes announced by the lead byte of a multi-byte sequence.
     *
     * @param lead a byte with its high-order bit set.
     * @return 1 to 5 for a valid lead byte, or -1 if the byte cannot start a sequence.
     */
    private static int continuationCount(byte lead) {
        if (isTwoBytesSequence(lead)) {
            return 1;
        }
        if (isThreeBytesSequence(lead)) {
            return 2;
        }
        if (isFourBytesSequence(lead)) {
            return 3;
        }
        if (isFiveBytesSequence(lead)) {
            return 4;
        }
        if (isSixBytesSequence(lead)) {
            return 5;
        }
        return -1;
    }

    /**
     * If the byte has the form 10xxxxxx, then it's a continuation byte of a multiple byte character.
     *
     * @param b a byte.
     * @return true if it's a continuation char.
     */
    private static boolean isContinuationChar(byte b) {
        return -128 <= b && b <= -65;
    }

    /**
     * If the byte has the form 110xxxxx, then it's the first byte of a two-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a two-bytes sequence.
     */
    private static boolean isTwoBytesSequence(byte b) {
        return -64 <= b && b <= -33;
    }

    /**
     * If the byte has the form 1110xxxx, then it's the first byte of a three-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a three-bytes sequence.
     */
    private static boolean isThreeBytesSequence(byte b) {
        return -32 <= b && b <= -17;
    }

    /**
     * If the byte has the form 11110xxx, then it's the first byte of a four-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a four-bytes sequence.
     */
    private static boolean isFourBytesSequence(byte b) {
        return -16 <= b && b <= -9;
    }

    /**
     * If the byte has the form 111110xx, then it's the first byte of a five-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a five-bytes sequence.
     */
    private static boolean isFiveBytesSequence(byte b) {
        return -8 <= b && b <= -5;
    }

    /**
     * If the byte has the form 1111110x, then it's the first byte of a six-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a six-bytes sequence.
     */
    private static boolean isSixBytesSequence(byte b) {
        return -4 <= b && b <= -3;
    }

    /**
     * Retrieve the default charset of the system, as named by the
     * <code>file.encoding</code> system property. If that property is missing or
     * does not name a supported charset, the JVM default charset is returned instead.
     *
     * @return the default <code>Charset</code>.
     */
    public static Charset getDefaultSystemCharset() {
        String name = System.getProperty("file.encoding");
        if (name != null) {
            try {
                return Charset.forName(name);
            } catch (IllegalArgumentException ignored) {
                // Illegal or unsupported charset name: fall back to the JVM default below.
            }
        }
        return Charset.defaultCharset();
    }

    /**
     * Has a Byte Order Marker for UTF-8 (Used by Microsoft's Notepad and other editors).
     *
     * @return true if the buffer has a BOM for UTF8.
     */
    public boolean hasUTF8Bom() {
        // 0xEF 0xBB 0xBF as signed bytes.
        return buffer.length >= 3
                && buffer[0] == -17 && buffer[1] == -69 && buffer[2] == -65;
    }

    /**
     * Has a Byte Order Marker for UTF-16 Little Endian.
     *
     * @return true if the buffer has a BOM for UTF-16 Little Endian.
     */
    public boolean hasUTF16LEBom() {
        // 0xFF 0xFE as signed bytes.
        return buffer.length >= 2 && buffer[0] == -1 && buffer[1] == -2;
    }

    /**
     * Has a Byte Order Marker for UTF-16 Big Endian.
     *
     * @return true if the buffer has a BOM for UTF-16 Big Endian.
     */
    public boolean hasUTF16BEBom() {
        // 0xFE 0xFF as signed bytes.
        return buffer.length >= 2 && buffer[0] == -2 && buffer[1] == -1;
    }

    /**
     * Gets a <code>BufferedReader</code> (indeed a <code>LineNumberReader</code>) from the <code>File</code>
     * specified in the constructor of <code>CharsetToolkit</code> using the charset discovered by the
     * method <code>guessEncoding()</code>. If the sampled buffer starts with a BOM, the BOM is
     * skipped so that the reader starts at the first real character.
     *
     * @return a <code>BufferedReader</code>
     * @throws FileNotFoundException if the file is not found.
     */
    public BufferedReader getReader() throws FileNotFoundException {
        LineNumberReader reader =
                new LineNumberReader(new InputStreamReader(new FileInputStream(file), getCharset()));
        if (hasUTF8Bom() || hasUTF16LEBom() || hasUTF16BEBom()) {
            try {
                // The BOM decodes to one character (U+FEFF); consume it.
                reader.read();
            } catch (IOException ignored) {
                // Best effort: the signature only allows FileNotFoundException, and a
                // persistent I/O problem will resurface on the caller's first read.
            }
        }
        return reader;
    }

    /**
     * Retrieves all the available <code>Charset</code>s on the platform,
     * among which the default <code>charset</code>.
     *
     * @return an array of <code>Charset</code>s.
     */
    public static Charset[] getAvailableCharsets() {
        Collection<Charset> charsets = Charset.availableCharsets().values();
        return charsets.toArray(new Charset[charsets.size()]);
    }
}