View Javadoc

1   /*
2    * $Id: CharsetToolkit.java 4112 2006-10-13 13:21:25Z blackdrag $
3    *
4    * Copyright 2003 (C) Guillaume Laforge. All Rights Reserved.
5    *
6    * Redistribution and use of this software and associated documentation
7    * ("Software"), with or without modification, are permitted provided that the
8    * following conditions are met:
9    *  1. Redistributions of source code must retain copyright statements and
10   * notices. Redistributions must also contain a copy of this document.
11   *  2. Redistributions in binary form must reproduce the above copyright
12   * notice, this list of conditions and the following disclaimer in the
13   * documentation and/or other materials provided with the distribution.
14   *  3. The name "groovy" must not be used to endorse or promote products
15   * derived from this Software without prior written permission of The Codehaus.
16   * For written permission, please contact info@codehaus.org.
17   *  4. Products derived from this Software may not be called "groovy" nor may
18   * "groovy" appear in their names without prior written permission of The
19   * Codehaus. "groovy" is a registered trademark of The Codehaus.
20   *  5. Due credit should be given to The Codehaus - http://groovy.codehaus.org/
21   *
22   * THIS SOFTWARE IS PROVIDED BY THE CODEHAUS AND CONTRIBUTORS ``AS IS'' AND ANY
23   * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
24   * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25   * DISCLAIMED. IN NO EVENT SHALL THE CODEHAUS OR ITS CONTRIBUTORS BE LIABLE FOR
26   * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27   * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
28   * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29   * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30   * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31   * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
32   * DAMAGE.
33   *
34   */
35  
36  package groovy.util;
37  
38  import java.io.*;
39  import java.nio.charset.Charset;
40  import java.util.*;
41  
/**
 * <p>Utility class to guess the encoding of a given text file.</p>
 *
 * <p>Unicode files encoded in UTF-16 (little or big endian) or UTF-8 files
 * with a Byte Order Marker are correctly discovered. For UTF-8 files with no BOM,
 * the charset is discovered from the multi-byte sequences found in the sampled bytes.</p>
 *
 * <p>Only the first 4KB of the file are sampled, which is usually sufficient
 * to be able to guess the encoding.</p>
 *
 * <p>Usage:</p>
 * <pre>
 * CharsetToolkit toolkit = new CharsetToolkit(file);
 *
 * // guess the encoding
 * Charset guessedCharset = toolkit.getCharset();
 *
 * // create a reader with the correct charset
 * BufferedReader reader = toolkit.getReader();
 *
 * // read the file content
 * String line;
 * while ((line = reader.readLine()) != null)
 * {
 *     System.out.println(line);
 * }
 * </pre>
 *
 * @author Guillaume Laforge
 */
public class CharsetToolkit {
    /** Maximum number of leading bytes of the file that are sampled to guess the encoding. */
    private static final int BUFFER_SIZE = 4096;

    private final byte[] buffer;
    private Charset defaultCharset;
    private Charset charset;
    private boolean enforce8Bit = true;
    private final File file;

    /**
     * Constructor of the <code>CharsetToolkit</code> utility class.
     * Reads up to the first 4096 bytes of the file; the encoding is later guessed
     * from that sample only.
     *
     * @param file of which we want to know the encoding.
     * @throws IOException if the file cannot be opened or read.
     */
    public CharsetToolkit(File file) throws IOException {
        this.file = file;
        this.defaultCharset = getDefaultSystemCharset();
        this.charset = null;
        InputStream input = new FileInputStream(file);
        try {
            byte[] bytes = new byte[BUFFER_SIZE];
            int total = 0;
            // a single read() may return fewer bytes than are available,
            // so keep reading until the sample is full or the end of file is reached
            while (total < BUFFER_SIZE) {
                int count = input.read(bytes, total, BUFFER_SIZE - total);
                if (count == -1) {
                    break;
                }
                total += count;
            }
            if (total == BUFFER_SIZE) {
                this.buffer = bytes;
            }
            else {
                // trim the sample to the bytes actually read (possibly none)
                byte[] bytesToGuess = new byte[total];
                System.arraycopy(bytes, 0, bytesToGuess, 0, total);
                this.buffer = bytesToGuess;
            }
        } finally {
            try {
                input.close();
            } catch (IOException ignored) {
                // the sample has already been read; a failure to close does not affect the guess
            }
        }
    }

    /**
     * Defines the default <code>Charset</code> used in case the buffer represents
     * an 8-bit <code>Charset</code>.
     *
     * @param defaultCharset the default <code>Charset</code> to be returned by <code>getCharset()</code>
     * if an 8-bit <code>Charset</code> is encountered; <code>null</code> resets it to the
     * default charset of the system.
     */
    public void setDefaultCharset(Charset defaultCharset) {
        if (defaultCharset != null) {
            this.defaultCharset = defaultCharset;
        }
        else {
            this.defaultCharset = getDefaultSystemCharset();
        }
    }

    /**
     * Returns the charset guessed for the file. The guess is computed on the first call
     * and cached, so later changes made through <code>setDefaultCharset()</code> or
     * <code>setEnforce8Bit()</code> do not alter an already computed result.
     *
     * @return the guessed <code>Charset</code>.
     */
    public Charset getCharset() {
        if (this.charset == null) {
            this.charset = guessEncoding();
        }
        return charset;
    }

    /**
     * If US-ASCII is recognized, enforce to return the default encoding, rather than US-ASCII.
     * It might be a file without any special character in the range 128-255, but that may be or become
     * a file encoded with the default <code>charset</code> rather than US-ASCII.
     *
     * @param enforce a boolean specifying the use or not of US-ASCII.
     */
    public void setEnforce8Bit(boolean enforce) {
        this.enforce8Bit = enforce;
    }

    /**
     * Gets the enforce8Bit flag, in case we do not want to ever get a US-ASCII encoding.
     *
     * @return a boolean representing the flag of use of US-ASCII.
     */
    public boolean getEnforce8Bit() {
        return this.enforce8Bit;
    }

    /**
     * Retrieves the default Charset.
     *
     * @return the <code>Charset</code> returned when the file looks like an 8-bit encoded file.
     */
    public Charset getDefaultCharset() {
        return defaultCharset;
    }

    /**
     * <p>Guess the encoding of the sampled buffer.</p>
     *
     * <p>If Byte Order Markers are encountered at the beginning of the buffer, we immediately
     * return the charset implied by this BOM. Otherwise, the file would not be a human
     * readable text file.</p>
     *
     * <p>If there is no BOM, this method tries to discern whether the file is UTF-8 or not.
     * If it is not UTF-8, we assume the encoding is the default encoding
     * (of course, it might be any 8-bit charset, but usually, an 8-bit charset is the default one).</p>
     *
     * <p>It is possible to discern UTF-8 thanks to the pattern of characters with a multi-byte sequence.</p>
     * <pre>
     * UCS-4 range (hex.)        UTF-8 octet sequence (binary)
     * 0000 0000-0000 007F       0xxxxxxx
     * 0000 0080-0000 07FF       110xxxxx 10xxxxxx
     * 0000 0800-0000 FFFF       1110xxxx 10xxxxxx 10xxxxxx
     * 0001 0000-001F FFFF       11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 0020 0000-03FF FFFF       111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 0400 0000-7FFF FFFF       1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     * </pre>
     * <p>With UTF-8, 0xFE and 0xFF never appear.</p>
     *
     * <p>The whole sample is scanned, including its last bytes. A multi-byte sequence that is
     * cut off by the end of the sample is tolerated only when the sample is full (the file may
     * continue beyond it); in a shorter sample it marks the content as not being UTF-8.</p>
     *
     * @return the Charset recognized.
     */
    private Charset guessEncoding() {
        // if the file has a Byte Order Marker, we can assume the file is in UTF-xx
        // otherwise, the file would not be human readable
        if (hasUTF8Bom()) {
            return Charset.forName("UTF-8");
        }
        if (hasUTF16LEBom()) {
            return Charset.forName("UTF-16LE");
        }
        if (hasUTF16BEBom()) {
            return Charset.forName("UTF-16BE");
        }

        // if a byte has its most significant bit set, the file is in UTF-8 or in the default encoding
        // otherwise, the file is in US-ASCII
        boolean highOrderBit = false;

        // if the file is in UTF-8, high order bytes must have a certain value, in order to be valid
        // if it's not the case, we can assume the encoding is the default encoding
        boolean validU8Char = true;

        int length = buffer.length;

        // a full sample may have been cut in the middle of a multi-byte sequence
        boolean sampleMayBeTruncated = length >= BUFFER_SIZE;

        int i = 0;
        while (validU8Char && i < length) {
            byte b0 = buffer[i];
            if (b0 < 0) {
                // a high order bit was encountered, thus the encoding is not US-ASCII
                // it may be either an 8-bit encoding or UTF-8
                highOrderBit = true;

                int expected = expectedContinuationBytes(b0);
                if (expected == 0) {
                    // stray continuation byte, 0xFE or 0xFF: not a valid UTF-8 lead byte
                    validU8Char = false;
                }
                else {
                    // every continuation byte present in the sample must have the form 10xxxxxx
                    int available = Math.min(expected, length - i - 1);
                    for (int k = 1; k <= available; k++) {
                        if (!isContinuationChar(buffer[i + k])) {
                            validU8Char = false;
                            break;
                        }
                    }
                    if (validU8Char && available < expected) {
                        // the sequence runs past the end of the sample: it is only acceptable
                        // when the file may continue beyond the sampled bytes
                        if (!sampleMayBeTruncated) {
                            validU8Char = false;
                        }
                        break;
                    }
                    // skip the continuation bytes that were just validated
                    i += expected;
                }
            }
            i++;
        }
        // if no byte with an high order bit set, the encoding is US-ASCII
        // (it might have been UTF-7, but this encoding is usually internally used only by mail systems)
        if (!highOrderBit) {
            // returns the default charset rather than US-ASCII if the enforce8Bit flag is set.
            if (this.enforce8Bit) {
                return this.defaultCharset;
            }
            return Charset.forName("US-ASCII");
        }
        // if no invalid UTF-8 were encountered, we can assume the encoding is UTF-8,
        // otherwise the file would not be human readable
        if (validU8Char) {
            return Charset.forName("UTF-8");
        }
        // finally, if it's not UTF-8 nor US-ASCII, let's assume the encoding is the default encoding
        return this.defaultCharset;
    }

    /**
     * Gives the number of continuation bytes that must follow the given lead byte.
     *
     * @param b a byte with its high order bit set.
     * @return a value between 1 and 5 for the first byte of a two- to six-bytes sequence,
     * or 0 if the byte cannot start a multi-byte sequence.
     */
    private static int expectedContinuationBytes(byte b) {
        if (isTwoBytesSequence(b)) {
            return 1;
        }
        if (isThreeBytesSequence(b)) {
            return 2;
        }
        if (isFourBytesSequence(b)) {
            return 3;
        }
        if (isFiveBytesSequence(b)) {
            return 4;
        }
        if (isSixBytesSequence(b)) {
            return 5;
        }
        return 0;
    }

    /**
     * If the byte has the form 10xxxxxx, then it's a continuation byte of a multiple byte character.
     *
     * @param b a byte.
     * @return true if it's a continuation char.
     */
    private static boolean isContinuationChar(byte b) {
        return -128 <= b && b <= -65;
    }

    /**
     * If the byte has the form 110xxxxx, then it's the first byte of a two-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a two-bytes sequence.
     */
    private static boolean isTwoBytesSequence(byte b) {
        return -64 <= b && b <= -33;
    }

    /**
     * If the byte has the form 1110xxxx, then it's the first byte of a three-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a three-bytes sequence.
     */
    private static boolean isThreeBytesSequence(byte b) {
        return -32 <= b && b <= -17;
    }

    /**
     * If the byte has the form 11110xxx, then it's the first byte of a four-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a four-bytes sequence.
     */
    private static boolean isFourBytesSequence(byte b) {
        return -16 <= b && b <= -9;
    }

    /**
     * If the byte has the form 111110xx, then it's the first byte of a five-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a five-bytes sequence.
     */
    private static boolean isFiveBytesSequence(byte b) {
        return -8 <= b && b <= -5;
    }

    /**
     * If the byte has the form 1111110x, then it's the first byte of a six-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a six-bytes sequence.
     */
    private static boolean isSixBytesSequence(byte b) {
        return -4 <= b && b <= -3;
    }

    /**
     * Retrieve the default charset of the system, as named by the
     * <code>file.encoding</code> system property.
     *
     * @return the default <code>Charset</code>.
     */
    public static Charset getDefaultSystemCharset() {
        return Charset.forName(System.getProperty("file.encoding"));
    }

    /**
     * Has a Byte Order Marker for UTF-8 (Used by Microsoft's Notepad and other editors),
     * i.e. the sample starts with the bytes EF BB BF.
     *
     * @return true if the buffer has a BOM for UTF8.
     */
    public boolean hasUTF8Bom() {
        return buffer.length >= 3
            && buffer[0] == -17 && buffer[1] == -69 && buffer[2] == -65;
    }

    /**
     * Has a Byte Order Marker for UTF-16 Little Endian,
     * i.e. the sample starts with the bytes FF FE.
     *
     * @return true if the buffer has a BOM for UTF-16 Little Endian.
     */
    public boolean hasUTF16LEBom() {
        return buffer.length >= 2
            && buffer[0] == -1 && buffer[1] == -2;
    }

    /**
     * Has a Byte Order Marker for UTF-16 Big Endian,
     * i.e. the sample starts with the bytes FE FF.
     *
     * @return true if the buffer has a BOM for UTF-16 Big Endian.
     */
    public boolean hasUTF16BEBom() {
        return buffer.length >= 2
            && buffer[0] == -2 && buffer[1] == -1;
    }

    /**
     * Gets a <code>BufferedReader</code> (indeed a <code>LineNumberReader</code>) from the <code>File</code>
     * specified in the constructor of <code>CharsetToolkit</code> using the charset returned by
     * <code>getCharset()</code>. When the sampled bytes start with a Byte Order Marker,
     * the first character is consumed so that the caller does not see the BOM.
     *
     * @return a <code>BufferedReader</code>
     * @throws FileNotFoundException if the file is not found.
     */
    public BufferedReader getReader() throws FileNotFoundException {
        // resolve the charset before opening the file, so that no stream is left open
        // should the guess fail
        Charset readerCharset = getCharset();
        LineNumberReader reader = new LineNumberReader(new InputStreamReader(new FileInputStream(file), readerCharset));
        if (hasUTF8Bom() || hasUTF16LEBom() || hasUTF16BEBom()) {
            try {
                // the BOM is decoded as a single character: skip it
                reader.read();
            }
            catch (IOException ignored) {
                // should never happen, as a file with no content
                // but with a BOM has at least one char
            }
        }
        return reader;
    }

    /**
     * Retrieves all the available <code>Charset</code>s on the platform,
     * among which the default <code>charset</code>.
     *
     * @return an array of <code>Charset</code>s.
     */
    public static Charset[] getAvailableCharsets() {
        Collection collection = Charset.availableCharsets().values();
        return (Charset[]) collection.toArray(new Charset[collection.size()]);
    }
}
424     }
425 }