| /* UTF_8.java -- |
| Copyright (C) 2002, 2004, 2005 Free Software Foundation, Inc. |
| |
| This file is part of GNU Classpath. |
| |
| GNU Classpath is free software; you can redistribute it and/or modify |
| it under the terms of the GNU General Public License as published by |
| the Free Software Foundation; either version 2, or (at your option) |
| any later version. |
| |
| GNU Classpath is distributed in the hope that it will be useful, but |
| WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| General Public License for more details. |
| |
| You should have received a copy of the GNU General Public License |
| along with GNU Classpath; see the file COPYING. If not, write to the |
| Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA |
| 02110-1301 USA. |
| |
| Linking this library statically or dynamically with other modules is |
| making a combined work based on this library. Thus, the terms and |
| conditions of the GNU General Public License cover the whole |
| combination. |
| |
| As a special exception, the copyright holders of this library give you |
| permission to link this library with independent modules to produce an |
| executable, regardless of the license terms of these independent |
| modules, and to copy and distribute the resulting executable under |
| terms of your choice, provided that you also meet, for each linked |
| independent module, the terms and conditions of the license of that |
| module. An independent module is a module which is not derived from |
| or based on this library. If you modify this library, you may extend |
| this exception to your version of the library, but you are not |
| obligated to do so. If you do not wish to do so, delete this |
| exception statement from your version. */ |
| |
| package gnu.java.nio.charset; |
| |
| import java.nio.ByteBuffer; |
| import java.nio.CharBuffer; |
| import java.nio.charset.Charset; |
| import java.nio.charset.CharsetDecoder; |
| import java.nio.charset.CharsetEncoder; |
| import java.nio.charset.CoderResult; |
| |
| /** |
| * UTF-8 charset. |
| * |
| * <p> UTF-8 references: |
| * <ul> |
| * <li> <a href="http://ietf.org/rfc/rfc2279.txt">RFC 2279</a> |
| * <li> The <a href="http://www.unicode.org/unicode/standard/standard.html"> |
| * Unicode standard</a> and |
| * <a href="http://www.unicode.org/versions/corrigendum1.html"> |
| * Corrigendum</a> |
| * </ul> |
| * |
| * @author Jesse Rosenstock |
| */ |
| final class UTF_8 extends Charset |
| { |
| UTF_8 () |
| { |
| super ("UTF-8", new String[] { |
| /* These names are provided by |
| * http://oss.software.ibm.com/cgi-bin/icu/convexp?s=ALL |
| */ |
| "ibm-1208", "ibm-1209", "ibm-5304", "ibm-5305", |
| "windows-65001", "cp1208", |
| // see http://java.sun.com/j2se/1.5.0/docs/guide/intl/encoding.doc.html |
| "UTF8" |
| }); |
| } |
| |
| public boolean contains (Charset cs) |
| { |
| return cs instanceof US_ASCII || cs instanceof ISO_8859_1 |
| || cs instanceof UTF_8 || cs instanceof UTF_16BE |
| || cs instanceof UTF_16LE || cs instanceof UTF_16; |
| } |
| |
| public CharsetDecoder newDecoder () |
| { |
| return new Decoder (this); |
| } |
| |
| public CharsetEncoder newEncoder () |
| { |
| return new Encoder (this); |
| } |
| |
| private static final class Decoder extends CharsetDecoder |
| { |
| // Package-private to avoid a trampoline constructor. |
| Decoder (Charset cs) |
| { |
| super (cs, 1f, 1f); |
| } |
| |
| protected CoderResult decodeLoop (ByteBuffer in, CharBuffer out) |
| { |
| // TODO: Optimize this in the case in.hasArray() / out.hasArray() |
| int inPos = in.position(); |
| try |
| { |
| while (in.hasRemaining ()) |
| { |
| char c; |
| byte b1 = in.get (); |
| int highNibble = ((b1 & 0xFF) >> 4) & 0xF; |
| switch (highNibble) |
| { |
| case 0: case 1: case 2: case 3: |
| case 4: case 5: case 6: case 7: |
| if (out.remaining () < 1) |
| return CoderResult.OVERFLOW; |
| out.put ((char) b1); |
| inPos++; |
| break; |
| |
| case 0xC: case 0xD: |
| byte b2; |
| if (in.remaining () < 1) |
| return CoderResult.UNDERFLOW; |
| if (out.remaining () < 1) |
| return CoderResult.OVERFLOW; |
| if (!isContinuation (b2 = in.get ())) |
| return CoderResult.malformedForLength (1); |
| c = (char) (((b1 & 0x1F) << 6) | (b2 & 0x3F)); |
| // check that we had the shortest encoding |
| if (c <= 0x7F) |
| return CoderResult.malformedForLength (2); |
| out.put (c); |
| inPos += 2; |
| break; |
| |
| case 0xE: |
| byte b3; |
| if (in.remaining () < 2) |
| return CoderResult.UNDERFLOW; |
| if (out.remaining () < 1) |
| return CoderResult.OVERFLOW; |
| if (!isContinuation (b2 = in.get ())) |
| return CoderResult.malformedForLength (1); |
| if (!isContinuation (b3 = in.get ())) |
| return CoderResult.malformedForLength (1); |
| c = (char) (((b1 & 0x0F) << 12) |
| | ((b2 & 0x3F) << 6) |
| | (b3 & 0x3F)); |
| // check that we had the shortest encoding |
| if (c <= 0x7FF) |
| return CoderResult.malformedForLength (3); |
| out.put (c); |
| inPos += 3; |
| break; |
| |
| case 0xF: |
| byte b4; |
| if (in.remaining () < 3) |
| return CoderResult.UNDERFLOW; |
| if((b1&0x0F) > 4) |
| return CoderResult.malformedForLength (4); |
| if (out.remaining () < 2) |
| return CoderResult.OVERFLOW; |
| if (!isContinuation (b2 = in.get ())) |
| return CoderResult.malformedForLength (3); |
| if (!isContinuation (b3 = in.get ())) |
| return CoderResult.malformedForLength (2); |
| if (!isContinuation (b4 = in.get ())) |
| return CoderResult.malformedForLength (1); |
| int n = (((b1 & 0x3) << 18) |
| | ((b2 & 0x3F) << 12) |
| | ((b3 & 0x3F) << 6) |
| | (b4 & 0x3F)) - 0x10000; |
| char c1 = (char)(0xD800 | (n & 0xFFC00)>>10); |
| char c2 = (char)(0xDC00 | (n & 0x003FF)); |
| out.put (c1); |
| out.put (c2); |
| inPos += 4; |
| break; |
| |
| default: |
| return CoderResult.malformedForLength (1); |
| } |
| } |
| |
| return CoderResult.UNDERFLOW; |
| } |
| finally |
| { |
| // In case we did a get(), then encountered an error, reset the |
| // position to before the error. If there was no error, this |
| // will benignly reset the position to the value it already has. |
| in.position (inPos); |
| } |
| } |
| |
| private static boolean isContinuation (byte b) |
| { |
| return (b & 0xC0) == 0x80; |
| } |
| } |
| |
| private static final class Encoder extends CharsetEncoder |
| { |
| // Package-private to avoid a trampoline constructor. |
| Encoder (Charset cs) |
| { |
| // According to |
| // http://www-106.ibm.com/developerworks/unicode/library/utfencodingforms/index.html |
| // On average, English takes slightly over one unit per code point. |
| // Most Latin-script languages take about 1.1 bytes. Greek, Russian, |
| // Arabic and Hebrew take about 1.7 bytes, and most others (including |
| // Japanese, Chinese, Korean and Hindi) take about 3 bytes. |
| // We assume we will be dealing with latin scripts, and use 1.1 |
| // for averageBytesPerChar. |
| super (cs, 1.1f, 4.0f); |
| } |
| |
| protected CoderResult encodeLoop (CharBuffer in, ByteBuffer out) |
| { |
| int inPos = in.position(); |
| try |
| { |
| // TODO: Optimize this in the case in.hasArray() / out.hasArray() |
| while (in.hasRemaining ()) |
| { |
| int remaining = out.remaining (); |
| char c = in.get (); |
| |
| // UCS-4 range (hex.) UTF-8 octet sequence (binary) |
| // 0000 0000-0000 007F 0xxxxxxx |
| // 0000 0080-0000 07FF 110xxxxx 10xxxxxx |
| // 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx |
| |
| // Scalar Value UTF-16 byte 1 byte 2 byte 3 byte 4 |
| // 0000 0000 0xxx xxxx 0000 0000 0xxx xxxx 0xxx xxxx |
| // 0000 0yyy yyxx xxxx 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx |
| // zzzz yyyy yyxx xxxx zzzz yyyy yyxx xxxx 1110 zzzz 10yy yyyy 10xx xxxx |
| // u uuuu zzzz yyyy yyxx xxxx 1101 10ww wwzz zzyy 1111 0uuu 10uu zzzz 10yy yyyy 10xx xxxx |
| // + 1101 11yy yyxx xxxx |
| // Note: uuuuu = wwww + 1 |
| if (c <= 0x7F) |
| { |
| if (remaining < 1) |
| return CoderResult.OVERFLOW; |
| out.put ((byte) c); |
| inPos++; |
| } |
| else if (c <= 0x7FF) |
| { |
| if (remaining < 2) |
| return CoderResult.OVERFLOW; |
| out.put ((byte) (0xC0 | (c >> 6))); |
| out.put ((byte) (0x80 | (c & 0x3F))); |
| inPos++; |
| } |
| else if (0xD800 <= c && c <= 0xDFFF) |
| { |
| if (remaining < 4) |
| return CoderResult.OVERFLOW; |
| |
| // we got a low surrogate without a preciding high one |
| if (c > 0xDBFF) |
| return CoderResult.malformedForLength (1); |
| |
| // high surrogates |
| if (!in.hasRemaining ()) |
| return CoderResult.UNDERFLOW; |
| |
| char d = in.get (); |
| |
| // make sure d is a low surrogate |
| if (d < 0xDC00 || d > 0xDFFF) |
| return CoderResult.malformedForLength (1); |
| |
| // make the 32 bit value |
| // int value2 = (c - 0xD800) * 0x400 + (d - 0xDC00) + 0x10000; |
| int value = (((c & 0x3FF) << 10) | (d & 0x3FF)) + 0x10000; |
| // assert value == value2; |
| out.put ((byte) (0xF0 | ((value >> 18) & 0x07))); |
| out.put ((byte) (0x80 | ((value >> 12) & 0x3F))); |
| out.put ((byte) (0x80 | ((value >> 6) & 0x3F))); |
| out.put ((byte) (0x80 | ((value ) & 0x3F))); |
| inPos += 2; |
| } |
| else |
| { |
| if (remaining < 3) |
| return CoderResult.OVERFLOW; |
| |
| out.put ((byte) (0xE0 | (c >> 12))); |
| out.put ((byte) (0x80 | ((c >> 6) & 0x3F))); |
| out.put ((byte) (0x80 | (c & 0x3F))); |
| inPos++; |
| } |
| } |
| |
| return CoderResult.UNDERFLOW; |
| } |
| finally |
| { |
| // In case we did a get(), then encountered an error, reset the |
| // position to before the error. If there was no error, this |
| // will benignly reset the position to the value it already has. |
| in.position (inPos); |
| } |
| } |
| } |
| } |