llvm-gcc-4.2/libjava/gnu/gcj/convert/Input_UTF8.java - llvm-archive - Git at Google

 /* Copyright (C) 1999, 2000  Free Software Foundation

    This file is part of libgcj.

 This software is copyrighted work licensed under the terms of the
 Libgcj License.  Please consult the file "LIBGCJ_LICENSE" for
 details.  */

 package gnu.gcj.convert;

 /**
  * Convert UTF8 to Unicode.
  * @author Per Bothner <bothner@cygnus.com>
  * @date March 1999.
  */

 public class Input_UTF8 extends BytesToUnicode
 {
   /**
    * Code point bits collected so far for a multi-byte sequence that
    * has not been completed yet.  Kept in a field so that the value
    * survives between calls to {@link #read}.
    */
   int partial = 0;

   /**
    * Number of continuation bytes that are still required to finish
    * the sequence currently being assembled in {@link #partial}.
    */
   int partial_bytes_expected = 0;

   /** Returns the name of this converter, "UTF8". */
   public String getName()
   {
     return "UTF8";
   }

   /**
    * Decodes bytes from <code>inbuffer</code>, beginning at
    * <code>inpos</code> and stopping before <code>inlength</code>,
    * into UTF-16 code units.  (Those three names are not declared in
    * this class; presumably they are inherited from BytesToUnicode.)
    *
    * Every loop iteration consumes one input byte and stores at most
    * one char.  Decoding stops once <code>count</code> chars have been
    * stored or the input is used up, whichever happens first.
    *
    * @param outbuffer array receiving the decoded chars
    * @param outpos index in <code>outbuffer</code> of the first char
    *        to store
    * @param count upper bound on the number of chars to store
    * @return the number of chars stored into <code>outbuffer</code>
    */
   public int read (char[] outbuffer, int outpos, int count)
   {
     final int start = outpos;

     while (outpos - start < count && inpos < inlength)
       {
         final int octet = inbuffer[inpos++];

         // A non-negative byte has its high bit clear: plain ASCII.
         if (octet >= 0)
           {
             outbuffer[outpos++] = (char) octet;
             continue;
           }

         // Anything that is not of the form 10xxxxxx opens a new
         // sequence.  The payload mask and the number of trailing
         // bytes both follow from the lead byte's high bits.
         if ((octet & 0xC0) != 0x80)
           {
             final int payloadMask;
             final int trailing;
             if ((octet & 0xE0) == 0xC0)
               {
                 payloadMask = 0x1F;
                 trailing = 1;
               }
             else if ((octet & 0xF0) == 0xE0)
               {
                 payloadMask = 0xF;
                 trailing = 2;
               }
             else
               {
                 payloadMask = 7;
                 trailing = 3;
               }
             partial = octet & payloadMask;
             partial_bytes_expected = trailing;
             continue;
           }

         // Continuation byte: append its six payload bits.
         partial = (partial << 6) | (octet & 0x3F);
         partial_bytes_expected--;

         switch (partial_bytes_expected)
           {
           case 1:
             // One byte is still outstanding.  If partial is already
             // above 0xFFFF >> 6, the finished value cannot fit in
             // a single char and needs a surrogate pair (hi, lo) with
             //   value == (hi-0xD800)*0x400 + (lo-0xDC00) + 0x10000.
             // Here partial == value >> 6, so
             //   hi == 0xD800 + ((partial - 0x400) >> 4)
             // is known before the last byte arrives and can be
             // stored right away.
             if (partial > (0xFFFF >> 6))
               {
                 final char highSurrogate
                   = (char) (0xD800 + ((partial - 0x400) >> 4));
                 outbuffer[outpos++] = highSurrogate;

                 // Seed partial with (lo >> 6), which equals
                 // 0x370 + (partial & 0xF) because 0xDC00 >> 6 is
                 // 0x370.  The generic "last byte" case below then
                 // yields the low surrogate with no extra handling.
                 partial = (partial & 0xF) + 0x370;
               }
             break;

           case 0:
             // Sequence finished: store the assembled value and start
             // over with an empty accumulator.
             outbuffer[outpos++] = (char) partial;
             partial = 0;
             break;

           default:
             // More continuation bytes are needed; nothing to store.
             break;
           }
       }

     return outpos - start;
   }
 }
	/* Copyright (C) 1999, 2000 Free Software Foundation

	This file is part of libgcj.

	This software is copyrighted work licensed under the terms of the
	Libgcj License. Please consult the file "LIBGCJ_LICENSE" for
	details. */

	package gnu.gcj.convert;

	/**
	* Convert UTF8 to Unicode.
	* @author Per Bothner <bothner@cygnus.com>
	* @date March 1999.
	*/

	public class Input_UTF8 extends BytesToUnicode
	{
	public String getName() { return "UTF8"; }

	int partial = 0;
	int partial_bytes_expected = 0;
	//int suggogate_second = -1;

	public int read (char[] outbuffer, int outpos, int count)
	{
	int origpos = outpos;
	for (;;)
	{
	if (outpos - origpos >= count)
	break;
	if (inpos >= inlength)
	break;
	int b = inbuffer[inpos++];
	if (b >= 0)
	outbuffer[outpos++] = (char) b;
	else
	{
	if ((b & 0xC0) == 0x80) // Continuation byte
	{
	partial = (partial << 6) \| (b & 0x3F);
	--partial_bytes_expected;
	if (partial_bytes_expected == 1)
	{
	if (partial > (0xFFFF>>6))
	{
	// The next continuation byte will cause the result
	// to exceed 0xFFFF, so we must use a surrogate pair.
	// The "Unicode scalar value" (see D28 in section 3.7
	// of the Unicode Standard 2.0) is defined as:
	// value == (hi-0xD800)*0x400+(lo-0xDC00)+0x10000,
	// where (hi, lo) is the Unicode surrogate pair.
	// After reading the first three bytes, we have:
	// partial == (value >> 6).
	// Substituting and simplifying, we get:
	// partial == (hi-0xD800)*0x10+((lo-0xDC00)>>6)+0x400.
	// The definition lo>=0xDC00 && lo<=0xDFFF implies
	// that (lo-0xDC00)>>6 is in the range 0..15.
	// Hence we can solve for `hi' and we can emit
	// the high-surrogate without waiting for the
	// final byte:
	outbuffer[outpos++]
	= (char) (0xD800 + ((partial - 0x400) >> 4));

	// Now we want to set it up so that when we read
	// the final byte on the next iteration, we will
	// get the low-surrogate without special handling.
	// I.e. we want:
	// lo == (next_partial << 6) \| (next & 0x3F)
	// where next is the next input byte and next_partial
	// is the value of partial at the end of this
	// iteration. This implies: next_partial == lo >> 6.
	// We can simplify the previous:
	// partial == (hi-0xD800)*0x10+((lo-0xDC00)>>6)+0x400,
	// to: partial == (hi-0xD800)*0x10+(lo>>6)+0x90.
	// Inserting the values of hi and next_partial,
	// and simplifying, we get: partial ==
	// ( (partial-0x400)&~0xF) + next_partial + 0x90.
	// Solving for next_partial, we get:
	// next_partial = partial+0x400-0x90-(partial&~0xF):
	// or: next_partial = (partial&0xF) + 0x370. Hence:
	partial = (partial & 0xF) + 0x370;
	}
	}
	else if (partial_bytes_expected == 0)
	{
	outbuffer[outpos++] = (char) partial;
	partial = 0;
	partial_bytes_expected = 0;
	}
	}
	else // prefix byte
	{
	if ((b & 0xE0) == 0xC0)
	{
	partial = b & 0x1F;
	partial_bytes_expected = 1;
	}
	else if ((b & 0xF0) == 0xE0)
	{
	partial = b & 0xF;
	partial_bytes_expected = 2;
	}
	else
	{
	partial = b & 7;
	partial_bytes_expected = 3;
	}
	}
	}
	}
	return outpos - origpos;
	}
	}