| /* XmlParser.java -- |
| Copyright (C) 1999,2000,2001 Free Software Foundation, Inc. |
| |
| This file is part of GNU Classpath. |
| |
| GNU Classpath is free software; you can redistribute it and/or modify |
| it under the terms of the GNU General Public License as published by |
| the Free Software Foundation; either version 2, or (at your option) |
| any later version. |
| |
| GNU Classpath is distributed in the hope that it will be useful, but |
| WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| General Public License for more details. |
| |
| You should have received a copy of the GNU General Public License |
| along with GNU Classpath; see the file COPYING. If not, write to the |
| Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA |
| 02111-1307 USA. |
| |
| Linking this library statically or dynamically with other modules is |
| making a combined work based on this library. Thus, the terms and |
| conditions of the GNU General Public License cover the whole |
| combination. |
| |
| As a special exception, the copyright holders of this library give you |
| permission to link this library with independent modules to produce an |
| executable, regardless of the license terms of these independent |
| modules, and to copy and distribute the resulting executable under |
| terms of your choice, provided that you also meet, for each linked |
| independent module, the terms and conditions of the license of that |
| module. An independent module is a module which is not derived from |
| or based on this library. If you modify this library, you may extend |
| this exception to your version of the library, but you are not |
| obligated to do so. If you do not wish to do so, delete this |
| exception statement from your version. |
| |
| Partly derived from code which carried the following notice: |
| |
| Copyright (c) 1997, 1998 by Microstar Software Ltd. |
| |
| AElfred is free for both commercial and non-commercial use and |
| redistribution, provided that Microstar's copyright and disclaimer are |
| retained intact. You are free to modify AElfred for your own use and |
| to redistribute AElfred with your modifications, provided that the |
| modifications are clearly documented. |
| |
| This program is distributed in the hope that it will be useful, but |
| WITHOUT ANY WARRANTY; without even the implied warranty of |
| merchantability or fitness for a particular purpose. Please use it AT |
| YOUR OWN RISK. |
| */ |
| |
| package gnu.xml.aelfred2; |
| |
| import gnu.java.security.action.GetPropertyAction; |
| |
| import java.io.BufferedInputStream; |
| import java.io.CharConversionException; |
| import java.io.EOFException; |
| import java.io.InputStream; |
| import java.io.InputStreamReader; |
| import java.io.IOException; |
| import java.io.Reader; |
| import java.io.UnsupportedEncodingException; |
| import java.net.URL; |
| import java.net.URLConnection; |
| import java.security.AccessController; |
| |
| import java.util.Iterator; |
| import java.util.HashMap; |
| import java.util.LinkedList; |
| |
| import org.xml.sax.InputSource; |
| import org.xml.sax.SAXException; |
| |
| |
| /** |
| * Parse XML documents and return parse events through call-backs. |
| * Use the <code>SAXDriver</code> class as your entry point, as all |
| * internal parser interfaces are subject to change. |
| * |
| * @author Written by David Megginson <dmeggins@microstar.com> |
| * (version 1.2a with bugfixes) |
| * @author Updated by David Brownell <dbrownell@users.sourceforge.net> |
| * @see SAXDriver |
| */ |
| final class XmlParser |
| { |
| |
| // avoid slow per-character readCh() |
| private final static boolean USE_CHEATS = true; |
| |
| //////////////////////////////////////////////////////////////////////// |
| // Constants. |
| //////////////////////////////////////////////////////////////////////// |
| |
| // |
| // Constants for element content type. |
| // |
| |
| /** |
| * Constant: an element has not been declared. |
| * @see #getElementContentType |
| */ |
| public final static int CONTENT_UNDECLARED = 0; |
| |
| /** |
| * Constant: the element has a content model of ANY. |
| * @see #getElementContentType |
| */ |
| public final static int CONTENT_ANY = 1; |
| |
| /** |
| * Constant: the element has declared content of EMPTY. |
| * @see #getElementContentType |
| */ |
| public final static int CONTENT_EMPTY = 2; |
| |
| /** |
| * Constant: the element has mixed content. |
| * @see #getElementContentType |
| */ |
| public final static int CONTENT_MIXED = 3; |
| |
| /** |
| * Constant: the element has element content. |
| * @see #getElementContentType |
| */ |
| public final static int CONTENT_ELEMENTS = 4; |
| |
| |
| // |
| // Constants for the entity type. |
| // |
| |
| /** |
| * Constant: the entity has not been declared. |
| * @see #getEntityType |
| */ |
| public final static int ENTITY_UNDECLARED = 0; |
| |
| /** |
| * Constant: the entity is internal. |
| * @see #getEntityType |
| */ |
| public final static int ENTITY_INTERNAL = 1; |
| |
| /** |
| * Constant: the entity is external, non-parsable data. |
| * @see #getEntityType |
| */ |
| public final static int ENTITY_NDATA = 2; |
| |
| /** |
| * Constant: the entity is external XML data. |
| * @see #getEntityType |
| */ |
| public final static int ENTITY_TEXT = 3; |
| |
| // |
| // Attribute type constants are interned literal strings. |
| // |
| |
| // |
| // Constants for supported encodings. "external" is just a flag. |
| // |
| private final static int ENCODING_EXTERNAL = 0; |
| private final static int ENCODING_UTF_8 = 1; |
| private final static int ENCODING_ISO_8859_1 = 2; |
| private final static int ENCODING_UCS_2_12 = 3; |
| private final static int ENCODING_UCS_2_21 = 4; |
| private final static int ENCODING_UCS_4_1234 = 5; |
| private final static int ENCODING_UCS_4_4321 = 6; |
| private final static int ENCODING_UCS_4_2143 = 7; |
| private final static int ENCODING_UCS_4_3412 = 8; |
| private final static int ENCODING_ASCII = 9; |
| |
| // |
| // Constants for attribute default value. |
| // |
| |
| /** |
| * Constant: the attribute is not declared. |
| * @see #getAttributeDefaultValueType |
| */ |
| public final static int ATTRIBUTE_DEFAULT_UNDECLARED = 30; |
| |
| /** |
| * Constant: the attribute has a literal default value specified. |
| * @see #getAttributeDefaultValueType |
| * @see #getAttributeDefaultValue |
| */ |
| public final static int ATTRIBUTE_DEFAULT_SPECIFIED = 31; |
| |
| /** |
| * Constant: the attribute was declared #IMPLIED. |
| * @see #getAttributeDefaultValueType |
| */ |
| public final static int ATTRIBUTE_DEFAULT_IMPLIED = 32; |
| |
| /** |
| * Constant: the attribute was declared #REQUIRED. |
| * @see #getAttributeDefaultValueType |
| */ |
| public final static int ATTRIBUTE_DEFAULT_REQUIRED = 33; |
| |
| /** |
| * Constant: the attribute was declared #FIXED. |
| * @see #getAttributeDefaultValueType |
| * @see #getAttributeDefaultValue |
| */ |
| public final static int ATTRIBUTE_DEFAULT_FIXED = 34; |
| |
| // |
| // Constants for input. |
| // |
| private final static int INPUT_NONE = 0; |
| private final static int INPUT_INTERNAL = 1; |
| private final static int INPUT_STREAM = 3; |
| private final static int INPUT_READER = 5; |
| |
| // |
| // Flags for reading literals. |
| // |
| // expand general entity refs (attribute values in dtd and content) |
| private final static int LIT_ENTITY_REF = 2; |
| // normalize this value (space chars) (attributes, public ids) |
| private final static int LIT_NORMALIZE = 4; |
| // literal is an attribute value |
| private final static int LIT_ATTRIBUTE = 8; |
| // don't expand parameter entities |
| private final static int LIT_DISABLE_PE = 16; |
| // don't expand [or parse] character refs |
| private final static int LIT_DISABLE_CREF = 32; |
| // don't parse general entity refs |
| private final static int LIT_DISABLE_EREF = 64; |
| // literal is a public ID value |
| private final static int LIT_PUBID = 256; |
| |
| // |
| // Flags affecting PE handling in DTDs (if expandPE is true). |
| // PEs expand with space padding, except inside literals. |
| // |
| private final static int CONTEXT_NORMAL = 0; |
| private final static int CONTEXT_LITERAL = 1; |
| |
| // Emit warnings for relative URIs with no base URI. |
| static boolean uriWarnings; |
| static |
| { |
| String key = "gnu.xml.aelfred2.XmlParser.uriWarnings"; |
| GetPropertyAction a = new GetPropertyAction(key); |
| uriWarnings = "true".equals(AccessController.doPrivileged(a)); |
| } |
| |
| // |
| // The current XML handler interface. |
| // |
| private SAXDriver handler; |
| |
| // |
| // I/O information. |
| // |
| private Reader reader; // current reader |
| private InputStream is; // current input stream |
| private int line; // current line number |
| private int column; // current column number |
| private int sourceType; // type of input source |
| private LinkedList inputStack; // stack of input soruces |
| private URLConnection externalEntity; // current external entity |
| private int encoding; // current character encoding |
| private int currentByteCount; // bytes read from current source |
| private InputSource scratch; // temporary |
| |
| // |
| // Buffers for decoded but unparsed character input. |
| // |
| private char[] readBuffer; |
| private int readBufferPos; |
| private int readBufferLength; |
| private int readBufferOverflow; // overflow from last data chunk. |
| |
| // |
| // Buffer for undecoded raw byte input. |
| // |
| private final static int READ_BUFFER_MAX = 16384; |
| private byte[] rawReadBuffer; |
| |
| |
| // |
| // Buffer for attribute values, char refs, DTD stuff. |
| // |
| private static int DATA_BUFFER_INITIAL = 4096; |
| private char[] dataBuffer; |
| private int dataBufferPos; |
| |
| // |
| // Buffer for parsed names. |
| // |
| private static int NAME_BUFFER_INITIAL = 1024; |
| private char[] nameBuffer; |
| private int nameBufferPos; |
| |
| // |
| // Save any standalone flag |
| // |
| private boolean docIsStandalone; |
| |
| // |
| // Hashtables for DTD information on elements, entities, and notations. |
| // Populated until we start ignoring decls (because of skipping a PE) |
| // |
| private HashMap elementInfo; |
| private HashMap entityInfo; |
| private HashMap notationInfo; |
| private boolean skippedPE; |
| |
| // |
| // Element type currently in force. |
| // |
| private String currentElement; |
| private int currentElementContent; |
| |
| // |
| // Stack of entity names, to detect recursion. |
| // |
| private LinkedList entityStack; |
| |
| // |
| // PE expansion is enabled in most chunks of the DTD, not all. |
| // When it's enabled, literals are treated differently. |
| // |
| private boolean inLiteral; |
| private boolean expandPE; |
| private boolean peIsError; |
| |
| // |
| // can't report entity expansion inside two constructs: |
| // - attribute expansions (internal entities only) |
| // - markup declarations (parameter entities only) |
| // |
| private boolean doReport; |
| |
| // |
| // Symbol table, for caching interned names. |
| // |
| // These show up wherever XML names or nmtokens are used: naming elements, |
| // attributes, PIs, notations, entities, and enumerated attribute values. |
| // |
| // NOTE: This hashtable doesn't grow. The default size is intended to be |
| // rather large for most documents. Example: one snapshot of the DocBook |
| // XML 4.1 DTD used only about 350 such names. As a rule, only pathological |
| // documents (ones that don't reuse names) should ever see much collision. |
| // |
| // Be sure that SYMBOL_TABLE_LENGTH always stays prime, for best hashing. |
| // "2039" keeps the hash table size at about two memory pages on typical |
| // 32 bit hardware. |
| // |
| private final static int SYMBOL_TABLE_LENGTH = 2039; |
| |
| private Object[][] symbolTable; |
| |
| // |
| // Hash table of attributes found in current start tag. |
| // |
| private String[] tagAttributes; |
| private int tagAttributePos; |
| |
| // |
| // Utility flag: have we noticed a CR while reading the last |
| // data chunk? If so, we will have to go back and normalise |
| // CR or CR/LF line ends. |
| // |
| private boolean sawCR; |
| |
| // |
| // Utility flag: are we in CDATA? If so, whitespace isn't ignorable. |
| // |
| private boolean inCDATA; |
| |
| // |
| // Xml version. |
| // |
| private static final int XML_10 = 0; |
| private static final int XML_11 = 1; |
| private int xmlVersion = XML_10; |
| |
| ////////////////////////////////////////////////////////////////////// |
| // Constructors. |
| //////////////////////////////////////////////////////////////////////// |
| |
| /** |
| * Construct a new parser with no associated handler. |
| * @see #setHandler |
| * @see #parse |
| */ |
| // package private |
| XmlParser() |
| { |
| } |
| |
| /** |
| * Set the handler that will receive parsing events. |
| * @param handler The handler to receive callback events. |
| * @see #parse |
| */ |
| // package private |
| void setHandler(SAXDriver handler) |
| { |
| this.handler = handler; |
| } |
| |
| /** |
| * Parse an XML document from the character stream, byte stream, or URI |
| * that you provide (in that order of preference). Any URI that you |
| * supply will become the base URI for resolving relative URI, and may |
| * be used to acquire a reader or byte stream. |
| * |
| * <p> Only one thread at a time may use this parser; since it is |
| * private to this package, post-parse cleanup is done by the caller, |
| * which MUST NOT REUSE the parser (just null it). |
| * |
| * @param systemId Absolute URI of the document; should never be null, |
| * but may be so iff a reader <em>or</em> a stream is provided. |
| * @param publicId The public identifier of the document, or null. |
| * @param reader A character stream; must be null if stream isn't. |
| * @param stream A byte input stream; must be null if reader isn't. |
| * @param encoding The suggested encoding, or null if unknown. |
| * @exception java.lang.Exception Basically SAXException or IOException |
| */ |
| // package private |
| void doParse(String systemId, String publicId, Reader reader, |
| InputStream stream, String encoding) |
| throws Exception |
| { |
| if (handler == null) |
| { |
| throw new IllegalStateException("no callback handler"); |
| } |
| |
| initializeVariables(); |
| |
| // predeclare the built-in entities here (replacement texts) |
| // we don't need to intern(), since we're guaranteed literals |
| // are always (globally) interned. |
| setInternalEntity("amp", "&"); |
| setInternalEntity("lt", "<"); |
| setInternalEntity("gt", ">"); |
| setInternalEntity("apos", "'"); |
| setInternalEntity("quot", """); |
| |
| try |
| { |
| // pushURL first to ensure locator is correct in startDocument |
| // ... it might report an IO or encoding exception. |
| handler.startDocument(); |
| pushURL(false, "[document]", |
| // default baseURI: null |
| new ExternalIdentifiers(publicId, systemId, null), |
| reader, stream, encoding, false); |
| |
| parseDocument(); |
| } |
| catch (EOFException e) |
| { |
| //empty input |
| error("empty document, with no root element."); |
| } |
| finally |
| { |
| if (reader != null) |
| { |
| try |
| { |
| reader.close(); |
| } |
| catch (IOException e) |
| { |
| /* ignore */ |
| } |
| } |
| if (stream != null) |
| { |
| try |
| { |
| stream.close(); |
| } |
| catch (IOException e) |
| { |
| /* ignore */ |
| } |
| } |
| if (is != null) |
| { |
| try |
| { |
| is.close(); |
| } |
| catch (IOException e) |
| { |
| /* ignore */ |
| } |
| } |
| scratch = null; |
| } |
| } |
| |
| ////////////////////////////////////////////////////////////////////// |
| // Error reporting. |
| ////////////////////////////////////////////////////////////////////// |
| |
| /** |
| * Report an error. |
| * @param message The error message. |
| * @param textFound The text that caused the error (or null). |
| * @see SAXDriver#error |
| * @see #line |
| */ |
| private void error(String message, String textFound, String textExpected) |
| throws SAXException |
| { |
| if (textFound != null) |
| { |
| message = message + " (found \"" + textFound + "\")"; |
| } |
| if (textExpected != null) |
| { |
| message = message + " (expected \"" + textExpected + "\")"; |
| } |
| handler.fatal(message); |
| |
| // "can't happen" |
| throw new SAXException(message); |
| } |
| |
| /** |
| * Report a serious error. |
| * @param message The error message. |
| * @param textFound The text that caused the error (or null). |
| */ |
| private void error(String message, char textFound, String textExpected) |
| throws SAXException |
| { |
| error(message, new Character(textFound).toString(), textExpected); |
| } |
| |
| /** |
| * Report typical case fatal errors. |
| */ |
| private void error(String message) |
| throws SAXException |
| { |
| handler.fatal(message); |
| } |
| |
| ////////////////////////////////////////////////////////////////////// |
| // Major syntactic productions. |
| ////////////////////////////////////////////////////////////////////// |
| |
| /** |
| * Parse an XML document. |
| * <pre> |
| * [1] document ::= prolog element Misc* |
| * </pre> |
| * <p>This is the top-level parsing function for a single XML |
| * document. As a minimum, a well-formed document must have |
| * a document element, and a valid document must have a prolog |
| * (one with doctype) as well. |
| */ |
| private void parseDocument() |
| throws Exception |
| { |
| try |
| { // added by MHK |
| boolean sawDTD = parseProlog(); |
| require('<'); |
| parseElement(!sawDTD); |
| } |
| catch (EOFException ee) |
| { // added by MHK |
| error("premature end of file", "[EOF]", null); |
| } |
| |
| try |
| { |
| parseMisc(); //skip all white, PIs, and comments |
| char c = readCh(); //if this doesn't throw an exception... |
| error("unexpected characters after document end", c, null); |
| } |
| catch (EOFException e) |
| { |
| return; |
| } |
| } |
| |
| static final char[] startDelimComment = { '<', '!', '-', '-' }; |
| static final char[] endDelimComment = { '-', '-' }; |
| |
| /** |
| * Skip a comment. |
| * <pre> |
| * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* "-->" |
| * </pre> |
| * <p> (The <code><!--</code> has already been read.) |
| */ |
| private void parseComment() |
| throws Exception |
| { |
| char c; |
| boolean saved = expandPE; |
| |
| expandPE = false; |
| parseUntil(endDelimComment); |
| require('>'); |
| expandPE = saved; |
| handler.comment(dataBuffer, 0, dataBufferPos); |
| dataBufferPos = 0; |
| } |
| |
| static final char[] startDelimPI = { '<', '?' }; |
| static final char[] endDelimPI = { '?', '>' }; |
| |
| /** |
| * Parse a processing instruction and do a call-back. |
| * <pre> |
| * [16] PI ::= '<?' PITarget |
| * (S (Char* - (Char* '?>' Char*)))? |
| * '?>' |
| * [17] PITarget ::= Name - ( ('X'|'x') ('M'|m') ('L'|l') ) |
| * </pre> |
| * <p> (The <code><?</code> has already been read.) |
| */ |
| private void parsePI() |
| throws SAXException, IOException |
| { |
| String name; |
| boolean saved = expandPE; |
| |
| expandPE = false; |
| name = readNmtoken(true); |
| //NE08 |
| if (name.indexOf(':') >= 0) |
| { |
| error("Illegal character(':') in processing instruction name ", |
| name, null); |
| } |
| if ("xml".equalsIgnoreCase(name)) |
| { |
| error("Illegal processing instruction target", name, null); |
| } |
| if (!tryRead(endDelimPI)) |
| { |
| requireWhitespace(); |
| parseUntil(endDelimPI); |
| } |
| expandPE = saved; |
| handler.processingInstruction(name, dataBufferToString()); |
| } |
| |
| static final char[] endDelimCDATA = { ']', ']', '>' }; |
| |
| private boolean isDirtyCurrentElement; |
| |
| /** |
| * Parse a CDATA section. |
| * <pre> |
| * [18] CDSect ::= CDStart CData CDEnd |
| * [19] CDStart ::= '<![CDATA[' |
| * [20] CData ::= (Char* - (Char* ']]>' Char*)) |
| * [21] CDEnd ::= ']]>' |
| * </pre> |
| * <p> (The '<![CDATA[' has already been read.) |
| */ |
| private void parseCDSect() |
| throws Exception |
| { |
| parseUntil(endDelimCDATA); |
| dataBufferFlush(); |
| } |
| |
| /** |
| * Parse the prolog of an XML document. |
| * <pre> |
| * [22] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)? |
| * </pre> |
| * <p>We do not look for the XML declaration here, because it was |
| * handled by pushURL (). |
| * @see pushURL |
| * @return true if a DTD was read. |
| */ |
| private boolean parseProlog() |
| throws Exception |
| { |
| parseMisc(); |
| |
| if (tryRead("<!DOCTYPE")) |
| { |
| parseDoctypedecl(); |
| parseMisc(); |
| return true; |
| } |
| return false; |
| } |
| |
| private void checkLegalVersion(String version) |
| throws SAXException |
| { |
| int len = version.length(); |
| for (int i = 0; i < len; i++) |
| { |
| char c = version.charAt(i); |
| if ('0' <= c && c <= '9') |
| { |
| continue; |
| } |
| if (c == '_' || c == '.' || c == ':' || c == '-') |
| { |
| continue; |
| } |
| if ('a' <= c && c <= 'z') |
| { |
| continue; |
| } |
| if ('A' <= c && c <= 'Z') |
| { |
| continue; |
| } |
| error ("illegal character in version", version, "1.0"); |
| } |
| } |
| |
| /** |
| * Parse the XML declaration. |
| * <pre> |
| * [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>' |
| * [24] VersionInfo ::= S 'version' Eq |
| * ("'" VersionNum "'" | '"' VersionNum '"' ) |
| * [26] VersionNum ::= ([a-zA-Z0-9_.:] | '-')* |
| * [32] SDDecl ::= S 'standalone' Eq |
| * ( "'"" ('yes' | 'no') "'"" | '"' ("yes" | "no") '"' ) |
| * [80] EncodingDecl ::= S 'encoding' Eq |
| * ( "'" EncName "'" | "'" EncName "'" ) |
| * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')* |
| * </pre> |
| * <p> (The <code><?xml</code> and whitespace have already been read.) |
| * @return the encoding in the declaration, uppercased; or null |
| * @see #parseTextDecl |
| * @see #setupDecoding |
| */ |
| private String parseXMLDecl(boolean ignoreEncoding) |
| throws SAXException, IOException |
| { |
| String version; |
| String encodingName = null; |
| String standalone = null; |
| int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF; |
| String inputEncoding = null; |
| |
| switch (this.encoding) |
| { |
| case ENCODING_EXTERNAL: |
| case ENCODING_UTF_8: |
| inputEncoding = "UTF-8"; |
| break; |
| case ENCODING_ISO_8859_1: |
| inputEncoding = "ISO-8859-1"; |
| break; |
| case ENCODING_UCS_2_12: |
| inputEncoding = "UTF-16BE"; |
| break; |
| case ENCODING_UCS_2_21: |
| inputEncoding = "UTF-16LE"; |
| break; |
| } |
| |
| // Read the version. |
| require("version"); |
| parseEq(); |
| checkLegalVersion(version = readLiteral(flags)); |
| if (!version.equals("1.0")) |
| { |
| if (version.equals("1.1")) |
| { |
| handler.warn("expected XML version 1.0, not: " + version); |
| xmlVersion = XML_11; |
| } |
| else |
| { |
| error("illegal XML version", version, "1.0 or 1.1"); |
| } |
| } |
| else |
| { |
| xmlVersion = XML_10; |
| } |
| // Try reading an encoding declaration. |
| boolean white = tryWhitespace(); |
| |
| if (tryRead("encoding")) |
| { |
| if (!white) |
| { |
| error("whitespace required before 'encoding='"); |
| } |
| parseEq(); |
| encodingName = readLiteral(flags); |
| if (!ignoreEncoding) |
| { |
| setupDecoding(encodingName); |
| } |
| } |
| |
| // Try reading a standalone declaration |
| if (encodingName != null) |
| { |
| white = tryWhitespace(); |
| } |
| if (tryRead("standalone")) |
| { |
| if (!white) |
| { |
| error("whitespace required before 'standalone='"); |
| } |
| parseEq(); |
| standalone = readLiteral(flags); |
| if ("yes".equals(standalone)) |
| { |
| docIsStandalone = true; |
| } |
| else if (!"no".equals(standalone)) |
| { |
| error("standalone flag must be 'yes' or 'no'"); |
| } |
| } |
| |
| skipWhitespace(); |
| require("?>"); |
| |
| if (inputEncoding == null) |
| { |
| inputEncoding = encodingName; |
| } |
| handler.xmlDecl(version, encodingName, docIsStandalone, |
| inputEncoding); |
| |
| return encodingName; |
| } |
| |
| /** |
| * Parse a text declaration. |
| * <pre> |
| * [79] TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>' |
| * [80] EncodingDecl ::= S 'encoding' Eq |
| * ( '"' EncName '"' | "'" EncName "'" ) |
| * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')* |
| * </pre> |
| * <p> (The <code><?xml</code>' and whitespace have already been read.) |
| * @return the encoding in the declaration, uppercased; or null |
| * @see #parseXMLDecl |
| * @see #setupDecoding |
| */ |
| private String parseTextDecl(boolean ignoreEncoding) |
| throws SAXException, IOException |
| { |
| String encodingName = null; |
| int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF; |
| |
| // Read an optional version. |
| if (tryRead ("version")) |
| { |
| String version; |
| parseEq(); |
| checkLegalVersion(version = readLiteral(flags)); |
| |
| if (version.equals("1.1")) |
| { |
| if (xmlVersion == XML_10) |
| { |
| error("external subset has later version number.", "1.0", |
| version); |
| } |
| handler.warn("expected XML version 1.0, not: " + version); |
| xmlVersion = XML_11; |
| } |
| else if (!version.equals("1.0")) |
| { |
| error("illegal XML version", version, "1.0 or 1.1"); |
| } |
| requireWhitespace(); |
| } |
| |
| // Read the encoding. |
| require("encoding"); |
| parseEq(); |
| encodingName = readLiteral(flags); |
| if (!ignoreEncoding) |
| { |
| setupDecoding(encodingName); |
| } |
| skipWhitespace(); |
| require("?>"); |
| |
| return encodingName; |
| } |
| |
| /** |
| * Sets up internal state so that we can decode an entity using the |
| * specified encoding. This is used when we start to read an entity |
| * and we have been given knowledge of its encoding before we start to |
| * read any data (e.g. from a SAX input source or from a MIME type). |
| * |
| * <p> It is also used after autodetection, at which point only very |
| * limited adjustments to the encoding may be used (switching between |
| * related builtin decoders). |
| * |
| * @param encodingName The name of the encoding specified by the user. |
| * @exception IOException if the encoding isn't supported either |
| * internally to this parser, or by the hosting JVM. |
| * @see #parseXMLDecl |
| * @see #parseTextDecl |
| */ |
| private void setupDecoding(String encodingName) |
| throws SAXException, IOException |
| { |
| encodingName = encodingName.toUpperCase(); |
| |
| // ENCODING_EXTERNAL indicates an encoding that wasn't |
| // autodetected ... we can use builtin decoders, or |
| // ones from the JVM (InputStreamReader). |
| |
| // Otherwise we can only tweak what was autodetected, and |
| // only for single byte (ASCII derived) builtin encodings. |
| |
| // ASCII-derived encodings |
| if (encoding == ENCODING_UTF_8 || encoding == ENCODING_EXTERNAL) |
| { |
| if (encodingName.equals("ISO-8859-1") |
| || encodingName.equals("8859_1") |
| || encodingName.equals("ISO8859_1")) |
| { |
| encoding = ENCODING_ISO_8859_1; |
| return; |
| } |
| else if (encodingName.equals("US-ASCII") |
| || encodingName.equals("ASCII")) |
| { |
| encoding = ENCODING_ASCII; |
| return; |
| } |
| else if (encodingName.equals("UTF-8") |
| || encodingName.equals("UTF8")) |
| { |
| encoding = ENCODING_UTF_8; |
| return; |
| } |
| else if (encoding != ENCODING_EXTERNAL) |
| { |
| // used to start with a new reader ... |
| throw new UnsupportedEncodingException(encodingName); |
| } |
| // else fallthrough ... |
| // it's ASCII-ish and something other than a builtin |
| } |
| |
| // Unicode and such |
| if (encoding == ENCODING_UCS_2_12 || encoding == ENCODING_UCS_2_21) |
| { |
| if (!(encodingName.equals("ISO-10646-UCS-2") |
| || encodingName.equals("UTF-16") |
| || encodingName.equals("UTF-16BE") |
| || encodingName.equals("UTF-16LE"))) |
| { |
| error("unsupported Unicode encoding", encodingName, "UTF-16"); |
| } |
| return; |
| } |
| |
| // four byte encodings |
| if (encoding == ENCODING_UCS_4_1234 |
| || encoding == ENCODING_UCS_4_4321 |
| || encoding == ENCODING_UCS_4_2143 |
| || encoding == ENCODING_UCS_4_3412) |
| { |
| // Strictly: "UCS-4" == "UTF-32BE"; also, "UTF-32LE" exists |
| if (!encodingName.equals("ISO-10646-UCS-4")) |
| { |
| error("unsupported 32-bit encoding", encodingName, |
| "ISO-10646-UCS-4"); |
| } |
| return; |
| } |
| |
| // assert encoding == ENCODING_EXTERNAL |
| // if (encoding != ENCODING_EXTERNAL) |
| // throw new RuntimeException ("encoding = " + encoding); |
| |
| if (encodingName.equals("UTF-16BE")) |
| { |
| encoding = ENCODING_UCS_2_12; |
| return; |
| } |
| if (encodingName.equals("UTF-16LE")) |
| { |
| encoding = ENCODING_UCS_2_21; |
| return; |
| } |
| |
| // We couldn't use the builtin decoders at all. But we can try to |
| // create a reader, since we haven't messed up buffering. Tweak |
| // the encoding name if necessary. |
| |
| if (encodingName.equals("UTF-16") |
| || encodingName.equals("ISO-10646-UCS-2")) |
| { |
| encodingName = "Unicode"; |
| } |
| // Ignoring all the EBCDIC aliases here |
| |
| reader = new InputStreamReader(is, encodingName); |
| sourceType = INPUT_READER; |
| } |
| |
| /** |
| * Parse miscellaneous markup outside the document element and DOCTYPE |
| * declaration. |
| * <pre> |
| * [27] Misc ::= Comment | PI | S |
| * </pre> |
| */ |
| private void parseMisc() |
| throws Exception |
| { |
| while (true) |
| { |
| skipWhitespace(); |
| if (tryRead(startDelimPI)) |
| { |
| parsePI(); |
| } |
| else if (tryRead(startDelimComment)) |
| { |
| parseComment(); |
| } |
| else |
| { |
| return; |
| } |
| } |
| } |
| |
| /** |
| * Parse a document type declaration. |
| * <pre> |
| * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? |
| * ('[' (markupdecl | PEReference | S)* ']' S?)? '>' |
| * </pre> |
| * <p> (The <code><!DOCTYPE</code> has already been read.) |
| */ |
| private void parseDoctypedecl() |
| throws Exception |
| { |
| String rootName; |
| ExternalIdentifiers ids; |
| |
| // Read the document type name. |
| requireWhitespace(); |
| rootName = readNmtoken(true); |
| |
| // Read the External subset's IDs |
| skipWhitespace(); |
| ids = readExternalIds(false, true); |
| |
| // report (a) declaration of name, (b) lexical info (ids) |
| handler.doctypeDecl(rootName, ids.publicId, ids.systemId); |
| |
| // Internal subset is parsed first, if present |
| skipWhitespace(); |
| if (tryRead('[')) |
| { |
| |
| // loop until the subset ends |
| while (true) |
| { |
| doReport = expandPE = true; |
| skipWhitespace(); |
| doReport = expandPE = false; |
| if (tryRead(']')) |
| { |
| break; // end of subset |
| } |
| else |
| { |
| // WFC, PEs in internal subset (only between decls) |
| peIsError = expandPE = true; |
| parseMarkupdecl(); |
| peIsError = expandPE = false; |
| } |
| } |
| } |
| skipWhitespace(); |
| require('>'); |
| |
| // Read the external subset, if any |
| InputSource subset; |
| |
| if (ids.systemId == null) |
| { |
| subset = handler.getExternalSubset(rootName, |
| handler.getSystemId()); |
| } |
| else |
| { |
| subset = null; |
| } |
| if (ids.systemId != null || subset != null) |
| { |
| pushString(null, ">"); |
| |
| // NOTE: [dtd] is so we say what SAX2 expects, |
| // though it's misleading (subset, not entire dtd) |
| if (ids.systemId != null) |
| { |
| pushURL(true, "[dtd]", ids, null, null, null, true); |
| } |
| else |
| { |
| handler.warn("modifying document by adding external subset"); |
| pushURL(true, "[dtd]", |
| new ExternalIdentifiers(subset.getPublicId(), |
| subset.getSystemId(), |
| null), |
| subset.getCharacterStream(), |
| subset.getByteStream(), |
| subset.getEncoding(), |
| false); |
| } |
| |
| // Loop until we end up back at '>' |
| while (true) |
| { |
| doReport = expandPE = true; |
| skipWhitespace(); |
| doReport = expandPE = false; |
| if (tryRead('>')) |
| { |
| break; |
| } |
| else |
| { |
| expandPE = true; |
| parseMarkupdecl(); |
| expandPE = false; |
| } |
| } |
| |
| // the ">" string isn't popped yet |
| if (inputStack.size() != 1) |
| { |
| error("external subset has unmatched '>'"); |
| } |
| } |
| |
| // done dtd |
| handler.endDoctype(); |
| expandPE = false; |
| doReport = true; |
| } |
| |
| /** |
| * Parse a markup declaration in the internal or external DTD subset. |
| * <pre> |
| * [29] markupdecl ::= elementdecl | Attlistdecl | EntityDecl |
| * | NotationDecl | PI | Comment |
| * [30] extSubsetDecl ::= (markupdecl | conditionalSect |
| * | PEReference | S) * |
| * </pre> |
| * <p> Reading toplevel PE references is handled as a lexical issue |
| * by the caller, as is whitespace. |
| */ |
| private void parseMarkupdecl() |
| throws Exception |
| { |
| char[] saved = null; |
| boolean savedPE = expandPE; |
| |
| // prevent "<%foo;" and ensures saved entity is right |
| require('<'); |
| unread('<'); |
| expandPE = false; |
| |
| if (tryRead("<!ELEMENT")) |
| { |
| saved = readBuffer; |
| expandPE = savedPE; |
| parseElementDecl(); |
| } |
| else if (tryRead("<!ATTLIST")) |
| { |
| saved = readBuffer; |
| expandPE = savedPE; |
| parseAttlistDecl(); |
| } |
| else if (tryRead("<!ENTITY")) |
| { |
| saved = readBuffer; |
| expandPE = savedPE; |
| parseEntityDecl(); |
| } |
| else if (tryRead("<!NOTATION")) |
| { |
| saved = readBuffer; |
| expandPE = savedPE; |
| parseNotationDecl(); |
| } |
| else if (tryRead(startDelimPI)) |
| { |
| saved = readBuffer; |
| expandPE = savedPE; |
| parsePI(); |
| } |
| else if (tryRead(startDelimComment)) |
| { |
| saved = readBuffer; |
| expandPE = savedPE; |
| parseComment(); |
| } |
| else if (tryRead("<![")) |
| { |
| saved = readBuffer; |
| expandPE = savedPE; |
| if (inputStack.size() > 0) |
| { |
| parseConditionalSect(saved); |
| } |
| else |
| { |
| error("conditional sections illegal in internal subset"); |
| } |
| } |
| else |
| { |
| error("expected markup declaration"); |
| } |
| |
| // VC: Proper Decl/PE Nesting |
| if (readBuffer != saved) |
| { |
| handler.verror("Illegal Declaration/PE nesting"); |
| } |
| } |
| |
| /** |
| * Parse an element, with its tags. |
| * <pre> |
| * [39] element ::= EmptyElementTag | STag content ETag |
| * [40] STag ::= '<' Name (S Attribute)* S? '>' |
| * [44] EmptyElementTag ::= '<' Name (S Attribute)* S? '/>' |
| * </pre> |
| * <p> (The '<' has already been read.) |
| * <p>NOTE: this method actually chains onto parseContent (), if necessary, |
| * and parseContent () will take care of calling parseETag (). |
| */ |
| private void parseElement(boolean maybeGetSubset) |
| throws Exception |
| { |
| String gi; |
| char c; |
| int oldElementContent = currentElementContent; |
| String oldElement = currentElement; |
| ElementDecl element; |
| |
| // This is the (global) counter for the |
| // array of specified attributes. |
| tagAttributePos = 0; |
| |
| // Read the element type name. |
| gi = readNmtoken(true); |
| |
| // If we saw no DTD, and this is the document root element, |
| // let the application modify the input stream by providing one. |
| if (maybeGetSubset) |
| { |
| InputSource subset = handler.getExternalSubset(gi, |
| handler.getSystemId()); |
| if (subset != null) |
| { |
| String publicId = subset.getPublicId(); |
| String systemId = subset.getSystemId(); |
| |
| handler.warn("modifying document by adding DTD"); |
| handler.doctypeDecl(gi, publicId, systemId); |
| pushString(null, ">"); |
| |
| // NOTE: [dtd] is so we say what SAX2 expects, |
| // though it's misleading (subset, not entire dtd) |
| pushURL(true, "[dtd]", |
| new ExternalIdentifiers(publicId, systemId, null), |
| subset.getCharacterStream(), |
| subset.getByteStream(), |
| subset.getEncoding(), |
| false); |
| |
| // Loop until we end up back at '>' |
| while (true) |
| { |
| doReport = expandPE = true; |
| skipWhitespace(); |
| doReport = expandPE = false; |
| if (tryRead('>')) |
| { |
| break; |
| } |
| else |
| { |
| expandPE = true; |
| parseMarkupdecl(); |
| expandPE = false; |
| } |
| } |
| |
| // the ">" string isn't popped yet |
| if (inputStack.size() != 1) |
| { |
| error("external subset has unmatched '>'"); |
| } |
| |
| handler.endDoctype(); |
| } |
| } |
| |
| // Determine the current content type. |
| currentElement = gi; |
| element = (ElementDecl) elementInfo.get(gi); |
| currentElementContent = getContentType(element, CONTENT_ANY); |
| |
| // Read the attributes, if any. |
| // After this loop, "c" is the closing delimiter. |
| boolean white = tryWhitespace(); |
| c = readCh(); |
| while (c != '/' && c != '>') |
| { |
| unread(c); |
| if (!white) |
| { |
| error("need whitespace between attributes"); |
| } |
| parseAttribute(gi); |
| white = tryWhitespace(); |
| c = readCh(); |
| } |
| |
| // Supply any defaulted attributes. |
| Iterator atts = declaredAttributes(element); |
| if (atts != null) |
| { |
| String aname; |
| loop: |
| while (atts.hasNext()) |
| { |
| aname = (String) atts.next(); |
| // See if it was specified. |
| for (int i = 0; i < tagAttributePos; i++) |
| { |
| if (tagAttributes[i] == aname) |
| { |
| continue loop; |
| } |
| } |
| // ... or has a default |
| String value = getAttributeDefaultValue(gi, aname); |
| |
| if (value == null) |
| { |
| continue; |
| } |
| handler.attribute(aname, value, false); |
| } |
| } |
| |
| // Figure out if this is a start tag |
| // or an empty element, and dispatch an |
| // event accordingly. |
| switch (c) |
| { |
| case '>': |
| handler.startElement(gi); |
| parseContent(); |
| break; |
| case '/': |
| require('>'); |
| handler.startElement(gi); |
| handler.endElement(gi); |
| break; |
| } |
| |
| // Restore the previous state. |
| currentElement = oldElement; |
| currentElementContent = oldElementContent; |
| } |
| |
| /** |
| * Parse an attribute assignment. |
| * <pre> |
| * [41] Attribute ::= Name Eq AttValue |
| * </pre> |
| * @param name The name of the attribute's element. |
| * @see SAXDriver#attribute |
| */ |
| private void parseAttribute(String name) |
| throws Exception |
| { |
| String aname; |
| String type; |
| String value; |
| int flags = LIT_ATTRIBUTE | LIT_ENTITY_REF; |
| |
| // Read the attribute name. |
| aname = readNmtoken(true); |
| type = getAttributeType(name, aname); |
| |
| // Parse '=' |
| parseEq(); |
| |
| // Read the value, normalizing whitespace |
| // unless it is CDATA. |
| if (handler.stringInterning) |
| { |
| if (type == "CDATA" || type == null) |
| { |
| value = readLiteral(flags); |
| } |
| else |
| { |
| value = readLiteral(flags | LIT_NORMALIZE); |
| } |
| } |
| else |
| { |
| if (type.equals("CDATA") || type == null) |
| { |
| value = readLiteral(flags); |
| } |
| else |
| { |
| value = readLiteral(flags | LIT_NORMALIZE); |
| } |
| } |
| |
| // WFC: no duplicate attributes |
| for (int i = 0; i < tagAttributePos; i++) |
| { |
| if (aname.equals(tagAttributes [i])) |
| { |
| error("duplicate attribute", aname, null); |
| } |
| } |
| |
| // Inform the handler about the |
| // attribute. |
| handler.attribute(aname, value, true); |
| dataBufferPos = 0; |
| |
| // Note that the attribute has been |
| // specified. |
| if (tagAttributePos == tagAttributes.length) |
| { |
| String newAttrib[] = new String[tagAttributes.length * 2]; |
| System.arraycopy(tagAttributes, 0, newAttrib, 0, tagAttributePos); |
| tagAttributes = newAttrib; |
| } |
| tagAttributes[tagAttributePos++] = aname; |
| } |
| |
| /** |
| * Parse an equals sign surrounded by optional whitespace. |
| * <pre> |
| * [25] Eq ::= S? '=' S? |
| * </pre> |
| */ |
| private void parseEq() |
| throws SAXException, IOException |
| { |
| skipWhitespace(); |
| require('='); |
| skipWhitespace(); |
| } |
| |
| /** |
| * Parse an end tag. |
| * <pre> |
| * [42] ETag ::= '</' Name S? '>' |
| * </pre> |
| * <p>NOTE: parseContent () chains to here, we already read the |
| * "</". |
| */ |
| private void parseETag() |
| throws Exception |
| { |
| require(currentElement); |
| skipWhitespace(); |
| require('>'); |
| handler.endElement(currentElement); |
| // not re-reporting any SAXException re bogus end tags, |
| // even though that diagnostic might be clearer ... |
| } |
| |
| /** |
| * Parse the content of an element. |
| * <pre> |
| * [43] content ::= (element | CharData | Reference |
| * | CDSect | PI | Comment)* |
| * [67] Reference ::= EntityRef | CharRef |
| * </pre> |
| * <p> NOTE: consumes ETtag. |
| */ |
| private void parseContent() |
| throws Exception |
| { |
| char c; |
| |
| while (true) |
| { |
| // consume characters (or ignorable whitspace) until delimiter |
| parseCharData(); |
| |
| // Handle delimiters |
| c = readCh(); |
| switch (c) |
| { |
| case '&': // Found "&" |
| c = readCh(); |
| if (c == '#') |
| { |
| parseCharRef(); |
| } |
| else |
| { |
| unread(c); |
| parseEntityRef(true); |
| } |
| isDirtyCurrentElement = true; |
| break; |
| |
| case '<': // Found "<" |
| dataBufferFlush(); |
| c = readCh(); |
| switch (c) |
| { |
| case '!': // Found "<!" |
| c = readCh(); |
| switch (c) |
| { |
| case '-': // Found "<!-" |
| require('-'); |
| isDirtyCurrentElement = false; |
| parseComment(); |
| break; |
| case '[': // Found "<![" |
| isDirtyCurrentElement = false; |
| require("CDATA["); |
| handler.startCDATA(); |
| inCDATA = true; |
| parseCDSect(); |
| inCDATA = false; |
| handler.endCDATA(); |
| break; |
| default: |
| error("expected comment or CDATA section", c, null); |
| break; |
| } |
| break; |
| |
| case '?': // Found "<?" |
| isDirtyCurrentElement = false; |
| parsePI(); |
| break; |
| |
| case '/': // Found "</" |
| isDirtyCurrentElement = false; |
| parseETag(); |
| return; |
| |
| default: // Found "<" followed by something else |
| isDirtyCurrentElement = false; |
| unread(c); |
| parseElement(false); |
| break; |
| } |
| } |
| } |
| } |
| |
| /** |
| * Parse an element type declaration. |
| * <pre> |
| * [45] elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>' |
| * </pre> |
| * <p> NOTE: the '<!ELEMENT' has already been read. |
| */ |
| private void parseElementDecl() |
| throws Exception |
| { |
| String name; |
| |
| requireWhitespace(); |
| // Read the element type name. |
| name = readNmtoken(true); |
| |
| requireWhitespace(); |
| // Read the content model. |
| parseContentspec(name); |
| |
| skipWhitespace(); |
| require('>'); |
| } |
| |
| /** |
| * Content specification. |
| * <pre> |
| * [46] contentspec ::= 'EMPTY' | 'ANY' | Mixed | elements |
| * </pre> |
| */ |
| private void parseContentspec(String name) |
| throws Exception |
| { |
| // FIXME: move elementDecl() into setElement(), pass EMTPY/ANY ... |
| if (tryRead("EMPTY")) |
| { |
| setElement(name, CONTENT_EMPTY, null, null); |
| if (!skippedPE) |
| { |
| handler.getDeclHandler().elementDecl(name, "EMPTY"); |
| } |
| return; |
| } |
| else if (tryRead("ANY")) |
| { |
| setElement(name, CONTENT_ANY, null, null); |
| if (!skippedPE) |
| { |
| handler.getDeclHandler().elementDecl(name, "ANY"); |
| } |
| return; |
| } |
| else |
| { |
| String model; |
| char[] saved; |
| |
| require('('); |
| saved = readBuffer; |
| dataBufferAppend('('); |
| skipWhitespace(); |
| if (tryRead("#PCDATA")) |
| { |
| dataBufferAppend("#PCDATA"); |
| parseMixed(saved); |
| model = dataBufferToString(); |
| setElement(name, CONTENT_MIXED, model, null); |
| } |
| else |
| { |
| parseElements(saved); |
| model = dataBufferToString(); |
| setElement(name, CONTENT_ELEMENTS, model, null); |
| } |
| if (!skippedPE) |
| { |
| handler.getDeclHandler().elementDecl(name, model); |
| } |
| } |
| } |
| |
| /** |
| * Parse an element-content model. |
| * <pre> |
| * [47] elements ::= (choice | seq) ('?' | '*' | '+')? |
| * [49] choice ::= '(' S? cp (S? '|' S? cp)+ S? ')' |
| * [50] seq ::= '(' S? cp (S? ',' S? cp)* S? ')' |
| * </pre> |
| * |
| * <p> NOTE: the opening '(' and S have already been read. |
| * |
| * @param saved Buffer for entity that should have the terminal ')' |
| */ |
| private void parseElements(char[] saved) |
| throws Exception |
| { |
| char c; |
| char sep; |
| |
| // Parse the first content particle |
| skipWhitespace(); |
| parseCp(); |
| |
| // Check for end or for a separator. |
| skipWhitespace(); |
| c = readCh(); |
| switch (c) |
| { |
| case ')': |
| // VC: Proper Group/PE Nesting |
| if (readBuffer != saved) |
| { |
| handler.verror("Illegal Group/PE nesting"); |
| } |
| |
| dataBufferAppend(')'); |
| c = readCh(); |
| switch (c) |
| { |
| case '*': |
| case '+': |
| case '?': |
| dataBufferAppend(c); |
| break; |
| default: |
| unread(c); |
| } |
| return; |
| case ',': // Register the separator. |
| case '|': |
| sep = c; |
| dataBufferAppend(c); |
| break; |
| default: |
| error("bad separator in content model", c, null); |
| return; |
| } |
| |
| // Parse the rest of the content model. |
| while (true) |
| { |
| skipWhitespace(); |
| parseCp(); |
| skipWhitespace(); |
| c = readCh(); |
| if (c == ')') |
| { |
| // VC: Proper Group/PE Nesting |
| if (readBuffer != saved) |
| { |
| handler.verror("Illegal Group/PE nesting"); |
| } |
| |
| dataBufferAppend(')'); |
| break; |
| } |
| else if (c != sep) |
| { |
| error("bad separator in content model", c, null); |
| return; |
| } |
| else |
| { |
| dataBufferAppend(c); |
| } |
| } |
| |
| // Check for the occurrence indicator. |
| c = readCh(); |
| switch (c) |
| { |
| case '?': |
| case '*': |
| case '+': |
| dataBufferAppend(c); |
| return; |
| default: |
| unread(c); |
| return; |
| } |
| } |
| |
| /** |
| * Parse a content particle. |
| * <pre> |
| * [48] cp ::= (Name | choice | seq) ('?' | '*' | '+')? |
| * </pre> |
| */ |
| private void parseCp() |
| throws Exception |
| { |
| if (tryRead('(')) |
| { |
| dataBufferAppend('('); |
| parseElements(readBuffer); |
| } |
| else |
| { |
| dataBufferAppend(readNmtoken(true)); |
| char c = readCh(); |
| switch (c) |
| { |
| case '?': |
| case '*': |
| case '+': |
| dataBufferAppend(c); |
| break; |
| default: |
| unread(c); |
| break; |
| } |
| } |
| } |
| |
| /** |
| * Parse mixed content. |
| * <pre> |
| * [51] Mixed ::= '(' S? ( '#PCDATA' (S? '|' S? Name)*) S? ')*' |
| * | '(' S? ('#PCDATA') S? ')' |
| * </pre> |
| * |
| * @param saved Buffer for entity that should have the terminal ')' |
| */ |
| private void parseMixed(char[] saved) |
| throws Exception |
| { |
| // Check for PCDATA alone. |
| skipWhitespace(); |
| if (tryRead(')')) |
| { |
| // VC: Proper Group/PE Nesting |
| if (readBuffer != saved) |
| { |
| handler.verror("Illegal Group/PE nesting"); |
| } |
| |
| dataBufferAppend(")*"); |
| tryRead('*'); |
| return; |
| } |
| |
| // Parse mixed content. |
| skipWhitespace(); |
| while (!tryRead(")")) |
| { |
| require('|'); |
| dataBufferAppend('|'); |
| skipWhitespace(); |
| dataBufferAppend(readNmtoken(true)); |
| skipWhitespace(); |
| } |
| |
| // VC: Proper Group/PE Nesting |
| if (readBuffer != saved) |
| { |
| handler.verror("Illegal Group/PE nesting"); |
| } |
| |
| require('*'); |
| dataBufferAppend(")*"); |
| } |
| |
| /** |
| * Parse an attribute list declaration. |
| * <pre> |
| * [52] AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>' |
| * </pre> |
| * <p>NOTE: the '<!ATTLIST' has already been read. |
| */ |
| private void parseAttlistDecl() |
| throws Exception |
| { |
| String elementName; |
| |
| requireWhitespace(); |
| elementName = readNmtoken(true); |
| boolean white = tryWhitespace(); |
| while (!tryRead('>')) |
| { |
| if (!white) |
| { |
| error("whitespace required before attribute definition"); |
| } |
| parseAttDef(elementName); |
| white = tryWhitespace(); |
| } |
| } |
| |
| /** |
| * Parse a single attribute definition. |
| * <pre> |
| * [53] AttDef ::= S Name S AttType S DefaultDecl |
| * </pre> |
| */ |
| private void parseAttDef(String elementName) |
| throws Exception |
| { |
| String name; |
| String type; |
| String enumer = null; |
| |
| // Read the attribute name. |
| name = readNmtoken(true); |
| |
| // Read the attribute type. |
| requireWhitespace(); |
| type = readAttType(); |
| |
| // Get the string of enumerated values if necessary. |
| if (handler.stringInterning) |
| { |
| if ("ENUMERATION" == type || "NOTATION" == type) |
| { |
| enumer = dataBufferToString(); |
| } |
| } |
| else |
| { |
| if ("ENUMERATION".equals(type) || "NOTATION".equals(type)) |
| { |
| enumer = dataBufferToString(); |
| } |
| } |
| |
| // Read the default value. |
| requireWhitespace(); |
| parseDefault(elementName, name, type, enumer); |
| } |
| |
| /** |
| * Parse the attribute type. |
| * <pre> |
| * [54] AttType ::= StringType | TokenizedType | EnumeratedType |
| * [55] StringType ::= 'CDATA' |
| * [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY' |
| * | 'ENTITIES' | 'NMTOKEN' | 'NMTOKENS' |
| * [57] EnumeratedType ::= NotationType | Enumeration |
| * </pre> |
| */ |
| private String readAttType() |
| throws Exception |
| { |
| if (tryRead('(')) |
| { |
| parseEnumeration(false); |
| return "ENUMERATION"; |
| } |
| else |
| { |
| String typeString = readNmtoken(true); |
| if (handler.stringInterning) |
| { |
| if ("NOTATION" == typeString) |
| { |
| parseNotationType(); |
| return typeString; |
| } |
| else if ("CDATA" == typeString |
| || "ID" == typeString |
| || "IDREF" == typeString |
| || "IDREFS" == typeString |
| || "ENTITY" == typeString |
| || "ENTITIES" == typeString |
| || "NMTOKEN" == typeString |
| || "NMTOKENS" == typeString) |
| { |
| return typeString; |
| } |
| } |
| else |
| { |
| if ("NOTATION".equals(typeString)) |
| { |
| parseNotationType(); |
| return typeString; |
| } |
| else if ("CDATA".equals(typeString) |
| || "ID".equals(typeString) |
| || "IDREF".equals(typeString) |
| || "IDREFS".equals(typeString) |
| || "ENTITY".equals(typeString) |
| || "ENTITIES".equals(typeString) |
| || "NMTOKEN".equals(typeString) |
| || "NMTOKENS".equals(typeString)) |
| { |
| return typeString; |
| } |
| } |
| error("illegal attribute type", typeString, null); |
| return null; |
| } |
| } |
| |
| /** |
| * Parse an enumeration. |
| * <pre> |
| * [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')' |
| * </pre> |
| * <p>NOTE: the '(' has already been read. |
| */ |
| private void parseEnumeration(boolean isNames) |
| throws Exception |
| { |
| dataBufferAppend('('); |
| |
| // Read the first token. |
| skipWhitespace(); |
| dataBufferAppend(readNmtoken(isNames)); |
| // Read the remaining tokens. |
| skipWhitespace(); |
| while (!tryRead(')')) |
| { |
| require('|'); |
| dataBufferAppend('|'); |
| skipWhitespace(); |
| dataBufferAppend(readNmtoken (isNames)); |
| skipWhitespace(); |
| } |
| dataBufferAppend(')'); |
| } |
| |
| /** |
| * Parse a notation type for an attribute. |
| * <pre> |
| * [58] NotationType ::= 'NOTATION' S '(' S? NameNtoks |
| * (S? '|' S? name)* S? ')' |
| * </pre> |
| * <p>NOTE: the 'NOTATION' has already been read |
| */ |
| private void parseNotationType() |
| throws Exception |
| { |
| requireWhitespace(); |
| require('('); |
| |
| parseEnumeration(true); |
| } |
| |
| /** |
| * Parse the default value for an attribute. |
| * <pre> |
| * [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED' |
| * | (('#FIXED' S)? AttValue) |
| * </pre> |
| */ |
| private void parseDefault(String elementName, String name, |
| String type, String enumer) |
| throws Exception |
| { |
| int valueType = ATTRIBUTE_DEFAULT_SPECIFIED; |
| String value = null; |
| int flags = LIT_ATTRIBUTE; |
| boolean saved = expandPE; |
| String defaultType = null; |
| |
| // LIT_ATTRIBUTE forces '<' checks now (ASAP) and turns whitespace |
| // chars to spaces (doesn't matter when that's done if it doesn't |
| // interfere with char refs expanding to whitespace). |
| |
| if (!skippedPE) |
| { |
| flags |= LIT_ENTITY_REF; |
| if (handler.stringInterning) |
| { |
| if ("CDATA" != type) |
| { |
| flags |= LIT_NORMALIZE; |
| } |
| } |
| else |
| { |
| if (!"CDATA".equals(type)) |
| { |
| flags |= LIT_NORMALIZE; |
| } |
| } |
| } |
| |
| expandPE = false; |
| if (tryRead('#')) |
| { |
| if (tryRead("FIXED")) |
| { |
| defaultType = "#FIXED"; |
| valueType = ATTRIBUTE_DEFAULT_FIXED; |
| requireWhitespace(); |
| value = readLiteral(flags); |
| } |
| else if (tryRead("REQUIRED")) |
| { |
| defaultType = "#REQUIRED"; |
| valueType = ATTRIBUTE_DEFAULT_REQUIRED; |
| } |
| else if (tryRead("IMPLIED")) |
| { |
| defaultType = "#IMPLIED"; |
| valueType = ATTRIBUTE_DEFAULT_IMPLIED; |
| } |
| else |
| { |
| error("illegal keyword for attribute default value"); |
| } |
| } |
| else |
| { |
| value = readLiteral(flags); |
| } |
| expandPE = saved; |
| setAttribute(elementName, name, type, enumer, value, valueType); |
| if (handler.stringInterning) |
| { |
| if ("ENUMERATION" == type) |
| { |
| type = enumer; |
| } |
| else if ("NOTATION" == type) |
| { |
| type = "NOTATION " + enumer; |
| } |
| } |
| else |
| { |
| if ("ENUMERATION".equals(type)) |
| { |
| type = enumer; |
| } |
| else if ("NOTATION".equals(type)) |
| { |
| type = "NOTATION " + enumer; |
| } |
| } |
| if (!skippedPE) |
| { |
| handler.getDeclHandler().attributeDecl(elementName, name, type, |
| defaultType, value); |
| } |
| } |
| |
| /** |
| * Parse a conditional section. |
| * <pre> |
| * [61] conditionalSect ::= includeSect || ignoreSect |
| * [62] includeSect ::= '<![' S? 'INCLUDE' S? '[' |
| * extSubsetDecl ']]>' |
| * [63] ignoreSect ::= '<![' S? 'IGNORE' S? '[' |
| * ignoreSectContents* ']]>' |
| * [64] ignoreSectContents ::= Ignore |
| * ('<![' ignoreSectContents* ']]>' Ignore )* |
| * [65] Ignore ::= Char* - (Char* ( '<![' | ']]>') Char* ) |
| * </pre> |
| * <p> NOTE: the '>![' has already been read. |
| */ |
| private void parseConditionalSect(char[] saved) |
| throws Exception |
| { |
| skipWhitespace(); |
| if (tryRead("INCLUDE")) |
| { |
| skipWhitespace(); |
| require('['); |
| // VC: Proper Conditional Section/PE Nesting |
| if (readBuffer != saved) |
| { |
| handler.verror("Illegal Conditional Section/PE nesting"); |
| } |
| skipWhitespace(); |
| while (!tryRead("]]>")) |
| { |
| parseMarkupdecl(); |
| skipWhitespace(); |
| } |
| } |
| else if (tryRead("IGNORE")) |
| { |
| skipWhitespace(); |
| require('['); |
| // VC: Proper Conditional Section/PE Nesting |
| if (readBuffer != saved) |
| { |
| handler.verror("Illegal Conditional Section/PE nesting"); |
| } |
| int nesting = 1; |
| char c; |
| expandPE = false; |
| for (int nest = 1; nest > 0; ) |
| { |
| c = readCh(); |
| switch (c) |
| { |
| case '<': |
| if (tryRead("![")) |
| { |
| nest++; |
| } |
| case ']': |
| if (tryRead("]>")) |
| { |
| nest--; |
| } |
| } |
| } |
| expandPE = true; |
| } |
| else |
| { |
| error("conditional section must begin with INCLUDE or IGNORE"); |
| } |
| } |
| |
| private void parseCharRef() |
| throws SAXException, IOException |
| { |
| parseCharRef(true /* do flushDataBuffer by default */); |
| } |
| |
| /** |
| * Try to read a character reference without consuming data from buffer. |
| * <pre> |
| * [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';' |
| * </pre> |
| * <p>NOTE: the '&#' has already been read. |
| */ |
| private void tryReadCharRef() |
| throws SAXException, IOException |
| { |
| int value = 0; |
| char c; |
| |
| if (tryRead('x')) |
| { |
| loop1: |
| while (true) |
| { |
| c = readCh(); |
| if (c == ';') |
| { |
| break loop1; |
| } |
| else |
| { |
| int n = Character.digit(c, 16); |
| if (n == -1) |
| { |
| error("illegal character in character reference", c, null); |
| break loop1; |
| } |
| value *= 16; |
| value += n; |
| } |
| } |
| } |
| else |
| { |
| loop2: |
| while (true) |
| { |
| c = readCh(); |
| if (c == ';') |
| { |
| break loop2; |
| } |
| else |
| { |
| int n = Character.digit(c, 10); |
| if (n == -1) |
| { |
| error("illegal character in character reference", c, null); |
| break loop2; |
| } |
| value *= 10; |
| value += n; |
| } |
| } |
| } |
| |
| // check for character refs being legal XML |
| if ((value < 0x0020 |
| && ! (value == '\n' || value == '\t' || value == '\r')) |
| || (value >= 0xD800 && value <= 0xDFFF) |
| || value == 0xFFFE || value == 0xFFFF |
| || value > 0x0010ffff) |
| { |
| error("illegal XML character reference U+" |
| + Integer.toHexString(value)); |
| } |
| |
| // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz |
| // (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz: |
| if (value > 0x0010ffff) |
| { |
| // too big for surrogate |
| error("character reference " + value + " is too large for UTF-16", |
| new Integer(value).toString(), null); |
| } |
| |
| } |
| |
| /** |
| * Read and interpret a character reference. |
| * <pre> |
| * [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';' |
| * </pre> |
| * <p>NOTE: the '&#' has already been read. |
| */ |
| private void parseCharRef(boolean doFlush) |
| throws SAXException, IOException |
| { |
| int value = 0; |
| char c; |
| |
| if (tryRead('x')) |
| { |
| loop1: |
| while (true) |
| { |
| c = readCh(); |
| if (c == ';') |
| { |
| break loop1; |
| } |
| else |
| { |
| int n = Character.digit(c, 16); |
| if (n == -1) |
| { |
| error("illegal character in character reference", c, null); |
| break loop1; |
| } |
| value *= 16; |
| value += n; |
| } |
| } |
| } |
| else |
| { |
| loop2: |
| while (true) |
| { |
| c = readCh(); |
| if (c == ';') |
| { |
| break loop2; |
| } |
| else |
| { |
| int n = Character.digit(c, 10); |
| if (n == -1) |
| { |
| error("illegal character in character reference", c, null); |
| break loop2; |
| } |
| value *= 10; |
| value += c - '0'; |
| } |
| } |
| } |
| |
| // check for character refs being legal XML |
| if ((value < 0x0020 |
| && ! (value == '\n' || value == '\t' || value == '\r')) |
| || (value >= 0xD800 && value <= 0xDFFF) |
| || value == 0xFFFE || value == 0xFFFF |
| || value > 0x0010ffff) |
| { |
| error("illegal XML character reference U+" |
| + Integer.toHexString(value)); |
| } |
| |
| // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz |
| // (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz: |
| if (value <= 0x0000ffff) |
| { |
| // no surrogates needed |
| dataBufferAppend((char) value); |
| } |
| else if (value <= 0x0010ffff) |
| { |
| value -= 0x10000; |
| // > 16 bits, surrogate needed |
| dataBufferAppend((char) (0xd800 | (value >> 10))); |
| dataBufferAppend((char) (0xdc00 | (value & 0x0003ff))); |
| } |
| else |
| { |
| // too big for surrogate |
| error("character reference " + value + " is too large for UTF-16", |
| new Integer(value).toString(), null); |
| } |
| if (doFlush) |
| { |
| dataBufferFlush(); |
| } |
| } |
| |
| /** |
| * Parse and expand an entity reference. |
| * <pre> |
| * [68] EntityRef ::= '&' Name ';' |
| * </pre> |
| * <p>NOTE: the '&' has already been read. |
| * @param externalAllowed External entities are allowed here. |
| */ |
| private void parseEntityRef(boolean externalAllowed) |
| throws SAXException, IOException |
| { |
| String name; |
| |
| name = readNmtoken(true); |
| require(';'); |
| switch (getEntityType(name)) |
| { |
| case ENTITY_UNDECLARED: |
| // NOTE: XML REC describes amazingly convoluted handling for |
| // this case. Nothing as meaningful as being a WFness error |
| // unless the processor might _legitimately_ not have seen a |
| // declaration ... which is what this implements. |
| String message; |
| |
| message = "reference to undeclared general entity " + name; |
| if (skippedPE && !docIsStandalone) |
| { |
| handler.verror(message); |
| // we don't know this entity, and it might be external... |
| if (externalAllowed) |
| { |
| handler.skippedEntity(name); |
| } |
| } |
| else |
| { |
| error(message); |
| } |
| break; |
| case ENTITY_INTERNAL: |
| pushString(name, getEntityValue(name)); |
| |
| //workaround for possible input pop before marking |
| //the buffer reading position |
| char t = readCh(); |
| unread(t); |
| int bufferPosMark = readBufferPos; |
| |
| int end = readBufferPos + getEntityValue(name).length(); |
| for (int k = readBufferPos; k < end; k++) |
| { |
| t = readCh(); |
| if (t == '&') |
| { |
| t = readCh(); |
| if (t == '#') |
| { |
| //try to match a character ref |
| tryReadCharRef(); |
| |
| //everything has been read |
| if (readBufferPos >= end) |
| { |
| break; |
| } |
| k = readBufferPos; |
| continue; |
| } |
| else if (Character.isLetter(t)) |
| { |
| //looks like an entity ref |
| unread(t); |
| readNmtoken(true); |
| require(';'); |
| |
| //everything has been read |
| if (readBufferPos >= end) |
| { |
| break; |
| } |
| k = readBufferPos; |
| continue; |
| } |
| error(" malformed entity reference"); |
| } |
| |
| } |
| readBufferPos = bufferPosMark; |
| break; |
| case ENTITY_TEXT: |
| if (externalAllowed) |
| { |
| pushURL(false, name, getEntityIds(name), |
| null, null, null, true); |
| } |
| else |
| { |
| error("reference to external entity in attribute value.", |
| name, null); |
| } |
| break; |
| case ENTITY_NDATA: |
| if (externalAllowed) |
| { |
| error("unparsed entity reference in content", name, null); |
| } |
| else |
| { |
| error("reference to external entity in attribute value.", |
| name, null); |
| } |
| break; |
| default: |
| throw new RuntimeException(); |
| } |
| } |
| |
| /** |
| * Parse and expand a parameter entity reference. |
| * <pre> |
| * [69] PEReference ::= '%' Name ';' |
| * </pre> |
| * <p>NOTE: the '%' has already been read. |
| */ |
| private void parsePEReference() |
| throws SAXException, IOException |
| { |
| String name; |
| |
| name = "%" + readNmtoken(true); |
| require(';'); |
| switch (getEntityType(name)) |
| { |
| case ENTITY_UNDECLARED: |
| // VC: Entity Declared |
| handler.verror("reference to undeclared parameter entity " + name); |
| |
| // we should disable handling of all subsequent declarations |
| // unless this is a standalone document (info discarded) |
| break; |
| case ENTITY_INTERNAL: |
| if (inLiteral) |
| { |
| pushString(name, getEntityValue(name)); |
| } |
| else |
| { |
| pushString(name, ' ' + getEntityValue(name) + ' '); |
| } |
| break; |
| case ENTITY_TEXT: |
| if (!inLiteral) |
| { |
| pushString(null, " "); |
| } |
| pushURL(true, name, getEntityIds(name), null, null, null, true); |
| if (!inLiteral) |
| { |
| pushString(null, " "); |
| } |
| break; |
| } |
| } |
| |
| /** |
| * Parse an entity declaration. |
| * <pre> |
| * [70] EntityDecl ::= GEDecl | PEDecl |
| * [71] GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>' |
| * [72] PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>' |
| * [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?) |
| * [74] PEDef ::= EntityValue | ExternalID |
| * [75] ExternalID ::= 'SYSTEM' S SystemLiteral |
| * | 'PUBLIC' S PubidLiteral S SystemLiteral |
| * [76] NDataDecl ::= S 'NDATA' S Name |
| * </pre> |
| * <p>NOTE: the '<!ENTITY' has already been read. |
| */ |
| private void parseEntityDecl() |
| throws Exception |
| { |
| boolean peFlag = false; |
| int flags = 0; |
| |
| // Check for a parameter entity. |
| expandPE = false; |
| requireWhitespace(); |
| if (tryRead('%')) |
| { |
| peFlag = true; |
| requireWhitespace(); |
| } |
| expandPE = true; |
| |
| // Read the entity name, and prepend |
| // '%' if necessary. |
| String name = readNmtoken(true); |
| //NE08 |
| if (name.indexOf(':') >= 0) |
| { |
| error("Illegal character(':') in entity name ", name, null); |
| } |
| if (peFlag) |
| { |
| name = "%" + name; |
| } |
| |
| // Read the entity value. |
| requireWhitespace(); |
| char c = readCh(); |
| unread (c); |
| if (c == '"' || c == '\'') |
| { |
| // Internal entity ... replacement text has expanded refs |
| // to characters and PEs, but not to general entities |
| String value = readLiteral(flags); |
| setInternalEntity(name, value); |
| } |
| else |
| { |
| // Read the external IDs |
| ExternalIdentifiers ids = readExternalIds(false, false); |
| |
| // Check for NDATA declaration. |
| boolean white = tryWhitespace(); |
| if (!peFlag && tryRead("NDATA")) |
| { |
| if (!white) |
| { |
| error("whitespace required before NDATA"); |
| } |
| requireWhitespace(); |
| String notationName = readNmtoken(true); |
| if (!skippedPE) |
| { |
| setExternalEntity(name, ENTITY_NDATA, ids, notationName); |
| handler.unparsedEntityDecl(name, ids.publicId, ids.systemId, |
| ids.baseUri, notationName); |
| } |
| } |
| else if (!skippedPE) |
| { |
| setExternalEntity(name, ENTITY_TEXT, ids, null); |
| handler.getDeclHandler() |
| .externalEntityDecl(name, ids.publicId, |
| handler.resolveURIs() |
| // FIXME: ASSUMES not skipped |
| // "false" forces error on bad URI |
| ? handler.absolutize(ids.baseUri, |
| ids.systemId, |
| false) |
| : ids.systemId); |
| } |
| } |
| |
| // Finish the declaration. |
| skipWhitespace(); |
| require('>'); |
| } |
| |
| /** |
| * Parse a notation declaration. |
| * <pre> |
| * [82] NotationDecl ::= '<!NOTATION' S Name S |
| * (ExternalID | PublicID) S? '>' |
| * [83] PublicID ::= 'PUBLIC' S PubidLiteral |
| * </pre> |
| * <P>NOTE: the '<!NOTATION' has already been read. |
| */ |
| private void parseNotationDecl() |
| throws Exception |
| { |
| String nname; |
| ExternalIdentifiers ids; |
| |
| requireWhitespace(); |
| nname = readNmtoken(true); |
| //NE08 |
| if (nname.indexOf(':') >= 0) |
| { |
| error("Illegal character(':') in notation name ", nname, null); |
| } |
| requireWhitespace(); |
| |
| // Read the external identifiers. |
| ids = readExternalIds(true, false); |
| |
| // Register the notation. |
| setNotation(nname, ids); |
| |
| skipWhitespace(); |
| require('>'); |
| } |
| |
| /** |
| * Parse character data. |
| * <pre> |
| * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) |
| * </pre> |
| */ |
| private void parseCharData() |
| throws Exception |
| { |
| char c; |
| int state = 0; |
| boolean pureWhite = false; |
| |
| // assert (dataBufferPos == 0); |
| |
| // are we expecting pure whitespace? it might be dirty... |
| if ((currentElementContent == CONTENT_ELEMENTS) && !isDirtyCurrentElement) |
| { |
| pureWhite = true; |
| } |
| |
| // always report right out of readBuffer |
| // to minimize (pointless) buffer copies |
| while (true) |
| { |
| int lineAugment = 0; |
| int columnAugment = 0; |
| int i; |
| |
| loop: |
| for (i = readBufferPos; i < readBufferLength; i++) |
| { |
| switch (c = readBuffer[i]) |
| { |
| case '\n': |
| lineAugment++; |
| columnAugment = 0; |
| // pureWhite unmodified |
| break; |
| case '\r': // should not happen!! |
| case '\t': |
| case ' ': |
| // pureWhite unmodified |
| columnAugment++; |
| break; |
| case '&': |
| case '<': |
| columnAugment++; |
| // pureWhite unmodified |
| // CLEAN end of text sequence |
| state = 1; |
| break loop; |
| case ']': |
| // that's not a whitespace char, and |
| // can not terminate pure whitespace either |
| pureWhite = false; |
| if ((i + 2) < readBufferLength) |
| { |
| if (readBuffer [i + 1] == ']' |
| && readBuffer [i + 2] == '>') |
| { |
| // ERROR end of text sequence |
| state = 2; |
| break loop; |
| } |
| } |
| else |
| { |
| // FIXME missing two end-of-buffer cases |
| } |
| columnAugment++; |
| break; |
| default: |
| if ((c < 0x0020 || c > 0xFFFD) |
| || ((c >= 0x007f) && (c <= 0x009f) && (c != 0x0085) |
| && xmlVersion == XML_11)) |
| { |
| error("illegal XML character U+" |
| + Integer.toHexString(c)); |
| } |
| // that's not a whitespace char |
| pureWhite = false; |
| columnAugment++; |
| } |
| } |
| |
| // report text thus far |
| if (lineAugment > 0) |
| { |
| line += lineAugment; |
| column = columnAugment; |
| } |
| else |
| { |
| column += columnAugment; |
| } |
| |
| // report characters/whitspace |
| int length = i - readBufferPos; |
| |
| if (length != 0) |
| { |
| if (pureWhite) |
| { |
| handler.ignorableWhitespace(readBuffer, |
| readBufferPos, length); |
| } |
| else |
| { |
| handler.charData(readBuffer, readBufferPos, length); |
| } |
| readBufferPos = i; |
| } |
| |
| if (state != 0) |
| { |
| break; |
| } |
| |
| // fill next buffer from this entity, or |
| // pop stack and continue with previous entity |
| unread(readCh()); |
| } |
| if (!pureWhite) |
| { |
| isDirtyCurrentElement = true; |
| } |
| // finish, maybe with error |
| if (state != 1) // finish, no error |
| { |
| error("character data may not contain ']]>'"); |
| } |
| } |
| |
| ////////////////////////////////////////////////////////////////////// |
| // High-level reading and scanning methods. |
| ////////////////////////////////////////////////////////////////////// |
| |
| /** |
| * Require whitespace characters. |
| */ |
| private void requireWhitespace() |
| throws SAXException, IOException |
| { |
| char c = readCh(); |
| if (isWhitespace(c)) |
| { |
| skipWhitespace(); |
| } |
| else |
| { |
| error("whitespace required", c, null); |
| } |
| } |
| |
| /** |
| * Skip whitespace characters. |
| * <pre> |
| * [3] S ::= (#x20 | #x9 | #xd | #xa)+ |
| * </pre> |
| */ |
| private void skipWhitespace() |
| throws SAXException, IOException |
| { |
| // Start with a little cheat. Most of |
| // the time, the white space will fall |
| // within the current read buffer; if |
| // not, then fall through. |
| if (USE_CHEATS) |
| { |
| int lineAugment = 0; |
| int columnAugment = 0; |
| |
| loop: |
| for (int i = readBufferPos; i < readBufferLength; i++) |
| { |
| switch (readBuffer[i]) |
| { |
| case ' ': |
| case '\t': |
| case '\r': |
| columnAugment++; |
| break; |
| case '\n': |
| lineAugment++; |
| columnAugment = 0; |
| break; |
| case '%': |
| if (expandPE) |
| { |
| break loop; |
| } |
| // else fall through... |
| default: |
| readBufferPos = i; |
| if (lineAugment > 0) |
| { |
| line += lineAugment; |
| column = columnAugment; |
| } |
| else |
| { |
| column += columnAugment; |
| } |
| return; |
| } |
| } |
| } |
| |
| // OK, do it the slow way. |
| char c = readCh (); |
| while (isWhitespace(c)) |
| { |
| c = readCh(); |
| } |
| unread(c); |
| } |
| |
| /** |
| * Read a name or (when parsing an enumeration) name token. |
| * <pre> |
| * [5] Name ::= (Letter | '_' | ':') (NameChar)* |
| * [7] Nmtoken ::= (NameChar)+ |
| * </pre> |
| */ |
| private String readNmtoken(boolean isName) |
| throws SAXException, IOException |
| { |
| char c; |
| |
| if (USE_CHEATS) |
| { |
| loop: |
| for (int i = readBufferPos; i < readBufferLength; i++) |
| { |
| c = readBuffer[i]; |
| switch (c) |
| { |
| case '%': |
| if (expandPE) |
| { |
| break loop; |
| } |
| // else fall through... |
| |
| // What may legitimately come AFTER a name/nmtoken? |
| case '<': case '>': case '&': |
| case ',': case '|': case '*': case '+': case '?': |
| case ')': |
| case '=': |
| case '\'': case '"': |
| case '[': |
| case ' ': case '\t': case '\r': case '\n': |
| case ';': |
| case '/': |
| int start = readBufferPos; |
| if (i == start) |
| { |
| error("name expected", readBuffer[i], null); |
| } |
| readBufferPos = i; |
| return intern(readBuffer, start, i - start); |
| |
| default: |
| // FIXME ... per IBM's OASIS test submission, these: |
| // ? U+06dd |
| // Combining U+309B |
| //these switches are kind of ugly but at least we won't |
| //have to go over the whole lits for each char |
| if (isName && i == readBufferPos) |
| { |
| char c2 = (char) (c & 0x00f0); |
| switch (c & 0xff00) |
| { |
| //starting with 01 |
| case 0x0100: |
| switch (c2) |
| { |
| case 0x0030: |
| if (c == 0x0132 || c == 0x0133 || c == 0x013f) |
| { |
| error("Not a name start character, U+" |
| + Integer.toHexString(c)); |
| } |
| break; |
| case 0x0040: |
| if (c == 0x0140 || c == 0x0149) |
| { |
| error("Not a name start character, U+" |
| + Integer.toHexString(c)); |
| } |
| break; |
| case 0x00c0: |
| if (c == 0x01c4 || c == 0x01cc) |
| { |
| error("Not a name start character, U+" |
| + Integer.toHexString(c)); |
| } |
| break; |
| case 0x00f0: |
| if (c == 0x01f1 || c == 0x01f3) |
| { |
| error("Not a name start character, U+" |
| + Integer.toHexString(c)); |
| } |
| break; |
| case 0x00b0: |
| if (c == 0x01f1 || c == 0x01f3) |
| { |
| error("Not a name start character, U+" |
| + Integer.toHexString(c)); |
| } |
| break; |
| default: |
| if (c == 0x017f) |
| { |
| error("Not a name start character, U+" |
| + Integer.toHexString(c)); |
| } |
| } |
| |
| break; |
| //starting with 11 |
| case 0x1100: |
| switch (c2) |
| { |
| case 0x0000: |
| if (c == 0x1104 || c == 0x1108 || |
| c == 0x110a || c == 0x110d) |
| { |
| error("Not a name start character, U+" |
| + Integer.toHexString(c)); |
| } |
| break; |
| case 0x0030: |
| if (c == 0x113b || c == 0x113f) |
| { |
| error("Not a name start character, U+" |
| + Integer.toHexString(c)); |
| } |
| break; |
| case 0x0040: |
| if (c == 0x1141 || c == 0x114d |
| || c == 0x114f ) |
| { |
| error("Not a name start character, U+" |
| + Integer.toHexString(c)); |
| } |
| break; |
| case 0x0050: |
| if (c == 0x1151 || c == 0x1156) |
| { |
| error("Not a name start character, U+" |
| + Integer.toHexString(c)); |
| } |
| break; |
| case 0x0060: |
| if (c == 0x1162 || c == 0x1164 |
| || c == 0x1166 || c == 0x116b |
| || c == 0x116f) |
| { |
| error("Not a name start character, U+" |
| + Integer.toHexString(c)); |
| } |
| break; |
| case 0x00b0: |
| if (c == 0x11b6 || c == 0x11b9 |
| || c == 0x11bb || c == 0x116f) |
| { |
| error("Not a name start character, U+" |
| + Integer.toHexString(c)); |
| } |
| break; |
| default: |
| if (c == 0x1174 || c == 0x119f |
| || c == 0x11ac || c == 0x11c3 |
| || c == 0x11f1) |
| { |
| error("Not a name start character, U+" |
| + Integer.toHexString(c)); |
| } |
| } |
| break; |
| default: |
| if (c == 0x0e46 || c == 0x1011 |
| || c == 0x212f || c == 0x0587 |
| || c == 0x0230 ) |
| { |
| error("Not a name start character, U+" |
| + Integer.toHexString(c)); |
| } |
| } |
| } |
| // punt on exact tests from Appendix A; approximate |
| // them using the Unicode ID start/part rules |
| if (i == readBufferPos && isName) |
| { |
| if (!Character.isUnicodeIdentifierStart(c) |
| && c != ':' && c != '_') |
| { |
| error("Not a name start character, U+" |
| + Integer.toHexString(c)); |
| } |
| } |
| else if (!Character.isUnicodeIdentifierPart(c) |
| && c != '-' && c != ':' && c != '_' && c != '.' |
| && !isExtender(c)) |
| { |
| error("Not a name character, U+" |
| + Integer.toHexString(c)); |
| } |
| } |
| } |
| } |
| |
| nameBufferPos = 0; |
| |
| // Read the first character. |
| loop: |
| while (true) |
| { |
| c = readCh(); |
| switch (c) |
| { |
| case '%': |
| case '<': case '>': case '&': |
| case ',': case '|': case '*': case '+': case '?': |
| case ')': |
| case '=': |
| case '\'': case '"': |
| case '[': |
| case ' ': case '\t': case '\n': case '\r': |
| case ';': |
| case '/': |
| unread(c); |
| if (nameBufferPos == 0) |
| { |
| error ("name expected"); |
| } |
| // punt on exact tests from Appendix A, but approximate them |
| if (isName |
| && !Character.isUnicodeIdentifierStart(nameBuffer[0]) |
| && ":_".indexOf(nameBuffer[0]) == -1) |
| { |
| error("Not a name start character, U+" |
| + Integer.toHexString(nameBuffer[0])); |
| } |
| String s = intern(nameBuffer, 0, nameBufferPos); |
| nameBufferPos = 0; |
| return s; |
| default: |
| // punt on exact tests from Appendix A, but approximate them |
| |
| if ((nameBufferPos != 0 || !isName) |
| && !Character.isUnicodeIdentifierPart(c) |
| && ":-_.".indexOf(c) == -1 |
| && !isExtender(c)) |
| { |
| error("Not a name character, U+" |
| + Integer.toHexString(c)); |
| } |
| if (nameBufferPos >= nameBuffer.length) |
| { |
| nameBuffer = |
| (char[]) extendArray(nameBuffer, |
| nameBuffer.length, nameBufferPos); |
| } |
| nameBuffer[nameBufferPos++] = c; |
| } |
| } |
| } |
| |
| private static boolean isExtender(char c) |
| { |
| // [88] Extender ::= ... |
| return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387 |
| || c == 0x0640 || c == 0x0e46 || c == 0x0ec6 || c == 0x3005 |
| || (c >= 0x3031 && c <= 0x3035) |
| || (c >= 0x309d && c <= 0x309e) |
| || (c >= 0x30fc && c <= 0x30fe); |
| } |
| |
| /** |
| * Read a literal. With matching single or double quotes as |
| * delimiters (and not embedded!) this is used to parse: |
| * <pre> |
| * [9] EntityValue ::= ... ([^%&] | PEReference | Reference)* ... |
| * [10] AttValue ::= ... ([^<&] | Reference)* ... |
| * [11] SystemLiteral ::= ... (URLchar - "'")* ... |
| * [12] PubidLiteral ::= ... (PubidChar - "'")* ... |
| * </pre> |
| * as well as the quoted strings in XML and text declarations |
| * (for version, encoding, and standalone) which have their |
| * own constraints. |
| */ |
| private String readLiteral(int flags) |
| throws SAXException, IOException |
| { |
| char delim, c; |
| int startLine = line; |
| boolean saved = expandPE; |
| boolean savedReport = doReport; |
| |
| // Find the first delimiter. |
| delim = readCh(); |
| if (delim != '"' && delim != '\'') |
| { |
| error("expected '\"' or \"'\"", delim, null); |
| return null; |
| } |
| inLiteral = true; |
| if ((flags & LIT_DISABLE_PE) != 0) |
| { |
| expandPE = false; |
| } |
| doReport = false; |
| |
| // Each level of input source has its own buffer; remember |
| // ours, so we won't read the ending delimiter from any |
| // other input source, regardless of entity processing. |
| char[] ourBuf = readBuffer; |
| |
| // Read the literal. |
| try |
| { |
| c = readCh(); |
| boolean ampRead = false; |
| loop: |
| while (! (c == delim && readBuffer == ourBuf)) |
| { |
| switch (c) |
| { |
| // attributes and public ids are normalized |
| // in almost the same ways |
| case '\n': |
| case '\r': |
| if ((flags & (LIT_ATTRIBUTE | LIT_PUBID)) != 0) |
| { |
| c = ' '; |
| } |
| break; |
| case '\t': |
| if ((flags & LIT_ATTRIBUTE) != 0) |
| { |
| c = ' '; |
| } |
| break; |
| case '&': |
| c = readCh(); |
| // Char refs are expanded immediately, except for |
| // all the cases where it's deferred. |
| if (c == '#') |
| { |
| if ((flags & LIT_DISABLE_CREF) != 0) |
| { |
| dataBufferAppend('&'); |
| break; |
| } |
| parseCharRef(false /* Do not do flushDataBuffer */); |
| |
| // exotic WFness risk: this is an entity literal, |
| // dataBuffer [dataBufferPos - 1] == '&', and |
| // following chars are a _partial_ entity/char ref |
| |
| // It looks like an entity ref ... |
| } |
| else |
| { |
| unread(c); |
| // Expand it? |
| if ((flags & LIT_ENTITY_REF) > 0) |
| { |
| parseEntityRef(false); |
| if (String.valueOf(readBuffer).equals("&")) |
| { |
| ampRead = true; |
| } |
| //Is it just data? |
| } |
| else if ((flags & LIT_DISABLE_EREF) != 0) |
| { |
| dataBufferAppend('&'); |
| |
| // OK, it will be an entity ref -- expanded later. |
| } |
| else |
| { |
| String name = readNmtoken(true); |
| require(';'); |
| dataBufferAppend('&'); |
| dataBufferAppend(name); |
| dataBufferAppend(';'); |
| } |
| } |
| c = readCh(); |
| continue loop; |
| |
| case '<': |
| // and why? Perhaps so "&foo;" expands the same |
| // inside and outside an attribute? |
| if ((flags & LIT_ATTRIBUTE) != 0) |
| { |
| error("attribute values may not contain '<'"); |
| } |
| break; |
| |
| // We don't worry about case '%' and PE refs, readCh does. |
| |
| default: |
| break; |
| } |
| dataBufferAppend(c); |
| c = readCh(); |
| } |
| } |
| catch (EOFException e) |
| { |
| error("end of input while looking for delimiter (started on line " |
| + startLine + ')', null, new Character(delim).toString()); |
| } |
| inLiteral = false; |
| expandPE = saved; |
| doReport = savedReport; |
| |
| // Normalise whitespace if necessary. |
| if ((flags & LIT_NORMALIZE) > 0) |
| { |
| dataBufferNormalize(); |
| } |
| |
| // Return the value. |
| return dataBufferToString(); |
| } |
| |
| /** |
| * Try reading external identifiers. |
| * A system identifier is not required for notations. |
| * @param inNotation Are we parsing a notation decl? |
| * @param isSubset Parsing external subset decl (may be omitted)? |
| * @return A three-member String array containing the identifiers, |
| * or nulls. Order: public, system, baseURI. |
| */ |
| private ExternalIdentifiers readExternalIds(boolean inNotation, |
| boolean isSubset) |
| throws Exception |
| { |
| char c; |
| ExternalIdentifiers ids = new ExternalIdentifiers(); |
| int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF; |
| |
| if (tryRead("PUBLIC")) |
| { |
| requireWhitespace(); |
| ids.publicId = readLiteral(LIT_NORMALIZE | LIT_PUBID | flags); |
| if (inNotation) |
| { |
| skipWhitespace(); |
| c = readCh(); |
| unread(c); |
| if (c == '"' || c == '\'') |
| { |
| ids.systemId = readLiteral(flags); |
| } |
| } |
| else |
| { |
| requireWhitespace(); |
| ids.systemId = readLiteral(flags); |
| } |
| |
| for (int i = 0; i < ids.publicId.length(); i++) |
| { |
| c = ids.publicId.charAt(i); |
| if (c >= 'a' && c <= 'z') |
| { |
| continue; |
| } |
| if (c >= 'A' && c <= 'Z') |
| { |
| continue; |
| } |
| if (" \r\n0123456789-' ()+,./:=?;!*#@$_%".indexOf(c) != -1) |
| { |
| continue; |
| } |
| error("illegal PUBLIC id character U+" |
| + Integer.toHexString(c)); |
| } |
| } |
| else if (tryRead("SYSTEM")) |
| { |
| requireWhitespace(); |
| ids.systemId = readLiteral(flags); |
| } |
| else if (!isSubset) |
| { |
| error("missing SYSTEM or PUBLIC keyword"); |
| } |
| |
| if (ids.systemId != null) |
| { |
| if (ids.systemId.indexOf('#') != -1) |
| { |
| handler.verror("SYSTEM id has an URI fragment: " + ids.systemId); |
| } |
| ids.baseUri = handler.getSystemId(); |
| if (ids.baseUri == null && uriWarnings) |
| { |
| handler.warn("No base URI; hope URI is absolute: " |
| + ids.systemId); |
| } |
| } |
| |
| return ids; |
| } |
| |
| /** |
| * Test if a character is whitespace. |
| * <pre> |
| * [3] S ::= (#x20 | #x9 | #xd | #xa)+ |
| * </pre> |
| * @param c The character to test. |
| * @return true if the character is whitespace. |
| */ |
| private final boolean isWhitespace(char c) |
| { |
| if (c > 0x20) |
| { |
| return false; |
| } |
| if (c == 0x20 || c == 0x0a || c == 0x09 || c == 0x0d) |
| { |
| return true; |
| } |
| return false; // illegal ... |
| } |
| |
| ////////////////////////////////////////////////////////////////////// |
| // Utility routines. |
| ////////////////////////////////////////////////////////////////////// |
| |
| /** |
| * Add a character to the data buffer. |
| */ |
| private void dataBufferAppend(char c) |
| { |
| // Expand buffer if necessary. |
| if (dataBufferPos >= dataBuffer.length) |
| { |
| dataBuffer = (char[]) extendArray(dataBuffer, |
| dataBuffer.length, dataBufferPos); |
| } |
| dataBuffer[dataBufferPos++] = c; |
| } |
| |
| /** |
| * Add a string to the data buffer. |
| */ |
| private void dataBufferAppend(String s) |
| { |
| dataBufferAppend(s.toCharArray(), 0, s.length()); |
| } |
| |
| /** |
| * Append (part of) a character array to the data buffer. |
| */ |
| private void dataBufferAppend(char[] ch, int start, int length) |
| { |
| dataBuffer = (char[]) extendArray(dataBuffer, dataBuffer.length, |
| dataBufferPos + length); |
| |
| System.arraycopy(ch, start, dataBuffer, dataBufferPos, length); |
| dataBufferPos += length; |
| } |
| |
| /** |
| * Normalise space characters in the data buffer. |
| */ |
| private void dataBufferNormalize() |
| { |
| int i = 0; |
| int j = 0; |
| int end = dataBufferPos; |
| |
| // Skip spaces at the start. |
| while (j < end && dataBuffer[j] == ' ') |
| { |
| j++; |
| } |
| |
| // Skip whitespace at the end. |
| while (end > j && dataBuffer[end - 1] == ' ') |
| { |
| end --; |
| } |
| |
| // Start copying to the left. |
| while (j < end) |
| { |
| |
| char c = dataBuffer[j++]; |
| |
| // Normalise all other spaces to |
| // a single space. |
| if (c == ' ') |
| { |
| while (j < end && dataBuffer[j++] == ' ') |
| { |
| continue; |
| } |
| dataBuffer[i++] = ' '; |
| dataBuffer[i++] = dataBuffer[j - 1]; |
| } |
| else |
| { |
| dataBuffer[i++] = c; |
| } |
| } |
| |
| // The new length is <= the old one. |
| dataBufferPos = i; |
| } |
| |
| /** |
| * Convert the data buffer to a string. |
| */ |
| private String dataBufferToString() |
| { |
| String s = new String(dataBuffer, 0, dataBufferPos); |
| dataBufferPos = 0; |
| return s; |
| } |
| |
| /** |
| * Flush the contents of the data buffer to the handler, as |
| * appropriate, and reset the buffer for new input. |
| */ |
| private void dataBufferFlush() |
| throws SAXException |
| { |
| if (currentElementContent == CONTENT_ELEMENTS |
| && dataBufferPos > 0 |
| && !inCDATA) |
| { |
| // We can't just trust the buffer to be whitespace, there |
| // are (error) cases when it isn't |
| for (int i = 0; i < dataBufferPos; i++) |
| { |
| if (!isWhitespace(dataBuffer[i])) |
| { |
| handler.charData(dataBuffer, 0, dataBufferPos); |
| dataBufferPos = 0; |
| } |
| } |
| if (dataBufferPos > 0) |
| { |
| handler.ignorableWhitespace(dataBuffer, 0, dataBufferPos); |
| dataBufferPos = 0; |
| } |
| } |
| else if (dataBufferPos > 0) |
| { |
| handler.charData(dataBuffer, 0, dataBufferPos); |
| dataBufferPos = 0; |
| } |
| } |
| |
| /** |
| * Require a string to appear, or throw an exception. |
| * <p><em>Precondition:</em> Entity expansion is not required. |
| * <p><em>Precondition:</em> data buffer has no characters that |
| * will get sent to the application. |
| */ |
| private void require(String delim) |
| throws SAXException, IOException |
| { |
| int length = delim.length(); |
| char[] ch; |
| |
| if (length < dataBuffer.length) |
| { |
| ch = dataBuffer; |
| delim.getChars(0, length, ch, 0); |
| } |
| else |
| { |
| ch = delim.toCharArray(); |
| } |
| |
| if (USE_CHEATS && length <= (readBufferLength - readBufferPos)) |
| { |
| int offset = readBufferPos; |
| |
| for (int i = 0; i < length; i++, offset++) |
| { |
| if (ch[i] != readBuffer[offset]) |
| { |
| error ("required string", null, delim); |
| } |
| } |
| readBufferPos = offset; |
| |
| } |
| else |
| { |
| for (int i = 0; i < length; i++) |
| { |
| require(ch[i]); |
| } |
| } |
| } |
| |
| /** |
| * Require a character to appear, or throw an exception. |
| */ |
| private void require(char delim) |
| throws SAXException, IOException |
| { |
| char c = readCh(); |
| |
| if (c != delim) |
| { |
| error("required character", c, new Character(delim).toString()); |
| } |
| } |
| |
| /** |
| * Create an interned string from a character array. |
| * Ælfred uses this method to create an interned version |
| * of all names and name tokens, so that it can test equality |
| * with <code>==</code> instead of <code>String.equals ()</code>. |
| * |
| * <p>This is much more efficient than constructing a non-interned |
| * string first, and then interning it. |
| * |
| * @param ch an array of characters for building the string. |
| * @param start the starting position in the array. |
| * @param length the number of characters to place in the string. |
| * @return an interned string. |
| * @see #intern (String) |
| * @see java.lang.String#intern |
| */ |
| public String intern(char[] ch, int start, int length) |
| { |
| int index = 0; |
| int hash = 0; |
| Object[] bucket; |
| |
| // Generate a hash code. This is a widely used string hash, |
| // often attributed to Brian Kernighan. |
| for (int i = start; i < start + length; i++) |
| { |
| hash = 31 * hash + ch[i]; |
| } |
| hash = (hash & 0x7fffffff) % SYMBOL_TABLE_LENGTH; |
| |
| // Get the bucket -- consists of {array,String} pairs |
| if ((bucket = symbolTable[hash]) == null) |
| { |
| // first string in this bucket |
| bucket = new Object[8]; |
| |
| // Search for a matching tuple, and |
| // return the string if we find one. |
| } |
| else |
| { |
| while (index < bucket.length) |
| { |
| char[] chFound = (char[]) bucket[index]; |
| |
| // Stop when we hit an empty entry. |
| if (chFound == null) |
| { |
| break; |
| } |
| |
| // If they're the same length, check for a match. |
| if (chFound.length == length) |
| { |
| for (int i = 0; i < chFound.length; i++) |
| { |
| // continue search on failure |
| if (ch[start + i] != chFound[i]) |
| { |
| break; |
| } |
| else if (i == length - 1) |
| { |
| // That's it, we have a match! |
| return (String) bucket[index + 1]; |
| } |
| } |
| } |
| index += 2; |
| } |
| // Not found -- we'll have to add it. |
| |
| // Do we have to grow the bucket? |
| bucket = (Object[]) extendArray(bucket, bucket.length, index); |
| } |
| symbolTable[hash] = bucket; |
| |
| // OK, add it to the end of the bucket -- "local" interning. |
| // Intern "globally" to let applications share interning benefits. |
| // That is, "!=" and "==" work on our strings, not just equals(). |
| String s = new String(ch, start, length).intern(); |
| bucket[index] = s.toCharArray(); |
| bucket[index + 1] = s; |
| return s; |
| } |
| |
| /** |
| * Ensure the capacity of an array, allocating a new one if |
| * necessary. Usually extends only for name hash collisions. |
| */ |
| private Object extendArray(Object array, int currentSize, int requiredSize) |
| { |
| if (requiredSize < currentSize) |
| { |
| return array; |
| } |
| else |
| { |
| Object newArray = null; |
| int newSize = currentSize * 2; |
| |
| if (newSize <= requiredSize) |
| { |
| newSize = requiredSize + 1; |
| } |
| |
| if (array instanceof char[]) |
| { |
| newArray = new char[newSize]; |
| } |
| else if (array instanceof Object[]) |
| { |
| newArray = new Object[newSize]; |
| } |
| else |
| { |
| throw new RuntimeException(); |
| } |
| |
| System.arraycopy(array, 0, newArray, 0, currentSize); |
| return newArray; |
| } |
| } |
| |
| ////////////////////////////////////////////////////////////////////// |
| // XML query routines. |
| ////////////////////////////////////////////////////////////////////// |
| |
| boolean isStandalone() |
| { |
| return docIsStandalone; |
| } |
| |
| // |
| // Elements |
| // |
| |
| private int getContentType(ElementDecl element, int defaultType) |
| { |
| int retval; |
| |
| if (element == null) |
| { |
| return defaultType; |
| } |
| retval = element.contentType; |
| if (retval == CONTENT_UNDECLARED) |
| { |
| retval = defaultType; |
| } |
| return retval; |
| } |
| |
| /** |
| * Look up the content type of an element. |
| * @param name The element type name. |
| * @return An integer constant representing the content type. |
| * @see #CONTENT_UNDECLARED |
| * @see #CONTENT_ANY |
| * @see #CONTENT_EMPTY |
| * @see #CONTENT_MIXED |
| * @see #CONTENT_ELEMENTS |
| */ |
| public int getElementContentType(String name) |
| { |
| ElementDecl element = (ElementDecl) elementInfo.get(name); |
| return getContentType(element, CONTENT_UNDECLARED); |
| } |
| |
| /** |
| * Register an element. |
| * Array format: |
| * [0] element type name |
| * [1] content model (mixed, elements only) |
| * [2] attribute hash table |
| */ |
| private void setElement(String name, int contentType, |
| String contentModel, HashMap attributes) |
| throws SAXException |
| { |
| if (skippedPE) |
| { |
| return; |
| } |
| |
| ElementDecl element = (ElementDecl) elementInfo.get(name); |
| |
| // first <!ELEMENT ...> or <!ATTLIST ...> for this type? |
| if (element == null) |
| { |
| element = new ElementDecl(); |
| element.contentType = contentType; |
| element.contentModel = contentModel; |
| element.attributes = attributes; |
| elementInfo.put(name, element); |
| return; |
| } |
| |
| // <!ELEMENT ...> declaration? |
| if (contentType != CONTENT_UNDECLARED) |
| { |
| // ... following an associated <!ATTLIST ...> |
| if (element.contentType == CONTENT_UNDECLARED) |
| { |
| element.contentType = contentType; |
| element.contentModel = contentModel; |
| } |
| else |
| { |
| // VC: Unique Element Type Declaration |
| handler.verror("multiple declarations for element type: " |
| + name); |
| } |
| } |
| |
| // first <!ATTLIST ...>, before <!ELEMENT ...> ? |
| else if (attributes != null) |
| { |
| element.attributes = attributes; |
| } |
| } |
| |
| /** |
| * Look up the attribute hash table for an element. |
| * The hash table is the second item in the element array. |
| */ |
| private HashMap getElementAttributes(String name) |
| { |
| ElementDecl element = (ElementDecl) elementInfo.get(name); |
| return (element == null) ? null : element.attributes; |
| } |
| |
| // |
| // Attributes |
| // |
| |
| /** |
| * Get the declared attributes for an element type. |
| * @param elname The name of the element type. |
| * @return An iterator over all the attributes declared for |
| * a specific element type. The results will be valid only |
| * after the DTD (if any) has been parsed. |
| * @see #getAttributeType |
| * @see #getAttributeEnumeration |
| * @see #getAttributeDefaultValueType |
| * @see #getAttributeDefaultValue |
| * @see #getAttributeExpandedValue |
| */ |
| private Iterator declaredAttributes(ElementDecl element) |
| { |
| HashMap attlist; |
| |
| if (element == null) |
| { |
| return null; |
| } |
| if ((attlist = element.attributes) == null) |
| { |
| return null; |
| } |
| return attlist.keySet().iterator(); |
| } |
| |
| /** |
| * Get the declared attributes for an element type. |
| * @param elname The name of the element type. |
| * @return An iterator over all the attributes declared for |
| * a specific element type. The results will be valid only |
| * after the DTD (if any) has been parsed. |
| * @see #getAttributeType |
| * @see #getAttributeEnumeration |
| * @see #getAttributeDefaultValueType |
| * @see #getAttributeDefaultValue |
| * @see #getAttributeExpandedValue |
| */ |
| public Iterator declaredAttributes(String elname) |
| { |
| return declaredAttributes((ElementDecl) elementInfo.get(elname)); |
| } |
| |
| /** |
| * Retrieve the declared type of an attribute. |
| * @param name The name of the associated element. |
| * @param aname The name of the attribute. |
| * @return An interend string denoting the type, or null |
| * indicating an undeclared attribute. |
| */ |
| public String getAttributeType(String name, String aname) |
| { |
| AttributeDecl attribute = getAttribute(name, aname); |
| return (attribute == null) ? null : attribute.type; |
| } |
| |
| /** |
| * Retrieve the allowed values for an enumerated attribute type. |
| * @param name The name of the associated element. |
| * @param aname The name of the attribute. |
| * @return A string containing the token list. |
| */ |
| public String getAttributeEnumeration(String name, String aname) |
| { |
| AttributeDecl attribute = getAttribute(name, aname); |
| // assert: attribute.enumeration is "ENUMERATION" or "NOTATION" |
| return (attribute == null) ? null : attribute.enumeration; |
| } |
| |
| /** |
| * Retrieve the default value of a declared attribute. |
| * @param name The name of the associated element. |
| * @param aname The name of the attribute. |
| * @return The default value, or null if the attribute was |
| * #IMPLIED or simply undeclared and unspecified. |
| * @see #getAttributeExpandedValue |
| */ |
| public String getAttributeDefaultValue(String name, String aname) |
| { |
| AttributeDecl attribute = getAttribute(name, aname); |
| return (attribute == null) ? null : attribute.value; |
| } |
| |
| /* |
| |
| // FIXME: Leaving this in, until W3C finally resolves the confusion |
| // between parts of the XML 2nd REC about when entity declararations |
| // are guaranteed to be known. Current code matches what section 5.1 |
| // (conformance) describes, but some readings of the self-contradicting |
| // text in 4.1 (the "Entity Declared" WFC and VC) seem to expect that |
| // attribute expansion/normalization must be deferred in some cases |
| // (just TRY to identify them!). |
| |
| * Retrieve the expanded value of a declared attribute. |
| * <p>General entities (and char refs) will be expanded (once). |
| * @param name The name of the associated element. |
| * @param aname The name of the attribute. |
| * @return The expanded default value, or null if the attribute was |
| * #IMPLIED or simply undeclared |
| * @see #getAttributeDefaultValue |
| public String getAttributeExpandedValue (String name, String aname) |
| throws Exception |
| { |
| AttributeDecl attribute = getAttribute (name, aname); |
| |
| if (attribute == null) { |
| return null; |
| } else if (attribute.defaultValue == null && attribute.value != null) { |
| // we MUST use the same buf for both quotes else the literal |
| // can't be properly terminated |
| char buf [] = new char [1]; |
| int flags = LIT_ENTITY_REF | LIT_ATTRIBUTE; |
| String type = getAttributeType (name, aname); |
| |
| if (type != "CDATA" && type != null) |
| flags |= LIT_NORMALIZE; |
| buf [0] = '"'; |
| pushCharArray (null, buf, 0, 1); |
| pushString (null, attribute.value); |
| pushCharArray (null, buf, 0, 1); |
| attribute.defaultValue = readLiteral (flags); |
| } |
| return attribute.defaultValue; |
| } |
| */ |
| |
| /** |
| * Retrieve the default value mode of a declared attribute. |
| * @see #ATTRIBUTE_DEFAULT_SPECIFIED |
| * @see #ATTRIBUTE_DEFAULT_IMPLIED |
| * @see #ATTRIBUTE_DEFAULT_REQUIRED |
| * @see #ATTRIBUTE_DEFAULT_FIXED |
| */ |
| public int getAttributeDefaultValueType(String name, String aname) |
| { |
| AttributeDecl attribute = getAttribute(name, aname); |
| return (attribute == null) ? ATTRIBUTE_DEFAULT_UNDECLARED : |
| attribute.valueType; |
| } |
| |
| /** |
| * Register an attribute declaration for later retrieval. |
| * Format: |
| * - String type |
| * - String default value |
| * - int value type |
| * - enumeration |
| * - processed default value |
| */ |
| private void setAttribute(String elName, String name, String type, |
| String enumeration, String value, int valueType) |
| throws Exception |
| { |
| HashMap attlist; |
| |
| if (skippedPE) |
| { |
| return; |
| } |
| |
| // Create a new hashtable if necessary. |
| attlist = getElementAttributes(elName); |
| if (attlist == null) |
| { |
| attlist = new HashMap(); |
| } |
| |
| // ignore multiple attribute declarations! |
| if (attlist.get(name) != null) |
| { |
| // warn ... |
| return; |
| } |
| else |
| { |
| AttributeDecl attribute = new AttributeDecl(); |
| attribute.type = type; |
| attribute.value = value; |
| attribute.valueType = valueType; |
| attribute.enumeration = enumeration; |
| attlist.put(name, attribute); |
| |
| // save; but don't overwrite any existing <!ELEMENT ...> |
| setElement(elName, CONTENT_UNDECLARED, null, attlist); |
| } |
| } |
| |
| /** |
| * Retrieve the attribute declaration for the given element name and name. |
| */ |
| private AttributeDecl getAttribute(String elName, String name) |
| { |
| HashMap attlist = getElementAttributes(elName); |
| return (attlist == null) ? null : (AttributeDecl) attlist.get(name); |
| } |
| |
| // |
| // Entities |
| // |
| |
| /** |
| * Find the type of an entity. |
| * @returns An integer constant representing the entity type. |
| * @see #ENTITY_UNDECLARED |
| * @see #ENTITY_INTERNAL |
| * @see #ENTITY_NDATA |
| * @see #ENTITY_TEXT |
| */ |
| public int getEntityType(String ename) |
| { |
| EntityInfo entity = (EntityInfo) entityInfo.get(ename); |
| return (entity == null) ? ENTITY_UNDECLARED : entity.type; |
| } |
| |
| /** |
| * Return an external entity's identifiers. |
| * @param ename The name of the external entity. |
| * @return The entity's public identifier, system identifier, and base URI. |
| * Null if the entity was not declared as an external entity. |
| * @see #getEntityType |
| */ |
| public ExternalIdentifiers getEntityIds(String ename) |
| { |
| EntityInfo entity = (EntityInfo) entityInfo.get(ename); |
| return (entity == null) ? null : entity.ids; |
| } |
| |
| /** |
| * Return an internal entity's replacement text. |
| * @param ename The name of the internal entity. |
| * @return The entity's replacement text, or null if |
| * the entity was not declared as an internal entity. |
| * @see #getEntityType |
| */ |
| public String getEntityValue(String ename) |
| { |
| EntityInfo entity = (EntityInfo) entityInfo.get(ename); |
| return (entity == null) ? null : entity.value; |
| } |
| |
| /** |
| * Register an entity declaration for later retrieval. |
| */ |
| private void setInternalEntity(String eName, String value) |
| throws SAXException |
| { |
| if (skippedPE) |
| { |
| return; |
| } |
| |
| if (entityInfo.get(eName) == null) |
| { |
| EntityInfo entity = new EntityInfo(); |
| entity.type = ENTITY_INTERNAL; |
| entity.value = value; |
| entityInfo.put(eName, entity); |
| } |
| if (handler.stringInterning) |
| { |
| if ("lt" == eName || "gt" == eName || "quot" == eName |
| || "apos" == eName || "amp" == eName) |
| { |
| return; |
| } |
| } |
| else |
| { |
| if ("lt".equals(eName) || "gt".equals(eName) || "quot".equals(eName) |
| || "apos".equals(eName) || "amp".equals(eName)) |
| { |
| return; |
| } |
| } |
| handler.getDeclHandler().internalEntityDecl(eName, value); |
| } |
| |
| /** |
| * Register an external entity declaration for later retrieval. |
| */ |
| private void setExternalEntity(String eName, int eClass, |
| ExternalIdentifiers ids, String nName) |
| { |
| if (entityInfo.get(eName) == null) |
| { |
| EntityInfo entity = new EntityInfo(); |
| entity.type = eClass; |
| entity.ids = ids; |
| entity.notationName = nName; |
| entityInfo.put(eName, entity); |
| } |
| } |
| |
| // |
| // Notations. |
| // |
| |
| /** |
| * Report a notation declaration, checking for duplicates. |
| */ |
| private void setNotation(String nname, ExternalIdentifiers ids) |
| throws SAXException |
| { |
| if (skippedPE) |
| { |
| return; |
| } |
| |
| handler.notationDecl(nname, ids.publicId, ids.systemId, ids.baseUri); |
| if (notationInfo.get(nname) == null) |
| { |
| notationInfo.put(nname, nname); |
| } |
| else |
| { |
| // VC: Unique Notation Name |
| handler.verror("Duplicate notation name decl: " + nname); |
| } |
| } |
| |
| // |
| // Location. |
| // |
| |
| /** |
| * Return the current line number. |
| */ |
| public int getLineNumber() |
| { |
| return line; |
| } |
| |
| /** |
| * Return the current column number. |
| */ |
| public int getColumnNumber() |
| { |
| return column; |
| } |
| |
| ////////////////////////////////////////////////////////////////////// |
| // High-level I/O. |
| ////////////////////////////////////////////////////////////////////// |
| |
| /** |
| * Read a single character from the readBuffer. |
| * <p>The readDataChunk () method maintains the buffer. |
| * <p>If we hit the end of an entity, try to pop the stack and |
| * keep going. |
| * <p> (This approach doesn't really enforce XML's rules about |
| * entity boundaries, but this is not currently a validating |
| * parser). |
| * <p>This routine also attempts to keep track of the current |
| * position in external entities, but it's not entirely accurate. |
| * @return The next available input character. |
| * @see #unread (char) |
| * @see #readDataChunk |
| * @see #readBuffer |
| * @see #line |
| * @return The next character from the current input source. |
| */ |
| private char readCh() |
| throws SAXException, IOException |
| { |
| // As long as there's nothing in the |
| // read buffer, try reading more data |
| // (for an external entity) or popping |
| // the entity stack (for either). |
| while (readBufferPos >= readBufferLength) |
| { |
| switch (sourceType) |
| { |
| case INPUT_READER: |
| case INPUT_STREAM: |
| readDataChunk(); |
| while (readBufferLength < 1) |
| { |
| popInput(); |
| if (readBufferLength < 1) |
| { |
| readDataChunk(); |
| } |
| } |
| break; |
| |
| default: |
| |
| popInput(); |
| break; |
| } |
| } |
| |
| char c = readBuffer[readBufferPos++]; |
| |
| if (c == '\n') |
| { |
| line++; |
| column = 0; |
| } |
| else |
| { |
| if (c == '<') |
| { |
| /* the most common return to parseContent () ... NOP */ |
| } |
| else if (((c < 0x0020 && (c != '\t') && (c != '\r')) || c > 0xFFFD) |
| || ((c >= 0x007f) && (c <= 0x009f) && (c != 0x0085) |
| && xmlVersion == XML_11)) |
| { |
| error("illegal XML character U+" + Integer.toHexString(c)); |
| } |
| |
| // If we're in the DTD and in a context where PEs get expanded, |
| // do so ... 1/14/2000 errata identify those contexts. There |
| // are also spots in the internal subset where PE refs are fatal |
| // errors, hence yet another flag. |
| else if (c == '%' && expandPE) |
| { |
| if (peIsError) |
| { |
| error("PE reference within decl in internal subset."); |
| } |
| parsePEReference(); |
| return readCh(); |
| } |
| column++; |
| } |
| |
| return c; |
| } |
| |
| /** |
| * Push a single character back onto the current input stream. |
| * <p>This method usually pushes the character back onto |
| * the readBuffer. |
| * <p>I don't think that this would ever be called with |
| * readBufferPos = 0, because the methods always reads a character |
| * before unreading it, but just in case, I've added a boundary |
| * condition. |
| * @param c The character to push back. |
| * @see #readCh |
| * @see #unread (char[]) |
| * @see #readBuffer |
| */ |
| private void unread(char c) |
| throws SAXException |
| { |
| // Normal condition. |
| if (c == '\n') |
| { |
| line--; |
| column = -1; |
| } |
| if (readBufferPos > 0) |
| { |
| readBuffer[--readBufferPos] = c; |
| } |
| else |
| { |
| pushString(null, new Character(c).toString()); |
| } |
| } |
| |
| /** |
| * Push a char array back onto the current input stream. |
| * <p>NOTE: you must <em>never</em> push back characters that you |
| * haven't actually read: use pushString () instead. |
| * @see #readCh |
| * @see #unread (char) |
| * @see #readBuffer |
| * @see #pushString |
| */ |
| private void unread(char[] ch, int length) |
| throws SAXException |
| { |
| for (int i = 0; i < length; i++) |
| { |
| if (ch[i] == '\n') |
| { |
| line--; |
| column = -1; |
| } |
| } |
| if (length < readBufferPos) |
| { |
| readBufferPos -= length; |
| } |
| else |
| { |
| pushCharArray(null, ch, 0, length); |
| } |
| } |
| |
| /** |
| * Push, or skip, a new external input source. |
| * The source will be some kind of parsed entity, such as a PE |
| * (including the external DTD subset) or content for the body. |
| * |
| * @param url The java.net.URL object for the entity. |
| * @see SAXDriver#resolveEntity |
| * @see #pushString |
| * @see #sourceType |
| * @see #pushInput |
| * @see #detectEncoding |
| * @see #sourceType |
| * @see #readBuffer |
| */ |
| private void pushURL(boolean isPE, |
| String ename, |
| ExternalIdentifiers ids, |
| Reader reader, |
| InputStream stream, |
| String encoding, |
| boolean doResolve) |
| throws SAXException, IOException |
| { |
| boolean ignoreEncoding; |
| String systemId; |
| InputSource source; |
| |
| if (!isPE) |
| { |
| dataBufferFlush(); |
| } |
| |
| scratch.setPublicId(ids.publicId); |
| scratch.setSystemId(ids.systemId); |
| |
| // See if we should skip or substitute the entity. |
| // If we're not skipping, resolving reports startEntity() |
| // and updates the (handler's) stack of URIs. |
| if (doResolve) |
| { |
| // assert (stream == null && reader == null && encoding == null) |
| source = handler.resolveEntity(isPE, ename, scratch, ids.baseUri); |
| if (source == null) |
| { |
| handler.warn("skipping entity: " + ename); |
| handler.skippedEntity(ename); |
| if (isPE) |
| { |
| skippedPE = true; |
| } |
| return; |
| } |
| |
| // we might be using alternate IDs/encoding |
| systemId = source.getSystemId(); |
| // The following warning and setting systemId was deleted bcause |
| // the application has the option of not setting systemId |
| // provided that it has set the characte/byte stream. |
| /* |
| if (systemId == null) { |
| handler.warn ("missing system ID, using " + ids.systemId); |
| systemId = ids.systemId; |
| } |
| */ |
| } |
| else |
| { |
| // "[document]", or "[dtd]" via getExternalSubset() |
| scratch.setCharacterStream(reader); |
| scratch.setByteStream(stream); |
| scratch.setEncoding(encoding); |
| source = scratch; |
| systemId = ids.systemId; |
| if (handler.stringInterning) |
| { |
| handler.startExternalEntity(ename, systemId, |
| "[document]" == ename); |
| } |
| else |
| { |
| handler.startExternalEntity(ename, systemId, |
| "[document]".equals(ename)); |
| } |
| } |
| |
| // we may have been given I/O streams directly |
| if (source.getCharacterStream() != null) |
| { |
| if (source.getByteStream() != null) |
| error("InputSource has two streams!"); |
| reader = source.getCharacterStream(); |
| } |
| else if (source.getByteStream() != null) |
| { |
| encoding = source.getEncoding(); |
| if (encoding == null) |
| { |
| stream = source.getByteStream(); |
| } |
| else |
| { |
| try |
| { |
| reader = new InputStreamReader(source.getByteStream(), |
| encoding); |
| } |
| catch (IOException e) |
| { |
| stream = source.getByteStream(); |
| } |
| } |
| } |
| else if (systemId == null) |
| { |
| error("InputSource has no URI!"); |
| } |
| scratch.setCharacterStream(null); |
| scratch.setByteStream(null); |
| scratch.setEncoding(null); |
| |
| // Push the existing status. |
| pushInput(ename); |
| |
| // Create a new read buffer. |
| // (Note the four-character margin) |
| readBuffer = new char[READ_BUFFER_MAX + 4]; |
| readBufferPos = 0; |
| readBufferLength = 0; |
| readBufferOverflow = -1; |
| is = null; |
| line = 1; |
| column = 0; |
| currentByteCount = 0; |
| |
| // If there's an explicit character stream, just |
| // ignore encoding declarations. |
| if (reader != null) |
| { |
| sourceType = INPUT_READER; |
| this.reader = reader; |
| tryEncodingDecl(true); |
| return; |
| } |
| |
| // Else we handle the conversion, and need to ensure |
| // it's done right. |
| sourceType = INPUT_STREAM; |
| if (stream != null) |
| { |
| is = stream; |
| } |
| else |
| { |
| // We have to open our own stream to the URL. |
| URL url = new URL(systemId); |
| |
| externalEntity = url.openConnection(); |
| externalEntity.connect(); |
| is = externalEntity.getInputStream(); |
| } |
| |
| // If we get to here, there must be |
| // an InputStream available. |
| if (!is.markSupported()) |
| { |
| is = new BufferedInputStream(is); |
| } |
| |
| // Get any external encoding label. |
| if (encoding == null && externalEntity != null) |
| { |
| // External labels can be untrustworthy; filesystems in |
| // particular often have the wrong default for content |
| // that wasn't locally originated. Those we autodetect. |
| if (!"file".equals(externalEntity.getURL().getProtocol())) |
| { |
| int temp; |
| |
| // application/xml;charset=something;otherAttr=... |
| // ... with many variants on 'something' |
| encoding = externalEntity.getContentType(); |
| |
| // MHK code (fix for Saxon 5.5.1/007): |
| // protect against encoding==null |
| if (encoding == null) |
| { |
| temp = -1; |
| } |
| else |
| { |
| temp = encoding.indexOf("charset"); |
| } |
| |
| // RFC 2376 sez MIME text defaults to ASCII, but since the |
| // JDK will create a MIME type out of thin air, we always |
| // autodetect when there's no explicit charset attribute. |
| if (temp < 0) |
| { |
| encoding = null; // autodetect |
| } |
| else |
| { |
| // only this one attribute |
| if ((temp = encoding.indexOf(';')) > 0) |
| { |
| encoding = encoding.substring(0, temp); |
| } |
| |
| if ((temp = encoding.indexOf('=', temp + 7)) > 0) |
| { |
| encoding = encoding.substring(temp + 1); |
| |
| // attributes can have comment fields (RFC 822) |
| if ((temp = encoding.indexOf('(')) > 0) |
| { |
| encoding = encoding.substring(0, temp); |
| } |
| // ... and values may be quoted |
| if ((temp = encoding.indexOf('"')) > 0) |
| { |
| encoding = |
| encoding.substring(temp + 1, |
| encoding.indexOf('"', temp + 2)); |
| } |
| encoding.trim(); |
| } |
| else |
| { |
| handler.warn("ignoring illegal MIME attribute: " |
| + encoding); |
| encoding = null; |
| } |
| } |
| } |
| } |
| |
| // if we got an external encoding label, use it ... |
| if (encoding != null) |
| { |
| this.encoding = ENCODING_EXTERNAL; |
| setupDecoding(encoding); |
| ignoreEncoding = true; |
| |
| // ... else autodetect from first bytes. |
| } |
| else |
| { |
| detectEncoding(); |
| ignoreEncoding = false; |
| } |
| |
| // Read any XML or text declaration. |
| // If we autodetected, it may tell us the "real" encoding. |
| try |
| { |
| tryEncodingDecl(ignoreEncoding); |
| } |
| catch (UnsupportedEncodingException x) |
| { |
| encoding = x.getMessage(); |
| |
| // if we don't handle the declared encoding, |
| // try letting a JVM InputStreamReader do it |
| try |
| { |
| if (sourceType != INPUT_STREAM) |
| { |
| throw x; |
| } |
| |
| is.reset(); |
| readBufferPos = 0; |
| readBufferLength = 0; |
| readBufferOverflow = -1; |
| line = 1; |
| currentByteCount = column = 0; |
| |
| sourceType = INPUT_READER; |
| this.reader = new InputStreamReader(is, encoding); |
| is = null; |
| |
| tryEncodingDecl(true); |
| |
| } |
| catch (IOException e) |
| { |
| error("unsupported text encoding", |
| encoding, |
| null); |
| } |
| } |
| } |
| |
| /** |
| * Check for an encoding declaration. This is the second part of the |
| * XML encoding autodetection algorithm, relying on detectEncoding to |
| * get to the point that this part can read any encoding declaration |
| * in the document (using only US-ASCII characters). |
| * |
| * <p> Because this part starts to fill parser buffers with this data, |
| * it's tricky to setup a reader so that Java's built-in decoders can be |
| * used for the character encodings that aren't built in to this parser |
| * (such as EUC-JP, KOI8-R, Big5, etc). |
| * |
| * @return any encoding in the declaration, uppercased; or null |
| * @see detectEncoding |
| */ |
| private String tryEncodingDecl(boolean ignoreEncoding) |
| throws SAXException, IOException |
| { |
| // Read the XML/text declaration. |
| if (tryRead("<?xml")) |
| { |
| if (tryWhitespace()) |
| { |
| if (inputStack.size() > 0) |
| { |
| return parseTextDecl(ignoreEncoding); |
| } |
| else |
| { |
| return parseXMLDecl(ignoreEncoding); |
| } |
| } |
| else |
| { |
| // <?xml-stylesheet ...?> or similar |
| unread('l'); |
| unread('m'); |
| unread('x'); |
| unread('?'); |
| unread('<'); |
| } |
| } |
| return null; |
| } |
| |
| /** |
| * Attempt to detect the encoding of an entity. |
| * <p>The trick here (as suggested in the XML standard) is that |
| * any entity not in UTF-8, or in UCS-2 with a byte-order mark, |
| * <b>must</b> begin with an XML declaration or an encoding |
| * declaration; we simply have to look for "<?xml" in various |
| * encodings. |
| * <p>This method has no way to distinguish among 8-bit encodings. |
| * Instead, it sets up for UTF-8, then (possibly) revises its assumption |
| * later in setupDecoding (). Any ASCII-derived 8-bit encoding |
| * should work, but most will be rejected later by setupDecoding (). |
| * @see #tryEncoding (byte[], byte, byte, byte, byte) |
| * @see #tryEncoding (byte[], byte, byte) |
| * @see #setupDecoding |
| */ |
| private void detectEncoding() |
| throws SAXException, IOException |
| { |
| byte[] signature = new byte[4]; |
| |
| // Read the first four bytes for |
| // autodetection. |
| is.mark(4); |
| is.read(signature); |
| is.reset(); |
| |
| // |
| // FIRST: four byte encodings (who uses these?) |
| // |
| if (tryEncoding(signature, (byte) 0x00, (byte) 0x00, |
| (byte) 0x00, (byte) 0x3c)) |
| { |
| // UCS-4 must begin with "<?xml" |
| // 0x00 0x00 0x00 0x3c: UCS-4, big-endian (1234) |
| // "UTF-32BE" |
| encoding = ENCODING_UCS_4_1234; |
| } |
| else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x00, |
| (byte) 0x00, (byte) 0x00)) |
| { |
| // 0x3c 0x00 0x00 0x00: UCS-4, little-endian (4321) |
| // "UTF-32LE" |
| encoding = ENCODING_UCS_4_4321; |
| } |
| else if (tryEncoding(signature, (byte) 0x00, (byte) 0x00, |
| (byte) 0x3c, (byte) 0x00)) |
| { |
| // 0x00 0x00 0x3c 0x00: UCS-4, unusual (2143) |
| encoding = ENCODING_UCS_4_2143; |
| } |
| else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c, |
| (byte) 0x00, (byte) 0x00)) |
| { |
| // 0x00 0x3c 0x00 0x00: UCS-4, unusual (3421) |
| encoding = ENCODING_UCS_4_3412; |
| |
| // 00 00 fe ff UCS_4_1234 (with BOM) |
| // ff fe 00 00 UCS_4_4321 (with BOM) |
| } |
| |
| // |
| // SECOND: two byte encodings |
| // note ... with 1/14/2000 errata the XML spec identifies some |
| // more "broken UTF-16" autodetection cases, with no XML decl, |
| // which we don't handle here (that's legal too). |
| // |
| else if (tryEncoding(signature, (byte) 0xfe, (byte) 0xff)) |
| { |
| // UCS-2 with a byte-order marker. (UTF-16) |
| // 0xfe 0xff: UCS-2, big-endian (12) |
| encoding = ENCODING_UCS_2_12; |
| is.read(); is.read(); |
| } |
| else if (tryEncoding(signature, (byte) 0xff, (byte) 0xfe)) |
| { |
| // UCS-2 with a byte-order marker. (UTF-16) |
| // 0xff 0xfe: UCS-2, little-endian (21) |
| encoding = ENCODING_UCS_2_21; |
| is.read(); is.read(); |
| } |
| else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c, |
| (byte) 0x00, (byte) 0x3f)) |
| { |
| // UTF-16BE (otherwise, malformed UTF-16) |
| // 0x00 0x3c 0x00 0x3f: UCS-2, big-endian, no byte-order mark |
| encoding = ENCODING_UCS_2_12; |
| error("no byte-order mark for UCS-2 entity"); |
| } |
| else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x00, |
| (byte) 0x3f, (byte) 0x00)) |
| { |
| // UTF-16LE (otherwise, malformed UTF-16) |
| // 0x3c 0x00 0x3f 0x00: UCS-2, little-endian, no byte-order mark |
| encoding = ENCODING_UCS_2_21; |
| error("no byte-order mark for UCS-2 entity"); |
| } |
| |
| // |
| // THIRD: ASCII-derived encodings, fixed and variable lengths |
| // |
| else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x3f, |
| (byte) 0x78, (byte) 0x6d)) |
| { |
| // ASCII derived |
| // 0x3c 0x3f 0x78 0x6d: UTF-8 or other 8-bit markup (read ENCODING) |
| encoding = ENCODING_UTF_8; |
| prefetchASCIIEncodingDecl(); |
| } |
| else if (signature[0] == (byte) 0xef |
| && signature[1] == (byte) 0xbb |
| && signature[2] == (byte) 0xbf) |
| { |
| // 0xef 0xbb 0xbf: UTF-8 BOM (not part of document text) |
| // this un-needed notion slipped into XML 2nd ed through a |
| // "non-normative" erratum; now required by MSFT and UDDI, |
| // and E22 made it normative. |
| encoding = ENCODING_UTF_8; |
| is.read(); is.read(); is.read(); |
| } |
| else |
| { |
| // 4c 6f a7 94 ... we don't understand EBCDIC flavors |
| // ... but we COULD at least kick in some fixed code page |
| |
| // (default) UTF-8 without encoding/XML declaration |
| encoding = ENCODING_UTF_8; |
| } |
| } |
| |
| /** |
| * Check for a four-byte signature. |
| * <p>Utility routine for detectEncoding (). |
| * <p>Always looks for some part of "<?XML" in a specific encoding. |
| * @param sig The first four bytes read. |
| * @param b1 The first byte of the signature |
| * @param b2 The second byte of the signature |
| * @param b3 The third byte of the signature |
| * @param b4 The fourth byte of the signature |
| * @see #detectEncoding |
| */ |
| private static boolean tryEncoding(byte[] sig, byte b1, byte b2, |
| byte b3, byte b4) |
| { |
| return (sig[0] == b1 && sig[1] == b2 |
| && sig[2] == b3 && sig[3] == b4); |
| } |
| |
| /** |
| * Check for a two-byte signature. |
| * <p>Looks for a UCS-2 byte-order mark. |
| * <p>Utility routine for detectEncoding (). |
| * @param sig The first four bytes read. |
| * @param b1 The first byte of the signature |
| * @param b2 The second byte of the signature |
| * @see #detectEncoding |
| */ |
| private static boolean tryEncoding(byte[] sig, byte b1, byte b2) |
| { |
| return ((sig[0] == b1) && (sig[1] == b2)); |
| } |
| |
| /** |
| * This method pushes a string back onto input. |
| * <p>It is useful either as the expansion of an internal entity, |
| * or for backtracking during the parse. |
| * <p>Call pushCharArray () to do the actual work. |
| * @param s The string to push back onto input. |
| * @see #pushCharArray |
| */ |
| private void pushString(String ename, String s) |
| throws SAXException |
| { |
| char[] ch = s.toCharArray(); |
| pushCharArray(ename, ch, 0, ch.length); |
| } |
| |
| /** |
| * Push a new internal input source. |
| * <p>This method is useful for expanding an internal entity, |
| * or for unreading a string of characters. It creates a new |
| * readBuffer containing the characters in the array, instead |
| * of characters converted from an input byte stream. |
| * @param ch The char array to push. |
| * @see #pushString |
| * @see #pushURL |
| * @see #readBuffer |
| * @see #sourceType |
| * @see #pushInput |
| */ |
| private void pushCharArray(String ename, char[] ch, int start, int length) |
| throws SAXException |
| { |
| // Push the existing status |
| pushInput(ename); |
| if (ename != null && doReport) |
| { |
| dataBufferFlush(); |
| handler.startInternalEntity(ename); |
| } |
| sourceType = INPUT_INTERNAL; |
| readBuffer = ch; |
| readBufferPos = start; |
| readBufferLength = length; |
| readBufferOverflow = -1; |
| } |
| |
| /** |
| * Save the current input source onto the stack. |
| * <p>This method saves all of the global variables associated with |
| * the current input source, so that they can be restored when a new |
| * input source has finished. It also tests for entity recursion. |
| * <p>The method saves the following global variables onto a stack |
| * using a fixed-length array: |
| * <ol> |
| * <li>sourceType |
| * <li>externalEntity |
| * <li>readBuffer |
| * <li>readBufferPos |
| * <li>readBufferLength |
| * <li>line |
| * <li>encoding |
| * </ol> |
| * @param ename The name of the entity (if any) causing the new input. |
| * @see #popInput |
| * @see #sourceType |
| * @see #externalEntity |
| * @see #readBuffer |
| * @see #readBufferPos |
| * @see #readBufferLength |
| * @see #line |
| * @see #encoding |
| */ |
| private void pushInput(String ename) |
| throws SAXException |
| { |
| // Check for entity recursion. |
| if (ename != null) |
| { |
| Iterator entities = entityStack.iterator(); |
| while (entities.hasNext()) |
| { |
| String e = (String) entities.next(); |
| if (e != null && e == ename) |
| { |
| error("recursive reference to entity", ename, null); |
| } |
| } |
| } |
| entityStack.addLast(ename); |
| |
| // Don't bother if there is no current input. |
| if (sourceType == INPUT_NONE) |
| { |
| return; |
| } |
| |
| // Set up a snapshot of the current |
| // input source. |
| Input input = new Input(); |
| |
| input.sourceType = sourceType; |
| input.externalEntity = externalEntity; |
| input.readBuffer = readBuffer; |
| input.readBufferPos = readBufferPos; |
| input.readBufferLength = readBufferLength; |
| input.line = line; |
| input.encoding = encoding; |
| input.readBufferOverflow = readBufferOverflow; |
| input.is = is; |
| input.currentByteCount = currentByteCount; |
| input.column = column; |
| input.reader = reader; |
| |
| // Push it onto the stack. |
| inputStack.addLast(input); |
| } |
| |
| /** |
| * Restore a previous input source. |
| * <p>This method restores all of the global variables associated with |
| * the current input source. |
| * @exception java.io.EOFException |
| * If there are no more entries on the input stack. |
| * @see #pushInput |
| * @see #sourceType |
| * @see #externalEntity |
| * @see #readBuffer |
| * @see #readBufferPos |
| * @see #readBufferLength |
| * @see #line |
| * @see #encoding |
| */ |
| private void popInput() |
| throws SAXException, IOException |
| { |
| String ename = (String) entityStack.removeLast(); |
| |
| if (ename != null && doReport) |
| { |
| dataBufferFlush(); |
| } |
| switch (sourceType) |
| { |
| case INPUT_STREAM: |
| handler.endExternalEntity(ename); |
| is.close(); |
| break; |
| case INPUT_READER: |
| handler.endExternalEntity(ename); |
| reader.close(); |
| break; |
| case INPUT_INTERNAL: |
| if (ename != null && doReport) |
| { |
| handler.endInternalEntity(ename); |
| } |
| break; |
| } |
| |
| // Throw an EOFException if there |
| // is nothing else to pop. |
| if (inputStack.isEmpty()) |
| { |
| throw new EOFException("no more input"); |
| } |
| |
| Input input = (Input) inputStack.removeLast(); |
| |
| sourceType = input.sourceType; |
| externalEntity = input.externalEntity; |
| readBuffer = input.readBuffer; |
| readBufferPos = input.readBufferPos; |
| readBufferLength = input.readBufferLength; |
| line = input.line; |
| encoding = input.encoding; |
| readBufferOverflow = input.readBufferOverflow; |
| is = input.is; |
| currentByteCount = input.currentByteCount; |
| column = input.column; |
| reader = input.reader; |
| } |
| |
| /** |
| * Return true if we can read the expected character. |
| * <p>Note that the character will be removed from the input stream |
| * on success, but will be put back on failure. Do not attempt to |
| * read the character again if the method succeeds. |
| * @param delim The character that should appear next. For a |
| * insensitive match, you must supply this in upper-case. |
| * @return true if the character was successfully read, or false if |
| * it was not. |
| * @see #tryRead (String) |
| */ |
| private boolean tryRead(char delim) |
| throws SAXException, IOException |
| { |
| char c; |
| |
| // Read the character |
| c = readCh(); |
| |
| // Test for a match, and push the character |
| // back if the match fails. |
| if (c == delim) |
| { |
| return true; |
| } |
| else |
| { |
| unread(c); |
| return false; |
| } |
| } |
| |
| /** |
| * Return true if we can read the expected string. |
| * <p>This is simply a convenience method. |
| * <p>Note that the string will be removed from the input stream |
| * on success, but will be put back on failure. Do not attempt to |
| * read the string again if the method succeeds. |
| * <p>This method will push back a character rather than an |
| * array whenever possible (probably the majority of cases). |
| * @param delim The string that should appear next. |
| * @return true if the string was successfully read, or false if |
| * it was not. |
| * @see #tryRead (char) |
| */ |
| private boolean tryRead(String delim) |
| throws SAXException, IOException |
| { |
| return tryRead(delim.toCharArray()); |
| } |
| |
| private boolean tryRead(char[] ch) |
| throws SAXException, IOException |
| { |
| char c; |
| |
| // Compare the input, character- |
| // by character. |
| |
| for (int i = 0; i < ch.length; i++) |
| { |
| c = readCh(); |
| if (c != ch[i]) |
| { |
| unread(c); |
| if (i != 0) |
| { |
| unread(ch, i); |
| } |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| /** |
| * Return true if we can read some whitespace. |
| * <p>This is simply a convenience method. |
| * <p>This method will push back a character rather than an |
| * array whenever possible (probably the majority of cases). |
| * @return true if whitespace was found. |
| */ |
| private boolean tryWhitespace() |
| throws SAXException, IOException |
| { |
| char c; |
| c = readCh(); |
| if (isWhitespace(c)) |
| { |
| skipWhitespace(); |
| return true; |
| } |
| else |
| { |
| unread(c); |
| return false; |
| } |
| } |
| |
| /** |
| * Read all data until we find the specified string. |
| * This is useful for scanning CDATA sections and PIs. |
| * <p>This is inefficient right now, since it calls tryRead () |
| * for every character. |
| * @param delim The string delimiter |
| * @see #tryRead (String, boolean) |
| * @see #readCh |
| */ |
| private void parseUntil(String delim) |
| throws SAXException, IOException |
| { |
| parseUntil(delim.toCharArray()); |
| } |
| |
| private void parseUntil(char[] delim) |
| throws SAXException, IOException |
| { |
| char c; |
| int startLine = line; |
| |
| try |
| { |
| while (!tryRead(delim)) |
| { |
| c = readCh(); |
| dataBufferAppend(c); |
| } |
| } |
| catch (EOFException e) |
| { |
| error("end of input while looking for delimiter " |
| + "(started on line " + startLine |
| + ')', null, new String(delim)); |
| } |
| } |
| |
| ////////////////////////////////////////////////////////////////////// |
| // Low-level I/O. |
| ////////////////////////////////////////////////////////////////////// |
| |
| /** |
| * Prefetch US-ASCII XML/text decl from input stream into read buffer. |
| * Doesn't buffer more than absolutely needed, so that when an encoding |
| * decl says we need to create an InputStreamReader, we can discard our |
| * buffer and reset(). Caller knows the first chars of the decl exist |
| * in the input stream. |
| */ |
| private void prefetchASCIIEncodingDecl() |
| throws SAXException, IOException |
| { |
| int ch; |
| readBufferPos = readBufferLength = 0; |
| |
| is.mark(readBuffer.length); |
| while (true) |
| { |
| ch = is.read(); |
| readBuffer[readBufferLength++] = (char) ch; |
| switch (ch) |
| { |
| case (int) '>': |
| return; |
| case -1: |
| error("file ends before end of XML or encoding declaration.", |
| null, "?>"); |
| } |
| if (readBuffer.length == readBufferLength) |
| { |
| error("unfinished XML or encoding declaration"); |
| } |
| } |
| } |
| |
| /** |
| * Read a chunk of data from an external input source. |
| * <p>This is simply a front-end that fills the rawReadBuffer |
| * with bytes, then calls the appropriate encoding handler. |
| * @see #encoding |
| * @see #rawReadBuffer |
| * @see #readBuffer |
| * @see #filterCR |
| * @see #copyUtf8ReadBuffer |
| * @see #copyIso8859_1ReadBuffer |
| * @see #copyUcs_2ReadBuffer |
| * @see #copyUcs_4ReadBuffer |
| */ |
| private void readDataChunk() |
| throws SAXException, IOException |
| { |
| int count; |
| |
| // See if we have any overflow (filterCR sets for CR at end) |
| if (readBufferOverflow > -1) |
| { |
| readBuffer[0] = (char) readBufferOverflow; |
| readBufferOverflow = -1; |
| readBufferPos = 1; |
| sawCR = true; |
| } |
| else |
| { |
| readBufferPos = 0; |
| sawCR = false; |
| } |
| |
| // input from a character stream. |
| if (sourceType == INPUT_READER) |
| { |
| count = reader.read(readBuffer, |
| readBufferPos, READ_BUFFER_MAX - readBufferPos); |
| if (count < 0) |
| { |
| readBufferLength = readBufferPos; |
| } |
| else |
| { |
| readBufferLength = readBufferPos + count; |
| } |
| if (readBufferLength > 0) |
| { |
| filterCR(count >= 0); |
| } |
| sawCR = false; |
| return; |
| } |
| |
| // Read as many bytes as possible into the raw buffer. |
| count = is.read(rawReadBuffer, 0, READ_BUFFER_MAX); |
| |
| // Dispatch to an encoding-specific reader method to populate |
| // the readBuffer. In most parser speed profiles, these routines |
| // show up at the top of the CPU usage chart. |
| if (count > 0) |
| { |
| switch (encoding) |
| { |
| // one byte builtins |
| case ENCODING_ASCII: |
| copyIso8859_1ReadBuffer(count, (char) 0x0080); |
| break; |
| case ENCODING_UTF_8: |
| copyUtf8ReadBuffer(count); |
| break; |
| case ENCODING_ISO_8859_1: |
| copyIso8859_1ReadBuffer(count, (char) 0); |
| break; |
| |
| // two byte builtins |
| case ENCODING_UCS_2_12: |
| copyUcs2ReadBuffer(count, 8, 0); |
| break; |
| case ENCODING_UCS_2_21: |
| copyUcs2ReadBuffer(count, 0, 8); |
| break; |
| |
| // four byte builtins |
| case ENCODING_UCS_4_1234: |
| copyUcs4ReadBuffer(count, 24, 16, 8, 0); |
| break; |
| case ENCODING_UCS_4_4321: |
| copyUcs4ReadBuffer(count, 0, 8, 16, 24); |
| break; |
| case ENCODING_UCS_4_2143: |
| copyUcs4ReadBuffer(count, 16, 24, 0, 8); |
| break; |
| case ENCODING_UCS_4_3412: |
| copyUcs4ReadBuffer(count, 8, 0, 24, 16); |
| break; |
| } |
| } |
| else |
| { |
| readBufferLength = readBufferPos; |
| } |
| |
| readBufferPos = 0; |
| |
| // Filter out all carriage returns if we've seen any |
| // (including any saved from a previous read) |
| if (sawCR) |
| { |
| filterCR(count >= 0); |
| sawCR = false; |
| |
| // must actively report EOF, lest some CRs get lost. |
| if (readBufferLength == 0 && count >= 0) |
| { |
| readDataChunk(); |
| } |
| } |
| |
| if (count > 0) |
| { |
| currentByteCount += count; |
| } |
| } |
| |
| /** |
| * Filter carriage returns in the read buffer. |
| * CRLF becomes LF; CR becomes LF. |
| * @param moreData true iff more data might come from the same source |
| * @see #readDataChunk |
| * @see #readBuffer |
| * @see #readBufferOverflow |
| */ |
| private void filterCR(boolean moreData) |
| { |
| int i, j; |
| |
| readBufferOverflow = -1; |
| |
| loop: |
| for (i = j = readBufferPos; j < readBufferLength; i++, j++) |
| { |
| switch (readBuffer[j]) |
| { |
| case '\r': |
| if (j == readBufferLength - 1) |
| { |
| if (moreData) |
| { |
| readBufferOverflow = '\r'; |
| readBufferLength--; |
| } |
| else // CR at end of buffer |
| { |
| readBuffer[i++] = '\n'; |
| } |
| break loop; |
| } |
| else if (readBuffer[j + 1] == '\n') |
| { |
| j++; |
| } |
| readBuffer[i] = '\n'; |
| break; |
| |
| case '\n': |
| default: |
| readBuffer[i] = readBuffer[j]; |
| break; |
| } |
| } |
| readBufferLength = i; |
| } |
| |
| /** |
| * Convert a buffer of UTF-8-encoded bytes into UTF-16 characters. |
| * <p>When readDataChunk () calls this method, the raw bytes are in |
| * rawReadBuffer, and the final characters will appear in |
| * readBuffer. |
| * <p>Note that as of Unicode 3.1, good practice became a requirement, |
| * so that each Unicode character has exactly one UTF-8 representation. |
| * @param count The number of bytes to convert. |
| * @see #readDataChunk |
| * @see #rawReadBuffer |
| * @see #readBuffer |
| * @see #getNextUtf8Byte |
| */ |
| private void copyUtf8ReadBuffer(int count) |
| throws SAXException, IOException |
| { |
| int i = 0; |
| int j = readBufferPos; |
| int b1; |
| char c = 0; |
| |
| /* |
| // check once, so the runtime won't (if it's smart enough) |
| if (count < 0 || count > rawReadBuffer.length) |
| throw new ArrayIndexOutOfBoundsException (Integer.toString (count)); |
| */ |
| |
| while (i < count) |
| { |
| b1 = rawReadBuffer[i++]; |
| |
| // Determine whether we are dealing |
| // with a one-, two-, three-, or four- |
| // byte sequence. |
| if (b1 < 0) |
| { |
| if ((b1 & 0xe0) == 0xc0) |
| { |
| // 2-byte sequence: 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx |
| c = (char) (((b1 & 0x1f) << 6) |
| | getNextUtf8Byte(i++, count)); |
| if (c < 0x0080) |
| { |
| encodingError("Illegal two byte UTF-8 sequence", |
| c, 0); |
| } |
| |
| //Sec 2.11 |
| // [1] the two-character sequence #xD #xA |
| // [2] the two-character sequence #xD #x85 |
| if ((c == 0x0085 || c == 0x000a) && sawCR) |
| { |
| continue; |
| } |
| |
| // Sec 2.11 |
| // [3] the single character #x85 |
| |
| if (c == 0x0085 && xmlVersion == XML_11) |
| { |
| readBuffer[j++] = '\r'; |
| } |
| } |
| else if ((b1 & 0xf0) == 0xe0) |
| { |
| // 3-byte sequence: |
| // zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx |
| // most CJKV characters |
| c = (char) (((b1 & 0x0f) << 12) | |
| (getNextUtf8Byte(i++, count) << 6) | |
| getNextUtf8Byte(i++, count)); |
| //sec 2.11 |
| //[4] the single character #x2028 |
| if (c == 0x2028 && xmlVersion == XML_11) |
| { |
| readBuffer[j++] = '\r'; |
| sawCR = true; |
| continue; |
| } |
| if (c < 0x0800 || (c >= 0xd800 && c <= 0xdfff)) |
| { |
| encodingError("Illegal three byte UTF-8 sequence", |
| c, 0); |
| } |
| } |
| else if ((b1 & 0xf8) == 0xf0) |
| { |
| // 4-byte sequence: 11101110wwwwzzzzyy + 110111yyyyxxxxxx |
| // = 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx |
| // (uuuuu = wwww + 1) |
| // "Surrogate Pairs" ... from the "Astral Planes" |
| // Unicode 3.1 assigned the first characters there |
| int iso646 = b1 & 07; |
| iso646 = (iso646 << 6) + getNextUtf8Byte(i++, count); |
| iso646 = (iso646 << 6) + getNextUtf8Byte(i++, count); |
| iso646 = (iso646 << 6) + getNextUtf8Byte(i++, count); |
| |
| if (iso646 <= 0xffff) |
| { |
| encodingError("Illegal four byte UTF-8 sequence", |
| iso646, 0); |
| } |
| else |
| { |
| if (iso646 > 0x0010ffff) |
| { |
| encodingError("UTF-8 value out of range for Unicode", |
| iso646, 0); |
| } |
| iso646 -= 0x010000; |
| readBuffer[j++] = (char) (0xd800 | (iso646 >> 10)); |
| readBuffer[j++] = (char) (0xdc00 | (iso646 & 0x03ff)); |
| continue; |
| } |
| } |
| else |
| { |
| // The five and six byte encodings aren't supported; |
| // they exceed the Unicode (and XML) range. |
| encodingError("unsupported five or six byte UTF-8 sequence", |
| 0xff & b1, i); |
| // NOTREACHED |
| c = 0; |
| } |
| } |
| else |
| { |
| // 1-byte sequence: 000000000xxxxxxx = 0xxxxxxx |
| // (US-ASCII character, "common" case, one branch to here) |
| c = (char) b1; |
| } |
| readBuffer[j++] = c; |
| if (c == '\r') |
| { |
| sawCR = true; |
| } |
| } |
| // How many characters have we read? |
| readBufferLength = j; |
| } |
| |
| /** |
| * Return the next byte value in a UTF-8 sequence. |
| * If it is not possible to get a byte from the current |
| * entity, throw an exception. |
| * @param pos The current position in the rawReadBuffer. |
| * @param count The number of bytes in the rawReadBuffer |
| * @return The significant six bits of a non-initial byte in |
| * a UTF-8 sequence. |
| * @exception EOFException If the sequence is incomplete. |
| */ |
| private int getNextUtf8Byte(int pos, int count) |
| throws SAXException, IOException |
| { |
| int val; |
| |
| // Take a character from the buffer |
| // or from the actual input stream. |
| if (pos < count) |
| { |
| val = rawReadBuffer[pos]; |
| } |
| else |
| { |
| val = is.read(); |
| if (val == -1) |
| { |
| encodingError("unfinished multi-byte UTF-8 sequence at EOF", |
| -1, pos); |
| } |
| } |
| |
| // Check for the correct bits at the start. |
| if ((val & 0xc0) != 0x80) |
| { |
| encodingError("bad continuation of multi-byte UTF-8 sequence", |
| val, pos + 1); |
| } |
| |
| // Return the significant bits. |
| return (val & 0x3f); |
| } |
| |
| /** |
| * Convert a buffer of US-ASCII or ISO-8859-1-encoded bytes into |
| * UTF-16 characters. |
| * |
| * <p>When readDataChunk () calls this method, the raw bytes are in |
| * rawReadBuffer, and the final characters will appear in |
| * readBuffer. |
| * |
| * @param count The number of bytes to convert. |
| * @param mask For ASCII conversion, 0x7f; else, 0xff. |
| * @see #readDataChunk |
| * @see #rawReadBuffer |
| * @see #readBuffer |
| */ |
| private void copyIso8859_1ReadBuffer(int count, char mask) |
| throws IOException |
| { |
| int i, j; |
| for (i = 0, j = readBufferPos; i < count; i++, j++) |
| { |
| char c = (char) (rawReadBuffer[i] & 0xff); |
| if ((c & mask) != 0) |
| { |
| throw new CharConversionException("non-ASCII character U+" |
| + Integer.toHexString(c)); |
| } |
| if (c == 0x0085 && xmlVersion == XML_11) |
| { |
| c = '\r'; |
| } |
| readBuffer[j] = c; |
| if (c == '\r') |
| { |
| sawCR = true; |
| } |
| } |
| readBufferLength = j; |
| } |
| |
| /** |
| * Convert a buffer of UCS-2-encoded bytes into UTF-16 characters |
| * (as used in Java string manipulation). |
| * |
| * <p>When readDataChunk () calls this method, the raw bytes are in |
| * rawReadBuffer, and the final characters will appear in |
| * readBuffer. |
| * @param count The number of bytes to convert. |
| * @param shift1 The number of bits to shift byte 1. |
| * @param shift2 The number of bits to shift byte 2 |
| * @see #readDataChunk |
| * @see #rawReadBuffer |
| * @see #readBuffer |
| */ |
| private void copyUcs2ReadBuffer(int count, int shift1, int shift2) |
| throws SAXException |
| { |
| int j = readBufferPos; |
| |
| if (count > 0 && (count % 2) != 0) |
| { |
| encodingError("odd number of bytes in UCS-2 encoding", -1, count); |
| } |
| // The loops are faster with less internal brancing; hence two |
| if (shift1 == 0) |
| { // "UTF-16-LE" |
| for (int i = 0; i < count; i += 2) |
| { |
| char c = (char) (rawReadBuffer[i + 1] << 8); |
| c |= 0xff & rawReadBuffer[i]; |
| readBuffer[j++] = c; |
| if (c == '\r') |
| { |
| sawCR = true; |
| } |
| } |
| } |
| else |
| { // "UTF-16-BE" |
| for (int i = 0; i < count; i += 2) |
| { |
| char c = (char) (rawReadBuffer[i] << 8); |
| c |= 0xff & rawReadBuffer[i + 1]; |
| readBuffer[j++] = c; |
| if (c == '\r') |
| { |
| sawCR = true; |
| } |
| } |
| } |
| readBufferLength = j; |
| } |
| |
| /** |
| * Convert a buffer of UCS-4-encoded bytes into UTF-16 characters. |
| * |
| * <p>When readDataChunk () calls this method, the raw bytes are in |
| * rawReadBuffer, and the final characters will appear in |
| * readBuffer. |
| * <p>Java has Unicode chars, and this routine uses surrogate pairs |
| * for ISO-10646 values between 0x00010000 and 0x000fffff. An |
| * exception is thrown if the ISO-10646 character has no Unicode |
| * representation. |
| * |
| * @param count The number of bytes to convert. |
| * @param shift1 The number of bits to shift byte 1. |
| * @param shift2 The number of bits to shift byte 2 |
| * @param shift3 The number of bits to shift byte 2 |
| * @param shift4 The number of bits to shift byte 2 |
| * @see #readDataChunk |
| * @see #rawReadBuffer |
| * @see #readBuffer |
| */ |
| private void copyUcs4ReadBuffer(int count, int shift1, int shift2, |
| int shift3, int shift4) |
| throws SAXException |
| { |
| int j = readBufferPos; |
| |
| if (count > 0 && (count % 4) != 0) |
| { |
| encodingError("number of bytes in UCS-4 encoding " + |
| "not divisible by 4", |
| -1, count); |
| } |
| for (int i = 0; i < count; i += 4) |
| { |
| int value = (((rawReadBuffer [i] & 0xff) << shift1) | |
| ((rawReadBuffer [i + 1] & 0xff) << shift2) | |
| ((rawReadBuffer [i + 2] & 0xff) << shift3) | |
| ((rawReadBuffer [i + 3] & 0xff) << shift4)); |
| if (value < 0x0000ffff) |
| { |
| readBuffer [j++] = (char) value; |
| if (value == (int) '\r') |
| { |
| sawCR = true; |
| } |
| } |
| else if (value < 0x0010ffff) |
| { |
| value -= 0x010000; |
| readBuffer[j++] = (char) (0xd8 | ((value >> 10) & 0x03ff)); |
| readBuffer[j++] = (char) (0xdc | (value & 0x03ff)); |
| } |
| else |
| { |
| encodingError("UCS-4 value out of range for Unicode", |
| value, i); |
| } |
| } |
| readBufferLength = j; |
| } |
| |
| /** |
| * Report a character encoding error. |
| */ |
| private void encodingError(String message, int value, int offset) |
| throws SAXException |
| { |
| if (value != -1) |
| { |
| message = message + " (character code: 0x" + |
| Integer.toHexString(value) + ')'; |
| error(message); |
| } |
| } |
| |
| ////////////////////////////////////////////////////////////////////// |
| // Local Variables. |
| ////////////////////////////////////////////////////////////////////// |
| |
| /** |
| * Re-initialize the variables for each parse. |
| */ |
| private void initializeVariables() |
| { |
| // First line |
| line = 1; |
| column = 0; |
| |
| // Set up the buffers for data and names |
| dataBufferPos = 0; |
| dataBuffer = new char[DATA_BUFFER_INITIAL]; |
| nameBufferPos = 0; |
| nameBuffer = new char[NAME_BUFFER_INITIAL]; |
| |
| // Set up the DTD hash tables |
| elementInfo = new HashMap(); |
| entityInfo = new HashMap(); |
| notationInfo = new HashMap(); |
| skippedPE = false; |
| |
| // Set up the variables for the current |
| // element context. |
| currentElement = null; |
| currentElementContent = CONTENT_UNDECLARED; |
| |
| // Set up the input variables |
| sourceType = INPUT_NONE; |
| inputStack = new LinkedList(); |
| entityStack = new LinkedList(); |
| externalEntity = null; |
| tagAttributePos = 0; |
| tagAttributes = new String[100]; |
| rawReadBuffer = new byte[READ_BUFFER_MAX]; |
| readBufferOverflow = -1; |
| |
| scratch = new InputSource(); |
| |
| inLiteral = false; |
| expandPE = false; |
| peIsError = false; |
| |
| doReport = false; |
| |
| inCDATA = false; |
| |
| symbolTable = new Object[SYMBOL_TABLE_LENGTH][]; |
| } |
| |
| static class ExternalIdentifiers |
| { |
| |
| String publicId; |
| String systemId; |
| String baseUri; |
| |
| ExternalIdentifiers() |
| { |
| } |
| |
| ExternalIdentifiers(String publicId, String systemId, String baseUri) |
| { |
| this.publicId = publicId; |
| this.systemId = systemId; |
| this.baseUri = baseUri; |
| } |
| |
| } |
| |
| static class EntityInfo |
| { |
| |
| int type; |
| ExternalIdentifiers ids; |
| String value; |
| String notationName; |
| |
| } |
| |
| static class AttributeDecl |
| { |
| |
| String type; |
| String value; |
| int valueType; |
| String enumeration; |
| String defaultValue; |
| |
| } |
| |
| static class ElementDecl |
| { |
| |
| int contentType; |
| String contentModel; |
| HashMap attributes; |
| |
| } |
| |
| static class Input |
| { |
| |
| int sourceType; |
| URLConnection externalEntity; |
| char[] readBuffer; |
| int readBufferPos; |
| int readBufferLength; |
| int line; |
| int encoding; |
| int readBufferOverflow; |
| InputStream is; |
| int currentByteCount; |
| int column; |
| Reader reader; |
| |
| } |
| |
| } |
| |