| /* XMLWriter.java -- |
| Copyright (C) 1999,2000,2001 Free Software Foundation, Inc. |
| |
| This file is part of GNU Classpath. |
| |
| GNU Classpath is free software; you can redistribute it and/or modify |
| it under the terms of the GNU General Public License as published by |
| the Free Software Foundation; either version 2, or (at your option) |
| any later version. |
| |
| GNU Classpath is distributed in the hope that it will be useful, but |
| WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| General Public License for more details. |
| |
| You should have received a copy of the GNU General Public License |
| along with GNU Classpath; see the file COPYING. If not, write to the |
| Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA |
| 02110-1301 USA. |
| |
| Linking this library statically or dynamically with other modules is |
| making a combined work based on this library. Thus, the terms and |
| conditions of the GNU General Public License cover the whole |
| combination. |
| |
| As a special exception, the copyright holders of this library give you |
| permission to link this library with independent modules to produce an |
| executable, regardless of the license terms of these independent |
| modules, and to copy and distribute the resulting executable under |
| terms of your choice, provided that you also meet, for each linked |
| independent module, the terms and conditions of the license of that |
| module. An independent module is a module which is not derived from |
| or based on this library. If you modify this library, you may extend |
| this exception to your version of the library, but you are not |
| obligated to do so. If you do not wish to do so, delete this |
| exception statement from your version. */ |
| |
| package gnu.xml.util; |
| |
| import java.io.BufferedWriter; |
| import java.io.CharConversionException; |
| import java.io.IOException; |
| import java.io.OutputStream; |
| import java.io.OutputStreamWriter; |
| import java.io.Writer; |
| import java.util.Stack; |
| |
| import org.xml.sax.*; |
| import org.xml.sax.ext.*; |
| import org.xml.sax.helpers.*; |
| |
| |
| /** |
| * This class is a SAX handler which writes all its input as a well formed |
| * XML or XHTML document. If driven using SAX2 events, this output may |
| * include a recreated document type declaration, subject to limitations |
| * of SAX (no internal subset exposed) or DOM (the important declarations, |
| * with their documentation, are discarded). |
| * |
| * <p> By default, text is generated "as-is", but some optional modes |
| * are supported. Pretty-printing is supported, to make life easier |
 * for people reading the output.  XHTML (1.0) output can be made
| * particularly pretty; all the built-in character entities are known. |
| * Canonical XML can also be generated, assuming the input is properly |
| * formed. |
| * |
| * <hr> |
| * |
| * <p> Some of the methods on this class are intended for applications to |
| * use directly, rather than as pure SAX2 event callbacks. Some of those |
| * methods access the JavaBeans properties (used to tweak output formats, |
| * for example canonicalization and pretty printing). Subclasses |
| * are expected to add new behaviors, not to modify current behavior, so |
| * many such methods are final.</p> |
| * |
| * <p> The <em>write*()</em> methods may be slightly simpler for some |
| * applications to use than direct callbacks. For example, they support |
| * a simple policy for encoding data items as the content of a single element. |
| * |
| * <p> To reuse an XMLWriter you must provide it with a new Writer, since |
| * this handler closes the writer it was given as part of its endDocument() |
| * handling. (XML documents have an end of input, and the way to encode |
| * that on a stream is to close it.) </p> |
| * |
| * <hr> |
| * |
| * <p> Note that any relative URIs in the source document, as found in |
| * entity and notation declarations, ought to have been fully resolved by |
| * the parser providing events to this handler. This means that the |
| * output text should only have fully resolved URIs, which may not be |
| * the desired behavior in cases where later binding is desired. </p> |
| * |
| * <p> <em>Note that due to SAX2 defaults, you may need to manually |
| * ensure that the input events are XML-conformant with respect to namespace |
| * prefixes and declarations. {@link gnu.xml.pipeline.NSFilter} is |
| * one solution to this problem, in the context of processing pipelines.</em> |
| * Something as simple as connecting this handler to a parser might not |
| * generate the correct output. Another workaround is to ensure that the |
| * <em>namespace-prefixes</em> feature is always set to true, if you're |
| * hooking this directly up to some XMLReader implementation. |
| * |
| * @see gnu.xml.pipeline.TextConsumer |
| * |
| * @author David Brownell |
| * |
| * @deprecated Please use the javax.xml.stream APIs instead |
| */ |
| public class XMLWriter |
| implements ContentHandler, LexicalHandler, DTDHandler, DeclHandler |
| { |
| // text prints/escapes differently depending on context |
| // CTX_ENTITY ... entity literal value |
| // CTX_ATTRIBUTE ... attribute literal value |
| // CTX_CONTENT ... content of an element |
| // CTX_UNPARSED ... CDATA, comment, PI, names, etc |
| // CTX_NAME ... name or nmtoken, no escapes possible |
| private static final int CTX_ENTITY = 1; |
| private static final int CTX_ATTRIBUTE = 2; |
| private static final int CTX_CONTENT = 3; |
| private static final int CTX_UNPARSED = 4; |
| private static final int CTX_NAME = 5; |
| |
| // FIXME: names (element, attribute, PI, notation, etc) are not |
| // currently written out with range checks (escapeChars). |
| // In non-XHTML, some names can't be directly written; panic! |
| |
| private static String sysEOL; |
| |
| static { |
| try { |
| sysEOL = System.getProperty ("line.separator", "\n"); |
| |
| // don't use the system's EOL if it's illegal XML. |
| if (!isLineEnd (sysEOL)) |
| sysEOL = "\n"; |
| |
| } catch (SecurityException e) { |
| sysEOL = "\n"; |
| } |
| } |
| |
| private static boolean isLineEnd (String eol) |
| { |
| return "\n".equals (eol) |
| || "\r".equals (eol) |
| || "\r\n".equals (eol); |
| } |
| |
| private Writer out; |
| private boolean inCDATA; |
| private int elementNestLevel; |
| private String eol = sysEOL; |
| |
| private short dangerMask; |
| private StringBuffer stringBuf; |
| private Locator locator; |
| private ErrorHandler errHandler; |
| |
| private boolean expandingEntities = false; |
| private int entityNestLevel; |
| private boolean xhtml; |
| private boolean startedDoctype; |
| private String encoding; |
| |
| private boolean canonical; |
| private boolean inDoctype; |
| private boolean inEpilogue; |
| |
| // pretty printing controls |
| private boolean prettyPrinting; |
| private int column; |
| private boolean noWrap; |
| private Stack space = new Stack (); |
| |
| // this is not a hard'n'fast rule -- longer lines are OK, |
| // but are to be avoided. Here, prettyprinting is more to |
| // show structure "cleanly" than to be precise about it. |
| // better to have ragged layout than one line 24Kb long. |
| private static final int lineLength = 75; |
| |
| |
| /** |
| * Constructs this handler with System.out used to write SAX events |
| * using the UTF-8 encoding. Avoid using this except when you know |
| * it's safe to close System.out at the end of the document. |
| */ |
public XMLWriter ()
throws IOException
{
    // standard output, via the OutputStream constructor
    this (System.out);
}
| |
| /** |
| * Constructs a handler which writes all input to the output stream |
| * in the UTF-8 encoding, and closes it when endDocument is called. |
| * (Yes it's annoying that this throws an exception -- but there's |
| * really no way around it, since it's barely possible a JDK may |
| * exist somewhere that doesn't know how to emit UTF-8.) |
| */ |
public XMLWriter (OutputStream out)
throws IOException
{
    // "UTF8" is the encoding name passed to OutputStreamWriter; the
    // IOException in the signature covers a platform rejecting it
    this (
        new OutputStreamWriter (out, "UTF8")
    );
}
| |
| /** |
| * Constructs a handler which writes all input to the writer, and then |
| * closes the writer when the document ends. If an XML declaration is |
| * written onto the output, and this class can determine the name of |
| * the character encoding for this writer, that encoding name will be |
| * included in the XML declaration. |
| * |
| * <P> See the description of the constructor which takes an encoding |
 * name for important information about selection of encodings.
| * |
| * @param writer XML text is written to this writer. |
| */ |
public XMLWriter (Writer writer)
{
    // no explicit encoding name: setEncoding() asks the writer itself
    // when it is an OutputStreamWriter
    this (writer, (String) null);
}
| |
| /** |
| * Constructs a handler which writes all input to the writer, and then |
| * closes the writer when the document ends. If an XML declaration is |
| * written onto the output, this class will use the specified encoding |
| * name in that declaration. If no encoding name is specified, no |
| * encoding name will be declared unless this class can otherwise |
| * determine the name of the character encoding for this writer. |
| * |
| * <P> At this time, only the UTF-8 ("UTF8") and UTF-16 ("Unicode") |
| * output encodings are fully lossless with respect to XML data. If you |
| * use any other encoding you risk having your data be silently mangled |
| * on output, as the standard Java character encoding subsystem silently |
| * maps non-encodable characters to a question mark ("?") and will not |
| * report such errors to applications. |
| * |
| * <p> For a few other encodings the risk can be reduced. If the writer is |
| * a java.io.OutputStreamWriter, and uses either the ISO-8859-1 ("8859_1", |
| * "ISO8859_1", etc) or US-ASCII ("ASCII") encodings, content which |
| * can't be encoded in those encodings will be written safely. Where |
| * relevant, the XHTML entity names will be used; otherwise, numeric |
| * character references will be emitted. |
| * |
| * <P> However, there remain a number of cases where substituting such |
| * entity or character references is not an option. Such references are |
| * not usable within a DTD, comment, PI, or CDATA section. Neither may |
| * they be used when element, attribute, entity, or notation names have |
| * the problematic characters. |
| * |
| * @param writer XML text is written to this writer. |
| * @param encoding if non-null, and an XML declaration is written, |
| * this is the name that will be used for the character encoding. |
| */ |
public XMLWriter (Writer writer, String encoding)
{
    // all setup is shared with the reuse path
    this.setWriter (writer, encoding);
}
| |
| private void setEncoding (String encoding) |
| { |
| if (encoding == null && out instanceof OutputStreamWriter) |
| encoding = ((OutputStreamWriter)out).getEncoding (); |
| |
| if (encoding != null) { |
| encoding = encoding.toUpperCase (); |
| |
| // Use official encoding names where we know them, |
| // avoiding the Java-only names. When using common |
| // encodings where we can easily tell if characters |
| // are out of range, we'll escape out-of-range |
| // characters using character refs for safety. |
| |
| // I _think_ these are all the main synonyms for these! |
| if ("UTF8".equals (encoding)) { |
| encoding = "UTF-8"; |
| } else if ("US-ASCII".equals (encoding) |
| || "ASCII".equals (encoding)) { |
| dangerMask = (short) 0xff80; |
| encoding = "US-ASCII"; |
| } else if ("ISO-8859-1".equals (encoding) |
| || "8859_1".equals (encoding) |
| || "ISO8859_1".equals (encoding)) { |
| dangerMask = (short) 0xff00; |
| encoding = "ISO-8859-1"; |
| } else if ("UNICODE".equals (encoding) |
| || "UNICODE-BIG".equals (encoding) |
| || "UNICODE-LITTLE".equals (encoding)) { |
| encoding = "UTF-16"; |
| |
| // TODO: UTF-16BE, UTF-16LE ... no BOM; what |
| // release of JDK supports those Unicode names? |
| } |
| |
| if (dangerMask != 0) |
| stringBuf = new StringBuffer (); |
| } |
| |
| this.encoding = encoding; |
| } |
| |
| |
| /** |
| * Resets the handler to write a new text document. |
| * |
| * @param writer XML text is written to this writer. |
| * @param encoding if non-null, and an XML declaration is written, |
| * this is the name that will be used for the character encoding. |
| * |
| * @exception IllegalStateException if the current |
| * document hasn't yet ended (with {@link #endDocument}) |
| */ |
| final public void setWriter (Writer writer, String encoding) |
| { |
| if (out != null) |
| throw new IllegalStateException ( |
| "can't change stream in mid course"); |
| out = writer; |
| if (out != null) |
| setEncoding (encoding); |
| if (!(out instanceof BufferedWriter)) |
| out = new BufferedWriter (out); |
| space.push ("default"); |
| } |
| |
| /** |
| * Assigns the line ending style to be used on output. |
| * @param eolString null to use the system default; else |
| * "\n", "\r", or "\r\n". |
| */ |
| final public void setEOL (String eolString) |
| { |
| if (eolString == null) |
| eol = sysEOL; |
| else if (!isLineEnd (eolString)) |
| eol = eolString; |
| else |
| throw new IllegalArgumentException (eolString); |
| } |
| |
| /** |
| * Assigns the error handler to be used to present most fatal |
| * errors. |
| */ |
| public void setErrorHandler (ErrorHandler handler) |
| { |
| errHandler = handler; |
| } |
| |
| /** |
| * Used internally and by subclasses, this encapsulates the logic |
| * involved in reporting fatal errors. It uses locator information |
| * for good diagnostics, if available, and gives the application's |
| * ErrorHandler the opportunity to handle the error before throwing |
| * an exception. |
| */ |
| protected void fatal (String message, Exception e) |
| throws SAXException |
| { |
| SAXParseException x; |
| |
| if (locator == null) |
| x = new SAXParseException (message, null, null, -1, -1, e); |
| else |
| x = new SAXParseException (message, locator, e); |
| if (errHandler != null) |
| errHandler.fatalError (x); |
| throw x; |
| } |
| |
| |
| // JavaBeans properties |
| |
| /** |
| * Controls whether the output should attempt to follow the "transitional" |
| * XHTML rules so that it meets the "HTML Compatibility Guidelines" |
| * appendix in the XHTML specification. A "transitional" Document Type |
| * Declaration (DTD) is placed near the beginning of the output document, |
| * instead of whatever DTD would otherwise have been placed there, and |
| * XHTML empty elements are printed specially. When writing text in |
| * US-ASCII or ISO-8859-1 encodings, the predefined XHTML internal |
| * entity names are used (in preference to character references) when |
| * writing content characters which can't be expressed in those encodings. |
| * |
| * <p> When this option is enabled, it is the caller's responsibility |
| * to ensure that the input is otherwise valid as XHTML. Things to |
| * be careful of in all cases, as described in the appendix referenced |
| * above, include: <ul> |
| * |
| * <li> Element and attribute names must be in lower case, both |
| * in the document and in any CSS style sheet. |
| * <li> All XML constructs must be valid as defined by the XHTML |
| * "transitional" DTD (including all familiar constructs, |
| * even deprecated ones). |
| * <li> The root element must be "html". |
 *  <li> Elements that must be empty (such as <em>&lt;br&gt;</em>)
 *      must have no content.
| * <li> Use both <em>lang</em> and <em>xml:lang</em> attributes |
| * when specifying language. |
| * <li> Similarly, use both <em>id</em> and <em>name</em> attributes |
| * when defining elements that may be referred to through |
| * URI fragment identifiers ... and make sure that the |
| * value is a legal NMTOKEN, since not all such HTML 4.0 |
| * identifiers are valid in XML. |
| * <li> Be careful with character encodings; make sure you provide |
 *      a <em>&lt;meta http-equiv="Content-type"
 *      content="text/xml;charset=..." /&gt;</em> element in
| * the HTML "head" element, naming the same encoding |
| * used to create this handler. Also, if that encoding |
| * is anything other than US-ASCII, make sure that if |
| * the document is given a MIME content type, it has |
| * a <em>charset=...</em> attribute with that encoding. |
| * </ul> |
| * |
| * <p> Additionally, some of the oldest browsers have additional |
| * quirks, to address with guidelines such as: <ul> |
| * |
| * <li> Processing instructions may be rendered, so avoid them. |
| * (Similarly for an XML declaration.) |
| * <li> Embedded style sheets and scripts should not contain XML |
 *      markup delimiters:  &amp;, &lt;, and ]]&gt; are trouble.
| * <li> Attribute values should not have line breaks or multiple |
| * consecutive white space characters. |
| * <li> Use no more than one of the deprecated (transitional) |
 *      <em>&lt;isindex&gt;</em> elements.
| * <li> Some boolean attributes (such as <em>compact, checked, |
| * disabled, readonly, selected,</em> and more) confuse |
| * some browsers, since they only understand minimized |
| * versions which are illegal in XML. |
| * </ul> |
| * |
| * <p> Also, some characteristics of the resulting output may be |
| * a function of whether the document is later given a MIME |
| * content type of <em>text/html</em> rather than one indicating |
| * XML (<em>application/xml</em> or <em>text/xml</em>). Worse, |
 * some browsers ignore MIME content types and prefer to rely on URI
| * name suffixes -- so an "index.xml" could always be XML, never |
| * XHTML, no matter its MIME type. |
| */ |
| final public void setXhtml (boolean value) |
| { |
| if (locator != null) |
| throw new IllegalStateException ("started parsing"); |
| xhtml = value; |
| if (xhtml) |
| canonical = false; |
| } |
| |
| /** |
| * Returns true if the output attempts to echo the input following |
| * "transitional" XHTML rules and matching the "HTML Compatibility |
| * Guidelines" so that an HTML version 3 browser can read the output |
 * as HTML; returns false (the default) otherwise.
| */ |
| final public boolean isXhtml () |
| { |
| return xhtml; |
| } |
| |
| /** |
| * Controls whether the output text contains references to |
| * entities (the default), or instead contains the expanded |
| * values of those entities. |
| */ |
| final public void setExpandingEntities (boolean value) |
| { |
| if (locator != null) |
| throw new IllegalStateException ("started parsing"); |
| expandingEntities = value; |
| if (!expandingEntities) |
| canonical = false; |
| } |
| |
| /** |
| * Returns true if the output will have no entity references; |
| * returns false (the default) otherwise. |
| */ |
| final public boolean isExpandingEntities () |
| { |
| return expandingEntities; |
| } |
| |
| /** |
| * Controls pretty-printing, which by default is not enabled |
| * (and currently is most useful for XHTML output). |
| * Pretty printing enables structural indentation, sorting of attributes |
| * by name, line wrapping, and potentially other mechanisms for making |
| * output more or less readable. |
| * |
| * <p> At this writing, structural indentation and line wrapping are |
| * enabled when pretty printing is enabled and the <em>xml:space</em> |
| * attribute has the value <em>default</em> (its other legal value is |
| * <em>preserve</em>, as defined in the XML specification). The three |
| * XHTML element types which use another value are recognized by their |
| * names (namespaces are ignored). |
| * |
| * <p> Also, for the record, the "pretty" aspect of printing here |
| * is more to provide basic structure on outputs that would otherwise |
| * risk being a single long line of text. For now, expect the |
| * structure to be ragged ... unless you'd like to submit a patch |
| * to make this be more strictly formatted! |
| * |
| * @exception IllegalStateException thrown if this method is invoked |
| * after output has begun. |
| */ |
| final public void setPrettyPrinting (boolean value) |
| { |
| if (locator != null) |
| throw new IllegalStateException ("started parsing"); |
| prettyPrinting = value; |
| if (prettyPrinting) |
| canonical = false; |
| } |
| |
| /** |
| * Returns value of flag controlling pretty printing. |
| */ |
| final public boolean isPrettyPrinting () |
| { |
| return prettyPrinting; |
| } |
| |
| |
| /** |
| * Sets the output style to be canonicalized. Input events must |
| * meet requirements that are slightly more stringent than the |
| * basic well-formedness ones, and include: <ul> |
| * |
| * <li> Namespace prefixes must not have been changed from those |
| * in the original document. (This may only be ensured by setting |
| * the SAX2 XMLReader <em>namespace-prefixes</em> feature flag; |
| * by default, it is cleared.) |
| * |
| * <li> Redundant namespace declaration attributes have been |
| * removed. (If an ancestor element defines a namespace prefix |
 *  and that declaration hasn't been overridden, an element must
| * not redeclare it.) |
| * |
| * <li> If comments are not to be included in the canonical output, |
 *  they must first be removed from the input event stream; this
 *  is <em>Canonical XML with comments</em> by default.
| * |
| * <li> If the input character encoding was not UCS-based, the |
| * character data must have been normalized using Unicode |
| * Normalization Form C. (UTF-8 and UTF-16 are UCS-based.) |
| * |
| * <li> Attribute values must have been normalized, as is done |
| * by any conformant XML processor which processes all external |
| * parameter entities. |
| * |
| * <li> Similarly, attribute value defaulting has been performed. |
| * |
| * </ul> |
| * |
| * <p> Note that fragments of XML documents, as specified by an XPath |
| * node set, may be canonicalized. In such cases, elements may need |
| * some fixup (for <em>xml:*</em> attributes and application-specific |
| * context). |
| * |
| * @exception IllegalArgumentException if the output encoding |
| * is anything other than UTF-8. |
| */ |
| final public void setCanonical (boolean value) |
| { |
| if (value && !"UTF-8".equals (encoding)) |
| throw new IllegalArgumentException ("encoding != UTF-8"); |
| canonical = value; |
| if (canonical) { |
| prettyPrinting = xhtml = false; |
| expandingEntities = true; |
| eol = "\n"; |
| } |
| } |
| |
| |
| /** |
| * Returns value of flag controlling canonical output. |
| */ |
| final public boolean isCanonical () |
| { |
| return canonical; |
| } |
| |
| |
| /** |
| * Flushes the output stream. When this handler is used in long lived |
| * pipelines, it can be important to flush buffered state, for example |
| * so that it can reach the disk as part of a state checkpoint. |
| */ |
| final public void flush () |
| throws IOException |
| { |
| if (out != null) |
| out.flush (); |
| } |
| |
| |
| // convenience routines |
| |
| // FIXME: probably want a subclass that holds a lot of these... |
| // and maybe more! |
| |
| /** |
| * Writes the string as if characters() had been called on the contents |
| * of the string. This is particularly useful when applications act as |
| * producers and write data directly to event consumers. |
| */ |
| final public void write (String data) |
| throws SAXException |
| { |
| char buf [] = data.toCharArray (); |
| characters (buf, 0, buf.length); |
| } |
| |
| |
| /** |
| * Writes an element that has content consisting of a single string. |
| * @see #writeEmptyElement |
| * @see #startElement |
| */ |
| public void writeElement ( |
| String uri, |
| String localName, |
| String qName, |
| Attributes atts, |
| String content |
| ) throws SAXException |
| { |
| if (content == null || content.length () == 0) { |
| writeEmptyElement (uri, localName, qName, atts); |
| return; |
| } |
| startElement (uri, localName, qName, atts); |
| char chars [] = content.toCharArray (); |
| characters (chars, 0, chars.length); |
| endElement (uri, localName, qName); |
| } |
| |
| |
| /** |
| * Writes an element that has content consisting of a single integer, |
| * encoded as a decimal string. |
| * @see #writeEmptyElement |
| * @see #startElement |
| */ |
| public void writeElement ( |
| String uri, |
| String localName, |
| String qName, |
| Attributes atts, |
| int content |
| ) throws SAXException |
| { |
| writeElement (uri, localName, qName, atts, Integer.toString (content)); |
| } |
| |
| |
| // SAX1 ContentHandler |
| /** <b>SAX1</b>: provides parser status information */ |
| final public void setDocumentLocator (Locator l) |
| { |
| locator = l; |
| } |
| |
| |
| // URL for dtd that validates against all normal HTML constructs |
| private static final String xhtmlFullDTD = |
| "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"; |
| |
| |
| /** |
| * <b>SAX1</b>: indicates the beginning of a document parse. |
| * If you're writing (well formed) fragments of XML, neither |
| * this nor endDocument should be called. |
| */ |
| // NOT final |
| public void startDocument () |
| throws SAXException |
| { |
| try { |
| if (out == null) |
| throw new IllegalStateException ( |
| "null Writer given to XMLWriter"); |
| |
| // Not all parsers provide the locator we want; this also |
| // flags whether events are being sent to this object yet. |
| // We could only have this one call if we only printed whole |
| // documents ... but we also print fragments, so most of the |
| // callbacks here replicate this test. |
| |
| if (locator == null) |
| locator = new LocatorImpl (); |
| |
| // Unless the data is in US-ASCII or we're canonicalizing, write |
| // the XML declaration if we know the encoding. US-ASCII won't |
| // normally get mangled by web server confusion about the |
| // character encodings used. Plus, it's an easy way to |
| // ensure we can write ASCII that's unlikely to confuse |
| // elderly HTML parsers. |
| |
| if (!canonical |
| && dangerMask != (short) 0xff80 |
| && encoding != null) { |
| rawWrite ("<?xml version='1.0'"); |
| rawWrite (" encoding='" + encoding + "'"); |
| rawWrite ("?>"); |
| newline (); |
| } |
| |
| if (xhtml) { |
| |
| rawWrite ("<!DOCTYPE html PUBLIC"); |
| newline (); |
| rawWrite (" '-//W3C//DTD XHTML 1.0 Transitional//EN'"); |
| newline (); |
| rawWrite (" '"); |
| // NOTE: URL (above) matches the REC |
| rawWrite (xhtmlFullDTD); |
| rawWrite ("'>"); |
| newline (); |
| newline (); |
| |
| // fake the rest of the handler into ignoring |
| // everything until the root element, so any |
| // XHTML DTD comments, PIs, etc are ignored |
| startedDoctype = true; |
| } |
| |
| entityNestLevel = 0; |
| |
| } catch (IOException e) { |
| fatal ("can't write", e); |
| } |
| } |
| |
| /** |
| * <b>SAX1</b>: indicates the completion of a parse. |
| * Note that all complete SAX event streams make this call, even |
| * if an error is reported during a parse. |
| */ |
| // NOT final |
| public void endDocument () |
| throws SAXException |
| { |
| try { |
| if (!canonical) { |
| newline (); |
| newline (); |
| } |
| out.close (); |
| out = null; |
| locator = null; |
| } catch (IOException e) { |
| fatal ("can't write", e); |
| } |
| } |
| |
| // XHTML elements declared as EMPTY print differently |
| final private static boolean isEmptyElementTag (String tag) |
| { |
| switch (tag.charAt (0)) { |
| case 'a': return "area".equals (tag); |
| case 'b': return "base".equals (tag) |
| || "basefont".equals (tag) |
| || "br".equals (tag); |
| case 'c': return "col".equals (tag); |
| case 'f': return "frame".equals (tag); |
| case 'h': return "hr".equals (tag); |
| case 'i': return "img".equals (tag) |
| || "input".equals (tag) |
| || "isindex".equals (tag); |
| case 'l': return "link".equals (tag); |
| case 'm': return "meta".equals (tag); |
| case 'p': return "param".equals (tag); |
| } |
| return false; |
| } |
| |
| private static boolean indentBefore (String tag) |
| { |
| // basically indent before block content |
| // and within structure like tables, lists |
| switch (tag.charAt (0)) { |
| case 'a': return "applet".equals (tag); |
| case 'b': return "body".equals (tag) |
| || "blockquote".equals (tag); |
| case 'c': return "center".equals (tag); |
| case 'f': return "frame".equals (tag) |
| || "frameset".equals (tag); |
| case 'h': return "head".equals (tag); |
| case 'm': return "meta".equals (tag); |
| case 'o': return "object".equals (tag); |
| case 'p': return "param".equals (tag) |
| || "pre".equals (tag); |
| case 's': return "style".equals (tag); |
| case 't': return "title".equals (tag) |
| || "td".equals (tag) |
| || "th".equals (tag); |
| } |
| // ... but not inline elements like "em", "b", "font" |
| return false; |
| } |
| |
| private static boolean spaceBefore (String tag) |
| { |
| // blank line AND INDENT before certain structural content |
| switch (tag.charAt (0)) { |
| case 'h': return "h1".equals (tag) |
| || "h2".equals (tag) |
| || "h3".equals (tag) |
| || "h4".equals (tag) |
| || "h5".equals (tag) |
| || "h6".equals (tag) |
| || "hr".equals (tag); |
| case 'l': return "li".equals (tag); |
| case 'o': return "ol".equals (tag); |
| case 'p': return "p".equals (tag); |
| case 't': return "table".equals (tag) |
| || "tr".equals (tag); |
| case 'u': return "ul".equals (tag); |
| } |
| return false; |
| } |
| |
| // XHTML DTDs say these three have xml:space="preserve" |
| private static boolean spacePreserve (String tag) |
| { |
| return "pre".equals (tag) |
| || "style".equals (tag) |
| || "script".equals (tag); |
| } |
| |
| /** |
| * <b>SAX2</b>: ignored. |
| */ |
| final public void startPrefixMapping (String prefix, String uri) |
| {} |
| |
| /** |
| * <b>SAX2</b>: ignored. |
| */ |
| final public void endPrefixMapping (String prefix) |
| {} |
| |
| private void writeStartTag ( |
| String name, |
| Attributes atts, |
| boolean isEmpty |
| ) throws SAXException, IOException |
| { |
| rawWrite ('<'); |
| rawWrite (name); |
| |
| // write out attributes ... sorting is particularly useful |
| // with output that's been heavily defaulted. |
| if (atts != null && atts.getLength () != 0) { |
| |
| // Set up to write, with optional sorting |
| int indices [] = new int [atts.getLength ()]; |
| |
| for (int i= 0; i < indices.length; i++) |
| indices [i] = i; |
| |
| // optionally sort |
| |
| // FIXME: canon xml demands xmlns nodes go first, |
| // and sorting by URI first (empty first) then localname |
| // it should maybe use a different sort |
| |
| if (canonical || prettyPrinting) { |
| |
| // insertion sort by attribute name |
| for (int i = 1; i < indices.length; i++) { |
| int n = indices [i], j; |
| String s = atts.getQName (n); |
| |
| for (j = i - 1; j >= 0; j--) { |
| if (s.compareTo (atts.getQName (indices [j])) |
| >= 0) |
| break; |
| indices [j + 1] = indices [j]; |
| } |
| indices [j + 1] = n; |
| } |
| } |
| |
| // write, sorted or no |
| for (int i= 0; i < indices.length; i++) { |
| String s = atts.getQName (indices [i]); |
| |
| if (s == null || "".equals (s)) |
| throw new IllegalArgumentException ("no XML name"); |
| rawWrite (" "); |
| rawWrite (s); |
| rawWrite ("="); |
| writeQuotedValue (atts.getValue (indices [i]), |
| CTX_ATTRIBUTE); |
| } |
| } |
| if (isEmpty) |
| rawWrite (" /"); |
| rawWrite ('>'); |
| } |
| |
| /** |
| * <b>SAX2</b>: indicates the start of an element. |
| * When XHTML is in use, avoid attribute values with |
| * line breaks or multiple whitespace characters, since |
| * not all user agents handle them correctly. |
| */ |
| final public void startElement ( |
| String uri, |
| String localName, |
| String qName, |
| Attributes atts |
| ) throws SAXException |
| { |
| startedDoctype = false; |
| |
| if (locator == null) |
| locator = new LocatorImpl (); |
| |
| if (qName == null || "".equals (qName)) |
| throw new IllegalArgumentException ("no XML name"); |
| |
| try { |
| if (entityNestLevel != 0) |
| return; |
| if (prettyPrinting) { |
| String whitespace = null; |
| |
| if (xhtml && spacePreserve (qName)) |
| whitespace = "preserve"; |
| else if (atts != null) |
| whitespace = atts.getValue ("xml:space"); |
| if (whitespace == null) |
| whitespace = (String) space.peek (); |
| space.push (whitespace); |
| |
| if ("default".equals (whitespace)) { |
| if (xhtml) { |
| if (spaceBefore (qName)) { |
| newline (); |
| doIndent (); |
| } else if (indentBefore (qName)) |
| doIndent (); |
| // else it's inlined, modulo line length |
| // FIXME: incrementing element nest level |
| // for inlined elements causes ugliness |
| } else |
| doIndent (); |
| } |
| } |
| elementNestLevel++; |
| writeStartTag (qName, atts, xhtml && isEmptyElementTag (qName)); |
| |
| if (xhtml) { |
| // FIXME: if this is an XHTML "pre" element, turn |
| // off automatic wrapping. |
| } |
| |
| } catch (IOException e) { |
| fatal ("can't write", e); |
| } |
| } |
| |
| /** |
| * Writes an empty element. |
| * @see #startElement |
| */ |
| public void writeEmptyElement ( |
| String uri, |
| String localName, |
| String qName, |
| Attributes atts |
| ) throws SAXException |
| { |
| if (canonical) { |
| startElement (uri, localName, qName, atts); |
| endElement (uri, localName, qName); |
| } else { |
| try { |
| writeStartTag (qName, atts, true); |
| } catch (IOException e) { |
| fatal ("can't write", e); |
| } |
| } |
| } |
| |
| |
| /** <b>SAX2</b>: indicates the end of an element */ |
| final public void endElement (String uri, String localName, String qName) |
| throws SAXException |
| { |
| if (qName == null || "".equals (qName)) |
| throw new IllegalArgumentException ("no XML name"); |
| |
| try { |
| elementNestLevel--; |
| if (entityNestLevel != 0) |
| return; |
| if (xhtml && isEmptyElementTag (qName)) |
| return; |
| rawWrite ("</"); |
| rawWrite (qName); |
| rawWrite ('>'); |
| |
| if (prettyPrinting) { |
| if (!space.empty ()) |
| space.pop (); |
| else |
| fatal ("stack discipline", null); |
| } |
| if (elementNestLevel == 0) |
| inEpilogue = true; |
| |
| } catch (IOException e) { |
| fatal ("can't write", e); |
| } |
| } |
| |
| /** <b>SAX1</b>: reports content characters */ |
| final public void characters (char ch [], int start, int length) |
| throws SAXException |
| { |
| if (locator == null) |
| locator = new LocatorImpl (); |
| |
| try { |
| if (entityNestLevel != 0) |
| return; |
| if (inCDATA) { |
| escapeChars (ch, start, length, CTX_UNPARSED); |
| } else { |
| escapeChars (ch, start, length, CTX_CONTENT); |
| } |
| } catch (IOException e) { |
| fatal ("can't write", e); |
| } |
| } |
| |
| /** <b>SAX1</b>: reports ignorable whitespace */ |
| final public void ignorableWhitespace (char ch [], int start, int length) |
| throws SAXException |
| { |
| if (locator == null) |
| locator = new LocatorImpl (); |
| |
| try { |
| if (entityNestLevel != 0) |
| return; |
| // don't forget to map NL to CRLF, CR, etc |
| escapeChars (ch, start, length, CTX_CONTENT); |
| } catch (IOException e) { |
| fatal ("can't write", e); |
| } |
| } |
| |
| /** |
| * <b>SAX1</b>: reports a PI. |
| * This doesn't check for illegal target names, such as "xml" or "XML", |
| * or namespace-incompatible ones like "big:dog"; the caller is |
| * responsible for ensuring those names are legal. |
| */ |
| final public void processingInstruction (String target, String data) |
| throws SAXException |
| { |
| if (locator == null) |
| locator = new LocatorImpl (); |
| |
| // don't print internal subset for XHTML |
| if (xhtml && startedDoctype) |
| return; |
| |
| // ancient HTML browsers might render these ... their loss. |
| // to prevent: "if (xhtml) return;". |
| |
| try { |
| if (entityNestLevel != 0) |
| return; |
| if (canonical && inEpilogue) |
| newline (); |
| rawWrite ("<?"); |
| rawWrite (target); |
| rawWrite (' '); |
| escapeChars (data.toCharArray (), -1, -1, CTX_UNPARSED); |
| rawWrite ("?>"); |
| if (elementNestLevel == 0 && !(canonical && inEpilogue)) |
| newline (); |
| } catch (IOException e) { |
| fatal ("can't write", e); |
| } |
| } |
| |
| /** <b>SAX1</b>: indicates a non-expanded entity reference */ |
| public void skippedEntity (String name) |
| throws SAXException |
| { |
| try { |
| rawWrite ("&"); |
| rawWrite (name); |
| rawWrite (";"); |
| } catch (IOException e) { |
| fatal ("can't write", e); |
| } |
| } |
| |
| // SAX2 LexicalHandler |
| |
| /** <b>SAX2</b>: called before parsing CDATA characters */ |
| final public void startCDATA () |
| throws SAXException |
| { |
| if (locator == null) |
| locator = new LocatorImpl (); |
| |
| if (canonical) |
| return; |
| |
| try { |
| inCDATA = true; |
| if (entityNestLevel == 0) |
| rawWrite ("<![CDATA["); |
| } catch (IOException e) { |
| fatal ("can't write", e); |
| } |
| } |
| |
| /** <b>SAX2</b>: called after parsing CDATA characters */ |
| final public void endCDATA () |
| throws SAXException |
| { |
| if (canonical) |
| return; |
| |
| try { |
| inCDATA = false; |
| if (entityNestLevel == 0) |
| rawWrite ("]]>"); |
| } catch (IOException e) { |
| fatal ("can't write", e); |
| } |
| } |
| |
| /** |
| * <b>SAX2</b>: called when the doctype is partially parsed |
| * Note that this, like other doctype related calls, is ignored |
| * when XHTML is in use. |
| */ |
| final public void startDTD (String name, String publicId, String systemId) |
| throws SAXException |
| { |
| if (locator == null) |
| locator = new LocatorImpl (); |
| if (xhtml) |
| return; |
| try { |
| inDoctype = startedDoctype = true; |
| if (canonical) |
| return; |
| rawWrite ("<!DOCTYPE "); |
| rawWrite (name); |
| rawWrite (' '); |
| |
| if (!expandingEntities) { |
| if (publicId != null) |
| rawWrite ("PUBLIC '" + publicId + "' '" + systemId + "' "); |
| else if (systemId != null) |
| rawWrite ("SYSTEM '" + systemId + "' "); |
| } |
| |
| rawWrite ('['); |
| newline (); |
| } catch (IOException e) { |
| fatal ("can't write", e); |
| } |
| } |
| |
| /** <b>SAX2</b>: called after the doctype is parsed */ |
| final public void endDTD () |
| throws SAXException |
| { |
| inDoctype = false; |
| if (canonical || xhtml) |
| return; |
| try { |
| rawWrite ("]>"); |
| newline (); |
| } catch (IOException e) { |
| fatal ("can't write", e); |
| } |
| } |
| |
| /** |
| * <b>SAX2</b>: called before parsing a general entity in content |
| */ |
| final public void startEntity (String name) |
| throws SAXException |
| { |
| try { |
| boolean writeEOL = true; |
| |
| // Predefined XHTML entities (for characters) will get |
| // mapped back later. |
| if (xhtml || expandingEntities) |
| return; |
| |
| entityNestLevel++; |
| if (name.equals ("[dtd]")) |
| return; |
| if (entityNestLevel != 1) |
| return; |
| if (!name.startsWith ("%")) { |
| writeEOL = false; |
| rawWrite ('&'); |
| } |
| rawWrite (name); |
| rawWrite (';'); |
| if (writeEOL) |
| newline (); |
| } catch (IOException e) { |
| fatal ("can't write", e); |
| } |
| } |
| |
| /** |
| * <b>SAX2</b>: called after parsing a general entity in content |
| */ |
| final public void endEntity (String name) |
| throws SAXException |
| { |
| if (xhtml || expandingEntities) |
| return; |
| entityNestLevel--; |
| } |
| |
| /** |
| * <b>SAX2</b>: called when comments are parsed. |
| * When XHTML is used, the old HTML tradition of using comments |
| * to for inline CSS, or for JavaScript code is discouraged. |
| * This is because XML processors are encouraged to discard, on |
| * the grounds that comments are for users (and perhaps text |
| * editors) not programs. Instead, use external scripts |
| */ |
| final public void comment (char ch [], int start, int length) |
| throws SAXException |
| { |
| if (locator == null) |
| locator = new LocatorImpl (); |
| |
| // don't print internal subset for XHTML |
| if (xhtml && startedDoctype) |
| return; |
| // don't print comment in doctype for canon xml |
| if (canonical && inDoctype) |
| return; |
| |
| try { |
| boolean indent; |
| |
| if (prettyPrinting && space.empty ()) |
| fatal ("stack discipline", null); |
| indent = prettyPrinting && "default".equals (space.peek ()); |
| if (entityNestLevel != 0) |
| return; |
| if (indent) |
| doIndent (); |
| if (canonical && inEpilogue) |
| newline (); |
| rawWrite ("<!--"); |
| escapeChars (ch, start, length, CTX_UNPARSED); |
| rawWrite ("-->"); |
| if (indent) |
| doIndent (); |
| if (elementNestLevel == 0 && !(canonical && inEpilogue)) |
| newline (); |
| } catch (IOException e) { |
| fatal ("can't write", e); |
| } |
| } |
| |
| // SAX1 DTDHandler |
| |
| /** <b>SAX1</b>: called on notation declarations */ |
| final public void notationDecl (String name, |
| String publicId, String systemId) |
| throws SAXException |
| { |
| if (xhtml) |
| return; |
| try { |
| // At this time, only SAX2 callbacks start these. |
| if (!startedDoctype) |
| return; |
| |
| if (entityNestLevel != 0) |
| return; |
| rawWrite ("<!NOTATION " + name + " "); |
| if (publicId != null) |
| rawWrite ("PUBLIC \"" + publicId + '"'); |
| else |
| rawWrite ("SYSTEM "); |
| if (systemId != null) |
| rawWrite ('"' + systemId + '"'); |
| rawWrite (">"); |
| newline (); |
| } catch (IOException e) { |
| fatal ("can't write", e); |
| } |
| } |
| |
| /** <b>SAX1</b>: called on unparsed entity declarations */ |
| final public void unparsedEntityDecl (String name, |
| String publicId, String systemId, |
| String notationName) |
| throws SAXException |
| { |
| if (xhtml) |
| return; |
| try { |
| // At this time, only SAX2 callbacks start these. |
| if (!startedDoctype) { |
| // FIXME: write to temporary buffer, and make the start |
| // of the root element write these declarations. |
| return; |
| } |
| |
| if (entityNestLevel != 0) |
| return; |
| rawWrite ("<!ENTITY " + name + " "); |
| if (publicId != null) |
| rawWrite ("PUBLIC \"" + publicId + '"'); |
| else |
| rawWrite ("SYSTEM "); |
| rawWrite ('"' + systemId + '"'); |
| rawWrite (" NDATA " + notationName + ">"); |
| newline (); |
| } catch (IOException e) { |
| fatal ("can't write", e); |
| } |
| } |
| |
| // SAX2 DeclHandler |
| |
| /** <b>SAX2</b>: called on attribute declarations */ |
| final public void attributeDecl (String eName, String aName, |
| String type, String mode, String value) |
| throws SAXException |
| { |
| if (xhtml) |
| return; |
| try { |
| // At this time, only SAX2 callbacks start these. |
| if (!startedDoctype) |
| return; |
| if (entityNestLevel != 0) |
| return; |
| rawWrite ("<!ATTLIST " + eName + ' ' + aName + ' '); |
| rawWrite (type); |
| rawWrite (' '); |
| if (mode != null) |
| rawWrite (mode + ' '); |
| if (value != null) |
| writeQuotedValue (value, CTX_ATTRIBUTE); |
| rawWrite ('>'); |
| newline (); |
| } catch (IOException e) { |
| fatal ("can't write", e); |
| } |
| } |
| |
| /** <b>SAX2</b>: called on element declarations */ |
| final public void elementDecl (String name, String model) |
| throws SAXException |
| { |
| if (xhtml) |
| return; |
| try { |
| // At this time, only SAX2 callbacks start these. |
| if (!startedDoctype) |
| return; |
| if (entityNestLevel != 0) |
| return; |
| rawWrite ("<!ELEMENT " + name + ' ' + model + '>'); |
| newline (); |
| } catch (IOException e) { |
| fatal ("can't write", e); |
| } |
| } |
| |
| /** <b>SAX2</b>: called on external entity declarations */ |
| final public void externalEntityDecl ( |
| String name, |
| String publicId, |
| String systemId) |
| throws SAXException |
| { |
| if (xhtml) |
| return; |
| try { |
| // At this time, only SAX2 callbacks start these. |
| if (!startedDoctype) |
| return; |
| if (entityNestLevel != 0) |
| return; |
| rawWrite ("<!ENTITY "); |
| if (name.startsWith ("%")) { |
| rawWrite ("% "); |
| rawWrite (name.substring (1)); |
| } else |
| rawWrite (name); |
| if (publicId != null) |
| rawWrite (" PUBLIC \"" + publicId + '"'); |
| else |
| rawWrite (" SYSTEM "); |
| rawWrite ('"' + systemId + "\">"); |
| newline (); |
| } catch (IOException e) { |
| fatal ("can't write", e); |
| } |
| } |
| |
| /** <b>SAX2</b>: called on internal entity declarations */ |
| final public void internalEntityDecl (String name, String value) |
| throws SAXException |
| { |
| if (xhtml) |
| return; |
| try { |
| // At this time, only SAX2 callbacks start these. |
| if (!startedDoctype) |
| return; |
| if (entityNestLevel != 0) |
| return; |
| rawWrite ("<!ENTITY "); |
| if (name.startsWith ("%")) { |
| rawWrite ("% "); |
| rawWrite (name.substring (1)); |
| } else |
| rawWrite (name); |
| rawWrite (' '); |
| writeQuotedValue (value, CTX_ENTITY); |
| rawWrite ('>'); |
| newline (); |
| } catch (IOException e) { |
| fatal ("can't write", e); |
| } |
| } |
| |
| private void writeQuotedValue (String value, int code) |
| throws SAXException, IOException |
| { |
| char buf [] = value.toCharArray (); |
| int off = 0, len = buf.length; |
| |
| // we can't add line breaks to attribute/entity/... values |
| noWrap = true; |
| rawWrite ('"'); |
| escapeChars (buf, off, len, code); |
| rawWrite ('"'); |
| noWrap = false; |
| } |
| |
| // From "HTMLlat1x.ent" ... names of entities for ISO-8859-1 |
| // (Latin/1) characters, all codes: 160-255 (0xA0-0xFF). |
| // Codes 128-159 have no assigned values. |
| private static final String HTMLlat1x [] = { |
| // 160 |
| "nbsp", "iexcl", "cent", "pound", "curren", |
| "yen", "brvbar", "sect", "uml", "copy", |
| |
| // 170 |
| "ordf", "laquo", "not", "shy", "reg", |
| "macr", "deg", "plusmn", "sup2", "sup3", |
| |
| // 180 |
| "acute", "micro", "para", "middot", "cedil", |
| "sup1", "ordm", "raquo", "frac14", "frac12", |
| |
| // 190 |
| "frac34", "iquest", "Agrave", "Aacute", "Acirc", |
| "Atilde", "Auml", "Aring", "AElig", "Ccedil", |
| |
| // 200 |
| "Egrave", "Eacute", "Ecirc", "Euml", "Igrave", |
| "Iacute", "Icirc", "Iuml", "ETH", "Ntilde", |
| |
| // 210 |
| "Ograve", "Oacute", "Ocirc", "Otilde", "Ouml", |
| "times", "Oslash", "Ugrave", "Uacute", "Ucirc", |
| |
| // 220 |
| "Uuml", "Yacute", "THORN", "szlig", "agrave", |
| "aacute", "acirc", "atilde", "auml", "aring", |
| |
| // 230 |
| "aelig", "ccedil", "egrave", "eacute", "ecirc", |
| "euml", "igrave", "iacute", "icirc", "iuml", |
| |
| // 240 |
| "eth", "ntilde", "ograve", "oacute", "ocirc", |
| "otilde", "ouml", "divide", "oslash", "ugrave", |
| |
| // 250 |
| "uacute", "ucirc", "uuml", "yacute", "thorn", |
| "yuml" |
| }; |
| |
| // From "HTMLsymbolx.ent" ... some of the symbols that |
| // we can conveniently handle. Entities for the Greek. |
| // alphabet (upper and lower cases) are compact. |
| private static final String HTMLsymbolx_GR [] = { |
| // 913 |
| "Alpha", "Beta", "Gamma", "Delta", "Epsilon", |
| "Zeta", "Eta", "Theta", "Iota", "Kappa", |
| |
| // 923 |
| "Lambda", "Mu", "Nu", "Xi", "Omicron", |
| "Pi", "Rho", null, "Sigma", "Tau", |
| |
| // 933 |
| "Upsilon", "Phi", "Chi", "Psi", "Omega" |
| }; |
| |
| private static final String HTMLsymbolx_gr [] = { |
| // 945 |
| "alpha", "beta", "gamma", "delta", "epsilon", |
| "zeta", "eta", "theta", "iota", "kappa", |
| |
| // 955 |
| "lambda", "mu", "nu", "xi", "omicron", |
| "pi", "rho", "sigmaf", "sigma", "tau", |
| |
| // 965 |
| "upsilon", "phi", "chi", "psi", "omega" |
| }; |
| |
| |
| // General routine to write text and substitute predefined |
| // entities (XML, and a special case for XHTML) as needed. |
| private void escapeChars (char buf [], int off, int len, int code) |
| throws SAXException, IOException |
| { |
| int first = 0; |
| |
| if (off < 0) { |
| off = 0; |
| len = buf.length; |
| } |
| for (int i = 0; i < len; i++) { |
| String esc; |
| char c = buf [off + i]; |
| |
| switch (c) { |
| // Note that CTX_ATTRIBUTE isn't explicitly tested here; |
| // all syntax delimiters are escaped in CTX_ATTRIBUTE, |
| // otherwise it's similar to CTX_CONTENT |
| |
| // ampersand flags entity references; entity replacement |
| // text has unexpanded references, other text doesn't. |
| case '&': |
| if (code == CTX_ENTITY || code == CTX_UNPARSED) |
| continue; |
| esc = "amp"; |
| break; |
| |
| // attributes and text may NOT have literal '<', but |
| // entities may have markup constructs |
| case '<': |
| if (code == CTX_ENTITY || code == CTX_UNPARSED) |
| continue; |
| esc = "lt"; |
| break; |
| |
| // as above re markup constructs; but otherwise |
| // except when canonicalizing, this is for consistency |
| case '>': |
| if (code == CTX_ENTITY || code == CTX_UNPARSED) |
| continue; |
| esc = "gt"; |
| break; |
| case '\'': |
| if (code == CTX_CONTENT || code == CTX_UNPARSED) |
| continue; |
| if (canonical) |
| continue; |
| esc = "apos"; |
| break; |
| |
| // needed when printing quoted attribute/entity values |
| case '"': |
| if (code == CTX_CONTENT || code == CTX_UNPARSED) |
| continue; |
| esc = "quot"; |
| break; |
| |
| // make line ends work per host OS convention |
| case '\n': |
| esc = eol; |
| break; |
| |
| // |
| // No other characters NEED special treatment ... except |
| // for encoding-specific issues, like whether the character |
| // can really be represented in that encoding. |
| // |
| default: |
| // |
| // There are characters we can never write safely; getting |
| // them is an error. |
| // |
| // (a) They're never legal in XML ... detected by range |
| // checks, and (eventually) by remerging surrogate |
| // pairs on output. (Easy error for apps to prevent.) |
| // |
| // (b) This encoding can't represent them, and we |
| // can't make reference substitution (e.g. inside |
| // CDATA sections, names, PI data, etc). (Hard for |
| // apps to prevent, except by using UTF-8 or UTF-16 |
| // as their output encoding.) |
| // |
| // We know a very little bit about what characters |
| // the US-ASCII and ISO-8859-1 encodings support. For |
| // other encodings we can't detect the second type of |
| // error at all. (Never an issue for UTF-8 or UTF-16.) |
| // |
| |
| // FIXME: CR in CDATA is an error; in text, turn to a char ref |
| |
| // FIXME: CR/LF/TAB in attributes should become char refs |
| |
| if ((c > 0xfffd) |
| || ((c < 0x0020) && !((c == 0x0009) |
| || (c == 0x000A) || (c == 0x000D))) |
| || (((c & dangerMask) != 0) |
| && (code == CTX_UNPARSED))) { |
| |
| // if case (b) in CDATA, we might end the section, |
| // write a reference, then restart ... possible |
| // in one DOM L3 draft. |
| |
| throw new CharConversionException ( |
| "Illegal or non-writable character: U+" |
| + Integer.toHexString (c)); |
| } |
| |
| // |
| // If the output encoding represents the character |
| // directly, let it do so! Else we'll escape it. |
| // |
| if ((c & dangerMask) == 0) |
| continue; |
| esc = null; |
| |
| // Avoid numeric refs where symbolic ones exist, as |
| // symbolic ones make more sense to humans reading! |
| if (xhtml) { |
| // all the HTMLlat1x.ent entities |
| // (all the "ISO-8859-1" characters) |
| if (c >= 160 && c <= 255) |
| esc = HTMLlat1x [c - 160]; |
| |
| // not quite half the HTMLsymbolx.ent entities |
| else if (c >= 913 && c <= 937) |
| esc = HTMLsymbolx_GR [c - 913]; |
| else if (c >= 945 && c <= 969) |
| esc = HTMLsymbolx_gr [c - 945]; |
| |
| else switch (c) { |
| // all of the HTMLspecialx.ent entities |
| case 338: esc = "OElig"; break; |
| case 339: esc = "oelig"; break; |
| case 352: esc = "Scaron"; break; |
| case 353: esc = "scaron"; break; |
| case 376: esc = "Yuml"; break; |
| case 710: esc = "circ"; break; |
| case 732: esc = "tilde"; break; |
| case 8194: esc = "ensp"; break; |
| case 8195: esc = "emsp"; break; |
| case 8201: esc = "thinsp"; break; |
| case 8204: esc = "zwnj"; break; |
| case 8205: esc = "zwj"; break; |
| case 8206: esc = "lrm"; break; |
| case 8207: esc = "rlm"; break; |
| case 8211: esc = "ndash"; break; |
| case 8212: esc = "mdash"; break; |
| case 8216: esc = "lsquo"; break; |
| case 8217: esc = "rsquo"; break; |
| case 8218: esc = "sbquo"; break; |
| case 8220: esc = "ldquo"; break; |
| case 8221: esc = "rdquo"; break; |
| case 8222: esc = "bdquo"; break; |
| case 8224: esc = "dagger"; break; |
| case 8225: esc = "Dagger"; break; |
| case 8240: esc = "permil"; break; |
| case 8249: esc = "lsaquo"; break; |
| case 8250: esc = "rsaquo"; break; |
| case 8364: esc = "euro"; break; |
| |
| // the other HTMLsymbox.ent entities |
| case 402: esc = "fnof"; break; |
| case 977: esc = "thetasym"; break; |
| case 978: esc = "upsih"; break; |
| case 982: esc = "piv"; break; |
| case 8226: esc = "bull"; break; |
| case 8230: esc = "hellip"; break; |
| case 8242: esc = "prime"; break; |
| case 8243: esc = "Prime"; break; |
| case 8254: esc = "oline"; break; |
| case 8260: esc = "frasl"; break; |
| case 8472: esc = "weierp"; break; |
| case 8465: esc = "image"; break; |
| case 8476: esc = "real"; break; |
| case 8482: esc = "trade"; break; |
| case 8501: esc = "alefsym"; break; |
| case 8592: esc = "larr"; break; |
| case 8593: esc = "uarr"; break; |
| case 8594: esc = "rarr"; break; |
| case 8595: esc = "darr"; break; |
| case 8596: esc = "harr"; break; |
| case 8629: esc = "crarr"; break; |
| case 8656: esc = "lArr"; break; |
| case 8657: esc = "uArr"; break; |
| case 8658: esc = "rArr"; break; |
| case 8659: esc = "dArr"; break; |
| case 8660: esc = "hArr"; break; |
| case 8704: esc = "forall"; break; |
| case 8706: esc = "part"; break; |
| case 8707: esc = "exist"; break; |
| case 8709: esc = "empty"; break; |
| case 8711: esc = "nabla"; break; |
| case 8712: esc = "isin"; break; |
| case 8713: esc = "notin"; break; |
| case 8715: esc = "ni"; break; |
| case 8719: esc = "prod"; break; |
| case 8721: esc = "sum"; break; |
| case 8722: esc = "minus"; break; |
| case 8727: esc = "lowast"; break; |
| case 8730: esc = "radic"; break; |
| case 8733: esc = "prop"; break; |
| case 8734: esc = "infin"; break; |
| case 8736: esc = "ang"; break; |
| case 8743: esc = "and"; break; |
| case 8744: esc = "or"; break; |
| case 8745: esc = "cap"; break; |
| case 8746: esc = "cup"; break; |
| case 8747: esc = "int"; break; |
| case 8756: esc = "there4"; break; |
| case 8764: esc = "sim"; break; |
| case 8773: esc = "cong"; break; |
| case 8776: esc = "asymp"; break; |
| case 8800: esc = "ne"; break; |
| case 8801: esc = "equiv"; break; |
| case 8804: esc = "le"; break; |
| case 8805: esc = "ge"; break; |
| case 8834: esc = "sub"; break; |
| case 8835: esc = "sup"; break; |
| case 8836: esc = "nsub"; break; |
| case 8838: esc = "sube"; break; |
| case 8839: esc = "supe"; break; |
| case 8853: esc = "oplus"; break; |
| case 8855: esc = "otimes"; break; |
| case 8869: esc = "perp"; break; |
| case 8901: esc = "sdot"; break; |
| case 8968: esc = "lceil"; break; |
| case 8969: esc = "rceil"; break; |
| case 8970: esc = "lfloor"; break; |
| case 8971: esc = "rfloor"; break; |
| case 9001: esc = "lang"; break; |
| case 9002: esc = "rang"; break; |
| case 9674: esc = "loz"; break; |
| case 9824: esc = "spades"; break; |
| case 9827: esc = "clubs"; break; |
| case 9829: esc = "hearts"; break; |
| case 9830: esc = "diams"; break; |
| } |
| } |
| |
| // else escape with numeric char refs |
| if (esc == null) { |
| stringBuf.setLength (0); |
| stringBuf.append ("#x"); |
| stringBuf.append (Integer.toHexString (c).toUpperCase ()); |
| esc = stringBuf.toString (); |
| |
| // FIXME: We don't write surrogate pairs correctly. |
| // They should work as one ref per character, since |
| // each pair is one character. For reading back into |
| // Unicode, it matters beginning in Unicode 3.1 ... |
| } |
| break; |
| } |
| if (i != first) |
| rawWrite (buf, off + first, i - first); |
| first = i + 1; |
| if (esc == eol) |
| newline (); |
| else { |
| rawWrite ('&'); |
| rawWrite (esc); |
| rawWrite (';'); |
| } |
| } |
| if (first < len) |
| rawWrite (buf, off + first, len - first); |
| } |
| |
| |
| |
| private void newline () |
| throws SAXException, IOException |
| { |
| out.write (eol); |
| column = 0; |
| } |
| |
| private void doIndent () |
| throws SAXException, IOException |
| { |
| int space = elementNestLevel * 2; |
| |
| newline (); |
| column = space; |
| // track tabs only at line starts |
| while (space > 8) { |
| out.write ("\t"); |
| space -= 8; |
| } |
| while (space > 0) { |
| out.write (" "); |
| space -= 2; |
| } |
| } |
| |
| private void rawWrite (char c) |
| throws IOException |
| { |
| out.write (c); |
| column++; |
| } |
| |
| private void rawWrite (String s) |
| throws SAXException, IOException |
| { |
| if (prettyPrinting && "default".equals (space.peek ())) { |
| char data [] = s.toCharArray (); |
| rawWrite (data, 0, data.length); |
| } else { |
| out.write (s); |
| column += s.length (); |
| } |
| } |
| |
| // NOTE: if xhtml, the REC gives some rules about whitespace |
| // which we could follow ... notably, many places where conformant |
| // agents "must" consolidate/normalize whitespace. Line ends can |
| // be removed there, etc. This may not be the right place to do |
| // such mappings though. |
| |
| // Line buffering may help clarify algorithms and improve results. |
| |
| // It's likely xml:space needs more attention. |
| |
| private void rawWrite (char buf [], int offset, int length) |
| throws SAXException, IOException |
| { |
| boolean wrap; |
| |
| if (prettyPrinting && space.empty ()) |
| fatal ("stack discipline", null); |
| |
| wrap = prettyPrinting && "default".equals (space.peek ()); |
| if (!wrap) { |
| out.write (buf, offset, length); |
| column += length; |
| return; |
| } |
| |
| // we're pretty printing and want to fill lines out only |
| // to the desired line length. |
| while (length > 0) { |
| int target = lineLength - column; |
| boolean wrote = false; |
| |
| // Do we even have a problem? |
| if (target > length || noWrap) { |
| out.write (buf, offset, length); |
| column += length; |
| return; |
| } |
| |
| // break the line at a space character, trying to fill |
| // as much of the line as possible. |
| char c; |
| |
| for (int i = target - 1; i >= 0; i--) { |
| if ((c = buf [offset + i]) == ' ' || c == '\t') { |
| i++; |
| out.write (buf, offset, i); |
| doIndent (); |
| offset += i; |
| length -= i; |
| wrote = true; |
| break; |
| } |
| } |
| if (wrote) |
| continue; |
| |
| // no space character permitting break before target |
| // line length is filled. So, take the next one. |
| if (target < 0) |
| target = 0; |
| for (int i = target; i < length; i++) |
| if ((c = buf [offset + i]) == ' ' || c == '\t') { |
| i++; |
| out.write (buf, offset, i); |
| doIndent (); |
| offset += i; |
| length -= i; |
| wrote = true; |
| break; |
| } |
| if (wrote) |
| continue; |
| |
| // no such luck. |
| out.write (buf, offset, length); |
| column += length; |
| break; |
| } |
| } |
| } |