001    /*
002    // $Id: //open/util/resgen/src/org/eigenbase/xom/XMLUtil.java#5 $
003    // Package org.eigenbase.xom is an XML Object Mapper.
004    // Copyright (C) 2005-2005 The Eigenbase Project
005    // Copyright (C) 2005-2005 Disruptive Tech
006    // Copyright (C) 2005-2005 LucidEra, Inc.
007    // Portions Copyright (C) 2001-2005 Kana Software, Inc. and others.
008    //
009    // This library is free software; you can redistribute it and/or modify it
010    // under the terms of the GNU Lesser General Public License as published by the
011    // Free Software Foundation; either version 2 of the License, or (at your
012    // option) any later version approved by The Eigenbase Project.
013    //
014    // This library is distributed in the hope that it will be useful,
015    // but WITHOUT ANY WARRANTY; without even the implied warranty of
016    // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
017    // GNU Lesser General Public License for more details.
018    //
019    // You should have received a copy of the GNU Lesser General Public License
020    // along with this library; if not, write to the Free Software
021    // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
022    //
023    // jhyde, 3 October, 2001
024    */
025    
026    package org.eigenbase.xom;
027    import java.io.IOException;
028    import java.io.PrintWriter;
029    import java.io.Reader;
030    
031    /**
032     * Utilities for dealing with XML data.  These methods must NOT depend upon any
033     * XML parser or object model (MSXML, DOM, SAX, etc.)
034     *
035     * @author jhyde
036     * @since 3 October, 2001
037     * @version $Id: //open/util/resgen/src/org/eigenbase/xom/XMLUtil.java#5 $
038     **/
039    public class XMLUtil {
040    
041        /**
042         * Determine if a String contains any XML special characters, return true
043         * if it does.  If this function returns true, the string will need to be
044         * encoded either using the stringEncodeXML function above or using a
045         * CDATA section.  Note that MSXML has a nasty bug whereby whitespace
046         * characters outside of a CDATA section are lost when parsing.  To
047         * avoid hitting this bug, this method treats many whitespace characters
048         * as "special".
049         * @param input the String to scan for XML special characters.
050         * @return true if the String contains any such characters.
051         */
052        public static boolean stringHasXMLSpecials(String input)
053        {
054            for (int i = 0; i < input.length(); i++) {
055                char c = input.charAt(i);
056                switch (c) {
057                case '<':
058                case '>':
059                case '"':
060                case '\'':
061                case '&':
062                case '\t':
063                case '\n':
064                case '\r':
065                    return true;
066                }
067            }
068            return false;
069        }
070    
071        /**
072         * Encode a String for XML output, displaying it to a PrintWriter.
073         * The String to be encoded is displayed, except that
074         * special characters are converted into entities.
075         * @param input a String to convert.
076         * @param out a PrintWriter to which to write the results.
077         */
078        public static void stringEncodeXML(String input, PrintWriter out)
079        {
080            for (int i = 0; i < input.length(); i++) {
081                char c = input.charAt(i);
082                switch (c) {
083                case '<':
084                case '>':
085                case '"':
086                case '\'':
087                case '&':
088                case '\t':
089                case '\n':
090                case '\r':
091                    out.print("&#" + (int)c + ";");
092                    break;
093                default:
094                    out.print(c);
095                }
096            }
097        }
098    
099        /**
100         * Quote a string, and write to a {@link PrintWriter}.
101         *
102         * <p>For example, <code>"a string"</code> becomes <code>&lt![CDATA[a
103         * string]]&gt;</code>.  If the string contains ']]&gt;' (which commonly
104         * occurs when wrapping other XML documents), we give up on using
105         * <code>&lt![CDATA[</code> ... <code>]]&gt;</code>, and just encode the
106         * string.  For example, <code>"A string with ]]&gt; in it"</code> becomes
107         * <code>"A string with ]]&amp;gt; in it"</code>.</p>
108         **/
109        public static void printPCDATA(PrintWriter pw, String data)
110        {
111            if (data.indexOf("]]>") > -1) {
112                String s = StringEscaper.xmlEscaper.escapeString(data);
113                pw.print(s);
114            } else {
115                pw.print("<![CDATA[");
116                pw.print(data);
117                pw.print("]]>");
118            }
119        }
120    
121        /**
122         * Quote a string.
123         *
124         * @see #printPCDATA(PrintWriter,String)
125         **/
126        public static String quotePCDATA(String data)
127        {
128            if (data.indexOf("]]>") > -1) {
129                return StringEscaper.xmlEscaper.escapeString(data);
130            } else {
131                return "<![CDATA[" + data + "]]>";
132            }
133        }
134    
135        /**
136         * Quote a string in an element and a CDATA, and write to a {@link
137         * PrintWriter}.  For example, it <code>tag</code> is "Value", then
138         * <code>"a string"</code> becomes <code>&ltValue&gt;&lt![CDATA[a
139         * string]]&gt;&lt/Value&gt;.
140         *
141         * @param newline whether to print a newline after the element
142         * @see #printPCDATA(PrintWriter,String)
143         **/
144        public static void printPCDATA(
145            PrintWriter pw, String tag, String data, boolean newline)
146        {
147            if (data == null || data.length() == 0) {
148                return;
149            }
150            pw.print("<");
151            pw.print(tag);
152            pw.print(">");
153            printPCDATA(pw,data);
154            pw.print("</");
155            pw.print(tag);
156            pw.print(">");
157            if (newline) {
158                pw.println();
159            }
160        }
161    
162        public static void printPCDATA(PrintWriter pw, String tag, String data)
163        {
164            boolean newline = false;
165            printPCDATA(pw, tag, data, newline);
166        }
167    
168        private static String escapeForQuoting(String val)
169        {
170            return StringEscaper.xmlNumericEscaper.escapeString(val);
171        }
172    
173        /** Quote a string so that it can be included as an XML attribute value. */
174        public static String quoteAtt(String val)
175        {
176            return "\"" + escapeForQuoting(val) + "\"";
177        }
178    
179        /** Return an XML attribute/value pair for String val */
180        public static String quoteAtt(String name, String val)
181        {
182            if ((val == null) || val.equals("")) {
183                return "";
184            }
185            return " " + name + "=" + quoteAtt(val);
186        }
187    
188        /** Return an XML attribute/value pair for int val */
189        public static String quoteAtt(String name, int val)
190        {
191            return " " + name + "=\"" + val + "\"";
192        }
193    
194        /** Return an XML attribute/value pair for boolean val */
195        public static String quoteAtt(String name, boolean val)
196        {
197            return " " + name + "=\"" + (val ? "TRUE" : "FALSE") + "\"";
198        }
199    
200        /** Quote a string so that it can be included as an XML attribute value. */
201        public static void printAtt(PrintWriter pw, String val)
202        {
203            pw.print("\"");
204            pw.print(escapeForQuoting(val));
205            pw.print("\"");
206        }
207    
208        /** Print an XML attribute name and value for string val */
209        public static void printAtt(PrintWriter pw, String name, String val)
210        {
211            if (val != null /* && !val.equals("") */) {
212                pw.print(" ");
213                pw.print(name);
214                pw.print("=\"");
215                pw.print(escapeForQuoting(val));
216                pw.print("\"");
217            }
218        }
219    
220        /** Print an XML attribute name and value for int val */
221        public static void printAtt(PrintWriter pw, String name, int val)
222        {
223            pw.print(" ");
224            pw.print(name);
225            pw.print("=\"");
226            pw.print(val);
227            pw.print("\"");
228        }
229    
230        /** Print an XML attribute name and value for boolean val */
231        public static void printAtt(PrintWriter pw, String name, boolean val)
232        {
233            pw.print(" ");
234            pw.print(name);
235            pw.print(val ? "=\"true\"" : "=\"false\"");
236        }
237    
238        /**
239         * Retrieve the name of the first tag in the XML document specified by the
240         * given Reader, without parsing the full file/string.  This function is
241         * useful to identify the DocType of an XML document before parsing,
242         * possibly to send the document off to different pieces of code.
243         * For performance reasons, the function attempts to read as little of
244         * the file or string as possible before making its decision about the
245         * first tag.  Leading comments are ignored.
246         * @param xml a Reader containing an XML document.
247         * @return the first tag name, as a String, or null if no first tag
248         * can be found.
249         */
250        public static String getFirstTagName(Reader xml)
251        {
252            final int OUTSIDE = 0;  // constant: identify outside state
253            final int BRACKET = 1;  // constant: bracket, contents unknown
254            final int COMMENT = 2;  // constant: identify a comment section
255            final int IGNORE = 3;   // constant: identify an ignored section
256            final int TAG = 4;      // constant: identify a tag section
257    
258            int state = OUTSIDE;
259            String commentMatch = null;
260            StringBuffer tagBuffer = null;
261            boolean sawBang = false;
262    
263            try {
264                int c = xml.read();
265                for (;;) {
266                    // No tag found if we hit EOF first.
267                    if (c == -1) {
268                        return null;
269                    }
270                    switch (state) {
271                    case OUTSIDE:
272                        // Start of any sort of tag
273                        if (c == '<') {
274                            state = BRACKET;
275                            commentMatch = "!--";
276                            sawBang = false;
277                            c = xml.read();
278    
279                            // Other non-whitespace characters outside of any tag
280                        } else if (!Character.isWhitespace((char) c)) {
281                            return null;
282    
283                            // Whitespace characters are ignored
284                        } else {
285                            c = xml.read();
286                        }
287                        break;
288    
289                    case BRACKET:
290                        // Check for the start of a comment.
291                        if (commentMatch != null) {
292                            if (c == commentMatch.charAt(0)) {
293                                // This match indicates a comment
294                                if (commentMatch.length() == 1) {
295                                    c = xml.read();
296                                    commentMatch = "-->";
297                                    state = COMMENT;
298                                } else {
299                                    // Remove the first character from commentMatch,
300                                    // then process the character as usual.
301                                    commentMatch =
302                                        commentMatch.substring(1, commentMatch.length());
303                                }
304                            } else {
305                                // No longer eligible for comment.
306                                commentMatch = null;
307                            }
308                        }
309    
310                        // Hit whitespace; ignore the character.
311                        if (Character.isWhitespace((char) c)) {
312                            c = xml.read();
313                            break;
314                        }
315    
316                        switch (c) {
317                        case '?':
318                            c = xml.read();
319                            state = IGNORE;
320                            break;
321                        case '!':
322                            // Enter an ignored section unless eligible for comment.
323                            c = xml.read();
324                            sawBang = true;
325                            if (commentMatch == null) {
326                                state = IGNORE;
327                            }
328                            break;
329                        case '-':
330                            // Enter an ignored section unless eligible for comment.
331                            c = xml.read();
332                            if (commentMatch == null) {
333                                state = IGNORE;
334                            }
335                            break;
336                        case '>':
337                            // Return to OUTSIDE state immediately
338                            c = xml.read();
339                            state = OUTSIDE;
340                            break;
341                        default:
342                            // State depends on whether we saw a ! or not.
343                            if (sawBang) {
344                                state = IGNORE;
345                            } else {
346                                state = TAG;
347                            }
348                            tagBuffer = new StringBuffer();
349                        }
350                        break;
351    
352                    case COMMENT:
353                        // Did we match the next expected end-of-comment character?
354                        if (c == commentMatch.charAt(0)) {
355                            c = xml.read();
356                            if (commentMatch.length() == 1) {
357                                // Done with the comment
358                                state = OUTSIDE;
359                            } else {
360                                commentMatch =
361                                    commentMatch.substring(1, commentMatch.length());
362                            }
363                        } else {
364                            // If not, restart our quest for the end-of-comment character.
365                            c = xml.read();
366                            commentMatch = "-->";
367                        }
368                        break;
369    
370                    case IGNORE:
371                        // Drop out on a close >.  Ignore all other characters.
372                        if (c == '>') {
373                            c = xml.read();
374                            state = OUTSIDE;
375                        } else {
376                            c = xml.read();
377                        }
378                        break;
379    
380                    case TAG:
381                        // Store characters in the tag buffer until we hit whitespace.
382                        // When we hit whitespace or '>' or '/', return the name of the tag.
383                        if (Character.isWhitespace((char)c) || c == '>'
384                            || c == '/') {
385                            return tagBuffer.toString();
386                        } else {
387                            tagBuffer.append((char)c);
388                            c = xml.read();
389                        }
390                        break;
391                    }
392                }
393            } catch (IOException ex) {
394                // On exception, we can't determine the first tag, so return null.
395                return null;
396            }
397        }
398    }
399    
400    
401    // End XMLUtil.java