001    /*
002    // $Id: //open/util/resgen/src/org/eigenbase/xom/wrappers/Annotator.java#3 $
003    // Package org.eigenbase.xom is an XML Object Mapper.
004    // Copyright (C) 2008-2008 The Eigenbase Project
005    // Copyright (C) 2008-2008 Disruptive Tech
006    // Copyright (C) 2008-2008 LucidEra, Inc.
007    //
008    // This library is free software; you can redistribute it and/or modify it
009    // under the terms of the GNU Lesser General Public License as published by the
010    // Free Software Foundation; either version 2 of the License, or (at your
011    // option) any later version approved by The Eigenbase Project.
012    //
013    // This library is distributed in the hope that it will be useful,
014    // but WITHOUT ANY WARRANTY; without even the implied warranty of
015    // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016    // GNU Lesser General Public License for more details.
017    //
018    // You should have received a copy of the GNU Lesser General Public License
019    // along with this library; if not, write to the Free Software
020    // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
021    */
022    package org.eigenbase.xom.wrappers;
023    
024    import org.eigenbase.xom.*;
025    import org.w3c.dom.Node;
026    
027    import java.util.*;
028    import java.io.PrintWriter;
029    
030    /**
031     * Quick and dirty XML parser that finds the precise start and end
032     * position of all nodes in a document. Also finds all line endings, so
033     * that character offsets can be converted to line/column positions.
034     *
035     * @author jhyde
036     * @since 13 October, 2008
037     * @version $Id: //open/util/resgen/src/org/eigenbase/xom/wrappers/Annotator.java#3 $
038     */
039    public class Annotator {
040        private final List/*<LocInfo>*/ locInfoList = new ArrayList();
041        private int[] lineStartPositions;
042        private final String xml;
043        private final Map/*<DOMWrapper, LocInfo>*/ wrapperLocMap =
044            new HashMap();
045        private final Map/*<Node, LocInfo>*/ nodeLocMap = new HashMap();
046        private int seq; // workspace for populateMap
047    
048        /**
049         * Creates an Annotator.
050         *
051         * <p>For testing purposes, {@code wrapper} may be null. Parses the XML
052         * but does not build the mapping from location information to DOM nodes.
053         *
054         * @param xml XML source string
055         * @param def Wrapper around root DOM node
056         */
057        Annotator(String xml, DOMWrapper def) {
058            this.xml = xml;
059            parse(xml);
060            if (def != null) {
061                seq = 0;
062                populateMap(def);
063                assert this.nodeLocMap.size() == this.wrapperLocMap.size();
064            }
065        }
066    
067        public Location getLocation(DOMWrapper wrapper) {
068            LocInfo location0 = (LocInfo) wrapperLocMap.get(wrapper);
069            if (location0 == null) {
070                location0 = (Annotator.LocInfo)
071                    nodeLocMap.get(((W3CDOMWrapper) wrapper).node);
072                if (location0 == null) {
073                    return null;
074                }
075            }
076            final LocInfo location = location0;
077            return new Location() {
078                public int getStartLine() {
079                    return getLine(getStartPos()) + 1;
080                }
081    
082                public int getStartColumn() {
083                    return getCol(getStartPos()) + 1;
084                }
085    
086                public int getStartPos() {
087                    return location.startTagStartPos;
088                }
089    
090                public int getEndLine() {
091                    return getLine(getEndPos()) + 1;
092                }
093    
094                public int getEndColumn() {
095                    return getCol(getEndPos()) + 1;
096                }
097    
098                public int getEndPos() {
099                    return location.endTagEndPos >= 0
100                        ? location.endTagEndPos
101                        : location.startTagEndPos;
102                }
103    
104                public String toString() {
105                    return location.toString(Annotator.this);
106                }
107            };
108        }
109    
110        /**
111         * Returns the list of LocInfo. For testing.
112         *
113         * @return list of LocInfo.
114         */
115        List getLocInfoList() {
116            return locInfoList;
117        }
118    
119        // enum State
120        private static final int
121            STATE_NORMAL = 0,
122            STATE_TAG = 1,
123            STATE_ENDTAG = 2,
124            STATE_QUOT = 3,
125            STATE_APOS = 4,
126            STATE_COMMENT = 5,
127            STATE_CDATA = 6;
128    
129        void parse(String s)
130        {
131            final ArrayStack/*<LocInfo>*/ lockInfoStack = new ArrayStack();
132            final List lineStartPositions = new ArrayList();
133            int state = STATE_NORMAL;
134            final int count = s.length();
135            int i = 0;
136            int last = 0;
137            lineStartPositions.add(new Integer(i));
138            lockInfoStack.push(null);
139            LocInfo location = null;
140            loop:
141            while (i < count) {
142                final char c = s.charAt(i);
143                switch (c) {
144                case '<':
145                    stateSwitch:
146                    switch (state) {
147                    case STATE_NORMAL:
148                        if (i > last) {
149                            // Unlike other node types, we create the LocInfo
150                            // at the end of the element. No need to add the node
151                            // to the stack, because we'd just remove it again.
152                            LocInfo loc2 =
153                                new LocInfo(locInfoList.size(), TYPE_TEXT, last);
154                            loc2.endTagEndPos = i;
155                            locInfoList.add(loc2);
156                        }
157                        if (i + 1 < count) {
158                            final char c1 = s.charAt(i + 1);
159                            switch (c1) {
160                            case '/':
161                                // ^</Tag>
162                                state = STATE_ENDTAG;
163                                assert location != null;
164                                break stateSwitch;
165                            case '?':
166                                // ^<?xml ... ?>
167                                location =
168                                    new LocInfo(
169                                        locInfoList.size(),
170                                        TYPE_PROCESSING_INSTRUCTION, i);
171                                locInfoList.add(location);
172                                state = STATE_TAG;
173                                i += "<?".length();
174                                continue loop;
175                            case '!':
176                                if (s.startsWith("--", i + 2)) {
177                                    // ^<!--
178                                    location =
179                                        new LocInfo(
180                                            locInfoList.size(),
181                                            TYPE_COMMENT, i);
182                                    locInfoList.add(location);
183                                    state = STATE_COMMENT;
184                                    i += "<!--".length();
185                                    continue loop;
186                                }
187                                if (s.startsWith("[CDATA[", i + 2)) {
188                                    // ^<![CDATA[
189                                    location =
190                                        new LocInfo(
191                                            locInfoList.size(),
192                                            TYPE_CDATA_SECTION, i);
193                                    locInfoList.add(location);
194                                    state = STATE_CDATA;
195                                    i += "<![CDATA[".length();
196                                    continue loop;
197                                }
198                                break;
199                            }
200                        }
201                        // Start of an element,
202                        // ^<Tag a1=v a2=v>
203                        // Don't push until we see end of the head tag <Tag ... ^>
204                        state = STATE_TAG;
205                        location = new LocInfo(locInfoList.size(), TYPE_ELEMENT, i);
206                        locInfoList.add(location);
207                        ++i;
208                        continue loop;
209                    }
210                    break;
211    
212                case '>':
213                    switch (state) {
214                    case STATE_TAG:
215                        ++i;
216                        assert location != null;
217                        switch (location.type) {
218                        case TYPE_PROCESSING_INSTRUCTION:
219                            // <? ... ?^>
220                        case TYPE_CDATA_SECTION:
221                            // <![CDATA[ ... ]]^>
222                        case TYPE_COMMENT:
223                            // <!-- ... --^>
224                            location.endTagEndPos = i;
225                            location = (LocInfo) lockInfoStack.peek();
226                            break;
227                        default:
228                            // <Tag^>
229                            location.startTagEndPos = i;
230                            lockInfoStack.push(location);
231                            break;
232                        }
233                        last = i;
234                        state = STATE_NORMAL;
235                        continue loop;
236    
237                    case STATE_ENDTAG:
238                        // </Tag^>
239                        ++i;
240                        assert location != null;
241                        location.endTagEndPos = i;
242                        try {
243                            location = (LocInfo) lockInfoStack.pop();
244                        } catch (IndexOutOfBoundsException e) {
245                            throw new RuntimeException(
246                                "i=" + i + ", xml=" + xml.substring(i)
247                                    + ", nodeList=" + locInfoList,
248                                e);
249                        }
250                        last = i;
251                        state = STATE_NORMAL;
252                        continue loop;
253                    }
254                    break;
255    
256                case '/':
257                    switch (state) {
258                    case STATE_TAG:
259                        ++i;
260                        if (i < count && s.charAt(i) == '>') {
261                            // <Tag a1=v1 a2=v2 ^/>
262                            ++i;
263                            location.endTagEndPos = i;
264                            // no need to pop; we never pushed when we saw '<'
265                            location = (LocInfo) lockInfoStack.peek();
266                            last = i;
267                            state = STATE_NORMAL;
268                        }
269                        continue loop;
270                    }
271                    break;
272    
273                case ']':
274                    switch (state) {
275                    case STATE_CDATA:
276                        if (s.startsWith("]>", i + 1)) {
277                             // <![CDATA[ ... ^]]>
278                            state = STATE_NORMAL;
279                            i += "]]>".length();
280                            location.endTagEndPos = i;
281                            location = (LocInfo) lockInfoStack.peek();
282                            last = i;
283                            continue loop;
284                        }
285                    }
286                    break;
287    
288                case '-':
289                    switch (state) {
290                    case STATE_COMMENT:
291                        if (s.startsWith("->", i + 1)) {
292                            // <!-- xxxxx^-->
293                            i += "-->".length();
294                            location.endTagEndPos = i;
295                            last = i;
296                            location = (LocInfo) lockInfoStack.peek();
297                            state = STATE_NORMAL;
298                            continue loop;
299                        }
300                    }
301                    break;
302    
303                case '\r':
304                    ++i;
305                    if (i < count && s.charAt(i) == '\n') {
306                        // only count windows line ending CR LF as one line
307                        ++i;
308                    }
309                    lineStartPositions.add(new Integer(i));
310                    continue loop;
311    
312                case '\n':
313                    ++i;
314                    lineStartPositions.add(new Integer(i));
315                    continue loop;
316    
317                case '\'':
318                    switch (state) {
319                    case STATE_APOS:
320                        // a='xxx^'
321                        state = STATE_TAG;
322                        break;
323                    case STATE_TAG:
324                        // a=^'xxx'
325                        state = STATE_APOS;
326                        break;
327                    case STATE_QUOT:
328                        // a="doesn^'t matter"
329                    default:
330                        break;
331                    }
332                    break;
333    
334                case '"':
335                    switch (state) {
336                    case STATE_QUOT:
337                        // a="xxx^"
338                        state = STATE_TAG;
339                        break;
340                    case STATE_TAG:
341                        // a=^"xxx"
342                        state = STATE_QUOT;
343                        break;
344                    case STATE_APOS:
345                        // a='doesn^"t matter'
346                    default:
347                        break;
348                    }
349                    break;
350                }
351    
352                ++i;
353            }
354            this.lineStartPositions = new int[lineStartPositions.size()];
355            for (int j = 0; j < lineStartPositions.size(); j++) {
356                this.lineStartPositions[j] =
357                    ((Integer) lineStartPositions.get(j)).intValue();
358            }
359        }
360    
361        private void populateMap(DOMWrapper def)
362        {
363            final int defType = def.getType();
364            LocInfo location;
365            while (true) {
366                location = (LocInfo) locInfoList.get(seq++);
367                if (defType == DOMWrapper.ELEMENT
368                    && location.type == TYPE_ELEMENT)
369                {
370                    break;
371                }
372                if (defType == DOMWrapper.CDATA
373                    && location.type == TYPE_TEXT)
374                {
375                    break;
376                }
377                if (seq >= locInfoList.size()) {
378                    return;
379                }
380            }
381            wrapperLocMap.put(def, location);
382            nodeLocMap.put(((W3CDOMWrapper) def).node, location);
383            final DOMWrapper[] elementChildren = def.getElementChildren();
384            for (int i = 0; i < elementChildren.length; i++) {
385                DOMWrapper domWrapper = elementChildren[i];
386                populateMap(domWrapper);
387            }
388        }
389    
390        /**
391         * Returns the line that a character position falls on. The first line in a
392         * document is numbered 0.
393         *
394         * @param pos Character position
395         * @return Line (starting from 0)
396         */
397        int getLine(int pos)
398        {
399            int index = Arrays.binarySearch(lineStartPositions, pos);
400            if (index >= 0) {
401                return index;
402            } else {
403                return -2 - index;
404            }
405        }
406    
407        /**
408         * Returns the column that a character position falls on. The first column
409         * in a line is numbered 0.
410         *
411         * @param pos Character position
412         * @return column (starting from 0)
413         */
414        int getCol(int pos)
415        {
416            int index = Arrays.binarySearch(lineStartPositions, pos);
417            if (index >= 0) {
418                return 0;
419            } else {
420                index = -2 - index;
421                return pos - lineStartPositions[index];
422            }
423        }
424    
425        void list(PrintWriter pw)
426        {
427            for (int i = 0; i < locInfoList.size(); i++) {
428                LocInfo location = (LocInfo) locInfoList.get(i);
429                pw.println(
430                    location.seq + ": " + location.toString(this) + " ["
431                        + location.getText(xml) + "]");
432            }
433            pw.flush();
434        }
435    
436        // enum Type
437        private static final int
438            TYPE_ELEMENT = Node.ELEMENT_NODE,
439            TYPE_PROCESSING_INSTRUCTION = Node.PROCESSING_INSTRUCTION_NODE,
440            TYPE_COMMENT = Node.COMMENT_NODE,
441            TYPE_CDATA_SECTION = Node.CDATA_SECTION_NODE,
442            TYPE_TEXT = Node.TEXT_NODE;
443    
444        class LocInfo {
445            /** Sequence in document, ordered by start position (prefix order) */
446            final int seq;
447            /** Node type, typically {@link Node#ELEMENT_NODE}. */
448            final int startTagStartPos;
449            final int type;
450            int startTagEndPos = -1; // -1 if entity is a single tag
451            int endTagEndPos = -1;
452    
453            /**
454             * Creates a LocInfo.
455             *
456             * @param seq Sequence number in document
457             * @param nodeType Node type, typically {@link Node#ELEMENT_NODE}.
458             * @param startTagStartPos Position of start of element
459             */
460            LocInfo(int seq, int nodeType, int startTagStartPos) {
461                this.seq = seq;
462                this.type = nodeType;
463                this.startTagStartPos = startTagStartPos;
464            }
465    
466            public String toString(Annotator annotator) {
467                return "line " + annotator.getLine(startTagStartPos)
468                    + ", column " + annotator.getCol(startTagStartPos);
469            }
470    
471            /**
472             * Returns the fragment of source XML that this node encompasses.
473             *
474             * @param xml Whole source XML
475             * @return fragment of source XML
476             */
477            public String getText(String xml) {
478                return xml.substring(
479                    startTagStartPos,
480                    endTagEndPos >= 0 ? endTagEndPos
481                        : xml.length());
482            }
483    
484            /**
485             * Returns the fragment of source XML corresponding to the head tag
486             * of this element, if this is an element, otherwise the whole node.
487             *
488             * @param xml Whole source XML
489             * @return fragment of source XML
490             */
491            public String getHeadText(String xml) {
492                return xml.substring(
493                    startTagStartPos,
494                    startTagEndPos >= 0 ? startTagEndPos
495                        : endTagEndPos >= 0 ? endTagEndPos
496                            : xml.length());
497            }
498    
499            public String toString() {
500                return getHeadText(xml);
501            }
502        }
503    
504        /**
505         * Similar to {@link Stack} but based on {@link ArrayList} instead of
506         * {@link Vector}, and therefore more efficient.
507         */
508        private static class ArrayStack extends ArrayList {
509            public final void push(Object t)
510            {
511                if (false) System.out.println(size() + " push [" + t + "]");
512                add(t);
513            }
514    
515            public final Object peek()
516            {
517                return get(size() - 1);
518            }
519    
520            public final Object pop()
521            {
522                final int index = size() - 1;
523                Object t = remove(index);
524                if (false) System.out.println(size() + " pop  [" + t + "]");
525                return get(index - 1);
526            }
527        }
528    }
529    
530    // End Annotator.java