001 /*
002 // $Id: //open/util/resgen/src/org/eigenbase/xom/wrappers/Annotator.java#3 $
003 // Package org.eigenbase.xom is an XML Object Mapper.
004 // Copyright (C) 2008-2008 The Eigenbase Project
005 // Copyright (C) 2008-2008 Disruptive Tech
006 // Copyright (C) 2008-2008 LucidEra, Inc.
007 //
008 // This library is free software; you can redistribute it and/or modify it
009 // under the terms of the GNU Lesser General Public License as published by the
010 // Free Software Foundation; either version 2 of the License, or (at your
011 // option) any later version approved by The Eigenbase Project.
012 //
013 // This library is distributed in the hope that it will be useful,
014 // but WITHOUT ANY WARRANTY; without even the implied warranty of
015 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
016 // GNU Lesser General Public License for more details.
017 //
018 // You should have received a copy of the GNU Lesser General Public License
019 // along with this library; if not, write to the Free Software
020 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
021 */
022 package org.eigenbase.xom.wrappers;
023
024 import org.eigenbase.xom.*;
025 import org.w3c.dom.Node;
026
027 import java.util.*;
028 import java.io.PrintWriter;
029
030 /**
031 * Quick and dirty XML parser that finds the precise start and end
032 * position of all nodes in a document. Also finds all line endings, so
033 * that character offsets can be converted to line/column positions.
034 *
035 * @author jhyde
036 * @since 13 October, 2008
037 * @version $Id: //open/util/resgen/src/org/eigenbase/xom/wrappers/Annotator.java#3 $
038 */
039 public class Annotator {
040 private final List/*<LocInfo>*/ locInfoList = new ArrayList();
041 private int[] lineStartPositions;
042 private final String xml;
043 private final Map/*<DOMWrapper, LocInfo>*/ wrapperLocMap =
044 new HashMap();
045 private final Map/*<Node, LocInfo>*/ nodeLocMap = new HashMap();
046 private int seq; // workspace for populateMap
047
048 /**
049 * Creates an Annotator.
050 *
051 * <p>For testing purposes, {@code wrapper} may be null. Parses the XML
052 * but does not build the mapping from location information to DOM nodes.
053 *
054 * @param xml XML source string
055 * @param def Wrapper around root DOM node
056 */
057 Annotator(String xml, DOMWrapper def) {
058 this.xml = xml;
059 parse(xml);
060 if (def != null) {
061 seq = 0;
062 populateMap(def);
063 assert this.nodeLocMap.size() == this.wrapperLocMap.size();
064 }
065 }
066
067 public Location getLocation(DOMWrapper wrapper) {
068 LocInfo location0 = (LocInfo) wrapperLocMap.get(wrapper);
069 if (location0 == null) {
070 location0 = (Annotator.LocInfo)
071 nodeLocMap.get(((W3CDOMWrapper) wrapper).node);
072 if (location0 == null) {
073 return null;
074 }
075 }
076 final LocInfo location = location0;
077 return new Location() {
078 public int getStartLine() {
079 return getLine(getStartPos()) + 1;
080 }
081
082 public int getStartColumn() {
083 return getCol(getStartPos()) + 1;
084 }
085
086 public int getStartPos() {
087 return location.startTagStartPos;
088 }
089
090 public int getEndLine() {
091 return getLine(getEndPos()) + 1;
092 }
093
094 public int getEndColumn() {
095 return getCol(getEndPos()) + 1;
096 }
097
098 public int getEndPos() {
099 return location.endTagEndPos >= 0
100 ? location.endTagEndPos
101 : location.startTagEndPos;
102 }
103
104 public String toString() {
105 return location.toString(Annotator.this);
106 }
107 };
108 }
109
110 /**
111 * Returns the list of LocInfo. For testing.
112 *
113 * @return list of LocInfo.
114 */
115 List getLocInfoList() {
116 return locInfoList;
117 }
118
119 // enum State
120 private static final int
121 STATE_NORMAL = 0,
122 STATE_TAG = 1,
123 STATE_ENDTAG = 2,
124 STATE_QUOT = 3,
125 STATE_APOS = 4,
126 STATE_COMMENT = 5,
127 STATE_CDATA = 6;
128
129 void parse(String s)
130 {
131 final ArrayStack/*<LocInfo>*/ lockInfoStack = new ArrayStack();
132 final List lineStartPositions = new ArrayList();
133 int state = STATE_NORMAL;
134 final int count = s.length();
135 int i = 0;
136 int last = 0;
137 lineStartPositions.add(new Integer(i));
138 lockInfoStack.push(null);
139 LocInfo location = null;
140 loop:
141 while (i < count) {
142 final char c = s.charAt(i);
143 switch (c) {
144 case '<':
145 stateSwitch:
146 switch (state) {
147 case STATE_NORMAL:
148 if (i > last) {
149 // Unlike other node types, we create the LocInfo
150 // at the end of the element. No need to add the node
151 // to the stack, because we'd just remove it again.
152 LocInfo loc2 =
153 new LocInfo(locInfoList.size(), TYPE_TEXT, last);
154 loc2.endTagEndPos = i;
155 locInfoList.add(loc2);
156 }
157 if (i + 1 < count) {
158 final char c1 = s.charAt(i + 1);
159 switch (c1) {
160 case '/':
161 // ^</Tag>
162 state = STATE_ENDTAG;
163 assert location != null;
164 break stateSwitch;
165 case '?':
166 // ^<?xml ... ?>
167 location =
168 new LocInfo(
169 locInfoList.size(),
170 TYPE_PROCESSING_INSTRUCTION, i);
171 locInfoList.add(location);
172 state = STATE_TAG;
173 i += "<?".length();
174 continue loop;
175 case '!':
176 if (s.startsWith("--", i + 2)) {
177 // ^<!--
178 location =
179 new LocInfo(
180 locInfoList.size(),
181 TYPE_COMMENT, i);
182 locInfoList.add(location);
183 state = STATE_COMMENT;
184 i += "<!--".length();
185 continue loop;
186 }
187 if (s.startsWith("[CDATA[", i + 2)) {
188 // ^<![CDATA[
189 location =
190 new LocInfo(
191 locInfoList.size(),
192 TYPE_CDATA_SECTION, i);
193 locInfoList.add(location);
194 state = STATE_CDATA;
195 i += "<![CDATA[".length();
196 continue loop;
197 }
198 break;
199 }
200 }
201 // Start of an element,
202 // ^<Tag a1=v a2=v>
203 // Don't push until we see end of the head tag <Tag ... ^>
204 state = STATE_TAG;
205 location = new LocInfo(locInfoList.size(), TYPE_ELEMENT, i);
206 locInfoList.add(location);
207 ++i;
208 continue loop;
209 }
210 break;
211
212 case '>':
213 switch (state) {
214 case STATE_TAG:
215 ++i;
216 assert location != null;
217 switch (location.type) {
218 case TYPE_PROCESSING_INSTRUCTION:
219 // <? ... ?^>
220 case TYPE_CDATA_SECTION:
221 // <![CDATA[ ... ]]^>
222 case TYPE_COMMENT:
223 // <!-- ... --^>
224 location.endTagEndPos = i;
225 location = (LocInfo) lockInfoStack.peek();
226 break;
227 default:
228 // <Tag^>
229 location.startTagEndPos = i;
230 lockInfoStack.push(location);
231 break;
232 }
233 last = i;
234 state = STATE_NORMAL;
235 continue loop;
236
237 case STATE_ENDTAG:
238 // </Tag^>
239 ++i;
240 assert location != null;
241 location.endTagEndPos = i;
242 try {
243 location = (LocInfo) lockInfoStack.pop();
244 } catch (IndexOutOfBoundsException e) {
245 throw new RuntimeException(
246 "i=" + i + ", xml=" + xml.substring(i)
247 + ", nodeList=" + locInfoList,
248 e);
249 }
250 last = i;
251 state = STATE_NORMAL;
252 continue loop;
253 }
254 break;
255
256 case '/':
257 switch (state) {
258 case STATE_TAG:
259 ++i;
260 if (i < count && s.charAt(i) == '>') {
261 // <Tag a1=v1 a2=v2 ^/>
262 ++i;
263 location.endTagEndPos = i;
264 // no need to pop; we never pushed when we saw '<'
265 location = (LocInfo) lockInfoStack.peek();
266 last = i;
267 state = STATE_NORMAL;
268 }
269 continue loop;
270 }
271 break;
272
273 case ']':
274 switch (state) {
275 case STATE_CDATA:
276 if (s.startsWith("]>", i + 1)) {
277 // <![CDATA[ ... ^]]>
278 state = STATE_NORMAL;
279 i += "]]>".length();
280 location.endTagEndPos = i;
281 location = (LocInfo) lockInfoStack.peek();
282 last = i;
283 continue loop;
284 }
285 }
286 break;
287
288 case '-':
289 switch (state) {
290 case STATE_COMMENT:
291 if (s.startsWith("->", i + 1)) {
292 // <!-- xxxxx^-->
293 i += "-->".length();
294 location.endTagEndPos = i;
295 last = i;
296 location = (LocInfo) lockInfoStack.peek();
297 state = STATE_NORMAL;
298 continue loop;
299 }
300 }
301 break;
302
303 case '\r':
304 ++i;
305 if (i < count && s.charAt(i) == '\n') {
306 // only count windows line ending CR LF as one line
307 ++i;
308 }
309 lineStartPositions.add(new Integer(i));
310 continue loop;
311
312 case '\n':
313 ++i;
314 lineStartPositions.add(new Integer(i));
315 continue loop;
316
317 case '\'':
318 switch (state) {
319 case STATE_APOS:
320 // a='xxx^'
321 state = STATE_TAG;
322 break;
323 case STATE_TAG:
324 // a=^'xxx'
325 state = STATE_APOS;
326 break;
327 case STATE_QUOT:
328 // a="doesn^'t matter"
329 default:
330 break;
331 }
332 break;
333
334 case '"':
335 switch (state) {
336 case STATE_QUOT:
337 // a="xxx^"
338 state = STATE_TAG;
339 break;
340 case STATE_TAG:
341 // a=^"xxx"
342 state = STATE_QUOT;
343 break;
344 case STATE_APOS:
345 // a='doesn^"t matter'
346 default:
347 break;
348 }
349 break;
350 }
351
352 ++i;
353 }
354 this.lineStartPositions = new int[lineStartPositions.size()];
355 for (int j = 0; j < lineStartPositions.size(); j++) {
356 this.lineStartPositions[j] =
357 ((Integer) lineStartPositions.get(j)).intValue();
358 }
359 }
360
361 private void populateMap(DOMWrapper def)
362 {
363 final int defType = def.getType();
364 LocInfo location;
365 while (true) {
366 location = (LocInfo) locInfoList.get(seq++);
367 if (defType == DOMWrapper.ELEMENT
368 && location.type == TYPE_ELEMENT)
369 {
370 break;
371 }
372 if (defType == DOMWrapper.CDATA
373 && location.type == TYPE_TEXT)
374 {
375 break;
376 }
377 if (seq >= locInfoList.size()) {
378 return;
379 }
380 }
381 wrapperLocMap.put(def, location);
382 nodeLocMap.put(((W3CDOMWrapper) def).node, location);
383 final DOMWrapper[] elementChildren = def.getElementChildren();
384 for (int i = 0; i < elementChildren.length; i++) {
385 DOMWrapper domWrapper = elementChildren[i];
386 populateMap(domWrapper);
387 }
388 }
389
390 /**
391 * Returns the line that a character position falls on. The first line in a
392 * document is numbered 0.
393 *
394 * @param pos Character position
395 * @return Line (starting from 0)
396 */
397 int getLine(int pos)
398 {
399 int index = Arrays.binarySearch(lineStartPositions, pos);
400 if (index >= 0) {
401 return index;
402 } else {
403 return -2 - index;
404 }
405 }
406
407 /**
408 * Returns the column that a character position falls on. The first column
409 * in a line is numbered 0.
410 *
411 * @param pos Character position
412 * @return column (starting from 0)
413 */
414 int getCol(int pos)
415 {
416 int index = Arrays.binarySearch(lineStartPositions, pos);
417 if (index >= 0) {
418 return 0;
419 } else {
420 index = -2 - index;
421 return pos - lineStartPositions[index];
422 }
423 }
424
425 void list(PrintWriter pw)
426 {
427 for (int i = 0; i < locInfoList.size(); i++) {
428 LocInfo location = (LocInfo) locInfoList.get(i);
429 pw.println(
430 location.seq + ": " + location.toString(this) + " ["
431 + location.getText(xml) + "]");
432 }
433 pw.flush();
434 }
435
436 // enum Type
437 private static final int
438 TYPE_ELEMENT = Node.ELEMENT_NODE,
439 TYPE_PROCESSING_INSTRUCTION = Node.PROCESSING_INSTRUCTION_NODE,
440 TYPE_COMMENT = Node.COMMENT_NODE,
441 TYPE_CDATA_SECTION = Node.CDATA_SECTION_NODE,
442 TYPE_TEXT = Node.TEXT_NODE;
443
444 class LocInfo {
445 /** Sequence in document, ordered by start position (prefix order) */
446 final int seq;
447 /** Node type, typically {@link Node#ELEMENT_NODE}. */
448 final int startTagStartPos;
449 final int type;
450 int startTagEndPos = -1; // -1 if entity is a single tag
451 int endTagEndPos = -1;
452
453 /**
454 * Creates a LocInfo.
455 *
456 * @param seq Sequence number in document
457 * @param nodeType Node type, typically {@link Node#ELEMENT_NODE}.
458 * @param startTagStartPos Position of start of element
459 */
460 LocInfo(int seq, int nodeType, int startTagStartPos) {
461 this.seq = seq;
462 this.type = nodeType;
463 this.startTagStartPos = startTagStartPos;
464 }
465
466 public String toString(Annotator annotator) {
467 return "line " + annotator.getLine(startTagStartPos)
468 + ", column " + annotator.getCol(startTagStartPos);
469 }
470
471 /**
472 * Returns the fragment of source XML that this node encompasses.
473 *
474 * @param xml Whole source XML
475 * @return fragment of source XML
476 */
477 public String getText(String xml) {
478 return xml.substring(
479 startTagStartPos,
480 endTagEndPos >= 0 ? endTagEndPos
481 : xml.length());
482 }
483
484 /**
485 * Returns the fragment of source XML corresponding to the head tag
486 * of this element, if this is an element, otherwise the whole node.
487 *
488 * @param xml Whole source XML
489 * @return fragment of source XML
490 */
491 public String getHeadText(String xml) {
492 return xml.substring(
493 startTagStartPos,
494 startTagEndPos >= 0 ? startTagEndPos
495 : endTagEndPos >= 0 ? endTagEndPos
496 : xml.length());
497 }
498
499 public String toString() {
500 return getHeadText(xml);
501 }
502 }
503
504 /**
505 * Similar to {@link Stack} but based on {@link ArrayList} instead of
506 * {@link Vector}, and therefore more efficient.
507 */
508 private static class ArrayStack extends ArrayList {
509 public final void push(Object t)
510 {
511 if (false) System.out.println(size() + " push [" + t + "]");
512 add(t);
513 }
514
515 public final Object peek()
516 {
517 return get(size() - 1);
518 }
519
520 public final Object pop()
521 {
522 final int index = size() - 1;
523 Object t = remove(index);
524 if (false) System.out.println(size() + " pop [" + t + "]");
525 return get(index - 1);
526 }
527 }
528 }
529
530 // End Annotator.java