001 /* 002 // $Id: //open/util/resgen/src/org/eigenbase/xom/wrappers/Annotator.java#3 $ 003 // Package org.eigenbase.xom is an XML Object Mapper. 004 // Copyright (C) 2008-2008 The Eigenbase Project 005 // Copyright (C) 2008-2008 Disruptive Tech 006 // Copyright (C) 2008-2008 LucidEra, Inc. 007 // 008 // This library is free software; you can redistribute it and/or modify it 009 // under the terms of the GNU Lesser General Public License as published by the 010 // Free Software Foundation; either version 2 of the License, or (at your 011 // option) any later version approved by The Eigenbase Project. 012 // 013 // This library is distributed in the hope that it will be useful, 014 // but WITHOUT ANY WARRANTY; without even the implied warranty of 015 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 // GNU Lesser General Public License for more details. 017 // 018 // You should have received a copy of the GNU Lesser General Public License 019 // along with this library; if not, write to the Free Software 020 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 021 */ 022 package org.eigenbase.xom.wrappers; 023 024 import org.eigenbase.xom.*; 025 import org.w3c.dom.Node; 026 027 import java.util.*; 028 import java.io.PrintWriter; 029 030 /** 031 * Quick and dirty XML parser that finds the precise start and end 032 * position of all nodes in a document. Also finds all line endings, so 033 * that character offsets can be converted to line/column positions. 034 * 035 * @author jhyde 036 * @since 13 October, 2008 037 * @version $Id: //open/util/resgen/src/org/eigenbase/xom/wrappers/Annotator.java#3 $ 038 */ 039 public class Annotator { 040 private final List/*<LocInfo>*/ locInfoList = new ArrayList(); 041 private int[] lineStartPositions; 042 private final String xml; 043 private final Map/*<DOMWrapper, LocInfo>*/ wrapperLocMap = 044 new HashMap(); 045 private final Map/*<Node, LocInfo>*/ nodeLocMap = new HashMap(); 046 private int seq; // workspace for populateMap 047 048 /** 049 * Creates an Annotator. 050 * 051 * <p>For testing purposes, {@code wrapper} may be null. Parses the XML 052 * but does not build the mapping from location information to DOM nodes. 053 * 054 * @param xml XML source string 055 * @param def Wrapper around root DOM node 056 */ 057 Annotator(String xml, DOMWrapper def) { 058 this.xml = xml; 059 parse(xml); 060 if (def != null) { 061 seq = 0; 062 populateMap(def); 063 assert this.nodeLocMap.size() == this.wrapperLocMap.size(); 064 } 065 } 066 067 public Location getLocation(DOMWrapper wrapper) { 068 LocInfo location0 = (LocInfo) wrapperLocMap.get(wrapper); 069 if (location0 == null) { 070 location0 = (Annotator.LocInfo) 071 nodeLocMap.get(((W3CDOMWrapper) wrapper).node); 072 if (location0 == null) { 073 return null; 074 } 075 } 076 final LocInfo location = location0; 077 return new Location() { 078 public int getStartLine() { 079 return getLine(getStartPos()) + 1; 080 } 081 082 public int getStartColumn() { 083 return getCol(getStartPos()) + 1; 084 } 085 086 public int getStartPos() { 087 return location.startTagStartPos; 088 } 089 090 public int getEndLine() { 091 return getLine(getEndPos()) + 1; 092 } 093 094 public int getEndColumn() { 095 return getCol(getEndPos()) + 1; 096 } 097 098 public int getEndPos() { 099 return location.endTagEndPos >= 0 100 ? location.endTagEndPos 101 : location.startTagEndPos; 102 } 103 104 public String toString() { 105 return location.toString(Annotator.this); 106 } 107 }; 108 } 109 110 /** 111 * Returns the list of LocInfo. For testing. 112 * 113 * @return list of LocInfo. 114 */ 115 List getLocInfoList() { 116 return locInfoList; 117 } 118 119 // enum State 120 private static final int 121 STATE_NORMAL = 0, 122 STATE_TAG = 1, 123 STATE_ENDTAG = 2, 124 STATE_QUOT = 3, 125 STATE_APOS = 4, 126 STATE_COMMENT = 5, 127 STATE_CDATA = 6; 128 129 void parse(String s) 130 { 131 final ArrayStack/*<LocInfo>*/ lockInfoStack = new ArrayStack(); 132 final List lineStartPositions = new ArrayList(); 133 int state = STATE_NORMAL; 134 final int count = s.length(); 135 int i = 0; 136 int last = 0; 137 lineStartPositions.add(new Integer(i)); 138 lockInfoStack.push(null); 139 LocInfo location = null; 140 loop: 141 while (i < count) { 142 final char c = s.charAt(i); 143 switch (c) { 144 case '<': 145 stateSwitch: 146 switch (state) { 147 case STATE_NORMAL: 148 if (i > last) { 149 // Unlike other node types, we create the LocInfo 150 // at the end of the element. No need to add the node 151 // to the stack, because we'd just remove it again. 152 LocInfo loc2 = 153 new LocInfo(locInfoList.size(), TYPE_TEXT, last); 154 loc2.endTagEndPos = i; 155 locInfoList.add(loc2); 156 } 157 if (i + 1 < count) { 158 final char c1 = s.charAt(i + 1); 159 switch (c1) { 160 case '/': 161 // ^</Tag> 162 state = STATE_ENDTAG; 163 assert location != null; 164 break stateSwitch; 165 case '?': 166 // ^<?xml ... ?> 167 location = 168 new LocInfo( 169 locInfoList.size(), 170 TYPE_PROCESSING_INSTRUCTION, i); 171 locInfoList.add(location); 172 state = STATE_TAG; 173 i += "<?".length(); 174 continue loop; 175 case '!': 176 if (s.startsWith("--", i + 2)) { 177 // ^<!-- 178 location = 179 new LocInfo( 180 locInfoList.size(), 181 TYPE_COMMENT, i); 182 locInfoList.add(location); 183 state = STATE_COMMENT; 184 i += "<!--".length(); 185 continue loop; 186 } 187 if (s.startsWith("[CDATA[", i + 2)) { 188 // ^<![CDATA[ 189 location = 190 new LocInfo( 191 locInfoList.size(), 192 TYPE_CDATA_SECTION, i); 193 locInfoList.add(location); 194 state = STATE_CDATA; 195 i += "<![CDATA[".length(); 196 continue loop; 197 } 198 break; 199 } 200 } 201 // Start of an element, 202 // ^<Tag a1=v a2=v> 203 // Don't push until we see end of the head tag <Tag ... ^> 204 state = STATE_TAG; 205 location = new LocInfo(locInfoList.size(), TYPE_ELEMENT, i); 206 locInfoList.add(location); 207 ++i; 208 continue loop; 209 } 210 break; 211 212 case '>': 213 switch (state) { 214 case STATE_TAG: 215 ++i; 216 assert location != null; 217 switch (location.type) { 218 case TYPE_PROCESSING_INSTRUCTION: 219 // <? ... ?^> 220 case TYPE_CDATA_SECTION: 221 // <![CDATA[ ... ]]^> 222 case TYPE_COMMENT: 223 // <!-- ... --^> 224 location.endTagEndPos = i; 225 location = (LocInfo) lockInfoStack.peek(); 226 break; 227 default: 228 // <Tag^> 229 location.startTagEndPos = i; 230 lockInfoStack.push(location); 231 break; 232 } 233 last = i; 234 state = STATE_NORMAL; 235 continue loop; 236 237 case STATE_ENDTAG: 238 // </Tag^> 239 ++i; 240 assert location != null; 241 location.endTagEndPos = i; 242 try { 243 location = (LocInfo) lockInfoStack.pop(); 244 } catch (IndexOutOfBoundsException e) { 245 throw new RuntimeException( 246 "i=" + i + ", xml=" + xml.substring(i) 247 + ", nodeList=" + locInfoList, 248 e); 249 } 250 last = i; 251 state = STATE_NORMAL; 252 continue loop; 253 } 254 break; 255 256 case '/': 257 switch (state) { 258 case STATE_TAG: 259 ++i; 260 if (i < count && s.charAt(i) == '>') { 261 // <Tag a1=v1 a2=v2 ^/> 262 ++i; 263 location.endTagEndPos = i; 264 // no need to pop; we never pushed when we saw '<' 265 location = (LocInfo) lockInfoStack.peek(); 266 last = i; 267 state = STATE_NORMAL; 268 } 269 continue loop; 270 } 271 break; 272 273 case ']': 274 switch (state) { 275 case STATE_CDATA: 276 if (s.startsWith("]>", i + 1)) { 277 // <![CDATA[ ... ^]]> 278 state = STATE_NORMAL; 279 i += "]]>".length(); 280 location.endTagEndPos = i; 281 location = (LocInfo) lockInfoStack.peek(); 282 last = i; 283 continue loop; 284 } 285 } 286 break; 287 288 case '-': 289 switch (state) { 290 case STATE_COMMENT: 291 if (s.startsWith("->", i + 1)) { 292 // <!-- xxxxx^--> 293 i += "-->".length(); 294 location.endTagEndPos = i; 295 last = i; 296 location = (LocInfo) lockInfoStack.peek(); 297 state = STATE_NORMAL; 298 continue loop; 299 } 300 } 301 break; 302 303 case '\r': 304 ++i; 305 if (i < count && s.charAt(i) == '\n') { 306 // only count windows line ending CR LF as one line 307 ++i; 308 } 309 lineStartPositions.add(new Integer(i)); 310 continue loop; 311 312 case '\n': 313 ++i; 314 lineStartPositions.add(new Integer(i)); 315 continue loop; 316 317 case '\'': 318 switch (state) { 319 case STATE_APOS: 320 // a='xxx^' 321 state = STATE_TAG; 322 break; 323 case STATE_TAG: 324 // a=^'xxx' 325 state = STATE_APOS; 326 break; 327 case STATE_QUOT: 328 // a="doesn^'t matter" 329 default: 330 break; 331 } 332 break; 333 334 case '"': 335 switch (state) { 336 case STATE_QUOT: 337 // a="xxx^" 338 state = STATE_TAG; 339 break; 340 case STATE_TAG: 341 // a=^"xxx" 342 state = STATE_QUOT; 343 break; 344 case STATE_APOS: 345 // a='doesn^"t matter' 346 default: 347 break; 348 } 349 break; 350 } 351 352 ++i; 353 } 354 this.lineStartPositions = new int[lineStartPositions.size()]; 355 for (int j = 0; j < lineStartPositions.size(); j++) { 356 this.lineStartPositions[j] = 357 ((Integer) lineStartPositions.get(j)).intValue(); 358 } 359 } 360 361 private void populateMap(DOMWrapper def) 362 { 363 final int defType = def.getType(); 364 LocInfo location; 365 while (true) { 366 location = (LocInfo) locInfoList.get(seq++); 367 if (defType == DOMWrapper.ELEMENT 368 && location.type == TYPE_ELEMENT) 369 { 370 break; 371 } 372 if (defType == DOMWrapper.CDATA 373 && location.type == TYPE_TEXT) 374 { 375 break; 376 } 377 if (seq >= locInfoList.size()) { 378 return; 379 } 380 } 381 wrapperLocMap.put(def, location); 382 nodeLocMap.put(((W3CDOMWrapper) def).node, location); 383 final DOMWrapper[] elementChildren = def.getElementChildren(); 384 for (int i = 0; i < elementChildren.length; i++) { 385 DOMWrapper domWrapper = elementChildren[i]; 386 populateMap(domWrapper); 387 } 388 } 389 390 /** 391 * Returns the line that a character position falls on. The first line in a 392 * document is numbered 0. 393 * 394 * @param pos Character position 395 * @return Line (starting from 0) 396 */ 397 int getLine(int pos) 398 { 399 int index = Arrays.binarySearch(lineStartPositions, pos); 400 if (index >= 0) { 401 return index; 402 } else { 403 return -2 - index; 404 } 405 } 406 407 /** 408 * Returns the column that a character position falls on. The first column 409 * in a line is numbered 0. 410 * 411 * @param pos Character position 412 * @return column (starting from 0) 413 */ 414 int getCol(int pos) 415 { 416 int index = Arrays.binarySearch(lineStartPositions, pos); 417 if (index >= 0) { 418 return 0; 419 } else { 420 index = -2 - index; 421 return pos - lineStartPositions[index]; 422 } 423 } 424 425 void list(PrintWriter pw) 426 { 427 for (int i = 0; i < locInfoList.size(); i++) { 428 LocInfo location = (LocInfo) locInfoList.get(i); 429 pw.println( 430 location.seq + ": " + location.toString(this) + " [" 431 + location.getText(xml) + "]"); 432 } 433 pw.flush(); 434 } 435 436 // enum Type 437 private static final int 438 TYPE_ELEMENT = Node.ELEMENT_NODE, 439 TYPE_PROCESSING_INSTRUCTION = Node.PROCESSING_INSTRUCTION_NODE, 440 TYPE_COMMENT = Node.COMMENT_NODE, 441 TYPE_CDATA_SECTION = Node.CDATA_SECTION_NODE, 442 TYPE_TEXT = Node.TEXT_NODE; 443 444 class LocInfo { 445 /** Sequence in document, ordered by start position (prefix order) */ 446 final int seq; 447 /** Node type, typically {@link Node#ELEMENT_NODE}. */ 448 final int startTagStartPos; 449 final int type; 450 int startTagEndPos = -1; // -1 if entity is a single tag 451 int endTagEndPos = -1; 452 453 /** 454 * Creates a LocInfo. 455 * 456 * @param seq Sequence number in document 457 * @param nodeType Node type, typically {@link Node#ELEMENT_NODE}. 458 * @param startTagStartPos Position of start of element 459 */ 460 LocInfo(int seq, int nodeType, int startTagStartPos) { 461 this.seq = seq; 462 this.type = nodeType; 463 this.startTagStartPos = startTagStartPos; 464 } 465 466 public String toString(Annotator annotator) { 467 return "line " + annotator.getLine(startTagStartPos) 468 + ", column " + annotator.getCol(startTagStartPos); 469 } 470 471 /** 472 * Returns the fragment of source XML that this node encompasses. 473 * 474 * @param xml Whole source XML 475 * @return fragment of source XML 476 */ 477 public String getText(String xml) { 478 return xml.substring( 479 startTagStartPos, 480 endTagEndPos >= 0 ? endTagEndPos 481 : xml.length()); 482 } 483 484 /** 485 * Returns the fragment of source XML corresponding to the head tag 486 * of this element, if this is an element, otherwise the whole node. 487 * 488 * @param xml Whole source XML 489 * @return fragment of source XML 490 */ 491 public String getHeadText(String xml) { 492 return xml.substring( 493 startTagStartPos, 494 startTagEndPos >= 0 ? startTagEndPos 495 : endTagEndPos >= 0 ? endTagEndPos 496 : xml.length()); 497 } 498 499 public String toString() { 500 return getHeadText(xml); 501 } 502 } 503 504 /** 505 * Similar to {@link Stack} but based on {@link ArrayList} instead of 506 * {@link Vector}, and therefore more efficient. 507 */ 508 private static class ArrayStack extends ArrayList { 509 public final void push(Object t) 510 { 511 if (false) System.out.println(size() + " push [" + t + "]"); 512 add(t); 513 } 514 515 public final Object peek() 516 { 517 return get(size() - 1); 518 } 519 520 public final Object pop() 521 { 522 final int index = size() - 1; 523 Object t = remove(index); 524 if (false) System.out.println(size() + " pop [" + t + "]"); 525 return get(index - 1); 526 } 527 } 528 } 529 530 // End Annotator.java