/**
 * @fileoverview A class to stream over XML documents or XML
 * fragments.
 */

goog.provide('xrx.stream');



goog.require('goog.string');
goog.require('xrx.location');
goog.require('xrx.reader');
goog.require('xrx.token');



/**
 * A class to stream over XML documents or XML fragments.
 * <br/>
 * <br/>
 * <b>IMPORTANT NOTE: This class represents an XML streamer and not an
 * XML parser! The streamer is different from an XML parser in the
 * following respects:</b>
 * <br/>
 * <br/>
 * <li>the XML input document or fragment must be well-formed before
 * streaming starts. The streamer itself does not do any well-formedness
 * checks
 * <li>the streamer expects the XML document or fragment serialized
 * as a string
 * <li>the string must be encoded in UTF-8
 * <li>whitespace must already be normalized and collapsed before
 * streaming starts
 * <li>the streamer expects the XML document without any indentation
 * <br/><br/>
 * These restrictions are intended by design, to reach optimal
 * performance and to reach full XML support in browsers. For more
 * background about parsing see e.g.:
 * <li><a href="http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=4623219">
 * XML Document Parsing: Operational and Performance Characteristics</a>
 * <br/>
 * <br/>
 * <b>String conversion, encoding conversion, whitespace
 * normalization as well as indentation can best be prepared with the
 * XQuery and XPath 3.0 serialization feature. Example XQuery script:</b>
 * <pre>---
 *xquery version "3.0";
 *
 *declare option output:method "xml";
 *declare option output:encoding "UTF-8";
 *declare option output:indent "no";
 *
 *declare variable $xml := <someXml/>;
 *
 *fn:serialize($xml)
 *---</pre>
 * The output of this XQuery script is exactly what the streamer expects.
 * <br/>
 * <br/>
 * See also:
 * <li><a href="http://www.w3.org/TR/xslt-xquery-serialization-30/">
 * XSLT and XQuery Serialization 3.0</a>
 * <li><a href="../../src/agent/xrx2html.xql">XRX++ XQuery Agent</a>
 * <li><a href="../../src/agent/xrx2html.xsl">XRX++ XSLT Agent (For
 * development only, only runs in modern browsers with full XML
 * support)</a>
 *
 * @param {!string} xml A well-formed, normalized XML document or
 *     XML fragment serialized as UTF-8 string.
 * @constructor
 */
xrx.stream = function(xml) {

  /**
   * The reader that walks over the XML string.
   * @type {xrx.reader}
   * @private
   */
  this.reader_ = new xrx.reader(xml);

  /**
   * Whether the stream is stopped. Set by stop() and cleared by
   * forward() and backward() when they start and when they return.
   * @type {boolean}
   * @private
   */
  this.stopped_ = false;
};



/**
 * Event, fired whenever a start-tag row is found. forward() passes
 * (offset, tagLength, rowLength), backward() passes (offset, tagLength).
 */
xrx.stream.prototype.rowStartTag = goog.abstractMethod;



/**
 * Event, fired whenever an end-tag row is found. forward() passes
 * (offset, tagLength, rowLength), backward() passes (offset, tagLength).
 */
xrx.stream.prototype.rowEndTag = goog.abstractMethod;



/**
 * Event, fired whenever an empty-tag row is found. forward() passes
 * (offset, tagLength, rowLength), backward() passes (offset, tagLength).
 */
xrx.stream.prototype.rowEmptyTag = goog.abstractMethod;



/**
 * Event, fired whenever a namespace declaration is found. Receives
 * (offset, length) of the declaring attribute, the offset being
 * relative to the streamed XML string.
 */
xrx.stream.prototype.namespace = goog.abstractMethod;



/**
 * Enumeration of internal states used by the streamer.
 * @enum {number}
 * @private
 */
xrx.stream.State_ = {
  XML_START: 1,
  XML_END: 2,
  START_TAG: 3,
  END_TAG: 4,
  EMPTY_TAG: 5,
  NOT_TAG: 6,
  LT_SEEN: 7,
  GT_SEEN: 8,
  WS_SEEN: 9,
  TAG_START: 10,
  TAG_NAME: 11,
  TOK_END: 12,
  ATTR_NAME: 13,
  ATTR_VAL: 14
};



/**
 * Returns or sets the content of the current stream reader.
 * A falsy argument (including the empty string) is treated as
 * "no argument", i.e. the call acts as a getter.
 *
 * @param {string=} opt_xml Well-formed, normalized UTF-8 XML string.
 * @return {*} Whatever the reader's input() call returns; without an
 *     argument that is used throughout this class as the XML string.
 */
xrx.stream.prototype.xml = function(opt_xml) {
  return !opt_xml ? this.reader_.input() : this.reader_.input(opt_xml);
};



/**
 * Updates the XML stream at a given location: the characters
 * [offset, offset + length) are replaced by the new string.
 *
 * @param {!number} offset The offset.
 * @param {!number} length Number of characters to replace.
 * @param {!string} xml The new string.
 */
xrx.stream.prototype.update = function(offset, length, xml) {
  var current = this.xml();

  this.reader_.input(current.substr(0, offset) + xml +
      current.substr(offset + length));
};



/**
 * Can be called (typically from within an event handler) to stop
 * streaming. The running forward() or backward() call returns after
 * the current state has been processed.
 */
xrx.stream.prototype.stop = function() {
  this.stopped_ = true;
};



/**
 * Returns or sets the position of the stream reader.
 *
 * @param {number=} opt_pos The position. Position 0 is a valid target.
 * @return {!number} The position or the new position.
 */
xrx.stream.prototype.pos = function(opt_pos) {
  // A plain truthiness test would silently ignore position 0.
  if (opt_pos || opt_pos === 0) this.reader_.set(opt_pos);
  return this.reader_.pos();
};



/**
 * Fires a namespace event for every attribute of a tag whose text
 * starts with 'xmlns'. The tag is cut out of the current XML string.
 *
 * @param {!number} offset Offset of the tag in the XML string.
 * @param {!number} length Length of the tag.
 * @private
 */
xrx.stream.prototype.emitNamespaces_ = function(offset, length) {
  var tag = this.xml().substr(offset, length);

  // Cheap pre-check, so that tags without declarations are not scanned.
  if (!goog.string.contains(tag, 'xmlns')) return;

  var atts = this.attributes(tag);
  for (var pos in atts) {
    var att = atts[pos];
    if (goog.string.startsWith(att.xml(tag), 'xmlns')) {
      // Attribute locations are relative to the tag; make them absolute.
      this.namespace(att.offset + offset, att.length);
    }
  }
};



/**
 * Streams over an XML document or XML fragment in forward direction
 * and fires start-row, end-row, empty row and namespace events.
 * The streaming starts at the beginning of the XML document /
 * fragment by default or optionally at an offset.
 *
 * A row is a tag together with the non-tag content that follows it;
 * the row event is therefore fired once that content has been passed.
 *
 * @param {number=} opt_offset The offset. A falsy value (including 0)
 *     starts at the first position.
 */
xrx.stream.prototype.forward = function(opt_offset) {
  var state = xrx.stream.State_.XML_START;
  var token;
  var offset;
  var length;
  var reader = this.reader_;

  if (!opt_offset) {
    reader.first();
  } else {
    reader.set(opt_offset);
  }
  this.stopped_ = false;

  for (;;) {

    switch (state) {
    // start parsing
    case xrx.stream.State_.XML_START:
      state = reader.get() === '<' ? xrx.stream.State_.LT_SEEN :
          xrx.stream.State_.NOT_TAG;
      break;
    // end parsing / empty tag: never entered in forward direction
    case xrx.stream.State_.XML_END:
    case xrx.stream.State_.EMPTY_TAG:
      break;
    // parse start tag or empty tag
    case xrx.stream.State_.START_TAG:
      offset = reader.pos();
      reader.forwardInclusive('>');
      state = xrx.stream.State_.NOT_TAG;
      token = reader.peek(-2) === '/' ? xrx.token.EMPTY_TAG :
          xrx.token.START_TAG;
      length = reader.pos() - offset;
      break;
    // parse end tag
    case xrx.stream.State_.END_TAG:
      offset = reader.pos();
      reader.forwardInclusive('>');
      state = xrx.stream.State_.NOT_TAG;
      token = xrx.token.END_TAG;
      length = reader.pos() - offset;
      break;
    // parse token that is not a tag
    case xrx.stream.State_.NOT_TAG:
      if (!reader.get()) {
        state = xrx.stream.State_.XML_END;
      } else if (reader.peek() === '<') {
        state = xrx.stream.State_.LT_SEEN;
      } else {
        reader.forwardExclusive('<');
        state = xrx.stream.State_.LT_SEEN;
      }
      // If we have parsed the not-tag, the row is complete. Namespace
      // declarations are only looked up for start and empty tags: end
      // tags carry no attributes, and if streaming started inside text
      // no tag has been read yet (token, offset and length are still
      // undefined), so there is nothing to scan.
      switch (token) {
      case xrx.token.START_TAG:
        this.rowStartTag(offset, length, reader.pos() - offset);
        this.emitNamespaces_(offset, length);
        break;
      case xrx.token.END_TAG:
        this.rowEndTag(offset, length, reader.pos() - offset);
        break;
      case xrx.token.EMPTY_TAG:
        this.rowEmptyTag(offset, length, reader.pos() - offset);
        this.emitNamespaces_(offset, length);
        break;
      default:
        break;
      }
      break;
    // '<' seen: start tag or empty tag or end tag?
    case xrx.stream.State_.LT_SEEN:
      if (reader.peek(1) === '/') {
        state = xrx.stream.State_.END_TAG;
      } else {
        state = xrx.stream.State_.START_TAG;
      }
      break;
    default:
      throw new Error('Invalid parser state.');
    }

    if (state === xrx.stream.State_.XML_END || this.stopped_) {
      this.stopped_ = false;
      break;
    }
  }
};



/**
 * Streams over an XML document or XML fragment in backward direction
 * and fires start-row, end-row, empty row and namespace events. The
 * streaming starts at the end of the XML document / fragment by
 * default or optionally at an offset.
 * TODO(jochen): do we need length2 in backward streaming events?
 *
 * @param {number=} opt_offset The offset. A falsy value (including 0)
 *     starts at the last position.
 */
xrx.stream.prototype.backward = function(opt_offset) {
  var state = xrx.stream.State_.XML_START;
  var reader = this.reader_;
  var offset;
  var start;
  var tagLength;

  if (!opt_offset) {
    reader.last();
  } else {
    reader.set(opt_offset);
  }
  this.stopped_ = false;

  for (;;) {

    switch (state) {
    // start parsing
    case xrx.stream.State_.XML_START:
      if (reader.get() === '<') reader.previous();
      state = reader.get() === '>' ? xrx.stream.State_.GT_SEEN :
          xrx.stream.State_.NOT_TAG;
      break;
    // end parsing / start tag: never entered in backward direction
    case xrx.stream.State_.XML_END:
    case xrx.stream.State_.START_TAG:
      break;
    // parse end tag or start tag
    case xrx.stream.State_.END_TAG:
      offset = reader.pos();
      reader.backwardInclusive('<');
      state = xrx.stream.State_.NOT_TAG;
      start = reader.pos();
      tagLength = offset - start + 1;
      if (reader.peek(1) !== '/') {
        this.rowStartTag(start, tagLength);
        // are there any namespace declarations?
        this.emitNamespaces_(start, tagLength);
      } else {
        this.rowEndTag(start, tagLength);
      }
      reader.previous();
      if (reader.finished()) state = xrx.stream.State_.XML_END;
      break;
    // parse empty tag
    case xrx.stream.State_.EMPTY_TAG:
      offset = reader.pos();
      reader.backwardInclusive('<');
      state = xrx.stream.State_.NOT_TAG;
      start = reader.pos();
      tagLength = offset - start + 1;
      this.rowEmptyTag(start, tagLength);
      // are there any namespace declarations?
      this.emitNamespaces_(start, tagLength);
      reader.previous();
      if (reader.finished()) state = xrx.stream.State_.XML_END;
      break;
    // parse token that is not a tag; no event is fired for it
    case xrx.stream.State_.NOT_TAG:
      if (reader.get() !== '>') {
        reader.backwardExclusive('>');
        reader.previous();
      }
      state = xrx.stream.State_.GT_SEEN;
      if (reader.finished()) state = xrx.stream.State_.XML_END;
      break;
    // '>' seen: end tag or start tag or empty tag?
    case xrx.stream.State_.GT_SEEN:
      if (reader.peek(-1) === '/') {
        state = xrx.stream.State_.EMPTY_TAG;
      } else {
        state = xrx.stream.State_.END_TAG;
      }
      break;
    default:
      throw new Error('Invalid parser state.');
    }

    if (state === xrx.stream.State_.XML_END || this.stopped_) {
      this.stopped_ = false;
      break;
    }
  }
};



/**
 * Streams over a start-tag, an empty tag or an end-tag and
 * returns the location of the name of the tag.
 *
 * This method does not touch the stop flag of the stream, so it is
 * safe to call it from an event handler after stop().
 *
 * @param {!string} xml The tag.
 * @param {xrx.reader=} opt_reader Optional reader object. If given,
 *     it is used instead of a fresh reader over xml and is left
 *     positioned behind the tag name.
 * @return {!xrx.location} The location of the tag-name.
 */
xrx.stream.prototype.tagName = function(xml, opt_reader) {
  var state = xrx.stream.State_.TAG_START;
  var offset;
  var length;
  var reader = opt_reader || new xrx.reader(xml);

  for (;;) {

    switch (state) {
    case xrx.stream.State_.TAG_START:
      if (reader.next() !== '<') {
        throw new Error('< is expected.');
      }
      state = xrx.stream.State_.TAG_NAME;
      // skip the slash of an end-tag
      if (reader.get() === '/') reader.next();
      offset = reader.pos();
      break;
    case xrx.stream.State_.TAG_NAME:
      // the name ends at the first space, slash or closing bracket
      if (reader.next().match(/( |\/|>)/g)) {
        state = xrx.stream.State_.TOK_END;
        reader.backward();
        length = reader.pos() - offset - 1;
      }
      break;
    default:
      throw new Error('Invalid parser state.');
    }

    if (state === xrx.stream.State_.TOK_END) break;
  }

  return new xrx.location(offset, length);
};



/**
 * Streams over a start-tag or an empty tag and returns the location
 * of the n'th attribute, or null if the attribute does not exist.
 *
 * @param {!string} xml The start-tag or empty tag.
 * @param {!number} pos The attribute position (attributes() counts
 *     from 1).
 * @param {number=} opt_offset Optional reader offset, passed through
 *     to the shared attribute scanner.
 * @return {xrx.location} The location of the attribute at position n
 *     or null.
 */
xrx.stream.prototype.attribute = function(xml, pos, opt_offset) {
  return this.attr_(xml, pos, xrx.token.ATTRIBUTE, opt_offset);
};



/**
 * Streams over a start-tag or an empty tag and returns the locations
 * of all attributes found in the tag.
 *
 * @param {!string} xml The start-tag or empty tag.
 * @return {!Object} A plain object that maps the 1-based attribute
 *     position to the attribute's location. The object is empty if the
 *     tag has no attributes.
 */
xrx.stream.prototype.attributes = function(xml) {
  var locs = {};
  // NOTE(review): this location is never updated inside the loop, so
  // the offset handed to attribute() is the same for every attribute.
  // It looks like a resume-from-last-attribute optimization that was
  // never finished; behavior is kept as is. TODO confirm.
  var location = new xrx.location();

  for (var i = 1;; i++) {
    var newLocation = this.attribute(xml, i,
        location.offset + location.length);
    if (!newLocation) break;

    locs[i] = newLocation;
  }

  return locs;
};



/**
 * Streams over a start-tag or empty tag and returns the location
 * of the name of the n'th attribute.
 *
 * @param {!string} xml The tag.
 * @param {!number} pos The attribute position.
 * @return {xrx.location} The location of the attribute name, or null
 *     if the tag has no attributes.
 */
xrx.stream.prototype.attrName = function(xml, pos) {
  return this.attr_(xml, pos, xrx.token.ATTR_NAME);
};



/**
 * Streams over a start-tag or empty tag and returns the location
 * of the value of the n'th attribute.
 *
 * @param {!string} xml The tag.
 * @param {!number} pos The attribute position.
 * @return {xrx.location} The attribute value location, or null if the
 *     attribute does not exist.
 */
xrx.stream.prototype.attrValue = function(xml, pos) {
  return this.attr_(xml, pos, xrx.token.ATTR_VALUE);
};



/**
 * Shared utility function for attributes. Walks over the attributes
 * of a tag until the attribute at the requested position is reached
 * and returns the location of the requested part of it.
 *
 * This method does not touch the stop flag of the stream. forward()
 * and backward() call it (via attributes()) right after a row event,
 * and a stop() requested by the row handler must survive that call.
 *
 * @param {!string} xml The start-tag or empty tag.
 * @param {!number} pos The attribute position.
 * @param {*} tokenType One of xrx.token.ATTRIBUTE, xrx.token.ATTR_NAME
 *     or xrx.token.ATTR_VALUE; selects which part is located.
 * @param {number=} opt_offset Optional reader offset. If truthy, the
 *     reader is positioned there and the tag name is not skipped.
 * @param {xrx.reader=} opt_reader Optional reader object.
 * @return {xrx.location} The location or null if not found.
 * @private
 */
xrx.stream.prototype.attr_ = function(xml, pos, tokenType, opt_offset,
    opt_reader) {
  var reader = opt_reader || new xrx.reader(xml);
  if (opt_offset) reader.set(opt_offset);

  // Without an offset the tag name is skipped first; its location
  // object is reused for the result.
  var location = !opt_offset ? this.tagName(xml, reader) :
      new xrx.location();
  // tag does not contain any attributes ? => return null
  if (reader.peek(-1).match(/(\/|>)/g)) return null;

  var state = xrx.stream.State_.ATTR_NAME;
  var offset = reader.pos();
  var found = 0;
  var quote;

  for (;;) {

    switch (state) {
    case xrx.stream.State_.ATTR_NAME:
      found += 1;
      if (tokenType === xrx.token.ATTRIBUTE ||
          tokenType === xrx.token.ATTR_NAME) {
        offset = reader.pos();
      }
      reader.forwardInclusive('=');
      if (tokenType === xrx.token.ATTR_NAME && found === pos) {
        location.offset = offset;
        location.length = reader.pos() - offset - 1;
        state = xrx.stream.State_.TOK_END;
      } else {
        // remember the quote character that delimits the value
        quote = reader.next();
        if (tokenType === xrx.token.ATTR_VALUE) offset = reader.pos();
        state = xrx.stream.State_.ATTR_VAL;
      }
      break;
    case xrx.stream.State_.ATTR_VAL:
      reader.forwardInclusive(quote);
      if (found === pos) {
        location.offset = offset;
        if (tokenType === xrx.token.ATTRIBUTE) {
          location.length = reader.pos() - offset;
        } else if (tokenType === xrx.token.ATTR_VALUE) {
          location.length = reader.pos() - offset - 1;
        }
        state = xrx.stream.State_.TOK_END;
      } else {
        reader.next();
        if (!reader.peek(-1).match(/(\/|>)/g)) {
          state = xrx.stream.State_.ATTR_NAME;
        } else {
          // end of tag reached before the requested position
          state = xrx.stream.State_.TOK_END;
          location = null;
        }
      }
      break;
    default:
      throw new Error('Invalid parser state.');
    }

    if (state === xrx.stream.State_.TOK_END) break;
  }

  return location;
};



/**
 * Streams over some XML content and returns the location of
 * one or more comments. Not implemented yet.
 *
 * @param xml XML string.
 */
xrx.stream.prototype.comment = function(xml) {
  // TODO(jochen)
};



/**
 * Streams over some XML content and returns the location of
 * one or more processing instructions (PI). Not implemented yet.
 *
 * @param xml XML string.
 */
xrx.stream.prototype.pi = function(xml) {
  // TODO(jochen)
};



/**
 * Streams over some XML content and returns the location of
 * one or more character data (CDATA) sections. Not implemented yet.
 *
 * @param xml XML string.
 */
xrx.stream.prototype.cdata = function(xml) {
  // TODO(jochen)
};



/**
 * Streams over some XML content and returns the location of
 * one or more document type declarations. Not implemented yet.
 *
 * @param xml XML string.
 */
xrx.stream.prototype.doctypedecl = function(xml) {
  // TODO(jochen)
};