1 /**
  2  * @fileoverview A class to stream over XML documents or XML
  3  * fragments.
  4  */
  5 
  6 goog.provide('xrx.stream');
  7 
  8 
  9 
 10 goog.require('goog.string');
 11 goog.require('xrx.location');
 12 goog.require('xrx.reader');
 13 goog.require('xrx.token');
 14 
 15 
 16 
 17 /**
 18  * A class to stream over XML documents or XML fragments.
 19  * <br/>
 20  * <br/>
 * <b>IMPORTANT NOTE: This class represents an XML streamer and not an
 *   XML parser! The streamer is different from an XML parser in the
 *   following respects:</b>
 24  * <br/>
 25  * <br/>  
 * <li>the XML input document or fragment must be well-formed before
 *   streaming starts. The streamer itself does not perform any
 *   well-formedness checks
 29  * <li>the streamer expects the XML document or fragment serialized 
 30  *   as a string
 31  * <li>the string must be encoded in UTF-8
 32  * <li>whitespace must already be normalized and collapsed before 
 33  *   streaming starts
 34  * <li>the streamer expects the XML document without any indentation
 35  * <br/><br/>
 * These restrictions are by design: they make it possible to reach optimal
 * performance and full XML support in browsers. For more
 * background about parsing see e.g.:
 39  * <li><a href="http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=4623219">
 40  *   XML Document Parsing: Operational and Performance Characteristics</a>
 41  * <br/>
 42  * <br/>
 43  * <b>String conversion, encoding conversion, whitespace
 44  *   normalization as well as indentation can best be prepared with the 
 45  *   XQuery and XPath 3.0 serialization feature. Example XQuery script:</b>
 46  * <pre>---
 47  *xquery version "3.0";
 48  *
 49  *declare option output:method "xml";
 50  *declare option output:encoding "UTF-8";
 51  *declare option output:indent "no";
 52  *
 53  *declare variable $xml := <someXml/>;
 54  *
 55  *fn:serialize($xml)
 56  *---</pre>
 57  * The output of this XQuery script is exactly what the streamer expects.
 58  * <br/>
 59  * <br/>
 60  * See also: 
 61  * <li><a href="http://www.w3.org/TR/xslt-xquery-serialization-30/">
 62  *   XSLT and XQuery Serialization 3.0</a>
 63  * <li><a href="../../src/agent/xrx2html.xql">XRX++ XQuery Agent</a>
 64  * <li><a href="../../src/agent/xrx2html.xsl">XRX++ XSLT Agent (For
 65  *   development only, only runs in modern browsers with full XML 
 66  *   support)</a>
 67  *   
 68  * @param {!string} xml A well-formed, normalized XML document or
 69  * XML fragment serialized as UTF-8 string.
 70  * @constructor
 71  */
 72 xrx.stream = function(xml) {
 73 
 74 
 75 
 76   /**
 77    * @type
 78    * @private
 79    */
 80   this.reader_ = new xrx.reader(xml);
 81   
 82   
 83 
 84   /**
 85    * Weather the stream is stopped.
 86    * @type {boolean}
 87    * @private
 88    */
 89   this.stopped_ = false;
 90 };
 91 
 92 
 93 
/**
 * Event, fired whenever a start-tag row is found. The slot is initialized
 * with goog.abstractMethod, so a concrete handler has to be assigned before
 * streaming.
 * forward() calls the handler with (offset of the tag, length of the tag,
 * reader position after the following non-tag content minus the offset);
 * backward() calls it with (offset of the tag, length of the tag) only.
 */
xrx.stream.prototype.rowStartTag = goog.abstractMethod;



/**
 * Event, fired whenever an end-tag row is found. The slot is initialized
 * with goog.abstractMethod, so a concrete handler has to be assigned before
 * streaming.
 * forward() calls the handler with (offset of the tag, length of the tag,
 * reader position after the following non-tag content minus the offset);
 * backward() calls it with (offset of the tag, length of the tag) only.
 */
xrx.stream.prototype.rowEndTag = goog.abstractMethod;



/**
 * Event, fired whenever an empty-tag row is found. The slot is initialized
 * with goog.abstractMethod, so a concrete handler has to be assigned before
 * streaming.
 * forward() calls the handler with (offset of the tag, length of the tag,
 * reader position after the following non-tag content minus the offset);
 * backward() calls it with (offset of the tag, length of the tag) only.
 */
xrx.stream.prototype.rowEmptyTag = goog.abstractMethod;



/**
 * Event, fired whenever a namespace declaration is found, i.e. for every
 * attribute of a streamed tag whose text starts with 'xmlns'. The handler
 * is called with (offset of the attribute in the stream, length of the
 * attribute).
 */
xrx.stream.prototype.namespace = goog.abstractMethod;
119 
120 
121 
/**
 * Enumeration of internal states used by the state machines of the
 * streamer (forward(), backward(), tagName() and attr_()).
 * @enum {number}
 * @private
 */
xrx.stream.State_ = {
  // initial state of forward() and backward()
  XML_START: 1,
  // terminates the loop of forward() and backward()
  XML_END: 2,
  // forward(): a start tag or an empty tag is consumed
  START_TAG: 3,
  // forward(): an end tag is consumed; backward(): an end tag or a start tag
  END_TAG: 4,
  // backward(): an empty tag is consumed
  EMPTY_TAG: 5,
  // content between two tags is skipped
  NOT_TAG: 6,
  // forward(): the reader stands on a '<'
  LT_SEEN: 7,
  // backward(): the reader stands on a '>'
  GT_SEEN: 8,
  // NOTE(review): not referenced anywhere in the visible part of this file
  WS_SEEN: 9,
  // initial state of tagName()
  TAG_START: 10,
  // tagName(): the characters of the tag-name are consumed
  TAG_NAME: 11,
  // final state of tagName() and attr_()
  TOK_END: 12,
  // attr_(): an attribute name is consumed
  ATTR_NAME: 13,
  // attr_(): a quoted attribute value is consumed
  ATTR_VAL: 14
};
143 
144 
145 
/**
 * Gets the content of the stream reader or, if an argument is given,
 * replaces it first.
 *
 * Note that the argument is tested for truthiness: any falsy value
 * (including the empty string) makes this call a plain getter.
 *
 * @param opt_xml Well-formed, normalized UTF-8 XML string.
 * @return The content of the stream reader, as handed back by the
 *   reader's input() function.
 */
xrx.stream.prototype.xml = function(opt_xml) {
  if (opt_xml) {
    return this.reader_.input(opt_xml);
  }
  return this.reader_.input();
};
156 
157 
158 
/**
 * Replaces a slice of the XML stream with a new string and hands the
 * resulting string back to the reader.
 *
 * @param {!number} offset Offset at which the replaced slice starts.
 * @param {!number} length Number of characters to replace.
 * @param {!string} xml The string that is inserted instead.
 */
xrx.stream.prototype.update = function(offset, length, xml) {
  var current = this.xml();
  // everything in front of the replaced slice ...
  var head = current.substr(0, offset);
  // ... and everything behind it
  var tail = current.substr(offset + length);

  this.reader_.input(head + xml + tail);
};
171 
172 
173 
/**
 * Requests the running stream to stop. forward() and backward() test the
 * flag after each step of their loop, leave the loop when it is set and
 * clear it again.
 */
xrx.stream.prototype.stop = function() {
  this.stopped_ = true;
};
181 
182 
183 
/**
 * Returns or sets the position of the stream reader.
 *
 * @param opt_pos The position to move the reader to. If omitted, the
 *   position is left untouched.
 * @return {!number} The position or the new position.
 */
xrx.stream.prototype.pos = function(opt_pos) {
  // A bare truthiness test would silently drop a request to move to
  // position 0, so 0 is accepted explicitly. All other falsy values are
  // still treated as "no position given", as before.
  if (opt_pos || opt_pos === 0) {
    this.reader_.set(opt_pos);
  }
  return this.reader_.pos();
};
194 
195 
196 
/**
 * Streams over a XML document or XML fragment in forward direction
 * and fires start-row, end-row, empty row and namespace events.
 * The streaming starts at the beginning of the XML document /
 * fragment by default or optionally at an offset.
 *
 * A row event is fired once the content following a tag has been skipped.
 * Its arguments are: the offset of the tag, the length of the tag and the
 * reader position after the skipped content minus the offset of the tag.
 * For start tags and empty tags that contain the string 'xmlns', a
 * namespace event is fired afterwards for each attribute starting with
 * 'xmlns'.
 *
 * Streaming ends when the reader has no more characters or when stop()
 * was called; the stop flag is cleared on exit.
 *
 * @param {?number} opt_offset The offset. Any falsy value starts at the
 *   beginning.
 * @throws {Error} If the state machine reaches an unknown state.
 */
xrx.stream.prototype.forward = function(opt_offset) {
  var State = xrx.stream.State_;
  var state = State.XML_START;
  var token;
  var offset;
  var length;
  var reader = this.reader_;

  if (!opt_offset) {
    reader.first();
  } else {
    reader.set(opt_offset);
  }
  this.stopped_ = false;

  for (;;) {

    switch (state) {
    // start parsing
    case State.XML_START:
      state = reader.get() === '<' ? State.LT_SEEN : State.NOT_TAG;
      break;
    // end parsing
    case State.XML_END:
      break;
    // parse start tag or empty tag
    case State.START_TAG:
      offset = reader.pos();
      reader.forwardInclusive('>');
      state = State.NOT_TAG;
      token = reader.peek(-2) === '/' ? xrx.token.EMPTY_TAG :
          xrx.token.START_TAG;
      length = reader.pos() - offset;
      break;
    // parse end tag
    case State.END_TAG:
      offset = reader.pos();
      reader.forwardInclusive('>');
      state = State.NOT_TAG;
      token = xrx.token.END_TAG;
      length = reader.pos() - offset;
      break;
    // empty tag (never used)
    case State.EMPTY_TAG:
      break;
    // parse token that is not a tag
    case State.NOT_TAG:
      if (!reader.get()) {
        state = State.XML_END;
      } else if (reader.peek() === '<') {
        state = State.LT_SEEN;
      } else {
        reader.forwardExclusive('<');
        state = State.LT_SEEN;
      }
      // if we have parsed the not-tag, the row is complete.
      switch (token) {
      case xrx.token.START_TAG:
        this.rowStartTag(offset, length, reader.pos() - offset);
        break;
      case xrx.token.END_TAG:
        this.rowEndTag(offset, length, reader.pos() - offset);
        break;
      case xrx.token.EMPTY_TAG:
        this.rowEmptyTag(offset, length, reader.pos() - offset);
        break;
      default:
        break;
      }
      // Are there any namespace declarations? Only start tags and empty
      // tags can carry attributes. Without this guard the scan would also
      // run before the first tag has been parsed (stream starting inside
      // text): offset and length are still undefined then, substr() would
      // return the whole document and the scan would either throw or fire
      // namespace events with a NaN offset.
      if (token === xrx.token.START_TAG || token === xrx.token.EMPTY_TAG) {
        var tag = this.xml().substr(offset, length);
        if (goog.string.contains(tag, 'xmlns')) {
          var atts = this.attributes(tag);
          for (var pos in atts) {
            var att = atts[pos];
            if (goog.string.startsWith(att.xml(tag), 'xmlns')) {
              // att.offset is relative to the tag, the event expects an
              // offset relative to the stream
              this.namespace(att.offset + offset, att.length);
            }
          }
        }
      }
      break;
    // '<' seen: start tag or empty tag or end tag?
    case State.LT_SEEN:
      if (reader.peek(1) === '/') {
        state = State.END_TAG;
      } else {
        state = State.START_TAG;
      }
      break;
    default:
      throw Error('Invalid parser state.');
    }

    if (state === State.XML_END || this.stopped_) {
      this.stopped_ = false;
      break;
    }
  }
};
301 
302 
303 
/**
 * Streams over a XML document or XML fragment in backward direction
 * and fires start-row, end-row, empty row and namespace events. The
 * streaming starts at the end of the XML document / fragment by
 * default or optionally at an offset.
 *
 * Row events are fired with (offset of the tag, length of the tag). After
 * a start-row or empty-row event, a namespace event is fired for each
 * attribute of the tag that starts with 'xmlns'.
 *
 * Streaming ends when the reader reports finished() or when stop() was
 * called; the stop flag is cleared on exit.
 * TODO(jochen): do we need length2 in backward streaming events?
 *
 * @param {?number} opt_offset The offset. Any falsy value starts at the
 *   end.
 * @throws {Error} If the state machine reaches an unknown state.
 */
xrx.stream.prototype.backward = function(opt_offset) {
  var State = xrx.stream.State_;
  var state = State.XML_START;
  var reader = this.reader_;
  // position of the '>' that closes the tag currently parsed
  var offset;
  // position and length of the tag currently parsed
  var tagOffset;
  var tagLength;

  if (!opt_offset) {
    reader.last();
  } else {
    reader.set(opt_offset);
  }
  this.stopped_ = false;

  for (;;) {

    switch (state) {
    // start parsing
    case State.XML_START:
      if (reader.get() === '<') reader.previous();
      state = reader.get() === '>' ? State.GT_SEEN : State.NOT_TAG;
      break;
    // end parsing
    case State.XML_END:
      break;
    // start tag (never used)
    case State.START_TAG:
      break;
    // parse end tag or start tag
    case State.END_TAG:
      offset = reader.pos();
      reader.backwardInclusive('<');
      state = State.NOT_TAG;
      tagOffset = reader.pos();
      tagLength = offset - tagOffset + 1;
      if (reader.peek(1) !== '/') {
        this.rowStartTag(tagOffset, tagLength);
        this.fireNamespaces_(tagOffset, tagLength);
      } else {
        this.rowEndTag(tagOffset, tagLength);
      }
      reader.previous();
      if (reader.finished()) state = State.XML_END;
      break;
    // parse empty tag
    case State.EMPTY_TAG:
      offset = reader.pos();
      reader.backwardInclusive('<');
      state = State.NOT_TAG;
      tagOffset = reader.pos();
      tagLength = offset - tagOffset + 1;
      this.rowEmptyTag(tagOffset, tagLength);
      this.fireNamespaces_(tagOffset, tagLength);
      reader.previous();
      if (reader.finished()) state = State.XML_END;
      break;
    // parse token that is not a tag
    case State.NOT_TAG:
      if (reader.get() !== '>') {
        // content between two tags is skipped, no event is fired for it
        reader.backwardExclusive('>');
        reader.previous();
      }
      state = State.GT_SEEN;
      if (reader.finished()) state = State.XML_END;
      break;
    // '>' seen: end tag or start tag or empty tag?
    case State.GT_SEEN:
      if (reader.peek(-1) === '/') {
        state = State.EMPTY_TAG;
      } else {
        state = State.END_TAG;
      }
      break;
    default:
      throw Error('Invalid parser state.');
    }

    if (state === State.XML_END || this.stopped_) {
      this.stopped_ = false;
      break;
    }
  }
};



/**
 * Fires a namespace event for each attribute of a tag that starts with
 * 'xmlns'. Used by backward() for start tags and empty tags.
 *
 * @param {number} tagOffset Offset of the tag in the stream.
 * @param {number} tagLength Length of the tag.
 * @private
 */
xrx.stream.prototype.fireNamespaces_ = function(tagOffset, tagLength) {
  var tag = this.xml().substr(tagOffset, tagLength);
  // cheap pre-check that avoids the attribute scan for most tags
  if (!goog.string.contains(tag, 'xmlns')) return;

  var atts = this.attributes(tag);
  for (var key in atts) {
    var att = atts[key];
    if (goog.string.startsWith(att.xml(tag), 'xmlns')) {
      // att.offset is relative to the tag, the event expects an offset
      // relative to the stream
      this.namespace(att.offset + tagOffset, att.length);
    }
  }
};
418 
419 
420 
/**
 * Streams over a start-tag, an empty tag or an end-tag and
 * returns the location of the name of the tag.
 *
 * This is a lookup on the given tag only. It deliberately leaves the stop
 * flag of the stream alone: resetting it here would cancel a stop() that
 * an event handler of a running forward() / backward() stream has just
 * requested.
 *
 * @param {!string} xml The tag.
 * @param {?xrx.reader} opt_reader Optional reader object that is used
 *   instead of a new reader over the tag.
 * @return {!xrx.location} The location of the tag-name.
 * @throws {Error} If the first character read is not '<'.
 */
xrx.stream.prototype.tagName = function(xml, opt_reader) {
  var State = xrx.stream.State_;
  var state = State.TAG_START;
  var offset;
  var length;
  var reader = opt_reader || new xrx.reader(xml);

  for (;;) {

    switch (state) {
    case State.TAG_START:
      if (reader.next() !== '<') {
        throw Error('< is expected.');
      }
      state = State.TAG_NAME;
      // end-tag: step over the '/' in front of the name
      if (reader.get() === '/') reader.next();
      offset = reader.pos();
      break;
    case State.TAG_NAME:
      // the name ends at the first blank, '/' or '>'
      if (reader.next().match(/( |\/|>)/g)) {
        state = State.TOK_END;
        reader.backward();
        length = reader.pos() - offset - 1;
      }
      break;
    default:
      throw Error('Invalid parser state.');
    }

    if (state === State.TOK_END) break;
  }

  return new xrx.location(offset, length);
};
466 
467 
468 
/**
 * Looks up the n'th attribute of a start-tag or an empty tag and returns
 * its location, or null if the tag has no such attribute.
 *
 * @param {!string} xml The start-tag or empty tag.
 * @param {!number} pos The attribute position.
 * @param opt_offset Optional offset, passed through to the shared
 *   attribute utility function.
 * @return {xrx.location|null} The location of the attribute at position n
 *   or null.
 */
xrx.stream.prototype.attribute = function(xml, pos, opt_offset) {
  var wanted = xrx.token.ATTRIBUTE;

  return this.attr_(xml, pos, wanted, opt_offset);
};
480 
481 
482 
/**
 * Collects the locations of all attributes of a start-tag or an empty
 * tag.
 *
 * @param {!string} xml The start-tag or empty tag.
 * @return {!Object} The attribute locations, keyed by the attribute
 *   position (starting at 1). The object is empty if no attribute was
 *   found.
 */
xrx.stream.prototype.attributes = function(xml) {
  var found = {};
  var anchor = new xrx.location();
  var index = 1;
  var current;

  // attribute() answers with null as soon as the position runs past the
  // last attribute of the tag
  while ((current = this.attribute(xml, index,
      anchor.offset + anchor.length))) {
    found[index] = current;
    index += 1;
  }

  return found;
};
504 
505 
506 
/**
 * Looks up the n'th attribute of a start-tag or empty tag and returns the
 * location of its name.
 *
 * @param {!string} xml The tag.
 * @param {!number} pos The attribute position.
 * @return {xrx.location|null} The location of the attribute name, as
 *   returned by the shared attribute utility function.
 */
xrx.stream.prototype.attrName = function(xml, pos) {
  var wanted = xrx.token.ATTR_NAME;

  return this.attr_(xml, pos, wanted);
};
518 
519 
520 
/**
 * Looks up the n'th attribute of a start-tag or empty tag and returns the
 * location of its value.
 *
 * @param {!string} xml The tag.
 * @param {!number} pos The attribute position.
 * @return {xrx.location|null} The location of the attribute value, as
 *   returned by the shared attribute utility function.
 */
xrx.stream.prototype.attrValue = function(xml, pos) {
  var wanted = xrx.token.ATTR_VALUE;

  return this.attr_(xml, pos, wanted);
};
532 
533 
/**
 * Shared utility function for attributes. Streams over a start-tag or an
 * empty tag and returns the location of the n'th attribute, of its name or
 * of its value, depending on the token type.
 *
 * This is a lookup on the given tag only and leaves the stop flag of the
 * stream exactly as it found it. It is called while forward() / backward()
 * scan a tag for namespace declarations, so clearing the flag here would
 * cancel a stop() that an event handler has just requested.
 *
 * @param {!string} xml The start-tag or empty tag.
 * @param {!number} pos The attribute position; the first attribute has
 *   position 1.
 * @param tokenType One of xrx.token.ATTRIBUTE, xrx.token.ATTR_NAME or
 *   xrx.token.ATTR_VALUE.
 * @param opt_offset Optional reader position to start at. If it is truthy,
 *   the tag-name is not skipped first.
 * @param {?xrx.reader} opt_reader Optional reader object that is used
 *   instead of a new reader over the tag.
 * @return {xrx.location|null} The requested location or null if the tag
 *   has no attribute at the given position.
 * @throws {Error} If the state machine reaches an unknown state.
 * @private
 */
xrx.stream.prototype.attr_ = function(xml, pos, tokenType, opt_offset, opt_reader) {
  var State = xrx.stream.State_;
  var reader = opt_reader || new xrx.reader(xml);
  if (opt_offset) reader.set(opt_offset);

  // Put the stop flag back after the nested tag-name scan, whatever that
  // scan does to it.
  var stopRequested = this.stopped_;
  var location = !opt_offset ? this.tagName(xml, reader) : new xrx.location();
  this.stopped_ = stopRequested;

  // tag does not contain any attributes ? => return null
  if (reader.peek(-1).match(/(\/|>)/g)) return null;

  var state = State.ATTR_NAME;
  var offset = reader.pos();
  // number of attributes seen so far
  var found = 0;
  // the quote character that delimits the current attribute value
  var quote;

  for (;;) {

    switch (state) {
    case State.ATTR_NAME:
      found += 1;
      // a whole attribute and an attribute name both start here
      if (tokenType === xrx.token.ATTRIBUTE ||
          tokenType === xrx.token.ATTR_NAME) {
        offset = reader.pos();
      }
      reader.forwardInclusive('=');
      if (tokenType === xrx.token.ATTR_NAME && found === pos) {
        location.offset = offset;
        location.length = reader.pos() - offset - 1;
        state = State.TOK_END;
      } else {
        quote = reader.next();
        // an attribute value starts behind the opening quote
        if (tokenType === xrx.token.ATTR_VALUE) offset = reader.pos();
        state = State.ATTR_VAL;
      }
      break;
    case State.ATTR_VAL:
      reader.forwardInclusive(quote);
      if (found === pos) {
        location.offset = offset;
        if (tokenType === xrx.token.ATTRIBUTE) {
          location.length = reader.pos() - offset;
        } else if (tokenType === xrx.token.ATTR_VALUE) {
          location.length = reader.pos() - offset - 1;
        }
        state = State.TOK_END;
      } else {
        reader.next();
        if (!reader.peek(-1).match(/(\/|>)/g)) {
          state = State.ATTR_NAME;
        } else {
          // end of the tag reached before position n: no such attribute
          state = State.TOK_END;
          location = null;
        }
      }
      break;
    default:
      throw Error('Invalid parser state.');
    }

    if (state === State.TOK_END) break;
  }
  return location;
};
602 
603 
604 
/**
 * Streams over some XML content and returns the location of
 * one or more comments.
 * Not implemented yet: the function body is empty.
 *
 * @param xml XML string.
 */
xrx.stream.prototype.comment = function(xml) {
  // TODO(jochen)
};
612 
613 
614 
/**
 * Streams over some XML content and returns the location of
 * one or more processing instructions (PI).
 * Not implemented yet: the function body is empty.
 *
 * @param xml XML string.
 */
xrx.stream.prototype.pi = function(xml) {
  // TODO(jochen)
};
624 
625 
626 
/**
 * Streams over some XML content and returns the location of
 * one or more character data (CDATA) sections.
 * Not implemented yet: the function body is empty.
 *
 * @param xml XML string.
 */
xrx.stream.prototype.cdata = function(xml) {
  // TODO(jochen)
};
636 
637 
638 
/**
 * Streams over some XML content and returns the location of
 * one or more document type declarations.
 * Not implemented yet: the function body is empty.
 *
 * @param xml XML string.
 */
xrx.stream.prototype.doctypedecl = function(xml) {
  // TODO(jochen)
};
648