1 /**
  2  * @fileoverview A class to stream over XML tokens.
  3  */
  4 
  5 goog.provide('xrx.stream');
  6 
  7 
  8 
  9 goog.require('xrx.reader');
 10 goog.require('xrx.token');
 11 
 12 
 13 
 14 xrx.stream = function(xml) {
 15 
 16 
 17 
 18   this.reader_ = new xrx.reader(xml);
 19   
 20   
 21   
 22   this.stopped_ = false;
 23 };
 24 
 25 
 26 
 27 xrx.stream.prototype.rowStartTag = goog.abstractMethod;
 28 
 29 
 30 
 31 xrx.stream.prototype.rowEndTag = goog.abstractMethod;
 32 
 33 
 34 
 35 xrx.stream.prototype.rowEmptyTag = goog.abstractMethod;
 36 
 37 
 38 
 39 xrx.stream.State = {
 40   XML_START: 1,
 41   XML_END: 2,
 42   START_TAG: 3,
 43   END_TAG: 4,
 44   EMPTY_TAG: 5,
 45   NOT_TAG: 6,
 46   LT_SEEN: 7,
 47   GT_SEEN: 8,
 48   WS_SEEN: 9,
 49   TAG_START: 10,
 50   TAG_NAME: 11,
 51   TOK_END: 12,
 52   ATTR_NAME: 13,
 53   ATTR_VAL: 14
 54 };
 55 
 56 
 57 
 58 xrx.stream.prototype.xml = function(xml) {
 59   
 60   return !xml ? this.reader_.input() : this.reader_.input(xml);
 61 };
 62 
 63 
 64 
 65 xrx.stream.prototype.update = function(token, xml) {
 66   
 67   this.reader_.input(this.xml().substr(0, token.offset()) + 
 68       xml + this.xml().substr(token.offset() + token.length()));
 69 };
 70 
 71 
 72 
 73 xrx.stream.prototype.stop = function() {
 74 
 75   this.stopped_ = true;
 76 };
 77 
 78 
 79 
 80 xrx.stream.prototype.stopped = function() {
 81 
 82   this.stopped_ = false;
 83 };
 84 
 85 
 86 
 87 xrx.stream.prototype.set = function(pos) {
 88   this.reader_.set(pos);
 89 };
 90 
 91 
 92 xrx.stream.prototype.pos = function() {
 93   return this.reader_.pos();
 94 };
 95 
 96 
 97 xrx.stream.prototype.forward = function(startAt) {
 98   var state = xrx.stream.State.XML_START;
 99   var token;
100   var offset;
101   var length;
102   var reader = this.reader_;
103 
104   startAt === undefined ? reader.first() : reader.set(startAt);
105   this.stopped_ = false;
106 
107   for (;;) {
108 
109     switch (state) {
110     // start parsing
111     case xrx.stream.State.XML_START:
112       reader.get() === '<' ? state = xrx.stream.State.LT_SEEN :
113           state = xrx.stream.State.NOT_TAG;
114       break;
115     // end parsing
116     case xrx.stream.State.XML_END:
117       break;
118     // parse start tag or empty tag
119     case xrx.stream.State.START_TAG:
120       offset = reader.pos();
121       reader.forwardInclusive('>');
122       state = xrx.stream.State.NOT_TAG;
123       reader.peek(-2) === '/' ? token = xrx.token.EMPTY_TAG : 
124           token = xrx.token.START_TAG;
125       length = reader.pos() - offset;
126       break;
127     // parse end tag
128     case xrx.stream.State.END_TAG:
129       offset = reader.pos();
130       reader.forwardInclusive('>');
131       state = xrx.stream.State.NOT_TAG;
132       token = xrx.token.END_TAG;
133       length = reader.pos() - offset;
134       break;
135     // empty tag (never used)
136     case xrx.stream.State.EMPTY_TAG:
137       break;
138     // parse token that is not a tag
139     case xrx.stream.State.NOT_TAG:
140       if (!reader.get()) {
141         state = xrx.stream.State.XML_END;
142       } else if (reader.peek() === '<') {
143         state = xrx.stream.State.LT_SEEN;
144       } else {
145         reader.forwardExclusive('<');
146         state = xrx.stream.State.LT_SEEN;
147       }
148       switch(token) {
149       case xrx.token.START_TAG:
150         this.rowStartTag(offset, length, reader.pos() - offset);
151         break;
152       case xrx.token.END_TAG:
153         this.rowEndTag(offset, length, reader.pos() - offset);
154         break;
155       case xrx.token.EMPTY_TAG:
156         this.rowEmptyTag(offset, length, reader.pos() - offset);
157         break;
158       default:
159         break;
160       };
161       break;
162     // '<' seen: start tag or empty tag or end tag?
163     case xrx.stream.State.LT_SEEN:
164       if (reader.peek(1) === '/') {
165         state = xrx.stream.State.END_TAG;
166       } else {
167         state = xrx.stream.State.START_TAG;
168       }
169       break;
170     default:
171       throw Error('Invalid parser state.');
172       break;
173     }
174 
175     if (state === xrx.stream.State.XML_END || this.stopped_) {
176       this.stopped();
177       break;
178     }
179   }
180 };
181 
182 
183 
/**
 * Streams over the XML input in backward direction and reports every tag to
 * rowStartTag, rowEndTag or rowEmptyTag.
 *
 * Each handler receives (tagOffset, tagLength). Content that is not a tag is
 * skipped without being reported.
 *
 * The traversal ends when reader.finished() becomes true or when stop() is
 * called; in both cases stopped() is invoked before returning.
 *
 * @param {number=} startAt Reader position to start from. When undefined the
 *     reader is moved with reader.last().
 * @throws {Error} If the state machine reaches an unknown state.
 */
xrx.stream.prototype.backward = function(startAt) {
  var State = xrx.stream.State;
  var reader = this.reader_;
  var state = State.XML_START;
  var tagEnd;    // reader position of the '>' that closes the tag
  var tagStart;  // reader position of the '<' that opens the tag

  if (startAt === undefined) {
    reader.last();
  } else {
    reader.set(startAt);
  }
  this.stopped_ = false;

  for (;;) {

    switch (state) {
    // start parsing
    case State.XML_START:
      // When starting on a '<', step back first so that tag is not parsed.
      if (reader.get() === '<') {
        reader.previous();
      }
      state = reader.get() === '>' ? State.GT_SEEN : State.NOT_TAG;
      break;
    // '>' seen: empty tag, or end tag / start tag?
    case State.GT_SEEN:
      state = reader.peek(-1) === '/' ? State.EMPTY_TAG : State.END_TAG;
      break;
    // parse end tag or start tag
    case State.END_TAG:
      tagEnd = reader.pos();
      reader.backwardInclusive('<');
      tagStart = reader.pos();
      // A '/' right after the opening '<' marks an end tag.
      if (reader.peek(1) !== '/') {
        this.rowStartTag(tagStart, tagEnd - tagStart + 1);
      } else {
        this.rowEndTag(tagStart, tagEnd - tagStart + 1);
      }
      reader.previous();
      state = reader.finished() ? State.XML_END : State.NOT_TAG;
      break;
    // parse empty tag
    case State.EMPTY_TAG:
      tagEnd = reader.pos();
      reader.backwardInclusive('<');
      tagStart = reader.pos();
      this.rowEmptyTag(tagStart, tagEnd - tagStart + 1);
      reader.previous();
      state = reader.finished() ? State.XML_END : State.NOT_TAG;
      break;
    // skip content that is not a tag
    case State.NOT_TAG:
      if (reader.get() !== '>') {
        reader.backwardExclusive('>');
        reader.previous();
      }
      state = reader.finished() ? State.XML_END : State.GT_SEEN;
      break;
    // XML_END is handled below; START_TAG is never entered by backward()
    case State.XML_END:
    case State.START_TAG:
      break;
    default:
      throw Error('Invalid parser state.');
    }

    if (state === State.XML_END || this.stopped_) {
      this.stopped();
      break;
    }
  }
};
261 
262 
263 
/**
 * Locates the name of a tag.
 *
 * The reader is moved to the tag, the opening '<' (and the '/' of an end
 * tag) is consumed, and characters are read until the name is terminated by
 * white space, '/' or '>'.
 *
 * @param {!Object} tag Tag token; its offset() must point at the '<' that
 *     opens the tag and its label() is used for the new token.
 * @return {!xrx.token.TagName} Token whose offset and length cover the tag
 *     name.
 * @throws {Error} If the tag does not start with '<', or if the input ends
 *     before the tag name is terminated.
 */
xrx.stream.prototype.tagName = function(tag) {
  var reader = this.reader_;
  var tagName = new xrx.token.TagName(tag.label());
  // A name ends at XML white space (space, tab, CR, LF), '/' or '>'.
  var terminator = /[ \t\r\n\/>]/;
  var ch;

  reader.set(tag.offset());
  this.stopped_ = false;

  if (reader.next() !== '<') {
    throw Error('< is expected.');
  }
  // Skip the '/' of an end tag so the name starts right after it.
  if (reader.get() === '/') {
    reader.next();
  }
  tagName.offset(reader.pos());

  for (;;) {
    ch = reader.next();
    if (!ch) {
      // Without this guard a truncated tag would never terminate the scan.
      throw Error('Unexpected end of input inside a tag.');
    }
    if (terminator.test(ch)) {
      break;
    }
  }

  // The reader now sits one step past the terminator, which is not part of
  // the name.
  tagName.length(reader.pos() - tagName.offset() - 1);
  reader.backward();

  return tagName;
};
302 
303 
/**
 * Locates a whole attribute (name, '=' and quoted value) inside a tag.
 *
 * @param {!Object} tag Tag token to search in.
 * @param {!Object} attribute Attribute token to locate; see attr_().
 * @param {number=} opt_offset Accepted but not forwarded to attr_(); the
 *     search always starts at the beginning of the tag.
 * @return {Object} The located token, or null when attr_() finds none.
 */
xrx.stream.prototype.attribute = function(tag, attribute, opt_offset) {
  var located = this.attr_(tag, attribute);
  return located;
};
307 
308 
309 
/**
 * Locates the name of an attribute inside a tag.
 *
 * @param {!Object} tag Tag token to search in.
 * @param {!Object} attrName Attribute-name token to locate; see attr_().
 * @param {number=} opt_offset Accepted but not forwarded to attr_(); the
 *     search always starts at the beginning of the tag.
 * @return {Object} The located token, or null when attr_() finds none.
 */
xrx.stream.prototype.attrName = function(tag, attrName, opt_offset) {
  var located = this.attr_(tag, attrName);
  return located;
};
313 
314 
315 
/**
 * Locates the value of an attribute inside a tag.
 *
 * @param {!Object} tag Tag token to search in.
 * @param {!Object} attrValue Attribute-value token to locate; see attr_().
 * @param {number=} opt_offset Accepted but not forwarded to attr_(); the
 *     search always starts at the beginning of the tag.
 * @return {Object} The located token, or null when attr_() finds none.
 */
xrx.stream.prototype.attrValue = function(tag, attrValue, opt_offset) {
  var located = this.attr_(tag, attrValue);
  return located;
};
319 
320 
321 
/**
 * Shared implementation behind attribute(), attrName() and attrValue().
 *
 * Attributes are counted from the start position; the one whose count equals
 * attr.label().last() is the wanted one. Depending on attr.type() the token
 * receives the span of the whole attribute, of its name only, or of its
 * value only (without the quotes).
 *
 * @param {!Object} tag Tag token to search in.
 * @param {!Object} attr Token to fill in. Its label() supplies the index of
 *     the wanted attribute and its type() selects which part is measured.
 * @param {number=} opt_offset Reader position to start from. When falsy the
 *     search starts at tag.offset() and the tag name is skipped first.
 * @return {Object} attr with offset and length set, or null when the tag has
 *     no attributes or fewer attributes than the wanted index.
 * @private
 */
xrx.stream.prototype.attr_ = function(tag, attr, opt_offset) {
  var wantedIndex = attr.label().last();
  var kind = attr.type();
  var reader = this.reader_;

  reader.set(opt_offset || tag.offset());
  this.stopped_ = false;

  // Called for its effect on the reader position; the returned token is not
  // needed here.
  if (!opt_offset) {
    this.tagName(tag);
  }

  // tag does not contain any attributes ? => return null
  if (reader.peek(-1).match(/(\/|>)/g)) {
    return null;
  }

  var start = reader.pos();
  var seen = 0;
  var quote;

  for (;;) {
    // --- attribute name ---
    seen += 1;
    if (kind === xrx.token.ATTRIBUTE || kind === xrx.token.ATTR_NAME) {
      start = reader.pos();
    }
    reader.forwardInclusive('=');
    if (kind === xrx.token.ATTR_NAME && seen === wantedIndex) {
      attr.offset(start);
      // exclude the '=' that was just consumed
      attr.length(reader.pos() - start - 1);
      return attr;
    }

    // The character after '=' is the opening quote; the value runs up to
    // the next occurrence of the same character.
    quote = reader.next();
    if (kind === xrx.token.ATTR_VALUE) {
      start = reader.pos();
    }

    // --- attribute value ---
    reader.forwardInclusive(quote);
    if (seen === wantedIndex) {
      attr.offset(start);
      if (kind === xrx.token.ATTRIBUTE) {
        attr.length(reader.pos() - start);
      } else if (kind === xrx.token.ATTR_VALUE) {
        // exclude the closing quote
        attr.length(reader.pos() - start - 1);
      }
      return attr;
    }

    // Not the wanted attribute: step over the following character and stop
    // when the end of the tag has been reached.
    reader.next();
    if (reader.peek(-1).match(/(\/|>)/g)) {
      return null;
    }
  }
};
387 
388 
/**
 * Collects all attributes of a tag.
 *
 * A cursor token is created from a clone of the tag label and moved to its
 * first child. attribute() is then called repeatedly, advancing the cursor
 * label to its next sibling each time, until it returns a falsy value.
 *
 * @param {!Object} tag Tag token whose attributes are wanted.
 * @return {!Array.<!xrx.token.Attribute>} One independent token per
 *     attribute, in document order; empty when the tag has no attributes.
 */
xrx.stream.prototype.attributes = function(tag) {
  var cursor = new xrx.token.Attribute(tag.label().clone());
  cursor.label().child();

  var collected = [];
  var hit;
  var copy;

  while ((hit = this.attribute(tag, cursor,
      cursor.offset() + cursor.length()))) {
    // Copy the result, because the cursor is reused for the next lookup.
    copy = new xrx.token.Attribute(hit.label().clone());
    copy.offset(hit.offset());
    copy.length(hit.length());
    collected.push(copy);

    cursor.label().nextSibling();
  }

  return collected;
};
408 
409