/**
 * @fileoverview A class to stream over XML tokens.
 */

goog.provide('xrx.stream');



goog.require('xrx.reader');
goog.require('xrx.token');



/**
 * Streams over the tags of an XML string and reports each tag through the
 * abstract row* callbacks.
 *
 * @param {string} xml The XML input; it is handed to a new xrx.reader.
 * @constructor
 */
xrx.stream = function(xml) {

  /**
   * The reader that walks over the XML input.
   * @private
   */
  this.reader_ = new xrx.reader(xml);

  /**
   * Set by stop(); checked after every step of forward() and backward().
   * @type {boolean}
   * @private
   */
  this.stopped_ = false;
};



/**
 * Callback for a start tag. forward() calls it with (offset, length, span)
 * where span is the reader position after the following non-tag content
 * minus offset; backward() calls it with (offset, length) only.
 */
xrx.stream.prototype.rowStartTag = goog.abstractMethod;



/**
 * Callback for an end tag. Receives the same arguments as rowStartTag.
 */
xrx.stream.prototype.rowEndTag = goog.abstractMethod;



/**
 * Callback for an empty tag. Receives the same arguments as rowStartTag.
 */
xrx.stream.prototype.rowEmptyTag = goog.abstractMethod;



/**
 * States of the parsers in this file. WS_SEEN is not used in this file.
 * @enum {number}
 */
xrx.stream.State = {
  XML_START: 1,
  XML_END: 2,
  START_TAG: 3,
  END_TAG: 4,
  EMPTY_TAG: 5,
  NOT_TAG: 6,
  LT_SEEN: 7,
  GT_SEEN: 8,
  WS_SEEN: 9,
  TAG_START: 10,
  TAG_NAME: 11,
  TOK_END: 12,
  ATTR_NAME: 13,
  ATTR_VAL: 14
};



/**
 * Gets or sets the XML input of the underlying reader.
 *
 * @param {string=} xml New input. When falsy (including the empty string)
 *     the reader's input is only read, not replaced.
 * @return {*} Whatever reader.input() returns.
 */
xrx.stream.prototype.xml = function(xml) {
  if (!xml) {
    return this.reader_.input();
  }
  return this.reader_.input(xml);
};



/**
 * Replaces the part of the input covered by a token with new XML.
 *
 * @param {!Object} token Token providing offset() and length().
 * @param {string} xml The replacement string.
 */
xrx.stream.prototype.update = function(token, xml) {
  var current = this.xml();
  var start = token.offset();
  var end = start + token.length();

  this.reader_.input(current.substr(0, start) + xml + current.substr(end));
};



/**
 * Requests that a running forward() or backward() loop ends after the
 * current step.
 */
xrx.stream.prototype.stop = function() {
  this.stopped_ = true;
};



/**
 * Clears the stop flag. forward() and backward() call this when their loop
 * ends. NOTE: despite its name this does not report the flag and returns
 * nothing.
 */
xrx.stream.prototype.stopped = function() {
  this.stopped_ = false;
};



/**
 * Moves the reader to a position.
 *
 * @param {number} pos The position handed to reader.set().
 */
xrx.stream.prototype.set = function(pos) {
  this.reader_.set(pos);
};



/**
 * @return {number} The current position of the reader.
 */
xrx.stream.prototype.pos = function() {
  return this.reader_.pos();
};



/**
 * Streams over the input in forward direction and fires rowStartTag,
 * rowEndTag or rowEmptyTag for each tag. A tag is reported only after the
 * non-tag content that follows it has been skipped, so that the callback
 * also receives the span up to the next '<'.
 *
 * @param {number=} startAt Position to start at. When undefined the reader
 *     is moved with reader.first().
 */
xrx.stream.prototype.forward = function(startAt) {
  var State = xrx.stream.State;
  var reader = this.reader_;
  var state = State.XML_START;
  var token;
  var offset;
  var length;

  if (startAt === undefined) {
    reader.first();
  } else {
    reader.set(startAt);
  }
  this.stopped_ = false;

  for (;;) {

    switch (state) {
    // start parsing
    case State.XML_START:
      state = reader.get() === '<' ? State.LT_SEEN : State.NOT_TAG;
      break;
    // end parsing (never entered: the loop exits before)
    case State.XML_END:
      break;
    // parse start tag or empty tag
    case State.START_TAG:
      offset = reader.pos();
      reader.forwardInclusive('>');
      state = State.NOT_TAG;
      // presumably peek(-2) is the character right before the '>' consumed
      // above; a '/' there means an empty tag
      token = reader.peek(-2) === '/' ?
          xrx.token.EMPTY_TAG : xrx.token.START_TAG;
      length = reader.pos() - offset;
      break;
    // parse end tag
    case State.END_TAG:
      offset = reader.pos();
      reader.forwardInclusive('>');
      state = State.NOT_TAG;
      token = xrx.token.END_TAG;
      length = reader.pos() - offset;
      break;
    // empty tag (never used)
    case State.EMPTY_TAG:
      break;
    // skip the token that is not a tag, then report the preceding tag
    case State.NOT_TAG:
      if (!reader.get()) {
        state = State.XML_END;
      } else {
        if (reader.peek() !== '<') {
          reader.forwardExclusive('<');
        }
        state = State.LT_SEEN;
      }
      // token is still undefined when the input starts with non-tag
      // content; nothing is reported in that case
      switch (token) {
      case xrx.token.START_TAG:
        this.rowStartTag(offset, length, reader.pos() - offset);
        break;
      case xrx.token.END_TAG:
        this.rowEndTag(offset, length, reader.pos() - offset);
        break;
      case xrx.token.EMPTY_TAG:
        this.rowEmptyTag(offset, length, reader.pos() - offset);
        break;
      default:
        break;
      }
      break;
    // '<' seen: start tag or empty tag or end tag?
    case State.LT_SEEN:
      state = reader.peek(1) === '/' ? State.END_TAG : State.START_TAG;
      break;
    default:
      throw new Error('Invalid parser state.');
    }

    if (state === State.XML_END || this.stopped_) {
      this.stopped();
      break;
    }
  }
};



/**
 * Streams over the input in backward direction and fires rowStartTag,
 * rowEndTag or rowEmptyTag with (offset, length) for each tag. Non-tag
 * content is skipped without a callback.
 *
 * @param {number=} startAt Position to start at. When undefined the reader
 *     is moved with reader.last().
 */
xrx.stream.prototype.backward = function(startAt) {
  var State = xrx.stream.State;
  var reader = this.reader_;
  var state = State.XML_START;
  var offset;

  if (startAt === undefined) {
    reader.last();
  } else {
    reader.set(startAt);
  }
  this.stopped_ = false;

  for (;;) {

    switch (state) {
    // start parsing
    case State.XML_START:
      if (reader.get() === '<') {
        reader.previous();
      }
      state = reader.get() === '>' ? State.GT_SEEN : State.NOT_TAG;
      break;
    // end parsing (never entered: the loop exits before)
    case State.XML_END:
      break;
    // start tag (never used)
    case State.START_TAG:
      break;
    // parse end tag or start tag
    case State.END_TAG:
      offset = reader.pos();
      reader.backwardInclusive('<');
      state = State.NOT_TAG;
      if (reader.peek(1) !== '/') {
        this.rowStartTag(reader.pos(), offset - reader.pos() + 1);
      } else {
        this.rowEndTag(reader.pos(), offset - reader.pos() + 1);
      }
      reader.previous();
      if (reader.finished()) {
        state = State.XML_END;
      }
      break;
    // parse empty tag
    case State.EMPTY_TAG:
      offset = reader.pos();
      reader.backwardInclusive('<');
      state = State.NOT_TAG;
      this.rowEmptyTag(reader.pos(), offset - reader.pos() + 1);
      reader.previous();
      if (reader.finished()) {
        state = State.XML_END;
      }
      break;
    // skip token that is not a tag; it is not reported
    case State.NOT_TAG:
      if (reader.get() !== '>') {
        reader.backwardExclusive('>');
        reader.previous();
      }
      state = State.GT_SEEN;
      if (reader.finished()) {
        state = State.XML_END;
      }
      break;
    // '>' seen: end tag or start tag or empty tag?
    case State.GT_SEEN:
      state = reader.peek(-1) === '/' ? State.EMPTY_TAG : State.END_TAG;
      break;
    default:
      throw new Error('Invalid parser state.');
    }

    if (state === State.XML_END || this.stopped_) {
      this.stopped();
      break;
    }
  }
};



/**
 * Locates the name of a tag.
 *
 * @param {!Object} tag Tag token providing offset() and label().
 * @return {!xrx.token.TagName} A tag-name token carrying the tag's label,
 *     with offset and length set to the name.
 * @throws {Error} If the character at the tag's offset is not '<'.
 */
xrx.stream.prototype.tagName = function(tag) {
  var State = xrx.stream.State;
  var reader = this.reader_;
  var state = State.TAG_START;
  var tagName = new xrx.token.TagName(tag.label());

  reader.set(tag.offset());
  this.stopped_ = false;

  for (;;) {

    switch (state) {
    case State.TAG_START:
      if (reader.next() !== '<') {
        throw new Error('< is expected.');
      }
      state = State.TAG_NAME;
      // step over the '/' of an end tag
      if (reader.get() === '/') {
        reader.next();
      }
      tagName.offset(reader.pos());
      break;
    case State.TAG_NAME:
      // A name ends at XML white-space (space, tab, CR, LF), '/' or '>'.
      if (reader.next().match(/[ \t\r\n\/>]/)) {
        state = State.TOK_END;
        // the reader already moved past the terminating character
        tagName.length(reader.pos() - tagName.offset() - 1);
        reader.backward();
      }
      break;
    default:
      throw new Error('Invalid parser state.');
    }

    if (state === State.TOK_END) break;
  }
  return tagName;
};



/**
 * Locates a whole attribute (name and value) of a tag.
 *
 * @param {!Object} tag The tag token.
 * @param {!Object} attribute The attribute token to fill in.
 * @param {number=} opt_offset Accepted but not forwarded to attr_; the
 *     search always starts at the beginning of the tag.
 * @return {Object} The token with offset and length set, or null.
 */
xrx.stream.prototype.attribute = function(tag, attribute, opt_offset) {
  return this.attr_(tag, attribute);
};



/**
 * Locates the name of an attribute of a tag.
 *
 * @param {!Object} tag The tag token.
 * @param {!Object} attrName The attribute-name token to fill in.
 * @param {number=} opt_offset Accepted but not forwarded to attr_; the
 *     search always starts at the beginning of the tag.
 * @return {Object} The token with offset and length set, or null.
 */
xrx.stream.prototype.attrName = function(tag, attrName, opt_offset) {
  return this.attr_(tag, attrName);
};



/**
 * Locates the value of an attribute of a tag.
 *
 * @param {!Object} tag The tag token.
 * @param {!Object} attrValue The attribute-value token to fill in.
 * @param {number=} opt_offset Accepted but not forwarded to attr_; the
 *     search always starts at the beginning of the tag.
 * @return {Object} The token with offset and length set, or null.
 */
xrx.stream.prototype.attrValue = function(tag, attrValue, opt_offset) {
  return this.attr_(tag, attrValue);
};



/**
 * Shared implementation of attribute(), attrName() and attrValue(). Walks
 * over the attributes of a tag, counting them, until the count equals
 * attr.label().last(); what is measured depends on attr.type().
 *
 * @param {!Object} tag The tag token.
 * @param {!Object} attr Token providing label(), type(), offset() and
 *     length(); it is modified in place.
 * @param {number=} opt_offset Position to start at instead of the tag's
 *     offset. When given, the tag name is not skipped first. The public
 *     wrappers in this file never pass it.
 * @return {Object} attr with offset and length set, or null when the tag
 *     has no attributes or the requested attribute was not found.
 * @private
 */
xrx.stream.prototype.attr_ = function(tag, attr, opt_offset) {
  var State = xrx.stream.State;
  var pos = attr.label().last();
  var tokenType = attr.type();
  var reader = this.reader_;
  reader.set(opt_offset || tag.offset());
  this.stopped_ = false;

  // called only for its side effect: it leaves the reader behind the name
  if (!opt_offset) {
    this.tagName(tag);
  }
  // tag does not contain any attributes ? => return null
  if (reader.peek(-1).match(/(\/|>)/g)) return null;

  var state = State.ATTR_NAME;
  var offset = reader.pos();
  var found = 0;
  var quote;

  for (;;) {

    switch (state) {
    case State.ATTR_NAME:
      found += 1;
      if (tokenType === xrx.token.ATTRIBUTE ||
          tokenType === xrx.token.ATTR_NAME) {
        offset = reader.pos();
      }
      reader.forwardInclusive('=');
      if (tokenType === xrx.token.ATTR_NAME && found === pos) {
        attr.offset(offset);
        // exclude the '='
        attr.length(reader.pos() - offset - 1);
        state = State.TOK_END;
      } else {
        // remember the quote character so the value's end can be found
        quote = reader.next();
        if (tokenType === xrx.token.ATTR_VALUE) {
          offset = reader.pos();
        }
        state = State.ATTR_VAL;
      }
      break;
    case State.ATTR_VAL:
      reader.forwardInclusive(quote);
      if (found === pos) {
        attr.offset(offset);
        if (tokenType === xrx.token.ATTRIBUTE) {
          attr.length(reader.pos() - offset);
        } else if (tokenType === xrx.token.ATTR_VALUE) {
          // exclude the closing quote
          attr.length(reader.pos() - offset - 1);
        }
        state = State.TOK_END;
      } else {
        reader.next();
        // end of the tag reached without finding the attribute
        if (reader.peek(-1).match(/(\/|>)/g)) {
          return null;
        }
        state = State.ATTR_NAME;
      }
      break;
    default:
      throw new Error('Invalid parser state.');
    }

    if (state === State.TOK_END) break;
  }
  return attr;
};



/**
 * Collects all attributes of a tag.
 *
 * @param {!Object} tag The tag token.
 * @return {!Array} One new xrx.token.Attribute per attribute found, each
 *     with a cloned label and its own offset and length.
 */
xrx.stream.prototype.attributes = function(tag) {
  var attr = new xrx.token.Attribute(tag.label().clone());
  attr.label().child();
  var atts = [];

  for (;;) {
    // the third argument is ignored by attribute() in this file
    var tmp = this.attribute(tag, attr, attr.offset() + attr.length());
    if (!tmp) break;

    // copy, because attr is reused for the next sibling
    var newAttr = new xrx.token.Attribute(tmp.label().clone());
    newAttr.offset(tmp.offset());
    newAttr.length(tmp.length());
    atts.push(newAttr);

    attr.label().nextSibling();
  }

  return atts;
};