160 lines
		
	
	
		
			4.4 KiB
		
	
	
	
		
			JavaScript
		
	
	
	
	
	
			
		
		
	
	
			160 lines
		
	
	
		
			4.4 KiB
		
	
	
	
		
			JavaScript
		
	
	
	
	
	
| 'use strict';
 | |
| 
 | |
| const unicode = require('../common/unicode');
 | |
| const ERR = require('../common/error-codes');
 | |
| 
 | |
//Aliases
//NOTE: `$` is shorthand for the code point table exposed by the unicode module.
const { CODE_POINTS: $ } = unicode;

//Const
//NOTE: 0x10000 === 1 << 16 === 65536. Initial value of Preprocessor#bufferWaterline, the
//position threshold that dropParsedChunk() compares `pos` against before discarding input.
const DEFAULT_BUFFER_WATERLINE = 0x10000;
 | |
| 
 | |
//Preprocessor
//NOTE: HTML input stream preprocessing
//(see: http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream)
//Input is supplied through write() / insertHtmlAtCurrentPos() and consumed one code point
//at a time with advance(); retreat() steps back over the most recently consumed one.
class Preprocessor {
    constructor() {
        //NOTE: buffered input. Stays null until the first write().
        this.html = null;

        //NOTE: `pos` is the index of the current code unit in `html`, `lastCharPos` the index
        //of the last buffered code unit, `lastGapPos` the most recent position recorded by _addGap().
        this.pos = -1;
        this.lastGapPos = -1;
        this.lastCharPos = -1;

        //NOTE: previously recorded gap positions; retreat() pops them back into `lastGapPos`.
        this.gapStack = [];

        //NOTE: raised after a CR so that an LF directly following it is dropped by advance().
        this.skipNextNewLine = false;

        this.lastChunkWritten = false;
        this.endOfChunkHit = false;
        this.bufferWaterline = DEFAULT_BUFFER_WATERLINE;
    }

    //NOTE: error reporting hook. Does nothing by default; enabled by mixin.
    _err() {}

    //NOTE: remembers the current position as a gap (a position that retreat() must step over),
    //saving the previous gap position on the stack.
    _addGap() {
        const previousGapPos = this.lastGapPos;

        this.gapStack.push(previousGapPos);
        this.lastGapPos = this.pos;
    }

    //NOTE: called from advance() when the current code unit `cp` is a surrogate.
    //Returns the combined code point when the next code unit completes a pair, $.EOF when the
    //buffer ends here and more chunks are expected, and `cp` itself (after reporting an error)
    //when the surrogate is isolated.
    _processSurrogate(cp) {
        const isAtLastBufferedChar = this.pos === this.lastCharPos;

        if (isAtLastBufferedChar) {
            //NOTE: end of chunk reached, so the second half of the pair may still arrive later.
            if (!this.lastChunkWritten) {
                this.endOfChunkHit = true;
                return $.EOF;
            }
        } else {
            //NOTE: peek at the following code unit to see if it completes a surrogate pair.
            const trailingCp = this.html.charCodeAt(this.pos + 1);

            if (unicode.isSurrogatePair(trailingCp)) {
                //NOTE: consume the second code unit and mark it as a gap so retreat() skips it.
                this.pos += 1;
                this._addGap();

                return unicode.getSurrogatePairCodePoint(cp, trailingCp);
            }
        }

        //NOTE: isolated surrogate
        this._err(ERR.surrogateInInputStream);

        return cp;
    }

    //NOTE: once `pos` has moved past `bufferWaterline`, discards everything before the current
    //position and rebases the position bookkeeping. Recorded gaps are forgotten.
    dropParsedChunk() {
        if (this.pos > this.bufferWaterline) {
            const consumedLength = this.pos;

            this.html = this.html.substring(consumedLength);
            this.lastCharPos -= consumedLength;
            this.pos = 0;
            this.gapStack = [];
            this.lastGapPos = -1;
        }
    }

    //NOTE: appends `chunk` to the buffer. `isLastChunk` is stored as `lastChunkWritten`,
    //which decides whether running out of input counts as hitting the end of a chunk.
    write(chunk, isLastChunk) {
        this.html = this.html ? this.html + chunk : chunk;

        this.lastCharPos = this.html.length - 1;
        this.endOfChunkHit = false;
        this.lastChunkWritten = isLastChunk;
    }

    //NOTE: splices `chunk` into the buffer right after the current position.
    insertHtmlAtCurrentPos(chunk) {
        const insertionIndex = this.pos + 1;
        const consumedPart = this.html.substring(0, insertionIndex);

        this.html = consumedPart + chunk + this.html.substring(insertionIndex);

        this.lastCharPos = this.html.length - 1;
        this.endOfChunkHit = false;
    }

    //NOTE: moves to the next code unit and returns its preprocessed code point,
    //or $.EOF when no buffered input is left.
    advance() {
        this.pos += 1;

        if (this.pos > this.lastCharPos) {
            //NOTE: out of input. This is an end-of-chunk hit only if more chunks may follow.
            this.endOfChunkHit = !this.lastChunkWritten;
            return $.EOF;
        }

        let codePoint = this.html.charCodeAt(this.pos);

        //NOTE: any U+000A LINE FEED (LF) character that immediately follows a U+000D CARRIAGE RETURN (CR)
        //character must be ignored: record it as a gap and re-enter advance() for the next code point.
        if (this.skipNextNewLine && codePoint === $.LINE_FEED) {
            this.skipNextNewLine = false;
            this._addGap();

            return this.advance();
        }

        //NOTE: every U+000D CARRIAGE RETURN (CR) character is converted to U+000A LINE FEED (LF).
        if (codePoint === $.CARRIAGE_RETURN) {
            this.skipNextNewLine = true;

            return $.LINE_FEED;
        }

        this.skipNextNewLine = false;

        if (unicode.isSurrogate(codePoint)) {
            codePoint = this._processSurrogate(codePoint);
        }

        //OPTIMIZATION: code points in the common allowed range (printable ASCII, new lines,
        //a big chunk of BMP) bypass the more expensive detailed validation.
        const needsDetailedCheck = !(
            (codePoint > 0x1f && codePoint < 0x7f) ||
            codePoint === $.LINE_FEED ||
            codePoint === $.CARRIAGE_RETURN ||
            (codePoint > 0x9f && codePoint < 0xfdd0)
        );

        if (needsDetailedCheck) {
            this._checkForProblematicCharacters(codePoint);
        }

        return codePoint;
    }

    //NOTE: reports a control code point or, failing that, an undefined code point (reported with
    //the noncharacter error code). At most one error is reported per call.
    _checkForProblematicCharacters(cp) {
        if (unicode.isControlCodePoint(cp)) {
            this._err(ERR.controlCharacterInInputStream);
            return;
        }

        if (unicode.isUndefinedCodePoint(cp)) {
            this._err(ERR.noncharacterInInputStream);
        }
    }

    //NOTE: steps back one position, or two when the current position is the last recorded gap
    //(in which case the previous gap position is restored from the stack).
    retreat() {
        let stepsBack = 1;

        if (this.pos === this.lastGapPos) {
            this.lastGapPos = this.gapStack.pop();
            stepsBack = 2;
        }

        this.pos -= stepsBack;
    }
}
 | |
| 
 | |
| module.exports = Preprocessor;
 |