From 68f90b5bd8873504ad752042b3e0a43ffd371e03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Thu, 24 Feb 2022 17:19:47 +0100 Subject: [PATCH] refactor(tokenizer): Use explicit offsets for locations (#402) --- packages/parse5/lib/tokenizer/index.ts | 84 +++++++++---------- packages/parse5/lib/tokenizer/preprocessor.ts | 5 +- .../tokenizer/tokenizer-location-info.test.ts | 19 +++++ 3 files changed, 64 insertions(+), 44 deletions(-) diff --git a/packages/parse5/lib/tokenizer/index.ts b/packages/parse5/lib/tokenizer/index.ts index 0068ea477..eead9ec90 100644 --- a/packages/parse5/lib/tokenizer/index.ts +++ b/packages/parse5/lib/tokenizer/index.ts @@ -220,6 +220,7 @@ export class Tokenizer { private consumedAfterSnapshot = -1; + private currentLocation: Location | null; private currentCharacterToken: CharacterToken | null = null; private currentToken: Token | null = null; private currentAttr: Attribute = { name: '', value: '' }; @@ -231,6 +232,7 @@ export class Tokenizer { this.addLocationInfo = !!options.sourceCodeLocationInfo; this.onParseError = options.onParseError ?? null; this.preprocessor = new Preprocessor(options); + this.currentLocation = this.getCurrentLocation(-1); } //Errors @@ -238,17 +240,16 @@ export class Tokenizer { this.onParseError?.(this.preprocessor.getError(code)); } - private currentAttrLocation: Location | null = null; - private ctLoc: Location | null = null; - private _getCurrentLocation(): Location | null { + // NOTE: `offset` may never run across line boundaries. + private getCurrentLocation(offset: number): Location | null { if (!this.addLocationInfo) { return null; } return { startLine: this.preprocessor.line, - startCol: this.preprocessor.col, - startOffset: this.preprocessor.offset, + startCol: this.preprocessor.col - offset, + startOffset: this.preprocessor.offset - offset, endLine: -1, endCol: -1, endOffset: -1, @@ -334,7 +335,7 @@ export class Tokenizer { selfClosing: false, ackSelfClosing: false, attrs: [], - location: this.ctLoc, + location: this.getCurrentLocation(1), }; } @@ -346,15 +347,15 @@ export class Tokenizer { selfClosing: false, ackSelfClosing: false, attrs: [], - location: this.ctLoc, + location: this.getCurrentLocation(2), }; } - private _createCommentToken(): void { + private _createCommentToken(offset: number): void { this.currentToken = { type: TokenType.COMMENT, data: '', - location: this.ctLoc, + location: this.getCurrentLocation(offset), }; } @@ -365,7 +366,7 @@ export class Tokenizer { forceQuirks: false, publicId: null, systemId: null, - location: this.ctLoc, + location: this.currentLocation, }; } @@ -373,20 +374,20 @@ export class Tokenizer { this.currentCharacterToken = { type, chars, - location: this.ctLoc, + location: this.currentLocation, }; } private _createEOFToken(): void { - const ctLoc = this._getCurrentLocation(); + const location = this.getCurrentLocation(0); - if (ctLoc) { - ctLoc.endLine = ctLoc.startLine; - ctLoc.endCol = ctLoc.startCol; - ctLoc.endOffset = ctLoc.startOffset; + if (location) { + location.endLine = location.startLine; + location.endCol = location.startCol; + location.endOffset = location.startOffset; } - this.currentToken = { type: TokenType.EOF, location: ctLoc }; + this.currentToken = { type: TokenType.EOF, location }; } //Tag attributes @@ -395,7 +396,7 @@ export class Tokenizer { name: attrNameFirstCh, value: '', }; - this.currentAttrLocation = this._getCurrentLocation(); + this.currentLocation = this.getCurrentLocation(0); } private _leaveAttrName(): void { @@ -404,9 +405,9 @@ export class Tokenizer { if (getTokenAttr(token, this.currentAttr.name) === null) { token.attrs.push(this.currentAttr); - if (token.location) { + if (token.location && this.currentLocation) { const attrLocations = (token.location.attrs ??= Object.create(null)); - attrLocations[this.currentAttr.name] = this.currentAttrLocation!; + attrLocations[this.currentAttr.name] = this.currentLocation; // Set end location this._leaveAttrValue(); @@ -417,19 +418,19 @@ export class Tokenizer { } private _leaveAttrValue(): void { - if (this.currentAttrLocation) { - this.currentAttrLocation.endLine = this.preprocessor.line; - this.currentAttrLocation.endCol = this.preprocessor.col; - this.currentAttrLocation.endOffset = this.preprocessor.offset; + if (this.currentLocation) { + this.currentLocation.endLine = this.preprocessor.line; + this.currentLocation.endCol = this.preprocessor.col; + this.currentLocation.endOffset = this.preprocessor.offset; } } //Token emission private _emitCurrentToken(): void { - this._emitCurrentCharacterToken(); - const ct = this.currentToken!; + this._emitCurrentCharacterToken(ct.location); + this.currentToken = null; //NOTE: store emited start tag's tagName to determine is the following end tag token is appropriate. @@ -462,16 +463,17 @@ export class Tokenizer { } this.tokenQueue.push(ct); + this.currentLocation = this.getCurrentLocation(-1); } - private _emitCurrentCharacterToken(): void { + private _emitCurrentCharacterToken(nextLocation: Location | null): void { if (this.currentCharacterToken) { //NOTE: if we have pending character token make it's end location equal to the //current token's start location. - if (this.ctLoc && this.currentCharacterToken.location) { - this.currentCharacterToken.location.endLine = this.ctLoc.startLine; - this.currentCharacterToken.location.endCol = this.ctLoc.startCol; - this.currentCharacterToken.location.endOffset = this.ctLoc.startOffset; + if (nextLocation && this.currentCharacterToken.location) { + this.currentCharacterToken.location.endLine = nextLocation.startLine; + this.currentCharacterToken.location.endCol = nextLocation.startCol; + this.currentCharacterToken.location.endOffset = nextLocation.startOffset; } this.tokenQueue.push(this.currentCharacterToken); @@ -496,7 +498,8 @@ export class Tokenizer { //3)TokenType.CHARACTER - any character sequence which don't belong to groups 1 and 2 (e.g. 'abcdef1234@@#$%^') private _appendCharToCurrentCharacterToken(type: CharacterToken['type'], ch: string): void { if (this.currentCharacterToken && this.currentCharacterToken.type !== type) { - this._emitCurrentCharacterToken(); + this.currentLocation = this.getCurrentLocation(0); + this._emitCurrentCharacterToken(this.currentLocation); } if (this.currentCharacterToken) { @@ -925,7 +928,6 @@ export class Tokenizer { //------------------------------------------------------------------ private _stateData(cp: number): void { this.preprocessor.dropParsedChunk(); - this.ctLoc = this._getCurrentLocation(); switch (cp) { case $.LESS_THAN_SIGN: { @@ -956,7 +958,6 @@ export class Tokenizer { //------------------------------------------------------------------ private _stateRcdata(cp: number): void { this.preprocessor.dropParsedChunk(); - this.ctLoc = this._getCurrentLocation(); switch (cp) { case $.AMPERSAND: { @@ -987,7 +988,6 @@ export class Tokenizer { //------------------------------------------------------------------ private _stateRawtext(cp: number): void { this.preprocessor.dropParsedChunk(); - this.ctLoc = this._getCurrentLocation(); switch (cp) { case $.LESS_THAN_SIGN: { @@ -1013,7 +1013,6 @@ export class Tokenizer { //------------------------------------------------------------------ private _stateScriptData(cp: number): void { this.preprocessor.dropParsedChunk(); - this.ctLoc = this._getCurrentLocation(); switch (cp) { case $.LESS_THAN_SIGN: { @@ -1039,7 +1038,6 @@ export class Tokenizer { //------------------------------------------------------------------ private _statePlaintext(cp: number): void { this.preprocessor.dropParsedChunk(); - this.ctLoc = this._getCurrentLocation(); switch (cp) { case $.NULL: { @@ -1076,7 +1074,7 @@ export class Tokenizer { } case $.QUESTION_MARK: { this._err(ERR.unexpectedQuestionMarkInsteadOfTagName); - this._createCommentToken(); + this._createCommentToken(1); this.state = State.BOGUS_COMMENT; this._stateBogusComment(cp); break; @@ -1118,7 +1116,7 @@ export class Tokenizer { } default: { this._err(ERR.invalidFirstCharacterOfTagName); - this._createCommentToken(); + this._createCommentToken(2); this.state = State.BOGUS_COMMENT; this._stateBogusComment(cp); } @@ -1956,16 +1954,18 @@ export class Tokenizer { //------------------------------------------------------------------ private _stateMarkupDeclarationOpen(cp: number): void { if (this._consumeSequenceIfMatch($$.DASH_DASH, true)) { - this._createCommentToken(); + this._createCommentToken($$.DASH_DASH.length + 1); this.state = State.COMMENT_START; } else if (this._consumeSequenceIfMatch($$.DOCTYPE, false)) { + // NOTE: Doctypes tokens are created without fixed offsets. We keep track of the moment a doctype *might* start here. + this.currentLocation = this.getCurrentLocation($$.DOCTYPE.length + 1); this.state = State.DOCTYPE; } else if (this._consumeSequenceIfMatch($$.CDATA_START, true)) { if (this.allowCDATA) { this.state = State.CDATA_SECTION; } else { this._err(ERR.cdataInHtmlContent); - this._createCommentToken(); + this._createCommentToken($$.CDATA_START.length + 1); (this.currentToken as CommentToken).data = '[CDATA['; this.state = State.BOGUS_COMMENT; } @@ -1975,7 +1975,7 @@ export class Tokenizer { //results are no longer valid and we will need to start over. else if (!this._ensureHibernation()) { this._err(ERR.incorrectlyOpenedComment); - this._createCommentToken(); + this._createCommentToken(2); this.state = State.BOGUS_COMMENT; this._stateBogusComment(cp); } diff --git a/packages/parse5/lib/tokenizer/preprocessor.ts b/packages/parse5/lib/tokenizer/preprocessor.ts index 18258ab57..7cb6d1ec3 100644 --- a/packages/parse5/lib/tokenizer/preprocessor.ts +++ b/packages/parse5/lib/tokenizer/preprocessor.ts @@ -17,7 +17,8 @@ const DEFAULT_BUFFER_WATERLINE = 1 << 16; export class Preprocessor { public html = ''; private pos = -1; - private lastGapPos = -1; + // NOTE: Initial `lastGapPos` is -2, to ensure `col` on initialisation is 0 + private lastGapPos = -2; private gapStack: number[] = []; private skipNextNewLine = false; private lastChunkWritten = false; @@ -106,7 +107,7 @@ export class Preprocessor { this.lineStartPos -= this.pos; this.droppedBufferSize += this.pos; this.pos = 0; - this.lastGapPos = -1; + this.lastGapPos = -2; this.gapStack.length = 0; } } diff --git a/packages/parse5/lib/tokenizer/tokenizer-location-info.test.ts b/packages/parse5/lib/tokenizer/tokenizer-location-info.test.ts index 3b01d95a4..e45a464ce 100644 --- a/packages/parse5/lib/tokenizer/tokenizer-location-info.test.ts +++ b/packages/parse5/lib/tokenizer/tokenizer-location-info.test.ts @@ -77,6 +77,24 @@ it('Location Info (Tokenizer)', () => { lastStartTagName: 'plaintext', htmlChunks: ['Text', ' \n', 'Test
'], }, + { + initialMode: TokenizerMode.DATA, + lastStartTagName: '', + htmlChunks: [ + '\n', + '', + '', + '', + '', + '', + ], + }, + { + initialMode: TokenizerMode.DATA, + lastStartTagName: '', + allowCDATA: true, + htmlChunks: ['', '', '', ' \n'], + }, ]; for (const testCase of testCases) { @@ -93,6 +111,7 @@ it('Location Info (Tokenizer)', () => { tokenizer.preprocessor.bufferWaterline = 8; tokenizer.state = testCase.initialMode; tokenizer.lastStartTagName = testCase.lastStartTagName; + tokenizer.allowCDATA = !!testCase.allowCDATA; for (let token = tokenizer.getNextToken(), j = 0; token.type !== TokenType.EOF; ) { if (token.type === TokenType.HIBERNATION) {