wip: refactor line / column generation

This commit is contained in:
Evan You 2023-11-14 16:35:52 +08:00
parent 6311ebbf32
commit f7db16b9cf
2 changed files with 76 additions and 81 deletions

View File

@ -27,6 +27,7 @@ import {
DecodingMode,
htmlDecodeTree
} from 'entities/lib/decode.js'
import { Position } from '../ast'
const enum CharCodes {
Tab = 0x9, // "\t"
@ -120,21 +121,25 @@ export enum QuoteType {
}
export interface Callbacks {
ontext(start: number, endIndex: number): void
ontextentity(codepoint: number, endIndex: number): void
onopentagname(start: number, endIndex: number): void
onopentagend(endIndex: number): void
onselfclosingtag(endIndex: number): void
onclosetag(start: number, endIndex: number): void
onattribdata(start: number, endIndex: number): void
onattribentity(codepoint: number): void
onattribend(quote: QuoteType, endIndex: number): void
onattribname(start: number, endIndex: number): void
oncdata(start: number, endIndex: number, endOffset: number): void
onclosetag(start: number, endIndex: number): void
oncomment(start: number, endIndex: number, endOffset: number): void
ondeclaration(start: number, endIndex: number): void
oncdata(start: number, endIndex: number, endOffset: number): void
// onprocessinginstruction(start: number, endIndex: number): void
// ondeclaration(start: number, endIndex: number): void
onend(): void
onopentagend(endIndex: number): void
onopentagname(start: number, endIndex: number): void
onprocessinginstruction(start: number, endIndex: number): void
onselfclosingtag(endIndex: number): void
ontext(start: number, endIndex: number): void
ontextentity(codepoint: number, endIndex: number): void
}
/**
@ -167,15 +172,12 @@ export default class Tokenizer {
private baseState = State.Text
/** For special parsing behavior inside of script and style tags. */
private isSpecial = false
/** Record newline positions for fast line / column calculation */
private newlines: number[] = []
private readonly decodeEntities: boolean
private readonly entityDecoder: EntityDecoder
public line = 1
public column = 1
public startLine = 1
public startColumn = 1
constructor(
{ decodeEntities = true }: { decodeEntities?: boolean },
private readonly cbs: Callbacks
@ -189,20 +191,35 @@ export default class Tokenizer {
public reset(): void {
this.state = State.Text
this.buffer = ''
this.recordStart(0)
this.sectionStart = 0
this.index = 0
this.line = 1
this.column = 1
this.startLine = 1
this.startColumn = 1
this.baseState = State.Text
this.currentSequence = undefined!
this.newlines.length = 0
}
private recordStart(start = this.index) {
this.sectionStart = start
this.startLine = this.line
this.startColumn = this.column + (start - this.index)
/**
 * Build a `Position` (`offset`, `line`, `column`) for a source index from the
 * newline offsets collected in `this.newlines`.
 *
 * Both `line` and `column` are 1-based. The lookup only consults newlines that
 * have already been recorded, so callers are expected to pass an index that
 * has already been processed.
 *
 * @param index - offset into the input to translate
 * @returns the position whose `offset` equals `index`
 */
public getPositionForIndex(index: number): Position {
  const { newlines } = this
  // Scan backwards for the last recorded newline located strictly before
  // `index`; `hit` ends at -1 when there is none.
  let hit = newlines.length - 1
  while (hit >= 0 && !(index > newlines[hit])) {
    hit--
  }
  // No newline precedes the index: it is on the first line and the column is
  // simply the 1-based offset.
  if (hit < 0) {
    return { offset: index, line: 1, column: index + 1 }
  }
  return {
    offset: index,
    // newlines[hit] terminates line `hit + 1`, so the index sits on the next one.
    line: hit + 2,
    // Distance from the newline character yields a 1-based column.
    column: index - newlines[hit]
  }
}
private stateText(c: number): void {
@ -214,7 +231,7 @@ export default class Tokenizer {
this.cbs.ontext(this.sectionStart, this.index)
}
this.state = State.BeforeTagName
this.recordStart()
this.sectionStart = this.index
} else if (this.decodeEntities && c === CharCodes.Amp) {
this.startEntity()
}
@ -257,7 +274,7 @@ export default class Tokenizer {
}
this.isSpecial = false
this.recordStart(endOfText + 2) // Skip over the `</`
this.sectionStart = endOfText + 2 // Skip over the `</`
this.stateInClosingTagName(c)
return // We are done; skip the rest of the function.
}
@ -289,7 +306,7 @@ export default class Tokenizer {
this.state = State.InCommentLike
this.currentSequence = Sequences.CdataEnd
this.sequenceIndex = 0
this.recordStart(this.index + 1)
this.sectionStart = this.index + 1
}
} else {
this.sequenceIndex = 0
@ -340,7 +357,7 @@ export default class Tokenizer {
}
this.sequenceIndex = 0
this.recordStart(this.index + 1)
this.sectionStart = this.index + 1
this.state = State.Text
}
} else if (this.sequenceIndex === 0) {
@ -374,13 +391,13 @@ export default class Tokenizer {
private stateBeforeTagName(c: number): void {
if (c === CharCodes.ExclamationMark) {
this.state = State.BeforeDeclaration
this.recordStart(this.index + 1)
this.sectionStart = this.index + 1
} else if (c === CharCodes.Questionmark) {
this.state = State.InProcessingInstruction
this.recordStart(this.index + 1)
this.sectionStart = this.index + 1
} else if (this.isTagStartChar(c)) {
const lower = c | 0x20
this.recordStart()
this.sectionStart = this.index
if (lower === Sequences.TitleEnd[2]) {
this.startSpecial(Sequences.TitleEnd, 3)
} else {
@ -399,7 +416,7 @@ export default class Tokenizer {
private stateInTagName(c: number): void {
if (isEndOfTagSection(c)) {
this.cbs.onopentagname(this.sectionStart, this.index)
this.recordStart(-1)
this.sectionStart = -1
this.state = State.BeforeAttributeName
this.stateBeforeAttributeName(c)
}
@ -413,13 +430,13 @@ export default class Tokenizer {
this.state = this.isTagStartChar(c)
? State.InClosingTagName
: State.InSpecialComment
this.recordStart()
this.sectionStart = this.index
}
}
private stateInClosingTagName(c: number): void {
if (c === CharCodes.Gt || isWhitespace(c)) {
this.cbs.onclosetag(this.sectionStart, this.index)
this.recordStart(-1)
this.sectionStart = -1
this.state = State.AfterClosingTagName
this.stateAfterClosingTagName(c)
}
@ -428,7 +445,7 @@ export default class Tokenizer {
// Skip everything until ">"
if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
this.state = State.Text
this.recordStart(this.index + 1)
this.sectionStart = this.index + 1
}
}
private stateBeforeAttributeName(c: number): void {
@ -440,19 +457,19 @@ export default class Tokenizer {
} else {
this.state = State.Text
}
this.recordStart(this.index + 1)
this.sectionStart = this.index + 1
} else if (c === CharCodes.Slash) {
this.state = State.InSelfClosingTag
} else if (!isWhitespace(c)) {
this.state = State.InAttributeName
this.recordStart()
this.sectionStart = this.index
}
}
private stateInSelfClosingTag(c: number): void {
if (c === CharCodes.Gt) {
this.cbs.onselfclosingtag(this.index)
this.state = State.Text
this.recordStart(this.index + 1)
this.sectionStart = this.index + 1
this.isSpecial = false // Reset special state, in case of self-closing special tags
} else if (!isWhitespace(c)) {
this.state = State.BeforeAttributeName
@ -462,7 +479,7 @@ export default class Tokenizer {
private stateInAttributeName(c: number): void {
if (c === CharCodes.Eq || isEndOfTagSection(c)) {
this.cbs.onattribname(this.sectionStart, this.index)
this.recordStart()
this.sectionStart = this.index
this.state = State.AfterAttributeName
this.stateAfterAttributeName(c)
}
@ -472,24 +489,24 @@ export default class Tokenizer {
this.state = State.BeforeAttributeValue
} else if (c === CharCodes.Slash || c === CharCodes.Gt) {
this.cbs.onattribend(QuoteType.NoValue, this.sectionStart)
this.recordStart(-1)
this.sectionStart = -1
this.state = State.BeforeAttributeName
this.stateBeforeAttributeName(c)
} else if (!isWhitespace(c)) {
this.cbs.onattribend(QuoteType.NoValue, this.sectionStart)
this.state = State.InAttributeName
this.recordStart()
this.sectionStart = this.index
}
}
private stateBeforeAttributeValue(c: number): void {
if (c === CharCodes.DoubleQuote) {
this.state = State.InAttributeValueDq
this.recordStart(this.index + 1)
this.sectionStart = this.index + 1
} else if (c === CharCodes.SingleQuote) {
this.state = State.InAttributeValueSq
this.recordStart(this.index + 1)
this.sectionStart = this.index + 1
} else if (!isWhitespace(c)) {
this.recordStart()
this.sectionStart = this.index
this.state = State.InAttributeValueNq
this.stateInAttributeValueNoQuotes(c) // Reconsume token
}
@ -497,7 +514,7 @@ export default class Tokenizer {
private handleInAttributeValue(c: number, quote: number) {
if (c === quote || (!this.decodeEntities && this.fastForwardTo(quote))) {
this.cbs.onattribdata(this.sectionStart, this.index)
this.recordStart(-1)
this.sectionStart = -1
this.cbs.onattribend(
quote === CharCodes.DoubleQuote ? QuoteType.Double : QuoteType.Single,
this.index + 1
@ -516,7 +533,7 @@ export default class Tokenizer {
private stateInAttributeValueNoQuotes(c: number): void {
if (isWhitespace(c) || c === CharCodes.Gt) {
this.cbs.onattribdata(this.sectionStart, this.index)
this.recordStart(-1)
this.sectionStart = -1
this.cbs.onattribend(QuoteType.Unquoted, this.index)
this.state = State.BeforeAttributeName
this.stateBeforeAttributeName(c)
@ -535,16 +552,16 @@ export default class Tokenizer {
}
private stateInDeclaration(c: number): void {
if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
this.cbs.ondeclaration(this.sectionStart, this.index)
// this.cbs.ondeclaration(this.sectionStart, this.index)
this.state = State.Text
this.recordStart(this.index + 1)
this.sectionStart = this.index + 1
}
}
private stateInProcessingInstruction(c: number): void {
if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
this.cbs.onprocessinginstruction(this.sectionStart, this.index)
// this.cbs.onprocessinginstruction(this.sectionStart, this.index)
this.state = State.Text
this.recordStart(this.index + 1)
this.sectionStart = this.index + 1
}
}
private stateBeforeComment(c: number): void {
@ -553,7 +570,7 @@ export default class Tokenizer {
this.currentSequence = Sequences.CommentEnd
// Allow short comments (eg. <!-->)
this.sequenceIndex = 2
this.recordStart(this.index + 1)
this.sectionStart = this.index + 1
} else {
this.state = State.InDeclaration
}
@ -562,7 +579,7 @@ export default class Tokenizer {
if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
this.cbs.oncomment(this.sectionStart, this.index, 0)
this.state = State.Text
this.recordStart(this.index + 1)
this.sectionStart = this.index + 1
}
}
private stateBeforeSpecialS(c: number): void {
@ -715,14 +732,10 @@ export default class Tokenizer {
break
}
}
this.index++
// line / column handling
if (c === CharCodes.NewLine) {
this.line++
this.column = 1
} else {
this.column++
this.newlines.push(this.index)
}
this.index++
}
this.cleanup()
this.finish()
@ -739,14 +752,14 @@ export default class Tokenizer {
(this.state === State.InSpecialTag && this.sequenceIndex === 0)
) {
this.cbs.ontext(this.sectionStart, this.index)
this.recordStart()
this.sectionStart = this.index
} else if (
this.state === State.InAttributeValueDq ||
this.state === State.InAttributeValueSq ||
this.state === State.InAttributeValueNq
) {
this.cbs.onattribdata(this.sectionStart, this.index)
this.recordStart()
this.sectionStart = this.index
}
}
}
@ -805,7 +818,7 @@ export default class Tokenizer {
if (this.sectionStart < this.entityStart) {
this.cbs.onattribdata(this.sectionStart, this.entityStart)
}
this.recordStart(this.entityStart + consumed)
this.sectionStart = this.entityStart + consumed
this.index = this.sectionStart - 1
this.cbs.onattribentity(cp)
@ -813,7 +826,7 @@ export default class Tokenizer {
if (this.sectionStart < this.entityStart) {
this.cbs.ontext(this.sectionStart, this.entityStart)
}
this.recordStart(this.entityStart + consumed)
this.sectionStart = this.entityStart + consumed
this.index = this.sectionStart - 1
this.cbs.ontextentity(cp, this.sectionStart)

View File

@ -228,20 +228,6 @@ const tokenizer = new Tokenizer(
endIndex = end
// TODO throw error
startIndex = end + 1
},
// TODO ignore
ondeclaration(start, end) {
endIndex = end
// TODO onprocessinginstruction
startIndex = end + 1
},
// TODO ignore
onprocessinginstruction(start, end) {
endIndex = end
// TODO onprocessinginstruction
startIndex = end + 1
}
}
)
@ -306,12 +292,8 @@ function onText(content: string, start: number, end: number) {
type: NodeTypes.TEXT,
content,
loc: {
start: {
offset: start,
line: tokenizer.startLine,
column: tokenizer.startColumn
},
end: { offset: end, line: tokenizer.line, column: tokenizer.column },
start: tokenizer.getPositionForIndex(start),
end: tokenizer.getPositionForIndex(end),
source: content
}
})