wip: parse directive in tokenizer

This commit is contained in:
Evan You 2023-11-15 01:14:36 +08:00
parent 622d34efe1
commit 08038a938c
3 changed files with 149 additions and 482 deletions

View File

@ -1,446 +0,0 @@
import Tokenizer, { Callbacks, QuoteType } from './Tokenizer.js'
import { fromCodePoint } from 'entities/lib/decode.js'
/** Form-control tags closed implicitly when another form control opens. */
const formTags = new Set([
  'input', 'option', 'optgroup', 'select',
  'button', 'datalist', 'textarea'
])
/** An open `<p>` is closed implicitly by any of the block-level openers below. */
const pTag = new Set(['p'])
const tableSectionTags = new Set(['thead', 'tbody'])
const ddtTags = new Set(['dd', 'dt'])
const rtpTags = new Set(['rt', 'rp'])

/**
 * Groups of opening tag names that share one "implicitly closes" set.
 * Groups are listed in the order their keys are inserted into the map.
 */
const impliedCloseGroups: ReadonlyArray<
  readonly [readonly string[], Set<string>]
> = [
  [['tr'], new Set(['tr', 'th', 'td'])],
  [['th'], new Set(['th'])],
  [['td'], new Set(['thead', 'th', 'td'])],
  [['body'], new Set(['head', 'link', 'script'])],
  [['li'], new Set(['li'])],
  [['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'], pTag],
  [['select', 'input', 'output', 'button', 'datalist', 'textarea'], formTags],
  [['option'], new Set(['option'])],
  [['optgroup'], new Set(['optgroup', 'option'])],
  [['dd', 'dt'], ddtTags],
  [
    [
      'address', 'article', 'aside', 'blockquote', 'details', 'div', 'dl',
      'fieldset', 'figcaption', 'figure', 'footer', 'form', 'header', 'hr',
      'main', 'nav', 'ol', 'pre', 'section', 'table', 'ul'
    ],
    pTag
  ],
  [['rt', 'rp'], rtpTags],
  [['tbody', 'tfoot'], tableSectionTags]
]

/**
 * Maps an opening tag name to the set of tag names that are closed
 * implicitly when they sit on top of the open-element stack at the moment
 * that tag opens.
 */
const openImpliesClose = new Map<string, Set<string>>()
for (const [openers, closedTags] of impliedCloseGroups) {
  for (const opener of openers) {
    openImpliesClose.set(opener, closedTags)
  }
}
/**
 * Tag names treated as void elements: they are never pushed onto the
 * open-element stack and get their close event right after the open tag.
 */
const voidElements = new Set([
  'area', 'base', 'basefont', 'br', 'col',
  'command', 'embed', 'frame', 'hr', 'img',
  'input', 'isindex', 'keygen', 'link', 'meta',
  'param', 'source', 'track', 'wbr'
])
/** Elements whose opening pushes `true` onto the parser's foreign-context stack. */
const foreignContextElements = new Set(['math', 'svg'])

/** Elements whose opening pushes `false` onto the parser's foreign-context stack. */
const htmlIntegrationElements = new Set([
  'mi', 'mo', 'mn', 'ms', 'mtext',
  'annotation-xml', 'foreignobject', 'desc', 'title'
])
/**
* Options accepted by the `Parser` constructor. The parser forwards the same
* object, unchanged, to the `Tokenizer` constructor.
*/
export interface ParserOptions {
/**
* Decode entities within the document.
*
* @default true
*/
decodeEntities?: boolean
}
/**
* Callbacks a consumer may supply to `Parser`. The parser stores them as
* `Partial<Handler>` and invokes each one through optional chaining, so any
* subset may be implemented.
*/
export interface Handler {
/**
* Receives the parser instance. Called from the `Parser` constructor and
* again from `Parser.reset()`.
*/
onparserinit(parser: Parser): void
/**
* Resets the handler back to starting state
*/
onreset(): void
/**
* Signals the handler that parsing is done
*/
onend(): void
/**
* Error notification.
* NOTE(review): the `Parser` class in this file never invokes this callback.
*/
onerror(error: Error): void
/**
* An element was closed. `isImplied` is `true` when the parser synthesised
* the close itself (void elements, elements closed because another tag
* opened or closed, self-closing tags, elements still open at end of input).
*/
onclosetag(name: string, isImplied: boolean): void
/**
* The name of an opening tag was read; fired before any of its attributes.
*/
onopentagname(name: string): void
/**
* An attribute was fully read.
*
* @param name Name of the attribute
* @param value Value of the attribute.
* @param quote Quotes used around the attribute. `null` if the attribute has no quotes around the value, `undefined` if the attribute has no value.
*/
onattribute(
name: string,
value: string,
quote?: string | undefined | null
): void
/**
* An opening tag ended. `attribs` holds the attributes collected for the
* tag (the first occurrence wins when a name repeats). `isImplied` is `true`
* when the parser synthesised the tag, as it does for a stray `</p>` or `</br>`.
*/
onopentag(
name: string,
attribs: { [s: string]: string },
isImplied: boolean
): void
/**
* Text content. Also used for each decoded text entity and for the content
* of a CDATA section (between `oncdatastart` and `oncdataend`).
*/
ontext(data: string): void
/** Content of a comment; always followed by `oncommentend`. */
oncomment(data: string): void
/** Fired immediately before the `ontext` call carrying CDATA content. */
oncdatastart(): void
/** Fired immediately after the `ontext` call carrying CDATA content. */
oncdataend(): void
/** Fired immediately after `oncomment`. */
oncommentend(): void
/**
* A declaration or processing instruction. The parser prefixes both `name`
* and `data` with `!` for declarations and with `?` for processing
* instructions.
*/
onprocessinginstruction(name: string, data: string): void
}
/** Matches the first character that ends an instruction name: whitespace or `/`. */
const reNameEnd = /[\s\/]/
/**
 * Callback-driven HTML parser layered on top of `Tokenizer`.
 *
 * The tokenizer reports index ranges into the input; this class slices the
 * matching text out of `buffer`, maintains the stack of open elements
 * (emitting implied close events where needed) and forwards higher-level
 * events to whichever `Handler` callbacks were supplied.
 */
export class Parser implements Callbacks {
  /** The start index of the last event. */
  public startIndex = 0
  /** The end index of the last event. */
  public endIndex = 0
  /**
   * Store the start index of the current open tag,
   * so we can update the start index for attributes.
   */
  private openTagStart = 0

  /** Name of the open tag currently being read (`''` when none). */
  private tagname = ''
  /** Name of the attribute currently being read. */
  private attribname = ''
  /** Value accumulated so far for the attribute currently being read. */
  private attribvalue = ''
  /** Attributes collected for the current open tag; only allocated when `onopentag` is handled. */
  private attribs: null | { [key: string]: string } = null
  /** Open elements, innermost first (index 0 is the top of the stack). */
  private readonly stack: string[] = []
  /**
   * Foreign-context flags, innermost first. Entries are pushed when `math`/`svg`
   * or HTML integration elements open and popped when they close.
   * NOTE(review): nothing in this class reads the flags; they are only maintained.
   */
  private readonly foreignContext: boolean[]
  private readonly cbs: Partial<Handler>
  private readonly tokenizer: Tokenizer
  /** The input passed to the most recent `parse()` call. */
  private buffer: string = ''

  /**
   * @param cbs Handler callbacks; `null`/`undefined` means no callbacks.
   * @param options Forwarded unchanged to the `Tokenizer` constructor.
   */
  constructor(
    cbs?: Partial<Handler> | null,
    private readonly options: ParserOptions = {}
  ) {
    this.cbs = cbs ?? {}
    this.tokenizer = new Tokenizer(this.options, this)
    this.foreignContext = [false]
    this.cbs.onparserinit?.(this)
  }

  // Tokenizer event handlers

  /** @internal */
  ontext(start: number, endIndex: number): void {
    const data = this.getSlice(start, endIndex)
    this.endIndex = endIndex - 1
    this.cbs.ontext?.(data)
    this.startIndex = endIndex
  }

  /** @internal */
  ontextentity(cp: number, endIndex: number): void {
    this.endIndex = endIndex - 1
    this.cbs.ontext?.(fromCodePoint(cp))
    this.startIndex = endIndex
  }

  /** @internal */
  onopentagname(start: number, endIndex: number): void {
    this.endIndex = endIndex
    this.emitOpenTag(this.getSlice(start, endIndex))
  }

  /**
   * Registers `name` as the tag being opened: closes any elements on top of
   * the stack that this tag implicitly closes, pushes non-void elements onto
   * the stack and fires `onopentagname`.
   */
  private emitOpenTag(name: string): void {
    this.openTagStart = this.startIndex
    this.tagname = name

    const impliesClose = openImpliesClose.get(name)
    if (impliesClose) {
      while (this.stack.length > 0 && impliesClose.has(this.stack[0])) {
        const element = this.stack.shift()!
        this.cbs.onclosetag?.(element, true)
      }
    }
    if (!voidElements.has(name)) {
      this.stack.unshift(name)
      if (foreignContextElements.has(name)) {
        this.foreignContext.unshift(true)
      } else if (htmlIntegrationElements.has(name)) {
        this.foreignContext.unshift(false)
      }
    }
    this.cbs.onopentagname?.(name)
    // Attributes are only collected when somebody will receive them.
    if (this.cbs.onopentag) this.attribs = {}
  }

  /**
   * Finishes the current open tag: fires `onopentag` with the collected
   * attributes and, for void elements, the implied `onclosetag`.
   */
  private endOpenTag(isImplied: boolean): void {
    this.startIndex = this.openTagStart

    if (this.attribs) {
      this.cbs.onopentag?.(this.tagname, this.attribs, isImplied)
      this.attribs = null
    }
    if (this.cbs.onclosetag && voidElements.has(this.tagname)) {
      this.cbs.onclosetag(this.tagname, true)
    }

    this.tagname = ''
  }

  /** @internal */
  onopentagend(endIndex: number): void {
    this.endIndex = endIndex
    this.endOpenTag(false)

    // Set `startIndex` for next node
    this.startIndex = endIndex + 1
  }

  /** @internal */
  onclosetag(start: number, endIndex: number): void {
    this.endIndex = endIndex
    const name = this.getSlice(start, endIndex)

    if (foreignContextElements.has(name) || htmlIntegrationElements.has(name)) {
      this.foreignContext.shift()
    }

    if (!voidElements.has(name)) {
      const pos = this.stack.indexOf(name)
      if (pos !== -1) {
        // Close everything above the matching element (implied), then the
        // element itself (explicit).
        for (let index = 0; index <= pos; index++) {
          const element = this.stack.shift()!
          // We know the stack has sufficient elements.
          this.cbs.onclosetag?.(element, index !== pos)
        }
      } else if (name === 'p') {
        // Implicit open before close
        this.emitOpenTag('p')
        this.closeCurrentTag(true)
      }
    } else if (name === 'br') {
      // We can't use `emitOpenTag` for implicit open, as `br` would be implicitly closed.
      this.cbs.onopentagname?.('br')
      this.cbs.onopentag?.('br', {}, true)
      this.cbs.onclosetag?.('br', false)
    }

    // Set `startIndex` for next node
    this.startIndex = endIndex + 1
  }

  /** @internal */
  onselfclosingtag(endIndex: number): void {
    this.endIndex = endIndex
    this.closeCurrentTag(false)

    // Set `startIndex` for next node
    this.startIndex = endIndex + 1
  }

  /**
   * Ends the current open tag and, if it is on top of the stack, closes it
   * straight away.
   */
  private closeCurrentTag(isOpenImplied: boolean): void {
    const name = this.tagname
    this.endOpenTag(isOpenImplied)

    // Self-closing tags will be on the top of the stack
    if (this.stack[0] === name) {
      // If the opening tag isn't implied, the closing tag has to be implied.
      this.cbs.onclosetag?.(name, !isOpenImplied)
      this.stack.shift()
    }
  }

  /** @internal */
  onattribname(start: number, endIndex: number): void {
    this.startIndex = start
    this.attribname = this.getSlice(start, endIndex)
  }

  /** @internal */
  onattribdata(start: number, endIndex: number): void {
    this.attribvalue += this.getSlice(start, endIndex)
  }

  /** @internal */
  onattribentity(cp: number): void {
    this.attribvalue += fromCodePoint(cp)
  }

  /** @internal */
  onattribend(quote: QuoteType, endIndex: number): void {
    this.endIndex = endIndex

    this.cbs.onattribute?.(
      this.attribname,
      this.attribvalue,
      quote === QuoteType.Double
        ? '"'
        : quote === QuoteType.Single
          ? "'"
          : quote === QuoteType.NoValue
            ? undefined
            : null
    )

    // The first occurrence of an attribute name wins.
    if (
      this.attribs &&
      !Object.prototype.hasOwnProperty.call(this.attribs, this.attribname)
    ) {
      this.attribs[this.attribname] = this.attribvalue
    }
    this.attribvalue = ''
  }

  /** Returns the part of `value` before the first whitespace or `/`. */
  private getInstructionName(value: string): string {
    const index = value.search(reNameEnd)
    return index < 0 ? value : value.slice(0, index)
  }

  /** @internal */
  ondeclaration(start: number, endIndex: number): void {
    this.endIndex = endIndex
    const value = this.getSlice(start, endIndex)

    if (this.cbs.onprocessinginstruction) {
      const name = this.getInstructionName(value)
      this.cbs.onprocessinginstruction(`!${name}`, `!${value}`)
    }

    // Set `startIndex` for next node
    this.startIndex = endIndex + 1
  }

  /** @internal */
  onprocessinginstruction(start: number, endIndex: number): void {
    this.endIndex = endIndex
    const value = this.getSlice(start, endIndex)

    if (this.cbs.onprocessinginstruction) {
      const name = this.getInstructionName(value)
      this.cbs.onprocessinginstruction(`?${name}`, `?${value}`)
    }

    // Set `startIndex` for next node
    this.startIndex = endIndex + 1
  }

  /** @internal */
  oncomment(start: number, endIndex: number, offset: number): void {
    this.endIndex = endIndex

    this.cbs.oncomment?.(this.getSlice(start, endIndex - offset))
    this.cbs.oncommentend?.()

    // Set `startIndex` for next node
    this.startIndex = endIndex + 1
  }

  /** @internal */
  oncdata(start: number, endIndex: number, offset: number): void {
    this.endIndex = endIndex

    this.cbs.oncdatastart?.()
    this.cbs.ontext?.(this.getSlice(start, endIndex - offset))
    this.cbs.oncdataend?.()

    // Set `startIndex` for next node
    this.startIndex = endIndex + 1
  }

  /** @internal */
  onend(): void {
    if (this.cbs.onclosetag) {
      // Set the end index for all remaining tags
      this.endIndex = this.startIndex
      for (let index = 0; index < this.stack.length; index++) {
        this.cbs.onclosetag(this.stack[index], true)
      }
    }
    this.cbs.onend?.()
  }

  /** Returns `buffer.slice(start, end)`. */
  private getSlice(start: number, end: number): string {
    return this.buffer.slice(start, end)
  }

  /**
   * Parses `input` as a new document and calls the corresponding callbacks.
   * The parser is reset first, so state from an earlier call does not carry over.
   *
   * @param input string to parse.
   */
  public parse(input: string): void {
    this.reset()
    this.buffer = input
    this.tokenizer.parse(input)
  }

  /**
   * Resets the parser to a blank state, ready to parse a new HTML document
   */
  public reset(): void {
    this.cbs.onreset?.()
    this.tokenizer.reset()
    this.tagname = ''
    this.attribname = ''
    // A previous document may have ended in the middle of an attribute value
    // (attribute data was reported but `onattribend` never fired). Drop that
    // partial value so it is not prepended to the next document's first attribute.
    this.attribvalue = ''
    this.attribs = null
    this.stack.length = 0
    this.openTagStart = 0
    this.startIndex = 0
    this.endIndex = 0
    this.cbs.onparserinit?.(this)
    this.foreignContext.length = 0
    this.foreignContext.unshift(false)
  }
}

View File

@ -56,7 +56,13 @@ export const enum CharCodes {
UpperZ = 0x5a, // "Z"
LowerZ = 0x7a, // "z"
LowerX = 0x78, // "x"
OpeningSquareBracket = 0x5b // "["
OpeningSquareBracket = 0x5b, // "["
LowerV = 0x76, // "v"
Dot = 0x2e, // "."
Colon = 0x3a, // ":"
At = 0x40, // "@"
LeftSqaure = 91, // "["
RightSquare = 93 // "]"
}
/** All the states the tokenizer can be in. */
@ -72,6 +78,10 @@ const enum State {
// Attributes
BeforeAttributeName,
InAttributeName,
InDirectiveName,
InDirectiveArg,
InDirectiveDynamicArg,
InDirectiveModifier,
AfterAttributeName,
BeforeAttributeValue,
InAttributeValueDq, // "
@ -134,6 +144,10 @@ export interface Callbacks {
onattribend(quote: QuoteType, endIndex: number): void
onattribname(start: number, endIndex: number): void
ondirname(start: number, endIndex: number): void
ondirarg(start: number, endIndex: number): void
ondirmodifier(start: number, endIndex: number): void
oncomment(start: number, endIndex: number, endOffset: number): void
oncdata(start: number, endIndex: number, endOffset: number): void
@ -461,6 +475,26 @@ export default class Tokenizer {
} else if (c === CharCodes.Slash) {
this.state = State.InSelfClosingTag
} else if (!isWhitespace(c)) {
this.enterAttribute(c)
}
}
private enterAttribute(c: number) {
if (
c === CharCodes.LowerV &&
this.buffer.charCodeAt(this.index + 1) === CharCodes.Dash
) {
this.state = State.InDirectiveName
this.sectionStart = this.index
} else if (
c === CharCodes.Dot ||
c === CharCodes.Colon ||
c === CharCodes.At ||
c === CharCodes.Number
) {
this.cbs.ondirname(this.index, this.index + 1)
this.state = State.InDirectiveArg
this.sectionStart = this.index + 1
} else {
this.state = State.InAttributeName
this.sectionStart = this.index
}
@ -484,6 +518,54 @@ export default class Tokenizer {
this.stateAfterAttributeName(c)
}
}
/**
 * Handles one character while scanning a directive name (the state entered
 * for attributes that start with `v-`).
 *
 * - `=` or an end-of-tag-section character ends the whole attribute name.
 * - `:` ends the name and starts a directive argument.
 * - `.` ends the name and starts a modifier.
 * - Anything else is part of the name and is skipped over.
 */
private stateInDirectiveName(c: number): void {
  const endsAttrName = c === CharCodes.Eq || isEndOfTagSection(c)
  if (!endsAttrName && c !== CharCodes.Colon && c !== CharCodes.Dot) {
    // Still inside the name.
    return
  }

  // Every terminator reports the name collected so far.
  this.cbs.ondirname(this.sectionStart, this.index)

  if (endsAttrName) {
    this.sectionStart = this.index
    this.state = State.AfterAttributeName
    this.stateAfterAttributeName(c)
    return
  }

  // The next section begins right after the `:` / `.` separator.
  this.state =
    c === CharCodes.Colon ? State.InDirectiveArg : State.InDirectiveModifier
  this.sectionStart = this.index + 1
}
/**
 * Handles one character while scanning a directive argument.
 *
 * - `=` or an end-of-tag-section character reports the argument and ends the
 *   attribute name.
 * - `[` switches to the dynamic-argument state; `sectionStart` is left
 *   untouched, so the reported argument range includes the brackets.
 * - `.` reports the argument and starts a modifier.
 */
private stateInDirectiveArg(c: number): void {
  if (c === CharCodes.Eq || isEndOfTagSection(c)) {
    this.cbs.ondirarg(this.sectionStart, this.index)
    this.sectionStart = this.index
    this.state = State.AfterAttributeName
    this.stateAfterAttributeName(c)
    return
  }

  switch (c) {
    case CharCodes.LeftSqaure:
      this.state = State.InDirectiveDynamicArg
      break
    case CharCodes.Dot:
      this.cbs.ondirarg(this.sectionStart, this.index)
      this.state = State.InDirectiveModifier
      // The modifier text starts after the dot.
      this.sectionStart = this.index + 1
      break
  }
}
/**
 * Handles one character while inside the `[...]` of a dynamic directive
 * argument.
 *
 * - `]` returns to the regular argument state, which reports the argument
 *   (brackets included) once it ends.
 * - `=` or an end-of-tag-section character means the closing `]` is missing.
 *   Instead of staying in this state and swallowing the rest of the tag (and
 *   document) until some later `]`, report what was collected and end the
 *   attribute name the same way the other directive states do.
 */
private stateInDynamicDirectiveArg(c: number): void {
  if (c === CharCodes.RightSquare) {
    this.state = State.InDirectiveArg
  } else if (c === CharCodes.Eq || isEndOfTagSection(c)) {
    // TODO emit error (dynamic argument is missing its closing `]`)
    this.cbs.ondirarg(this.sectionStart, this.index)
    this.sectionStart = this.index
    this.state = State.AfterAttributeName
    this.stateAfterAttributeName(c)
  }
}
/**
 * Handles one character while scanning a directive modifier.
 *
 * - `=` or an end-of-tag-section character reports the modifier and ends the
 *   attribute name.
 * - `.` reports the modifier and starts the next one.
 * - Anything else is part of the current modifier.
 */
private stateInDirectiveModifier(c: number): void {
  const endsAttrName = c === CharCodes.Eq || isEndOfTagSection(c)
  if (!endsAttrName && c !== CharCodes.Dot) {
    return
  }

  this.cbs.ondirmodifier(this.sectionStart, this.index)

  if (endsAttrName) {
    this.sectionStart = this.index
    this.state = State.AfterAttributeName
    this.stateAfterAttributeName(c)
  } else {
    // Chained modifier: its text starts after the dot.
    this.sectionStart = this.index + 1
  }
}
private stateAfterAttributeName(c: number): void {
if (c === CharCodes.Eq) {
this.state = State.BeforeAttributeValue
@ -494,8 +576,7 @@ export default class Tokenizer {
this.stateBeforeAttributeName(c)
} else if (!isWhitespace(c)) {
this.cbs.onattribend(QuoteType.NoValue, this.sectionStart)
this.state = State.InAttributeName
this.sectionStart = this.index
this.enterAttribute(c)
}
}
private stateBeforeAttributeValue(c: number): void {
@ -655,6 +736,22 @@ export default class Tokenizer {
this.stateInAttributeName(c)
break
}
case State.InDirectiveName: {
this.stateInDirectiveName(c)
break
}
case State.InDirectiveArg: {
this.stateInDirectiveArg(c)
break
}
case State.InDirectiveDynamicArg: {
this.stateInDynamicDirectiveArg(c)
break
}
case State.InDirectiveModifier: {
this.stateInDirectiveModifier(c)
break
}
case State.InCommentLike: {
this.stateInCommentLike(c)
break
@ -796,6 +893,10 @@ export default class Tokenizer {
this.state === State.BeforeAttributeValue ||
this.state === State.AfterAttributeName ||
this.state === State.InAttributeName ||
this.state === State.InDirectiveName ||
this.state === State.InDirectiveArg ||
this.state === State.InDirectiveDynamicArg ||
this.state === State.InDirectiveModifier ||
this.state === State.InAttributeValueSq ||
this.state === State.InAttributeValueDq ||
this.state === State.InAttributeValueNq ||

View File

@ -142,11 +142,6 @@ const tokenizer = new Tokenizer(
onattribname(start, end) {
const name = getSlice(start, end)
if (currentAttrs.has(name)) {
// TODO emit error DUPLICATE_ATTRIBUTE
} else {
currentAttrs.add(name)
}
if (!inVPre && isDirective(name)) {
// directive
const match = directiveParseRE.exec(name)!
@ -259,42 +254,59 @@ const tokenizer = new Tokenizer(
currentAttrValue += fromCodePoint(codepoint)
},
onattribend(_quote, end) {
if (currentElement) {
if (currentAttrValue) {
if (currentProp!.type === NodeTypes.ATTRIBUTE) {
// assign value
currentProp!.value = {
type: NodeTypes.TEXT,
content: currentAttrValue,
// @ts-expect-error TODO
loc: {}
}
} else {
// directive
currentProp!.exp = {
type: NodeTypes.SIMPLE_EXPRESSION,
content: currentAttrValue,
isStatic: false,
// Treat as non-constant by default. This can be potentially set to
// other values by `transformExpression` to make it eligible for hoisting.
constType: ConstantTypes.NOT_CONSTANT,
// @ts-expect-error TODO
loc: {}
}
}
}
currentProp!.loc.end = tokenizer.getPositionForIndex(end)
currentElement.props.push(currentProp!)
}
// TODO check duplicate
// if (currentAttrs.has(name)) {
// // emit error DUPLICATE_ATTRIBUTE
// } else {
// currentAttrs.add(name)
// }
// if (currentElement) {
// if (currentAttrValue) {
// if (currentProp!.type === NodeTypes.ATTRIBUTE) {
// // assign value
// currentProp!.value = {
// type: NodeTypes.TEXT,
// content: currentAttrValue,
// // @ts-expect-error TODO
// loc: {}
// }
// } else {
// // directive
// currentProp!.exp = {
// type: NodeTypes.SIMPLE_EXPRESSION,
// content: currentAttrValue,
// isStatic: false,
// // Treat as non-constant by default. This can be potentially set to
// // other values by `transformExpression` to make it eligible for hoisting.
// constType: ConstantTypes.NOT_CONSTANT,
// // @ts-expect-error TODO
// loc: {}
// }
// }
// }
// currentProp!.loc.end = tokenizer.getPositionForIndex(end)
// currentElement.props.push(currentProp!)
// }
currentAttrValue = ''
},
// Tokenizer callback: receives the [start, end) source range of a directive
// name (for shorthands, the single shorthand character). Still a stub in this
// work-in-progress state; nothing is recorded yet.
ondirname(start, end) {
// console.log('name ' + getSlice(start, end))
// NOTE(review): bare reference with no runtime effect. Presumably kept so
// `currentProp` still counts as "read" while the body of `onattribend` is
// commented out; confirm before removing.
currentProp
},
// Tokenizer callback: receives the [start, end) source range of a directive
// argument. Stub; the range is currently ignored.
ondirarg(start, end) {
// console.log('arg ' + getSlice(start, end))
},
// Tokenizer callback: receives the [start, end) source range of one directive
// modifier (the text after a `.`). Stub; the range is currently ignored.
ondirmodifier(start, end) {
// console.log('.' + getSlice(start, end))
},
// Tokenizer callback for a comment; the third argument is what the
// tokenizer's `Callbacks` interface names `endOffset`. Not implemented yet.
oncomment(start, end, offset) {
// TODO oncomment
},
onend() {
const end = currentInput.length
const end = currentInput.length - 1
for (let index = 0; index < stack.length; index++) {
onCloseTag(stack[index], end)
}