wip: port parser

This commit is contained in:
Evan You 2023-11-13 21:03:39 +08:00
parent 2a6292e37f
commit 19bd714239
8 changed files with 497 additions and 116 deletions

View File

@ -1,5 +1,5 @@
import { ParserOptions } from '../src/options' import { ParserOptions } from '../src/options'
import { baseParse, TextModes } from '../src/parse' import { TextModes } from '../src/parse'
import { ErrorCodes } from '../src/errors' import { ErrorCodes } from '../src/errors'
import { import {
CommentNode, CommentNode,
@ -14,6 +14,8 @@ import {
DirectiveNode DirectiveNode
} from '../src/ast' } from '../src/ast'
import { baseParse } from '../src/parser/index'
describe('compiler: parse', () => { describe('compiler: parse', () => {
describe('Text', () => { describe('Text', () => {
test('simple text', () => { test('simple text', () => {

View File

@ -128,9 +128,9 @@ export interface BaseElementNode extends Node {
ns: Namespace ns: Namespace
tag: string tag: string
tagType: ElementTypes tagType: ElementTypes
isSelfClosing: boolean
props: Array<AttributeNode | DirectiveNode> props: Array<AttributeNode | DirectiveNode>
children: TemplateChildNode[] children: TemplateChildNode[]
isSelfClosing?: boolean
} }
export interface PlainElementNode extends BaseElementNode { export interface PlainElementNode extends BaseElementNode {

View File

@ -70,3 +70,5 @@ export {
warnDeprecation, warnDeprecation,
CompilerDeprecationTypes CompilerDeprecationTypes
} from './compat/compatConfig' } from './compat/compatConfig'
export { baseParse as newParse } from './parser/index'

View File

@ -17,6 +17,10 @@ export interface ErrorHandlingOptions {
export interface ParserOptions export interface ParserOptions
extends ErrorHandlingOptions, extends ErrorHandlingOptions,
CompilerCompatOptions { CompilerCompatOptions {
/**
* Parse as HTML. Default: false
*/
htmlMode?: boolean
/** /**
* e.g. platform native elements, e.g. `<div>` for browsers * e.g. platform native elements, e.g. `<div>` for browsers
*/ */

View File

@ -40,6 +40,7 @@ import {
} from './compat/compatConfig' } from './compat/compatConfig'
type OptionalOptions = type OptionalOptions =
| 'htmlMode'
| 'whitespace' | 'whitespace'
| 'isNativeTag' | 'isNativeTag'
| 'isBuiltInComponent' | 'isBuiltInComponent'

View File

@ -108,22 +108,6 @@ export interface ParserOptions {
* @default true * @default true
*/ */
decodeEntities?: boolean decodeEntities?: boolean
/**
* If set to true, CDATA sections will be recognized as text even if the xmlMode option is not enabled.
* NOTE: If xmlMode is set to `true` then CDATA sections will always be recognized as text.
*
* @default false
*/
recognizeCDATA?: boolean
/**
* If set to `true`, self-closing tags will trigger the onclosetag event even if xmlMode is not set to `true`.
* NOTE: If xmlMode is set to `true` then self-closing tags will always be recognized.
*
* @default false
*/
recognizeSelfClosing?: boolean
} }
export interface Handler { export interface Handler {
@ -186,7 +170,6 @@ export class Parser implements Callbacks {
/** Determines whether self-closing tags are recognized. */ /** Determines whether self-closing tags are recognized. */
private readonly foreignContext: boolean[] private readonly foreignContext: boolean[]
private readonly cbs: Partial<Handler> private readonly cbs: Partial<Handler>
private readonly recognizeSelfClosing: boolean
private readonly tokenizer: Tokenizer private readonly tokenizer: Tokenizer
private buffer: string = '' private buffer: string = ''
@ -196,7 +179,6 @@ export class Parser implements Callbacks {
private readonly options: ParserOptions = {} private readonly options: ParserOptions = {}
) { ) {
this.cbs = cbs ?? {} this.cbs = cbs ?? {}
this.recognizeSelfClosing = options.recognizeSelfClosing ?? false
this.tokenizer = new Tokenizer(this.options, this) this.tokenizer = new Tokenizer(this.options, this)
this.foreignContext = [false] this.foreignContext = [false]
this.cbs.onparserinit?.(this) this.cbs.onparserinit?.(this)
@ -307,15 +289,9 @@ export class Parser implements Callbacks {
/** @internal */ /** @internal */
onselfclosingtag(endIndex: number): void { onselfclosingtag(endIndex: number): void {
this.endIndex = endIndex this.endIndex = endIndex
if (this.recognizeSelfClosing || this.foreignContext[0]) {
this.closeCurrentTag(false) this.closeCurrentTag(false)
// Set `startIndex` for next node // Set `startIndex` for next node
this.startIndex = endIndex + 1 this.startIndex = endIndex + 1
} else {
// Ignore the fact that the tag is self-closing.
this.onopentagend(endIndex)
}
} }
private closeCurrentTag(isOpenImplied: boolean) { private closeCurrentTag(isOpenImplied: boolean) {
@ -417,17 +393,9 @@ export class Parser implements Callbacks {
/** @internal */ /** @internal */
oncdata(start: number, endIndex: number, offset: number): void { oncdata(start: number, endIndex: number, offset: number): void {
this.endIndex = endIndex this.endIndex = endIndex
const value = this.getSlice(start, endIndex - offset)
if (this.options.recognizeCDATA) {
this.cbs.oncdatastart?.() this.cbs.oncdatastart?.()
this.cbs.ontext?.(value) this.cbs.ontext?.(this.getSlice(start, endIndex - offset))
this.cbs.oncdataend?.() this.cbs.oncdataend?.()
} else {
this.cbs.oncomment?.(`[CDATA[${value}]]`)
this.cbs.oncommentend?.()
}
// Set `startIndex` for next node // Set `startIndex` for next node
this.startIndex = endIndex + 1 this.startIndex = endIndex + 1
} }
@ -456,8 +424,7 @@ export class Parser implements Callbacks {
public parse(input: string): void { public parse(input: string): void {
this.reset() this.reset()
this.buffer = input this.buffer = input
this.tokenizer.write(input) this.tokenizer.parse(input)
this.tokenizer.end()
} }
/** /**

View File

@ -1,3 +1,27 @@
/**
* This Tokenizer is adapted from htmlparser2 under the MIT License listed at
* https://github.com/fb55/htmlparser2/blob/master/LICENSE
Copyright 2010, 2011, Chris Winberry <chris@winberry.net>. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
deal in the Software without restriction, including without limitation the
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
IN THE SOFTWARE.
*/
import { import {
EntityDecoder, EntityDecoder,
DecodingMode, DecodingMode,
@ -143,10 +167,6 @@ export default class Tokenizer {
private baseState = State.Text private baseState = State.Text
/** For special parsing behavior inside of script and style tags. */ /** For special parsing behavior inside of script and style tags. */
private isSpecial = false private isSpecial = false
/** Indicates whether the tokenizer has been paused. */
public running = true
/** The offset of the current buffer. */
private offset = 0
private readonly decodeEntities: boolean private readonly decodeEntities: boolean
private readonly entityDecoder: EntityDecoder private readonly entityDecoder: EntityDecoder
@ -168,29 +188,6 @@ export default class Tokenizer {
this.index = 0 this.index = 0
this.baseState = State.Text this.baseState = State.Text
this.currentSequence = undefined! this.currentSequence = undefined!
this.running = true
this.offset = 0
}
public write(chunk: string): void {
this.offset += this.buffer.length
this.buffer = chunk
this.parse()
}
public end(): void {
if (this.running) this.finish()
}
public pause(): void {
this.running = false
}
public resume(): void {
this.running = true
if (this.index < this.buffer.length + this.offset) {
this.parse()
}
} }
private stateText(c: number): void { private stateText(c: number): void {
@ -293,8 +290,8 @@ export default class Tokenizer {
* @returns Whether the character was found. * @returns Whether the character was found.
*/ */
private fastForwardTo(c: number): boolean { private fastForwardTo(c: number): boolean {
while (++this.index < this.buffer.length + this.offset) { while (++this.index < this.buffer.length) {
if (this.buffer.charCodeAt(this.index - this.offset) === c) { if (this.buffer.charCodeAt(this.index) === c) {
return true return true
} }
} }
@ -305,7 +302,7 @@ export default class Tokenizer {
* *
* TODO: Refactor `parse` to increment index before calling states. * TODO: Refactor `parse` to increment index before calling states.
*/ */
this.index = this.buffer.length + this.offset - 1 this.index = this.buffer.length - 1
return false return false
} }
@ -577,10 +574,7 @@ export default class Tokenizer {
} }
private stateInEntity(): void { private stateInEntity(): void {
const length = this.entityDecoder.write( const length = this.entityDecoder.write(this.buffer, this.index)
this.buffer,
this.index - this.offset
)
// If `length` is positive, we are done with the entity. // If `length` is positive, we are done with the entity.
if (length >= 0) { if (length >= 0) {
@ -591,45 +585,19 @@ export default class Tokenizer {
} }
} else { } else {
// Mark buffer as consumed. // Mark buffer as consumed.
this.index = this.offset + this.buffer.length - 1 this.index = this.buffer.length - 1
} }
} }
/**
* Remove data that has already been consumed from the buffer.
*/
private cleanup() {
// If we are inside of text or attributes, emit what we already have.
if (this.running && this.sectionStart !== this.index) {
if (
this.state === State.Text ||
(this.state === State.InSpecialTag && this.sequenceIndex === 0)
) {
this.cbs.ontext(this.sectionStart, this.index)
this.sectionStart = this.index
} else if (
this.state === State.InAttributeValueDq ||
this.state === State.InAttributeValueSq ||
this.state === State.InAttributeValueNq
) {
this.cbs.onattribdata(this.sectionStart, this.index)
this.sectionStart = this.index
}
}
}
private shouldContinue() {
return this.index < this.buffer.length + this.offset && this.running
}
/** /**
* Iterates through the buffer, calling the function corresponding to the current state. * Iterates through the buffer, calling the function corresponding to the current state.
* *
* States that are more likely to be hit are higher up, as a performance improvement. * States that are more likely to be hit are higher up, as a performance improvement.
*/ */
private parse() { public parse(input: string) {
while (this.shouldContinue()) { this.buffer = input
const c = this.buffer.charCodeAt(this.index - this.offset) while (this.index < this.buffer.length) {
const c = this.buffer.charCodeAt(this.index)
switch (this.state) { switch (this.state) {
case State.Text: { case State.Text: {
this.stateText(c) this.stateText(c)
@ -735,6 +703,30 @@ export default class Tokenizer {
this.index++ this.index++
} }
this.cleanup() this.cleanup()
this.finish()
}
/**
* Remove data that has already been consumed from the buffer.
*/
private cleanup() {
// If we are inside of text or attributes, emit what we already have.
if (this.sectionStart !== this.index) {
if (
this.state === State.Text ||
(this.state === State.InSpecialTag && this.sequenceIndex === 0)
) {
this.cbs.ontext(this.sectionStart, this.index)
this.sectionStart = this.index
} else if (
this.state === State.InAttributeValueDq ||
this.state === State.InAttributeValueSq ||
this.state === State.InAttributeValueNq
) {
this.cbs.onattribdata(this.sectionStart, this.index)
this.sectionStart = this.index
}
}
} }
private finish() { private finish() {
@ -750,7 +742,7 @@ export default class Tokenizer {
/** Handle any trailing data. */ /** Handle any trailing data. */
private handleTrailingData() { private handleTrailingData() {
const endIndex = this.buffer.length + this.offset const endIndex = this.buffer.length
// If there is no remaining data, we are done. // If there is no remaining data, we are done.
if (this.sectionStart >= endIndex) { if (this.sectionStart >= endIndex) {

View File

@ -1,16 +1,429 @@
import { RootNode, createRoot } from '../ast' import { fromCodePoint } from 'entities/lib/decode.js'
import {
ElementNode,
ElementTypes,
NodeTypes,
RootNode,
TemplateChildNode,
createRoot
} from '../ast'
import { ParserOptions } from '../options' import { ParserOptions } from '../options'
import { Parser } from './Parser' import Tokenizer from './Tokenizer'
import { hasOwn } from '@vue/shared'
const parser = new Parser({ const formTags = new Set([
'input',
'option',
'optgroup',
'select',
'button',
'datalist',
'textarea'
])
// Shared value sets referenced by several `openImpliesClose` entries below.
const pTag = new Set(['p'])
const tableSectionTags = new Set(['thead', 'tbody'])
const ddtTags = new Set(['dd', 'dt'])
const rtpTags = new Set(['rt', 'rp'])
/**
 * Implicit-close table, consulted by `emitOpenTag` only when `htmlMode` is on.
 * Key: the tag that is being opened.
 * Value: tag names that get closed automatically, one by one, for as long as
 * they sit on top of the open-tag `stack` at the moment the key tag opens.
 */
const openImpliesClose = new Map<string, Set<string>>([
['tr', new Set(['tr', 'th', 'td'])],
['th', new Set(['th'])],
['td', new Set(['thead', 'th', 'td'])],
['body', new Set(['head', 'link', 'script'])],
['li', new Set(['li'])],
['p', pTag],
['h1', pTag],
['h2', pTag],
['h3', pTag],
['h4', pTag],
['h5', pTag],
['h6', pTag],
['select', formTags],
['input', formTags],
['output', formTags],
['button', formTags],
['datalist', formTags],
['textarea', formTags],
['option', new Set(['option'])],
['optgroup', new Set(['optgroup', 'option'])],
['dd', ddtTags],
['dt', ddtTags],
['address', pTag],
['article', pTag],
['aside', pTag],
['blockquote', pTag],
['details', pTag],
['div', pTag],
['dl', pTag],
['fieldset', pTag],
['figcaption', pTag],
['figure', pTag],
['footer', pTag],
['form', pTag],
['header', pTag],
['hr', pTag],
['main', pTag],
['nav', pTag],
['ol', pTag],
['pre', pTag],
['section', pTag],
['table', pTag],
['ul', pTag],
['rt', rtpTags],
['rp', rtpTags],
['tbody', tableSectionTags],
['tfoot', tableSectionTags]
])
/**
 * Tag names treated as void (content-less) elements.
 * - `emitOpenTag` never pushes them onto the open-tag `stack`.
 * - `endOpenTag` closes them immediately after opening.
 * - The `onclosetag` callback does not pop the stack for an explicit close tag
 *   with one of these names.
 * NOTE(review): these checks are applied regardless of `htmlMode` - confirm
 * that is intended for template (non-HTML) parsing.
 */
const voidElements = new Set([
'area',
'base',
'basefont',
'br',
'col',
'command',
'embed',
'frame',
'hr',
'img',
'input',
'isindex',
'keygen',
'link',
'meta',
'param',
'source',
'track',
'wbr'
])
// In `htmlMode`, opening one of these tags pushes `true` onto `foreignContext`
// (see `emitOpenTag`); the matching close tag pops one entry again.
const foreignContextElements = new Set(['math', 'svg'])
// In `htmlMode`, opening one of these tags pushes `false` onto
// `foreignContext` (see `emitOpenTag`); the matching close tag pops one entry.
// Names are compared exactly as written here (note the all-lowercase
// 'foreignobject').
const htmlIntegrationElements = new Set([
'mi',
'mo',
'mn',
'ms',
'mtext',
'annotation-xml',
'foreignobject',
'desc',
'title'
])
// Module-level state shared by `baseParse` and the tokenizer callbacks.
// Because it lives at module scope there is a single parser instance per
// module; `baseParse` re-initializes it on every call.

// Options of the parse currently in progress (assigned in `baseParse`).
let currentOptions: ParserOptions = {}
// Root node of the parse currently in progress; fallback parent in `getParent`.
let currentRoot: RootNode = createRoot([])
// Element nodes that have been opened but not closed; innermost element LAST
// (push/pop).
let elementStack: ElementNode[] = []
// parser state
// Mirrors `options.htmlMode`; gates implied-close and foreign-context handling.
let htmlMode = false
// The (trimmed) source string being parsed; `getSlice` reads from it.
let currentInput = ''
// Value of `startIndex` at the time the current open tag was emitted.
let openTagStart = 0
// Name of the open tag currently being processed ('' when none).
let tagname = ''
// Name of the attribute currently being processed.
let attribname = ''
// Accumulated value (raw data + decoded entities) of the current attribute.
let attribvalue = ''
// Attributes collected for the current open tag; non-null only between
// `emitOpenTag` and `endOpenTag`.
let attribs: Record<string, string> | null = null
// Start / end offsets of the node currently being processed.
let startIndex = 0
let endIndex = 0
// Read by `onCloseTag` to switch to <pre> whitespace handling; nothing in the
// visible code assigns it yet.
let inPre = 0
// let inVPre = 0
// Names of currently open tags; innermost tag FIRST (unshift/shift).
const stack: string[] = []
// Foreign-context flags; most recent entry FIRST. Index 0 starts as `false`.
const foreignContext: boolean[] = [false]
const tokenizer = new Tokenizer(
// TODO handle entities
{ decodeEntities: true },
{
ontext(start, end) {
const content = getSlice(start, end)
endIndex = end - 1
onText(content)
startIndex = end
},
ontextentity(cp, end) {
endIndex = end - 1
onText(fromCodePoint(cp))
startIndex = end
},
onopentagname(start, end) {
emitOpenTag(getSlice(start, (endIndex = end)))
},
onopentagend(end) {
endIndex = end
endOpenTag(false)
startIndex = end + 1
},
onclosetag(start, end) {
endIndex = end
const name = getSlice(start, end)
if (
htmlMode &&
(foreignContextElements.has(name) || htmlIntegrationElements.has(name))
) {
foreignContext.shift()
}
if (!voidElements.has(name)) {
const pos = stack.indexOf(name)
if (pos !== -1) {
for (let index = 0; index <= pos; index++) {
stack.shift()
onCloseTag()
}
} else if (htmlMode && name === 'p') {
// Implicit open before close
emitOpenTag('p')
closeCurrentTag(true)
}
} else if (htmlMode && name === 'br') {
// TODO // TODO
}) // We can't use `emitOpenTag` for implicit open, as `br` would be implicitly closed.
// this.cbs.onopentag?.('br', {}, true)
// this.cbs.onclosetag?.('br', false)
}
// Set `startIndex` for next node
startIndex = end + 1
},
onselfclosingtag(end) {
endIndex = end
closeCurrentTag(false)
startIndex = end + 1
},
onattribname(start, end) {
attribname = getSlice((startIndex = start), end)
},
onattribdata(start, end) {
attribvalue += getSlice(start, end)
},
onattribentity(codepoint) {
attribvalue += fromCodePoint(codepoint)
},
onattribend(quote, end) {
endIndex = end
if (attribs && !hasOwn(attribs, attribname)) {
// TODO gen attributes AST nodes
attribs[attribname] = attribvalue
}
attribvalue = ''
},
oncomment(start, end, offset) {
endIndex = end
// TODO oncomment
startIndex = end + 1
},
onend() {
// Set the end index for all remaining tags
endIndex = startIndex
for (let index = 0; index < stack.length; index++) {
onCloseTag()
}
},
oncdata(start, end, offset) {
endIndex = end
// TODO throw error
startIndex = end + 1
},
// TODO ignore
ondeclaration(start, end) {
endIndex = end
// TODO onprocessinginstruction
startIndex = end + 1
},
// TODO ignore
onprocessinginstruction(start, end) {
endIndex = end
// TODO onprocessinginstruction
startIndex = end + 1
}
}
)
/**
 * Extracts a piece of the source string currently being parsed.
 *
 * @param start - offset of the first character to include
 * @param end - offset one past the last character to include
 * @returns the text of `currentInput` in the range `[start, end)`
 */
function getSlice(start: number, end: number) {
  const source = currentInput
  return source.slice(start, end)
}
/**
 * Begins processing of an open tag.
 *
 * Records the tag name and where it started, applies HTML implied-close rules
 * (HTML mode only), registers the tag on the open-tag stack unless it is a
 * void element, and starts a fresh attribute map for it.
 *
 * @param name - the tag name as it appears in the source
 */
function emitOpenTag(name: string) {
  tagname = name
  openTagStart = startIndex

  // HTML mode: opening this tag may implicitly close whatever compatible tags
  // are currently on top of the open-tag stack (e.g. <p> before a <div>).
  const autoClosed = htmlMode ? openImpliesClose.get(name) : undefined
  if (autoClosed) {
    while (stack.length > 0 && autoClosed.has(stack[0])) {
      stack.shift()
      onCloseTag()
    }
  }

  // Void elements can never have children, so they are not tracked as open.
  const isVoid = voidElements.has(name)
  if (!isVoid) {
    stack.unshift(name)
    if (htmlMode) {
      // Entering <svg>/<math> switches to foreign content (true); entering an
      // HTML integration point switches back (false).
      const entersForeign = foreignContextElements.has(name)
      if (entersForeign || htmlIntegrationElements.has(name)) {
        foreignContext.unshift(entersForeign)
      }
    }
  }

  attribs = {}
}
/**
 * Completes the open tag currently being processed and, if that tag is the
 * innermost entry of the open-tag stack, closes it right away. Used for
 * self-closing tags and for implied open/close pairs.
 *
 * @param isOpenImplied - forwarded to `endOpenTag`
 */
function closeCurrentTag(isOpenImplied: boolean) {
  // `endOpenTag` clears `tagname`, so remember it before delegating.
  const closingName = tagname
  endOpenTag(isOpenImplied)

  if (stack[0] !== closingName) {
    return
  }
  onCloseTag()
  stack.shift()
}
/**
 * Finishes the open tag currently being processed.
 *
 * Rewinds `startIndex` to where the tag started, emits the element if its
 * attribute map is still pending, immediately closes void elements, and
 * finally clears the current tag name.
 *
 * @param isImplied - currently unused by this function
 */
function endOpenTag(isImplied: boolean) {
  const name = tagname
  startIndex = openTagStart

  // A non-null attribute map means the element has not been emitted yet.
  if (attribs) {
    onOpenTag(name)
    attribs = null
  }

  // Void elements have no close tag in the source; close them here.
  if (voidElements.has(name)) {
    onCloseTag()
  }

  tagname = ''
}
/**
 * Adds text to the current parent node. If the parent's last child is already
 * a text node the new content is appended to it; otherwise a new text node is
 * created.
 *
 * @param content - the text to add
 */
function onText(content: string) {
  const siblings = getParent().children
  const tail = siblings.length > 0 ? siblings[siblings.length - 1] : undefined

  if (tail && tail.type === NodeTypes.TEXT) {
    // merge with the preceding text node
    tail.content += content
    // TODO update loc
    return
  }

  siblings.push({
    type: NodeTypes.TEXT,
    content,
    // @ts-ignore TODO
    loc: {}
  })
}
/**
 * Creates the element node for an opened tag, attaches it to the current
 * parent and makes it the new innermost open element.
 *
 * @param tag - the tag name of the element
 */
function onOpenTag(tag: string) {
  const element: ElementNode = {
    type: NodeTypes.ELEMENT,
    tag,
    // TODO namespace
    ns: 0,
    // TODO refine tag type
    tagType: ElementTypes.ELEMENT,
    // TODO props
    props: [],
    children: [],
    // @ts-ignore TODO
    loc: {},
    codegenNode: undefined
  }
  // Attach to the parent first: once the element is on the stack it would
  // otherwise be looked up as its own parent.
  getParent().children.push(element)
  elementStack.push(element)
}
/**
 * Closes the innermost open element: pops it off `elementStack` and
 * normalizes the whitespace of its direct text children.
 *
 * Outside of `<pre>` (`inPre` is falsy):
 * - a whitespace-only text node is removed when it is the first or last
 *   child, or - unless `whitespace: 'preserve'` is set - when it sits between
 *   comments/elements as detailed in the loop below; any other
 *   whitespace-only node is collapsed to a single space;
 * - in any other text node, runs of whitespace are condensed to one space
 *   unless `whitespace: 'preserve'` is set.
 *
 * Inside `<pre>`, only Windows newlines are normalized.
 *
 * Does nothing when no element is open. That happens when the input ends
 * inside an open tag (e.g. `<div a`): the tag name is already on the open-tag
 * `stack`, but its element node was never created, so there is nothing to pop.
 */
function onCloseTag() {
  const el = elementStack.pop()
  if (!el) {
    // Close requested for a tag whose element was never emitted.
    return
  }
  // whitespace management
  const nodes = el.children
  const shouldCondense = currentOptions.whitespace !== 'preserve'
  let removedWhitespace = false
  for (let i = 0; i < nodes.length; i++) {
    const node = nodes[i]
    if (node.type === NodeTypes.TEXT) {
      if (!inPre) {
        if (!/[^\t\r\n\f ]/.test(node.content)) {
          const prev = nodes[i - 1]
          const next = nodes[i + 1]
          // Remove if:
          // - the whitespace is the first or last node, or:
          // - (condense mode) the whitespace is between two comments, or:
          // - (condense mode) the whitespace is between comment and element, or:
          // - (condense mode) the whitespace is between two elements AND contains newline
          if (
            !prev ||
            !next ||
            (shouldCondense &&
              ((prev.type === NodeTypes.COMMENT &&
                next.type === NodeTypes.COMMENT) ||
                (prev.type === NodeTypes.COMMENT &&
                  next.type === NodeTypes.ELEMENT) ||
                (prev.type === NodeTypes.ELEMENT &&
                  next.type === NodeTypes.COMMENT) ||
                (prev.type === NodeTypes.ELEMENT &&
                  next.type === NodeTypes.ELEMENT &&
                  /[\r\n]/.test(node.content))))
          ) {
            // Mark for removal; the slot is filtered out after the loop so
            // that indices stay stable while iterating.
            removedWhitespace = true
            nodes[i] = null as any
          } else {
            // Otherwise, the whitespace is condensed into a single space
            node.content = ' '
          }
        } else if (shouldCondense) {
          // in condense mode, consecutive whitespaces in text are condensed
          // down to a single space.
          node.content = node.content.replace(/[\t\r\n\f ]+/g, ' ')
        }
      } else {
        // #6410 normalize windows newlines in <pre>:
        // in SSR, browsers normalize server-rendered \r\n into a single \n
        // in the DOM
        node.content = node.content.replace(/\r\n/g, '\n')
      }
    }
  }
  if (removedWhitespace) {
    el.children = nodes.filter(Boolean)
  }
}
/**
 * Appends `node` as the last child of the current parent (the innermost open
 * element, or the root when no element is open).
 */
function addNode(node: TemplateChildNode) {
  const { children } = getParent()
  children.push(node)
}
/**
 * Returns the node new children should be attached to: the innermost open
 * element, or the root node when no element is currently open.
 */
function getParent() {
  const innermost = elementStack[elementStack.length - 1]
  return innermost || currentRoot
}
/**
 * Restores the tokenizer and the module-level parser state to their initial
 * values so that a new parse can start. The stack arrays are emptied in place
 * rather than replaced.
 */
function reset() {
  tokenizer.reset()

  // scalar state
  tagname = attribname = attribvalue = ''
  attribs = null
  startIndex = endIndex = 0

  // stacks: truncate in place; the foreign-context stack restarts as [false]
  stack.length = 0
  elementStack.length = 0
  foreignContext.splice(0, foreignContext.length, false)
}
/**
 * Parses a template string into an AST.
 *
 * Resets the shared module-level parser state, stores the options for the
 * tokenizer callbacks, runs the tokenizer over the trimmed input and returns
 * the root node the callbacks populated.
 *
 * NOTE(review): the input is trimmed before tokenizing, so offsets reported by
 * the tokenizer are relative to the trimmed string, not to `input` - confirm
 * this is acceptable once source locations are implemented.
 *
 * @param input - the template source
 * @param options - parser options; `htmlMode` enables HTML-specific handling
 * @returns the root node of the parsed template
 */
export function baseParse(
  input: string,
  options: ParserOptions = {}
): RootNode {
  reset()
  currentInput = input.trim()
  currentOptions = options
  htmlMode = !!options.htmlMode
  const root = (currentRoot = createRoot([]))
  tokenizer.parse(currentInput)
  // `endIndex` is written by the tokenizer callbacks but not consumed yet.
  // Reference it without side effects to keep the compiler's unused-variable
  // check quiet; logging it would write to stdout on every parse.
  void endIndex
  return root
}