diff --git a/packages/compiler-core/__tests__/parse.spec.ts b/packages/compiler-core/__tests__/parse.spec.ts index 8db4eeb8a..acf33e1c9 100644 --- a/packages/compiler-core/__tests__/parse.spec.ts +++ b/packages/compiler-core/__tests__/parse.spec.ts @@ -1,5 +1,5 @@ import { ParserOptions } from '../src/options' -import { baseParse, TextModes } from '../src/parse' +import { TextModes } from '../src/parse' import { ErrorCodes } from '../src/errors' import { CommentNode, @@ -14,6 +14,8 @@ import { DirectiveNode } from '../src/ast' +import { baseParse } from '../src/parser/index' + describe('compiler: parse', () => { describe('Text', () => { test('simple text', () => { diff --git a/packages/compiler-core/src/ast.ts b/packages/compiler-core/src/ast.ts index 515083c33..131d80aaa 100644 --- a/packages/compiler-core/src/ast.ts +++ b/packages/compiler-core/src/ast.ts @@ -128,9 +128,9 @@ export interface BaseElementNode extends Node { ns: Namespace tag: string tagType: ElementTypes - isSelfClosing: boolean props: Array children: TemplateChildNode[] + isSelfClosing?: boolean } export interface PlainElementNode extends BaseElementNode { diff --git a/packages/compiler-core/src/index.ts b/packages/compiler-core/src/index.ts index 588bb92cc..c88844a5c 100644 --- a/packages/compiler-core/src/index.ts +++ b/packages/compiler-core/src/index.ts @@ -70,3 +70,5 @@ export { warnDeprecation, CompilerDeprecationTypes } from './compat/compatConfig' + +export { baseParse as newParse } from './parser/index' diff --git a/packages/compiler-core/src/options.ts b/packages/compiler-core/src/options.ts index abfba98e3..8566fa7ba 100644 --- a/packages/compiler-core/src/options.ts +++ b/packages/compiler-core/src/options.ts @@ -17,6 +17,10 @@ export interface ErrorHandlingOptions { export interface ParserOptions extends ErrorHandlingOptions, CompilerCompatOptions { + /** + * Parse as HTML. Default: false + */ + htmlMode?: boolean /** * e.g. platform native elements, e.g. `
` for browsers */ diff --git a/packages/compiler-core/src/parse.ts b/packages/compiler-core/src/parse.ts index c237239db..69f97b5c9 100644 --- a/packages/compiler-core/src/parse.ts +++ b/packages/compiler-core/src/parse.ts @@ -40,6 +40,7 @@ import { } from './compat/compatConfig' type OptionalOptions = + | 'htmlMode' | 'whitespace' | 'isNativeTag' | 'isBuiltInComponent' diff --git a/packages/compiler-core/src/parser/Parser.ts b/packages/compiler-core/src/parser/Parser.ts index d10ab08e0..be6ff0b41 100644 --- a/packages/compiler-core/src/parser/Parser.ts +++ b/packages/compiler-core/src/parser/Parser.ts @@ -108,22 +108,6 @@ export interface ParserOptions { * @default true */ decodeEntities?: boolean - - /** - * If set to true, CDATA sections will be recognized as text even if the xmlMode option is not enabled. - * NOTE: If xmlMode is set to `true` then CDATA sections will always be recognized as text. - * - * @default false - */ - recognizeCDATA?: boolean - - /** - * If set to `true`, self-closing tags will trigger the onclosetag event even if xmlMode is not set to `true`. - * NOTE: If xmlMode is set to `true` then self-closing tags will always be recognized. - * - * @default false - */ - recognizeSelfClosing?: boolean } export interface Handler { @@ -186,7 +170,6 @@ export class Parser implements Callbacks { /** Determines whether self-closing tags are recognized. */ private readonly foreignContext: boolean[] private readonly cbs: Partial - private readonly recognizeSelfClosing: boolean private readonly tokenizer: Tokenizer private buffer: string = '' @@ -196,7 +179,6 @@ export class Parser implements Callbacks { private readonly options: ParserOptions = {} ) { this.cbs = cbs ?? {} - this.recognizeSelfClosing = options.recognizeSelfClosing ?? 
false this.tokenizer = new Tokenizer(this.options, this) this.foreignContext = [false] this.cbs.onparserinit?.(this) @@ -307,15 +289,9 @@ export class Parser implements Callbacks { /** @internal */ onselfclosingtag(endIndex: number): void { this.endIndex = endIndex - if (this.recognizeSelfClosing || this.foreignContext[0]) { - this.closeCurrentTag(false) - - // Set `startIndex` for next node - this.startIndex = endIndex + 1 - } else { - // Ignore the fact that the tag is self-closing. - this.onopentagend(endIndex) - } + this.closeCurrentTag(false) + // Set `startIndex` for next node + this.startIndex = endIndex + 1 } private closeCurrentTag(isOpenImplied: boolean) { @@ -417,17 +393,9 @@ export class Parser implements Callbacks { /** @internal */ oncdata(start: number, endIndex: number, offset: number): void { this.endIndex = endIndex - const value = this.getSlice(start, endIndex - offset) - - if (this.options.recognizeCDATA) { - this.cbs.oncdatastart?.() - this.cbs.ontext?.(value) - this.cbs.oncdataend?.() - } else { - this.cbs.oncomment?.(`[CDATA[${value}]]`) - this.cbs.oncommentend?.() - } - + this.cbs.oncdatastart?.() + this.cbs.ontext?.(this.getSlice(start, endIndex - offset)) + this.cbs.oncdataend?.() // Set `startIndex` for next node this.startIndex = endIndex + 1 } @@ -456,8 +424,7 @@ export class Parser implements Callbacks { public parse(input: string): void { this.reset() this.buffer = input - this.tokenizer.write(input) - this.tokenizer.end() + this.tokenizer.parse(input) } /** diff --git a/packages/compiler-core/src/parser/Tokenizer.ts b/packages/compiler-core/src/parser/Tokenizer.ts index b87cc52d8..cece336fc 100644 --- a/packages/compiler-core/src/parser/Tokenizer.ts +++ b/packages/compiler-core/src/parser/Tokenizer.ts @@ -1,3 +1,27 @@ +/** + * This Tokenizer is adapted from htmlparser2 under the MIT License listed at + * https://github.com/fb55/htmlparser2/blob/master/LICENSE + +Copyright 2010, 2011, Chris Winberry . All rights reserved. 
+Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to +deal in the Software without restriction, including without limitation the +rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +sell copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +IN THE SOFTWARE. + */ + import { EntityDecoder, DecodingMode, @@ -143,10 +167,6 @@ export default class Tokenizer { private baseState = State.Text /** For special parsing behavior inside of script and style tags. */ private isSpecial = false - /** Indicates whether the tokenizer has been paused. */ - public running = true - /** The offset of the current buffer. */ - private offset = 0 private readonly decodeEntities: boolean private readonly entityDecoder: EntityDecoder @@ -168,29 +188,6 @@ export default class Tokenizer { this.index = 0 this.baseState = State.Text this.currentSequence = undefined! 
- this.running = true - this.offset = 0 - } - - public write(chunk: string): void { - this.offset += this.buffer.length - this.buffer = chunk - this.parse() - } - - public end(): void { - if (this.running) this.finish() - } - - public pause(): void { - this.running = false - } - - public resume(): void { - this.running = true - if (this.index < this.buffer.length + this.offset) { - this.parse() - } } private stateText(c: number): void { @@ -293,8 +290,8 @@ export default class Tokenizer { * @returns Whether the character was found. */ private fastForwardTo(c: number): boolean { - while (++this.index < this.buffer.length + this.offset) { - if (this.buffer.charCodeAt(this.index - this.offset) === c) { + while (++this.index < this.buffer.length) { + if (this.buffer.charCodeAt(this.index) === c) { return true } } @@ -305,7 +302,7 @@ export default class Tokenizer { * * TODO: Refactor `parse` to increment index before calling states. */ - this.index = this.buffer.length + this.offset - 1 + this.index = this.buffer.length - 1 return false } @@ -577,10 +574,7 @@ export default class Tokenizer { } private stateInEntity(): void { - const length = this.entityDecoder.write( - this.buffer, - this.index - this.offset - ) + const length = this.entityDecoder.write(this.buffer, this.index) // If `length` is positive, we are done with the entity. if (length >= 0) { @@ -591,45 +585,19 @@ export default class Tokenizer { } } else { // Mark buffer as consumed. - this.index = this.offset + this.buffer.length - 1 + this.index = this.buffer.length - 1 } } - /** - * Remove data that has already been consumed from the buffer. - */ - private cleanup() { - // If we are inside of text or attributes, emit what we already have. 
- if (this.running && this.sectionStart !== this.index) { - if ( - this.state === State.Text || - (this.state === State.InSpecialTag && this.sequenceIndex === 0) - ) { - this.cbs.ontext(this.sectionStart, this.index) - this.sectionStart = this.index - } else if ( - this.state === State.InAttributeValueDq || - this.state === State.InAttributeValueSq || - this.state === State.InAttributeValueNq - ) { - this.cbs.onattribdata(this.sectionStart, this.index) - this.sectionStart = this.index - } - } - } - - private shouldContinue() { - return this.index < this.buffer.length + this.offset && this.running - } - /** * Iterates through the buffer, calling the function corresponding to the current state. * * States that are more likely to be hit are higher up, as a performance improvement. */ - private parse() { - while (this.shouldContinue()) { - const c = this.buffer.charCodeAt(this.index - this.offset) + public parse(input: string) { + this.buffer = input + while (this.index < this.buffer.length) { + const c = this.buffer.charCodeAt(this.index) switch (this.state) { case State.Text: { this.stateText(c) @@ -735,6 +703,30 @@ export default class Tokenizer { this.index++ } this.cleanup() + this.finish() + } + + /** + * Remove data that has already been consumed from the buffer. + */ + private cleanup() { + // If we are inside of text or attributes, emit what we already have. + if (this.sectionStart !== this.index) { + if ( + this.state === State.Text || + (this.state === State.InSpecialTag && this.sequenceIndex === 0) + ) { + this.cbs.ontext(this.sectionStart, this.index) + this.sectionStart = this.index + } else if ( + this.state === State.InAttributeValueDq || + this.state === State.InAttributeValueSq || + this.state === State.InAttributeValueNq + ) { + this.cbs.onattribdata(this.sectionStart, this.index) + this.sectionStart = this.index + } + } } private finish() { @@ -750,7 +742,7 @@ export default class Tokenizer { /** Handle any trailing data. 
*/ private handleTrailingData() { - const endIndex = this.buffer.length + this.offset + const endIndex = this.buffer.length // If there is no remaining data, we are done. if (this.sectionStart >= endIndex) { diff --git a/packages/compiler-core/src/parser/index.ts b/packages/compiler-core/src/parser/index.ts index 3489ec2fd..d12af24e3 100644 --- a/packages/compiler-core/src/parser/index.ts +++ b/packages/compiler-core/src/parser/index.ts @@ -1,16 +1,429 @@ -import { RootNode, createRoot } from '../ast' +import { fromCodePoint } from 'entities/lib/decode.js' +import { + ElementNode, + ElementTypes, + NodeTypes, + RootNode, + TemplateChildNode, + createRoot +} from '../ast' import { ParserOptions } from '../options' -import { Parser } from './Parser' +import Tokenizer from './Tokenizer' +import { hasOwn } from '@vue/shared' -const parser = new Parser({ - // TODO -}) +const formTags = new Set([ + 'input', + 'option', + 'optgroup', + 'select', + 'button', + 'datalist', + 'textarea' +]) +const pTag = new Set(['p']) +const tableSectionTags = new Set(['thead', 'tbody']) +const ddtTags = new Set(['dd', 'dt']) +const rtpTags = new Set(['rt', 'rp']) + +const openImpliesClose = new Map>([ + ['tr', new Set(['tr', 'th', 'td'])], + ['th', new Set(['th'])], + ['td', new Set(['thead', 'th', 'td'])], + ['body', new Set(['head', 'link', 'script'])], + ['li', new Set(['li'])], + ['p', pTag], + ['h1', pTag], + ['h2', pTag], + ['h3', pTag], + ['h4', pTag], + ['h5', pTag], + ['h6', pTag], + ['select', formTags], + ['input', formTags], + ['output', formTags], + ['button', formTags], + ['datalist', formTags], + ['textarea', formTags], + ['option', new Set(['option'])], + ['optgroup', new Set(['optgroup', 'option'])], + ['dd', ddtTags], + ['dt', ddtTags], + ['address', pTag], + ['article', pTag], + ['aside', pTag], + ['blockquote', pTag], + ['details', pTag], + ['div', pTag], + ['dl', pTag], + ['fieldset', pTag], + ['figcaption', pTag], + ['figure', pTag], + ['footer', pTag], + ['form', 
pTag], + ['header', pTag], + ['hr', pTag], + ['main', pTag], + ['nav', pTag], + ['ol', pTag], + ['pre', pTag], + ['section', pTag], + ['table', pTag], + ['ul', pTag], + ['rt', rtpTags], + ['rp', rtpTags], + ['tbody', tableSectionTags], + ['tfoot', tableSectionTags] +]) + +const voidElements = new Set([ + 'area', + 'base', + 'basefont', + 'br', + 'col', + 'command', + 'embed', + 'frame', + 'hr', + 'img', + 'input', + 'isindex', + 'keygen', + 'link', + 'meta', + 'param', + 'source', + 'track', + 'wbr' +]) + +const foreignContextElements = new Set(['math', 'svg']) + +const htmlIntegrationElements = new Set([ + 'mi', + 'mo', + 'mn', + 'ms', + 'mtext', + 'annotation-xml', + 'foreignobject', + 'desc', + 'title' +]) + +let currentOptions: ParserOptions = {} +let currentRoot: RootNode = createRoot([]) +let elementStack: ElementNode[] = [] + +// parser state +let htmlMode = false +let currentInput = '' +let openTagStart = 0 +let tagname = '' +let attribname = '' +let attribvalue = '' +let attribs: Record | null = null +let startIndex = 0 +let endIndex = 0 +let inPre = 0 +// let inVPre = 0 +const stack: string[] = [] +const foreignContext: boolean[] = [false] + +const tokenizer = new Tokenizer( + // TODO handle entities + { decodeEntities: true }, + { + ontext(start, end) { + const content = getSlice(start, end) + endIndex = end - 1 + onText(content) + startIndex = end + }, + + ontextentity(cp, end) { + endIndex = end - 1 + onText(fromCodePoint(cp)) + startIndex = end + }, + + onopentagname(start, end) { + emitOpenTag(getSlice(start, (endIndex = end))) + }, + + onopentagend(end) { + endIndex = end + endOpenTag(false) + startIndex = end + 1 + }, + + onclosetag(start, end) { + endIndex = end + const name = getSlice(start, end) + + if ( + htmlMode && + (foreignContextElements.has(name) || htmlIntegrationElements.has(name)) + ) { + foreignContext.shift() + } + + if (!voidElements.has(name)) { + const pos = stack.indexOf(name) + if (pos !== -1) { + for (let index = 0; index <= 
pos; index++) { + stack.shift() + onCloseTag() + } + } else if (htmlMode && name === 'p') { + // Implicit open before close + emitOpenTag('p') + closeCurrentTag(true) + } + } else if (htmlMode && name === 'br') { + // TODO + // We can't use `emitOpenTag` for implicit open, as `br` would be implicitly closed. + // this.cbs.onopentag?.('br', {}, true) + // this.cbs.onclosetag?.('br', false) + } + + // Set `startIndex` for next node + startIndex = end + 1 + }, + + onselfclosingtag(end) { + endIndex = end + closeCurrentTag(false) + startIndex = end + 1 + }, + + onattribname(start, end) { + attribname = getSlice((startIndex = start), end) + }, + onattribdata(start, end) { + attribvalue += getSlice(start, end) + }, + onattribentity(codepoint) { + attribvalue += fromCodePoint(codepoint) + }, + onattribend(quote, end) { + endIndex = end + if (attribs && !hasOwn(attribs, attribname)) { + // TODO gen attributes AST nodes + attribs[attribname] = attribvalue + } + attribvalue = '' + }, + + oncomment(start, end, offset) { + endIndex = end + // TODO oncomment + startIndex = end + 1 + }, + + onend() { + // Set the end index for all remaining tags + endIndex = startIndex + for (let index = 0; index < stack.length; index++) { + onCloseTag() + } + }, + + oncdata(start, end, offset) { + endIndex = end + // TODO throw error + startIndex = end + 1 + }, + + // TODO ignore + ondeclaration(start, end) { + endIndex = end + // TODO onprocessinginstruction + startIndex = end + 1 + }, + + // TODO ignore + onprocessinginstruction(start, end) { + endIndex = end + // TODO onprocessinginstruction + startIndex = end + 1 + } + } +) + +function getSlice(start: number, end: number) { + return currentInput.slice(start, end) +} + +function emitOpenTag(name: string) { + openTagStart = startIndex + tagname = name + const impliesClose = htmlMode && openImpliesClose.get(name) + if (impliesClose) { + while (stack.length > 0 && impliesClose.has(stack[0])) { + stack.shift() + onCloseTag() + } + } + if 
(!voidElements.has(name)) { + stack.unshift(name) + if (htmlMode) { + if (foreignContextElements.has(name)) { + foreignContext.unshift(true) + } else if (htmlIntegrationElements.has(name)) { + foreignContext.unshift(false) + } + } + } + attribs = {} +} + +function closeCurrentTag(isOpenImplied: boolean) { + const name = tagname + endOpenTag(isOpenImplied) + if (stack[0] === name) { + onCloseTag() + stack.shift() + } +} + +function endOpenTag(isImplied: boolean) { + startIndex = openTagStart + if (attribs) { + onOpenTag(tagname) + attribs = null + } + if (voidElements.has(tagname)) { + onCloseTag() + } + tagname = '' +} + +function onText(content: string) { + const parent = getParent() + const lastNode = parent.children[parent.children.length - 1] + if (lastNode?.type === NodeTypes.TEXT) { + // merge + lastNode.content += content + // TODO update loc + } else { + parent.children.push({ + type: NodeTypes.TEXT, + content, + // @ts-ignore TODO + loc: {} + }) + } +} + +function onOpenTag(tag: string) { + const el: ElementNode = { + type: NodeTypes.ELEMENT, + tag, + // TODO namespace + ns: 0, + // TODO refine tag type + tagType: ElementTypes.ELEMENT, + // TODO props + props: [], + children: [], + // @ts-ignore TODO + loc: {}, + codegenNode: undefined + } + addNode(el) + elementStack.push(el) +} + +function onCloseTag() { + const el = elementStack.pop()! 
+ // whitepsace management + const nodes = el.children + const shouldCondense = currentOptions.whitespace !== 'preserve' + let removedWhitespace = false + for (let i = 0; i < nodes.length; i++) { + const node = nodes[i] + if (node.type === NodeTypes.TEXT) { + if (!inPre) { + if (!/[^\t\r\n\f ]/.test(node.content)) { + const prev = nodes[i - 1] + const next = nodes[i + 1] + // Remove if: + // - the whitespace is the first or last node, or: + // - (condense mode) the whitespace is between twos comments, or: + // - (condense mode) the whitespace is between comment and element, or: + // - (condense mode) the whitespace is between two elements AND contains newline + if ( + !prev || + !next || + (shouldCondense && + ((prev.type === NodeTypes.COMMENT && + next.type === NodeTypes.COMMENT) || + (prev.type === NodeTypes.COMMENT && + next.type === NodeTypes.ELEMENT) || + (prev.type === NodeTypes.ELEMENT && + next.type === NodeTypes.COMMENT) || + (prev.type === NodeTypes.ELEMENT && + next.type === NodeTypes.ELEMENT && + /[\r\n]/.test(node.content)))) + ) { + removedWhitespace = true + nodes[i] = null as any + } else { + // Otherwise, the whitespace is condensed into a single space + node.content = ' ' + } + } else if (shouldCondense) { + // in condense mode, consecutive whitespaces in text are condensed + // down to a single space. + node.content = node.content.replace(/[\t\r\n\f ]+/g, ' ') + } + } else { + // #6410 normalize windows newlines in
+        // <pre>:
+        // in SSR, browsers normalize server-rendered \r\n into a single \n
+        // in the DOM
+        node.content = node.content.replace(/\r\n/g, '\n')
+      }
+    }
+  }
+  if (removedWhitespace) {
+    el.children = nodes.filter(Boolean)
+  }
+}
+
+/**
+ * Appends `node` as the last child of the current parent: the innermost open
+ * element, or the root when no element is open.
+ */
+function addNode(node: TemplateChildNode) {
+  const { children } = getParent()
+  children.push(node)
+}
+
+/**
+ * Returns the node new children are attached to: the element on top of
+ * `elementStack`, falling back to `currentRoot` when the stack is empty.
+ */
+function getParent() {
+  const innermost = elementStack[elementStack.length - 1]
+  return innermost || currentRoot
+}
+
+/**
+ * Restores the tokenizer and the per-parse scratch state (current tag and
+ * attribute buffers, source indices, open-element stacks) to their initial
+ * values so nothing from a previous parse carries over into the next one.
+ */
+function reset() {
+  tokenizer.reset()
+  tagname = ''
+  attribname = ''
+  attribvalue = ''
+  attribs = null
+  // Previously left untouched: clear the remaining positional/pre state too.
+  openTagStart = 0
+  startIndex = 0
+  endIndex = 0
+  inPre = 0
+  stack.length = 0
+  elementStack.length = 0
+  // Back to the initial single `false` entry.
+  foreignContext.length = 1
+  foreignContext[0] = false
+}
 
 export function baseParse(
-  content: string,
+  input: string,
   options: ParserOptions = {}
 ): RootNode {
-  const root = createRoot([])
-  parser.parse(content)
+  // Parses `input` into a fresh root node and returns it. Parser state lives
+  // at module level, so it is reset first and then re-seeded from the
+  // arguments; the tokenizer callbacks attach nodes to `currentRoot` while
+  // `tokenizer.parse` runs.
+  reset()
+  // The input is trimmed before tokenizing, so the indices the tokenizer
+  // reports are relative to the trimmed string held in `currentInput`.
+  currentInput = input.trim()
+  currentOptions = options
+  htmlMode = !!options.htmlMode
+  const root = (currentRoot = createRoot([]))
+  tokenizer.parse(currentInput)
+  // `endIndex` is only ever written so far. Read it without side effects to
+  // keep the unused-variable check quiet instead of logging on every parse.
+  void endIndex
   return root
 }