wip: port parser

2023-11-13 21:03:39 +08:00 · 2023-11-13 21:03:39 +08:00 · 19bd714239
parent 2a6292e37f
commit 19bd714239
8 changed files with 497 additions and 116 deletions
--- a/packages/compiler-core/tests/parse.spec.ts
+++ b/packages/compiler-core/tests/parse.spec.ts
@ -1,5 +1,5 @@
 import { ParserOptions } from '../src/options'
-import { baseParse, TextModes } from '../src/parse'
+import { TextModes } from '../src/parse'
 import { ErrorCodes } from '../src/errors'
 import {
  CommentNode,
@ -14,6 +14,8 @@ import {
  DirectiveNode
 } from '../src/ast'
 import { baseParse } from '../src/parser/index'
 describe('compiler: parse', () => {
  describe('Text', () => {
    test('simple text', () => {
--- a/packages/compiler-core/src/ast.ts
+++ b/packages/compiler-core/src/ast.ts
@ -128,9 +128,9 @@ export interface BaseElementNode extends Node {
  ns: Namespace
  tag: string
  tagType: ElementTypes
  isSelfClosing: boolean
  props: Array<AttributeNode | DirectiveNode>
  children: TemplateChildNode[]
  isSelfClosing?: boolean
 }
 export interface PlainElementNode extends BaseElementNode {
--- a/packages/compiler-core/src/index.ts
+++ b/packages/compiler-core/src/index.ts
@ -70,3 +70,5 @@ export {
  warnDeprecation,
  CompilerDeprecationTypes
 } from './compat/compatConfig'
 export { baseParse as newParse } from './parser/index'
--- a/packages/compiler-core/src/options.ts
+++ b/packages/compiler-core/src/options.ts
@ -17,6 +17,10 @@ export interface ErrorHandlingOptions {
 export interface ParserOptions
  extends ErrorHandlingOptions,
    CompilerCompatOptions {
  /**
   * Parse as HTML. Default: false
   */
  htmlMode?: boolean
  /**
   * e.g. platform native elements, e.g. `<div>` for browsers
   */
--- a/packages/compiler-core/src/parse.ts
+++ b/packages/compiler-core/src/parse.ts
@ -40,6 +40,7 @@ import {
 } from './compat/compatConfig'
 type OptionalOptions =
  | 'htmlMode'
  | 'whitespace'
  | 'isNativeTag'
  | 'isBuiltInComponent'
--- a/packages/compiler-core/src/parser/Parser.ts
+++ b/packages/compiler-core/src/parser/Parser.ts
@ -108,22 +108,6 @@ export interface ParserOptions {
   * @default true
   */
  decodeEntities?: boolean
  /**
   * If set to true, CDATA sections will be recognized as text even if the xmlMode option is not enabled.
   * NOTE: If xmlMode is set to `true` then CDATA sections will always be recognized as text.
   *
   * @default false
   */
  recognizeCDATA?: boolean
  /**
   * If set to `true`, self-closing tags will trigger the onclosetag event even if xmlMode is not set to `true`.
   * NOTE: If xmlMode is set to `true` then self-closing tags will always be recognized.
   *
   * @default false
   */
  recognizeSelfClosing?: boolean
 }
 export interface Handler {
@ -186,7 +170,6 @@ export class Parser implements Callbacks {
  /** Determines whether self-closing tags are recognized. */
  private readonly foreignContext: boolean[]
  private readonly cbs: Partial<Handler>
  private readonly recognizeSelfClosing: boolean
  private readonly tokenizer: Tokenizer
  private buffer: string = ''
@ -196,7 +179,6 @@ export class Parser implements Callbacks {
    private readonly options: ParserOptions = {}
  ) {
    this.cbs = cbs ?? {}
    this.recognizeSelfClosing = options.recognizeSelfClosing ?? false
    this.tokenizer = new Tokenizer(this.options, this)
    this.foreignContext = [false]
    this.cbs.onparserinit?.(this)
@ -307,15 +289,9 @@ export class Parser implements Callbacks {
  /** @internal */
  onselfclosingtag(endIndex: number): void {
    this.endIndex = endIndex
    if (this.recognizeSelfClosing || this.foreignContext[0]) {
    this.closeCurrentTag(false)
    // Set `startIndex` for next node
    this.startIndex = endIndex + 1
    } else {
      // Ignore the fact that the tag is self-closing.
      this.onopentagend(endIndex)
    }
  }
  private closeCurrentTag(isOpenImplied: boolean) {
@ -417,17 +393,9 @@ export class Parser implements Callbacks {
  /** @internal */
  oncdata(start: number, endIndex: number, offset: number): void {
    this.endIndex = endIndex
    const value = this.getSlice(start, endIndex - offset)
    if (this.options.recognizeCDATA) {
    this.cbs.oncdatastart?.()
-      this.cbs.ontext?.(value)
+    this.cbs.ontext?.(this.getSlice(start, endIndex - offset))
    this.cbs.oncdataend?.()
    } else {
      this.cbs.oncomment?.(`[CDATA[${value}]]`)
      this.cbs.oncommentend?.()
    }
    // Set `startIndex` for next node
    this.startIndex = endIndex + 1
  }
@ -456,8 +424,7 @@ export class Parser implements Callbacks {
  public parse(input: string): void {
    this.reset()
    this.buffer = input
-    this.tokenizer.write(input)
+    this.tokenizer.parse(input)
    this.tokenizer.end()
  }
  /**
--- a/packages/compiler-core/src/parser/Tokenizer.ts
+++ b/packages/compiler-core/src/parser/Tokenizer.ts
@ -1,3 +1,27 @@
 /**
 * This Tokenizer is adapted from htmlparser2 under the MIT License listed at
 * https://github.com/fb55/htmlparser2/blob/master/LICENSE
 Copyright 2010, 2011, Chris Winberry <chris@winberry.net>. All rights reserved.
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to
 deal in the Software without restriction, including without limitation the
 rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 sell copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 IN THE SOFTWARE.
 */
 import {
  EntityDecoder,
  DecodingMode,
@ -143,10 +167,6 @@ export default class Tokenizer {
  private baseState = State.Text
  /** For special parsing behavior inside of script and style tags. */
  private isSpecial = false
  /** Indicates whether the tokenizer has been paused. */
  public running = true
  /** The offset of the current buffer. */
  private offset = 0
  private readonly decodeEntities: boolean
  private readonly entityDecoder: EntityDecoder
@ -168,29 +188,6 @@ export default class Tokenizer {
    this.index = 0
    this.baseState = State.Text
    this.currentSequence = undefined!
    this.running = true
    this.offset = 0
  }
  public write(chunk: string): void {
    this.offset += this.buffer.length
    this.buffer = chunk
    this.parse()
  }
  public end(): void {
    if (this.running) this.finish()
  }
  public pause(): void {
    this.running = false
  }
  public resume(): void {
    this.running = true
    if (this.index < this.buffer.length + this.offset) {
      this.parse()
    }
  }
  private stateText(c: number): void {
@ -293,8 +290,8 @@ export default class Tokenizer {
   * @returns Whether the character was found.
   */
  private fastForwardTo(c: number): boolean {
-    while (++this.index < this.buffer.length + this.offset) {
+    while (++this.index < this.buffer.length) {
-      if (this.buffer.charCodeAt(this.index - this.offset) === c) {
+      if (this.buffer.charCodeAt(this.index) === c) {
        return true
      }
    }
@ -305,7 +302,7 @@ export default class Tokenizer {
     *
     * TODO: Refactor `parse` to increment index before calling states.
     */
-    this.index = this.buffer.length + this.offset - 1
+    this.index = this.buffer.length - 1
    return false
  }
@ -577,10 +574,7 @@ export default class Tokenizer {
  }
  private stateInEntity(): void {
-    const length = this.entityDecoder.write(
+    const length = this.entityDecoder.write(this.buffer, this.index)
      this.buffer,
      this.index - this.offset
    )
    // If `length` is positive, we are done with the entity.
    if (length >= 0) {
@ -591,45 +585,19 @@ export default class Tokenizer {
      }
    } else {
      // Mark buffer as consumed.
-      this.index = this.offset + this.buffer.length - 1
+      this.index = this.buffer.length - 1
    }
  }
  /**
   * Remove data that has already been consumed from the buffer.
   */
  private cleanup() {
    // If we are inside of text or attributes, emit what we already have.
    if (this.running && this.sectionStart !== this.index) {
      if (
        this.state === State.Text ||
        (this.state === State.InSpecialTag && this.sequenceIndex === 0)
      ) {
        this.cbs.ontext(this.sectionStart, this.index)
        this.sectionStart = this.index
      } else if (
        this.state === State.InAttributeValueDq ||
        this.state === State.InAttributeValueSq ||
        this.state === State.InAttributeValueNq
      ) {
        this.cbs.onattribdata(this.sectionStart, this.index)
        this.sectionStart = this.index
      }
    }
  }
  private shouldContinue() {
    return this.index < this.buffer.length + this.offset && this.running
  }
  /**
   * Iterates through the buffer, calling the function corresponding to the current state.
   *
   * States that are more likely to be hit are higher up, as a performance improvement.
   */
-  private parse() {
+  public parse(input: string) {
-    while (this.shouldContinue()) {
+    this.buffer = input
-      const c = this.buffer.charCodeAt(this.index - this.offset)
+    while (this.index < this.buffer.length) {
      const c = this.buffer.charCodeAt(this.index)
      switch (this.state) {
        case State.Text: {
          this.stateText(c)
@ -735,6 +703,30 @@ export default class Tokenizer {
      this.index++
    }
    this.cleanup()
    this.finish()
  }
  /**
   * Flush the pending section of the buffer to the callbacks.
   *
   * If the span between `sectionStart` and `index` is non-empty, it is emitted
   * as text (when in the Text state, or in a special tag with no partially
   * matched sequence) or as attribute data (when inside any attribute value
   * state), and `sectionStart` is moved up to `index`. In every other state
   * nothing is emitted and `sectionStart` is left untouched.
   */
  private cleanup() {
    // If we are inside of text or attributes, emit what we already have.
    if (this.sectionStart !== this.index) {
      if (
        this.state === State.Text ||
        (this.state === State.InSpecialTag && this.sequenceIndex === 0)
      ) {
        this.cbs.ontext(this.sectionStart, this.index)
        this.sectionStart = this.index
      } else if (
        this.state === State.InAttributeValueDq ||
        this.state === State.InAttributeValueSq ||
        this.state === State.InAttributeValueNq
      ) {
        // Double-quoted, single-quoted and unquoted values are flushed alike.
        this.cbs.onattribdata(this.sectionStart, this.index)
        this.sectionStart = this.index
      }
    }
  }
  private finish() {
@ -750,7 +742,7 @@ export default class Tokenizer {
  /** Handle any trailing data. */
  private handleTrailingData() {
-    const endIndex = this.buffer.length + this.offset
+    const endIndex = this.buffer.length
    // If there is no remaining data, we are done.
    if (this.sectionStart >= endIndex) {
--- a/packages/compiler-core/src/parser/index.ts
+++ b/packages/compiler-core/src/parser/index.ts
@ -1,16 +1,429 @@
-import { RootNode, createRoot } from '../ast'
+import { fromCodePoint } from 'entities/lib/decode.js'
 import {
  ElementNode,
  ElementTypes,
  NodeTypes,
  RootNode,
  TemplateChildNode,
  createRoot
 } from '../ast'
 import { ParserOptions } from '../options'
-import { Parser } from './Parser'
+import Tokenizer from './Tokenizer'
 import { hasOwn } from '@vue/shared'
-const parser = new Parser({
+const formTags = new Set([
  'input',
  'option',
  'optgroup',
  'select',
  'button',
  'datalist',
  'textarea'
 ])
 // Shared value sets referenced by several `openImpliesClose` entries below.
 const pTag = new Set(['p'])
 const tableSectionTags = new Set(['thead', 'tbody'])
 const ddtTags = new Set(['dd', 'dt'])
 const rtpTags = new Set(['rt', 'rp'])
 /**
  * Maps a tag name to the set of tag names that get implicitly closed when
  * that tag is opened. `emitOpenTag` consults this map only in `htmlMode`,
  * closing entries from the top of the open-tag stack for as long as they are
  * members of the set.
  */
 const openImpliesClose = new Map<string, Set<string>>([
  ['tr', new Set(['tr', 'th', 'td'])],
  ['th', new Set(['th'])],
  ['td', new Set(['thead', 'th', 'td'])],
  ['body', new Set(['head', 'link', 'script'])],
  ['li', new Set(['li'])],
  ['p', pTag],
  ['h1', pTag],
  ['h2', pTag],
  ['h3', pTag],
  ['h4', pTag],
  ['h5', pTag],
  ['h6', pTag],
  ['select', formTags],
  ['input', formTags],
  ['output', formTags],
  ['button', formTags],
  ['datalist', formTags],
  ['textarea', formTags],
  ['option', new Set(['option'])],
  ['optgroup', new Set(['optgroup', 'option'])],
  ['dd', ddtTags],
  ['dt', ddtTags],
  ['address', pTag],
  ['article', pTag],
  ['aside', pTag],
  ['blockquote', pTag],
  ['details', pTag],
  ['div', pTag],
  ['dl', pTag],
  ['fieldset', pTag],
  ['figcaption', pTag],
  ['figure', pTag],
  ['footer', pTag],
  ['form', pTag],
  ['header', pTag],
  ['hr', pTag],
  ['main', pTag],
  ['nav', pTag],
  ['ol', pTag],
  ['pre', pTag],
  ['section', pTag],
  ['table', pTag],
  ['ul', pTag],
  ['rt', rtpTags],
  ['rp', rtpTags],
  ['tbody', tableSectionTags],
  ['tfoot', tableSectionTags]
 ])
 /**
  * Tag names treated as void elements: `emitOpenTag` never pushes them onto
  * the open-tag stack, `endOpenTag` closes them as soon as their open tag ends,
  * and the `onclosetag` callback does not search the stack for them.
  * NOTE(review): these checks are not gated on `htmlMode` — confirm intended.
  */
 const voidElements = new Set([
  'area',
  'base',
  'basefont',
  'br',
  'col',
  'command',
  'embed',
  'frame',
  'hr',
  'img',
  'input',
  'isindex',
  'keygen',
  'link',
  'meta',
  'param',
  'source',
  'track',
  'wbr'
 ])
 // In htmlMode, opening one of these pushes `true` onto `foreignContext`.
 const foreignContextElements = new Set(['math', 'svg'])
 // In htmlMode, opening one of these pushes `false` onto `foreignContext`.
 // Names are all lowercase (note 'foreignobject'); presumably tag names are
 // lowercased before lookup — TODO confirm, no lowercasing is visible here.
 const htmlIntegrationElements = new Set([
  'mi',
  'mo',
  'mn',
  'ms',
  'mtext',
  'annotation-xml',
  'foreignobject',
  'desc',
  'title'
 ])
 // Options of the parse currently in progress (assigned by `baseParse`).
 let currentOptions: ParserOptions = {}
 // Root node that receives top-level children (replaced on every `baseParse`).
 let currentRoot: RootNode = createRoot([])
 // Element nodes that are currently open; the innermost one is the LAST entry
 // (`onOpenTag` pushes, `onCloseTag` pops).
 let elementStack: ElementNode[] = []
 // parser state
 let htmlMode = false
 // The source string being parsed; `getSlice` reads from it.
 let currentInput = ''
 // Value of `startIndex` captured when the current open tag's name was emitted.
 let openTagStart = 0
 // Name of the open tag being processed; cleared to '' by `endOpenTag`.
 let tagname = ''
 // Name and accumulated value of the attribute being processed.
 let attribname = ''
 let attribvalue = ''
 // Attributes collected for the current open tag. Non-null only between
 // `emitOpenTag` and `endOpenTag`.
 let attribs: Record<string, string> | null = null
 // Positions updated by the tokenizer callbacks as nodes are processed.
 let startIndex = 0
 let endIndex = 0
 // Read by `onCloseTag` to choose `<pre>` whitespace handling.
 // NOTE(review): nothing in the visible code ever assigns it a non-zero value.
 let inPre = 0
 // let inVPre = 0
 // Names of the currently open tags; the innermost one is the FIRST entry
 // (maintained with `unshift` / `shift`).
 const stack: string[] = []
 // Stack of foreign-context flags, innermost first; starts as `[false]`.
 const foreignContext: boolean[] = [false]
 // Module-level tokenizer instance. Its callbacks translate token boundaries
 // into the module-level parser state and the AST-building helpers
 // (`onText`, `emitOpenTag`, `endOpenTag`, `closeCurrentTag`, `onCloseTag`).
 const tokenizer = new Tokenizer(
  // TODO handle entities
  { decodeEntities: true },
  {
    // Plain text between `start` and `end` of the input.
    ontext(start, end) {
      const content = getSlice(start, end)
      endIndex = end - 1
      onText(content)
      startIndex = end
    },
    // A decoded entity in text, delivered as a code point.
    ontextentity(cp, end) {
      endIndex = end - 1
      onText(fromCodePoint(cp))
      startIndex = end
    },
    // Name of an open tag: records `endIndex` and starts the tag.
    onopentagname(start, end) {
      emitOpenTag(getSlice(start, (endIndex = end)))
    },
    // End of an open tag that is not self-closing.
    onopentagend(end) {
      endIndex = end
      endOpenTag(false)
      startIndex = end + 1
    },
    // A closing tag whose name spans `start`..`end`.
    onclosetag(start, end) {
      endIndex = end
      const name = getSlice(start, end)
      if (
        htmlMode &&
        (foreignContextElements.has(name) || htmlIntegrationElements.has(name))
      ) {
        foreignContext.shift()
      }
      if (!voidElements.has(name)) {
        // `stack` is innermost-first, so `pos` is how many tags are nested
        // inside the one being closed.
        const pos = stack.indexOf(name)
        if (pos !== -1) {
          // Close every tag up to and including the matching one.
          for (let index = 0; index <= pos; index++) {
            stack.shift()
            onCloseTag()
          }
        } else if (htmlMode && name === 'p') {
          // Implicit open before close
          emitOpenTag('p')
          closeCurrentTag(true)
        }
      } else if (htmlMode && name === 'br') {
        // TODO
-})
+        // We can't use `emitOpenTag` for implicit open, as `br` would be implicitly closed.
        // this.cbs.onopentag?.('br', {}, true)
        // this.cbs.onclosetag?.('br', false)
      }
      // Set `startIndex` for next node
      startIndex = end + 1
    },
    // End of a self-closing tag: finish the open tag and close it right away.
    onselfclosingtag(end) {
      endIndex = end
      closeCurrentTag(false)
      startIndex = end + 1
    },
    // Attribute name; also moves `startIndex` to the attribute's start.
    onattribname(start, end) {
      attribname = getSlice((startIndex = start), end)
    },
    // A chunk of attribute value; chunks are concatenated until `onattribend`.
    onattribdata(start, end) {
      attribvalue += getSlice(start, end)
    },
    // A decoded entity inside an attribute value.
    onattribentity(codepoint) {
      attribvalue += fromCodePoint(codepoint)
    },
    // End of an attribute: the first occurrence of a name wins, later
    // duplicates are dropped. The accumulated value is always cleared.
    onattribend(quote, end) {
      endIndex = end
      if (attribs && !hasOwn(attribs, attribname)) {
        // TODO gen attributes AST nodes
        attribs[attribname] = attribvalue
      }
      attribvalue = ''
    },
    // Comments are currently skipped: only the positions are updated.
    oncomment(start, end, offset) {
      endIndex = end
      // TODO oncomment
      startIndex = end + 1
    },
    // End of input: close whatever is still open.
    // NOTE(review): `stack` is not shifted here, so `onCloseTag` runs once per
    // remaining tag name and relies on `elementStack` holding at least that
    // many elements. `emitOpenTag` pushes the name before `endOpenTag` creates
    // the element, so input ending inside an open tag looks unsafe — confirm.
    onend() {
      // Set the end index for all remaining tags
      endIndex = startIndex
      for (let index = 0; index < stack.length; index++) {
        onCloseTag()
      }
    },
    // CDATA sections are currently skipped: only the positions are updated.
    oncdata(start, end, offset) {
      endIndex = end
      // TODO throw error
      startIndex = end + 1
    },
    // TODO ignore
    ondeclaration(start, end) {
      endIndex = end
      // TODO onprocessinginstruction
      startIndex = end + 1
    },
    // TODO ignore
    onprocessinginstruction(start, end) {
      endIndex = end
      // TODO onprocessinginstruction
      startIndex = end + 1
    }
  }
 )
 /**
  * Returns the part of the input currently being parsed (`currentInput`)
  * that lies between two offsets.
  *
  * @param start - index of the first character to include
  * @param end - index just past the last character to include
  */
 function getSlice(start: number, end: number) {
  const text = currentInput.slice(start, end)
  return text
 }
 /**
  * Starts processing an open tag called `name`.
  *
  * Records where the tag started and its name, then (htmlMode only) closes
  * every tag at the top of the open-tag stack that this tag implicitly closes
  * according to `openImpliesClose`. Non-void tags are pushed onto `stack`;
  * in htmlMode, foreign-context and HTML-integration tags also push a flag
  * onto `foreignContext`. Finally a fresh attribute bag is created.
  */
 function emitOpenTag(name: string) {
  openTagStart = startIndex
  tagname = name

  if (htmlMode) {
    const closedByThisTag = openImpliesClose.get(name)
    if (closedByThisTag) {
      // pop from the innermost tag outwards for as long as the tag is implied closed
      while (stack.length > 0 && closedByThisTag.has(stack[0])) {
        stack.shift()
        onCloseTag()
      }
    }
  }

  const isVoid = voidElements.has(name)
  if (!isVoid) {
    stack.unshift(name)
    if (htmlMode) {
      const entersForeign = foreignContextElements.has(name)
      if (entersForeign || htmlIntegrationElements.has(name)) {
        // true for foreign (math/svg) content, false for integration points
        foreignContext.unshift(entersForeign)
      }
    }
  }

  attribs = {}
 }
 /**
  * Finishes the open tag in progress and, if that tag is the innermost entry
  * of the open-tag stack, closes it immediately (used for self-closing tags
  * and for an implied `<p>`).
  *
  * @param isOpenImplied - forwarded unchanged to `endOpenTag`
  */
 function closeCurrentTag(isOpenImplied: boolean) {
  // Remember the name first: `endOpenTag` clears `tagname`.
  const closingName = tagname
  endOpenTag(isOpenImplied)
  if (stack[0] !== closingName) {
    return
  }
  onCloseTag()
  stack.shift()
 }
 /**
  * Completes the open tag in progress.
  *
  * Rewinds `startIndex` to where the tag began, creates the element node if
  * one is still pending (i.e. `attribs` is set), closes the element at once
  * when the tag is a void element, and clears `tagname`.
  *
  * @param isImplied - currently unused
  */
 function endOpenTag(isImplied: boolean) {
  startIndex = openTagStart

  const hasPendingOpen = Boolean(attribs)
  if (hasPendingOpen) {
    onOpenTag(tagname)
    attribs = null
  }

  // void elements never wait for a close tag
  if (voidElements.has(tagname)) onCloseTag()

  tagname = ''
 }
 /**
  * Adds text to the current parent node. When the parent's last child is
  * already a text node the content is appended to it; otherwise a new text
  * node is created.
  */
 function onText(content: string) {
  const siblings = getParent().children
  const tail = siblings[siblings.length - 1]

  if (tail != null && tail.type === NodeTypes.TEXT) {
    // merge with the preceding text node
    tail.content += content
    // TODO update loc
    return
  }

  siblings.push({
    type: NodeTypes.TEXT,
    content,
    // @ts-ignore TODO
    loc: {}
  })
 }
 /**
  * Creates the element node for `tag`, appends it to the current parent and
  * makes it the innermost open element.
  */
 function onOpenTag(tag: string) {
  const element: ElementNode = {
    type: NodeTypes.ELEMENT,
    tag,
    ns: 0, // TODO namespace
    tagType: ElementTypes.ELEMENT, // TODO refine tag type
    props: [], // TODO props
    children: [],
    // @ts-ignore TODO
    loc: {},
    codegenNode: undefined
  }
  // Attach to the parent BEFORE pushing: once pushed, the element itself
  // would be returned as the current parent.
  addNode(element)
  elementStack.push(element)
 }
 /**
  * Closes the innermost open element: pops it from `elementStack` and
  * normalizes the whitespace of its direct text children.
  *
  * When `inPre` is falsy:
  * - a whitespace-only text node is removed if it is the first or last child
  *   or, in condense mode, if it sits between two comments, between a comment
  *   and an element, or between two elements while containing a newline;
  * - any other whitespace-only text node becomes a single space;
  * - in condense mode, whitespace runs inside other text collapse to a space.
  *
  * When `inPre` is truthy, only `\r\n` is normalized to `\n`.
  *
  * Condense mode is active unless `currentOptions.whitespace` is `'preserve'`.
  * Does nothing when no element is open.
  */
 function onCloseTag() {
  const el = elementStack.pop()
  // A tag name is pushed onto `stack` as soon as it is read, but its element
  // node is only created when the open tag ends. When input stops inside an
  // open tag (e.g. `<div a`), `onend` still asks for a close although no
  // element exists; tolerate that instead of throwing a TypeError.
  if (!el) {
    return
  }
  // whitespace management
  const nodes = el.children
  const shouldCondense = currentOptions.whitespace !== 'preserve'
  let removedWhitespace = false
  for (let i = 0; i < nodes.length; i++) {
    const node = nodes[i]
    if (node.type === NodeTypes.TEXT) {
      if (!inPre) {
        if (!/[^\t\r\n\f ]/.test(node.content)) {
          const prev = nodes[i - 1]
          const next = nodes[i + 1]
          // Remove if:
          // - the whitespace is the first or last node, or:
          // - (condense mode) the whitespace is between two comments, or:
          // - (condense mode) the whitespace is between comment and element, or:
          // - (condense mode) the whitespace is between two elements AND contains newline
          if (
            !prev ||
            !next ||
            (shouldCondense &&
              ((prev.type === NodeTypes.COMMENT &&
                next.type === NodeTypes.COMMENT) ||
                (prev.type === NodeTypes.COMMENT &&
                  next.type === NodeTypes.ELEMENT) ||
                (prev.type === NodeTypes.ELEMENT &&
                  next.type === NodeTypes.COMMENT) ||
                (prev.type === NodeTypes.ELEMENT &&
                  next.type === NodeTypes.ELEMENT &&
                  /[\r\n]/.test(node.content))))
          ) {
            // Mark the slot; the holes are filtered out after the loop so the
            // indices of the remaining siblings stay stable while iterating.
            removedWhitespace = true
            nodes[i] = null as any
          } else {
            // Otherwise, the whitespace is condensed into a single space
            node.content = ' '
          }
        } else if (shouldCondense) {
          // in condense mode, consecutive whitespaces in text are condensed
          // down to a single space.
          node.content = node.content.replace(/[\t\r\n\f ]+/g, ' ')
        }
      } else {
        // #6410 normalize windows newlines in <pre>:
        // in SSR, browsers normalize server-rendered \r\n into a single \n
        // in the DOM
        node.content = node.content.replace(/\r\n/g, '\n')
      }
    }
  }
  if (removedWhitespace) {
    el.children = nodes.filter(Boolean)
  }
 }
 /**
  * Appends `node` as the last child of the current parent (the innermost
  * open element, or the root when no element is open).
  */
 function addNode(node: TemplateChildNode) {
  const parent = getParent()
  parent.children.push(node)
 }
 /**
  * Returns the node that new children should be attached to: the innermost
  * open element, falling back to the root node when none is open.
  */
 function getParent() {
  const innermost = elementStack[elementStack.length - 1]
  return innermost || currentRoot
 }
 /**
  * Returns the tokenizer and the module-level parser state to their initial
  * values so that a new input can be parsed. The shared arrays are emptied
  * in place rather than replaced.
  */
 function reset() {
  tokenizer.reset()

  // pending tag / attribute scratch values
  tagname = attribname = attribvalue = ''
  attribs = null

  // source positions
  startIndex = endIndex = 0

  // open-tag bookkeeping
  stack.splice(0, stack.length)
  elementStack.splice(0, elementStack.length)
  // back to the initial single `false` entry
  foreignContext.splice(0, foreignContext.length, false)
 }
 /**
  * Parses a template string into a root AST node.
  *
  * Resets the module-level parser state, stores the trimmed input and the
  * options, creates a fresh root and runs the shared tokenizer over the input;
  * the tokenizer callbacks fill in the root's children.
  *
  * Because all state lives at module level, this function is not reentrant.
  *
  * @param options - parser options; `htmlMode` and `whitespace` are the ones
  *   read by the code visible in this file
  * @returns the root node created for this call
  */
 export function baseParse(
-  content: string,
+  input: string,
  options: ParserOptions = {}
 ): RootNode {
-  const root = createRoot([])
+  reset()
-  parser.parse(content)
+  currentInput = input.trim()
  currentOptions = options
  htmlMode = !!options.htmlMode
  const root = (currentRoot = createRoot([]))
  tokenizer.parse(currentInput)
  // temp hack for ts
  // NOTE(review): this prints `endIndex` to stdout on every parse just to keep
  // the variable "used"; should be removed before this leaves WIP.
  console.log(endIndex)
  return root
 }