wip: entities parsing in browser

This commit is contained in:
Evan You 2023-11-18 21:39:31 +08:00
parent caabba7590
commit 1912af04e3
7 changed files with 103 additions and 2439 deletions

View File

@ -50,7 +50,8 @@ export interface ParserOptions
*/
whitespace?: 'preserve' | 'condense'
/**
* Only needed for DOM compilers
* Only used by DOM compilers that run in the browser.
* In non-browser builds, this option is ignored.
*/
decodeEntities?: (rawText: string, asAttr: boolean) => string
/**

View File

@ -22,12 +22,20 @@ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
IN THE SOFTWARE.
*/
import { ElementNode, Position } from '../ast'
/**
* Note: entities is a non-browser-build-only dependency.
* In the browser, we use an HTML element to do the decoding.
* Make sure all imports from entities are only used in non-browser branches
* so that it can be properly treeshaken.
*/
import {
EntityDecoder,
DecodingMode,
htmlDecodeTree
htmlDecodeTree,
fromCodePoint
} from 'entities/lib/decode.js'
import { ElementNode, Position } from '../ast'
export const enum ParseMode {
BASE,
@ -170,7 +178,7 @@ export enum QuoteType {
export interface Callbacks {
ontext(start: number, endIndex: number): void
ontextentity(codepoint: number, endIndex: number): void
ontextentity(char: string, endIndex: number): void
oninterpolation(start: number, endIndex: number): void
@ -180,7 +188,7 @@ export interface Callbacks {
onclosetag(start: number, endIndex: number): void
onattribdata(start: number, endIndex: number): void
onattribentity(codepoint: number): void
onattribentity(char: string): void
onattribend(quote: QuoteType, endIndex: number): void
onattribname(start: number, endIndex: number): void
onattribnameend(endIndex: number): void
@ -233,15 +241,17 @@ export default class Tokenizer {
/** Record newline positions for fast line / column calculation */
private newlines: number[] = []
private readonly entityDecoder: EntityDecoder
private readonly entityDecoder?: EntityDecoder
constructor(
private readonly stack: ElementNode[],
private readonly cbs: Callbacks
) {
this.entityDecoder = new EntityDecoder(htmlDecodeTree, (cp, consumed) =>
this.emitCodePoint(cp, consumed)
)
if (!__BROWSER__) {
this.entityDecoder = new EntityDecoder(htmlDecodeTree, (cp, consumed) =>
this.emitCodePoint(cp, consumed)
)
}
}
public mode = ParseMode.BASE
@ -290,7 +300,7 @@ export default class Tokenizer {
}
this.state = State.BeforeTagName
this.sectionStart = this.index
} else if (c === CharCodes.Amp) {
} else if (!__BROWSER__ && c === CharCodes.Amp) {
this.startEntity()
} else if (c === this.delimiterOpen[0]) {
this.state = State.InterpolationOpen
@ -398,7 +408,7 @@ export default class Tokenizer {
!(this.mode === ParseMode.SFC && this.stack.length === 0))
) {
// We have to parse entities in <title> and <textarea> tags.
if (c === CharCodes.Amp) {
if (!__BROWSER__ && c === CharCodes.Amp) {
this.startEntity()
}
} else if (this.fastForwardTo(CharCodes.Lt)) {
@ -702,7 +712,7 @@ export default class Tokenizer {
}
}
private handleInAttributeValue(c: number, quote: number) {
if (c === quote) {
if (c === quote || (__BROWSER__ && this.fastForwardTo(quote))) {
this.cbs.onattribdata(this.sectionStart, this.index)
this.sectionStart = -1
this.cbs.onattribend(
@ -710,7 +720,7 @@ export default class Tokenizer {
this.index + 1
)
this.state = State.BeforeAttributeName
} else if (c === CharCodes.Amp) {
} else if (!__BROWSER__ && c === CharCodes.Amp) {
this.startEntity()
}
}
@ -727,7 +737,7 @@ export default class Tokenizer {
this.cbs.onattribend(QuoteType.Unquoted, this.index)
this.state = State.BeforeAttributeName
this.stateBeforeAttributeName(c)
} else if (c === CharCodes.Amp) {
} else if (!__BROWSER__ && c === CharCodes.Amp) {
this.startEntity()
}
}
@ -796,29 +806,33 @@ export default class Tokenizer {
}
private startEntity() {
this.baseState = this.state
this.state = State.InEntity
this.entityStart = this.index
this.entityDecoder.startEntity(
this.baseState === State.Text || this.baseState === State.InSpecialTag
? DecodingMode.Legacy
: DecodingMode.Attribute
)
if (!__BROWSER__) {
this.baseState = this.state
this.state = State.InEntity
this.entityStart = this.index
this.entityDecoder!.startEntity(
this.baseState === State.Text || this.baseState === State.InSpecialTag
? DecodingMode.Legacy
: DecodingMode.Attribute
)
}
}
private stateInEntity(): void {
const length = this.entityDecoder.write(this.buffer, this.index)
if (!__BROWSER__) {
const length = this.entityDecoder!.write(this.buffer, this.index)
// If `length` is positive, we are done with the entity.
if (length >= 0) {
this.state = this.baseState
// If `length` is positive, we are done with the entity.
if (length >= 0) {
this.state = this.baseState
if (length === 0) {
this.index = this.entityStart
if (length === 0) {
this.index = this.entityStart
}
} else {
// Mark buffer as consumed.
this.index = this.buffer.length - 1
}
} else {
// Mark buffer as consumed.
this.index = this.buffer.length - 1
}
}
@ -1002,8 +1016,8 @@ export default class Tokenizer {
}
private finish() {
if (this.state === State.InEntity) {
this.entityDecoder.end()
if (!__BROWSER__ && this.state === State.InEntity) {
this.entityDecoder!.end()
this.state = this.baseState
}
@ -1052,25 +1066,27 @@ export default class Tokenizer {
}
private emitCodePoint(cp: number, consumed: number): void {
if (
this.baseState !== State.Text &&
this.baseState !== State.InSpecialTag
) {
if (this.sectionStart < this.entityStart) {
this.cbs.onattribdata(this.sectionStart, this.entityStart)
}
this.sectionStart = this.entityStart + consumed
this.index = this.sectionStart - 1
if (!__BROWSER__) {
if (
this.baseState !== State.Text &&
this.baseState !== State.InSpecialTag
) {
if (this.sectionStart < this.entityStart) {
this.cbs.onattribdata(this.sectionStart, this.entityStart)
}
this.sectionStart = this.entityStart + consumed
this.index = this.sectionStart - 1
this.cbs.onattribentity(cp)
} else {
if (this.sectionStart < this.entityStart) {
this.cbs.ontext(this.sectionStart, this.entityStart)
}
this.sectionStart = this.entityStart + consumed
this.index = this.sectionStart - 1
this.cbs.onattribentity(fromCodePoint(cp))
} else {
if (this.sectionStart < this.entityStart) {
this.cbs.ontext(this.sectionStart, this.entityStart)
}
this.sectionStart = this.entityStart + consumed
this.index = this.sectionStart - 1
this.cbs.ontextentity(cp, this.sectionStart)
this.cbs.ontextentity(fromCodePoint(cp), this.sectionStart)
}
}
}
}

View File

@ -1,4 +1,3 @@
import { fromCodePoint } from 'entities/lib/decode.js'
import {
AttributeNode,
ConstantTypes,
@ -29,6 +28,7 @@ import { defaultOnError, defaultOnWarn } from '../errors'
import { forAliasRE, isCoreComponent } from '../utils'
type OptionalOptions =
| 'decodeEntities'
| 'whitespace'
| 'isNativeTag'
| 'isBuiltInComponent'
@ -37,18 +37,6 @@ type OptionalOptions =
type MergedParserOptions = Omit<Required<ParserOptions>, OptionalOptions> &
Pick<ParserOptions, OptionalOptions>
// The default decoder only provides escapes for characters reserved as part of
// the template syntax, and is only used if the custom renderer did not provide
// a platform-specific decoder.
const decodeRE = /&(gt|lt|amp|apos|quot);/g
const decodeMap: Record<string, string> = {
gt: '>',
lt: '<',
amp: '&',
apos: "'",
quot: '"'
}
export const defaultParserOptions: MergedParserOptions = {
parseMode: 'base',
delimiters: [`{{`, `}}`],
@ -56,9 +44,6 @@ export const defaultParserOptions: MergedParserOptions = {
isVoidTag: NO,
isPreTag: NO,
isCustomElement: NO,
// TODO handle entities
decodeEntities: (rawText: string): string =>
rawText.replace(decodeRE, (_, p1) => decodeMap[p1]),
onError: defaultOnError,
onWarn: defaultOnWarn,
comments: __DEV__
@ -84,8 +69,8 @@ const tokenizer = new Tokenizer(stack, {
onText(getSlice(start, end), start, end)
},
ontextentity(cp, end) {
onText(fromCodePoint(cp), end - 1, end)
ontextentity(char, end) {
onText(char, end - 1, end)
},
oninterpolation(start, end) {
@ -242,8 +227,8 @@ const tokenizer = new Tokenizer(stack, {
currentAttrEndIndex = end
},
onattribentity(codepoint) {
currentAttrValue += fromCodePoint(codepoint)
onattribentity(char) {
currentAttrValue += char
},
onattribnameend(end) {
@ -265,6 +250,13 @@ const tokenizer = new Tokenizer(stack, {
onattribend(quote, end) {
if (currentElement && currentProp) {
if (quote !== QuoteType.NoValue) {
if (__BROWSER__ && currentAttrValue.includes('&')) {
// TODO should not do this in <script> or <style>
currentAttrValue = currentOptions.decodeEntities!(
currentAttrValue,
true
)
}
if (currentProp.type === NodeTypes.ATTRIBUTE) {
// assign value
@ -422,6 +414,10 @@ function closeCurrentTag(end: number) {
}
function onText(content: string, start: number, end: number) {
if (__BROWSER__ && content.includes('&')) {
// TODO do not do this in <script> or <style>
content = currentOptions.decodeEntities!(content, false)
}
const parent = getParent()
const lastNode = parent.children[parent.children.length - 1]
if (lastNode?.type === NodeTypes.TEXT) {
@ -697,6 +693,19 @@ export function baseParse(input: string, options?: ParserOptions): RootNode {
currentInput = input
currentOptions = extend({}, defaultParserOptions, options)
if (__DEV__) {
if (!__BROWSER__ && currentOptions.decodeEntities) {
console.warn(
`[@vue/compiler-core] decodeEntities option is passed but will be ` +
`ignored in non-browser builds.`
)
} else if (__BROWSER__ && !currentOptions.decodeEntities) {
throw new Error(
`[@vue/compiler-core] decodeEntities option is required in browser builds.`
)
}
}
tokenizer.mode =
currentOptions.parseMode === 'html'
? ParseMode.HTML

View File

@ -1,133 +0,0 @@
import { ParserOptions } from '@vue/compiler-core'
import namedCharacterReferences from './namedChars.json'
// Length of the longest key in `namedCharacterReferences`.
// It is computed lazily, on the first named-reference lookup inside
// `decodeHtml`, instead of at module load, so that this file stays
// tree-shakable for browser builds.
let maxCRNameLength: number
/**
 * Decodes HTML character references (`&name;`, `&#NN;`, `&#xHH;`) in a string.
 *
 * @param rawText - Text that may contain character references.
 * @param asAttr - When true, a named reference that lacks its trailing `;`
 *   and is immediately followed by `=` or an ASCII alphanumeric is left
 *   undecoded.
 * @returns The text with every recognised reference replaced by its
 *   character(s); unrecognised sequences are copied through verbatim.
 */
export const decodeHtml: ParserOptions['decodeEntities'] = (
  rawText,
  asAttr
) => {
  let offset = 0
  const end = rawText.length
  let decodedText = ''

  // Consumes `length` characters from the front of `rawText` while keeping
  // `offset` pointing at the matching position in the original input.
  function advance(length: number) {
    offset += length
    rawText = rawText.slice(length)
  }

  while (offset < end) {
    const head = /&(?:#x?)?/i.exec(rawText)
    if (!head || offset + head.index >= end) {
      // No further reference: flush the remainder and stop.
      const remaining = end - offset
      decodedText += rawText.slice(0, remaining)
      advance(remaining)
      break
    }

    // Advance to the "&".
    decodedText += rawText.slice(0, head.index)
    advance(head.index)

    if (head[0] === '&') {
      // Named character reference.
      let name = ''
      let value: string | undefined = undefined
      // `|| ''` keeps a trailing "&" from being coerced to the string
      // "undefined", which would match the pattern and trigger a useless
      // scan of the whole reference table.
      if (/[0-9a-z]/i.test(rawText[1] || '')) {
        if (!maxCRNameLength) {
          maxCRNameLength = Object.keys(namedCharacterReferences).reduce(
            (max, name) => Math.max(max, name.length),
            0
          )
        }
        // Longest-match lookup: try the longest possible name first.
        for (let length = maxCRNameLength; !value && length > 0; --length) {
          name = rawText.slice(1, 1 + length)
          value = (namedCharacterReferences as Record<string, string>)[name]
        }
        if (value) {
          const semi = name.endsWith(';')
          if (
            asAttr &&
            !semi &&
            /[=a-z0-9]/i.test(rawText[name.length + 1] || '')
          ) {
            // In attribute values an unterminated reference followed by
            // `=` or an alphanumeric is kept as literal text.
            decodedText += '&' + name
            advance(1 + name.length)
          } else {
            decodedText += value
            advance(1 + name.length)
          }
        } else {
          decodedText += '&' + name
          advance(1 + name.length)
        }
      } else {
        decodedText += '&'
        advance(1)
      }
    } else {
      // Numeric character reference.
      // The head regex is case-insensitive, so the prefix may be "&#x" or
      // "&#X"; both introduce a hexadecimal reference.
      const hex = head[0].toLowerCase() === '&#x'
      const pattern = hex ? /^&#x([0-9a-f]+);?/i : /^&#([0-9]+);?/
      const body = pattern.exec(rawText)
      if (!body) {
        // Prefix without digits: emit it literally.
        decodedText += head[0]
        advance(head[0].length)
      } else {
        // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
        let cp = Number.parseInt(body[1], hex ? 16 : 10)
        if (cp === 0) {
          cp = 0xfffd
        } else if (cp > 0x10ffff) {
          cp = 0xfffd
        } else if (cp >= 0xd800 && cp <= 0xdfff) {
          // Surrogates are not valid scalar values.
          cp = 0xfffd
        } else if ((cp >= 0xfdd0 && cp <= 0xfdef) || (cp & 0xfffe) === 0xfffe) {
          // noop
        } else if (
          (cp >= 0x01 && cp <= 0x08) ||
          cp === 0x0b ||
          (cp >= 0x0d && cp <= 0x1f) ||
          (cp >= 0x7f && cp <= 0x9f)
        ) {
          // Control-range code points: remap when the table has an entry,
          // otherwise keep the code point unchanged.
          cp = CCR_REPLACEMENTS[cp] || cp
        }
        decodedText += String.fromCodePoint(cp)
        advance(body[0].length)
      }
    }
  }
  return decodedText
}
// Replacement table for numeric character references, keyed by the
// referenced code point and yielding the code point to emit instead.
// `decodeHtml` consults it only for code points in the control ranges it
// checks, and falls back to the original code point when no entry exists
// (e.g. 0x81, 0x8d, 0x8f, 0x90, 0x9d are absent here).
// https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
const CCR_REPLACEMENTS: Record<number, number | undefined> = {
0x80: 0x20ac,
0x82: 0x201a,
0x83: 0x0192,
0x84: 0x201e,
0x85: 0x2026,
0x86: 0x2020,
0x87: 0x2021,
0x88: 0x02c6,
0x89: 0x2030,
0x8a: 0x0160,
0x8b: 0x2039,
0x8c: 0x0152,
0x8e: 0x017d,
0x91: 0x2018,
0x92: 0x2019,
0x93: 0x201c,
0x94: 0x201d,
0x95: 0x2022,
0x96: 0x2013,
0x97: 0x2014,
0x98: 0x02dc,
0x99: 0x2122,
0x9a: 0x0161,
0x9b: 0x203a,
0x9c: 0x0153,
0x9e: 0x017e,
0x9f: 0x0178
}

File diff suppressed because it is too large Load Diff

View File

@ -1,7 +1,6 @@
import { ParserOptions, ElementNode, NodeTypes } from '@vue/compiler-core'
import { isVoidTag, isHTMLTag, isSVGTag } from '@vue/shared'
import { TRANSITION, TRANSITION_GROUP } from './runtimeHelpers'
import { decodeHtml } from './decodeHtml'
import { decodeHtmlBrowser } from './decodeHtmlBrowser'
export const enum DOMNamespaces {
@ -15,7 +14,7 @@ export const parserOptions: ParserOptions = {
isVoidTag,
isNativeTag: tag => isHTMLTag(tag) || isSVGTag(tag),
isPreTag: tag => tag === 'pre',
decodeEntities: __BROWSER__ ? decodeHtmlBrowser : decodeHtml,
decodeEntities: __BROWSER__ ? decodeHtmlBrowser : undefined,
isBuiltInComponent: (tag: string): symbol | undefined => {
if (tag === 'Transition' || tag === 'transition') {

View File

@ -215,7 +215,12 @@ function createConfig(format, output, plugins = []) {
}
function resolveExternal() {
const treeShakenDeps = ['source-map-js', '@babel/parser', 'estree-walker']
const treeShakenDeps = [
'source-map-js',
'@babel/parser',
'estree-walker',
'entities/lib/decode.js'
]
if (isGlobalBuild || isBrowserESMBuild || isCompatPackage) {
if (!packageOptions.enableNonBrowserBranches) {