/**
 * DomHandler that additionally records where each attribute value sits in the
 * parsed source, because webpack needs attribute positions in the DOM structure.
 * Once a tag has been opened, every attribute that was reported through
 * `onattribute` is stored on the element as `{ value, range }` instead of the
 * plain string value.
 */
class CustomDomHandler extends DomHandler {
	/**
	 * @param {Function=} cb forwarded unchanged to DomHandler
	 * @param {HtmlParserDomHandlerOptions=} options forwarded unchanged to DomHandler
	 * @param {Function=} elementCb forwarded unchanged to DomHandler
	 * @param {(function(Error): void)=} errorCb installed as this handler's `onerror`
	 */
	constructor(cb, options, elementCb, errorCb) {
		super(cb, options, elementCb);
		/**
		 * Ranges collected by `onattribute` for the tag currently being opened;
		 * `undefined` while no attribute has been seen for it.
		 * @type {{[k: string]: [number, number]} | undefined}
		 */
		this._attributes = undefined;
		// process errors, if any
		this.onerror = errorCb;
	}

	// cspell:word onattribute
	/**
	 * Remembers the range of an attribute value, computed from the tokenizer's
	 * current position. When the character at that position is a quote, the
	 * range starts one position before the value (i.e. at the opening quote).
	 * NOTE(review): the start is derived from `value.length`, so it presumably
	 * drifts whenever the reported value differs in length from the raw source
	 * (e.g. decoded entities) — confirm.
	 * @param {string} name attribute name
	 * @param {string} value attribute value as reported by the parser
	 * @returns {void}
	 */
	onattribute(name, value) {
		if (!this._attributes) this._attributes = {};
		//@ts-expect-error
		const tokenizer = this._parser._tokenizer;
		const html = tokenizer._buffer;
		const endIndex = tokenizer._index;
		const startIndex = endIndex - value.length;
		const unquoted = html[endIndex] !== '"' && html[endIndex] !== "'";
		this._attributes[name] = [unquoted ? startIndex : startIndex - 1, endIndex];
	}

	// cspell:word onopentag
	/**
	 * Lets DomHandler create the element, then rewrites each recorded attribute
	 * of that element to `{ value, range }` and clears the recorded ranges.
	 * @param {string} name tag name
	 * @param {{[k: string]: string}} attributes attributes of the tag
	 * @returns {void}
	 */
	onopentag(name, attributes) {
		super.onopentag(name, attributes);

		const ranges = this._attributes;
		// A tag without attributes never triggers onattribute, so there is
		// nothing to annotate (and Object.keys(undefined) would throw).
		if (!ranges) return;
		this._attributes = undefined;

		//@ts-expect-error
		const attribs = this._tagStack[this._tagStack.length - 1].attribs;

		for (const attributeName of Object.keys(ranges)) {
			attribs[attributeName] = {
				value: attribs[attributeName],
				range: ranges[attributeName]
			};
		}
	}
}

/**
 * Parser for HTML sources. It builds a DOM with htmlparser2 and reports the
 * nodes handed to `walkNodes` through hooks.
 */
class HtmlParser extends Parser {
	/**
	 * @param {HtmlParserOptions=} options htmlparser2 parser options
	 */
	constructor(options) {
		super();

		this._options = options;
		this.hooks = Object.freeze({
			/** @type {HookMap<SyncBailHook<[DomElement], true | void | undefined | null>>} */
			tag: new HookMap(() => new SyncBailHook(["tag"])),
			/** @type {SyncBailHook<[DomDataNode], void | undefined | null>} */
			text: new SyncBailHook(["text"]),
			/** @type {SyncBailHook<[DomDataNode], void | undefined | null>} */
			directive: new SyncBailHook(["directive"]),
			/** @type {SyncBailHook<[DomDataNode], void | undefined | null>} */
			comment: new SyncBailHook(["comment"])
		});

		/** @type {ParserState} */
		this.state = undefined;
	}

	/**
	 * Visits every node of the list in order.
	 * @param {DomNode[]} nodes nodes
	 * @returns {void}
	 */
	walkNodes(nodes) {
		for (const node of nodes) this.walkNode(node);
	}

	/**
	 * Dispatches a node to the walk method matching its type. CDATA and doctype
	 * nodes are ignored.
	 * @param {DomNode} node node
	 * @returns {void}
	 */
	walkNode(node) {
		switch (node.type) {
			case ElementType.Script:
			case ElementType.Style:
			case ElementType.Tag:
				this.walkElement(/** @type {DomElement} */ (node));
				break;
			case ElementType.Comment:
				this.walkComment(/** @type {DomDataNode} */ (node));
				break;
			case ElementType.Directive:
				this.walkDirective(/** @type {DomDataNode} */ (node));
				break;
			case ElementType.Text:
				this.walkText(/** @type {DomDataNode} */ (node));
				break;
			case ElementType.CDATA:
			case ElementType.Doctype:
				break;
		}
	}

	/**
	 * Calls the `tag` hook registered for the element's tag name.
	 * NOTE(review): children of the element are not visited here; handlers
	 * presumably have to call `walkNodes(element.children)` themselves — confirm
	 * that this is intended.
	 * @param {DomElement} element element
	 * @returns {void}
	 */
	walkElement(element) {
		const name = element.tagName;
		this.hooks.tag.for(name).call(element);
	}

	/**
	 * Calls the `text` hook.
	 * @param {DomDataNode} node text node
	 * @returns {void}
	 */
	walkText(node) {
		this.hooks.text.call(node);
	}

	/**
	 * Calls the `directive` hook.
	 * @param {DomDataNode} node directive node
	 * @returns {void}
	 */
	walkDirective(node) {
		this.hooks.directive.call(node);
	}

	/**
	 * Calls the `comment` hook.
	 * @param {DomDataNode} node comment node
	 * @returns {void}
	 */
	walkComment(node) {
		this.hooks.comment.call(node);
	}

	/**
	 * Parses the source and walks the top-level nodes of the resulting DOM.
	 * While walking, `this.state` is the given state; the previous value is put
	 * back afterwards, also when parsing or a hook throws.
	 * @param {string | Buffer | PreparsedAst} source the source to parse
	 * @param {ParserState} state the parser state
	 * @returns {ParserState} the parser state
	 */
	parse(source, state) {
		if (source === null) {
			throw new Error("source must not be null");
		}
		if (Buffer.isBuffer(source)) {
			source = source.toString("utf-8");
		}

		const oldState = this.state;
		// expose the state to hook handlers for the duration of the walk
		this.state = state;
		try {
			const dom = HtmlParser._parse(
				/** @type {string} */ (source),
				this._options
			);
			this.walkNodes(dom);
		} finally {
			this.state = oldState;
		}

		return state;
	}

	/**
	 * Builds the DOM for the given code. Start and end indices are always
	 * requested from the DOM handler, whatever `options` says.
	 * @param {string} code code
	 * @param {HtmlParserOptions} options options
	 * @private
	 * @returns {DomNode[]} dom
	 * @throws the first error reported by the parser, or whatever parsing throws
	 */
	static _parse(code, options) {
		/** @type {HtmlParserOptions & HtmlParserDomHandlerOptions} */
		const htmlParserOptions = {
			...options,
			withStartIndices: true,
			withEndIndices: true
		};

		// errors reported through the handler's onerror callback
		const errors = [];
		const handler = new CustomDomHandler(
			undefined,
			htmlParserOptions,
			undefined,
			e => errors.push(e)
		);
		new HtmlParser2(handler, options).end(code);

		if (errors.length > 0) throw errors[0];

		return handler.dom;
	}
}
/**
 * Returns the source range covered by the children of a node: from the start
 * index of the first child to the end index of the last child.
 * @param {DomNodeWithChildren} node node with children
 * @returns {[number, number]|null} range, or null when the node has no
 * children or the children carry no numeric start/end index
 * @example
 * for <script> void 0;</script>
 * range of text node " void 0;" will be returned
 */
function childrenRange(node) {
	const { firstChild, lastChild } = node;
	if (!firstChild || !lastChild) return null;

	const start = firstChild.startIndex;
	const end = lastChild.endIndex;
	// Indices that were not recorded are not numbers; never hand out a
	// range like [null, null], which would break the declared return type.
	if (typeof start !== "number" || typeof end !== "number") return null;

	return [start, end];
}
+ it("with quotes", () => { + let range; + const testParser = new HtmlParser(options); + testParser.hooks.tag.for("img").tap("Test", element => { + range = element.attribs.src.range; + }); + const pre = ""; + const code = `${pre}"http://ok.ok"${post}`; + testParser.parse(code, {}); + expect(range).toEqual([pre.length, code.length - post.length - 1]); + }); + + it("without quotes", () => { + let range; + const testParser = new HtmlParser(options); + testParser.hooks.tag.for("img").tap("Test", element => { + range = element.attribs.src.range; + }); + const pre = ""; + const code = `${pre}nosrc ${post}`; + testParser.parse(code, {}); + expect(range).toEqual([pre.length, code.length - post.length - 1]); + }); +}); diff --git a/types.d.ts b/types.d.ts index cda6296e9..57331e3e8 100644 --- a/types.d.ts +++ b/types.d.ts @@ -4,6 +4,7 @@ * Run `yarn special-lint-fix` to update */ +import { DataNode, Element, Node as NodeImport } from "domhandler/lib/node"; import { ArrayExpression, ArrayPattern, @@ -78,6 +79,7 @@ import { YieldExpression } from "estree"; import { Stats as FsStats, WriteStream } from "fs"; +import { ParserOptions } from "htmlparser2/lib/Parser"; import { default as ValidationError } from "schema-utils/declarations/ValidationError"; import { AsArray, @@ -3581,6 +3583,73 @@ declare class HotModuleReplacementPlugin { apply(compiler: Compiler): void; static getParserHooks(parser: JavascriptParser): HMRJavascriptParserHooks; } +declare class HtmlParser extends Parser { + constructor(options?: ParserOptions); + hooks: Readonly<{ + tag: HookMap< + SyncBailHook< + [ + Pick< + Element, + | "type" + | "name" + | "tagName" + | "children" + | "firstChild" + | "lastChild" + | "childNodes" + | "parent" + | "prev" + | "next" + | "startIndex" + | "endIndex" + | "nodeType" + | "parentNode" + | "previousSibling" + | "nextSibling" + > & { + attribs: { + [index: string]: { value: string; range: [number, number] }; + }; + } + ], + true | void + > + >; + text: 
SyncBailHook<[DataNode], void>; + directive: SyncBailHook<[DataNode], void>; + comment: SyncBailHook<[DataNode], void>; + }>; + state: Record & ParserStateBase; + walkNodes(nodes: NodeImport[]): void; + walkNode(node: NodeImport): void; + walkElement( + element: Pick< + Element, + | "type" + | "name" + | "tagName" + | "children" + | "firstChild" + | "lastChild" + | "childNodes" + | "parent" + | "prev" + | "next" + | "startIndex" + | "endIndex" + | "nodeType" + | "parentNode" + | "previousSibling" + | "nextSibling" + > & { + attribs: { [index: string]: { value: string; range: [number, number] } }; + } + ): void; + walkText(node: DataNode): void; + walkDirective(node: DataNode): void; + walkComment(node: DataNode): void; +} declare class HttpUriPlugin { constructor(); @@ -9941,6 +10010,9 @@ declare namespace exports { HashedModuleIdsPlugin }; } + export namespace html { + export { HtmlParser }; + } export namespace javascript { export { EnableChunkLoadingPlugin, diff --git a/yarn.lock b/yarn.lock index 1b4f1435b..cb7ee7e04 100644 --- a/yarn.lock +++ b/yarn.lock @@ -2260,6 +2260,20 @@ doctypes@^1.1.0: resolved "https://registry.yarnpkg.com/doctypes/-/doctypes-1.1.0.tgz#ea80b106a87538774e8a3a4a5afe293de489e0a9" integrity sha1-6oCxBqh1OHdOijpKWv4pPeSJ4Kk= +dom-serializer@^1.0.1: + version "1.1.0" + resolved "https://registry.yarnpkg.com/dom-serializer/-/dom-serializer-1.1.0.tgz#5f7c828f1bfc44887dc2a315ab5c45691d544b58" + integrity sha512-ox7bvGXt2n+uLWtCRLybYx60IrOlWL/aCebWJk1T0d4m3y2tzf4U3ij9wBMUb6YJZpz06HCCYuyCDveE2xXmzQ== + dependencies: + domelementtype "^2.0.1" + domhandler "^3.0.0" + entities "^2.0.0" + +domelementtype@^2.0.1, domelementtype@^2.0.2: + version "2.0.2" + resolved "https://registry.yarnpkg.com/domelementtype/-/domelementtype-2.0.2.tgz#f3b6e549201e46f588b59463dd77187131fe6971" + integrity sha512-wFwTwCVebUrMgGeAwRL/NhZtHAUyT9n9yg4IMDwf10+6iCMxSkVq9MGCVEH+QZWo1nNidy8kNvwmv4zWHDTqvA== + domexception@^1.0.1: version "1.0.1" resolved 
"https://registry.yarnpkg.com/domexception/-/domexception-1.0.1.tgz#937442644ca6a31261ef36e3ec677fe805582c90" @@ -2267,6 +2281,22 @@ domexception@^1.0.1: dependencies: webidl-conversions "^4.0.2" +domhandler@^3.0.0: + version "3.0.0" + resolved "https://registry.yarnpkg.com/domhandler/-/domhandler-3.0.0.tgz#51cd13efca31da95bbb0c5bee3a48300e333b3e9" + integrity sha512-eKLdI5v9m67kbXQbJSNn1zjh0SDzvzWVWtX+qEI3eMjZw8daH9k8rlj1FZY9memPwjiskQFbe7vHVVJIAqoEhw== + dependencies: + domelementtype "^2.0.1" + +domutils@^2.0.0: + version "2.3.0" + resolved "https://registry.yarnpkg.com/domutils/-/domutils-2.3.0.tgz#6469c63a3da2de0c3016f3a59e6a969e10705bce" + integrity sha512-xWC75PM3QF6MjE5e58OzwTX0B/rPQnlqH0YyXB/c056RtVJA+eu60da2I/bdnEHzEYC00g8QaZUlAbqOZVbOsw== + dependencies: + dom-serializer "^1.0.1" + domelementtype "^2.0.1" + domhandler "^3.0.0" + dot-prop@^5.2.0: version "5.2.0" resolved "https://registry.yarnpkg.com/dot-prop/-/dot-prop-5.2.0.tgz#c34ecc29556dc45f1f4c22697b6f4904e0cc4fcb" @@ -2328,6 +2358,11 @@ enquirer@^2.3.6: dependencies: ansi-colors "^4.1.1" +entities@^2.0.0: + version "2.0.3" + resolved "https://registry.yarnpkg.com/entities/-/entities-2.0.3.tgz#5c487e5742ab93c15abb5da22759b8590ec03b7f" + integrity sha512-MyoZ0jgnLvB2X3Lg5HqpFmn1kybDiIfEQmKzTb5apr51Rb+T3KdmMiqa70T+bhGnyv7bQ6WMj2QMHpGMmlrUYQ== + errno@^0.1.1, errno@^0.1.3: version "0.1.7" resolved "https://registry.yarnpkg.com/errno/-/errno-0.1.7.tgz#4684d71779ad39af177e3f007996f7c67c852618" @@ -3202,6 +3237,16 @@ html-escaper@^2.0.0: resolved "https://registry.yarnpkg.com/html-escaper/-/html-escaper-2.0.2.tgz#dfd60027da36a36dfcbe236262c00a5822681453" integrity sha512-H2iMtd0I4Mt5eYiapRdIDjp+XzelXQ0tFE4JS7YFwFevXXMmOp9myNrUvCg0D6ws8iqkRPBfKHgbwig1SmlLfg== +htmlparser2@^4.1.0: + version "4.1.0" + resolved "https://registry.yarnpkg.com/htmlparser2/-/htmlparser2-4.1.0.tgz#9a4ef161f2e4625ebf7dfbe6c0a2f52d18a59e78" + integrity 
sha512-4zDq1a1zhE4gQso/c5LP1OtrhYTncXNSpvJYtWJBtXAETPlMfi3IFNjGuQbYLuVY4ZR0QMqRVvo4Pdy9KLyP8Q== + dependencies: + domelementtype "^2.0.1" + domhandler "^3.0.0" + domutils "^2.0.0" + entities "^2.0.0" + http-signature@~1.2.0: version "1.2.0" resolved "https://registry.yarnpkg.com/http-signature/-/http-signature-1.2.0.tgz#9aecd925114772f3d95b65a60abb8f7c18fbace1"