add HtmlParser

This commit is contained in:
Ivan Kopeykin 2020-09-24 00:01:19 +03:00
parent 8295202bc8
commit 7b762de9f0
8 changed files with 402 additions and 1 deletions

View File

@ -217,7 +217,9 @@
"gitter", "gitter",
"codecov", "codecov",
"opencollective", "opencollective",
"dependabot" "dependabot",
"domelementtype",
"domhandler"
], ],
"ignoreRegExpList": ["/Author.+/", "/data:.*/", "/\"mappings\":\".+\"/"], "ignoreRegExpList": ["/Author.+/", "/data:.*/", "/\"mappings\":\".+\"/"],
"ignorePaths": ["**/dist/**", "examples/**/README.md"] "ignorePaths": ["**/dist/**", "examples/**/README.md"]

209
lib/html/HtmlParser.js Normal file
View File

@ -0,0 +1,209 @@
/*
MIT License http://www.opensource.org/licenses/mit-license.php
Author Ivan Kopeykin @vankop
*/
"use strict";
const ElementType = require("domelementtype");
const { DomHandler } = require("domhandler");
const { Parser: HtmlParser2 } = require("htmlparser2");
const { SyncBailHook, HookMap } = require("tapable");
const Parser = require("../Parser");
/** @typedef {import("htmlparser2").ParserOptions} HtmlParserOptions */
/** @typedef {import("htmlparser2").DomHandlerOptions} HtmlParserDomHandlerOptions */
/** @typedef {import("domhandler").Node} DomNode */
/** @typedef {import("domhandler").Element} DomHandlerElement */
/** @typedef {import("domhandler").DataNode} DomDataNode */
/** @typedef {import("../Parser").ParserState} ParserState */
/** @typedef {import("../Parser").PreparsedAst} PreparsedAst */
/** @typedef {{[k: string]: {value: string, range: Readonly<[number, number]>}}} TagAttributes */
/** @typedef {Omit<DomHandlerElement, 'attribs'> & {attribs: TagAttributes}} DomElement */
/**
 * Webpack needs a custom handler to get the source indexes of attributes
 * in the DOM structure.
 *
 * While a tag is being tokenized, `onattribute` records a range for every
 * attribute; `onopentag` then replaces each string entry in the element's
 * `attribs` with a `{ value, range }` object.
 */
class CustomDomHandler extends DomHandler {
	/**
	 * @param {Function=} cb forwarded unchanged to DomHandler
	 * @param {Object=} options forwarded unchanged to DomHandler
	 * @param {Function=} elementCb forwarded unchanged to DomHandler
	 * @param {function(Error): void=} errorCb installed as `onerror`
	 */
	constructor(cb, options, elementCb, errorCb) {
		super(cb, options, elementCb);
		/**
		 * Ranges collected for the tag that is currently being opened,
		 * `undefined` while no attribute has been seen.
		 * @type {{[k: string]: [number, number]} | undefined}
		 */
		this._attributes = undefined;
		// process errors, if any
		this.onerror = errorCb;
	}

	// cspell:word onattribute
	/**
	 * Records the range of one attribute value.
	 * @param {string} name attribute name
	 * @param {string} value attribute value as reported by the parser
	 * @returns {void}
	 */
	onattribute(name, value) {
		// prototype-less map: attribute names such as "constructor" must not
		// collide with inherited Object members
		if (!this._attributes) this._attributes = Object.create(null);
		// Keep the range of the first occurrence of a repeated attribute.
		// NOTE(review): htmlparser2 is expected to keep the first value of a
		// duplicated attribute in `attribs` - confirm against the pinned version.
		if (name in this._attributes) return;
		//@ts-expect-error
		const tokenizer = this._parser._tokenizer;
		const html = tokenizer._buffer;
		const endIndex = tokenizer._index;
		// NOTE(review): assumes `value.length` equals the length of the raw
		// source text (presumably not true when entities get decoded) - confirm.
		const startIndex = endIndex - value.length;
		// a quote at the tokenizer position means the value was quoted
		const unquoted = html[endIndex] !== '"' && html[endIndex] !== "'";
		// for quoted values the range start is moved back onto the opening quote
		this._attributes[name] = [unquoted ? startIndex : startIndex - 1, endIndex];
	}

	// cspell:word onopentag
	/**
	 * Lets DomHandler create the element, then attaches the recorded ranges.
	 * @param {string} name tag name
	 * @param {{[k: string]: string}} attributes attributes of the tag
	 * @returns {void}
	 */
	onopentag(name, attributes) {
		super.onopentag(name, attributes);
		const ranges = this._attributes;
		// reset first so the next tag always starts with a clean slate
		this._attributes = undefined;
		// tags without attributes never trigger onattribute: nothing to attach
		if (!ranges) return;
		//@ts-expect-error
		const attribs = this._tagStack[this._tagStack.length - 1].attribs;
		for (const attributeName of Object.keys(ranges)) {
			const value = attribs[attributeName];
			attribs[attributeName] = {
				value,
				range: ranges[attributeName]
			};
		}
	}
}
/**
 * Parser that turns an HTML source into a DOM (via htmlparser2 and
 * CustomDomHandler) and reports the parsed nodes through hooks.
 */
class HtmlParser extends Parser {
	/**
	 * @param {HtmlParserOptions=} options htmlparser2 parser options
	 */
	constructor(options) {
		super();
		this._options = options;
		this.hooks = Object.freeze({
			/** @type {HookMap<SyncBailHook<[DomElement], true | void | undefined | null>>} */
			tag: new HookMap(() => new SyncBailHook(["tag"])),
			/** @type {SyncBailHook<[DomDataNode], void | undefined | null>} */
			text: new SyncBailHook(["text"]),
			/** @type {SyncBailHook<[DomDataNode], void | undefined | null>} */
			directive: new SyncBailHook(["directive"]),
			/** @type {SyncBailHook<[DomDataNode], void | undefined | null>} */
			comment: new SyncBailHook(["comment"])
		});
		/**
		 * State of the parse call currently in progress, `undefined` otherwise.
		 * @type {ParserState}
		 */
		this.state = undefined;
	}

	/**
	 * Walks every node of the list in order.
	 * @param {DomNode[]} nodes nodes
	 * @returns {void}
	 */
	walkNodes(nodes) {
		for (const node of nodes) this.walkNode(node);
	}

	/**
	 * Dispatches a node to the walker matching its type.
	 * @param {DomNode} node node
	 * @returns {void}
	 */
	walkNode(node) {
		switch (node.type) {
			case ElementType.Script:
			case ElementType.Style:
			case ElementType.Tag:
				this.walkElement(/** @type {DomElement} */ (node));
				break;
			case ElementType.Comment:
				this.walkComment(/** @type {DomDataNode} */ (node));
				break;
			case ElementType.Directive:
				this.walkDirective(/** @type {DomDataNode} */ (node));
				break;
			case ElementType.Text:
				this.walkText(/** @type {DomDataNode} */ (node));
				break;
			// CDATA and doctype nodes are intentionally not reported
			case ElementType.CDATA:
			case ElementType.Doctype:
				break;
		}
	}

	/**
	 * Calls the `tag` hook registered for the element's tag name.
	 * NOTE(review): children of the element are not walked here, so only the
	 * nodes passed to walkNodes trigger hooks - confirm this is intended.
	 * @param {DomElement} element element
	 * @returns {void}
	 */
	walkElement(element) {
		const name = element.tagName;
		this.hooks.tag.for(name).call(element);
	}

	/**
	 * Calls the `text` hook.
	 * @param {DomDataNode} node text node
	 * @returns {void}
	 */
	walkText(node) {
		this.hooks.text.call(node);
	}

	/**
	 * Calls the `directive` hook.
	 * @param {DomDataNode} node directive node
	 * @returns {void}
	 */
	walkDirective(node) {
		this.hooks.directive.call(node);
	}

	/**
	 * Calls the `comment` hook.
	 * @param {DomDataNode} node comment node
	 * @returns {void}
	 */
	walkComment(node) {
		this.hooks.comment.call(node);
	}

	/**
	 * Parses the source and walks the resulting top-level nodes. While the
	 * walk is running `this.state` is the given state; the previous value is
	 * restored afterwards, also when parsing or a hook throws.
	 * @param {string | Buffer | PreparsedAst} source the source to parse
	 * @param {ParserState} state the parser state
	 * @returns {ParserState} the parser state
	 * @throws {Error} when source is null, or the first error reported while parsing
	 */
	parse(source, state) {
		if (source === null) {
			throw new Error("source must not be null");
		}
		if (Buffer.isBuffer(source)) {
			source = source.toString("utf-8");
		}
		const oldState = this.state;
		// expose the state to hooks for the duration of the walk
		this.state = state;
		try {
			const dom = HtmlParser._parse(
				/** @type {string} */ (source),
				this._options
			);
			this.walkNodes(dom);
		} finally {
			this.state = oldState;
		}
		return state;
	}

	/**
	 * Builds the DOM for the code. Start and end indices are always enabled
	 * on the DOM handler, regardless of the given options.
	 * @param {string} code code
	 * @param {HtmlParserOptions} options options
	 * @private
	 * @returns {DomNode[]} dom
	 */
	static _parse(code, options) {
		/** @type {HtmlParserOptions & HtmlParserDomHandlerOptions} */
		const htmlParserOptions = {
			...options,
			withStartIndices: true,
			withEndIndices: true
		};
		let dom;
		let errors = [];
		try {
			const handler = new CustomDomHandler(
				undefined,
				htmlParserOptions,
				undefined,
				// errors reported through the handler's onerror are collected
				e => errors.push(e)
			);
			new HtmlParser2(handler, options).end(code);
			dom = handler.dom;
		} catch (e) {
			// a thrown error takes precedence over reported ones
			errors = [e];
		}
		if (errors.length > 0) throw errors[0];
		return dom;
	}
}
module.exports = HtmlParser;

View File

@ -0,0 +1,25 @@
/*
MIT License http://www.opensource.org/licenses/mit-license.php
Author Ivan Kopeykin @vankop
*/
"use strict";
/** @typedef {import("domhandler").NodeWithChildren} DomNodeWithChildren */
/**
 * Returns the range spanned by the children of a node: from the
 * `startIndex` of its first child to the `endIndex` of its last child.
 * @param {DomNodeWithChildren} node node with children
 * @returns {[number, number]|null} range, or null when the node has no first child
 * @example
 * for
 * <script> void 0;</script>
 * range of text node " void 0;" will be returned
 */
function childrenRange(node) {
	const head = node.firstChild;
	return head ? [head.startIndex, node.lastChild.endIndex] : null;
}
exports.childrenRange = childrenRange;

View File

@ -307,6 +307,12 @@ module.exports = mergeExports(fn, {
} }
}, },
html: {
get HtmlParser() {
return require("./html/HtmlParser");
}
},
javascript: { javascript: {
get EnableChunkLoadingPlugin() { get EnableChunkLoadingPlugin() {
return require("./javascript/EnableChunkLoadingPlugin"); return require("./javascript/EnableChunkLoadingPlugin");

View File

@ -13,11 +13,14 @@
"@webassemblyjs/wasm-parser": "1.9.0", "@webassemblyjs/wasm-parser": "1.9.0",
"acorn": "^7.4.0", "acorn": "^7.4.0",
"chrome-trace-event": "^1.0.2", "chrome-trace-event": "^1.0.2",
"domelementtype": "^2.0.2",
"domhandler": "^3.0.0",
"enhanced-resolve": "^5.0.0", "enhanced-resolve": "^5.0.0",
"eslint-scope": "^5.1.0", "eslint-scope": "^5.1.0",
"events": "^3.2.0", "events": "^3.2.0",
"glob-to-regexp": "^0.4.1", "glob-to-regexp": "^0.4.1",
"graceful-fs": "^4.2.4", "graceful-fs": "^4.2.4",
"htmlparser2": "^4.1.0",
"json-parse-better-errors": "^1.0.2", "json-parse-better-errors": "^1.0.2",
"loader-runner": "^4.0.0", "loader-runner": "^4.0.0",
"mime-types": "^2.1.27", "mime-types": "^2.1.27",

View File

@ -0,0 +1,39 @@
"use strict";
const HtmlParser = require("../lib/html/HtmlParser");
// htmlparser2 options shared by every case below
const options = {
	decodeEntities: false,
	lowerCaseTags: false,
	lowerCaseAttributeNames: false,
	recognizeCDATA: true,
	recognizeSelfClosing: true
};

describe("correct attributes range", () => {
	// every case wraps an attribute value between these two fragments
	const pre = "<img src=";
	const post = "/>";

	/**
	 * Parses the code with a fresh parser and returns the range reported
	 * for the `src` attribute of the `img` tag.
	 * @param {string} code html to parse
	 * @returns {[number, number]} reported range
	 */
	const parseSrcRange = code => {
		let captured;
		const testParser = new HtmlParser(options);
		testParser.hooks.tag.for("img").tap("Test", ({ attribs }) => {
			captured = attribs.src.range;
		});
		testParser.parse(code, {});
		return captured;
	};

	it("with quotes", () => {
		const code = `${pre}"http://ok.ok"${post}`;
		const expected = [pre.length, code.length - post.length - 1];
		expect(parseSrcRange(code)).toEqual(expected);
	});

	it("without quotes", () => {
		const code = `${pre}nosrc ${post}`;
		const expected = [pre.length, code.length - post.length - 1];
		expect(parseSrcRange(code)).toEqual(expected);
	});
});

72
types.d.ts vendored
View File

@ -4,6 +4,7 @@
* Run `yarn special-lint-fix` to update * Run `yarn special-lint-fix` to update
*/ */
import { DataNode, Element, Node as NodeImport } from "domhandler/lib/node";
import { import {
ArrayExpression, ArrayExpression,
ArrayPattern, ArrayPattern,
@ -78,6 +79,7 @@ import {
YieldExpression YieldExpression
} from "estree"; } from "estree";
import { Stats as FsStats, WriteStream } from "fs"; import { Stats as FsStats, WriteStream } from "fs";
import { ParserOptions } from "htmlparser2/lib/Parser";
import { default as ValidationError } from "schema-utils/declarations/ValidationError"; import { default as ValidationError } from "schema-utils/declarations/ValidationError";
import { import {
AsArray, AsArray,
@ -3581,6 +3583,73 @@ declare class HotModuleReplacementPlugin {
apply(compiler: Compiler): void; apply(compiler: Compiler): void;
static getParserHooks(parser: JavascriptParser): HMRJavascriptParserHooks; static getParserHooks(parser: JavascriptParser): HMRJavascriptParserHooks;
} }
/**
 * Type declaration of the HtmlParser class (exported through the `html`
 * namespace further down in this file).
 * NOTE(review): this file states it is updated via `yarn special-lint-fix`,
 * so hand-written comments here are presumably lost on regeneration.
 */
declare class HtmlParser extends Parser {
constructor(options?: ParserOptions);
// `tag` is a map of bail hooks; its argument is an Element whose `attribs`
// entries are `{ value, range }` objects instead of plain strings.
hooks: Readonly<{
tag: HookMap<
SyncBailHook<
[
Pick<
Element,
| "type"
| "name"
| "tagName"
| "children"
| "firstChild"
| "lastChild"
| "childNodes"
| "parent"
| "prev"
| "next"
| "startIndex"
| "endIndex"
| "nodeType"
| "parentNode"
| "previousSibling"
| "nextSibling"
> & {
attribs: {
[index: string]: { value: string; range: [number, number] };
};
}
],
true | void
>
>;
text: SyncBailHook<[DataNode], void>;
directive: SyncBailHook<[DataNode], void>;
comment: SyncBailHook<[DataNode], void>;
}>;
state: Record<string, any> & ParserStateBase;
walkNodes(nodes: NodeImport[]): void;
walkNode(node: NodeImport): void;
// takes the same element shape as the argument of the `tag` hook above
walkElement(
element: Pick<
Element,
| "type"
| "name"
| "tagName"
| "children"
| "firstChild"
| "lastChild"
| "childNodes"
| "parent"
| "prev"
| "next"
| "startIndex"
| "endIndex"
| "nodeType"
| "parentNode"
| "previousSibling"
| "nextSibling"
> & {
attribs: { [index: string]: { value: string; range: [number, number] } };
}
): void;
walkText(node: DataNode): void;
walkDirective(node: DataNode): void;
walkComment(node: DataNode): void;
}
declare class HttpUriPlugin { declare class HttpUriPlugin {
constructor(); constructor();
@ -9941,6 +10010,9 @@ declare namespace exports {
HashedModuleIdsPlugin HashedModuleIdsPlugin
}; };
} }
export namespace html {
export { HtmlParser };
}
export namespace javascript { export namespace javascript {
export { export {
EnableChunkLoadingPlugin, EnableChunkLoadingPlugin,

View File

@ -2260,6 +2260,20 @@ doctypes@^1.1.0:
resolved "https://registry.yarnpkg.com/doctypes/-/doctypes-1.1.0.tgz#ea80b106a87538774e8a3a4a5afe293de489e0a9" resolved "https://registry.yarnpkg.com/doctypes/-/doctypes-1.1.0.tgz#ea80b106a87538774e8a3a4a5afe293de489e0a9"
integrity sha1-6oCxBqh1OHdOijpKWv4pPeSJ4Kk= integrity sha1-6oCxBqh1OHdOijpKWv4pPeSJ4Kk=
dom-serializer@^1.0.1:
version "1.1.0"
resolved "https://registry.yarnpkg.com/dom-serializer/-/dom-serializer-1.1.0.tgz#5f7c828f1bfc44887dc2a315ab5c45691d544b58"
integrity sha512-ox7bvGXt2n+uLWtCRLybYx60IrOlWL/aCebWJk1T0d4m3y2tzf4U3ij9wBMUb6YJZpz06HCCYuyCDveE2xXmzQ==
dependencies:
domelementtype "^2.0.1"
domhandler "^3.0.0"
entities "^2.0.0"
domelementtype@^2.0.1, domelementtype@^2.0.2:
version "2.0.2"
resolved "https://registry.yarnpkg.com/domelementtype/-/domelementtype-2.0.2.tgz#f3b6e549201e46f588b59463dd77187131fe6971"
integrity sha512-wFwTwCVebUrMgGeAwRL/NhZtHAUyT9n9yg4IMDwf10+6iCMxSkVq9MGCVEH+QZWo1nNidy8kNvwmv4zWHDTqvA==
domexception@^1.0.1: domexception@^1.0.1:
version "1.0.1" version "1.0.1"
resolved "https://registry.yarnpkg.com/domexception/-/domexception-1.0.1.tgz#937442644ca6a31261ef36e3ec677fe805582c90" resolved "https://registry.yarnpkg.com/domexception/-/domexception-1.0.1.tgz#937442644ca6a31261ef36e3ec677fe805582c90"
@ -2267,6 +2281,22 @@ domexception@^1.0.1:
dependencies: dependencies:
webidl-conversions "^4.0.2" webidl-conversions "^4.0.2"
domhandler@^3.0.0:
version "3.0.0"
resolved "https://registry.yarnpkg.com/domhandler/-/domhandler-3.0.0.tgz#51cd13efca31da95bbb0c5bee3a48300e333b3e9"
integrity sha512-eKLdI5v9m67kbXQbJSNn1zjh0SDzvzWVWtX+qEI3eMjZw8daH9k8rlj1FZY9memPwjiskQFbe7vHVVJIAqoEhw==
dependencies:
domelementtype "^2.0.1"
domutils@^2.0.0:
version "2.3.0"
resolved "https://registry.yarnpkg.com/domutils/-/domutils-2.3.0.tgz#6469c63a3da2de0c3016f3a59e6a969e10705bce"
integrity sha512-xWC75PM3QF6MjE5e58OzwTX0B/rPQnlqH0YyXB/c056RtVJA+eu60da2I/bdnEHzEYC00g8QaZUlAbqOZVbOsw==
dependencies:
dom-serializer "^1.0.1"
domelementtype "^2.0.1"
domhandler "^3.0.0"
dot-prop@^5.2.0: dot-prop@^5.2.0:
version "5.2.0" version "5.2.0"
resolved "https://registry.yarnpkg.com/dot-prop/-/dot-prop-5.2.0.tgz#c34ecc29556dc45f1f4c22697b6f4904e0cc4fcb" resolved "https://registry.yarnpkg.com/dot-prop/-/dot-prop-5.2.0.tgz#c34ecc29556dc45f1f4c22697b6f4904e0cc4fcb"
@ -2328,6 +2358,11 @@ enquirer@^2.3.6:
dependencies: dependencies:
ansi-colors "^4.1.1" ansi-colors "^4.1.1"
entities@^2.0.0:
version "2.0.3"
resolved "https://registry.yarnpkg.com/entities/-/entities-2.0.3.tgz#5c487e5742ab93c15abb5da22759b8590ec03b7f"
integrity sha512-MyoZ0jgnLvB2X3Lg5HqpFmn1kybDiIfEQmKzTb5apr51Rb+T3KdmMiqa70T+bhGnyv7bQ6WMj2QMHpGMmlrUYQ==
errno@^0.1.1, errno@^0.1.3: errno@^0.1.1, errno@^0.1.3:
version "0.1.7" version "0.1.7"
resolved "https://registry.yarnpkg.com/errno/-/errno-0.1.7.tgz#4684d71779ad39af177e3f007996f7c67c852618" resolved "https://registry.yarnpkg.com/errno/-/errno-0.1.7.tgz#4684d71779ad39af177e3f007996f7c67c852618"
@ -3202,6 +3237,16 @@ html-escaper@^2.0.0:
resolved "https://registry.yarnpkg.com/html-escaper/-/html-escaper-2.0.2.tgz#dfd60027da36a36dfcbe236262c00a5822681453" resolved "https://registry.yarnpkg.com/html-escaper/-/html-escaper-2.0.2.tgz#dfd60027da36a36dfcbe236262c00a5822681453"
integrity sha512-H2iMtd0I4Mt5eYiapRdIDjp+XzelXQ0tFE4JS7YFwFevXXMmOp9myNrUvCg0D6ws8iqkRPBfKHgbwig1SmlLfg== integrity sha512-H2iMtd0I4Mt5eYiapRdIDjp+XzelXQ0tFE4JS7YFwFevXXMmOp9myNrUvCg0D6ws8iqkRPBfKHgbwig1SmlLfg==
htmlparser2@^4.1.0:
version "4.1.0"
resolved "https://registry.yarnpkg.com/htmlparser2/-/htmlparser2-4.1.0.tgz#9a4ef161f2e4625ebf7dfbe6c0a2f52d18a59e78"
integrity sha512-4zDq1a1zhE4gQso/c5LP1OtrhYTncXNSpvJYtWJBtXAETPlMfi3IFNjGuQbYLuVY4ZR0QMqRVvo4Pdy9KLyP8Q==
dependencies:
domelementtype "^2.0.1"
domhandler "^3.0.0"
domutils "^2.0.0"
entities "^2.0.0"
http-signature@~1.2.0: http-signature@~1.2.0:
version "1.2.0" version "1.2.0"
resolved "https://registry.yarnpkg.com/http-signature/-/http-signature-1.2.0.tgz#9aecd925114772f3d95b65a60abb8f7c18fbace1" resolved "https://registry.yarnpkg.com/http-signature/-/http-signature-1.2.0.tgz#9aecd925114772f3d95b65a60abb8f7c18fbace1"