2018-07-30 23:08:51 +08:00
|
|
|
/*
  MIT License http://www.opensource.org/licenses/mit-license.php
  Author Tobias Koppers @sokra
*/
|
|
|
|
|
2018-07-04 15:59:22 +08:00
|
|
|
"use strict";
|
|
|
|
|
|
|
|
// Simulations show these probabilities for a single change
// 93.1% that one group is invalidated
// 4.8% that two groups are invalidated
// 1.1% that 3 groups are invalidated
// 0.1% that 4 or more groups are invalidated
//
// And these for removing/adding 10 lexically adjacent files
// 64.5% that one group is invalidated
// 24.8% that two groups are invalidated
// 7.8% that 3 groups are invalidated
// 2.7% that 4 or more groups are invalidated
//
// And these for removing/adding 3 random files
// 0% that one group is invalidated
// 3.7% that two groups are invalidated
// 80.8% that 3 groups are invalidated
// 12.3% that 4 groups are invalidated
// 3.2% that 5 or more groups are invalidated
|
|
|
|
|
|
|
|
/**
 * Scores how alike two keys are by comparing them position by position.
 * Only the first `min(a.length, b.length)` UTF-16 code units are compared;
 * each position contributes `10 - |codeA - codeB|` points, never less than 0.
 * @param {string} a key
 * @param {string} b key
 * @returns {number} the similarity as number
 */
const similarity = (a, b) => {
  const l = Math.min(a.length, b.length);
  let dist = 0;
  for (let i = 0; i < l; i++) {
    const ca = a.charCodeAt(i);
    const cb = b.charCodeAt(i);
    // identical code units score 10, code units 10 or more apart score 0
    dist += Math.max(0, 10 - Math.abs(ca - cb));
  }
  return dist;
};
|
|
|
|
|
|
|
|
/**
 * Derives a short name from two keys: the prefix of `a` up to and including
 * the first position where `a` and `b` differ. If that prefix (compared
 * case-insensitively) is already taken, the prefix is lengthened one character
 * at a time until an unused one is found; the chosen prefix is recorded in
 * `usedNames` in lower case.
 * @param {string} a key
 * @param {string} b key
 * @param {Set<string>} usedNames set of already used names
 * @returns {string} the common part and a single char for the difference
 */
const getName = (a, b, usedNames) => {
  const l = Math.min(a.length, b.length);
  let i = 0;
  // advance past the common prefix and the first differing character
  while (i < l) {
    if (a.charCodeAt(i) !== b.charCodeAt(i)) {
      i++;
      break;
    }
    i++;
  }
  // try increasingly long prefixes of `a` until one is unused
  while (i < l) {
    const name = a.slice(0, i);
    const lowerName = name.toLowerCase();
    if (!usedNames.has(lowerName)) {
      usedNames.add(lowerName);
      return name;
    }
    i++;
  }
  // names always contain a hash, so this is always unique
  // we don't need to check usedNames nor add it
  return a;
};
|
|
|
|
|
2025-09-11 08:10:10 +08:00
|
|
|
/** @typedef {Record<string, number>} Sizes */
|
|
|
|
|
2018-12-04 23:40:06 +08:00
|
|
|
/**
 * Adds every entry of `size` onto `total` in place. Keys missing from
 * `total` are treated as 0.
 * @param {Sizes} total total size
 * @param {Sizes} size single size
 * @returns {void}
 */
const addSizeTo = (total, size) => {
  for (const key of Object.keys(size)) {
    total[key] = (total[key] || 0) + size[key];
  }
};
|
|
|
|
|
2021-03-10 05:38:45 +08:00
|
|
|
/**
 * Subtracts every entry of `size` from `total` in place.
 * A key of `size` that is absent from `total` is not defaulted to 0,
 * so the result for that key is NaN.
 * @param {Sizes} total total size
 * @param {Sizes} size single size
 * @returns {void}
 */
const subtractSizeFrom = (total, size) => {
  for (const key of Object.keys(size)) {
    total[key] -= size[key];
  }
};
|
|
|
|
|
2018-12-04 23:40:06 +08:00
|
|
|
/**
 * Sums the `size` of all given nodes per size type.
 * The result is a fresh prototype-less object, so callers may mutate it.
 * @template T
 * @param {Iterable<Node<T>>} nodes some nodes
 * @returns {Sizes} total size
 */
const sumSize = (nodes) => {
  const sum = Object.create(null);
  for (const node of nodes) {
    addSizeTo(sum, node.size);
  }
  return sum;
};
|
|
|
|
|
2023-06-12 22:21:21 +08:00
|
|
|
/**
 * Checks whether any non-zero size type exceeds its configured maximum.
 * Size types with value 0, and types without a numeric limit, are ignored.
 * @param {Sizes} size size
 * @param {Sizes} maxSize maximum size
 * @returns {boolean} true, when size is too big
 */
const isTooBig = (size, maxSize) => {
  for (const key of Object.keys(size)) {
    const s = size[key];
    if (s === 0) continue;
    const maxSizeValue = maxSize[key];
    if (typeof maxSizeValue === "number" && s > maxSizeValue) return true;
  }
  return false;
};
|
|
|
|
|
2023-06-12 22:21:21 +08:00
|
|
|
/**
 * Checks whether any non-zero size type is below its configured minimum.
 * Size types with value 0, and types without a numeric limit, are ignored.
 * @param {Sizes} size size
 * @param {Sizes} minSize minimum size
 * @returns {boolean} true, when size is too small
 */
const isTooSmall = (size, minSize) => {
  for (const key of Object.keys(size)) {
    const s = size[key];
    if (s === 0) continue;
    const minSizeValue = minSize[key];
    if (typeof minSizeValue === "number" && s < minSizeValue) return true;
  }
  return false;
};
|
|
|
|
|
2023-06-12 22:21:21 +08:00
|
|
|
/**
 * Collects the size types whose non-zero value is below the configured
 * minimum. Types with value 0, and types without a numeric limit, are skipped.
 * @param {Sizes} size size
 * @param {Sizes} minSize minimum size
 * @returns {Set<string>} set of types that are too small
 */
const getTooSmallTypes = (size, minSize) => {
  const types = new Set();
  for (const key of Object.keys(size)) {
    const s = size[key];
    if (s === 0) continue;
    const minSizeValue = minSize[key];
    if (typeof minSizeValue === "number" && s < minSizeValue) types.add(key);
  }
  return types;
};
|
|
|
|
|
2023-06-12 22:21:21 +08:00
|
|
|
/**
 * Counts the own keys of `size` that hold a non-zero value and are
 * contained in `types`.
 * @template {object} T
 * @param {T} size size
 * @param {Set<string>} types types
 * @returns {number} number of matching size types
 */
const getNumberOfMatchingSizeTypes = (size, types) => {
  let i = 0;
  for (const key of Object.keys(size)) {
    if (size[/** @type {keyof T} */ (key)] !== 0 && types.has(key)) i++;
  }
  return i;
};
|
|
|
|
|
2023-06-12 22:21:21 +08:00
|
|
|
/**
 * Adds up the values of `size` for the size types contained in `types`.
 * @param {Sizes} size size
 * @param {Set<string>} types types
 * @returns {number} selective size sum
 */
const selectiveSizeSum = (size, types) => {
  let sum = 0;
  for (const key of Object.keys(size)) {
    if (size[key] !== 0 && types.has(key)) sum += size[key];
  }
  return sum;
};
|
|
|
|
|
2018-07-04 15:59:22 +08:00
|
|
|
/**
 * A single item to be grouped, together with its sort key and its sizes.
 * @template T
 */
class Node {
  /**
   * @param {T} item item
   * @param {string} key key
   * @param {Sizes} size size
   */
  constructor(item, key, size) {
    this.item = item;
    this.key = key;
    this.size = size;
  }
}
|
|
|
|
|
2025-09-11 08:10:10 +08:00
|
|
|
/** @typedef {number[]} Similarities */
|
|
|
|
|
2018-07-04 15:59:22 +08:00
|
|
|
/**
 * An ordered list of nodes with the similarities between neighbouring nodes
 * and the summed size of all nodes.
 * @template T
 */
class Group {
  /**
   * @param {Node<T>[]} nodes nodes
   * @param {Similarities | null} similarities similarities between the nodes (length = nodes.length - 1)
   * @param {Sizes=} size size of the group
   */
  constructor(nodes, similarities, size) {
    this.nodes = nodes;
    this.similarities = similarities;
    // when no size is given it is computed from the nodes
    this.size = size || sumSize(nodes);
    /** @type {string | undefined} */
    this.key = undefined;
  }

  /**
   * Removes all nodes matching `filter` from this group and recomputes
   * `nodes`, `similarities` and `size` for the remaining nodes.
   * When every node matches, the group is left untouched and `undefined`
   * is returned.
   * @param {(node: Node<T>) => boolean} filter filter function
   * @returns {Node<T>[] | undefined} removed nodes
   */
  popNodes(filter) {
    const newNodes = [];
    const newSimilarities = [];
    const resultNodes = [];
    let lastNode;
    for (let i = 0; i < this.nodes.length; i++) {
      const node = this.nodes[i];
      if (filter(node)) {
        resultNodes.push(node);
      } else {
        if (newNodes.length > 0) {
          newSimilarities.push(
            // reuse the stored similarity when the previous kept node was
            // also the direct predecessor, otherwise compute it anew
            lastNode === this.nodes[i - 1]
              ? /** @type {Similarities} */ (this.similarities)[i - 1]
              : similarity(/** @type {Node<T>} */ (lastNode).key, node.key)
          );
        }
        newNodes.push(node);
        lastNode = node;
      }
    }
    // never empty the group completely
    if (resultNodes.length === this.nodes.length) return;
    this.nodes = newNodes;
    this.similarities = newSimilarities;
    this.size = sumSize(newNodes);
    return resultNodes;
  }
}
|
|
|
|
|
2018-12-04 23:40:06 +08:00
|
|
|
/**
 * Computes the similarity between each pair of consecutive nodes.
 * The result has one entry fewer than there are nodes (none for 0 or 1 nodes).
 * @template T
 * @param {Iterable<Node<T>>} nodes nodes
 * @returns {Similarities} similarities
 */
const getSimilarities = (nodes) => {
  // calculate similarities between lexically adjacent nodes
  /** @type {Similarities} */
  const similarities = [];
  let last;
  for (const node of nodes) {
    if (last !== undefined) {
      similarities.push(similarity(last.key, node.key));
    }
    last = node;
  }
  return similarities;
};
|
|
|
|
|
2018-07-04 15:59:22 +08:00
|
|
|
/**
 * @template T
 * @typedef {object} GroupedItems<T>
 * @property {string} key
 * @property {T[]} items
 * @property {Sizes} size
 */
|
|
|
|
|
|
|
|
/**
 * @template T
 * @typedef {object} Options
 * @property {Sizes} maxSize maximum size of a group
 * @property {Sizes} minSize minimum size of a group (preferred over maximum size)
 * @property {Iterable<T>} items a list of items
 * @property {(item: T) => Sizes} getSize function to get size of an item
 * @property {(item: T) => string} getKey function to get the key of an item
 */
|
|
|
|
|
|
|
|
/**
|
|
|
|
* @template T
|
|
|
|
* @param {Options<T>} options options object
|
|
|
|
* @returns {GroupedItems<T>[]} grouped items
|
|
|
|
*/
|
|
|
|
module.exports = ({ maxSize, minSize, items, getSize, getKey }) => {
|
|
|
|
/** @type {Group<T>[]} */
|
|
|
|
const result = [];
|
|
|
|
|
|
|
|
const nodes = Array.from(
|
|
|
|
items,
|
2025-07-17 00:13:14 +08:00
|
|
|
(item) => new Node(item, getKey(item), getSize(item))
|
2018-07-04 15:59:22 +08:00
|
|
|
);
|
|
|
|
|
|
|
|
/** @type {Node<T>[]} */
|
|
|
|
const initialNodes = [];
|
|
|
|
|
2018-11-24 16:17:16 +08:00
|
|
|
// lexically ordering of keys
|
|
|
|
nodes.sort((a, b) => {
|
|
|
|
if (a.key < b.key) return -1;
|
|
|
|
if (a.key > b.key) return 1;
|
|
|
|
return 0;
|
|
|
|
});
|
|
|
|
|
2018-07-04 15:59:22 +08:00
|
|
|
// return nodes bigger than maxSize directly as group
|
2018-12-04 23:40:06 +08:00
|
|
|
// But make sure that minSize is not violated
|
2018-07-04 15:59:22 +08:00
|
|
|
for (const node of nodes) {
|
2018-12-04 23:40:06 +08:00
|
|
|
if (isTooBig(node.size, maxSize) && !isTooSmall(node.size, minSize)) {
|
2018-07-04 15:59:22 +08:00
|
|
|
result.push(new Group([node], []));
|
|
|
|
} else {
|
|
|
|
initialNodes.push(node);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (initialNodes.length > 0) {
|
2018-12-04 23:40:06 +08:00
|
|
|
const initialGroup = new Group(initialNodes, getSimilarities(initialNodes));
|
2018-07-04 15:59:22 +08:00
|
|
|
|
2023-06-12 22:21:21 +08:00
|
|
|
/**
|
|
|
|
* @param {Group<T>} group group
|
2025-09-11 08:10:10 +08:00
|
|
|
* @param {Sizes} consideredSize size of the group to consider
|
2023-06-12 22:21:21 +08:00
|
|
|
* @returns {boolean} true, if the group was modified
|
|
|
|
*/
|
2021-03-31 22:37:33 +08:00
|
|
|
const removeProblematicNodes = (group, consideredSize = group.size) => {
|
|
|
|
const problemTypes = getTooSmallTypes(consideredSize, minSize);
|
2021-03-10 05:38:45 +08:00
|
|
|
if (problemTypes.size > 0) {
|
|
|
|
// We hit an edge case where the working set is already smaller than minSize
|
|
|
|
// We merge problematic nodes with the smallest result node to keep minSize intact
|
|
|
|
const problemNodes = group.popNodes(
|
2025-07-17 00:13:14 +08:00
|
|
|
(n) => getNumberOfMatchingSizeTypes(n.size, problemTypes) > 0
|
2021-03-10 05:38:45 +08:00
|
|
|
);
|
2021-03-31 22:37:33 +08:00
|
|
|
if (problemNodes === undefined) return false;
|
2021-03-10 05:38:45 +08:00
|
|
|
// Only merge it with result nodes that have the problematic size type
|
|
|
|
const possibleResultGroups = result.filter(
|
2025-07-17 00:13:14 +08:00
|
|
|
(n) => getNumberOfMatchingSizeTypes(n.size, problemTypes) > 0
|
2021-03-10 05:38:45 +08:00
|
|
|
);
|
|
|
|
if (possibleResultGroups.length > 0) {
|
|
|
|
const bestGroup = possibleResultGroups.reduce((min, group) => {
|
|
|
|
const minMatches = getNumberOfMatchingSizeTypes(min, problemTypes);
|
|
|
|
const groupMatches = getNumberOfMatchingSizeTypes(
|
|
|
|
group,
|
|
|
|
problemTypes
|
|
|
|
);
|
2025-07-02 20:10:54 +08:00
|
|
|
if (minMatches !== groupMatches) {
|
2021-03-10 05:38:45 +08:00
|
|
|
return minMatches < groupMatches ? group : min;
|
2025-07-02 20:10:54 +08:00
|
|
|
}
|
2021-03-10 05:38:45 +08:00
|
|
|
if (
|
|
|
|
selectiveSizeSum(min.size, problemTypes) >
|
|
|
|
selectiveSizeSum(group.size, problemTypes)
|
2025-07-02 20:10:54 +08:00
|
|
|
) {
|
2021-03-10 05:38:45 +08:00
|
|
|
return group;
|
2025-07-02 20:10:54 +08:00
|
|
|
}
|
2021-03-10 05:38:45 +08:00
|
|
|
return min;
|
|
|
|
});
|
|
|
|
for (const node of problemNodes) bestGroup.nodes.push(node);
|
|
|
|
bestGroup.nodes.sort((a, b) => {
|
|
|
|
if (a.key < b.key) return -1;
|
|
|
|
if (a.key > b.key) return 1;
|
|
|
|
return 0;
|
|
|
|
});
|
|
|
|
} else {
|
|
|
|
// There are no other nodes with the same size types
|
|
|
|
// We create a new group and have to accept that it's smaller than minSize
|
|
|
|
result.push(new Group(problemNodes, null));
|
|
|
|
}
|
|
|
|
return true;
|
2018-07-04 15:59:22 +08:00
|
|
|
}
|
2024-07-31 04:21:27 +08:00
|
|
|
return false;
|
2021-03-10 05:38:45 +08:00
|
|
|
};
|
|
|
|
|
2018-12-04 23:40:06 +08:00
|
|
|
if (initialGroup.nodes.length > 0) {
|
2018-11-24 16:17:16 +08:00
|
|
|
const queue = [initialGroup];
|
2018-07-04 15:59:22 +08:00
|
|
|
|
2018-11-24 16:17:16 +08:00
|
|
|
while (queue.length) {
|
2023-06-12 22:21:21 +08:00
|
|
|
const group = /** @type {Group<T>} */ (queue.pop());
|
2018-11-24 16:17:16 +08:00
|
|
|
// only groups bigger than maxSize need to be splitted
|
2018-12-04 23:40:06 +08:00
|
|
|
if (!isTooBig(group.size, maxSize)) {
|
2018-11-24 16:17:16 +08:00
|
|
|
result.push(group);
|
|
|
|
continue;
|
|
|
|
}
|
2021-03-10 05:38:45 +08:00
|
|
|
// If the group is already too small
|
|
|
|
// we try to work only with the unproblematic nodes
|
|
|
|
if (removeProblematicNodes(group)) {
|
|
|
|
// This changed something, so we try this group again
|
|
|
|
queue.push(group);
|
|
|
|
continue;
|
|
|
|
}
|
2018-07-04 15:59:22 +08:00
|
|
|
|
2018-11-24 16:17:16 +08:00
|
|
|
// find unsplittable area from left and right
|
|
|
|
// going minSize from left and right
|
|
|
|
// at least one node need to be included otherwise we get stuck
|
2018-12-04 23:40:06 +08:00
|
|
|
let left = 1;
|
2024-07-31 04:09:42 +08:00
|
|
|
const leftSize = Object.create(null);
|
2018-12-04 23:40:06 +08:00
|
|
|
addSizeTo(leftSize, group.nodes[0].size);
|
2021-03-10 05:38:45 +08:00
|
|
|
while (left < group.nodes.length && isTooSmall(leftSize, minSize)) {
|
2018-12-04 23:40:06 +08:00
|
|
|
addSizeTo(leftSize, group.nodes[left].size);
|
2018-11-24 16:17:16 +08:00
|
|
|
left++;
|
|
|
|
}
|
2018-12-04 23:40:06 +08:00
|
|
|
let right = group.nodes.length - 2;
|
2024-07-31 04:09:42 +08:00
|
|
|
const rightSize = Object.create(null);
|
2018-12-04 23:40:06 +08:00
|
|
|
addSizeTo(rightSize, group.nodes[group.nodes.length - 1].size);
|
2021-03-08 21:20:47 +08:00
|
|
|
while (right >= 0 && isTooSmall(rightSize, minSize)) {
|
2018-12-04 23:40:06 +08:00
|
|
|
addSizeTo(rightSize, group.nodes[right].size);
|
2018-11-24 16:17:16 +08:00
|
|
|
right--;
|
|
|
|
}
|
|
|
|
|
2021-03-31 22:37:33 +08:00
|
|
|
// left v v right
|
|
|
|
// [ O O O ] O O O [ O O O ]
|
|
|
|
// ^^^^^^^^^ leftSize
|
|
|
|
// rightSize ^^^^^^^^^
|
|
|
|
// leftSize > minSize
|
|
|
|
// rightSize > minSize
|
|
|
|
|
|
|
|
// Perfect split: [ O O O ] [ O O O ]
|
|
|
|
// right === left - 1
|
|
|
|
|
2018-11-24 16:17:16 +08:00
|
|
|
if (left - 1 > right) {
|
2021-03-31 22:37:33 +08:00
|
|
|
// We try to remove some problematic nodes to "fix" that
|
|
|
|
let prevSize;
|
|
|
|
if (right < group.nodes.length - left) {
|
|
|
|
subtractSizeFrom(rightSize, group.nodes[right + 1].size);
|
|
|
|
prevSize = rightSize;
|
|
|
|
} else {
|
|
|
|
subtractSizeFrom(leftSize, group.nodes[left - 1].size);
|
|
|
|
prevSize = leftSize;
|
|
|
|
}
|
|
|
|
if (removeProblematicNodes(group, prevSize)) {
|
|
|
|
// This changed something, so we try this group again
|
|
|
|
queue.push(group);
|
|
|
|
continue;
|
|
|
|
}
|
2018-11-24 16:17:16 +08:00
|
|
|
// can't split group while holding minSize
|
|
|
|
// because minSize is preferred of maxSize we return
|
2018-12-04 23:40:06 +08:00
|
|
|
// the problematic nodes as result here even while it's too big
|
2018-11-24 16:17:16 +08:00
|
|
|
// To avoid this make sure maxSize > minSize * 3
|
|
|
|
result.push(group);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (left <= right) {
|
|
|
|
// when there is a area between left and right
|
|
|
|
// we look for best split point
|
|
|
|
// we split at the minimum similarity
|
|
|
|
// here key space is separated the most
|
2021-03-10 05:38:45 +08:00
|
|
|
// But we also need to make sure to not create too small groups
|
|
|
|
let best = -1;
|
|
|
|
let bestSimilarity = Infinity;
|
|
|
|
let pos = left;
|
2024-07-31 04:09:42 +08:00
|
|
|
const rightSize = sumSize(group.nodes.slice(pos));
|
2021-03-31 22:37:33 +08:00
|
|
|
|
|
|
|
// pos v v right
|
|
|
|
// [ O O O ] O O O [ O O O ]
|
|
|
|
// ^^^^^^^^^ leftSize
|
|
|
|
// rightSize ^^^^^^^^^^^^^^^
|
|
|
|
|
|
|
|
while (pos <= right + 1) {
|
2025-09-11 08:10:10 +08:00
|
|
|
const similarity =
|
|
|
|
/** @type {Similarities} */
|
|
|
|
(group.similarities)[pos - 1];
|
2021-03-10 05:38:45 +08:00
|
|
|
if (
|
|
|
|
similarity < bestSimilarity &&
|
|
|
|
!isTooSmall(leftSize, minSize) &&
|
|
|
|
!isTooSmall(rightSize, minSize)
|
|
|
|
) {
|
|
|
|
best = pos;
|
2018-11-24 16:17:16 +08:00
|
|
|
bestSimilarity = similarity;
|
|
|
|
}
|
2021-03-10 05:38:45 +08:00
|
|
|
addSizeTo(leftSize, group.nodes[pos].size);
|
|
|
|
subtractSizeFrom(rightSize, group.nodes[pos].size);
|
|
|
|
pos++;
|
|
|
|
}
|
|
|
|
if (best < 0) {
|
2021-03-31 22:37:33 +08:00
|
|
|
// This can't happen
|
|
|
|
// but if that assumption is wrong
|
|
|
|
// fallback to a big group
|
2021-03-10 05:38:45 +08:00
|
|
|
result.push(group);
|
|
|
|
continue;
|
2018-07-04 15:59:22 +08:00
|
|
|
}
|
2021-03-10 05:38:45 +08:00
|
|
|
left = best;
|
|
|
|
right = best - 1;
|
2018-07-04 15:59:22 +08:00
|
|
|
}
|
|
|
|
|
2018-11-24 16:17:16 +08:00
|
|
|
// create two new groups for left and right area
|
|
|
|
// and queue them up
|
|
|
|
const rightNodes = [group.nodes[right + 1]];
|
2025-09-11 08:10:10 +08:00
|
|
|
/** @type {Similarities} */
|
2020-03-13 00:51:26 +08:00
|
|
|
const rightSimilarities = [];
|
2018-11-24 16:17:16 +08:00
|
|
|
for (let i = right + 2; i < group.nodes.length; i++) {
|
2023-06-12 22:21:21 +08:00
|
|
|
rightSimilarities.push(
|
2025-09-11 08:10:10 +08:00
|
|
|
/** @type {Similarities} */ (group.similarities)[i - 1]
|
2023-06-12 22:21:21 +08:00
|
|
|
);
|
2018-11-24 16:17:16 +08:00
|
|
|
rightNodes.push(group.nodes[i]);
|
|
|
|
}
|
2020-03-13 00:51:26 +08:00
|
|
|
queue.push(new Group(rightNodes, rightSimilarities));
|
2018-07-04 15:59:22 +08:00
|
|
|
|
2018-11-24 16:17:16 +08:00
|
|
|
const leftNodes = [group.nodes[0]];
|
2025-09-11 08:10:10 +08:00
|
|
|
/** @type {Similarities} */
|
2020-03-13 00:51:26 +08:00
|
|
|
const leftSimilarities = [];
|
2018-11-24 16:17:16 +08:00
|
|
|
for (let i = 1; i < left; i++) {
|
2023-06-12 22:21:21 +08:00
|
|
|
leftSimilarities.push(
|
2025-09-11 08:10:10 +08:00
|
|
|
/** @type {Similarities} */ (group.similarities)[i - 1]
|
2023-06-12 22:21:21 +08:00
|
|
|
);
|
2018-11-24 16:17:16 +08:00
|
|
|
leftNodes.push(group.nodes[i]);
|
|
|
|
}
|
2020-03-13 00:51:26 +08:00
|
|
|
queue.push(new Group(leftNodes, leftSimilarities));
|
2018-07-04 15:59:22 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// lexically ordering
|
|
|
|
result.sort((a, b) => {
|
|
|
|
if (a.nodes[0].key < b.nodes[0].key) return -1;
|
|
|
|
if (a.nodes[0].key > b.nodes[0].key) return 1;
|
|
|
|
return 0;
|
|
|
|
});
|
|
|
|
|
|
|
|
// give every group a name
|
2020-06-16 23:28:53 +08:00
|
|
|
const usedNames = new Set();
|
2018-07-04 15:59:22 +08:00
|
|
|
for (let i = 0; i < result.length; i++) {
|
|
|
|
const group = result[i];
|
2020-06-16 23:28:53 +08:00
|
|
|
if (group.nodes.length === 1) {
|
|
|
|
group.key = group.nodes[0].key;
|
|
|
|
} else {
|
|
|
|
const first = group.nodes[0];
|
|
|
|
const last = group.nodes[group.nodes.length - 1];
|
|
|
|
const name = getName(first.key, last.key, usedNames);
|
|
|
|
group.key = name;
|
|
|
|
}
|
2018-07-04 15:59:22 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// return the results
|
2024-07-31 11:31:11 +08:00
|
|
|
return result.map(
|
2025-07-17 00:13:14 +08:00
|
|
|
(group) =>
|
2024-07-31 11:31:11 +08:00
|
|
|
/** @type {GroupedItems<T>} */
|
|
|
|
({
|
|
|
|
key: group.key,
|
2025-07-17 00:13:14 +08:00
|
|
|
items: group.nodes.map((node) => node.item),
|
2024-07-31 11:31:11 +08:00
|
|
|
size: group.size
|
|
|
|
})
|
|
|
|
);
|
2018-07-04 15:59:22 +08:00
|
|
|
};
|