perf(backend): parse5をやめて軽量な実装にし、メモリ削減・高速化 (#16892)

* wip

* test

* Revert "test"

This reverts commit b7c5ae7214.

* Update MfmService.ts
This commit is contained in:
syuilo 2025-11-29 21:19:55 +09:00 committed by GitHub
parent cad93071da
commit 4bdbe794a6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 80 additions and 81 deletions

View File

@ -150,7 +150,6 @@
"oauth2orize-pkce": "0.1.2",
"os-utils": "0.0.14",
"otpauth": "9.4.1",
"parse5": "7.3.0",
"pg": "8.16.3",
"pkce-challenge": "4.1.0",
"probe-image-size": "7.2.3",

View File

@ -5,7 +5,7 @@
import { URL } from 'node:url';
import { Inject, Injectable } from '@nestjs/common';
import * as parse5 from 'parse5';
import * as htmlParser from 'node-html-parser';
import { DI } from '@/di-symbols.js';
import type { Config } from '@/config.js';
import { intersperse } from '@/misc/prelude/array.js';
@ -13,13 +13,8 @@ import { normalizeForSearch } from '@/misc/normalize-for-search.js';
import type { IMentionedRemoteUsers } from '@/models/Note.js';
import { bindThis } from '@/decorators.js';
import { escapeHtml } from '@/misc/escape-html.js';
import type { DefaultTreeAdapterMap } from 'parse5';
import type * as mfm from 'mfm-js';
const treeAdapter = parse5.defaultTreeAdapter;
type Node = DefaultTreeAdapterMap['node'];
type ChildNode = DefaultTreeAdapterMap['childNode'];
const urlRegex = /^https?:\/\/[\w\/:%#@$&?!()\[\]~.,=+\-]+/;
const urlRegexFull = /^https?:\/\/[\w\/:%#@$&?!()\[\]~.,=+\-]+$/;
@ -38,68 +33,68 @@ export class MfmService {
const normalizedHashtagNames = hashtagNames == null ? undefined : new Set<string>(hashtagNames.map(x => normalizeForSearch(x)));
const dom = parse5.parseFragment(html);
const doc = htmlParser.parse(`<div>${html}</div>`);
let text = '';
for (const n of dom.childNodes) {
for (const n of doc.childNodes) {
analyze(n);
}
return text.trim();
function getText(node: Node): string {
if (treeAdapter.isTextNode(node)) return node.value;
if (!treeAdapter.isElementNode(node)) return '';
if (node.nodeName === 'br') return '\n';
function getText(node: htmlParser.Node): string {
if (node instanceof htmlParser.TextNode) return node.textContent;
if (!(node instanceof htmlParser.HTMLElement)) return '';
if (node.tagName === 'BR') return '\n';
if (node.childNodes) {
if (node.childNodes != null) {
return node.childNodes.map(n => getText(n)).join('');
}
return '';
}
function appendChildren(childNodes: ChildNode[]): void {
if (childNodes) {
function analyzeChildren(childNodes: htmlParser.Node[] | null): void {
if (childNodes != null) {
for (const n of childNodes) {
analyze(n);
}
}
}
function analyze(node: Node) {
if (treeAdapter.isTextNode(node)) {
text += node.value;
function analyze(node: htmlParser.Node) {
if (node instanceof htmlParser.TextNode) {
text += node.textContent;
return;
}
// Skip comment or document type node
if (!treeAdapter.isElementNode(node)) {
if (!(node instanceof htmlParser.HTMLElement)) {
return;
}
switch (node.nodeName) {
case 'br': {
switch (node.tagName) {
case 'BR': {
text += '\n';
break;
}
case 'a': {
case 'A': {
const txt = getText(node);
const rel = node.attrs.find(x => x.name === 'rel');
const href = node.attrs.find(x => x.name === 'href');
const rel = node.attributes.rel;
const href = node.attributes.href;
// ハッシュタグ
if (normalizedHashtagNames && href && normalizedHashtagNames.has(normalizeForSearch(txt))) {
if (normalizedHashtagNames && href != null && normalizedHashtagNames.has(normalizeForSearch(txt))) {
text += txt;
// メンション
} else if (txt.startsWith('@') && !(rel && rel.value.startsWith('me '))) {
} else if (txt.startsWith('@') && !(rel != null && rel.startsWith('me '))) {
const part = txt.split('@');
if (part.length === 2 && href) {
//#region ホスト名部分が省略されているので復元する
const acct = `${txt}@${(new URL(href.value)).hostname}`;
const acct = `${txt}@${(new URL(href)).hostname}`;
text += acct;
//#endregion
} else if (part.length === 3) {
@ -114,17 +109,17 @@ export class MfmService {
if (!href) {
return txt;
}
if (!txt || txt === href.value) { // #6383: Missing text node
if (href.value.match(urlRegexFull)) {
return href.value;
if (!txt || txt === href) { // #6383: Missing text node
if (href.match(urlRegexFull)) {
return href;
} else {
return `<${href.value}>`;
return `<${href}>`;
}
}
if (href.value.match(urlRegex) && !href.value.match(urlRegexFull)) {
return `[${txt}](<${href.value}>)`; // #6846
if (href.match(urlRegex) && !href.match(urlRegexFull)) {
return `[${txt}](<${href}>)`; // #6846
} else {
return `[${txt}](${href.value})`;
return `[${txt}](${href})`;
}
};
@ -133,60 +128,64 @@ export class MfmService {
break;
}
case 'h1': {
case 'H1': {
text += '【';
appendChildren(node.childNodes);
analyzeChildren(node.childNodes);
text += '】\n';
break;
}
case 'b':
case 'strong': {
case 'B':
case 'STRONG': {
text += '**';
appendChildren(node.childNodes);
analyzeChildren(node.childNodes);
text += '**';
break;
}
case 'small': {
case 'SMALL': {
text += '<small>';
appendChildren(node.childNodes);
analyzeChildren(node.childNodes);
text += '</small>';
break;
}
case 's':
case 'del': {
case 'S':
case 'DEL': {
text += '~~';
appendChildren(node.childNodes);
analyzeChildren(node.childNodes);
text += '~~';
break;
}
case 'i':
case 'em': {
case 'I':
case 'EM': {
text += '<i>';
appendChildren(node.childNodes);
analyzeChildren(node.childNodes);
text += '</i>';
break;
}
case 'ruby': {
case 'RUBY': {
let ruby: [string, string][] = [];
for (const child of node.childNodes) {
if (child.nodeName === 'rp') {
if ((child instanceof htmlParser.TextNode) && !/\s|\[|\]/.test(child.textContent)) {
ruby.push([child.textContent, '']);
continue;
}
if (treeAdapter.isTextNode(child) && !/\s|\[|\]/.test(child.value)) {
ruby.push([child.value, '']);
if (!(child instanceof htmlParser.HTMLElement)) continue;
if (child.tagName === 'RP') {
continue;
}
if (child.nodeName === 'rt' && ruby.length > 0) {
if (child.tagName === 'RT' && ruby.length > 0) {
const rt = getText(child);
if (/\s|\[|\]/.test(rt)) {
// If any space is included in rt, it is treated as a normal text
ruby = [];
appendChildren(node.childNodes);
analyzeChildren(node.childNodes);
break;
} else {
ruby.at(-1)![1] = rt;
@ -195,7 +194,7 @@ export class MfmService {
}
// If any other element is included in ruby, it is treated as a normal text
ruby = [];
appendChildren(node.childNodes);
analyzeChildren(node.childNodes);
break;
}
for (const [base, rt] of ruby) {
@ -205,26 +204,30 @@ export class MfmService {
}
// block code (<pre><code>)
case 'pre': {
if (node.childNodes.length === 1 && node.childNodes[0].nodeName === 'code') {
case 'PRE': {
if (node.childNodes.length === 1 && (node.childNodes[0] instanceof htmlParser.HTMLElement) && node.childNodes[0].tagName === 'CODE') {
text += '\n```\n';
text += getText(node.childNodes[0]);
text += '\n```\n';
} else if (node.childNodes.length === 1 && (node.childNodes[0] instanceof htmlParser.TextNode) && node.childNodes[0].textContent.startsWith('<code>') && node.childNodes[0].textContent.endsWith('</code>')) {
text += '\n```\n';
text += node.childNodes[0].textContent.slice(6, -7);
text += '\n```\n';
} else {
appendChildren(node.childNodes);
analyzeChildren(node.childNodes);
}
break;
}
// inline code (<code>)
case 'code': {
case 'CODE': {
text += '`';
appendChildren(node.childNodes);
analyzeChildren(node.childNodes);
text += '`';
break;
}
case 'blockquote': {
case 'BLOCKQUOTE': {
const t = getText(node);
if (t) {
text += '\n> ';
@ -233,33 +236,33 @@ export class MfmService {
break;
}
case 'p':
case 'h2':
case 'h3':
case 'h4':
case 'h5':
case 'h6': {
case 'P':
case 'H2':
case 'H3':
case 'H4':
case 'H5':
case 'H6': {
text += '\n\n';
appendChildren(node.childNodes);
analyzeChildren(node.childNodes);
break;
}
// other block elements
case 'div':
case 'header':
case 'footer':
case 'article':
case 'li':
case 'dt':
case 'dd': {
case 'DIV':
case 'HEADER':
case 'FOOTER':
case 'ARTICLE':
case 'LI':
case 'DT':
case 'DD': {
text += '\n';
appendChildren(node.childNodes);
analyzeChildren(node.childNodes);
break;
}
default: // includes inline elements
{
appendChildren(node.childNodes);
analyzeChildren(node.childNodes);
break;
}
}

View File

@ -120,7 +120,7 @@ async function discoverClientInformation(logger: Logger, httpRequestService: Htt
}
const text = await res.text();
const fragment = htmlParser.parse(text);
const fragment = htmlParser.parse(`<div>${text}</div>`);
redirectUris.push(...[...fragment.querySelectorAll('link[rel=redirect_uri][href]')].map(el => el.attributes.href));

View File

@ -333,9 +333,6 @@ importers:
otpauth:
specifier: 9.4.1
version: 9.4.1
parse5:
specifier: 7.3.0
version: 7.3.0
pg:
specifier: 8.16.3
version: 8.16.3