perf(backend): parse5をやめて軽量な実装にし、メモリ削減・高速化 (#16892)

* wip

* test

* Revert "test"

This reverts commit b7c5ae7214.

* Update MfmService.ts
This commit is contained in:
syuilo 2025-11-29 21:19:55 +09:00 committed by GitHub
parent cad93071da
commit 4bdbe794a6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 80 additions and 81 deletions

View File

@@ -150,7 +150,6 @@
"oauth2orize-pkce": "0.1.2", "oauth2orize-pkce": "0.1.2",
"os-utils": "0.0.14", "os-utils": "0.0.14",
"otpauth": "9.4.1", "otpauth": "9.4.1",
"parse5": "7.3.0",
"pg": "8.16.3", "pg": "8.16.3",
"pkce-challenge": "4.1.0", "pkce-challenge": "4.1.0",
"probe-image-size": "7.2.3", "probe-image-size": "7.2.3",

View File

@@ -5,7 +5,7 @@
import { URL } from 'node:url'; import { URL } from 'node:url';
import { Inject, Injectable } from '@nestjs/common'; import { Inject, Injectable } from '@nestjs/common';
import * as parse5 from 'parse5'; import * as htmlParser from 'node-html-parser';
import { DI } from '@/di-symbols.js'; import { DI } from '@/di-symbols.js';
import type { Config } from '@/config.js'; import type { Config } from '@/config.js';
import { intersperse } from '@/misc/prelude/array.js'; import { intersperse } from '@/misc/prelude/array.js';
@@ -13,13 +13,8 @@ import { normalizeForSearch } from '@/misc/normalize-for-search.js';
import type { IMentionedRemoteUsers } from '@/models/Note.js'; import type { IMentionedRemoteUsers } from '@/models/Note.js';
import { bindThis } from '@/decorators.js'; import { bindThis } from '@/decorators.js';
import { escapeHtml } from '@/misc/escape-html.js'; import { escapeHtml } from '@/misc/escape-html.js';
import type { DefaultTreeAdapterMap } from 'parse5';
import type * as mfm from 'mfm-js'; import type * as mfm from 'mfm-js';
const treeAdapter = parse5.defaultTreeAdapter;
type Node = DefaultTreeAdapterMap['node'];
type ChildNode = DefaultTreeAdapterMap['childNode'];
const urlRegex = /^https?:\/\/[\w\/:%#@$&?!()\[\]~.,=+\-]+/; const urlRegex = /^https?:\/\/[\w\/:%#@$&?!()\[\]~.,=+\-]+/;
const urlRegexFull = /^https?:\/\/[\w\/:%#@$&?!()\[\]~.,=+\-]+$/; const urlRegexFull = /^https?:\/\/[\w\/:%#@$&?!()\[\]~.,=+\-]+$/;
@@ -38,68 +33,68 @@ export class MfmService {
const normalizedHashtagNames = hashtagNames == null ? undefined : new Set<string>(hashtagNames.map(x => normalizeForSearch(x))); const normalizedHashtagNames = hashtagNames == null ? undefined : new Set<string>(hashtagNames.map(x => normalizeForSearch(x)));
const dom = parse5.parseFragment(html); const doc = htmlParser.parse(`<div>${html}</div>`);
let text = ''; let text = '';
for (const n of dom.childNodes) { for (const n of doc.childNodes) {
analyze(n); analyze(n);
} }
return text.trim(); return text.trim();
function getText(node: Node): string { function getText(node: htmlParser.Node): string {
if (treeAdapter.isTextNode(node)) return node.value; if (node instanceof htmlParser.TextNode) return node.textContent;
if (!treeAdapter.isElementNode(node)) return ''; if (!(node instanceof htmlParser.HTMLElement)) return '';
if (node.nodeName === 'br') return '\n'; if (node.tagName === 'BR') return '\n';
if (node.childNodes) { if (node.childNodes != null) {
return node.childNodes.map(n => getText(n)).join(''); return node.childNodes.map(n => getText(n)).join('');
} }
return ''; return '';
} }
function appendChildren(childNodes: ChildNode[]): void { function analyzeChildren(childNodes: htmlParser.Node[] | null): void {
if (childNodes) { if (childNodes != null) {
for (const n of childNodes) { for (const n of childNodes) {
analyze(n); analyze(n);
} }
} }
} }
function analyze(node: Node) { function analyze(node: htmlParser.Node) {
if (treeAdapter.isTextNode(node)) { if (node instanceof htmlParser.TextNode) {
text += node.value; text += node.textContent;
return; return;
} }
// Skip comment or document type node // Skip comment or document type node
if (!treeAdapter.isElementNode(node)) { if (!(node instanceof htmlParser.HTMLElement)) {
return; return;
} }
switch (node.nodeName) { switch (node.tagName) {
case 'br': { case 'BR': {
text += '\n'; text += '\n';
break; break;
} }
case 'a': { case 'A': {
const txt = getText(node); const txt = getText(node);
const rel = node.attrs.find(x => x.name === 'rel'); const rel = node.attributes.rel;
const href = node.attrs.find(x => x.name === 'href'); const href = node.attributes.href;
// ハッシュタグ // ハッシュタグ
if (normalizedHashtagNames && href && normalizedHashtagNames.has(normalizeForSearch(txt))) { if (normalizedHashtagNames && href != null && normalizedHashtagNames.has(normalizeForSearch(txt))) {
text += txt; text += txt;
// メンション // メンション
} else if (txt.startsWith('@') && !(rel && rel.value.startsWith('me '))) { } else if (txt.startsWith('@') && !(rel != null && rel.startsWith('me '))) {
const part = txt.split('@'); const part = txt.split('@');
if (part.length === 2 && href) { if (part.length === 2 && href) {
//#region ホスト名部分が省略されているので復元する //#region ホスト名部分が省略されているので復元する
const acct = `${txt}@${(new URL(href.value)).hostname}`; const acct = `${txt}@${(new URL(href)).hostname}`;
text += acct; text += acct;
//#endregion //#endregion
} else if (part.length === 3) { } else if (part.length === 3) {
@@ -114,17 +109,17 @@ export class MfmService {
if (!href) { if (!href) {
return txt; return txt;
} }
if (!txt || txt === href.value) { // #6383: Missing text node if (!txt || txt === href) { // #6383: Missing text node
if (href.value.match(urlRegexFull)) { if (href.match(urlRegexFull)) {
return href.value; return href;
} else { } else {
return `<${href.value}>`; return `<${href}>`;
} }
} }
if (href.value.match(urlRegex) && !href.value.match(urlRegexFull)) { if (href.match(urlRegex) && !href.match(urlRegexFull)) {
return `[${txt}](<${href.value}>)`; // #6846 return `[${txt}](<${href}>)`; // #6846
} else { } else {
return `[${txt}](${href.value})`; return `[${txt}](${href})`;
} }
}; };
@@ -133,60 +128,64 @@ export class MfmService {
break; break;
} }
case 'h1': { case 'H1': {
text += '【'; text += '【';
appendChildren(node.childNodes); analyzeChildren(node.childNodes);
text += '】\n'; text += '】\n';
break; break;
} }
case 'b': case 'B':
case 'strong': { case 'STRONG': {
text += '**'; text += '**';
appendChildren(node.childNodes); analyzeChildren(node.childNodes);
text += '**'; text += '**';
break; break;
} }
case 'small': { case 'SMALL': {
text += '<small>'; text += '<small>';
appendChildren(node.childNodes); analyzeChildren(node.childNodes);
text += '</small>'; text += '</small>';
break; break;
} }
case 's': case 'S':
case 'del': { case 'DEL': {
text += '~~'; text += '~~';
appendChildren(node.childNodes); analyzeChildren(node.childNodes);
text += '~~'; text += '~~';
break; break;
} }
case 'i': case 'I':
case 'em': { case 'EM': {
text += '<i>'; text += '<i>';
appendChildren(node.childNodes); analyzeChildren(node.childNodes);
text += '</i>'; text += '</i>';
break; break;
} }
case 'ruby': { case 'RUBY': {
let ruby: [string, string][] = []; let ruby: [string, string][] = [];
for (const child of node.childNodes) { for (const child of node.childNodes) {
if (child.nodeName === 'rp') { if ((child instanceof htmlParser.TextNode) && !/\s|\[|\]/.test(child.textContent)) {
ruby.push([child.textContent, '']);
continue; continue;
} }
if (treeAdapter.isTextNode(child) && !/\s|\[|\]/.test(child.value)) {
ruby.push([child.value, '']); if (!(child instanceof htmlParser.HTMLElement)) continue;
if (child.tagName === 'RP') {
continue; continue;
} }
if (child.nodeName === 'rt' && ruby.length > 0) {
if (child.tagName === 'RT' && ruby.length > 0) {
const rt = getText(child); const rt = getText(child);
if (/\s|\[|\]/.test(rt)) { if (/\s|\[|\]/.test(rt)) {
// If any space is included in rt, it is treated as a normal text // If any space is included in rt, it is treated as a normal text
ruby = []; ruby = [];
appendChildren(node.childNodes); analyzeChildren(node.childNodes);
break; break;
} else { } else {
ruby.at(-1)![1] = rt; ruby.at(-1)![1] = rt;
@@ -195,7 +194,7 @@ export class MfmService {
} }
// If any other element is included in ruby, it is treated as a normal text // If any other element is included in ruby, it is treated as a normal text
ruby = []; ruby = [];
appendChildren(node.childNodes); analyzeChildren(node.childNodes);
break; break;
} }
for (const [base, rt] of ruby) { for (const [base, rt] of ruby) {
@@ -205,26 +204,30 @@ export class MfmService {
} }
// block code (<pre><code>) // block code (<pre><code>)
case 'pre': { case 'PRE': {
if (node.childNodes.length === 1 && node.childNodes[0].nodeName === 'code') { if (node.childNodes.length === 1 && (node.childNodes[0] instanceof htmlParser.HTMLElement) && node.childNodes[0].tagName === 'CODE') {
text += '\n```\n'; text += '\n```\n';
text += getText(node.childNodes[0]); text += getText(node.childNodes[0]);
text += '\n```\n'; text += '\n```\n';
} else if (node.childNodes.length === 1 && (node.childNodes[0] instanceof htmlParser.TextNode) && node.childNodes[0].textContent.startsWith('<code>') && node.childNodes[0].textContent.endsWith('</code>')) {
text += '\n```\n';
text += node.childNodes[0].textContent.slice(6, -7);
text += '\n```\n';
} else { } else {
appendChildren(node.childNodes); analyzeChildren(node.childNodes);
} }
break; break;
} }
// inline code (<code>) // inline code (<code>)
case 'code': { case 'CODE': {
text += '`'; text += '`';
appendChildren(node.childNodes); analyzeChildren(node.childNodes);
text += '`'; text += '`';
break; break;
} }
case 'blockquote': { case 'BLOCKQUOTE': {
const t = getText(node); const t = getText(node);
if (t) { if (t) {
text += '\n> '; text += '\n> ';
@@ -233,33 +236,33 @@ export class MfmService {
break; break;
} }
case 'p': case 'P':
case 'h2': case 'H2':
case 'h3': case 'H3':
case 'h4': case 'H4':
case 'h5': case 'H5':
case 'h6': { case 'H6': {
text += '\n\n'; text += '\n\n';
appendChildren(node.childNodes); analyzeChildren(node.childNodes);
break; break;
} }
// other block elements // other block elements
case 'div': case 'DIV':
case 'header': case 'HEADER':
case 'footer': case 'FOOTER':
case 'article': case 'ARTICLE':
case 'li': case 'LI':
case 'dt': case 'DT':
case 'dd': { case 'DD': {
text += '\n'; text += '\n';
appendChildren(node.childNodes); analyzeChildren(node.childNodes);
break; break;
} }
default: // includes inline elements default: // includes inline elements
{ {
appendChildren(node.childNodes); analyzeChildren(node.childNodes);
break; break;
} }
} }

View File

@@ -120,7 +120,7 @@ async function discoverClientInformation(logger: Logger, httpRequestService: Htt
} }
const text = await res.text(); const text = await res.text();
const fragment = htmlParser.parse(text); const fragment = htmlParser.parse(`<div>${text}</div>`);
redirectUris.push(...[...fragment.querySelectorAll('link[rel=redirect_uri][href]')].map(el => el.attributes.href)); redirectUris.push(...[...fragment.querySelectorAll('link[rel=redirect_uri][href]')].map(el => el.attributes.href));

View File

@@ -333,9 +333,6 @@ importers:
otpauth: otpauth:
specifier: 9.4.1 specifier: 9.4.1
version: 9.4.1 version: 9.4.1
parse5:
specifier: 7.3.0
version: 7.3.0
pg: pg:
specifier: 8.16.3 specifier: 8.16.3
version: 8.16.3 version: 8.16.3