perf(backend): parse5をやめて軽量な実装にし、メモリ削減・高速化 (#16892)

* wip * test * Revert "test" This reverts commit b7c5ae7214. * Update MfmService.ts
2025-11-29 21:19:55 +09:00 · 2025-11-29 21:19:55 +09:00 · 4bdbe794a6
parent cad93071da
commit 4bdbe794a6
4 changed files with 80 additions and 81 deletions
--- a/packages/backend/package.json
+++ b/packages/backend/package.json
@ -150,7 +150,6 @@
 		"oauth2orize-pkce": "0.1.2",
 		"os-utils": "0.0.14",
 		"otpauth": "9.4.1",
-		"parse5": "7.3.0",
 		"pg": "8.16.3",
 		"pkce-challenge": "4.1.0",
 		"probe-image-size": "7.2.3",
--- a/packages/backend/src/core/MfmService.ts
+++ b/packages/backend/src/core/MfmService.ts
@ -5,7 +5,7 @@

 import { URL } from 'node:url';
 import { Inject, Injectable } from '@nestjs/common';
-import * as parse5 from 'parse5';
+import * as htmlParser from 'node-html-parser';
 import { DI } from '@/di-symbols.js';
 import type { Config } from '@/config.js';
 import { intersperse } from '@/misc/prelude/array.js';
@ -13,13 +13,8 @@ import { normalizeForSearch } from '@/misc/normalize-for-search.js';
 import type { IMentionedRemoteUsers } from '@/models/Note.js';
 import { bindThis } from '@/decorators.js';
 import { escapeHtml } from '@/misc/escape-html.js';
-import type { DefaultTreeAdapterMap } from 'parse5';
 import type * as mfm from 'mfm-js';

-const treeAdapter = parse5.defaultTreeAdapter;
-type Node = DefaultTreeAdapterMap['node'];
-type ChildNode = DefaultTreeAdapterMap['childNode'];
-
 const urlRegex = /^https?:\/\/[\w\/:%#@$&?!()\[\]~.,=+\-]+/;
 const urlRegexFull = /^https?:\/\/[\w\/:%#@$&?!()\[\]~.,=+\-]+$/;

@ -38,68 +33,68 @@ export class MfmService {

 		const normalizedHashtagNames = hashtagNames == null ? undefined : new Set<string>(hashtagNames.map(x => normalizeForSearch(x)));

-		const dom = parse5.parseFragment(html);
+		const doc = htmlParser.parse(`<div>${html}</div>`);

 		let text = '';

-		for (const n of dom.childNodes) {
+		for (const n of doc.childNodes) {
 			analyze(n);
 		}

 		return text.trim();

-		function getText(node: Node): string {
-			if (treeAdapter.isTextNode(node)) return node.value;
-			if (!treeAdapter.isElementNode(node)) return '';
-			if (node.nodeName === 'br') return '\n';
+		function getText(node: htmlParser.Node): string {
+			if (node instanceof htmlParser.TextNode) return node.textContent;
+			if (!(node instanceof htmlParser.HTMLElement)) return '';
+			if (node.tagName === 'BR') return '\n';

-			if (node.childNodes) {
+			if (node.childNodes != null) {
 				return node.childNodes.map(n => getText(n)).join('');
 			}

 			return '';
 		}

-		function appendChildren(childNodes: ChildNode[]): void {
-			if (childNodes) {
+		function analyzeChildren(childNodes: htmlParser.Node[] | null): void {
+			if (childNodes != null) {
 				for (const n of childNodes) {
 					analyze(n);
 				}
 			}
 		}

-		function analyze(node: Node) {
-			if (treeAdapter.isTextNode(node)) {
-				text += node.value;
+		function analyze(node: htmlParser.Node) {
+			if (node instanceof htmlParser.TextNode) {
+				text += node.textContent;
 				return;
 			}

 			// Skip comment or document type node
-			if (!treeAdapter.isElementNode(node)) {
+			if (!(node instanceof htmlParser.HTMLElement)) {
 				return;
 			}

-			switch (node.nodeName) {
-				case 'br': {
+			switch (node.tagName) {
+				case 'BR': {
 					text += '\n';
 					break;
 				}

-				case 'a': {
+				case 'A': {
 					const txt = getText(node);
-					const rel = node.attrs.find(x => x.name === 'rel');
-					const href = node.attrs.find(x => x.name === 'href');
+					const rel = node.attributes.rel;
+					const href = node.attributes.href;

 					// ハッシュタグ
-					if (normalizedHashtagNames && href && normalizedHashtagNames.has(normalizeForSearch(txt))) {
+					if (normalizedHashtagNames && href != null && normalizedHashtagNames.has(normalizeForSearch(txt))) {
 						text += txt;
 						// メンション
-					} else if (txt.startsWith('@') && !(rel && rel.value.startsWith('me '))) {
+					} else if (txt.startsWith('@') && !(rel != null && rel.startsWith('me '))) {
 						const part = txt.split('@');

 						if (part.length === 2 && href) {
 							//#region ホスト名部分が省略されているので復元する
-							const acct = `${txt}@${(new URL(href.value)).hostname}`;
+							const acct = `${txt}@${(new URL(href)).hostname}`;
 							text += acct;
 							//#endregion
 						} else if (part.length === 3) {
@ -114,17 +109,17 @@ export class MfmService {
 							if (!href) {
 								return txt;
 							}
-							if (!txt || txt === href.value) {	// #6383: Missing text node
-								if (href.value.match(urlRegexFull)) {
-									return href.value;
+							if (!txt || txt === href) {	// #6383: Missing text node
+								if (href.match(urlRegexFull)) {
+									return href;
 								} else {
-									return `<${href.value}>`;
+									return `<${href}>`;
 								}
 							}
-							if (href.value.match(urlRegex) && !href.value.match(urlRegexFull)) {
-								return `[${txt}](<${href.value}>)`;	// #6846
+							if (href.match(urlRegex) && !href.match(urlRegexFull)) {
+								return `[${txt}](<${href}>)`;	// #6846
 							} else {
-								return `[${txt}](${href.value})`;
+								return `[${txt}](${href})`;
 							}
 						};

@ -133,60 +128,64 @@ export class MfmService {
 					break;
 				}

-				case 'h1': {
+				case 'H1': {
 					text += '【';
-					appendChildren(node.childNodes);
+					analyzeChildren(node.childNodes);
 					text += '】\n';
 					break;
 				}

-				case 'b':
-				case 'strong': {
+				case 'B':
+				case 'STRONG': {
 					text += '**';
-					appendChildren(node.childNodes);
+					analyzeChildren(node.childNodes);
 					text += '**';
 					break;
 				}

-				case 'small': {
+				case 'SMALL': {
 					text += '<small>';
-					appendChildren(node.childNodes);
+					analyzeChildren(node.childNodes);
 					text += '</small>';
 					break;
 				}

-				case 's':
-				case 'del': {
+				case 'S':
+				case 'DEL': {
 					text += '~~';
-					appendChildren(node.childNodes);
+					analyzeChildren(node.childNodes);
 					text += '~~';
 					break;
 				}

-				case 'i':
-				case 'em': {
+				case 'I':
+				case 'EM': {
 					text += '<i>';
-					appendChildren(node.childNodes);
+					analyzeChildren(node.childNodes);
 					text += '</i>';
 					break;
 				}

-				case 'ruby': {
+				case 'RUBY': {
 					let ruby: [string, string][] = [];
 					for (const child of node.childNodes) {
-						if (child.nodeName === 'rp') {
+						if ((child instanceof htmlParser.TextNode) && !/\s|\[|\]/.test(child.textContent)) {
+							ruby.push([child.textContent, '']);
 							continue;
 						}
-						if (treeAdapter.isTextNode(child) && !/\s|\[|\]/.test(child.value)) {
-							ruby.push([child.value, '']);
+
+						if (!(child instanceof htmlParser.HTMLElement)) continue;
+
+						if (child.tagName === 'RP') {
 							continue;
 						}
-						if (child.nodeName === 'rt' && ruby.length > 0) {
+
+						if (child.tagName === 'RT' && ruby.length > 0) {
 							const rt = getText(child);
 							if (/\s|\[|\]/.test(rt)) {
 								// If any space is included in rt, it is treated as a normal text
 								ruby = [];
-								appendChildren(node.childNodes);
+								analyzeChildren(node.childNodes);
 								break;
 							} else {
 								ruby.at(-1)![1] = rt;
@ -195,7 +194,7 @@ export class MfmService {
 						}
 						// If any other element is included in ruby, it is treated as a normal text
 						ruby = [];
-						appendChildren(node.childNodes);
+						analyzeChildren(node.childNodes);
 						break;
 					}
 					for (const [base, rt] of ruby) {
@ -205,26 +204,30 @@ export class MfmService {
 				}

 				// block code (<pre><code>)
-				case 'pre': {
-					if (node.childNodes.length === 1 && node.childNodes[0].nodeName === 'code') {
+				case 'PRE': {
+					if (node.childNodes.length === 1 && (node.childNodes[0] instanceof htmlParser.HTMLElement) && node.childNodes[0].tagName === 'CODE') {
 						text += '\n```\n';
 						text += getText(node.childNodes[0]);
 						text += '\n```\n';
+					} else if (node.childNodes.length === 1 && (node.childNodes[0] instanceof htmlParser.TextNode) && node.childNodes[0].textContent.startsWith('<code>') && node.childNodes[0].textContent.endsWith('</code>')) {
+						text += '\n```\n';
+						text += node.childNodes[0].textContent.slice(6, -7);
+						text += '\n```\n';
 					} else {
-						appendChildren(node.childNodes);
+						analyzeChildren(node.childNodes);
 					}
 					break;
 				}

 				// inline code (<code>)
-				case 'code': {
+				case 'CODE': {
 					text += '`';
-					appendChildren(node.childNodes);
+					analyzeChildren(node.childNodes);
 					text += '`';
 					break;
 				}

-				case 'blockquote': {
+				case 'BLOCKQUOTE': {
 					const t = getText(node);
 					if (t) {
 						text += '\n> ';
@ -233,33 +236,33 @@ export class MfmService {
 					break;
 				}

-				case 'p':
-				case 'h2':
-				case 'h3':
-				case 'h4':
-				case 'h5':
-				case 'h6': {
+				case 'P':
+				case 'H2':
+				case 'H3':
+				case 'H4':
+				case 'H5':
+				case 'H6': {
 					text += '\n\n';
-					appendChildren(node.childNodes);
+					analyzeChildren(node.childNodes);
 					break;
 				}

 				// other block elements
-				case 'div':
-				case 'header':
-				case 'footer':
-				case 'article':
-				case 'li':
-				case 'dt':
-				case 'dd': {
+				case 'DIV':
+				case 'HEADER':
+				case 'FOOTER':
+				case 'ARTICLE':
+				case 'LI':
+				case 'DT':
+				case 'DD': {
 					text += '\n';
-					appendChildren(node.childNodes);
+					analyzeChildren(node.childNodes);
 					break;
 				}

 				default:	// includes inline elements
 				{
-					appendChildren(node.childNodes);
+					analyzeChildren(node.childNodes);
 					break;
 				}
 			}
--- a/packages/backend/src/server/oauth/OAuth2ProviderService.ts
+++ b/packages/backend/src/server/oauth/OAuth2ProviderService.ts
@ -120,7 +120,7 @@ async function discoverClientInformation(logger: Logger, httpRequestService: Htt
 		}

 		const text = await res.text();
-		const fragment = htmlParser.parse(text);
+		const fragment = htmlParser.parse(`<div>${text}</div>`);

 		redirectUris.push(...[...fragment.querySelectorAll('link[rel=redirect_uri][href]')].map(el => el.attributes.href));

--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@ -333,9 +333,6 @@ importers:
      otpauth:
        specifier: 9.4.1
        version: 9.4.1
-      parse5:
-        specifier: 7.3.0
-        version: 7.3.0
      pg:
        specifier: 8.16.3
        version: 8.16.3