perf(backend): parse5をやめて軽量な実装にし、メモリ削減・高速化 (#16892)
* wip
* test
* Revert "test"
This reverts commit b7c5ae7214.
* Update MfmService.ts
This commit is contained in:
parent
cad93071da
commit
4bdbe794a6
|
|
@ -150,7 +150,6 @@
|
||||||
"oauth2orize-pkce": "0.1.2",
|
"oauth2orize-pkce": "0.1.2",
|
||||||
"os-utils": "0.0.14",
|
"os-utils": "0.0.14",
|
||||||
"otpauth": "9.4.1",
|
"otpauth": "9.4.1",
|
||||||
"parse5": "7.3.0",
|
|
||||||
"pg": "8.16.3",
|
"pg": "8.16.3",
|
||||||
"pkce-challenge": "4.1.0",
|
"pkce-challenge": "4.1.0",
|
||||||
"probe-image-size": "7.2.3",
|
"probe-image-size": "7.2.3",
|
||||||
|
|
|
||||||
|
|
@ -5,7 +5,7 @@
|
||||||
|
|
||||||
import { URL } from 'node:url';
|
import { URL } from 'node:url';
|
||||||
import { Inject, Injectable } from '@nestjs/common';
|
import { Inject, Injectable } from '@nestjs/common';
|
||||||
import * as parse5 from 'parse5';
|
import * as htmlParser from 'node-html-parser';
|
||||||
import { DI } from '@/di-symbols.js';
|
import { DI } from '@/di-symbols.js';
|
||||||
import type { Config } from '@/config.js';
|
import type { Config } from '@/config.js';
|
||||||
import { intersperse } from '@/misc/prelude/array.js';
|
import { intersperse } from '@/misc/prelude/array.js';
|
||||||
|
|
@ -13,13 +13,8 @@ import { normalizeForSearch } from '@/misc/normalize-for-search.js';
|
||||||
import type { IMentionedRemoteUsers } from '@/models/Note.js';
|
import type { IMentionedRemoteUsers } from '@/models/Note.js';
|
||||||
import { bindThis } from '@/decorators.js';
|
import { bindThis } from '@/decorators.js';
|
||||||
import { escapeHtml } from '@/misc/escape-html.js';
|
import { escapeHtml } from '@/misc/escape-html.js';
|
||||||
import type { DefaultTreeAdapterMap } from 'parse5';
|
|
||||||
import type * as mfm from 'mfm-js';
|
import type * as mfm from 'mfm-js';
|
||||||
|
|
||||||
const treeAdapter = parse5.defaultTreeAdapter;
|
|
||||||
type Node = DefaultTreeAdapterMap['node'];
|
|
||||||
type ChildNode = DefaultTreeAdapterMap['childNode'];
|
|
||||||
|
|
||||||
const urlRegex = /^https?:\/\/[\w\/:%#@$&?!()\[\]~.,=+\-]+/;
|
const urlRegex = /^https?:\/\/[\w\/:%#@$&?!()\[\]~.,=+\-]+/;
|
||||||
const urlRegexFull = /^https?:\/\/[\w\/:%#@$&?!()\[\]~.,=+\-]+$/;
|
const urlRegexFull = /^https?:\/\/[\w\/:%#@$&?!()\[\]~.,=+\-]+$/;
|
||||||
|
|
||||||
|
|
@ -38,68 +33,68 @@ export class MfmService {
|
||||||
|
|
||||||
const normalizedHashtagNames = hashtagNames == null ? undefined : new Set<string>(hashtagNames.map(x => normalizeForSearch(x)));
|
const normalizedHashtagNames = hashtagNames == null ? undefined : new Set<string>(hashtagNames.map(x => normalizeForSearch(x)));
|
||||||
|
|
||||||
const dom = parse5.parseFragment(html);
|
const doc = htmlParser.parse(`<div>${html}</div>`);
|
||||||
|
|
||||||
let text = '';
|
let text = '';
|
||||||
|
|
||||||
for (const n of dom.childNodes) {
|
for (const n of doc.childNodes) {
|
||||||
analyze(n);
|
analyze(n);
|
||||||
}
|
}
|
||||||
|
|
||||||
return text.trim();
|
return text.trim();
|
||||||
|
|
||||||
function getText(node: Node): string {
|
function getText(node: htmlParser.Node): string {
|
||||||
if (treeAdapter.isTextNode(node)) return node.value;
|
if (node instanceof htmlParser.TextNode) return node.textContent;
|
||||||
if (!treeAdapter.isElementNode(node)) return '';
|
if (!(node instanceof htmlParser.HTMLElement)) return '';
|
||||||
if (node.nodeName === 'br') return '\n';
|
if (node.tagName === 'BR') return '\n';
|
||||||
|
|
||||||
if (node.childNodes) {
|
if (node.childNodes != null) {
|
||||||
return node.childNodes.map(n => getText(n)).join('');
|
return node.childNodes.map(n => getText(n)).join('');
|
||||||
}
|
}
|
||||||
|
|
||||||
return '';
|
return '';
|
||||||
}
|
}
|
||||||
|
|
||||||
function appendChildren(childNodes: ChildNode[]): void {
|
function analyzeChildren(childNodes: htmlParser.Node[] | null): void {
|
||||||
if (childNodes) {
|
if (childNodes != null) {
|
||||||
for (const n of childNodes) {
|
for (const n of childNodes) {
|
||||||
analyze(n);
|
analyze(n);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
function analyze(node: Node) {
|
function analyze(node: htmlParser.Node) {
|
||||||
if (treeAdapter.isTextNode(node)) {
|
if (node instanceof htmlParser.TextNode) {
|
||||||
text += node.value;
|
text += node.textContent;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Skip comment or document type node
|
// Skip comment or document type node
|
||||||
if (!treeAdapter.isElementNode(node)) {
|
if (!(node instanceof htmlParser.HTMLElement)) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
switch (node.nodeName) {
|
switch (node.tagName) {
|
||||||
case 'br': {
|
case 'BR': {
|
||||||
text += '\n';
|
text += '\n';
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
case 'a': {
|
case 'A': {
|
||||||
const txt = getText(node);
|
const txt = getText(node);
|
||||||
const rel = node.attrs.find(x => x.name === 'rel');
|
const rel = node.attributes.rel;
|
||||||
const href = node.attrs.find(x => x.name === 'href');
|
const href = node.attributes.href;
|
||||||
|
|
||||||
// ハッシュタグ
|
// ハッシュタグ
|
||||||
if (normalizedHashtagNames && href && normalizedHashtagNames.has(normalizeForSearch(txt))) {
|
if (normalizedHashtagNames && href != null && normalizedHashtagNames.has(normalizeForSearch(txt))) {
|
||||||
text += txt;
|
text += txt;
|
||||||
// メンション
|
// メンション
|
||||||
} else if (txt.startsWith('@') && !(rel && rel.value.startsWith('me '))) {
|
} else if (txt.startsWith('@') && !(rel != null && rel.startsWith('me '))) {
|
||||||
const part = txt.split('@');
|
const part = txt.split('@');
|
||||||
|
|
||||||
if (part.length === 2 && href) {
|
if (part.length === 2 && href) {
|
||||||
//#region ホスト名部分が省略されているので復元する
|
//#region ホスト名部分が省略されているので復元する
|
||||||
const acct = `${txt}@${(new URL(href.value)).hostname}`;
|
const acct = `${txt}@${(new URL(href)).hostname}`;
|
||||||
text += acct;
|
text += acct;
|
||||||
//#endregion
|
//#endregion
|
||||||
} else if (part.length === 3) {
|
} else if (part.length === 3) {
|
||||||
|
|
@ -114,17 +109,17 @@ export class MfmService {
|
||||||
if (!href) {
|
if (!href) {
|
||||||
return txt;
|
return txt;
|
||||||
}
|
}
|
||||||
if (!txt || txt === href.value) { // #6383: Missing text node
|
if (!txt || txt === href) { // #6383: Missing text node
|
||||||
if (href.value.match(urlRegexFull)) {
|
if (href.match(urlRegexFull)) {
|
||||||
return href.value;
|
return href;
|
||||||
} else {
|
} else {
|
||||||
return `<${href.value}>`;
|
return `<${href}>`;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (href.value.match(urlRegex) && !href.value.match(urlRegexFull)) {
|
if (href.match(urlRegex) && !href.match(urlRegexFull)) {
|
||||||
return `[${txt}](<${href.value}>)`; // #6846
|
return `[${txt}](<${href}>)`; // #6846
|
||||||
} else {
|
} else {
|
||||||
return `[${txt}](${href.value})`;
|
return `[${txt}](${href})`;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
@ -133,60 +128,64 @@ export class MfmService {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
case 'h1': {
|
case 'H1': {
|
||||||
text += '【';
|
text += '【';
|
||||||
appendChildren(node.childNodes);
|
analyzeChildren(node.childNodes);
|
||||||
text += '】\n';
|
text += '】\n';
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
case 'b':
|
case 'B':
|
||||||
case 'strong': {
|
case 'STRONG': {
|
||||||
text += '**';
|
text += '**';
|
||||||
appendChildren(node.childNodes);
|
analyzeChildren(node.childNodes);
|
||||||
text += '**';
|
text += '**';
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
case 'small': {
|
case 'SMALL': {
|
||||||
text += '<small>';
|
text += '<small>';
|
||||||
appendChildren(node.childNodes);
|
analyzeChildren(node.childNodes);
|
||||||
text += '</small>';
|
text += '</small>';
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
case 's':
|
case 'S':
|
||||||
case 'del': {
|
case 'DEL': {
|
||||||
text += '~~';
|
text += '~~';
|
||||||
appendChildren(node.childNodes);
|
analyzeChildren(node.childNodes);
|
||||||
text += '~~';
|
text += '~~';
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
case 'i':
|
case 'I':
|
||||||
case 'em': {
|
case 'EM': {
|
||||||
text += '<i>';
|
text += '<i>';
|
||||||
appendChildren(node.childNodes);
|
analyzeChildren(node.childNodes);
|
||||||
text += '</i>';
|
text += '</i>';
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
case 'ruby': {
|
case 'RUBY': {
|
||||||
let ruby: [string, string][] = [];
|
let ruby: [string, string][] = [];
|
||||||
for (const child of node.childNodes) {
|
for (const child of node.childNodes) {
|
||||||
if (child.nodeName === 'rp') {
|
if ((child instanceof htmlParser.TextNode) && !/\s|\[|\]/.test(child.textContent)) {
|
||||||
|
ruby.push([child.textContent, '']);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (treeAdapter.isTextNode(child) && !/\s|\[|\]/.test(child.value)) {
|
|
||||||
ruby.push([child.value, '']);
|
if (!(child instanceof htmlParser.HTMLElement)) continue;
|
||||||
|
|
||||||
|
if (child.tagName === 'RP') {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (child.nodeName === 'rt' && ruby.length > 0) {
|
|
||||||
|
if (child.tagName === 'RT' && ruby.length > 0) {
|
||||||
const rt = getText(child);
|
const rt = getText(child);
|
||||||
if (/\s|\[|\]/.test(rt)) {
|
if (/\s|\[|\]/.test(rt)) {
|
||||||
// If any space is included in rt, it is treated as a normal text
|
// If any space is included in rt, it is treated as a normal text
|
||||||
ruby = [];
|
ruby = [];
|
||||||
appendChildren(node.childNodes);
|
analyzeChildren(node.childNodes);
|
||||||
break;
|
break;
|
||||||
} else {
|
} else {
|
||||||
ruby.at(-1)![1] = rt;
|
ruby.at(-1)![1] = rt;
|
||||||
|
|
@ -195,7 +194,7 @@ export class MfmService {
|
||||||
}
|
}
|
||||||
// If any other element is included in ruby, it is treated as a normal text
|
// If any other element is included in ruby, it is treated as a normal text
|
||||||
ruby = [];
|
ruby = [];
|
||||||
appendChildren(node.childNodes);
|
analyzeChildren(node.childNodes);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
for (const [base, rt] of ruby) {
|
for (const [base, rt] of ruby) {
|
||||||
|
|
@ -205,26 +204,30 @@ export class MfmService {
|
||||||
}
|
}
|
||||||
|
|
||||||
// block code (<pre><code>)
|
// block code (<pre><code>)
|
||||||
case 'pre': {
|
case 'PRE': {
|
||||||
if (node.childNodes.length === 1 && node.childNodes[0].nodeName === 'code') {
|
if (node.childNodes.length === 1 && (node.childNodes[0] instanceof htmlParser.HTMLElement) && node.childNodes[0].tagName === 'CODE') {
|
||||||
text += '\n```\n';
|
text += '\n```\n';
|
||||||
text += getText(node.childNodes[0]);
|
text += getText(node.childNodes[0]);
|
||||||
text += '\n```\n';
|
text += '\n```\n';
|
||||||
|
} else if (node.childNodes.length === 1 && (node.childNodes[0] instanceof htmlParser.TextNode) && node.childNodes[0].textContent.startsWith('<code>') && node.childNodes[0].textContent.endsWith('</code>')) {
|
||||||
|
text += '\n```\n';
|
||||||
|
text += node.childNodes[0].textContent.slice(6, -7);
|
||||||
|
text += '\n```\n';
|
||||||
} else {
|
} else {
|
||||||
appendChildren(node.childNodes);
|
analyzeChildren(node.childNodes);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
// inline code (<code>)
|
// inline code (<code>)
|
||||||
case 'code': {
|
case 'CODE': {
|
||||||
text += '`';
|
text += '`';
|
||||||
appendChildren(node.childNodes);
|
analyzeChildren(node.childNodes);
|
||||||
text += '`';
|
text += '`';
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
case 'blockquote': {
|
case 'BLOCKQUOTE': {
|
||||||
const t = getText(node);
|
const t = getText(node);
|
||||||
if (t) {
|
if (t) {
|
||||||
text += '\n> ';
|
text += '\n> ';
|
||||||
|
|
@ -233,33 +236,33 @@ export class MfmService {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
case 'p':
|
case 'P':
|
||||||
case 'h2':
|
case 'H2':
|
||||||
case 'h3':
|
case 'H3':
|
||||||
case 'h4':
|
case 'H4':
|
||||||
case 'h5':
|
case 'H5':
|
||||||
case 'h6': {
|
case 'H6': {
|
||||||
text += '\n\n';
|
text += '\n\n';
|
||||||
appendChildren(node.childNodes);
|
analyzeChildren(node.childNodes);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
// other block elements
|
// other block elements
|
||||||
case 'div':
|
case 'DIV':
|
||||||
case 'header':
|
case 'HEADER':
|
||||||
case 'footer':
|
case 'FOOTER':
|
||||||
case 'article':
|
case 'ARTICLE':
|
||||||
case 'li':
|
case 'LI':
|
||||||
case 'dt':
|
case 'DT':
|
||||||
case 'dd': {
|
case 'DD': {
|
||||||
text += '\n';
|
text += '\n';
|
||||||
appendChildren(node.childNodes);
|
analyzeChildren(node.childNodes);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
default: // includes inline elements
|
default: // includes inline elements
|
||||||
{
|
{
|
||||||
appendChildren(node.childNodes);
|
analyzeChildren(node.childNodes);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -120,7 +120,7 @@ async function discoverClientInformation(logger: Logger, httpRequestService: Htt
|
||||||
}
|
}
|
||||||
|
|
||||||
const text = await res.text();
|
const text = await res.text();
|
||||||
const fragment = htmlParser.parse(text);
|
const fragment = htmlParser.parse(`<div>${text}</div>`);
|
||||||
|
|
||||||
redirectUris.push(...[...fragment.querySelectorAll('link[rel=redirect_uri][href]')].map(el => el.attributes.href));
|
redirectUris.push(...[...fragment.querySelectorAll('link[rel=redirect_uri][href]')].map(el => el.attributes.href));
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -333,9 +333,6 @@ importers:
|
||||||
otpauth:
|
otpauth:
|
||||||
specifier: 9.4.1
|
specifier: 9.4.1
|
||||||
version: 9.4.1
|
version: 9.4.1
|
||||||
parse5:
|
|
||||||
specifier: 7.3.0
|
|
||||||
version: 7.3.0
|
|
||||||
pg:
|
pg:
|
||||||
specifier: 8.16.3
|
specifier: 8.16.3
|
||||||
version: 8.16.3
|
version: 8.16.3
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue