feat: HTMLを整形し、不正なHTMLを含まないように

This commit is contained in:
usbharu 2024-01-24 15:27:38 +09:00
parent 20012782b7
commit 45e0bd8edc
5 changed files with 236 additions and 0 deletions

View File

@ -215,6 +215,7 @@ dependencies {
implementation("org.flywaydb:flyway-core")
implementation("dev.usbharu:emoji-kt:2.0.0")
implementation("org.jsoup:jsoup:1.17.2")
implementation("io.ktor:ktor-client-logging-jvm:$ktor_version")

View File

@ -0,0 +1,107 @@
package dev.usbharu.hideout.core.service.post
import org.jsoup.Jsoup
import org.jsoup.nodes.Attributes
import org.jsoup.nodes.Element
import org.jsoup.nodes.TextNode
import org.jsoup.parser.Tag
import org.jsoup.select.Elements
/**
 * [PostContentFormatter] that rebuilds post HTML out of a small set of elements.
 *
 * The output consists solely of `<p>` paragraphs whose children are text, `<br>` and
 * `<a href="...">`. Every other element is replaced by its text content, and every
 * attribute other than an anchor's `href` is dropped.
 */
class DefaultPostContentFormatter : PostContentFormatter {

    /**
     * Formats [content] into paragraph-only HTML plus a plain-text rendering.
     *
     * @param content HTML fragment (or bare text) to format.
     * @return the formatted HTML with line feeds removed, and a plain-text form in which
     * paragraphs are separated by a blank line and `<br>` becomes a line feed. Both strings
     * are empty when the parsed document has no `<body>` element.
     */
    override suspend fun format(content: String): FormattedPostContent {
        val body = Jsoup.parseBodyFragment(content).getElementsByTag("body").first()
            ?: return FormattedPostContent("", "")

        // Iterate over a snapshot: jsoup re-parents nodes on append, and the list returned by
        // childNodes() is a live view that must not change while it is being walked.
        val paragraphs = body.childNodes().toList()
            .mapNotNull { node ->
                when (node) {
                    is Element ->
                        if (node.tagName() == "p") {
                            p(node)
                        } else {
                            // Any other top-level element becomes a paragraph holding its own
                            // children (not the children of the whole body).
                            p(Element("p").appendChildren(node.childNodes()))
                        }

                    is TextNode -> Element("p").appendText(node.text())
                    else -> null
                }
            }
            .flatMap { splitAtBlankLines(it) }
            .filter { it.text().isNotBlank() }

        val elements = Elements(paragraphs)
        return FormattedPostContent(elements.outerHtml().replace("\n", ""), printHtml(elements))
    }

    /**
     * Splits [paragraph] into several paragraphs wherever two or more `<br>` elements occur
     * back to back. The breaks forming such a run are discarded; a single `<br>` stays where
     * it is. A run at the very end of the paragraph has nothing to separate and is kept.
     */
    private fun splitAtBlankLines(paragraph: Element): List<Element> {
        // Snapshot for the same reason as in format(): appendChildren() below removes the
        // nodes from `paragraph`, which would otherwise shift the indices being tracked.
        val nodes = paragraph.childNodes().toList()
        val segments = mutableListOf<Element>()
        var segmentStart = 0
        var brRun = 0

        for ((index, node) in nodes.withIndex()) {
            if (node is Element && node.tagName() == "br") {
                brRun++
                continue
            }
            if (brRun >= 2) {
                val segmentEnd = index - brRun
                // A run at the very start leaves nothing in front of it; skip the empty segment.
                if (segmentEnd > segmentStart) {
                    segments.add(Element("p").appendChildren(nodes.subList(segmentStart, segmentEnd)))
                }
                segmentStart = index
            }
            // The run ended at this node; only *consecutive* breaks may add up to a split.
            brRun = 0
        }

        segments.add(Element("p").appendChildren(nodes.subList(segmentStart, nodes.size)))
        return segments
    }

    /**
     * Builds a new `<p>` from the children of [element]: text nodes are kept, `<a>` is rebuilt
     * through [a], `<br>` is recreated without attributes, any other element is reduced to its
     * text, and remaining node types are dropped.
     */
    private fun p(element: Element): Element {
        val childNodes = element.childNodes()
        if (childNodes.size == 1 && childNodes.first() is TextNode) {
            return Element("p").appendText(element.text())
        }

        val flattened = childNodes.mapNotNull { node ->
            when (node) {
                is TextNode -> node
                is Element -> when (node.tagName()) {
                    "a" -> a(node)
                    "br" -> Element("br")
                    else -> TextNode(node.text())
                }

                else -> null
            }
        }
        return Element("p").appendChildren(flattened)
    }

    /**
     * Rebuilds an anchor that carries only the `href` of [element] and its text content.
     * An anchor without `href` yields an `<a>` without attributes instead of failing.
     */
    private fun a(element: Element): Element {
        val attributes = Attributes()
        if (element.hasAttr("href")) {
            // NOTE(review): the href is copied verbatim; its scheme (e.g. `javascript:`) is not
            // validated here. Confirm whether that is handled elsewhere.
            attributes.put("href", element.attr("href"))
        }
        return Element(Tag.valueOf("a"), "", attributes).appendText(element.text())
    }

    /**
     * Renders [element] as plain text: paragraphs are joined with a blank line, `<br>` becomes
     * a line feed, and every other element or text node contributes its text.
     */
    private fun printHtml(element: Elements): String =
        element.joinToString("\n\n") { paragraph ->
            paragraph.childNodes().joinToString("") { node ->
                when {
                    node is Element && node.tagName() == "br" -> "\n"
                    node is Element -> node.text()
                    node is TextNode -> node.text()
                    else -> ""
                }
            }
        }
}

View File

@ -0,0 +1,6 @@
package dev.usbharu.hideout.core.service.post
/**
 * Result of formatting a post body.
 *
 * @property html the formatted HTML representation.
 * @property content the plain-text representation of the same post.
 */
data class FormattedPostContent(
    val html: String,
    val content: String,
)

View File

@ -0,0 +1,5 @@
package dev.usbharu.hideout.core.service.post
/**
 * Converts the raw body of a post into a [FormattedPostContent].
 */
interface PostContentFormatter {

    /**
     * Formats the given post body.
     *
     * @param content the raw post body to format.
     * @return the formatted representation of [content].
     */
    suspend fun format(content: String): FormattedPostContent
}

View File

@ -0,0 +1,117 @@
package dev.usbharu.hideout.core.service.post
import kotlinx.coroutines.test.runTest
import org.assertj.core.api.Assertions.assertThat
import org.junit.jupiter.api.Test
/**
 * Behavioural tests for [DefaultPostContentFormatter]. Every case formats one HTML snippet
 * with a fresh formatter and compares the whole [FormattedPostContent] against an expectation.
 */
class DefaultPostContentFormatterTest {

    /** Runs [html] through a newly created formatter. */
    private suspend fun formatWithNewFormatter(html: String): FormattedPostContent =
        DefaultPostContentFormatter().format(html)

    // A <p> element stays a <p> element.
    @Test
    fun pタグがpタグになる() = runTest {
        //language=HTML
        val input = """<p>hoge</p>"""
        val expected = FormattedPostContent("<p>hoge</p>", "hoge")

        assertThat(formatWithNewFormatter(input)).isEqualTo(expected)
    }

    // A heading element is turned into a <p> element.
    @Test
    fun hタグがpタグになる() = runTest {
        //language=HTML
        val input = """<h1>hoge</h1>"""
        val expected = FormattedPostContent("<p>hoge</p>", "hoge")

        assertThat(formatWithNewFormatter(input)).isEqualTo(expected)
    }

    // Nested <p> elements are flattened into sibling paragraphs.
    @Test
    fun pタグのネストは破棄される() = runTest {
        //language=HTML
        val input = """<p>hoge<p>fuga</p>piyo</p>"""
        val expected = FormattedPostContent("<p>hoge</p><p>fuga</p><p>piyo</p>", "hoge\n\nfuga\n\npiyo")

        assertThat(formatWithNewFormatter(input)).isEqualTo(expected)
    }

    // A <span> wrapper is dropped and only its text is kept.
    @Test
    fun spanタグは無視される() = runTest {
        //language=HTML
        val input = """<p><span>hoge</span></p>"""
        val expected = FormattedPostContent("<p>hoge</p>", "hoge")

        assertThat(formatWithNewFormatter(input)).isEqualTo(expected)
    }

    // Two consecutive line breaks are converted into a paragraph boundary.
    @Test
    fun `2連続改行は段落に変換される`() = runTest {
        //language=HTML
        val input = """<p>hoge<br><br>fuga</p>"""
        val expected = FormattedPostContent("<p>hoge</p><p>fuga</p>", "hoge\n\nfuga")

        assertThat(formatWithNewFormatter(input)).isEqualTo(expected)
    }

    // An <i> wrapper is dropped and only its text is kept.
    @Test
    fun iタグは無視される() = runTest {
        //language=HTML
        val input = """<p><i>hoge</i></p>"""
        val expected = FormattedPostContent("<p>hoge</p>", "hoge")

        assertThat(formatWithNewFormatter(input)).isEqualTo(expected)
    }

    // Only the href attribute of an <a> element is carried over.
    @Test
    fun aタグはhrefの中身のみ引き継がれる() = runTest {
        //language=HTML
        val input = """<p><a href='https://example.com' class='u-url' target='_blank'>hoge</a></p>"""
        val expected = FormattedPostContent("<p><a href=\"https://example.com\">hoge</a></p>", "hoge")

        assertThat(formatWithNewFormatter(input)).isEqualTo(expected)
    }

    // A <span> inside an <a> element is dropped and only its text is kept.
    @Test
    fun aタグの中のspanは無視される() = runTest {
        //language=HTML
        val input = """<p><a href='https://example.com'><span>hoge</span></a></p>"""
        val expected = FormattedPostContent("<p><a href=\"https://example.com\">hoge</a></p>", "hoge")

        assertThat(formatWithNewFormatter(input)).isEqualTo(expected)
    }

    // A single <br> becomes a line feed in the plain-text content.
    @Test
    fun brタグのコンテンツは改行になる() = runTest {
        //language=HTML
        val input = """<p>hoge<br>fuga</p>"""
        val expected = FormattedPostContent("<p>hoge<br> fuga</p>", "hoge\nfuga")

        assertThat(formatWithNewFormatter(input)).isEqualTo(expected)
    }

    // Bare text without any element is wrapped in a <p> element.
    @Test
    fun いきなりテキストが来たらpタグで囲む() = runTest {
        //language=HTML
        val input = """hoge"""
        val expected = FormattedPostContent("<p>hoge</p>", "hoge")

        assertThat(formatWithNewFormatter(input)).isEqualTo(expected)
    }

    // A stray body tag in the input is removed.
    @Test
    fun bodyタグが含まれていた場合消す() = runTest {
        //language=HTML
        val input = """</body><p>hoge</p>"""
        val expected = FormattedPostContent("<p>hoge</p>", "hoge")

        assertThat(formatWithNewFormatter(input)).isEqualTo(expected)
    }
}