Implement first draft of HTML message/reply parsing + HTML sanitization

This commit is contained in:
Garrett Mills 2025-01-04 04:43:39 -05:00
parent bfe94aa2fe
commit 0460771d5d
7 changed files with 88 additions and 5 deletions

BIN
bun.lockb

Binary file not shown.

View File

@ -1,5 +1,5 @@
{
"name": "email-comments",
"name": "chorus",
"module": "index.ts",
"type": "module",
"devDependencies": {
@ -10,9 +10,13 @@
},
"dependencies": {
"@types/imapflow": "^1.0.19",
"@types/jsdom": "^21.1.7",
"@types/mailparser": "^3.4.5",
"imapflow": "^1.0.171",
"isomorphic-dompurify": "^2.19.0",
"jsdom": "^25.0.1",
"letterparser": "^0.1.8",
"marked": "^15.0.5",
"zod": "^3.24.1"
}
}

View File

@ -9,6 +9,7 @@ import {collect, Collection} from "../bones/collection/Collection.ts";
import {AsyncCollection} from "../bones/collection/AsyncCollection.ts";
import {withClient} from "./client.ts";
import {buildThreadAddressMatcher} from "../threads/id.ts";
import {htmlReplyPipeline} from "./sanitize.ts";
export async function getMailboxesToSearch(thread?: string, client?: ImapFlow): Promise<Collection<string>> {
// There are 2 possibilities for where mail might end up.
@ -112,6 +113,7 @@ export class MailboxIterable extends Iterable<Message> {
subject: message.envelope.subject,
mailbox: this.mailbox,
modseq: message.modseq,
html: htmlReplyPipeline.apply(source),
recipients,
content,
thread,

View File

@ -50,7 +50,7 @@ export class Email {
this.foundVisible = false;
let fragment: Fragment | null = null;
const lines = modifiedText.split("\n");
const lines = modifiedText.split(/\n|>\/?\s*[rR][bB]</);
for (const line of lines) {
const processedLine = line.trimEnd();
const isQuoted = processedLine.endsWith(">");

72
src/mail/sanitize.ts Normal file
View File

@ -0,0 +1,72 @@
import {Pipeline} from "../bones/Pipe.ts";
import {extract} from "letterparser";
import {ReplyParser} from "./replies.ts";
import DOMPurify from 'isomorphic-dompurify'
import { JSDOM } from 'jsdom'
/**
 * Tag names that are passed to DOMPurify as `ALLOWED_TAGS` when sanitizing
 * HTML in this module. Each tag is listed exactly once.
 */
export const allowedTags = [
    'li', 'ol', 'ul', 'b', 'br', 'code', 'em',
    'i', 'small', 'strong', 'sub', 'sup', 'u',
]

/**
 * Wrapper tags that are additionally allowed during the first sanitization
 * pass of the reply pipeline, where each one gets an explicit `<br>` inserted
 * as its first child. They are not part of `allowedTags`.
 */
export const lineBreakingTags = ['p', 'div']
/**
 * DOMPurify instance reused by `sanitizeHtml`. It is created on first use so
 * that importing this module does not construct a jsdom window, and so that
 * repeated calls do not each allocate a new window that is never closed.
 */
let sharedPurifier: ReturnType<typeof DOMPurify> | undefined

/**
 * Sanitizes an HTML string, keeping only the tags listed in `allowedTags`.
 *
 * @param html - the HTML markup to sanitize
 * @returns the sanitized markup as a string
 */
export const sanitizeHtml = (html: string): string => {
    // No hooks are ever registered on this instance, so it is safe to share
    // between calls; the per-call config is passed to `sanitize` each time.
    sharedPurifier ??= DOMPurify(new JSDOM('').window)
    return sharedPurifier.sanitize(html, { ALLOWED_TAGS: allowedTags })
}
/**
 * First sanitization pass of the reply pipeline.
 *
 * Sanitizes `html` while allowing both `allowedTags` and `lineBreakingTags`.
 * While DOMPurify walks the tree, a hook removes elements whose text content
 * is empty (except `<br>`) and inserts a `<br>` as the first child of every
 * line-breaking wrapper tag.
 *
 * @param html - raw HTML extracted from the email
 * @returns sanitized HTML with explicit `<br>` markers inside wrapper tags
 */
function insertWrapperBreaks(html: string): string {
    const { window } = new JSDOM('')
    const purifier = DOMPurify(window)

    purifier.addHook('uponSanitizeElement', node => {
        // The hook also fires for non-element nodes (text, document, ...); ignore those.
        if ( node.nodeType !== window.Node.ELEMENT_NODE ) {
            return
        }

        const tagName = node.nodeName.toLowerCase()

        // Elements without any text are dropped, but <br> never has text and must survive.
        if ( tagName !== 'br' && node.textContent === '' ) {
            node.parentNode?.removeChild(node)
            return
        }

        if ( lineBreakingTags.includes(tagName) ) {
            // insertBefore with a null reference node appends, so this covers
            // both the "has children" and the "no children" case.
            node.insertBefore(window.document.createElement('br'), node.firstChild)
        }
    })

    const withBreaks = purifier.sanitize(html, {
        ALLOWED_TAGS: allowedTags.concat(lineBreakingTags),
    })
    purifier.removeHook('uponSanitizeElement')
    return withBreaks
}

/**
 * Pipeline that turns a raw email source string into sanitized reply HTML,
 * with quotes and signatures stripped on a best-effort basis.
 *
 * The result is imperfect: in some cases it currently contains more line
 * breaks than the original message, but it is good enough for now.
 */
export const htmlReplyPipeline = Pipeline.id<string>()
    // 1. Pull the HTML part out of the email source (empty string if there is none).
    .tap(rawSource => extract(rawSource).html || '')
    // 2. The reply parser works in terms of newlines and <br> tags, but a lot of
    //    email uses wrapper tags like <p>. Mark each wrapper with an artificial <br>.
    .tap(emailHtml => insertWrapperBreaks(emailHtml))
    // 3. Sanitize again without the wrapper tags allowed, leaving the <br> markers behind.
    .tap(markedHtml => sanitizeHtml(markedHtml))
    // 4. Strip quoted text and signatures.
    .tap(cleanHtml => ReplyParser.parseReply(cleanHtml))
    // 5. The reply parser hands back newlines where the <br> tags were; trim the
    //    result and turn each run of newlines back into a single <br>.
    .tap(replyText => replyText.trim().replace(/\n+/g, '<br>'))

View File

@ -4,6 +4,8 @@ import type {Message, ThreadData} from "../types.ts";
import {AsyncCollection} from "../bones/collection/AsyncCollection.ts";
import {sha256} from "../bones/crypto.ts";
import {config} from "../config.ts";
import { marked } from "marked";
import {sanitizeHtml} from "../mail/sanitize.ts";
export async function refreshThreadsEntirely(): Promise<void> {
await withClient(async client => {
@ -51,12 +53,13 @@ export async function refreshThreadsEntirely(): Promise<void> {
threadData.comments.push({
user: {
name: message.from.name || '(anonymous)',
mailId: sha256(message.from.address!),
domainId: sha256(message.from.address!.split('@').reverse()[0]),
mailId: sha256(message.from.address!.toLowerCase()),
domainId: sha256(message.from.address!.toLowerCase().split('@').reverse()[0]),
},
date: message.date,
subject: message.subject,
text: message.content,
rendered: message.html || sanitizeHtml(await marked(message.content)),
})
}

View File

@ -38,6 +38,7 @@ export type Message = {
},
subject: string,
content: string,
html?: string,
mailbox: string,
modseq: BigInt,
thread?: string,
@ -54,6 +55,7 @@ export type ThreadComment = {
date: Date,
subject: string,
text: string,
rendered: string,
}
export type ThreadData = {