diff --git a/bun.lockb b/bun.lockb index 960e3db..a0d3687 100755 Binary files a/bun.lockb and b/bun.lockb differ diff --git a/package.json b/package.json index a85dee4..c1b5df2 100644 --- a/package.json +++ b/package.json @@ -1,5 +1,5 @@ { - "name": "email-comments", + "name": "chorus", "module": "index.ts", "type": "module", "devDependencies": { @@ -10,9 +10,13 @@ }, "dependencies": { "@types/imapflow": "^1.0.19", + "@types/jsdom": "^21.1.7", "@types/mailparser": "^3.4.5", "imapflow": "^1.0.171", + "isomorphic-dompurify": "^2.19.0", + "jsdom": "^25.0.1", "letterparser": "^0.1.8", + "marked": "^15.0.5", "zod": "^3.24.1" } -} \ No newline at end of file +} diff --git a/src/mail/read.ts b/src/mail/read.ts index e4456fb..3f4faf7 100644 --- a/src/mail/read.ts +++ b/src/mail/read.ts @@ -9,6 +9,7 @@ import {collect, Collection} from "../bones/collection/Collection.ts"; import {AsyncCollection} from "../bones/collection/AsyncCollection.ts"; import {withClient} from "./client.ts"; import {buildThreadAddressMatcher} from "../threads/id.ts"; +import {htmlReplyPipeline} from "./sanitize.ts"; export async function getMailboxesToSearch(thread?: string, client?: ImapFlow): Promise> { // There are 2 possibilities for where mail might end up. 
@@ -112,6 +113,7 @@ export class MailboxIterable extends Iterable { subject: message.envelope.subject, mailbox: this.mailbox, modseq: message.modseq, + html: htmlReplyPipeline.apply(source), recipients, content, thread, diff --git a/src/mail/replies.ts b/src/mail/replies.ts index c40b648..ecc2a44 100644 --- a/src/mail/replies.ts +++ b/src/mail/replies.ts @@ -50,7 +50,7 @@ export class Email { this.foundVisible = false; let fragment: Fragment | null = null; - const lines = modifiedText.split("\n"); + const lines = modifiedText.split(/\n|>\/?\s*[rR][bB]"); diff --git a/src/mail/sanitize.ts b/src/mail/sanitize.ts new file mode 100644 index 0000000..982aae1 --- /dev/null +++ b/src/mail/sanitize.ts @@ -0,0 +1,72 @@ +import {Pipeline} from "../bones/Pipe.ts"; +import {extract} from "letterparser"; +import {ReplyParser} from "./replies.ts"; +import DOMPurify from 'isomorphic-dompurify' +import { JSDOM } from 'jsdom' + + +export const allowedTags = [ + 'li', 'ol', 'ul', 'ul', 'b', 'br', 'code', 'em', + 'i', 'small', 'strong', 'sub', 'sup', 'u', +] + +export const lineBreakingTags = ['p', 'div'] + +export const sanitizeHtml = (html: string): string => + DOMPurify((new JSDOM('')).window).sanitize(html, { ALLOWED_TAGS: allowedTags }) + +/** + * Transforms an HTML email content string by sanitizing the HTML + * and stripping out quotes and signatures as best as possible. + * This is imperfect and currently results in too many line breaks + * in some cases, but it works okay enough. + */ +export const htmlReplyPipeline = Pipeline.id() + // extract the HTML from the email source + .tap(source => extract(source).html || '') + + // here it gets weird -- the reply parser expects to work in terms of newlines and
<br> tags + // however a lot of email uses <p></p> or <div></div>. Do a pass to insert an artificial
<br> at the beginning + // of each <p> or <div> tag + .tap(html => { + const window = (new JSDOM('')).window + const dp = DOMPurify(window) + + dp.addHook('uponSanitizeElement', node => { + if ( node.nodeType !== window.Node.ELEMENT_NODE ) { + // Skip text/document nodes, e.g. + return + } + + if ( node.textContent === '' && node.nodeName.toLowerCase() !== 'br' ) { + // Drop empty nodes as long as they're not self-closing + node.parentNode?.removeChild(node) + return + } + + if ( lineBreakingTags.includes(node.nodeName.toLowerCase()) ) { + // If a wrapping tag would cause a line break, explicitly add in that break + const br = window.document.createElement('br') + const child = node.firstChild + child ? node.insertBefore(br, child) : node.appendChild(br) + return + } + }) + + const result = dp.sanitize(html, { + ALLOWED_TAGS: [...allowedTags, ...lineBreakingTags], + }) + + dp.removeHook('uponSanitizeElement') + + return result + }) + + // then, do a second pass which filters out the <p> and <div> tags, leaving the <br> in their place + .tap(html => sanitizeHtml(html)) + + // run that through the reply parser to strip out quotes and signatures + .tap(html => ReplyParser.parseReply(html)) + + // finally, clean it up and add the
<br>'s back in since the reply parser replaced them with \n + .tap(reply => reply.trim().replace(/\n+/g, '<br>
')) diff --git a/src/threads/refresh.ts b/src/threads/refresh.ts index 3d7613e..86b1d97 100644 --- a/src/threads/refresh.ts +++ b/src/threads/refresh.ts @@ -4,6 +4,8 @@ import type {Message, ThreadData} from "../types.ts"; import {AsyncCollection} from "../bones/collection/AsyncCollection.ts"; import {sha256} from "../bones/crypto.ts"; import {config} from "../config.ts"; +import { marked } from "marked"; +import {sanitizeHtml} from "../mail/sanitize.ts"; export async function refreshThreadsEntirely(): Promise { await withClient(async client => { @@ -51,12 +53,13 @@ export async function refreshThreadsEntirely(): Promise { threadData.comments.push({ user: { name: message.from.name || '(anonymous)', - mailId: sha256(message.from.address!), - domainId: sha256(message.from.address!.split('@').reverse()[0]), + mailId: sha256(message.from.address!.toLowerCase()), + domainId: sha256(message.from.address!.toLowerCase().split('@').reverse()[0]), }, date: message.date, subject: message.subject, text: message.content, + rendered: message.html || sanitizeHtml(await marked(message.content)), }) } diff --git a/src/types.ts b/src/types.ts index a5058d9..2b1e1f6 100644 --- a/src/types.ts +++ b/src/types.ts @@ -38,6 +38,7 @@ export type Message = { }, subject: string, content: string, + html?: string, mailbox: string, modseq: BigInt, thread?: string, @@ -54,6 +55,7 @@ export type ThreadComment = { date: Date, subject: string, text: string, + rendered: string, } export type ThreadData = {