Implement first draft of HTML message/reply parsing and HTML sanitization
This commit is contained in:
parent
bfe94aa2fe
commit
0460771d5d
@ -1,5 +1,5 @@
|
||||
{
|
||||
"name": "email-comments",
|
||||
"name": "chorus",
|
||||
"module": "index.ts",
|
||||
"type": "module",
|
||||
"devDependencies": {
|
||||
@ -10,9 +10,13 @@
|
||||
},
|
||||
"dependencies": {
|
||||
"@types/imapflow": "^1.0.19",
|
||||
"@types/jsdom": "^21.1.7",
|
||||
"@types/mailparser": "^3.4.5",
|
||||
"imapflow": "^1.0.171",
|
||||
"isomorphic-dompurify": "^2.19.0",
|
||||
"jsdom": "^25.0.1",
|
||||
"letterparser": "^0.1.8",
|
||||
"marked": "^15.0.5",
|
||||
"zod": "^3.24.1"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -9,6 +9,7 @@ import {collect, Collection} from "../bones/collection/Collection.ts";
|
||||
import {AsyncCollection} from "../bones/collection/AsyncCollection.ts";
|
||||
import {withClient} from "./client.ts";
|
||||
import {buildThreadAddressMatcher} from "../threads/id.ts";
|
||||
import {htmlReplyPipeline} from "./sanitize.ts";
|
||||
|
||||
export async function getMailboxesToSearch(thread?: string, client?: ImapFlow): Promise<Collection<string>> {
|
||||
// There are 2 possibilities for where mail might end up.
|
||||
@ -112,6 +113,7 @@ export class MailboxIterable extends Iterable<Message> {
|
||||
subject: message.envelope.subject,
|
||||
mailbox: this.mailbox,
|
||||
modseq: message.modseq,
|
||||
html: htmlReplyPipeline.apply(source),
|
||||
recipients,
|
||||
content,
|
||||
thread,
|
||||
|
@ -50,7 +50,7 @@ export class Email {
|
||||
this.foundVisible = false;
|
||||
let fragment: Fragment | null = null;
|
||||
|
||||
const lines = modifiedText.split("\n");
|
||||
const lines = modifiedText.split(/\n|>\/?\s*[rR][bB]</);
|
||||
for (const line of lines) {
|
||||
const processedLine = line.trimEnd();
|
||||
const isQuoted = processedLine.endsWith(">");
|
||||
|
72
src/mail/sanitize.ts
Normal file
72
src/mail/sanitize.ts
Normal file
@ -0,0 +1,72 @@
|
||||
import {Pipeline} from "../bones/Pipe.ts";
|
||||
import {extract} from "letterparser";
|
||||
import {ReplyParser} from "./replies.ts";
|
||||
import DOMPurify from 'isomorphic-dompurify'
|
||||
import { JSDOM } from 'jsdom'
|
||||
|
||||
|
||||
/**
 * Tag names passed to DOMPurify as `ALLOWED_TAGS` when sanitizing
 * message HTML. Each tag is listed exactly once.
 */
export const allowedTags: string[] = [
    'li', 'ol', 'ul', 'b', 'br', 'code', 'em',
    'i', 'small', 'strong', 'sub', 'sup', 'u',
]
|
||||
|
||||
/**
 * Wrapping tags that are treated as introducing a line break when
 * HTML replies are normalized.
 */
export const lineBreakingTags: string[] = [
    'p',
    'div',
]
|
||||
|
||||
/**
 * Sanitizes an HTML string with a DOMPurify instance bound to a fresh
 * JSDOM window, passing `allowedTags` as the `ALLOWED_TAGS` option.
 *
 * @param html - the HTML string to sanitize
 * @returns the sanitized HTML string
 */
export const sanitizeHtml = (html: string): string => {
    const { window } = new JSDOM('')
    const purifier = DOMPurify(window)
    return purifier.sanitize(html, { ALLOWED_TAGS: allowedTags })
}
|
||||
|
||||
/**
 * Pipeline that turns a raw email source string into sanitized reply HTML,
 * stripping quoted text and signatures on a best-effort basis.
 *
 * Known limitation: the output can contain more line breaks than the
 * original message in some cases.
 */
export const htmlReplyPipeline = Pipeline.id<string>()
    // Stage 1: pull the HTML body out of the email source ('' when it has none)
    .tap(source => extract(source).html || '')

    // Stage 2: the reply parser works in terms of newlines and <br> tags, but a
    // lot of email is laid out with <p>/<div>. Sanitize once with those wrapper
    // tags still allowed, inserting an artificial <br> at the start of each one.
    .tap(html => {
        const { window } = new JSDOM('')
        const purifier = DOMPurify(window)

        purifier.addHook('uponSanitizeElement', node => {
            // Only element nodes are processed; any other node type is left alone
            if ( node.nodeType !== window.Node.ELEMENT_NODE ) return

            const tagName = node.nodeName.toLowerCase()

            // Elements with no text content are dropped; <br> is the one exception
            if ( node.textContent === '' && tagName !== 'br' ) {
                node.parentNode?.removeChild(node)
                return
            }

            if ( !lineBreakingTags.includes(tagName) ) return

            // Make the line break implied by the wrapping tag explicit
            const lineBreak = window.document.createElement('br')
            const first = node.firstChild
            if ( first ) {
                node.insertBefore(lineBreak, first)
            } else {
                node.appendChild(lineBreak)
            }
        })

        const sanitized = purifier.sanitize(html, {
            ALLOWED_TAGS: [...allowedTags, ...lineBreakingTags],
        })

        purifier.removeHook('uponSanitizeElement')

        return sanitized
    })

    // Stage 3: sanitize again without the wrapper tags, so only the inserted <br>s remain
    .tap(withBreaks => sanitizeHtml(withBreaks))

    // Stage 4: let the reply parser strip out quotes and signatures
    .tap(flattened => ReplyParser.parseReply(flattened))

    // Stage 5: trim, then turn each run of \n left by the reply parser back into a <br>
    .tap(replyText => replyText.trim().replace(/\n+/g, '<br>'))
|
@ -4,6 +4,8 @@ import type {Message, ThreadData} from "../types.ts";
|
||||
import {AsyncCollection} from "../bones/collection/AsyncCollection.ts";
|
||||
import {sha256} from "../bones/crypto.ts";
|
||||
import {config} from "../config.ts";
|
||||
import { marked } from "marked";
|
||||
import {sanitizeHtml} from "../mail/sanitize.ts";
|
||||
|
||||
export async function refreshThreadsEntirely(): Promise<void> {
|
||||
await withClient(async client => {
|
||||
@ -51,12 +53,13 @@ export async function refreshThreadsEntirely(): Promise<void> {
|
||||
threadData.comments.push({
|
||||
user: {
|
||||
name: message.from.name || '(anonymous)',
|
||||
mailId: sha256(message.from.address!),
|
||||
domainId: sha256(message.from.address!.split('@').reverse()[0]),
|
||||
mailId: sha256(message.from.address!.toLowerCase()),
|
||||
domainId: sha256(message.from.address!.toLowerCase().split('@').reverse()[0]),
|
||||
},
|
||||
date: message.date,
|
||||
subject: message.subject,
|
||||
text: message.content,
|
||||
rendered: message.html || sanitizeHtml(await marked(message.content)),
|
||||
})
|
||||
}
|
||||
|
||||
|
@ -38,6 +38,7 @@ export type Message = {
|
||||
},
|
||||
subject: string,
|
||||
content: string,
|
||||
html?: string,
|
||||
mailbox: string,
|
||||
modseq: BigInt,
|
||||
thread?: string,
|
||||
@ -54,6 +55,7 @@ export type ThreadComment = {
|
||||
date: Date,
|
||||
subject: string,
|
||||
text: string,
|
||||
rendered: string,
|
||||
}
|
||||
|
||||
export type ThreadData = {
|
||||
|
Loading…
Reference in New Issue
Block a user