Implement first-draft of HTML message/reply parsing + HTML sanitization
This commit is contained in:
parent
bfe94aa2fe
commit
0460771d5d
@ -1,5 +1,5 @@
|
|||||||
{
|
{
|
||||||
"name": "email-comments",
|
"name": "chorus",
|
||||||
"module": "index.ts",
|
"module": "index.ts",
|
||||||
"type": "module",
|
"type": "module",
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
@ -10,9 +10,13 @@
|
|||||||
},
|
},
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@types/imapflow": "^1.0.19",
|
"@types/imapflow": "^1.0.19",
|
||||||
|
"@types/jsdom": "^21.1.7",
|
||||||
"@types/mailparser": "^3.4.5",
|
"@types/mailparser": "^3.4.5",
|
||||||
"imapflow": "^1.0.171",
|
"imapflow": "^1.0.171",
|
||||||
|
"isomorphic-dompurify": "^2.19.0",
|
||||||
|
"jsdom": "^25.0.1",
|
||||||
"letterparser": "^0.1.8",
|
"letterparser": "^0.1.8",
|
||||||
|
"marked": "^15.0.5",
|
||||||
"zod": "^3.24.1"
|
"zod": "^3.24.1"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -9,6 +9,7 @@ import {collect, Collection} from "../bones/collection/Collection.ts";
|
|||||||
import {AsyncCollection} from "../bones/collection/AsyncCollection.ts";
|
import {AsyncCollection} from "../bones/collection/AsyncCollection.ts";
|
||||||
import {withClient} from "./client.ts";
|
import {withClient} from "./client.ts";
|
||||||
import {buildThreadAddressMatcher} from "../threads/id.ts";
|
import {buildThreadAddressMatcher} from "../threads/id.ts";
|
||||||
|
import {htmlReplyPipeline} from "./sanitize.ts";
|
||||||
|
|
||||||
export async function getMailboxesToSearch(thread?: string, client?: ImapFlow): Promise<Collection<string>> {
|
export async function getMailboxesToSearch(thread?: string, client?: ImapFlow): Promise<Collection<string>> {
|
||||||
// There are 2 possibilities for where mail might end up.
|
// There are 2 possibilities for where mail might end up.
|
||||||
@ -112,6 +113,7 @@ export class MailboxIterable extends Iterable<Message> {
|
|||||||
subject: message.envelope.subject,
|
subject: message.envelope.subject,
|
||||||
mailbox: this.mailbox,
|
mailbox: this.mailbox,
|
||||||
modseq: message.modseq,
|
modseq: message.modseq,
|
||||||
|
html: htmlReplyPipeline.apply(source),
|
||||||
recipients,
|
recipients,
|
||||||
content,
|
content,
|
||||||
thread,
|
thread,
|
||||||
|
@ -50,7 +50,7 @@ export class Email {
|
|||||||
this.foundVisible = false;
|
this.foundVisible = false;
|
||||||
let fragment: Fragment | null = null;
|
let fragment: Fragment | null = null;
|
||||||
|
|
||||||
const lines = modifiedText.split("\n");
|
const lines = modifiedText.split(/\n|>\/?\s*[rR][bB]</);
|
||||||
for (const line of lines) {
|
for (const line of lines) {
|
||||||
const processedLine = line.trimEnd();
|
const processedLine = line.trimEnd();
|
||||||
const isQuoted = processedLine.endsWith(">");
|
const isQuoted = processedLine.endsWith(">");
|
||||||
|
72
src/mail/sanitize.ts
Normal file
72
src/mail/sanitize.ts
Normal file
@ -0,0 +1,72 @@
|
|||||||
|
import {Pipeline} from "../bones/Pipe.ts";
|
||||||
|
import {extract} from "letterparser";
|
||||||
|
import {ReplyParser} from "./replies.ts";
|
||||||
|
import DOMPurify from 'isomorphic-dompurify'
|
||||||
|
import { JSDOM } from 'jsdom'
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Tags that survive sanitization: list markup plus basic inline formatting.
 * Passed to DOMPurify as `ALLOWED_TAGS`; any tag not listed here is not allowed.
 */
export const allowedTags = [
    'li', 'ol', 'ul', 'b', 'br', 'code', 'em',
    'i', 'small', 'strong', 'sub', 'sup', 'u',
]
|
||||||
|
|
||||||
|
/**
 * Wrapping tags treated as implying a line break. `htmlReplyPipeline` allows
 * these through its first sanitization pass and inserts an explicit `<br>` at
 * the start of each one.
 */
export const lineBreakingTags: string[] = ["p", "div"];
|
||||||
|
|
||||||
|
/**
 * Purifier shared by every `sanitizeHtml` call. Created on first use so that
 * importing this module does not construct a JSDOM window.
 */
let sharedPurifier: ReturnType<typeof DOMPurify> | undefined

/**
 * Sanitizes an HTML string so that only the tags in `allowedTags` remain.
 *
 * The DOMPurify instance (and the JSDOM window backing it) is built once and
 * reused, rather than constructing a new JSDOM document on every call. No hooks
 * are registered on this instance and the config is passed per call, so reuse
 * does not carry state between calls.
 *
 * @param html the HTML to sanitize
 * @returns the sanitized HTML string
 */
export const sanitizeHtml = (html: string): string => {
    if ( !sharedPurifier ) {
        sharedPurifier = DOMPurify((new JSDOM('')).window)
    }

    return sharedPurifier.sanitize(html, { ALLOWED_TAGS: allowedTags })
}
|
||||||
|
|
||||||
|
/**
 * First sanitization pass: keeps the allowed tags plus the line-breaking
 * wrappers, drops elements with no text content, and puts an explicit <br>
 * at the start of every line-breaking wrapper.
 *
 * A dedicated DOMPurify instance is used so the hook registered here does not
 * affect any other purifier.
 */
const insertExplicitBreaks = (html: string): string => {
    const { window } = new JSDOM('')
    const purifier = DOMPurify(window)

    purifier.addHook('uponSanitizeElement', node => {
        // Text/document nodes etc. are left alone; only elements are inspected
        if ( node.nodeType !== window.Node.ELEMENT_NODE ) {
            return
        }

        const tagName = node.nodeName.toLowerCase()

        // An element without any text is dropped, unless it is a <br>
        if ( node.textContent === '' && tagName !== 'br' ) {
            node.parentNode?.removeChild(node)
            return
        }

        if ( !lineBreakingTags.includes(tagName) ) {
            return
        }

        // The wrapper would cause a line break, so make that break explicit
        const br = window.document.createElement('br')
        const firstChild = node.firstChild
        if ( firstChild ) {
            node.insertBefore(br, firstChild)
        } else {
            node.appendChild(br)
        }
    })

    const withBreaks = purifier.sanitize(html, {
        ALLOWED_TAGS: [...allowedTags, ...lineBreakingTags],
    })

    purifier.removeHook('uponSanitizeElement')

    return withBreaks
}

/**
 * Turns a raw email source into sanitized reply HTML, removing quoted text and
 * signatures on a best-effort basis.
 *
 * Known limitation: the output can contain more line breaks than the original
 * message in some cases.
 */
export const htmlReplyPipeline = Pipeline.id<string>()
    // pull the HTML part out of the email source (empty string when there is none)
    .tap(source => extract(source).html || '')

    // the reply parser works in terms of newlines and <br> tags, but a lot of
    // email uses <p>, so first mark each wrapping tag with an artificial <br>
    .tap(html => insertExplicitBreaks(html))

    // second pass: filter out the wrapping tags, leaving the <br> in their place
    .tap(html => sanitizeHtml(html))

    // strip quotes and signatures with the reply parser
    .tap(html => ReplyParser.parseReply(html))

    // clean up, turning each run of newlines left by the reply parser into one <br>
    .tap(reply => reply.trim().replace(/\n+/g, '<br>'))
|
@ -4,6 +4,8 @@ import type {Message, ThreadData} from "../types.ts";
|
|||||||
import {AsyncCollection} from "../bones/collection/AsyncCollection.ts";
|
import {AsyncCollection} from "../bones/collection/AsyncCollection.ts";
|
||||||
import {sha256} from "../bones/crypto.ts";
|
import {sha256} from "../bones/crypto.ts";
|
||||||
import {config} from "../config.ts";
|
import {config} from "../config.ts";
|
||||||
|
import { marked } from "marked";
|
||||||
|
import {sanitizeHtml} from "../mail/sanitize.ts";
|
||||||
|
|
||||||
export async function refreshThreadsEntirely(): Promise<void> {
|
export async function refreshThreadsEntirely(): Promise<void> {
|
||||||
await withClient(async client => {
|
await withClient(async client => {
|
||||||
@ -51,12 +53,13 @@ export async function refreshThreadsEntirely(): Promise<void> {
|
|||||||
threadData.comments.push({
|
threadData.comments.push({
|
||||||
user: {
|
user: {
|
||||||
name: message.from.name || '(anonymous)',
|
name: message.from.name || '(anonymous)',
|
||||||
mailId: sha256(message.from.address!),
|
mailId: sha256(message.from.address!.toLowerCase()),
|
||||||
domainId: sha256(message.from.address!.split('@').reverse()[0]),
|
domainId: sha256(message.from.address!.toLowerCase().split('@').reverse()[0]),
|
||||||
},
|
},
|
||||||
date: message.date,
|
date: message.date,
|
||||||
subject: message.subject,
|
subject: message.subject,
|
||||||
text: message.content,
|
text: message.content,
|
||||||
|
rendered: message.html || sanitizeHtml(await marked(message.content)),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -38,6 +38,7 @@ export type Message = {
|
|||||||
},
|
},
|
||||||
subject: string,
|
subject: string,
|
||||||
content: string,
|
content: string,
|
||||||
|
html?: string,
|
||||||
mailbox: string,
|
mailbox: string,
|
||||||
modseq: BigInt,
|
modseq: BigInt,
|
||||||
thread?: string,
|
thread?: string,
|
||||||
@ -54,6 +55,7 @@ export type ThreadComment = {
|
|||||||
date: Date,
|
date: Date,
|
||||||
subject: string,
|
subject: string,
|
||||||
text: string,
|
text: string,
|
||||||
|
rendered: string,
|
||||||
}
|
}
|
||||||
|
|
||||||
export type ThreadData = {
|
export type ThreadData = {
|
||||||
|
Loading…
Reference in New Issue
Block a user