FEATURE: new 'trim_incoming_emails' site setting (#12874)

This setting allows admin to de/activate automatic trimming of incoming email.
There are instances where it does wonders in trimming all the garbage content and other
instances where it's so bad that it trims the most important part of the email.

FIX: don't remove hidden content using the style attribute when converting HTML to Markdown.
The regexp used was doing more harm than good. It was way too broad.

FIX: properly elide signatures from emails sent with Front App.
This is fairly safe as Front App nicely identifies signatures in the HTML part.
This commit is contained in:
Régis Hanol 2021-04-28 17:08:48 +02:00 committed by GitHub
parent 548c044809
commit cd93d1b5f7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 32 additions and 23 deletions

View File

@ -1995,6 +1995,7 @@ en:
forwarded_emails_behaviour: "How to treat a forwarded email to Discourse"
always_show_trimmed_content: "Always show trimmed part of incoming emails. WARNING: might reveal email addresses."
trim_incoming_emails: "Trim part of the incoming emails that isn't relevant."
private_email: "Don't include content from posts or topics in email title or email body. NOTE: also disables digest emails."
email_total_attachment_size_limit_kb: "Max total size of files attached to outgoing emails in kB. Set to 0 to disable sending of attachments."
post_excerpts_in_emails: "In notification emails, always send excerpts instead of full posts"

View File

@ -1190,6 +1190,7 @@ email:
- quote
- create_replies
always_show_trimmed_content: false
trim_incoming_emails: true
private_email: false
email_custom_template:
default: ""

View File

@ -63,24 +63,27 @@ module Email
def process!
return if is_blocked?
id_hash = Digest::SHA1.hexdigest(@message_id)
DistributedMutex.synchronize("process_email_#{id_hash}") do
begin
# If we find an existing incoming email record with the exact same
# message_id do not create a new IncomingEmail record to avoid double
# ups.
@incoming_email = find_existing_and_update_imap
return if @incoming_email
# If we find an existing incoming email record with the exact same `message_id`
# do not create a new `IncomingEmail` record to avoid double ups.
return if @incoming_email = find_existing_and_update_imap
ensure_valid_address_lists
ensure_valid_date
@from_email, @from_display_name = parse_from_field
@from_user = User.find_by_email(@from_email)
@incoming_email = create_incoming_email
post = process_internal
raise BouncedEmailError if is_bounce?
return post
post
rescue Exception => e
error = e.to_s
error = e.class.name if error.blank?
@ -92,13 +95,10 @@ module Email
end
def find_existing_and_update_imap
incoming_email = IncomingEmail.find_by(message_id: @message_id)
return if !incoming_email
return unless incoming_email = IncomingEmail.find_by(message_id: @message_id)
# If we are not doing this for IMAP purposes just return the record.
if @opts[:imap_uid].blank?
return incoming_email
end
return incoming_email if @opts[:imap_uid].blank?
# If the message_id matches the post id regexp then we
# generated the message_id not the imap server, e.g. in GroupSmtpEmail,
@ -117,6 +117,7 @@ module Email
imap_group_id: @opts[:imap_group_id],
imap_sync: false
)
incoming_email
end
@ -440,6 +441,7 @@ module Email
[:protonmail, /class="protonmail_/],
[:zimbra, /data-marker="__/],
[:newton, /(id|class)="cm_/],
[:front, /class="front-/],
]
def extract_from_gmail(doc)
@ -501,8 +503,14 @@ module Email
to_markdown(doc.to_html, elided.to_html)
end
def extract_from_front(doc)
# Removes anything that has a class starting with 'front-'
elided = doc.css("*[class^='front-']").remove
to_markdown(doc.to_html, elided.to_html)
end
def trim_reply_and_extract_elided(text)
return [text, ""] if @opts[:skip_trimming]
return [text, ""] if @opts[:skip_trimming] || !SiteSetting.trim_incoming_emails
EmailReplyTrimmer.trim(text, true)
end
@ -517,8 +525,7 @@ module Email
encodings = COMMON_ENCODINGS.dup
encodings.unshift(mail_part.charset) if mail_part.charset.present?
# mail (>=2.5) decodes mails with 8bit transfer encoding to utf-8, so
# always try UTF-8 first
# mail (>=2.5) decodes mails with 8bit transfer encoding to utf-8, so always try UTF-8 first
if mail_part.content_transfer_encoding == "8bit"
encodings.delete("UTF-8")
encodings.unshift("UTF-8")

View File

@ -37,11 +37,8 @@ class HtmlToMarkdown
@doc.traverse { |node| node.remove if !allowed.include?(node.name) }
end
HIDDEN_STYLES ||= /(display\s*:\s*none)|(visibility\s*:\s*hidden)|(opacity\s*:\s*0)|(transform\s*:\s*scale\(0\))|((width|height)\s*:\s*0)/i
def remove_hidden!(doc)
@doc.css("[hidden]").remove
@doc.css("[style]").each { |n| n.remove if n["style"][HIDDEN_STYLES] }
@doc.css("img[width]").each { |n| n.remove if n["width"].to_i <= 0 }
@doc.css("img[height]").each { |n| n.remove if n["height"].to_i <= 0 }
end

View File

@ -530,16 +530,22 @@ describe Email::Receiver do
it "doesn't include the 'elided' part of the original message when always_show_trimmed_content is disabled" do
SiteSetting.always_show_trimmed_content = false
expect { process(:original_message) }.to change { topic.posts.count }.from(1).to(2)
expect { process(:original_message) }.to change { topic.posts.count }
expect(topic.posts.last.raw).to eq("This is a reply :)")
end
it "adds the 'elided' part of the original message for public replies when always_show_trimmed_content is enabled" do
SiteSetting.always_show_trimmed_content = true
expect { process(:original_message) }.to change { topic.posts.count }.from(1).to(2)
expect { process(:original_message) }.to change { topic.posts.count }
expect(topic.posts.last.raw).to eq("This is a reply :)\n\n<details class='elided'>\n<summary title='Show trimmed content'>&#183;&#183;&#183;</summary>\n\n---Original Message---\nThis part should not be included\n\n</details>")
end
it "doesn't trim the message when trim_incoming_emails is disabled" do
SiteSetting.trim_incoming_emails = false
expect { process(:original_message) }.to change { topic.posts.count }
expect(topic.posts.last.raw).to eq("This is a reply :)\n\n---Original Message---\nThis part should not be included")
end
it "supports attached images in TEXT part" do
SiteSetting.incoming_email_prefer_html = false

View File

@ -83,7 +83,6 @@ describe HtmlToMarkdown do
end
it "skips hidden tags" do
expect(html_to_markdown(%Q{<p>Hello <span style="display: none">cruel </span>World!</p>})).to eq("Hello World!")
expect(html_to_markdown(%Q{<p>Hello <span hidden>cruel </span>World!</p>})).to eq("Hello World!")
end
@ -157,8 +156,6 @@ describe HtmlToMarkdown do
it "skips hidden <img>" do
expect(html_to_markdown(%Q{<img src="https://www.discourse.org/logo.svg" width=0>})).to eq("")
expect(html_to_markdown(%Q{<img src="https://www.discourse.org/logo.svg" height="0">})).to eq("")
expect(html_to_markdown(%Q{<img src="https://www.discourse.org/logo.svg" style="width: 0">})).to eq("")
expect(html_to_markdown(%Q{<img src="https://www.discourse.org/logo.svg" style="height:0px">})).to eq("")
end
it "supports width/height on <img>" do