From d523c3705730a0899c5a0a8926bb560f64c149c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9gis=20Hanol?= Date: Tue, 31 Dec 2024 15:29:36 +0100 Subject: [PATCH] FIX: correctly extract body and/or reply from exchange emails (#30512) When receiving emails sent with Exchange, we look for some markers to identify the body of the mail and the reply (aka. previous email). For some reasons, those markers aren't 100% reliable and sometimes, only one of them is present. The commit 20ba54d53630b57c25fa3f325b0f219581314936 introduced the bug because the `HTML_EXTRACTERS` regex for exchange looks for either `messageBodySection` or `messageReplySection` but we were only using the `reply` section. So if an email had only the `body` section, it would not be correctly extracted. This commit handle the cases where either one of them is missing and use the other one as the actual "reply". When both are present, it correctly elides the "reply" section. --- lib/email/receiver.rb | 19 +++++++-- spec/fixtures/emails/exchange_html_body.eml | 14 +++++++ .../emails/exchange_html_body_and_reply.eml | 17 ++++++++ spec/fixtures/emails/exchange_html_reply.eml | 14 +++++++ spec/lib/email/receiver_spec.rb | 40 +++++++++++++++++-- 5 files changed, 97 insertions(+), 7 deletions(-) create mode 100644 spec/fixtures/emails/exchange_html_body.eml create mode 100644 spec/fixtures/emails/exchange_html_body_and_reply.eml create mode 100644 spec/fixtures/emails/exchange_html_reply.eml diff --git a/lib/email/receiver.rb b/lib/email/receiver.rb index fce8c76b82e..c3fb865acfb 100644 --- a/lib/email/receiver.rb +++ b/lib/email/receiver.rb @@ -564,10 +564,21 @@ module Email end def extract_from_exchange(doc) - # Exchange is using the 'messageReplySection' class for forwarded emails - # And 'messageBodySection' for the actual email - elided = doc.css("div[name='messageReplySection']").remove - to_markdown(doc.css("div[name='messageReplySection']").to_html, elided.to_html) + # Exchange is using 'messageReplySection' for forwarded emails and 'messageBodySection' for the actual email + reply = doc.css("div[name='messageReplySection']") + body = doc.css("div[name='messageBodySection']") + + if reply.present? && body.present? + elided = doc.css("div[name='messageReplySection']").remove + body = doc.css("div[name='messageBodySection']") + to_markdown(body.to_html, elided.to_html) + elsif reply.present? + to_markdown(reply.to_html, "") + elsif body.present? + to_markdown(body.to_html, "") + else + to_markdown(doc.to_html, "") + end end def extract_from_apple_mail(doc) diff --git a/spec/fixtures/emails/exchange_html_body.eml b/spec/fixtures/emails/exchange_html_body.eml new file mode 100644 index 00000000000..0177c632592 --- /dev/null +++ b/spec/fixtures/emails/exchange_html_body.eml @@ -0,0 +1,14 @@ +Return-Path: +From: Foo Bar +To: alt+4f97315cc828096c9cb34c6f1a0d6fe8@bar.com +Date: Fri, 15 Jan 2017 00:12:43 +0100 +Message-ID: <180@foo.bar.mail> +Mime-Version: 1.0 +Content-Type: text/html; charset=UTF-8 +Content-Transfer-Encoding: quoted-printable + +
+
+

This is the body of the email.

+
+
diff --git a/spec/fixtures/emails/exchange_html_body_and_reply.eml b/spec/fixtures/emails/exchange_html_body_and_reply.eml new file mode 100644 index 00000000000..00a423a7f92 --- /dev/null +++ b/spec/fixtures/emails/exchange_html_body_and_reply.eml @@ -0,0 +1,17 @@ +Return-Path: +From: Foo Bar +To: alt+4f97315cc828096c9cb34c6f1a0d6fe8@bar.com +Date: Fri, 15 Jan 2017 00:12:43 +0100 +Message-ID: <180@foo.bar.mail> +Mime-Version: 1.0 +Content-Type: text/html; charset=UTF-8 +Content-Transfer-Encoding: quoted-printable + +
+
+

This is the body of the email.

+
+
+

This is the reply!

+
+
diff --git a/spec/fixtures/emails/exchange_html_reply.eml b/spec/fixtures/emails/exchange_html_reply.eml new file mode 100644 index 00000000000..1cb1751834e --- /dev/null +++ b/spec/fixtures/emails/exchange_html_reply.eml @@ -0,0 +1,14 @@ +Return-Path: +From: Foo Bar +To: alt+4f97315cc828096c9cb34c6f1a0d6fe8@bar.com +Date: Fri, 15 Jan 2017 00:12:43 +0100 +Message-ID: <180@foo.bar.mail> +Mime-Version: 1.0 +Content-Type: text/html; charset=UTF-8 +Content-Transfer-Encoding: quoted-printable + +
+
+

This is the body !! of the email.

+
+
diff --git a/spec/lib/email/receiver_spec.rb b/spec/lib/email/receiver_spec.rb index bb5d553d02d..e6f2a88c144 100644 --- a/spec/lib/email/receiver_spec.rb +++ b/spec/lib/email/receiver_spec.rb @@ -415,9 +415,43 @@ RSpec.describe Email::Receiver do it "automatically elides gmail quotes" do SiteSetting.always_show_trimmed_content = true expect { process(:gmail_html_reply) }.to change { topic.posts.count } - expect(topic.posts.last.raw).to eq( - "This is a **GMAIL** reply ;)\n\n
\n···\n\nThis is the *elided* part!\n\n
", - ) + expect(topic.posts.last.raw).to eq <<~MD.strip + This is a **GMAIL** reply ;) + +
+ ··· + + This is the *elided* part! + +
+ MD + end + + it "correctly extracts body from exchange emails" do + SiteSetting.always_show_trimmed_content = true + expect { process(:exchange_html_body) }.to change { topic.posts.count } + expect(topic.posts.last.raw).to eq("This is the **body** of the email.") + end + + it "correctly extracts reply from exchange emails" do + SiteSetting.always_show_trimmed_content = true + expect { process(:exchange_html_reply) }.to change { topic.posts.count } + expect(topic.posts.last.raw).to eq("This is the **body !!** of the email.") + end + + it "correctly extracts body & reply from exchange emails" do + SiteSetting.always_show_trimmed_content = true + expect { process(:exchange_html_body_and_reply) }.to change { topic.posts.count } + expect(topic.posts.last.raw).to eq <<~MD.strip + This is the **body** of the email. + +
+ ··· + + This is the *reply*! + +
+ MD end it "doesn't process email with same message-id more than once" do