FEATURE: better email reply parsing

This commit is contained in:
Arpit Jalan 2015-12-09 23:14:01 +05:30
parent c447900e3b
commit 7a8e5a50ff
7 changed files with 346 additions and 51 deletions

View File

@ -47,8 +47,6 @@ gem 'aws-sdk', require: false
gem 'excon', require: false
gem 'unf', require: false
gem 'email_reply_parser'
# note: for image_optim to correctly work you need to follow
# https://github.com/toy/image_optim
# pinned due to https://github.com/toy/image_optim/pull/75, docker image must be upgraded to upgrade

View File

@ -115,7 +115,6 @@ GEM
railties
docile (1.1.5)
dotenv (2.0.2)
email_reply_parser (0.5.8)
ember-data-source (1.0.0.beta.16.1)
ember-source (~> 1.8)
ember-handlebars-template (0.1.5)
@ -451,7 +450,6 @@ DEPENDENCIES
byebug
certified
discourse-qunit-rails
email_reply_parser
ember-rails
ember-source (= 1.12.1)
excon

View File

@ -0,0 +1,278 @@
require 'strscan'
# https://github.com/github/email_reply_parser/blob/master/lib/email_reply_parser.rb
#
# EmailReplyParser is a small library to parse plain text email content. The
# goal is to identify which fragments are quoted, part of a signature, or
# original body content. We want to support both top and bottom posters, so
# no simple "REPLY ABOVE HERE" content is used.
#
# Beyond RFC 5322 (which is handled by the [Ruby mail gem][mail]), there aren't
# any real standards for how emails are created. This attempts to parse out
# common conventions for things like replies:
#
# this is some text
#
# On <date>, <author> wrote:
# > blah blah
# > blah blah
#
# ... and signatures:
#
# this is some text
#
# --
# Bob
# http://homepage.com/~bob
#
# Each of these is parsed into a Fragment object.
#
# EmailReplyParser also attempts to figure out which of these blocks should
# be hidden from users.
#
# [mail]: https://github.com/mikel/mail
class EmailReplyParser
  # Public: Splits an email body into a list of Fragments.
  #
  # text - A String email body.
  #
  # Returns an Email instance.
  def self.read(text)
    Email.new.read(text)
  end

  # Public: Get the text of the visible portions of the given email body.
  #
  # text - A String email body.
  #
  # Returns a String.
  def self.parse_reply(text)
    read(text).visible_text
  end

  ### Emails

  # An Email instance represents a parsed body String.
  class Email
    # Emails have an Array of Fragments.
    attr_reader :fragments

    def initialize
      @fragments = []
    end

    # Public: Gets the combined text of the visible fragments of the email
    # body. The result carries the encoding of the fragments it is built
    # from, i.e. the encoding of the String that was given to #read.
    #
    # Returns a String.
    def visible_text
      text = fragments.reject(&:hidden?).map(&:to_s).join("\n")
      encoding = text.encoding
      # Trailing whitespace is trimmed on the raw bytes (as the parsing itself
      # is), then the encoding tag is put back on the trimmed copy.
      text.force_encoding('binary').rstrip.force_encoding(encoding)
    end

    # Splits the given text into a list of Fragments. This is roughly done by
    # reversing the text and parsing from the bottom to the top. This way we
    # can check for 'On <date>, <author> wrote:' lines above quoted blocks.
    #
    # text - A String email body. It is duplicated before being modified.
    #
    # Returns this same Email instance.
    def read(text)
      # Remember the caller's encoding: parsing works on raw bytes, and each
      # finished Fragment is tagged with this encoding again.
      @encoding = text.encoding if text.respond_to?(:encoding)

      # in 1.9 we want to operate on the raw bytes
      text = text.dup.force_encoding('binary') if text.respond_to?(:force_encoding)

      # Normalize line endings.
      text.gsub!("\r\n", "\n")

      # Check for "On DATE, NAME <EMAIL> wrote:"
      # or "---- Original Message ----" and strip
      # email content after that part. Both patterns consume the rest of the
      # body, so cutting at the start of the match removes exactly that tail.
      context = REPLY_HEADER_REGEX.match(text) || ORIGINAL_MESSAGE_REGEX.match(text)
      text.slice!(context.begin(1)..-1) if context

      # Some users may reply directly above a line of underscores.
      # In order to ensure that these fragments are split correctly,
      # make sure that all lines of underscores are preceded by
      # at least two newline characters.
      text.gsub!(/([^\n])(?=\n_{7}_+)$/m, "\\1\n")

      # The text is reversed initially due to the way we check for hidden
      # fragments.
      text = text.reverse

      # This determines if any 'visible' Fragment has been found. Once any
      # visible Fragment is found, stop looking for hidden ones.
      @found_visible = false

      # This instance variable points to the current Fragment. If the matched
      # line fits, it should be added to this Fragment. Otherwise, finish it
      # and start a new Fragment.
      @fragment = nil

      # Use the StringScanner to pull out each line of the email content.
      @scanner = StringScanner.new(text)
      while (line = @scanner.scan_until(/\n/n))
        scan_line(line)
      end

      # Be sure to parse the last line of the email.
      last_line = @scanner.rest.to_s
      scan_line(last_line) unless last_line.empty?

      # Finish up the final fragment. Finishing a fragment will detect any
      # attributes (hidden, signature, reply), and join each line into a
      # string.
      finish_fragment

      @scanner = @fragment = nil

      # Now that parsing is done, reverse the order.
      @fragments.reverse!

      self
    end

    private

    EMPTY = "".freeze

    # Matched against the *reversed* lines, hence the reversed
    # "Sent from my ..." literal.
    SIGNATURE = '(?m)(--\s*$|__\s*$)|(^(\w+\s*){1,3} ym morf tneS$)'

    # Prefer the re2 engine when the gem is installed; otherwise fall back to
    # Ruby's built-in Regexp.
    begin
      require 're2'
      SIG_REGEX = RE2::Regexp.new(SIGNATURE)
    rescue LoadError
      SIG_REGEX = Regexp.new(SIGNATURE)
    end

    # Matches from a line starting with "On " up to a later "wrote:" and on to
    # the end of the body (`.` also matches newlines because of the m flag).
    REPLY_HEADER_REGEX = /^(On\s.+wrote:.*)$/nm

    # Matches an "Original Message" separator line (surrounded by whitespace,
    # underscores or dashes) and everything after it.
    ORIGINAL_MESSAGE_REGEX = /^([\s_-]+Original (?i)message?[\s_-]+$.*)/nm

    ### Line-by-Line Parsing

    # Scans the given line of text and figures out which fragment it belongs
    # to.
    #
    # line - A String line of text from the (reversed) email. It is modified
    #        in place.
    #
    # Returns nothing.
    def scan_line(line)
      line.chomp!("\n")
      # Signature markers keep their whitespace so they still match later on.
      line.lstrip! unless SIG_REGEX.match(line)

      # We're looking for leading `>`'s to see if this line is part of a
      # quoted Fragment. The line is reversed, so they show up at its end.
      is_quoted = !!(line =~ /(>+)$/n)

      # Mark the current Fragment as a signature if the current line is empty
      # and the Fragment starts with a common signature indicator.
      if @fragment && line == EMPTY && SIG_REGEX.match(@fragment.lines.last)
        @fragment.signature = true
        finish_fragment
      end

      # If the line matches the current fragment, add it. Note that a common
      # reply header also counts as part of the quoted Fragment, even though
      # it doesn't start with `>`.
      if @fragment &&
         ((@fragment.quoted? == is_quoted) ||
          (@fragment.quoted? && (quote_header?(line) || line == EMPTY)))
        @fragment.lines << line
      # Otherwise, finish the fragment and start a new one.
      else
        finish_fragment
        @fragment = Fragment.new(is_quoted, line)
      end
    end

    # Detects if a given line is a header above a quoted area. It is only
    # checked for lines preceding quoted regions.
    #
    # line - A String line of text from the (reversed) email.
    #
    # Returns true if the line is a valid header, or false.
    def quote_header?(line)
      !!(line =~ /^:etorw.*nO$/n)
    end

    # Builds the fragment string and reverses it, after all lines have been
    # added. It also checks to see if this Fragment is hidden. The hidden
    # Fragment check reads from the bottom to the top.
    #
    # Any quoted Fragments or signature Fragments are marked hidden if they
    # are below any visible Fragments. Visible Fragments are expected to
    # contain original content by the author. If they are below a quoted
    # Fragment, then the Fragment should be visible to give context to the
    # reply.
    #
    #     some original text (visible)
    #
    #     > do you have any two's? (quoted, visible)
    #
    #     Go fish! (visible)
    #
    #     > --
    #     > Player 1 (quoted, hidden)
    #
    #     --
    #     Player 2 (signature, hidden)
    #
    # Returns nothing.
    def finish_fragment
      if @fragment
        @fragment.finish

        unless @found_visible
          if @fragment.quoted? || @fragment.signature? ||
             @fragment.to_s.strip == EMPTY
            @fragment.hidden = true
          else
            @found_visible = true
          end
        end

        # The byte-wise checks are done; hand the content back in the encoding
        # of the String that was passed to #read instead of leaving it binary.
        @fragment.content.force_encoding(@encoding) if @encoding

        @fragments << @fragment
      end

      @fragment = nil
    end
  end

  ### Fragments

  # Represents a group of paragraphs in the email sharing common attributes.
  # Paragraphs should get their own fragment if they are a quoted area or a
  # signature.
  class Fragment < Struct.new(:quoted, :signature, :hidden)
    # lines   - An Array of String lines of content. Since the content is
    #           reversed, this array is backwards, and contains reversed
    #           strings. It is nil once the Fragment is finished.
    # content - Reserved for the joined String that is built when this
    #           Fragment is finished; nil until then.
    attr_reader :lines, :content

    # quoted     - Boolean, whether the fragment is a quoted block.
    # first_line - The first (reversed) String line, or nil for none.
    def initialize(quoted, first_line)
      self.signature = self.hidden = false
      self.quoted = quoted
      @lines = [first_line]
      @content = nil
      @lines.compact!
    end

    alias quoted? quoted
    alias signature? signature
    alias hidden? hidden

    # Builds the string content by joining the lines and reversing them.
    #
    # Returns nothing.
    def finish
      @content = @lines.join("\n")
      @lines = nil
      @content.reverse!
    end

    def to_s
      @content
    end

    def inspect
      to_s.inspect
    end
  end
end

View File

@ -1,5 +1,6 @@
require_dependency 'new_post_manager'
require_dependency 'email/html_cleaner'
require_dependency 'email/email_reply_parser'
module Email

View File

@ -100,53 +100,6 @@ It will also be my *only* reply."
)
end
it "handles inline reply" do
# The parsed body of the inline_reply.eml fixture is expected to equal the
# multi-line string below exactly: the "On ... wrote:" header, the whole
# `>`-quoted block, and then the unquoted paragraph that follows it.
expect(test_parse_body(fixture_file("emails/inline_reply.eml"))).
to eq(
"On Wed, Oct 8, 2014 at 11:12 AM, techAPJ <info@unconfigured.discourse.org> wrote:
> techAPJ <https://meta.discourse.org/users/techapj>
> November 28
>
> Test reply.
>
> First paragraph.
>
> Second paragraph.
>
> To respond, reply to this email or visit
> https://meta.discourse.org/t/testing-default-email-replies/22638/3 in
> your browser.
> ------------------------------
> Previous Replies codinghorror
> <https://meta.discourse.org/users/codinghorror>
> November 28
>
> We're testing the latest GitHub email processing library which we are
> integrating now.
>
> https://github.com/github/email_reply_parser
>
> Go ahead and reply to this topic and I'll reply from various email clients
> for testing.
> ------------------------------
>
> To respond, reply to this email or visit
> https://meta.discourse.org/t/testing-default-email-replies/22638/3 in
> your browser.
>
> To unsubscribe from these emails, visit your user preferences
> <https://meta.discourse.org/my/preferences>.
>
The quick brown fox jumps over the lazy dog. The quick brown fox jumps over
the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown
fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog.
The quick brown fox jumps over the lazy dog. The quick brown fox jumps over
the lazy dog. The quick brown fox jumps over the lazy dog."
)
end
it "can retrieve the first part of multiple replies" do
expect(test_parse_body(fixture_file("emails/inline_mixed.eml"))).to eq(
"The quick brown fox jumps over the lazy dog. The quick brown fox jumps over
@ -173,6 +126,14 @@ the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown"
expect(test_parse_body(fixture_file("emails/iphone_signature.eml"))).not_to match(/Sent from my iPhone/)
end
# The parsed body of the signature.eml fixture must not contain the text
# "Arpit" anywhere.
it "strips regular signature" do
expect(test_parse_body(fixture_file("emails/signature.eml"))).not_to match(/Arpit/)
end
# The parsed body of the original_message_context.eml fixture must not contain
# the text "Context" anywhere.
it "strips 'original message' context" do
expect(test_parse_body(fixture_file("emails/original_message_context.eml"))).not_to match(/Context/)
end
it "properly renders email reply from gmail web client" do
expect(test_parse_body(fixture_file("emails/gmail_web.eml"))).
to eq(

View File

@ -0,0 +1,30 @@
Delivered-To: test@mail.com
Return-Path: <walter.white@googlemail.com>
From: Walter White <walter.white@googlemail.com>
Content-Type: multipart/alternative;
boundary=Apple-Mail-8E182EEF-9DBC-41DE-A593-DF2E5EBD3975
Content-Transfer-Encoding: 7bit
Mime-Version: 1.0 (1.0)
Subject: Re: Signature in email replies!
Date: Thu, 23 Oct 2014 14:43:49 +0530
References: <1234@mail.gmail.com>
In-Reply-To: <1234@mail.gmail.com>
To: Arpit Jalan <test@mail.com>
X-Mailer: iPhone Mail (12A405)
--Apple-Mail-8E182EEF-9DBC-41DE-A593-DF2E5EBD3975
Content-Type: text/plain;
charset=us-ascii
Content-Transfer-Encoding: 7bit
This post should not include signature.
----Original Message----
Context here.
> On 23-Oct-2014, at 9:45 am, Arpit Jalan <test@mail.com> wrote:
>
> Signature in email replies!
--Apple-Mail-8E182EEF-9DBC-41DE-A593-DF2E5EBD3975

29
spec/fixtures/emails/signature.eml vendored Normal file
View File

@ -0,0 +1,29 @@
Delivered-To: test@mail.com
Return-Path: <walter.white@googlemail.com>
From: Walter White <walter.white@googlemail.com>
Content-Type: multipart/alternative;
boundary=Apple-Mail-8E182EEF-9DBC-41DE-A593-DF2E5EBD3975
Content-Transfer-Encoding: 7bit
Mime-Version: 1.0 (1.0)
Subject: Re: Signature in email replies!
Date: Thu, 23 Oct 2014 14:43:49 +0530
References: <1234@mail.gmail.com>
In-Reply-To: <1234@mail.gmail.com>
To: Arpit Jalan <test@mail.com>
X-Mailer: iPhone Mail (12A405)
--Apple-Mail-8E182EEF-9DBC-41DE-A593-DF2E5EBD3975
Content-Type: text/plain;
charset=us-ascii
Content-Transfer-Encoding: 7bit
This post should not include signature.
----Arpit
> On 23-Oct-2014, at 9:45 am, Arpit Jalan <test@mail.com> wrote:
>
> Signature in email replies!
--Apple-Mail-8E182EEF-9DBC-41DE-A593-DF2E5EBD3975