FEATURE: new gem discourse_email_parser

This commit is contained in:
Arpit Jalan 2015-12-10 13:28:52 +05:30
parent 7a8e5a50ff
commit 553248de86
4 changed files with 5 additions and 280 deletions

View File

@ -47,6 +47,8 @@ gem 'aws-sdk', require: false
gem 'excon', require: false
gem 'unf', require: false
gem 'discourse_email_parser'
# note: for image_optim to correctly work you need to follow
# https://github.com/toy/image_optim
# pinned due to https://github.com/toy/image_optim/pull/75, docker image must be upgraded to upgrade

View File

@ -113,6 +113,7 @@ GEM
diff-lcs (1.2.5)
discourse-qunit-rails (0.0.8)
railties
discourse_email_parser (0.6.1)
docile (1.1.5)
dotenv (2.0.2)
ember-data-source (1.0.0.beta.16.1)
@ -450,6 +451,7 @@ DEPENDENCIES
byebug
certified
discourse-qunit-rails
discourse_email_parser
ember-rails
ember-source (= 1.12.1)
excon

View File

@ -1,278 +0,0 @@
require 'strscan'
# https://github.com/github/email_reply_parser/blob/master/lib/email_reply_parser.rb
#
# EmailReplyParser is a small library to parse plain text email content. The
# goal is to identify which fragments are quoted, part of a signature, or
# original body content. We want to support both top and bottom posters, so
# no simple "REPLY ABOVE HERE" content is used.
#
# Beyond RFC 5322 (which is handled by the [Ruby mail gem][mail]), there aren't
# any real standards for how emails are created. This attempts to parse out
# common conventions for things like replies:
#
# this is some text
#
# On <date>, <author> wrote:
# > blah blah
# > blah blah
#
# ... and signatures:
#
# this is some text
#
# --
# Bob
# http://homepage.com/~bob
#
# Each of these is parsed into a Fragment object.
#
# EmailReplyParser also attempts to figure out which of these blocks should
# be hidden from users.
#
# [mail]: https://github.com/mikel/mail
class EmailReplyParser
  # Public: Splits an email body into a list of Fragments.
  #
  # text - A String email body.
  #
  # Returns an Email instance.
  def self.read(text)
    Email.new.read(text)
  end

  # Public: Get the text of the visible portions of the given email body.
  #
  # text - A String email body.
  #
  # Returns a String.
  def self.parse_reply(text)
    read(text).visible_text
  end

  ### Emails

  # An Email instance represents a parsed body String.
  class Email
    # Emails have an Array of Fragments.
    attr_reader :fragments

    def initialize
      @fragments = []
    end

    # Public: Gets the combined text of the visible fragments of the email body.
    #
    # Returns a String with trailing whitespace removed.
    def visible_text
      fragments.reject(&:hidden?).map(&:to_s).join("\n").rstrip
    end

    # Splits the given text into a list of Fragments. This is roughly done by
    # reversing the text and parsing from the bottom to the top. This way we
    # can check for 'On <date>, <author> wrote:' lines above quoted blocks.
    #
    # Calling this method again on the same instance replaces the previously
    # parsed fragments instead of appending to them.
    #
    # text - A String email body. The caller's String is not modified when it
    #        responds to force_encoding (a binary-encoded copy is used).
    #
    # Returns this same Email instance.
    def read(text)
      # Start from a clean slate so that re-using an Email instance does not
      # mix (and mis-order) fragments left over from an earlier call.
      @fragments = []

      # in 1.9 we want to operate on the raw bytes
      text = text.dup.force_encoding('binary') if text.respond_to?(:force_encoding)

      # Normalize line endings.
      text.gsub!("\r\n", "\n")

      # Check for "On DATE, NAME <EMAIL> wrote:"
      # or "---- Original Message ----" and strip
      # email content after that part. The /m flag lets `.` span newlines, so
      # the capture runs from the marker line to the end of the text.
      if text =~ /^(On\s.+wrote:.*)$/nm || text =~ /^([\s_-]+Original (?i)message?[\s_-]+$.*)/nm
        text.gsub!($1, "")
      end

      # Some users may reply directly above a line of underscores.
      # In order to ensure that these fragments are split correctly,
      # make sure that all lines of underscores are preceded by
      # at least two newline characters.
      text.gsub!(/([^\n])(?=\n_{7}_+)$/m, "\\1\n")

      # The text is reversed initially due to the way we check for hidden
      # fragments.
      text = text.reverse

      # This determines if any 'visible' Fragment has been found. Once any
      # visible Fragment is found, stop looking for hidden ones.
      @found_visible = false

      # This instance variable points to the current Fragment. If the matched
      # line fits, it should be added to this Fragment. Otherwise, finish it
      # and start a new Fragment.
      @fragment = nil

      # Use the StringScanner to pull out each line of the email content.
      @scanner = StringScanner.new(text)
      while (line = @scanner.scan_until(/\n/n))
        scan_line(line)
      end

      # Be sure to parse the last line of the email.
      last_line = @scanner.rest.to_s
      scan_line(last_line) unless last_line.empty?

      # Finish up the final fragment. Finishing a fragment will detect any
      # attributes (hidden, signature, reply), and join each line into a
      # string.
      finish_fragment

      @scanner = @fragment = nil

      # Now that parsing is done, reverse the order.
      @fragments.reverse!

      self
    end

    private

    EMPTY = "".freeze

    # Signature markers, written against *reversed* lines: a line ending in
    # "--" or "__", or a reversed "Sent from my <up to three words>" line.
    SIGNATURE = '(?m)(--\s*$|__\s*$)|(^(\w+\s*){1,3} ym morf tneS$)'

    # Prefer the optional re2 library when it can be loaded; otherwise fall
    # back to Ruby's built-in Regexp.
    SIG_REGEX = begin
      require 're2'
      RE2::Regexp.new(SIGNATURE)
    rescue LoadError
      Regexp.new(SIGNATURE)
    end

    ### Line-by-Line Parsing

    # Scans the given line of text and figures out which fragment it belongs
    # to.
    #
    # line - A String line of text from the (reversed) email. It is modified
    #        in place (trailing newline and leading whitespace are removed).
    #
    # Returns nothing.
    def scan_line(line)
      line.chomp!("\n")
      # Signature markers keep their leading whitespace (which is trailing
      # whitespace in the original, un-reversed line).
      line.lstrip! unless SIG_REGEX.match(line)

      # We're looking for leading `>`'s to see if this line is part of a
      # quoted Fragment. The line is reversed, so they appear at the end.
      is_quoted = !!(line =~ /(>+)$/n)

      # Mark the current Fragment as a signature if the current line is empty
      # and the Fragment starts with a common signature indicator.
      if @fragment && line == EMPTY && SIG_REGEX.match(@fragment.lines.last)
        @fragment.signature = true
        finish_fragment
      end

      # If the line matches the current fragment, add it. Note that a common
      # reply header also counts as part of the quoted Fragment, even though
      # it doesn't start with `>`.
      if @fragment &&
         ((@fragment.quoted? == is_quoted) ||
          (@fragment.quoted? && (quote_header?(line) || line == EMPTY)))
        @fragment.lines << line
      # Otherwise, finish the fragment and start a new one.
      else
        finish_fragment
        @fragment = Fragment.new(is_quoted, line)
      end
    end

    # Detects if a given line is a header above a quoted area. It is only
    # checked for lines preceding quoted regions.
    #
    # line - A String line of text from the (reversed) email.
    #
    # Returns true if the line is a valid header, or false.
    def quote_header?(line)
      !!(line =~ /^:etorw.*nO$/n)
    end

    # Builds the fragment string and reverses it, after all lines have been
    # added. It also checks to see if this Fragment is hidden. The hidden
    # Fragment check reads from the bottom to the top.
    #
    # Any quoted Fragments or signature Fragments are marked hidden if they
    # are below any visible Fragments. Visible Fragments are expected to
    # contain original content by the author. If they are below a quoted
    # Fragment, then the Fragment should be visible to give context to the
    # reply.
    #
    #     some original text (visible)
    #
    #     > do you have any two's? (quoted, visible)
    #
    #     Go fish! (visible)
    #
    #     > --
    #     > Player 1 (quoted, hidden)
    #
    #     --
    #     Player 2 (signature, hidden)
    #
    def finish_fragment
      if @fragment
        @fragment.finish
        unless @found_visible
          if @fragment.quoted? || @fragment.signature? ||
             @fragment.to_s.strip == EMPTY
            @fragment.hidden = true
          else
            @found_visible = true
          end
        end
        @fragments << @fragment
      end
      @fragment = nil
    end
  end

  ### Fragments

  # Represents a group of paragraphs in the email sharing common attributes.
  # Paragraphs should get their own fragment if they are a quoted area or a
  # signature.
  class Fragment < Struct.new(:quoted, :signature, :hidden)
    # This is an Array of String lines of content. Since the content is
    # reversed, this array is backwards, and contains reversed strings.
    # It becomes nil once the Fragment is finished.
    attr_reader :lines

    # This is reserved for the joined String that is built when this Fragment
    # is finished. It is nil until then.
    attr_reader :content

    # quoted     - Boolean flag stored as the quoted attribute.
    # first_line - The first (reversed) String line, or nil for no lines yet.
    def initialize(quoted, first_line)
      self.signature = self.hidden = false
      self.quoted = quoted
      @lines = [first_line]
      @content = nil
      @lines.compact!
    end

    alias quoted? quoted
    alias signature? signature
    alias hidden? hidden

    # Builds the string content by joining the lines and reversing them.
    #
    # Returns nothing.
    def finish
      @content = @lines.join("\n")
      @lines = nil
      @content.reverse!
    end

    # Returns the finished content String (nil before #finish is called).
    def to_s
      @content
    end

    def inspect
      to_s.inspect
    end
  end
end

View File

@ -1,6 +1,5 @@
require_dependency 'new_post_manager'
require_dependency 'email/html_cleaner'
require_dependency 'email/email_reply_parser'
module Email
@ -136,7 +135,7 @@ module Email
body = discourse_email_trimmer body
raise EmptyEmailError if body.strip.blank?
body = EmailReplyParser.parse_reply body
body = DiscourseEmailParser.parse_reply body
raise EmptyEmailError if body.strip.blank?
body.force_encoding(encoding).encode("UTF-8")