From 553248de86dd971e16706fd788ed24488e034124 Mon Sep 17 00:00:00 2001 From: Arpit Jalan Date: Thu, 10 Dec 2015 13:28:52 +0530 Subject: [PATCH] FEATURE: new gem discourse_email_parser --- Gemfile | 2 + Gemfile.lock | 2 + lib/email/email_reply_parser.rb | 278 -------------------------------- lib/email/receiver.rb | 3 +- 4 files changed, 5 insertions(+), 280 deletions(-) delete mode 100644 lib/email/email_reply_parser.rb diff --git a/Gemfile b/Gemfile index da9298c3dd1..595932d342c 100644 --- a/Gemfile +++ b/Gemfile @@ -47,6 +47,8 @@ gem 'aws-sdk', require: false gem 'excon', require: false gem 'unf', require: false +gem 'discourse_email_parser' + # note: for image_optim to correctly work you need to follow # https://github.com/toy/image_optim # pinned due to https://github.com/toy/image_optim/pull/75, docker image must be upgraded to upgrade diff --git a/Gemfile.lock b/Gemfile.lock index 154d7060048..a1ffac6d8cf 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -113,6 +113,7 @@ GEM diff-lcs (1.2.5) discourse-qunit-rails (0.0.8) railties + discourse_email_parser (0.6.1) docile (1.1.5) dotenv (2.0.2) ember-data-source (1.0.0.beta.16.1) @@ -450,6 +451,7 @@ DEPENDENCIES byebug certified discourse-qunit-rails + discourse_email_parser ember-rails ember-source (= 1.12.1) excon diff --git a/lib/email/email_reply_parser.rb b/lib/email/email_reply_parser.rb deleted file mode 100644 index 14282cee53a..00000000000 --- a/lib/email/email_reply_parser.rb +++ /dev/null @@ -1,278 +0,0 @@ -require 'strscan' - -# https://github.com/github/email_reply_parser/blob/master/lib/email_reply_parser.rb -# -# EmailReplyParser is a small library to parse plain text email content. The -# goal is to identify which fragments are quoted, part of a signature, or -# original body content. We want to support both top and bottom posters, so -# no simple "REPLY ABOVE HERE" content is used. -# -# Beyond RFC 5322 (which is handled by the [Ruby mail gem][mail]), there aren't -# any real standards for how emails are created. This attempts to parse out -# common conventions for things like replies: -# -# this is some text -# -# On , wrote: -# > blah blah -# > blah blah -# -# ... and signatures: -# -# this is some text -# -# -- -# Bob -# http://homepage.com/~bob -# -# Each of these are parsed into Fragment objects. -# -# EmailReplyParser also attempts to figure out which of these blocks should -# be hidden from users. -# -# [mail]: https://github.com/mikel/mail -class EmailReplyParser - - # Public: Splits an email body into a list of Fragments. - # - # text - A String email body. - # - # Returns an Email instance. - def self.read(text) - Email.new.read(text) - end - - # Public: Get the text of the visible portions of the given email body. - # - # text - A String email body. - # - # Returns a String. - def self.parse_reply(text) - self.read(text).visible_text - end - - ### Emails - - # An Email instance represents a parsed body String. - class Email - # Emails have an Array of Fragments. - attr_reader :fragments - - def initialize - @fragments = [] - end - - # Public: Gets the combined text of the visible fragments of the email body. - # - # Returns a String. - def visible_text - fragments.select{|f| !f.hidden?}.map{|f| f.to_s}.join("\n").rstrip - end - - # Splits the given text into a list of Fragments. This is roughly done by - # reversing the text and parsing from the bottom to the top. This way we - # can check for 'On , wrote:' lines above quoted blocks. - # - # text - A String email body. - # - # Returns this same Email instance. - def read(text) - # in 1.9 we want to operate on the raw bytes - text = text.dup.force_encoding('binary') if text.respond_to?(:force_encoding) - - # Normalize line endings. - text.gsub!("\r\n", "\n") - - # Check for "On DATE, NAME wrote:" - # or "---- Original Message ----" and strip - # email content after that part - if text =~ /^(On\s.+wrote:.*)$/nm || text =~ /^([\s_-]+Original (?i)message?[\s_-]+$.*)/nm - text.gsub!($1, "") - end - - # Some users may reply directly above a line of underscores. - # In order to ensure that these fragments are split correctly, - # make sure that all lines of underscores are preceded by - # at least two newline characters. - text.gsub!(/([^\n])(?=\n_{7}_+)$/m, "\\1\n") - - # The text is reversed initially due to the way we check for hidden - # fragments. - text = text.reverse - - # This determines if any 'visible' Fragment has been found. Once any - # visible Fragment is found, stop looking for hidden ones. - @found_visible = false - - # This instance variable points to the current Fragment. If the matched - # line fits, it should be added to this Fragment. Otherwise, finish it - # and start a new Fragment. - @fragment = nil - - # Use the StringScanner to pull out each line of the email content. - @scanner = StringScanner.new(text) - while line = @scanner.scan_until(/\n/n) - scan_line(line) - end - - # Be sure to parse the last line of the email. - if (last_line = @scanner.rest.to_s).size > 0 - scan_line(last_line) - end - - # Finish up the final fragment. Finishing a fragment will detect any - # attributes (hidden, signature, reply), and join each line into a - # string. - finish_fragment - - @scanner = @fragment = nil - - # Now that parsing is done, reverse the order. - @fragments.reverse! - self - end - - private - EMPTY = "".freeze - SIGNATURE = '(?m)(--\s*$|__\s*$)|(^(\w+\s*){1,3} ym morf tneS$)' - - begin - require 're2' - SIG_REGEX = RE2::Regexp.new(SIGNATURE) - rescue LoadError - SIG_REGEX = Regexp.new(SIGNATURE) - end - - ### Line-by-Line Parsing - - # Scans the given line of text and figures out which fragment it belongs - # to. - # - # line - A String line of text from the email. - # - # Returns nothing. - def scan_line(line) - line.chomp!("\n") - line.lstrip! unless SIG_REGEX.match(line) - - # We're looking for leading `>`'s to see if this line is part of a - # quoted Fragment. - is_quoted = !!(line =~ /(>+)$/n) - - # Mark the current Fragment as a signature if the current line is empty - # and the Fragment starts with a common signature indicator. - if @fragment && line == EMPTY - if SIG_REGEX.match @fragment.lines.last - @fragment.signature = true - finish_fragment - end - end - - # If the line matches the current fragment, add it. Note that a common - # reply header also counts as part of the quoted Fragment, even though - # it doesn't start with `>`. - if @fragment && - ((@fragment.quoted? == is_quoted) || - (@fragment.quoted? && (quote_header?(line) || line == EMPTY))) - @fragment.lines << line - - # Otherwise, finish the fragment and start a new one. - else - finish_fragment - @fragment = Fragment.new(is_quoted, line) - end - end - - # Detects if a given line is a header above a quoted area. It is only - # checked for lines preceding quoted regions. - # - # line - A String line of text from the email. - # - # Returns true if the line is a valid header, or false. - def quote_header?(line) - line =~ /^:etorw.*nO$/n - end - - # Builds the fragment string and reverses it, after all lines have been - # added. It also checks to see if this Fragment is hidden. The hidden - # Fragment check reads from the bottom to the top. - # - # Any quoted Fragments or signature Fragments are marked hidden if they - # are below any visible Fragments. Visible Fragments are expected to - # contain original content by the author. If they are below a quoted - # Fragment, then the Fragment should be visible to give context to the - # reply. - # - # some original text (visible) - # - # > do you have any two's? (quoted, visible) - # - # Go fish! (visible) - # - # > -- - # > Player 1 (quoted, hidden) - # - # -- - # Player 2 (signature, hidden) - # - def finish_fragment - if @fragment - @fragment.finish - if !@found_visible - if @fragment.quoted? || @fragment.signature? || - @fragment.to_s.strip == EMPTY - @fragment.hidden = true - else - @found_visible = true - end - end - @fragments << @fragment - end - @fragment = nil - end - end - - ### Fragments - - # Represents a group of paragraphs in the email sharing common attributes. - # Paragraphs should get their own fragment if they are a quoted area or a - # signature. - class Fragment < Struct.new(:quoted, :signature, :hidden) - # This is an Array of String lines of content. Since the content is - # reversed, this array is backwards, and contains reversed strings. - attr_reader :lines, - - # This is reserved for the joined String that is build when this Fragment - # is finished. - :content - - def initialize(quoted, first_line) - self.signature = self.hidden = false - self.quoted = quoted - @lines = [first_line] - @content = nil - @lines.compact! - end - - alias quoted? quoted - alias signature? signature - alias hidden? hidden - - # Builds the string content by joining the lines and reversing them. - # - # Returns nothing. - def finish - @content = @lines.join("\n") - @lines = nil - @content.reverse! - end - - def to_s - @content - end - - def inspect - to_s.inspect - end - end -end diff --git a/lib/email/receiver.rb b/lib/email/receiver.rb index eb4a84eb4d8..794b2c02c0e 100644 --- a/lib/email/receiver.rb +++ b/lib/email/receiver.rb @@ -1,6 +1,5 @@ require_dependency 'new_post_manager' require_dependency 'email/html_cleaner' -require_dependency 'email/email_reply_parser' module Email @@ -136,7 +135,7 @@ module Email body = discourse_email_trimmer body raise EmptyEmailError if body.strip.blank? - body = EmailReplyParser.parse_reply body + body = DiscourseEmailParser.parse_reply body raise EmptyEmailError if body.strip.blank? body.force_encoding(encoding).encode("UTF-8")