From b76674f6406f94539e6b3060d60c010f2de99d19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9gis=20Hanol?= Date: Wed, 26 Apr 2017 16:49:06 +0200 Subject: [PATCH] FEATURE: convert incoming emails in HTML to markdown - remove incoming_email_prefer_html site setting - remove HtmlCleaner class --- config/locales/server.en.yml | 1 - config/site_settings.yml | 2 - lib/email/html_cleaner.rb | 132 ------------------------- lib/email/receiver.rb | 16 +-- lib/html_to_markdown.rb | 2 +- spec/components/email/receiver_spec.rb | 14 +-- 6 files changed, 10 insertions(+), 157 deletions(-) delete mode 100644 lib/email/html_cleaner.rb diff --git a/config/locales/server.en.yml b/config/locales/server.en.yml index 40617b68e0f..19c7d0880c0 100644 --- a/config/locales/server.en.yml +++ b/config/locales/server.en.yml @@ -1306,7 +1306,6 @@ en: reply_by_email_enabled: "Enable replying to topics via email." reply_by_email_address: "Template for reply by email incoming email address, for example: %{reply_key}@reply.example.com or replies+%{reply_key}@example.com" alternative_reply_by_email_addresses: "List of alternative templates for reply by email incoming email addresses. Example: %{reply_key}@reply.example.com|replies+%{reply_key}@example.com" - incoming_email_prefer_html: "Use the HTML instead of the text for incoming email. May cause unexpected formatting issues!" disable_emails: "Prevent Discourse from sending any kind of emails" diff --git a/config/site_settings.yml b/config/site_settings.yml index 370bd71946d..c978dc48ec9 100644 --- a/config/site_settings.yml +++ b/config/site_settings.yml @@ -646,8 +646,6 @@ email: pop3_polling_username: '' pop3_polling_password: '' log_mail_processing_failures: false - incoming_email_prefer_html: - default: false email_in: default: false client: true diff --git a/lib/email/html_cleaner.rb b/lib/email/html_cleaner.rb deleted file mode 100644 index e8c89f7a1b1..00000000000 --- a/lib/email/html_cleaner.rb +++ /dev/null @@ -1,132 +0,0 @@ -module Email - # HtmlCleaner cleans up the extremely dirty HTML that many email clients - # generate by stripping out any excess divs or spans, removing styling in - # the process (which also makes the html more suitable to be parsed as - # Markdown). - class HtmlCleaner - # Elements to hoist all children out of - HTML_HOIST_ELEMENTS = %w(div span font table tbody th tr td) - # Node types to always delete - HTML_DELETE_ELEMENT_TYPES = [ - Nokogiri::XML::Node::DTD_NODE, - Nokogiri::XML::Node::COMMENT_NODE, - ] - - # Private variables: - # @doc - nokogiri document - # @out - same as @doc, but only if trimming has occured - def initialize(html) - if String === html - @doc = Nokogiri::HTML(html) - else - @doc = html - end - end - - class << self - # Email::HtmlCleaner.trim(inp, opts={}) - # - # Arguments: - # inp - Either a HTML string or a Nokogiri document. - # Options: - # :return => :doc, :string - # Specify the desired return type. - # Defaults to the type of the input. - # A value of :string is equivalent to calling get_document_text() - # on the returned document. - def trim(inp, opts={}) - cleaner = HtmlCleaner.new(inp) - - opts[:return] ||= ((String === inp) ? :string : :doc) - - if opts[:return] == :string - cleaner.output_html - else - cleaner.output_document - end - end - - # Email::HtmlCleaner.get_document_text(doc) - # - # Get the body portion of the document, including html, as a string. - def get_document_text(doc) - body = doc.xpath('//body') - if body - body.inner_html - else - doc.inner_html - end - end - end - - def output_document - @out ||= begin - doc = @doc - trim_process_node doc - add_newlines doc - doc - end - end - - def output_html - HtmlCleaner.get_document_text(output_document) - end - - private - - def add_newlines(doc) - # Replace
tags with a markdown \n - doc.xpath('//br').each do |br| - br.replace(new_linebreak_node doc, 2) - end - # Surround

tags with newlines, to help with line-wise postprocessing - # and ensure markdown paragraphs - doc.xpath('//p').each do |p| - p.before(new_linebreak_node doc) - p.after(new_linebreak_node doc, 2) - end - end - - def new_linebreak_node(doc, count=1) - Nokogiri::XML::Text.new("\n" * count, doc) - end - - def trim_process_node(node) - if should_hoist?(node) - hoisted = trim_hoist_element node - hoisted.each { |child| trim_process_node child } - elsif should_delete?(node) - node.remove - else - if children = node.children - children.each { |child| trim_process_node child } - end - end - - node - end - - def trim_hoist_element(element) - hoisted = [] - element.children.each do |child| - element.before(child) - hoisted << child - end - element.remove - hoisted - end - - def should_hoist?(node) - return false unless node.element? - HTML_HOIST_ELEMENTS.include? node.name - end - - def should_delete?(node) - return true if HTML_DELETE_ELEMENT_TYPES.include? node.type - return true if node.element? && node.name == 'head' - return true if node.text? && node.text.strip.blank? - - false - end - end -end diff --git a/lib/email/receiver.rb b/lib/email/receiver.rb index b629fdb5587..f87ca1bab64 100644 --- a/lib/email/receiver.rb +++ b/lib/email/receiver.rb @@ -1,7 +1,7 @@ require "digest" require_dependency "new_post_manager" require_dependency "post_action_creator" -require_dependency "email/html_cleaner" +require_dependency "html_to_markdown" module Email @@ -188,18 +188,18 @@ module Email text = fix_charset(@mail) end - if html.present? && (SiteSetting.incoming_email_prefer_html || text.blank?) - html = Email::HtmlCleaner.new(html).output_html - html = trim_discourse_markers(html) - html, elided = EmailReplyTrimmer.trim(html, true) - return [html, elided] - end - if text.present? text = trim_discourse_markers(text) text, elided = EmailReplyTrimmer.trim(text, true) return [text, elided] end + + if html.present? + markdown = HtmlToMarkdown.new(html).to_markdown + markdown = trim_discourse_markers(markdown) + markdown, elided = EmailReplyTrimmer.trim(markdown, true) + return [markdown, elided] + end end def fix_charset(mail_part) diff --git a/lib/html_to_markdown.rb b/lib/html_to_markdown.rb index 7cfa0d6df01..6a5112c8270 100644 --- a/lib/html_to_markdown.rb +++ b/lib/html_to_markdown.rb @@ -115,7 +115,7 @@ class HtmlToMarkdown RUBY end - WHITELISTED ||= %w{del ins kbd s small strike sub sup table tbody td tfoot th thead tr} + WHITELISTED ||= %w{del ins kbd s small strike sub sup} WHITELISTED.each do |tag| class_eval <<-RUBY def visit_#{tag}(node) diff --git a/spec/components/email/receiver_spec.rb b/spec/components/email/receiver_spec.rb index 5f0ea2fcf63..e00fcc353a8 100644 --- a/spec/components/email/receiver_spec.rb +++ b/spec/components/email/receiver_spec.rb @@ -157,7 +157,7 @@ describe Email::Receiver do expect(topic.posts.last.cooked).not_to match(/
HTML reply ;)") + expect(topic.posts.last.raw).to eq("This is a **HTML** reply ;)") expect { process(:hebrew_reply) }.to change { topic.posts.count } expect(topic.posts.last.raw).to eq("שלום! מה שלומך היום?") @@ -174,18 +174,6 @@ describe Email::Receiver do expect(topic.posts.last.raw).to eq("This is the *text* part.") end - it "prefers html over text when site setting is enabled" do - SiteSetting.incoming_email_prefer_html = true - expect { process(:text_and_html_reply) }.to change { topic.posts.count } - expect(topic.posts.last.raw).to eq('This is the html part.') - end - - it "uses text when prefer_html site setting is enabled but no html is available" do - SiteSetting.incoming_email_prefer_html = true - expect { process(:text_reply) }.to change { topic.posts.count } - expect(topic.posts.last.raw).to eq("This is a text reply :)") - end - it "removes the 'on , wrote' quoting line" do expect { process(:on_date_contact_wrote) }.to change { topic.posts.count } expect(topic.posts.last.raw).to eq("This is the actual reply.")