Add 'keep_cid_imgs' option to HTML to Markdown converter to improve incoming email parsing

This commit is contained in:
Régis Hanol 2017-05-03 22:53:47 +02:00
parent 9d7917f79d
commit 768c63c103
2 changed files with 22 additions and 10 deletions

View File

@@ -136,7 +136,7 @@ class HtmlToMarkdown
end end
def visit_img(node) def visit_img(node)
if is_valid_url?(node["src"]) && is_visible_img?(node) if is_valid_src?(node["src"]) && is_visible_img?(node)
if @opts[:keep_img_tags] if @opts[:keep_img_tags]
@stack[-1].markdown << node.to_html @stack[-1].markdown << node.to_html
else else
@@ -147,7 +147,7 @@ class HtmlToMarkdown
end end
def visit_a(node) def visit_a(node)
if is_valid_url?(node["href"]) if is_valid_href?(node["href"])
@stack[-1].markdown << "[" @stack[-1].markdown << "["
traverse(node) traverse(node)
@stack[-1].markdown << "](#{node["href"]})" @stack[-1].markdown << "](#{node["href"]})"
@@ -206,8 +206,14 @@ class HtmlToMarkdown
(lines + [""]).join("\n") (lines + [""]).join("\n")
end end
def is_valid_url?(url) def is_valid_href?(href)
url.present? && (url.start_with?("http") || url.start_with?("www.")) href.present? && (href.start_with?("http") || href.start_with?("www."))
end
def is_valid_src?(src)
return false if src.blank?
return true if @opts[:keep_cid_imgs] && src.start_with?("cid:")
src.start_with?("http") || src.start_with?("www.")
end end
def is_visible_img?(img) def is_visible_img?(img)

View File

@@ -3,8 +3,8 @@ require 'html_to_markdown'
describe HtmlToMarkdown do describe HtmlToMarkdown do
def html_to_markdown(html) def html_to_markdown(html, opts={})
HtmlToMarkdown.new(html).to_markdown HtmlToMarkdown.new(html, opts).to_markdown
end end
it "remove whitespaces" do it "remove whitespaces" do
@@ -56,13 +56,14 @@ describe HtmlToMarkdown do
end end
HTML_WITH_IMG ||= %Q{<img src="https://www.discourse.org/logo.svg" alt="Discourse Logo">} HTML_WITH_IMG ||= %Q{<img src="https://www.discourse.org/logo.svg" alt="Discourse Logo">}
HTML_WITH_CID_IMG ||= %Q{<img src="cid:ii_1525434659ddb4cb" alt="Discourse Logo">}
it "converts <img>" do it "converts <img>" do
expect(html_to_markdown(HTML_WITH_IMG)).to eq("![Discourse Logo](https://www.discourse.org/logo.svg)") expect(html_to_markdown(HTML_WITH_IMG)).to eq("![Discourse Logo](https://www.discourse.org/logo.svg)")
end end
it "keeps <img> with 'keep_img_tags'" do it "keeps <img> with 'keep_img_tags'" do
expect(HtmlToMarkdown.new(HTML_WITH_IMG, keep_img_tags: true).to_markdown).to eq(HTML_WITH_IMG) expect(html_to_markdown(HTML_WITH_IMG, keep_img_tags: true)).to eq(HTML_WITH_IMG)
end end
it "removes empty & invalid <img>" do it "removes empty & invalid <img>" do
@@ -71,6 +72,11 @@ describe HtmlToMarkdown do
expect(html_to_markdown(%Q{<img src="foo.bar">})).to eq("") expect(html_to_markdown(%Q{<img src="foo.bar">})).to eq("")
end end
it "keeps <img> with src='cid:' whith 'keep_cid_imgs'" do
expect(html_to_markdown(HTML_WITH_CID_IMG, keep_cid_imgs: true)).to eq("![Discourse Logo](cid:ii_1525434659ddb4cb)")
expect(html_to_markdown(HTML_WITH_CID_IMG, keep_img_tags: true, keep_cid_imgs: true)).to eq("<img src=\"cid:ii_1525434659ddb4cb\" alt=\"Discourse Logo\">")
end
it "skips hidden <img>" do it "skips hidden <img>" do
expect(html_to_markdown(%Q{<img src="https://www.discourse.org/logo.svg" width=0>})).to eq("") expect(html_to_markdown(%Q{<img src="https://www.discourse.org/logo.svg" width=0>})).to eq("")
expect(html_to_markdown(%Q{<img src="https://www.discourse.org/logo.svg" height="0">})).to eq("") expect(html_to_markdown(%Q{<img src="https://www.discourse.org/logo.svg" height="0">})).to eq("")