discourse/spec/components/html_to_markdown_spec.rb
Régis Hanol cd93d1b5f7
FEATURE: new 'trim_incoming_emails' site setting (#12874)
This setting allows admins to activate or deactivate the automatic trimming of incoming emails.
There are instances where it does wonders in trimming all the garbage content and other
instances where it's so bad that it trims the most important part of the email.

FIX: don't remove hidden content using the style attribute when converting HTML to Markdown.
The regexp used was doing more harm than good. It was way too broad.

FIX: properly elide signatures from emails sent with Front App.
This is fairly safe as Front App nicely identifies signatures in the HTML part.
2021-04-28 17:08:48 +02:00

455 lines
16 KiB
Ruby

# frozen_string_literal: true
require 'rails_helper'
require 'html_to_markdown'
describe HtmlToMarkdown do
def html_to_markdown(html, opts = {})
HtmlToMarkdown.new(html, opts).to_markdown
end
it "remove whitespaces" do
html = <<-HTML
<div dir="auto">Hello,
<div dir="auto"><br></div>
<div dir="auto">&nbsp; &nbsp; This is the 1st paragraph.&nbsp; &nbsp; </div>
<div dir="auto"><br></div>
<div dir="auto">
&nbsp; &nbsp; &nbsp; &nbsp; This is another paragraph
</div>
</div>
HTML
expect(html_to_markdown(html)).to eq("Hello,\n\nThis is the 1st paragraph.\n\nThis is another paragraph")
html = <<~HTML
<body text="#000000" bgcolor="#FFFFFF">
<p>Let me see if it happens by answering your message through
Thunderbird.</p>
<p>Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1
Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1
Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1
Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1
Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1
Long sentence 1
</p>
</body>
HTML
markdown = <<~MD
Let me see if it happens by answering your message through Thunderbird.
Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1
MD
expect(html_to_markdown(html)).to eq(markdown.strip)
html = <<~HTML
<p> This post
has lots<br> of
space
</p>
<pre> This space was left untouched !</pre>
HTML
markdown = <<~MD
This post has lots
of space
```
This space was left untouched !
```
MD
expect(html_to_markdown(html)).to eq(markdown.strip)
end
it "doesn't error on non-inline elements like (aside, section)" do
html = <<~HTML
<aside class="quote no-group">
<blockquote>
<p>hello.</p>
</blockquote>
</aside>
HTML
markdown = <<~MD
> hello.
MD
expect(html_to_markdown(html)).to eq(markdown.strip)
end
it "skips hidden tags" do
expect(html_to_markdown(%Q{<p>Hello <span hidden>cruel </span>World!</p>})).to eq("Hello World!")
end
it "converts <strong>" do
expect(html_to_markdown("<strong>Strong</strong>")).to eq("**Strong**")
expect(html_to_markdown("<strong>Str*ng</strong>")).to eq("__Str*ng__")
end
it "converts <b>" do
expect(html_to_markdown("<b>Bold</b>")).to eq("**Bold**")
expect(html_to_markdown("<b>B*ld</b>")).to eq("__B*ld__")
html = <<~HTML
Before
<p><b>Bold
<br>
<br>
</b>
</p>
After
HTML
expect(html_to_markdown(html)).to eq("Before\n\n**Bold**\n\nAfter")
end
it "converts <em>" do
expect(html_to_markdown("<em>Emphasis</em>")).to eq("*Emphasis*")
expect(html_to_markdown("<em>Emph*sis</em>")).to eq("_Emph*sis_")
end
it "converts <i>" do
expect(html_to_markdown("<i>Italic</i>")).to eq("*Italic*")
expect(html_to_markdown("<i>It*lic</i>")).to eq("_It*lic_")
end
it "converts <a>" do
expect(html_to_markdown(%Q{<a href="https://www.discourse.org">Discourse</a>})).to eq("[Discourse](https://www.discourse.org)")
end
it "supports SiteSetting.allowed_href_schemes" do
SiteSetting.allowed_href_schemes = "tel|steam"
expect(html_to_markdown(%Q{<a href="steam://store/48000">LIMBO</a>})).to eq("[LIMBO](steam://store/48000)")
end
it "removes empty & invalid <a>" do
expect(html_to_markdown(%Q{<a>Discourse</a>})).to eq("Discourse")
expect(html_to_markdown(%Q{<a href="">Discourse</a>})).to eq("Discourse")
expect(html_to_markdown(%Q{<a href="foo.bar">Discourse</a>})).to eq("Discourse")
end
HTML_WITH_IMG ||= %Q{<img src="https://www.discourse.org/logo.svg" alt="Discourse Logo">}
HTML_WITH_CID_IMG ||= %Q{<img src="cid:ii_1525434659ddb4cb" title="Discourse Logo">}
it "converts <img>" do
expect(html_to_markdown(HTML_WITH_IMG)).to eq("![Discourse Logo](https://www.discourse.org/logo.svg)")
end
it "keeps <img> with 'keep_img_tags'" do
expect(html_to_markdown(HTML_WITH_IMG, keep_img_tags: true)).to eq(HTML_WITH_IMG)
end
it "removes empty & invalid <img>" do
expect(html_to_markdown(%Q{<img>})).to eq("")
expect(html_to_markdown(%Q{<img src="">})).to eq("")
expect(html_to_markdown(%Q{<img src="foo.bar">})).to eq("")
end
it "keeps <img> with src='cid:' whith 'keep_cid_imgs'" do
expect(html_to_markdown(HTML_WITH_CID_IMG, keep_cid_imgs: true)).to eq(HTML_WITH_CID_IMG)
end
it "skips hidden <img>" do
expect(html_to_markdown(%Q{<img src="https://www.discourse.org/logo.svg" width=0>})).to eq("")
expect(html_to_markdown(%Q{<img src="https://www.discourse.org/logo.svg" height="0">})).to eq("")
end
it "supports width/height on <img>" do
expect(html_to_markdown(%Q{<img src="https://www.discourse.org/logo.svg" height=100>})).to eq("![](https://www.discourse.org/logo.svg)")
expect(html_to_markdown(%Q{<img src="https://www.discourse.org/logo.svg" width=200>})).to eq("![](https://www.discourse.org/logo.svg)")
expect(html_to_markdown(%Q{<img src="https://www.discourse.org/logo.svg" height=100 width=200>})).to eq("![|200x100](https://www.discourse.org/logo.svg)")
end
(1..6).each do |n|
it "converts <h#{n}>" do
expect(html_to_markdown("<h#{n}>Header #{n}</h#{n}>")).to eq("#" * n + " Header #{n}")
end
end
it "converts <br>" do
expect(html_to_markdown("Before<br>Inside<br>After")).to eq("Before\nInside\nAfter")
end
it "skips <br> inside <p> if next character is \n" do
expect(html_to_markdown("<p>Before<br>\nInside<br>After</p>")).to eq("Before\nInside\nAfter")
end
it "converts <hr>" do
expect(html_to_markdown("Before<hr>Inside<hr>After")).to eq("Before\n\n---\n\nInside\n\n---\n\nAfter")
end
it "converts <tt>" do
expect(html_to_markdown("<tt>Teletype</tt>")).to eq("`Teletype`")
end
it "converts <code>" do
expect(html_to_markdown("<code>Code</code>")).to eq("`Code`")
end
it "supports <ins>" do
expect(html_to_markdown("This is an <ins>insertion</ins>")).to eq("This is an <ins>insertion</ins>")
end
it "supports <del>" do
expect(html_to_markdown("This is a <del>deletion</del>")).to eq("This is a <del>deletion</del>")
end
it "supports <sub>" do
expect(html_to_markdown("H<sub>2</sub>O")).to eq("H<sub>2</sub>O")
end
it "supports <mark>" do
expect(html_to_markdown("<mark>This is highlighted!</mark>")).to eq("<mark>This is highlighted!</mark>")
end
it "supports <sup>" do
expect(html_to_markdown("<sup>Super Script!</sup>")).to eq("<sup>Super Script!</sup>")
end
it "supports <small>" do
expect(html_to_markdown("<small>Small</small>")).to eq("<small>Small</small>")
end
it "supports <kbd>" do
expect(html_to_markdown("<kbd>CTRL</kbd>+<kbd>C</kbd>")).to eq("<kbd>CTRL</kbd>+<kbd>C</kbd>")
end
it "supports <abbr>" do
expect(html_to_markdown(%Q{<abbr title="Civilized Discourse Construction Kit, Inc.">CDCK</abbr>})).to eq(%Q{<abbr title="Civilized Discourse Construction Kit, Inc.">CDCK</abbr>})
end
it "supports <s>" do
expect(html_to_markdown("<s>Strike Through</s>")).to eq("~~Strike Through~~")
end
it "supports <strike>" do
expect(html_to_markdown("<strike>Strike Through</strike>")).to eq("~~Strike Through~~")
end
it "supports <blockquote>" do
expect(html_to_markdown("<blockquote>Quote</blockquote>")).to eq("> Quote")
end
it "supports <ul>" do
expect(html_to_markdown("<ul><li>🍏</li><li>🍐</li><li>🍌</li></ul>")).to eq("- 🍏\n- 🍐\n- 🍌")
expect(html_to_markdown("<ul>\n<li>🍏</li>\n<li>🍐</li>\n<li>🍌</li>\n</ul>")).to eq("- 🍏\n- 🍐\n- 🍌")
end
it "supports <ol>" do
expect(html_to_markdown("<ol><li>🍆</li><li>🍅</li><li>🍄</li></ol>")).to eq("1. 🍆\n1. 🍅\n1. 🍄")
end
it "supports <p> inside <li>" do
expect(html_to_markdown("<ul><li><p>🍏</p></li><li><p>🍐</p></li><li><p>🍌</p></li></ul>")).to eq("- 🍏\n\n- 🍐\n\n- 🍌")
end
it "supports <ul> inside <ul>" do
expect(html_to_markdown(<<-HTML
<ul>
<li>Fruits
<ul>
<li>🍏</li>
<li>🍐</li>
<li>🍌</li>
</ul>
</li>
<li>Vegetables
<ul>
<li>🍆</li>
<li>🍅</li>
<li>🍄</li>
</ul>
</li>
</ul>
HTML
)).to eq("- Fruits\n - 🍏\n - 🍐\n - 🍌\n- Vegetables\n - 🍆\n - 🍅\n - 🍄")
end
it "supports bare <li>" do
expect(html_to_markdown("<li>I'm alone</li>")).to eq("- I'm alone")
end
it "supports <pre>" do
expect(html_to_markdown("<pre>var foo = 'bar';</pre>")).to eq("```\nvar foo = 'bar';\n```")
expect(html_to_markdown("<pre><code>var foo = 'bar';</code></pre>")).to eq("```\nvar foo = 'bar';\n```")
expect(html_to_markdown(%Q{<pre><code class="lang-javascript">var foo = 'bar';</code></pre>})).to eq("```javascript\nvar foo = 'bar';\n```")
expect(html_to_markdown("<pre> function f() {\n console.log('Hello world!');\n }</pre>")).to eq("```\n function f() {\n console.log('Hello world!');\n }\n```")
end
it "supports <pre> inside <blockquote>" do
expect(html_to_markdown("<blockquote><pre><code>var foo = 'bar';</code></pre></blockquote>")).to eq("> ```\n> var foo = 'bar';\n> ```")
end
it "works" do
expect(html_to_markdown("<ul><li><p>A list item with a blockquote:</p><blockquote><p>This is a <strong>blockquote</strong><br>inside a list item.</p></blockquote></li></ul>")).to eq("- A list item with a blockquote:\n\n > This is a **blockquote**\n > inside a list item.")
end
it "supports html document" do
expect(html_to_markdown("<html><body>Hello<div>World</div></body></html>")).to eq("Hello\nWorld")
end
it "handles <p>" do
expect(html_to_markdown("<p>1st paragraph</p><p>2nd paragraph</p>")).to eq("1st paragraph\n\n2nd paragraph")
expect(html_to_markdown("<body><p>1st paragraph</p>\n <p> 2nd paragraph\n 2nd paragraph</p>\n<p>3rd paragraph</p></body>")).to eq("1st paragraph\n\n2nd paragraph 2nd paragraph\n\n3rd paragraph")
end
it "handles <div>" do
expect(html_to_markdown("<div>1st div</div><div>2nd div</div>")).to eq("1st div\n2nd div")
end
it "swallows <span>" do
expect(html_to_markdown("<span>Span</span>")).to eq("Span")
end
it "swallows <u>" do
expect(html_to_markdown("<u>Underline</u>")).to eq("Underline")
end
it "removes <script>" do
expect(html_to_markdown("<script>var foo = 'bar'</script>")).to eq("")
end
it "removes <style>" do
expect(html_to_markdown("<style>* { margin: 0 }</style>")).to eq("")
end
it "handles <p> and <div> within <span>" do
html = "<div>1st paragraph<span><div>2nd paragraph</div><p>3rd paragraph</p></span></div>"
expect(html_to_markdown(html)).to eq("1st paragraph\n2nd paragraph\n\n3rd paragraph")
end
it "handles <p> and <div> within <font>" do
html = "<font>1st paragraph<br><span>2nd paragraph</span><div>3rd paragraph</div><p>4th paragraph</p></font>"
expect(html_to_markdown(html)).to eq("1st paragraph\n2nd paragraph\n3rd paragraph\n\n4th paragraph")
end
context "with an oddly placed <br>" do
it "handles <strong>" do
expect(html_to_markdown("Hello <strong><br>Bold</strong> World")).to eq("Hello\n**Bold** World")
expect(html_to_markdown("Hello <strong>Bold<br></strong> World")).to eq("Hello **Bold**\nWorld")
expect(html_to_markdown("Hello <strong>Bold<br>text</strong> World")).to eq("Hello **Bold**\n**text** World")
end
it "handles <em>" do
expect(html_to_markdown("Hello <em><br>Italic</em> World")).to eq("Hello\n*Italic* World")
expect(html_to_markdown("Hello <em>Italic<br></em> World")).to eq("Hello *Italic*\nWorld")
expect(html_to_markdown("Hello <em>Italic<br>text</em> World")).to eq("Hello *Italic*\n*text* World")
end
it "works" do
expect(html_to_markdown("<div>A <b> B <i> C <br> D </i> E <br> F </b> G</div>")).to eq("A __B *C*__\n__*D* E__\n**F** G")
end
end
context "with an empty tag" do
it "handles <strong>" do
expect(html_to_markdown("<strong></strong>")).to eq("")
expect(html_to_markdown("<strong> </strong>")).to eq("")
expect(html_to_markdown("Some<strong> </strong>text")).to eq("Some text")
expect(html_to_markdown("Some<strong> </strong>text")).to eq("Some text")
end
it "handles <em>" do
expect(html_to_markdown("<em></em>")).to eq("")
expect(html_to_markdown("<em> </em>")).to eq("")
expect(html_to_markdown("Some<em> </em>text")).to eq("Some text")
expect(html_to_markdown("Some<em> </em>text")).to eq("Some text")
end
end
context "with spaces around text" do
it "handles <strong>" do
expect(html_to_markdown("<strong> Bold</strong>")).to eq("**Bold**")
expect(html_to_markdown("<strong> Bold</strong>")).to eq("**Bold**")
expect(html_to_markdown("<strong>Bold </strong>")).to eq("**Bold**")
expect(html_to_markdown("<strong>Bold </strong>")).to eq("**Bold**")
expect(html_to_markdown("Some<strong> bold</strong> text")).to eq("Some **bold** text")
expect(html_to_markdown("Some<strong> bold</strong> text")).to eq("Some **bold** text")
expect(html_to_markdown("Some <strong>bold </strong>text")).to eq("Some **bold** text")
expect(html_to_markdown("Some <strong>bold </strong>text")).to eq("Some **bold** text")
end
it "handles <em>" do
expect(html_to_markdown("<em> Italic</em>")).to eq("*Italic*")
expect(html_to_markdown("<em> Italic</em>")).to eq("*Italic*")
expect(html_to_markdown("<em>Italic </em>")).to eq("*Italic*")
expect(html_to_markdown("<em>Italic </em>")).to eq("*Italic*")
expect(html_to_markdown("Some<em> italic</em> text")).to eq("Some *italic* text")
expect(html_to_markdown("Some<em> italic</em> text")).to eq("Some *italic* text")
expect(html_to_markdown("Some <em>italic </em>text")).to eq("Some *italic* text")
expect(html_to_markdown("Some <em>italic </em>text")).to eq("Some *italic* text")
end
end
it "supoorts <table>" do
html = <<~HTML
<table>
<thead>
<tr>
<th>This</th>
<th>is</th>
<th>the</th>
<th><i>headers</i></th>
</tr>
</thead>
<tbody>
<tr>
<td>I am</td>
<td>the</td>
<td><b>first</b></td>
<td>row</td>
</tr>
<tr>
<td>And this</td>
<td>is the</td>
<td>2<sup>nd</sup></td>
<td>line</td>
</tr>
</tbody>
</table>
HTML
markdown = <<~MD
| This | is | the | *headers* |
| - | - | - | - |
| I am | the | **first** | row |
| And this | is the | 2<sup>nd</sup> | line |
MD
expect(html_to_markdown(html)).to eq(markdown.strip)
expect(html_to_markdown("<table><tr><td>Hello</td><td>World</td></tr></table>")).to eq("| Hello | World |\n| - | - |")
end
it "doesn't swallow badly formatted <table>" do
html = <<~HTML
<table>
<tr>
<th>1</th>
<th>2</th>
<th>3</th>
<th>4</th>
</tr>
<tr>
<td>One</td>
<td>Two</td>
<td>Three</td>
</tr>
</table>
HTML
expect(html_to_markdown(html)).to eq("1 2 3 4 \nOne Two Three")
end
end