discourse/spec/lib/html_to_markdown_spec.rb
Martin Brennan 575bc4af73
FIX: Remove newlines from img alt & title in HTML to markdown parser (#25473)
We were having a minor issue with emails with embedded images
that had newlines in the alt string; for example:

```
<p class="MsoNormal"><span style="font-size:11.0pt"><img width="898"
height="498" style="width:9.3541in;height:5.1875in" id="Picture_x0020_5"
src="cid:image003.png@01DA4EBA.0400B610" alt="A screenshot of a computer
program

Description automatically generated"></span><span
style="font-size:11.0pt"><o:p></o:p></span></p>
```

Once this was parsed and converted to markdown (or directly to HTML
in some cases), this caused an issue in the composer and the post
UI, where the markdown parser didn't know how to deal with this,
making the HTML show directly instead of showing an image.

The easiest way to deal with this is to just strip \n from image
alt and title attrs in the HTMLToMarkdown class.
2024-01-31 10:23:09 +10:00

530 lines
17 KiB
Ruby

# frozen_string_literal: true
require "html_to_markdown"
RSpec.describe HtmlToMarkdown do
def html_to_markdown(html, opts = {})
HtmlToMarkdown.new(html, opts).to_markdown
end
it "remove whitespaces" do
html = <<-HTML
<div dir="auto">Hello,
<div dir="auto"><br></div>
<div dir="auto">&nbsp; &nbsp; This is the 1st paragraph.&nbsp; &nbsp; </div>
<div dir="auto"><br></div>
<div dir="auto">
&nbsp; &nbsp; &nbsp; &nbsp; This is another paragraph
</div>
</div>
HTML
expect(html_to_markdown(html)).to eq(
"Hello,\n\nThis is the 1st paragraph.\n\nThis is another paragraph",
)
html = <<~HTML
<body text="#000000" bgcolor="#FFFFFF">
<p>Let me see if it happens by answering your message through
Thunderbird.</p>
<p>Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1
Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1
Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1
Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1
Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1
Long sentence 1
</p>
</body>
HTML
markdown = <<~MD
Let me see if it happens by answering your message through Thunderbird.
Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1
MD
expect(html_to_markdown(html)).to eq(markdown.strip)
html = <<~HTML
<p> This post
has lots<br> of
space
</p>
<pre> This space was left untouched !</pre>
HTML
markdown = <<~MD
This post has lots
of space
```
This space was left untouched !
```
MD
expect(html_to_markdown(html)).to eq(markdown.strip)
end
it "doesn't error on non-inline elements like (aside, section)" do
html = <<~HTML
<aside class="quote no-group">
<blockquote>
<p>Hello,<br>is it me you're looking for?</p>
</blockquote>
<br>
</aside>
HTML
markdown = <<~MD
> Hello,
> is it me you're looking for?
MD
expect(html_to_markdown(html)).to eq(markdown.strip)
end
it "skips hidden tags" do
expect(html_to_markdown("<p>Hello <span hidden>cruel </span>World!</p>")).to eq("Hello World!")
end
it "converts <strong>" do
expect(html_to_markdown("<strong>Strong</strong>")).to eq("**Strong**")
expect(html_to_markdown("<strong>Str*ng</strong>")).to eq("__Str*ng__")
end
it "converts <b>" do
expect(html_to_markdown("<b>Bold</b>")).to eq("**Bold**")
expect(html_to_markdown("<b>B*ld</b>")).to eq("__B*ld__")
html = <<~HTML
Before
<p><b>Bold
<br>
<br>
</b>
</p>
After
HTML
expect(html_to_markdown(html)).to eq("Before\n\n**Bold**\n\nAfter")
end
it "converts <em>" do
expect(html_to_markdown("<em>Emphasis</em>")).to eq("*Emphasis*")
expect(html_to_markdown("<em>Emph*sis</em>")).to eq("_Emph*sis_")
end
it "converts <i>" do
expect(html_to_markdown("<i>Italic</i>")).to eq("*Italic*")
expect(html_to_markdown("<i>It*lic</i>")).to eq("_It*lic_")
end
it "converts <a>" do
expect(html_to_markdown(%Q{<a href="https://www.discourse.org">Discourse</a>})).to eq(
"[Discourse](https://www.discourse.org)",
)
end
it "supports SiteSetting.allowed_href_schemes" do
SiteSetting.allowed_href_schemes = "tel|steam"
expect(html_to_markdown(%Q{<a href="steam://store/48000">LIMBO</a>})).to eq(
"[LIMBO](steam://store/48000)",
)
end
it "removes empty & invalid <a>" do
expect(html_to_markdown("<a>Discourse</a>")).to eq("Discourse")
expect(html_to_markdown(%Q{<a href="">Discourse</a>})).to eq("Discourse")
expect(html_to_markdown(%Q{<a href="foo.bar">Discourse</a>})).to eq("Discourse")
end
HTML_WITH_IMG ||= %Q{<img src="https://www.discourse.org/logo.svg" alt="Discourse Logo">}
HTML_WITH_CID_IMG ||= %Q{<img src="cid:ii_1525434659ddb4cb" title="Discourse Logo">}
it "converts <img>" do
expect(html_to_markdown(HTML_WITH_IMG)).to eq(
"![Discourse Logo](https://www.discourse.org/logo.svg)",
)
end
it "keeps <img> with 'keep_img_tags'" do
expect(html_to_markdown(HTML_WITH_IMG, keep_img_tags: true)).to eq(HTML_WITH_IMG)
end
it "removes newlines from img alt text" do
html_with_alt_newlines =
%Q{<img src="https://www.discourse.org/logo.svg" alt="Discourse\n\nLogo">}
expect(html_to_markdown(html_with_alt_newlines)).to eq(
"![Discourse Logo](https://www.discourse.org/logo.svg)",
)
end
it "removes empty & invalid <img>" do
expect(html_to_markdown("<img>")).to eq("")
expect(html_to_markdown(%Q{<img src="">})).to eq("")
expect(html_to_markdown(%Q{<img src="foo.bar">})).to eq("")
end
it "keeps <img> with src='cid:' with 'keep_cid_imgs'" do
expect(html_to_markdown(HTML_WITH_CID_IMG, keep_cid_imgs: true)).to eq(HTML_WITH_CID_IMG)
end
it "removes newlines from img alt text with cid images" do
html_with_cid_alt_newlines = %Q{<img src="cid:ii_1525434659ddb4cb" title="Discourse\n\nLogo">}
expect(html_to_markdown(html_with_cid_alt_newlines, keep_cid_imgs: true)).to eq(
%Q{<img src="cid:ii_1525434659ddb4cb" title="Discourse Logo">},
)
end
it "skips hidden <img>" do
expect(html_to_markdown(%Q{<img src="https://www.discourse.org/logo.svg" width=0>})).to eq("")
expect(html_to_markdown(%Q{<img src="https://www.discourse.org/logo.svg" height="0">})).to eq(
"",
)
end
it "supports width/height on <img>" do
expect(html_to_markdown(%Q{<img src="https://www.discourse.org/logo.svg" height=100>})).to eq(
"![](https://www.discourse.org/logo.svg)",
)
expect(html_to_markdown(%Q{<img src="https://www.discourse.org/logo.svg" width=200>})).to eq(
"![](https://www.discourse.org/logo.svg)",
)
expect(
html_to_markdown(%Q{<img src="https://www.discourse.org/logo.svg" height=100 width=200>}),
).to eq("![|200x100](https://www.discourse.org/logo.svg)")
end
(1..6).each do |n|
it "converts <h#{n}>" do
expect(html_to_markdown("<h#{n}>Header #{n}</h#{n}>")).to eq("#" * n + " Header #{n}")
end
end
it "converts <br>" do
expect(html_to_markdown("Before<br>Inside<br>After")).to eq("Before\nInside\nAfter")
end
it "skips <br> inside <p> if next character is \n" do
expect(html_to_markdown("<p>Before<br>\nInside<br>After</p>")).to eq("Before\nInside\nAfter")
end
it "converts <hr>" do
expect(html_to_markdown("Before<hr>Inside<hr>After")).to eq(
"Before\n\n---\n\nInside\n\n---\n\nAfter",
)
end
it "converts <tt>" do
expect(html_to_markdown("<tt>Teletype</tt>")).to eq("`Teletype`")
end
it "converts <code>" do
expect(html_to_markdown("<code>Code</code>")).to eq("`Code`")
end
it "supports <ins>" do
expect(html_to_markdown("This is an <ins>insertion</ins>")).to eq(
"This is an <ins>insertion</ins>",
)
end
it "supports <del>" do
expect(html_to_markdown("This is a <del>deletion</del>")).to eq("This is a <del>deletion</del>")
end
it "supports <sub>" do
expect(html_to_markdown("H<sub>2</sub>O")).to eq("H<sub>2</sub>O")
end
it "supports <mark>" do
expect(html_to_markdown("<mark>This is highlighted!</mark>")).to eq(
"<mark>This is highlighted!</mark>",
)
end
it "supports <sup>" do
expect(html_to_markdown("<sup>Super Script!</sup>")).to eq("<sup>Super Script!</sup>")
end
it "supports <small>" do
expect(html_to_markdown("<small>Small</small>")).to eq("<small>Small</small>")
end
it "supports <kbd>" do
expect(html_to_markdown("<kbd>CTRL</kbd>+<kbd>C</kbd>")).to eq("<kbd>CTRL</kbd>+<kbd>C</kbd>")
end
it "supports <abbr>" do
expect(
html_to_markdown(%Q{<abbr title="Civilized Discourse Construction Kit, Inc.">CDCK</abbr>}),
).to eq(%Q{<abbr title="Civilized Discourse Construction Kit, Inc.">CDCK</abbr>})
end
it "supports <s>" do
expect(html_to_markdown("<s>Strike Through</s>")).to eq("~~Strike Through~~")
end
it "supports <strike>" do
expect(html_to_markdown("<strike>Strike Through</strike>")).to eq("~~Strike Through~~")
end
it "supports <blockquote>" do
expect(html_to_markdown("<blockquote>Quote</blockquote>")).to eq("> Quote")
end
it "supports <ul>" do
expect(html_to_markdown("<ul><li>🍏</li><li>🍐</li><li>🍌</li></ul>")).to eq("- 🍏\n- 🍐\n- 🍌")
expect(html_to_markdown("<ul>\n<li>🍏</li>\n<li>🍐</li>\n<li>🍌</li>\n</ul>")).to eq(
"- 🍏\n- 🍐\n- 🍌",
)
end
it "supports <ol>" do
expect(html_to_markdown("<ol><li>🍆</li><li>🍅</li><li>🍄</li></ol>")).to eq("1. 🍆\n1. 🍅\n1. 🍄")
end
it "supports <p> inside <li>" do
expect(html_to_markdown("<ul><li><p>🍏</p></li><li><p>🍐</p></li><li><p>🍌</p></li></ul>")).to eq(
"- 🍏\n\n- 🍐\n\n- 🍌",
)
end
it "supports <ul> inside <ul>" do
expect(html_to_markdown(<<-HTML)).to eq(
<ul>
<li>Fruits
<ul>
<li>🍏</li>
<li>🍐</li>
<li>🍌</li>
</ul>
</li>
<li>Vegetables
<ul>
<li>🍆</li>
<li>🍅</li>
<li>🍄</li>
</ul>
</li>
</ul>
HTML
"- Fruits\n - 🍏\n - 🍐\n - 🍌\n- Vegetables\n - 🍆\n - 🍅\n - 🍄",
)
end
it "supports bare <li>" do
expect(html_to_markdown("<li>I'm alone</li>")).to eq("- I'm alone")
end
it "supports <pre>" do
expect(html_to_markdown("<pre>var foo = 'bar';</pre>")).to eq("```\nvar foo = 'bar';\n```")
expect(html_to_markdown("<pre><code>var foo = 'bar';</code></pre>")).to eq(
"```\nvar foo = 'bar';\n```",
)
expect(
html_to_markdown(%Q{<pre><code class="lang-javascript">var foo = 'bar';</code></pre>}),
).to eq("```javascript\nvar foo = 'bar';\n```")
expect(
html_to_markdown(
"<pre> function f() {\n console.log('Hello world!');\n }</pre>",
),
).to eq("```\n function f() {\n console.log('Hello world!');\n }\n```")
end
it "supports <pre> inside <blockquote>" do
expect(
html_to_markdown("<blockquote><pre><code>var foo = 'bar';</code></pre></blockquote>"),
).to eq("> ```\n> var foo = 'bar';\n> ```")
end
it "works" do
expect(
html_to_markdown(
"<ul><li><p>A list item with a blockquote:</p><blockquote><p>This is a <strong>blockquote</strong><br>inside a list item.</p></blockquote></li></ul>",
),
).to eq(
"- A list item with a blockquote:\n\n > This is a **blockquote**\n > inside a list item.",
)
end
it "supports html document" do
expect(html_to_markdown("<html><body>Hello<div>World</div></body></html>")).to eq(
"Hello\nWorld",
)
end
it "handles <p>" do
expect(html_to_markdown("<p>1st paragraph</p><p>2nd paragraph</p>")).to eq(
"1st paragraph\n\n2nd paragraph",
)
expect(
html_to_markdown(
"<body><p>1st paragraph</p>\n <p> 2nd paragraph\n 2nd paragraph</p>\n<p>3rd paragraph</p></body>",
),
).to eq("1st paragraph\n\n2nd paragraph 2nd paragraph\n\n3rd paragraph")
end
it "handles <div>" do
expect(html_to_markdown("<div>1st div</div><div>2nd div</div>")).to eq("1st div\n2nd div")
end
it "swallows <span>" do
expect(html_to_markdown("<span>Span</span>")).to eq("Span")
end
it "swallows <u>" do
expect(html_to_markdown("<u>Underline</u>")).to eq("Underline")
end
it "removes <script>" do
expect(html_to_markdown("<script>var foo = 'bar'</script>")).to eq("")
end
it "removes <style>" do
expect(html_to_markdown("<style>* { margin: 0 }</style>")).to eq("")
end
it "handles <p> and <div> within <span>" do
html = "<div>1st paragraph<span><div>2nd paragraph</div><p>3rd paragraph</p></span></div>"
expect(html_to_markdown(html)).to eq("1st paragraph\n2nd paragraph\n\n3rd paragraph")
end
it "handles <p> and <div> within <font>" do
html =
"<font>1st paragraph<br><span>2nd paragraph</span><div>3rd paragraph</div><p>4th paragraph</p></font>"
expect(html_to_markdown(html)).to eq(
"1st paragraph\n2nd paragraph\n3rd paragraph\n\n4th paragraph",
)
end
context "with an oddly placed <br>" do
it "handles <strong>" do
expect(html_to_markdown("Hello <strong><br>Bold</strong> World")).to eq(
"Hello\n**Bold** World",
)
expect(html_to_markdown("Hello <strong>Bold<br></strong> World")).to eq(
"Hello **Bold**\nWorld",
)
expect(html_to_markdown("Hello <strong>Bold<br>text</strong> World")).to eq(
"Hello **Bold**\n**text** World",
)
end
it "handles <em>" do
expect(html_to_markdown("Hello <em><br>Italic</em> World")).to eq("Hello\n*Italic* World")
expect(html_to_markdown("Hello <em>Italic<br></em> World")).to eq("Hello *Italic*\nWorld")
expect(html_to_markdown("Hello <em>Italic<br>text</em> World")).to eq(
"Hello *Italic*\n*text* World",
)
end
it "works" do
expect(html_to_markdown("<div>A <b> B <i> C <br> D </i> E <br> F </b> G</div>")).to eq(
"A __B *C*__\n__*D* E__\n**F** G",
)
end
end
context "with an empty tag" do
it "handles <strong>" do
expect(html_to_markdown("<strong></strong>")).to eq("")
expect(html_to_markdown("<strong> </strong>")).to eq("")
expect(html_to_markdown("Some<strong> </strong>text")).to eq("Some text")
expect(html_to_markdown("Some<strong> </strong>text")).to eq("Some text")
end
it "handles <em>" do
expect(html_to_markdown("<em></em>")).to eq("")
expect(html_to_markdown("<em> </em>")).to eq("")
expect(html_to_markdown("Some<em> </em>text")).to eq("Some text")
expect(html_to_markdown("Some<em> </em>text")).to eq("Some text")
end
end
context "with spaces around text" do
it "handles <strong>" do
expect(html_to_markdown("<strong> Bold</strong>")).to eq("**Bold**")
expect(html_to_markdown("<strong> Bold</strong>")).to eq("**Bold**")
expect(html_to_markdown("<strong>Bold </strong>")).to eq("**Bold**")
expect(html_to_markdown("<strong>Bold </strong>")).to eq("**Bold**")
expect(html_to_markdown("Some<strong> bold</strong> text")).to eq("Some **bold** text")
expect(html_to_markdown("Some<strong> bold</strong> text")).to eq("Some **bold** text")
expect(html_to_markdown("Some <strong>bold </strong>text")).to eq("Some **bold** text")
expect(html_to_markdown("Some <strong>bold </strong>text")).to eq("Some **bold** text")
end
it "handles <em>" do
expect(html_to_markdown("<em> Italic</em>")).to eq("*Italic*")
expect(html_to_markdown("<em> Italic</em>")).to eq("*Italic*")
expect(html_to_markdown("<em>Italic </em>")).to eq("*Italic*")
expect(html_to_markdown("<em>Italic </em>")).to eq("*Italic*")
expect(html_to_markdown("Some<em> italic</em> text")).to eq("Some *italic* text")
expect(html_to_markdown("Some<em> italic</em> text")).to eq("Some *italic* text")
expect(html_to_markdown("Some <em>italic </em>text")).to eq("Some *italic* text")
expect(html_to_markdown("Some <em>italic </em>text")).to eq("Some *italic* text")
end
end
it "supports <table>" do
html = <<~HTML
<table>
<thead>
<tr>
<th>This</th>
<th>is</th>
<th>the</th>
<th><i>headers</i></th>
</tr>
</thead>
<tbody>
<tr>
<td>I am</td>
<td>the</td>
<td><b>first</b></td>
<td>row</td>
</tr>
<tr>
<td>And this</td>
<td>is the</td>
<td>2<sup>nd</sup></td>
<td>line</td>
</tr>
</tbody>
</table>
HTML
markdown = <<~MD
| This | is | the | *headers* |
| - | - | - | - |
| I am | the | **first** | row |
| And this | is the | 2<sup>nd</sup> | line |
MD
expect(html_to_markdown(html)).to eq(markdown.strip)
expect(html_to_markdown("<table><tr><td>Hello</td><td>World</td></tr></table>")).to eq(
"| Hello | World |\n| - | - |",
)
end
it "doesn't swallow badly formatted <table>" do
html = <<~HTML
<table>
<tr>
<th>1</th>
<th>2</th>
<th>3</th>
<th>4</th>
</tr>
<tr>
<td>One</td>
<td>Two</td>
<td>Three</td>
</tr>
</table>
HTML
expect(html_to_markdown(html)).to eq("1 2 3 4 \nOne Two Three")
end
end