discourse/spec/components/html_to_markdown_spec.rb
Josh Soref 59097b207f
DEV: Correct typos and spelling mistakes (#12812)
Over the years we accrued many spelling mistakes in the code base. 

This PR attempts to fix spelling mistakes and typos in all areas of the code that are extremely safe to change 

- comments
- test descriptions
- other low risk areas
2021-05-21 11:43:47 +10:00

455 lines
16 KiB
Ruby

# frozen_string_literal: true
require 'rails_helper'
require 'html_to_markdown'
describe HtmlToMarkdown do
def html_to_markdown(html, opts = {})
HtmlToMarkdown.new(html, opts).to_markdown
end
it "remove whitespaces" do
html = <<-HTML
<div dir="auto">Hello,
<div dir="auto"><br></div>
<div dir="auto">&nbsp; &nbsp; This is the 1st paragraph.&nbsp; &nbsp; </div>
<div dir="auto"><br></div>
<div dir="auto">
&nbsp; &nbsp; &nbsp; &nbsp; This is another paragraph
</div>
</div>
HTML
expect(html_to_markdown(html)).to eq("Hello,\n\nThis is the 1st paragraph.\n\nThis is another paragraph")
html = <<~HTML
<body text="#000000" bgcolor="#FFFFFF">
<p>Let me see if it happens by answering your message through
Thunderbird.</p>
<p>Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1
Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1
Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1
Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1
Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1
Long sentence 1
</p>
</body>
HTML
markdown = <<~MD
Let me see if it happens by answering your message through Thunderbird.
Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1
MD
expect(html_to_markdown(html)).to eq(markdown.strip)
html = <<~HTML
<p> This post
has lots<br> of
space
</p>
<pre> This space was left untouched !</pre>
HTML
markdown = <<~MD
This post has lots
of space
```
This space was left untouched !
```
MD
expect(html_to_markdown(html)).to eq(markdown.strip)
end
it "doesn't error on non-inline elements like (aside, section)" do
html = <<~HTML
<aside class="quote no-group">
<blockquote>
<p>hello.</p>
</blockquote>
</aside>
HTML
markdown = <<~MD
> hello.
MD
expect(html_to_markdown(html)).to eq(markdown.strip)
end
it "skips hidden tags" do
expect(html_to_markdown(%Q{<p>Hello <span hidden>cruel </span>World!</p>})).to eq("Hello World!")
end
it "converts <strong>" do
expect(html_to_markdown("<strong>Strong</strong>")).to eq("**Strong**")
expect(html_to_markdown("<strong>Str*ng</strong>")).to eq("__Str*ng__")
end
it "converts <b>" do
expect(html_to_markdown("<b>Bold</b>")).to eq("**Bold**")
expect(html_to_markdown("<b>B*ld</b>")).to eq("__B*ld__")
html = <<~HTML
Before
<p><b>Bold
<br>
<br>
</b>
</p>
After
HTML
expect(html_to_markdown(html)).to eq("Before\n\n**Bold**\n\nAfter")
end
it "converts <em>" do
expect(html_to_markdown("<em>Emphasis</em>")).to eq("*Emphasis*")
expect(html_to_markdown("<em>Emph*sis</em>")).to eq("_Emph*sis_")
end
it "converts <i>" do
expect(html_to_markdown("<i>Italic</i>")).to eq("*Italic*")
expect(html_to_markdown("<i>It*lic</i>")).to eq("_It*lic_")
end
it "converts <a>" do
expect(html_to_markdown(%Q{<a href="https://www.discourse.org">Discourse</a>})).to eq("[Discourse](https://www.discourse.org)")
end
it "supports SiteSetting.allowed_href_schemes" do
SiteSetting.allowed_href_schemes = "tel|steam"
expect(html_to_markdown(%Q{<a href="steam://store/48000">LIMBO</a>})).to eq("[LIMBO](steam://store/48000)")
end
it "removes empty & invalid <a>" do
expect(html_to_markdown(%Q{<a>Discourse</a>})).to eq("Discourse")
expect(html_to_markdown(%Q{<a href="">Discourse</a>})).to eq("Discourse")
expect(html_to_markdown(%Q{<a href="foo.bar">Discourse</a>})).to eq("Discourse")
end
HTML_WITH_IMG ||= %Q{<img src="https://www.discourse.org/logo.svg" alt="Discourse Logo">}
HTML_WITH_CID_IMG ||= %Q{<img src="cid:ii_1525434659ddb4cb" title="Discourse Logo">}
it "converts <img>" do
expect(html_to_markdown(HTML_WITH_IMG)).to eq("![Discourse Logo](https://www.discourse.org/logo.svg)")
end
it "keeps <img> with 'keep_img_tags'" do
expect(html_to_markdown(HTML_WITH_IMG, keep_img_tags: true)).to eq(HTML_WITH_IMG)
end
it "removes empty & invalid <img>" do
expect(html_to_markdown(%Q{<img>})).to eq("")
expect(html_to_markdown(%Q{<img src="">})).to eq("")
expect(html_to_markdown(%Q{<img src="foo.bar">})).to eq("")
end
it "keeps <img> with src='cid:' with 'keep_cid_imgs'" do
expect(html_to_markdown(HTML_WITH_CID_IMG, keep_cid_imgs: true)).to eq(HTML_WITH_CID_IMG)
end
it "skips hidden <img>" do
expect(html_to_markdown(%Q{<img src="https://www.discourse.org/logo.svg" width=0>})).to eq("")
expect(html_to_markdown(%Q{<img src="https://www.discourse.org/logo.svg" height="0">})).to eq("")
end
it "supports width/height on <img>" do
expect(html_to_markdown(%Q{<img src="https://www.discourse.org/logo.svg" height=100>})).to eq("![](https://www.discourse.org/logo.svg)")
expect(html_to_markdown(%Q{<img src="https://www.discourse.org/logo.svg" width=200>})).to eq("![](https://www.discourse.org/logo.svg)")
expect(html_to_markdown(%Q{<img src="https://www.discourse.org/logo.svg" height=100 width=200>})).to eq("![|200x100](https://www.discourse.org/logo.svg)")
end
(1..6).each do |n|
it "converts <h#{n}>" do
expect(html_to_markdown("<h#{n}>Header #{n}</h#{n}>")).to eq("#" * n + " Header #{n}")
end
end
it "converts <br>" do
expect(html_to_markdown("Before<br>Inside<br>After")).to eq("Before\nInside\nAfter")
end
it "skips <br> inside <p> if next character is \n" do
expect(html_to_markdown("<p>Before<br>\nInside<br>After</p>")).to eq("Before\nInside\nAfter")
end
it "converts <hr>" do
expect(html_to_markdown("Before<hr>Inside<hr>After")).to eq("Before\n\n---\n\nInside\n\n---\n\nAfter")
end
it "converts <tt>" do
expect(html_to_markdown("<tt>Teletype</tt>")).to eq("`Teletype`")
end
it "converts <code>" do
expect(html_to_markdown("<code>Code</code>")).to eq("`Code`")
end
it "supports <ins>" do
expect(html_to_markdown("This is an <ins>insertion</ins>")).to eq("This is an <ins>insertion</ins>")
end
it "supports <del>" do
expect(html_to_markdown("This is a <del>deletion</del>")).to eq("This is a <del>deletion</del>")
end
it "supports <sub>" do
expect(html_to_markdown("H<sub>2</sub>O")).to eq("H<sub>2</sub>O")
end
it "supports <mark>" do
expect(html_to_markdown("<mark>This is highlighted!</mark>")).to eq("<mark>This is highlighted!</mark>")
end
it "supports <sup>" do
expect(html_to_markdown("<sup>Super Script!</sup>")).to eq("<sup>Super Script!</sup>")
end
it "supports <small>" do
expect(html_to_markdown("<small>Small</small>")).to eq("<small>Small</small>")
end
it "supports <kbd>" do
expect(html_to_markdown("<kbd>CTRL</kbd>+<kbd>C</kbd>")).to eq("<kbd>CTRL</kbd>+<kbd>C</kbd>")
end
it "supports <abbr>" do
expect(html_to_markdown(%Q{<abbr title="Civilized Discourse Construction Kit, Inc.">CDCK</abbr>})).to eq(%Q{<abbr title="Civilized Discourse Construction Kit, Inc.">CDCK</abbr>})
end
it "supports <s>" do
expect(html_to_markdown("<s>Strike Through</s>")).to eq("~~Strike Through~~")
end
it "supports <strike>" do
expect(html_to_markdown("<strike>Strike Through</strike>")).to eq("~~Strike Through~~")
end
it "supports <blockquote>" do
expect(html_to_markdown("<blockquote>Quote</blockquote>")).to eq("> Quote")
end
it "supports <ul>" do
expect(html_to_markdown("<ul><li>🍏</li><li>🍐</li><li>🍌</li></ul>")).to eq("- 🍏\n- 🍐\n- 🍌")
expect(html_to_markdown("<ul>\n<li>🍏</li>\n<li>🍐</li>\n<li>🍌</li>\n</ul>")).to eq("- 🍏\n- 🍐\n- 🍌")
end
it "supports <ol>" do
expect(html_to_markdown("<ol><li>🍆</li><li>🍅</li><li>🍄</li></ol>")).to eq("1. 🍆\n1. 🍅\n1. 🍄")
end
it "supports <p> inside <li>" do
expect(html_to_markdown("<ul><li><p>🍏</p></li><li><p>🍐</p></li><li><p>🍌</p></li></ul>")).to eq("- 🍏\n\n- 🍐\n\n- 🍌")
end
it "supports <ul> inside <ul>" do
expect(html_to_markdown(<<-HTML
<ul>
<li>Fruits
<ul>
<li>🍏</li>
<li>🍐</li>
<li>🍌</li>
</ul>
</li>
<li>Vegetables
<ul>
<li>🍆</li>
<li>🍅</li>
<li>🍄</li>
</ul>
</li>
</ul>
HTML
)).to eq("- Fruits\n - 🍏\n - 🍐\n - 🍌\n- Vegetables\n - 🍆\n - 🍅\n - 🍄")
end
it "supports bare <li>" do
expect(html_to_markdown("<li>I'm alone</li>")).to eq("- I'm alone")
end
it "supports <pre>" do
expect(html_to_markdown("<pre>var foo = 'bar';</pre>")).to eq("```\nvar foo = 'bar';\n```")
expect(html_to_markdown("<pre><code>var foo = 'bar';</code></pre>")).to eq("```\nvar foo = 'bar';\n```")
expect(html_to_markdown(%Q{<pre><code class="lang-javascript">var foo = 'bar';</code></pre>})).to eq("```javascript\nvar foo = 'bar';\n```")
expect(html_to_markdown("<pre> function f() {\n console.log('Hello world!');\n }</pre>")).to eq("```\n function f() {\n console.log('Hello world!');\n }\n```")
end
it "supports <pre> inside <blockquote>" do
expect(html_to_markdown("<blockquote><pre><code>var foo = 'bar';</code></pre></blockquote>")).to eq("> ```\n> var foo = 'bar';\n> ```")
end
it "works" do
expect(html_to_markdown("<ul><li><p>A list item with a blockquote:</p><blockquote><p>This is a <strong>blockquote</strong><br>inside a list item.</p></blockquote></li></ul>")).to eq("- A list item with a blockquote:\n\n > This is a **blockquote**\n > inside a list item.")
end
it "supports html document" do
expect(html_to_markdown("<html><body>Hello<div>World</div></body></html>")).to eq("Hello\nWorld")
end
it "handles <p>" do
expect(html_to_markdown("<p>1st paragraph</p><p>2nd paragraph</p>")).to eq("1st paragraph\n\n2nd paragraph")
expect(html_to_markdown("<body><p>1st paragraph</p>\n <p> 2nd paragraph\n 2nd paragraph</p>\n<p>3rd paragraph</p></body>")).to eq("1st paragraph\n\n2nd paragraph 2nd paragraph\n\n3rd paragraph")
end
it "handles <div>" do
expect(html_to_markdown("<div>1st div</div><div>2nd div</div>")).to eq("1st div\n2nd div")
end
it "swallows <span>" do
expect(html_to_markdown("<span>Span</span>")).to eq("Span")
end
it "swallows <u>" do
expect(html_to_markdown("<u>Underline</u>")).to eq("Underline")
end
it "removes <script>" do
expect(html_to_markdown("<script>var foo = 'bar'</script>")).to eq("")
end
it "removes <style>" do
expect(html_to_markdown("<style>* { margin: 0 }</style>")).to eq("")
end
it "handles <p> and <div> within <span>" do
html = "<div>1st paragraph<span><div>2nd paragraph</div><p>3rd paragraph</p></span></div>"
expect(html_to_markdown(html)).to eq("1st paragraph\n2nd paragraph\n\n3rd paragraph")
end
it "handles <p> and <div> within <font>" do
html = "<font>1st paragraph<br><span>2nd paragraph</span><div>3rd paragraph</div><p>4th paragraph</p></font>"
expect(html_to_markdown(html)).to eq("1st paragraph\n2nd paragraph\n3rd paragraph\n\n4th paragraph")
end
context "with an oddly placed <br>" do
it "handles <strong>" do
expect(html_to_markdown("Hello <strong><br>Bold</strong> World")).to eq("Hello\n**Bold** World")
expect(html_to_markdown("Hello <strong>Bold<br></strong> World")).to eq("Hello **Bold**\nWorld")
expect(html_to_markdown("Hello <strong>Bold<br>text</strong> World")).to eq("Hello **Bold**\n**text** World")
end
it "handles <em>" do
expect(html_to_markdown("Hello <em><br>Italic</em> World")).to eq("Hello\n*Italic* World")
expect(html_to_markdown("Hello <em>Italic<br></em> World")).to eq("Hello *Italic*\nWorld")
expect(html_to_markdown("Hello <em>Italic<br>text</em> World")).to eq("Hello *Italic*\n*text* World")
end
it "works" do
expect(html_to_markdown("<div>A <b> B <i> C <br> D </i> E <br> F </b> G</div>")).to eq("A __B *C*__\n__*D* E__\n**F** G")
end
end
context "with an empty tag" do
it "handles <strong>" do
expect(html_to_markdown("<strong></strong>")).to eq("")
expect(html_to_markdown("<strong> </strong>")).to eq("")
expect(html_to_markdown("Some<strong> </strong>text")).to eq("Some text")
expect(html_to_markdown("Some<strong> </strong>text")).to eq("Some text")
end
it "handles <em>" do
expect(html_to_markdown("<em></em>")).to eq("")
expect(html_to_markdown("<em> </em>")).to eq("")
expect(html_to_markdown("Some<em> </em>text")).to eq("Some text")
expect(html_to_markdown("Some<em> </em>text")).to eq("Some text")
end
end
context "with spaces around text" do
it "handles <strong>" do
expect(html_to_markdown("<strong> Bold</strong>")).to eq("**Bold**")
expect(html_to_markdown("<strong> Bold</strong>")).to eq("**Bold**")
expect(html_to_markdown("<strong>Bold </strong>")).to eq("**Bold**")
expect(html_to_markdown("<strong>Bold </strong>")).to eq("**Bold**")
expect(html_to_markdown("Some<strong> bold</strong> text")).to eq("Some **bold** text")
expect(html_to_markdown("Some<strong> bold</strong> text")).to eq("Some **bold** text")
expect(html_to_markdown("Some <strong>bold </strong>text")).to eq("Some **bold** text")
expect(html_to_markdown("Some <strong>bold </strong>text")).to eq("Some **bold** text")
end
it "handles <em>" do
expect(html_to_markdown("<em> Italic</em>")).to eq("*Italic*")
expect(html_to_markdown("<em> Italic</em>")).to eq("*Italic*")
expect(html_to_markdown("<em>Italic </em>")).to eq("*Italic*")
expect(html_to_markdown("<em>Italic </em>")).to eq("*Italic*")
expect(html_to_markdown("Some<em> italic</em> text")).to eq("Some *italic* text")
expect(html_to_markdown("Some<em> italic</em> text")).to eq("Some *italic* text")
expect(html_to_markdown("Some <em>italic </em>text")).to eq("Some *italic* text")
expect(html_to_markdown("Some <em>italic </em>text")).to eq("Some *italic* text")
end
end
it "supports <table>" do
html = <<~HTML
<table>
<thead>
<tr>
<th>This</th>
<th>is</th>
<th>the</th>
<th><i>headers</i></th>
</tr>
</thead>
<tbody>
<tr>
<td>I am</td>
<td>the</td>
<td><b>first</b></td>
<td>row</td>
</tr>
<tr>
<td>And this</td>
<td>is the</td>
<td>2<sup>nd</sup></td>
<td>line</td>
</tr>
</tbody>
</table>
HTML
markdown = <<~MD
| This | is | the | *headers* |
| - | - | - | - |
| I am | the | **first** | row |
| And this | is the | 2<sup>nd</sup> | line |
MD
expect(html_to_markdown(html)).to eq(markdown.strip)
expect(html_to_markdown("<table><tr><td>Hello</td><td>World</td></tr></table>")).to eq("| Hello | World |\n| - | - |")
end
it "doesn't swallow badly formatted <table>" do
html = <<~HTML
<table>
<tr>
<th>1</th>
<th>2</th>
<th>3</th>
<th>4</th>
</tr>
<tr>
<td>One</td>
<td>Two</td>
<td>Three</td>
</tr>
</table>
HTML
expect(html_to_markdown(html)).to eq("1 2 3 4 \nOne Two Three")
end
end