# Specs for the HtmlToMarkdown converter.
require 'rails_helper'
require 'html_to_markdown'
# Exercises HtmlToMarkdown#to_markdown: every example feeds an HTML snippet to
# the converter and compares the result with the expected Markdown string.
describe HtmlToMarkdown do
  # Converts +html+ using a fresh HtmlToMarkdown instance.
  #
  # html - String of HTML to convert.
  # opts - Hash of options forwarded to HtmlToMarkdown.new; the examples in
  #        this file pass :keep_img_tags and :keep_cid_imgs.
  #
  # Returns the value of HtmlToMarkdown#to_markdown.
  def html_to_markdown(html, opts = {})
    HtmlToMarkdown.new(html, opts).to_markdown
  end

  it "remove whitespaces" do
    html = <<-HTML
      <div dir="auto">Hello,
        <div dir="auto"><br></div>
        <div dir="auto">&nbsp; &nbsp; This is the 1st paragraph.&nbsp; &nbsp;</div>
        <div dir="auto"><br></div>
        <div dir="auto">
          &nbsp; &nbsp; &nbsp; &nbsp; This is another paragraph
        </div>
      </div>
    HTML

    expect(html_to_markdown(html)).to eq("Hello,\n\nThis is the 1st paragraph.\n\nThis is another paragraph")
  end

  it "skips hidden tags" do
    expect(html_to_markdown(%Q{<p>Hello <span style="display: none">cruel </span>World!</p>})).to eq("Hello World!")
  end

  # In the four emphasis examples below, the second expectation covers text
  # that already contains "*": the expected marker switches to underscores.
  it "converts <strong>" do
    expect(html_to_markdown("<strong>Strong</strong>")).to eq("**Strong**")
    expect(html_to_markdown("<strong>Str*ng</strong>")).to eq("__Str*ng__")
  end

  it "converts <b>" do
    expect(html_to_markdown("<b>Bold</b>")).to eq("**Bold**")
    expect(html_to_markdown("<b>B*ld</b>")).to eq("__B*ld__")
  end

  it "converts <em>" do
    expect(html_to_markdown("<em>Emphasis</em>")).to eq("*Emphasis*")
    expect(html_to_markdown("<em>Emph*sis</em>")).to eq("_Emph*sis_")
  end

  it "converts <i>" do
    expect(html_to_markdown("<i>Italic</i>")).to eq("*Italic*")
    expect(html_to_markdown("<i>It*lic</i>")).to eq("_It*lic_")
  end

  it "converts <a>" do
    expect(html_to_markdown(%Q{<a href="https://www.discourse.org">Discourse</a>})).to eq("[Discourse](https://www.discourse.org)")
  end

  it "removes empty & invalid <a>" do
    expect(html_to_markdown(%Q{<a>Discourse</a>})).to eq("Discourse")
    expect(html_to_markdown(%Q{<a href="">Discourse</a>})).to eq("Discourse")
    expect(html_to_markdown(%Q{<a href="foo.bar">Discourse</a>})).to eq("Discourse")
  end

  # `||=` assigns only when the constant is not already set, so evaluating this
  # file more than once does not trigger "already initialized constant".
  HTML_WITH_IMG ||= %Q{<img src="https://www.discourse.org/logo.svg" alt="Discourse Logo">}
  HTML_WITH_CID_IMG ||= %Q{<img src="cid:ii_1525434659ddb4cb" alt="Discourse Logo">}

  it "converts <img>" do
    expect(html_to_markdown(HTML_WITH_IMG)).to eq("![Discourse Logo](https://www.discourse.org/logo.svg)")
  end

  it "keeps <img> with 'keep_img_tags'" do
    expect(html_to_markdown(HTML_WITH_IMG, keep_img_tags: true)).to eq(HTML_WITH_IMG)
  end

  it "removes empty & invalid <img>" do
    expect(html_to_markdown(%Q{<img>})).to eq("")
    expect(html_to_markdown(%Q{<img src="">})).to eq("")
    expect(html_to_markdown(%Q{<img src="foo.bar">})).to eq("")
  end

  it "keeps <img> with src='cid:' with 'keep_cid_imgs'" do
    expect(html_to_markdown(HTML_WITH_CID_IMG, keep_cid_imgs: true)).to eq("![Discourse Logo](cid:ii_1525434659ddb4cb)")
    expect(html_to_markdown(HTML_WITH_CID_IMG, keep_img_tags: true, keep_cid_imgs: true)).to eq("<img src=\"cid:ii_1525434659ddb4cb\" alt=\"Discourse Logo\">")
  end

  it "skips hidden <img>" do
    expect(html_to_markdown(%Q{<img src="https://www.discourse.org/logo.svg" width=0>})).to eq("")
    expect(html_to_markdown(%Q{<img src="https://www.discourse.org/logo.svg" height="0">})).to eq("")
    expect(html_to_markdown(%Q{<img src="https://www.discourse.org/logo.svg" style="width: 0">})).to eq("")
    expect(html_to_markdown(%Q{<img src="https://www.discourse.org/logo.svg" style="height:0px">})).to eq("")
  end

  # One example per heading level: <hN> is expected to become N "#" characters.
  (1..6).each do |n|
    it "converts <h#{n}>" do
      expect(html_to_markdown("<h#{n}>Header #{n}</h#{n}>")).to eq("#" * n + " Header #{n}")
    end
  end

  it "converts <br>" do
    expect(html_to_markdown("Before<br>Inside<br>After")).to eq("Before\nInside\nAfter")
  end

  it "converts <hr>" do
    expect(html_to_markdown("Before<hr>Inside<hr>After")).to eq("Before\n\n---\n\nInside\n\n---\n\nAfter")
  end

  it "converts <tt>" do
    expect(html_to_markdown("<tt>Teletype</tt>")).to eq("`Teletype`")
  end

  it "converts <code>" do
    expect(html_to_markdown("<code>Code</code>")).to eq("`Code`")
  end

  # The tags from <ins> through <strike> are expected to come back unchanged.
  it "supports <ins>" do
    expect(html_to_markdown("This is an <ins>insertion</ins>")).to eq("This is an <ins>insertion</ins>")
  end

  it "supports <del>" do
    expect(html_to_markdown("This is a <del>deletion</del>")).to eq("This is a <del>deletion</del>")
  end

  it "supports <sub>" do
    expect(html_to_markdown("H<sub>2</sub>O")).to eq("H<sub>2</sub>O")
  end

  it "supports <sup>" do
    expect(html_to_markdown("<sup>Super Script!</sup>")).to eq("<sup>Super Script!</sup>")
  end

  it "supports <small>" do
    expect(html_to_markdown("<small>Small</small>")).to eq("<small>Small</small>")
  end

  it "supports <kbd>" do
    expect(html_to_markdown("<kbd>CTRL</kbd>+<kbd>C</kbd>")).to eq("<kbd>CTRL</kbd>+<kbd>C</kbd>")
  end

  it "supports <abbr>" do
    expect(html_to_markdown(%Q{<abbr title="Civilized Discourse Construction Kit, Inc.">CDCK</abbr>})).to eq(%Q{<abbr title="Civilized Discourse Construction Kit, Inc.">CDCK</abbr>})
  end

  it "supports <s>" do
    expect(html_to_markdown("<s>Strike Through</s>")).to eq("<s>Strike Through</s>")
  end

  it "supports <strike>" do
    expect(html_to_markdown("<strike>Strike Through</strike>")).to eq("<strike>Strike Through</strike>")
  end

  it "supports <blockquote>" do
    expect(html_to_markdown("<blockquote>Quote</blockquote>")).to eq("> Quote")
  end

  it "supports <ul>" do
    expect(html_to_markdown("<ul><li>🍏</li><li>🍐</li><li>🍌</li></ul>")).to eq("- 🍏\n- 🍐\n- 🍌")
    expect(html_to_markdown("<ul>\n<li>🍏</li>\n<li>🍐</li>\n<li>🍌</li>\n</ul>")).to eq("- 🍏\n- 🍐\n- 🍌")
  end

  it "supports <ol>" do
    expect(html_to_markdown("<ol><li>🍆</li><li>🍅</li><li>🍄</li></ol>")).to eq("1. 🍆\n1. 🍅\n1. 🍄")
  end

  it "supports <p> inside <li>" do
    expect(html_to_markdown("<ul><li><p>🍏</p></li><li><p>🍐</p></li><li><p>🍌</p></li></ul>")).to eq("- 🍏\n\n- 🍐\n\n- 🍌")
  end

  it "supports <ul> inside <ul>" do
    html = <<-HTML
      <ul>
        <li>Fruits
            <ul>
              <li>🍏</li>
              <li>🍐</li>
              <li>🍌</li>
            </ul>
        </li>
        <li>Vegetables
            <ul>
              <li>🍆</li>
              <li>🍅</li>
              <li>🍄</li>
            </ul>
        </li>
      </ul>
    HTML

    # NOTE(review): a two-space indent for nested items is assumed here;
    # confirm the exact width against the converter's output.
    expect(html_to_markdown(html)).to eq("- Fruits\n  - 🍏\n  - 🍐\n  - 🍌\n- Vegetables\n  - 🍆\n  - 🍅\n  - 🍄")
  end

  it "supports bare <li>" do
    expect(html_to_markdown("<li>I'm alone</li>")).to eq("- I'm alone")
  end

  it "supports <pre>" do
    expect(html_to_markdown("<pre>var foo = 'bar';</pre>")).to eq("```\nvar foo = 'bar';\n```")
    expect(html_to_markdown("<pre><code>var foo = 'bar';</code></pre>")).to eq("```\nvar foo = 'bar';\n```")
    expect(html_to_markdown(%Q{<pre><code class="lang-javascript">var foo = 'bar';</code></pre>})).to eq("```javascript\nvar foo = 'bar';\n```")
  end

  it "works" do
    # NOTE(review): the two-space indent before each ">" is assumed; confirm
    # against the converter's output.
    expect(html_to_markdown("<ul><li><p>A list item with a blockquote:</p><blockquote><p>This is a <strong>blockquote</strong><br>inside a list item.</p></blockquote></li></ul>")).to eq("- A list item with a blockquote:\n\n  > This is a **blockquote**\n  > inside a list item.")
  end

  it "supports html document" do
    expect(html_to_markdown("<html><body>Hello<div>World</div></body></html>")).to eq("Hello\nWorld")
  end

  it "handles <p>" do
    expect(html_to_markdown("<p>1st paragraph</p><p>2nd paragraph</p>")).to eq("1st paragraph\n\n2nd paragraph")
  end

  it "handles <div>" do
    expect(html_to_markdown("<div>1st div</div><div>2nd div</div>")).to eq("1st div\n\n2nd div")
  end

  it "swallows <span>" do
    expect(html_to_markdown("<span>Span</span>")).to eq("Span")
  end

  it "swallows <u>" do
    expect(html_to_markdown("<u>Underline</u>")).to eq("Underline")
  end

  it "removes <script>" do
    expect(html_to_markdown("<script>var foo = 'bar'</script>")).to eq("")
  end

  it "removes <style>" do
    expect(html_to_markdown("<style>* { margin: 0 }</style>")).to eq("")
  end

  it "handles divs within spans" do
    html = "<div>1st paragraph<span><div>2nd paragraph</div></span></div>"
    expect(html_to_markdown(html)).to eq("1st paragraph\n2nd paragraph")
  end

  context "with an oddly placed <br>" do
    it "handles <strong>" do
      expect(html_to_markdown("<strong><br>Bold</strong>")).to eq("**Bold**")
      expect(html_to_markdown("<strong>Bold<br></strong>")).to eq("**Bold**")
      expect(html_to_markdown("<strong>Bold<br>text</strong>")).to eq("**Bold\ntext**")
    end

    it "handles <em>" do
      expect(html_to_markdown("<em><br>Italic</em>")).to eq("*Italic*")
      expect(html_to_markdown("<em>Italic<br></em>")).to eq("*Italic*")
      expect(html_to_markdown("<em>Italic<br>text</em>")).to eq("*Italic\ntext*")
    end
  end

  # NOTE(review): in the two contexts below, examples come in pairs that are
  # assumed to differ only in the amount of whitespace (one space, then a run
  # of four); confirm the intended run length.
  context "with an empty tag" do
    it "handles <strong>" do
      expect(html_to_markdown("<strong></strong>")).to eq("")
      expect(html_to_markdown("<strong> </strong>")).to eq("")
      expect(html_to_markdown("Some<strong> </strong>text")).to eq("Some text")
      expect(html_to_markdown("Some<strong>    </strong>text")).to eq("Some text")
    end

    it "handles <em>" do
      expect(html_to_markdown("<em></em>")).to eq("")
      expect(html_to_markdown("<em> </em>")).to eq("")
      expect(html_to_markdown("Some<em> </em>text")).to eq("Some text")
      expect(html_to_markdown("Some<em>    </em>text")).to eq("Some text")
    end
  end

  context "with spaces around text" do
    it "handles <strong>" do
      expect(html_to_markdown("<strong> Bold</strong>")).to eq("**Bold**")
      expect(html_to_markdown("<strong>    Bold</strong>")).to eq("**Bold**")
      expect(html_to_markdown("<strong>Bold </strong>")).to eq("**Bold**")
      expect(html_to_markdown("<strong>Bold    </strong>")).to eq("**Bold**")
      expect(html_to_markdown("Some<strong> bold</strong> text")).to eq("Some **bold** text")
      expect(html_to_markdown("Some<strong>    bold</strong> text")).to eq("Some **bold** text")
      expect(html_to_markdown("Some <strong>bold </strong>text")).to eq("Some **bold** text")
      expect(html_to_markdown("Some <strong>bold    </strong>text")).to eq("Some **bold** text")
    end

    it "handles <em>" do
      expect(html_to_markdown("<em> Italic</em>")).to eq("*Italic*")
      expect(html_to_markdown("<em>    Italic</em>")).to eq("*Italic*")
      expect(html_to_markdown("<em>Italic </em>")).to eq("*Italic*")
      expect(html_to_markdown("<em>Italic    </em>")).to eq("*Italic*")
      expect(html_to_markdown("Some<em> italic</em> text")).to eq("Some *italic* text")
      expect(html_to_markdown("Some<em>    italic</em> text")).to eq("Some *italic* text")
      expect(html_to_markdown("Some <em>italic </em>text")).to eq("Some *italic* text")
      expect(html_to_markdown("Some <em>italic    </em>text")).to eq("Some *italic* text")
    end
  end
end