# frozen_string_literal: true require "html_to_markdown" RSpec.describe HtmlToMarkdown do def html_to_markdown(html, opts = {}) HtmlToMarkdown.new(html, opts).to_markdown end it "remove whitespaces" do html = <<-HTML
Hello,

    This is the 1st paragraph.   

        This is another paragraph
HTML expect(html_to_markdown(html)).to eq( "Hello,\n\nThis is the 1st paragraph.\n\nThis is another paragraph", ) html = <<~HTML

Let me see if it happens by answering your message through Thunderbird.

Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1

HTML markdown = <<~MD Let me see if it happens by answering your message through Thunderbird. Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 MD expect(html_to_markdown(html)).to eq(markdown.strip) html = <<~HTML

This post has lots
of space

    This     space    was   left untouched     !
HTML markdown = <<~MD This post has lots of space ``` This space was left untouched ! ``` MD expect(html_to_markdown(html)).to eq(markdown.strip) end it "removes tags that aren't allowed" do html = <<~HTML Text withing custom tag
Text within allowed tag
HTML expect(html_to_markdown(html)).to eq("Text within allowed tag") end it "allows additional tags that can be consumed by subclasses" do class ExtendedHtmlToMarkdown < HtmlToMarkdown def to_markdown yield @doc super end end html = <<~HTML Image text
Text within allowed tag
HTML md = ExtendedHtmlToMarkdown .new(html) .to_markdown { |doc| expect(doc.css("custom-image")).to be_empty } expect(md).to eq("Text within allowed tag") md = ExtendedHtmlToMarkdown .new(html, { additional_allowed_tags: ["custom-image"] }) .to_markdown do |doc| doc.css("custom-image").each { |img| img.replace("Image #{img["image-id"]}") } end expect(md).to eq("Image 42\nText within allowed tag") end it "doesn't error on non-inline elements like (aside, section)" do html = <<~HTML HTML markdown = <<~MD > Hello, > is it me you're looking for? MD expect(html_to_markdown(html)).to eq(markdown.strip) end it "skips hidden tags" do expect(html_to_markdown("

Hello World!

")).to eq("Hello World!") end it "converts " do expect(html_to_markdown("Strong")).to eq("**Strong**") expect(html_to_markdown("Str*ng")).to eq("__Str*ng__") end it "converts " do expect(html_to_markdown("Bold")).to eq("**Bold**") expect(html_to_markdown("B*ld")).to eq("__B*ld__") html = <<~HTML Before

Bold

After HTML expect(html_to_markdown(html)).to eq("Before\n\n**Bold**\n\nAfter") end it "converts " do expect(html_to_markdown("Emphasis")).to eq("*Emphasis*") expect(html_to_markdown("Emph*sis")).to eq("_Emph*sis_") end it "converts " do expect(html_to_markdown("Italic")).to eq("*Italic*") expect(html_to_markdown("It*lic")).to eq("_It*lic_") end it "converts " do expect(html_to_markdown(%Q{Discourse})).to eq( "[Discourse](https://www.discourse.org)", ) end it "supports SiteSetting.allowed_href_schemes" do SiteSetting.allowed_href_schemes = "tel|steam" expect(html_to_markdown(%Q{LIMBO})).to eq( "[LIMBO](steam://store/48000)", ) end it "removes empty & invalid " do expect(html_to_markdown("Discourse")).to eq("Discourse") expect(html_to_markdown(%Q{Discourse})).to eq("Discourse") expect(html_to_markdown(%Q{Discourse})).to eq("Discourse") end HTML_WITH_IMG ||= %Q{Discourse Logo} HTML_WITH_CID_IMG ||= %Q{} it "converts " do expect(html_to_markdown(HTML_WITH_IMG)).to eq( "![Discourse Logo](https://www.discourse.org/logo.svg)", ) end it "keeps with 'keep_img_tags'" do expect(html_to_markdown(HTML_WITH_IMG, keep_img_tags: true)).to eq(HTML_WITH_IMG) end it "removes newlines from img alt text" do html_with_alt_newlines = %Q{Discourse\n\nLogo} expect(html_to_markdown(html_with_alt_newlines)).to eq( "![Discourse Logo](https://www.discourse.org/logo.svg)", ) end it "removes empty & invalid " do expect(html_to_markdown("")).to eq("") expect(html_to_markdown(%Q{})).to eq("") expect(html_to_markdown(%Q{})).to eq("") end it "keeps with src='cid:' with 'keep_cid_imgs'" do expect(html_to_markdown(HTML_WITH_CID_IMG, keep_cid_imgs: true)).to eq(HTML_WITH_CID_IMG) end it "removes newlines from img alt text with cid images" do html_with_cid_alt_newlines = %Q{} expect(html_to_markdown(html_with_cid_alt_newlines, keep_cid_imgs: true)).to eq( %Q{}, ) end it "skips hidden " do expect(html_to_markdown(%Q{})).to eq("") expect(html_to_markdown(%Q{})).to eq( "", ) end it "supports width/height on " do expect(html_to_markdown(%Q{})).to eq( "![](https://www.discourse.org/logo.svg)", ) expect(html_to_markdown(%Q{})).to eq( "![](https://www.discourse.org/logo.svg)", ) expect( html_to_markdown(%Q{}), ).to eq("![|200x100](https://www.discourse.org/logo.svg)") end (1..6).each do |n| it "converts " do expect(html_to_markdown("Header #{n}")).to eq("#" * n + " Header #{n}") end end it "converts
" do expect(html_to_markdown("Before
Inside
After")).to eq("Before\nInside\nAfter") end it "skips
inside

if next character is \n" do expect(html_to_markdown("

Before
\nInside
After

")).to eq("Before\nInside\nAfter") end it "converts
" do expect(html_to_markdown("Before
Inside
After")).to eq( "Before\n\n---\n\nInside\n\n---\n\nAfter", ) end it "converts " do expect(html_to_markdown("Teletype")).to eq("`Teletype`") end it "converts " do expect(html_to_markdown("Code")).to eq("`Code`") end describe "when HTML is used within Markdown" do HtmlToMarkdown::ALLOWED.each do |tag| it "keeps mandatory HTML entities in text of <#{tag}>" do expect(html_to_markdown("<#{tag}>Less than: <")).to eq( "<#{tag}>Less than: <", ) expect(html_to_markdown("<#{tag}>Greater than: >")).to eq( "<#{tag}>Greater than: >", ) expect(html_to_markdown("<#{tag}>Ampersand: &")).to eq( "<#{tag}>Ampersand: &", ) expect(html_to_markdown("<#{tag}>Double Quote: "")).to eq( "<#{tag}>Double Quote: \"", ) expect(html_to_markdown("<#{tag}>Single Quote: '")).to eq( "<#{tag}>Single Quote: '", ) expect(html_to_markdown("<#{tag}>Copyright Symbol: ©")).to eq( "<#{tag}>Copyright Symbol: ©", ) expect(html_to_markdown("<#{tag}>Euro Symbol: €")).to eq( "<#{tag}>Euro Symbol: €", ) end end end it "supports " do expect(html_to_markdown("This is an insertion")).to eq( "This is an insertion", ) end it "supports " do expect(html_to_markdown("This is a deletion")).to eq("This is a deletion") end it "supports " do expect(html_to_markdown("H2O")).to eq("H2O") end it "supports " do expect(html_to_markdown("This is highlighted!")).to eq( "This is highlighted!", ) end it "supports " do expect(html_to_markdown("Super Script!")).to eq("Super Script!") end it "supports " do expect(html_to_markdown("Small")).to eq("Small") expect(html_to_markdown("Small")).to eq( "Small", ) expect(html_to_markdown("Small")).to eq( "**Small**", ) expect(html_to_markdown("<small>")).to eq( "**<small>**", ) end it "supports " do expect(html_to_markdown("Big")).to eq("Big") expect(html_to_markdown("<big>")).to eq("<big>") end it "supports " do expect(html_to_markdown("CTRL+C")).to eq("CTRL+C") expect(html_to_markdown("<")).to eq("<") end it "supports " do expect( html_to_markdown(%Q{CDCK}), ).to eq(%Q{CDCK}) expect( html_to_markdown( %Q{<abbr>}, ), ).to eq(%Q{<abbr>}) end it "supports " do expect(html_to_markdown("Strike Through")).to eq("~~Strike Through~~") end it "supports " do expect(html_to_markdown("Strike Through")).to eq("~~Strike Through~~") end it "supports
" do expect(html_to_markdown("
Quote
")).to eq("> Quote") end it "supports
    " do expect(html_to_markdown("
    • 🍏
    • 🍐
    • 🍌
    ")).to eq("- 🍏\n- 🍐\n- 🍌") expect(html_to_markdown("
      \n
    • 🍏
    • \n
    • 🍐
    • \n
    • 🍌
    • \n
    ")).to eq( "- 🍏\n- 🍐\n- 🍌", ) end it "supports
      " do expect(html_to_markdown("
      1. 🍆
      2. 🍅
      3. 🍄
      ")).to eq("1. 🍆\n1. 🍅\n1. 🍄") end it "supports

      inside

    1. " do expect(html_to_markdown("
      • 🍏

      • 🍐

      • 🍌

      ")).to eq( "- 🍏\n\n- 🍐\n\n- 🍌", ) end it "supports
        inside
          " do expect(html_to_markdown(<<-HTML)).to eq(
          • Fruits
            • 🍏
            • 🍐
            • 🍌
          • Vegetables
            • 🍆
            • 🍅
            • 🍄
          HTML "- Fruits\n - 🍏\n - 🍐\n - 🍌\n- Vegetables\n - 🍆\n - 🍅\n - 🍄", ) end it "supports bare
        • " do expect(html_to_markdown("
        • I'm alone
        • ")).to eq("- I'm alone") end it "supports
          " do
              expect(html_to_markdown("
          var foo = 'bar';
          ")).to eq("```\nvar foo = 'bar';\n```") expect(html_to_markdown("
          var foo = 'bar';
          ")).to eq( "```\nvar foo = 'bar';\n```", ) expect( html_to_markdown(%Q{
          var foo = 'bar';
          }), ).to eq("```javascript\nvar foo = 'bar';\n```") expect( html_to_markdown( "
              function f() {\n        console.log('Hello world!');\n    }
          ", ), ).to eq("```\n function f() {\n console.log('Hello world!');\n }\n```") html = <<~HTML
          Reported-and-tested-by: A <a@example.com>
                Reviewed-by: B <b@example.com>
          HTML md = <<~MD ```plaintext Reported-and-tested-by: A Reviewed-by: B ``` MD expect(html_to_markdown(html)).to eq(md.strip) end it "supports
           inside 
          " do expect( html_to_markdown("
          var foo = 'bar';
          "), ).to eq("> ```\n> var foo = 'bar';\n> ```") end it "works" do expect( html_to_markdown( "
          • A list item with a blockquote:

            This is a blockquote
            inside a list item.

          ", ), ).to eq( "- A list item with a blockquote:\n\n > This is a **blockquote**\n > inside a list item.", ) end it "supports html document" do expect(html_to_markdown("Hello
          World
          ")).to eq( "Hello\nWorld", ) end it "handles

          " do expect(html_to_markdown("

          1st paragraph

          2nd paragraph

          ")).to eq( "1st paragraph\n\n2nd paragraph", ) expect( html_to_markdown( "

          1st paragraph

          \n

          2nd paragraph\n 2nd paragraph

          \n

          3rd paragraph

          ", ), ).to eq("1st paragraph\n\n2nd paragraph 2nd paragraph\n\n3rd paragraph") end it "handles
          " do expect(html_to_markdown("
          1st div
          2nd div
          ")).to eq("1st div\n2nd div") end it "swallows " do expect(html_to_markdown("Span")).to eq("Span") end it "swallows " do expect(html_to_markdown("Underline")).to eq("Underline") end it "swallows
          " do expect(html_to_markdown("
          Centered
          ")).to eq("Centered") end it "removes ")).to eq("") end it "removes ")).to eq("") end it "handles

          and

          within " do html = "
          1st paragraph
          2nd paragraph

          3rd paragraph

          " expect(html_to_markdown(html)).to eq("1st paragraph\n2nd paragraph\n\n3rd paragraph") end it "handles

          and

          within " do html = "1st paragraph
          2nd paragraph
          3rd paragraph

          4th paragraph

          " expect(html_to_markdown(html)).to eq( "1st paragraph\n2nd paragraph\n3rd paragraph\n\n4th paragraph", ) end context "with an oddly placed
          " do it "handles " do expect(html_to_markdown("Hello
          Bold
          World")).to eq( "Hello\n**Bold** World", ) expect(html_to_markdown("Hello Bold
          World")).to eq( "Hello **Bold**\nWorld", ) expect(html_to_markdown("Hello Bold
          text
          World")).to eq( "Hello **Bold**\n**text** World", ) end it "handles " do expect(html_to_markdown("Hello
          Italic
          World")).to eq("Hello\n*Italic* World") expect(html_to_markdown("Hello Italic
          World")).to eq("Hello *Italic*\nWorld") expect(html_to_markdown("Hello Italic
          text
          World")).to eq( "Hello *Italic*\n*text* World", ) end it "works" do expect(html_to_markdown("
          A B C
          D
          E
          F
          G
          ")).to eq( "A __B *C*__\n__*D* E__\n**F** G", ) end end context "with an empty tag" do it "handles " do expect(html_to_markdown("")).to eq("") expect(html_to_markdown(" ")).to eq("") expect(html_to_markdown("Some text")).to eq("Some text") expect(html_to_markdown("Some text")).to eq("Some text") end it "handles " do expect(html_to_markdown("")).to eq("") expect(html_to_markdown(" ")).to eq("") expect(html_to_markdown("Some text")).to eq("Some text") expect(html_to_markdown("Some text")).to eq("Some text") end end context "with spaces around text" do it "handles " do expect(html_to_markdown(" Bold")).to eq("**Bold**") expect(html_to_markdown(" Bold")).to eq("**Bold**") expect(html_to_markdown("Bold ")).to eq("**Bold**") expect(html_to_markdown("Bold ")).to eq("**Bold**") expect(html_to_markdown("Some bold text")).to eq("Some **bold** text") expect(html_to_markdown("Some bold text")).to eq("Some **bold** text") expect(html_to_markdown("Some bold text")).to eq("Some **bold** text") expect(html_to_markdown("Some bold text")).to eq("Some **bold** text") end it "handles " do expect(html_to_markdown(" Italic")).to eq("*Italic*") expect(html_to_markdown(" Italic")).to eq("*Italic*") expect(html_to_markdown("Italic ")).to eq("*Italic*") expect(html_to_markdown("Italic ")).to eq("*Italic*") expect(html_to_markdown("Some italic text")).to eq("Some *italic* text") expect(html_to_markdown("Some italic text")).to eq("Some *italic* text") expect(html_to_markdown("Some italic text")).to eq("Some *italic* text") expect(html_to_markdown("Some italic text")).to eq("Some *italic* text") end end it "supports " do html = <<~HTML
          This is the headers
          I am the first row
          And this is the 2nd line
          This is the footer
          HTML markdown = <<~MD | This | is | the | *headers* | | - | - | - | - | | I am | the | **first** | row | | And this | is the | 2nd | line | | This | is | the | footer | MD expect(html_to_markdown(html)).to eq(markdown.strip) expect(html_to_markdown("
          HelloWorld
          ")).to eq( "| Hello | World |\n| - | - |", ) end it "keeps HTML for badly formatted " do html = <<~HTML
          1 2 3 4
          <One> Two Three
          HTML markdown = <<~MD
          1 2 3 4
          <One> **Two** Three
          MD expect(html_to_markdown(html)).to eq(markdown.strip) end it "keeps HTML for with colspan" do html = <<~HTML
          1 2
          One / Two
          HTML markdown = <<~MD
          1 2
          One / Two
          MD expect(html_to_markdown(html)).to eq(markdown.strip) end it "keeps HTML for with rowspan" do html = <<~HTML
          1 2
          A B
          C
          HTML markdown = <<~MD
          1 2
          A B
          C
          MD expect(html_to_markdown(html)).to eq(markdown.strip) end end