From b57b635d302c87dc2172d4fe688de492d2a37d2a Mon Sep 17 00:00:00 2001 From: Robin Ward Date: Tue, 9 May 2017 12:33:54 -0400 Subject: [PATCH] FIX: Extract `div` tags within `span`s --- lib/html_to_markdown.rb | 17 +++++++++++++++-- spec/components/html_to_markdown_spec.rb | 5 +++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/lib/html_to_markdown.rb b/lib/html_to_markdown.rb index 5cfbc9b6495..2044d5e37f4 100644 --- a/lib/html_to_markdown.rb +++ b/lib/html_to_markdown.rb @@ -8,10 +8,22 @@ class HtmlToMarkdown def initialize(html, opts={}) @opts = opts || {} - @doc = Nokogiri::HTML(html) + @doc = fix_span_elements(Nokogiri::HTML(html)) + remove_whitespaces! end + + # If a `<div>` is within a `<span>` that's invalid, so let's hoist the `<div>` up + def fix_span_elements(node) + if node.name == 'span' && node.at('div') + node.swap(node.children) + end + + node.children.each {|c| fix_span_elements(c)} + node + end + def remove_whitespaces! @doc.traverse do |node| if node.is_a? Nokogiri::XML::Text @@ -33,7 +45,7 @@ class HtmlToMarkdown end def traverse(node) - node.children.each { |node| visit(node) } + node.children.each { |n| visit(n) } end def visit(node) @@ -197,6 +209,7 @@ class HtmlToMarkdown end def format_block + lines = @stack[-1].markdown.each_line.map do |line| prefix = @stack.map { |b| b.opened ? b.body : b.head }.join @stack.each { |b| b.opened = true } diff --git a/spec/components/html_to_markdown_spec.rb b/spec/components/html_to_markdown_spec.rb index 8a4a8077f8a..3d77cc85e40 100644 --- a/spec/components/html_to_markdown_spec.rb +++ b/spec/components/html_to_markdown_spec.rb @@ -219,4 +219,9 @@ describe HtmlToMarkdown do expect(html_to_markdown("")).to eq("") end + it "handles divs within spans" do + html = "
1st paragraph
2nd paragraph
" + expect(html_to_markdown(html)).to eq("1st paragraph\n2nd paragraph") + end + end