FIX: Extract div tags within spans

2024-11-26 14:03:39 +08:00 · 2017-05-09 12:33:54 -04:00 · 2017-05-09 12:33:54 -04:00 · b57b635d30
commit b57b635d30
parent c2829dce22
2 changed files with 20 additions and 2 deletions
--- a/lib/html_to_markdown.rb
+++ b/lib/html_to_markdown.rb
@ -8,10 +8,22 @@ class HtmlToMarkdown

  def initialize(html, opts={})
    @opts = opts || {}
-    @doc = Nokogiri::HTML(html)
+    @doc = fix_span_elements(Nokogiri::HTML(html))
+
    remove_whitespaces!
  end

+
+  # A `<div>` nested inside a `<span>` is invalid HTML, so replace the `<span>` with its children to hoist the `<div>` up
+  def fix_span_elements(node)
+    if node.name == 'span' && node.at('div')
+      node.swap(node.children)
+    end
+
+    node.children.each {|c| fix_span_elements(c)}
+    node
+  end
+
  def remove_whitespaces!
    @doc.traverse do |node|
      if node.is_a? Nokogiri::XML::Text
@ -33,7 +45,7 @@ class HtmlToMarkdown
  end

  def traverse(node)
-    node.children.each { |node| visit(node) }
+    node.children.each { |n| visit(n) }
  end

  def visit(node)
@ -197,6 +209,7 @@ class HtmlToMarkdown
  end

  def format_block
+
    lines = @stack[-1].markdown.each_line.map do |line|
      prefix = @stack.map { |b| b.opened ? b.body : b.head }.join
      @stack.each { |b| b.opened = true }
--- a/spec/components/html_to_markdown_spec.rb
+++ b/spec/components/html_to_markdown_spec.rb
@ -219,4 +219,9 @@ describe HtmlToMarkdown do
    expect(html_to_markdown("<style>* { margin: 0 }</style>")).to eq("")
  end

+  it "handles divs within spans" do
+    html = "<div>1st paragraph<span><div>2nd paragraph</div></span></div>"
+    expect(html_to_markdown(html)).to eq("1st paragraph\n2nd paragraph")
+  end
+
 end