From b57b635d302c87dc2172d4fe688de492d2a37d2a Mon Sep 17 00:00:00 2001
From: Robin Ward <robin.ward@gmail.com>
Date: Tue, 9 May 2017 12:33:54 -0400
Subject: [PATCH] FIX: Extract `div` tags within `span`s

---
 lib/html_to_markdown.rb                  | 17 +++++++++++++++--
 spec/components/html_to_markdown_spec.rb |  5 +++++
 2 files changed, 20 insertions(+), 2 deletions(-)
diff --git a/lib/html_to_markdown.rb b/lib/html_to_markdown.rb
index 5cfbc9b6495..2044d5e37f4 100644
--- a/lib/html_to_markdown.rb
+++ b/lib/html_to_markdown.rb
@@ -8,10 +8,22 @@ class HtmlToMarkdown
 
   def initialize(html, opts={})
     @opts = opts || {}
-    @doc = Nokogiri::HTML(html)
+    @doc = fix_span_elements(Nokogiri::HTML(html))
+
     remove_whitespaces!
   end
 
+
+  # A `<div>` nested in a `<span>` is invalid HTML: replace each such span with
+  # its children. They are captured before the swap, which detaches `node`.
+  def fix_span_elements(node)
+    children = node.children
+    node.swap(children) if node.name == 'span' && node.at('div')
+
+    children.each { |c| fix_span_elements(c) }
+    node
+  end
+
   def remove_whitespaces!
     @doc.traverse do |node|
       if node.is_a? Nokogiri::XML::Text
@@ -33,7 +45,7 @@ class HtmlToMarkdown
   end
 
   def traverse(node)
-    node.children.each { |node| visit(node) }
+    node.children.each { |n| visit(n) }
   end
 
   def visit(node)
@@ -197,6 +209,7 @@ class HtmlToMarkdown
   end
 
   def format_block
+
     lines = @stack[-1].markdown.each_line.map do |line|
       prefix = @stack.map { |b| b.opened ? b.body : b.head }.join
       @stack.each { |b| b.opened = true }
diff --git a/spec/components/html_to_markdown_spec.rb b/spec/components/html_to_markdown_spec.rb
index 8a4a8077f8a..3d77cc85e40 100644
--- a/spec/components/html_to_markdown_spec.rb
+++ b/spec/components/html_to_markdown_spec.rb
@@ -219,4 +219,9 @@ describe HtmlToMarkdown do
     expect(html_to_markdown("<style>* { margin: 0 }</style>")).to eq("")
   end
 
+  it "handles divs within spans" do
+    markdown = html_to_markdown("<div>1st paragraph<span><div>2nd paragraph</div></span></div>")
+    expect(markdown).to eq("1st paragraph\n2nd paragraph")
+  end
+
 end