FEATURE: Use basic HTML table if it can't be converted to Markdown

Previously `HtmlToMarkdown` always converted HTML tables into Markdown tables. That led to some badly formatted Markdown tables, e.g. when the table contained `rowspan` or `colspan`. This solves the issue by using very basic HTML tables in those cases.
This commit is contained in:
Gerhard Schlager 2024-06-09 13:11:37 +02:00 committed by Gerhard Schlager
parent b8f2cbf41c
commit 52e81582b4
2 changed files with 194 additions and 13 deletions

View File

@ -5,6 +5,7 @@ require "securerandom"
class HtmlToMarkdown
def initialize(html, opts = {})
@opts = opts
@within_html_block = false
# we're only interested in <body>
@doc = Nokogiri.HTML5(html).at("body")
@ -139,8 +140,16 @@ class HtmlToMarkdown
end
end
def traverse(node)
node.children.map { |n| visit(n) }.join
# Visits every child of +node+ and returns the concatenated results.
#
# node              - the node whose children are passed to #visit.
# within_html_block - when true, @within_html_block is set to true while the
#                     children are visited, so visitors that check the flag
#                     emit raw HTML instead of Markdown.
#
# Returns the joined String produced by the child visitors.
def traverse(node, within_html_block: false)
  return node.children.map { |n| visit(n) }.join unless within_html_block

  previous_state = @within_html_block
  @within_html_block = true

  begin
    node.children.map { |n| visit(n) }.join
  ensure
    # Restore the previous value instead of resetting to false: an HTML block
    # nested inside another HTML block must not switch HTML mode off for the
    # remaining siblings of the outer block. `ensure` also keeps the flag
    # from leaking when a visitor raises.
    @within_html_block = previous_state
  end
end
def visit(node)
@ -251,11 +260,8 @@ class HtmlToMarkdown
(1..6).each { |n| define_method("visit_h#{n}") { |node| "#{"#" * n} #{traverse(node)}" } }
CELLS ||= %w[th td]
CELLS.each { |tag| define_method("visit_#{tag}") { |node| "#{traverse(node)} " } }
def visit_table(node)
if rows = extract_rows(node)
if (rows = extract_rows(node))
headers = rows[0].css("td, th")
text = "| " + headers.map { |td| traverse(td).gsub(/\n/, "<br>") }.join(" | ") + " |\n"
text << "| " + (["-"] * headers.size).join(" | ") + " |\n"
@ -265,7 +271,7 @@ class HtmlToMarkdown
end
"\n\n#{text}\n\n"
else
traverse(node)
"<table>\n#{traverse(node, within_html_block: true)}</table>"
end
end
@ -277,6 +283,27 @@ class HtmlToMarkdown
rows
end
# Converts a table row. The traversed cell content is wrapped in <tr> tags
# while an HTML block is being emitted; otherwise it is returned unchanged.
def visit_tr(node)
  row_content = traverse(node)
  return row_content unless @within_html_block

  "<tr>\n#{row_content}</tr>\n"
end
TABLE_CELLS ||= %w[th td]

# Defines visit_th and visit_td. Outside an HTML block the traversed cell
# content is returned as is. Inside an HTML block the content is wrapped in a
# fresh <th>/<td> element that carries over only the rowspan and colspan
# attributes of the original cell.
TABLE_CELLS.each do |cell_tag|
  define_method("visit_#{cell_tag}") do |node|
    content = traverse(node)
    next content unless @within_html_block

    cell = create_element(cell_tag, "\n\n#{content}\n\n")
    node.attribute_nodes.each do |attribute|
      next unless %w[rowspan colspan].include?(attribute.name)

      cell[attribute.name] = attribute.value
    end

    "#{cell.to_html}\n"
  end
end
LISTS ||= %w[ul ol]
LISTS.each do |tag|
define_method("visit_#{tag}") do |node|
@ -352,7 +379,11 @@ class HtmlToMarkdown
end
def visit_text(node)
node.text
if @within_html_block
node.to_html
else
node.text
end
end
HTML5_BLOCK_ELEMENTS ||= %w[
@ -372,4 +403,14 @@ class HtmlToMarkdown
return false if !node
node.description&.block? || HTML5_BLOCK_ELEMENTS.include?(node.name)
end
# Returns the document of an empty HTML5 fragment. The document is created on
# first use and memoized in @fragment_document.
def fragment_document
  return @fragment_document if @fragment_document

  @fragment_document = Nokogiri::HTML5::DocumentFragment.parse("").document
end
# Builds a new element in the shared fragment document.
#
# tag        - name of the element to create.
# inner_html - optional markup assigned as the element's inner HTML; skipped
#              when nil.
# attributes - attributes forwarded to the document's create_element.
#
# Returns the created element.
def create_element(tag, inner_html = nil, attributes = {})
  fragment_document
    .create_element(tag, nil, attributes)
    .tap { |new_element| new_element.inner_html = inner_html if inner_html }
end
end

View File

@ -544,7 +544,7 @@ RSpec.describe HtmlToMarkdown do
)
end
it "doesn't swallow badly formatted <table>" do
it "keeps HTML for badly formatted <table>" do
html = <<~HTML
<table>
<tr>
@ -554,13 +554,153 @@ RSpec.describe HtmlToMarkdown do
<th>4</th>
</tr>
<tr>
<td>One</td>
<td>Two</td>
<td>Three</td>
<td>&lt;One&gt;</td>
<td><strong>Two</strong></td>
<td>Three<script>alert("foo")</script></td>
</tr>
</table>
HTML
expect(html_to_markdown(html)).to eq("1 2 3 4 \nOne Two Three")
markdown = <<~MD
<table>
<tr>
<th>
1
</th>
<th>
2
</th>
<th>
3
</th>
<th>
4
</th>
</tr>
<tr>
<td>
&lt;One&gt;
</td>
<td>
**Two**
</td>
<td>
Three
</td>
</tr>
</table>
MD
expect(html_to_markdown(html)).to eq(markdown.strip)
end
# A cell with a colspan attribute: the table is expected to stay an HTML
# table, with the colspan attribute preserved on the cell.
it "keeps HTML for <table> with colspan" do
html = <<~HTML
<table>
<tr>
<th>1</th>
<th>2</th>
</tr>
<tr>
<td colspan="2">One / Two</td>
</tr>
</table>
HTML
markdown = <<~MD
<table>
<tr>
<th>
1
</th>
<th>
2
</th>
</tr>
<tr>
<td colspan="2">
One / Two
</td>
</tr>
</table>
MD
# The heredoc ends with a newline, so it is stripped before comparing.
expect(html_to_markdown(html)).to eq(markdown.strip)
end
# A cell with a rowspan attribute: the table is expected to stay an HTML
# table, with the rowspan attribute preserved on the cell.
it "keeps HTML for <table> with rowspan" do
html = <<~HTML
<table>
<tr>
<th>1</th>
<th>2</th>
</tr>
<tr>
<td>A</td>
<td rowspan="2">B</td>
</tr>
<tr>
<td>C</td>
</tr>
</table>
HTML
markdown = <<~MD
<table>
<tr>
<th>
1
</th>
<th>
2
</th>
</tr>
<tr>
<td>
A
</td>
<td rowspan="2">
B
</td>
</tr>
<tr>
<td>
C
</td>
</tr>
</table>
MD
# The heredoc ends with a newline, so it is stripped before comparing.
expect(html_to_markdown(html)).to eq(markdown.strip)
end
end