From 52e81582b4b0735b082c68979106c8b4b12458c7 Mon Sep 17 00:00:00 2001 From: Gerhard Schlager Date: Sun, 9 Jun 2024 13:11:37 +0200 Subject: [PATCH] FEATURE: Use basic HTML table if it can't be converted to Markdown Previously `HtmlToMarkdown` always converted HTML tables into Markdown tables. That led to some badly formatted Markdown tables, e.g. when the table contained `rowspan` or `colspan`. This solves the issue by using very basic HTML tables in those cases. --- lib/html_to_markdown.rb | 57 ++++++++++-- spec/lib/html_to_markdown_spec.rb | 150 +++++++++++++++++++++++++++++- 2 files changed, 194 insertions(+), 13 deletions(-) diff --git a/lib/html_to_markdown.rb b/lib/html_to_markdown.rb index 3052cedd318..eff00c66341 100644 --- a/lib/html_to_markdown.rb +++ b/lib/html_to_markdown.rb @@ -5,6 +5,7 @@ require "securerandom" class HtmlToMarkdown def initialize(html, opts = {}) @opts = opts + @within_html_block = false # we're only interested in @doc = Nokogiri.HTML5(html).at("body") @@ -139,8 +140,16 @@ class HtmlToMarkdown end end - def traverse(node) - node.children.map { |n| visit(n) }.join + def traverse(node, within_html_block: false) + within_html_block_changed = false + if within_html_block + within_html_block_changed = true + @within_html_block = true + end + + text = node.children.map { |n| visit(n) }.join + @within_html_block = false if within_html_block_changed + text end def visit(node) @@ -251,11 +260,8 @@ class HtmlToMarkdown (1..6).each { |n| define_method("visit_h#{n}") { |node| "#{"#" * n} #{traverse(node)}" } } - CELLS ||= %w[th td] - CELLS.each { |tag| define_method("visit_#{tag}") { |node| "#{traverse(node)} " } } - def visit_table(node) - if rows = extract_rows(node) + if (rows = extract_rows(node)) headers = rows[0].css("td, th") text = "| " + headers.map { |td| traverse(td).gsub(/\n/, "
") }.join(" | ") + " |\n" text << "| " + (["-"] * headers.size).join(" | ") + " |\n" @@ -265,7 +271,7 @@ class HtmlToMarkdown end "\n\n#{text}\n\n" else - traverse(node) + "\n#{traverse(node, within_html_block: true)}
" end end @@ -277,6 +283,27 @@ class HtmlToMarkdown rows end + def visit_tr(node) + text = traverse(node) + @within_html_block ? "\n#{text}\n" : text + end + + TABLE_CELLS ||= %w[th td] + TABLE_CELLS.each do |tag| + define_method("visit_#{tag}") do |node| + text = traverse(node) + if @within_html_block + element = create_element(tag, "\n\n#{text}\n\n") + node.attribute_nodes.each do |a| + element[a.name] = a.value if %w[rowspan colspan].include?(a.name) + end + "#{element.to_html}\n" + else + text + end + end + end + LISTS ||= %w[ul ol] LISTS.each do |tag| define_method("visit_#{tag}") do |node| @@ -352,7 +379,11 @@ class HtmlToMarkdown end def visit_text(node) - node.text + if @within_html_block + node.to_html + else + node.text + end end HTML5_BLOCK_ELEMENTS ||= %w[ @@ -372,4 +403,14 @@ class HtmlToMarkdown return false if !node node.description&.block? || HTML5_BLOCK_ELEMENTS.include?(node.name) end + + def fragment_document + @fragment_document ||= Nokogiri::HTML5::DocumentFragment.parse("").document + end + + def create_element(tag, inner_html = nil, attributes = {}) + element = fragment_document.create_element(tag, nil, attributes) + element.inner_html = inner_html if inner_html + element + end end diff --git a/spec/lib/html_to_markdown_spec.rb b/spec/lib/html_to_markdown_spec.rb index 821d2ad1f15..51aac6c3659 100644 --- a/spec/lib/html_to_markdown_spec.rb +++ b/spec/lib/html_to_markdown_spec.rb @@ -544,7 +544,7 @@ RSpec.describe HtmlToMarkdown do ) end - it "doesn't swallow badly formatted " do + it "keeps HTML for badly formatted
" do html = <<~HTML
@@ -554,13 +554,153 @@ RSpec.describe HtmlToMarkdown do - - - + + +
4
OneTwoThree<One>TwoThree
HTML - expect(html_to_markdown(html)).to eq("1 2 3 4 \nOne Two Three") + markdown = <<~MD + + + + + + + + + + + + +
+ + 1 + + + + 2 + + + + 3 + + + + 4 + +
+ + <One> + + + + **Two** + + + + Three + +
+ MD + + expect(html_to_markdown(html)).to eq(markdown.strip) + end + + it "keeps HTML for with colspan" do + html = <<~HTML +
+ + + + + + + +
12
One / Two
+ HTML + + markdown = <<~MD + + + + + + + + +
+ + 1 + + + + 2 + +
+ + One / Two + +
+ MD + + expect(html_to_markdown(html)).to eq(markdown.strip) + end + + it "keeps HTML for with rowspan" do + html = <<~HTML +
+ + + + + + + + + + + +
12
AB
C
+ HTML + + markdown = <<~MD + + + + + + + + + + + + +
+ + 1 + + + + 2 + +
+ + A + + + + B + +
+ + C + +
+ MD + + expect(html_to_markdown(html)).to eq(markdown.strip) end end