From c880af81202382bfe7a256f42c693b22cea47f55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9gis=20Hanol?= Date: Wed, 3 May 2017 18:04:31 +0200 Subject: [PATCH] FIX: properly trim whitespaces (including those pesky &nbsp; html entities) --- lib/html_to_markdown.rb | 8 ++++---- spec/components/html_to_markdown_spec.rb | 14 ++++++++++++++ 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/lib/html_to_markdown.rb b/lib/html_to_markdown.rb index 9e18269c664..60f88d9095f 100644 --- a/lib/html_to_markdown.rb +++ b/lib/html_to_markdown.rb @@ -15,10 +15,10 @@ class HtmlToMarkdown def remove_whitespaces! @doc.traverse do |node| if node.is_a? Nokogiri::XML::Text - node.content = node.content.lstrip if node.previous_element&.description&.block? - node.content = node.content.lstrip if node.previous_element.nil? && node.parent.description&.block? - node.content = node.content.rstrip if node.next_element&.description&.block? - node.content = node.content.rstrip if node.next_element.nil? && node.parent.description&.block? + node.content = node.content.gsub(/\A[[:space:]]+/, "") if node.previous_element&.description&.block? + node.content = node.content.gsub(/\A[[:space:]]+/, "") if node.previous_element.nil? && node.parent.description&.block? + node.content = node.content.gsub(/[[:space:]]+\z/, "") if node.next_element&.description&.block? + node.content = node.content.gsub(/[[:space:]]+\z/, "") if node.next_element.nil? && node.parent.description&.block? node.remove if node.content.empty? end end diff --git a/spec/components/html_to_markdown_spec.rb b/spec/components/html_to_markdown_spec.rb index 43651536e31..e5a9c5b8e2c 100644 --- a/spec/components/html_to_markdown_spec.rb +++ b/spec/components/html_to_markdown_spec.rb @@ -7,6 +7,20 @@ describe HtmlToMarkdown do HtmlToMarkdown.new(html).to_markdown end + it "remove whitespaces" do + expect(html_to_markdown(<<-HTML +
Hello, +

+
    This is the 1st paragraph.   
+

+
+         This is another paragraph +
+
+ HTML + )).to eq("Hello,\n\nThis is the 1st paragraph.\n\nThis is another paragraph") + end + + it "converts <strong>" do expect(html_to_markdown("<strong>Strong</strong>")).to eq("**Strong**") expect(html_to_markdown("<strong>Str*ng</strong>")).to eq("__Str*ng__")