mirror of
https://github.com/discourse/discourse.git
synced 2024-11-27 21:43:45 +08:00
575bc4af73
We were having a minor issue with emails containing embedded images whose alt string had newlines in it; for example:

```
<p class="MsoNormal"><span style="font-size:11.0pt"><img width="898" height="498" style="width:9.3541in;height:5.1875in" id="Picture_x0020_5" src="cid:image003.png@01DA4EBA.0400B610" alt="A screenshot of a computer program
Description automatically generated"></span><span style="font-size:11.0pt"><o:p></o:p></span></p>
```

Once this was parsed and converted to markdown (or directly to HTML in some cases), it caused an issue in the composer and the post UI: the markdown parser didn't know how to deal with the newline, so the raw HTML was shown instead of an image. The easiest way to deal with this is to strip \n from image alt and title attrs in the HTMLToMarkdown class.
376 lines
9.2 KiB
Ruby
376 lines
9.2 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
require "securerandom"
|
|
|
|
# Converts an HTML string into Markdown.
#
# The constructor parses the HTML, keeps only the <body>, removes every node
# that has no matching private `visit_<node name>` method, removes hidden
# elements and normalises whitespace. `to_markdown` then walks the cleaned
# tree and concatenates what each visitor returns.
#
# Options read from `opts`:
#   :keep_img_tags - emit every <img> (with a src) as raw HTML
#   :keep_cid_imgs - emit <img> whose src starts with "cid:" as raw HTML
class HtmlToMarkdown
  # @param html [String] the HTML to convert
  # @param opts [Hash] see the class comment for the keys that are read
  def initialize(html, opts = {})
    @opts = opts

    # we're only interested in <body>
    @doc = Nokogiri.HTML5(html).at("body")

    remove_not_allowed!(@doc)
    remove_hidden!(@doc)
    hoist_line_breaks!(@doc)
    remove_whitespaces!(@doc)
  end

  # @return [String] the Markdown for the document, with runs of blank lines
  #   collapsed to a single blank line and surrounding whitespace stripped
  def to_markdown
    traverse(@doc).gsub(/\n{2,}/, "\n\n").strip
  end

  private

  # Replaces every newline with a space and collapses runs of spaces.
  def strip_newlines(string)
    string.tr("\n", " ").squeeze(" ")
  end

  # Removes every node whose name has no `visit_<name>` private method.
  #
  # NOTE: the allow-list is derived from method names, so any private method
  # named `visit_*` in this class allows the corresponding node name.
  def remove_not_allowed!(doc)
    allowed = Set.new

    HtmlToMarkdown.private_instance_methods.each do |m|
      if tag = m.to_s[/^visit_(.+)/, 1]
        allowed << tag
      end
    end

    doc.traverse { |node| node.remove if !allowed.include?(node.name) }
  end

  # Removes elements carrying a `hidden` attribute and images whose width or
  # height attribute converts (via `to_i`) to zero or less.
  def remove_hidden!(doc)
    doc.css("[hidden]").remove
    doc.css("img[width]").each { |n| n.remove if n["width"].to_i <= 0 }
    doc.css("img[height]").each { |n| n.remove if n["height"].to_i <= 0 }
  end

  # When there's a <br> inside an inline element, split the inline element around the <br>
  def hoist_line_breaks!(doc)
    # Tag every <br> with a random class so unprocessed ones can be selected.
    klass = "_" + SecureRandom.hex
    doc.css("br").each { |br| br.add_class(klass) }

    # Each pass lifts a <br> one level; repeat until no <br> moved.
    loop do
      changed = false

      doc
        .css("br.#{klass}")
        .each do |br|
          parent = br.parent

          if block?(parent)
            # The <br> already sits directly in a block: nothing left to do.
            br.remove_class(klass)
          else
            # `before` ends with the <br> itself; `after` is nil when the
            # <br> is the last child.
            before, after = parent.children.slice_when { |n| n == br }.to_a

            if before.size > 1
              b = doc.document.create_element(parent.name)
              before[0...-1].each { |c| b.add_child(c) }
              parent.previous = b if b.inner_html.present?
            end

            if after.present?
              a = doc.document.create_element(parent.name)
              after.each { |c| a.add_child(c) }
              parent.next = a if a.inner_html.present?
            end

            parent.replace(br)

            changed = true
          end
        end

      break if !changed
    end
  end

  # Removes most of the unnecessary white spaces for better markdown conversion
  # Loosely based on the CSS' White Space Processing Rules (https://www.w3.org/TR/css-text-3/#white-space-rules)
  def remove_whitespaces!(node)
    # Content of <pre> is left untouched.
    return true if "pre" == node.name

    node
      .children
      .chunk { |n| is_inline?(n) }
      .each do |inline, nodes|
        if inline
          # Only drop the trailing space when the run actually ended on one.
          collapse_spaces!(nodes) && remove_trailing_space!(nodes)
        else
          nodes.each { |n| remove_whitespaces!(n) }
        end
      end
  end

  # A node is inline when it is text, or a non-<br> element described as
  # inline whose children are all inline themselves.
  def is_inline?(node)
    node.text? ||
      ("br" != node.name && node.description&.inline? && node.children.all? { |n| is_inline?(n) })
  end

  # Collapses every run of whitespace inside `nodes` into a single space,
  # carrying the "previous character was a space" state across nodes.
  #
  # @param was_space [Boolean] whether the text preceding `nodes` ended with
  #   whitespace; defaults to true so leading whitespace is dropped
  # @return [Boolean] whether the processed text ended with whitespace
  def collapse_spaces!(nodes, was_space = true)
    nodes.each do |node|
      if node.text?
        text = String.new

        node.text.each_char do |c|
          if c[/[[:space:]]/]
            text << " " if !was_space
            was_space = true
          else
            text << c
            was_space = false
          end
        end

        node.content = text
      else
        node.children.each { |n| was_space = collapse_spaces!([n], was_space) }
      end
    end

    was_space
  end

  # Removes a single trailing space from the last text node reachable by
  # following the last child of `nodes`.
  def remove_trailing_space!(nodes)
    last = nodes[-1]

    if last.text?
      last.content = last.content[0...-1] if last.content[-1] == " "
    elsif last.children.present?
      remove_trailing_space!(last.children)
    end
  end

  # Concatenates the Markdown of every child of `node`.
  def traverse(node)
    node.children.map { |n| visit(n) }.join
  end

  # Dispatches to `visit_<node name>`; returns nil when no visitor exists.
  def visit(node)
    visitor = "visit_#{node.name}"
    send(visitor, node) if respond_to?(visitor, true)
  end

  ALLOWED_IMG_SRCS ||= %w[http:// https:// www.]

  # Prefixes a link's href must start with to be rendered as a link: the
  # schemes from the `allowed_href_schemes` site setting, the allowed image
  # prefixes and "mailto:". Memoized per instance.
  def allowed_hrefs
    @allowed_hrefs ||=
      begin
        hrefs = SiteSetting.allowed_href_schemes.split("|").map { |scheme| "#{scheme}:" }.to_set
        ALLOWED_IMG_SRCS.each { |src| hrefs << src }
        hrefs << "mailto:"
        hrefs.to_a
      end
  end

  # Renders a Markdown link when the href is allowed, otherwise only the
  # link's content.
  def visit_a(node)
    if node["href"].present? && node["href"].start_with?(*allowed_hrefs)
      "[#{traverse(node)}](#{node["href"]})"
    else
      traverse(node)
    end
  end

  # Renders an image. Returns nil when there is no src, or when the src is
  # neither kept as HTML (see options) nor starts with an allowed prefix.
  def visit_img(node)
    return if node["src"].blank?

    # Newlines in alt/title break both the Markdown and the raw HTML output.
    node["alt"] = strip_newlines(node["alt"]) if node["alt"].present?
    node["title"] = strip_newlines(node["title"]) if node["title"].present?

    if @opts[:keep_img_tags]
      node.to_html
    elsif @opts[:keep_cid_imgs] && node["src"].start_with?("cid:")
      node.to_html
    elsif node["src"].start_with?(*ALLOWED_IMG_SRCS)
      width = node["width"].to_i
      height = node["height"].to_i
      dimensions = "|#{width}x#{height}" if width > 0 && height > 0
      # `presence` so that an empty alt="" still falls back to the title.
      "![#{node["alt"].presence || node["title"]}#{dimensions}](#{node["src"]})"
    end
  end

  # Tags that are emitted as-is (without attributes) around their content.
  ALLOWED ||= %w[kbd del ins small big sub sup dl dd dt mark]
  ALLOWED.each do |tag|
    define_method("visit_#{tag}") { |node| "<#{tag}>#{traverse(node)}</#{tag}>" }
  end

  # Prefixes every line of the quoted content with "> ".
  def visit_blockquote(node)
    text = traverse(node)
    text.strip!
    text.gsub!(/\n{2,}/, "\n\n")
    text.gsub!(/^/, "> ")
    "\n\n#{text}\n\n"
  end

  # Rendered on their own line; the leading newline is skipped when the
  # previous element is already a block.
  BLOCKS ||= %w[div tr]
  BLOCKS.each do |tag|
    define_method("visit_#{tag}") do |node|
      prefix = block?(node.previous_element) ? "" : "\n"
      "#{prefix}#{traverse(node)}\n"
    end
  end

  def visit_p(node)
    "\n\n#{traverse(node)}\n\n"
  end

  # Tags that are dropped while their content is kept.
  # (`tfoot` is the HTML table-footer element; there is no `tfooter` tag.)
  TRAVERSABLES ||= %w[aside font span thead tbody tfoot u]
  TRAVERSABLES.each { |tag| define_method("visit_#{tag}") { |node| traverse(node) } }

  def visit_tt(node)
    "`#{traverse(node)}`"
  end

  # <code> inside a <pre> is handled by the fence of `visit_pre`.
  def visit_code(node)
    node.ancestors("pre").present? ? traverse(node) : visit_tt(node)
  end

  # Renders a fenced code block, using "~~~" when the content contains a
  # backtick. The language comes from a `lang-xxx` class on the first <code>.
  def visit_pre(node)
    text = traverse(node)
    fence = text["`"] ? "~~~" : "```"
    code = node.at("code")
    code_class = code ? code["class"] : ""
    lang = code_class ? code_class[/lang-(\w+)/, 1] : ""
    "\n\n#{fence}#{lang}\n#{text}\n#{fence}\n\n"
  end

  def visit_br(node)
    "\n"
  end

  def visit_hr(node)
    "\n\n---\n\n"
  end

  # NOTE(review): the title is interpolated without HTML escaping; a title
  # containing a double quote produces a malformed attribute - confirm that
  # downstream sanitisation covers this.
  def visit_abbr(node)
    title = node["title"].presence
    title_attr = title ? %[ title="#{title}"] : ""
    "<abbr#{title_attr}>#{traverse(node)}</abbr>"
  end

  def visit_acronym(node)
    visit_abbr(node)
  end

  (1..6).each { |n| define_method("visit_h#{n}") { |node| "#{"#" * n} #{traverse(node)}" } }

  CELLS ||= %w[th td]
  CELLS.each { |tag| define_method("visit_#{tag}") { |node| "#{traverse(node)} " } }

  # Renders a Markdown table when `extract_rows` accepts the table,
  # otherwise just the table's content.
  def visit_table(node)
    if rows = extract_rows(node)
      headers = rows[0].css("td, th")
      text = table_row(headers)
      text << "| " + (["-"] * headers.size).join(" | ") + " |\n"
      rows[1..-1].each { |row| text << table_row(row.css("td")) }
      "\n\n#{text}\n\n"
    else
      traverse(node)
    end
  end

  # Renders one Markdown table line; newlines inside a cell become <br>.
  def table_row(cells)
    "| " + cells.map { |cell| traverse(cell).gsub(/\n/, "<br>") }.join(" | ") + " |\n"
  end

  # Returns the table's rows, or nil when the table is nested in another
  # table, has no rows, or has a row whose <td> count differs from the
  # number of cells in the first row.
  def extract_rows(table)
    return if table.ancestors("table").present?
    return if (rows = table.css("tr")).empty?
    headers_count = rows[0].css("td, th").size
    return if rows[1..-1].any? { |row| row.css("td").size != headers_count }
    rows
  end

  LISTS ||= %w[ul ol]
  LISTS.each do |tag|
    define_method("visit_#{tag}") do |node|
      prefix = block?(node.previous_element) ? "" : "\n"
      # Nested lists do not add a trailing newline of their own.
      suffix = node.ancestors("ul, ol, li").size > 0 ? "" : "\n"
      "#{prefix}#{traverse(node)}#{suffix}"
    end
  end

  # Renders a list item; the marker depends on the closest list ancestor and
  # continuation lines are indented by marker width times nesting depth.
  def visit_li(node)
    text = traverse(node)

    lists = node.ancestors("ul, ol")
    marker = "ol" == lists[0]&.name ? "1. " : "- "
    indent = (" " * marker.size) * [1, lists.size].max
    suffix = node == node.parent.elements[-1] ? "" : "\n"

    text.gsub!(/\n{2,}/, "\n\n")
    text.gsub!(/^(?!\s*$)/, indent) # indent non-blank lines only
    text.lstrip!

    "#{marker}#{text}#{suffix}"
  end

  # Shared rendering for emphasis-like inline tags.
  #
  # @param tag [String] tag name used for the raw-HTML fallback
  # @param text [String] already converted content of the element
  # @param marker [String] Markdown delimiter placed on both sides
  # @param force_html [Object] truthy when the content clashes with the
  #   Markdown delimiters and raw HTML must be emitted instead
  # @return [String]
  def wrap_inline(tag, text, marker, force_html)
    return "" if text.empty?
    return " " if text.blank?
    return "<#{tag}>#{text}</#{tag}>" if text["\n"] || force_html

    # Keep one leading/trailing space outside the delimiters.
    prefix = text[0][" "]
    suffix = text[-1][" "] if text.size > 1

    "#{prefix}#{marker}#{text.strip}#{marker}#{suffix}"
  end

  EMPHASES ||= %w[i em]
  EMPHASES.each do |tag|
    define_method("visit_#{tag}") do |node|
      text = traverse(node)
      wrap_inline(tag, text, text["*"] ? "_" : "*", text["*"] && text["_"])
    end
  end

  STRONGS ||= %w[b strong]
  STRONGS.each do |tag|
    define_method("visit_#{tag}") do |node|
      text = traverse(node)
      wrap_inline(tag, text, text["*"] ? "__" : "**", text["*"] && text["_"])
    end
  end

  STRIKES ||= %w[s strike]
  STRIKES.each do |tag|
    define_method("visit_#{tag}") do |node|
      text = traverse(node)
      wrap_inline(tag, text, "~~", text["~~"])
    end
  end

  def visit_text(node)
    node.text
  end

  # Element names treated as blocks in addition to those whose node
  # description reports `block?`.
  HTML5_BLOCK_ELEMENTS ||= %w[
    article
    aside
    details
    dialog
    figcaption
    figure
    footer
    header
    main
    nav
    section
  ]

  # @return [Boolean] false for nil; otherwise whether the node is a block
  def block?(node)
    return false if !node
    node.description&.block? || HTML5_BLOCK_ELEMENTS.include?(node.name)
  end
end
|