discourse/lib/plain_text_to_markdown.rb
Sam Saffron 6300d978e2 FIX: correctly escape regex url
duplicate link detection could fail in certain cases
2019-06-04 16:21:03 +10:00

194 lines
4.9 KiB
Ruby

# frozen_string_literal: true
# Converts a plain text document (e.g. the text part of an email) into
# Markdown: quote indicators are normalised, fenced code blocks are passed
# through untouched, duplicated links are collapsed, Markdown/HTML special
# characters are escaped and leading indentation is preserved with
# non-breaking spaces.
#
# Relies on ActiveSupport's String#blank? being available.
class PlainTextToMarkdown
  # A line consisting of exactly this text is never merged with its
  # neighbours when flowed lines are joined.
  SIGNATURE_SEPARATOR ||= "-- ".freeze

  # Matches http(s) and www-style URLs; the single capture group spans the whole URL.
  URL_REGEX ||= /((?:https?:(?:\/{1,3}|[a-z0-9%])|www\d{0,3}[.])(?:[^\s()<>]+|\([^\s()<>]+\))+(?:\([^\s()<>]+\)|[^`!()\[\]{};:'".,<>?«»“”‘’\s]))/i

  # @param plaintext [String] the text to convert
  # @param opts [Hash]
  # @option opts [Boolean] :format_flowed join a line onto the previous one
  #   when the previous one ends with a space (names suggest RFC 3676
  #   format=flowed -- presumably that is the intent)
  # @option opts [Boolean] :delete_flowed_space drop that trailing space
  #   when joining flowed lines
  def initialize(plaintext, opts = {})
    @plaintext = plaintext
    @lines = []
    @format_flowed = opts[:format_flowed] || false
    @delete_flowed_space = opts[:delete_flowed_space] || false
  end

  # Runs the conversion. Safe to call more than once on the same instance.
  #
  # @return [String] the Markdown, without trailing whitespace
  def to_markdown
    prepare_lines
    classify_lines

    markdown = +""
    last_quote_level = 0
    last_line_blank = false

    @lines.each do |line|
      current_line_blank = line.text.blank?

      # consecutive blank lines are collapsed into a single one
      unless last_line_blank && current_line_blank
        markdown << line_prefix(line, last_quote_level, current_line_blank)
        markdown << convert_text(line) << "\n"
      end

      last_line_blank = current_line_blank
      last_quote_level = line.quote_level
    end

    markdown.rstrip
  end

  private

  # A fenced code block, identified by the lines holding its opening and
  # closing fence. It only counts as a code block once both are known.
  class CodeBlock < Struct.new(:start_line, :end_line)
    def initialize(start_line, end_line = nil)
      super
    end

    # @return [Boolean] true when both the opening and closing fence were found
    def valid?
      !start_line.nil? && !end_line.nil?
    end
  end

  # One logical line of the input together with its quote depth and the
  # code block (if any) it belongs to.
  class Line < Struct.new(:text, :quote_level, :code_block)
    def initialize(text, quote_level = 0, code_block = nil)
      super
    end

    # @return [Boolean, nil] truthy when the line is part of a closed code block
    def valid_code_block?
      code_block&.valid?
    end
  end

  # Builds what has to be emitted in front of a line's text: the quote
  # indicators for quoted lines, or the separator needed after a quote.
  #
  # @param line [Line]
  # @param last_quote_level [Integer] quote level of the preceding line
  # @param blank [Boolean] whether the line's text is blank
  # @return [String]
  def line_prefix(line, last_quote_level, blank)
    if line.quote_level > 0
      quote_identifiers = ">" * line.quote_level
      prefix = +""
      # when the quote level drops, separate the levels with an empty quoted line
      prefix << quote_identifiers << "\n" if line.quote_level < last_quote_level && !blank
      prefix << quote_identifiers
      prefix << " " unless blank
      prefix
    elsif last_quote_level != 0 && !blank
      # an empty line terminates the preceding quote
      "\n"
    else
      ""
    end
  end

  # Splits the plain text into Line objects, stripping quote indicators and
  # (when enabled) joining flowed lines. Rebuilds @lines from scratch so
  # that repeated conversions do not accumulate lines.
  def prepare_lines
    @lines = []
    previous_line = nil

    @plaintext.each_line do |text|
      text.chomp!
      line = Line.new(text)
      remove_quote_level_indicators!(line)

      if @format_flowed
        line = merge_lines(line, previous_line)
        # merge_lines hands back the very same object when it merged; compare
        # by identity, because Struct#== compares by value and would drop a
        # line that merely repeats the previous one
        @lines << line unless line.equal?(previous_line)
      else
        @lines << line
      end

      previous_line = line
    end
  end

  # Assigns code blocks to all lines, in order.
  def classify_lines
    previous_line = nil

    @lines.each do |line|
      classify_line_as_code!(line, previous_line)
      previous_line = line
    end
  end

  # Moves leading ">" characters (and one optional whitespace character
  # after them) out of the text and into the line's quote level.
  #
  # @param line [Line]
  def remove_quote_level_indicators!(line)
    match_data = line.text.match(/^(?<indicators>>+)\s?(?<text>.*)/)
    return unless match_data

    line.text = match_data[:text]
    line.quote_level = match_data[:indicators].length
  end

  # Appends the line to the previous one when the previous one is "flowed",
  # i.e. has the same quote level and ends with a space. Blank lines and
  # signature separators are never merged.
  #
  # @param line [Line]
  # @param previous_line [Line]
  # @return [Line] previous_line when merged, otherwise line
  def merge_lines(line, previous_line)
    return line if previous_line.nil? || line.text.blank?
    return line if line.text == SIGNATURE_SEPARATOR || previous_line.text == SIGNATURE_SEPARATOR

    same_quote_level = line.quote_level == previous_line.quote_level
    return line unless same_quote_level && previous_line.text.end_with?(" ")

    previous_line.text = previous_line.text[0...-1] if @delete_flowed_space
    previous_line.text += line.text
    previous_line
  end

  # Tracks fenced code blocks: a line inherits the still-open code block of
  # the previous line, and a fence line either closes that block or opens a
  # new one.
  #
  # @param line [Line]
  # @param previous_line [Line]
  def classify_line_as_code!(line, previous_line)
    if previous_line && !previous_line.valid_code_block?
      line.code_block = previous_line.code_block
    end

    return unless line.text =~ /^\s{0,3}```/

    if line.code_block
      line.code_block.end_line = line
    else
      line.code_block = CodeBlock.new(line)
    end
  end

  # @param line [Line]
  # @return [String]
  def convert_text(line)
    text = line.text

    if line.valid_code_block?
      code_block = line.code_block
      # only the fences lose their indentation; code is emitted verbatim
      is_fence = code_block.start_line.equal?(line) || code_block.end_line.equal?(line)
      return is_fence ? text.lstrip : text
    end

    converted_text = replace_duplicate_links(text)
    converted_text = escape_special_characters(converted_text)
    indent_with_non_breaking_spaces(converted_text)
  end

  # Collapses a URL that is immediately repeated (optionally separated by
  # whitespace and wrapped in brackets or quotes) into a single URL.
  # The given string is not modified.
  #
  # @param text [String]
  # @return [String]
  def replace_duplicate_links(text)
    urls = text.scan(URL_REGEX).flatten
    duplicated_urls = urls.group_by(&:itself).select { |_, found| found.length > 1 }.keys

    duplicated_urls.reduce(text) do |result, url|
      escaped = Regexp.escape(url)
      # a regex literal keeps \s as the whitespace class (inside a %Q string
      # it would be a literal space); the block form inserts the URL verbatim
      result.gsub(/#{escaped}\s*[()\[\]<>«»'"“”‘’]?#{escaped}[()\[\]<>«»'"“”‘’]?/i) { url }
    end
  end

  # Replaces leading whitespace with non-breaking spaces so the indentation
  # survives rendering. A single leading whitespace character is kept as is.
  #
  # @param text [String]
  # @return [String]
  def indent_with_non_breaking_spaces(text)
    text.sub(/^\s+/) do |indentation|
      # replace tabs with 2 spaces
      expanded = indentation.gsub("\t", "  ")
      # replace indentation with non-breaking spaces
      expanded.length > 1 ? "&nbsp;" * expanded.length : expanded
    end
  end

  # Escapes Markdown and HTML special characters everywhere except inside URLs.
  #
  # @param text [String]
  # @return [String]
  def escape_special_characters(text)
    # URL_REGEX has a capture group, so split keeps the URLs as parts
    text.split(URL_REGEX).each_with_object(+"") do |text_part, escaped_text|
      if text_part =~ URL_REGEX
        # no escaping within URLs
        escaped_text << text_part
      else
        # escape Markdown and HTML
        markdown_escaped = text_part.gsub(/[\\`*_{}\[\]()#+\-.!~]/) { |c| "\\#{c}" }
        escaped_text << CGI.escapeHTML(markdown_escaped)
      end
    end
  end
end