discourse/lib/plain_text_to_markdown.rb
Sam Saffron 30990006a9 DEV: enable frozen string literal on all files
This reduces chances of errors where consumers of strings mutate inputs
and reduces memory usage of the app.

Test suite passes now, but there may be some stuff left, so we will run
a few sites on a branch prior to merging
2019-05-13 09:31:32 +08:00

193 lines
4.8 KiB
Ruby

# frozen_string_literal: true
# Converts plain text into Markdown.
#
# * Leading ">" quote indicators are normalized into Markdown blockquotes.
# * Lines inside a closed fenced code block (```) are passed through verbatim.
# * Everywhere else, Markdown and HTML special characters are escaped (URLs are
#   left untouched), immediately repeated URLs are collapsed into one, and
#   leading indentation is turned into non-breaking spaces.
#
# NOTE(review): relies on `CGI` and on ActiveSupport's `blank?` / `present?`
# being loaded by the host application; nothing is required in this file.
class PlainTextToMarkdown
  SIGNATURE_SEPARATOR ||= "-- ".freeze

  URL_REGEX ||= /((?:https?:(?:\/{1,3}|[a-z0-9%])|www\d{0,3}[.])(?:[^\s()<>]+|\([^\s()<>]+\))+(?:\([^\s()<>]+\)|[^`!()\[\]{};:'".,<>?«»“”‘’\s]))/i

  # @param plaintext [String] the text to convert
  # @param opts [Hash]
  # @option opts [Boolean] :format_flowed (false) append a line to the previous
  #   one when the previous line ends with a space and has the same quote level
  # @option opts [Boolean] :delete_flowed_space (false) drop that trailing space
  #   before joining the two lines
  def initialize(plaintext, opts = {})
    @plaintext = plaintext
    @lines = []
    @format_flowed = opts[:format_flowed] || false
    @delete_flowed_space = opts[:delete_flowed_space] || false
  end

  # Runs the conversion. Safe to call more than once; every call rebuilds the
  # line list from the original plain text.
  #
  # @return [String] the Markdown, without trailing whitespace
  def to_markdown
    prepare_lines
    classify_lines

    markdown = +""
    last_quote_level = 0
    last_line_blank = false

    @lines.each do |line|
      current_line_blank = line.text.blank?

      # collapse runs of blank lines into a single blank line
      unless last_line_blank && current_line_blank
        if line.quote_level > 0
          quote_identifiers = ">" * line.quote_level
          # leaving a deeper quote level: emit an empty quoted line as separator
          markdown << quote_identifiers << "\n" unless line.quote_level >= last_quote_level || current_line_blank
          markdown << quote_identifiers
          markdown << " " unless current_line_blank
        else
          # leaving a quote: a blank line is needed to terminate the blockquote
          markdown << "\n" unless last_quote_level == 0 || current_line_blank
        end

        markdown << convert_text(line)
        markdown << "\n"
      end

      last_line_blank = current_line_blank
      last_quote_level = line.quote_level
    end

    markdown.rstrip!
    markdown
  end

  private

  # A fenced code block, delimited by the lines holding the opening and the
  # closing fence. It is only valid once both fences have been seen.
  class CodeBlock < Struct.new(:start_line, :end_line)
    def initialize(start_line, end_line = nil)
      super
    end

    def valid?
      start_line.present? && end_line.present?
    end
  end

  # One line of input: its text without quote indicators, its quote depth and
  # the code block it belongs to (if any).
  class Line < Struct.new(:text, :quote_level, :code_block)
    def initialize(text, quote_level = 0, code_block = nil)
      super
    end

    def valid_code_block?
      code_block&.valid?
    end
  end

  # Splits the plain text into Line objects, strips the quote indicators and,
  # when format_flowed is enabled, joins flowed lines.
  def prepare_lines
    # start from scratch so that repeated calls of #to_markdown do not pile up lines
    @lines = []
    previous_line = nil

    @plaintext.each_line do |text|
      line = Line.new(text.chomp)
      remove_quote_level_indicators!(line)

      if @format_flowed
        line = merge_lines(line, previous_line)
        # merge_lines returns the very same object when it merged. Compare by
        # identity: Struct#== compares members and would silently drop a line
        # that merely looks like the previous one.
        @lines << line unless line.equal?(previous_line)
      else
        @lines << line
      end

      previous_line = line
    end
  end

  # Marks every line that belongs to a fenced code block.
  def classify_lines
    previous_line = nil

    @lines.each do |line|
      classify_line_as_code!(line, previous_line)
      previous_line = line
    end
  end

  # Moves leading ">" characters from the text into the line's quote level.
  #
  # @param line [Line]
  def remove_quote_level_indicators!(line)
    match_data = line.text.match(/\A(?<indicators>>+)\s?(?<text>.*)/)
    return unless match_data

    line.text = match_data[:text]
    line.quote_level = match_data[:indicators].length
  end

  # Appends the line to the previous line if the previous line is flowed (ends
  # with a space) and has the same quote level. Blank lines and signature
  # separators are never merged.
  #
  # @param line [Line]
  # @param previous_line [Line]
  # @return [Line] previous_line when the lines were merged, otherwise line
  def merge_lines(line, previous_line)
    return line if previous_line.nil? || line.text.blank?
    return line if line.text == SIGNATURE_SEPARATOR || previous_line.text == SIGNATURE_SEPARATOR
    return line unless line.quote_level == previous_line.quote_level && previous_line.text.end_with?(" ")

    previous_line.text = previous_line.text[0...-1] if @delete_flowed_space
    previous_line.text += line.text
    previous_line
  end

  # Propagates a still open code block from the previous line and opens or
  # closes a block when the line is a fence (``` indented by up to 3 whitespace
  # characters).
  #
  # @param line [Line]
  # @param previous_line [Line]
  def classify_line_as_code!(line, previous_line)
    # a block that already has both fences is closed and must not spill over
    line.code_block = previous_line.code_block unless previous_line.nil? || previous_line.valid_code_block?
    return unless line.text.match?(/^\s{0,3}```/)

    if line.code_block.present?
      line.code_block.end_line = line
    else
      line.code_block = CodeBlock.new(line)
    end
  end

  # @param line [Line]
  # @return [String] the Markdown for the text of the line
  def convert_text(line)
    text = line.text

    if line.valid_code_block?
      code_block = line.code_block
      # only the fences lose their indentation; the code itself stays verbatim
      fence = code_block.start_line.equal?(line) || code_block.end_line.equal?(line)
      return fence ? text.lstrip : text
    end

    converted_text = replace_duplicate_links(text)
    converted_text = escape_special_characters(converted_text)
    indent_with_non_breaking_spaces(converted_text)
  end

  # Collapses a URL that is immediately followed by the same URL (optionally
  # separated by whitespace and wrapped in brackets or quotes) into one URL.
  #
  # @param text [String]
  # @return [String] a new string; the argument is not modified
  def replace_duplicate_links(text)
    urls = text.scan(URL_REGEX).flatten
    duplicated_urls = urls.uniq.select { |url| urls.count(url) > 1 }

    duplicated_urls.reduce(text) do |result, url|
      # URLs are full of regex metacharacters (".", "?", "+", parentheses)
      escaped_url = Regexp.escape(url)
      pattern = /#{escaped_url}(\s*[()\[\]<>«»'"“”‘’]?#{escaped_url}[()\[\]<>«»'"“”‘’]?)/i

      # block form: the URL must not be interpreted as a replacement pattern
      result.gsub(pattern) { url }
    end
  end

  # Replaces leading whitespace with non-breaking spaces. A single leading
  # space is kept as is.
  #
  # @param text [String]
  # @return [String]
  def indent_with_non_breaking_spaces(text)
    text.sub(/^\s+/) do |indentation|
      # replace tabs with 2 spaces
      indentation = indentation.gsub("\t", "  ")

      # replace indentation with non-breaking spaces
      indentation.length > 1 ? "&nbsp;" * indentation.length : indentation
    end
  end

  # Escapes Markdown and HTML special characters everywhere except in URLs.
  #
  # @param text [String]
  # @return [String]
  def escape_special_characters(text)
    escaped_text = +""

    # URL_REGEX has a capture group, so the URLs are part of the split result
    text.split(URL_REGEX).each do |text_part|
      if text_part.match?(URL_REGEX)
        # no escaping within URLs
        escaped_text << text_part
      else
        # escape Markdown and HTML
        escaped_markdown = text_part.gsub(/[\\`*_{}\[\]()#+\-.!~]/) { |c| "\\#{c}" }
        escaped_text << CGI.escapeHTML(escaped_markdown)
      end
    end

    escaped_text
  end
end