discourse/lib/plain_text_to_markdown.rb

# frozen_string_literal: true

# Converts a plain-text body into Markdown.
#
# The text is split into lines, leading ">" markers are turned into quote
# levels, soft-wrapped lines are optionally re-joined, fenced code blocks are
# detected, and every line outside a complete code block has its Markdown/HTML
# special characters escaped.
class PlainTextToMarkdown
  # A line consisting of exactly this text is never joined with its
  # neighbours by #merge_lines.
  SIGNATURE_SEPARATOR = "-- "

  # @param plaintext [String] the text to convert
  # @param opts [Hash]
  # @option opts [Boolean] :format_flowed join a line that ends with a space
  #   with the following line of the same quote level (presumably the
  #   "format=flowed" convention of RFC 3676 -- confirm against callers)
  # @option opts [Boolean] :delete_flowed_space drop that trailing space
  #   before joining
  def initialize(plaintext, opts = {})
    @plaintext = plaintext
    @lines = []

    @format_flowed = opts[:format_flowed] || false
    @delete_flowed_space = opts[:delete_flowed_space] || false
  end

  # Runs the conversion. Safe to call more than once: the parsed lines are
  # rebuilt from the original text on every call.
  #
  # @return [String] the Markdown, without trailing whitespace
  def to_markdown
    prepare_lines
    classify_lines

    markdown = +""
    last_quote_level = 0
    last_line_blank = false

    @lines.each do |line|
      current_line_blank = line.text.blank?

      # Runs of blank lines are collapsed into a single blank line.
      unless last_line_blank && current_line_blank
        if line.quote_level > 0
          quote_identifiers = ">" * line.quote_level
          # When the quote level drops, emit an empty quoted line first so the
          # shallower quote is separated from the deeper one.
          if line.quote_level < last_quote_level && !current_line_blank
            markdown << quote_identifiers << "\n"
          end
          markdown << quote_identifiers
          markdown << " " unless current_line_blank
        else
          # Leaving a quote: insert a blank line before the unquoted text.
          markdown << "\n" unless last_quote_level == 0 || current_line_blank
        end

        markdown << convert_text(line)
        markdown << "\n"
      end

      last_line_blank = current_line_blank
      last_quote_level = line.quote_level
    end

    markdown.rstrip!
    markdown
  end

  private

  # A fenced code block, identified by the Line objects holding its opening
  # and closing fence.
  class CodeBlock < Struct.new(:start_line, :end_line)
    def initialize(start_line, end_line = nil)
      super
    end

    # @return [Boolean] true once both the opening and closing fence are known
    def valid?
      start_line.present? && end_line.present?
    end
  end

  # One line of input: its text (without quote markers), its quote depth and
  # the code block it belongs to, if any.
  #
  # NOTE: Struct#== compares member values, so use #equal? when object
  # identity is what matters.
  class Line < Struct.new(:text, :quote_level, :code_block)
    def initialize(text, quote_level = 0, code_block = nil)
      super
    end

    # @return [Boolean, nil] truthy when the line belongs to a closed code block
    def valid_code_block?
      code_block&.valid?
    end
  end

  # Splits the input into Line objects, extracting quote levels and, when
  # format_flowed is enabled, joining soft-wrapped lines.
  def prepare_lines
    # Start from scratch so repeated #to_markdown calls do not accumulate lines.
    @lines = []
    previous_line = nil

    @plaintext.each_line do |text|
      line = Line.new(text.chomp)

      remove_quote_level_indicators!(line)

      if @format_flowed
        line = merge_lines(line, previous_line)
        # merge_lines hands back the very same object as previous_line when it
        # merged. Compare by identity: Struct#== would also treat two distinct
        # lines with identical content as "merged" and silently drop one.
        @lines << line unless line.equal?(previous_line)
      else
        @lines << line
      end

      previous_line = line
    end
  end

  # Attaches code block information to every prepared line.
  def classify_lines
    previous_line = nil

    @lines.each do |line|
      classify_line_as_code!(line, previous_line)

      previous_line = line
    end
  end

  # Strips leading ">" characters (and one following whitespace character)
  # from the line and records how many there were as its quote level.
  #
  # @param line [Line]
  def remove_quote_level_indicators!(line)
    match_data = line.text.match(/\A(?<indicators>>+)\s?(?<text>.*)/)

    if match_data
      line.text = match_data[:text]
      line.quote_level = match_data[:indicators].length
    end
  end

  # Appends +line+ to +previous_line+ when the previous line ends with a space,
  # both share the same quote level, +line+ is not blank and neither of them
  # is the signature separator.
  #
  # @param line [Line]
  # @param previous_line [Line, nil]
  # @return [Line] +previous_line+ (now extended) when merged, otherwise +line+
  def merge_lines(line, previous_line)
    return line if previous_line.nil? || line.text.blank?
    return line if line.text == SIGNATURE_SEPARATOR || previous_line.text == SIGNATURE_SEPARATOR
    unless line.quote_level == previous_line.quote_level && previous_line.text.end_with?(" ")
      return line
    end

    previous_line.text = previous_line.text[0...-1] if @delete_flowed_space
    previous_line.text += line.text
    previous_line
  end

  # Tracks fenced code blocks: a line inherits the still-open block of the
  # previous line, and a line starting with ``` (after at most three
  # whitespace characters) either closes that block or opens a new one.
  #
  # @param line [Line]
  # @param previous_line [Line, nil]
  def classify_line_as_code!(line, previous_line)
    line.code_block = previous_line.code_block unless previous_line.nil? ||
      previous_line.valid_code_block?
    return unless line.text =~ /\A\s{0,3}```/

    if line.code_block.present?
      line.code_block.end_line = line
    else
      line.code_block = CodeBlock.new(line)
    end
  end

  # Produces the Markdown for a single line. Lines inside a closed code block
  # are passed through untouched (the fences lose their leading whitespace);
  # all other lines are de-duplicated, escaped and indented.
  #
  # @param line [Line]
  # @return [String]
  def convert_text(line)
    text = line.text

    if line.valid_code_block?
      code_block = line.code_block
      # Identity check: we want to know whether this is the fence object itself.
      fence = code_block.start_line.equal?(line) || code_block.end_line.equal?(line)
      return fence ? text.lstrip : text
    end

    converted_text = replace_duplicate_links(text)
    converted_text = escape_special_characters(converted_text)
    indent_with_non_breaking_spaces(converted_text)
  end

  URL_REGEX = URI.regexp(%w[http https ftp mailto])
  # Characters allowed directly before / after the repeated copy of a link,
  # already escaped for use inside a regexp character class.
  BEFORE = Regexp.escape(%Q|([<«"“'‘|).freeze
  AFTER = Regexp.escape(%Q|)]>»"”'’|).freeze

  # Collects the distinct URLs found in +text+, in order of first appearance.
  #
  # @param text [String]
  # @return [Set<String>]
  def extract_urls(text)
    urls = Set.new
    text.scan(URL_REGEX) { urls << Regexp.last_match(0) }
    urls
  end

  # Collapses a URL that is immediately repeated (optionally wrapped in one of
  # the BEFORE/AFTER characters, e.g. "url <url>") into a single occurrence.
  # The argument is left unmodified.
  #
  # @param text [String]
  # @return [String]
  def replace_duplicate_links(text)
    urls = extract_urls(text)
    return text if urls.empty?

    deduplicated = text.dup

    urls.each do |url|
      escaped = Regexp.escape(url)
      # NOTE(review): inside a %Q string "\s" is the space character, so the
      # gap between both copies only matches spaces, not any whitespace.
      # Kept as is to preserve behaviour; use "\\s" if tabs should match too.
      pattern =
        Regexp.new(%Q|#{escaped}\s*[#{BEFORE}]?#{escaped}[#{AFTER}]?|, Regexp::IGNORECASE)
      # Block form so the URL is inserted literally.
      deduplicated.gsub!(pattern) { url }
    end

    deduplicated
  end

  # Turns leading whitespace of two or more characters (tabs count as two
  # spaces) into the same number of "&nbsp;" entities.
  #
  # @param text [String]
  # @return [String]
  def indent_with_non_breaking_spaces(text)
    text.sub(/\A\s+/) do |s|
      # replace tabs with 2 spaces
      indentation = s.gsub("\t", "  ")

      # replace indentation with non-breaking spaces
      indentation.length > 1 ? "&nbsp;" * indentation.length : indentation
    end
  end

  # Backslash-escapes Markdown special characters and HTML-escapes the text,
  # while leaving URLs exactly as they were. The argument is left unmodified.
  #
  # @param text [String]
  # @return [String]
  def escape_special_characters(text)
    # Hoist the longest URLs first. Otherwise a URL that is a prefix of
    # another one (http://a.com vs. http://a.com/b.html) would be swapped out
    # inside the longer URL, whose remainder would then be escaped.
    urls = extract_urls(text).sort_by { |url| -url.length }
    hoisted = urls.map { |url| [SecureRandom.hex, url] }.to_h

    escaped = text.dup
    hoisted.each { |placeholder, url| escaped.gsub!(url, placeholder) }

    escaped.gsub!(/[\\`*_{}\[\]()#+\-.!~]/) { |c| "\\#{c}" }
    escaped = CGI.escapeHTML(escaped)

    # Put the untouched URLs back in place of their placeholders.
    hoisted.each { |placeholder, url| escaped.gsub!(placeholder) { url } }

    escaped
  end
end
-												DEV: enable frozen string literal on all files

This reduces chances of errors where consumers of strings mutate inputs
and reduces memory usage of the app.

Test suite passes now, but there may be some stuff left, so we will run
a few sites on a branch prior to merging

											
										
										
											2019-05-03 06:17:27 +08:00
+								# frozen_string_literal: true
-												FEATURE: convert plain text emails to markdown

											
										
										
											2017-12-06 08:47:31 +08:00
+								class PlainTextToMarkdown
-												DEV: Bump rubocop_discourse (#29608)


											
										
										
											2024-11-06 06:27:49 +08:00
+								  SIGNATURE_SEPARATOR = "-- "
-												FEATURE: convert plain text emails to markdown

											
										
										
											2017-12-06 08:47:31 +08:00
 								  def initialize(plaintext, opts = {})
 								    @plaintext = plaintext
 								    @lines = []
 								    @format_flowed = opts[:format_flowed] || false
 								    @delete_flowed_space = opts[:delete_flowed_space] || false
 								  end
 								  def to_markdown
 								    prepare_lines
 								    classify_lines
-												DEV: enable frozen string literal on all files

This reduces chances of errors where consumers of strings mutate inputs
and reduces memory usage of the app.

Test suite passes now, but there may be some stuff left, so we will run
a few sites on a branch prior to merging

											
										
										
											2019-05-03 06:17:27 +08:00
+								    markdown = +""
-												FEATURE: convert plain text emails to markdown

											
										
										
											2017-12-06 08:47:31 +08:00
+								    last_quote_level = 0
 								    last_line_blank = false
 								    @lines.each do |line|
 								      current_line_blank = line.text.blank?
 								      unless last_line_blank && current_line_blank
 								        if line.quote_level > 0
 								          quote_identifiers = ">" * line.quote_level
 								          unless line.quote_level >= last_quote_level || current_line_blank
 								            markdown << quote_identifiers << "\n"
-												DEV: Apply syntax_tree formatting to `lib/*`

											
										
										
											2023-01-09 20:10:19 +08:00
+								          end
-												FEATURE: convert plain text emails to markdown

											
										
										
											2017-12-06 08:47:31 +08:00
+								          markdown << quote_identifiers
 								          markdown << " " unless current_line_blank
 								        else
 								          markdown << "\n" unless last_quote_level == 0 || current_line_blank
 								        end
 								        markdown << convert_text(line)
 								        markdown << "\n"
 								      end
 								      last_line_blank = current_line_blank
 								      last_quote_level = line.quote_level
 								    end
 								    markdown.rstrip!
 								    markdown
 								  end
 								  private
 								  class CodeBlock < Struct.new(:start_line, :end_line)
 								    def initialize(start_line, end_line = nil)
 								      super
 								    end
 								    def valid?
 								      start_line.present? && end_line.present?
 								    end
 								  end
 								  class Line < Struct.new(:text, :quote_level, :code_block)
 								    def initialize(text, quote_level = 0, code_block = nil)
 								      super
 								    end
 								    def valid_code_block?
 								      code_block&.valid?
 								    end
 								  end
 								  def prepare_lines
 								    previous_line = nil
 								    @plaintext.each_line do |text|
 								      text.chomp!
 								      line = Line.new(text)
 								      remove_quote_level_indicators!(line)
 								      if @format_flowed
 								        line = merge_lines(line, previous_line)
 								        @lines << line unless line == previous_line
 								      else
 								        @lines << line
 								      end
 								      previous_line = line
 								    end
 								  end
 								  def classify_lines
 								    previous_line = nil
 								    @lines.each do |line|
 								      classify_line_as_code!(line, previous_line)
 								      previous_line = line
 								    end
 								  end
 								  # @param line [Line]
 								  def remove_quote_level_indicators!(line)
-												DEV: Prefer \A and \z over ^ and $ in regexes (#19936)


											
										
										
											2023-01-21 02:52:49 +08:00
+								    match_data = line.text.match(/\A(?<indicators>>+)\s?(?<text>.*)/)
-												FEATURE: convert plain text emails to markdown

											
										
										
											2017-12-06 08:47:31 +08:00
 								    if match_data
 								      line.text = match_data[:text]
 								      line.quote_level = match_data[:indicators].length
 								    end
 								  end
 								  # @param line [Line]
 								  # @param previous_line [Line]
 								  # @return [Line]
 								  def merge_lines(line, previous_line)
 								    return line if previous_line.nil? || line.text.blank?
 								    return line if line.text == SIGNATURE_SEPARATOR || previous_line.text == SIGNATURE_SEPARATOR
 								    unless line.quote_level == previous_line.quote_level && previous_line.text.end_with?(" ")
 								      return line
-												DEV: Apply syntax_tree formatting to `lib/*`

											
										
										
											2023-01-09 20:10:19 +08:00
+								    end
-												FEATURE: convert plain text emails to markdown

											
										
										
											2017-12-06 08:47:31 +08:00
 								    previous_line.text = previous_line.text[0...-1] if @delete_flowed_space
 								    previous_line.text += line.text
 								    previous_line
 								  end
 								  # @param line [Line]
 								  # @param previous_line [Line]
 								  def classify_line_as_code!(line, previous_line)
 								    line.code_block = previous_line.code_block unless previous_line.nil? ||
 								      previous_line.valid_code_block?
-												DEV: Prefer \A and \z over ^ and $ in regexes (#19936)


											
										
										
											2023-01-21 02:52:49 +08:00
+								    return unless line.text =~ /\A\s{0,3}```/
-												FEATURE: convert plain text emails to markdown

											
										
										
											2017-12-06 08:47:31 +08:00
 								    if line.code_block.present?
 								      line.code_block.end_line = line
 								    else
 								      line.code_block = CodeBlock.new(line)
 								    end
 								  end
 								  # @param line [Line]
 								  # @return [string]
 								  def convert_text(line)
 								    text = line.text
 								    if line.valid_code_block?
 								      code_block = line.code_block
 								      return code_block.start_line == line || code_block.end_line == line ? text.lstrip : text
 								    end
 								    converted_text = replace_duplicate_links(text)
 								    converted_text = escape_special_characters(converted_text)
 								    converted_text = indent_with_non_breaking_spaces(converted_text)
 								    converted_text
 								  end
-												DEV: Remove logical OR assignment of constants (#29201)

Constants should always be only assigned once. The logical OR assignment
of a constant is a relic of the past before we used zeitwerk for
autoloading and had bugs where a file could be loaded twice resulting in
constant redefinition warnings.
											
										
										
											2024-10-16 10:09:07 +08:00
+								  URL_REGEX = URI.regexp(%w[http https ftp mailto])
 								  BEFORE = Regexp.escape(%Q|([<«"“'‘|)
 								  AFTER = Regexp.escape(%Q|)]>»"”'’|)
-												FIX: use URI.regexp to find URLs in plain text

											
										
										
											2019-06-07 07:26:06 +08:00
-												FEATURE: convert plain text emails to markdown

											
										
										
											2017-12-06 08:47:31 +08:00
+								  def replace_duplicate_links(text)
-												FIX: use URI.regexp to find URLs in plain text

											
										
										
											2019-06-07 07:26:06 +08:00
+								    urls = Set.new
 								    text.scan(URL_REGEX) { urls << $& }
-												FEATURE: convert plain text emails to markdown

											
										
										
											2017-12-06 08:47:31 +08:00
-												FIX: use URI.regexp to find URLs in plain text

											
										
										
											2019-06-07 07:26:06 +08:00
+								    urls.each do |url|
-												FIX: correctly escape regex url

duplicate link detection could fail in certain cases

											
										
										
											2019-06-04 14:20:40 +08:00
+								      escaped = Regexp.escape(url)
-												FIX: use URI.regexp to find URLs in plain text

											
										
										
											2019-06-07 07:26:06 +08:00
+								      text.gsub!(
 								        Regexp.new(%Q|#{escaped}\s*[#{BEFORE}]?#{escaped}[#{AFTER}]?|, Regexp::IGNORECASE),
 								        url,
 								      )
-												FEATURE: convert plain text emails to markdown

											
										
										
											2017-12-06 08:47:31 +08:00
+								    end
 								    text
 								  end
 								  def indent_with_non_breaking_spaces(text)
-												DEV: Prefer \A and \z over ^ and $ in regexes (#19936)


											
										
										
											2023-01-21 02:52:49 +08:00
+								    text.sub(/\A\s+/) do |s|
-												FEATURE: convert plain text emails to markdown

											
										
										
											2017-12-06 08:47:31 +08:00
+								      # replace tabs with 2 spaces
 								      s.gsub!("\t", "  ")
 								      # replace indentation with non-breaking spaces
 								      s.length > 1 ? "&nbsp;" * s.length : s
 								    end
 								  end
 								  def escape_special_characters(text)
-												FIX: use URI.regexp to find URLs in plain text

											
										
										
											2019-06-07 07:26:06 +08:00
+								    urls = Set.new
 								    text.scan(URL_REGEX) { urls << $& }
-												FEATURE: convert plain text emails to markdown

											
										
										
											2017-12-06 08:47:31 +08:00
-												FIX: use URI.regexp to find URLs in plain text

											
										
										
											2019-06-07 07:26:06 +08:00
+								    hoisted = urls.map { |url| [SecureRandom.hex, url] }.to_h
-												FEATURE: convert plain text emails to markdown

											
										
										
											2017-12-06 08:47:31 +08:00
-												FIX: use URI.regexp to find URLs in plain text

											
										
										
											2019-06-07 07:26:06 +08:00
+								    hoisted.each { |h, url| text.gsub!(url, h) }
 								    text.gsub!(/[\\`*_{}\[\]()#+\-.!~]/) { |c| "\\#{c}" }
 								    text = CGI.escapeHTML(text)
 								    hoisted.each { |h, url| text.gsub!(h, url) }
 								    text
-												FEATURE: convert plain text emails to markdown

											
										
										
											2017-12-06 08:47:31 +08:00
+								  end
 								end