discourse/lib/excerpt_parser.rb

# frozen_string_literal: true

class ExcerptParser < Nokogiri::XML::SAX::Document

  attr_reader :excerpt

  SPAN_REGEX = /<\s*span[^>]*class\s*=\s*['|"]excerpt['|"][^>]*>/

  # Builds a SAX handler that accumulates an excerpt into +@excerpt+.
  #
  # @param length [Integer] budget of counted characters; +characters+
  #   stops the parse (throws +:done+) once a counted string would exceed it.
  # @param options [Hash, nil] feature flags keyed by symbol. A flag is
  #   enabled only when its value is exactly +true+; any other value
  #   (including a missing key) leaves it off. +nil+ means "no flags".
  def initialize(length, options = nil)
    @length = length
    @excerpt = +""
    @current_length = 0

    # The original `options || {}` threw its result away, so the default
    # `options = nil` raised NoMethodError on the first lookup below.
    options ||= {}

    @strip_links = options[:strip_links] == true
    @strip_images = options[:strip_images] == true
    @text_entities = options[:text_entities] == true
    @markdown_images = options[:markdown_images] == true
    @keep_newlines = options[:keep_newlines] == true
    @keep_emoji_images = options[:keep_emoji_images] == true
    @keep_onebox_source = options[:keep_onebox_source] == true
    @keep_onebox_body = options[:keep_onebox_body] == true
    @keep_quotes = options[:keep_quotes] == true
    @remap_emoji = options[:remap_emoji] == true

    # Parser state. The flags below were previously created lazily by the
    # element callbacks; they are only ever tested for truthiness, so
    # starting them at false is equivalent and makes the state explicit.
    @start_excerpt = false
    @in_a = false
    @in_quote = false
    @in_summary = false
    @in_spoiler = false
    @in_details_depth = 0
    @summary_contents = +""
    @detail_contents = +""
  end

  # Parses +html+ with a new ExcerptParser and returns the collected excerpt.
  #
  # If the markup contains an explicit excerpt span (see SPAN_REGEX) the
  # limit is raised to the full length of +html+, so that span is never cut
  # short by +length+.
  #
  # @param html [String, nil] markup to summarise; +nil+ is treated as ''.
  # @param length [Integer] budget of counted characters.
  # @param options [Hash, nil] flags forwarded to the constructor; +nil+ is
  #   treated as an empty hash.
  # @return [String] the stripped excerpt. Newline runs are collapsed to a
  #   blank line when onebox content is kept, and HTML entities are decoded
  #   when +:text_entities+ is true.
  def self.get_excerpt(html, length, options)
    html ||= ''
    # Normalise once: a nil options hash used to raise NoMethodError on the
    # option lookups performed in this method.
    options ||= {}

    # Cheap substring test first so the regex only runs on likely candidates.
    length = html.length if html.include?('excerpt') && SPAN_REGEX.match?(html)

    handler = new(length, options)
    parser = Nokogiri::HTML::SAX::Parser.new(handler)

    # The handler throws :done when the budget is exhausted or the explicit
    # excerpt element closes; that simply ends parsing early.
    catch(:done) { parser.parse(html) }

    excerpt = handler.excerpt.strip
    excerpt = excerpt.gsub(/\s*\n+\s*/, "\n\n") if options[:keep_onebox_source] || options[:keep_onebox_body]
    excerpt = CGI.unescapeHTML(excerpt) if options[:text_entities] == true
    excerpt
  end

  # Escapes an attribute value so it can be placed inside a double-quoted
  # HTML attribute.
  #
  # @param v [String, nil] raw attribute value
  # @return [String] a new string with &, ", < and > replaced by entities;
  #   "" when +v+ is nil or false
  def escape_attribute(v)
    return "" unless v

    # "&" has to be handled first, otherwise the ampersands introduced by
    # the later replacements would be escaped a second time.
    [
      ["&", "&amp;"],
      ["\"", "&#34;"],
      ["<", "&lt;"],
      [">", "&gt;"]
    ].reduce(v) { |escaped, (raw, entity)| escaped.gsub(raw, entity) }
  end

  # Emits an opening tag for +name+ into the excerpt verbatim: it is not
  # counted against the length budget, not truncated and not re-encoded.
  #
  # @param name [String] tag name
  # @param attributes [Enumerable] name/value pairs (array of pairs or Hash);
  #   every value is passed through +escape_attribute+
  def include_tag(name, attributes)
    rendered = attributes.map do |key, value|
      %(#{key}="#{escape_attribute(value)}")
    end

    characters(
      "<#{name} #{rendered.join(' ')}>",
      truncate: false,
      count_it: false,
      encode: false
    )
  end

  # SAX callback for an opening tag. Depending on the tag it either emits
  # text/markup through +characters+ / +include_tag+ or updates parser state.
  #
  # @param name [String] tag name
  # @param attributes [Array<Array(String, String)>] attribute name/value pairs
  def start_element(name, attributes = [])
    case name
    when "img"
      attrs = attributes.to_h
      alt = attrs["alt"]

      # Emoji images are rendered as unicode, kept as <img>, or reduced to
      # their alt text, depending on the flags.
      if attrs["class"]&.include?('emoji')
        if @remap_emoji
          shortcode = (alt || "").gsub(":", "")
          return characters(Emoji.lookup_unicode(shortcode) || alt)
        end
        return include_tag(name, attrs) if @keep_emoji_images

        return characters(alt)
      end

      return if @strip_images

      # With markdown_images the placeholder becomes ![label](src).
      characters("!") if @markdown_images

      label =
        if !alt.blank?
          alt
        elsif !attrs["title"].blank?
          attrs["title"]
        else
          I18n.t 'excerpt_image'
        end
      characters("[#{label}]")

      characters("(#{attrs['src']})") if @markdown_images

    when "a"
      unless @strip_links
        include_tag(name, attributes)
        @in_a = true
      end

    when "aside"
      attrs = attributes.to_h
      css_class = attrs['class']

      # Every aside is suppressed, except oneboxes when onebox content is kept.
      keeps_onebox = (@keep_onebox_source || @keep_onebox_body) && css_class&.include?('onebox')
      @in_quote = true unless keeps_onebox

      # Quotes are re-enabled when explicitly kept, or when onebox bodies
      # are kept and the quote carries a data-topic attribute.
      if css_class&.include?('quote') &&
         (@keep_quotes || (@keep_onebox_body && attrs['data-topic'].present?))
        @in_quote = false
      end

    when 'article'
      @in_quote = !@keep_onebox_body if attributes.include?(['class', 'onebox-body'])

    when 'header'
      @in_quote = !@keep_onebox_source if attributes.include?(['class', 'source'])

    when "div", "span"
      # An explicit excerpt element discards everything collected so far.
      if attributes.include?(["class", "excerpt"])
        @excerpt = +""
        @current_length = 0
        @start_excerpt = true
      end

    when "details"
      # Only the outermost <details> resets the buffered body.
      @detail_contents = +"" if @in_details_depth.zero?
      @in_details_depth += 1

    when "summary"
      # Only the summary of the outermost <details> is captured.
      if @in_details_depth == 1 && !@in_summary
        @summary_contents = +""
        @in_summary = true
      end

    end
  end

  # SAX callback for a closing tag. Emits closing markup or separators via
  # +characters+ and unwinds the state set up by +start_element+.
  #
  # @param name [String] tag name
  def end_element(name)
    # Options for markup that is written as-is and not counted.
    verbatim = { truncate: false, count_it: false, encode: false }

    case name
    when "a"
      return if @strip_links

      characters("</a>", **verbatim)
      @in_a = false
    when "p", "br"
      if @keep_newlines
        characters("<br>", **verbatim)
      else
        characters(" ")
      end
    when "aside"
      @in_quote = false
    when "details"
      @in_details_depth -= 1
      # Nothing is written until the outermost <details> closes.
      return unless @in_details_depth == 0

      @summary_contents = clean(@summary_contents)
      @detail_contents = clean(@detail_contents)

      summary_fills_budget = @current_length + @summary_contents.length >= @length
      if summary_fills_budget
        # No room for the body: emit a disabled, summary-only element.
        characters(
          @summary_contents,
          encode: false,
          before_string: "<details class='disabled'><summary>",
          after_string: "</summary></details>"
        )
      else
        characters(
          @summary_contents,
          truncate: false,
          encode: false,
          before_string: "<details><summary>",
          after_string: "</summary>"
        )
        characters(@detail_contents, encode: false, after_string: "</details>")
      end
    when "summary"
      @in_summary = false if @in_details_depth == 1
    when "div", "span"
      # Closing the explicit excerpt element ends the parse.
      throw :done if @start_excerpt

      characters("</span>", **verbatim) if @in_spoiler
      @in_spoiler = false
    end
  end

  # Trims surrounding whitespace from +str+ and HTML-escapes the result.
  #
  # @param str [String]
  # @return [String] escaped copy of the trimmed text
  def clean(str)
    trimmed = str.strip
    ERB::Util.html_escape(trimmed)
  end

  # Appends text to the excerpt. This is the SAX text callback and is also
  # called by the element callbacks to emit markup.
  #
  # Nothing is written while inside a suppressed quote. While inside
  # <details>, text is buffered into the summary or body buffer instead of
  # the excerpt. When a counted string would exceed the budget, the part that
  # still fits is written, followed by an ellipsis (plus a closing </a> when
  # inside a link), and +:done+ is thrown to stop parsing.
  #
  # @param string [#to_s] text to append (nil becomes "")
  # @param truncate [Boolean] on overflow, write the part that still fits
  # @param count_it [Boolean] count +string+ against the length budget
  # @param encode [Boolean] HTML-escape +string+ before appending
  # @param before_string [String, nil] raw markup written before +string+
  # @param after_string [String, nil] raw markup written after +string+
  def characters(string, truncate: true, count_it: true, encode: true, before_string: nil, after_string: nil)
    return if @in_quote

    # nil can arrive here (e.g. a missing alt attribute); length is needed below.
    string = string.to_s

    if @in_details_depth > 0
      (@in_summary ? @summary_contents : @detail_contents) << string
      return
    end

    @excerpt << before_string if before_string

    render = encode ? ->(s) { ERB::Util.html_escape(s) } : ->(s) { s }

    if count_it && @current_length + string.length > @length
      # Take exactly the characters that still fit. The previous
      # `string[0..[0, remaining - 1].max]` emitted one character even when
      # the budget was already used up.
      remaining = [@length - @current_length, 0].max
      @excerpt << render.call(string[0, remaining]) if truncate
      @excerpt << (@text_entities ? "..." : "&hellip;")
      @excerpt << "</a>" if @in_a
      @excerpt << after_string if after_string
      throw :done
    end

    @excerpt << render.call(string)
    @excerpt << after_string if after_string
    @current_length += string.length if count_it
  end
end
DEV: enable frozen string literal on all files This reduces chances of errors where consumers of strings mutate inputs and reduces memory usage of the app. Test suite passes now, but there may be some stuff left, so we will run a few sites on a branch prior to merging 2019-05-03 06:17:27 +08:00			`# frozen_string_literal: true`

refactor 2013-05-28 07:48:47 +08:00			`class ExcerptParser < Nokogiri::XML::SAX::Document`

			`attr_reader :excerpt`

FEATURE: Allow manual excerpt to be specified anywhere in the post and override max excerpt length 2014-09-04 13:03:12 +08:00			`SPAN_REGEX = /<\s*span[^>]*class\s*=\s*['\|"]excerpt['\|"][^>]*>/`

Add rubocop to our build. (#5004) 2017-07-28 09:20:09 +08:00			`def initialize(length, options = nil)`
refactor 2013-05-28 07:48:47 +08:00			`@length = length`
DEV: enable frozen string literal on all files This reduces chances of errors where consumers of strings mutate inputs and reduces memory usage of the app. Test suite passes now, but there may be some stuff left, so we will run a few sites on a branch prior to merging 2019-05-03 06:17:27 +08:00			`@excerpt = +""`
refactor 2013-05-28 07:48:47 +08:00			`@current_length = 0`
Better HTML emails, smarter email digests, new email section in admin with digest preview 2013-06-04 04:12:24 +08:00			`options \|\| {}`
refactor 2013-05-28 07:48:47 +08:00			`@strip_links = options[:strip_links] == true`
FEATURE: omit images from og and twitter description tags 2017-11-28 19:27:43 +08:00			`@strip_images = options[:strip_images] == true`
Better HTML emails, smarter email digests, new email section in admin with digest preview 2013-06-04 04:12:24 +08:00			`@text_entities = options[:text_entities] == true`
Allow images in the daily digest for top scoring posts 2013-06-06 06:54:46 +08:00			`@markdown_images = options[:markdown_images] == true`
FIX: clean html before sending it to jquery for collapsing 2015-05-20 12:42:54 +08:00			`@keep_newlines = options[:keep_newlines] == true`
FIX: Emoji in Discourse onebox is wrapped in square brackets. 2015-12-14 21:46:15 +08:00			`@keep_emoji_images = options[:keep_emoji_images] == true`
FEATURE: Add option for `ExcerptParser` to keep onebox source. 2017-04-10 16:11:58 +08:00			`@keep_onebox_source = options[:keep_onebox_source] == true`
DEV: Add option to keep onebox body content in post excerpt. 2019-11-02 19:44:04 +08:00			`@keep_onebox_body = options[:keep_onebox_body] == true`
DEV: Add option to keep quoted content in post excerpt. 2020-01-04 21:26:52 +08:00			`@keep_quotes = options[:keep_quotes] == true`
FEATURE: remap emojis back for push notifications and desktop alerts 2016-10-11 10:03:21 +08:00			`@remap_emoji = options[:remap_emoji] == true`
Feature: allow mods to cut pinned topic excerpts 2014-07-17 19:32:17 +08:00			`@start_excerpt = false`
FIX: support for generating excerpt when nesting <details> blocks 2018-01-23 02:17:35 +08:00			`@in_details_depth = 0`
DEV: enable frozen string literal on all files This reduces chances of errors where consumers of strings mutate inputs and reduces memory usage of the app. Test suite passes now, but there may be some stuff left, so we will run a few sites on a branch prior to merging 2019-05-03 06:17:27 +08:00			`@summary_contents = +""`
			`@detail_contents = +""`
refactor 2013-05-28 07:48:47 +08:00			`end`

			`def self.get_excerpt(html, length, options)`
FEATURE: Allow manual excerpt to be specified anywhere in the post and override max excerpt length 2014-09-04 13:03:12 +08:00			`html \|\|= ''`
FIX: properly unescape HTML entities in excerpts 2014-12-10 19:52:51 +08:00			`length = html.length if html.include?('excerpt') && SPAN_REGEX === html`
FEATURE: Allow manual excerpt to be specified anywhere in the post and override max excerpt length 2014-09-04 13:03:12 +08:00			`me = self.new(length, options)`
refactor 2013-05-28 07:48:47 +08:00			`parser = Nokogiri::HTML::SAX::Parser.new(me)`
			`catch(:done) do`
FEATURE: Allow manual excerpt to be specified anywhere in the post and override max excerpt length 2014-09-04 13:03:12 +08:00			`parser.parse(html)`
refactor 2013-05-28 07:48:47 +08:00			`end`
FIX: properly unescape HTML entities in excerpts 2014-12-10 19:52:51 +08:00			`excerpt = me.excerpt.strip`
DEV: Add option to keep onebox body content in post excerpt. 2019-11-02 19:44:04 +08:00			`excerpt = excerpt.gsub(/\s*\n+\s*/, "\n\n") if options[:keep_onebox_source] \|\| options[:keep_onebox_body]`
FIX: properly unescape HTML entities in excerpts 2014-12-10 19:52:51 +08:00			`excerpt = CGI.unescapeHTML(excerpt) if options[:text_entities] == true`
			`excerpt`
refactor 2013-05-28 07:48:47 +08:00			`end`

SECURITY: fix XSS in excerpt parser 2014-07-25 10:15:43 +08:00			`def escape_attribute(v)`
FIX: blank page on user page in rare cases 2014-10-07 08:37:27 +08:00			`return "" unless v`

			`v = v.dup`
			`v.gsub!("&", "&amp;")`
			`v.gsub!("\"", "&#34;")`
			`v.gsub!("<", "&lt;")`
			`v.gsub!(">", "&gt;")`
			`v`
SECURITY: fix XSS in excerpt parser 2014-07-25 10:15:43 +08:00			`end`

Allow images in the daily digest for top scoring posts 2013-06-06 06:54:46 +08:00			`def include_tag(name, attributes)`
FIX: Calculation of text length for <details> in excerpt was wrong 2018-03-12 23:52:06 +08:00			`characters("<#{name} #{attributes.map { \|k, v\| "#{k}=\"#{escape_attribute(v)}\"" }.join(' ')}>",`
			`truncate: false, count_it: false, encode: false)`
Allow images in the daily digest for top scoring posts 2013-06-06 06:54:46 +08:00			`end`

Add rubocop to our build. (#5004) 2017-07-28 09:20:09 +08:00			`def start_element(name, attributes = [])`
refactor 2013-05-28 07:48:47 +08:00			`case name`
Add rubocop to our build. (#5004) 2017-07-28 09:20:09 +08:00			`when "img"`
			`attributes = Hash[*attributes.flatten]`
FEATURE: Display emojis in user stream. 2015-07-23 23:02:03 +08:00
FEATURE: omit images from og and twitter description tags 2017-11-28 19:27:43 +08:00			`if attributes["class"]&.include?('emoji')`
			`if @remap_emoji`
			`title = (attributes["alt"] \|\| "").gsub(":", "")`
			`title = Emoji.lookup_unicode(title) \|\| attributes["alt"]`
			`return characters(title)`
			`elsif @keep_emoji_images`
			`return include_tag(name, attributes)`
			`else`
			`return characters(attributes["alt"])`
FEATURE: Display emojis in user stream. 2015-07-23 23:02:03 +08:00			`end`
FEATURE: omit images from og and twitter description tags 2017-11-28 19:27:43 +08:00			`end`
FEATURE: Display emojis in user stream. 2015-07-23 23:02:03 +08:00
FEATURE: omit images from og and twitter description tags 2017-11-28 19:27:43 +08:00			`unless @strip_images`
Allow images in the daily digest for top scoring posts 2013-06-06 06:54:46 +08:00			`# If include_images is set, include the image in markdown`
			`characters("!") if @markdown_images`

FIX: Handle cases where `alt` and `title` tag is blank when parsing excerpt. 2017-04-11 12:12:51 +08:00			`if !attributes["alt"].blank?`
refactor 2013-05-28 07:48:47 +08:00			`characters("[#{attributes["alt"]}]")`
FIX: Handle cases where `alt` and `title` tag is blank when parsing excerpt. 2017-04-11 12:12:51 +08:00			`elsif !attributes["title"].blank?`
refactor 2013-05-28 07:48:47 +08:00			`characters("[#{attributes["title"]}]")`
			`else`
allow to translate image's exceprt 2015-01-23 17:57:01 +08:00			`characters("[#{I18n.t 'excerpt_image'}]")`
refactor 2013-05-28 07:48:47 +08:00			`end`
Allow images in the daily digest for top scoring posts 2013-06-06 06:54:46 +08:00
			`characters("(#{attributes['src']})") if @markdown_images`
FEATURE: omit images from og and twitter description tags 2017-11-28 19:27:43 +08:00			`end`
Allow images in the daily digest for top scoring posts 2013-06-06 06:54:46 +08:00
Add rubocop to our build. (#5004) 2017-07-28 09:20:09 +08:00			`when "a"`
			`unless @strip_links`
			`include_tag(name, attributes)`
			`@in_a = true`
			`end`
Preserve spoiler tags in post excerpts. 2014-02-20 16:48:30 +08:00
Add rubocop to our build. (#5004) 2017-07-28 09:20:09 +08:00			`when "aside"`
			`attributes = Hash[*attributes.flatten]`
FIX: do not raise error if 'class' attribute is not found. 2020-05-01 12:33:40 +08:00			`unless (@keep_onebox_source \|\| @keep_onebox_body) && attributes['class']&.include?('onebox')`
FIX: support for generating excerpt when nesting <details> blocks 2018-01-23 02:17:35 +08:00			`@in_quote = true`
			`end`
FIX: Quotes should be ignored when parsing for onebox source. 2017-04-11 15:13:21 +08:00
FIX: Do not error in excerpts when aside tag has no class attribute 2020-01-21 05:09:23 +08:00			`if attributes['class']&.include?('quote')`
DEV: Add option to keep quoted content in post excerpt. 2020-01-04 21:26:52 +08:00			`if @keep_quotes \|\| (@keep_onebox_body && attributes['data-topic'].present?)`
			`@in_quote = false`
			`end`
DEV: Add option to keep onebox body content in post excerpt. 2019-11-02 19:44:04 +08:00			`end`

Add rubocop to our build. (#5004) 2017-07-28 09:20:09 +08:00			`when 'article'`
DEV: Add option to keep onebox body content in post excerpt. 2019-11-02 19:44:04 +08:00			`if attributes.include?(['class', 'onebox-body'])`
			`@in_quote = !@keep_onebox_body`
			`end`

			`when 'header'`
			`if attributes.include?(['class', 'source'])`
			`@in_quote = !@keep_onebox_source`
Add rubocop to our build. (#5004) 2017-07-28 09:20:09 +08:00			`end`
FIX: support for generating excerpt when nesting <details> blocks 2018-01-23 02:17:35 +08:00
Add rubocop to our build. (#5004) 2017-07-28 09:20:09 +08:00			`when "div", "span"`
			`if attributes.include?(["class", "excerpt"])`
DEV: enable frozen string literal on all files This reduces chances of errors where consumers of strings mutate inputs and reduces memory usage of the app. Test suite passes now, but there may be some stuff left, so we will run a few sites on a branch prior to merging 2019-05-03 06:17:27 +08:00			`@excerpt = +""`
Add rubocop to our build. (#5004) 2017-07-28 09:20:09 +08:00			`@current_length = 0`
			`@start_excerpt = true`
			`end`
FIX: support for generating excerpt when nesting <details> blocks 2018-01-23 02:17:35 +08:00
FIX: details tags broke excerpts 2017-12-20 06:28:55 +08:00			`when "details"`
DEV: enable frozen string literal on all files This reduces chances of errors where consumers of strings mutate inputs and reduces memory usage of the app. Test suite passes now, but there may be some stuff left, so we will run a few sites on a branch prior to merging 2019-05-03 06:17:27 +08:00			`@detail_contents = +"" if @in_details_depth == 0`
FIX: support for generating excerpt when nesting <details> blocks 2018-01-23 02:17:35 +08:00			`@in_details_depth += 1`

FIX: details tags broke excerpts 2017-12-20 06:28:55 +08:00			`when "summary"`
FIX: support for generating excerpt when nesting <details> blocks 2018-01-23 02:17:35 +08:00			`if @in_details_depth == 1 && !@in_summary`
DEV: enable frozen string literal on all files This reduces chances of errors where consumers of strings mutate inputs and reduces memory usage of the app. Test suite passes now, but there may be some stuff left, so we will run a few sites on a branch prior to merging 2019-05-03 06:17:27 +08:00			`@summary_contents = +""`
FIX: support for generating excerpt when nesting <details> blocks 2018-01-23 02:17:35 +08:00			`@in_summary = true`
			`end`

refactor 2013-05-28 07:48:47 +08:00			`end`
			`end`

			`def end_element(name)`
			`case name`
			`when "a"`
			`unless @strip_links`
FIX: Calculation of text length for <details> in excerpt was wrong 2018-03-12 23:52:06 +08:00			`characters("</a>", truncate: false, count_it: false, encode: false)`
refactor 2013-05-28 07:48:47 +08:00			`@in_a = false`
			`end`
			`when "p", "br"`
FIX: clean html before sending it to jquery for collapsing 2015-05-20 12:42:54 +08:00			`if @keep_newlines`
FIX: Calculation of text length for <details> in excerpt was wrong 2018-03-12 23:52:06 +08:00			`characters("<br>", truncate: false, count_it: false, encode: false)`
FIX: clean html before sending it to jquery for collapsing 2015-05-20 12:42:54 +08:00			`else`
			`characters(" ")`
			`end`
refactor 2013-05-28 07:48:47 +08:00			`when "aside"`
			`@in_quote = false`
FIX: details tags broke excerpts 2017-12-20 06:28:55 +08:00			`when "details"`
FIX: support for generating excerpt when nesting <details> blocks 2018-01-23 02:17:35 +08:00			`@in_details_depth -= 1`
			`if @in_details_depth == 0`
FIX: Calculation of text length for <details> in excerpt was wrong 2018-03-12 23:52:06 +08:00			`@summary_contents = clean(@summary_contents)`
			`@detail_contents = clean(@detail_contents)`

			`if @current_length + @summary_contents.length >= @length`
			`characters(@summary_contents,`
			`encode: false,`
			`before_string: "<details class='disabled'><summary>",`
			`after_string: "</summary></details>")`
FIX: support for generating excerpt when nesting <details> blocks 2018-01-23 02:17:35 +08:00			`else`
FIX: Calculation of text length for <details> in excerpt was wrong 2018-03-12 23:52:06 +08:00			`characters(@summary_contents,`
			`truncate: false,`
			`encode: false,`
			`before_string: "<details><summary>",`
			`after_string: "</summary>")`

			`characters(@detail_contents,`
			`encode: false,`
			`after_string: "</details>")`
FIX: support for generating excerpt when nesting <details> blocks 2018-01-23 02:17:35 +08:00			`end`
FIX: details tags broke excerpts 2017-12-20 06:28:55 +08:00			`end`
FIX: Don't disable details when below truncate limit 2017-12-21 04:44:36 +08:00			`when "summary"`
FIX: support for generating excerpt when nesting <details> blocks 2018-01-23 02:17:35 +08:00			`@in_summary = false if @in_details_depth == 1`
Preserve spoiler tags in post excerpts. 2014-02-20 16:48:30 +08:00			`when "div", "span"`
Feature: allow mods to cut pinned topic excerpts 2014-07-17 19:32:17 +08:00			`throw :done if @start_excerpt`
FIX: Calculation of text length for <details> in excerpt was wrong 2018-03-12 23:52:06 +08:00			`characters("</span>", truncate: false, count_it: false, encode: false) if @in_spoiler`
Preserve spoiler tags in post excerpts. 2014-02-20 16:48:30 +08:00			`@in_spoiler = false`
refactor 2013-05-28 07:48:47 +08:00			`end`
			`end`

FIX: Don't disable details when below truncate limit 2017-12-21 04:44:36 +08:00			`def clean(str)`
			`ERB::Util.html_escape(str.strip)`
			`end`

FIX: Calculation of text length for <details> in excerpt was wrong 2018-03-12 23:52:06 +08:00			`def characters(string, truncate: true, count_it: true, encode: true, before_string: nil, after_string: nil)`
refactor 2013-05-28 07:48:47 +08:00			`return if @in_quote`
FIX: details tags broke excerpts 2017-12-20 06:28:55 +08:00
FIX: exception in excerpt parser for null nodes 2017-08-18 04:13:21 +08:00			`# we call length on this so might as well ensure we have a string`
			`string = string.to_s`
FIX: support for generating excerpt when nesting <details> blocks 2018-01-23 02:17:35 +08:00			`if @in_details_depth > 0`
FIX: details tags broke excerpts 2017-12-20 06:28:55 +08:00			`if @in_summary`
			`@summary_contents << string`
FIX: Don't disable details when below truncate limit 2017-12-21 04:44:36 +08:00			`else`
			`@detail_contents << string`
FIX: details tags broke excerpts 2017-12-20 06:28:55 +08:00			`end`
			`return`
			`end`
FIX: exception in excerpt parser for null nodes 2017-08-18 04:13:21 +08:00
FIX: Calculation of text length for <details> in excerpt was wrong 2018-03-12 23:52:06 +08:00			`@excerpt << before_string if before_string`

Add rubocop to our build. (#5004) 2017-07-28 09:20:09 +08:00			`encode = encode ? lambda { \|s\| ERB::Util.html_escape(s) } : lambda { \|s\| s }`
refactor 2013-05-28 07:48:47 +08:00			`if count_it && @current_length + string.length > @length`
			`length = [0, @length - @current_length - 1].max`
			`@excerpt << encode.call(string[0..length]) if truncate`
Better HTML emails, smarter email digests, new email section in admin with digest preview 2013-06-04 04:12:24 +08:00			`@excerpt << (@text_entities ? "..." : "&hellip;")`
refactor 2013-05-28 07:48:47 +08:00			`@excerpt << "</a>" if @in_a`
FIX: Calculation of text length for <details> in excerpt was wrong 2018-03-12 23:52:06 +08:00			`@excerpt << after_string if after_string`
refactor 2013-05-28 07:48:47 +08:00			`throw :done`
			`end`
FIX: Calculation of text length for <details> in excerpt was wrong 2018-03-12 23:52:06 +08:00
refactor 2013-05-28 07:48:47 +08:00			`@excerpt << encode.call(string)`
FIX: Calculation of text length for <details> in excerpt was wrong 2018-03-12 23:52:06 +08:00			`@excerpt << after_string if after_string`
refactor 2013-05-28 07:48:47 +08:00			`@current_length += string.length if count_it`
			`end`
			`end`