discourse/lib/excerpt_parser.rb

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

247 lines
7.4 KiB
Ruby
Raw Permalink Normal View History

# frozen_string_literal: true
2013-05-28 07:48:47 +08:00
class ExcerptParser < Nokogiri::XML::SAX::Document
attr_reader :excerpt
CUSTOM_EXCERPT_REGEX = /<\s*(span|div)[^>]*class\s*=\s*['"]excerpt['"][^>]*>/
# Sets up a SAX handler that accumulates an excerpt of at most `length`
# counted characters.
#
# @param length [Integer] maximum number of counted characters to keep
# @param options [Hash, nil] feature switches keyed by symbol. A switch is
#   enabled only when its value is exactly `true`; `nil` means "no options".
def initialize(length, options = nil)
  @length = length
  @excerpt = +""
  @current_length = 0

  # Must be an assignment: a bare `options || {}` throws its result away and
  # the lookups below would then fail on the default nil.
  options ||= {}
  @strip_links = options[:strip_links] == true
  @strip_images = options[:strip_images] == true
  @text_entities = options[:text_entities] == true
  @markdown_images = options[:markdown_images] == true
  @keep_newlines = options[:keep_newlines] == true
  @keep_emoji_images = options[:keep_emoji_images] == true
  @keep_onebox_source = options[:keep_onebox_source] == true
  @keep_onebox_body = options[:keep_onebox_body] == true
  @keep_quotes = options[:keep_quotes] == true
  @keep_svg = options[:keep_svg] == true
  @remap_emoji = options[:remap_emoji] == true

  # Parser state. The four @in_* flags are read by the element callbacks
  # before anything assigns them, so give them an explicit starting value.
  @start_excerpt = false
  @start_hashtag_icon = false
  @in_a = false
  @in_quote = false
  @in_summary = false
  @in_svg = false
  @in_details_depth = 0
  @summary_contents = +""
  @detail_contents = +""
end
# Parses `html` with a SAX parser and returns the accumulated excerpt.
#
# When the markup contains a span/div whose class is exactly "excerpt"
# (CUSTOM_EXCERPT_REGEX), the limit is raised to the full length of `html`.
#
# @param html [String, nil] markup to summarise; nil is treated as ""
# @param length [Integer] maximum number of counted characters
# @param options [Hash, nil] same switches as the constructor; nil is
#   treated as an empty hash
# @return [String] the stripped excerpt
def self.get_excerpt(html, length, options)
  html ||= ""
  # Normalise here as well: the option lookups below would fail on nil.
  options ||= {}
  length = html.length if html.include?("excerpt") && CUSTOM_EXCERPT_REGEX.match?(html)

  me = new(length, options)
  parser = Nokogiri::HTML::SAX::Parser.new(me)
  # The handler throws :done once the excerpt is complete.
  catch(:done) { parser.parse(html) }

  excerpt = me.excerpt.strip
  if options[:keep_onebox_source] || options[:keep_onebox_body]
    # Collapse any whitespace run containing a newline into one blank line.
    excerpt = excerpt.gsub(/\s*\n+\s*/, "\n\n")
  end
  excerpt = CGI.unescapeHTML(excerpt) if options[:text_entities] == true
  excerpt
end
2014-07-25 10:15:43 +08:00
# Escapes a value for use inside a double-quoted HTML attribute.
#
# @param v [String, nil] raw attribute value
# @return [String] a new string with &, ", < and > replaced by entities;
#   "" when `v` is nil or false. The argument itself is never modified.
def escape_attribute(v)
  return "" unless v

  # "&" must go first so the entities added by later steps are not re-escaped.
  v.gsub("&", "&amp;").gsub("\"", "&#34;").gsub("<", "&lt;").gsub(">", "&gt;")
end
# Writes an opening tag for `name` into the excerpt verbatim: it is not
# counted against the length budget, never truncated and not HTML-encoded.
#
# @param name [String] tag name
# @param attributes [#map] attribute name/value pairs (array of pairs or Hash)
def include_tag(name, attributes)
  rendered_pairs =
    attributes.map do |attr_name, attr_value|
      "#{attr_name}=\"#{escape_attribute(attr_value)}\""
    end
  opening_tag = "<#{name} #{rendered_pairs.join(" ")}>"

  characters(opening_tag, truncate: false, count_it: false, encode: false)
end
2013-05-28 07:48:47 +08:00
# SAX callback for an opening tag. Depending on the tag and the configured
# switches it either emits text/markup through `characters` / `include_tag`
# or just updates parser state (@in_a, @in_quote, @in_summary, @in_svg, ...).
#
# @param name [String] tag name
# @param attributes [Array<Array>] attribute name/value pairs
def start_element(name, attributes = [])
  case name
  when "img"
    attributes = Hash[*attributes.flatten]

    if attributes["class"]&.include?("emoji")
      if @remap_emoji
        # Strip the colons from ":name:" before looking up the unicode
        # form; fall back to the raw alt text when the lookup finds nothing.
        title = (attributes["alt"] || "").delete(":")
        title = Emoji.lookup_unicode(title) || attributes["alt"]
        characters(title)
      elsif @keep_emoji_images
        include_tag(name, attributes)
      else
        characters(attributes["alt"])
      end
      return
    end

    unless @strip_images
      # With markdown_images the image is written as ![label](src);
      # otherwise only the bracketed label is emitted.
      characters("!") if @markdown_images

      if attributes["alt"].present?
        characters("[#{attributes["alt"]}]")
      elsif attributes["title"].present?
        characters("[#{attributes["title"]}]")
      else
        characters("[#{I18n.t("excerpt_image")}]")
      end

      characters("(#{attributes["src"]})") if @markdown_images
    end
  when "a"
    unless @strip_links
      include_tag(name, attributes)
      @in_a = true
    end
  when "aside"
    attributes = Hash[*attributes.flatten]
    css_class = attributes["class"]

    # Asides are skipped unless this is a onebox and a keep_onebox_*
    # switch is on.
    keeping_onebox = @keep_onebox_source || @keep_onebox_body
    @in_quote = true if !keeping_onebox || !css_class&.include?("onebox")

    # Quotes can be re-enabled explicitly, or when onebox bodies are kept
    # and the quote carries a data-topic attribute.
    if css_class&.include?("quote")
      if @keep_quotes || (@keep_onebox_body && attributes["data-topic"].present?)
        @in_quote = false
      end
    end
  when "article"
    # Here `attributes` is still the raw pair list, so this only matches a
    # class attribute that is exactly "onebox-body".
    @in_quote = !@keep_onebox_body if attributes.include?(%w[class onebox-body])
  when "header"
    @in_quote = !@keep_onebox_source if attributes.include?(%w[class source])
  when "div", "span"
    attributes = Hash[*attributes.flatten]
    css_class = attributes["class"]

    # Only match the "excerpt" class if it is not specifically
    # "excerpt hidden", to prevent internal links with GitHub oneboxes from
    # being empty https://meta.discourse.org/t/269436
    if css_class&.include?("excerpt") && !css_class.include?("excerpt hidden")
      # A custom excerpt restarts accumulation from scratch.
      @excerpt = +""
      @current_length = 0
      @start_excerpt = true
    elsif css_class&.include?("hashtag-icon-placeholder")
      @start_hashtag_icon = true
      include_tag(name, attributes)
    end
  when "details"
    @detail_contents = +"" if @in_details_depth == 0
    @in_details_depth += 1
  when "summary"
    # Only the summary of the outermost <details> is captured.
    if @in_details_depth == 1 && !@in_summary
      @summary_contents = +""
      @in_summary = true
    end
  when "svg"
    attributes = Hash[*attributes.flatten]
    if attributes["class"]&.include?("d-icon") && @keep_svg
      include_tag(name, attributes)
      @in_svg = true
    end
  when "use"
    include_tag(name, attributes) if @in_svg && @keep_svg
  end
end
# SAX callback for a closing tag. Emits the matching closing markup where
# the opening tag was emitted, flushes captured <details> content, and
# resets the state flags set by the opening-tag callback.
#
# @param name [String] tag name
def end_element(name)
  case name
  when "a"
    unless @strip_links
      characters("</a>", truncate: false, count_it: false, encode: false)
      @in_a = false
    end
  when "p", "br"
    if @keep_newlines
      characters("<br>", truncate: false, count_it: false, encode: false)
    else
      characters(" ")
    end
  when "aside"
    @in_quote = false
  when "details"
    @in_details_depth -= 1

    # Flush only when the outermost <details> closes. The depth is back to
    # 0 by now, so `characters` writes to the excerpt again.
    if @in_details_depth == 0
      @summary_contents = clean(@summary_contents)
      @detail_contents = clean(@detail_contents)

      if @current_length + @summary_contents.length >= @length
        # The summary alone uses up the budget: emit a disabled, body-less
        # <details> and let `characters` truncate the summary.
        characters(
          @summary_contents,
          encode: false,
          before_string: "<details class='disabled'><summary>",
          after_string: "</summary></details>",
        )
      else
        characters(
          @summary_contents,
          truncate: false,
          encode: false,
          before_string: "<details><summary>",
          after_string: "</summary>",
        )
        characters(@detail_contents, encode: false, after_string: "</details>")
      end
    end
  when "summary"
    @in_summary = false if @in_details_depth == 1
  when "div", "span"
    throw :done if @start_excerpt

    if @start_hashtag_icon
      characters("</span>", truncate: false, count_it: false, encode: false)
      # Close the placeholder once. If the flag stayed set, every later
      # div/span close would emit a stray </span>.
      @start_hashtag_icon = false
    end
  when "svg"
    # Close only an <svg> whose opening tag was emitted (@in_svg), the same
    # condition the opening-tag callback uses for <use>.
    characters("</svg>", truncate: false, count_it: false, encode: false) if @in_svg && @keep_svg
    @in_svg = false
  when "use"
    characters("</use>", truncate: false, count_it: false, encode: false) if @in_svg && @keep_svg
  end
end
# Trims surrounding whitespace from `str` and HTML-escapes what is left.
#
# @param str [String]
# @return [String] the escaped, trimmed text
def clean(str)
  trimmed = str.strip
  ERB::Util.html_escape(trimmed)
end
# Appends text to the excerpt. Also the SAX callback for text nodes.
#
# Nothing is written while inside a skipped quote. Inside <details> the text
# is buffered into the summary/detail buffers instead of the excerpt (and
# before_string/after_string are ignored). When the counted text would push
# past the limit, the excerpt is finished with an ellipsis and :done is
# thrown to stop parsing.
#
# @param string [#to_s] text to append
# @param truncate [Boolean] on overflow, append the part of `string` that
#   still fits (never done for a lone ":emoji:" token)
# @param count_it [Boolean] whether `string` counts against the limit
# @param encode [Boolean] whether to HTML-escape `string` before appending
# @param before_string [String, nil] raw markup written before `string`
# @param after_string [String, nil] raw markup written after `string`
def characters(
  string,
  truncate: true,
  count_it: true,
  encode: true,
  before_string: nil,
  after_string: nil
)
  return if @in_quote

  # we call length on this so might as well ensure we have a string
  string = string.to_s

  if @in_details_depth > 0
    (@in_summary ? @summary_contents : @detail_contents) << string
    return
  end

  @excerpt << before_string if before_string

  if count_it && @current_length + string.length > @length
    if truncate && !emoji?(string)
      # Inclusive end index of the slice that still fits. It is clamped to
      # 0, so at least one character is kept even with no budget left.
      last_index = [0, @length - @current_length - 1].max
      head = string[0..last_index]
      @excerpt << (encode ? ERB::Util.html_escape(head) : head)
    end

    @excerpt << (@text_entities ? "..." : "&hellip;")
    # Close a link that was left open so the excerpt stays balanced.
    @excerpt << "</a>" if @in_a
    @excerpt << after_string if after_string
    throw :done
  end

  @excerpt << (encode ? ERB::Util.html_escape(string) : string)
  @excerpt << after_string if after_string
  @current_length += string.length if count_it
end
# Tells whether `string` is a single emoji shortcode such as ":smile:"
# (colon, word characters, colon).
#
# @param string [String]
# @return [Boolean]
def emoji?(string)
  shortcode_pattern = /\A:\w+:\Z/
  string.match?(shortcode_pattern)
end
2013-05-28 07:48:47 +08:00
end