discourse/lib/excerpt_parser.rb

193 lines
5.3 KiB
Ruby
Raw Normal View History

2013-05-28 07:48:47 +08:00
class ExcerptParser < Nokogiri::XML::SAX::Document
attr_reader :excerpt
# Matches an opening <span> tag whose class attribute is exactly "excerpt",
# quoted with either single or double quotes. Inside a character class `|` is
# a literal pipe rather than alternation, so it must not be listed there.
SPAN_REGEX = /<\s*span[^>]*class\s*=\s*['"]excerpt['"][^>]*>/
2017-07-28 09:20:09 +08:00
# Sets up a parser that collects an excerpt of at most `length` counted
# characters.
#
# @param length [Integer] maximum number of counted characters in the excerpt
# @param options [Hash, nil] feature flags; a flag is enabled only when its
#   value is exactly `true`. `nil` is treated as "no flags set".
def initialize(length, options = nil)
  @length = length
  @excerpt = +""
  @current_length = 0

  # Must be an assignment: a bare `options || {}` throws its result away and
  # leaves `options` nil, which then fails on the first `options[...]` lookup.
  options ||= {}

  @strip_links = options[:strip_links] == true
  @strip_images = options[:strip_images] == true
  @text_entities = options[:text_entities] == true
  @markdown_images = options[:markdown_images] == true
  @keep_newlines = options[:keep_newlines] == true
  @keep_emoji_images = options[:keep_emoji_images] == true
  @keep_onebox_source = options[:keep_onebox_source] == true
  @remap_emoji = options[:remap_emoji] == true

  # Parsing state, toggled by the element callbacks while the document is read.
  @start_excerpt = false
  @in_a = false
  @in_quote = false
  @in_spoiler = false
  @in_summary = false
  @in_details_depth = 0
  @summary_contents = +""
  @detail_contents = +""
end
# Parses `html` and returns a plain-ish text excerpt of it.
#
# @param html [String, nil] markup to summarise; nil is treated as empty
# @param length [Integer] maximum number of counted characters; ignored (the
#   whole input length is used) when the markup contains an explicit
#   <span class="excerpt"> marker
# @param options [Hash, nil] same flags the constructor accepts; nil means none
# @return [String] the stripped excerpt
def self.get_excerpt(html, length, options)
  html ||= ''
  # Normalise once so both the constructor and the lookups below are nil-safe.
  options ||= {}

  # The cheap substring test avoids running the regex on most inputs.
  length = html.length if html.include?('excerpt') && SPAN_REGEX.match?(html)

  me = new(length, options)
  parser = Nokogiri::HTML::SAX::Parser.new(me)

  # The handler throws :done as soon as the excerpt is complete.
  catch(:done) { parser.parse(html) }

  excerpt = me.excerpt.strip
  excerpt = excerpt.gsub(/\s*\n+\s*/, "\n\n") if options[:keep_onebox_source]
  excerpt = CGI.unescapeHTML(excerpt) if options[:text_entities] == true
  excerpt
end
2014-07-25 10:15:43 +08:00
# Escapes the characters that are unsafe inside a double-quoted HTML attribute
# value. Returns an empty string for nil/false and never mutates the argument.
def escape_attribute(v)
  return "" unless v

  replacements = {
    "&" => "&amp;",
    "\"" => "&#34;",
    "<" => "&lt;",
    ">" => "&gt;"
  }

  # One pass over the input: replacement text is never re-scanned, so the
  # ampersands introduced by the entities are not escaped a second time.
  v.gsub(/[&"<>]/, replacements)
end
# Writes an opening tag for `name`, with the given attributes, into the
# excerpt. The markup is emitted verbatim: it is not truncated, not counted
# towards the length limit and not HTML-encoded again.
#
# @param name [String] tag name
# @param attributes [Enumerable] name/value pairs (array of pairs or a Hash)
def include_tag(name, attributes)
  rendered = attributes.map do |key, value|
    %(#{key}="#{escape_attribute(value)}")
  end

  characters("<#{name} #{rendered.join(' ')}>", false, false, false)
end
2017-07-28 09:20:09 +08:00
# SAX callback for an opening tag. Emits text/markup for images, links and
# spoilers, and updates the parsing state for quotes, oneboxes, explicit
# excerpt markers and <details>/<summary> blocks.
#
# @param name [String] tag name
# @param attributes [Array<Array>] attribute name/value pairs
def start_element(name, attributes = [])
  case name
  when "img"
    attributes = attributes.to_h

    if attributes["class"]&.include?('emoji')
      if @remap_emoji
        title = (attributes["alt"] || "").gsub(":", "")
        # Fall back to the original alt text when no unicode mapping exists.
        title = Emoji.lookup_unicode(title) || attributes["alt"]
        return characters(title)
      elsif @keep_emoji_images
        return include_tag(name, attributes)
      else
        return characters(attributes["alt"])
      end
    end

    unless @strip_images
      # With markdown_images set, the image is written as ![label](src);
      # otherwise only the bracketed label is written.
      characters("!") if @markdown_images

      if !attributes["alt"].blank?
        characters("[#{attributes["alt"]}]")
      elsif !attributes["title"].blank?
        characters("[#{attributes["title"]}]")
      else
        characters("[#{I18n.t 'excerpt_image'}]")
      end

      characters("(#{attributes['src']})") if @markdown_images
    end
  when "a"
    unless @strip_links
      include_tag(name, attributes)
      @in_a = true
    end
  when "aside"
    attributes = attributes.to_h
    # An <aside> may carry no class attribute at all, hence the safe
    # navigation; such an aside is treated as a quote and suppressed.
    unless @keep_onebox_source && attributes['class']&.include?('onebox')
      @in_quote = true
    end
  when 'article'
    if @keep_onebox_source && attributes.include?(['class', 'onebox-body'])
      @in_quote = true
    end
  when "div", "span"
    # An explicit excerpt marker discards everything collected so far.
    if attributes.include?(["class", "excerpt"])
      @excerpt = ""
      @current_length = 0
      @start_excerpt = true
    end

    # Preserve spoilers
    if attributes.include?(["class", "spoiler"])
      include_tag("span", attributes)
      @in_spoiler = true
    end
  when "details"
    # Only the outermost <details> resets the collected body.
    @detail_contents = "" if @in_details_depth == 0
    @in_details_depth += 1
  when "summary"
    if @in_details_depth == 1 && !@in_summary
      @summary_contents = ""
      @in_summary = true
    end
  end
end
# SAX callback for a closing tag. Closes links and spoilers, turns paragraph
# and line breaks into whitespace (or <br>), leaves quote mode, flushes a
# finished outermost <details> block and stops parsing once an explicit
# excerpt marker element is closed.
#
# @param name [String] tag name
def end_element(name)
  case name
  when "a"
    unless @strip_links
      characters("</a>", false, false, false)
      @in_a = false
    end
  when "p", "br"
    if @keep_newlines
      characters("<br>", false, false, false)
    else
      characters(" ")
    end
  when "aside"
    @in_quote = false
  when "details"
    @in_details_depth -= 1
    if @in_details_depth == 0
      full = "<details><summary>#{clean(@summary_contents)}</summary>#{clean(@detail_contents)}</details>"
      if @current_length + full.length > @length
        # Too long to include whole: keep only the (truncated) summary. It is
        # raw collected text, so it must be escaped just like the full form.
        @excerpt << "<details class='disabled'><summary>#{clean(@summary_contents[0..@length])}</summary></details>"
      else
        @excerpt << full
      end
    end
  when "summary"
    @in_summary = false if @in_details_depth == 1
  when "div", "span"
    # Closing the explicit excerpt marker ends parsing immediately.
    throw :done if @start_excerpt
    characters("</span>", false, false, false) if @in_spoiler
    @in_spoiler = false
  end
end
# Trims surrounding whitespace from `str` and HTML-escapes what is left.
def clean(str)
  trimmed = str.strip
  ERB::Util.html_escape(trimmed)
end
2013-05-28 07:48:47 +08:00
# Appends text to the excerpt, enforcing the length limit.
#
# Nothing is written while inside a quote. While inside a <details> block the
# text is diverted, unencoded, into the summary or body buffer instead.
# When a counted string would exceed the limit, an optional truncated head is
# written, followed by an ellipsis (and a closing </a> if a link is open), and
# :done is thrown to stop parsing.
#
# @param string [#to_s] text to append
# @param truncate [Boolean] write the part that still fits when over the limit
# @param count_it [Boolean] whether the text counts towards the limit
# @param encode [Boolean] whether to HTML-escape the text before writing
def characters(string, truncate = true, count_it = true, encode = true)
  return if @in_quote

  # Callers may pass nil (e.g. a missing alt attribute); length is needed below.
  text = string.to_s

  if @in_details_depth > 0
    buffer = @in_summary ? @summary_contents : @detail_contents
    buffer << text
    return
  end

  if count_it && @current_length + text.length > @length
    if truncate
      # Inclusive end index; clamped at 0, so one character is still kept
      # even when no room remains.
      last_index = [0, @length - @current_length - 1].max
      head = text[0..last_index]
      @excerpt << (encode ? ERB::Util.html_escape(head) : head)
    end

    @excerpt << (@text_entities ? "..." : "&hellip;")
    @excerpt << "</a>" if @in_a
    throw :done
  end

  @excerpt << (encode ? ERB::Util.html_escape(text) : text)
  @current_length += text.length if count_it
end
end