mirror of
https://github.com/discourse/discourse.git
synced 2024-11-26 23:06:19 +08:00
30990006a9
This reduces the chance of errors where consumers of strings mutate their inputs, and reduces the memory usage of the app. The test suite passes now, but there may be some cases left, so we will run a few sites on a branch prior to merging.
390 lines
12 KiB
Ruby
390 lines
12 KiB
Ruby
# frozen_string_literal: true

# heavily based on
# https://github.com/vmg/redcarpet/blob/master/ext/redcarpet/html_smartypants.c
# and
# https://github.com/jmcnevin/rubypants/blob/master/lib/rubypants/core.rb
# 99% of the code here is by Jeremy McNevin
#
# This Source File is available under BSD/MIT license as well as standard GPL
#

# A String subclass that applies SmartyPants-style typographic clean-up to an
# HTML fragment: straight quotes become curly-quote entities, runs of hyphens
# become dash entities, three dots become an ellipsis entity and the simple
# fractions 1/4, 1/2 and 3/4 become fraction entities. Markup itself is never
# altered, and text inside <pre>, <code>, <kbd>, <script> and <math> elements
# is copied through untouched.
class HtmlPrettify < String
  # Matches an opening or closing tag of an element whose text must be left
  # alone. Capture 1 is "/" for a closing tag and "" for an opening tag.
  VERBATIM_TAG = %r{<(/?)(?:pre|code|kbd|script|math)[\s>]}

  # Regexp character class (as source text) of ASCII punctuation.
  PUNCT_CLASS = '[!"#\$\%\'()*+,\-.\/:;<=>?\@\[\\\\\]\^_`{|}~]'.freeze

  # Regexp character class (as source text) of the characters after which a
  # quote is treated as a closing quote: anything except whitespace, "[", "{",
  # "(" and "-".
  CLOSE_CLASS = '[^ \t\r\n\[{(\-]'.freeze

  # Plain-text fraction => key of the entity that replaces it.
  FRACTION_ENTITIES = { "1/2" => :frac12, "1/4" => :frac14, "3/4" => :frac34 }.freeze

  # Entity key => ASCII replacement used by +stupefy_entities+, in the order
  # in which the replacements are applied.
  STUPEFIED = {
    en_dash: "-",
    em_dash: "--",
    single_left_quote: "'",
    single_right_quote: "'",
    double_left_quote: '"',
    double_right_quote: '"',
    ellipsis: "...",
  }.freeze

  # Convenience wrapper: prettify +html+ with the default options and
  # entities and return the result.
  def self.render(html)
    new(html).to_html
  end

  # Create a new HtmlPrettify instance with the text in +string+.
  #
  # +options+ is a single value or an array of values. If it contains one of
  # the numeric presets below, that preset wins (checked in the order
  # 0, 1, 2, 3, -1) and any symbols are ignored:
  #
  #  0  :: do nothing
  #  1  :: enable all, using only em-dash shortcuts
  #  2  :: enable all, using old school en- and em-dash shortcuts (*default*)
  #  3  :: enable all, using inverted old school en and em-dash shortcuts
  #  -1 :: stupefy (translate HTML entities to their ASCII-counterparts)
  #
  # Without a numeric preset, the following symbols switch individual
  # transformations on:
  #
  #  <tt>:quotes</tt>       :: quotes
  #  <tt>:backticks</tt>    :: backtick quotes (``double'' only)
  #  <tt>:allbackticks</tt> :: backtick quotes (``double'' and `single')
  #  <tt>:dashes</tt>       :: dashes
  #  <tt>:oldschool</tt>    :: old school dashes
  #  <tt>:inverted</tt>     :: inverted old school dashes
  #  <tt>:ellipses</tt>     :: ellipses
  #  <tt>:stupefy</tt>      :: translate the HTML entities back to their
  #                            ASCII counterparts
  #
  # Regardless of the options (other than 0), <tt>&quot;</tt> and
  # <tt>&#39;</tt> are turned back into plain quote characters and fractions
  # are converted in every text run outside the verbatim elements.
  #
  # +entities+ is a hash that overrides the HTML entities that are injected.
  # The defaults are:
  #
  #  <tt>:single_left_quote</tt>  :: <tt>&lsquo;</tt>
  #  <tt>:double_left_quote</tt>  :: <tt>&ldquo;</tt>
  #  <tt>:single_right_quote</tt> :: <tt>&rsquo;</tt>
  #  <tt>:double_right_quote</tt> :: <tt>&rdquo;</tt>
  #  <tt>:em_dash</tt>            :: <tt>&mdash;</tt>
  #  <tt>:en_dash</tt>            :: <tt>&ndash;</tt>
  #  <tt>:ellipsis</tt>           :: <tt>&hellip;</tt>
  #  <tt>:html_quote</tt>         :: <tt>&quot;</tt> (kept for compatibility;
  #                                  no transformation in this class reads it)
  #  <tt>:frac12</tt>             :: <tt>&frac12;</tt>
  #  <tt>:frac14</tt>             :: <tt>&frac14;</tt>
  #  <tt>:frac34</tt>             :: <tt>&frac34;</tt>
  #
  def initialize(string, options = [2], entities = {})
    super(string)

    @options = [*options]
    @entities = default_entities.update(entities)
  end

  # Apply the configured transformations and return the resulting HTML.
  #
  # With option 0 the receiver itself is returned unchanged; otherwise a new
  # String is built.
  def to_html
    return self if @options.include?(0)

    flags = transformation_flags

    # True while we are between an opening and a closing verbatim tag.
    in_verbatim = false

    # Last character of the previous text token *before* it was processed.
    # It gives context for text tokens that consist of a single quote
    # character, which cannot be curled correctly on their own.
    prev_token_last_char = nil

    result = +""

    tokenize.each do |kind, value|
      if kind == :tag
        result << value
        if (tag = VERBATIM_TAG.match(value))
          in_verbatim = tag[1] != "/" # opening or closing tag?
        end
      else
        last_char = value[-1]
        value = educate_text(value, prev_token_last_char, flags) unless in_verbatim
        prev_token_last_char = last_char
        result << value
      end
    end

    result
  end

  protected

  # Translate @options into a hash of switches with the keys +:quotes+,
  # +:backticks+ (false, true or :both), +:dashes+ (nil, :normal, :oldschool
  # or :inverted), +:ellipses+ and +:stupefy+.
  #
  # Not called for preset 0, which +to_html+ handles itself.
  def transformation_flags
    everything = { quotes: true, backticks: true, ellipses: true, stupefy: false }

    if @options.include?(1)
      everything.merge(dashes: :normal)
    elsif @options.include?(2)
      everything.merge(dashes: :oldschool)
    elsif @options.include?(3)
      everything.merge(dashes: :inverted)
    elsif @options.include?(-1)
      { quotes: false, backticks: false, ellipses: false, dashes: nil, stupefy: true }
    else
      # Later dash styles override earlier ones when several are given.
      dashes = nil
      dashes = :normal if @options.include?(:dashes)
      dashes = :oldschool if @options.include?(:oldschool)
      dashes = :inverted if @options.include?(:inverted)

      {
        quotes: @options.include?(:quotes),
        backticks: @options.include?(:allbackticks) ? :both : @options.include?(:backticks),
        ellipses: @options.include?(:ellipses),
        dashes: dashes,
        stupefy: @options.include?(:stupefy),
      }
    end
  end

  # Run every enabled transformation over one text token and return the new
  # text. +prev_token_last_char+ is the last character of the preceding text
  # token (or nil) and +flags+ is the hash from +transformation_flags+.
  def educate_text(text, prev_token_last_char, flags)
    # Undo entity-encoded quotes so the quote rules below can see them.
    t = text.gsub("&#39;", "'").gsub("&quot;", '"')

    case flags[:dashes]
    when :normal then t = educate_dashes(t)
    when :oldschool then t = educate_dashes_oldschool(t)
    when :inverted then t = educate_dashes_inverted(t)
    end

    t = educate_ellipses(t) if flags[:ellipses]
    t = educate_fractions(t)

    # Note: backticks need to be processed before quotes.
    if flags[:backticks]
      t = educate_backticks(t)
      t = educate_single_backticks(t) if flags[:backticks] == :both
    end

    if flags[:quotes]
      # A lone quote token closes if the previous text ended in a
      # non-whitespace character, and opens otherwise.
      closing = /\S/.match?(prev_token_last_char)
      t =
        case t
        when "'"
          entity(closing ? :single_right_quote : :single_left_quote)
        when '"'
          entity(closing ? :double_right_quote : :double_left_quote)
        else
          educate_quotes(t)
        end
    end

    t = stupefy_entities(t) if flags[:stupefy]
    t
  end

  # The string, with each instance of "<tt>--</tt>" translated to an
  # em-dash HTML entity.
  #
  def educate_dashes(str)
    str.gsub("--", entity(:em_dash))
  end

  # The string, with each instance of "<tt>--</tt>" translated to an
  # en-dash HTML entity, and each "<tt>---</tt>" translated to an
  # em-dash HTML entity.
  #
  def educate_dashes_oldschool(str)
    # Three hyphens first, otherwise "--" would eat part of every "---".
    str.gsub("---", entity(:em_dash)).gsub("--", entity(:en_dash))
  end

  # Return the string, with each instance of "<tt>--</tt>" translated
  # to an em-dash HTML entity, and each "<tt>---</tt>" translated to
  # an en-dash HTML entity. Two reasons why: First, unlike the en- and
  # em-dash syntax supported by +educate_dashes_oldschool+, it's
  # compatible with existing entries written before SmartyPants 1.1,
  # back when "<tt>--</tt>" was only used for em-dashes. Second,
  # em-dashes are more common than en-dashes, and so it sort of makes
  # sense that the shortcut should be shorter to type. (Thanks to
  # Aaron Swartz for the idea.)
  #
  def educate_dashes_inverted(str)
    str.gsub("---", entity(:en_dash)).gsub("--", entity(:em_dash))
  end

  # Return the string, with each instance of "<tt>...</tt>" translated
  # to an ellipsis HTML entity. Also converts the case where there are
  # spaces between the dots.
  #
  def educate_ellipses(str)
    str.gsub("...", entity(:ellipsis)).gsub(". . .", entity(:ellipsis))
  end

  # Return the string, with "<tt>``backticks''</tt>"-style double quotes
  # translated into HTML curly quote entities.
  #
  def educate_backticks(str)
    str.gsub("``", entity(:double_left_quote)).gsub("''", entity(:double_right_quote))
  end

  # Return the string, with "<tt>`backticks'</tt>"-style single quotes
  # translated into HTML curly quote entities.
  #
  def educate_single_backticks(str)
    str.gsub("`", entity(:single_left_quote)).gsub("'", entity(:single_right_quote))
  end

  # Return the string, with a stand-alone 1/4, 1/2 or 3/4 translated into the
  # matching fraction entity. A fraction only counts when it is preceded by
  # whitespace or a line start and followed by ",", ".", ";", whitespace or a
  # line end, so dates such as 1/4/2000 are left alone.
  def educate_fractions(str)
    str.gsub(%r{(\s+|^)(1/4|1/2|3/4)([,.;\s]|$)}) do
      m = Regexp.last_match
      "#{m[1]}#{entity(FRACTION_ENTITIES.fetch(m[2]))}#{m[3]}"
    end
  end

  # Return the string, with "educated" curly quote HTML entities.
  #
  def educate_quotes(str)
    # Work on a copy; the rules below edit in place and depend on their order.
    str = str.dup

    # Special case if the very first character is a quote followed by
    # punctuation at a non-word-break. Close the quotes by brute
    # force:
    str.gsub!(/^'(?=#{PUNCT_CLASS}\B)/, entity(:single_right_quote))
    str.gsub!(/^"(?=#{PUNCT_CLASS}\B)/, entity(:double_right_quote))

    # Special case for double sets of quotes, e.g.:
    #   <p>He said, "'Quoted' words in a larger quote."</p>
    str.gsub!(/"'(?=\w)/, "#{entity(:double_left_quote)}#{entity(:single_left_quote)}")
    str.gsub!(/'"(?=\w)/, "#{entity(:single_left_quote)}#{entity(:double_left_quote)}")

    # Special case for decade abbreviations (the '80s):
    str.gsub!(/'(?=\d\ds)/, entity(:single_right_quote))

    # The configured dash entities, escaped so that custom entities containing
    # regexp metacharacters are matched literally.
    dec_dashes = "#{Regexp.escape(entity(:en_dash))}|#{Regexp.escape(entity(:em_dash))}"

    # Get most opening single quotes:
    str.gsub!(/(\s|&nbsp;|=|--|&[mn]dash;|#{dec_dashes}|&#x201[34];)'(?=\w)/,
              '\1' + entity(:single_left_quote))

    # Single closing quotes:
    str.gsub!(/(#{CLOSE_CLASS})'/, '\1' + entity(:single_right_quote))
    str.gsub!(/'(\s|s\b|$)/, entity(:single_right_quote) + '\1')

    # Any remaining single quotes should be opening ones:
    str.gsub!(/'/, entity(:single_left_quote))

    # Get most opening double quotes:
    str.gsub!(/(\s|&nbsp;|=|--|&[mn]dash;|#{dec_dashes}|&#x201[34];)"(?=\w)/,
              '\1' + entity(:double_left_quote))

    # Double closing quotes:
    str.gsub!(/(#{CLOSE_CLASS})"/, '\1' + entity(:double_right_quote))
    str.gsub!(/"(\s|s\b|$)/, entity(:double_right_quote) + '\1')

    # Any remaining quotes should be opening ones:
    str.gsub!(/"/, entity(:double_left_quote))

    str
  end

  # Return a copy of the string, with each configured HTML entity translated
  # to its ASCII counterpart.
  #
  # Note: This is not reversible (but exactly the same as in SmartyPants)
  #
  def stupefy_entities(str)
    # The entity is matched as a literal string, never as a regexp.
    STUPEFIED.reduce(str) { |text, (key, ascii)| text.gsub(entity(key), ascii) }
  end

  # Return an array of the tokens comprising the string. Each token is
  # either a tag or a run of text between tags. Each element of the array
  # is a two-element array; the first is either :tag or :text; the second
  # is the actual value. Empty text runs are not emitted.
  #
  # A tag is everything from a "<" up to the next ">", so this is a
  # tag-soup scan rather than a real HTML parse.
  #
  def tokenize
    tokens = []
    consumed = 0

    scan(/([^<]*)(<[^>]*>)/) do |text, tag|
      tokens << [:text, text] unless text.empty?
      tokens << [:tag, tag]
      consumed = Regexp.last_match.end(0)
    end

    # Whatever follows the last tag (or the whole string when there are no
    # tags at all) is one final text token.
    tokens << [:text, self[consumed..-1]] if consumed < size

    tokens
  end

  # A fresh hash of the built-in entities, keyed by the names that
  # +initialize+ accepts as overrides.
  def default_entities
    {
      single_left_quote: "&lsquo;",
      double_left_quote: "&ldquo;",
      single_right_quote: "&rsquo;",
      double_right_quote: "&rdquo;",
      em_dash: "&mdash;",
      en_dash: "&ndash;",
      ellipsis: "&hellip;",
      html_quote: "&quot;",
      frac12: "&frac12;",
      frac14: "&frac14;",
      frac34: "&frac34;",
    }
  end

  # Look up the entity configured under +key+ (nil for an unknown key).
  def entity(key)
    @entities[key]
  end
end
|