# heavily based off # https://github.com/vmg/redcarpet/blob/master/ext/redcarpet/html_smartypants.c # and # https://github.com/jmcnevin/rubypants/blob/master/lib/rubypants/core.rb # 99% of the code here is by Jeremy McNevin # # This Source File is available under BSD/MIT license as well as standard GPL # class HtmlPrettify < String def self.render(html) new(html).to_html end # Create a new RubyPants instance with the text in +string+. # # Allowed elements in the options array: # # 0 :: do nothing # 1 :: enable all, using only em-dash shortcuts # 2 :: enable all, using old school en- and em-dash shortcuts (*default*) # 3 :: enable all, using inverted old school en and em-dash shortcuts # -1 :: stupefy (translate HTML entities to their ASCII-counterparts) # # If you don't like any of these defaults, you can pass symbols to change # RubyPants' behavior: # # :quotes :: quotes # :backticks :: backtick quotes (``double'' only) # :allbackticks :: backtick quotes (``double'' and `single') # :dashes :: dashes # :oldschool :: old school dashes # :inverted :: inverted old school dashes # :ellipses :: ellipses # :convertquotes :: convert " entities to # " # :stupefy :: translate RubyPants HTML entities # to their ASCII counterparts. # # In addition, you can customize the HTML entities that will be injected by # passing in a hash for the final argument. The defaults for these entities # are as follows: # # :single_left_quote :: ‘ # :double_left_quote :: “ # :single_right_quote :: ’ # :double_right_quote :: ” # :em_dash :: — # :en_dash :: – # :ellipsis :: … # :html_quote :: " # def initialize(string, options=[2], entities = {}) super string @options = [*options] @entities = default_entities.update(entities) end # Apply SmartyPants transformations. def to_html do_quotes = do_backticks = do_dashes = do_ellipses = nil if @options.include?(0) # Do nothing. return self elsif @options.include?(1) # Do everything, turn all options on. do_quotes = do_backticks = do_ellipses = true do_dashes = :normal elsif @options.include?(2) # Do everything, turn all options on, use old school dash shorthand. do_quotes = do_backticks = do_ellipses = true do_dashes = :oldschool elsif @options.include?(3) # Do everything, turn all options on, use inverted old school # dash shorthand. do_quotes = do_backticks = do_ellipses = true do_dashes = :inverted elsif @options.include?(-1) do_stupefy = true else do_quotes = @options.include?(:quotes) do_backticks = @options.include?(:backticks) do_backticks = :both if @options.include?(:allbackticks) do_dashes = :normal if @options.include?(:dashes) do_dashes = :oldschool if @options.include?(:oldschool) do_dashes = :inverted if @options.include?(:inverted) do_ellipses = @options.include?(:ellipses) do_stupefy = @options.include?(:stupefy) end # Parse the HTML tokens = tokenize # Keep track of when we're inside
or tags.
in_pre = false
# Here is the result stored in.
result = ""
# This is a cheat, used to get some context for one-character
# tokens that consist of just a quote char. What we do is remember
# the last character of the previous text token, to use as context
# to curl single- character quote tokens correctly.
prev_token_last_char = nil
tokens.each do |token|
if token.first == :tag
result << token[1]
if token[1] =~ %r!<(/?)(?:pre|code|kbd|script|math)[\s>]!
in_pre = ($1 != "/") # Opening or closing tag?
end
else
t = token[1]
# Remember last char of this token before processing.
last_char = t[-1].chr
unless in_pre
t.gsub!("'", "'")
t = process_escapes t
t.gsub!(""", '"')
if do_dashes
t = educate_dashes t if do_dashes == :normal
t = educate_dashes_oldschool t if do_dashes == :oldschool
t = educate_dashes_inverted t if do_dashes == :inverted
end
t = educate_ellipses t if do_ellipses
t = educate_fractions t
# Note: backticks need to be processed before quotes.
if do_backticks
t = educate_backticks t
t = educate_single_backticks t if do_backticks == :both
end
if do_quotes
if t == "'"
# Special case: single-character ' token
if prev_token_last_char =~ /\S/
t = entity(:single_right_quote)
else
t = entity(:single_left_quote)
end
elsif t == '"'
# Special case: single-character " token
if prev_token_last_char =~ /\S/
t = entity(:double_right_quote)
else
t = entity(:double_left_quote)
end
else
# Normal case:
t = educate_quotes t
end
end
t = stupefy_entities t if do_stupefy
end
prev_token_last_char = last_char
result << t
end
end
# Done
result
end
protected
# Return the string, with after processing the following backslash
# escape sequences. This is useful if you want to force a "dumb" quote
# or other character to appear.
#
# Escaped are:
# \\ \" \' \. \- \`
#
def process_escapes(str)
str = str.gsub('\\\\', '\')
str.gsub!('\"', '"')
str.gsub!("\\\'", ''')
str.gsub!('\.', '.')
str.gsub!('\-', '-')
str.gsub!('\`', '`')
str
end
# The string, with each instance of "--" translated to an
# em-dash HTML entity.
#
def educate_dashes(str)
str.
gsub(/--/, entity(:em_dash))
end
# The string, with each instance of "--" translated to an
# en-dash HTML entity, and each "---" translated to an
# em-dash HTML entity.
#
def educate_dashes_oldschool(str)
str.
gsub(/---/, entity(:em_dash)).
gsub(/--/, entity(:en_dash))
end
# Return the string, with each instance of "--" translated
# to an em-dash HTML entity, and each "---" translated to
# an en-dash HTML entity. Two reasons why: First, unlike the en- and
# em-dash syntax supported by +educate_dashes_oldschool+, it's
# compatible with existing entries written before SmartyPants 1.1,
# back when "--" was only used for em-dashes. Second,
# em-dashes are more common than en-dashes, and so it sort of makes
# sense that the shortcut should be shorter to type. (Thanks to
# Aaron Swartz for the idea.)
#
def educate_dashes_inverted(str)
str.
gsub(/---/, entity(:en_dash)).
gsub(/--/, entity(:em_dash))
end
# Return the string, with each instance of "..." translated
# to an ellipsis HTML entity. Also converts the case where there are
# spaces between the dots.
#
def educate_ellipses(str)
str.
gsub('...', entity(:ellipsis)).
gsub('. . .', entity(:ellipsis))
end
# Return the string, with "``backticks''"-style single quotes
# translated into HTML curly quote entities.
#
def educate_backticks(str)
str.
gsub("``", entity(:double_left_quote)).
gsub("''", entity(:double_right_quote))
end
# Return the string, with "`backticks'"-style single quotes
# translated into HTML curly quote entities.
#
def educate_single_backticks(str)
str.
gsub("`", entity(:single_left_quote)).
gsub("'", entity(:single_right_quote))
end
def educate_fractions(str)
str.gsub(/(\s+|^)(1\/4|1\/2|3\/4)([,.;\s]|$)/) do
frac =
if $2 == "1/2".freeze
entity(:frac12)
elsif $2 == "1/4".freeze
entity(:frac14)
elsif $2 == "3/4".freeze
entity(:frac34)
end
"#{$1}#{frac}#{$3}"
end
end
# Return the string, with "educated" curly quote HTML entities.
#
def educate_quotes(str)
punct_class = '[!"#\$\%\'()*+,\-.\/:;<=>?\@\[\\\\\]\^_`{|}~]'
# normalize html
str = str.dup
# Special case if the very first character is a quote followed by
# punctuation at a non-word-break. Close the quotes by brute
# force:
str.gsub!(/^'(?=#{punct_class}\B)/,
entity(:single_right_quote))
str.gsub!(/^"(?=#{punct_class}\B)/,
entity(:double_right_quote))
# Special case for double sets of quotes, e.g.:
# He said, "'Quoted' words in a larger quote."
str.gsub!(/"'(?=\w)/,
"#{entity(:double_left_quote)}#{entity(:single_left_quote)}")
str.gsub!(/'"(?=\w)/,
"#{entity(:single_left_quote)}#{entity(:double_left_quote)}")
# Special case for decade abbreviations (the '80s):
str.gsub!(/'(?=\d\ds)/,
entity(:single_right_quote))
close_class = %![^\ \t\r\n\\[\{\(\-]!
dec_dashes = "#{entity(:en_dash)}|#{entity(:em_dash)}"
# Get most opening single quotes:
str.gsub!(/(\s| |=|--|&[mn]dash;|#{dec_dashes}|ȁ[34];)'(?=\w)/,
'\1' + entity(:single_left_quote))
# Single closing quotes:
str.gsub!(/(#{close_class})'/,
'\1' + entity(:single_right_quote))
str.gsub!(/'(\s|s\b|$)/,
entity(:single_right_quote) + '\1')
# Any remaining single quotes should be opening ones:
str.gsub!(/'/,
entity(:single_left_quote))
# Get most opening double quotes:
str.gsub!(/(\s| |=|--|&[mn]dash;|#{dec_dashes}|ȁ[34];)"(?=\w)/,
'\1' + entity(:double_left_quote))
# Double closing quotes:
str.gsub!(/(#{close_class})"/,
'\1' + entity(:double_right_quote))
str.gsub!(/"(\s|s\b|$)/,
entity(:double_right_quote) + '\1')
# Any remaining quotes should be opening ones:
str.gsub!(/"/,
entity(:double_left_quote))
str
end
# Return the string, with each RubyPants HTML entity translated to
# its ASCII counterpart.
#
# Note: This is not reversible (but exactly the same as in SmartyPants)
#
def stupefy_entities(str)
new_str = str.dup
{
:en_dash => '-',
:em_dash => '--',
:single_left_quote => "'",
:single_right_quote => "'",
:double_left_quote => '"',
:double_right_quote => '"',
:ellipsis => '...'
}.each do |k,v|
new_str.gsub!(/#{entity(k)}/, v)
end
new_str
end
# Return an array of the tokens comprising the string. Each token is
# either a tag (possibly with nested, tags contained therein, such
# as , or a run of text between
# tags. Each element of the array is a two-element array; the first
# is either :tag or :text; the second is the actual value.
#
# Based on the _tokenize() subroutine from Brad Choate's
# MTRegex plugin.
#
# This is actually the easier variant using tag_soup, as used by
# Chad Miller in the Python port of SmartyPants.
#
def tokenize
tag_soup = /([^<]*)(<[^>]*>)/
tokens = []
prev_end = 0
scan(tag_soup) do
tokens << [:text, $1] if $1 != ""
tokens << [:tag, $2]
prev_end = $~.end(0)
end
if prev_end < size
tokens << [:text, self[prev_end..-1]]
end
tokens
end
def default_entities
{
single_left_quote: "‘",
double_left_quote: "“",
single_right_quote: "’",
double_right_quote: "”",
em_dash: "—",
en_dash: "–",
ellipsis: "…",
html_quote: """,
frac12: "½",
frac14: "¼",
frac34: "¾",
}
end
def entity(key)
@entities[key]
end
end