2019-04-30 08:25:53 +08:00
|
|
|
# frozen_string_literal: true
|
|
|
|
|
2013-04-10 17:00:50 +08:00
|
|
|
#
|
|
|
|
# Cleans up a piece of text (for example, a topic title)
|
|
|
|
#
|
2016-03-08 04:07:36 +08:00
|
|
|
|
2021-05-21 09:43:47 +08:00
|
|
|
# We use ActiveSupport's mb_chars here to properly support case conversion of non-ASCII characters
|
2016-03-08 04:07:36 +08:00
|
|
|
require "active_support/core_ext/string/multibyte"
|
|
|
|
|
2013-04-10 17:00:50 +08:00
|
|
|
# Applies a configurable series of clean-up rules to a piece of text:
# punctuation de-duplication, case fixes, whitespace normalization, trimming.
#
# Every rule is opt-in through the +opts+ hash passed to {TextCleaner.clean},
# except invalid-byte scrubbing and Unicode whitespace normalization, which
# always run.
class TextCleaner
  # Unicode space / line-separator / paragraph-separator characters that
  # {TextCleaner.normalize_whitespaces} rewrites to a plain ASCII space.
  WHITESPACES_REGEXP =
    /[\u00A0\u1680\u180E\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]/.freeze
  private_constant :WHITESPACES_REGEXP

  # Builds the option set used when cleaning titles.
  #
  # The "prettify" rules are gated on SiteSetting.title_prettify; interior
  # space fixing, trimming and zero-width-space removal are always enabled.
  # cf. http://meta.discourse.org/t/should-we-have-auto-replace-rules-in-titles/5687
  #
  # @return [Hash{Symbol => Object}] options understood by {TextCleaner.clean}
  def self.title_options
    prettify = SiteSetting.title_prettify

    {
      deduplicate_exclamation_marks: prettify,
      deduplicate_question_marks: prettify,
      replace_all_upper_case: prettify && !SiteSetting.allow_uppercase_posts,
      capitalize_first_letter: prettify,
      remove_all_periods_from_the_end: prettify,
      remove_extraneous_space: prettify && SiteSetting.title_remove_extraneous_space,
      fixes_interior_spaces: true,
      strip_whitespaces: true,
      strip_zero_width_spaces: true,
      # Turkish needs locale-aware case mapping (dotted / dotless i).
      case_option: SiteSetting.default_locale == "tr_TR" ? :turkic : nil,
    }
  end

  # Cleans +title+ using {TextCleaner.title_options}.
  #
  # @param title [String]
  # @return [String] the cleaned title
  def self.clean_title(title)
    TextCleaner.clean(title, TextCleaner.title_options)
  end

  # Returns a cleaned copy of +text+; the argument itself is never mutated.
  #
  # @param text [String] text to clean
  # @param opts [Hash] rule switches, all optional:
  #   :deduplicate_exclamation_marks, :deduplicate_question_marks,
  #   :replace_all_upper_case, :capitalize_first_letter,
  #   :remove_all_periods_from_the_end, :remove_extraneous_space,
  #   :fixes_interior_spaces, :strip_whitespaces, :strip_zero_width_spaces,
  #   and :case_option (forwarded to the case-mapping calls, e.g. :turkic)
  # @return [String] the cleaned text
  def self.clean(text, opts = {})
    # Work on a copy so a frozen or shared argument is left untouched.
    text = text.dup

    # Remove invalid byte sequences
    text.scrub!("")

    # Replace !!!!! with a single !
    text.gsub!(/!+/, "!") if opts[:deduplicate_exclamation_marks]

    # Replace ????? with a single ?
    text.gsub!(/\?+/, "?") if opts[:deduplicate_question_marks]

    # Replace all-caps text with regular case letters
    if opts[:replace_all_upper_case] && text == upcase(text.mb_chars, opts)
      text = downcase(text.mb_chars, opts).to_s
    end

    # Capitalize first letter, but only when entire first word is lowercase
    first, rest = text.split(" ", 2)
    if first && opts[:capitalize_first_letter] && first == downcase(first.mb_chars, opts)
      text = +"#{capitalize(first.mb_chars, opts)}#{rest ? " " + rest : ""}"
    end

    # Remove unnecessary periods at the end
    text.sub!(/([^.])\.+(\s*)\z/, '\1\2') if opts[:remove_all_periods_from_the_end]

    # Remove extraneous space before the end punctuation
    text.sub!(/\s+([!?]\s*)\z/, '\1') if opts[:remove_extraneous_space]

    # The next four steps are order-sensitive. Unicode spaces must become
    # plain spaces, and zero-width spaces must be gone, BEFORE runs of spaces
    # are collapsed and the ends are trimmed. Otherwise input such as
    # "a\u00A0\u00A0b" or "\u200b hello" keeps doubled or leading spaces.

    # Normalize whitespaces
    text = normalize_whitespaces(text)

    # Strip zero width spaces (String#strip does not treat U+200B as whitespace)
    text.gsub!(/\u200b/, "") if opts[:strip_zero_width_spaces]

    # Fixes interior spaces
    text.gsub!(/ +/, " ") if opts[:fixes_interior_spaces]

    # Strip whitespaces
    text.strip! if opts[:strip_whitespaces]

    text
  end

  # Replaces each Unicode whitespace character matched by WHITESPACES_REGEXP
  # with a regular space.
  #
  # @param text [String, nil]
  # @return [String, nil] a new string, or nil when +text+ is nil
  def self.normalize_whitespaces(text)
    text&.gsub(WHITESPACES_REGEXP, " ")
  end

  # Downcases +text+, passing opts[:case_option] through when it is set.
  #
  # @param text [#downcase] a String or multibyte chars proxy
  # @param opts [Hash] only :case_option is read
  def self.downcase(text, opts)
    opts[:case_option] ? text.downcase(opts[:case_option]) : text.downcase
  end

  # Upcases +text+, passing opts[:case_option] through when it is set.
  #
  # @param text [#upcase] a String or multibyte chars proxy
  # @param opts [Hash] only :case_option is read
  def self.upcase(text, opts)
    opts[:case_option] ? text.upcase(opts[:case_option]) : text.upcase
  end

  # Capitalizes +text+, passing opts[:case_option] through when it is set.
  #
  # @param text [#capitalize] a String or multibyte chars proxy
  # @param opts [Hash] only :case_option is read
  def self.capitalize(text, opts)
    opts[:case_option] ? text.capitalize(opts[:case_option]) : text.capitalize
  end
end
|