mirror of
https://github.com/discourse/discourse.git
synced 2025-03-27 06:55:36 +08:00
FIX: Allow all caps within CJK text (#28018)
This improves the `TextSentinel` so that we don't consider CJK text as being uppercase and thus failing the validator. It also optimizes the entropy computation by using Ruby's native `.bytes` to get all the bytes from the text. It also tweaks the `seems_pronounceable?` and `seems_unpretentious?` checks to use the `\p{Alnum}` Unicode regexp group to account for non-Latin languages. Reference - https://meta.discourse.org/t/body-seems-unclear-error-when-users-are-typing-in-chinese/88715 Inspired by https://github.com/discourse/discourse/pull/27900 Co-authored-by: Paulo Magalhaes <mentalstring@gmail.com>
This commit is contained in:
parent
f0487aa768
commit
23aa88d203
@ -1,12 +1,5 @@
|
||||
# frozen_string_literal: true
|
||||
|
||||
# We use ActiveSupport mb_chars from here to properly support non ascii downcase
|
||||
# TODO remove when ruby 2.4 lands
|
||||
require "active_support/core_ext/string/multibyte"
|
||||
|
||||
#
|
||||
# Given a string, tell us whether or not it is acceptable.
|
||||
#
|
||||
class TextSentinel
|
||||
attr_accessor :text
|
||||
|
||||
@ -43,11 +36,9 @@ class TextSentinel
|
||||
TextSentinel.new(text, min_entropy: entropy, max_word_length: SiteSetting.title_max_word_length)
|
||||
end
|
||||
|
||||
# Entropy is a number of how many unique characters the string needs.
|
||||
# Non-ASCII characters are weighted heavier since they contain more "information"
|
||||
# Number of unique bytes
|
||||
def entropy
|
||||
chars = @text.to_s.strip.split("")
|
||||
@entropy ||= chars.pack("M*" * chars.size).gsub("\n", "").split("=").uniq.size
|
||||
@entropy ||= @text.strip.bytes.uniq.size
|
||||
end
|
||||
|
||||
def valid?
|
||||
@ -55,39 +46,32 @@ class TextSentinel
|
||||
seems_quiet?
|
||||
end
|
||||
|
||||
# Ensure minimum entropy
|
||||
def seems_meaningful?
|
||||
# Minimum entropy if entropy check required
|
||||
@opts[:min_entropy].blank? || (entropy >= @opts[:min_entropy])
|
||||
@opts[:min_entropy].nil? || entropy >= @opts[:min_entropy]
|
||||
end
|
||||
|
||||
# At least one non-symbol character
|
||||
def seems_pronounceable?
|
||||
# At least some non-symbol characters
|
||||
# (We don't have a comprehensive list of symbols, but this will eliminate some noise)
|
||||
@text.gsub(symbols_regex, "").size > 0
|
||||
@text.match?(/\p{Alnum}/)
|
||||
end
|
||||
|
||||
# Ensure maximum word length
|
||||
def seems_unpretentious?
|
||||
return true if skipped_locale.include?(SiteSetting.default_locale)
|
||||
# Don't allow super long words if there is a word length maximum
|
||||
|
||||
@opts[:max_word_length].blank? ||
|
||||
(@text.split(%r{\s|/|-|\.|:}).map(&:size).max || 0) <= @opts[:max_word_length]
|
||||
skipped_locales.include?(SiteSetting.default_locale) || @opts[:max_word_length].nil? ||
|
||||
@text.scan(/\p{Alnum}+/).map(&:size).max.to_i <= @opts[:max_word_length]
|
||||
end
|
||||
|
||||
# Ensure the text is not written entirely in uppercase letters
|
||||
def seems_quiet?
|
||||
return true if skipped_locale.include?(SiteSetting.default_locale)
|
||||
# We don't allow all upper case content
|
||||
SiteSetting.allow_uppercase_posts || @text == @text.mb_chars.downcase.to_s ||
|
||||
@text != @text.mb_chars.upcase.to_s
|
||||
SiteSetting.allow_uppercase_posts || @text.match?(/\p{Lowercase_Letter}|\p{Other_Letter}/) ||
|
||||
!@text.match?(/\p{Letter}/)
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def symbols_regex
|
||||
/[\ -\/\[-\`\:-\@\{-\~]/m
|
||||
end
|
||||
|
||||
def skipped_locale
|
||||
%w[zh_CN zh_TW ko ja].freeze
|
||||
# Hard to tell "word length" for CJK languages
|
||||
def skipped_locales
|
||||
@skipped_locales ||= %w[ja ko zh_CN zh_TW].freeze
|
||||
end
|
||||
end
|
||||
|
@ -30,15 +30,15 @@ RSpec.describe TextSentinel do
|
||||
end
|
||||
|
||||
it "Works on foreign characters" do
|
||||
expect(TextSentinel.new("去年十社會警告").entropy).to eq(19)
|
||||
expect(TextSentinel.new("去年十社會警告").entropy).to eq(18)
|
||||
end
|
||||
|
||||
it "generates enough entropy for short foreign strings" do
|
||||
expect(TextSentinel.new("又一个测").entropy).to eq(11)
|
||||
expect(TextSentinel.new("又一个测").entropy).to eq(10)
|
||||
end
|
||||
|
||||
it "handles repeated foreign characters" do
|
||||
expect(TextSentinel.new("又一个测试话题" * 3).entropy).to eq(18)
|
||||
expect(TextSentinel.new("又一个测试话题" * 3).entropy).to eq(17)
|
||||
end
|
||||
end
|
||||
|
||||
@ -108,8 +108,7 @@ RSpec.describe TextSentinel do
|
||||
expect(TextSentinel.new("去年十二月,北韓不顧國際社會警告")).to be_valid
|
||||
end
|
||||
|
||||
it "skips uppercase text for CJK locale" do
|
||||
SiteSetting.default_locale = "zh_CN"
|
||||
it "allows all caps for CJK characters" do
|
||||
expect(TextSentinel.new("去年SHIER月,北韓不顧國際社會警告")).to be_valid
|
||||
end
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user