# Patch summary (descriptive preamble; everything from the first "diff --git" header onward is the unchanged patch).
#
# lib/text_cleaner.rb
#   Narrows the character set matched by @@whitespaces_regexp, the pattern that
#   TextCleaner.normalize_whitespaces passes to gsub to replace each match with ' ':
#     - the range [\u2000-\u200B] becomes [\u2000-\u200A], so U+200B is no longer matched;
#     - the \uFEFF alternative is removed, so U+FEFF is no longer matched.
#   All other listed code points (U+00A0, U+1680, U+180E, U+2000..U+200A, U+2028,
#   U+2029, U+202F, U+205F, U+3000) are still replaced with a plain space.
#
# spec/components/text_cleaner_spec.rb
#   - Drops \u200B and \uFEFF from the sample string used by the existing
#     "normalize whitespaces" example, matching the narrowed regexp.
#   - Adds an example asserting that "hello\u200Bworld" and "hello\uFEFFworld"
#     come back unchanged from normalize_whitespaces. The comment added in that
#     example gives the motivation: the zero-width space is used for Khmer.
diff --git a/lib/text_cleaner.rb b/lib/text_cleaner.rb index da12624bc95..6d621ec9fba 100644 --- a/lib/text_cleaner.rb +++ b/lib/text_cleaner.rb @@ -44,7 +44,7 @@ class TextCleaner text end - @@whitespaces_regexp = Regexp.new("(\u00A0|\u1680|\u180E|[\u2000-\u200B]|\u2028|\u2029|\u202F|\u205F|\u3000|\uFEFF)", "u").freeze + @@whitespaces_regexp = Regexp.new("(\u00A0|\u1680|\u180E|[\u2000-\u200A]|\u2028|\u2029|\u202F|\u205F|\u3000)", "u").freeze def self.normalize_whitespaces(text) text.gsub(@@whitespaces_regexp, ' ') diff --git a/spec/components/text_cleaner_spec.rb b/spec/components/text_cleaner_spec.rb index edcdccca6b2..278177a292e 100644 --- a/spec/components/text_cleaner_spec.rb +++ b/spec/components/text_cleaner_spec.rb @@ -193,10 +193,17 @@ describe TextCleaner do describe "#normalize_whitespaces" do it "normalize whitespaces" do - whitespaces = "\u0020\u00A0\u1680\u180E\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u200B\u2028\u2029\u202F\u205F\u3000\uFEFF" + whitespaces = "\u0020\u00A0\u1680\u180E\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u2028\u2029\u202F\u205F\u3000" expect(whitespaces.strip).not_to eq("") expect(TextCleaner.normalize_whitespaces(whitespaces).strip).to eq("") end + + it "does not muck with zero width white space" do + # this is used for khmer, dont mess with it + expect(TextCleaner.normalize_whitespaces("hello\u200Bworld").strip).to eq("hello\u200Bworld") + expect(TextCleaner.normalize_whitespaces("hello\uFEFFworld").strip).to eq("hello\uFEFFworld") + + end end end