diff --git a/lib/text_cleaner.rb b/lib/text_cleaner.rb
index 48307d30bd8..cdbccc1ba1f 100644
--- a/lib/text_cleaner.rb
+++ b/lib/text_cleaner.rb
@@ -27,6 +27,8 @@ class TextCleaner
   end
 
   def self.clean(text, opts = {})
+    # Remove invalid byte sequences
+    text.scrub!("")
     # Replace !!!!! with a single !
     text.gsub!(/!+/, '!') if opts[:deduplicate_exclamation_marks]
     # Replace ????? with a single ?
diff --git a/spec/components/text_cleaner_spec.rb b/spec/components/text_cleaner_spec.rb
index d5cc3321693..e98ca726b50 100644
--- a/spec/components/text_cleaner_spec.rb
+++ b/spec/components/text_cleaner_spec.rb
@@ -229,4 +229,12 @@ describe TextCleaner do
     end
   end
 
+  context "invalid byte sequence" do
+    let(:with_invalid_bytes) { "abc\u3042\x81" }
+    let(:without_invalid_bytes) { "abc\u3042" }
+
+    it "removes invalid bytes" do
+      expect(TextCleaner.clean(with_invalid_bytes)).to eq(without_invalid_bytes)
+    end
+  end
 end