From 3d9981ac5c0011484b63c8a41e4a882c82d46ca6 Mon Sep 17 00:00:00 2001
From: Maja Komel
Date: Tue, 26 Feb 2019 00:12:34 +0100
Subject: [PATCH] FIX: strip invalid byte sequences

---
 lib/text_cleaner.rb                  | 2 ++
 spec/components/text_cleaner_spec.rb | 8 ++++++++
 2 files changed, 10 insertions(+)

diff --git a/lib/text_cleaner.rb b/lib/text_cleaner.rb
index 48307d30bd8..cdbccc1ba1f 100644
--- a/lib/text_cleaner.rb
+++ b/lib/text_cleaner.rb
@@ -27,6 +27,8 @@ class TextCleaner
   end
 
   def self.clean(text, opts = {})
+    # Remove invalid byte sequences
+    text.scrub!("")
     # Replace !!!!! with a single !
     text.gsub!(/!+/, '!') if opts[:deduplicate_exclamation_marks]
     # Replace ????? with a single ?
diff --git a/spec/components/text_cleaner_spec.rb b/spec/components/text_cleaner_spec.rb
index d5cc3321693..e98ca726b50 100644
--- a/spec/components/text_cleaner_spec.rb
+++ b/spec/components/text_cleaner_spec.rb
@@ -229,4 +229,12 @@ describe TextCleaner do
     end
   end
 
+  context "invalid byte sequence" do
+    let(:with_invalid_bytes) { "abc\u3042\x81" }
+    let(:without_invalid_bytes) { "abc\u3042" }
+
+    it "removes invalid bytes" do
+      expect(TextCleaner.clean(with_invalid_bytes)).to eq(without_invalid_bytes)
+    end
+  end
 end