diff --git a/Gemfile b/Gemfile index c2e9be001f4..d8acd785c57 100644 --- a/Gemfile +++ b/Gemfile @@ -180,6 +180,8 @@ gem 'rqrcode' gem 'sshkey', require: false +gem 'charlock_holmes', require: false + if ENV["IMPORT"] == "1" gem 'mysql2' gem 'redcarpet' diff --git a/Gemfile.lock b/Gemfile.lock index 6294c698ecd..902d22f0851 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -75,6 +75,7 @@ GEM uniform_notifier (~> 1.11.0) byebug (10.0.2) certified (1.0.0) + charlock_holmes (0.7.6) chunky_png (1.3.10) claide (1.0.2) claide-plugins (0.9.2) @@ -456,6 +457,7 @@ DEPENDENCIES bullet byebug certified + charlock_holmes cppjieba_rb danger discourse_image_optim @@ -550,4 +552,4 @@ DEPENDENCIES webpush BUNDLED WITH - 1.16.2 + 1.16.3 diff --git a/app/jobs/scheduled/poll_feed.rb b/app/jobs/scheduled/poll_feed.rb index 494fcf2a706..7d242cb03a8 100644 --- a/app/jobs/scheduled/poll_feed.rb +++ b/app/jobs/scheduled/poll_feed.rb @@ -6,6 +6,7 @@ require 'excon' require_dependency 'final_destination' require_dependency 'post_creator' require_dependency 'post_revisor' +require_dependency 'encodings' module Jobs class PollFeed < Jobs::Scheduled @@ -88,13 +89,15 @@ module Jobs def parsed_feed raw_feed = fetch_rss - return nil if raw_feed.blank? + encoded_feed = Encodings.to_utf8(raw_feed) + + return nil if encoded_feed.blank? if SiteSetting.embed_username_key_from_feed.present? - FeedElementInstaller.install(SiteSetting.embed_username_key_from_feed, raw_feed) + FeedElementInstaller.install(SiteSetting.embed_username_key_from_feed, encoded_feed) end - RSS::Parser.parse(raw_feed) + RSS::Parser.parse(encoded_feed) rescue RSS::NotWellFormedError, RSS::InvalidRSSError nil end diff --git a/lib/encodings.rb b/lib/encodings.rb new file mode 100644 index 00000000000..b9a8a876eac --- /dev/null +++ b/lib/encodings.rb @@ -0,0 +1,33 @@ +require 'charlock_holmes' + +module Encodings + BINARY_SCAN_LENGTH = 0 + + def self.to_utf8(string, encoding_hint: nil, delete_bom: true) + detector = CharlockHolmes::EncodingDetector.new(BINARY_SCAN_LENGTH) + result = detector.detect(string, encoding_hint&.to_s) + + if result && result[:encoding] + string = CharlockHolmes::Converter.convert(string, result[:encoding], Encoding::UTF_8.name) + else + string = string.encode(Encoding::UTF_8, undef: :replace, invalid: :replace, replace: '') + end + + delete_bom!(string) if delete_bom + string + end + + def self.try_utf8(string, source_encoding) + encoded = string.encode(Encoding::UTF_8, source_encoding) + encoded&.valid_encoding? ? delete_bom!(encoded) : nil + rescue Encoding::InvalidByteSequenceError, + Encoding::UndefinedConversionError, + Encoding::ConverterNotFoundError + nil + end + + def self.delete_bom!(string) + string.sub!(/\A\xEF\xBB\xBF/, '') unless string.blank? + string + end +end diff --git a/spec/fixtures/encodings/iso-8859-5.txt b/spec/fixtures/encodings/iso-8859-5.txt new file mode 100644 index 00000000000..0501b5848df --- /dev/null +++ b/spec/fixtures/encodings/iso-8859-5.txt @@ -0,0 +1 @@ + , \ No newline at end of file diff --git a/spec/fixtures/encodings/utf-16be.txt b/spec/fixtures/encodings/utf-16be.txt new file mode 100644 index 00000000000..24ca7e99a5c Binary files /dev/null and b/spec/fixtures/encodings/utf-16be.txt differ diff --git a/spec/fixtures/encodings/utf-16le.txt b/spec/fixtures/encodings/utf-16le.txt new file mode 100644 index 00000000000..3e3b1ee0d17 Binary files /dev/null and b/spec/fixtures/encodings/utf-16le.txt differ diff --git a/spec/fixtures/encodings/utf-8-bom.txt b/spec/fixtures/encodings/utf-8-bom.txt new file mode 100644 index 00000000000..4a8c90804da --- /dev/null +++ b/spec/fixtures/encodings/utf-8-bom.txt @@ -0,0 +1 @@ +Το σύστημα γραφής είναι ένα συμβολικό, οπτικό σύστημα καταγραφής της γλώσσας. \ No newline at end of file diff --git a/spec/fixtures/encodings/utf-8.txt b/spec/fixtures/encodings/utf-8.txt new file mode 100644 index 00000000000..91de03ad1af --- /dev/null +++ b/spec/fixtures/encodings/utf-8.txt @@ -0,0 +1 @@ +Το σύστημα γραφής είναι ένα συμβολικό, οπτικό σύστημα καταγραφής της γλώσσας. \ No newline at end of file diff --git a/spec/fixtures/feed/utf-16le-feed.rss b/spec/fixtures/feed/utf-16le-feed.rss new file mode 100644 index 00000000000..55e5fd717b2 Binary files /dev/null and b/spec/fixtures/feed/utf-16le-feed.rss differ diff --git a/spec/jobs/poll_feed_spec.rb b/spec/jobs/poll_feed_spec.rb index 92f98f685e0..a42835ce09c 100644 --- a/spec/jobs/poll_feed_spec.rb +++ b/spec/jobs/poll_feed_spec.rb @@ -136,5 +136,25 @@ describe Jobs::PollFeed do include_examples 'topic creation based on the the feed' end + + context 'encodings' do + before do + SiteSetting.feed_polling_enabled = true + SiteSetting.feed_polling_url = 'https://blog.discourse.org/feed/atom/' + SiteSetting.embed_by_username = 'eviltrout' + + stub_request(:head, SiteSetting.feed_polling_url) + end + + it 'works with encodings other than UTF-8' do + stub_request(:get, SiteSetting.feed_polling_url).to_return( + body: file_from_fixtures('utf-16le-feed.rss', 'feed').read, + headers: { "Content-Type" => "application/rss+xml" } + ) + + expect { poller.poll_feed }.to change { Topic.count }.by(1) + expect(Topic.last.first_post.raw).to include('
This is the body & content.
') + end + end end end diff --git a/spec/lib/encodings_spec.rb b/spec/lib/encodings_spec.rb new file mode 100644 index 00000000000..2d15ab5a005 --- /dev/null +++ b/spec/lib/encodings_spec.rb @@ -0,0 +1,32 @@ +require 'rails_helper' + +describe Encodings do + def to_utf8(filename, encoding_hint = nil) + string = File.read("#{Rails.root}/spec/fixtures/encodings/#{filename}").chomp + Encodings.to_utf8(string, encoding_hint: encoding_hint) + end + + context "unicode" do + let(:expected) { 'Το σύστημα γραφής είναι ένα συμβολικό, οπτικό σύστημα καταγραφής της γλώσσας.' } + + it "correctly encodes UTF-8 as UTF-8" do + expect(to_utf8('utf-8.txt')).to eq(expected) + end + + it "correctly encodes UTF-8 with BOM as UTF-8" do + expect(to_utf8('utf-8-bom.txt')).to eq(expected) + end + + it "correctly encodes UTF-16LE with BOM as UTF-8" do + expect(to_utf8('utf-16le.txt')).to eq(expected) + end + + it "correctly encodes UTF-16BE with BOM as UTF-8" do + expect(to_utf8('utf-16be.txt')).to eq(expected) + end + end + + it "correctly encodes ISO-8859-5 as UTF-8" do + expect(to_utf8('iso-8859-5.txt')).to eq('Письменность отличается от других существующих или возможных систем символической коммуникации тем, что всегда ассоциируется с некоторым языком и устной речью на этом языке') + end +end