From ff942ed2f3a33dd91dbb81e4194da3bb84487b1a Mon Sep 17 00:00:00 2001 From: Gerhard Schlager Date: Fri, 27 Jul 2018 19:41:53 +0200 Subject: [PATCH] FIX: Try detecting encoding of RSS feed --- Gemfile | 2 ++ Gemfile.lock | 4 ++- app/jobs/scheduled/poll_feed.rb | 9 ++++--- lib/encodings.rb | 33 +++++++++++++++++++++++++ spec/fixtures/encodings/iso-8859-5.txt | 1 + spec/fixtures/encodings/utf-16be.txt | Bin 0 -> 156 bytes spec/fixtures/encodings/utf-16le.txt | Bin 0 -> 156 bytes spec/fixtures/encodings/utf-8-bom.txt | 1 + spec/fixtures/encodings/utf-8.txt | 1 + spec/fixtures/feed/utf-16le-feed.rss | Bin 0 -> 3088 bytes spec/jobs/poll_feed_spec.rb | 20 +++++++++++++++ spec/lib/encodings_spec.rb | 32 ++++++++++++++++++++++++ 12 files changed, 99 insertions(+), 4 deletions(-) create mode 100644 lib/encodings.rb create mode 100644 spec/fixtures/encodings/iso-8859-5.txt create mode 100644 spec/fixtures/encodings/utf-16be.txt create mode 100644 spec/fixtures/encodings/utf-16le.txt create mode 100644 spec/fixtures/encodings/utf-8-bom.txt create mode 100644 spec/fixtures/encodings/utf-8.txt create mode 100644 spec/fixtures/feed/utf-16le-feed.rss create mode 100644 spec/lib/encodings_spec.rb diff --git a/Gemfile b/Gemfile index c2e9be001f4..d8acd785c57 100644 --- a/Gemfile +++ b/Gemfile @@ -180,6 +180,8 @@ gem 'rqrcode' gem 'sshkey', require: false +gem 'charlock_holmes', require: false + if ENV["IMPORT"] == "1" gem 'mysql2' gem 'redcarpet' diff --git a/Gemfile.lock b/Gemfile.lock index 6294c698ecd..902d22f0851 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -75,6 +75,7 @@ GEM uniform_notifier (~> 1.11.0) byebug (10.0.2) certified (1.0.0) + charlock_holmes (0.7.6) chunky_png (1.3.10) claide (1.0.2) claide-plugins (0.9.2) @@ -456,6 +457,7 @@ DEPENDENCIES bullet byebug certified + charlock_holmes cppjieba_rb danger discourse_image_optim @@ -550,4 +552,4 @@ DEPENDENCIES webpush BUNDLED WITH - 1.16.2 + 1.16.3 diff --git a/app/jobs/scheduled/poll_feed.rb b/app/jobs/scheduled/poll_feed.rb index 494fcf2a706..7d242cb03a8 100644 --- a/app/jobs/scheduled/poll_feed.rb +++ b/app/jobs/scheduled/poll_feed.rb @@ -6,6 +6,7 @@ require 'excon' require_dependency 'final_destination' require_dependency 'post_creator' require_dependency 'post_revisor' +require_dependency 'encodings' module Jobs class PollFeed < Jobs::Scheduled @@ -88,13 +89,15 @@ module Jobs def parsed_feed raw_feed = fetch_rss - return nil if raw_feed.blank? + encoded_feed = Encodings.to_utf8(raw_feed) + + return nil if encoded_feed.blank? if SiteSetting.embed_username_key_from_feed.present? - FeedElementInstaller.install(SiteSetting.embed_username_key_from_feed, raw_feed) + FeedElementInstaller.install(SiteSetting.embed_username_key_from_feed, encoded_feed) end - RSS::Parser.parse(raw_feed) + RSS::Parser.parse(encoded_feed) rescue RSS::NotWellFormedError, RSS::InvalidRSSError nil end diff --git a/lib/encodings.rb b/lib/encodings.rb new file mode 100644 index 00000000000..b9a8a876eac --- /dev/null +++ b/lib/encodings.rb @@ -0,0 +1,33 @@ +require 'charlock_holmes' + +module Encodings + BINARY_SCAN_LENGTH = 0 + + def self.to_utf8(string, encoding_hint: nil, delete_bom: true) + detector = CharlockHolmes::EncodingDetector.new(BINARY_SCAN_LENGTH) + result = detector.detect(string, encoding_hint&.to_s) + + if result && result[:encoding] + string = CharlockHolmes::Converter.convert(string, result[:encoding], Encoding::UTF_8.name) + else + string = string.encode(Encoding::UTF_8, undef: :replace, invalid: :replace, replace: '') + end + + delete_bom!(string) if delete_bom + string + end + + def self.try_utf8(string, source_encoding) + encoded = string.encode(Encoding::UTF_8, source_encoding) + encoded&.valid_encoding? ? delete_bom!(encoded) : nil + rescue Encoding::InvalidByteSequenceError, + Encoding::UndefinedConversionError, + Encoding::ConverterNotFoundError + nil + end + + def self.delete_bom!(string) + string.sub!(/\A\xEF\xBB\xBF/, '') unless string.blank? + string + end +end diff --git a/spec/fixtures/encodings/iso-8859-5.txt b/spec/fixtures/encodings/iso-8859-5.txt new file mode 100644 index 00000000000..0501b5848df --- /dev/null +++ b/spec/fixtures/encodings/iso-8859-5.txt @@ -0,0 +1 @@ + , \ No newline at end of file diff --git a/spec/fixtures/encodings/utf-16be.txt b/spec/fixtures/encodings/utf-16be.txt new file mode 100644 index 0000000000000000000000000000000000000000..24ca7e99a5c5a790ef4cf8be2310c2583dce8b58 GIT binary patch literal 156 zcmZXOOAbIl5JV?=7jXb_YFh*m@yqzHa3s{Q5{pWzI@PcHya=(zV`7(PN?KCooPm^; ym{F-9p;NBpBj=utMcoWbLq*MId`&IAC~y8#YcXc&OLte@m0YAlUrcI@Cb$7RpEMHy literal 0 HcmV?d00001 diff --git a/spec/fixtures/encodings/utf-16le.txt b/spec/fixtures/encodings/utf-16le.txt new file mode 100644 index 0000000000000000000000000000000000000000..3e3b1ee0d17eb29952f9a1e30126ab9b6e7a353e GIT binary patch literal 156 zcmZXOOAbIl5JV?=7jXb_YFh*m@r%!3g(IPcl~{DTs?+tluQQ~_GnO68Oi4>BpEIhj z#7uGp37vc;9X-k=7VTzAH=5UM_{K9mgD9{6Q)@A1)tBzBx+}RzhrY1dm`+5%H4a9VK=odRwFR9o=1$~ zSM<8DzAZ|;Ex;IY^_{mdzq;7TyV!Z|t#Ctx zl`YsUawHx_`~{2`@LkwRhIN%n9os2r&E?KA#;M(xAnslR;Y`TgkjTW;_e^<~Qu5uR za)U?m9MH@|9@RJwM>ZbtZs1LMX@8N0AlM%D-+A6*zJsu|fxT*fq| z8uk&^ebzeswb&P}G7+(^nWBgW?2h=QxF$TMJgYxCZoOEQ2+J8Z-+A9*zc_7U5E%Nt zoZB0ahcM|lwdZ)Q*bDxDVm09|)mH4KTR)|eRjwMkK4Im+DO#%b8Igv{ z(X$LD^uR5CZ=7F6duU(q7clbg!%peB#=ZpBp7S;Pm21sj)zRl`gq@Hcvfqw-1!~WJ z;!E5*;5EoVIcAw{KvsNG+RX7)or~SJfb|Uis-?Oo&CJUS&bxDJab&Of^9;%`gFh)$ z(>v(8sO@XK>1>JY3QMU>byml9+}aSe%!ylE6mL=WkD;D;)l)H@^BO-oxfeAq8j>&cJH9lX^Fg|AUuh;Ad(Nt*TF@gkpT5QtQZyyRw*4 z|IqcG*mxC>iRJ^VIz=?q`}`icNcZs>zFu~j(J|kI30R@GLT3fGsz6|p@1ku}v>5q2 zR|(DSmg@pPwe?-5lS?OUm%NSnRgNjO>x$@-XZ94GSLu>tJ9AyK?vwI!C7aXy7iSFy znp~QUQh8a`>SV@GeN`bM_xc~sp|;z5&&o667-M_JbKZ-1D~rO()?TMx_?j*<@vOCP Rb6#n!_=TcXf0nCC{{VQX1=|1s literal 0 HcmV?d00001 diff --git a/spec/jobs/poll_feed_spec.rb b/spec/jobs/poll_feed_spec.rb index 92f98f685e0..a42835ce09c 100644 --- a/spec/jobs/poll_feed_spec.rb +++ b/spec/jobs/poll_feed_spec.rb @@ -136,5 +136,25 @@ describe Jobs::PollFeed do include_examples 'topic creation based on the the feed' end + + context 'encodings' do + before do + SiteSetting.feed_polling_enabled = true + SiteSetting.feed_polling_url = 'https://blog.discourse.org/feed/atom/' + SiteSetting.embed_by_username = 'eviltrout' + + stub_request(:head, SiteSetting.feed_polling_url) + end + + it 'works with encodings other than UTF-8' do + stub_request(:get, SiteSetting.feed_polling_url).to_return( + body: file_from_fixtures('utf-16le-feed.rss', 'feed').read, + headers: { "Content-Type" => "application/rss+xml" } + ) + + expect { poller.poll_feed }.to change { Topic.count }.by(1) + expect(Topic.last.first_post.raw).to include('

This is the body & content.

') + end + end end end diff --git a/spec/lib/encodings_spec.rb b/spec/lib/encodings_spec.rb new file mode 100644 index 00000000000..2d15ab5a005 --- /dev/null +++ b/spec/lib/encodings_spec.rb @@ -0,0 +1,32 @@ +require 'rails_helper' + +describe Encodings do + def to_utf8(filename, encoding_hint = nil) + string = File.read("#{Rails.root}/spec/fixtures/encodings/#{filename}").chomp + Encodings.to_utf8(string, encoding_hint: encoding_hint) + end + + context "unicode" do + let(:expected) { 'Το σύστημα γραφής είναι ένα συμβολικό, οπτικό σύστημα καταγραφής της γλώσσας.' } + + it "correctly encodes UTF-8 as UTF-8" do + expect(to_utf8('utf-8.txt')).to eq(expected) + end + + it "correctly encodes UTF-8 with BOM as UTF-8" do + expect(to_utf8('utf-8-bom.txt')).to eq(expected) + end + + it "correctly encodes UTF-16LE with BOM as UTF-8" do + expect(to_utf8('utf-16le.txt')).to eq(expected) + end + + it "correctly encodes UTF-16BE with BOM as UTF-8" do + expect(to_utf8('utf-16be.txt')).to eq(expected) + end + end + + it "correctly encodes ISO-8859-5 as UTF-8" do + expect(to_utf8('iso-8859-5.txt')).to eq('Письменность отличается от других существующих или возможных систем символической коммуникации тем, что всегда ассоциируется с некоторым языком и устной речью на этом языке') + end +end