FIX: Try detecting encoding of RSS feed

This commit is contained in:
Gerhard Schlager 2018-07-27 19:41:53 +02:00
parent a900c9bf93
commit ff942ed2f3
12 changed files with 99 additions and 4 deletions

View File

@ -180,6 +180,8 @@ gem 'rqrcode'
gem 'sshkey', require: false gem 'sshkey', require: false
gem 'charlock_holmes', require: false
if ENV["IMPORT"] == "1" if ENV["IMPORT"] == "1"
gem 'mysql2' gem 'mysql2'
gem 'redcarpet' gem 'redcarpet'

View File

@ -75,6 +75,7 @@ GEM
uniform_notifier (~> 1.11.0) uniform_notifier (~> 1.11.0)
byebug (10.0.2) byebug (10.0.2)
certified (1.0.0) certified (1.0.0)
charlock_holmes (0.7.6)
chunky_png (1.3.10) chunky_png (1.3.10)
claide (1.0.2) claide (1.0.2)
claide-plugins (0.9.2) claide-plugins (0.9.2)
@ -456,6 +457,7 @@ DEPENDENCIES
bullet bullet
byebug byebug
certified certified
charlock_holmes
cppjieba_rb cppjieba_rb
danger danger
discourse_image_optim discourse_image_optim
@ -550,4 +552,4 @@ DEPENDENCIES
webpush webpush
BUNDLED WITH BUNDLED WITH
1.16.2 1.16.3

View File

@ -6,6 +6,7 @@ require 'excon'
require_dependency 'final_destination' require_dependency 'final_destination'
require_dependency 'post_creator' require_dependency 'post_creator'
require_dependency 'post_revisor' require_dependency 'post_revisor'
require_dependency 'encodings'
module Jobs module Jobs
class PollFeed < Jobs::Scheduled class PollFeed < Jobs::Scheduled
@ -88,13 +89,15 @@ module Jobs
def parsed_feed def parsed_feed
raw_feed = fetch_rss raw_feed = fetch_rss
return nil if raw_feed.blank? encoded_feed = Encodings.to_utf8(raw_feed)
return nil if encoded_feed.blank?
if SiteSetting.embed_username_key_from_feed.present? if SiteSetting.embed_username_key_from_feed.present?
FeedElementInstaller.install(SiteSetting.embed_username_key_from_feed, raw_feed) FeedElementInstaller.install(SiteSetting.embed_username_key_from_feed, encoded_feed)
end end
RSS::Parser.parse(raw_feed) RSS::Parser.parse(encoded_feed)
rescue RSS::NotWellFormedError, RSS::InvalidRSSError rescue RSS::NotWellFormedError, RSS::InvalidRSSError
nil nil
end end

33
lib/encodings.rb Normal file
View File

@ -0,0 +1,33 @@
require 'charlock_holmes'
# Helpers for turning text of unknown (or declared) encoding into UTF-8.
module Encodings
  # Passed to CharlockHolmes::EncodingDetector.new as its binary scan length.
  # NOTE(review): presumably 0 turns off the detector's binary-content scan —
  # confirm against the charlock_holmes documentation.
  BINARY_SCAN_LENGTH = 0

  # Detects the encoding of +string+ with CharlockHolmes and converts it to UTF-8.
  #
  # @param string [String, nil] text whose encoding is not known
  # @param encoding_hint [#to_s, nil] optional hint forwarded to the detector
  # @param delete_bom [Boolean] whether a leading UTF-8 byte order mark is removed
  # @return [String, nil] the UTF-8 text; +nil+ and the empty string are
  #   returned as they are
  def self.to_utf8(string, encoding_hint: nil, delete_bom: true)
    # There is nothing to detect or convert in a missing or empty input, and
    # callers may hand over an absent body before checking for blankness.
    return string if string.nil? || string.empty?

    detector = CharlockHolmes::EncodingDetector.new(BINARY_SCAN_LENGTH)
    result = detector.detect(string, encoding_hint&.to_s)

    if result && result[:encoding]
      string = CharlockHolmes::Converter.convert(string, result[:encoding], Encoding::UTF_8.name)
    else
      # String#encode does nothing when the source already is tagged as UTF-8,
      # even if it holds invalid bytes, so scrub removes what encode left behind.
      string = string
        .encode(Encoding::UTF_8, undef: :replace, invalid: :replace, replace: '')
        .scrub('')
    end

    delete_bom!(string) if delete_bom
    string
  end

  # Converts +string+ from the given +source_encoding+ to UTF-8.
  #
  # @param string [String] text to convert
  # @param source_encoding [String, Encoding] encoding the bytes are interpreted as
  # @return [String, nil] the UTF-8 text without a leading byte order mark, or
  #   +nil+ when the conversion fails or the result is not valid UTF-8
  def self.try_utf8(string, source_encoding)
    encoded = string.encode(Encoding::UTF_8, source_encoding)
    encoded&.valid_encoding? ? delete_bom!(encoded) : nil
  rescue Encoding::InvalidByteSequenceError,
         Encoding::UndefinedConversionError,
         Encoding::ConverterNotFoundError
    nil
  end

  # Removes a leading UTF-8 byte order mark (bytes EF BB BF) from +string+ in place.
  #
  # @param string [String, nil] text to clean; blank values are left untouched
  # @return [String, nil] the same object that was passed in
  def self.delete_bom!(string)
    string.sub!(/\A\xEF\xBB\xBF/, '') unless string.blank?
    string
  end
end

View File

@ -0,0 +1 @@
Письменность отличается от других существующих или возможных систем символической коммуникации тем, что всегда ассоциируется с некоторым языком и устной речью на этом языке

BIN
spec/fixtures/encodings/utf-16be.txt vendored Normal file

Binary file not shown.

BIN
spec/fixtures/encodings/utf-16le.txt vendored Normal file

Binary file not shown.

1
spec/fixtures/encodings/utf-8-bom.txt vendored Normal file
View File

@ -0,0 +1 @@
Το σύστημα γραφής είναι ένα συμβολικό, οπτικό σύστημα καταγραφής της γλώσσας.

1
spec/fixtures/encodings/utf-8.txt vendored Normal file
View File

@ -0,0 +1 @@
Το σύστημα γραφής είναι ένα συμβολικό, οπτικό σύστημα καταγραφής της γλώσσας.

BIN
spec/fixtures/feed/utf-16le-feed.rss vendored Normal file

Binary file not shown.

View File

@ -136,5 +136,25 @@ describe Jobs::PollFeed do
include_examples 'topic creation based on the the feed' include_examples 'topic creation based on the the feed'
end end
# Checks that polling a feed which is not encoded as UTF-8 still creates a topic.
context 'encodings' do
before do
SiteSetting.feed_polling_enabled = true
SiteSetting.feed_polling_url = 'https://blog.discourse.org/feed/atom/'
SiteSetting.embed_by_username = 'eviltrout'
# NOTE(review): presumably the poller issues a HEAD request to the feed URL
# before downloading it — confirm against the job's fetch logic.
stub_request(:head, SiteSetting.feed_polling_url)
end
it 'works with encodings other than UTF-8' do
# The GET request answers with the UTF-16LE fixture feed as its body.
stub_request(:get, SiteSetting.feed_polling_url).to_return(
body: file_from_fixtures('utf-16le-feed.rss', 'feed').read,
headers: { "Content-Type" => "application/rss+xml" }
)
# Exactly one topic must be created, and its first post must contain the feed body.
expect { poller.poll_feed }.to change { Topic.count }.by(1)
expect(Topic.last.first_post.raw).to include('<p>This is the body &amp; content. </p>')
end
end
end end
end end

View File

@ -0,0 +1,32 @@
require 'rails_helper'
describe Encodings do
  # Reads the named fixture from spec/fixtures/encodings, drops a trailing
  # newline and passes the content through Encodings.to_utf8.
  def to_utf8(filename, encoding_hint = nil)
    fixture_path = "#{Rails.root}/spec/fixtures/encodings/#{filename}"
    raw_content = File.read(fixture_path).chomp
    Encodings.to_utf8(raw_content, encoding_hint: encoding_hint)
  end

  context "unicode" do
    let(:expected) { 'Το σύστημα γραφής είναι ένα συμβολικό, οπτικό σύστημα καταγραφής της γλώσσας.' }

    # Each Unicode fixture is expected to decode to the same sentence.
    {
      "correctly encodes UTF-8 as UTF-8" => 'utf-8.txt',
      "correctly encodes UTF-8 with BOM as UTF-8" => 'utf-8-bom.txt',
      "correctly encodes UTF-16LE with BOM as UTF-8" => 'utf-16le.txt',
      "correctly encodes UTF-16BE with BOM as UTF-8" => 'utf-16be.txt'
    }.each do |description, fixture|
      it description do
        expect(to_utf8(fixture)).to eq(expected)
      end
    end
  end

  it "correctly encodes ISO-8859-5 as UTF-8" do
    russian_text = 'Письменность отличается от других существующих или возможных систем символической коммуникации тем, что всегда ассоциируется с некоторым языком и устной речью на этом языке'
    expect(to_utf8('iso-8859-5.txt')).to eq(russian_text)
  end
end