mirror of
https://github.com/discourse/discourse.git
synced 2024-11-25 09:42:07 +08:00
FIX: Try detecting encoding of RSS feed
This commit is contained in:
parent
a900c9bf93
commit
ff942ed2f3
2
Gemfile
2
Gemfile
|
@ -180,6 +180,8 @@ gem 'rqrcode'
|
||||||
|
|
||||||
gem 'sshkey', require: false
|
gem 'sshkey', require: false
|
||||||
|
|
||||||
|
gem 'charlock_holmes', require: false
|
||||||
|
|
||||||
if ENV["IMPORT"] == "1"
|
if ENV["IMPORT"] == "1"
|
||||||
gem 'mysql2'
|
gem 'mysql2'
|
||||||
gem 'redcarpet'
|
gem 'redcarpet'
|
||||||
|
|
|
@ -75,6 +75,7 @@ GEM
|
||||||
uniform_notifier (~> 1.11.0)
|
uniform_notifier (~> 1.11.0)
|
||||||
byebug (10.0.2)
|
byebug (10.0.2)
|
||||||
certified (1.0.0)
|
certified (1.0.0)
|
||||||
|
charlock_holmes (0.7.6)
|
||||||
chunky_png (1.3.10)
|
chunky_png (1.3.10)
|
||||||
claide (1.0.2)
|
claide (1.0.2)
|
||||||
claide-plugins (0.9.2)
|
claide-plugins (0.9.2)
|
||||||
|
@ -456,6 +457,7 @@ DEPENDENCIES
|
||||||
bullet
|
bullet
|
||||||
byebug
|
byebug
|
||||||
certified
|
certified
|
||||||
|
charlock_holmes
|
||||||
cppjieba_rb
|
cppjieba_rb
|
||||||
danger
|
danger
|
||||||
discourse_image_optim
|
discourse_image_optim
|
||||||
|
@ -550,4 +552,4 @@ DEPENDENCIES
|
||||||
webpush
|
webpush
|
||||||
|
|
||||||
BUNDLED WITH
|
BUNDLED WITH
|
||||||
1.16.2
|
1.16.3
|
||||||
|
|
|
@ -6,6 +6,7 @@ require 'excon'
|
||||||
require_dependency 'final_destination'
|
require_dependency 'final_destination'
|
||||||
require_dependency 'post_creator'
|
require_dependency 'post_creator'
|
||||||
require_dependency 'post_revisor'
|
require_dependency 'post_revisor'
|
||||||
|
require_dependency 'encodings'
|
||||||
|
|
||||||
module Jobs
|
module Jobs
|
||||||
class PollFeed < Jobs::Scheduled
|
class PollFeed < Jobs::Scheduled
|
||||||
|
@ -88,13 +89,15 @@ module Jobs
|
||||||
|
|
||||||
def parsed_feed
|
def parsed_feed
|
||||||
raw_feed = fetch_rss
|
raw_feed = fetch_rss
|
||||||
return nil if raw_feed.blank?
|
encoded_feed = Encodings.to_utf8(raw_feed)
|
||||||
|
|
||||||
|
return nil if encoded_feed.blank?
|
||||||
|
|
||||||
if SiteSetting.embed_username_key_from_feed.present?
|
if SiteSetting.embed_username_key_from_feed.present?
|
||||||
FeedElementInstaller.install(SiteSetting.embed_username_key_from_feed, raw_feed)
|
FeedElementInstaller.install(SiteSetting.embed_username_key_from_feed, encoded_feed)
|
||||||
end
|
end
|
||||||
|
|
||||||
RSS::Parser.parse(raw_feed)
|
RSS::Parser.parse(encoded_feed)
|
||||||
rescue RSS::NotWellFormedError, RSS::InvalidRSSError
|
rescue RSS::NotWellFormedError, RSS::InvalidRSSError
|
||||||
nil
|
nil
|
||||||
end
|
end
|
||||||
|
|
33
lib/encodings.rb
Normal file
33
lib/encodings.rb
Normal file
|
@ -0,0 +1,33 @@
|
||||||
|
require 'charlock_holmes'
|
||||||
|
|
||||||
|
# Helpers for converting text of unknown or foreign encoding into UTF-8.
module Encodings
  # Argument handed to CharlockHolmes::EncodingDetector.new.
  # NOTE(review): presumably 0 disables the detector's binary-content scan —
  # confirm against the charlock_holmes documentation.
  BINARY_SCAN_LENGTH = 0

  # Detects the encoding of +string+ and returns a UTF-8 copy of it.
  #
  # When the detector reports an encoding, the text is converted from that
  # encoding; otherwise it is re-encoded with String#encode and any undefined
  # or invalid characters are dropped.
  #
  # @param string [String, nil] text in an unknown encoding
  # @param encoding_hint [#to_s, nil] optional hint forwarded to the detector
  # @param delete_bom [Boolean] strip a leading UTF-8 byte order mark from the result
  # @return [String, nil] the converted text; nil or empty input is returned unchanged
  def self.to_utf8(string, encoding_hint: nil, delete_bom: true)
    # Callers pass fetched content in before checking it for blankness, so nil
    # or empty input must not reach the detector (which presumably expects a
    # String). nil?/empty? are used because they are safe for any encoding.
    return string if string.nil? || string.empty?

    detector = CharlockHolmes::EncodingDetector.new(BINARY_SCAN_LENGTH)
    detection = detector.detect(string, encoding_hint&.to_s)

    converted =
      if detection && detection[:encoding]
        CharlockHolmes::Converter.convert(string, detection[:encoding], Encoding::UTF_8.name)
      else
        string.encode(Encoding::UTF_8, undef: :replace, invalid: :replace, replace: '')
      end

    delete_bom!(converted) if delete_bom
    converted
  end

  # Converts +string+ from +source_encoding+ to UTF-8 without any replacement
  # of bad characters.
  #
  # @param string [String] text to convert
  # @param source_encoding [String, Encoding] encoding the bytes are assumed to be in
  # @return [String, nil] the BOM-free UTF-8 text, or nil when the conversion
  #   fails or the result is not valid UTF-8
  def self.try_utf8(string, source_encoding)
    encoded = string.encode(Encoding::UTF_8, source_encoding)
    # Converting from UTF-8 to UTF-8 does not validate the bytes, hence the
    # explicit validity check before the string is handed back.
    delete_bom!(encoded) if encoded.valid_encoding?
  rescue Encoding::InvalidByteSequenceError,
         Encoding::UndefinedConversionError,
         Encoding::ConverterNotFoundError
    nil
  end

  # Removes a leading UTF-8 byte order mark from +string+ in place.
  #
  # @param string [String, nil] text to clean; blank values are left untouched
  # @return [String, nil] the same object that was passed in
  def self.delete_bom!(string)
    string.sub!(/\A\xEF\xBB\xBF/, '') unless string.blank?
    string
  end
end
|
1
spec/fixtures/encodings/iso-8859-5.txt
vendored
Normal file
1
spec/fixtures/encodings/iso-8859-5.txt
vendored
Normal file
|
@ -0,0 +1 @@
|
||||||
|
Письменность отличается от других существующих или возможных систем символической коммуникации тем, что всегда ассоциируется с некоторым языком и устной речью на этом языке
|
BIN
spec/fixtures/encodings/utf-16be.txt
vendored
Normal file
BIN
spec/fixtures/encodings/utf-16be.txt
vendored
Normal file
Binary file not shown.
BIN
spec/fixtures/encodings/utf-16le.txt
vendored
Normal file
BIN
spec/fixtures/encodings/utf-16le.txt
vendored
Normal file
Binary file not shown.
1
spec/fixtures/encodings/utf-8-bom.txt
vendored
Normal file
1
spec/fixtures/encodings/utf-8-bom.txt
vendored
Normal file
|
@ -0,0 +1 @@
|
||||||
|
Το σύστημα γραφής είναι ένα συμβολικό, οπτικό σύστημα καταγραφής της γλώσσας.
|
1
spec/fixtures/encodings/utf-8.txt
vendored
Normal file
1
spec/fixtures/encodings/utf-8.txt
vendored
Normal file
|
@ -0,0 +1 @@
|
||||||
|
Το σύστημα γραφής είναι ένα συμβολικό, οπτικό σύστημα καταγραφής της γλώσσας.
|
BIN
spec/fixtures/feed/utf-16le-feed.rss
vendored
Normal file
BIN
spec/fixtures/feed/utf-16le-feed.rss
vendored
Normal file
Binary file not shown.
|
@ -136,5 +136,25 @@ describe Jobs::PollFeed do
|
||||||
|
|
||||||
include_examples 'topic creation based on the the feed'
|
include_examples 'topic creation based on the the feed'
|
||||||
end
|
end
|
||||||
|
|
||||||
|
context 'encodings' do
  # Enables feed polling against a fixed URL and stubs the HEAD request for
  # it; each example supplies its own GET response.
  before do
    SiteSetting.feed_polling_enabled = true
    SiteSetting.feed_polling_url = 'https://blog.discourse.org/feed/atom/'
    SiteSetting.embed_by_username = 'eviltrout'

    stub_request(:head, SiteSetting.feed_polling_url)
  end

  it 'works with encodings other than UTF-8' do
    # Serve the UTF-16LE fixture as the feed body.
    utf16_feed = file_from_fixtures('utf-16le-feed.rss', 'feed').read
    rss_headers = { "Content-Type" => "application/rss+xml" }
    stub_request(:get, SiteSetting.feed_polling_url).to_return(body: utf16_feed, headers: rss_headers)

    # Polling must create exactly one topic whose first post contains the feed body.
    expect { poller.poll_feed }.to change { Topic.count }.by(1)
    expect(Topic.last.first_post.raw).to include('<p>This is the body & content. </p>')
  end
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
32
spec/lib/encodings_spec.rb
Normal file
32
spec/lib/encodings_spec.rb
Normal file
|
@ -0,0 +1,32 @@
|
||||||
|
require 'rails_helper'
|
||||||
|
|
||||||
|
describe Encodings do
  # Reads a fixture from spec/fixtures/encodings, drops the trailing newline
  # and runs the content through Encodings.to_utf8.
  def to_utf8(filename, encoding_hint = nil)
    fixture = File.read("#{Rails.root}/spec/fixtures/encodings/#{filename}").chomp
    Encodings.to_utf8(fixture, encoding_hint: encoding_hint)
  end

  context "unicode" do
    let(:expected) { 'Το σύστημα γραφής είναι ένα συμβολικό, οπτικό σύστημα καταγραφής της γλώσσας.' }

    # Every fixture below holds the same sentence in a different Unicode
    # encoding, so each must convert to the same expected text.
    {
      'UTF-8' => 'utf-8.txt',
      'UTF-8 with BOM' => 'utf-8-bom.txt',
      'UTF-16LE with BOM' => 'utf-16le.txt',
      'UTF-16BE with BOM' => 'utf-16be.txt'
    }.each do |label, fixture_name|
      it "correctly encodes #{label} as UTF-8" do
        expect(to_utf8(fixture_name)).to eq(expected)
      end
    end
  end

  it "correctly encodes ISO-8859-5 as UTF-8" do
    expect(to_utf8('iso-8859-5.txt')).to eq('Письменность отличается от других существующих или возможных систем символической коммуникации тем, что всегда ассоциируется с некоторым языком и устной речью на этом языке')
  end
end
|
Loading…
Reference in New Issue
Block a user