mirror of
https://github.com/discourse/discourse.git
synced 2025-01-22 11:58:30 +08:00
FIX: Try detecting encoding of RSS feed
This commit is contained in:
parent
a900c9bf93
commit
ff942ed2f3
2
Gemfile
2
Gemfile
|
@ -180,6 +180,8 @@ gem 'rqrcode'
|
|||
|
||||
gem 'sshkey', require: false
|
||||
|
||||
gem 'charlock_holmes', require: false
|
||||
|
||||
if ENV["IMPORT"] == "1"
|
||||
gem 'mysql2'
|
||||
gem 'redcarpet'
|
||||
|
|
|
@ -75,6 +75,7 @@ GEM
|
|||
uniform_notifier (~> 1.11.0)
|
||||
byebug (10.0.2)
|
||||
certified (1.0.0)
|
||||
charlock_holmes (0.7.6)
|
||||
chunky_png (1.3.10)
|
||||
claide (1.0.2)
|
||||
claide-plugins (0.9.2)
|
||||
|
@ -456,6 +457,7 @@ DEPENDENCIES
|
|||
bullet
|
||||
byebug
|
||||
certified
|
||||
charlock_holmes
|
||||
cppjieba_rb
|
||||
danger
|
||||
discourse_image_optim
|
||||
|
@ -550,4 +552,4 @@ DEPENDENCIES
|
|||
webpush
|
||||
|
||||
BUNDLED WITH
|
||||
1.16.2
|
||||
1.16.3
|
||||
|
|
|
@ -6,6 +6,7 @@ require 'excon'
|
|||
require_dependency 'final_destination'
|
||||
require_dependency 'post_creator'
|
||||
require_dependency 'post_revisor'
|
||||
require_dependency 'encodings'
|
||||
|
||||
module Jobs
|
||||
class PollFeed < Jobs::Scheduled
|
||||
|
@ -88,13 +89,15 @@ module Jobs
|
|||
|
||||
def parsed_feed
|
||||
raw_feed = fetch_rss
|
||||
return nil if raw_feed.blank?
|
||||
encoded_feed = Encodings.to_utf8(raw_feed)
|
||||
|
||||
return nil if encoded_feed.blank?
|
||||
|
||||
if SiteSetting.embed_username_key_from_feed.present?
|
||||
FeedElementInstaller.install(SiteSetting.embed_username_key_from_feed, raw_feed)
|
||||
FeedElementInstaller.install(SiteSetting.embed_username_key_from_feed, encoded_feed)
|
||||
end
|
||||
|
||||
RSS::Parser.parse(raw_feed)
|
||||
RSS::Parser.parse(encoded_feed)
|
||||
rescue RSS::NotWellFormedError, RSS::InvalidRSSError
|
||||
nil
|
||||
end
|
||||
|
|
33
lib/encodings.rb
Normal file
33
lib/encodings.rb
Normal file
|
@ -0,0 +1,33 @@
|
|||
require 'charlock_holmes'
|
||||
|
||||
# Utilities for turning text of unknown or non-UTF-8 encoding into UTF-8.
module Encodings
  # Value handed to CharlockHolmes::EncodingDetector.new as its scan length.
  BINARY_SCAN_LENGTH = 0

  class << self
    # Detects the encoding of +string+ with CharlockHolmes and returns a
    # UTF-8 version of it.
    #
    # string        - the text to convert.
    # encoding_hint - optional hint forwarded to the detector (stringified).
    # delete_bom    - when true, a leading UTF-8 byte order mark is removed
    #                 from the converted result.
    #
    # When the detector reports no encoding, the string is transcoded with
    # String#encode and undefined/invalid characters are dropped.
    def to_utf8(string, encoding_hint: nil, delete_bom: true)
      detection = CharlockHolmes::EncodingDetector
        .new(BINARY_SCAN_LENGTH)
        .detect(string, encoding_hint&.to_s)
      detected_encoding = detection && detection[:encoding]

      utf8 =
        if detected_encoding
          CharlockHolmes::Converter.convert(string, detected_encoding, Encoding::UTF_8.name)
        else
          string.encode(Encoding::UTF_8, undef: :replace, invalid: :replace, replace: '')
        end

      # delete_bom! hands back the very string it was given.
      delete_bom ? delete_bom!(utf8) : utf8
    end

    # Transcodes +string+ from +source_encoding+ to UTF-8.
    #
    # Returns the converted string with any leading BOM removed, or nil when
    # the conversion fails or the result is not valid in its encoding.
    def try_utf8(string, source_encoding)
      converted = string.encode(Encoding::UTF_8, source_encoding)
      return nil unless converted&.valid_encoding?

      delete_bom!(converted)
    rescue Encoding::InvalidByteSequenceError,
           Encoding::UndefinedConversionError,
           Encoding::ConverterNotFoundError
      nil
    end

    # Removes a leading UTF-8 byte order mark from +string+ in place.
    # Blank input is left untouched. Always returns +string+ itself.
    def delete_bom!(string)
      return string if string.blank?

      string.sub!(/\A\xEF\xBB\xBF/, '')
      string
    end
  end
end
|
1
spec/fixtures/encodings/iso-8859-5.txt
vendored
Normal file
1
spec/fixtures/encodings/iso-8859-5.txt
vendored
Normal file
|
@ -0,0 +1 @@
|
|||
Письменность отличается от других существующих или возможных систем символической коммуникации тем, что всегда ассоциируется с некоторым языком и устной речью на этом языке
|
BIN
spec/fixtures/encodings/utf-16be.txt
vendored
Normal file
BIN
spec/fixtures/encodings/utf-16be.txt
vendored
Normal file
Binary file not shown.
BIN
spec/fixtures/encodings/utf-16le.txt
vendored
Normal file
BIN
spec/fixtures/encodings/utf-16le.txt
vendored
Normal file
Binary file not shown.
1
spec/fixtures/encodings/utf-8-bom.txt
vendored
Normal file
1
spec/fixtures/encodings/utf-8-bom.txt
vendored
Normal file
|
@ -0,0 +1 @@
|
|||
Το σύστημα γραφής είναι ένα συμβολικό, οπτικό σύστημα καταγραφής της γλώσσας.
|
1
spec/fixtures/encodings/utf-8.txt
vendored
Normal file
1
spec/fixtures/encodings/utf-8.txt
vendored
Normal file
|
@ -0,0 +1 @@
|
|||
Το σύστημα γραφής είναι ένα συμβολικό, οπτικό σύστημα καταγραφής της γλώσσας.
|
BIN
spec/fixtures/feed/utf-16le-feed.rss
vendored
Normal file
BIN
spec/fixtures/feed/utf-16le-feed.rss
vendored
Normal file
Binary file not shown.
|
@ -136,5 +136,25 @@ describe Jobs::PollFeed do
|
|||
|
||||
include_examples 'topic creation based on the the feed'
|
||||
end
|
||||
|
||||
context 'encodings' do
|
||||
before do
|
||||
SiteSetting.feed_polling_enabled = true
|
||||
SiteSetting.feed_polling_url = 'https://blog.discourse.org/feed/atom/'
|
||||
SiteSetting.embed_by_username = 'eviltrout'
|
||||
|
||||
stub_request(:head, SiteSetting.feed_polling_url)
|
||||
end
|
||||
|
||||
it 'works with encodings other than UTF-8' do
|
||||
stub_request(:get, SiteSetting.feed_polling_url).to_return(
|
||||
body: file_from_fixtures('utf-16le-feed.rss', 'feed').read,
|
||||
headers: { "Content-Type" => "application/rss+xml" }
|
||||
)
|
||||
|
||||
expect { poller.poll_feed }.to change { Topic.count }.by(1)
|
||||
expect(Topic.last.first_post.raw).to include('<p>This is the body & content. </p>')
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
32
spec/lib/encodings_spec.rb
Normal file
32
spec/lib/encodings_spec.rb
Normal file
|
@ -0,0 +1,32 @@
|
|||
require 'rails_helper'
|
||||
|
||||
describe Encodings do
  # Loads a fixture from spec/fixtures/encodings, strips the trailing line
  # break and converts it with Encodings.to_utf8.
  def to_utf8(filename, encoding_hint = nil)
    fixture = File.read("#{Rails.root}/spec/fixtures/encodings/#{filename}")
    Encodings.to_utf8(fixture.chomp, encoding_hint: encoding_hint)
  end

  context "unicode" do
    let(:expected) { 'Το σύστημα γραφής είναι ένα συμβολικό, οπτικό σύστημα καταγραφής της γλώσσας.' }

    # Every fixture below holds the same sentence in a different Unicode
    # encoding, so each one must convert to the same UTF-8 text.
    {
      'utf-8.txt' => "UTF-8",
      'utf-8-bom.txt' => "UTF-8 with BOM",
      'utf-16le.txt' => "UTF-16LE with BOM",
      'utf-16be.txt' => "UTF-16BE with BOM"
    }.each do |fixture_name, label|
      it "correctly encodes #{label} as UTF-8" do
        expect(to_utf8(fixture_name)).to eq(expected)
      end
    end
  end

  it "correctly encodes ISO-8859-5 as UTF-8" do
    cyrillic = 'Письменность отличается от других существующих или возможных систем символической коммуникации тем, что всегда ассоциируется с некоторым языком и устной речью на этом языке'

    expect(to_utf8('iso-8859-5.txt')).to eq(cyrillic)
  end
end
|
Loading…
Reference in New Issue
Block a user