FEATURE: Replace SimpleRSS with Ruby RSS module (#5311)

* SPEC: PollFeedJob parsing atom feed

* add FeedItemAccessor

It provides a consistent interface for accessing a feed item's tag
content.

* add FeedElementInstaller

to install non-standard and non-namespaced feed elements

* FEATURE: replace SimpleRSS with Ruby RSS module

* get FinalDestination and download with Excon

* support namespaced element with FeedElementInstaller
This commit is contained in:
Kyle Zhao 2017-12-05 18:45:09 -05:00 committed by Sam
parent 410994b7f5
commit 5f318a5241
10 changed files with 286 additions and 81 deletions

View File

@ -166,7 +166,6 @@ gem 'gc_tracer', require: false, platform: :mri
# required for feed importing and embedding
gem 'ruby-readability', require: false
gem 'simple-rss', require: false
gem 'stackprof', require: false, platform: :mri
gem 'memory_profiler', require: false, platform: :mri

View File

@ -368,7 +368,6 @@ GEM
connection_pool (~> 2.2, >= 2.2.0)
rack-protection (>= 1.5.0)
redis (>= 3.3.4, < 5)
simple-rss (1.3.1)
slop (3.6.0)
sprockets (3.7.1)
concurrent-ruby (~> 1.0)
@ -500,7 +499,6 @@ DEPENDENCIES
seed-fu
shoulda
sidekiq
simple-rss
sprockets-rails
stackprof
thor

View File

@ -2,9 +2,13 @@
# Creates and Updates Topics based on an RSS or ATOM feed.
#
require 'digest/sha1'
require 'excon'
require 'rss'
require_dependency 'feed_item_accessor'
require_dependency 'feed_element_installer'
require_dependency 'final_destination'
require_dependency 'post_creator'
require_dependency 'post_revisor'
require 'open-uri'
module Jobs
class PollFeed < Jobs::Scheduled
@ -46,17 +50,11 @@ module Jobs
def import_topic(topic)
if topic.user
TopicEmbed.import(topic.user, topic.url, topic.title, CGI.unescapeHTML(topic.content.scrub))
TopicEmbed.import(topic.user, topic.url, topic.title, CGI.unescapeHTML(topic.content))
end
end
class Feed
require 'simple-rss'
if SiteSetting.embed_username_key_from_feed.present?
SimpleRSS.item_tags << SiteSetting.embed_username_key_from_feed.to_sym
end
def initialize
@feed_url = SiteSetting.feed_polling_url
@feed_url = "http://#{@feed_url}" if @feed_url !~ /^https?\:\/\//
@ -65,7 +63,7 @@ module Jobs
def topics
feed_topics = []
rss = fetch_rss
rss = parsed_feed
return feed_topics unless rss.present?
rss.items.each do |i|
@ -78,36 +76,56 @@ module Jobs
private
def fetch_rss
SimpleRSS.parse open(@feed_url, allow_redirections: :all)
rescue OpenURI::HTTPError, SimpleRSSError
def parsed_feed
raw_feed = fetch_rss
return nil if raw_feed.blank?
if SiteSetting.embed_username_key_from_feed.present?
FeedElementInstaller.install(SiteSetting.embed_username_key_from_feed, raw_feed)
end
RSS::Parser.parse(raw_feed)
rescue RSS::NotWellFormedError, RSS::InvalidRSSError
nil
end
def fetch_rss
final_destination = FinalDestination.new(@feed_url, verbose: true)
feed_final_url = final_destination.resolve
return nil unless final_destination.status == :resolved
Excon.new(feed_final_url.to_s).request(method: :get, expects: 200).body
rescue Excon::Error::HTTPStatus
nil
end
end
class FeedTopic
def initialize(article_rss_item)
@article_rss_item = article_rss_item
@accessor = FeedItemAccessor.new(article_rss_item)
end
def url
link = @article_rss_item.link
link = @accessor.link
if url?(link)
return link
else
return @article_rss_item.id
return @accessor.element_content(:id)
end
end
def content
@article_rss_item.content_encoded&.force_encoding("UTF-8")&.scrub ||
@article_rss_item.content&.force_encoding("UTF-8")&.scrub ||
@article_rss_item.description&.force_encoding("UTF-8")&.scrub
content = nil
%i[content_encoded content description].each do |content_element_name|
content ||= @accessor.element_content(content_element_name)
end
content&.force_encoding('UTF-8')&.scrub
end
def title
@article_rss_item.title.force_encoding("UTF-8").scrub
@accessor.element_content(:title).force_encoding('UTF-8').scrub
end
def user
@ -125,11 +143,7 @@ module Jobs
end
def author_username
begin
@article_rss_item.send(SiteSetting.embed_username_key_from_feed.to_sym)
rescue
nil
end
@accessor.element_content(SiteSetting.embed_username_key_from_feed.sub(':', '_'))
end
def default_user
@ -145,9 +159,6 @@ module Jobs
def find_user(user_name)
User.where(username_lower: user_name).first
end
end
end
end

View File

@ -0,0 +1,52 @@
require 'rexml/document'
require 'rss'
# Registers a non-standard, namespaced feed element (for example
# "discourse:username") with Ruby's RSS library, so that parsed RSS items and
# Atom entries expose its text through an accessor named
# "<namespace>_<element>" (for example +discourse_username+).
#
# Instances are only created through FeedElementInstaller.install.
class FeedElementInstaller
  private_class_method :new

  # Installs parsing support for +element_name+ if it is namespaced.
  #
  # Names without a ':' are ignored. A feed that is not well-formed XML is
  # ignored as well: the namespace URI has to be read from the feed's root
  # element, so nothing can be installed for it, and raising here would leak a
  # REXML error to callers that only expect the RSS parser's own errors.
  #
  # @param element_name [String] element name as written in the feed
  # @param feed [String] raw feed XML, used to look up the namespace URI
  # @return [Object, nil] no meaningful value
  def self.install(element_name, feed)
    # RSS Specification at http://cyber.harvard.edu/rss/rss.html#extendingRss
    # > A RSS feed may contain [non-standard elements], only if those elements are *defined in a namespace*
    return unless element_name.include?(':')

    new(element_name, feed).install
  rescue REXML::ParseException
    # element_uri is evaluated before any element is registered, so a parse
    # failure leaves the RSS classes untouched.
    nil
  end

  attr_reader :feed, :original_name, :element_namespace, :element_name, :element_accessor

  # @param element_name [String] "<namespace>:<element>"
  # @param feed [String] raw feed XML
  def initialize(element_name, feed)
    @feed = feed
    @original_name = element_name
    @element_namespace, @element_name = *element_name.split(':')
    @element_accessor = "#{@element_namespace}_#{@element_name}"
  end

  # Namespace URI declared for the element's prefix on the feed's root
  # element, or an empty string when the prefix is not declared there.
  #
  # @return [String]
  # @raise [REXML::ParseException] when the feed is not well-formed XML
  def element_uri
    @element_uri ||= REXML::Document.new(feed).root&.attributes&.namespaces&.fetch(@element_namespace, '') || ''
  end

  # Registers the element for RSS items and Atom entries, skipping whichever
  # side already defines the accessor.
  def install
    install_in_rss unless installed_in_rss?
    install_in_atom unless installed_in_atom?
  end

  private

  def install_in_rss
    RSS::Rss::Channel::Item.install_text_element(element_name, element_uri, '?', element_accessor, nil, original_name)
    RSS::BaseListener.install_get_text_element(element_uri, element_name, element_accessor)
  end

  def install_in_atom
    RSS::Atom::Entry.install_text_element(element_name, element_uri, '?', element_accessor, nil, original_name)
    RSS::Atom::Feed::Entry.install_text_element(element_name, element_uri, '?', element_accessor, nil, original_name)
    RSS::BaseListener.install_get_text_element(element_uri, element_name, element_accessor)
  end

  def installed_in_rss?
    RSS::Rss::Channel::Item.method_defined?(element_accessor)
  end

  def installed_in_atom?
    RSS::Atom::Entry.method_defined?(element_accessor) || RSS::Atom::Feed::Entry.method_defined?(element_accessor)
  end
end

25
lib/feed_item_accessor.rb Normal file
View File

@ -0,0 +1,25 @@
# Reads values from a parsed feed item through one interface, whether the
# item hands back a plain value or an element object wrapping it.
class FeedItemAccessor
  attr_accessor :rss_item

  def initialize(rss_item)
    @rss_item = rss_item
  end

  # Value of the named element: its +content+ when the element offers one,
  # otherwise the element itself. nil when the item has no such element.
  def element_content(element_name)
    unwrap(read(element_name), :content)
  end

  # The item's link: its +href+ when the link element offers one, otherwise
  # the link value itself.
  def link
    unwrap(read(:link), :href)
  end

  private

  # Calls the reader called +name+ on the item, or yields nil if the item
  # does not respond to it.
  def read(name)
    return nil unless rss_item.respond_to?(name)

    rss_item.send(name)
  end

  # Returns value.reader when available, else the value unchanged.
  def unwrap(value, reader)
    return value unless value.respond_to?(reader)

    value.send(reader)
  end
end

View File

@ -0,0 +1,40 @@
require 'feed_element_installer'
require 'rails_helper'
describe FeedElementInstaller do
  # RSS and Atom fixtures must behave the same way, so the expectations are
  # written once and each group only supplies its own raw feed.
  shared_examples 'feed element installation' do
    it 'creates parsing for a non-standard, namespaced element' do
      FeedElementInstaller.install('discourse:username', raw_feed)

      first_item = RSS::Parser.parse(raw_feed).items.first

      expect(first_item.discourse_username).to eq('xrav3nz')
    end

    it 'does not create parsing for a non-standard, non-namespaced element' do
      FeedElementInstaller.install('username', raw_feed)

      first_item = RSS::Parser.parse(raw_feed).items.first

      expect { first_item.username }.to raise_error(NoMethodError)
    end
  end

  describe '#install_rss_element' do
    let(:raw_feed) { file_from_fixtures('feed.rss', 'feed').read }

    include_examples 'feed element installation'
  end

  describe '#install_atom_element' do
    let(:raw_feed) { file_from_fixtures('feed.atom', 'feed').read }

    include_examples 'feed element installation'
  end
end

View File

@ -0,0 +1,33 @@
require 'rss'
require 'feed_item_accessor'
require 'rails_helper'
describe FeedItemAccessor do
  # Each context parses its fixture without validation and wraps the first
  # item; the accessor must agree with what the parsed item reports itself.
  context 'for ATOM feed' do
    let(:feed) { RSS::Parser.parse(file_from_fixtures('feed.atom', 'feed'), false) }
    let(:entry) { feed.items.first }

    subject(:accessor) { FeedItemAccessor.new(entry) }

    describe '#element_content' do
      it { expect(accessor.element_content('title')).to eq(entry.title.content) }
    end

    describe '#link' do
      it { expect(accessor.link).to eq(entry.link.href) }
    end
  end

  context 'for RSS feed' do
    let(:feed) { RSS::Parser.parse(file_from_fixtures('feed.rss', 'feed'), false) }
    let(:entry) { feed.items.first }

    subject(:accessor) { FeedItemAccessor.new(entry) }

    describe '#element_content' do
      it { expect(accessor.element_content('title')).to eq(entry.title) }
    end

    describe '#link' do
      it { expect(accessor.link).to eq(entry.link) }
    end
  end
end

30
spec/fixtures/feed/feed.atom vendored Normal file
View File

@ -0,0 +1,30 @@
<?xml version="1.0" encoding="UTF-8"?>
<feed
xmlns="http://www.w3.org/2005/Atom"
xmlns:thr="http://purl.org/syndication/thread/1.0"
xmlns:discourse="http://discourse.org/rss/modules/discourse/"
xml:lang="en-US"
xml:base="https://blog.discourse.org/wp-atom.php"
>
<title type="text">Discourse</title>
<subtitle type="text">Official blog for the open source Discourse project</subtitle>
<updated>2017-10-23T23:45:37Z</updated>
<link rel="alternate" type="text/html" href="https://blog.discourse.org" />
<id>https://blog.discourse.org/feed/atom/</id>
<link rel="self" type="application/atom+xml" href="https://blog.discourse.org/feed/atom/" />
<generator uri="https://wordpress.org/" version="4.8.2">WordPress</generator>
<entry>
<discourse:username><![CDATA[xrav3nz]]></discourse:username>
<author>
<name>xrav3nz</name>
</author>
<title type="html"><![CDATA[Poll Feed Spec Fixture]]></title>
<link rel="alternate" type="text/html" href="https://blog.discourse.org/2017/09/poll-feed-spec-fixture/" />
<id>https://blog.discourse.org/?p=pollfeedspec</id>
<updated>2017-09-14T15:22:33Z</updated>
<published>2017-09-14T15:22:33Z</published>
<category scheme="https://blog.discourse.org" term="design" />
<summary type="html"><![CDATA[Here are some random descriptions... [&#8230;]]]></summary>
<content type="html" xml:base="https://blog.discourse.org/2017/09/poll-feed-spec-fixture/"><![CDATA[<p>This is the body &amp; content. </p>]]></content>
</entry>
</feed>

View File

@ -5,6 +5,7 @@
xmlns:atom="http://www.w3.org/2005/Atom"
xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
xmlns:discourse="http://discourse.org/rss/modules/discourse/"
>
<channel>
<title>Discourse</title>
@ -21,6 +22,7 @@
<link>https://blog.discourse.org/2017/09/poll-feed-spec-fixture/</link>
<pubDate>Thu, 14 Sep 2017 15:22:33 +0000</pubDate>
<dc:creator><![CDATA[xrav3nz]]></dc:creator>
<discourse:username><![CDATA[xrav3nz]]></discourse:username>
<category><![CDATA[spec]]></category>
<guid isPermaLink="false">https://blog.discourse.org/?p=pollfeedspec</guid>
<description><![CDATA[Here are some random descriptions... [&#8230;]]]></description>

View File

@ -40,76 +40,91 @@ describe Jobs::PollFeed do
poller.execute({})
poller.execute({})
end
end
describe '#poll_feed' do
let(:embed_by_username) { 'eviltrout' }
let(:embed_username_key_from_feed) { 'dc_creator' }
let(:embed_username_key_from_feed) { 'discourse:username' }
let!(:default_user) { Fabricate(:evil_trout) }
let!(:feed_author) { Fabricate(:user, username: 'xrav3nz', email: 'hi@bye.com') }
before do
SiteSetting.feed_polling_enabled = true
SiteSetting.feed_polling_url = 'https://blog.discourse.org/feed/'
SiteSetting.embed_by_username = embed_by_username
shared_examples 'topic creation based on the the feed' do
describe 'author username parsing' do
context 'when neither embed_by_username nor embed_username_key_from_feed is set' do
before do
SiteSetting.embed_by_username = ""
SiteSetting.embed_username_key_from_feed = ""
end
stub_request(:get, SiteSetting.feed_polling_url).to_return(
status: 200,
body: file_from_fixtures('feed.rss', 'feed').read,
headers: { "Content-Type" => "application/rss+xml" }
)
end
describe 'author username parsing' do
context 'when neither embed_by_username nor embed_username_key_from_feed is set' do
before do
SiteSetting.embed_by_username = ""
SiteSetting.embed_username_key_from_feed = ""
it 'does not import topics' do
expect { poller.poll_feed }.not_to change { Topic.count }
end
end
it 'does not import topics' do
expect { poller.poll_feed }.not_to change { Topic.count }
context 'when embed_by_username is set' do
before do
SiteSetting.embed_by_username = embed_by_username
SiteSetting.embed_username_key_from_feed = ""
end
it 'creates the new topics under embed_by_username' do
expect { poller.poll_feed }.to change { Topic.count }.by(1)
expect(Topic.last.user).to eq(default_user)
end
end
context 'when embed_username_key_from_feed is set' do
before do
SiteSetting.embed_username_key_from_feed = embed_username_key_from_feed
end
it 'creates the new topics under the username found' do
expect { poller.poll_feed }.to change { Topic.count }.by(1)
expect(Topic.last.user).to eq(feed_author)
end
end
end
context 'when embed_by_username is set' do
before do
SiteSetting.embed_by_username = embed_by_username
SiteSetting.embed_username_key_from_feed = ""
end
it 'creates the new topics under embed_by_username' do
expect { poller.poll_feed }.to change { Topic.count }.by(1)
expect(Topic.last.user).to eq(default_user)
end
end
context 'when embed_username_key_from_feed is set' do
before do
SiteSetting.embed_username_key_from_feed = embed_username_key_from_feed
end
it 'creates the new topics under the username found' do
expect { poller.poll_feed }.to change { Topic.count }.by(1)
expect(Topic.last.user).to eq(feed_author)
end
it 'parses creates a new post correctly' do
expect { poller.poll_feed }.to change { Topic.count }.by(1)
expect(Topic.last.title).to eq('Poll Feed Spec Fixture')
expect(Topic.last.first_post.raw).to include('<p>This is the body &amp; content. </p>')
expect(Topic.last.topic_embed.embed_url).to eq('https://blog.discourse.org/2017/09/poll-feed-spec-fixture')
end
end
it 'parses the title correctly' do
expect { poller.poll_feed }.to change { Topic.count }.by(1)
expect(Topic.last.title).to eq('Poll Feed Spec Fixture')
context 'when parsing RSS feed' do
before do
SiteSetting.feed_polling_enabled = true
SiteSetting.feed_polling_url = 'https://blog.discourse.org/feed/'
SiteSetting.embed_by_username = embed_by_username
stub_request(:head, SiteSetting.feed_polling_url).to_return(status: 200)
stub_request(:get, SiteSetting.feed_polling_url).to_return(
status: 200,
body: file_from_fixtures('feed.rss', 'feed').read,
headers: { "Content-Type" => "application/rss+xml" }
)
end
include_examples 'topic creation based on the the feed'
end
it 'parses the content correctly' do
expect { poller.poll_feed }.to change { Topic.count }.by(1)
expect(Topic.last.first_post.raw).to include('<p>This is the body &amp; content. </p>')
end
context 'when parsing ATOM feed' do
before do
SiteSetting.feed_polling_enabled = true
SiteSetting.feed_polling_url = 'https://blog.discourse.org/feed/atom/'
SiteSetting.embed_by_username = embed_by_username
it 'parses the link correctly' do
expect { poller.poll_feed }.to change { Topic.count }.by(1)
expect(Topic.last.topic_embed.embed_url).to eq('https://blog.discourse.org/2017/09/poll-feed-spec-fixture')
stub_request(:head, SiteSetting.feed_polling_url).to_return(status: 200)
stub_request(:get, SiteSetting.feed_polling_url).to_return(
status: 200,
body: file_from_fixtures('feed.atom', 'feed').read,
headers: { "Content-Type" => "application/atom+xml" }
)
end
include_examples 'topic creation based on the the feed'
end
end
end