diff --git a/app/jobs/scheduled/poll_feed.rb b/app/jobs/scheduled/poll_feed.rb
index 7d242cb03a8..b044b9dfa47 100644
--- a/app/jobs/scheduled/poll_feed.rb
+++ b/app/jobs/scheduled/poll_feed.rb
@@ -88,8 +88,11 @@ module Jobs
     private
 
     def parsed_feed
-      raw_feed = fetch_rss
-      encoded_feed = Encodings.to_utf8(raw_feed)
+      # Prefer the charset declared by the server; fall back to Encodings.to_utf8
+      # when no charset was declared or try_utf8 yields nothing.
+      raw_feed, encoding = fetch_rss
+      encoded_feed = Encodings.try_utf8(raw_feed, encoding) if encoding
+      encoded_feed = Encodings.to_utf8(raw_feed, encoding_hint: encoding) unless encoded_feed
 
       return nil if encoded_feed.blank?
 
@@ -107,10 +110,24 @@ module Jobs
       feed_final_url = final_destination.resolve
       return nil unless final_destination.status == :resolved
 
-      Excon.new(feed_final_url.to_s).request(method: :get, expects: 200).body
+      response = Excon.new(feed_final_url.to_s).request(method: :get, expects: 200)
+      # Return the body together with the declared charset so the caller can decode it.
+      [response.body, detect_charset(response)]
     rescue Excon::Error::HTTPStatus
       nil
     end
+
+    # Extracts the charset parameter from the response's Content-Type header.
+    # Returns the matching Encoding, or nil when the header is missing, carries
+    # no charset parameter, or names an encoding Ruby does not know.
+    def detect_charset(response)
+      # Accept quoted values (charset="utf-8") and labels that contain
+      # underscores or dots, such as Shift_JIS.
+      charset = response.headers['Content-Type'].to_s[/charset\s*=\s*["']?([\w.:+-]+)/i, 1]
+      charset && Encoding.find(charset)
+    rescue ArgumentError
+      nil
+    end
   end
 
   class FeedTopic
diff --git a/spec/fixtures/feed/iso-8859-15-feed.rss b/spec/fixtures/feed/iso-8859-15-feed.rss
new file mode 100644
index 00000000000..8cd2628dea9
--- /dev/null
+++ b/spec/fixtures/feed/iso-8859-15-feed.rss
@@ -0,0 +1,33 @@
+<?xml version="1.0"?>
+<rss version="2.0"
+  xmlns:content="http://purl.org/rss/1.0/modules/content/"
+  xmlns:wfw="http://wellformedweb.org/CommentAPI/"
+  xmlns:dc="http://purl.org/dc/elements/1.1/"
+  xmlns:atom="http://www.w3.org/2005/Atom"
+  xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
+  xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
+  xmlns:discourse="http://discourse.org/rss/modules/discourse/"
+  >
+<channel>
+  <title>Discourse</title>
+  <atom:link href="https://blog.discourse.org/feed/" rel="self" type="application/rss+xml" />
+  <link>https://blog.discourse.org</link>
+  <description>Official blog for the open source Discourse project</description>
+  <lastBuildDate>Thu, 14 Sep 2017 15:22:33 +0000</lastBuildDate>
+  <language>en-US</language>
+  <sy:updatePeriod>hourly</sy:updatePeriod>
+  <sy:updateFrequency>1</sy:updateFrequency>
+  <generator>https://wordpress.org/?v=4.8.1</generator>
+  <item>
+    <title>Poll Feed Spec Fixture</title>
+    <link>https://blog.discourse.org/2017/09/poll-feed-spec-fixture/</link>
+    <pubDate>Thu, 14 Sep 2017 15:22:33 +0000</pubDate>
+    <dc:creator><![CDATA[xrav3nz]]></dc:creator>
+    <discourse:username><![CDATA[xrav3nz]]></discourse:username>
+    <category><![CDATA[spec]]></category>
+    <guid isPermaLink="false">https://blog.discourse.org/?p=pollfeedspec</guid>
+    <description><![CDATA[Here are some random descriptions... […]]]></description>
+    <content:encoded><![CDATA[<p>This is the body & content. 100� </p>]]></content:encoded>
+  </item>
+  </channel>
+</rss>
diff --git a/spec/jobs/poll_feed_spec.rb b/spec/jobs/poll_feed_spec.rb
index a42835ce09c..8c3e3cf22ab 100644
--- a/spec/jobs/poll_feed_spec.rb
+++ b/spec/jobs/poll_feed_spec.rb
@@ -155,6 +155,36 @@ describe Jobs::PollFeed do
         expect { poller.poll_feed }.to change { Topic.count }.by(1)
         expect(Topic.last.first_post.raw).to include('<p>This is the body & content. </p>')
       end
+
+      it 'respects the charset in the Content-Type header' do
+        stub_request(:get, SiteSetting.feed_polling_url).to_return(
+          body: file_from_fixtures('iso-8859-15-feed.rss', 'feed').read,
+          headers: { "Content-Type" => "application/rss+xml; charset=ISO-8859-15" }
+        )
+
+        expect { poller.poll_feed }.to change { Topic.count }.by(1)
+        expect(Topic.last.first_post.raw).to include('<p>This is the body & content. 100€ </p>')
+      end
+
+      it 'accepts a quoted charset in the Content-Type header' do
+        stub_request(:get, SiteSetting.feed_polling_url).to_return(
+          body: file_from_fixtures('iso-8859-15-feed.rss', 'feed').read,
+          headers: { "Content-Type" => 'application/rss+xml; charset="ISO-8859-15"' }
+        )
+
+        expect { poller.poll_feed }.to change { Topic.count }.by(1)
+        expect(Topic.last.first_post.raw).to include('<p>This is the body & content. 100€ </p>')
+      end
+
+      it 'works when the charset in the Content-Type header is unknown' do
+        stub_request(:get, SiteSetting.feed_polling_url).to_return(
+          body: file_from_fixtures('feed.rss', 'feed').read,
+          headers: { "Content-Type" => "application/rss+xml; charset=foo" }
+        )
+
+        expect { poller.poll_feed }.to change { Topic.count }.by(1)
+        expect(Topic.last.first_post.raw).to include('<p>This is the body & content. </p>')
+      end
     end
   end
 end