From 644dded000eb2f7393bef03ffda61758f1a2695b Mon Sep 17 00:00:00 2001 From: Blake Erickson Date: Tue, 13 Jun 2023 11:08:08 -0600 Subject: [PATCH] SECURITY: Use canonical url for topic embeddings (#22085) This prevents duplicate topics from being created when using embed_urls that only differ on query params. --- app/models/topic_embed.rb | 23 +++++++++++++---- spec/models/topic_embed_spec.rb | 45 ++++++++++++++++++++++++++++++++- 2 files changed, 62 insertions(+), 6 deletions(-) diff --git a/app/models/topic_embed.rb b/app/models/topic_embed.rb index bb32d3ab4db..3ac5de7d8f8 100644 --- a/app/models/topic_embed.rb +++ b/app/models/topic_embed.rb @@ -21,10 +21,14 @@ class TopicEmbed < ActiveRecord::Base end class FetchResponse - attr_accessor :title, :body, :author + attr_accessor :title, :body, :author, :url end def self.normalize_url(url) + # downcase + # remove trailing forward slash/ + # remove consecutive hyphens + # remove leading and trailing whitespace url.downcase.sub(%r{/\z}, "").sub(/\-+/, "-").strip end @@ -45,7 +49,7 @@ class TopicEmbed < ActiveRecord::Base url = normalize_url(url) - embed = TopicEmbed.find_by("lower(embed_url) = ?", url) + embed = topic_embed_by_url(url) content_sha1 = Digest::SHA1.hexdigest(contents) post = nil @@ -127,7 +131,7 @@ class TopicEmbed < ActiveRecord::Base return end - parse_html(html, url) + parse_html(html, uri.to_s) end def self.parse_html(html, url) @@ -151,6 +155,9 @@ class TopicEmbed < ActiveRecord::Base response = FetchResponse.new raw_doc = Nokogiri.HTML5(html) + + response.url = url + auth_element = raw_doc.at('meta[@name="discourse-username"]') || raw_doc.at('meta[@name="author"]') if auth_element.present? @@ -217,6 +224,7 @@ class TopicEmbed < ActiveRecord::Base response.title = opts[:title] if opts[:title].present? import_user = opts[:user] if opts[:user].present? import_user = response.author if response.author.present? + url = normalize_url(response.url) if response.url.present? TopicEmbed.import(import_user, url, response.title, response.body) end @@ -260,9 +268,14 @@ class TopicEmbed < ActiveRecord::Base fragment.at("div").inner_html end - def self.topic_id_for_embed(embed_url) + def self.topic_embed_by_url(embed_url) embed_url = normalize_url(embed_url).sub(%r{\Ahttps?\://}, "") - TopicEmbed.where("embed_url ~* ?", "^https?://#{Regexp.escape(embed_url)}$").pick(:topic_id) + TopicEmbed.where("embed_url ~* ?", "^https?://#{Regexp.escape(embed_url)}$").first + end + + def self.topic_id_for_embed(embed_url) + topic_embed = topic_embed_by_url(embed_url) + topic_embed&.topic_id end def self.first_paragraph_from(html) diff --git a/spec/models/topic_embed_spec.rb b/spec/models/topic_embed_spec.rb index ede62d5c7c4..91aa671cb7b 100644 --- a/spec/models/topic_embed_spec.rb +++ b/spec/models/topic_embed_spec.rb @@ -158,6 +158,17 @@ RSpec.describe TopicEmbed do expect(imported_post.topic.category).to eq(category) end + it "does not create duplicate topics with different protocols in the embed_url" do + Jobs.run_immediately! + expect { + TopicEmbed.import(user, "http://eviltrout.com/abcd", title, "some random content") + }.to change { Topic.all.count }.by(1) + + expect { + TopicEmbed.import(user, "https://eviltrout.com/abcd", title, "some random content") + }.to_not change { Topic.all.count } + end + it "creates the topic with the tag passed as a parameter" do Jobs.run_immediately! SiteSetting.tagging_enabled = true @@ -430,21 +441,53 @@ RSpec.describe TopicEmbed do end context "with canonical links" do + fab!(:user) { Fabricate(:user) } + let(:title) { "How to turn a fish from good to evil in 30 seconds" } let(:url) { "http://eviltrout.com/123?asd" } let(:canonical_url) { "http://eviltrout.com/123" } + let(:url2) { "http://eviltrout.com/blog?post=1&canonical=false" } + let(:canonical_url2) { "http://eviltrout.com/blog?post=1" } let(:content) { "" } + let(:content2) { "" } let(:canonical_content) { "Canonical" } before do stub_request(:get, url).to_return(status: 200, body: content) stub_request(:head, canonical_url) stub_request(:get, canonical_url).to_return(status: 200, body: canonical_content) + + stub_request(:get, url2).to_return(status: 200, body: content2) + stub_request(:head, canonical_url2) + stub_request(:get, canonical_url2).to_return(status: 200, body: canonical_content) end - it "a" do + it "fetches canonical content" do response = TopicEmbed.find_remote(url) expect(response.title).to eq("Canonical") + expect(response.url).to eq(canonical_url) + end + + it "does not create duplicate topics when url differs from canonical_url" do + Jobs.run_immediately! + expect { TopicEmbed.import_remote(canonical_url, { title: title, user: user }) }.to change { + Topic.all.count + }.by(1) + + expect { TopicEmbed.import_remote(url, { title: title, user: user }) }.to_not change { + Topic.all.count + } + end + + it "does not create duplicate topics when url contains extra params" do + Jobs.run_immediately! + expect { + TopicEmbed.import_remote(canonical_url2, { title: title, user: user }) + }.to change { Topic.all.count }.by(1) + + expect { TopicEmbed.import_remote(url2, { title: title, user: user }) }.to_not change { + Topic.all.count + } end end end