From 3329484e2d64951956981bde9bff662d6ad8b412 Mon Sep 17 00:00:00 2001 From: David Taylor Date: Tue, 26 Mar 2024 15:18:46 +0000 Subject: [PATCH] FEATURE: Simplify crawler content for non-canonical post URLs (#26324) When crawlers visit a post-specific URL like `/t/-/{topic-id}/{post-number}`, we use the canonical to direct them to the appropriate crawler-optimised paginated view (e.g. `?page=3`). However, analysis of google results shows that the post-specific URLs are still being included in the index. Google doesn't tell us exactly why this is happening. However, as a general rule, 'A large portion of the duplicate page's content should be present on the canonical version'. In our previous implementation, this wasn't 100% true all the time. That's because a request for a post-specific URL would include posts 'surrounding' that post, and won't exactly conform to the page boundaries which are used in the canonical version of the page. Essentially: in some cases, the content of the post-specific pages would include many posts which were not present on the canonical paginated version. This commit aims to resolve that problem by simplifying the implementation. Instead of rendering posts surrounding the target post_number, we will only render the target post, and include a link to 'show post in topic'. With this new implementation, 100% of the post-specific page content will be present on the canonical paginated version, which will hopefully mean google reduces their indexing of the non-canonical post-specific pages. --- app/views/topics/show.html.erb | 30 ++++++---- config/locales/server.en.yml | 1 + lib/topic_view.rb | 24 +++++++- spec/requests/topics_controller_spec.rb | 77 ++++++++++++++++++------- spec/views/topics/show.html.erb_spec.rb | 4 +- 5 files changed, 97 insertions(+), 39 deletions(-) diff --git a/app/views/topics/show.html.erb b/app/views/topics/show.html.erb index d197fd33f8e..ed817508c96 100644 --- a/app/views/topics/show.html.erb +++ b/app/views/topics/show.html.erb @@ -59,7 +59,7 @@ <% end %> - <% @topic_view.posts.each do |post| %> + <% @topic_view.crawler_posts.each do |post| %> <% if (u = post.user) && !post.hidden && post.cooked && !post.cooked.strip.empty? %>
class='topic-body crawler-post'> - <% if @topic_view.prev_page || @topic_view.next_page %> - - <% end %> + <% if @topic_view.single_post_request? || @topic_view.prev_page || @topic_view.next_page %> + + <% end %> <%= build_plugin_html 'server:topic-show-after-posts-crawler' %> <% else %> @@ -148,7 +154,7 @@ <%= auto_discovery_link_tag(@topic_view, {action: :feed, slug: @topic_view.topic.slug, topic_id: @topic_view.topic.id}, rel: 'alternate nofollow', title: t('rss_posts_in_topic', topic: @topic_view.title), type: 'application/rss+xml') %> <%= raw crawlable_meta_data(title: @topic_view.title, description: @topic_view.summary(strip_images: true), image: @topic_view.image_url, read_time: @topic_view.read_time, like_count: @topic_view.like_count, ignore_canonical: true, published_time: @topic_view.published_time, breadcrumbs: @breadcrumbs, tags: @tags.map(&:name)) %> - <% if @topic_view.prev_page || @topic_view.next_page %> + <% if !@topic_view.single_post_request? && (@topic_view.prev_page || @topic_view.next_page) %> <% if @topic_view.prev_page %> <% end %> diff --git a/config/locales/server.en.yml b/config/locales/server.en.yml index 87f670f4918..ed30a71c131 100644 --- a/config/locales/server.en.yml +++ b/config/locales/server.en.yml @@ -480,6 +480,7 @@ en: is_invalid: "seems unclear, is it a complete sentence?" next_page: "next page →" prev_page: "← previous page" + show_post_in_topic: "show post in topic" page_num: "Page %{num}" crawler_content_hidden: "HTML content omitted because you are logged in or using a modern mobile device." home_title: "Home" diff --git a/lib/topic_view.rb b/lib/topic_view.rb index 1b8ab6eb140..d30a7c3a7a1 100644 --- a/lib/topic_view.rb +++ b/lib/topic_view.rb @@ -163,9 +163,15 @@ class TopicView topic_embed = topic.topic_embed return topic_embed.embed_url if topic_embed end - path = relative_url.dup - path << ((@page > 1) ? "?page=#{@page}" : "") - path + current_page_path + end + + def current_page_path + if @page > 1 + "#{relative_url}?page=#{@page}" + else + relative_url + end end def contains_gaps? @@ -265,6 +271,18 @@ class TopicView @desired_post end + def crawler_posts + if single_post_request? + [desired_post] + else + posts + end + end + + def single_post_request? + @post_number && @post_number != 1 + end + def summary(opts = {}) return nil if desired_post.blank? # TODO, this is actually quite slow, should be cached in the post table diff --git a/spec/requests/topics_controller_spec.rb b/spec/requests/topics_controller_spec.rb index 798018731ac..72fd06a49af 100644 --- a/spec/requests/topics_controller_spec.rb +++ b/spec/requests/topics_controller_spec.rb @@ -5174,27 +5174,44 @@ RSpec.describe TopicsController do end context "when a crawler" do + fab!(:page1_time) { 3.months.ago } + fab!(:page2_time) { 2.months.ago } + fab!(:page3_time) { 1.month.ago } + + fab!(:page_1_topics) do + Fabricate.times( + 20, + :post, + user: post_author2, + topic: topic, + created_at: page1_time, + updated_at: page1_time, + ) + end + + fab!(:page_2_topics) do + Fabricate.times( + 20, + :post, + user: post_author3, + topic: topic, + created_at: page2_time, + updated_at: page2_time, + ) + end + + fab!(:page_3_topics) do + Fabricate.times( + 2, + :post, + user: post_author3, + topic: topic, + created_at: page3_time, + updated_at: page3_time, + ) + end + it "renders with the crawler layout, and handles proper pagination" do - page1_time = 3.months.ago - page2_time = 2.months.ago - page3_time = 1.month.ago - - freeze_time page1_time - - Fabricate(:post, user: post_author2, topic: topic) - Fabricate(:post, user: post_author3, topic: topic) - - freeze_time page2_time - Fabricate(:post, user: post_author4, topic: topic) - Fabricate(:post, user: post_author5, topic: topic) - - freeze_time page3_time - Fabricate(:post, user: post_author6, topic: topic) - - # ugly, but no interface to set this and we don't want to create - # 100 posts to test this thing - TopicView.stubs(:chunk_size).returns(2) - user_agent = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" get topic.relative_url, env: { "HTTP_USER_AGENT" => user_agent } @@ -5215,8 +5232,8 @@ RSpec.describe TopicsController do expect(response.headers["Last-Modified"]).to eq(page2_time.httpdate) - expect(body).to include("id='post_3'") - expect(body).to include("id='post_4'") + expect(body).to include("id='post_21'") + expect(body).to include("id='post_22'") expect(body).to include('