FEATURE: Simplify crawler content for non-canonical post URLs (#26324)

When crawlers visit a post-specific URL like `/t/-/{topic-id}/{post-number}`, we use the canonical to direct them to the appropriate crawler-optimised paginated view (e.g. `?page=3`).

However, analysis of google results shows that the post-specific URLs are still being included in the index. Google doesn't tell us exactly why this is happening. However, as a general rule, 'A large portion of the duplicate page's content should be present on the canonical version'.

In our previous implementation, this wasn't 100% true all the time. That's because a request for a post-specific URL would include posts 'surrounding' that post, and won't exactly conform to the page boundaries which are used in the canonical version of the page. Essentially: in some cases, the content of the post-specific pages would include many posts which were not present on the canonical paginated version.

This commit aims to resolve that problem by simplifying the implementation. Instead of rendering posts surrounding the target post_number, we will only render the target post, and include a link to 'show post in topic'. With this new implementation, 100% of the post-specific page content will be present on the canonical paginated version, which will hopefully mean google reduces their  indexing of the non-canonical post-specific pages.
This commit is contained in:
David Taylor 2024-03-26 15:18:46 +00:00 committed by GitHub
parent 58990fb00f
commit 3329484e2d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 97 additions and 39 deletions

View File

@ -59,7 +59,7 @@
<meta itemprop='text' content='<%= @topic_view.topic.excerpt %>'>
<% end %>
<% @topic_view.posts.each do |post| %>
<% @topic_view.crawler_posts.each do |post| %>
<% if (u = post.user) && !post.hidden && post.cooked && !post.cooked.strip.empty? %>
<div id='post_<%= post.post_number %>' <%= post.is_first_post? ? "" : "itemprop='comment' itemscope itemtype='http://schema.org/Comment'".html_safe %> class='topic-body crawler-post'>
<div class='crawler-post-meta'>
@ -128,16 +128,22 @@
<% end %>
</div>
<% if @topic_view.prev_page || @topic_view.next_page %>
<div role='navigation' itemscope itemtype='http://schema.org/SiteNavigationElement' class="topic-body crawler-post">
<% if @topic_view.prev_page %>
<span itemprop='name'><%= link_to t(:prev_page), @topic_view.prev_page_path, rel: 'prev', itemprop: 'url' %></span>
<% end %>
<% if @topic_view.next_page %>
<span itemprop='name'><b><%= link_to t(:next_page), @topic_view.next_page_path, rel: 'next', itemprop: 'url' %></b></span>
<% end %>
</div>
<% end %>
<% if @topic_view.single_post_request? || @topic_view.prev_page || @topic_view.next_page %>
<div role='navigation' itemscope itemtype='http://schema.org/SiteNavigationElement' class="topic-body crawler-post">
<% if @topic_view.single_post_request? %>
<span itemprop='name'>
<%= link_to t(:show_post_in_topic), "#{@topic_view.current_page_path}#post_#{@topic_view.crawler_posts.first&.post_number}", itemprop: 'url' %>
</span>
<% else %>
<% if @topic_view.prev_page %>
<span itemprop='name'><%= link_to t(:prev_page), @topic_view.prev_page_path, rel: 'prev', itemprop: 'url' %></span>
<% end %>
<% if @topic_view.next_page %>
<span itemprop='name'><b><%= link_to t(:next_page), @topic_view.next_page_path, rel: 'next', itemprop: 'url' %></b></span>
<% end %>
<% end %>
</div>
<% end %>
<%= build_plugin_html 'server:topic-show-after-posts-crawler' %>
<% else %>
@ -148,7 +154,7 @@
<%= auto_discovery_link_tag(@topic_view, {action: :feed, slug: @topic_view.topic.slug, topic_id: @topic_view.topic.id}, rel: 'alternate nofollow', title: t('rss_posts_in_topic', topic: @topic_view.title), type: 'application/rss+xml') %>
<%= raw crawlable_meta_data(title: @topic_view.title, description: @topic_view.summary(strip_images: true), image: @topic_view.image_url, read_time: @topic_view.read_time, like_count: @topic_view.like_count, ignore_canonical: true, published_time: @topic_view.published_time, breadcrumbs: @breadcrumbs, tags: @tags.map(&:name)) %>
<% if @topic_view.prev_page || @topic_view.next_page %>
<% if !@topic_view.single_post_request? && (@topic_view.prev_page || @topic_view.next_page) %>
<% if @topic_view.prev_page %>
<link rel="prev" href="<%= @topic_view.prev_page_path -%>">
<% end %>

View File

@ -480,6 +480,7 @@ en:
is_invalid: "seems unclear, is it a complete sentence?"
next_page: "next page →"
prev_page: "← previous page"
show_post_in_topic: "show post in topic"
page_num: "Page %{num}"
crawler_content_hidden: "HTML content omitted because you are logged in or using a modern mobile device."
home_title: "Home"

View File

@ -163,9 +163,15 @@ class TopicView
topic_embed = topic.topic_embed
return topic_embed.embed_url if topic_embed
end
path = relative_url.dup
path << ((@page > 1) ? "?page=#{@page}" : "")
path
current_page_path
end
def current_page_path
if @page > 1
"#{relative_url}?page=#{@page}"
else
relative_url
end
end
def contains_gaps?
@ -265,6 +271,18 @@ class TopicView
@desired_post
end
def crawler_posts
if single_post_request?
[desired_post]
else
posts
end
end
def single_post_request?
@post_number && @post_number != 1
end
def summary(opts = {})
return nil if desired_post.blank?
# TODO, this is actually quite slow, should be cached in the post table

View File

@ -5174,27 +5174,44 @@ RSpec.describe TopicsController do
end
context "when a crawler" do
fab!(:page1_time) { 3.months.ago }
fab!(:page2_time) { 2.months.ago }
fab!(:page3_time) { 1.month.ago }
fab!(:page_1_topics) do
Fabricate.times(
20,
:post,
user: post_author2,
topic: topic,
created_at: page1_time,
updated_at: page1_time,
)
end
fab!(:page_2_topics) do
Fabricate.times(
20,
:post,
user: post_author3,
topic: topic,
created_at: page2_time,
updated_at: page2_time,
)
end
fab!(:page_3_topics) do
Fabricate.times(
2,
:post,
user: post_author3,
topic: topic,
created_at: page3_time,
updated_at: page3_time,
)
end
it "renders with the crawler layout, and handles proper pagination" do
page1_time = 3.months.ago
page2_time = 2.months.ago
page3_time = 1.month.ago
freeze_time page1_time
Fabricate(:post, user: post_author2, topic: topic)
Fabricate(:post, user: post_author3, topic: topic)
freeze_time page2_time
Fabricate(:post, user: post_author4, topic: topic)
Fabricate(:post, user: post_author5, topic: topic)
freeze_time page3_time
Fabricate(:post, user: post_author6, topic: topic)
# ugly, but no interface to set this and we don't want to create
# 100 posts to test this thing
TopicView.stubs(:chunk_size).returns(2)
user_agent = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
get topic.relative_url, env: { "HTTP_USER_AGENT" => user_agent }
@ -5215,8 +5232,8 @@ RSpec.describe TopicsController do
expect(response.headers["Last-Modified"]).to eq(page2_time.httpdate)
expect(body).to include("id='post_3'")
expect(body).to include("id='post_4'")
expect(body).to include("id='post_21'")
expect(body).to include("id='post_22'")
expect(body).to include('<link rel="prev" href="' + topic.relative_url)
expect(body).to include('<link rel="next" href="' + topic.relative_url + "?page=3")
@ -5228,6 +5245,22 @@ RSpec.describe TopicsController do
expect(body).to include('<link rel="prev" href="' + topic.relative_url + "?page=2")
end
it "only renders one post for non-canonical post-specific URLs" do
get "#{topic.relative_url}/24"
expect(response.body).to have_tag("#post_24")
expect(response.body).not_to have_tag("#post_23")
expect(response.body).not_to have_tag("#post_25")
expect(response.body).not_to have_tag("a", with: { rel: "next" })
expect(response.body).not_to have_tag("a", with: { rel: "prev" })
expect(response.body).to have_tag(
"a",
text: I18n.t("show_post_in_topic"),
with: {
href: "#{topic.relative_url}?page=2#post_24",
},
)
end
context "with canonical_url" do
fab!(:topic_embed) { Fabricate(:topic_embed, embed_url: "https://markvanlan.com") }
let!(:user_agent) do

View File

@ -8,7 +8,7 @@ RSpec.describe "topics/show.html.erb" do
it "uses subfolder-safe category url" do
set_subfolder "/subpath"
topic_view = OpenStruct.new(topic: topic, posts: [])
topic_view = OpenStruct.new(topic: topic, posts: [], crawler_posts: [])
topic_view.stubs(:summary).returns("")
view.stubs(:crawler_layout?).returns(false)
assign(:topic_view, topic_view)
@ -21,7 +21,7 @@ RSpec.describe "topics/show.html.erb" do
end
it "add nofollow to RSS alternate link for topic" do
topic_view = OpenStruct.new(topic: topic, posts: [])
topic_view = OpenStruct.new(topic: topic, posts: [], crawler_posts: [])
topic_view.stubs(:summary).returns("")
view.stubs(:crawler_layout?).returns(false)
view.stubs(:url_for).returns("https://www.example.com/test.rss")