From 42809f4d696208afe2df3590bb23c3b190995489 Mon Sep 17 00:00:00 2001 From: Maja Komel Date: Mon, 3 Jun 2019 04:13:32 +0200 Subject: [PATCH] FIX: use crawler layout when saving url in Wayback Machine (#7667) --- app/controllers/application_controller.rb | 4 +- lib/crawler_detection.rb | 5 +- lib/middleware/anonymous_cache.rb | 2 +- spec/components/crawler_detection_spec.rb | 10 +++- spec/requests/topics_controller_spec.rb | 68 ++++++++++++----------- 5 files changed, 50 insertions(+), 39 deletions(-) diff --git a/app/controllers/application_controller.rb b/app/controllers/application_controller.rb index 107523388e3..7e301698422 100644 --- a/app/controllers/application_controller.rb +++ b/app/controllers/application_controller.rb @@ -79,7 +79,9 @@ class ApplicationController < ActionController::Base request.user_agent && (request.content_type.blank? || request.content_type.include?('html')) && !['json', 'rss'].include?(params[:format]) && - (has_escaped_fragment? || CrawlerDetection.crawler?(request.user_agent) || params.key?("print")) + (has_escaped_fragment? || params.key?("print") || + CrawlerDetection.crawler?(request.user_agent, request.headers["HTTP_VIA"]) + ) end def perform_refresh_session diff --git a/lib/crawler_detection.rb b/lib/crawler_detection.rb index 040c339ddb0..6cd1eba5426 100644 --- a/lib/crawler_detection.rb +++ b/lib/crawler_detection.rb @@ -1,6 +1,7 @@ # frozen_string_literal: true module CrawlerDetection + WAYBACK_MACHINE_URL = "web.archive.org" def self.to_matcher(string, type: nil) escaped = string.split('|').map { |agent| Regexp.escape(agent) }.join('|') @@ -13,8 +14,8 @@ module CrawlerDetection Regexp.new(escaped, Regexp::IGNORECASE) end - def self.crawler?(user_agent) - return true if user_agent.nil? + def self.crawler?(user_agent, via_header = nil) + return true if user_agent.nil? || via_header&.include?(WAYBACK_MACHINE_URL) # this is done to avoid regenerating regexes @non_crawler_matchers ||= {} diff --git a/lib/middleware/anonymous_cache.rb b/lib/middleware/anonymous_cache.rb index 48a7648a70a..51d0e19dec5 100644 --- a/lib/middleware/anonymous_cache.rb +++ b/lib/middleware/anonymous_cache.rb @@ -62,7 +62,7 @@ module Middleware @is_crawler ||= begin user_agent = @env[USER_AGENT] - if CrawlerDetection.crawler?(user_agent) + if CrawlerDetection.crawler?(user_agent, @env["HTTP_VIA"]) :true else user_agent.downcase.include?("discourse") ? :true : :false diff --git a/spec/components/crawler_detection_spec.rb b/spec/components/crawler_detection_spec.rb index 907b968eb9c..7d5393342d4 100644 --- a/spec/components/crawler_detection_spec.rb +++ b/spec/components/crawler_detection_spec.rb @@ -5,9 +5,9 @@ require_dependency 'crawler_detection' describe CrawlerDetection do - def crawler!(s) - if (!CrawlerDetection.crawler?(s)) - raise "#{s} should be a crawler!" + def crawler!(user_agent, via = nil) + if (!CrawlerDetection.crawler?(user_agent, via)) + raise "#{user_agent} should be a crawler!" end end @@ -50,6 +50,10 @@ describe CrawlerDetection do crawler! "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)" end + it "returns true when VIA header contains 'web.archive.org'" do + crawler!("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36", "HTTP/1.0 web.archive.org (Wayback Save Page)") + end + it "returns false for non-crawler user agents" do not_crawler! "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36" not_crawler! "Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko" diff --git a/spec/requests/topics_controller_spec.rb b/spec/requests/topics_controller_spec.rb index 2964a9178d7..7825047ef50 100644 --- a/spec/requests/topics_controller_spec.rb +++ b/spec/requests/topics_controller_spec.rb @@ -2690,15 +2690,14 @@ RSpec.describe TopicsController do end context "when a crawler" do - it "renders with the crawler layout, and handles proper pagination" do - - page1_time = 3.months.ago - page2_time = 2.months.ago - page3_time = 1.month.ago + let(:topic) { Fabricate(:topic) } + let(:page1_time) { 3.months.ago } + let(:page2_time) { 2.months.ago } + let(:page3_time) { 1.month.ago } + before do freeze_time page1_time - topic = Fabricate(:topic) Fabricate(:post, topic: topic) Fabricate(:post, topic: topic) @@ -2712,33 +2711,38 @@ RSpec.describe TopicsController do # ugly, but no inteface to set this and we don't want to create # 100 posts to test this thing TopicView.stubs(:chunk_size).returns(2) - - user_agent = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" - - get topic.url, env: { "HTTP_USER_AGENT" => user_agent } - - body = response.body - - expect(body).to have_tag(:body, with: { class: 'crawler' }) - expect(body).to_not have_tag(:meta, with: { name: 'fragment' }) - expect(body).to include(' user_agent } - body = response.body - - expect(response.headers['Last-Modified']).to eq(page2_time.httpdate) - - expect(body).to include(' user_agent } - body = response.body - - expect(response.headers['Last-Modified']).to eq(page3_time.httpdate) - expect(body).to include(' user_agent, "HTTP_VIA" => via } + + body = response.body + + expect(body).to have_tag(:body, with: { class: 'crawler' }) + expect(body).to_not have_tag(:meta, with: { name: 'fragment' }) + expect(body).to include(' user_agent, "HTTP_VIA" => via } + body = response.body + + expect(response.headers['Last-Modified']).to eq(page2_time.httpdate) + + expect(body).to include(' user_agent, "HTTP_VIA" => via } + body = response.body + + expect(response.headers['Last-Modified']).to eq(page3_time.httpdate) + expect(body).to include('