From 1fffb236b202dcbe92d7e0805cf06a61d9219b9e Mon Sep 17 00:00:00 2001 From: Arkshine Date: Fri, 24 May 2024 03:49:17 +0200 Subject: [PATCH] FIX: crawler requests exceptions for non UTF-8 user agents with invalid bytes --- lib/http_user_agent_encoder.rb | 14 ++++++++ lib/middleware/anonymous_cache.rb | 18 +++++----- lib/middleware/request_tracker.rb | 6 ++-- spec/lib/middleware/anonymous_cache_spec.rb | 21 +++++++++++ spec/lib/middleware/request_tracker_spec.rb | 40 +++++++++++++++++++++ 5 files changed, 86 insertions(+), 13 deletions(-) create mode 100644 lib/http_user_agent_encoder.rb diff --git a/lib/http_user_agent_encoder.rb b/lib/http_user_agent_encoder.rb new file mode 100644 index 00000000000..80ce6d29607 --- /dev/null +++ b/lib/http_user_agent_encoder.rb @@ -0,0 +1,14 @@ +# frozen_string_literal: true + +module HttpUserAgentEncoder + def self.ensure_utf8(user_agent) + return "" unless user_agent + + if user_agent.encoding != Encoding::UTF_8 + user_agent = user_agent.encode("utf-8", invalid: :replace, undef: :replace) + user_agent.scrub! + end + + user_agent || "" + end +end diff --git a/lib/middleware/anonymous_cache.rb b/lib/middleware/anonymous_cache.rb index afb4238062d..04e058c3c59 100644 --- a/lib/middleware/anonymous_cache.rb +++ b/lib/middleware/anonymous_cache.rb @@ -4,6 +4,7 @@ require "mobile_detection" require "crawler_detection" require "guardian" require "http_language_parser" +require "http_user_agent_encoder" module Middleware class AnonymousCache @@ -73,6 +74,7 @@ module Middleware def initialize(env, request = nil) @env = env + @user_agent = HttpUserAgentEncoder.ensure_utf8(@env[USER_AGENT]) @request = request || Rack::Request.new(@env) end @@ -82,7 +84,7 @@ module Middleware @request[Auth::DefaultCurrentUserProvider::API_KEY].nil? && @env[Auth::DefaultCurrentUserProvider::USER_API_KEY].nil? && @env[Auth::DefaultCurrentUserProvider::HEADER_API_KEY].nil? && - CrawlerDetection.is_blocked_crawler?(@env[USER_AGENT]) + CrawlerDetection.is_blocked_crawler?(@user_agent) end # rubocop:disable Lint/BooleanSymbol @@ -98,7 +100,7 @@ module Middleware # otherwise you get a broken params on the request params = {} - MobileDetection.resolve_mobile_view!(@env[USER_AGENT], params, session) ? :true : :false + MobileDetection.resolve_mobile_view!(@user_agent, params, session) ? :true : :false end @is_mobile == :true @@ -126,14 +128,12 @@ module Middleware def is_crawler? @is_crawler ||= begin - user_agent = @env[USER_AGENT] - if @env[DISCOURSE_RENDER] == "crawler" || - CrawlerDetection.crawler?(user_agent, @env["HTTP_VIA"]) + CrawlerDetection.crawler?(@user_agent, @env["HTTP_VIA"]) :true else - if user_agent.downcase.include?("discourse") && - !user_agent.downcase.include?("mobile") + if @user_agent.downcase.include?("discourse") && + !@user_agent.downcase.include?("mobile") :true else :false @@ -146,11 +146,11 @@ module Middleware # rubocop:enable Lint/BooleanSymbol def key_is_modern_mobile_device? - MobileDetection.modern_mobile_device?(@env[USER_AGENT]) if @env[USER_AGENT] + MobileDetection.modern_mobile_device?(@user_agent) if @user_agent end def key_is_old_browser? - CrawlerDetection.show_browser_update?(@env[USER_AGENT]) if @env[USER_AGENT] + CrawlerDetection.show_browser_update?(@user_agent) if @user_agent end def cache_key diff --git a/lib/middleware/request_tracker.rb b/lib/middleware/request_tracker.rb index 43d411e32c9..039f9b71327 100644 --- a/lib/middleware/request_tracker.rb +++ b/lib/middleware/request_tracker.rb @@ -2,6 +2,7 @@ require "method_profiler" require "middleware/anonymous_cache" +require "http_user_agent_encoder" class Middleware::RequestTracker @@detailed_request_loggers = nil @@ -186,10 +187,7 @@ class Middleware::RequestTracker if h[:is_crawler] user_agent = env["HTTP_USER_AGENT"] - if user_agent && (user_agent.encoding != Encoding::UTF_8) - user_agent = user_agent.encode("utf-8") - user_agent.scrub! - end + user_agent = HttpUserAgentEncoder.ensure_utf8(user_agent) if user_agent h[:user_agent] = user_agent end diff --git a/spec/lib/middleware/anonymous_cache_spec.rb b/spec/lib/middleware/anonymous_cache_spec.rb index 33b52642daa..b03cc807132 100644 --- a/spec/lib/middleware/anonymous_cache_spec.rb +++ b/spec/lib/middleware/anonymous_cache_spec.rb @@ -112,6 +112,18 @@ RSpec.describe Middleware::AnonymousCache do expect(key1).not_to eq(key2) end + it "handles user agents with invalid bytes" do + agent = (+"Evil Googlebot String \xc3\x28").force_encoding("ASCII") + expect { + key1 = new_helper("HTTP_USER_AGENT" => agent).cache_key + key2 = + new_helper( + "HTTP_USER_AGENT" => agent.encode("utf-8", invalid: :replace, undef: :replace), + ).cache_key + expect(key1).to eq(key2) + }.not_to raise_error + end + context "when cached" do let!(:helper) { new_helper("ANON_CACHE_DURATION" => 10) } @@ -351,6 +363,15 @@ RSpec.describe Middleware::AnonymousCache do expect(@status).to eq(403) + expect { + get "/", + headers: { + "HTTP_USER_AGENT" => (+"Evil Googlebot String \xc3\x28").force_encoding("ASCII"), + } + + expect(@status).to eq(403) + }.not_to raise_error + get "/", headers: { "HTTP_USER_AGENT" => "Twitterbot/2.1 (+http://www.notgoogle.com/bot.html)", diff --git a/spec/lib/middleware/request_tracker_spec.rb b/spec/lib/middleware/request_tracker_spec.rb index 190f97e9c95..5fd4e37357d 100644 --- a/spec/lib/middleware/request_tracker_spec.rb +++ b/spec/lib/middleware/request_tracker_spec.rb @@ -38,6 +38,46 @@ RSpec.describe Middleware::RequestTracker do expect(WebCrawlerRequest.where(user_agent: agent.encode("utf-8")).count).to eq(1) end + + it "can handle rogue user agents with invalid bytes sequences" do + agent = (+"Evil Googlebot String \xc3\x28").force_encoding("ASCII") # encode("utf-8") -> InvalidByteSequenceError + + expect { + middleware = + Middleware::RequestTracker.new( + ->(env) { ["200", { "Content-Type" => "text/html" }, [""]] }, + ) + middleware.call(env("HTTP_USER_AGENT" => agent)) + + CachedCounting.flush + + expect( + WebCrawlerRequest.where( + user_agent: agent.encode("utf-8", invalid: :replace, undef: :replace), + ).count, + ).to eq(1) + }.not_to raise_error + end + + it "can handle rogue user agents with undefined characters in the destination encoding" do + agent = (+"Evil Googlebot String \xc3\x28").force_encoding("ASCII-8BIT") # encode("utf-8") -> UndefinedConversionError + + expect { + middleware = + Middleware::RequestTracker.new( + ->(env) { ["200", { "Content-Type" => "text/html" }, [""]] }, + ) + middleware.call(env("HTTP_USER_AGENT" => agent)) + + CachedCounting.flush + + expect( + WebCrawlerRequest.where( + user_agent: agent.encode("utf-8", invalid: :replace, undef: :replace), + ).count, + ).to eq(1) + }.not_to raise_error + end end describe "log_request" do