FIX: crawler requests exceptions for non UTF-8 user agents with invalid bytes

This commit is contained in:
Arkshine 2024-05-24 03:49:17 +02:00 committed by Régis Hanol
parent b757275c1e
commit 1fffb236b2
5 changed files with 86 additions and 13 deletions

View File

@ -0,0 +1,14 @@
# frozen_string_literal: true
module HttpUserAgentEncoder
def self.ensure_utf8(user_agent)
return "" unless user_agent
if user_agent.encoding != Encoding::UTF_8
user_agent = user_agent.encode("utf-8", invalid: :replace, undef: :replace)
user_agent.scrub!
end
user_agent || ""
end
end

View File

@ -4,6 +4,7 @@ require "mobile_detection"
require "crawler_detection"
require "guardian"
require "http_language_parser"
require "http_user_agent_encoder"
module Middleware
class AnonymousCache
@ -73,6 +74,7 @@ module Middleware
def initialize(env, request = nil)
@env = env
@user_agent = HttpUserAgentEncoder.ensure_utf8(@env[USER_AGENT])
@request = request || Rack::Request.new(@env)
end
@ -82,7 +84,7 @@ module Middleware
@request[Auth::DefaultCurrentUserProvider::API_KEY].nil? &&
@env[Auth::DefaultCurrentUserProvider::USER_API_KEY].nil? &&
@env[Auth::DefaultCurrentUserProvider::HEADER_API_KEY].nil? &&
CrawlerDetection.is_blocked_crawler?(@env[USER_AGENT])
CrawlerDetection.is_blocked_crawler?(@user_agent)
end
# rubocop:disable Lint/BooleanSymbol
@ -98,7 +100,7 @@ module Middleware
# otherwise you get a broken params on the request
params = {}
MobileDetection.resolve_mobile_view!(@env[USER_AGENT], params, session) ? :true : :false
MobileDetection.resolve_mobile_view!(@user_agent, params, session) ? :true : :false
end
@is_mobile == :true
@ -126,14 +128,12 @@ module Middleware
def is_crawler?
@is_crawler ||=
begin
user_agent = @env[USER_AGENT]
if @env[DISCOURSE_RENDER] == "crawler" ||
CrawlerDetection.crawler?(user_agent, @env["HTTP_VIA"])
CrawlerDetection.crawler?(@user_agent, @env["HTTP_VIA"])
:true
else
if user_agent.downcase.include?("discourse") &&
!user_agent.downcase.include?("mobile")
if @user_agent.downcase.include?("discourse") &&
!@user_agent.downcase.include?("mobile")
:true
else
:false
@ -146,11 +146,11 @@ module Middleware
# rubocop:enable Lint/BooleanSymbol
def key_is_modern_mobile_device?
MobileDetection.modern_mobile_device?(@env[USER_AGENT]) if @env[USER_AGENT]
MobileDetection.modern_mobile_device?(@user_agent) if @user_agent
end
def key_is_old_browser?
CrawlerDetection.show_browser_update?(@env[USER_AGENT]) if @env[USER_AGENT]
CrawlerDetection.show_browser_update?(@user_agent) if @user_agent
end
def cache_key

View File

@ -2,6 +2,7 @@
require "method_profiler"
require "middleware/anonymous_cache"
require "http_user_agent_encoder"
class Middleware::RequestTracker
@@detailed_request_loggers = nil
@ -186,10 +187,7 @@ class Middleware::RequestTracker
if h[:is_crawler]
user_agent = env["HTTP_USER_AGENT"]
if user_agent && (user_agent.encoding != Encoding::UTF_8)
user_agent = user_agent.encode("utf-8")
user_agent.scrub!
end
user_agent = HttpUserAgentEncoder.ensure_utf8(user_agent) if user_agent
h[:user_agent] = user_agent
end

View File

@ -112,6 +112,18 @@ RSpec.describe Middleware::AnonymousCache do
expect(key1).not_to eq(key2)
end
it "handles user agents with invalid bytes" do
agent = (+"Evil Googlebot String \xc3\x28").force_encoding("ASCII")
expect {
key1 = new_helper("HTTP_USER_AGENT" => agent).cache_key
key2 =
new_helper(
"HTTP_USER_AGENT" => agent.encode("utf-8", invalid: :replace, undef: :replace),
).cache_key
expect(key1).to eq(key2)
}.not_to raise_error
end
context "when cached" do
let!(:helper) { new_helper("ANON_CACHE_DURATION" => 10) }
@ -351,6 +363,15 @@ RSpec.describe Middleware::AnonymousCache do
expect(@status).to eq(403)
expect {
get "/",
headers: {
"HTTP_USER_AGENT" => (+"Evil Googlebot String \xc3\x28").force_encoding("ASCII"),
}
expect(@status).to eq(403)
}.not_to raise_error
get "/",
headers: {
"HTTP_USER_AGENT" => "Twitterbot/2.1 (+http://www.notgoogle.com/bot.html)",

View File

@ -38,6 +38,46 @@ RSpec.describe Middleware::RequestTracker do
expect(WebCrawlerRequest.where(user_agent: agent.encode("utf-8")).count).to eq(1)
end
it "can handle rogue user agents with invalid bytes sequences" do
agent = (+"Evil Googlebot String \xc3\x28").force_encoding("ASCII") # encode("utf-8") -> InvalidByteSequenceError
expect {
middleware =
Middleware::RequestTracker.new(
->(env) { ["200", { "Content-Type" => "text/html" }, [""]] },
)
middleware.call(env("HTTP_USER_AGENT" => agent))
CachedCounting.flush
expect(
WebCrawlerRequest.where(
user_agent: agent.encode("utf-8", invalid: :replace, undef: :replace),
).count,
).to eq(1)
}.not_to raise_error
end
it "can handle rogue user agents with undefined characters in the destination encoding" do
agent = (+"Evil Googlebot String \xc3\x28").force_encoding("ASCII-8BIT") # encode("utf-8") -> UndefinedConversionError
expect {
middleware =
Middleware::RequestTracker.new(
->(env) { ["200", { "Content-Type" => "text/html" }, [""]] },
)
middleware.call(env("HTTP_USER_AGENT" => agent))
CachedCounting.flush
expect(
WebCrawlerRequest.where(
user_agent: agent.encode("utf-8", invalid: :replace, undef: :replace),
).count,
).to eq(1)
}.not_to raise_error
end
end
describe "log_request" do