mirror of
https://github.com/discourse/discourse.git
synced 2024-11-22 13:09:18 +08:00
FIX: crawler requests exceptions for non UTF-8 user agents with invalid bytes
This commit is contained in:
parent
b757275c1e
commit
1fffb236b2
14
lib/http_user_agent_encoder.rb
Normal file
14
lib/http_user_agent_encoder.rb
Normal file
|
@ -0,0 +1,14 @@
|
|||
# frozen_string_literal: true
|
||||
|
||||
module HttpUserAgentEncoder
|
||||
def self.ensure_utf8(user_agent)
|
||||
return "" unless user_agent
|
||||
|
||||
if user_agent.encoding != Encoding::UTF_8
|
||||
user_agent = user_agent.encode("utf-8", invalid: :replace, undef: :replace)
|
||||
user_agent.scrub!
|
||||
end
|
||||
|
||||
user_agent || ""
|
||||
end
|
||||
end
|
|
@ -4,6 +4,7 @@ require "mobile_detection"
|
|||
require "crawler_detection"
|
||||
require "guardian"
|
||||
require "http_language_parser"
|
||||
require "http_user_agent_encoder"
|
||||
|
||||
module Middleware
|
||||
class AnonymousCache
|
||||
|
@ -73,6 +74,7 @@ module Middleware
|
|||
|
||||
def initialize(env, request = nil)
|
||||
@env = env
|
||||
@user_agent = HttpUserAgentEncoder.ensure_utf8(@env[USER_AGENT])
|
||||
@request = request || Rack::Request.new(@env)
|
||||
end
|
||||
|
||||
|
@ -82,7 +84,7 @@ module Middleware
|
|||
@request[Auth::DefaultCurrentUserProvider::API_KEY].nil? &&
|
||||
@env[Auth::DefaultCurrentUserProvider::USER_API_KEY].nil? &&
|
||||
@env[Auth::DefaultCurrentUserProvider::HEADER_API_KEY].nil? &&
|
||||
CrawlerDetection.is_blocked_crawler?(@env[USER_AGENT])
|
||||
CrawlerDetection.is_blocked_crawler?(@user_agent)
|
||||
end
|
||||
|
||||
# rubocop:disable Lint/BooleanSymbol
|
||||
|
@ -98,7 +100,7 @@ module Middleware
|
|||
# otherwise you get a broken params on the request
|
||||
params = {}
|
||||
|
||||
MobileDetection.resolve_mobile_view!(@env[USER_AGENT], params, session) ? :true : :false
|
||||
MobileDetection.resolve_mobile_view!(@user_agent, params, session) ? :true : :false
|
||||
end
|
||||
|
||||
@is_mobile == :true
|
||||
|
@ -126,14 +128,12 @@ module Middleware
|
|||
def is_crawler?
|
||||
@is_crawler ||=
|
||||
begin
|
||||
user_agent = @env[USER_AGENT]
|
||||
|
||||
if @env[DISCOURSE_RENDER] == "crawler" ||
|
||||
CrawlerDetection.crawler?(user_agent, @env["HTTP_VIA"])
|
||||
CrawlerDetection.crawler?(@user_agent, @env["HTTP_VIA"])
|
||||
:true
|
||||
else
|
||||
if user_agent.downcase.include?("discourse") &&
|
||||
!user_agent.downcase.include?("mobile")
|
||||
if @user_agent.downcase.include?("discourse") &&
|
||||
!@user_agent.downcase.include?("mobile")
|
||||
:true
|
||||
else
|
||||
:false
|
||||
|
@ -146,11 +146,11 @@ module Middleware
|
|||
# rubocop:enable Lint/BooleanSymbol
|
||||
|
||||
def key_is_modern_mobile_device?
|
||||
MobileDetection.modern_mobile_device?(@env[USER_AGENT]) if @env[USER_AGENT]
|
||||
MobileDetection.modern_mobile_device?(@user_agent) if @user_agent
|
||||
end
|
||||
|
||||
def key_is_old_browser?
|
||||
CrawlerDetection.show_browser_update?(@env[USER_AGENT]) if @env[USER_AGENT]
|
||||
CrawlerDetection.show_browser_update?(@user_agent) if @user_agent
|
||||
end
|
||||
|
||||
def cache_key
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
|
||||
require "method_profiler"
|
||||
require "middleware/anonymous_cache"
|
||||
require "http_user_agent_encoder"
|
||||
|
||||
class Middleware::RequestTracker
|
||||
@@detailed_request_loggers = nil
|
||||
|
@ -186,10 +187,7 @@ class Middleware::RequestTracker
|
|||
|
||||
if h[:is_crawler]
|
||||
user_agent = env["HTTP_USER_AGENT"]
|
||||
if user_agent && (user_agent.encoding != Encoding::UTF_8)
|
||||
user_agent = user_agent.encode("utf-8")
|
||||
user_agent.scrub!
|
||||
end
|
||||
user_agent = HttpUserAgentEncoder.ensure_utf8(user_agent) if user_agent
|
||||
h[:user_agent] = user_agent
|
||||
end
|
||||
|
||||
|
|
|
@ -112,6 +112,18 @@ RSpec.describe Middleware::AnonymousCache do
|
|||
expect(key1).not_to eq(key2)
|
||||
end
|
||||
|
||||
it "handles user agents with invalid bytes" do
|
||||
agent = (+"Evil Googlebot String \xc3\x28").force_encoding("ASCII")
|
||||
expect {
|
||||
key1 = new_helper("HTTP_USER_AGENT" => agent).cache_key
|
||||
key2 =
|
||||
new_helper(
|
||||
"HTTP_USER_AGENT" => agent.encode("utf-8", invalid: :replace, undef: :replace),
|
||||
).cache_key
|
||||
expect(key1).to eq(key2)
|
||||
}.not_to raise_error
|
||||
end
|
||||
|
||||
context "when cached" do
|
||||
let!(:helper) { new_helper("ANON_CACHE_DURATION" => 10) }
|
||||
|
||||
|
@ -351,6 +363,15 @@ RSpec.describe Middleware::AnonymousCache do
|
|||
|
||||
expect(@status).to eq(403)
|
||||
|
||||
expect {
|
||||
get "/",
|
||||
headers: {
|
||||
"HTTP_USER_AGENT" => (+"Evil Googlebot String \xc3\x28").force_encoding("ASCII"),
|
||||
}
|
||||
|
||||
expect(@status).to eq(403)
|
||||
}.not_to raise_error
|
||||
|
||||
get "/",
|
||||
headers: {
|
||||
"HTTP_USER_AGENT" => "Twitterbot/2.1 (+http://www.notgoogle.com/bot.html)",
|
||||
|
|
|
@ -38,6 +38,46 @@ RSpec.describe Middleware::RequestTracker do
|
|||
|
||||
expect(WebCrawlerRequest.where(user_agent: agent.encode("utf-8")).count).to eq(1)
|
||||
end
|
||||
|
||||
it "can handle rogue user agents with invalid bytes sequences" do
|
||||
agent = (+"Evil Googlebot String \xc3\x28").force_encoding("ASCII") # encode("utf-8") -> InvalidByteSequenceError
|
||||
|
||||
expect {
|
||||
middleware =
|
||||
Middleware::RequestTracker.new(
|
||||
->(env) { ["200", { "Content-Type" => "text/html" }, [""]] },
|
||||
)
|
||||
middleware.call(env("HTTP_USER_AGENT" => agent))
|
||||
|
||||
CachedCounting.flush
|
||||
|
||||
expect(
|
||||
WebCrawlerRequest.where(
|
||||
user_agent: agent.encode("utf-8", invalid: :replace, undef: :replace),
|
||||
).count,
|
||||
).to eq(1)
|
||||
}.not_to raise_error
|
||||
end
|
||||
|
||||
it "can handle rogue user agents with undefined characters in the destination encoding" do
|
||||
agent = (+"Evil Googlebot String \xc3\x28").force_encoding("ASCII-8BIT") # encode("utf-8") -> UndefinedConversionError
|
||||
|
||||
expect {
|
||||
middleware =
|
||||
Middleware::RequestTracker.new(
|
||||
->(env) { ["200", { "Content-Type" => "text/html" }, [""]] },
|
||||
)
|
||||
middleware.call(env("HTTP_USER_AGENT" => agent))
|
||||
|
||||
CachedCounting.flush
|
||||
|
||||
expect(
|
||||
WebCrawlerRequest.where(
|
||||
user_agent: agent.encode("utf-8", invalid: :replace, undef: :replace),
|
||||
).count,
|
||||
).to eq(1)
|
||||
}.not_to raise_error
|
||||
end
|
||||
end
|
||||
|
||||
describe "log_request" do
|
||||
|
|
Loading…
Reference in New Issue
Block a user