mirror of
https://github.com/discourse/discourse.git
synced 2024-11-22 13:09:18 +08:00
FIX: crawler requests exceptions for non UTF-8 user agents with invalid bytes
This commit is contained in:
parent
b757275c1e
commit
1fffb236b2
14
lib/http_user_agent_encoder.rb
Normal file
14
lib/http_user_agent_encoder.rb
Normal file
|
@ -0,0 +1,14 @@
|
||||||
|
# frozen_string_literal: true
|
||||||
|
|
||||||
|
module HttpUserAgentEncoder
|
||||||
|
def self.ensure_utf8(user_agent)
|
||||||
|
return "" unless user_agent
|
||||||
|
|
||||||
|
if user_agent.encoding != Encoding::UTF_8
|
||||||
|
user_agent = user_agent.encode("utf-8", invalid: :replace, undef: :replace)
|
||||||
|
user_agent.scrub!
|
||||||
|
end
|
||||||
|
|
||||||
|
user_agent || ""
|
||||||
|
end
|
||||||
|
end
|
|
@ -4,6 +4,7 @@ require "mobile_detection"
|
||||||
require "crawler_detection"
|
require "crawler_detection"
|
||||||
require "guardian"
|
require "guardian"
|
||||||
require "http_language_parser"
|
require "http_language_parser"
|
||||||
|
require "http_user_agent_encoder"
|
||||||
|
|
||||||
module Middleware
|
module Middleware
|
||||||
class AnonymousCache
|
class AnonymousCache
|
||||||
|
@ -73,6 +74,7 @@ module Middleware
|
||||||
|
|
||||||
def initialize(env, request = nil)
|
def initialize(env, request = nil)
|
||||||
@env = env
|
@env = env
|
||||||
|
@user_agent = HttpUserAgentEncoder.ensure_utf8(@env[USER_AGENT])
|
||||||
@request = request || Rack::Request.new(@env)
|
@request = request || Rack::Request.new(@env)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -82,7 +84,7 @@ module Middleware
|
||||||
@request[Auth::DefaultCurrentUserProvider::API_KEY].nil? &&
|
@request[Auth::DefaultCurrentUserProvider::API_KEY].nil? &&
|
||||||
@env[Auth::DefaultCurrentUserProvider::USER_API_KEY].nil? &&
|
@env[Auth::DefaultCurrentUserProvider::USER_API_KEY].nil? &&
|
||||||
@env[Auth::DefaultCurrentUserProvider::HEADER_API_KEY].nil? &&
|
@env[Auth::DefaultCurrentUserProvider::HEADER_API_KEY].nil? &&
|
||||||
CrawlerDetection.is_blocked_crawler?(@env[USER_AGENT])
|
CrawlerDetection.is_blocked_crawler?(@user_agent)
|
||||||
end
|
end
|
||||||
|
|
||||||
# rubocop:disable Lint/BooleanSymbol
|
# rubocop:disable Lint/BooleanSymbol
|
||||||
|
@ -98,7 +100,7 @@ module Middleware
|
||||||
# otherwise you get a broken params on the request
|
# otherwise you get a broken params on the request
|
||||||
params = {}
|
params = {}
|
||||||
|
|
||||||
MobileDetection.resolve_mobile_view!(@env[USER_AGENT], params, session) ? :true : :false
|
MobileDetection.resolve_mobile_view!(@user_agent, params, session) ? :true : :false
|
||||||
end
|
end
|
||||||
|
|
||||||
@is_mobile == :true
|
@is_mobile == :true
|
||||||
|
@ -126,14 +128,12 @@ module Middleware
|
||||||
def is_crawler?
|
def is_crawler?
|
||||||
@is_crawler ||=
|
@is_crawler ||=
|
||||||
begin
|
begin
|
||||||
user_agent = @env[USER_AGENT]
|
|
||||||
|
|
||||||
if @env[DISCOURSE_RENDER] == "crawler" ||
|
if @env[DISCOURSE_RENDER] == "crawler" ||
|
||||||
CrawlerDetection.crawler?(user_agent, @env["HTTP_VIA"])
|
CrawlerDetection.crawler?(@user_agent, @env["HTTP_VIA"])
|
||||||
:true
|
:true
|
||||||
else
|
else
|
||||||
if user_agent.downcase.include?("discourse") &&
|
if @user_agent.downcase.include?("discourse") &&
|
||||||
!user_agent.downcase.include?("mobile")
|
!@user_agent.downcase.include?("mobile")
|
||||||
:true
|
:true
|
||||||
else
|
else
|
||||||
:false
|
:false
|
||||||
|
@ -146,11 +146,11 @@ module Middleware
|
||||||
# rubocop:enable Lint/BooleanSymbol
|
# rubocop:enable Lint/BooleanSymbol
|
||||||
|
|
||||||
def key_is_modern_mobile_device?
|
def key_is_modern_mobile_device?
|
||||||
MobileDetection.modern_mobile_device?(@env[USER_AGENT]) if @env[USER_AGENT]
|
MobileDetection.modern_mobile_device?(@user_agent) if @user_agent
|
||||||
end
|
end
|
||||||
|
|
||||||
def key_is_old_browser?
|
def key_is_old_browser?
|
||||||
CrawlerDetection.show_browser_update?(@env[USER_AGENT]) if @env[USER_AGENT]
|
CrawlerDetection.show_browser_update?(@user_agent) if @user_agent
|
||||||
end
|
end
|
||||||
|
|
||||||
def cache_key
|
def cache_key
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
|
|
||||||
require "method_profiler"
|
require "method_profiler"
|
||||||
require "middleware/anonymous_cache"
|
require "middleware/anonymous_cache"
|
||||||
|
require "http_user_agent_encoder"
|
||||||
|
|
||||||
class Middleware::RequestTracker
|
class Middleware::RequestTracker
|
||||||
@@detailed_request_loggers = nil
|
@@detailed_request_loggers = nil
|
||||||
|
@ -186,10 +187,7 @@ class Middleware::RequestTracker
|
||||||
|
|
||||||
if h[:is_crawler]
|
if h[:is_crawler]
|
||||||
user_agent = env["HTTP_USER_AGENT"]
|
user_agent = env["HTTP_USER_AGENT"]
|
||||||
if user_agent && (user_agent.encoding != Encoding::UTF_8)
|
user_agent = HttpUserAgentEncoder.ensure_utf8(user_agent) if user_agent
|
||||||
user_agent = user_agent.encode("utf-8")
|
|
||||||
user_agent.scrub!
|
|
||||||
end
|
|
||||||
h[:user_agent] = user_agent
|
h[:user_agent] = user_agent
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
|
@ -112,6 +112,18 @@ RSpec.describe Middleware::AnonymousCache do
|
||||||
expect(key1).not_to eq(key2)
|
expect(key1).not_to eq(key2)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
it "handles user agents with invalid bytes" do
|
||||||
|
agent = (+"Evil Googlebot String \xc3\x28").force_encoding("ASCII")
|
||||||
|
expect {
|
||||||
|
key1 = new_helper("HTTP_USER_AGENT" => agent).cache_key
|
||||||
|
key2 =
|
||||||
|
new_helper(
|
||||||
|
"HTTP_USER_AGENT" => agent.encode("utf-8", invalid: :replace, undef: :replace),
|
||||||
|
).cache_key
|
||||||
|
expect(key1).to eq(key2)
|
||||||
|
}.not_to raise_error
|
||||||
|
end
|
||||||
|
|
||||||
context "when cached" do
|
context "when cached" do
|
||||||
let!(:helper) { new_helper("ANON_CACHE_DURATION" => 10) }
|
let!(:helper) { new_helper("ANON_CACHE_DURATION" => 10) }
|
||||||
|
|
||||||
|
@ -351,6 +363,15 @@ RSpec.describe Middleware::AnonymousCache do
|
||||||
|
|
||||||
expect(@status).to eq(403)
|
expect(@status).to eq(403)
|
||||||
|
|
||||||
|
expect {
|
||||||
|
get "/",
|
||||||
|
headers: {
|
||||||
|
"HTTP_USER_AGENT" => (+"Evil Googlebot String \xc3\x28").force_encoding("ASCII"),
|
||||||
|
}
|
||||||
|
|
||||||
|
expect(@status).to eq(403)
|
||||||
|
}.not_to raise_error
|
||||||
|
|
||||||
get "/",
|
get "/",
|
||||||
headers: {
|
headers: {
|
||||||
"HTTP_USER_AGENT" => "Twitterbot/2.1 (+http://www.notgoogle.com/bot.html)",
|
"HTTP_USER_AGENT" => "Twitterbot/2.1 (+http://www.notgoogle.com/bot.html)",
|
||||||
|
|
|
@ -38,6 +38,46 @@ RSpec.describe Middleware::RequestTracker do
|
||||||
|
|
||||||
expect(WebCrawlerRequest.where(user_agent: agent.encode("utf-8")).count).to eq(1)
|
expect(WebCrawlerRequest.where(user_agent: agent.encode("utf-8")).count).to eq(1)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
it "can handle rogue user agents with invalid bytes sequences" do
|
||||||
|
agent = (+"Evil Googlebot String \xc3\x28").force_encoding("ASCII") # encode("utf-8") -> InvalidByteSequenceError
|
||||||
|
|
||||||
|
expect {
|
||||||
|
middleware =
|
||||||
|
Middleware::RequestTracker.new(
|
||||||
|
->(env) { ["200", { "Content-Type" => "text/html" }, [""]] },
|
||||||
|
)
|
||||||
|
middleware.call(env("HTTP_USER_AGENT" => agent))
|
||||||
|
|
||||||
|
CachedCounting.flush
|
||||||
|
|
||||||
|
expect(
|
||||||
|
WebCrawlerRequest.where(
|
||||||
|
user_agent: agent.encode("utf-8", invalid: :replace, undef: :replace),
|
||||||
|
).count,
|
||||||
|
).to eq(1)
|
||||||
|
}.not_to raise_error
|
||||||
|
end
|
||||||
|
|
||||||
|
it "can handle rogue user agents with undefined characters in the destination encoding" do
|
||||||
|
agent = (+"Evil Googlebot String \xc3\x28").force_encoding("ASCII-8BIT") # encode("utf-8") -> UndefinedConversionError
|
||||||
|
|
||||||
|
expect {
|
||||||
|
middleware =
|
||||||
|
Middleware::RequestTracker.new(
|
||||||
|
->(env) { ["200", { "Content-Type" => "text/html" }, [""]] },
|
||||||
|
)
|
||||||
|
middleware.call(env("HTTP_USER_AGENT" => agent))
|
||||||
|
|
||||||
|
CachedCounting.flush
|
||||||
|
|
||||||
|
expect(
|
||||||
|
WebCrawlerRequest.where(
|
||||||
|
user_agent: agent.encode("utf-8", invalid: :replace, undef: :replace),
|
||||||
|
).count,
|
||||||
|
).to eq(1)
|
||||||
|
}.not_to raise_error
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
describe "log_request" do
|
describe "log_request" do
|
||||||
|
|
Loading…
Reference in New Issue
Block a user