diff --git a/lib/middleware/request_tracker.rb b/lib/middleware/request_tracker.rb
index 897a09a8fda..5f3a1f74583 100644
--- a/lib/middleware/request_tracker.rb
+++ b/lib/middleware/request_tracker.rb
@@ -117,7 +117,12 @@ class Middleware::RequestTracker
     }
 
     if h[:is_crawler]
-      h[:user_agent] = env['HTTP_USER_AGENT']
+      user_agent = env['HTTP_USER_AGENT']
+      if user_agent.encoding != Encoding::UTF_8
+        user_agent = user_agent.encode("utf-8")
+        user_agent.scrub!
+      end
+      h[:user_agent] = user_agent
     end
 
     if cache = headers["X-Discourse-Cached"]
diff --git a/spec/components/middleware/request_tracker_spec.rb b/spec/components/middleware/request_tracker_spec.rb
index 8faeaf079eb..d6c413cb9de 100644
--- a/spec/components/middleware/request_tracker_spec.rb
+++ b/spec/components/middleware/request_tracker_spec.rb
@@ -15,6 +15,26 @@ describe Middleware::RequestTracker do
     }.merge(opts)
   end
 
+  context "full request" do
+    before do
+      @orig = WebCrawlerRequest.autoflush
+      WebCrawlerRequest.autoflush = 1
+    end
+    after do
+      WebCrawlerRequest.autoflush = @orig
+    end
+
+    it "can handle rogue user agents" do
+      agent = (+"Evil Googlebot String \xc3\x28").force_encoding("Windows-1252")
+
+      middleware = Middleware::RequestTracker.new(->(env) { ["200", { "Content-Type" => "text/html" }, [""]] })
+      middleware.call(env("HTTP_USER_AGENT" => agent))
+
+      expect(WebCrawlerRequest.where(user_agent: agent.encode('utf-8')).count).to eq(1)
+    end
+
+  end
+
   context "log_request" do
     before do
       freeze_time Time.now
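
For readers outside the diff context, here is a minimal standalone sketch of the normalization the patch performs. The method name `normalize_user_agent` is hypothetical, introduced only for illustration; the patch inlines the equivalent logic in `Middleware::RequestTracker`'s crawler branch.

```ruby
# Hypothetical helper mirroring the patch: re-encode any non-UTF-8
# user agent to UTF-8 and scrub whatever invalid bytes remain.
def normalize_user_agent(user_agent)
  return user_agent if user_agent.nil? || user_agent.encoding == Encoding::UTF_8

  user_agent = user_agent.encode("utf-8") # transcode to UTF-8
  user_agent.scrub!                       # replace leftover invalid byte sequences
  user_agent
end

# The spec's "rogue" agent: raw bytes tagged as Windows-1252, where
# \xC3 and \x28 map to the valid characters "Ã" and "(".
agent = (+"Evil Googlebot String \xc3\x28").force_encoding("Windows-1252")
puts normalize_user_agent(agent) # => Evil Googlebot String Ã(
```

One caveat on the approach as written: `String#encode` can still raise `Encoding::UndefinedConversionError` for source bytes with no Unicode mapping (e.g. `\x81` in Windows-1252), since `scrub!` only runs after a successful conversion. Passing `invalid: :replace, undef: :replace` to `encode` would make the conversion total, at the cost of diverging slightly from the patch.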