FEATURE: Add experimental tracking of 'real browser' pageviews (#26647)

Our 'page_view_crawler' / 'page_view_anon' metrics are based purely on the User Agent sent by clients. This means that 'badly behaved' bots which are imitating real user agents are counted towards 'anon' page views.

This commit introduces a new method of tracking visitors. When an initial HTML request is made, we assume it is a 'non-browser' request (i.e. a bot). Then, once the JS application has booted, we notify the server to count it as a 'browser' request. This reliance on a JavaScript-capable browser matches up more closely to dedicated analytics systems like Google Analytics.

Existing data collection and graphs are unchanged. Data collected via the new technique is available in a new 'experimental' report.
This commit is contained in:
David Taylor 2024-04-25 11:00:01 +01:00 committed by GitHub
parent 52e8d57293
commit 2f2da72747
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
16 changed files with 412 additions and 7 deletions

View File

@ -7,6 +7,12 @@ import getURL from "discourse-common/lib/get-url";
const LONG_POLL_AFTER_UNSEEN_TIME = 1200000; // 20 minutes
let _sendDeferredPageview = false;
export function sendDeferredPageview() {
_sendDeferredPageview = true;
}
function mbAjax(messageBus, opts) {
opts.headers ||= {};
@ -22,6 +28,11 @@ function mbAjax(messageBus, opts) {
opts.headers["Discourse-Present"] = "true";
}
if (_sendDeferredPageview) {
opts.headers["Discourse-Deferred-Track-View"] = "true";
_sendDeferredPageview = false;
}
const oldComplete = opts.complete;
opts.complete = function (xhr, stat) {
handleLogoff(xhr);

View File

@ -4,11 +4,20 @@ import {
resetPageTracking,
startPageTracking,
} from "discourse/lib/page-tracker";
import { sendDeferredPageview } from "./message-bus";
export default {
after: "inject-objects",
before: "message-bus",
initialize(owner) {
const isErrorPage =
document.querySelector("meta#discourse-error")?.dataset.discourseError ===
"true";
if (!isErrorPage) {
sendDeferredPageview();
}
// Tell our AJAX system to track a page transition
// eslint-disable-next-line ember/no-private-routing-service
const router = owner.lookup("router:main");

View File

@ -0,0 +1,17 @@
document.addEventListener("DOMContentLoaded", function () {
const isErrorPage =
document.querySelector("meta#discourse-error")?.dataset.discourseError ===
"true";
if (!isErrorPage) {
const root =
document.querySelector("meta[name=discourse-base-uri]")?.content || "";
fetch(`${root}/pageview`, {
method: "POST",
headers: {
"Discourse-Deferred-Track-View": "true",
},
});
}
});

View File

@ -0,0 +1,13 @@
# frozen_string_literal: true
class PageviewController < ApplicationController
skip_before_action :check_xhr,
:redirect_to_login_if_required,
:preload_json,
:verify_authenticity_token
def index
# pageview tracking is handled by middleware
render plain: "ok"
end
end

View File

@ -14,6 +14,10 @@ class ApplicationRequest < ActiveRecord::Base
page_view_anon_mobile
api
user_api
page_view_anon_browser
page_view_anon_browser_mobile
page_view_logged_in_browser
page_view_logged_in_browser_mobile
]
include CachedCounting

View File

@ -0,0 +1,75 @@
# frozen_string_literal: true
module Reports::ConsolidatedPageViewsBrowserDetection
extend ActiveSupport::Concern
class_methods do
def report_consolidated_page_views_browser_detection(report)
report.modes = [:stacked_chart]
data =
DB.query(
<<~SQL,
SELECT
date,
SUM(CASE WHEN req_type = :page_view_logged_in_browser THEN count ELSE 0 END) AS page_view_logged_in_browser,
SUM(CASE WHEN req_type = :page_view_anon_browser THEN count ELSE 0 END) AS page_view_anon_browser,
SUM(CASE WHEN req_type = :page_view_crawler THEN count ELSE 0 END) AS page_view_crawler,
SUM(
CASE WHEN req_type = :page_view_anon THEN count
WHEN req_type = :page_view_logged_in THEN count
WHEN req_type = :page_view_anon_browser THEN -count
WHEN req_type = :page_view_logged_in_browser THEN -count
ELSE 0
END
) AS page_view_other
FROM application_requests
WHERE date >= :start_date AND date <= :end_date
GROUP BY date
ORDER BY date ASC
SQL
start_date: report.start_date,
end_date: report.end_date,
page_view_anon: ApplicationRequest.req_types[:page_view_anon],
page_view_crawler: ApplicationRequest.req_types[:page_view_crawler],
page_view_logged_in: ApplicationRequest.req_types[:page_view_logged_in],
page_view_anon_browser: ApplicationRequest.req_types[:page_view_anon_browser],
page_view_logged_in_browser: ApplicationRequest.req_types[:page_view_logged_in_browser],
)
report.data = [
{
req: "page_view_logged_in_browser",
label:
I18n.t(
"reports.consolidated_page_views_browser_detection.xaxis.page_view_logged_in_browser",
),
color: report.colors[0],
data: data.map { |row| { x: row.date, y: row.page_view_logged_in_browser } },
},
{
req: "page_view_anon_browser",
label:
I18n.t(
"reports.consolidated_page_views_browser_detection.xaxis.page_view_anon_browser",
),
color: report.colors[1],
data: data.map { |row| { x: row.date, y: row.page_view_anon_browser } },
},
{
req: "page_view_crawler",
label:
I18n.t("reports.consolidated_page_views_browser_detection.xaxis.page_view_crawler"),
color: report.colors[2],
data: data.map { |row| { x: row.date, y: row.page_view_crawler } },
},
{
req: "page_view_other",
label: I18n.t("reports.consolidated_page_views_browser_detection.xaxis.page_view_other"),
color: report.colors[2],
data: data.map { |row| { x: row.date, y: row.page_view_other } },
},
]
end
end
end

View File

@ -49,6 +49,7 @@ class Report
include Reports::UserFlaggingRatio
include Reports::TrustLevelGrowth
include Reports::ConsolidatedPageViews
include Reports::ConsolidatedPageViewsBrowserDetection
include Reports::ConsolidatedApiRequests
include Reports::Visits
include Reports::TimeToFirstResponse

View File

@ -1,5 +1,13 @@
<% content_for :title do %><%= @page_title %> - <%= SiteSetting.title %><% end %>
<% content_for :head do %>
<meta id="discourse-error" data-discourse-error="true">
<% end %>
<% content_for :no_ember_head do %>
<meta id="discourse-error" data-discourse-error="true">
<% end %>
<div class="page-not-found">
<h1 class="title"><%= @title %></h1>

View File

@ -17,6 +17,8 @@
<%- if allow_plugins? %>
<%= build_plugin_html 'server:before-head-close' %>
<%- end -%>
<%= preload_script "pageview" %>
</head>
<body class="no-ember <%= @custom_body_class %>">
<%- if allow_plugins? %>

View File

@ -8,6 +8,7 @@
<%= render_google_tag_manager_head_code %>
<%= render_google_universal_analytics_code %>
<%= preload_script 'publish' %>
<%= preload_script 'pageview' %>
</head>
<body class="<%= @body_classes.to_a.join(' ') %>">
<%= theme_lookup("header") %>

View File

@ -31,7 +31,7 @@ def setup_message_bus_env(env)
"Access-Control-Allow-Origin" => Discourse.base_url_no_prefix,
"Access-Control-Allow-Methods" => "GET, POST",
"Access-Control-Allow-Headers" =>
"X-SILENCE-LOGGER, X-Shared-Session-Key, Dont-Chunk, Discourse-Present",
"X-SILENCE-LOGGER, X-Shared-Session-Key, Dont-Chunk, Discourse-Present, Discourse-Deferred-Track-View",
"Access-Control-Max-Age" => "7200",
}

View File

@ -1338,6 +1338,15 @@ en:
user_api: "User API"
yaxis: "Day"
description: "API requests for regular API keys and user API keys."
consolidated_page_views_browser_detection:
title: "Consolidated Pageviews with Browser Detection (Experimental)"
xaxis:
page_view_anon_browser: "Anonymous Browser"
page_view_logged_in_browser: "Logged In Browser"
page_view_crawler: "Known Crawler"
page_view_other: "Other pageviews"
yaxis: "Day"
description: "Pageviews for logged in users, anonymous users, known crawlers and other. This experimental report ensures logged-in/anon requests are coming from real browsers before counting them."
dau_by_mau:
title: "DAU/MAU"
xaxis: "Day"
@ -1497,6 +1506,16 @@ en:
xaxis: "Day"
yaxis: "Mobile Anon Pageviews"
description: "Number of new pageviews from visitors on a mobile device who are not logged in."
page_view_anon_browser_reqs:
title: "Anonymous Browser Pageviews"
xaxis: "Day"
yaxis: "Anonymous Browser Pageviews"
description: "Number of pageviews by anonymous visitors using real browsers."
page_view_logged_in_browser_reqs:
title: "Logged In Browser Pageviews"
xaxis: "Day"
yaxis: "Logged In Browser Pageviews"
description: "Number of pageviews by logged-in visitors using real browsers."
http_background_reqs:
title: "Background"
xaxis: "Day"

View File

@ -1629,6 +1629,8 @@ Discourse::Application.routes.draw do
resources :sidebar_sections, only: %i[index create update destroy]
put "/sidebar_sections/reset/:id" => "sidebar_sections#reset"
post "/pageview" => "pageview#index"
get "*url", to: "permalinks#show", constraints: PermalinkConstraint.new
get "/form-templates/:id" => "form_templates#show"

View File

@ -74,9 +74,31 @@ class Middleware::RequestTracker
elsif data[:has_auth_cookie]
ApplicationRequest.increment!(:page_view_logged_in)
ApplicationRequest.increment!(:page_view_logged_in_mobile) if data[:is_mobile]
if data[:explicit_track_view]
# Must be a browser if it had this header from our ajax implementation
ApplicationRequest.increment!(:page_view_logged_in_browser)
ApplicationRequest.increment!(:page_view_logged_in_browser_mobile) if data[:is_mobile]
end
elsif !SiteSetting.login_required
ApplicationRequest.increment!(:page_view_anon)
ApplicationRequest.increment!(:page_view_anon_mobile) if data[:is_mobile]
if data[:explicit_track_view]
# Must be a browser if it had this header from our ajax implementation
ApplicationRequest.increment!(:page_view_anon_browser)
ApplicationRequest.increment!(:page_view_anon_browser_mobile) if data[:is_mobile]
end
end
end
# Message-bus requests may include this 'deferred track' header which we use to detect
# 'real browser' views.
if data[:deferred_track] && !data[:is_crawler]
if data[:has_auth_cookie]
ApplicationRequest.increment!(:page_view_logged_in_browser)
ApplicationRequest.increment!(:page_view_logged_in_browser_mobile) if data[:is_mobile]
elsif !SiteSetting.login_required
ApplicationRequest.increment!(:page_view_anon_browser)
ApplicationRequest.increment!(:page_view_anon_browser_mobile) if data[:is_mobile]
end
end
@ -103,12 +125,20 @@ class Middleware::RequestTracker
request ||= Rack::Request.new(env)
helper = Middleware::AnonymousCache::Helper.new(env, request)
# Value of the discourse-track-view request header
env_track_view = env["HTTP_DISCOURSE_TRACK_VIEW"]
track_view = status == 200
track_view &&= env_track_view != "0" && env_track_view != "false"
track_view &&=
env_track_view || (request.get? && !request.xhr? && headers["Content-Type"] =~ %r{text/html})
track_view = !!track_view
# Was the discourse-track-view request header set to true? Likely
# set by our ajax library to indicate a page view.
explicit_track_view = status == 200 && %w[1 true].include?(env_track_view)
# An HTML response to a GET request is tracked implicitly
implicit_track_view =
status == 200 && !%w[0 false].include?(env_track_view) && request.get? && !request.xhr? &&
headers["Content-Type"] =~ %r{text/html}
track_view = !!(explicit_track_view || implicit_track_view)
has_auth_cookie = Auth::DefaultCurrentUserProvider.find_v0_auth_cookie(request).present?
has_auth_cookie ||= Auth::DefaultCurrentUserProvider.find_v1_auth_cookie(env).present?
@ -118,6 +148,10 @@ class Middleware::RequestTracker
is_message_bus = request.path.start_with?("#{Discourse.base_path}/message-bus/")
is_topic_timings = request.path.start_with?("#{Discourse.base_path}/topics/timings")
# This header is sent on a follow-up request after a real browser loads up a page
# see `scripts/pageview.js` and `instance-initializers/page-tracking.js`
has_deferred_track_header = %w[1 true].include?(env["HTTP_DISCOURSE_DEFERRED_TRACK_VIEW"])
h = {
status: status,
is_crawler: helper.is_crawler?,
@ -129,6 +163,8 @@ class Middleware::RequestTracker
track_view: track_view,
timing: timing,
queue_seconds: env["REQUEST_QUEUE_SECONDS"],
explicit_track_view: explicit_track_view,
deferred_track: has_deferred_track_header,
}
if h[:is_background]
@ -166,7 +202,8 @@ class Middleware::RequestTracker
data =
begin
self.class.get_data(env, result, info, request)
rescue StandardError
rescue StandardError => e
Rails.logger.warn("RequestTracker.get_data failed: #{e}")
nil
end

View File

@ -66,6 +66,7 @@ RSpec.describe Middleware::RequestTracker do
CachedCounting.flush
expect(ApplicationRequest.page_view_anon.first.count).to eq(2)
expect(ApplicationRequest.page_view_anon_browser.first.count).to eq(2)
end
it "can log requests correctly" do
@ -119,6 +120,23 @@ RSpec.describe Middleware::RequestTracker do
expect(ApplicationRequest.page_view_anon_mobile.first.count).to eq(1)
expect(ApplicationRequest.page_view_crawler.first.count).to eq(1)
expect(ApplicationRequest.page_view_anon_browser.first.count).to eq(1)
end
it "logs deferred pageviews correctly" do
data =
Middleware::RequestTracker.get_data(
env(:path => "/message-bus/abcde/poll", "HTTP_DISCOURSE_DEFERRED_TRACK_VIEW" => "1"),
["200", { "Content-Type" => "text/html" }],
0.1,
)
Middleware::RequestTracker.log_request(data)
expect(data[:deferred_track]).to eq(true)
CachedCounting.flush
expect(ApplicationRequest.page_view_anon_browser.first.count).to eq(1)
end
it "logs API requests correctly" do

View File

@ -0,0 +1,188 @@
# frozen_string_literal: true
describe "request tracking", type: :system do
before do
ApplicationRequest.enable
CachedCounting.reset
CachedCounting.enable
end
after do
CachedCounting.reset
ApplicationRequest.disable
CachedCounting.disable
end
it "tracks an anonymous visit correctly" do
visit "/"
try_until_success do
CachedCounting.flush
expect(ApplicationRequest.stats).to include(
"page_view_anon_total" => 1,
"page_view_anon_browser_total" => 1,
"page_view_logged_in_total" => 0,
"page_view_crawler_total" => 0,
)
end
find(".nav-item_categories a").click
try_until_success do
CachedCounting.flush
expect(ApplicationRequest.stats).to include(
"page_view_anon_total" => 2,
"page_view_anon_browser_total" => 2,
"page_view_logged_in_total" => 0,
"page_view_crawler_total" => 0,
)
end
end
it "tracks a crawler visit correctly" do
# Can't change Selenium's user agent... so change site settings to make Discourse detect chrome as a crawler
SiteSetting.crawler_user_agents += "|chrome"
visit "/"
try_until_success do
CachedCounting.flush
expect(ApplicationRequest.stats).to include(
"page_view_anon_total" => 0,
"page_view_anon_browser_total" => 0,
"page_view_logged_in_total" => 0,
"page_view_crawler_total" => 1,
)
end
end
it "tracks a logged-in session correctly" do
sign_in Fabricate(:user)
visit "/"
try_until_success do
CachedCounting.flush
expect(ApplicationRequest.stats).to include(
"page_view_anon_total" => 0,
"page_view_anon_browser_total" => 0,
"page_view_logged_in_total" => 1,
"page_view_crawler_total" => 0,
"page_view_logged_in_browser_total" => 1,
)
end
find(".nav-item_categories a").click
try_until_success do
CachedCounting.flush
expect(ApplicationRequest.stats).to include(
"page_view_anon_total" => 0,
"page_view_anon_browser_total" => 0,
"page_view_logged_in_total" => 2,
"page_view_crawler_total" => 0,
"page_view_logged_in_browser_total" => 2,
)
end
end
it "tracks normal error pages correctly" do
SiteSetting.bootstrap_error_pages = false
visit "/foobar"
try_until_success do
CachedCounting.flush
# Does not count error as a pageview
expect(ApplicationRequest.stats).to include(
"http_4xx_total" => 1,
"page_view_anon_total" => 0,
"page_view_anon_browser_total" => 0,
"page_view_logged_in_total" => 0,
"page_view_crawler_total" => 0,
)
end
find("#site-logo").click
try_until_success do
CachedCounting.flush
expect(ApplicationRequest.stats).to include(
"http_4xx_total" => 1,
"page_view_anon_total" => 1,
"page_view_anon_browser_total" => 1,
"page_view_logged_in_total" => 0,
"page_view_crawler_total" => 0,
)
end
end
it "tracks non-ember pages correctly" do
visit "/safe-mode"
try_until_success do
CachedCounting.flush
# Does not count error as a pageview
expect(ApplicationRequest.stats).to include(
"page_view_anon_total" => 1,
"page_view_anon_browser_total" => 1,
"page_view_logged_in_total" => 0,
"page_view_crawler_total" => 0,
)
end
end
it "tracks bootstrapped error pages correctly" do
SiteSetting.bootstrap_error_pages = true
visit "/foobar"
try_until_success do
CachedCounting.flush
# Does not count error as a pageview
expect(ApplicationRequest.stats).to include(
"http_4xx_total" => 1,
"page_view_anon_total" => 0,
"page_view_anon_browser_total" => 0,
"page_view_logged_in_total" => 0,
"page_view_crawler_total" => 0,
)
end
find("#site-logo").click
try_until_success do
CachedCounting.flush
expect(ApplicationRequest.stats).to include(
"http_4xx_total" => 1,
"page_view_anon_total" => 1,
"page_view_anon_browser_total" => 1,
"page_view_logged_in_total" => 0,
"page_view_crawler_total" => 0,
)
end
end
it "tracks published pages correctly" do
SiteSetting.enable_page_publishing = true
page =
Fabricate(:published_page, public: true, slug: "some-page", topic: Fabricate(:post).topic)
visit "/pub/some-page"
try_until_success do
CachedCounting.flush
# Does not count error as a pageview
expect(ApplicationRequest.stats).to include(
"page_view_anon_total" => 1,
"page_view_anon_browser_total" => 1,
"page_view_logged_in_total" => 0,
"page_view_crawler_total" => 0,
)
end
end
end