From 2f2da7274732cba30d03b6c5c3a4194652cb6783 Mon Sep 17 00:00:00 2001 From: David Taylor Date: Thu, 25 Apr 2024 11:00:01 +0100 Subject: [PATCH] FEATURE: Add experimental tracking of 'real browser' pageviews (#26647) Our 'page_view_crawler' / 'page_view_anon' metrics are based purely on the User Agent sent by clients. This means that 'badly behaved' bots which are imitating real user agents are counted towards 'anon' page views. This commit introduces a new method of tracking visitors. When an initial HTML request is made, we assume it is a 'non-browser' request (i.e. a bot). Then, once the JS application has booted, we notify the server to count it as a 'browser' request. This reliance on a JavaScript-capable browser matches up more closely to dedicated analytics systems like Google Analytics. Existing data collection and graphs are unchanged. Data collected via the new technique is available in a new 'experimental' report. --- .../app/instance-initializers/message-bus.js | 11 + .../instance-initializers/page-tracking.js | 9 + .../javascripts/discourse/scripts/pageview.js | 17 ++ app/controllers/pageview_controller.rb | 13 ++ app/models/application_request.rb | 4 + ...nsolidated_page_views_browser_detection.rb | 75 +++++++ app/models/report.rb | 1 + app/views/exceptions/not_found.html.erb | 8 + app/views/layouts/no_ember.html.erb | 2 + app/views/layouts/publish.html.erb | 1 + config/initializers/004-message_bus.rb | 2 +- config/locales/server.en.yml | 19 ++ config/routes.rb | 2 + lib/middleware/request_tracker.rb | 49 ++++- spec/lib/middleware/request_tracker_spec.rb | 18 ++ spec/system/request_tracker_spec.rb | 188 ++++++++++++++++++ 16 files changed, 412 insertions(+), 7 deletions(-) create mode 100644 app/assets/javascripts/discourse/scripts/pageview.js create mode 100644 app/controllers/pageview_controller.rb create mode 100644 app/models/concerns/reports/consolidated_page_views_browser_detection.rb create mode 100644 spec/system/request_tracker_spec.rb diff --git a/app/assets/javascripts/discourse/app/instance-initializers/message-bus.js b/app/assets/javascripts/discourse/app/instance-initializers/message-bus.js index be651bd64cc..819db4c3387 100644 --- a/app/assets/javascripts/discourse/app/instance-initializers/message-bus.js +++ b/app/assets/javascripts/discourse/app/instance-initializers/message-bus.js @@ -7,6 +7,12 @@ import getURL from "discourse-common/lib/get-url"; const LONG_POLL_AFTER_UNSEEN_TIME = 1200000; // 20 minutes +let _sendDeferredPageview = false; + +export function sendDeferredPageview() { + _sendDeferredPageview = true; +} + function mbAjax(messageBus, opts) { opts.headers ||= {}; @@ -22,6 +28,11 @@ function mbAjax(messageBus, opts) { opts.headers["Discourse-Present"] = "true"; } + if (_sendDeferredPageview) { + opts.headers["Discourse-Deferred-Track-View"] = "true"; + _sendDeferredPageview = false; + } + const oldComplete = opts.complete; opts.complete = function (xhr, stat) { handleLogoff(xhr); diff --git a/app/assets/javascripts/discourse/app/instance-initializers/page-tracking.js b/app/assets/javascripts/discourse/app/instance-initializers/page-tracking.js index 704493c7388..e3f226cb72e 100644 --- a/app/assets/javascripts/discourse/app/instance-initializers/page-tracking.js +++ b/app/assets/javascripts/discourse/app/instance-initializers/page-tracking.js @@ -4,11 +4,20 @@ import { resetPageTracking, startPageTracking, } from "discourse/lib/page-tracker"; +import { sendDeferredPageview } from "./message-bus"; export default { after: "inject-objects", + before: "message-bus", initialize(owner) { + const isErrorPage = + document.querySelector("meta#discourse-error")?.dataset.discourseError === + "true"; + if (!isErrorPage) { + sendDeferredPageview(); + } + // Tell our AJAX system to track a page transition // eslint-disable-next-line ember/no-private-routing-service const router = owner.lookup("router:main"); diff --git a/app/assets/javascripts/discourse/scripts/pageview.js b/app/assets/javascripts/discourse/scripts/pageview.js new file mode 100644 index 00000000000..6bb83f55443 --- /dev/null +++ b/app/assets/javascripts/discourse/scripts/pageview.js @@ -0,0 +1,17 @@ +document.addEventListener("DOMContentLoaded", function () { + const isErrorPage = + document.querySelector("meta#discourse-error")?.dataset.discourseError === + "true"; + + if (!isErrorPage) { + const root = + document.querySelector("meta[name=discourse-base-uri]")?.content || ""; + + fetch(`${root}/pageview`, { + method: "POST", + headers: { + "Discourse-Deferred-Track-View": "true", + }, + }); + } +}); diff --git a/app/controllers/pageview_controller.rb b/app/controllers/pageview_controller.rb new file mode 100644 index 00000000000..2b669eaf515 --- /dev/null +++ b/app/controllers/pageview_controller.rb @@ -0,0 +1,13 @@ +# frozen_string_literal: true + +class PageviewController < ApplicationController + skip_before_action :check_xhr, + :redirect_to_login_if_required, + :preload_json, + :verify_authenticity_token + + def index + # pageview tracking is handled by middleware + render plain: "ok" + end +end diff --git a/app/models/application_request.rb b/app/models/application_request.rb index a91c67571ed..8b7f966a56f 100644 --- a/app/models/application_request.rb +++ b/app/models/application_request.rb @@ -14,6 +14,10 @@ class ApplicationRequest < ActiveRecord::Base page_view_anon_mobile api user_api + page_view_anon_browser + page_view_anon_browser_mobile + page_view_logged_in_browser + page_view_logged_in_browser_mobile ] include CachedCounting diff --git a/app/models/concerns/reports/consolidated_page_views_browser_detection.rb b/app/models/concerns/reports/consolidated_page_views_browser_detection.rb new file mode 100644 index 00000000000..3b252722b1b --- /dev/null +++ b/app/models/concerns/reports/consolidated_page_views_browser_detection.rb @@ -0,0 +1,75 @@ +# frozen_string_literal: true + +module Reports::ConsolidatedPageViewsBrowserDetection + extend ActiveSupport::Concern + + class_methods do + def report_consolidated_page_views_browser_detection(report) + report.modes = [:stacked_chart] + + data = + DB.query( + <<~SQL, + SELECT + date, + SUM(CASE WHEN req_type = :page_view_logged_in_browser THEN count ELSE 0 END) AS page_view_logged_in_browser, + SUM(CASE WHEN req_type = :page_view_anon_browser THEN count ELSE 0 END) AS page_view_anon_browser, + SUM(CASE WHEN req_type = :page_view_crawler THEN count ELSE 0 END) AS page_view_crawler, + SUM( + CASE WHEN req_type = :page_view_anon THEN count + WHEN req_type = :page_view_logged_in THEN count + WHEN req_type = :page_view_anon_browser THEN -count + WHEN req_type = :page_view_logged_in_browser THEN -count + ELSE 0 + END + ) AS page_view_other + FROM application_requests + WHERE date >= :start_date AND date <= :end_date + GROUP BY date + ORDER BY date ASC + SQL + start_date: report.start_date, + end_date: report.end_date, + page_view_anon: ApplicationRequest.req_types[:page_view_anon], + page_view_crawler: ApplicationRequest.req_types[:page_view_crawler], + page_view_logged_in: ApplicationRequest.req_types[:page_view_logged_in], + page_view_anon_browser: ApplicationRequest.req_types[:page_view_anon_browser], + page_view_logged_in_browser: ApplicationRequest.req_types[:page_view_logged_in_browser], + ) + + report.data = [ + { + req: "page_view_logged_in_browser", + label: + I18n.t( + "reports.consolidated_page_views_browser_detection.xaxis.page_view_logged_in_browser", + ), + color: report.colors[0], + data: data.map { |row| { x: row.date, y: row.page_view_logged_in_browser } }, + }, + { + req: "page_view_anon_browser", + label: + I18n.t( + "reports.consolidated_page_views_browser_detection.xaxis.page_view_anon_browser", + ), + color: report.colors[1], + data: data.map { |row| { x: row.date, y: row.page_view_anon_browser } }, + }, + { + req: "page_view_crawler", + label: + I18n.t("reports.consolidated_page_views_browser_detection.xaxis.page_view_crawler"), + color: report.colors[2], + data: data.map { |row| { x: row.date, y: row.page_view_crawler } }, + }, + { + req: "page_view_other", + label: I18n.t("reports.consolidated_page_views_browser_detection.xaxis.page_view_other"), + color: report.colors[2], + data: data.map { |row| { x: row.date, y: row.page_view_other } }, + }, + ] + end + end +end diff --git a/app/models/report.rb b/app/models/report.rb index 0daa3e86f87..dbf576a5ce6 100644 --- a/app/models/report.rb +++ b/app/models/report.rb @@ -49,6 +49,7 @@ class Report include Reports::UserFlaggingRatio include Reports::TrustLevelGrowth include Reports::ConsolidatedPageViews + include Reports::ConsolidatedPageViewsBrowserDetection include Reports::ConsolidatedApiRequests include Reports::Visits include Reports::TimeToFirstResponse diff --git a/app/views/exceptions/not_found.html.erb b/app/views/exceptions/not_found.html.erb index 373a6c18059..ff8918d9f85 100644 --- a/app/views/exceptions/not_found.html.erb +++ b/app/views/exceptions/not_found.html.erb @@ -1,5 +1,13 @@ <% content_for :title do %><%= @page_title %> - <%= SiteSetting.title %><% end %> +<% content_for :head do %> + +<% end %> + +<% content_for :no_ember_head do %> + +<% end %> +

<%= @title %>

diff --git a/app/views/layouts/no_ember.html.erb b/app/views/layouts/no_ember.html.erb index 51d1f319e94..6a7c6a3986f 100644 --- a/app/views/layouts/no_ember.html.erb +++ b/app/views/layouts/no_ember.html.erb @@ -17,6 +17,8 @@ <%- if allow_plugins? %> <%= build_plugin_html 'server:before-head-close' %> <%- end -%> + + <%= preload_script "pageview" %> <%- if allow_plugins? %> diff --git a/app/views/layouts/publish.html.erb b/app/views/layouts/publish.html.erb index 04d91c9653f..c934229dc1e 100644 --- a/app/views/layouts/publish.html.erb +++ b/app/views/layouts/publish.html.erb @@ -8,6 +8,7 @@ <%= render_google_tag_manager_head_code %> <%= render_google_universal_analytics_code %> <%= preload_script 'publish' %> + <%= preload_script 'pageview' %> <%= theme_lookup("header") %> diff --git a/config/initializers/004-message_bus.rb b/config/initializers/004-message_bus.rb index 383d8a01134..d9a1959a3be 100644 --- a/config/initializers/004-message_bus.rb +++ b/config/initializers/004-message_bus.rb @@ -31,7 +31,7 @@ def setup_message_bus_env(env) "Access-Control-Allow-Origin" => Discourse.base_url_no_prefix, "Access-Control-Allow-Methods" => "GET, POST", "Access-Control-Allow-Headers" => - "X-SILENCE-LOGGER, X-Shared-Session-Key, Dont-Chunk, Discourse-Present", + "X-SILENCE-LOGGER, X-Shared-Session-Key, Dont-Chunk, Discourse-Present, Discourse-Deferred-Track-View", "Access-Control-Max-Age" => "7200", } diff --git a/config/locales/server.en.yml b/config/locales/server.en.yml index e30316ca322..2fa9a55ad68 100644 --- a/config/locales/server.en.yml +++ b/config/locales/server.en.yml @@ -1338,6 +1338,15 @@ en: user_api: "User API" yaxis: "Day" description: "API requests for regular API keys and user API keys." + consolidated_page_views_browser_detection: + title: "Consolidated Pageviews with Browser Detection (Experimental)" + xaxis: + page_view_anon_browser: "Anonymous Browser" + page_view_logged_in_browser: "Logged In Browser" + page_view_crawler: "Known Crawler" + page_view_other: "Other pageviews" + yaxis: "Day" + description: "Pageviews for logged in users, anonymous users, known crawlers and other. This experimental report ensures logged-in/anon requests are coming from real browsers before counting them." dau_by_mau: title: "DAU/MAU" xaxis: "Day" @@ -1497,6 +1506,16 @@ en: xaxis: "Day" yaxis: "Mobile Anon Pageviews" description: "Number of new pageviews from visitors on a mobile device who are not logged in." + page_view_anon_browser_reqs: + title: "Anonymous Browser Pageviews" + xaxis: "Day" + yaxis: "Anonymous Browser Pageviews" + description: "Number of pageviews by anonymous visitors using real browsers." + page_view_logged_in_browser_reqs: + title: "Logged In Browser Pageviews" + xaxis: "Day" + yaxis: "Logged In Browser Pageviews" + description: "Number of pageviews by logged-in visitors using real browsers." http_background_reqs: title: "Background" xaxis: "Day" diff --git a/config/routes.rb b/config/routes.rb index d784afeb440..34dc52d7445 100644 --- a/config/routes.rb +++ b/config/routes.rb @@ -1629,6 +1629,8 @@ Discourse::Application.routes.draw do resources :sidebar_sections, only: %i[index create update destroy] put "/sidebar_sections/reset/:id" => "sidebar_sections#reset" + post "/pageview" => "pageview#index" + get "*url", to: "permalinks#show", constraints: PermalinkConstraint.new get "/form-templates/:id" => "form_templates#show" diff --git a/lib/middleware/request_tracker.rb b/lib/middleware/request_tracker.rb index 042b946ba22..1147a25a617 100644 --- a/lib/middleware/request_tracker.rb +++ b/lib/middleware/request_tracker.rb @@ -74,9 +74,31 @@ class Middleware::RequestTracker elsif data[:has_auth_cookie] ApplicationRequest.increment!(:page_view_logged_in) ApplicationRequest.increment!(:page_view_logged_in_mobile) if data[:is_mobile] + if data[:explicit_track_view] + # Must be a browser if it had this header from our ajax implementation + ApplicationRequest.increment!(:page_view_logged_in_browser) + ApplicationRequest.increment!(:page_view_logged_in_browser_mobile) if data[:is_mobile] + end elsif !SiteSetting.login_required ApplicationRequest.increment!(:page_view_anon) ApplicationRequest.increment!(:page_view_anon_mobile) if data[:is_mobile] + if data[:explicit_track_view] + # Must be a browser if it had this header from our ajax implementation + ApplicationRequest.increment!(:page_view_anon_browser) + ApplicationRequest.increment!(:page_view_anon_browser_mobile) if data[:is_mobile] + end + end + end + + # Message-bus requests may include this 'deferred track' header which we use to detect + # 'real browser' views. + if data[:deferred_track] && !data[:is_crawler] + if data[:has_auth_cookie] + ApplicationRequest.increment!(:page_view_logged_in_browser) + ApplicationRequest.increment!(:page_view_logged_in_browser_mobile) if data[:is_mobile] + elsif !SiteSetting.login_required + ApplicationRequest.increment!(:page_view_anon_browser) + ApplicationRequest.increment!(:page_view_anon_browser_mobile) if data[:is_mobile] end end @@ -103,12 +125,20 @@ class Middleware::RequestTracker request ||= Rack::Request.new(env) helper = Middleware::AnonymousCache::Helper.new(env, request) + # Value of the discourse-track-view request header env_track_view = env["HTTP_DISCOURSE_TRACK_VIEW"] - track_view = status == 200 - track_view &&= env_track_view != "0" && env_track_view != "false" - track_view &&= - env_track_view || (request.get? && !request.xhr? && headers["Content-Type"] =~ %r{text/html}) - track_view = !!track_view + + # Was the discourse-track-view request header set to true? Likely + # set by our ajax library to indicate a page view. + explicit_track_view = status == 200 && %w[1 true].include?(env_track_view) + + # An HTML response to a GET request is tracked implicitly + implicit_track_view = + status == 200 && !%w[0 false].include?(env_track_view) && request.get? && !request.xhr? && + headers["Content-Type"] =~ %r{text/html} + + track_view = !!(explicit_track_view || implicit_track_view) + has_auth_cookie = Auth::DefaultCurrentUserProvider.find_v0_auth_cookie(request).present? has_auth_cookie ||= Auth::DefaultCurrentUserProvider.find_v1_auth_cookie(env).present? @@ -118,6 +148,10 @@ class Middleware::RequestTracker is_message_bus = request.path.start_with?("#{Discourse.base_path}/message-bus/") is_topic_timings = request.path.start_with?("#{Discourse.base_path}/topics/timings") + # This header is sent on a follow-up request after a real browser loads up a page + # see `scripts/pageview.js` and `instance-initializers/page-tracking.js` + has_deferred_track_header = %w[1 true].include?(env["HTTP_DISCOURSE_DEFERRED_TRACK_VIEW"]) + h = { status: status, is_crawler: helper.is_crawler?, @@ -129,6 +163,8 @@ class Middleware::RequestTracker track_view: track_view, timing: timing, queue_seconds: env["REQUEST_QUEUE_SECONDS"], + explicit_track_view: explicit_track_view, + deferred_track: has_deferred_track_header, } if h[:is_background] @@ -166,7 +202,8 @@ class Middleware::RequestTracker data = begin self.class.get_data(env, result, info, request) - rescue StandardError + rescue StandardError => e + Rails.logger.warn("RequestTracker.get_data failed: #{e}") nil end diff --git a/spec/lib/middleware/request_tracker_spec.rb b/spec/lib/middleware/request_tracker_spec.rb index 7757e093d7f..059f25f63d2 100644 --- a/spec/lib/middleware/request_tracker_spec.rb +++ b/spec/lib/middleware/request_tracker_spec.rb @@ -66,6 +66,7 @@ RSpec.describe Middleware::RequestTracker do CachedCounting.flush expect(ApplicationRequest.page_view_anon.first.count).to eq(2) + expect(ApplicationRequest.page_view_anon_browser.first.count).to eq(2) end it "can log requests correctly" do @@ -119,6 +120,23 @@ RSpec.describe Middleware::RequestTracker do expect(ApplicationRequest.page_view_anon_mobile.first.count).to eq(1) expect(ApplicationRequest.page_view_crawler.first.count).to eq(1) + + expect(ApplicationRequest.page_view_anon_browser.first.count).to eq(1) + end + + it "logs deferred pageviews correctly" do + data = + Middleware::RequestTracker.get_data( + env(:path => "/message-bus/abcde/poll", "HTTP_DISCOURSE_DEFERRED_TRACK_VIEW" => "1"), + ["200", { "Content-Type" => "text/html" }], + 0.1, + ) + Middleware::RequestTracker.log_request(data) + + expect(data[:deferred_track]).to eq(true) + CachedCounting.flush + + expect(ApplicationRequest.page_view_anon_browser.first.count).to eq(1) end it "logs API requests correctly" do diff --git a/spec/system/request_tracker_spec.rb b/spec/system/request_tracker_spec.rb new file mode 100644 index 00000000000..43df17b7296 --- /dev/null +++ b/spec/system/request_tracker_spec.rb @@ -0,0 +1,188 @@ +# frozen_string_literal: true + +describe "request tracking", type: :system do + before do + ApplicationRequest.enable + CachedCounting.reset + CachedCounting.enable + end + + after do + CachedCounting.reset + ApplicationRequest.disable + CachedCounting.disable + end + + it "tracks an anonymous visit correctly" do + visit "/" + + try_until_success do + CachedCounting.flush + expect(ApplicationRequest.stats).to include( + "page_view_anon_total" => 1, + "page_view_anon_browser_total" => 1, + "page_view_logged_in_total" => 0, + "page_view_crawler_total" => 0, + ) + end + + find(".nav-item_categories a").click + + try_until_success do + CachedCounting.flush + expect(ApplicationRequest.stats).to include( + "page_view_anon_total" => 2, + "page_view_anon_browser_total" => 2, + "page_view_logged_in_total" => 0, + "page_view_crawler_total" => 0, + ) + end + end + + it "tracks a crawler visit correctly" do + # Can't change Selenium's user agent... so change site settings to make Discourse detect chrome as a crawler + SiteSetting.crawler_user_agents += "|chrome" + + visit "/" + + try_until_success do + CachedCounting.flush + expect(ApplicationRequest.stats).to include( + "page_view_anon_total" => 0, + "page_view_anon_browser_total" => 0, + "page_view_logged_in_total" => 0, + "page_view_crawler_total" => 1, + ) + end + end + + it "tracks a logged-in session correctly" do + sign_in Fabricate(:user) + + visit "/" + + try_until_success do + CachedCounting.flush + expect(ApplicationRequest.stats).to include( + "page_view_anon_total" => 0, + "page_view_anon_browser_total" => 0, + "page_view_logged_in_total" => 1, + "page_view_crawler_total" => 0, + "page_view_logged_in_browser_total" => 1, + ) + end + + find(".nav-item_categories a").click + + try_until_success do + CachedCounting.flush + expect(ApplicationRequest.stats).to include( + "page_view_anon_total" => 0, + "page_view_anon_browser_total" => 0, + "page_view_logged_in_total" => 2, + "page_view_crawler_total" => 0, + "page_view_logged_in_browser_total" => 2, + ) + end + end + + it "tracks normal error pages correctly" do + SiteSetting.bootstrap_error_pages = false + + visit "/foobar" + + try_until_success do + CachedCounting.flush + + # Does not count error as a pageview + expect(ApplicationRequest.stats).to include( + "http_4xx_total" => 1, + "page_view_anon_total" => 0, + "page_view_anon_browser_total" => 0, + "page_view_logged_in_total" => 0, + "page_view_crawler_total" => 0, + ) + end + + find("#site-logo").click + + try_until_success do + CachedCounting.flush + expect(ApplicationRequest.stats).to include( + "http_4xx_total" => 1, + "page_view_anon_total" => 1, + "page_view_anon_browser_total" => 1, + "page_view_logged_in_total" => 0, + "page_view_crawler_total" => 0, + ) + end + end + + it "tracks non-ember pages correctly" do + visit "/safe-mode" + + try_until_success do + CachedCounting.flush + + # Does not count error as a pageview + expect(ApplicationRequest.stats).to include( + "page_view_anon_total" => 1, + "page_view_anon_browser_total" => 1, + "page_view_logged_in_total" => 0, + "page_view_crawler_total" => 0, + ) + end + end + + it "tracks bootstrapped error pages correctly" do + SiteSetting.bootstrap_error_pages = true + + visit "/foobar" + + try_until_success do + CachedCounting.flush + + # Does not count error as a pageview + expect(ApplicationRequest.stats).to include( + "http_4xx_total" => 1, + "page_view_anon_total" => 0, + "page_view_anon_browser_total" => 0, + "page_view_logged_in_total" => 0, + "page_view_crawler_total" => 0, + ) + end + + find("#site-logo").click + + try_until_success do + CachedCounting.flush + expect(ApplicationRequest.stats).to include( + "http_4xx_total" => 1, + "page_view_anon_total" => 1, + "page_view_anon_browser_total" => 1, + "page_view_logged_in_total" => 0, + "page_view_crawler_total" => 0, + ) + end + end + + it "tracks published pages correctly" do + SiteSetting.enable_page_publishing = true + page = + Fabricate(:published_page, public: true, slug: "some-page", topic: Fabricate(:post).topic) + + visit "/pub/some-page" + + try_until_success do + CachedCounting.flush + + # Does not count error as a pageview + expect(ApplicationRequest.stats).to include( + "page_view_anon_total" => 1, + "page_view_anon_browser_total" => 1, + "page_view_logged_in_total" => 0, + "page_view_crawler_total" => 0, + ) + end + end +end