From aa1138ff718925a064c987bc8b3dde1bfcbba210 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9gis=20Hanol?= Date: Mon, 25 Jan 2021 11:23:36 +0100 Subject: [PATCH] FIX: reindex_search job should work on model with no search data (#11819) Lots of changes but it's mostly a refactoring. The interesting part that was fix are the 'load_problem__ids' methods. They will now return records with no search data associated so they can be properly indexed for the search. This "bad" state usually happens after a migration. --- app/controllers/users_controller.rb | 27 +- app/jobs/scheduled/reindex_search.rb | 261 ++++++++---------- app/models/user_search.rb | 153 +++++----- spec/components/search_spec.rb | 4 +- spec/jobs/reindex_search_spec.rb | 25 +- .../similar_topics_controller_spec.rb | 2 +- 6 files changed, 204 insertions(+), 268 deletions(-) diff --git a/app/controllers/users_controller.rb b/app/controllers/users_controller.rb index 2c4163e986a..651fda83400 100644 --- a/app/controllers/users_controller.rb +++ b/app/controllers/users_controller.rb @@ -1041,18 +1041,14 @@ class UsersController < ApplicationController def search_users term = params[:term].to_s.strip - topic_id = params[:topic_id] - topic_id = topic_id.to_i if topic_id - - category_id = params[:category_id].to_i if category_id.present? + topic_id = params[:topic_id].to_i if params[:topic_id].present? + category_id = params[:category_id].to_i if params[:category_id].present? topic_allowed_users = params[:topic_allowed_users] || false group_names = params[:groups] || [] group_names << params[:group] if params[:group] - if group_names.present? - @groups = Group.where(name: group_names) - end + @groups = Group.where(name: group_names) if group_names.present? options = { topic_allowed_users: topic_allowed_users, @@ -1060,13 +1056,8 @@ class UsersController < ApplicationController groups: @groups } - if topic_id - options[:topic_id] = topic_id - end - - if category_id - options[:category_id] = category_id - end + options[:topic_id] = topic_id if topic_id + options[:category_id] = category_id if category_id results = UserSearch.new(term, options).search @@ -1075,8 +1066,10 @@ class UsersController < ApplicationController to_render = { users: results.as_json(only: user_fields, methods: [:avatar_template]) } + # blank term is only handy for in-topic search of users after @ + # we do not want group results ever if term is blank groups = - if current_user + if term.present? && current_user if params[:include_groups] == 'true' Group.visible_groups(current_user) elsif params[:include_mentionable_groups] == 'true' @@ -1086,10 +1079,6 @@ class UsersController < ApplicationController end end - # blank term is only handy for in-topic search of users after @ - # we do not want group results ever if term is blank - groups = nil if term.blank? - if groups groups = Group.search_groups(term, groups: groups) groups = groups.order('groups.name asc') diff --git a/app/jobs/scheduled/reindex_search.rb b/app/jobs/scheduled/reindex_search.rb index 3d6579c78f1..26215cdc7ae 100644 --- a/app/jobs/scheduled/reindex_search.rb +++ b/app/jobs/scheduled/reindex_search.rb @@ -1,201 +1,162 @@ # frozen_string_literal: true module Jobs - # if locale changes or search algorithm changes we may want to reindex stuff class ReindexSearch < ::Jobs::Scheduled every 2.hours - CLEANUP_GRACE_PERIOD = 1.day.ago - def execute(args) - @verbose = true if args && Hash === args && args[:verbose] + @verbose = args[:verbose] + @cleanup_grace_period = 1.day.ago - rebuild_problem_topics - rebuild_problem_posts - rebuild_problem_categories - rebuild_problem_users - rebuild_problem_tags - clean_post_search_data - clean_topic_search_data + rebuild_categories + rebuild_tags + rebuild_topics + rebuild_posts + rebuild_users - @verbose = nil + clean_topics + clean_posts end - def rebuild_problem_categories(limit: 500) + def rebuild_categories(limit: 500, indexer: SearchIndexer) category_ids = load_problem_category_ids(limit) - if @verbose - puts "rebuilding #{category_ids.length} categories" - end + puts "rebuilding #{category_ids.size} categories" if @verbose category_ids.each do |id| category = Category.find_by(id: id) - SearchIndexer.index(category, force: true) if category + indexer.index(category, force: true) if category end end - def rebuild_problem_users(limit: 10000) - user_ids = load_problem_user_ids(limit) - - if @verbose - puts "rebuilding #{user_ids.length} users" - end - - user_ids.each do |id| - user = User.find_by(id: id) - SearchIndexer.index(user, force: true) if user - end - end - - def rebuild_problem_topics(limit: 10000) - topic_ids = load_problem_topic_ids(limit) - - if @verbose - puts "rebuilding #{topic_ids.length} topics" - end - - topic_ids.each do |id| - topic = Topic.find_by(id: id) - SearchIndexer.index(topic, force: true) if topic - end - end - - def rebuild_problem_posts(limit: 20000, indexer: SearchIndexer, verbose: false) - post_ids = load_problem_post_ids(limit) - verbose ||= @verbose - - if verbose - puts "rebuilding #{post_ids.length} posts" - end - - i = 0 - post_ids.each do |id| - # could be deleted while iterating through batch - if post = Post.find_by(id: id) - indexer.index(post, force: true) - i += 1 - - if verbose && i % 1000 == 0 - puts "#{i} posts reindexed" - end - end - end - end - - def rebuild_problem_tags(limit: 10000) + def rebuild_tags(limit: 1_000, indexer: SearchIndexer) tag_ids = load_problem_tag_ids(limit) - if @verbose - puts "rebuilding #{tag_ids.length} tags" - end + puts "rebuilding #{tag_ids.size} tags" if @verbose tag_ids.each do |id| tag = Tag.find_by(id: id) - SearchIndexer.index(tag, force: true) if tag + indexer.index(tag, force: true) if tag end end - private + def rebuild_topics(limit: 10_000, indexer: SearchIndexer) + topic_ids = load_problem_topic_ids(limit) - def clean_post_search_data - puts "cleaning up post search data" if @verbose + puts "rebuilding #{topic_ids.size} topics" if @verbose - PostSearchData - .joins("LEFT JOIN posts p ON p.id = post_search_data.post_id") - .where("p.raw = ''") - .delete_all - - DB.exec(<<~SQL, deleted_at: CLEANUP_GRACE_PERIOD) - DELETE FROM post_search_data - WHERE post_id IN ( - SELECT post_id - FROM post_search_data - LEFT JOIN posts ON post_search_data.post_id = posts.id - INNER JOIN topics ON posts.topic_id = topics.id - WHERE (topics.deleted_at IS NOT NULL - AND topics.deleted_at <= :deleted_at) OR ( - posts.deleted_at IS NOT NULL AND - posts.deleted_at <= :deleted_at - ) - - ) - SQL + topic_ids.each do |id| + topic = Topic.find_by(id: id) + indexer.index(topic, force: true) if topic + end end - def clean_topic_search_data + def rebuild_posts(limit: 20_000, indexer: SearchIndexer) + post_ids = load_problem_post_ids(limit) + + puts "rebuilding #{post_ids.size} posts" if @verbose + + post_ids.each do |id| + post = Post.find_by(id: id) + indexer.index(post, force: true) if post + end + end + + def rebuild_users(limit: 5_000, indexer: SearchIndexer) + user_ids = load_problem_user_ids(limit) + + puts "rebuilding #{user_ids.size} users" if @verbose + + user_ids.each do |id| + user = User.find_by(id: id) + indexer.index(user, force: true) if user + end + end + + def clean_topics puts "cleaning up topic search data" if @verbose - DB.exec(<<~SQL, deleted_at: CLEANUP_GRACE_PERIOD) - DELETE FROM topic_search_data - WHERE topic_id IN ( - SELECT topic_id - FROM topic_search_data - INNER JOIN topics ON topic_search_data.topic_id = topics.id - WHERE topics.deleted_at IS NOT NULL - AND topics.deleted_at <= :deleted_at - ) + # remove search data from deleted topics + + DB.exec(<<~SQL, deleted_at: @cleanup_grace_period) + DELETE FROM topic_search_data + WHERE topic_id IN ( + SELECT topic_id + FROM topic_search_data + LEFT JOIN topics ON topic_id = topics.id + WHERE topics.id IS NULL + OR (deleted_at IS NOT NULL AND deleted_at <= :deleted_at) + ) SQL end - def load_problem_post_ids(limit) - params = { - locale: SiteSetting.default_locale, - version: SearchIndexer::MIN_POST_REINDEX_VERSION, - limit: limit - } + def clean_posts + puts "cleaning up post search data" if @verbose - DB.query_single(<<~SQL, params) - SELECT - posts.id - FROM posts - JOIN topics ON topics.id = posts.topic_id - LEFT JOIN post_search_data pd - ON pd.locale = :locale - AND pd.version >= :version - AND pd.post_id = posts.id - WHERE pd.post_id IS NULL - AND posts.deleted_at IS NULL - AND topics.deleted_at IS NULL - AND posts.raw != '' - ORDER BY posts.id DESC - LIMIT :limit + # remove search data from deleted/empty posts + + DB.exec(<<~SQL, deleted_at: @cleanup_grace_period) + DELETE FROM post_search_data + WHERE post_id IN ( + SELECT post_id + FROM post_search_data + LEFT JOIN posts ON post_id = posts.id + JOIN topics ON posts.topic_id = topics.id + WHERE posts.id IS NULL + OR posts.raw = '' + OR (posts.deleted_at IS NOT NULL AND posts.deleted_at <= :deleted_at) + OR (topics.deleted_at IS NOT NULL AND topics.deleted_at <= :deleted_at) + ) SQL end def load_problem_category_ids(limit) - Category.joins(:category_search_data) - .where('category_search_data.locale != ? - OR category_search_data.version != ?', SiteSetting.default_locale, SearchIndexer::CATEGORY_INDEX_VERSION) - .order('categories.id asc') - .limit(limit) - .pluck(:id) - end - - def load_problem_topic_ids(limit) - Topic.joins(:topic_search_data) - .where('topic_search_data.locale != ? - OR topic_search_data.version != ?', SiteSetting.default_locale, SearchIndexer::TOPIC_INDEX_VERSION) - .order('topics.id desc') - .limit(limit) - .pluck(:id) - end - - def load_problem_user_ids(limit) - User.joins(:user_search_data) - .where('user_search_data.locale != ? - OR user_search_data.version != ?', SiteSetting.default_locale, SearchIndexer::USER_INDEX_VERSION) - .order('users.id asc') + Category + .joins("LEFT JOIN category_search_data ON category_id = categories.id") + .where("category_search_data.locale IS NULL OR category_search_data.locale != ? OR category_search_data.version != ?", SiteSetting.default_locale, SearchIndexer::CATEGORY_INDEX_VERSION) + .order("categories.id ASC") .limit(limit) .pluck(:id) end def load_problem_tag_ids(limit) - Tag.joins(:tag_search_data) - .where('tag_search_data.locale != ? - OR tag_search_data.version != ?', SiteSetting.default_locale, SearchIndexer::TAG_INDEX_VERSION) - .order('tags.id asc') + Tag + .joins("LEFT JOIN tag_search_data ON tag_id = tags.id") + .where("tag_search_data.locale IS NULL OR tag_search_data.locale != ? OR tag_search_data.version != ?", SiteSetting.default_locale, SearchIndexer::TAG_INDEX_VERSION) + .order("tags.id ASC") .limit(limit) .pluck(:id) end + + def load_problem_topic_ids(limit) + Topic + .joins("LEFT JOIN topic_search_data ON topic_id = topics.id") + .where("topic_search_data.locale IS NULL OR topic_search_data.locale != ? OR topic_search_data.version != ?", SiteSetting.default_locale, SearchIndexer::TOPIC_INDEX_VERSION) + .order("topics.id DESC") + .limit(limit) + .pluck(:id) + end + + def load_problem_post_ids(limit) + Post + .joins(:topic) + .joins("LEFT JOIN post_search_data ON post_id = posts.id") + .where("posts.raw != ''") + .where("topics.deleted_at IS NULL") + .where("post_search_data.locale IS NULL OR post_search_data.locale != ? OR post_search_data.version != ?", SiteSetting.default_locale, SearchIndexer::POST_INDEX_VERSION) + .order("posts.id DESC") + .limit(limit) + .pluck(:id) + end + + def load_problem_user_ids(limit) + User + .joins("LEFT JOIN user_search_data ON user_id = users.id") + .where("user_search_data.locale IS NULL OR user_search_data.locale != ? OR user_search_data.version != ?", SiteSetting.default_locale, SearchIndexer::USER_INDEX_VERSION) + .order("users.id ASC") + .limit(limit) + .pluck(:id) + end + end end diff --git a/app/models/user_search.rb b/app/models/user_search.rb index 447f5d44925..2b9b05d0f85 100644 --- a/app/models/user_search.rb +++ b/app/models/user_search.rb @@ -1,13 +1,12 @@ # frozen_string_literal: true -# Searches for a user by username or full text or name (if enabled in SiteSettings) class UserSearch MAX_SIZE_PRIORITY_MENTION ||= 500 def initialize(term, opts = {}) - @term = term - @term_like = "#{term.downcase.gsub("_", "\\_")}%" + @term = term.downcase + @term_like = @term.gsub("_", "\\_") + "%" @topic_id = opts[:topic_id] @category_id = opts[:category_id] @topic_allowed_users = opts[:topic_allowed_users] @@ -15,131 +14,125 @@ class UserSearch @include_staged_users = opts[:include_staged_users] || false @limit = opts[:limit] || 20 @groups = opts[:groups] + + @topic = Topic.find(@topic_id) if @topic_id + @category = Category.find(@category_id) if @category_id + @guardian = Guardian.new(@searching_user) - @guardian.ensure_can_see_groups_members! @groups if @groups - @guardian.ensure_can_see_category! Category.find(@category_id) if @category_id - @guardian.ensure_can_see_topic! Topic.find(@topic_id) if @topic_id + @guardian.ensure_can_see_groups_members!(@groups) if @groups + @guardian.ensure_can_see_category!(@category) if @category + @guardian.ensure_can_see_topic!(@topic) if @topic end def scoped_users users = User.where(active: true) users = users.where(staged: false) unless @include_staged_users + users = users.not_suspended unless @searching_user&.staff? if @groups - users = users.joins("INNER JOIN group_users ON group_users.user_id = users.id") + users = users + .joins(:group_users) .where("group_users.group_id IN (?)", @groups.map(&:id)) end - unless @searching_user && @searching_user.staff? - users = users.not_suspended - end - # Only show users who have access to private topic - if @topic_id && @topic_allowed_users == "true" - topic = Topic.find_by(id: @topic_id) - - if topic.category && topic.category.read_restricted - users = users.includes(:secure_categories) - .where("users.admin = TRUE OR categories.id = ?", topic.category.id) - .references(:categories) - end - end - - users.limit(@limit) - end - - def filtered_by_term_users - users = scoped_users - - if @term.present? - if SiteSetting.enable_names? && @term !~ /[_\.-]/ - query = Search.ts_query(term: @term, ts_config: "simple") - - users = users.includes(:user_search_data) - .references(:user_search_data) - .where("user_search_data.search_data @@ #{query}") - .order(DB.sql_fragment("CASE WHEN username_lower LIKE ? THEN 0 ELSE 1 END ASC", @term_like)) - - else - users = users.where("username_lower LIKE :term_like", term_like: @term_like) - end + if @topic_allowed_users == "true" && @topic&.category&.read_restricted + users = users + .references(:categories) + .includes(:secure_categories) + .where("users.admin OR categories.id = ?", @topic.category_id) end users end + def filtered_by_term_users + if @term.blank? + scoped_users + elsif SiteSetting.enable_names? && @term !~ /[_\.-]/ + query = Search.ts_query(term: @term, ts_config: "simple") + + scoped_users + .includes(:user_search_data) + .where("user_search_data.search_data @@ #{query}") + .order(DB.sql_fragment("CASE WHEN username_lower LIKE ? THEN 0 ELSE 1 END ASC", @term_like)) + else + scoped_users.where("username_lower LIKE :term_like", term_like: @term_like) + end + end + def search_ids users = Set.new # 1. exact username matches if @term.present? - scoped_users.where(username_lower: @term.downcase) + scoped_users + .where(username_lower: @term) .limit(@limit) .pluck(:id) .each { |id| users << id } - end - return users.to_a if users.length >= @limit + return users.to_a if users.size >= @limit # 2. in topic if @topic_id in_topic = filtered_by_term_users - .where('users.id IN (SELECT p.user_id FROM posts p WHERE topic_id = ?)', @topic_id) + .where('users.id IN (SELECT user_id FROM posts WHERE topic_id = ?)', @topic_id) if @searching_user.present? in_topic = in_topic.where('users.id <> ?', @searching_user.id) end in_topic - .order('last_seen_at DESC') - .limit(@limit - users.length) + .order('last_seen_at DESC NULLS LAST') + .limit(@limit - users.size) .pluck(:id) .each { |id| users << id } end - return users.to_a if users.length >= @limit + return users.to_a if users.size >= @limit - secure_category_id = nil + # 3. in category + secure_category_id = + if @category_id + DB.query_single(<<~SQL, @category_id).first + SELECT id + FROM categories + WHERE read_restricted + AND id = ? + SQL + elsif @topic_id + DB.query_single(<<~SQL, @topic_id).first + SELECT id + FROM categories + WHERE read_restricted + AND id IN (SELECT category_id FROM topics WHERE id = ?) + SQL + end - if @category_id - secure_category_id = DB.query_single(<<~SQL, @category_id).first - SELECT id FROM categories - WHERE read_restricted AND id = ? - SQL - elsif @topic_id - secure_category_id = DB.query_single(<<~SQL, @topic_id).first - SELECT id FROM categories - WHERE read_restricted AND id IN ( - SELECT category_id FROM topics - WHERE id = ? - ) - SQL - end - - # 3. category matches if secure_category_id - @searching_user.present? - category_groups = Group.where(<<~SQL, secure_category_id, MAX_SIZE_PRIORITY_MENTION) groups.id IN ( - SELECT group_id FROM category_groups - JOIN groups g ON group_id = g.id - WHERE - category_id = ? AND - user_count < ? + SELECT group_id + FROM category_groups + JOIN groups g ON group_id = g.id + WHERE category_id = ? + AND user_count < ? ) SQL - category_groups = category_groups.members_visible_groups(@searching_user) + if @searching_user.present? + category_groups = category_groups.members_visible_groups(@searching_user) + end in_category = filtered_by_term_users .where(<<~SQL, category_groups.pluck(:id)) users.id IN ( SELECT gu.user_id - FROM group_users gu - WHERE group_id IN (?) - LIMIT 200 + FROM group_users gu + WHERE group_id IN (?) + LIMIT 200 ) SQL @@ -148,17 +141,19 @@ class UserSearch end in_category - .order('last_seen_at DESC') - .limit(@limit - users.length) + .order('last_seen_at DESC NULLS LAST') + .limit(@limit - users.size) .pluck(:id) .each { |id| users << id } end - return users.to_a if users.length >= @limit + return users.to_a if users.size >= @limit + # 4. global matches if @term.present? - filtered_by_term_users.order('last_seen_at DESC') - .limit(@limit - users.length) + filtered_by_term_users + .order('last_seen_at DESC NULLS LAST') + .limit(@limit - users.size) .pluck(:id) .each { |id| users << id } end diff --git a/spec/components/search_spec.rb b/spec/components/search_spec.rb index b34e2c40807..16a583a2a5b 100644 --- a/spec/components/search_spec.rb +++ b/spec/components/search_spec.rb @@ -961,7 +961,7 @@ describe Search do it 'can find posts with tags' do # we got to make this index (it is deferred) - Jobs::ReindexSearch.new.rebuild_problem_posts + Jobs::ReindexSearch.new.rebuild_posts result = Search.execute(tag.name) expect(result.posts.length).to eq(1) @@ -977,7 +977,7 @@ describe Search do it 'can find posts with tag synonyms' do synonym = Fabricate(:tag, name: 'synonym', target_tag: tag) - Jobs::ReindexSearch.new.rebuild_problem_posts + Jobs::ReindexSearch.new.rebuild_posts result = Search.execute(synonym.name) expect(result.posts.length).to eq(1) end diff --git a/spec/jobs/reindex_search_spec.rb b/spec/jobs/reindex_search_spec.rb index b6132855e22..3128aeccb1a 100644 --- a/spec/jobs/reindex_search_spec.rb +++ b/spec/jobs/reindex_search_spec.rb @@ -34,7 +34,7 @@ describe Jobs::ReindexSearch do end end - describe 'rebuild_problem_posts' do + describe 'rebuild_posts' do class FakeIndexer def self.index(post, force:) get_posts.push(post) @@ -59,10 +59,7 @@ describe Jobs::ReindexSearch do FakeIndexer.reset end - it ( - 'should not reindex posts that belong to a deleted topic ' \ - 'or have been trashed' - ) do + it "should not reindex posts that belong to a deleted topic or have been trashed" do post = Fabricate(:post) post2 = Fabricate(:post) post3 = Fabricate(:post) @@ -70,7 +67,7 @@ describe Jobs::ReindexSearch do post2.topic.trash! post3.trash! - subject.rebuild_problem_posts(indexer: FakeIndexer) + subject.rebuild_posts(indexer: FakeIndexer) expect(FakeIndexer.posts).to contain_exactly(post) end @@ -78,7 +75,7 @@ describe Jobs::ReindexSearch do it 'should not reindex posts with a developmental version' do post = Fabricate(:post, version: SearchIndexer::MIN_POST_REINDEX_VERSION + 1) - subject.rebuild_problem_posts(indexer: FakeIndexer) + subject.rebuild_posts(indexer: FakeIndexer) expect(FakeIndexer.posts).to eq([]) end @@ -94,7 +91,7 @@ describe Jobs::ReindexSearch do post2.save!(validate: false) - subject.rebuild_problem_posts(indexer: FakeIndexer) + subject.rebuild_posts(indexer: FakeIndexer) expect(FakeIndexer.posts).to contain_exactly(post) end @@ -107,9 +104,7 @@ describe Jobs::ReindexSearch do [topic, topic2].each { |t| SearchIndexer.index(t, force: true) } - freeze_time(described_class::CLEANUP_GRACE_PERIOD) do - topic.trash! - end + freeze_time(1.day.ago) { topic.trash! } expect { subject.execute({}) }.to change { TopicSearchData.count }.by(-1) expect(Topic.pluck(:id)).to contain_exactly(topic2.id) @@ -119,11 +114,7 @@ describe Jobs::ReindexSearch do ) end - it( - "should clean up post_search_data of posts with empty raw or posts from " \ - "trashed topics" - ) do - + it "should clean up post_search_data of posts with empty raw or posts from trashed topics" do post = Fabricate(:post) post2 = Fabricate(:post, post_type: Post.types[:small_action]) post2.raw = "" @@ -132,7 +123,7 @@ describe Jobs::ReindexSearch do post3.topic.trash! post4, post5, post6 = nil - freeze_time(described_class::CLEANUP_GRACE_PERIOD) do + freeze_time(1.day.ago) do post4 = Fabricate(:post) post4.topic.trash! diff --git a/spec/requests/similar_topics_controller_spec.rb b/spec/requests/similar_topics_controller_spec.rb index cef7f5e205e..8ecc0486681 100644 --- a/spec/requests/similar_topics_controller_spec.rb +++ b/spec/requests/similar_topics_controller_spec.rb @@ -17,7 +17,7 @@ describe SimilarTopicsController do def reindex_posts SearchIndexer.enable - Jobs::ReindexSearch.new.rebuild_problem_posts + Jobs::ReindexSearch.new.rebuild_posts end it "requires a title param" do