From 5de483a1b96c341c1f2534ed314e8febc33d42c1 Mon Sep 17 00:00:00 2001
From: Vinoth Kannan <vinothkannan@vinkas.com>
Date: Fri, 19 Apr 2019 17:27:16 +0530
Subject: [PATCH] PERF: Verify post upload's existence by preloaded upload
 sha1s array

---
 app/models/concerns/has_url.rb | 10 ++++++
 lib/tasks/posts.rake           | 59 +++++++++++++++++++++++++---------
 2 files changed, 53 insertions(+), 16 deletions(-)

diff --git a/app/models/concerns/has_url.rb b/app/models/concerns/has_url.rb
index 8b92f6ab658..5fdc4b26490 100644
--- a/app/models/concerns/has_url.rb
+++ b/app/models/concerns/has_url.rb
@@ -6,6 +6,16 @@ module HasUrl
       url.match(self::URL_REGEX)
     end
 
+    def extract_sha1(path)
+      data = extract_url(path)
+      return if data.blank?
+
+      sha1 = data[2]
+      return if sha1&.length != Upload::SHA1_LENGTH
+
+      sha1
+    end
+
     def get_from_url(url)
       return if url.blank?
 
diff --git a/lib/tasks/posts.rake b/lib/tasks/posts.rake
index d01a409a944..f8cf44ecf93 100644
--- a/lib/tasks/posts.rake
+++ b/lib/tasks/posts.rake
@@ -388,31 +388,58 @@ task 'posts:reorder_posts', [:topic_id] => [:environment] do |_, args|
   puts "", "Done.", ""
 end
 
+UPLOAD_PATTERNS ||= [
+  /\/uploads\/#{RailsMultisite::ConnectionManagement.current_db}\//,
+  /\/original\//,
+  /\/optimized\//
+].freeze
+
 desc 'Finds missing post upload records from cooked HTML content'
 task 'posts:missing_uploads' => :environment do
   PostCustomField.where(name: Post::MISSING_UPLOADS).destroy_all
-  posts = Post.have_uploads.select(:id, :cooked)
   count = 0
 
-  posts.find_each do |post|
-    missing = []
+  Post.have_uploads.select(:id, :cooked).find_in_batches do |posts|
+    ids = posts.pluck(:id)
+    sha1s = Upload.joins(:post_uploads).where("post_uploads.post_id >= ? AND post_uploads.post_id <= ?", ids.min, ids.max).pluck(:sha1)
 
-    Nokogiri::HTML::fragment(post.cooked).css("a/@href", "img/@src").each do |media|
-      src = media.value
-      next if src.blank? || (src =~ /\/uploads\/#{RailsMultisite::ConnectionManagement.current_db}\//).blank?
+    posts.each do |post|
+      missing = []
 
-      src = "#{SiteSetting.force_https ? "https" : "http"}:#{src}" if src.start_with?("//")
-      next unless Discourse.store.has_been_uploaded?(src) || src =~ /\A\/[^\/]/i
+      Nokogiri::HTML::fragment(post.cooked).css("a/@href", "img/@src").each do |media|
+        src = media.value
+        next if src.blank? || UPLOAD_PATTERNS.none? { |pattern| src =~ pattern }
 
-      missing << src unless Upload.get_from_url(src) || OptimizedImage.get_from_url(src)
+        src = "#{SiteSetting.force_https ? "https" : "http"}:#{src}" if src.start_with?("//")
+        next unless Discourse.store.has_been_uploaded?(src) || src =~ /\A\/[^\/]/i
+
+        path = begin
+          URI(URI.unescape(src))&.path
+        rescue URI::Error
+        end
+
+        next if path.blank?
+
+        sha1 =
+          if path.include? "optimized"
+            OptimizedImage.extract_sha1(path)
+          else
+            Upload.extract_sha1(path)
+          end
+
+        if sha1.blank? || sha1s.exclude?(sha1)
+          missing << src
+        end
+      end
+
+      if missing.present?
+        PostCustomField.create!(post_id: post.id, name: Post::MISSING_UPLOADS, value: missing.to_json)
+        count += missing.count
+        putc "x"
+      else
+        putc "."
+      end
     end
-
-    if missing.present?
-      PostCustomField.create!(post_id: post.id, name: Post::MISSING_UPLOADS, value: missing.to_json)
-      count += missing.count
-    end
-
-    putc "."
   end
 
   puts "", "#{count} post uploads are missing.", ""