From cb12a721c4dd8187dab3eb2b915054a9ac76caa0 Mon Sep 17 00:00:00 2001
From: David Taylor <david@taylorhq.com>
Date: Wed, 5 Aug 2020 12:14:59 +0100
Subject: [PATCH] REFACTOR: Refactor pull_hotlinked_images job

This commit should cause no functional change
- Split into functions to avoid deep nesting
- Register custom field type, and remove manual json parse/serialize
- Recover from deleted upload records

Also adds a test to ensure pull_hotlinked_images redownloads secure images only once
---
 app/jobs/regular/pull_hotlinked_images.rb     | 243 ++++++++++--------
 app/models/post.rb                            |   8 +-
 lib/cooked_post_processor.rb                  |  14 +-
 spec/components/cooked_post_processor_spec.rb |   2 +-
 spec/jobs/pull_hotlinked_images_spec.rb       |   6 +-
 spec/requests/posts_controller_spec.rb        |   2 +-
 6 files changed, 144 insertions(+), 131 deletions(-)

diff --git a/app/jobs/regular/pull_hotlinked_images.rb b/app/jobs/regular/pull_hotlinked_images.rb
index 5e3266d21e0..5a6ab648a59 100644
--- a/app/jobs/regular/pull_hotlinked_images.rb
+++ b/app/jobs/regular/pull_hotlinked_images.rb
@@ -9,6 +9,89 @@ module Jobs
       @max_size = SiteSetting.max_image_size_kb.kilobytes
     end
 
+    def execute(args)
+      @post_id = args[:post_id]
+      raise Discourse::InvalidParameters.new(:post_id) if @post_id.blank?
+
+      post = Post.find_by(id: @post_id)
+      return if post.blank?
+
+      raw = post.raw.dup
+      start_raw = raw.dup
+
+      large_image_urls = post.custom_fields[Post::LARGE_IMAGES] || []
+      broken_image_urls = post.custom_fields[Post::BROKEN_IMAGES] || []
+      downloaded_image_ids = post.custom_fields[Post::DOWNLOADED_IMAGES] || {}
+
+      upload_records = Upload.where(id: downloaded_image_ids.values)
+      upload_records = Hash[upload_records.map { |u| [u.id, u] }]
+
+      downloaded_images = {}
+      downloaded_image_ids.each { |url, id| downloaded_images[url] = upload_records[id] }
+
+      extract_images_from(post.cooked).each do |node|
+        download_src = original_src = node['src'] || node['href']
+        download_src = "#{SiteSetting.force_https ? "https" : "http"}:#{original_src}" if original_src.start_with?("//")
+        normalized_src = normalize_src(download_src)
+
+        next if !should_download_image?(download_src, post)
+
+        begin
+          already_attempted_download = downloaded_images.include?(normalized_src) || large_image_urls.include?(normalized_src) || broken_image_urls.include?(normalized_src)
+          if !already_attempted_download
+            downloaded_images[normalized_src] = attempt_download(download_src, post.user_id)
+          end
+        rescue ImageTooLargeError
+          large_image_urls << normalized_src
+        rescue ImageBrokenError
+          broken_image_urls << normalized_src
+        end
+
+        # have we successfully downloaded that file?
+        if upload = downloaded_images[normalized_src]
+          raw = replace_in_raw(original_src: original_src, upload: upload, raw: raw)
+        end
+      rescue => e
+        raise e if Rails.env.test?
+        log(:error, "Failed to pull hotlinked image (#{download_src}) post: #{@post_id}\n" + e.message + "\n" + e.backtrace.join("\n"))
+      end
+
+      large_image_urls.uniq!
+      broken_image_urls.uniq!
+      downloaded_images.compact!
+
+      post.custom_fields[Post::LARGE_IMAGES] = large_image_urls
+      post.custom_fields[Post::BROKEN_IMAGES] = broken_image_urls
+
+      downloaded_image_ids = {}
+      downloaded_images.each { |url, upload| downloaded_image_ids[url] = upload.id }
+      post.custom_fields[Post::DOWNLOADED_IMAGES] = downloaded_image_ids
+
+      [Post::LARGE_IMAGES, Post::BROKEN_IMAGES, Post::DOWNLOADED_IMAGES].each do |key|
+        post.custom_fields.delete(key) if !post.custom_fields[key].present?
+      end
+
+      custom_fields_updated = !post.custom_fields_clean?
+
+      # only save custom fields if they changed
+      post.save_custom_fields if custom_fields_updated
+
+      # If post changed while we were downloading images, never apply edits
+      post.reload
+      post_changed_elsewhere = (start_raw != post.raw)
+      raw_changed_here = (raw != post.raw)
+
+      if !post_changed_elsewhere && raw_changed_here
+        changes = { raw: raw, edit_reason: I18n.t("upload.edit_reason") }
+        post.revise(Discourse.system_user, changes, bypass_bump: true, skip_staff_log: true)
+      elsif custom_fields_updated
+        post.trigger_post_process(
+          bypass_bump: true,
+          skip_pull_hotlinked_images: true # Avoid an infinite loop of job scheduling
+        )
+      end
+    end
+
     def download(src)
       downloaded = nil
 
@@ -32,131 +115,67 @@ module Jobs
       downloaded
     end
 
-    def execute(args)
-      post_id = args[:post_id]
-      raise Discourse::InvalidParameters.new(:post_id) unless post_id.present?
+    class ImageTooLargeError < StandardError; end
+    class ImageBrokenError < StandardError; end
 
-      post = Post.find_by(id: post_id)
-      return unless post.present?
+    def attempt_download(src, user_id)
+      # secure-media-uploads endpoint prevents anonymous downloads, so we
+      # need the presigned S3 URL here
+      src = Upload.signed_url_from_secure_media_url(src) if Upload.secure_media_url?(src)
 
-      raw = post.raw.dup
-      start_raw = raw.dup
+      hotlinked = download(src)
+      raise ImageBrokenError if !hotlinked
+      raise ImageTooLargeError if File.size(hotlinked.path) > @max_size
 
-      downloaded_urls = {}
+      filename = File.basename(URI.parse(src).path)
+      filename << File.extname(hotlinked.path) unless filename["."]
+      upload = UploadCreator.new(hotlinked, filename, origin: src).create_for(user_id)
 
-      large_images = JSON.parse(post.custom_fields[Post::LARGE_IMAGES].presence || "[]")
-      broken_images = JSON.parse(post.custom_fields[Post::BROKEN_IMAGES].presence || "[]")
-      downloaded_images = JSON.parse(post.custom_fields[Post::DOWNLOADED_IMAGES].presence || "{}")
+      if upload.persisted?
+        upload
+      else
+        log(:info, "Failed to persist downloaded hotlinked image for post: #{@post_id}: #{src} - #{upload.errors.full_messages.join("\n")}")
+        nil
+      end
+    end
 
-      has_new_large_image  = false
-      has_new_broken_image = false
-      has_downloaded_image = false
+    def replace_in_raw(original_src:, raw:, upload:)
+      raw = raw.dup
+      escaped_src = Regexp.escape(original_src)
 
-      extract_images_from(post.cooked).each do |node|
-        src = original_src = node['src'] || node['href']
-        src = "#{SiteSetting.force_https ? "https" : "http"}:#{src}" if src.start_with?("//")
-
-        if should_download_image?(src, post)
-          begin
-            # have we already downloaded that file?
-            schemeless_src = normalize_src(original_src)
-
-            unless downloaded_images.include?(schemeless_src) || large_images.include?(schemeless_src) || broken_images.include?(schemeless_src)
-
-              # secure-media-uploads endpoint prevents anonymous downloads, so we
-              # need the presigned S3 URL here
-              src = Upload.signed_url_from_secure_media_url(src) if Upload.secure_media_url?(src)
-
-              if hotlinked = download(src)
-                if File.size(hotlinked.path) <= @max_size
-                  filename = File.basename(URI.parse(src).path)
-                  filename << File.extname(hotlinked.path) unless filename["."]
-                  upload = UploadCreator.new(hotlinked, filename, origin: src).create_for(post.user_id)
-
-                  if upload.persisted?
-                    downloaded_urls[src] = upload.url
-                    downloaded_images[normalize_src(src)] = upload.id
-                    has_downloaded_image = true
-                  else
-                    log(:info, "Failed to pull hotlinked image for post: #{post_id}: #{src} - #{upload.errors.full_messages.join("\n")}")
-                  end
-                else
-                  large_images << normalize_src(original_src)
-                  has_new_large_image = true
-                end
-              else
-                broken_images << normalize_src(original_src)
-                has_new_broken_image = true
-              end
+      replace_raw = ->(match, match_src, replacement, _index) {
+        if normalize_src(original_src) == normalize_src(match_src)
+          replacement =
+            if replacement.include?(InlineUploads::PLACEHOLDER)
+              replacement.sub(InlineUploads::PLACEHOLDER, upload.short_url)
+            elsif replacement.include?(InlineUploads::PATH_PLACEHOLDER)
+              replacement.sub(InlineUploads::PATH_PLACEHOLDER, upload.short_path)
             end
 
-            # have we successfully downloaded that file?
-            if downloaded_urls[src].present?
-              escaped_src = Regexp.escape(original_src)
-
-              replace_raw = ->(match, match_src, replacement, _index) {
-
-                if normalize_src(src) == normalize_src(match_src)
-                  replacement =
-                    if replacement.include?(InlineUploads::PLACEHOLDER)
-                      replacement.sub(InlineUploads::PLACEHOLDER, upload.short_url)
-                    elsif replacement.include?(InlineUploads::PATH_PLACEHOLDER)
-                      replacement.sub(InlineUploads::PATH_PLACEHOLDER, upload.short_path)
-                    end
-
-                  raw = raw.gsub(
-                    match,
-                    replacement
-                  )
-                end
-              }
-
-              # there are 6 ways to insert an image in a post
-              # HTML tag - <img src="http://...">
-              InlineUploads.match_img(raw, external_src: true, &replace_raw)
-
-              # BBCode tag - [img]http://...[/img]
-              InlineUploads.match_bbcode_img(raw, external_src: true, &replace_raw)
-
-              # Markdown linked image - [![alt](http://...)](http://...)
-              # Markdown inline - ![alt](http://...)
-              # Markdown inline - ![](http://... "image title")
-              # Markdown inline - ![alt](http://... "image title")
-              InlineUploads.match_md_inline_img(raw, external_src: true, &replace_raw)
-
-              # Direct link
-              raw.gsub!(/^#{escaped_src}(\s?)$/) { "![](#{upload.short_url})#{$1}" }
-            end
-          rescue => e
-            if Rails.env.test?
-              raise e
-            else
-              log(:error, "Failed to pull hotlinked image (#{src}) post: #{post_id}\n" + e.message + "\n" + e.backtrace.join("\n"))
-            end
-          end
+          raw = raw.gsub(
+            match,
+            replacement
+          )
         end
-      end
+      }
 
-      large_images.uniq!
-      broken_images.uniq!
+      # there are 6 ways to insert an image in a post
+      # HTML tag - <img src="http://...">
+      InlineUploads.match_img(raw, external_src: true, &replace_raw)
 
-      post.custom_fields[Post::LARGE_IMAGES]      = large_images.to_json      if large_images.present?
-      post.custom_fields[Post::BROKEN_IMAGES]     = broken_images.to_json     if broken_images.present?
-      post.custom_fields[Post::DOWNLOADED_IMAGES] = downloaded_images.to_json if downloaded_images.present?
-      # only save custom fields if there are any
-      post.save_custom_fields if large_images.present? || broken_images.present? || downloaded_images.present?
+      # BBCode tag - [img]http://...[/img]
+      InlineUploads.match_bbcode_img(raw, external_src: true, &replace_raw)
 
-      post.reload
+      # Markdown linked image - [![alt](http://...)](http://...)
+      # Markdown inline - ![alt](http://...)
+      # Markdown inline - ![](http://... "image title")
+      # Markdown inline - ![alt](http://... "image title")
+      InlineUploads.match_md_inline_img(raw, external_src: true, &replace_raw)
 
-      if start_raw == post.raw && raw != post.raw
-        changes = { raw: raw, edit_reason: I18n.t("upload.edit_reason") }
-        post.revise(Discourse.system_user, changes, bypass_bump: true, skip_staff_log: true)
-      elsif has_downloaded_image || has_new_large_image || has_new_broken_image
-        post.trigger_post_process(
-          bypass_bump: true,
-          skip_pull_hotlinked_images: true # Avoid an infinite loop of job scheduling
-        )
-      end
+      # Direct link
+      raw.gsub!(/^#{escaped_src}(\s?)$/) { "![](#{upload.short_url})#{$1}" }
+
+      raw
     end
 
     def extract_images_from(html)
diff --git a/app/models/post.rb b/app/models/post.rb
index c33649199e3..5a432c8a0ff 100644
--- a/app/models/post.rb
+++ b/app/models/post.rb
@@ -72,6 +72,10 @@ class Post < ActiveRecord::Base
 
   SHORT_POST_CHARS ||= 1200
 
+  register_custom_field_type(LARGE_IMAGES, :json)
+  register_custom_field_type(BROKEN_IMAGES, :json)
+  register_custom_field_type(DOWNLOADED_IMAGES, :json)
+
   register_custom_field_type(MISSING_UPLOADS, :json)
   register_custom_field_type(MISSING_UPLOADS_IGNORED, :boolean)
 
@@ -942,9 +946,7 @@ class Post < ActiveRecord::Base
   end
 
   def downloaded_images
-    JSON.parse(self.custom_fields[Post::DOWNLOADED_IMAGES].presence || "{}")
-  rescue JSON::ParserError
-    {}
+    self.custom_fields[Post::DOWNLOADED_IMAGES] || {}
   end
 
   def each_upload_url(fragments: nil, include_local_upload: true)
diff --git a/lib/cooked_post_processor.rb b/lib/cooked_post_processor.rb
index e46295becf1..f255161f87a 100644
--- a/lib/cooked_post_processor.rb
+++ b/lib/cooked_post_processor.rb
@@ -178,21 +178,11 @@ class CookedPostProcessor
   end
 
   def large_images
-    @large_images ||=
-      begin
-        JSON.parse(@post.custom_fields[Post::LARGE_IMAGES].presence || "[]")
-      rescue JSON::ParserError
-        []
-      end
+    @large_images ||= @post.custom_fields[Post::LARGE_IMAGES].presence || []
   end
 
   def broken_images
-    @broken_images ||=
-      begin
-        JSON.parse(@post.custom_fields[Post::BROKEN_IMAGES].presence || "[]")
-      rescue JSON::ParserError
-        []
-      end
+    @broken_images ||= @post.custom_fields[Post::BROKEN_IMAGES].presence || []
   end
 
   def downloaded_images
diff --git a/spec/components/cooked_post_processor_spec.rb b/spec/components/cooked_post_processor_spec.rb
index dc202453248..424ec994541 100644
--- a/spec/components/cooked_post_processor_spec.rb
+++ b/spec/components/cooked_post_processor_spec.rb
@@ -1084,7 +1084,7 @@ describe CookedPostProcessor do
 
       post = Fabricate(:post, raw: url)
 
-      post.custom_fields[Post::LARGE_IMAGES] = "[\"//image.com/avatar.png\"]"
+      post.custom_fields[Post::LARGE_IMAGES] = ["//image.com/avatar.png"]
       post.save_custom_fields
 
       cpp = CookedPostProcessor.new(post, invalidate_oneboxes: true)
diff --git a/spec/jobs/pull_hotlinked_images_spec.rb b/spec/jobs/pull_hotlinked_images_spec.rb
index 3aebe91a987..b705272d074 100644
--- a/spec/jobs/pull_hotlinked_images_spec.rb
+++ b/spec/jobs/pull_hotlinked_images_spec.rb
@@ -190,11 +190,13 @@ describe Jobs::PullHotlinkedImages do
           post = Fabricate(:post, raw: "<img src='#{url}'>")
           upload.update(access_control_post: Fabricate(:post))
           FileStore::S3Store.any_instance.stubs(:store_upload).returns(upload.url)
+          FastImage.expects(:size).returns([100, 100]).at_least_once
 
-          # without this we get an infinite hang...
-          Post.any_instance.stubs(:trigger_post_process)
           expect { Jobs::PullHotlinkedImages.new.execute(post_id: post.id) }
             .to change { Upload.count }.by(1)
+
+          expect { Jobs::PullHotlinkedImages.new.execute(post_id: post.id) }
+            .to change { Upload.count }.by(0)
         end
       end
     end
diff --git a/spec/requests/posts_controller_spec.rb b/spec/requests/posts_controller_spec.rb
index f2d11f4de6c..30ec063b8c0 100644
--- a/spec/requests/posts_controller_spec.rb
+++ b/spec/requests/posts_controller_spec.rb
@@ -629,7 +629,7 @@ describe PostsController do
 
       it "will invalidate broken images cache" do
         sign_in(moderator)
-        post.custom_fields[Post::BROKEN_IMAGES] = ["https://example.com/image.jpg"].to_json
+        post.custom_fields[Post::BROKEN_IMAGES] = ["https://example.com/image.jpg"]
         post.save_custom_fields
         put "/posts/#{post.id}/rebake.json"
         post.reload