REFACTOR: Refactor pull_hotlinked_images job

This commit should cause no functional change - Split into functions to avoid deep nesting - Register custom field type, and remove manual json parse/serialize - Recover from deleted upload records Also adds a test to ensure pull_hotlinked_images redownloads secure images only once
2024-11-22 14:03:22 +08:00 · 2020-08-05 12:14:59 +01:00 · 2020-08-05 12:14:59 +01:00 · cb12a721c4
commit cb12a721c4
parent aee3c2c34d
6 changed files with 144 additions and 131 deletions
--- a/app/jobs/regular/pull_hotlinked_images.rb
+++ b/app/jobs/regular/pull_hotlinked_images.rb
@ -9,6 +9,89 @@ module Jobs
      @max_size = SiteSetting.max_image_size_kb.kilobytes
    end
    def execute(args)
      @post_id = args[:post_id]
      raise Discourse::InvalidParameters.new(:post_id) if @post_id.blank?
      post = Post.find_by(id: @post_id)
      return if post.blank?
      raw = post.raw.dup
      start_raw = raw.dup
      large_image_urls = post.custom_fields[Post::LARGE_IMAGES] || []
      broken_image_urls = post.custom_fields[Post::BROKEN_IMAGES] || []
      downloaded_image_ids = post.custom_fields[Post::DOWNLOADED_IMAGES] || {}
      upload_records = Upload.where(id: downloaded_image_ids.values)
      upload_records = Hash[upload_records.map { |u| [u.id, u] }]
      downloaded_images = {}
      downloaded_image_ids.each { |url, id| downloaded_images[url] = upload_records[id] }
      extract_images_from(post.cooked).each do |node|
        download_src = original_src = node['src'] || node['href']
        download_src = "#{SiteSetting.force_https ? "https" : "http"}:#{original_src}" if original_src.start_with?("//")
        normalized_src = normalize_src(download_src)
        next if !should_download_image?(download_src, post)
        begin
          already_attempted_download = downloaded_images.include?(normalized_src) || large_image_urls.include?(normalized_src) || broken_image_urls.include?(normalized_src)
          if !already_attempted_download
            downloaded_images[normalized_src] = attempt_download(download_src, post.user_id)
          end
        rescue ImageTooLargeError
          large_image_urls << normalized_src
        rescue ImageBrokenError
          broken_image_urls << normalized_src
        end
        # have we successfully downloaded that file?
        if upload = downloaded_images[normalized_src]
          raw = replace_in_raw(original_src: original_src, upload: upload, raw: raw)
        end
      rescue => e
        raise e if Rails.env.test?
        log(:error, "Failed to pull hotlinked image (#{download_src}) post: #{@post_id}\n" + e.message + "\n" + e.backtrace.join("\n"))
      end
      large_image_urls.uniq!
      broken_image_urls.uniq!
      downloaded_images.compact!
      post.custom_fields[Post::LARGE_IMAGES] = large_image_urls
      post.custom_fields[Post::BROKEN_IMAGES] = broken_image_urls
      downloaded_image_ids = {}
      downloaded_images.each { |url, upload| downloaded_image_ids[url] = upload.id }
      post.custom_fields[Post::DOWNLOADED_IMAGES] = downloaded_image_ids
      [Post::LARGE_IMAGES, Post::BROKEN_IMAGES, Post::DOWNLOADED_IMAGES].each do |key|
        post.custom_fields.delete(key) if !post.custom_fields[key].present?
      end
      custom_fields_updated = !post.custom_fields_clean?
      # only save custom fields if they changed
      post.save_custom_fields if custom_fields_updated
      # If post changed while we were downloading images, never apply edits
      post.reload
      post_changed_elsewhere = (start_raw != post.raw)
      raw_changed_here = (raw != post.raw)
      if !post_changed_elsewhere && raw_changed_here
        changes = { raw: raw, edit_reason: I18n.t("upload.edit_reason") }
        post.revise(Discourse.system_user, changes, bypass_bump: true, skip_staff_log: true)
      elsif custom_fields_updated
        post.trigger_post_process(
          bypass_bump: true,
          skip_pull_hotlinked_images: true # Avoid an infinite loop of job scheduling
        )
      end
    end
    def download(src)
      downloaded = nil
@ -32,131 +115,67 @@ module Jobs
      downloaded
    end
-    def execute(args)
+    class ImageTooLargeError < StandardError; end
-      post_id = args[:post_id]
+    class ImageBrokenError < StandardError; end
      raise Discourse::InvalidParameters.new(:post_id) unless post_id.present?
-      post = Post.find_by(id: post_id)
+    def attempt_download(src, user_id)
-      return unless post.present?
+      # secure-media-uploads endpoint prevents anonymous downloads, so we
      # need the presigned S3 URL here
      src = Upload.signed_url_from_secure_media_url(src) if Upload.secure_media_url?(src)
-      raw = post.raw.dup
+      hotlinked = download(src)
-      start_raw = raw.dup
+      raise ImageBrokenError if !hotlinked
      raise ImageTooLargeError if File.size(hotlinked.path) > @max_size
-      downloaded_urls = {}
+      filename = File.basename(URI.parse(src).path)
      filename << File.extname(hotlinked.path) unless filename["."]
      upload = UploadCreator.new(hotlinked, filename, origin: src).create_for(user_id)
-      large_images = JSON.parse(post.custom_fields[Post::LARGE_IMAGES].presence || "[]")
+      if upload.persisted?
-      broken_images = JSON.parse(post.custom_fields[Post::BROKEN_IMAGES].presence || "[]")
+        upload
-      downloaded_images = JSON.parse(post.custom_fields[Post::DOWNLOADED_IMAGES].presence || "{}")
+      else
        log(:info, "Failed to persist downloaded hotlinked image for post: #{@post_id}: #{src} - #{upload.errors.full_messages.join("\n")}")
        nil
      end
    end
-      has_new_large_image  = false
+    def replace_in_raw(original_src:, raw:, upload:)
-      has_new_broken_image = false
+      raw = raw.dup
-      has_downloaded_image = false
+      escaped_src = Regexp.escape(original_src)
-      extract_images_from(post.cooked).each do |node|
+      replace_raw = ->(match, match_src, replacement, _index) {
-        src = original_src = node['src'] || node['href']
+        if normalize_src(original_src) == normalize_src(match_src)
-        src = "#{SiteSetting.force_https ? "https" : "http"}:#{src}" if src.start_with?("//")
+          replacement =
-
+            if replacement.include?(InlineUploads::PLACEHOLDER)
-        if should_download_image?(src, post)
+              replacement.sub(InlineUploads::PLACEHOLDER, upload.short_url)
-          begin
+            elsif replacement.include?(InlineUploads::PATH_PLACEHOLDER)
-            # have we already downloaded that file?
+              replacement.sub(InlineUploads::PATH_PLACEHOLDER, upload.short_path)
            schemeless_src = normalize_src(original_src)
            unless downloaded_images.include?(schemeless_src) || large_images.include?(schemeless_src) || broken_images.include?(schemeless_src)
              # secure-media-uploads endpoint prevents anonymous downloads, so we
              # need the presigned S3 URL here
              src = Upload.signed_url_from_secure_media_url(src) if Upload.secure_media_url?(src)
              if hotlinked = download(src)
                if File.size(hotlinked.path) <= @max_size
                  filename = File.basename(URI.parse(src).path)
                  filename << File.extname(hotlinked.path) unless filename["."]
                  upload = UploadCreator.new(hotlinked, filename, origin: src).create_for(post.user_id)
                  if upload.persisted?
                    downloaded_urls[src] = upload.url
                    downloaded_images[normalize_src(src)] = upload.id
                    has_downloaded_image = true
                  else
                    log(:info, "Failed to pull hotlinked image for post: #{post_id}: #{src} - #{upload.errors.full_messages.join("\n")}")
                  end
                else
                  large_images << normalize_src(original_src)
                  has_new_large_image = true
                end
              else
                broken_images << normalize_src(original_src)
                has_new_broken_image = true
              end
            end
-            # have we successfully downloaded that file?
+          raw = raw.gsub(
-            if downloaded_urls[src].present?
+            match,
-              escaped_src = Regexp.escape(original_src)
+            replacement
-
+          )
              replace_raw = ->(match, match_src, replacement, _index) {
                if normalize_src(src) == normalize_src(match_src)
                  replacement =
                    if replacement.include?(InlineUploads::PLACEHOLDER)
                      replacement.sub(InlineUploads::PLACEHOLDER, upload.short_url)
                    elsif replacement.include?(InlineUploads::PATH_PLACEHOLDER)
                      replacement.sub(InlineUploads::PATH_PLACEHOLDER, upload.short_path)
                    end
                  raw = raw.gsub(
                    match,
                    replacement
                  )
                end
              }
              # there are 6 ways to insert an image in a post
              # HTML tag - <img src="http://...">
              InlineUploads.match_img(raw, external_src: true, &replace_raw)
              # BBCode tag - [img]http://...[/img]
              InlineUploads.match_bbcode_img(raw, external_src: true, &replace_raw)
              # Markdown linked image - [![alt](http://...)](http://...)
              # Markdown inline - ![alt](http://...)
              # Markdown inline - ![](http://... "image title")
              # Markdown inline - ![alt](http://... "image title")
              InlineUploads.match_md_inline_img(raw, external_src: true, &replace_raw)
              # Direct link
              raw.gsub!(/^#{escaped_src}(\s?)$/) { "![](#{upload.short_url})#{$1}" }
            end
          rescue => e
            if Rails.env.test?
              raise e
            else
              log(:error, "Failed to pull hotlinked image (#{src}) post: #{post_id}\n" + e.message + "\n" + e.backtrace.join("\n"))
            end
          end
        end
-      end
+      }
-      large_images.uniq!
+      # there are 6 ways to insert an image in a post
-      broken_images.uniq!
+      # HTML tag - <img src="http://...">
      InlineUploads.match_img(raw, external_src: true, &replace_raw)
-      post.custom_fields[Post::LARGE_IMAGES]      = large_images.to_json      if large_images.present?
+      # BBCode tag - [img]http://...[/img]
-      post.custom_fields[Post::BROKEN_IMAGES]     = broken_images.to_json     if broken_images.present?
+      InlineUploads.match_bbcode_img(raw, external_src: true, &replace_raw)
      post.custom_fields[Post::DOWNLOADED_IMAGES] = downloaded_images.to_json if downloaded_images.present?
      # only save custom fields if there are any
      post.save_custom_fields if large_images.present? || broken_images.present? || downloaded_images.present?
-      post.reload
+      # Markdown linked image - [![alt](http://...)](http://...)
      # Markdown inline - ![alt](http://...)
      # Markdown inline - ![](http://... "image title")
      # Markdown inline - ![alt](http://... "image title")
      InlineUploads.match_md_inline_img(raw, external_src: true, &replace_raw)
-      if start_raw == post.raw && raw != post.raw
+      # Direct link
-        changes = { raw: raw, edit_reason: I18n.t("upload.edit_reason") }
+      raw.gsub!(/^#{escaped_src}(\s?)$/) { "![](#{upload.short_url})#{$1}" }
-        post.revise(Discourse.system_user, changes, bypass_bump: true, skip_staff_log: true)
+
-      elsif has_downloaded_image || has_new_large_image || has_new_broken_image
+      raw
        post.trigger_post_process(
          bypass_bump: true,
          skip_pull_hotlinked_images: true # Avoid an infinite loop of job scheduling
        )
      end
    end
    def extract_images_from(html)
--- a/app/models/post.rb
+++ b/app/models/post.rb
@ -72,6 +72,10 @@ class Post < ActiveRecord::Base
  SHORT_POST_CHARS ||= 1200
  register_custom_field_type(LARGE_IMAGES, :json)
  register_custom_field_type(BROKEN_IMAGES, :json)
  register_custom_field_type(DOWNLOADED_IMAGES, :json)
  register_custom_field_type(MISSING_UPLOADS, :json)
  register_custom_field_type(MISSING_UPLOADS_IGNORED, :boolean)
@ -942,9 +946,7 @@ class Post < ActiveRecord::Base
  end
  def downloaded_images
-    JSON.parse(self.custom_fields[Post::DOWNLOADED_IMAGES].presence || "{}")
+    self.custom_fields[Post::DOWNLOADED_IMAGES] || {}
  rescue JSON::ParserError
    {}
  end
  def each_upload_url(fragments: nil, include_local_upload: true)
--- a/lib/cooked_post_processor.rb
+++ b/lib/cooked_post_processor.rb
@ -178,21 +178,11 @@ class CookedPostProcessor
  end
  def large_images
-    @large_images ||=
+    @large_images ||= @post.custom_fields[Post::LARGE_IMAGES].presence || []
      begin
        JSON.parse(@post.custom_fields[Post::LARGE_IMAGES].presence || "[]")
      rescue JSON::ParserError
        []
      end
  end
  def broken_images
-    @broken_images ||=
+    @broken_images ||= @post.custom_fields[Post::BROKEN_IMAGES].presence || []
      begin
        JSON.parse(@post.custom_fields[Post::BROKEN_IMAGES].presence || "[]")
      rescue JSON::ParserError
        []
      end
  end
  def downloaded_images
--- a/spec/components/cooked_post_processor_spec.rb
+++ b/spec/components/cooked_post_processor_spec.rb
@ -1084,7 +1084,7 @@ describe CookedPostProcessor do
      post = Fabricate(:post, raw: url)
-      post.custom_fields[Post::LARGE_IMAGES] = "[\"//image.com/avatar.png\"]"
+      post.custom_fields[Post::LARGE_IMAGES] = ["//image.com/avatar.png"]
      post.save_custom_fields
      cpp = CookedPostProcessor.new(post, invalidate_oneboxes: true)
--- a/spec/jobs/pull_hotlinked_images_spec.rb
+++ b/spec/jobs/pull_hotlinked_images_spec.rb
@ -190,11 +190,13 @@ describe Jobs::PullHotlinkedImages do
          post = Fabricate(:post, raw: "<img src='#{url}'>")
          upload.update(access_control_post: Fabricate(:post))
          FileStore::S3Store.any_instance.stubs(:store_upload).returns(upload.url)
          FastImage.expects(:size).returns([100, 100]).at_least_once
          # without this we get an infinite hang...
          Post.any_instance.stubs(:trigger_post_process)
          expect { Jobs::PullHotlinkedImages.new.execute(post_id: post.id) }
            .to change { Upload.count }.by(1)
          expect { Jobs::PullHotlinkedImages.new.execute(post_id: post.id) }
            .to change { Upload.count }.by(0)
        end
      end
    end
--- a/spec/requests/posts_controller_spec.rb
+++ b/spec/requests/posts_controller_spec.rb
@ -629,7 +629,7 @@ describe PostsController do
      it "will invalidate broken images cache" do
        sign_in(moderator)
-        post.custom_fields[Post::BROKEN_IMAGES] = ["https://example.com/image.jpg"].to_json
+        post.custom_fields[Post::BROKEN_IMAGES] = ["https://example.com/image.jpg"]
        post.save_custom_fields
        put "/posts/#{post.id}/rebake.json"
        post.reload