discourse/app/jobs/regular/pull_hotlinked_images.rb

require_dependency 'url_helper'
require_dependency 'file_helper'
require_dependency 'upload_creator'

module Jobs

  class PullHotlinkedImages < Jobs::Base

    sidekiq_options queue: 'low'

    LARGE_IMAGES = "large_images".freeze

    def initialize
      @max_size = SiteSetting.max_image_size_kb.kilobytes
    end

    def download(src)
      downloaded = nil

      begin
        retries ||= 3

        downloaded = FileHelper.download(
          src,
          max_file_size: @max_size,
          tmp_file_name: "discourse-hotlinked",
          follow_redirect: true
        )
      rescue
        if (retries -= 1) > 0
          sleep 1
          retry
        end
      end

      downloaded
    end

    def execute(args)
      return unless SiteSetting.download_remote_images_to_local?

      post_id = args[:post_id]
      raise Discourse::InvalidParameters.new(:post_id) unless post_id.present?

      post = Post.find_by(id: post_id)
      return unless post.present?

      raw = post.raw.dup
      start_raw = raw.dup
      downloaded_urls = {}
      large_images = post.custom_fields[LARGE_IMAGES].presence || []
      broken_images, new_large_images = [], []

      extract_images_from(post.cooked).each do |image|
        src = original_src = image['src']
        if src.start_with?("//")
          src = "#{SiteSetting.force_https ? "https" : "http"}:#{src}"
        end

        if is_valid_image_url(src)
          begin
            # have we already downloaded that file?
            unless downloaded_urls.include?(src) || large_images.include?(src) || broken_images.include?(src)
              if hotlinked = download(src)
                if File.size(hotlinked.path) <= @max_size
                  filename = File.basename(URI.parse(src).path)
                  filename << File.extname(hotlinked.path) unless filename["."]
                  upload = UploadCreator.new(hotlinked, filename, origin: src).create_for(post.user_id)
                  if upload.persisted?
                    downloaded_urls[src] = upload.url
                  else
                    log(:info, "Failed to pull hotlinked image for post: #{post_id}: #{src} - #{upload.errors.full_messages.join("\n")}")
                  end
                else
                  large_images << original_src
                  new_large_images << original_src
                end
              else
                broken_images << original_src
              end
            end
            # have we successfully downloaded that file?
            if downloaded_urls[src].present?
              url = downloaded_urls[src]
              escaped_src = Regexp.escape(original_src)
              # there are 6 ways to insert an image in a post
              # HTML tag - <img src="http://...">
              raw.gsub!(/src=["']#{escaped_src}["']/i, "src='#{url}'")
              # BBCode tag - [img]http://...[/img]
              raw.gsub!(/\[img\]#{escaped_src}\[\/img\]/i, "[img]#{url}[/img]")
              # Markdown linked image - [![alt](http://...)](http://...)
              raw.gsub!(/\[!\[([^\]]*)\]\(#{escaped_src}\)\]/) { "[<img src='#{url}' alt='#{$1}'>]" }
              # Markdown inline - ![alt](http://...)
              raw.gsub!(/!\[([^\]]*)\]\(#{escaped_src}\)/) { "![#{$1}](#{url})" }
              # Markdown inline - ![](http://... "image title")
              raw.gsub!(/!\[\]\(#{escaped_src} "([^\]]*)"\)/) { "![](#{url})" }
              # Markdown inline - ![alt](http://... "image title")
              raw.gsub!(/!\[([^\]]*)\]\(#{escaped_src} "([^\]]*)"\)/) { "![](#{url})" }
              # Markdown reference - [x]: http://
              raw.gsub!(/\[([^\]]+)\]:\s?#{escaped_src}/) { "[#{$1}]: #{url}" }
              # Direct link
              raw.gsub!(/^#{escaped_src}(\s?)$/) { "<img src='#{url}'>#{$1}" }
            end
          rescue => e
            log(:error, "Failed to pull hotlinked image (#{src}) post: #{post_id}\n" + e.message + "\n" + e.backtrace.join("\n"))
          end
        end

      end

      post.custom_fields[LARGE_IMAGES] = large_images
      post.save!
      post.reload

      if start_raw == post.raw && raw != post.raw
        changes = { raw: raw, edit_reason: I18n.t("upload.edit_reason") }
        # we never want that job to bump the topic
        options = { bypass_bump: true }
        post.revise(Discourse.system_user, changes, options)
      elsif downloaded_urls.present? || new_large_images.present?
        post.trigger_post_process(true)
      elsif broken_images.present?
        start_html = post.cooked
        doc = Nokogiri::HTML::fragment(start_html)
        images = doc.css("img[src]") - doc.css("img.avatar")
        images.each do |tag|
          src = tag['src']
          if broken_images.include?(src)
            tag.name = 'span'
            tag.set_attribute('class', 'broken-image fa fa-chain-broken')
            tag.set_attribute('title', I18n.t('post.image_placeholder.broken'))
            tag.remove_attribute('src')
            tag.remove_attribute('width')
            tag.remove_attribute('height')
          end
        end
        if start_html == post.cooked && doc.to_html != post.cooked
          post.update_column(:cooked, doc.to_html)
          post.publish_change_to_clients! :revised
        end
      end
    end

    def extract_images_from(html)
      doc = Nokogiri::HTML::fragment(html)
      doc.css("img[src]") - doc.css("img.avatar")
    end

    def is_valid_image_url(src)
      # make sure we actually have a url
      return false unless src.present?
      # we don't want to pull uploaded images
      return false if Discourse.store.has_been_uploaded?(src)
      # we don't want to pull relative images
      return false if src =~ /\A\/[^\/]/i

      # parse the src
      begin
        uri = URI.parse(src)
      rescue URI::InvalidURIError
        return false
      end

      hostname = uri.hostname
      return false unless hostname

      # we don't want to pull images hosted on the CDN (if we use one)
      return false if Discourse.asset_host.present? && URI.parse(Discourse.asset_host).hostname == hostname
      return false if SiteSetting.Upload.s3_cdn_url.present? && URI.parse(SiteSetting.Upload.s3_cdn_url).hostname == hostname
      # we don't want to pull images hosted on the main domain
      return false if URI.parse(Discourse.base_url_no_prefix).hostname == hostname
      # check the domains blacklist
      SiteSetting.should_download_images?(src)
    end

    def log(log_level, message)
      Rails.logger.public_send(
        log_level,
        "#{RailsMultisite::ConnectionManagement.current_db}: #{message}"
      )
    end

  end

end
FIX markdown hotlinked images were not properly pulled 2013-11-20 20:10:08 +08:00			`require_dependency 'url_helper'`
FEATURE: support email attachments 2014-04-15 04:55:57 +08:00			`require_dependency 'file_helper'`
REFACTOR: upload workflow creation into UploadCreator - Automatically convert large-ish PNG/BMP to JPEG - Updated fast_image to latest version 2017-05-11 06:16:57 +08:00			`require_dependency 'upload_creator'`
FIX markdown hotlinked images were not properly pulled 2013-11-20 20:10:08 +08:00
pull hotlinked images 2013-11-06 02:04:47 +08:00			`module Jobs`

			`class PullHotlinkedImages < Jobs::Base`
FEATURE: prioritize sidekiq jobs This commit introduces 3 queues for sidekiq "critical" for urgent jobs (weighted at 4x weight) "default" for standard jobs(weighted at 2x weight) "low" for less important jobs "critical jobs" Reset Password emails has been seperated to its own job Heartbeat which is required to keep sidekiq running Test email which needs to return real quick "low priority jobs" Notify mailing list Pull hotlinked images Update gravatar "default" All the rest Note: for people running sidekiq from command line use bin/sidekiq -q critical,4 -q default,2 -q low 2016-04-07 10:56:43 +08:00
			`sidekiq_options queue: 'low'`

NEW: large image placeholder added in cooked html (#5291) 2017-11-15 18:30:47 +08:00			`LARGE_IMAGES = "large_images".freeze`

pull hotlinked images 2013-11-06 02:04:47 +08:00			`def initialize`
do not pull hotlinked images when max_image_size_kb == 0 2013-11-14 00:30:48 +08:00			`@max_size = SiteSetting.max_image_size_kb.kilobytes`
pull hotlinked images 2013-11-06 02:04:47 +08:00			`end`

let's try 3 times to download images locally 2017-10-12 05:11:44 +08:00			`def download(src)`
			`downloaded = nil`

			`begin`
			`retries \|\|= 3`

			`downloaded = FileHelper.download(`
			`src,`
			`max_file_size: @max_size,`
			`tmp_file_name: "discourse-hotlinked",`
			`follow_redirect: true`
			`)`
			`rescue`
			`if (retries -= 1) > 0`
			`sleep 1`
			`retry`
			`end`
			`end`

			`downloaded`
			`end`

pull hotlinked images 2013-11-06 02:04:47 +08:00			`def execute(args)`
add download_remote_images_to_local site setting 2013-11-15 22:22:18 +08:00			`return unless SiteSetting.download_remote_images_to_local?`
pull hotlinked images 2013-11-06 02:04:47 +08:00
			`post_id = args[:post_id]`
			`raise Discourse::InvalidParameters.new(:post_id) unless post_id.present?`

Perform the where(...).first to find_by(...) refactoring. This refactoring was automated using the command: bundle exec "ruby refactorings/where_dot_first_to_find_by/app.rb" 2014-05-06 21:41:59 +08:00			`post = Post.find_by(id: post_id)`
pull hotlinked images 2013-11-06 02:04:47 +08:00			`return unless post.present?`

			`raw = post.raw.dup`
Backoff-retry for hotlinked image pull + some style fixes 2014-04-22 05:08:17 +08:00			`start_raw = raw.dup`
pull hotlinked images 2013-11-06 02:04:47 +08:00			`downloaded_urls = {}`
NEW: large image placeholder added in cooked html (#5291) 2017-11-15 18:30:47 +08:00			`large_images = post.custom_fields[LARGE_IMAGES].presence \|\| []`
			`broken_images, new_large_images = [], []`
pull hotlinked images 2013-11-06 02:04:47 +08:00
			`extract_images_from(post.cooked).each do \|image\|`
FIX: Handle img src starting with "//" in pull_hotlinked_images job 2017-01-16 18:50:07 +08:00			`src = original_src = image['src']`
use force_https site setting when adding scheme for downloading schemaless images locally 2017-10-12 06:06:24 +08:00			`if src.start_with?("//")`
			`src = "#{SiteSetting.force_https ? "https" : "http"}:#{src}"`
			`end`
pull hotlinked images 2013-11-06 02:04:47 +08:00
			`if is_valid_image_url(src)`
			`begin`
			`# have we already downloaded that file?`
NEW: large image placeholder added in cooked html (#5291) 2017-11-15 18:30:47 +08:00			`unless downloaded_urls.include?(src) \|\| large_images.include?(src) \|\| broken_images.include?(src)`
let's try 3 times to download images locally 2017-10-12 05:11:44 +08:00			`if hotlinked = download(src)`
FIX: Use File.size instead of IO.size 2015-08-18 00:57:28 +08:00			`if File.size(hotlinked.path) <= @max_size`
FIX: there's no need to try to download relative images 2014-09-27 00:27:10 +08:00			`filename = File.basename(URI.parse(src).path)`
FIX: pull hotlinked images even when they have no extension 2017-06-13 19:27:05 +08:00			`filename << File.extname(hotlinked.path) unless filename["."]`
REFACTOR: upload workflow creation into UploadCreator - Automatically convert large-ish PNG/BMP to JPEG - Updated fast_image to latest version 2017-05-11 06:16:57 +08:00			`upload = UploadCreator.new(hotlinked, filename, origin: src).create_for(post.user_id)`
FIX: pull hotlinked images even when they have no extension 2017-06-13 19:27:05 +08:00			`if upload.persisted?`
			`downloaded_urls[src] = upload.url`
			`else`
let's not generate an error when logging errors... 2017-10-19 05:14:13 +08:00			`log(:info, "Failed to pull hotlinked image for post: #{post_id}: #{src} - #{upload.errors.full_messages.join("\n")}")`
FIX: pull hotlinked images even when they have no extension 2017-06-13 19:27:05 +08:00			`end`
FIX: there's no need to try to download relative images 2014-09-27 00:27:10 +08:00			`else`
FEATURE: Add placeholders to broken and large image files (#5113) 2017-09-01 22:26:13 +08:00			`large_images << original_src`
NEW: large image placeholder added in cooked html (#5291) 2017-11-15 18:30:47 +08:00			`new_large_images << original_src`
FIX: there's no need to try to download relative images 2014-09-27 00:27:10 +08:00			`end`
pull hotlinked images 2013-11-06 02:04:47 +08:00			`else`
FEATURE: Add placeholders to broken and large image files (#5113) 2017-09-01 22:26:13 +08:00			`broken_images << original_src`
pull hotlinked images 2013-11-06 02:04:47 +08:00			`end`
			`end`
Correct few spelling in the comments 2013-12-21 15:19:22 +08:00			`# have we successfully downloaded that file?`
pull hotlinked images 2013-11-06 02:04:47 +08:00			`if downloaded_urls[src].present?`
			`url = downloaded_urls[src]`
FIX: Handle img src starting with "//" in pull_hotlinked_images job 2017-01-16 18:50:07 +08:00			`escaped_src = Regexp.escape(original_src)`
FIX markdown hotlinked images were not properly pulled 2013-11-20 20:10:08 +08:00			`# there are 6 ways to insert an image in a post`
pull hotlinked images 2013-11-06 02:04:47 +08:00			`# HTML tag - <img src="http://...">`
			`raw.gsub!(/src=["']#{escaped_src}["']/i, "src='#{url}'")`
			`# BBCode tag - [img]http://...[/img]`
			`raw.gsub!(/\[img\]#{escaped_src}\[\/img\]/i, "[img]#{url}[/img]")`
FIX markdown hotlinked images were not properly pulled 2013-11-20 20:10:08 +08:00			`# Markdown linked image - [![alt](http://...)](http://...)`
			`raw.gsub!(/\[!\[([^\]]*)\]\(#{escaped_src}\)\]/) { "[<img src='#{url}' alt='#{$1}'>]" }`
pull hotlinked images 2013-11-06 02:04:47 +08:00			`# Markdown inline - ![alt](http://...)`
			`raw.gsub!(/!\[([^\]]*)\]\(#{escaped_src}\)/) { "![#{$1}](#{url})" }`
FIX: properly insert images in markdown inline format 2016-09-01 14:25:40 +08:00			`# Markdown inline - ![](http://... "image title")`
			`raw.gsub!(/!\[\]\(#{escaped_src} "([^\]]*)"\)/) { "![](#{url})" }`
FIX: properly insert images in markdown inline format (take 2) 2016-09-01 20:26:39 +08:00			`# Markdown inline - ![alt](http://... "image title")`
			`raw.gsub!(/!\[([^\]])\]\(#{escaped_src} "([^\]])"\)/) { "![](#{url})" }`
pull hotlinked images 2013-11-06 02:04:47 +08:00			`# Markdown reference - [x]: http://`
FIX: PullHotlinkedImages was messing with URL when using Markdown references 2015-11-09 23:37:51 +08:00			`raw.gsub!(/\[([^\]]+)\]:\s?#{escaped_src}/) { "[#{$1}]: #{url}" }`
pull hotlinked images 2013-11-06 02:04:47 +08:00			`# Direct link`
FIX: keep whitespaces when replacing direct link to external images with local images 2016-02-15 19:34:45 +08:00			`raw.gsub!(/^#{escaped_src}(\s?)$/) { "<img src='#{url}'>#{$1}" }`
pull hotlinked images 2013-11-06 02:04:47 +08:00			`end`
			`rescue => e`
add more logs to pull hotlinked images 2017-09-28 07:00:13 +08:00			`log(:error, "Failed to pull hotlinked image (#{src}) post: #{post_id}\n" + e.message + "\n" + e.backtrace.join("\n"))`
pull hotlinked images 2013-11-06 02:04:47 +08:00			`end`
			`end`

			`end`

NEW: large image placeholder added in cooked html (#5291) 2017-11-15 18:30:47 +08:00			`post.custom_fields[LARGE_IMAGES] = large_images`
			`post.save!`
Backoff-retry for hotlinked image pull + some style fixes 2014-04-22 05:08:17 +08:00			`post.reload`
NEW: large image placeholder added in cooked html (#5291) 2017-11-15 18:30:47 +08:00
FIX: prevent infinite loop in PullHotlinkedImages job 2015-10-31 05:46:46 +08:00			`if start_raw == post.raw && raw != post.raw`
LOTS of changes to properly handle post/topic revisions FIX: history revision can now properly be hidden FIX: PostRevision serializer is now entirely dynamic to properly handle hidden revisions FIX: default history modal to "side by side" view on mobile FIX: properly hiden which revision has been hidden UX: inline category/user/wiki/post_type changes with the revision details FEATURE: new '/posts/:post_id/revisions/latest' endpoint to retrieve latest revision UX: do not show the hide/show revision button on mobile (no room for them) UX: remove CSS transitions on the buttons in the history modal FIX: PostRevisor now handles all the changes that might create new revisions FIX: PostRevision.ensure_consistency! was wrong due to off by 1 mistake... refactored topic's callbacks for better readability extracted 'PostRevisionGuardian' 2014-10-28 05:06:43 +08:00			`changes = { raw: raw, edit_reason: I18n.t("upload.edit_reason") }`
			`# we never want that job to bump the topic`
			`options = { bypass_bump: true }`
			`post.revise(Discourse.system_user, changes, options)`
NEW: large image placeholder added in cooked html (#5291) 2017-11-15 18:30:47 +08:00			`elsif downloaded_urls.present? \|\| new_large_images.present?`
FEATURE: pull onebox images 2017-06-02 17:39:06 +08:00			`post.trigger_post_process(true)`
NEW: large image placeholder added in cooked html (#5291) 2017-11-15 18:30:47 +08:00			`elsif broken_images.present?`
FEATURE: Add placeholders to broken and large image files (#5113) 2017-09-01 22:26:13 +08:00			`start_html = post.cooked`
			`doc = Nokogiri::HTML::fragment(start_html)`
			`images = doc.css("img[src]") - doc.css("img.avatar")`
			`images.each do \|tag\|`
			`src = tag['src']`
			`if broken_images.include?(src)`
			`tag.name = 'span'`
			`tag.set_attribute('class', 'broken-image fa fa-chain-broken')`
UX: Placeholder images color changed & tootip added 2017-09-13 17:46:38 +08:00			`tag.set_attribute('title', I18n.t('post.image_placeholder.broken'))`
FEATURE: Add placeholders to broken and large image files (#5113) 2017-09-01 22:26:13 +08:00			`tag.remove_attribute('src')`
UX: Placeholder images color changed & tootip added 2017-09-13 17:46:38 +08:00			`tag.remove_attribute('width')`
			`tag.remove_attribute('height')`
FEATURE: Add placeholders to broken and large image files (#5113) 2017-09-01 22:26:13 +08:00			`end`
			`end`
			`if start_html == post.cooked && doc.to_html != post.cooked`
			`post.update_column(:cooked, doc.to_html)`
			`post.publish_change_to_clients! :revised`
			`end`
pull hotlinked images 2013-11-06 02:04:47 +08:00			`end`
			`end`

			`def extract_images_from(html)`
			`doc = Nokogiri::HTML::fragment(html)`
FEATURE: pull onebox images 2017-06-02 17:39:06 +08:00			`doc.css("img[src]") - doc.css("img.avatar")`
pull hotlinked images 2013-11-06 02:04:47 +08:00			`end`

			`def is_valid_image_url(src)`
BUGFIX: make sure we do not try to pull images from the CDN 2014-05-08 01:49:16 +08:00			`# make sure we actually have a url`
			`return false unless src.present?`
			`# we don't want to pull uploaded images`
			`return false if Discourse.store.has_been_uploaded?(src)`
FIX: there's no need to try to download relative images 2014-09-27 00:27:10 +08:00			`# we don't want to pull relative images`
			`return false if src =~ /\A\/[^\/]/i`
FIX: `Jobs::PullHotlinkedImages#is_valid_image_src` returns true for a generic string. 2017-07-06 16:55:28 +08:00
BUGFIX: make sure we do not try to pull images from the CDN 2014-05-08 01:49:16 +08:00			`# parse the src`
			`begin`
			`uri = URI.parse(src)`
			`rescue URI::InvalidURIError`
			`return false`
			`end`
FIX: `Jobs::PullHotlinkedImages#is_valid_image_src` returns true for a generic string. 2017-07-06 16:55:28 +08:00
			`hostname = uri.hostname`
			`return false unless hostname`

BUGFIX: make sure we do not try to pull images from the CDN 2014-05-08 01:49:16 +08:00			`# we don't want to pull images hosted on the CDN (if we use one)`
FIX: `Jobs::PullHotlinkedImages#is_valid_image_src` returns true for a generic string. 2017-07-06 16:55:28 +08:00			`return false if Discourse.asset_host.present? && URI.parse(Discourse.asset_host).hostname == hostname`
FEATURE: allow specifying s3 config via globals This refactors handling of s3 so it can be specified via GlobalSetting This means that in a multisite environment you can configure s3 uploads without actual sites knowing credentials in s3 It is a critical setting for situations where assets are mirrored to s3. 2017-10-06 13:20:01 +08:00			`return false if SiteSetting.Upload.s3_cdn_url.present? && URI.parse(SiteSetting.Upload.s3_cdn_url).hostname == hostname`
BUGFIX: make sure we do not try to pull images from the CDN 2014-05-08 01:49:16 +08:00			`# we don't want to pull images hosted on the main domain`
FIX: `Jobs::PullHotlinkedImages#is_valid_image_src` returns true for a generic string. 2017-07-06 16:55:28 +08:00			`return false if URI.parse(Discourse.base_url_no_prefix).hostname == hostname`
BUGFIX: make sure we do not try to pull images from the CDN 2014-05-08 01:49:16 +08:00			`# check the domains blacklist`
Add site setting for domains to never download images from 2014-04-22 04:59:53 +08:00			`SiteSetting.should_download_images?(src)`
pull hotlinked images 2013-11-06 02:04:47 +08:00			`end`

Log site name when logging to Logster in `Jobs::PullHotlinkedImages`. 2017-07-05 09:34:24 +08:00			`def log(log_level, message)`
			`Rails.logger.public_send(`
			`log_level,`
			`"#{RailsMultisite::ConnectionManagement.current_db}: #{message}"`
			`)`
			`end`

pull hotlinked images 2013-11-06 02:04:47 +08:00			`end`

			`end`