discourse/app/jobs/regular/pull_hotlinked_images.rb

require_dependency 'url_helper'
require_dependency 'file_helper'
require_dependency 'upload_creator'

module Jobs

  class PullHotlinkedImages < Jobs::Base

    sidekiq_options queue: 'low'

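    # Rough usage sketch (the exact call site and extra options may differ):
    # the job is enqueued with the id of the post whose hotlinked images
    # should be downloaded, e.g.
    #
    #   Jobs.enqueue(:pull_hotlinked_images, post_id: post.id)
    #
    # The job wrapper then runs #execute(post_id: ...) on the 'low' queue
    # declared above.
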
    def initialize
      # maximum size of the file in bytes
      @max_size = SiteSetting.max_image_size_kb.kilobytes
    end

    def execute(args)
      return unless SiteSetting.download_remote_images_to_local?

      post_id = args[:post_id]
      raise Discourse::InvalidParameters.new(:post_id) unless post_id.present?

      post = Post.find_by(id: post_id)
      return unless post.present?

      raw = post.raw.dup
      start_raw = raw.dup

      downloaded_urls = {}
      broken_images, large_images = [], []

      extract_images_from(post.cooked).each do |image|
        src = original_src = image['src']
        src = "http:#{src}" if src.start_with?("//")

        if is_valid_image_url(src)
          hotlinked = nil

          begin
            # have we already downloaded that file?
            unless downloaded_urls.include?(src)
              begin
                hotlinked = FileHelper.download(
                  src,
                  max_file_size: @max_size,
                  tmp_file_name: "discourse-hotlinked",
                  follow_redirect: true
                )
              rescue Discourse::InvalidParameters
                log(:error, "InvalidParameters while downloading hotlinked image (#{src}) for post: #{post_id}")
              rescue => e
                log(:error, "Failed to download image #{e}")
              end

              if hotlinked
                if File.size(hotlinked.path) <= @max_size
                  filename = File.basename(URI.parse(src).path)
                  filename << File.extname(hotlinked.path) unless filename["."]
                  upload = UploadCreator.new(hotlinked, filename, origin: src).create_for(post.user_id)
                  if upload.persisted?
                    downloaded_urls[src] = upload.url
                  else
                    log(:error, "Failed to pull hotlinked image for post: #{post_id}: #{src} - #{upload.errors.join("\n")}")
                  end
                else
                  log(:error, "Failed to pull hotlinked image for post: #{post_id}: #{src} - Image is bigger than #{@max_size}")
                  large_images << original_src
                end
              else
                log(:error, "There was an error while downloading '#{src}' locally for post: #{post_id}")
                broken_images << original_src
              end
            end

            # have we successfully downloaded that file?
            if downloaded_urls[src].present?
              url = downloaded_urls[src]
              escaped_src = Regexp.escape(original_src)
              # there are 6 ways to insert an image in a post
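              #
              # Rough before/after sketch (the upload path below is made up for
              # illustration; real paths come from UploadCreator):
              #
              #   original_src = "http://example.com/kitten.png"
              #   url          = "/uploads/default/original/1X/abc123.png"
              #
              #   "![a cat](http://example.com/kitten.png)"  => "![a cat](/uploads/default/original/1X/abc123.png)"
              #   "[img]http://example.com/kitten.png[/img]" => "[img]/uploads/default/original/1X/abc123.png[/img]"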
              # HTML tag - <img src="http://...">
              raw.gsub!(/src=["']#{escaped_src}["']/i, "src='#{url}'")
              # BBCode tag - [img]http://...[/img]
              raw.gsub!(/\[img\]#{escaped_src}\[\/img\]/i, "[img]#{url}[/img]")
              # Markdown linked image - [![alt](http://...)](http://...)
              raw.gsub!(/\[!\[([^\]]*)\]\(#{escaped_src}\)\]/) { "[<img src='#{url}' alt='#{$1}'>]" }
              # Markdown inline - ![alt](http://...)
              raw.gsub!(/!\[([^\]]*)\]\(#{escaped_src}\)/) { "![#{$1}](#{url})" }
              # Markdown inline - ![](http://... "image title")
              raw.gsub!(/!\[\]\(#{escaped_src} "([^\]]*)"\)/) { "![](#{url})" }
              # Markdown inline - ![alt](http://... "image title")
              raw.gsub!(/!\[([^\]]*)\]\(#{escaped_src} "([^\]]*)"\)/) { "![](#{url})" }
              # Markdown reference - [x]: http://
              raw.gsub!(/\[([^\]]+)\]:\s?#{escaped_src}/) { "[#{$1}]: #{url}" }
              # Direct link
              raw.gsub!(/^#{escaped_src}(\s?)$/) { "<img src='#{url}'>#{$1}" }
            end
          rescue => e
            log(:error, "Failed to pull hotlinked image (#{src}) post: #{post_id}\n" + e.message + "\n" + e.backtrace.join("\n"))
          end
        end
      end

      post.reload

      if start_raw == post.raw && raw != post.raw
        changes = { raw: raw, edit_reason: I18n.t("upload.edit_reason") }
        # we never want this job to bump the topic
        options = { bypass_bump: true }
        post.revise(Discourse.system_user, changes, options)
      elsif downloaded_urls.present?
        post.trigger_post_process(true)
      elsif broken_images.present? || large_images.present?
        start_html = post.cooked
        doc = Nokogiri::HTML::fragment(start_html)
        images = doc.css("img[src]") - doc.css("img.avatar")

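        # Rough sketch of what the loop below produces (i18n titles are
        # resolved at runtime; other attributes on the original <img> may
        # survive). A broken hotlink becomes:
        #
        #   <span class="broken-image fa fa-chain-broken" title="..."></span>
        #
        # and an image over the size limit becomes:
        #
        #   <a href="(original src)" target="_blank" title="...">
        #     <span class="large-image fa fa-picture-o"></span>
        #   </a>
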
        images.each do |tag|
          src = tag['src']

          if broken_images.include?(src)
            tag.name = 'span'
            tag.set_attribute('class', 'broken-image fa fa-chain-broken')
            tag.set_attribute('title', I18n.t('post.image_placeholder.broken'))
            tag.remove_attribute('src')
            tag.remove_attribute('width')
            tag.remove_attribute('height')
          elsif large_images.include?(src)
            tag.name = 'a'
            tag.set_attribute('href', src)
            tag.set_attribute('target', '_blank')
            tag.set_attribute('title', I18n.t('post.image_placeholder.large'))
            tag.remove_attribute('src')
            tag.remove_attribute('width')
            tag.remove_attribute('height')
            tag.inner_html = '<span class="large-image fa fa-picture-o"></span>'

            parent = tag.parent
            if parent.name == 'a'
              parent.add_next_sibling(tag)
              parent.add_next_sibling('<br>')
              parent.content = parent["href"]
            end
          end
        end

        if start_html == post.cooked && doc.to_html != post.cooked
          post.update_column(:cooked, doc.to_html)
          post.publish_change_to_clients! :revised
        end
      end
    end

    def extract_images_from(html)
      doc = Nokogiri::HTML::fragment(html)
      doc.css("img[src]") - doc.css("img.avatar")
    end

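    # Illustrative calls (hypothetical URLs; the last result depends on site
    # settings, so this is a sketch of the checks below, not guaranteed output):
    #
    #   is_valid_image_url(nil)                           # => false (blank)
    #   is_valid_image_url("/images/local.png")           # => false (relative path)
    #   is_valid_image_url("#{Discourse.base_url}/a.png") # => false (hosted on this site)
    #   is_valid_image_url("http://example.com/cat.png")  # delegates to SiteSetting.should_download_images?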
    def is_valid_image_url(src)
      # make sure we actually have a url
      return false unless src.present?
      # we don't want to pull uploaded images
      return false if Discourse.store.has_been_uploaded?(src)
      # we don't want to pull relative images
      return false if src =~ /\A\/[^\/]/i

      # parse the src
      begin
        uri = URI.parse(src)
      rescue URI::InvalidURIError
        return false
      end

      hostname = uri.hostname
      return false unless hostname

      # we don't want to pull images hosted on the CDN (if we use one)
      return false if Discourse.asset_host.present? && URI.parse(Discourse.asset_host).hostname == hostname
      return false if SiteSetting.s3_cdn_url.present? && URI.parse(SiteSetting.s3_cdn_url).hostname == hostname
      # we don't want to pull images hosted on the main domain
      return false if URI.parse(Discourse.base_url_no_prefix).hostname == hostname

      # check the domains blacklist
      SiteSetting.should_download_images?(src)
    end

    def log(log_level, message)
      Rails.logger.public_send(
        log_level,
        "#{RailsMultisite::ConnectionManagement.current_db}: #{message}"
      )
    end

  end

end
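
# Manual invocation sketch (e.g. from a Rails console while debugging; assumes
# `post` references a post with hotlinked images and the
# download_remote_images_to_local site setting is enabled):
#
#   Jobs::PullHotlinkedImages.new.execute(post_id: post.id)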