DEV: Uploads import script can download files (#26816)

Gerhard Schlager 2024-05-04 22:48:16 +02:00 committed by GitHub
parent f230767722
commit 1872047053
3 changed files with 118 additions and 12 deletions


@@ -7,16 +7,17 @@ class UploadMarkdown
   def to_markdown(display_name: nil)
     if FileHelper.is_supported_image?(@upload.original_filename)
-      image_markdown
+      image_markdown(display_name: display_name)
     elsif FileHelper.is_supported_playable_media?(@upload.original_filename)
-      playable_media_markdown
+      playable_media_markdown(display_name: display_name)
     else
       attachment_markdown(display_name: display_name)
     end
   end

-  def image_markdown
-    "![#{@upload.original_filename}|#{@upload.width}x#{@upload.height}](#{@upload.short_url})"
+  def image_markdown(display_name: nil)
+    display_name ||= @upload.original_filename
+    "![#{display_name}|#{@upload.width}x#{@upload.height}](#{@upload.short_url})"
   end

   def attachment_markdown(display_name: nil, with_filesize: true)
@@ -26,7 +27,7 @@ class UploadMarkdown
     "[#{display_name}|attachment](#{@upload.short_url})#{human_filesize}"
   end

-  def playable_media_markdown
+  def playable_media_markdown(display_name: nil)
     type =
       if FileHelper.is_supported_audio?(@upload.original_filename)
         "audio"
@@ -34,6 +35,7 @@ class UploadMarkdown
         "video"
       end
     return attachment_markdown if !type
-    "![#{@upload.original_filename}|#{type}](#{@upload.short_url})"
+    display_name ||= @upload.original_filename
+    "![#{display_name}|#{type}](#{@upload.short_url})"
   end
 end
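
A note on the resulting markdown: display_name now flows through to images and playable media too, not just attachments. A minimal sketch with an invented upload (original_filename "IMG_1234.jpg", 800x600, short_url "upload://abc123.jpg"):

    UploadMarkdown.new(upload).to_markdown(display_name: "Vacation photo")
    # => "![Vacation photo|800x600](upload://abc123.jpg)"

    UploadMarkdown.new(upload).to_markdown
    # => "![IMG_1234.jpg|800x600](upload://abc123.jpg)"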


@@ -20,6 +20,10 @@ rescue LoadError
 end

 module BulkImport
+  MAX_FILE_SIZE = 1.gigabyte
+
+  UploadMetadata = Struct.new(:original_filename, :origin_url, :description)
+
   class UploadsImporter
     TRANSACTION_SIZE = 1000
     QUEUE_SIZE = 1000
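
UploadMetadata is a bare Struct, but the importer below instantiates it with keyword arguments; that works on Ruby 3.2+ (older Rubies would need keyword_init: true). For illustration, with invented values:

    metadata = UploadMetadata.new(original_filename: "report.pdf", description: "Q1 report")
    metadata.origin_url = "https://example.com/report.pdf" # hypothetical source URL
    metadata.original_filename # => "report.pdf"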
@@ -109,7 +113,7 @@ module BulkImport
       producer_thread =
         Thread.new do
-          query("SELECT * FROM uploads", @source_db).tap do |result_set|
+          query("SELECT * FROM uploads ORDER BY id", @source_db).tap do |result_set|
             result_set.each { |row| queue << row unless output_existing_ids.include?(row["id"]) }
             result_set.close
           end
         end
@ -131,7 +135,7 @@ module BulkImport
puts "", "Failed to create upload: #{params[:id]} (#{error_message})", "" puts "", "Failed to create upload: #{params[:id]} (#{error_message})", ""
end end
@output_db.execute(<<~SQL, params) insert(<<~SQL, params)
INSERT INTO uploads (id, upload, markdown, skip_reason) INSERT INTO uploads (id, upload, markdown, skip_reason)
VALUES (:id, :upload, :markdown, :skip_reason) VALUES (:id, :upload, :markdown, :skip_reason)
SQL SQL
@@ -158,14 +162,23 @@ module BulkImport
             begin
               data_file = nil
               path = nil
+              metadata =
+                UploadMetadata.new(
+                  original_filename: row["display_filename"] || row["filename"],
+                  description: row["description"].presence,
+                )

               if row["data"].present?
                 data_file = Tempfile.new("discourse-upload", binmode: true)
                 data_file.write(row["data"])
                 data_file.rewind
                 path = data_file.path
+              elsif row["url"].present?
+                path, metadata.original_filename = download_file(url: row["url"], id: row["id"])
+                metadata.origin_url = row["url"]
+                next if !path
               else
-                relative_path = row["relative_path"]
+                relative_path = row["relative_path"] || ""
                 file_exists = false

                 @root_paths.each do |root_path|
@@ -198,8 +211,9 @@ module BulkImport
               begin
                 UploadCreator.new(
                   file,
-                  row["display_filename"] || row["filename"],
+                  metadata.original_filename,
                   type: row["type"],
+                  origin: metadata.origin_url,
                 ).create_for(Discourse::SYSTEM_USER_ID)
               rescue StandardError => e
                 error_message = e.message
@ -228,7 +242,8 @@ module BulkImport
status_queue << { status_queue << {
id: row["id"], id: row["id"],
upload: upload.attributes.to_json, upload: upload.attributes.to_json,
markdown: UploadMarkdown.new(upload).to_markdown, markdown:
UploadMarkdown.new(upload).to_markdown(display_name: metadata.description),
skip_reason: nil, skip_reason: nil,
} }
break break
@@ -269,6 +284,85 @@ module BulkImport
       status_thread.join
     end

+    def download_file(url:, id:, retry_count: 0)
+      path = download_cache_path(id)
+      original_filename = nil
+
+      if File.exist?(path) && (original_filename = get_original_filename(id))
+        return path, original_filename
+      end
+
+      fd = FinalDestination.new(url)
+      file = nil
+
+      fd.get do |response, chunk, uri|
+        if file.nil?
+          check_response!(response, uri)
+          original_filename = extract_filename_from_response(response, uri)
+          file = File.open(path, "wb")
+        end
+
+        file.write(chunk)
+
+        if file.size > MAX_FILE_SIZE
+          file.close
+          file.unlink
+          file = nil
+          throw :done
+        end
+      end
+
+      if file
+        file.close
+        insert(
+          "INSERT INTO downloads (id, original_filename) VALUES (?, ?)",
+          [id, original_filename],
+        )
+        return path, original_filename
+      end
+
+      nil
+    end
+
+    def download_cache_path(id)
+      id = id.gsub("/", "_").gsub("=", "-")
+      File.join(@settings[:download_cache_path], id)
+    end
+
+    def get_original_filename(id)
+      @output_db.get_first_value("SELECT original_filename FROM downloads WHERE id = ?", id)
+    end
+
+    def check_response!(response, uri)
+      if uri.blank?
+        code = response.code.to_i
+
+        if code >= 400
+          raise "#{code} Error"
+        else
+          throw :done
+        end
+      end
+    end
+
+    def extract_filename_from_response(response, uri)
+      filename =
+        if (header = response.header["Content-Disposition"].presence)
+          disposition_filename =
+            header[/filename\*=UTF-8''(\S+)\b/i, 1] || header[/filename=(?:"(.+)"|[^\s;]+)/i, 1]
+          disposition_filename.present? ? URI.decode_www_form_component(disposition_filename) : nil
+        end
+
+      filename = File.basename(uri.path).presence || "file" if filename.blank?
+
+      if File.extname(filename).blank? && response.content_type.present?
+        ext = MiniMime.lookup_by_content_type(response.content_type)&.extension
+        filename = "#{filename}.#{ext}" if ext.present?
+      end
+
+      filename
+    end
+
     def fix_missing
       queue = SizedQueue.new(QUEUE_SIZE)
       consumer_threads = []
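
For context on the new helpers: download_file streams via FinalDestination into a cache file keyed by a sanitized id and records the detected filename in the new downloads table, so re-runs return the cached copy without re-fetching. The two regexes in extract_filename_from_response cover RFC 5987 encoded filenames as well as plain quoted ones; a quick illustration with invented headers:

    header = "attachment; filename*=UTF-8''na%C3%AFve%20plan.txt"
    URI.decode_www_form_component(header[/filename\*=UTF-8''(\S+)\b/i, 1])
    # => "naïve plan.txt"

    header = 'attachment; filename="annual report.pdf"'
    header[/filename=(?:"(.+)"|[^\s;]+)/i, 1]
    # => "annual report.pdf"

If neither header form matches, the code falls back to the URI's basename (or "file") and appends an extension derived from the Content-Type via MiniMime.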
@ -419,7 +513,7 @@ module BulkImport
case params.delete(:status) case params.delete(:status)
when :ok when :ok
@output_db.execute(<<~SQL, params) insert(<<~SQL, params)
INSERT INTO optimized_images (id, optimized_images) INSERT INTO optimized_images (id, optimized_images)
VALUES (:id, :optimized_images) VALUES (:id, :optimized_images)
SQL SQL
@@ -604,6 +698,13 @@ module BulkImport
           optimized_images JSON_TEXT
         )
       SQL
+
+      @output_db.execute(<<~SQL)
+        CREATE TABLE IF NOT EXISTS downloads (
+          id TEXT PRIMARY KEY NOT NULL,
+          original_filename TEXT NOT NULL
+        )
+      SQL
     end

     def insert(sql, bind_vars = [])


@@ -5,6 +5,9 @@ root_paths:
   - "/path/to/your/files"
   - "/path/to/more/files"

+# Files that are downloaded from URLs are cached in this directory.
+download_cache_path: "/path/to/downloaded/files"
+
 # The number of threads to use for processing uploads is calculated as:
 #   thread_count = [number of cores] * [thread_count_factor]
 # The thread count will be doubled if uploads are stored on S3 because there's a higher latency.
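
For example, with 8 cores and a thread_count_factor of 1.5, the importer would use 12 threads, or 24 when uploads are stored on S3.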