From 187204705323b650d61ed25862eb1a0c733aa63c Mon Sep 17 00:00:00 2001 From: Gerhard Schlager Date: Sat, 4 May 2024 22:48:16 +0200 Subject: [PATCH] DEV: Uploads import script can download files (#26816) Uploads import script can download files --- lib/upload_markdown.rb | 14 +-- script/bulk_import/uploads_importer.rb | 113 ++++++++++++++++++++++-- script/bulk_import/uploads_importer.yml | 3 + 3 files changed, 118 insertions(+), 12 deletions(-) diff --git a/lib/upload_markdown.rb b/lib/upload_markdown.rb index eae2319d625..5fdecb14b52 100644 --- a/lib/upload_markdown.rb +++ b/lib/upload_markdown.rb @@ -7,16 +7,17 @@ class UploadMarkdown def to_markdown(display_name: nil) if FileHelper.is_supported_image?(@upload.original_filename) - image_markdown + image_markdown(display_name: display_name) elsif FileHelper.is_supported_playable_media?(@upload.original_filename) - playable_media_markdown + playable_media_markdown(display_name: display_name) else attachment_markdown(display_name: display_name) end end - def image_markdown - "![#{@upload.original_filename}|#{@upload.width}x#{@upload.height}](#{@upload.short_url})" + def image_markdown(display_name: nil) + display_name ||= @upload.original_filename + "![#{display_name}|#{@upload.width}x#{@upload.height}](#{@upload.short_url})" end def attachment_markdown(display_name: nil, with_filesize: true) @@ -26,7 +27,7 @@ class UploadMarkdown "[#{display_name}|attachment](#{@upload.short_url})#{human_filesize}" end - def playable_media_markdown + def playable_media_markdown(display_name: nil) type = if FileHelper.is_supported_audio?(@upload.original_filename) "audio" @@ -34,6 +35,7 @@ class UploadMarkdown "video" end return attachment_markdown if !type - "![#{@upload.original_filename}|#{type}](#{@upload.short_url})" + display_name ||= @upload.original_filename + "![#{display_name}|#{type}](#{@upload.short_url})" end end diff --git a/script/bulk_import/uploads_importer.rb b/script/bulk_import/uploads_importer.rb index 
fe738f13099..da03c690b0f 100644 --- a/script/bulk_import/uploads_importer.rb +++ b/script/bulk_import/uploads_importer.rb @@ -20,6 +20,10 @@ rescue LoadError end module BulkImport + MAX_FILE_SIZE = 1.gigabyte + + UploadMetadata = Struct.new(:original_filename, :origin_url, :description) + class UploadsImporter TRANSACTION_SIZE = 1000 QUEUE_SIZE = 1000 @@ -109,7 +113,7 @@ module BulkImport producer_thread = Thread.new do - query("SELECT * FROM uploads", @source_db).tap do |result_set| + query("SELECT * FROM uploads ORDER BY id", @source_db).tap do |result_set| result_set.each { |row| queue << row unless output_existing_ids.include?(row["id"]) } result_set.close end @@ -131,7 +135,7 @@ module BulkImport puts "", "Failed to create upload: #{params[:id]} (#{error_message})", "" end - @output_db.execute(<<~SQL, params) + insert(<<~SQL, params) INSERT INTO uploads (id, upload, markdown, skip_reason) VALUES (:id, :upload, :markdown, :skip_reason) SQL @@ -158,14 +162,23 @@ module BulkImport begin data_file = nil path = nil + metadata = + UploadMetadata.new( + original_filename: row["display_filename"] || row["filename"], + description: row["description"].presence, + ) if row["data"].present? data_file = Tempfile.new("discourse-upload", binmode: true) data_file.write(row["data"]) data_file.rewind path = data_file.path + elsif row["url"].present? 
+ path, metadata.original_filename = download_file(url: row["url"], id: row["id"]) + metadata.origin_url = row["url"] + next if !path else - relative_path = row["relative_path"] + relative_path = row["relative_path"] || "" file_exists = false @root_paths.each do |root_path| @@ -198,8 +211,9 @@ module BulkImport begin UploadCreator.new( file, - row["display_filename"] || row["filename"], + metadata.original_filename, type: row["type"], + origin: metadata.origin_url, ).create_for(Discourse::SYSTEM_USER_ID) rescue StandardError => e error_message = e.message @@ -228,7 +242,8 @@ module BulkImport status_queue << { id: row["id"], upload: upload.attributes.to_json, - markdown: UploadMarkdown.new(upload).to_markdown, + markdown: + UploadMarkdown.new(upload).to_markdown(display_name: metadata.description), skip_reason: nil, } break @@ -269,6 +284,85 @@ module BulkImport status_thread.join end + def download_file(url:, id:, retry_count: 0) + path = download_cache_path(id) + original_filename = nil + + if File.exist?(path) && (original_filename = get_original_filename(id)) + return path, original_filename + end + + fd = FinalDestination.new(url) + file = nil + + fd.get do |response, chunk, uri| + if file.nil? + check_response!(response, uri) + original_filename = extract_filename_from_response(response, uri) + file = File.open(path, "wb") + end + + file.write(chunk) + + if file.size > MAX_FILE_SIZE + file.close + file.unlink + file = nil + throw :done + end + end + + if file + file.close + insert( + "INSERT INTO downloads (id, original_filename) VALUES (?, ?)", + [id, original_filename], + ) + return path, original_filename + end + + nil + end + + def download_cache_path(id) + id = id.gsub("/", "_").gsub("=", "-") + File.join(@settings[:download_cache_path], id) + end + + def get_original_filename(id) + @output_db.get_first_value("SELECT original_filename FROM downloads WHERE id = ?", id) + end + + def check_response!(response, uri) + if uri.blank? 
+ code = response.code.to_i + + if code >= 400 + raise "#{code} Error" + else + throw :done + end + end + end + + def extract_filename_from_response(response, uri) + filename = + if (header = response.header["Content-Disposition"].presence) + disposition_filename = + header[/filename\*=UTF-8''(\S+)\b/i, 1] || header[/filename=(?:"(.+)"|[^\s;]+)/i, 1] + disposition_filename.present? ? URI.decode_www_form_component(disposition_filename) : nil + end + + filename = File.basename(uri.path).presence || "file" if filename.blank? + + if File.extname(filename).blank? && response.content_type.present? + ext = MiniMime.lookup_by_content_type(response.content_type)&.extension + filename = "#{filename}.#{ext}" if ext.present? + end + + filename + end + def fix_missing queue = SizedQueue.new(QUEUE_SIZE) consumer_threads = [] @@ -419,7 +513,7 @@ module BulkImport case params.delete(:status) when :ok - @output_db.execute(<<~SQL, params) + insert(<<~SQL, params) INSERT INTO optimized_images (id, optimized_images) VALUES (:id, :optimized_images) SQL @@ -604,6 +698,13 @@ module BulkImport optimized_images JSON_TEXT ) SQL + + @output_db.execute(<<~SQL) + CREATE TABLE IF NOT EXISTS downloads ( + id TEXT PRIMARY KEY NOT NULL, + original_filename TEXT NOT NULL + ) + SQL end def insert(sql, bind_vars = []) diff --git a/script/bulk_import/uploads_importer.yml b/script/bulk_import/uploads_importer.yml index 4b7122a3bc5..ac3ca5e7d35 100644 --- a/script/bulk_import/uploads_importer.yml +++ b/script/bulk_import/uploads_importer.yml @@ -5,6 +5,9 @@ root_paths: - "/path/to/your/files" - "/path/to/more/files" +# Files that are downloaded from URLs are cached in this directory. +download_cache_path: "/path/to/downloaded/files" + # The number of threads to use for processing uploads is calculated as: # thread_count = [number of cores] * [thread_count_factor] # The thread count will be doubled if uploads are stored on S3 because there's a higher latency.