List and restore missing post uploads from S3 inventory.

This commit is contained in:
Vinoth Kannan 2019-05-04 01:16:20 +05:30
parent bfcbfd7864
commit e8fafbc123
5 changed files with 175 additions and 119 deletions

View File

@ -890,12 +890,12 @@ class Post < ActiveRecord::Base
def link_post_uploads(fragments: nil)
upload_ids = []
fragments ||= Nokogiri::HTML::fragment(self.cooked)
fragments.css("a/@href", "img/@src").each do |media|
if upload = Upload.get_from_url(media.value)
upload_ids << upload.id
end
each_upload_url(fragments: fragments) do |src, _, sha1|
upload = nil
upload = Upload.find_by(sha1: sha1) if sha1.present?
upload ||= Upload.get_from_url(src)
upload_ids << upload.id if upload.present?
end
upload_ids |= Upload.where(id: downloaded_images.values).pluck(:id)
@ -916,6 +916,84 @@ class Post < ActiveRecord::Base
{}
end
# Yields every upload-looking URL referenced by this post's HTML.
#
# Collects the unique `href` values of `<a>` and `src` values of `<img>` found
# in `fragments`, keeps those matching one of the upload path patterns, and
# yields `(src, path, sha1)` for each remaining link.
#
# @param fragments [#css, nil] parsed HTML to scan; when nil, `self.cooked` is
#   parsed with Nokogiri
# @param include_local_upload [Boolean] when true, root-relative URLs ("/...")
#   are accepted even if the store does not report them as uploaded
# @yieldparam src [String] the link (protocol-relative links get a scheme prepended)
# @yieldparam path [String] path component of the unescaped link
# @yieldparam sha1 the result of OptimizedImage.extract_sha1 / Upload.extract_sha1 for `path`
def each_upload_url(fragments: nil, include_local_upload: true)
  upload_patterns = [
    /\/uploads\/#{RailsMultisite::ConnectionManagement.current_db}\//,
    /\/original\//,
    /\/optimized\//
  ]

  fragments ||= Nokogiri::HTML::fragment(self.cooked)
  links = fragments.css("a/@href", "img/@src").map { |media| media.value }.uniq

  links.each do |src|
    next if src.blank? || upload_patterns.none? { |pattern| src =~ pattern }

    # Protocol-relative link: give it a scheme before asking the store about it.
    src = "#{SiteSetting.force_https ? "https" : "http"}:#{src}" if src.start_with?("//")
    next unless Discourse.store.has_been_uploaded?(src) || (include_local_upload && src =~ /\A\/[^\/]/i)

    path = begin
      # URI.unescape was removed in Ruby 3.0 (it was a thin wrapper around the
      # default parser's #unescape); calling the parser directly works on both
      # old and new Rubies and keeps the NoMethodError from escaping the rescue.
      URI(URI::DEFAULT_PARSER.unescape(src))&.path
    rescue URI::Error
      # Unparseable link: treat it as having no path and skip it below.
      nil
    end
    next if path.blank?

    sha1 =
      if path.include? "optimized"
        OptimizedImage.extract_sha1(path)
      else
        Upload.extract_sha1(path)
      end

    yield(src, path, sha1)
  end
end
# Scans posts that have uploads and records which referenced uploads cannot be
# matched to an Upload record.
#
# Existing PostCustomField rows named Post::MISSING_UPLOADS are deleted first.
# For every URL yielded by Post#each_upload_url that is not already linked
# through post_uploads, an Upload is looked up by sha1; if none is found the
# optional block is asked for one. A found upload is linked via PostUpload;
# otherwise the URL is recorded as missing.
#
# @param include_local_upload [Boolean] forwarded to Post#each_upload_url
# @yield [post, src, path, sha1] called when no Upload matches by sha1; should
#   return an upload id, or nil when the upload cannot be recovered
# @return [Hash] `uploads:` unique missing URLs, `post_uploads:` post id => missing
#   URLs (posts with none omitted), `count:` total missing post/URL pairs
def self.find_missing_uploads(include_local_upload: true)
  PostCustomField.where(name: Post::MISSING_UPLOADS).delete_all
  missing_uploads = []
  missing_post_uploads = {}

  Post.have_uploads.select(:id, :cooked).find_in_batches do |posts|
    ids = posts.pluck(:id)
    sha1s = Upload.joins(:post_uploads).where("post_uploads.post_id >= ? AND post_uploads.post_id <= ?", ids.min, ids.max).pluck(:sha1)

    posts.each do |post|
      # The caller's choice must reach each_upload_url, otherwise
      # `include_local_upload: false` is silently ignored.
      post.each_upload_url(include_local_upload: include_local_upload) do |src, path, sha1|
        next if sha1.present? && sha1s.include?(sha1)

        missing_post_uploads[post.id] ||= []

        # Already known to be missing: just attribute it to this post too.
        if missing_uploads.include?(src)
          missing_post_uploads[post.id] << src
          next
        end

        upload_id = nil
        upload_id = Upload.where(sha1: sha1).pluck(:id).first if sha1.present?
        # The recovery block is optional; without one the upload stays missing.
        upload_id ||= yield(post, src, path, sha1) if block_given?

        if upload_id.present?
          attributes = { post_id: post.id, upload_id: upload_id }
          PostUpload.create!(attributes) unless PostUpload.exists?(attributes)
        else
          missing_uploads << src
          missing_post_uploads[post.id] << src
        end
      end
    end
  end

  # Posts whose links were all recovered end up with an empty list; drop them.
  missing_post_uploads = missing_post_uploads.reject { |_, uploads| uploads.empty? }

  count = 0
  missing_post_uploads.each do |post_id, uploads|
    PostCustomField.create!(post_id: post_id, name: Post::MISSING_UPLOADS, value: uploads.to_json)
    count += uploads.count
  end

  { uploads: missing_uploads, post_uploads: missing_post_uploads, count: count }
end
private
def parse_quote_into_arguments(quote)

View File

@ -36,8 +36,6 @@ class S3Inventory
ActiveRecord::Base.transaction do
begin
table_name = "#{type}_inventory"
connection = ActiveRecord::Base.connection.raw_connection
connection.exec("CREATE TEMP TABLE #{table_name}(key text UNIQUE, etag text, PRIMARY KEY(etag, key))")
connection.copy_data("COPY #{table_name} FROM STDIN CSV") do
files.each do |file|
@ -54,6 +52,8 @@ class S3Inventory
WHERE #{model.table_name}.etag IS NULL
AND url ILIKE '%' || #{table_name}.key")
list_missing_post_uploads if type == "original"
uploads = (model == Upload) ? model.by_users.where("created_at < ?", inventory_date) : model
missing_uploads = uploads.joins("LEFT JOIN #{table_name} ON #{table_name}.etag = #{model.table_name}.etag").where("#{table_name}.etag is NULL")
@ -73,6 +73,35 @@ class S3Inventory
end
end
# Lists missing post uploads and tries to restore each one from the inventory.
#
# Runs Post.find_missing_uploads (local uploads excluded). For every link it
# could not match, the inventory table is searched for an "original" key
# containing the sha1; when a row is found an Upload record is created from it
# and its id is handed back so the post can be linked. The final missing count
# is stored in Discourse.stats under "missing_post_uploads" and logged.
def list_missing_post_uploads
  log "Listing missing post uploads..."

  missing = Post.find_missing_uploads(include_local_upload: false) do |_, _, _, sha1|
    next if sha1.blank?

    # NOTE(review): sha1 is interpolated straight into the SQL; this presumes
    # extract_sha1 only ever returns hex digits — confirm against its definition.
    result = connection.exec("SELECT * FROM #{table_name} WHERE key LIKE '%original/%/#{sha1}%'")

    # Nothing in the inventory for this sha1: an empty result has no row 0 to
    # read, so report "not recovered" (nil) instead of failing on it.
    next if result.count == 0

    row = result[0]
    key = row["key"]
    data = s3_helper.object(key).data

    Upload.create!(
      user_id: Discourse.system_user.id,
      original_filename: "",
      filesize: data.content_length,
      url: File.join(Discourse.store.absolute_base_url, key),
      sha1: sha1,
      etag: row["etag"]
    ).id
  end

  Discourse.stats.set("missing_post_uploads", missing[:count])
  log "#{missing[:count]} post uploads are missing."
end
def download_inventory_files_to_tmp_directory
files.each do |file|
log "Downloading inventory file '#{file[:key]}' to tmp directory..."
@ -128,6 +157,14 @@ class S3Inventory
private
# Raw connection taken from ActiveRecord::Base.connection, cached in
# @connection after the first lookup.
def connection
  return @connection if @connection

  @connection = ActiveRecord::Base.connection.raw_connection
end
# Name of the inventory table for the current `type`: the type followed by
# the "_inventory" suffix.
def table_name
  format("%s_inventory", type)
end
def files
@files ||= begin
symlink_file = unsorted_files.sort_by { |file| -file.last_modified.to_i }.first

View File

@ -390,56 +390,15 @@ task 'posts:reorder_posts', [:topic_id] => [:environment] do |_, args|
puts "", "Done.", ""
end
# PostCustomField rows whose name is Post::MISSING_UPLOADS.
def get_missing_uploads
  field_name = Post::MISSING_UPLOADS
  PostCustomField.where(name: field_name)
end
desc 'Finds missing post upload records from cooked HTML content'
task 'posts:missing_uploads' => :environment do
get_missing_uploads.delete_all
upload_patterns = [
/\/uploads\/#{RailsMultisite::ConnectionManagement.current_db}\//,
/\/original\//,
/\/optimized\//
]
missing_uploads = []
old_scheme_upload_count = 0
count = 0
Post.have_uploads.select(:id, :cooked).find_in_batches do |posts|
ids = posts.pluck(:id)
sha1s = Upload.joins(:post_uploads).where("post_uploads.post_id >= ? AND post_uploads.post_id <= ?", ids.min, ids.max).pluck(:sha1)
missing = Post.find_missing_uploads do |post, src, path, sha1|
next if sha1.present?
posts.each do |post|
missing_post_uploads = []
links = Nokogiri::HTML::fragment(post.cooked).css("a/@href", "img/@src").map { |media| media.value }.uniq
links.each do |src|
next if src.blank? || upload_patterns.none? { |pattern| src =~ pattern }
src = "#{SiteSetting.force_https ? "https" : "http"}:#{src}" if src.start_with?("//")
next unless Discourse.store.has_been_uploaded?(src) || src =~ /\A\/[^\/]/i
path = begin
URI(URI.unescape(src))&.path
rescue URI::Error
end
next if path.blank?
sha1 =
if path.include? "optimized"
OptimizedImage.extract_sha1(path)
else
Upload.extract_sha1(path)
end
if sha1.blank? || sha1s.exclude?(sha1)
upload_id = nil
if missing_uploads.exclude?(src)
if sha1.blank?
# recovering old scheme upload.
local_store = FileStore::LocalStore.new
public_path = "#{local_store.public_dir}#{path}"
@ -476,37 +435,15 @@ task 'posts:missing_uploads' => :environment do
else
old_scheme_upload_count += 1
end
else
upload_id = Upload.where(sha1: sha1).pluck(:id).first
upload_id
end
if upload_id.present?
attributes = { post_id: post.id, upload_id: upload_id }
PostUpload.create!(attributes) unless PostUpload.exists?(attributes)
else
missing_uploads << src
end
end
puts "", "#{missing[:count]} post uploads are missing.", ""
missing_post_uploads << src if upload_id.blank?
end
end
if missing_post_uploads.present?
PostCustomField.create!(post_id: post.id, name: Post::MISSING_UPLOADS, value: missing_post_uploads.to_json)
count += missing_post_uploads.count
putc "x"
else
putc "."
end
end
end
puts "", "#{count} post uploads are missing.", ""
if count > 0
puts "#{missing_uploads.count} uploads are missing."
puts "#{old_scheme_upload_count} of #{missing_uploads.count} are old scheme uploads." if old_scheme_upload_count > 0
puts "#{get_missing_uploads.count} of #{Post.count} posts are affected.", ""
if missing[:count] > 0
puts "#{missing[:uploads].count} uploads are missing."
puts "#{old_scheme_upload_count} of #{missing[:uploads].count} are old scheme uploads." if old_scheme_upload_count > 0
puts "#{missing[:post_uploads].count} of #{Post.count} posts are affected.", ""
end
end

View File

@ -75,7 +75,7 @@ describe "S3Inventory" do
inventory.backfill_etags_and_list_missing
end
expect(output).to eq("#{upload.url}\n1 of 4 uploads are missing\n")
expect(output).to eq("Listing missing post uploads...\n0 post uploads are missing.\n#{upload.url}\n1 of 4 uploads are missing\n")
expect(Discourse.stats.get("missing_s3_uploads")).to eq(1)
end

View File

@ -1261,19 +1261,23 @@ describe Post do
)
end
let(:base_url) { "#{Discourse.base_url_no_prefix}#{Discourse.base_uri}" }
let(:video_url) { "#{base_url}#{video_upload.url}" }
let(:audio_url) { "#{base_url}#{audio_upload.url}" }
let(:raw) do
<<~RAW
<a href="#{attachment_upload.url}">Link</a>
<img src="#{image_upload.url}">
<video width="100%" height="100%" controls>
<source src="http://myforum.com#{video_upload.url}">
<a href="http://myforum.com#{video_upload.url}">http://myforum.com#{video_upload.url}</a>
<source src="#{video_url}">
<a href="#{video_url}">#{video_url}</a>
</video>
<audio controls>
<source src="http://myforum.com#{audio_upload.url}">
<a href="http://myforum.com#{audio_upload.url}">http://myforum.com#{audio_upload.url}</a>
<source src="#{audio_url}">
<a href="#{audio_url}">#{audio_url}</a>
</audio>
RAW
end