From 49400337b6598e633b54a39de46b31fd3df840ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9gis=20Hanol?= Date: Mon, 5 Mar 2018 10:24:42 +0100 Subject: [PATCH] Improve IPBoard3 importer - fix encoding issues - close topics - don't import queued & not approved topics - don't import deleted posts --- script/import_scripts/ipboard3.rb | 81 ++++++++++++++++++++++--------- 1 file changed, 58 insertions(+), 23 deletions(-) diff --git a/script/import_scripts/ipboard3.rb b/script/import_scripts/ipboard3.rb index f3aaa859a56..5a9812f85d5 100644 --- a/script/import_scripts/ipboard3.rb +++ b/script/import_scripts/ipboard3.rb @@ -16,6 +16,8 @@ class ImportScripts::IPBoard3 < ImportScripts::Base password: ENV["DB_PW"], database: ENV["DB_NAME"], ) + + @client.query("SET character_set_results = binary") end def execute @@ -23,6 +25,7 @@ class ImportScripts::IPBoard3 < ImportScripts::Base import_categories import_topics import_posts + close_topics import_personal_topics import_personal_posts end @@ -59,11 +62,10 @@ class ImportScripts::IPBoard3 < ImportScripts::Base break if users.empty? last_user_id = users[-1]["id"] - user_ids = users.map { |u| u["id"] } - - next if all_records_exist?(:users, user_ids) create_users(users, total: total_users, offset: offset) do |u| + next if user_id_from_imported_user_id(u["id"]) + %W{name email title pp_about_me}.each { |k| u[k]&.encode!("utf-8", "utf-8", invalid: :replace, undef: :replace, replace: "") } next if u["name"].blank? && !Email.is_valid?(u["email"]) { @@ -111,19 +113,21 @@ class ImportScripts::IPBoard3 < ImportScripts::Base child_categories = categories.select { |c| c["parent_id"] != -1 } create_categories(parent_categories) do |c| + next if category_id_from_imported_category_id(c["id"]) { id: c["id"], - name: c["name"], + name: c["name"].encode("utf-8", "utf-8"), description: clean_up(c["description"]), position: c["position"], } end create_categories(child_categories) do |c| + next if category_id_from_imported_category_id(c["id"]) { id: c["id"], parent_category_id: category_id_from_imported_category_id(c["parent_id"]), - name: c["name"], + name: c["name"].encode("utf-8", "utf-8"), description: clean_up(c["description"]), position: c["position"], } @@ -133,13 +137,18 @@ class ImportScripts::IPBoard3 < ImportScripts::Base def import_topics puts "", "importing topics..." + @closed_topic_ids = [] + last_topic_id = -1 total_topics = mysql_query(<<~SQL SELECT COUNT(*) count FROM topics JOIN posts ON tid = topic_id WHERE tdelete_time = 0 + AND pdelete_time = 0 AND new_topic = 1 + AND approved = 1 + AND queued = 0 SQL ).first["count"] @@ -147,6 +156,7 @@ class ImportScripts::IPBoard3 < ImportScripts::Base topics = mysql_query(<<~SQL SELECT tid id , title + , state , starter_id , start_date , views @@ -156,7 +166,10 @@ class ImportScripts::IPBoard3 < ImportScripts::Base FROM topics JOIN posts ON tid = topic_id WHERE tdelete_time = 0 + AND pdelete_time = 0 AND new_topic = 1 + AND approved = 1 + AND queued = 0 AND tid > #{last_topic_id} ORDER BY tid LIMIT #{BATCH_SIZE} @@ -166,17 +179,16 @@ class ImportScripts::IPBoard3 < ImportScripts::Base break if topics.empty? last_topic_id = topics[-1]["id"] - topic_ids = topics.map { |t| "t-#{t["id"]}" } - - next if all_records_exist?(:posts, topic_ids) create_posts(topics, total: total_topics, offset: offset) do |t| + @closed_topic_ids << "t-#{t["id"]}" if t["state"] != "open" + next if post_id_from_imported_post_id("t-#{t["id"]}") created_at = Time.zone.at(t["start_date"]) user_id = user_id_from_imported_user_id(t["starter_id"]) || -1 { id: "t-#{t["id"]}", - title: CGI.unescapeHTML(t["title"]), + title: CGI.unescapeHTML(t["title"].encode("utf-8", "utf-8")), user_id: user_id, created_at: created_at, views: t["views"], @@ -192,7 +204,14 @@ class ImportScripts::IPBoard3 < ImportScripts::Base puts "", "importing posts..." last_post_id = -1 - total_posts = mysql_query("SELECT COUNT(*) count FROM posts WHERE new_topic = 0").first["count"] + total_posts = mysql_query(<<~SQL + SELECT COUNT(*) count + FROM posts + WHERE new_topic = 0 + AND pdelete_time = 0 + AND queued = 0 + SQL + ).first["count"] batches(BATCH_SIZE) do |offset| posts = mysql_query(<<~SQL @@ -201,9 +220,10 @@ class ImportScripts::IPBoard3 < ImportScripts::Base , post_date , post , topic_id - , pdelete_time FROM posts WHERE new_topic = 0 + AND pdelete_time = 0 + AND queued = 0 AND pid > #{last_post_id} ORDER BY pid LIMIT #{BATCH_SIZE} @@ -213,11 +233,9 @@ class ImportScripts::IPBoard3 < ImportScripts::Base break if posts.empty? last_post_id = posts[-1]["id"] - post_ids = posts.map { |p| p["id"] } - - next if all_records_exist?(:posts, post_ids) create_posts(posts, total: total_posts, offset: offset) do |p| + next if post_id_from_imported_post_id(p["id"]) next unless t = topic_lookup_from_imported_post_id("t-#{p["topic_id"]}") user_id = user_id_from_imported_user_id(p["author_id"]) || -1 @@ -227,12 +245,31 @@ class ImportScripts::IPBoard3 < ImportScripts::Base created_at: Time.zone.at(p["post_date"]), raw: clean_up(p["post"], user_id), topic_id: t[:topic_id], - deleted_at: p["pdelete_time"] > 0 ? Time.zone.at(p["pdelete_time"]) : nil, } end end end + def close_topics + puts "", "closing #{@closed_topic_ids.size} topics..." + + sql = <<~SQL + WITH closed_topic_ids AS ( + SELECT t.id AS topic_id + FROM post_custom_fields pcf + JOIN posts p ON p.id = pcf.post_id + JOIN topics t ON t.id = p.topic_id + WHERE pcf.name = 'import_id' + AND pcf.value IN (?) + ) + UPDATE topics + SET closed = true + WHERE id IN (SELECT topic_id FROM closed_topic_ids) + SQL + + Topic.exec_sql(sql, @closed_topic_ids) + end + def import_personal_topics puts "", "import personal topics..." @@ -268,11 +305,9 @@ class ImportScripts::IPBoard3 < ImportScripts::Base break if personal_topics.empty? last_personal_topic_id = personal_topics[-1]["id"] - personal_topic_ids = personal_topics.map { |pt| "pt-#{pt["id"]}" } - - next if all_records_exist?(:posts, personal_topic_ids) create_posts(personal_topics, total: total_personal_topics, offset: offset) do |pt| + next if post_id_from_imported_post_id("pt-#{pt["id"]}") user_id = user_id_from_imported_user_id(pt["mt_starter_id"]) || -1 user_ids = [pt["mt_to_member_id"]] + pt["mt_invited_members"].scan(/i:(\d+);/).flatten.map(&:to_i) @@ -284,7 +319,7 @@ class ImportScripts::IPBoard3 < ImportScripts::Base archetype: Archetype.private_message, id: "pt-#{pt["id"]}", created_at: Time.zone.at(pt["mt_date"]), - title: CGI.unescapeHTML(pt["mt_title"]), + title: CGI.unescapeHTML(pt["mt_title"].encode("utf-8", "utf-8")), user_id: user_id, target_usernames: User.where(id: user_ids).pluck(:username), raw: clean_up(pt["msg_post"], user_id), @@ -317,11 +352,9 @@ class ImportScripts::IPBoard3 < ImportScripts::Base break if personal_posts.empty? last_personal_post_id = personal_posts[-1]["id"] - personal_post_ids = personal_posts.map { |pp| "pp-#{pp["id"]}" } - - next if all_records_exist?(:posts, personal_post_ids) create_posts(personal_posts, total: total_personal_posts, offset: offset) do |pp| + next if post_id_from_imported_post_id("pp-#{pp["id"]}") next unless t = topic_lookup_from_imported_post_id("pt-#{pp["msg_topic_id"]}") user_id = user_id_from_imported_user_id(pp["msg_author_id"]) || -1 @@ -337,6 +370,8 @@ class ImportScripts::IPBoard3 < ImportScripts::Base end def clean_up(raw, user_id = -1) + raw.encode!("utf-8", "utf-8", invalid: :replace, undef: :replace, replace: "") + raw.gsub!(/<(.+)> <\/\1>/, "\n\n") doc = Nokogiri::HTML.fragment(raw) @@ -344,7 +379,7 @@ class ImportScripts::IPBoard3 < ImportScripts::Base doc.css("blockquote.ipsBlockquote").each do |bq| post_id = post_id_from_imported_post_id(bq["data-cid"]) if post = Post.find_by(id: post_id) - bq.replace %{

[quote="#{post.user.username},post:#{post.post_number},topic:#{post.topic_id}"]\n#{bq.inner_html}\n[/quote]

} + bq.replace %{
[quote="#{post.user.username},post:#{post.post_number},topic:#{post.topic_id}"]\n#{bq.inner_html}\n[/quote]
} end end