#!/usr/bin/env ruby
# frozen_string_literal: true

require "bundler/inline"

gemfile(true) do
  source "https://rubygems.org"

  gem "net-http-persistent"
  gem "nokogiri"
  gem "webdrivers"
end

require "fileutils"
require "nokogiri"
require "optparse"
require "webdrivers"
require "selenium/webdriver/remote/http/persistent"
require "set"
require "yaml"

DEFAULT_OUTPUT_PATH = "/shared/import/data"

# Lazily boots a headless Chrome instance driven over a persistent HTTP
# connection. Chrome needs --no-sandbox when running inside a container.
def driver
  @driver ||= begin
    chrome_args = ["headless", "disable-gpu"]
    chrome_args << "no-sandbox" if inside_container?
    options = Selenium::WebDriver::Chrome::Options.new(args: chrome_args)
    http_client = Selenium::WebDriver::Remote::Http::Persistent.new
    Selenium::WebDriver.for(:chrome, options: options, http_client: http_client)
  end
end

# Detects whether the script is running inside a Docker container by
# inspecting the cgroup of PID 1.
def inside_container?
  File.foreach("/proc/1/cgroup") do |line|
    return true if line.include?("docker")
  end

  false
end

MAX_GET_RETRIES = 5
MAX_FIND_RETRIES = 3

# Page loads and element lookups are flaky, so they are retried a few times
# with a linearly increasing sleep in between.
def get(url)
  retries ||= 0
  driver.get(url)
rescue Net::ReadTimeout
  sleep retries
  retry if (retries += 1) < MAX_GET_RETRIES
end

def extract(css, parent_element = driver)
  retries ||= 0
  parent_element.find_elements(css: css).map { |element| yield(element) }
rescue Net::ReadTimeout, Selenium::WebDriver::Error::StaleElementReferenceError
  sleep retries
  retry if (retries += 1) < MAX_FIND_RETRIES
end

def find(css, parent_element = driver)
  retries ||= 0
  parent_element.find_element(css: css)
rescue Net::ReadTimeout, Selenium::WebDriver::Error::ElementNotVisibleError
  sleep retries
  retry if (retries += 1) < MAX_FIND_RETRIES
end

# Pages through the group's topic list 100 topics at a time via the old
# "_escaped_fragment_" (static HTML) endpoints and crawls every topic found.
def crawl_categories
  1.step(nil, 100).each do |start|
    url = "https://groups.google.com/forum/?_escaped_fragment_=categories/#{@groupname}[#{start}-#{start + 99}]"
    get(url)

    topic_urls = extract(".subject a[href*='#{@groupname}']") { |a| a["href"].sub("/d/topic/", "/forum/?_escaped_fragment_=topic/") }
    break if topic_urls.size == 0

    topic_urls.each { |topic_url| crawl_topic(topic_url) }
  end
end

# Collects the raw-message URLs of a single topic and downloads each message.
# Topics recorded in status.yml from a previous run are skipped.
def crawl_topic(url)
  if @scraped_topic_urls.include?(url)
    puts "Skipping #{url}"
    return
  end

  puts "Scraping #{url}"
  get(url)

  extract(".subject a[href*='#{@groupname}']") do |a|
    [
      a["href"].sub("/d/msg/", "/forum/message/raw?msg="),
      a["title"].empty? # an empty title hints that the message might have been deleted
    ]
  end.each { |msg_url, might_be_deleted| crawl_message(msg_url, might_be_deleted) }

  @scraped_topic_urls << url
rescue
  puts "Failed to scrape topic at #{url}"
  raise
end

# Downloads a single message in raw (.eml) format. The first message is used
# to check whether email addresses are visible; masked senders ("From: ...@")
# mean the account lacks permission to see them.
def crawl_message(url, might_be_deleted)
  get(url)

  filename = File.join(@path, "#{url[/#{@groupname}\/(.+)/, 1].sub("/", "-")}.eml")
  content = find("pre")["innerText"]

  if !@first_message_checked
    @first_message_checked = true

    if content.match?(/From:.*\.\.\.@.*/i) && !@force_import
      exit_with_error(<<~MSG)
        It looks like you do not have permissions to see email addresses. Aborting.
        Use the --force option to import anyway.
      MSG
    end
  end

  File.write(filename, content)
rescue Selenium::WebDriver::Error::NoSuchElementError
  raise unless might_be_deleted
  puts "Message might be deleted. Skipping #{url}"
rescue
  puts "Failed to scrape message at #{url}"
  raise
end

# Signs in to Google with the provided credentials and, if required, walks the
# user through 2-Step Verification (phone prompt, authenticator app or SMS).
def login
  puts "Logging in..."
  get("https://www.google.com/accounts/Login")

  sleep(1)
  email_element = wait_for_element("input[type='email']")
  exit_with_error("Failed to detect 'email' input on login page") if !email_element

  driver.action.move_to(email_element).perform
  email_element.send_keys(@email)
  email_element.send_keys("\n")

  sleep(1)
  password_element = wait_for_element("input[type='password']")
  exit_with_error("Failed to detect 'password' input on login page") if !password_element

  driver.action.move_to(password_element).perform
  password_element.send_keys(@password)
  password_element.send_keys("\n")

  sleep(1)

  if driver.current_url.include?("challenge")
    puts "", "2-Step Verification is required."
    puts "Unlock on your phone and press Enter"
    puts "or enter the code from your authenticator app"
    puts "or enter the code you received via SMS (without the G- prefix)"

    print "Enter code: "

    code = gets.chomp

    if code.empty?
      # Verification via phone?
      begin
        wait_for_url { |url| !url.include?("challenge") }
      rescue Selenium::WebDriver::Error::TimeOutError
        exit_with_error("Failed to login. Did you tap 'Yes' on your phone to allow the login?")
      end
    else
      code_element = wait_for_element("input[type='tel']")
      exit_with_error("Failed to detect 'code' input on login page") if !code_element

      code_element.send_keys(code)
      code_element.send_keys("\n")

      begin
        wait_for_url { |url| !url.include?("challenge") }
      rescue Selenium::WebDriver::Error::TimeOutError
        exit_with_error("Failed to login. Wrong code?")
      end
    end
  end

  sleep(1)
  user_element = wait_for_element("a[aria-label*='#{@email}']")
  exit_with_error("Failed to login") if !user_element
end

def wait_for_url
  wait = Selenium::WebDriver::Wait.new(timeout: 5)
  wait.until { yield(driver.current_url) }
end

def wait_for_element(css)
  wait = Selenium::WebDriver::Wait.new(timeout: 5)
  wait.until { driver.find_element(css: css).displayed? }
  find(css)
rescue Selenium::WebDriver::Error::TimeOutError
  nil
end

def exit_with_error(*messages)
  STDERR.puts messages
  exit 1
end

# Crawls the whole group. Scraped topic URLs are persisted to status.yml so an
# interrupted run can be resumed without downloading everything again.
def crawl
  start_time = Time.now
  status_filename = File.join(@path, "status.yml")
  @scraped_topic_urls = File.exist?(status_filename) ? YAML.load_file(status_filename) : Set.new

  login

  begin
    crawl_categories
  ensure
    File.write(status_filename, @scraped_topic_urls.to_yaml)
  end

  elapsed = Time.now - start_time
  puts "", "", "Done (%02dh %02dmin %02dsec)" % [elapsed / 3600, elapsed / 60 % 60, elapsed % 60]
end

def parse_arguments
  puts ""

  @force_import = false

  parser = OptionParser.new do |opts|
    opts.banner = "Usage: google_groups.rb [options]"

    opts.on("-e", "--email EMAIL", "email address of group admin or manager") { |v| @email = v }
    opts.on("-p", "--password PASSWORD", "password of group admin or manager") { |v| @password = v }
    opts.on("-g", "--groupname GROUPNAME") { |v| @groupname = v }
    opts.on("--path PATH", "output path for emails") { |v| @path = v }
    opts.on("-f", "--force", "force import when user isn't allowed to see email addresses") { @force_import = true }
    opts.on("-h", "--help") do
      puts opts
      exit
    end
  end

  begin
    parser.parse!
  rescue OptionParser::ParseError => e
    exit_with_error(e.message, "", parser)
  end

  mandatory = [:email, :password, :groupname]
  missing = mandatory.select { |name| instance_variable_get("@#{name}").nil? }

  if missing.any?
    exit_with_error("Missing arguments: #{missing.join(', ')}", "", parser)
  end

  @path = File.join(DEFAULT_OUTPUT_PATH, @groupname) if @path.nil?
  FileUtils.mkpath(@path)
end

parse_arguments
crawl
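
# Example invocation (the email, password, group name, and output path below
# are placeholders; substitute your own values):
#
#   ruby google_groups.rb -e "admin@example.com" -p "secret" -g "my-google-group" --path /tmp/google-groups-import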