mirror of
https://github.com/discourse/discourse.git
synced 2025-01-18 17:02:45 +08:00
Improve Google Groups scraper
* Better error detection during login phase
* Experimental support for 2FA and SMS codes
* Detect missing permissions to scrape email addresses
This commit is contained in:
parent
40ac895ef7
commit
2349ba3bc4
|
@ -5,6 +5,7 @@ require "bundler/inline"
|
||||||
gemfile(true) do
|
gemfile(true) do
|
||||||
source "https://rubygems.org"
|
source "https://rubygems.org"
|
||||||
|
|
||||||
|
gem "net-http-persistent"
|
||||||
gem "nokogiri"
|
gem "nokogiri"
|
||||||
gem "selenium-webdriver"
|
gem "selenium-webdriver"
|
||||||
end
|
end
|
||||||
|
@ -13,6 +14,7 @@ require "fileutils"
|
||||||
require "nokogiri"
|
require "nokogiri"
|
||||||
require "optparse"
|
require "optparse"
|
||||||
require "selenium-webdriver"
|
require "selenium-webdriver"
|
||||||
|
require 'selenium/webdriver/remote/http/persistent'
|
||||||
require "set"
|
require "set"
|
||||||
require "yaml"
|
require "yaml"
|
||||||
|
|
||||||
|
@ -23,7 +25,8 @@ def driver
|
||||||
chrome_args = ["headless", "disable-gpu"]
|
chrome_args = ["headless", "disable-gpu"]
|
||||||
chrome_args << "no-sandbox" << "disable-dev-shm-usage" if inside_container?
|
chrome_args << "no-sandbox" << "disable-dev-shm-usage" if inside_container?
|
||||||
options = Selenium::WebDriver::Chrome::Options.new(args: chrome_args)
|
options = Selenium::WebDriver::Chrome::Options.new(args: chrome_args)
|
||||||
Selenium::WebDriver.for(:chrome, options: options)
|
http_client = Selenium::WebDriver::Remote::Http::Persistent.new
|
||||||
|
Selenium::WebDriver.for(:chrome, options: options, http_client: http_client)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -108,6 +111,17 @@ def crawl_message(url, might_be_deleted)
|
||||||
filename = File.join(@path, "#{url[/#{@groupname}\/(.+)/, 1].sub("/", "-")}.eml")
|
filename = File.join(@path, "#{url[/#{@groupname}\/(.+)/, 1].sub("/", "-")}.eml")
|
||||||
content = find("pre")["innerText"]
|
content = find("pre")["innerText"]
|
||||||
|
|
||||||
|
if !@first_message_checked
|
||||||
|
@first_message_checked = true
|
||||||
|
|
||||||
|
if content.match?(/From:.*\.\.\.@.*/i) && !@force_import
|
||||||
|
exit_with_error(<<~MSG)
|
||||||
|
It looks like you do not have permissions to see email addresses. Aborting.
|
||||||
|
Use the --force option to import anyway.
|
||||||
|
MSG
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
File.write(filename, content)
|
File.write(filename, content)
|
||||||
rescue Selenium::WebDriver::Error::NoSuchElementError
|
rescue Selenium::WebDriver::Error::NoSuchElementError
|
||||||
raise unless might_be_deleted
|
raise unless might_be_deleted
|
||||||
|
@ -121,19 +135,78 @@ def login
|
||||||
puts "Logging in..."
|
puts "Logging in..."
|
||||||
get("https://www.google.com/accounts/Login")
|
get("https://www.google.com/accounts/Login")
|
||||||
|
|
||||||
sleep(0.5)
|
sleep(1)
|
||||||
|
email_element = wait_for_element("input[type='email']")
|
||||||
|
exit_with_error("Failed to detect 'email' input on login page") if !email_element
|
||||||
|
|
||||||
email_element = find("input[type='email']")
|
|
||||||
driver.action.move_to(email_element)
|
driver.action.move_to(email_element)
|
||||||
email_element.send_keys(@email)
|
email_element.send_keys(@email)
|
||||||
email_element.send_keys("\n")
|
email_element.send_keys("\n")
|
||||||
|
|
||||||
sleep(2)
|
sleep(1)
|
||||||
|
password_element = wait_for_element("input[type='password']")
|
||||||
|
exit_with_error("Failed to detect 'password' input on login page") if !password_element
|
||||||
|
|
||||||
password_element = find("input[type='password']")
|
|
||||||
driver.action.move_to(password_element)
|
driver.action.move_to(password_element)
|
||||||
password_element.send_keys(@password)
|
password_element.send_keys(@password)
|
||||||
password_element.send_keys("\n")
|
password_element.send_keys("\n")
|
||||||
|
|
||||||
|
sleep(1)
|
||||||
|
|
||||||
|
if driver.current_url.include?("challenge")
|
||||||
|
puts "", "2-Step Verification is required."
|
||||||
|
puts "Unlock on your phone and press Enter"
|
||||||
|
puts "or enter the code from your authenticator app"
|
||||||
|
puts "or enter the code you received via SMS (without the G- prefix)"
|
||||||
|
|
||||||
|
print "Enter code: "
|
||||||
|
|
||||||
|
code = gets.chomp
|
||||||
|
|
||||||
|
if code.empty?
|
||||||
|
# Verification via phone?
|
||||||
|
begin
|
||||||
|
wait_for_url { |url| !url.include?("challenge") }
|
||||||
|
rescue Selenium::WebDriver::Error::TimeOutError
|
||||||
|
exit_with_error("Failed to login. Did you tap 'Yes' on your phone to allow the login?")
|
||||||
|
end
|
||||||
|
else
|
||||||
|
code_element = wait_for_element("input[type='tel']")
|
||||||
|
exit_with_error("Failed to detect 'code' input on login page") if !code_element
|
||||||
|
|
||||||
|
code_element.send_keys(code)
|
||||||
|
code_element.send_keys("\n")
|
||||||
|
|
||||||
|
begin
|
||||||
|
wait_for_url { |url| !url.include?("challenge") }
|
||||||
|
rescue Selenium::WebDriver::Error::TimeOutError
|
||||||
|
exit_with_error("Failed to login. Wrong code?")
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
sleep(1)
|
||||||
|
user_element = wait_for_element("a[aria-label*='#{@email}']")
|
||||||
|
exit_with_error("Failed to login") if !user_element
|
||||||
|
end
|
||||||
|
|
||||||
|
def wait_for_url
|
||||||
|
wait = Selenium::WebDriver::Wait.new(timeout: 5)
|
||||||
|
wait.until { yield(driver.current_url) }
|
||||||
|
end
|
||||||
|
|
||||||
|
def wait_for_element(css)
|
||||||
|
wait = Selenium::WebDriver::Wait.new(timeout: 5)
|
||||||
|
wait.until { driver.find_element(css: css).displayed? }
|
||||||
|
find(css)
|
||||||
|
rescue Selenium::WebDriver::Error::TimeOutError
|
||||||
|
nil
|
||||||
|
end
|
||||||
|
|
||||||
|
def exit_with_error(message)
|
||||||
|
puts driver.current_url
|
||||||
|
STDERR.puts message
|
||||||
|
exit 1
|
||||||
end
|
end
|
||||||
|
|
||||||
def crawl
|
def crawl
|
||||||
|
@ -156,6 +229,8 @@ end
|
||||||
def parse_arguments
|
def parse_arguments
|
||||||
puts ""
|
puts ""
|
||||||
|
|
||||||
|
@force_import = false
|
||||||
|
|
||||||
parser = OptionParser.new do |opts|
|
parser = OptionParser.new do |opts|
|
||||||
opts.banner = "Usage: google_groups.rb [options]"
|
opts.banner = "Usage: google_groups.rb [options]"
|
||||||
|
|
||||||
|
@ -163,6 +238,7 @@ def parse_arguments
|
||||||
opts.on("-p", "--password PASSWORD", "password of group admin or manager") { |v| @password = v }
|
opts.on("-p", "--password PASSWORD", "password of group admin or manager") { |v| @password = v }
|
||||||
opts.on("-g", "--groupname GROUPNAME") { |v| @groupname = v }
|
opts.on("-g", "--groupname GROUPNAME") { |v| @groupname = v }
|
||||||
opts.on("--path PATH", "output path for emails") { |v| @path = v }
|
opts.on("--path PATH", "output path for emails") { |v| @path = v }
|
||||||
|
opts.on("-f", "--force", "force import when user isn't allowed to see email addresses") { @force_import = true }
|
||||||
opts.on("-h", "--help") do
|
opts.on("-h", "--help") do
|
||||||
puts opts
|
puts opts
|
||||||
exit
|
exit
|
||||||
|
|
|
@ -38,7 +38,7 @@ module ImportScripts::Mbox
|
||||||
private
|
private
|
||||||
|
|
||||||
METADATA_FILENAME = 'metadata.yml'.freeze
|
METADATA_FILENAME = 'metadata.yml'.freeze
|
||||||
IGNORED_FILE_EXTENSIONS = ['.dbindex', '.dbnames', '.digest', '.subjects', 'status.yml']
|
IGNORED_FILE_EXTENSIONS = ['.dbindex', '.dbnames', '.digest', '.subjects', '.yml']
|
||||||
|
|
||||||
def index_category(directory)
|
def index_category(directory)
|
||||||
metadata_file = File.join(directory, METADATA_FILENAME)
|
metadata_file = File.join(directory, METADATA_FILENAME)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user