#!/usr/bin/env ruby

require "bundler/inline"

gemfile(true) do
  source "https://rubygems.org"

  gem "nokogiri"
  gem "selenium-webdriver"
end

require "fileutils"
require "nokogiri"
require "optparse"
require "selenium-webdriver"
require "set"
require "yaml"

DEFAULT_OUTPUT_PATH = "/shared/import/data"

def driver
  @driver ||= begin
    chrome_args = ["headless", "disable-gpu"]
    chrome_args << "no-sandbox" << "disable-dev-shm-usage" if inside_container?
    options = Selenium::WebDriver::Chrome::Options.new(args: chrome_args)
    Selenium::WebDriver.for(:chrome, options: options)
  end
end

def inside_container?
  File.foreach("/proc/1/cgroup") do |line|
    return true if line.include?("docker")
  end

  false
end

MAX_GET_RETRIES = 5
MAX_FIND_RETRIES = 3

def get(url)
  begin
    retries ||= 0
    driver.get(url)
  rescue Net::ReadTimeout
    sleep retries
    retry if (retries += 1) < MAX_GET_RETRIES
  end
end

def extract(css, parent_element = driver)
  begin
    retries ||= 0
    parent_element.find_elements(css: css).map { |element| yield(element) }
  rescue Net::ReadTimeout, Selenium::WebDriver::Error::StaleElementReferenceError
    sleep retries
    retry if (retries += 1) < MAX_FIND_RETRIES
  end
end

def find(css, parent_element = driver)
  begin
    retries ||= 0
    parent_element.find_element(css: css)
  rescue Net::ReadTimeout, Selenium::WebDriver::Error::ElementNotVisibleError
    sleep retries
    retry if (retries += 1) < MAX_FIND_RETRIES
  end
end

def crawl_categories
  1.step(nil, 100).each do |start|
    url = "https://groups.google.com/forum/?_escaped_fragment_=categories/#{@groupname}[#{start}-#{start + 99}]"
    get(url)

    topic_urls = extract(".subject a[href*='#{@groupname}']") { |a| a["href"].sub("/d/topic/", "/forum/?_escaped_fragment_=topic/") }
    break if topic_urls.size == 0

    topic_urls.each { |topic_url| crawl_topic(topic_url) }
  end
end

def crawl_topic(url)
  if @scraped_topic_urls.include?(url)
    puts "Skipping #{url}"
    return
  end

  puts "Scraping #{url}"
  get(url)

  extract(".subject a[href*='#{@groupname}']") do |a|
    [
      a["href"].sub("/d/msg/", "/forum/message/raw?msg="),
      a["title"].empty?
    ]
  end.each { |msg_url, might_be_deleted| crawl_message(msg_url, might_be_deleted) }

  @scraped_topic_urls << url
rescue
  puts "Failed to scrape topic at #{url}"
  raise
end

def crawl_message(url, might_be_deleted)
  get(url)

  filename = File.join(@path, "#{url[/#{@groupname}\/(.+)/, 1].sub("/", "-")}.eml")
  content = find("pre")["innerText"]

  File.write(filename, content)
rescue Selenium::WebDriver::Error::NoSuchElementError
  raise unless might_be_deleted
  puts "Message might be deleted. Skipping #{url}"
rescue
  puts "Failed to scrape message at #{url}"
  raise
end

def login
  puts "Logging in..."
  get("https://www.google.com/accounts/Login")

  sleep(0.5)

  email_element = find("input[type='email']")
  driver.action.move_to(email_element)
  email_element.send_keys(@email)
  email_element.send_keys("\n")

  sleep(2)

  password_element = find("input[type='password']")
  driver.action.move_to(password_element)
  password_element.send_keys(@password)
  password_element.send_keys("\n")
end

def crawl
  start_time = Time.now
  status_filename = File.join(@path, "status.yml")
  @scraped_topic_urls = File.exists?(status_filename) ? YAML.load_file(status_filename) : Set.new

  login

  begin
    crawl_categories
  ensure
    File.write(status_filename, @scraped_topic_urls.to_yaml)
  end

  elapsed = Time.now - start_time
  puts "", "", "Done (%02dh %02dmin %02dsec)" % [elapsed / 3600, elapsed / 60 % 60, elapsed % 60]
end

def parse_arguments
  puts ""

  parser = OptionParser.new do |opts|
    opts.banner = "Usage: google_groups.rb [options]"

    opts.on("-e", "--email EMAIL", "email address of group admin or manager") { |v| @email = v }
    opts.on("-p", "--password PASSWORD", "password of group admin or manager") { |v| @password = v }
    opts.on("-g", "--groupname GROUPNAME") { |v| @groupname = v }
    opts.on("--path PATH", "output path for emails") { |v| @path = v }
    opts.on("-h", "--help") do
      puts opts
      exit
    end
  end

  begin
    parser.parse!
  rescue OptionParser::ParseError => e
    STDERR.puts e.message, "", parser
    exit 1
  end

  mandatory = [:email, :password, :groupname]
  missing = mandatory.select { |name| instance_variable_get("@#{name}").nil? }

  if missing.any?
    STDERR.puts "Missing arguments: #{missing.join(', ')}", "", parser
    exit 1
  end

  @path = File.join(DEFAULT_OUTPUT_PATH, @groupname) if @path.nil?
  FileUtils.mkpath(@path)
end

parse_arguments
crawl