2013-02-06 03:16:51 +08:00
|
|
|
require_dependency 'oneboxer/handlebars_onebox'
|
|
|
|
|
|
|
|
module Oneboxer
  # Onebox for Wikipedia article links.
  #
  # The article URL is rewritten to the mobile site (see #translate_url) and
  # the fetched HTML is reduced to a title, an optional image and a short
  # text excerpt (see #parse).
  class WikipediaOnebox < HandlebarsOnebox

    # The dot before the TLD is escaped so that only a literal
    # "wikipedia.com" / "wikipedia.org" is accepted.
    matcher(%r{^https?://.*wikipedia\.(com|org)/.*$})
    favicon 'wikipedia.png'

    # Path of the Handlebars template used to render this onebox.
    def template
      template_path('simple_onebox')
    end

    # Rewrites an article URL to the corresponding mobile-site URL.
    #
    # Returns @url unchanged when it contains no "wiki/<identifier>" segment.
    #
    # NOTE(review): the target host is always en.m.wikipedia.org, regardless of
    # the language subdomain present in @url -- confirm that this is intended.
    def translate_url
      m = @url.match(%r{wiki/(?<identifier>[^#/]+)}i)
      return @url unless m

      # CGI.unescape treats "+" as an encoded space, but in a URL path it is a
      # literal plus (e.g. "wiki/C++"), so protect it before decoding.
      article_id = CGI.unescape(m[:identifier].gsub('+', '%2B'))

      # URI.encode no longer exists in Ruby 3.0+, and it left "&", "=" and "+"
      # unescaped, which corrupts the query string for titles such as "AT&T".
      "http://en.m.wikipedia.org/w/index.php?title=#{URI.encode_www_form_component(article_id)}"
    end

    # Extracts the onebox fields from the fetched article HTML.
    #
    # data - the HTML of the page as a String.
    #
    # Returns a Hash with:
    #   :title - page title without the Wikipedia suffix (only when a title exists)
    #   :image - URL of the first image taller than 150px (only when one exists)
    #   :text  - excerpt built from the leading paragraphs, possibly empty
    def parse(data)
      html_doc = Nokogiri::HTML(data)
      result = {}

      title_node = html_doc.at('title')
      title = title_node && title_node.inner_html
      # Non-bang gsub: gsub! returns nil when the suffix is absent, which would
      # store nil as the title for any page titled differently.
      result[:title] = title.gsub(/ - Wikipedia, the free encyclopedia/, '') if title.present?

      # get the first image > 150 pix high
      image = html_doc.search('img').find { |img| img['height'].to_i > 150 }
      if image
        src = image['src'].to_s
        unless src.empty?
          # Only protocol-relative sources ("//host/...") need a scheme prepended.
          result[:image] = src.start_with?('//') ? "http:#{src}" : src
        end
      end

      # remove the table from mobile layout, as it can contain paras in some rare cases
      html_doc.search('table').remove

      # Concatenate up to the first four paragraphs, stopping early once the
      # excerpt has reached MAX_TEXT characters. Iterating over the paragraphs
      # that actually exist avoids indexing past the end on short pages.
      text = ''
      html_doc.search('p').first(4).each_with_index do |para, index|
        break unless text.length < MAX_TEXT

        text << ' ' unless index == 0
        # Strip citation markers such as "[12]".
        text << para.inner_text[0..MAX_TEXT].gsub(/\[\d+\]/, '')
      end

      text = "#{text[0..MAX_TEXT]}..." if text.length > MAX_TEXT
      result[:text] = text
      result
    end

  end
end
|