# discourse/lib/oneboxer/wikipedia_onebox.rb
require_dependency 'oneboxer/handlebars_onebox'
module Oneboxer
  # Onebox for Wikipedia article links.
  #
  # Rewrites an article URL to the English mobile site and extracts a title,
  # a lead image and the opening paragraphs from the returned HTML, for
  # rendering with the shared `simple_onebox` template.
  class WikipediaOnebox < HandlebarsOnebox
    # NOTE(review): the dot in `wikipedia.(com|org)` is unescaped, so it
    # matches any character there -- confirm whether `\.` was intended.
    matcher /^https?:\/\/.*wikipedia.(com|org)\/.*$/
    favicon 'wikipedia.png'

    # @return the path of the `simple_onebox` template, as resolved by
    #   `template_path`.
    def template
      template_path('simple_onebox')
    end

    # Maps the linked article to its English mobile-site URL.
    #
    # The article identifier is the path segment following `wiki/`, up to
    # the next `/` or `#`.
    #
    # @return [String] the mobile URL for the article, or the original
    #   `@url` unchanged when it contains no `wiki/<identifier>` segment.
    def translate_url
      m = @url.match(/wiki\/(?<identifier>[^#\/]+)/mi)
      # Not an article link (e.g. the site root): leave the URL untouched
      # instead of raising on a nil match.
      return @url unless m

      article_id = CGI::unescape(m[:identifier])
      "http://en.m.wikipedia.org/w/index.php?title=#{URI::encode(article_id)}"
    end

    # Extracts the onebox fields from an article's HTML.
    #
    # @param data [String] HTML of the page (presumably the page at
    #   `translate_url` -- the fetch happens outside this class).
    # @return [Hash] `:text` is always set; `:title` is set when the page
    #   has a non-blank <title>; `:image` is set when an <img> taller than
    #   150px exists.
    def parse(data)
      html_doc = Nokogiri::HTML(data)
      result = {}

      # A document without a <title> yields nil here rather than raising.
      title_node = html_doc.at('title')
      title = title_node && title_node.inner_html
      # Non-mutating gsub: `gsub!` returns nil when nothing was replaced,
      # which would discard any title lacking the English suffix.
      result[:title] = title.gsub(/ - Wikipedia, the free encyclopedia/, '') if title.present?

      # get the first image > 150 pix high
      lead_image = html_doc.search("img").find { |img| img['height'].to_i > 150 }
      # The src is prefixed with "http:" -- assumes a protocol-relative
      # ("//host/...") src; TODO confirm against the mobile markup.
      result[:image] = "http:#{lead_image["src"]}" if lead_image

      # remove the table from mobile layout, as it can contain paras in some rare cases
      html_doc.search("table").remove

      # get all the paras
      paras = html_doc.search("p")

      # Join up to the first four paragraphs, stopping early once the text
      # reaches MAX_TEXT (a constant defined outside this file). Iterating
      # only over nodes that exist avoids indexing past the end on short
      # pages.
      text = ""
      paras.take(4).each_with_index do |para, index|
        break unless text.length < MAX_TEXT

        text << " " unless index.zero?
        # Strip footnote markers such as "[12]".
        text << para.inner_text[0..MAX_TEXT].gsub(/\[\d+\]/mi, "")
      end

      text = "#{text[0..MAX_TEXT]}..." if text.length > MAX_TEXT
      result[:text] = text
      result
    end
  end
end