discourse/app/models/topic_link.rb

Failed to ignore revisions in .git-blame-ignore-revs.

407 lines
12 KiB
Ruby
Raw Normal View History

# frozen_string_literal: true
2013-02-06 03:16:51 +08:00
require 'uri'
class TopicLink < ActiveRecord::Base
2015-09-26 02:07:04 +08:00
# Longest host name (in characters) accepted for a link; ensure_entry_for
# returns nil for links whose parsed host is longer than this.
def self.max_domain_length
100
end
# Longest URL (in characters) stored for a link; ensure_entry_for truncates
# URLs to this many characters before saving them.
def self.max_url_length
500
end
2013-02-06 03:16:51 +08:00
# Owning side of the link. For rows written by ensure_entry_for with
# reflection: false these are the post containing the link, that post's
# topic and that post's author.
belongs_to :topic
belongs_to :user
belongs_to :post
# Target side of the link: the Topic / Post an internal URL resolves to.
# Both are optional (ensure_entry_for passes nil when nothing was resolved).
belongs_to :link_topic, class_name: 'Topic'
belongs_to :link_post, class_name: 'Post'
2013-02-06 03:16:51 +08:00
validates_presence_of :url
# Reuse the class-level limit so the validation cannot drift away from the
# truncation applied in ensure_entry_for.
validates_length_of :url, maximum: max_url_length
validates_uniqueness_of :url, scope: [:topic_id, :post_id]
has_many :topic_link_clicks, dependent: :destroy

validate :link_to_self

# Runs the instance-level crawl_link_title after the record is committed.
after_commit :crawl_link_title
2013-02-06 03:16:51 +08:00
# Validation hook: rejects a link whose target topic is the very topic the
# link belongs to by adding an error on :base.
def link_to_self
  return unless topic_id == link_topic_id

  errors.add(:base, "can't link to the same topic")
end
2013-11-16 01:15:46 +08:00
# Builds the link summary for one topic: up to 50 clicked links, grouped by
# URL and ordered by click count (then by number of occurrences).
#
# Links to deleted topics, links to private messages, links ending in
# .png/.jpg/.gif and links with no clicks are filtered out. The guardian's
# secure_category_ids are handed to the builder's secure_category filter.
#
# Returns whatever the SQL builder's #query returns for the report.
def self.topic_map(guardian, topic_id)
  # Sam: complicated reports are really hard in AR
  report_sql = <<~SQL
    SELECT ftl.url,
    COALESCE(ft.title, ftl.title) AS title,
    ftl.link_topic_id,
    ftl.reflection,
    ftl.internal,
    ftl.domain,
    MIN(ftl.user_id) AS user_id,
    SUM(clicks) AS clicks
    FROM topic_links AS ftl
    LEFT JOIN topics AS ft ON ftl.link_topic_id = ft.id
    LEFT JOIN categories AS c ON c.id = ft.category_id
    /*where*/
    GROUP BY ftl.url, ft.title, ftl.title, ftl.link_topic_id, ftl.reflection, ftl.internal, ftl.domain
    ORDER BY clicks DESC, count(*) DESC
    LIMIT 50
  SQL

  # note that ILIKE means "case insensitive LIKE"
  not_an_image = "NOT(ftl.url ILIKE '%.png' OR ftl.url ILIKE '%.jpg' OR ftl.url ILIKE '%.gif')"

  report = DB.build(report_sql)
  report.where("ftl.topic_id = :topic_id", topic_id: topic_id)
  report.where("ft.deleted_at IS NULL")
  report.where(not_an_image)
  report.where("COALESCE(ft.archetype, 'regular') <> :archetype", archetype: Archetype.private_message)
  report.where('clicks > 0')
  report.secure_category(guardian.secure_category_ids)
  report.query
end
2017-07-28 09:20:09 +08:00
# Collects the links of the given posts, grouped by post id.
#
# guardian - object whose secure_category_ids are passed to the builder's
#            secure_category filter
# topic    - not used by this method; kept so existing callers keep working
# posts    - collection of objects responding to #id
#
# Links to deleted topics and to private messages are excluded.
#
# Returns a Hash mapping post_id => Array of Hashes with the keys
# :url, :clicks, :title, :internal and :reflection ({} when posts is blank).
def self.counts_for(guardian, topic, posts)
  return {} if posts.blank?

  # Sam: this is not tidy in AR and also happens to be a critical path
  # for topic view
  builder = DB.build(<<~SQL)
    SELECT
      l.post_id,
      l.url,
      l.clicks,
      COALESCE(t.title, l.title) AS title,
      l.internal,
      l.reflection,
      l.domain
    FROM topic_links l
    LEFT JOIN topics t ON t.id = l.link_topic_id
    LEFT JOIN categories AS c ON c.id = t.category_id
    /*where*/
    ORDER BY reflection ASC, clicks DESC
  SQL

  builder.where('t.deleted_at IS NULL')
  builder.where("COALESCE(t.archetype, 'regular') <> :archetype", archetype: Archetype.private_message)
  # not certain if pluck is right, cause it may interfere with caching
  builder.where('l.post_id in (:post_ids)', post_ids: posts.map(&:id))
  builder.secure_category(guardian.secure_category_ids)

  builder.query.each_with_object({}) do |link, by_post|
    (by_post[link.post_id] ||= []) << {
      url: link.url,
      clicks: link.clicks,
      title: link.title,
      internal: link.internal,
      reflection: link.reflection
    }
  end
end
2013-02-06 03:16:51 +08:00
# Synchronises the topic_links rows of a post with the links found in its
# cooked HTML.
#
# Does nothing (returns nil) when the post is blank, is a whisper or has no
# user_id. Otherwise every distinct, parseable, non-mailto link is passed to
# ensure_entry_for inside its own transaction, and cleanup_entries is then
# called with the URLs / reflection ids that were recorded so that stale rows
# can be removed.
#
# Returns the result of cleanup_entries.
def self.extract_from(post)
  return if post.blank? || post.whisper? || post.user_id.blank?

  current_urls = []
  reflected_ids = []

  PrettyText
    .extract_links(post.cooked)
    .map { |link| [link, UrlHelper.relaxed_parse(link.url)] }
    .reject { |_, parsed| parsed.nil? || parsed.scheme == "mailto" }
    .uniq { |_, parsed| parsed }
    .each do |link, parsed|
      TopicLink.transaction do
        begin
          # ensure_entry_for returns nil when the link is skipped, in which
          # case both variables end up nil and nothing is recorded.
          url, reflected_id = ensure_entry_for(post, link, parsed)
          current_urls << url unless url.nil?
          reflected_ids << reflected_id unless reflected_id.nil?
        rescue URI::Error
          # if the URI is invalid, don't store it.
        rescue ActionController::RoutingError
          # If we can't find the route, no big deal
        end
      end
    end

  cleanup_entries(post, current_urls, reflected_ids)
end
2014-04-06 02:47:25 +08:00
# Enqueues the :crawl_topic_link job for the topic link with the given id.
def self.crawl_link_title(topic_link_id)
  Jobs.enqueue(:crawl_topic_link, topic_link_id: topic_link_id)
end

# Instance-level hook (registered with after_commit in this class): enqueues
# the crawl job for this record.
def crawl_link_title
  TopicLink.crawl_link_title(id)
end
# Builds a lookup of the links already posted in a topic.
#
# Considers the last 200 non-reflection links of the topic that have both a
# post and a user. Each URL is normalised by downcasing it and stripping a
# leading "http://" / "https://" and one trailing slash; when several links
# normalise to the same key the later one wins.
#
# Returns a Hash mapping normalised URL =>
#   { domain:, username:, posted_at:, post_number: }.
def self.duplicate_lookup(topic)
  recent_links = TopicLink
    .includes(:post, :user)
    .joins(:post, :user)
    .where("posts.id IS NOT NULL AND users.id IS NOT NULL")
    .where(topic_id: topic.id, reflection: false)
    .last(200)

  recent_links.each_with_object({}) do |tl, lookup|
    normalized = tl.url.downcase.sub(/^https?:\/\//, '').sub(/\/$/, '')
    lookup[normalized] = {
      domain: tl.domain,
      username: tl.user.username_lower,
      posted_at: tl.post.created_at,
      post_number: tl.post.post_number
    }
  end
end
private

# NOTE(review): a bare `private` does not change the visibility of methods
# defined with `def self.`, so the class methods below remain publicly
# callable. Use `private_class_method` if they are meant to be hidden.

# Inserts a topic_links row with a single SQL statement and returns its id.
#
# The statement is an INSERT ... ON CONFLICT DO NOTHING that falls back to
# selecting the id of the existing row with the same post_id, topic_id and
# url, so no separate existence check is issued beforehand. This keeps link
# creation cheap and quiet under heavy concurrent use; relying on
# ActiveRecord for this caused a lot of churn and hard to debug log messages
# in production.
#
# To keep parity with the ActiveRecord callback, the title crawl is scheduled
# after commit, but only when a row was actually inserted.
#
# domain defaults to Discourse.current_hostname when nil.
def self.safe_create_topic_link(
  post_id:,
  user_id:,
  topic_id:,
  url:,
  domain: nil,
  internal: false,
  link_topic_id: nil,
  link_post_id: nil,
  quote: false,
  extension: nil,
  reflection: false
)
  domain ||= Discourse.current_hostname

  insert_or_find_sql = <<~SQL
    WITH new_row AS(
    INSERT INTO topic_links(
    post_id,
    user_id,
    topic_id,
    url,
    domain,
    internal,
    link_topic_id,
    link_post_id,
    quote,
    extension,
    reflection,
    created_at,
    updated_at
    ) VALUES (
    :post_id,
    :user_id,
    :topic_id,
    :url,
    :domain,
    :internal,
    :link_topic_id,
    :link_post_id,
    :quote,
    :extension,
    :reflection,
    :now,
    :now
    )
    ON CONFLICT DO NOTHING
    RETURNING id
    )
    SELECT COALESCE(
    (SELECT id FROM new_row),
    (SELECT id FROM topic_links WHERE post_id = :post_id AND topic_id = :topic_id AND url = :url)
    ), (SELECT id FROM new_row) IS NOT NULL
  SQL

  bindings = {
    post_id: post_id,
    user_id: user_id,
    topic_id: topic_id,
    url: url,
    domain: domain,
    internal: internal,
    link_topic_id: link_topic_id,
    link_post_id: link_post_id,
    quote: quote,
    extension: extension,
    reflection: reflection,
    now: Time.now
  }

  # Second column of the result tells whether the INSERT produced the row.
  topic_link_id, inserted = DB.query_single(insert_or_find_sql, **bindings)

  DB.after_commit { crawl_link_title(topic_link_id) } if inserted

  topic_link_id
end
# Records one link found in a post and, when possible, its reflection.
#
# post   - the post containing the link
# link   - extracted link; must respond to #url and #is_quote
# parsed - parsed form of link.url; must respond to #host and #path
#
# Resolution of the stored URL:
# * a URL that maps to an Upload is stored as the cooked upload URL;
# * a URL that routes inside the site is marked internal and, when it
#   resolves to a topic (by id, then by slug), is stored in canonical form
#   (base URL + topic.relative_url, plus "/<post_number>" when > 1);
# * anything else is stored as given.
#
# Returns nil without writing anything when the link routes to the users
# controller, points at the post's own topic, or has a host longer than
# TopicLink.max_domain_length. Otherwise returns [stored_url, reflected_id],
# where reflected_id is the id of the reflection row, or nil when no
# reflection was created.
def self.ensure_entry_for(post, link, parsed)
  url = link.url
  internal = false
  post_number = nil
  topic = nil

  if (upload = Upload.get_from_url(url))
    internal = Discourse.store.internal?
    # Store the same URL that will be used in the cooked version of the post
    url = UrlHelper.cook_url(upload.url, secure: upload.secure?)
  elsif (route = Discourse.route_for(parsed))
    internal = true

    # We aren't interested in tracking internal links to users
    return nil if route[:controller] == 'users'

    post_number = route[:post_number] || 1
    topic_slug = route[:id]

    # Store the canonical URL
    topic = Topic.find_by(id: route[:topic_id].to_i)
    topic ||= Topic.find_by(slug: topic_slug) if topic_slug

    if topic.present?
      url = +"#{Discourse.base_url_no_prefix}#{topic.relative_url}"
      url << "/#{post_number}" if post_number.to_i > 1
    end
  end

  # Skip linking to ourselves
  return nil if topic&.id == post.topic_id

  reflected_post = nil
  if post_number && topic
    reflected_post = Post.find_by(topic_id: topic.id, post_number: post_number.to_i)
  end

  url = url[0...TopicLink.max_url_length]
  return nil if parsed && parsed.host && parsed.host.length > TopicLink.max_domain_length

  # Extension without the leading dot, capped at 10 characters, lower-cased.
  path = parsed.path
  extname = File.extname(path) unless path.nil?
  file_extension = extname[1..10].downcase unless extname.nil? || extname.empty?

  safe_create_topic_link(
    post_id: post.id,
    user_id: post.user_id,
    topic_id: post.topic_id,
    url: url,
    domain: parsed.host,
    internal: internal,
    link_topic_id: topic&.id,
    link_post_id: reflected_post&.id,
    quote: link.is_quote,
    extension: file_extension
  )

  reflected_id = nil

  # Create the reflection if we can
  if topic && post.topic && topic.archetype != 'private_message' && post.topic.archetype != 'private_message' && post.topic.visible?
    prefix = Discourse.base_url_no_prefix
    reflected_url = "#{prefix}#{post.topic.relative_url(post.post_number)}"

    reflected_id = safe_create_topic_link(
      user_id: post.user_id,
      topic_id: topic.id,
      post_id: reflected_post&.id,
      url: reflected_url,
      domain: Discourse.current_hostname,
      reflection: true,
      internal: true,
      link_topic_id: post.topic_id,
      link_post_id: post.id
    )
  end

  [url, reflected_id]
end
# Deletes the topic_links rows of a post that are no longer backed by a link
# in the post.
#
# current_urls          - URLs that are still present in the post
# current_reflected_ids - ids of reflection rows that are still valid; nil
#                         entries are removed from this array in place
#
# With no current URLs, every non-reflection row of the post and every
# reflection row pointing back at it is deleted. Otherwise only rows whose
# url / id is not in the given lists are deleted.
def self.cleanup_entries(post, current_urls, current_reflected_ids)
  unless current_urls.present?
    # Nothing is linked any more: drop both directions in one statement.
    return TopicLink
      .where(
        "(post_id = :post_id AND NOT reflection) OR (link_post_id = :post_id AND reflection)",
        post_id: post.id
      )
      .delete_all
  end

  # Remove links that aren't there anymore
  TopicLink.where(
    "(url not in (:urls)) AND (post_id = :post_id AND NOT reflection)",
    urls: current_urls, post_id: post.id
  ).delete_all

  current_reflected_ids.compact!

  stale_reflections =
    if current_reflected_ids.present?
      TopicLink.where(
        "(id not in (:reflected_ids)) AND (link_post_id = :post_id AND reflection)",
        reflected_ids: current_reflected_ids, post_id: post.id
      )
    else
      TopicLink.where("link_post_id = :post_id AND reflection", post_id: post.id)
    end

  stale_reflections.delete_all
end
2013-02-06 03:16:51 +08:00
end
# == Schema Information
#
# Table name: topic_links
#
#  id            :integer          not null, primary key
#  topic_id      :integer          not null
#  post_id       :integer
#  user_id       :integer          not null
#  url           :string(500)      not null
#  domain        :string(100)      not null
#  internal      :boolean          default(FALSE), not null
#  link_topic_id :integer
#  created_at    :datetime         not null
#  updated_at    :datetime         not null
#  reflection    :boolean          default(FALSE)
#  clicks        :integer          default(0), not null
#  link_post_id  :integer
#  title         :string
#  crawled_at    :datetime
#  quote         :boolean          default(FALSE), not null
#  extension     :string(10)
#
# Indexes
#
#  index_topic_links_on_extension                    (extension)
#  index_topic_links_on_link_post_id_and_reflection  (link_post_id,reflection)
#  index_topic_links_on_post_id                      (post_id)
#  index_topic_links_on_topic_id                     (topic_id)
#  index_topic_links_on_user_id                      (user_id)
#  unique_post_links                                 (topic_id,post_id,url) UNIQUE
#