mirror of
https://github.com/discourse/discourse.git
synced 2024-11-22 11:23:25 +08:00
FIX: Disallow table cells from being weighted so that actual articles can be the main content (#27508)
For Topic Embeds, we would prefer the <article> element to be treated as the main article in a topic, rather than a table cell (<td>) that may contain a lot of data. However, for some URLs (such as the example linked in the original PR), a table cell holding a very large code snippet is selected as the Topic Embed's article because of the content weight assigned by the Readability library we use. In the newly released version 0.7.1 (cantino/ruby-readability#94), the library has a new option that lets us exclude <td>, which it scores by default, from content weighting. This is more in line with the original library, which only weighted <p>. So this PR excludes <td> from scoring, as seen in the tests, so that the actual article is recognized as the main content. This PR also adds the <details> tag to the allow-list.
This commit is contained in:
parent
ebdbb199a5
commit
489aac3fdd
|
@ -163,9 +163,31 @@ class TopicEmbed < ActiveRecord::Base
|
|||
require "ruby-readability"
|
||||
|
||||
opts = {
|
||||
tags: %w[div p code pre h1 h2 h3 b em i strong a img ul li ol blockquote figure figcaption],
|
||||
tags: %w[
|
||||
div
|
||||
p
|
||||
code
|
||||
pre
|
||||
h1
|
||||
h2
|
||||
h3
|
||||
b
|
||||
em
|
||||
i
|
||||
strong
|
||||
a
|
||||
img
|
||||
ul
|
||||
li
|
||||
ol
|
||||
blockquote
|
||||
figure
|
||||
figcaption
|
||||
details
|
||||
],
|
||||
attributes: %w[href src class],
|
||||
remove_empty_nodes: false,
|
||||
elements_to_score: %w[p],
|
||||
}
|
||||
|
||||
opts[
|
||||
|
|
|
@ -23,7 +23,7 @@ RSpec.describe TopicEmbed do
|
|||
expect(TopicEmbed.count).to eq(0)
|
||||
end
|
||||
|
||||
it "Allows figure and figcaption HTML tags" do
|
||||
it "Allows figure, figcaption, details HTML tags" do
|
||||
html = <<~HTML
|
||||
<html>
|
||||
<head>
|
||||
|
@ -35,7 +35,10 @@ RSpec.describe TopicEmbed do
|
|||
<figure>
|
||||
<img src="/a.png">
|
||||
<figcaption>Some caption</figcaption>
|
||||
<figure>
|
||||
</figure>
|
||||
<details>
|
||||
some details
|
||||
</details>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
|
@ -51,13 +54,60 @@ RSpec.describe TopicEmbed do
|
|||
<figure>
|
||||
<img src="https://blog.discourse.com/a.png">
|
||||
<figcaption>Some caption</figcaption>
|
||||
<figure>
|
||||
</figure></figure></div>
|
||||
</figure>
|
||||
<details>
|
||||
some details
|
||||
</details>
|
||||
</div>
|
||||
</div></div>
|
||||
HTML
|
||||
expect(parsed.body.strip).to eq(expected.strip)
|
||||
end
|
||||
|
||||
# ideally, articles get a heavier weightage than td elements
|
||||
# so to force that, we do not allow td elements to be scored
|
||||
it "does not score td tags" do
|
||||
html = <<~HTML
|
||||
<html>
|
||||
<head>
|
||||
<title>Some title</title>
|
||||
</head>
|
||||
<body>
|
||||
<article>
|
||||
article content
|
||||
<table>
|
||||
<tr>
|
||||
<td>
|
||||
<p>cats</p>
|
||||
<p>cats</p>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
</article>
|
||||
</body>
|
||||
</html>
|
||||
HTML
|
||||
|
||||
parsed = TopicEmbed.parse_html(html, "https://blog.discourse.com/somepost.html")
|
||||
|
||||
expected = <<-HTML
|
||||
<div><div>
|
||||
|
||||
article content
|
||||
|
||||
|
||||
|
||||
cats
|
||||
cats
|
||||
|
||||
|
||||
|
||||
|
||||
</div></div>
|
||||
HTML
|
||||
expect(parsed.body.strip).to eq(expected.strip)
|
||||
end
|
||||
|
||||
context "when creating a post" do
|
||||
let!(:post) { TopicEmbed.import(user, url, title, contents) }
|
||||
let(:topic_embed) { TopicEmbed.find_by(post: post) }
|
||||
|
|
Loading…
Reference in New Issue
Block a user