# frozen_string_literal: true

require "mini_racer"
require "nokogiri"
require "erb"

module PrettyText
  DANGEROUS_BIDI_CHARACTERS = [
    "\u202A",
    "\u202B",
    "\u202C",
    "\u202D",
    "\u202E",
    "\u2066",
    "\u2067",
    "\u2068",
    "\u2069",
  ].freeze
  DANGEROUS_BIDI_REGEXP = Regexp.new(DANGEROUS_BIDI_CHARACTERS.join("|")).freeze
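
  # Attributes written by sanitize_hotlinked_media below when the `block_hotlinked_media`
  # site setting is enabled: disallowed src/srcset values are moved into these data
  # attributes, and CookedPostProcessor later restores a downloaded copy or swaps in an
  # "external media" placeholder.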
  BLOCKED_HOTLINKED_SRC_ATTR = "data-blocked-hotlinked-src"
  BLOCKED_HOTLINKED_SRCSET_ATTR = "data-blocked-hotlinked-srcset"

  @mutex = Mutex.new
  @ctx_init = Mutex.new

  def self.app_root
    Rails.root
  end

  def self.apply_es6_file(ctx:, path:, module_name:)
    source = File.read(path)
    transpiler = DiscourseJsProcessor::Transpiler.new
    transpiled = transpiler.perform(source, nil, module_name)
    ctx.eval(transpiled, filename: module_name)
  end

  def self.ctx_load_directory(ctx:, base_path:, module_prefix:)
    Dir["**/*.js", base: base_path].sort.each do |f|
      module_name = "#{module_prefix}#{f.delete_suffix(".js")}"
      apply_es6_file(ctx: ctx, path: File.join(base_path, f), module_name: module_name)
    end
  end
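
  # The MiniRacer context assembled below loads the same markdown pipeline the client
  # uses: loader.js and markdown-it first, then the pretty-text and discourse-markdown-it
  # modules, and finally any plugin-provided discourse-markdown files.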
  def self.create_es6_context
    ctx = MiniRacer::Context.new(timeout: 25_000, ensure_gc_after_idle: 2000)

    ctx.eval("window = globalThis; window.devicePixelRatio = 2;") # hack to make code think stuff is retina

    ctx.attach("rails.logger.info", proc { |err| Rails.logger.info(err.to_s) })
    ctx.attach("rails.logger.warn", proc { |err| Rails.logger.warn(err.to_s) })
    ctx.attach("rails.logger.error", proc { |err| Rails.logger.error(err.to_s) })
    ctx.eval <<~JS
      console = {
        prefix: "[PrettyText] ",
        log: function(...args) { rails.logger.info(console.prefix + args.join(" ")); },
        warn: function(...args) { rails.logger.warn(console.prefix + args.join(" ")); },
        error: function(...args) { rails.logger.error(console.prefix + args.join(" ")); }
      }
    JS

    ctx.eval("__PRETTY_TEXT = true")

    PrettyText::Helpers.instance_methods.each do |method|
      ctx.attach("__helpers.#{method}", PrettyText::Helpers.method(method))
    end

    root_path = "#{Rails.root}/app/assets/javascripts"
    d_node_modules = "#{Rails.root}/app/assets/javascripts/discourse/node_modules"
    md_node_modules = "#{Rails.root}/app/assets/javascripts/discourse-markdown-it/node_modules"

    ctx.load("#{d_node_modules}/loader.js/dist/loader/loader.js")
    ctx.load("#{md_node_modules}/markdown-it/dist/markdown-it.js")
    ctx.load("#{root_path}/handlebars-shim.js")
    ctx.load("#{md_node_modules}/xss/dist/xss.js")
    ctx.load("#{Rails.root}/lib/pretty_text/vendor-shims.js")

    ctx_load_directory(
      ctx: ctx,
      base_path: "#{root_path}/pretty-text/addon",
      module_prefix: "pretty-text/",
    )
    ctx_load_directory(
      ctx: ctx,
      base_path: "#{root_path}/discourse-markdown-it/src",
      module_prefix: "discourse-markdown-it/",
    )

    %w[
      discourse-common/addon/deprecation-workflow
      discourse-common/addon/lib/get-url
      discourse-common/addon/lib/object
      discourse-common/addon/lib/deprecated
      discourse-common/addon/lib/escape
      discourse-common/addon/lib/avatar-utils
      discourse-common/addon/lib/case-converter
      discourse/app/lib/to-markdown
      discourse/app/static/markdown-it/features
    ].each do |f|
      apply_es6_file(
        ctx: ctx,
        path: "#{root_path}/#{f}.js",
        module_name: f.sub("/addon/", "/").sub("/app/", "/"),
      )
    end

    ctx.load("#{Rails.root}/lib/pretty_text/shims.js")
    ctx.eval("__setUnicode(#{Emoji.unicode_replacements_json})")

    Discourse.plugins.each do |plugin|
      Dir
        .glob("#{plugin.directory}/assets/javascripts/**/discourse-markdown/**/*.{js,js.es6}")
        .filter { |a| File.file?(a) }
        .each do |f|
          module_name =
            f.sub(%r{\A.+assets/javascripts/}, "discourse/plugins/#{plugin.name}/").sub(
              /\.js(\.es6)?\z/,
              "",
            )
          apply_es6_file(ctx: ctx, path: f, module_name: module_name)
        end
    end

    DiscoursePluginRegistry.vendored_core_pretty_text.each { |vpt| ctx.eval(File.read(vpt)) }
    DiscoursePluginRegistry.vendored_pretty_text.each { |vpt| ctx.eval(File.read(vpt)) }

    ctx
  end

  def self.v8
    return @ctx if @ctx

    # ensure we only init one of these
    @ctx_init.synchronize do
      return @ctx if @ctx
      @ctx = create_es6_context
    end

    @ctx
  end

  def self.reset_translations
    v8.eval("__resetTranslationTree()")
  end

  def self.reset_context
    @ctx_init.synchronize do
      @ctx&.dispose
      @ctx = nil
    end
  end

  # Acceptable options:
  #
  # disable_emojis - Disables the emoji markdown engine.
  # features - A hash where the key is the markdown feature name and the value is a boolean to enable/disable the markdown feature.
  #            The hash is merged into the default features set in pretty-text.js which can be used to add new features or disable existing features.
  # features_override - An array of markdown feature names to override the default markdown feature set. Currently used by plugins to customize what features should be enabled
  #                     when rendering markdown.
  # markdown_it_rules - An array of markdown rule names which will be applied to the markdown-it engine. Currently used by plugins to customize what markdown-it rules should be
  #                     enabled when rendering markdown.
  # topic_id - Topic id for the post being cooked.
  # post_id - Post id for the post being cooked.
  # user_id - User id for the post being cooked.
  # force_quote_link - Always create the link to the quoted topic for [quote] bbcode. Normally this only happens
  #                    if the topic_id provided is different from the [quote topic:X].
  # hashtag_context - Defaults to "topic-composer" if not supplied. Controls the order of #hashtag lookup results
  #                   based on registered hashtag contexts from the `#register_hashtag_search_param` plugin API
  #                   method.
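  #
  # Illustrative example (the exact HTML depends on site settings and enabled features):
  #
  #   PrettyText.markdown("hello **world**", topic_id: 1)
  #   # => "<p>hello <strong>world</strong></p>"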
  def self.markdown(text, opts = {})
    # we use the exact same markdown converter as the client
    # TODO: use the same extensions on both client and server (in particular the template for mentions)
    baked = nil
    text = text || ""

    protect do
      context = v8

      custom_emoji = {}
      Emoji.custom.map { |e| custom_emoji[e.name] = e.url }

      # note, any additional options added to __optInput here must also be
      # added to the buildOptions function in pretty-text.js,
      # otherwise they will be discarded
      buffer = +<<~JS
        __optInput = {};
        __optInput.siteSettings = #{SiteSetting.client_settings_json};
        #{"__optInput.disableEmojis = true" if opts[:disable_emojis]}
        __paths = #{paths_json};
        __optInput.getURL = __getURL;
        #{"__optInput.features = #{opts[:features].to_json};" if opts[:features]}
        #{"__optInput.featuresOverride = #{opts[:features_override].to_json};" if opts[:features_override]}
        #{"__optInput.markdownItRules = #{opts[:markdown_it_rules].to_json};" if opts[:markdown_it_rules]}
        __optInput.getCurrentUser = __getCurrentUser;
        __optInput.lookupAvatar = __lookupAvatar;
        __optInput.lookupPrimaryUserGroup = __lookupPrimaryUserGroup;
        __optInput.formatUsername = __formatUsername;
        __optInput.getTopicInfo = __getTopicInfo;
        __optInput.hashtagLookup = __hashtagLookup;
        __optInput.customEmoji = #{custom_emoji.to_json};
        __optInput.customEmojiTranslation = #{Plugin::CustomEmoji.translations.to_json};
        __optInput.emojiUnicodeReplacer = __emojiUnicodeReplacer;
        __optInput.emojiDenyList = #{Emoji.denied.to_json};
        __optInput.lookupUploadUrls = __lookupUploadUrls;
        __optInput.censoredRegexp = #{WordWatcher.serialized_regexps_for_action(:censor, engine: :js).to_json};
        __optInput.watchedWordsReplace = #{WordWatcher.regexps_for_action(:replace, engine: :js).to_json};
        __optInput.watchedWordsLink = #{WordWatcher.regexps_for_action(:link, engine: :js).to_json};
        __optInput.additionalOptions = #{Site.markdown_additional_options.to_json};
        __optInput.avatar_sizes = #{SiteSetting.avatar_sizes.to_json};
      JS

      buffer << "__optInput.topicId = #{opts[:topic_id].to_i};\n" if opts[:topic_id]
      buffer << "__optInput.postId = #{opts[:post_id].to_i};\n" if opts[:post_id]

      if opts[:force_quote_link]
        buffer << "__optInput.forceQuoteLink = #{opts[:force_quote_link]};\n"
      end

      buffer << "__optInput.userId = #{opts[:user_id].to_i};\n" if opts[:user_id]

      opts[:hashtag_context] = opts[:hashtag_context] || "topic-composer"
      hashtag_types_as_js =
        HashtagAutocompleteService
          .ordered_types_for_context(opts[:hashtag_context])
          .map { |t| "'#{t}'" }
          .join(",")
      buffer << "__optInput.hashtagTypesInPriorityOrder = [#{hashtag_types_as_js}];\n"
      buffer << "__optInput.hashtagIcons = #{HashtagAutocompleteService.data_source_icon_map.to_json};\n"

      buffer << "__pluginFeatures = __loadPluginFeatures();"
      buffer << "__pt = __DiscourseMarkdownIt.withCustomFeatures(__pluginFeatures).withOptions(__optInput);"

      # Be careful disabling sanitization. We allow for custom emails
      buffer << ("__pt.disableSanitizer();") if opts[:sanitize] == false

      opts = context.eval(buffer)

      DiscourseEvent.trigger(:markdown_context, context)

      baked = context.eval("__pt.cook(#{text.inspect})")
    end

    baked
  end

  def self.paths_json
    paths = { baseUri: Discourse.base_path, CDN: Rails.configuration.action_controller.asset_host }

    if SiteSetting.Upload.enable_s3_uploads
      paths[:S3CDN] = SiteSetting.Upload.s3_cdn_url if SiteSetting.Upload.s3_cdn_url.present?
      paths[:S3BaseUrl] = Discourse.store.absolute_base_url
    end

    paths.to_json
  end

  # leaving this here because it invokes v8, don't want to implement twice
  def self.avatar_img(avatar_template, size)
    protect { v8.eval(<<~JS) }
      __optInput = {};
      __optInput.avatar_sizes = #{SiteSetting.avatar_sizes.to_json};
      __paths = #{paths_json};
      require("discourse-common/lib/avatar-utils").avatarImg({size: #{size.inspect}, avatarTemplate: #{avatar_template.inspect}}, __getURL);
    JS
  end

  def self.unescape_emoji(title)
    return title unless SiteSetting.enable_emoji? && title

    set = SiteSetting.emoji_set.inspect
    custom = Emoji.custom.map { |e| [e.name, e.url] }.to_h.to_json

    protect { v8.eval(<<~JS) }
      __paths = #{paths_json};
      __performEmojiUnescape(#{title.inspect}, {
        getURL: __getURL,
        emojiSet: #{set},
        emojiCDNUrl: "#{SiteSetting.external_emoji_url.blank? ? "" : SiteSetting.external_emoji_url}",
        customEmoji: #{custom},
        enableEmojiShortcuts: #{SiteSetting.enable_emoji_shortcuts},
        inlineEmoji: #{SiteSetting.enable_inline_emoji_translation}
      });
    JS
  end

  def self.escape_emoji(title)
    return unless title

    replace_emoji_shortcuts = SiteSetting.enable_emoji && SiteSetting.enable_emoji_shortcuts

    protect { v8.eval(<<~JS) }
      __performEmojiEscape(#{title.inspect}, {
        emojiShortcuts: #{replace_emoji_shortcuts},
        inlineEmoji: #{SiteSetting.enable_inline_emoji_translation}
      });
    JS
  end
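
  # Illustrative example, assuming a user named "sam" exists and mentions are enabled;
  # the cooked HTML is approximate:
  #
  #   PrettyText.cook("hi @sam", user_id: 1)
  #   # => "<p>hi <a class=\"mention\" href=\"/u/sam\">@sam</a></p>"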
  def self.cook(text, opts = {})
    options = opts.dup
    working_text = text.dup

    sanitized = markdown(working_text, options)

    doc = Nokogiri::HTML5.fragment(sanitized)

    add_nofollow = !options[:omit_nofollow] && SiteSetting.add_rel_nofollow_to_user_content
    add_rel_attributes_to_user_content(doc, add_nofollow)
    strip_hidden_unicode_bidirectional_characters(doc)
    sanitize_hotlinked_media(doc)
    add_video_placeholder_image(doc)

    add_mentions(doc, user_id: opts[:user_id]) if SiteSetting.enable_mentions

    scrubber = Loofah::Scrubber.new { |node| node.remove if node.name == "script" }
    loofah_fragment = Loofah.html5_fragment(doc.to_html)
    loofah_fragment.scrub!(scrubber).to_html
  end
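
  # Bidirectional control characters hidden inside code blocks are made visible: each one
  # is replaced with a <span class="bidi-warning"> wrapper around a marker such as <U+202E>.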
  def self.strip_hidden_unicode_bidirectional_characters(doc)
    return if !DANGEROUS_BIDI_REGEXP.match?(doc.content)

    doc
      .css("code,pre")
      .each do |code_tag|
        next if !DANGEROUS_BIDI_REGEXP.match?(code_tag.content)

        DANGEROUS_BIDI_CHARACTERS.each do |bidi|
          next if !code_tag.content.include?(bidi)

          formatted = "<U+#{bidi.ord.to_s(16).upcase}>"
          code_tag.inner_html =
            code_tag.inner_html.gsub(
              bidi,
              "<span class=\"bidi-warning\" title=\"#{I18n.t("post.hidden_bidi_character")}\">#{formatted}</span>",
            )
        end
      end
  end

  def self.sanitize_hotlinked_media(doc)
    return if !SiteSetting.block_hotlinked_media

    allowed_pattern = allowed_src_pattern

    doc
      .css("img[src], source[src], source[srcset], track[src], div[data-video-src]")
      .each do |el|
        if el["src"] && !el["src"].match?(allowed_pattern)
          el[PrettyText::BLOCKED_HOTLINKED_SRC_ATTR] = el.delete("src")
        end

        if el["data-video-src"] && !el["data-video-src"].match?(allowed_pattern)
          el[PrettyText::BLOCKED_HOTLINKED_SRC_ATTR] = el["data-video-src"]
        end
        if el["srcset"]
          srcs = el["srcset"].split(",").map { |e| e.split(" ", 2)[0].presence }
          if srcs.any? { |src| !src.match?(allowed_pattern) }
            el[PrettyText::BLOCKED_HOTLINKED_SRCSET_ATTR] = el.delete("srcset")
          end
        end
      end
  end

  def self.add_rel_attributes_to_user_content(doc, add_nofollow)
    allowlist = []

    domains = SiteSetting.exclude_rel_nofollow_domains
    allowlist = domains.split("|") if domains.present?

    site_uri = nil
    doc
      .css("a")
      .each do |l|
        href = l["href"].to_s
        l["rel"] = "noopener" if l["target"] == "_blank"

        begin
          uri = URI(UrlHelper.encode_component(href))
          site_uri ||= URI(Discourse.base_url)

          same_domain =
            !uri.host.present? || uri.host == site_uri.host ||
              uri.host.ends_with?(".#{site_uri.host}") ||
              allowlist.any? { |u| uri.host == u || uri.host.ends_with?(".#{u}") }

          l["rel"] = "noopener nofollow ugc" if add_nofollow && !same_domain
        rescue URI::Error
          # add a nofollow anyway
          l["rel"] = "noopener nofollow ugc"
        end
      end
  end

  class DetectedLink < Struct.new(:url, :is_quote)
  end
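
  # Illustrative example (hypothetical markup):
  #
  #   PrettyText.extract_links("<a href='https://example.com'>x</a>")
  #   # => [#<struct PrettyText::DetectedLink url="https://example.com", is_quote=false>]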
  def self.extract_links(html)
    links = []
    doc = Nokogiri::HTML5.fragment(html)

    # extract onebox links
    doc
      .css("aside.onebox[data-onebox-src]")
      .each { |onebox| links << DetectedLink.new(onebox["data-onebox-src"], false) }

    # remove href inside quotes & oneboxes & elided part
    doc.css("aside.quote a, aside.onebox a, .elided a").remove

    # remove hotlinked images
    doc.css("a.lightbox > img, a.onebox > img").each { |img| img.parent.remove }

    # extract all links
    doc
      .css("a")
      .each do |a|
        links << DetectedLink.new(a["href"], false) if a["href"].present? && a["href"][0] != "#"
      end

    # extract quotes
    doc
      .css("aside.quote[data-topic]")
      .each do |aside|
        if aside["data-topic"].present?
          url = +"/t/#{aside["data-topic"]}"
          url << "/#{aside["data-post"]}" if aside["data-post"].present?
          links << DetectedLink.new(url, true)
        end
      end

    # extract Youtube links
    doc
      .css("div[data-video-id]")
      .each do |div|
        if div["data-video-id"].present? && div["data-provider-name"].present?
          base_url =
            case div["data-provider-name"]
            when "youtube"
              "https://www.youtube.com/watch?v="
            when "vimeo"
              "https://vimeo.com/"
            when "tiktok"
              "https://m.tiktok.com/v/"
            end
          links << DetectedLink.new(base_url + div["data-video-id"], false)
        end
      end

    links
  end

  def self.add_video_placeholder_image(doc)
    doc
      .css(".video-placeholder-container")
      .each do |video|
        video_src = video["data-video-src"]
        next if video_src == "/404" || video_src.nil?

        video_sha1 = File.basename(video_src, File.extname(video_src))
        thumbnail = Upload.where("original_filename LIKE ?", "#{video_sha1}.%").last
        if thumbnail
          video["data-thumbnail-src"] = UrlHelper.absolute(
            GlobalPath.upload_cdn_path(thumbnail.url),
          )
        end
      end
  end
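
  # Illustrative example; `cooked` is expected to be a parsed fragment of cooked post HTML:
  #
  #   cooked = Nokogiri::HTML5.fragment('<a class="mention">@Sam</a>')
  #   PrettyText.extract_mentions(cooked)
  #   # => ["sam"]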
  def self.extract_mentions(cooked)
    mentions =
      cooked
        .css(".mention, .mention-group")
        .filter_map do |e|
          if (name = e.inner_text)
            User.normalize_username(name[1..-1]) if name[0] == "@"
          end
        end

    mentions =
      DiscoursePluginRegistry.apply_modifier(:pretty_text_extract_mentions, mentions, cooked)

    mentions.compact!
    mentions.uniq!
    mentions
  end

  def self.excerpt(html, max_length, options = {})
    # TODO: properly fix this HACK in ExcerptParser without introducing XSS
    doc = Nokogiri::HTML5.fragment(html)
    DiscourseEvent.trigger(:reduce_excerpt, doc, options)
    strip_image_wrapping(doc)
    strip_oneboxed_media(doc)
    convert_hashtag_links_to_plaintext(doc) if options[:plain_hashtags]

    html = doc.to_html
    ExcerptParser.get_excerpt(html, max_length, options)
  end

  def self.convert_hashtag_links_to_plaintext(doc)
    doc
      .css("a.hashtag-cooked")
      .each { |hashtag| hashtag.replace("##{hashtag.attributes["data-slug"]}") }
  end
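
  # Illustrative example:
  #
  #   PrettyText.strip_links("Visit <a href='https://example.com'>my site</a>")
  #   # => "Visit my site"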
  def self.strip_links(string)
    return string if string.blank?

    # If the user is not basic, strip links from their bio
    fragment = Nokogiri::HTML5.fragment(string)
    fragment.css("a").each { |a| a.replace(a.inner_html) }
    fragment.to_html
  end

  def self.make_all_links_absolute(doc)
    doc
      .css("a[href]")
      .each do |a|
        begin
          href = a["href"].to_s
          next if href.blank?
          next if href.start_with?("mailto:")
          next if href.start_with?(Discourse.base_url)
          next if URI(href).host.present?

          a["href"] = (
            if href.start_with?(Discourse.base_path)
              "#{Discourse.base_url_no_prefix}#{href}"
            else
              "#{Discourse.base_url}#{href}"
            end
          )
        rescue URI::Error
          # leave it
        end
      end
  end

  def self.strip_image_wrapping(doc)
    doc.css(".lightbox-wrapper .meta").remove
  end

  def self.strip_oneboxed_media(doc)
    doc.css("audio").remove
    doc.css(".video-onebox,video").remove
  end

  def self.convert_vimeo_iframes(doc)
    doc
      .css("iframe[src*='player.vimeo.com']")
      .each do |iframe|
        if iframe["data-original-href"].present?
          vimeo_url = UrlHelper.normalized_encode(iframe["data-original-href"])
        else
          vimeo_id = iframe["src"].split("/").last.sub("?h=", "/")
          vimeo_url = "https://vimeo.com/#{vimeo_id}"
        end

        iframe.replace Nokogiri::HTML5.fragment("<p><a href='#{vimeo_url}'>#{vimeo_url}</a></p>")
      end
  end

  def self.strip_secure_uploads(doc)
    # images inside a lightbox or other link
    doc
      .css("a[href]")
      .each do |a|
        next if !Upload.secure_uploads_url?(a["href"])

        non_image_media = %w[video audio].include?(a&.parent&.name)
        target = non_image_media ? a.parent : a
        if target.to_s.include?("stripped-secure-view-media") ||
             target.to_s.include?("stripped-secure-view-upload")
          next
        end

        next if a.css("img[src]").empty? && !non_image_media

        if a.classes.include?("lightbox")
          img = a.css("img[src]").first
          srcset = img&.attributes&.[]("srcset")&.value
          if srcset
            # if available, use the first image from the srcset here
            # so we get the optimized image instead of the possibly huge original
            url = srcset.split(",").first
          else
            url = img["src"]
          end

          a.add_next_sibling secure_uploads_placeholder(
                               doc,
                               url,
                               width: img["width"],
                               height: img["height"],
                             )
          a.remove
        else
          width = non_image_media ? nil : a.at_css("img").attr("width")
          height = non_image_media ? nil : a.at_css("img").attr("height")
          target.add_next_sibling secure_uploads_placeholder(
                                    doc,
                                    a["href"],
                                    width: width,
                                    height: height,
                                  )
          target.remove
        end
      end

    # images by themselves or inside a onebox
    doc
      .css("img[src]")
      .each do |img|
        url =
          if img.parent.classes.include?("aspect-image") && img.attributes["srcset"].present?
            # we are using the first image from the srcset here so we get the
            # optimized image instead of the original, because an optimized
            # image may be used for the onebox thumbnail
            srcset = img.attributes["srcset"].value
            srcset.split(",").first
          else
            img["src"]
          end

        width = img["width"]
        height = img["height"]
        onebox_type = nil

        if img.ancestors.css(".onebox-body").any?
          if img.classes.include?("onebox-avatar-inline")
            onebox_type = "avatar-inline"
          else
            onebox_type = "thumbnail"
          end
        end

        # we always want this to be tiny and without any special styles
        if img.classes.include?("site-icon")
          onebox_type = nil
          width = 16
          height = 16
        end

        if Upload.secure_uploads_url?(url)
          img.add_next_sibling secure_uploads_placeholder(
                                 doc,
                                 url,
                                 onebox_type: onebox_type,
                                 width: width,
                                 height: height,
                               )
          img.remove
        end
      end
  end

  def self.secure_uploads_placeholder(doc, url, onebox_type: false, width: nil, height: nil)
    data_width = width ? "data-width=#{width}" : ""
    data_height = height ? "data-height=#{height}" : ""
    data_onebox_type = onebox_type ? "data-onebox-type='#{onebox_type}'" : ""

    <<~HTML
      <div class="secure-upload-notice" data-stripped-secure-upload="#{url}" #{data_onebox_type} #{data_width} #{data_height}>
        #{I18n.t("emails.secure_uploads_placeholder")} <a class='stripped-secure-view-upload' href="#{url}">#{I18n.t("emails.view_redacted_media")}</a>.
      </div>
    HTML
  end

  def self.format_for_email(html, post = nil)
    doc = Nokogiri::HTML5.fragment(html)
    DiscourseEvent.trigger(:reduce_cooked, doc, post)
    strip_secure_uploads(doc) if post&.should_secure_uploads?
    strip_image_wrapping(doc)
    convert_vimeo_iframes(doc)
    make_all_links_absolute(doc)
    doc.to_html
  end

  protected

  class JavaScriptError < StandardError
    attr_accessor :message, :backtrace

    def initialize(message, backtrace)
      @message = message
      @backtrace = backtrace
    end
  end

  def self.protect
    rval = nil
    @mutex.synchronize { rval = yield }
    rval
  end

  private

  USER_TYPE = "user"
  GROUP_TYPE = "group"
  GROUP_MENTIONABLE_TYPE = "group-mentionable"

  def self.add_mentions(doc, user_id: nil)
    elements = doc.css("span.mention")
    names = elements.map { |element| element.text[1..-1] }

    mentions = lookup_mentions(names, user_id: user_id)

    elements.each do |element|
      name = element.text[1..-1]
      name.downcase!

      if type = mentions[name]
        element.name = "a"
        element.children = PrettyText::Helpers.format_username(element.children.text)

        case type
        when USER_TYPE
          element["href"] = "#{Discourse.base_path}/u/#{UrlHelper.encode_component(name)}"
        when GROUP_MENTIONABLE_TYPE
          element["class"] = "mention-group notify"
          element["href"] = "#{Discourse.base_path}/groups/#{UrlHelper.encode_component(name)}"
        when GROUP_TYPE
          element["class"] = "mention-group"
          element["href"] = "#{Discourse.base_path}/groups/#{UrlHelper.encode_component(name)}"
        end
      end
    end
  end
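
  # Returns a hash mapping each lower-cased name to its mention type, e.g. (illustrative):
  #   { "sam" => "user", "moderators" => "group", "team" => "group-mentionable" }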
  def self.lookup_mentions(names, user_id: nil)
    return {} if names.blank?

    sql = <<~SQL
      (
        SELECT
          :user_type AS type,
          username_lower AS name
        FROM users
        WHERE username_lower IN (:names) AND staged = false
      )
      UNION
      (
        SELECT
          :group_type AS type,
          lower(name) AS name
        FROM groups
      )
      UNION
      (
        SELECT
          :group_mentionable_type AS type,
          lower(name) AS name
        FROM groups
        WHERE lower(name) IN (:names) AND (#{Group.mentionable_sql_clause(include_public: false)})
      )
      ORDER BY type
    SQL

    user = User.find_by(id: user_id)
    names.each(&:downcase!)

    results =
      DB.query(
        sql,
        names: names,
        user_type: USER_TYPE,
        group_type: GROUP_TYPE,
        group_mentionable_type: GROUP_MENTIONABLE_TYPE,
        levels: Group.alias_levels(user),
        user_id: user_id,
      )

    mentions = {}
    results.each { |result| mentions[result.name] = result.type }
    mentions
  end
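
  # The returned regexp anchors each allowed prefix, so an allowed "https://example.com"
  # matches "https://example.com/img.png" but not "https://example.com.evil.net/img.png".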
  def self.allowed_src_pattern
    allowed_src_prefixes = [
      Discourse.base_path,
      Discourse.base_url,
      GlobalSetting.s3_cdn_url,
      GlobalSetting.cdn_url,
      SiteSetting.external_emoji_url.presence,
      *SiteSetting.block_hotlinked_media_exceptions.split("|"),
    ]

    patterns =
      allowed_src_prefixes.compact.map do |url|
        pattern = Regexp.escape(url)

        # If 'https://example.com' is allowed, ensure 'https://example.com.blah.com' is not
        pattern += '(?:/|\z)' if !pattern.ends_with?("\/")

        pattern
      end

    /\A(data:|#{patterns.join("|")})/
  end
end