diff --git a/Gemfile.lock b/Gemfile.lock
index 3286fe4902d..e120a099ea0 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -273,13 +273,13 @@ GEM
net-smtp (0.5.0)
net-protocol
nio4r (2.7.4)
- nokogiri (1.16.8-aarch64-linux)
+ nokogiri (1.18.1-aarch64-linux-gnu)
racc (~> 1.4)
- nokogiri (1.16.8-arm64-darwin)
+ nokogiri (1.18.1-arm64-darwin)
racc (~> 1.4)
- nokogiri (1.16.8-x86_64-darwin)
+ nokogiri (1.18.1-x86_64-darwin)
racc (~> 1.4)
- nokogiri (1.16.8-x86_64-linux)
+ nokogiri (1.18.1-x86_64-linux-gnu)
racc (~> 1.4)
oauth (1.1.0)
oauth-tty (~> 1.0, >= 1.0.1)
diff --git a/app/services/search_indexer.rb b/app/services/search_indexer.rb
index 9ef1a4b0fcb..340fb3919fa 100644
--- a/app/services/search_indexer.rb
+++ b/app/services/search_indexer.rb
@@ -365,7 +365,7 @@ class SearchIndexer
return +"" if html.blank?
begin
- document = Nokogiri.HTML5("
#{html}
", nil, Encoding::UTF_8.to_s)
+ document = Nokogiri.HTML5("#{html}
", encoding: Encoding::UTF_8)
rescue ArgumentError
return +""
end
@@ -401,7 +401,7 @@ class SearchIndexer
end
html_scrubber = new
- Nokogiri::HTML::SAX::Parser.new(html_scrubber).parse(document.to_html)
+ Nokogiri::HTML4::SAX::Parser.new(html_scrubber, Encoding::UTF_8).parse(document.to_html)
html_scrubber.scrubbed.squish
end
diff --git a/lib/discourse_diff.rb b/lib/discourse_diff.rb
index 617b4e9fc28..3965c3b2f7f 100644
--- a/lib/discourse_diff.rb
+++ b/lib/discourse_diff.rb
@@ -277,7 +277,7 @@ class DiscourseDiff
def self.tokenize(html)
me = new
- parser = Nokogiri::HTML::SAX::Parser.new(me)
+ parser = Nokogiri::HTML4::SAX::Parser.new(me, Encoding::UTF_8)
parser.parse("#{html}")
me.tokens
end
diff --git a/lib/excerpt_parser.rb b/lib/excerpt_parser.rb
index 60318c70f57..2cb1a62a4a3 100644
--- a/lib/excerpt_parser.rb
+++ b/lib/excerpt_parser.rb
@@ -27,10 +27,11 @@ class ExcerptParser < Nokogiri::XML::SAX::Document
end
def self.get_excerpt(html, length, options)
- html ||= ""
+ return "" if html.blank?
+
length = html.length if html.include?("excerpt") && CUSTOM_EXCERPT_REGEX === html
me = self.new(length, options)
- parser = Nokogiri::HTML::SAX::Parser.new(me)
+ parser = Nokogiri::HTML4::SAX::Parser.new(me, Encoding::UTF_8)
catch(:done) { parser.parse(html) }
excerpt = me.excerpt.strip
excerpt = excerpt.gsub(/\s*\n+\s*/, "\n\n") if options[:keep_onebox_source] ||
diff --git a/lib/pretty_text.rb b/lib/pretty_text.rb
index 351b3dbc983..14945a43531 100644
--- a/lib/pretty_text.rb
+++ b/lib/pretty_text.rb
@@ -483,6 +483,8 @@ module PrettyText
end
def self.excerpt(html, max_length, options = {})
+ return "" if html.blank?
+
# TODO: properly fix this HACK in ExcerptParser without introducing XSS
doc = Nokogiri::HTML5.fragment(html)
DiscourseEvent.trigger(:reduce_excerpt, doc, options)
diff --git a/lib/retrieve_title.rb b/lib/retrieve_title.rb
index c00e27ed523..512c3b271ca 100644
--- a/lib/retrieve_title.rb
+++ b/lib/retrieve_title.rb
@@ -24,7 +24,7 @@ module RetrieveTitle
doc = nil
begin
- doc = Nokogiri.HTML5(html, nil, encoding)
+ doc = Nokogiri.HTML5(html, encoding:)
rescue ArgumentError
# invalid HTML (Eg: too many attributes, status tree too deep) - ignore
# Error in nokogumbo is not specialized, uses generic ArgumentError
diff --git a/script/import_scripts/disqus.rb b/script/import_scripts/disqus.rb
index 7edc2cc153e..646200fe332 100644
--- a/script/import_scripts/disqus.rb
+++ b/script/import_scripts/disqus.rb
@@ -17,7 +17,7 @@ class ImportScripts::Disqus < ImportScripts::Base
abort("Category #{IMPORT_CATEGORY} not found") if @category.blank?
@parser = DisqusSAX.new
- doc = Nokogiri::XML::SAX::Parser.new(@parser)
+ doc = Nokogiri::XML::SAX::Parser.new(@parser, Encoding::UTF_8)
doc.parse_file(IMPORT_FILE)
@parser.normalize