From affe26f0dd8ecb9f9adc4024c7d68524a7e6665c Mon Sep 17 00:00:00 2001 From: Jarek Radosz Date: Tue, 7 Jan 2025 12:05:39 +0100 Subject: [PATCH] DEV: Update nokogiri to 1.18.1 (#30554) Nokogiri/libxml is now more strict in terms of params it receives. It uses kwargs instead of an options object (I fixed an issue there in #30545), doesn't accept nil/blank html (fixed here), and, most importantly, handles encoding in a different way: it seems to require explicitly specifying UTF-8. * Build(deps): Bump nokogiri from 1.16.8 to 1.18.1 Bumps [nokogiri](https://github.com/sparklemotion/nokogiri) from 1.16.8 to 1.18.1. - [Release notes](https://github.com/sparklemotion/nokogiri/releases) - [Changelog](https://github.com/sparklemotion/nokogiri/blob/main/CHANGELOG.md) - [Commits](https://github.com/sparklemotion/nokogiri/compare/v1.16.8...v1.18.1) --- updated-dependencies: - dependency-name: nokogiri dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Gemfile.lock | 8 ++++---- app/services/search_indexer.rb | 4 ++-- lib/discourse_diff.rb | 2 +- lib/excerpt_parser.rb | 5 +++-- lib/pretty_text.rb | 2 ++ lib/retrieve_title.rb | 2 +- script/import_scripts/disqus.rb | 2 +- 7 files changed, 14 insertions(+), 11 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index 3286fe4902d..e120a099ea0 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -273,13 +273,13 @@ GEM net-smtp (0.5.0) net-protocol nio4r (2.7.4) - nokogiri (1.16.8-aarch64-linux) + nokogiri (1.18.1-aarch64-linux-gnu) racc (~> 1.4) - nokogiri (1.16.8-arm64-darwin) + nokogiri (1.18.1-arm64-darwin) racc (~> 1.4) - nokogiri (1.16.8-x86_64-darwin) + nokogiri (1.18.1-x86_64-darwin) racc (~> 1.4) - nokogiri (1.16.8-x86_64-linux) + nokogiri (1.18.1-x86_64-linux-gnu) racc (~> 1.4) oauth (1.1.0) oauth-tty (~> 1.0, >= 1.0.1) diff --git a/app/services/search_indexer.rb b/app/services/search_indexer.rb 
index 9ef1a4b0fcb..340fb3919fa 100644 --- a/app/services/search_indexer.rb +++ b/app/services/search_indexer.rb @@ -365,7 +365,7 @@ class SearchIndexer return +"" if html.blank? begin - document = Nokogiri.HTML5("
#{html}
", nil, Encoding::UTF_8.to_s) + document = Nokogiri.HTML5("
#{html}
", encoding: Encoding::UTF_8) rescue ArgumentError return +"" end @@ -401,7 +401,7 @@ class SearchIndexer end html_scrubber = new - Nokogiri::HTML::SAX::Parser.new(html_scrubber).parse(document.to_html) + Nokogiri::HTML4::SAX::Parser.new(html_scrubber, Encoding::UTF_8).parse(document.to_html) html_scrubber.scrubbed.squish end diff --git a/lib/discourse_diff.rb b/lib/discourse_diff.rb index 617b4e9fc28..3965c3b2f7f 100644 --- a/lib/discourse_diff.rb +++ b/lib/discourse_diff.rb @@ -277,7 +277,7 @@ class DiscourseDiff def self.tokenize(html) me = new - parser = Nokogiri::HTML::SAX::Parser.new(me) + parser = Nokogiri::HTML4::SAX::Parser.new(me, Encoding::UTF_8) parser.parse("#{html}") me.tokens end diff --git a/lib/excerpt_parser.rb b/lib/excerpt_parser.rb index 60318c70f57..2cb1a62a4a3 100644 --- a/lib/excerpt_parser.rb +++ b/lib/excerpt_parser.rb @@ -27,10 +27,11 @@ class ExcerptParser < Nokogiri::XML::SAX::Document end def self.get_excerpt(html, length, options) - html ||= "" + return "" if html.blank? + length = html.length if html.include?("excerpt") && CUSTOM_EXCERPT_REGEX === html me = self.new(length, options) - parser = Nokogiri::HTML::SAX::Parser.new(me) + parser = Nokogiri::HTML4::SAX::Parser.new(me, Encoding::UTF_8) catch(:done) { parser.parse(html) } excerpt = me.excerpt.strip excerpt = excerpt.gsub(/\s*\n+\s*/, "\n\n") if options[:keep_onebox_source] || diff --git a/lib/pretty_text.rb b/lib/pretty_text.rb index 351b3dbc983..14945a43531 100644 --- a/lib/pretty_text.rb +++ b/lib/pretty_text.rb @@ -483,6 +483,8 @@ module PrettyText end def self.excerpt(html, max_length, options = {}) + return "" if html.blank? 
+ # TODO: properly fix this HACK in ExcerptParser without introducing XSS doc = Nokogiri::HTML5.fragment(html) DiscourseEvent.trigger(:reduce_excerpt, doc, options) diff --git a/lib/retrieve_title.rb b/lib/retrieve_title.rb index c00e27ed523..512c3b271ca 100644 --- a/lib/retrieve_title.rb +++ b/lib/retrieve_title.rb @@ -24,7 +24,7 @@ module RetrieveTitle doc = nil begin - doc = Nokogiri.HTML5(html, nil, encoding) + doc = Nokogiri.HTML5(html, encoding:) rescue ArgumentError # invalid HTML (Eg: too many attributes, status tree too deep) - ignore # Error in nokogumbo is not specialized, uses generic ArgumentError diff --git a/script/import_scripts/disqus.rb b/script/import_scripts/disqus.rb index 7edc2cc153e..646200fe332 100644 --- a/script/import_scripts/disqus.rb +++ b/script/import_scripts/disqus.rb @@ -17,7 +17,7 @@ class ImportScripts::Disqus < ImportScripts::Base abort("Category #{IMPORT_CATEGORY} not found") if @category.blank? @parser = DisqusSAX.new - doc = Nokogiri::XML::SAX::Parser.new(@parser) + doc = Nokogiri::XML::SAX::Parser.new(@parser, Encoding::UTF_8) doc.parse_file(IMPORT_FILE) @parser.normalize