DEV: Update nokogiri to 1.18.1 (#30554)

Nokogiri/libxml is now stricter about the params it receives.

It uses kwargs instead of an options object (I fixed an issue there in #30545), doesn't accept nil/blank HTML (fixed here), and, most importantly, handles encoding in a different way: it seems to require explicitly specifying UTF-8.

* Build(deps): Bump nokogiri from 1.16.8 to 1.18.1

Bumps [nokogiri](https://github.com/sparklemotion/nokogiri) from 1.16.8 to 1.18.1.
- [Release notes](https://github.com/sparklemotion/nokogiri/releases)
- [Changelog](https://github.com/sparklemotion/nokogiri/blob/main/CHANGELOG.md)
- [Commits](https://github.com/sparklemotion/nokogiri/compare/v1.16.8...v1.18.1)

---
updated-dependencies:
- dependency-name: nokogiri
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
This commit is contained in:
Jarek Radosz 2025-01-07 12:05:39 +01:00 committed by GitHub
parent c1a46995a7
commit affe26f0dd
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 14 additions and 11 deletions

View File

@ -273,13 +273,13 @@ GEM
net-smtp (0.5.0)
net-protocol
nio4r (2.7.4)
nokogiri (1.16.8-aarch64-linux)
nokogiri (1.18.1-aarch64-linux-gnu)
racc (~> 1.4)
nokogiri (1.16.8-arm64-darwin)
nokogiri (1.18.1-arm64-darwin)
racc (~> 1.4)
nokogiri (1.16.8-x86_64-darwin)
nokogiri (1.18.1-x86_64-darwin)
racc (~> 1.4)
nokogiri (1.16.8-x86_64-linux)
nokogiri (1.18.1-x86_64-linux-gnu)
racc (~> 1.4)
oauth (1.1.0)
oauth-tty (~> 1.0, >= 1.0.1)

View File

@ -365,7 +365,7 @@ class SearchIndexer
return +"" if html.blank?
begin
document = Nokogiri.HTML5("<div>#{html}</div>", nil, Encoding::UTF_8.to_s)
document = Nokogiri.HTML5("<div>#{html}</div>", encoding: Encoding::UTF_8)
rescue ArgumentError
return +""
end
@ -401,7 +401,7 @@ class SearchIndexer
end
html_scrubber = new
Nokogiri::HTML::SAX::Parser.new(html_scrubber).parse(document.to_html)
Nokogiri::HTML4::SAX::Parser.new(html_scrubber, Encoding::UTF_8).parse(document.to_html)
html_scrubber.scrubbed.squish
end

View File

@ -277,7 +277,7 @@ class DiscourseDiff
def self.tokenize(html)
me = new
parser = Nokogiri::HTML::SAX::Parser.new(me)
parser = Nokogiri::HTML4::SAX::Parser.new(me, Encoding::UTF_8)
parser.parse("<html><body>#{html}</body></html>")
me.tokens
end

View File

@ -27,10 +27,11 @@ class ExcerptParser < Nokogiri::XML::SAX::Document
end
def self.get_excerpt(html, length, options)
html ||= ""
return "" if html.blank?
length = html.length if html.include?("excerpt") && CUSTOM_EXCERPT_REGEX === html
me = self.new(length, options)
parser = Nokogiri::HTML::SAX::Parser.new(me)
parser = Nokogiri::HTML4::SAX::Parser.new(me, Encoding::UTF_8)
catch(:done) { parser.parse(html) }
excerpt = me.excerpt.strip
excerpt = excerpt.gsub(/\s*\n+\s*/, "\n\n") if options[:keep_onebox_source] ||

View File

@ -483,6 +483,8 @@ module PrettyText
end
def self.excerpt(html, max_length, options = {})
return "" if html.blank?
# TODO: properly fix this HACK in ExcerptParser without introducing XSS
doc = Nokogiri::HTML5.fragment(html)
DiscourseEvent.trigger(:reduce_excerpt, doc, options)

View File

@ -24,7 +24,7 @@ module RetrieveTitle
doc = nil
begin
doc = Nokogiri.HTML5(html, nil, encoding)
doc = Nokogiri.HTML5(html, encoding:)
rescue ArgumentError
# invalid HTML (Eg: too many attributes, status tree too deep) - ignore
# Error in nokogumbo is not specialized, uses generic ArgumentError

View File

@ -17,7 +17,7 @@ class ImportScripts::Disqus < ImportScripts::Base
abort("Category #{IMPORT_CATEGORY} not found") if @category.blank?
@parser = DisqusSAX.new
doc = Nokogiri::XML::SAX::Parser.new(@parser)
doc = Nokogiri::XML::SAX::Parser.new(@parser, Encoding::UTF_8)
doc.parse_file(IMPORT_FILE)
@parser.normalize