2022-08-14 02:32:34 +08:00
|
|
|
// Copyright 2022 The Gitea Authors. All rights reserved.
|
2022-11-28 02:20:29 +08:00
|
|
|
// SPDX-License-Identifier: MIT
|
2022-08-14 02:32:34 +08:00
|
|
|
|
|
|
|
package charset
|
|
|
|
|
|
|
|
import (
|
|
|
|
"fmt"
|
|
|
|
"regexp"
|
|
|
|
"strings"
|
|
|
|
"unicode"
|
|
|
|
"unicode/utf8"
|
|
|
|
|
|
|
|
"code.gitea.io/gitea/modules/translation"
|
|
|
|
|
|
|
|
"golang.org/x/net/html"
|
|
|
|
)
|
|
|
|
|
|
|
|
// defaultWordRegexp is the VS Code default word pattern. It matches either a
// number-like token (optional minus sign, optional digits, a dot, a digit and
// any trailing word characters) or a run of characters containing none of the
// listed punctuation, whitespace or control characters (\x00-\x1f).
var defaultWordRegexp = regexp.MustCompile(
	`(-?\d*\.\d\w*)` + // number-like token
		`|([^\` + "`" + `\~\!\@\#\$\%\^\&\*\(\)\-\=\+\[\{\]\}\\\|\;\:\'\"\,\.\<\>\/\?\s\x00-\x1f]+)`, // run of non-separator characters
)
|
|
|
|
|
|
|
|
func NewEscapeStreamer(locale translation.Locale, next HTMLStreamer, allowed ...rune) HTMLStreamer {
|
Fix isAllowed of escapeStreamer (#22814)
The use of `sort.Search` is wrong: The slice should be sorted, and
`return >= 0` doen't mean it exists, see the
[manual](https://pkg.go.dev/sort#Search).
Could be fixed like this if we really need it:
```diff
diff --git a/modules/charset/escape_stream.go b/modules/charset/escape_stream.go
index 823b63513..fcf1ffbc1 100644
--- a/modules/charset/escape_stream.go
+++ b/modules/charset/escape_stream.go
@@ -20,6 +20,9 @@ import (
var defaultWordRegexp = regexp.MustCompile(`(-?\d*\.\d\w*)|([^\` + "`" + `\~\!\@\#\$\%\^\&\*\(\)\-\=\+\[\{\]\}\\\|\;\:\'\"\,\.\<\>\/\?\s\x00-\x1f]+)`)
func NewEscapeStreamer(locale translation.Locale, next HTMLStreamer, allowed ...rune) HTMLStreamer {
+ sort.Slice(allowed, func(i, j int) bool {
+ return allowed[i] < allowed[j]
+ })
return &escapeStreamer{
escaped: &EscapeStatus{},
PassthroughHTMLStreamer: *NewPassthroughStreamer(next),
@@ -284,14 +287,8 @@ func (e *escapeStreamer) runeTypes(runes ...rune) (types []runeType, confusables
}
func (e *escapeStreamer) isAllowed(r rune) bool {
- if len(e.allowed) == 0 {
- return false
- }
- if len(e.allowed) == 1 {
- return e.allowed[0] == r
- }
-
- return sort.Search(len(e.allowed), func(i int) bool {
+ i := sort.Search(len(e.allowed), func(i int) bool {
return e.allowed[i] >= r
- }) >= 0
+ })
+ return i < len(e.allowed) && e.allowed[i] == r
}
```
But I don't think so, a map is better to do it.
2023-02-09 20:51:36 +08:00
|
|
|
allowedM := make(map[rune]bool, len(allowed))
|
|
|
|
for _, v := range allowed {
|
|
|
|
allowedM[v] = true
|
|
|
|
}
|
2022-08-14 02:32:34 +08:00
|
|
|
return &escapeStreamer{
|
|
|
|
escaped: &EscapeStatus{},
|
|
|
|
PassthroughHTMLStreamer: *NewPassthroughStreamer(next),
|
|
|
|
locale: locale,
|
|
|
|
ambiguousTables: AmbiguousTablesForLocale(locale),
|
Fix isAllowed of escapeStreamer (#22814)
The use of `sort.Search` is wrong: The slice should be sorted, and
`return >= 0` doen't mean it exists, see the
[manual](https://pkg.go.dev/sort#Search).
Could be fixed like this if we really need it:
```diff
diff --git a/modules/charset/escape_stream.go b/modules/charset/escape_stream.go
index 823b63513..fcf1ffbc1 100644
--- a/modules/charset/escape_stream.go
+++ b/modules/charset/escape_stream.go
@@ -20,6 +20,9 @@ import (
var defaultWordRegexp = regexp.MustCompile(`(-?\d*\.\d\w*)|([^\` + "`" + `\~\!\@\#\$\%\^\&\*\(\)\-\=\+\[\{\]\}\\\|\;\:\'\"\,\.\<\>\/\?\s\x00-\x1f]+)`)
func NewEscapeStreamer(locale translation.Locale, next HTMLStreamer, allowed ...rune) HTMLStreamer {
+ sort.Slice(allowed, func(i, j int) bool {
+ return allowed[i] < allowed[j]
+ })
return &escapeStreamer{
escaped: &EscapeStatus{},
PassthroughHTMLStreamer: *NewPassthroughStreamer(next),
@@ -284,14 +287,8 @@ func (e *escapeStreamer) runeTypes(runes ...rune) (types []runeType, confusables
}
func (e *escapeStreamer) isAllowed(r rune) bool {
- if len(e.allowed) == 0 {
- return false
- }
- if len(e.allowed) == 1 {
- return e.allowed[0] == r
- }
-
- return sort.Search(len(e.allowed), func(i int) bool {
+ i := sort.Search(len(e.allowed), func(i int) bool {
return e.allowed[i] >= r
- }) >= 0
+ })
+ return i < len(e.allowed) && e.allowed[i] == r
}
```
But I don't think so, a map is better to do it.
2023-02-09 20:51:36 +08:00
|
|
|
allowed: allowedM,
|
2022-08-14 02:32:34 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
type escapeStreamer struct {
|
|
|
|
PassthroughHTMLStreamer
|
|
|
|
escaped *EscapeStatus
|
|
|
|
locale translation.Locale
|
|
|
|
ambiguousTables []*AmbiguousTable
|
Fix isAllowed of escapeStreamer (#22814)
The use of `sort.Search` is wrong: The slice should be sorted, and
`return >= 0` doen't mean it exists, see the
[manual](https://pkg.go.dev/sort#Search).
Could be fixed like this if we really need it:
```diff
diff --git a/modules/charset/escape_stream.go b/modules/charset/escape_stream.go
index 823b63513..fcf1ffbc1 100644
--- a/modules/charset/escape_stream.go
+++ b/modules/charset/escape_stream.go
@@ -20,6 +20,9 @@ import (
var defaultWordRegexp = regexp.MustCompile(`(-?\d*\.\d\w*)|([^\` + "`" + `\~\!\@\#\$\%\^\&\*\(\)\-\=\+\[\{\]\}\\\|\;\:\'\"\,\.\<\>\/\?\s\x00-\x1f]+)`)
func NewEscapeStreamer(locale translation.Locale, next HTMLStreamer, allowed ...rune) HTMLStreamer {
+ sort.Slice(allowed, func(i, j int) bool {
+ return allowed[i] < allowed[j]
+ })
return &escapeStreamer{
escaped: &EscapeStatus{},
PassthroughHTMLStreamer: *NewPassthroughStreamer(next),
@@ -284,14 +287,8 @@ func (e *escapeStreamer) runeTypes(runes ...rune) (types []runeType, confusables
}
func (e *escapeStreamer) isAllowed(r rune) bool {
- if len(e.allowed) == 0 {
- return false
- }
- if len(e.allowed) == 1 {
- return e.allowed[0] == r
- }
-
- return sort.Search(len(e.allowed), func(i int) bool {
+ i := sort.Search(len(e.allowed), func(i int) bool {
return e.allowed[i] >= r
- }) >= 0
+ })
+ return i < len(e.allowed) && e.allowed[i] == r
}
```
But I don't think so, a map is better to do it.
2023-02-09 20:51:36 +08:00
|
|
|
allowed map[rune]bool
|
2022-08-14 02:32:34 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
func (e *escapeStreamer) EscapeStatus() *EscapeStatus {
|
|
|
|
return e.escaped
|
|
|
|
}
|
|
|
|
|
|
|
|
// Text tells the next streamer there is a text
|
|
|
|
func (e *escapeStreamer) Text(data string) error {
|
|
|
|
sb := &strings.Builder{}
|
2023-04-23 02:53:00 +08:00
|
|
|
var until int
|
|
|
|
var next int
|
|
|
|
pos := 0
|
2022-08-14 02:32:34 +08:00
|
|
|
if len(data) > len(UTF8BOM) && data[:len(UTF8BOM)] == string(UTF8BOM) {
|
|
|
|
_, _ = sb.WriteString(data[:len(UTF8BOM)])
|
|
|
|
pos = len(UTF8BOM)
|
|
|
|
}
|
2022-08-24 19:50:13 +08:00
|
|
|
dataBytes := []byte(data)
|
2022-08-14 02:32:34 +08:00
|
|
|
for pos < len(data) {
|
|
|
|
nextIdxs := defaultWordRegexp.FindStringIndex(data[pos:])
|
|
|
|
if nextIdxs == nil {
|
|
|
|
until = len(data)
|
|
|
|
next = until
|
|
|
|
} else {
|
|
|
|
until, next = nextIdxs[0]+pos, nextIdxs[1]+pos
|
|
|
|
}
|
|
|
|
|
|
|
|
// from pos until until we know that the runes are not \r\t\n or even ' '
|
|
|
|
runes := make([]rune, 0, next-until)
|
|
|
|
positions := make([]int, 0, next-until+1)
|
|
|
|
|
|
|
|
for pos < until {
|
2022-08-24 19:50:13 +08:00
|
|
|
r, sz := utf8.DecodeRune(dataBytes[pos:])
|
2022-08-14 02:32:34 +08:00
|
|
|
positions = positions[:0]
|
|
|
|
positions = append(positions, pos, pos+sz)
|
|
|
|
types, confusables, _ := e.runeTypes(r)
|
2022-08-24 19:50:13 +08:00
|
|
|
if err := e.handleRunes(dataBytes, []rune{r}, positions, types, confusables, sb); err != nil {
|
2022-08-14 02:32:34 +08:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
pos += sz
|
|
|
|
}
|
|
|
|
|
|
|
|
for i := pos; i < next; {
|
2022-08-24 19:50:13 +08:00
|
|
|
r, sz := utf8.DecodeRune(dataBytes[i:])
|
2022-08-14 02:32:34 +08:00
|
|
|
runes = append(runes, r)
|
|
|
|
positions = append(positions, i)
|
|
|
|
i += sz
|
|
|
|
}
|
|
|
|
positions = append(positions, next)
|
|
|
|
types, confusables, runeCounts := e.runeTypes(runes...)
|
|
|
|
if runeCounts.needsEscape() {
|
2022-08-24 19:50:13 +08:00
|
|
|
if err := e.handleRunes(dataBytes, runes, positions, types, confusables, sb); err != nil {
|
2022-08-14 02:32:34 +08:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
} else {
|
2022-08-24 19:50:13 +08:00
|
|
|
_, _ = sb.Write(dataBytes[pos:next])
|
2022-08-14 02:32:34 +08:00
|
|
|
}
|
|
|
|
pos = next
|
|
|
|
}
|
|
|
|
if sb.Len() > 0 {
|
|
|
|
if err := e.PassthroughHTMLStreamer.Text(sb.String()); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2022-08-24 19:50:13 +08:00
|
|
|
func (e *escapeStreamer) handleRunes(data []byte, runes []rune, positions []int, types []runeType, confusables []rune, sb *strings.Builder) error {
|
2022-08-14 02:32:34 +08:00
|
|
|
for i, r := range runes {
|
|
|
|
switch types[i] {
|
|
|
|
case brokenRuneType:
|
|
|
|
if sb.Len() > 0 {
|
|
|
|
if err := e.PassthroughHTMLStreamer.Text(sb.String()); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
sb.Reset()
|
|
|
|
}
|
|
|
|
end := positions[i+1]
|
|
|
|
start := positions[i]
|
2022-08-24 19:50:13 +08:00
|
|
|
if err := e.brokenRune(data[start:end]); err != nil {
|
2022-08-14 02:32:34 +08:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
case ambiguousRuneType:
|
|
|
|
if sb.Len() > 0 {
|
|
|
|
if err := e.PassthroughHTMLStreamer.Text(sb.String()); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
sb.Reset()
|
|
|
|
}
|
|
|
|
if err := e.ambiguousRune(r, confusables[0]); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
confusables = confusables[1:]
|
|
|
|
case invisibleRuneType:
|
|
|
|
if sb.Len() > 0 {
|
|
|
|
if err := e.PassthroughHTMLStreamer.Text(sb.String()); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
sb.Reset()
|
|
|
|
}
|
|
|
|
if err := e.invisibleRune(r); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
default:
|
|
|
|
_, _ = sb.WriteRune(r)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (e *escapeStreamer) brokenRune(bs []byte) error {
|
|
|
|
e.escaped.Escaped = true
|
|
|
|
e.escaped.HasBadRunes = true
|
|
|
|
|
|
|
|
if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{
|
|
|
|
Key: "class",
|
|
|
|
Val: "broken-code-point",
|
|
|
|
}); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
if err := e.PassthroughHTMLStreamer.Text(fmt.Sprintf("<%X>", bs)); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
return e.PassthroughHTMLStreamer.EndTag("span")
|
|
|
|
}
|
|
|
|
|
|
|
|
func (e *escapeStreamer) ambiguousRune(r, c rune) error {
|
|
|
|
e.escaped.Escaped = true
|
|
|
|
e.escaped.HasAmbiguous = true
|
|
|
|
|
|
|
|
if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{
|
|
|
|
Key: "class",
|
2023-03-24 18:35:38 +08:00
|
|
|
Val: "ambiguous-code-point",
|
2022-08-14 02:32:34 +08:00
|
|
|
}, html.Attribute{
|
2023-03-24 18:35:38 +08:00
|
|
|
Key: "data-tooltip-content",
|
2022-08-14 02:32:34 +08:00
|
|
|
Val: e.locale.Tr("repo.ambiguous_character", r, c),
|
|
|
|
}); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{
|
|
|
|
Key: "class",
|
|
|
|
Val: "char",
|
|
|
|
}); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
if err := e.PassthroughHTMLStreamer.Text(string(r)); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
if err := e.PassthroughHTMLStreamer.EndTag("span"); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
return e.PassthroughHTMLStreamer.EndTag("span")
|
|
|
|
}
|
|
|
|
|
|
|
|
func (e *escapeStreamer) invisibleRune(r rune) error {
|
|
|
|
e.escaped.Escaped = true
|
|
|
|
e.escaped.HasInvisible = true
|
|
|
|
|
|
|
|
if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{
|
|
|
|
Key: "class",
|
|
|
|
Val: "escaped-code-point",
|
|
|
|
}, html.Attribute{
|
|
|
|
Key: "data-escaped",
|
|
|
|
Val: fmt.Sprintf("[U+%04X]", r),
|
|
|
|
}); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{
|
|
|
|
Key: "class",
|
|
|
|
Val: "char",
|
|
|
|
}); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
if err := e.PassthroughHTMLStreamer.Text(string(r)); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
if err := e.PassthroughHTMLStreamer.EndTag("span"); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
return e.PassthroughHTMLStreamer.EndTag("span")
|
|
|
|
}
|
|
|
|
|
|
|
|
// runeCountType tallies how many runes of each classification were found in
// a group of runes.
type runeCountType struct {
	numBasicRunes                int
	numNonConfusingNonBasicRunes int
	numAmbiguousRunes            int
	numInvisibleRunes            int
	numBrokenRunes               int
}

// needsEscape reports whether a group with these counts has to be escaped.
//
// Any broken rune forces escaping. A group with no basic runes and at least
// one non-confusing non-basic rune is never escaped, even if it also holds
// ambiguous or invisible runes. Otherwise the group is escaped when it
// contains an ambiguous or invisible rune.
func (counts runeCountType) needsEscape() bool {
	switch {
	case counts.numBrokenRunes > 0:
		return true
	case counts.numBasicRunes == 0 && counts.numNonConfusingNonBasicRunes > 0:
		return false
	default:
		return counts.numAmbiguousRunes > 0 || counts.numInvisibleRunes > 0
	}
}
|
|
|
|
|
|
|
|
// runeType is the classification runeTypes assigns to a single rune.
type runeType int

const (
	// basicASCIIRuneType is the zero value, so it is what an entry holds when
	// no other type is assigned. It is technically dead code, but it is
	// self-documenting and should stay.
	basicASCIIRuneType runeType = iota
	// brokenRuneType marks a rune equal to utf8.RuneError.
	brokenRuneType
	// nonBasicASCIIRuneType marks a rune outside 0x20-0x7e that is neither
	// invisible nor ambiguous.
	nonBasicASCIIRuneType
	// ambiguousRuneType marks a rune reported by isAmbiguous.
	ambiguousRuneType
	// invisibleRuneType marks a rune in InvisibleRanges or a control rune.
	invisibleRuneType
)
|
|
|
|
|
|
|
|
func (e *escapeStreamer) runeTypes(runes ...rune) (types []runeType, confusables []rune, runeCounts runeCountType) {
|
|
|
|
types = make([]runeType, len(runes))
|
|
|
|
for i, r := range runes {
|
|
|
|
var confusable rune
|
|
|
|
switch {
|
|
|
|
case r == utf8.RuneError:
|
|
|
|
types[i] = brokenRuneType
|
|
|
|
runeCounts.numBrokenRunes++
|
|
|
|
case r == ' ' || r == '\t' || r == '\n':
|
|
|
|
runeCounts.numBasicRunes++
|
Fix isAllowed of escapeStreamer (#22814)
The use of `sort.Search` is wrong: The slice should be sorted, and
`return >= 0` doen't mean it exists, see the
[manual](https://pkg.go.dev/sort#Search).
Could be fixed like this if we really need it:
```diff
diff --git a/modules/charset/escape_stream.go b/modules/charset/escape_stream.go
index 823b63513..fcf1ffbc1 100644
--- a/modules/charset/escape_stream.go
+++ b/modules/charset/escape_stream.go
@@ -20,6 +20,9 @@ import (
var defaultWordRegexp = regexp.MustCompile(`(-?\d*\.\d\w*)|([^\` + "`" + `\~\!\@\#\$\%\^\&\*\(\)\-\=\+\[\{\]\}\\\|\;\:\'\"\,\.\<\>\/\?\s\x00-\x1f]+)`)
func NewEscapeStreamer(locale translation.Locale, next HTMLStreamer, allowed ...rune) HTMLStreamer {
+ sort.Slice(allowed, func(i, j int) bool {
+ return allowed[i] < allowed[j]
+ })
return &escapeStreamer{
escaped: &EscapeStatus{},
PassthroughHTMLStreamer: *NewPassthroughStreamer(next),
@@ -284,14 +287,8 @@ func (e *escapeStreamer) runeTypes(runes ...rune) (types []runeType, confusables
}
func (e *escapeStreamer) isAllowed(r rune) bool {
- if len(e.allowed) == 0 {
- return false
- }
- if len(e.allowed) == 1 {
- return e.allowed[0] == r
- }
-
- return sort.Search(len(e.allowed), func(i int) bool {
+ i := sort.Search(len(e.allowed), func(i int) bool {
return e.allowed[i] >= r
- }) >= 0
+ })
+ return i < len(e.allowed) && e.allowed[i] == r
}
```
But I don't think so, a map is better to do it.
2023-02-09 20:51:36 +08:00
|
|
|
case e.allowed[r]:
|
2022-08-14 02:32:34 +08:00
|
|
|
if r > 0x7e || r < 0x20 {
|
|
|
|
types[i] = nonBasicASCIIRuneType
|
|
|
|
runeCounts.numNonConfusingNonBasicRunes++
|
|
|
|
} else {
|
|
|
|
runeCounts.numBasicRunes++
|
|
|
|
}
|
|
|
|
case unicode.Is(InvisibleRanges, r):
|
|
|
|
types[i] = invisibleRuneType
|
|
|
|
runeCounts.numInvisibleRunes++
|
|
|
|
case unicode.IsControl(r):
|
|
|
|
types[i] = invisibleRuneType
|
|
|
|
runeCounts.numInvisibleRunes++
|
|
|
|
case isAmbiguous(r, &confusable, e.ambiguousTables...):
|
|
|
|
confusables = append(confusables, confusable)
|
|
|
|
types[i] = ambiguousRuneType
|
|
|
|
runeCounts.numAmbiguousRunes++
|
|
|
|
case r > 0x7e || r < 0x20:
|
|
|
|
types[i] = nonBasicASCIIRuneType
|
|
|
|
runeCounts.numNonConfusingNonBasicRunes++
|
|
|
|
default:
|
|
|
|
runeCounts.numBasicRunes++
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return types, confusables, runeCounts
|
|
|
|
}
|