From 0e44fab5d69787565d831dd674eb4961068d67b1 Mon Sep 17 00:00:00 2001 From: mrsdizzie Date: Fri, 29 May 2020 17:12:53 -0400 Subject: [PATCH] Update emoji regex (#11584) (#11679) When matching emoji, use a regex built from the data we have instead of something generic using unicode ranges. A generic regex can't tell the difference between two separate emoji next to each other or one emoji that is built out of two separate emoji next to each other. This means that emoji that are next to each other without a space in between will now be accurately spanned individually with the proper title etc... --- modules/emoji/emoji.go | 27 +++++++++++++++++++++++++++ modules/markup/html.go | 7 +------ modules/markup/html_test.go | 4 +++- 3 files changed, 31 insertions(+), 7 deletions(-) diff --git a/modules/emoji/emoji.go b/modules/emoji/emoji.go index 2a51e61fcf..e4b9e5631d 100644 --- a/modules/emoji/emoji.go +++ b/modules/emoji/emoji.go @@ -6,8 +6,10 @@ package emoji import ( + "sort" "strings" "sync" + "unicode/utf8" ) // Gemoji is a set of emoji data. @@ -48,6 +50,12 @@ func loadMap() { // process emoji codes and aliases codePairs := make([]string, 0) aliasPairs := make([]string, 0) + + // sort from largest to small so we match combined emoji first + sort.Slice(GemojiData, func(i, j int) bool { + return len(GemojiData[i].Emoji) > len(GemojiData[j].Emoji) + }) + for i, e := range GemojiData { if e.Emoji == "" || len(e.Aliases) == 0 { continue @@ -72,6 +80,7 @@ func loadMap() { codeReplacer = strings.NewReplacer(codePairs...) aliasReplacer = strings.NewReplacer(aliasPairs...) 
}) + } // FromCode retrieves the emoji data based on the provided unicode code (ie, @@ -117,3 +126,21 @@ func ReplaceAliases(s string) string { loadMap() return aliasReplacer.Replace(s) } + +// FindEmojiSubmatchIndex returns index pair of longest emoji in a string +func FindEmojiSubmatchIndex(s string) []int { + loadMap() + + // if rune and string length are the same then no emoji will be present + // similar performance when there is unicode present but almost 200% faster when not + if utf8.RuneCountInString(s) == len(s) { + return nil + } + for j := range GemojiData { + i := strings.Index(s, GemojiData[j].Emoji) + if i != -1 { + return []int{i, i + len(GemojiData[j].Emoji)} + } + } + return nil +} diff --git a/modules/markup/html.go b/modules/markup/html.go index 8fbfee6a53..41248654d8 100644 --- a/modules/markup/html.go +++ b/modules/markup/html.go @@ -65,10 +65,6 @@ var ( // EmojiShortCodeRegex find emoji by alias like :smile: EmojiShortCodeRegex = regexp.MustCompile(`\:[\w\+\-]+\:{1}`) - - // find emoji literal: search all emoji hex range as many times as they appear as - // some emojis (skin color etc..) are just two or more chained together - emojiRegex = regexp.MustCompile(`[\x{1F000}-\x{1FFFF}|\x{2000}-\x{32ff}|\x{fe4e5}-\x{fe4ee}|\x{200D}|\x{FE0F}|\x{e0000}-\x{e007f}]+`) ) // CSS class for action keywords (e.g. "closes: #1") @@ -922,8 +918,7 @@ func emojiShortCodeProcessor(ctx *postProcessCtx, node *html.Node) { // emoji processor to match emoji and add emoji class func emojiProcessor(ctx *postProcessCtx, node *html.Node) { - m := emojiRegex.FindStringSubmatchIndex(node.Data) - + m := emoji.FindEmojiSubmatchIndex(node.Data) if m == nil { return } diff --git a/modules/markup/html_test.go b/modules/markup/html_test.go index 65d2d327d6..686057d11f 100644 --- a/modules/markup/html_test.go +++ b/modules/markup/html_test.go @@ -263,7 +263,9 @@ func TestRender_emoji(t *testing.T) { test( "Some text with :smile: in the middle", `

Some text with 😄 in the middle

`) - + test( + "Some text with 😄😄 2 emoji next to each other", + `

Some text with 😄😄 2 emoji next to each other

`) // should match nothing test( "2001:0db8:85a3:0000:0000:8a2e:0370:7334",