1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
|
// Package levenshtein implements the algorithm for calculating the "distance" between two strings
package levenshtein
import (
"strings"
"unicode"
"unicode/utf8"
"golang.org/x/text/runes"
"golang.org/x/text/transform"
"golang.org/x/text/unicode/norm"
)
// levenshtein is an algorithm that returns the "distance" between two strings
// the distance for hello and helloxd is 2 because it takes two inserts to go from hello to helloxd
// the distance between hello and hello is 0 because the strings are equal
// apart from insertions, the levenshtein algorithm also takes substitutions and deletions into account
// levenshtein implementation from https://en.wikipedia.org/wiki/Levenshtein_distance#Iterative_with_two_matrix_rows
func levenshtein(os, ot string) int {
n := utf8.RuneCountInString(os)
m := utf8.RuneCountInString(ot)
s := []rune(os)
t := []rune(ot)
v0 := make([]int, m+1)
v1 := make([]int, m+1)
for i := 0; i <= m; i++ {
v0[i] = i
}
// loop through every word in the first string
for i := range n {
v1[0] = i + 1
for j := range m {
// calculate deletion cost,
// insertion cost and
// substitution cost to get from the string
// to the target
dc := v0[j+1] + 1
ic := v1[j] + 1
var sc int
if s[i] == t[j] {
sc = v0[j]
} else {
sc = v0[j] + 1
}
// take the min of all the costs
v1[j+1] = min(min(dc, ic), sc)
}
v0, v1 = v1, v0
}
return v0[m]
}
// adjusted creates and adjusted version of the levenshtein algorithm
// where it filters entries where one of the words in the substr is not contained in `full`
// for these a score of -1 returned
// for all others it is the normal levenshtein distance
func adjusted(substr, full string) int {
sSub := strings.SplitSeq(substr, " ")
for vSub := range sSub {
if !strings.Contains(full, vSub) {
return -1
}
}
return levenshtein(substr, full)
}
// KeywordPenalty is the penalty for matching on keywords instead of display names
// We have a penalty for matching on keywords because currently there are a lot of generic keywords
const KeywordPenalty = 2
// DiscoveryScore computes the score of a discovery entry with the given search query
// a negative score means exclude the entry from the results
func DiscoveryScore(search string, displays map[string]string, keywords map[string]string) int {
search = normalize(search)
scoreDN := -1
for _, v := range displays {
score := adjusted(search, normalize(v))
// set the smallest non-zero score
if (score >= 0 && score < scoreDN) || scoreDN == -1 {
scoreDN = score
}
}
scoreKW := -1
for _, v := range keywords {
score := KeywordPenalty * adjusted(search, normalize(v))
if score == 0 {
score = KeywordPenalty
}
// set the smallest non-zero score
if (score >= 0 && score < scoreKW) || scoreKW == -1 {
scoreKW = score
}
}
// if both scores are positive, return the min
if scoreDN >= 0 && scoreKW >= 0 {
return min(scoreDN, scoreKW)
}
// scoreKW is negative, return scoreDN
if scoreDN >= 0 {
return scoreDN
}
// scoreDN is negative, return scoreKW
return scoreKW
}
// removeDiacritics removes "diacritics" :^)
// diacritics are special characters, e.g. GÉANT, becomes GEANT
func removeDiacritics(text string) (string, error) {
t := transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
result, _, err := transform.String(t, text)
if err != nil {
return text, err
}
return result, nil
}
// normalize removes diacritics and converts to lower case
func normalize(text string) string {
dt, _ := removeDiacritics(text)
return strings.ToLower(dt)
}
|