From be74ef76b30e7ad9fc74294c8e94ff1f40e87b4e Mon Sep 17 00:00:00 2001 From: jwijenbergh Date: Thu, 23 May 2024 15:33:11 +0200 Subject: Discovery: Improve search using levenshtein distance and sorting --- internal/discovery/discovery.go | 33 ++-------- internal/levenshtein/levenshtein.go | 117 ++++++++++++++++++++++++++++++++++++ 2 files changed, 122 insertions(+), 28 deletions(-) create mode 100644 internal/levenshtein/levenshtein.go (limited to 'internal') diff --git a/internal/discovery/discovery.go b/internal/discovery/discovery.go index 783febf..383a696 100644 --- a/internal/discovery/discovery.go +++ b/internal/discovery/discovery.go @@ -5,10 +5,10 @@ import ( "context" "encoding/json" "fmt" - "strings" "time" "github.com/eduvpn/eduvpn-common/internal/http" + "github.com/eduvpn/eduvpn-common/internal/levenshtein" "github.com/eduvpn/eduvpn-common/internal/log" "github.com/eduvpn/eduvpn-common/internal/verify" discotypes "github.com/eduvpn/eduvpn-common/types/discovery" @@ -40,20 +40,8 @@ type Organization struct { KeywordList discotypes.MapOrString `json:"keyword_list,omitempty"` } -// Matches returns if the search query `str` matches with this organization -func (s *Organization) Matches(str string) bool { - var catalog strings.Builder - for _, v := range s.DisplayName { - // length and nil error is returned - _, _ = catalog.WriteString(strings.ToLower(v)) - _, _ = catalog.WriteString(" ") - } - for _, v := range s.KeywordList { - // length and nil error is returned - _, _ = catalog.WriteString(strings.ToLower(v)) - _, _ = catalog.WriteString(" ") - } - return strings.Contains(catalog.String(), strings.ToLower(str)) +func (o *Organization) Score(search string) int { + return levenshtein.DiscoveryScore(search, o.DisplayName, o.KeywordList) } // Servers are the list of servers from https://disco.eduvpn.org/v2/server_list.json @@ -82,19 +70,8 @@ type Server struct { } // Matches returns if the search query `str` matches with this server -func (s *Server) Matches(str string) bool { - var catalog strings.Builder - for _, v := range s.DisplayName { - // length and nil error is returned - _, _ = catalog.WriteString(strings.ToLower(v)) - _, _ = catalog.WriteString(" ") - } - for _, v := range s.KeywordList { - // length and nil error is returned - _, _ = catalog.WriteString(strings.ToLower(v)) - _, _ = catalog.WriteString(" ") - } - return strings.Contains(catalog.String(), strings.ToLower(str)) +func (s *Server) Score(search string) int { + return levenshtein.DiscoveryScore(search, s.DisplayName, s.KeywordList) } // Discovery is the main structure used for this package. diff --git a/internal/levenshtein/levenshtein.go b/internal/levenshtein/levenshtein.go new file mode 100644 index 0000000..0a2689a --- /dev/null +++ b/internal/levenshtein/levenshtein.go @@ -0,0 +1,117 @@ +package levenshtein + +import ( + "unicode/utf8" + "unicode" + "strings" + "golang.org/x/text/runes" + "golang.org/x/text/transform" + "golang.org/x/text/unicode/norm" +) + +// min returns the min of a and b +func min(a, b int) int { + if a < b { + return a + } + return b +} + +// levenshtein is an algorithm that returns the "distance" between two strings +// the distance for hello and helloxd is 2 because it takes two inserts to go from hello to helloxd +// the distance between hello and hello is 0 because the strings are equal +// apart from insertions, the levenshtein algorithm also takes substitutions and deletions into account +// levenshtein implementation from https://en.wikipedia.org/wiki/Levenshtein_distance#Iterative_with_two_matrix_rows +func levenshtein(os, ot string) int { + n := utf8.RuneCountInString(os) + m := utf8.RuneCountInString(ot) + s := []rune(os) + t := []rune(ot) + v0 := make([]int, m+1) + v1 := make([]int, m+1) + for i := 0; i <= m; i++ { + v0[i] = i + } + + for i := 0; i < n; i++ { + v1[0] = i + 1 + for j := 0; j < m; j++ { + dc := v0[j+1] + 1 + ic := v1[j] + 1 + var sc int + if s[i] == t[j] { + sc = v0[j] + } else { + sc = v0[j] + 1 + } + v1[j+1] = min(min(dc, ic), sc) + } + v0, v1 = v1, v0 + } + return v0[m] +} + +// adjusted creates and adjusted version of the levenshtein algorithm +// where it filters entries where one of the words in the substr is not contained in `full` +// for these a score of -1 returned +// for all others it is the normal levenshtein distance +func adjusted(substr, full string) int { + substr = normalize(substr) + full = normalize(full) + s_sub := strings.Split(substr, " ") + for _, v_sub := range s_sub { + if !strings.Contains(full, v_sub) { + return -1 + } + } + return levenshtein(substr, full) +} + +// KeywordPenalty is the penalty for matching on keywords instead of display names +const KeywordPenalty = 2 + +// DiscoveryScore computes the score of a discovery entry with the given search query +// a negative score means exclude the entry from the results +func DiscoveryScore(search string, displays map[string]string, keywords map[string]string) int { + var catalogDN strings.Builder + for _, v := range displays { + // length and nil error is returned + _, _ = catalogDN.WriteString(v) + } + scoreDN := adjusted(search, catalogDN.String()) + var catalogKW strings.Builder + for _, v := range keywords { + // length and nil error is returned + _, _ = catalogKW.WriteString(v) + } + scoreKW := 3*adjusted(search, catalogKW.String()) + + // if both scores are positive, return the min + if scoreDN >= 0 && scoreKW >= 0 { + return min(scoreDN, scoreKW) + } + + // scoreKW is negative, return scoreDN + if scoreDN >= 0 { + return scoreDN + } + // scoreDN is negative, return scoreKW + return scoreKW +} + +// removeDiacritics removes "diacritics" :^) +// diacritics are special characters, e.g. GÉANT, becomes GEANT +func removeDiacritics(text string) (string, error) { + t := transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC) + result, _, err := transform.String(t, text) + if err != nil { + return text, err + } + return result, nil +} + +// normalize removes diacritics and converts to lower case +func normalize(text string) string { + dt, _ := removeDiacritics(text) + return strings.ToLower(dt) +} -- cgit v1.2.3