Better support for Chinese metadata; reduce duplication caused by Traditional/Simplified Chinese variants

This commit is contained in:
tony tan 2025-02-23 14:57:02 -08:00 committed by tan-tony
parent 15a3d2ca66
commit 217678af2d
7 changed files with 83 additions and 8 deletions

4
go.mod
View File

@ -35,6 +35,7 @@ require (
github.com/kardianos/service v1.2.2
github.com/kr/pretty v0.3.1
github.com/lestrrat-go/jwx/v2 v2.1.3
github.com/liuzl/gocc v0.0.0-20231231122217-0372e1059ca5
github.com/matoous/go-nanoid/v2 v2.1.0
github.com/mattn/go-sqlite3 v1.14.24
github.com/microcosm-cc/bluemonday v1.0.27
@ -66,6 +67,7 @@ require (
)
require (
github.com/adamzy/cedar-go v0.0.0-20170805034717-80a9c64b256d // indirect
github.com/aymerick/douceur v0.2.0 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
@ -90,6 +92,8 @@ require (
github.com/lestrrat-go/httprc v1.0.6 // indirect
github.com/lestrrat-go/iter v1.0.2 // indirect
github.com/lestrrat-go/option v1.0.1 // indirect
github.com/liuzl/cedar-go v0.0.0-20170805034717-80a9c64b256d // indirect
github.com/liuzl/da v0.0.0-20180704015230-14771aad5b1d // indirect
github.com/magiconair/properties v1.8.9 // indirect
github.com/mfridman/interpolate v0.0.2 // indirect
github.com/mitchellh/mapstructure v1.5.0 // indirect

8
go.sum
View File

@ -4,6 +4,8 @@ github.com/Masterminds/squirrel v1.5.4 h1:uUcX/aBc8O7Fg9kaISIUsHXdKuqehiXAMQTYX8
github.com/Masterminds/squirrel v1.5.4/go.mod h1:NNaOrjSoIDfDA40n7sr2tPNZRfjzjA400rg+riTZj10=
github.com/RaveNoX/go-jsoncommentstrip v1.0.0 h1:t527LHHE3HmiHrq74QMpNPZpGCIJzTx+apLkMKt4HC0=
github.com/RaveNoX/go-jsoncommentstrip v1.0.0/go.mod h1:78ihd09MekBnJnxpICcwzCMzGrKSKYe4AqU6PDYYpjk=
github.com/adamzy/cedar-go v0.0.0-20170805034717-80a9c64b256d h1:ir/IFJU5xbja5UaBEQLjcvn7aAU01nqU/NUyOBEU+ew=
github.com/adamzy/cedar-go v0.0.0-20170805034717-80a9c64b256d/go.mod h1:PRWNwWq0yifz6XDPZu48aSld8BWwBfr2JKB2bGWiEd4=
github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM=
github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA=
github.com/aymerick/douceur v0.2.0 h1:Mv+mAeH1Q+n9Fr+oyamOlAkUNPWPlA8PPGR0QAaYuPk=
@ -129,6 +131,12 @@ github.com/lestrrat-go/jwx/v2 v2.1.3 h1:Ud4lb2QuxRClYAmRleF50KrbKIoM1TddXgBrneT5
github.com/lestrrat-go/jwx/v2 v2.1.3/go.mod h1:q6uFgbgZfEmQrfJfrCo90QcQOcXFMfbI/fO0NqRtvZo=
github.com/lestrrat-go/option v1.0.1 h1:oAzP2fvZGQKWkvHa1/SAcFolBEca1oN+mQ7eooNBEYU=
github.com/lestrrat-go/option v1.0.1/go.mod h1:5ZHFbivi4xwXxhxY9XHDe2FHo6/Z7WWmtT7T5nBBp3I=
github.com/liuzl/cedar-go v0.0.0-20170805034717-80a9c64b256d h1:qSmEGTgjkESUX5kPMSGJ4pcBUtYVDdkNzMrjQyvRvp0=
github.com/liuzl/cedar-go v0.0.0-20170805034717-80a9c64b256d/go.mod h1:x7SghIWwLVcJObXbjK7S2ENsT1cAcdJcPl7dRaSFog0=
github.com/liuzl/da v0.0.0-20180704015230-14771aad5b1d h1:hTRDIpJ1FjS9ULJuEzu69n3qTgc18eI+ztw/pJv47hs=
github.com/liuzl/da v0.0.0-20180704015230-14771aad5b1d/go.mod h1:7xD3p0XnHvJFQ3t/stEJd877CSIMkH/fACVWen5pYnc=
github.com/liuzl/gocc v0.0.0-20231231122217-0372e1059ca5 h1:wnbHIeP1UX8ClYEWKGnw66PfYvReCHu9G5lXSte3Sqc=
github.com/liuzl/gocc v0.0.0-20231231122217-0372e1059ca5/go.mod h1:7KaV9YIR92M1FpbczAcfYQ3UZ5ayT27pNtunDmXvLBo=
github.com/magiconair/properties v1.8.9 h1:nWcCbLq1N2v/cpNsy5WvQ37Fb+YElfq20WJ/a8RkpQM=
github.com/magiconair/properties v1.8.9/go.mod h1:Dhd985XPs7jluiymwWYZ0G4Z61jb3vdS329zhj2hYo0=
github.com/matoous/go-nanoid/v2 v2.1.0 h1:P64+dmq21hhWdtvZfEAofnvJULaRR1Yib0+PnU669bE=

View File

@ -15,6 +15,7 @@ import (
"github.com/navidrome/navidrome/log"
"github.com/navidrome/navidrome/model"
"github.com/navidrome/navidrome/utils/slice"
"github.com/navidrome/navidrome/utils/str"
)
type Info struct {
@ -368,6 +369,9 @@ func sanitize(filePath string, tagName model.TagName, tag model.TagConf, value s
log.Trace("Invalid UUID tag value", "tag", tagName, "value", value)
return ""
}
default:
// normalize the tag value to reduce duplication
value = str.NormalizeText(value)
}
return value
}

View File

@ -15,12 +15,6 @@ func formatFullText(text ...string) string {
}
func (r sqlRepository) doSearch(sq SelectBuilder, q string, offset, size int, includeMissing bool, results any, orderBys ...string) error {
q = strings.TrimSpace(q)
q = strings.TrimSuffix(q, "*")
if len(q) < 2 {
return nil
}
//sq := r.newSelect().Columns(r.tableName + ".*")
//sq = r.withAnnotation(sq, r.tableName+".id")
//sq = r.withBookmark(sq, r.tableName+".id")
@ -41,7 +35,7 @@ func (r sqlRepository) doSearch(sq SelectBuilder, q string, offset, size int, in
}
func fullTextExpr(tableName string, s string) Sqlizer {
q := str.SanitizeStrings(s)
q := str.NormalizeText(str.SanitizeStrings(s))
if q == "" {
return nil
}

View File

@ -5,7 +5,7 @@ import (
. "github.com/onsi/gomega"
)
var _ = Describe("sqlRepository", func() {
var _ = Describe("Search", func() {
Describe("formatFullText", func() {
It("prefixes with a space", func() {
Expect(formatFullText("legiao urbana")).To(Equal(" legiao urbana"))

View File

@ -2,8 +2,22 @@ package str
import (
"strings"
"unicode"
"github.com/liuzl/gocc"
"golang.org/x/text/unicode/norm"
)
// opencc is the package-level converter used by ToSimplifiedChinese. It is
// assigned once, by init.
var opencc *gocc.OpenCC

// init builds the "t2s" converter and stores it in opencc. It panics if gocc
// reports an error while constructing the converter.
func init() {
	converter, err := gocc.New("t2s")
	if err != nil {
		panic(err)
	}
	opencc = converter
}
var utf8ToAscii = func() *strings.Replacer {
var utf8Map = map[string]string{
"'": ``,
@ -39,3 +53,32 @@ func LongestCommonPrefix(list []string) string {
}
return list[0]
}
// NormalizeText returns a normalized form of s. Two steps are applied, in
// this order:
//   - s is converted to Unicode NFC
//   - the result is passed through ToSimplifiedChinese
func NormalizeText(s string) string {
	composed := norm.NFC.String(s)
	return ToSimplifiedChinese(composed)
}
// ToSimplifiedChinese converts s from Traditional Chinese to Simplified
// Chinese using the package-level opencc converter.
//
// s is returned unchanged when it contains no rune in the Unicode Han script,
// or when the converter reports an error.
//
// NOTE(review): unicode.Han is a script test, not a language test, so it also
// matches Japanese kanji and Korean hanja. Text in those languages that
// contains Han characters is therefore sent through the converter as well;
// confirm this is acceptable for non-Chinese metadata.
func ToSimplifiedChinese(s string) string {
	isHan := func(r rune) bool { return unicode.Is(unicode.Han, r) }
	if strings.IndexFunc(s, isHan) < 0 {
		// No Han characters: skip the conversion entirely.
		return s
	}
	converted, err := opencc.Convert(s)
	if err != nil {
		// Keep the caller's text rather than replacing it with whatever the
		// converter returned alongside the error.
		return s
	}
	return converted
}

View File

@ -31,6 +31,26 @@ var _ = Describe("String Utils", func() {
Expect(str.LongestCommonPrefix(albums)).To(Equal("/artist/album"))
})
})
// Specs for str.NormalizeText: Traditional-to-Simplified conversion,
// pass-through of input that needs no conversion, and NFC composition.
Describe("NormalizeText", func() {
It("traditional Chinese should be replaced with simplified", func() {
Expect(str.NormalizeText("週傑倫")).To(Equal("周杰伦"))
})
It("simplified Chinese should be unchanged", func() {
Expect(str.NormalizeText("简体")).To(Equal("简体"))
})
It("alphabet should be unchanged", func() {
// Each single-letter string in letters must come back as-is.
for _, letter := range letters {
Expect(str.NormalizeText(letter)).To(Equal(letter))
}
})
It("Japanese should be unchanged", func() {
// The sample is hiragana only; it contains no Han characters.
Expect(str.NormalizeText("にっぽんご")).To(Equal("にっぽんご"))
})
It("unicode normalization", func() {
// "e" followed by U+0301 (combining acute accent) must compose into
// the single code point U+00E9.
Expect(str.NormalizeText("e\u0301")).To(Equal("\u00e9"))
})
})
})
var testPaths = []string{
@ -146,3 +166,5 @@ var testPaths = []string{
"/Music/iTunes 1/iTunes Media/Music/War/Why Can't We Be Friends/Low Rider.m4a",
"/Music/iTunes 1/iTunes Media/Music/Yes/Fragile/01 Roundabout.m4a",
}
// letters holds the lowercase ASCII letters "a" through "z", one per element.
// The NormalizeText specs iterate over it to check that each letter is
// returned unchanged.
var letters = []string{"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"}