From b4bf31b1c99b0258f96fae67a99c77c701116a64 Mon Sep 17 00:00:00 2001 From: Ryan Liu Date: Thu, 24 Mar 2022 17:29:30 +0800 Subject: [PATCH 1/2] LoadNetWordDict check status code and content type --- .gitignore | 38 +++++++++++++++----------------------- README.md | 3 ++- filter.go | 13 ++++++++++++- filter_test.go | 6 +++--- 4 files changed, 32 insertions(+), 28 deletions(-) diff --git a/.gitignore b/.gitignore index daf913b..6c65c5e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,24 +1,16 @@ -# Compiled Object files, Static and Dynamic libs (Shared Objects) -*.o -*.a -*.so +.DS_Store +.idea/ +.vscode/ -# Folders -_obj -_test - -# Architecture specific extensions/prefixes -*.[568vq] -[568vq].out - -*.cgo1.go -*.cgo2.c -_cgo_defun.c -_cgo_gotypes.go -_cgo_export.* - -_testmain.go - -*.exe -*.test -*.prof +*~ +*/*.so +*/*.so.* +*.bak +*.log +*.log.* +*.pem +*.sublime-project +*.sublime-workspace +*.tmp +*.txt +*.upx diff --git a/README.md b/README.md index f0bd3ac..d481815 100644 --- a/README.md +++ b/README.md @@ -84,7 +84,8 @@ filter.FindAll("这篇文章真的好垃圾") 加载网络词库。 ```go -filter.LoadNetWordDict("https://raw.githubusercontent.com/importcjj/sensitive/master/dict/dict.txt") +dictUrl := "https://raw.githubusercontent.com/importcjj/sensitive/master/dict/dict.txt" +filter.LoadNetWordDict(dictUrl, true) ``` #### UpdateNoisePattern diff --git a/filter.go b/filter.go index fed8b0e..ed6a7cb 100644 --- a/filter.go +++ b/filter.go @@ -2,10 +2,12 @@ package sensitive import ( "bufio" + "fmt" "io" "net/http" "os" "regexp" + "strings" "time" ) @@ -40,7 +42,7 @@ func (filter *Filter) LoadWordDict(path string) error { } // LoadNetWordDict 加载网络敏感词字典 -func (filter *Filter) LoadNetWordDict(url string) error { +func (filter *Filter) LoadNetWordDict(url string, allowHtml bool) error { c := http.Client{ Timeout: 5 * time.Second, } @@ -50,6 +52,15 @@ func (filter *Filter) LoadNetWordDict(url string) error { } defer rsp.Body.Close() + if rsp.StatusCode >= 400 { + text := http.StatusText(rsp.StatusCode) + return fmt.Errorf(text) + } else if allowHtml == false { + value := strings.ToLower(rsp.Header.Get("Content-Type")) + if strings.Contains(value, "html") { + return fmt.Errorf("html is not allowed.") + } + } return filter.Load(rsp.Body) } diff --git a/filter_test.go b/filter_test.go index 033c098..2ca646f 100644 --- a/filter_test.go +++ b/filter_test.go @@ -17,8 +17,8 @@ func TestLoadDict(t *testing.T) { } func TestLoadNetWordDict(t *testing.T) { - filter := New() - err := filter.LoadNetWordDict("https://raw.githubusercontent.com/importcjj/sensitive/master/dict/dict.txt") + filter, dictUrl := New(), "https://raw.githubusercontent.com/importcjj/sensitive/master/dict/dict.txt" + err := filter.LoadNetWordDict(dictUrl, true) if err != nil { t.Errorf("fail to load dict %v", err) } @@ -346,7 +346,7 @@ func TestFilter_LoadNetWordDict(t *testing.T) { trie: tt.fields.trie, noise: tt.fields.noise, } - if err := filter.LoadNetWordDict(tt.args.url); (err != nil) != tt.wantErr { + if err := filter.LoadNetWordDict(tt.args.url, true); (err != nil) != tt.wantErr { t.Errorf("Filter.LoadNetWordDict() error = %v, wantErr %v", err, tt.wantErr) } }) From 5269dde5d7ad16bc7bb85507b6b34b7ed6fe6378 Mon Sep 17 00:00:00 2001 From: Ryan Liu Date: Fri, 25 Mar 2022 09:51:16 +0800 Subject: [PATCH 2/2] add http client timeout --- README.md | 3 ++- filter.go | 35 +++++++++++++++++++++++------------ filter_test.go | 4 ++-- 3 files changed, 27 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index d481815..52cba0f 100644 --- a/README.md +++ b/README.md @@ -85,7 +85,8 @@ filter.FindAll("这篇文章真的好垃圾") ```go dictUrl := "https://raw.githubusercontent.com/importcjj/sensitive/master/dict/dict.txt" -filter.LoadNetWordDict(dictUrl, true) +// filter.LoadNetWordDictTimeout(dictUrl, false, 5000) +filter.LoadNetWordDict(dictUrl) ``` #### UpdateNoisePattern diff --git a/filter.go b/filter.go index ed6a7cb..1b577b8 100644 --- a/filter.go +++ b/filter.go @@ -13,14 +13,14 @@ import ( // Filter 敏感词过滤器 type Filter struct { - trie *Trie noise *regexp.Regexp + *Trie } // New 返回一个敏感词过滤器 func New() *Filter { return &Filter{ - trie: NewTrie(), + Trie: NewTrie(), noise: regexp.MustCompile(`[\|\s&%$@*]+`), } } @@ -42,9 +42,14 @@ func (filter *Filter) LoadWordDict(path string) error { } // LoadNetWordDict 加载网络敏感词字典 -func (filter *Filter) LoadNetWordDict(url string, allowHtml bool) error { +func (filter *Filter) LoadNetWordDict(url string) error { + return filter.LoadNetWordDictTimeout(url, false, 5000) +} + +// LoadNetWordDictTimeout 加载网络敏感词字典,带超时设置 +func (filter *Filter) LoadNetWordDictTimeout(url string, allowHtml bool, timeout int) error { c := http.Client{ - Timeout: 5 * time.Second, + Timeout: time.Duration(timeout) * time.Millisecond, } rsp, err := c.Get(url) if err != nil { @@ -75,7 +80,7 @@ func (filter *Filter) Load(rd io.Reader) error { } break } - filter.trie.Add(string(line)) + filter.Trie.Add(string(line)) } return nil @@ -83,42 +88,48 @@ func (filter *Filter) Load(rd io.Reader) error { // AddWord 添加敏感词 func (filter *Filter) AddWord(words ...string) { - filter.trie.Add(words...) + filter.Trie.Add(words...) } // DelWord 删除敏感词 func (filter *Filter) DelWord(words ...string) { - filter.trie.Del(words...) + filter.Trie.Del(words...) } // Filter 过滤敏感词 func (filter *Filter) Filter(text string) string { - return filter.trie.Filter(text) + text = filter.RemoveNoise(text) + return filter.Trie.Filter(text) } // Replace 和谐敏感词 func (filter *Filter) Replace(text string, repl rune) string { - return filter.trie.Replace(text, repl) + text = filter.RemoveNoise(text) + return filter.Trie.Replace(text, repl) } // FindIn 检测敏感词 func (filter *Filter) FindIn(text string) (bool, string) { text = filter.RemoveNoise(text) - return filter.trie.FindIn(text) + return filter.Trie.FindIn(text) } // FindAll 找到所有匹配词 func (filter *Filter) FindAll(text string) []string { - return filter.trie.FindAll(text) + text = filter.RemoveNoise(text) + return filter.Trie.FindAll(text) } // Validate 检测字符串是否合法 func (filter *Filter) Validate(text string) (bool, string) { text = filter.RemoveNoise(text) - return filter.trie.Validate(text) + return filter.Trie.Validate(text) } // RemoveNoise 去除空格等噪音 func (filter *Filter) RemoveNoise(text string) string { + if filter.noise.String() == "" { //空模式 + return text + } return filter.noise.ReplaceAllString(text, "") } diff --git a/filter_test.go b/filter_test.go index 2ca646f..7d08aa7 100644 --- a/filter_test.go +++ b/filter_test.go @@ -18,7 +18,7 @@ func TestLoadDict(t *testing.T) { func TestLoadNetWordDict(t *testing.T) { filter, dictUrl := New(), "https://raw.githubusercontent.com/importcjj/sensitive/master/dict/dict.txt" - err := filter.LoadNetWordDict(dictUrl, true) + err := filter.LoadNetWordDict(dictUrl) if err != nil { t.Errorf("fail to load dict %v", err) } @@ -346,7 +346,7 @@ func TestFilter_LoadNetWordDict(t *testing.T) { trie: tt.fields.trie, noise: tt.fields.noise, } - if err := filter.LoadNetWordDict(tt.args.url, true); (err != nil) != tt.wantErr { + if err := filter.LoadNetWordDict(tt.args.url); (err != nil) != tt.wantErr { t.Errorf("Filter.LoadNetWordDict() error = %v, wantErr %v", err, tt.wantErr) } })