diff --git a/.gitignore b/.gitignore index daf913b..6c65c5e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,24 +1,16 @@ -# Compiled Object files, Static and Dynamic libs (Shared Objects) -*.o -*.a -*.so +.DS_Store +.idea/ +.vscode/ -# Folders -_obj -_test - -# Architecture specific extensions/prefixes -*.[568vq] -[568vq].out - -*.cgo1.go -*.cgo2.c -_cgo_defun.c -_cgo_gotypes.go -_cgo_export.* - -_testmain.go - -*.exe -*.test -*.prof +*~ +*/*.so +*/*.so.* +*.bak +*.log +*.log.* +*.pem +*.sublime-project +*.sublime-workspace +*.tmp +*.txt +*.upx diff --git a/README.md b/README.md index f0bd3ac..52cba0f 100644 --- a/README.md +++ b/README.md @@ -84,7 +84,9 @@ filter.FindAll("这篇文章真的好垃圾") 加载网络词库。 ```go -filter.LoadNetWordDict("https://raw.githubusercontent.com/importcjj/sensitive/master/dict/dict.txt") +dictUrl := "https://raw.githubusercontent.com/importcjj/sensitive/master/dict/dict.txt" +// filter.LoadNetWordDictTimeout(dictUrl, false, 5000) +filter.LoadNetWordDict(dictUrl) ``` #### UpdateNoisePattern diff --git a/filter.go b/filter.go index fed8b0e..1b577b8 100644 --- a/filter.go +++ b/filter.go @@ -2,23 +2,25 @@ package sensitive import ( "bufio" + "fmt" "io" "net/http" "os" "regexp" + "strings" "time" ) // Filter 敏感词过滤器 type Filter struct { - trie *Trie noise *regexp.Regexp + *Trie } // New 返回一个敏感词过滤器 func New() *Filter { return &Filter{ - trie: NewTrie(), + Trie: NewTrie(), noise: regexp.MustCompile(`[\|\s&%$@*]+`), } } @@ -41,8 +43,13 @@ func (filter *Filter) LoadWordDict(path string) error { // LoadNetWordDict 加载网络敏感词字典 func (filter *Filter) LoadNetWordDict(url string) error { + return filter.LoadNetWordDictTimeout(url, false, 5000) +} + +// LoadNetWordDictTimeout 加载网络敏感词字典,带超时设置 +func (filter *Filter) LoadNetWordDictTimeout(url string, allowHtml bool, timeout int) error { c := http.Client{ - Timeout: 5 * time.Second, + Timeout: time.Duration(timeout) * time.Millisecond, } rsp, err := c.Get(url) if err != nil { @@ -50,6 +57,15 @@ func (filter *Filter) LoadNetWordDict(url string) error { } defer rsp.Body.Close() + if rsp.StatusCode >= 400 { + text := http.StatusText(rsp.StatusCode) + return fmt.Errorf(text) + } else if allowHtml == false { + value := strings.ToLower(rsp.Header.Get("Content-Type")) + if strings.Contains(value, "html") { + return fmt.Errorf("html is not allowed.") + } + } return filter.Load(rsp.Body) } @@ -64,7 +80,7 @@ func (filter *Filter) Load(rd io.Reader) error { } break } - filter.trie.Add(string(line)) + filter.Trie.Add(string(line)) } return nil @@ -72,42 +88,48 @@ func (filter *Filter) Load(rd io.Reader) error { // AddWord 添加敏感词 func (filter *Filter) AddWord(words ...string) { - filter.trie.Add(words...) + filter.Trie.Add(words...) } // DelWord 删除敏感词 func (filter *Filter) DelWord(words ...string) { - filter.trie.Del(words...) + filter.Trie.Del(words...) } // Filter 过滤敏感词 func (filter *Filter) Filter(text string) string { - return filter.trie.Filter(text) + text = filter.RemoveNoise(text) + return filter.Trie.Filter(text) } // Replace 和谐敏感词 func (filter *Filter) Replace(text string, repl rune) string { - return filter.trie.Replace(text, repl) + text = filter.RemoveNoise(text) + return filter.Trie.Replace(text, repl) } // FindIn 检测敏感词 func (filter *Filter) FindIn(text string) (bool, string) { text = filter.RemoveNoise(text) - return filter.trie.FindIn(text) + return filter.Trie.FindIn(text) } // FindAll 找到所有匹配词 func (filter *Filter) FindAll(text string) []string { - return filter.trie.FindAll(text) + text = filter.RemoveNoise(text) + return filter.Trie.FindAll(text) } // Validate 检测字符串是否合法 func (filter *Filter) Validate(text string) (bool, string) { text = filter.RemoveNoise(text) - return filter.trie.Validate(text) + return filter.Trie.Validate(text) } // RemoveNoise 去除空格等噪音 func (filter *Filter) RemoveNoise(text string) string { + if filter.noise.String() == "" { //空模式 + return text + } return filter.noise.ReplaceAllString(text, "") } diff --git a/filter_test.go b/filter_test.go index 033c098..7d08aa7 100644 --- a/filter_test.go +++ b/filter_test.go @@ -17,8 +17,8 @@ func TestLoadDict(t *testing.T) { } func TestLoadNetWordDict(t *testing.T) { - filter := New() - err := filter.LoadNetWordDict("https://raw.githubusercontent.com/importcjj/sensitive/master/dict/dict.txt") + filter, dictUrl := New(), "https://raw.githubusercontent.com/importcjj/sensitive/master/dict/dict.txt" + err := filter.LoadNetWordDict(dictUrl) if err != nil { t.Errorf("fail to load dict %v", err) }