Skip to content

LoadNetWordDict check status code and content type #24

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 15 additions & 23 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,24 +1,16 @@
# Compiled Object files, Static and Dynamic libs (Shared Objects)
*.o
*.a
*.so
.DS_Store
.idea/
.vscode/

# Folders
_obj
_test

# Architecture specific extensions/prefixes
*.[568vq]
[568vq].out

*.cgo1.go
*.cgo2.c
_cgo_defun.c
_cgo_gotypes.go
_cgo_export.*

_testmain.go

*.exe
*.test
*.prof
*~
*/*.so
*/*.so.*
*.bak
*.log
*.log.*
*.pem
*.sublime-project
*.sublime-workspace
*.tmp
*.txt
*.upx
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,9 @@ filter.FindAll("这篇文章真的好垃圾")
加载网络词库。

```go
filter.LoadNetWordDict("https://raw.githubusercontent.com/importcjj/sensitive/master/dict/dict.txt")
dictUrl := "https://raw.githubusercontent.com/importcjj/sensitive/master/dict/dict.txt"
// filter.LoadNetWordDictTimeout(dictUrl, false, 5000)
filter.LoadNetWordDict(dictUrl)
```

#### UpdateNoisePattern
Expand Down
44 changes: 33 additions & 11 deletions filter.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,25 @@ package sensitive

import (
"bufio"
"fmt"
"io"
"net/http"
"os"
"regexp"
"strings"
"time"
)

// Filter is a sensitive-word filter: it combines a Trie with a regular
// expression describing "noise" characters.
//
// NOTE(review): this span appears to be a rendered diff of filter.go, so the
// named field `trie` and the embedded `*Trie` are probably the removed and
// added variants of one field rather than two coexisting tries — confirm
// against the real file.
type Filter struct {
trie *Trie
// noise is the pattern whose matches RemoveNoise deletes from input text.
noise *regexp.Regexp
*Trie
}

// New returns a sensitive-word filter backed by a freshly created Trie and a
// default noise pattern that matches runs of '|', whitespace, '&', '%', '$',
// '@' and '*'.
//
// NOTE(review): this span appears to be a rendered diff, so the `trie:` and
// `Trie:` initializers are probably the removed and added variants of one
// line — confirm against the real file.
func New() *Filter {
return &Filter{
trie: NewTrie(),
Trie: NewTrie(),
noise: regexp.MustCompile(`[\|\s&%$@*]+`),
}
}
Expand All @@ -41,15 +43,29 @@ func (filter *Filter) LoadWordDict(path string) error {

// LoadNetWordDict loads a sensitive-word dictionary from the given URL.
//
// It delegates to LoadNetWordDictTimeout with HTML responses disallowed and a
// request timeout of 5000 milliseconds, and returns that call's error.
func (filter *Filter) LoadNetWordDict(url string) error {
	const (
		allowHTML     = false
		timeoutMillis = 5000
	)
	return filter.LoadNetWordDictTimeout(url, allowHTML, timeoutMillis)
}

// LoadNetWordDictTimeout loads a sensitive-word dictionary from url with a
// caller-supplied request timeout.
//
// Parameters:
//   - url: address of the dictionary; it is fetched with an HTTP GET.
//   - allowHtml: when false, a response whose Content-Type header contains
//     "html" (case-insensitive) is rejected instead of being loaded.
//   - timeout: request timeout in milliseconds, used as http.Client.Timeout
//     (for which a zero value means no timeout).
//
// It returns the error from the GET itself, an error naming the HTTP status
// when the status code is 400 or above, an error when an HTML response is
// received but not allowed, and otherwise whatever filter.Load returns for
// the response body.
func (filter *Filter) LoadNetWordDictTimeout(url string, allowHtml bool, timeout int) error {
	client := http.Client{
		Timeout: time.Duration(timeout) * time.Millisecond,
	}
	rsp, err := client.Get(url)
	if err != nil {
		return err
	}
	defer rsp.Body.Close()

	if rsp.StatusCode >= 400 {
		// Report the full status line: http.StatusText alone is empty for
		// unregistered codes, and the text must be passed as an argument
		// rather than used as the format string.
		return fmt.Errorf("unexpected HTTP status %s", rsp.Status)
	}
	if !allowHtml {
		contentType := strings.ToLower(rsp.Header.Get("Content-Type"))
		if strings.Contains(contentType, "html") {
			return fmt.Errorf("html is not allowed.")
		}
	}
	return filter.Load(rsp.Body)
}

Expand All @@ -64,50 +80,56 @@ func (filter *Filter) Load(rd io.Reader) error {
}
break
}
filter.trie.Add(string(line))
filter.Trie.Add(string(line))
}

return nil
}

// AddWord adds the given sensitive words by calling Add on the filter's trie.
//
// NOTE(review): this span appears to be a rendered diff, so the `filter.trie`
// and `filter.Trie` calls are probably the removed and added versions of one
// statement rather than two separate insertions — confirm against the real file.
func (filter *Filter) AddWord(words ...string) {
filter.trie.Add(words...)
filter.Trie.Add(words...)
}

// DelWord deletes the given sensitive words by calling Del on the filter's trie.
//
// NOTE(review): this span appears to be a rendered diff, so the `filter.trie`
// and `filter.Trie` calls are probably the removed and added versions of one
// statement rather than two separate deletions — confirm against the real file.
func (filter *Filter) DelWord(words ...string) {
filter.trie.Del(words...)
filter.Trie.Del(words...)
}

// Filter filters sensitive words out of text and returns the string produced
// by the trie's Filter method.
//
// NOTE(review): this span appears to be a rendered diff. As written, the first
// return makes the two lines after it unreachable; presumably that first
// return is the removed line and the intended body strips noise with
// RemoveNoise before calling filter.Trie.Filter — confirm against the real file.
func (filter *Filter) Filter(text string) string {
return filter.trie.Filter(text)
text = filter.RemoveNoise(text)
return filter.Trie.Filter(text)
}

// Replace censors sensitive words in text and returns the string produced by
// the trie's Replace method; repl is forwarded to it (presumably as the rune
// substituted for matched characters — confirm against Trie.Replace).
//
// NOTE(review): this span appears to be a rendered diff. As written, the first
// return makes the two lines after it unreachable; presumably that first
// return is the removed line and the intended body strips noise with
// RemoveNoise before calling filter.Trie.Replace — confirm against the real file.
func (filter *Filter) Replace(text string, repl rune) string {
return filter.trie.Replace(text, repl)
text = filter.RemoveNoise(text)
return filter.Trie.Replace(text, repl)
}

// FindIn detects sensitive words in text. Noise is stripped with RemoveNoise
// first, then the result of the trie's FindIn is returned unchanged —
// presumably whether a sensitive word was found and the matched word; confirm
// against Trie.FindIn.
//
// NOTE(review): this span appears to be a rendered diff. As written, the
// second return is unreachable; presumably the `filter.trie` return is the
// removed line and the `filter.Trie` return is its replacement — confirm
// against the real file.
func (filter *Filter) FindIn(text string) (bool, string) {
text = filter.RemoveNoise(text)
return filter.trie.FindIn(text)
return filter.Trie.FindIn(text)
}

// FindAll finds all matching sensitive words in text and returns the slice
// produced by the trie's FindAll method.
//
// NOTE(review): this span appears to be a rendered diff. As written, the first
// return makes the two lines after it unreachable; presumably that first
// return is the removed line and the intended body strips noise with
// RemoveNoise before calling filter.Trie.FindAll — confirm against the real file.
func (filter *Filter) FindAll(text string) []string {
return filter.trie.FindAll(text)
text = filter.RemoveNoise(text)
return filter.Trie.FindAll(text)
}

// Validate checks whether text is acceptable. Noise is stripped with
// RemoveNoise first, then the result of the trie's Validate is returned
// unchanged — presumably a validity flag and the offending word; confirm
// against Trie.Validate.
//
// NOTE(review): this span appears to be a rendered diff. As written, the
// second return is unreachable; presumably the `filter.trie` return is the
// removed line and the `filter.Trie` return is its replacement — confirm
// against the real file.
func (filter *Filter) Validate(text string) (bool, string) {
text = filter.RemoveNoise(text)
return filter.trie.Validate(text)
return filter.Trie.Validate(text)
}

// RemoveNoise strips noise such as whitespace from text.
//
// Every match of the filter's noise pattern is replaced with the empty
// string. When the pattern's source text is empty, text is returned unchanged
// without running the regular expression.
func (filter *Filter) RemoveNoise(text string) string {
	if pattern := filter.noise.String(); pattern != "" {
		return filter.noise.ReplaceAllString(text, "")
	}
	return text
}
4 changes: 2 additions & 2 deletions filter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ func TestLoadDict(t *testing.T) {
}

func TestLoadNetWordDict(t *testing.T) {
filter := New()
err := filter.LoadNetWordDict("https://raw.githubusercontent.com/importcjj/sensitive/master/dict/dict.txt")
filter, dictUrl := New(), "https://raw.githubusercontent.com/importcjj/sensitive/master/dict/dict.txt"
err := filter.LoadNetWordDict(dictUrl)
if err != nil {
t.Errorf("fail to load dict %v", err)
}
Expand Down