Skip to content

Commit 177129b

Browse files
author
hero
committed
添加文章相似度计算(余弦夹角)
1 parent 205e001 commit 177129b

File tree

2 files changed

+69
-2
lines changed

2 files changed

+69
-2
lines changed

similarity_code/cosine.go

+62-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,67 @@
11
package similarity_code
22

3-
import "math"
3+
import (
	"math"
	"sync"

	"github.com/huichen/sego"
)
7+
8+
//比对两片文章的相似度
9+
10+
func ComPare(a string, b string) float64 {
11+
var (
12+
aArray = Divide(a)
13+
bArray = Divide(b)
14+
aArrayLen = len(aArray)
15+
bArrayLen = len(bArray)
16+
aMap = make(map[string]int, aArrayLen)
17+
bMap = make(map[string]int, bArrayLen)
18+
allMap = make(map[string]string, bArrayLen+aArrayLen)
19+
AllArray = append(aArray, bArray...)
20+
aA []float64
21+
bA []float64
22+
)
23+
//计算出全部词的并集
24+
for _, v := range AllArray {
25+
allMap[v] = ""
26+
}
27+
//开始统计每个次出现的次数
28+
for _, vA := range aArray {
29+
if _, ok := aMap[vA]; ok {
30+
aMap[vA] += 1
31+
} else {
32+
aMap[vA] = 1
33+
}
34+
}
35+
for _, vB := range bArray {
36+
if _, ok := bMap[vB]; ok {
37+
bMap[vB] += 1
38+
} else {
39+
bMap[vB] = 1
40+
}
41+
}
42+
//计算每个key出现的次数
43+
for key, _ := range allMap {
44+
if _, ok := aMap[key]; ok {
45+
aA = append(aA, float64(aMap[key]))
46+
} else {
47+
aA = append(aA, 0)
48+
}
49+
if _, ok := bMap[key]; ok {
50+
bA = append(bA, float64(bMap[key]))
51+
} else {
52+
bA = append(bA, 0)
53+
}
54+
}
55+
//计算相似度
56+
return Cosine(aA, bA)
57+
}
58+
59+
func Divide(a string) []string {
60+
var segmenter sego.Segmenter
61+
segmenter.LoadDictionary("./dictionary.txt")
62+
segments := segmenter.Segment([]byte(a))
63+
return sego.SegmentsToSlice(segments, true)
64+
}
465

566
//向量空间余弦相似度(Cosine Similarity)
667
func Cosine(a []float64, b []float64) float64 {

similarity_code/cosine_test.go

+7-1
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,18 @@ func TestCosine(t *testing.T) {
99
t.Log(Cosine([]float64{1, 1, 1, 1, 1}, []float64{1, 0, 1, 1, 1, 1, 1, 1}))
1010
}
1111

12+
func TestComPare(t *testing.T) {
13+
A := "梦是美好的吗?我不能确定,但也不是悲伤的。天使的彩色羽翼张开,羽毛飘落,每一片羽毛都有一个梦,你的梦是什么颜色的羽毛?"
14+
B := "我不能确定,但也不是悲伤的。天使的彩色羽翼张开,羽毛飘落,每一片羽毛都有一个梦,你的梦是什么颜色的羽毛?"
15+
t.Log(ComPare(A, B))
16+
}
17+
1218
func TestSego(t *testing.T) {
1319
var segmenter sego.Segmenter
1420
segmenter.LoadDictionary("./dictionary.txt")
1521
segments := segmenter.Segment([]byte("梦是美好的吗?我不能确定,但也不是悲伤的。天使的彩色羽翼张开,羽毛飘落,每一片羽毛都有一个梦,你的梦是什么颜色的羽毛?"))
1622
a := sego.SegmentsToSlice(segments, false)
17-
b := sego.SegmentsToString(segments, true)
23+
b := sego.SegmentsToSlice(segments, true)
1824
for _, v := range a {
1925
t.Log("a >>>>>>", v)
2026
}

0 commit comments

Comments
 (0)