-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathdetect.go
191 lines (162 loc) · 4.78 KB
/
detect.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
package detector
import (
"bufio"
"io"
// "log"
"math"
"regexp"
)
// Default settings used by New when building a detector.
const (
// sampleLines is the default upper bound on the number of lines sampled
// from the input.
sampleLines = 15
// nonDelimiterRegexString is the default pattern for characters that are
// never counted as delimiter candidates: ASCII letters and digits, line
// feed, carriage return, '@', '.' and space.
nonDelimiterRegexString = `[[:alnum:]\n\r@\. ]`
)
// New returns a Detector initialised with the package defaults: it samples at
// most sampleLines lines and ignores every character matched by
// nonDelimiterRegexString. Use Configure on the result to override either.
func New() Detector {
	d := new(detector)
	d.nonDelimiterRegex = regexp.MustCompile(nonDelimiterRegexString)
	d.sampleLines = sampleLines
	return d
}
// Detector defines the exposed interface for guessing the delimiter of
// delimited text.
type Detector interface {
	// DetectDelimiter samples the data from reader and returns the candidate
	// delimiters, each as a one-character string. Characters that appear
	// between a pair of enclosure bytes are not considered.
	DetectDelimiter(reader io.Reader, enclosure byte) []string

	// Configure overrides the detector settings. sampleLines sets the maximum
	// number of lines to sample and nonDelimiterRegexString sets the pattern
	// of characters that are never treated as delimiters. A nil argument
	// leaves the corresponding setting unchanged.
	Configure(sampleLines *int, nonDelimiterRegexString *string)
}
// detector is the default implementation of Detector.
type detector struct {
// nonDelimiterRegex matches the single characters that sample must not
// count as delimiter candidates.
nonDelimiterRegex *regexp.Regexp
// sampleLines is the maximum number of lines DetectDelimiter asks sample
// to inspect.
sampleLines int
}
// DetectDelimiter samples up to d.sampleLines lines from reader, skipping text
// wrapped in enclosure, and returns the delimiter candidates chosen by analyze
// as strings. The result is nil when there is no candidate.
//
// Each candidate byte is converted with string(byte), so a byte >= 0x80 is
// returned as the UTF-8 encoding of the code point with that value.
func (d *detector) DetectDelimiter(reader io.Reader, enclosure byte) []string {
	table, lines := d.sample(reader, d.sampleLines, enclosure)
	delimiters := d.analyze(table, lines)

	// Keep the slice nil when nothing was found, as callers may rely on it.
	var result []string
	for i := range delimiters {
		result = append(result, string(delimiters[i]))
	}
	return result
}
// Configure overrides the detector settings. A nil argument leaves the
// corresponding setting untouched. The sample line limit is applied before the
// pattern is compiled.
//
// Configure panics if *nonDelimiterRegexString is not a valid regular
// expression, because it is compiled with regexp.MustCompile.
func (d *detector) Configure(sampleLines *int, nonDelimiterRegexString *string) {
	if limit := sampleLines; limit != nil {
		d.sampleLines = *limit
	}

	pattern := nonDelimiterRegexString
	if pattern == nil {
		return
	}
	d.nonDelimiterRegex = regexp.MustCompile(*pattern)
}
// sample walks through reader byte by byte and records, per line, how often
// each delimiter candidate occurs (the frequencyTable). It stops once
// sampleLines lines have been completed, at end of input, or on the first read
// error. It also returns the number of lines actually sampled, which may be
// less than sampleLines; the count starts at 1 even for empty input.
//
// Rules applied to each byte:
//   - enclosure toggles the "enclosed" state; a doubled enclosure inside an
//     enclosed section is an escaped enclosure and is skipped;
//   - outside an enclosed section, '\r' ends a line, and '\n' ends a line
//     unless it follows '\r', is the last byte of the input, or is followed
//     by another '\n';
//   - any other byte outside an enclosed section is counted for the current
//     line unless it matches d.nonDelimiterRegex.
//
// The previous and next bytes are taken from the stream itself rather than
// from a fixed-size chunk, so the result does not depend on how the
// underlying reader splits the data.
func (d *detector) sample(reader io.Reader, sampleLines int, enclosure byte) (frequencies frequencyTable, actualSampleLines int) {
	bufferedReader := bufio.NewReader(reader)
	frequencies = createFrequencyTable()
	actualSampleLines = 1

	enclosed := false
	var prev byte // zero before the first byte of the input
	for {
		current, err := bufferedReader.ReadByte()
		if err != nil {
			// End of input or read failure: report what was sampled so far.
			return frequencies, actualSampleLines
		}

		// Look one byte ahead without consuming it. A short Peek means the
		// input ends here (or will fail on the next ReadByte, which then
		// terminates the loop), so next stays zero and the error is dropped.
		var next byte
		if peeked, _ := bufferedReader.Peek(1); len(peeked) == 1 {
			next = peeked[0]
		}

		switch {
		case current == enclosure:
			if enclosed && next == enclosure {
				// Escaped enclosure: consume its second half as well.
				if _, err := bufferedReader.Discard(1); err != nil {
					return frequencies, actualSampleLines
				}
			} else {
				enclosed = !enclosed
			}
		case !enclosed && (current == '\r' ||
			(current == '\n' && prev != '\r' && next != 0 && next != '\n')):
			if actualSampleLines == sampleLines {
				return frequencies, actualSampleLines
			}
			actualSampleLines++
		case !enclosed:
			if !d.nonDelimiterRegex.MatchString(string(current)) {
				frequencies.increment(current, actualSampleLines)
			}
		}
		prev = current
	}
}
// analyze is built on the observation that a delimiter appears the same number
// of times on every line. For each candidate it computes the mean absolute
// deviation of its per-line frequency over lines 1..sampleLine (a line with no
// entry counts as 0). Every candidate whose deviation is 0 is returned.
// If there is none, the single candidate with the smallest deviation is
// returned instead.
//
// Candidates are examined in ascending byte order, so the result order and the
// winner among equally good fallbacks are deterministic. The result is nil
// when ft is empty or sampleLine is not positive.
func (d *detector) analyze(ft frequencyTable, sampleLine int) []byte {
	if sampleLine <= 0 {
		// No lines to average over; avoid dividing by zero.
		return nil
	}

	// meanAbsDeviation returns the average distance between the per-line
	// frequency and its mean across the sampled lines.
	meanAbsDeviation := func(frequencyOfLine map[int]int) float64 {
		total := 0
		for line := 1; line <= sampleLine; line++ {
			total += frequencyOfLine[line]
		}
		average := float64(total) / float64(sampleLine)

		var sum float64
		for line := 1; line <= sampleLine; line++ {
			sum += math.Abs(average - float64(frequencyOfLine[line]))
		}
		return sum / float64(sampleLine)
	}

	var (
		candidates   []byte
		minDeviation float64
		minDelimiter byte
		haveMin      bool
	)
	// Walk the byte range instead of ranging over the map: map iteration
	// order is random, which would make the output order unstable.
	for b := 0; b <= math.MaxUint8; b++ {
		delimiter := byte(b)
		frequencyOfLine, ok := ft[delimiter]
		if !ok {
			continue
		}
		dev := meanAbsDeviation(frequencyOfLine)
		switch {
		case dev == 0:
			candidates = append(candidates, delimiter)
		case !haveMin || dev < minDeviation:
			// Remember the best imperfect candidate as a fallback.
			haveMin = true
			minDeviation, minDelimiter = dev, delimiter
		}
	}

	// If no zero-deviation candidate was found, pick the minimum one.
	if len(candidates) == 0 && haveMin {
		candidates = append(candidates, minDelimiter)
	}
	return candidates
}
// frequencyTable remembers how often each character occurs on each line.
// The outer key is the character and the inner key is the line number, so
// frequencyTable['.'][11] is the frequency of char '.' at line 11.
// A character or line with no entry has not been counted.
type frequencyTable map[byte]map[int]int
// createFrequencyTable returns a new, empty frequencyTable that is ready to
// be written to.
func createFrequencyTable() frequencyTable {
	return frequencyTable{}
}
// increment adds one to the frequency of ch at line, creating the per-line
// map for ch on first use. It returns f so calls can be chained.
func (f frequencyTable) increment(ch byte, line int) frequencyTable {
	perLine, ok := f[ch]
	if !ok {
		perLine = make(map[int]int)
		f[ch] = perLine
	}
	// A missing line reads as zero, so no explicit initialisation is needed.
	perLine[line]++
	return f
}