From dfa01c093f79b258dc902610ae6b0fd9050935bc Mon Sep 17 00:00:00 2001 From: Anatoly Bubenkov <bubenkoff@gmail.com> Date: Wed, 28 Mar 2018 22:16:19 +0200 Subject: [PATCH] add deviation tolerancy --- detector/Fixtures/test3.csv | 5 +++++ detector/detect.go | 14 +++++++++++++- detector/detect_test.go | 13 +++++++++++++ 3 files changed, 31 insertions(+), 1 deletion(-) create mode 100644 detector/Fixtures/test3.csv diff --git a/detector/Fixtures/test3.csv b/detector/Fixtures/test3.csv new file mode 100644 index 0000000..b5da06a --- /dev/null +++ b/detector/Fixtures/test3.csv @@ -0,0 +1,5 @@ +Email Address,,first name,last name,Gender,Date of birth,country,city,STREET,POSTAL CODE,INTEREST 1 - Producer,INTEREST 2 - 2-Dutch Recs,INTEREST 3 - SKINK,INTEREST 4 - Musical Madness,INTEREST 5 - Dutch Master Works,INTEREST 6 - 4-Dots,INTEREST 7 - Blue Forest,INTEREST 8 - Brooks,INTEREST 9 - PROMOTER GENERAL,INTEREST 10 - PROMOTER CLEAN +some@example.com,First,Last,,,,,,,,,,,,,,Brooks,, +some+1@example.com,First,Last,,,,,,,,,,,,,,Brooks,, +some+2@example.com,First,Last,,,,,,,,,,,,,,Brooks,, +some+3@example.com,First,Last,,,,,,,,,,,,,,Brooks,, diff --git a/detector/detect.go b/detector/detect.go index 4f86e31..abf4355 100644 --- a/detector/detect.go +++ b/detector/detect.go @@ -3,6 +3,7 @@ package detector import ( "bufio" "io" + // "log" "math" "regexp" ) @@ -145,11 +146,22 @@ func (d *detector) analyze(ft frequencyTable, sampleLine int) []byte { } var candidates []byte + var minDeviation float64 + var minDelimiter byte for delimiter, frequencyOfLine := range ft { - if float64(0.0) == deviation(frequencyOfLine, sampleLine) { + dev := deviation(frequencyOfLine, sampleLine) + if float64(0.0) == dev { candidates = append(candidates, delimiter) + } else if minDeviation > dev || minDeviation == 0 { + // find minimum deviation available + minDeviation = dev + minDelimiter = delimiter } } + // if zero deviation candidates are not found, pick the minimum one + if len(candidates) == 0 && minDeviation > 0 { + candidates = append(candidates, minDelimiter) + } return candidates } diff --git a/detector/detect_test.go b/detector/detect_test.go index 42be316..63dc233 100644 --- a/detector/detect_test.go +++ b/detector/detect_test.go @@ -66,6 +66,19 @@ func TestDetectDelimiterComma(t *testing.T) { assert.Equal(t, []string{","}, delimiters) } +func TestDetectDelimiterComma2(t *testing.T) { + detector := New() + sampleLines := 4 + detector.Configure(&sampleLines, nil) + file, err := os.OpenFile("./Fixtures/test3.csv", os.O_RDONLY, os.ModePerm) + assert.NoError(t, err) + defer file.Close() + + delimiters := detector.DetectDelimiter(file, '"') + + assert.Equal(t, []string{","}, delimiters) +} + func TestDetectDelimiterSemicolon(t *testing.T) { detector := New() sampleLines := 4