Skip to content

Commit 4aa56d6

Browse files
authored
Fix issue 137: added Transform.CurrentRawRecord() for caller of omniparser to access the raw ingested record. (#138)
See details in #137.
1 parent 69749f5 commit 4aa56d6

19 files changed

+3606
-3314
lines changed

README.md

+4-2
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,9 @@
55
[![PkgGoDev](https://pkg.go.dev/badge/github.com/jf-tech/omniparser)](https://pkg.go.dev/github.com/jf-tech/omniparser)
66
[![Mentioned in Awesome Go](https://awesome.re/mentioned-badge.svg)](https://github.com/avelino/awesome-go)
77

8-
Omniparser is a native Golang ETL parser that ingests input data of various formats (**CSV, txt, fixed length/width, XML, EDI/X12/EDIFACT, JSON**, and
9-
custom formats) in streaming fashion and transforms data into desired JSON output based on a schema written in JSON.
8+
Omniparser is a native Golang ETL parser that ingests input data of various formats (**CSV, txt, fixed length/width,
9+
XML, EDI/X12/EDIFACT, JSON**, and custom formats) in streaming fashion and transforms data into desired JSON output
10+
based on a schema written in JSON.
1011

1112
Golang Version: 1.14
1213

@@ -64,6 +65,7 @@ situations.
6465
- Golang 1.14
6566

6667
## Recent Major Feature Additions/Changes
68+
- Added `Transform.CurrentRawRecord()` for callers of omniparser to access the raw ingested record.
6769
- Deprecated `custom_parse` in favor of `custom_func` (`custom_parse` is still usable for
6870
back-compatibility; it is just removed from all public docs and samples).
6971
- Added `NonValidatingReader` EDI segment reader.

doc/programmability.md

+6-1
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,11 @@ for {
3535
}
3636
if err != nil { ... }
3737
// output contains a []byte of the ingested and transformed record.
38+
39+
raw, err := transform.CurrentRawRecord()
40+
if err != nil { ... }
41+
rawRecord := raw.(*omniv21.RawRecord) // assuming the schema is of `omni.2.1` version.
42+
fmt.Println(rawRecord.UUIDv3()) // rawRecord.UUIDv3() returns a stable hash of the current raw record.
3843
}
3944
```
4045
Note this out-of-box omniparser setup contains only the `omni.2.1` schema handler, meaning only schemas
@@ -256,4 +261,4 @@ for {
256261
See [IDR](#idr) notes about the JSON/XML readers above.
257262
258263
## XML Reader
259-
See [IDR](#idr) notes about the JSON/XML readers above.
264+
See [IDR](#idr) notes about the JSON/XML readers above.

extensions/omniv21/ingester.go

+30-7
Original file line numberDiff line numberDiff line change
@@ -8,32 +8,55 @@ import (
88
"github.com/jf-tech/omniparser/errs"
99
"github.com/jf-tech/omniparser/extensions/omniv21/fileformat"
1010
"github.com/jf-tech/omniparser/extensions/omniv21/transform"
11+
"github.com/jf-tech/omniparser/idr"
1112
"github.com/jf-tech/omniparser/transformctx"
1213
)
1314

15+
// RawRecord contains the raw data ingested from the input stream in the form of an IDR tree.
16+
// Note callers outside this package should absolutely make **NO** modifications to the content of
17+
// RawRecord. Treat it like read-only.
18+
type RawRecord struct {
19+
Node *idr.Node
20+
}
21+
22+
// UUIDv3 returns a stable MD5(v3) hash of the RawRecord.
23+
func (rr *RawRecord) UUIDv3() string {
24+
hash, _ := customfuncs.UUIDv3(nil, idr.JSONify2(rr.Node))
25+
return hash
26+
}
27+
1428
type ingester struct {
1529
finalOutputDecl *transform.Decl
1630
customFuncs customfuncs.CustomFuncs
1731
customParseFuncs transform.CustomParseFuncs // Deprecated.
1832
ctx *transformctx.Ctx
1933
reader fileformat.FormatReader
34+
rawRecord RawRecord
2035
}
2136

22-
func (g *ingester) Read() ([]byte, error) {
37+
// Read ingests a raw record from the input stream, transforms it according to the given schema, and returns
38+
// the raw record and the transformed JSON bytes.
39+
func (g *ingester) Read() (interface{}, []byte, error) {
40+
if g.rawRecord.Node != nil {
41+
g.reader.Release(g.rawRecord.Node)
42+
g.rawRecord.Node = nil
43+
}
2344
n, err := g.reader.Read()
45+
if n != nil {
46+
g.rawRecord.Node = n
47+
}
2448
if err != nil {
2549
// Read() is supposed to have already done CtxAwareErr error wrapping. So directly return.
26-
return nil, err
50+
return nil, nil, err
2751
}
28-
defer g.reader.Release(n)
29-
result, err := transform.NewParseCtx(
30-
g.ctx, g.customFuncs, g.customParseFuncs).ParseNode(n, g.finalOutputDecl)
52+
result, err := transform.NewParseCtx(g.ctx, g.customFuncs, g.customParseFuncs).ParseNode(n, g.finalOutputDecl)
3153
if err != nil {
3254
// ParseNode() error not CtxAwareErr wrapped, so wrap it.
3355
// Note errs.ErrTransformFailed is a continuable error.
34-
return nil, errs.ErrTransformFailed(g.fmtErrStr("fail to transform. err: %s", err.Error()))
56+
return nil, nil, errs.ErrTransformFailed(g.fmtErrStr("fail to transform. err: %s", err.Error()))
3557
}
36-
return json.Marshal(result)
58+
transformed, err := json.Marshal(result)
59+
return &g.rawRecord, transformed, err
3760
}
3861

3962
func (g *ingester) IsContinuableError(err error) bool {

extensions/omniv21/ingester_test.go

+13-5
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ func (r *testReader) Read() (*idr.Node, error) {
3333
return result, err
3434
}
3535

36-
func (r *testReader) Release(n *idr.Node) { r.releaseCalled++ }
36+
func (r *testReader) Release(_ *idr.Node) { r.releaseCalled++ }
3737

3838
func (r *testReader) IsContinuableError(err error) bool { return err == errContinuableInTest }
3939

@@ -45,9 +45,10 @@ func TestIngester_Read_ReadFailure(t *testing.T) {
4545
g := &ingester{
4646
reader: &testReader{result: []*idr.Node{nil}, err: []error{errors.New("test failure")}},
4747
}
48-
b, err := g.Read()
48+
raw, b, err := g.Read()
4949
assert.Error(t, err)
5050
assert.Equal(t, "test failure", err.Error())
51+
assert.Nil(t, raw)
5152
assert.Nil(t, b)
5253
assert.Equal(t, 0, g.reader.(*testReader).releaseCalled)
5354
}
@@ -64,15 +65,16 @@ func TestIngester_Read_ParseNodeFailure(t *testing.T) {
6465
finalOutputDecl: finalOutputDecl,
6566
reader: &testReader{result: []*idr.Node{ingesterTestNode}, err: []error{nil}},
6667
}
67-
b, err := g.Read()
68+
raw, b, err := g.Read()
6869
assert.Error(t, err)
6970
assert.True(t, errs.IsErrTransformFailed(err))
7071
assert.True(t, g.IsContinuableError(err))
7172
assert.Equal(t,
7273
`ctx: fail to transform. err: unable to convert value 'abc' to type 'int' on 'FINAL_OUTPUT', err: strconv.ParseInt: parsing "abc": invalid syntax`,
7374
err.Error())
75+
assert.Nil(t, raw)
7476
assert.Nil(t, b)
75-
assert.Equal(t, 1, g.reader.(*testReader).releaseCalled)
77+
assert.Equal(t, 0, g.reader.(*testReader).releaseCalled)
7678
}
7779

7880
func TestIngester_Read_Success(t *testing.T) {
@@ -87,9 +89,15 @@ func TestIngester_Read_Success(t *testing.T) {
8789
finalOutputDecl: finalOutputDecl,
8890
reader: &testReader{result: []*idr.Node{ingesterTestNode}, err: []error{nil}},
8991
}
90-
b, err := g.Read()
92+
raw, b, err := g.Read()
9193
assert.NoError(t, err)
94+
assert.Equal(t, "41665284-dab9-300d-b647-7ace9cb514b4", raw.(*RawRecord).UUIDv3())
9295
assert.Equal(t, "123", string(b))
96+
assert.Equal(t, 0, g.reader.(*testReader).releaseCalled)
97+
raw, b, err = g.Read()
98+
assert.Equal(t, io.EOF, err)
99+
assert.Nil(t, raw)
100+
assert.Nil(t, b)
93101
assert.Equal(t, 1, g.reader.(*testReader).releaseCalled)
94102
}
95103

Original file line numberDiff line numberDiff line change
@@ -1,44 +1,56 @@
11
[
22
{
3-
"date": "2019-01-31T12:34:56-08:00",
4-
"high_temperature_fahrenheit": 50.9,
5-
"latitude": 37.7749,
6-
"longitude": 122.4194,
7-
"low_temperature_fahrenheit": 30.2,
8-
"note": "note 1",
9-
"uv_index": [
10-
"12",
11-
"4",
12-
"6"
13-
],
14-
"wind": "North 20.5 mph"
3+
"RawRecord": "{\"DATE\":\"2019/01/31T12:34:56-0800\",\"HIGH_TEMP_C\":\"10.5\",\"LAT\":\"37.7749\",\"LONG\":\"122.4194\",\"LOW_TEMP_F\":\"30.2\",\"NOTE\":\"note 1\",\"UV_INDEX\":\"12/4/6\",\"WIND_DIR\":\"N\",\"WIND_SPEED_KMH\":\"33\"}",
4+
"RawRecordHash": "24a341e6-bdac-3319-ac76-7354d42a7402",
5+
"TransformedRecord": {
6+
"date": "2019-01-31T12:34:56-08:00",
7+
"high_temperature_fahrenheit": 50.9,
8+
"latitude": 37.7749,
9+
"longitude": 122.4194,
10+
"low_temperature_fahrenheit": 30.2,
11+
"note": "note 1",
12+
"uv_index": [
13+
"12",
14+
"4",
15+
"6"
16+
],
17+
"wind": "North 20.5 mph"
18+
}
1519
},
1620
{
17-
"date": "2020-07-31T01:23:45-05:00",
18-
"high_temperature_fahrenheit": 102.2,
19-
"latitude": 32.7767,
20-
"longitude": 96.797,
21-
"low_temperature_fahrenheit": 95,
22-
"note": "' note with bad quotes",
23-
"uv_index": [
24-
"9",
25-
"5",
26-
"6"
27-
],
28-
"wind": "South East 4.97 mph"
21+
"RawRecord": "{\"DATE\":\"2020/07/31T01:23:45-0500\",\"HIGH_TEMP_C\":\"39\",\"LAT\":\"32.7767\",\"LONG\":\"96.7970\",\"LOW_TEMP_F\":\"95\",\"NOTE\":\"' note with bad quotes\",\"UV_INDEX\":\"9/5/6\",\"WIND_DIR\":\"SE\",\"WIND_SPEED_KMH\":\"8\"}",
22+
"RawRecordHash": "dba160be-3cfe-3efc-a891-f76461c37c08",
23+
"TransformedRecord": {
24+
"date": "2020-07-31T01:23:45-05:00",
25+
"high_temperature_fahrenheit": 102.2,
26+
"latitude": 32.7767,
27+
"longitude": 96.797,
28+
"low_temperature_fahrenheit": 95,
29+
"note": "' note with bad quotes",
30+
"uv_index": [
31+
"9",
32+
"5",
33+
"6"
34+
],
35+
"wind": "South East 4.97 mph"
36+
}
2937
},
3038
{
31-
"date": "2030-11-22T20:18:00-05:00",
32-
"high_temperature_fahrenheit": 59.9,
33-
"latitude": 39.0997,
34-
"longitude": 94.5786,
35-
"low_temperature_fahrenheit": 17,
36-
"note": "note 3",
37-
"uv_index": [
38-
"10",
39-
"3",
40-
"4"
41-
],
42-
"wind": "Tornado 111.84 mph"
39+
"RawRecord": "{\"DATE\":\"2030/11/22T20:18:00-0500\",\"HIGH_TEMP_C\":\"15.5\",\"LAT\":\"39.0997\",\"LONG\":\"94.5786\",\"LOW_TEMP_F\":\"17\",\"NOTE\":\"note 3\",\"UV_INDEX\":\"10/3/4\",\"WIND_DIR\":\"X\",\"WIND_SPEED_KMH\":\"180\"}",
40+
"RawRecordHash": "fcdd707d-1ed4-3641-aca3-b0df568b1084",
41+
"TransformedRecord": {
42+
"date": "2030-11-22T20:18:00-05:00",
43+
"high_temperature_fahrenheit": 59.9,
44+
"latitude": 39.0997,
45+
"longitude": 94.5786,
46+
"low_temperature_fahrenheit": 17,
47+
"note": "note 3",
48+
"uv_index": [
49+
"10",
50+
"3",
51+
"4"
52+
],
53+
"wind": "Tornado 111.84 mph"
54+
}
4355
}
4456
]

0 commit comments

Comments
 (0)