From 4be711522a77760abc5de9c701f1e68b46c38b5d Mon Sep 17 00:00:00 2001 From: Leopoldo Ribeiro Date: Mon, 25 Jan 2021 08:23:41 -0300 Subject: [PATCH 1/5] First commit with setup and DVC files --- .dvc/.gitignore | 3 + .dvc/config | 5 ++ .dvc/plots/confusion.json | 107 ++++++++++++++++++++++++ .dvc/plots/confusion_normalized.json | 112 ++++++++++++++++++++++++++ .dvc/plots/default.json | 31 +++++++ .dvc/plots/linear.json | 116 +++++++++++++++++++++++++++ .dvc/plots/scatter.json | 104 ++++++++++++++++++++++++ .dvc/plots/smooth.json | 39 +++++++++ .dvcignore | 3 + data/raw/.gitignore | 2 + data/raw/train.dvc | 5 ++ data/raw/val.dvc | 5 ++ 12 files changed, 532 insertions(+) create mode 100644 .dvc/.gitignore create mode 100644 .dvc/config create mode 100644 .dvc/plots/confusion.json create mode 100644 .dvc/plots/confusion_normalized.json create mode 100644 .dvc/plots/default.json create mode 100644 .dvc/plots/linear.json create mode 100644 .dvc/plots/scatter.json create mode 100644 .dvc/plots/smooth.json create mode 100644 .dvcignore create mode 100644 data/raw/train.dvc create mode 100644 data/raw/val.dvc diff --git a/.dvc/.gitignore b/.dvc/.gitignore new file mode 100644 index 00000000..528f30c7 --- /dev/null +++ b/.dvc/.gitignore @@ -0,0 +1,3 @@ +/config.local +/tmp +/cache diff --git a/.dvc/config b/.dvc/config new file mode 100644 index 00000000..41833397 --- /dev/null +++ b/.dvc/config @@ -0,0 +1,5 @@ +[core] + analytics = false + remote = remote_storage +['remote "remote_storage"'] + url = /home/leo/estudodvc/dvc_remote diff --git a/.dvc/plots/confusion.json b/.dvc/plots/confusion.json new file mode 100644 index 00000000..af1b48d0 --- /dev/null +++ b/.dvc/plots/confusion.json @@ -0,0 +1,107 @@ +{ + "$schema": "https://vega.github.io/schema/vega-lite/v4.json", + "data": { + "values": "" + }, + "title": "", + "facet": { + "field": "rev", + "type": "nominal" + }, + "spec": { + "transform": [ + { + "aggregate": [ + { + "op": "count", + "as": "xy_count" + } + ], + "groupby": [ + "", + "" + ] + }, + { + "impute": "xy_count", + "groupby": [ + "rev", + "" + ], + "key": "", + "value": 0 + }, + { + "impute": "xy_count", + "groupby": [ + "rev", + "" + ], + "key": "", + "value": 0 + }, + { + "joinaggregate": [ + { + "op": "max", + "field": "xy_count", + "as": "max_count" + } + ], + "groupby": [] + }, + { + "calculate": "datum.xy_count / datum.max_count", + "as": "percent_of_max" + } + ], + "encoding": { + "x": { + "field": "", + "type": "nominal", + "sort": "ascending", + "title": "" + }, + "y": { + "field": "", + "type": "nominal", + "sort": "ascending", + "title": "" + } + }, + "layer": [ + { + "mark": "rect", + "width": 300, + "height": 300, + "encoding": { + "color": { + "field": "xy_count", + "type": "quantitative", + "title": "", + "scale": { + "domainMin": 0, + "nice": true + } + } + } + }, + { + "mark": "text", + "encoding": { + "text": { + "field": "xy_count", + "type": "quantitative" + }, + "color": { + "condition": { + "test": "datum.percent_of_max > 0.5", + "value": "white" + }, + "value": "black" + } + } + } + ] + } +} diff --git a/.dvc/plots/confusion_normalized.json b/.dvc/plots/confusion_normalized.json new file mode 100644 index 00000000..1d38849f --- /dev/null +++ b/.dvc/plots/confusion_normalized.json @@ -0,0 +1,112 @@ +{ + "$schema": "https://vega.github.io/schema/vega-lite/v4.json", + "data": { + "values": "" + }, + "title": "", + "facet": { + "field": "rev", + "type": "nominal" + }, + "spec": { + "transform": [ + { + "aggregate": [ + { + "op": "count", + "as": "xy_count" + } + ], + "groupby": [ + "", + "" + ] + }, + { + "impute": "xy_count", + "groupby": [ + "rev", + "" + ], + "key": "", + "value": 0 + }, + { + "impute": "xy_count", + "groupby": [ + "rev", + "" + ], + "key": "", + "value": 0 + }, + { + "joinaggregate": [ + { + "op": "sum", + "field": "xy_count", + "as": "sum_y" + } + ], + "groupby": [ + "" + ] + }, + { + "calculate": "datum.xy_count / datum.sum_y", + "as": "percent_of_y" + } + ], + "encoding": { + "x": { + "field": "", + "type": "nominal", + "sort": "ascending", + "title": "" + }, + "y": { + "field": "", + "type": "nominal", + "sort": "ascending", + "title": "" + } + }, + "layer": [ + { + "mark": "rect", + "width": 300, + "height": 300, + "encoding": { + "color": { + "field": "percent_of_y", + "type": "quantitative", + "title": "", + "scale": { + "domain": [ + 0, + 1 + ] + } + } + } + }, + { + "mark": "text", + "encoding": { + "text": { + "field": "percent_of_y", + "type": "quantitative", + "format": ".2f" + }, + "color": { + "condition": { + "test": "datum.percent_of_y > 0.5", + "value": "white" + }, + "value": "black" + } + } + } + ] + } +} diff --git a/.dvc/plots/default.json b/.dvc/plots/default.json new file mode 100644 index 00000000..9cf71ce0 --- /dev/null +++ b/.dvc/plots/default.json @@ -0,0 +1,31 @@ +{ + "$schema": "https://vega.github.io/schema/vega-lite/v4.json", + "data": { + "values": "" + }, + "title": "", + "width": 300, + "height": 300, + "mark": { + "type": "line" + }, + "encoding": { + "x": { + "field": "", + "type": "quantitative", + "title": "" + }, + "y": { + "field": "", + "type": "quantitative", + "title": "", + "scale": { + "zero": false + } + }, + "color": { + "field": "rev", + "type": "nominal" + } + } +} diff --git a/.dvc/plots/linear.json b/.dvc/plots/linear.json new file mode 100644 index 00000000..65549f9e --- /dev/null +++ b/.dvc/plots/linear.json @@ -0,0 +1,116 @@ +{ + "$schema": "https://vega.github.io/schema/vega-lite/v4.json", + "data": { + "values": "" + }, + "title": "", + "width": 300, + "height": 300, + "layer": [ + { + "encoding": { + "x": { + "field": "", + "type": "quantitative", + "title": "" + }, + "y": { + "field": "", + "type": "quantitative", + "title": "", + "scale": { + "zero": false + } + }, + "color": { + "field": "rev", + "type": "nominal" + } + }, + "layer": [ + { + "mark": "line" + }, + { + "selection": { + "label": { + "type": "single", + "nearest": true, + "on": "mouseover", + "encodings": [ + "x" + ], + "empty": "none", + "clear": "mouseout" + } + }, + "mark": "point", + "encoding": { + "opacity": { + "condition": { + "selection": "label", + "value": 1 + }, + "value": 0 + } + } + } + ] + }, + { + "transform": [ + { + "filter": { + "selection": "label" + } + } + ], + "layer": [ + { + "mark": { + "type": "rule", + "color": "gray" + }, + "encoding": { + "x": { + "field": "", + "type": "quantitative" + } + } + }, + { + "encoding": { + "text": { + "type": "quantitative", + "field": "" + }, + "x": { + "field": "", + "type": "quantitative" + }, + "y": { + "field": "", + "type": "quantitative" + } + }, + "layer": [ + { + "mark": { + "type": "text", + "align": "left", + "dx": 5, + "dy": -5 + }, + "encoding": { + "color": { + "type": "nominal", + "field": "rev" + } + } + } + ] + } + ] + } + ] +} diff --git a/.dvc/plots/scatter.json b/.dvc/plots/scatter.json new file mode 100644 index 00000000..9af9304c --- /dev/null +++ b/.dvc/plots/scatter.json @@ -0,0 +1,104 @@ +{ + "$schema": "https://vega.github.io/schema/vega-lite/v4.json", + "data": { + "values": "" + }, + "title": "", + "width": 300, + "height": 300, + "layer": [ + { + "encoding": { + "x": { + "field": "", + "type": "quantitative", + "title": "" + }, + "y": { + "field": "", + "type": "quantitative", + "title": "", + "scale": { + "zero": false + } + }, + "color": { + "field": "rev", + "type": "nominal" + } + }, + "layer": [ + { + "mark": "point" + }, + { + "selection": { + "label": { + "type": "single", + "nearest": true, + "on": "mouseover", + "encodings": [ + "x" + ], + "empty": "none", + "clear": "mouseout" + } + }, + "mark": "point", + "encoding": { + "opacity": { + "condition": { + "selection": "label", + "value": 1 + }, + "value": 0 + } + } + } + ] + }, + { + "transform": [ + { + "filter": { + "selection": "label" + } + } + ], + "layer": [ + { + "encoding": { + "text": { + "type": "quantitative", + "field": "" + }, + "x": { + "field": "", + "type": "quantitative" + }, + "y": { + "field": "", + "type": "quantitative" + } + }, + "layer": [ + { + "mark": { + "type": "text", + "align": "left", + "dx": 5, + "dy": -5 + }, + "encoding": { + "color": { + "type": "nominal", + "field": "rev" + } + } + } + ] + } + ] + } + ] +} diff --git a/.dvc/plots/smooth.json b/.dvc/plots/smooth.json new file mode 100644 index 00000000..d497ce75 --- /dev/null +++ b/.dvc/plots/smooth.json @@ -0,0 +1,39 @@ +{ + "$schema": "https://vega.github.io/schema/vega-lite/v4.json", + "data": { + "values": "" + }, + "title": "", + "mark": { + "type": "line" + }, + "encoding": { + "x": { + "field": "", + "type": "quantitative", + "title": "" + }, + "y": { + "field": "", + "type": "quantitative", + "title": "", + "scale": { + "zero": false + } + }, + "color": { + "field": "rev", + "type": "nominal" + } + }, + "transform": [ + { + "loess": "", + "on": "", + "groupby": [ + "rev" + ], + "bandwidth": 0.3 + } + ] +} diff --git a/.dvcignore b/.dvcignore new file mode 100644 index 00000000..51973055 --- /dev/null +++ b/.dvcignore @@ -0,0 +1,3 @@ +# Add patterns of files dvc should ignore, which could improve +# the performance. Learn more at +# https://dvc.org/doc/user-guide/dvcignore diff --git a/data/raw/.gitignore b/data/raw/.gitignore index e69de29b..a5d9d98f 100644 --- a/data/raw/.gitignore +++ b/data/raw/.gitignore @@ -0,0 +1,2 @@ +/train +/val diff --git a/data/raw/train.dvc b/data/raw/train.dvc new file mode 100644 index 00000000..f8e77548 --- /dev/null +++ b/data/raw/train.dvc @@ -0,0 +1,5 @@ +outs: +- md5: b673168a4895bd24aef44f431ecb39a4.dir + size: 75302934 + nfiles: 9469 + path: train diff --git a/data/raw/val.dvc b/data/raw/val.dvc new file mode 100644 index 00000000..237148f4 --- /dev/null +++ b/data/raw/val.dvc @@ -0,0 +1,5 @@ +outs: +- md5: 0ad4dcf197b452735726bf8d8777201d.dir + size: 31248080 + nfiles: 3925 + path: val From 3cc9372211158f1836081758a9d248090f182752 Mon Sep 17 00:00:00 2001 From: Leopoldo Ribeiro Date: Tue, 26 Jan 2021 09:06:45 -0300 Subject: [PATCH 2/5] Created train and test CSV files --- data/prepared/.gitignore | 2 ++ data/prepared/test.csv.dvc | 4 ++++ data/prepared/train.csv.dvc | 4 ++++ 3 files changed, 10 insertions(+) create mode 100644 data/prepared/test.csv.dvc create mode 100644 data/prepared/train.csv.dvc diff --git a/data/prepared/.gitignore b/data/prepared/.gitignore index e69de29b..22a65dd9 100644 --- a/data/prepared/.gitignore +++ b/data/prepared/.gitignore @@ -0,0 +1,2 @@ +/train.csv +/test.csv diff --git a/data/prepared/test.csv.dvc b/data/prepared/test.csv.dvc new file mode 100644 index 00000000..1cf9fbc4 --- /dev/null +++ b/data/prepared/test.csv.dvc @@ -0,0 +1,4 @@ +outs: +- md5: 97295a44585b235c1a4a4620b9c303c1 + size: 77803 + path: test.csv diff --git a/data/prepared/train.csv.dvc b/data/prepared/train.csv.dvc new file mode 100644 index 00000000..eaa770b9 --- /dev/null +++ b/data/prepared/train.csv.dvc @@ -0,0 +1,4 @@ +outs: +- md5: d9ffb53c26a0a73b17a45bd0f21d4677 + size: 193348 + path: train.csv From c51da8d810fa1ceb27a853891e0c1d4dc75828eb Mon Sep 17 00:00:00 2001 From: Leopoldo Ribeiro Date: Tue, 26 Jan 2021 13:15:09 -0300 Subject: [PATCH 3/5] Trained an SGD classifier --- model/.gitignore | 1 + model/model.joblib.dvc | 4 ++++ 2 files changed, 5 insertions(+) create mode 100644 model/model.joblib.dvc diff --git a/model/.gitignore b/model/.gitignore index e69de29b..565a9d50 100644 --- a/model/.gitignore +++ b/model/.gitignore @@ -0,0 +1 @@ +/model.joblib diff --git a/model/model.joblib.dvc b/model/model.joblib.dvc new file mode 100644 index 00000000..36d6a9f6 --- /dev/null +++ b/model/model.joblib.dvc @@ -0,0 +1,4 @@ +outs: +- md5: d61d186f253f3a47a7dbe1cbebb6e6d5 + size: 241086 + path: model.joblib From 7fdc0df7d99cee5dccd432adcb845fe4678d95f2 Mon Sep 17 00:00:00 2001 From: Leopoldo Ribeiro Date: Tue, 26 Jan 2021 13:42:16 -0300 Subject: [PATCH 4/5] Evaluate the SGD model accuracy --- metrics/accuracy.json | 1 + 1 file changed, 1 insertion(+) create mode 100644 metrics/accuracy.json diff --git a/metrics/accuracy.json b/metrics/accuracy.json new file mode 100644 index 00000000..c443dd91 --- /dev/null +++ b/metrics/accuracy.json @@ -0,0 +1 @@ +{"accuracy": 0.788339670468948} \ No newline at end of file From 498b7008525e7cc551ae1727b835d55fef62d0c4 Mon Sep 17 00:00:00 2001 From: Leopoldo Ribeiro Date: Tue, 26 Jan 2021 14:26:12 -0300 Subject: [PATCH 5/5] Change SGD max_iter to 100 --- metrics/accuracy.json | 2 +- model/model.joblib.dvc | 2 +- src/train.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/metrics/accuracy.json b/metrics/accuracy.json index c443dd91..7e85b934 100644 --- a/metrics/accuracy.json +++ b/metrics/accuracy.json @@ -1 +1 @@ -{"accuracy": 0.788339670468948} \ No newline at end of file +{"accuracy": 0.7870722433460076} \ No newline at end of file diff --git a/model/model.joblib.dvc b/model/model.joblib.dvc index 36d6a9f6..33b0cf4c 100644 --- a/model/model.joblib.dvc +++ b/model/model.joblib.dvc @@ -1,4 +1,4 @@ outs: -- md5: d61d186f253f3a47a7dbe1cbebb6e6d5 +- md5: 2011bd3a9f08c3535d0396eb7e5aa0b7 size: 241086 path: model.joblib diff --git a/src/train.py b/src/train.py index e5feeda6..5d3c7c2c 100644 --- a/src/train.py +++ b/src/train.py @@ -37,7 +37,7 @@ def load_data(data_path): def main(repo_path): train_csv_path = repo_path / "data/prepared/train.csv" train_data, labels = load_data(train_csv_path) - sgd = SGDClassifier(max_iter=10) + sgd = SGDClassifier(max_iter=100) trained_model = sgd.fit(train_data, labels) dump(trained_model, repo_path / "model/model.joblib")