From 18d9f809b6a3de11c7e0540fb3f79ff2aecfaa1b Mon Sep 17 00:00:00 2001 From: Greg Mingas Date: Mon, 12 Apr 2021 12:03:55 +0100 Subject: [PATCH 01/10] Modify makefile to use Kaggle for downloading household data --- Makefile | 11 ++++++++++- env-configuration/requirements.txt | 1 + 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index f28e79e..b8110f7 100644 --- a/Makefile +++ b/Makefile @@ -52,6 +52,7 @@ ADD_PROVENANCE = $(PROVENANCE_DEF) && provenance ## set data file paths AE_DEIDENTIFIED_DATA = generator-outputs/odi-nhs-ae/hospital_ae_data_deidentify.csv generator-outputs/odi-nhs-ae/hospital_ae_data_deidentify.json LONDON_POSTCODES = generators/odi-nhs-ae/data/London\ postcodes.csv +HH_DATA = generators/household_poverty/data\ train.csv HP_DATA_CLEAN = generator-outputs/household_poverty/train_cleaned.csv generator-outputs/household_poverty/train_cleaned.json generated-data: $(AE_DEIDENTIFIED_DATA) $(HP_DATA_CLEAN) @@ -62,6 +63,14 @@ $(LONDON_POSTCODES): curl -o "./data/London postcodes.csv" \ https://www.doogal.co.uk/UKPostcodesCSV.ashx?region=E12000007 +# download the Household poverty dataset from Kaggle using API +$(HH_DATA): + cd generators/household_poverty/data/ && \ + kaggle competitions download -c costa-rican-household-poverty-prediction && \ + unzip costa-rican-household-poverty-prediction.zip -d unzipped && \ + cp unzipped/train.csv . && \ + rm -rf unzipped + # make the "A&E deidentified" generated dataset # this is currently the only generated dataset, so it is handled with # its own rule @@ -75,7 +84,7 @@ $(AE_DEIDENTIFIED_DATA) &: $(LONDON_POSTCODES) $(PYTHON) $(QUIPP_ROOT)/generators/household_poverty/clean.py # pre-process the Household Poverty dataset -$(HP_DATA_CLEAN): +$(HP_DATA_CLEAN): $(HH_DATA) mkdir -p generator-outputs/household_poverty/ && \ cd generator-outputs/household_poverty/ && \ $(PYTHON) $(QUIPP_ROOT)/generators/household_poverty/clean.py diff --git a/env-configuration/requirements.txt b/env-configuration/requirements.txt index 15626c9..78c8e07 100644 --- a/env-configuration/requirements.txt +++ b/env-configuration/requirements.txt @@ -13,3 +13,4 @@ shap ipython numpy>=1.20 pulp +kaggle From c8b8ea6d6bc1dc5abbc7183e6ad7b9fc0381b7ba Mon Sep 17 00:00:00 2001 From: Greg Mingas Date: Mon, 12 Apr 2021 12:12:14 +0100 Subject: [PATCH 02/10] Add kaggle username and key to CI script --- .github/workflows/run-synth-pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run-synth-pipeline.yml b/.github/workflows/run-synth-pipeline.yml index 87748dc..6b77638 100644 --- a/.github/workflows/run-synth-pipeline.yml +++ b/.github/workflows/run-synth-pipeline.yml @@ -21,4 +21,4 @@ jobs: run: docker build -f dockerfiles/quipp-dev.Dockerfile -t turinginst/quipp-env:latest . - name: Run pipeline - run: docker run turinginst/quipp-env:latest make + run: docker run -e KAGGLE_USERNAME=${{ secrets.USERNAME}} -e KAGGLE_KEY=$${{ secrets.KAGGLE_KEY}} turinginst/quipp-env:latest make From 2f3828f4825baa4089c19381da98270a5b8cf716 Mon Sep 17 00:00:00 2001 From: Greg Mingas Date: Mon, 12 Apr 2021 13:32:12 +0100 Subject: [PATCH 03/10] Fix name of kaggle username variable --- .github/workflows/run-synth-pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run-synth-pipeline.yml b/.github/workflows/run-synth-pipeline.yml index 6b77638..ea9c8d2 100644 --- a/.github/workflows/run-synth-pipeline.yml +++ b/.github/workflows/run-synth-pipeline.yml @@ -21,4 +21,4 @@ jobs: run: docker build -f dockerfiles/quipp-dev.Dockerfile -t turinginst/quipp-env:latest . - name: Run pipeline - run: docker run -e KAGGLE_USERNAME=${{ secrets.USERNAME}} -e KAGGLE_KEY=$${{ secrets.KAGGLE_KEY}} turinginst/quipp-env:latest make + run: docker run -e KAGGLE_USERNAME=${{ secrets.KAGGLE_USERNAME}} -e KAGGLE_KEY=$${{ secrets.KAGGLE_KEY}} turinginst/quipp-env:latest make From bd6d60bb52ca8701e449a9b07b0f4de86a6dcf49 Mon Sep 17 00:00:00 2001 From: Greg Mingas Date: Mon, 12 Apr 2021 13:41:09 +0100 Subject: [PATCH 04/10] Remove erroneous lines from makefile --- Makefile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Makefile b/Makefile index b8110f7..a33a498 100644 --- a/Makefile +++ b/Makefile @@ -79,9 +79,7 @@ $(AE_DEIDENTIFIED_DATA) &: $(LONDON_POSTCODES) mkdir -p generator-outputs/household_poverty/ && \ cd generator-outputs/odi-nhs-ae/ && \ $(PYTHON) $(QUIPP_ROOT)/generators/odi-nhs-ae/generate.py && \ - $(PYTHON) $(QUIPP_ROOT)/generators/odi-nhs-ae/deidentify.py && \ - cd ../household_poverty/ && \ - $(PYTHON) $(QUIPP_ROOT)/generators/household_poverty/clean.py + $(PYTHON) $(QUIPP_ROOT)/generators/odi-nhs-ae/deidentify.py # pre-process the Household Poverty dataset $(HP_DATA_CLEAN): $(HH_DATA) From eb1cae61ce3f91e9b3a96094d87557ffdcb5b4b6 Mon Sep 17 00:00:00 2001 From: Oscar Giles Date: Mon, 12 Apr 2021 14:30:42 +0100 Subject: [PATCH 05/10] Remove erroneous lines in makefile --- Makefile | 2 -- 1 file changed, 2 deletions(-) diff --git a/Makefile b/Makefile index b8110f7..25c9807 100644 --- a/Makefile +++ b/Makefile @@ -80,8 +80,6 @@ $(AE_DEIDENTIFIED_DATA) &: $(LONDON_POSTCODES) cd generator-outputs/odi-nhs-ae/ && \ $(PYTHON) $(QUIPP_ROOT)/generators/odi-nhs-ae/generate.py && \ $(PYTHON) $(QUIPP_ROOT)/generators/odi-nhs-ae/deidentify.py && \ - cd ../household_poverty/ && \ - $(PYTHON) $(QUIPP_ROOT)/generators/household_poverty/clean.py # pre-process the Household Poverty dataset $(HP_DATA_CLEAN): $(HH_DATA) From 8473f6985eb1f47c8e966701b18f15edb4d7e8ab Mon Sep 17 00:00:00 2001 From: Greg Mingas Date: Mon, 12 Apr 2021 15:03:10 +0100 Subject: [PATCH 06/10] Add creation of Kaggle token file from environment variables --- Makefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index a33a498..25baeeb 100644 --- a/Makefile +++ b/Makefile @@ -65,11 +65,13 @@ $(LONDON_POSTCODES): # download the Household poverty dataset from Kaggle using API $(HH_DATA): + touch ~/.kaggle/kaggle.json + dest=~/.kaggle/kaggle.json + echo "{\"username\":\"${secrets.KAGGLE_USERNAME}\", \"keys\":\"${secrets.KAGGLE_KEY}\"}" > "$dest" cd generators/household_poverty/data/ && \ kaggle competitions download -c costa-rican-household-poverty-prediction && \ unzip costa-rican-household-poverty-prediction.zip -d unzipped && \ - cp unzipped/train.csv . && \ - rm -rf unzipped + cp unzipped/train.csv . # make the "A&E deidentified" generated dataset # this is currently the only generated dataset, so it is handled with From 3cd5ac6ea8cafaff034f8553894d693c205fc694 Mon Sep 17 00:00:00 2001 From: Oscar Giles Date: Mon, 12 Apr 2021 15:03:31 +0100 Subject: [PATCH 07/10] Try to read secret --- .github/workflows/run-synth-pipeline.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/run-synth-pipeline.yml b/.github/workflows/run-synth-pipeline.yml index ea9c8d2..5eeee06 100644 --- a/.github/workflows/run-synth-pipeline.yml +++ b/.github/workflows/run-synth-pipeline.yml @@ -16,6 +16,10 @@ jobs: with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PAC }} + + - + name: Logging into kaggle + run: echo ${{ secrets.KAGGLE_USERNAME}} - name: Build pipeline run: docker build -f dockerfiles/quipp-dev.Dockerfile -t turinginst/quipp-env:latest . From c2bf876ac345df2e41ae95427aadb50e51d09317 Mon Sep 17 00:00:00 2001 From: Oscar Giles Date: Mon, 12 Apr 2021 15:05:04 +0100 Subject: [PATCH 08/10] Remove echo --- .github/workflows/run-synth-pipeline.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/run-synth-pipeline.yml b/.github/workflows/run-synth-pipeline.yml index 5eeee06..ea9c8d2 100644 --- a/.github/workflows/run-synth-pipeline.yml +++ b/.github/workflows/run-synth-pipeline.yml @@ -16,10 +16,6 @@ jobs: with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PAC }} - - - - name: Logging into kaggle - run: echo ${{ secrets.KAGGLE_USERNAME}} - name: Build pipeline run: docker build -f dockerfiles/quipp-dev.Dockerfile -t turinginst/quipp-env:latest . From 923c1f890f4406702aac638efacd78095b135a9e Mon Sep 17 00:00:00 2001 From: Greg Mingas Date: Mon, 12 Apr 2021 15:10:26 +0100 Subject: [PATCH 09/10] Add directory creation for kaggle in makefile --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index 25baeeb..d3dd504 100644 --- a/Makefile +++ b/Makefile @@ -65,6 +65,7 @@ $(LONDON_POSTCODES): # download the Household poverty dataset from Kaggle using API $(HH_DATA): + mkdir ~/.kaggle touch ~/.kaggle/kaggle.json dest=~/.kaggle/kaggle.json echo "{\"username\":\"${secrets.KAGGLE_USERNAME}\", \"keys\":\"${secrets.KAGGLE_KEY}\"}" > "$dest" From 76bf9fb5b3296505667dd5ab93c8bd926e3c3ee5 Mon Sep 17 00:00:00 2001 From: Oscar Giles Date: Mon, 12 Apr 2021 15:17:35 +0100 Subject: [PATCH 10/10] Use environment vars --- .github/workflows/run-synth-pipeline.yml | 5 ++++- Makefile | 3 --- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/run-synth-pipeline.yml b/.github/workflows/run-synth-pipeline.yml index ea9c8d2..c31f3d7 100644 --- a/.github/workflows/run-synth-pipeline.yml +++ b/.github/workflows/run-synth-pipeline.yml @@ -21,4 +21,7 @@ jobs: run: docker build -f dockerfiles/quipp-dev.Dockerfile -t turinginst/quipp-env:latest . - name: Run pipeline - run: docker run -e KAGGLE_USERNAME=${{ secrets.KAGGLE_USERNAME}} -e KAGGLE_KEY=$${{ secrets.KAGGLE_KEY}} turinginst/quipp-env:latest make + env: + KAGGLE_USERNAME: ${{ secrets.KAGGLE_USERNAME }} + KAGGLE_KEY: ${{ secrets.KAGGLE_KEY }} + run: docker run -e KAGGLE_USERNAME=$KAGGLE_USERNAME -e KAGGLE_KEY=$KAGGLE_KEY turinginst/quipp-env:latest make diff --git a/Makefile b/Makefile index 25baeeb..4060a65 100644 --- a/Makefile +++ b/Makefile @@ -65,9 +65,6 @@ $(LONDON_POSTCODES): # download the Household poverty dataset from Kaggle using API $(HH_DATA): - touch ~/.kaggle/kaggle.json - dest=~/.kaggle/kaggle.json - echo "{\"username\":\"${secrets.KAGGLE_USERNAME}\", \"keys\":\"${secrets.KAGGLE_KEY}\"}" > "$dest" cd generators/household_poverty/data/ && \ kaggle competitions download -c costa-rican-household-poverty-prediction && \ unzip costa-rican-household-poverty-prediction.zip -d unzipped && \