alan-turing-institute · OscartGiles · Jan 7, 2021 · Jan 7, 2021 · Jan 12, 2021 · Jan 12, 2021
diff --git a/.github/workflows/run-synth-pipeline.yml b/.github/workflows/run-synth-pipeline.yml
@@ -16,9 +16,7 @@ jobs:
       with:
         username: ${{ secrets.DOCKER_USERNAME }}
         password: ${{ secrets.DOCKER_PAC }}
-    - 
-      name: Build pipeline
-      run: docker build -f dockerfiles/quipp-dev.Dockerfile -t turinginst/quipp-env:latest .
     -
-      name: Run pipeline
-      run: docker run turinginst/quipp-env:latest make
+      name: Run pipeline with privbayes-adult
+
+      run: docker run -v $GITHUB_WORKSPACE:/quipp-pipeline --workdir /quipp-pipeline turinginst/quipp-env:base make run-privbayes-adult
diff --git a/Makefile b/Makefile
@@ -1,7 +1,6 @@
 ## --- echo commands (for debugging)
 ## SHELL = sh -xv
 
-
 ##-------------------------------------
 ## Set up main path variables
 ##-------------------------------------
@@ -22,25 +21,46 @@ SYNTH_OUTPUTS_PREFIX = $(addprefix synth-output/,$(RUN_INPUTS_BASE_PREFIX))
 SYNTH_OUTPUTS_CSV = $(addsuffix /synthetic_data_1.csv,$(SYNTH_OUTPUTS_PREFIX))
 
 ## Construct a list of .json file names for each utility and privacy metric
-SYNTH_OUTPUTS_PRIV_DISCL_RISK = $(addsuffix /privacy_disclosure_risk.json,$(SYNTH_OUTPUTS_PREFIX))
-SYNTH_OUTPUTS_UTIL_CLASS = $(addsuffix /utility_classifiers.json,$(SYNTH_OUTPUTS_PREFIX))
+SYNTH_OUTPUTS_PRIV_DISCL_RISK = $(addsuffix /disclosure_risk.json,$(SYNTH_OUTPUTS_PREFIX))
+SYNTH_OUTPUTS_UTIL_CLASS = $(addsuffix /utility_diff.json,$(SYNTH_OUTPUTS_PREFIX))
 SYNTH_OUTPUTS_UTIL_CORR = $(addsuffix /utility_correlations.json,$(SYNTH_OUTPUTS_PREFIX))
+SYNTH_OUTPUTS_UTIL_FEATURE_IMPORTANCE = $(addsuffix /utility_feature_importance.json,$(SYNTH_OUTPUTS_PREFIX))
 
 .PHONY: all all-synthetic generated-data clean
 
-all: $(SYNTH_OUTPUTS_PRIV_DISCL_RISK) $(SYNTH_OUTPUTS_UTIL_CLASS) $(SYNTH_OUTPUTS_UTIL_CORR)
+all: $(SYNTH_OUTPUTS_PRIV_DISCL_RISK) $(SYNTH_OUTPUTS_UTIL_CLASS) $(SYNTH_OUTPUTS_UTIL_CORR) $(SYNTH_OUTPUTS_UTIL_FEATURE_IMPORTANCE)
 
 all-synthetic: $(SYNTH_OUTPUTS_CSV)
 
 
+##-------------------------------------
+## Add provenance information to output
+##-------------------------------------
+
+# `provenance FILE` (shell function): merges a `git` object holding the
+# `commit` and `local_modifications` fields of provenance.py's output into the
+# JSON document FILE.  The result is written to FILE.tmp first, so FILE is only
+# replaced once jq has succeeded.  Best effort: on failure a warning is printed
+# and the temporary file is removed instead of failing the recipe.
+# Usage in a recipe:  $(ADD_PROVENANCE) path/to/file.json
+PROVENANCE_DEF = provenance() { \
+    git_result=$$(python provenance.py | jq '{commit, local_modifications}') ; \
+    ( jq ". += {git: $$git_result}" "$$1" > "$$1.tmp" ) && mv "$$1.tmp" "$$1" || \
+    { echo "Warning: No provenance could be recorded for this target" ; rm -f "$$1.tmp" ; } ; \
+}
+ADD_PROVENANCE = $(PROVENANCE_DEF) && provenance
+
+
 ##-------------------------------------
 ## Generate input data
 ##-------------------------------------
 
 ## set data file paths
 AE_DEIDENTIFIED_DATA = generator-outputs/odi-nhs-ae/hospital_ae_data_deidentify.csv generator-outputs/odi-nhs-ae/hospital_ae_data_deidentify.json
 LONDON_POSTCODES = generators/odi-nhs-ae/data/London\ postcodes.csv
-generated-data: $(AE_DEIDENTIFIED_DATA)
+HP_DATA_CLEAN = generator-outputs/household_poverty/train_cleaned.csv generator-outputs/household_poverty/train_cleaned.json
+ARTIFICIAL_DATA_1 = generator-outputs/artificial/artificial_1.csv generator-outputs/artificial/artificial_1.json
+ARTIFICIAL_DATA_2 = generator-outputs/artificial/artificial_2.csv generator-outputs/artificial/artificial_2.json
+ARTIFICIAL_DATA_3 = generator-outputs/artificial/artificial_3.csv generator-outputs/artificial/artificial_3.json
+ARTIFICIAL_DATA_4 = generator-outputs/artificial/artificial_4.csv generator-outputs/artificial/artificial_4.json
+ARTIFICIAL_DATA_5 = generator-outputs/artificial/artificial_5.csv generator-outputs/artificial/artificial_5.json
+ARTIFICIAL_DATA_6 = generator-outputs/artificial/artificial_6.csv generator-outputs/artificial/artificial_6.json
+ARTIFICIAL_DATA_7 = generator-outputs/artificial/artificial_7.csv generator-outputs/artificial/artificial_7.json
+generated-data: $(AE_DEIDENTIFIED_DATA) $(HP_DATA_CLEAN) $(ARTIFICIAL_DATA_1) $(ARTIFICIAL_DATA_2) $(ARTIFICIAL_DATA_3) $(ARTIFICIAL_DATA_4) $(ARTIFICIAL_DATA_5) $(ARTIFICIAL_DATA_6) $(ARTIFICIAL_DATA_7)
 
 # download the London Postcodes dataset used by the A&E generated
 # dataset (this is about 133 MB)
@@ -54,21 +74,36 @@ $(LONDON_POSTCODES):
 # its own rule
 $(AE_DEIDENTIFIED_DATA) &: $(LONDON_POSTCODES)
 	mkdir -p generator-outputs/odi-nhs-ae/ && \
+    mkdir -p generator-outputs/household_poverty/ && \
 	cd generator-outputs/odi-nhs-ae/ && \
 	$(PYTHON) $(QUIPP_ROOT)/generators/odi-nhs-ae/generate.py && \
 	$(PYTHON) $(QUIPP_ROOT)/generators/odi-nhs-ae/deidentify.py
 
+# pre-process the Household Poverty dataset.  $(HP_DATA_CLEAN) names two files
+# (.csv and .json) built by this single recipe, so it is a grouped target (&:)
+# and clean.py runs once for both, also under `make -j`.
+$(HP_DATA_CLEAN) &:
+	mkdir -p generator-outputs/household_poverty/ && \
+	cd generator-outputs/household_poverty/ && \
+	$(PYTHON) $(QUIPP_ROOT)/generators/household_poverty/clean.py
+
+# generate the seven artificial datasets.  All listed .csv/.json files are built
+# by this single recipe, so they form one grouped target (&:) and generate.py
+# runs once for the whole set rather than once per file.
+$(ARTIFICIAL_DATA_1) $(ARTIFICIAL_DATA_2) $(ARTIFICIAL_DATA_3) $(ARTIFICIAL_DATA_4) $(ARTIFICIAL_DATA_5) $(ARTIFICIAL_DATA_6) $(ARTIFICIAL_DATA_7) &:
+	mkdir -p generator-outputs/artificial/ && \
+	cd generator-outputs/artificial/ && \
+	$(PYTHON) $(QUIPP_ROOT)/generators/artificial/generate.py
+
 
 ##-------------------------------------
 ## Generate synthetic data
 ##-------------------------------------
 
 ## synthesize data - this rule also builds "synth-output/%/data_description.json"
 $(SYNTH_OUTPUTS_CSV) : \
-synth-output/%/synthetic_data_1.csv : run-inputs/%.json $(AE_DEIDENTIFIED_DATA)
-	mkdir -p $$(dirname $@) && \
-	cp $< $$(dirname $@) && \
-	python synthesize.py -i $< -o $$(dirname $@)
+synth-output/%/synthetic_data_1.csv : run-inputs/%.json  $(AE_DEIDENTIFIED_DATA) $(ARTIFICIAL_DATA_1) $(ARTIFICIAL_DATA_2) $(ARTIFICIAL_DATA_3) $(ARTIFICIAL_DATA_4) $(ARTIFICIAL_DATA_5) $(ARTIFICIAL_DATA_6) $(ARTIFICIAL_DATA_7)
+	outdir=$$(dirname $@) && \
+	mkdir -p $$outdir && \
+	cp $< $${outdir}/input.json && \
+	$(ADD_PROVENANCE) $${outdir}/input.json && \
+	python synthesize.py -i $< -o $$outdir
 
 
 ##-------------------------------------
@@ -77,19 +112,45 @@ synth-output/%/synthetic_data_1.csv : run-inputs/%.json $(AE_DEIDENTIFIED_DATA)
 
 ## compute privacy and utility metrics
 $(SYNTH_OUTPUTS_PRIV_DISCL_RISK) : \
-synth-output/%/privacy_disclosure_risk.json : \
+synth-output/%/disclosure_risk.json : \
 run-inputs/%.json synth-output/%/synthetic_data_1.csv
-	python metrics/privacy-metrics/disclosure_risk.py -i $< -o $$(dirname $@)
+	python metrics/privacy-metrics/disclosure_risk.py -i $< -o $$(dirname $@) &&\
+	$(ADD_PROVENANCE) $@
 
 $(SYNTH_OUTPUTS_UTIL_CLASS) : \
-synth-output/%/utility_classifiers.json : \
+synth-output/%/utility_diff.json : \
 run-inputs/%.json synth-output/%/synthetic_data_1.csv
-	python metrics/utility-metrics/classifiers.py -i $< -o $$(dirname $@)
+	python metrics/utility-metrics/classifiers.py -i $< -o $$(dirname $@) &&\
+	$(ADD_PROVENANCE) $@
 
 $(SYNTH_OUTPUTS_UTIL_CORR) : \
 synth-output/%/utility_correlations.json : \
 run-inputs/%.json synth-output/%/synthetic_data_1.csv
-	python metrics/utility-metrics/correlations.py -i $< -o $$(dirname $@)
+	python metrics/utility-metrics/correlations.py -i $< -o $$(dirname $@) &&\
+	$(ADD_PROVENANCE) $@
+
+## feature-importance utility: run the metric on the run input ($<), writing
+## into the target's directory, then add git provenance to the resulting JSON
+$(SYNTH_OUTPUTS_UTIL_FEATURE_IMPORTANCE) : \
+synth-output/%/utility_feature_importance.json : \
+run-inputs/%.json synth-output/%/synthetic_data_1.csv
+	python metrics/utility-metrics/feature_importance.py -i $< -o $(@D) && \
+	$(ADD_PROVENANCE) $@
+
+
+##-------------------------------------
+## Helper targets for individual inputs
+##-------------------------------------
+
+##   make run-example
+##
+## produces synthetic data and metrics from run-inputs/example.json with output in synth-output/example/
+
+## Pattern rule with an empty recipe (the trailing `;`): `run-NAME` only pulls
+## in the synthetic data and the four metric files for run-inputs/NAME.json.
+run-% : \
+  synth-output/%/synthetic_data_1.csv \
+  synth-output/%/utility_correlations.json \
+  synth-output/%/disclosure_risk.json \
+  synth-output/%/utility_diff.json \
+  synth-output/%/utility_feature_importance.json ;
 
 
 ##-------------------------------------

diff --git a/README.md b/README.md
@@ -99,6 +99,25 @@ environmental variable `SGFROOT` to point to this location.  That is, in bash,
 We use the PrivBayes implementation within the DataSynthesizer fork found [here](https://github.com/gmingas/DataSynthesizer). 
 In order to install it, clone the above repository locally, go to its root directory and run `pip install .`
 
+#### Forked synthetic_data_release
+
+We use the PATE-GAN implementation within the `synthetic_data_release` fork found [here](https://github.com/kasra-hosseini/synthetic_data_release). 
+In order to use PATE-GAN in QUIPP:
+1. Create a new directory:
+
+```bash
+cd /path/to/QUIPP-pipeline
+mkdir libs
+```
+
+2. Clone the above repository inside the `libs` directory created in the previous step:
+
+```bash
+# from /path/to/QUIPP-pipeline 
+cd libs
+git clone https://github.com/kasra-hosseini/synthetic_data_release.git
+```
+
 ## Top-level directory contents
 
 The top-level directory structure mirrors the data pipeline.

diff --git a/datasets-raw/framingham/README.md b/datasets-raw/framingham/README.md
@@ -0,0 +1,19 @@
+"Framingham Heart Study" dataset, found on Kaggle (https://www.kaggle.com/amanajmera1/framingham-heart-study-dataset), 
+used under [CC0 1.0 Universal Public Domain Dedication](https://creativecommons.org/publicdomain/zero/1.0/) and modified by filling NA values with the column means.
+
+The dataset is not used for commercial purposes.
+
+Note: We might want to replace this with the details from https://biolincc.nhlbi.nih.gov/studies/framcohort/ . Due to time constraints I am using the version freely available on Kaggle rather than waiting for approval, but I believe the BioLINCC study is the true original source of the data.
+
+Instructions to clean (also included in `datasets-raw/framingham/clean.py`):
+
+```{python}
+import pandas as pd
+
+raw_df = pd.read_csv("framingham.csv")
+
+df = raw_df.fillna(raw_df.mean())
+df[["cigsPerDay", "age", "education", "BPMeds"]] = df[["cigsPerDay", "age", "education", "BPMeds"]].astype(int)
+
+df.to_csv("../../datasets/framinghamframingham_cleaned.csv", index=False)
+```
diff --git a/datasets-raw/framingham/clean.py b/datasets-raw/framingham/clean.py
@@ -0,0 +1,8 @@
+import pandas as pd
+
+raw_df = pd.read_csv("framingham.csv")
+
+df = raw_df.fillna(raw_df.mean())
+df[["cigsPerDay", "age", "education", "BPMeds"]] = df[["cigsPerDay", "age", "education", "BPMeds"]].astype(int)
+
+df.to_csv("../../datasets/framinghamframingham_cleaned.csv", index=False)
diff --git a/datasets-raw/framingham/framingham.csv b/datasets-raw/framingham/framingham.csv
diff --git a/datasets/README.md b/datasets/README.md
@@ -4,3 +4,4 @@ These datasets are in the [format required by the pipeline](../README.md#data-fo
 
 - [`generated/odi_nhs_ae`](generated/odi_nhs_ae): mock A&E dataset, generated with the scripts [here](../generators/odi-nhs-ae)
 - [`polish_data_2011`](polish_data_2011): A prepared version of the [Social Diagnosis](http://www.diagnoza.com/index-en.html) project data, Council for Social Monitoring 2011.  This is included in synthpop and extracted with [this script](polish_data_2011/data_prep.R).
+- [`adult_dataset`](adult_dataset): the [Adult](https://archive.ics.uci.edu/ml/datasets/adult) dataset from the UCI Machine Learning Repository
Original file line number	Diff line number	Diff line change
Expand Up		@@ -4,3 +4,4 @@ These datasets are in the [format required by the pipeline](../README.md#data-fo

		- [`generated/odi_nhs_ae`](generated/odi_nhs_ae): mock A&E dataset, generated with the scripts [here](../generators/odi-nhs-ae)
		- [`polish_data_2011`](polish_data_2011): A prepared version of the [Social Diagnosis](http://www.diagnoza.com/index-en.html) project data, Council for Social Monitoring 2011. This is included in synthpop and extracted with [this script](polish_data_2011/data_prep.R).
		- [`adult_dataset`](adult_dataset): https://archive.ics.uci.edu/ml/datasets/adult