Skip to content
This repository was archived by the owner on Nov 21, 2022. It is now read-only.

Commit 36ada88

Browse files
author
Issa
committed
Add DHS pre-processing notebook
1 parent a666622 commit 36ada88

File tree

4 files changed

+389
-124
lines changed

4 files changed

+389
-124
lines changed

notebooks/00_dhs_prep.ipynb

+257
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,257 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# Demographic and Health Survey (DHS) Data Preparation\n",
8+
"\n",
9+
"Download the Philippine National DHS dataset from the [official DHS Program website](https://www.dhsprogram.com/what-we-do/survey/survey-display-510.cfm). Copy the downloaded archive into the data directory and unzip it there. The unzipped DHS folder must contain the following files:\n",
10+
"- `PHHR70DT/PHHR70FL.DTA`\n",
11+
"- `PHHR70DT/PHHR70FL.DO`"
12+
]
13+
},
14+
{
15+
"cell_type": "markdown",
16+
"metadata": {},
17+
"source": [
18+
"## Imports"
19+
]
20+
},
21+
{
22+
"cell_type": "code",
23+
"execution_count": 49,
24+
"metadata": {},
25+
"outputs": [],
26+
"source": [
27+
"import pandas as pd"
28+
]
29+
},
30+
{
31+
"cell_type": "markdown",
32+
"metadata": {},
33+
"source": [
34+
"## File locations"
35+
]
36+
},
37+
{
38+
"cell_type": "code",
39+
"execution_count": 50,
40+
"metadata": {},
41+
"outputs": [],
42+
"source": [
43+
"data_dir = '../data/'\n",
44+
"dhs_zip = data_dir + '<INSERT DHS FOLDER NAME HERE>/'\n",
45+
"dhs_file = dhs_zip + 'PHHR70DT/PHHR70FL.DTA'\n",
46+
"dhs_dict_file = dhs_zip + 'PHHR70DT/PHHR70FL.DO'"
47+
]
48+
},
49+
{
50+
"cell_type": "markdown",
51+
"metadata": {},
52+
"source": [
53+
"## Helper Function"
54+
]
55+
},
56+
{
57+
"cell_type": "code",
58+
"execution_count": 51,
59+
"metadata": {},
60+
"outputs": [],
61+
"source": [
62+
"def get_dhs_dict(dhs_dict_file):\n",
63+
"    \"\"\"Return a {variable code: label} dict built from the 'label variable' lines of a DHS .DO file.\"\"\"\n",
64+
"    dhs_dict = {}\n",
65+
"    with open(dhs_dict_file, 'r', errors='replace') as file:\n",
66+
"        # Iterate over the file itself so every line, including the first, is examined.\n",
67+
"        for line in file:\n",
68+
"            if 'label variable' in line:\n",
69+
"                code = line.split()[2]  # third whitespace-separated token is the variable code\n",
70+
"                colname = ' '.join([x.strip('\"') for x in line.split()[3:]])  # remaining tokens form the label; drop the quotes\n",
71+
"                dhs_dict[code] = colname\n",
72+
"    return dhs_dict"
73+
]
74+
},
75+
{
76+
"cell_type": "markdown",
77+
"metadata": {},
78+
"source": [
79+
"## Load DHS Dataset"
80+
]
81+
},
82+
{
83+
"cell_type": "code",
84+
"execution_count": 55,
85+
"metadata": {},
86+
"outputs": [
87+
{
88+
"name": "stdout",
89+
"output_type": "stream",
90+
"text": [
91+
"Data Dimensions: (27496, 342)\n"
92+
]
93+
}
94+
],
95+
"source": [
96+
"dhs = pd.read_stata(dhs_file, convert_categoricals=False)\n",
97+
"dhs_dict = get_dhs_dict(dhs_dict_file)\n",
98+
"dhs = dhs.rename(columns=dhs_dict).dropna(axis=1)\n",
99+
"print('Data Dimensions: {}'.format(dhs.shape))"
100+
]
101+
},
102+
{
103+
"cell_type": "markdown",
104+
"metadata": {},
105+
"source": [
106+
"## Aggregate Columns"
107+
]
108+
},
109+
{
110+
"cell_type": "code",
111+
"execution_count": 56,
112+
"metadata": {},
113+
"outputs": [
114+
{
115+
"name": "stdout",
116+
"output_type": "stream",
117+
"text": [
118+
"Data Dimensions: (1249, 4)\n"
119+
]
120+
},
121+
{
122+
"data": {
123+
"text/html": [
124+
"<div>\n",
125+
"<style scoped>\n",
126+
" .dataframe tbody tr th:only-of-type {\n",
127+
" vertical-align: middle;\n",
128+
" }\n",
129+
"\n",
130+
" .dataframe tbody tr th {\n",
131+
" vertical-align: top;\n",
132+
" }\n",
133+
"\n",
134+
" .dataframe thead tr th {\n",
135+
" text-align: left;\n",
136+
" }\n",
137+
"\n",
138+
" .dataframe thead tr:last-of-type th {\n",
139+
" text-align: right;\n",
140+
" }\n",
141+
"</style>\n",
142+
"<table border=\"1\" class=\"dataframe\">\n",
143+
" <thead>\n",
144+
" <tr>\n",
145+
" <th></th>\n",
146+
" <th>Wealth Index</th>\n",
147+
" <th>Education completed (years)</th>\n",
148+
" <th>Access to electricity</th>\n",
149+
" <th>Access to water (minutes)</th>\n",
150+
" </tr>\n",
151+
" <tr>\n",
152+
" <th>Cluster number</th>\n",
153+
" <th></th>\n",
154+
" <th></th>\n",
155+
" <th></th>\n",
156+
" <th></th>\n",
157+
" </tr>\n",
158+
" </thead>\n",
159+
" <tbody>\n",
160+
" <tr>\n",
161+
" <td>1</td>\n",
162+
" <td>-31881.608696</td>\n",
163+
" <td>9.391304</td>\n",
164+
" <td>0.913043</td>\n",
165+
" <td>0.0</td>\n",
166+
" </tr>\n",
167+
" <tr>\n",
168+
" <td>2</td>\n",
169+
" <td>-2855.375000</td>\n",
170+
" <td>9.708333</td>\n",
171+
" <td>0.958333</td>\n",
172+
" <td>0.0</td>\n",
173+
" </tr>\n",
174+
" </tbody>\n",
175+
"</table>\n",
176+
"</div>"
177+
],
178+
"text/plain": [
179+
" Wealth Index Education completed (years) \\\n",
180+
"Cluster number \n",
181+
"1 -31881.608696 9.391304 \n",
182+
"2 -2855.375000 9.708333 \n",
183+
"\n",
184+
" Access to electricity Access to water (minutes) \n",
185+
"Cluster number \n",
186+
"1 0.913043 0.0 \n",
187+
"2 0.958333 0.0 "
188+
]
189+
},
190+
"execution_count": 56,
191+
"metadata": {},
192+
"output_type": "execute_result"
193+
}
194+
],
195+
"source": [
196+
"data = dhs[[\n",
197+
" 'Cluster number',\n",
198+
" 'Wealth index factor score combined (5 decimals)',\n",
199+
" 'Education completed in single years',\n",
200+
" 'Has electricity'\n",
201+
"]].groupby('Cluster number').mean()\n",
202+
"\n",
203+
"data['Time to get to water source (minutes)'] = dhs[[  # per-cluster median; code 996 is recoded to 0 first\n",
204+
" 'Cluster number',\n",
205+
" 'Time to get to water source (minutes)'\n",
206+
"]].replace({'Time to get to water source (minutes)': {996: 0}}).groupby('Cluster number').median()  # water column only, so cluster id 996 is kept\n",
207+
"\n",
208+
"data.columns = [  # flat list gives a plain Index; a nested list would build a MultiIndex\n",
209+
" 'Wealth Index',\n",
210+
" 'Education completed (years)',\n",
211+
" 'Access to electricity',\n",
212+
" 'Access to water (minutes)'\n",
213+
"]\n",
214+
"\n",
215+
"print('Data Dimensions: {}'.format(data.shape))\n",
216+
"data.head(2)"
217+
]
218+
},
219+
{
220+
"cell_type": "markdown",
221+
"metadata": {},
222+
"source": [
223+
"## Save Processed DHS File"
224+
]
225+
},
226+
{
227+
"cell_type": "code",
228+
"execution_count": 54,
229+
"metadata": {},
230+
"outputs": [],
231+
"source": [
232+
"data.to_csv(data_dir+'dhs_indicators.csv')"
233+
]
234+
}
235+
],
236+
"metadata": {
237+
"kernelspec": {
238+
"display_name": "venv",
239+
"language": "python",
240+
"name": "venv"
241+
},
242+
"language_info": {
243+
"codemirror_mode": {
244+
"name": "ipython",
245+
"version": 3
246+
},
247+
"file_extension": ".py",
248+
"mimetype": "text/x-python",
249+
"name": "python",
250+
"nbconvert_exporter": "python",
251+
"pygments_lexer": "ipython3",
252+
"version": "3.7.3"
253+
}
254+
},
255+
"nbformat": 4,
256+
"nbformat_minor": 4
257+
}

0 commit comments

Comments
 (0)