From 322324f038edf8209109dee24c28afd56f0329cf Mon Sep 17 00:00:00 2001 From: Piyush Sambhi <46954957+sambhipiyush@users.noreply.github.com> Date: Sun, 10 May 2020 04:53:54 +0530 Subject: [PATCH 1/4] Create load_df_to_memory() and change in two function Create load_df_to_memory() and changes in answer_one and answer_two functions --- Assignment+3.py | 32 ++++++-------------------------- 1 file changed, 6 insertions(+), 26 deletions(-) diff --git a/Assignment+3.py b/Assignment+3.py index a044ed1..2f42e6d 100644 --- a/Assignment+3.py +++ b/Assignment+3.py @@ -58,8 +58,7 @@ # *This function should return a DataFrame with 20 columns and 15 entries.* # In[1]: - -def answer_one(): +def load_df_to_memory(): import pandas as pd import numpy as np energy = pd.read_excel('Energy Indicators.xls', skiprows=17, skip_footer= 38) @@ -80,7 +79,10 @@ def answer_one(): df = pd.merge(ScimEn, energy, how='inner',left_on='Country', right_on='Country') alldf = pd.merge(df,GDP, how='inner', left_on='Country', right_on='Country') alldf = alldf.set_index('Country') - return alldf[:15] + return alldf + +def answer_one(): + return load_df_to_memory()[:15] answer_one() @@ -100,29 +102,7 @@ def answer_one(): # In[3]: def answer_two(): - import pandas as pd - import numpy as np - energy = pd.read_excel('Energy Indicators.xls', skiprows=17, skip_footer= 38) - energy = energy[['Unnamed: 1', 'Petajoules', 'Gigajoules', '%']] - energy.columns = ['Country', 'Energy Supply', 'Energy Supply per Capita', '% Renewable'] - energy['Energy Supply'] = energy['Energy Supply'] * 1000000 - energy[['Energy Supply', 'Energy Supply per Capita', '% Renewable']]= energy[['Energy Supply', 'Energy Supply per Capita', '% Renewable']].replace('...', np.NaN) - energy['Country'] = energy['Country'].replace({'Republic of Korea' : 'South Korea', 'United States of America': 'United States', 'United Kingdom of Great Britain and Northern Ireland' : 'United Kingdom', 'China, Hong Kong Special Administrative Region' : 'Hong Kong', 'Iran (Islamic Republic of)':'Iran'}) - energy['Country'].str.replace(r" \(.*\)","") - - GDP = pd.read_csv('world_bank.csv', skiprows = 4) - GDP['Country Name'] = GDP['Country Name'].replace({'Korea, Rep.': 'South Korea', 'Iran, Islamic Rep.': 'Iran', 'Hong Kong SAR, China' : 'Hong Kong'}) - GDP = GDP[['Country Name', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015']] - GDP.columns = ['Country', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015'] - - ScimEn = pd.read_excel('scimagojr-3.xlsx') - - df = pd.merge(ScimEn, energy, how='inner',left_on='Country', right_on='Country') - alldf = pd.merge(df,GDP, how='inner', left_on='Country', right_on='Country') - alldf = alldf.set_index('Country') - answer_one = alldf[:15] - answer_two = alldf.shape[0] - answer_one.shape[0] - return answer_two + return load_df_to_memory().shape[0] - answer_one().shape[0] answer_two() From 03e913d20eeb92df4c4557fa8f0b8322610bbb2c Mon Sep 17 00:00:00 2001 From: Piyush Sambhi <46954957+sambhipiyush@users.noreply.github.com> Date: Sun, 10 May 2020 06:40:11 +0530 Subject: [PATCH 2/4] Updated answer_two function --- Assignment+3.py | 50 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/Assignment+3.py b/Assignment+3.py index 2f42e6d..f2be6df 100644 --- a/Assignment+3.py +++ b/Assignment+3.py @@ -102,7 +102,55 @@ def answer_one(): # In[3]: def answer_two(): - return load_df_to_memory().shape[0] - answer_one().shape[0] + energy = pd.read_excel("Energy Indicators.xls",skip_footer=38,skip_header=1,skiprows=17) # Skip header and footer + energy.drop(energy.columns[[0,1]],axis=1,inplace=True) # Drop first 2 columns + energy.columns = ['Country', 'Energy Supply', 'Energy Supply per Capita', '% Renewable'] + energy.dropna() # Drop rows with NaN values. + + energy['Country'] = energy['Country'].str.replace(r'\(.*\)', '') # Remove contents within parenthesis. + energy['Country'] = energy['Country'].str.replace('\d+', '') # Remove digits from names + energy['Country'] = energy['Country'].str.strip() # This brings the Iran energy values back! + + # Turn blank values into NaN + for col in energy: + energy[col] = energy[col].replace('...',np.nan) + + energy['Country'] = energy['Country'].str.replace('Republic of Korea','South Korea') + energy['Country'] = energy['Country'].str.replace('United States of America','United States') + energy['Country'] = energy['Country'].str.replace('United Kingdom of Great Britain and Northern Ireland','United Kingdom') + energy['Country'] = energy['Country'].str.replace('China, Hong Kong Special Administrative Region','Hong Kong') + + # GDP: + GDP = pd.read_csv('world_bank.csv', skiprows=3) # Skip header + + # Make first row the column names + new_header = GDP.iloc[0] + GDP = GDP[1:] + GDP.columns = new_header + + #GDP = GDP.rename(index=str,columns = {"Country Name":"Country"}) + GDP['Country Name'] = GDP['Country Name'].str.replace('Korea, Rep.','South Korea') + GDP['Country Name'] = GDP['Country Name'].str.replace('Iran, Islamic Rep.','Iran') + GDP['Country Name'] = GDP['Country Name'].str.replace('Hong Kong SAR, China','Hong Kong') + + # Change column name from 'Country Name' to 'Country' for merging 3 files on country name. + names = GDP.columns.tolist() + names[names.index('Country Name')] = 'Country' + GDP.columns = names + + # Only keep the columns from 2006-15. Drop column number 1 to 50. Don't need country code, etc. + GDP = GDP.drop(GDP.iloc[:,1:50], axis=1) + GDP.columns = GDP.columns.astype(str).str.split('.').str[0] # Remove '.0' at the end of the year columns. + + # SCIMEN: + ScimEn = pd.read_excel('scimagojr-3.xlsx') + + # LOST ENTRIES = LEN(OUTER JOIN) - LEN(INNER JOIN) + + # Need unique entries in all 3 sets so use concat. Can't do that with a left or right outer join! + num_outer = len(pd.concat([ScimEn['Country'],energy['Country'],GDP['Country']]).unique()) + num_inter = (GDP.merge(energy, left_on='Country', right_on='Country', how='inner').merge(ScimEn, left_on='Country', right_on='Country', how='inner').shape[0]) + return num_outer-num_inter answer_two() From fd43e39dc618e03331bfe7249ad1df0b47a810fe Mon Sep 17 00:00:00 2001 From: Piyush Sambhi <46954957+sambhipiyush@users.noreply.github.com> Date: Sun, 10 May 2020 06:41:21 +0530 Subject: [PATCH 3/4] Improved answer_three function --- Assignment+3.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Assignment+3.py b/Assignment+3.py index f2be6df..1fdddf4 100644 --- a/Assignment+3.py +++ b/Assignment+3.py @@ -168,8 +168,7 @@ def answer_two(): def answer_three(): Top15 = answer_one() - aveGDP = Top15[['2006','2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015']].mean(axis=1).rename('aveGDP').sort_values(ascending=False) - return aveGDP + return Top15.iloc[:, 10:].mean(axis=1).rename('avgGDP').sort_values(ascending=False) answer_three() From cebe25a84637a5e4b0f69502db969772a2e4365c Mon Sep 17 00:00:00 2001 From: Piyush Sambhi <46954957+sambhipiyush@users.noreply.github.com> Date: Sun, 10 May 2020 06:46:12 +0530 Subject: [PATCH 4/4] Modification in answer_seven and answer_eight functions --- Assignment+3.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/Assignment+3.py b/Assignment+3.py index 1fdddf4..780d5e5 100644 --- a/Assignment+3.py +++ b/Assignment+3.py @@ -231,9 +231,8 @@ def answer_six(): def answer_seven(): Top15 = answer_one() - Top15['Citation ratio'] = Top15['Self-citations'] / Top15['Citations'] - MaxCitationRatio = Top15['Citation ratio'].idxmax(), Top15['Citation ratio'].max() - return MaxCitationRatio + Top15['selfcitation_to_total'] = Top15['Self-citations']/Top15['Citations'] + return Top15['selfcitation_to_total'].idxmax(), Top15['selfcitation_to_total'].max() answer_seven() @@ -249,10 +248,8 @@ def answer_seven(): def answer_eight(): Top15 = answer_one() - Top15['PopEstimate'] = Top15['Energy Supply'] / Top15['Energy Supply per Capita'] - answer_eight = Top15['PopEstimate'].sort_values(ascending=False) - answer_eight = answer_eight.index.tolist()[2] - return answer_eight + Top15['estd_population'] = Top15['Energy Supply'] / Top15['Energy Supply per Capita'] + return Top15.sort_values(by=['estd_population'], ascending=False).index[2] answer_eight()