From 322324f038edf8209109dee24c28afd56f0329cf Mon Sep 17 00:00:00 2001
From: Piyush Sambhi <46954957+sambhipiyush@users.noreply.github.com>
Date: Sun, 10 May 2020 04:53:54 +0530
Subject: [PATCH 1/4] Create load_df_to_memory() and update two functions

Create load_df_to_memory() and update the answer_one and answer_two functions to use it
---
 Assignment+3.py | 32 ++++++--------------------------
 1 file changed, 6 insertions(+), 26 deletions(-)

diff --git a/Assignment+3.py b/Assignment+3.py
index a044ed1..2f42e6d 100644
--- a/Assignment+3.py
+++ b/Assignment+3.py
@@ -58,8 +58,7 @@
 # *This function should return a DataFrame with 20 columns and 15 entries.*
 
 # In[1]:
-
-def answer_one():
+def load_df_to_memory():
     import pandas as pd
     import numpy as np
     energy = pd.read_excel('Energy Indicators.xls', skiprows=17, skip_footer= 38)
@@ -80,7 +79,10 @@ def answer_one():
     df = pd.merge(ScimEn, energy, how='inner',left_on='Country', right_on='Country')
     alldf = pd.merge(df,GDP, how='inner', left_on='Country', right_on='Country')
     alldf = alldf.set_index('Country')
-    return alldf[:15]
+    return alldf
+
+def answer_one():
+    return load_df_to_memory()[:15]  # first 15 rows of the merged, Country-indexed frame
 
 answer_one()
 
@@ -100,29 +102,7 @@ def answer_one():
 # In[3]:
 
 def answer_two():
-    import pandas as pd
-    import numpy as np
-    energy = pd.read_excel('Energy Indicators.xls', skiprows=17, skip_footer= 38)
-    energy = energy[['Unnamed: 1', 'Petajoules', 'Gigajoules', '%']]
-    energy.columns = ['Country', 'Energy Supply', 'Energy Supply per Capita', '% Renewable']
-    energy['Energy Supply'] = energy['Energy Supply'] * 1000000
-    energy[['Energy Supply', 'Energy Supply per Capita', '% Renewable']]= energy[['Energy Supply', 'Energy Supply per Capita', '% Renewable']].replace('...', np.NaN)
-    energy['Country'] = energy['Country'].replace({'Republic of Korea' : 'South Korea', 'United States of America': 'United States', 'United Kingdom of Great Britain and Northern Ireland' : 'United Kingdom', 'China, Hong Kong Special Administrative Region' : 'Hong Kong', 'Iran (Islamic Republic of)':'Iran'})
-    energy['Country'].str.replace(r" \(.*\)","")
-    
-    GDP = pd.read_csv('world_bank.csv', skiprows = 4)
-    GDP['Country Name'] = GDP['Country Name'].replace({'Korea, Rep.': 'South Korea', 'Iran, Islamic Rep.': 'Iran', 'Hong Kong SAR, China' : 'Hong Kong'})    
-    GDP = GDP[['Country Name', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015']]
-    GDP.columns = ['Country', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015']
-    
-    ScimEn = pd.read_excel('scimagojr-3.xlsx')
-    
-    df = pd.merge(ScimEn, energy, how='inner',left_on='Country', right_on='Country')
-    alldf = pd.merge(df,GDP, how='inner', left_on='Country', right_on='Country')
-    alldf = alldf.set_index('Country')
-    answer_one = alldf[:15]
-    answer_two = alldf.shape[0] - answer_one.shape[0]
-    return answer_two
+    return load_df_to_memory().shape[0] - answer_one().shape[0]
 
 answer_two()
 

From 03e913d20eeb92df4c4557fa8f0b8322610bbb2c Mon Sep 17 00:00:00 2001
From: Piyush Sambhi <46954957+sambhipiyush@users.noreply.github.com>
Date: Sun, 10 May 2020 06:40:11 +0530
Subject: [PATCH 2/4] Updated answer_two function

---
 Assignment+3.py | 50 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 49 insertions(+), 1 deletion(-)

diff --git a/Assignment+3.py b/Assignment+3.py
index 2f42e6d..f2be6df 100644
--- a/Assignment+3.py
+++ b/Assignment+3.py
@@ -102,7 +102,55 @@ def answer_one():
 # In[3]:
 
 def answer_two():
-    return load_df_to_memory().shape[0] - answer_one().shape[0]
+    """Return the unique-country count across the three datasets minus the inner-merge row count."""
+    import pandas as pd  # local imports, mirroring load_df_to_memory()
+    import numpy as np
+    energy = pd.read_excel("Energy Indicators.xls",skip_footer=38,skiprows=17) # Skip header and footer
+    energy.drop(energy.columns[[0,1]],axis=1,inplace=True) # Drop first 2 columns
+    energy.columns = ['Country', 'Energy Supply', 'Energy Supply per Capita', '% Renewable']
+
+    energy['Country'] = energy['Country'].str.replace(r'\(.*\)', '') # Remove contents within parenthesis.
+    energy['Country'] = energy['Country'].str.replace(r'\d+', '') # Remove digits from names
+    energy['Country'] = energy['Country'].str.strip() # This brings the Iran energy values back!
+
+    # Turn blank values into NaN
+    for col in energy:
+        energy[col] = energy[col].replace('...',np.nan)
+
+    # Whole-value renames: substring replacement would also rewrite "Democratic People's Republic of Korea".
+    energy['Country'] = energy['Country'].replace({'Republic of Korea': 'South Korea',
+                                                   'United States of America': 'United States',
+                                                   'United Kingdom of Great Britain and Northern Ireland': 'United Kingdom',
+                                                   'China, Hong Kong Special Administrative Region': 'Hong Kong'})
+
+    # GDP:
+    GDP = pd.read_csv('world_bank.csv', skiprows=3) # Skip header
+
+    # Make first row the column names
+    new_header = GDP.iloc[0]
+    GDP = GDP[1:]
+    GDP.columns = new_header
+
+    GDP['Country Name'] = GDP['Country Name'].replace({'Korea, Rep.': 'South Korea',  # exact match, so '.' is not a regex wildcard
+                                                       'Iran, Islamic Rep.': 'Iran',
+                                                       'Hong Kong SAR, China': 'Hong Kong'})
+
+    # Change column name from 'Country Name' to 'Country' for merging 3 files on country name.
+    names = GDP.columns.tolist()
+    names[names.index('Country Name')] = 'Country'
+    GDP.columns = names
+
+    # Only keep the columns from 2006-15. Drop column number 1 to 50. Don't need country code, etc.
+    GDP = GDP.drop(GDP.iloc[:,1:50], axis=1)
+    GDP.columns = GDP.columns.astype(str).str.split('.').str[0] # Remove '.0' at the end of the year columns.
+
+    # SCIMEN:
+    ScimEn = pd.read_excel('scimagojr-3.xlsx')
+
+    # LOST ENTRIES = LEN(OUTER JOIN) - LEN(INNER JOIN). Need unique entries in all 3 sets, so use concat.
+    num_outer = len(pd.concat([ScimEn['Country'],energy['Country'],GDP['Country']]).unique())
+    num_inter = (GDP.merge(energy, left_on='Country', right_on='Country', how='inner').merge(ScimEn, left_on='Country', right_on='Country', how='inner').shape[0])
+    return num_outer-num_inter
 
 answer_two()
 

From fd43e39dc618e03331bfe7249ad1df0b47a810fe Mon Sep 17 00:00:00 2001
From: Piyush Sambhi <46954957+sambhipiyush@users.noreply.github.com>
Date: Sun, 10 May 2020 06:41:21 +0530
Subject: [PATCH 3/4] Improved answer_three function

---
 Assignment+3.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/Assignment+3.py b/Assignment+3.py
index f2be6df..1fdddf4 100644
--- a/Assignment+3.py
+++ b/Assignment+3.py
@@ -168,8 +168,7 @@ def answer_two():
 
 def answer_three():
     Top15 = answer_one()
-    aveGDP = Top15[['2006','2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015']].mean(axis=1).rename('aveGDP').sort_values(ascending=False)
-    return aveGDP
+    return Top15[[str(year) for year in range(2006, 2016)]].mean(axis=1).rename('avgGDP').sort_values(ascending=False)  # pick the GDP years by label, not by column position
 
 answer_three()
 

From cebe25a84637a5e4b0f69502db969772a2e4365c Mon Sep 17 00:00:00 2001
From: Piyush Sambhi <46954957+sambhipiyush@users.noreply.github.com>
Date: Sun, 10 May 2020 06:46:12 +0530
Subject: [PATCH 4/4] Modification in answer_seven and answer_eight functions

---
 Assignment+3.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/Assignment+3.py b/Assignment+3.py
index 1fdddf4..780d5e5 100644
--- a/Assignment+3.py
+++ b/Assignment+3.py
@@ -231,9 +231,8 @@ def answer_six():
 
 def answer_seven():
     Top15 = answer_one()
-    Top15['Citation ratio'] = Top15['Self-citations'] / Top15['Citations']
-    MaxCitationRatio = Top15['Citation ratio'].idxmax(), Top15['Citation ratio'].max()
-    return MaxCitationRatio
+    selfcitation_to_total = Top15['Self-citations'] / Top15['Citations']  # local Series: avoids writing a column into the answer_one() slice
+    return selfcitation_to_total.idxmax(), selfcitation_to_total.max()  # (index label of the max ratio, max ratio)
 
 answer_seven()
 
@@ -249,10 +248,8 @@ def answer_seven():
 
 def answer_eight():
     Top15 = answer_one()
-    Top15['PopEstimate'] = Top15['Energy Supply'] / Top15['Energy Supply per Capita']
-    answer_eight = Top15['PopEstimate'].sort_values(ascending=False)
-    answer_eight = answer_eight.index.tolist()[2]
-    return answer_eight
+    estd_population = Top15['Energy Supply'] / Top15['Energy Supply per Capita']  # local Series: avoids writing a column into the answer_one() slice
+    return estd_population.sort_values(ascending=False).index[2]  # index label of the third-largest estimate
 
 answer_eight()