Lab_2.Rmd

---
title: "Untitled"
author: "Akiva Finkelstein & Amit Yaron"
date: "3 5 2021"
output: html_document
---
```{r, warning=FALSE, message=FALSE, echo=FALSE}
library(MASS)
library(factoextra)
library(dplyr)
library(patchwork)
library(knitr)
library(grid)
library(gridExtra)
library(readxl)
library('dendextend')
options(scipen=999)
```
#1.1,1.2
```{r,warning=FALSE,echo=FALSE}
set.seed(1)
m1 = rnorm(10, mean = 0, sd = 1)
m2 = rnorm(10, mean = 0, sd = 1) 
m3 = rnorm(10, mean = 0, sd = 1)

m1_n = c(m1,rep(0,10))
m2_n = c(m2,rep(0,20))
m3_n = c(m3,rep(0,40))

mu = c(m1_n,m2_n,m3_n)

s_function = function(m1,m2,m3,p,sigma2_e){
  n=100
  m1_n = c(m1,rep(0,10))
  m2_n = c(m2,rep(0,20)) 
  m3_n = c(m3,rep(0,40))
  data = as.data.frame(matrix(data = NA,nrow = n , ncol = p))
  for( i in 1:p)
  { 
    tamp = c(rnorm(20, mean = mean(m1_n), sd = sigma2_e), 
             rnorm(30, mean = mean(m2_n),sd = sigma2_e),
             rnorm(50, mean = mean(m3_n), sd = sigma2_e)) 
    data[,i] = tamp
    
  }
  return(as.data.frame(data))
}
```


#1.3, 1.4
```{r,warning=FALSE, echo=FALSE}
#50 df
data_gen <- function(m1_n,m2_n,m3_n,p,sigma){
  sim_list <- list()
  for (i in 1:50) {
    temp <- s_function(m1_n, m2_n,m3_n, p, sigma)
    sim_list[[i]] <- temp
    temp <- data.frame()
  }
  return(c(sim_list))
}
# lists of 50 df
sim_10.1 <- data_gen(m1_n, m2_n,m3_n, 10, 1) 
sim_10.2 <- data_gen(m1_n, m2_n,m3_n, 10, 2)
sim_10.5 <- data_gen(m1_n, m2_n,m3_n, 10, 5)
sim_10.10 <- data_gen(m1_n, m2_n,m3_n, 10, 10)
sim_20.1 <- data_gen(m1_n, m2_n,m3_n, 20, 1)
sim_20.2 <- data_gen(m1_n, m2_n,m3_n, 20,2)
sim_20.5 <- data_gen(m1_n, m2_n,m3_n, 20, 5)
sim_20.10 <- data_gen(m1_n, m2_n,m3_n, 20, 10)
sim_50.1 <- data_gen(m1_n, m2_n,m3_n, 50, 1)
sim_50.2 <- data_gen(m1_n, m2_n,m3_n, 50, 2)
sim_50.5 <- data_gen(m1_n, m2_n,m3_n, 50, 5)
sim_50.10 <- data_gen(m1_n, m2_n,m3_n, 50, 10)
```
#1.5
```{r,warning=FALSE, echo=FALSE}
#kmean function
k_func <- function(lst){
  accuracy <-c()
  run_time <- c()
  
  for (i in 1:50) {
    start_time <- Sys.time()  # start 
    km <-  kmeans(x = lst[[i]], centers = 3)
    end_time <- Sys.time() #end
    km$cluster
    v1 <- rep(1,20)
    v2 <- rep(2,30)
    v3 <- rep(3,50)
    V <- c(v1,v2,v3)
    table(V, km$cluster)
    pr_table <-prop.table(table(V, km$cluster),2)
    max_diag <- 0
    for(j in 1:3) {
      max_diag <- max_diag +max(pr_table[j,])
    }
    accuracy[i] <-max_diag/3
    run_time[i] <- round(end_time - start_time,7)      
 }
   spe_acc <- data.frame(accuracy,run_time)
 return(spe_acc)
}
```
##1.6
```{r,warning=FALSE, echo=FALSE}
#### mean accuracy and run_time of 12 categories  

spe_acc_10.1 <- k_func(sim_10.1) 
sd_10.1 <- ((var(spe_acc_10.1$accuracy)/50)^0.5) #SD
spe_acc_10.1 <- summarise(spe_acc_10.1, Avg_accuracy = mean(spe_acc_10.1$accuracy),
          Avg_Run_Time = mean(spe_acc_10.1$run_time), #mean run_time & accuracy
          standard_deviation = sd_10.1)

spe_acc_10.2 <- k_func(sim_10.2)  
sd_10.2 <- ((var(spe_acc_10.2$accuracy)/50)^0.5)
spe_acc_10.2 <- summarise(spe_acc_10.2, Avg_accuracy = mean(spe_acc_10.2$accuracy),
                          Avg_Run_Time = mean(spe_acc_10.2$run_time),
                          standard_deviation = sd_10.2)

spe_acc_10.5 <- k_func(sim_10.5) 
sd_10.5 <- ((var(spe_acc_10.5$accuracy)/50)^0.5)
spe_acc_10.5 <- summarise(spe_acc_10.5, Avg_accuracy = mean(spe_acc_10.5$accuracy),
                          Avg_Run_Time = mean(spe_acc_10.5$run_time),
                          standard_deviation = sd_10.5)

spe_acc_10.10 <- k_func(sim_10.10)  
sd_10.10 <- ((var(spe_acc_10.10$accuracy)/50)^0.5)
spe_acc_10.10 <- summarise(spe_acc_10.10, Avg_accuracy = mean(spe_acc_10.10$accuracy),
                          Avg_Run_Time = mean(spe_acc_10.10$run_time),
                          standard_deviation = sd_10.10)
####
spe_acc_20.1 <- k_func(sim_20.1)
sd_20.1 <- ((var(spe_acc_20.1$accuracy)/50)^0.5)
spe_acc_20.1 <- summarise(spe_acc_20.1, Avg_accuracy = mean(spe_acc_20.1$accuracy),
                          Avg_Run_Time = mean(spe_acc_20.1$run_time),
                          standard_deviation = sd_20.1)

spe_acc_20.2 <- k_func(sim_20.2)
sd_20.2 <- ((var(spe_acc_20.2$accuracy)/50)^0.5)
spe_acc_20.2 <- summarise(spe_acc_20.2, Avg_accuracy = mean(spe_acc_20.2$accuracy),
                          Avg_Run_Time = mean(spe_acc_20.2$run_time),
                          standard_deviation = sd_20.2)
                            

spe_acc_20.5 <- k_func(sim_20.5) 
sd_20.5 <- ((var(spe_acc_20.5$accuracy)/50)^0.5)
spe_acc_20.5 <- summarise(spe_acc_20.5, Avg_accuracy = mean(spe_acc_20.5$accuracy),
                          Avg_Run_Time = mean(spe_acc_20.5$run_time),
                          standard_deviation = sd_20.5)

spe_acc_20.10 <- k_func(sim_20.10)  
sd_20.10 <- ((var(spe_acc_20.10$accuracy)/50)^0.5)
spe_acc_20.10 <- summarise(spe_acc_20.10, Avg_accuracy = mean(spe_acc_20.10$accuracy),
                           Avg_Run_Time = mean(spe_acc_20.10$run_time),
                           standard_deviation = sd_20.10)
####

spe_acc_50.1 <- k_func(sim_50.1) 
sd_50.1 <- ((var(spe_acc_50.1$accuracy)/50)^0.5)
spe_acc_50.1 <- summarise(spe_acc_50.1, Avg_accuracy = mean(spe_acc_50.1$accuracy),
                          Avg_Run_Time = mean(spe_acc_50.1$run_time),
                          standard_deviation = sd_50.1)

spe_acc_50.2 <- k_func(sim_50.2)  
sd_50.2 <- ((var(spe_acc_50.2$accuracy)/50)^0.5)
spe_acc_50.2 <- summarise(spe_acc_50.2, Avg_accuracy = mean(spe_acc_50.2$accuracy),
                          Avg_Run_Time = mean(spe_acc_50.2$run_time),
                          standard_deviation = sd_50.2)


spe_acc_50.5 <- k_func(sim_50.5) 
sd_50.5 <- ((var(spe_acc_50.5$accuracy)/50)^0.5)
spe_acc_50.5 <- summarise(spe_acc_50.5, Avg_accuracy = mean(spe_acc_50.5 $accuracy),
                          Avg_Run_Time = mean(spe_acc_50.5$run_time),
                          standard_deviation = sd_50.5)

spe_acc_50.10 <- k_func(sim_50.10)  
sd_50.10 <- ((var(spe_acc_50.10$accuracy)/50)^0.5)
spe_acc_50.10 <- summarise(spe_acc_50.10, Avg_accuracy = mean(spe_acc_50.10$accuracy),
                           Avg_Run_Time = mean(spe_acc_50.10$run_time),
                           standard_deviation = sd_50.10)
 

# joining all 12 data sets
joind_acc_tim <- rbind(spe_acc_10.1,spe_acc_10.2,spe_acc_10.5,spe_acc_10.10,
                       spe_acc_20.1,spe_acc_20.2,spe_acc_20.5,spe_acc_20.10,
                       spe_acc_50.1,spe_acc_50.2,spe_acc_50.5,spe_acc_50.10)
#giving row names
row.names(joind_acc_tim) <- c("p=10,sigma=1","p=10,sigma=2","p=10,sigma=5","p=10,sigma=10"
                              ,"p=20,sigma=1","p=20,sigma=2","p=20,sigma=5","p=20,sigma=10"
                              ,"p=50,sigma=1","p=50,sigma=2","p=50,sigma=5","p=50,sigma=10")

```

#############
#plot accuracy
```{r,warning=FALSE, echo=FALSE}
p_10 <- joind_acc_tim[1:4,]
p_10$sigma <- c(1,2,5,10)
p_20 <-joind_acc_tim[5:8,]
p_20$sigma <- c(1,2,5,10)
p_50 <-joind_acc_tim[9:12,]
p_50$sigma <- c(1,2,5,10)

gp10 <- ggplot()+
  geom_point(data = p_10, aes(x = sigma,y = Avg_accuracy), color = "blue")+
  geom_line(data = p_10, aes(x = sigma,y = Avg_accuracy), color = "red")+
  labs(title = "Accuracy for P = 10 \n& Different Values of sigma",subtitle = "",
        y= "Accuracy Percentage",x =  "sigma = {1, 2, 5, 10}")+
  theme(axis.title.y = element_text(size = rel(0.6), angle = 90), 
        plot.title = element_text(size = rel(0.85), hjust = 0.5, face = "bold"))

gp20 <- ggplot()+
  geom_point(data = p_20, aes(x = sigma,y = Avg_accuracy), color = "blue")+
  geom_line(data = p_20, aes(x = sigma,y = Avg_accuracy), color = "red")+
  labs(title = "Accuracy for P = 20 \n& Different Values of sigma",subtitle = "",
       y= "Accuracy Percentage",x =  "sigma = {1, 2, 5, 10}")+
  theme(axis.title.y = element_text(size = rel(0.6), angle = 90), 
        plot.title = element_text(size = rel(0.85), hjust = 0.5, face = "bold"))

gp50 <- ggplot()+
  geom_point(data = p_50, aes(x = sigma,y = Avg_accuracy), color = "blue")+
  geom_line(data = p_50, aes(x = sigma,y = Avg_accuracy), color = "red")+
  labs(title = "Accuracy for P = 50 \n& Different Values of sigma",subtitle = "",
       y= "Accuracy Percentage",x =  "sigma = {1, 2, 5, 10}")+
  theme(axis.title.y = element_text(size = rel(0.6), angle = 90), 
        plot.title = element_text(size = rel(0.85), hjust = 0.5, face = "bold"))


grid.arrange(gp10,gp20,gp50, nrow = 1)     # Create grid of plots 
```
#########
#plot sd
```{r,warning=FALSE, echo=FALSE,message=FALSE, width = 5}
gpp10 <- ggplot()+
  geom_point(data = p_10, aes(x = sigma,y = standard_deviation), color = "blue")+
  geom_line(data = p_10, aes(x = sigma,y = standard_deviation), color = "red")+
  labs(title = "standard_deviation for P = 10 \n& Different Values of sigma",subtitle = "",
       y= "standard_deviation",x =  "sigma = {1, 2, 5, 10}")+
  theme(axis.title.y = element_text(size = rel(0.6), angle = 90), 
        plot.title = element_text(size = rel(0.85), hjust = 0.5, face = "bold"))

gpp20 <- ggplot()+
  geom_point(data = p_20, aes(x = sigma,y = standard_deviation), color = "blue")+
  geom_line(data = p_20, aes(x = sigma,y = standard_deviation), color = "red")+
  labs(title = "standard_deviation for P = 20 \n& Different Values of sigma",subtitle = "",
       y= "standard_deviation",x =  "sigma = {1, 2, 5, 10}")+
  theme(axis.title.y = element_text(size = rel(0.6), angle = 90), 
        plot.title = element_text(size = rel(0.85), hjust = 0.5, face = "bold"))

gpp50 <- ggplot()+
  geom_point(data = p_50, aes(x = sigma,y = standard_deviation), color = "blue")+
  geom_line(data = p_50, aes(x = sigma,y = standard_deviation), color = "red")+
  labs(title = "standard_deviation for P = 50 \n& Different Values of sigma",subtitle = "",
       y= "standard_deviation",x =  "sigma = {1, 2, 5, 10}")+
  theme(axis.title.y = element_text(size = rel(0.6), angle = 90), 
        plot.title = element_text(size = rel(0.85), hjust = 0.5, face = "bold"))

grid.arrange(gpp10,gpp20,gpp50, nrow = 1) 

kable(joind_acc_tim)
```
#1.7
```{r,warning=FALSE, echo=FALSE}
#plot time

ggplot()+
  geom_col(data = joind_acc_tim, aes(y=Avg_Run_Time, x = row.names(joind_acc_tim), fill = row.names(joind_acc_tim) ))+
  labs(x = "parameters", y = "AVG Runnig_Time",
       title = "Average Running Time \nFor Different Combinations of P & sigma")+
  scale_fill_viridis_d(name = "parameters")+
  theme(axis.title.y = element_text(angle =90),
        plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(angle = 90, size = rel(0.7),face = "bold"))
```
#Q_1.8
We can see that overall the k-mean algorithms did not allocate the clusters
so well. The accuracy ranges from 39% to 41%.For p=50 we seem to get a slightly higher AVG accuracy level.

\nWe can not see a specific pattern for the standard deviation and the accuracy level.

\nThe run time for all data sets was very quick. As expected we can see that the data
with the longest run time is the data with the most variables and highest variance.
\nIn general we can see that it takes longer if thee are more variables in the data.


#Q_2
#open and create the relevant  Database 2.1
```{r,warning=FALSE, message=FALSE, echo=FALSE}
set.seed(42)
isb<- read.delim("cbs_demographics.txt")
yeshuvim<-read_xlsx("yeshuvim_2019.xlsx")
knesset<- read.csv("knesset_24.csv")
colnames(yeshuvim)=yeshuvim[1,]
yeshuvim<-yeshuvim[-c(1),]
colnames(yeshuvim)[4]="village"
data <- merge(isb,yeshuvim,by="village")#merge yeshvim and isb by village
colnames(knesset)[3]="סמל_ישוב"
data<-merge(data,knesset,by="סמל_ישוב")#merege data with knesset by = סמל ישוב 
sample <- sample_n(data,20)
row.names(sample)=sample$village
sample<-sample[,-c(2)]

```

#Create dendrogram of vote  2.2
```{r,warning=FALSE, message=FALSE, echo=FALSE}

vote<-sample[,30:68]
vote<-scale(vote)


#dendogram tree of V
dd <- dist(vote, method = "euclidean")
hc <- hclust(dd, method = "complete") 
plot(hc,main = "Cluster Dendrogram Vote Tree ",hang = -1, cex = 0.5,ylab ="Height")

#explain the graphD

```
Here we can see Cluster Deprogram of Votes
we chose the distance matrix with "Euclidean Metric".
and the hierarchical tree with the complete-link method.



#Create dendrogram of city   2.3
```{r,warning=FALSE, message=FALSE, echo=FALSE}
city<-cbind(sample[,2:15])
city<-scale(city)
#dendogram tree of V
ddcity <- dist(city, method = "euclidean")
hcity <- hclust(ddcity, method = "complete") 
plot(hcity,main = "Cluster Dendrogram City Tree ",hang = -1, cex = 0.5,ylab ="Height")

#explain the graphD


```
Here we can see Cluster Deprogram of Votes
we chose the distance matrix with "Euclidean Metric".
and the hierarchical tree with the  complete-link method


#Comapre 2 DendrogramsTree 2.4
```{r,warning=FALSE,echo=FALSE}
dend1 <- as.dendrogram (hc)
dend2 <- as.dendrogram (hcity)
labels_cex(dend1) <- 0.42
labels_cex(dend2) <- 0.42



plot(dend_diff(dend1, dend2, horiz = TRUE),sub="Vote as Cities")#Fix this title plot ! 




plot(dendlist(dend1,dend2) %>%
  untangle(method = "step1side") %>% # Find the best alignment layout
  tanglegram(),sub="Vote as Cities")


```
#Comparing:
Here we can see the Comparing between the two hierarchies.
In the left - the elections votes tree, and in the right - the demographic tree.
First, we can see that the distance is much bigger in the demographic tree -  meaning that the distances between the cities in the demographic data are bigger then the distances in the vote shares.
Also we can see in the red line that is distance between Holon and Netanya is the same in both trees.



#cor_bakers mesure similarity score for the two trees.Q2.5
#Baker’s Gamma Index
```{r,warning=FALSE,echo=FALSE}
print(paste0("Bakers_Gamma:",cor_bakers_gamma(dend1, dend2)))
```
#
We chose to use Baker’s Gamma Index to calculate the score between the trees.
It is defined as the rank correlation between the stages at which pairs of objects combine in both trees.
The value can range between -1 to 1. With near 0 values meaning that the two trees are not statistically similar at all.
For exact p-value one should use a permutation test. 
One such option will be to permute over the labels of one tree many times, calculating the distribution under the null hypothesis (keeping the trees topological constant).
Notice that this measure is not affected by the height of a branch but only of its relative position compared with other branches.
So here we can see the trees are not statistically similar.



#distribution forBaker’s Gamma Index Q2.6
```{r,warning=FALSE,echo=FALSE}
set.seed(23235)
the_cor <- cor_bakers_gamma(dend1, dend1)
the_cor2 <- cor_bakers_gamma(dend1, dend2)
R <- 200
cor_bakers_gamma_results <- numeric(R)
dend_mixed <- dend1
for(i in 1:R) {
   dend_mixed <- sample.dendrogram(dend_mixed, replace = FALSE)
   cor_bakers_gamma_results[i] <- cor_bakers_gamma(dend1, dend_mixed)
}
plot(density(cor_bakers_gamma_results),
     main = "Baker's gamma distribution under H0",
     xlim = c(-1,1))
abline(v = 0, lty = 2)
abline(v = the_cor, lty = 2, col = 2)
abline(v = the_cor2, lty = 2, col = 4)
legend("topleft", legend = c("Vote as itself", "Vote as Cities"), fill = c(2,4))
round(sum(the_cor2 < cor_bakers_gamma_results)/ R, 4)

title(sub = paste("One sided p-value:",
                  "cor =",  round(sum(the_cor < cor_bakers_gamma_results)/ R, 4),
                  " ; cor2 =",  round(sum(the_cor2 < cor_bakers_gamma_results)/ R, 4)
                  ))
```
Here we can see the distribution of the baker gamma.
Our Null hypothesis is that there is a statistical similarity between the trees.
From the previous section (baker gamma value =-0.156424) we can see there is no statistical similarity between the trees.
Also the P-value is larger then 0.05. Therefore we will reject the Null Hypothesis.