-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathLab_2.Rmd
426 lines (351 loc) · 15.9 KB
/
Lab_2.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
---
title: "Untitled"
author: "Akiva Finkelstein & Amit Yaron"
date: "3 5 2021"
output: html_document
---
```{r, warning=FALSE, message=FALSE, echo=FALSE}
library(MASS)
library(factoextra)
library(dplyr)
library(patchwork)
library(knitr)
library(grid)
library(gridExtra)
library(readxl)
library('dendextend')
options(scipen=999)
```
#1.1,1.2
```{r,warning=FALSE,echo=FALSE}
set.seed(1)
m1 = rnorm(10, mean = 0, sd = 1)
m2 = rnorm(10, mean = 0, sd = 1)
m3 = rnorm(10, mean = 0, sd = 1)
m1_n = c(m1,rep(0,10))
m2_n = c(m2,rep(0,20))
m3_n = c(m3,rep(0,40))
mu = c(m1_n,m2_n,m3_n)
s_function = function(m1,m2,m3,p,sigma2_e){
n=100
m1_n = c(m1,rep(0,10))
m2_n = c(m2,rep(0,20))
m3_n = c(m3,rep(0,40))
data = as.data.frame(matrix(data = NA,nrow = n , ncol = p))
for( i in 1:p)
{
tamp = c(rnorm(20, mean = mean(m1_n), sd = sigma2_e),
rnorm(30, mean = mean(m2_n),sd = sigma2_e),
rnorm(50, mean = mean(m3_n), sd = sigma2_e))
data[,i] = tamp
}
return(as.data.frame(data))
}
```
#1.3, 1.4
```{r,warning=FALSE, echo=FALSE}
#50 df
data_gen <- function(m1_n,m2_n,m3_n,p,sigma){
sim_list <- list()
for (i in 1:50) {
temp <- s_function(m1_n, m2_n,m3_n, p, sigma)
sim_list[[i]] <- temp
temp <- data.frame()
}
return(c(sim_list))
}
# lists of 50 df
sim_10.1 <- data_gen(m1_n, m2_n,m3_n, 10, 1)
sim_10.2 <- data_gen(m1_n, m2_n,m3_n, 10, 2)
sim_10.5 <- data_gen(m1_n, m2_n,m3_n, 10, 5)
sim_10.10 <- data_gen(m1_n, m2_n,m3_n, 10, 10)
sim_20.1 <- data_gen(m1_n, m2_n,m3_n, 20, 1)
sim_20.2 <- data_gen(m1_n, m2_n,m3_n, 20,2)
sim_20.5 <- data_gen(m1_n, m2_n,m3_n, 20, 5)
sim_20.10 <- data_gen(m1_n, m2_n,m3_n, 20, 10)
sim_50.1 <- data_gen(m1_n, m2_n,m3_n, 50, 1)
sim_50.2 <- data_gen(m1_n, m2_n,m3_n, 50, 2)
sim_50.5 <- data_gen(m1_n, m2_n,m3_n, 50, 5)
sim_50.10 <- data_gen(m1_n, m2_n,m3_n, 50, 10)
```
#1.5
```{r,warning=FALSE, echo=FALSE}
#kmean function
k_func <- function(lst){
accuracy <-c()
run_time <- c()
for (i in 1:50) {
start_time <- Sys.time() # start
km <- kmeans(x = lst[[i]], centers = 3)
end_time <- Sys.time() #end
km$cluster
v1 <- rep(1,20)
v2 <- rep(2,30)
v3 <- rep(3,50)
V <- c(v1,v2,v3)
table(V, km$cluster)
pr_table <-prop.table(table(V, km$cluster),2)
max_diag <- 0
for(j in 1:3) {
max_diag <- max_diag +max(pr_table[j,])
}
accuracy[i] <-max_diag/3
run_time[i] <- round(end_time - start_time,7)
}
spe_acc <- data.frame(accuracy,run_time)
return(spe_acc)
}
```
##1.6
```{r,warning=FALSE, echo=FALSE}
#### mean accuracy and run_time of 12 categories
spe_acc_10.1 <- k_func(sim_10.1)
sd_10.1 <- ((var(spe_acc_10.1$accuracy)/50)^0.5) #SD
spe_acc_10.1 <- summarise(spe_acc_10.1, Avg_accuracy = mean(spe_acc_10.1$accuracy),
Avg_Run_Time = mean(spe_acc_10.1$run_time), #mean run_time & accuracy
standard_deviation = sd_10.1)
spe_acc_10.2 <- k_func(sim_10.2)
sd_10.2 <- ((var(spe_acc_10.2$accuracy)/50)^0.5)
spe_acc_10.2 <- summarise(spe_acc_10.2, Avg_accuracy = mean(spe_acc_10.2$accuracy),
Avg_Run_Time = mean(spe_acc_10.2$run_time),
standard_deviation = sd_10.2)
spe_acc_10.5 <- k_func(sim_10.5)
sd_10.5 <- ((var(spe_acc_10.5$accuracy)/50)^0.5)
spe_acc_10.5 <- summarise(spe_acc_10.5, Avg_accuracy = mean(spe_acc_10.5$accuracy),
Avg_Run_Time = mean(spe_acc_10.5$run_time),
standard_deviation = sd_10.5)
spe_acc_10.10 <- k_func(sim_10.10)
sd_10.10 <- ((var(spe_acc_10.10$accuracy)/50)^0.5)
spe_acc_10.10 <- summarise(spe_acc_10.10, Avg_accuracy = mean(spe_acc_10.10$accuracy),
Avg_Run_Time = mean(spe_acc_10.10$run_time),
standard_deviation = sd_10.10)
####
spe_acc_20.1 <- k_func(sim_20.1)
sd_20.1 <- ((var(spe_acc_20.1$accuracy)/50)^0.5)
spe_acc_20.1 <- summarise(spe_acc_20.1, Avg_accuracy = mean(spe_acc_20.1$accuracy),
Avg_Run_Time = mean(spe_acc_20.1$run_time),
standard_deviation = sd_20.1)
spe_acc_20.2 <- k_func(sim_20.2)
sd_20.2 <- ((var(spe_acc_20.2$accuracy)/50)^0.5)
spe_acc_20.2 <- summarise(spe_acc_20.2, Avg_accuracy = mean(spe_acc_20.2$accuracy),
Avg_Run_Time = mean(spe_acc_20.2$run_time),
standard_deviation = sd_20.2)
spe_acc_20.5 <- k_func(sim_20.5)
sd_20.5 <- ((var(spe_acc_20.5$accuracy)/50)^0.5)
spe_acc_20.5 <- summarise(spe_acc_20.5, Avg_accuracy = mean(spe_acc_20.5$accuracy),
Avg_Run_Time = mean(spe_acc_20.5$run_time),
standard_deviation = sd_20.5)
spe_acc_20.10 <- k_func(sim_20.10)
sd_20.10 <- ((var(spe_acc_20.10$accuracy)/50)^0.5)
spe_acc_20.10 <- summarise(spe_acc_20.10, Avg_accuracy = mean(spe_acc_20.10$accuracy),
Avg_Run_Time = mean(spe_acc_20.10$run_time),
standard_deviation = sd_20.10)
####
spe_acc_50.1 <- k_func(sim_50.1)
sd_50.1 <- ((var(spe_acc_50.1$accuracy)/50)^0.5)
spe_acc_50.1 <- summarise(spe_acc_50.1, Avg_accuracy = mean(spe_acc_50.1$accuracy),
Avg_Run_Time = mean(spe_acc_50.1$run_time),
standard_deviation = sd_50.1)
spe_acc_50.2 <- k_func(sim_50.2)
sd_50.2 <- ((var(spe_acc_50.2$accuracy)/50)^0.5)
spe_acc_50.2 <- summarise(spe_acc_50.2, Avg_accuracy = mean(spe_acc_50.2$accuracy),
Avg_Run_Time = mean(spe_acc_50.2$run_time),
standard_deviation = sd_50.2)
spe_acc_50.5 <- k_func(sim_50.5)
sd_50.5 <- ((var(spe_acc_50.5$accuracy)/50)^0.5)
spe_acc_50.5 <- summarise(spe_acc_50.5, Avg_accuracy = mean(spe_acc_50.5 $accuracy),
Avg_Run_Time = mean(spe_acc_50.5$run_time),
standard_deviation = sd_50.5)
spe_acc_50.10 <- k_func(sim_50.10)
sd_50.10 <- ((var(spe_acc_50.10$accuracy)/50)^0.5)
spe_acc_50.10 <- summarise(spe_acc_50.10, Avg_accuracy = mean(spe_acc_50.10$accuracy),
Avg_Run_Time = mean(spe_acc_50.10$run_time),
standard_deviation = sd_50.10)
# joining all 12 data sets
joind_acc_tim <- rbind(spe_acc_10.1,spe_acc_10.2,spe_acc_10.5,spe_acc_10.10,
spe_acc_20.1,spe_acc_20.2,spe_acc_20.5,spe_acc_20.10,
spe_acc_50.1,spe_acc_50.2,spe_acc_50.5,spe_acc_50.10)
#giving row names
row.names(joind_acc_tim) <- c("p=10,sigma=1","p=10,sigma=2","p=10,sigma=5","p=10,sigma=10"
,"p=20,sigma=1","p=20,sigma=2","p=20,sigma=5","p=20,sigma=10"
,"p=50,sigma=1","p=50,sigma=2","p=50,sigma=5","p=50,sigma=10")
```
#############
#plot accuracy
```{r,warning=FALSE, echo=FALSE}
p_10 <- joind_acc_tim[1:4,]
p_10$sigma <- c(1,2,5,10)
p_20 <-joind_acc_tim[5:8,]
p_20$sigma <- c(1,2,5,10)
p_50 <-joind_acc_tim[9:12,]
p_50$sigma <- c(1,2,5,10)
gp10 <- ggplot()+
geom_point(data = p_10, aes(x = sigma,y = Avg_accuracy), color = "blue")+
geom_line(data = p_10, aes(x = sigma,y = Avg_accuracy), color = "red")+
labs(title = "Accuracy for P = 10 \n& Different Values of sigma",subtitle = "",
y= "Accuracy Percentage",x = "sigma = {1, 2, 5, 10}")+
theme(axis.title.y = element_text(size = rel(0.6), angle = 90),
plot.title = element_text(size = rel(0.85), hjust = 0.5, face = "bold"))
gp20 <- ggplot()+
geom_point(data = p_20, aes(x = sigma,y = Avg_accuracy), color = "blue")+
geom_line(data = p_20, aes(x = sigma,y = Avg_accuracy), color = "red")+
labs(title = "Accuracy for P = 20 \n& Different Values of sigma",subtitle = "",
y= "Accuracy Percentage",x = "sigma = {1, 2, 5, 10}")+
theme(axis.title.y = element_text(size = rel(0.6), angle = 90),
plot.title = element_text(size = rel(0.85), hjust = 0.5, face = "bold"))
gp50 <- ggplot()+
geom_point(data = p_50, aes(x = sigma,y = Avg_accuracy), color = "blue")+
geom_line(data = p_50, aes(x = sigma,y = Avg_accuracy), color = "red")+
labs(title = "Accuracy for P = 50 \n& Different Values of sigma",subtitle = "",
y= "Accuracy Percentage",x = "sigma = {1, 2, 5, 10}")+
theme(axis.title.y = element_text(size = rel(0.6), angle = 90),
plot.title = element_text(size = rel(0.85), hjust = 0.5, face = "bold"))
grid.arrange(gp10,gp20,gp50, nrow = 1) # Create grid of plots
```
#########
#plot sd
```{r,warning=FALSE, echo=FALSE,message=FALSE, width = 5}
gpp10 <- ggplot()+
geom_point(data = p_10, aes(x = sigma,y = standard_deviation), color = "blue")+
geom_line(data = p_10, aes(x = sigma,y = standard_deviation), color = "red")+
labs(title = "standard_deviation for P = 10 \n& Different Values of sigma",subtitle = "",
y= "standard_deviation",x = "sigma = {1, 2, 5, 10}")+
theme(axis.title.y = element_text(size = rel(0.6), angle = 90),
plot.title = element_text(size = rel(0.85), hjust = 0.5, face = "bold"))
gpp20 <- ggplot()+
geom_point(data = p_20, aes(x = sigma,y = standard_deviation), color = "blue")+
geom_line(data = p_20, aes(x = sigma,y = standard_deviation), color = "red")+
labs(title = "standard_deviation for P = 20 \n& Different Values of sigma",subtitle = "",
y= "standard_deviation",x = "sigma = {1, 2, 5, 10}")+
theme(axis.title.y = element_text(size = rel(0.6), angle = 90),
plot.title = element_text(size = rel(0.85), hjust = 0.5, face = "bold"))
gpp50 <- ggplot()+
geom_point(data = p_50, aes(x = sigma,y = standard_deviation), color = "blue")+
geom_line(data = p_50, aes(x = sigma,y = standard_deviation), color = "red")+
labs(title = "standard_deviation for P = 50 \n& Different Values of sigma",subtitle = "",
y= "standard_deviation",x = "sigma = {1, 2, 5, 10}")+
theme(axis.title.y = element_text(size = rel(0.6), angle = 90),
plot.title = element_text(size = rel(0.85), hjust = 0.5, face = "bold"))
grid.arrange(gpp10,gpp20,gpp50, nrow = 1)
kable(joind_acc_tim)
```
#1.7
```{r,warning=FALSE, echo=FALSE}
#plot time
ggplot()+
geom_col(data = joind_acc_tim, aes(y=Avg_Run_Time, x = row.names(joind_acc_tim), fill = row.names(joind_acc_tim) ))+
labs(x = "parameters", y = "AVG Runnig_Time",
title = "Average Running Time \nFor Different Combinations of P & sigma")+
scale_fill_viridis_d(name = "parameters")+
theme(axis.title.y = element_text(angle =90),
plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 90, size = rel(0.7),face = "bold"))
```
#Q_1.8
We can see that overall the k-mean algorithms did not allocate the clusters
so well. The accuracy ranges from 39% to 41%.For p=50 we seem to get a slightly higher AVG accuracy level.
\nWe can not see a specific pattern for the standard deviation and the accuracy level.
\nThe run time for all data sets was very quick. As expected we can see that the data
with the longest run time is the data with the most variables and highest variance.
\nIn general we can see that it takes longer if thee are more variables in the data.
#Q_2
#open and create the relevant Database 2.1
```{r,warning=FALSE, message=FALSE, echo=FALSE}
set.seed(42)
isb<- read.delim("cbs_demographics.txt")
yeshuvim<-read_xlsx("yeshuvim_2019.xlsx")
knesset<- read.csv("knesset_24.csv")
colnames(yeshuvim)=yeshuvim[1,]
yeshuvim<-yeshuvim[-c(1),]
colnames(yeshuvim)[4]="village"
data <- merge(isb,yeshuvim,by="village")#merge yeshvim and isb by village
colnames(knesset)[3]="סמל_ישוב"
data<-merge(data,knesset,by="סמל_ישוב")#merege data with knesset by = סמל ישוב
sample <- sample_n(data,20)
row.names(sample)=sample$village
sample<-sample[,-c(2)]
```
#Create dendrogram of vote 2.2
```{r,warning=FALSE, message=FALSE, echo=FALSE}
vote<-sample[,30:68]
vote<-scale(vote)
#dendogram tree of V
dd <- dist(vote, method = "euclidean")
hc <- hclust(dd, method = "complete")
plot(hc,main = "Cluster Dendrogram Vote Tree ",hang = -1, cex = 0.5,ylab ="Height")
#explain the graphD
```
Here we can see Cluster Deprogram of Votes
we chose the distance matrix with "Euclidean Metric".
and the hierarchical tree with the complete-link method.
#Create dendrogram of city 2.3
```{r,warning=FALSE, message=FALSE, echo=FALSE}
city<-cbind(sample[,2:15])
city<-scale(city)
#dendogram tree of V
ddcity <- dist(city, method = "euclidean")
hcity <- hclust(ddcity, method = "complete")
plot(hcity,main = "Cluster Dendrogram City Tree ",hang = -1, cex = 0.5,ylab ="Height")
#explain the graphD
```
Here we can see Cluster Deprogram of Votes
we chose the distance matrix with "Euclidean Metric".
and the hierarchical tree with the complete-link method
#Comapre 2 DendrogramsTree 2.4
```{r,warning=FALSE,echo=FALSE}
dend1 <- as.dendrogram (hc)
dend2 <- as.dendrogram (hcity)
labels_cex(dend1) <- 0.42
labels_cex(dend2) <- 0.42
plot(dend_diff(dend1, dend2, horiz = TRUE),sub="Vote as Cities")#Fix this title plot !
plot(dendlist(dend1,dend2) %>%
untangle(method = "step1side") %>% # Find the best alignment layout
tanglegram(),sub="Vote as Cities")
```
#Comparing:
Here we can see the Comparing between the two hierarchies.
In the left - the elections votes tree, and in the right - the demographic tree.
First, we can see that the distance is much bigger in the demographic tree - meaning that the distances between the cities in the demographic data are bigger then the distances in the vote shares.
Also we can see in the red line that is distance between Holon and Netanya is the same in both trees.
#cor_bakers mesure similarity score for the two trees.Q2.5
#Baker’s Gamma Index
```{r,warning=FALSE,echo=FALSE}
print(paste0("Bakers_Gamma:",cor_bakers_gamma(dend1, dend2)))
```
#
We chose to use Baker’s Gamma Index to calculate the score between the trees.
It is defined as the rank correlation between the stages at which pairs of objects combine in both trees.
The value can range between -1 to 1. With near 0 values meaning that the two trees are not statistically similar at all.
For exact p-value one should use a permutation test.
One such option will be to permute over the labels of one tree many times, calculating the distribution under the null hypothesis (keeping the trees topological constant).
Notice that this measure is not affected by the height of a branch but only of its relative position compared with other branches.
So here we can see the trees are not statistically similar.
#distribution forBaker’s Gamma Index Q2.6
```{r,warning=FALSE,echo=FALSE}
set.seed(23235)
the_cor <- cor_bakers_gamma(dend1, dend1)
the_cor2 <- cor_bakers_gamma(dend1, dend2)
R <- 200
cor_bakers_gamma_results <- numeric(R)
dend_mixed <- dend1
for(i in 1:R) {
dend_mixed <- sample.dendrogram(dend_mixed, replace = FALSE)
cor_bakers_gamma_results[i] <- cor_bakers_gamma(dend1, dend_mixed)
}
plot(density(cor_bakers_gamma_results),
main = "Baker's gamma distribution under H0",
xlim = c(-1,1))
abline(v = 0, lty = 2)
abline(v = the_cor, lty = 2, col = 2)
abline(v = the_cor2, lty = 2, col = 4)
legend("topleft", legend = c("Vote as itself", "Vote as Cities"), fill = c(2,4))
round(sum(the_cor2 < cor_bakers_gamma_results)/ R, 4)
title(sub = paste("One sided p-value:",
"cor =", round(sum(the_cor < cor_bakers_gamma_results)/ R, 4),
" ; cor2 =", round(sum(the_cor2 < cor_bakers_gamma_results)/ R, 4)
))
```
Here we can see the distribution of the baker gamma.
Our Null hypothesis is that there is a statistical similarity between the trees.
From the previous section (baker gamma value =-0.156424) we can see there is no statistical similarity between the trees.
Also the P-value is larger then 0.05. Therefore we will reject the Null Hypothesis.