---
title: "Lab4"
author: "Amit Yaron and Akiva Finkelstein"
date: "6/20/2021"
output: html_document
editor_options:
  chunk_output_type: inline
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
```{r,message=FALSE,echo=FALSE,warning=FALSE}
library(randomForest)
library(tidyverse)
library(dplyr)
library(kableExtra)
library(bayestestR)
```
```{r,message=FALSE,echo=FALSE,warning=FALSE}
# helper function for visualizing one 28x28 image stored as a 784-pixel row
show_digit = function(arr784, col = gray(12:1 / 12), ...) {
  image(matrix(as.matrix(arr784[-785]), nrow = 28)[, 28:1], col = col, ...)
}
# load an IDX image file into a data frame with one row per image
load_image_file = function(filename) {
  f = file(filename, 'rb')
  readBin(f, 'integer', n = 1, size = 4, endian = 'big')  # magic number (discarded)
  n    = readBin(f, 'integer', n = 1, size = 4, endian = 'big')
  nrow = readBin(f, 'integer', n = 1, size = 4, endian = 'big')
  ncol = readBin(f, 'integer', n = 1, size = 4, endian = 'big')
  x = readBin(f, 'integer', n = n * nrow * ncol, size = 1, signed = FALSE)
  close(f)
  data.frame(matrix(x, ncol = nrow * ncol, byrow = TRUE))
}
# load an IDX label file into an integer vector
load_label_file = function(filename) {
  f = file(filename, 'rb')
  readBin(f, 'integer', n = 1, size = 4, endian = 'big')  # magic number (discarded)
  n = readBin(f, 'integer', n = 1, size = 4, endian = 'big')
  y = readBin(f, 'integer', n = n, size = 1, signed = FALSE)
  close(f)
  y
}
# load images
train = load_image_file("train-images-idx3-ubyte")
test  = load_image_file("t10k-images-idx3-ubyte")
# load labels
train$y = as.factor(load_label_file("train-labels-idx1-ubyte"))
test$y  = as.factor(load_label_file("t10k-labels-idx1-ubyte"))
# view one training image as a sanity check
show_digit(train[10000, ])
```
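As a quick sanity check on the loaded files (optional, not part of the original lab output), MNIST should contain 60,000 training and 10,000 test images of 28 × 28 = 784 pixels, plus the label column added above:

```{r, eval=FALSE}
# not evaluated here: confirm the dimensions of the loaded data
dim(train)  # expected: 60000 785
dim(test)   # expected: 10000 785
```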
```{r,message=FALSE,echo=FALSE,warning=FALSE}
set.seed(245)
# keep only the digits 3 and 8, then sample 4,000 training images
train38     <- train[train$y == 3 | train$y == 8, ]
new_train   <- sample_n(train38, 4000, replace = FALSE)
new_train_x <- new_train[, -which(names(new_train) == "y")]
test38      <- test[test$y == 3 | test$y == 8, ]
test38_x    <- test38[, -which(names(test38) == "y")]
```
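Optionally, the class balance of the 4,000-image training sample can be checked before fitting anything:

```{r, eval=FALSE}
# not evaluated here: counts of 3s and 8s in the training sample
table(droplevels(new_train$y))
```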
# Q1.1:
##### We chose to compare two methods: Random Forest and Logistic Regression
The first method is a random forest trained on the training data: many classification trees are fit on bootstrap samples of the training set and their votes are aggregated (bagging), which reduces the variance of the fit.
The second method is logistic regression, where we assume a linear relationship between the predictor variables (the pixel intensities) and the log-odds (logit) of the event that Y = 1 (the image is a 3).
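Explicitly, writing $p(x) = P(Y = 3 \mid x)$ for an image with pixel intensities $x_1, \dots, x_{784}$, the model assumes

$$\log\frac{p(x)}{1 - p(x)} = \beta_0 + \beta_1 x_1 + \dots + \beta_{784} x_{784}.$$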
For both models we use a threshold of 0.5, since we treat the errors symmetrically, i.e. false negatives and false positives carry the same cost.
# Random Forest
```{r,message=FALSE,echo=FALSE,warning=FALSE}
# fit a random forest of 100 trees, trying 30 randomly chosen pixels at each split
rf_model <- randomForest(x = new_train_x, y = droplevels(new_train$y),
                         ntree = 100, mtry = 30, do.trace = 1)
# training predictions (out-of-bag probabilities); column 1 is P(y == "3")
rf_pred_train <- predict(rf_model, type = 'prob')
temp_rf_train <- rf_pred_train[, 1]
rf_pred_train <- ifelse(temp_rf_train > 0.5, 3, 8)
# test-set predictions
rf_pred_test <- predict(rf_model, newdata = test38_x, type = 'prob')
temp_rf_test <- rf_pred_test[, 1]
rf_pred_test <- ifelse(temp_rf_test > 0.5, 3, 8)
```
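As an optional sanity check (not part of the required lab output), the fitted forest's own out-of-bag error and variable importance can be inspected:

```{r, eval=FALSE}
# not evaluated here: out-of-bag error versus number of trees, and the
# 15 pixels with the largest mean decrease in Gini impurity
plot(rf_model)
varImpPlot(rf_model, n.var = 15)
```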
# Logistic Regression
```{r,message=FALSE,echo=FALSE,warning=FALSE}
# logistic regression on all pixel columns
logis_model <- glm(new_train$y ~ ., family = "binomial", data = new_train)
# note: with a factor response, glm() models the probability of the non-reference
# (second) factor level, so the fitted probabilities here presumably correspond to P(y == "8")
logis_pred_train <- predict(logis_model, type = "response")
logis_pred_train <- ifelse(logis_pred_train > 0.5, 3, 8)
logis_pred_test  <- predict(logis_model, newdata = test38_x, type = "response")
logis_pred_temp  <- ifelse(logis_pred_test > 0.5, 3, 8)
```
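A quick, optional way to confirm which digit the fitted probabilities refer to (an assumption worth checking before choosing the direction of the `ifelse()` above) is to look at the factor levels actually used in the fit:

```{r, eval=FALSE}
# not evaluated here: the binomial family treats the first level as "failure",
# so the modeled probability is that of the second level
levels(droplevels(new_train$y))
```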
# Q1.2: Confusion matrix
```{r,message=FALSE,echo=FALSE,warning=FALSE}
# confusion-matrix summary, treating the digit 3 as the "positive" class
confus_mat = function(y_test, pred) {
  tp <- sum(pred == 3 & y_test == 3)  # predicted 3, truly 3
  fp <- sum(pred == 3 & y_test == 8)  # predicted 3, truly 8
  fn <- sum(pred == 8 & y_test == 3)  # predicted 8, truly 3
  tn <- sum(pred == 8 & y_test == 8)  # predicted 8, truly 8
  precision <- tp / (tp + fp)  # TP / (TP + FP)
  recal     <- tp / (tp + fn)  # TP / (TP + FN)
  f_P_rate  <- fp / (fp + tn)  # FP / (FP + TN)
  dat <- data.frame("True_Pos" = tp, "False_pos" = fp,
                    "False_Neg" = fn, "True_neg" = tn,
                    "Precision" = precision, "recall" = recal,
                    "False_Pos_rate" = f_P_rate)
  return(dat)
}
rand_fors_dat_test = confus_mat(test38$y,rf_pred_test)
rand_fors_dat_train = confus_mat(new_train$y,rf_pred_train)
logis_dat_test = confus_mat(test38$y,logis_pred_temp)
logis_dat_train = confus_mat(new_train$y,logis_pred_train)
logis_dat_train %>%
  kbl(caption = "Confusion matrix for the logistic regression model (training set)") %>%
  kable_material_dark("hover", full_width = F)
logis_dat_test %>%
  kbl(caption = "Confusion matrix for the logistic regression model (test set)") %>%
  kable_material_dark("hover", full_width = F)
rand_fors_dat_train %>%
  kbl(caption = "Confusion matrix for the random forest model (training set)") %>%
  kable_material_dark("hover", full_width = F)
rand_fors_dat_test %>%
  kbl(caption = "Confusion matrix for the random forest model (test set)") %>%
  kable_material_dark("hover", full_width = F)
```
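For a single-number comparison alongside the tables, overall accuracy can be computed from the same summaries; this is an optional addition, not part of the original lab output:

```{r, eval=FALSE}
# not evaluated here: overall accuracy = (TP + TN) / total,
# using the column names produced by confus_mat() above
accuracy <- function(d) {
  (d$True_Pos + d$True_neg) /
    (d$True_Pos + d$False_pos + d$False_Neg + d$True_neg)
}
sapply(list(rf_train = rand_fors_dat_train, rf_test = rand_fors_dat_test,
            glm_train = logis_dat_train,    glm_test = logis_dat_test), accuracy)
```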
From the tables above we can see that the random forest model is the better predictor for this data set: it makes more correct predictions and fewer mistakes. The logistic regression predictions on the training set contain only mistakes and no correct predictions; this looks like a label-direction issue rather than a genuinely useless model, since `glm()` returns the probability of the second factor level (here presumably "8") while our `ifelse()` maps high probabilities to 3, and the ROC curve below, which uses one minus the fitted probability, shows the model does separate the classes. For the random forest, the training and test results are not far apart, so we assume the model was not overfitted.
# Q1.3: Draw ROC
```{r,message=FALSE,echo=FALSE,warning=FALSE}
# build an ROC curve by sweeping the classification threshold from 0 to 1
roc_curve <- function(test_y, predictor, model_name) {
  threshold <- seq(0, 1, 0.01)
  y_recall  <- c()
  x_fp_rate <- c()
  for (i in threshold) {
    predic1  <- ifelse(predictor > i, 3, 8)
    mat_temp <- confus_mat(test_y, predic1)
    mat_temp[is.na(mat_temp)] <- 0
    y_recall  <- c(y_recall, mat_temp$recall)
    x_fp_rate <- c(x_fp_rate, mat_temp$False_Pos_rate)
  }
  dat <- data.frame("recall" = y_recall, "False_positive_rate" = x_fp_rate)
  dat <- rbind(dat, c(1, 1))  # make sure the curve reaches the (1, 1) corner
  auc <- auc(x = dat$False_positive_rate, y = dat$recall)  # bayestestR::auc
  g <- ggplot(data = dat, aes(x = False_positive_rate, y = recall)) +
    geom_line(col = "blue") +
    labs(y = "True Positive Rate", x = "False Positive Rate") +
    ggtitle(paste("ROC of", model_name, "Model"), subtitle = paste("AUC =", auc)) +
    theme(plot.title = element_text(size = rel(1.5), hjust = 0.5, vjust = 2, face = "bold"))
  return(g)
}
# the glm fitted probability presumably refers to the "8" class, so 1 - prob is used as the score for a 3
g_logis = roc_curve(test38$y, 1 - logis_pred_test, "Logistic")
g_rf    = roc_curve(test38$y, temp_rf_test, "Random Forest")
gridExtra::grid.arrange(g_rf, g_logis)
```
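As an optional cross-check (not part of the original lab), the AUC values shown in the plot subtitles could be verified with a direct trapezoidal rule on the same (FPR, TPR) points, independently of `bayestestR`:

```{r, eval=FALSE}
# not evaluated here: trapezoidal AUC from a vector of false positive rates
# and the matching true positive rates
trap_auc <- function(fpr, tpr) {
  o <- order(fpr)
  sum(diff(fpr[o]) * (head(tpr[o], -1) + tail(tpr[o], -1)) / 2)
}
```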
From the plots above we can see that the random forest model is the better model because it has a higher AUC.
# Q1.4
```{r,message=FALSE,echo=FALSE,warning=FALSE}
# images where the random forest predicted 8 but the true label is 3
t  <- as.vector(rf_pred_test) > as.vector(test38$y)
t1 <- which(t %in% TRUE)
par(mfrow = c(1, 2))
show_digit(test38[t1[1], ])
show_digit(test38[t1[4], ])
mtext("The model predicted 8 while the true value is 3", side = 3, line = -1, outer = TRUE)
# images where the random forest predicted 3 but the true label is 8
t3 <- as.vector(rf_pred_test) < as.vector(test38$y)
t2 <- which(t3 %in% TRUE)
par(mfrow = c(1, 2))
show_digit(test38[t2[1], ])
show_digit(test38[t2[2], ])
mtext("The model predicted 3 while the true value is 8", side = 3, line = -1, outer = TRUE)
```
These misclassified examples come from the random forest model.
In the first pair of images the model predicted 8 while the true value is 3. In the left image the connection in the middle of the digit is faint and greyish, so the 3 may appear closed like an 8; in the right image the top part is not closed, which could mislead the model for a similar reason.
In the second pair of images the model predicted 3 while the true value is 8. Both digits look like rounded threes, and the centre of each is very thick, as in an 8.
# Q1.5
We think the random forest will still predict the image well, since it classifies based on differences between pixel values rather than on the values themselves, while logistic regression will not predict well because it relies on the actual pixel values; if the colours are switched from black to white, it will not be able to predict correctly.
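A minimal sketch of how this claim could be checked, assuming the pixel values are on the usual 0–255 grey scale; the object names `test38_inv`, `rf_inv_pred`, and `logis_inv_pred` are introduced here for illustration only:

```{r, eval=FALSE}
# not evaluated here: invert the grey levels and see how each model copes
test38_inv <- 255 - test38_x
rf_inv_prob    <- predict(rf_model,    newdata = test38_inv, type = "prob")[, 1]
logis_inv_prob <- predict(logis_model, newdata = test38_inv, type = "response")
rf_inv_pred    <- ifelse(rf_inv_prob > 0.5, 3, 8)
# note: as discussed above, the glm probabilities presumably refer to the "8" class
logis_inv_pred <- ifelse(logis_inv_prob > 0.5, 3, 8)
truth <- as.numeric(as.character(test38$y))
mean(rf_inv_pred == truth)     # random forest accuracy on inverted images
mean(logis_inv_pred == truth)  # logistic regression accuracy on inverted images
```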