#### Analysis of the diabetes data ####
#
# Compares a logistic regression with a radial-kernel SVM on diabetes.csv:
#   1. zero-coded measurements are replaced by the median of the non-zero
#      values of the same column;
#   2. both models are fitted on a random 80/20 hold-out split;
#   3. the SVM cost/gamma are tuned by cross-validation on the training rows;
#   4. the tuned SVM is re-assessed with 10-fold cross-validation.
#
# Error rates quoted as "original run" were recorded by the script's author
# without a fixed seed; they are kept for orientation only and will not be
# reproduced exactly.

# svm() and tune.svm() are provided by e1071, which the script never loaded.
library(e1071)

# sample() and the internal CV split of tune.svm() are random; fix the seed so
# that the reported error rates can be reproduced.
set.seed(2021)

# Directory holding diabetes.csv. Building the path explicitly avoids changing
# the session's working directory with setwd().
data_dir <- "~/Seafile/Enseignement/M2_Classification/2021-22/Data"
diabetes <- read.table(
  file.path(data_dir, "diabetes.csv"),
  sep = ",",
  header = TRUE
)

# Replace the zeros of a numeric vector by the median of its non-zero values.
# NA entries are left untouched and ignored when computing the median.
impute_zero_with_median <- function(x) {
  is_zero <- !is.na(x) & x == 0
  x[is_zero] <- median(x[!is_zero], na.rm = TRUE)
  x
}

# Columns in which the script treats 0 as a missing-value code.
# NOTE(review): the medians are computed on all rows, i.e. before the
# train/test split, so the test rows influence the imputed values.
zero_coded <- c("SkinThickness", "Insulin", "Glucose", "BloodPressure", "BMI")
diabetes[zero_coded] <- lapply(diabetes[zero_coded], impute_zero_with_median)

# ---- Hold-out split ---------------------------------------------------------
# The original hard-coded 614 training rows out of 768; floor(0.8 * n) gives
# the same count for 768 rows and adapts if the file changes.
n_obs <- nrow(diabetes)
n_train <- floor(0.8 * n_obs)
idx <- sample(seq_len(n_obs), n_train, replace = FALSE)
dataL <- diabetes[idx, ]   # learning (training) set
dataT <- diabetes[-idx, ]  # test set

# ---- Baseline models --------------------------------------------------------
model_train_glm <- glm(
  Outcome ~ Pregnancies + DiabetesPedigreeFunction + BMI + Glucose,
  data = dataL,
  family = "binomial"
)
model_train_svm <- svm(
  factor(Outcome) ~ .,
  data = dataL,
  scale = TRUE,
  kernel = "radial",
  cost = 10
)

resPred_glm <- predict(model_train_glm, newdata = dataT, type = "response")
resPred_svm <- predict(model_train_svm, newdata = dataT)
head(resPred_glm)

# Classify as 1 when the fitted probability is at least 0.5.
pred.glm <- 1 * (resPred_glm >= 0.5)

# Hold-out misclassification rates.
mean(pred.glm != dataT$Outcome)     # original run: 0.2207792
mean(resPred_svm != dataT$Outcome)  # original run: 0.2467532

# ---- Cross-validated tuning of cost and gamma -------------------------------
# Grids of powers of two: cost in 2^-5 .. 2^15, gamma in 2^-15 .. 2^3.
Cgrid <- 2^seq(-5, 15, by = 2)
Gamgrid <- 2^seq(-15, 3, by = 2)

# Tune on the training rows only. The original tuned on the full data set, so
# the hold-out rows had already been used to choose cost/gamma before being
# used to measure the error (data leakage).
svm_tuned <- tune.svm(
  factor(Outcome) ~ .,
  data = dataL,
  scale = TRUE,
  cost = Cgrid,
  gamma = Gamgrid,
  kernel = "radial"
)
svm_tuned$best.performance  # original run (tuned on all rows): 0.2277341
result <- svm_tuned$best.model
result$coefs
result$cost
result$gamma

# Refit on the training set with the selected parameters and score the
# hold-out rows.
model_train_svm_best <- svm(
  factor(Outcome) ~ .,
  data = dataL,
  scale = TRUE,
  kernel = "radial",
  cost = result$cost,
  gamma = result$gamma
)
resPred_svm_best <- predict(model_train_svm_best, newdata = dataT)
mean(resPred_svm_best != dataT$Outcome)

# ---- 10-fold cross-validated error of the tuned SVM -------------------------
# Folds are contiguous row ranges; cvgrid[M] is the last row of fold M. For
# 768 rows this yields the same boundaries as the original
# floor(seq(77, 768, length.out = 10)).
# NOTE(review): cost/gamma were selected using rows that also appear in these
# folds, so this estimate is still somewhat optimistic; nested CV would be
# needed for an unbiased figure.
n_folds <- 10
cvgrid <- floor(seq(ceiling(n_obs / n_folds), n_obs, length.out = n_folds))
fold_starts <- c(1, head(cvgrid, -1) + 1)
pred.svm_best <- rep(NA_real_, n_obs)

for (M in seq_len(n_folds)) {
  # Fold-specific names: the original reused idx/dataL/dataT here and thereby
  # destroyed the hold-out split defined above.
  fold_idx <- fold_starts[M]:cvgrid[M]
  fold_train <- diabetes[-fold_idx, ]
  fold_test <- diabetes[fold_idx, ]

  fold_model <- svm(
    factor(Outcome) ~ .,
    data = fold_train,
    scale = TRUE,
    kernel = "radial",
    cost = result$cost,
    gamma = result$gamma
  )
  fold_pred <- predict(fold_model, newdata = fold_test)

  # Decode the predicted factor through its labels ("0"/"1") rather than its
  # internal integer codes, which depend on the level order.
  pred.svm_best[fold_idx] <- as.numeric(as.character(fold_pred))
}

mean(pred.svm_best != diabetes$Outcome)  # original run: 0.2395833