# ============================================================================
# Molecular-descriptor pre-processing, PCA-based feature selection, and
# RF / LDA classification models.
# Expects a data frame `mordred` in the workspace whose columns 5:767 are
# numeric descriptors and which carries a class-label column `Type`.
# ============================================================================
library(caret)
library(FactoMineR)
library(factoextra)
library(corrplot)
library(pheatmap)
library(eoffice)

Dataframe_x <- mordred[, 5:767]
# BUG FIX: the original called fix() twice here.  fix() opens an interactive
# editor (blocking non-interactive runs) and returns invisibly, so
# `Dataframe_x1 = fix(Dataframe_x)` did not capture the edited table.
# Inspect with View(Dataframe_x) instead if needed.
Dataframe_x1 <- Dataframe_x
write.csv(Dataframe_x1, file = "MoE.csv")

# Correlation heatmap before any screening.
Result <- cor(Dataframe_x1)
pheatmap(Result, main = "Pre-screening heatmap",
         cluster_rows = FALSE, cluster_cols = FALSE,
         show_rownames = FALSE, show_colnames = FALSE)
# topptx(filename = "mordred_pheatmap.pptx", width = 10, height = 5)

# Zero- and near-zero-variance predictors.
nzv <- nearZeroVar(Dataframe_x1, saveMetrics = FALSE,
                   freqCut = 20, uniqueCut = 10)
Dataframe_x <- Dataframe_x1[, -nzv]

# Identify and drop highly correlated predictors (pairwise |r| >= 0.94).
highlyCorDescr <- findCorrelation(cor(Dataframe_x), cutoff = 0.94)
Dataframe_x <- Dataframe_x[, -highlyCorDescr]

# Correlation heatmap after screening.
Result <- cor(Dataframe_x)
pheatmap(Result, main = "heatmap after screening",
         cluster_rows = FALSE, cluster_cols = FALSE,
         show_rownames = FALSE, show_colnames = FALSE)

# ---- PCA analysis ----------------------------------------------------------
PCA.db <- Dataframe_x
res.pca <- PCA(PCA.db, scale.unit = TRUE, graph = TRUE)
# topptx(filename = "before PCA.pptx", width = 4.6, height = 4.6)
print(res.pca)
# NOTE(review): axis percentages below are hard-coded from one particular
# run; confirm them against res.pca$eig before publishing figures.
fviz_pca_ind(res.pca, geom.ind = "point", col.ind = mordred$Type,
             addEllipses = FALSE, title = "Before PCA",
             xlab = "PC1(22.54%)", ylab = "PC2(11.32%)",
             palette = c("black", "blue"))
topptx(filename = "before PCA .pptx", width = 6, height = 4.6)

# Keep descriptors whose total contribution over the first five components
# exceeds 0.75 (as reported by fviz_contrib).
res.contrib <- fviz_contrib(res.pca, choice = "var", axes = 1:5, top = 100)
contrib_db <- res.contrib$data
selectbypca <- subset(contrib_db, contrib > 0.75)
Dataframe_x <- Dataframe_x[, rownames(selectbypca)]

res.pca <- PCA(Dataframe_x, scale.unit = TRUE, ncp = 5, graph = TRUE)
topptx(filename = "after PCA.pptx", width = 6, height = 4.6)
fviz_pca_var(res.pca, col.var = "contrib",
             gradient.cols = c("darkgreen", "#CD0000")) +
  theme_test() +
  theme(axis.title = element_text(size = 15)) +
  theme(axis.text.x = element_text(size = 15, colour = "black")) +
  theme(axis.text.y = element_text(size = 15, colour = "black")) +
  xlab("PC1(59.96%)") +
  ylab("PC2(15.08%)")
topptx(filename = "after PCA.pptx", width = 6, height = 4.6)
## Results for the Principal Component Analysis (PCA)
print(res.pca)
fviz_pca_ind(res.pca, geom.ind = "point", col.ind = mordred$Type,
             addEllipses = FALSE, title = "Before PCA",
             xlab = "PC1(59.96%)", ylab = "PC2(15.08%)",
             palette = c("black", "blue"))
topptx(filename = "after PCA .pptx", width = 6, height = 4.6)

# ---- Scale the selected descriptors and attach the class label -------------
Dataframe_x <- scale(Dataframe_x)
Dataframe_x <- data.frame(Dataframe_x)
Dataframe_all <- data.frame(Dataframe_x, type = mordred$Type)
Dataframe_all$type <- as.factor(Dataframe_all$type)
str(Dataframe_all)
# write.csv(Dataframe_all, file = "Dataframe_all.csv", row.names = FALSE)

# ---- Train / test split (75 / 25, stratified on the class label) -----------
# NOTE(review): no seed is set here, so the split is not reproducible across
# runs; add set.seed(...) before the partition if reproducibility matters.
test_idx <- createDataPartition(Dataframe_all$type, p = 0.25)[[1]]
testset <- Dataframe_all[test_idx, ]
trainingset <- Dataframe_all[-test_idx, ]

# ---- RF model: grid search over (ntree, mtry), K folds x R repeats ---------
library(cvTools)
library(foreach)
library(randomForest)
K <- 5
R <- 3
cv <- cvFolds(NROW(trainingset), K = K, R = R, type = "random")
grid <- expand.grid(ntree = c(300, 500, 700), mtry = c(15, 20, 25))
result <- foreach(g = 1:NROW(grid), .combine = rbind) %do% {
  foreach(r = 1:R, .combine = rbind) %do% {
    foreach(k = 1:K, .combine = rbind) %do% {
      validation_idx <- cv$subsets[which(cv$which == k), r]
      train <- trainingset[-validation_idx, ]
      validation <- trainingset[validation_idx, ]
      m <- randomForest(type ~ ., data = train,
                        ntree = grid[g, "ntree"], mtry = grid[g, "mtry"])
      predicted <- predict(m, newdata = validation)
      precision <- sum(predicted == validation$type) / NROW(predicted)
      data.frame(g = g, precision = precision)
    }
  }
}
library(plyr)
ddply(result, .(g), summarize, mean_precision = mean(precision))
grid[, ]

# ---- RF model: 10-fold cross-validation at the chosen hyper-parameters -----
create_ten_fold_cv <- function() {
  set.seed(0211)
  lapply(createFolds(trainingset$type, k = 10), function(idx) {
    list(train = trainingset[-idx, ], validation = trainingset[idx, ])
  })
}
folds <- create_ten_fold_cv()
vm_result <- foreach(f = folds) %do% {
  # BUG FIX: the original passed `mtree = 700`, a misspelling of `ntree`
  # silently swallowed by randomForest's `...`, so the default of 500 trees
  # was used instead of the tuned value.
  model_RF <- randomForest(type ~ ., data = f$train, ntree = 700, mtry = 15)
  predicted <- predict(model_RF, newdata = f$validation, type = "class")
  list(actual = f$validation$type, predicted = predicted)
}

# Mean +/- SD accuracy over a list of (actual, predicted) fold results.
evaluation <- function(lst) {
  accuracy <- sapply(lst, function(one_result) {
    sum(one_result$predicted == one_result$actual) / NROW(one_result$actual)
  })
  print(sprintf("MEAN+/-SD:%.3f+/-%.3f", mean(accuracy), sd(accuracy)))
  accuracy
}
(RF_accuracy <- evaluation(vm_result))
plot(density(RF_accuracy), main = "RF")

# ---- RF model: held-out test set -------------------------------------------
# NOTE(review): ntree = 300 here differs from the 700 used in CV above;
# confirm which setting the reported results correspond to.
RF.model <- randomForest(type ~ ., data = trainingset, ntree = 300, mtry = 15)
RF.pred <- predict(RF.model, testset, type = "response")
predicted_test <- as.factor(RF.pred)
actual <- as.factor(testset$type)
xtabs(~ predicted_test + actual)
type <- data.frame(RF.pred)$RF.pred
# BUG FIX: confusionMatrix(data, reference) takes predictions first, truth
# second; the original had the arguments reversed, swapping the
# sensitivity/specificity read-out.
confusionMatrix(type, as.factor(testset$type))
data5 <- table(predicted_test, actual)
pheatmap(data5, color = colorRampPalette(c("white", "#E64B35B2"))(10),
         cluster_rows = FALSE, cluster_cols = FALSE, legend = TRUE,
         display_numbers = data5, fontsize_number = 50,
         fontsize_row = 40, fontsize_col = 40)
topptx(filename = "rf-confusionMatrix.pptx", width = 6, height = 4.6)

# RF ROC on the test set.
RF.pred.prob <- predict(RF.model, testset, type = c("prob"))
library(ROCR)
pred <- prediction(RF.pred.prob[, 2], testset$type)
plot(performance(pred, "tpr", "fpr"))
auc <- performance(pred, "auc")

# ---- LDA model --------------------------------------------------------------
library(MASS)
LDA.model <- lda(type ~ ., trainingset)
LDA.pred. <- predict(LDA.model, testset)
LDA.pred.prob <- LDA.pred.$posterior
predicted_test <- LDA.pred.$class
actual_lda <- as.factor(testset$type)
xtabs(~ predicted_test + actual_lda)
confusionMatrix(predicted_test, actual_lda)
data4 <- table(predicted_test, actual_lda)
pheatmap(data4, color = colorRampPalette(c("white", "#E64B35B2"))(10),
         cluster_rows = FALSE, cluster_cols = FALSE, legend = TRUE,
         display_numbers = data4, fontsize_number = 50,
         fontsize_row = 40, fontsize_col = 40)
topptx(filename = "lda-confusionMatrix.pptx", width = 6, height = 4.6)
pred <- prediction(LDA.pred.prob[, 2], testset$type)
# ---- LDA ROC curve (uses the prediction object built just above) -----------
plot(performance(pred, "tpr", "fpr"))
auc <- performance(pred, "auc")
auc

# ---- SVM model: grid search over (sigma, C) --------------------------------
library(cvTools)
library(foreach)
library(kernlab)
set.seed(0211)
K <- 5
R <- 3
# NOTE(review): the original called fix(trainingset) here — an interactive
# editor that blocks scripted runs; use View(trainingset) to inspect.
cv <- cvFolds(NROW(trainingset), K = K, R = R, type = "random")
grid <- expand.grid(gamma = c(0.1, 0.01, 0.001), C = c(5, 10, 15, 20, 25))
result <- foreach(g = 1:NROW(grid), .combine = rbind) %do% {
  foreach(r = 1:R, .combine = rbind) %do% {
    foreach(k = 1:K, .combine = rbind) %do% {
      validation_idx <- cv$subsets[which(cv$which == k), r]
      train <- trainingset[-validation_idx, ]
      validation <- trainingset[validation_idx, ]
      # BUG FIX: ksvm() has no `gamma` argument — the RBF kernel width is
      # passed as kpar = list(sigma = ...).  The original's `gamma =` was
      # silently ignored, so every grid row used the same automatically
      # estimated sigma and the gamma search was a no-op.
      m <- ksvm(type ~ ., data = train,
                kernel = "rbfdot",  # Radial Basis kernel "Gaussian"
                kpar = list(sigma = grid[g, "gamma"]),
                C = grid[g, "C"])
      predicted <- predict(m, newdata = validation)
      Accuracy <- sum(predicted == validation$type) / NROW(predicted)
      data.frame(g = g, Accuracy = Accuracy)
    }
  }
}
library(plyr)
ddply(result, .(g), summarize, mean_Accuracy = mean(Accuracy))
grid[, ]

# ---- SVM model: 10-fold cross-validation -----------------------------------
library(caret)
create_ten_fold_cv <- function() {
  set.seed(211)
  lapply(createFolds(trainingset$type, k = 10), function(idx) {
    list(train = trainingset[-idx, ], validation = trainingset[idx, ])
  })
}
folds <- create_ten_fold_cv()
library(foreach)
svm_result <- foreach(f = folds) %do% {
  model_svm <- ksvm(type ~ ., data = f$train, kernel = "rbfdot",
                    kpar = list(sigma = 0.001), C = 10)
  predicted <- predict(model_svm, newdata = f$validation)
  list(actual = f$validation$type, predicted = predicted)
}

# Mean +/- SD accuracy over a list of (actual, predicted) fold results.
evaluation <- function(lst) {
  accuracy <- sapply(lst, function(one_result) {
    sum(one_result$predicted == one_result$actual) / NROW(one_result$actual)
  })
  print(sprintf("MEAN+/-SD:%.3f+/-%.3f", mean(accuracy), sd(accuracy)))
  accuracy
}
(svm_accuracy <- evaluation(svm_result))
plot(density(svm_accuracy), main = "svm")

# ---- SVM model: held-out test set ------------------------------------------
svm.model <- ksvm(type ~ ., data = trainingset, kernel = "rbfdot",
                  kpar = list(sigma = 0.001), C = 10, prob.model = TRUE)
svm.pred <- predict(svm.model, testset)
# confusionMatrix
# ---- SVM test-set evaluation ------------------------------------------------
predicted_test <- as.factor(svm.pred)
actual <- as.factor(testset$type)
confusionMatrix(predicted_test, actual)
data2 <- table(predicted_test, actual)
library(pheatmap)
pheatmap(data2, color = colorRampPalette(c("white", "#E64B35B2"))(10),
         cluster_rows = FALSE, cluster_cols = FALSE, legend = TRUE,
         display_numbers = data2, fontsize_number = 50,
         fontsize_row = 40, fontsize_col = 40)
topptx(filename = "svm-confusionMatrix.pptx", width = 6, height = 4.6)

# SVM ROC on the test set.
svm.pred.prob <- predict(svm.model, testset, type = c("probabilities"))
library(ROCR)
pred <- prediction(svm.pred.prob[, 2], testset$type)
plot(performance(pred, "tpr", "fpr"))
auc <- performance(pred, "auc")
auc

# ---- KNN model --------------------------------------------------------------
library(caret)
library(FactoMineR)
library(factoextra)
library(kohonen)
library(ggplot2)
library(kernlab)
library(class)
library(foreach)
library(ROCR)

create_ten_fold_cv <- function() {
  set.seed(0211)
  lapply(createFolds(trainingset$type, k = 10), function(idx) {
    list(train = trainingset[-idx, ], validation = trainingset[idx, ])
  })
}
folds <- create_ten_fold_cv()

# ---- KNN 10-fold cross-validation (k = 1) ----------------------------------
vm_result <- foreach(f = folds) %do% {
  # Select feature columns by name (everything except the label).
  feature_cols <- !colnames(f$train) %in% c("type")
  predicted <- knn(train = f$train[, feature_cols],
                   test = f$validation[, feature_cols],
                   cl = f$train[, c("type")], k = 1)
  list(actual = f$validation$type, predicted = predicted)
}

# Mean +/- SD accuracy over a list of (actual, predicted) fold results.
evaluation <- function(lst) {
  accuracy <- sapply(lst, function(one_result) {
    sum(one_result$predicted == one_result$actual) / NROW(one_result$actual)
  })
  print(sprintf("MEAN+/-SD:%.3f+/-%.3f", mean(accuracy), sd(accuracy)))
  accuracy
}
(KNN_accuracy <- evaluation(vm_result))
plot(density(KNN_accuracy), main = "KNN")

# ---- KNN on the held-out test set ------------------------------------------
# ROBUSTNESS FIX: the original dropped the label column by hard-coded
# position ([, -35]), which breaks whenever the number of retained
# descriptors changes; locate the `type` column by name instead.
type_col <- which(colnames(trainingset) == "type")
# knn3Train (k = 100) is used only to obtain class probabilities for the ROC.
# NOTE(review): k = 100 here vs k = 7 for the class predictions below —
# confirm this mismatch is intentional before reporting results.
knn.model <- knn3Train(trainingset[, -type_col], testset[, -type_col],
                       trainingset[, type_col], k = 100, prob = TRUE)
knn.pred <- knn(train = trainingset[, -type_col],
                test = testset[, -type_col],
                cl = trainingset$type, k = 7)
actual_knn <- as.factor(testset$type)
type <- data.frame(knn.pred)$knn.pred
predicted_test <- knn.pred
# ---- KNN test-set evaluation ------------------------------------------------
# BUG FIX: confusionMatrix(data, reference) — predictions first, truth second;
# the original reversed them.
confusionMatrix(type, as.factor(testset$type))
xtabs(~ predicted_test + actual_knn)
data3 <- table(predicted_test, actual_knn)
library(pheatmap)
pheatmap(data3, color = colorRampPalette(c("white", "#E64B35B2"))(10),
         cluster_rows = FALSE, cluster_cols = FALSE, legend = TRUE,
         display_numbers = data3, fontsize_number = 50,
         fontsize_row = 40, fontsize_col = 40)
topptx(filename = "knn-confusionMatrix.pptx", width = 6, height = 4.6)

# KNN ROC on the test set (probabilities come from the knn3Train model).
knn.pred.prob <- attr(knn.model, "prob")
library(ROCR)
pred <- prediction(knn.pred.prob[, 2], testset$type)
plot(performance(pred, "tpr", "fpr"))
auc <- performance(pred, "auc")
auc

# ---- RF stability: repeat the split / fit / evaluate cycle 1000 times ------
# BUG FIX: the original initialised `rf` but rbind-ed into the undefined
# `rf_1`, which errors on the first iteration.  The accuracies are now
# preallocated instead of grown with rbind inside the loop, and the
# per-iteration pheatmap (1000 redundant redraws) was removed.
n_runs <- 1000
rf_acc <- numeric(n_runs)
for (i in seq_len(n_runs)) {
  set.seed(i)
  test_idx <- createDataPartition(Dataframe_all$type, p = 0.25)[[1]]
  testset <- Dataframe_all[test_idx, ]
  trainingset <- Dataframe_all[-test_idx, ]
  # NOTE(review): this overwrites the same file every iteration, so only the
  # last split survives (same end state as the original).
  write.csv(testset, file = "testset.csv")
  RF.model <- randomForest(type ~ ., data = trainingset,
                           ntree = 300, mtry = 20)
  RF.pred <- predict(RF.model, testset, type = "response")
  aa_rf <- confusionMatrix(RF.pred, as.factor(testset$type))
  rf_acc[i] <- aa_rf$overall[["Accuracy"]]
  # save(RF.model, file = paste("./model/", "_", i, "_",
  #      round(rf_acc[i], 4), "_", "RF_model.RData"))
  print(i)
}
rf_1 <- data.frame(accuracy = rf_acc)
write.csv(rf_1, file = "chemopy rf_1.csv", row.names = FALSE)

# ---- Assemble per-model test-set probabilities for ROC comparison ----------
# NOTE(review): `testset` was reassigned by the loop above, so its rows may
# no longer correspond to the probability vectors computed on the original
# split — verify lengths/ordering before trusting these ROC curves.
library(pROC)
library(ggplot2)
svm.pred.prob <- as.data.frame(svm.pred.prob)
RF.pred.prob <- as.data.frame(RF.pred.prob)
knn.pred.prob <- as.data.frame(knn.pred.prob)
LDA.pred.prob <- as.data.frame(LDA.pred.prob)
ROC_db <- data.frame(SVM = svm.pred.prob[, 2],
                     kNN = knn.pred.prob[, 2],
                     RF = RF.pred.prob[, 2],
                     LDA = LDA.pred.prob[, 2],
                     type = testset$type,
                     stringsAsFactors = FALSE)
# ---- ROC comparison across the four models ---------------------------------
# NOTE(review): the original called fix(ROC_db) here (interactive editor,
# blocks scripted runs); use View(ROC_db) to inspect.
data.frame(ROC_db)
rocobj1 <- roc(ROC_db$type, ROC_db$SVM)
rocobj2 <- roc(ROC_db$type, ROC_db$kNN)
rocobj3 <- roc(ROC_db$type, ROC_db$RF)
rocobj4 <- roc(ROC_db$type, ROC_db$LDA)
auc(rocobj1)
auc(rocobj2)
auc(rocobj3)
auc(rocobj4)

# BUG FIX: the original hard-coded AUC strings in print.auc.pattern (values
# from one historical run) and attached several labels to the wrong roc
# object (e.g. rocobj1 — the SVM curve — was labelled "KNN", rocobj3 — RF —
# was labelled "KNN" in the second overlay).  Labels are now computed from
# each object itself, and the duplicated overlay section was consolidated.
auc_label <- function(obj, name = "") {
  if (nzchar(name)) {
    sprintf("%s:AUC=%.4f", name, as.numeric(auc(obj)))
  } else {
    sprintf("auc=%.4f", as.numeric(auc(obj)))
  }
}

# Individual smoothed ROC curves, one per model.
plot(smooth(rocobj1), print.auc = TRUE,
     print.auc.x = 0.4, print.auc.y = 0.4,
     auc.polygon = TRUE, grid = c(0.1, 0.3), grid.col = c("green", "red"),
     max.auc.polygon = TRUE,
     # print.thres = TRUE,
     auc.polygon.col = "white", legacy.axes = TRUE,
     print.auc.pattern = auc_label(rocobj1), identity = FALSE)
plot(smooth(rocobj2), main = "kNN", print.auc = TRUE,
     print.auc.x = 0.4, print.auc.y = 0.4,
     auc.polygon = TRUE, grid = c(0.1, 0.3), grid.col = c("green", "red"),
     max.auc.polygon = TRUE,
     auc.polygon.col = "white", legacy.axes = TRUE,
     print.auc.pattern = auc_label(rocobj2), identity = FALSE)
plot(smooth(rocobj3), main = "RF", print.auc = TRUE,
     print.auc.x = 0.4, print.auc.y = 0.4,
     auc.polygon = TRUE, grid = c(0.1, 0.3), grid.col = c("green", "red"),
     max.auc.polygon = TRUE,
     auc.polygon.col = "white", legacy.axes = TRUE,
     print.auc.pattern = auc_label(rocobj3), identity = FALSE)
plot(smooth(rocobj4), main = "LDA", print.auc = TRUE,
     print.auc.x = 0.4, print.auc.y = 0.4,
     auc.polygon = TRUE, grid = c(0.1, 0.3), grid.col = c("green", "red"),
     max.auc.polygon = TRUE,
     auc.polygon.col = "white", legacy.axes = TRUE,
     print.auc.pattern = auc_label(rocobj4), identity = FALSE)

# Overlay: LDA as the base curve, then kNN, SVM and RF added on top,
# each labelled with its own model name and computed AUC.
plot(smooth(rocobj4), print.auc = TRUE,
     print.auc.x = -0.2, print.auc.y = 0.3,
     auc.polygon = TRUE, grid = c(0.1, 0.3), grid.col = c("green", "red"),
     max.auc.polygon = TRUE,
     auc.polygon.col = "white", legacy.axes = TRUE,
     print.auc.pattern = auc_label(rocobj4, "LDA"), identity = FALSE)
plot.roc(smooth(rocobj2), add = TRUE, col = "red", print.auc = TRUE,
         print.auc.x = -0.2, print.auc.y = 0.4,
         print.auc.pattern = auc_label(rocobj2, "kNN"), identity = FALSE)
plot.roc(smooth(rocobj1), add = TRUE, col = "purple", print.auc = TRUE,
         print.auc.x = -0.2, print.auc.y = 0.5,
         print.auc.pattern = auc_label(rocobj1, "SVM"), identity = FALSE)
plot.roc(smooth(rocobj3), add = TRUE, col = "blue", print.auc = TRUE,
         print.auc.x = -0.2, print.auc.y = 0.6,
         print.auc.pattern = auc_label(rocobj3, "RF"), identity = FALSE)