# End-to-end classification workflow on the iris data with caret:
#   1. load the data (local CSV if present, otherwise the built-in dataset)
#   2. hold out 20% of the rows as a validation set
#   3. summarise and visualise the remaining 80%
#   4. compare five classifiers with 10-fold cross validation
#   5. score the LDA model on the held-out validation set
#
# One-time setup (left commented out on purpose):
# install.packages("caret", dependencies = c("Depends", "Suggests"))
library(caret)

# Load data ----

# Column names applied to the header-less CSV; they match the built-in iris.
iris_columns <- c(
  "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species"
)

# define the filename
filename <- "iris.csv"

if (file.exists(filename)) {
  # load the CSV file from the local directory
  dataset <- read.csv(filename, header = FALSE)
  # set the column names in the dataset
  colnames(dataset) <- iris_columns
} else {
  # No local copy: fall back to the dataset that ships with R.
  message("'", filename, "' not found; using the built-in iris dataset")
  data(iris)
  dataset <- iris
}

# read.csv() returns character columns on R >= 4.0, but levels(), plot(),
# featurePlot() and confusionMatrix() below all need the class label to be
# a factor, so coerce it explicitly regardless of where the data came from.
dataset$Species <- factor(dataset$Species)

# Train / validation split ----

# Seed the split so the hold-out set (and every result below) is reproducible.
set.seed(7)
# create a list of 80% of the rows in the original dataset we can use for
# training
validation_index <- createDataPartition(dataset$Species, p = 0.80, list = FALSE)
# select 20% of the data for validation
validation <- dataset[-validation_index, ]
# use the remaining 80% of the data for training and testing the models
dataset <- dataset[validation_index, ]

# Summarise the training data ----

# dimensions of dataset
dim(dataset)
# list types for each attribute
sapply(dataset, class)
# list the levels for the class
levels(dataset$Species)
# summarize the class distribution
percentage <- prop.table(table(dataset$Species)) * 100
cbind(freq = table(dataset$Species), percentage = percentage)
# statistical summary of every attribute
summary(dataset)

# Visualise the training data ----

# split input and output
x <- dataset[, 1:4]
y <- dataset[, 5]

# boxplot for each attribute on one image; remember the previous graphics
# settings so the 1 x 4 layout does not leak into later base plots
old_par <- par(mfrow = c(1, 4))
for (i in seq_along(x)) {
  boxplot(x[, i], main = names(x)[i])
}
par(old_par)

# barplot for class breakdown
plot(y)
# scatterplot matrix
featurePlot(x = x, y = y, plot = "ellipse")
# box and whisker plots for each attribute
featurePlot(x = x, y = y, plot = "box")
# density plots for each attribute by class value
scales <- list(x = list(relation = "free"), y = list(relation = "free"))
featurePlot(x = x, y = y, plot = "density", scales = scales)

# Evaluate algorithms ----

# Run algorithms using 10-fold cross validation
control <- trainControl(method = "cv", number = 10)
metric <- "Accuracy"

# Let's evaluate 5 different algorithms:
#   1) Linear Discriminant Analysis (LDA)
#   2) Classification and Regression Trees (CART)
#   3) k-Nearest Neighbors (kNN)
#   4) Support Vector Machines (SVM) with a radial kernel
#   5) Random Forest (RF)
# The seed is reset before every fit so that all models are evaluated on
# exactly the same cross-validation folds and are directly comparable.

### Linear algorithms ###
set.seed(7)
fit.lda <- train(
  Species ~ ., data = dataset, method = "lda",
  metric = metric, trControl = control
)

### Nonlinear algorithms ###
# CART
set.seed(7)
fit.cart <- train(
  Species ~ ., data = dataset, method = "rpart",
  metric = metric, trControl = control
)
# kNN
set.seed(7)
fit.knn <- train(
  Species ~ ., data = dataset, method = "knn",
  metric = metric, trControl = control
)

### Advanced algorithms ###
# SVM
set.seed(7)
fit.svm <- train(
  Species ~ ., data = dataset, method = "svmRadial",
  metric = metric, trControl = control
)
# Random Forest
set.seed(7)
fit.rf <- train(
  Species ~ ., data = dataset, method = "rf",
  metric = metric, trControl = control
)

# summarize accuracy of models
results <- resamples(
  list(lda = fit.lda, cart = fit.cart, knn = fit.knn, svm = fit.svm, rf = fit.rf)
)
summary(results)
# compare accuracy of models
dotplot(results)
# summarize Best Model
print(fit.lda)

# Prediction ----

# estimate skill of LDA on the validation dataset
predictions <- predict(fit.lda, validation)
confusionMatrix(predictions, validation$Species)