# Install and load the required packages
# install.packages("rpart")
# install.packages("rpart.plot")
library(rpart)
library(rpart.plot)

# Load the iris dataset
data(iris)

# Split the data into training and test sets
set.seed(123)
train_indices <- sample(1:nrow(iris), 0.7 * nrow(iris))
train_data <- iris[train_indices, ]
test_data <- iris[-train_indices, ]

# Generate the decision tree
decision_tree <- rpart(Species ~ ., data = train_data, method = "class")

# Show the importance of the variables
importance <- as.data.frame(decision_tree$variable.importance)
importance$Variable <- rownames(importance)
importance <- importance[order(-importance[, 1]), ]
colnames(importance)[1] <- "Importance"
print("Variable Importance:")
print(importance)
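# Optional sketch (not in the original script): the importance table built
# above can also be shown as a horizontal barplot with base R graphics;
# rev() puts the most important variable at the top.
barplot(rev(importance$Importance),
        names.arg = rev(importance$Variable),
        horiz = TRUE, las = 1,
        main = "Variable Importance (rpart)",
        xlab = "Importance")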
# Visualize the tree
rpart.plot(decision_tree, main = "Decision Tree for Iris Dataset")

# Optional: Predict on test data and calculate accuracy
predictions <- predict(decision_tree, newdata = test_data, type = "class")
confusion_matrix <- table(Predicted = predictions, Actual = test_data$Species)
print("Confusion Matrix:")
print(confusion_matrix)

accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
print(paste("Accuracy:", round(accuracy * 100, 2), "%"))
### added more trees and pruning
# Install and load the required packages
# install.packages("rpart")
# install.packages("rpart.plot")
library(rpart)
library(rpart.plot)

# Load the iris dataset
data(iris)

# Split the data into training and test sets
set.seed(123)
train_indices <- sample(1:nrow(iris), 0.7 * nrow(iris))
train_data <- iris[train_indices, ]
test_data <- iris[-train_indices, ]

# Generate a deeper decision tree
decision_tree <- rpart(Species ~ ., data = train_data, method = "class",
                       control = rpart.control(minsplit = 2, cp = 0))

# Print the CP table to inspect the complexity parameter values
printcp(decision_tree)
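# Optional sketch (not in the original script): plotcp() from rpart draws
# the cross-validated relative error against cp, which makes the pruning
# choice below easier to sanity-check visually.
plotcp(decision_tree)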
# Prune the tree using the optimal cp value
optimal_cp <- decision_tree$cptable[which.min(decision_tree$cptable[, "xerror"]), "CP"]
pruned_tree <- prune(decision_tree, cp = optimal_cp)
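# Optional sketch (not in the original script): a common alternative to
# picking the minimum-xerror cp is the one-standard-error rule, which keeps
# the simplest tree whose xerror is within one SE of the minimum.
cp_table  <- decision_tree$cptable
min_row   <- which.min(cp_table[, "xerror"])
threshold <- cp_table[min_row, "xerror"] + cp_table[min_row, "xstd"]
cp_1se    <- cp_table[which(cp_table[, "xerror"] <= threshold)[1], "CP"]
pruned_tree_1se <- prune(decision_tree, cp = cp_1se)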
# Show the importance of the variables
importance <- as.data.frame(pruned_tree$variable.importance)
importance$Variable <- rownames(importance)
importance <- importance[order(-importance[, 1]), ]
colnames(importance)[1] <- "Importance"
print("Variable Importance:")
print(importance)

# Visualize the pruned tree
rpart.plot(pruned_tree, main = "Pruned Decision Tree for Iris Dataset")

# Optional: Predict on test data and calculate accuracy
predictions <- predict(pruned_tree, newdata = test_data, type = "class")
confusion_matrix <- table(Predicted = predictions, Actual = test_data$Species)
print("Confusion Matrix:")
print(confusion_matrix)

accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
print(paste("Accuracy:", round(accuracy * 100, 2), "%"))
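# Optional sketch (not in the original script): comparing node counts
# (one row per node in the rpart frame) shows how much pruning actually
# shrank the deliberately deep tree grown above.
print(paste("Nodes before pruning:", nrow(decision_tree$frame)))
print(paste("Nodes after pruning:", nrow(pruned_tree$frame)))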
### pruning with feature reduction
# Install and load the required packages
# install.packages("rpart")
# install.packages("rpart.plot")
library(rpart)
library(rpart.plot)

# Load the iris dataset
data(iris)

# Split the data into training and test sets
set.seed(123)
train_indices <- sample(1:nrow(iris), 0.7 * nrow(iris))
train_data <- iris[train_indices, ]
test_data <- iris[-train_indices, ]

# Generate a decision tree (default rpart controls this time)
decision_tree <- rpart(Species ~ ., data = train_data, method = "class")

# Print the CP table to inspect the complexity parameter values
printcp(decision_tree)
# Prune the tree using the optimal cp value
optimal_cp <- decision_tree$cptable[which.min(decision_tree$cptable[, "xerror"]), "CP"]
pruned_tree <- prune(decision_tree, cp = optimal_cp)

# Show the importance of the variables
importance <- as.data.frame(pruned_tree$variable.importance)
importance$Variable <- rownames(importance)
importance <- importance[order(-importance[, 1]), ]
colnames(importance)[1] <- "Importance"
print("Variable Importance:")
print(importance)
# Identify the least important feature
least_important_feature <- rownames(importance)[which.min(importance$Importance)]
print(paste("Excluding the least important feature:", least_important_feature))

# Exclude the least important feature from the dataset
train_data_reduced <- train_data[, !(names(train_data) %in% least_important_feature)]
test_data_reduced <- test_data[, !(names(test_data) %in% least_important_feature)]

# Generate the decision tree again with the reduced dataset
decision_tree_reduced <- rpart(Species ~ ., data = train_data_reduced, method = "class",
                               control = rpart.control(minsplit = 2, cp = 0))

# Print the CP table to inspect the complexity parameter values for the reduced tree
printcp(decision_tree_reduced)
# Prune the reduced tree using the optimal cp value
optimal_cp_reduced <- decision_tree_reduced$cptable[which.min(decision_tree_reduced$cptable[, "xerror"]), "CP"]
pruned_tree_reduced <- prune(decision_tree_reduced, cp = optimal_cp_reduced)

# Visualize the pruned tree for the reduced dataset
rpart.plot(pruned_tree_reduced, main = "Pruned Decision Tree for Iris Dataset (Reduced Features)")

# Predict on test data with the reduced model and calculate accuracy
predictions_reduced <- predict(pruned_tree_reduced, newdata = test_data_reduced, type = "class")
confusion_matrix_reduced <- table(Predicted = predictions_reduced, Actual = test_data$Species)
print("Confusion Matrix (Reduced Features):")
print(confusion_matrix_reduced)

accuracy_reduced <- sum(diag(confusion_matrix_reduced)) / sum(confusion_matrix_reduced)
print(paste("Accuracy (Reduced Features):", round(accuracy_reduced * 100, 2), "%"))
### summary plots, PCA, t-SNE, and SVM
# Plotting the summaries for an overview
library(reshape2)
library(ggplot2)
iris.df <- melt(iris, id.vars = c('Species'))
head(iris.df)

ggplot(iris.df, aes(x = Species, y = value, fill = variable)) +
  geom_boxplot() +
  xlab('Species') + ylab('Values') + theme_bw() +
  theme(text = element_text(size = 16),
        axis.text.x = element_text(angle = 0, hjust = 0.5),
        plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5)) +
  ggtitle("Measurements by group") +
  scale_fill_discrete(name = "Characteristics",
                      labels = c("Sepal Length", "Sepal Width", "Petal Length",
                                 "Petal Width"))
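# Optional sketch (not in the original script): the same comparison as a
# faceted plot, one panel per measurement, which avoids the shared y scale.
ggplot(iris.df, aes(x = Species, y = value, fill = Species)) +
  geom_boxplot() +
  facet_wrap(~ variable, scales = "free_y") +
  theme_bw()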
## Multivariate analysis
## PCA
# install.packages("factoextra")
library(factoextra)
iris.data <- iris[, 1:4]
iris.name <- iris[, 5]
system.time(p <- prcomp(iris.data, scale = TRUE)) # user: 0.001 system: 0.001 elapsed: 0.005
names(p)
summary(p)
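# Optional sketch (not in the original script): fviz_eig() from factoextra
# draws a scree plot of the variance explained by each principal component.
fviz_eig(p, addlabels = TRUE)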
# Plot PCA with labels
fviz_pca_ind(p,
             geom.ind = "point",       # Show points only (no text)
             col.ind = iris.name,      # Color by species
             addEllipses = FALSE,      # No concentration ellipses
             legend.title = "Species") +
  theme_minimal() +
  labs(title = "PCA of Iris Dataset",
       x = "PC1",
       y = "PC2") +
  theme(plot.title = element_text(hjust = 0.5)) # Center the title
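# Optional sketch (not in the original script): a biplot overlays the
# variable loadings on the same projection, via factoextra's fviz_pca_biplot.
fviz_pca_biplot(p, geom.ind = "point", col.ind = iris.name,
                legend.title = "Species")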
# t-SNE
set.seed(1)
library(Rtsne)
iris_d_tsne <- as.matrix(iris[, 1:4])
system.time(iris.rtsne <- Rtsne(iris_d_tsne, check_duplicates = FALSE, verbose = FALSE)) # user: 0.321 system: 0.000 elapsed: 0.322

# Create a data frame for the t-SNE results
tsne_df <- data.frame(iris.rtsne$Y, Species = iris$Species)
colnames(tsne_df) <- c("tSNE1", "tSNE2", "Species")

# Plot t-SNE results
ggplot(tsne_df, aes(x = tSNE1, y = tSNE2, color = Species)) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(title = "t-SNE of Iris Dataset",
       x = "t-SNE Dimension 1",
       y = "t-SNE Dimension 2") +
  theme(plot.title = element_text(hjust = 0.5)) # Center the title
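# Optional sketch (not in the original script): t-SNE output is sensitive to
# the perplexity parameter (Rtsne's default is 30); rerunning with a smaller
# value shows how the cluster layout changes.
iris.rtsne.p10 <- Rtsne(iris_d_tsne, perplexity = 10, check_duplicates = FALSE)
plot(iris.rtsne.p10$Y, col = iris$Species, pch = 19,
     xlab = "t-SNE 1", ylab = "t-SNE 2",
     main = "t-SNE with perplexity = 10")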
### Support Vector Machine
library(e1071)

index <- c(1:nrow(iris))
test.index <- sample(index, size = (length(index)/3))
train <- iris[-test.index, ]
test <- iris[test.index, ]

svm.model.linear <- svm(Species ~ ., data = train, kernel = 'linear')
table(Prediction = predict(svm.model.linear, train), Truth = train$Species)

svm.model.poly <- svm(Species ~ ., data = train, kernel = 'polynomial')
table(Prediction = predict(svm.model.poly, train), Truth = train$Species)

# Tune over a grid; note that gamma has no effect with a linear kernel
tuned.svm <- tune.svm(Species ~ ., data = train, kernel = 'linear',
                      gamma = seq(1/2^nrow(iris), 1, .01), cost = 2^seq(-6, 4, 2))
tuned.svm

tuned.svm <- svm(Species ~ ., data = train, kernel = 'linear', gamma = 7.006492e-46, cost = 0.25)
table(Prediction = predict(tuned.svm, train), Truth = train$Species)
best.svm <- best.svm(Species~. , data = train, kernel = 'linear') | ||
best.svm | ||
table(Prediction = predict(best.svm, train), Truth = train$Species) | ||
|
||
best.svm.pred <- predict(best.svm, test) | ||
table(Prediction = best.svm.pred, Truth = test$Species) | ||
|
||
sum(test$Species == best.svm.pred)/50 | ||
|
||
# ChatGPT's SVM
# Install and load the required package
# install.packages("e1071")
library(e1071)

# Load the iris dataset
data(iris)

# Split the data into training and test sets
set.seed(123)
train_indices <- sample(1:nrow(iris), 0.7 * nrow(iris))
train_data <- iris[train_indices, ]
test_data <- iris[-train_indices, ]

# Perform SVM with hyperparameter tuning using tune.svm
tuned_svm <- tune.svm(Species ~ ., data = train_data,
                      gamma = 10^(-6:-1), cost = 10^(1:2))
# Get the best model
best_svm <- tuned_svm$best.model

# Print the best parameters
print(best_svm)

# Make predictions on the test set
svm_predictions <- predict(best_svm, newdata = test_data)

# Confusion matrix
confusion_matrix <- table(Predicted = svm_predictions, Actual = test_data$Species)
print(confusion_matrix)

# Calculate accuracy
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
print(paste("Accuracy:", round(accuracy * 100, 2), "%"))