Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
jp-jong authored Jun 17, 2024
1 parent 8676c3e commit 451afff
Show file tree
Hide file tree
Showing 7 changed files with 6,594 additions and 0 deletions.
1,952 changes: 1,952 additions & 0 deletions Misc Scripts/Iris_dataset.nb.html

Large diffs are not rendered by default.

85 changes: 85 additions & 0 deletions Misc Scripts/decision_tree_1.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# ---- Decision tree on the iris dataset ----
# Fits a CART classification tree on a 70/30 train/test split, reports
# variable importance, plots the tree, and evaluates held-out accuracy.

# Install and load the required packages
#install.packages("rpart")
#install.packages("rpart.plot")
library(rpart)
library(rpart.plot)

# Load the iris dataset
data(iris)

# Split the data into training and test sets (70% train / 30% test).
# set.seed makes the random split reproducible.
set.seed(123)
train_indices <- sample(seq_len(nrow(iris)), 0.7 * nrow(iris))
train_data <- iris[train_indices, ]
test_data <- iris[-train_indices, ]

# Generate the decision tree (classification, default rpart controls)
decision_tree <- rpart(Species ~ ., data = train_data, method = "class")

# Show the importance of the variables, sorted most to least important
importance <- as.data.frame(decision_tree$variable.importance)
importance$Variable <- rownames(importance)
importance <- importance[order(-importance[, 1]), ]
colnames(importance)[1] <- "Importance"
print("Variable Importance:")
print(importance)

# Visualize the tree
rpart.plot(decision_tree, main = "Decision Tree for Iris Dataset")

# Optional: Predict on test data and calculate accuracy
predictions <- predict(decision_tree, newdata = test_data, type = "class")
confusion_matrix <- table(Predicted = predictions, Actual = test_data$Species)
print("Confusion Matrix:")
print(confusion_matrix)

# Accuracy = correctly classified (diagonal) / total test cases
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
print(paste("Accuracy:", round(accuracy * 100, 2), "%"))

### added more trees and pruning
# Strategy: grow an intentionally overfit tree (minsplit = 2, cp = 0 disable
# the usual stopping rules), then prune back to the cp value that minimizes
# the cross-validated error (xerror) from the CP table.

# Install and load the required packages
#install.packages("rpart")
#install.packages("rpart.plot")
library(rpart)
library(rpart.plot)

# Load the iris dataset
data(iris)

# Split the data into training and test sets (70/30, reproducible)
set.seed(123)
train_indices <- sample(seq_len(nrow(iris)), 0.7 * nrow(iris))
train_data <- iris[train_indices, ]
test_data <- iris[-train_indices, ]

# Generate a deeper decision tree: the relaxed controls let it grow until it
# (nearly) perfectly separates the training data
decision_tree <- rpart(Species ~ ., data = train_data, method = "class",
                       control = rpart.control(minsplit = 2, cp = 0))

# Print the CP table to inspect the complexity parameter values
printcp(decision_tree)

# Prune the tree using the cp with the lowest cross-validated error
optimal_cp <- decision_tree$cptable[which.min(decision_tree$cptable[, "xerror"]), "CP"]
pruned_tree <- prune(decision_tree, cp = optimal_cp)

# Show the importance of the variables, sorted most to least important
importance <- as.data.frame(pruned_tree$variable.importance)
importance$Variable <- rownames(importance)
importance <- importance[order(-importance[, 1]), ]
colnames(importance)[1] <- "Importance"
print("Variable Importance:")
print(importance)

# Visualize the pruned tree
rpart.plot(pruned_tree, main = "Pruned Decision Tree for Iris Dataset")

# Optional: Predict on test data and calculate accuracy
predictions <- predict(pruned_tree, newdata = test_data, type = "class")
confusion_matrix <- table(Predicted = predictions, Actual = test_data$Species)
print("Confusion Matrix:")
print(confusion_matrix)

# Accuracy = correctly classified (diagonal) / total test cases
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
print(paste("Accuracy:", round(accuracy * 100, 2), "%"))

63 changes: 63 additions & 0 deletions Misc Scripts/decision_tree_feature_ex_mode.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# ---- Feature-exclusion experiment ----
# Fits and prunes a baseline tree, drops the least important predictor,
# refits on the reduced feature set, and compares test accuracy.

# Install and load the required packages
#install.packages("rpart")
#install.packages("rpart.plot")
library(rpart)
library(rpart.plot)

# Load the iris dataset
data(iris)

# Split the data into training and test sets (70/30, reproducible)
set.seed(123)
train_indices <- sample(seq_len(nrow(iris)), 0.7 * nrow(iris))
train_data <- iris[train_indices, ]
test_data <- iris[-train_indices, ]

# Generate the baseline decision tree (default rpart controls)
decision_tree <- rpart(Species ~ ., data = train_data, method = "class")

# Print the CP table to inspect the complexity parameter values
printcp(decision_tree)

# Prune the tree using the cp with the lowest cross-validated error
optimal_cp <- decision_tree$cptable[which.min(decision_tree$cptable[, "xerror"]), "CP"]
pruned_tree <- prune(decision_tree, cp = optimal_cp)

# Show the importance of the variables, sorted most to least important
importance <- as.data.frame(pruned_tree$variable.importance)
importance$Variable <- rownames(importance)
importance <- importance[order(-importance[, 1]), ]
colnames(importance)[1] <- "Importance"
print("Variable Importance:")
print(importance)

# Identify the least important feature (smallest importance score)
least_important_feature <- rownames(importance)[which.min(importance$Importance)]
print(paste("Excluding the least important feature:", least_important_feature))

# Exclude the least important feature from both splits
train_data_reduced <- train_data[, !(names(train_data) %in% least_important_feature)]
test_data_reduced <- test_data[, !(names(test_data) %in% least_important_feature)]

# Refit on the reduced dataset, again growing deep (minsplit = 2, cp = 0)
# before pruning
decision_tree_reduced <- rpart(Species ~ ., data = train_data_reduced, method = "class",
                               control = rpart.control(minsplit = 2, cp = 0))

# Print the CP table to inspect the complexity parameter values for the reduced tree
printcp(decision_tree_reduced)

# Prune the reduced tree using its optimal cp value
optimal_cp_reduced <- decision_tree_reduced$cptable[which.min(decision_tree_reduced$cptable[, "xerror"]), "CP"]
pruned_tree_reduced <- prune(decision_tree_reduced, cp = optimal_cp_reduced)

# Visualize the pruned tree for the reduced dataset
rpart.plot(pruned_tree_reduced, main = "Pruned Decision Tree for Iris Dataset (Reduced Features)")

# Predict on test data with the reduced model and calculate accuracy
predictions_reduced <- predict(pruned_tree_reduced, newdata = test_data_reduced, type = "class")
confusion_matrix_reduced <- table(Predicted = predictions_reduced, Actual = test_data$Species)
print("Confusion Matrix (Reduced Features):")
print(confusion_matrix_reduced)

accuracy_reduced <- sum(diag(confusion_matrix_reduced)) / sum(confusion_matrix_reduced)
print(paste("Accuracy (Reduced Features):", round(accuracy_reduced * 100, 2), "%"))
Binary file added Project_iris.pptx
Binary file not shown.
4,213 changes: 4,213 additions & 0 deletions Scripts/Project_iris.ipynb

Large diffs are not rendered by default.

130 changes: 130 additions & 0 deletions Scripts/visualization_iris.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
# Overview plot: boxplots of all four iris measurements, grouped by species
library(reshape2)
library(ggplot2)

# Melt to long format so all measurements share a single value column
iris.df <- melt(iris, id.vars = c("Species"))
head(iris.df)

# One boxplot per (species, measurement) combination, with readable labels
ggplot(iris.df, aes(x = Species, y = value, fill = variable)) +
  geom_boxplot() +
  labs(x = "Species", y = "Values", title = "Measurements by group") +
  theme_bw() +
  theme(text = element_text(size = 16),
        axis.text.x = element_text(angle = 0, hjust = 0.5),
        plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5)) +
  scale_fill_discrete(name = "Characteristics",
                      labels = c("Sepal Length", "Sepal Width", "Petal Length",
                                 "Petal Width"))


## Multivariate analysis
## PCA on the four numeric measurements; species used only for coloring
#install.packages("factoextra")  # run once if not installed
library(factoextra)
iris.data <- iris[, 1:4]
iris.name <- iris[, 5]  # FIX: was `iris.name < iris[,5]` (comparison, not assignment)
system.time(p <- prcomp(iris.data, scale = TRUE)) # user:0.001 system:0.001 elapsed: 0.005
names(p)
summary(p)

# Plot individuals in PC space, colored by species
fviz_pca_ind(p,
             geom.ind = "point", # Show points only (no text)
             col.ind = iris.name, # Color by species
             addEllipses = FALSE, # Add concentration ellipses
             legend.title = "Species") +
  theme_minimal() +
  labs(title = "PCA of Iris Dataset",  # removed invalid `center=TRUE` argument
       x = "PC1",
       y = "PC2") +
  theme(plot.title = element_text(hjust = 0.5)) # Center the title


# ---- t-SNE embedding of the iris measurements ----
set.seed(1)  # Rtsne is stochastic; seed makes the embedding reproducible
library(Rtsne)

# Rtsne expects a numeric matrix input
iris_d_tsne <- as.matrix(iris[, 1:4])
# FIX: verbose = FALSE (never use reassignable T/F shortcuts)
system.time(iris.rtsne <- Rtsne(iris_d_tsne, check_duplicates = FALSE, verbose = FALSE)) # user: 0.321 system:0.00 elapsed:0.322

# plot
# Create a data frame combining the 2-D embedding with the species labels
tsne_df <- data.frame(iris.rtsne$Y, Species = iris$Species)
colnames(tsne_df) <- c("tSNE1", "tSNE2", "Species")

# Plot t-SNE results
ggplot(tsne_df, aes(x = tSNE1, y = tSNE2, color = Species)) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(title = "t-SNE of Iris Dataset",
       x = "t-SNE Dimension 1",
       y = "t-SNE Dimension 2") +
  theme(plot.title = element_text(hjust = 0.5)) # Center the title

### Support Vector Machine
# Compares linear and polynomial kernels, tunes the linear kernel, and
# evaluates the best model on a held-out third of the data.
library(e1071)

# NOTE(review): no set.seed before this sample, so the split is not
# reproducible across runs — confirm whether that is intentional.
index <- seq_len(nrow(iris))
test.index <- sample(index, size = (length(index) / 3))  # 1/3 held out
train <- iris[-test.index, ]
test <- iris[test.index, ]

# Linear-kernel SVM; training-set confusion matrix
svm.model.linear <- svm(Species ~ ., data = train, kernel = "linear")
table(Prediction = predict(svm.model.linear, train), Truth = train$Species)

# Polynomial-kernel SVM for comparison
svm.model.poly <- svm(Species ~ ., data = train, kernel = "polynomial")
table(Prediction = predict(svm.model.poly, train), Truth = train$Species)

# Grid-search gamma/cost for the linear kernel
tuned.svm <- tune.svm(Species ~ ., data = train, kernel = "linear",
                      gamma = seq(1 / 2^nrow(iris), 1, .01), cost = 2^seq(-6, 4, 2))
tuned.svm

# Refit with the hand-picked tuned parameters
tuned.svm <- svm(Species ~ ., data = train, kernel = "linear", gamma = 7.006492e-46, cost = 0.25)
table(Prediction = predict(tuned.svm, train), Truth = train$Species)

# FIX: store the fit in best.svm.fit — assigning to `best.svm` masked the
# e1071::best.svm function it was just called from
best.svm.fit <- best.svm(Species ~ ., data = train, kernel = "linear")
best.svm.fit
table(Prediction = predict(best.svm.fit, train), Truth = train$Species)

best.svm.pred <- predict(best.svm.fit, test)
table(Prediction = best.svm.pred, Truth = test$Species)

# Test accuracy. FIX: divide by the actual test size instead of the
# hard-coded 50 (equal here since nrow(iris)/3 == 50, but now general).
sum(test$Species == best.svm.pred) / nrow(test)

# ChatGPT's SVM
# Install and load the required package
#install.packages("e1071")  # FIX: commented out — don't reinstall on every run
library(e1071)

# Load the iris dataset
data(iris)

# Split the data into training and test sets (70/30, reproducible)
set.seed(123)
train_indices <- sample(seq_len(nrow(iris)), 0.7 * nrow(iris))
train_data <- iris[train_indices, ]
test_data <- iris[-train_indices, ]

# Perform SVM with cross-validated hyperparameter tuning over a
# gamma x cost grid
tuned_svm <- tune.svm(Species ~ ., data = train_data,
                      gamma = 10^(-6:-1), cost = 10^(1:2))

# Get the best model found during tuning
best_svm <- tuned_svm$best.model

# Print the best parameters
print(best_svm)

# Make predictions on the test set
svm_predictions <- predict(best_svm, newdata = test_data)

# Confusion matrix
confusion_matrix <- table(Predicted = svm_predictions, Actual = test_data$Species)
print(confusion_matrix)

# Calculate accuracy: correctly classified (diagonal) / total test cases
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
print(paste("Accuracy:", round(accuracy * 100, 2), "%"))



Loading

0 comments on commit 451afff

Please sign in to comment.