apply cal_feature_sel function to training data in trans_classifier

ChiLiubio · Jan 30, 2025 · 001a15c · 001a15c
1 parent 404bbf8
commit 001a15c
Show file tree

Hide file tree

Showing 2 changed files with 78 additions and 71 deletions.
diff --git a/R/trans_classifier.R b/R/trans_classifier.R
@@ -137,6 +137,36 @@ trans_classifier <- R6::R6Class(classname = "trans_classifier",
 			invisible(self)
 		},
 		#' @description
+		#' Split data for training and testing.
+		#' 
+		#' @param prop.train default 3/4; the proportion of the data used for training; the rest is used for testing.
+		#' @return \code{data_train} and \code{data_test} in the object.
+		#' @examples
+		#' \dontrun{
+		#' t1$cal_split(prop.train = 3/4)
+		#' }
+		cal_split = function(prop.train = 3/4){
+			# Build one table (response first, then features) and divide it into training and testing parts.
+			message("Creating training set and testing set ...")
+			# For classification the response is converted to a factor; otherwise it is used as stored.
+			response <- self$data_response
+			if(self$type == "Classification"){
+				response <- factor(response)
+			}
+			# check.names = FALSE keeps the feature column names exactly as they are.
+			full_table <- data.frame(Response = response, self$data_feature, check.names = FALSE)
+			# The split is stratified on the Response column.
+			partition <- rsample::initial_split(full_table, prop = prop.train, strata = "Response")
+			train_part <- rsample::training(partition)
+			test_part <- rsample::testing(partition)
+			message("Stratified sampling with the proportion of ", prop.train*100, "% for the training set ...")
+			# Store both parts in the object.
+			self$data_train <- train_part
+			self$data_test <- test_part
+			message("Training and testing data are stored in object$data_train and object$data_test respectively ...")
+			invisible(self)
+		},
+		#' @description
 		#' Perform feature selection.
 		#' 	 See \href{https://topepo.github.io/caret/feature-selection-overview.html}{https://topepo.github.io/caret/feature-selection-overview.html} for more details.
 		#' 
@@ -155,23 +185,24 @@ trans_classifier <- R6::R6Class(classname = "trans_classifier",
 			boruta.repetitions = 4,
 			...
 			){
-			# ClassNames
-			data_response <- self$data_response
+			data_input <- self$data_train
+			data_x <- data_input[, -1]
+			data_y <- data_input[, 1]
+
 			if(self$type == "Classification"){
-				data_response <- factor(data_response)
+				data_y %<>% factor
 			}
-			DataX <- self$data_feature
 
 			###################### ----------------
 			######################    BORUTA
 			boruta.list <- list()
 			boura.fs <- function(i){
-				boruta.res <- Boruta::Boruta(x = DataX, y = data_response, 
+				boruta.res <- Boruta::Boruta(x = data_x, y = data_y, 
 					maxRuns = boruta.maxRuns, pValue = boruta.pValue, ...)
 				boruta.stats <- data.frame(Boruta::attStats(boruta.res))
 				boruta.list[[i]] <- rownames(boruta.stats[boruta.stats$decision =='Confirmed', ])
 			}
-			message("Running Feature Selection (Boruta) ...")
+			message("Running Feature Selection (Boruta) based on the training data ...")
 			boruta.list <- parallel::mclapply(1:boruta.repetitions, boura.fs)
 
 			boruta.final <- as.data.frame(table(unlist(boruta.list)))
@@ -181,38 +212,14 @@ trans_classifier <- R6::R6Class(classname = "trans_classifier",
 			message("End of Feature Selection - Total of selected features = ", boruta.n.features)
 			######################    BORUTA end
 			###################### ----------------
-			self$data_feature <- DataX[, boruta.list.top]
-			message("The selected features is reassigned to object$data_feature ...")
-			invisible(self)
-		},
-		#' @description
-		#' Split data for training and testing.
-		#' 
-		#' @param prop.train default 3/4; the ratio of the data used for the training.
-		#' @return \code{data_train} and \code{data_test} in the object.
-		#' @examples
-		#' \dontrun{
-		#' t1$cal_split(prop.train = 3/4)
-		#' }
-		cal_split = function(prop.train = 3/4){
-			######################    DATA SPLIT: TRAIN and TEST
-			message("Creating training set and testing set ...")
-			data_response <- self$data_response
-			if(self$type == "Classification"){
-				data_response %<>% factor
-			}
-			data_feature <- self$data_feature
-
-			data_all <- data.frame(Response = data_response, data_feature, check.names = FALSE)
-			SplitData <- rsample::initial_split(data_all, prop = prop.train, strata = "Response")
-			train_data <- rsample::training(SplitData)
-			test_data <- rsample::testing(SplitData)
-			message("Stratified sampling with the proportion of ", prop.train*100 ,"% for the training set ...")
-
-			######################    DATA SPLIT end
-			self$data_train <- train_data
-			self$data_test <- test_data
-			message("Training and testing data are stored in object$data_train and object$data_test respectively ...")
+			data_output <- data_input[, c(colnames(data_input)[1], boruta.list.top)]
+			self$data_train <- data_output
+
+			data_input <- self$data_test
+			data_output <- data_input[, c(colnames(data_input)[1], boruta.list.top)]
+			self$data_test <- data_output
+
+			message("Selected features are reassigned to object$data_train and object$data_test ...")
 			invisible(self)
 		},
 		#' @description

diff --git a/man/trans_classifier.Rd b/man/trans_classifier.Rd