diff --git a/.Rbuildignore b/.Rbuildignore index ae62917..5b2ee19 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -21,4 +21,4 @@ ^CRAN-SUBMISSION$ ^doc$ ^Meta$ -^vignettes$ +^vignettes diff --git a/DESCRIPTION b/DESCRIPTION index 5134b47..b455f9e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -13,6 +13,8 @@ License: MIT + file LICENSE URL: https://github.com/Amalan-ConStat/NeEDS4BigData,https://amalan-constat.github.io/NeEDS4BigData/index.html BugReports: https://github.com/Amalan-ConStat/NeEDS4BigData/issues +Depends: + R (>= 4.0.0) Imports: dplyr, foreach, @@ -21,6 +23,7 @@ Imports: ggplot2, ggridges, matrixStats, + mvnfast, psych, Rdpack, Rfast, @@ -42,6 +45,4 @@ Language: en-GB LazyData: true LazyDataCompression: xz RoxygenNote: 7.3.1 -Depends: - R (>= 3.5.0) Config/testthat/edition: 3 diff --git a/NAMESPACE b/NAMESPACE index f275c34..5c9db53 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -31,10 +31,13 @@ importFrom(Rdpack,reprompt) importFrom(Rfast,rowprods) importFrom(dplyr,group_by) importFrom(dplyr,summarise) +importFrom(gam,lo) importFrom(gam,s) importFrom(ggh4x,facet_grid2) importFrom(matrixStats,rowSums2) +importFrom(mvnfast,rmvn) importFrom(psych,tr) importFrom(rlang,.data) importFrom(tidyr,pivot_longer) importFrom(tidyr,starts_with) +importFrom(utils,combn) diff --git a/NeEDS4BigData.Rproj b/NeEDS4BigData.Rproj index b6d689d..473c4a7 100644 --- a/NeEDS4BigData.Rproj +++ b/NeEDS4BigData.Rproj @@ -19,6 +19,6 @@ StripTrailingWhitespace: Yes BuildType: Package PackageUseDevtools: Yes PackageInstallArgs: --no-multiarch --with-keep.source -PackageRoxygenize: rd,collate,namespace,vignette +PackageRoxygenize: rd,collate,namespace SpellingDictionary: en_GB diff --git a/R/ALoptimalGLMSub.R b/R/ALoptimalGLMSub.R index 03102b5..17dc365 100644 --- a/R/ALoptimalGLMSub.R +++ b/R/ALoptimalGLMSub.R @@ -64,8 +64,7 @@ #' #' r1<-300; r2<-rep(600,50); Original_Data<-Full_Data$Complete_Data; #' -#' ALoptimalGLMSub(r1 = r1, r2 = r2, -#' Y = as.matrix(Original_Data[,colnames(Original_Data) %in% c("Y")]), +#' ALoptimalGLMSub(r1 = r1, r2 = r2,Y = as.matrix(Original_Data[,1]), #' X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data), #' family = "linear")->Results #' @@ -77,8 +76,7 @@ #' #' r1<-300; r2<-rep(600,50); Original_Data<-Full_Data$Complete_Data; #' -#' ALoptimalGLMSub(r1 = r1, r2 = r2, -#' Y = as.matrix(Original_Data[,colnames(Original_Data) %in% c("Y")]), +#' ALoptimalGLMSub(r1 = r1, r2 = r2,Y = as.matrix(Original_Data[,1]), #' X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data), #' family = "logistic")->Results #' @@ -90,8 +88,7 @@ #' #' r1<-300; r2<-rep(600,50); Original_Data<-Full_Data$Complete_Data; #' -#' ALoptimalGLMSub(r1 = r1, r2 = r2, -#' Y = as.matrix(Original_Data[,colnames(Original_Data) %in% c("Y")]), +#' ALoptimalGLMSub(r1 = r1, r2 = r2,Y = as.matrix(Original_Data[,1]), #' X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data), #' family = "poisson")->Results #' @@ -129,7 +126,7 @@ ALoptimalGLMSub <- function(r1,r2,Y,X,N,family){ if(family %in% c("linear")){ PI.prop <- rep(1/N, N) - idx.prop <- sample(1:N, r1, T) + idx.prop <- sample(1:N, size = r1, replace = TRUE) x.prop <- X[idx.prop,] y.prop <- Y[idx.prop,] @@ -137,8 +134,8 @@ ALoptimalGLMSub <- function(r1,r2,Y,X,N,family){ pinv.prop <- N pinv.prop <- 1/PI.prop[idx.prop] - beta.prop<-solve(a=t(x.prop)%*%x.prop,b=t(x.prop)%*%y.prop) - Xbeta_Final<-as.vector(X%*%beta.prop) + beta.prop<-solve(a=crossprod(x.prop),b= crossprod(x.prop,y.prop)) + Xbeta_Final<-X%*%beta.prop Var.prop<-sum((Y-Xbeta_Final)^2)/N 
Epsilon.prop<-Y-Xbeta_Final @@ -161,7 +158,7 @@ ALoptimalGLMSub <- function(r1,r2,Y,X,N,family){ PI.mVc<-PI.mVc/sum(PI.mVc) ## mMSE - PI.mMSE<-sqrt(Epsilon.prop^2 * matrixStats::rowSums2((X %*% solve(t(X)%*%X))^2)) + PI.mMSE<-sqrt(Epsilon.prop^2 * matrixStats::rowSums2((X %*% solve(crossprod(X)))^2)) PI.mMSE<-PI.mMSE/sum(PI.mMSE) message("Step 1 of the algorithm completed.\n") @@ -169,7 +166,7 @@ ALoptimalGLMSub <- function(r1,r2,Y,X,N,family){ for (i in 1:length(r2)) { ## mVc - idx.mVc <- sample(1:N, r2[i]-r1, T, PI.mVc) + idx.mVc <- sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI.mVc) x.mVc <- X[c(idx.mVc, idx.prop),] y.mVc <- Y[c(idx.mVc, idx.prop)] @@ -178,8 +175,8 @@ ALoptimalGLMSub <- function(r1,r2,Y,X,N,family){ pi4_r<-sqrt(r2[i]*pinv.mVc^(-1)) X_r4<-x.mVc/pi4_r Y_r4<-y.mVc/pi4_r - beta.prop<-solve(a=t(X_r4)%*%X_r4,b=t(X_r4)%*%Y_r4) - Xbeta_Final<-as.vector(X%*%beta.prop) + beta.prop<-solve(a=crossprod(X_r4),b=crossprod(X_r4,Y_r4)) + Xbeta_Final<-X%*%beta.prop Var.prop<-sum((Y-Xbeta_Final)^2)/N Sample.mVc[[i+1]]<-idx.mVc; @@ -191,7 +188,7 @@ ALoptimalGLMSub <- function(r1,r2,Y,X,N,family){ } ## mMSE - idx.mMSE <- sample(1:N, r2[i]-r1, T, PI.mMSE) + idx.mMSE <- sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI.mMSE) x.mMSE <- X[c(idx.mMSE, idx.prop),] y.mMSE <- Y[c(idx.mMSE, idx.prop)] @@ -200,8 +197,8 @@ ALoptimalGLMSub <- function(r1,r2,Y,X,N,family){ pi4_r<-sqrt(r2[i]*pinv.mMSE^(-1)) X_r4<-x.mMSE/pi4_r Y_r4<-y.mMSE/pi4_r - beta.prop<-solve(a=t(X_r4)%*%X_r4,b=t(X_r4)%*%Y_r4) - Xbeta_Final<-as.vector(X%*%beta.prop) + beta.prop<-solve(a=crossprod(X_r4),b=crossprod(X_r4,Y_r4)) + Xbeta_Final<-X%*%beta.prop Var.prop<-sum((Y-Xbeta_Final)^2)/N Sample.mMSE[[i+1]]<-idx.mMSE; @@ -242,7 +239,7 @@ ALoptimalGLMSub <- function(r1,r2,Y,X,N,family){ n0 <- N - n1 PI.prop <- rep(1/(2*n0), N) PI.prop[Y==1] <- 1/(2*n1) - idx.prop <- sample(1:N, r1, T, PI.prop) + idx.prop <- sample(1:N, size = r1, replace = TRUE, prob = PI.prop) x.prop <- X[idx.prop,] y.prop <- Y[idx.prop,] @@ -274,7 +271,7 @@ ALoptimalGLMSub <- function(r1,r2,Y,X,N,family){ ## mMSE p.prop <- P.prop[idx.prop] w.prop <- p.prop * (1 - p.prop) - W.prop <- solve(t(x.prop) %*% (x.prop * w.prop * pinv.prop)) + W.prop <- solve(crossprod(x.prop,x.prop * w.prop * pinv.prop)) PI.mMSE<-sqrt((Y - P.prop)^2 * matrixStats::rowSums2((X%*%W.prop)^2)) PI.mMSE <- PI.mMSE/sum(PI.mMSE) @@ -284,7 +281,7 @@ ALoptimalGLMSub <- function(r1,r2,Y,X,N,family){ for (i in 1:length(r2)) { ## mVc - idx.mVc <- sample(1:N, r2[i]-r1, T, PI.mVc) + idx.mVc <- sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI.mVc) x.mVc <- X[c(idx.mVc, idx.prop), ] y.mVc <- Y[c(idx.mVc, idx.prop)] @@ -307,7 +304,7 @@ ALoptimalGLMSub <- function(r1,r2,Y,X,N,family){ Utility_mVc[i,-1]<-c(psych::tr(V_Final),det(solve(V_Final))) ## mMSE - idx.mMSE <- sample(1:N, r2[i]-r1, T, PI.mMSE) + idx.mMSE <- sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI.mMSE) x.mMSE <- X[c(idx.mMSE, idx.prop),] y.mMSE <- Y[c(idx.mMSE, idx.prop)] @@ -356,14 +353,14 @@ ALoptimalGLMSub <- function(r1,r2,Y,X,N,family){ } if(family %in% "poisson"){ PI.prop <- rep(1/N, N) - idx.prop <- sample(1:N, r1, T) + idx.prop <- sample(1:N, size = r1, replace = TRUE) x.prop<-X[idx.prop,] y.prop <- Y[idx.prop,] pinv.prop <- N pinv.prop <- 1/PI.prop[idx.prop] - fit.prop <- stats::glm(y.prop~x.prop-1,family = "poisson") + fit.prop <- stats::glm(y.prop~x.prop-1,family = "quasipoisson") beta.prop <- fit.prop$coefficients if(anyNA(beta.prop)){ @@ -389,7 +386,7 @@ ALoptimalGLMSub <- function(r1,r2,Y,X,N,family){ ## 
mMSE w.prop <- P.prop[idx.prop] - W.prop <- solve(t(x.prop) %*% (x.prop * w.prop * pinv.prop)) + W.prop <- solve(crossprod(x.prop,x.prop * w.prop * pinv.prop)) PI.mMSE<-sqrt((Y - P.prop)^2 * matrixStats::rowSums2((X%*%W.prop)^2)) PI.mMSE <- PI.mMSE/sum(PI.mMSE) @@ -399,13 +396,13 @@ ALoptimalGLMSub <- function(r1,r2,Y,X,N,family){ for (i in 1:length(r2)) { ## mVc - idx.mVc <- sample(1:N, r2[i]-r1, T, PI.mVc) + idx.mVc <- sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI.mVc) x.mVc <- X[c(idx.mVc, idx.prop),] y.mVc <- Y[c(idx.mVc, idx.prop)] pinv.mVc<-c(1 / PI.mVc[idx.mVc], pinv.prop) - fit.mVc <-stats::glm(y.mVc~x.mVc-1, family = "poisson",weights=pinv.mVc) + fit.mVc <-stats::glm(y.mVc~x.mVc-1, family = "quasipoisson",weights=pinv.mVc) Sample.mVc[[i+1]]<-idx.mVc; beta.mVc[i,-1] <- fit.mVc$coefficients @@ -422,13 +419,13 @@ ALoptimalGLMSub <- function(r1,r2,Y,X,N,family){ Utility_mVc[i,-1]<-c(psych::tr(V_Final),det(solve(V_Final))) ## mMSE - idx.mMSE <- sample(1:N, r2[i]-r1, T, PI.mMSE) + idx.mMSE <- sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI.mMSE) x.mMSE <- X[c(idx.mMSE, idx.prop),] y.mMSE <- Y[c(idx.mMSE, idx.prop)] pinv.mMSE<-c(1 / PI.mMSE[idx.mMSE], pinv.prop) - fit.mMSE <- stats::glm(y.mMSE~x.mMSE-1, family = "poisson",weights=pinv.mMSE) + fit.mMSE <- stats::glm(y.mMSE~x.mMSE-1, family = "quasipoisson",weights=pinv.mMSE) Sample.mMSE[[i+1]]<-idx.mMSE; beta.mMSE[i,-1] <-fit.mMSE$coefficients diff --git a/R/AoptimalGauLMSub.R b/R/AoptimalGauLMSub.R index 8826d23..98a9dca 100644 --- a/R/AoptimalGauLMSub.R +++ b/R/AoptimalGauLMSub.R @@ -52,14 +52,14 @@ #' #' r1<-300; r2<-rep(100*c(6,12),50); Original_Data<-Full_Data$Complete_Data; #' -#' AoptimalGauLMSub(r1 = r1, r2 = r2, -#' Y = as.matrix(Original_Data[,colnames(Original_Data) %in% c("Y")]), +#' AoptimalGauLMSub(r1 = r1, r2 = r2,Y = as.matrix(Original_Data[,1]), #' X = as.matrix(Original_Data[,-1]), #' N = nrow(Original_Data))->Results #' #' plot_Beta(Results) #' #' @importFrom Rdpack reprompt +#' @importFrom matrixStats rowSums2 #' @export AoptimalGauLMSub <- function(r1,r2,Y,X,N){ if(any(is.na(c(r1,r2,N))) | any(is.nan(c(r1,r2,N)))){ @@ -70,7 +70,7 @@ AoptimalGauLMSub <- function(r1,r2,Y,X,N){ stop("r1 or N has a value greater than length one") } - if(any(is.na(cbind(Y,X))) | any(is.nan(cbind(Y,X)))){ + if(anyNA(Y) | anyNA(X) | any(is.nan(Y)) | any(is.nan(X)) ){ stop("NA or Infinite or NAN values in the Y or X") } @@ -83,7 +83,7 @@ AoptimalGauLMSub <- function(r1,r2,Y,X,N){ } PI.prop <- rep(1/N, N) - idx.prop <- sample(1:N, r1, T) + idx.prop <- sample(1:N, size = r1, replace = TRUE) x.prop <- X[idx.prop,] y.prop <- Y[idx.prop,] @@ -91,8 +91,8 @@ AoptimalGauLMSub <- function(r1,r2,Y,X,N){ pinv.prop <- N pinv.prop <- 1/PI.prop[idx.prop] - beta.prop<-solve(a=t(x.prop)%*%x.prop,b=t(x.prop)%*%y.prop) - Xbeta_Final<-as.vector(X%*%beta.prop) + beta.prop<-solve(a=crossprod(x.prop),b= crossprod(x.prop,y.prop)) + Xbeta_Final<-X%*%beta.prop Var.prop<-sum((Y-Xbeta_Final)^2)/N Epsilon.prop<-Y-Xbeta_Final @@ -101,7 +101,7 @@ AoptimalGauLMSub <- function(r1,r2,Y,X,N){ } Second <- (Epsilon.prop^2 - Var.prop)^2/(4 * N^2 * Var.prop) - ML_Inv <- solve(t(X)%*%X) + ML_Inv <- solve(crossprod(X)) beta.mMSE<-matrix(nrow = length(r2),ncol = ncol(X)+1 ) Var_Epsilon<-matrix(nrow = length(r2),ncol = 2) @@ -114,7 +114,7 @@ AoptimalGauLMSub <- function(r1,r2,Y,X,N){ colnames(Var_Epsilon)<-c("r2","A-Optimality") ## mMSE - PI.mMSE <- sqrt(Epsilon.prop^2 * rowSums((X %*% ML_Inv)^2) + Second) + PI.mMSE <- sqrt(Epsilon.prop^2 * matrixStats::rowSums2((X 
%*% ML_Inv)^2) + Second) PI.mMSE <- PI.mMSE/sum(PI.mMSE) message("Step 1 of the algorithm completed.\n") @@ -122,7 +122,7 @@ AoptimalGauLMSub <- function(r1,r2,Y,X,N){ for (i in 1:length(r2)) { ## mMSE - idx.mMSE <- sample(1:N, r2[i]-r1, T, PI.mMSE) + idx.mMSE <- sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI.mMSE) x.mMSE <- X[c(idx.mMSE, idx.prop),] y.mMSE <- Y[c(idx.mMSE, idx.prop)] @@ -131,8 +131,8 @@ AoptimalGauLMSub <- function(r1,r2,Y,X,N){ pi4_r<-sqrt(r2[i]*pinv.mMSE^(-1)) X_r4<-x.mMSE/pi4_r Y_r4<-y.mMSE/pi4_r - beta.prop<-solve(a=t(X_r4)%*%X_r4,b=t(X_r4)%*%Y_r4) - Xbeta_Final<-as.vector(X%*%beta.prop) + beta.prop<-solve(a=crossprod(X_r4),b=crossprod(X_r4,Y_r4)) + Xbeta_Final<-X%*%beta.prop Var.prop<-sum((Y-Xbeta_Final)^2)/N Sample.mMSE[[i+1]]<-idx.mMSE; diff --git a/R/AoptimalMCGLMSub.R b/R/AoptimalMCGLMSub.R index 0806ca4..91dd46b 100644 --- a/R/AoptimalMCGLMSub.R +++ b/R/AoptimalMCGLMSub.R @@ -59,8 +59,7 @@ #' #' r1<-300; r2<-rep(100*c(6,12),50); Original_Data<-Full_Data$Complete_Data; #' -#' AoptimalMCGLMSub(r1 = r1, r2 = r2, -#' Y = as.matrix(Original_Data[,colnames(Original_Data) %in% c("Y")]), +#' AoptimalMCGLMSub(r1 = r1, r2 = r2,Y = as.matrix(Original_Data[,1]), #' X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data), #' family = "linear")->Results #' @@ -72,8 +71,7 @@ #' #' r1<-300; r2<-rep(100*c(6,12),50); Original_Data<-Full_Data$Complete_Data; #' -#' AoptimalMCGLMSub(r1 = r1, r2 = r2, -#' Y = as.matrix(Original_Data[,colnames(Original_Data) %in% c("Y")]), +#' AoptimalMCGLMSub(r1 = r1, r2 = r2,Y = as.matrix(Original_Data[,1]), #' X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data), #' family = "logistic")->Results #' @@ -85,8 +83,7 @@ #' #' r1<-300; r2<-rep(100*c(6,12),50); Original_Data<-Full_Data$Complete_Data; #' -#' AoptimalMCGLMSub(r1 = r1, r2 = r2, -#' Y = as.matrix(Original_Data[,colnames(Original_Data) %in% c("Y")]), +#' AoptimalMCGLMSub(r1 = r1, r2 = r2,Y = as.matrix(Original_Data[,1]), #' X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data), #' family = "poisson")->Results #' @@ -94,7 +91,6 @@ #' #' @importFrom Rdpack reprompt #' @import stats -#' @importFrom psych tr #' @importFrom matrixStats rowSums2 #' @export AoptimalMCGLMSub <- function(r1,r2,Y,X,N,family){ @@ -124,7 +120,7 @@ AoptimalMCGLMSub <- function(r1,r2,Y,X,N,family){ if(family %in% c("linear")){ PI.prop <- rep(1/N, N) - idx.prop <- sample(1:N, r1, T) + idx.prop <- sample(1:N, size = r1, replace = TRUE) x.prop <- X[idx.prop,] y.prop <- Y[idx.prop,] @@ -132,8 +128,8 @@ AoptimalMCGLMSub <- function(r1,r2,Y,X,N,family){ pinv.prop <- N pinv.prop <- 1/PI.prop[idx.prop] - beta.prop<-solve(a=t(x.prop)%*%x.prop,b=t(x.prop)%*%y.prop) - Xbeta_Final<-as.vector(X%*%beta.prop) + beta.prop<-solve(a=crossprod(x.prop),b= crossprod(x.prop,y.prop)) + Xbeta_Final<-X%*%beta.prop Var.prop<-sum((Y-Xbeta_Final)^2)/N Epsilon.prop<-Y-Xbeta_Final @@ -152,7 +148,7 @@ AoptimalMCGLMSub <- function(r1,r2,Y,X,N,family){ colnames(Var_Epsilon)<-c("r2","A-Optimality") ## mMSE - PI.mMSE<-sqrt(matrixStats::rowSums2((X %*% solve(t(X)%*%X))^2)) + PI.mMSE<-sqrt(matrixStats::rowSums2((X %*% solve(crossprod(X)))^2)) PI.mMSE<-PI.mMSE/sum(PI.mMSE) message("Step 1 of the algorithm completed.\n") @@ -160,17 +156,17 @@ AoptimalMCGLMSub <- function(r1,r2,Y,X,N,family){ for (i in 1:length(r2)) { ## mMSE - idx.mMSE <- sample(1:N, r2[i], T, PI.mMSE) + idx.mMSE <- sample(1:N, size = r2[i], replace = TRUE, prob = PI.mMSE) - x.mMSE <- X[c(idx.mMSE),] - y.mMSE <- Y[c(idx.mMSE)] + x.mMSE <- X[idx.mMSE,] + y.mMSE <- Y[idx.mMSE] 
pinv.mMSE<-c(1 / PI.mMSE[idx.mMSE]) pi4_r<-sqrt(r2[i]*pinv.mMSE^(-1)) X_r4<-x.mMSE/pi4_r Y_r4<-y.mMSE/pi4_r - beta.prop<-solve(a=t(X_r4)%*%X_r4,b=t(X_r4)%*%Y_r4) - Xbeta_Final<-as.vector(X%*%beta.prop) + beta.prop<-solve(a=crossprod(X_r4),b=crossprod(X_r4,Y_r4)) + Xbeta_Final<-X%*%beta.prop Var.prop<-sum((Y-Xbeta_Final)^2)/N Sample.mMSE[[i+1]]<-idx.mMSE @@ -209,7 +205,7 @@ AoptimalMCGLMSub <- function(r1,r2,Y,X,N,family){ n0 <- N - n1 PI.prop <- rep(1/(2*n0), N) PI.prop[Y==1] <- 1/(2*n1) - idx.prop <- sample(1:N, r1, T, PI.prop) + idx.prop <- sample(1:N, size = r1, replace = TRUE, prob = PI.prop) x.prop <- X[idx.prop,] y.prop <- Y[idx.prop,] @@ -234,7 +230,7 @@ AoptimalMCGLMSub <- function(r1,r2,Y,X,N,family){ ## mMSE p.prop <- P.prop[idx.prop] w.prop <- p.prop * (1 - p.prop) - W.prop <- r1*solve(t(x.prop) %*% (x.prop * w.prop * pinv.prop)) + W.prop <- r1*solve(crossprod(x.prop,(x.prop * w.prop * pinv.prop))) PI.mMSE<-sqrt(P.prop*(1-P.prop))*sqrt(matrixStats::rowSums2((X%*%W.prop)^2)) PI.mMSE <- PI.mMSE/sum(PI.mMSE) @@ -244,10 +240,10 @@ AoptimalMCGLMSub <- function(r1,r2,Y,X,N,family){ for (i in 1:length(r2)) { ## mMSE - idx.mMSE <- sample(1:N, r2[i], T, PI.mMSE) + idx.mMSE <- sample(1:N, size = r2[i], replace = TRUE, prob = PI.mMSE) - x.mMSE <- X[c(idx.mMSE),] - y.mMSE <- Y[c(idx.mMSE)] + x.mMSE <- X[idx.mMSE,] + y.mMSE <- Y[idx.mMSE] fit.mMSE <- .getMLE(x=x.mMSE, y=y.mMSE,w=c(1 / PI.mMSE[idx.mMSE])) Sample.mMSE[[i+1]]<-idx.mMSE @@ -279,14 +275,14 @@ AoptimalMCGLMSub <- function(r1,r2,Y,X,N,family){ } if(family %in% "poisson"){ PI.prop <- rep(1/N, N) - idx.prop <- sample(1:N, r1, T) + idx.prop <- sample(1:N, size = r1, replace = TRUE) - x.prop<-X[idx.prop,] + x.prop <- X[idx.prop,] y.prop <- Y[idx.prop,] pinv.prop <- N pinv.prop <- 1/PI.prop[idx.prop] - fit.prop <- stats::glm(y.prop~x.prop-1,family = "poisson") + fit.prop <- stats::glm(y.prop~x.prop-1,family = "quasipoisson") beta.prop <- fit.prop$coefficients if(anyNA(beta.prop)){ @@ -305,7 +301,7 @@ AoptimalMCGLMSub <- function(r1,r2,Y,X,N,family){ ## mMSE w.prop <- P.prop[idx.prop] - W.prop <- r1*solve(t(x.prop) %*% (x.prop * w.prop * pinv.prop)) + W.prop <- r1*solve(crossprod(x.prop,x.prop * w.prop * pinv.prop)) PI.mMSE<-sqrt(P.prop)*sqrt(matrixStats::rowSums2((X%*%W.prop)^2)) PI.mMSE <- PI.mMSE/sum(PI.mMSE) @@ -315,13 +311,13 @@ AoptimalMCGLMSub <- function(r1,r2,Y,X,N,family){ for (i in 1:length(r2)) { ## mMSE - idx.mMSE <- sample(1:N, r2[i], T, PI.mMSE) + idx.mMSE <- sample(1:N, size = r2[i], replace = TRUE, prob = PI.mMSE) - x.mMSE <- X[c(idx.mMSE),] - y.mMSE <- Y[c(idx.mMSE)] + x.mMSE <- X[idx.mMSE,] + y.mMSE <- Y[idx.mMSE] pinv.mMSE<-c(1 / PI.mMSE[idx.mMSE]) - fit.mMSE <- stats::glm(y.mMSE~x.mMSE-1, family = "poisson",weights=pinv.mMSE) + fit.mMSE <- stats::glm(y.mMSE~x.mMSE-1, family = "quasipoisson",weights=pinv.mMSE) Sample.mMSE[[i+1]]<-idx.mMSE beta.mMSE[i,-1] <-fit.mMSE$coefficients diff --git a/R/Big_data.R b/R/Big_data.R index 8eb4c30..8a584b5 100644 --- a/R/Big_data.R +++ b/R/Big_data.R @@ -86,14 +86,14 @@ #' Generate data for Generalised Linear Models #' #' Function to simulate big data under linear, logistic and Poisson regression for sampling. -#' Covariate data X is through Normal or Uniform distribution for linear regression. -#' Covariate data X is through Exponential or Normal or Uniform distribution for logistic regression. +#' Covariate data X is through Normal, Multivariate Normal or Uniform distribution for linear regression.
+#' Covariate data X is through Exponential, Normal, Multivariate Normal or Uniform distribution for logistic regression. #' Covariate data X is through Normal or Uniform distribution for Poisson regression. #' #' @usage #' GenGLMdata(Dist,Dist_Par,No_Of_Var,Beta,N,family) #' -#' @param Dist a character value for the distribution "Normal" or "Uniform or "Exponential" +#' @param Dist a character value for the distribution "Normal", "MVNormal", "Uniform" or "Exponential" #' @param Dist_Par a list of parameters for the distribution that would generate data for covariate X #' @param No_Of_Var number of variables #' @param Beta a vector for the model parameters, including the intercept @@ -105,8 +105,8 @@ #' regression types. #' #' We have limited the covariate data generation for -#' linear regression through normal and uniform distribution, -#' logistic regression through exponential, normal and uniform and +#' linear regression through normal, multivariate normal and uniform distribution, +#' logistic regression through exponential, normal, multivariate normal and uniform distribution, and #' Poisson regression through normal and uniform distribution. #' #' @return @@ -118,17 +118,30 @@ #' \insertRef{lee1996hierarchical}{NeEDS4BigData} #' #' @examples -#' Dist<-"Normal"; Dist_Par<-list(Mean=0,Variance=1,Error_Variance=0.5) -#' No_Of_Var<-2; Beta<-c(-1,2,1); N<-5000; Family<-"linear" +#' No_Of_Var<-2; Beta<-c(-1,2,1); N<-5000; +#' +#' # Dist<-"Normal"; Dist_Par<-list(Mean=0,Variance=1,Error_Variance=0.5) +#' Dist<-"MVNormal"; +#' Dist_Par<-list(Mean=rep(0,No_Of_Var),Variance=diag(rep(2,No_Of_Var)),Error_Variance=0.5) +#' # Dist<-"Uniform"; Dist_Par<-list(Min=0,Max=1) +#' +#' Family<-"linear" #' Results<-GenGLMdata(Dist,Dist_Par,No_Of_Var,Beta,N,Family) #' -#' Dist<-"Normal"; Dist_Par<-list(Mean=0,Variance=1); Family<-"logistic" +#' Dist<-"Normal"; Dist_Par<-list(Mean=0,Variance=1); +#' # Dist<-"MVNormal"; Dist_Par<-list(Mean=rep(0,No_Of_Var),Variance=diag(rep(2,No_Of_Var))) +#' # Dist<-"Exponential"; Dist_Par<-list(Rate=3) +#' # Dist<-"Uniform"; Dist_Par<-list(Min=0,Max=1) +#' +#' Family<-"logistic" #' Results<-GenGLMdata(Dist,Dist_Par,No_Of_Var,Beta,N,Family) #' -#' Dist<-"Normal"; Family<-"poisson" +#' # Dist<-"Normal"; +#' Dist<-"Uniform"; Family<-"poisson" #' Results<-GenGLMdata(Dist,NULL,No_Of_Var,Beta,N,Family) #' #' @import stats +#' @importFrom mvnfast rmvn #' @export GenGLMdata<-function(Dist,Dist_Par,No_Of_Var,Beta,N,family){ if(any(is.na(c(Dist,Beta,No_Of_Var,N,family))) | any(is.nan(c(Dist,No_Of_Var,Beta,N,family)))){ @@ -148,14 +161,14 @@ GenGLMdata<-function(Dist,Dist_Par,No_Of_Var,Beta,N,family){ } if(family == "linear"){ - if(!(Dist == "Normal" | Dist == "Uniform")){ - stop("For linear regression select the distribution 'Normal' \n or 'Uniform' to generate the covarate data") + if(!(Dist == "Normal" | Dist == "MVNormal" | Dist == "Uniform")){ + stop("For linear regression select the distribution 'Normal', 'MVNormal' \n or 'Uniform' to generate the covariate data") } } if(family == "logistic"){ - if(!(Dist == "Exponential" | Dist == "Normal" | Dist == "Uniform")){ - stop("For logistic regression select the distribution 'Exponential', \n 'Normal' or 'Uniform' to generate the covarate data") + if(!(Dist == "Exponential" | Dist == "Normal" | Dist == "MVNormal" | Dist == "Uniform")){ + stop("For logistic regression select the distribution 'Exponential', \n 'Normal', 'MVNormal' or 'Uniform' to generate the covariate data") } } @@ -169,6 +182,9 @@ 
GenGLMdata<-function(Dist,Dist_Par,No_Of_Var,Beta,N,family){ if(Dist %in% "Normal"){ X<-replicate(No_Of_Var,stats::rnorm(n = N, mean = Dist_Par$Mean, sd = sqrt(Dist_Par$Variance))) } + if(Dist %in% "MVNormal"){ + X<-mvnfast::rmvn(n = N, mu = Dist_Par$Mean, sigma = Dist_Par$Variance) + } if(Dist %in% "Uniform"){ X<-replicate(No_Of_Var,stats::runif(n = N, min = Dist_Par$Min, max = Dist_Par$Max)) } @@ -193,6 +209,9 @@ GenGLMdata<-function(Dist,Dist_Par,No_Of_Var,Beta,N,family){ if(Dist %in% "Normal"){ X<-replicate(No_Of_Var,stats::rnorm(n = N, mean = Dist_Par$Mean, sd = sqrt(Dist_Par$Variance))) } + if(Dist %in% "MVNormal"){ + X<-mvnfast::rmvn(n = N, mu = Dist_Par$Mean, sigma = Dist_Par$Variance) + } if(Dist %in% "Uniform"){ X<-replicate(No_Of_Var,stats::runif(n = N, min = Dist_Par$Min, max = Dist_Par$Max)) } diff --git a/R/LCCsampling.R b/R/LCCsampling.R index 4586b9a..5a7ca19 100644 --- a/R/LCCsampling.R +++ b/R/LCCsampling.R @@ -52,7 +52,7 @@ #' #' r1<-300; r2<-rep(100*c(6,9,12),50); Original_Data<-Full_Data$Complete_Data; #' -#' LCCsampling(r1 = r1, r2 = r2, Y = as.matrix(Original_Data[,colnames(Original_Data) %in% c("Y")]), +#' LCCsampling(r1 = r1, r2 = r2, Y = as.matrix(Original_Data[,1]), #' X = as.matrix(Original_Data[,-1]), #' N = nrow(Original_Data))->Results #' @@ -85,7 +85,7 @@ LCCsampling<-function(r1,r2,Y,X,N){ n0 <- N - n1 PI.prop <- rep(1/(2*n0), N) PI.prop[Y==1] <- 1/(2*n1) - idx.prop <- sample(1:N, r1, T, PI.prop) + idx.prop <- sample(1:N, size = r1, replace = TRUE, prob = PI.prop) x.prop <- X[idx.prop,] y.prop <- Y[idx.prop,] @@ -115,10 +115,10 @@ LCCsampling<-function(r1,r2,Y,X,N){ for(i in 1:length(r2)){ ## local case control sampling - idx.LCC <- sample(1:N, r2[i], T, PI.LCC) + idx.LCC <- sample(1:N, size = r2[i], replace = TRUE, prob = PI.LCC) - x.LCC <- X[c(idx.LCC),] - y.LCC <- Y[c(idx.LCC)] + x.LCC <- X[idx.LCC,] + y.LCC <- Y[idx.LCC] pinv.LCC <- 1 fit.LCC <- .getMLE(x=x.LCC, y=y.LCC, w=pinv.LCC) beta.prop <- fit.LCC$par+beta.prop_start diff --git a/R/LeverageSampling.R b/R/LeverageSampling.R index d74512e..a2ca6b0 100644 --- a/R/LeverageSampling.R +++ b/R/LeverageSampling.R @@ -61,7 +61,7 @@ #' #' r<-rep(100*c(6,10),50); Original_Data<-Full_Data$Complete_Data; #' -#' LeverageSampling(r = r, Y = as.matrix(Original_Data[,colnames(Original_Data) %in% c("Y")]), +#' LeverageSampling(r = r, Y = as.matrix(Original_Data[,1]), #' X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data), #' S_alpha = 0.95, #' family = "linear")->Results #' @@ -74,7 +74,7 @@ #' #' r<-rep(100*c(6,10),50); Original_Data<-Full_Data$Complete_Data; #' -#' LeverageSampling(r = r, Y = as.matrix(Original_Data[,colnames(Original_Data) %in% c("Y")]), +#' LeverageSampling(r = r, Y = as.matrix(Original_Data[,1]), #' X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data), #' S_alpha = 0.95, #' family = "logistic")->Results #' @@ -87,7 +87,7 @@ #' #' r<-rep(100*c(6,10),50); Original_Data<-Full_Data$Complete_Data; #' -#' LeverageSampling(r = r, Y = as.matrix(Original_Data[,colnames(Original_Data) %in% c("Y")]), +#' LeverageSampling(r = r, Y = as.matrix(Original_Data[,1]), #' X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data), #' S_alpha = 0.95, #' family = "poisson")->Results #' @@ -142,17 +142,17 @@ LeverageSampling<-function(r,Y,X,N,S_alpha,family){ for (i in 1:length(r)) { # basic leverage sampling - idx.blev <- sample(1:N, size = r[i], replace = TRUE, PI.blev) + idx.blev <- sample(1:N, size = r[i], replace = TRUE, prob = PI.blev) - x.blev <- as.matrix(X[idx.blev,]) - y.blev <- 
as.matrix(Y[idx.blev]) + x.blev <- X[idx.blev,] + y.blev <- Y[idx.blev] wgt <- 1 / PI.blev[idx.blev] - Temp_Data <- data.frame(y = y.blev,x.blev, wgt = wgt) - lm.blev <- stats::lm(y ~ . - 1, weights = Temp_Data$wgt, data = Temp_Data[,-ncol(Temp_Data)]) + Temp_Data <- data.frame(y = y.blev,x.blev) + lm.blev <- stats::lm(y ~ . - 1, weights = wgt, data = Temp_Data) beta.prop<-stats::coefficients(lm.blev) - Xbeta_Final<-as.vector(X%*%beta.prop) + Xbeta_Final<-X%*%beta.prop Var.prop<-sum((Y-Xbeta_Final)^2)/N Sample.blev[[i]]<-idx.blev @@ -164,10 +164,10 @@ LeverageSampling<-function(r,Y,X,N,S_alpha,family){ } # unweighted leverage sampling - lm.uwlev <- stats::lm(y ~ . - 1, data = Temp_Data[,-ncol(Temp_Data)]) + lm.uwlev <- stats::lm(y ~ . - 1, data = Temp_Data) beta.prop<-stats::coefficients(lm.uwlev) - Xbeta_Final<-as.vector(X%*%beta.prop) + Xbeta_Final<-X%*%beta.prop Var.prop<-sum((Y-Xbeta_Final)^2)/N Sample.uwlev[[i]]<-idx.blev @@ -179,17 +179,17 @@ LeverageSampling<-function(r,Y,X,N,S_alpha,family){ } # shrinkage leverage sampling - idx.slev <- sample(N, size = r[i], replace = TRUE, PI.slev) + idx.slev <- sample(N, size = r[i], replace = TRUE, prob = PI.slev) - x.slev <- as.matrix(X[idx.slev,]) - y.slev <- as.matrix(Y[idx.slev]) + x.slev <- X[idx.slev,] + y.slev <- Y[idx.slev] wgt <- 1 / PI.slev[idx.slev] - Temp_Data <- data.frame(y = y.slev,x.slev, wgt = wgt) - lm.slev <- stats::lm(y ~ . - 1, weights = wgt, data = Temp_Data[,-ncol(Temp_Data)]) + Temp_Data <- data.frame(y = y.slev,x.slev) + lm.slev <- stats::lm(y ~ . - 1, weights = wgt, data = Temp_Data) beta.prop<-stats::coefficients(lm.slev) - Xbeta_Final<-as.vector(X%*%beta.prop) + Xbeta_Final<-X%*%beta.prop Var.prop<-sum((Y-Xbeta_Final)^2)/N Sample.slev[[i]]<-idx.slev @@ -233,7 +233,7 @@ LeverageSampling<-function(r,Y,X,N,S_alpha,family){ n0 <- N - n1 PI.prop <- rep(1/(2*n0), N) PI.prop[Y==1] <- 1/(2*n1) - idx.prop <- sample(1:N, r1, T, PI.prop) + idx.prop <- sample(1:N, size = r1, replace = TRUE, prob = PI.prop) x.prop <- X[idx.prop,] y.prop <- Y[idx.prop,] @@ -247,21 +247,9 @@ LeverageSampling<-function(r,Y,X,N,S_alpha,family){ p.prop<-1 - 1 / (1 + exp(X%*% beta.prop)) w.prop<-as.vector(sqrt(p.prop*(1-p.prop))) X_bar<- w.prop*X - XX_Inv<-solve(t(X_bar)%*%X_bar) + XX_Inv<-solve(crossprod(X_bar)) - # X_Temp<-X - # PP <-apply(X_Temp,1,function(X_Temp){ - # P.prop <- 1 - 1 / (1 + exp(X_Temp%*% beta.prop)) - # W.prop <- sqrt(P.prop*(1-P.prop)) - # x_bar <- W.prop%*%X_Temp - # x_bar%*%XX_Inv%*%t(x_bar) - # }) - - # Precompute terms outside the loop - P.prop <- 1 - 1 / (1 + exp(X %*% beta.prop)) - W.prop <- sqrt(P.prop * (1 - P.prop)) - X_bar_s <- as.vector(W.prop) * X - PP <- rowSums((X_bar_s %*% XX_Inv) * X_bar_s) + PP <- rowSums((X_bar %*% XX_Inv) * X_bar) PI.blev <- PP / sum(PP) PI.slev <- S_alpha * PI.blev + (1-S_alpha) * 1 / N @@ -278,8 +266,8 @@ LeverageSampling<-function(r,Y,X,N,S_alpha,family){ # basic leverage sampling idx.blev <- sample(N, size = r[i], replace = TRUE, PI.blev) - x.blev <- as.matrix(X[idx.blev,]) - y.blev <- as.matrix(Y[idx.blev]) + x.blev <- X[idx.blev,] + y.blev <- Y[idx.blev] wgt <- 1 / PI.blev[idx.blev] fit.blev <- .getMLE(x=x.blev, y=as.vector(y.blev), w=wgt) @@ -306,8 +294,8 @@ LeverageSampling<-function(r,Y,X,N,S_alpha,family){ # shrinkage leverage sampling idx.slev <- sample(1:N, size = r[i], replace = TRUE, PI.slev) - x.slev <- as.matrix(X[idx.slev,]) - y.slev <- as.matrix(Y[idx.slev]) + x.slev <- X[idx.slev,] + y.slev <- Y[idx.slev] wgt <- 1 / PI.slev[idx.slev] fit.slev <- .getMLE(x=x.slev, 
y=as.vector(y.slev), w=wgt) @@ -343,35 +331,26 @@ LeverageSampling<-function(r,Y,X,N,S_alpha,family){ if(family %in% c("poisson")){ r1<-round(min(r/2)) PI.prop <- rep(1/N, N) - idx.prop <- sample(1:N, r1, T) + idx.prop <- sample(1:N, size = r1, replace = TRUE) - x.prop<-X[idx.prop,] + x.prop <- X[idx.prop,] y.prop <- Y[idx.prop,] pinv.prop <- N pinv.prop <- 1/PI.prop[idx.prop] - fit.prop <- stats::glm(y.prop~x.prop-1,family = "poisson") + fit.prop <- stats::glm(y.prop~x.prop-1,family = "quasipoisson") beta.prop <- fit.prop$coefficients if(anyNA(beta.prop)){ stop("There are NA or NaN values in the model parameters") } - p.prop<-as.vector(sqrt(exp(X%*% beta.prop))) - X_bar<- p.prop*X - XX_Inv<-solve(t(X_bar)%*%X_bar) - - # X_Temp<-X; - # PP <-apply(X_Temp,1,function(X_Temp){ - # P.prop <- sqrt(exp(X_Temp%*%beta.prop)) - # x_bar <- P.prop%*%X_Temp - # x_bar%*%XX_Inv%*%t(x_bar) - # }) + p.prop<-exp(X%*% beta.prop) + w.prop<-as.vector(sqrt(p.prop)) + X_bar<- w.prop*X + XX_Inv<-solve(crossprod(X_bar)) # Precompute terms outside the loop - P.prop <- 1 - 1 / (1 + exp(X %*% beta.prop)) - W.prop <- sqrt(P.prop * (1 - P.prop)) - X_bar_s <- as.vector(W.prop) * X - PP <- rowSums((X_bar_s %*% XX_Inv) * X_bar_s) + PP <- rowSums((X_bar %*% XX_Inv) * X_bar) PI.blev <- PP / sum(PP) PI.slev <- S_alpha * PI.blev + (1-S_alpha) * 1 / N @@ -386,13 +365,13 @@ LeverageSampling<-function(r,Y,X,N,S_alpha,family){ for (i in 1:length(r)) { # basic leverage sampling - idx.blev <- sample(1:N, size = r[i], replace = TRUE, PI.blev) + idx.blev <- sample(1:N, size = r[i], replace = TRUE, prob = PI.blev) - x.blev <- as.matrix(X[idx.blev,]) - y.blev <- as.matrix(Y[idx.blev]) + x.blev <- X[idx.blev,] + y.blev <- Y[idx.blev] wgt <- 1 / PI.blev[idx.blev] - fit.blev <-stats::glm(y.blev~x.blev-1, family = "poisson",weights=wgt) + fit.blev <-stats::glm(y.blev~x.blev-1, family = "quasipoisson",weights=wgt) beta.prop <- fit.blev$coefficients Sample.blev[[i]]<-idx.blev @@ -403,7 +382,7 @@ LeverageSampling<-function(r,Y,X,N,S_alpha,family){ } # unweighted leverage sampling - fit.uwlev <- stats::glm(y.blev~x.blev-1, family = "poisson") + fit.uwlev <- stats::glm(y.blev~x.blev-1, family = "quasipoisson") beta.prop<-fit.uwlev$coefficients Sample.uwlev[[i]]<-idx.blev @@ -414,13 +393,13 @@ LeverageSampling<-function(r,Y,X,N,S_alpha,family){ } # shrinkage leverage sampling - idx.slev <- sample(1:N, size = r[i], replace = TRUE, PI.slev) + idx.slev <- sample(1:N, size = r[i], replace = TRUE, prob = PI.slev) - x.slev <- as.matrix(X[idx.slev,]) - y.slev <- as.matrix(Y[idx.slev]) + x.slev <- X[idx.slev,] + y.slev <- Y[idx.slev] wgt <- 1 / PI.slev[idx.slev] - fit.slev <- stats::glm(y.slev~x.slev-1, family = "poisson",weights = wgt) + fit.slev <- stats::glm(y.slev~x.slev-1, family = "quasipoisson",weights = wgt) beta.prop <- fit.slev$coefficients Sample.slev[[i]]<-idx.slev diff --git a/R/modelMissLinSub.R b/R/modelMissLinSub.R index 2b3498d..c5e7f29 100644 --- a/R/modelMissLinSub.R +++ b/R/modelMissLinSub.R @@ -97,7 +97,9 @@ #' @import stats #' @import foreach #' @importFrom gam s +#' @importFrom gam lo #' @importFrom Rfast rowprods +#' @importFrom utils combn #' @importFrom psych tr #' @export modelMissLinSub <- function(r1,r2,Y,X,N,Alpha,proportion){ @@ -132,31 +134,37 @@ modelMissLinSub <- function(r1,r2,Y,X,N,Alpha,proportion){ message("50% or >=50% of the big data is used to help find AMSE for the subsamples, \nthis could take some time.") } - idx.prop <- sample(1:N, r1, T) + main_effects <- paste0("s(X", 1:ncol(X[, -1]), ")") + + 
if(ncol(X[,-1]) == 2 ){ + two_way_interactions <- utils::combn(colnames(X[, -1]), 2, + function(cols){paste0("lo(", paste(cols, collapse = "*"), ")")}) + + my_formula<-stats::as.formula(paste("Y ~ ",paste(main_effects,collapse = " + "),"+", + paste(two_way_interactions,collapse = " + "))) + } else{ + my_formula<-stats::as.formula(paste("Y ~ ",paste(main_effects,collapse = " + "))) + } + + idx.prop <- sample(1:N, size = r1, replace = TRUE) x.prop <- X[idx.prop,] y.prop <- Y[idx.prop] - beta.prop<-solve(a=t(x.prop)%*%x.prop,b=t(x.prop)%*%y.prop) - Xbeta_Final<-as.vector(X%*%beta.prop) + beta.prop<-solve(a=crossprod(x.prop),b=crossprod(x.prop,y.prop)) + Xbeta_Final<-X%*%beta.prop Var.prop<-sum((Y-Xbeta_Final)^2)/N - - Xbeta.prop<-X%*%beta.prop - Epsilon.prop<-Y-Xbeta.prop - - my_formula<-stats::as.formula(paste("Y ~ ",paste(paste0("s(X",1:ncol(x.prop[,-1]),")"),collapse = " + "),"+", - paste(paste0("s(",paste0(colnames(x.prop[,-1]),collapse = "*"),")"), - collapse = " + "))) + Epsilon.prop<-Y-Xbeta_Final #calculate f_hat Assumed_Data<-data.frame(Y=y.prop,x.prop) fit_GAM<-gam::gam(my_formula,data=Assumed_Data) Xbeta_GAM<-gam::predict.Gam(fit_GAM,newdata = data.frame(X)) - f_estimate<-Xbeta_GAM-Xbeta.prop + f_estimate<-Xbeta_GAM-Xbeta_Final Var_GAM.prop<-sum((Y-Xbeta_GAM)^2)/N if(proportion*N != r1){ - idx.proportion <- sample(1:N, ceiling(proportion*N), T) + idx.proportion <- sample(1:N, size = ceiling(proportion*N), replace = TRUE) Y_proportion<-Y[idx.proportion] X_proportion<-X[idx.proportion,] @@ -164,14 +172,13 @@ modelMissLinSub <- function(r1,r2,Y,X,N,Alpha,proportion){ fit_GAM_Proportion<-gam::gam(my_formula,data=Proportion_Data) Xbeta_GAM_Proportion<-gam::predict.Gam(fit_GAM_Proportion,newdata = data.frame(X)) - beta_proportion<-solve(a=t(X_proportion)%*%X_proportion,b=t(X_proportion)%*%Y_proportion) + beta_proportion<-solve(a=crossprod(X_proportion),b=crossprod(X_proportion,Y_proportion)) Xbeta_proportion<-X%*%beta_proportion Var_GAM_Full<-sum((Y-Xbeta_GAM_Proportion)^2)/N Var_Full<-sum((Y-Xbeta_proportion)^2)/N F_Estimate_Full<-Xbeta_GAM_Proportion-Xbeta_proportion - } - else{ + } else { Var_GAM_Full<-Var_GAM.prop ; Var_Full<-Var.prop ; F_Estimate_Full<-f_estimate } @@ -180,7 +187,7 @@ modelMissLinSub <- function(r1,r2,Y,X,N,Alpha,proportion){ PI.mVc <- PI.mVc / sum(PI.mVc) # mMSE - PI.mMSE <- sqrt(Epsilon.prop^2 * rowSums((X %*% solve(t(X)%*%X))^2)) + PI.mMSE <- sqrt(Epsilon.prop^2 * rowSums((X %*% solve(crossprod(X)))^2)) PI.mMSE <- PI.mMSE / sum(PI.mMSE) Tempsy_Var_Gam_Var<-Var_GAM.prop*Var.prop^(-2) @@ -192,7 +199,8 @@ modelMissLinSub <- function(r1,r2,Y,X,N,Alpha,proportion){ f_r1 <- f_estimate[c(idx.prop, a)] Temp_Solve<-solve(crossprod(X_r1)) - Temp1 <- tcrossprod(X_r1 %*% Temp_Solve, X_r1) + Temp_Xr1_Solve<-X_r1 %*% Temp_Solve + Temp1 <- tcrossprod(Temp_Xr1_Solve,X_r1) L1_r1 <- Tempsy_Var_Gam_Var*psych::tr(Temp1) Temp_f_r1 <- Temp1 %*% f_r1 @@ -202,9 +210,9 @@ modelMissLinSub <- function(r1,r2,Y,X,N,Alpha,proportion){ L1_r1 + L2_r1 } - Temp1<-X[idx.prop,]%*%solve(t(X[idx.prop,])%*%X[idx.prop,])%*%t(X[idx.prop,]) + Temp1<-X[idx.prop,]%*%solve(crossprod(X[idx.prop,]))%*%t(X[idx.prop,]) L1_r1 <- Tempsy_Var_Gam_Var*psych::tr(Temp1) - L2_r1 <- sum((Var.prop^(-1)*(Temp1%*%f_estimate[idx.prop]-f_estimate[idx.prop]))^2) + L2_r1 <- sum((Var.prop^(-1)*(Temp1%*%f_estimate[idx.prop]-f_estimate[idx.prop]))^2) L_All_Temp<-L1_r1+L2_r1 L_All_Final<-abs(L_All_Temp-L_All) @@ -258,10 +266,12 @@ modelMissLinSub <- function(r1,r2,Y,X,N,Alpha,proportion){ message("Step 1 of the algorithm 
completed.\n") + VarFull_Inv<-Var_Full^(-1) + for (i in 1:length(r2)) { # mVc - idx.mVc <- sample(1:N, r2[i]-r1, T, PI.mVc) + idx.mVc <- sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI.mVc) x.mVc <- X[c(idx.mVc, idx.prop),] y.mVc <- Y[c(idx.mVc, idx.prop)] @@ -270,21 +280,21 @@ modelMissLinSub <- function(r1,r2,Y,X,N,Alpha,proportion){ pi4_r<-sqrt(r2[i]*w.mVc^(-1)) X_r4<-x.mVc/pi4_r Y_r4<-y.mVc/pi4_r - beta_mVc[i,]<-c(r2[i],solve(a=t(X_r4)%*%X_r4,b=t(X_r4)%*%Y_r4)) - Xbeta_Final<-as.vector(X%*%beta_mVc[i,-1]) + beta_mVc[i,]<-c(r2[i],solve(a=crossprod(X_r4),b=crossprod(X_r4,Y_r4))) + Xbeta_Final<-X%*%beta_mVc[i,-1] Var_Epsilon[i,2]<-sum((Y-Xbeta_Final)^2)/N - Temp1<-x.mVc%*%solve(t(x.mVc)%*%x.mVc)%*%t(x.mVc) + Temp1<-x.mVc%*%solve(crossprod(x.mVc))%*%t(x.mVc) L1_r1 <- Tempsy_Var_Gam_Var_AMSE*psych::tr(Temp1) - L2_r1 <- sum((Var_Full^(-1)*(Temp1%*%F_Estimate_Full[c(idx.mVc,idx.prop)] - - F_Estimate_Full[c(idx.mVc, idx.prop)]))^2) + L2_r1 <- sum((VarFull_Inv*(Temp1%*%F_Estimate_Full[c(idx.mVc,idx.prop)] - + F_Estimate_Full[c(idx.mVc, idx.prop)]))^2) AMSE_Sample_mVc[i,]<-c(r2[i],L1_r1,L2_r1,L1_r1+L2_r1) idx.mVc->Sample.mVc[[i+1]] # mMSE - idx.mMSE <- sample(1:N, r2[i]-r1, T, PI.mMSE) + idx.mMSE <- sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI.mMSE) x.mMSE <- X[c(idx.mMSE, idx.prop),] y.mMSE <- Y[c(idx.mMSE, idx.prop)] @@ -293,21 +303,21 @@ modelMissLinSub <- function(r1,r2,Y,X,N,Alpha,proportion){ pi4_r<-sqrt(r2[i]*w.mMSE^(-1)) X_r4<-x.mMSE/pi4_r Y_r4<-y.mMSE/pi4_r - beta_mMSE[i,]<-c(r2[i],solve(a=t(X_r4)%*%X_r4,b=t(X_r4)%*%Y_r4)) - Xbeta_Final<-as.vector(X%*%beta_mMSE[i,-1]) + beta_mMSE[i,]<-c(r2[i],solve(a=crossprod(X_r4),b=crossprod(X_r4,Y_r4))) + Xbeta_Final<-X%*%beta_mMSE[i,-1] Var_Epsilon[i,3]<-sum((Y-Xbeta_Final)^2)/N - Temp1<-x.mMSE%*%solve(t(x.mMSE)%*%x.mMSE)%*%t(x.mMSE) + Temp1<-x.mMSE%*%solve(crossprod(x.mMSE))%*%t(x.mMSE) L1_r1 <- Tempsy_Var_Gam_Var_AMSE*psych::tr(Temp1) - L2_r1 <- sum((Var_Full^(-1)*(Temp1%*%F_Estimate_Full[c(idx.mMSE,idx.prop)]- - F_Estimate_Full[c(idx.mMSE, idx.prop)]))^2) + L2_r1 <- sum((VarFull_Inv*(Temp1%*%F_Estimate_Full[c(idx.mMSE,idx.prop)]- + F_Estimate_Full[c(idx.mMSE, idx.prop)]))^2) AMSE_Sample_mMSE[i,]<-c(r2[i],L1_r1,L2_r1,L1_r1+L2_r1) idx.mMSE->Sample.mMSE[[i+1]] # RLmAMSE - idx.RLmAMSE <- sample(1:N, r2[i], T, PI.RLmAMSE) + idx.RLmAMSE <- sample(1:N, size = r2[i], replace = TRUE, prob = PI.RLmAMSE) x.RLmAMSE <- X[c(idx.RLmAMSE),] y.RLmAMSE <- Y[c(idx.RLmAMSE)] @@ -316,14 +326,14 @@ modelMissLinSub <- function(r1,r2,Y,X,N,Alpha,proportion){ pi4_r<-sqrt(r2[i]*w.RLmAMSE^(-1)) X_r4<-x.RLmAMSE/pi4_r Y_r4<-y.RLmAMSE/pi4_r - beta_RLmAMSE[i,]<-c(r2[i],solve(a=t(X_r4)%*%X_r4,b=t(X_r4)%*%Y_r4)) - Xbeta_Final<-as.vector(X%*%beta_RLmAMSE[i,-1]) + beta_RLmAMSE[i,]<-c(r2[i],solve(a=crossprod(X_r4),b=crossprod(X_r4,Y_r4))) + Xbeta_Final<-X%*%beta_RLmAMSE[i,-1] Var_Epsilon[i,4]<-sum((Y-Xbeta_Final)^2)/N - Temp1<-x.RLmAMSE%*%solve(t(x.RLmAMSE)%*%x.RLmAMSE)%*%t(x.RLmAMSE) + Temp1<-x.RLmAMSE%*%solve(crossprod(x.RLmAMSE))%*%t(x.RLmAMSE) L1_r1 <- Tempsy_Var_Gam_Var_AMSE*psych::tr(Temp1) - L2_r1 <- sum((Var_Full^(-1)*(Temp1%*%F_Estimate_Full[c(idx.RLmAMSE)] - - F_Estimate_Full[c(idx.RLmAMSE)]))^2) + L2_r1 <- sum((VarFull_Inv*(Temp1%*%F_Estimate_Full[c(idx.RLmAMSE)] - + F_Estimate_Full[c(idx.RLmAMSE)]))^2) AMSE_Sample_RLmAMSE[i,]<-c(r2[i],L1_r1,L2_r1,L1_r1+L2_r1) @@ -332,7 +342,7 @@ modelMissLinSub <- function(r1,r2,Y,X,N,Alpha,proportion){ for(j in 1:length(Alpha)) { # RLmAMSE Log Odds - idx.RLmAMSE <- sample(1:N, r2[i], T, PI.RLmAMSE_LO[,j]) + 
idx.RLmAMSE <- sample(1:N, size = r2[i], replace = TRUE, prob = PI.RLmAMSE_LO[,j]) x.RLmAMSE <- X[c(idx.RLmAMSE),] y.RLmAMSE <- Y[c(idx.RLmAMSE)] @@ -341,21 +351,21 @@ modelMissLinSub <- function(r1,r2,Y,X,N,Alpha,proportion){ pi4_r<-sqrt(r2[i]*w.RLmAMSE^(-1)) X_r4<-x.RLmAMSE/pi4_r Y_r4<-y.RLmAMSE/pi4_r - beta_RLmAMSE_LO[[j]][i,]<-c(r2[i],solve(a=t(X_r4)%*%X_r4,b=t(X_r4)%*%Y_r4)) - Xbeta_Final<-as.vector(X%*%beta_RLmAMSE_LO[[j]][i,-1]) + beta_RLmAMSE_LO[[j]][i,]<-c(r2[i],solve(a=crossprod(X_r4),b=crossprod(X_r4,Y_r4))) + Xbeta_Final<-X%*%beta_RLmAMSE_LO[[j]][i,-1] Var_RLmAMSE_LO[i,j]<-sum((Y-Xbeta_Final)^2)/N - Temp1<-x.RLmAMSE%*%solve(t(x.RLmAMSE)%*%x.RLmAMSE)%*%t(x.RLmAMSE) + Temp1<-x.RLmAMSE%*%solve(crossprod(x.RLmAMSE))%*%t(x.RLmAMSE) L1_r1 <- Tempsy_Var_Gam_Var_AMSE*psych::tr(Temp1) - L2_r1 <- sum((Var_Full^(-1)*(Temp1%*%F_Estimate_Full[c(idx.RLmAMSE)]- - F_Estimate_Full[c(idx.RLmAMSE)]))^2) + L2_r1 <- sum((VarFull_Inv*(Temp1%*%F_Estimate_Full[c(idx.RLmAMSE)]- + F_Estimate_Full[c(idx.RLmAMSE)]))^2) AMSE_Sample_RLmAMSE_LO[[j]][i,]<-c(r2[i],L1_r1,L2_r1,L1_r1+L2_r1) idx.RLmAMSE->Sample.RLmAMSE_LO[[j]][[i+1]] # Model robust RLmAMSE # RLmAMSE Power - idx.RLmAMSE <- sample(1:N, r2[i], T, PI.RLmAMSE_Pow[,j]) + idx.RLmAMSE <- sample(1:N, size = r2[i], replace = TRUE, prob = PI.RLmAMSE_Pow[,j]) x.RLmAMSE <- X[c(idx.RLmAMSE),] y.RLmAMSE <- Y[c(idx.RLmAMSE)] @@ -364,14 +374,14 @@ modelMissLinSub <- function(r1,r2,Y,X,N,Alpha,proportion){ pi4_r<-sqrt(r2[i]*w.RLmAMSE^(-1)) X_r4<-x.RLmAMSE/pi4_r Y_r4<-y.RLmAMSE/pi4_r - beta_RLmAMSE_Pow[[j]][i,]<-c(r2[i],solve(a=t(X_r4)%*%X_r4,b=t(X_r4)%*%Y_r4)) - Xbeta_Final<-as.vector(X%*%beta_RLmAMSE_Pow[[j]][i,-1]) + beta_RLmAMSE_Pow[[j]][i,]<-c(r2[i],solve(a=crossprod(X_r4),b=crossprod(X_r4,Y_r4))) + Xbeta_Final<-X%*%beta_RLmAMSE_Pow[[j]][i,-1] Var_RLmAMSE_Pow[i,j]<-sum((Y-Xbeta_Final)^2)/N - Temp1<-x.RLmAMSE%*%solve(t(x.RLmAMSE)%*%x.RLmAMSE)%*%t(x.RLmAMSE) + Temp1<-x.RLmAMSE%*%solve(crossprod(x.RLmAMSE))%*%t(x.RLmAMSE) L1_r1 <- Tempsy_Var_Gam_Var_AMSE*psych::tr(Temp1) - L2_r1 <- sum((Var_Full^(-1)*(Temp1%*%F_Estimate_Full[c(idx.RLmAMSE)] - - F_Estimate_Full[c(idx.RLmAMSE)]))^2) + L2_r1 <- sum((VarFull_Inv*(Temp1%*%F_Estimate_Full[c(idx.RLmAMSE)] - + F_Estimate_Full[c(idx.RLmAMSE)]))^2) AMSE_Sample_RLmAMSE_Pow[[j]][i,]<-c(r2[i],L1_r1,L2_r1,L1_r1+L2_r1) @@ -403,15 +413,15 @@ modelMissLinSub <- function(r1,r2,Y,X,N,Alpha,proportion){ Var_Data<-cbind.data.frame(Var_Epsilon,Var_RLmAMSE_LO,Var_RLmAMSE_Pow) colnames(Var_Data)<-c("r2","A-Optimality","L-Optimality","RLmAMSE", - paste0("RLmAMSE Log Odds ",Alpha), - paste0("RLmAMSE Power ",Alpha)) + paste0("RLmAMSE Log Odds ",Alpha), + paste0("RLmAMSE Power ",Alpha)) # AMSE Sample Data AMSE_Sample_Data<-cbind.data.frame("Method"=rep(Sampling_Methods,each=length(r2)), - rbind(AMSE_Sample_mMSE,AMSE_Sample_mVc, - AMSE_Sample_RLmAMSE, - do.call(rbind,AMSE_Sample_RLmAMSE_LO), - do.call(rbind,AMSE_Sample_RLmAMSE_Pow))) + rbind(AMSE_Sample_mMSE,AMSE_Sample_mVc, + AMSE_Sample_RLmAMSE, + do.call(rbind,AMSE_Sample_RLmAMSE_LO), + do.call(rbind,AMSE_Sample_RLmAMSE_Pow))) colnames(AMSE_Sample_Data)[-1]<-c("r2","Variance","Bias.2","AMSE") AMSE_Sample_Data[,-c(1,2)]<-AMSE_Sample_Data[,-c(1,2)]/r2 diff --git a/R/modelMissLogSub.R b/R/modelMissLogSub.R index 7e7187c..70416c7 100644 --- a/R/modelMissLogSub.R +++ b/R/modelMissLogSub.R @@ -96,6 +96,7 @@ #' @import foreach #' @importFrom gam s #' @importFrom Rfast rowprods +#' @importFrom utils combn #' @importFrom psych tr #' @export modelMissLogSub <- 
function(r1,r2,Y,X,N,Alpha,proportion){ @@ -130,11 +131,23 @@ modelMissLogSub <- function(r1,r2,Y,X,N,Alpha,proportion){ message("50% or >=50% of the big data is used to help find AMSE for the subsamples, \nthis could take some time.") } + main_effects <- paste0("s(X", 1:ncol(X[, -1]), ")") + + if(ncol(X[,-1]) == 2 ){ + two_way_interactions <- utils::combn(colnames(X[, -1]), 2, + function(cols){paste0("lo(", paste(cols, collapse = "*"), ")")}) + + my_formula<-stats::as.formula(paste("Y ~ ",paste(main_effects,collapse = " + "),"+", + paste(two_way_interactions,collapse = " + "))) + } else{ + my_formula<-stats::as.formula(paste("Y ~ ",paste(main_effects,collapse = " + "))) + } + n1 <- sum(Y) n0 <- N - n1 PI.prop <- rep(1/(2*n0), N) PI.prop[Y==1] <- 1/(2*n1) - idx.prop <- sample(1:N, r1, T, PI.prop) + idx.prop <- sample(1:N, size = r1, replace = TRUE, prob = PI.prop) x.prop <- X[idx.prop,] y.prop <- Y[idx.prop] @@ -146,20 +159,17 @@ modelMissLogSub <- function(r1,r2,Y,X,N,Alpha,proportion){ if (anyNA(beta.prop)){ stop("There are NA or NaN values in the model parameters") } - P.prop <- 1 - 1 / (1 + exp(X %*% beta.prop)) - - my_formula<-stats::as.formula(paste("Y ~ ",paste(paste0("s(X",1:ncol(x.prop[,-1]),")"),collapse = " + "),"+", - paste(paste0("s(",paste0(colnames(x.prop[,-1]),collapse = "*"),")"), - collapse = " + "))) + Xbeta_Final<-X %*% beta.prop + P.prop <- 1 - 1 / (1 + exp(Xbeta_Final)) # calculate f_hat Assumed_Data<-data.frame(Y=y.prop,x.prop) fit_GAM<-gam::gam(my_formula,data=Assumed_Data,family = "binomial") Xbeta_GAM<-gam::predict.Gam(fit_GAM,newdata = data.frame(X)) - f_estimate<-Xbeta_GAM - X%*%beta.prop + f_estimate<-Xbeta_GAM - Xbeta_Final if(proportion*N != r1){ - idx.proportion <- sample(1:N, ceiling(proportion*N), T, PI.prop) + idx.proportion <- sample(1:N, size = ceiling(proportion*N), replace = TRUE, prob = PI.prop) Y_proportion <- Y[idx.proportion] X_proportion <- X[idx.proportion,] @@ -175,8 +185,7 @@ modelMissLogSub <- function(r1,r2,Y,X,N,Alpha,proportion){ F_Estimate_Full<-Xbeta_GAM_proportion - Xbeta_proportion Beta_Estimate_Full<-beta.proportion - } - else { + } else { Beta_Estimate_Full<- beta.prop ; F_Estimate_Full<-f_estimate } @@ -187,7 +196,7 @@ modelMissLogSub <- function(r1,r2,Y,X,N,Alpha,proportion){ # mMSE p.prop <- P.prop[idx.prop] w.prop <- p.prop * (1 - p.prop) - W.prop <- solve(t(x.prop) %*% (x.prop * w.prop * pinv.prop)) + W.prop <- solve(crossprod(x.prop, x.prop * w.prop * pinv.prop)) PI.mMSE <- sqrt((Y - P.prop)^2 * rowSums((X%*%W.prop)^2)) PI.mMSE <- PI.mMSE / sum(PI.mMSE) @@ -214,18 +223,18 @@ modelMissLogSub <- function(r1,r2,Y,X,N,Alpha,proportion){ diff <- XH_b_r1 - f_r1 L2_r1 <- sum((W_r1 * diff)^2) - c(L1_r1+L2_r1) + L1_r1+L2_r1 } p_r1<-1-1/(1+exp(X[idx.prop,]%*%beta.prop)) W_r1<-as.vector(p_r1*(1-p_r1)) - H_r1 <-solve(t(X[idx.prop,]) %*% (X[idx.prop,] * W_r1)) + H_r1 <-solve(crossprod(X[idx.prop,], X[idx.prop,] * W_r1)) Temp1<-(W_r1*X[idx.prop,])%*%H_r1 p_Tr1<-1-1/(1+exp((X[idx.prop,] %*% beta.prop) + f_estimate[idx.prop])) W_Tr1<-as.vector(p_Tr1*(1-p_Tr1)) - H_Tr1 <-(t(X[idx.prop,]) %*% (X[idx.prop,] * W_Tr1)) - b_r1 <-(t(X[idx.prop,]) %*% (p_Tr1-p_r1)) + H_Tr1 <-crossprod(X[idx.prop,], X[idx.prop,] * W_Tr1) + b_r1 <-crossprod(X[idx.prop,], p_Tr1-p_r1) L1_r1 <- psych::tr(Temp1%*%H_Tr1%*%t(Temp1)) L2_r1 <- sum((W_r1*((X[idx.prop,]%*%H_r1%*%b_r1)-f_estimate[idx.prop]))^2) @@ -276,7 +285,7 @@ modelMissLogSub <- function(r1,r2,Y,X,N,Alpha,proportion){ for (i in 1:length(r2)) { # mVc - idx.mVc <- sample(1:N, r2[i]-r1, T, PI.mVc) + idx.mVc <- 
sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI.mVc) x.mVc <- X[c(idx.mVc, idx.prop),] y.mVc <- Y[c(idx.mVc, idx.prop)] @@ -289,20 +298,20 @@ modelMissLogSub <- function(r1,r2,Y,X,N,Alpha,proportion){ p_r1<-1-1/(1+exp(x.mVc%*%Beta_Estimate_Full)) W_r1<-as.vector(p_r1*(1-p_r1)) - H_r1 <-solve(t(x.mVc) %*% (x.mVc * W_r1)) + H_r1 <-solve(crossprod(x.mVc, x.mVc * W_r1)) Temp1<-(W_r1*x.mVc)%*%H_r1 p_Tr1<-1-1/(1+exp((x.mVc %*% Beta_Estimate_Full) + F_Estimate_Full[c(idx.mVc, idx.prop)])) W_Tr1<-as.vector(p_Tr1*(1-p_Tr1)) - H_Tr1 <-(t(x.mVc) %*% (x.mVc * W_Tr1)) - b_r1 <-(t(x.mVc) %*% (p_Tr1-p_r1)) + H_Tr1 <-crossprod(x.mVc, x.mVc * W_Tr1) + b_r1 <-crossprod(x.mVc, p_Tr1-p_r1) L1_r1 <- psych::tr(Temp1%*%H_Tr1%*%t(Temp1)) L2_r1 <- sum((W_r1*((x.mVc%*%H_r1%*%b_r1) - F_Estimate_Full[c(idx.mVc, idx.prop)]))^2) AMSE_Sample_mVc[i,]<-c(r2[i],L1_r1,L2_r1,L1_r1+L2_r1) # mMSE - idx.mMSE <- sample(1:N, r2[i]-r1, T, PI.mMSE) + idx.mMSE <- sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI.mMSE) x.mMSE <- X[c(idx.mMSE, idx.prop),] y.mMSE <- Y[c(idx.mMSE, idx.prop)] @@ -315,20 +324,20 @@ modelMissLogSub <- function(r1,r2,Y,X,N,Alpha,proportion){ p_r1<-1-1/(1+exp(x.mMSE%*%Beta_Estimate_Full)) W_r1<-as.vector(p_r1*(1-p_r1)) - H_r1 <-solve(t(x.mMSE) %*% (x.mMSE * W_r1)) + H_r1 <-solve(crossprod(x.mMSE, x.mMSE * W_r1)) Temp1<-(W_r1 * x.mMSE)%*%H_r1 p_Tr1<-1-1/(1+exp((x.mMSE %*% Beta_Estimate_Full) + F_Estimate_Full[c(idx.mMSE, idx.prop)])) W_Tr1<-as.vector(p_Tr1*(1-p_Tr1)) - H_Tr1 <-(t(x.mMSE) %*% (x.mMSE * W_Tr1)) - b_r1 <-(t(x.mMSE) %*% (p_Tr1-p_r1)) + H_Tr1 <-crossprod(x.mMSE, x.mMSE * W_Tr1) + b_r1 <-crossprod(x.mMSE, p_Tr1-p_r1) L1_r1 <- psych::tr(Temp1%*%H_Tr1%*%t(Temp1)) L2_r1 <- sum((W_r1*((x.mMSE%*%H_r1%*%b_r1) - F_Estimate_Full[c(idx.mMSE, idx.prop)]))^2) AMSE_Sample_mMSE[i,]<-c(r2[i],L1_r1,L2_r1,L1_r1+L2_r1) # RLmAMSE - idx.RLmAMSE <- sample(1:N, r2[i], T, PI.RLmAMSE) + idx.RLmAMSE <- sample(1:N, size = r2[i], replace = TRUE, prob = PI.RLmAMSE) x.RLmAMSE <- X[c(idx.RLmAMSE),] y.RLmAMSE <- Y[c(idx.RLmAMSE)] @@ -341,13 +350,13 @@ modelMissLogSub <- function(r1,r2,Y,X,N,Alpha,proportion){ p_r1<-1-1/(1+exp(x.RLmAMSE%*%Beta_Estimate_Full)) W_r1<-as.vector(p_r1*(1-p_r1)) - H_r1 <-solve(t(x.RLmAMSE) %*% (x.RLmAMSE * W_r1)) + H_r1 <-solve(crossprod(x.RLmAMSE, x.RLmAMSE * W_r1)) Temp1<-(W_r1 * x.RLmAMSE)%*%H_r1 p_Tr1<-1-1/(1+exp((x.RLmAMSE %*% Beta_Estimate_Full) + F_Estimate_Full[c(idx.RLmAMSE)])) W_Tr1<-as.vector(p_Tr1*(1-p_Tr1)) - H_Tr1 <-(t(x.RLmAMSE) %*% (x.RLmAMSE * W_Tr1)) - b_r1 <-(t(x.RLmAMSE) %*% (p_Tr1-p_r1)) + H_Tr1 <-crossprod(x.RLmAMSE, x.RLmAMSE * W_Tr1) + b_r1 <- crossprod(x.RLmAMSE, p_Tr1-p_r1) L1_r1 <- psych::tr(Temp1%*%H_Tr1%*%t(Temp1)) L2_r1 <- sum((W_r1*((x.RLmAMSE%*%H_r1%*%b_r1) - F_Estimate_Full[c(idx.RLmAMSE)]))^2) @@ -356,7 +365,7 @@ modelMissLogSub <- function(r1,r2,Y,X,N,Alpha,proportion){ for (j in 1:length(Alpha)) { # RLmAMSE Log Odds - idx.RLmAMSE <- sample(1:N, r2[i], T, PI.RLmAMSE_LO[,j]) + idx.RLmAMSE <- sample(1:N, size = r2[i], replace = TRUE, prob = PI.RLmAMSE_LO[,j]) x.RLmAMSE <- X[c(idx.RLmAMSE),] y.RLmAMSE <- Y[c(idx.RLmAMSE)] @@ -369,20 +378,20 @@ modelMissLogSub <- function(r1,r2,Y,X,N,Alpha,proportion){ p_r1<-1-1/(1+exp(x.RLmAMSE%*%Beta_Estimate_Full)) W_r1<-as.vector(p_r1*(1-p_r1)) - H_r1 <-solve(t(x.RLmAMSE) %*% (x.RLmAMSE * W_r1)) + H_r1 <-solve(crossprod(x.RLmAMSE, x.RLmAMSE * W_r1)) Temp1<-(W_r1 * x.RLmAMSE)%*%H_r1 p_Tr1<-1-1/(1+exp((x.RLmAMSE %*% Beta_Estimate_Full) + F_Estimate_Full[c(idx.RLmAMSE)])) W_Tr1<-as.vector(p_Tr1*(1-p_Tr1)) - H_Tr1 
<-(t(x.RLmAMSE) %*% (x.RLmAMSE * W_Tr1)) - b_r1 <-(t(x.RLmAMSE) %*% (p_Tr1-p_r1)) + H_Tr1 <-crossprod(x.RLmAMSE, x.RLmAMSE * W_Tr1) + b_r1 <-crossprod(x.RLmAMSE, p_Tr1-p_r1) L1_r1 <- psych::tr(Temp1%*%H_Tr1%*%t(Temp1)) L2_r1 <- sum((W_r1*((x.RLmAMSE%*%H_r1%*%b_r1) - F_Estimate_Full[c(idx.RLmAMSE)]))^2) AMSE_Sample_RLmAMSE_LO[[j]][i,]<-c(r2[i],L1_r1,L2_r1,L1_r1+L2_r1) # RLmAMSE Power - idx.RLmAMSE <- sample(1:N, r2[i], T, PI.RLmAMSE_Pow[,j]) + idx.RLmAMSE <- sample(1:N, size = r2[i], replace = TRUE, prob = PI.RLmAMSE_Pow[,j]) x.RLmAMSE <- X[c(idx.RLmAMSE),] y.RLmAMSE <- Y[c(idx.RLmAMSE)] @@ -400,8 +409,8 @@ modelMissLogSub <- function(r1,r2,Y,X,N,Alpha,proportion){ p_Tr1<-1-1/(1+exp((x.RLmAMSE %*% Beta_Estimate_Full) + F_Estimate_Full[c(idx.RLmAMSE)])) W_Tr1<-as.vector(p_Tr1*(1-p_Tr1)) - H_Tr1 <-(t(x.RLmAMSE) %*% (x.RLmAMSE * W_Tr1)) - b_r1 <-(t(x.RLmAMSE) %*% (p_Tr1-p_r1)) + H_Tr1 <-crossprod(x.RLmAMSE, x.RLmAMSE * W_Tr1) + b_r1 <- crossprod(x.RLmAMSE, p_Tr1-p_r1) L1_r1 <- psych::tr(Temp1%*%H_Tr1%*%t(Temp1)) L2_r1 <- sum((W_r1*((x.RLmAMSE%*%H_r1%*%b_r1) - F_Estimate_Full[c(idx.RLmAMSE)]))^2) diff --git a/R/modelMissPoiSub.R b/R/modelMissPoiSub.R index b8dceb5..a71b6eb 100644 --- a/R/modelMissPoiSub.R +++ b/R/modelMissPoiSub.R @@ -97,6 +97,7 @@ #' @import foreach #' @importFrom gam s #' @importFrom Rfast rowprods +#' @importFrom utils combn #' @importFrom psych tr #' @export modelMissPoiSub <- function(r1,r2,Y,X,N,Alpha,proportion){ @@ -131,52 +132,60 @@ modelMissPoiSub <- function(r1,r2,Y,X,N,Alpha,proportion){ message("50% or >=50% of the big data is used to help find AMSE for the subsamples, \nthis could take some time.") } + main_effects <- paste0("s(X", 1:ncol(X[, -1]), ")") + + if(ncol(X[,-1]) %in% c(2:3) ){ + two_way_interactions <- utils::combn(colnames(X[, -1]), 2, + function(cols){paste0("lo(", paste(cols, collapse = "*"), ")")}) + + my_formula<-stats::as.formula(paste("Y ~ ",paste(main_effects,collapse = " + "),"+", + paste(two_way_interactions,collapse = " + "))) + } else { + my_formula<-stats::as.formula(paste("Y ~ ",paste(main_effects,collapse = " + "))) + } + PI.prop <- rep(1/N,N) - idx.prop <- sample(1:N, r1, T,PI.prop) + idx.prop <- sample(1:N, size = r1, replace = TRUE, prob = PI.prop) x.prop <- X[idx.prop,] y.prop <- Y[idx.prop] pinv.prop <- rep(N,r1) - fit.prop <- stats::glm(y.prop~x.prop-1,family="poisson") + fit.prop <- stats::glm(y.prop~x.prop-1,family="quasipoisson") beta.prop <- fit.prop$coefficients if (anyNA(beta.prop)){ stop("There are NA or NaN values in the model parameters") } - Lambda.prop <- exp(X %*% beta.prop) - - my_formula<-stats::as.formula(paste("Y ~ ",paste(paste0("gam::s(X",1:ncol(x.prop[,-1]),")"),collapse = " + "),"+", - paste(paste0("gam::s(",paste0(colnames(x.prop[,-1]),collapse = "*"),")"), - collapse = " + "))) + Xbeta_Final <- X %*% beta.prop + Lambda.prop <- exp(Xbeta_Final) #calculate f_hat Assumed_Data<-data.frame(Y=y.prop,x.prop) - fit_GAM<-gam::gam(my_formula,data=Assumed_Data,family = "poisson") + fit_GAM<-gam::gam(my_formula,data=Assumed_Data,family = "quasipoisson") Xbeta_GAM<-gam::predict.Gam(fit_GAM,newdata = data.frame(X)) Lambda_GAM<-exp(Xbeta_GAM) - f_estimate<-Xbeta_GAM - X %*% beta.prop + f_estimate<-Xbeta_GAM - Xbeta_Final if(proportion*N != r1){ - idx.proportion <- sample(1:N, ceiling(proportion*N), T, PI.prop) + idx.proportion <- sample(1:N, size = ceiling(proportion*N), replace = TRUE, prob = PI.prop) Y_proportion <- Y[idx.proportion] X_proportion <- X[idx.proportion,] pinv.proportion <- rep(N,ceiling(proportion*N)) - 
fit.proportion <- stats::glm(Y_proportion~X_proportion-1,family="poisson") + fit.proportion <- stats::glm(Y_proportion~X_proportion-1,family="quasipoisson") beta.proportion <- fit.proportion$coefficients Xbeta_proportion <- X %*% beta.proportion Assumed_Data<-data.frame(Y=Y_proportion,X_proportion) - fit_GAM_proportion<-gam::gam(my_formula,data=Assumed_Data,family = "poisson") + fit_GAM_proportion<-gam::gam(my_formula,data=Assumed_Data,family = "quasipoisson") Xbeta_GAM_proportion<-gam::predict.Gam(fit_GAM_proportion,newdata = data.frame(X)) F_Estimate_Full<-Xbeta_GAM_proportion - Xbeta_proportion Beta_Estimate_Full<-beta.proportion - } - else { + } else { Beta_Estimate_Full<- beta.prop ; F_Estimate_Full<-f_estimate } @@ -187,7 +196,7 @@ modelMissPoiSub <- function(r1,r2,Y,X,N,Alpha,proportion){ # mMSE lambda.prop <- Lambda.prop[idx.prop] w.prop <- lambda.prop - W.prop <- solve(t(x.prop) %*% (x.prop * w.prop * pinv.prop)) + W.prop <- solve(crossprod(x.prop, x.prop * w.prop * pinv.prop)) PI.mMSE <- sqrt((Y - Lambda.prop)^2 * rowSums((X%*%W.prop)^2)) PI.mMSE <- PI.mMSE / sum(PI.mMSE) @@ -197,14 +206,14 @@ modelMissPoiSub <- function(r1,r2,Y,X,N,Alpha,proportion){ X_r1<-X[c(idx.prop,a),] lambda_r1<-exp(X_r1%*%beta.prop) W_r1<-as.vector(lambda_r1) - H_r1 <-solve(t(X_r1) %*% (X_r1 * W_r1)) + H_r1 <-solve(crossprod(X_r1,X_r1 * W_r1)) Temp1<-(W_r1*X_r1)%*%H_r1 f_r1<-f_estimate[c(idx.prop,a)] lambda_Tr1<-exp((X_r1 %*% beta.prop) + f_r1) W_Tr1<-as.vector(lambda_Tr1) - H_Tr1 <-(t(X_r1) %*% (X_r1 * W_Tr1)) - b_r1 <-(t(X_r1) %*% (lambda_Tr1-lambda_r1)) + H_Tr1 <- crossprod(X_r1, X_r1 * W_Tr1) + b_r1 <- crossprod(X_r1, lambda_Tr1-lambda_r1) Temp1_H <- Temp1 %*% H_Tr1 diag_Temp <- rowSums(Temp1_H * Temp1) @@ -215,18 +224,18 @@ modelMissPoiSub <- function(r1,r2,Y,X,N,Alpha,proportion){ diff <- XH_b_r1 - f_r1 L2_r1 <- sum((W_r1 * diff)^2) - c(L1_r1+L2_r1) + L1_r1+L2_r1 } lambda_r1<-exp(X[idx.prop,]%*%beta.prop) W_r1<-as.vector(lambda_r1) - H_r1 <-solve(t(X[idx.prop,]) %*% (X[idx.prop,]*W_r1)) + H_r1 <-solve(crossprod(X[idx.prop,], X[idx.prop,]*W_r1)) Temp1<-(W_r1*X[idx.prop,])%*%H_r1 lambda_Tr1<-exp((X[idx.prop,] %*% beta.prop) + f_estimate[idx.prop]) W_Tr1<-as.vector(lambda_Tr1) - H_Tr1 <-(t(X[idx.prop,]) %*% (X[idx.prop,]*W_Tr1)) - b_r1 <-(t(X[idx.prop,]) %*% (lambda_Tr1-lambda_r1)) + H_Tr1 <- crossprod(X[idx.prop,], X[idx.prop,]*W_Tr1) + b_r1 <- crossprod(X[idx.prop,], lambda_Tr1-lambda_r1) L1_r1 <- psych::tr(Temp1%*%H_Tr1%*%t(Temp1)) L2_r1 <- sum((W_r1*((X[idx.prop,]%*%H_r1%*%b_r1)-f_estimate[idx.prop]))^2) @@ -277,12 +286,13 @@ modelMissPoiSub <- function(r1,r2,Y,X,N,Alpha,proportion){ for (i in 1:length(r2)) { # mVc - idx.mVc <- sample(1:N, r2[i]-r1, T, PI.mVc) + idx.mVc <- sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI.mVc) x.mVc <- X[c(idx.mVc, idx.prop),] y.mVc <- Y[c(idx.mVc, idx.prop)] - fit.mVc <- stats::glm(y.mVc~x.mVc-1,family="poisson",weights = c(1 / PI.mVc[idx.mVc], pinv.prop)) + fit.mVc <- stats::glm(y.mVc~x.mVc-1,family="quasipoisson", + weights = c(1 / PI.mVc[idx.mVc], pinv.prop)) beta_mVc[i,] <- c(r2[i],fit.mVc$coefficients) @@ -290,25 +300,26 @@ modelMissPoiSub <- function(r1,r2,Y,X,N,Alpha,proportion){ lambda_r1<-exp(x.mVc%*%Beta_Estimate_Full) W_r1<-as.vector(lambda_r1) - H_r1 <-solve(t(x.mVc) %*% (x.mVc * W_r1)) + H_r1 <-solve(crossprod(x.mVc, x.mVc * W_r1)) Temp1<-(W_r1 * x.mVc)%*%H_r1 lambda_Tr1<-exp((x.mVc %*% Beta_Estimate_Full) + F_Estimate_Full[c(idx.mVc, idx.prop)]) W_Tr1<-as.vector(lambda_Tr1) - H_Tr1 <-(t(x.mVc) %*% (x.mVc * W_Tr1)) - b_r1 <-(t(x.mVc) %*% 
(lambda_Tr1-lambda_r1)) + H_Tr1 <- crossprod(x.mVc, x.mVc * W_Tr1) + b_r1 <- crossprod(x.mVc, lambda_Tr1-lambda_r1) L1_r1 <- psych::tr(Temp1%*%H_Tr1%*%t(Temp1)) L2_r1 <- sum((W_r1*((x.mVc%*%H_r1%*%b_r1) - F_Estimate_Full[c(idx.mVc, idx.prop)]))^2) AMSE_Sample_mVc[i,]<-c(r2[i],L1_r1,L2_r1,L1_r1+L2_r1) # mMSE - idx.mMSE <- sample(1:N, r2[i]-r1, T, PI.mMSE) + idx.mMSE <- sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI.mMSE) x.mMSE <- X[c(idx.mMSE, idx.prop),] y.mMSE <- Y[c(idx.mMSE, idx.prop)] - fit.mMSE <- stats::glm(y.mMSE~x.mMSE-1,family = "poisson", weights =c(1 / PI.mMSE[idx.mMSE], pinv.prop) ) + fit.mMSE <- stats::glm(y.mMSE~x.mMSE-1,family = "quasipoisson", + weights =c(1 / PI.mMSE[idx.mMSE], pinv.prop) ) beta_mMSE[i,] <- c(r2[i],fit.mMSE$coefficients) @@ -316,25 +327,26 @@ modelMissPoiSub <- function(r1,r2,Y,X,N,Alpha,proportion){ lambda_r1<-exp(x.mMSE%*%Beta_Estimate_Full) W_r1<-as.vector(lambda_r1) - H_r1 <-solve(t(x.mMSE) %*% (x.mMSE * W_r1)) + H_r1 <-solve(crossprod(x.mMSE, x.mMSE * W_r1)) Temp1<-(W_r1*x.mMSE)%*%H_r1 lambda_Tr1<-exp((x.mMSE %*% Beta_Estimate_Full) + F_Estimate_Full[c(idx.mMSE, idx.prop)]) W_Tr1<-as.vector(lambda_Tr1) - H_Tr1 <-(t(x.mMSE) %*% (x.mMSE * W_Tr1)) - b_r1 <-(t(x.mMSE) %*% (lambda_Tr1-lambda_r1)) + H_Tr1 <- crossprod(x.mMSE, x.mMSE * W_Tr1) + b_r1 <- crossprod(x.mMSE, lambda_Tr1-lambda_r1) L1_r1 <- psych::tr(Temp1%*%H_Tr1%*%t(Temp1)) L2_r1 <- sum((W_r1*((x.mMSE%*%H_r1%*%b_r1) - F_Estimate_Full[c(idx.mMSE, idx.prop)]))^2) AMSE_Sample_mMSE[i,]<-c(r2[i],L1_r1,L2_r1,L1_r1+L2_r1) # RLmAMSE - idx.RLmAMSE <- sample(1:N, r2[i], T, PI.RLmAMSE) + idx.RLmAMSE <- sample(1:N, size = r2[i], replace = TRUE, prob = PI.RLmAMSE) - x.RLmAMSE <- X[c(idx.RLmAMSE),] - y.RLmAMSE <- Y[c(idx.RLmAMSE)] + x.RLmAMSE <- X[idx.RLmAMSE,] + y.RLmAMSE <- Y[idx.RLmAMSE] - fit.RLmAMSE <- stats::glm(y.RLmAMSE~x.RLmAMSE-1,family="poisson",weights = c(1 / PI.RLmAMSE[idx.RLmAMSE])) + fit.RLmAMSE <- stats::glm(y.RLmAMSE~x.RLmAMSE-1,family="quasipoisson", + weights = c(1 / PI.RLmAMSE[idx.RLmAMSE])) beta_RLmAMSE[i,] <- c(r2[i],fit.RLmAMSE$coefficients) @@ -342,27 +354,28 @@ modelMissPoiSub <- function(r1,r2,Y,X,N,Alpha,proportion){ lambda_r1<-exp(x.RLmAMSE%*%Beta_Estimate_Full) W_r1<-as.vector(lambda_r1) - H_r1 <-solve(t(x.RLmAMSE) %*% (x.RLmAMSE * W_r1)) + H_r1 <-solve(crossprod(x.RLmAMSE, x.RLmAMSE * W_r1)) Temp1<-(W_r1*x.RLmAMSE)%*%H_r1 - lambda_Tr1<-exp((x.RLmAMSE %*% Beta_Estimate_Full) + F_Estimate_Full[c(idx.RLmAMSE)]) + lambda_Tr1<-exp((x.RLmAMSE %*% Beta_Estimate_Full) + F_Estimate_Full[idx.RLmAMSE]) W_Tr1<-as.vector(lambda_Tr1) - H_Tr1 <-(t(x.RLmAMSE) %*% (x.RLmAMSE * W_Tr1)) - b_r1 <-(t(x.RLmAMSE) %*% (lambda_Tr1-lambda_r1)) + H_Tr1 <- crossprod(x.RLmAMSE, x.RLmAMSE * W_Tr1) + b_r1 <- crossprod(x.RLmAMSE, lambda_Tr1-lambda_r1) L1_r1 <- psych::tr(Temp1%*%H_Tr1%*%t(Temp1)) - L2_r1 <- sum((W_r1*((x.RLmAMSE%*%H_r1%*%b_r1) - F_Estimate_Full[c(idx.RLmAMSE)]))^2) + L2_r1 <- sum((W_r1*((x.RLmAMSE%*%H_r1%*%b_r1) - F_Estimate_Full[idx.RLmAMSE]))^2) AMSE_Sample_RLmAMSE[i,]<-c(r2[i],L1_r1,L2_r1,L1_r1+L2_r1) for (j in 1:length(Alpha)) { # RLmAMSE Log Odds - idx.RLmAMSE <- sample(1:N, r2[i], T, PI.RLmAMSE_LO[,j]) + idx.RLmAMSE <- sample(1:N, size = r2[i], replace = TRUE, prob = PI.RLmAMSE_LO[,j]) - x.RLmAMSE <- X[c(idx.RLmAMSE),] - y.RLmAMSE <- Y[c(idx.RLmAMSE)] + x.RLmAMSE <- X[idx.RLmAMSE,] + y.RLmAMSE <- Y[idx.RLmAMSE] - fit.RLmAMSE <- stats::glm(y.RLmAMSE~x.RLmAMSE-1,family="poisson",weights = c(1 / PI.RLmAMSE_LO[idx.RLmAMSE,j])) + fit.RLmAMSE <- 
stats::glm(y.RLmAMSE~x.RLmAMSE-1,family="quasipoisson", + weights = c(1 / PI.RLmAMSE_LO[idx.RLmAMSE,j])) beta_RLmAMSE_LO[[j]][i,] <- c(r2[i],fit.RLmAMSE$coefficients) @@ -370,25 +383,26 @@ modelMissPoiSub <- function(r1,r2,Y,X,N,Alpha,proportion){ lambda_r1<-exp(x.RLmAMSE%*%Beta_Estimate_Full) W_r1<-as.vector(lambda_r1) - H_r1 <-solve(t(x.RLmAMSE) %*% (x.RLmAMSE * W_r1)) + H_r1 <-solve(crossprod(x.RLmAMSE, x.RLmAMSE * W_r1)) Temp1<-(W_r1*x.RLmAMSE)%*%H_r1 - lambda_Tr1<-exp((x.RLmAMSE %*% Beta_Estimate_Full) + F_Estimate_Full[c(idx.RLmAMSE)]) + lambda_Tr1<-exp((x.RLmAMSE %*% Beta_Estimate_Full) + F_Estimate_Full[idx.RLmAMSE]) W_Tr1<-as.vector(lambda_Tr1) - H_Tr1 <-(t(x.RLmAMSE) %*% (x.RLmAMSE * W_Tr1)) - b_r1 <-(t(x.RLmAMSE) %*% (lambda_Tr1-lambda_r1)) + H_Tr1 <- crossprod(x.RLmAMSE, x.RLmAMSE * W_Tr1) + b_r1 <- crossprod(x.RLmAMSE, lambda_Tr1-lambda_r1) L1_r1 <- psych::tr(Temp1%*%H_Tr1%*%t(Temp1)) - L2_r1 <- sum((W_r1*((x.RLmAMSE%*%H_r1%*%b_r1) - F_Estimate_Full[c(idx.RLmAMSE)]))^2) + L2_r1 <- sum((W_r1*((x.RLmAMSE%*%H_r1%*%b_r1) - F_Estimate_Full[idx.RLmAMSE]))^2) AMSE_Sample_RLmAMSE_LO[[j]][i,]<-c(r2[i],L1_r1,L2_r1,L1_r1+L2_r1) # RLmAMSE Power - idx.RLmAMSE <- sample(1:N, r2[i], T, PI.RLmAMSE_Pow[,j]) + idx.RLmAMSE <- sample(1:N, size = r2[i], replace = TRUE, prob = PI.RLmAMSE_Pow[,j]) - x.RLmAMSE <- X[c(idx.RLmAMSE),] - y.RLmAMSE <- Y[c(idx.RLmAMSE)] + x.RLmAMSE <- X[idx.RLmAMSE,] + y.RLmAMSE <- Y[idx.RLmAMSE] - fit.RLmAMSE <- stats::glm(y.RLmAMSE~x.RLmAMSE-1,family="poisson",weights = c(1 / PI.RLmAMSE_Pow[idx.RLmAMSE,j])) + fit.RLmAMSE <- stats::glm(y.RLmAMSE~x.RLmAMSE-1,family="quasipoisson", + weights = c(1 / PI.RLmAMSE_Pow[idx.RLmAMSE,j])) beta_RLmAMSE_Pow[[j]][i,] <- c(r2[i],fit.RLmAMSE$coefficients) @@ -399,12 +413,12 @@ modelMissPoiSub <- function(r1,r2,Y,X,N,Alpha,proportion){ H_r1 <-solve(t(x.RLmAMSE) %*% (x.RLmAMSE * W_r1)) Temp1<-(W_r1*x.RLmAMSE)%*%H_r1 - lambda_Tr1<-exp((x.RLmAMSE %*% Beta_Estimate_Full) + F_Estimate_Full[c(idx.RLmAMSE)]) + lambda_Tr1<-exp((x.RLmAMSE %*% Beta_Estimate_Full) + F_Estimate_Full[idx.RLmAMSE]) W_Tr1<-as.vector(lambda_Tr1) - H_Tr1 <-(t(x.RLmAMSE) %*% (x.RLmAMSE * W_Tr1)) - b_r1 <-(t(x.RLmAMSE) %*% (lambda_Tr1-lambda_r1)) + H_Tr1 <- crossprod(x.RLmAMSE, x.RLmAMSE * W_Tr1) + b_r1 <- crossprod(x.RLmAMSE, lambda_Tr1-lambda_r1) L1_r1 <- psych::tr(Temp1%*%H_Tr1%*%t(Temp1)) - L2_r1 <- sum((W_r1*((x.RLmAMSE%*%H_r1%*%b_r1) - F_Estimate_Full[c(idx.RLmAMSE)]))^2) + L2_r1 <- sum((W_r1*((x.RLmAMSE%*%H_r1%*%b_r1) - F_Estimate_Full[idx.RLmAMSE]))^2) AMSE_Sample_RLmAMSE_Pow[[j]][i,]<-c(r2[i],L1_r1,L2_r1,L1_r1+L2_r1) } diff --git a/R/modelRobustLinSub.R b/R/modelRobustLinSub.R index 8612e1f..a1e6881 100644 --- a/R/modelRobustLinSub.R +++ b/R/modelRobustLinSub.R @@ -64,7 +64,7 @@ #' indexes<-1:ceiling(nrow(Electric_consumption)*0.005) #' Original_Data<-cbind(Electric_consumption[indexes,1],1, #' Electric_consumption[indexes,-1]) -#' colnames(Original_Data)<-c("Y",paste0("X",0:ncol(Original_Data[,-c(1,2)]))) +#' colnames(Original_Data)<-c("Y",paste0("X",0:ncol(Original_Data[,-c(1,2)]))) #' for (j in 3:5) { #' Original_Data[,j]<-scale(Original_Data[,j]) #' } @@ -91,8 +91,7 @@ #' #' r1<-300; r2<-rep(100*c(6,12),25); #' -#' modelRobustLinSub(r1 = r1, r2 = r2, -#' Y = as.matrix(Original_Data[,colnames(Original_Data) %in% c("Y")]), +#' modelRobustLinSub(r1 = r1, r2 = r2, Y = as.matrix(Original_Data[,1]), #' X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data), #' Apriori_probs = rep(1/length(All_Models),length(All_Models)), #' All_Combinations = All_Models, @@ 
-133,7 +132,7 @@ modelRobustLinSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov } PI.prop <- rep(1/N, N) - idx.prop <- sample(1:N, r1, T) + idx.prop <- sample(1:N, size = r1, replace = TRUE) x.prop<-lapply(1:length(All_Combinations),function(j){ X[idx.prop,All_Covariates %in% All_Combinations[[j]]] @@ -145,8 +144,8 @@ modelRobustLinSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov pinv.prop <- 1/PI.prop[idx.prop] fit.prop <- lapply(1:length(All_Combinations), function(j){ - beta.prop<-solve(a=t(x.prop[[j]])%*%x.prop[[j]],b=t(x.prop[[j]])%*%y.prop) - Xbeta_Final<-as.vector(X[,All_Covariates %in% All_Combinations[[j]] ]%*%beta.prop) + beta.prop<-solve(a=crossprod(x.prop[[j]]),b=crossprod(x.prop[[j]],y.prop)) + Xbeta_Final<-X[,All_Covariates %in% All_Combinations[[j]]]%*%beta.prop Var.prop<-sum((Y-Xbeta_Final)^2)/N Epsilon.prop<-Y-Xbeta_Final return(list("beta.prop"=beta.prop,"Var.prop"=Var.prop,"Epsilon.prop"=Epsilon.prop)) @@ -199,18 +198,10 @@ modelRobustLinSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov PI_MR.mVc<-matrixStats::rowSums2(do.call(cbind,PI_Single.mVc)%*%diag(Apriori_probs)) # Model Robust Results ## mMSE - # PI_Single.mMSE <- lapply(1:length(All_Combinations),function(j){ # Single Model Results - # PI.mMSE<-sqrt(fit.prop[[j]]$Epsilon.prop^2 * - # matrixStats::rowSums2((X[,All_Covariates %in% All_Combinations[[j]] ] %*% - # solve(t(X[,All_Covariates %in% All_Combinations[[j]] ])%*% - # X[,All_Covariates %in% All_Combinations[[j]] ]))^2)) - # return(PI.mMSE/sum(PI.mMSE)) - # }) - # For efficient row-wise operations PI_Single.mMSE <- lapply(seq_along(All_Combinations), function(j) { X_sub <- X[, All_Covariates %in% All_Combinations[[j]], drop = FALSE] - XtX_inv <- solve(crossprod(X_sub)) # More efficient than t(X) %*% X + XtX_inv <- solve(crossprod(X_sub)) row_sums_squared <- matrixStats::rowSums2((X_sub %*% XtX_inv)^2) PI.mMSE <- sqrt(fit.prop[[j]]$Epsilon.prop^2 * row_sums_squared) PI.mMSE / sum(PI.mMSE) @@ -223,9 +214,9 @@ modelRobustLinSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov { ## mVc idx_Single.mVc <- lapply(1:length(All_Combinations), function(j){ - sample(1:N, r2[i]-r1, T, PI_Single.mVc[[j]]) # Single Model Results + sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI_Single.mVc[[j]]) # Single Model Results }) - idx_MR.mVc <- sample(1:N, r2[i]-r1, T, PI_MR.mVc) # Model Robust Results + idx_MR.mVc <- sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI_MR.mVc) # Model Robust Results x_Single.mVc <-lapply(1:length(All_Combinations),function(j){ # Single Model Results X[c(idx_Single.mVc[[j]], idx.prop),All_Covariates %in% All_Combinations[[j]] ] @@ -245,8 +236,8 @@ modelRobustLinSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov pi4_r<-sqrt(r2[i]*pinv_Single.mVc^(-1)) X_r4<-x_Single.mVc[[j]]/pi4_r Y_r4<-y_Single.mVc[[j]]/pi4_r - beta.prop<-solve(a=t(X_r4)%*%X_r4,b=t(X_r4)%*%Y_r4) - Xbeta_Final<-as.vector(X[,All_Covariates %in% All_Combinations[[j]] ]%*%beta.prop) + beta.prop<-solve(a=crossprod(X_r4),b=crossprod(X_r4,Y_r4)) + Xbeta_Final<-X[,All_Covariates %in% All_Combinations[[j]] ]%*%beta.prop Var.prop<-sum((Y-Xbeta_Final)^2)/N return(list("beta.prop"=beta.prop,"Var.prop"=Var.prop)) }) @@ -255,8 +246,8 @@ modelRobustLinSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov pi4_r<-sqrt(r2[i]*pinv_MR.mVc^(-1)) X_r4<-x_MR.mVc[[j]]/pi4_r Y_r4<-y_MR.mVc/pi4_r - beta.prop<-solve(a=t(X_r4)%*%X_r4,b=t(X_r4)%*%Y_r4) # Model Robust Results - 
Xbeta_Final<-as.vector(X[,All_Covariates %in% All_Combinations[[j]] ]%*%beta.prop) + beta.prop<-solve(a=crossprod(X_r4),b=crossprod(X_r4,Y_r4)) + Xbeta_Final<-X[,All_Covariates %in% All_Combinations[[j]] ]%*%beta.prop Var.prop<-sum((Y-Xbeta_Final)^2)/N return(list("beta.prop"=beta.prop,"Var.prop"=Var.prop)) }) @@ -279,9 +270,9 @@ modelRobustLinSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov ## mMSE idx_Single.mMSE <- lapply(1:length(All_Combinations),function(j){ - sample(1:N, r2[i]-r1, T, PI_Single.mMSE[[j]]) # Single Model Results + sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI_Single.mMSE[[j]]) # Single Model Results }) - idx_MR.mMSE <- sample(1:N, r2[i]-r1, T, PI_MR.mMSE) # Model Robust Results + idx_MR.mMSE <- sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI_MR.mMSE) # Model Robust Results x_Single.mMSE <- lapply(1:length(All_Combinations),function(j){ X[c(idx_Single.mMSE[[j]], idx.prop),All_Covariates %in% All_Combinations[[j]] ] # Single Model Results @@ -301,8 +292,8 @@ modelRobustLinSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov pi4_r<-sqrt(r2[i]*pinv_Single.mMSE^(-1)) X_r4<-x_Single.mMSE[[j]]/pi4_r Y_r4<-y_Single.mMSE[[j]]/pi4_r - beta.prop<-solve(a=t(X_r4)%*%X_r4,b=t(X_r4)%*%Y_r4) - Xbeta_Final<-as.vector(X[,All_Covariates %in% All_Combinations[[j]] ]%*%beta.prop) + beta.prop<-solve(a=crossprod(X_r4),b=crossprod(X_r4,Y_r4)) + Xbeta_Final<-X[,All_Covariates %in% All_Combinations[[j]] ]%*%beta.prop Var.prop<-sum((Y-Xbeta_Final)^2)/N return(list("beta.prop"=beta.prop,"Var.prop"=Var.prop)) }) @@ -311,8 +302,8 @@ modelRobustLinSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov pi4_r<-sqrt(r2[i]*pinv_MR.mMSE^(-1)) X_r4<-x_MR.mMSE[[j]]/pi4_r Y_r4<-y_MR.mMSE/pi4_r - beta.prop<-solve(a=t(X_r4)%*%X_r4,b=t(X_r4)%*%Y_r4) # Model Robust Results - Xbeta_Final<-as.vector(X[,All_Covariates %in% All_Combinations[[j]] ]%*%beta.prop) + beta.prop<-solve(a=crossprod(X_r4),b=crossprod(X_r4,Y_r4)) + Xbeta_Final<-X[,All_Covariates %in% All_Combinations[[j]] ]%*%beta.prop Var.prop<-sum((Y-Xbeta_Final)^2)/N return(list("beta.prop"=beta.prop,"Var.prop"=Var.prop)) }) diff --git a/R/modelRobustLogSub.R b/R/modelRobustLogSub.R index 2a1bc5c..2adf2da 100644 --- a/R/modelRobustLogSub.R +++ b/R/modelRobustLogSub.R @@ -91,8 +91,7 @@ #' #' r1<-300; r2<-rep(100*c(6,12),25); #' -#' modelRobustLogSub(r1 = r1, r2 = r2, -#' Y = as.matrix(Original_Data[,colnames(Original_Data) %in% c("Y")]), +#' modelRobustLogSub(r1 = r1, r2 = r2, Y = as.matrix(Original_Data[,1]), #' X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data), #' Apriori_probs = rep(1/length(All_Models),length(All_Models)), #' All_Combinations = All_Models, @@ -137,7 +136,7 @@ modelRobustLogSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov n0 <- N - n1 PI.prop <- rep(1/(2*n0), N) PI.prop[Y==1] <- 1/(2*n1) - idx.prop <- sample(1:N, r1, T, PI.prop) + idx.prop <- sample(1:N, size = r1, replace = TRUE, prob = PI.prop) x.prop<-lapply(1:length(All_Combinations),function(j){ X[idx.prop,All_Covariates %in% All_Combinations[[j]] ] @@ -211,7 +210,7 @@ modelRobustLogSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov p_Single.prop[[j]] * (1 - p_Single.prop[[j]]) # Single Model Results }) W_Single.prop <- lapply(1:length(All_Combinations),function(j){ - solve(t(x.prop[[j]]) %*% (x.prop[[j]] * w_Single.prop[[j]] * pinv.prop)) # Single Model Results + solve(crossprod(x.prop[[j]],x.prop[[j]] * w_Single.prop[[j]] * pinv.prop)) # Single Model Results }) PI_Single.mMSE <-
lapply(1:length(All_Combinations),function(j){ PI.mMSE<-sqrt((Y - P.prop[[j]])^2 * matrixStats::rowSums2((X[,All_Covariates %in% All_Combinations[[j]] ]%*% @@ -226,9 +225,9 @@ modelRobustLogSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov { ## mVc idx_Single.mVc <- lapply(1:length(All_Combinations), function(j){ - sample(1:N, r2[i]-r1, T, PI_Single.mVc[[j]]) # Single Model Results + sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI_Single.mVc[[j]]) # Single Model Results }) - idx_MR.mVc <- sample(1:N, r2[i]-r1, T, PI_MR.mVc) # Model Robust Results + idx_MR.mVc <- sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI_MR.mVc) # Model Robust Results x_Single.mVc <-lapply(1:length(All_Combinations),function(j){ X[c(idx_Single.mVc[[j]], idx.prop),All_Covariates %in% All_Combinations[[j]] ] # Single Model results @@ -244,7 +243,7 @@ modelRobustLogSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov fit_Single.mVc <-lapply(1:length(All_Combinations), function(j){ .getMLE(x=x_Single.mVc[[j]], y=y_Single.mVc[[j]], - w=c(1 / PI_Single.mVc[[j]][idx_Single.mVc[[j]]], pinv.prop)) # Single Model Results + w=c(1 / PI_Single.mVc[[j]][idx_Single.mVc[[j]]], pinv.prop)) # Single Model Results }) fit_MR.mVc <- lapply(1:length(All_Combinations),function(j){ @@ -268,12 +267,10 @@ modelRobustLogSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov V_Final<-lapply(1:length(All_Combinations),function(j){ pi<-1-1/(1 + exp(x_Single.mVc[[j]] %*% beta.mVc_Single[[j]][i,-1])) W<-as.vector(pi*(1-pi)*c(1 / PI_Single.mVc[[j]][idx_Single.mVc[[j]]], pinv.prop)) - Mx<-solve((t(x_Single.mVc[[j]]) %*% (x_Single.mVc[[j]] * W))) - + Mx<-solve(crossprod(x_Single.mVc[[j]],x_Single.mVc[[j]] * W)) Middle<-((as.vector(y_Single.mVc[[j]])-as.vector(pi))* as.vector(c(1 / PI_Single.mVc[[j]][idx_Single.mVc[[j]]], pinv.prop)))^2 - V_Temp<-(t(x_Single.mVc[[j]]) %*% (x_Single.mVc[[j]] * Middle) ) - + V_Temp<-crossprod(x_Single.mVc[[j]],x_Single.mVc[[j]] * Middle) Mx %*% V_Temp %*% Mx }) @@ -286,11 +283,9 @@ modelRobustLogSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov V_Final<-lapply(1:length(All_Combinations),function(j){ pi<-1-1/(1+exp(x_MR.mVc[[j]] %*% beta.mVc_MR[[j]][i,-1])) W<-as.vector(pi*(1-pi)*c(1 / PI_MR.mVc[idx_MR.mVc], pinv.prop)) - Mx<-solve((t(x_MR.mVc[[j]]) %*% (x_MR.mVc[[j]] * W) )) - + Mx<-solve(crossprod(x_MR.mVc[[j]], x_MR.mVc[[j]] * W)) Middle<-((as.vector(y_MR.mVc)-as.vector(pi))*as.vector(c(1 / PI_MR.mVc[idx_MR.mVc], pinv.prop)))^2 - V_Temp<-(t(x_MR.mVc[[j]]) %*% (x_MR.mVc[[j]] * Middle)) - + V_Temp<-crossprod(x_MR.mVc[[j]], x_MR.mVc[[j]] * Middle) Mx %*% V_Temp %*% Mx }) @@ -301,9 +296,9 @@ modelRobustLogSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov ## mMSE idx_Single.mMSE <- lapply(1:length(All_Combinations),function(j){ - sample(1:N, r2[i]-r1, T, PI_Single.mMSE[[j]]) # Single Model Results + sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI_Single.mMSE[[j]]) # Single Model Results }) - idx_MR.mMSE <- sample(1:N, r2[i]-r1, T, PI_MR.mMSE) # Model Robust Results + idx_MR.mMSE <- sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI_MR.mMSE) # Model Robust Results x_Single.mMSE <- lapply(1:length(All_Combinations),function(j){ X[c(idx_Single.mMSE[[j]], idx.prop),All_Covariates %in% All_Combinations[[j]] ] # Single Model Results @@ -319,7 +314,7 @@ modelRobustLogSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov fit_Single.mMSE <- lapply(1:length(All_Combinations),function(j){ .getMLE(x=x_Single.mMSE[[j]], 
y=y_Single.mMSE[[j]], - w=c(1 / PI_Single.mMSE[[j]][idx_Single.mMSE[[j]]], pinv.prop)) # Single Model Results + w=c(1 / PI_Single.mMSE[[j]][idx_Single.mMSE[[j]]], pinv.prop)) # Single Model Results }) fit_MR.mMSE <- lapply(1:length(All_Combinations), function(j){ @@ -345,12 +340,10 @@ modelRobustLogSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov V_Final<-lapply(1:length(All_Combinations),function(j){ pi<-1-1/(1 + exp(x_Single.mMSE[[j]] %*% beta.mMSE_Single[[j]][i,-1])) W<-as.vector(pi*(1-pi)*c(1 / PI_Single.mMSE[[j]][idx_Single.mMSE[[j]]], pinv.prop)) - Mx<-solve((t(x_Single.mMSE[[j]]) %*% (x_Single.mMSE[[j]] * W))) - + Mx<-solve(crossprod(x_Single.mMSE[[j]], x_Single.mMSE[[j]] * W)) Middle<-((as.vector(y_Single.mMSE[[j]])-as.vector(pi))* as.vector(c(1 / PI_Single.mMSE[[j]][idx_Single.mMSE[[j]]], pinv.prop)))^2 - V_Temp<-(t(x_Single.mMSE[[j]]) %*% (x_Single.mMSE[[j]] * Middle)) - + V_Temp<-crossprod(x_Single.mMSE[[j]], x_Single.mMSE[[j]] * Middle) Mx %*% V_Temp %*% Mx }) @@ -363,11 +356,9 @@ modelRobustLogSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov V_Final<-lapply(1:length(All_Combinations),function(j){ pi<-1-1/(1+exp(x_MR.mMSE[[j]] %*% beta.mMSE_MR[[j]][i,-1])) W<-as.vector(pi*(1-pi)*c(1 / PI_MR.mMSE[idx_MR.mMSE], pinv.prop)) - Mx<-solve((t(x_MR.mMSE[[j]]) %*% (x_MR.mMSE[[j]] * W))) - + Mx<-solve(crossprod(x_MR.mMSE[[j]], x_MR.mMSE[[j]] * W)) Middle<-((as.vector(y_MR.mMSE)-as.vector(pi))*as.vector(c(1 / PI_MR.mMSE[idx_MR.mMSE], pinv.prop)))^2 - V_Temp<-(t(x_MR.mMSE[[j]]) %*% (x_MR.mMSE[[j]] * Middle)) - + V_Temp<-crossprod(x_MR.mMSE[[j]], x_MR.mMSE[[j]] * Middle) Mx %*% V_Temp %*% Mx }) @@ -426,7 +417,7 @@ modelRobustLogSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov msg <- "NA" while (loop <= Loop) { pr <- c(1 - 1 / (1 + exp(x %*% beta))) - H <- t(x) %*% (pr * (1 - pr) * w * x) + H <- crossprod(x,(pr * (1 - pr) * w * x)) S <- colSums((y - pr) * w * x) tryCatch( {shs <- NA diff --git a/R/modelRobustPoiSub.R b/R/modelRobustPoiSub.R index 9a6de83..606bd1f 100644 --- a/R/modelRobustPoiSub.R +++ b/R/modelRobustPoiSub.R @@ -93,8 +93,7 @@ #' #' r1<-300; r2<-rep(100*c(6,12),25); #' -#' modelRobustPoiSub(r1 = r1, r2 = r2, -#' Y = as.matrix(Original_Data[,colnames(Original_Data) %in% c("Y")]), +#' modelRobustPoiSub(r1 = r1, r2 = r2, Y = as.matrix(Original_Data[,1]), #' X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data), #' Apriori_probs = rep(1/length(All_Models),length(All_Models)), #' All_Combinations = All_Models, @@ -137,7 +136,7 @@ modelRobustPoiSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov } PI.prop <- rep(1/N, N) - idx.prop <- sample(1:N, r1, T) + idx.prop <- sample(1:N, size = r1, replace = TRUE) x.prop<-lapply(1:length(All_Combinations),function(j){ X[idx.prop,All_Covariates %in% All_Combinations[[j]]] @@ -149,7 +148,7 @@ modelRobustPoiSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov pinv.prop <- 1/PI.prop[idx.prop] fit.prop <- lapply(1:length(All_Combinations), function(j){ - stats::glm(y.prop~x.prop[[j]]-1,family = "poisson") + stats::glm(y.prop~x.prop[[j]]-1,family = "quasipoisson") }) beta.prop<-list() @@ -210,7 +209,7 @@ modelRobustPoiSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov P.prop[[j]][idx.prop] # Single Model Results }) W_Single.prop <- lapply(1:length(All_Combinations),function(j){ - solve(t(x.prop[[j]]) %*% (x.prop[[j]] * w_Single.prop[[j]] * pinv.prop)) # Single Model Results + solve(crossprod(x.prop[[j]], x.prop[[j]] * w_Single.prop[[j]] * 
pinv.prop)) # Single Model Results }) PI_Single.mMSE <- lapply(1:length(All_Combinations),function(j){ # Single Model Results PI.mMSE<-sqrt((Y - P.prop[[j]])^2 * @@ -225,9 +224,9 @@ modelRobustPoiSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov { ## mVc idx_Single.mVc <- lapply(1:length(All_Combinations), function(j){ - sample(1:N, r2[i]-r1, T, PI_Single.mVc[[j]]) # Single Model Results + sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI_Single.mVc[[j]]) # Single Model Results }) - idx_MR.mVc <- sample(1:N, r2[i]-r1, T, PI_MR.mVc) # Model Robust Results + idx_MR.mVc <- sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI_MR.mVc) # Model Robust Results x_Single.mVc <-lapply(1:length(All_Combinations),function(j){ # Single Model Results X[c(idx_Single.mVc[[j]], idx.prop),All_Covariates %in% All_Combinations[[j]] ] @@ -244,11 +243,11 @@ modelRobustPoiSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov fit_Single.mVc <-lapply(1:length(All_Combinations), function(j){ # Single Model Results pinv_Single.mVc<-c(1 / PI_Single.mVc[[j]][idx_Single.mVc[[j]]], pinv.prop) - stats::glm(y_Single.mVc[[j]]~x_Single.mVc[[j]]-1, family = "poisson",weights=pinv_Single.mVc) + stats::glm(y_Single.mVc[[j]]~x_Single.mVc[[j]]-1, family = "quasipoisson",weights=pinv_Single.mVc) }) fit_MR.mVc <- lapply(1:length(All_Combinations),function(j){ - stats::glm(y_MR.mVc~x_MR.mVc[[j]]-1,family="poisson",weights=pinv_MR.mVc) # Model Robust Results + stats::glm(y_MR.mVc~x_MR.mVc[[j]]-1,family="quasipoisson",weights=pinv_MR.mVc) # Model Robust Results }) for (j in 1:length(All_Combinations)) @@ -268,9 +267,8 @@ modelRobustPoiSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov V_Final<-lapply(1:length(All_Combinations),function(j){ pi<-c(exp(x_Single.mVc[[j]] %*% beta.mVc_Single[[j]][i,-1])) pinv_Single.mVc<-c(1 / PI_Single.mVc[[j]][idx_Single.mVc[[j]]], pinv.prop) - Mx<-solve(t(x_Single.mVc[[j]]) %*% (x_Single.mVc[[j]] * pi * pinv_Single.mVc)) - V_Temp<-t(x_Single.mVc[[j]]) %*% (x_Single.mVc[[j]]*((as.vector(y_Single.mVc[[j]])-pi)*pinv_Single.mVc)^2) - + Mx<-solve(crossprod(x_Single.mVc[[j]], x_Single.mVc[[j]] * pi * pinv_Single.mVc)) + V_Temp<-crossprod(x_Single.mVc[[j]], x_Single.mVc[[j]]*((as.vector(y_Single.mVc[[j]])-pi)*pinv_Single.mVc)^2) Mx %*% V_Temp %*% Mx }) @@ -282,9 +280,8 @@ modelRobustPoiSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov # Model Robust Results V_Final<-lapply(1:length(All_Combinations),function(j){ pi<- c(exp(x_MR.mVc[[j]] %*% beta.mVc_MR[[j]][i,-1])) - Mx<-solve(t(x_MR.mVc[[j]]) %*% (x_MR.mVc[[j]]*pi*pinv_MR.mVc)) - V_Temp<-t(x_MR.mVc[[j]]) %*% (x_MR.mVc[[j]]*((as.vector(y_MR.mVc)-pi)*pinv_MR.mVc)^2) - + Mx<-solve(crossprod(x_MR.mVc[[j]], x_MR.mVc[[j]]*pi*pinv_MR.mVc)) + V_Temp<-crossprod(x_MR.mVc[[j]], x_MR.mVc[[j]]*((as.vector(y_MR.mVc)-pi)*pinv_MR.mVc)^2) Mx %*% V_Temp %*% Mx }) @@ -295,9 +292,9 @@ modelRobustPoiSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov ## mMSE idx_Single.mMSE <- lapply(1:length(All_Combinations),function(j){ - sample(1:N, r2[i]-r1, T, PI_Single.mMSE[[j]]) # Single Model Results + sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI_Single.mMSE[[j]]) # Single Model Results }) - idx_MR.mMSE <- sample(1:N, r2[i]-r1, T, PI_MR.mMSE) # Model Robust Results + idx_MR.mMSE <- sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI_MR.mMSE) # Model Robust Results x_Single.mMSE <- lapply(1:length(All_Combinations),function(j){ X[c(idx_Single.mMSE[[j]], idx.prop),All_Covariates %in% 
All_Combinations[[j]] ] # Single Model Results @@ -314,11 +311,11 @@ modelRobustPoiSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov fit_Single.mMSE <- lapply(1:length(All_Combinations),function(j){ # Single Model Results pinv_Single.mMSE<-c(1 / PI_Single.mMSE[[j]][idx_Single.mMSE[[j]]], pinv.prop) - stats::glm(y_Single.mMSE[[j]]~x_Single.mMSE[[j]]-1, family = "poisson",weights=pinv_Single.mMSE) + stats::glm(y_Single.mMSE[[j]]~x_Single.mMSE[[j]]-1, family = "quasipoisson",weights=pinv_Single.mMSE) }) fit_MR.mMSE <- lapply(1:length(All_Combinations), function(j){ # Model Robust Results - stats::glm(y_MR.mMSE~x_MR.mMSE[[j]]-1, family = "poisson",weights=pinv_MR.mMSE) + stats::glm(y_MR.mMSE~x_MR.mMSE[[j]]-1, family = "quasipoisson",weights=pinv_MR.mMSE) }) for (j in 1:length(All_Combinations)) @@ -340,9 +337,8 @@ modelRobustPoiSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov V_Final<-lapply(1:length(All_Combinations),function(j){ pi<-c(exp(x_Single.mMSE[[j]] %*% beta.mMSE_Single[[j]][i,-1])) pinv_Single.mMSE<-c(1 / PI_Single.mMSE[[j]][idx_Single.mMSE[[j]]], pinv.prop) - Mx<-solve(t(x_Single.mMSE[[j]]) %*% (x_Single.mMSE[[j]]*pi*pinv_Single.mMSE)) - V_Temp<-t(x_Single.mMSE[[j]]) %*% (x_Single.mMSE[[j]]*((as.vector(y_Single.mMSE[[j]])-pi)*pinv_Single.mMSE)^2) - + Mx<-solve(crossprod(x_Single.mMSE[[j]], x_Single.mMSE[[j]]*pi*pinv_Single.mMSE)) + V_Temp<-crossprod(x_Single.mMSE[[j]], x_Single.mMSE[[j]]*((as.vector(y_Single.mMSE[[j]])-pi)*pinv_Single.mMSE)^2) Mx %*% V_Temp %*% Mx }) @@ -354,9 +350,8 @@ modelRobustPoiSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov # Model Robust Results V_Final<-lapply(1:length(All_Combinations),function(j){ pi<-c(exp(x_MR.mMSE[[j]] %*% beta.mMSE_MR[[j]][i,-1])) - Mx<-solve(t(x_MR.mMSE[[j]]) %*% (x_MR.mMSE[[j]]*pi*pinv_MR.mMSE)) - V_Temp<-t(x_MR.mMSE[[j]]) %*% (x_MR.mMSE[[j]]*((as.vector(y_MR.mMSE)-pi)*pinv_MR.mMSE)^2) - + Mx<-solve(crossprod(x_MR.mMSE[[j]], x_MR.mMSE[[j]]*pi*pinv_MR.mMSE)) + V_Temp<-crossprod(x_MR.mMSE[[j]], x_MR.mMSE[[j]]*((as.vector(y_MR.mMSE)-pi)*pinv_MR.mMSE)^2) Mx %*% V_Temp %*% Mx }) diff --git a/_pkgdown.yml b/_pkgdown.yml index 8e664c5..fff3af1 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -10,7 +10,7 @@ template: primary: "#003366" development: - mode: auto + mode: release navbar: title: NeEDS4BigData @@ -35,6 +35,10 @@ navbar: href: articles/Basic_Sampling_Poi_Reg.html - text: Poisson Regression - Model robust and misspecification href: articles/Poisson_Regression.html + - text: Benchmarking - Model based subsampling + href: articles/Benchmarking_model_based.html + - text: Benchmarking - Model robust subsampling + href: articles/Benchmarking_model_robust.html - text: Versions href: news/index.html right: diff --git a/inst/WORDLIST b/inst/WORDLIST index 5f27af9..e847ea8 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -1,6 +1,5 @@ AMSE Abhinav -Apriori Berard BigData CodeFactor @@ -8,25 +7,21 @@ Fanaee Hebrail LmAMSE MJ +MVNormal NeEDS Poisson RLmAMSE Rajen Sceaux UCI -Var codecov -criterions doi endogeneity esign frac ggplot ldots -misspecifications nonskin -packag -packageversion poisson softmax th diff --git a/man/ALoptimalGLMSub.Rd b/man/ALoptimalGLMSub.Rd index 9697367..5ce7142 100644 --- a/man/ALoptimalGLMSub.Rd +++ b/man/ALoptimalGLMSub.Rd @@ -69,8 +69,7 @@ Full_Data<-GenGLMdata(Dist,Dist_Par,No_Of_Var,Beta,N,Family) r1<-300; r2<-rep(600,50); Original_Data<-Full_Data$Complete_Data; -ALoptimalGLMSub(r1 = r1, r2 = r2, - Y = as.matrix(Original_Data[,colnames(Original_Data) 
\%in\% c("Y")]), +ALoptimalGLMSub(r1 = r1, r2 = r2,Y = as.matrix(Original_Data[,1]), X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data), family = "linear")->Results @@ -82,8 +81,7 @@ Full_Data<-GenGLMdata(Dist,Dist_Par,No_Of_Var,Beta,N,Family) r1<-300; r2<-rep(600,50); Original_Data<-Full_Data$Complete_Data; -ALoptimalGLMSub(r1 = r1, r2 = r2, - Y = as.matrix(Original_Data[,colnames(Original_Data) \%in\% c("Y")]), +ALoptimalGLMSub(r1 = r1, r2 = r2,Y = as.matrix(Original_Data[,1]), X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data), family = "logistic")->Results @@ -95,8 +93,7 @@ Full_Data<-GenGLMdata(Dist,NULL,No_Of_Var,Beta,N,Family) r1<-300; r2<-rep(600,50); Original_Data<-Full_Data$Complete_Data; -ALoptimalGLMSub(r1 = r1, r2 = r2, - Y = as.matrix(Original_Data[,colnames(Original_Data) \%in\% c("Y")]), +ALoptimalGLMSub(r1 = r1, r2 = r2,Y = as.matrix(Original_Data[,1]), X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data), family = "poisson")->Results diff --git a/man/AoptimalGauLMSub.Rd b/man/AoptimalGauLMSub.Rd index d19259a..822f594 100644 --- a/man/AoptimalGauLMSub.Rd +++ b/man/AoptimalGauLMSub.Rd @@ -58,8 +58,7 @@ Full_Data<-GenGLMdata(Dist,Dist_Par,No_Of_Var,Beta,N,Family) r1<-300; r2<-rep(100*c(6,12),50); Original_Data<-Full_Data$Complete_Data; -AoptimalGauLMSub(r1 = r1, r2 = r2, - Y = as.matrix(Original_Data[,colnames(Original_Data) \%in\% c("Y")]), +AoptimalGauLMSub(r1 = r1, r2 = r2,Y = as.matrix(Original_Data[,1]), X = as.matrix(Original_Data[,-1]), N = nrow(Original_Data))->Results diff --git a/man/AoptimalMCGLMSub.Rd b/man/AoptimalMCGLMSub.Rd index 5f51f5e..5c1470a 100644 --- a/man/AoptimalMCGLMSub.Rd +++ b/man/AoptimalMCGLMSub.Rd @@ -66,8 +66,7 @@ Full_Data<-GenGLMdata(Dist,Dist_Par,No_Of_Var,Beta,N,Family) r1<-300; r2<-rep(100*c(6,12),50); Original_Data<-Full_Data$Complete_Data; -AoptimalMCGLMSub(r1 = r1, r2 = r2, - Y = as.matrix(Original_Data[,colnames(Original_Data) \%in\% c("Y")]), +AoptimalMCGLMSub(r1 = r1, r2 = r2,Y = as.matrix(Original_Data[,1]), X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data), family = "linear")->Results @@ -79,8 +78,7 @@ Full_Data<-GenGLMdata(Dist,Dist_Par,No_Of_Var,Beta,N,Family) r1<-300; r2<-rep(100*c(6,12),50); Original_Data<-Full_Data$Complete_Data; -AoptimalMCGLMSub(r1 = r1, r2 = r2, - Y = as.matrix(Original_Data[,colnames(Original_Data) \%in\% c("Y")]), +AoptimalMCGLMSub(r1 = r1, r2 = r2,Y = as.matrix(Original_Data[,1]), X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data), family = "logistic")->Results @@ -92,8 +90,7 @@ Full_Data<-GenGLMdata(Dist,NULL,No_Of_Var,Beta,N,Family) r1<-300; r2<-rep(100*c(6,12),50); Original_Data<-Full_Data$Complete_Data; -AoptimalMCGLMSub(r1 = r1, r2 = r2, - Y = as.matrix(Original_Data[,colnames(Original_Data) \%in\% c("Y")]), +AoptimalMCGLMSub(r1 = r1, r2 = r2,Y = as.matrix(Original_Data[,1]), X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data), family = "poisson")->Results diff --git a/man/GenGLMdata.Rd b/man/GenGLMdata.Rd index 4572397..f1f2a08 100644 --- a/man/GenGLMdata.Rd +++ b/man/GenGLMdata.Rd @@ -7,7 +7,7 @@ GenGLMdata(Dist,Dist_Par,No_Of_Var,Beta,N,family) } \arguments{ -\item{Dist}{a character value for the distribution "Normal" or "Uniform or "Exponential"} +\item{Dist}{a character value for the distribution "Normal", "MVNormal", "Uniform or "Exponential"} \item{Dist_Par}{a list of parameters for the distribution that would generate data for covariate X} @@ -26,8 +26,8 @@ The output of \code{GenGLMdata} gives a list of } \description{ Function to simulate big data 
under linear, logistic and Poisson regression for sampling. -Covariate data X is through Normal or Uniform distribution for linear regression. -Covariate data X is through Exponential or Normal or Uniform distribution for logistic regression. +Covariate data X is through Normal, Multivariate Normal or Uniform distribution for linear regression. +Covariate data X is through Exponential, Normal, Multivariate Normal or Uniform distribution for logistic regression. Covariate data X is through Normal or Uniform distribution for Poisson regression. } \details{ @@ -35,19 +35,31 @@ Big data for the Generalised Linear Models are generated by the "linear", "logis regression types. We have limited the covariate data generation for -linear regression through normal and uniform distribution, -logistic regression through exponential, normal and uniform and +linear regression through normal, multivariate normal and uniform distribution, +logistic regression through exponential, normal, multivariate normal and uniform distribution, and Poisson regression through normal and uniform distribution. } \examples{ -Dist<-"Normal"; Dist_Par<-list(Mean=0,Variance=1,Error_Variance=0.5) -No_Of_Var<-2; Beta<-c(-1,2,1); N<-5000; Family<-"linear" +No_Of_Var<-2; Beta<-c(-1,2,1); N<-5000; + +# Dist<-"Normal"; Dist_Par<-list(Mean=0,Variance=1,Error_Variance=0.5) +Dist<-"MVNormal"; +Dist_Par<-list(Mean=rep(0,No_Of_Var),Variance=diag(rep(2,No_Of_Var)),Error_Variance=0.5) +# Dist<-"Uniform"; Dist_Par<-list(Min=0,Max=1) + +Family<-"linear" Results<-GenGLMdata(Dist,Dist_Par,No_Of_Var,Beta,N,Family) -Dist<-"Normal"; Dist_Par<-list(Mean=0,Variance=1); Family<-"logistic" +Dist<-"Normal"; Dist_Par<-list(Mean=0,Variance=1); +# Dist<-"MVNormal"; Dist_Par<-list(Mean=rep(0,No_Of_Var),Variance=diag(rep(2,No_Of_Var))) +# Dist<-"Exponential"; Dist_Par<-list(Rate=3) +# Dist<-"Uniform"; Dist_Par<-list(Min=0,Max=1) + +Family<-"logistic" Results<-GenGLMdata(Dist,Dist_Par,No_Of_Var,Beta,N,Family) -Dist<-"Normal"; Family<-"poisson" +# Dist<-"Normal"; +Dist<-"Uniform"; Family<-"poisson" Results<-GenGLMdata(Dist,NULL,No_Of_Var,Beta,N,Family) } diff --git a/man/LCCsampling.Rd b/man/LCCsampling.Rd index 51e38ee..bd7ce26 100644 --- a/man/LCCsampling.Rd +++ b/man/LCCsampling.Rd @@ -58,7 +58,7 @@ Full_Data<-GenGLMdata(Dist,Dist_Par,No_Of_Var,Beta,N,Family) r1<-300; r2<-rep(100*c(6,9,12),50); Original_Data<-Full_Data$Complete_Data; -LCCsampling(r1 = r1, r2 = r2, Y = as.matrix(Original_Data[,colnames(Original_Data) \%in\% c("Y")]), +LCCsampling(r1 = r1, r2 = r2, Y = as.matrix(Original_Data[,1]), X = as.matrix(Original_Data[,-1]), N = nrow(Original_Data))->Results diff --git a/man/LeverageSampling.Rd b/man/LeverageSampling.Rd index cecaff1..03c3ac9 100644 --- a/man/LeverageSampling.Rd +++ b/man/LeverageSampling.Rd @@ -67,7 +67,7 @@ Full_Data<-GenGLMdata(Dist,Dist_Par,No_Of_Var,Beta,N,Family) r<-rep(100*c(6,10),50); Original_Data<-Full_Data$Complete_Data; -LeverageSampling(r = r, Y = as.matrix(Original_Data[,colnames(Original_Data) \%in\% c("Y")]), +LeverageSampling(r = r, Y = as.matrix(Original_Data[,1]), X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data), S_alpha = 0.95, family = "linear")->Results @@ -80,7 +80,7 @@ Full_Data<-GenGLMdata(Dist,Dist_Par,No_Of_Var,Beta,N,Family) r<-rep(100*c(6,10),50); Original_Data<-Full_Data$Complete_Data; -LeverageSampling(r = r, Y = as.matrix(Original_Data[,colnames(Original_Data) \%in\% c("Y")]), +LeverageSampling(r = r, Y = as.matrix(Original_Data[,1]), X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data),
S_alpha = 0.95, family = "logistic")->Results @@ -93,7 +93,7 @@ Full_Data<-GenGLMdata(Dist,NULL,No_Of_Var,Beta,N,Family) r<-rep(100*c(6,10),50); Original_Data<-Full_Data$Complete_Data; -LeverageSampling(r = r, Y = as.matrix(Original_Data[,colnames(Original_Data) \%in\% c("Y")]), +LeverageSampling(r = r, Y = as.matrix(Original_Data[,1]), X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data), S_alpha = 0.95, family = "poisson")->Results diff --git a/man/modelRobustLinSub.Rd b/man/modelRobustLinSub.Rd index ed5d132..8030587 100644 --- a/man/modelRobustLinSub.Rd +++ b/man/modelRobustLinSub.Rd @@ -73,7 +73,7 @@ where \eqn{q=1,\ldots,Q} and \eqn{Q} is the number of models in the model set. indexes<-1:ceiling(nrow(Electric_consumption)*0.005) Original_Data<-cbind(Electric_consumption[indexes,1],1, Electric_consumption[indexes,-1]) - colnames(Original_Data)<-c("Y",paste0("X",0:ncol(Original_Data[,-c(1,2)]))) +colnames(Original_Data)<-c("Y",paste0("X",0:ncol(Original_Data[,-c(1,2)]))) for (j in 3:5) { Original_Data[,j]<-scale(Original_Data[,j]) } @@ -100,8 +100,7 @@ names(All_Models)<-paste0("Model_",1:length(All_Models)) r1<-300; r2<-rep(100*c(6,12),25); -modelRobustLinSub(r1 = r1, r2 = r2, - Y = as.matrix(Original_Data[,colnames(Original_Data) \%in\% c("Y")]), +modelRobustLinSub(r1 = r1, r2 = r2, Y = as.matrix(Original_Data[,1]), X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data), Apriori_probs = rep(1/length(All_Models),length(All_Models)), All_Combinations = All_Models, diff --git a/man/modelRobustLogSub.Rd b/man/modelRobustLogSub.Rd index 22a216b..aaf6564 100644 --- a/man/modelRobustLogSub.Rd +++ b/man/modelRobustLogSub.Rd @@ -100,8 +100,7 @@ names(All_Models)<-paste0("Model_",1:length(All_Models)) r1<-300; r2<-rep(100*c(6,12),25); -modelRobustLogSub(r1 = r1, r2 = r2, - Y = as.matrix(Original_Data[,colnames(Original_Data) \%in\% c("Y")]), +modelRobustLogSub(r1 = r1, r2 = r2, Y = as.matrix(Original_Data[,1]), X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data), Apriori_probs = rep(1/length(All_Models),length(All_Models)), All_Combinations = All_Models, diff --git a/man/modelRobustPoiSub.Rd b/man/modelRobustPoiSub.Rd index 89763b2..5ea1535 100644 --- a/man/modelRobustPoiSub.Rd +++ b/man/modelRobustPoiSub.Rd @@ -102,8 +102,7 @@ names(All_Models)<-paste0("Model_",1:length(All_Models)) r1<-300; r2<-rep(100*c(6,12),25); -modelRobustPoiSub(r1 = r1, r2 = r2, - Y = as.matrix(Original_Data[,colnames(Original_Data) \%in\% c("Y")]), +modelRobustPoiSub(r1 = r1, r2 = r2, Y = as.matrix(Original_Data[,1]), X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data), Apriori_probs = rep(1/length(All_Models),length(All_Models)), All_Combinations = All_Models, diff --git a/renv.lock b/renv.lock index aa810b4..a06ab8e 100644 --- a/renv.lock +++ b/renv.lock @@ -9,6 +9,13 @@ ] }, "Packages": { + "BH": { + "Package": "BH", + "Version": "1.87.0-1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "468d9a03ba57f22ebde50060fd13ba9f" + }, "GPArotation": { "Package": "GPArotation", "Version": "2024.3-1", @@ -22,9 +29,9 @@ }, "MASS": { "Package": "MASS", - "Version": "7.3-61", + "Version": "7.3-64", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "R", "grDevices", @@ -33,11 +40,11 @@ "stats", "utils" ], - "Hash": "0cafd6f0500e5deba33be22c46bf6055" + "Hash": "49d2d8090b74c1179df1aff16201caf8" }, "Matrix": { "Package": "Matrix", - "Version": "1.7-0", + "Version": "1.7-1", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -50,7 +57,7 @@ 
"stats", "utils" ], - "Hash": "1920b2f11133b12350024297d8a4ff4a" + "Hash": "5122bb14d8736372411f955e1b16bc8a" }, "R6": { "Package": "R6", @@ -74,20 +81,20 @@ }, "Rcpp": { "Package": "Rcpp", - "Version": "1.0.12", + "Version": "1.0.13-1", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "methods", "utils" ], - "Hash": "5ea2700d21e038ace58269ecdbeb9ec0" + "Hash": "6b868847b365672d6c1677b1608da9ed" }, "RcppArmadillo": { "Package": "RcppArmadillo", - "Version": "0.12.8.4.0", + "Version": "14.2.2-1", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "R", "Rcpp", @@ -95,7 +102,7 @@ "stats", "utils" ], - "Hash": "a535dd90aa8ec9a2e63a8f79b4a1bf43" + "Hash": "9da7c242d94a8419d045f6b3a64b9765" }, "RcppGSL": { "Package": "RcppGSL", @@ -110,13 +117,13 @@ }, "RcppParallel": { "Package": "RcppParallel", - "Version": "5.1.7", + "Version": "5.1.9", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "R" ], - "Hash": "a45594a00f5dbb073d5ec9f48592a08a" + "Hash": "f38a72a419b91faac0ce5d9eee04c120" }, "RcppZiggurat": { "Package": "RcppZiggurat", @@ -136,9 +143,9 @@ }, "Rdpack": { "Package": "Rdpack", - "Version": "2.6.1", + "Version": "2.6", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "R", "methods", @@ -146,13 +153,13 @@ "tools", "utils" ], - "Hash": "24a964d2cf75ad25d7b843856c8d4c93" + "Hash": "3e1384ada5d3948b392e98b11434d972" }, "Rfast": { "Package": "Rfast", - "Version": "2.1.0", + "Version": "2.1.3", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "R", "Rcpp", @@ -160,18 +167,18 @@ "RcppParallel", "RcppZiggurat" ], - "Hash": "79e8394e1fa06a4ae954b70ca9b16e8f" + "Hash": "994cb1d45f644b5fdf59df4a2dfb57ed" }, "cli": { "Package": "cli", - "Version": "3.6.2", + "Version": "3.6.3", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "R", "utils" ], - "Hash": "1216ac65ac55ec0058a6f75d7ca0fd52" + "Hash": "b21916dd77a27642b447374a5d30ecf3" }, "codetools": { "Package": "codetools", @@ -185,9 +192,9 @@ }, "colorspace": { "Package": "colorspace", - "Version": "2.1-0", + "Version": "2.1-1", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "R", "grDevices", @@ -195,17 +202,17 @@ "methods", "stats" ], - "Hash": "f20c47fd52fae58b4e377c37bb8c335b" + "Hash": "d954cb1c57e8d8b756165d7ba18aa55a" }, "cpp11": { "Package": "cpp11", - "Version": "0.4.7", + "Version": "0.5.1", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "R" ], - "Hash": "5a295d7d963cc5035284dcdbaf334f4e" + "Hash": "9df43854f1c84685d095ed6270b52387" }, "dplyr": { "Package": "dplyr", @@ -264,7 +271,7 @@ }, "gam": { "Package": "gam", - "Version": "1.22-4", + "Version": "1.22-5", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -274,7 +281,7 @@ "splines", "stats" ], - "Hash": "1d00ef061ca18a5d4dc4483bb75599cb" + "Hash": "453698a6f7535fbb77841e8015b5ee57" }, "generics": { "Package": "generics", @@ -289,7 +296,7 @@ }, "ggh4x": { "Package": "ggh4x", - "Version": "0.2.8", + "Version": "0.3.0", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -303,7 +310,7 @@ "stats", "vctrs" ], - "Hash": "012d7705b4cdc7b5ee89895aeefc1d45" + "Hash": "8e0e5e8db2bda5c3bdc48268afa5afea" }, "ggplot2": { "Package": "ggplot2", @@ -330,20 +337,34 @@ ], "Hash": "44c6a2f8202d5b7e878ea274b1092426" }, + "ggridges": { + "Package": 
"ggridges", + "Version": "0.5.6", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "ggplot2", + "grid", + "scales", + "withr" + ], + "Hash": "66488692cb8621bc78df1b9b819497a6" + }, "glue": { "Package": "glue", - "Version": "1.7.0", + "Version": "1.8.0", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "R", "methods" ], - "Hash": "e0b3a53876554bd45879e596cdb10a52" + "Hash": "5899f1eaa825580172bb56c08266f37c" }, "gtable": { "Package": "gtable", - "Version": "0.3.5", + "Version": "0.3.6", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -352,9 +373,10 @@ "glue", "grid", "lifecycle", - "rlang" + "rlang", + "stats" ], - "Hash": "e18861963cbc65a27736e02b3cd3c4a0" + "Hash": "de949855009e2d4d0e52a844e30617ae" }, "isoband": { "Package": "isoband", @@ -429,13 +451,13 @@ }, "matrixStats": { "Package": "matrixStats", - "Version": "1.3.0", + "Version": "1.4.1", "Source": "Repository", - "Repository": "RSPM", + "Repository": "CRAN", "Requirements": [ "R" ], - "Hash": "4b3ea27a19d669c0405b38134d89a9d1" + "Hash": "8885ffb1f46e820dede6b2ca9442abca" }, "mgcv": { "Package": "mgcv", @@ -475,11 +497,23 @@ ], "Hash": "4fd8900853b746af55b81fda99da7695" }, + "mvnfast": { + "Package": "mvnfast", + "Version": "0.2.8", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "BH", + "Rcpp", + "RcppArmadillo" + ], + "Hash": "e65cac8e8501bdfbdca0412c37bb18c9" + }, "nlme": { "Package": "nlme", - "Version": "3.1-165", + "Version": "3.1-166", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "R", "graphics", @@ -487,16 +521,15 @@ "stats", "utils" ], - "Hash": "2769a88be217841b1f33ed469675c3cc" + "Hash": "ccbb8846be320b627e6aa2b4616a2ded" }, "pillar": { "Package": "pillar", - "Version": "1.9.0", + "Version": "1.10.0", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "cli", - "fansi", "glue", "lifecycle", "rlang", @@ -504,7 +537,7 @@ "utils", "vctrs" ], - "Hash": "15da5a8412f317beeee6175fbc76f4bb" + "Hash": "101ca350beea21261a15ba169d7a8513" }, "pkgconfig": { "Package": "pkgconfig", @@ -518,7 +551,7 @@ }, "psych": { "Package": "psych", - "Version": "2.4.6.26", + "Version": "2.4.12", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -532,7 +565,7 @@ "parallel", "stats" ], - "Hash": "4448d5f3ac3e2cbf79074391d494637e" + "Hash": "b6659cfdaf2545e88959f00bdb0a0951" }, "purrr": { "Package": "purrr", @@ -551,7 +584,7 @@ }, "rbibutils": { "Package": "rbibutils", - "Version": "2.2.16", + "Version": "2.3", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -559,17 +592,17 @@ "tools", "utils" ], - "Hash": "8c06968e0a5b0209c5f34239b1302336" + "Hash": "dfc034a172fd88fc66b1a703894c4185" }, "renv": { "Package": "renv", - "Version": "1.0.7", + "Version": "1.0.11", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "utils" ], - "Hash": "397b7b2a265bc5a7a06852524dabae20" + "Hash": "47623f66b4e80b3b0587bc5d7b309888" }, "rlang": { "Package": "rlang", @@ -726,15 +759,15 @@ }, "withr": { "Package": "withr", - "Version": "3.0.0", + "Version": "3.0.2", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "R", "grDevices", "graphics" ], - "Hash": "d31b6c62c10dcf11ec530ca6b0dd5d35" + "Hash": "cc2d62c76458d425210d1eb1478b30b4" } } } diff --git a/renv/activate.R b/renv/activate.R index d13f993..0eb5108 100644 --- a/renv/activate.R +++ b/renv/activate.R @@ -2,7 
+2,7 @@ local({ # the requested version of renv - version <- "1.0.7" + version <- "1.0.11" attr(version, "sha") <- NULL # the project directory @@ -98,6 +98,66 @@ local({ unloadNamespace("renv") # load bootstrap tools + ansify <- function(text) { + if (renv_ansify_enabled()) + renv_ansify_enhanced(text) + else + renv_ansify_default(text) + } + + renv_ansify_enabled <- function() { + + override <- Sys.getenv("RENV_ANSIFY_ENABLED", unset = NA) + if (!is.na(override)) + return(as.logical(override)) + + pane <- Sys.getenv("RSTUDIO_CHILD_PROCESS_PANE", unset = NA) + if (identical(pane, "build")) + return(FALSE) + + testthat <- Sys.getenv("TESTTHAT", unset = "false") + if (tolower(testthat) %in% "true") + return(FALSE) + + iderun <- Sys.getenv("R_CLI_HAS_HYPERLINK_IDE_RUN", unset = "false") + if (tolower(iderun) %in% "false") + return(FALSE) + + TRUE + + } + + renv_ansify_default <- function(text) { + text + } + + renv_ansify_enhanced <- function(text) { + + # R help links + pattern <- "`\\?(renv::(?:[^`])+)`" + replacement <- "`\033]8;;ide:help:\\1\a?\\1\033]8;;\a`" + text <- gsub(pattern, replacement, text, perl = TRUE) + + # runnable code + pattern <- "`(renv::(?:[^`])+)`" + replacement <- "`\033]8;;ide:run:\\1\a\\1\033]8;;\a`" + text <- gsub(pattern, replacement, text, perl = TRUE) + + # return ansified text + text + + } + + renv_ansify_init <- function() { + + envir <- renv_envir_self() + if (renv_ansify_enabled()) + assign("ansify", renv_ansify_enhanced, envir = envir) + else + assign("ansify", renv_ansify_default, envir = envir) + + } + `%||%` <- function(x, y) { if (is.null(x)) y else x } @@ -142,7 +202,10 @@ local({ # compute common indent indent <- regexpr("[^[:space:]]", lines) common <- min(setdiff(indent, -1L)) - leave - paste(substring(lines, common), collapse = "\n") + text <- paste(substring(lines, common), collapse = "\n") + + # substitute in ANSI links for executable renv code + ansify(text) } @@ -305,8 +368,11 @@ local({ quiet = TRUE ) - if ("headers" %in% names(formals(utils::download.file))) - args$headers <- renv_bootstrap_download_custom_headers(url) + if ("headers" %in% names(formals(utils::download.file))) { + headers <- renv_bootstrap_download_custom_headers(url) + if (length(headers) && is.character(headers)) + args$headers <- headers + } do.call(utils::download.file, args) @@ -385,10 +451,21 @@ local({ for (type in types) { for (repos in renv_bootstrap_repos()) { + # build arguments for utils::available.packages() call + args <- list(type = type, repos = repos) + + # add custom headers if available -- note that + # utils::available.packages() will pass this to download.file() + if ("headers" %in% names(formals(utils::download.file))) { + headers <- renv_bootstrap_download_custom_headers(repos) + if (length(headers) && is.character(headers)) + args$headers <- headers + } + # retrieve package database db <- tryCatch( as.data.frame( - utils::available.packages(type = type, repos = repos), + do.call(utils::available.packages, args), stringsAsFactors = FALSE ), error = identity @@ -470,6 +547,14 @@ local({ } + renv_bootstrap_github_token <- function() { + for (envvar in c("GITHUB_TOKEN", "GITHUB_PAT", "GH_TOKEN")) { + envval <- Sys.getenv(envvar, unset = NA) + if (!is.na(envval)) + return(envval) + } + } + renv_bootstrap_download_github <- function(version) { enabled <- Sys.getenv("RENV_BOOTSTRAP_FROM_GITHUB", unset = "TRUE") @@ -477,16 +562,16 @@ local({ return(FALSE) # prepare download options - pat <- Sys.getenv("GITHUB_PAT") - if (nzchar(Sys.which("curl")) && 
nzchar(pat)) { + token <- renv_bootstrap_github_token() + if (nzchar(Sys.which("curl")) && nzchar(token)) { fmt <- "--location --fail --header \"Authorization: token %s\"" - extra <- sprintf(fmt, pat) + extra <- sprintf(fmt, token) saved <- options("download.file.method", "download.file.extra") options(download.file.method = "curl", download.file.extra = extra) on.exit(do.call(base::options, saved), add = TRUE) - } else if (nzchar(Sys.which("wget")) && nzchar(pat)) { + } else if (nzchar(Sys.which("wget")) && nzchar(token)) { fmt <- "--header=\"Authorization: token %s\"" - extra <- sprintf(fmt, pat) + extra <- sprintf(fmt, token) saved <- options("download.file.method", "download.file.extra") options(download.file.method = "wget", download.file.extra = extra) on.exit(do.call(base::options, saved), add = TRUE) diff --git a/tests/testthat/test-GenGLMdata.R b/tests/testthat/test-GenGLMdata.R index b7f1367..8b1fe53 100644 --- a/tests/testthat/test-GenGLMdata.R +++ b/tests/testthat/test-GenGLMdata.R @@ -71,13 +71,13 @@ test_that("Error on Results output for Family",{ Family<-"linear"; Dist<-"Exp"; Beta<-c(-1,2,1) test_that("Error on Results output for linear Dist",{ expect_error(GenGLMdata(Dist,Dist_Par,No_Of_Var,Beta,N,Family), - "For linear regression select the distribution 'Normal' \n or 'Uniform' to generate the covarate data") + "For linear regression select the distribution 'Normal', 'MVNormal' \n or 'Uniform' to generate the covarate data") }) Family<-"logistic"; Dist<-"Exp" test_that("Error on Results output for logistic Dist",{ expect_error(GenGLMdata(Dist,Dist_Par,No_Of_Var,Beta,N,Family), - "For logistic regression select the distribution 'Exponential', \n 'Normal' or 'Uniform' to generate the covarate data") + "For logistic regression select the distribution 'Exponential', \n 'Normal', 'MVNormal' or 'Uniform' to generate the covarate data") }) Family<-"poisson"; Dist<-"Exp" diff --git a/vignettes/Basic_Sampling_Lin_Reg.Rmd b/vignettes/Basic_Sampling_Lin_Reg.Rmd index 0f703fb..7280327 100644 --- a/vignettes/Basic_Sampling_Lin_Reg.Rmd +++ b/vignettes/Basic_Sampling_Lin_Reg.Rmd @@ -45,14 +45,14 @@ Theme_special<-function(){ # Understanding the electric consumption data ``Electric power consumption'' data [@hebrailindividual], which contains $2,049,280$ measurements for a house located at Sceaux, France between December 2006 and November 2010. -The data contains $4$ columns, the first column is the response variable and the rest are covariates, however we only use the first $5\%$ of the data for this demonstration. +The data contains $4$ columns; the first column is the response variable and the rest are covariates. However, we only use the first $25\%$ of the data for this demonstration. The response $y$ is the log scaled intensity, while the covariates are active electrical energy in the a) kitchen ($X_1$), b) laundry room ($X_2$) and c) water-heater and air-conditioner ($X_3$). The covariates are scaled to be mean of zero and variance of one. For the given data subsampling methods are implemented assuming the main effects model can describe the data.
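As a point of reference before any subsampling is run, the assumed main effects model can also be fitted to the full data. The following is a minimal sketch only, assuming `Original_Data` with columns `Y`, an intercept column `X0` and covariates `X1`-`X3` as prepared in the chunk that follows; it is not part of the vignette's own code:

```r
# Sketch only: fit the assumed main effects model on the full big data.
# Original_Data is assumed to be the matrix built in the next chunk,
# with the response Y in column 1 and scaled covariates X1-X3.
Full_Fit <- lm(Y ~ X1 + X2 + X3, data = as.data.frame(Original_Data))
Beta_Hat <- coef(Full_Fit)  # full-data estimates the subsamples are compared against
```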
```{r Exploring data} -# Selecting 5% of the big data and prepare it -indexes<-1:ceiling(nrow(Electric_consumption)*0.05) +# Selecting 25% of the big data and preparing it +indexes<-1:ceiling(nrow(Electric_consumption)*0.25) Original_Data<-cbind(Electric_consumption[indexes,1],1,Electric_consumption[indexes,-1]) colnames(Original_Data)<-c("Y",paste0("X",0:ncol(Original_Data[,-c(1,2)]))) @@ -65,7 +65,7 @@ head(Original_Data) %>% caption = "First few observations of the electric consumption data.") # Setting the sample sizes -N<-nrow(Original_Data); M<-250; k<-seq(6,18,by=3)*100; rep_k<-rep(k,each=M) +N<-nrow(Original_Data); M<-250; k<-seq(8,20,by=2)*100; rep_k<-rep(k,each=M) ``` Based on this model the methods random sampling, leverage sampling [@ma2014statistical;@ma2015leveraging], $A$-optimality subsampling with Gaussian Linear Model [@lee2022sampling], $A$- and $L$-optimality subsampling [@ai2021optimal;@yao2021review] and $A$-optimality subsampling with response constraint [@zhang2021optimal] are implemented on the big data. @@ -80,8 +80,8 @@ Method_Colour<-c("#BBFFBB","#50FF50","#00BB00","#008600", Method_Shape_Types<-c(rep(17,4),rep(4,3),16) ``` -The obtained samples and their respective model parameter estimates are over $M=100$ simulations across different sample sizes $k=(600,\ldots,1500)$. -We set the initial sample size of $r1=300$ for the methods that requires a random sample. +The samples and their respective model parameter estimates are obtained over $M=250$ simulations across different sample sizes $k=(800,\ldots,2000)$. +We set the initial sample size to $r1=400$ for the methods that require a random sample. From the final samples, model parameters of the assume model are estimated. These estimated model parameters are compared with the estimated model parameters of the full big data through the mean squared error $MSE(\tilde{\beta}_k,\hat{\beta})=\frac{1}{MJ}\sum_{i=1}^M \sum_{j=1}^J(\tilde{\beta}_{k,j} - \hat{\beta}_j)^2$. Here, $\tilde{\beta}_k$ is the estimated model parameters from the sample of size $k$ and $\hat{\beta}$ is the estimated model parameters from the full big data, while $j$ is index of the model parameter.
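The $MSE$ defined above translates directly into code. A minimal sketch, assuming `Beta_Tilde` holds one row of estimates per simulation and `Beta_Hat` the full-data estimates (both names are assumptions, not objects from the vignette):

```r
# Sketch only: MSE between subsample estimates and the full-data estimates.
# Beta_Tilde: M x J matrix of estimates (one row per simulation at sample size k).
# Beta_Hat:   length-J vector estimated from the full big data.
MSE_Beta <- function(Beta_Tilde, Beta_Hat) {
  M <- nrow(Beta_Tilde); J <- ncol(Beta_Tilde)
  sum((Beta_Tilde - matrix(Beta_Hat, nrow = M, ncol = J, byrow = TRUE))^2) / (M * J)
}
```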
@@ -119,7 +119,7 @@ colnames(Final_Beta_LS)<-c("Method","Sample",paste0("Beta",0:3)) ```{r A-optimality subsampling for GauLin Model} # A-optimality subsampling for Gaussian Linear Model -NeEDS4BigData::AoptimalGauLMSub(r1=300,r2=rep_k, +NeEDS4BigData::AoptimalGauLMSub(r1=400,r2=rep_k, Y=as.matrix(Original_Data[,1]), X=as.matrix(Original_Data[,-1]),N=N)->Results Final_Beta_AoptGauLM<-Results$Beta_Estimates @@ -131,7 +131,7 @@ colnames(Final_Beta_AoptGauLM)<-c("Method","Sample",paste0("Beta",0:3)) ```{r A- and L- optimaliy subsampling} # A- and L-optimality subsampling for GLM -NeEDS4BigData::ALoptimalGLMSub(r1=300,r2=rep_k, +NeEDS4BigData::ALoptimalGLMSub(r1=400,r2=rep_k, Y=as.matrix(Original_Data[,1]), X=as.matrix(Original_Data[,-1]), N=N,family = "linear")->Results @@ -143,7 +143,7 @@ colnames(Final_Beta_ALoptGLM)<-c("Method","Sample",paste0("Beta",0:3)) ```{r A-optimality subsampling with measurement constraints} # A-optimality subsampling for without response -NeEDS4BigData::AoptimalMCGLMSub(r1=300,r2=rep_k, +NeEDS4BigData::AoptimalMCGLMSub(r1=400,r2=rep_k, Y=as.matrix(Original_Data[,1]), X=as.matrix(Original_Data[,-1]), N=N,family="linear")->Results diff --git a/vignettes/Basic_Sampling_Log_Reg.Rmd b/vignettes/Basic_Sampling_Log_Reg.Rmd index 4b89e51..62e5330 100644 --- a/vignettes/Basic_Sampling_Log_Reg.Rmd +++ b/vignettes/Basic_Sampling_Log_Reg.Rmd @@ -66,7 +66,7 @@ head(Original_Data) %>% caption = "First few observations of the skin segmentation data.") # Setting the sample sizes -N<-nrow(Original_Data); M<-300; k<-seq(9,24,by=3)*100; rep_k<-rep(k,each=M) +N<-nrow(Original_Data); M<-250; k<-seq(8,20,by=2)*100; rep_k<-rep(k,each=M) ``` Based on this model the methods random sampling, leverage sampling [@ma2014statistical;@ma2015leveraging], local case control sampling [@fithian2015local], $A$- and $L$-optimality subsampling [@ai2021optimal;@yao2021review] and $A$-optimality subsampling with response constraint [@zhang2021optimal] are implemented on the big data. @@ -81,8 +81,8 @@ Method_Colour<-c("#BBFFBB","#50FF50","#00BB00", Method_Shape_Types<-c(rep(17,3),rep(4,4),16) ``` -The obtained samples and their respective model parameter estimates are over $M=100$ simulations across different sample sizes $k=(600,\ldots,1500)$. -We set the initial sample size of $r1=300$ for the methods that requires a random sample. +The samples and their respective model parameter estimates are obtained over $M=250$ simulations across different sample sizes $k=(800,\ldots,2000)$. +We set the initial sample size to $r1=400$ for the methods that require a random sample. From the final samples, model parameters of the assume model are estimated. These estimated model parameters are compared with the estimated model parameters of the full big data through the mean squared error $MSE(\tilde{\beta}_k,\hat{\beta})=\frac{1}{MJ}\sum_{i=1}^M \sum_{j=1}^J(\tilde{\beta}_{k,j} - \hat{\beta}_j)^2$. Here, $\tilde{\beta}_k$ is the estimated model parameters from the sample of size $k$ and $\hat{\beta}$ is the estimated model parameters from the full big data, while $j$ is index of the model parameter.
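For the logistic case the initial $r1$ draw is not uniform: as the `PI.prop` lines in the `modelRobustLogSub` hunk earlier in this diff show, the two response classes are balanced before sampling. A standalone sketch of that step, with `Y`, `N` and `r1` assumed to be in scope:

```r
# Sketch of the case-control initial draw used by the logistic functions:
# ones are drawn with probability 1/(2*n1) and zeros with 1/(2*n0).
n1 <- sum(Y)
n0 <- N - n1
PI.prop <- rep(1 / (2 * n0), N)
PI.prop[Y == 1] <- 1 / (2 * n1)
idx.prop <- sample(1:N, size = r1, replace = TRUE, prob = PI.prop)
```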
@@ -120,7 +120,7 @@ colnames(Final_Beta_LS)<-c("Method","Sample",paste0("Beta",0:3)) ```{r local case control sampling} # Local case control sampling -NeEDS4BigData::LCCsampling(r1=300,r2=rep_k, +NeEDS4BigData::LCCsampling(r1=400,r2=rep_k, Y=as.matrix(Original_Data[,1]), X=as.matrix(Original_Data[,-1]), N=N)->Results @@ -133,7 +133,7 @@ colnames(Final_Beta_LCCS)<-c("Method","Sample",paste0("Beta",0:3)) ```{r A- and L-optimality subsampling} # A- and L-optimality subsampling for GLM -NeEDS4BigData::ALoptimalGLMSub(r1=300,r2=rep_k, +NeEDS4BigData::ALoptimalGLMSub(r1=400,r2=rep_k, Y=as.matrix(Original_Data[,1]), X=as.matrix(Original_Data[,-1]), N=N,family = "logistic")->Results @@ -145,7 +145,7 @@ colnames(Final_Beta_ALoptGLM)<-c("Method","Sample",paste0("Beta",0:3)) ```{r A-optimality subsampling under measurement constraints} # A-optimality subsampling for without response -NeEDS4BigData::AoptimalMCGLMSub(r1=300,r2=rep_k, +NeEDS4BigData::AoptimalMCGLMSub(r1=400,r2=rep_k, Y=as.matrix(Original_Data[,1]), X=as.matrix(Original_Data[,-1]), N=N,family="logistic")->Results @@ -173,7 +173,7 @@ remove(Final_Beta_RS,Final_Beta_LS,Final_Beta_LCCS,Final_Beta_ALoptGLM, The mean squared error is plotted below. -```{r Model parameter estimates,fig.width=7,fig.height=5,fig.align='center',fig.cap="Mean squared error for all the subsampling methods with 5% and 95% percentile intervals."} +```{r Model parameter estimates,fig.width=7,fig.height=10,fig.align='center',fig.cap="Mean squared error for a) all the subsampling methods and b) without leverage and local case control sampling, with 5% and 95% percentile intervals."} # Obtain the mean squared error for the model parameter estimates MSE_Beta<-data.frame("Method"=Final_Beta$Method, "Sample"=Final_Beta$Sample, @@ -198,7 +198,27 @@ ggplot(data=Mean_Data,aes(x=factor(Sample),y=Mean,color=Method,shape=Method))+ scale_color_manual(values = Method_Colour)+ scale_shape_manual(values = Method_Shape_Types)+ theme_bw()+Theme_special()+ - guides(colour = guide_legend(nrow = 3)) + guides(colour = guide_legend(nrow = 3))->p1 + +Left_out<-c(paste0(c("Basic ","Shrinkage ","Unweighted "),"Leverage"), + "Local case control sampling") +# Plot for the mean squared error except leverage and local case control sampling +ggplot(data=Mean_Data[!(Mean_Data$Method %in% Left_out),], + aes(x=factor(Sample),y=Mean,color=Method,shape=Method))+ + xlab("Sample Size")+ylab("Squared Error")+ + geom_point(size=3,position = position_dodge(width = 0.5))+ + geom_line(data=Mean_Data[!(Mean_Data$Method %in% Left_out),], + aes(x=factor(Sample),y=Mean,group=Method,color=Method), + position = position_dodge(width = 0.5))+ + geom_errorbar(data=Mean_Data[!(Mean_Data$Method %in% Left_out),], + aes(ymin=min,ymax=max), + width=0.3,position = position_dodge(width = 0.5))+ + scale_color_manual(values = Method_Colour[-c(4:7)])+ + scale_shape_manual(values = Method_Shape_Types[-c(4:7)])+ + theme_bw()+Theme_special()+ + guides(colour = guide_legend(nrow = 2))->p2 + +ggarrange(p1,p2,nrow = 2,ncol = 1,labels = "auto") ``` ## References diff --git a/vignettes/Basic_Sampling_Poi_Reg.Rmd b/vignettes/Basic_Sampling_Poi_Reg.Rmd index 6d21bf7..1d6dde9 100644 --- a/vignettes/Basic_Sampling_Poi_Reg.Rmd +++ b/vignettes/Basic_Sampling_Poi_Reg.Rmd @@ -64,7 +64,7 @@ head(Original_Data) %>% caption = "First few observations of the bike sharing data.") # Setting the sample sizes -N<-nrow(Original_Data); M<-250; k<-seq(6,18,by=3)*100; rep_k<-rep(k,each=M) +N<-nrow(Original_Data); M<-250; k<-seq(8,20,by=2)*100; 
rep_k<-rep(k,each=M) ``` Based on this model the methods random sampling, leverage sampling [@ma2014statistical;@ma2015leveraging], $A$- and $L$-optimality subsampling [@ai2021optimal;@yao2021review] and $A$-optimality subsampling with response constraint [@zhang2021optimal] are implemented on the big data. @@ -79,8 +79,8 @@ Method_Colour<-c("#BBFFBB","#50FF50","#00BB00", Method_Shape_Types<-c(rep(17,3),rep(4,3),16) ``` -The obtained samples and their respective model parameter estimates are over $M=100$ simulations across different sample sizes $k=(600,\ldots,1500)$. -We set the initial sample size of $r1=300$ for the methods that requires a random sample. +The samples and their respective model parameter estimates are obtained over $M=250$ simulations across different sample sizes $k=(800,\ldots,2000)$. +We set an initial sample size of $r1=400$ for the methods that require a random sample. From the final samples, model parameters of the assumed model are estimated. These estimated model parameters are compared with the estimated model parameters of the full big data through the mean squared error $MSE(\tilde{\beta}_k,\hat{\beta})=\frac{1}{MJ}\sum_{i=1}^M \sum_{j=1}^J(\tilde{\beta}_{k,j} - \hat{\beta}_j)^2$. Here, $\tilde{\beta}_k$ are the estimated model parameters from the sample of size $k$ and $\hat{\beta}$ are the estimated model parameters from the full big data, while $j$ is the index of the model parameter. @@ -118,7 +118,7 @@ colnames(Final_Beta_LS)<-c("Method","Sample",paste0("Beta",0:3)) ```{r A- and L-optimality subsampling} # A- and L-optimality subsampling for GLM -NeEDS4BigData::ALoptimalGLMSub(r1=300,r2=rep_k, +NeEDS4BigData::ALoptimalGLMSub(r1=400,r2=rep_k, Y=as.matrix(Original_Data[,1]), X=as.matrix(Original_Data[,-1]), N=N,family = "poisson")->Results @@ -130,7 +130,7 @@ colnames(Final_Beta_ALoptGLM)<-c("Method","Sample",paste0("Beta",0:3)) ```{r A-optimality subsampling with measurement constraints} # A-optimality subsampling without response -NeEDS4BigData::AoptimalMCGLMSub(r1=300,r2=rep_k, +NeEDS4BigData::AoptimalMCGLMSub(r1=400,r2=rep_k, Y=as.matrix(Original_Data[,1]), X=as.matrix(Original_Data[,-1]), N=N,family="poisson")->Results @@ -158,7 +158,7 @@ remove(Final_Beta_RS,Final_Beta_LS,Final_Beta_ALoptGLM, The mean squared error is plotted below.
-```{r First scenario model parameter estimates,fig.width=7,fig.height=5,fig.align='center',fig.cap="Mean squared error for all the subsampling methods with 5% and 95% percentile intervals."} +```{r First scenario model parameter estimates,fig.width=7,fig.height=10,fig.align='center',fig.cap="Mean squared error for a) all the subsampling methods and b) all methods except unweighted leverage sampling, with 5% and 95% percentile intervals."} # Obtain the mean squared error for the model parameter estimates MSE_Beta<-data.frame("Method"=Final_Beta$Method, "Sample"=Final_Beta$Sample, @@ -183,7 +183,25 @@ ggplot(data=Mean_Data,aes(x=factor(Sample),y=Mean,color=Method,shape=Method))+ scale_color_manual(values = Method_Colour)+ scale_shape_manual(values = Method_Shape_Types)+ theme_bw()+Theme_special()+ - guides(colour = guide_legend(nrow = 3)) + guides(colour = guide_legend(nrow = 3))->p1 + +# Plot for the mean squared error with all methods except unweighted leverage +ggplot(data=Mean_Data[Mean_Data$Method != "Unweighted Leverage",], + aes(x=factor(Sample),y=Mean,color=Method,shape=Method))+ + xlab("Sample Size")+ylab("Squared Error")+ + geom_point(size=3,position = position_dodge(width = 0.5))+ + geom_line(data=Mean_Data[Mean_Data$Method != "Unweighted Leverage",], + aes(x=factor(Sample),y=Mean,group=Method,color=Method), + position = position_dodge(width = 0.5))+ + geom_errorbar(data=Mean_Data[Mean_Data$Method != "Unweighted Leverage",], + aes(ymin=min,ymax=max),width=0.3, + position = position_dodge(width = 0.5))+ + scale_color_manual(values = Method_Colour[-7])+ + scale_shape_manual(values = Method_Shape_Types[-7])+ + theme_bw()+Theme_special()+ + guides(colour = guide_legend(nrow = 3))->p2 + +ggarrange(p1,p2,nrow = 2,ncol = 1,labels = "auto") ``` ## References diff --git a/vignettes/Benchmarking_model_based.Rmd b/vignettes/Benchmarking_model_based.Rmd new file mode 100644 index 0000000..a422fa9 --- /dev/null +++ b/vignettes/Benchmarking_model_based.Rmd @@ -0,0 +1,271 @@ +--- +title: "Benchmarking model-based subsampling functions" +resource_files: + - additionaldata/Results_Model_based.Rdata +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Benchmarking model-based subsampling functions} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + + + +```{r knitr options,include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + +```{r load packages and theme,message=FALSE} +# Load the R packages +library(ggplot2) +library(ggpubr) +library(tidyr) +library(dplyr) + +# Theme for plots +Theme_special<-function(){ + theme(legend.key.width=unit(1,"cm"), + axis.text.x = element_text(color = "black",size=14), + axis.text.y = element_text(color = "black",size=14), + strip.text = element_text(colour = "black",size = 16,face="bold"), + panel.grid.minor.x = element_blank(), + axis.title= element_text(color = "black",face = "bold",size = 14), + legend.text = element_text(color = "black", size=16), + legend.position = "bottom", + legend.margin=margin(0,0,0,0), + legend.box.margin=margin(-1,-2,-1,-2)) +} + +load("additionaldata/Results_Model_based.Rdata") +``` + +## Setup + +The `lm()`, `biglm()`, `glm()` and `bigglm()` functions are benchmarked against the model-based subsampling functions. +Benchmarking is conducted across all three regression problems using a consistent setup throughout. +For the big data sizes $N = \{10^4,5 \times 10^4,10^5,5 \times 10^5,10^6,5 \times 10^6\}$ and for five different covariate sizes $\{5,10,25,50,100\}$, the functions were replicated $50$ times.
+When using the subsampling functions, the initial subsample sizes $\{100,100,250,500,1000\}$ were selected based on the covariate sizes, while the final subsample size was fixed at $2500$ to ensure successful implementation. + +```{r N and Covariate size} +N_size<-c(1,5,10,50,100,500)*10000 +N_size_labels<-c("10^4","5 x 10^4","10^5","5 x 10^5","10^6","5 x 10^6") +CS_size<-c(5,10,25,50,100) +``` + +This implementation was conducted on a high-performance computing system; however, the code for linear regression-related functions is shown below. +Furthermore, it can be extended to logistic and Poisson regression by incorporating the relevant subsampling functions and model configurations. + +```{r Code setup for benchmarking,eval=FALSE} +# load the packages +library(NeEDS4BigData) +library(here) +library(biglm) + +# indexes for N and covariate sizes +indexA <- as.numeric(Sys.getenv("indexA")) +indexB <- as.numeric(Sys.getenv("indexB")) + +# set N and covariate size, and assign replicates +N_size<-c(1,5,10,50,100,500)*10000 +Covariate_size<-c(5,10,25,50,100) +Replicates<-50 + +# set the initial and final subsample sizes, +# with the distribution parameters to generate data +r1<-c(1,1,2.5,5,10)*100; r2<-2500; +Dist<-"Normal"; Dist_Par<-list(Mean=0,Variance=1,Error_Variance=0.5); +Family<-"linear" + +# assign the indexes +N_idx<-indexA +Covariate_idx<-indexB +No_Of_Var<-Covariate_size[Covariate_idx] +N<-N_size[N_idx] + +# generate the big data based on N and covariate size +Beta<-c(-1,rep(0.5,Covariate_size[Covariate_idx])) +Generated_Data<-GenGLMdata(Dist,Dist_Par,No_Of_Var,Beta,N,Family) +Full_Data<-Generated_Data$Complete_Data + +cat("N size :",N_size[N_idx]," and Covariate size :",Covariate_size[Covariate_idx],"\n") + +# formula for the linear regression model +lm_formula<-as.formula(paste("Y ~", paste(paste0("X",0:ncol(Full_Data[,-c(1,2)])), + collapse = " + "))) + +# benchmarking the stats::lm() function +lm<-sapply(1:Replicates,function(i){ + start_T<-Sys.time() + suppressMessages(lm(Y~.-1,data=data.frame(Full_Data))) + return(difftime(Sys.time(),start_T,units = "secs")) +}) + +cat("Benchmarked lm() function.\n") + +# benchmarking the biglm::biglm() function +biglm<-sapply(1:Replicates,function(i){ + start_T<-Sys.time() + suppressMessages(biglm(lm_formula,data=data.frame(Full_Data))) + return(difftime(Sys.time(),start_T,units = "secs")) +}) + +cat("Benchmarked biglm() function.\n") + +# benchmarking the NeEDS4BigData::AoptimalGauLMSub() function +AoptimalGauLMSub<-sapply(1:Replicates,function(i){ + start_T<-Sys.time() + suppressMessages(AoptimalGauLMSub(r1[Covariate_idx],r2, + Y=as.matrix(Full_Data[,1]), + X=as.matrix(Full_Data[,-1]), + N=nrow(Full_Data))) + return(difftime(Sys.time(),start_T,units = "secs")) +}) + +cat("Benchmarked AoptimalGauLMSub() function.\n") + +# benchmarking the NeEDS4BigData::LeverageSampling() function +LeverageSampling<-sapply(1:Replicates,function(i){ + start_T<-Sys.time() + suppressMessages(LeverageSampling(r2, + Y=as.matrix(Full_Data[,1]), + X=as.matrix(Full_Data[,-1]), + N=nrow(Full_Data), + S_alpha=0.95, + family=Family)) + return(difftime(Sys.time(),start_T,units = "secs")) +}) + +cat("Benchmarked LeverageSampling() function.\n") + +# benchmarking the NeEDS4BigData::ALoptimalGLMSub() function +ALoptimalGLMSub<-sapply(1:Replicates,function(i){ + start_T<-Sys.time() + suppressMessages(ALoptimalGLMSub(r1[Covariate_idx],r2, + Y=as.matrix(Full_Data[,1]), + X=as.matrix(Full_Data[,-1]), + N=nrow(Full_Data), + family=Family)) + 
return(difftime(Sys.time(),start_T,units = "secs")) +}) + +cat("Benchmarked ALoptimalGLMSub() function.\n") + +# benchmarking the NeEDS4BigData::AoptimalMCGLMSub() function +AoptimalMCGLMSub<-sapply(1:Replicates,function(i){ + start_T<-Sys.time() + suppressMessages(AoptimalMCGLMSub(r1[Covariate_idx],r2, + Y=as.matrix(Full_Data[,1]), + X=as.matrix(Full_Data[,-1]), + N=nrow(Full_Data), + family=Family)) + return(difftime(Sys.time(),start_T,units = "secs")) +}) + +cat("Benchmarked AoptimalMCGLMSub() function.\n") + +Plot_Data<-cbind.data.frame("N size"=N_size[N_idx], + "Covariate size"=Covariate_size[Covariate_idx],"lm()"=lm, + "biglm()"=biglm,"AoptimalGauLMSub()"=AoptimalGauLMSub, + "LeverageSampling()"=LeverageSampling, + "ALoptimalGLMSub()"=ALoptimalGLMSub, + "AoptimalMCGLMSub()"=AoptimalMCGLMSub) + +save(Plot_Data, + file=here("Results",paste0("Output_NS_",N_idx,"_CS_",Covariate_idx,".RData"))) +``` + +## Linear Regression + +For linear regression, the following functions from the NeEDS4BigData package are compared against `lm()` and `biglm()`: (1) `AoptimalGauLMSub()`, (2) `LeverageSampling()`, (3) `ALoptimalGLMSub()`, and (4) `AoptimalMCGLMSub()`. + +```{r linear regression,fig.width=5*3,fig.height=5*2,fig.align='center',fig.cap="Average time for functions, with 5% and 95% percentile intervals under linear regression."} +Methods_FCT<-c("lm()","biglm()","LeverageSampling()","AoptimalMCGLMSub()", + "AoptimalGauLMSub()","ALoptimalGLMSub()") +Method_Colors<-c("grey","black","#F76D5E","#A50021","#BBFFBB","#50FF50") + +Final_Linear_Regression %>% + pivot_longer(cols = `lm()`:`AoptimalMCGLMSub()`, + names_to = "Methods",values_to = "Time") %>% + group_by(`N size`,`Covariate size`,Methods) %>% + summarise(Mean=mean(Time),min=quantile(Time,0.05),max=quantile(Time,0.95), + .groups = "drop") %>% + mutate(Methods=factor(Methods,levels = Methods_FCT,labels = Methods_FCT), + `N size`=factor(`N size`,levels = N_size, + labels=paste0("N = ",N_size_labels))) %>% +ggplot(.,aes(x=factor(`Covariate size`),y=Mean,color=Methods,group=Methods))+ + geom_point(position = position_dodge(width = 0.5))+ + geom_line(position = position_dodge(width = 0.5))+ + geom_errorbar(aes(ymin=min,ymax=max),position = position_dodge(width = 0.5))+ + facet_wrap(~factor(`N size`),scales = "free",nrow=2)+ + xlab("Number of Covariates")+ylab("Time in Seconds")+ + scale_color_manual(values = Method_Colors)+ + theme_bw()+ggtitle("Linear Regression")+Theme_special() +``` + +In general, our subsampling functions perform faster as the size of the big data and the number of covariates increase, except for `LeverageSampling()`. +Even though the solution for linear regression is available in analytical form, the subsampling functions perform as well as `biglm()`. + +## Logistic Regression + +For logistic regression, the following functions are compared against `glm()` and `bigglm()`: (1) `LCCsampling()`, (2) `LeverageSampling()`, (3) `AoptimalMCGLMSub()`, and (4) `ALoptimalGLMSub()`.
+ +```{r logistic regression,fig.width=5*3,fig.height=5*2,fig.align='center',fig.cap="Average time for functions, with 5% and 95% percentile intervals under logistic regression."} +Methods_FCT<-c("glm()","bigglm()","LCCSampling()","LeverageSampling()", + "AoptimalMCGLMSub()","ALoptimalGLMSub()") +Method_Colors<-c("grey","black","#F76D5E","#A50021","#BBFFBB","#50FF50") + +Final_Logistic_Regression %>% + pivot_longer(cols = `glm()`:`AoptimalMCGLMSub()`, + names_to = "Methods",values_to = "Time") %>% + group_by(`N size`,`Covariate size`,Methods) %>% + summarise(Mean=mean(Time),min=quantile(Time,0.05),max=quantile(Time,0.95), + .groups = "drop") %>% + mutate(Methods=factor(Methods,levels = Methods_FCT,labels = Methods_FCT), + `N size`=factor(`N size`,levels = N_size, + labels=paste0("N = ",N_size_labels))) %>% +ggplot(.,aes(x=factor(`Covariate size`),y=Mean,color=Methods,group=Methods))+ + geom_point(position = position_dodge(width = 0.5))+ + geom_line(position = position_dodge(width = 0.5))+ + geom_errorbar(aes(ymin=min,ymax=max),position = position_dodge(width = 0.5))+ + facet_wrap(~factor(`N size`),scales = "free",nrow=2)+ + xlab("Number of Covariates")+ylab("Time in Seconds")+ + scale_color_manual(values = Method_Colors)+ + theme_bw()+Theme_special()+ggtitle("Logistic Regression") +``` + +There seems to be little difference between using `glm()` and `bigglm()`, while the subsampling functions perform faster, with the performance gap increasing as the size of the big data and the number of covariates grow. + +## Poisson Regression + +For Poisson regression, the following functions: (1) `LeverageSampling()`, (2) `AoptimalMCGLMSub()`, and (3) `ALoptimalGLMSub()` are compared against `glm()` and `bigglm()`. + +```{r poisson regression, fig.width=5*3,fig.height=5*2,fig.align='center',fig.cap="Average time for functions, with 5% and 95% percentile intervals under Poisson regression."} +Methods_FCT<-c("glm()","bigglm()","LeverageSampling()", + "AoptimalMCGLMSub()","ALoptimalGLMSub()") +Method_Colors<-c("grey","black","#A50021","#BBFFBB","#50FF50") + +Final_Poisson_Regression %>% + pivot_longer(cols = `glm()`:`AoptimalMCGLMSub()`, + names_to = "Methods",values_to = "Time") %>% + group_by(`N size`,`Covariate size`,Methods) %>% + summarise(Mean=mean(Time),min=quantile(Time,0.05),max=quantile(Time,0.95), + .groups = "drop") %>% + mutate(Methods=factor(Methods,levels = Methods_FCT,labels = Methods_FCT), + `N size`=factor(`N size`,levels = N_size, + labels=paste0("N = ",N_size_labels))) %>% +ggplot(.,aes(x=factor(`Covariate size`),y=Mean,color=Methods,group=Methods))+ + geom_point(position = position_dodge(width = 0.5))+ + geom_line(position = position_dodge(width = 0.5))+ + geom_errorbar(aes(ymin=min,ymax=max),position = position_dodge(width = 0.5))+ + facet_wrap(~factor(`N size`),scales = "free",nrow=2)+ + xlab("Number of Covariates")+ylab("Time in Seconds")+ + scale_color_manual(values = Method_Colors)+ + theme_bw()+Theme_special()+ggtitle("Poisson Regression") +``` + +Similar to logistic regression, the subsampling functions perform faster than the `glm()` and `bigglm()` functions. + +In summary, the subsampling functions available in this R package perform best under high-dimensional data. 
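The per-function timing pattern repeated throughout the benchmarking code can be factored into a small helper. A minimal sketch, assuming only base R; `time_replicates()` and the toy data are illustrative and not part of the package:

```r
# Hypothetical helper: evaluate an expression repeatedly and record
# the wall-clock time of each run in seconds
time_replicates <- function(expr, replicates = 50) {
  e <- substitute(expr)   # capture the expression unevaluated
  pf <- parent.frame()    # evaluate it in the caller's environment
  sapply(seq_len(replicates), function(i) {
    start_T <- Sys.time()
    eval(e, envir = pf)
    as.numeric(difftime(Sys.time(), start_T, units = "secs"))
  })
}

# Toy usage: time a small linear model fit 10 times
Full_Data <- data.frame(Y = rnorm(1000), X1 = rnorm(1000))
summary(time_replicates(lm(Y ~ X1, data = Full_Data), replicates = 10))
```

This reproduces the `sapply()`/`Sys.time()` pattern used for each benchmarked function above.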
diff --git a/vignettes/Benchmarking_model_robust.Rmd b/vignettes/Benchmarking_model_robust.Rmd new file mode 100644 index 0000000..225b46d --- /dev/null +++ b/vignettes/Benchmarking_model_robust.Rmd @@ -0,0 +1,284 @@ +--- +title: "Benchmarking model-robust subsampling functions" +resource_files: + - additionaldata/Results_Model_robust.Rdata +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Benchmarking model-robust subsampling functions} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + + + +```{r knitr options,include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + +```{r load packages and theme,message=FALSE} +# Load the R packages +library(ggplot2) +library(ggpubr) +library(tidyr) +library(dplyr) + +# Theme for plots +Theme_special<-function(){ + theme(legend.key.width=unit(1,"cm"), + axis.text.x = element_text(color = "black",size=14), + axis.text.y = element_text(color = "black",size=14), + strip.text = element_text(colour = "black",size = 16,face="bold"), + panel.grid.minor.x = element_blank(), + axis.title= element_text(color = "black",face = "bold",size = 14), + legend.text = element_text(color = "black", size=16), + legend.position = "bottom", + legend.margin=margin(0,0,0,0), + legend.box.margin=margin(-1,-2,-1,-2)) +} + +load("additionaldata/Results_Model_robust.Rdata") +``` + +## Setup + +The `lm()`, `biglm()`, `glm()`, and `bigglm()` functions are benchmarked against the model-robust subsampling functions. +This benchmarking is conducted across all three regression problems using a consistent setup throughout. +For large data sizes $N = \{10^4,5 \times 10^4,10^5,5 \times 10^5,10^6,5 \times 10^6\}$, three different covariate sizes $\{10,25,50\}$, and three different model sizes $\{3,5,10\}$, the functions were replicated 50 times. +When using the model-robust subsampling functions, the initial subsample sizes $\{100,250,500\}$ were selected based on the covariate sizes, while the final subsample size was fixed at $2500$. +Here, each candidate model extends the main effects model with one squared term, and the models included in the model set are selected based on their AIC values. + +```{r N and Covariate size} +N_size<-c(1,5,10,50,100,500)*10000 +N_size_labels<-c("10^4","5 x 10^4","10^5","5 x 10^5","10^6","5 x 10^6") +CS_size<-c(10,25,50) +NOM_size<-c(3,5,10) +N_and_CS_size <- c(t(outer(CS_size, N_size_labels, + function(cs, n) paste0("N = ",n, + " and \nNumber of Covariates ",cs)))) +``` + +This implementation was conducted on a high-performance computing system; however, the code for linear regression-related functions is shown below. +Furthermore, it can be extended to logistic and Poisson regression using the relevant subsampling functions and model configurations.
+ +```{r Code setup for benchmarking,eval=FALSE} +# load the packages +library(NeEDS4BigData) +library(here) +library(biglm) + +# indexes for N, covariate sizes and number of models +indexA <- as.numeric(Sys.getenv("indexA")) +indexB <- as.numeric(Sys.getenv("indexB")) +indexC <- as.numeric(Sys.getenv("indexC")) + +# set N, covariate size, number of models and assign replicates +N_size<-c(1,5,10,50,100,500)*10000 +Covariate_size<-c(10,25,50) +No_of_Models<-c(3,5,10) +Replicates<-50 + +# set the initial and final subsample sizes, +# with the distribution parameters to generate data +r1<-c(1,2.5,5)*100; r2<-2500; +Dist<-"Normal"; Dist_Par<-list(Mean=0,Variance=1,Error_Variance=0.5); +Family<-"linear" + +# assign the indexes +N_idx<-indexA; Covariate_idx<-indexB; ModelSet_idx<-indexC +No_Of_Var<-Covariate_size[Covariate_idx] +N<-N_size[N_idx] +Model_Set<-No_of_Models[ModelSet_idx] + +# generate the big data based on N and covariate size +Beta<-c(-1,rep(0.5,Covariate_size[Covariate_idx])) +Generated_Data<-GenGLMdata(Dist,Dist_Par,No_Of_Var,Beta,N,Family) +Full_Data<-Generated_Data$Complete_Data + +# Find the models in the model set for model-robust subsampling +# based on AIC values +Full_Data<-cbind(Full_Data,Full_Data[,-c(1,2)]^2) +colnames(Full_Data)<-c("Y",paste0("X",0:No_Of_Var),paste0("X",1:No_Of_Var,"^2")) +Temp_Data<-Full_Data[sample(1:N,1000),] + +AIC_Values<-NULL +for (i in 1:No_Of_Var) { + temp_data<-as.data.frame(Temp_Data[,c("Y",paste0("X",0:No_Of_Var),paste0("X",i,"^2"))]) + model <- lm(Y~.-1, data = temp_data) + AIC_Values[i]<-AIC(model) +} + +# Covariates in model set +Best_Indices <- order(AIC_Values)[1:Model_Set] +Model_Apriori<-rep(1/Model_Set,Model_Set) +All_Combinations<-lm_formula<-list(); +for (NOM_idx in 1:Model_Set) { + All_Combinations[[NOM_idx]]<-c(paste0("X",0:No_Of_Var), + paste0("X",Best_Indices[NOM_idx],"^2")) + lm_formula[[NOM_idx]]<-as.formula(paste("Y ~", + paste(c(paste0("X",0:No_Of_Var), + paste0("X",Best_Indices[NOM_idx],"^2")), + collapse = " + "))) +} + +cat("N size :",N_size[N_idx]," and Covariate size :",Covariate_size[Covariate_idx], + " with the model set ",No_of_Models[ModelSet_idx],"\n") + +# benchmarking the stats::lm() function +lm<-sapply(1:Replicates,function(i){ + start_T<-Sys.time() + for(NOM_idx in 1:length(lm_formula)){ + suppressMessages(lm(lm_formula[[NOM_idx]],data=data.frame(Full_Data))) + } + return(difftime(Sys.time(),start_T,units = "secs")) +}) + +cat("Benchmarked lm() function.\n") + +# benchmarking the biglm::biglm() function +biglm<-sapply(1:Replicates,function(i){ + start_T<-Sys.time() + for (NOM_idx in 1:length(lm_formula)) { + suppressMessages(biglm(lm_formula[[NOM_idx]],data=data.frame(Full_Data))) + } + return(difftime(Sys.time(),start_T,units = "secs")) +}) + +cat("Benchmarked biglm() function.\n") + +# benchmarking the NeEDS4BigData::modelRobustLinSub() function +modelRobustLinSub<-sapply(1:Replicates,function(i){ + start_T<-Sys.time() + suppressMessages(modelRobustLinSub(r1[Covariate_idx],r2, + Y=as.matrix(Full_Data[,1]), + X=as.matrix(Full_Data[,-1]), + N=nrow(Full_Data), + Apriori_probs=Model_Apriori, + All_Combinations = All_Combinations, + All_Covariates = colnames(Full_Data[,-1]))) + return(difftime(Sys.time(),start_T,units = "secs")) +}) + +cat("Benchmarked modelRobustLinSub() function.") + +Plot_Data<-cbind.data.frame("N size"=N_size[N_idx], + "Covariate size"=Covariate_size[Covariate_idx], + "No of Models"= No_of_Models[ModelSet_idx], + "lm()"=lm,"biglm()"=biglm, + "modelRobustLinSub()"=modelRobustLinSub) + 
+save(Plot_Data, + file=here("Results",paste0("Output_NS_",N_idx,"_CS_",Covariate_idx, + "_MS_",ModelSet_idx,".RData"))) +``` + +## Linear Regression + +For linear regression, the `modelRobustLinSub()` function from the NeEDS4BigData package is compared against `lm()` and `biglm()`. + +```{r linear regression, fig.height=4*3,fig.width=4*6,fig.align='center',fig.cap="Average time for functions, with 5% and 95% percentile intervals under model-robust linear regression."} +Methods_FCT<-c("lm()","biglm()","modelRobustLinSub()") +Method_Colors<-c("grey","black","#50FF50") + +Final_Linear_Regression %>% + pivot_longer(cols = `lm()`:`modelRobustLinSub()`, + names_to = "Methods",values_to = "Time") %>% + mutate(Methods=factor(Methods,levels = Methods_FCT,labels = Methods_FCT), + `N size`=factor(`N size`,levels = N_size,labels = N_size_labels), + `N and Covariate size`=paste0("N = ",`N size`, + " and \nNumber of Covariates ", + `Covariate size`)) %>% + mutate(`N and Covariate size`=factor(`N and Covariate size`, + levels = N_and_CS_size, + labels = N_and_CS_size)) %>% + select(`N and Covariate size`,`No of Models`,Methods,Time) %>% + group_by(`N and Covariate size`,`No of Models`,Methods) %>% + summarise(Mean=mean(Time),min=quantile(Time,0.05),max=quantile(Time,0.95), + .groups = "drop") %>% + ggplot(.,aes(x=factor(`No of Models`),y=Mean,color=Methods,group=Methods))+ + geom_point(position = position_dodge(width = 0.5))+ + geom_line(position = position_dodge(width = 0.5))+ + geom_errorbar(aes(ymin=min,ymax=max),position = position_dodge(width = 0.5))+ + facet_wrap(.~factor(`N and Covariate size`),scales = "free",ncol=length(N_size))+ + xlab("Number of Models")+ylab("Time in Seconds")+ + scale_color_manual(values = Method_Colors)+ + theme_bw()+Theme_special()+ggtitle("Linear Regression") +``` + +In general, our subsampling functions perform faster as the size of the big data, the number of covariates, and the number of models increase. +Even though the solution for linear regression is available in analytical form, the subsampling functions perform better than or as well as `biglm()`. + +## Logistic Regression + +For logistic regression, the `modelRobustLogSub()` function is compared against `glm()` and `bigglm()`.
+ +```{r logistic regression, fig.height=4*3,fig.width=4*6,fig.align='center',fig.cap="Average time for functions, with 5% and 95% percentile intervals under model-robust logistic regression."} +Methods_FCT<-c("glm()","bigglm()","modelRobustLogSub()") +Method_Colors<-c("grey","black","#50FF50") + +Final_Logistic_Regression %>% + pivot_longer(cols = `glm()`:`modelRobustLogSub()`, + names_to = "Methods",values_to = "Time") %>% + mutate(Methods=factor(Methods,levels = Methods_FCT,labels = Methods_FCT), + `N size`=factor(`N size`,levels = N_size,labels = N_size_labels), + `N and Covariate size`=paste0("N = ",`N size`, + " and \nNumber of Covariates ", + `Covariate size`)) %>% + mutate(`N and Covariate size`=factor(`N and Covariate size`, + levels = N_and_CS_size, + labels = N_and_CS_size)) %>% + select(`N and Covariate size`,`No of Models`,Methods,Time) %>% + group_by(`N and Covariate size`,`No of Models`,Methods) %>% + summarise(Mean=mean(Time),min=quantile(Time,0.05),max=quantile(Time,0.95), + .groups = "drop") %>% + ggplot(.,aes(x=factor(`No of Models`),y=Mean,color=Methods,group=Methods))+ + geom_point(position = position_dodge(width = 0.5))+ + geom_line(position = position_dodge(width = 0.5))+ + geom_errorbar(aes(ymin=min,ymax=max),position = position_dodge(width = 0.5))+ + facet_wrap(.~factor(`N and Covariate size`),scales = "free",ncol=length(N_size))+ + xlab("Number of Models")+ylab("Time in Seconds")+ + scale_color_manual(values = Method_Colors)+ + theme_bw()+Theme_special()+ggtitle("Logistic Regression") +``` + +There seems to be a significant difference between using `glm()` and `bigglm()`, with the model-robust subsampling function performing faster than both. +This performance gap increases as the size of the big data, the number of covariates, and the number of models grow. + +## Poisson Regression + +For Poisson regression, the function `modelRobustPoiSub()` is compared against `glm()` and `bigglm()`.
+ +```{r poisson regression, fig.height=4*3,fig.width=4*6,fig.align='center',fig.cap="Average time for functions, with 5% and 95% percentile intervals under model-robust Poisson regression."} +Methods_FCT<-c("glm()","bigglm()","modelRobustPoiSub()") +Method_Colors<-c("grey","black","#50FF50") + +Final_Poisson_Regression %>% + pivot_longer(cols = `glm()`:`modelRobustPoiSub()`, + names_to = "Methods",values_to = "Time") %>% + mutate(Methods=factor(Methods,levels = Methods_FCT,labels = Methods_FCT), + `N size`=factor(`N size`,levels = N_size,labels = N_size_labels), + `N and Covariate size`=paste0("N = ",`N size`, + " and \nNumber of Covariates ", + `Covariate size`)) %>% + mutate(`N and Covariate size`=factor(`N and Covariate size`, + levels = N_and_CS_size, + labels = N_and_CS_size)) %>% + select(`N and Covariate size`,`No of Models`,Methods,Time) %>% + group_by(`N and Covariate size`,`No of Models`,Methods) %>% + summarise(Mean=mean(Time),min=quantile(Time,0.05),max=quantile(Time,0.95), + .groups = "drop") %>% + ggplot(.,aes(x=factor(`No of Models`),y=Mean,color=Methods,group=Methods))+ + geom_point(position = position_dodge(width = 0.5))+ + geom_line(position = position_dodge(width = 0.5))+ + geom_errorbar(aes(ymin=min,ymax=max),position = position_dodge(width = 0.5))+ + facet_wrap(.~factor(`N and Covariate size`),scales = "free",ncol=length(N_size))+ + xlab("Number of Models")+ylab("Time in Seconds")+ + scale_color_manual(values = Method_Colors)+ + theme_bw()+Theme_special()+ggtitle("Poisson Regression") +``` + +Similar to logistic regression, the model-robust subsampling function performs faster than the `glm()` and `bigglm()` functions. + +In summary, the model-robust subsampling functions available in this R package perform best under high-dimensional data. diff --git a/vignettes/Linear_Regression.Rmd b/vignettes/Linear_Regression.Rmd index bfe25ab..9974603 100644 --- a/vignettes/Linear_Regression.Rmd +++ b/vignettes/Linear_Regression.Rmd @@ -46,7 +46,7 @@ Theme_special<-function(){ # Understanding the electric consumption data ``Electric power consumption'' data [@hebrailindividual], which contains $2,049,280$ measurements for a house located at Sceaux, France between December 2006 and November 2010. -The data contains $4$ columns and has $2,049,280$ observations, first column is the response variable and the rest are covariates, however we only use the first $1\%$ of the data for this explanation. +The data contains $4$ columns and $2,049,280$ observations; the first column is the response variable and the rest are covariates; however, we only use the first $10\%$ of the data for this explanation. The response $y$ is the log scaled intensity, while the covariates are active electrical energy in the a) kitchen ($X_1$), b) laundry room ($X_2$) and c) water-heater and air-conditioner ($X_3$). The covariates are scaled to have a mean of zero and a variance of one. @@ -56,8 +56,8 @@ Given data is analysed under two different scenarios, 2. subsampling method assuming the main effects model is potentially misspecified.
```{r Exploring data} -# Selecting 1% of the big data and prepare it -indexes<-1:ceiling(nrow(Electric_consumption)*0.01) +# Selecting 10% of the big data and preparing it +indexes<-1:ceiling(nrow(Electric_consumption)*0.10) Original_Data<-cbind(Electric_consumption[indexes,1],1, Electric_consumption[indexes,-1]) colnames(Original_Data)<-c("Y",paste0("X",0:ncol(Original_Data[,-c(1,2)]))) @@ -71,14 +71,14 @@ head(Original_Data) %>% caption = "First few observations of the electric consumption data.") # Setting the sample sizes -N<-nrow(Original_Data); M<-250; k<-seq(6,18,by=3)*100; rep_k<-rep(k,each=M) +N<-nrow(Original_Data); M<-250; k<-seq(8,20,by=2)*100; rep_k<-rep(k,each=M) ``` ## Model robust or average subsampling The method $A$- and $L$-optimality of model robust or average subsampling [@mahendran2023model] is compared against the $A$- and $L$-optimality subsampling [@ai2021optimal;@yao2021review] method. Here five different models are considered: 1) main effects model ($\beta_0+\beta_1X_1+\beta_2X_2+\beta_3X_3$), 2-4) main effects model with the squared term of each covariate ($X^2_1 / X^2_2 / X^2_3$) and 5) main effects model with all the squared terms ($\beta_0+\beta_1X_1+\beta_2X_2+\beta_3X_3+\beta_4X^2_1+\beta_5X^2_2+\beta_6X^2_3$). -For each model $l$ the mean squared error of the model parameters $MSE_l(\tilde{\beta}_k,\hat{\beta})=\frac{1}{MJ} \sum_{i=1}^M \sum_{j=1}^J (\tilde{\beta}_{k,j} - \hat{\beta}_j)^2$ are calculated for the $M=100$ simulations across the sample sizes $k=(600,\ldots,1500)$ and the initial sample size is $r1=300$. +For each model $l$ the mean squared error of the model parameters $MSE_l(\tilde{\beta}_k,\hat{\beta})=\frac{1}{MJ} \sum_{i=1}^M \sum_{j=1}^J (\tilde{\beta}_{k,j} - \hat{\beta}_j)^2$ is calculated for the $M=250$ simulations across the sample sizes $k=(800,\ldots,2000)$ and the initial sample size is $r1=400$. Here, for the $l$-th model $\tilde{\beta}_k$ are the estimated model parameters from the sample of size $k$ and $\hat{\beta}$ are the estimated model parameters from the full big data, while $j$ is the index of the model parameter. ```{r Define the methods} @@ -118,7 +118,7 @@ Below is the code of implementation for this scenario.
```{r Equal apriori probabilities, fig.width=12,fig.height=12,fig.align='center',fig.cap="Mean squared error for all the models with equal apriori in the order a to e for Model 1 to 5 across the subsampling methods under comparison."} All_Covariates<-colnames(Original_Data_ModelRobust)[-1] # A- and L-optimality model robust subsampling for linear regression -NeEDS4BigData::modelRobustLinSub(r1=300,r2=rep_k, +NeEDS4BigData::modelRobustLinSub(r1=400,r2=rep_k, Y=as.matrix(Original_Data_ModelRobust[,1]), X=as.matrix(Original_Data_ModelRobust[,-1]),N=N, Apriori_probs=rep(1/length(All_Models),length(All_Models)), @@ -180,10 +180,10 @@ Method_Colour<-c("#D82632","#A50021","#BBFFBB","#50FF50","#00BB00") Method_Shape_Types<-c(rep(8,2),rep(17,3)) # A- and L-optimality and RLmAMSE model misspecified subsampling for linear regression -NeEDS4BigData::modelMissLinSub(r1=300,r2=rep_k, +NeEDS4BigData::modelMissLinSub(r1=400,r2=rep_k, Y=as.matrix(Original_Data[,1]), X=as.matrix(Original_Data[,-1]), - N=N,Alpha=10,proportion=0.8)->Results + N=N,Alpha=10,proportion=0.5)->Results Final_Beta_modelMiss<-Results$Beta_Estimates Final_AMSE_modelMiss<-Results$AMSE_Estimates diff --git a/vignettes/Logistic_Regression.Rmd b/vignettes/Logistic_Regression.Rmd index 8467bbc..bcf072c 100644 --- a/vignettes/Logistic_Regression.Rmd +++ b/vignettes/Logistic_Regression.Rmd @@ -70,14 +70,14 @@ head(Original_Data) %>% caption = "First few observations of the skin segmentation data.") # Setting the sample sizes -N<-nrow(Original_Data); M<-250; k<-seq(6,18,by=3)*100; rep_k<-rep(k,each=M) +N<-nrow(Original_Data); M<-250; k<-seq(8,20,by=2)*100; rep_k<-rep(k,each=M) ``` ## Model robust or average subsampling The method $A$- and $L$-optimality of model robust or average subsampling [@mahendran2023model] is compared against the $A$- and $L$-optimality subsampling [@ai2021optimal;@yao2021review] method. Here five different models are considered: 1) main effects model ($\beta_0+\beta_1X_1+\beta_2X_2+\beta_3X_3$), 2-4) main effects model with the squared term of each covariate ($X^2_1 / X^2_2 / X^2_3$) and 5) main effects model with all the squared terms ($\beta_0+\beta_1X_1+\beta_2X_2+\beta_3X_3+\beta_4X^2_1+\beta_5X^2_2+\beta_6X^2_3$). -For each model $j$ the mean squared error of the model parameters $MSE_l(\tilde{\beta}_k,\hat{\beta})=\frac{1}{MJ} \sum_{i=1}^M \sum_{j=1}^J (\tilde{\beta}_{k,j} - \hat{\beta}_j)^2$ are calculated for the $M=100$ simulations across the sample sizes $k=(600,\ldots,1500)$ and the initial sample size is $r1=300$. +For each model $l$ the mean squared error of the model parameters $MSE_l(\tilde{\beta}_k,\hat{\beta})=\frac{1}{MJ} \sum_{i=1}^M \sum_{j=1}^J (\tilde{\beta}_{k,j} - \hat{\beta}_j)^2$ is calculated for the $M=250$ simulations across the sample sizes $k=(800,\ldots,2000)$ and the initial sample size is $r1=400$. Here, for the $l$-th model $\tilde{\beta}_k$ are the estimated model parameters from the sample of size $k$ and $\hat{\beta}$ are the estimated model parameters from the full big data, while $j$ is the index of the model parameter. ```{r Define the methods} @@ -117,7 +117,7 @@ Below is the code of implementation for this scenario.
```{r Equal apriori,fig.width=12,fig.height=12,fig.align='center', fig.cap="Mean squared error for all the models with equal apriori in the order a to e for Model 1 to 5 across the subsampling methods under comparison."} All_Covariates<-colnames(Original_Data_ModelRobust)[-1] # A- and L-optimality model robust subsampling for logistic regression -NeEDS4BigData::modelRobustLogSub(r1=300,r2=rep_k, +NeEDS4BigData::modelRobustLogSub(r1=400,r2=rep_k, Y=as.matrix(Original_Data_ModelRobust[,1]), X=as.matrix(Original_Data_ModelRobust[,-1]), N=N,Apriori_probs=rep(1/length(All_Models),length(All_Models)), @@ -180,10 +180,10 @@ Method_Colour<-c("#D82632","#A50021","#BBFFBB","#50FF50","#00BB00") Method_Shape_Types<-c(rep(8,2),rep(17,3)) # A- and L-optimality and RLmAMSE model misspecified subsampling for logistic regression -NeEDS4BigData::modelMissLogSub(r1=300,r2=rep_k, +NeEDS4BigData::modelMissLogSub(r1=400,r2=rep_k, Y=as.matrix(Original_Data[,1]), X=as.matrix(Original_Data[,-1]), - N=N,Alpha=10, proportion = 0.8)->Results + N=N,Alpha=10, proportion = 1)->Results Final_Beta_modelMiss<-Results$Beta_Estimates Final_AMSE_modelMiss<-Results$AMSE_Estimates diff --git a/vignettes/Poisson_Regression.Rmd b/vignettes/Poisson_Regression.Rmd index c387d6c..fd8be42 100644 --- a/vignettes/Poisson_Regression.Rmd +++ b/vignettes/Poisson_Regression.Rmd @@ -68,14 +68,14 @@ head(Original_Data) %>% caption = "First few observations of the bike sharing data.") # Setting the sample sizes -N<-nrow(Original_Data); M<-250; k<-seq(6,18,by=3)*100; rep_k<-rep(k,each=M) +N<-nrow(Original_Data); M<-250; k<-seq(8,20,by=2)*100; rep_k<-rep(k,each=M) ``` ## Model robust or average subsampling The method $A$- and $L$-optimality of model robust or average subsampling [@mahendran2023model] is compared against the $A$- and $L$-optimality subsampling [@ai2021optimal;@yao2021review] method. Here five different models are considered: 1) main effects model ($\beta_0+\beta_1X_1+\beta_2X_2+\beta_3X_3$), 2-4) main effects model with the squared term of each covariate ($X^2_1 / X^2_2 / X^2_3$) and 5) main effects model with all the squared terms ($\beta_0+\beta_1X_1+\beta_2X_2+\beta_3X_3+\beta_4X^2_1+\beta_5X^2_2+\beta_6X^2_3$). -For each model $j$ the mean squared error of the model parameters $MSE_l(\tilde{\beta}_k,\hat{\beta})=\frac{1}{MJ} \sum_{i=1}^M \sum_{j=1}^J (\tilde{\beta}_{k,j} - \hat{\beta}_j)^2$ are calculated for the $M=100$ simulations across the sample sizes $k=(600,\ldots,1500)$ and the initial sample size is $r1=300$. +For each model $l$ the mean squared error of the model parameters $MSE_l(\tilde{\beta}_k,\hat{\beta})=\frac{1}{MJ} \sum_{i=1}^M \sum_{j=1}^J (\tilde{\beta}_{k,j} - \hat{\beta}_j)^2$ is calculated for the $M=250$ simulations across the sample sizes $k=(800,\ldots,2000)$ and the initial sample size is $r1=400$. Here, for the $l$-th model $\tilde{\beta}_k$ are the estimated model parameters from the sample of size $k$ and $\hat{\beta}$ are the estimated model parameters from the full big data, while $j$ is the index of the model parameter. ```{r define subsampling methods} @@ -115,7 +115,7 @@ Below is the code of implementation for this scenario.
```{r Equal apriori,fig.width=12,fig.height=12,fig.align='center',fig.cap="Mean squared error for all the models with equal apriori in the order a to e for Model 1 to 5 across the subsampling methods under comparison."} All_Covariates<-colnames(Original_Data_ModelRobust)[-1] # A- and L-optimality model robust subsampling for poisson regression -NeEDS4BigData::modelRobustPoiSub(r1=300,r2=rep_k, +NeEDS4BigData::modelRobustPoiSub(r1=400,r2=rep_k, Y=as.matrix(Original_Data_ModelRobust[,1]), X=as.matrix(Original_Data_ModelRobust[,-1]), N=N,Apriori_probs=rep(1/length(All_Models),length(All_Models)), @@ -178,10 +178,10 @@ Method_Colour<-c("#D82632","#A50021","#BBFFBB","#50FF50","#00BB00") Method_Shape_Types<-c(rep(8,2),rep(17,3)) # A- and L-optimality and RLmAMSE model misspecified subsampling for poisson regression -NeEDS4BigData::modelMissPoiSub(r1=300,r2=rep_k, +NeEDS4BigData::modelMissPoiSub(r1=400,r2=rep_k, Y=as.matrix(Original_Data[,1]), X=as.matrix(Original_Data[,-1]), - N=N,Alpha=10, proportion = 0.8)->Results + N=N,Alpha=10, proportion = 1)->Results Final_Beta_modelMiss<-Results$Beta_Estimates Final_AMSE_modelMiss<-Results$AMSE_Estimates diff --git a/vignettes/additionaldata/Results_Model_based.Rdata b/vignettes/additionaldata/Results_Model_based.Rdata new file mode 100644 index 0000000..27eb766 Binary files /dev/null and b/vignettes/additionaldata/Results_Model_based.Rdata differ diff --git a/vignettes/additionaldata/Results_Model_robust.Rdata b/vignettes/additionaldata/Results_Model_robust.Rdata new file mode 100644 index 0000000..b4f2f51 Binary files /dev/null and b/vignettes/additionaldata/Results_Model_robust.Rdata differ