diff --git a/.Rbuildignore b/.Rbuildignore
index ae62917..5b2ee19 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -21,4 +21,4 @@
^CRAN-SUBMISSION$
^doc$
^Meta$
-^vignettes$
+^vignettes
diff --git a/DESCRIPTION b/DESCRIPTION
index 5134b47..b455f9e 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -13,6 +13,8 @@ License: MIT + file LICENSE
URL:
https://github.com/Amalan-ConStat/NeEDS4BigData,https://amalan-constat.github.io/NeEDS4BigData/index.html
BugReports: https://github.com/Amalan-ConStat/NeEDS4BigData/issues
+Depends:
+ R (>= 4.0.0)
Imports:
dplyr,
foreach,
@@ -21,6 +23,7 @@ Imports:
ggplot2,
ggridges,
matrixStats,
+ mvnfast,
psych,
Rdpack,
Rfast,
@@ -42,6 +45,4 @@ Language: en-GB
LazyData: true
LazyDataCompression: xz
RoxygenNote: 7.3.1
-Depends:
- R (>= 3.5.0)
Config/testthat/edition: 3
diff --git a/NAMESPACE b/NAMESPACE
index f275c34..5c9db53 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -31,10 +31,13 @@ importFrom(Rdpack,reprompt)
importFrom(Rfast,rowprods)
importFrom(dplyr,group_by)
importFrom(dplyr,summarise)
+importFrom(gam,lo)
importFrom(gam,s)
importFrom(ggh4x,facet_grid2)
importFrom(matrixStats,rowSums2)
+importFrom(mvnfast,rmvn)
importFrom(psych,tr)
importFrom(rlang,.data)
importFrom(tidyr,pivot_longer)
importFrom(tidyr,starts_with)
+importFrom(utils,combn)
diff --git a/NeEDS4BigData.Rproj b/NeEDS4BigData.Rproj
index b6d689d..473c4a7 100644
--- a/NeEDS4BigData.Rproj
+++ b/NeEDS4BigData.Rproj
@@ -19,6 +19,6 @@ StripTrailingWhitespace: Yes
BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source
-PackageRoxygenize: rd,collate,namespace,vignette
+PackageRoxygenize: rd,collate,namespace
SpellingDictionary: en_GB
diff --git a/R/ALoptimalGLMSub.R b/R/ALoptimalGLMSub.R
index 03102b5..17dc365 100644
--- a/R/ALoptimalGLMSub.R
+++ b/R/ALoptimalGLMSub.R
@@ -64,8 +64,7 @@
#'
#' r1<-300; r2<-rep(600,50); Original_Data<-Full_Data$Complete_Data;
#'
-#' ALoptimalGLMSub(r1 = r1, r2 = r2,
-#' Y = as.matrix(Original_Data[,colnames(Original_Data) %in% c("Y")]),
+#' ALoptimalGLMSub(r1 = r1, r2 = r2,Y = as.matrix(Original_Data[,1]),
#' X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data),
#' family = "linear")->Results
#'
@@ -77,8 +76,7 @@
#'
#' r1<-300; r2<-rep(600,50); Original_Data<-Full_Data$Complete_Data;
#'
-#' ALoptimalGLMSub(r1 = r1, r2 = r2,
-#' Y = as.matrix(Original_Data[,colnames(Original_Data) %in% c("Y")]),
+#' ALoptimalGLMSub(r1 = r1, r2 = r2,Y = as.matrix(Original_Data[,1]),
#' X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data),
#' family = "logistic")->Results
#'
@@ -90,8 +88,7 @@
#'
#' r1<-300; r2<-rep(600,50); Original_Data<-Full_Data$Complete_Data;
#'
-#' ALoptimalGLMSub(r1 = r1, r2 = r2,
-#' Y = as.matrix(Original_Data[,colnames(Original_Data) %in% c("Y")]),
+#' ALoptimalGLMSub(r1 = r1, r2 = r2,Y = as.matrix(Original_Data[,1]),
#' X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data),
#' family = "poisson")->Results
#'
@@ -129,7 +126,7 @@ ALoptimalGLMSub <- function(r1,r2,Y,X,N,family){
if(family %in% c("linear")){
PI.prop <- rep(1/N, N)
- idx.prop <- sample(1:N, r1, T)
+ idx.prop <- sample(1:N, size = r1, replace = TRUE)
x.prop <- X[idx.prop,]
y.prop <- Y[idx.prop,]
@@ -137,8 +134,8 @@ ALoptimalGLMSub <- function(r1,r2,Y,X,N,family){
pinv.prop <- N
pinv.prop <- 1/PI.prop[idx.prop]
- beta.prop<-solve(a=t(x.prop)%*%x.prop,b=t(x.prop)%*%y.prop)
- Xbeta_Final<-as.vector(X%*%beta.prop)
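+ # crossprod(A) = t(A) %*% A and crossprod(A,B) = t(A) %*% B, avoiding an explicit transpose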
+ beta.prop<-solve(a=crossprod(x.prop),b=crossprod(x.prop,y.prop))
+ Xbeta_Final<-X%*%beta.prop
Var.prop<-sum((Y-Xbeta_Final)^2)/N
Epsilon.prop<-Y-Xbeta_Final
@@ -161,7 +158,7 @@ ALoptimalGLMSub <- function(r1,r2,Y,X,N,family){
PI.mVc<-PI.mVc/sum(PI.mVc)
## mMSE
- PI.mMSE<-sqrt(Epsilon.prop^2 * matrixStats::rowSums2((X %*% solve(t(X)%*%X))^2))
+ PI.mMSE<-sqrt(Epsilon.prop^2 * matrixStats::rowSums2((X %*% solve(crossprod(X)))^2))
PI.mMSE<-PI.mMSE/sum(PI.mMSE)
message("Step 1 of the algorithm completed.\n")
@@ -169,7 +166,7 @@ ALoptimalGLMSub <- function(r1,r2,Y,X,N,family){
for (i in 1:length(r2))
{
## mVc
- idx.mVc <- sample(1:N, r2[i]-r1, T, PI.mVc)
+ idx.mVc <- sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI.mVc)
x.mVc <- X[c(idx.mVc, idx.prop),]
y.mVc <- Y[c(idx.mVc, idx.prop)]
@@ -178,8 +175,8 @@ ALoptimalGLMSub <- function(r1,r2,Y,X,N,family){
pi4_r<-sqrt(r2[i]*pinv.mVc^(-1))
X_r4<-x.mVc/pi4_r
Y_r4<-y.mVc/pi4_r
- beta.prop<-solve(a=t(X_r4)%*%X_r4,b=t(X_r4)%*%Y_r4)
- Xbeta_Final<-as.vector(X%*%beta.prop)
+ beta.prop<-solve(a=crossprod(X_r4),b=crossprod(X_r4,Y_r4))
+ Xbeta_Final<-X%*%beta.prop
Var.prop<-sum((Y-Xbeta_Final)^2)/N
Sample.mVc[[i+1]]<-idx.mVc;
@@ -191,7 +188,7 @@ ALoptimalGLMSub <- function(r1,r2,Y,X,N,family){
}
## mMSE
- idx.mMSE <- sample(1:N, r2[i]-r1, T, PI.mMSE)
+ idx.mMSE <- sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI.mMSE)
x.mMSE <- X[c(idx.mMSE, idx.prop),]
y.mMSE <- Y[c(idx.mMSE, idx.prop)]
@@ -200,8 +197,8 @@ ALoptimalGLMSub <- function(r1,r2,Y,X,N,family){
pi4_r<-sqrt(r2[i]*pinv.mMSE^(-1))
X_r4<-x.mMSE/pi4_r
Y_r4<-y.mMSE/pi4_r
- beta.prop<-solve(a=t(X_r4)%*%X_r4,b=t(X_r4)%*%Y_r4)
- Xbeta_Final<-as.vector(X%*%beta.prop)
+ beta.prop<-solve(a=crossprod(X_r4),b=crossprod(X_r4,Y_r4))
+ Xbeta_Final<-X%*%beta.prop
Var.prop<-sum((Y-Xbeta_Final)^2)/N
Sample.mMSE[[i+1]]<-idx.mMSE;
@@ -242,7 +239,7 @@ ALoptimalGLMSub <- function(r1,r2,Y,X,N,family){
n0 <- N - n1
PI.prop <- rep(1/(2*n0), N)
PI.prop[Y==1] <- 1/(2*n1)
- idx.prop <- sample(1:N, r1, T, PI.prop)
+ idx.prop <- sample(1:N, size = r1, replace = TRUE, prob = PI.prop)
x.prop <- X[idx.prop,]
y.prop <- Y[idx.prop,]
@@ -274,7 +271,7 @@ ALoptimalGLMSub <- function(r1,r2,Y,X,N,family){
## mMSE
p.prop <- P.prop[idx.prop]
w.prop <- p.prop * (1 - p.prop)
- W.prop <- solve(t(x.prop) %*% (x.prop * w.prop * pinv.prop))
+ W.prop <- solve(crossprod(x.prop,x.prop * w.prop * pinv.prop))
PI.mMSE<-sqrt((Y - P.prop)^2 * matrixStats::rowSums2((X%*%W.prop)^2))
PI.mMSE <- PI.mMSE/sum(PI.mMSE)
@@ -284,7 +281,7 @@ ALoptimalGLMSub <- function(r1,r2,Y,X,N,family){
for (i in 1:length(r2))
{
## mVc
- idx.mVc <- sample(1:N, r2[i]-r1, T, PI.mVc)
+ idx.mVc <- sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI.mVc)
x.mVc <- X[c(idx.mVc, idx.prop), ]
y.mVc <- Y[c(idx.mVc, idx.prop)]
@@ -307,7 +304,7 @@ ALoptimalGLMSub <- function(r1,r2,Y,X,N,family){
Utility_mVc[i,-1]<-c(psych::tr(V_Final),det(solve(V_Final)))
## mMSE
- idx.mMSE <- sample(1:N, r2[i]-r1, T, PI.mMSE)
+ idx.mMSE <- sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI.mMSE)
x.mMSE <- X[c(idx.mMSE, idx.prop),]
y.mMSE <- Y[c(idx.mMSE, idx.prop)]
@@ -356,14 +353,14 @@ ALoptimalGLMSub <- function(r1,r2,Y,X,N,family){
}
if(family %in% "poisson"){
PI.prop <- rep(1/N, N)
- idx.prop <- sample(1:N, r1, T)
+ idx.prop <- sample(1:N, size = r1, replace = TRUE)
x.prop<-X[idx.prop,]
y.prop <- Y[idx.prop,]
pinv.prop <- N
pinv.prop <- 1/PI.prop[idx.prop]
- fit.prop <- stats::glm(y.prop~x.prop-1,family = "poisson")
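+ # quasipoisson yields the same coefficient estimates as poisson while also estimating a dispersion parameter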
+ fit.prop <- stats::glm(y.prop~x.prop-1,family = "quasipoisson")
beta.prop <- fit.prop$coefficients
if(anyNA(beta.prop)){
@@ -389,7 +386,7 @@ ALoptimalGLMSub <- function(r1,r2,Y,X,N,family){
## mMSE
w.prop <- P.prop[idx.prop]
- W.prop <- solve(t(x.prop) %*% (x.prop * w.prop * pinv.prop))
+ W.prop <- solve(crossprod(x.prop,x.prop * w.prop * pinv.prop))
PI.mMSE<-sqrt((Y - P.prop)^2 * matrixStats::rowSums2((X%*%W.prop)^2))
PI.mMSE <- PI.mMSE/sum(PI.mMSE)
@@ -399,13 +396,13 @@ ALoptimalGLMSub <- function(r1,r2,Y,X,N,family){
for (i in 1:length(r2))
{
## mVc
- idx.mVc <- sample(1:N, r2[i]-r1, T, PI.mVc)
+ idx.mVc <- sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI.mVc)
x.mVc <- X[c(idx.mVc, idx.prop),]
y.mVc <- Y[c(idx.mVc, idx.prop)]
pinv.mVc<-c(1 / PI.mVc[idx.mVc], pinv.prop)
- fit.mVc <-stats::glm(y.mVc~x.mVc-1, family = "poisson",weights=pinv.mVc)
+ fit.mVc <-stats::glm(y.mVc~x.mVc-1, family = "quasipoisson",weights=pinv.mVc)
Sample.mVc[[i+1]]<-idx.mVc;
beta.mVc[i,-1] <- fit.mVc$coefficients
@@ -422,13 +419,13 @@ ALoptimalGLMSub <- function(r1,r2,Y,X,N,family){
Utility_mVc[i,-1]<-c(psych::tr(V_Final),det(solve(V_Final)))
## mMSE
- idx.mMSE <- sample(1:N, r2[i]-r1, T, PI.mMSE)
+ idx.mMSE <- sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI.mMSE)
x.mMSE <- X[c(idx.mMSE, idx.prop),]
y.mMSE <- Y[c(idx.mMSE, idx.prop)]
pinv.mMSE<-c(1 / PI.mMSE[idx.mMSE], pinv.prop)
- fit.mMSE <- stats::glm(y.mMSE~x.mMSE-1, family = "poisson",weights=pinv.mMSE)
+ fit.mMSE <- stats::glm(y.mMSE~x.mMSE-1, family = "quasipoisson",weights=pinv.mMSE)
Sample.mMSE[[i+1]]<-idx.mMSE;
beta.mMSE[i,-1] <-fit.mMSE$coefficients
diff --git a/R/AoptimalGauLMSub.R b/R/AoptimalGauLMSub.R
index 8826d23..98a9dca 100644
--- a/R/AoptimalGauLMSub.R
+++ b/R/AoptimalGauLMSub.R
@@ -52,14 +52,14 @@
#'
#' r1<-300; r2<-rep(100*c(6,12),50); Original_Data<-Full_Data$Complete_Data;
#'
-#' AoptimalGauLMSub(r1 = r1, r2 = r2,
-#' Y = as.matrix(Original_Data[,colnames(Original_Data) %in% c("Y")]),
+#' AoptimalGauLMSub(r1 = r1, r2 = r2,Y = as.matrix(Original_Data[,1]),
#' X = as.matrix(Original_Data[,-1]),
#' N = nrow(Original_Data))->Results
#'
#' plot_Beta(Results)
#'
#' @importFrom Rdpack reprompt
+#' @importFrom matrixStats rowSums2
#' @export
AoptimalGauLMSub <- function(r1,r2,Y,X,N){
if(any(is.na(c(r1,r2,N))) | any(is.nan(c(r1,r2,N)))){
@@ -70,7 +70,7 @@ AoptimalGauLMSub <- function(r1,r2,Y,X,N){
stop("r1 or N has a value greater than length one")
}
- if(any(is.na(cbind(Y,X))) | any(is.nan(cbind(Y,X)))){
+ if(anyNA(Y) | anyNA(X) | any(is.nan(Y)) | any(is.nan(X))){
stop("NA or Infinite or NAN values in the Y or X")
}
@@ -83,7 +83,7 @@ AoptimalGauLMSub <- function(r1,r2,Y,X,N){
}
PI.prop <- rep(1/N, N)
- idx.prop <- sample(1:N, r1, T)
+ idx.prop <- sample(1:N, size = r1, replace = TRUE)
x.prop <- X[idx.prop,]
y.prop <- Y[idx.prop,]
@@ -91,8 +91,8 @@ AoptimalGauLMSub <- function(r1,r2,Y,X,N){
pinv.prop <- N
pinv.prop <- 1/PI.prop[idx.prop]
- beta.prop<-solve(a=t(x.prop)%*%x.prop,b=t(x.prop)%*%y.prop)
- Xbeta_Final<-as.vector(X%*%beta.prop)
+ beta.prop<-solve(a=crossprod(x.prop),b=crossprod(x.prop,y.prop))
+ Xbeta_Final<-X%*%beta.prop
Var.prop<-sum((Y-Xbeta_Final)^2)/N
Epsilon.prop<-Y-Xbeta_Final
@@ -101,7 +101,7 @@ AoptimalGauLMSub <- function(r1,r2,Y,X,N){
}
Second <- (Epsilon.prop^2 - Var.prop)^2/(4 * N^2 * Var.prop)
- ML_Inv <- solve(t(X)%*%X)
+ ML_Inv <- solve(crossprod(X))
beta.mMSE<-matrix(nrow = length(r2),ncol = ncol(X)+1 )
Var_Epsilon<-matrix(nrow = length(r2),ncol = 2)
@@ -114,7 +114,7 @@ AoptimalGauLMSub <- function(r1,r2,Y,X,N){
colnames(Var_Epsilon)<-c("r2","A-Optimality")
## mMSE
- PI.mMSE <- sqrt(Epsilon.prop^2 * rowSums((X %*% ML_Inv)^2) + Second)
+ PI.mMSE <- sqrt(Epsilon.prop^2 * matrixStats::rowSums2((X %*% ML_Inv)^2) + Second)
PI.mMSE <- PI.mMSE/sum(PI.mMSE)
message("Step 1 of the algorithm completed.\n")
@@ -122,7 +122,7 @@ AoptimalGauLMSub <- function(r1,r2,Y,X,N){
for (i in 1:length(r2))
{
## mMSE
- idx.mMSE <- sample(1:N, r2[i]-r1, T, PI.mMSE)
+ idx.mMSE <- sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI.mMSE)
x.mMSE <- X[c(idx.mMSE, idx.prop),]
y.mMSE <- Y[c(idx.mMSE, idx.prop)]
@@ -131,8 +131,8 @@ AoptimalGauLMSub <- function(r1,r2,Y,X,N){
pi4_r<-sqrt(r2[i]*pinv.mMSE^(-1))
X_r4<-x.mMSE/pi4_r
Y_r4<-y.mMSE/pi4_r
- beta.prop<-solve(a=t(X_r4)%*%X_r4,b=t(X_r4)%*%Y_r4)
- Xbeta_Final<-as.vector(X%*%beta.prop)
+ beta.prop<-solve(a=crossprod(X_r4),b=crossprod(X_r4,Y_r4))
+ Xbeta_Final<-X%*%beta.prop
Var.prop<-sum((Y-Xbeta_Final)^2)/N
Sample.mMSE[[i+1]]<-idx.mMSE;
diff --git a/R/AoptimalMCGLMSub.R b/R/AoptimalMCGLMSub.R
index 0806ca4..91dd46b 100644
--- a/R/AoptimalMCGLMSub.R
+++ b/R/AoptimalMCGLMSub.R
@@ -59,8 +59,7 @@
#'
#' r1<-300; r2<-rep(100*c(6,12),50); Original_Data<-Full_Data$Complete_Data;
#'
-#' AoptimalMCGLMSub(r1 = r1, r2 = r2,
-#' Y = as.matrix(Original_Data[,colnames(Original_Data) %in% c("Y")]),
+#' AoptimalMCGLMSub(r1 = r1, r2 = r2,Y = as.matrix(Original_Data[,1]),
#' X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data),
#' family = "linear")->Results
#'
@@ -72,8 +71,7 @@
#'
#' r1<-300; r2<-rep(100*c(6,12),50); Original_Data<-Full_Data$Complete_Data;
#'
-#' AoptimalMCGLMSub(r1 = r1, r2 = r2,
-#' Y = as.matrix(Original_Data[,colnames(Original_Data) %in% c("Y")]),
+#' AoptimalMCGLMSub(r1 = r1, r2 = r2,Y = as.matrix(Original_Data[,1]),
#' X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data),
#' family = "logistic")->Results
#'
@@ -85,8 +83,7 @@
#'
#' r1<-300; r2<-rep(100*c(6,12),50); Original_Data<-Full_Data$Complete_Data;
#'
-#' AoptimalMCGLMSub(r1 = r1, r2 = r2,
-#' Y = as.matrix(Original_Data[,colnames(Original_Data) %in% c("Y")]),
+#' AoptimalMCGLMSub(r1 = r1, r2 = r2,Y = as.matrix(Original_Data[,1]),
#' X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data),
#' family = "poisson")->Results
#'
@@ -94,7 +91,6 @@
#'
#' @importFrom Rdpack reprompt
#' @import stats
-#' @importFrom psych tr
#' @importFrom matrixStats rowSums2
#' @export
AoptimalMCGLMSub <- function(r1,r2,Y,X,N,family){
@@ -124,7 +120,7 @@ AoptimalMCGLMSub <- function(r1,r2,Y,X,N,family){
if(family %in% c("linear")){
PI.prop <- rep(1/N, N)
- idx.prop <- sample(1:N, r1, T)
+ idx.prop <- sample(1:N, size = r1, replace = TRUE)
x.prop <- X[idx.prop,]
y.prop <- Y[idx.prop,]
@@ -132,8 +128,8 @@ AoptimalMCGLMSub <- function(r1,r2,Y,X,N,family){
pinv.prop <- N
pinv.prop <- 1/PI.prop[idx.prop]
- beta.prop<-solve(a=t(x.prop)%*%x.prop,b=t(x.prop)%*%y.prop)
- Xbeta_Final<-as.vector(X%*%beta.prop)
+ beta.prop<-solve(a=crossprod(x.prop),b=crossprod(x.prop,y.prop))
+ Xbeta_Final<-X%*%beta.prop
Var.prop<-sum((Y-Xbeta_Final)^2)/N
Epsilon.prop<-Y-Xbeta_Final
@@ -152,7 +148,7 @@ AoptimalMCGLMSub <- function(r1,r2,Y,X,N,family){
colnames(Var_Epsilon)<-c("r2","A-Optimality")
## mMSE
- PI.mMSE<-sqrt(matrixStats::rowSums2((X %*% solve(t(X)%*%X))^2))
+ PI.mMSE<-sqrt(matrixStats::rowSums2((X %*% solve(crossprod(X)))^2))
PI.mMSE<-PI.mMSE/sum(PI.mMSE)
message("Step 1 of the algorithm completed.\n")
@@ -160,17 +156,17 @@ AoptimalMCGLMSub <- function(r1,r2,Y,X,N,family){
for (i in 1:length(r2))
{
## mMSE
- idx.mMSE <- sample(1:N, r2[i], T, PI.mMSE)
+ idx.mMSE <- sample(1:N, size = r2[i], replace = TRUE, prob = PI.mMSE)
- x.mMSE <- X[c(idx.mMSE),]
- y.mMSE <- Y[c(idx.mMSE)]
+ x.mMSE <- X[idx.mMSE,]
+ y.mMSE <- Y[idx.mMSE]
pinv.mMSE<-c(1 / PI.mMSE[idx.mMSE])
pi4_r<-sqrt(r2[i]*pinv.mMSE^(-1))
X_r4<-x.mMSE/pi4_r
Y_r4<-y.mMSE/pi4_r
- beta.prop<-solve(a=t(X_r4)%*%X_r4,b=t(X_r4)%*%Y_r4)
- Xbeta_Final<-as.vector(X%*%beta.prop)
+ beta.prop<-solve(a=crossprod(X_r4),b=crossprod(X_r4,Y_r4))
+ Xbeta_Final<-X%*%beta.prop
Var.prop<-sum((Y-Xbeta_Final)^2)/N
Sample.mMSE[[i+1]]<-idx.mMSE
@@ -209,7 +205,7 @@ AoptimalMCGLMSub <- function(r1,r2,Y,X,N,family){
n0 <- N - n1
PI.prop <- rep(1/(2*n0), N)
PI.prop[Y==1] <- 1/(2*n1)
- idx.prop <- sample(1:N, r1, T, PI.prop)
+ idx.prop <- sample(1:N, size = r1, replace = TRUE, prob = PI.prop)
x.prop <- X[idx.prop,]
y.prop <- Y[idx.prop,]
@@ -234,7 +230,7 @@ AoptimalMCGLMSub <- function(r1,r2,Y,X,N,family){
## mMSE
p.prop <- P.prop[idx.prop]
w.prop <- p.prop * (1 - p.prop)
- W.prop <- r1*solve(t(x.prop) %*% (x.prop * w.prop * pinv.prop))
+ W.prop <- r1*solve(crossprod(x.prop,(x.prop * w.prop * pinv.prop)))
PI.mMSE<-sqrt(P.prop*(1-P.prop))*sqrt(matrixStats::rowSums2((X%*%W.prop)^2))
PI.mMSE <- PI.mMSE/sum(PI.mMSE)
@@ -244,10 +240,10 @@ AoptimalMCGLMSub <- function(r1,r2,Y,X,N,family){
for (i in 1:length(r2))
{
## mMSE
- idx.mMSE <- sample(1:N, r2[i], T, PI.mMSE)
+ idx.mMSE <- sample(1:N, size = r2[i], replace = TRUE, prob = PI.mMSE)
- x.mMSE <- X[c(idx.mMSE),]
- y.mMSE <- Y[c(idx.mMSE)]
+ x.mMSE <- X[idx.mMSE,]
+ y.mMSE <- Y[idx.mMSE]
fit.mMSE <- .getMLE(x=x.mMSE, y=y.mMSE,w=c(1 / PI.mMSE[idx.mMSE]))
Sample.mMSE[[i+1]]<-idx.mMSE
@@ -279,14 +275,14 @@ AoptimalMCGLMSub <- function(r1,r2,Y,X,N,family){
}
if(family %in% "poisson"){
PI.prop <- rep(1/N, N)
- idx.prop <- sample(1:N, r1, T)
+ idx.prop <- sample(1:N, size = r1, replace = TRUE)
- x.prop<-X[idx.prop,]
+ x.prop <- X[idx.prop,]
y.prop <- Y[idx.prop,]
pinv.prop <- N
pinv.prop <- 1/PI.prop[idx.prop]
- fit.prop <- stats::glm(y.prop~x.prop-1,family = "poisson")
+ fit.prop <- stats::glm(y.prop~x.prop-1,family = "quasipoisson")
beta.prop <- fit.prop$coefficients
if(anyNA(beta.prop)){
@@ -305,7 +301,7 @@ AoptimalMCGLMSub <- function(r1,r2,Y,X,N,family){
## mMSE
w.prop <- P.prop[idx.prop]
- W.prop <- r1*solve(t(x.prop) %*% (x.prop * w.prop * pinv.prop))
+ W.prop <- r1*solve(crossprod(x.prop,x.prop * w.prop * pinv.prop))
PI.mMSE<-sqrt(P.prop)*sqrt(matrixStats::rowSums2((X%*%W.prop)^2))
PI.mMSE <- PI.mMSE/sum(PI.mMSE)
@@ -315,13 +311,13 @@ AoptimalMCGLMSub <- function(r1,r2,Y,X,N,family){
for (i in 1:length(r2))
{
## mMSE
- idx.mMSE <- sample(1:N, r2[i], T, PI.mMSE)
+ idx.mMSE <- sample(1:N, size = r2[i], replace = TRUE, prob = PI.mMSE)
- x.mMSE <- X[c(idx.mMSE),]
- y.mMSE <- Y[c(idx.mMSE)]
+ x.mMSE <- X[idx.mMSE,]
+ y.mMSE <- Y[idx.mMSE]
pinv.mMSE<-c(1 / PI.mMSE[idx.mMSE])
- fit.mMSE <- stats::glm(y.mMSE~x.mMSE-1, family = "poisson",weights=pinv.mMSE)
+ fit.mMSE <- stats::glm(y.mMSE~x.mMSE-1, family = "quasipoisson",weights=pinv.mMSE)
Sample.mMSE[[i+1]]<-idx.mMSE
beta.mMSE[i,-1] <-fit.mMSE$coefficients
diff --git a/R/Big_data.R b/R/Big_data.R
index 8eb4c30..8a584b5 100644
--- a/R/Big_data.R
+++ b/R/Big_data.R
@@ -86,14 +86,14 @@
#' Generate data for Generalised Linear Models
#'
#' Function to simulate big data under linear, logistic and Poisson regression for sampling.
-#' Covariate data X is through Normal or Uniform distribution for linear regression.
-#' Covariate data X is through Exponential or Normal or Uniform distribution for logistic regression.
+#' Covariate data X is generated through the Normal, Multivariate Normal or Uniform distribution for linear regression.
+#' Covariate data X is generated through the Exponential, Normal, Multivariate Normal or Uniform distribution for logistic regression.
#' Covariate data X is through Normal or Uniform distribution for Poisson regression.
#'
#' @usage
#' GenGLMdata(Dist,Dist_Par,No_Of_Var,Beta,N,family)
#'
-#' @param Dist a character value for the distribution "Normal" or "Uniform or "Exponential"
+#' @param Dist a character value for the distribution "Normal", "MVNormal", "Uniform" or "Exponential"
#' @param Dist_Par a list of parameters for the distribution that would generate data for covariate X
#' @param No_Of_Var number of variables
#' @param Beta a vector for the model parameters, including the intercept
@@ -105,8 +105,8 @@
#' regression types.
#'
#' We have limited the covariate data generation for
-#' linear regression through normal and uniform distribution,
-#' logistic regression through exponential, normal and uniform and
+#' linear regression through the normal, multivariate normal and uniform distributions,
+#' logistic regression through the exponential, normal, multivariate normal and uniform distributions, and
#' Poisson regression through normal and uniform distribution.
#'
#' @return
@@ -118,17 +118,30 @@
#' \insertRef{lee1996hierarchical}{NeEDS4BigData}
#'
#' @examples
-#' Dist<-"Normal"; Dist_Par<-list(Mean=0,Variance=1,Error_Variance=0.5)
-#' No_Of_Var<-2; Beta<-c(-1,2,1); N<-5000; Family<-"linear"
+#' No_Of_Var<-2; Beta<-c(-1,2,1); N<-5000;
+#'
+#' # Dist<-"Normal"; Dist_Par<-list(Mean=0,Variance=1,Error_Variance=0.5)
+#' Dist<-"MVNormal";
+#' Dist_Par<-list(Mean=rep(0,No_Of_Var),Variance=diag(rep(2,No_Of_Var)),Error_Variance=0.5)
+#' # Dist<-"Uniform"; Dist_Par<-list(Min=0,Max=1)
+#'
+#' Family<-"linear"
#' Results<-GenGLMdata(Dist,Dist_Par,No_Of_Var,Beta,N,Family)
#'
-#' Dist<-"Normal"; Dist_Par<-list(Mean=0,Variance=1); Family<-"logistic"
+#' Dist<-"Normal"; Dist_Par<-list(Mean=0,Variance=1);
+#' # Dist<-"MVNormal"; Dist_Par<-list(Mean=rep(0,No_Of_Var),Variance=diag(rep(2,No_Of_Var)))
+#' # Dist<-"Exponential"; Dist_Par<-list(Rate=3)
+#' # Dist<-"Uniform"; Dist_Par<-list(Min=0,Max=1)
+#'
+#' Family<-"logistic"
#' Results<-GenGLMdata(Dist,Dist_Par,No_Of_Var,Beta,N,Family)
#'
-#' Dist<-"Normal"; Family<-"poisson"
+#' # Dist<-"Normal";
+#' Dist<-"Uniform"; Family<-"poisson"
#' Results<-GenGLMdata(Dist,NULL,No_Of_Var,Beta,N,Family)
#'
#' @import stats
+#' @importFrom mvnfast rmvn
#' @export
GenGLMdata<-function(Dist,Dist_Par,No_Of_Var,Beta,N,family){
if(any(is.na(c(Dist,Beta,No_Of_Var,N,family))) | any(is.nan(c(Dist,No_Of_Var,Beta,N,family)))){
@@ -148,14 +161,14 @@ GenGLMdata<-function(Dist,Dist_Par,No_Of_Var,Beta,N,family){
}
if(family == "linear"){
- if(!(Dist == "Normal" | Dist == "Uniform")){
- stop("For linear regression select the distribution 'Normal' \n or 'Uniform' to generate the covarate data")
+ if(!(Dist == "Normal" | Dist == "MVNormal" | Dist == "Uniform")){
+ stop("For linear regression select the distribution 'Normal', 'MVNormal' \n or 'Uniform' to generate the covarate data")
}
}
if(family == "logistic"){
- if(!(Dist == "Exponential" | Dist == "Normal" | Dist == "Uniform")){
- stop("For logistic regression select the distribution 'Exponential', \n 'Normal' or 'Uniform' to generate the covarate data")
+ if(!(Dist == "Exponential" | Dist == "Normal" | Dist == "MVNormal" | Dist == "Uniform")){
+ stop("For logistic regression select the distribution 'Exponential', \n 'Normal', 'MVNormal' or 'Uniform' to generate the covarate data")
}
}
@@ -169,6 +182,9 @@ GenGLMdata<-function(Dist,Dist_Par,No_Of_Var,Beta,N,family){
if(Dist %in% "Normal"){
X<-replicate(No_Of_Var,stats::rnorm(n = N, mean = Dist_Par$Mean, sd = sqrt(Dist_Par$Variance)))
}
+ if(Dist %in% "MVNormal"){
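+ # rmvn draws N rows from a multivariate normal; Dist_Par$Variance is the covariance matrix sigma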
+ X<-mvnfast::rmvn(n = N, mu = Dist_Par$Mean, sigma = Dist_Par$Variance)
+ }
if(Dist %in% "Uniform"){
X<-replicate(No_Of_Var,stats::runif(n = N, min = Dist_Par$Min, max = Dist_Par$Max))
}
@@ -193,6 +209,9 @@ GenGLMdata<-function(Dist,Dist_Par,No_Of_Var,Beta,N,family){
if(Dist %in% "Normal"){
X<-replicate(No_Of_Var,stats::rnorm(n = N, mean = Dist_Par$Mean, sd = sqrt(Dist_Par$Variance)))
}
+ if(Dist %in% "MVNormal"){
+ X<-mvnfast::rmvn(n = N, mu = Dist_Par$Mean, sigma = Dist_Par$Variance)
+ }
if(Dist %in% "Uniform"){
X<-replicate(No_Of_Var,stats::runif(n = N, min = Dist_Par$Min, max = Dist_Par$Max))
}
diff --git a/R/LCCsampling.R b/R/LCCsampling.R
index 4586b9a..5a7ca19 100644
--- a/R/LCCsampling.R
+++ b/R/LCCsampling.R
@@ -52,7 +52,7 @@
#'
#' r1<-300; r2<-rep(100*c(6,9,12),50); Original_Data<-Full_Data$Complete_Data;
#'
-#' LCCsampling(r1 = r1, r2 = r2, Y = as.matrix(Original_Data[,colnames(Original_Data) %in% c("Y")]),
+#' LCCsampling(r1 = r1, r2 = r2, Y = as.matrix(Original_Data[,1]),
#' X = as.matrix(Original_Data[,-1]),
#' N = nrow(Original_Data))->Results
#'
@@ -85,7 +85,7 @@ LCCsampling<-function(r1,r2,Y,X,N){
n0 <- N - n1
PI.prop <- rep(1/(2*n0), N)
PI.prop[Y==1] <- 1/(2*n1)
- idx.prop <- sample(1:N, r1, T, PI.prop)
+ idx.prop <- sample(1:N, size = r1, replace = TRUE, prob = PI.prop)
x.prop <- X[idx.prop,]
y.prop <- Y[idx.prop,]
@@ -115,10 +115,10 @@ LCCsampling<-function(r1,r2,Y,X,N){
for(i in 1:length(r2)){
## local case control sampling
- idx.LCC <- sample(1:N, r2[i], T, PI.LCC)
+ idx.LCC <- sample(1:N, size = r2[i], replace = TRUE, prob = PI.LCC)
- x.LCC <- X[c(idx.LCC),]
- y.LCC <- Y[c(idx.LCC)]
+ x.LCC <- X[idx.LCC,]
+ y.LCC <- Y[idx.LCC]
pinv.LCC <- 1
fit.LCC <- .getMLE(x=x.LCC, y=y.LCC, w=pinv.LCC)
beta.prop <- fit.LCC$par+beta.prop_start
diff --git a/R/LeverageSampling.R b/R/LeverageSampling.R
index d74512e..a2ca6b0 100644
--- a/R/LeverageSampling.R
+++ b/R/LeverageSampling.R
@@ -61,7 +61,7 @@
#'
#' r<-rep(100*c(6,10),50); Original_Data<-Full_Data$Complete_Data;
#'
-#' LeverageSampling(r = r, Y = as.matrix(Original_Data[,colnames(Original_Data) %in% c("Y")]),
+#' LeverageSampling(r = r, Y = as.matrix(Original_Data[,1]),
#' X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data),
#' S_alpha = 0.95,
#' family = "linear")->Results
@@ -74,7 +74,7 @@
#'
#' r<-rep(100*c(6,10),50); Original_Data<-Full_Data$Complete_Data;
#'
-#' LeverageSampling(r = r, Y = as.matrix(Original_Data[,colnames(Original_Data) %in% c("Y")]),
+#' LeverageSampling(r = r, Y = as.matrix(Original_Data[,1]),
#' X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data),
#' S_alpha = 0.95,
#' family = "logistic")->Results
@@ -87,7 +87,7 @@
#'
#' r<-rep(100*c(6,10),50); Original_Data<-Full_Data$Complete_Data;
#'
-#' LeverageSampling(r = r, Y = as.matrix(Original_Data[,colnames(Original_Data) %in% c("Y")]),
+#' LeverageSampling(r = r, Y = as.matrix(Original_Data[,1]),
#' X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data),
#' S_alpha = 0.95,
#' family = "poisson")->Results
@@ -142,17 +142,17 @@ LeverageSampling<-function(r,Y,X,N,S_alpha,family){
for (i in 1:length(r)) {
# basic leverage sampling
- idx.blev <- sample(1:N, size = r[i], replace = TRUE, PI.blev)
+ idx.blev <- sample(1:N, size = r[i], replace = TRUE, prob = PI.blev)
- x.blev <- as.matrix(X[idx.blev,])
- y.blev <- as.matrix(Y[idx.blev])
+ x.blev <- X[idx.blev,]
+ y.blev <- Y[idx.blev]
wgt <- 1 / PI.blev[idx.blev]
- Temp_Data <- data.frame(y = y.blev,x.blev, wgt = wgt)
- lm.blev <- stats::lm(y ~ . - 1, weights = Temp_Data$wgt, data = Temp_Data[,-ncol(Temp_Data)])
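+ # lm() looks for weights in data first and then in the formula environment; Temp_Data no longer carries a wgt column, so wgt is taken from this frame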
+ Temp_Data <- data.frame(y = y.blev,x.blev)
+ lm.blev <- stats::lm(y ~ . - 1, weights = wgt, data = Temp_Data)
beta.prop<-stats::coefficients(lm.blev)
- Xbeta_Final<-as.vector(X%*%beta.prop)
+ Xbeta_Final<-X%*%beta.prop
Var.prop<-sum((Y-Xbeta_Final)^2)/N
Sample.blev[[i]]<-idx.blev
@@ -164,10 +164,10 @@ LeverageSampling<-function(r,Y,X,N,S_alpha,family){
}
# unweighted leverage sampling
- lm.uwlev <- stats::lm(y ~ . - 1, data = Temp_Data[,-ncol(Temp_Data)])
+ lm.uwlev <- stats::lm(y ~ . - 1, data = Temp_Data)
beta.prop<-stats::coefficients(lm.uwlev)
- Xbeta_Final<-as.vector(X%*%beta.prop)
+ Xbeta_Final<-X%*%beta.prop
Var.prop<-sum((Y-Xbeta_Final)^2)/N
Sample.uwlev[[i]]<-idx.blev
@@ -179,17 +179,17 @@ LeverageSampling<-function(r,Y,X,N,S_alpha,family){
}
# shrinkage leverage sampling
- idx.slev <- sample(N, size = r[i], replace = TRUE, PI.slev)
+ idx.slev <- sample(N, size = r[i], replace = TRUE, prob = PI.slev)
- x.slev <- as.matrix(X[idx.slev,])
- y.slev <- as.matrix(Y[idx.slev])
+ x.slev <- X[idx.slev,]
+ y.slev <- Y[idx.slev]
wgt <- 1 / PI.slev[idx.slev]
- Temp_Data <- data.frame(y = y.slev,x.slev, wgt = wgt)
- lm.slev <- stats::lm(y ~ . - 1, weights = wgt, data = Temp_Data[,-ncol(Temp_Data)])
+ Temp_Data <- data.frame(y = y.slev,x.slev)
+ lm.slev <- stats::lm(y ~ . - 1, weights = wgt, data = Temp_Data)
beta.prop<-stats::coefficients(lm.slev)
- Xbeta_Final<-as.vector(X%*%beta.prop)
+ Xbeta_Final<-X%*%beta.prop
Var.prop<-sum((Y-Xbeta_Final)^2)/N
Sample.slev[[i]]<-idx.slev
@@ -233,7 +233,7 @@ LeverageSampling<-function(r,Y,X,N,S_alpha,family){
n0 <- N - n1
PI.prop <- rep(1/(2*n0), N)
PI.prop[Y==1] <- 1/(2*n1)
- idx.prop <- sample(1:N, r1, T, PI.prop)
+ idx.prop <- sample(1:N, size = r1, replace = TRUE, prob = PI.prop)
x.prop <- X[idx.prop,]
y.prop <- Y[idx.prop,]
@@ -247,21 +247,9 @@ LeverageSampling<-function(r,Y,X,N,S_alpha,family){
p.prop<-1 - 1 / (1 + exp(X%*% beta.prop))
w.prop<-as.vector(sqrt(p.prop*(1-p.prop)))
X_bar<- w.prop*X
- XX_Inv<-solve(t(X_bar)%*%X_bar)
+ XX_Inv<-solve(crossprod(X_bar))
- # X_Temp<-X
- # PP <-apply(X_Temp,1,function(X_Temp){
- # P.prop <- 1 - 1 / (1 + exp(X_Temp%*% beta.prop))
- # W.prop <- sqrt(P.prop*(1-P.prop))
- # x_bar <- W.prop%*%X_Temp
- # x_bar%*%XX_Inv%*%t(x_bar)
- # })
-
- # Precompute terms outside the loop
- P.prop <- 1 - 1 / (1 + exp(X %*% beta.prop))
- W.prop <- sqrt(P.prop * (1 - P.prop))
- X_bar_s <- as.vector(W.prop) * X
- PP <- rowSums((X_bar_s %*% XX_Inv) * X_bar_s)
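+ # rowSums((A %*% M) * A) equals diag(A %*% M %*% t(A)), so the N x N hat matrix is never formed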
+ PP <- rowSums((X_bar %*% XX_Inv) * X_bar)
PI.blev <- PP / sum(PP)
PI.slev <- S_alpha * PI.blev + (1-S_alpha) * 1 / N
@@ -278,8 +266,8 @@ LeverageSampling<-function(r,Y,X,N,S_alpha,family){
# basic leverage sampling
idx.blev <- sample(N, size = r[i], replace = TRUE, PI.blev)
- x.blev <- as.matrix(X[idx.blev,])
- y.blev <- as.matrix(Y[idx.blev])
+ x.blev <- X[idx.blev,]
+ y.blev <- Y[idx.blev]
wgt <- 1 / PI.blev[idx.blev]
fit.blev <- .getMLE(x=x.blev, y=as.vector(y.blev), w=wgt)
@@ -306,8 +294,8 @@ LeverageSampling<-function(r,Y,X,N,S_alpha,family){
# shrinkage leverage sampling
idx.slev <- sample(1:N, size = r[i], replace = TRUE, PI.slev)
- x.slev <- as.matrix(X[idx.slev,])
- y.slev <- as.matrix(Y[idx.slev])
+ x.slev <- X[idx.slev,]
+ y.slev <- Y[idx.slev]
wgt <- 1 / PI.slev[idx.slev]
fit.slev <- .getMLE(x=x.slev, y=as.vector(y.slev), w=wgt)
@@ -343,35 +331,26 @@ LeverageSampling<-function(r,Y,X,N,S_alpha,family){
if(family %in% c("poisson")){
r1<-round(min(r/2))
PI.prop <- rep(1/N, N)
- idx.prop <- sample(1:N, r1, T)
+ idx.prop <- sample(1:N, size = r1, replace = TRUE)
- x.prop<-X[idx.prop,]
+ x.prop <- X[idx.prop,]
y.prop <- Y[idx.prop,]
pinv.prop <- N
pinv.prop <- 1/PI.prop[idx.prop]
- fit.prop <- stats::glm(y.prop~x.prop-1,family = "poisson")
+ fit.prop <- stats::glm(y.prop~x.prop-1,family = "quasipoisson")
beta.prop <- fit.prop$coefficients
if(anyNA(beta.prop)){
stop("There are NA or NaN values in the model parameters")
}
- p.prop<-as.vector(sqrt(exp(X%*% beta.prop)))
- X_bar<- p.prop*X
- XX_Inv<-solve(t(X_bar)%*%X_bar)
-
- # X_Temp<-X;
- # PP <-apply(X_Temp,1,function(X_Temp){
- # P.prop <- sqrt(exp(X_Temp%*%beta.prop))
- # x_bar <- P.prop%*%X_Temp
- # x_bar%*%XX_Inv%*%t(x_bar)
- # })
+ p.prop<-exp(X%*% beta.prop)
+ w.prop<-as.vector(sqrt(p.prop))
+ X_bar<- w.prop*X
+ XX_Inv<-solve(crossprod(X_bar))
# Precompute terms outside the loop
- P.prop <- 1 - 1 / (1 + exp(X %*% beta.prop))
- W.prop <- sqrt(P.prop * (1 - P.prop))
- X_bar_s <- as.vector(W.prop) * X
- PP <- rowSums((X_bar_s %*% XX_Inv) * X_bar_s)
+ PP <- rowSums((X_bar %*% XX_Inv) * X_bar)
PI.blev <- PP / sum(PP)
PI.slev <- S_alpha * PI.blev + (1-S_alpha) * 1 / N
@@ -386,13 +365,13 @@ LeverageSampling<-function(r,Y,X,N,S_alpha,family){
for (i in 1:length(r)) {
# basic leverage sampling
- idx.blev <- sample(1:N, size = r[i], replace = TRUE, PI.blev)
+ idx.blev <- sample(1:N, size = r[i], replace = TRUE, prob = PI.blev)
- x.blev <- as.matrix(X[idx.blev,])
- y.blev <- as.matrix(Y[idx.blev])
+ x.blev <- X[idx.blev,]
+ y.blev <- Y[idx.blev]
wgt <- 1 / PI.blev[idx.blev]
- fit.blev <-stats::glm(y.blev~x.blev-1, family = "poisson",weights=wgt)
+ fit.blev <-stats::glm(y.blev~x.blev-1, family = "quasipoisson",weights=wgt)
beta.prop <- fit.blev$coefficients
Sample.blev[[i]]<-idx.blev
@@ -403,7 +382,7 @@ LeverageSampling<-function(r,Y,X,N,S_alpha,family){
}
# unweighted leverage sampling
- fit.uwlev <- stats::glm(y.blev~x.blev-1, family = "poisson")
+ fit.uwlev <- stats::glm(y.blev~x.blev-1, family = "quasipoisson")
beta.prop<-fit.uwlev$coefficients
Sample.uwlev[[i]]<-idx.blev
@@ -414,13 +393,13 @@ LeverageSampling<-function(r,Y,X,N,S_alpha,family){
}
# shrinkage leverage sampling
- idx.slev <- sample(1:N, size = r[i], replace = TRUE, PI.slev)
+ idx.slev <- sample(1:N, size = r[i], replace = TRUE, prob = PI.slev)
- x.slev <- as.matrix(X[idx.slev,])
- y.slev <- as.matrix(Y[idx.slev])
+ x.slev <- X[idx.slev,]
+ y.slev <- Y[idx.slev]
wgt <- 1 / PI.slev[idx.slev]
- fit.slev <- stats::glm(y.slev~x.slev-1, family = "poisson",weights = wgt)
+ fit.slev <- stats::glm(y.slev~x.slev-1, family = "quasipoisson",weights = wgt)
beta.prop <- fit.slev$coefficients
Sample.slev[[i]]<-idx.slev
diff --git a/R/modelMissLinSub.R b/R/modelMissLinSub.R
index 2b3498d..c5e7f29 100644
--- a/R/modelMissLinSub.R
+++ b/R/modelMissLinSub.R
@@ -97,7 +97,9 @@
#' @import stats
#' @import foreach
#' @importFrom gam s
+#' @importFrom gam lo
#' @importFrom Rfast rowprods
+#' @importFrom utils combn
#' @importFrom psych tr
#' @export
modelMissLinSub <- function(r1,r2,Y,X,N,Alpha,proportion){
@@ -132,31 +134,37 @@ modelMissLinSub <- function(r1,r2,Y,X,N,Alpha,proportion){
message("50% or >=50% of the big data is used to help find AMSE for the subsamples, \nthis could take some time.")
}
- idx.prop <- sample(1:N, r1, T)
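+ # build the assumed GAM formula; with two covariates (columns assumed to be named X1 and X2)
+ # this gives Y ~ s(X1) + s(X2) + lo(X1*X2), otherwise only the additive s() terms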
+ main_effects <- paste0("s(X", 1:ncol(X[, -1]), ")")
+
+ if(ncol(X[,-1]) == 2 ){
+ two_way_interactions <- utils::combn(colnames(X[, -1]), 2,
+ function(cols){paste0("lo(", paste(cols, collapse = "*"), ")")})
+
+ my_formula<-stats::as.formula(paste("Y ~ ",paste(main_effects,collapse = " + "),"+",
+ paste(two_way_interactions,collapse = " + ")))
+ } else{
+ my_formula<-stats::as.formula(paste("Y ~ ",paste(main_effects,collapse = " + ")))
+ }
+
+ idx.prop <- sample(1:N, size = r1, replace = TRUE)
x.prop <- X[idx.prop,]
y.prop <- Y[idx.prop]
- beta.prop<-solve(a=t(x.prop)%*%x.prop,b=t(x.prop)%*%y.prop)
- Xbeta_Final<-as.vector(X%*%beta.prop)
+ beta.prop<-solve(a=crossprod(x.prop),b=crossprod(x.prop,y.prop))
+ Xbeta_Final<-X%*%beta.prop
Var.prop<-sum((Y-Xbeta_Final)^2)/N
-
- Xbeta.prop<-X%*%beta.prop
- Epsilon.prop<-Y-Xbeta.prop
-
- my_formula<-stats::as.formula(paste("Y ~ ",paste(paste0("s(X",1:ncol(x.prop[,-1]),")"),collapse = " + "),"+",
- paste(paste0("s(",paste0(colnames(x.prop[,-1]),collapse = "*"),")"),
- collapse = " + ")))
+ Epsilon.prop<-Y-Xbeta_Final
#calculate f_hat
Assumed_Data<-data.frame(Y=y.prop,x.prop)
fit_GAM<-gam::gam(my_formula,data=Assumed_Data)
Xbeta_GAM<-gam::predict.Gam(fit_GAM,newdata = data.frame(X))
- f_estimate<-Xbeta_GAM-Xbeta.prop
+ f_estimate<-Xbeta_GAM-Xbeta_Final
Var_GAM.prop<-sum((Y-Xbeta_GAM)^2)/N
if(proportion*N != r1){
- idx.proportion <- sample(1:N, ceiling(proportion*N), T)
+ idx.proportion <- sample(1:N, size = ceiling(proportion*N), replace = TRUE)
Y_proportion<-Y[idx.proportion]
X_proportion<-X[idx.proportion,]
@@ -164,14 +172,13 @@ modelMissLinSub <- function(r1,r2,Y,X,N,Alpha,proportion){
fit_GAM_Proportion<-gam::gam(my_formula,data=Proportion_Data)
Xbeta_GAM_Proportion<-gam::predict.Gam(fit_GAM_Proportion,newdata = data.frame(X))
- beta_proportion<-solve(a=t(X_proportion)%*%X_proportion,b=t(X_proportion)%*%Y_proportion)
+ beta_proportion<-solve(a=crossprod(X_proportion),b=crossprod(X_proportion,Y_proportion))
Xbeta_proportion<-X%*%beta_proportion
Var_GAM_Full<-sum((Y-Xbeta_GAM_Proportion)^2)/N
Var_Full<-sum((Y-Xbeta_proportion)^2)/N
F_Estimate_Full<-Xbeta_GAM_Proportion-Xbeta_proportion
- }
- else{
+ } else {
Var_GAM_Full<-Var_GAM.prop ; Var_Full<-Var.prop ; F_Estimate_Full<-f_estimate
}
@@ -180,7 +187,7 @@ modelMissLinSub <- function(r1,r2,Y,X,N,Alpha,proportion){
PI.mVc <- PI.mVc / sum(PI.mVc)
# mMSE
- PI.mMSE <- sqrt(Epsilon.prop^2 * rowSums((X %*% solve(t(X)%*%X))^2))
+ PI.mMSE <- sqrt(Epsilon.prop^2 * rowSums((X %*% solve(crossprod(X)))^2))
PI.mMSE <- PI.mMSE / sum(PI.mMSE)
Tempsy_Var_Gam_Var<-Var_GAM.prop*Var.prop^(-2)
@@ -192,7 +199,8 @@ modelMissLinSub <- function(r1,r2,Y,X,N,Alpha,proportion){
f_r1 <- f_estimate[c(idx.prop, a)]
Temp_Solve<-solve(crossprod(X_r1))
- Temp1 <- tcrossprod(X_r1 %*% Temp_Solve, X_r1)
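+ # Temp1 is the subsample hat matrix X_r1 (X_r1'X_r1)^{-1} X_r1', built in two readable steps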
+ Temp_Xr1_Solve<-X_r1 %*% Temp_Solve
+ Temp1 <- tcrossprod(Temp_Xr1_Solve,X_r1)
L1_r1 <- Tempsy_Var_Gam_Var*psych::tr(Temp1)
Temp_f_r1 <- Temp1 %*% f_r1
@@ -202,9 +210,9 @@ modelMissLinSub <- function(r1,r2,Y,X,N,Alpha,proportion){
L1_r1 + L2_r1
}
- Temp1<-X[idx.prop,]%*%solve(t(X[idx.prop,])%*%X[idx.prop,])%*%t(X[idx.prop,])
+ Temp1<-X[idx.prop,]%*%solve(crossprod(X[idx.prop,]))%*%t(X[idx.prop,])
L1_r1 <- Tempsy_Var_Gam_Var*psych::tr(Temp1)
- L2_r1 <- sum((Var.prop^(-1)*(Temp1%*%f_estimate[idx.prop]-f_estimate[idx.prop]))^2)
+ Var_prop_inv<-Var.prop^(-1)
+ L2_r1 <- sum((Var_prop_inv*(Temp1%*%f_estimate[idx.prop]-f_estimate[idx.prop]))^2)
L_All_Temp<-L1_r1+L2_r1
L_All_Final<-abs(L_All_Temp-L_All)
@@ -258,10 +266,12 @@ modelMissLinSub <- function(r1,r2,Y,X,N,Alpha,proportion){
message("Step 1 of the algorithm completed.\n")
+ VarFull_Inv<-Var_Full^(-1)
+
for (i in 1:length(r2))
{
# mVc
- idx.mVc <- sample(1:N, r2[i]-r1, T, PI.mVc)
+ idx.mVc <- sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI.mVc)
x.mVc <- X[c(idx.mVc, idx.prop),]
y.mVc <- Y[c(idx.mVc, idx.prop)]
@@ -270,21 +280,21 @@ modelMissLinSub <- function(r1,r2,Y,X,N,Alpha,proportion){
pi4_r<-sqrt(r2[i]*w.mVc^(-1))
X_r4<-x.mVc/pi4_r
Y_r4<-y.mVc/pi4_r
- beta_mVc[i,]<-c(r2[i],solve(a=t(X_r4)%*%X_r4,b=t(X_r4)%*%Y_r4))
- Xbeta_Final<-as.vector(X%*%beta_mVc[i,-1])
+ beta_mVc[i,]<-c(r2[i],solve(a=crossprod(X_r4),b=crossprod(X_r4,Y_r4)))
+ Xbeta_Final<-X%*%beta_mVc[i,-1]
Var_Epsilon[i,2]<-sum((Y-Xbeta_Final)^2)/N
- Temp1<-x.mVc%*%solve(t(x.mVc)%*%x.mVc)%*%t(x.mVc)
+ Temp1<-x.mVc%*%solve(crossprod(x.mVc))%*%t(x.mVc)
L1_r1 <- Tempsy_Var_Gam_Var_AMSE*psych::tr(Temp1)
- L2_r1 <- sum((Var_Full^(-1)*(Temp1%*%F_Estimate_Full[c(idx.mVc,idx.prop)] -
- F_Estimate_Full[c(idx.mVc, idx.prop)]))^2)
+ L2_r1 <- sum((VarFull_Inv*(Temp1%*%F_Estimate_Full[c(idx.mVc,idx.prop)] -
+ F_Estimate_Full[c(idx.mVc, idx.prop)]))^2)
AMSE_Sample_mVc[i,]<-c(r2[i],L1_r1,L2_r1,L1_r1+L2_r1)
idx.mVc->Sample.mVc[[i+1]]
# mMSE
- idx.mMSE <- sample(1:N, r2[i]-r1, T, PI.mMSE)
+ idx.mMSE <- sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI.mMSE)
x.mMSE <- X[c(idx.mMSE, idx.prop),]
y.mMSE <- Y[c(idx.mMSE, idx.prop)]
@@ -293,21 +303,21 @@ modelMissLinSub <- function(r1,r2,Y,X,N,Alpha,proportion){
pi4_r<-sqrt(r2[i]*w.mMSE^(-1))
X_r4<-x.mMSE/pi4_r
Y_r4<-y.mMSE/pi4_r
- beta_mMSE[i,]<-c(r2[i],solve(a=t(X_r4)%*%X_r4,b=t(X_r4)%*%Y_r4))
- Xbeta_Final<-as.vector(X%*%beta_mMSE[i,-1])
+ beta_mMSE[i,]<-c(r2[i],solve(a=crossprod(X_r4),b=crossprod(X_r4,Y_r4)))
+ Xbeta_Final<-X%*%beta_mMSE[i,-1]
Var_Epsilon[i,3]<-sum((Y-Xbeta_Final)^2)/N
- Temp1<-x.mMSE%*%solve(t(x.mMSE)%*%x.mMSE)%*%t(x.mMSE)
+ Temp1<-x.mMSE%*%solve(crossprod(x.mMSE))%*%t(x.mMSE)
L1_r1 <- Tempsy_Var_Gam_Var_AMSE*psych::tr(Temp1)
- L2_r1 <- sum((Var_Full^(-1)*(Temp1%*%F_Estimate_Full[c(idx.mMSE,idx.prop)]-
- F_Estimate_Full[c(idx.mMSE, idx.prop)]))^2)
+ L2_r1 <- sum((VarFull_Inv*(Temp1%*%F_Estimate_Full[c(idx.mMSE,idx.prop)]-
+ F_Estimate_Full[c(idx.mMSE, idx.prop)]))^2)
AMSE_Sample_mMSE[i,]<-c(r2[i],L1_r1,L2_r1,L1_r1+L2_r1)
idx.mMSE->Sample.mMSE[[i+1]]
# RLmAMSE
- idx.RLmAMSE <- sample(1:N, r2[i], T, PI.RLmAMSE)
+ idx.RLmAMSE <- sample(1:N, size = r2[i], replace = TRUE, prob = PI.RLmAMSE)
x.RLmAMSE <- X[c(idx.RLmAMSE),]
y.RLmAMSE <- Y[c(idx.RLmAMSE)]
@@ -316,14 +326,14 @@ modelMissLinSub <- function(r1,r2,Y,X,N,Alpha,proportion){
pi4_r<-sqrt(r2[i]*w.RLmAMSE^(-1))
X_r4<-x.RLmAMSE/pi4_r
Y_r4<-y.RLmAMSE/pi4_r
- beta_RLmAMSE[i,]<-c(r2[i],solve(a=t(X_r4)%*%X_r4,b=t(X_r4)%*%Y_r4))
- Xbeta_Final<-as.vector(X%*%beta_RLmAMSE[i,-1])
+ beta_RLmAMSE[i,]<-c(r2[i],solve(a=crossprod(X_r4),b=crossprod(X_r4,Y_r4)))
+ Xbeta_Final<-X%*%beta_RLmAMSE[i,-1]
Var_Epsilon[i,4]<-sum((Y-Xbeta_Final)^2)/N
- Temp1<-x.RLmAMSE%*%solve(t(x.RLmAMSE)%*%x.RLmAMSE)%*%t(x.RLmAMSE)
+ Temp1<-x.RLmAMSE%*%solve(crossprod(x.RLmAMSE))%*%t(x.RLmAMSE)
L1_r1 <- Tempsy_Var_Gam_Var_AMSE*psych::tr(Temp1)
- L2_r1 <- sum((Var_Full^(-1)*(Temp1%*%F_Estimate_Full[c(idx.RLmAMSE)] -
- F_Estimate_Full[c(idx.RLmAMSE)]))^2)
+ L2_r1 <- sum((VarFull_Inv*(Temp1%*%F_Estimate_Full[c(idx.RLmAMSE)] -
+ F_Estimate_Full[c(idx.RLmAMSE)]))^2)
AMSE_Sample_RLmAMSE[i,]<-c(r2[i],L1_r1,L2_r1,L1_r1+L2_r1)
@@ -332,7 +342,7 @@ modelMissLinSub <- function(r1,r2,Y,X,N,Alpha,proportion){
for(j in 1:length(Alpha))
{
# RLmAMSE Log Odds
- idx.RLmAMSE <- sample(1:N, r2[i], T, PI.RLmAMSE_LO[,j])
+ idx.RLmAMSE <- sample(1:N, size = r2[i], replace = TRUE, prob = PI.RLmAMSE_LO[,j])
x.RLmAMSE <- X[c(idx.RLmAMSE),]
y.RLmAMSE <- Y[c(idx.RLmAMSE)]
@@ -341,21 +351,21 @@ modelMissLinSub <- function(r1,r2,Y,X,N,Alpha,proportion){
pi4_r<-sqrt(r2[i]*w.RLmAMSE^(-1))
X_r4<-x.RLmAMSE/pi4_r
Y_r4<-y.RLmAMSE/pi4_r
- beta_RLmAMSE_LO[[j]][i,]<-c(r2[i],solve(a=t(X_r4)%*%X_r4,b=t(X_r4)%*%Y_r4))
- Xbeta_Final<-as.vector(X%*%beta_RLmAMSE_LO[[j]][i,-1])
+ beta_RLmAMSE_LO[[j]][i,]<-c(r2[i],solve(a=crossprod(X_r4),b=crossprod(X_r4,Y_r4)))
+ Xbeta_Final<-X%*%beta_RLmAMSE_LO[[j]][i,-1]
Var_RLmAMSE_LO[i,j]<-sum((Y-Xbeta_Final)^2)/N
- Temp1<-x.RLmAMSE%*%solve(t(x.RLmAMSE)%*%x.RLmAMSE)%*%t(x.RLmAMSE)
+ Temp1<-x.RLmAMSE%*%solve(crossprod(x.RLmAMSE))%*%t(x.RLmAMSE)
L1_r1 <- Tempsy_Var_Gam_Var_AMSE*psych::tr(Temp1)
- L2_r1 <- sum((Var_Full^(-1)*(Temp1%*%F_Estimate_Full[c(idx.RLmAMSE)]-
- F_Estimate_Full[c(idx.RLmAMSE)]))^2)
+ L2_r1 <- sum((VarFull_Inv*(Temp1%*%F_Estimate_Full[c(idx.RLmAMSE)]-
+ F_Estimate_Full[c(idx.RLmAMSE)]))^2)
AMSE_Sample_RLmAMSE_LO[[j]][i,]<-c(r2[i],L1_r1,L2_r1,L1_r1+L2_r1)
idx.RLmAMSE->Sample.RLmAMSE_LO[[j]][[i+1]] # Model robust RLmAMSE
# RLmAMSE Power
- idx.RLmAMSE <- sample(1:N, r2[i], T, PI.RLmAMSE_Pow[,j])
+ idx.RLmAMSE <- sample(1:N, size = r2[i], replace = TRUE, prob = PI.RLmAMSE_Pow[,j])
x.RLmAMSE <- X[c(idx.RLmAMSE),]
y.RLmAMSE <- Y[c(idx.RLmAMSE)]
@@ -364,14 +374,14 @@ modelMissLinSub <- function(r1,r2,Y,X,N,Alpha,proportion){
pi4_r<-sqrt(r2[i]*w.RLmAMSE^(-1))
X_r4<-x.RLmAMSE/pi4_r
Y_r4<-y.RLmAMSE/pi4_r
- beta_RLmAMSE_Pow[[j]][i,]<-c(r2[i],solve(a=t(X_r4)%*%X_r4,b=t(X_r4)%*%Y_r4))
- Xbeta_Final<-as.vector(X%*%beta_RLmAMSE_Pow[[j]][i,-1])
+ beta_RLmAMSE_Pow[[j]][i,]<-c(r2[i],solve(a=crossprod(X_r4),b=crossprod(X_r4,Y_r4)))
+ Xbeta_Final<-X%*%beta_RLmAMSE_Pow[[j]][i,-1]
Var_RLmAMSE_Pow[i,j]<-sum((Y-Xbeta_Final)^2)/N
- Temp1<-x.RLmAMSE%*%solve(t(x.RLmAMSE)%*%x.RLmAMSE)%*%t(x.RLmAMSE)
+ Temp1<-x.RLmAMSE%*%solve(crossprod(x.RLmAMSE))%*%t(x.RLmAMSE)
L1_r1 <- Tempsy_Var_Gam_Var_AMSE*psych::tr(Temp1)
- L2_r1 <- sum((Var_Full^(-1)*(Temp1%*%F_Estimate_Full[c(idx.RLmAMSE)] -
- F_Estimate_Full[c(idx.RLmAMSE)]))^2)
+ L2_r1 <- sum((VarFull_Inv*(Temp1%*%F_Estimate_Full[c(idx.RLmAMSE)] -
+ F_Estimate_Full[c(idx.RLmAMSE)]))^2)
AMSE_Sample_RLmAMSE_Pow[[j]][i,]<-c(r2[i],L1_r1,L2_r1,L1_r1+L2_r1)
@@ -403,15 +413,15 @@ modelMissLinSub <- function(r1,r2,Y,X,N,Alpha,proportion){
Var_Data<-cbind.data.frame(Var_Epsilon,Var_RLmAMSE_LO,Var_RLmAMSE_Pow)
colnames(Var_Data)<-c("r2","A-Optimality","L-Optimality","RLmAMSE",
- paste0("RLmAMSE Log Odds ",Alpha),
- paste0("RLmAMSE Power ",Alpha))
+ paste0("RLmAMSE Log Odds ",Alpha),
+ paste0("RLmAMSE Power ",Alpha))
# AMSE Sample Data
AMSE_Sample_Data<-cbind.data.frame("Method"=rep(Sampling_Methods,each=length(r2)),
- rbind(AMSE_Sample_mMSE,AMSE_Sample_mVc,
- AMSE_Sample_RLmAMSE,
- do.call(rbind,AMSE_Sample_RLmAMSE_LO),
- do.call(rbind,AMSE_Sample_RLmAMSE_Pow)))
+ rbind(AMSE_Sample_mMSE,AMSE_Sample_mVc,
+ AMSE_Sample_RLmAMSE,
+ do.call(rbind,AMSE_Sample_RLmAMSE_LO),
+ do.call(rbind,AMSE_Sample_RLmAMSE_Pow)))
colnames(AMSE_Sample_Data)[-1]<-c("r2","Variance","Bias.2","AMSE")
AMSE_Sample_Data[,-c(1,2)]<-AMSE_Sample_Data[,-c(1,2)]/r2
diff --git a/R/modelMissLogSub.R b/R/modelMissLogSub.R
index 7e7187c..70416c7 100644
--- a/R/modelMissLogSub.R
+++ b/R/modelMissLogSub.R
@@ -96,6 +96,7 @@
#' @import foreach
#' @importFrom gam s
#' @importFrom Rfast rowprods
+#' @importFrom utils combn
#' @importFrom psych tr
#' @export
modelMissLogSub <- function(r1,r2,Y,X,N,Alpha,proportion){
@@ -130,11 +131,23 @@ modelMissLogSub <- function(r1,r2,Y,X,N,Alpha,proportion){
message("50% or >=50% of the big data is used to help find AMSE for the subsamples, \nthis could take some time.")
}
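+ # same smooth-plus-lo() formula construction as in modelMissLinSub.R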
+ main_effects <- paste0("s(X", 1:ncol(X[, -1]), ")")
+
+ if(ncol(X[,-1]) == 2 ){
+ two_way_interactions <- utils::combn(colnames(X[, -1]), 2,
+ function(cols){paste0("lo(", paste(cols, collapse = "*"), ")")})
+
+ my_formula<-stats::as.formula(paste("Y ~ ",paste(main_effects,collapse = " + "),"+",
+ paste(two_way_interactions,collapse = " + ")))
+ } else{
+ my_formula<-stats::as.formula(paste("Y ~ ",paste(main_effects,collapse = " + ")))
+ }
+
n1 <- sum(Y)
n0 <- N - n1
PI.prop <- rep(1/(2*n0), N)
PI.prop[Y==1] <- 1/(2*n1)
- idx.prop <- sample(1:N, r1, T, PI.prop)
+ idx.prop <- sample(1:N, size = r1, replace = TRUE, prob = PI.prop)
x.prop <- X[idx.prop,]
y.prop <- Y[idx.prop]
@@ -146,20 +159,17 @@ modelMissLogSub <- function(r1,r2,Y,X,N,Alpha,proportion){
if (anyNA(beta.prop)){
stop("There are NA or NaN values in the model parameters")
}
- P.prop <- 1 - 1 / (1 + exp(X %*% beta.prop))
-
- my_formula<-stats::as.formula(paste("Y ~ ",paste(paste0("s(X",1:ncol(x.prop[,-1]),")"),collapse = " + "),"+",
- paste(paste0("s(",paste0(colnames(x.prop[,-1]),collapse = "*"),")"),
- collapse = " + ")))
+ Xbeta_Final<-X %*% beta.prop
+ P.prop <- 1 - 1 / (1 + exp(Xbeta_Final))
# calculate f_hat
Assumed_Data<-data.frame(Y=y.prop,x.prop)
fit_GAM<-gam::gam(my_formula,data=Assumed_Data,family = "binomial")
Xbeta_GAM<-gam::predict.Gam(fit_GAM,newdata = data.frame(X))
- f_estimate<-Xbeta_GAM - X%*%beta.prop
+ f_estimate<-Xbeta_GAM - Xbeta_Final
if(proportion*N != r1){
- idx.proportion <- sample(1:N, ceiling(proportion*N), T, PI.prop)
+ idx.proportion <- sample(1:N, size = ceiling(proportion*N), replace = TRUE, prob = PI.prop)
Y_proportion <- Y[idx.proportion]
X_proportion <- X[idx.proportion,]
@@ -175,8 +185,7 @@ modelMissLogSub <- function(r1,r2,Y,X,N,Alpha,proportion){
F_Estimate_Full<-Xbeta_GAM_proportion - Xbeta_proportion
Beta_Estimate_Full<-beta.proportion
- }
- else {
+ } else {
Beta_Estimate_Full<- beta.prop ; F_Estimate_Full<-f_estimate
}
@@ -187,7 +196,7 @@ modelMissLogSub <- function(r1,r2,Y,X,N,Alpha,proportion){
# mMSE
p.prop <- P.prop[idx.prop]
w.prop <- p.prop * (1 - p.prop)
- W.prop <- solve(t(x.prop) %*% (x.prop * w.prop * pinv.prop))
+ W.prop <- solve(crossprod(x.prop, x.prop * w.prop * pinv.prop))
PI.mMSE <- sqrt((Y - P.prop)^2 * rowSums((X%*%W.prop)^2))
PI.mMSE <- PI.mMSE / sum(PI.mMSE)
@@ -214,18 +223,18 @@ modelMissLogSub <- function(r1,r2,Y,X,N,Alpha,proportion){
diff <- XH_b_r1 - f_r1
L2_r1 <- sum((W_r1 * diff)^2)
- c(L1_r1+L2_r1)
+ L1_r1+L2_r1
}
p_r1<-1-1/(1+exp(X[idx.prop,]%*%beta.prop))
W_r1<-as.vector(p_r1*(1-p_r1))
- H_r1 <-solve(t(X[idx.prop,]) %*% (X[idx.prop,] * W_r1))
+ H_r1 <-solve(crossprod(X[idx.prop,], X[idx.prop,] * W_r1))
Temp1<-(W_r1*X[idx.prop,])%*%H_r1
p_Tr1<-1-1/(1+exp((X[idx.prop,] %*% beta.prop) + f_estimate[idx.prop]))
W_Tr1<-as.vector(p_Tr1*(1-p_Tr1))
- H_Tr1 <-(t(X[idx.prop,]) %*% (X[idx.prop,] * W_Tr1))
- b_r1 <-(t(X[idx.prop,]) %*% (p_Tr1-p_r1))
+ H_Tr1 <-crossprod(X[idx.prop,], X[idx.prop,] * W_Tr1)
+ b_r1 <-crossprod(X[idx.prop,], p_Tr1-p_r1)
L1_r1 <- psych::tr(Temp1%*%H_Tr1%*%t(Temp1))
L2_r1 <- sum((W_r1*((X[idx.prop,]%*%H_r1%*%b_r1)-f_estimate[idx.prop]))^2)
@@ -276,7 +285,7 @@ modelMissLogSub <- function(r1,r2,Y,X,N,Alpha,proportion){
for (i in 1:length(r2))
{
# mVc
- idx.mVc <- sample(1:N, r2[i]-r1, T, PI.mVc)
+ idx.mVc <- sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI.mVc)
x.mVc <- X[c(idx.mVc, idx.prop),]
y.mVc <- Y[c(idx.mVc, idx.prop)]
@@ -289,20 +298,20 @@ modelMissLogSub <- function(r1,r2,Y,X,N,Alpha,proportion){
p_r1<-1-1/(1+exp(x.mVc%*%Beta_Estimate_Full))
W_r1<-as.vector(p_r1*(1-p_r1))
- H_r1 <-solve(t(x.mVc) %*% (x.mVc * W_r1))
+ H_r1 <-solve(crossprod(x.mVc, x.mVc * W_r1))
Temp1<-(W_r1*x.mVc)%*%H_r1
p_Tr1<-1-1/(1+exp((x.mVc %*% Beta_Estimate_Full) + F_Estimate_Full[c(idx.mVc, idx.prop)]))
W_Tr1<-as.vector(p_Tr1*(1-p_Tr1))
- H_Tr1 <-(t(x.mVc) %*% (x.mVc * W_Tr1))
- b_r1 <-(t(x.mVc) %*% (p_Tr1-p_r1))
+ H_Tr1 <-crossprod(x.mVc, x.mVc * W_Tr1)
+ b_r1 <-crossprod(x.mVc, p_Tr1-p_r1)
L1_r1 <- psych::tr(Temp1%*%H_Tr1%*%t(Temp1))
L2_r1 <- sum((W_r1*((x.mVc%*%H_r1%*%b_r1) - F_Estimate_Full[c(idx.mVc, idx.prop)]))^2)
AMSE_Sample_mVc[i,]<-c(r2[i],L1_r1,L2_r1,L1_r1+L2_r1)
# mMSE
- idx.mMSE <- sample(1:N, r2[i]-r1, T, PI.mMSE)
+ idx.mMSE <- sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI.mMSE)
x.mMSE <- X[c(idx.mMSE, idx.prop),]
y.mMSE <- Y[c(idx.mMSE, idx.prop)]
@@ -315,20 +324,20 @@ modelMissLogSub <- function(r1,r2,Y,X,N,Alpha,proportion){
p_r1<-1-1/(1+exp(x.mMSE%*%Beta_Estimate_Full))
W_r1<-as.vector(p_r1*(1-p_r1))
- H_r1 <-solve(t(x.mMSE) %*% (x.mMSE * W_r1))
+ H_r1 <-solve(crossprod(x.mMSE, x.mMSE * W_r1))
Temp1<-(W_r1 * x.mMSE)%*%H_r1
p_Tr1<-1-1/(1+exp((x.mMSE %*% Beta_Estimate_Full) + F_Estimate_Full[c(idx.mMSE, idx.prop)]))
W_Tr1<-as.vector(p_Tr1*(1-p_Tr1))
- H_Tr1 <-(t(x.mMSE) %*% (x.mMSE * W_Tr1))
- b_r1 <-(t(x.mMSE) %*% (p_Tr1-p_r1))
+ H_Tr1 <-crossprod(x.mMSE, x.mMSE * W_Tr1)
+ b_r1 <-crossprod(x.mMSE, p_Tr1-p_r1)
L1_r1 <- psych::tr(Temp1%*%H_Tr1%*%t(Temp1))
L2_r1 <- sum((W_r1*((x.mMSE%*%H_r1%*%b_r1) - F_Estimate_Full[c(idx.mMSE, idx.prop)]))^2)
AMSE_Sample_mMSE[i,]<-c(r2[i],L1_r1,L2_r1,L1_r1+L2_r1)
# RLmAMSE
- idx.RLmAMSE <- sample(1:N, r2[i], T, PI.RLmAMSE)
+ idx.RLmAMSE <- sample(1:N, size = r2[i], replace = TRUE, prob = PI.RLmAMSE)
x.RLmAMSE <- X[c(idx.RLmAMSE),]
y.RLmAMSE <- Y[c(idx.RLmAMSE)]
@@ -341,13 +350,13 @@ modelMissLogSub <- function(r1,r2,Y,X,N,Alpha,proportion){
p_r1<-1-1/(1+exp(x.RLmAMSE%*%Beta_Estimate_Full))
W_r1<-as.vector(p_r1*(1-p_r1))
- H_r1 <-solve(t(x.RLmAMSE) %*% (x.RLmAMSE * W_r1))
+ H_r1 <-solve(crossprod(x.RLmAMSE, x.RLmAMSE * W_r1))
Temp1<-(W_r1 * x.RLmAMSE)%*%H_r1
p_Tr1<-1-1/(1+exp((x.RLmAMSE %*% Beta_Estimate_Full) + F_Estimate_Full[c(idx.RLmAMSE)]))
W_Tr1<-as.vector(p_Tr1*(1-p_Tr1))
- H_Tr1 <-(t(x.RLmAMSE) %*% (x.RLmAMSE * W_Tr1))
- b_r1 <-(t(x.RLmAMSE) %*% (p_Tr1-p_r1))
+ H_Tr1 <-crossprod(x.RLmAMSE, x.RLmAMSE * W_Tr1)
+ b_r1 <- crossprod(x.RLmAMSE, p_Tr1-p_r1)
L1_r1 <- psych::tr(Temp1%*%H_Tr1%*%t(Temp1))
L2_r1 <- sum((W_r1*((x.RLmAMSE%*%H_r1%*%b_r1) - F_Estimate_Full[c(idx.RLmAMSE)]))^2)
@@ -356,7 +365,7 @@ modelMissLogSub <- function(r1,r2,Y,X,N,Alpha,proportion){
for (j in 1:length(Alpha))
{
# RLmAMSE Log Odds
- idx.RLmAMSE <- sample(1:N, r2[i], T, PI.RLmAMSE_LO[,j])
+ idx.RLmAMSE <- sample(1:N, size = r2[i], replace = TRUE, prob = PI.RLmAMSE_LO[,j])
x.RLmAMSE <- X[c(idx.RLmAMSE),]
y.RLmAMSE <- Y[c(idx.RLmAMSE)]
@@ -369,20 +378,20 @@ modelMissLogSub <- function(r1,r2,Y,X,N,Alpha,proportion){
p_r1<-1-1/(1+exp(x.RLmAMSE%*%Beta_Estimate_Full))
W_r1<-as.vector(p_r1*(1-p_r1))
- H_r1 <-solve(t(x.RLmAMSE) %*% (x.RLmAMSE * W_r1))
+ H_r1 <-solve(crossprod(x.RLmAMSE, x.RLmAMSE * W_r1))
Temp1<-(W_r1 * x.RLmAMSE)%*%H_r1
p_Tr1<-1-1/(1+exp((x.RLmAMSE %*% Beta_Estimate_Full) + F_Estimate_Full[c(idx.RLmAMSE)]))
W_Tr1<-as.vector(p_Tr1*(1-p_Tr1))
- H_Tr1 <-(t(x.RLmAMSE) %*% (x.RLmAMSE * W_Tr1))
- b_r1 <-(t(x.RLmAMSE) %*% (p_Tr1-p_r1))
+ H_Tr1 <-crossprod(x.RLmAMSE, x.RLmAMSE * W_Tr1)
+ b_r1 <-crossprod(x.RLmAMSE, p_Tr1-p_r1)
L1_r1 <- psych::tr(Temp1%*%H_Tr1%*%t(Temp1))
L2_r1 <- sum((W_r1*((x.RLmAMSE%*%H_r1%*%b_r1) - F_Estimate_Full[c(idx.RLmAMSE)]))^2)
AMSE_Sample_RLmAMSE_LO[[j]][i,]<-c(r2[i],L1_r1,L2_r1,L1_r1+L2_r1)
# RLmAMSE Power
- idx.RLmAMSE <- sample(1:N, r2[i], T, PI.RLmAMSE_Pow[,j])
+ idx.RLmAMSE <- sample(1:N, size = r2[i], replace = TRUE, prob = PI.RLmAMSE_Pow[,j])
x.RLmAMSE <- X[c(idx.RLmAMSE),]
y.RLmAMSE <- Y[c(idx.RLmAMSE)]
@@ -400,8 +409,8 @@ modelMissLogSub <- function(r1,r2,Y,X,N,Alpha,proportion){
p_Tr1<-1-1/(1+exp((x.RLmAMSE %*% Beta_Estimate_Full) + F_Estimate_Full[c(idx.RLmAMSE)]))
W_Tr1<-as.vector(p_Tr1*(1-p_Tr1))
- H_Tr1 <-(t(x.RLmAMSE) %*% (x.RLmAMSE * W_Tr1))
- b_r1 <-(t(x.RLmAMSE) %*% (p_Tr1-p_r1))
+ H_Tr1 <-crossprod(x.RLmAMSE, x.RLmAMSE * W_Tr1)
+ b_r1 <- crossprod(x.RLmAMSE, p_Tr1-p_r1)
L1_r1 <- psych::tr(Temp1%*%H_Tr1%*%t(Temp1))
L2_r1 <- sum((W_r1*((x.RLmAMSE%*%H_r1%*%b_r1) - F_Estimate_Full[c(idx.RLmAMSE)]))^2)
diff --git a/R/modelMissPoiSub.R b/R/modelMissPoiSub.R
index b8dceb5..a71b6eb 100644
--- a/R/modelMissPoiSub.R
+++ b/R/modelMissPoiSub.R
@@ -97,6 +97,7 @@
#' @import foreach
#' @importFrom gam s
#' @importFrom Rfast rowprods
+#' @importFrom utils combn
#' @importFrom psych tr
#' @export
modelMissPoiSub <- function(r1,r2,Y,X,N,Alpha,proportion){
@@ -131,52 +132,60 @@ modelMissPoiSub <- function(r1,r2,Y,X,N,Alpha,proportion){
message("50% or >=50% of the big data is used to help find AMSE for the subsamples, \nthis could take some time.")
}
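+ # as in modelMissLinSub.R, but pairwise lo() terms are added for two or three covariates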
+ main_effects <- paste0("s(X", 1:ncol(X[, -1]), ")")
+
+ if(ncol(X[,-1]) %in% c(2:3) ){
+ two_way_interactions <- utils::combn(colnames(X[, -1]), 2,
+ function(cols){paste0("lo(", paste(cols, collapse = "*"), ")")})
+
+ my_formula<-stats::as.formula(paste("Y ~ ",paste(main_effects,collapse = " + "),"+",
+ paste(two_way_interactions,collapse = " + ")))
+ } else {
+ my_formula<-stats::as.formula(paste("Y ~ ",paste(main_effects,collapse = " + ")))
+ }
+
PI.prop <- rep(1/N,N)
- idx.prop <- sample(1:N, r1, T,PI.prop)
+ idx.prop <- sample(1:N, size = r1, replace = TRUE, prob = PI.prop)
x.prop <- X[idx.prop,]
y.prop <- Y[idx.prop]
pinv.prop <- rep(N,r1)
- fit.prop <- stats::glm(y.prop~x.prop-1,family="poisson")
+ fit.prop <- stats::glm(y.prop~x.prop-1,family="quasipoisson")
beta.prop <- fit.prop$coefficients
if (anyNA(beta.prop)){
stop("There are NA or NaN values in the model parameters")
}
- Lambda.prop <- exp(X %*% beta.prop)
-
- my_formula<-stats::as.formula(paste("Y ~ ",paste(paste0("gam::s(X",1:ncol(x.prop[,-1]),")"),collapse = " + "),"+",
- paste(paste0("gam::s(",paste0(colnames(x.prop[,-1]),collapse = "*"),")"),
- collapse = " + ")))
+ Xbeta_Final <- X %*% beta.prop
+ Lambda.prop <- exp(Xbeta_Final)
#calculate f_hat
Assumed_Data<-data.frame(Y=y.prop,x.prop)
- fit_GAM<-gam::gam(my_formula,data=Assumed_Data,family = "poisson")
+ fit_GAM<-gam::gam(my_formula,data=Assumed_Data,family = "quasipoisson")
Xbeta_GAM<-gam::predict.Gam(fit_GAM,newdata = data.frame(X))
Lambda_GAM<-exp(Xbeta_GAM)
- f_estimate<-Xbeta_GAM - X %*% beta.prop
+ f_estimate<-Xbeta_GAM - Xbeta_Final
if(proportion*N != r1){
- idx.proportion <- sample(1:N, ceiling(proportion*N), T, PI.prop)
+ idx.proportion <- sample(1:N, size = ceiling(proportion*N), replace = TRUE, prob = PI.prop)
Y_proportion <- Y[idx.proportion]
X_proportion <- X[idx.proportion,]
pinv.proportion <- rep(N,ceiling(proportion*N))
- fit.proportion <- stats::glm(Y_proportion~X_proportion-1,family="poisson")
+ fit.proportion <- stats::glm(Y_proportion~X_proportion-1,family="quasipoisson")
beta.proportion <- fit.proportion$coefficients
Xbeta_proportion <- X %*% beta.proportion
Assumed_Data<-data.frame(Y=Y_proportion,X_proportion)
- fit_GAM_proportion<-gam::gam(my_formula,data=Assumed_Data,family = "poisson")
+ fit_GAM_proportion<-gam::gam(my_formula,data=Assumed_Data,family = "quasipoisson")
Xbeta_GAM_proportion<-gam::predict.Gam(fit_GAM_proportion,newdata = data.frame(X))
F_Estimate_Full<-Xbeta_GAM_proportion - Xbeta_proportion
Beta_Estimate_Full<-beta.proportion
- }
- else {
+ } else {
Beta_Estimate_Full<- beta.prop ; F_Estimate_Full<-f_estimate
}
@@ -187,7 +196,7 @@ modelMissPoiSub <- function(r1,r2,Y,X,N,Alpha,proportion){
# mMSE
lambda.prop <- Lambda.prop[idx.prop]
w.prop <- lambda.prop
- W.prop <- solve(t(x.prop) %*% (x.prop * w.prop * pinv.prop))
+ W.prop <- solve(crossprod(x.prop, x.prop * w.prop * pinv.prop))
PI.mMSE <- sqrt((Y - Lambda.prop)^2 * rowSums((X%*%W.prop)^2))
PI.mMSE <- PI.mMSE / sum(PI.mMSE)
@@ -197,14 +206,14 @@ modelMissPoiSub <- function(r1,r2,Y,X,N,Alpha,proportion){
X_r1<-X[c(idx.prop,a),]
lambda_r1<-exp(X_r1%*%beta.prop)
W_r1<-as.vector(lambda_r1)
- H_r1 <-solve(t(X_r1) %*% (X_r1 * W_r1))
+ H_r1 <-solve(crossprod(X_r1,X_r1 * W_r1))
Temp1<-(W_r1*X_r1)%*%H_r1
f_r1<-f_estimate[c(idx.prop,a)]
lambda_Tr1<-exp((X_r1 %*% beta.prop) + f_r1)
W_Tr1<-as.vector(lambda_Tr1)
- H_Tr1 <-(t(X_r1) %*% (X_r1 * W_Tr1))
- b_r1 <-(t(X_r1) %*% (lambda_Tr1-lambda_r1))
+ H_Tr1 <- crossprod(X_r1, X_r1 * W_Tr1)
+ b_r1 <- crossprod(X_r1, lambda_Tr1-lambda_r1)
Temp1_H <- Temp1 %*% H_Tr1
diag_Temp <- rowSums(Temp1_H * Temp1)
@@ -215,18 +224,18 @@ modelMissPoiSub <- function(r1,r2,Y,X,N,Alpha,proportion){
diff <- XH_b_r1 - f_r1
L2_r1 <- sum((W_r1 * diff)^2)
- c(L1_r1+L2_r1)
+ L1_r1+L2_r1
}
lambda_r1<-exp(X[idx.prop,]%*%beta.prop)
W_r1<-as.vector(lambda_r1)
- H_r1 <-solve(t(X[idx.prop,]) %*% (X[idx.prop,]*W_r1))
+ H_r1 <-solve(crossprod(X[idx.prop,], X[idx.prop,]*W_r1))
Temp1<-(W_r1*X[idx.prop,])%*%H_r1
lambda_Tr1<-exp((X[idx.prop,] %*% beta.prop) + f_estimate[idx.prop])
W_Tr1<-as.vector(lambda_Tr1)
- H_Tr1 <-(t(X[idx.prop,]) %*% (X[idx.prop,]*W_Tr1))
- b_r1 <-(t(X[idx.prop,]) %*% (lambda_Tr1-lambda_r1))
+ H_Tr1 <- crossprod(X[idx.prop,], X[idx.prop,]*W_Tr1)
+ b_r1 <- crossprod(X[idx.prop,], lambda_Tr1-lambda_r1)
L1_r1 <- psych::tr(Temp1%*%H_Tr1%*%t(Temp1))
L2_r1 <- sum((W_r1*((X[idx.prop,]%*%H_r1%*%b_r1)-f_estimate[idx.prop]))^2)
@@ -277,12 +286,13 @@ modelMissPoiSub <- function(r1,r2,Y,X,N,Alpha,proportion){
for (i in 1:length(r2))
{
# mVc
- idx.mVc <- sample(1:N, r2[i]-r1, T, PI.mVc)
+ idx.mVc <- sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI.mVc)
x.mVc <- X[c(idx.mVc, idx.prop),]
y.mVc <- Y[c(idx.mVc, idx.prop)]
- fit.mVc <- stats::glm(y.mVc~x.mVc-1,family="poisson",weights = c(1 / PI.mVc[idx.mVc], pinv.prop))
+ fit.mVc <- stats::glm(y.mVc~x.mVc-1,family="quasipoisson",
+ weights = c(1 / PI.mVc[idx.mVc], pinv.prop))
beta_mVc[i,] <- c(r2[i],fit.mVc$coefficients)
@@ -290,25 +300,26 @@ modelMissPoiSub <- function(r1,r2,Y,X,N,Alpha,proportion){
lambda_r1<-exp(x.mVc%*%Beta_Estimate_Full)
W_r1<-as.vector(lambda_r1)
- H_r1 <-solve(t(x.mVc) %*% (x.mVc * W_r1))
+ H_r1 <-solve(crossprod(x.mVc, x.mVc * W_r1))
Temp1<-(W_r1 * x.mVc)%*%H_r1
lambda_Tr1<-exp((x.mVc %*% Beta_Estimate_Full) + F_Estimate_Full[c(idx.mVc, idx.prop)])
W_Tr1<-as.vector(lambda_Tr1)
- H_Tr1 <-(t(x.mVc) %*% (x.mVc * W_Tr1))
- b_r1 <-(t(x.mVc) %*% (lambda_Tr1-lambda_r1))
+ H_Tr1 <- crossprod(x.mVc, x.mVc * W_Tr1)
+ b_r1 <- crossprod(x.mVc, lambda_Tr1-lambda_r1)
L1_r1 <- psych::tr(Temp1%*%H_Tr1%*%t(Temp1))
L2_r1 <- sum((W_r1*((x.mVc%*%H_r1%*%b_r1) - F_Estimate_Full[c(idx.mVc, idx.prop)]))^2)
AMSE_Sample_mVc[i,]<-c(r2[i],L1_r1,L2_r1,L1_r1+L2_r1)
# mMSE
- idx.mMSE <- sample(1:N, r2[i]-r1, T, PI.mMSE)
+ idx.mMSE <- sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI.mMSE)
x.mMSE <- X[c(idx.mMSE, idx.prop),]
y.mMSE <- Y[c(idx.mMSE, idx.prop)]
- fit.mMSE <- stats::glm(y.mMSE~x.mMSE-1,family = "poisson", weights =c(1 / PI.mMSE[idx.mMSE], pinv.prop) )
+ fit.mMSE <- stats::glm(y.mMSE~x.mMSE-1,family = "quasipoisson",
+ weights =c(1 / PI.mMSE[idx.mMSE], pinv.prop) )
beta_mMSE[i,] <- c(r2[i],fit.mMSE$coefficients)
@@ -316,25 +327,26 @@ modelMissPoiSub <- function(r1,r2,Y,X,N,Alpha,proportion){
lambda_r1<-exp(x.mMSE%*%Beta_Estimate_Full)
W_r1<-as.vector(lambda_r1)
- H_r1 <-solve(t(x.mMSE) %*% (x.mMSE * W_r1))
+ H_r1 <-solve(crossprod(x.mMSE, x.mMSE * W_r1))
Temp1<-(W_r1*x.mMSE)%*%H_r1
lambda_Tr1<-exp((x.mMSE %*% Beta_Estimate_Full) + F_Estimate_Full[c(idx.mMSE, idx.prop)])
W_Tr1<-as.vector(lambda_Tr1)
- H_Tr1 <-(t(x.mMSE) %*% (x.mMSE * W_Tr1))
- b_r1 <-(t(x.mMSE) %*% (lambda_Tr1-lambda_r1))
+ H_Tr1 <- crossprod(x.mMSE, x.mMSE * W_Tr1)
+ b_r1 <- crossprod(x.mMSE, lambda_Tr1-lambda_r1)
L1_r1 <- psych::tr(Temp1%*%H_Tr1%*%t(Temp1))
L2_r1 <- sum((W_r1*((x.mMSE%*%H_r1%*%b_r1) - F_Estimate_Full[c(idx.mMSE, idx.prop)]))^2)
AMSE_Sample_mMSE[i,]<-c(r2[i],L1_r1,L2_r1,L1_r1+L2_r1)
# RLmAMSE
- idx.RLmAMSE <- sample(1:N, r2[i], T, PI.RLmAMSE)
+ idx.RLmAMSE <- sample(1:N, size = r2[i], replace = TRUE, prob = PI.RLmAMSE)
- x.RLmAMSE <- X[c(idx.RLmAMSE),]
- y.RLmAMSE <- Y[c(idx.RLmAMSE)]
+ x.RLmAMSE <- X[idx.RLmAMSE,]
+ y.RLmAMSE <- Y[idx.RLmAMSE]
- fit.RLmAMSE <- stats::glm(y.RLmAMSE~x.RLmAMSE-1,family="poisson",weights = c(1 / PI.RLmAMSE[idx.RLmAMSE]))
+ fit.RLmAMSE <- stats::glm(y.RLmAMSE~x.RLmAMSE-1,family="quasipoisson",
+ weights = c(1 / PI.RLmAMSE[idx.RLmAMSE]))
beta_RLmAMSE[i,] <- c(r2[i],fit.RLmAMSE$coefficients)
@@ -342,27 +354,28 @@ modelMissPoiSub <- function(r1,r2,Y,X,N,Alpha,proportion){
lambda_r1<-exp(x.RLmAMSE%*%Beta_Estimate_Full)
W_r1<-as.vector(lambda_r1)
- H_r1 <-solve(t(x.RLmAMSE) %*% (x.RLmAMSE * W_r1))
+ H_r1 <-solve(crossprod(x.RLmAMSE, x.RLmAMSE * W_r1))
Temp1<-(W_r1*x.RLmAMSE)%*%H_r1
- lambda_Tr1<-exp((x.RLmAMSE %*% Beta_Estimate_Full) + F_Estimate_Full[c(idx.RLmAMSE)])
+ lambda_Tr1<-exp((x.RLmAMSE %*% Beta_Estimate_Full) + F_Estimate_Full[idx.RLmAMSE])
W_Tr1<-as.vector(lambda_Tr1)
- H_Tr1 <-(t(x.RLmAMSE) %*% (x.RLmAMSE * W_Tr1))
- b_r1 <-(t(x.RLmAMSE) %*% (lambda_Tr1-lambda_r1))
+ H_Tr1 <- crossprod(x.RLmAMSE, x.RLmAMSE * W_Tr1)
+ b_r1 <- crossprod(x.RLmAMSE, lambda_Tr1-lambda_r1)
L1_r1 <- psych::tr(Temp1%*%H_Tr1%*%t(Temp1))
- L2_r1 <- sum((W_r1*((x.RLmAMSE%*%H_r1%*%b_r1) - F_Estimate_Full[c(idx.RLmAMSE)]))^2)
+ L2_r1 <- sum((W_r1*((x.RLmAMSE%*%H_r1%*%b_r1) - F_Estimate_Full[idx.RLmAMSE]))^2)
AMSE_Sample_RLmAMSE[i,]<-c(r2[i],L1_r1,L2_r1,L1_r1+L2_r1)
for (j in 1:length(Alpha))
{
# RLmAMSE Log Odds
- idx.RLmAMSE <- sample(1:N, r2[i], T, PI.RLmAMSE_LO[,j])
+ idx.RLmAMSE <- sample(1:N, size = r2[i], replace = TRUE, prob = PI.RLmAMSE_LO[,j])
- x.RLmAMSE <- X[c(idx.RLmAMSE),]
- y.RLmAMSE <- Y[c(idx.RLmAMSE)]
+ x.RLmAMSE <- X[idx.RLmAMSE,]
+ y.RLmAMSE <- Y[idx.RLmAMSE]
- fit.RLmAMSE <- stats::glm(y.RLmAMSE~x.RLmAMSE-1,family="poisson",weights = c(1 / PI.RLmAMSE_LO[idx.RLmAMSE,j]))
+ fit.RLmAMSE <- stats::glm(y.RLmAMSE~x.RLmAMSE-1,family="quasipoisson",
+ weights = c(1 / PI.RLmAMSE_LO[idx.RLmAMSE,j]))
beta_RLmAMSE_LO[[j]][i,] <- c(r2[i],fit.RLmAMSE$coefficients)
@@ -370,25 +383,26 @@ modelMissPoiSub <- function(r1,r2,Y,X,N,Alpha,proportion){
lambda_r1<-exp(x.RLmAMSE%*%Beta_Estimate_Full)
W_r1<-as.vector(lambda_r1)
- H_r1 <-solve(t(x.RLmAMSE) %*% (x.RLmAMSE * W_r1))
+ H_r1 <-solve(crossprod(x.RLmAMSE, x.RLmAMSE * W_r1))
Temp1<-(W_r1*x.RLmAMSE)%*%H_r1
- lambda_Tr1<-exp((x.RLmAMSE %*% Beta_Estimate_Full) + F_Estimate_Full[c(idx.RLmAMSE)])
+ lambda_Tr1<-exp((x.RLmAMSE %*% Beta_Estimate_Full) + F_Estimate_Full[idx.RLmAMSE])
W_Tr1<-as.vector(lambda_Tr1)
- H_Tr1 <-(t(x.RLmAMSE) %*% (x.RLmAMSE * W_Tr1))
- b_r1 <-(t(x.RLmAMSE) %*% (lambda_Tr1-lambda_r1))
+ H_Tr1 <- crossprod(x.RLmAMSE, x.RLmAMSE * W_Tr1)
+ b_r1 <- crossprod(x.RLmAMSE, lambda_Tr1-lambda_r1)
L1_r1 <- psych::tr(Temp1%*%H_Tr1%*%t(Temp1))
- L2_r1 <- sum((W_r1*((x.RLmAMSE%*%H_r1%*%b_r1) - F_Estimate_Full[c(idx.RLmAMSE)]))^2)
+ L2_r1 <- sum((W_r1*((x.RLmAMSE%*%H_r1%*%b_r1) - F_Estimate_Full[idx.RLmAMSE]))^2)
AMSE_Sample_RLmAMSE_LO[[j]][i,]<-c(r2[i],L1_r1,L2_r1,L1_r1+L2_r1)
# RLmAMSE Power
- idx.RLmAMSE <- sample(1:N, r2[i], T, PI.RLmAMSE_Pow[,j])
+ idx.RLmAMSE <- sample(1:N, size = r2[i], replace = TRUE, prob = PI.RLmAMSE_Pow[,j])
- x.RLmAMSE <- X[c(idx.RLmAMSE),]
- y.RLmAMSE <- Y[c(idx.RLmAMSE)]
+ x.RLmAMSE <- X[idx.RLmAMSE,]
+ y.RLmAMSE <- Y[idx.RLmAMSE]
- fit.RLmAMSE <- stats::glm(y.RLmAMSE~x.RLmAMSE-1,family="poisson",weights = c(1 / PI.RLmAMSE_Pow[idx.RLmAMSE,j]))
+ fit.RLmAMSE <- stats::glm(y.RLmAMSE~x.RLmAMSE-1,family="quasipoisson",
+ weights = c(1 / PI.RLmAMSE_Pow[idx.RLmAMSE,j]))
beta_RLmAMSE_Pow[[j]][i,] <- c(r2[i],fit.RLmAMSE$coefficients)
@@ -399,12 +413,12 @@ modelMissPoiSub <- function(r1,r2,Y,X,N,Alpha,proportion){
H_r1 <-solve(t(x.RLmAMSE) %*% (x.RLmAMSE * W_r1))
Temp1<-(W_r1*x.RLmAMSE)%*%H_r1
- lambda_Tr1<-exp((x.RLmAMSE %*% Beta_Estimate_Full) + F_Estimate_Full[c(idx.RLmAMSE)])
+ lambda_Tr1<-exp((x.RLmAMSE %*% Beta_Estimate_Full) + F_Estimate_Full[idx.RLmAMSE])
W_Tr1<-as.vector(lambda_Tr1)
- H_Tr1 <-(t(x.RLmAMSE) %*% (x.RLmAMSE * W_Tr1))
- b_r1 <-(t(x.RLmAMSE) %*% (lambda_Tr1-lambda_r1))
+ H_Tr1 <- crossprod(x.RLmAMSE, x.RLmAMSE * W_Tr1)
+ b_r1 <- crossprod(x.RLmAMSE, lambda_Tr1-lambda_r1)
L1_r1 <- psych::tr(Temp1%*%H_Tr1%*%t(Temp1))
- L2_r1 <- sum((W_r1*((x.RLmAMSE%*%H_r1%*%b_r1) - F_Estimate_Full[c(idx.RLmAMSE)]))^2)
+ L2_r1 <- sum((W_r1*((x.RLmAMSE%*%H_r1%*%b_r1) - F_Estimate_Full[idx.RLmAMSE]))^2)
AMSE_Sample_RLmAMSE_Pow[[j]][i,]<-c(r2[i],L1_r1,L2_r1,L1_r1+L2_r1)
}
diff --git a/R/modelRobustLinSub.R b/R/modelRobustLinSub.R
index 8612e1f..a1e6881 100644
--- a/R/modelRobustLinSub.R
+++ b/R/modelRobustLinSub.R
@@ -64,7 +64,7 @@
#' indexes<-1:ceiling(nrow(Electric_consumption)*0.005)
#' Original_Data<-cbind(Electric_consumption[indexes,1],1,
#' Electric_consumption[indexes,-1])
-#' colnames(Original_Data)<-c("Y",paste0("X",0:ncol(Original_Data[,-c(1,2)])))
+#' colnames(Original_Data)<-c("Y",paste0("X",0:ncol(Original_Data[,-c(1,2)])))
#' for (j in 3:5) {
#' Original_Data[,j]<-scale(Original_Data[,j])
#' }
@@ -91,8 +91,7 @@
#'
#' r1<-300; r2<-rep(100*c(6,12),25);
#'
-#' modelRobustLinSub(r1 = r1, r2 = r2,
-#' Y = as.matrix(Original_Data[,colnames(Original_Data) %in% c("Y")]),
+#' modelRobustLinSub(r1 = r1, r2 = r2, Y = as.matrix(Original_Data[,1]),
#' X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data),
#' Apriori_probs = rep(1/length(All_Models),length(All_Models)),
#' All_Combinations = All_Models,
@@ -133,7 +132,7 @@ modelRobustLinSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov
}
PI.prop <- rep(1/N, N)
- idx.prop <- sample(1:N, r1, T)
+ idx.prop <- sample(1:N, size = r1, replace = TRUE)
x.prop<-lapply(1:length(All_Combinations),function(j){
X[idx.prop,All_Covariates %in% All_Combinations[[j]]]
@@ -145,8 +144,8 @@ modelRobustLinSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov
pinv.prop <- 1/PI.prop[idx.prop]
fit.prop <- lapply(1:length(All_Combinations), function(j){
- beta.prop<-solve(a=t(x.prop[[j]])%*%x.prop[[j]],b=t(x.prop[[j]])%*%y.prop)
- Xbeta_Final<-as.vector(X[,All_Covariates %in% All_Combinations[[j]] ]%*%beta.prop)
+ beta.prop<-solve(a=crossprod(x.prop[[j]]),b=crossprod(x.prop[[j]],y.prop))
+ Xbeta_Final<-X[,All_Covariates %in% All_Combinations[[j]]]%*%beta.prop
Var.prop<-sum((Y-Xbeta_Final)^2)/N
Epsilon.prop<-Y-Xbeta_Final
return(list("beta.prop"=beta.prop,"Var.prop"=Var.prop,"Epsilon.prop"=Epsilon.prop))
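
The pilot fit above solves the normal equations $(X^\top X)\beta = X^\top y$ directly, which is what `solve(a = crossprod(x), b = crossprod(x, y))` expresses. A toy check against `lm.fit()`, with data invented for the purpose:

```r
X <- cbind(1, rnorm(50)); y <- drop(X %*% c(1, 2) + rnorm(50))
beta_ne <- solve(a = crossprod(X), b = crossprod(X, y))
all.equal(drop(beta_ne), unname(coef(lm.fit(X, y))))  # TRUE
```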
@@ -199,18 +198,10 @@ modelRobustLinSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov
PI_MR.mVc<-matrixStats::rowSums2(do.call(cbind,PI_Single.mVc)%*%diag(Apriori_probs)) # Model Robust Results
## mMSE
- # PI_Single.mMSE <- lapply(1:length(All_Combinations),function(j){ # Single Model Results
- # PI.mMSE<-sqrt(fit.prop[[j]]$Epsilon.prop^2 *
- # matrixStats::rowSums2((X[,All_Covariates %in% All_Combinations[[j]] ] %*%
- # solve(t(X[,All_Covariates %in% All_Combinations[[j]] ])%*%
- # X[,All_Covariates %in% All_Combinations[[j]] ]))^2))
- # return(PI.mMSE/sum(PI.mMSE))
- # })
-
# For efficient row-wise operations
PI_Single.mMSE <- lapply(seq_along(All_Combinations), function(j) {
X_sub <- X[, All_Covariates %in% All_Combinations[[j]], drop = FALSE]
- XtX_inv <- solve(crossprod(X_sub)) # More efficient than t(X) %*% X
+ XtX_inv <- solve(crossprod(X_sub))
row_sums_squared <- matrixStats::rowSums2((X_sub %*% XtX_inv)^2)
PI.mMSE <- sqrt(fit.prop[[j]]$Epsilon.prop^2 * row_sums_squared)
PI.mMSE / sum(PI.mMSE)
@@ -223,9 +214,9 @@ modelRobustLinSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov
{
## mVc
idx_Single.mVc <- lapply(1:length(All_Combinations), function(j){
- sample(1:N, r2[i]-r1, T, PI_Single.mVc[[j]]) # Single Model Results
+ sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI_Single.mVc[[j]]) # Single Model Results
})
- idx_MR.mVc <- sample(1:N, r2[i]-r1, T, PI_MR.mVc) # Model Robust Results
+ idx_MR.mVc <- sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI_MR.mVc) # Model Robust Results
x_Single.mVc <-lapply(1:length(All_Combinations),function(j){ # Single Model Results
X[c(idx_Single.mVc[[j]], idx.prop),All_Covariates %in% All_Combinations[[j]] ]
@@ -245,8 +236,8 @@ modelRobustLinSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov
pi4_r<-sqrt(r2[i]*pinv_Single.mVc^(-1))
X_r4<-x_Single.mVc[[j]]/pi4_r
Y_r4<-y_Single.mVc[[j]]/pi4_r
- beta.prop<-solve(a=t(X_r4)%*%X_r4,b=t(X_r4)%*%Y_r4)
- Xbeta_Final<-as.vector(X[,All_Covariates %in% All_Combinations[[j]] ]%*%beta.prop)
+ beta.prop<-solve(a=crossprod(X_r4),b=crossprod(X_r4,Y_r4))
+ Xbeta_Final<-X[,All_Covariates %in% All_Combinations[[j]] ]%*%beta.prop
Var.prop<-sum((Y-Xbeta_Final)^2)/N
return(list("beta.prop"=beta.prop,"Var.prop"=Var.prop))
})
@@ -255,8 +246,8 @@ modelRobustLinSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov
pi4_r<-sqrt(r2[i]*pinv_MR.mVc^(-1))
X_r4<-x_MR.mVc[[j]]/pi4_r
Y_r4<-y_MR.mVc/pi4_r
- beta.prop<-solve(a=t(X_r4)%*%X_r4,b=t(X_r4)%*%Y_r4) # Model Robust Results
- Xbeta_Final<-as.vector(X[,All_Covariates %in% All_Combinations[[j]] ]%*%beta.prop)
+ beta.prop<-solve(a=crossprod(X_r4),b=crossprod(X_r4,Y_r4))
+ Xbeta_Final<-X[,All_Covariates %in% All_Combinations[[j]] ]%*%beta.prop
Var.prop<-sum((Y-Xbeta_Final)^2)/N
return(list("beta.prop"=beta.prop,"Var.prop"=Var.prop))
})
@@ -279,9 +270,9 @@ modelRobustLinSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov
## mMSE
idx_Single.mMSE <- lapply(1:length(All_Combinations),function(j){
- sample(1:N, r2[i]-r1, T, PI_Single.mMSE[[j]]) # Single Model Results
+ sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI_Single.mMSE[[j]]) # Single Model Results
})
- idx_MR.mMSE <- sample(1:N, r2[i]-r1, T, PI_MR.mMSE) # Model Robust Results
+ idx_MR.mMSE <- sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI_MR.mMSE) # Model Robust Results
x_Single.mMSE <- lapply(1:length(All_Combinations),function(j){
X[c(idx_Single.mMSE[[j]], idx.prop),All_Covariates %in% All_Combinations[[j]] ] # Single Model Results
@@ -301,8 +292,8 @@ modelRobustLinSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov
pi4_r<-sqrt(r2[i]*pinv_Single.mMSE^(-1))
X_r4<-x_Single.mMSE[[j]]/pi4_r
Y_r4<-y_Single.mMSE[[j]]/pi4_r
- beta.prop<-solve(a=t(X_r4)%*%X_r4,b=t(X_r4)%*%Y_r4)
- Xbeta_Final<-as.vector(X[,All_Covariates %in% All_Combinations[[j]] ]%*%beta.prop)
+ beta.prop<-solve(a=crossprod(X_r4),b=crossprod(X_r4,Y_r4))
+ Xbeta_Final<-X[,All_Covariates %in% All_Combinations[[j]] ]%*%beta.prop
Var.prop<-sum((Y-Xbeta_Final)^2)/N
return(list("beta.prop"=beta.prop,"Var.prop"=Var.prop))
})
@@ -311,8 +302,8 @@ modelRobustLinSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov
pi4_r<-sqrt(r2[i]*pinv_MR.mMSE^(-1))
X_r4<-x_MR.mMSE[[j]]/pi4_r
Y_r4<-y_MR.mMSE/pi4_r
- beta.prop<-solve(a=t(X_r4)%*%X_r4,b=t(X_r4)%*%Y_r4) # Model Robust Results
- Xbeta_Final<-as.vector(X[,All_Covariates %in% All_Combinations[[j]] ]%*%beta.prop)
+ beta.prop<-solve(a=crossprod(X_r4),b=crossprod(X_r4,Y_r4))
+ Xbeta_Final<-X[,All_Covariates %in% All_Combinations[[j]] ]%*%beta.prop
Var.prop<-sum((Y-Xbeta_Final)^2)/N
return(list("beta.prop"=beta.prop,"Var.prop"=Var.prop))
})
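
The `pi4_r` scaling in these hunks is weighted least squares by row scaling: dividing the rows of $X$ and $y$ by $\sqrt{\pi}$ and solving ordinary least squares minimises $\sum_i (1/\pi_i)(y_i - x_i^\top \beta)^2$ (the constant $r_2$ factor cancels in the solve). A sketch with hypothetical sampling probabilities:

```r
X <- cbind(1, rnorm(30)); y <- drop(X %*% c(1, -1) + rnorm(30))
pi_r <- runif(30, 0.5, 2)  # made-up sampling probabilities
Xs <- X / sqrt(pi_r); ys <- y / sqrt(pi_r)
b_scaled <- solve(crossprod(Xs), crossprod(Xs, ys))
b_wls <- coef(lm(y ~ X - 1, weights = 1 / pi_r))
all.equal(drop(b_scaled), unname(b_wls))  # TRUE
```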
diff --git a/R/modelRobustLogSub.R b/R/modelRobustLogSub.R
index 2a1bc5c..2adf2da 100644
--- a/R/modelRobustLogSub.R
+++ b/R/modelRobustLogSub.R
@@ -91,8 +91,7 @@
#'
#' r1<-300; r2<-rep(100*c(6,12),25);
#'
-#' modelRobustLogSub(r1 = r1, r2 = r2,
-#' Y = as.matrix(Original_Data[,colnames(Original_Data) %in% c("Y")]),
+#' modelRobustLogSub(r1 = r1, r2 = r2, Y = as.matrix(Original_Data[,1]),
#' X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data),
#' Apriori_probs = rep(1/length(All_Models),length(All_Models)),
#' All_Combinations = All_Models,
@@ -137,7 +136,7 @@ modelRobustLogSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov
n0 <- N - n1
PI.prop <- rep(1/(2*n0), N)
PI.prop[Y==1] <- 1/(2*n1)
- idx.prop <- sample(1:N, r1, T, PI.prop)
+ idx.prop <- sample(1:N, size = r1, replace = TRUE, prob = PI.prop)
x.prop<-lapply(1:length(All_Combinations),function(j){
X[idx.prop,All_Covariates %in% All_Combinations[[j]] ]
@@ -211,7 +210,7 @@ modelRobustLogSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov
p_Single.prop[[j]] * (1 - p_Single.prop[[j]]) # Single Model Results
})
W_Single.prop <- lapply(1:length(All_Combinations),function(j){
- solve(t(x.prop[[j]]) %*% (x.prop[[j]] * w_Single.prop[[j]] * pinv.prop)) # Single Model Results
+ solve(crossprod(x.prop[[j]],x.prop[[j]] * w_Single.prop[[j]] * pinv.prop)) # Single Model Results
})
PI_Single.mMSE <- lapply(1:length(All_Combinations),function(j){
PI.mMSE<-sqrt((Y - P.prop[[j]])^2 * matrixStats::rowSums2((X[,All_Covariates %in% All_Combinations[[j]] ]%*%
@@ -226,9 +225,9 @@ modelRobustLogSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov
{
## mVc
idx_Single.mVc <- lapply(1:length(All_Combinations), function(j){
- sample(1:N, r2[i]-r1, T, PI_Single.mVc[[j]]) # Single Model Results
+ sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI_Single.mVc[[j]]) # Single Model Results
})
- idx_MR.mVc <- sample(1:N, r2[i]-r1, T, PI_MR.mVc) # Model Robust Results
+ idx_MR.mVc <- sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI_MR.mVc) # Model Robust Results
x_Single.mVc <-lapply(1:length(All_Combinations),function(j){
X[c(idx_Single.mVc[[j]], idx.prop),All_Covariates %in% All_Combinations[[j]] ] # Single Model results
@@ -244,7 +243,7 @@ modelRobustLogSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov
fit_Single.mVc <-lapply(1:length(All_Combinations), function(j){
.getMLE(x=x_Single.mVc[[j]], y=y_Single.mVc[[j]],
- w=c(1 / PI_Single.mVc[[j]][idx_Single.mVc[[j]]], pinv.prop)) # Single Model Results
+ w=c(1 / PI_Single.mVc[[j]][idx_Single.mVc[[j]]], pinv.prop)) # Single Model Results
})
fit_MR.mVc <- lapply(1:length(All_Combinations),function(j){
@@ -268,12 +267,10 @@ modelRobustLogSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov
V_Final<-lapply(1:length(All_Combinations),function(j){
pi<-1-1/(1 + exp(x_Single.mVc[[j]] %*% beta.mVc_Single[[j]][i,-1]))
W<-as.vector(pi*(1-pi)*c(1 / PI_Single.mVc[[j]][idx_Single.mVc[[j]]], pinv.prop))
- Mx<-solve((t(x_Single.mVc[[j]]) %*% (x_Single.mVc[[j]] * W)))
-
+ Mx<-solve(crossprod(x_Single.mVc[[j]],x_Single.mVc[[j]] * W))
Middle<-((as.vector(y_Single.mVc[[j]])-as.vector(pi))*
as.vector(c(1 / PI_Single.mVc[[j]][idx_Single.mVc[[j]]], pinv.prop)))^2
- V_Temp<-(t(x_Single.mVc[[j]]) %*% (x_Single.mVc[[j]] * Middle) )
-
+ V_Temp<-crossprod(x_Single.mVc[[j]],x_Single.mVc[[j]] * Middle)
Mx %*% V_Temp %*% Mx
})
@@ -286,11 +283,9 @@ modelRobustLogSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov
V_Final<-lapply(1:length(All_Combinations),function(j){
pi<-1-1/(1+exp(x_MR.mVc[[j]] %*% beta.mVc_MR[[j]][i,-1]))
W<-as.vector(pi*(1-pi)*c(1 / PI_MR.mVc[idx_MR.mVc], pinv.prop))
- Mx<-solve((t(x_MR.mVc[[j]]) %*% (x_MR.mVc[[j]] * W) ))
-
+ Mx<-solve(crossprod(x_MR.mVc[[j]], x_MR.mVc[[j]] * W))
Middle<-((as.vector(y_MR.mVc)-as.vector(pi))*as.vector(c(1 / PI_MR.mVc[idx_MR.mVc], pinv.prop)))^2
- V_Temp<-(t(x_MR.mVc[[j]]) %*% (x_MR.mVc[[j]] * Middle))
-
+ V_Temp<-crossprod(x_MR.mVc[[j]], x_MR.mVc[[j]] * Middle)
Mx %*% V_Temp %*% Mx
})
@@ -301,9 +296,9 @@ modelRobustLogSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov
## mMSE
idx_Single.mMSE <- lapply(1:length(All_Combinations),function(j){
- sample(1:N, r2[i]-r1, T, PI_Single.mMSE[[j]]) # Single Model Results
+ sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI_Single.mMSE[[j]]) # Single Model Results
})
- idx_MR.mMSE <- sample(1:N, r2[i]-r1, T, PI_MR.mMSE) # Model Robust Results
+ idx_MR.mMSE <- sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI_MR.mMSE) # Model Robust Results
x_Single.mMSE <- lapply(1:length(All_Combinations),function(j){
X[c(idx_Single.mMSE[[j]], idx.prop),All_Covariates %in% All_Combinations[[j]] ] # Single Model Results
@@ -319,7 +314,7 @@ modelRobustLogSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov
fit_Single.mMSE <- lapply(1:length(All_Combinations),function(j){
.getMLE(x=x_Single.mMSE[[j]], y=y_Single.mMSE[[j]],
- w=c(1 / PI_Single.mMSE[[j]][idx_Single.mMSE[[j]]], pinv.prop)) # Single Model Results
+ w=c(1 / PI_Single.mMSE[[j]][idx_Single.mMSE[[j]]], pinv.prop)) # Single Model Results
})
fit_MR.mMSE <- lapply(1:length(All_Combinations), function(j){
@@ -345,12 +340,10 @@ modelRobustLogSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov
V_Final<-lapply(1:length(All_Combinations),function(j){
pi<-1-1/(1 + exp(x_Single.mMSE[[j]] %*% beta.mMSE_Single[[j]][i,-1]))
W<-as.vector(pi*(1-pi)*c(1 / PI_Single.mMSE[[j]][idx_Single.mMSE[[j]]], pinv.prop))
- Mx<-solve((t(x_Single.mMSE[[j]]) %*% (x_Single.mMSE[[j]] * W)))
-
+ Mx<-solve(crossprod(x_Single.mMSE[[j]], x_Single.mMSE[[j]] * W))
Middle<-((as.vector(y_Single.mMSE[[j]])-as.vector(pi))*
as.vector(c(1 / PI_Single.mMSE[[j]][idx_Single.mMSE[[j]]], pinv.prop)))^2
- V_Temp<-(t(x_Single.mMSE[[j]]) %*% (x_Single.mMSE[[j]] * Middle))
-
+ V_Temp<-crossprod(x_Single.mMSE[[j]], x_Single.mMSE[[j]] * Middle)
Mx %*% V_Temp %*% Mx
})
@@ -363,11 +356,9 @@ modelRobustLogSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov
V_Final<-lapply(1:length(All_Combinations),function(j){
pi<-1-1/(1+exp(x_MR.mMSE[[j]] %*% beta.mMSE_MR[[j]][i,-1]))
W<-as.vector(pi*(1-pi)*c(1 / PI_MR.mMSE[idx_MR.mMSE], pinv.prop))
- Mx<-solve((t(x_MR.mMSE[[j]]) %*% (x_MR.mMSE[[j]] * W)))
-
+ Mx<-solve(crossprod(x_MR.mMSE[[j]], x_MR.mMSE[[j]] * W))
Middle<-((as.vector(y_MR.mMSE)-as.vector(pi))*as.vector(c(1 / PI_MR.mMSE[idx_MR.mMSE], pinv.prop)))^2
- V_Temp<-(t(x_MR.mMSE[[j]]) %*% (x_MR.mMSE[[j]] * Middle))
-
+ V_Temp<-crossprod(x_MR.mMSE[[j]], x_MR.mMSE[[j]] * Middle)
Mx %*% V_Temp %*% Mx
})
@@ -426,7 +417,7 @@ modelRobustLogSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov
msg <- "NA"
while (loop <= Loop) {
pr <- c(1 - 1 / (1 + exp(x %*% beta)))
- H <- t(x) %*% (pr * (1 - pr) * w * x)
+ H <- crossprod(x, pr * (1 - pr) * w * x)
S <- colSums((y - pr) * w * x)
tryCatch(
{shs <- NA
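
The loop in `.getMLE` is Newton-Raphson on the weighted logistic log-likelihood: `H` is the weighted information matrix, `S` the weighted score, and each iteration moves `beta` by `solve(H, S)`. One step on toy data (all values hypothetical):

```r
set.seed(2)
x <- cbind(1, rnorm(200)); w <- runif(200, 1, 4); beta <- c(0, 0)
y <- rbinom(200, 1, plogis(drop(x %*% c(-0.5, 1))))
pr <- c(1 - 1 / (1 + exp(x %*% beta)))     # fitted probabilities
H <- crossprod(x, pr * (1 - pr) * w * x)   # weighted information
S <- colSums((y - pr) * w * x)             # weighted score
beta <- beta + solve(H, S)                 # one ascent step
```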
diff --git a/R/modelRobustPoiSub.R b/R/modelRobustPoiSub.R
index 9a6de83..606bd1f 100644
--- a/R/modelRobustPoiSub.R
+++ b/R/modelRobustPoiSub.R
@@ -93,8 +93,7 @@
#'
#' r1<-300; r2<-rep(100*c(6,12),25);
#'
-#' modelRobustPoiSub(r1 = r1, r2 = r2,
-#' Y = as.matrix(Original_Data[,colnames(Original_Data) %in% c("Y")]),
+#' modelRobustPoiSub(r1 = r1, r2 = r2, Y = as.matrix(Original_Data[,1]),
#' X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data),
#' Apriori_probs = rep(1/length(All_Models),length(All_Models)),
#' All_Combinations = All_Models,
@@ -137,7 +136,7 @@ modelRobustPoiSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov
}
PI.prop <- rep(1/N, N)
- idx.prop <- sample(1:N, r1, T)
+ idx.prop <- sample(1:N, size = r1, replace = TRUE)
x.prop<-lapply(1:length(All_Combinations),function(j){
X[idx.prop,All_Covariates %in% All_Combinations[[j]]]
@@ -149,7 +148,7 @@ modelRobustPoiSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov
pinv.prop <- 1/PI.prop[idx.prop]
fit.prop <- lapply(1:length(All_Combinations), function(j){
- stats::glm(y.prop~x.prop[[j]]-1,family = "poisson")
+ stats::glm(y.prop~x.prop[[j]]-1,family = "quasipoisson")
})
beta.prop<-list()
@@ -210,7 +209,7 @@ modelRobustPoiSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov
P.prop[[j]][idx.prop] # Single Model Results
})
W_Single.prop <- lapply(1:length(All_Combinations),function(j){
- solve(t(x.prop[[j]]) %*% (x.prop[[j]] * w_Single.prop[[j]] * pinv.prop)) # Single Model Results
+ solve(crossprod(x.prop[[j]], x.prop[[j]] * w_Single.prop[[j]] * pinv.prop)) # Single Model Results
})
PI_Single.mMSE <- lapply(1:length(All_Combinations),function(j){ # Single Model Results
PI.mMSE<-sqrt((Y - P.prop[[j]])^2 *
@@ -225,9 +224,9 @@ modelRobustPoiSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov
{
## mVc
idx_Single.mVc <- lapply(1:length(All_Combinations), function(j){
- sample(1:N, r2[i]-r1, T, PI_Single.mVc[[j]]) # Single Model Results
+ sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI_Single.mVc[[j]]) # Single Model Results
})
- idx_MR.mVc <- sample(1:N, r2[i]-r1, T, PI_MR.mVc) # Model Robust Results
+ idx_MR.mVc <- sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI_MR.mVc) # Model Robust Results
x_Single.mVc <-lapply(1:length(All_Combinations),function(j){ # Single Model Results
X[c(idx_Single.mVc[[j]], idx.prop),All_Covariates %in% All_Combinations[[j]] ]
@@ -244,11 +243,11 @@ modelRobustPoiSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov
fit_Single.mVc <-lapply(1:length(All_Combinations), function(j){ # Single Model Results
pinv_Single.mVc<-c(1 / PI_Single.mVc[[j]][idx_Single.mVc[[j]]], pinv.prop)
- stats::glm(y_Single.mVc[[j]]~x_Single.mVc[[j]]-1, family = "poisson",weights=pinv_Single.mVc)
+ stats::glm(y_Single.mVc[[j]]~x_Single.mVc[[j]]-1, family = "quasipoisson",weights=pinv_Single.mVc)
})
fit_MR.mVc <- lapply(1:length(All_Combinations),function(j){
- stats::glm(y_MR.mVc~x_MR.mVc[[j]]-1,family="poisson",weights=pinv_MR.mVc) # Model Robust Results
+ stats::glm(y_MR.mVc~x_MR.mVc[[j]]-1,family="quasipoisson",weights=pinv_MR.mVc) # Model Robust Results
})
for (j in 1:length(All_Combinations))
@@ -268,9 +267,8 @@ modelRobustPoiSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov
V_Final<-lapply(1:length(All_Combinations),function(j){
pi<-c(exp(x_Single.mVc[[j]] %*% beta.mVc_Single[[j]][i,-1]))
pinv_Single.mVc<-c(1 / PI_Single.mVc[[j]][idx_Single.mVc[[j]]], pinv.prop)
- Mx<-solve(t(x_Single.mVc[[j]]) %*% (x_Single.mVc[[j]] * pi * pinv_Single.mVc))
- V_Temp<-t(x_Single.mVc[[j]]) %*% (x_Single.mVc[[j]]*((as.vector(y_Single.mVc[[j]])-pi)*pinv_Single.mVc)^2)
-
+ Mx<-solve(crossprod(x_Single.mVc[[j]], x_Single.mVc[[j]] * pi * pinv_Single.mVc))
+ V_Temp<-crossprod(x_Single.mVc[[j]], x_Single.mVc[[j]]*((as.vector(y_Single.mVc[[j]])-pi)*pinv_Single.mVc)^2)
Mx %*% V_Temp %*% Mx
})
@@ -282,9 +280,8 @@ modelRobustPoiSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov
# Model Robust Results
V_Final<-lapply(1:length(All_Combinations),function(j){
pi<- c(exp(x_MR.mVc[[j]] %*% beta.mVc_MR[[j]][i,-1]))
- Mx<-solve(t(x_MR.mVc[[j]]) %*% (x_MR.mVc[[j]]*pi*pinv_MR.mVc))
- V_Temp<-t(x_MR.mVc[[j]]) %*% (x_MR.mVc[[j]]*((as.vector(y_MR.mVc)-pi)*pinv_MR.mVc)^2)
-
+ Mx<-solve(crossprod(x_MR.mVc[[j]], x_MR.mVc[[j]]*pi*pinv_MR.mVc))
+ V_Temp<-crossprod(x_MR.mVc[[j]], x_MR.mVc[[j]]*((as.vector(y_MR.mVc)-pi)*pinv_MR.mVc)^2)
Mx %*% V_Temp %*% Mx
})
@@ -295,9 +292,9 @@ modelRobustPoiSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov
## mMSE
idx_Single.mMSE <- lapply(1:length(All_Combinations),function(j){
- sample(1:N, r2[i]-r1, T, PI_Single.mMSE[[j]]) # Single Model Results
+ sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI_Single.mMSE[[j]]) # Single Model Results
})
- idx_MR.mMSE <- sample(1:N, r2[i]-r1, T, PI_MR.mMSE) # Model Robust Results
+ idx_MR.mMSE <- sample(1:N, size = r2[i]-r1, replace = TRUE, prob = PI_MR.mMSE) # Model Robust Results
x_Single.mMSE <- lapply(1:length(All_Combinations),function(j){
X[c(idx_Single.mMSE[[j]], idx.prop),All_Covariates %in% All_Combinations[[j]] ] # Single Model Results
@@ -314,11 +311,11 @@ modelRobustPoiSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov
fit_Single.mMSE <- lapply(1:length(All_Combinations),function(j){ # Single Model Results
pinv_Single.mMSE<-c(1 / PI_Single.mMSE[[j]][idx_Single.mMSE[[j]]], pinv.prop)
- stats::glm(y_Single.mMSE[[j]]~x_Single.mMSE[[j]]-1, family = "poisson",weights=pinv_Single.mMSE)
+ stats::glm(y_Single.mMSE[[j]]~x_Single.mMSE[[j]]-1, family = "quasipoisson",weights=pinv_Single.mMSE)
})
fit_MR.mMSE <- lapply(1:length(All_Combinations), function(j){ # Model Robust Results
- stats::glm(y_MR.mMSE~x_MR.mMSE[[j]]-1, family = "poisson",weights=pinv_MR.mMSE)
+ stats::glm(y_MR.mMSE~x_MR.mMSE[[j]]-1, family = "quasipoisson",weights=pinv_MR.mMSE)
})
for (j in 1:length(All_Combinations))
@@ -340,9 +337,8 @@ modelRobustPoiSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov
V_Final<-lapply(1:length(All_Combinations),function(j){
pi<-c(exp(x_Single.mMSE[[j]] %*% beta.mMSE_Single[[j]][i,-1]))
pinv_Single.mMSE<-c(1 / PI_Single.mMSE[[j]][idx_Single.mMSE[[j]]], pinv.prop)
- Mx<-solve(t(x_Single.mMSE[[j]]) %*% (x_Single.mMSE[[j]]*pi*pinv_Single.mMSE))
- V_Temp<-t(x_Single.mMSE[[j]]) %*% (x_Single.mMSE[[j]]*((as.vector(y_Single.mMSE[[j]])-pi)*pinv_Single.mMSE)^2)
-
+ Mx<-solve(crossprod(x_Single.mMSE[[j]], x_Single.mMSE[[j]]*pi*pinv_Single.mMSE))
+ V_Temp<-crossprod(x_Single.mMSE[[j]], x_Single.mMSE[[j]]*((as.vector(y_Single.mMSE[[j]])-pi)*pinv_Single.mMSE)^2)
Mx %*% V_Temp %*% Mx
})
@@ -354,9 +350,8 @@ modelRobustPoiSub <- function(r1,r2,Y,X,N,Apriori_probs,All_Combinations,All_Cov
# Model Robust Results
V_Final<-lapply(1:length(All_Combinations),function(j){
pi<-c(exp(x_MR.mMSE[[j]] %*% beta.mMSE_MR[[j]][i,-1]))
- Mx<-solve(t(x_MR.mMSE[[j]]) %*% (x_MR.mMSE[[j]]*pi*pinv_MR.mMSE))
- V_Temp<-t(x_MR.mMSE[[j]]) %*% (x_MR.mMSE[[j]]*((as.vector(y_MR.mMSE)-pi)*pinv_MR.mMSE)^2)
-
+ Mx<-solve(crossprod(x_MR.mMSE[[j]], x_MR.mMSE[[j]]*pi*pinv_MR.mMSE))
+ V_Temp<-crossprod(x_MR.mMSE[[j]], x_MR.mMSE[[j]]*((as.vector(y_MR.mMSE)-pi)*pinv_MR.mMSE)^2)
Mx %*% V_Temp %*% Mx
})
diff --git a/_pkgdown.yml b/_pkgdown.yml
index 8e664c5..fff3af1 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -10,7 +10,7 @@ template:
primary: "#003366"
development:
- mode: auto
+ mode: release
navbar:
title: NeEDS4BigData
@@ -35,6 +35,10 @@ navbar:
href: articles/Basic_Sampling_Poi_Reg.html
- text: Poisson Regression - Model robust and misspecification
href: articles/Poisson_Regression.html
+ - text: Benchmarking - Model based subsampling
+ href: articles/Benchmarking_model_based.html
+ - text: Benchmarking - Model robust subsampling
+ href: articles/Benchmarking_model_robust.html
- text: Versions
href: news/index.html
right:
diff --git a/inst/WORDLIST b/inst/WORDLIST
index 5f27af9..e847ea8 100644
--- a/inst/WORDLIST
+++ b/inst/WORDLIST
@@ -1,6 +1,5 @@
AMSE
Abhinav
-Apriori
Berard
BigData
CodeFactor
@@ -8,25 +7,21 @@ Fanaee
Hebrail
LmAMSE
MJ
+MVNormal
NeEDS
Poisson
RLmAMSE
Rajen
Sceaux
UCI
-Var
codecov
-criterions
doi
endogeneity
esign
frac
ggplot
ldots
-misspecifications
nonskin
-packag
-packageversion
poisson
softmax
th
diff --git a/man/ALoptimalGLMSub.Rd b/man/ALoptimalGLMSub.Rd
index 9697367..5ce7142 100644
--- a/man/ALoptimalGLMSub.Rd
+++ b/man/ALoptimalGLMSub.Rd
@@ -69,8 +69,7 @@ Full_Data<-GenGLMdata(Dist,Dist_Par,No_Of_Var,Beta,N,Family)
r1<-300; r2<-rep(600,50); Original_Data<-Full_Data$Complete_Data;
-ALoptimalGLMSub(r1 = r1, r2 = r2,
- Y = as.matrix(Original_Data[,colnames(Original_Data) \%in\% c("Y")]),
+ALoptimalGLMSub(r1 = r1, r2 = r2,Y = as.matrix(Original_Data[,1]),
X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data),
family = "linear")->Results
@@ -82,8 +81,7 @@ Full_Data<-GenGLMdata(Dist,Dist_Par,No_Of_Var,Beta,N,Family)
r1<-300; r2<-rep(600,50); Original_Data<-Full_Data$Complete_Data;
-ALoptimalGLMSub(r1 = r1, r2 = r2,
- Y = as.matrix(Original_Data[,colnames(Original_Data) \%in\% c("Y")]),
+ALoptimalGLMSub(r1 = r1, r2 = r2,Y = as.matrix(Original_Data[,1]),
X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data),
family = "logistic")->Results
@@ -95,8 +93,7 @@ Full_Data<-GenGLMdata(Dist,NULL,No_Of_Var,Beta,N,Family)
r1<-300; r2<-rep(600,50); Original_Data<-Full_Data$Complete_Data;
-ALoptimalGLMSub(r1 = r1, r2 = r2,
- Y = as.matrix(Original_Data[,colnames(Original_Data) \%in\% c("Y")]),
+ALoptimalGLMSub(r1 = r1, r2 = r2,Y = as.matrix(Original_Data[,1]),
X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data),
family = "poisson")->Results
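
The example rewrites from `Original_Data[,colnames(Original_Data) %in% c("Y")]` to `Original_Data[,1]` rely on the response being the first column, which `GenGLMdata` arranges. A quick equivalence check on a made-up frame:

```r
Original_Data <- data.frame(Y = rnorm(5), X0 = 1, X1 = rnorm(5))
all.equal(as.matrix(Original_Data[, 1]),
          as.matrix(Original_Data[, colnames(Original_Data) %in% "Y"]))  # TRUE
```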
diff --git a/man/AoptimalGauLMSub.Rd b/man/AoptimalGauLMSub.Rd
index d19259a..822f594 100644
--- a/man/AoptimalGauLMSub.Rd
+++ b/man/AoptimalGauLMSub.Rd
@@ -58,8 +58,7 @@ Full_Data<-GenGLMdata(Dist,Dist_Par,No_Of_Var,Beta,N,Family)
r1<-300; r2<-rep(100*c(6,12),50); Original_Data<-Full_Data$Complete_Data;
-AoptimalGauLMSub(r1 = r1, r2 = r2,
- Y = as.matrix(Original_Data[,colnames(Original_Data) \%in\% c("Y")]),
+AoptimalGauLMSub(r1 = r1, r2 = r2,Y = as.matrix(Original_Data[,1]),
X = as.matrix(Original_Data[,-1]),
N = nrow(Original_Data))->Results
diff --git a/man/AoptimalMCGLMSub.Rd b/man/AoptimalMCGLMSub.Rd
index 5f51f5e..5c1470a 100644
--- a/man/AoptimalMCGLMSub.Rd
+++ b/man/AoptimalMCGLMSub.Rd
@@ -66,8 +66,7 @@ Full_Data<-GenGLMdata(Dist,Dist_Par,No_Of_Var,Beta,N,Family)
r1<-300; r2<-rep(100*c(6,12),50); Original_Data<-Full_Data$Complete_Data;
-AoptimalMCGLMSub(r1 = r1, r2 = r2,
- Y = as.matrix(Original_Data[,colnames(Original_Data) \%in\% c("Y")]),
+AoptimalMCGLMSub(r1 = r1, r2 = r2,Y = as.matrix(Original_Data[,1]),
X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data),
family = "linear")->Results
@@ -79,8 +78,7 @@ Full_Data<-GenGLMdata(Dist,Dist_Par,No_Of_Var,Beta,N,Family)
r1<-300; r2<-rep(100*c(6,12),50); Original_Data<-Full_Data$Complete_Data;
-AoptimalMCGLMSub(r1 = r1, r2 = r2,
- Y = as.matrix(Original_Data[,colnames(Original_Data) \%in\% c("Y")]),
+AoptimalMCGLMSub(r1 = r1, r2 = r2,Y = as.matrix(Original_Data[,1]),
X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data),
family = "logistic")->Results
@@ -92,8 +90,7 @@ Full_Data<-GenGLMdata(Dist,NULL,No_Of_Var,Beta,N,Family)
r1<-300; r2<-rep(100*c(6,12),50); Original_Data<-Full_Data$Complete_Data;
-AoptimalMCGLMSub(r1 = r1, r2 = r2,
- Y = as.matrix(Original_Data[,colnames(Original_Data) \%in\% c("Y")]),
+AoptimalMCGLMSub(r1 = r1, r2 = r2,Y = as.matrix(Original_Data[,1]),
X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data),
family = "poisson")->Results
diff --git a/man/GenGLMdata.Rd b/man/GenGLMdata.Rd
index 4572397..f1f2a08 100644
--- a/man/GenGLMdata.Rd
+++ b/man/GenGLMdata.Rd
@@ -7,7 +7,7 @@
GenGLMdata(Dist,Dist_Par,No_Of_Var,Beta,N,family)
}
\arguments{
-\item{Dist}{a character value for the distribution "Normal" or "Uniform or "Exponential"}
+\item{Dist}{a character value for the distribution "Normal", "MVNormal", "Uniform" or "Exponential"}
\item{Dist_Par}{a list of parameters for the distribution that would generate data for covariate X}
@@ -26,8 +26,8 @@ The output of \code{GenGLMdata} gives a list of
}
\description{
Function to simulate big data under linear, logistic and Poisson regression for sampling.
-Covariate data X is through Normal or Uniform distribution for linear regression.
-Covariate data X is through Exponential or Normal or Uniform distribution for logistic regression.
+Covariate data X is generated through Normal, Multivariate Normal or Uniform distribution for linear regression.
+Covariate data X is generated through Exponential, Normal, Multivariate Normal or Uniform distribution for logistic regression.
Covariate data X is generated through Normal or Uniform distribution for Poisson regression.
}
\details{
@@ -35,19 +35,31 @@ Big data for the Generalised Linear Models are generated by the "linear", "logis
regression types.
We have limited the covariate data generation for
-linear regression through normal and uniform distribution,
-logistic regression through exponential, normal and uniform and
+linear regression through normal, multivariate normal and uniform distribution,
+logistic regression through exponential, normal, multivariate normal and uniform distribution, and
Poisson regression through normal and uniform distribution.
}
\examples{
-Dist<-"Normal"; Dist_Par<-list(Mean=0,Variance=1,Error_Variance=0.5)
-No_Of_Var<-2; Beta<-c(-1,2,1); N<-5000; Family<-"linear"
+No_Of_Var<-2; Beta<-c(-1,2,1); N<-5000;
+
+# Dist<-"Normal"; Dist_Par<-list(Mean=0,Variance=1,Error_Variance=0.5)
+Dist<-"MVNormal";
+Dist_Par<-list(Mean=rep(0,No_Of_Var),Variance=diag(rep(2,No_Of_Var)),Error_Variance=0.5)
+# Dist<-"Uniform"; Dist_Par<-list(Min=0,Max=1)
+
+Family<-"linear"
Results<-GenGLMdata(Dist,Dist_Par,No_Of_Var,Beta,N,Family)
-Dist<-"Normal"; Dist_Par<-list(Mean=0,Variance=1); Family<-"logistic"
+Dist<-"Normal"; Dist_Par<-list(Mean=0,Variance=1);
+# Dist<-"MVNormal"; Dist_Par<-list(Mean=rep(0,No_Of_Var),Variance=diag(rep(2,No_Of_Var)))
+# Dist<-"Exponential"; Dist_Par<-list(Rate=3)
+# Dist<-"Uniform"; Dist_Par<-list(Min=0,Max=1)
+
+Family<-"logistic"
Results<-GenGLMdata(Dist,Dist_Par,No_Of_Var,Beta,N,Family)
-Dist<-"Normal"; Family<-"poisson"
+# Dist<-"Normal";
+Dist<-"Uniform"; Family<-"poisson"
Results<-GenGLMdata(Dist,NULL,No_Of_Var,Beta,N,Family)
}
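
The new "MVNormal" option is what motivates the `mvnfast` dependency added in DESCRIPTION and NAMESPACE; `rmvn()` draws the multivariate normal covariates. A minimal sketch matching the `Dist_Par` shown above:

```r
library(mvnfast)
No_Of_Var <- 2; N <- 5000
X <- rmvn(N, mu = rep(0, No_Of_Var), sigma = diag(rep(2, No_Of_Var)))
dim(X)  # 5000 x 2 normal covariates (independent here; sigma may be any covariance)
```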
diff --git a/man/LCCsampling.Rd b/man/LCCsampling.Rd
index 51e38ee..bd7ce26 100644
--- a/man/LCCsampling.Rd
+++ b/man/LCCsampling.Rd
@@ -58,7 +58,7 @@ Full_Data<-GenGLMdata(Dist,Dist_Par,No_Of_Var,Beta,N,Family)
r1<-300; r2<-rep(100*c(6,9,12),50); Original_Data<-Full_Data$Complete_Data;
-LCCsampling(r1 = r1, r2 = r2, Y = as.matrix(Original_Data[,colnames(Original_Data) \%in\% c("Y")]),
+LCCsampling(r1 = r1, r2 = r2, Y = as.matrix(Original_Data[,1]),
X = as.matrix(Original_Data[,-1]),
N = nrow(Original_Data))->Results
diff --git a/man/LeverageSampling.Rd b/man/LeverageSampling.Rd
index cecaff1..03c3ac9 100644
--- a/man/LeverageSampling.Rd
+++ b/man/LeverageSampling.Rd
@@ -67,7 +67,7 @@ Full_Data<-GenGLMdata(Dist,Dist_Par,No_Of_Var,Beta,N,Family)
r<-rep(100*c(6,10),50); Original_Data<-Full_Data$Complete_Data;
-LeverageSampling(r = r, Y = as.matrix(Original_Data[,colnames(Original_Data) \%in\% c("Y")]),
+LeverageSampling(r = r, Y = as.matrix(Original_Data[,1]),
X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data),
S_alpha = 0.95,
family = "linear")->Results
@@ -80,7 +80,7 @@ Full_Data<-GenGLMdata(Dist,Dist_Par,No_Of_Var,Beta,N,Family)
r<-rep(100*c(6,10),50); Original_Data<-Full_Data$Complete_Data;
-LeverageSampling(r = r, Y = as.matrix(Original_Data[,colnames(Original_Data) \%in\% c("Y")]),
+LeverageSampling(r = r, Y = as.matrix(Original_Data[,1]),
X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data),
S_alpha = 0.95,
family = "logistic")->Results
@@ -93,7 +93,7 @@ Full_Data<-GenGLMdata(Dist,NULL,No_Of_Var,Beta,N,Family)
r<-rep(100*c(6,10),50); Original_Data<-Full_Data$Complete_Data;
-LeverageSampling(r = r, Y = as.matrix(Original_Data[,colnames(Original_Data) \%in\% c("Y")]),
+LeverageSampling(r = r, Y = as.matrix(Original_Data[,1]),
X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data),
S_alpha = 0.95,
family = "poisson")->Results
diff --git a/man/modelRobustLinSub.Rd b/man/modelRobustLinSub.Rd
index ed5d132..8030587 100644
--- a/man/modelRobustLinSub.Rd
+++ b/man/modelRobustLinSub.Rd
@@ -73,7 +73,7 @@ where \eqn{q=1,\ldots,Q} and \eqn{Q} is the number of models in the model set.
indexes<-1:ceiling(nrow(Electric_consumption)*0.005)
Original_Data<-cbind(Electric_consumption[indexes,1],1,
Electric_consumption[indexes,-1])
- colnames(Original_Data)<-c("Y",paste0("X",0:ncol(Original_Data[,-c(1,2)])))
+colnames(Original_Data)<-c("Y",paste0("X",0:ncol(Original_Data[,-c(1,2)])))
for (j in 3:5) {
Original_Data[,j]<-scale(Original_Data[,j])
}
@@ -100,8 +100,7 @@ names(All_Models)<-paste0("Model_",1:length(All_Models))
r1<-300; r2<-rep(100*c(6,12),25);
-modelRobustLinSub(r1 = r1, r2 = r2,
- Y = as.matrix(Original_Data[,colnames(Original_Data) \%in\% c("Y")]),
+modelRobustLinSub(r1 = r1, r2 = r2, Y = as.matrix(Original_Data[,1]),
X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data),
Apriori_probs = rep(1/length(All_Models),length(All_Models)),
All_Combinations = All_Models,
diff --git a/man/modelRobustLogSub.Rd b/man/modelRobustLogSub.Rd
index 22a216b..aaf6564 100644
--- a/man/modelRobustLogSub.Rd
+++ b/man/modelRobustLogSub.Rd
@@ -100,8 +100,7 @@ names(All_Models)<-paste0("Model_",1:length(All_Models))
r1<-300; r2<-rep(100*c(6,12),25);
-modelRobustLogSub(r1 = r1, r2 = r2,
- Y = as.matrix(Original_Data[,colnames(Original_Data) \%in\% c("Y")]),
+modelRobustLogSub(r1 = r1, r2 = r2, Y = as.matrix(Original_Data[,1]),
X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data),
Apriori_probs = rep(1/length(All_Models),length(All_Models)),
All_Combinations = All_Models,
diff --git a/man/modelRobustPoiSub.Rd b/man/modelRobustPoiSub.Rd
index 89763b2..5ea1535 100644
--- a/man/modelRobustPoiSub.Rd
+++ b/man/modelRobustPoiSub.Rd
@@ -102,8 +102,7 @@ names(All_Models)<-paste0("Model_",1:length(All_Models))
r1<-300; r2<-rep(100*c(6,12),25);
-modelRobustPoiSub(r1 = r1, r2 = r2,
- Y = as.matrix(Original_Data[,colnames(Original_Data) \%in\% c("Y")]),
+modelRobustPoiSub(r1 = r1, r2 = r2, Y = as.matrix(Original_Data[,1]),
X = as.matrix(Original_Data[,-1]),N = nrow(Original_Data),
Apriori_probs = rep(1/length(All_Models),length(All_Models)),
All_Combinations = All_Models,
diff --git a/renv.lock b/renv.lock
index aa810b4..a06ab8e 100644
--- a/renv.lock
+++ b/renv.lock
@@ -9,6 +9,13 @@
]
},
"Packages": {
+ "BH": {
+ "Package": "BH",
+ "Version": "1.87.0-1",
+ "Source": "Repository",
+ "Repository": "CRAN",
+ "Hash": "468d9a03ba57f22ebde50060fd13ba9f"
+ },
"GPArotation": {
"Package": "GPArotation",
"Version": "2024.3-1",
@@ -22,9 +29,9 @@
},
"MASS": {
"Package": "MASS",
- "Version": "7.3-61",
+ "Version": "7.3-64",
"Source": "Repository",
- "Repository": "CRAN",
+ "Repository": "RSPM",
"Requirements": [
"R",
"grDevices",
@@ -33,11 +40,11 @@
"stats",
"utils"
],
- "Hash": "0cafd6f0500e5deba33be22c46bf6055"
+ "Hash": "49d2d8090b74c1179df1aff16201caf8"
},
"Matrix": {
"Package": "Matrix",
- "Version": "1.7-0",
+ "Version": "1.7-1",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
@@ -50,7 +57,7 @@
"stats",
"utils"
],
- "Hash": "1920b2f11133b12350024297d8a4ff4a"
+ "Hash": "5122bb14d8736372411f955e1b16bc8a"
},
"R6": {
"Package": "R6",
@@ -74,20 +81,20 @@
},
"Rcpp": {
"Package": "Rcpp",
- "Version": "1.0.12",
+ "Version": "1.0.13-1",
"Source": "Repository",
- "Repository": "CRAN",
+ "Repository": "RSPM",
"Requirements": [
"methods",
"utils"
],
- "Hash": "5ea2700d21e038ace58269ecdbeb9ec0"
+ "Hash": "6b868847b365672d6c1677b1608da9ed"
},
"RcppArmadillo": {
"Package": "RcppArmadillo",
- "Version": "0.12.8.4.0",
+ "Version": "14.2.2-1",
"Source": "Repository",
- "Repository": "CRAN",
+ "Repository": "RSPM",
"Requirements": [
"R",
"Rcpp",
@@ -95,7 +102,7 @@
"stats",
"utils"
],
- "Hash": "a535dd90aa8ec9a2e63a8f79b4a1bf43"
+ "Hash": "9da7c242d94a8419d045f6b3a64b9765"
},
"RcppGSL": {
"Package": "RcppGSL",
@@ -110,13 +117,13 @@
},
"RcppParallel": {
"Package": "RcppParallel",
- "Version": "5.1.7",
+ "Version": "5.1.9",
"Source": "Repository",
- "Repository": "CRAN",
+ "Repository": "RSPM",
"Requirements": [
"R"
],
- "Hash": "a45594a00f5dbb073d5ec9f48592a08a"
+ "Hash": "f38a72a419b91faac0ce5d9eee04c120"
},
"RcppZiggurat": {
"Package": "RcppZiggurat",
@@ -136,9 +143,9 @@
},
"Rdpack": {
"Package": "Rdpack",
- "Version": "2.6.1",
+ "Version": "2.6",
"Source": "Repository",
- "Repository": "CRAN",
+ "Repository": "RSPM",
"Requirements": [
"R",
"methods",
@@ -146,13 +153,13 @@
"tools",
"utils"
],
- "Hash": "24a964d2cf75ad25d7b843856c8d4c93"
+ "Hash": "3e1384ada5d3948b392e98b11434d972"
},
"Rfast": {
"Package": "Rfast",
- "Version": "2.1.0",
+ "Version": "2.1.3",
"Source": "Repository",
- "Repository": "CRAN",
+ "Repository": "RSPM",
"Requirements": [
"R",
"Rcpp",
@@ -160,18 +167,18 @@
"RcppParallel",
"RcppZiggurat"
],
- "Hash": "79e8394e1fa06a4ae954b70ca9b16e8f"
+ "Hash": "994cb1d45f644b5fdf59df4a2dfb57ed"
},
"cli": {
"Package": "cli",
- "Version": "3.6.2",
+ "Version": "3.6.3",
"Source": "Repository",
- "Repository": "CRAN",
+ "Repository": "RSPM",
"Requirements": [
"R",
"utils"
],
- "Hash": "1216ac65ac55ec0058a6f75d7ca0fd52"
+ "Hash": "b21916dd77a27642b447374a5d30ecf3"
},
"codetools": {
"Package": "codetools",
@@ -185,9 +192,9 @@
},
"colorspace": {
"Package": "colorspace",
- "Version": "2.1-0",
+ "Version": "2.1-1",
"Source": "Repository",
- "Repository": "CRAN",
+ "Repository": "RSPM",
"Requirements": [
"R",
"grDevices",
@@ -195,17 +202,17 @@
"methods",
"stats"
],
- "Hash": "f20c47fd52fae58b4e377c37bb8c335b"
+ "Hash": "d954cb1c57e8d8b756165d7ba18aa55a"
},
"cpp11": {
"Package": "cpp11",
- "Version": "0.4.7",
+ "Version": "0.5.1",
"Source": "Repository",
- "Repository": "CRAN",
+ "Repository": "RSPM",
"Requirements": [
"R"
],
- "Hash": "5a295d7d963cc5035284dcdbaf334f4e"
+ "Hash": "9df43854f1c84685d095ed6270b52387"
},
"dplyr": {
"Package": "dplyr",
@@ -264,7 +271,7 @@
},
"gam": {
"Package": "gam",
- "Version": "1.22-4",
+ "Version": "1.22-5",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
@@ -274,7 +281,7 @@
"splines",
"stats"
],
- "Hash": "1d00ef061ca18a5d4dc4483bb75599cb"
+ "Hash": "453698a6f7535fbb77841e8015b5ee57"
},
"generics": {
"Package": "generics",
@@ -289,7 +296,7 @@
},
"ggh4x": {
"Package": "ggh4x",
- "Version": "0.2.8",
+ "Version": "0.3.0",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
@@ -303,7 +310,7 @@
"stats",
"vctrs"
],
- "Hash": "012d7705b4cdc7b5ee89895aeefc1d45"
+ "Hash": "8e0e5e8db2bda5c3bdc48268afa5afea"
},
"ggplot2": {
"Package": "ggplot2",
@@ -330,20 +337,34 @@
],
"Hash": "44c6a2f8202d5b7e878ea274b1092426"
},
+ "ggridges": {
+ "Package": "ggridges",
+ "Version": "0.5.6",
+ "Source": "Repository",
+ "Repository": "CRAN",
+ "Requirements": [
+ "R",
+ "ggplot2",
+ "grid",
+ "scales",
+ "withr"
+ ],
+ "Hash": "66488692cb8621bc78df1b9b819497a6"
+ },
"glue": {
"Package": "glue",
- "Version": "1.7.0",
+ "Version": "1.8.0",
"Source": "Repository",
- "Repository": "CRAN",
+ "Repository": "RSPM",
"Requirements": [
"R",
"methods"
],
- "Hash": "e0b3a53876554bd45879e596cdb10a52"
+ "Hash": "5899f1eaa825580172bb56c08266f37c"
},
"gtable": {
"Package": "gtable",
- "Version": "0.3.5",
+ "Version": "0.3.6",
"Source": "Repository",
"Repository": "RSPM",
"Requirements": [
@@ -352,9 +373,10 @@
"glue",
"grid",
"lifecycle",
- "rlang"
+ "rlang",
+ "stats"
],
- "Hash": "e18861963cbc65a27736e02b3cd3c4a0"
+ "Hash": "de949855009e2d4d0e52a844e30617ae"
},
"isoband": {
"Package": "isoband",
@@ -429,13 +451,13 @@
},
"matrixStats": {
"Package": "matrixStats",
- "Version": "1.3.0",
+ "Version": "1.4.1",
"Source": "Repository",
- "Repository": "RSPM",
+ "Repository": "CRAN",
"Requirements": [
"R"
],
- "Hash": "4b3ea27a19d669c0405b38134d89a9d1"
+ "Hash": "8885ffb1f46e820dede6b2ca9442abca"
},
"mgcv": {
"Package": "mgcv",
@@ -475,11 +497,23 @@
],
"Hash": "4fd8900853b746af55b81fda99da7695"
},
+ "mvnfast": {
+ "Package": "mvnfast",
+ "Version": "0.2.8",
+ "Source": "Repository",
+ "Repository": "CRAN",
+ "Requirements": [
+ "BH",
+ "Rcpp",
+ "RcppArmadillo"
+ ],
+ "Hash": "e65cac8e8501bdfbdca0412c37bb18c9"
+ },
"nlme": {
"Package": "nlme",
- "Version": "3.1-165",
+ "Version": "3.1-166",
"Source": "Repository",
- "Repository": "CRAN",
+ "Repository": "RSPM",
"Requirements": [
"R",
"graphics",
@@ -487,16 +521,15 @@
"stats",
"utils"
],
- "Hash": "2769a88be217841b1f33ed469675c3cc"
+ "Hash": "ccbb8846be320b627e6aa2b4616a2ded"
},
"pillar": {
"Package": "pillar",
- "Version": "1.9.0",
+ "Version": "1.10.0",
"Source": "Repository",
- "Repository": "CRAN",
+ "Repository": "RSPM",
"Requirements": [
"cli",
- "fansi",
"glue",
"lifecycle",
"rlang",
@@ -504,7 +537,7 @@
"utils",
"vctrs"
],
- "Hash": "15da5a8412f317beeee6175fbc76f4bb"
+ "Hash": "101ca350beea21261a15ba169d7a8513"
},
"pkgconfig": {
"Package": "pkgconfig",
@@ -518,7 +551,7 @@
},
"psych": {
"Package": "psych",
- "Version": "2.4.6.26",
+ "Version": "2.4.12",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
@@ -532,7 +565,7 @@
"parallel",
"stats"
],
- "Hash": "4448d5f3ac3e2cbf79074391d494637e"
+ "Hash": "b6659cfdaf2545e88959f00bdb0a0951"
},
"purrr": {
"Package": "purrr",
@@ -551,7 +584,7 @@
},
"rbibutils": {
"Package": "rbibutils",
- "Version": "2.2.16",
+ "Version": "2.3",
"Source": "Repository",
"Repository": "RSPM",
"Requirements": [
@@ -559,17 +592,17 @@
"tools",
"utils"
],
- "Hash": "8c06968e0a5b0209c5f34239b1302336"
+ "Hash": "dfc034a172fd88fc66b1a703894c4185"
},
"renv": {
"Package": "renv",
- "Version": "1.0.7",
+ "Version": "1.0.11",
"Source": "Repository",
- "Repository": "CRAN",
+ "Repository": "RSPM",
"Requirements": [
"utils"
],
- "Hash": "397b7b2a265bc5a7a06852524dabae20"
+ "Hash": "47623f66b4e80b3b0587bc5d7b309888"
},
"rlang": {
"Package": "rlang",
@@ -726,15 +759,15 @@
},
"withr": {
"Package": "withr",
- "Version": "3.0.0",
+ "Version": "3.0.2",
"Source": "Repository",
- "Repository": "CRAN",
+ "Repository": "RSPM",
"Requirements": [
"R",
"grDevices",
"graphics"
],
- "Hash": "d31b6c62c10dcf11ec530ca6b0dd5d35"
+ "Hash": "cc2d62c76458d425210d1eb1478b30b4"
}
}
}
diff --git a/renv/activate.R b/renv/activate.R
index d13f993..0eb5108 100644
--- a/renv/activate.R
+++ b/renv/activate.R
@@ -2,7 +2,7 @@
local({
# the requested version of renv
- version <- "1.0.7"
+ version <- "1.0.11"
attr(version, "sha") <- NULL
# the project directory
@@ -98,6 +98,66 @@ local({
unloadNamespace("renv")
# load bootstrap tools
+ ansify <- function(text) {
+ if (renv_ansify_enabled())
+ renv_ansify_enhanced(text)
+ else
+ renv_ansify_default(text)
+ }
+
+ renv_ansify_enabled <- function() {
+
+ override <- Sys.getenv("RENV_ANSIFY_ENABLED", unset = NA)
+ if (!is.na(override))
+ return(as.logical(override))
+
+ pane <- Sys.getenv("RSTUDIO_CHILD_PROCESS_PANE", unset = NA)
+ if (identical(pane, "build"))
+ return(FALSE)
+
+ testthat <- Sys.getenv("TESTTHAT", unset = "false")
+ if (tolower(testthat) %in% "true")
+ return(FALSE)
+
+ iderun <- Sys.getenv("R_CLI_HAS_HYPERLINK_IDE_RUN", unset = "false")
+ if (tolower(iderun) %in% "false")
+ return(FALSE)
+
+ TRUE
+
+ }
+
+ renv_ansify_default <- function(text) {
+ text
+ }
+
+ renv_ansify_enhanced <- function(text) {
+
+ # R help links
+ pattern <- "`\\?(renv::(?:[^`])+)`"
+ replacement <- "`\033]8;;ide:help:\\1\a?\\1\033]8;;\a`"
+ text <- gsub(pattern, replacement, text, perl = TRUE)
+
+ # runnable code
+ pattern <- "`(renv::(?:[^`])+)`"
+ replacement <- "`\033]8;;ide:run:\\1\a\\1\033]8;;\a`"
+ text <- gsub(pattern, replacement, text, perl = TRUE)
+
+ # return ansified text
+ text
+
+ }
+
+ renv_ansify_init <- function() {
+
+ envir <- renv_envir_self()
+ if (renv_ansify_enabled())
+ assign("ansify", renv_ansify_enhanced, envir = envir)
+ else
+ assign("ansify", renv_ansify_default, envir = envir)
+
+ }
+
`%||%` <- function(x, y) {
if (is.null(x)) y else x
}
@@ -142,7 +202,10 @@ local({
# compute common indent
indent <- regexpr("[^[:space:]]", lines)
common <- min(setdiff(indent, -1L)) - leave
- paste(substring(lines, common), collapse = "\n")
+ text <- paste(substring(lines, common), collapse = "\n")
+
+ # substitute in ANSI links for executable renv code
+ ansify(text)
}
@@ -305,8 +368,11 @@ local({
quiet = TRUE
)
- if ("headers" %in% names(formals(utils::download.file)))
- args$headers <- renv_bootstrap_download_custom_headers(url)
+ if ("headers" %in% names(formals(utils::download.file))) {
+ headers <- renv_bootstrap_download_custom_headers(url)
+ if (length(headers) && is.character(headers))
+ args$headers <- headers
+ }
do.call(utils::download.file, args)
@@ -385,10 +451,21 @@ local({
for (type in types) {
for (repos in renv_bootstrap_repos()) {
+ # build arguments for utils::available.packages() call
+ args <- list(type = type, repos = repos)
+
+ # add custom headers if available -- note that
+ # utils::available.packages() will pass this to download.file()
+ if ("headers" %in% names(formals(utils::download.file))) {
+ headers <- renv_bootstrap_download_custom_headers(repos)
+ if (length(headers) && is.character(headers))
+ args$headers <- headers
+ }
+
# retrieve package database
db <- tryCatch(
as.data.frame(
- utils::available.packages(type = type, repos = repos),
+ do.call(utils::available.packages, args),
stringsAsFactors = FALSE
),
error = identity
@@ -470,6 +547,14 @@ local({
}
+ renv_bootstrap_github_token <- function() {
+ for (envvar in c("GITHUB_TOKEN", "GITHUB_PAT", "GH_TOKEN")) {
+ envval <- Sys.getenv(envvar, unset = NA)
+ if (!is.na(envval))
+ return(envval)
+ }
+ }
+
renv_bootstrap_download_github <- function(version) {
enabled <- Sys.getenv("RENV_BOOTSTRAP_FROM_GITHUB", unset = "TRUE")
@@ -477,16 +562,16 @@ local({
return(FALSE)
# prepare download options
- pat <- Sys.getenv("GITHUB_PAT")
- if (nzchar(Sys.which("curl")) && nzchar(pat)) {
+ token <- renv_bootstrap_github_token()
+ if (nzchar(Sys.which("curl")) && nzchar(token)) {
fmt <- "--location --fail --header \"Authorization: token %s\""
- extra <- sprintf(fmt, pat)
+ extra <- sprintf(fmt, token)
saved <- options("download.file.method", "download.file.extra")
options(download.file.method = "curl", download.file.extra = extra)
on.exit(do.call(base::options, saved), add = TRUE)
- } else if (nzchar(Sys.which("wget")) && nzchar(pat)) {
+ } else if (nzchar(Sys.which("wget")) && nzchar(token)) {
fmt <- "--header=\"Authorization: token %s\""
- extra <- sprintf(fmt, pat)
+ extra <- sprintf(fmt, token)
saved <- options("download.file.method", "download.file.extra")
options(download.file.method = "wget", download.file.extra = extra)
on.exit(do.call(base::options, saved), add = TRUE)
diff --git a/tests/testthat/test-GenGLMdata.R b/tests/testthat/test-GenGLMdata.R
index b7f1367..8b1fe53 100644
--- a/tests/testthat/test-GenGLMdata.R
+++ b/tests/testthat/test-GenGLMdata.R
@@ -71,13 +71,13 @@ test_that("Error on Results output for Family",{
Family<-"linear"; Dist<-"Exp"; Beta<-c(-1,2,1)
test_that("Error on Results output for linear Dist",{
expect_error(GenGLMdata(Dist,Dist_Par,No_Of_Var,Beta,N,Family),
- "For linear regression select the distribution 'Normal' \n or 'Uniform' to generate the covarate data")
+ "For linear regression select the distribution 'Normal', 'MVNormal' \n or 'Uniform' to generate the covarate data")
})
Family<-"logistic"; Dist<-"Exp"
test_that("Error on Results output for logistic Dist",{
expect_error(GenGLMdata(Dist,Dist_Par,No_Of_Var,Beta,N,Family),
- "For logistic regression select the distribution 'Exponential', \n 'Normal' or 'Uniform' to generate the covarate data")
+ "For logistic regression select the distribution 'Exponential', \n 'Normal', 'MVNormal' or 'Uniform' to generate the covarate data")
})
Family<-"poisson"; Dist<-"Exp"
diff --git a/vignettes/Basic_Sampling_Lin_Reg.Rmd b/vignettes/Basic_Sampling_Lin_Reg.Rmd
index 0f703fb..7280327 100644
--- a/vignettes/Basic_Sampling_Lin_Reg.Rmd
+++ b/vignettes/Basic_Sampling_Lin_Reg.Rmd
@@ -45,14 +45,14 @@ Theme_special<-function(){
# Understanding the electric consumption data
``Electric power consumption'' data [@hebrailindividual], which contains $2,049,280$ measurements for a house located at Sceaux, France between December 2006 and November 2010.
-The data contains $4$ columns, the first column is the response variable and the rest are covariates, however we only use the first $5\%$ of the data for this demonstration.
+The data contains $4$ columns: the first column is the response variable and the rest are covariates; however, we only use the first $25\%$ of the data for this demonstration.
The response $y$ is the log scaled intensity, while the covariates are active electrical energy in the a) kitchen ($X_1$), b) laundry room ($X_2$) and c) water-heater and air-conditioner ($X_3$).
The covariates are scaled to be mean of zero and variance of one.
For the given data subsampling methods are implemented assuming the main effects model can describe the data.
```{r Exploring data}
-# Selecting 5% of the big data and prepare it
-indexes<-1:ceiling(nrow(Electric_consumption)*0.05)
+# Selecting 25% of the big data and prepare it
+indexes<-1:ceiling(nrow(Electric_consumption)*0.25)
Original_Data<-cbind(Electric_consumption[indexes,1],1,Electric_consumption[indexes,-1])
colnames(Original_Data)<-c("Y",paste0("X",0:ncol(Original_Data[,-c(1,2)])))
@@ -65,7 +65,7 @@ head(Original_Data) %>%
caption = "First few observations of the electric consumption data.")
# Setting the sample sizes
-N<-nrow(Original_Data); M<-250; k<-seq(6,18,by=3)*100; rep_k<-rep(k,each=M)
+N<-nrow(Original_Data); M<-250; k<-seq(8,20,by=2)*100; rep_k<-rep(k,each=M)
```
Based on this model the methods random sampling, leverage sampling [@ma2014statistical;@ma2015leveraging], $A$-optimality subsampling with Gaussian Linear Model [@lee2022sampling], $A$- and $L$-optimality subsampling [@ai2021optimal;@yao2021review] and $A$-optimality subsampling with response constraint [@zhang2021optimal] are implemented on the big data.
@@ -80,8 +80,8 @@ Method_Colour<-c("#BBFFBB","#50FF50","#00BB00","#008600",
Method_Shape_Types<-c(rep(17,4),rep(4,3),16)
```
-The obtained samples and their respective model parameter estimates are over $M=100$ simulations across different sample sizes $k=(600,\ldots,1500)$.
-We set the initial sample size of $r1=300$ for the methods that requires a random sample.
+The obtained samples and their respective model parameter estimates are summarised over $M=250$ simulations across different sample sizes $k=(800,\ldots,2000)$.
+We set an initial sample size of $r1=400$ for the methods that require a random sample.
From the final samples, model parameters of the assumed model are estimated.
These estimated model parameters are compared with the estimated model parameters of the full big data through the mean squared error $MSE(\tilde{\beta}_k,\hat{\beta})=\frac{1}{MJ}\sum_{i=1}^M \sum_{j=1}^J(\tilde{\beta}_{k,j} - \hat{\beta}_j)^2$.
Here, $\tilde{\beta}_k$ is the estimated model parameters from the sample of size $k$ and $\hat{\beta}$ is the estimated model parameters from the full big data, while $j$ is index of the model parameter.
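
A minimal sketch of this summary with made-up numbers: `Beta_tilde` holds the $M$ subsample estimates of the $J$ parameters for one sample size $k$, and the MSE averages squared deviations from the full-data estimate over both dimensions.

```r
M <- 250; J <- 4
beta_hat <- c(-1, 2, 1, 0.5)  # hypothetical full-data estimate
Beta_tilde <- matrix(rnorm(M * J, mean = rep(beta_hat, each = M), sd = 0.1),
                     nrow = M)  # M x J subsample estimates (column j ~ beta_hat[j])
MSE_k <- mean(sweep(Beta_tilde, 2, beta_hat)^2)  # (1/(MJ)) * sum of squared errors
```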
@@ -119,7 +119,7 @@ colnames(Final_Beta_LS)<-c("Method","Sample",paste0("Beta",0:3))
```{r A-optimality subsampling for GauLin Model}
# A-optimality subsampling for Gaussian Linear Model
-NeEDS4BigData::AoptimalGauLMSub(r1=300,r2=rep_k,
+NeEDS4BigData::AoptimalGauLMSub(r1=400,r2=rep_k,
Y=as.matrix(Original_Data[,1]),
X=as.matrix(Original_Data[,-1]),N=N)->Results
Final_Beta_AoptGauLM<-Results$Beta_Estimates
@@ -131,7 +131,7 @@ colnames(Final_Beta_AoptGauLM)<-c("Method","Sample",paste0("Beta",0:3))
```{r A- and L- optimaliy subsampling}
# A- and L-optimality subsampling for GLM
-NeEDS4BigData::ALoptimalGLMSub(r1=300,r2=rep_k,
+NeEDS4BigData::ALoptimalGLMSub(r1=400,r2=rep_k,
Y=as.matrix(Original_Data[,1]),
X=as.matrix(Original_Data[,-1]),
N=N,family = "linear")->Results
@@ -143,7 +143,7 @@ colnames(Final_Beta_ALoptGLM)<-c("Method","Sample",paste0("Beta",0:3))
```{r A-optimality subsampling with measurement constraints}
# A-optimality subsampling for without response
-NeEDS4BigData::AoptimalMCGLMSub(r1=300,r2=rep_k,
+NeEDS4BigData::AoptimalMCGLMSub(r1=400,r2=rep_k,
Y=as.matrix(Original_Data[,1]),
X=as.matrix(Original_Data[,-1]),
N=N,family="linear")->Results
diff --git a/vignettes/Basic_Sampling_Log_Reg.Rmd b/vignettes/Basic_Sampling_Log_Reg.Rmd
index 4b89e51..62e5330 100644
--- a/vignettes/Basic_Sampling_Log_Reg.Rmd
+++ b/vignettes/Basic_Sampling_Log_Reg.Rmd
@@ -66,7 +66,7 @@ head(Original_Data) %>%
caption = "First few observations of the skin segmentation data.")
# Setting the sample sizes
-N<-nrow(Original_Data); M<-300; k<-seq(9,24,by=3)*100; rep_k<-rep(k,each=M)
+N<-nrow(Original_Data); M<-250; k<-seq(8,20,by=2)*100; rep_k<-rep(k,each=M)
```
Based on this model the methods random sampling, leverage sampling [@ma2014statistical;@ma2015leveraging], local case control sampling [@fithian2015local], $A$- and $L$-optimality subsampling [@ai2021optimal;@yao2021review] and $A$-optimality subsampling with response constraint [@zhang2021optimal] are implemented on the big data.
@@ -81,8 +81,8 @@ Method_Colour<-c("#BBFFBB","#50FF50","#00BB00",
Method_Shape_Types<-c(rep(17,3),rep(4,4),16)
```
-The obtained samples and their respective model parameter estimates are over $M=100$ simulations across different sample sizes $k=(600,\ldots,1500)$.
-We set the initial sample size of $r1=300$ for the methods that requires a random sample.
+The samples and their respective model parameter estimates are obtained over $M=250$ simulations across the sample sizes $k=(800,\ldots,2000)$.
+We set the initial sample size to $r1=400$ for the methods that require an initial random sample.
From the final samples, the parameters of the assumed model are estimated.
These estimates are compared with the model parameters estimated from the full big data through the mean squared error $MSE(\tilde{\beta}_k,\hat{\beta})=\frac{1}{MJ}\sum_{i=1}^M \sum_{j=1}^J(\tilde{\beta}_{k,j} - \hat{\beta}_j)^2$.
Here, $\tilde{\beta}_k$ denotes the model parameters estimated from the sample of size $k$, $\hat{\beta}$ denotes the model parameters estimated from the full big data, and $j$ indexes the model parameters.
@@ -120,7 +120,7 @@ colnames(Final_Beta_LS)<-c("Method","Sample",paste0("Beta",0:3))
```{r local case control sampling}
# Local case control sampling
-NeEDS4BigData::LCCsampling(r1=300,r2=rep_k,
+NeEDS4BigData::LCCsampling(r1=400,r2=rep_k,
Y=as.matrix(Original_Data[,1]),
X=as.matrix(Original_Data[,-1]),
N=N)->Results
@@ -133,7 +133,7 @@ colnames(Final_Beta_LCCS)<-c("Method","Sample",paste0("Beta",0:3))
```{r A- and L-optimality subsampling}
# A- and L-optimality subsampling for GLM
-NeEDS4BigData::ALoptimalGLMSub(r1=300,r2=rep_k,
+NeEDS4BigData::ALoptimalGLMSub(r1=400,r2=rep_k,
Y=as.matrix(Original_Data[,1]),
X=as.matrix(Original_Data[,-1]),
N=N,family = "logistic")->Results
@@ -145,7 +145,7 @@ colnames(Final_Beta_ALoptGLM)<-c("Method","Sample",paste0("Beta",0:3))
```{r A-optimality subsampling under measurement constraints}
# A-optimality subsampling without response
-NeEDS4BigData::AoptimalMCGLMSub(r1=300,r2=rep_k,
+NeEDS4BigData::AoptimalMCGLMSub(r1=400,r2=rep_k,
Y=as.matrix(Original_Data[,1]),
X=as.matrix(Original_Data[,-1]),
N=N,family="logistic")->Results
@@ -173,7 +173,7 @@ remove(Final_Beta_RS,Final_Beta_LS,Final_Beta_LCCS,Final_Beta_ALoptGLM,
The mean squared error is plotted below.
-```{r Model parameter estimates,fig.width=7,fig.height=5,fig.align='center',fig.cap="Mean squared error for all the subsampling methods with 5% and 95% percentile intervals."}
+```{r Model parameter estimates,fig.width=7,fig.height=10,fig.align='center',fig.cap="Mean squared error for a) all the subsampling methods and b) all methods except leverage and local case control sampling, with 5% and 95% percentile intervals."}
# Obtain the mean squared error for the model parameter estimates
MSE_Beta<-data.frame("Method"=Final_Beta$Method,
"Sample"=Final_Beta$Sample,
@@ -198,7 +198,27 @@ ggplot(data=Mean_Data,aes(x=factor(Sample),y=Mean,color=Method,shape=Method))+
scale_color_manual(values = Method_Colour)+
scale_shape_manual(values = Method_Shape_Types)+
theme_bw()+Theme_special()+
- guides(colour = guide_legend(nrow = 3))
+ guides(colour = guide_legend(nrow = 3))->p1
+
+Left_out<-c(paste0(c("Basic ","Shrinkage ","Unweighted "),"Leverage"),
+ "Local case control sampling")
+# Plot of the mean squared error excluding leverage and local case control sampling
+ggplot(data=Mean_Data[!(Mean_Data$Method %in% Left_out),],
+ aes(x=factor(Sample),y=Mean,color=Method,shape=Method))+
+ xlab("Sample Size")+ylab("Squared Error")+
+ geom_point(size=3,position = position_dodge(width = 0.5))+
+ geom_line(data=Mean_Data[!(Mean_Data$Method %in% Left_out),],
+ aes(x=factor(Sample),y=Mean,group=Method,color=Method),
+ position = position_dodge(width = 0.5))+
+ geom_errorbar(data=Mean_Data[!(Mean_Data$Method %in% Left_out),],
+ aes(ymin=min,ymax=max),
+ width=0.3,position = position_dodge(width = 0.5))+
+ scale_color_manual(values = Method_Colour[-c(4:7)])+
+ scale_shape_manual(values = Method_Shape_Types[-c(4:7)])+
+ theme_bw()+Theme_special()+
+ guides(colour = guide_legend(nrow = 2))->p2
+
+ggarrange(p1,p2,nrow = 2,ncol = 1,labels = "auto")
```
## References
diff --git a/vignettes/Basic_Sampling_Poi_Reg.Rmd b/vignettes/Basic_Sampling_Poi_Reg.Rmd
index 6d21bf7..1d6dde9 100644
--- a/vignettes/Basic_Sampling_Poi_Reg.Rmd
+++ b/vignettes/Basic_Sampling_Poi_Reg.Rmd
@@ -64,7 +64,7 @@ head(Original_Data) %>%
caption = "First few observations of the bike sharing data.")
# Setting the sample sizes
-N<-nrow(Original_Data); M<-250; k<-seq(6,18,by=3)*100; rep_k<-rep(k,each=M)
+N<-nrow(Original_Data); M<-250; k<-seq(8,20,by=2)*100; rep_k<-rep(k,each=M)
```
Based on this model, the methods random sampling, leverage sampling [@ma2014statistical;@ma2015leveraging], $A$- and $L$-optimality subsampling [@ai2021optimal;@yao2021review] and $A$-optimality subsampling with response constraint [@zhang2021optimal] are applied to the big data.
@@ -79,8 +79,8 @@ Method_Colour<-c("#BBFFBB","#50FF50","#00BB00",
Method_Shape_Types<-c(rep(17,3),rep(4,3),16)
```
-The obtained samples and their respective model parameter estimates are over $M=100$ simulations across different sample sizes $k=(600,\ldots,1500)$.
-We set the initial sample size of $r1=300$ for the methods that requires a random sample.
+The samples and their respective model parameter estimates are obtained over $M=250$ simulations across the sample sizes $k=(800,\ldots,2000)$.
+We set the initial sample size to $r1=400$ for the methods that require an initial random sample.
From the final samples, the parameters of the assumed model are estimated.
These estimates are compared with the model parameters estimated from the full big data through the mean squared error $MSE(\tilde{\beta}_k,\hat{\beta})=\frac{1}{MJ}\sum_{i=1}^M \sum_{j=1}^J(\tilde{\beta}_{k,j} - \hat{\beta}_j)^2$.
Here, $\tilde{\beta}_k$ denotes the model parameters estimated from the sample of size $k$, $\hat{\beta}$ denotes the model parameters estimated from the full big data, and $j$ indexes the model parameters.
@@ -118,7 +118,7 @@ colnames(Final_Beta_LS)<-c("Method","Sample",paste0("Beta",0:3))
```{r A- and L-optimality subsampling}
# A- and L-optimality subsampling for GLM
-NeEDS4BigData::ALoptimalGLMSub(r1=300,r2=rep_k,
+NeEDS4BigData::ALoptimalGLMSub(r1=400,r2=rep_k,
Y=as.matrix(Original_Data[,1]),
X=as.matrix(Original_Data[,-1]),
N=N,family = "poisson")->Results
@@ -130,7 +130,7 @@ colnames(Final_Beta_ALoptGLM)<-c("Method","Sample",paste0("Beta",0:3))
```{r A-optimality subsampling with measurement constraints}
# A-optimality subsampling without response
-NeEDS4BigData::AoptimalMCGLMSub(r1=300,r2=rep_k,
+NeEDS4BigData::AoptimalMCGLMSub(r1=400,r2=rep_k,
Y=as.matrix(Original_Data[,1]),
X=as.matrix(Original_Data[,-1]),
N=N,family="poisson")->Results
@@ -158,7 +158,7 @@ remove(Final_Beta_RS,Final_Beta_LS,Final_Beta_ALoptGLM,
The mean squared error is plotted below.
-```{r First scenario model parameter estimates,fig.width=7,fig.height=5,fig.align='center',fig.cap="Mean squared error for all the subsampling methods with 5% and 95% percentile intervals."}
+```{r First scenario model parameter estimates,fig.width=7,fig.height=10,fig.align='center',fig.cap="Mean squared error for a) all the subsampling methods and b) all methods except unweighted leverage sampling, with 5% and 95% percentile intervals."}
# Obtain the mean squared error for the model parameter estimates
MSE_Beta<-data.frame("Method"=Final_Beta$Method,
"Sample"=Final_Beta$Sample,
@@ -183,7 +183,25 @@ ggplot(data=Mean_Data,aes(x=factor(Sample),y=Mean,color=Method,shape=Method))+
scale_color_manual(values = Method_Colour)+
scale_shape_manual(values = Method_Shape_Types)+
theme_bw()+Theme_special()+
- guides(colour = guide_legend(nrow = 3))
+ guides(colour = guide_legend(nrow = 3))->p1
+
+# Plot for the mean squared error with all methods except unweighted leverage
+ggplot(data=Mean_Data[Mean_Data$Method != "Unweighted Leverage",],
+ aes(x=factor(Sample),y=Mean,color=Method,shape=Method))+
+ xlab("Sample Size")+ylab("Squared Error")+
+ geom_point(size=3,position = position_dodge(width = 0.5))+
+ geom_line(data=Mean_Data[Mean_Data$Method != "Unweighted Leverage",],
+ aes(x=factor(Sample),y=Mean,group=Method,color=Method),
+ position = position_dodge(width = 0.5))+
+ geom_errorbar(data=Mean_Data[Mean_Data$Method != "Unweighted Leverage",],
+ aes(ymin=min,ymax=max),width=0.3,
+ position = position_dodge(width = 0.5))+
+ scale_color_manual(values = Method_Colour[-7])+
+ scale_shape_manual(values = Method_Shape_Types[-7])+
+ theme_bw()+Theme_special()+
+ guides(colour = guide_legend(nrow = 3))->p2
+
+ggarrange(p1,p2,nrow = 2,ncol = 1,labels = "auto")
```
## References
diff --git a/vignettes/Benchmarking_model_based.Rmd b/vignettes/Benchmarking_model_based.Rmd
new file mode 100644
index 0000000..a422fa9
--- /dev/null
+++ b/vignettes/Benchmarking_model_based.Rmd
@@ -0,0 +1,271 @@
+---
+title: "Benchmarking model-based subsampling functions"
+resource_files:
+ - additionaldata/Results_Model_based.Rdata
+output: rmarkdown::html_vignette
+vignette: >
+ %\VignetteIndexEntry{Benchmarking model-based subsampling functions}
+ %\VignetteEngine{knitr::rmarkdown}
+ %\VignetteEncoding{UTF-8}
+---
+
+
+
+```{r knitr options,include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+```{r load packages and theme,message=FALSE}
+# Load the R packages
+library(ggplot2)
+library(ggpubr)
+library(tidyr)
+library(dplyr)
+
+# Theme for plots
+Theme_special<-function(){
+ theme(legend.key.width=unit(1,"cm"),
+ axis.text.x = element_text(color = "black",size=14),
+ axis.text.y = element_text(color = "black",size=14),
+ strip.text = element_text(colour = "black",size = 16,face="bold"),
+ panel.grid.minor.x = element_blank(),
+ axis.title= element_text(color = "black",face = "bold",size = 14),
+ legend.text = element_text(color = "black", size=16),
+ legend.position = "bottom",
+ legend.margin=margin(0,0,0,0),
+ legend.box.margin=margin(-1,-2,-1,-2))
+}
+
+load("additionaldata/Results_Model_based.Rdata")
+```
+
+## Setup
+
+The `lm()`, `biglm()`, `glm()` and `bigglm()` functions are benchmarked against the model-based subsampling functions.
+Benchmarking is conducted across all three regression problems using a consistent setup throughout.
+For the big data sizes $N = \{10^4,5 \times 10^4,10^5,5 \times 10^5,10^6,5 \times 10^6\}$ and five different covariate sizes $\{5,10,25,50,100\}$, the functions were replicated $50$ times.
+When using the subsampling functions, the initial subsample sizes $\{100,100,250,500,1000\}$ were selected based on the covariate sizes, while the final subsample size was fixed at $2500$ to ensure successful implementation.
+
+```{r N and Covariate size}
+N_size<-c(1,5,10,50,100,500)*10000
+N_size_labels<-c("10^4","5 x 10^4","10^5","5 x 10^5","10^6","5 x 10^6")
+CS_size<-c(5,10,25,50,100)
+```
+
+This implementation was conducted on a high-performance computing system; however, the code for linear regression-related functions is shown below.
+Furthermore, it can be extended to logistic and Poisson regression by swapping in the relevant subsampling functions and model configurations; a brief sketch of the logistic case follows the code below.
+
+```{r Code setup for benchmarking,eval=FALSE}
+# load the packages
+library(NeEDS4BigData)
+library(here)
+library(biglm)
+
+# indexes for N and covariate sizes
+indexA <- as.numeric(Sys.getenv("indexA"))
+indexB <- as.numeric(Sys.getenv("indexB"))
+
+# set N and covariate size, and assign replicates
+N_size<-c(1,5,10,50,100,500)*10000
+Covariate_size<-c(5,10,25,50,100)
+Replicates<-50
+
+# set the initial and final subsample sizes,
+# with the distribution parameters to generate data
+r1<-c(1,1,2.5,5,10)*100; r2<-2500;
+Dist<-"Normal"; Dist_Par<-list(Mean=0,Variance=1,Error_Variance=0.5);
+Family<-"linear"
+
+# assign the indexes
+N_idx<-indexA
+Covariate_idx<-indexB
+No_Of_Var<-Covariate_size[Covariate_idx]
+N<-N_size[N_idx]
+
+# generate the big data based on N and covariate size
+Beta<-c(-1,rep(0.5,Covariate_size[Covariate_idx]))
+Generated_Data<-GenGLMdata(Dist,Dist_Par,No_Of_Var,Beta,N,Family)
+Full_Data<-Generated_Data$Complete_Data
+
+cat("N size :",N_size[N_idx]," and Covariate size :",Covariate_size[Covariate_idx],"\n")
+
+# formula for the linear regression model
+lm_formula<-as.formula(paste("Y ~", paste(paste0("X",0:ncol(Full_Data[,-c(1,2)])),
+ collapse = " + ")))
+
+# benchmarking the stats::lm() function
+lm<-sapply(1:Replicates,function(i){
+ start_T<-Sys.time()
+ suppressMessages(lm(Y~.-1,data=data.frame(Full_Data)))
+ return(difftime(Sys.time(),start_T,units = "secs"))
+})
+
+cat("Benchmarked lm() function.\n")
+
+# benchmarking the biglm::biglm() function
+biglm<-sapply(1:Replicates,function(i){
+ start_T<-Sys.time()
+ suppressMessages(biglm(lm_formula,data=data.frame(Full_Data)))
+ return(difftime(Sys.time(),start_T,units = "secs"))
+})
+
+cat("Benchmarked biglm() function.\n")
+
+# benchmarking the NeEDS4BigData::AoptimalGauLMSub() function
+AoptimalGauLMSub<-sapply(1:Replicates,function(i){
+ start_T<-Sys.time()
+ suppressMessages(AoptimalGauLMSub(r1[Covariate_idx],r2,
+ Y=as.matrix(Full_Data[,1]),
+ X=as.matrix(Full_Data[,-1]),
+ N=nrow(Full_Data)))
+ return(difftime(Sys.time(),start_T,units = "secs"))
+})
+
+cat("Benchmarked AoptimalGauLMSub() function.\n")
+
+# benchmarking the NeEDS4BigData::LeverageSampling() function
+LeverageSampling<-sapply(1:Replicates,function(i){
+ start_T<-Sys.time()
+ suppressMessages(LeverageSampling(r2,
+ Y=as.matrix(Full_Data[,1]),
+ X=as.matrix(Full_Data[,-1]),
+ N=nrow(Full_Data),
+ S_alpha=0.95,
+ family=Family))
+ return(difftime(Sys.time(),start_T,units = "secs"))
+})
+
+cat("Benchmarked LeverageSampling() function.\n")
+
+# benchmarking the NeEDS4BigData::ALoptimalGLMSub() function
+ALoptimalGLMSub<-sapply(1:Replicates,function(i){
+ start_T<-Sys.time()
+ suppressMessages(ALoptimalGLMSub(r1[Covariate_idx],r2,
+ Y=as.matrix(Full_Data[,1]),
+ X=as.matrix(Full_Data[,-1]),
+ N=nrow(Full_Data),
+ family=Family))
+ return(difftime(Sys.time(),start_T,units = "secs"))
+})
+
+cat("Benchmarked ALoptimalGLMSub() function.\n")
+
+# benchmarking the NeEDS4BigData::AoptimalMCGLMSub() function
+AoptimalMCGLMSub<-sapply(1:Replicates,function(i){
+ start_T<-Sys.time()
+ suppressMessages(AoptimalMCGLMSub(r1[Covariate_idx],r2,
+ Y=as.matrix(Full_Data[,1]),
+ X=as.matrix(Full_Data[,-1]),
+ N=nrow(Full_Data),
+ family=Family))
+ return(difftime(Sys.time(),start_T,units = "secs"))
+})
+
+cat("Benchmarked AoptimalMCGLMSub() function.\n")
+
+Plot_Data<-cbind.data.frame("N size"=N_size[N_idx],
+ "Covariate size"=Covariate_size[Covariate_idx],"lm()"=lm,
+ "biglm()"=biglm,"AoptimalGauLMSub()"=AoptimalGauLMSub,
+ "LeverageSampling()"=LeverageSampling,
+ "ALoptimalGLMSub()"=ALoptimalGLMSub,
+ "AoptimalMCGLMSub()"=AoptimalMCGLMSub)
+
+save(Plot_Data,
+ file=here("Results",paste0("Output_NS_",N_idx,"_CS_",Covariate_idx,".RData")))
+```
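+
+Following on from the note above, a minimal sketch of the logistic extension is given below.
+It shows only the pieces that change; it assumes `GenGLMdata()` accepts `Family="logistic"` with analogous distribution parameters, and the `_time` object names are illustrative.
+
+```{r Code sketch for logistic benchmarking,eval=FALSE}
+# Hedged sketch: swap the linear regression set-up for logistic regression
+Family<-"logistic"
+Generated_Data<-GenGLMdata(Dist,Dist_Par,No_Of_Var,Beta,N,Family) # assumed interface
+Full_Data<-Generated_Data$Complete_Data
+
+# benchmarking stats::glm() in place of lm()
+glm_time<-sapply(1:Replicates,function(i){
+  start_T<-Sys.time()
+  suppressMessages(glm(Y~.-1,data=data.frame(Full_Data),family=binomial()))
+  return(difftime(Sys.time(),start_T,units = "secs"))
+})
+
+# benchmarking biglm::bigglm() in place of biglm()
+bigglm_time<-sapply(1:Replicates,function(i){
+  start_T<-Sys.time()
+  suppressMessages(bigglm(lm_formula,data=data.frame(Full_Data),family=binomial()))
+  return(difftime(Sys.time(),start_T,units = "secs"))
+})
+
+# benchmarking the NeEDS4BigData::LCCsampling() function,
+# which applies to logistic regression only
+LCCsampling_time<-sapply(1:Replicates,function(i){
+  start_T<-Sys.time()
+  suppressMessages(LCCsampling(r1[Covariate_idx],r2,
+                               Y=as.matrix(Full_Data[,1]),
+                               X=as.matrix(Full_Data[,-1]),
+                               N=nrow(Full_Data)))
+  return(difftime(Sys.time(),start_T,units = "secs"))
+})
+
+# LeverageSampling(), ALoptimalGLMSub() and AoptimalMCGLMSub() are called
+# exactly as above with family="logistic"
+```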
+
+## Linear Regression
+
+For linear regression, the functions (1) `AoptimalGauLMSub()`, (2) `LeverageSampling()`, (3) `ALoptimalGLMSub()` and (4) `AoptimalMCGLMSub()` from the NeEDS4BigData package are compared against `lm()` and `biglm()`.
+
+```{r linear regression,fig.width=5*3,fig.height=5*2,fig.align='center',fig.cap="Average time for functions, with 5% and 95% percentile intervals under linear regression."}
+Methods_FCT<-c("lm()","biglm()","LeverageSampling()","AoptimalMCGLMSub()",
+ "AoptimalGauLMSub()","ALoptimalGLMSub()")
+Method_Colors<-c("grey","black","#F76D5E","#A50021","#BBFFBB","#50FF50")
+
+Final_Linear_Regression %>%
+ pivot_longer(cols = `lm()`:`AoptimalMCGLMSub()`,
+ names_to = "Methods",values_to = "Time") %>%
+ group_by(`N size`,`Covariate size`,Methods) %>%
+ summarise(Mean=mean(Time),min=quantile(Time,0.05),max=quantile(Time,0.95),
+ .groups = "drop") %>%
+ mutate(Methods=factor(Methods,levels = Methods_FCT,labels = Methods_FCT),
+ `N size`=factor(`N size`,levels = N_size,
+ labels=paste0("N = ",N_size_labels))) %>%
+ggplot(.,aes(x=factor(`Covariate size`),y=Mean,color=Methods,group=Methods))+
+ geom_point(position = position_dodge(width = 0.5))+
+ geom_line(position = position_dodge(width = 0.5))+
+ geom_errorbar(aes(ymin=min,ymax=max),position = position_dodge(width = 0.5))+
+ facet_wrap(~factor(`N size`),scales = "free",nrow=2)+
+ xlab("Number of Covariates")+ylab("Time in Seconds")+
+ scale_color_manual(values = Method_Colors)+
+ theme_bw()+ggtitle("Linear Regression")+Theme_special()
+```
+
+In general, our subsampling functions are faster, and their advantage grows as the size of the big data and the number of covariates increase, except for `LeverageSampling()`.
+Even though a solution for linear regression is available in analytical form, the subsampling functions perform as well as `biglm()`.
+
+## Logistic Regression
+
+For logistic regression, the functions (1) `LCCsampling()`, (2) `LeverageSampling()`, (3) `AoptimalMCGLMSub()` and (4) `ALoptimalGLMSub()` are compared against `glm()` and `bigglm()`.
+
+```{r logistic regression,fig.width=5*3,fig.height=5*2,fig.align='center',fig.cap="Average time for functions, with 5% and 95% percentile intervals under logistic regression."}
+Methods_FCT<-c("glm()","bigglm()","LCCSampling()","LeverageSampling()",
+ "AoptimalMCGLMSub()","ALoptimalGLMSub()")
+Method_Colors<-c("grey","black","#F76D5E","#A50021","#BBFFBB","#50FF50")
+
+Final_Logistic_Regression %>%
+ pivot_longer(cols = `glm()`:`AoptimalMCGLMSub()`,
+ names_to = "Methods",values_to = "Time") %>%
+ group_by(`N size`,`Covariate size`,Methods) %>%
+ summarise(Mean=mean(Time),min=quantile(Time,0.05),max=quantile(Time,0.95),
+ .groups = "drop") %>%
+ mutate(Methods=factor(Methods,levels = Methods_FCT,labels = Methods_FCT),
+ `N size`=factor(`N size`,levels = N_size,
+ labels=paste0("N = ",N_size_labels))) %>%
+ggplot(.,aes(x=factor(`Covariate size`),y=Mean,color=Methods,group=Methods))+
+ geom_point(position = position_dodge(width = 0.5))+
+ geom_line(position = position_dodge(width = 0.5))+
+ geom_errorbar(aes(ymin=min,ymax=max),position = position_dodge(width = 0.5))+
+ facet_wrap(~factor(`N size`),scales = "free",nrow=2)+
+ xlab("Number of Covariates")+ylab("Time in Seconds")+
+ scale_color_manual(values = Method_Colors)+
+ theme_bw()+Theme_special()+ggtitle("Logistic Regression")
+```
+
+There seems to be little difference between using `glm()` and `bigglm()`, while the subsampling functions perform faster, with the performance gap increasing as the size of the big data and the number of covariates grow.
+
+## Poisson Regression
+
+For Poisson regression, the functions (1) `LeverageSampling()`, (2) `AoptimalMCGLMSub()` and (3) `ALoptimalGLMSub()` are compared against `glm()` and `bigglm()`.
+
+```{r poisson regression, fig.width=5*3,fig.height=5*2,fig.align='center',fig.cap="Average time for functions, with 5% and 95% percentile intervals under Poisson regression."}
+Methods_FCT<-c("glm()","bigglm()","LeverageSampling()",
+ "AoptimalMCGLMSub()","ALoptimalGLMSub()")
+Method_Colors<-c("grey","black","#A50021","#BBFFBB","#50FF50")
+
+Final_Poisson_Regression %>%
+ pivot_longer(cols = `glm()`:`AoptimalMCGLMSub()`,
+ names_to = "Methods",values_to = "Time") %>%
+ group_by(`N size`,`Covariate size`,Methods) %>%
+ summarise(Mean=mean(Time),min=quantile(Time,0.05),max=quantile(Time,0.95),
+ .groups = "drop") %>%
+ mutate(Methods=factor(Methods,levels = Methods_FCT,labels = Methods_FCT),
+ `N size`=factor(`N size`,levels = N_size,
+ labels=paste0("N = ",N_size_labels))) %>%
+ggplot(.,aes(x=factor(`Covariate size`),y=Mean,color=Methods,group=Methods))+
+ geom_point(position = position_dodge(width = 0.5))+
+ geom_line(position = position_dodge(width = 0.5))+
+ geom_errorbar(aes(ymin=min,ymax=max),position = position_dodge(width = 0.5))+
+ facet_wrap(~factor(`N size`),scales = "free",nrow=2)+
+ xlab("Number of Covariates")+ylab("Time in Seconds")+
+ scale_color_manual(values = Method_Colors)+
+ theme_bw()+Theme_special()+ggtitle("Poisson Regression")
+```
+
+Similar to logistic regression, the subsampling functions perform faster than the `glm()` and `bigglm()` functions.
+
+In summary, the subsampling functions available in this R package perform best with high-dimensional data.
diff --git a/vignettes/Benchmarking_model_robust.Rmd b/vignettes/Benchmarking_model_robust.Rmd
new file mode 100644
index 0000000..225b46d
--- /dev/null
+++ b/vignettes/Benchmarking_model_robust.Rmd
@@ -0,0 +1,284 @@
+---
+title: "Benchmarking model-robust subsampling functions"
+resource_files:
+ - additionaldata/Results_Model_robust.Rdata
+output: rmarkdown::html_vignette
+vignette: >
+ %\VignetteIndexEntry{Benchmarking model-robust subsampling functions}
+ %\VignetteEngine{knitr::rmarkdown}
+ %\VignetteEncoding{UTF-8}
+---
+
+
+
+```{r knitr options,include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+```{r load packages and theme,message=FALSE}
+# Load the R packages
+library(ggplot2)
+library(ggpubr)
+library(tidyr)
+library(dplyr)
+
+# Theme for plots
+Theme_special<-function(){
+ theme(legend.key.width=unit(1,"cm"),
+ axis.text.x = element_text(color = "black",size=14),
+ axis.text.y = element_text(color = "black",size=14),
+ strip.text = element_text(colour = "black",size = 16,face="bold"),
+ panel.grid.minor.x = element_blank(),
+ axis.title= element_text(color = "black",face = "bold",size = 14),
+ legend.text = element_text(color = "black", size=16),
+ legend.position = "bottom",
+ legend.margin=margin(0,0,0,0),
+ legend.box.margin=margin(-1,-2,-1,-2))
+}
+
+load("additionaldata/Results_Model_robust.Rdata")
+```
+
+## Setup
+
+The `lm()`, `biglm()`, `glm()`, and `bigglm()` functions are benchmarked against the model-robust subsampling functions.
+This benchmarking is conducted across all three regression problems using a consistent setup throughout.
+For large data sizes $N = \{10^4,5 \times 10^4,10^5,5 \times 10^5,10^6,5 \times 10^6\}$, three different covariate sizes $\{10,25,50\}$, and three different model set sizes $\{3,5,10\}$, the functions were replicated $50$ times.
+When using the model-robust subsampling functions, the initial subsample sizes $\{100,250,500\}$ were selected based on the covariate sizes, while the final subsample size was fixed at $2500$.
+Here, each candidate model augments the main effects with the squared term of one covariate, and the models with the smallest AIC values form the model set.
+
+```{r N and Covariate size}
+N_size<-c(1,5,10,50,100,500)*10000
+N_size_labels<-c("10^4","5 x 10^4","10^5","5 x 10^5","10^6","5 x 10^6")
+CS_size<-c(10,25,50)
+NOM_size<-c(3,5,10)
+N_and_CS_size <- c(t(outer(CS_size, N_size_labels,
+ function(cs, n) paste0("N = ",n,
+ " and \nNumber of Covariates ",cs))))
+```
+
+This implementation was conducted on a high-performance computing system; however, the code for linear regression-related functions is shown below.
+Furthermore, it can be extended to logistic and Poisson regression using the relevant subsampling functions and model configurations; a brief sketch of the logistic case follows the code below.
+
+```{r Code setup for benchmarking,eval=FALSE}
+# load the packages
+library(NeEDS4BigData)
+library(here)
+library(biglm)
+
+# indexes for N, covariate sizes and number of models
+indexA <- as.numeric(Sys.getenv("indexA"))
+indexB <- as.numeric(Sys.getenv("indexB"))
+indexC <- as.numeric(Sys.getenv("indexC"))
+
+# set N, covariate size, number of models and assign replicates
+N_size<-c(1,5,10,50,100,500)*10000
+Covariate_size<-c(10,25,50)
+No_of_Models<-c(3,5,10)
+Replicates<-50
+
+# set the initial and final subsample sizes,
+# with the distribution parameters to generate data
+r1<-c(1,2.5,5)*100; r2<-2500;
+Dist<-"Normal"; Dist_Par<-list(Mean=0,Variance=1,Error_Variance=0.5);
+Family<-"linear"
+
+# assign the indexes
+N_idx<-indexA; Covariate_idx<-indexB; ModelSet_idx<-indexC
+No_Of_Var<-Covariate_size[Covariate_idx]
+N<-N_size[N_idx]
+Model_Set<-No_of_Models[ModelSet_idx]
+
+# generate the big data based on N and covariate size
+Beta<-c(-1,rep(0.5,Covariate_size[Covariate_idx]))
+Generated_Data<-GenGLMdata(Dist,Dist_Par,No_Of_Var,Beta,N,Family)
+Full_Data<-Generated_Data$Complete_Data
+
+# Find the models in the model set for model-robust subsampling
+# based on AIC values
+Full_Data<-cbind(Full_Data,Full_Data[,-c(1,2)]^2)
+colnames(Full_Data)<-c("Y",paste0("X",0:No_Of_Var),paste0("X",1:No_Of_Var,"^2"))
+Temp_Data<-Full_Data[sample(1:N,1000),]
+
+AIC_Values<-NULL
+for (i in 1:No_Of_Var) {
+ temp_data<-as.data.frame(Temp_Data[,c("Y",paste0("X",0:No_Of_Var),paste0("X",i,"^2"))])
+ model <- lm(Y~.-1, data = temp_data)
+ AIC_Values[i]<-AIC(model)
+}
+
+# Covariates in model set
+Best_Indices <- order(AIC_Values)[1:Model_Set]
+Model_Apriori<-rep(1/Model_Set,Model_Set)
+All_Combinations<-lm_formula<-list();
+for (NOM_idx in 1:Model_Set) {
+ All_Combinations[[NOM_idx]]<-c(paste0("X",0:No_Of_Var),
+ paste0("X",Best_Indices[NOM_idx],"^2"))
+ lm_formula[[NOM_idx]]<-as.formula(paste("Y ~",
+ paste(c(paste0("X",0:No_Of_Var),
+ paste0("X",Best_Indices[NOM_idx],"^2")),
+ collapse = " + ")))
+}
+
+cat("N size :",N_size[N_idx]," and Covariate size :",Covariate_size[Covariate_idx],
+ " with the model set ",No_of_Models[ModelSet_idx],"\n")
+
+# benchmarking the stats::lm() function
+lm<-sapply(1:Replicates,function(i){
+ start_T<-Sys.time()
+ for(NOM_idx in 1:length(lm_formula)){
+ suppressMessages(lm(lm_formula[[NOM_idx]],data=data.frame(Full_Data)))
+ }
+ return(difftime(Sys.time(),start_T,units = "secs"))
+})
+
+cat("Benchmarked lm() function.\n")
+
+# benchmarking the biglm::biglm() function
+biglm<-sapply(1:Replicates,function(i){
+ start_T<-Sys.time()
+ for (NOM_idx in 1:length(lm_formula)) {
+ suppressMessages(biglm(lm_formula[[NOM_idx]],data=data.frame(Full_Data)))
+ }
+ return(difftime(Sys.time(),start_T,units = "secs"))
+})
+
+cat("Benchmarked biglm() function.\n")
+
+# benchmarking the NeEDS4BigData::modelRobustLinSub() function
+modelRobustLinSub<-sapply(1:Replicates,function(i){
+ start_T<-Sys.time()
+ suppressMessages(modelRobustLinSub(r1[Covariate_idx],r2,
+ Y=as.matrix(Full_Data[,1]),
+ X=as.matrix(Full_Data[,-1]),
+ N=nrow(Full_Data),
+ Apriori_probs=Model_Apriori,
+ All_Combinations = All_Combinations,
+ All_Covariates = colnames(Full_Data[,-1])))
+ return(difftime(Sys.time(),start_T,units = "secs"))
+})
+
+cat("Benchmarked modelRobustLinSub() function.")
+
+Plot_Data<-cbind.data.frame("N size"=N_size[N_idx],
+ "Covariate size"=Covariate_size[Covariate_idx],
+ "No of Models"= No_of_Models[ModelSet_idx],
+ "lm()"=lm,"biglm()"=biglm,
+ "modelRobustLinSub()"=modelRobustLinSub)
+
+save(Plot_Data,
+ file=here("Results",paste0("Output_NS_",N_idx,"_CS_",Covariate_idx,
+ "_MS_",ModelSet_idx,".RData")))
+```
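+
+Following on from the note above, a minimal sketch of the logistic extension is given below.
+It assumes the data, model set and formulas are rebuilt as above with `Family<-"logistic"` (and `glm(..., family=binomial())` for the AIC-based model selection); only the benchmarking calls that change are shown.
+
+```{r Code sketch for model robust logistic benchmarking,eval=FALSE}
+# Hedged sketch: benchmark the NeEDS4BigData::modelRobustLogSub() function,
+# assuming Full_Data, Model_Apriori and All_Combinations are built as above
+modelRobustLogSub<-sapply(1:Replicates,function(i){
+  start_T<-Sys.time()
+  suppressMessages(modelRobustLogSub(r1[Covariate_idx],r2,
+                                     Y=as.matrix(Full_Data[,1]),
+                                     X=as.matrix(Full_Data[,-1]),
+                                     N=nrow(Full_Data),
+                                     Apriori_probs=Model_Apriori,
+                                     All_Combinations = All_Combinations,
+                                     All_Covariates = colnames(Full_Data[,-1])))
+  return(difftime(Sys.time(),start_T,units = "secs"))
+})
+
+# glm() and bigglm() are benchmarked over the same formula list with
+# family=binomial() in place of the lm()/biglm() loops above
+```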
+
+## Linear Regression
+
+For linear regression, the `modelRobustLinSub()` function from the NeEDS4BigData package was compared against `lm()` and `biglm()`.
+
+```{r linear regression, fig.height=4*3,fig.width=4*6,fig.align='center',fig.cap="Average time for functions, with 5% and 95% percentile intervals under model robust linear regression."}
+Methods_FCT<-c("lm()","biglm()","modelRobustLinSub()")
+Method_Colors<-c("grey","black","#50FF50")
+
+Final_Linear_Regression %>%
+ pivot_longer(cols = `lm()`:`modelRobustLinSub()`,
+ names_to = "Methods",values_to = "Time") %>%
+ mutate(Methods=factor(Methods,levels = Methods_FCT,labels = Methods_FCT),
+ `N size`=factor(`N size`,levels = N_size,labels = N_size_labels),
+ `N and Covariate size`=paste0("N = ",`N size`,
+ " and \nNumber of Covariates ",
+ `Covariate size`)) %>%
+ mutate(`N and Covariate size`=factor(`N and Covariate size`,
+ levels = N_and_CS_size,
+ labels = N_and_CS_size)) %>%
+ select(`N and Covariate size`,`No of Models`,Methods,Time) %>%
+ group_by(`N and Covariate size`,`No of Models`,Methods) %>%
+ summarise(Mean=mean(Time),min=quantile(Time,0.05),max=quantile(Time,0.95),
+ .groups = "drop") %>%
+ ggplot(.,aes(x=factor(`No of Models`),y=Mean,color=Methods,group=Methods))+
+ geom_point(position = position_dodge(width = 0.5))+
+ geom_line(position = position_dodge(width = 0.5))+
+ geom_errorbar(aes(ymin=min,ymax=max),position = position_dodge(width = 0.5))+
+ facet_wrap(.~factor(`N and Covariate size`),scales = "free",ncol=length(N_size))+
+ xlab("Number of Models")+ylab("Time in Seconds")+
+ scale_color_manual(values = Method_Colors)+
+ theme_bw()+Theme_special()+ggtitle("Linear Regression")
+```
+
+In general, our subsampling functions are faster, and their advantage grows as the size of the big data, the number of covariates, and the number of models increase.
+Even though a solution for linear regression is available in analytical form, the subsampling functions perform better than or as well as `biglm()`.
+
+## Logistic Regression
+
+For logistic regression, the `modelRobustLogSub()` function was compared against `glm()` and `bigglm()`.
+
+```{r logistic regression, fig.height=4*3,fig.width=4*6,fig.align='center',fig.cap="Average time for functions, with 5% and 95% percentile intervals under model robust logistic regression."}
+Methods_FCT<-c("glm()","bigglm()","modelRobustLogSub()")
+Method_Colors<-c("grey","black","#50FF50")
+
+Final_Logistic_Regression %>%
+ pivot_longer(cols = `glm()`:`modelRobustLogSub()`,
+ names_to = "Methods",values_to = "Time") %>%
+ mutate(Methods=factor(Methods,levels = Methods_FCT,labels = Methods_FCT),
+ `N size`=factor(`N size`,levels = N_size,labels = N_size_labels),
+ `N and Covariate size`=paste0("N = ",`N size`,
+ " and \nNumber of Covariates ",
+ `Covariate size`)) %>%
+ mutate(`N and Covariate size`=factor(`N and Covariate size`,
+ levels = N_and_CS_size,
+ labels = N_and_CS_size)) %>%
+ select(`N and Covariate size`,`No of Models`,Methods,Time) %>%
+ group_by(`N and Covariate size`,`No of Models`,Methods) %>%
+ summarise(Mean=mean(Time),min=quantile(Time,0.05),max=quantile(Time,0.95),
+ .groups = "drop") %>%
+ ggplot(.,aes(x=factor(`No of Models`),y=Mean,color=Methods,group=Methods))+
+ geom_point(position = position_dodge(width = 0.5))+
+ geom_line(position = position_dodge(width = 0.5))+
+ geom_errorbar(aes(ymin=min,ymax=max),position = position_dodge(width = 0.5))+
+ facet_wrap(.~factor(`N and Covariate size`),scales = "free",ncol=length(N_size))+
+ xlab("Number of Models")+ylab("Time in Seconds")+
+ scale_color_manual(values = Method_Colors)+
+ theme_bw()+Theme_special()+ggtitle("Logistic Regression")
+```
+
+There is a noticeable difference between using `glm()` and `bigglm()`, with the model-robust subsampling function performing faster than both.
+This performance gap increases as the size of the big data, the number of covariates, and the number of models grow.
+
+## Poisson Regression
+
+For Poisson regression, the function `modelRobustPoiSub()` was compared against `glm()` and `bigglm()`.
+
+```{r poisson regression, fig.height=4*3,fig.width=4*6,fig.align='center',fig.cap="Average time for functions, with 5% and 95% percentile intervals under model robust Poisson regression."}
+Methods_FCT<-c("glm()","bigglm()","modelRobustPoiSub()")
+Method_Colors<-c("grey","black","#50FF50")
+
+Final_Poisson_Regression %>%
+ pivot_longer(cols = `glm()`:`modelRobustPoiSub()`,
+ names_to = "Methods",values_to = "Time") %>%
+ mutate(Methods=factor(Methods,levels = Methods_FCT,labels = Methods_FCT),
+ `N size`=factor(`N size`,levels = N_size,labels = N_size_labels),
+ `N and Covariate size`=paste0("N = ",`N size`,
+ " and \nNumber of Covariates ",
+ `Covariate size`)) %>%
+ mutate(`N and Covariate size`=factor(`N and Covariate size`,
+ levels = N_and_CS_size,
+ labels = N_and_CS_size)) %>%
+ select(`N and Covariate size`,`No of Models`,Methods,Time) %>%
+ group_by(`N and Covariate size`,`No of Models`,Methods) %>%
+ summarise(Mean=mean(Time),min=quantile(Time,0.05),max=quantile(Time,0.95),
+ .groups = "drop") %>%
+ ggplot(.,aes(x=factor(`No of Models`),y=Mean,color=Methods,group=Methods))+
+ geom_point(position = position_dodge(width = 0.5))+
+ geom_line(position = position_dodge(width = 0.5))+
+ geom_errorbar(aes(ymin=min,ymax=max),position = position_dodge(width = 0.5))+
+ facet_wrap(.~factor(`N and Covariate size`),scales = "free",ncol=length(N_size))+
+ xlab("Number of Models")+ylab("Time in Seconds")+
+ scale_color_manual(values = Method_Colors)+
+ theme_bw()+Theme_special()+ggtitle("Poisson Regression")
+```
+
+Similar to logistic regression, the model-robust subsampling function performs faster than the `glm()` and `bigglm()` functions.
+
+In summary, the model-robust subsampling functions available in this R package perform best with high-dimensional data.
diff --git a/vignettes/Linear_Regression.Rmd b/vignettes/Linear_Regression.Rmd
index bfe25ab..9974603 100644
--- a/vignettes/Linear_Regression.Rmd
+++ b/vignettes/Linear_Regression.Rmd
@@ -46,7 +46,7 @@ Theme_special<-function(){
# Understanding the electric consumption data
The ``Electric power consumption'' data [@hebrailindividual] contains $2,049,280$ measurements for a house located in Sceaux, France, between December 2006 and November 2010.
-The data contains $4$ columns and has $2,049,280$ observations, first column is the response variable and the rest are covariates, however we only use the first $1\%$ of the data for this explanation.
+The data contains $4$ columns and $2,049,280$ observations; the first column is the response variable and the rest are covariates. However, we only use the first $10\%$ of the data for this explanation.
The response $y$ is the log scaled intensity, while the covariates are active electrical energy in the a) kitchen ($X_1$), b) laundry room ($X_2$) and c) water-heater and air-conditioner ($X_3$).
The covariates are scaled to be mean of zero and variance of one.
@@ -56,8 +56,8 @@ Given data is analysed under two different scenarios,
2. subsampling method assuming the main effects model is potentially misspecified.
```{r Exploring data}
-# Selecting 1% of the big data and prepare it
-indexes<-1:ceiling(nrow(Electric_consumption)*0.01)
+# Selecting 10% of the big data and preparing it
+indexes<-1:ceiling(nrow(Electric_consumption)*0.10)
Original_Data<-cbind(Electric_consumption[indexes,1],1,
Electric_consumption[indexes,-1])
colnames(Original_Data)<-c("Y",paste0("X",0:ncol(Original_Data[,-c(1,2)])))
@@ -71,14 +71,14 @@ head(Original_Data) %>%
caption = "First few observations of the electric consumption data.")
# Setting the sample sizes
-N<-nrow(Original_Data); M<-250; k<-seq(6,18,by=3)*100; rep_k<-rep(k,each=M)
+N<-nrow(Original_Data); M<-250; k<-seq(8,20,by=2)*100; rep_k<-rep(k,each=M)
```
## Model robust or average subsampling
The method of $A$- and $L$-optimality model robust or average subsampling [@mahendran2023model] is compared against the $A$- and $L$-optimality subsampling method [@ai2021optimal;@yao2021review].
Here, five different models are considered: 1) the main effects model ($\beta_0+\beta_1X_1+\beta_2X_2+\beta_3X_3$), 2-4) the main effects model with the squared term of one covariate ($X^2_1 / X^2_2 / X^2_3$) and 5) the main effects model with all the squared terms ($\beta_0+\beta_1X_1+\beta_2X_2+\beta_3X_3+\beta_4X^2_1+\beta_5X^2_2+\beta_6X^2_3$).
-For each model $l$ the mean squared error of the model parameters $MSE_l(\tilde{\beta}_k,\hat{\beta})=\frac{1}{MJ} \sum_{i=1}^M \sum_{j=1}^J (\tilde{\beta}_{k,j} - \hat{\beta}_j)^2$ are calculated for the $M=100$ simulations across the sample sizes $k=(600,\ldots,1500)$ and the initial sample size is $r1=300$.
+For each model $l$, the mean squared error of the model parameters $MSE_l(\tilde{\beta}_k,\hat{\beta})=\frac{1}{MJ} \sum_{i=1}^M \sum_{j=1}^J (\tilde{\beta}_{k,j} - \hat{\beta}_j)^2$ is calculated over the $M=250$ simulations across the sample sizes $k=(800,\ldots,2000)$, with the initial sample size $r1=400$.
Here, for the $l$-th model, $\tilde{\beta}_k$ denotes the model parameters estimated from the sample of size $k$, $\hat{\beta}$ denotes the model parameters estimated from the full big data, and $j$ indexes the model parameters.
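+A sketch of one way to build this model set is given below; the vignette constructs its own `All_Models` object, so this chunk is illustrative only and is not evaluated.
+
+```{r Model set sketch,eval=FALSE}
+# Hedged sketch of how the All_Models object used below could be built,
+# following the X0,...,X3 and squared-term naming convention of this vignette
+Main_Effects<-paste0("X",0:3)
+All_Models<-list(Main_Effects,                         # 1) main effects
+                 c(Main_Effects,"X1^2"),               # 2) plus X1 squared
+                 c(Main_Effects,"X2^2"),               # 3) plus X2 squared
+                 c(Main_Effects,"X3^2"),               # 4) plus X3 squared
+                 c(Main_Effects,paste0("X",1:3,"^2"))) # 5) all squared terms
+```
+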
```{r Define the methods}
@@ -118,7 +118,7 @@ Below is the code of implementation for this scenario.
```{r Equal apriori probabilities, fig.width=12,fig.height=12,fig.align='center',fig.cap="Mean squared error for all the models with equal apriori in the order a to e for Model 1 to 5 across the subsampling methods under comparison."}
All_Covariates<-colnames(Original_Data_ModelRobust)[-1]
# A- and L-optimality model robust subsampling for linear regression
-NeEDS4BigData::modelRobustLinSub(r1=300,r2=rep_k,
+NeEDS4BigData::modelRobustLinSub(r1=400,r2=rep_k,
Y=as.matrix(Original_Data_ModelRobust[,1]),
X=as.matrix(Original_Data_ModelRobust[,-1]),N=N,
Apriori_probs=rep(1/length(All_Models),length(All_Models)),
@@ -180,10 +180,10 @@ Method_Colour<-c("#D82632","#A50021","#BBFFBB","#50FF50","#00BB00")
Method_Shape_Types<-c(rep(8,2),rep(17,3))
# A- and L-optimality and RLmAMSE model misspecified subsampling for linear regression
-NeEDS4BigData::modelMissLinSub(r1=300,r2=rep_k,
+NeEDS4BigData::modelMissLinSub(r1=400,r2=rep_k,
Y=as.matrix(Original_Data[,1]),
X=as.matrix(Original_Data[,-1]),
- N=N,Alpha=10,proportion=0.8)->Results
+ N=N,Alpha=10,proportion=0.5)->Results
Final_Beta_modelMiss<-Results$Beta_Estimates
Final_AMSE_modelMiss<-Results$AMSE_Estimates
diff --git a/vignettes/Logistic_Regression.Rmd b/vignettes/Logistic_Regression.Rmd
index 8467bbc..bcf072c 100644
--- a/vignettes/Logistic_Regression.Rmd
+++ b/vignettes/Logistic_Regression.Rmd
@@ -70,14 +70,14 @@ head(Original_Data) %>%
caption = "First few observations of the skin segmentation data.")
# Setting the sample sizes
-N<-nrow(Original_Data); M<-250; k<-seq(6,18,by=3)*100; rep_k<-rep(k,each=M)
+N<-nrow(Original_Data); M<-250; k<-seq(8,20,by=2)*100; rep_k<-rep(k,each=M)
```
## Model robust or average subsampling
The method of $A$- and $L$-optimality model robust or average subsampling [@mahendran2023model] is compared against the $A$- and $L$-optimality subsampling method [@ai2021optimal;@yao2021review].
Here, five different models are considered: 1) the main effects model ($\beta_0+\beta_1X_1+\beta_2X_2+\beta_3X_3$), 2-4) the main effects model with the squared term of one covariate ($X^2_1 / X^2_2 / X^2_3$) and 5) the main effects model with all the squared terms ($\beta_0+\beta_1X_1+\beta_2X_2+\beta_3X_3+\beta_4X^2_1+\beta_5X^2_2+\beta_6X^2_3$).
-For each model $j$ the mean squared error of the model parameters $MSE_l(\tilde{\beta}_k,\hat{\beta})=\frac{1}{MJ} \sum_{i=1}^M \sum_{j=1}^J (\tilde{\beta}_{k,j} - \hat{\beta}_j)^2$ are calculated for the $M=100$ simulations across the sample sizes $k=(600,\ldots,1500)$ and the initial sample size is $r1=300$.
+For each model $l$, the mean squared error of the model parameters $MSE_l(\tilde{\beta}_k,\hat{\beta})=\frac{1}{MJ} \sum_{i=1}^M \sum_{j=1}^J (\tilde{\beta}_{k,j} - \hat{\beta}_j)^2$ is calculated over the $M=250$ simulations across the sample sizes $k=(800,\ldots,2000)$, with the initial sample size $r1=400$.
Here, for the $l$-th model, $\tilde{\beta}_k$ denotes the model parameters estimated from the sample of size $k$, $\hat{\beta}$ denotes the model parameters estimated from the full big data, and $j$ indexes the model parameters.
```{r Define the methods}
@@ -117,7 +117,7 @@ Below is the code of implementation for this scenario.
```{r Equal apriori,fig.width=12,fig.height=12,fig.align='center', fig.cap="Mean squared error for all the models with equal apriori in the order a to e for Model 1 to 5 across the subsampling methods under comparison."}
All_Covariates<-colnames(Original_Data_ModelRobust)[-1]
# A- and L-optimality model robust subsampling for logistic regression
-NeEDS4BigData::modelRobustLogSub(r1=300,r2=rep_k,
+NeEDS4BigData::modelRobustLogSub(r1=400,r2=rep_k,
Y=as.matrix(Original_Data_ModelRobust[,1]),
X=as.matrix(Original_Data_ModelRobust[,-1]),
N=N,Apriori_probs=rep(1/length(All_Models),length(All_Models)),
@@ -180,10 +180,10 @@ Method_Colour<-c("#D82632","#A50021","#BBFFBB","#50FF50","#00BB00")
Method_Shape_Types<-c(rep(8,2),rep(17,3))
# A- and L-optimality and RLmAMSE model misspecified subsampling for logistic regression
-NeEDS4BigData::modelMissLogSub(r1=300,r2=rep_k,
+NeEDS4BigData::modelMissLogSub(r1=400,r2=rep_k,
Y=as.matrix(Original_Data[,1]),
X=as.matrix(Original_Data[,-1]),
- N=N,Alpha=10, proportion = 0.8)->Results
+ N=N,Alpha=10, proportion = 1)->Results
Final_Beta_modelMiss<-Results$Beta_Estimates
Final_AMSE_modelMiss<-Results$AMSE_Estimates
diff --git a/vignettes/Poisson_Regression.Rmd b/vignettes/Poisson_Regression.Rmd
index c387d6c..fd8be42 100644
--- a/vignettes/Poisson_Regression.Rmd
+++ b/vignettes/Poisson_Regression.Rmd
@@ -68,14 +68,14 @@ head(Original_Data) %>%
caption = "First few observations of the bike sharing data.")
# Setting the sample sizes
-N<-nrow(Original_Data); M<-250; k<-seq(6,18,by=3)*100; rep_k<-rep(k,each=M)
+N<-nrow(Original_Data); M<-250; k<-seq(8,20,by=2)*100; rep_k<-rep(k,each=M)
```
## Model robust or average subsampling
The method of $A$- and $L$-optimality model robust or average subsampling [@mahendran2023model] is compared against the $A$- and $L$-optimality subsampling method [@ai2021optimal;@yao2021review].
Here, five different models are considered: 1) the main effects model ($\beta_0+\beta_1X_1+\beta_2X_2+\beta_3X_3$), 2-4) the main effects model with the squared term of one covariate ($X^2_1 / X^2_2 / X^2_3$) and 5) the main effects model with all the squared terms ($\beta_0+\beta_1X_1+\beta_2X_2+\beta_3X_3+\beta_4X^2_1+\beta_5X^2_2+\beta_6X^2_3$).
-For each model $j$ the mean squared error of the model parameters $MSE_l(\tilde{\beta}_k,\hat{\beta})=\frac{1}{MJ} \sum_{i=1}^M \sum_{j=1}^J (\tilde{\beta}_{k,j} - \hat{\beta}_j)^2$ are calculated for the $M=100$ simulations across the sample sizes $k=(600,\ldots,1500)$ and the initial sample size is $r1=300$.
+For each model $l$, the mean squared error of the model parameters $MSE_l(\tilde{\beta}_k,\hat{\beta})=\frac{1}{MJ} \sum_{i=1}^M \sum_{j=1}^J (\tilde{\beta}_{k,j} - \hat{\beta}_j)^2$ is calculated over the $M=250$ simulations across the sample sizes $k=(800,\ldots,2000)$, with the initial sample size $r1=400$.
Here, for the $l$-th model, $\tilde{\beta}_k$ denotes the model parameters estimated from the sample of size $k$, $\hat{\beta}$ denotes the model parameters estimated from the full big data, and $j$ indexes the model parameters.
```{r define subsampling methods}
@@ -115,7 +115,7 @@ Below is the code of implementation for this scenario.
```{r Equal apriori,fig.width=12,fig.height=12,fig.align='center',fig.cap="Mean squared error for all the models with equal apriori in the order a to e for Model 1 to 5 across the subsampling methods under comparison."}
All_Covariates<-colnames(Original_Data_ModelRobust)[-1]
# A- and L-optimality model robust subsampling for poisson regression
-NeEDS4BigData::modelRobustPoiSub(r1=300,r2=rep_k,
+NeEDS4BigData::modelRobustPoiSub(r1=400,r2=rep_k,
Y=as.matrix(Original_Data_ModelRobust[,1]),
X=as.matrix(Original_Data_ModelRobust[,-1]),
N=N,Apriori_probs=rep(1/length(All_Models),length(All_Models)),
@@ -178,10 +178,10 @@ Method_Colour<-c("#D82632","#A50021","#BBFFBB","#50FF50","#00BB00")
Method_Shape_Types<-c(rep(8,2),rep(17,3))
# A- and L-optimality and RLmAMSE model misspecified subsampling for poisson regression
-NeEDS4BigData::modelMissPoiSub(r1=300,r2=rep_k,
+NeEDS4BigData::modelMissPoiSub(r1=400,r2=rep_k,
Y=as.matrix(Original_Data[,1]),
X=as.matrix(Original_Data[,-1]),
- N=N,Alpha=10, proportion = 0.8)->Results
+ N=N,Alpha=10, proportion = 1)->Results
Final_Beta_modelMiss<-Results$Beta_Estimates
Final_AMSE_modelMiss<-Results$AMSE_Estimates
diff --git a/vignettes/additionaldata/Results_Model_based.Rdata b/vignettes/additionaldata/Results_Model_based.Rdata
new file mode 100644
index 0000000..27eb766
Binary files /dev/null and b/vignettes/additionaldata/Results_Model_based.Rdata differ
diff --git a/vignettes/additionaldata/Results_Model_robust.Rdata b/vignettes/additionaldata/Results_Model_robust.Rdata
new file mode 100644
index 0000000..b4f2f51
Binary files /dev/null and b/vignettes/additionaldata/Results_Model_robust.Rdata differ