diff --git a/.github/workflows/R_CMD_check_Hades.yaml b/.github/workflows/R_CMD_check_Hades.yaml index 52b631e59..27dcc8d23 100644 --- a/.github/workflows/R_CMD_check_Hades.yaml +++ b/.github/workflows/R_CMD_check_Hades.yaml @@ -78,7 +78,7 @@ jobs: do eval sudo $cmd done < <(Rscript -e 'writeLines(remotes::system_requirements("ubuntu", "20.04"))') - + - uses: r-lib/actions/setup-r-dependencies@v2 with: extra-packages: any::rcmdcheck diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml new file mode 100644 index 000000000..7a5e8ac76 --- /dev/null +++ b/.github/workflows/pkgdown.yaml @@ -0,0 +1,46 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples +# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help +on: + push: + branches: [main, develop] + release: + types: [published] + workflow_dispatch: + +name: pkgdown + +jobs: + pkgdown: + runs-on: ubuntu-latest + # Only restrict concurrency for non-PR jobs + concurrency: + group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + steps: + - uses: actions/checkout@v2 + + - uses: r-lib/actions/setup-pandoc@v2 + + - uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::pkgdown, ohdsi/OhdsiRTools + needs: website + + - name: Build site + run: Rscript -e 'pkgdown::build_site_github_pages(new_process = FALSE, install = TRUE)' + + - name: Fix Hades Logo + run: Rscript -e 'OhdsiRTools::fixHadesLogo()' + + - name: Deploy to GitHub pages 🚀 + if: github.event_name != 'pull_request' + uses: JamesIves/github-pages-deploy-action@4.1.4 + with: + clean: false + branch: gh-pages + folder: docs diff --git a/DESCRIPTION b/DESCRIPTION index 7687fb3bf..be822bd1b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,8 +2,8 @@ Package: PatientLevelPrediction Type: Package Title: Developing patient level prediction using data in the OMOP Common Data Model -Version: 6.3.1 -Date: 2023-02-28 +Version: 6.3.2 +Date: 2023-05-15 Authors@R: c( person("Jenna", "Reps", email = "jreps@its.jnj.com", role = c("aut", "cre")), person("Martijn", "Schuemie", role = c("aut")), @@ -23,7 +23,7 @@ URL: https://ohdsi.github.io/PatientLevelPrediction, https://github.com/OHDSI/Pa BugReports: https://github.com/OHDSI/PatientLevelPrediction/issues VignetteBuilder: knitr Depends: - R (>= 3.3.0) + R (>= 4.0.0) Imports: Andromeda, Cyclops (>= 3.0.0), @@ -75,6 +75,7 @@ Remotes: ohdsi/Eunomia, ohdsi/FeatureExtraction, ohdsi/IterativeHardThresholding, + ohdsi/ParallelLogger, ohdsi/ShinyAppBuilder, ohdsi/ResultModelManager RoxygenNote: 7.2.3 diff --git a/NEWS.md b/NEWS.md index 91fd454a9..ee19962b5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,10 @@ +PatientLevelPrediction 6.3.2 +====================== +- fixed bug with database insert if result is incomplete +- updated/fixed documentation (Egill) +- added model path to models (Henrik) +- updated hyper-parameter saving to data.frame and made consistent + PatientLevelPrediction 6.3.1 ====================== - fixed bug with multiple covariate settings in diagnose plp diff --git a/R/CyclopsModels.R b/R/CyclopsModels.R index 535552c9a..83c61f098 100644 --- a/R/CyclopsModels.R +++ b/R/CyclopsModels.R @@ -156,7 +156,7 @@ fitCyclopsModel <- function( prediction$evaluationType <- 'Train' # get cv AUC if exists - cvPerFold <- c() + cvPerFold <- data.frame() if(!is.null(modelTrained$cv)){ cvPrediction <- do.call(rbind, 
lapply(modelTrained$cv, function(x){x$predCV})) cvPrediction$evaluationType <- 'CV' @@ -167,7 +167,17 @@ fitCyclopsModel <- function( cvPerFold <- unlist(lapply(modelTrained$cv, function(x){x$out_sample_auc})) if(length(cvPerFold)>0){ - names(cvPerFold) <- paste0('fold_auc', 1:length(cvPerFold)) + cvPerFold <- data.frame( + metric = 'AUC', + fold = 1:length(cvPerFold), + value = cvPerFold, + startingVariance = ifelse(is.null(param$priorParams$variance), 'NULL', param$priorParams$variance), + lowerLimit = ifelse(is.null(param$lowerLimit), 'NULL', param$lowerLimit), + upperLimit = ifelse(is.null(param$upperLimit), 'NULL', param$upperLimit), + tolerance = ifelse(is.null(settings$tolerance), 'NULL', settings$tolerance) + ) + } else{ + cvPerFold <- data.frame() } # remove the cv from the model: diff --git a/R/DatabaseMigration.R b/R/DatabaseMigration.R index 92c1166bb..bc480ce32 100644 --- a/R/DatabaseMigration.R +++ b/R/DatabaseMigration.R @@ -1,3 +1,21 @@ +# @file DatabaseMigration.R +# +# Copyright 2023 Observational Health Data Sciences and Informatics +# +# This file is part of PatientLevelPrediction +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitatons under the License. +# #' Migrate Data model #' @description #' Migrate data from current state to next state diff --git a/R/Fit.R b/R/Fit.R index ce629c093..104510fba 100644 --- a/R/Fit.R +++ b/R/Fit.R @@ -37,6 +37,7 @@ #' } #' @param search The search strategy for the hyper-parameter selection (currently not used) #' @param analysisId The id of the analysis +#' @param analysisPath The path of the analysis #' @return #' An object of class \code{plpModel} containing: #' @@ -53,7 +54,8 @@ fitPlp <- function( trainData, modelSettings, search = "grid", - analysisId + analysisId, + analysisPath ) { @@ -76,7 +78,8 @@ fitPlp <- function( trainData = trainData, modelSettings, # old: param = modelSettings$param, # make this model settings? 
search = search, - analysisId = analysisId + analysisId = analysisId, + analysisPath = analysisPath ) plpModel <- do.call(fun, args) ParallelLogger::logTrace('Returned from classifier function') diff --git a/R/KNN.R b/R/KNN.R index ea1d3cf0c..54ef1528d 100644 --- a/R/KNN.R +++ b/R/KNN.R @@ -64,7 +64,7 @@ setKNN <- function(k=1000, indexFolder=file.path(getwd(),'knn'), threads = 1 ){ return(result) } -fitKNN <- function(trainData, modelSettings, search = 'none', analysisId ){ +fitKNN <- function(trainData, modelSettings, search = 'none', analysisId, ...){ param <- modelSettings$param @@ -150,7 +150,7 @@ fitKNN <- function(trainData, modelSettings, search = 'none', analysisId ){ trainingTime = paste(as.character(abs(comp)), attr(comp,'units')), trainingDate = Sys.Date(), modelName = 'KNN', - hyperParamSearch =c(), + hyperParamSearch = data.frame(), finalModelParameters = list( k = k, threads = param$threads diff --git a/R/PatientLevelPrediction.R b/R/PatientLevelPrediction.R index cc2f08533..b4e2a82b5 100644 --- a/R/PatientLevelPrediction.R +++ b/R/PatientLevelPrediction.R @@ -22,6 +22,7 @@ #' #' @docType package #' @name PatientLevelPrediction +#' @keywords internal #' @importFrom dplyr %>% #' @importFrom rlang .data NULL diff --git a/R/PreprocessingData.R b/R/PreprocessingData.R index 200127713..834d27a85 100644 --- a/R/PreprocessingData.R +++ b/R/PreprocessingData.R @@ -15,7 +15,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -#' Create the settings for preprocessing the trainData using \code{ }. +#' Create the settings for preprocessing the trainData. #' #' @details #' Returns an object of class \code{preprocessingSettings} that specifies how to preprocess the training data diff --git a/R/RClassifier.R b/R/RClassifier.R index 020aa85e7..69b575193 100644 --- a/R/RClassifier.R +++ b/R/RClassifier.R @@ -3,8 +3,8 @@ fitRclassifier <- function( trainData, modelSettings, search = 'grid', - analysisId - ){ + analysisId, + ...){ param <- modelSettings$param diff --git a/R/Recalibration.R b/R/Recalibration.R index 24b0eff07..2053f7ed7 100644 --- a/R/Recalibration.R +++ b/R/Recalibration.R @@ -90,7 +90,8 @@ recalibratePlpRefit <- function( fitPlp( trainData = newData, modelSettings = setLassoRefit, - analysisId = 'recalibrationRefit' + analysisId = 'recalibrationRefit', + analysisPath = NULL ) }, error = function(e){ParallelLogger::logInfo(e); return(NULL)} diff --git a/R/RunMultiplePlp.R b/R/RunMultiplePlp.R index 3a287d182..eff802409 100644 --- a/R/RunMultiplePlp.R +++ b/R/RunMultiplePlp.R @@ -132,11 +132,10 @@ runMultiplePlp <- function( dataExists <- length(dir(file.path(saveDirectory, settings$dataLocation)))>0 if(dataExists){ - plpData <- PatientLevelPrediction::loadPlpData(file.path(saveDirectory, settings$dataLocation)) - analysisExists <- file.exists(file.path(saveDirectory, settings$analysisId,'diagnosePlp.rds')) if(!analysisExists){ + plpData <- PatientLevelPrediction::loadPlpData(file.path(saveDirectory, settings$dataLocation)) diagnosePlpSettings <- list( plpData = plpData, outcomeId = modelDesign$outcomeId, @@ -171,11 +170,10 @@ runMultiplePlp <- function( dataExists <- length(dir(file.path(saveDirectory, settings$dataLocation)))>0 if(dataExists){ - plpData <- PatientLevelPrediction::loadPlpData(file.path(saveDirectory, settings$dataLocation)) - analysisExists <- file.exists(file.path(saveDirectory, settings$analysisId,'plpResult', 'runPlp.rds')) + if(!analysisExists){ - + plpData <- 
PatientLevelPrediction::loadPlpData(file.path(saveDirectory, settings$dataLocation)) runPlpSettings <- list( plpData = plpData, outcomeId = modelDesign$outcomeId, diff --git a/R/RunPlp.R b/R/RunPlp.R index 29b17d745..2f9cd0199 100644 --- a/R/RunPlp.R +++ b/R/RunPlp.R @@ -366,7 +366,8 @@ runPlp <- function( settings <- list( trainData = data$Train, modelSettings = modelSettings, - analysisId = analysisId + analysisId = analysisId, + analysisPath = analysisPath ) ParallelLogger::logInfo(sprintf('Training %s model',settings$modelSettings$name)) diff --git a/R/SklearnClassifier.R b/R/SklearnClassifier.R index 96e8005a9..4e386b705 100644 --- a/R/SklearnClassifier.R +++ b/R/SklearnClassifier.R @@ -416,13 +416,29 @@ computeGridPerformance <- function(prediction, param, performanceFunct = 'comput } } - hyperSummary <- c(performanceFunct, performance, performanceFold, unlist(paramString)) - names(hyperSummary) <- c( - 'Metric', - 'cvPerformance', - paste0('cvPerformanceFold',1:length(performanceFold)), - names(param) + #hyperSummary <- c(performanceFunct, performance, performanceFold, unlist(paramString)) + #names(hyperSummary) <- c( + # 'Metric', + # 'cvPerformance', + # paste0('cvPerformanceFold',1:length(performanceFold)), + # names(param) + #) + paramValues <- unlist(paramString) + names(paramValues) <- names(param) + + hyperSummary <- as.data.frame( + c( + data.frame( + metric = performanceFunct, + fold = c("CV",as.character(1:length(performanceFold))), + value = c(performance,performanceFold) + ), + paramValues + ) ) + + + return( list( diff --git a/R/uploadToDatabase.R b/R/uploadToDatabase.R index 28230541a..eef4f2021 100644 --- a/R/uploadToDatabase.R +++ b/R/uploadToDatabase.R @@ -329,13 +329,18 @@ addMultipleRunPlpToDatabase <- function( ParallelLogger::logInfo('result loaded') # Add runPlp to the database - addRunPlpToDatabase( - runPlp = runPlp, - connectionDetails = connectionDetails, - databaseSchemaSettings = databaseSchemaSettings, - cohortDefinitions = cohortDefinitions, - databaseList = databaseList, - modelSaveLocation = modelSaveLocation + tryCatch( + {addRunPlpToDatabase( + runPlp = runPlp, + connectionDetails = connectionDetails, + databaseSchemaSettings = databaseSchemaSettings, + cohortDefinitions = cohortDefinitions, + databaseList = databaseList, + modelSaveLocation = modelSaveLocation + )}, error = function(e){ + ParallelLogger::logInfo('result upload failed: '); + ParallelLogger::logInfo(e) + } ) } #model not null diff --git a/README.md b/README.md index 7740510e2..52a2ac8bc 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ Features - Extracts the necessary data from a database in OMOP Common Data Model format for multiple covariate settings. - Uses a large set of covariates including for example all drugs, diagnoses, procedures, as well as age, comorbidity indexes, and custom covariates. - Allows you to add custom covariates or cohort covariates. -- Includes a large number of state-of-the-art machine learning algorithms that can be used to develop predictive models, including Regularized logistic regression, Random forest, Gradient boosting machines, Decision tree, Naive Bayes, K-nearest neighbours, Neural network and AdaBoost, SVM. +- Includes a large number of state-of-the-art machine learning algorithms that can be used to develop predictive models, including Regularized logistic regression, Random forest, Gradient boosting machines, Decision tree, Naive Bayes, K-nearest neighbours, Neural network, AdaBoost and Support vector machines. 
- Allows you to add custom algorithms. - Allows you to add custom feature engineering - Allows you to add custom under/over sampling (or any other sampling) [note: based on existing research this is not recommended] @@ -71,11 +71,11 @@ Demo of the Shiny Apps can be found here: Technology ========== -PatientLevelPrediction is an R package, with some functions implemented in python. +PatientLevelPrediction is an R package, with some functions using python through reticulate. System Requirements =================== -Requires R (version 3.3.0 or higher). Installation on Windows requires [RTools](http://cran.r-project.org/bin/windows/Rtools/). Libraries used in PatientLevelPrediction require Java and Python. +Requires R (version 4.0 or higher). Installation on Windows requires [RTools](http://cran.r-project.org/bin/windows/Rtools/). Libraries used in PatientLevelPrediction require Java and Python. The python installation is required for some of the machine learning algorithms. We advise to install Python 3.7 using Anaconda (https://www.continuum.io/downloads). @@ -129,7 +129,6 @@ Development =========== PatientLevelPrediction is being developed in R Studio. -Beta # Acknowledgements diff --git a/_pkgdown.yml b/_pkgdown.yml index 8e3b50682..abe042283 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -1,7 +1,11 @@ template: params: bootswatch: cosmo - + +development: + mode: auto + development: docs/dev + home: links: - text: Ask a question @@ -57,7 +61,8 @@ reference: - createRestrictPlpDataSettings - getPlpData - savePlpData - - loadPlpData + - loadPlpData + - getCohortCovariateData - title: "Settings for designing a prediction models" desc: > Design settings required when developing a model. @@ -67,12 +72,25 @@ reference: - createSampleSettings - createFeatureEngineeringSettings - createPreprocessSettings + - title: "Optional design settings" + desc: > + Settings for optional steps that can be used in the PLP pipeline + contents: + - createCohortCovariateSettings + - createRandomForestFeatureSelection + - createUnivariateFeatureSelection + - title: "External validation" + contents: + - createValidationSettings + - recalibratePlp + - recalibratePlpRefit - title: "Execution settings when developing a model" desc: > Execution settings required when developing a model. contents: - createLogSettings - createExecuteSettings + - createDefaultExecuteSettings - title: "Binary Classification Models" desc: > Functions for setting binary classifiers and their hyper-parameter search. @@ -86,6 +104,8 @@ reference: - setNaiveBayes - setRandomForest - setSVM + - setIterativeHardThresholding + - setLightGBM - title: "Survival Models" desc: > Functions for setting survival models and their hyper-parameter search. @@ -101,6 +121,7 @@ reference: - loadPlpModel - savePlpResult - loadPlpResult + - diagnosePlp - title: "Multiple Patient-Level Prediction Models" desc: > Functions for training mutliple patient-level-prediction model in an efficient way. @@ -110,12 +131,34 @@ reference: - validateMultiplePlp - savePlpAnalysesJson - loadPlpAnalysesJson + - diagnoseMultiplePlp + - title: "Individual pipeline functions" + desc: > + Functions for running parts of the PLP workflow + contents: + - createStudyPopulation + - splitData + - preprocessData + - fitPlp + - predictPlp + - evaluatePlp + - covariateSummary - title: "Saving results into database" desc: > Functions for saving the prediction model and performances into a database. 
contents: + - insertResultsToSqlite - createPlpResultTables - - populatePlpResultTables + - addMultipleRunPlpToDatabase + - addRunPlpToDatabase + - createDatabaseSchemaSettings + - createDatabaseList + - addDiagnosePlpToDatabase + - addMultipleDiagnosePlpToDatabase + - extractDatabaseToCsv + - insertCsvToDatabase + - insertModelDesignInDatabase + - migrateDataModel - title: "Shiny Viewers" desc: > Functions for viewing results via a shiny app @@ -140,6 +183,7 @@ reference: - plotPreferencePDF - plotPredictionDistribution - plotVariableScatterplot + - outcomeSurvivalPlot - title: "Learning Curves" desc: > Functions for creating and plotting learning curves @@ -151,3 +195,67 @@ reference: Functions for simulating cohort method data objects. contents: - simulatePlpData + - plpDataSimulationProfile + - title: "Data manipulation functions" + desc: > + Functions for manipulating data + contents: + - toSparseM + - MapIds + - title: "Helper/utility functions" + contents: + - listAppend + - listCartesian + - createTempModelLoc + - configurePython + - setPythonEnvironment + - title: "Evaluation measures" + contents: + - accuracy + - averagePrecision + - brierScore + - calibrationLine + - computeAuc + - f1Score + - falseDiscoveryRate + - falseNegativeRate + - falseOmissionRate + - falsePositiveRate + - ici + - modelBasedConcordance + - negativeLikelihoodRatio + - negativePredictiveValue + - positiveLikelihoodRatio + - positivePredictiveValue + - sensitivity + - specificity + - computeGridPerformance + - diagnosticOddsRatio + - getCalibrationSummary + - getDemographicSummary + - getThresholdSummary + - getThresholdSummary_binary + - getPredictionDistribution + - getPredictionDistribution_binary + - title: "Saving/loading models as json" + desc: > + Functions for saving or loading models as json + contents: + - sklearnFromJson + - sklearnToJson + - title: "Load/save for sharing" + desc: > + Functions for loading/saving objects for sharing + contents: + - savePlpShareable + - loadPlpShareable + - loadPrediction + - savePrediction + - title: "Feature importance" + contents: + - pfi + - title: "Other functions" + contents: + - predictCyclops + + diff --git a/docs/404.html b/docs/404.html deleted file mode 100644 index c63753b85..000000000 --- a/docs/404.html +++ /dev/null @@ -1,211 +0,0 @@ - - - - - - - - -Page not found (404) • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
diff --git a/docs/articles/ATLAS_O.png b/docs/articles/ATLAS_O.png deleted file mode 100644 index 3cda2abf7..000000000 Binary files a/docs/articles/ATLAS_O.png and /dev/null differ diff --git a/docs/articles/ATLAS_T.png b/docs/articles/ATLAS_T.png deleted file mode 100644 index 8be57dc9e..000000000 Binary files a/docs/articles/ATLAS_T.png and /dev/null differ diff --git a/docs/articles/AddingCustomAlgorithms.html b/docs/articles/AddingCustomAlgorithms.html deleted file mode 100644 index d9dbffc64..000000000 --- a/docs/articles/AddingCustomAlgorithms.html +++ /dev/null @@ -1,527 +0,0 @@ -Adding Custom Patient-Level Prediction Algorithms • PatientLevelPrediction

-Introduction

-

This vignette describes how you can add your own custom algorithms in the Observational Health Data Sciences and Informatics (OHDSI) PatientLevelPrediction package. This allows you to fully leverage the OHDSI PatientLevelPrediction framework for model development and validation. This vignette assumes you have read and are comfortable with building single patient level prediction models as described in the BuildingPredictiveModels vignette.

-

We invite you to share your new algorithms with the OHDSI community through our GitHub repository.

-
-
-

-Algorithm Code Structure

-

Each algorithm in the package should be implemented in its own <Name>.R file, e.g. KNN.R, containing a set<Name> function and a fit<Name> function. Furthermore, a corresponding predict function in predict.R is needed (if there isn’t one available that would work, see example at the end of the document). We will now describe each of these functions in more detail below.

-
-

-Set

-

The set<Name> is a function that takes as input the different hyper-parameter values to do a grid search when training. The output of the functions needs to be a list as class modelSettings containing:

-
    -
  • param - all the combinations of the hyper-parameter values input
  • -
  • model - a string specifying what function to call to fit the model
  • -
  • name - a string containing the name of the model.
  • -
-

For example, if you were adding a model called madeUp that has two hyper-parameters then the set function should be:

-
-setMadeUp <- function(a=1, b=2, seed=NULL){
-  # add input checks here...
-  
-  # now create list of all combinations:
-  result <- list(model='fitMadeUp', # this will be called to train the made up model
-                 param= split(expand.grid(a=a, 
-                                          b=b,
-                                          seed=ifelse(is.null(seed),'NULL', seed)),
-                              1:(length(a)*length(b)  )),
-                 name='Made Up Algorithm'
-  )
-  class(result) <- 'modelSettings' 
-  
-  return(result)
-}
-
-
-

-Fit

-

This function should train your custom model for each parameter entry, pick the best parameters and train a final model for that setting.

-

The fit<Model> should have as inputs:

-
    -
  • population - the study population the model is being developed on
  • -
  • plpData - the plpData object
  • -
  • param - the hyper-parameters as a list of all combinations
  • -
  • quiet - T or F indicating whether to output progress
  • -
  • outcomeId - the outcome id
  • -
  • cohortId - the target population id
  • -
-

The fit function should return a list of class plpModel with the following objects:

-
    -
  • model - a trained model
  • -
  • modelSettings - a list containing the model and input param
  • -
  • trainCVAuc - a value with the train AUC value
  • -
  • hyperParamSearch - a dataframe with the hyperparameter grid and corresponding AUCs
  • -
  • metaData - the metaData from the plpData object
  • -
  • populationSettings - the settings used to create the population and define the time-at-risk
  • -
  • outcomeId - the outcomeId being predicted
  • -
  • cohortId - the cohortId corresponding to the target cohort
  • -
  • varImp - a dataframe with the covariates and a measure of importance
  • -
  • trainingTime - how long it took to develop/evaluate the model
  • -
  • covariateMap - if the plpData are converted to a matrix for model compatibility this tells us which covariate each column in the matrix corresponds to and is needed when implementing the model on new data
  • -
-

The plpModel returned by fit also has a type attribute that points to the predict function; for example, attr(result, 'type') <- 'madeup' means that when the model is applied to new data, the ‘predict.madeup’ function in Predict.R is called. If this doesn't exist, then the model will fail. Another attribute is the predictionType, e.g., attr(result, 'predictionType') <- 'binary'; this is currently not needed but may be important in the future when we expand to regression or multiclass classification.

-

For example:

-
-fitMadeUp <- function(population, plpData, param, quiet=F,
-                        outcomeId, cohortId, ...){
-  
-  # **************** code to train the model here
-  # trainedModel <- this code should apply each hyper-parameter using the cross validation
-  #                 then pick out the best hyper-parameter setting
-  #                 and finally fit a model on the whole train data using the 
-  #                 optimal hyper-parameter settings
-  # ****************
-  
-  # construct the standard output for a model:
-  result <- list(model = trainedModel,
-                 modelSettings = list(model='made_up', modelParameters=param),
-                 trainCVAuc = NULL,
-                 hyperParamSearch = hyperSummary,
-                 metaData = plpData$metaData,
-                 populationSettings = attr(population, 'metaData'),
-                 outcomeId=outcomeId,# can use populationSettings$outcomeId?
-                 cohortId=cohortId,
-                 varImp = NULL,
-                 trainingTime=comp,
-                 covariateMap=result$map
-  )
-  class(result) <- 'plpModel'
-  attr(result, 'type') <- 'madeup'
-  attr(result, 'predictionType') <- 'binary'
-  return(result)
-    
-}
-

You could make the fitMadeUp function cleaner by adding helper functions in the MadeUp.R file that are called by the fit function. Because the end of the fit function sets attr(result, 'type') <- 'madeup', we also need to make sure there is a predict.madeup function in Predict.R:

-
-
-

-Predict

-

The prediction function takes as input the plpModel returned by fit, a population and corresponding plpData. It returns a data.frame with the columns:

-
    -
  • rowId - the id for each person in the population
  • -
  • value - the predicted risk from the plpModel
  • -
-

If the population contains the columns outcomeCount and indexes, then these are also in the output.

-

For example:

-
-predict.madeup <- function(plpModel,population, plpData, ...){ 
-
-  # ************* code to do prediction for each rowId in population
-  # prediction <- code to do prediction here returning columns: rowId 
-  #               and value (predicted risk)
-  #**************
-  
-  prediction <- merge(population, prediction, by='rowId')
-  prediction <- prediction[,colnames(prediction)%in%c('rowId','outcomeCount',
-                                                      'indexes', 'value')] 
-  attr(prediction, "metaData") <- list(predictionType = "binary") 
-  return(prediction)
-  
-}
-
-
-
-

-Algorithm Example

-

Below, a fully functional algorithm example is given; however, we highly recommend you have a look at the available algorithms in the package.

-
-

-Set

-
setMadeUp <- function(a=1, b=2, seed=NULL){
-  # check a is valid positive value
-  if(missing(a)){
-    stop('a must be input')
-  }
-  if(!class(a)%in%c('numeric','integer')){
-    stop('a must be numeric')
-  }
-  if(a < 0){
-    stop('a must be positive')
-  }
-  # check b is numeric
-  if(missing(b)){
-    stop('b must be input')
-  }
-  if(!class(b)%in%c('numeric','integer')){
-    stop('b must be numeric')
-  }
-  
-  # now create list of all combinations:
-  result <- list(model='fitMadeUp', 
-                 param= split(expand.grid(a=a, 
-                                          b=b,
-                                          seed=ifelse(is.null(seed),'NULL', seed)),
-                              1:(length(a)*length(b)  )),
-                 name='Made Up Algorithm'
-  )
-  class(result) <- 'modelSettings' 
-  
-  return(result)
-    
-  
-}
-
-
-

-Fit

-
-fitMadeUp <- function(population, plpData, param, quiet=F,
-                        outcomeId, cohortId, ...){
-    if(!quiet)
-    writeLines('Training Made Up model')
-  
-  if(param[[1]]$seed!='NULL')
-    set.seed(param[[1]]$seed)
-  
-    # check plpData is coo format:
-  if(!'ffdf'%in%class(plpData$covariates) )
-    stop('This algorithm requires plpData in coo format')
-  
-  metaData <- attr(population, 'metaData')
-  if(!is.null(population$indexes))
-    population <- population[population$indexes>0,]
-  attr(population, 'metaData') <- metaData
-  
-  # convert data into sparse R Matrix:
-  result <- toSparseM(plpData,population,map=NULL)
-  data <- result$data
-  
-  data <- data[population$rowId,]
-  
-  # set test/train sets (for printing performance as it trains)
-  if(!quiet)
-    writeLines(paste0('Training made up model on train set containing ', nrow(population), 
-                      ' people with ',sum(population$outcomeCount>0), ' outcomes'))
-  start <- Sys.time()
-  
-  #============= STEP 1 ======================================
-  # pick the best hyper-params and then do final training on all data...
-  writeLines('train')
-  datas <- list(population=population, data=data)
-  param.sel <- lapply(param, function(x) do.call(made_up_model, c(x,datas)  ))
-  hyperSummary <- do.call(rbind, lapply(param.sel, function(x) x$hyperSum))
-  hyperSummary <- as.data.frame(hyperSummary)
-  hyperSummary$auc <- unlist(lapply(param.sel, function(x) x$auc)) 
-  param.sel <- unlist(lapply(param.sel, function(x) x$auc))
-  param <- param[[which.max(param.sel)]]
-  
-  # set this so you do a final model train 
-  param$final=T
-  
-  writeLines('final train')
-  trainedModel <- do.call(made_up_model, c(param,datas)  )$model
-  
-  comp <- Sys.time() - start
-  if(!quiet)
-    writeLines(paste0('Model Made Up trained - took:',  format(comp, digits=3)))
-  
-  # construct the standard output for a model:
-  result <- list(model = trainedModel,
-                 modelSettings = list(model='made_up', modelParameters=param),
-                 trainCVAuc = NULL,
-                 hyperParamSearch = hyperSummary,
-                 metaData = plpData$metaData,
-                 populationSettings = attr(population, 'metaData'),
-                 outcomeId=outcomeId,# can use populationSettings$outcomeId?
-                 cohortId=cohortId,
-                 varImp = NULL,
-                 trainingTime=comp,
-                 covariateMap=result$map
-  )
-  class(result) <- 'plpModel'
-  attr(result, 'type') <- 'madeup'
-  attr(result, 'predictionType') <- 'binary'
-  return(result)
-    
-}
-
-
-

-Helpers

-

In the fit function a helper function made_up_model is called; this is the function that trains a model given the data and population (where the population contains a column outcomeCount corresponding to the outcome). Both the data and population are ordered the same way:

-
-made_up_model <- function(data, population,
-                       a=1,b=1, final=F, ...){
-  
-  writeLines(paste('Training Made Up model with ',length(unique(population$indexes)),
-                   ' fold CV'))
-  if(!is.null(population$indexes) && final==F){
-    index_vect <- unique(population$indexes)
-    perform <- c()
-    
-    # create prediction matrix to store all predictions
-    predictionMat <- population
-    predictionMat$value <- 0
-    attr(predictionMat, "metaData") <- list(predictionType = "binary")
-    
-    for(index in 1:length(index_vect )){
-      writeLines(paste('Fold ',index, ' -- with ', sum(population$indexes!=index),
-                       'train rows'))
-      model <- madeup::model(x = data[population$indexes!=index,],
-                             y= population$outcomeCount[population$indexes!=index],
-                                  a=a, b=b)
-      
-      pred <- stats::predict(model, data[population$indexes==index,])
-      prediction <- population[population$indexes==index,]
-      prediction$value <- pred
-      attr(prediction, "metaData") <- list(predictionType = "binary")
-      aucVal <- computeAuc(prediction)
-      perform <- c(perform,aucVal)
-      
-      # add the fold predictions and compute AUC after loop
-      predictionMat$value[population$indexes==index] <- pred
-      
-     }
-    ##auc <- mean(perform) # want overall rather than mean
-    auc <- computeAuc(predictionMat)
-    
-    foldPerm <- perform
-  } else {
-    model <- madeup::model(x= data, 
-                                y= population$outcomeCount,
-                                a=a,b=b)
-    
-    pred <- stats::predict(model, data)
-    prediction <- population
-    prediction$value <- pred
-    attr(prediction, "metaData") <- list(predictionType = "binary") 
-    auc <- computeAuc(prediction)
-    foldPerm <- auc
-  }
-  
-  result <- list(model=model,
-                 auc=auc,
-                 hyperSum = unlist(list(a = a, b = b, fold_auc=foldPerm))
-  )
-  return(result)
-}
-
-
-

-Predict

-

The final step is to create a predict function for the model. This gets added to the Predict.R file. In the example above, the type was set with attr(result, 'type') <- 'madeup', so a predict.madeup function needs to be added to Predict.R. The predict function takes as input the plpModel returned by the fit function, the population to apply the model on and the plpData specifying the covariates of the population.

-
-predict.madeup <- function(plpModel,population, plpData, ...){ 
-  result <- toSparseM(plpData, population, map=plpModel$covariateMap)
-  data <- result$data[population$rowId,]
-  prediction <- data.frame(rowId=population$rowId, 
-                           value=stats::predict(plpModel$model, data)
-                           )
-  
-  prediction <- merge(population, prediction, by='rowId')
-  prediction <- prediction[,colnames(prediction)%in%
-                           c('rowId','outcomeCount','indexes', 'value')] # need to fix no index issue
-  attr(prediction, "metaData") <- list(predictionType = "binary") 
-  return(prediction)
-  
-}
-

As the madeup model uses the standard R prediction, it has the same prediction function as xgboost, so we could have skipped adding a new prediction function and instead set the type of the result returned by fitMadeUp to attr(result, 'type') <- 'xgboost'.
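For illustration, a minimal sketch of that alternative ending of fitMadeUp (everything else in the function unchanged):

# assumption: only the type attribute changes, so the existing xgboost
# prediction function is reused instead of adding predict.madeup
class(result) <- 'plpModel'
attr(result, 'type') <- 'xgboost'
attr(result, 'predictionType') <- 'binary'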

-
-
-
-

-Acknowledgments

-

Considerable work has been dedicated to providing the PatientLevelPrediction package.

-
-citation("PatientLevelPrediction")
-
## 
-## To cite PatientLevelPrediction in publications use:
-## 
-## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design
-## and implementation of a standardized framework to generate and evaluate
-## patient-level prediction models using observational healthcare data."
-## _Journal of the American Medical Informatics Association_, *25*(8),
-## 969-975. <URL: https://doi.org/10.1093/jamia/ocy032>.
-## 
-## A BibTeX entry for LaTeX users is
-## 
-##   @Article{,
-##     author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek},
-##     title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data},
-##     journal = {Journal of the American Medical Informatics Association},
-##     volume = {25},
-##     number = {8},
-##     pages = {969-975},
-##     year = {2018},
-##     url = {https://doi.org/10.1093/jamia/ocy032},
-##   }
-

Please reference this paper if you use the PLP Package in your work:

-

Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.

-

This work is supported in part through the National Science Foundation grant IIS 1251151.

-
-
- - - -
- - - - -
- - - - - - diff --git a/docs/articles/AddingCustomAlgorithms_files/header-attrs-2.7/header-attrs.js b/docs/articles/AddingCustomAlgorithms_files/header-attrs-2.7/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/AddingCustomAlgorithms_files/header-attrs-2.7/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/AddingCustomFeatureEngineering.html b/docs/articles/AddingCustomFeatureEngineering.html deleted file mode 100644 index ba6f86c94..000000000 --- a/docs/articles/AddingCustomFeatureEngineering.html +++ /dev/null @@ -1,309 +0,0 @@ - - - - - - - -Adding Custom Feature Engineering Functions • PatientLevelPrediction - - - - - - - - - - -
-
- - - - -
-
- - - - - -
-

-Introduction

-

This vignette describes how you can add your own custom function for feature engineering in the Observational Health Data Sciences and Informatics (OHDSI) PatientLevelPrediction package. This vignette assumes you have read and are comfortable with building single patient level prediction models as described in the BuildingPredictiveModels vignette.

-

We invite you to share your new feature engineering functions with the OHDSI community through our GitHub repository.

-
-
-

-Feature Engineering Function Code Structure

-

To make a custom feature engineering function that can be used within PatientLevelPrediction you need to write two different functions: the ‘create’ function and the ‘implement’ function.

-

The ‘create’ function, e.g., create<FeatureEngineeringFunctionName>, takes the parameters of the feature engineering ‘implement’ function as input, checks that they are valid, and outputs them as a list of class ‘featureEngineeringSettings’ with the ‘fun’ attribute specifying the ‘implement’ function to call.

-

The ‘implement’ function, e.g., implement<FeatureEngineeringFunctionName>, must take as input:
  • trainData - a list containing:
    - covariateData: the plpData$covariateData restricted to the training patients
    - labels: a data frame that contains rowId (patient identifier) and outcomeCount (the class labels)
    - folds: a data.frame that contains rowId (patient identifier) and index (the cross validation fold)
  • featureEngineeringSettings - the output of your create<FeatureEngineeringFunctionName>

-

The ‘implement’ function can then do any manipulation of the trainData (adding new features or removing features) but must output a trainData object containing the new covariateData, labels and folds for the training data patients.
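A minimal sketch of this create/implement contract, using the hypothetical names createMyFeatureEngineering and implementMyFeatureEngineering (the age spline example below fills this in concretely):

createMyFeatureEngineering <- function(settingA = 1){
  # validate the inputs here...

  # collect the inputs needed by the 'implement' function
  featureEngineeringSettings <- list(settingA = settingA)

  # the 'fun' attribute names the 'implement' function to call
  attr(featureEngineeringSettings, "fun") <- "implementMyFeatureEngineering"
  class(featureEngineeringSettings) <- "featureEngineeringSettings"
  return(featureEngineeringSettings)
}

implementMyFeatureEngineering <- function(trainData, featureEngineeringSettings){
  # use featureEngineeringSettings plus trainData$covariateData,
  # trainData$labels and trainData$folds to add or remove covariates here...

  # must return the (possibly modified) trainData object
  return(trainData)
}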

-
-
-

-Example

-

Let’s consider the situation where we wish to create an age spline feature. To make this custom feature engineering function we need to write the ‘create’ and ‘implement’ R functions.

-
-

-Create function

-

Our age spline feature function will create a new feature using the plpData$cohorts ageYear column. We will implement a restricted cubic spline that requires specifying the number of knots. Therefore, the inputs for this are:
  • knots - an integer/double specifying the number of knots

-
-createAgeSpine <- function(
-                     knots = 5
-                     ){
-  
-  # add input checks
-  checkIsClass(knots, c('numeric','integer'))
-  checkHigher(knots,0)
-  
-  # create list of inputs to implement function
-  featureEngineeringSettings <- list(
-    knots = knots
-    )
-  
-  # specify the function that will implement the sampling
-  attr(featureEngineeringSettings, "fun") <- "implementAgeSpine"
-
-  # make sure the object returned is of class "featureEngineeringSettings"
-  class(featureEngineeringSettings) <- "featureEngineeringSettings"
-  return(featureEngineeringSettings)
-  
-}
-

We now need to create the ‘implement’ function implementAgeSpine()

-
-
-

-Implement function

-

All ‘implement’ functions must take as input the trainData and the featureEngineeringSettings (this is the output of the ‘create’ function). They must return a trainData object containing the new covariateData, labels and folds.

-

In our example, the createAgeSpine() will return a list with ‘knots’. The featureEngineeringSettings therefore contains this.

-
-implementAgeSpine <- function(trainData, featureEngineeringSettings){
-
-  # currently not used
-  knots <- featureEngineeringSettings$knots
-  
-  
-  # age in in trainData$labels as ageYear
-  ageData <- trainData$labels
-  
-  # now implement the code to do your desired feature engineering
-  
-  data <- Matrix::sparseMatrix(
-    i = 1:length(ageData$rowId),
-    j = rep(1, length(ageData$rowId)),
-    x = ageData$ageYear,
-    dims=c(length(ageData$rowId),1)
-  )
-  
-  data <- as.matrix(data)
-  x <- data[,1]
-  y <- ageData$outcomeCount
-  
-mRCS <- rms::ols(
-  y~rms::rcs(x, 
-             stats::quantile(
-               x, 
-               c(0, .05, .275, .5, .775, .95, 1),
-               include.lowest = TRUE
-               )
-             )
-  )
-
-newData <- data.frame(
-  rowId = ageData$rowId,
-  covariateId = 2002,
-  covariateValue = mRCS$fitted.values
-  )
-
-# add new data
-Andromeda::appendToTable(tbl = trainData$covariateData$covariates, 
-                         data = newData)
-  
-  # return the updated trainData
-  return(trainData)
-}
-
-
-
-

-Acknowledgments

-

Considerable work has been dedicated to providing the PatientLevelPrediction package.

-
-citation("PatientLevelPrediction")
-
## 
-## To cite PatientLevelPrediction in publications use:
-## 
-## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design
-## and implementation of a standardized framework to generate and evaluate
-## patient-level prediction models using observational healthcare data."
-## _Journal of the American Medical Informatics Association_, *25*(8),
-## 969-975. <URL: https://doi.org/10.1093/jamia/ocy032>.
-## 
-## A BibTeX entry for LaTeX users is
-## 
-##   @Article{,
-##     author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek},
-##     title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data},
-##     journal = {Journal of the American Medical Informatics Association},
-##     volume = {25},
-##     number = {8},
-##     pages = {969-975},
-##     year = {2018},
-##     url = {https://doi.org/10.1093/jamia/ocy032},
-##   }
-

Please reference this paper if you use the PLP Package in your work:

-

Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.

-

This work is supported in part through the National Science Foundation grant IIS 1251151.

-
-
- - - -
- - - - -
- - - - - - diff --git a/docs/articles/AddingCustomFeatureEngineering_files/header-attrs-2.11/header-attrs.js b/docs/articles/AddingCustomFeatureEngineering_files/header-attrs-2.11/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/AddingCustomFeatureEngineering_files/header-attrs-2.11/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/AddingCustomModels.html b/docs/articles/AddingCustomModels.html deleted file mode 100644 index f588c269e..000000000 --- a/docs/articles/AddingCustomModels.html +++ /dev/null @@ -1,643 +0,0 @@ - - - - - - - -Adding Custom Patient-Level Prediction Algorithms • PatientLevelPrediction - - - - - - - - - - -
-
- - - - -
-
- - - - - -
-

-Introduction

-

This vignette describes how you can add your own custom algorithms in the Observational Health Data Sciences and Informatics (OHDSI) PatientLevelPrediction package. This allows you to fully leverage the OHDSI PatientLevelPrediction framework for model development and validation. This vignette assumes you have read and are comfortable with building single patient level prediction models as described in the BuildingPredictiveModels vignette.

-

We invite you to share your new algorithms with the OHDSI community through our GitHub repository.

-
-
-

-Algorithm Code Structure

-

Each algorithm in the package should be implemented in its own <Name>.R file, e.g. KNN.R, containing a set<Name> function, a fit<Name> function and a predict<Name> function. Occasionally the fit and prediction functions may be reused (if using an R classifier see RClassifier.R or if using a scikit-learn classifier see SklearnClassifier.R). We will now describe each of these functions in more detail below.

-
-

-Set

-

The set<Name> is a function that takes as input the different hyper-parameter values to do a grid search when training. The output of the functions needs to be a list as class modelSettings containing:

-
    -
  • param - all the combinations of the hyper-parameter values input
  • -
  • fitFunction - a string specifying what function to call to fit the model
  • -
-

The param object can have a settings attribute containing any extra settings. For example, to specify the model name and the seed used for reproducibility:

-
-attr(param, 'settings') <- list(
-  seed = 12,
-  modelName = "Special classifier"
-  )
-

For example, if you were adding a model called madeUp that has two hyper-parameters then the set function should be:

-
-setMadeUp <- function(a=c(1,4,10), b=2, seed=NULL){
-  # add input checks here...
-  
-  param <- split(
-    expand.grid(
-      a=a, 
-      b=b
-    ),
-    1:(length(a)*length(b))
-    )
-  
-  attr(param, 'settings') <- list(
-    modelName = "Made Up",
-    requiresDenseMatrix = TRUE,
-    seed = seed
-    )
-  
-  # now create list of all combinations:
-  result <- list(
-    fitFunction = 'fitMadeUp', # this will be called to train the made up model
-    param = param
-  )
-  class(result) <- 'modelSettings' 
-  
-  return(result)
-}
-
-
-

-Fit

-

This function should train your custom model for each parameter entry, pick the best parameters and train a final model for that setting.

-

The fit<Model> should have as inputs:

-
    -
  • trainData - a list containing the covariateData, labels and folds for the training population
  • -
  • param - the hyper-parameters as a list of all combinations
  • -
  • search - the type of hyper-parameter search
  • -
  • analysisId - an identifier for the analysis
  • -
-

The fit function should return a list of class plpModel with the following objects:

-
    -
  • model - a trained model (or location of the model if it is not an R object)
  • -
  • prediction - a data.frame object with the trainData$labels plus an extra column with the name ‘value’ corresponding to the predicted risk of having the outcome during the time-at-risk.
  • -
  • settings - a list containing: -
      -
    • plpDataSettings - the plpData settings e.g., attr(trainData, “metaData”)$plpDataSettings
    • -
    • covariateSettings - the covariate settings e.g., attr(trainData, “metaData”)$covariateSettings
    • -
    • populationSettings - the population settings e.g., attr(trainData, “metaData”)$populationSettings,
    • -
    • featureEngineering - the feature engineering settings e.g., attr(trainData$covariateData, "metaData")$featureEngineering,
    • -
    • tidyCovariates - the preprocessing settings e.g., attr(trainData$covariateData, "metaData")$tidyCovariateDataSettings,
    • -
    • requireDenseMatrix - does the model require a dense matrix? e.g., attr(param, ‘settings’)$requiresDenseMatrix,
    • -
    • modelSettings = a list containing: model (model name), param (the hyper-parameter search list), finalModelParameters (the final model hyper-parameters), extraSettings (any extra settings)
    • -
    • splitSettings - the split settings e.g., attr(trainData, “metaData”)$splitSettings,
    • -
    • sampleSettings - the sample settings e.g., attr(trainData, “metaData”)$sampleSettings
    • -
    -
  • -
  • trainDetails - a list containing: -
      -
    • analysisId - the identifier for the analysis
    • -
    • cdmDatabaseSchema - the database used to develop the model
    • -
    • outcomeId - the outcome id
    • -
    • cohortId - the target population id
    • -
    • attrition - the attrition
    • -
    • trainingTime - how long it took to train the model
    • -
    • trainingDate - date of model training
    • -
    • hyperParamSearch - the hyper-parameter search used to train the model
    • -
    -
  • -
  • covariateImportance - a data.frame containing the columns ‘covariateId’, ‘covariateValue’ (the variable importance) and ‘columnId’ (the column number that the variable need to be mapped to when implementing the model)
  • -
-

In addition, the plpModel requires two attributes:

-
    -
  • predictionFunction - the name of the function used to make predictions
  • -
  • modelType - whether the model is ‘binary’ or ‘survival’
  • -
-

For example, attr(result, 'predictionFunction') <- 'madeupPrediction' means that when the model is applied to new data, the ‘madeupPrediction’ function is called to make predictions. If this doesn't exist, then the model will fail. The other attribute is the modelType, e.g., attr(result, 'modelType') <- 'binary'; this is needed when evaluating the model to ensure the correct evaluation is applied. Currently the evaluation supports the ‘binary’ and ‘survival’ modelType.

-

Note: If a new modelType is desired, then the evaluation code within PatientLevelPrediction must be updated to specify how the new type is evaluated. This requires making edits to PatientLevelPrediction and then making a pull request to the PatientLevelPrediction GitHub. The evaluation cannot have one-off customization because the evaluation must be standardized to enable comparison across similar models.

-

A full example of a custom ‘binary’ classifier fit function is:

-
-fitMadeUp <- function(trainData, param, search, analysisId){
-  
-  # **************** code to train the model here
-  # trainedModel <- this code should apply each hyper-parameter combination   
-  # (param[[i]]) using the specified search (e.g., cross validation)
-  #                 then pick out the best hyper-parameter setting
-  #                 and finally fit a model on the whole train data using the 
-  #                 optimal hyper-parameter settings
-  # ****************
-  
-  # **************** code to apply the model to trainData
-  # prediction <- code to apply trainedModel to trainData
-  # ****************
-  
-  # **************** code to get variable importance (if possible)
-  # varImp <- code to get importance of each variable in trainedModel
-  # ****************
-  
-  
-  # construct the standard output for a model:
-  result <- list(model = trainedModel,
-                 prediction = prediction, # the train and maybe the cross validation predictions for the trainData
-    settings = list(
-      plpDataSettings = attr(trainData, "metaData")$plpDataSettings,
-      covariateSettings = attr(trainData, "metaData")$covariateSettings,
-      populationSettings = attr(trainData, "metaData")$populationSettings,
-      featureEngineering = attr(trainData$covariateData, "metaData")$featureEngineering,
-      tidyCovariates = attr(trainData$covariateData, "metaData")$tidyCovariateDataSettings, 
-      requireDenseMatrix = attr(param, 'settings')$requiresDenseMatrix,
-      modelSettings = list(
-        model = attr(param, 'settings')$modelName, # the model name
-        param = param,
-        finalModelParameters = param[[bestInd]], # best hyper-parameters
-        extraSettings = attr(param, 'settings')
-      ),
-      splitSettings = attr(trainData, "metaData")$splitSettings,
-      sampleSettings = attr(trainData, "metaData")$sampleSettings
-    ),
-    
-    trainDetails = list(
-      analysisId = analysisId,
-      cdmDatabaseSchema = attr(trainData, "metaData")$cdmDatabaseSchema,
-      outcomeId = attr(trainData, "metaData")$outcomeId,
-      cohortId = attr(trainData, "metaData")$cohortId,
-      attrition = attr(trainData, "metaData")$attrition, 
-      trainingTime = timeToTrain, # how long it took to train the model
-      trainingDate = Sys.Date(),
-      hyperParamSearch = hyperSummary # the hyper-parameters and performance data.frame
-    ),
-    covariateImportance = merge(trainData$covariateData$covariateRef, varImp, by='covariateId') # add variable importance to covariateRef if possible
-  )
-  class(result) <- 'plpModel'
-  attr(result, 'predictionFunction') <- 'madeupPrediction'
-  attr(result, 'modelType') <- 'binary'
-  return(result)
-    
-}
-

You could make the fitMadeUp function cleaner by adding helper functions in the MadeUp.R file that are called by the fit function (for example, a function to run cross validation). It is important to ensure there is a valid prediction function (the one specified by attr(result, 'predictionFunction') <- 'madeupPrediction' is madeupPrediction()), as specified below.

-
-
-

-Predict

-

The prediction function takes as input the plpModel returned by fit, new data and a corresponding cohort. It returns a data.frame with the same columns as cohort but with an additional column:

-
    -
  • value - the predicted risk from the plpModel for each patient in the cohort
  • -
-

For example:

-
-madeupPrediction <- function(plpModel, data, cohort){ 
-
-  # ************* code to do prediction for each rowId in cohort
-  # predictionValues <- code to do prediction here returning the predicted risk
-  #               (value) for each rowId in cohort 
-  #**************
-  
-  prediction <- merge(cohort, predictionValues, by='rowId')
-  attr(prediction, "metaData") <- list(modelType = attr(plpModel, 'modelType')) 
-  return(prediction)
-  
-}
-
-
-
-

-Algorithm Example

-

Below, a fully functional algorithm example is given; however, we highly recommend you have a look at the available algorithms in the package (see GradientBoostingMachine.R for the set function and RClassifier.R for the fit and prediction functions for R classifiers).

-
-

-Set

-
-setMadeUp <- function(a=c(1,4,6), b=2, seed=NULL){
-  # add input checks here...
-  
-  if(is.null(seed)){
-    seed <- sample(100000,1)
-  }
-  
-  param <- split(
-    expand.grid(
-      a=a, 
-      b=b
-    ),
-    1:(length(a)*length(b))
-    )
-  
-  attr(param, 'settings') <- list(
-    modelName = "Made Up",
-    requiresDenseMatrix = TRUE,
-    seed = seed
-    )
-  
-  # now create list of all combinations:
-  result <- list(
-    fitFunction = 'fitMadeUp', # this will be called to train the made up model
-    param = param
-  )
-  class(result) <- 'modelSettings' 
-  
-  return(result)
-}
-
-
-

-Fit

-
-fitMadeUp <- function(trainData, param, search, analysisId){
-
-  # set the seed for reproducibility
-  set.seed(attr(param, 'settings')$seed)
-  
-  # record the start time so trainingTime can be computed at the end
-  start <- Sys.time()
-  
-  # add folds to labels:
-  trainData$labels <- merge(trainData$labels, trainData$folds, by= 'rowId')
-  # convert data into sparse R Matrix:
-  mappedData <- toSparseM(trainData,map=NULL)
-  matrixData <- mappedData$dataMatrix
-  labels <- mappedData$labels
-  covariateRef <- mappedData$covariateRef
-
-  #============= STEP 1 ======================================
-  # pick the best hyper-params and then do final training on all data...
-  writeLines('Cross validation')
-  param.sel <- lapply(
-    param, 
-    function(x){
-      do.call(
-        made_up_model, 
-        list(
-          param = x, 
-          final = F, 
-          data = matrixData, 
-          labels = labels
-          )  
-      )
-      }
-    )
-  hyperSummary <- do.call(rbind, lapply(param.sel, function(x) x$hyperSum))
-  hyperSummary <- as.data.frame(hyperSummary)
-  hyperSummary$auc <- unlist(lapply(param.sel, function(x) x$auc)) 
-  cvAuc <- unlist(lapply(param.sel, function(x) x$auc))
-  bestInd <- which.max(cvAuc)
-  
-  #get cross val prediction for best hyper-parameters
-  prediction <- param.sel[[bestInd]]$prediction
-  prediction$evaluationType <- 'CV'
-  
-  writeLines('final train')
-  finalResult <- do.call(
-    made_up_model, 
-    list(
-      param = param[[bestInd]], 
-      final = T, 
-      data = matrixData, 
-      labels = labels
-      )  
-    )
-  
-  trainedModel <- finalResult$model
-  
-  # prediction risk on training data:
-  finalResult$prediction$evaluationType <- 'Train'
-  
-  # get CV and train prediction
-  prediction <- rbind(prediction, finalResult$prediction)
-  
-  varImp <- covariateRef %>% dplyr::collect()
-  # no feature importance available
-  varImp$covariateValue <- 0 
-  
- timeToTrain <- Sys.time() - start
-
-  # construct the standard output for a model:
-  result <- list(model = trainedModel,
-                 prediction = prediction, 
-    settings = list(
-      plpDataSettings = attr(trainData, "metaData")$plpDataSettings,
-      covariateSettings = attr(trainData, "metaData")$covariateSettings,
-      populationSettings = attr(trainData, "metaData")$populationSettings,
-      featureEngineering = attr(trainData$covariateData, "metaData")$featureEngineering,
-      tidyCovariates = attr(trainData$covariateData, "metaData")$tidyCovariateDataSettings, 
-      requireDenseMatrix = attr(param, 'settings')$requiresDenseMatrix,
-      modelSettings = list(
-        model = attr(param, 'settings')$modelName, # the model name
-        param = param,
-        finalModelParameters = param[[bestInd]], # best hyper-parameters
-        extraSettings = attr(param, 'settings')
-      ),
-      splitSettings = attr(trainData, "metaData")$splitSettings,
-      sampleSettings = attr(trainData, "metaData")$sampleSettings
-    ),
-    
-    trainDetails = list(
-      analysisId = analysisId,
-      cdmDatabaseSchema = attr(trainData, "metaData")$cdmDatabaseSchema,
-      outcomeId = attr(trainData, "metaData")$outcomeId,
-      cohortId = attr(trainData, "metaData")$cohortId,
-      attrition = attr(trainData, "metaData")$attrition, 
-      trainingTime = timeToTrain, # how long it took to train the model
-      trainingDate = Sys.Date(),
-      hyperParamSearch = hyperSummary # the hyper-parameters and performance data.frame
-    ),
-    covariateImportance = varImp
-  )
-  class(result) <- 'plpModel'
-  attr(result, 'predictionFunction') <- 'madeupPrediction'
-  attr(result, 'modelType') <- 'binary'
-  return(result)
-    
-}
-
-
-

-Helpers

-

In the fit function a helper function made_up_model is called; this is the function that trains a model given the data, labels and hyper-parameters.

-
-made_up_model <- function(param, data, final=F, labels){
-  
-  if(final==F){
-    # add value column to store all predictions
-    labels$value <- rep(0, nrow(labels))
-    attr(labels, "metaData") <- list(modelType = "binary")
-    
-    foldPerm <- c() # this holds CV aucs
-    for(index in 1:max(labels$index)){
-      model <- madeup::model(
-        x = data[labels$index!=index,], # remove left out fold
-        y = labels$outcomeCount[labels$index!=index],
-        a = param$a, 
-        b = param$b
-      )
-      
-      # predict on left out fold
-      pred <- stats::predict(model, data[labels$index==index,])
-      labels$value[labels$index==index] <- pred
-      
-      # calculate auc on the held out fold  
-      aucVal <- computeAuc(labels[labels$index==index,])
-      foldPerm <- c(foldPerm, aucVal)    
-    }
-    auc <- computeAuc(labels) # overall AUC
-
-  } else {
-    model <- madeup::model(
-      x = data, 
-      y = labels$outcomeCount,
-      a = param$a,
-      b = param$b
-      )
-    
-    pred <- stats::predict(model, data)
-    labels$value <- pred
-    attr(labels, "metaData") <- list(modelType = "binary") 
-    auc <- computeAuc(labels)
-    foldPerm <- auc
-  }
-  
-  result <- list(
-    model = model,
-    auc = auc,
-    prediction = labels,
-    hyperSum = c(a = param$a, b = param$b, fold_auc = foldPerm)
-  )
-  
-  return(result)
-}
-
-
-

-Predict

-

The final step is to create a predict function for the model. In the example above the prediction function was set with attr(result, 'predictionFunction') <- 'madeupPrediction', so a madeupPrediction function is required when applying the model. The predict function needs to take as input the plpModel returned by the fit function, new data to apply the model on and the cohort specifying the patients of interest to make the prediction for.

-
-madeupPrediction <- function(plpModel, data , cohort){ 
-  
-  if(class(data) == 'plpData'){
-    # convert
-    matrixObjects <- toSparseM(
-      plpData = data, 
-      cohort = cohort,
-      map = plpModel$covariateImportance %>% 
-        dplyr::select(.data$columnId, .data$covariateId)
-    )
-    
-    newData <- matrixObjects$dataMatrix
-    cohort <- matrixObjects$labels
-    
-  }else{
-    newData <- data
-  }
-  
-  if(class(plpModel) == 'plpModel'){
-    model <- plpModel$model
-  } else{
-    model <- plpModel
-  }
-  
-  cohort$value <- stats::predict(model, newData)
-  
-  # fix the rowIds to be the old ones
-  # now use the originalRowId and remove the matrix rowId
-  cohort <- cohort %>% 
-    dplyr::select(-.data$rowId) %>%
-    dplyr::rename(rowId = .data$originalRowId)
-  
-  attr(cohort, "metaData") <- list(modelType = attr(plpModel, 'modelType')) 
-  return(cohort)
-  
-}
-

As the madeup model uses the standard R predict function, it shares its prediction logic with xgboost, so instead of adding a new prediction function we could have reused the existing one by setting the predictionFunction of the result returned by fitMadeUp to attr(result, 'predictionFunction') <- 'predictXgboost'.

-
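For illustration, a minimal sketch of that alternative (assuming the trained madeup model can be scored with stats::predict in the same way as an xgboost model) would change only the attributes set at the end of fitMadeUp:

-
-  # ... end of fitMadeUp, reusing the existing xgboost prediction function
-  class(result) <- 'plpModel'
-  attr(result, 'predictionFunction') <- 'predictXgboost'
-  attr(result, 'modelType') <- 'binary'
-  return(result)
-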
-
-
-

-Acknowledgments

-

Considerable work has been dedicated to provide the PatientLevelPrediction package.

-
-citation("PatientLevelPrediction")
-
## 
-## To cite PatientLevelPrediction in publications use:
-## 
-## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design
-## and implementation of a standardized framework to generate and evaluate
-## patient-level prediction models using observational healthcare data."
-## _Journal of the American Medical Informatics Association_, *25*(8),
-## 969-975. <URL: https://doi.org/10.1093/jamia/ocy032>.
-## 
-## A BibTeX entry for LaTeX users is
-## 
-##   @Article{,
-##     author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek},
-##     title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data},
-##     journal = {Journal of the American Medical Informatics Association},
-##     volume = {25},
-##     number = {8},
-##     pages = {969-975},
-##     year = {2018},
-##     url = {https://doi.org/10.1093/jamia/ocy032},
-##   }
-

Please reference this paper if you use the PLP Package in your work:

-

Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.

-

This work is supported in part through the National Science Foundation grant IIS 1251151.

-
-
- - - - - - diff --git a/docs/articles/AddingCustomModels_files/header-attrs-2.11/header-attrs.js b/docs/articles/AddingCustomModels_files/header-attrs-2.11/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/AddingCustomModels_files/header-attrs-2.11/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/AddingCustomSamples.html b/docs/articles/AddingCustomSamples.html deleted file mode 100644 index c6a73fa11..000000000 --- a/docs/articles/AddingCustomSamples.html +++ /dev/null @@ -1,308 +0,0 @@ - - - - - - - -Adding Custom Sampling Functions • PatientLevelPrediction - - - - - - - - - - -
-
- - - - -
-
- - - - - -
-

-Introduction

-

This vignette describes how you can add your own custom function for sampling the target population in the Observational Health Data Sciences and Informatics (OHDSI) PatientLevelPrediction package. This vignette assumes you have read and are comfortable with building single patient level prediction models as described in the BuildingPredictiveModels vignette.

-

We invite you to share your new sample functions with the OHDSI community through our GitHub repository.

-
-
-

-Sample Function Code Structure

-

To make a sampling function that can be used within PatientLevelPrediction you need to write two different functions: the ‘create’ function and the ‘implement’ function.

-

The ‘create’ function, e.g., create<SampleFunctionName>, takes the parameters of the sample ‘implement’ function as input, checks these are valid and outputs these as a list of class ‘sampleSettings’ with the ‘fun’ attribute specifying the ‘implement’ function to call.

-

The ‘implement’ function, e.g., implement<SampleFunctionName>, must take as input:

  • trainData - a list containing:
      • covariateData: the plpData$covariateData restricted to the training patients
      • labels: a data frame that contains rowId (patient identifier) and outcomeCount (the class labels)
      • folds: a data.frame that contains rowId (patient identifier) and index (the cross validation fold)
  • sampleSettings - the output of your create<SampleFunctionName>

-

The ‘implement’ function can then do any manipulation of the trainData (such as undersampling or oversampling) but must output a trainData object containing the covariateData, labels and folds for the new training data sample.

-
-
-

-Example

-

Let’s consider the situation where we wish to take a random sample of the training data population. To make this custom sampling function we need to write the ‘create’ and ‘implement’ R functions.

-
-

-Create function

-

Our random sampling function will randomly sample n patients from the trainData. Therefore, the inputs for this are:

  • n - an integer/double specifying the number of patients to sample
  • sampleSeed - an integer/double specifying the seed for reproducibility

-
-createRandomSampleSettings <- function(
-                     n = 10000,
-                     sampleSeed = sample(10000,1)
-                     ){
-  
-  # add input checks
-  checkIsClass(n, c('numeric','integer'))
-  checkHigher(n,0)
-  checkIsClass(sampleSeed, c('numeric','integer'))
-  
-  # create list of inputs to implement function
-  sampleSettings <- list(
-    n = n,
-    sampleSeed  = sampleSeed 
-    )
-  
-  # specify the function that will implement the sampling
-  attr(sampleSettings, "fun") <- "implementRandomSampleSettings"
-
-  # make sure the object returned is of class "sampleSettings"
-  class(sampleSettings) <- "sampleSettings"
-  return(sampleSettings)
-  
-}
-

We now need to create the ‘implement’ function implementRandomSampleSettings()

-
-
-

-Implement function

-

All ‘implement’ functions must take as input the trainData and the sampleSettings (this is the output of the ‘create’ function). They must return a trainData object containing the covariateData, labels and folds.

-

In our example, the createRandomSampleSettings() will return a list with ‘n’ and ‘sampleSeed’. The sampleSettings therefore contains these.

-
-implementRandomSampleSettings <- function(trainData, sampleSettings){
-
-  n <- sampleSettings$n
-  sampleSeed <- sampleSettings$sampleSeed
-  
-  if(n > nrow(trainData$labels)){
-    stop('Sample n bigger than training population')
-  }
-  
-  # set the seed for the randomization
-  set.seed(sampleSeed)
-  
-  # now implement the code to do your desired sampling
-  
-  sampleRowIds <- sample(trainData$labels$rowId, n)
-  
-  sampleTrainData <- list()
-  
-  sampleTrainData$labels <- trainData$labels %>% 
-    dplyr::filter(.data$rowId %in% sampleRowIds) %>% 
-    dplyr::collect()
-  
-  sampleTrainData$folds <- trainData$folds %>% 
-    dplyr::filter(.data$rowId %in% sampleRowIds) %>% 
-    dplyr::collect()
-  
-  sampleTrainData$covariateData <- Andromeda::andromeda()
-  sampleTrainData$covariateData$covariateRef <-trainData$covariateData$covariateRef
-  sampleTrainData$covariateData$covariates <- trainData$covariateData$covariates %>% dplyr::filter(.data$rowId %in% sampleRowIds)
-  
-  #update metaData$populationSize 
-  metaData <- attr(trainData$covariateData, 'metaData')
-  metaData$populationSize = n
-  attr(sampleTrainData$covariateData, 'metaData') <- metaData
-  
-  # make the covariateData the correct class
-  class(sampleTrainData$covariateData) <- 'CovariateData'
-  
-  # return the updated trainData
-  return(sampleTrainData)
-}
-
-
-
-

-Acknowledgments

-

Considerable work has been dedicated to provide the PatientLevelPrediction package.

-
-citation("PatientLevelPrediction")
-
## 
-## To cite PatientLevelPrediction in publications use:
-## 
-## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design
-## and implementation of a standardized framework to generate and evaluate
-## patient-level prediction models using observational healthcare data."
-## _Journal of the American Medical Informatics Association_, *25*(8),
-## 969-975. <URL: https://doi.org/10.1093/jamia/ocy032>.
-## 
-## A BibTeX entry for LaTeX users is
-## 
-##   @Article{,
-##     author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek},
-##     title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data},
-##     journal = {Journal of the American Medical Informatics Association},
-##     volume = {25},
-##     number = {8},
-##     pages = {969-975},
-##     year = {2018},
-##     url = {https://doi.org/10.1093/jamia/ocy032},
-##   }
-

Please reference this paper if you use the PLP Package in your work:

-

Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.

-

This work is supported in part through the National Science Foundation grant IIS 1251151.

-
-
- - - - - - diff --git a/docs/articles/AddingCustomSamples_files/header-attrs-2.11/header-attrs.js b/docs/articles/AddingCustomSamples_files/header-attrs-2.11/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/AddingCustomSamples_files/header-attrs-2.11/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/AddingCustomSplitting.html b/docs/articles/AddingCustomSplitting.html deleted file mode 100644 index 24683120c..000000000 --- a/docs/articles/AddingCustomSplitting.html +++ /dev/null @@ -1,274 +0,0 @@ - - - - - - - -Adding Custom Data Splitting Functions • PatientLevelPrediction - - - - - - - - - - -
-
- - - - -
-
- - - - - -
-

-Introduction

-

This vignette describes how you can add your own custom function for splitting the labelled data into training data and validation data in the Observational Health Data Sciences and Informatics (OHDSI) PatientLevelPrediction package. This vignette assumes you have read and are comfortable with building single patient level prediction models as described in the BuildingPredictiveModels vignette.

-

We invite you to share your new data splitting functions with the OHDSI community through our GitHub repository.

-
-
-

-Data Splitting Function Code Structure

-

To make a custom data splitting function that can be used within PatientLevelPrediction you need to write two different functions: the ‘create’ function and the ‘implement’ function.

-

The ‘create’ function, e.g., create<DataSplittingFunction>, takes the parameters of the data splitting ‘implement’ function as input, checks these are valid and outputs these as a list of class ‘splitSettings’ with the ‘fun’ attribute specifying the ‘implement’ function to call.

-

The ‘implement’ function, e.g., implement<DataSplittingFunction>, must take as input:

  • population - a data frame that contains rowId (patient identifier), ageYear, gender and outcomeCount (the class labels)
  • splitSettings - the output of your create<DataSplittingFunction>

-

The ‘implement’ function then needs to implement code to assign each rowId in the population to a splitId (<0 means in the test data, 0 means not used and >0 means in the train data, with the value defining the cross validation fold).

-
-
-

-Example

-

Let’s consider the situation where we wish to create a split where females are used to train a model but males are used to evaluate the model.

-
-

-Create function

-

Our gender split function requires a single parameter, the number of folds used in cross validation. Therefore create a function with a single nfold input that returns a list of class ‘splitSettings’ with the ‘fun’ attribute specifying the ‘implement’ function we will use.

-
-createGenderSplit <- function(nfold)
-  {
-  
-  # create list of inputs to implement function
-  splitSettings <- list(nfold = nfold)
-  
-  # specify the function that will implement the splitting
-  attr(splitSettings, "fun") <- "implementGenderSplit"
-
-  # make sure the object returned is of class "splitSettings"
-  class(splitSettings) <- "splitSettings"
-  return(splitSettings)
-  
-}
-

We now need to create the ‘implement’ function implementGenderSplit()

-
-
-

-Implement function

-

All ‘implement’ functions for data splitting must take as input the population and the splitSettings (this is the output of the ‘create’ function). They must return a data.frame containing columns: rowId and index.

-

The index is used to determine whether the patient (identified by the rowId) is in the test set (index = -1) or train set (index > 0). In the train set, the value corresponds to the cross validation fold. For example, if rowId 2 is assigned index 5, then it means the patient with the rowId 2 is used to train the model and is in fold 5.

-
-implementGenderSplit <- function(population, splitSettings){
-
-  # find the people who are male:
-  males <- population$rowId[population$gender == 8507]
-  females <- population$rowId[population$gender == 8532]
-  
-  splitIds <- data.frame(
-    rowId = c(males, females),
-    index = c(
-      rep(-1, length(males)),
-      sample(1:splitSettings$nfold, length(females), replace = T)
-    )
-  )
-  
-  # return the updated trainData
-  return(splitIds)
-}
-
-
-
-

-Acknowledgments

-

Considerable work has been dedicated to provide the PatientLevelPrediction package.

-
-citation("PatientLevelPrediction")
-
## 
-## To cite PatientLevelPrediction in publications use:
-## 
-## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design
-## and implementation of a standardized framework to generate and evaluate
-## patient-level prediction models using observational healthcare data."
-## _Journal of the American Medical Informatics Association_, *25*(8),
-## 969-975. <URL: https://doi.org/10.1093/jamia/ocy032>.
-## 
-## A BibTeX entry for LaTeX users is
-## 
-##   @Article{,
-##     author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek},
-##     title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data},
-##     journal = {Journal of the American Medical Informatics Association},
-##     volume = {25},
-##     number = {8},
-##     pages = {969-975},
-##     year = {2018},
-##     url = {https://doi.org/10.1093/jamia/ocy032},
-##   }
-

Please reference this paper if you use the PLP Package in your work:

-

Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.

-

This work is supported in part through the National Science Foundation grant IIS 1251151.

-
-
- - - - - - diff --git a/docs/articles/AddingCustomSplitting_files/header-attrs-2.11/header-attrs.js b/docs/articles/AddingCustomSplitting_files/header-attrs-2.11/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/AddingCustomSplitting_files/header-attrs-2.11/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/AddingExistingModels.html b/docs/articles/AddingExistingModels.html deleted file mode 100644 index 6e493db0d..000000000 --- a/docs/articles/AddingExistingModels.html +++ /dev/null @@ -1,362 +0,0 @@ - - - - - - - -Implementing Existing Prediction Models using the OHDSI PatientLevelPrediction framework • PatientLevelPrediction - - - - - - - - - -
-
- - - -
-
- - - - - - -
-

-Introduction

-

This vignette describes how you can implement existing logistic regression models in the PatientLevelPrediction framework. This allows you, for example, to externally validate them at scale in the OHDSI data network.

-

As an example we are going to implement the CHADS2 model:

-

Gage BF, Waterman AD, Shannon W, Boechler M, Rich MW, Radford MJ. Validation of clinical classification schemes for predicting stroke: results from the National Registry of Atrial Fibrillation. JAMA. 2001 Jun 13;285(22):2864-70

-

To implement the model you need to create three tables: the model table, the covariate table, and the intercept table. The model table specifies the modelId (sequence number), the modelCovariateId (sequence number) and the covariateValue (beta for the covariate). The covariate table specifies the mapping between the covariates from the published model and the standard Patient Level Prediction framework covariates, i.e., it maps to a combination of an analysisId and a concept_id (see below). The intercept table specifies the intercept per modelId.

-
-
-

-Model implementation

-
-

-Define the model

-

The CHADS2 is a score based model with:

-
##   Points                        Covariate
-## 1      1         Congestive heart failure
-## 2      1                     Hypertension
-## 3      1                  Age >= 75 years
-## 4      1                Diabetes mellitus
-## 5      2 Stroke/transient ischemic attack
-

The model table should therefore be defined as:

-
##   modelId modelCovariateId covariateValue
-## 1       1                1              1
-## 2       1                2              1
-## 3       1                3              1
-## 4       1                4              1
-## 5       1                5              2
-

The covariateTable will then specify what standard covariates need to be included in the model.

-

In this case we choose the following Standard SNOMED concept_ids: 319835 for congestive heart failure, 316866 for hypertensive disorder, 201820 for diabetes, and 381591 for cerebrovascular disease. It is allowed to add multiple concept_ids as separate rows for the same modelCovariateId if concept sets are needed. These concept_ids can be found using the vocabulary search in ATLAS.

-

The Patient Level Prediction standard covariates are of the form: conceptid*1000 + analysisid. The analysisid specifies the domain of the covariate and its lookback window. Examples can be found here: https://github.com/OHDSI/FeatureExtraction/blob/master/inst/csv/PrespecAnalyses.csv

-
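To make this mapping concrete, the small sketch below computes the standard covariateId for congestive heart failure (concept_id 319835) combined with the long term condition occurrence analysis (analysis id 102) used later in this vignette:

-
-conceptId <- 319835   # congestive heart failure
-analysisId <- 102     # condition occurrence, long term window
-conceptId * 1000 + analysisId
-## [1] 319835102
-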

Our example of CHADS2 uses agegroup and conditions in the full history. Therefore we need to define the standard covariates using the FeatureExtraction::createCovariateSettings as follows:

-
library(PatientLevelPrediction)
-covSet <- FeatureExtraction::createCovariateSettings(useDemographicsAgeGroup = T,                             
-                                                     useConditionOccurrenceLongTerm = T,
-                                                     includedCovariateIds = c(),
-                                                     longTermStartDays = -9999, 
-                                                     endDays = 0)
-

In the above code we used useConditionOccurrenceLongTerm (these have an analysis id of 102) and defined longTermStartDays to be -9999 days relative to index (so we get the full history). We include the index date in our lookback period by specifying endDays = 0. The includedCovariateIds is set to an empty vector, but this will be updated when you run the next code to pick out the standard covariates of interest. As we picked analysis id 102, the standard covariate for anytime prior congestive heart failure is 319835102; the same logic follows for the other conditions, so the covariate table will be:

-
##   modelCovariateId covariateId
-## 1                1   319835102
-## 2                2   316866102
-## 3                3       15003
-## 4                3       16003
-## 5                3       17003
-## 6                3       18003
-## 7                3       19003
-## 8                4   201820102
-## 9                5   381591102
-

modelCovariateId 3 was age >= 75. As the standard covariate age groups are in 5 year groups, we needed to add the age groups 75-80, 80-85, 85-90, 90-95 and 95-100; these correspond to the covariateIds 15003, 16003, 17003, 18003 and 19003 respectively.

-

To create the tables in R for CHADS2 you need to make the following dataframes:

-
model_table <- data.frame(modelId = c(1,1,1,1,1),
-                          modelCovariateId = 1:5, 
-                          coefficientValue = c(1, 1, 1, 1, 2)
-                          )
-
-covariate_table <- data.frame(modelCovariateId = c(1,2,3,3,3,3,3,4,5),
-                              covariateId = c(319835102, 316866102, 
-                                            15003, 16003, 17003, 18003, 19003, 
-                                            201820102, 381591102)
-                              )
-
-interceptTable <-  data.frame(modelId = 1, 
-                              interceptValue = 0)
-
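As a quick sanity check of these tables (purely illustrative and not part of the package API), the score for a hypothetical patient who has hypertension and a prior stroke can be computed by summing the betas of the model covariates matched by the patient's standard covariates:

-
-# hypothetical patient with hypertension (316866102) and prior stroke (381591102)
-patientCovariateIds <- c(316866102, 381591102)
-
-# map the patient's standard covariates to model covariates and sum the betas
-matchedModelCovariateIds <- unique(
-  covariate_table$modelCovariateId[covariate_table$covariateId %in% patientCovariateIds]
-  )
-sum(model_table$coefficientValue[model_table$modelCovariateId %in% matchedModelCovariateIds]) +
-  interceptTable$interceptValue
-## [1] 3
-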
-
-

-Create the model

-

Now you have everything in place to actually create the existing model. First specify the current environment, as executing createExistingModelSql creates two functions for running the existing model in the specified environment. Next a few additional settings are needed: some models require an intercept, so there is an option for this (set it to 0 if an intercept isn’t needed), and the type specifies the final mapping (either logistic or linear/score); in our example we are calculating a score. We finally need to specify the analysisId for the newly created CHADS2 covariate.

-
e <- environment()
-PatientLevelPrediction::createExistingModelSql(modelTable = model_table, 
-                       modelNames = 'CHADS2', 
-                       interceptTable = data.frame(modelId = 1, interceptValue = 0),
-                       covariateTable = covariate_table, 
-                       type = 'score',
-                       analysisId = 112, covariateSettings = covSet, e = e)
-

Once run you will find two new functions in your environment:

-
  • createExistingmodelsCovariateSettings()
  • getExistingmodelsCovariateSettings()
-
-
-
-

-Run the model

-

Now you can use the functions you previously created to extract the existing model risk scores for a target population:

-
plpData <- PatientLevelPrediction::getPlpData(connectionDetails, 
-                      cdmDatabaseSchema = 'databasename.dbo',
-                      cohortId = 1,
-                      outcomeIds = 2, 
-                      cohortDatabaseSchema = 'databasename.dbo', 
-                      cohortTable =  'cohort' , 
-                      outcomeDatabaseSchema = 'databasename.dbo', 
-                      outcomeTable = 'cohort', 
-                      covariateSettings =  createExistingmodelsCovariateSettings(),
-                      sampleSize = 20000
-                      )
-

To implement and evaluate an existing model you can use the function:

-

PatientLevelPrediction::evaluateExistingModel()

-

with the following parameters:

-
  • modelTable - a data.frame containing the columns: modelId, covariateId and coefficientValue
  • covariateTable - a data.frame containing the columns: covariateId and standardCovariateId - this provides a set of standardCovariateId to define each model covariate.
  • interceptTable - a data.frame containing the columns modelId and interceptValue, or NULL if the model doesn’t have an intercept (equal to zero).
  • type - the type of model (either: score or logistic)
  • covariateSettings - this is used to determine the startDay and endDay for the standard covariates
  • customCovariates - a data.frame with the covariateId and sql to generate the covariate value.
  • riskWindowStart - the time at risk starts at target cohort start date + riskWindowStart
  • addExposureDaysToEnd - if true then the time at risk window ends at the cohort end date + riskWindowEnd rather than cohort start date + riskWindowEnd
  • riskWindowEnd - the time at risk ends at target cohort start/end date + riskWindowEnd
  • requireTimeAtRisk - whether to add a constraint on the number of days observed during the time at risk period when including people into the study
  • minTimeAtRisk - the minimum number of days of observation during the time at risk a target population person needs to be included
  • includeAllOutcomes - include outcomes even if they do not satisfy the minTimeAtRisk? (useful if the outcome is associated with death or rare)
  • removeSubjectsWithPriorOutcome - remove target population people who have the outcome prior to the time at risk period?
  • connectionDetails - the connection to the CDM database
-

Finally you need to add the settings for downloading the new data:

-
  • cdmDatabaseSchema
  • cohortDatabaseSchema
  • cohortTable
  • cohortId
  • outcomeDatabaseSchema
  • outcomeTable
  • outcomeId
  • oracleTempSchema
-

To run the external validation of an existing model where the target population are those in the cohort table with id 1 and the outcome is those in the cohort table with id 2, and we are looking to predict the first occurrence of the outcome 1 day to 365 days after the target cohort start date (assuming you have the modelTable, covariateTable and interceptTable in the format explained above):

-
# if the existing model uses gender and condition groups looking back 200 days:
-covSet <- FeatureExtraction::createCovariateSettings(useDemographicsGender = T,
-                                                     useConditionGroupEraMediumTerm = T, 
-                                                     mediumTermStartDays = -200)
-result <- evaluateExistingModel(modelTable = modelTable,
-                                covariateTable = covariateTable,
-                                interceptTable = NULL,
-                                type = 'score', 
-                                covariateSettings =  covSet,
-                                riskWindowStart = 1, 
-                                addExposureDaysToEnd = F, 
-                                riskWindowEnd = 365, 
-                                requireTimeAtRisk = T, 
-                                minTimeAtRisk = 364, 
-                                includeAllOutcomes = T, 
-                                removeSubjectsWithPriorOutcome = T, 
-                                connectionDetails = connectionDetails, 
-                                cdmDatabaseSchema = 'databasename.dbo',
-                                cohortId = 1,
-                                outcomeId = 2, 
-                                cohortDatabaseSchema = 'databasename.dbo', 
-                                cohortTable =  'cohort' , 
-                                outcomeDatabaseSchema = 'databasename.dbo', 
-                                outcomeTable = 'cohort'
-                      )
-
-
-

-Acknowledgments

-

Considerable work has been dedicated to provide the PatientLevelPrediction package.

-
citation("PatientLevelPrediction")
-
## 
-## To cite PatientLevelPrediction in publications use:
-## 
-## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018).
-## "Design and implementation of a standardized framework to generate
-## and evaluate patient-level prediction models using observational
-## healthcare data." _Journal of the American Medical Informatics
-## Association_, *25*(8), 969-975. <URL:
-## https://doi.org/10.1093/jamia/ocy032>.
-## 
-## A BibTeX entry for LaTeX users is
-## 
-##   @Article{,
-##     author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek},
-##     title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data},
-##     journal = {Journal of the American Medical Informatics Association},
-##     volume = {25},
-##     number = {8},
-##     pages = {969-975},
-##     year = {2018},
-##     url = {https://doi.org/10.1093/jamia/ocy032},
-##   }
-

This work is supported in part through the National Science Foundation grant IIS 1251151.

-
-
- - - - - diff --git a/docs/articles/BestPractices.html b/docs/articles/BestPractices.html deleted file mode 100644 index c0490c754..000000000 --- a/docs/articles/BestPractices.html +++ /dev/null @@ -1,361 +0,0 @@ - - - - - - - -Best Practice Research • PatientLevelPrediction - - - - - - - - - - -
-
- - - - -
-
- - - - - -
-

-Best practice publications using the OHDSI PatientLevelPrediction framework

-Topic | Research Summary | Link
-Problem Specification | When is prediction suitable in observational data? | Guidelines needed
-Data Creation | Comparison of cohort vs case-control design | Journal of Big Data
-Data Creation | Addressing loss to follow-up (right censoring) | BMC Medical Informatics and Decision Making
-Data Creation | Investigating how to address left censoring in features construction | BMC Medical Research Methodology
-Data Creation | Impact of over/under-sampling | Study being developed
-Data Creation | Impact of phenotypes | Study Done - Paper submitted
-Model development | How much data do we need for prediction - Learning curves at scale | Preprint link
-Model development | What impact does test/train/validation design have on model performance | BMJ Open
-Model development | What is the impact of the classifier | JAMIA
-Model development | Can we find hyper-parameter combinations per classifier that consistently lead to good performing models when using claims/EHR data? | Study needs to be done
-Model development | Can we use ensembles to combine models developed using different databases to improve models transportability? | Paper under review at BMC
-Evaluation | How should we present model performance? (e.g., new visualizations) | JAMIA Open
-Evaluation | How to interpret external validation performance (can we figure out why the performance drops or stays consistent)? | Study needs to be done
-Evaluation | Recalibration methods | Study needs to be done
-Evaluation | Is there a way to automatically simplify models? | Study protocol under development
-
-
- - - - - - diff --git a/docs/articles/BestPractices_files/header-attrs-2.11/header-attrs.js b/docs/articles/BestPractices_files/header-attrs-2.11/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/BestPractices_files/header-attrs-2.11/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/BestPractices_files/header-attrs-2.7/header-attrs.js b/docs/articles/BestPractices_files/header-attrs-2.7/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/BestPractices_files/header-attrs-2.7/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/BuildingDeepLearningModels.html b/docs/articles/BuildingDeepLearningModels.html deleted file mode 100644 index 062ff4614..000000000 --- a/docs/articles/BuildingDeepLearningModels.html +++ /dev/null @@ -1,597 +0,0 @@ - - - - - - - -Building Deep Learning Models • PatientLevelPrediction - - - - - - - - - - -
-
- - - - -
-
- - - - - -
-

-Introduction

-

Electronic Health Records (EHR) data is high dimensional, heterogeneous, and sparse, which makes predictive modelling a challenge. In the early days, the machine learning community mainly focused on algorithm development; currently there is a shift towards more powerful feature engineering. Deep Learning models are widely used to automatically learn high-level feature representations from the data, and have achieved remarkable results in image processing, speech recognition and computational biology. Recently, interesting results have been shown using EHRs, but more extensive research is needed to assess the power of Deep Learning in this domain.

-

This vignette describes how you can use the Observational Health Data Sciences and Informatics (OHDSI) PatientLevelPrediction package to build Deep Learning models. This vignette assumes you have read and are comfortable with building patient level prediction models as described in the BuildingPredictiveModels vignette. Furthermore, this vignette assumes you are familiar with Deep Learning methods.

-
-
-

-Background

-

Deep Learning models are built by stacking an often large number of neural network layers that perform feature engineering steps, e.g., embedding, and are collapsed in a final softmax layer (basically a logistic regression layer). These algorithms need a lot of data to converge to a good representation, but currently the sizes of the EHR databases are growing fast which would make Deep Learning an interesting approach to test within OHDSI’s Patient-Level Prediction Framework. The current implementation allows us to perform research at scale on the value and limitations of Deep Learning using EHR data. For relatively small Target and Outcome cohorts, Deep Learning is most probably not the best choice.

-

Most current Deep Learning research is performed in python and we have developed a pipeline to interact with python. Multiple Deep Learning backends have been developed, e.g. Tensorflow, PyTorch, Keras (recently also available in R) etc. In the package we have implemented interaction with Keras in R and PyTorch in Python but we invite the community to add other backends.

-

Many network architectures have recently been proposed and we have implemented a number of them, however, this list will grow in the near future. It is important to understand that some of these architectures require a 2D data matrix, i.e. |patient|x|feature|, and others use a 3D data matrix |patient|x|feature|x|time|. The FeatureExtraction Package has been extended to enable the extraction of both data formats as will be described with examples below.

-

Note that training Deep Learning models is computationally intensive, our implementation therefore supports both GPU and CPU. It will automatically check whether there is GPU or not in your computer. A GPU is highly recommended for Deep Learning!

-
-
-

-Non-Temporal Architectures

-

We implemented the following non-temporal (2D data matrix) architectures using PyTorch:

-
1) Logistic regression (LRTorch)
-   A simple softmax layer with l2 regularization
-
-2) Feed forward network (MLPTorch) 
-   Supports multilayer perceptron (mlp_type = MLP) and 
-   Self-Normalizing Neural Networks (mlp_type = SNN)
-   Reference: https://arxiv.org/abs/1706.02515
-

For the above two methods, we implemented support for a stacked autoencoder and a variational autoencoder to reduce the feature dimension as a first step. These autoencoders learn efficient data encodings in an unsupervised manner by stacking multiple layers in a neural network. Compared to the standard implementations of LR and MLP these implementations can use the GPU power to speed up the gradient descent approach in the back propagation to optimize the weights of the classifier.

-

Table 1: Non-Temporal Deep Learning Models Hyper-Parameters

Name | Description | Hyper-parameters
LRTorch | Logistic Regression Model | w_decay (l2 regularization), epochs (number of epochs), class_weight (0 = inverse ratio between number of positive and negative examples, -1 = focal loss (https://arxiv.org/abs/1708.02002), or other), autoencoder (apply stacked autoencoder?), vae (apply variational autoencoder?)
MLPTorch | Multi-Layer Perceptron Model | mlp_type (MLP = default, SNN = self-normalizing neural network), size (number of hidden nodes), w_decay (l2 regularization), epochs (number of epochs), class_weight (0 = inverse ratio between number of positive and negative examples, -1 = focal loss, or other), autoencoder (apply stacked autoencoder?), vae (apply variational autoencoder?)
-

-Example

-

The approach for logistic regression (LRTorch) and the Multi-Layer Perceptron (MLPTorch) is identical. Here we will take LRTorch as an example.

-

You need to generate a population and plpData object as described in more detail in BuildingPredictiveModels vignette.

-

Alternatively, you can make use of the data simulator. The following code snippet creates a population of 12000 patients.

-
-set.seed(1234)
-data(plpDataSimulationProfile)
-sampleSize <- 12000
-plpData <- simulatePlpData(
-  plpDataSimulationProfile,
-  n = sampleSize
-)
-
-population <- createStudyPopulation(
-  plpData,
-  outcomeId = 2,
-  binary = TRUE,
-  firstExposureOnly = FALSE,
-  washoutPeriod = 0,
-  removeSubjectsWithPriorOutcome = FALSE,
-  priorOutcomeLookback = 99999,
-  requireTimeAtRisk = FALSE,
-  minTimeAtRisk = 0,
-  riskWindowStart = 0,
-  addExposureDaysToStart = FALSE,
-  riskWindowEnd = 365,
-  addExposureDaysToEnd = FALSE,
-  verbosity = "INFO"
-)
-

As an example we will build a LRTorch model. We could specify the stacked autoencoder or the variational autoencoder to be used for reducing the feature dimension as an initial layer, but for this example we do not.

-
-autoencoder <- FALSE
-vae <- FALSE
-

We added a class_weight for imbalanced data, the default value 0 is the inverse ratio between negatives and positives,-1 applies focal loss.

-
-class_weight <- 0
-
-# Specify the settings for Logistics regression model using Torch in Python
-model <- setLRTorch(autoencoder=autoencoder, vae=vae,  class_weight=class_weight)
-

Now we define our modelling parameters.

-
-testFraction <- 0.2
-testSplit <- 'person'
-nfold <- 3
-splitSeed <- 1000
-

And we train and internally validate the model.

-
-results <- PatientLevelPrediction::runPlp(population = population, 
-                                          plpData = plpData, 
-                                          modelSettings = model,
-                                          testSplit=testSplit,
-                                          testFraction=testFraction,
-                                          nfold=nfold, 
-                                          splitSeed=splitSeed) 
-
-
-

-Temporal Architectures

-

Several architectures are implemented that can handle temporal data in PyTorch and R Keras.

-
-

-PyTorch CNN

-

We implemented the following convolutional models described in https://github.com/clinicalml/deepDiagnosis in CNNTorch:

-
  1. Temporal Convolutional neural network over a backward window (type = cnn)
  2. Convolutional neural network over input and time dimension (type = mix)
  3. Multi-resolution temporal convolutional neural network (type = multi)
-

Furthermore, we added the following architectures:

-
  1. CNN with filters with three different parallel kernel sizes (3,4,5) and fully connected layers (type = mlf)
  2. LSTM network over the backward window (type = lstm)
  3. Residual Learning Network as described in https://arxiv.org/abs/1512.03385 (type = resnet) - this is a very big network, see the paper for the topology
-
parameter | description
nbfilters | The number of convolution filters
epochs | The number of epochs
seed | Random seed
class_weight | The class weight used for imbalanced data (0: Inverse ratio between positives and negatives, -1: Focal loss, or number)
-
-
-

-PyTorch RNN

-

The following recurrent neural network models are implemented in RNNTorch:

-
  1. RNN with one LSTM layer fed into one fully connected layer (type = RNN)
  2. RNN with one bidirectional LSTM layer fed into one fully connected layer (type = BiRNN) - this network looks the same as above but as a bi-directional version
  3. One Gated Recurrent Unit layer fed into one fully connected layer (type = GRU) - this network looks the same as above but implemented as a GRU
-

The following hyper-parameters can be set for these PyTorch models:

parameter | description
hidden_size | The number of features in the hidden state
epochs | The number of epochs
seed | Random seed
class_weight | The class weight used for imbalanced data (0: Inverse ratio between positives and negatives, -1: Focal loss, or number)
-
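For illustration only, a settings call for these recurrent models might look like the sketch below; this assumes the RNNTorch setter exposes the hyper-parameters listed in the table above following the same pattern as setCNNTorch shown later in this vignette, so please check the package reference (e.g., ?setRNNTorch) for the exact signature:

-
-# hypothetical hyper-parameter values; verify the argument names before use
-model <- setRNNTorch(hidden_size = c(50, 100), epochs = 20, seed = 42, class_weight = 0)
-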
-
-
-

-R Keras CNN

-

The following temporal architectures as described in https://arxiv.org/pdf/1608.00647.pdf were implemented using R Keras:

-
  1. Multi-resolution CovNN model (CovNN.R)
  2. Convolution across data and time (CovNN2.R)
-

Furthermore, a custom built RNN is added that uses a variational autoencoder.

-
  1. Clinically Informing application based on Recurrent Neural Network (CIReNN.R)
-

Table 2: Temporal Deep Learning Models

Model | Hyper-parameters
CovNN | batchSize (The number of samples to use in each batch during model training), outcomeWeight (The weight assigned to the outcome), lr (The learning rate), decay (The decay of the learning rate), dropout ([currently not used] the dropout rate for regularization), epochs (The number of times data is used to train the model, e.g., epochs=1 means data only used once to train), filters (The number of columns output by each convolution), kernelSize (The number of time dimensions used for each convolution), loss (The loss function implemented), seed (The random seed)
CovNN2 | batchSize (The number of samples to use in each batch during model training), outcomeWeight (The weight assigned to the outcome), lr (The learning rate), decay (The decay of the learning rate), dropout ([currently not used] the dropout rate for regularization), epochs (The number of times data is used to train the model, e.g., epochs=1 means data only used once to train), filters (The number of columns output by each convolution), kernelSize (The number of time dimensions used for each convolution), loss (The loss function implemented), seed (The random seed)
CIReNN | units (The number of units of the RNN layer - as a list of vectors), recurrentDropout (The recurrent dropout rate), layerDropout (The layer dropout rate), lr (Learning rate), decay (Learning rate decay over each update), outcomeWeight (The weight of the outcome class in the loss function), batchSize (The number of data points to use per training batch), epochs (Number of times to iterate over data set), earlyStoppingMinDelta (Minimum change in the monitored quantity to qualify as an improvement for early stopping, i.e. an absolute change of less than min_delta in loss of validation data will count as no improvement), earlyStoppingPatience (Number of epochs with no improvement after which training will be stopped), seed (Random seed used by Deep Learning model)
-
-
-

-Example

-

We will now show how to use the temporal models by using CNNTorch as an example.

-

You need to generate a population and plpData object as described in more detail in BuildingPredictiveModels vignette.

-

Note that for these algorithms you need to extract temporal data as described in the FeatureExtraction vignette (https://github.com/OHDSI/FeatureExtraction/blob/master/inst/doc/UsingFeatureExtraction.pdf) as follows:

-
-settings <- createTemporalCovariateSettings(useConditionEraStart = FALSE,
-                                            useConditionEraOverlap = FALSE,
-                                            useConditionOccurrence = FALSE,
-                                            useConditionEraGroupStart = FALSE,
-                                            useConditionEraGroupOverlap = FALSE,
-                                            useDrugExposure = FALSE,
-                                            useDrugEraStart = FALSE,
-                                            useDrugEraOverlap = FALSE,
-                                            useMeasurement = FALSE,
-                                            useMeasurementValue = TRUE,
-                                            useMeasurementRangeGroup = FALSE,
-                                            useProcedureOccurrence = FALSE,
-                                            useDeviceExposure = FALSE,
-                                            useObservation = FALSE,
-                                            excludedCovariateConceptIds = c(316866),
-                                            addDescendantsToExclude = TRUE,
-                                            temporalStartDays = seq(from = -365, 
-                                                                    to = -1, by = 12), 
-                                            temporalEndDays = c(seq(from = -353, 
-                                                                    to = 0, by = 12), 0))
-
-plpData <- getPlpData(connectionDetails = connectionDetails,
-                        cdmDatabaseSchema = cdmDatabaseSchema,
-                        cohortDatabaseSchema = "results",
-                        cohortTable = "cohort",
-                        cohortId = 11,
-                        covariateSettings = settings,
-                        outcomeDatabaseSchema = resultsDatabaseSchema,
-                        outcomeTable = "cohort",
-                        outcomeIds = 25,
-                        cdmVersion = 5)
-

Each CNN/RNN has several hyper-parameters that can be set as shown in the Tables above, but for this example we take the defaults.

-
-# specify the the CNN
-model <- setCNNTorch(cnn_type='CNN')
-

Run the model training, for example with a testFraction = 0.2 and a split by person:

-
-results <- PatientLevelPrediction::runPlp(population, plpData, model,
-                                          testSplit='person',
-                                          testFraction=0.2,
-                                          nfold=3, 
-                                          splitSeed=1000) 
-
-
-
-

-Apply the trained Deep Learning model

-

Applying a Deep Learning model is identical to applying the other models in the package:

-
-# load the trained model
-plpModel <- loadPlpModel(getwd(), "<your model>")
-
-# load the new plpData (should have the same temporal features!) and create the population
-plpData <- loadPlpData(getwd(), "<your data>")
-
-populationSettings <- plpModel$populationSettings
-populationSettings$plpData <- plpData
-population <- do.call(createStudyPopulation, populationSettings)  
-
-# apply the trained model on the new data
-validationResults <- applyModel(population, plpData, plpModel)
-
-
-

-Adding new architectures

-

It is possible to add new architectures in our framework using PyTorch or R Keras. We are happy to help you with this, please post your questions on the issue tracker of the package.

-
-
-

-Acknowledgments

-

Considerable work has been dedicated to provide the PatientLevelPrediction package.

-
-citation("PatientLevelPrediction")
-
## 
-## To cite PatientLevelPrediction in publications use:
-## 
-## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design
-## and implementation of a standardized framework to generate and evaluate
-## patient-level prediction models using observational healthcare data."
-## _Journal of the American Medical Informatics Association_, *25*(8),
-## 969-975. <URL: https://doi.org/10.1093/jamia/ocy032>.
-## 
-## A BibTeX entry for LaTeX users is
-## 
-##   @Article{,
-##     author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek},
-##     title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data},
-##     journal = {Journal of the American Medical Informatics Association},
-##     volume = {25},
-##     number = {8},
-##     pages = {969-975},
-##     year = {2018},
-##     url = {https://doi.org/10.1093/jamia/ocy032},
-##   }
-

Please reference this paper if you use the PLP Package in your work:

-

Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.

-
-
- - - - - - diff --git a/docs/articles/BuildingDeepLearningModels_files/header-attrs-2.7/header-attrs.js b/docs/articles/BuildingDeepLearningModels_files/header-attrs-2.7/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/BuildingDeepLearningModels_files/header-attrs-2.7/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/BuildingEnsembleModels.html b/docs/articles/BuildingEnsembleModels.html deleted file mode 100644 index 8cbfd2eb8..000000000 --- a/docs/articles/BuildingEnsembleModels.html +++ /dev/null @@ -1,316 +0,0 @@ - - - - - - - -Building Ensemble Models • PatientLevelPrediction - - - - - - - - - - -
-
- - - - -
-
- - - - - -
-

-Introduction

-

Ensemble models combine several models to improve the overall performance. Traditionally, weak learners were combined to boost performance, but recent results show that combining several strong approaches can also result in a better performance. There are many examples in the literature where ensemble models outperform individual models using stacking, i.e. a final logistic regression layer across the individual model outputs, but other approaches like weighting have also shown promising results.

-

This vignette describes how you can use the Observational Health Data Sciences and Informatics (OHDSI) PatientLevelPrediction package to build ensemble models. This vignette assumes you have read and are comfortable with building single patient level prediction models as described in the BuildingPredictiveModels vignette.

-

This will enable studying ensemble methods at scale in the OHDSI data network.

-
-

Ensemble model

-
-

In the PatientLevelPrediction package, four ensemble strategies have been implemented (a short sketch of how each strategy combines predictions is shown after this list):

-
    -
  1. average ensemble: Calculate the average probability from individual models
  2. -
  3. product ensemble: Calculate the product of probabilities from individual models.
  4. -
  5. weighted ensemble: Calculate the weighted average probability from individual models using train AUC as weights.
  6. -
  7. stacked ensemble: Train a logistic regression on outputs from individual models
  8. -
-
-
-
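To make these strategies concrete, below is a minimal sketch in plain R of how each strategy combines the predicted probabilities of two models; the vectors p1 and p2, the labels y and the train AUCs are made-up illustrative values, not output of the package.

p1 <- c(0.1, 0.7, 0.3, 0.6, 0.2, 0.8)   # predicted risks from model 1 (made up)
p2 <- c(0.2, 0.6, 0.4, 0.5, 0.3, 0.7)   # predicted risks from model 2 (made up)
y  <- c(0, 1, 1, 0, 1, 0)               # observed outcome labels (made up)
auc1 <- 0.75                            # train AUC of model 1 (made up)
auc2 <- 0.70                            # train AUC of model 2 (made up)

averageEnsemble  <- (p1 + p2) / 2                            # 1. average ensemble
productEnsemble  <- p1 * p2                                  # 2. product ensemble
weightedEnsemble <- (auc1 * p1 + auc2 * p2) / (auc1 + auc2)  # 3. weighted by train AUC

# 4. stacked ensemble: logistic regression trained on the individual model outputs
stacker <- glm(y ~ p1 + p2, family = binomial())
stackedEnsemble <- predict(stacker, type = "response")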

-Usage

-

Use the PatientLevelPrediction package to generate a population and plpData object. Alternatively, you can make use of the data simulator. The following code snippet creates a simulated population of 2000 patients.

-
-data(plpDataSimulationProfile)
-set.seed(1234)
-sampleSize <- 2000
-plpData <- simulatePlpData(
-  plpDataSimulationProfile,
-  n = sampleSize
-)
-
-population <- createStudyPopulation(
-  plpData,
-  outcomeId = 2,
-  binary = TRUE,
-  firstExposureOnly = FALSE,
-  washoutPeriod = 0,
-  removeSubjectsWithPriorOutcome = FALSE,
-  priorOutcomeLookback = 99999,
-  requireTimeAtRisk = FALSE,
-  minTimeAtRisk = 0,
-  riskWindowStart = 0,
-  addExposureDaysToStart = FALSE,
-  riskWindowEnd = 365,
-  addExposureDaysToEnd = FALSE,
-  verbosity = "INFO"
-)
-

Specify the prediction algorithms to be combined.

-
-# Use LASSO logistic regression and Random Forest as base predictors
-model1 <- setLassoLogisticRegression()
-model2 <- setRandomForest()
-

Specify the test fraction.

-
-testFraction <- 0.2
-

Specify an ensembleStrategy to combine the multiple predictors. The strategy used for ensembling the outputs from the different models can be ‘mean’, ‘product’, ‘weighted’ or ‘stacked’: ‘mean’ takes the average probability from the different models, ‘product’ applies the product rule, ‘weighted’ takes the weighted average probability from the different models using the train AUC as weights, and ‘stacked’ trains a logistic regression on the outputs of the different models.

-
-ensembleStrategy <- 'stacked'
-

Specify the test split to be used.

-
-# Use a split by person; alternatively a time split is possible
-testSplit <- 'person'
-

Run the ensemble learning to combine model1 and model2. You can also use different plpData for different models.

-
-ensembleResults <- PatientLevelPrediction::runEnsembleModel(population, 
-                                   dataList = list(plpData, plpData), 
-                                   modelList = list(model1, model2),
-                                   testSplit=testSplit,
-                                   testFraction=testFraction,
-                                   nfold=3, splitSeed=1000, 
-                                   ensembleStrategy = ensembleStrategy) 
-
-

-Saving and loading the ensemble model

-

You can save and load the model using:

-
-saveEnsemblePlpModel(ensembleResults$model, dirPath = file.path(getwd(), "model"))
-ensembleModel <- loadEnsemblePlpModel(getwd(), "model")
-
-
-
-

-Apply Ensemble model

-
-plpData <- loadPlpData("<data file>")
-populationSettings <- ensembleModel$populationSettings
-populationSettings$plpData <- plpData
-population <- do.call(createStudyPopulation, populationSettings)
-

Load the model.

-
-ensembleModel <- loadEnsemblePlpModel("<model folder>")
-

Get the predictions by applying the model:

-
-prediction <- applyEnsembleModel(population,
-                                  dataList = list(plpData, plpData),
-                                  ensembleModel = ensembleModel)$prediction
-
-
-

-Demo

-

We have added a demo of the ensemble training:

-
-# Show all demos in our package: 
- demo(package = "PatientLevelPrediction")
-
-# Run the ensemble model demo
- demo("EnsembleModelDemo", package = "PatientLevelPrediction")
-
-
-

-Acknowledgments

-

Considerable work has been dedicated to providing the PatientLevelPrediction package.

-
-citation("PatientLevelPrediction")
-
## 
-## To cite PatientLevelPrediction in publications use:
-## 
-## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design
-## and implementation of a standardized framework to generate and evaluate
-## patient-level prediction models using observational healthcare data."
-## _Journal of the American Medical Informatics Association_, *25*(8),
-## 969-975. <URL: https://doi.org/10.1093/jamia/ocy032>.
-## 
-## A BibTeX entry for LaTeX users is
-## 
-##   @Article{,
-##     author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek},
-##     title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data},
-##     journal = {Journal of the American Medical Informatics Association},
-##     volume = {25},
-##     number = {8},
-##     pages = {969-975},
-##     year = {2018},
-##     url = {https://doi.org/10.1093/jamia/ocy032},
-##   }
-

Please reference this paper if you use the PLP Package in your work:

-

Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.

-
-
diff --git a/docs/articles/BuildingEnsembleModels_files/header-attrs-2.11/header-attrs.js b/docs/articles/BuildingEnsembleModels_files/header-attrs-2.11/header-attrs.js
deleted file mode 100644
index dd57d92e0..000000000
diff --git a/docs/articles/BuildingEnsembleModels_files/header-attrs-2.7/header-attrs.js b/docs/articles/BuildingEnsembleModels_files/header-attrs-2.7/header-attrs.js
deleted file mode 100644
index dd57d92e0..000000000
diff --git a/docs/articles/BuildingMultiplePredictiveModels.html b/docs/articles/BuildingMultiplePredictiveModels.html
deleted file mode 100644
index e8e402763..000000000
-Automatically Build Multiple Patient-Level Predictive Models • PatientLevelPrediction

-Introduction

-

In our paper, we propose a standardised framework for patient-level prediction that utilizes the OMOP CDM and standardized vocabularies, and describe the open-source software that we developed implementing the framework’s pipeline. The framework is the first to enforce existing best practice guidelines and will enable open dissemination of models that can be extensively validated across the network of OHDSI collaborators.

-

One of our best practices is that we see the selection of models and all study settings as an empirical question, i.e. we should use a data-driven approach in which we try many settings. This vignette describes how you can use the Observational Health Data Sciences and Informatics (OHDSI) PatientLevelPrediction package to automatically build multiple patient-level predictive models, e.g. with different population settings, covariate settings, and model settings. This vignette assumes you have read and are comfortable with building single patient level prediction models as described in the BuildingPredictiveModels vignette.

-

Note that it is also possible to generate a Study Package directly in ATLAS that allows for multiple patient-level prediction analyses; this is out of scope for this vignette.

-
-
-

-Creating a model design

-

The first step is to specify each model you wish to develop by using the createModelDesign function. This function requires the following:

The inputs for the model design:

input | Description
targetId | The id for the target cohort
outcomeId | The id for the outcome
restrictPlpDataSettings | The settings used to restrict the target population, created with createRestrictPlpDataSettings()
populationSettings | The settings used to restrict the target population and create the outcome labels, created with createStudyPopulationSettings()
covariateSettings | The settings used to define the covariates, created with FeatureExtraction::createDefaultCovariateSettings()
sampleSettings | The settings used to define any under/over sampling, created with createSampleSettings()
featureEngineeringSettings | The settings used to define any feature engineering, created with createFeatureEngineeringSettings()
preprocessSettings | The settings used to define any preprocessing, created with createPreprocessSettings()
modelSettings | The settings used to define the model fitting settings, such as setLassoLogisticRegression()
-
-

-Model design example 1

-

For example, suppose we want to predict the outcome (id 2) occurring for the first time within 180 days of the target population index date (id 1). We are only interested in index dates during 2018-2019 (see the study start/end dates below). Finally, we only want to use age, gender in 5-year buckets and conditions as features. The model can be specified by:

-
-# Model 1 is only using data between 2018-2020:
-restrictPlpDataSettings <- createRestrictPlpDataSettings(
-  studyStartDate = '20180101', 
-  studyEndDate = '20191231'
-  )
-
-# predict outcome within 1 to 180 days after index
-# remove people with outcome prior and with < 365 days observation
-populationSettings <- createStudyPopulationSettings(
-  binary = T, 
-  firstExposureOnly = T, 
-  washoutPeriod = 365, 
-  removeSubjectsWithPriorOutcome = T,
-  priorOutcomeLookback = 9999,
-  requireTimeAtRisk = F, 
-  riskWindowStart = 1, 
-  riskWindowEnd = 180
-)
-
-# use age/gender in groups and condition groups as features
-covariateSettings <- FeatureExtraction::createCovariateSettings(
-  useDemographicsGender = T, 
-  useDemographicsAgeGroup = T, 
-  useConditionGroupEraAnyTimePrior = T
-)
-
-modelDesign1 <- createModelDesign(
-  targetId = 1, 
-  outcomeId = 2, 
-  restrictPlpDataSettings = restrictPlpDataSettings, 
-  populationSettings = populationSettings, 
-  covariateSettings = covariateSettings, 
-  featureEngineeringSettings = createFeatureEngineeringSettings(),
-  sampleSettings = createSampleSettings(), 
-  preprocessSettings = createPreprocessSettings(), 
-  modelSettings = setLassoLogisticRegression()
-  )
-
-
-

-Model design example 2

-

For the second example, we want to predict the outcome (id 2) occurring for the first time within 730 days of the target population index date (id 1). We want to train a random forest classifier. Finally, we want to use age, gender in 5-year buckets, drug groups and condition groups as features. The model can be specified by:

-
-# Model 2 has no restrictions when extracting data
-restrictPlpDataSettings <- createRestrictPlpDataSettings(
-  )
-
-# predict outcome within 1 to 730 days after index
-# remove people with outcome prior and with < 365 days observation
-populationSettings <- createStudyPopulationSettings(
-  binary = T, 
-  firstExposureOnly = T, 
-  washoutPeriod = 365, 
-  removeSubjectsWithPriorOutcome = T,
-  priorOutcomeLookback = 9999,
-  requireTimeAtRisk = F, 
-  riskWindowStart = 1, 
-  riskWindowEnd = 730
-)
-
-# use age/gender in groups and condition/drug groups as features
-covariateSettings <- FeatureExtraction::createCovariateSettings(
-  useDemographicsGender = T, 
-  useDemographicsAgeGroup = T, 
-  useConditionGroupEraAnyTimePrior = T, 
-  useDrugGroupEraAnyTimePrior = T 
-)
-
-modelDesign2 <- createModelDesign(
-  targetId = 1, 
-  outcomeId = 2, 
-  restrictPlpDataSettings = restrictPlpDataSettings, 
-  populationSettings = populationSettings, 
-  covariateSettings = covariateSettings, 
-  featureEngineeringSettings = createRandomForestFeatureSelection(ntrees = 500, maxDepth = 7),
-  sampleSettings = createSampleSettings(), 
-  preprocessSettings = createPreprocessSettings(), 
-  modelSettings = setRandomForest()
-  )
-
-
-

-Model design example 3

-

For the third example, we want to predict the outcome (id 5) occurring during the cohort exposure of the target population (id 1). We want to train a gradient boosting machine. Finally, we want to use age, gender in 5-year buckets and indicators of measurements taken as features. The model can be specified by:

-
-# Model 3 has no restrictions when extracting data
-restrictPlpDataSettings <- createRestrictPlpDataSettings(
-  )
-
-# predict outcome during target cohort start/end 
-# remove people with  < 365 days observation
-populationSettings <- createStudyPopulationSettings(
-  binary = T, 
-  firstExposureOnly = T, 
-  washoutPeriod = 365, 
-  removeSubjectsWithPriorOutcome = F,
-  requireTimeAtRisk = F, 
-  riskWindowStart = 0,
-  startAnchor =  'cohort start',
-  riskWindowEnd = 0, 
-  endAnchor = 'cohort end'
-)
-
-# use age/gender in groups and measurement indicators as features
-covariateSettings <- FeatureExtraction::createCovariateSettings(
-  useDemographicsGender = T, 
-  useDemographicsAgeGroup = T, 
-  useMeasurementAnyTimePrior = T,
-  endDays = -1
-)
-
-modelDesign3 <- createModelDesign(
-  targetId = 1, 
-  outcomeId = 5, 
-  restrictPlpDataSettings = restrictPlpDataSettings, 
-  populationSettings = populationSettings, 
-  covariateSettings = covariateSettings, 
-  featureEngineeringSettings = createFeatureEngineeringSettings(),
-  sampleSettings = createSampleSettings(), 
-  preprocessSettings = createPreprocessSettings(), 
-  modelSettings = setGradientBoostingMachine()
-  )
-
-
-
-

-Running multiple models

-

As the multiple PLP analysis will download a large amount of data, it is useful to set the Andromeda temp folder to a directory with write access and plenty of space, e.g. options(andromedaTempFolder = "c:/andromedaTemp").
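For example (the path is illustrative; choose any location with sufficient free space):

# set this before extracting data so Andromeda writes its temporary files here
options(andromedaTempFolder = "c:/andromedaTemp")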

-

To run the study you need to set up a connectionDetails object:

-
-dbms <- "your dbms"
-user <- "your username"
-pw <- "your password"
-server <- "your server"
-port <- "your port"
-
-connectionDetails <- DatabaseConnector::createConnectionDetails(dbms = dbms,
-                                                                server = server,
-                                                                user = user,
-                                                                password = pw,
-                                                                port = port)
-

Next you need to specify the cdmDatabaseSchema where your CDM data are found and the workDatabaseSchema where your target population and outcome cohorts are stored. You also need to specify a label for the database (cdmDatabaseName): a string with a shareable name of the database (this will be shown to OHDSI researchers if the results get transported).

-
cdmDatabaseSchema <- "your cdmDatabaseSchema"
-workDatabaseSchema <- "your workDatabaseSchema"
-cdmDatabaseName <- "your cdmDatabaseName"
-cohortTable <- "your cohort table"
-
-databaseDetails <- createDatabaseDetails(
-  connectionDetails = connectionDetails, 
-  cdmDatabaseSchema = cdmDatabaseSchema, 
-  cdmDatabaseName = cdmDatabaseName , 
-  cohortDatabaseSchema = workDatabaseSchema, 
-  cohortTable = cohortTable, 
-  outcomeDatabaseSchema = workDatabaseSchema, 
-  outcomeTable = cohortTable, 
-  cdmVersion = 5
-    )
-

Now you can run the multiple patient-level prediction analysis:

-
-results <- runMultiplePlp(
-  databaseDetails = databaseDetails, 
-  modelDesignList = list(
-    modelDesign1, 
-    modelDesign2, 
-    modelDesign3
-    ), 
-  onlyFetchData = F, 
-  splitSettings = createDefaultSplitSetting(), 
-  logSettings = createLogSettings(), 
-  saveDirectory =  "./PlpMultiOutput"
-  )
-

This will then save all the plpData objects from the study into "./PlpMultiOutput/plpData_T1_L" and the results into "./PlpMultiOutput/Analysis_". The csv file named settings.csv found in "./PlpMultiOutput" has a row for each prediction model developed and points to the plpData and settings used for the model development; it also has descriptions of the cohorts if these are input by the user.
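As a sketch of how that output can be inspected (assuming the default folder layout described above):

# list the analyses that were run and the settings/plpData each one points to
settings <- utils::read.csv(file.path("./PlpMultiOutput", "settings.csv"))
settings

# load one developed model/result for inspection
# (folder layout assumed to be the default Analysis_<n>/plpResult structure)
result1 <- loadPlpResult(file.path("./PlpMultiOutput", "Analysis_1", "plpResult"))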

-

Note that if for some reason the run is interrupted, e.g. because of an error, a new call to runMultiplePlp will continue from where it stopped rather than restarting, unless you remove the output folder.

-
-
-

-Validating multiple models

-

If you have access to multiple databases on the same server in different schemas you could evaluate across these using this call:

-
-validationDatabaseDetails <- createDatabaseDetails(
-  connectionDetails = connectionDetails, 
-  cdmDatabaseSchema = 'new cdm schema', 
-  cdmDatabaseName = 'validation database', 
-  cohortDatabaseSchema = workDatabaseSchema, 
-  cohortTable = cohortTable, 
-  outcomeDatabaseSchema = workDatabaseSchema, 
-  outcomeTable = cohortTable, 
-  cdmVersion = 5
-  )
-
-val <- validateMultiplePlp(
-  analysesLocation = "./PlpMultiOutput",
-  validationDatabaseDetails = validationDatabaseDetails,
-  validationRestrictPlpDataSettings = createRestrictPlpDataSettings(),
-  recalibrate = NULL,
-  saveDirectory = "./PlpMultiOutput/validation"
-  )
-

This then saves the external validation results in the validation folder of the main study directory (the saveDirectory you used in runMultiplePlp).

-
-
-

-Viewing the results

-

To view the results for the multiple prediction analysis:

-
-viewMultiplePlp(analysesLocation="./PlpMultiOutput")
-

If the validation directory in “./PlpMultiOutput” has results, the external validation will also be displayed.

-
-
-

-Acknowledgments

-

Considerable work has been dedicated to providing the PatientLevelPrediction package.

-
-citation("PatientLevelPrediction")
-
## 
-## To cite PatientLevelPrediction in publications use:
-## 
-## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design
-## and implementation of a standardized framework to generate and evaluate
-## patient-level prediction models using observational healthcare data."
-## _Journal of the American Medical Informatics Association_, *25*(8),
-## 969-975. <URL: https://doi.org/10.1093/jamia/ocy032>.
-## 
-## A BibTeX entry for LaTeX users is
-## 
-##   @Article{,
-##     author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek},
-##     title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data},
-##     journal = {Journal of the American Medical Informatics Association},
-##     volume = {25},
-##     number = {8},
-##     pages = {969-975},
-##     year = {2018},
-##     url = {https://doi.org/10.1093/jamia/ocy032},
-##   }
-

Please reference this paper if you use the PLP Package in your work:

-

Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.

-
-
diff --git a/docs/articles/BuildingMultiplePredictiveModels_files/header-attrs-2.11/header-attrs.js b/docs/articles/BuildingMultiplePredictiveModels_files/header-attrs-2.11/header-attrs.js
deleted file mode 100644
index dd57d92e0..000000000
diff --git a/docs/articles/BuildingMultiplePredictiveModels_files/header-attrs-2.7/header-attrs.js b/docs/articles/BuildingMultiplePredictiveModels_files/header-attrs-2.7/header-attrs.js
deleted file mode 100644
index dd57d92e0..000000000
diff --git a/docs/articles/BuildingPredictiveModels.html b/docs/articles/BuildingPredictiveModels.html
deleted file mode 100644
index 1cc607322..000000000
-Building patient-level predictive models • PatientLevelPrediction

-Introduction

-

Observational healthcare data, such as administrative claims and electronic health records, are increasingly used for clinical characterization of disease progression, quality improvement, and population-level effect estimation for medical product safety surveillance and comparative effectiveness. Advances in machine learning for large dataset analysis have led to increased interest in applying patient-level prediction on this type of data. Patient-level prediction offers the potential for medical practice to move beyond average treatment effects and to consider personalized risks as part of clinical decision-making. However, many published efforts in patient-level-prediction do not follow the model development guidelines, fail to perform extensive external validation, or provide insufficient model details that limits the ability of independent researchers to reproduce the models and perform external validation. This makes it hard to fairly evaluate the predictive performance of the models and reduces the likelihood of the model being used appropriately in clinical practice. To improve standards, several papers have been written detailing guidelines for best practices in developing and reporting prediction models.

-

The Transparent Reporting of a multivariable prediction model for Individual Prognosis Or Diagnosis (TRIPOD) statement provides clear recommendations for reporting prediction model development and validation and addresses some of the concerns related to transparency. However, data structure heterogeneity and inconsistent terminologies still make collaboration and model sharing difficult as different researchers are often required to write new code to extract the data from their databases and may define variables differently.

-

In our paper, we propose a standardised framework for patient-level prediction that utilizes the OMOP Common Data Model (CDM) and standardized vocabularies, and describe the open-source software that we developed implementing the framework’s pipeline. The framework is the first to support existing best practice guidelines and will enable open dissemination of models that can be extensively validated across the network of OHDSI collaborators.

-

Figure 1 illustrates the prediction problem we address. Among a population at risk, we aim to predict which patients at a defined moment in time (t = 0) will experience some outcome during a time-at-risk. Prediction is done using only information about the patients in an observation window prior to that moment in time.

-
-

[Figure 1: The prediction problem]

-
-

As shown in Figure 2, to define a prediction problem we have to define t=0 by a Target Cohort (T), the outcome we would like to predict by an outcome cohort (O), and the time-at-risk (TAR). Furthermore, we have to make design choices for the model we would like to develop, and determine the observational datasets to perform internal and external validation on. This conceptual framework works for all types of prediction problems, for example those presented in Figure 3.

-
-

[Figure 2: Design choices]

-
-
-

[Figure 3: Examples of prediction problems]

-
-

This vignette describes how you can use the PatientLevelPrediction package to build patient-level predictive models. The package enables data extraction, model building, and model evaluation using data from databases that are translated into the OMOP CDM. In this vignette we assume you have installed the package correctly using the InstallationGuide.
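Once the package is installed as described in the InstallationGuide, it is loaded in the usual way:

# load the package (installation is covered in the InstallationGuide)
library(PatientLevelPrediction)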

-
-
-

-Study specification

-

We have to clearly specify our study upfront to be able to implement it. This means we need to define the prediction problem we like to address, in which population we will build the model, which model we will build and how we will evaluate its performance. To guide you through this process we will use a “Disease onset and progression” prediction type as an example.

-
-

-Problem definition 1: Stroke in atrial fibrillation patients

-

Atrial fibrillation is a disease characterized by an irregular heart rate that can cause poor blood flow. Patients with atrial fibrillation are at increased risk of ischemic stroke. Anticoagulation is a recommended prophylaxis treatment strategy for patients at high risk of stroke, though the underuse of anticoagulants and persistent severity of ischemic stroke represents a substantial unmet medical need. Various strategies have been developed to predict risk of ischemic stroke in patients with atrial fibrillation. CHADS2 (Gage JAMA 2001) was developed as a risk score based on history of congestive heart failure, hypertension, age>=75, diabetes and stroke. CHADS2 was initially derived using Medicare claims data, where it achieved good discrimination (AUC=0.82). However, subsequent external validation studies revealed that CHADS2 had substantially lower predictive accuracy (Keogh Thromb Haemost 2011). Subsequent stroke risk calculators have been developed and evaluated, including the CHA2DS2-VASc extension of CHADS2. The management of atrial fibrillation has evolved substantially over the last decade, for various reasons that include the introduction of novel oral anticoagulants. With these innovations has come a renewed interest in greater precision medicine for stroke prevention.

-

We will apply the PatientLevelPrediction package to observational healthcare data to address the following patient-level prediction question:

-

Amongst patients who are newly diagnosed with Atrial Fibrillation, which patients will go on to have Ischemic Stroke within 1 year?

-

We will define ‘patients who are newly diagnosed with Atrial Fibrillation’ as the first condition record of cardiac arrhythmia, which is followed by another cardiac arrhythmia condition record, at least two drug records for a drug used to treat arrhythmias, or a procedure to treat arrhythmias. We will define ‘Ischemic stroke events’ as ischemic stroke condition records during an inpatient or ER visit; successive records with > 180 day gap are considered independent episodes.

-
-
-

-Problem definition 2: Angioedema in ACE inhibitor users

-

Angiotensin converting enzyme inhibitors (ACE inhibitors) are medications used by patients with hypertension that widen the blood vessels and therefore increase the amount of blood pumped by the heart and decrease blood pressure. ACE inhibitors reduce a patient's risk of cardiovascular disease but can lead to drug-induced angioedema.

-

We will apply the PatientLevelPrediction package to observational healthcare data to address the following patient-level prediction question:

-

Amongst patients who are newly dispensed an ACE inhibitor, which patients will go on to have angioedema within 1 year?

-

We will define ‘patients who are newly dispensed an ACE inhibitor’ as the first drug record of any ACE inhibitor […]. We will define ‘angioedema’ as an angioedema condition record.

-
-
-

-Study population definition

-

The final study population in which we will develop our model is often a subset of the Target population, because we will e.g. apply criteria that are dependent on T and O or we want to do sensitivity analyses with subpopulations of T. For this we have to answer the following questions:

-
    -
  • What is the minimum amount of observation time we require before the start of the target cohort? This choice could depend on the available patient time in your training data, but also on the time you expect to be available in the data sources you want to apply the model on in the future. The longer the minimum observation time, the more baseline history time is available for each person to use for feature extraction, but the fewer patients will qualify for analysis. Moreover, there could be clinical reasons to choose a short or longer lookback period. For our example, we will use a prior history as lookback period (washout period).

  • -
  • Can patients enter the target cohort multiple times? In the target cohort definition, a person may qualify for the cohort multiple times during different spans of time, for example if they had different episodes of a disease or separate periods of exposure to a medical product. The cohort definition does not necessarily apply a restriction to only let the patients enter once, but in the context of a particular patient-level prediction problem, a user may want to restrict the cohort to the first qualifying episode. In our example, a person could only enter the target cohort once since our criteria was based on first occurrence of atrial fibrillation.

  • -
  • Do we allow persons to enter the cohort if they experienced the outcome before? Do we allow persons to enter the target cohort if they experienced the outcome before qualifying for the target cohort? Depending on the particular patient-level prediction problem, there may be a desire to predict ‘incident’ first occurrence of an outcome, in which case patients who have previously experienced the outcome are not ‘at-risk’ for having a first occurrence and therefore should be excluded from the target cohort. In other circumstances, there may be a desire to predict ‘prevalent’ episodes, whereby patients with prior outcomes can be included in the analysis and the prior outcome itself can be a predictor of future outcomes. For our prediction example, the answer to this question is ‘Yes, allow persons with prior outcomes’ because we know from the CHADS2 score that prior strokes are very predictive of future strokes. If this answer would have been ‘No’ we also have to decide how long we would look back for previous occurrences of the outcome.

  • -
  • How do we define the period in which we will predict our outcome relative to the target cohort start? We actually have to make two decisions to answer that question. First, does the time-at-risk window start at the date of the start of the target cohort or later? Arguments to make it start later could be that you want to avoid outcomes that were entered late in the record that actually occurred before the start of the target cohort or you want to leave a gap where interventions to prevent the outcome could theoretically be implemented. Second, you need to define the time-at-risk by setting the risk window end, as some specification of days offset relative to the target cohort start or end dates. For our problem we will predict in a ‘time-at-risk’ window starting 1 day after the start of the target cohort up to 365 days later (to look for 1-year risk following atrial fibrillation diagnosis).

  • -
  • Do we require a minimum amount of time-at-risk? We have to decide if we want to include patients that did not experience the outcome but did leave the database earlier than the end of our time-at-risk period. These patients may experience the outcome when we do not observe them. For our prediction problem we decide to answer this question with ‘Yes, require a minimum time-at-risk’ for that reason. Furthermore, we have to decide if this constraint also applies to persons who experienced the outcome or we will include all persons with the outcome irrespective of their total time at risk. For example, if the outcome is death, then persons with the outcome are likely censored before the full time-at-risk period is complete.

  • -
-
-
-

-Model development settings

-

To develop the model we have to decide which algorithm(s) we like to train. We see the selection of the best algorithm for a certain prediction problem as an empirical question, i.e. you need to let the data speak for itself and try different approaches to find the best one. There is no algorithm that will work best for all problems (no free lunch). In our package we therefore aim to implement many algorithms. Furthermore, we made the system modular so you can add your own custom algorithms as described in more detail in the AddingCustomModels vignette.

-

Our package currently contains the following algorithms to choose from:

• Regularized Logistic Regression: Lasso logistic regression belongs to the family of generalized linear models, where a linear combination of the variables is learned and finally a logistic function maps the linear combination to a value between 0 and 1. The lasso regularization adds a cost based on model complexity to the objective function when training the model. This cost is the sum of the absolute values of the linear combination of the coefficients. The model automatically performs feature selection by minimizing this cost. We use the Cyclic coordinate descent for logistic, Poisson and survival analysis (Cyclops) package to perform large-scale regularized logistic regression: https://github.com/OHDSI/Cyclops. Hyper-parameters: var (starting variance), seed.

• Gradient boosting machines: Gradient boosting machines is a boosting ensemble technique that in our framework combines multiple decision trees. Boosting works by iteratively adding decision trees but adds more weight to the data-points that are misclassified by prior decision trees in the cost function when training the next tree. We use Extreme Gradient Boosting, which is an efficient implementation of the gradient boosting framework implemented in the xgboost R package available from CRAN. Hyper-parameters: ntree (number of trees), max depth (max levels in tree), min rows (minimum data points in node), learning rate, balance (balance class labels), seed.

• Random forest: Random forest is a bagging ensemble technique that combines multiple decision trees. The idea behind bagging is to reduce the likelihood of overfitting by using weak classifiers, but combining multiple diverse weak classifiers into a strong classifier. Random forest accomplishes this by training multiple decision trees but only using a subset of the variables in each tree, where the subset of variables differs between trees. Our package uses the sklearn implementation of Random Forest in Python. Hyper-parameters: mtry (number of features in each tree), ntree (number of trees), maxDepth (max levels in tree), minRows (minimum data points in node), balance (balance class labels), seed.

• K-nearest neighbors: K-nearest neighbors (KNN) is an algorithm that uses some metric to find the K closest labelled data-points, given the specified metric, to a new unlabelled data-point. The prediction for the new data-point is then the most prevalent class of the K-nearest labelled data-points. There is a sharing limitation of KNN, as the model requires labelled data to perform the prediction on new data, and it is often not possible to share this data across data sites. We included the BigKnn classifier developed in OHDSI, which is a large scale k-nearest neighbor classifier using the Lucene search engine: https://github.com/OHDSI/BigKnn. Hyper-parameters: k (number of neighbours), weighted (weight by inverse frequency).

• Naive Bayes: The Naive Bayes algorithm applies the Bayes theorem with the ‘naive’ assumption of conditional independence between every pair of features given the value of the class variable. Based on the likelihood the data belongs to a class and the prior distribution of the class, a posterior distribution is obtained. Hyper-parameters: none.

• AdaBoost: AdaBoost is a boosting ensemble technique. Boosting works by iteratively adding classifiers but adds more weight to the data-points that are misclassified by prior classifiers in the cost function when training the next classifier. We use the sklearn ‘AdaBoostClassifier’ implementation in Python. Hyper-parameters: nEstimators (the maximum number of estimators at which boosting is terminated), learningRate (learning rate shrinks the contribution of each classifier by learning_rate; there is a trade-off between learningRate and nEstimators).

• Decision Tree: A decision tree is a classifier that partitions the variable space using individual tests selected using a greedy approach. It aims to find partitions that have the highest information gain to separate the classes. The decision tree can easily overfit by enabling a large number of partitions (tree depth) and often needs some regularization (e.g., pruning or specifying hyper-parameters that limit the complexity of the model). We use the sklearn ‘DecisionTreeClassifier’ implementation in Python. Hyper-parameters: maxDepth (the maximum depth of the tree), minSamplesSplit, minSamplesLeaf, minImpuritySplit (threshold for early stopping in tree growth; a node will split if its impurity is above the threshold, otherwise it is a leaf), seed, classWeight (‘Balance’ or ‘None’).

• Multilayer Perceptron: Neural networks contain multiple layers that weight their inputs using a non-linear function. The first layer is the input layer, the last layer is the output layer, and in between are the hidden layers. Neural networks are generally trained using feed-forward back-propagation. This is when you go through the network with a data-point and calculate the error between the true label and predicted label, then go backwards through the network and update the linear function weights based on the error. This can also be performed as a batch, where multiple data-points are fed through the network before the weights are updated. Hyper-parameters: size (the number of hidden nodes), alpha (the l2 regularisation), seed.

• Deep Learning (now in the separate DeepPatientLevelPrediction R package): Deep learning models such as deep nets, convolutional neural networks or recurrent neural networks are similar to a neural network but have multiple hidden layers that aim to learn latent representations useful for prediction. In the separate BuildingDeepLearningModels vignette we describe these models and their hyper-parameters in more detail. Hyper-parameters: see OHDSI/DeepPatientLevelPrediction.
-
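For example, two of the learners listed above are specified through their set functions, here with their default settings (the same calls are used in the worked examples later in this vignette):

# regularized (lasso) logistic regression with its default settings
lrSettings <- setLassoLogisticRegression()

# gradient boosting machine with its default hyper-parameter search values
gbmSettings <- setGradientBoostingMachine()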

Furthermore, we have to decide on the covariates that we will use to train our model. This choice can be driven by domain knowledge or by the available computational resources. In our example, we would like to add gender, age, conditions, drug groups, and visit count. We also have to specify in which time windows we will look, and we decide to look in the year before index and any time prior.

-

Finally, we have to define how we will train and test our model on our data, i.e. how we perform internal validation. For this we have to decide how we divide our dataset into a training and a testing dataset and how we randomly assign patients to these two sets. Depending on the size of the training set we can decide how much data we would like to use for training; typically this is a 75%/25% split. If you have very large datasets you can use more data for training. To randomly assign patients to the training and testing set, there are two commonly used approaches (a sketch of both options using the package's split settings follows the list):

-
    -
  1. split by person. In this case a random seed is used to assign the patient to either set.
  2. -
  3. split by time. In this case a time point is used to split the persons, e.g. 75% of the data is before and 25% is after this date. The advantage of this is that you take into consideration that the health care system has changed over time.
  4. -
-
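As a preview of how these two options map onto the package's split settings (the exact values used for our example are given later in the vignette; the ones here are illustrative):

# split by person: patients are randomly assigned to train/test (stratified by outcome)
splitByPerson <- createDefaultSplitSetting(
  trainFraction = 0.75,
  testFraction = 0.25,
  type = 'stratified',
  nfold = 2,
  splitSeed = 1234
)

# split by time: earlier data are used for training, later data for testing
splitByTime <- createDefaultSplitSetting(
  trainFraction = 0.75,
  testFraction = 0.25,
  type = 'time',
  nfold = 2,
  splitSeed = 1234
)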

We now completely defined our studies and implement them:

- -
-
-
-

-Example 1: Stroke in atrial fibrillation patients

-
-

-Study Specification

-

For our first prediction model we decide to start with a Regularized Logistic Regression and will use the default parameters. We will do a 75%-25% split by person.

Definition | Value

Problem Definition
Target Cohort (T) | ‘Patients who are newly diagnosed with Atrial Fibrillation’, defined as the first condition record of cardiac arrhythmia, which is followed by another cardiac arrhythmia condition record, at least two drug records for a drug used to treat arrhythmias, or a procedure to treat arrhythmias.
Outcome Cohort (O) | ‘Ischemic stroke events’, defined as ischemic stroke condition records during an inpatient or ER visit; successive records with > 180 day gap are considered independent episodes.
Time-at-risk (TAR) | 1 day till 365 days from cohort start

Population Definition
Washout Period | 1095 days
Enter the target cohort multiple times? | No
Allow prior outcomes? | Yes
Start of time-at-risk | 1 day
End of time-at-risk | 365 days
Require a minimum amount of time-at-risk? | Yes (364 days)

Model Development
Algorithm | Regularized Logistic Regression
Hyper-parameters | variance = 0.01 (default)
Covariates | Gender, Age, Conditions (ever before, <365), Drug Groups (ever before, <365), and Visit Count
Data split | 75% train, 25% test. Randomly assigned by person
-

According to the best practices we need to make a protocol that completely specifies how we plan to execute our study. This protocol will be assessed by the governance boards of the participating data sources in your network study. For this a template could be used but we prefer to automate this process as much as possible by adding functionality to automatically generate study protocol from a study specification. We will discuss this in more detail later.

-
-
-

-Study implementation

-

Now that we have completely designed our study, we have to implement it. We have to generate the target and outcome cohorts and we need to develop the R code to run against our CDM that will execute the full study.

-
-

-Cohort instantiation

-

For our study we need to know when a person enters the target and outcome cohorts. This is stored in a table on the server that contains the cohort start date and cohort end date for all subjects for a specific cohort definition. This cohort table has a very simple structure as shown below:

-
    -
  • -cohort_definition_id, a unique identifier for distinguishing between different types of cohorts, e.g. cohorts of interest and outcome cohorts.
  • -
  • -subject_id, a unique identifier corresponding to the person_id in the CDM.
  • -
  • -cohort_start_date, the date the subject enters the cohort.
  • -
  • -cohort_end_date, the date the subject leaves the cohort.
  • -
-

How do we fill this table according to our cohort definitions? There are two options for this:

-
    -
  1. use the interactive cohort builder tool in ATLAS which can be used to create cohorts based on inclusion criteria and will automatically populate this cohort table.

  2. -
  3. write your own custom SQL statements to fill the cohort table.

  4. -
-

Both methods are described below for our example prediction problem.

-
-
-

-ATLAS cohort builder

-
-

[Figure 4: Target Cohort Atrial Fibrillation]

-
-

ATLAS allows you to define cohorts interactively by specifying cohort entry and cohort exit criteria. Cohort entry criteria involve selecting one or more initial events, which determine the start date for cohort entry, and optionally specifying additional inclusion criteria which filter to the qualifying events. Cohort exit criteria are applied to each cohort entry record to determine the end date when the person’s episode no longer qualifies for the cohort. For the outcome cohort the end date is less relevant. As an example, Figure 4 shows how we created the Atrial Fibrillation cohort and Figure 5 shows how we created the stroke cohort in ATLAS.

-
-

[Figure 5: Outcome Cohort Stroke]

-
-

The T and O cohorts can be found here:

- -

An in-depth explanation of cohort creation in ATLAS is out of scope for this vignette but can be found on the OHDSI wiki pages (link).

-

Note that when a cohort is created in ATLAS the cohort id is needed to extract the data in R. The cohort id can be found at the top of the ATLAS screen, e.g. 1769447 in Figure 4.

-
-
-

-Custom cohorts

-

It is also possible to create cohorts without the use of ATLAS. Using custom cohort code (SQL) you can make more advanced cohorts if needed.

-

For our example study, we need to create a table to hold the cohort data and we need to create SQL code to instantiate this table for both the AF and Stroke cohorts. Therefore, we create a file called AfStrokeCohorts.sql with the following contents:

-
/***********************************
-File AfStrokeCohorts.sql 
-***********************************/
-/*
-Create a table to store the persons in the T and O cohort
-*/
-
-IF OBJECT_ID('@resultsDatabaseSchema.AFibStrokeCohort', 'U') IS NOT NULL 
-DROP TABLE @resultsDatabaseSchema.AFibStrokeCohort;
-
-CREATE TABLE @resultsDatabaseSchema.AFibStrokeCohort 
-( 
-cohort_definition_id INT, 
-subject_id BIGINT,
-cohort_start_date DATE, 
-cohort_end_date DATE
-);
-
-
-/*
-T cohort:  [PatientLevelPrediction vignette]:  T : patients who are newly 
-diagnosed with Atrial fibrillation
-- persons with a condition occurrence record of 'Atrial fibrillation' or 
-any descendants, indexed at the first diagnosis
-- who have >1095 days of prior observation before their first diagnosis
-- and have no warfarin exposure any time prior to first AFib diagnosis
-*/
-INSERT INTO @resultsDatabaseSchema.AFibStrokeCohort (cohort_definition_id, 
-subject_id, 
-cohort_start_date, 
-cohort_end_date)
-SELECT 1 AS cohort_definition_id,
-AFib.person_id AS subject_id,
-AFib.condition_start_date AS cohort_start_date,
-observation_period.observation_period_end_date AS cohort_end_date
-FROM
-(
-  SELECT person_id, min(condition_start_date) as condition_start_date
-  FROM @cdmDatabaseSchema.condition_occurrence
-  WHERE condition_concept_id IN (SELECT descendant_concept_id FROM 
-  @cdmDatabaseSchema.concept_ancestor WHERE ancestor_concept_id IN 
-  (313217 /*atrial fibrillation*/))
-  GROUP BY person_id
-) AFib
-  INNER JOIN @cdmDatabaseSchema.observation_period
-  ON AFib.person_id = observation_period.person_id
-  AND AFib.condition_start_date >= dateadd(dd,1095, 
-  observation_period.observation_period_start_date)
-  AND AFib.condition_start_date <= observation_period.observation_period_end_date
-  LEFT JOIN
-  (
-  SELECT person_id, min(drug_exposure_start_date) as drug_exposure_start_date
-  FROM @cdmDatabaseSchema.drug_exposure
-  WHERE drug_concept_id IN (SELECT descendant_concept_id FROM 
-  @cdmDatabaseSchema.concept_ancestor WHERE ancestor_concept_id IN 
-  (1310149 /*warfarin*/))
-  GROUP BY person_id
-  ) warfarin
-  ON Afib.person_id = warfarin.person_id
-  AND Afib.condition_start_date > warfarin.drug_exposure_start_date
-  WHERE warfarin.person_id IS NULL
-  ;
-  
-  /*
-  O cohort:  [PatientLevelPrediction vignette]:  O: Ischemic stroke events
-  - inpatient visits that include a condition occurrence record for 
-  'cerebral infarction' and descendants, 'cerebral thrombosis', 
-  'cerebral embolism', 'cerebral artery occlusion' 
-  */
-  INSERT INTO @resultsDatabaseSchema.AFibStrokeCohort (cohort_definition_id, 
-  subject_id, 
-  cohort_start_date, 
-  cohort_end_date)
-  SELECT 2 AS cohort_definition_id,
-  visit_occurrence.person_id AS subject_id,
-  visit_occurrence.visit_start_date AS cohort_start_date,
-  visit_occurrence.visit_end_date AS cohort_end_date
-  FROM  
-  (
-  SELECT person_id, condition_start_date
-  FROM @cdmDatabaseSchema.condition_occurrence
-  WHERE condition_concept_id IN (SELECT DISTINCT descendant_concept_id FROM 
-  @cdmDatabaseSchema.concept_ancestor WHERE ancestor_concept_id IN 
-  (443454 /*cerebral infarction*/) OR descendant_concept_id IN 
-  (441874 /*cerebral thrombosis*/, 375557 /*cerebral embolism*/, 
-  372924 /*cerebral artery occlusion*/))
-  ) stroke
-  INNER JOIN @cdmDatabaseSchema.visit_occurrence
-  ON stroke.person_id = visit_occurrence.person_id
-  AND stroke.condition_start_date >= visit_occurrence.visit_start_date
-  AND stroke.condition_start_date <= visit_occurrence.visit_end_date
-  AND visit_occurrence.visit_concept_id IN (9201, 262 /*'Inpatient Visit'  or 
-  'Emergency Room and Inpatient Visit'*/)
-  GROUP BY visit_occurrence.person_id, visit_occurrence.visit_start_date, 
-  visit_occurrence.visit_end_date
-  ;
-  
-

This is parameterized SQL which can be used by the SqlRender package. We use parameterized SQL so we do not have to pre-specify the names of the CDM and result schemas. That way, if we want to run the SQL on a different schema, we only need to change the parameter values; we do not have to change the SQL code. By also making use of translation functionality in SqlRender, we can make sure the SQL code can be run in many different environments.

-

To execute this SQL against our CDM we first need to tell R how to connect to the server. PatientLevelPrediction uses the DatabaseConnector package, which provides a function called createConnectionDetails. Type ?createConnectionDetails for the specific settings required for the various database management systems (DBMS). For example, one might connect to a PostgreSQL database using this code:

-
-  connectionDetails <- createConnectionDetails(dbms = "postgresql", 
-  server = "localhost/ohdsi", 
-  user = "joe", 
-  password = "supersecret")
-  
-  cdmDatabaseSchema <- "my_cdm_data"
-  cohortsDatabaseSchema <- "my_results"
-  cdmVersion <- "5"
-

The last three lines define the cdmDatabaseSchema and cohortsDatabaseSchema variables, as well as the CDM version. We will use these later to tell R where the data in CDM format live, where we want to create the cohorts of interest, and what version CDM is used. Note that for Microsoft SQL Server, database schemas need to specify both the database and the schema, so for example cdmDatabaseSchema <- "my_cdm_data.dbo".

-
-  library(SqlRender)
-  sql <- readSql("AfStrokeCohorts.sql")
-  sql <- renderSql(sql,
-  cdmDatabaseSchema = cdmDatabaseSchema,
-  resultsDatabaseSchema = cohortsDatabaseSchema,
-  post_time = 30,
-  pre_time = 365)$sql
-  sql <- translateSql(sql, targetDialect = connectionDetails$dbms)$sql
-  
-  connection <- connect(connectionDetails)
-  executeSql(connection, sql)
-

In this code, we first read the SQL from the file into memory. In the next line, we replace four parameter names with the actual values. We then translate the SQL into the dialect appropriate for the DBMS we already specified in the connectionDetails. Next, we connect to the server, and submit the rendered and translated SQL.

-

If all went well, we now have a table with the events of interest. We can see how many events per type:

-
-  sql <- paste("SELECT cohort_definition_id, COUNT(*) AS count",
-  "FROM @cohortsDatabaseSchema.AFibStrokeCohort",
-  "GROUP BY cohort_definition_id")
-  sql <- renderSql(sql, cohortsDatabaseSchema = cohortsDatabaseSchema)$sql
-  sql <- translateSql(sql, targetDialect = connectionDetails$dbms)$sql
-  
-  querySql(connection, sql)
-
##   cohort_definition_id  count
-## 1                    1 527616
-## 2                    2 221555
-
-
-

-Study script creation

-

In this section we assume that our cohorts have been created either by using ATLAS or a custom SQL script. We will first explain how to create an R script yourself that will execute our study as we have defined earlier.

-
-
-

-Data extraction

-

Now we can tell PatientLevelPrediction to extract all necessary data for our analysis. This is done using the FeatureExtraction package. In short, the FeatureExtraction package allows you to specify which features (covariates) need to be extracted, e.g. all conditions and drug exposures. It also supports the creation of custom covariates. For more detailed information on the FeatureExtraction package see its vignettes. For our example study we decided to use these settings:

-
-  covariateSettings <- createCovariateSettings(useDemographicsGender = TRUE,
-  useDemographicsAge = TRUE,
-  useConditionGroupEraLongTerm = TRUE,
-  useConditionGroupEraAnyTimePrior = TRUE,
-  useDrugGroupEraLongTerm = TRUE,
-  useDrugGroupEraAnyTimePrior = TRUE,
-  useVisitConceptCountLongTerm = TRUE,
-  longTermStartDays = -365,
-  endDays = -1)
-

The final step for extracting the data is to run the getPlpData function and input the connection details, the database schema where the cohorts are stored, the cohort definition ids for the cohort and outcome, and the washoutPeriod, which is the minimum number of days prior to the cohort index date that a person must have been observed to be included in the data, and finally input the previously constructed covariate settings.

-
-databaseDetails <- createDatabaseDetails(
-  connectionDetails = connectionDetails,
-  cdmDatabaseSchema = cdmDatabaseSchema,
-  cdmDatabaseName = '',
-  cohortDatabaseSchema = resultsDatabaseSchema,
-  cohortTable = 'AFibStrokeCohort',
-  cohortId = 1,
-  outcomeDatabaseSchema = resultsDatabaseSchema,
-  outcomeTable = 'AFibStrokeCohort',
-  outcomeIds = 2,
-  cdmVersion = 5
-  )
-
-# here you can define whether you want to sample the target cohort and add any
-# restrictions based on minimum prior observation, index date restrictions
-# or restricting to first index date (if people can be in target cohort multiple times)
-restrictPlpDataSettings <- createRestrictPlpDataSettings(sampleSize = 10000)
-
-  plpData <- getPlpData(
-    databaseDetails = databaseDetails, 
-    covariateSettings = covariateSettings,
-    restrictPlpDataSettings = restrictPlpDataSettings
-  )
-

Note that if the cohorts are created in ATLAS its corresponding cohort database schema needs to be selected. There are many additional parameters for the createRestrictPlpDataSettings function which are all documented in the PatientLevelPrediction manual. The resulting plpData object uses the package Andromeda (which uses SQLite) to store information in a way that ensures R does not run out of memory, even when the data are large.

-

Creating the plpData object can take considerable computing time, and it is probably a good idea to save it for future sessions. Because plpData uses Andromeda, we cannot use R’s regular save function. Instead, we’ll have to use the savePlpData() function:

-
-savePlpData(plpData, "stroke_in_af_data")
-

We can use the loadPlpData() function to load the data in a future session.
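For example, in a later session (the folder name matches the savePlpData() call above):

# reload the previously saved plpData object in a later session
plpData <- loadPlpData("stroke_in_af_data")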

-
-
-

-Additional inclusion criteria

-

To completely define the prediction problem the final study population is obtained by applying additional constraints on the two earlier defined cohorts, e.g., a minimum time at risk can be enforced (requireTimeAtRisk, minTimeAtRisk) and we can specify if this also applies to patients with the outcome (includeAllOutcomes). Here we also specify the start and end of the risk window relative to target cohort start. For example, if we like the risk window to start 30 days after the at-risk cohort start and end a year later we can set riskWindowStart = 30 and riskWindowEnd = 365. In some cases the risk window needs to start at the cohort end date. This can be achieved by setting addExposureToStart = TRUE which adds the cohort (exposure) time to the start date.

-

In Appendix 1, we demonstrate the effect of these settings on the subset of the persons in the target cohort that end up in the final study population.

-

In the example below all the settings we defined for our study are imposed:

-
-  populationSettings <- createStudyPopulationSettings(
-  washoutPeriod = 1095,
-  firstExposureOnly = FALSE,
-  removeSubjectsWithPriorOutcome = FALSE,
-  priorOutcomeLookback = 1,
-  riskWindowStart = 1,
-  riskWindowEnd = 365,
-  startAnchor =  'cohort start',
-  endAnchor =  'cohort start',
-  minTimeAtRisk = 364,
-  requireTimeAtRisk = TRUE,
-  includeAllOutcomes = TRUE
-  )
-
-
-

-Splitting the data into training/validation/testing datasets

-

When developing a prediction model using supervised learning (when you have features paired with labels for a set of patients), the first step is to design the development/internal validation process. This requires specifying how to select the model hyper-parameters, how to learn the model parameters and how to fairly evaluate the model. In general, the validation set is used to pick hyper-parameters, the training set is used to learn the model parameters and the test set is used to perform fair internal validation. However, cross-validation can be implemented to pick the hyper-parameters on the training data (so a validation data set is not required). Cross validation can also be used to estimate internal validation (so a testing data set is not required).

-

In small data the best approach for internal validation has been shown to be bootstrapping. However, in big data (many patients and many features) bootstrapping is generally not feasible. In big data our research has shown that what matters is having some form of fair evaluation (a test set or cross validation). For full details see our BMJ Open paper.

-

In the PatientLevelPrediction package, the splitSettings define how the plpData are partitioned into training/validation/testing data. Cross validation is always done, but using a test set is optional (when the data are small, it may be optimal to not use a test set). For the splitSettings we can use the type (stratified/time/subject) and testFraction parameters to split the data in a 75%-25% split and run the patient-level prediction pipeline:

-
-  splitSettings <- createDefaultSplitSetting(
-    trainFraction = 0.75,
-    testFraction = 0.25,
-    type = 'stratified',
-    nfold = 2, 
-    splitSeed = 1234
-    )
-

Note: it is possible to add a custom method to specify how the plpData are partitioned into training/validation/testing data, see vignette for custom splitting.

-
-
-

-Preprocessing the training data

-

There are numerous data processing settings that a user must specify when developing a prediction model. These are:

* Whether to under-sample or over-sample the training data (this may be useful when there is class imbalance, e.g., the outcome is very rare or very common)
* Whether to perform feature engineering or feature selection (e.g., create latent variables that are not observed in the data or reduce the dimensionality of the data)
* Whether to remove redundant features and normalize the data (this is required for some models)

-

The default sample setting does nothing; it simply returns the trainData as input, see below:

-
-  sampleSettings <- createSampleSettings()
-

However, the current package contains methods of under-sampling the non-outcome patients. To perform undersampling, the type input should be ‘underSample’ and numberOutcomestoNonOutcomes must be specified (an integer specifying the number of non-outcomes per outcome). It is possible to add any custom function for over/under sampling, see vignette for custom sampling.
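For example, a minimal sketch of an under-sampling setting with one non-outcome per outcome (the seed value is arbitrary) is:

    sampleSettings <- createSampleSettings(
      type = 'underSample',
      numberOutcomestoNonOutcomes = 1,
      sampleSeed = 1234
      )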

-

It is possible to specify a combination of feature engineering functions that take as input the trainData and output a new trainData with different features. The default feature engineering setting does nothing:

-
-  featureEngineeringSettings <- createFeatureEngineeringSettings()
-

However, it is possible to add custom feature engineering functions into the pipeline, see vignette for custom feature engineering.

-

Finally, the preprocessing setting is required. For this setting the user can define minFraction, which removes any feature observed in the training data for less than the specified fraction of patients. So, if minFraction = 0.01 then any feature that is seen in less than 1 percent of the target population is removed. The input normalize specifies whether the features are scaled between 0 and 1; this is required for certain models (e.g., LASSO logistic regression). The input removeRedundancy specifies whether features that are observed in all of the target population are removed.

-
  preprocessSettings <- createPreprocessSettings(
    minFraction = 0.01, 
    normalize = TRUE, 
    removeRedundancy = TRUE
    )
-
-
-

-Model Development

-

In the set function of an algorithm the user can specify a list of eligible values for each hyper-parameter. All possible combinations of the hyper-parameters are included in a so-called grid search using cross-validation on the training set. If a user does not specify any value then the default value is used instead.

-

For example, if we use the following settings for the gradientBoostingMachine: ntrees = c(100, 200), maxDepth = 4, the grid search will apply the gradient boosting machine algorithm with ntrees = 100 and maxDepth = 4 plus the default settings for the other hyper-parameters, and with ntrees = 200 and maxDepth = 4 plus the default settings for the other hyper-parameters. The hyper-parameters that lead to the best cross-validation performance will then be chosen for the final model. For our problem we choose to build a logistic regression model with the default hyper-parameters.

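A minimal sketch of that model setting, using the package's default LASSO logistic regression, is:

    lrModel <- setLassoLogisticRegression()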

The runPlp function requires the plpData, the outcomeId specifying the outcome being predicted, and the settings: populationSettings, splitSettings, sampleSettings, featureEngineeringSettings, preprocessSettings and modelSettings to train and evaluate the model.

-
-  lrResults <- runPlp(
-    plpData = plpData,
-    outcomeId = 2, 
-    analysisId = 'singleDemo',
-    analysisName = 'Demonstration of runPlp for training single PLP models',
-    populationSettings = populationSettings, 
-    splitSettings = splitSettings,
-    sampleSettings = sampleSettings, 
-    featureEngineeringSettings = featureEngineeringSettings, 
-    preprocessSettings = preprocessSettings,
-    modelSettings = lrModel,
-    logSettings = createLogSettings(), 
-    executeSettings = createExecuteSettings(
-      runSplitData = T, 
-      runSampleData = T, 
-      runfeatureEngineering = T, 
-      runPreprocessData = T, 
-      runModelDevelopment = T, 
-      runCovariateSummary = T
-    ), 
-    saveDirectory = file.path(getwd(), 'singlePlp')
-    )
-

Under the hood the package will now use the Cyclops package to fit a large-scale regularized regression using 75% of the data and will evaluate the model on the remaining 25%. A results data structure is returned containing information about the model, its performance etc.

-

You can save the model using:

-
-savePlpModel(lrResults$model, dirPath = file.path(getwd(), "model"))
-

You can load the model using:

-
-plpModel <- loadPlpModel(file.path(getwd(), "model"))
-

You can also save the full results structure using:

-
-savePlpResult(lrResults, location = file.path(getwd(), "lr"))
-

To load the full results structure use:

-
-lrResults <- loadPlpResult(file.path(getwd(), "lr"))
-
-
-
-
-
-

-Example 2: Angioedema in ACE inhibitor users

-
-

-Study Specification

| Definition | Value |
|---|---|
| Problem Definition | |
| Target Cohort (T) | 'Patients who are newly dispensed an ACE inhibitor' defined as the first drug record of any ACE inhibitor |
| Outcome Cohort (O) | 'Angioedema' defined as an angioedema condition record during an inpatient or ER visit |
| Time-at-risk (TAR) | 1 day till 365 days from cohort start |
| Population Definition | |
| Washout Period | 365 |
| Enter the target cohort multiple times? | No |
| Allow prior outcomes? | No |
| Start of time-at-risk | 1 day |
| End of time-at-risk | 365 days |
| Require a minimum amount of time-at-risk? | Yes (364 days) |
| Model Development | |
| Algorithm | Gradient Boosting Machine |
| Hyper-parameters | ntree: 5000, max depth: 4 or 7 or 10 and learning rate: 0.001 or 0.01 or 0.1 or 0.9 |
| Covariates | Gender, Age, Conditions (ever before, <365), Drug Groups (ever before, <365), and Visit Count |
| Data split | 75% train, 25% test. Randomly assigned by person |

According to best practices we need to write a protocol that completely specifies how we plan to execute our study. This protocol will be assessed by the governance boards of the participating data sources in your network study. For this a template could be used, but we prefer to automate this process as much as possible by adding functionality to automatically generate the study protocol from a study specification. We will discuss this in more detail later.

-
-
-

-Study implementation

-

Now that we have completely designed our study, we have to implement it. We have to generate the target and outcome cohorts and we need to develop the R code to run against our CDM that will execute the full study.

-
-

-Cohort instantiation

-

For our study we need to know when a person enters the target and outcome cohorts. This is stored in a table on the server that contains the cohort start date and cohort end date for all subjects for a specific cohort definition. This cohort table has a very simple structure as shown below:

-
* cohort_definition_id, a unique identifier for distinguishing between different types of cohorts, e.g. cohorts of interest and outcome cohorts.
* subject_id, a unique identifier corresponding to the person_id in the CDM.
* cohort_start_date, the date the subject enters the cohort.
* cohort_end_date, the date the subject leaves the cohort.

How do we fill this table according to our cohort definitions? There are two options for this:

1. Use the interactive cohort builder tool in ATLAS, which can be used to create cohorts based on inclusion criteria and will automatically populate this cohort table.

2. Write your own custom SQL statements to fill the cohort table.

Both methods are described below for our example prediction problem.

-
-
-

-ATLAS cohort builder

-
-

Target Cohort ACE inhibitors

-
-

ATLAS allows you to define cohorts interactively by specifying cohort entry and cohort exit criteria. Cohort entry criteria involve selecting one or more initial events, which determine the start date for cohort entry, and optionally specifying additional inclusion criteria which filter to the qualifying events. Cohort exit criteria are applied to each cohort entry record to determine the end date when the person’s episode no longer qualifies for the cohort. For the outcome cohort the end date is less relevant. As an example, Figure 6 shows how we created the ACE inhibitors cohort and Figure 7 shows how we created the angioedema cohort in ATLAS.

-
-

Outcome Cohort Angioedema

-
-

The T and O cohorts can be found here:

- -

In depth explanation of cohort creation in ATLAS is out of scope of this vignette but can be found on the OHDSI wiki pages (link).

-

Note that when a cohort is created in ATLAS the cohort id is needed to extract the data in R. The cohort id can be found at the top of the ATLAS screen, e.g. 1770617 in Figure 6.

-
-
-

-Custom cohorts

-

It is also possible to create cohorts without the use of ATLAS. Using custom cohort code (SQL) you can make more advanced cohorts if needed.

-

For our example study, we need to create a table to hold the cohort data and we need to create SQL code to instantiate this table for both the ACE inhibitor and angioedema cohorts. Therefore, we create a file called AceAngioCohorts.sql with the following contents:

-
/***********************************
  File AceAngioCohorts.sql
***********************************/
/*
  Create a table to store the persons in the T and O cohorts
*/

IF OBJECT_ID('@resultsDatabaseSchema.AceAngioCohort', 'U') IS NOT NULL
  DROP TABLE @resultsDatabaseSchema.AceAngioCohort;

CREATE TABLE @resultsDatabaseSchema.AceAngioCohort
(
  cohort_definition_id INT,
  subject_id BIGINT,
  cohort_start_date DATE,
  cohort_end_date DATE
);


/*
  T cohort: [PatientLevelPrediction vignette]: T: patients who are newly
  dispensed an ACE inhibitor
  - persons with a drug exposure record of any 'ACE inhibitor' or
    any descendants, indexed at the first dispensing
  - who have >364 days of prior observation before their first dispensing
*/
INSERT INTO @resultsDatabaseSchema.AceAngioCohort (cohort_definition_id,
                                                   subject_id,
                                                   cohort_start_date,
                                                   cohort_end_date)
SELECT 1 AS cohort_definition_id,
  Ace.person_id AS subject_id,
  Ace.drug_start_date AS cohort_start_date,
  observation_period.observation_period_end_date AS cohort_end_date
FROM
(
  SELECT person_id, MIN(drug_exposure_date) AS drug_start_date
  FROM @cdmDatabaseSchema.drug_exposure
  WHERE drug_concept_id IN (SELECT descendant_concept_id FROM
                            @cdmDatabaseSchema.concept_ancestor WHERE ancestor_concept_id IN
                            (1342439, 1334456, 1331235, 1373225, 1310756, 1308216, 1363749, 1341927, 1340128, 1335471 /*ace inhibitors*/))
  GROUP BY person_id
) Ace
INNER JOIN @cdmDatabaseSchema.observation_period
  ON Ace.person_id = observation_period.person_id
  AND Ace.drug_start_date >= DATEADD(dd, 364,
                                     observation_period.observation_period_start_date)
  AND Ace.drug_start_date <= observation_period.observation_period_end_date
;

/*
  O cohort: [PatientLevelPrediction vignette]: O: Angioedema
*/
INSERT INTO @resultsDatabaseSchema.AceAngioCohort (cohort_definition_id,
                                                   subject_id,
                                                   cohort_start_date,
                                                   cohort_end_date)
SELECT 2 AS cohort_definition_id,
  angioedema.person_id AS subject_id,
  angioedema.condition_start_date AS cohort_start_date,
  angioedema.condition_start_date AS cohort_end_date
FROM
(
  SELECT person_id, condition_start_date
  FROM @cdmDatabaseSchema.condition_occurrence
  WHERE condition_concept_id IN (SELECT DISTINCT descendant_concept_id FROM
                                 @cdmDatabaseSchema.concept_ancestor WHERE ancestor_concept_id IN
                                 (432791 /*angioedema*/) OR descendant_concept_id IN
                                 (432791 /*angioedema*/))
) angioedema
;

This is parameterized SQL which can be used by the SqlRender package. We use parameterized SQL so we do not have to pre-specify the names of the CDM and result schemas. That way, if we want to run the SQL on a different schema, we only need to change the parameter values; we do not have to change the SQL code. By also making use of translation functionality in SqlRender, we can make sure the SQL code can be run in many different environments.

-

To execute this sql against our CDM we first need to tell R how to connect to the server. PatientLevelPrediction uses the DatabaseConnector package, which provides a function called createConnectionDetails. Type ?createConnectionDetails for the specific settings required for the various database management systems (DBMS). For example, one might connect to a PostgreSQL database using this code:

-
-    connectionDetails <- createConnectionDetails(dbms = "postgresql", 
-                                                 server = "localhost/ohdsi", 
-                                                 user = "joe", 
-                                                 password = "supersecret")
-    
-    cdmDatabaseSchema <- "my_cdm_data"
-    cohortsDatabaseSchema <- "my_results"
-    cdmVersion <- "5"
-

The last three lines define the cdmDatabaseSchema and cohortsDatabaseSchema variables, as well as the CDM version. We will use these later to tell R where the data in CDM format live, where we want to create the cohorts of interest, and what version CDM is used. Note that for Microsoft SQL Server, database schemas need to specify both the database and the schema, so for example cdmDatabaseSchema <- "my_cdm_data.dbo".

-
-    library(SqlRender)
-    sql <- readSql("AceAngioCohorts.sql")
-    sql <- render(sql,
-                  cdmDatabaseSchema = cdmDatabaseSchema,
-                  cohortsDatabaseSchema = cohortsDatabaseSchema)
-    sql <- translate(sql, targetDialect = connectionDetails$dbms)
-    
-    connection <- connect(connectionDetails)
-    executeSql(connection, sql)
-

In this code, we first read the SQL from the file into memory. In the next line, we replace the parameter names with the actual values. We then translate the SQL into the dialect appropriate for the DBMS we already specified in the connectionDetails. Next, we connect to the server, and submit the rendered and translated SQL.

-

If all went well, we now have a table with the events of interest. We can see how many events per type:

-
-    sql <- paste("SELECT cohort_definition_id, COUNT(*) AS count",
-                 "FROM @cohortsDatabaseSchema.AceAngioCohort",
-                 "GROUP BY cohort_definition_id")
-    sql <- render(sql, cohortsDatabaseSchema = cohortsDatabaseSchema)
-    sql <- translate(sql, targetDialect = connectionDetails$dbms)
-    
-    querySql(connection, sql)
-
##   cohort_definition_id count
-## 1                    1     0
-## 2                    2     0
-
-
-

-Study script creation

-

In this section we assume that our cohorts have been created either by using ATLAS or a custom SQL script. We will first explain how to create an R script yourself that will execute our study as we have defined earlier.

-
-
-

-Data extraction

-

Now we can tell PatientLevelPrediction to extract all necessary data for our analysis. This is done using the FeatureExtraction package. In short, FeatureExtraction allows you to specify which features (covariates) need to be extracted, e.g. all conditions and drug exposures, and it also supports the creation of custom covariates. For more detailed information on the FeatureExtraction package see its vignettes. For our example study we decided to use these settings:

-
-    covariateSettings <- createCovariateSettings(useDemographicsGender = TRUE,
-                                                 useDemographicsAge = TRUE,
-                                                 useConditionGroupEraLongTerm = TRUE,
-                                                 useConditionGroupEraAnyTimePrior = TRUE,
-                                                 useDrugGroupEraLongTerm = TRUE,
-                                                 useDrugGroupEraAnyTimePrior = TRUE,
-                                                 useVisitConceptCountLongTerm = TRUE,
-                                                 longTermStartDays = -365,
-                                                 endDays = -1)
-

The final step for extracting the data is to run the getPlpData function and input the connection details, the database schema where the cohorts are stored, the cohort definition ids for the cohort and outcome, and the washoutPeriod which is the minimum number of days prior to cohort index date that the person must have been observed to be included into the data, and finally input the previously constructed covariate settings.

-
-databaseDetails <- createDatabaseDetails(
-  connectionDetails = connectionDetails,
-  cdmDatabaseSchema = cdmDatabaseSchema,
-  cohortDatabaseSchema = resultsDatabaseSchema,
-  cohortTable = 'AceAngioCohort',
-  cohortId = 1,
-  outcomeDatabaseSchema = resultsDatabaseSchema,
-  outcomeTable = 'AceAngioCohort',
-  outcomeIds = 2
-  )
-
-restrictPlpDataSettings <- createRestrictPlpDataSettings(
-  sampleSize = 10000
-  )
-
-plpData <- getPlpData(
-  databaseDetails = databaseDetails, 
-  covariateSettings = covariateSettings, 
-  restrictPlpDataSettings = restrictPlpDataSettings
-  )
-

Note that if the cohorts are created in ATLAS its corresponding cohort database schema needs to be selected. There are many additional parameters for the getPlpData function which are all documented in the PatientLevelPrediction manual. The resulting plpData object uses the package Andromeda (which uses SQLite) to store information in a way that ensures R does not run out of memory, even when the data are large.

-

Creating the plpData object can take considerable computing time, and it is probably a good idea to save it for future sessions. Because plpData uses Andromeda, we cannot use R’s regular save function. Instead, we’ll have to use the savePlpData() function:

-
-savePlpData(plpData, "angio_in_ace_data")
-

We can use the loadPlpData() function to load the data in a future session.

-
-
-

-Additional inclusion criteria

-

To completely define the prediction problem, the final study population is obtained by applying additional constraints on the two earlier defined cohorts. For example, a minimum time at risk can be enforced (requireTimeAtRisk, minTimeAtRisk) and we can specify whether this also applies to patients with the outcome (includeAllOutcomes). Here we also specify the start and end of the risk window relative to the target cohort start. For example, if we want the risk window to start 30 days after the at-risk cohort start and end a year later we can set riskWindowStart = 30 and riskWindowEnd = 365. In some cases the risk window needs to start at the cohort end date. This can be achieved by setting addExposureToStart = TRUE, which adds the cohort (exposure) time to the start date.

-

In Appendix 1, we demonstrate the effect of these settings on the subset of the persons in the target cohort that end up in the final study population.

-

In the example below all the settings we defined for our study are imposed:

-
-    populationSettings <- createStudyPopulationSettings(
-      washoutPeriod = 364,
-      firstExposureOnly = FALSE,
-      removeSubjectsWithPriorOutcome = TRUE,
-      priorOutcomeLookback = 9999,
-      riskWindowStart = 1,
-      riskWindowEnd = 365, 
-      minTimeAtRisk = 364,
-      startAnchor = 'cohort start',
-      endAnchor = 'cohort start',
-      requireTimeAtRisk = TRUE,
-      includeAllOutcomes = TRUE
-    )
-
-
-

-Splitting the data into training/validation/testing datasets

-

When developing a prediction model using supervised learning (when you have features paired with labels for a set of patients), the first step is to design the development/internal validation process. This requires specifying how to select the model hyper-parameters, how to learn the model parameters and how to fairly evaluate the model. In general, the validation set is used to pick hyper-parameters, the training set is used to learn the model parameters and the test set is used to perform fair internal validation. However, cross-validation can be implemented to pick the hyper-parameters on the training data (so a validation data set is not required). Cross validation can also be used to estimate internal validation (so a testing data set is not required).

-

In small data the best approach for internal validation has been shown to be bootstrapping. However, in big data (many patients and many features) bootstrapping is generally not feasible. In big data our research has shown that what matters is having some form of fair evaluation (a test set or cross validation). For full details see our BMJ Open paper.

-

In the PatientLevelPrediction package, the splitSettings define how the plpData are partitioned into training/validation/testing data. Cross validation is always done, but using a test set is optional (when the data are small, it may be optimal to not use a test set). For the splitSettings we can use the type (stratified/time/subject) and testFraction parameters to split the data in a 75%-25% split and run the patient-level prediction pipeline:

-
-  splitSettings <- createDefaultSplitSetting(
-    trainFraction = 0.75,
-    testFraction = 0.25,
-    type = 'stratified',
-    nfold = 2, 
-    splitSeed = 1234
-    )
-

Note: it is possible to add a custom method to specify how the plpData are partitioned into training/validation/testing data, see vignette for custom splitting.

-
-
-

-Preprocessing the training data

-

There are numerous data processing settings that a user must specify when developing a prediction model. These are:

* Whether to under-sample or over-sample the training data (this may be useful when there is class imbalance, e.g., the outcome is very rare or very common)
* Whether to perform feature engineering or feature selection (e.g., create latent variables that are not observed in the data or reduce the dimensionality of the data)
* Whether to remove redundant features and normalize the data (this is required for some models)

-

The default sample setting does nothing; it simply returns the trainData as input, see below:

-
-  sampleSettings <- createSampleSettings()
-

However, the current package contains methods of under-sampling the non-outcome patients. To perform undersampling, the type input should be ‘underSample’ and numberOutcomestoNonOutcomes must be specified (an integer specifying the number of non-outcomes per outcome). It is possible to add any custom function for over/under sampling, see vignette for custom sampling.

-

It is possible to specify a combination of feature engineering functions that take as input the trainData and output a new trainData with different features. The default feature engineering setting does nothing:

-
-  featureEngineeringSettings <- createFeatureEngineeringSettings()
-

However, it is possible to add custom feature engineering functions into the pipeline, see vignette for custom feature engineering.

-

Finally, the preprocessing setting is required. For this setting the user can define minFraction, which removes any feature observed in the training data for less than the specified fraction of patients. So, if minFraction = 0.01 then any feature that is seen in less than 1 percent of the target population is removed. The input normalize specifies whether the features are scaled between 0 and 1; this is required for certain models (e.g., LASSO logistic regression). The input removeRedundancy specifies whether features that are observed in all of the target population are removed.

-
  preprocessSettings <- createPreprocessSettings(
    minFraction = 0.01, 
    normalize = TRUE, 
    removeRedundancy = TRUE
    )
-
-
-

-Model Development

-

In the set function of an algorithm the user can specify a list of eligible values for each hyper-parameter. All possible combinations of the hyper-parameters are included in a so-called grid search using cross-validation on the training set. If a user does not specify any value then the default value is used instead.

-

For example, if we use the following settings for the gradientBoostingMachine: ntrees = c(100, 200), maxDepth = 4, the grid search will apply the gradient boosting machine algorithm with ntrees = 100 and maxDepth = 4 plus the default settings for the other hyper-parameters, and with ntrees = 200 and maxDepth = 4 plus the default settings for the other hyper-parameters. The hyper-parameters that lead to the best cross-validation performance will then be chosen for the final model. For this problem we specify a gradient boosting machine with the hyper-parameter grid from the study specification:

-
gbmModel <- setGradientBoostingMachine(
  ntrees = 5000, 
  maxDepth = c(4, 7, 10), 
  learnRate = c(0.001, 0.01, 0.1, 0.9)
  )
-

The runPlp function requires the plpData, the outcomeId specifying the outcome being predicted, and the settings: populationSettings, splitSettings, sampleSettings, featureEngineeringSettings, preprocessSettings and modelSettings to train and evaluate the model.

-
-  gbmResults <- runPlp(
-    plpData = plpData,
-    outcomeId = 2, 
-    analysisId = 'singleDemo2',
-    analysisName = 'Demonstration of runPlp for training single PLP models',
-    populationSettings = populationSettings, 
-    splitSettings = splitSettings,
-    sampleSettings = sampleSettings, 
-    featureEngineeringSettings = featureEngineeringSettings, 
-    preprocessSettings = preprocessSettings,
-    modelSettings = gbmModel,
-    logSettings = createLogSettings(), 
-    executeSettings = createExecuteSettings(
-      runSplitData = T, 
-      runSampleData = T, 
-      runfeatureEngineering = T, 
-      runPreprocessData = T, 
-      runModelDevelopment = T, 
-      runCovariateSummary = T
-    ), 
-    saveDirectory = file.path(getwd(), 'singlePlpExample2')
-    )
-

Under the hood the package will now use the R xgboost package to fit a gradient boosting machine model using 75% of the data and will evaluate the model on the remaining 25%. A results data structure is returned containing information about the model, its performance, etc.

-

You can save the model using:

-
-savePlpModel(gbmResults$model, dirPath = file.path(getwd(), "model"))
-

You can load the model using:

-
-plpModel <- loadPlpModel(file.path(getwd(), "model"))
-

You can also save the full results structure using:

-
-savePlpResult(gbmResults, location = file.path(getwd(), "gbm"))
-

To load the full results structure use:

-
-gbmResults <- loadPlpResult(file.path(getwd(), "gbm"))
-
-
-
-
-
-

-Study package creation

-

The script we created manually above can also be automatically created using a powerful feature in ATLAS. By creating a new prediction study (left menu) you can select the Target and Outcome as created in ATLAS, set all the study parameters, and then download an R package that you can use to execute your study. What is really powerful is that you can add multiple Ts, Os, covariate settings, etc. The package will then run all the combinations automatically as separate analyses. The screenshots below explain this process.

-
1. Create a new prediction study and select your target and outcome cohorts.

2. Specify one or more analysis settings.

3. Specify the training settings.

4. Specify the execution settings.
-

ATLAS can build an R package for you that will execute the full study against your CDM. The steps to do this in ATLAS are explained below.

-
1. Under Utilities you can find Download. Click on the button to review the full study specification.

   R package download functionality in ATLAS

2. You now have to review that you indeed want to run all these analyses (the Cartesian product of all the settings for each T and O combination).

   R package download functionality in ATLAS

3. If you agree, you give the package a name, and download the package as a zipfile.

4. By opening the R package in R Studio and building the package you can run the study using the execute function. There is also an example CodeToRun.R script available in the extras folder of the package with extra instructions.
-
-

-Internal validation

-

Once we execute the study, the runPlp() function returns the trained model and the evaluation of the model on the train/test sets.

-

You can interactively view the results by running: viewPlp(runPlp=lrResults). This will generate a Shiny App in your browser in which you can view all performance measures created by the framework as shown in the figure below.

-
-Summary of all the performance measures of the analyses -
-Furthermore, many interactive plots are available in the Shiny App, for example the ROC curve in which you can move over the plot to see the threshold and the corresponding sensitivity and specificity values. -
-Example of the interactive ROC curve -
-

To generate and save all the evaluation plots to a folder run the following code:

-
-plotPlp(lrResults, dirPath = getwd())
-

The plots are described in more detail in the next sections.

-
-
-

-Discrimination

-

The Receiver Operating Characteristics (ROC) plot shows the sensitivity against 1-specificity on the test set. The plot illustrates how well the model is able to discriminate between the people with the outcome and those without. The dashed diagonal line is the performance of a model that randomly assigns predictions. The higher the area under the ROC plot the better the discrimination of the model. The plot is created by changing the probability threshold to assign the positive class.

-
-Receiver Operating Characteristic Plot -
-

## Calibration

-

The calibration plot shows how close the predicted risk is to the observed risk. The diagonal dashed line thus indicates a perfectly calibrated model. The ten (or fewer) dots represent the mean predicted values for each quantile plotted against the observed fraction of people in that quantile who had the outcome (observed fraction). The straight black line is the linear regression using these 10 plotted quantile mean predicted vs observed fraction points. The straight vertical lines represent the 95% lower and upper confidence intervals of the slope of the fitted line.

-
-Calibration Plot -
-
-
-
-

-Smooth Calibration

-

Similar to the traditional calibration shown above, the smooth calibration plot shows the relationship between predicted and observed risk. The major difference is that the smooth fit allows for a more fine-grained examination of this relationship. Whereas the traditional plot will be heavily influenced by the areas with the highest density of data, the smooth plot will provide the same information for this region as well as a more accurate interpretation of areas with lower density. The plot also contains information on the distribution of the outcomes relative to predicted risk.

-

However, the increased information gain comes at a computational cost. It is recommended to use the traditional plot for examination and then to produce the smooth plot for final versions. To create the smooth calibration plot you have to run the following command:

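For example, a minimal sketch assuming the Example 1 result object (lrResults) and loess smoothing:

    plotSmoothCalibration(
      plpResult = lrResults,
      smooth = 'loess'
      )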

See the help function for more information, on how to set the smoothing method etc.

-

The example below is from another study that better demonstrates the impact of using a smooth calibration plot. The default line fit would not highlight the miscalibration at the lower predicted probability levels that well.

-
-Smooth Calibration plot -
-

## Preference distribution

-

The preference distribution plots are the preference score distributions corresponding to i) people in the test set with the outcome (red) and ii) people in the test set without the outcome (blue).

-
-Preference Plot -
-

## Predicted probability distribution

-

The prediction distribution box plots are for the predicted risks of the people in the test set with the outcome (class 1: blue) and without the outcome (class 0: red).

-

The box plots in the Figure show that the predicted probability of the outcome is indeed higher for those with the outcome, but there is also overlap between the two distributions, which leads to an imperfect discrimination.

-
-Prediction Distribution Box Plot -
-

## Test-Train similarity

-

The test-train similarity is assessed by plotting the mean covariate values in the train set against those in the test set for people with and without the outcome.

-

The results for our example look very promising since the mean values of the covariates are on the diagonal.

-
-Similarity plots of train and test set -
-

## Variable scatter plot

-

The variable scatter plot shows the mean covariate value for the people with the outcome against the mean covariate value for the people without the outcome. The color of the dots corresponds to the inclusion (green) or exclusion (blue) of the covariate in the model, respectively. It is highly recommended to use the Shiny App since this allows you to hover over a covariate to show more details (name, value, etc.).

-

The plot shows that the mean of most of the covariates is higher for subjects with the outcome compared to those without.

-
-Variable scatter plot -
-

## Precision recall

-

Precision (P) is defined as the number of true positives (Tp) over the number of true positives plus the number of false positives (Fp).

-
-P <- Tp/(Tp + Fp)
-

Recall (R) is defined as the number of true positives (Tp) over the number of true positives plus the number of false negatives (Fn).

-
-R <- Tp/(Tp + Fn)
-

These quantities are also related to the (F1) score, which is defined as the harmonic mean of precision and recall.

-
-F1 <- 2 * P * R/(P + R)
-

Note that the precision can either decrease or increase if the threshold is lowered. Lowering the threshold of a classifier may increase the denominator, by increasing the number of results returned. If the threshold was previously set too high, the new results may all be true positives, which will increase precision. If the previous threshold was about right or too low, further lowering the threshold will introduce false positives, decreasing precision.

-

For Recall the denominator does not depend on the classifier threshold (Tp+Fn is a constant). This means that lowering the classifier threshold may increase recall, by increasing the number of true positive results. It is also possible that lowering the threshold may leave recall unchanged, while the precision fluctuates.
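As a small numeric illustration (the counts below are made-up numbers, not study results):

    # hypothetical counts for illustration only
    Tp <- 90; Fp <- 10; Fn <- 60
    P  <- Tp / (Tp + Fp)       # 0.9
    R  <- Tp / (Tp + Fn)       # 0.6
    F1 <- 2 * P * R / (P + R)  # 0.72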

-
-Precision Recall Plot -
-

## Demographic summary

-

This plot shows for females and males the expected and observed risk in different age groups together with a confidence area.

-

The results show that our model is well calibrated across gender and age groups.

-
-Demographic Summary Plot -
-

# External validation

-

We recommend to always perform external validation, i.e. apply the final model to as many new datasets as feasible and evaluate its performance.

-
# load the trained model
plpModel <- loadPlpModel(file.path(getwd(), 'model'))

# add details of new database
validationDatabaseDetails <- createDatabaseDetails()

# to externally validate the model and perform recalibration run:
externalValidateDbPlp(
  plpModel = plpModel,
  validationDatabaseDetails = validationDatabaseDetails,
  validationRestrictPlpDataSettings = plpModel$settings$plpDataSettings,
  settings = createValidationSettings(
    recalibrate = 'weakRecalibration'
    ),
  outputFolder = getwd()
)
-

This will extract the new plpData from the specified schemas and cohort tables. It will then apply the same population settings and the trained plp model. Finally, it will evaluate the performance and return the standard output as validation$performanceEvaluation and it will also return the prediction on the population as validation$prediction. They can be inserted into the shiny app for viewing the model and validation by running: viewPlp(runPlp=plpResult, validatePlp=validation ).
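For example, if the output of the externalValidateDbPlp() call above is assigned to an object called validation and the development result is lrResults, the two can be viewed together with:

    viewPlp(runPlp = lrResults, validatePlp = validation)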

-
-
-
-
-

-Other functionality

-

The package has much more functionality than described in this vignette and contributions have been made by many people in the OHDSI community. The table below provides an overview:

| Functionality | Description | Vignette |
|---|---|---|
| Building Multiple Models | This vignette describes how you can run multiple models automatically | Vignette |
| Custom Models | This vignette describes how you can add your own custom algorithms in the framework | Vignette |
| Custom Splitting Functions | This vignette describes how you can add your own custom training/validation/testing splitting functions in the framework | Vignette |
| Custom Sampling Functions | This vignette describes how you can add your own custom sampling functions in the framework | Vignette |
| Custom Feature Engineering/Selection | This vignette describes how you can add your own custom feature engineering and selection functions in the framework | Vignette |
| Ensemble models | This vignette describes how you can use the framework to build ensemble models, i.e. combine multiple models in a super learner | Vignette |
| Learning curves | Learning curves assess the effect of training set size on model performance by training a sequence of prediction models on successively larger subsets of the training set. A learning curve plot can also help in diagnosing a bias or variance problem as explained below. | Vignette |
-
-

-Demos

-

We have added several demos in the package that run on simulated data:

-
-# Show all demos in our package: 
-demo(package = "PatientLevelPrediction")
-
-# For example, to run the SingleModelDemo that runs Lasso and shows you how to run the Shiny App use this call
-demo("SingleModelDemo", package = "PatientLevelPrediction")
-
-
-
-

-Acknowledgments

-

Considerable work has been dedicated to provide the PatientLevelPrediction package.

-
-citation("PatientLevelPrediction")
-
## 
-## To cite PatientLevelPrediction in publications use:
-## 
-## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design
-## and implementation of a standardized framework to generate and evaluate
-## patient-level prediction models using observational healthcare data."
-## _Journal of the American Medical Informatics Association_, *25*(8),
-## 969-975. <URL: https://doi.org/10.1093/jamia/ocy032>.
-## 
-## A BibTeX entry for LaTeX users is
-## 
-##   @Article{,
-##     author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek},
-##     title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data},
-##     journal = {Journal of the American Medical Informatics Association},
-##     volume = {25},
-##     number = {8},
-##     pages = {969-975},
-##     year = {2018},
-##     url = {https://doi.org/10.1093/jamia/ocy032},
-##   }
-

Further, PatientLevelPrediction makes extensive use of the Cyclops package.

-
-citation("Cyclops")
-
## 
-## To cite Cyclops in publications use:
-## 
-## Suchard MA, Simpson SE, Zorych I, Ryan P, Madigan D (2013). "Massive
-## parallelization of serial inference algorithms for complex generalized
-## linear models." _ACM Transactions on Modeling and Computer Simulation_,
-## *23*, 10. <URL: https://dl.acm.org/doi/10.1145/2414416.2414791>.
-## 
-## A BibTeX entry for LaTeX users is
-## 
-##   @Article{,
-##     author = {M. A. Suchard and S. E. Simpson and I. Zorych and P. Ryan and D. Madigan},
-##     title = {Massive parallelization of serial inference algorithms for complex generalized linear models},
-##     journal = {ACM Transactions on Modeling and Computer Simulation},
-##     volume = {23},
-##     pages = {10},
-##     year = {2013},
-##     url = {https://dl.acm.org/doi/10.1145/2414416.2414791},
-##   }
-

Please reference this paper if you use the PLP Package in your work:

-

Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.

-

This work is supported in part through the National Science Foundation grant IIS 1251151.

-
-
-
-

-Appendix 1: Study population settings details

-

In the figures below the effect is shown of the removeSubjectsWithPriorOutcome, requireTimeAtRisk, and includeAllOutcomes booleans on the final study population. We start with a Target Cohort with firstExposureOnly = false and we require a washout period = 1095. We then subset the target cohort based on additional constraints. The final study population in the Venn diagrams below is colored green.

-
1. Require minimum time-at-risk for all persons in the target cohort.

2. Require minimum time-at-risk for the target cohort, except for persons with outcomes during time-at-risk.

3. Include all persons in the target cohort, exclude persons with prior outcomes.

4. Require minimum time-at-risk for the target cohort, except for persons with outcomes during time-at-risk, exclude persons with prior outcomes.

5. Include all persons in the target cohort, exclude persons with prior outcomes.

6. Include all persons in the target cohort.
diff --git a/docs/articles/CreatingLearningCurves.html b/docs/articles/CreatingLearningCurves.html
deleted file mode 100644

Creating Learning Curves • PatientLevelPrediction
-Introduction

-

This vignette describes how you can use the Observational Health Data Sciences and Informatics (OHDSI) PatientLevelPrediction package to create learning curves. This vignette assumes you have read and are comfortable with building patient level prediction models as described in the BuildingPredictiveModels vignette.

-

Prediction models will show overly optimistic performance when predicting on the same data as used for training. Therefore, best practice is to partition our data into a training set and testing set. We then train our prediction model on the training set portion and assess its ability to generalize to unseen data by measuring its performance on the testing set.

-

Learning curves assess the effect of training set size on model performance by training a sequence of prediction models on successively larger subsets of the training set. A learning curve plot can also help in diagnosing a bias or variance problem as explained below.

-
-

Learning curve example.

-
-

Figure 1 shows an example of a learning curve plot in which the vertical axis represents the model performance and the horizontal axis the training set size. If the training set size is small, the performance on the training set is high, because a model can often be fitted well to a limited number of training examples. At the same time, the performance on the testing set will be poor, because the model trained on such a limited number of training examples will not generalize well to unseen data in the testing set. As the training set size increases, the performance of the model on the training set will decrease. It becomes more difficult for the model to find a good fit through all the training examples. Also, the model will be trained on a more representative portion of training examples, making it generalize better to unseen data. This can be observed by the increasing testing set performance.

-

The learning curve can help us in diagnosing bias and variance problems with our classifier which will provide guidance on how to further improve our model. We can observe high variance (overfitting) in a prediction model if it performs well on the training set, but poorly on the testing set (Figure 2). Adding additional data is a common approach to counteract high variance. From the learning curve it becomes apparent, that adding additional data may improve performance on the testing set a little further, as the learning curve has not yet plateaued and, thus, the model is not saturated yet. Therefore, adding more data will decrease the gap between training set and testing set, which is the main indicator for a high variance problem.

-
-

Prediction model suffering from high variance.

-
-

Furthermore, we can observe high bias (underfitting) if a prediction model performs poorly on the training set as well as on the testing set (Figure 3). The learning curves of training set and testing set have flattened on a low performance with only a small gap in between them. Adding additional data will in this case have little to no impact on the model performance. Choosing another prediction algorithm that can find more complex (for example non-linear) relationships in the data may be an alternative approach to consider in this high bias situation.

-
-

Prediction model suffering from high bias.

-
-
-
-

-Creating the learning curve

-

Use the PatientLevelPrediction package to create a plpData object. Alternatively, you can make use of the data simulator. The following code snippet creates data for 12000 patients.

-
-set.seed(1234)
-data(plpDataSimulationProfile)
-sampleSize <- 12000
-plpData <- simulatePlpData(
-  plpDataSimulationProfile,
-  n = sampleSize
-)
-

Specify the population settings (this does additional exclusions such as requiring minimum prior observation or no prior outcome as well as specifying the time-at-risk period to enable labels to be created):

-
-populationSettings <- createStudyPopulationSettings(
-  binary = TRUE,
-  firstExposureOnly = FALSE,
-  washoutPeriod = 0,
-  removeSubjectsWithPriorOutcome = FALSE,
-  priorOutcomeLookback = 99999,
-  requireTimeAtRisk = FALSE,
-  minTimeAtRisk = 0,
-  riskWindowStart = 0,
-  riskWindowEnd = 365,
-  verbosity = "INFO"
-)
-

Specify the prediction algorithm to be used.

-
-# Use LASSO logistic regression
-modelSettings <- setLassoLogisticRegression()
-

Specify the split settings and a sequence of training set fractions (these override the trainFraction in the splitSettings). Alternatively, instead of trainFractions, you can provide a sequence of training events (trainEvents). This is recommended, because our research has shown that the number of events is the important determinant of model performance. Make sure that your training set contains the number of events specified.

-
-splitSettings = createDefaultSplitSetting(
-  testFraction = 0.2,  
-  type = 'stratified',
-  splitSeed = 1000
-  )
-
-trainFractions <- seq(0.1, 0.8, 0.1) # Create eight training set fractions
-
-# alternatively use a sequence of training events by uncommenting the line below.
-# trainEvents <- seq(100, 5000, 100)
-

Create the learning curve object.

-
-learningCurve <- createLearningCurve(
-  plpData = plpData,
-  outcomeId = 2,  
-  parallel = T,
-  cores = 4,
-  modelSettings = modelSettings,
-  saveDirectory = getwd(),
-  analysisId = 'learningCurve',
-  populationSettings = populationSettings,
-  splitSettings = splitSettings,
-  trainFractions = trainFractions,
-  trainEvents = NULL,
-  preprocessSettings = createPreprocessSettings(
-    minFraction = 0.001,
-    normalize = T
-  ),
-  executeSettings = createExecuteSettings(
-    runSplitData = T, 
-    runSampleData = F,
-    runfeatureEngineering = F,
-    runPreprocessData = T,
-    runModelDevelopment = T,
-    runCovariateSummary = F
-    )
-)
-

Plot the learning curve object (Figure 4). Specify one of the available metrics: AUROC, AUPRC, sBrier. Moreover, you can specify what metric to put on the abscissa: number of observations or number of events. We recommend the latter, because the number of events is the key determinant of model performance and it allows you to better compare learning curves across different prediction problems and databases.

-
-plotLearningCurve(
-  learningCurve,
-  metric = 'AUROC',
-  abscissa = 'events',
-  plotTitle = 'Learning Curve',
-  plotSubtitle = 'AUROC performance'
-)
-
-

Learning curve plot.

-
-
-
-

-Parallel processing

-

The learning curve object can be created in parallel, which can reduce computation time significantly. Whether to run the code in parallel or not is specified using the parallel input. Currently this functionality is only available for LASSO logistic regression and gradient boosting machines. Depending on the number of parallel workers it may require a significant amount of memory. We advise to use the parallelized learning curve function for parameter search and exploratory data analysis.

-

When running in parallel, R will find the number of available processing cores automatically and register the required parallel backend. Alternatively, you can provide the number of cores you wish to use via the cores input.
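For example, one common way to choose the number of cores (a sketch using the parallel package that ships with base R) is to leave one core free for the operating system:

    # use all but one of the available cores
    cores <- max(1, parallel::detectCores() - 1)

This value can then be passed to the cores argument of createLearningCurve().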

-
-
-

-Demo

-

We have added a demo of the learning curve:

-
-# Show all demos in our package: 
- demo(package = "PatientLevelPrediction")
-
-# Run the learning curve
- demo("LearningCurveDemo", package = "PatientLevelPrediction")
-

Do note that running this demo can take a considerable amount of time (15 min on Quad core running in parallel)!

-
-
-

-Publication

-

A publication titled ‘How little data do we need for patient-level prediction?’ uses the learning curve functionality in this package and can be accessed as preprint in the arXiv archives at https://arxiv.org/abs/2008.07361.

-
-
-

-Acknowledgments

-

Considerable work has been dedicated to provide the PatientLevelPrediction package.

-
-citation("PatientLevelPrediction")
-
## 
-## To cite PatientLevelPrediction in publications use:
-## 
-## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design
-## and implementation of a standardized framework to generate and evaluate
-## patient-level prediction models using observational healthcare data."
-## _Journal of the American Medical Informatics Association_, *25*(8),
-## 969-975. <URL: https://doi.org/10.1093/jamia/ocy032>.
-## 
-## A BibTeX entry for LaTeX users is
-## 
-##   @Article{,
-##     author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek},
-##     title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data},
-##     journal = {Journal of the American Medical Informatics Association},
-##     volume = {25},
-##     number = {8},
-##     pages = {969-975},
-##     year = {2018},
-##     url = {https://doi.org/10.1093/jamia/ocy032},
-##   }
-

Please reference this paper if you use the PLP Package in your work:

-

Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.

-
-
- - - -
- - - - -
- - - - - - diff --git a/docs/articles/CreatingLearningCurves_files/header-attrs-2.11/header-attrs.js b/docs/articles/CreatingLearningCurves_files/header-attrs-2.11/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/CreatingLearningCurves_files/header-attrs-2.11/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/CreatingLearningCurves_files/header-attrs-2.7/header-attrs.js b/docs/articles/CreatingLearningCurves_files/header-attrs-2.7/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/CreatingLearningCurves_files/header-attrs-2.7/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/CreatingNetworkStudies.html b/docs/articles/CreatingNetworkStudies.html deleted file mode 100644 index c5febb13c..000000000 --- a/docs/articles/CreatingNetworkStudies.html +++ /dev/null @@ -1,242 +0,0 @@ - - - - - - - -Making patient-level predictive network study packages • PatientLevelPrediction - - - - - - - - - - -
-
-
-
-

-Introduction

-

The OHDSI Patient Level Prediction (PLP) package provides the framework to implement prediction models at scale. This can range from developing a large number of models across sites (methodology and study design insight) to extensive external validation of existing models in the OHDSI PLP framework (model insight). This vignette describes how you can use the PatientLevelPrediction package to create a network study package.

-
-
-

-Useful publication

-

The open access publication A standardized analytics pipeline for reliable and rapid development and validation of prediction models using observational health data details the process used to develop and validate prediction models using the OHDSI prediction framework and tools. This publication describes each of the steps and then demonstrates these by focusing on predicting death in those who have covid-19.

-
-
-

-Main steps for running a network study

-
-

-Step 1 – developing the study

-
    -
  • Design the study: target/outcome cohort logic, concept sets for medical definitions, and settings for developing a new model or for validating existing models added to the framework. Suggestion: look in the literature for validated definitions.
  • -
  • Write a protocol that motivates the study and provides full details (sufficient for people to replicate the study in the future).
  • -
  • Write an R package for implementing the study across diverse computational environments [see guidance below for structure of package and use the skeleton github package here: https://github.com/OHDSI/SkeletonPredictionStudy ]
  • -
-
-
-

-Step 2 – implementing the study part 1

-
    -
  • Get contributors to install the package and dependencies. Ensure the package is installed correctly for each contributor by asking them to run the checkInstall functions (as specified in the InstallationGuide).
  • -
  • Get contributors to run the createCohort function to inspect the target/outcome definitions (see the sketch after this list for what a contributor session might look like). If the definitions are not suitable for a site, go back to step 1 and revise the cohort definitions.
  • -
-
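
For the two bullet points above, a contributor's session might look roughly like the sketch below. This is a hypothetical illustration only: the study package name 'MyNetworkStudy', the checkInstall/createCohort argument names and the schema values are assumptions, not part of this vignette.

-
-# hypothetical contributor session for the two steps described above
-remotes::install_github('OHDSI/MyNetworkStudy')   # assumed package name
-library(MyNetworkStudy)
-
-# confirm the installation is correct (see the InstallationGuide)
-checkInstall(connectionDetails = connectionDetails)
-
-# create and inspect the target/outcome cohorts (assumed arguments)
-createCohort(connectionDetails = connectionDetails,
-             cdmDatabaseSchema = 'my_cdm.dbo',
-             cohortDatabaseSchema = 'my_results.dbo',
-             cohortTable = 'cohort')
-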
-
-

-Step 3 – implementing the study part 2 (make sure the package is functioning as planned and the definitions are valid across sites)

-
    -
  • Get contributors to run the main.R with the settings configured to their environment
  • -
  • Get the contributors to submit the results
  • -
-
-
-

-Step 4 – Publication

-

The study creator has the first option to be first author; if he/she does not wish to be first author, he/she can pick the most suitable person from the contributors. All contributors will be listed as authors on the paper. The last author will be the person who led/managed the study; if this was the first author, then the first author can pick the most suitable last author. All authors between the first and last author will be ordered alphabetically by last name.

-
-
-
-

-Package Skeleton - File Structure

-
    -
  • DESCRIPTION: This file describes the R package and the dependencies
  • -
  • NAMESPACE: This file is created automatically by Roxygen
  • -
  • Readme.md: This file should provide the step by step guidance on implementing the package
  • -
  • R
  • -
  • helpers.r: all the custom functions used by the package should be in this file (e.g., checkInstall)
  • -
  • main.r: this file will call the functions in helpers.r to execute the full study
  • -
  • submit.r: this file will be called at the end to submit the compressed folder to the study creator/manager.
  • -
  • Man: this folder will contain the documentation for the functions in helpers.r (this should be automatically generated by roxygen)
  • -
  • Inst
  • -
  • sql/sql_server: contains targetCohort (the parameterised target cohort SQL code) and outcomeCohort (the parameterised outcome cohort SQL code)
  • -
  • plp_models: place any PLP models here
  • -
  • Extras
  • -
-
-
- - - - - - diff --git a/docs/articles/CreatingNetworkStudies_files/header-attrs-2.11/header-attrs.js b/docs/articles/CreatingNetworkStudies_files/header-attrs-2.11/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/CreatingNetworkStudies_files/header-attrs-2.11/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/CreatingNetworkStudies_files/header-attrs-2.7/header-attrs.js b/docs/articles/CreatingNetworkStudies_files/header-attrs-2.7/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/CreatingNetworkStudies_files/header-attrs-2.7/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/CreatingShinyApp.html b/docs/articles/CreatingShinyApp.html deleted file mode 100644 index 838fedfa8..000000000 --- a/docs/articles/CreatingShinyApp.html +++ /dev/null @@ -1,456 +0,0 @@ - - - - - - - -Creating Shiny App • PatientLevelPrediction - - - - - - - - - - -
-
-
-

-Introduction

-

In this vignette we will show, with example code, how to create a shiny app and make it available online for other researchers around the world to explore.

-

There are two ways to create the shiny app: 1) using the Atlas-generated prediction R package, or 2) manually using the PatientLevelPrediction functions in a script

-

We assume you have experience with using the OHDSI PatientLevelPrediction package to develop and externally validate prediction models using data in the OMOP CDM. If you do not have experience with this, please first read our general BuildingPredictiveModels vignette.

-
-
-

-Atlas Development Shiny App

-
-

-Step 1: Run the model development package to get results

-

To create a shiny app project via the Atlas auto-generated prediction R package that you named ‘myPackage’, you need to run the execute function:

-
-library(myPackage)
-myPackage::execute(connectionDetails = connectionDetails,
-        cdmDatabaseSchema = 'myDatabaseSchema.dbo',
-        cdmDatabaseName = 'MyDatabase',
-        cohortDatabaseSchema = 'myDatabaseSchema.ohdsi_results',
-        cohortTable = 'cohort',
-        outputFolder = 'C:/myResults',
-        createProtocol = F,
-        createCohorts = F,
-        runAnalyses = T,
-        createResultsDoc = F,
-        packageResults = F,
-        createValidationPackage = F, 
-        minCellCount= 5,
-        createShiny = F,
-        createJournalDocument = F,
-        analysisIdDocument = 1)
-

This will extract data based on the settings you supplied in the Atlas prediction design from cohort tables already generated in your CDM database schema. The PatientLevelPrediction framework will then run and develop/evaluate models saving the results to the location specified by outputFolder (e.g., ‘C:/myResults’).

-
-
-

-Step 2: Create the shiny app

-

To create a shiny app project with these results you can then simply run:

-
-myPackage::execute(connectionDetails = connectionDetails,
-        cdmDatabaseSchema = 'myDatabaseSchema.dbo',
-        cdmDatabaseName = 'MyDatabase',
-        cohortDatabaseSchema = 'myDatabaseSchema.ohdsi_results',
-        cohortTable = 'cohort',
-        outputFolder = 'C:/myResults',
-        minCellCount= 5,
-        createShiny = T)
-

making sure the outputFolder is the same location used when you ran the analysis. This code populates a shiny app project with the results but removes sensitive data such as connection settings, the cdmDatabaseSchema setting, the prediction matrix and any sensitive counts less than ‘minCellCount’ from the covariate summary and performance evaluation.

-

The shiny app project populated with the model development results can then be found at ‘[outputFolder]/ShinyApp’ e.g., ‘C:/myResults/ShinyApp’.

- -
-
-

-Step 3: Sharing the shiny app

-

Once you are happy with your app, you can publish it onto https://data.ohdsi.org by adding the folder ‘ShinyApp’ to the OHDSI GitHub ShinyDeploy repository (https://github.com/OHDSI/ShinyDeploy/). Continuing the example, we would copy the folder ‘[outputFolder]/ShinyApp’ and paste it into the local GitHub clone of ShinyDeploy. We recommend renaming the folder from ‘ShinyApp’ to a name that describes your prediction, e.g., ‘StrokeInAF’. Then commit the changes and make a pull request to ShinyDeploy. Once accepted, your shiny app will be viewable at ‘https://data.ohdsi.org’. If you committed the folder named ‘StrokeInAF’ then the shiny app will be hosted at ‘https://data.ohdsi.org/StrokeInAF’.

-
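
As a rough sketch (not from the original vignette), the copy and rename steps could also be scripted in R; the local clone location ‘C:/git/ShinyDeploy’ and the study name ‘StrokeInAF’ are assumptions for illustration, and the commit/pull request steps are still done with git:

-
-# copy the populated shiny app into a local clone of OHDSI/ShinyDeploy (assumed path)
-file.copy(from = 'C:/myResults/ShinyApp',
-          to = 'C:/git/ShinyDeploy',
-          recursive = TRUE)
-
-# rename the folder so it describes the prediction problem
-file.rename(from = file.path('C:/git/ShinyDeploy', 'ShinyApp'),
-            to = file.path('C:/git/ShinyDeploy', 'StrokeInAF'))
-
-# then commit the new folder and open a pull request to OHDSI/ShinyDeploy using git
-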
-
-
-

-Atlas External Validation

-

To include external validation results you can use the Atlas generated R study package to create the external validation package:

-
-myPackage::execute(connectionDetails = connectionDetails,
-        cdmDatabaseSchema = 'myDatabaseSchema.dbo',
-        cdmDatabaseName = 'MyDatabase',
-        cohortDatabaseSchema = 'myDatabaseSchema.ohdsi_results',
-        cohortTable = 'cohort',
-        outputFolder = 'C:/myResults',
-        createValidationPackage = T)
-

This will create a new R package inside the ‘outputFolder’ location with the word ‘Validation’ appended to the name of your development package. For example, if your ‘outputFolder’ was ‘C:/myResults’ and your development package was named ‘myPackage’ then the validation package will be found at: ‘C:/myResults/myPackageValidation’. When running the validation package make sure to set the ‘outputFolder’ to the Validation folder within your model development outputFolder location:

-
-myPackageValidation::execute(connectionDetails = connectionDetails,
-                 databaseName = databaseName,
-                 cdmDatabaseSchema = cdmDatabaseSchema,
-                 cohortDatabaseSchema = cohortDatabaseSchema,
-                 oracleTempSchema = oracleTempSchema,
-                 cohortTable = cohortTable,
-                 outputFolder = 'C:/myResults/Validation',
-                 createCohorts = T,
-                 runValidation = T,
-                 packageResults = F,
-                 minCellCount = 5,
-                 sampleSize = NULL)
-

Now you can rerun Steps 2-3 to populate the shiny app project that will also include the validation results (as long as the validation results are in the Validation folder found in the Step 1 outputFolder location e.g., in ‘C:/myResults/Validation’).

-
-
-

-Combining multiple atlas results into one shiny app:

-

The code below can be used to combine multiple Atlas packages’ results into one shiny app:

-
-populateMultipleShinyApp <- function(shinyDirectory,
-                             resultDirectory,
-                             minCellCount = 10,
-                             databaseName = 'sharable name of development data'){
-  
-  #check inputs
-  if(missing(shinyDirectory)){
-    shinyDirectory <- system.file("shiny", "PLPViewer", package = "SkeletonPredictionStudy")
-  }
-  if(missing(resultDirectory)){
-    stop('Need to enter the resultDirectory')
-  }
-  
-
-    for(i in 1:length(resultDirectory)){
-      if(!dir.exists(resultDirectory[i])){
-        stop(paste('resultDirectory ',i,' does not exist'))
-      }
-    }
-  
-  outputDirectory <- file.path(shinyDirectory,'data')
-  
-  # create the shiny data folder
-  if(!dir.exists(outputDirectory)){
-    dir.create(outputDirectory, recursive = T)
-  }
-  
-  
-  # need to edit settings ...
-  files <- c()
-  for(i in 1:length(resultDirectory)){
-  # copy the settings csv
-  file <- utils::read.csv(file.path(resultDirectory[i],'settings.csv'))
-  file$analysisId <- 1000*as.double(file$analysisId)+i
-  files <- rbind(files, file)
-  }
-  utils::write.csv(files, file.path(outputDirectory,'settings.csv'), row.names = F)
-  
-  for(i in 1:length(resultDirectory)){
-  # copy each analysis as a rds file and copy the log
-  files <- dir(resultDirectory[i], full.names = F)
-  files <- files[grep('Analysis', files)]
-  for(file in files){
-    
-    if(!dir.exists(file.path(outputDirectory,paste0('Analysis_',1000*as.double(gsub('Analysis_','',file))+i)))){
-      dir.create(file.path(outputDirectory,paste0('Analysis_',1000*as.double(gsub('Analysis_','',file))+i)))
-    }
-    
-    if(dir.exists(file.path(resultDirectory[i],file, 'plpResult'))){
-      res <- PatientLevelPrediction::loadPlpResult(file.path(resultDirectory[i],file, 'plpResult'))
-      res <- PatientLevelPrediction::transportPlp(res, n= minCellCount, 
-                                                  save = F, dataName = databaseName[i])
-      
-      res$covariateSummary <- res$covariateSummary[res$covariateSummary$covariateValue!=0,]
-      covSet <- res$model$metaData$call$covariateSettings
-      res$model$metaData <- NULL
-      res$model$metaData$call$covariateSettings <- covSet
-      res$model$predict <- NULL
-      if(!is.null(res$performanceEvaluation$evaluationStatistics)){
-      res$performanceEvaluation$evaluationStatistics[,1] <- paste0('Analysis_',1000*as.double(gsub('Analysis_','',file))+i)
-      } else{
-        writeLines(paste0(resultDirectory[i],file, '-ev'))
-      }
-      if(!is.null(res$performanceEvaluation$thresholdSummary)){
-      res$performanceEvaluation$thresholdSummary[,1] <- paste0('Analysis_',1000*as.double(gsub('Analysis_','',file))+i)
-      }else{
-        writeLines(paste0(resultDirectory[i],file, '-thres'))
-      }
-      if(!is.null(res$performanceEvaluation$demographicSummary)){
-      res$performanceEvaluation$demographicSummary[,1] <- paste0('Analysis_',1000*as.double(gsub('Analysis_','',file))+i)
-      } else{
-        writeLines(paste0(resultDirectory[i],file, '-dem'))
-      }
-      if(!is.null(res$performanceEvaluation$calibrationSummary)){
-      res$performanceEvaluation$calibrationSummary[,1] <- paste0('Analysis_',1000*as.double(gsub('Analysis_','',file))+i)
-      }else{
-        writeLines(paste0(resultDirectory[i],file, '-cal'))
-      }
-      if(!is.null(res$performanceEvaluation$predictionDistribution)){
-      res$performanceEvaluation$predictionDistribution[,1] <- paste0('Analysis_',1000*as.double(gsub('Analysis_','',file))+i)
-      }else{
-        writeLines(paste0(resultDirectory[i],file, '-dist'))
-      }
-      saveRDS(res, file.path(outputDirectory,paste0('Analysis_',1000*as.double(gsub('Analysis_','',file))+i), 'plpResult.rds'))
-    }
-    if(file.exists(file.path(resultDirectory[i],file, 'plpLog.txt'))){
-      file.copy(from = file.path(resultDirectory[i],file, 'plpLog.txt'), 
-                to = file.path(outputDirectory,paste0('Analysis_',1000*as.double(gsub('Analysis_','',file))+i), 'plpLog.txt'))
-    }
-  }
-  }
-  
-  
-  
-  for(i in 1:length(resultDirectory)){
-  # copy any validation results
-  if(dir.exists(file.path(resultDirectory[i],'Validation'))){
-    valFolders <-  dir(file.path(resultDirectory[i],'Validation'), full.names = F)
-    
-    if(length(valFolders)>0){
-      # move each of the validation rds
-      for(valFolder in valFolders){
-        
-        # get the analysisIds
-        valSubfolders <- dir(file.path(resultDirectory[i],'Validation',valFolder), full.names = F)
-        if(length(valSubfolders)!=0){
-          for(valSubfolder in valSubfolders ){
-            valSubfolderUpdate <- paste0('Analysis_', as.double(gsub('Analysis_','', valSubfolder))*1000+i)
-            valOut <- file.path(valFolder,valSubfolderUpdate)
-            valOutOld <- file.path(valFolder,valSubfolder)
-            if(!dir.exists(file.path(outputDirectory,'Validation',valOut))){
-              dir.create(file.path(outputDirectory,'Validation',valOut), recursive = T)
-            }
-            
-            
-            if(file.exists(file.path(resultDirectory[i],'Validation',valOutOld, 'validationResult.rds'))){
-              res <- readRDS(file.path(resultDirectory[i],'Validation',valOutOld, 'validationResult.rds'))
-              res <- PatientLevelPrediction::transportPlp(res, n= minCellCount, 
-                                                          save = F, dataName = databaseName[i])
-              res$covariateSummary <- res$covariateSummary[res$covariateSummary$covariateValue!=0,]
-              saveRDS(res, file.path(outputDirectory,'Validation',valOut, 'validationResult.rds'))
-            }
-          }
-        }
-        
-      }
-      
-    }
-    
-  }
-  }
-  
-  return(outputDirectory)
-  
-}
-
-

-Example code to combine multiple results

-

The following code will combine the results found in ‘C:/myResults’, ‘C:/myResults2’ and ‘C:/myResults3’ into the shiny project at ‘C:/R/library/myPackage/shiny/PLPViewer’:

-
-populateMultipleShinyApp(shinyDirectory = 'C:/R/library/myPackage/shiny/PLPViewer',
-                                     resultDirectory = c('C:/myResults',
-                                                         'C:/myResults2',
-                                                         'C:/myResults3'),
-                                     minCellCount = 0,
-                                     databaseName = c('database1','database2','database3'))
-
-
-
-

-Manual App Creation

-

[instructions coming soon]

-
-
-

-Acknowledgments

-

Considerable work has been dedicated to providing the PatientLevelPrediction package.

-
-citation("PatientLevelPrediction")
-
## 
-## To cite PatientLevelPrediction in publications use:
-## 
-## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design
-## and implementation of a standardized framework to generate and evaluate
-## patient-level prediction models using observational healthcare data."
-## _Journal of the American Medical Informatics Association_, *25*(8),
-## 969-975. <URL: https://doi.org/10.1093/jamia/ocy032>.
-## 
-## A BibTeX entry for LaTeX users is
-## 
-##   @Article{,
-##     author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek},
-##     title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data},
-##     journal = {Journal of the American Medical Informatics Association},
-##     volume = {25},
-##     number = {8},
-##     pages = {969-975},
-##     year = {2018},
-##     url = {https://doi.org/10.1093/jamia/ocy032},
-##   }
-

Please reference this paper if you use the PLP Package in your work:

-

Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.

-
-
- - - - - - diff --git a/docs/articles/CreatingShinyApp_files/header-attrs-2.11/header-attrs.js b/docs/articles/CreatingShinyApp_files/header-attrs-2.11/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/CreatingShinyApp_files/header-attrs-2.11/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/CreatingShinyApp_files/header-attrs-2.7/header-attrs.js b/docs/articles/CreatingShinyApp_files/header-attrs-2.7/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/CreatingShinyApp_files/header-attrs-2.7/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/CustomPredictionAlgorithms.html b/docs/articles/CustomPredictionAlgorithms.html deleted file mode 100644 index c38730513..000000000 --- a/docs/articles/CustomPredictionAlgorithms.html +++ /dev/null @@ -1,463 +0,0 @@ - - - - - - - -Custom patient-level prediction algorithms • PatientLevelPrediction - - - - - - - - - -
-
-
-

-Introduction

-

The PatientLevelPrediction package enables data extraction, model building, and model evaluation using data from databases that are translated into the Observational Medical Outcomes Partnership Common Data Model (OMOP CDM).

-

This vignette describes how you can add custom algorithms to the list of available algorithms in the PatientLevelPrediction package. This would allow you to fully leverage the OHDSI PatientLevelPrediction model development process with your own favourite algorithm.

-

Of course, we invite you to share your new algorithms with the community through the GitHub repository.

-
-
-

-General Structure

-

To add a custom classifier to the package you need to add the set and fit functions into an R file named after the model. You also need to ensure there is a corresponding predict function in predict.R. For example, if you were to make a made-up model, then in MadeUp.R you would add the following functions:

-
-

-set

-

The setNewModel function takes as input the different hyper-parameter values over which to grid search when training the model. The output is a list of class ‘modelSettings’ containing: param (all the combinations of the input hyper-parameter values), model (a string specifying which function to call to fit the model) and name (a string containing the name of the model).
-For example, if you were adding a model called madeUp that had two hyper-parameters then the set function would be:

-
setMadeUp <- function(a=1, b=2, seed=NULL){
-  # add input checks here...
-  
-  # now create list of all combinations:
-  result <- list(model='fitMadeUp', # this will be called to train the made up model
-                 param= split(expand.grid(a=a, 
-                                          b=b,
-                                          seed=ifelse(is.null(seed),'NULL', seed)),
-                              1:(length(a)*length(b)  )),
-                 name='Made Up Algorithm'
-  )
-  class(result) <- 'modelSettings' 
-  
-  return(result)
-}
-
-
-

-fit

-

The fitNewModel function takes as input: population (the study population the model is being developed on), plpData (the plpData object), param (the hyper-parameters as a list of all combinations), quiet (T or F indicating whether to output progress), outcomeId (the outcome id) and cohortId (the target population id).

-

The fit function then trains a model for each param entry, picks the best param entry and trains a final model for that setting. The fit function returns a list of class plpModel with the following objects:

  • model - the trained model
  • modelSettings - a list containing the model and input param
  • trainCVAuc - a value with the train AUC value
  • hyperParamSearch - a data.frame with the hyper-parameter grid and corresponding AUCs
  • metaData - the metaData from the plpData object
  • populationSettings - the settings used to create the population and define the time-at-risk
  • outcomeId - the outcomeId being predicted
  • cohortId - the cohortId corresponding to the target cohort
  • varImp - a data.frame with the covariates and a measure of importance
  • trainingTime - how long it took to develop/evaluate the model
  • covariateMap - if the plpData are converted to a matrix for model compatibility, this tells us which covariate each column in the matrix corresponds to and is needed when implementing the model on new data

The plpModel returned by fit also has a type attribute; this points to the predict function. For example, attr(result, 'type') <- 'madeup' means that when the model is applied to new data, the ‘predict.madeup’ function in Predict.R is called; if this does not exist, the model will fail. Another attribute is the predictionType, attr(result, 'predictionType') <- 'binary'; this is currently not needed but may be important in the future when we expand to regression or multiclass classification.

-

The fit shell is:

-
fitMadeUp <- function(population, plpData, param, quiet=F,
-                        outcomeId, cohortId, ...){
-  
-  # **************** code to train the model here
-  # trainedModel <- this code should apply each hyper-param using the cross validation
-  #                 then pick out the best hyper-param setting
-  #                 and finally fit a model on the whole train data using the 
-  #                 optimal hyper-param settings
-  # ****************
-  
-  # construct the standard output for a model:
-  result <- list(model = trainedModel,
-                 modelSettings = list(model='made_up', modelParameters=param),
-                 trainCVAuc = NULL,
-                 hyperParamSearch = hyperSummary,
-                 metaData = plpData$metaData,
-                 populationSettings = attr(population, 'metaData'),
-                 outcomeId=outcomeId,# can use populationSettings$outcomeId?
-                 cohortId=cohortId,
-                 varImp = NULL,
-                 trainingTime=comp,
-                 covariateMap=result$map
-  )
-  class(result) <- 'plpModel'
-  attr(result, 'type') <- 'madeup'
-  attr(result, 'predictionType') <- 'binary'
-  return(result)
-    
-}
-

You may wish to make the fitMadeUp function cleaner by adding helper functions to the MadeUp.R file that are called by the fit function. As the end of the fit function specified attr(result, 'type') <- 'madeup', we also need to make sure there is a predict.madeup function in Predict.R:

-
-
-

-predict

-

The prediction function takes as input the plpModel returned by fit, a population and the corresponding plpData. It returns a data.frame with the columns: rowId (the id for each person in the population) and value (the predicted risk from the plpModel). If the population contains the columns outcomeCount and indexes, then these are also output.

-
predict.madeup <- function(plpModel, population, plpData, ...) {
-    
-    # ************* code to do prediction for each rowId in population
-    # prediction <- code to do prediction here returning columns: rowId and
-    # value (predicted risk) **************
-    
-    prediction <- merge(population, prediction, by = "rowId")
-    prediction <- prediction[, colnames(prediction) %in% c("rowId", "outcomeCount", 
-        "indexes", "value")]
-    attr(prediction, "metaData") <- list(predictionType = "binary")
-    return(prediction)
-    
-}
-
-
-
-

-R Model Example

-
-

-set

-
setMadeUp <- function(a=1, b=2, seed=NULL){
-  # check a is valid positive value
-  if(missing(a)){
-    stop('a must be input')
-  }
-  if(!class(a)%in%c('numeric','integer')){
-    stop('a must be numeric')
-  }
-  if(a < 0){
-    stop('a must be positive')
-  }
-  # check b is numeric
-  if(missing(b)){
-    stop('b must be input')
-  }
-  if(!class(b)%in%c('numeric','integer')){
-    stop('b must be numeric')
-  }
-  
-  # now create list of all combinations:
-  result <- list(model='fitMadeUp', 
-                 param= split(expand.grid(a=a, 
-                                          b=b,
-                                          seed=ifelse(is.null(seed),'NULL', seed)),
-                              1:(length(a)*length(b)  )),
-                 name='Made Up Algorithm'
-  )
-  class(result) <- 'modelSettings' 
-  
-  return(result)
-    
-  
-}
-
-
-

-fit

-
fitMadeUp <- function(population, plpData, param, quiet=F,
-                        outcomeId, cohortId, ...){
-    if(!quiet)
-    writeLines('Training Made Up model')
-  
-  if(param[[1]]$seed!='NULL')
-    set.seed(param[[1]]$seed)
-  
-    # check plpData is coo format:
-  if(!'ffdf'%in%class(plpData$covariates) )
-    stop('This algorithm requires plpData in coo format')
-  
-  metaData <- attr(population, 'metaData')
-  if(!is.null(population$indexes))
-    population <- population[population$indexes>0,]
-  attr(population, 'metaData') <- metaData
-  #TODO - how to incorporate indexes?
-  
-  # convert data into sparse R Matrix:
-  result <- toSparseM(plpData,population,map=NULL)
-  data <- result$data
-  
-  data <- data[population$rowId,]
-  
-  # set test/train sets (for printing performance as it trains)
-  if(!quiet)
-    writeLines(paste0('Training made up model on train set containing ', nrow(population), ' people with ',sum(population$outcomeCount>0), ' outcomes'))
-  start <- Sys.time()
-  
-  #============= STEP 1 ======================================
-  # pick the best hyper-params and then do final training on all data...
-  writeLines('train')
-  datas <- list(population=population, data=data)
-  param.sel <- lapply(param, function(x) do.call(made_up_model, c(x,datas)  ))
-  hyperSummary <- do.call(rbind, lapply(param.sel, function(x) x$hyperSum))
-  hyperSummary <- as.data.frame(hyperSummary)
-  hyperSummary$auc <- unlist(lapply(param.sel, function(x) x$auc)) 
-  param.sel <- unlist(lapply(param.sel, function(x) x$auc))
-  param <- param[[which.max(param.sel)]]
-  
-  # set this so you do a final model train 
-  param$final=T
-  
-  writeLines('final train')
-  trainedModel <- do.call(made_up_model, c(param,datas)  )$model
-  
-  comp <- Sys.time() - start
-  if(!quiet)
-    writeLines(paste0('Model Made Up trained - took:',  format(comp, digits=3)))
-  
-  # construct the standard output for a model:
-  result <- list(model = trainedModel,
-                 modelSettings = list(model='made_up', modelParameters=param),
-                 trainCVAuc = NULL,
-                 hyperParamSearch = hyperSummary,
-                 metaData = plpData$metaData,
-                 populationSettings = attr(population, 'metaData'),
-                 outcomeId=outcomeId,# can use populationSettings$outcomeId?
-                 cohortId=cohortId,
-                 varImp = NULL,
-                 trainingTime=comp,
-                 covariateMap=result$map
-  )
-  class(result) <- 'plpModel'
-  attr(result, 'type') <- 'madeup'
-  attr(result, 'predictionType') <- 'binary'
-  return(result)
-    
-}
-
-
-

-helpers

-

In the fit function I specified calling made_up_model; this is the function that trains a model given the data and population (where the population contains a column outcomeCount corresponding to the outcome). Both the data and population are ordered the same way:

-
made_up_model <- function(data, population, a = 1, b = 1, final = F, ...) {
-    
-    writeLines(paste("Training Made Up model with ", length(unique(population$indexes)), 
-        " fold CV"))
-    if (!is.null(population$indexes) && final == F) {
-        index_vect <- unique(population$indexes)
-        perform <- c()
-        
-        # create prediction matrix to store all predictions
-        predictionMat <- population
-        predictionMat$value <- 0
-        attr(predictionMat, "metaData") <- list(predictionType = "binary")
-        
-        for (index in 1:length(index_vect)) {
-            writeLines(paste("Fold ", index, " -- with ", sum(population$indexes != 
-                index), "train rows"))
-            model <- madeup::model(x = data[population$indexes != index, ], 
-                y = population$outcomeCount[population$indexes != index], a = a, 
-                b = b)
-            
-            pred <- stats::predict(model, data[population$indexes == index, 
-                ])
-            prediction <- population[population$indexes == index, ]
-            prediction$value <- pred
-            attr(prediction, "metaData") <- list(predictionType = "binary")
-            aucVal <- computeAuc(prediction)
-            perform <- c(perform, aucVal)
-            
-            # add the fold predictions and compute AUC after loop
-            predictionMat$value[population$indexes == index] <- pred
-            
-        }
-        ## auc <- mean(perform) # want the overall AUC rather than the mean of the fold AUCs
-        auc <- computeAuc(predictionMat)
-        
-        foldPerm <- perform
-    } else {
-        model <- madeup::model(x = data, y = population$outcomeCount, a = a, 
-            b = b)
-        
-        pred <- stats::predict(model, data)
-        prediction <- population
-        prediction$value <- pred
-        attr(prediction, "metaData") <- list(predictionType = "binary")
-        auc <- computeAuc(prediction)
-        foldPerm <- auc
-    }
-    
-    result <- list(model = model, auc = auc, hyperSum = unlist(list(a = a, b = b, 
-        fold_auc = foldPerm)))
-    return(result)
-}
-
-
-

-Predict

-

The final step is to create a predict function for the model. This gets added to the Predict.R file. In the example above the type attribute was set to attr(result, 'type') <- 'madeup', so a predict.madeup function is required to be added to Predict.R. The predict function needs to take as input the plpModel returned by the fit function, the population to apply the model on and the plpData specifying the covariates of the population.

-
predict.madeup <- function(plpModel, population, plpData, ...) {
-    result <- toSparseM(plpData, population, map = plpModel$covariateMap)
-    data <- result$data[population$rowId, ]
-    prediction <- data.frame(rowId = population$rowId, value = stats::predict(plpModel$model, 
-        data))
-    
-    prediction <- merge(population, prediction, by = "rowId")
-    prediction <- prediction[, colnames(prediction) %in% c("rowId", "outcomeCount", 
-        "indexes", "value")]  # need to fix no index issue
-    attr(prediction, "metaData") <- list(predictionType = "binary")
-    return(prediction)
-    
-}
-

As the madeup model uses the standard R predict function, it has the same prediction function as xgboost, so instead of adding a new prediction function we could have set the type of the result returned by fitMadeUp to attr(result, 'type') <- 'xgboost'.

-
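
To make this concrete, that alternative would only change the attributes set at the end of fitMadeUp (a sketch of the option described above; the rest of the function stays exactly as shown earlier):

-
-# reuse the existing xgboost prediction function instead of defining predict.madeup
-attr(result, 'type') <- 'xgboost'
-attr(result, 'predictionType') <- 'binary'
-return(result)
-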
-
-
- - - - - diff --git a/docs/articles/Figure1.png b/docs/articles/Figure1.png deleted file mode 100644 index 878a509e2..000000000 Binary files a/docs/articles/Figure1.png and /dev/null differ diff --git a/docs/articles/GeneratingLearningCurves.html b/docs/articles/GeneratingLearningCurves.html deleted file mode 100644 index cb9467f1d..000000000 --- a/docs/articles/GeneratingLearningCurves.html +++ /dev/null @@ -1,294 +0,0 @@ - - - - - - - -Generating Learning Curves • PatientLevelPrediction - - - - - - - - - - -
-
-
-

-Introduction

-

This vignette describes how you can use the Observational Health Data Sciences and Informatics (OHDSI) PatientLevelPrediction package to generate learning curves. This vignette assumes you have read and are comfortable with building patient level prediction models as described in the BuildingPredictiveModels vignette.

-

Prediction models will show overly-optimistic performance when predicting on the same data as used for training. Therefore, best practice is to partition our data into a training set and testing set. We then train our prediction model on the training set portion and assess its ability to generalize to unseen data by measuring its performance on the testing set.

-

Learning curves assess the effect of training set size on model performance by training a sequence of prediction models on successively larger subsets of the training set. A learning curve plot can also help in diagnosing a bias or variance problem as explained below.

-
-

Learning curve example.

-
-

Figure 1 shows an example of a learning curve plot in which the vertical axis represents the model performance and the horizontal axis the training set size. If the training set size is small, the performance on the training set is high, because a model can often be fitted well to a limited number of training examples. At the same time, the performance on the testing set will be poor, because the model trained on such a limited number of training examples will not generalize well to unseen data in the testing set. As the training set size increases, the performance of the model on the training set will decrease. It becomes more difficult for the model to find a good fit through all the training examples. Also, the model will be trained on a more representative portion of training examples, making it generalize better to unseen data. This can be observed by the increasing testing set performance.

-

The learning curve can help us in diagnosing bias and variance problems with our classifier which will provide guidance on how to further improve our model. We can observe high variance (overfitting) in a prediction model if it performs well on the training set, but poorly on the testing set (Figure 2). Adding additional data is a common approach to counteract high variance. From the learning curve it becomes apparent, that adding additional data may improve performance on the testing set a little further, as the learning curve has not yet plateaued and, thus, the model is not saturated yet. Therefore, adding more data will decrease the gap between training set and testing set, which is the main indicator for a high variance problem.

-
-

Prediction model suffering from high variance.

-
-

Furthermore, we can observe high bias (underfitting) if a prediction model performs poorly on the training set as well as on the testing set (Figure 3). The learning curves of training set and testing set have flattened on a low performance with only a small gap in between them. Adding additional data will in this case have little to no impact on the model performance. Choosing another prediction algorithm that can find more complex (for example non-linear) relationships in the data may be an alternative approach to consider in this high bias situation.

-
-

Prediction model suffering from high bias.

-
-
-
-

-Generating the learning curve

-

Use the PatientLevelPrediction package to generate a population and plpData object. Alternatively, you can make use of the data simulator. The following code snippet creates a population of 12000 patients.

-
set.seed(1234)
-data(plpDataSimulationProfile)
-sampleSize <- 12000
-plpData <- simulatePlpData(
-  plpDataSimulationProfile,
-  n = sampleSize
-)
-
-population <- createStudyPopulation(
-  plpData,
-  outcomeId = 2,
-  binary = TRUE,
-  firstExposureOnly = FALSE,
-  washoutPeriod = 0,
-  removeSubjectsWithPriorOutcome = FALSE,
-  priorOutcomeLookback = 99999,
-  requireTimeAtRisk = FALSE,
-  minTimeAtRisk = 0,
-  riskWindowStart = 0,
-  addExposureDaysToStart = FALSE,
-  riskWindowEnd = 365,
-  addExposureDaysToEnd = FALSE,
-  verbosity = "INFO"
-)
-

Specify the prediction algorithm to be used.

-
# Use LASSO logistic regression
-modelSettings <- setLassoLogisticRegression()
-

Specify a test fraction and a sequence of training set fractions.

-
testFraction <- 0.2
-trainFractions <- seq(0.1, 0.8, 0.1) # Create eight training set fractions
-

Specify the test split to be used.

-
# Use a split by person, alternatively a time split is possible
-testSplit <- 'stratified'
-

Create the learning curve object.

-
learningCurve <- createLearningCurve(population,
-                                     plpData = plpData,
-                                     modelSettings = modelSettings,
-                                     testFraction = 0.2,
-                                     verbosity = "TRACE",
-                                     trainFractions = trainFractions,
-                                     splitSeed = 1000,
-                                     saveModel = TRUE)
-

Plot the learning curve object (Figure 4). Specify one of the available metrics: AUROC, AUPRC, sBrier.

-
plotLearningCurve(
-  learningCurve,
-  metric='AUROC',
-  plotTitle = 'Learning Curve',
-  plotSubtitle = 'AUROC performance'
-)
-
-

Learning curve plot.

-
-
-
-

-Parallel processing

-

The learning curve object can be created in parallel, which can reduce computation time significantly. Currently this functionality is only available for LASSO logistic regression. Depending on the number of parallel workers it may require a significant amount of memory. We advise using the parallelized learning curve function for parameter search and exploratory data analysis. Logging and saving functionality is unavailable.

-

Use the parallelized version of the learning curve function to create the learning curve object in parallel. R will find the number of available processing cores automatically and register the required parallel backend.

-
learningCurvePar <- createLearningCurvePar(
-  population,
-  plpData =  plpData,
-  modelSettings = modelSettings,
-  testSplit = testSplit,
-  testFraction = testFraction,
-  trainFractions = trainFractions,
-  splitSeed = 1000
-)
-
-
-

-Demo

-

We have added a demo of the learning curve:

-
# Show all demos in our package: 
- demo(package = "PatientLevelPrediction")
-
-# Run the learning curve
- demo("LearningCurveDemo", package = "PatientLevelPrediction")
-

Do note that running this demo can take a considerable amount of time (15 minutes on a quad-core machine running in parallel)!

-
-
-

-Acknowledgments

-

Considerable work has been dedicated to providing the PatientLevelPrediction package.

-
citation("PatientLevelPrediction")
-
## 
-## To cite PatientLevelPrediction in publications use:
-## 
-## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design
-## and implementation of a standardized framework to generate and evaluate
-## patient-level prediction models using observational healthcare data."
-## _Journal of the American Medical Informatics Association_, *25*(8),
-## 969-975. <URL: https://doi.org/10.1093/jamia/ocy032>.
-## 
-## A BibTeX entry for LaTeX users is
-## 
-##   @Article{,
-##     author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek},
-##     title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data},
-##     journal = {Journal of the American Medical Informatics Association},
-##     volume = {25},
-##     number = {8},
-##     pages = {969-975},
-##     year = {2018},
-##     url = {https://doi.org/10.1093/jamia/ocy032},
-##   }
-

Please reference this paper if you use the PLP Package in your work:

-

Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.

-
-
- - - - - - diff --git a/docs/articles/ImplementingExistingModels.html b/docs/articles/ImplementingExistingModels.html deleted file mode 100644 index 2bb38a32e..000000000 --- a/docs/articles/ImplementingExistingModels.html +++ /dev/null @@ -1,359 +0,0 @@ - - - - - - - -Implementing Existing Prediction Models using the OHDSI PatientLevelPrediction Framework • PatientLevelPrediction - - - - - - - - - -
-
-
-

-Introduction

-

This vignette describes how you can implement existing logistic regression models in the Observational Health Data Sciences and Informatics (OHDSI) PatientLevelPrediction framework. This allows you, for example, to externally validate them at scale in the OHDSI data network.

-

As an example we are going to implement the CHADS2 model as described in:

-

Gage BF, Waterman AD, Shannon W, Boechler M, Rich MW, Radford MJ. Validation of clinical classification schemes for predicting stroke: results from the National Registry of Atrial Fibrillation. JAMA. 2001 Jun 13;285(22):2864-70

-

To implement the model you need to create three tables: the model table, the covariate table, and the intercept table. The model table specifies the modelId (sequence number), the modelCovariateId (sequence number) and the covariateValue (beta for the covariate). The covariate table specifies the mapping between the covariates from the published model and the standard covariates, i.e. it maps to a combination of an analysisid and a concept_id (see below). The intercept table specifies per modelId the intercept.

-
-
-

-Model implementation

-
-

-Define the model

-

The CHADS2 is a score based model with:

-
##   Points                        Covariate
-## 1      1         Congestive heart failure
-## 2      1                     Hypertension
-## 3      1                  Age >= 75 years
-## 4      1                Diabetes mellitus
-## 5      2 Stroke/transient ischemic attack
-

The model table should therefore be defined as:

-
##   modelId modelCovariateId covariateValue
-## 1       1                1              1
-## 2       1                2              1
-## 3       1                3              1
-## 4       1                4              1
-## 5       1                5              2
-

The covariateTable will then specify what standard covariates need to be included in the model.

-

In this case we choose the following Standard SNOMED concept_ids: 319835 for congestive heart failure, 316866 for hypertensive disorder, 201820 for diabetes, and 381591 for cerebrovascular disease. It is allowed to add multiple concept_ids as separate rows for the same modelCovariateId if concept sets are needed. These concept_ids can be found using the vocabulary search in ATLAS.

-

The standard covariates are of the form: conceptid*1000 + analysisid. The analysisid specifies the domain of the covariate and its lookback window. Examples can be found here: https://github.com/OHDSI/FeatureExtraction/blob/master/inst/csv/PrespecAnalyses.csv

-
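
As a quick illustration (not part of the original vignette), the standard covariateId used later in this example for anytime prior congestive heart failure can be computed directly from this formula; the long-term condition occurrence analysis id 102 is the one used below:

-
-# standard covariateId = conceptId * 1000 + analysisId
-conceptId <- 319835   # congestive heart failure
-analysisId <- 102     # condition occurrence, long-term window
-conceptId * 1000 + analysisId
-## [1] 319835102
-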

Our example of CHADS2 uses agegroup and conditions in the full history. Therefore we need to define the standard covariates using the FeatureExtraction::createCovariateSettings as follows:

-
library(PatientLevelPrediction)
-covSet <- FeatureExtraction::createCovariateSettings(useDemographicsAgeGroup = T,                             
-                                                     useConditionOccurrenceLongTerm = T,
-                                                     includedCovariateIds = NULL,
-                                                     longTermStartDays = -9999, 
-                                                     endDays = 0)
-

In the above code we used the useConditionOccurrenceLongTerm (these have an analysis id of 102) and we defined the longTermStartDays to be -9999 days relative to index (so we get the full history). We include the index date in our lookback period by specifying endDays = 0. The includedCovariateIds is set to NULL here, but this will be updated automatically later on. As we picked analysis id 102, the standard covariate for anytime prior congestive heart failure is 319835102. The same logic follows for the other conditions, so the covariate table will be:

-
##   modelCovariateId covariateId
-## 1                1   319835102
-## 2                2   316866102
-## 3                3       15003
-## 4                3       16003
-## 5                3       17003
-## 6                3       18003
-## 7                3       19003
-## 8                4   201820102
-## 9                5   381591102
-

modelCovariateId 3 was age >= 75; as the standard covariate age groups are in 5-year groups, we needed to add the age groups 75-80, 80-85, 85-90, 90-95 and 95-100, which correspond to the covariateIds 15003, 16003, 17003, 18003 and 19003 respectively.

-
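
For completeness (an illustration, not from the original vignette), these age group covariateIds follow the same covariateId pattern, with the 5-year age group index taking the place of the concept id and the demographics age group analysis id (3) as the suffix, an assumption consistent with the ids listed above:

-
-# age group index = floor(age / 5); covariateId = index * 1000 + analysisId (3)
-ages <- c(75, 80, 85, 90, 95)
-floor(ages / 5) * 1000 + 3
-## [1] 15003 16003 17003 18003 19003
-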

To create the tables in R for CHADS2 you need to make the following dataframes:

-
model_table <- data.frame(modelId = c(1,1,1,1,1),
-                          modelCovariateId = 1:5, 
-                          coefficientValue = c(1, 1, 1, 1, 2)
-                          )
-
-covariate_table <- data.frame(modelCovariateId = c(1,2,3,3,3,3,3,4,5),
-                              covariateId = c(319835102, 316866102, 
-                                            15003, 16003, 17003, 18003, 19003, 
-                                            201820102, 381591102)
-                              )
-
-interceptTable <-  data.frame(modelId = 1, 
-                              interceptValue = 0)
-
-
-

-Create the model

-

Now you have everything in place to actually create the existing model. First specify the current environment, as executing createExistingModelSql creates two functions for running the existing model in the specified environment. You need to specify the type of model (either logistic or score); in our example we are calculating a score. We finally need to specify the analysisId for the newly created CHADS2 covariate.

-
e <- environment()
-PatientLevelPrediction::createExistingModelSql(modelTable = model_table, 
-                       modelNames = 'CHADS2', 
-                       interceptTable = data.frame(modelId = 1, interceptValue = 0),
-                       covariateTable = covariate_table, 
-                       type = 'score',
-                       analysisId = 112, covariateSettings = covSet, e = e)
-

Once run you will find two new functions in your environment:

-
    -
  • createExistingmodelsCovariateSettings()
  • -
  • getExistingmodelsCovariateSettings()
  • -
-
-
-

-Run the model

-

Now you can use the functions you previously created to extract the existing model risk scores for a target population:

-
plpData <- PatientLevelPrediction::getPlpData(connectionDetails, 
-                      cdmDatabaseSchema = 'databasename.dbo',
-                      cohortId = 1,
-                      outcomeIds = 2, 
-                      cohortDatabaseSchema = 'databasename.dbo', 
-                      cohortTable =  'cohort' , 
-                      outcomeDatabaseSchema = 'databasename.dbo', 
-                      outcomeTable = 'cohort', 
-                      covariateSettings =  createExistingmodelsCovariateSettings(),
-                      sampleSize = 20000
-                      )
-

To implement and evaluate an existing model you can use the function:

-

PatientLevelPrediction::evaluateExistingModel()

-

with the following parameters:

-
    -
  • modelTable - a data.frame containing the columns: modelId, covariateId and coefficientValue
  • -
  • covariateTable - a data.frame containing the columns: covariateId and standardCovariateId - this provides a set of standardCovariateId to define each model covariate.
  • -
  • interceptTable - a data.frame containing the columns modelId and interceptValue or NULL if the model doesn’t have an intercept (equal to zero).
  • -
  • type - the type of model (either: score or logistic)
  • -
  • covariateSettings - this is used to determine the startDay and endDay for the standard covariates
  • -
  • customCovariates - a data.frame with the covariateId and sql to generate the covariate value.
  • -
  • riskWindowStart - the time at risk starts at target cohort start date + riskWindowStart
  • -
  • addExposureDaysToEnd - if true then the time at risk window ends at the cohort end date + riskWindowEnd rather than cohort start date + riskWindowEnd
  • -
  • riskWindowEnd - the time at risk ends at target cohort start/end date + riskWindowEnd
  • -
  • requireTimeAtRisk - whether to add a constraint on the number of days observed during the time at risk period when including people into the study
  • -
  • minTimeAtRisk - the minimum number of days observation during the time at risk a target population person needs to be included
  • -
  • includeAllOutcomes - include outcomes even if they do not satisfy the minTimeAtRisk (useful if the outcome is associated with death or is rare)
  • -
  • removeSubjectsWithPriorOutcome - remove target population people who have the outcome prior to the time at risk period?
  • -
  • connectionDetails - the connection to the CDM database
  • -
-

Finally you need to add the settings for downloading the new data:

-
    -
  • cdmDatabaseSchema
  • -
  • cohortDatabaseSchema
  • -
  • cohortTable
  • -
  • cohortId
  • -
  • outcomeDatabaseSchema
  • -
  • outcomeTable
  • -
  • outcomeId
  • -
  • oracleTempSchema
  • -
-

To run the external validation of an existing model where the target population are those in the cohort table with id 1 and the outcome is those in the cohort table with id 2 and we are looking to predict first time occurrence of the outcome 1 day to 365 days after the target cohort start date (assuming you have the modelTable, covariateTable and interceptTable in the format explained above):

-
# in our example the existing model uses gender and condition groups looking back 200 days:
-covSet <- FeatureExtraction::createCovariateSettings(useDemographicsGender = T,
-                                                     useConditionGroupEraMediumTerm = T, 
-                                                     mediumTermStartDays = -200)
-
-result <- evaluateExistingModel(modelTable = modelTable,
-                                covariateTable = covariateTable,
-                                interceptTable = NULL,
-                                type = 'score', 
-                                covariateSettings =  covSet,
-                                riskWindowStart = 1, 
-                                addExposureDaysToEnd = F, 
-                                riskWindowEnd = 365, 
-                                requireTimeAtRisk = T, 
-                                minTimeAtRisk = 364, 
-                                includeAllOutcomes = T, 
-                                removeSubjectsWithPriorOutcome = T, 
-                                connectionDetails = connectionDetails, 
-                                cdmDatabaseSchema = 'databasename.dbo',
-                                cohortId = 1,
-                                outcomeId = 2, 
-                                cohortDatabaseSchema = 'databasename.dbo', 
-                                cohortTable =  'cohort' , 
-                                outcomeDatabaseSchema = 'databasename.dbo', 
-                                outcomeTable = 'cohort'
-                      )
-

Result will contain the performance and the predictions made by the model.

-
-
-
-

-Acknowledgments

-

Considerable work has been dedicated to providing the PatientLevelPrediction package.

-
citation("PatientLevelPrediction")
-
## 
-## To cite PatientLevelPrediction in publications use:
-## 
-## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018).
-## "Design and implementation of a standardized framework to generate
-## and evaluate patient-level prediction models using observational
-## healthcare data." _Journal of the American Medical Informatics
-## Association_, *25*(8), 969-975. <URL:
-## https://doi.org/10.1093/jamia/ocy032>.
-## 
-## A BibTeX entry for LaTeX users is
-## 
-##   @Article{,
-##     author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek},
-##     title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data},
-##     journal = {Journal of the American Medical Informatics Association},
-##     volume = {25},
-##     number = {8},
-##     pages = {969-975},
-##     year = {2018},
-##     url = {https://doi.org/10.1093/jamia/ocy032},
-##   }
-

Please reference this paper if you use the PLP Package in your work:

-

Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.

-

This work is supported in part through the National Science Foundation grant IIS 1251151.

-
-
diff --git a/docs/articles/InstallationGuide.html b/docs/articles/InstallationGuide.html deleted file mode 100644 index 29f442285..000000000 --- a/docs/articles/InstallationGuide.html +++ /dev/null @@ -1,313 +0,0 @@
-Patient-Level Prediction Installation Guide • PatientLevelPrediction

-Introduction

-

This vignette describes how to install the Observational Health Data Sciences and Informatics (OHDSI) PatientLevelPrediction package under Windows, Mac, and Linux.

-
-
-

-Software Prerequisites

-
-

-Windows Users

-

Under Windows the OHDSI Patient Level Prediction (PLP) package requires installing:

- -
-
-

-Mac/Linux Users

-

Under Mac and Linux the OHDSI Patient Level Prediction (PLP) package requires installing:

- -
-
-
-

-Installing the Package

-

The preferred way to install the package is by using remotes, which will automatically install the latest release and all the latest dependencies.

-

If you do not want the official release you could install the bleeding edge version of the package (the latest develop branch).

-

Note that the latest develop branch could contain bugs; please report them to us if you experience problems.

-
-

-Installing PatientLevelPrediction using remotes

-

To install using remotes run:

-
-install.packages("remotes")
-remotes::install_github("OHDSI/FeatureExtraction")
-remotes::install_github("OHDSI/PatientLevelPrediction")
-

When installing, make sure to close any other RStudio sessions that are using PatientLevelPrediction or any dependency. Keeping RStudio sessions open can cause locks that prevent the package from installing.

-
-
-
-

-Creating Python Reticulate Environment

-

Many of the classifiers in PatientLevelPrediction use a Python back end. To set up a Python environment run:

-
-library(PatientLevelPrediction)
-reticulate::install_miniconda()
-configurePython(envname='r-reticulate', envtype='conda')
-

Some of the less frequently used classifiers are not installed during this set-up; to add them, run:

-

For GBM survival:

-
-reticulate::conda_install(envname='r-reticulate', packages = c('scikit-survival'), forge = TRUE, pip = FALSE, pip_ignore_installed = TRUE, conda = "auto", channel = 'sebp')
-
-
-

-Testing installation

-

To test whether the package is installed correctly, use the test script in ‘/extras’ and run:

-
-# load the checkPlpInstallation function
-library(devtools)
-source_url('https://raw.github.com/OHDSI/PatientLevelPrediction/issue242/extras/checkPlpInstallation.R')
-
-# set up the database connection details
-library(DatabaseConnector)
-connectionDetails <- createConnectionDetails(
-  dbms = 'sql_server', 
-  user = 'username', 
-  password = 'hidden', 
-  server = 'your server', 
-  port = 'your port'
-  )
-
-# run the test
-checkPlpInstallation(
-  connectionDetails = connectionDetails, 
-  python = T
-  )
-

To test the installation (excluding python) run:

-
-checkPlpInstallation(
-  connectionDetails = connectionDetails, 
-  python = F
-  )
-

The check can take a while to run since it will build the following models in sequence on simulated data: Logistic Regression, RandomForest, MLP, AdaBoost, Decision Tree, Naive Bayes, KNN, Gradient Boosting. Moreover, it will test the database connection.

-
-
-

-Installation issues

-

Installation issues need to be posted in our issue tracker: http://github.com/OHDSI/PatientLevelPrediction/issues

-

The list below provides solutions for some common issues:

-
    -
  1. If you have an error when trying to install a package in R saying ‘Dependency X not available …’ then this can sometimes be fixed by running install.packages('X') and then, once that completes, trying to reinstall the package that had the error (a hypothetical example is sketched after this list).

  2. -
  3. Installing packages from GitHub with `remotes` can fail if you have multiple R sessions open: a session with a library loaded can lock that library, and this can prevent installing a package that depends on it.

  4. -
-
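As a purely hypothetical example of the fix described in point 1 above (the dependency name is illustrative; substitute whichever package the error message reports):

# hypothetical: the error reported that dependency 'Andromeda' was not available
install.packages("Andromeda")
# then retry the installation that originally failed
remotes::install_github("OHDSI/PatientLevelPrediction")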
-
-

-Acknowledgments

-

Considerable work has been dedicated to providing the PatientLevelPrediction package.

-
-citation("PatientLevelPrediction")
-
## 
-## To cite PatientLevelPrediction in publications use:
-## 
-## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design
-## and implementation of a standardized framework to generate and evaluate
-## patient-level prediction models using observational healthcare data."
-## _Journal of the American Medical Informatics Association_, *25*(8),
-## 969-975. <URL: https://doi.org/10.1093/jamia/ocy032>.
-## 
-## A BibTeX entry for LaTeX users is
-## 
-##   @Article{,
-##     author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek},
-##     title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data},
-##     journal = {Journal of the American Medical Informatics Association},
-##     volume = {25},
-##     number = {8},
-##     pages = {969-975},
-##     year = {2018},
-##     url = {https://doi.org/10.1093/jamia/ocy032},
-##   }
-

Please reference this paper if you use the PLP Package in your work:

-

Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.

-

This work is supported in part through the National Science Foundation grant IIS 1251151.

-
-
- - - -
- - - - -
- - - - - - diff --git a/docs/articles/InstallationGuide_files/header-attrs-2.11/header-attrs.js b/docs/articles/InstallationGuide_files/header-attrs-2.11/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/InstallationGuide_files/header-attrs-2.11/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/InstallationGuide_files/header-attrs-2.7/header-attrs.js b/docs/articles/InstallationGuide_files/header-attrs-2.7/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/InstallationGuide_files/header-attrs-2.7/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/PatientLevelPrediction.html b/docs/articles/PatientLevelPrediction.html deleted file mode 100644 index 1b8d82727..000000000 --- a/docs/articles/PatientLevelPrediction.html +++ /dev/null @@ -1,301 +0,0 @@ - - - - - - - -Quick Install Guide • PatientLevelPrediction - - - - - - - - - - -
-
- - - - -
-
- - - - - -
-

-Quick Install Guide

-
-
-

-Installing the R package

-

The preferred way to install the package is by using drat, which will automatically install the latest release and all the latest dependencies. If the drat code fails or you do not want the official release you could use devtools to install the bleeding edge version of the package (the latest master). Note that the latest master could contain bugs; please report them to us if you experience problems.

-

To install using drat run:

-
-install.packages("drat")
-drat::addRepo("OHDSI")
-install.packages("PatientLevelPrediction")
-

To install using devtools run:

-
-install.packages('devtools')
-devtools::install_github("OHDSI/FeatureExtraction")
-devtools::install_github('ohdsi/PatientLevelPrediction')
-

When installing using devtools, make sure to close any other RStudio sessions that are using PatientLevelPrediction or any dependency. Keeping RStudio sessions open can cause locks that prevent the package from installing.

-
-
-

-Setting up Python

-

Many of the classifiers in PatientLevelPrediction use Python. To use the Python classifiers you need to install and set up a Python environment in R. We use the reticulate package:

-
-library(PatientLevelPrediction)
-reticulate::install_miniconda()
-configurePython(envname='r-reticulate', envtype='conda')
-

To add the R keras interface, in RStudio run:

-
-devtools::install_github("rstudio/keras")
-library(keras)
-install_keras()
-

Some of the less frequently used classifiers are considered optional and are not installed by default. To install them, run:

-

For GBM survival:

-
-reticulate::conda_install(envname='r-reticulate', packages = c('scikit-survival'), forge = TRUE, pip = FALSE, pip_ignore_installed = TRUE, conda = "auto", channel = 'sebp')
-

For any of the torch models:

-
-reticulate::conda_install(envname='r-reticulate', packages = c('pytorch', 'torchvision', 'cpuonly'), forge = TRUE, pip = FALSE, channel = 'pytorch', pip_ignore_installed = TRUE, conda = 'auto')
-
-
-

-Testing the PatientLevelPrediction Installation

-

To test whether the package is installed correctly run:

-
-library(PatientLevelPrediction)
-library(DatabaseConnector)
-connectionDetails <- createConnectionDetails(dbms = 'sql_server', 
-                                             user = 'username', 
-                                             password = 'hidden', 
-                                             server = 'your server', 
-                                             port = 'your port')
-PatientLevelPrediction::checkPlpInstallation(connectionDetails = connectionDetails, 
-                                             python = T)
-

To test the installation (excluding python) run:

-
-library(PatientLevelPrediction)
-library(DatabaseConnector)
-connectionDetails <- createConnectionDetails(dbms = 'sql_server', 
-                                           user = 'username', 
-                                           password = 'hidden', 
-                                           server = 'your server', 
-                                           port = 'your port')
-PatientLevelPrediction::checkPlpInstallation(connectionDetails = connectionDetails, 
-                                             python = F)
-

The check can take a while to run since it will build the following models in sequence on simulated data: Logistic Regression, RandomForest, MLP, AdaBoost, Decision Tree, Naive Bayes, KNN, Gradient Boosting. Moreover, it will test the database connection.

-
-
-
-

-Common issues

-
-

-Python environment for Mac/Linux users

-

To make sure R uses the r-reticulate python environment you may need to edit your .Rprofile with the location of the python binary for the PLP environment. Edit the .Rprofile by running:

-
-usethis::edit_r_profile()
-

and add

-
-Sys.setenv(PATH = paste("your python bin location", Sys.getenv("PATH"), sep=":"))
-

to the file, then save. Here, your python bin location is the location returned by

-
-reticulate::conda_list() 
-

e.g., my PLP virtual environment location was /anaconda3/envs/PLP/bin/python so I added:
-Sys.setenv(PATH = paste("/anaconda3/envs/PLP/bin", Sys.getenv("PATH"), sep=":"))

-
-
-
-

-Old Instructions

-
-

-To configure python via anaconda

-
    -
  • Close your RStudio
  • -
  • Install python 3.7 using anaconda (https://www.anaconda.com/download) [make sure you pick the correct operating system] and note the installation location. Anaconda should update your PATH variable with the python binary.
  • -
  • Open a new RStudio session and check whether your python is configured correctly by running:
  • -
-
-system("python --version") 
-

If set up correctly you should see “Python 3.x.x :: Anaconda, Inc.” returned.
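As an additional, optional check from within R (this call is not part of the original instructions, but reticulate::py_config() is a standard way to see which Python reticulate will use):

# report the Python binary and environment reticulate has selected
reticulate::py_config()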

-
    -
  • If not set up correctly then: -
      -
    • Windows users: make sure your anaconda python binary is in the System PATH environment variable: go to My Computer -> System Properties -> Advanced System Settings. At the bottom right you’ll see a button, Environment Variables; clicking on that will enable you to edit the PATH variable. Add the following Anaconda locations to your path: D:\Anaconda3;D:\Anaconda3\Scripts;D:\Anaconda3\Library\bin (this assumes your installation was done to D:\Anaconda3).
    • -
    • Mac/Linux users: edit the bash profile to add python to the PATH by running touch ~/.bash_profile; open ~/.bash_profile in the terminal and adding the location of python to the PATH variable. Unfortunately, you also need to edit the .Rprofile for R to get the correct PATH. To do this, open the .Rprofile by running:
    • -
    -
  • -
-
-  usethis::edit_r_profile()
-

and in this file add

-
-Sys.setenv(PATH = paste("your python bin location", Sys.getenv("PATH"), sep=":"))
-
    -
  • After editing your PATH or .Rprofile, open a new RStudio session and test that python is correctly set up via
  • -
-
-system("python --version")
-
-
-
- - - -
- - - - -
- - - - - - diff --git a/docs/articles/PatientLevelPrediction_files/header-attrs-2.11/header-attrs.js b/docs/articles/PatientLevelPrediction_files/header-attrs-2.11/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/PatientLevelPrediction_files/header-attrs-2.11/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/PatientLevelPrediction_files/header-attrs-2.7/header-attrs.js b/docs/articles/PatientLevelPrediction_files/header-attrs-2.7/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/PatientLevelPrediction_files/header-attrs-2.7/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/PlottingLearningCurves.html b/docs/articles/PlottingLearningCurves.html deleted file mode 100644 index e6cf4939c..000000000 --- a/docs/articles/PlottingLearningCurves.html +++ /dev/null @@ -1,276 +0,0 @@ - - - - - - - -Plotting learning curves • PatientLevelPrediction - - - - - - - - - -
-
- - - -
-
- - - - - -
-

-Introduction

-

Prediction models will show overly optimistic performance when predicting on the same data used for training. Therefore, we generally partition our data into a training set and a testing set. We then train our prediction model on the training set and assess its ability to generalize to unseen data by measuring its performance on the testing set.

-

Learning curves show the effect of training set size on model performance by training a sequence of prediction models on successively larger subsets of the training set. A learning curve plot can also help in diagnosing a bias or variance problem. Learning curve objects can be created and plotted with the PatientLevelPrediction package.

-
-

-Background

-

Figure 1 shows a commonly observed learning curve plot, where model performance is mapped to the vertical axis and training set size is mapped to the horizontal axis. If training set size is small, the performance on the training set is high, because a model can generally be fitted well to a limited number of training examples. At the same time, the performance on the testing set will be poor, because the model trained on such a limited number of training examples will not generalize well to unseen data in the testing set. As the training set size increases, the performance of the model on the training set will decrease. It becomes more difficult for the model to find a good fit through all the training examples. Also, the model will be trained on a more representative portion of training examples, making it generalize better to unseen data. This can be observed by the testing set performance increasing.

-
Figure 1. Learning curve plot with model performance mapped to the vertical axis and training set size mapped to the horizontal axis.

-
-
-
-

-Bias and variance

-

We can observe high variance (overfitting) in a prediction model if it performs well on the training set, but poorly on the testing set (Figure 2). Adding additional data is a common approach to counteract high variance. From the learning curve it becomes apparent that adding additional data may improve performance on the testing set a little further, as the learning curve has not yet plateaued and, thus, the model is not yet saturated. Therefore, adding more data will decrease the gap between training set and testing set performance, which is the main indicator of a high variance problem.

-
Figure 2. Prediction model suffering from high variance.

-
-

Furthermore, we can observe high bias (underfitting) if a prediction model performs poorly on the training set as well as on the testing set (Figure 3). The learning curves of the training set and testing set have flattened at a low performance with only a small gap between them. Adding additional data will in this case have little to no impact on model performance. Choosing another prediction algorithm that can find more complex (potentially non-linear) relationships in the data may be an alternative approach to consider.

-
Figure 3. Prediction model suffering from high bias.

-
-
-
-
-

-Usage

-

Use the OHDSI tool ecosystem to generate a population and plpData object. Alternatively, you can make use of the data simulator. The following code snippet creates a population of 12000 patients.

-
set.seed(1234)
-data(plpDataSimulationProfile)
-sampleSize <- 12000
-plpData <- simulatePlpData(
-  plpDataSimulationProfile,
-  n = sampleSize
-)
-
-population <- createStudyPopulation(
-  plpData,
-  outcomeId = 2,
-  binary = TRUE,
-  firstExposureOnly = FALSE,
-  washoutPeriod = 0,
-  removeSubjectsWithPriorOutcome = FALSE,
-  priorOutcomeLookback = 99999,
-  requireTimeAtRisk = FALSE,
-  minTimeAtRisk = 0,
-  riskWindowStart = 0,
-  addExposureDaysToStart = FALSE,
-  riskWindowEnd = 365,
-  addExposureDaysToEnd = FALSE,
-  verbosity = futile.logger::INFO
-)
-

Specify the prediction algorithm to be used.

-
# Use LASSO logistic regression
-modelSettings <- setLassoLogisticRegression()
-

Specify a test fraction and a sequence of training set fractions.

-
testFraction <- 0.2
-trainFractions <- seq(0.1, 0.8, 0.1)
-

Specify the test split to be used.

-
-# Use a split by person; alternatively a time split is possible
-testSplit <- 'person'
-

Create the learning curve object.

-
learningCurve <- createLearningCurve(
-  population,
-  plpData = plpData,
-  modelSettings = modelSettings,
-  testFraction = testFraction,
-  trainFractions = trainFractions,
-  splitSeed = 1000,
-  saveModel = FALSE,
-  timeStamp = FALSE
-)
-

Plot the learning curve object (Figure 4). Specify one of the available metrics: AUROC, AUPRC, sBrier.

-
plotLearningCurve(
-  learningCurve,
-  metric='AUROC',
-  plotTitle = 'Learning Curve',
-  plotSubtitle = 'AUROC performance'
-)
-
Figure 4. Learning curve plot.

-
-
-

-Parallel processing

-

The learning curve object can be created in parallel, which can reduce computation time significantly. Currently this functionality is only available for LASSO logistic regression. Depending on the number of parallel workers it may require a significant amount of memory. We advise using the parallelized learning curve function for parameter search and exploratory data analysis. Logging and saving functionality is unavailable.

-

Use the parallelized version of the learning curve function to create the learning curve object in parallel. R will find the number of available processing cores automatically and register the required parallel backend.

-
learningCurvePar <- createLearningCurvePar(
-  population,
-  plpData =  plpData,
-  modelSettings = modelSettings,
-  testSplit = testSplit,
-  testFraction = testFraction,
-  trainFractions = trainFractions,
-  splitSeed = 1000
-)
-
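Assuming the parallel learning curve object can be plotted with the same plotting function shown earlier (an assumption, since this page does not show that step explicitly), a usage sketch would be:

# plot the learning curve that was created in parallel
plotLearningCurve(
  learningCurvePar,
  metric = 'AUROC',
  plotTitle = 'Learning Curve',
  plotSubtitle = 'AUROC performance'
)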
-
-
- - - -
- - - -
- - - - - diff --git a/docs/articles/Videos.html b/docs/articles/Videos.html deleted file mode 100644 index 8f0d7b5f9..000000000 --- a/docs/articles/Videos.html +++ /dev/null @@ -1,355 +0,0 @@ - - - - - - - -Demo Videos • PatientLevelPrediction - - - - - - - - - - -
-
- - - - -
-
- - - - - -
-

-What is a cohort table?

- - - - - - - - - -
-Click To Launch - -Description of Demo -
-Video Vignette PLP Package - -Learn what a cohort table looks like and what columns are required (a minimal example is sketched below). -
-
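For reference, a cohort table in the OMOP Common Data Model has four required columns: cohort_definition_id, subject_id, cohort_start_date and cohort_end_date. The toy rows below are an illustrative sketch only; in practice the cohort table lives in the database, not in an R data frame:

# illustrative cohort rows (values are made up)
cohort <- data.frame(
  cohort_definition_id = c(1, 1, 2),
  subject_id = c(1001, 1002, 1001),
  cohort_start_date = as.Date(c('2019-01-01', '2019-03-15', '2019-06-01')),
  cohort_end_date = as.Date(c('2019-12-31', '2019-12-31', '2019-06-01'))
)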
-
-

-Setting up a connection between your database and R

- - - - - - - - - -
-Click To Launch - -Description of Demo -
-Video Vignette PLP Package - -Learn how to configure the connection to your OMOP CDM data from R using the OHDSI DatabaseConnector package. -
-
-
-

-Running a single PatientLevelPrediction model

- - - - - - - - - -
-Click To Launch - -Description of Demo -
-Video Vignette PLP Package - -Learn how to develop and validate a single PatientLevelPrediction model. -
-
-
-

-Running multiple PatientLevelPrediction models study

- - - - - - - - - -
-Click To Launch - -Description of Demo -
-Video Vignette PLP Package - -Learn how to develop and validate multiple PatientLevelPrediction models. -
-
-
-

-Designing a study in Atlas

- - - - - - - - - -
-Click To Launch - -Description of Demo -
-Video Vignette PLP Package - -Learn how to design a multiple or single PatientLevelPrediction study using Atlas. Atlas creates an R package that just needs to be built and then you’re on your way to developing multiple models! -
-
-
-

-Building and running the Atlas study

- - - - - - - - - -
-Click To Launch - -Description of Demo -
-Video Vignette PLP Package - -Learn how to build the R package generated by Atlas and how to then run the study. -
-
-
-

-Exploring the results in the shiny app

- - - - - - - - - -
-Click To Launch - -Description of Demo -
-Video Vignette PLP Package - -Learn how to interactively explore the model performance and model via the shiny apps viewPlp() and viewMultiplePlp() -
-
-
-

-Validating existing models on OMOP CDM data

- - - - - - - - - -
-Click To Launch - -Description of Demo -
-Video Vignette PLP Package - -This demo shows how you can add any existing score or logistic model and validate the model on new OMOP CDM data. This is useful for benchmarking when developing new models or to perform extensive external validation of a model across the OHDSI network. -
-
-
- - - -
- - - - -
- - - - - - diff --git a/docs/articles/Videos_files/header-attrs-2.11/header-attrs.js b/docs/articles/Videos_files/header-attrs-2.11/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/Videos_files/header-attrs-2.11/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/Videos_files/header-attrs-2.7/header-attrs.js b/docs/articles/Videos_files/header-attrs-2.7/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/Videos_files/header-attrs-2.7/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/arch1.png b/docs/articles/arch1.png deleted file mode 100644 index e4846e56f..000000000 Binary files a/docs/articles/arch1.png and /dev/null differ diff --git a/docs/articles/atlasdownload1.png b/docs/articles/atlasdownload1.png deleted file mode 100644 index ef6559fa9..000000000 Binary files a/docs/articles/atlasdownload1.png and /dev/null differ diff --git a/docs/articles/atlasdownload2.png b/docs/articles/atlasdownload2.png deleted file mode 100644 index 619f8c799..000000000 Binary files a/docs/articles/atlasdownload2.png and /dev/null differ diff --git a/docs/articles/atlasplp1.png b/docs/articles/atlasplp1.png deleted file mode 100644 index 4b21b2143..000000000 Binary files a/docs/articles/atlasplp1.png and /dev/null differ diff --git a/docs/articles/atlasplp2.png b/docs/articles/atlasplp2.png deleted file mode 100644 index 6bc7b93ad..000000000 Binary files a/docs/articles/atlasplp2.png and /dev/null differ diff --git a/docs/articles/atlasplp3.png b/docs/articles/atlasplp3.png deleted file mode 100644 index 0911b31ea..000000000 Binary files a/docs/articles/atlasplp3.png and /dev/null differ diff --git a/docs/articles/atlasplp4.png b/docs/articles/atlasplp4.png deleted file mode 100644 index b5db1b153..000000000 Binary files a/docs/articles/atlasplp4.png and /dev/null differ diff --git a/docs/articles/cirenn.png b/docs/articles/cirenn.png deleted file mode 100644 index f4e8ed054..000000000 Binary files a/docs/articles/cirenn.png and /dev/null differ diff --git a/docs/articles/cnn_lstm.png b/docs/articles/cnn_lstm.png deleted file mode 100644 index a16e1417d..000000000 Binary files a/docs/articles/cnn_lstm.png and /dev/null differ diff --git a/docs/articles/cnn_mlf2.png b/docs/articles/cnn_mlf2.png deleted file mode 100644 index 2b69c159b..000000000 Binary files a/docs/articles/cnn_mlf2.png and /dev/null differ diff --git a/docs/articles/conv_arch1.png b/docs/articles/conv_arch1.png deleted file mode 
100644 index 5970b3f1c..000000000 Binary files a/docs/articles/conv_arch1.png and /dev/null differ diff --git a/docs/articles/conv_arch2.png b/docs/articles/conv_arch2.png deleted file mode 100644 index a51ccf08e..000000000 Binary files a/docs/articles/conv_arch2.png and /dev/null differ diff --git a/docs/articles/covcnn.png b/docs/articles/covcnn.png deleted file mode 100644 index 82dd2832f..000000000 Binary files a/docs/articles/covcnn.png and /dev/null differ diff --git a/docs/articles/covcnn2.png b/docs/articles/covcnn2.png deleted file mode 100644 index 0734a49eb..000000000 Binary files a/docs/articles/covcnn2.png and /dev/null differ diff --git a/docs/articles/demographicSummary.png b/docs/articles/demographicSummary.png deleted file mode 100644 index 8ceafbee8..000000000 Binary files a/docs/articles/demographicSummary.png and /dev/null differ diff --git a/docs/articles/ensemble.png b/docs/articles/ensemble.png deleted file mode 100644 index 6e2173a48..000000000 Binary files a/docs/articles/ensemble.png and /dev/null differ diff --git a/docs/articles/example1/ATLAS_O.png b/docs/articles/example1/ATLAS_O.png deleted file mode 100644 index 3cda2abf7..000000000 Binary files a/docs/articles/example1/ATLAS_O.png and /dev/null differ diff --git a/docs/articles/example1/ATLAS_T.png b/docs/articles/example1/ATLAS_T.png deleted file mode 100644 index 8be57dc9e..000000000 Binary files a/docs/articles/example1/ATLAS_T.png and /dev/null differ diff --git a/docs/articles/example2/aceinhibitors.png b/docs/articles/example2/aceinhibitors.png deleted file mode 100644 index ce5148f1d..000000000 Binary files a/docs/articles/example2/aceinhibitors.png and /dev/null differ diff --git a/docs/articles/example2/angioedema.png b/docs/articles/example2/angioedema.png deleted file mode 100644 index 3adc8dcc9..000000000 Binary files a/docs/articles/example2/angioedema.png and /dev/null differ diff --git a/docs/articles/generalizability.png b/docs/articles/generalizability.png deleted file mode 100644 index b476ea71f..000000000 Binary files a/docs/articles/generalizability.png and /dev/null differ diff --git a/docs/articles/index.html b/docs/articles/index.html deleted file mode 100644 index f48ef229e..000000000 --- a/docs/articles/index.html +++ /dev/null @@ -1,236 +0,0 @@ - - - - - - - - -Articles • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
- -
- - - -
- - - - - - - - diff --git a/docs/articles/learningCurve.png b/docs/articles/learningCurve.png deleted file mode 100644 index 19cd06691..000000000 Binary files a/docs/articles/learningCurve.png and /dev/null differ diff --git a/docs/articles/learningCurveBias.png b/docs/articles/learningCurveBias.png deleted file mode 100644 index 3bd9f580a..000000000 Binary files a/docs/articles/learningCurveBias.png and /dev/null differ diff --git a/docs/articles/learningCurvePlot.png b/docs/articles/learningCurvePlot.png deleted file mode 100644 index a5e1f9e96..000000000 Binary files a/docs/articles/learningCurvePlot.png and /dev/null differ diff --git a/docs/articles/learningCurveVariance.png b/docs/articles/learningCurveVariance.png deleted file mode 100644 index 3212e6106..000000000 Binary files a/docs/articles/learningCurveVariance.png and /dev/null differ diff --git a/docs/articles/lstm_last.png b/docs/articles/lstm_last.png deleted file mode 100644 index 3e6fc16e5..000000000 Binary files a/docs/articles/lstm_last.png and /dev/null differ diff --git a/docs/articles/popdef1.png b/docs/articles/popdef1.png deleted file mode 100644 index 3d654fe7d..000000000 Binary files a/docs/articles/popdef1.png and /dev/null differ diff --git a/docs/articles/popdef2.png b/docs/articles/popdef2.png deleted file mode 100644 index a596e188d..000000000 Binary files a/docs/articles/popdef2.png and /dev/null differ diff --git a/docs/articles/popdef3.png b/docs/articles/popdef3.png deleted file mode 100644 index 34527ef9f..000000000 Binary files a/docs/articles/popdef3.png and /dev/null differ diff --git a/docs/articles/popdef4.png b/docs/articles/popdef4.png deleted file mode 100644 index 35d4949a5..000000000 Binary files a/docs/articles/popdef4.png and /dev/null differ diff --git a/docs/articles/popdef5.png b/docs/articles/popdef5.png deleted file mode 100644 index f6315b8a8..000000000 Binary files a/docs/articles/popdef5.png and /dev/null differ diff --git a/docs/articles/popdef6.png b/docs/articles/popdef6.png deleted file mode 100644 index 96a8abd1f..000000000 Binary files a/docs/articles/popdef6.png and /dev/null differ diff --git a/docs/articles/precisionRecall.png b/docs/articles/precisionRecall.png deleted file mode 100644 index 1f1d0f154..000000000 Binary files a/docs/articles/precisionRecall.png and /dev/null differ diff --git a/docs/articles/predictionDistribution.png b/docs/articles/predictionDistribution.png deleted file mode 100644 index 87bafc361..000000000 Binary files a/docs/articles/predictionDistribution.png and /dev/null differ diff --git a/docs/articles/preferencePDF.png b/docs/articles/preferencePDF.png deleted file mode 100644 index 3b3528452..000000000 Binary files a/docs/articles/preferencePDF.png and /dev/null differ diff --git a/docs/articles/problems.png b/docs/articles/problems.png deleted file mode 100644 index 931efa6d6..000000000 Binary files a/docs/articles/problems.png and /dev/null differ diff --git a/docs/articles/shinyroc.png b/docs/articles/shinyroc.png deleted file mode 100644 index 579fab31f..000000000 Binary files a/docs/articles/shinyroc.png and /dev/null differ diff --git a/docs/articles/shinysummary.png b/docs/articles/shinysummary.png deleted file mode 100644 index 75cec2430..000000000 Binary files a/docs/articles/shinysummary.png and /dev/null differ diff --git a/docs/articles/smoothCalibration.jpeg b/docs/articles/smoothCalibration.jpeg deleted file mode 100644 index 72c3cdb7a..000000000 Binary files a/docs/articles/smoothCalibration.jpeg and /dev/null differ diff --git 
a/docs/articles/sparseCalibration.png b/docs/articles/sparseCalibration.png deleted file mode 100644 index d6e34c0cf..000000000 Binary files a/docs/articles/sparseCalibration.png and /dev/null differ diff --git a/docs/articles/sparseRoc.png b/docs/articles/sparseRoc.png deleted file mode 100644 index 8a4b13cec..000000000 Binary files a/docs/articles/sparseRoc.png and /dev/null differ diff --git a/docs/articles/studydesign.png b/docs/articles/studydesign.png deleted file mode 100644 index 453f4aadd..000000000 Binary files a/docs/articles/studydesign.png and /dev/null differ diff --git a/docs/articles/variableScatterplot.png b/docs/articles/variableScatterplot.png deleted file mode 100644 index bdcf0df4a..000000000 Binary files a/docs/articles/variableScatterplot.png and /dev/null differ diff --git a/docs/authors.html b/docs/authors.html deleted file mode 100644 index 31e2e4ba2..000000000 --- a/docs/authors.html +++ /dev/null @@ -1,247 +0,0 @@ - - - - - - - - -Citation and Authors • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -

Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). -“Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data.” -Journal of the American Medical Informatics Association, 25(8), 969-975. -https://doi.org/10.1093/jamia/ocy032. -

-
@Article{,
-  author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek},
-  title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data},
-  journal = {Journal of the American Medical Informatics Association},
-  volume = {25},
-  number = {8},
-  pages = {969-975},
-  year = {2018},
-  url = {https://doi.org/10.1093/jamia/ocy032},
-}
- - - -
    -
  • -

    Jenna Reps. Author, maintainer. -

    -
  • -
  • -

    Martijn Schuemie. Author. -

    -
  • -
  • -

    Marc Suchard. Author. -

    -
  • -
  • -

    Patrick Ryan. Author. -

    -
  • -
  • -

    Peter Rijnbeek. Author. -

    -
  • -
- -
- -
- - - - -
- - - - - - - - diff --git a/docs/bootstrap-toc.css b/docs/bootstrap-toc.css deleted file mode 100644 index 5a859415c..000000000 --- a/docs/bootstrap-toc.css +++ /dev/null @@ -1,60 +0,0 @@ -/*! - * Bootstrap Table of Contents v0.4.1 (http://afeld.github.io/bootstrap-toc/) - * Copyright 2015 Aidan Feldman - * Licensed under MIT (https://github.com/afeld/bootstrap-toc/blob/gh-pages/LICENSE.md) */ - -/* modified from https://github.com/twbs/bootstrap/blob/94b4076dd2efba9af71f0b18d4ee4b163aa9e0dd/docs/assets/css/src/docs.css#L548-L601 */ - -/* All levels of nav */ -nav[data-toggle='toc'] .nav > li > a { - display: block; - padding: 4px 20px; - font-size: 13px; - font-weight: 500; - color: #767676; -} -nav[data-toggle='toc'] .nav > li > a:hover, -nav[data-toggle='toc'] .nav > li > a:focus { - padding-left: 19px; - color: #563d7c; - text-decoration: none; - background-color: transparent; - border-left: 1px solid #563d7c; -} -nav[data-toggle='toc'] .nav > .active > a, -nav[data-toggle='toc'] .nav > .active:hover > a, -nav[data-toggle='toc'] .nav > .active:focus > a { - padding-left: 18px; - font-weight: bold; - color: #563d7c; - background-color: transparent; - border-left: 2px solid #563d7c; -} - -/* Nav: second level (shown on .active) */ -nav[data-toggle='toc'] .nav .nav { - display: none; /* Hide by default, but at >768px, show it */ - padding-bottom: 10px; -} -nav[data-toggle='toc'] .nav .nav > li > a { - padding-top: 1px; - padding-bottom: 1px; - padding-left: 30px; - font-size: 12px; - font-weight: normal; -} -nav[data-toggle='toc'] .nav .nav > li > a:hover, -nav[data-toggle='toc'] .nav .nav > li > a:focus { - padding-left: 29px; -} -nav[data-toggle='toc'] .nav .nav > .active > a, -nav[data-toggle='toc'] .nav .nav > .active:hover > a, -nav[data-toggle='toc'] .nav .nav > .active:focus > a { - padding-left: 28px; - font-weight: 500; -} - -/* from https://github.com/twbs/bootstrap/blob/e38f066d8c203c3e032da0ff23cd2d6098ee2dd6/docs/assets/css/src/docs.css#L631-L634 */ -nav[data-toggle='toc'] .nav > .active > ul { - display: block; -} diff --git a/docs/bootstrap-toc.js b/docs/bootstrap-toc.js deleted file mode 100644 index 1cdd573b2..000000000 --- a/docs/bootstrap-toc.js +++ /dev/null @@ -1,159 +0,0 @@ -/*! 
- * Bootstrap Table of Contents v0.4.1 (http://afeld.github.io/bootstrap-toc/) - * Copyright 2015 Aidan Feldman - * Licensed under MIT (https://github.com/afeld/bootstrap-toc/blob/gh-pages/LICENSE.md) */ -(function() { - 'use strict'; - - window.Toc = { - helpers: { - // return all matching elements in the set, or their descendants - findOrFilter: function($el, selector) { - // http://danielnouri.org/notes/2011/03/14/a-jquery-find-that-also-finds-the-root-element/ - // http://stackoverflow.com/a/12731439/358804 - var $descendants = $el.find(selector); - return $el.filter(selector).add($descendants).filter(':not([data-toc-skip])'); - }, - - generateUniqueIdBase: function(el) { - var text = $(el).text(); - var anchor = text.trim().toLowerCase().replace(/[^A-Za-z0-9]+/g, '-'); - return anchor || el.tagName.toLowerCase(); - }, - - generateUniqueId: function(el) { - var anchorBase = this.generateUniqueIdBase(el); - for (var i = 0; ; i++) { - var anchor = anchorBase; - if (i > 0) { - // add suffix - anchor += '-' + i; - } - // check if ID already exists - if (!document.getElementById(anchor)) { - return anchor; - } - } - }, - - generateAnchor: function(el) { - if (el.id) { - return el.id; - } else { - var anchor = this.generateUniqueId(el); - el.id = anchor; - return anchor; - } - }, - - createNavList: function() { - return $(''); - }, - - createChildNavList: function($parent) { - var $childList = this.createNavList(); - $parent.append($childList); - return $childList; - }, - - generateNavEl: function(anchor, text) { - var $a = $(''); - $a.attr('href', '#' + anchor); - $a.text(text); - var $li = $('
  • '); - $li.append($a); - return $li; - }, - - generateNavItem: function(headingEl) { - var anchor = this.generateAnchor(headingEl); - var $heading = $(headingEl); - var text = $heading.data('toc-text') || $heading.text(); - return this.generateNavEl(anchor, text); - }, - - // Find the first heading level (`

    `, then `

    `, etc.) that has more than one element. Defaults to 1 (for `

    `). - getTopLevel: function($scope) { - for (var i = 1; i <= 6; i++) { - var $headings = this.findOrFilter($scope, 'h' + i); - if ($headings.length > 1) { - return i; - } - } - - return 1; - }, - - // returns the elements for the top level, and the next below it - getHeadings: function($scope, topLevel) { - var topSelector = 'h' + topLevel; - - var secondaryLevel = topLevel + 1; - var secondarySelector = 'h' + secondaryLevel; - - return this.findOrFilter($scope, topSelector + ',' + secondarySelector); - }, - - getNavLevel: function(el) { - return parseInt(el.tagName.charAt(1), 10); - }, - - populateNav: function($topContext, topLevel, $headings) { - var $context = $topContext; - var $prevNav; - - var helpers = this; - $headings.each(function(i, el) { - var $newNav = helpers.generateNavItem(el); - var navLevel = helpers.getNavLevel(el); - - // determine the proper $context - if (navLevel === topLevel) { - // use top level - $context = $topContext; - } else if ($prevNav && $context === $topContext) { - // create a new level of the tree and switch to it - $context = helpers.createChildNavList($prevNav); - } // else use the current $context - - $context.append($newNav); - - $prevNav = $newNav; - }); - }, - - parseOps: function(arg) { - var opts; - if (arg.jquery) { - opts = { - $nav: arg - }; - } else { - opts = arg; - } - opts.$scope = opts.$scope || $(document.body); - return opts; - } - }, - - // accepts a jQuery object, or an options object - init: function(opts) { - opts = this.helpers.parseOps(opts); - - // ensure that the data attribute is in place for styling - opts.$nav.attr('data-toggle', 'toc'); - - var $topContext = this.helpers.createChildNavList(opts.$nav); - var topLevel = this.helpers.getTopLevel(opts.$scope); - var $headings = this.helpers.getHeadings(opts.$scope, topLevel); - this.helpers.populateNav($topContext, topLevel, $headings); - } - }; - - $(function() { - $('nav[data-toggle="toc"]').each(function(i, el) { - var $nav = $(el); - Toc.init($nav); - }); - }); -})(); diff --git a/docs/docsearch.css b/docs/docsearch.css deleted file mode 100644 index e5f1fe1df..000000000 --- a/docs/docsearch.css +++ /dev/null @@ -1,148 +0,0 @@ -/* Docsearch -------------------------------------------------------------- */ -/* - Source: https://github.com/algolia/docsearch/ - License: MIT -*/ - -.algolia-autocomplete { - display: block; - -webkit-box-flex: 1; - -ms-flex: 1; - flex: 1 -} - -.algolia-autocomplete .ds-dropdown-menu { - width: 100%; - min-width: none; - max-width: none; - padding: .75rem 0; - background-color: #fff; - background-clip: padding-box; - border: 1px solid rgba(0, 0, 0, .1); - box-shadow: 0 .5rem 1rem rgba(0, 0, 0, .175); -} - -@media (min-width:768px) { - .algolia-autocomplete .ds-dropdown-menu { - width: 175% - } -} - -.algolia-autocomplete .ds-dropdown-menu::before { - display: none -} - -.algolia-autocomplete .ds-dropdown-menu [class^=ds-dataset-] { - padding: 0; - background-color: rgb(255,255,255); - border: 0; - max-height: 80vh; -} - -.algolia-autocomplete .ds-dropdown-menu .ds-suggestions { - margin-top: 0 -} - -.algolia-autocomplete .algolia-docsearch-suggestion { - padding: 0; - overflow: visible -} - -.algolia-autocomplete .algolia-docsearch-suggestion--category-header { - padding: .125rem 1rem; - margin-top: 0; - font-size: 1.3em; - font-weight: 500; - color: #00008B; - border-bottom: 0 -} - -.algolia-autocomplete .algolia-docsearch-suggestion--wrapper { - float: none; - padding-top: 0 -} - -.algolia-autocomplete 
.algolia-docsearch-suggestion--subcategory-column { - float: none; - width: auto; - padding: 0; - text-align: left -} - -.algolia-autocomplete .algolia-docsearch-suggestion--content { - float: none; - width: auto; - padding: 0 -} - -.algolia-autocomplete .algolia-docsearch-suggestion--content::before { - display: none -} - -.algolia-autocomplete .ds-suggestion:not(:first-child) .algolia-docsearch-suggestion--category-header { - padding-top: .75rem; - margin-top: .75rem; - border-top: 1px solid rgba(0, 0, 0, .1) -} - -.algolia-autocomplete .ds-suggestion .algolia-docsearch-suggestion--subcategory-column { - display: block; - padding: .1rem 1rem; - margin-bottom: 0.1; - font-size: 1.0em; - font-weight: 400 - /* display: none */ -} - -.algolia-autocomplete .algolia-docsearch-suggestion--title { - display: block; - padding: .25rem 1rem; - margin-bottom: 0; - font-size: 0.9em; - font-weight: 400 -} - -.algolia-autocomplete .algolia-docsearch-suggestion--text { - padding: 0 1rem .5rem; - margin-top: -.25rem; - font-size: 0.8em; - font-weight: 400; - line-height: 1.25 -} - -.algolia-autocomplete .algolia-docsearch-footer { - width: 110px; - height: 20px; - z-index: 3; - margin-top: 10.66667px; - float: right; - font-size: 0; - line-height: 0; -} - -.algolia-autocomplete .algolia-docsearch-footer--logo { - background-image: url("data:image/svg+xml;utf8,"); - background-repeat: no-repeat; - background-position: 50%; - background-size: 100%; - overflow: hidden; - text-indent: -9000px; - width: 100%; - height: 100%; - display: block; - transform: translate(-8px); -} - -.algolia-autocomplete .algolia-docsearch-suggestion--highlight { - color: #FF8C00; - background: rgba(232, 189, 54, 0.1) -} - - -.algolia-autocomplete .algolia-docsearch-suggestion--text .algolia-docsearch-suggestion--highlight { - box-shadow: inset 0 -2px 0 0 rgba(105, 105, 105, .5) -} - -.algolia-autocomplete .ds-suggestion.ds-cursor .algolia-docsearch-suggestion--content { - background-color: rgba(192, 192, 192, .15) -} diff --git a/docs/docsearch.js b/docs/docsearch.js deleted file mode 100644 index b35504cd3..000000000 --- a/docs/docsearch.js +++ /dev/null @@ -1,85 +0,0 @@ -$(function() { - - // register a handler to move the focus to the search bar - // upon pressing shift + "/" (i.e. "?") - $(document).on('keydown', function(e) { - if (e.shiftKey && e.keyCode == 191) { - e.preventDefault(); - $("#search-input").focus(); - } - }); - - $(document).ready(function() { - // do keyword highlighting - /* modified from https://jsfiddle.net/julmot/bL6bb5oo/ */ - var mark = function() { - - var referrer = document.URL ; - var paramKey = "q" ; - - if (referrer.indexOf("?") !== -1) { - var qs = referrer.substr(referrer.indexOf('?') + 1); - var qs_noanchor = qs.split('#')[0]; - var qsa = qs_noanchor.split('&'); - var keyword = ""; - - for (var i = 0; i < qsa.length; i++) { - var currentParam = qsa[i].split('='); - - if (currentParam.length !== 2) { - continue; - } - - if (currentParam[0] == paramKey) { - keyword = decodeURIComponent(currentParam[1].replace(/\+/g, "%20")); - } - } - - if (keyword !== "") { - $(".contents").unmark({ - done: function() { - $(".contents").mark(keyword); - } - }); - } - } - }; - - mark(); - }); -}); - -/* Search term highlighting ------------------------------*/ - -function matchedWords(hit) { - var words = []; - - var hierarchy = hit._highlightResult.hierarchy; - // loop to fetch from lvl0, lvl1, etc. 
- for (var idx in hierarchy) { - words = words.concat(hierarchy[idx].matchedWords); - } - - var content = hit._highlightResult.content; - if (content) { - words = words.concat(content.matchedWords); - } - - // return unique words - var words_uniq = [...new Set(words)]; - return words_uniq; -} - -function updateHitURL(hit) { - - var words = matchedWords(hit); - var url = ""; - - if (hit.anchor) { - url = hit.url_without_anchor + '?q=' + escape(words.join(" ")) + '#' + hit.anchor; - } else { - url = hit.url + '?q=' + escape(words.join(" ")); - } - - return url; -} diff --git a/docs/index.html b/docs/index.html deleted file mode 100644 index d889ddc26..000000000 --- a/docs/index.html +++ /dev/null @@ -1,343 +0,0 @@ - - - - - - - -Package for patient level prediction using data in the OMOP Common Data - Model • PatientLevelPrediction - - - - - - - - - - -
    -
    - - - - -
    -
    - -
    - - -

    PatientLevelPrediction is part of HADES.

    -
    -
    -

    -Introduction

    -

    PatientLevelPrediction is an R package for building and validating patient-level predictive models using data in the OMOP Common Data Model format.

    -

    Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.

    -

    The figure below illustrates the prediction problem we address. Among a population at risk, we aim to predict which patients at a defined moment in time (t = 0) will experience some outcome during a time-at-risk. Prediction is done using only information about the patients in an observation window prior to that moment in time.

    -

    -

To define a prediction problem we have to define t=0 by a Target Cohort (T), the outcome we would like to predict by an outcome cohort (O), and the time-at-risk (TAR). Furthermore, we have to make design choices for the model we would like to develop, and determine the observational datasets to perform internal and external validation. This conceptual framework works for all types of prediction problems, for example those presented below (T=green, O=red).
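As a sketch of how these design choices map to code (the settings below are illustrative and reuse the createStudyPopulation() arguments shown in the learning-curve example elsewhere in this documentation; they are not a prescribed configuration):

# the target cohort (T) behind plpData defines t = 0, outcomeId selects the
# outcome cohort (O), and riskWindowStart/riskWindowEnd define the TAR
population <- createStudyPopulation(
  plpData,
  outcomeId = 2,
  riskWindowStart = 1,
  riskWindowEnd = 365,
  requireTimeAtRisk = TRUE,
  removeSubjectsWithPriorOutcome = TRUE
)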

    -

    -
    -
    -

    -Features

    -
      -
    • Takes one or more target cohorts (Ts) and one or more outcome cohorts (Os) and develops and validates models for all T and O combinations.
    • -
    • Allows for multiple prediction design options.
    • -
    • Extracts the necessary data from a database in OMOP Common Data Model format for multiple covariate settings.
    • -
    • Uses a large set of covariates including for example all drugs, diagnoses, procedures, as well as age, comorbidity indexes, and custom covariates.
    • -
    • Includes a large number of state-of-the-art machine learning algorithms that can be used to develop predictive models, including Regularized logistic regression, Random forest, Gradient boosting machines, Decision tree, Naive Bayes, K-nearest neighbours, Neural network and Deep learning (Convolutional neural networks, Recurrent neural network and Deep nets).
    • -
    • Allows you to add custom algorithms.
    • -
    • Contains functionality to externally validate models.
    • -
    • Includes functions to plot and explore model performance (ROC + Calibration).
    • -
    • Includes a shiny app to interactively view and explore results.
    • -
    • Implements existing models.
    • -
    • Builds ensemble models.
    • -
    • Builds Deep Learning models.
    • -
    • Generates learning curves.
    • -
    • Automatically creates a word document containing all the study results.
    • -
    -
    -
    -

    -Screenshots

    - - - - - - - - - -
    -

    Calibration plot

    -
    -

    ROC plot

    -
    -Calibration Plot - -ROC Plot -
    -

    Demo of the Shiny Apps can be found here:

    - -
    -
    -

    -Technology

    -

    PatientLevelPrediction is an R package, with some functions implemented in C++ and python.

    -
    -
    -

    -System Requirements

    -

    Requires R (version 3.3.0 or higher). Installation on Windows requires RTools. Libraries used in PatientLevelPrediction require Java and Python.

    -

The python installation is required for some of the machine learning algorithms. We advise installing Python 3.7 using Anaconda (https://www.continuum.io/downloads).
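A quick, trivial way to confirm the stated R requirement from within a running R session:

# TRUE if the running R version meets the minimum stated above
getRversion() >= "3.3.0"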

    -
    -
    -

    -Getting Started

    -
      -
    • To install the package please read the Package Installation guide

    • -
    • Have a look at the video below for an extensive demo of the package.

    • -
    -

    Video Vignette PLP Package

    -

    Please read the main vignette for the package:

    - -

    In addition we have created vignettes that describe advanced functionality in more detail:

    - -

    Package manual: PatientLevelPrediction.pdf

    -
    -
    -

    -User Documentation

    -

    Documentation can be found on the package website.

    -

    PDF versions of the documentation are also available, as mentioned above.

    -
    -
    -

    -Support

    - -
    -
    -

    -Contributing

    -

    Read here how you can contribute to this package.

    -
    -
    -

    -License

    -

    PatientLevelPrediction is licensed under Apache License 2.0

    -
    -
    -

    -Development

    -

    PatientLevelPrediction is being developed in R Studio.

    -

    Beta

    -
    -
    -

    -Acknowledgements

    -
      -
    • The package is maintained by Jenna Reps and Peter Rijnbeek and has been developed with major contributions from Martijn Schuemie, Patrick Ryan, and Marc Suchard.
    • -
    • We would like to thank the following persons for their contributions to the package: Seng Chan You, Ross Williams, Henrik John, Xiaoyong Pan, James Wiggins.
    • -
    • This project is supported in part through the National Science Foundation grant IIS 1251151.
    • -
    -
    - -
    - - -
    - - -
    - -
    -

    Site built with pkgdown 1.6.1.

    -
    - -
    -
    - - - - - - diff --git a/docs/link.svg b/docs/link.svg deleted file mode 100644 index 88ad82769..000000000 --- a/docs/link.svg +++ /dev/null @@ -1,12 +0,0 @@ - - - - - - diff --git a/docs/news/index.html b/docs/news/index.html deleted file mode 100644 index 366774228..000000000 --- a/docs/news/index.html +++ /dev/null @@ -1,625 +0,0 @@ - - - - - - - - -Changelog • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

    -PatientLevelPrediction 5.0.3

    -
      -
    • updated result schema (added model_design table with settings and added attrition table)
    • -
    • updated shiny app for new database result schema
    • -
    • removed C++ code for AUC and Rcpp dependency, now using pROC instead as faster
    • -
    • made covariate summary optional when externally validating
    • -
    -
    -
    -

    -PatientLevelPrediction 5.0.2

    -
      -
    • updated json structure for specifying study design (made it friendlier to read)
    • -
    • includes smooth calibration plot fix (done by Alex)
    • -
    • fixed bug with multiple sample methods or feature engineering settings causing invalid error
    • -
    -
    -
    -

    -PatientLevelPrediction 5.0.0

    -
      -
    • plpModel now saved as json files when possible
    • -
    • Updated runPlp to make more modular
    • -
    • now possible to customise data splitting, feature engineering, sampling (over/under) and learning algorithm
    • -
    • added function for extracting cohort covariates
    • -
    • updated evaluation to evaluate per strata (evaluation column)
    • -
    • updated plpModel structure
    • -
    • updated runPlp structure
    • -
    • updated shiny and package to use tidyr and not reshape2
    • -
    • sklearn learning algorithms share the same fit function
    • -
    • r learning algorithms share the same fit function
    • -
    • interface to cyclops code revised
    • -
    • ensemble learning removed (will be in separate package)
    • -
    • deep learning removed (will be in DeepPatientLevelPrediction package)
    • -
    -
    -
    -

    -PatientLevelPrediction 4.4.2

    -
      -
    • revised toSparseM() to do conversion in one go but check RAM availability beforehand.
    • -
    • removed temporal plpData conversion in toSparseM (this will be done in DeepPatientLevelPrediction)
    • -
    -
    -
    -

    -PatientLevelPrediction 4.4.1

    -
      -
    • shiny can now read csv results
    • -
    • objects loaded via loadPlpFromCsv() can be saved using savePlpResult()
    • -
    -
    -
    -

    -PatientLevelPrediction 4.4.0

    -
      -
    • added database result storage
    • -
    • added interface to database results in shiny
    • -
    • merged in shinyRepo that changed the shiny app to make it modular and added new features
    • -
    • removed deep learning as this is being added into new OHDSI package DeepPatientLevelPrediction
    • -
    -
    -
    -

    -PatientLevelPrediction 4.3.10

    -
      -
    • save xgboost model as json file for transparency
    • -
    • set connectionDetails to NULL in getPlpData
    • -
    -
    -
    -

    -PatientLevelPrediction 4.3.9

    -
      -
    • updated andromeda functions - restrict to pop and tidy covs for speed
    • -
    • quick fix for GBM survival predicting negative values
    • -
    • fixed occasional demoSum error for survival models
    • -
    • updated index creation to use Andromeda function
    • -
    -
    -
    -

    -PatientLevelPrediction 4.3.8

    -
      -
    • fixed bug when normalize data is false
    • -
    • fixed bugs when single feature (gbm + python)
    • -
    • updated GBM
    • -
    -
    -
    -

    -PatientLevelPrediction 4.3.7

    -
      -
    • updated calibration slope
    • -
    • fixed missing age/gender in prediction
    • -
    • fixed shiny intercept bug
    • -
    • fixed diagnostic
    • -
    • fixed missing covariateSettings in load cvs plp
    • -
    -
    -
    -

    -PatientLevelPrediction 4.3.6

    -
      -
    • Removed plpData from evaluation
    • -
    • Added recalibration into externalVal
    • -
    • Updated shiny app for recalibration
    • -
    • Added population creation setting to use cohortEndDate as timeAtRisk end
    • -
    • fixed tests
    • -
    -
    -
    -

    -PatientLevelPrediction 4.3.3

    -
      -
    • Reduced imports by adding code to install some dependencies when used
    • -
    -
    -
    -

    -PatientLevelPrediction 4.3.2

    -
      -
    • fixed csv result saving bug when no model param
    • -
    -
    -
    -

    -PatientLevelPrediction 4.3.1

    -
      -
    • fixed r check vignette issues
    • -
    • added conda install to test
    • -
    -
    -
    -

    -PatientLevelPrediction 4.3.0

    -
      -
    • finalised permutation feature importance
    • -
    -
    -
    -

    -PatientLevelPrediction 4.2.10

    -
      -
    • fixed deepNN index issue (reported on github - thanks dapritchard)
    • -
    • add compression to python pickles
    • -
    • removed requirement to have outcomeCount for prediction with python models
    • -
    -
    -
    -

    -PatientLevelPrediction 4.2.9

    -
      -
    • cleaned all checks
    • -
    • fixed bug in python toSparseMatrix
    • -
    • fixed warning in studyPop
    • -
    -
    -
    -

    -PatientLevelPrediction 4.2.8

    -
      -
    • fixed bug (identified by Chungsoo) in covariateSummary
    • -
    • fixed bug with thresholdSummary
    • -
    • edited threshold summary function to make it cleaner
    • -
    • added to ensemble where you can combine multiple models into an ensemble
    • -
    • cleaned up the notes and tests
    • -
    • updated simulated data covariateId in tests to use integer64
    • -
    • fixed description imports (and sorted them)
    • -
    -
    -
    -

    -PatientLevelPrediction 4.2.7

    -
      -
    • fixed Cox model calibration plots
    • -
    • fixed int64 conversion bug
    • -
    -
    -
    -

    -PatientLevelPrediction 4.2.6

    -
      -
    • added baseline risk to Cox model
    • -
    -
    -
    -

    -PatientLevelPrediction 4.2.3

    -
      -
    • updated shiny: added attrition and hyper-parameter grid search into settings
    • -
    -
    -
    -

    -PatientLevelPrediction 4.2.2

    -
      -
    • updated shiny app added 95% CI to AUC in summary, size is now complete data size and there is a column valPercent that tells what percentage of the data were used for validation
    • -
    -
    -
    -

    -PatientLevelPrediction 4.2.1

    -
      -
    • updated GBMsurvival to use survival metrics and c-stat
    • -
    -
    -
    -

    -PatientLevelPrediction 4.2.0

    -
      -
    • added survival metrics
    • -
    -
    -
    -

    -PatientLevelPrediction 4.1.0

    -
      -
    • added updates and fixes into master from development branch
    • -
    -
    -
    -

    -PatientLevelPrediction 4.0.6

    -
      -
    • fixed bug with pdw data extraction due to multiple person_id columns
    • -
    • fixed bug in shiny app converting covariate values due to tibble
    • -
    -
    -
    -

    -PatientLevelPrediction 4.0.5

    -
      -
    • added calibration updates: cal-in-large, weak cal
    • -
    • updated smooth cal plot (sample for speed in big data)
    • -
    • defaulted to 100 values in calibrationSummary + updated cal plot
    • -
    -
    -
    -

    -PatientLevelPrediction 4.0.4

    -
      -
    • fixed backwards compat with normalization
    • -
    • fixed python joblib dependancy
    • -
    -
    -
    -

    -PatientLevelPrediction 4.0.2

    -
      -
    • fixed bug in preprocessing
    • -
    • added cross validation aucs to LR, GBM, RF and MLP
    • -
    • added more settings into MLP
    • -
    • added threads option in LR
    • -
    -
    -
    -

    -PatientLevelPrediction 4.0.1

    -
      -
    • fixed minor bug with shiny dependency
    • -
    • fixed some tests
    • -
    • added standardizedMeanDiff to covariatesummary
    • -
    • updated createStudyPopulation to make it cleaner to read and count outcome per TAR
    • -
    -
    -
    -

    -PatientLevelPrediction 4.0.0

    -
      -
    • Andromeda replaced ff data objects
    • -
    • added age/gender into cohort
    • -
    • fixed python warnings
    • -
    • updated shiny plp viewer
    • -
    -
    -
    -

    -PatientLevelPrediction 3.0.16

    -
      -
    • Fixed bug when running multiple analyses using a data extraction sample with multiple covariate settings
    • -
    -
    -
    -

    -PatientLevelPrediction 3.0.15

    -
      -
    • improved shiny PLP viewer
    • -
    • added diagnostic shiny viewer
    • -
    -
    -
    -

    -PatientLevelPrediction 3.0.14

    -
      -
    • updated external validate code to enable custom covariates using ATLAS cohorts
    • -
    • fixed issues with startAnchor and endAnchor
    • -
    -
    -
    -

    -PatientLevelPrediction 3.0.13

    -
      -
    • Deprecating addExposureDaysToStart and addExposureDaysToEnd arguments in createStudyPopulation, adding new arguments called startAnchor and endAnchor. The hope is this is less confusing.
    • -
    • fixed transfer learning code (can now transfer or fine-tune model)
    • -
    • made view plp shiny apps work when some results are missing
    • -
    -
    -
    -

    -PatientLevelPrediction 3.0.12

    -
      -
    • set up testing
    • -
    • fixed build warnings
    • -
    -
    -
    -

    -PatientLevelPrediction 3.0.11

    -
      -
    • added tests to get >70% coverage (keras tests too slow for travis)
    • -
    • Fixed minor bugs
    • -
    • Fixed deep learning code and removed pythonInR dependancy
    • -
    • combined shiny into one file with one interface
    • -
    -
    -
    -

    -PatientLevelPrediction 3.0.10

    -
      -
    • added recalibration using 25% sample in existing models
    • -
    • added option to provide score to probabilities for existing models
    • -
    • fixed warnings with some plots
    • -
    -
    -
    -

    -PatientLevelPrediction 3.0.9

    -

    Small bug fixes:

    -
      -
    • added analysisId into model saving/loading
    • -
    • made external validation saving recursive
    • -
    • added removal of patients with negative TAR when creating population
    • -
    • added option to apply model without preprocessing settings (make them NULL)
    • -
    • updated create study population to remove patients with negative time-at-risk
    • -
    -
    -
    -

    -PatientLevelPrediction 3.0.8

    -

    Changes:

    -
      -
    • merged in bug fix from Martijn - fixed AUC bug causing crash with big data
    • -
    • update SQL code to be compatible with v6.0 OMOP CDM
    • -
    • added save option to external validate PLP
    • -
    -
    -
    -

    -PatientLevelPrediction 3.0.7

    -

    Changes:

    -
      -
    • Updated splitting functions to include a splitby subject and renamed personSplitter to randomSplitter
    • -
    • Cast indices to integer in python functions to fix bug with non integer sparse matrix indices
    • -
    -
    -
    -

    -PatientLevelPrediction 3.0.5

    -

    Changes:

    -
      -
    • Added GLM status to log (will now inform about any fitting issue in log)
    • -
    • Added GBM survival model (still under development)
    • -
    • Added RF quantile regression (still under development)
    • -
    • Updated viewMultiplePlp() to match PLP skeleton package app
    • -
    • Updated single plp vignette with additional example
    • -
    • Merge in deep learning updates from Chan
    • -
    -
    -
    -

    -PatientLevelPrediction 3.0.4

    -

    Changes:

    -
      -
    • Updated website
    • -
    -
    -
    -

    -PatientLevelPrediction 3.0.3

    -

    Changes:

    -
      -
    • Added more tests
    • -
    • test files now match R files
    • -
    -
    -
    -

    -PatientLevelPrediction 3.0.2

    -

    Changes:

    -
      -
    • Fixed ensemble stacker
    • -
    -
    -
    -

    -PatientLevelPrediction 3.0.1

    -

    Changes:

    -
      -
    • Using reticulate for python interface
    • -
    • Speed improvements
    • -
    • Bug fixes
    • -
    -
    -
    - - - -
    - - -
    - - -
    -

    Site built with pkgdown 1.6.1.

    -
    - -
    -
diff --git a/docs/pkgdown.css b/docs/pkgdown.css
deleted file mode 100644
index 1273238dd..000000000
Removed generated pkgdown stylesheet (367 lines of site CSS).

diff --git a/docs/pkgdown.js b/docs/pkgdown.js
deleted file mode 100644
index 7e7048fae..000000000
Removed generated pkgdown JavaScript (108 lines: navbar highlighting and clipboard helpers).

diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml
deleted file mode 100644
index 7f8ad5eb7..000000000
Removed pkgdown build metadata (pandoc 2.11.4, pkgdown 1.6.1, the article index for the package vignettes, last_built 2022-03-09T19:04Z).

diff --git a/docs/reference/PatientLevelPrediction.html b/docs/reference/PatientLevelPrediction.html
deleted file mode 100644
index 635b9807e..000000000
Removed pkgdown reference page for the PatientLevelPrediction package: a package for running predictions using data in the OMOP CDM.
diff --git a/docs/reference/Rplot001.png b/docs/reference/Rplot001.png
deleted file mode 100644
index 17a358060..000000000
Binary files a/docs/reference/Rplot001.png and /dev/null differ

diff --git a/docs/reference/accuracy.html b/docs/reference/accuracy.html
deleted file mode 100644
index 92d9a6902..000000000
Removed pkgdown reference page for accuracy(): calculate the accuracy.

Usage:
  accuracy(TP, TN, FN, FP)
Arguments:
  TP: number of true positives
  TN: number of true negatives
  FN: number of false negatives
  FP: number of false positives
Value: the accuracy value.
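A minimal usage sketch for accuracy(); the counts are hypothetical and the standard confusion-matrix definition of accuracy is assumed:

  # hypothetical confusion-matrix counts
  accuracy(TP = 80, TN = 900, FN = 20, FP = 100)
  # with the standard definition this is (80 + 900) / 1100, roughly 0.89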
diff --git a/docs/reference/addRecalibration.html b/docs/reference/addRecalibration.html
deleted file mode 100644
index 2b920805e..000000000
Removed pkgdown reference page for addRecalibration(): adds the recalibration results to the main results.

Usage:
  addRecalibration(performanceEvaluation, recalibration)
Arguments:
  performanceEvaluation: the main result performanceEvaluation
  recalibration:         the recalibration result
Value: an object of class runPlp that is recalibrated on the new data.
Details: appends the recalibration results into the main results.
diff --git a/docs/reference/applyEnsembleModel.html b/docs/reference/applyEnsembleModel.html
deleted file mode 100644
index cc9346c8e..000000000
Removed pkgdown reference page for applyEnsembleModel(): apply a trained ensemble model on new data. Applies a Patient Level Prediction model on Patient Level Prediction Data and gets the predicted risk in [0,1] for each person in the population. If the user inputs a population with an outcomeCount column then the function also returns the evaluation of the prediction (AUC, brier score, calibration).

Usage:
  applyEnsembleModel(
    population,
    dataList,
    ensembleModel,
    analysisId = NULL,
    calculatePerformance = T
  )
Arguments:
  population:           the population of people who you want to predict the risk for
  dataList:             the plpData list for the population
  ensembleModel:        the trained ensemble model returned by running runEnsembleModel
  analysisId:           the analysis ID, i.e. the ID of the ensemble model training run
  calculatePerformance: whether to also calculate the performance metrics [default TRUE]
Examples:
  if (FALSE) {
    # load the model and data
    plpData <- loadPlpData("plpdata/")
    results <- PatientLevelPrediction::runEnsembleModel(
      population,
      dataList = list(plpData, plpData),
      modelList = list(model, model),
      testSplit = "person",
      testFraction = 0.2,
      nfold = 3,
      splitSeed = 1000,
      ensembleStrategy = "stacked"
    )
    # use the same population settings as the model:
    populationSettings <- plpModel$populationSettings
    populationSettings$plpData <- plpData
    population <- do.call(createStudyPopulation, populationSettings)

    # get the prediction; make sure the ensemble strategy for training and apply is the same:
    prediction <- applyEnsembleModel(
      population,
      dataList = list(plpData, plpData),
      ensembleModel = results,
      analysisId = NULL
    )$prediction
  }
diff --git a/docs/reference/applyModel.html b/docs/reference/applyModel.html
deleted file mode 100644
index 2b5feb7af..000000000
Removed pkgdown reference page for applyModel(): apply a trained model on new data. Applies a Patient Level Prediction model on Patient Level Prediction Data and gets the predicted risk in [0,1] for each person in the population. If the user inputs a population with an outcomeCount column then the function also returns the evaluation of the prediction (AUC, brier score, calibration).

Usage:
  applyModel(
    population,
    plpData,
    plpModel,
    calculatePerformance = T,
    databaseOutput = NULL,
    silent = F
  )
Arguments:
  population:           the population of people who you want to predict the risk for
  plpData:              the plpData for the population
  plpModel:             the trained PatientLevelPrediction model
  calculatePerformance: whether to also calculate the performance metrics [default TRUE]
  databaseOutput:       whether to save the details into the prediction database
  silent:               whether to turn off progress reporting
Examples:
  if (FALSE) {
    # load the model and data
    plpData <- loadPlpData("C:/plpdata")
    plpModel <- loadPlpModel("C:/plpmodel")

    # use the same population settings as the model:
    populationSettings <- plpModel$populationSettings
    populationSettings$plpData <- plpData
    population <- do.call(createStudyPopulation, populationSettings)

    # get the prediction:
    prediction <- applyModel(population, plpData, plpModel)$prediction
  }
diff --git a/docs/reference/averagePrecision.html b/docs/reference/averagePrecision.html
deleted file mode 100644
index 35065ca7d..000000000
Removed pkgdown reference page for averagePrecision(): calculate the average precision.

Usage:
  averagePrecision(prediction)
Arguments:
  prediction: a prediction object
Value: the average precision.
Details: calculates the average precision from a prediction object.
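A hedged usage sketch; the prediction object below is a hypothetical data.frame with a predicted risk column (value) and an observed outcome column (outcomeCount), a layout assumed here rather than stated on the removed page:

  # hypothetical prediction object
  prediction <- data.frame(
    rowId = 1:5,
    value = c(0.1, 0.8, 0.3, 0.6, 0.2),
    outcomeCount = c(0, 1, 0, 1, 0)
  )
  averagePrecision(prediction)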
diff --git a/docs/reference/brierScore.html b/docs/reference/brierScore.html
deleted file mode 100644
index d3606f9bf..000000000
Removed pkgdown reference page for brierScore().

Usage:
  brierScore(prediction)
Arguments:
  prediction: a prediction object
Value: a list containing the brier score and the scaled brier score.
Details: calculates the brierScore from a prediction object.
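A sketch along the same lines, again assuming a prediction data.frame with value and outcomeCount columns (an assumption, not confirmed by the removed page):

  prediction <- data.frame(
    rowId = 1:4,
    value = c(0.2, 0.9, 0.4, 0.1),
    outcomeCount = c(0, 1, 1, 0)
  )
  brierScore(prediction)  # list with the brier score and the scaled brier score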
diff --git a/docs/reference/bySumFf.html b/docs/reference/bySumFf.html
deleted file mode 100644
index 87ca32289..000000000
Removed pkgdown reference page for bySumFf(): compute sum of values binned by a second variable.

Usage:
  bySumFf(values, bins)
Arguments:
  values: an ff object containing the numeric values to be summed
  bins:   an ff object containing the numeric values to bin by
Examples:
  values <- ff::as.ff(c(1, 1, 2, 2, 1))
  bins <- ff::as.ff(c(1, 1, 1, 2, 2))
  bySumFf(values, bins)
  #>   bins sums
  #> 1    1    4
  #> 2    2    3
diff --git a/docs/reference/calibrationLine.html b/docs/reference/calibrationLine.html
deleted file mode 100644
index e0f6267d5..000000000
Removed pkgdown reference page for calibrationLine().

Usage:
  calibrationLine(prediction, numberOfStrata = 10)
Arguments:
  prediction:     a prediction object
  numberOfStrata: the number of groups to split the prediction into
Details: calculates the calibration from a prediction object.
diff --git a/docs/reference/checkPlpInstallation.html b/docs/reference/checkPlpInstallation.html
deleted file mode 100644
index c9d6dee27..000000000
Removed pkgdown reference page for checkPlpInstallation(): check PatientLevelPrediction and its dependencies are correctly installed.

Usage:
  checkPlpInstallation(connectionDetails = NULL, python = T)
Arguments:
  connectionDetails: an R object of type connectionDetails created using the function createConnectionDetails in the DatabaseConnector package
  python:            whether to test the python models
Details: this function checks whether PatientLevelPrediction and its dependencies are correctly installed. This will check the database connectivity, some models, and large data object handling (ff).
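A usage sketch; the connection values are placeholders, and DatabaseConnector::createConnectionDetails() is the documented way to build the connectionDetails argument:

  # placeholder connection; set python = FALSE to skip the python model checks
  connectionDetails <- DatabaseConnector::createConnectionDetails(
    dbms = "postgresql",
    server = "localhost/ohdsi",
    user = "ohdsi_user",
    password = "secret"
  )
  checkPlpInstallation(connectionDetails = connectionDetails, python = FALSE)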
diff --git a/docs/reference/checkffFolder.html b/docs/reference/checkffFolder.html
deleted file mode 100644
index 72bfe06df..000000000
Removed pkgdown reference page for checkffFolder(): check if the fftempdir is writable.

Usage:
  checkffFolder()
Details: this function checks whether the fftempdir is writable. If not, it will ask the user to specify a writable folder.
diff --git a/docs/reference/clearffTempDir.html b/docs/reference/clearffTempDir.html
deleted file mode 100644
index a742cfd80..000000000
Removed pkgdown reference page for clearffTempDir(): clears the temporary ff directory to free up disk space.

Usage:
  clearffTempDir()
diff --git a/docs/reference/combinePlpModelSettings.html b/docs/reference/combinePlpModelSettings.html
deleted file mode 100644
index 471203a99..000000000
Removed pkgdown reference page for combinePlpModelSettings(): combine two objects specifying multiple Plp model settings.

Usage:
  combinePlpModelSettings(plpModelSetting1, plpModelSetting2)
Arguments:
  plpModelSetting1: a combination of model, covariate and population settings
  plpModelSetting2: a combination of model, covariate and population settings
Value: a list containing a dataframe settingLookupTable with all the model, covariate and population combination details, a list models containing all the model settings, a list covariateSettings containing all the covariate settings, and a list populationSettings containing all the population settings.
Details: takes two outputs of running createPlpModelSettings() and combines them.
diff --git a/docs/reference/computeAuc.html b/docs/reference/computeAuc.html
deleted file mode 100644
index 69cdd4b86..000000000
Removed pkgdown reference page for computeAuc(): compute the area under the ROC curve.

Usage:
  computeAuc(prediction, confidenceInterval = FALSE)
Arguments:
  prediction:         a prediction object as generated using the predict functions
  confidenceInterval: should 95 percent confidence intervals be computed?
Details: computes the area under the ROC curve for the predicted probabilities, given the true observed outcomes.
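A sketch using the same hypothetical prediction layout as above (value and outcomeCount columns, which is an assumption):

  prediction <- data.frame(
    rowId = 1:6,
    value = c(0.1, 0.7, 0.2, 0.9, 0.4, 0.6),
    outcomeCount = c(0, 1, 0, 1, 0, 1)
  )
  computeAuc(prediction, confidenceInterval = FALSE)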
diff --git a/docs/reference/computeAucFromDataFrames.html b/docs/reference/computeAucFromDataFrames.html
deleted file mode 100644
index 0cbce670f..000000000
Removed pkgdown reference page for computeAucFromDataFrames(): compute the area under the ROC curve.

Usage:
  computeAucFromDataFrames(
    prediction,
    status,
    time = NULL,
    confidenceInterval = FALSE,
    timePoint,
    modelType = "logistic"
  )
Arguments:
  prediction:         a vector with the predicted hazard rate
  status:             a vector with the status of 1 (event) or 0 (no event)
  time:               only for survival models: a vector with the time to event or censor (whichever comes first)
  confidenceInterval: should 95 percent confidence intervals be computed?
  timePoint:          only for survival models: time point when the AUC should be evaluated
  modelType:          type of model. Currently supported are "logistic" and "survival"
Details: computes the area under the ROC curve for the predicted probabilities, given the true observed outcomes.
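A sketch for the logistic case; the vectors are hypothetical, and time/timePoint are left out because they are documented as survival-only arguments:

  pred <- c(0.1, 0.8, 0.3, 0.6, 0.2)  # predicted probabilities
  obs  <- c(0, 1, 0, 1, 0)            # observed status (1 = event)
  computeAucFromDataFrames(
    prediction = pred,
    status = obs,
    confidenceInterval = FALSE,
    modelType = "logistic"
  )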
diff --git a/docs/reference/configurePython.html b/docs/reference/configurePython.html
deleted file mode 100644
index 4ddcfe043..000000000
Removed pkgdown reference page for configurePython(): sets up a virtual environment to use for PLP (can be conda or python).

Usage:
  configurePython(envname = "PLP", envtype = NULL)
Arguments:
  envname: a string for the name of the virtual environment (default is 'PLP')
  envtype: an option for specifying the environment as 'conda' or 'python'. If NULL then the default is 'conda' for windows users and 'python' for non-windows users
Details: this function creates a virtual environment that can be used by PatientLevelPrediction and installs all the required package dependencies. If using python, pip must be set up.
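A usage sketch with the documented arguments, choosing conda explicitly instead of relying on the OS-dependent default:

  # create a conda environment named "PLP" with the python dependencies
  configurePython(envname = "PLP", envtype = "conda")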
diff --git a/docs/reference/covariateSummary.html b/docs/reference/covariateSummary.html
deleted file mode 100644
index 5d58c34fa..000000000
Removed pkgdown reference page for covariateSummary(): summarises the covariateData to calculate the mean and standard deviation per covariate. If the labels are input it also stratifies this by class label, and if the trainRowIds and testRowIds specifying the patients in the train/test sets respectively are input, these values are also stratified by train and test set.

Usage:
  covariateSummary(
    covariateData,
    cohort,
    labels = NULL,
    strata = NULL,
    variableImportance = NULL,
    featureEngineering = NULL
  )
Arguments:
  covariateData:      the covariateData part of the plpData that is extracted using getPlpData
  cohort:             the patient cohort to calculate the summary
  labels:             a data.frame with the columns rowId and outcomeCount
  strata:             a data.frame containing the columns rowId, strataName
  variableImportance: a data.frame with the columns covariateId and value (the variable importance value)
  featureEngineering: (currently not used) a function or list of functions specifying any feature engineering to create covariates before summarising
Value: a data.frame containing CovariateCount, CovariateMean and CovariateStDev, plus these values for any specified stratification.
Details: calculates summary statistics for the covariates, stratified as described above.
diff --git a/docs/reference/createCohortCovariateSettings.html b/docs/reference/createCohortCovariateSettings.html
deleted file mode 100644
index df10200a9..000000000
Removed pkgdown reference page for createCohortCovariateSettings(): extracts covariates based on cohorts.

Usage:
  createCohortCovariateSettings(
    cohortName,
    settingId,
    cohortDatabaseSchema,
    cohortTable,
    cohortId,
    startDay = -30,
    endDay = 0,
    count = F,
    ageInteraction = F,
    lnAgeInteraction = F,
    analysisId = 456
  )
Arguments:
  cohortName:           name for the cohort
  settingId:            a unique id for the covariate time and
  cohortDatabaseSchema: the schema of the database with the cohort
  cohortTable:          the table name that contains the covariate cohort
  cohortId:             cohort id for the covariate cohort
  startDay:             the number of days prior to index to start observing the cohort
  endDay:               the number of days prior to index to stop observing the cohort
  count:                if FALSE the covariate value is binary (1 means the cohort occurred between index+startDay and index+endDay, 0 means it did not); if TRUE the covariate value is the number of unique cohort_start_dates between index+startDay and index+endDay
  ageInteraction:       if TRUE multiply the covariate value by the patient's age in years
  lnAgeInteraction:     if TRUE multiply the covariate value by the log of the patient's age in years
  analysisId:           the analysisId for the covariate
Value: an object of class covariateSettings specifying how to create the cohort covariate, with the covariateId cohortId x 100000 + settingId x 1000 + analysisId.
Details: the user specifies a cohort and time period and a covariate is constructed indicating whether patients are in the cohort during the time periods relative to the target population cohort index.
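A sketch of defining a single cohort covariate; the cohort name, schema, table and ids are placeholders:

  covSettings <- createCohortCovariateSettings(
    cohortName = "prior heart failure",  # placeholder cohort name
    settingId = 1,
    cohortDatabaseSchema = "results",    # placeholder schema and table
    cohortTable = "cohort",
    cohortId = 1234,                     # placeholder covariate cohort id
    startDay = -365,                     # look back one year before index
    endDay = 0,
    count = FALSE                        # binary covariate rather than a count
  )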
diff --git a/docs/reference/createDatabaseDetails.html b/docs/reference/createDatabaseDetails.html
deleted file mode 100644
index bac71f798..000000000
Removed pkgdown reference page for createDatabaseDetails(): create a setting that holds the details about the cdmDatabase connection for data extraction.

Usage:
  createDatabaseDetails(
    connectionDetails,
    cdmDatabaseSchema,
    cdmDatabaseName,
    tempEmulationSchema = cdmDatabaseSchema,
    cohortDatabaseSchema = cdmDatabaseSchema,
    cohortTable = "cohort",
    outcomeDatabaseSchema = cdmDatabaseSchema,
    outcomeTable = "cohort",
    cohortId = NULL,
    outcomeIds = NULL,
    cdmVersion = 5
  )
Arguments:
  connectionDetails:     an R object of type connectionDetails created using the function createConnectionDetails in the DatabaseConnector package
  cdmDatabaseSchema:     the name of the database schema that contains the OMOP CDM instance. Requires read permissions to this database. On SQL Server, this should specify both the database and the schema, for example 'cdm_instance.dbo'
  cdmDatabaseName:       a string with a shareable name of the database (this will be shown to OHDSI researchers if the results get transported)
  tempEmulationSchema:   for dbms like Oracle only: the name of the database schema where you want all temporary tables to be managed. Requires create/insert permissions to this database
  cohortDatabaseSchema:  the name of the database schema where the target cohorts are available. Requires read permissions to this database
  cohortTable:           the table name that contains the target cohorts. The expectation is that cohortTable has the format of the COHORT table: COHORT_DEFINITION_ID, SUBJECT_ID, COHORT_START_DATE, COHORT_END_DATE
  outcomeDatabaseSchema: the name of the database schema where the data used to define the outcome cohorts is available. Requires read permissions to this database
  outcomeTable:          the table name that contains the outcome cohorts. The expectation is that outcomeTable has the format of the COHORT table: COHORT_DEFINITION_ID, SUBJECT_ID, COHORT_START_DATE, COHORT_END_DATE
  cohortId:              an integer specifying the cohort id for the target cohort
  outcomeIds:            a single integer or vector of integers specifying the cohort ids for the outcome cohorts
  cdmVersion:            define the OMOP CDM version used: currently supports "4" and "5"
Value: a list with the database specific settings (this is used by the runMultiplePlp function and the skeleton packages).
Details: this function simply stores the settings for communicating with the cdmDatabase when extracting the target cohort and outcomes.
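A sketch wiring the documented arguments together; the schema names and cohort ids are placeholders, and connectionDetails is assumed to come from DatabaseConnector::createConnectionDetails():

  databaseDetails <- createDatabaseDetails(
    connectionDetails = connectionDetails,  # placeholder DatabaseConnector object
    cdmDatabaseSchema = "cdm",              # placeholder schemas and tables
    cdmDatabaseName = "my_cdm",
    cohortDatabaseSchema = "results",
    cohortTable = "cohort",
    outcomeDatabaseSchema = "results",
    outcomeTable = "cohort",
    cohortId = 1,                           # placeholder target cohort id
    outcomeIds = c(2, 3),                   # placeholder outcome cohort ids
    cdmVersion = 5
  )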
diff --git a/docs/reference/createDefaultExecuteSettings.html b/docs/reference/createDefaultExecuteSettings.html
deleted file mode 100644
index 0b51578f2..000000000
Removed pkgdown reference page for createDefaultExecuteSettings(): creates the default list of settings specifying what parts of runPlp to execute.

Usage:
  createDefaultExecuteSettings()
Value: a list with TRUE for split, preprocess, model development and covariate summary.
Details: runs split, preprocess, model development and covariate summary.
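A trivial sketch; per the description above, the defaults enable splitting, preprocessing, model development and the covariate summary:

  executeSettings <- createDefaultExecuteSettings()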
diff --git a/docs/reference/createDefaultSplitSetting.html b/docs/reference/createDefaultSplitSetting.html
deleted file mode 100644
index 3e9229228..000000000
Removed pkgdown reference page for createDefaultSplitSetting(): create the settings for defining how the plpData are split into test/validation/train sets using default splitting functions (either random stratified by outcome, time or subject splitting).

Usage:
  createDefaultSplitSetting(
    testFraction = 0.25,
    trainFraction = 0.75,
    splitSeed = sample(1e+05, 1),
    nfold = 3,
    type = "stratified"
  )
Arguments:
  testFraction:  (numeric) a real number between 0 and 1 indicating the test set fraction of the data
  trainFraction: (numeric) a real number between 0 and 1 indicating the train set fraction of the data. If not set, train is equal to 1 - test
  splitSeed:     (numeric) a seed to use when splitting the data for reproducibility (if not set a random number will be generated)
  nfold:         (numeric) an integer > 1 specifying the number of folds used in cross validation
  type:          (character) choice of:
                 'stratified': each data point is randomly assigned into the test set or a train fold, stratified so that the outcome rate is consistent in each partition
                 'time': older data are assigned into the training set and newer data are assigned into the test set
                 'subject': data are partitioned by subject; if a subject appears in the data more than once, all of the subject's data points are assigned either to the test data or to the train data (not both)
Value: an object of class splitSettings.
Details: returns an object of class splitSettings that specifies the splitting function that will be called and the settings.
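A sketch with a fixed seed for reproducibility; the other values mirror the documented defaults:

  splitSettings <- createDefaultSplitSetting(
    testFraction = 0.25,
    trainFraction = 0.75,
    splitSeed = 42,       # fixed instead of the random default
    nfold = 3,
    type = "stratified"   # or "time" / "subject" as described above
  )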
diff --git a/docs/reference/createEnsemble.html b/docs/reference/createEnsemble.html
deleted file mode 100644
index 6b66f293b..000000000
Removed pkgdown reference page for createEnsemble(): combine models into an ensemble.

Usage:
  createEnsemble(runPlpList, weighted = F, weights = NULL)
Arguments:
  runPlpList: the runPlp results for the different models to combine
  weighted:   if FALSE the mean across models is used; if TRUE, weights must be input or AUC weighting is used
  weights:    a vector of length(runPlpList) with the weights to assign each model
diff --git a/docs/reference/createExecuteSettings.html b/docs/reference/createExecuteSettings.html
deleted file mode 100644
index 02209dc22..000000000
Removed pkgdown reference page for createExecuteSettings(): creates the list of settings specifying what parts of runPlp to execute.

Usage:
  createExecuteSettings(
    runSplitData = F,
    runSampleData = F,
    runfeatureEngineering = F,
    runPreprocessData = F,
    runModelDevelopment = F,
    runCovariateSummary = F
  )
Arguments:
  runSplitData:          TRUE or FALSE whether to split data into train/test
  runSampleData:         TRUE or FALSE whether to over- or under-sample
  runfeatureEngineering: TRUE or FALSE whether to do feature engineering
  runPreprocessData:     TRUE or FALSE whether to do preprocessing
  runModelDevelopment:   TRUE or FALSE whether to develop the model
  runCovariateSummary:   TRUE or FALSE whether to create the covariate summary
Value: a list with TRUE/FALSE for each part of runPlp.
Details: defines what parts of runPlp to execute.
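A sketch enabling the typical development steps while skipping sampling and feature engineering:

  executeSettings <- createExecuteSettings(
    runSplitData = TRUE,
    runSampleData = FALSE,
    runfeatureEngineering = FALSE,
    runPreprocessData = TRUE,
    runModelDevelopment = TRUE,
    runCovariateSummary = TRUE
  )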
diff --git a/docs/reference/createExistingModelSql.html b/docs/reference/createExistingModelSql.html
deleted file mode 100644
index 786f006d3..000000000
Removed pkgdown reference page for createExistingModelSql(): apply an existing logistic regression prediction model.

Usage:
  createExistingModelSql(modelTable, modelNames, interceptTable, covariateTable,
    type = "logistic", analysisId = 112, covariateSettings, asFunctions = F,
    customCovariates = NULL, e = environment(), covariateValues = F)
Arguments:
  modelTable:        a dataframe or list of dataframes with columns: modelId, modelCovariateId, coefficientValue (all doubles)
  modelNames:        a name used in the covariate function names (no spaces)
  interceptTable:    a dataframe or list of dataframes with the columns: modelId, interceptValue
  covariateTable:    a dataframe or list of dataframes with columns: modelCovariateId, covariateId (the mapping of covariate_id to standard covariates)
  type:              the type of model: logistic or linear/score
  analysisId:        the covariate analysis_id (default 112)
  covariateSettings: the settings for the standard covariates (needed for temporal settings)
  asFunctions:       if TRUE then return two functions
  customCovariates:  enables custom SQL to be used to create custom covariates
  e:                 the environment to output the covariate setting functions to
  covariateValues:   boolean, whether to also download the covariates that make up the risk score
Details: this function is used to create custom covariates corresponding to existing models.
diff --git a/docs/reference/createFeatureEngineeringSettings.html b/docs/reference/createFeatureEngineeringSettings.html
deleted file mode 100644
index 4a9ef8b11..000000000
Removed pkgdown reference page for createFeatureEngineeringSettings(): create the settings for defining any feature engineering that will be done.

Usage:
  createFeatureEngineeringSettings(type = "none")
Arguments:
  type: (character) choice of: 'none' (no feature engineering, this is the default)
Value: an object of class featureEngineeringSettings.
Details: returns an object of class featureEngineeringSettings that specifies the feature engineering function that will be called and the settings.
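A trivial sketch; 'none' is the only type documented on this page:

  featureEngineeringSettings <- createFeatureEngineeringSettings(type = "none")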
    - - - - - - - - diff --git a/docs/reference/createLearningCurve.html b/docs/reference/createLearningCurve.html deleted file mode 100644 index ef6cc8335..000000000 --- a/docs/reference/createLearningCurve.html +++ /dev/null @@ -1,351 +0,0 @@ - - - - - - - - -createLearningCurve — createLearningCurve • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

    Creates a learning curve object, which can be plotted using the - plotLearningCurve() function.

    -
    - -
    createLearningCurve(
    -  plpData,
    -  outcomeId,
    -  parallel = T,
    -  cores = 4,
    -  modelSettings,
    -  saveDirectory = getwd(),
    -  analysisId = "learningCurve",
    -  populationSettings = createStudyPopulationSettings(),
    -  splitSettings = createDefaultSplitSetting(),
    -  trainFractions = c(0.25, 0.5, 0.75),
    -  trainEvents = c(500, 1000, 1500),
    -  sampleSettings = createSampleSettings(),
    -  featureEngineeringSettings = createFeatureEngineeringSettings(),
    -  preprocessSettings = createPreprocessSettings(minFraction = 0.001, normalize = T),
    -  logSettings = createLogSettings(),
    -  executeSettings = createExecuteSettings(runSplitData = T, runSampleData = F,
    -    runfeatureEngineering = F, runPreprocessData = T, runModelDevelopment = T,
    -    runCovariateSummary = F)
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    plpData

    An object of type plpData - the patient level prediction -data extracted from the CDM.

    outcomeId

    (integer) The ID of the outcome.

    parallel

    Whether to run the code in parallel

    cores

    The number of computer cores to use if running in parallel

    modelSettings

An object of class modelSettings created using one of the functions:

• setLassoLogisticRegression() A lasso logistic regression model
• setGradientBoostingMachine() A gradient boosting machine
• setAdaBoost() An ada boost model
• setRandomForest() A random forest model
• setDecisionTree() A decision tree model
• setCovNN() A convolutional neural network model
• setCIReNN() A recurrent neural network model
• setMLP() A neural network model
• setDeepNN() A deep neural network model
• setKNN() A KNN model
    saveDirectory

    The path to the directory where the results will be saved (if NULL uses working directory)

    analysisId

    (integer) Identifier for the analysis. It is used to create, e.g., the result folder. Default is a timestamp.

    populationSettings

An object of type populationSettings created using createStudyPopulationSettings that specifies how the data class labels are defined and, additionally, any exclusions to apply to the plpData cohort

    splitSettings

    An object of type splitSettings that specifies how to split the data into train/validation/test. -The default settings can be created using createDefaultSplitSetting.

    trainFractions

    A list of training fractions to create models for. -Note, providing trainEvents will override your input to -trainFractions.

    trainEvents

Events have been shown to be a determinant of model performance. Therefore, it is recommended to provide trainEvents rather than trainFractions. Note, providing trainEvents will override your input to trainFractions. The format should be as follows:

    sampleSettings

    An object of type sampleSettings that specifies any under/over sampling to be done. -The default is none.

    featureEngineeringSettings

    An object of featureEngineeringSettings specifying any feature engineering to be learned (using the train data)

    preprocessSettings

    An object of preprocessSettings. This setting specifies the minimum fraction of -target population who must have a covariate for it to be included in the model training -and whether to normalise the covariates before training

    logSettings

    An object of logSettings created using createLogSettings -specifying how the logging is done

    executeSettings

    An object of executeSettings specifying which parts of the analysis to run

    - -

    Value

    - -

    A learning curve object containing the various performance measures - obtained by the model for each training set fraction. It can be plotted - using plotLearningCurve.

    - -

    Examples

    -
if (FALSE) {
# define model
modelSettings <- PatientLevelPrediction::setLassoLogisticRegression()

# create learning curve (arguments follow the signature documented above)
learningCurve <- PatientLevelPrediction::createLearningCurve(
  plpData = plpData,
  outcomeId = outcomeId,
  modelSettings = modelSettings
)

# plot learning curve
PatientLevelPrediction::plotLearningCurve(learningCurve)
}
    -
    - -
    - - -
    - - -
    -

    - - - - - - - - diff --git a/docs/reference/createLearningCurvePar.html b/docs/reference/createLearningCurvePar.html deleted file mode 100644 index bf980a2e2..000000000 --- a/docs/reference/createLearningCurvePar.html +++ /dev/null @@ -1,382 +0,0 @@ - - - - - - - - -createLearningCurvePar — createLearningCurvePar • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

    Creates a learning curve in parallel, which can be plotted using - the plotLearningCurve() function. Currently this functionality is - only supported by Lasso Logistic Regression.

    -
    - -
    createLearningCurvePar(
    -  population,
    -  plpData,
    -  modelSettings,
    -  testSplit = "stratified",
    -  testFraction = 0.25,
    -  trainFractions = c(0.25, 0.5, 0.75),
    -  trainEvents = NULL,
    -  splitSeed = NULL,
    -  nfold = 3,
    -  indexes = NULL,
    -  verbosity = "TRACE",
    -  minCovariateFraction = 0.001,
    -  normalizeData = T,
    -  saveDirectory = getwd(),
    -  savePlpData = F,
    -  savePlpResult = F,
    -  savePlpPlots = F,
    -  saveEvaluation = F,
    -  timeStamp = FALSE,
    -  analysisId = "lc-",
    -  cores = NULL
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    population

    The population created using createStudyPopulation() -that will be used to develop the model.

    plpData

    An object of type plpData - the patient level -prediction data extracted from the CDM.

    modelSettings

An object of class modelSettings created using one of the functions. Currently only one model is supported:

• setLassoLogisticRegression - a lasso logistic regression model
    testSplit

Specifies the type of evaluation used. Can be either 'person' or 'time'. The value 'time' finds the date that splits the population into the testing and training fractions provided. Patients with an index after this date are assigned to the test set and patients with an index prior to this date are assigned to the training set. The value 'person' splits the data randomly into testing and training sets according to the fractions provided. The split is stratified by the class label.

    testFraction

    The fraction of the data, which will be used as the -testing set in the patient split evaluation.

    trainFractions

    A list of training fractions to create models for. -Note, providing trainEvents will override your input to -trainFractions.

    trainEvents

Events have been shown to be a determinant of model performance. Therefore, it is recommended to provide trainEvents rather than trainFractions. Note, providing trainEvents will override your input to trainFractions. The format should be as follows:

    splitSeed

    The seed used to split the testing and training set when -using a 'person' type split

    nfold

The number of folds used in the cross validation (default = 3).

    indexes

    A dataframe containing a rowId and index column where the -index value of -1 means in the test set, and positive integer represents -the cross validation fold (default is NULL).

    verbosity

    Sets the level of the verbosity. If the log level is at or -higher in priority than the logger threshold, a message will print. The -levels are:

• DEBUG - highest verbosity showing all debug statements
• TRACE - showing information about start and end of steps
• INFO - show informative messages (default)
• WARN - show warning messages
• ERROR - show error messages
• FATAL - be silent except for fatal errors
    minCovariateFraction

Minimum covariate prevalence in the population to avoid removal during preprocessing.

    normalizeData

    Whether to normalise the data

    saveDirectory

    Location to save log and results

    savePlpData

    Whether to save the plpData

    savePlpResult

    Whether to save the plpResult

    savePlpPlots

    Whether to save the plp plots

    saveEvaluation

    Whether to save the plp performance csv files

    timeStamp

    Include a timestamp in the log

    analysisId

    The analysis unique identifier

    cores

    The number of cores to use

    - -

    Value

    - -

    A learning curve object containing the various performance measures - obtained by the model for each training set fraction. It can be plotted - using plotLearningCurve.

    - -

    Examples

    -
if (FALSE) {
# define model
modelSettings <- setLassoLogisticRegression()

# register parallel backend
registerParallelBackend()

# create learning curve
learningCurve <- createLearningCurvePar(population,
                                        plpData,
                                        modelSettings)
# plot learning curve
plotLearningCurve(learningCurve)
}
    -
    - -
    - - -
    - - -
    -

    - - - - - - - - diff --git a/docs/reference/createLogSettings.html b/docs/reference/createLogSettings.html deleted file mode 100644 index a37373f1c..000000000 --- a/docs/reference/createLogSettings.html +++ /dev/null @@ -1,246 +0,0 @@ - - - - - - - - -Create the settings for logging the progression of the analysis — createLogSettings • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

    Create the settings for logging the progression of the analysis

    -
    - -
    createLogSettings(verbosity = "DEBUG", timeStamp = T, logName = "runPlp Log")
    - -

    Arguments

    - - - - - - - - - - - - - - -
    verbosity

    Sets the level of the verbosity. If the log level is at or higher in priority than the logger threshold, a message will print. The levels are:

• DEBUG - Highest verbosity showing all debug statements
• TRACE - Showing information about start and end of steps
• INFO - Show informative information (Default)
• WARN - Show warning messages
• ERROR - Show error messages
• FATAL - Be silent except for fatal errors
    timeStamp

    If TRUE a timestamp will be added to each logging statement. Automatically switched on for TRACE level.

    logName

    A string reference for the logger

    - -

    Value

    - -

    An object of class logSettings

    -

    Details

    - -

    Returns an object of class logSettings that specifies the logger settings

    - -
    - -
    - - -
    - - -
    -

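A minimal sketch using only the defaults documented above; the chosen verbosity and log name are illustrative:

logSettings <- PatientLevelPrediction::createLogSettings(
  verbosity = "INFO",     # one of DEBUG/TRACE/INFO/WARN/ERROR/FATAL
  timeStamp = TRUE,       # prefix each logging statement with a timestamp
  logName = "runPlp Log"  # string reference for the logger
)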
    - - - - - - - - diff --git a/docs/reference/createLrSql.html b/docs/reference/createLrSql.html deleted file mode 100644 index 754439f3c..000000000 --- a/docs/reference/createLrSql.html +++ /dev/null @@ -1,255 +0,0 @@ - - - - - - - - -Convert logistic regression model to sql code... — createLrSql • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

    Convert logistic regression model to sql code...

    -
    - -
    createLrSql(
    -  models,
    -  modelNames,
    -  covariateConstructionName = "prediction",
    -  modelTable = "#model_table",
    -  analysisId = 111,
    -  e = environment(),
    -  databaseOutput = NULL
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    models

A trained plp model.

    modelNames

    A name used in the covariate function names (no spaces)

    covariateConstructionName

    the name used for the create covariate function

    modelTable

    The temporary table name storing the model details

    analysisId

    The covariate analysis_id

    e

    The environment to output the covariate setting functions to

    databaseOutput

If you want the output to go into a cohort table, add the "database.schema.tablename" here

    - -

    Details

    - -

This function is used to create custom covariates for a logistic regression model (currently only supports demographics/conditions/drug/procedures/observations and measurement concepts)

    - -
    - -
    - - -
    - - -
    -

    - - - - - - - - diff --git a/docs/reference/createModelDesign.html b/docs/reference/createModelDesign.html deleted file mode 100644 index 1277b3cb9..000000000 --- a/docs/reference/createModelDesign.html +++ /dev/null @@ -1,278 +0,0 @@ - - - - - - - - -Specify settings for deceloping a single model — createModelDesign • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

Specify settings for developing a single model

    -
    - -
    createModelDesign(
    -  targetId,
    -  outcomeId,
    -  restrictPlpDataSettings = createRestrictPlpDataSettings(),
    -  populationSettings = createStudyPopulationSettings(),
    -  covariateSettings = FeatureExtraction::createDefaultCovariateSettings(),
    -  featureEngineeringSettings = NULL,
    -  sampleSettings = NULL,
    -  preprocessSettings = NULL,
    -  modelSettings = NULL,
    -  runCovariateSummary = T
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    targetId

    The id of the target cohort that will be used for data extraction (e.g., the ATLAS id)

    outcomeId

    The id of the outcome that will be used for data extraction (e.g., the ATLAS id)

    restrictPlpDataSettings

    The settings specifying the extra restriction settings when extracting the data created using createRestrictPlpDataSettings().

    populationSettings

    The population settings specified by createStudyPopulationSettings()

    covariateSettings

    The covariate settings, this can be a list or a single 'covariateSetting' object.

    featureEngineeringSettings

    Either NULL or an object of class featureEngineeringSettings specifying any feature engineering used during model development

    sampleSettings

    Either NULL or an object of class sampleSettings with the over/under sampling settings used for model development

    preprocessSettings

Either NULL or an object of class preprocessSettings created using createPreprocessSettings()

    modelSettings

    The model settings such as setLassoLogisticRegression()

    runCovariateSummary

    Whether to run the covariateSummary

    - -

    Value

    - -

    A list with analysis settings used to develop a single prediction model

    -

    Details

    - -

This specifies a single analysis for developing a single model

    - -
    - -
    - - -
    - - -
    -

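A sketch combining the settings constructors documented in this reference set into a single model design; the cohort ids are placeholders and the model choice is illustrative:

modelDesign <- PatientLevelPrediction::createModelDesign(
  targetId = 1,   # placeholder target cohort (e.g., ATLAS) id
  outcomeId = 2,  # placeholder outcome cohort id
  restrictPlpDataSettings = PatientLevelPrediction::createRestrictPlpDataSettings(),
  populationSettings = PatientLevelPrediction::createStudyPopulationSettings(),
  covariateSettings = FeatureExtraction::createDefaultCovariateSettings(),
  modelSettings = PatientLevelPrediction::setLassoLogisticRegression(),
  runCovariateSummary = TRUE
)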
    - - - - - - - - diff --git a/docs/reference/createPlpJournalDocument.html b/docs/reference/createPlpJournalDocument.html deleted file mode 100644 index 7639d076c..000000000 --- a/docs/reference/createPlpJournalDocument.html +++ /dev/null @@ -1,287 +0,0 @@ - - - - - - - - -createPlpJournalDocument — createPlpJournalDocument • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

    Creates a template for a prediction journal paper with the characteristics/results filled in

    -
    - -
    createPlpJournalDocument(
    -  plpResult = NULL,
    -  plpValidation = NULL,
    -  plpData = NULL,
    -  targetName = "<target population>",
    -  outcomeName = "<outcome>",
    -  table1 = F,
    -  connectionDetails = NULL,
    -  includeTrain = FALSE,
    -  includeTest = TRUE,
    -  includePredictionPicture = TRUE,
    -  includeAttritionPlot = TRUE,
    -  outputLocation = file.path(getwd(), "plp_journal_document.docx"),
    -  save = T
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    plpResult

    An object of type plpResult returned by running runPlp()

    plpValidation

    An object of type validatePlp returned by running externalValidatePlp()

    plpData

    The plpData

    targetName

    A string with the target description name

    outcomeName

    A string with the outcome description name

    table1

    Whether to include table1 (characteristics)

    connectionDetails

The connection required to calculate the characteristics

    includeTrain

    Whether to include the train set performance

    includeTest

    Whether to include the test set performance

    includePredictionPicture

    Whether to include a picture detailing the prediction problem

    includeAttritionPlot

Whether to include the attrition plot

    outputLocation

    The location to write the document to

    save

If false this function returns the document and does not save to outputLocation

    - -

    Value

    - -

A Word document containing the selected outputs within the user's directory at the location specified in outputLocation

    -

    Details

    - -

    The function creates a word document containing the analysis details, data summary and prediction model results.

    - -
    - -
    - - -
    - - -
    -

    - - - - - - - - diff --git a/docs/reference/createPlpModelSettings.html b/docs/reference/createPlpModelSettings.html deleted file mode 100644 index 82d054b9e..000000000 --- a/docs/reference/createPlpModelSettings.html +++ /dev/null @@ -1,236 +0,0 @@ - - - - - - - - -create a an object specifying the multiple Plp model settings — createPlpModelSettings • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

Create an object specifying the multiple Plp model settings

    -
    - -
    createPlpModelSettings(modelList, covariateSettingList, populationSettingList)
    - -

    Arguments

    - - - - - - - - - - - - - - -
    modelList

    A list of model settings

    covariateSettingList

    A list of covariate settings

    populationSettingList

    A list of population settings

    - -

    Value

    - -

A list containing a dataframe settingLookupTable with all the model, covariate and population combination details, a list models containing all the model settings, a list covariateSettings containing all the covariate settings and a list populationSettings containing all the population settings.

    -

    Details

    - -

    Takes a list of models, covariates, population and returns the cartesian product combining all -settings.

    - -
    - -
    - - -
    - - -
    -

    - - - - - - - - diff --git a/docs/reference/createPlpReport.html b/docs/reference/createPlpReport.html deleted file mode 100644 index 6a956a89e..000000000 --- a/docs/reference/createPlpReport.html +++ /dev/null @@ -1,267 +0,0 @@ - - - - - - - - -createPlpReport — createPlpReport • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

    Creates a word document report of the prediction

    -
    - -
    createPlpReport(
    -  plpResult = NULL,
    -  plpValidation = NULL,
    -  plpData = NULL,
    -  targetName = "<target population>",
    -  outcomeName = "<outcome>",
    -  targetDefinition = NULL,
    -  outcomeDefinition = NULL,
    -  outputLocation = file.path(getwd(), "plp_report.docx"),
    -  save = T
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    plpResult

    An object of type plpResult returned by running runPlp()

    plpValidation

    An object of type validatePlp returned by running externalValidatePlp()

    plpData

    The plpData

    targetName

    A string with the target description name

    outcomeName

    A string with the outcome description name

    targetDefinition

    The cohort details

    outcomeDefinition

    The cohort details

    outputLocation

    The location to write the document to

    save

If false the output of the function is the document rather than creating the document in outputLocation

    - -

    Value

    - -

A Word document containing the selected outputs within the user's directory at the location specified in outputLocation

    -

    Details

    - -

    The function creates a word document containing the analysis details, data summary and prediction model results.

    - -
    - -
    - - -
    - - -
    -

    - - - - - - - - diff --git a/docs/reference/createPlpResultTables.html b/docs/reference/createPlpResultTables.html deleted file mode 100644 index 742a4f777..000000000 --- a/docs/reference/createPlpResultTables.html +++ /dev/null @@ -1,270 +0,0 @@ - - - - - - - - -Create the results tables to store PatientLevelPrediction models and results into a database — createPlpResultTables • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

    This function executes a large set of SQL statements to create tables that can store models and results

    -
    - -
    createPlpResultTables(
    -  conn,
    -  resultSchema,
    -  targetDialect = "postgresql",
    -  deleteExistingTables = T,
    -  createTables = T,
    -  stringAppendToTables = "",
    -  tempEmulationSchema = getOption("sqlRenderTempEmulationSchema"),
    -  testFile = NULL
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    conn

    A connection to a database created by using the -function connect in the -DatabaseConnector package.

    resultSchema

The name of the database schema in which the result tables will be created.

    targetDialect

    The database management system being used

    deleteExistingTables

    If true any existing tables matching the PatientLevelPrediction result tables names will be deleted

    createTables

    If true the PatientLevelPrediction result tables will be created

    stringAppendToTables

    A string that appends to the PatientLevelPrediction result tables

    tempEmulationSchema

    The temp schema used when the database management system is oracle

    testFile

    (used for testing) The location of an sql file with the table creation code

    - -

    Value

    - -

    Returns NULL but creates the required tables into the specified database schema.

    -

    Details

    - -

    This function can be used to create (or delete) PatientLevelPrediction result tables

    - -
    - -
    - - -
    - - -
    -

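A hedged sketch of creating the result tables in a PostgreSQL schema; the connection details are placeholders and the DatabaseConnector createConnectionDetails()/connect() pattern is an assumption here rather than something taken from this page:

connectionDetails <- DatabaseConnector::createConnectionDetails(
  dbms = "postgresql",
  server = "myserver/mydb",  # placeholder server
  user = "user",
  password = "secret"
)
conn <- DatabaseConnector::connect(connectionDetails)
PatientLevelPrediction::createPlpResultTables(
  conn = conn,
  resultSchema = "plp_results",  # placeholder schema
  targetDialect = "postgresql",
  deleteExistingTables = FALSE,  # keep any existing result tables
  createTables = TRUE
)
DatabaseConnector::disconnect(conn)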
    - - - - - - - - diff --git a/docs/reference/createPreprocessSettings.html b/docs/reference/createPreprocessSettings.html deleted file mode 100644 index 0beec10c9..000000000 --- a/docs/reference/createPreprocessSettings.html +++ /dev/null @@ -1,243 +0,0 @@ - - - - - - - - -Create the settings for preprocessing the trainData using . — createPreprocessSettings • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

Create the settings for preprocessing the trainData.

    -
    - -
    createPreprocessSettings(
    -  minFraction = 0.001,
    -  normalize = TRUE,
    -  removeRedundancy = TRUE
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - -
    minFraction

    The minimum fraction of target population who must have a covariate for it to be included in the model training

    normalize

    Whether to normalise the covariates before training (Default: TRUE)

    removeRedundancy

    Whether to remove redundant features (Default: TRUE)

    - -

    Value

    - -

    An object of class preprocessingSettings

    -

    Details

    - -

    Returns an object of class preprocessingSettings that specifies how to preprocess the training data

    - -
    - -
    - - -
    - - -
    -

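A minimal sketch spelling out the defaults documented above:

preprocessSettings <- PatientLevelPrediction::createPreprocessSettings(
  minFraction = 0.001,     # drop covariates observed in fewer than 0.1% of the target population
  normalize = TRUE,        # normalise covariates before training
  removeRedundancy = TRUE  # remove redundant features
)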
    - - - - - - - - diff --git a/docs/reference/createRandomForestFeatureSelection.html b/docs/reference/createRandomForestFeatureSelection.html deleted file mode 100644 index 7cee442c3..000000000 --- a/docs/reference/createRandomForestFeatureSelection.html +++ /dev/null @@ -1,235 +0,0 @@ - - - - - - - - -Create the settings for random foreat based feature selection — createRandomForestFeatureSelection • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

Create the settings for random forest based feature selection

    -
    - -
    createRandomForestFeatureSelection(ntrees = 2000, maxDepth = 17)
    - -

    Arguments

    - - - - - - - - - - -
    ntrees

Number of trees in the forest

    maxDepth

Max depth of each tree

    - -

    Value

    - -

    An object of class featureEngineeringSettings

    -

    Details

    - -

Returns an object of class featureEngineeringSettings that specifies the feature selection function that will be called and its settings

    - -
    - -
    - - -
    - - -
    -

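A minimal sketch using the documented defaults; whether these values suit a given problem is left to the user:

rfFeatureSelection <- PatientLevelPrediction::createRandomForestFeatureSelection(
  ntrees = 2000,  # number of trees in the forest
  maxDepth = 17   # maximum depth of each tree
)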
    - - - - - - - - diff --git a/docs/reference/createRestrictPlpDataSettings.html b/docs/reference/createRestrictPlpDataSettings.html deleted file mode 100644 index 2e1ab7462..000000000 --- a/docs/reference/createRestrictPlpDataSettings.html +++ /dev/null @@ -1,262 +0,0 @@ - - - - - - - - -createRestrictPlpDataSettings define extra restriction settings when calling getPlpData — createRestrictPlpDataSettings • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

    This function creates the settings used to restrict the target cohort when calling getPlpData

    -
    - -
    createRestrictPlpDataSettings(
    -  studyStartDate = "",
    -  studyEndDate = "",
    -  firstExposureOnly = F,
    -  washoutPeriod = 0,
    -  sampleSize = NULL
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - -
    studyStartDate

    A calendar date specifying the minimum date that a cohort index -date can appear. Date format is 'yyyymmdd'.

    studyEndDate

A calendar date specifying the maximum date that a cohort index date can appear. Date format is 'yyyymmdd'. Important: the study end date is also used to truncate risk windows, meaning no outcomes beyond the study end date will be considered.

    firstExposureOnly

    Should only the first exposure per subject be included? Note that -this is typically done in the createStudyPopulation function, -but can already be done here for efficiency reasons.

    washoutPeriod

The minimum required continuous observation time prior to the index date for a person to be included in the at risk cohort. Note that this is typically done in the createStudyPopulation function, but can already be done here for efficiency reasons.

    sampleSize

    If not NULL, the number of people to sample from the target cohort

    - -

    Value

    - -

    A setting object of class restrictPlpDataSettings containing a list getPlpData extra settings

    -

    Details

    - -

    Users need to specify the extra restrictions to apply when downloading the target cohort

    - -
    - -
    - - -
    - - -
    -

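A minimal sketch based on the arguments documented above; the dates and washout period are illustrative:

restrictPlpDataSettings <- PatientLevelPrediction::createRestrictPlpDataSettings(
  studyStartDate = "20150101",  # 'yyyymmdd'
  studyEndDate = "20201231",    # 'yyyymmdd'
  firstExposureOnly = TRUE,
  washoutPeriod = 365,
  sampleSize = NULL             # no sampling of the target cohort
)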
    - - - - - - - - diff --git a/docs/reference/createSampleSettings.html b/docs/reference/createSampleSettings.html deleted file mode 100644 index 049018f9a..000000000 --- a/docs/reference/createSampleSettings.html +++ /dev/null @@ -1,252 +0,0 @@ - - - - - - - - -Create the settings for defining how the trainData from splitData are sampled using -default sample functions. — createSampleSettings • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

    Create the settings for defining how the trainData from splitData are sampled using -default sample functions.

    -
    - -
    createSampleSettings(
    -  type = "none",
    -  numberOutcomestoNonOutcomes = 1,
    -  sampleSeed = sample(10000, 1)
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - -
    type

    (character) Choice of:

• 'none' No sampling is applied - this is the default
• 'underSample' Undersample the non-outcome class to make the data more balanced
• 'overSample' Oversample the outcome class by adding in each outcome multiple times
    numberOutcomestoNonOutcomes

(numeric) A numeric specifying the required number of non-outcomes per outcome

    sampleSeed

    (numeric) A seed to use when splitting the data for reproducibility (if not set a random number will be generated)

    - -

    Value

    - -

    An object of class sampleSettings

    -

    Details

    - -

    Returns an object of class sampleSettings that specifies the sampling function that will be called and the settings

    - -
    - -
    - - -
    - - -
    -

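A minimal sketch based on the arguments documented above; the 1:1 ratio and fixed seed are illustrative:

sampleSettings <- PatientLevelPrediction::createSampleSettings(
  type = "underSample",             # undersample the non-outcome class
  numberOutcomestoNonOutcomes = 1,  # aim for one non-outcome per outcome
  sampleSeed = 42                   # fixed seed for reproducibility
)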
    - - - - - - - - diff --git a/docs/reference/createStudyPopulation.html b/docs/reference/createStudyPopulation.html deleted file mode 100644 index fa9c31d7f..000000000 --- a/docs/reference/createStudyPopulation.html +++ /dev/null @@ -1,260 +0,0 @@ - - - - - - - - -Create a study population — createStudyPopulation • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

    Create a study population

    -
    - -
    createStudyPopulation(
    -  plpData,
    -  outcomeId,
    -  populationSettings,
    -  population = NULL
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - -
    plpData

An object of type plpData as generated using getPlpData.

    outcomeId

    The ID of the outcome.

    populationSettings

    An object of class populationSettings created using createPopulationSettings

    population

    If specified, this population will be used as the starting point instead of the -cohorts in the plpData object.

    - -

    Value

    - -

    A data frame specifying the study population. This data frame will have the following columns:

    -
    rowId

    A unique identifier for an exposure

    -
    subjectId

    The person ID of the subject

    -
    cohortStartdate

    The index date

    -
    outcomeCount

    The number of outcomes observed during the risk window

    -
    timeAtRisk

    The number of days in the risk window

    -
    survivalTime

    The number of days until either the outcome or the end of the risk window

    - -
    - -

    Details

    - -

    Create a study population by enforcing certain inclusion and exclusion criteria, defining -a risk window, and determining which outcomes fall inside the risk window.

    - -
    - -
    - - -
    - - -
    -

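A minimal sketch of the documented call; plpData is assumed to already exist (e.g., from getPlpData) and the outcome id is a placeholder:

populationSettings <- PatientLevelPrediction::createStudyPopulationSettings(
  riskWindowStart = 1,
  riskWindowEnd = 365
)
population <- PatientLevelPrediction::createStudyPopulation(
  plpData = plpData,  # assumed to exist already
  outcomeId = 2,      # placeholder outcome cohort id
  populationSettings = populationSettings
)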
    - - - - - - - - diff --git a/docs/reference/createStudyPopulationSettings.html b/docs/reference/createStudyPopulationSettings.html deleted file mode 100644 index d6e0ed341..000000000 --- a/docs/reference/createStudyPopulationSettings.html +++ /dev/null @@ -1,299 +0,0 @@ - - - - - - - - -create the study population settings — createStudyPopulationSettings • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

    create the study population settings

    -
    - -
    createStudyPopulationSettings(
    -  binary = T,
    -  includeAllOutcomes = T,
    -  firstExposureOnly = FALSE,
    -  washoutPeriod = 0,
    -  removeSubjectsWithPriorOutcome = TRUE,
    -  priorOutcomeLookback = 99999,
    -  requireTimeAtRisk = T,
    -  minTimeAtRisk = 364,
    -  riskWindowStart = 1,
    -  startAnchor = "cohort start",
    -  riskWindowEnd = 365,
    -  endAnchor = "cohort start",
    -  restrictTarToCohortEnd = F
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    binary

    Forces the outcomeCount to be 0 or 1 (use for binary prediction problems)

    includeAllOutcomes

    (binary) indicating whether to include people with outcomes who are not observed for the whole at risk period

    firstExposureOnly

    Should only the first exposure per subject be included? Note that -this is typically done in the createStudyPopulation function,

    washoutPeriod

The minimum required continuous observation time prior to the index date for a person to be included in the cohort.

    removeSubjectsWithPriorOutcome

    Remove subjects that have the outcome prior to the risk window start?

    priorOutcomeLookback

    How many days should we look back when identifying prior outcomes?

    requireTimeAtRisk

    Should subject without time at risk be removed?

    minTimeAtRisk

    The minimum number of days at risk required to be included

    riskWindowStart

    The start of the risk window (in days) relative to the index date (+ -days of exposure if the addExposureDaysToStart parameter is -specified).

    startAnchor

    The anchor point for the start of the risk window. Can be "cohort start" or "cohort end".

    riskWindowEnd

The end of the risk window (in days) relative to the index date (+ days of exposure if the addExposureDaysToEnd parameter is specified).

    endAnchor

    The anchor point for the end of the risk window. Can be "cohort start" or "cohort end".

    restrictTarToCohortEnd

    If using a survival model and you want the time-at-risk to end at the cohort end date set this to T

    - -

    Value

    - -

    A list containing all the settings required for creating the study population

    -

    Details

    - -

Takes as input the settings required to create the study population

    - -
    - -
    - - -
    - - -
    -

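A minimal sketch based on the arguments documented above, configuring a one-year time-at-risk starting the day after index; the washout period shown is illustrative rather than the documented default:

populationSettings <- PatientLevelPrediction::createStudyPopulationSettings(
  binary = TRUE,
  firstExposureOnly = FALSE,
  washoutPeriod = 365,
  removeSubjectsWithPriorOutcome = TRUE,
  requireTimeAtRisk = TRUE,
  minTimeAtRisk = 364,
  riskWindowStart = 1,
  startAnchor = "cohort start",
  riskWindowEnd = 365,
  endAnchor = "cohort start"
)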
    - - - - - - - - diff --git a/docs/reference/createUnivariateFeatureSelection.html b/docs/reference/createUnivariateFeatureSelection.html deleted file mode 100644 index 8c7eab965..000000000 --- a/docs/reference/createUnivariateFeatureSelection.html +++ /dev/null @@ -1,231 +0,0 @@ - - - - - - - - -Create the settings for defining any feature selection that will be done — createUnivariateFeatureSelection • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

    Create the settings for defining any feature selection that will be done

    -
    - -
    createUnivariateFeatureSelection(k = 100)
    - -

    Arguments

    - - - - - - -
    k

This function returns the K features most associated (univariately) with the outcome

    - -

    Value

    - -

    An object of class featureEngineeringSettings

    -

    Details

    - -

Returns an object of class featureEngineeringSettings that specifies the feature selection function that will be called and its settings

    - -
    - -
    - - -
    - - -
    -

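A one-line sketch of the documented call, keeping the 100 features most univariately associated with the outcome:

featureSelection <- PatientLevelPrediction::createUnivariateFeatureSelection(k = 100)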
    - - - - - - - - diff --git a/docs/reference/createValidationSettings.html b/docs/reference/createValidationSettings.html deleted file mode 100644 index cd816a609..000000000 --- a/docs/reference/createValidationSettings.html +++ /dev/null @@ -1,235 +0,0 @@ - - - - - - - - -createValidationSettings define optional settings for performing external validation — createValidationSettings • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

    This function creates the settings required by externalValidatePlp

    -
    - -
    createValidationSettings(recalibrate = NULL, runCovariateSummary = T)
    - -

    Arguments

    - - - - - - - - - - -
    recalibrate

    A vector of characters specifying the recalibration method to apply

    runCovariateSummary

    Whether to run the covariate summary for the validation data

    - -

    Value

    - -

    A setting object of class validationSettings containing a list of settings for externalValidatePlp

    -

    Details

    - -

Users need to specify whether they want to sample or recalibrate when performing external validation

    - -
    - -
    - - -
    - - -
    -

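A minimal sketch; 'weakRecalibration' is the recalibration option used as the default elsewhere in these pages (see externalValidateDbPlp), and turning off the covariate summary is illustrative:

validationSettings <- PatientLevelPrediction::createValidationSettings(
  recalibrate = "weakRecalibration",
  runCovariateSummary = FALSE
)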
    - - - - - - - - diff --git a/docs/reference/diagnostic.html b/docs/reference/diagnostic.html deleted file mode 100644 index b66905ad0..000000000 --- a/docs/reference/diagnostic.html +++ /dev/null @@ -1,287 +0,0 @@ - - - - - - - - -diagnostic - Investigates the prediction problem settings - use before training a model — diagnostic • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

    This function runs a set of prediction diagnoses to help pick a suitable T, O, TAR and determine -whether the prediction problem is worth executing.

    -
    - -
    diagnostic(
    -  plpData = NULL,
    -  cdmDatabaseName = "none",
    -  cohortName,
    -  outcomeNames,
    -  databaseDetails,
    -  restrictPlpDataSettings,
    -  populationSettings,
    -  outputFolder = NULL,
    -  minCellCount = 5
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    plpData

    The data object to do the diagnostic on - if NULL you need to specify the connection settings below

    cdmDatabaseName

    The name of the database being diagnosed

    cohortName

    Name of the target cohort

    outcomeNames

    Vector of outcome names

    databaseDetails

(only used if plpData is NULL) The database details created using createDatabaseDetails

    restrictPlpDataSettings

(only used if plpData is NULL) The restrictPlpDataSettings created using createRestrictPlpDataSettings

    populationSettings

    The population setting details created using createPopulationSettings

    outputFolder

    Location to save results for shiny app

    minCellCount

    The minimum count that will be displayed

    - -

    Value

    - -

An object containing the model or location where the model is saved, the data selection settings, the preprocessing and training settings as well as various performance measures obtained by the model.

    -
    distribution

    list for each O of a data.frame containing: i) Time to observation end distribution, ii) Time from observation start distribution, iii) Time to event distribution and iv) Time from last prior event to index distribution (only for patients in T who have O before index)

    -
    incident

    list for each O of incidence of O in T during TAR

    -
    characterization

    list for each O of Characterization of T, TnO, Tn~O

    - -

    Details

    - -

Users can define a set of Ts, Os, databases and population settings. A list of data.frames is returned containing details such as follow-up time distribution, time-to-event information, characterization details, time from last prior event, and observation time distribution.

    - -

    Examples

    -
    if (FALSE) { -#******** EXAMPLE 1 ********* -} -
    -
    - -
    - - -
    - - -
    -

    - - - - - - - - diff --git a/docs/reference/diagnosticOddsRatio.html b/docs/reference/diagnosticOddsRatio.html deleted file mode 100644 index d6c34055f..000000000 --- a/docs/reference/diagnosticOddsRatio.html +++ /dev/null @@ -1,243 +0,0 @@ - - - - - - - - -Calculate the diagnostic odds ratio — diagnosticOddsRatio • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

    Calculate the diagnostic odds ratio

    -
    - -
    diagnosticOddsRatio(TP, TN, FN, FP)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - -
    TP

    Number of true positives

    TN

    Number of true negatives

    FN

    Number of false negatives

    FP

    Number of false positives

    - -

    Value

    - -

    diagnosticOddsRatio value

    -

    Details

    - -

    Calculate the diagnostic odds ratio

    - -
    - -
    - - -
    - - -
    -

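A small worked example; the standard definition of the diagnostic odds ratio is (TP/FP)/(FN/TN) = (TP*TN)/(FP*FN), and that the function computes exactly this quantity is an assumption based on its name rather than on this page:

TP <- 80; TN <- 160; FN <- 40; FP <- 20
(TP * TN) / (FP * FN)  # 12800 / 800 = 16, computed by hand
PatientLevelPrediction::diagnosticOddsRatio(TP = TP, TN = TN, FN = FN, FP = FP)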
    - - - - - - - - diff --git a/docs/reference/drawAttritionDiagramPlp.html b/docs/reference/drawAttritionDiagramPlp.html deleted file mode 100644 index 13bdca5d4..000000000 --- a/docs/reference/drawAttritionDiagramPlp.html +++ /dev/null @@ -1,244 +0,0 @@ - - - - - - - - -Draw the attrition diagram — drawAttritionDiagramPlp • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

drawAttritionDiagramPlp draws the attrition diagram, showing how many people were excluded from the study population, and for what reasons.

    -
    - -
    drawAttritionDiagramPlp(
    -  attrition,
    -  targetLabel = "Target Population",
    -  outcomeLabel = "Outcome Count",
    -  fileName = NULL
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - -
    attrition

The table of attrition details returned from the population: attr(population, 'metaData')$attrition

    targetLabel

A label to use for the treated cohort.

    outcomeLabel

A label to use for the comparator cohort.

    fileName

    Name of the file where the plot should be saved, for example 'plot.png'. -See the function ggsave in the ggplot2 package for supported file -formats.

    - -

    Value

    - -

    A ggplot object. Use the ggsave function to save to file in a different -format.

    - -
    - -
    - - -
    - - -
    -

    - - - - - - - - diff --git a/docs/reference/evaluateExistingModel.html b/docs/reference/evaluateExistingModel.html deleted file mode 100644 index 256889239..000000000 --- a/docs/reference/evaluateExistingModel.html +++ /dev/null @@ -1,347 +0,0 @@ - - - - - - - - -evaluateExistingModel — evaluateExistingModel • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - -
    - -
    -
    - - -
    - -

    This function implements an existing model

    - -
    - -
    evaluateExistingModel(modelTable, covariateTable, interceptTable = NULL,
    -  type = "score", covariateSettings, customCovariates = NULL,
    -  addExposureDaysToStart = F, riskWindowStart = 1,
    -  addExposureDaysToEnd = F, riskWindowEnd = 365, requireTimeAtRisk = T,
    -  minTimeAtRisk = 364, includeAllOutcomes = T,
    -  removeSubjectsWithPriorOutcome = T, priorOutcomeLookback = 99999,
    -  verbosity = "INFO", washoutPeriod = 0, firstExposureOnly = F,
    -  binary = T, connectionDetails, cdmDatabaseSchema, cohortDatabaseSchema,
    -  cohortTable, cohortId, outcomeDatabaseSchema, outcomeTable, outcomeId,
    -  oracleTempSchema = cdmDatabaseSchema, modelName = "existingModel",
    -  calibrationPopulation = NULL, covariateSummary = T, cdmVersion = 5)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    modelTable

    The model covariates and scores

    covariateTable

    The mapping from model covariates to standard covariates

    interceptTable

    The model intercepts

    type

    Model type (score or logistic)

    covariateSettings

    The standard covariate settings (specify covariate lookback time)

    customCovariates

    A table of covariateId, sql (sql creates the custom covariate)

    addExposureDaysToStart

    riskWindowStart relative to the cohort end date instead of the cohort start date?

    riskWindowStart

    The day after index to start predicting the outcome

    addExposureDaysToEnd

    riskWindowEnd relative to the cohort end date instead of the cohort start date?

    riskWindowEnd

    The day after index to stop predicting the outcome

    requireTimeAtRisk

    Do you want to ignore people who leave the database some point between the riskWindowStart and riskWindowEnd

    minTimeAtRisk

    If requireTimeAtRisk is TRUE, how many days must they be observed before leaving to get included (default recommendation is all risk period: riskWindowEnd-riskWindowStart)

    includeAllOutcomes

    Setting this to TRUE means people with the outcome who leave the data during the risk period are still included, so only non-outcome people who leave during the risk period are removed

    removeSubjectsWithPriorOutcome

    Remove people from the target population if they have the outcome prior to target cohort start date

    priorOutcomeLookback

    Lookback for removeSubjectsWithPriorOutcome

    verbosity

    The study population creation verbosity

    washoutPeriod

    Remove patients from the population with less than washoutPeriod of days prior observation

    firstExposureOnly

    If patients are in the target population multiple times, use only the first date

    binary

Binary classification (T or F)

    connectionDetails

    The details to connect to the CDM

    cdmDatabaseSchema

    A string specifying the database containing the cdm

    cohortDatabaseSchema

    A string specifying the database containing the target cohorts

    cohortTable

    A string specifying the table containing the target cohorts

    cohortId

An integer specifying the cohort id for the target cohorts

    outcomeDatabaseSchema

    A string specifying the database containing the outcome cohorts

    outcomeTable

    A string specifying the table containing the outcome cohorts

    outcomeId

An integer specifying the cohort id for the outcome cohorts

    oracleTempSchema

    The temp oracle schema

    modelName

    The name of the model

    calibrationPopulation

    A data.frame of subjectId, cohortStartDate, indexes used to recalibrate the model on new data

    covariateSummary

    Whether to calculate the covariateSummary

    cdmVersion

    The CDM version being used

    - -

    Value

    - -

    The performance of the existing model and prediction

    - -

    Details

    - -

    Implements an existing model and evaluates its performance

    - - -
    - -
    - -
    - - -
    -

    - - - - - - diff --git a/docs/reference/evaluateMultiplePlp.html b/docs/reference/evaluateMultiplePlp.html deleted file mode 100644 index 63876ebea..000000000 --- a/docs/reference/evaluateMultiplePlp.html +++ /dev/null @@ -1,309 +0,0 @@ - - - - - - - - -externally validate the multiple plp models across new datasets — evaluateMultiplePlp • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

    This function loads all the models in a multiple plp analysis folder and -validates the models on new data

    -
    - -
    evaluateMultiplePlp(
    -  analysesLocation,
    -  outputLocation,
    -  connectionDetails,
    -  validationSchemaTarget,
    -  validationSchemaOutcome,
    -  validationSchemaCdm,
    -  databaseNames,
    -  validationTableTarget,
    -  validationTableOutcome,
    -  validationIdTarget = NULL,
    -  validationIdOutcome = NULL,
    -  oracleTempSchema = NULL,
    -  verbosity = "INFO",
    -  keepPrediction = F,
    -  recalibrate = NULL,
    -  sampleSize = NULL
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    analysesLocation

    The location where the multiple plp analyses are

    outputLocation

    The location to save to validation results

    connectionDetails

    The connection details for extracting the new data

    validationSchemaTarget

    A string or list of strings specifying the database containing the target cohorts

    validationSchemaOutcome

    A string or list of strings specifying the database containing the outcome cohorts

    validationSchemaCdm

    A string or list of strings specifying the database containing the cdm

    databaseNames

A string or list of strings specifying sharing-friendly database names corresponding to validationSchemaCdm

    validationTableTarget

    A string or list of strings specifying the table containing the target cohorts

    validationTableOutcome

    A string or list of strings specifying the table containing the outcome cohorts

    validationIdTarget

An integer or list of integers specifying the cohort id for the target cohorts

    validationIdOutcome

An integer or list of integers specifying the cohort id for the outcome cohorts

    oracleTempSchema

    The temp oracle schema requires read/write

    verbosity

    Sets the level of the verbosity. If the log level is at or higher in priority than the logger threshold, a message will print. The levels are:

• DEBUG - Highest verbosity showing all debug statements
• TRACE - Showing information about start and end of steps
• INFO - Show informative information (Default)
• WARN - Show warning messages
• ERROR - Show error messages
• FATAL - Be silent except for fatal errors
    keepPrediction

Whether to keep the predictions for the new data

    recalibrate

    A vector of recalibration methods (currently supports 'RecalibrationintheLarge' and/or 'weakRecalibration')

    sampleSize

    If not NULL, the number of people to sample from the target cohort

    - -

    Details

    - -

    Users need to input a location where the results of the multiple plp analyses -are found and the connection and database settings for the new data

    - -
    - -
    - - -
    - - -
    -

    - - - - - - - - diff --git a/docs/reference/evaluatePlp.html b/docs/reference/evaluatePlp.html deleted file mode 100644 index 1060d323c..000000000 --- a/docs/reference/evaluatePlp.html +++ /dev/null @@ -1,236 +0,0 @@ - - - - - - - - -evaluatePlp — evaluatePlp • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

    Evaluates the performance of the patient level prediction model

    -
    - -
    evaluatePlp(prediction, typeColumn = "evaluationType")
    - -

    Arguments

    - - - - - - - - - - -
    prediction

    The patient level prediction model's prediction

    typeColumn

    The column name in the prediction object that is used to -stratify the evaluation

    - -

    Value

    - -

    A list containing the performance values

    -

    Details

    - -

    The function calculates various metrics to measure the performance of the model

    - -
    - -
    - - -
    - - -
    -

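A minimal sketch; the prediction data.frame is assumed to come from a fitted plp model and to contain the column named by typeColumn (default 'evaluationType') marking Train/Test/CV rows:

# 'prediction' is assumed to be the prediction data.frame from a fitted plp model
performance <- PatientLevelPrediction::evaluatePlp(
  prediction = prediction,
  typeColumn = "evaluationType"
)
names(performance)  # inspect the returned performance components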
    - - - - - - - - diff --git a/docs/reference/exportPlpDataToCsv.html b/docs/reference/exportPlpDataToCsv.html deleted file mode 100644 index 32eabfb24..000000000 --- a/docs/reference/exportPlpDataToCsv.html +++ /dev/null @@ -1,231 +0,0 @@ - - - - - - - - -Export all data in a plpData object to CSV files — exportPlpDataToCsv • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - -
    - -
    -
    - - -
    - -

    Export all data in a plpData object to CSV files

    - -
    - -
    exportPlpDataToCsv(plpData, outputFolder)
    - -

    Arguments

    - - - - - - - - - - -
    plpData

    An object of type plpData.

    outputFolder

    The folder on the file system where the CSV files will be created. If the -folder does not yet exist it will be created.

    - -

    Details

    - -

Creates a set of CSV files in the output folder with all the data in the plpData object. This function is intended to be used for research into prediction methods. The following files will be created:

    -
    cohort.csv

    Listing all persons and their prediction periods. This file -will have these fields: row_id (a unique ID per period), person_id, cohort_start_date, cohort_id, -time (number of days in the window).

    outcomes.csv

    Listing all outcomes per period. This -file will have these fields: row_id, outcome_id, outcome_count, time_to_event.

    -
    exclude.csv

    Either not exported or a file listing per outcome ID which windows had the -outcome prior to the window and should therefore be removed prior to fitting the model. This object -will have these fields: rowId, outcomeId.

    covariates.csv

    Listing the baseline covariates -per person in the cohorts. This is done using a sparse representation: covariates with a value of 0 -are omitted to save space. The covariates file will have three columns: rowId, covariateId, and -covariateValue.

    covariateRef.csv

    A file describing the covariates that have been -extracted.

    metaData

    Some information on how the plpData object was constructed.

    - - -

    Examples

    -
    # NOT RUN {
    -exportPlpDataToCsv(plpData, "s:/temp/exportTest")
    -# }
    -
    - -
    - -
    - - -
    -

    - - - - - - diff --git a/docs/reference/externalValidateDbPlp.html b/docs/reference/externalValidateDbPlp.html deleted file mode 100644 index 8c8dd4311..000000000 --- a/docs/reference/externalValidateDbPlp.html +++ /dev/null @@ -1,260 +0,0 @@ - - - - - - - - -externalValidateDbPlp - Validate a model on new databases — externalValidateDbPlp • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

This function extracts data using a user specified connection and cdm_schema, applies the model and then calculates the performance

    -
    - -
    externalValidateDbPlp(
    -  plpModel,
    -  validationDatabaseDetails = createDatabaseDetails(),
    -  validationRestrictPlpDataSettings = createRestrictPlpDataSettings(),
    -  settings = createValidationSettings(recalibrate = "weakRecalibration"),
    -  logSettings = createLogSettings(verbosity = "INFO", logName = "validatePLP"),
    -  outputFolder = getwd()
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - -
    plpModel

    The model object returned by runPlp() containing the trained model

    validationDatabaseDetails

    A list of objects of class databaseDetails created using createDatabaseDetails

    validationRestrictPlpDataSettings

    A list of population restriction settings created by createRestrictPlpDataSettings()

    settings

    A settings object of class validationSettings created using createValidationSettings

    logSettings

    An object of logSettings created using createLogSettings -specifying how the logging is done

    outputFolder

    The directory to save the validation results to (subfolders are created per database in validationDatabaseDetails)

    - -

    Value

    - -

    A list containing the performance for each validation_schema

    -

    Details

    - -

    Users need to input a trained model (the output of runPlp()) and new database connections. The function will return a list of length equal to the -number of cdm_schemas input with the performance on the new data

    - -
    - -
    - - -
    - - -
    -

    Site built with pkgdown 1.6.1.

    -
    - -
    -
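[Usage sketch for the function documented in the deleted page above; plpResult is assumed to be the output of runPlp() with its trained model in $model, and databaseDetails is assumed to have been created with createDatabaseDetails():]
    extVal <- externalValidateDbPlp(
      plpModel = plpResult$model,
      validationDatabaseDetails = databaseDetails,
      validationRestrictPlpDataSettings = createRestrictPlpDataSettings(),
      settings = createValidationSettings(recalibrate = "weakRecalibration"),
      logSettings = createLogSettings(verbosity = "INFO", logName = "validatePLP"),
      outputFolder = "./validation"   # a subfolder is created per validation database
    )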
diff --git a/docs/reference/externalValidatePlp.html b/docs/reference/externalValidatePlp.html
deleted file mode 100644
index 51f1df281..000000000
--- a/docs/reference/externalValidatePlp.html
+++ /dev/null
@@ -1,310 +0,0 @@
[Deleted pkgdown reference page: "externalValidatePlp - Validate a model on new databases" (legacy interface).
 Description: extracts data using a user-specified connection and cdm_schema, applies the model and then calculates the performance.
 Usage: externalValidatePlp(plpResult, connectionDetails, validationSchemaTarget, validationSchemaOutcome, validationSchemaCdm, databaseNames, validationTableTarget = "cohort", validationTableOutcome = "cohort", validationIdTarget = NULL, validationIdOutcome = NULL, oracleTempSchema = NULL, verbosity = "INFO", keepPrediction = F, recalibrate = NULL, sampleSize = NULL, outputFolder)
 Arguments: plpResult — the object returned by runPlp() containing the trained model; connectionDetails — connection details for extracting the new data; validationSchemaTarget/validationSchemaOutcome/validationSchemaCdm — a string or vector of strings specifying the database containing the target cohorts, outcome cohorts and CDM respectively; databaseNames — sharing-friendly database names corresponding to validationSchemaCdm; validationTableTarget/validationTableOutcome — the table containing the target/outcome cohorts; validationIdTarget/validationIdOutcome — an integer specifying the cohort id for the target/outcome cohort; oracleTempSchema — the temp Oracle schema (requires read/write); verbosity — logging level (DEBUG: all debug statements; TRACE: start and end of steps; INFO: informative messages, the default; WARN: warnings; ERROR: errors; FATAL: silent except for fatal errors); keepPrediction — whether to keep the predictions for the new data; recalibrate — a vector of recalibration methods to apply; sampleSize — if not NULL, the number of people to sample from the target cohort; outputFolder — the directory to save the results to, if saving is wanted.
 Value: a list containing the performance for each validation schema.
 Details: users input a trained model (the output of runPlp()) and new database connections; the function returns a list of length equal to the number of CDM schemas input, with the performance on the new data.]
diff --git a/docs/reference/f1Score.html b/docs/reference/f1Score.html
deleted file mode 100644
index 2f25b115f..000000000
--- a/docs/reference/f1Score.html
+++ /dev/null
@@ -1,243 +0,0 @@
[Deleted pkgdown reference page: "Calculate the f1Score" (f1Score). Usage: f1Score(TP, TN, FN, FP). Arguments: TP/TN/FN/FP — number of true positives, true negatives, false negatives and false positives. Value: the f1Score value.]
diff --git a/docs/reference/falseDiscoveryRate.html b/docs/reference/falseDiscoveryRate.html
deleted file mode 100644
index 5b2a7df58..000000000
--- a/docs/reference/falseDiscoveryRate.html
+++ /dev/null
@@ -1,243 +0,0 @@
[Deleted pkgdown reference page: "Calculate the falseDiscoveryRate" (falseDiscoveryRate). Usage: falseDiscoveryRate(TP, TN, FN, FP). Arguments and structure identical to f1Score above. Value: the falseDiscoveryRate value.]
diff --git a/docs/reference/falseNegativeRate.html b/docs/reference/falseNegativeRate.html
deleted file mode 100644
index b58d5a5ec..000000000
--- a/docs/reference/falseNegativeRate.html
+++ /dev/null
@@ -1,243 +0,0 @@
[Deleted pkgdown reference page: "Calculate the falseNegativeRate" (falseNegativeRate). Usage: falseNegativeRate(TP, TN, FN, FP). Value: the falseNegativeRate value.]
diff --git a/docs/reference/falseOmissionRate.html b/docs/reference/falseOmissionRate.html
deleted file mode 100644
index 254724bd1..000000000
--- a/docs/reference/falseOmissionRate.html
+++ /dev/null
@@ -1,243 +0,0 @@
[Deleted pkgdown reference page: "Calculate the falseOmissionRate" (falseOmissionRate). Usage: falseOmissionRate(TP, TN, FN, FP). Value: the falseOmissionRate value.]
diff --git a/docs/reference/falsePositiveRate.html b/docs/reference/falsePositiveRate.html
deleted file mode 100644
index 8e25c27bc..000000000
--- a/docs/reference/falsePositiveRate.html
+++ /dev/null
@@ -1,243 +0,0 @@
[Deleted pkgdown reference page: "Calculate the falsePositiveRate" (falsePositiveRate). Usage: falsePositiveRate(TP, TN, FN, FP). Value: the falsePositiveRate value.]
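[Worked example for the five metric helpers documented in the deleted pages above, using toy confusion-matrix counts; the comments show the standard definitions these helpers are expected to implement, which is an assumption since the deleted pages only list the arguments:]
    TP <- 80; TN <- 900; FN <- 20; FP <- 100
    f1Score(TP, TN, FN, FP)            # 2*TP / (2*TP + FP + FN) = 160/280
    falseDiscoveryRate(TP, TN, FN, FP) # FP / (FP + TP) = 100/180
    falseNegativeRate(TP, TN, FN, FP)  # FN / (FN + TP) = 20/100
    falseOmissionRate(TP, TN, FN, FP)  # FN / (FN + TN) = 20/920
    falsePositiveRate(TP, TN, FN, FP)  # FP / (FP + TN) = 100/1000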
diff --git a/docs/reference/fitGLMModel.html b/docs/reference/fitGLMModel.html
deleted file mode 100644
index 69c0fd178..000000000
--- a/docs/reference/fitGLMModel.html
+++ /dev/null
@@ -1,258 +0,0 @@
[Deleted pkgdown reference page: "Fit a predictive model" (fitGLMModel).
 Usage: fitGLMModel(population, plpData, modelType = "logistic", excludeCovariateIds = c(), includeCovariateIds = c(), prior = Cyclops::createPrior("laplace", useCrossValidation = TRUE), control = Cyclops::createControl(cvType = "auto", fold = 3, startingVariance = 0.01, lowerLimit = 0.01, upperLimit = 20, tolerance = 2e-06, cvRepetitions = 1, selectorType = "byPid", noiseLevel = "silent", threads = -1, maxIterations = 3000))
 Arguments: population — a population object generated by createStudyPopulation, potentially filtered by other functions; plpData — an object of type plpData as generated using getPlpData; modelType — the type of outcome model ("logistic", "poisson" or "cox"); excludeCovariateIds / includeCovariateIds — covariates to exclude from, or restrict to, the outcome model; prior — the prior used to fit the model (see createPrior); control — the control object for the cross-validation used to determine the hyperparameters of the prior, if applicable (see createControl).]
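[Usage sketch for fitGLMModel as documented above; population is assumed to come from createStudyPopulation() and plpData from getPlpData():]
    lassoFit <- fitGLMModel(
      population = population,
      plpData = plpData,
      modelType = "logistic",
      # laplace prior with cross-validated variance, as in the documented default
      prior = Cyclops::createPrior("laplace", useCrossValidation = TRUE)
    )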
diff --git a/docs/reference/fitPlp.html b/docs/reference/fitPlp.html
deleted file mode 100644
index c8d40ca24..000000000
--- a/docs/reference/fitPlp.html
+++ /dev/null
@@ -1,259 +0,0 @@
[Deleted pkgdown reference page: "fitPlp".
 Description: train various models using a default parameter grid search or user-specified parameters.
 Usage: fitPlp(trainData, modelSettings, search = "grid", analysisId)
 Arguments: trainData — an object of type TrainData created using splitData on data extracted from the CDM; modelSettings — an object of class modelSettings created using one of the constructors listed on the page (logisticRegressionModel() lasso logistic regression, GBMclassifier() gradient boosting machine, RFclassifier() random forest, GLMclassifier() generalised linear model, KNNclassifier() KNN model); search — the hyper-parameter search strategy (currently not used); analysisId — the id of the analysis.
 Value: an object of class plpModel containing: model (the trained prediction model), modelLoc (the path where the model is saved, if saved), trainAuc (AUC on the training set), trainCalibration (calibration on the training set), modelSettings (a list specifying the model, preprocessing, outcomeId and cohortId), metaData (the model meta data) and trainingTime (the time taken to train the classifier).
 Details: the user can define the machine learning model to train (regularised logistic regression, random forest, gradient boosting machine, neural network, ...).]
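[Usage sketch for fitPlp; trainData is assumed to come from splitData(), and setLassoLogisticRegression() is taken from the package reference index further below rather than the older constructor names listed on this deleted page:]
    plpModel <- fitPlp(
      trainData = trainData,
      modelSettings = setLassoLogisticRegression(),  # lasso logistic regression settings
      analysisId = "Analysis_1"
    )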
diff --git a/docs/reference/getAttritionTable.html b/docs/reference/getAttritionTable.html
deleted file mode 100644
index 34c3b36c1..000000000
--- a/docs/reference/getAttritionTable.html
+++ /dev/null
@@ -1,217 +0,0 @@
[Deleted pkgdown reference page: "Get the attrition table for a population" (getAttritionTable).
 Usage: getAttritionTable(object)
 Arguments: object — either an object of type plpData, a population object generated by functions like createStudyPopulation, or an object of type outcomeModel.
 Value: a data frame specifying the number of people and exposures in the population after specific filtering steps.]
diff --git a/docs/reference/getCalibration.html b/docs/reference/getCalibration.html
deleted file mode 100644
index 938f1c319..000000000
--- a/docs/reference/getCalibration.html
+++ /dev/null
@@ -1,236 +0,0 @@
[Deleted pkgdown reference page: "Get a sparse summary of the calibration" (getCalibration).
 Usage: getCalibration(prediction, numberOfStrata = 10, truncateFraction = 0.01)
 Arguments: prediction — a prediction object as generated using the predict functions; numberOfStrata — the number of strata in the plot; truncateFraction — the fraction of probability values ignored when plotting, to avoid the x-axis scale being dominated by a few outliers.
 Value: a data frame with the calibration summary.
 Details: generates a sparse summary showing the predicted probabilities and the observed fractions; predictions are stratified into equally sized bins of predicted probabilities.]
diff --git a/docs/reference/getCalibrationSummary.html b/docs/reference/getCalibrationSummary.html
deleted file mode 100644
index 03ff5060e..000000000
--- a/docs/reference/getCalibrationSummary.html
+++ /dev/null
@@ -1,256 +0,0 @@
[Deleted pkgdown reference page: "Get a sparse summary of the calibration" (getCalibrationSummary).
 Usage: getCalibrationSummary(prediction, predictionType, typeColumn = "evaluation", numberOfStrata = 100, truncateFraction = 0.05)
 Arguments: prediction — a prediction object as generated using the predict functions; predictionType — the type of prediction (binary or survival); typeColumn — a column used to stratify the results; numberOfStrata — the number of strata in the plot; truncateFraction — the fraction of probability values ignored when plotting, to avoid the x-axis scale being dominated by a few outliers.
 Value: a data frame with the calibration summary.
 Details: generates a sparse summary showing the predicted probabilities and the observed fractions; predictions are stratified into equally sized bins of predicted probabilities.]
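[Usage sketch for getCalibrationSummary; the prediction data.frame is assumed to come from a fitted plpModel:]
    calibrationSummary <- getCalibrationSummary(
      prediction = prediction,
      predictionType = "binary",
      typeColumn = "evaluation",
      numberOfStrata = 100,
      truncateFraction = 0.05
    )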
diff --git a/docs/reference/getCohortCovariateData.html b/docs/reference/getCohortCovariateData.html
deleted file mode 100644
index 43ae2546e..000000000
--- a/docs/reference/getCohortCovariateData.html
+++ /dev/null
@@ -1,274 +0,0 @@
[Deleted pkgdown reference page: "Extracts covariates based on cohorts" (getCohortCovariateData).
 Usage: getCohortCovariateData(connection, oracleTempSchema = NULL, cdmDatabaseSchema, cdmVersion = "5", cohortTable = "#cohort_person", rowIdField = "row_id", aggregated, cohortId, covariateSettings)
 Arguments: connection — the database connection; oracleTempSchema — the temp schema if using Oracle; cdmDatabaseSchema — the schema of the OMOP CDM data; cdmVersion — version of the OMOP CDM data; cohortTable — the table containing the target population cohort; rowIdField — the unique identifier column in the target population cohort; aggregated — whether the covariate should be aggregated; cohortId — the cohort id for the target population cohort; covariateSettings — settings for the covariate cohorts and time periods.
 Value (as documented on the page): "The models will now be in the package".
 Details: the user specifies a cohort and time period, and a covariate is constructed indicating whether subjects are in that cohort during the time periods relative to the target population cohort index.]
diff --git a/docs/reference/getCovariateData.html b/docs/reference/getCovariateData.html
deleted file mode 100644
index e7ab32589..000000000
--- a/docs/reference/getCovariateData.html
+++ /dev/null
@@ -1,242 +0,0 @@
[Deleted pkgdown reference page: "Get the covariate data for a cohort table" (getCovariateData).
 Description: executes some SQL to extract covariate data for a cohort table.
 Usage: getCovariateData(connection, cdmDatabaseSchema, oracleTempSchema = cdmDatabaseSchema, cohortTable = "#cohort_person", cdmVersion = 5, covariateSettings)
 Arguments: connection — an existing connection can be used rather than connectionDetails; cdmDatabaseSchema — the name of the database schema containing the OMOP CDM instance (requires read permissions; on SQL Server specify both database and schema, e.g. 'cdm_instance.dbo'); oracleTempSchema — for Oracle only, the schema where all temporary tables are managed (requires create/insert permissions); cohortTable — the temp table containing the cohort of people; cdmVersion — the version of the CDM (default 5); covariateSettings — an object of type covariateSettings created using the createCovariateSettings function in the FeatureExtraction package.
 Value: the covariates for the people in the temp table.]
diff --git a/docs/reference/getDemographicSummary.html b/docs/reference/getDemographicSummary.html
deleted file mode 100644
index fc69358f0..000000000
--- a/docs/reference/getDemographicSummary.html
+++ /dev/null
@@ -1,239 +0,0 @@
[Deleted pkgdown reference page: "Get a calibration per age/gender groups" (getDemographicSummary).
 Usage: getDemographicSummary(prediction, predictionType, typeColumn = "evaluation")
 Arguments: prediction — a prediction object; predictionType — the type of prediction (binary or survival); typeColumn — a column used to stratify the results.
 Value: a data frame with the calibration summary.
 Details: generates a data.frame with the calibration per 5-year age group and gender group.]
diff --git a/docs/reference/getModelDetails.html b/docs/reference/getModelDetails.html
deleted file mode 100644
index 6d2e4a0cc..000000000
--- a/docs/reference/getModelDetails.html
+++ /dev/null
@@ -1,215 +0,0 @@
[Deleted pkgdown reference page: "Get the predictive model details" (getModelDetails).
 Description: shows the full model, i.e. the betas of all variables included in the model, along with the variable names.
 Usage: getModelDetails(predictiveModel, plpData)
 Arguments: predictiveModel — an object of type predictiveModel as generated using the fitPlp function; plpData — an object of type plpData as generated using getPlpData.
 Details: shows the coefficients and names of the covariates with non-zero coefficients.]
diff --git a/docs/reference/getPlpData.html b/docs/reference/getPlpData.html
deleted file mode 100644
index 9464df6ad..000000000
--- a/docs/reference/getPlpData.html
+++ /dev/null
@@ -1,263 +0,0 @@
[Deleted pkgdown reference page: "Get the patient level prediction data from the server" (getPlpData).
 Description: executes a large set of SQL statements against the database in OMOP CDM format to extract the data needed to perform the analysis.
 Usage: getPlpData(databaseDetails, covariateSettings, restrictPlpDataSettings)
 Arguments: databaseDetails — the cdm database details created using createDatabaseDetails(); covariateSettings — an object of type covariateSettings created using FeatureExtraction::createCovariateSettings; restrictPlpDataSettings — extra settings applied to the target population while extracting data, created using createRestrictPlpDataSettings().
 Value: an object of type plpData containing information on the cohorts, their outcomes and baseline covariates; multiple outcomes can be captured at once for efficiency. Components: outcomes (a data frame listing the outcomes per person, including time to event and outcome id; not yet filtered by risk window), cohorts (a data frame listing the persons in each cohort, their exposure status, the time to the end of the observation period and the time to the end of the cohort, usually the end of the exposure era), covariates (a sparse ffdf of baseline covariates per person; covariates with a value of 0 are omitted to save space), covariateRef (an ffdf describing the extracted covariates) and metaData (how the object was constructed). Generic functions such as summary() are implemented for this object.
 Details: based on the arguments, the at-risk cohort data is retrieved, as well as outcomes occurring in these subjects; both are identified through user-defined cohorts in a cohort table either inside the CDM instance or in a separate schema. Covariates are automatically extracted from the appropriate tables within the CDM. To exclude concepts from covariates, manually add the concept_ids and their descendants to excludedCovariateConceptIds in the covariateSettings argument.]
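[Usage sketch for getPlpData; databaseDetails is assumed to come from createDatabaseDetails(), and the FeatureExtraction covariate arguments shown are illustrative choices, not defaults from this page:]
    plpData <- getPlpData(
      databaseDetails = databaseDetails,
      covariateSettings = FeatureExtraction::createCovariateSettings(
        useDemographicsGender = TRUE,
        useDemographicsAge = TRUE,
        useConditionGroupEraLongTerm = TRUE
      ),
      restrictPlpDataSettings = createRestrictPlpDataSettings()
    )
    summary(plpData)   # summary() is implemented for plpData objects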
diff --git a/docs/reference/getPlpTable.html b/docs/reference/getPlpTable.html
deleted file mode 100644
index 92a51f7b7..000000000
--- a/docs/reference/getPlpTable.html
+++ /dev/null
@@ -1,259 +0,0 @@
[Deleted pkgdown reference page: "Create a dataframe with the summary details of the population cohort for publications" (getPlpTable).
 Usage: getPlpTable(cdmDatabaseSchema, oracleTempSchema, covariateSettings, longTermStartDays = -365, population, connectionDetails, cohortTable = "#temp_person")
 Arguments: cdmDatabaseSchema — the schema containing the OMOP CDM data; oracleTempSchema — the Oracle schema if needed; covariateSettings — the covariateSettings if different from the default; longTermStartDays — how far to look back for the variables in the data; population — the population the summary table is wanted for; connectionDetails — the connection details used to connect to the CDM database; cohortTable — the name of the temp table that will store the population cohort.
 Details: used to create a summary table for the population to be inserted into publications.
 Example: if (FALSE) { getTable1(plpData, population, connectionDetails) }]
diff --git a/docs/reference/getPredictionDistribution.html b/docs/reference/getPredictionDistribution.html
deleted file mode 100644
index e2a3887d6..000000000
--- a/docs/reference/getPredictionDistribution.html
+++ /dev/null
@@ -1,244 +0,0 @@
[Deleted pkgdown reference page: "Calculates the prediction distribution" (getPredictionDistribution).
 Usage: getPredictionDistribution(prediction, predictionType, typeColumn = "evaluation")
 Arguments: prediction — a prediction object; predictionType — the type of prediction (binary or survival); typeColumn — a column used to stratify the results.
 Value: the 0.00, 0.1, 0.25, 0.5, 0.75, 0.9 and 1.00 quantiles of the prediction, plus the mean and standard deviation per class.
 Details: calculates the quantiles from a prediction object.]
diff --git a/docs/reference/getPredictionDistribution_binary.html b/docs/reference/getPredictionDistribution_binary.html
deleted file mode 100644
index 9d4ccbde5..000000000
--- a/docs/reference/getPredictionDistribution_binary.html
+++ /dev/null
@@ -1,240 +0,0 @@
[Deleted pkgdown reference page: "Calculates the prediction distribution" (getPredictionDistribution_binary).
 Usage: getPredictionDistribution_binary(prediction, evalColumn, ...)
 Arguments: prediction — a prediction object; evalColumn — a column used to stratify the results; ... — other inputs.
 Value and Details: as for getPredictionDistribution above.]
diff --git a/docs/reference/getThresholdSummary.html b/docs/reference/getThresholdSummary.html
deleted file mode 100644
index 9a1f6cc64..000000000
--- a/docs/reference/getThresholdSummary.html
+++ /dev/null
@@ -1,240 +0,0 @@
[Deleted pkgdown reference page: "Calculate all measures for sparse ROC" (getThresholdSummary).
 Usage: getThresholdSummary(prediction, predictionType, typeColumn = "evaluation")
 Arguments: prediction — a prediction object; predictionType — the type of prediction (binary or survival); typeColumn — a column used to stratify the results.
 Value: a data.frame with all the measures.
 Details: calculates the TP, FP, TN, FN, TPR, FPR, accuracy, PPF, FOR and F-measure from a prediction object.]
diff --git a/docs/reference/getThresholdSummary_binary.html b/docs/reference/getThresholdSummary_binary.html
deleted file mode 100644
index b45c0a63c..000000000
--- a/docs/reference/getThresholdSummary_binary.html
+++ /dev/null
@@ -1,240 +0,0 @@
[Deleted pkgdown reference page: "Calculate all measures for sparse ROC when prediction is binary classification" (getThresholdSummary_binary).
 Usage: getThresholdSummary_binary(prediction, evalColumn, ...)
 Arguments: prediction — a prediction object; evalColumn — a column used to stratify the results; ... — other inputs.
 Value and Details: as for getThresholdSummary above.]
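[Usage sketch for getThresholdSummary; the prediction data.frame is assumed to come from a fitted plpModel:]
    thresholds <- getThresholdSummary(
      prediction = prediction,
      predictionType = "binary",
      typeColumn = "evaluation"
    )
    head(thresholds)  # the measures per threshold described in the page above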
diff --git a/docs/reference/grepCovariateNames.html b/docs/reference/grepCovariateNames.html
deleted file mode 100644
index 8438e0454..000000000
--- a/docs/reference/grepCovariateNames.html
+++ /dev/null
@@ -1,223 +0,0 @@
[Deleted pkgdown reference page: "Extract covariate names" (grepCovariateNames).
 Description: extracts covariate names using a regular expression.
 Usage: grepCovariateNames(pattern, object)
 Arguments: pattern — a regular expression used to match covariate names; object — an R object of type plpData or covariateData.
 Value: a data.frame describing the covariates that match the regular expression, with columns covariateId (numerical identifier for use in model fitting), covariateName (text identifier), analysisId (analysis identifier) and conceptId (OMOP Common Data Model concept identifier, or 0).
 Details: extracts the covariate names that match a regular expression from a plpData or covariateData object.]
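[Usage sketch for grepCovariateNames; plpData is assumed to have been extracted already and "age" is an illustrative pattern:]
    # covariates whose name matches the regular expression "age"
    ageCovariates <- grepCovariateNames(pattern = "age", object = plpData)
    ageCovariates$covariateName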
diff --git a/docs/reference/ici.html b/docs/reference/ici.html
deleted file mode 100644
index 449fcbe26..000000000
--- a/docs/reference/ici.html
+++ /dev/null
@@ -1,236 +0,0 @@
[Deleted pkgdown reference page: "Calculate the Integrated Calibration Information from Austin and Steyerberg, https://onlinelibrary.wiley.com/doi/full/10.1002/sim.8281" (ici).
 Usage: ici(prediction)
 Arguments: prediction — the prediction object found in the plpResult object.
 Value: the Integrated Calibration Information.]
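[Usage sketch for ici; plpResult is assumed to be the output of runPlp(), with the prediction object in the assumed $prediction slot:]
    ici(plpResult$prediction)  # integrated calibration information of the model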
diff --git a/docs/reference/index.html b/docs/reference/index.html
deleted file mode 100644
index 5c485e21f..000000000
--- a/docs/reference/index.html
+++ /dev/null
@@ -1,701 +0,0 @@
[Deleted pkgdown reference index page: "Function reference". The index grouped the package functions as follows:
 - Extracting data from the OMOP CDM database (getting the necessary data in the Common Data Model and saving/loading): createDatabaseDetails(), createRestrictPlpDataSettings(), getPlpData(), savePlpData(), loadPlpData()
 - Settings for designing prediction models (design settings required when developing a model): createStudyPopulationSettings(), createDefaultSplitSetting() (random stratified by outcome, time or subject splitting), createSampleSettings(), createFeatureEngineeringSettings(), createPreprocessSettings()
 - Execution settings when developing a model: createLogSettings(), createExecuteSettings()
 - Binary Classification Models (classifiers and their hyper-parameter search): setAdaBoost(), setDecisionTree(), setGradientBoostingMachine(), setKNN(), setLassoLogisticRegression(), setMLP(), setNaiveBayes(), setRandomForest(), setSVM()
 - Survival Models: setCoxModel() (lasso Cox model)
 - Single Patient-Level Prediction Model (training/evaluating/applying a single model): runPlp(), externalValidateDbPlp(), savePlpModel(), loadPlpModel(), savePlpResult(), loadPlpResult()
 - Multiple Patient-Level Prediction Models (training multiple models efficiently): createModelDesign(), runMultiplePlp(), validateMultiplePlp(), savePlpAnalysesJson(), loadPlpAnalysesJson()
 - Saving results into database: createPlpResultTables(), populatePlpResultTables()
 - Shiny Viewers: viewPlp(), viewMultiplePlp(), viewDatabaseResultPlp()
 - Plotting: plotPlp(), plotSparseRoc(), plotSmoothCalibration() (as detailed in Van Calster et al., "A calibration hierarchy for risk models was defined: from utopia to empirical data", 2016), plotSparseCalibration(), plotSparseCalibration2(), plotDemographicSummary(), plotF1Measure(), plotGeneralizability(), plotPrecisionRecall(), plotPredictedPDF(), plotPreferencePDF(), plotPredictionDistribution(), plotVariableScatterplot()
 - Learning Curves: createLearningCurve(), plotLearningCurve()
 - Simulation: simulatePlpData()]
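[Workflow sketch tying together the function groups listed in the deleted index above; argument names are assumed from the settings constructors in that index, and databaseDetails, outcomeId and the save directory are placeholders:]
    plpData <- getPlpData(
      databaseDetails = databaseDetails,
      covariateSettings = FeatureExtraction::createCovariateSettings(useDemographicsAge = TRUE, useDemographicsGender = TRUE),
      restrictPlpDataSettings = createRestrictPlpDataSettings()
    )
    plpResult <- runPlp(
      plpData = plpData,
      outcomeId = 2,                                       # placeholder outcome cohort id
      analysisId = "Analysis_1",
      populationSettings = createStudyPopulationSettings(),
      splitSettings = createDefaultSplitSetting(),
      modelSettings = setLassoLogisticRegression(),
      saveDirectory = "./plpResults"
    )
    viewPlp(plpResult)   # interactively view the performance and model settings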
diff --git a/docs/reference/insertDbPopulation.html b/docs/reference/insertDbPopulation.html
deleted file mode 100644
index c733a5724..000000000
--- a/docs/reference/insertDbPopulation.html
+++ /dev/null
@@ -1,245 +0,0 @@
[Deleted pkgdown reference page: "Insert a population into a database" (insertDbPopulation).
 Usage: insertDbPopulation(population, cohortIds = 1, connectionDetails, cohortDatabaseSchema, cohortTable = "cohort", createTable = FALSE, dropTableIfExists = TRUE, cdmVersion = "5")
 Arguments: population — either an object of type plpData or a population object generated by functions like createStudyPopulation; cohortIds — the IDs to be used for the treated and comparator cohorts, respectively; connectionDetails — an R object of type connectionDetails created using DatabaseConnector::createConnectionDetails; cohortDatabaseSchema — the database schema where the data will be written (requires write permissions; on SQL Server specify both database and schema, e.g. 'cdm_instance.dbo'); cohortTable — the table in the database schema where the data will be written; createTable — should a new table be created? if not, the data is inserted into an existing table; dropTableIfExists — if createTable = TRUE and the table already exists, it will be overwritten; cdmVersion — the OMOP CDM version used, currently "4" or "5".
 Details: inserts a population table into a database; the table has the same structure as the 'cohort' table in the Common Data Model.]
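[Usage sketch for insertDbPopulation; the schema and table names are placeholders:]
    insertDbPopulation(
      population = population,
      cohortIds = 1,
      connectionDetails = connectionDetails,
      cohortDatabaseSchema = "scratch.dbo",
      cohortTable = "plp_population",
      createTable = TRUE,
      dropTableIfExists = TRUE,
      cdmVersion = "5"
    )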
diff --git a/docs/reference/interpretInstallCode.html b/docs/reference/interpretInstallCode.html
deleted file mode 100644
index a8fcd6e16..000000000
--- a/docs/reference/interpretInstallCode.html
+++ /dev/null
@@ -1,222 +0,0 @@
[Deleted pkgdown reference page: "Tells you the package issue" (interpretInstallCode).
 Usage: interpretInstallCode(response)
 Arguments: response — the response code from checkPlpInstallation().
 Details: prints any issues found during the checkPlpInstallation() call.]
diff --git a/docs/reference/launchDiagnosticsExplorer.html b/docs/reference/launchDiagnosticsExplorer.html
deleted file mode 100644
index 17135c991..000000000
--- a/docs/reference/launchDiagnosticsExplorer.html
+++ /dev/null
@@ -1,235 +0,0 @@
[Deleted pkgdown reference page: "Launch the Diagnostics Explorer Shiny app" (launchDiagnosticsExplorer).
 Usage: launchDiagnosticsExplorer(dataFolder, launch.browser = FALSE)
 Arguments: dataFolder — a folder where the exported zip files with the results are stored; zip files containing results from multiple databases can be placed in the same folder; launch.browser — whether to launch the app in the default browser or in a Shiny window (copying to clipboard does not work in a Shiny window).
 Details: launches a Shiny app that allows the user to explore the diagnostics.]
diff --git a/docs/reference/listAppend.html b/docs/reference/listAppend.html
deleted file mode 100644
index d98a5a463..000000000
--- a/docs/reference/listAppend.html
+++ /dev/null
@@ -1,232 +0,0 @@
[Deleted pkgdown reference page: "join two lists" (listAppend).
 Usage: listAppend(a, b)
 Arguments: a — a list; b — another list.
 Details: joins the two lists.]
diff --git a/docs/reference/loadEnsemblePlpModel.html b/docs/reference/loadEnsemblePlpModel.html
deleted file mode 100644
index 117c7a11c..000000000
--- a/docs/reference/loadEnsemblePlpModel.html
+++ /dev/null
@@ -1,222 +0,0 @@
[Deleted pkgdown reference page: "loads the Ensemble plp model and returns a model list" (loadEnsemblePlpModel).
 Usage: loadEnsemblePlpModel(dirPath)
 Arguments: dirPath — the location of the model.
 Details: loads a plp model list that was saved using savePlpModel().]
diff --git a/docs/reference/loadEnsemblePlpResult.html b/docs/reference/loadEnsemblePlpResult.html
deleted file mode 100644
index 231ea5eb7..000000000
--- a/docs/reference/loadEnsemblePlpResult.html
+++ /dev/null
@@ -1,222 +0,0 @@
[Deleted pkgdown reference page: "loads the Ensemble plp results" (loadEnsemblePlpResult).
 Usage: loadEnsemblePlpResult(dirPath)
 Arguments: dirPath — the location of the results.
 Details: loads a plp model list that was saved using saveEnsemblePlpResults().]
diff --git a/docs/reference/loadPlpAnalysesJson.html b/docs/reference/loadPlpAnalysesJson.html
deleted file mode 100644
index 513a9a062..000000000
--- a/docs/reference/loadPlpAnalysesJson.html
+++ /dev/null
@@ -1,235 +0,0 @@
[Deleted pkgdown reference page: "Load the multiple prediction json settings from a file" (loadPlpAnalysesJson).
 Usage: loadPlpAnalysesJson(jsonFileLocation)
 Arguments: jsonFileLocation — the location of the file 'predictionAnalysisList.json' with the modelDesignList.
 Details: interprets a json with the multiple prediction settings and creates a list that can be combined with connection settings to run a multiple prediction study.
 Example: if (FALSE) { modelDesignList <- loadPlpAnalysesJson('location of json settings')$analysis }]
diff --git a/docs/reference/loadPlpData.html b/docs/reference/loadPlpData.html
deleted file mode 100644
index a8da4e124..000000000
--- a/docs/reference/loadPlpData.html
+++ /dev/null
@@ -1,241 +0,0 @@
[Deleted pkgdown reference page: "Load the cohort data from a folder" (loadPlpData).
 Description: loads an object of type plpData from a folder in the file system.
 Usage: loadPlpData(file, readOnly = TRUE)
 Arguments: file — the name of the folder containing the data; readOnly — if TRUE, the data is opened read-only.
 Value: an object of class plpData.
 Details: the data is read from the set of files in the folder specified by the user.
 Example: # todo]
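[Usage sketch pairing savePlpData() with loadPlpData(); the folder path is a placeholder and the positional savePlpData() call is an assumption based on the save/load pairing described in the reference index:]
    savePlpData(plpData, "./plpData")                  # write plpData to a folder
    plpData <- loadPlpData("./plpData", readOnly = TRUE)
    summary(plpData)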
diff --git a/docs/reference/loadPlpFromCsv.html b/docs/reference/loadPlpFromCsv.html
deleted file mode 100644
index c51564912..000000000
--- a/docs/reference/loadPlpFromCsv.html
+++ /dev/null
@@ -1,222 +0,0 @@
[Deleted pkgdown reference page: "Loads parts of the plp result saved as csv files for transparent sharing" (loadPlpFromCsv).
 Usage: loadPlpFromCsv(dirPath). Arguments: dirPath — the directory with the results as csv files.
 Details: loads the main results from csv files into a runPlp object.]
diff --git a/docs/reference/loadPlpModel.html b/docs/reference/loadPlpModel.html
deleted file mode 100644
index e78445f92..000000000
--- a/docs/reference/loadPlpModel.html
+++ /dev/null
@@ -1,228 +0,0 @@
[Deleted pkgdown reference page: "loads the plp model" (loadPlpModel).
 Usage: loadPlpModel(dirPath). Arguments: dirPath — the location of the model.
 Details: loads a plp model that was saved using savePlpModel().]
diff --git a/docs/reference/loadPlpResult.html b/docs/reference/loadPlpResult.html
deleted file mode 100644
index 961c710ec..000000000
--- a/docs/reference/loadPlpResult.html
+++ /dev/null
@@ -1,228 +0,0 @@
[Deleted pkgdown reference page: "Loads the evaluation dataframe" (loadPlpResult).
 Usage: loadPlpResult(dirPath). Arguments: dirPath — the directory where the evaluation was saved.
 Details: loads the evaluation.]
diff --git a/docs/reference/loadPlpShareable.html b/docs/reference/loadPlpShareable.html
deleted file mode 100644
index 774b07d56..000000000
--- a/docs/reference/loadPlpShareable.html
+++ /dev/null
@@ -1,228 +0,0 @@
[Deleted pkgdown reference page: "Loads the plp result saved as json/csv files for transparent sharing" (loadPlpShareable).
 Usage: loadPlpShareable(loadDirectory). Arguments: loadDirectory — the directory with the results as json/csv files.
 Details: loads the main results from json/csv files into a runPlp object.]
diff --git a/docs/reference/loadPrediction.html b/docs/reference/loadPrediction.html
deleted file mode 100644
index f2ec47d45..000000000
--- a/docs/reference/loadPrediction.html
+++ /dev/null
@@ -1,228 +0,0 @@
[Deleted pkgdown reference page: "Loads the prediction dataframe to csv" (loadPrediction).
 Usage: loadPrediction(fileLocation). Arguments: fileLocation — the location of the saved prediction.
 Details: loads the prediction RDS file.]
diff --git a/docs/reference/loadPredictionAnalysisList.html b/docs/reference/loadPredictionAnalysisList.html (deleted pkgdown reference page)

    Load the multiple prediction json settings from a file — loadPredictionAnalysisList

    loadPredictionAnalysisList(predictionAnalysisListFile)

    Arguments
      predictionAnalysisListFile: The prediction specification json extracted from ATLAS

    Details
      This function interprets a json with the multiple prediction settings and creates a list
      that can be combined with connection settings to run a multiple prediction study

    Examples
      if (FALSE) {
      predictionAnalysisList <- loadPredictionAnalysisList('./predictionStudyAnalyses.json')
      predictionAnalysisList$connectionDetails = connectionDetails
      predictionAnalysisList$cdmDatabaseSchema = cdmDatabaseSchema
      predictionAnalysisList$cdmDatabaseName = cdmDatabaseName
      predictionAnalysisList$oracleTempSchema = oracleTempSchema
      predictionAnalysisList$cohortDatabaseSchema = cohortDatabaseSchema
      predictionAnalysisList$cohortTable = cohortTable
      predictionAnalysisList$outcomeDatabaseSchema = outcomeDatabaseSchema
      predictionAnalysisList$outcomeTable = outcomeTable
      predictionAnalysisList$cdmVersion = cdmVersion
      predictionAnalysisList$outputFolder = outputFolder
      result <- do.call(runPlpAnalyses, predictionAnalysisList)
      }
diff --git a/docs/reference/modelBasedConcordance.html b/docs/reference/modelBasedConcordance.html (deleted pkgdown reference page)

    Calculate the model-based concordance, the expected discrimination performance of a model under
    the assumption the model predicts the "TRUE" outcome, as detailed in van Klaveren et al.
    https://pubmed.ncbi.nlm.nih.gov/27251001/ — modelBasedConcordance

    modelBasedConcordance(prediction)

    Arguments
      prediction: the prediction object found in the plpResult object

    Value
      model-based concordance value
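    A hedged usage sketch; plpResult is assumed to be an existing runPlp result whose $prediction element holds the prediction dataframe.

      if (FALSE) {
        mbConcordance <- modelBasedConcordance(prediction = plpResult$prediction)
        mbConcordance   # compare against the empirically observed AUROC
      }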
diff --git a/docs/reference/negativeLikelihoodRatio.html b/docs/reference/negativeLikelihoodRatio.html (deleted pkgdown reference page)

    Calculate the negativeLikelihoodRatio — negativeLikelihoodRatio

    negativeLikelihoodRatio(TP, TN, FN, FP)

    Arguments
      TP: Number of true positives
      TN: Number of true negatives
      FN: Number of false negatives
      FP: Number of false positives

    Value
      negativeLikelihoodRatio value
diff --git a/docs/reference/negativePredictiveValue.html b/docs/reference/negativePredictiveValue.html (deleted pkgdown reference page)

    Calculate the negativePredictiveValue — negativePredictiveValue

    negativePredictiveValue(TP, TN, FN, FP)

    Arguments
      TP: Number of true positives
      TN: Number of true negatives
      FN: Number of false negatives
      FP: Number of false positives

    Value
      negativePredictiveValue value
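    The standard confusion-matrix definitions these helpers are expected to follow can be computed by hand; the counts below are made up purely for illustration.

      TP <- 80; TN <- 820; FN <- 20; FP <- 80                      # illustrative counts only
      sensitivity <- TP / (TP + FN)                                # 0.80
      specificity <- TN / (TN + FP)                                # ~0.91
      negativePredictiveValue <- TN / (TN + FN)                    # ~0.98
      negativeLikelihoodRatio <- (1 - sensitivity) / specificity   # ~0.22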
diff --git a/docs/reference/outcomeSurvivalPlot.html b/docs/reference/outcomeSurvivalPlot.html (deleted pkgdown reference page)

    Plot the outcome incidence over time — outcomeSurvivalPlot

    outcomeSurvivalPlot(
      plpData,
      outcomeId,
      populationSettings = createStudyPopulationSettings(binary = T, includeAllOutcomes = T,
        firstExposureOnly = FALSE, washoutPeriod = 0, removeSubjectsWithPriorOutcome = TRUE,
        priorOutcomeLookback = 99999, requireTimeAtRisk = F, riskWindowStart = 1,
        startAnchor = "cohort start", riskWindowEnd = 3650, endAnchor = "cohort start"),
      riskTable = T,
      confInt = T,
      yLabel = "Fraction of those who are outcome free in target population"
    )

    Arguments
      plpData: The plpData object returned by running getPlpData()
      outcomeId: The cohort id corresponding to the outcome
      populationSettings: The population settings created using createStudyPopulationSettings
      riskTable: (binary) Whether to include a table at the bottom of the plot showing the number of people at risk over time
      confInt: (binary) Whether to include a confidence interval
      yLabel: (string) The label for the y-axis

    Value
      TRUE if it ran

    Details
      This creates a survival plot that can be used to pick a suitable time-at-risk period
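    A usage sketch under the assumption that plpData has already been extracted with getPlpData(); the outcome cohort id is illustrative.

      if (FALSE) {
        outcomeSurvivalPlot(
          plpData = plpData,
          outcomeId = 3,        # illustrative outcome cohort id
          riskTable = TRUE,
          confInt = TRUE
        )
      }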
diff --git a/docs/reference/personSplitter.html b/docs/reference/personSplitter.html (deleted pkgdown reference page)

    Split data into random subsets stratified by class — personSplitter

    personSplitter(population, test = 0.3, train = NULL, nfold = 3, seed = NULL)

    Arguments
      population: An object created using createStudyPopulation()
      test: A real number between 0 and 1 indicating the test set fraction of the data
      train: A real number between 0 and 1 indicating the train set fraction of the data. If not set, train is equal to 1 - test
      nfold: An integer >= 1 specifying the number of folds used in cross validation
      seed: If set, a fixed seed is used; otherwise a random split is performed

    Value
      A dataframe containing the columns: rowId and index

    Details
      Returns a dataframe of rowIds and indexes with a -1 index indicating the rowId belongs to the
      test set and a positive integer index value indicating the rowId's cross validation fold within
      the train set.
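    A minimal, self-contained illustration of how to read the split dataframe described above; the toy values are made up.

      # `split` mimics the rowId/index dataframe returned by the splitters
      split <- data.frame(rowId = 1:6, index = c(-1, -1, 1, 2, 3, 1))
      testRows  <- split$rowId[split$index == -1]   # rows held out for testing
      trainRows <- split$rowId[split$index > 0]     # rows in the train set
      table(split$index[split$index > 0])           # size of each cross-validation fold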
diff --git a/docs/reference/pfi.html b/docs/reference/pfi.html (deleted pkgdown reference page)

    Calculate the permutation feature importance for a PLP model — pfi

    pfi(
      plpResult,
      population,
      plpData,
      repeats = 1,
      covariates = NULL,
      cores = NULL,
      log = NULL,
      logthreshold = "INFO"
    )

    Arguments
      plpResult: An object of type runPlp
      population: The population created using createStudyPopulation() who will have their risks predicted
      plpData: An object of type plpData - the patient level prediction data extracted from the CDM
      repeats: The number of times to permute each covariate
      covariates: A vector of covariates to calculate the pfi for. If NULL it uses all covariates included in the model
      cores: Number of cores to use when running this (it runs in parallel)
      log: A location to save the log for running pfi
      logthreshold: The log threshold (e.g., INFO, TRACE, ...)

    Value
      A dataframe with the covariateIds and the pfi (change in AUC caused by permuting the covariate) value

    Details
      The function permutes each covariate <repeats> times and calculates the mean AUC change caused by the permutation.
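    A hedged sketch of a call; plpResult, population and plpData are assumed to come from an earlier runPlp()/createStudyPopulation()/getPlpData() workflow.

      if (FALSE) {
        vimp <- pfi(
          plpResult = plpResult,
          population = population,
          plpData = plpData,
          repeats = 2,
          cores = 2
        )
        head(vimp[order(vimp$pfi, decreasing = TRUE), ])   # assumes the importance column is named pfi
      }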
diff --git a/docs/reference/plotDemographicSummary.html b/docs/reference/plotDemographicSummary.html (deleted pkgdown reference page)

    Plot the observed vs. expected incidence, by age and gender — plotDemographicSummary

    plotDemographicSummary(plpResult, typeColumn = "evaluation", saveLocation = NULL, fileName = "roc.png")

    Arguments
      plpResult: A plp result object as generated using the runPlp function
      typeColumn: The name of the column specifying the evaluation type
      saveLocation: Directory to save plot (if NULL plot is not saved)
      fileName: Name of the file for the saved plot, for example 'plot.png'. See the function ggsave in the ggplot2 package for supported file formats.

    Value
      A ggplot object. Use the ggsave function to save to file in a different format.
diff --git a/docs/reference/plotF1Measure.html b/docs/reference/plotF1Measure.html (deleted pkgdown reference page)

    Plot the F1 measure efficiency frontier using the sparse thresholdSummary data frame — plotF1Measure

    plotF1Measure(plpResult, typeColumn = "evaluation", saveLocation = NULL, fileName = "roc.png")

    Arguments
      plpResult, typeColumn, saveLocation, fileName: as described for plotDemographicSummary above

    Value
      A ggplot object. Use the ggsave function to save to file in a different format.
diff --git a/docs/reference/plotGeneralizability.html b/docs/reference/plotGeneralizability.html (deleted pkgdown reference page)

    Plot the train/test generalizability diagnostic — plotGeneralizability

    plotGeneralizability(covariateSummary, saveLocation = NULL, fileName = "Generalizability.png")

    Arguments
      covariateSummary: A prediction object as generated using the runPlp function
      saveLocation, fileName: as described for plotDemographicSummary above

    Value
      A ggplot object. Use the ggsave function to save to file in a different format.
diff --git a/docs/reference/plotLearningCurve.html b/docs/reference/plotLearningCurve.html (deleted pkgdown reference page)

    Create a plot of the learning curve using the object returned from createLearningCurve — plotLearningCurve

    plotLearningCurve(
      learningCurve,
      metric = "AUROC",
      abscissa = "events",
      plotTitle = "Learning Curve",
      plotSubtitle = NULL,
      fileName = NULL
    )

    Arguments
      learningCurve: An object returned by the createLearningCurve function
      metric: The metric to be plotted: 'AUROC' (area under the Receiver Operating Characteristic curve), 'AUPRC' (area under the Precision-Recall curve) or 'sBrier' (scaled Brier score)
      abscissa: The abscissa metric to be plotted: 'events' (number of events) or 'observations' (number of observations)
      plotTitle: Title of the learning curve plot
      plotSubtitle: Subtitle of the learning curve plot
      fileName: Filename of plot to be saved, for example 'plot.png'. See the function ggsave in the ggplot2 package for supported file formats.

    Value
      A ggplot object. Use the ggsave function to save to file in a different format.

    Examples
      if (FALSE) {
      # create learning curve object
      learningCurve <- createLearningCurve(population, plpData, modelSettings)
      # plot the learning curve
      plotLearningCurve(learningCurve)
      }
diff --git a/docs/reference/plotPlp.html b/docs/reference/plotPlp.html (deleted pkgdown reference page)

    Plot all the PatientLevelPrediction plots — plotPlp

    plotPlp(plpResult, saveLocation = NULL, typeColumn = "evaluation")

    Arguments
      plpResult: Object returned by the runPlp() function
      saveLocation: Name of the directory where the plots should be saved (NULL means no saving)
      typeColumn: The name of the column specifying the evaluation type (to stratify the plots)

    Value
      TRUE if it ran

    Details
      Create a directory with all the plots
diff --git a/docs/reference/plotPrecisionRecall.html b/docs/reference/plotPrecisionRecall.html (deleted pkgdown reference page)

    Plot the precision-recall curve using the sparse thresholdSummary data frame — plotPrecisionRecall

    plotPrecisionRecall(plpResult, typeColumn = "evaluation", saveLocation = NULL, fileName = "roc.png")

    Arguments
      plpResult, typeColumn, saveLocation, fileName: as described for plotDemographicSummary above

    Value
      A ggplot object. Use the ggsave function to save to file in a different format.
diff --git a/docs/reference/plotPredictedPDF.html b/docs/reference/plotPredictedPDF.html (deleted pkgdown reference page)

    Plot the predicted probability density function, showing prediction overlap between true and false cases — plotPredictedPDF

    plotPredictedPDF(plpResult, typeColumn = "evaluation", saveLocation = NULL, fileName = "PredictedPDF.png")

    Arguments
      plpResult, typeColumn, saveLocation, fileName: as described for plotDemographicSummary above

    Value
      A ggplot object. Use the ggsave function to save to file in a different format.
diff --git a/docs/reference/plotPredictionDistribution.html b/docs/reference/plotPredictionDistribution.html (deleted pkgdown reference page)

    Plot the side-by-side boxplots of prediction distribution, by class — plotPredictionDistribution

    plotPredictionDistribution(plpResult, typeColumn = "evaluation", saveLocation = NULL, fileName = "PredictionDistribution.png")

    Arguments
      plpResult, typeColumn, saveLocation, fileName: as described for plotDemographicSummary above

    Value
      A ggplot object. Use the ggsave function to save to file in a different format.
diff --git a/docs/reference/plotPreferencePDF.html b/docs/reference/plotPreferencePDF.html (deleted pkgdown reference page)

    Plot the preference score probability density function, showing prediction overlap between true and false cases — plotPreferencePDF

    plotPreferencePDF(plpResult, typeColumn = "evaluation", saveLocation = NULL, fileName = "plotPreferencePDF.png")

    Arguments
      plpResult, typeColumn, saveLocation, fileName: as described for plotDemographicSummary above

    Value
      A ggplot object. Use the ggsave function to save to file in a different format.
diff --git a/docs/reference/plotRoc.html b/docs/reference/plotRoc.html (deleted pkgdown reference page)

    Plot the ROC curve — plotRoc

    plotRoc(prediction, fileName = NULL)

    Arguments
      prediction: A prediction object as generated using the predictProbabilities function
      fileName: Name of the file where the plot should be saved, for example 'plot.png'. See the function ggsave in the ggplot2 package for supported file formats.

    Value
      A ggplot object. Use the ggsave function to save to file in a different format.

    Details
      Create a plot showing the Receiver Operator Characteristics (ROC) curve.
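    Since these plotting helpers return ggplot objects, a copy in another format can be written with ggplot2::ggsave(); in this sketch plpResult$prediction is assumed to hold a compatible prediction dataframe and the file name is illustrative.

      if (FALSE) {
        p <- plotRoc(prediction = plpResult$prediction)
        ggplot2::ggsave("roc.pdf", plot = p, width = 5, height = 5)
      }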
diff --git a/docs/reference/plotSmoothCalibration.html b/docs/reference/plotSmoothCalibration.html (deleted pkgdown reference page)

    Plot the smooth calibration as detailed in Calster et al. "A calibration hierarchy for risk models
    was defined: from utopia to empirical data" (2016) — plotSmoothCalibration

    plotSmoothCalibration(
      plpResult,
      smooth = "loess",
      span = 0.75,
      nKnots = 5,
      scatter = FALSE,
      bins = 20,
      sample = TRUE,
      typeColumn = "evaluation",
      saveLocation = NULL,
      fileName = "smoothCalibration.pdf"
    )

    Arguments
      plpResult: The result of running the runPlp function. An object containing the model or location where the model is saved, the data selection settings, the preprocessing and training settings as well as various performance measures obtained by the model.
      smooth: options: 'loess' or 'rcs'
      span: The width of the span used for loess. This will allow for faster computing and lower memory usage.
      nKnots: The number of knots to be used by the rcs evaluation. Default is 5
      scatter: plot the decile calibrations as points on the graph. Default is FALSE
      bins: The number of bins for the histogram. Default is 20
      sample: If using loess then by default 20,000 patients will be sampled to save time
      typeColumn: The name of the column specifying the evaluation type
      saveLocation: Directory to save plot (if NULL plot is not saved)
      fileName: Name of the file for the saved plot, for example 'plot.png'. See the function ggsave in the ggplot2 package for supported file formats.

    Value
      A ggplot object.

    Details
      Create a plot showing the smoothed calibration
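    A hedged example switching from the loess default to restricted cubic splines; plpResult is assumed to be an existing runPlp result and the output directory is illustrative.

      if (FALSE) {
        plotSmoothCalibration(
          plpResult = plpResult,
          smooth = "rcs",
          nKnots = 5,
          scatter = TRUE,
          saveLocation = "plots",
          fileName = "smoothCalibration.pdf"
        )
      }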
diff --git a/docs/reference/plotSparseCalibration.html b/docs/reference/plotSparseCalibration.html (deleted pkgdown reference page)

    Plot the calibration — plotSparseCalibration

    plotSparseCalibration(plpResult, typeColumn = "evaluation", saveLocation = NULL, fileName = "roc.png")

    Arguments
      plpResult, typeColumn, saveLocation, fileName: as described for plotDemographicSummary above

    Value
      A ggplot object. Use the ggsave function to save to file in a different format.
diff --git a/docs/reference/plotSparseCalibration2.html b/docs/reference/plotSparseCalibration2.html (deleted pkgdown reference page)

    Plot the conventional calibration — plotSparseCalibration2

    plotSparseCalibration2(plpResult, typeColumn = "evaluation", saveLocation = NULL, fileName = "roc.png")

    Arguments
      plpResult, typeColumn, saveLocation, fileName: as described for plotDemographicSummary above

    Value
      A ggplot object. Use the ggsave function to save to file in a different format.
diff --git a/docs/reference/plotSparseRoc.html b/docs/reference/plotSparseRoc.html (deleted pkgdown reference page)

    Plot the ROC curve using the sparse thresholdSummary data frame — plotSparseRoc

    plotSparseRoc(plpResult, typeColumn = "evaluation", saveLocation = NULL, fileName = "roc.png")

    Arguments
      plpResult, typeColumn, saveLocation, fileName: as described for plotDemographicSummary above

    Value
      A ggplot object. Use the ggsave function to save to file in a different format.

    Details
      Create a plot showing the Receiver Operator Characteristics (ROC) curve.
diff --git a/docs/reference/plotVariableScatterplot.html b/docs/reference/plotVariableScatterplot.html (deleted pkgdown reference page)

    Plot the variable importance scatterplot — plotVariableScatterplot

    plotVariableScatterplot(covariateSummary, saveLocation = NULL, fileName = "VariableScatterplot.png")

    Arguments
      covariateSummary: A prediction object as generated using the runPlp function
      saveLocation, fileName: as described for plotDemographicSummary above

    Value
      A ggplot object. Use the ggsave function to save to file in a different format.
diff --git a/docs/reference/plpDataSimulationProfile.html b/docs/reference/plpDataSimulationProfile.html (deleted pkgdown reference page)

    A simulation profile — plpDataSimulationProfile

    data(plpDataSimulationProfile)

    Format
      A data frame containing the following elements:
        covariatePrevalence: prevalence of all covariates
        outcomeModels: regression model parameters to simulate outcomes
        metaData: settings used to simulate the profile
        covariateRef: covariateIds and covariateNames
        timePrevalence: time window
        exclusionPrevalence: prevalence of exclusion of covariates
diff --git a/docs/reference/populatePlpResultTables.html b/docs/reference/populatePlpResultTables.html (deleted pkgdown reference page)

    Populate the PatientLevelPrediction results tables — populatePlpResultTables

    This function formats and uploads results that have been generated via an ATLAS prediction package into a database

    populatePlpResultTables(
      conn,
      resultSchema,
      stringAppendToTables = "",
      targetDialect = "postgresql",
      tempEmulationSchema = getOption("sqlRenderTempEmulationSchema"),
      packageName,
      studyJsonList,
      studyName = "",
      studyDescription = "",
      researcherName = "",
      researcherEmail = "",
      researcherOrg = "",
      databaseName = NULL,
      databaseAcronym = NULL,
      databaseVersion = 1,
      databaseDescription = NULL,
      databaseType = NULL,
      valDatabases = list(ccae = list(name = "CCAE", description = "", version = 1, type = "US Claims")),
      resultLocation = NULL,
      resultPattern = "",
      validationLocation = file.path(resultLocation, "Validation"),
      addInternalValidation = T,
      addExternalValidation = T,
      gsubVal = NULL,
      removePattern = NULL
    )

    Arguments
      conn: A connection to a database created by using the function connect in the DatabaseConnector package
      resultSchema: (string) The name of the database schema where the result tables will be created
      stringAppendToTables: (string) A string appended to the PatientLevelPrediction result tables
      targetDialect: (string) The database management system being used
      tempEmulationSchema: (string) The temp schema used when the database management system is oracle
      packageName: (string) The name of the ATLAS R package used to generate the results (this is used to extract cohort jsons)
      studyJsonList: (list) A list of lists per cohort with the cohort_name, cohort_id and cohort_json
      studyName: (string) A reference study name
      studyDescription: (string) A description of the study
      researcherName: (string) Name of the researcher who developed the study
      researcherEmail: (string) Email of the researcher who developed the study
      researcherOrg: (string) Organisation of the researcher who developed the study
      databaseName: (string) name of the database used to develop the model/s
      databaseAcronym: (string) acronym of the database used to develop the model/s
      databaseVersion: (int) Version of the database used to develop the model/s
      databaseDescription: (string) Description of the database used to develop the model/s
      databaseType: (string) Type of the database used to develop the model/s (e.g., claims)
      valDatabases: (list) A named list with details of the external validation databases. Needs to contain: name, description, version, type.
      resultLocation: (string) location of the directory where the main package results were saved
      resultPattern: (string) A string to match to select models of interest
      validationLocation: (string) location of the directory where the validation package results were saved
      addInternalValidation: (boolean) Whether the internal validation results should be uploaded
      addExternalValidation: (boolean) Whether the external validation results should be uploaded
      gsubVal: (string) Remove patterns from the result name
      removePattern: (string) Restrict to result names with this pattern

    Value
      Returns NULL but uploads all the results in resultLocation to the PatientLevelPrediction result tables in resultSchema

    Details
      This function can be used to upload PatientLevelPrediction results into a database
diff --git a/docs/reference/positiveLikelihoodRatio.html b/docs/reference/positiveLikelihoodRatio.html (deleted pkgdown reference page)

    Calculate the positiveLikelihoodRatio — positiveLikelihoodRatio

    positiveLikelihoodRatio(TP, TN, FN, FP)

    Arguments
      TP: Number of true positives
      TN: Number of true negatives
      FN: Number of false negatives
      FP: Number of false positives

    Value
      positiveLikelihoodRatio value
diff --git a/docs/reference/positivePredictiveValue.html b/docs/reference/positivePredictiveValue.html (deleted pkgdown reference page)

    Calculate the positivePredictiveValue — positivePredictiveValue

    positivePredictiveValue(TP, TN, FN, FP)

    Arguments
      TP: Number of true positives
      TN: Number of true negatives
      FN: Number of false negatives
      FP: Number of false positives

    Value
      positivePredictiveValue value
diff --git a/docs/reference/predictAndromeda.html b/docs/reference/predictAndromeda.html (deleted pkgdown reference page)

    Generate predictions from a regression model — predictAndromeda

    predictAndromeda(coefficients, population, covariateData, modelType = "logistic")

    Arguments
      coefficients: A named numeric vector where the names are the covariateIds, except for the first value which is expected to be the intercept
      population: A data frame containing the population to do the prediction for
      covariateData: An andromeda object containing the covariateData with predefined columns (see below)
      modelType: Current supported types are "logistic", "poisson", "cox" or "survival"

    Details
      These columns are expected in the outcome object:
        rowId (integer): Row ID is used to link multiple covariates (x) to a single outcome (y)
        time (real): For models that use time (e.g. Poisson or Cox regression) this contains time (e.g. number of days)
      These columns are expected in the covariates object:
        rowId (integer): Row ID is used to link multiple covariates (x) to a single outcome (y)
        covariateId (integer): A numeric identifier of a covariate
        covariateValue (real): The value of the specified covariate
diff --git a/docs/reference/predictCyclops.html b/docs/reference/predictCyclops.html (deleted pkgdown reference page)

    Create predictive probabilities — predictCyclops

    predictCyclops(plpModel, data, cohort)

    Arguments
      plpModel: An object of type predictiveModel as generated using fitPlp
      data: The new plpData containing the covariateData for the new population
      cohort: The cohort to calculate the prediction for

    Value
      The value column in the result data.frame is: logistic: probabilities of the outcome, poisson: Poisson rate (per day) of the outcome, survival: hazard rate (per day) of the outcome.

    Details
      Generates predictions for the population specified in plpData given the model.
diff --git a/docs/reference/predictFfdf.html b/docs/reference/predictFfdf.html (deleted pkgdown reference page)

    Generate predictions from a regression model — predictFfdf

    predictFfdf(coefficients, population, covariates, modelType = "logistic")

    Arguments
      coefficients: A named numeric vector where the names are the covariateIds, except for the first value which is expected to be the intercept
      population: A data frame containing the population to do the prediction for
      covariates: A data frame or ffdf object containing the covariates with predefined columns (see below)
      modelType: Current supported types are "logistic", "poisson", "cox" or "survival"

    Details
      These columns are expected in the outcome object:
        rowId (integer): Row ID is used to link multiple covariates (x) to a single outcome (y)
        time (real): For models that use time (e.g. Poisson or Cox regression) this contains time (e.g. number of days)
      These columns are expected in the covariates object:
        rowId (integer): Row ID is used to link multiple covariates (x) to a single outcome (y)
        covariateId (integer): A numeric identifier of a covariate
        covariateValue (real): The value of the specified covariate
diff --git a/docs/reference/predictPlp.html b/docs/reference/predictPlp.html (deleted pkgdown reference page)

    Predict the risk of the outcome using the input plpModel for the input plpData — predictPlp

    predictPlp(plpModel, plpData, population, timepoint)

    Arguments
      plpModel: An object of type plpModel - a patient level prediction model
      plpData: An object of type plpData - the patient level prediction data extracted from the CDM
      population: The population created using createStudyPopulation() who will have their risks predicted, or a cohort without the outcome known
      timepoint: The timepoint to predict risk (survival models only)

    Value
      A dataframe containing the prediction for each person in the population with an attribute metaData containing prediction details.

    Details
      The function applies the trained model to the plpData to make predictions
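    A hedged application sketch; newPlpData and newPopulation are assumed to come from getPlpData() and createStudyPopulation() on the target database, and the value column follows the description in the predict function pages.

      if (FALSE) {
        pred <- predictPlp(
          plpModel = plpResult$model,
          plpData = newPlpData,
          population = newPopulation
        )
        summary(pred$value)   # predicted risk per person
      }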
diff --git a/docs/reference/predictProbabilities.html b/docs/reference/predictProbabilities.html (deleted pkgdown reference page)

    Create predictive probabilities — predictProbabilities

    predictProbabilities(predictiveModel, population, covariateData)

    Arguments
      predictiveModel: An object of type predictiveModel as generated using fitPlp
      population: The population to calculate the prediction for
      covariateData: The covariateData containing the covariates for the population

    Value
      The value column in the result data.frame is: logistic: probabilities of the outcome, poisson: Poisson rate (per day) of the outcome, survival: hazard rate (per day) of the outcome.

    Details
      Generates predictions for the population specified in plpData given the model.
diff --git a/docs/reference/preprocessData.html b/docs/reference/preprocessData.html (deleted pkgdown reference page)

    A function that wraps around FeatureExtraction::tidyCovariateData to normalise the data and remove rare or redundant features — preprocessData

    preprocessData(covariateData, preprocessSettings)

    Arguments
      covariateData: The covariate part of the training data created by splitData after being sampled and having any required feature engineering
      preprocessSettings: The settings for the preprocessing created by createPreprocessSettings

    Value
      The data processed

    Details
      Returns an object of class covariateData that has been processed
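    A sketch of how preprocessing settings might be created and applied to the training covariate data; the createPreprocessSettings() argument names shown here (minFraction, normalize, removeRedundancy) are an assumption about the current API, and trainData is assumed to be the output of splitData().

      if (FALSE) {
        preprocessSettings <- createPreprocessSettings(
          minFraction = 0.001,       # drop covariates seen in fewer than 0.1% of people (assumed argument)
          normalize = TRUE,
          removeRedundancy = TRUE
        )
        trainData$covariateData <- preprocessData(
          covariateData = trainData$covariateData,
          preprocessSettings = preprocessSettings
        )
      }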
diff --git a/docs/reference/randomSplitter.html b/docs/reference/randomSplitter.html (deleted pkgdown reference page)

    Split data into random subsets stratified by class — randomSplitter

    randomSplitter(population, test = 0.3, train = NULL, nfold = 3, seed = NULL)

    Arguments
      population: An object created using createStudyPopulation()
      test: A real number between 0 and 1 indicating the test set fraction of the data
      train: A real number between 0 and 1 indicating the train set fraction of the data. If not set, train is equal to 1 - test
      nfold: An integer >= 1 specifying the number of folds used in cross validation
      seed: If set, a fixed seed is used; otherwise a random split is performed

    Value
      A dataframe containing the columns: rowId and index

    Details
      Returns a dataframe of rowIds and indexes with a -1 index indicating the rowId belongs to the
      test set and a positive integer index value indicating the rowId's cross validation fold within
      the train set.
diff --git a/docs/reference/recalibratePlp.html b/docs/reference/recalibratePlp.html (deleted pkgdown reference page)

    recalibratePlp — recalibratePlp

    Recalibrate the predictions of an existing model using the specified method.

    recalibratePlp(
      prediction,
      analysisId,
      typeColumn = "evaluationType",
      method = c("recalibrationInTheLarge", "weakRecalibration")
    )

    Arguments
      prediction: A prediction dataframe
      analysisId: The model analysisId
      typeColumn: The column name where the strata types are specified
      method: Method used to recalibrate ('recalibrationInTheLarge' or 'weakRecalibration')

    Value
      An object of class runPlp that is recalibrated on the new data
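    A hedged sketch of recalibrating predictions made on external data; plpResult is assumed to be an existing runPlp result and the analysisId is illustrative.

      if (FALSE) {
        recalibrated <- recalibratePlp(
          prediction = plpResult$prediction,
          analysisId = "Analysis_1",
          method = "weakRecalibration"
        )
      }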
diff --git a/docs/reference/recalibratePlpRefit.html b/docs/reference/recalibratePlpRefit.html (deleted pkgdown reference page)

    recalibratePlpRefit — recalibratePlpRefit

    Recalibrate an existing model by refitting it on new data.

    recalibratePlpRefit(plpModel, newPopulation, newData)

    Arguments
      plpModel: The trained plpModel (runPlp$model)
      newPopulation: The population created using createStudyPopulation() who will have their risks predicted
      newData: An object of type plpData - the patient level prediction data extracted from the CDM

    Value
      An object of class runPlp that is recalibrated on the new data
    - - - - - - - - diff --git a/docs/reference/registerParallelBackend.html b/docs/reference/registerParallelBackend.html deleted file mode 100644 index 4d84ea497..000000000 --- a/docs/reference/registerParallelBackend.html +++ /dev/null @@ -1,228 +0,0 @@ - - - - - - - - -registerParallelBackend — registerParallelBackend • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

    Registers a parallel backend for multi core processing. The -number of cores will be detected automatically, unless specified otherwise.

    -
    - -
    registerParallelBackend(cores = NULL, logical = TRUE)
    - -

    Arguments

    - - - - - - - - - - -
    cores

    the number of cores to use for multi core processing

    logical

    whether to consider logical or physical cores

    - - -

    Examples

    -
if (FALSE) {
  # detect logical cores automatically
  registerParallelBackend()

  # use four physical cores
  numCores <- 4
  registerParallelBackend(numCores, logical = FALSE)
}
    -
    - -
    - - -
    - - -
    -


    -
    - -
    -
    - - - - - - - - diff --git a/docs/reference/registerSequentialBackend.html b/docs/reference/registerSequentialBackend.html deleted file mode 100644 index 946d45c40..000000000 --- a/docs/reference/registerSequentialBackend.html +++ /dev/null @@ -1,212 +0,0 @@ - - - - - - - - -registerSequentialBackend — registerSequentialBackend • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

registerSequentialBackend registers a sequential backend for single core processing.

    -
    - -
    registerSequentialBackend()
    - - - -

    Examples

    -
if (FALSE) {
  # register a sequential backend
  registerSequentialBackend()
}
    -
    - -
    - - -
    - - -
    -


    -
    - -
    -
    - - - - - - - - diff --git a/docs/reference/runEnsembleModel.html b/docs/reference/runEnsembleModel.html deleted file mode 100644 index 5085dccb2..000000000 --- a/docs/reference/runEnsembleModel.html +++ /dev/null @@ -1,333 +0,0 @@ - - - - - - - - -ensemble - Create an ensembling model using different models — runEnsembleModel • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

Create an ensemble model by combining several base prediction models.

    -
    - -
    runEnsembleModel(
    -  population,
    -  dataList,
    -  modelList,
    -  testSplit = "time",
    -  testFraction = 0.2,
    -  stackerUseCV = TRUE,
    -  splitSeed = NULL,
    -  nfold = 3,
    -  saveDirectory = NULL,
    -  saveEnsemble = F,
    -  savePlpData = F,
    -  savePlpResult = F,
    -  savePlpPlots = F,
    -  saveEvaluation = F,
    -  analysisId = NULL,
    -  verbosity = "INFO",
    -  ensembleStrategy = "mean",
    -  cores = NULL
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    population

    The population created using createStudyPopulation() who will be used to -develop the model

    dataList

    An list of object of type plpData - the patient level prediction -data extracted from the CDM.

    modelList

A list of base model settings, each created using one of the set functions in this package; the base models can be any model implemented in this package.

    testSplit

Either 'person' or 'time' specifying the type of evaluation used. 'time' finds the date at which testFraction of patients had an index after that date and assigns patients with an index prior to this date to the training set and those after it to the test set. 'person' splits the data into a test set (1-testFraction of the data) and a train set, stratified by the class label.

    testFraction

    The fraction of the data to be used as the test set in the patient split -evaluation.

    stackerUseCV

    When doing stacking you can either use the train CV predictions to train the stacker (TRUE) or leave 20 percent of the data to train the stacker

    splitSeed

    The seed used to split the test/train set when using a person type -testSplit

    nfold

    The number of folds used in the cross validation (default 3)

    saveDirectory

    The path to the directory where the results will be saved (if NULL uses working directory)

    saveEnsemble

    Binary indicating whether to save the ensemble

    savePlpData

    Binary indicating whether to save the plpData object (default is F)

    savePlpResult

    Binary indicating whether to save the object returned by runPlp (default is F)

    savePlpPlots

    Binary indicating whether to save the performance plots as pdf files (default is F)

    saveEvaluation

Binary indicating whether to save the performance as csv files (default is T)

    analysisId

    The analysis ID

    verbosity

Sets the level of the verbosity. If the log level is at or higher in priority than the logger threshold, a message will print. The levels are:
• DEBUG: Highest verbosity, showing all debug statements
• TRACE: Showing information about the start and end of steps
• INFO: Show informative messages (Default)
• WARN: Show warning messages
• ERROR: Show error messages
• FATAL: Be silent except for fatal errors
    ensembleStrategy

The strategy used for ensembling the outputs from the different models; it can be 'mean', 'product', 'weighted' or 'stacked'. 'mean' takes the average probability from the different models, 'product' applies the product rule, 'weighted' takes the weighted average probability from the different models using the train AUC as weights, and 'stacked' trains a logistic regression on top of the different models.

    cores

    The number of cores to use when training the ensemble

    - -

    Details

    - -

This function applies a list of models and combines them into an ensemble model
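A hedged sketch of an ensemble run following the usage above; the `population` and `plpData` objects are assumed to exist already, and the chosen settings and folder are illustrative:

  ensemble <- runEnsembleModel(
    population = population,               # from createStudyPopulation()
    dataList = list(plpData, plpData),     # one plpData object per base model
    modelList = list(
      setLassoLogisticRegression(),
      setGradientBoostingMachine()
    ),
    testSplit = "person",
    testFraction = 0.2,
    nfold = 3,
    saveDirectory = "./ensembleOutput",
    analysisId = "Ensemble_1",
    ensembleStrategy = "stacked"
  )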

    - -
    - -
    - - -
    - - -
    -


    -
    - -
    -
    - - - - - - - - diff --git a/docs/reference/runMultiplePlp.html b/docs/reference/runMultiplePlp.html deleted file mode 100644 index 2ed6463a2..000000000 --- a/docs/reference/runMultiplePlp.html +++ /dev/null @@ -1,277 +0,0 @@ - - - - - - - - -Run a list of predictions analyses — runMultiplePlp • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

    Run a list of predictions analyses

    -
    - -
    runMultiplePlp(
    -  databaseDetails = createDatabaseDetails(),
    -  modelDesignList = list(createModelDesign(targetId = 1, outcomeId = 2, modelSettings =
    -    setLassoLogisticRegression()), createModelDesign(targetId = 1, outcomeId = 3,
    -    modelSettings = setLassoLogisticRegression())),
    -  onlyFetchData = F,
    -  splitSettings = createDefaultSplitSetting(type = "stratified", testFraction = 0.25,
    -    trainFraction = 0.75, splitSeed = 123, nfold = 3),
    -  cohortDefinitions = NULL,
    -  logSettings = createLogSettings(verbosity = "DEBUG", timeStamp = T, logName =
    -    "runPlp Log"),
    -  saveDirectory = getwd()
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    databaseDetails

    The database settings created using createDatabaseDetails()

    modelDesignList

    A list of model designs created using createModelDesign()

    onlyFetchData

    Only fetches and saves the data object to the output folder without running the analysis.

    splitSettings

    The train/validation/test splitting used by all analyses created using createDefaultSplitSetting()

    cohortDefinitions

    A list of cohort definitions for the target and outcome cohorts

    logSettings

The settings specifying the logging for the analyses, created using createLogSettings()

    saveDirectory

Name of the folder where all the outputs will be written to.

    - -

    Value

    - -

    A data frame with the following columns:

    - - - - - - -
analysisId: The unique identifier for a set of analysis choices.
cohortId: The ID of the target cohort population.
outcomeId: The ID of the outcome.
dataLocation: The location where the plpData was saved.
evaluationFolder: The name of the file containing the evaluation saved as a csv.
the settings ids: The ids for all other settings used for model development.
    - - -

    Details

    - -

This function will run all prediction analyses specified in the modelDesignList against the database specified in databaseDetails.
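A minimal sketch of a multiple-analysis run, assuming a `databaseDetails` object has already been created with createDatabaseDetails(); the target/outcome IDs and the save folder are illustrative:

  results <- runMultiplePlp(
    databaseDetails = databaseDetails,   # assumed: createDatabaseDetails(...)
    modelDesignList = list(
      createModelDesign(targetId = 1, outcomeId = 2, modelSettings = setLassoLogisticRegression()),
      createModelDesign(targetId = 1, outcomeId = 3, modelSettings = setGradientBoostingMachine())
    ),
    logSettings = createLogSettings(verbosity = "INFO", timeStamp = TRUE, logName = "multiPlp Log"),
    saveDirectory = "./PlpMultiOutput"
  )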

    - -
    - -
    - - -
    - - -
    -


    -
    - -
    -
    - - - - - - - - diff --git a/docs/reference/runPlp.html b/docs/reference/runPlp.html deleted file mode 100644 index d3a6a999e..000000000 --- a/docs/reference/runPlp.html +++ /dev/null @@ -1,337 +0,0 @@ - - - - - - - - -runPlp - Develop and internally evaluate a model using specified settings — runPlp • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

This provides a general framework for training patient level prediction models. The user can select various default feature selection methods or incorporate their own. The user can also select from a range of default classifiers or incorporate their own. There are three types of evaluation for the model: patient (randomly splits people into train/validation sets), year (splits the data into train/validation sets based on index year, with older data in training and newer data in validation) or both (same as the year split, but checks that there are no overlapping patients between the training and validation sets; any overlaps are removed from the validation set).

    -
    - -
    runPlp(
    -  plpData,
    -  outcomeId = plpData$metaData$call$outcomeIds[1],
    -  analysisId = paste(Sys.Date(), plpData$metaData$call$outcomeIds[1], sep = "-"),
    -  analysisName = "Study details",
    -  populationSettings = createStudyPopulationSettings(),
    -  splitSettings = createDefaultSplitSetting(type = "stratified", testFraction = 0.25,
    -    trainFraction = 0.75, splitSeed = 123, nfold = 3),
    -  sampleSettings = createSampleSettings(type = "none"),
    -  featureEngineeringSettings = createFeatureEngineeringSettings(type = "none"),
    -  preprocessSettings = createPreprocessSettings(minFraction = 0.001, normalize = T),
    -  modelSettings = setLassoLogisticRegression(),
    -  logSettings = createLogSettings(verbosity = "DEBUG", timeStamp = T, logName =
    -    "runPlp Log"),
    -  executeSettings = createDefaultExecuteSettings(),
    -  saveDirectory = getwd()
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    plpData

    An object of type plpData - the patient level prediction -data extracted from the CDM.

    outcomeId

    (integer) The ID of the outcome.

    analysisId

    (integer) Identifier for the analysis. It is used to create, e.g., the result folder. Default is a timestamp.

    analysisName

    (character) Name for the analysis

    populationSettings

An object of type populationSettings created using createStudyPopulationSettings that specifies how the data class labels are defined and, additionally, any exclusions to apply to the plpData cohort

    splitSettings

    An object of type splitSettings that specifies how to split the data into train/validation/test. -The default settings can be created using createDefaultSplitSetting.

    sampleSettings

    An object of type sampleSettings that specifies any under/over sampling to be done. -The default is none.

    featureEngineeringSettings

    An object of featureEngineeringSettings specifying any feature engineering to be learned (using the train data)

    preprocessSettings

    An object of preprocessSettings. This setting specifies the minimum fraction of -target population who must have a covariate for it to be included in the model training -and whether to normalise the covariates before training

    modelSettings

An object of class modelSettings created using one of the following functions:
• setLassoLogisticRegression() A lasso logistic regression model
• setGradientBoostingMachine() A gradient boosting machine
• setAdaBoost() An ada boost model
• setRandomForest() A random forest model
• setDecisionTree() A decision tree model
• setCovNN() A convolutional neural network model
• setCIReNN() A recurrent neural network model
• setMLP() A neural network model
• setDeepNN() A deep neural network model
• setKNN() A KNN model
    logSettings

    An object of logSettings created using createLogSettings -specifying how the logging is done

    executeSettings

    An object of executeSettings specifying which parts of the analysis to run

    saveDirectory

    The path to the directory where the results will be saved (if NULL uses working directory)

    - -

    Value

    - -

    An object containing the following:

    -

      -
• inputSettings A list containing all the settings used to develop the model
• model The developed model of class plpModel
• executionSummary A list containing the hardware details, R package details and execution time
• performanceEvaluation Various internal performance metrics in sparse format
• prediction The plpData cohort table with the predicted risks added as a column (named value)
• covariateSummary A characterization of the features for patients with and without the outcome during the time at risk
• analysisRef A list with details about the analysis
    - -

    Details

    - -

This function takes as input the plpData extracted from an OMOP CDM database and follows the specified settings to develop and internally validate a model for the specified outcomeId.
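A hedged, minimal sketch of a runPlp call following the usage above; a `plpData` object is assumed to exist already (e.g. extracted via getPlpData), and the IDs, seed and save folder are illustrative:

  plpResult <- runPlp(
    plpData = plpData,                     # assumed to exist
    outcomeId = 2,
    analysisId = "runPlpDemo",
    analysisName = "Demonstration of runPlp",
    populationSettings = createStudyPopulationSettings(),
    splitSettings = createDefaultSplitSetting(testFraction = 0.25, nfold = 3, splitSeed = 42),
    modelSettings = setLassoLogisticRegression(),
    saveDirectory = "./plpOutput"
  )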

    - -

    Examples

    -
    
    -  
    - -
    - - -
    - - -
    -


    -
    - -
    -
    - - - - - - - - diff --git a/docs/reference/runPlpAnalyses.html b/docs/reference/runPlpAnalyses.html deleted file mode 100644 index 4ab0d4ed9..000000000 --- a/docs/reference/runPlpAnalyses.html +++ /dev/null @@ -1,383 +0,0 @@ - - - - - - - - -Run a list of predictions — runPlpAnalyses • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

    Run a list of predictions

    -
    - -
    runPlpAnalyses(
    -  connectionDetails,
    -  cdmDatabaseSchema,
    -  cdmDatabaseName,
    -  oracleTempSchema = cdmDatabaseSchema,
    -  cohortDatabaseSchema = cdmDatabaseSchema,
    -  cohortTable = "cohort",
    -  outcomeDatabaseSchema = cdmDatabaseSchema,
    -  outcomeTable = "cohort",
    -  cdmVersion = 5,
    -  onlyFetchData = FALSE,
    -  outputFolder = "./PlpOutput",
    -  modelAnalysisList,
    -  cohortIds,
    -  cohortNames,
    -  outcomeIds,
    -  outcomeNames,
    -  washoutPeriod = 0,
    -  maxSampleSize = NULL,
    -  minCovariateFraction = 0,
    -  normalizeData = T,
    -  testSplit = "person",
    -  testFraction = 0.25,
    -  splitSeed = NULL,
    -  nfold = 3,
    -  verbosity = "INFO",
    -  settings = NULL
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    connectionDetails

    An R object of type connectionDetails created using the -function createConnectionDetails in the -DatabaseConnector package.

    cdmDatabaseSchema

The name of the database schema that contains the OMOP CDM instance. Requires read permissions to this database. On SQL Server, this should specify both the database and the schema, for example 'cdm_instance.dbo'.

    cdmDatabaseName

    A string with a shareable name of the database (this will be shown to OHDSI researchers if the results get transported)

    oracleTempSchema

    For Oracle only: the name of the database schema where you -want all temporary tables to be managed. Requires -create/insert permissions to this database.

    cohortDatabaseSchema

    The name of the database schema that is the location where the -target cohorts are available. Requires read -permissions to this database.

    cohortTable

    The tablename that contains the target cohorts. Expectation is cohortTable -has format of COHORT table: COHORT_DEFINITION_ID, SUBJECT_ID, -COHORT_START_DATE, COHORT_END_DATE.

    outcomeDatabaseSchema

    The name of the database schema that is the location where the -data used to define the outcome cohorts is available. Requires read permissions to -this database.

    outcomeTable

    The tablename that contains the outcome cohorts. Expectation is -outcomeTable has format of COHORT table: COHORT_DEFINITION_ID, -SUBJECT_ID, COHORT_START_DATE, COHORT_END_DATE.

    cdmVersion

    Define the OMOP CDM version used: currently support "4" and -"5".

    onlyFetchData

    Only fetches and saves the data object to the output folder without running the analysis.

    outputFolder

Name of the folder where all the outputs will be written to.

    modelAnalysisList

    A list of objects of type modelSettings as created using -the createPlpModelSettings function.

    cohortIds

    A vector of cohortIds that specify all the target cohorts

    cohortNames

    A vector of cohortNames corresponding to the cohortIds

    outcomeIds

    A vector of outcomeIds that specify all the outcome cohorts

    outcomeNames

    A vector of outcomeNames corresponding to the outcomeIds

    washoutPeriod

    Minimum number of prior observation days

    maxSampleSize

    Max number of target people to sample from to develop models

    minCovariateFraction

Any covariate with an incidence less than this value is ignored

    normalizeData

    Whether to normalize the covariates

    testSplit

    How to split into test/train (time or person)

    testFraction

    Fraction of data to use as test set

    splitSeed

    The seed used for the randomization into test/train

    nfold

    Number of folds used to do cross validation

    verbosity

    The logging level

    settings

    Specify the T, O, population, covariate and model settings

    - -

    Value

    - -

    A data frame with the following columns:

    - - - - - - - -
analysisId: The unique identifier for a set of analysis choices.
cohortId: The ID of the target cohort population.
outcomeId: The ID of the outcome.
plpDataFolder: The location where the plpData was saved.
studyPopFile: The name of the file containing the study population.
evaluationFolder: The name of the file containing the evaluation saved as a csv.
modelFolder: The name of the file containing the developed model.
    - - -

    Details

    - -

Run a list of predictions for the target cohorts and outcomes of interest. This function will run all specified predictions, meaning that the total number of outcome models is `length(cohortIds) * length(outcomeIds) * length(modelAnalysisList)`.
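A hedged sketch of a call following the usage above; `connectionDetails` (from DatabaseConnector::createConnectionDetails) and `modelAnalysisList` (from createPlpModelSettings) are assumed to exist, and the schema, table, cohort and folder values are placeholders:

  allResults <- runPlpAnalyses(
    connectionDetails = connectionDetails,   # assumed to exist
    cdmDatabaseSchema = "cdm_schema.dbo",
    cdmDatabaseName = "my_cdm",
    cohortDatabaseSchema = "results_schema.dbo",
    cohortTable = "cohort",
    outcomeDatabaseSchema = "results_schema.dbo",
    outcomeTable = "cohort",
    outputFolder = "./PlpOutput",
    modelAnalysisList = modelAnalysisList,   # assumed: createPlpModelSettings(...)
    cohortIds = c(1),
    cohortNames = c("target cohort"),
    outcomeIds = c(2),
    outcomeNames = c("outcome cohort"),
    testSplit = "person",
    testFraction = 0.25,
    nfold = 3
  )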

    - -
    - -
    - - -
    - - -
    -


    -
    - -
    -
    - - - - - - - - diff --git a/docs/reference/saveEnsemblePlpModel.html b/docs/reference/saveEnsemblePlpModel.html deleted file mode 100644 index 18b08bff5..000000000 --- a/docs/reference/saveEnsemblePlpModel.html +++ /dev/null @@ -1,226 +0,0 @@ - - - - - - - - -saves the Ensmeble plp model — saveEnsemblePlpModel • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

Saves the ensemble plp model

    -
    - -
    saveEnsemblePlpModel(ensembleModel, dirPath)
    - -

    Arguments

    - - - - - - - - - - -
    ensembleModel

    The ensemble model to save

    dirPath

    The location to save the model

    - -

    Details

    - -

    Saves a plp ensemble model
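A one-line sketch, assuming `ensembleModel` is the ensemble model object produced by the ensemble training step (the folder path is illustrative):

  saveEnsemblePlpModel(ensembleModel = ensembleModel, dirPath = "./ensembleModel")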

    - -
    - -
    - - -
    - - -
    -


    -
    - -
    -
    - - - - - - - - diff --git a/docs/reference/saveEnsemblePlpResult.html b/docs/reference/saveEnsemblePlpResult.html deleted file mode 100644 index 323846315..000000000 --- a/docs/reference/saveEnsemblePlpResult.html +++ /dev/null @@ -1,226 +0,0 @@ - - - - - - - - -saves the Ensemble plp results — saveEnsemblePlpResult • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

    saves the Ensemble plp results

    -
    - -
    saveEnsemblePlpResult(ensembleResult, dirPath)
    - -

    Arguments

    - - - - - - - - - - -
    ensembleResult

    The ensemble result

    dirPath

    The location to save the ensemble results

    - -

    Details

    - -

Saves the plp ensemble results
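A one-line sketch, assuming `ensembleResult` is the object returned by the ensemble run (the folder path is illustrative):

  saveEnsemblePlpResult(ensembleResult = ensembleResult, dirPath = "./ensembleResult")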

    - -
    - -
    - - -
    - - -
    -


    -
    - -
    -
    - - - - - - - - diff --git a/docs/reference/savePlpAnalysesJson.html b/docs/reference/savePlpAnalysesJson.html deleted file mode 100644 index d5d6cc2aa..000000000 --- a/docs/reference/savePlpAnalysesJson.html +++ /dev/null @@ -1,249 +0,0 @@ - - - - - - - - -Save the modelDesignList to a json file — savePlpAnalysesJson • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

    Save the modelDesignList to a json file

    -
    - -
    savePlpAnalysesJson(
    -  modelDesignList = list(createModelDesign(targetId = 1, outcomeId = 2, modelSettings =
    -    setLassoLogisticRegression()), createModelDesign(targetId = 1, outcomeId = 3,
    -    modelSettings = setLassoLogisticRegression())),
    -  saveDirectory = NULL
    -)
    - -

    Arguments

    - - - - - - - - - - -
    modelDesignList

    A list of modelDesigns created using createModelDesign()

    saveDirectory

    The directory to save the modelDesignList settings

    - -

    Details

    - -

    This function creates a json file with the modelDesignList saved

    - -

    Examples

    -
if (FALSE) {
  savePlpAnalysesJson(
    modelDesignList = list(
      createModelDesign(targetId = 1, outcomeId = 2, modelSettings = setLassoLogisticRegression()),
      createModelDesign(targetId = 1, outcomeId = 3, modelSettings = setLassoLogisticRegression())
    ),
    saveDirectory = 'C:/bestModels'
  )
}
    -
    - -
    - - -
    - - -
    -


    -
    - -
    -
    - - - - - - - - diff --git a/docs/reference/savePlpData.html b/docs/reference/savePlpData.html deleted file mode 100644 index e31608d82..000000000 --- a/docs/reference/savePlpData.html +++ /dev/null @@ -1,246 +0,0 @@ - - - - - - - - -Save the cohort data to folder — savePlpData • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

savePlpData saves an object of type plpData to a folder.

    -
    - -
    savePlpData(plpData, file, envir = NULL, overwrite = F)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - -
    plpData

    An object of type plpData as generated using -getPlpData.

    file

    The name of the folder where the data will be written. The folder should -not yet exist.

    envir

The environment in which to evaluate variables when saving

    overwrite

    Whether to force overwrite an existing file

    - -

    Details

    - -

    The data will be written to a set of files in the folder specified by the user.
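A short sketch, assuming `plpData` was extracted earlier with getPlpData; the folder name is illustrative and must not already exist:

  savePlpData(plpData = plpData, file = "./plpDataFolder")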

    - -

    Examples

    -
    # todo - -
    -
    - -
    - - -
    - - -
    -


    -
    - -
    -
    - - - - - - - - diff --git a/docs/reference/savePlpModel.html b/docs/reference/savePlpModel.html deleted file mode 100644 index 16ae57e6c..000000000 --- a/docs/reference/savePlpModel.html +++ /dev/null @@ -1,232 +0,0 @@ - - - - - - - - -Saves the plp model — savePlpModel • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

    Saves the plp model

    -
    - -
    savePlpModel(plpModel, dirPath)
    - -

    Arguments

    - - - - - - - - - - -
    plpModel

    A trained classifier returned by running runPlp()$model

    dirPath

    A location to save the model to

    - -

    Details

    - -

Saves the plp model to a user specified folder
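A one-line sketch, assuming `plpResult` is the object returned by runPlp (the folder path is illustrative):

  savePlpModel(plpModel = plpResult$model, dirPath = "./model")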

    - -
    - -
    - - -
    - - -
    -


    -
    - -
    -
    - - - - - - - - diff --git a/docs/reference/savePlpResult.html b/docs/reference/savePlpResult.html deleted file mode 100644 index 55990afa7..000000000 --- a/docs/reference/savePlpResult.html +++ /dev/null @@ -1,232 +0,0 @@ - - - - - - - - -Saves the result from runPlp into the location directory — savePlpResult • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

    Saves the result from runPlp into the location directory

    -
    - -
    savePlpResult(result, dirPath)
    - -

    Arguments

    - - - - - - - - - - -
    result

    The result of running runPlp()

    dirPath

The directory to save the result to

    - -

    Details

    - -

    Saves the result from runPlp into the location directory
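A one-line sketch, assuming `plpResult` is the object returned by runPlp (the folder path is illustrative):

  savePlpResult(result = plpResult, dirPath = "./plpResult")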

    - -
    - -
    - - -
    - - -
    -


    -
    - -
    -
    - - - - - - - - diff --git a/docs/reference/savePlpShareable.html b/docs/reference/savePlpShareable.html deleted file mode 100644 index 90e5e99e2..000000000 --- a/docs/reference/savePlpShareable.html +++ /dev/null @@ -1,236 +0,0 @@ - - - - - - - - -Save the plp result as json files and csv files for transparent sharing — savePlpShareable • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

    Save the plp result as json files and csv files for transparent sharing

    -
    - -
    savePlpShareable(result, saveDirectory, minCellCount = 10)
    - -

    Arguments

    - - - - - - - - - - - - - - -
    result

    An object of class runPlp with development or validation results

    saveDirectory

The directory to save the result json and csv files to

    minCellCount

    Minimum cell count for the covariateSummary and certain evaluation results

    - -

    Details

    - -

    Saves the main results json/csv files (these files can be read by the shiny app)
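A short sketch, assuming `plpResult` is an object of class runPlp (the folder path is illustrative):

  savePlpShareable(result = plpResult, saveDirectory = "./shareableResults", minCellCount = 10)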

    - -
    - -
    - - -
    - - -
    -


    -
    - -
    -
    - - - - - - - - diff --git a/docs/reference/savePlpToCsv.html b/docs/reference/savePlpToCsv.html deleted file mode 100644 index 367a94073..000000000 --- a/docs/reference/savePlpToCsv.html +++ /dev/null @@ -1,226 +0,0 @@ - - - - - - - - -Save parts of the plp result as a csv for transparent sharing — savePlpToCsv • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

    Save parts of the plp result as a csv for transparent sharing

    -
    - -
    savePlpToCsv(result, dirPath)
    - -

    Arguments

    - - - - - - - - - - -
    result

    An object of class runPlp with development or validation results

    dirPath

The directory to save the result csv files to

    - -

    Details

    - -

    Saves the main results as a csv (these files can be read by the shiny app)
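A one-line sketch, assuming `plpResult` is an object of class runPlp (the folder path is illustrative):

  savePlpToCsv(result = plpResult, dirPath = "./csvResults")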

    - -
    - -
    - - -
    - - -
    -


    -
    - -
    -
    - - - - - - - - diff --git a/docs/reference/savePrediction.html b/docs/reference/savePrediction.html deleted file mode 100644 index c6b104576..000000000 --- a/docs/reference/savePrediction.html +++ /dev/null @@ -1,236 +0,0 @@ - - - - - - - - -Saves the prediction dataframe to RDS — savePrediction • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

    Saves the prediction dataframe to RDS

    -
    - -
    savePrediction(prediction, dirPath, fileName = "prediction.rds")
    - -

    Arguments

    - - - - - - - - - - - - - - -
    prediction

The prediction data.frame

    dirPath

    The directory to save the prediction RDS

    fileName

    The name of the RDS file that will be saved in dirPath

    - -

    Details

    - -

    Saves the prediction data frame returned by predict.R to an RDS file and returns the fileLocation where the prediction is saved
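A one-line sketch, assuming `plpResult` is the object returned by runPlp (folder and file names are illustrative):

  savePrediction(prediction = plpResult$prediction, dirPath = "./predictions", fileName = "prediction.rds")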

    - -
    - -
    - - -
    - - -
    -


    -
    - -
    -
    - - - - - - - - diff --git a/docs/reference/savePredictionAnalysisList.html b/docs/reference/savePredictionAnalysisList.html deleted file mode 100644 index 05ec5e435..000000000 --- a/docs/reference/savePredictionAnalysisList.html +++ /dev/null @@ -1,295 +0,0 @@ - - - - - - - - -Saves a json prediction settings given R settings — savePredictionAnalysisList • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

    Saves a json prediction settings given R settings

    -
    - -
    savePredictionAnalysisList(
    -  workFolder = "inst/settings",
    -  cohortIds,
    -  outcomeIds,
    -  cohortSettingCsv = file.path(workFolder, "CohortsToCreate.csv"),
    -  covariateSettingList,
    -  populationSettingList,
    -  modelSettingList,
    -  maxSampleSize = NULL,
    -  washoutPeriod = 0,
    -  minCovariateFraction = 0,
    -  normalizeData = T,
    -  testSplit = "person",
    -  testFraction = 0.25,
    -  splitSeed = 1,
    -  nfold = 3
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    workFolder

    Location to save json specification

    cohortIds

    Vector of target population cohort ids

    outcomeIds

    Vector of outcome cohort ids

    cohortSettingCsv

    The location to the csv containing the cohort details

    covariateSettingList

    A list of covariate settings

    populationSettingList

    A list of population settings

    modelSettingList

    A list of model settings

    maxSampleSize

    If not NULL then max number of target population to sample for model training

    washoutPeriod

    Minimum prior observation for each person in target pop to be included

    minCovariateFraction

    Minimum covariate fraction to include

    normalizeData

    Whether to normalise data

    testSplit

    Split by person or time

    testFraction

Fraction of data to use for the test set

    splitSeed

    Seed used in test split

    nfold

    Number of folds used when training model

    - -

    Details

    - -

This function takes the prediction settings specified in R and saves them as a json specification that can be combined with connection settings to run a multiple prediction study.

    - -
    - -
    - - -
    - - -
    -


    -
    - -
    -
    - - - - - - - - diff --git a/docs/reference/sensitivity.html b/docs/reference/sensitivity.html deleted file mode 100644 index 8d96e3d1d..000000000 --- a/docs/reference/sensitivity.html +++ /dev/null @@ -1,243 +0,0 @@ - - - - - - - - -Calculate the sensitivity — sensitivity • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

    Calculate the sensitivity

    -
    - -
    sensitivity(TP, TN, FN, FP)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - -
    TP

    Number of true positives

    TN

    Number of true negatives

    FN

    Number of false negatives

    FP

    Number of false positives

    - -

    Value

    - -

    sensitivity value

    -

    Details

    - -

    Calculate the sensitivity
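Sensitivity is conventionally defined as TP / (TP + FN); a short illustrative call under that assumption (the counts are made up):

  # 90 true positives and 10 false negatives should give a sensitivity of 0.9
  sensitivity(TP = 90, TN = 800, FN = 10, FP = 100)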

    - -
    - -
    - - -
    - - -
    -


    -
    - -
    -
    - - - - - - - - diff --git a/docs/reference/setAdaBoost.html b/docs/reference/setAdaBoost.html deleted file mode 100644 index ddc12d5d6..000000000 --- a/docs/reference/setAdaBoost.html +++ /dev/null @@ -1,250 +0,0 @@ - - - - - - - - -Create setting for AdaBoost with python DecisionTreeClassifier base estimator — setAdaBoost • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

    Create setting for AdaBoost with python DecisionTreeClassifier base estimator

    -
    - -
    setAdaBoost(
    -  nEstimators = list(10, 50, 200),
    -  learningRate = list(1, 0.5, 0.1),
    -  algorithm = list("SAMME.R"),
    -  seed = sample(1e+06, 1)
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - -
    nEstimators

    (list) The maximum number of estimators at which boosting is terminated. In case of perfect fit, the learning procedure is stopped early.

    learningRate

(list) Weight applied to each classifier at each boosting iteration. A higher learning rate increases the contribution of each classifier. There is a trade-off between the learningRate and nEstimators parameters.

    algorithm

    (list) If ‘SAMME.R’ then use the SAMME.R real boosting algorithm. base_estimator must support calculation of class probabilities. If ‘SAMME’ then use the SAMME discrete boosting algorithm. The SAMME.R algorithm typically converges faster than SAMME, achieving a lower test error with fewer boosting iterations.

    seed

    A seed for the model

    - - -

    Examples

    -
if (FALSE) {
  model.adaBoost <- setAdaBoost(
    nEstimators = list(10, 50, 200),
    learningRate = list(1, 0.5, 0.1),
    algorithm = list('SAMME.R'),
    seed = sample(1000000, 1)
  )
}
    -
    - -
    - - -
    - - -
    -


    -
    - -
    -
    - - - - - - - - diff --git a/docs/reference/setCIReNN.html b/docs/reference/setCIReNN.html deleted file mode 100644 index f62ecff7e..000000000 --- a/docs/reference/setCIReNN.html +++ /dev/null @@ -1,346 +0,0 @@ - - - - - - - - -Create setting for CIReNN model — setCIReNN • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

    Create setting for CIReNN model

    -
    - -
    setCIReNN(
    -  numberOfRNNLayer = c(1),
    -  units = c(128, 64),
    -  recurrentDropout = c(0.2),
    -  layerDropout = c(0.2),
    -  lr = c(1e-04),
    -  decay = c(1e-05),
    -  outcomeWeight = c(0),
    -  batchSize = c(100),
    -  epochs = c(100),
    -  earlyStoppingMinDelta = c(1e-04),
    -  earlyStoppingPatience = c(10),
    -  bayes = T,
    -  useDeepEnsemble = F,
    -  numberOfEnsembleNetwork = 5,
    -  useVae = T,
    -  vaeDataSamplingProportion = 0.1,
    -  vaeValidationSplit = 0.2,
    -  vaeBatchSize = 100L,
    -  vaeLatentDim = 10L,
    -  vaeIntermediateDim = 256L,
    -  vaeEpoch = 100L,
    -  vaeEpislonStd = 1,
    -  useGPU = FALSE,
    -  maxGPUs = 2,
    -  seed = 1234
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    numberOfRNNLayer

    The number of RNN layer, only 1, 2, or 3 layers available now. eg. 1, c(1,2), c(1,2,3)

    units

    The number of units of RNN layer - as a list of vectors

    recurrentDropout

The recurrent dropout rate (regularisation)

    layerDropout

    The layer dropout rate (regularisation)

    lr

    Learning rate

    decay

    Learning rate decay over each update.

    outcomeWeight

    The weight of the outcome class in the loss function. Default is 0, which will be replaced by balanced weight.

    batchSize

    The number of data points to use per training batch

    epochs

    Number of times to iterate over dataset

    earlyStoppingMinDelta

    minimum change in the monitored quantity to qualify as an improvement for early stopping, i.e. an absolute change of less than min_delta in loss of validation data, will count as no improvement.

    earlyStoppingPatience

    Number of epochs with no improvement after which training will be stopped.

    bayes

    logical (either TRUE or FALSE) value for using Bayesian Drop Out Layer to measure uncertainty. If it is TRUE, both Epistemic and Aleatoric uncertainty will be measured through Bayesian Drop Out layer

    useDeepEnsemble

logical (either TRUE or FALSE) value for using Deep Ensemble (Lakshminarayanan et al., 2017) to measure uncertainty. It cannot be used together with Bayesian deep learning.

    numberOfEnsembleNetwork

    Integer. Number of network used for Deep Ensemble (Lakshminarayanan et al recommended 5).

    useVae

    logical (either TRUE or FALSE) value for using Variational AutoEncoder before RNN

    vaeDataSamplingProportion

    Data sampling proportion for VAE

    vaeValidationSplit

    Validation split proportion for VAE

    vaeBatchSize

    batch size for VAE

    vaeLatentDim

Number of latent dimensions for VAE

    vaeIntermediateDim

Number of intermediate dimensions for VAE

    vaeEpoch

Number of times to iterate over the dataset for VAE

    vaeEpislonStd

    Epsilon

    useGPU

    logical (either TRUE or FALSE) value. If you have GPUs in your machine, and want to use multiple GPU for deep learning, set this value as TRUE

    maxGPUs

    Integer, If you will use GPU, how many GPUs will be used for deep learning in VAE? GPU parallelisation for deep learning will be activated only when parallel vae is true. Integer >= 2 or list of integers, number of GPUs or list of GPU IDs on which to create model replicas.

    seed

    Random seed used by deep learning model

    - - -

    Examples

    -
if (FALSE) { model.CIReNN <- setCIReNN() }
    -
    - -
    - - -
    - - -
    -


    -
    - -
    -
    - - - - - - - - diff --git a/docs/reference/setCNNTorch.html b/docs/reference/setCNNTorch.html deleted file mode 100644 index 798867100..000000000 --- a/docs/reference/setCNNTorch.html +++ /dev/null @@ -1,249 +0,0 @@ - - - - - - - - -Create setting for CNN model with python — setCNNTorch • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

    Create setting for CNN model with python

    -
    - -
    setCNNTorch(
    -  nbfilters = c(16, 32),
    -  epochs = c(20, 50),
    -  seed = 0,
    -  class_weight = 0,
    -  type = "CNN"
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - -
    nbfilters

    The number of filters

    epochs

    The number of epochs

    seed

    A seed for the model

    class_weight

The class weight used for imbalanced data: 0 = inverse ratio between positives and negatives, -1 = focal loss

    type

It can be a plain 'CNN', 'CNN_LSTM', 'CNN_MLF' (multiple kernels with different kernel sizes), 'CNN_MIX', 'ResNet' or 'CNN_MULTI'

    - - -

    Examples

    -
if (FALSE) { model.cnnTorch <- setCNNTorch() }
    -
    - -
    - - -
    - - -
    -


    -
    - -
    -
    - - - - - - - - diff --git a/docs/reference/setCompetingRiskModel.html b/docs/reference/setCompetingRiskModel.html deleted file mode 100644 index 0c452ef69..000000000 --- a/docs/reference/setCompetingRiskModel.html +++ /dev/null @@ -1,222 +0,0 @@ - - - - - - - - -Create setting for competing risk model (uses Fine-Gray model in Cyclops) — setCompetingRiskModel • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

    Create setting for competing risk model (uses Fine-Gray model in Cyclops)

    -
    - -
    setCompetingRiskModel(seed = NULL)
    - -

    Arguments

    - - - - - - -
    seed

    An option to add a seed when training the model

    - - -

    Examples

    -
model.lr <- setCompetingRiskModel()
    -
    - -
    - - -
    - - -
    -


    -
    - -
    -
    - - - - - - - - diff --git a/docs/reference/setCovNN.html b/docs/reference/setCovNN.html deleted file mode 100644 index cc442a23a..000000000 --- a/docs/reference/setCovNN.html +++ /dev/null @@ -1,271 +0,0 @@ - - - - - - - - -Create setting for multi-resolution CovNN model (stucture based on https://arxiv.org/pdf/1608.00647.pdf CNN1) — setCovNN • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

Create setting for multi-resolution CovNN model (structure based on https://arxiv.org/pdf/1608.00647.pdf CNN1)

    -
    - -
    setCovNN(
    -  batchSize = 1000,
    -  outcomeWeight = 1,
    -  lr = 1e-05,
    -  decay = 1e-06,
    -  dropout = 0,
    -  epochs = 10,
    -  filters = 3,
    -  kernelSize = 10,
    -  loss = "binary_crossentropy",
    -  seed = NULL
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    batchSize

    The number of samples to used in each batch during model training

    outcomeWeight

The weight assigned to the outcome (make greater than 1 to reduce the unbalanced label issue)

    lr

    The learning rate

    decay

    The decay of the learning rate

    dropout

    [currently not used] the dropout rate for regularisation

    epochs

The number of times the data is used to train the model (e.g., epochs=1 means the data is only used once to train)

    filters

    The number of columns output by each convolution

    kernelSize

    The number of time dimensions used for each convolution

    loss

    The loss function implemented

    seed

    The random seed

    - - -

    Examples

    -
if (FALSE) { model.CovNN <- setCovNN() }
    -
    - -
    - - -
    - - -
    -


    -
    - -
    -
    - - - - - - - - diff --git a/docs/reference/setCovNN2.html b/docs/reference/setCovNN2.html deleted file mode 100644 index bd8db7d28..000000000 --- a/docs/reference/setCovNN2.html +++ /dev/null @@ -1,271 +0,0 @@ - - - - - - - - -Create setting for CovNN2 model - convolution across input and time - https://arxiv.org/pdf/1608.00647.pdf — setCovNN2 • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

    Create setting for CovNN2 model - convolution across input and time - https://arxiv.org/pdf/1608.00647.pdf

    -
    - -
    setCovNN2(
    -  batchSize = 1000,
    -  outcomeWeight = 1,
    -  lr = 1e-05,
    -  decay = 1e-06,
    -  dropout = 0,
    -  epochs = 10,
    -  filters = 3,
    -  kernelSize = 10,
    -  loss = "binary_crossentropy",
    -  seed = NULL
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    batchSize

    The number of samples to used in each batch during model training

    outcomeWeight

The weight assigned to the outcome (make greater than 1 to reduce the unbalanced label issue)

    lr

    The learning rate

    decay

    The decay of the learning rate

    dropout

    [currently not used] the dropout rate for regularisation

    epochs

The number of times the data is used to train the model (e.g., epochs=1 means the data is only used once to train)

    filters

    The number of columns output by each convolution

    kernelSize

    The number of time dimensions used for each convolution

    loss

    The loss function implemented

    seed

    The random seed

    - - -

    Examples

    -
if (FALSE) { model.CovNN2 <- setCovNN2() }
    -
    - -
    - - -
    - - -
    -


    -
    - -
    -
    - - - - - - - - diff --git a/docs/reference/setCoxModel.html b/docs/reference/setCoxModel.html deleted file mode 100644 index 868631ba2..000000000 --- a/docs/reference/setCoxModel.html +++ /dev/null @@ -1,270 +0,0 @@ - - - - - - - - -Create setting for lasso Cox model — setCoxModel • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

    Create setting for lasso Cox model

    -
    - -
    setCoxModel(
    -  variance = 0.01,
    -  seed = NULL,
    -  includeCovariateIds = c(),
    -  noShrinkage = c(),
    -  threads = -1,
    -  upperLimit = 20,
    -  lowerLimit = 0.01,
    -  tolerance = 2e-07,
    -  maxIterations = 3000
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    variance

    Numeric: prior distribution starting variance

    seed

    An option to add a seed when training the model

    includeCovariateIds

a set of covariate IDs to limit the analysis to

    noShrinkage

a set of covariates which are to be forced to be included in the final model. The default is the intercept

    threads

    An option to set number of threads when training model

    upperLimit

    Numeric: Upper prior variance limit for grid-search

    lowerLimit

    Numeric: Lower prior variance limit for grid-search

    tolerance

    Numeric: maximum relative change in convergence criterion from successive iterations to achieve convergence

    maxIterations

    Integer: maximum iterations of Cyclops to attempt before returning a failed-to-converge error

    - - -

    Examples

    -
model.lr <- setCoxModel()
    -
    - -
    - - -
    - - -
    -


    -
    - -
    -
    - - - - - - - - diff --git a/docs/reference/setDecisionTree.html b/docs/reference/setDecisionTree.html deleted file mode 100644 index fd8302977..000000000 --- a/docs/reference/setDecisionTree.html +++ /dev/null @@ -1,282 +0,0 @@ - - - - - - - - -Create setting for the scikit-learn 1.0.1 DecisionTree with python — setDecisionTree • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

    Create setting for the scikit-learn 1.0.1 DecisionTree with python

    -
    - -
    setDecisionTree(
    -  criterion = list("gini"),
    -  splitter = list("best"),
    -  maxDepth = list(as.integer(4), as.integer(10), NULL),
    -  minSamplesSplit = list(2, 10),
    -  minSamplesLeaf = list(10, 50),
    -  minWeightFractionLeaf = list(0),
    -  maxFeatures = list(100, "auto", NULL),
    -  maxLeafNodes = list(NULL),
    -  minImpurityDecrease = list(10^-7),
    -  classWeight = list(NULL, "balanced"),
    -  seed = sample(1e+06, 1)
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    criterion

    The function to measure the quality of a split. Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain.

    splitter

    The strategy used to choose the split at each node. Supported strategies are “best” to choose the best split and “random” to choose the best random split.

    maxDepth

    (list) The maximum depth of the tree. If NULL, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.

    minSamplesSplit

    The minimum number of samples required to split an internal node

    minSamplesLeaf

    The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least minSamplesLeaf training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression.

    minWeightFractionLeaf

    The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sampleWeight is not provided.

    maxFeatures

    (list) The number of features to consider when looking for the best split (int/'auto'/NULL)

    maxLeafNodes

    (list) Grow a tree with max_leaf_nodes in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes. (int/NULL)

    minImpurityDecrease

    Threshold for early stopping in tree growth. A node will split if its impurity is above the threshold, otherwise it is a leaf.

    classWeight

    (list) Weights associated with classes 'balance' or NULL

    seed

    The random state seed

    - - -

    Examples

    -
if (FALSE) {
  model.decisionTree <- setDecisionTree(maxDepth = 10, minSamplesLeaf = 10, seed = NULL)
}
    -
    - -
    - - -
    - - -
    -


    -
    - -
    -
    - - - - - - - - diff --git a/docs/reference/setDeepNN.html b/docs/reference/setDeepNN.html deleted file mode 100644 index d037ae6af..000000000 --- a/docs/reference/setDeepNN.html +++ /dev/null @@ -1,261 +0,0 @@ - - - - - - - - -Create setting for DeepNN model — setDeepNN • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

    Create setting for DeepNN model

    -
    - -
    setDeepNN(
    -  units = list(c(128, 64), 128),
    -  layer_dropout = c(0.2),
    -  lr = c(1e-04),
    -  decay = c(1e-05),
    -  outcome_weight = c(1),
    -  batch_size = c(100),
    -  epochs = c(100),
    -  seed = NULL
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    units

    The number of units of the deep network - as a list of vectors

    layer_dropout

    The layer dropout rate (regularisation)

    lr

    Learning rate

    decay

    Learning rate decay over each update.

    outcome_weight

    The weight of the outcome class in the loss function

    batch_size

    The number of data points to use per training batch

    epochs

    Number of times to iterate over dataset

    seed

    Random seed used by deep learning model

    - - -

    Examples

    -
if (FALSE) { model <- setDeepNN() }
    -
    - -
    - - -
    - - -
    -


    -
    - -
    -
    - - - - - - - - diff --git a/docs/reference/setGBMSurvival.html b/docs/reference/setGBMSurvival.html deleted file mode 100644 index 037065ffb..000000000 --- a/docs/reference/setGBMSurvival.html +++ /dev/null @@ -1,320 +0,0 @@ - - - - - - - - -Create setting for GBM Survival with python -#' @description -This creates a setting for fitting GBM surivial model. You need sksurv python install. To install this open your command line and type: conda install -c sebp scikit-survival — setGBMSurvival • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

Create setting for GBM Survival with python. This creates a setting for fitting a GBM survival model. You need the sksurv python package installed. To install it, open your command line and type: conda install -c sebp scikit-survival

    -
    - -
    setGBMSurvival(
    -  loss = "coxph",
    -  learningRate = 0.1,
    -  nEstimators = c(100),
    -  criterion = "friedman_mse",
    -  minSamplesSplit = 2,
    -  minSamplesLeaf = 1,
    -  minWeightFractionLeaf = 0,
    -  maxDepth = c(3, 10, 17),
    -  minImpuritySplit = NULL,
    -  minImpurityDecrease = 0,
    -  maxFeatures = NULL,
    -  maxLeafNodes = NULL,
    -  presort = NULL,
    -  subsample = 1,
    -  dropoutRate = 0,
    -  seed = NULL,
    -  quiet = F
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    loss

    A string specifying the loss function to minimise (default: 'coxph' )

    learningRate

    A double specifying the learning rate (controls convergence speed)

    nEstimators

    An integer specifying how many trees to build

    criterion

    Default: 'friedman_mse'

    minSamplesSplit

    An integer specifying min samples per tree split (complexity)

    minSamplesLeaf

    An integer specifying min samples per leaf (complexity)

    minWeightFractionLeaf

    Lookup

    maxDepth

    An integer specifying the max depth of trees (complexity)

    minImpuritySplit

    A double or NULL specifying the minimum impurity split

    minImpurityDecrease

    will add

    maxFeatures

    will add

    maxLeafNodes

    will add

    presort

    will add

    subsample

    will add

    dropoutRate

    will add

    seed

    will add

    quiet

    will add

    - -

    Details

    - -

    Pick the hyper-parameters you want to do a grid search for

    - -

    Examples

    -
if (FALSE) {
  gbmSurv <- setGBMSurvival(learningRate = c(0.1, 0.01), nEstimators = c(10, 50, 100),
                            maxDepth = c(4, 10, 17), seed = 2)
}
    -
    - -
    - - -
    - - -
    -


    -
    - -
    -
    - - - - - - - - diff --git a/docs/reference/setGradientBoostingMachine.html b/docs/reference/setGradientBoostingMachine.html deleted file mode 100644 index e37a071cd..000000000 --- a/docs/reference/setGradientBoostingMachine.html +++ /dev/null @@ -1,262 +0,0 @@ - - - - - - - - -Create setting for gradient boosting machine model using gbm_xgboost implementation — setGradientBoostingMachine • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

    Create setting for gradient boosting machine model using gbm_xgboost implementation

    -
    - -
    setGradientBoostingMachine(
    -  ntrees = c(100, 1000),
    -  nthread = 20,
    -  earlyStopRound = 25,
    -  maxDepth = c(4, 6, 17),
    -  minRows = 2,
    -  learnRate = c(0.005, 0.01, 0.1),
    -  seed = sample(1e+07, 1)
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    ntrees

    The number of trees to build

    nthread

The number of computer threads to use (how many cores do you have?)

    earlyStopRound

If the performance does not increase over earlyStopRound number of iterations then training stops (this prevents overfitting)

    maxDepth

Maximum depth of each tree - a large value will lead to slow model training

    minRows

    The minimum number of rows required at each end node of the tree

    learnRate

    The boosting learn rate

    seed

    An option to add a seed when training the final model

    - - -

    Examples

    -
model.gbm <- setGradientBoostingMachine(ntrees = c(10, 100), nthread = 20,
                                        maxDepth = c(4, 6), learnRate = c(0.1, 0.3))
    -
    - -
    - - -
    - - -
    -


    -
    - -
    -
    - - - - - - - - diff --git a/docs/reference/setIterativeHardThresholding.html b/docs/reference/setIterativeHardThresholding.html deleted file mode 100644 index 8bd6b1afb..000000000 --- a/docs/reference/setIterativeHardThresholding.html +++ /dev/null @@ -1,280 +0,0 @@ - - - - - - - - -Create setting for lasso logistic regression — setIterativeHardThresholding • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    - -
    -
    - - -
    -

Create setting for an Iterative Hard Thresholding model

    -
    - -
    setIterativeHardThresholding(
    -  K = 10,
    -  penalty = "bic",
    -  seed = sample(1e+05, 1),
    -  exclude = c(),
    -  forceIntercept = F,
    -  fitBestSubset = FALSE,
    -  initialRidgeVariance = 10000,
    -  tolerance = 1e-08,
    -  maxIterations = 10000,
    -  threshold = 1e-06,
    -  delta = 0
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    K

    The maximum number of non-zero predictors

    penalty

    Specifies the IHT penalty; possible values are `BIC` or `AIC` or a numeric value

    seed

    An option to add a seed when training the model

    exclude

    A vector of numbers or covariateId names to exclude from prior

    forceIntercept

    Logical: Force intercept coefficient into regularization

    fitBestSubset

    Logical: Fit final subset with no regularization

    initialRidgeVariance

    integer

    tolerance

    numeric

    maxIterations

    integer

    threshold

    numeric

    delta

    numeric

    - - -

    Examples
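The original page ships no example; below is a minimal hedged sketch (argument values are illustrative only, and it assumes the IterativeHardThresholding package is installed):

if (FALSE) {
  # keep at most 5 non-zero predictors, using the BIC penalty
  ihtModel <- setIterativeHardThresholding(K = 5, penalty = "bic", seed = 42)
}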

diff --git a/docs/reference/setKNN.html b/docs/reference/setKNN.html
deleted file mode 100644
index 8d8a8c2ca..000000000
--- a/docs/reference/setKNN.html
+++ /dev/null
@@ -1,238 +0,0 @@

    Create setting for knn model

    -
    - -
    setKNN(k = 1000, indexFolder = file.path(getwd(), "knn"), threads = 1)
    - -

    Arguments

    - - - - - - - - - - - - - - -
    k

    The number of neighbors to consider

    indexFolder

    The directory where the results and intermediate steps are output

    threads

    The number of threads to use when applying big knn

    - - -

    Examples

    -
if (FALSE) {
  model.knn <- setKNN(k = 10000)
}

diff --git a/docs/reference/setLRTorch.html b/docs/reference/setLRTorch.html
deleted file mode 100644
index 0ea3d5ac4..000000000
--- a/docs/reference/setLRTorch.html
+++ /dev/null
@@ -1,253 +0,0 @@

Create setting for logistic regression model with python

    -
    - -
    setLRTorch(
    -  w_decay = c(5e-04, 0.005),
    -  epochs = c(20, 50, 100),
    -  seed = NULL,
    -  class_weight = 0,
    -  autoencoder = FALSE,
    -  vae = FALSE
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - -
    w_decay

    The l2 regularisation

    epochs

    The number of epochs

    seed

    A seed for the model

    class_weight

The class weight used for imbalanced data: 0 = inverse ratio between positives and negatives, -1 = focal loss

autoencoder

First learn a stacked autoencoder for the input features, then train LR on the encoded features.

vae

First learn a stacked variational autoencoder for the input features, then train LR on the encoded features.

    - - -

    Examples

    -
if (FALSE) {
  model.lrTorch <- setLRTorch()
}

diff --git a/docs/reference/setLassoLogisticRegression.html b/docs/reference/setLassoLogisticRegression.html
deleted file mode 100644
index aaccccd2e..000000000
--- a/docs/reference/setLassoLogisticRegression.html
+++ /dev/null
@@ -1,275 +0,0 @@

    Create setting for lasso logistic regression

    -
    - -
    setLassoLogisticRegression(
    -  variance = 0.01,
    -  seed = NULL,
    -  includeCovariateIds = c(),
    -  noShrinkage = c(0),
    -  threads = -1,
    -  forceIntercept = F,
    -  upperLimit = 20,
    -  lowerLimit = 0.01,
    -  tolerance = 2e-06,
    -  maxIterations = 3000
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    variance

    Numeric: prior distribution starting variance

    seed

    An option to add a seed when training the model

    includeCovariateIds

A set of covariate IDs to limit the analysis to

noShrinkage

A set of covariates which are forced to be included in the final model; the default is the intercept

    threads

    An option to set number of threads when training model

    forceIntercept

    Logical: Force intercept coefficient into prior

    upperLimit

    Numeric: Upper prior variance limit for grid-search

    lowerLimit

    Numeric: Lower prior variance limit for grid-search

    tolerance

    Numeric: maximum relative change in convergence criterion from successive iterations to achieve convergence

    maxIterations

    Integer: maximum iterations of Cyclops to attempt before returning a failed-to-converge error

    - - -

    Examples

    -
model.lr <- setLassoLogisticRegression()

diff --git a/docs/reference/setMLP.html b/docs/reference/setMLP.html
deleted file mode 100644
index 09cb9029f..000000000
--- a/docs/reference/setMLP.html
+++ /dev/null
@@ -1,337 +0,0 @@

    Create setting for neural network model with python

    -
    - -
    setMLP(
    -  hiddenLayerSizes = list(c(100), c(20, 4)),
    -  activation = list("relu"),
    -  solver = list("adam"),
    -  alpha = list(0.3, 0.01, 1e-04, 1e-06),
    -  batchSize = list("auto"),
    -  learningRate = list("constant"),
    -  learningRateInit = list(0.001),
    -  powerT = list(0.5),
    -  maxIter = list(200, 100),
    -  shuffle = list(TRUE),
    -  tol = list(1e-04),
    -  warmStart = list(TRUE),
    -  momentum = list(0.9),
    -  nesterovsMomentum = list(TRUE),
    -  earlyStopping = list(FALSE),
    -  validationFraction = list(0.1),
    -  beta1 = list(0.9),
    -  beta2 = list(0.999),
    -  epsilon = list(1, 0.1, 1e-08),
    -  nIterNoChange = list(10),
    -  seed = sample(1e+05, 1)
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    hiddenLayerSizes

    (list of vectors) The ith element represents the number of neurons in the ith hidden layer.

    activation

    (list) Activation function for the hidden layer.

• "identity": no-op activation, useful to implement linear bottleneck, returns f(x) = x
• "logistic": the logistic sigmoid function, returns f(x) = 1 / (1 + exp(-x))
• "tanh": the hyperbolic tan function, returns f(x) = tanh(x)
• "relu": the rectified linear unit function, returns f(x) = max(0, x)
    solver

    (list) The solver for weight optimization. (‘lbfgs’, ‘sgd’, ‘adam’)

    alpha

    (list) L2 penalty (regularization term) parameter.

    batchSize

    (list) Size of minibatches for stochastic optimizers. If the solver is ‘lbfgs’, the classifier will not use minibatch. When set to “auto”, batchSize=min(200, n_samples).

    learningRate

    (list) Only used when solver='sgd' Learning rate schedule for weight updates.‘constant’, ‘invscaling’, ‘adaptive’, default=’constant’

    learningRateInit

    (list) Only used when solver=’sgd’ or ‘adam’. The initial learning rate used. It controls the step-size in updating the weights.

    powerT

    (list) Only used when solver=’sgd’. The exponent for inverse scaling learning rate. It is used in updating effective learning rate when the learning_rate is set to ‘invscaling’.

    maxIter

    (list) Maximum number of iterations. The solver iterates until convergence (determined by ‘tol’) or this number of iterations. For stochastic solvers (‘sgd’, ‘adam’), note that this determines the number of epochs (how many times each data point will be used), not the number of gradient steps.

    shuffle

    (list) boolean: Whether to shuffle samples in each iteration. Only used when solver=’sgd’ or ‘adam’.

    tol

    (list) Tolerance for the optimization. When the loss or score is not improving by at least tol for nIterNoChange consecutive iterations, unless learning_rate is set to ‘adaptive’, convergence is considered to be reached and training stops.

    warmStart

    (list) When set to True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution.

    momentum

    (list) Momentum for gradient descent update. Should be between 0 and 1. Only used when solver=’sgd’.

    nesterovsMomentum

    (list) Whether to use Nesterov’s momentum. Only used when solver=’sgd’ and momentum > 0.

    earlyStopping

    (list) boolean Whether to use early stopping to terminate training when validation score is not improving. If set to true, it will automatically set aside 10 percent of training data as validation and terminate training when validation score is not improving by at least tol for n_iter_no_change consecutive epochs.

    validationFraction

    (list) The proportion of training data to set aside as validation set for early stopping. Must be between 0 and 1. Only used if earlyStopping is True.

    beta1

    (list) Exponential decay rate for estimates of first moment vector in adam, should be in 0 to 1.

    beta2

    (list) Exponential decay rate for estimates of second moment vector in adam, should be in 0 to 1.

    epsilon

    (list) Value for numerical stability in adam.

    nIterNoChange

    (list) Maximum number of epochs to not meet tol improvement. Only effective when solver=’sgd’ or ‘adam’.

    seed

    A seed for the model

    - - -

    Examples

    -
if (FALSE) {
  model.mlp <- setMLP()
}

diff --git a/docs/reference/setMLPTorch.html b/docs/reference/setMLPTorch.html
deleted file mode 100644
index e12be4bc0..000000000
--- a/docs/reference/setMLPTorch.html
+++ /dev/null
@@ -1,263 +0,0 @@

    Create setting for neural network model with python

    -
    - -
    setMLPTorch(
    -  size = c(500, 1000),
    -  w_decay = c(5e-04, 0.005),
    -  epochs = c(20, 50),
    -  seed = 0,
    -  class_weight = 0,
    -  mlp_type = "MLP",
    -  autoencoder = FALSE,
    -  vae = FALSE
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    size

    The number of hidden nodes

    w_decay

    The l2 regularisation

    epochs

    The number of epochs

    seed

    A seed for the model

    class_weight

The class weight used for imbalanced data: 0 = inverse ratio between positives and negatives, -1 = focal loss

mlp_type

The type of multiple layer network, including MLP and SNN (self-normalizing neural network)

autoencoder

First learn a stacked autoencoder for the input features, then train MLP on the encoded features.

vae

First learn a stacked variational autoencoder for the input features, then train MLP on the encoded features.

    - - -

    Examples

    -
if (FALSE) {
  model.mlpTorch <- setMLPTorch()
}

diff --git a/docs/reference/setNaiveBayes.html b/docs/reference/setNaiveBayes.html
deleted file mode 100644
index 51cb57879..000000000
--- a/docs/reference/setNaiveBayes.html
+++ /dev/null
@@ -1,222 +0,0 @@

    Create setting for naive bayes model with python

    -
    - -
    setNaiveBayes()
    - - - -

    Examples

    -
if (FALSE) {
  model.nb <- setNaiveBayes()
}

diff --git a/docs/reference/setPythonEnvironment.html b/docs/reference/setPythonEnvironment.html
deleted file mode 100644
index 3d8362dca..000000000
--- a/docs/reference/setPythonEnvironment.html
+++ /dev/null
@@ -1,232 +0,0 @@

    Use the virtual environment created using configurePython()

    -
    - -
    setPythonEnvironment(envname = "PLP", envtype = NULL)
    - -

    Arguments

    - - - - - - - - - - -
    envname

    A string for the name of the virtual environment (default is 'PLP')

    envtype

    An option for specifying the environment as'conda' or 'python'. If NULL then the default is 'conda' for windows users and 'python' for non-windows users

    - -

    Details

    - -

    This function sets PatientLevelPrediction to use a virtual environment
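For illustration, a minimal call (the environment name and type are only examples, not values taken from the original page):

if (FALSE) {
  # point PatientLevelPrediction at the conda environment created earlier with configurePython()
  setPythonEnvironment(envname = "PLP", envtype = "conda")
}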

diff --git a/docs/reference/setRNNTorch.html b/docs/reference/setRNNTorch.html
deleted file mode 100644
index 96663666f..000000000
--- a/docs/reference/setRNNTorch.html
+++ /dev/null
@@ -1,248 +0,0 @@

    Create setting for RNN model with python

    -
    - -
    setRNNTorch(
    -  hidden_size = c(50, 100),
    -  epochs = c(20, 50),
    -  seed = 0,
    -  class_weight = 0,
    -  type = "RNN"
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - -
    hidden_size

    The hidden size

    epochs

    The number of epochs

    seed

    A seed for the model

    class_weight

The class weight used for imbalanced data: 0 = inverse ratio between positives and negatives, -1 = focal loss

    type

    It can be normal 'RNN', 'BiRNN' (bidirectional RNN) and 'GRU'

    - - -

    Examples

    -
if (FALSE) {
  model.rnnTorch <- setRNNTorch()
}

diff --git a/docs/reference/setRNNTorch2.html b/docs/reference/setRNNTorch2.html
deleted file mode 100644
index 8193cd04e..000000000
--- a/docs/reference/setRNNTorch2.html
+++ /dev/null
@@ -1,226 +0,0 @@

    Create setting for RNN model with python

    - -
    - -
    setRNNTorch2(hidden_size = c(50, 100), epochs = c(20, 50), seed = 0,
    -  class_weight = 0, type = "RNN")
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - -
    hidden_size

    The hidden size

    epochs

    The number of epochs

    seed

    A seed for the model

    class_weight

The class weight used for imbalanced data: 0 = inverse ratio between positives and negatives, -1 = focal loss

    type

    It can be normal 'RNN', 'BiRNN' (bidirectional RNN) and 'GRU'

    - - -

    Examples

    -
# NOT RUN {
model.rnnTorch <- setRNNTorch()
# }

diff --git a/docs/reference/setRandomForest.html b/docs/reference/setRandomForest.html
deleted file mode 100644
index d45605d22..000000000
--- a/docs/reference/setRandomForest.html
+++ /dev/null
@@ -1,310 +0,0 @@

    Create setting for random forest model with python (very fast)

    -
    - -
    setRandomForest(
    -  ntrees = list(100, 500),
    -  criterion = list("gini"),
    -  maxDepth = list(4, 10, 17),
    -  minSamplesSplit = list(2, 5),
    -  minSamplesLeaf = list(1, 10),
    -  minWeightFractionLeaf = list(0),
    -  mtries = list("auto", "log2"),
    -  maxLeafNodes = list(NULL),
    -  minImpurityDecrease = list(0),
    -  bootstrap = list(TRUE),
    -  maxSamples = list(NULL, 0.9),
    -  oobScore = list(FALSE),
    -  nJobs = list(NULL),
    -  classWeight = list("balanced_subsample", NULL),
    -  seed = sample(1e+05, 1)
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    ntrees

    (list) The number of trees to build

    criterion

    (list) The function to measure the quality of a split. Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. Note: this parameter is tree-specific.

    maxDepth

    (list) The maximum depth of the tree. If NULL, then nodes are expanded until all leaves are pure or until all leaves contain less than minSamplesSplit samples.

    minSamplesSplit

    (list) The minimum number of samples required to split an internal node

    minSamplesLeaf

    (list) The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least minSamplesLeaf training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression.

    minWeightFractionLeaf

    (list) The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sampleWeight is not provided.

    mtries

    (list) The number of features to consider when looking for the best split:

• int: consider max_features features at each split
• float: max_features is a fraction and round(max_features * n_features) features are considered at each split
• 'auto': max_features = sqrt(n_features)
• 'sqrt': max_features = sqrt(n_features) (same as "auto")
• 'log2': max_features = log2(n_features)
• NULL: max_features = n_features
    maxLeafNodes

    (list) Grow trees with max_leaf_nodes in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.

    minImpurityDecrease

    (list) A node will be split if this split induces a decrease of the impurity greater than or equal to this value.

    bootstrap

    (list) Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.

    maxSamples

    (list) If bootstrap is True, the number of samples to draw from X to train each base estimator.

    oobScore

    (list) Whether to use out-of-bag samples to estimate the generalization score. Only available if bootstrap=True.

    nJobs

    The number of jobs to run in parallel.

    classWeight

    (list) Weights associated with classes. If not given, all classes are supposed to have weight one. NULL, “balanced”, “balanced_subsample”

    seed

    A seed when training the final model

    - - -

    Examples

    -
if (FALSE) {
  model.rf <- setRandomForest(mtries = list('auto', 5, 20), ntrees = c(10, 100),
                              maxDepth = c(5, 20))
}

diff --git a/docs/reference/setRandomForestQuantileRegressor.html b/docs/reference/setRandomForestQuantileRegressor.html
deleted file mode 100644
index c01acfc7c..000000000
--- a/docs/reference/setRandomForestQuantileRegressor.html
+++ /dev/null
@@ -1,300 +0,0 @@

    Create setting for RandomForestQuantileRegressor with python scikit-garden (skgarden.quantile.RandomForestQuantileRegressor) -#' @description -This creates a setting for fitting a RandomForestQuantileRegressor model. You need skgarden python install. To install this open your command line and type: conda install -c conda-forge scikit-garden

    -
    - -
    setRandomForestQuantileRegressor(
    -  nEstimators = c(100),
    -  criterion = "mse",
    -  maxFeatures = -1,
    -  maxDepth = 4,
    -  minSamplesSplit = 2,
    -  minSamplesLeaf = 1,
    -  minWeightFractionLeaf = 0,
    -  maxLeafNodes = NULL,
    -  bootstrap = TRUE,
    -  oobScore = FALSE,
    -  warmStart = FALSE,
    -  seed = NULL,
    -  quiet = F
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    nEstimators

    (int default:100) The number of trees in the forest.

    criterion

    (string default="mse")) The function to measure the quality of a split. Supported criteria are "mse" for the mean squared error, which is equal to variance reduction as feature selection criterion, and "mae" for the mean absolute error.

    maxFeatures

    (int default: -1) The number of features to consider when looking for the best split. If -1 then use sqrt of total number of features.

    maxDepth

    (int default:4) The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than minSamplesSplit samples.

    minSamplesSplit

    An integer specifying min samples per tree split (complexity)

    minSamplesLeaf

    An integer specifying min samples per leaf (complexity)

    minWeightFractionLeaf

    Lookup

    maxLeafNodes

    (int) Grow trees with maxLeafNodes in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.

    bootstrap

    (boolean default:TRUE) Whether bootstrap samples are used when building trees.

    oobScore

    (boolean default:FALSE) Whether to use out-of-bag samples to estimate the R^2 on unseen data.

    warmStart

    (boolean default:FALSE) When set to True, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new forest.

    seed

    will add

    quiet

    will add

    - -

    Details

    - -

    Pick the hyper-parameters you want to do a grid search for

    - -

    Examples

    -
if (FALSE) {
  rfQR <- setRandomForestQuantileRegressor(nEstimators = c(10, 50, 100),
                                           maxDepth = c(4, 10, 17), seed = 2)
}

diff --git a/docs/reference/setSVM.html b/docs/reference/setSVM.html
deleted file mode 100644
index 7c2c3a575..000000000
--- a/docs/reference/setSVM.html
+++ /dev/null
@@ -1,277 +0,0 @@

    Create setting for the python sklearn SVM (SVC function)

    -
    - -
    setSVM(
    -  C = list(1, 0.9, 2, 0.1),
    -  kernel = list("rbf"),
    -  degree = list(1, 3, 5),
    -  gamma = list("scale", 1e-04, 3e-05, 0.001, 0.01, 0.25),
    -  coef0 = list(0),
    -  shrinking = list(TRUE),
    -  tol = list(0.001),
    -  classWeight = list("balanced", NULL),
    -  cacheSize = 500,
    -  seed = sample(1e+05, 1)
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    C

    (list) Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. The penalty is a squared l2 penalty.

    kernel

    (list) Specifies the kernel type to be used in the algorithm. one of ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’. If none is given ‘rbf’ will be used.

    degree

    (list) degree of kernel function is significant only in poly, rbf, sigmoid

    gamma

    (list) kernel coefficient for rbf and poly, by default 1/n_features will be taken. ‘scale’, ‘auto’ or float, default=’scale’

    coef0

    (list) independent term in kernel function. It is only significant in poly/sigmoid.

    shrinking

    (list) whether to use the shrinking heuristic.

    tol

    (list) Tolerance for stopping criterion.

    classWeight

    (list) Class weight based on imbalance either 'balanced' or NULL

    cacheSize

    Specify the size of the kernel cache (in MB).

    seed

    A seed for the model

    - - -

    Examples

    -
if (FALSE) {
  model.svm <- setSVM(kernel = 'rbf', seed = NULL)
}

diff --git a/docs/reference/setSagemakerBinary.html b/docs/reference/setSagemakerBinary.html
deleted file mode 100644
index a0d5969de..000000000
--- a/docs/reference/setSagemakerBinary.html
+++ /dev/null
@@ -1,251 +0,0 @@

    Create setting for sagemaker model

    -
    - -
    setSagemakerBinary(
    -  classifier = "xgboost",
    -  bucket,
    -  prefix = "data",
    -  roleArn,
    -  otherparams = NULL,
    -  seed = NULL
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - -
    classifier

    The name of the sagemaker binary classifier to use (pick from: knn, xgboost or linear-learner)

    bucket

The S3 bucket string where the data for model training is saved

    prefix

    The s3 subdirectory for the data

    roleArn

    The amazon roleArn

    otherparams

    Other parameters for training (currently not working)

    seed

    The seed for the training

    - - -

    Examples

    -
if (FALSE) {
  model.sm <- setSagemakerBinary(classifier = 'xgboost', bucket = 'ohdsi3')
}

diff --git a/docs/reference/similarPlpData.html b/docs/reference/similarPlpData.html
deleted file mode 100644
index c7640b066..000000000
--- a/docs/reference/similarPlpData.html
+++ /dev/null
@@ -1,309 +0,0 @@

    Extract new plpData using plpModel settings -use metadata in plpModel to extract similar data and population for new databases:

    -
    - -
    similarPlpData(
    -  plpModel = NULL,
    -  newConnectionDetails,
    -  newCdmDatabaseSchema = NULL,
    -  newCohortDatabaseSchema = NULL,
    -  newCohortTable = NULL,
    -  newCohortId = NULL,
    -  newOutcomeDatabaseSchema = NULL,
    -  newOutcomeTable = NULL,
    -  newOutcomeId = NULL,
    -  newOracleTempSchema = newCdmDatabaseSchema,
    -  sample = NULL,
    -  createPopulation = T,
    -  createCohorts = T
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    plpModel

    The trained PatientLevelPrediction model or object returned by runPlp()

    newConnectionDetails

    The connectionDetails for the new database

    newCdmDatabaseSchema

    The database schema for the new CDM database

    newCohortDatabaseSchema

    The database schema where the cohort table is stored

    newCohortTable

    The table name of the cohort table

    newCohortId

    The cohort_definition_id for the cohort of at risk people

    newOutcomeDatabaseSchema

    The database schema where the outcome table is stored

    newOutcomeTable

    The table name of the outcome table

    newOutcomeId

    The cohort_definition_id for the outcome

    newOracleTempSchema

The temporary Oracle schema

    sample

    The number of people to sample (default is NULL meaning use all data)

    createPopulation

    Whether to create the study population as well

    createCohorts

    No longer used

    - - -

    Examples

    -
if (FALSE) {
  # set the connection
  connectionDetails <- DatabaseConnector::createConnectionDetails()

  # load the model and data
  plpModel <- loadPlpModel("C:/plpmodel")

  # extract the new data in the 'newData.dbo' schema using the model settings
  newDataList <- similarPlpData(plpModel = plpModel,
                                newConnectionDetails = connectionDetails,
                                newCdmDatabaseSchema = 'newData.dbo',
                                newCohortDatabaseSchema = 'newData.dbo',
                                newCohortTable = 'cohort',
                                newCohortId = 1,
                                newOutcomeDatabaseSchema = 'newData.dbo',
                                newOutcomeTable = 'outcome',
                                newOutcomeId = 2)

  # get the prediction:
  prediction <- applyModel(newDataList$population, newDataList$plpData, plpModel)$prediction
}

diff --git a/docs/reference/simulatePlpData.html b/docs/reference/simulatePlpData.html
deleted file mode 100644
index 505986878..000000000
--- a/docs/reference/simulatePlpData.html
+++ /dev/null
@@ -1,238 +0,0 @@

simulatePlpData creates a plpData object with simulated data.

    -
    - -
    simulatePlpData(plpDataSimulationProfile, n = 10000)
    - -

    Arguments

    - - - - - - - - - - -
    plpDataSimulationProfile

An object of type plpDataSimulationProfile as generated using the createplpDataSimulationProfile function.

    n

    The size of the population to be generated.

    - -

    Value

    - -

    An object of type plpData.

    -

    Details

    - -

This function generates simulated data that is in many ways similar to the original data on which the simulation profile is based. It contains the same outcome, comparator, and outcome concept IDs, and the covariates and their first-order statistics should be comparable.
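A hedged usage sketch (it assumes the simulation profile object plpDataSimulationProfile shipped with the package is available):

if (FALSE) {
  data("plpDataSimulationProfile")
  plpData <- simulatePlpData(plpDataSimulationProfile, n = 1000)
}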

diff --git a/docs/reference/specificity.html b/docs/reference/specificity.html
deleted file mode 100644
index 6f582daef..000000000
--- a/docs/reference/specificity.html
+++ /dev/null
@@ -1,243 +0,0 @@

    Calculate the specificity

    -
    - -
    specificity(TP, TN, FN, FP)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - -
    TP

    Number of true positives

    TN

    Number of true negatives

    FN

    Number of false negatives

    FP

    Number of false positives

    - -

    Value

    - -

    specificity value

    -

    Details

    - -

    Calculate the specificity
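As a quick illustration of the standard definition, specificity = TN / (TN + FP); the counts below are made up:

specificity(TP = 10, TN = 80, FN = 5, FP = 5)
# under the standard definition this is 80 / (80 + 5), roughly 0.94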

diff --git a/docs/reference/splitData.html b/docs/reference/splitData.html
deleted file mode 100644
index f4a91d37d..000000000
--- a/docs/reference/splitData.html
+++ /dev/null
@@ -1,252 +0,0 @@

    Split the plpData into test/train sets using a splitting settings of class splitSettings

    -
    - -
    splitData(
    -  plpData = plpData,
    -  population = population,
    -  splitSettings = splitSettings
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - -
    plpData

    An object of type plpData - the patient level prediction -data extracted from the CDM.

    population

    The population created using createStudyPopulation that define who will be used to develop the model

    splitSettings

    An object of type splitSettings specifying the split - the default can be created using createDefaultSplitSetting

    - -

    Value

    - -

    An object of class splitSettings

    -

    Details

    - -

    Returns a list containing the training data (Train) and optionally the test data (Test). Train is an Andromeda object containing

• covariateRef: a table with the covariate information
• labels: a table (rowId, outcomeCount, ...) for each data point in the train data (outcomeCount is the class label)
• folds: a table (rowId, index) specifying which training fold each data point is in

Test is an Andromeda object containing

• covariateRef: a table with the covariate information
• labels: a table (rowId, outcomeCount, ...) for each data point in the test data (outcomeCount is the class label)
diff --git a/docs/reference/subjectSplitter.html b/docs/reference/subjectSplitter.html
deleted file mode 100644
index 94a1942d5..000000000
--- a/docs/reference/subjectSplitter.html
+++ /dev/null
@@ -1,249 +0,0 @@

    Split data when patients are in the data multiple times such that the same patient is always either in the -train set or the test set (the same patient cannot be in both the test and train set at different times)

    -
    - -
    subjectSplitter(population, test = 0.3, train = NULL, nfold = 3, seed = NULL)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - -
    population

    An object created using createStudyPopulation().

    test

    A real number between 0 and 1 indicating the test set fraction of the data

    train

    A real number between 0 and 1 indicating the train set fraction of the data. -If not set train is equal to 1 - test

    nfold

    An integer >= 1 specifying the number of folds used in cross validation

    seed

    If set a fixed seed is used, otherwise a random split is performed

    - -

    Value

    - -

    A dataframe containing the columns: rowId and index

    -

    Details

    - -

Returns a dataframe of rowIds and indexes with a -1 index indicating the rowId belongs to the test set and a positive integer index value indicating the rowId's cross validation fold within the train set.
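A hedged usage sketch (population is assumed to come from createStudyPopulation()):

if (FALSE) {
  split <- subjectSplitter(population, test = 0.25, nfold = 3, seed = 42)
  table(split$index)  # -1 = test set, 1..3 = cross validation folds in the train set
}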

diff --git a/docs/reference/summaryPlpAnalyses.html b/docs/reference/summaryPlpAnalyses.html
deleted file mode 100644
index 3f9fd8e36..000000000
--- a/docs/reference/summaryPlpAnalyses.html
+++ /dev/null
@@ -1,207 +0,0 @@

    summarises the multiple PLP results into a dataframe

    - -
    - -
    summaryPlpAnalyses(analysesLocation)
    - -

    Arguments

    - - - - - - -
    analysesLocation

    The directory containing the results (with the analysis_x folders)

    - -

    Details

    - -

    Loads all the study results contained in the analysesLocation and aggregates a summary of the results
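An illustrative call (the folder path is a placeholder):

if (FALSE) {
  resultSummary <- summaryPlpAnalyses(analysesLocation = "./multiplePlpResults")
  head(resultSummary)
}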

diff --git a/docs/reference/timeSplitter.html b/docs/reference/timeSplitter.html
deleted file mode 100644
index 216968c66..000000000
--- a/docs/reference/timeSplitter.html
+++ /dev/null
@@ -1,248 +0,0 @@

    Split test/train data by time and then partitions training set into random folds stratified by -class

    -
    - -
    timeSplitter(population, test = 0.3, train = NULL, nfold = 3, seed = NULL)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - -
    population

    An object created using createStudyPopulation().

    test

    A real number between 0 and 1 indicating the test set fraction of the data

    train

    A real number between 0 and 1 indicating the training set fraction of the data

    nfold

    An integer >= 1 specifying the number of folds used in cross validation

    seed

    If set a fixed seed is used, otherwise a random split is performed

    - -

    Value

    - -

    A dataframe containing the columns: rowId and index

    -

    Details

    - -

Returns a dataframe of rowIds and indexes with a -1 index indicating the rowId belongs to the test set and a positive integer index value indicating the rowId's cross validation fold within the train set.
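A hedged usage sketch, mirroring the subjectSplitter example above (population is assumed to come from createStudyPopulation()):

if (FALSE) {
  split <- timeSplitter(population, test = 0.25, nfold = 3, seed = 42)
  table(split$index)  # -1 = test set, 1..3 = cross validation folds in the train set
}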

diff --git a/docs/reference/toPlpData.html b/docs/reference/toPlpData.html
deleted file mode 100644
index 6d54bce3a..000000000
--- a/docs/reference/toPlpData.html
+++ /dev/null
@@ -1,240 +0,0 @@

    Converts a matrix (rows = people, columns=variables) into the standard plpData

    - -
    - -
    toPlpData(data, columnInfo, outcomeId, outcomeThreshold = 0.5,
    -  indexTime = 0, includeIndexDay = T)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - -
    data

    An data.frame or matrix.

    columnInfo

    A dataframe with three columns, column 1 contains columnId, column 2 contains columnName for each column id and column 3 contains the columnTime - the time prior to index the variable was recorded

    outcomeId

    The column id containing the outcome

    outcomeThreshold

    The outcome value must be higher or equal to this for the person to have the outcome

    indexTime

    The time defining the index date

    includeIndexDay

    Boolean - whether to include variables recorded on index date

    - -

    Value

    - -

    Returns an object of class plpData

    - -

    Details

    - -

    This function converts matrix into plpData

    - - -

    Examples

    -
    #TODO - -
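The upstream example is still a TODO; below is a hypothetical sketch of the described interface (the data frame and column mapping are invented purely for illustration):

if (FALSE) {
  data <- data.frame(rowId = 1:3, age = c(40, 60, 75), outcome = c(0, 1, 0))
  columnInfo <- data.frame(columnId = 1:3,
                           columnName = c("rowId", "age", "outcome"),
                           columnTime = c(0, 0, 0))
  plpData <- toPlpData(data, columnInfo, outcomeId = 3, outcomeThreshold = 0.5)
}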
diff --git a/docs/reference/toSparseM.html b/docs/reference/toSparseM.html
deleted file mode 100644
index aa2b22c83..000000000
--- a/docs/reference/toSparseM.html
+++ /dev/null
@@ -1,253 +0,0 @@

    Converts the standard plpData to a sparse matrix

    -
    - -
    toSparseM(plpData, cohort = NULL, map = NULL)
    - -

    Arguments

    - - - - - - - - - - - - - - -
    plpData

    An object of type plpData with covariate in coo format - the patient level prediction -data extracted from the CDM.

    cohort

    If specified the plpData is restricted to the rowIds in the cohort (otherwise plpData$labels is used)

    map

    A covariate map (telling us the column number for covariates)

    - -

    Value

    - -

    Returns a list, containing the data as a sparse matrix, the plpData covariateRef -and a data.frame named map that tells us what covariate corresponds to each column -This object is a list with the following components:

    -
    data

    A sparse matrix with the rows corresponding to each person in the plpData and the columns corresponding to the covariates.

    -
    covariateRef

    The plpData covariateRef.

    -
    map

    A data.frame containing the data column ids and the corresponding covariateId from covariateRef.

    - -
    - -

    Details

    - -

    This function converts the covariate file from ffdf in COO format into a sparse matrix from -the package Matrix

    - -

    Examples

    -
    #TODO - -
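The upstream example is a TODO; a hedged sketch follows (plpData is assumed to already exist, for example from simulatePlpData()):

if (FALSE) {
  sparseObjects <- toSparseM(plpData, cohort = NULL, map = NULL)
  dim(sparseObjects$data)  # people x covariates sparse matrix
  head(sparseObjects$map)  # data column id to covariateId mapping
}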
diff --git a/docs/reference/toSparsePython.html b/docs/reference/toSparsePython.html
deleted file mode 100644
index 638d441e8..000000000
--- a/docs/reference/toSparsePython.html
+++ /dev/null
@@ -1,245 +0,0 @@

Converts the standard plpData to a sparse matrix directly in Python

    - -
    - -
    toSparsePython(plpData, population, map = NULL, temporal = F,
    -  pythonExePath = NULL)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - -
    plpData

    An object of type plpData with covariate in coo format - the patient level prediction -data extracted from the CDM.

    population

    The population to include in the matrix

    map

    A covariate map (telling us the column number for covariates)

    temporal

    Whether to include timeId into tensor

    pythonExePath

    Location of python exe you want to use

    - -

    Value

    - -

    Returns a list, containing the python object name of the sparse matrix, the plpData covariateRef -and a data.frame named map that tells us what covariate corresponds to each column -This object is a list with the following components:

    -
    data

    The python object name containing a sparse matrix with the rows corresponding to each person in the plpData and the columns corresponding to the covariates.

    -
    covariateRef

    The plpData covariateRef.

    -
    map

    A data.frame containing the data column ids and the corresponding covariateId from covariateRef.

    -
    - - -

    Details

    - -

    This function converts the covariate file from ffdf in COO format into a sparse matrix from -the package Matrix

    - - -

    Examples

    -
    #TODO - -
diff --git a/docs/reference/toSparseTorchPython.html b/docs/reference/toSparseTorchPython.html
deleted file mode 100644
index 64c4fb439..000000000
--- a/docs/reference/toSparseTorchPython.html
+++ /dev/null
@@ -1,261 +0,0 @@

Converts the standard plpData to a sparse matrix directly in Python

    -
    - -
    toSparseTorchPython(
    -  plpData,
    -  population,
    -  map = NULL,
    -  temporal = F,
    -  pythonExePath = NULL
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - -
    plpData

    An object of type plpData with covariate in coo format - the patient level prediction -data extracted from the CDM.

    population

    The population to include in the matrix

    map

    A covariate map (telling us the column number for covariates)

    temporal

    Whether to include timeId into tensor

    pythonExePath

    Location of python exe you want to use

    - -

    Value

    - -

    Returns a list, containing the python object name of the sparse matrix, the plpData covariateRef -and a data.frame named map that tells us what covariate corresponds to each column -This object is a list with the following components:

    -
    data

    The python object name containing a sparse matrix with the rows corresponding to each person in the plpData and the columns corresponding to the covariates.

    -
    covariateRef

    The plpData covariateRef.

    -
    map

    A data.frame containing the data column ids and the corresponding covariateId from covariateRef.

    - -
    - -

    Details

    - -

    This function converts the covariate file from ffdf in COO format into a sparse matrix from -the package Matrix

    - -

    Examples

    -
    #TODO - -
diff --git a/docs/reference/toSparseTorchPython2.html b/docs/reference/toSparseTorchPython2.html
deleted file mode 100644
index 79b057c34..000000000
--- a/docs/reference/toSparseTorchPython2.html
+++ /dev/null
@@ -1,245 +0,0 @@

Converts the standard plpData to a sparse matrix directly in Python

    - -
    - -
    toSparseTorchPython2(plpData, population, map = NULL, temporal = F,
    -  pythonExePath = NULL)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - -
    plpData

    An object of type plpData with covariate in coo format - the patient level prediction -data extracted from the CDM.

    population

    The population to include in the matrix

    map

    A covariate map (telling us the column number for covariates)

    temporal

    Whether to include timeId into tensor

    pythonExePath

    Location of python exe you want to use

    - -

    Value

    - -

    Returns a list, containing the python object name of the sparse matrix, the plpData covariateRef -and a data.frame named map that tells us what covariate corresponds to each column -This object is a list with the following components:

    -
    data

    The python object name containing a sparse matrix with the rows corresponding to each person in the plpData and the columns corresponding to the covariates.

    -
    covariateRef

    The plpData covariateRef.

    -
    map

    A data.frame containing the data column ids and the corresponding covariateId from covariateRef.

    -
    - - -

    Details

    - -

    This function converts the covariate file from ffdf in COO format into a sparse matrix from -the package Matrix

    - - -

    Examples

    -
    #TODO - -
diff --git a/docs/reference/transferLearning.html b/docs/reference/transferLearning.html
deleted file mode 100644
index 1cf29eb56..000000000
--- a/docs/reference/transferLearning.html
+++ /dev/null
@@ -1,279 +0,0 @@

    [Under development] Transfer learning

    -
    - -
    transferLearning(
    -  plpResult,
    -  plpData,
    -  population,
    -  fixLayers = T,
    -  includeTop = F,
    -  addLayers = c(100, 10),
    -  layerDropout = c(T, T),
    -  layerActivation = c("relu", "softmax"),
    -  outcomeWeight = 1,
    -  batchSize = 10000,
    -  epochs = 20
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    plpResult

The plp result when training a keras deep learning model on big data

    plpData

    The new data to fine tune the model on

    population

    The population for the new data

    fixLayers

Boolean specifying whether to fix the weights in the model being transferred

    includeTop

    If TRUE the final layer of the model being transferred is removed

    addLayers

Vector specifying the nodes in each layer to add, e.g. c(100,10) will add another layer with 100 nodes and then a final layer with 10

    layerDropout

    Add dropout to each new layer (binary vector length of addLayers)

    layerActivation

    Activation function for each new layer (string vector length of addLayers)

    outcomeWeight

    The weight to assign the class 1 when training the model

    batchSize

    Size of each batch for updating layers

    epochs

Number of epochs to run

    - - -

    Examples

    -
if (FALSE) {
  modelSet <- setDeepNN()
  plpResult <- runPlp(plpData, population, modelSettings = modelSet, ...)

  transferLearning(...)
}

diff --git a/docs/reference/transportModel.html b/docs/reference/transportModel.html
deleted file mode 100644
index 166870fe2..000000000
--- a/docs/reference/transportModel.html
+++ /dev/null
@@ -1,232 +0,0 @@

    Transports a plpModel to a new location and removes sensitive data

    -
    - -
    transportModel(plpModel, outputFolder)
    - -

    Arguments

    - - - - - - - - - - -
    plpModel

A trained model.

    outputFolder

    The folder on the file system where the CSV files will be created. If the -folder does not yet exist it will be created.

    - -

    Details

    - -

This function is used to export a trained plpModel to a new location with sensitive data removed, so it can be shared.

    - -

    Examples

    -
if (FALSE) {
  transportModel(plpModel, "s:/temp/exportTest")
}

diff --git a/docs/reference/transportPlp.html b/docs/reference/transportPlp.html
deleted file mode 100644
index 57f63a709..000000000
--- a/docs/reference/transportPlp.html
+++ /dev/null
@@ -1,290 +0,0 @@

Transports a plpResult to a new location and removes sensitive data

    -
    - -
    transportPlp(
    -  plpResult,
    -  modelName = NULL,
    -  dataName = NULL,
    -  outputFolder,
    -  n = NULL,
    -  includeEvaluationStatistics = T,
    -  includeThresholdSummary = T,
    -  includeDemographicSummary = T,
    -  includeCalibrationSummary = T,
    -  includePredictionDistribution = T,
    -  includeCovariateSummary = T,
    -  save = T,
    -  reduceSize = F
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    plpResult

    An object returned by running runPlp.

    modelName

    A string of the name of the model

    dataName

    A string of the name of the data

    outputFolder

    The folder on the file system where the CSV files will be created. If the -folder does not yet exist it will be created.

    n

    The minimum number of people required for each result summary to be included

    includeEvaluationStatistics

    Whether to include the evaluationStatistics

    includeThresholdSummary

    Whether to include the thresholdSummary

    includeDemographicSummary

    Whether to include the demographicSummary

    includeCalibrationSummary

    Whether to include the calibrationSummary

    includePredictionDistribution

    Whether to include the predictionDistribution

    includeCovariateSummary

    Whether to include the covariateSummary

    save

    Whether to save the result or just return the transportable object

    reduceSize

    Remove parts of runPlp object that are not needed but take up space

    - -

    Details

    - -

This function is used to export a plpResult to a new location with sensitive data removed, so it can be shared.

    - -

    Examples

    -
if (FALSE) {
  transportPlp(plpResult, "s:/temp/exportTest", n = 10)
}

diff --git a/docs/reference/validateMultiplePlp.html b/docs/reference/validateMultiplePlp.html
deleted file mode 100644
index 06aac497f..000000000
--- a/docs/reference/validateMultiplePlp.html
+++ /dev/null
@@ -1,253 +0,0 @@

    This function loads all the models in a multiple plp analysis folder and -validates the models on new data

    -
    - -
    validateMultiplePlp(
    -  analysesLocation,
    -  validationDatabaseDetails,
    -  validationRestrictPlpDataSettings = createRestrictPlpDataSettings(),
    -  recalibrate = NULL,
    -  saveDirectory = NULL
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - -
    analysesLocation

    The location where the multiple plp analyses are

    validationDatabaseDetails

    The validation database settings created using createDatabaseDetails()

    validationRestrictPlpDataSettings

    The settings specifying the extra restriction settings when extracting the data created using createRestrictPlpDataSettings().

    recalibrate

    A vector of recalibration methods (currently supports 'RecalibrationintheLarge' and/or 'weakRecalibration')

    saveDirectory

The location to save the validation results

    - -

    Details

    - -

    Users need to input a location where the results of the multiple plp analyses -are found and the connection and database settings for the new data
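A hedged sketch of a call (the folder paths are placeholders, and validationDatabaseDetails is assumed to have been created with createDatabaseDetails()):

if (FALSE) {
  validateMultiplePlp(analysesLocation = "./multiplePlpResults",
                      validationDatabaseDetails = validationDatabaseDetails,
                      recalibrate = NULL,
                      saveDirectory = "./validation")
}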

    - -
    - -
    - - -
    - - -
    -

    Site built with pkgdown 1.6.1.

    -
    - -
    -
    - - - - - - - - diff --git a/docs/reference/viewDatabaseResultPlp.html b/docs/reference/viewDatabaseResultPlp.html deleted file mode 100644 index 0b4be6fe9..000000000 --- a/docs/reference/viewDatabaseResultPlp.html +++ /dev/null @@ -1,260 +0,0 @@ - - - - - - - - -open a local shiny app for viewing the result of a PLP analyses from a database — viewDatabaseResultPlp • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Open a local shiny app for viewing the results of a PLP analysis from a database.

viewDatabaseResultPlp(
  mySchema,
  myServer,
  myUser,
  myPassword,
  myDbms,
  myPort = NULL,
  myTableAppend
)

    Arguments

    mySchema

    Database result schema containing the result tables

    myServer

The server hosting the result database

    myUser

    Username for the connection to the result database

    myPassword

    Password for the connection to the result database

    myDbms

The database management system (dbms) of the result database

    myPort

    Port for the connection to the result database

    myTableAppend

    A string appended to the results tables (optional)


    Details


    Opens a shiny app for viewing the results of the models from a database
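A minimal sketch of opening the viewer against a result database; every value below is a hypothetical placeholder for your own connection details:

viewDatabaseResultPlp(
  mySchema = "plp_results",
  myServer = "localhost/plpResults",
  myUser = "user",
  myPassword = Sys.getenv("PLP_RESULT_PASSWORD"),  # avoid hard-coding credentials
  myDbms = "postgresql",
  myPort = 5432,
  myTableAppend = "plp_"
)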

diff --git a/docs/reference/viewMultiplePlp.html b/docs/reference/viewMultiplePlp.html
deleted file mode 100644
index 89200fb9e..000000000
--- a/docs/reference/viewMultiplePlp.html
+++ /dev/null
@@ -1,229 +0,0 @@

open a local shiny app for viewing the result of a multiple PLP analyses — viewMultiplePlp • PatientLevelPrediction

Open a local shiny app for viewing the results of multiple PLP analyses.

viewMultiplePlp(analysesLocation)

    Arguments

    analysesLocation

    The directory containing the results (with the analysis_x folders)


    Details


Opens a shiny app for viewing the results of the models across the various target (T), outcome (O), time-at-risk (TAR) and model settings.
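A minimal sketch, assuming the multiple analyses were saved to a hypothetical ./multiplePlpAnalyses folder containing the analysis_x subfolders:

viewMultiplePlp(analysesLocation = "./multiplePlpAnalyses")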

diff --git a/docs/reference/viewPlp.html b/docs/reference/viewPlp.html
deleted file mode 100644
index 7ad734083..000000000
--- a/docs/reference/viewPlp.html
+++ /dev/null
@@ -1,235 +0,0 @@

viewPlp - Interactively view the performance and model settings — viewPlp • PatientLevelPrediction

    This is a shiny app for viewing interactive plots of the performance and the settings

viewPlp(runPlp, validatePlp = NULL)

    Arguments

    runPlp

    The output of runPlp() (an object of class 'runPlp')

    validatePlp

The output of externalValidatePlp (an object of class 'validatePlp')


    Value


    Opens a shiny app for interactively viewing the results


    Details


Pass in the result of runPlp (and optionally the result of externalValidatePlp) to interactively view the performance plots and model settings.
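A minimal sketch, where plpResult is assumed to be the object returned by runPlp() and extVal an optional externalValidatePlp result (both object names are hypothetical):

viewPlp(runPlp = plpResult)
viewPlp(runPlp = plpResult, validatePlp = extVal)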

    - - - - - - - - diff --git a/inst/doc/AddingCustomAlgorithms.tex b/inst/doc/AddingCustomAlgorithms.tex deleted file mode 100644 index 125ff3257..000000000 --- a/inst/doc/AddingCustomAlgorithms.tex +++ /dev/null @@ -1,617 +0,0 @@ -% Options for packages loaded elsewhere -\PassOptionsToPackage{unicode}{hyperref} -\PassOptionsToPackage{hyphens}{url} -% -\documentclass[ -]{article} -\usepackage{lmodern} -\usepackage{amssymb,amsmath} -\usepackage{ifxetex,ifluatex} -\ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex - \usepackage[T1]{fontenc} - \usepackage[utf8]{inputenc} - \usepackage{textcomp} % provide euro and other symbols -\else % if luatex or xetex - \usepackage{unicode-math} - \defaultfontfeatures{Scale=MatchLowercase} - \defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1} -\fi -% Use upquote if available, for straight quotes in verbatim environments -\IfFileExists{upquote.sty}{\usepackage{upquote}}{} -\IfFileExists{microtype.sty}{% use microtype if available - \usepackage[]{microtype} - \UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts -}{} -\makeatletter -\@ifundefined{KOMAClassName}{% if non-KOMA class - \IfFileExists{parskip.sty}{% - \usepackage{parskip} - }{% else - \setlength{\parindent}{0pt} - \setlength{\parskip}{6pt plus 2pt minus 1pt}} -}{% if KOMA class - \KOMAoptions{parskip=half}} -\makeatother -\usepackage{xcolor} -\IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available -\IfFileExists{bookmark.sty}{\usepackage{bookmark}}{\usepackage{hyperref}} -\hypersetup{ - pdftitle={Adding Custom Patient-Level Prediction Algorithms}, - pdfauthor={Jenna Reps, Martijn J. Schuemie, Patrick B. Ryan, Peter R. Rijnbeek}, - hidelinks, - pdfcreator={LaTeX via pandoc}} -\urlstyle{same} % disable monospaced font for URLs -\usepackage[margin=1in]{geometry} -\usepackage{color} -\usepackage{fancyvrb} -\newcommand{\VerbBar}{|} -\newcommand{\VERB}{\Verb[commandchars=\\\{\}]} -\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}} -% Add ',fontsize=\small' for more characters per line -\usepackage{framed} -\definecolor{shadecolor}{RGB}{248,248,248} -\newenvironment{Shaded}{\begin{snugshade}}{\end{snugshade}} -\newcommand{\AlertTok}[1]{\textcolor[rgb]{0.94,0.16,0.16}{#1}} -\newcommand{\AnnotationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\AttributeTok}[1]{\textcolor[rgb]{0.77,0.63,0.00}{#1}} -\newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\BuiltInTok}[1]{#1} -\newcommand{\CharTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\CommentTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}} -\newcommand{\CommentVarTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\ConstantTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\ControlFlowTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}} -\newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{#1}} -\newcommand{\DecValTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\DocumentationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\ErrorTok}[1]{\textcolor[rgb]{0.64,0.00,0.00}{\textbf{#1}}} -\newcommand{\ExtensionTok}[1]{#1} -\newcommand{\FloatTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\ImportTok}[1]{#1} -\newcommand{\InformationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}} 
-\newcommand{\NormalTok}[1]{#1} -\newcommand{\OperatorTok}[1]{\textcolor[rgb]{0.81,0.36,0.00}{\textbf{#1}}} -\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{#1}} -\newcommand{\PreprocessorTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}} -\newcommand{\RegionMarkerTok}[1]{#1} -\newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\SpecialStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\StringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\VariableTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\VerbatimStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\WarningTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\usepackage{graphicx,grffile} -\makeatletter -\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi} -\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi} -\makeatother -% Scale images if necessary, so that they will not overflow the page -% margins by default, and it is still possible to overwrite the defaults -% using explicit options in \includegraphics[width, height, ...]{} -\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio} -% Set default figure placement to htbp -\makeatletter -\def\fps@figure{htbp} -\makeatother -\setlength{\emergencystretch}{3em} % prevent overfull lines -\providecommand{\tightlist}{% - \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}} -\setcounter{secnumdepth}{5} -\usepackage{fancyhdr} -\pagestyle{fancy} -\fancyhead{} -\fancyhead[CO,CE]{Custom Patient-Level Prediction Algorithms} -\fancyfoot[CO,CE]{PatientLevelPrediction Package Version 3.1.0} -\fancyfoot[LE,RO]{\thepage} -\renewcommand{\headrulewidth}{0.4pt} -\renewcommand{\footrulewidth}{0.4pt} - -\title{Adding Custom Patient-Level Prediction Algorithms} -\author{Jenna Reps, Martijn J. Schuemie, Patrick B. Ryan, Peter R. Rijnbeek} -\date{2020-06-03} - -\begin{document} -\maketitle - -{ -\setcounter{tocdepth}{2} -\tableofcontents -} -\hypertarget{introduction}{% -\section{Introduction}\label{introduction}} - -This vignette describes how you can add your own custom algorithms in -the Observational Health Data Sciencs and Informatics (OHDSI) -\href{http://github.com/OHDSI/PatientLevelPrediction}{\texttt{PatientLevelPrediction}} -package. This allows you to fully leverage the OHDSI -PatientLevelPrediction framework for model development and validation. -This vignette assumes you have read and are comfortable with building -single patient level prediction models as described in the -\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/BuildingPredictiveModels.pdf}{\texttt{BuildingPredictiveModels} -vignette}. - -\textbf{We invite you to share your new algorithms with the OHDSI -community through our -\href{http://github.com/OHDSI/PatientLevelPrediction}{GitHub -repository}.} - -\hypertarget{algorithm-code-structure}{% -\section{Algorithm Code Structure}\label{algorithm-code-structure}} - -Each algorithm in the package should be implemented in its own -\textless Name\textgreater.R file, e.g.~KNN.R, containing a -set\textless Name\textgreater{} function and a -fit\textless Name\textgreater{} function. Furthermore, a corresponding -predict function in predict.R is needed (if there isn't one available -that would work, see example at the end of the document). We will now -describe each of these functions in more detail below. 
- -\hypertarget{set}{% -\subsection{Set}\label{set}} - -The set\textless Name\textgreater{} is a function that takes as input -the different hyper-parameter values to do a grid search when training. -The output of the functions needs to be a list as class -\texttt{modelSettings} containing: - -\begin{itemize} -\tightlist -\item - param - all the combinations of the hyper-parameter values input -\item - model - a string specifying what function to call to fit the model -\item - name - a string containing the name of the model. -\end{itemize} - -For example, if you were adding a model called madeUp that has two -hyper-parameters then the set function should be: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{setMadeUp <-}\StringTok{ }\ControlFlowTok{function}\NormalTok{(}\DataTypeTok{a=}\DecValTok{1}\NormalTok{, }\DataTypeTok{b=}\DecValTok{2}\NormalTok{, }\DataTypeTok{seed=}\OtherTok{NULL}\NormalTok{)\{} - \CommentTok{# add input checks here...} - - \CommentTok{# now create list of all combinations:} -\NormalTok{ result <-}\StringTok{ }\KeywordTok{list}\NormalTok{(}\DataTypeTok{model=}\StringTok{'fitMadeUp'}\NormalTok{, }\CommentTok{# this will be called to train the made up model} - \DataTypeTok{param=} \KeywordTok{split}\NormalTok{(}\KeywordTok{expand.grid}\NormalTok{(}\DataTypeTok{a=}\NormalTok{a, } - \DataTypeTok{b=}\NormalTok{b,} - \DataTypeTok{seed=}\KeywordTok{ifelse}\NormalTok{(}\KeywordTok{is.null}\NormalTok{(seed),}\StringTok{'NULL'}\NormalTok{, seed)),} - \DecValTok{1}\OperatorTok{:}\NormalTok{(}\KeywordTok{length}\NormalTok{(a)}\OperatorTok{*}\KeywordTok{length}\NormalTok{(b) )),} - \DataTypeTok{name=}\StringTok{'Made Up Algorithm'} -\NormalTok{ )} - \KeywordTok{class}\NormalTok{(result) <-}\StringTok{ 'modelSettings'} - - \KeywordTok{return}\NormalTok{(result)} -\NormalTok{\}} -\end{Highlighting} -\end{Shaded} - -\hypertarget{fit}{% -\subsection{Fit}\label{fit}} - -This function should train your custom model for each parameter entry, -pick the best parameters and train a final model for that setting. 
- -The fit\textless Model\textgreater{} should have as inputs: - -\begin{itemize} -\tightlist -\item - population - the study popualation the model is being developed on -\item - plpData - the plpData object -\item - param - the hyper-parameters as a list of all combinations -\item - quiet - T or F indicating whether to output progress -\item - outcomeId - the outcome id -\item - cohortId - the target population id -\end{itemize} - -The fit function should return a list of class \texttt{plpModel} with -the following objects: - -\begin{itemize} -\tightlist -\item - model - a trained model -\item - modelSettings - a list containing the model and input param -\item - trainCVAuc - a value with the train AUC value -\item - hyperParamSearch - a dataframe with the hyperparameter grid and - corresponding AUCs -\item - metaData - the metaData from the plpData object -\item - populationSettings - the settings used to create the population and - define the time-at-risk -\item - outcomeId - the outcomeId being predicted -\item - cohortId - the cohortId corresponding to the target cohort -\item - varImp - a dataframe with the covaraites and a measure of importance -\item - trainingTime - how long it took to develop/evaluate the model -\item - covariateMap - if the plpData are converted to a matrix for model - compatibility this tells us what covariate each row in the matrix - correpsonds to and is need when implementing the model on new data -\end{itemize} - -The plpModel returned by fit also has a type attribute, this points to -the predict function, for example -\texttt{attr(result,\ \textquotesingle{}type\textquotesingle{})\ \textless{}-\ \textquotesingle{}madeup\textquotesingle{}} -means when the model is applied to new data, the `predict.madeup' -function in Predict.R is called. if this doesnt exist, then the model -will fail. Another attribute is the predictionType -\texttt{attr(result,\ \textquotesingle{}predictionType\textquotesingle{})\ \textless{}-\ \textquotesingle{}binary\textquotesingle{}} -this is currently not needed but may be important in the future when we -expand to regression or multiclass classification. 
- -For example: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{fitMadeUp <-}\StringTok{ }\ControlFlowTok{function}\NormalTok{(population, plpData, param, }\DataTypeTok{quiet=}\NormalTok{F,} -\NormalTok{ outcomeId, cohortId, ...)\{} - - \CommentTok{# **************** code to train the model here} - \CommentTok{# trainedModel <- this code should apply each hyper-parameter using the cross validation} - \CommentTok{# then pick out the best hyper-parameter setting} - \CommentTok{# and finally fit a model on the whole train data using the } - \CommentTok{# optimal hyper-parameter settings} - \CommentTok{# ****************} - - \CommentTok{# construct the standard output for a model:} -\NormalTok{ result <-}\StringTok{ }\KeywordTok{list}\NormalTok{(}\DataTypeTok{model =}\NormalTok{ trainedModel,} - \DataTypeTok{modelSettings =} \KeywordTok{list}\NormalTok{(}\DataTypeTok{model=}\StringTok{'made_up'}\NormalTok{, }\DataTypeTok{modelParameters=}\NormalTok{param),} - \DataTypeTok{trainCVAuc =} \OtherTok{NULL}\NormalTok{,} - \DataTypeTok{hyperParamSearch =}\NormalTok{ hyperSummary,} - \DataTypeTok{metaData =}\NormalTok{ plpData}\OperatorTok{$}\NormalTok{metaData,} - \DataTypeTok{populationSettings =} \KeywordTok{attr}\NormalTok{(population, }\StringTok{'metaData'}\NormalTok{),} - \DataTypeTok{outcomeId=}\NormalTok{outcomeId,}\CommentTok{# can use populationSettings$outcomeId?} - \DataTypeTok{cohortId=}\NormalTok{cohortId,} - \DataTypeTok{varImp =} \OtherTok{NULL}\NormalTok{,} - \DataTypeTok{trainingTime=}\NormalTok{comp,} - \DataTypeTok{covariateMap=}\NormalTok{result}\OperatorTok{$}\NormalTok{map} -\NormalTok{ )} - \KeywordTok{class}\NormalTok{(result) <-}\StringTok{ 'plpModel'} - \KeywordTok{attr}\NormalTok{(result, }\StringTok{'type'}\NormalTok{) <-}\StringTok{ 'madeup'} - \KeywordTok{attr}\NormalTok{(result, }\StringTok{'predictionType'}\NormalTok{) <-}\StringTok{ 'binary'} - \KeywordTok{return}\NormalTok{(result)} - -\NormalTok{\}} -\end{Highlighting} -\end{Shaded} - -You could make the fitMadeUp function cleaner by adding helper function -in the MadeUp.R file that are called by the fit function. As the end of -the fit function specified -\texttt{attr(result,\ \textquotesingle{}type\textquotesingle{})\ \textless{}-\ \textquotesingle{}madeup\textquotesingle{}} -we also need to make sure there is a \texttt{predict.madeup} function in -Predict.R: - -\hypertarget{predict}{% -\subsection{Predict}\label{predict}} - -The prediction function takes as input the plpModel returned by fit, a -population and corresponding plpData. It returns a data.frame with the -columns: - -\begin{itemize} -\tightlist -\item - rowId - the id for each person in the population -\item - value - the predicted risk from the plpModel -\end{itemize} - -If the population contains the columns outcomeCount and indexes, then -these are also in the output. 
- -For example: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{predict.madeup <-}\StringTok{ }\ControlFlowTok{function}\NormalTok{(plpModel,population, plpData, ...)\{ } - - \CommentTok{# ************* code to do prediction for each rowId in population} - \CommentTok{# prediction <- code to do prediction here returning columns: rowId } - \CommentTok{# and value (predicted risk)} - \CommentTok{#**************} - -\NormalTok{ prediction <-}\StringTok{ }\KeywordTok{merge}\NormalTok{(population, prediction, }\DataTypeTok{by=}\StringTok{'rowId'}\NormalTok{)} -\NormalTok{ prediction <-}\StringTok{ }\NormalTok{prediction[,}\KeywordTok{colnames}\NormalTok{(prediction)}\OperatorTok{%in%}\KeywordTok{c}\NormalTok{(}\StringTok{'rowId'}\NormalTok{,}\StringTok{'outcomeCount'}\NormalTok{,} - \StringTok{'indexes'}\NormalTok{, }\StringTok{'value'}\NormalTok{)] } - \KeywordTok{attr}\NormalTok{(prediction, }\StringTok{"metaData"}\NormalTok{) <-}\StringTok{ }\KeywordTok{list}\NormalTok{(}\DataTypeTok{predictionType =} \StringTok{"binary"}\NormalTok{) } - \KeywordTok{return}\NormalTok{(prediction)} - -\NormalTok{\}} -\end{Highlighting} -\end{Shaded} - -\hypertarget{algorithm-example}{% -\section{Algorithm Example}\label{algorithm-example}} - -Below a fully functional algorithm example is given, however we highly -recommend you to have a look at the available algorithms in the package. - -\hypertarget{set-1}{% -\subsection{Set}\label{set-1}} - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{setMadeUp <-}\StringTok{ }\ControlFlowTok{function}\NormalTok{(}\DataTypeTok{a=}\DecValTok{1}\NormalTok{, }\DataTypeTok{b=}\DecValTok{2}\NormalTok{, }\DataTypeTok{seed=}\OtherTok{NULL}\NormalTok{)\{} - \CommentTok{# check a is valid positive value} - \ControlFlowTok{if}\NormalTok{(}\KeywordTok{missing}\NormalTok{(a))\{} - \KeywordTok{stop}\NormalTok{(}\StringTok{'a must be input'}\NormalTok{)} -\NormalTok{ \}} - \ControlFlowTok{if}\NormalTok{(}\OperatorTok{!}\KeywordTok{class}\NormalTok{(a)}\OperatorTok{%in%}\KeywordTok{c}\NormalTok{(}\StringTok{'numeric'}\NormalTok{,}\StringTok{'integer'}\NormalTok{)\{} - \KeywordTok{stop}\NormalTok{(}\StringTok{'a must be numeric'}\NormalTok{)} -\NormalTok{ \}} - \ControlFlowTok{if}\NormalTok{(a }\OperatorTok{<}\StringTok{ }\DecValTok{0}\NormalTok{)\{} - \KeywordTok{stop}\NormalTok{(}\StringTok{'a must be positive'}\NormalTok{)} -\NormalTok{ \}} - \CommentTok{# check b is numeric} - \ControlFlowTok{if}\NormalTok{(}\KeywordTok{missing}\NormalTok{(b))\{} - \KeywordTok{stop}\NormalTok{(}\StringTok{'b must be input'}\NormalTok{)} -\NormalTok{ \}} - \ControlFlowTok{if}\NormalTok{(}\OperatorTok{!}\KeywordTok{class}\NormalTok{(b)}\OperatorTok{%in%}\KeywordTok{c}\NormalTok{(}\StringTok{'numeric'}\NormalTok{,}\StringTok{'integer'}\NormalTok{)\{} - \KeywordTok{stop}\NormalTok{(}\StringTok{'b must be numeric'}\NormalTok{)} -\NormalTok{ \}} - - \CommentTok{# now create list of all combinations:} -\NormalTok{ result <-}\StringTok{ }\KeywordTok{list}\NormalTok{(}\DataTypeTok{model=}\StringTok{'fitMadeUp'}\NormalTok{, } - \DataTypeTok{param=} \KeywordTok{split}\NormalTok{(}\KeywordTok{expand.grid}\NormalTok{(}\DataTypeTok{a=}\NormalTok{a, } - \DataTypeTok{b=}\NormalTok{b,} - \DataTypeTok{seed=}\KeywordTok{ifelse}\NormalTok{(}\KeywordTok{is.null}\NormalTok{(seed),}\StringTok{'NULL'}\NormalTok{, seed)),} - \DecValTok{1}\OperatorTok{:}\NormalTok{(}\KeywordTok{length}\NormalTok{(a)}\OperatorTok{*}\KeywordTok{length}\NormalTok{(b) )),} - \DataTypeTok{name=}\StringTok{'Made Up Algorithm'} 
-\NormalTok{ )} - \KeywordTok{class}\NormalTok{(result) <-}\StringTok{ 'modelSettings'} - - \KeywordTok{return}\NormalTok{(result)} - - -\ErrorTok{\}} -\end{Highlighting} -\end{Shaded} - -\hypertarget{fit-1}{% -\subsection{Fit}\label{fit-1}} - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{fitMadeUp <-}\StringTok{ }\ControlFlowTok{function}\NormalTok{(population, plpData, param, }\DataTypeTok{quiet=}\NormalTok{F,} -\NormalTok{ outcomeId, cohortId, ...)\{} - \ControlFlowTok{if}\NormalTok{(}\OperatorTok{!}\NormalTok{quiet)} - \KeywordTok{writeLines}\NormalTok{(}\StringTok{'Training Made Up model'}\NormalTok{)} - - \ControlFlowTok{if}\NormalTok{(param[[}\DecValTok{1}\NormalTok{]]}\OperatorTok{$}\NormalTok{seed}\OperatorTok{!=}\StringTok{'NULL'}\NormalTok{)} - \KeywordTok{set.seed}\NormalTok{(param[[}\DecValTok{1}\NormalTok{]]}\OperatorTok{$}\NormalTok{seed)} - - \CommentTok{# check plpData is coo format:} - \ControlFlowTok{if}\NormalTok{(}\OperatorTok{!}\StringTok{'ffdf'}\OperatorTok{%in%}\KeywordTok{class}\NormalTok{(plpData}\OperatorTok{$}\NormalTok{covariates) )} - \KeywordTok{stop}\NormalTok{(}\StringTok{'This algorithm requires plpData in coo format'}\NormalTok{)} - -\NormalTok{ metaData <-}\StringTok{ }\KeywordTok{attr}\NormalTok{(population, }\StringTok{'metaData'}\NormalTok{)} - \ControlFlowTok{if}\NormalTok{(}\OperatorTok{!}\KeywordTok{is.null}\NormalTok{(population}\OperatorTok{$}\NormalTok{indexes))} -\NormalTok{ population <-}\StringTok{ }\NormalTok{population[population}\OperatorTok{$}\NormalTok{indexes}\OperatorTok{>}\DecValTok{0}\NormalTok{,]} - \KeywordTok{attr}\NormalTok{(population, }\StringTok{'metaData'}\NormalTok{) <-}\StringTok{ }\NormalTok{metaData} - - \CommentTok{# convert data into sparse R Matrix:} -\NormalTok{ result <-}\StringTok{ }\KeywordTok{toSparseM}\NormalTok{(plpData,population,}\DataTypeTok{map=}\OtherTok{NULL}\NormalTok{)} -\NormalTok{ data <-}\StringTok{ }\NormalTok{result}\OperatorTok{$}\NormalTok{data} - -\NormalTok{ data <-}\StringTok{ }\NormalTok{data[population}\OperatorTok{$}\NormalTok{rowId,]} - - \CommentTok{# set test/train sets (for printing performance as it trains)} - \ControlFlowTok{if}\NormalTok{(}\OperatorTok{!}\NormalTok{quiet)} - \KeywordTok{writeLines}\NormalTok{(}\KeywordTok{paste0}\NormalTok{(}\StringTok{'Training made up model on train set containing '}\NormalTok{, }\KeywordTok{nrow}\NormalTok{(population), } - \StringTok{' people with '}\NormalTok{,}\KeywordTok{sum}\NormalTok{(population}\OperatorTok{$}\NormalTok{outcomeCount}\OperatorTok{>}\DecValTok{0}\NormalTok{), }\StringTok{' outcomes'}\NormalTok{))} -\NormalTok{ start <-}\StringTok{ }\KeywordTok{Sys.time}\NormalTok{()} - - \CommentTok{#============= STEP 1 ======================================} - \CommentTok{# pick the best hyper-params and then do final training on all data...} - \KeywordTok{writeLines}\NormalTok{(}\StringTok{'train'}\NormalTok{)} -\NormalTok{ datas <-}\StringTok{ }\KeywordTok{list}\NormalTok{(}\DataTypeTok{population=}\NormalTok{population, }\DataTypeTok{data=}\NormalTok{data)} -\NormalTok{ param.sel <-}\StringTok{ }\KeywordTok{lapply}\NormalTok{(param, }\ControlFlowTok{function}\NormalTok{(x) }\KeywordTok{do.call}\NormalTok{(made_up_model, }\KeywordTok{c}\NormalTok{(x,datas) ))} -\NormalTok{ hyperSummary <-}\StringTok{ }\KeywordTok{do.call}\NormalTok{(rbind, }\KeywordTok{lapply}\NormalTok{(param.sel, }\ControlFlowTok{function}\NormalTok{(x) x}\OperatorTok{$}\NormalTok{hyperSum))} -\NormalTok{ hyperSummary <-}\StringTok{ 
}\KeywordTok{as.data.frame}\NormalTok{(hyperSummary)} -\NormalTok{ hyperSummary}\OperatorTok{$}\NormalTok{auc <-}\StringTok{ }\KeywordTok{unlist}\NormalTok{(}\KeywordTok{lapply}\NormalTok{(param.sel, }\ControlFlowTok{function}\NormalTok{(x) x}\OperatorTok{$}\NormalTok{auc)) } -\NormalTok{ param.sel <-}\StringTok{ }\KeywordTok{unlist}\NormalTok{(}\KeywordTok{lapply}\NormalTok{(param.sel, }\ControlFlowTok{function}\NormalTok{(x) x}\OperatorTok{$}\NormalTok{auc))} -\NormalTok{ param <-}\StringTok{ }\NormalTok{param[[}\KeywordTok{which.max}\NormalTok{(param.sel)]]} - - \CommentTok{# set this so you do a final model train } -\NormalTok{ param}\OperatorTok{$}\NormalTok{final=T} - - \KeywordTok{writeLines}\NormalTok{(}\StringTok{'final train'}\NormalTok{)} -\NormalTok{ trainedModel <-}\StringTok{ }\KeywordTok{do.call}\NormalTok{(made_up_model, }\KeywordTok{c}\NormalTok{(param,datas) )}\OperatorTok{$}\NormalTok{model} - -\NormalTok{ comp <-}\StringTok{ }\KeywordTok{Sys.time}\NormalTok{() }\OperatorTok{-}\StringTok{ }\NormalTok{start} - \ControlFlowTok{if}\NormalTok{(}\OperatorTok{!}\NormalTok{quiet)} - \KeywordTok{writeLines}\NormalTok{(}\KeywordTok{paste0}\NormalTok{(}\StringTok{'Model Made Up trained - took:'}\NormalTok{, }\KeywordTok{format}\NormalTok{(comp, }\DataTypeTok{digits=}\DecValTok{3}\NormalTok{)))} - - \CommentTok{# construct the standard output for a model:} -\NormalTok{ result <-}\StringTok{ }\KeywordTok{list}\NormalTok{(}\DataTypeTok{model =}\NormalTok{ trainedModel,} - \DataTypeTok{modelSettings =} \KeywordTok{list}\NormalTok{(}\DataTypeTok{model=}\StringTok{'made_up'}\NormalTok{, }\DataTypeTok{modelParameters=}\NormalTok{param),} - \DataTypeTok{trainCVAuc =} \OtherTok{NULL}\NormalTok{,} - \DataTypeTok{hyperParamSearch =}\NormalTok{ hyperSummary,} - \DataTypeTok{metaData =}\NormalTok{ plpData}\OperatorTok{$}\NormalTok{metaData,} - \DataTypeTok{populationSettings =} \KeywordTok{attr}\NormalTok{(population, }\StringTok{'metaData'}\NormalTok{),} - \DataTypeTok{outcomeId=}\NormalTok{outcomeId,}\CommentTok{# can use populationSettings$outcomeId?} - \DataTypeTok{cohortId=}\NormalTok{cohortId,} - \DataTypeTok{varImp =} \OtherTok{NULL}\NormalTok{,} - \DataTypeTok{trainingTime=}\NormalTok{comp,} - \DataTypeTok{covariateMap=}\NormalTok{result}\OperatorTok{$}\NormalTok{map} -\NormalTok{ )} - \KeywordTok{class}\NormalTok{(result) <-}\StringTok{ 'plpModel'} - \KeywordTok{attr}\NormalTok{(result, }\StringTok{'type'}\NormalTok{) <-}\StringTok{ 'madeup'} - \KeywordTok{attr}\NormalTok{(result, }\StringTok{'predictionType'}\NormalTok{) <-}\StringTok{ 'binary'} - \KeywordTok{return}\NormalTok{(result)} - -\NormalTok{\}} -\end{Highlighting} -\end{Shaded} - -\hypertarget{helpers}{% -\subsection{Helpers}\label{helpers}} - -In the fit model a helper function \texttt{made\_up\_model} is called, -this is the function that trains a model given the data and population -(where the popualtion contains a column outcomeCount corresponding to -the outcome). 
Both the data and population are ordered the same way: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{made_up_model <-}\StringTok{ }\ControlFlowTok{function}\NormalTok{(data, population,} - \DataTypeTok{a=}\DecValTok{1}\NormalTok{,}\DataTypeTok{b=}\DecValTok{1}\NormalTok{, }\DataTypeTok{final=}\NormalTok{F, ...)\{} - - \KeywordTok{writeLines}\NormalTok{(}\KeywordTok{paste}\NormalTok{(}\StringTok{'Training Made Up model with '}\NormalTok{,}\KeywordTok{length}\NormalTok{(}\KeywordTok{unique}\NormalTok{(population}\OperatorTok{$}\NormalTok{indexes)),} - \StringTok{' fold CV'}\NormalTok{))} - \ControlFlowTok{if}\NormalTok{(}\OperatorTok{!}\KeywordTok{is.null}\NormalTok{(population}\OperatorTok{$}\NormalTok{indexes) }\OperatorTok{&&}\StringTok{ }\NormalTok{final}\OperatorTok{==}\NormalTok{F)\{} -\NormalTok{ index_vect <-}\StringTok{ }\KeywordTok{unique}\NormalTok{(population}\OperatorTok{$}\NormalTok{indexes)} -\NormalTok{ perform <-}\StringTok{ }\KeywordTok{c}\NormalTok{()} - - \CommentTok{# create prediction matrix to store all predictions} -\NormalTok{ predictionMat <-}\StringTok{ }\NormalTok{population} -\NormalTok{ predictionMat}\OperatorTok{$}\NormalTok{value <-}\StringTok{ }\DecValTok{0} - \KeywordTok{attr}\NormalTok{(predictionMat, }\StringTok{"metaData"}\NormalTok{) <-}\StringTok{ }\KeywordTok{list}\NormalTok{(}\DataTypeTok{predictionType =} \StringTok{"binary"}\NormalTok{)} - - \ControlFlowTok{for}\NormalTok{(index }\ControlFlowTok{in} \DecValTok{1}\OperatorTok{:}\KeywordTok{length}\NormalTok{(index_vect ))\{} - \KeywordTok{writeLines}\NormalTok{(}\KeywordTok{paste}\NormalTok{(}\StringTok{'Fold '}\NormalTok{,index, }\StringTok{' -- with '}\NormalTok{, }\KeywordTok{sum}\NormalTok{(population}\OperatorTok{$}\NormalTok{indexes}\OperatorTok{!=}\NormalTok{index),} - \StringTok{'train rows'}\NormalTok{))} -\NormalTok{ model <-}\StringTok{ }\NormalTok{madeup}\OperatorTok{::}\KeywordTok{model}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ data[population}\OperatorTok{$}\NormalTok{indexes}\OperatorTok{!=}\NormalTok{index,],} - \DataTypeTok{y=}\NormalTok{ population}\OperatorTok{$}\NormalTok{outcomeCount[population}\OperatorTok{$}\NormalTok{indexes}\OperatorTok{!=}\NormalTok{index],} - \DataTypeTok{a=}\NormalTok{a, }\DataTypeTok{b=}\NormalTok{b)} - -\NormalTok{ pred <-}\StringTok{ }\NormalTok{stats}\OperatorTok{::}\KeywordTok{predict}\NormalTok{(model, data[population}\OperatorTok{$}\NormalTok{indexes}\OperatorTok{==}\NormalTok{index,])} -\NormalTok{ prediction <-}\StringTok{ }\NormalTok{population[population}\OperatorTok{$}\NormalTok{indexes}\OperatorTok{==}\NormalTok{index,]} -\NormalTok{ prediction}\OperatorTok{$}\NormalTok{value <-}\StringTok{ }\NormalTok{pred} - \KeywordTok{attr}\NormalTok{(prediction, }\StringTok{"metaData"}\NormalTok{) <-}\StringTok{ }\KeywordTok{list}\NormalTok{(}\DataTypeTok{predictionType =} \StringTok{"binary"}\NormalTok{)} -\NormalTok{ aucVal <-}\StringTok{ }\KeywordTok{computeAuc}\NormalTok{(prediction)} -\NormalTok{ perform <-}\StringTok{ }\KeywordTok{c}\NormalTok{(perform,aucVal)} - - \CommentTok{# add the fold predictions and compute AUC after loop} -\NormalTok{ predictionMat}\OperatorTok{$}\NormalTok{value[population}\OperatorTok{$}\NormalTok{indexes}\OperatorTok{==}\NormalTok{index] <-}\StringTok{ }\NormalTok{pred} - -\NormalTok{ \}} - \CommentTok{##auc <- mean(perform) # want overal rather than mean} -\NormalTok{ auc <-}\StringTok{ }\KeywordTok{computeAuc}\NormalTok{(predictionMat)} - -\NormalTok{ foldPerm <-}\StringTok{ }\NormalTok{perform} -\NormalTok{ \} 
}\ControlFlowTok{else}\NormalTok{ \{} -\NormalTok{ model <-}\StringTok{ }\NormalTok{madeup}\OperatorTok{::}\KeywordTok{model}\NormalTok{(}\DataTypeTok{x=}\NormalTok{ data, } - \DataTypeTok{y=}\NormalTok{ population}\OperatorTok{$}\NormalTok{outcomeCount,} - \DataTypeTok{a=}\NormalTok{a,}\DataTypeTok{b=}\NormalTok{b)} - -\NormalTok{ pred <-}\StringTok{ }\NormalTok{stats}\OperatorTok{::}\KeywordTok{predict}\NormalTok{(model, data)} -\NormalTok{ prediction <-}\StringTok{ }\NormalTok{population} -\NormalTok{ prediction}\OperatorTok{$}\NormalTok{value <-}\StringTok{ }\NormalTok{pred} - \KeywordTok{attr}\NormalTok{(prediction, }\StringTok{"metaData"}\NormalTok{) <-}\StringTok{ }\KeywordTok{list}\NormalTok{(}\DataTypeTok{predictionType =} \StringTok{"binary"}\NormalTok{) } -\NormalTok{ auc <-}\StringTok{ }\KeywordTok{computeAuc}\NormalTok{(prediction)} -\NormalTok{ foldPerm <-}\StringTok{ }\NormalTok{auc} -\NormalTok{ \}} - -\NormalTok{ result <-}\StringTok{ }\KeywordTok{list}\NormalTok{(}\DataTypeTok{model=}\NormalTok{model,} - \DataTypeTok{auc=}\NormalTok{auc,} - \DataTypeTok{hyperSum =} \KeywordTok{unlist}\NormalTok{(}\KeywordTok{list}\NormalTok{(}\DataTypeTok{a =}\NormalTok{ a, }\DataTypeTok{b =}\NormalTok{ b, }\DataTypeTok{fold_auc=}\NormalTok{foldPerm))} -\NormalTok{ )} - \KeywordTok{return}\NormalTok{(result)} -\NormalTok{\}} -\end{Highlighting} -\end{Shaded} - -\hypertarget{predict-1}{% -\subsection{Predict}\label{predict-1}} - -The final step is to create a predict function for the model. This gets -added to the predict.R file. In the example above the type -\texttt{attr(result,\ \textquotesingle{}type\textquotesingle{})\ \textless{}-\ \textquotesingle{}madeup\textquotesingle{}} -was madeup, so a \texttt{predict.madeup} function is required to be -added into the predict.R. The predict function needs to take as input -the plpModel returned by the fit function, the population to apply the -model on and the plpData specifying the covariates of the population. 
- -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{predict.madeup <-}\StringTok{ }\ControlFlowTok{function}\NormalTok{(plpModel,population, plpData, ...)\{ } -\NormalTok{ result <-}\StringTok{ }\KeywordTok{toSparseM}\NormalTok{(plpData, population, }\DataTypeTok{map=}\NormalTok{plpModel}\OperatorTok{$}\NormalTok{covariateMap)} -\NormalTok{ data <-}\StringTok{ }\NormalTok{result}\OperatorTok{$}\NormalTok{data[population}\OperatorTok{$}\NormalTok{rowId,]} -\NormalTok{ prediction <-}\StringTok{ }\KeywordTok{data.frame}\NormalTok{(}\DataTypeTok{rowId=}\NormalTok{population}\OperatorTok{$}\NormalTok{rowId, } - \DataTypeTok{value=}\NormalTok{stats}\OperatorTok{::}\KeywordTok{predict}\NormalTok{(plpModel}\OperatorTok{$}\NormalTok{model, data)} -\NormalTok{ )} - -\NormalTok{ prediction <-}\StringTok{ }\KeywordTok{merge}\NormalTok{(population, prediction, }\DataTypeTok{by=}\StringTok{'rowId'}\NormalTok{)} -\NormalTok{ prediction <-}\StringTok{ }\NormalTok{prediction[,}\KeywordTok{colnames}\NormalTok{(prediction)}\OperatorTok{%in%} -\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{'rowId'}\NormalTok{,}\StringTok{'outcomeCount'}\NormalTok{,}\StringTok{'indexes'}\NormalTok{, }\StringTok{'value'}\NormalTok{)] }\CommentTok{# need to fix no index issue} - \KeywordTok{attr}\NormalTok{(prediction, }\StringTok{"metaData"}\NormalTok{) <-}\StringTok{ }\KeywordTok{list}\NormalTok{(}\DataTypeTok{predictionType =} \StringTok{"binary"}\NormalTok{) } - \KeywordTok{return}\NormalTok{(prediction)} - -\NormalTok{\}} -\end{Highlighting} -\end{Shaded} - -As the madeup model uses the standard R prediction, it has the same -prediction function as xgboost, so we could have not added a new -prediction function and instead made the type of the result returned by -fitMadeUpModel to -\texttt{attr(result,\ \textquotesingle{}type\textquotesingle{})\ \textless{}-\ \textquotesingle{}xgboost\textquotesingle{}}. - -\hypertarget{acknowledgments}{% -\section{Acknowledgments}\label{acknowledgments}} - -Considerable work has been dedicated to provide the -\texttt{PatientLevelPrediction} package. - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{citation}\NormalTok{(}\StringTok{"PatientLevelPrediction"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -## -## To cite PatientLevelPrediction in publications use: -## -## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design and -## implementation of a standardized framework to generate and evaluate patient-level -## prediction models using observational healthcare data." _Journal of the American -## Medical Informatics Association_, *25*(8), 969-975. . -## -## A BibTeX entry for LaTeX users is -## -## @Article{, -## author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek}, -## title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data}, -## journal = {Journal of the American Medical Informatics Association}, -## volume = {25}, -## number = {8}, -## pages = {969-975}, -## year = {2018}, -## url = {https://doi.org/10.1093/jamia/ocy032}, -## } -\end{verbatim} - -\textbf{Please reference this paper if you use the PLP Package in your -work:} - -\href{http://dx.doi.org/10.1093/jamia/ocy032}{Reps JM, Schuemie MJ, -Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a -standardized framework to generate and evaluate patient-level prediction -models using observational healthcare data. J Am Med Inform Assoc. 
-2018;25(8):969-975.} - -This work is supported in part through the National Science Foundation -grant IIS 1251151. - -\end{document} diff --git a/inst/doc/AddingCustomFeatureEngineering.pdf b/inst/doc/AddingCustomFeatureEngineering.pdf index 4d97db3c5..e4e8220ce 100644 Binary files a/inst/doc/AddingCustomFeatureEngineering.pdf and b/inst/doc/AddingCustomFeatureEngineering.pdf differ diff --git a/inst/doc/BuildingDeepLearningModels.tex b/inst/doc/BuildingDeepLearningModels.tex deleted file mode 100644 index 8917b0849..000000000 --- a/inst/doc/BuildingDeepLearningModels.tex +++ /dev/null @@ -1,741 +0,0 @@ -% Options for packages loaded elsewhere -\PassOptionsToPackage{unicode}{hyperref} -\PassOptionsToPackage{hyphens}{url} -% -\documentclass[ -]{article} -\usepackage{lmodern} -\usepackage{amssymb,amsmath} -\usepackage{ifxetex,ifluatex} -\ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex - \usepackage[T1]{fontenc} - \usepackage[utf8]{inputenc} - \usepackage{textcomp} % provide euro and other symbols -\else % if luatex or xetex - \usepackage{unicode-math} - \defaultfontfeatures{Scale=MatchLowercase} - \defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1} -\fi -% Use upquote if available, for straight quotes in verbatim environments -\IfFileExists{upquote.sty}{\usepackage{upquote}}{} -\IfFileExists{microtype.sty}{% use microtype if available - \usepackage[]{microtype} - \UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts -}{} -\makeatletter -\@ifundefined{KOMAClassName}{% if non-KOMA class - \IfFileExists{parskip.sty}{% - \usepackage{parskip} - }{% else - \setlength{\parindent}{0pt} - \setlength{\parskip}{6pt plus 2pt minus 1pt}} -}{% if KOMA class - \KOMAoptions{parskip=half}} -\makeatother -\usepackage{xcolor} -\IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available -\IfFileExists{bookmark.sty}{\usepackage{bookmark}}{\usepackage{hyperref}} -\hypersetup{ - pdftitle={Building Deep Learning Models}, - pdfauthor={Peter R. 
Rijnbeek, Seng Chan You, Xiaoyong Pan, Jenna Reps}, - hidelinks, - pdfcreator={LaTeX via pandoc}} -\urlstyle{same} % disable monospaced font for URLs -\usepackage[margin=1in]{geometry} -\usepackage{color} -\usepackage{fancyvrb} -\newcommand{\VerbBar}{|} -\newcommand{\VERB}{\Verb[commandchars=\\\{\}]} -\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}} -% Add ',fontsize=\small' for more characters per line -\usepackage{framed} -\definecolor{shadecolor}{RGB}{248,248,248} -\newenvironment{Shaded}{\begin{snugshade}}{\end{snugshade}} -\newcommand{\AlertTok}[1]{\textcolor[rgb]{0.94,0.16,0.16}{#1}} -\newcommand{\AnnotationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\AttributeTok}[1]{\textcolor[rgb]{0.77,0.63,0.00}{#1}} -\newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\BuiltInTok}[1]{#1} -\newcommand{\CharTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\CommentTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}} -\newcommand{\CommentVarTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\ConstantTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\ControlFlowTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}} -\newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{#1}} -\newcommand{\DecValTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\DocumentationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\ErrorTok}[1]{\textcolor[rgb]{0.64,0.00,0.00}{\textbf{#1}}} -\newcommand{\ExtensionTok}[1]{#1} -\newcommand{\FloatTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\ImportTok}[1]{#1} -\newcommand{\InformationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}} -\newcommand{\NormalTok}[1]{#1} -\newcommand{\OperatorTok}[1]{\textcolor[rgb]{0.81,0.36,0.00}{\textbf{#1}}} -\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{#1}} -\newcommand{\PreprocessorTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}} -\newcommand{\RegionMarkerTok}[1]{#1} -\newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\SpecialStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\StringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\VariableTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\VerbatimStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\WarningTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\usepackage{longtable,booktabs} -% Correct order of tables after \paragraph or \subparagraph -\usepackage{etoolbox} -\makeatletter -\patchcmd\longtable{\par}{\if@noskipsec\mbox{}\fi\par}{}{} -\makeatother -% Allow footnotes in longtable head/foot -\IfFileExists{footnotehyper.sty}{\usepackage{footnotehyper}}{\usepackage{footnote}} -\makesavenoteenv{longtable} -\usepackage{graphicx,grffile} -\makeatletter -\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi} -\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi} -\makeatother -% Scale images if necessary, so that they will not overflow the page -% margins by default, and it is still possible to overwrite the defaults -% using explicit options in \includegraphics[width, height, ...]{} -\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio} -% Set default figure placement to htbp -\makeatletter 
-\def\fps@figure{htbp} -\makeatother -\setlength{\emergencystretch}{3em} % prevent overfull lines -\providecommand{\tightlist}{% - \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}} -\setcounter{secnumdepth}{5} -\usepackage{fancyhdr} -\pagestyle{fancy} -\fancyhead{} -\fancyhead[CO,CE]{Building Deep Learning Models} -\fancyfoot[CO,CE]{PatientLevelPrediction Package Version 3.1.0} -\fancyfoot[LE,RO]{\thepage} -\renewcommand{\headrulewidth}{0.4pt} -\renewcommand{\footrulewidth}{0.4pt} - -\title{Building Deep Learning Models} -\author{Peter R. Rijnbeek, Seng Chan You, Xiaoyong Pan, Jenna Reps} -\date{2020-06-03} - -\begin{document} -\maketitle - -{ -\setcounter{tocdepth}{2} -\tableofcontents -} -\hypertarget{introduction}{% -\section{Introduction}\label{introduction}} - -Electronic Health Records (EHR) data is high dimensional, heterogeneous, -and sparse, which makes predictive modelling a challenge. In the early -days, the machine learning community mainly focused on algorithm -development, currently there is a shift to more powerful feature -engineering. Deep Learning models are widely used to automatically learn -high-level feature representations from the data, and have achieved -remarkable results in image processing, speech recognition and -computational biology. Recently, interesting results have been shown -using EHRs, but more extensive research is needed to assess the power of -Deep Learning in this domain. - -This vignette describes how you can use the Observational Health Data -Sciences and Informatics (OHDSI) -\href{http://github.com/OHDSI/PatientLevelPrediction}{\texttt{PatientLevelPrediction}} -package to build Deep Learning models. This vignette assumes you have -read and are comfortable with building patient level prediction models -as described in the -\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/BuildingPredictiveModels.pdf}{\texttt{BuildingPredictiveModels} -vignette}. Furthermore, this vignette assumes you are familiar with Deep -Learning methods. - -\hypertarget{background}{% -\section{Background}\label{background}} - -Deep Learning models are build by stacking an often large number of -neural network layers that perform feature engineering steps, e.g -embedding, and are collapsed in a final softmax layer (basically a -logistic regression layer). These algorithms need a lot of data to -converge to a good representation, but currently the sizes of the EHR -databases are growing fast which would make Deep Learning an interesting -approach to test within OHDSI's -\href{https://academic.oup.com/jamia/article/25/8/969/4989437}{Patient-Level -Prediction Framework}. The current implementation allows us to perform -research at scale on the value and limitations of Deep Learning using -EHR data. For relatively small Target and Outcome cohorts, Deep Learning -is most probably not the best choice. - -Most current Deep Learning research is performed in python and we have -developed a pipeline to interact with python. Multiple Deep Learning -backends have been developed, e.g.~Tensorflow, PyTorch, Keras (recently -also available in R) etc. In the package we have implemented interaction -with Keras in R and PyTorch in Python but we invite the community to add -other backends. - -Many network architectures have recently been proposed and we have -implemented a number of them, however, this list will grow in the near -future. 
It is important to understand that some of these architectures -require a 2D data matrix, -i.e.~\textbar patient\textbar x\textbar feature\textbar, and others use -a 3D data matrix -\textbar patient\textbar x\textbar feature\textbar x\textbar time\textbar. -The \href{www.github.com/ohdsi/FeatureExtraction}{FeatureExtraction -Package} has been extended to enable the extraction of both data formats -as will be described with examples below. - -Note that training Deep Learning models is computationally intensive, -our implementation therefore supports both GPU and CPU. It will -automatically check whether there is GPU or not in your computer. A GPU -is highly recommended for Deep Learning! - -\hypertarget{non-temporal-architectures}{% -\section{Non-Temporal Architectures}\label{non-temporal-architectures}} - -We implemented the following non-temporal (2D data matrix) architectures -using PyTorch: - -\begin{verbatim} -1) Logistics regression (LRTorch) - A simple softmax layer with l2 regularization - -2) Feed forward network (MLPTorch) - Supports multilayer perceptron (mlp_type = MLP) and - Self-Normalizing Neural Networks (mlp_type = SNN) - Reference: https://arxiv.org/abs/1706.02515 -\end{verbatim} - -For the above two methods, we implemented support for a stacked -autoencoder and a variational autoencoder to reduce the feature -dimension as a first step. These autoencoders learn efficient data -encodings in an unsupervised manner by stacking multiple layers in a -neural network. Compared to the standard implementations of LR and MLP -these implementations can use the GPU power to speed up the gradient -descent approach in the back propagation to optimize the weights of the -classifier. - -Table 1: Non-Temporal Deep Learning Models Hyper-Parameters - -\begin{longtable}[]{@{}lll@{}} -\toprule -\begin{minipage}[b]{0.10\columnwidth}\raggedright -Name\strut -\end{minipage} & \begin{minipage}[b]{0.34\columnwidth}\raggedright -Description\strut -\end{minipage} & \begin{minipage}[b]{0.47\columnwidth}\raggedright -Hyper-parameters\strut -\end{minipage}\tabularnewline -\midrule -\endhead -\begin{minipage}[t]{0.10\columnwidth}\raggedright -LRTorch\strut -\end{minipage} & \begin{minipage}[t]{0.34\columnwidth}\raggedright -Logistic Regression Model\strut -\end{minipage} & \begin{minipage}[t]{0.47\columnwidth}\raggedright -w\_decay (l2 regularization), epochs (number of epochs), class\_weight -(0 = inverse ratio between number of positive and negative examples, -1 -= focal loss (\url{https://arxiv.org/abs/1708.02002}), or other), -autoencoder (apply stacked autoencoder?, vae (apply variational -autoencoder)\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.10\columnwidth}\raggedright -MLPTorch\strut -\end{minipage} & \begin{minipage}[t]{0.34\columnwidth}\raggedright -Multi-Layer Perceptron Model\strut -\end{minipage} & \begin{minipage}[t]{0.47\columnwidth}\raggedright -mlp\_type (MLP = default, SNN = self-normalizing neural network), size -(number of hidden nodes), w\_decay (l2 regularization), epochs (number -of epochs), class\_weight(0 = inverse ratio between number of positive -and negative examples, -1 = focal loss, or other), autoencoder (apply -stacked autoencoder), vae (apply variational autoencoder?)\strut -\end{minipage}\tabularnewline -\bottomrule -\end{longtable} - -\#\#Example The approach for logistic regression (LRTorch) and the -Multi-Layer Perceptron (MLPTorch) is identical. Here we will take -LRTorch as an example. 
- -You need to generate a \texttt{population} and \texttt{plpData} object -as described in more detail in -\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/BuildingPredictiveModels.pdf}{\texttt{BuildingPredictiveModels} -vignette}. - -Alternatively, you can make use of the data simulator. The following -code snippet creates a population of 12000 patients. - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{set.seed}\NormalTok{(}\DecValTok{1234}\NormalTok{)} -\KeywordTok{data}\NormalTok{(plpDataSimulationProfile)} -\NormalTok{sampleSize <-}\StringTok{ }\DecValTok{12000} -\NormalTok{plpData <-}\StringTok{ }\KeywordTok{simulatePlpData}\NormalTok{(} -\NormalTok{ plpDataSimulationProfile,} - \DataTypeTok{n =}\NormalTok{ sampleSize} -\NormalTok{)} - -\NormalTok{population <-}\StringTok{ }\KeywordTok{createStudyPopulation}\NormalTok{(} -\NormalTok{ plpData,} - \DataTypeTok{outcomeId =} \DecValTok{2}\NormalTok{,} - \DataTypeTok{binary =} \OtherTok{TRUE}\NormalTok{,} - \DataTypeTok{firstExposureOnly =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{washoutPeriod =} \DecValTok{0}\NormalTok{,} - \DataTypeTok{removeSubjectsWithPriorOutcome =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{priorOutcomeLookback =} \DecValTok{99999}\NormalTok{,} - \DataTypeTok{requireTimeAtRisk =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{minTimeAtRisk =} \DecValTok{0}\NormalTok{,} - \DataTypeTok{riskWindowStart =} \DecValTok{0}\NormalTok{,} - \DataTypeTok{addExposureDaysToStart =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{riskWindowEnd =} \DecValTok{365}\NormalTok{,} - \DataTypeTok{addExposureDaysToEnd =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{verbosity =} \StringTok{"INFO"} -\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -As an example we will build a LRTorch model. We could specify the -stacked autoencoder or the variational autoencoder to be used for -reducing the feature dimension as an initial layer, but for this example -we do not. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{autoencoder <-}\StringTok{ }\OtherTok{FALSE} -\NormalTok{vae <-}\StringTok{ }\OtherTok{FALSE} -\end{Highlighting} -\end{Shaded} - -We added a class\_weight for imbalanced data, the default value 0 is the -inverse ratio between negatives and positives,-1 applies focal loss. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{class_weight <-}\StringTok{ }\DecValTok{0} -\end{Highlighting} -\end{Shaded} - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{# Specify the settings for Logistics regression model using Torch in Python} -\NormalTok{model <-}\StringTok{ }\KeywordTok{setLRTorch}\NormalTok{(}\DataTypeTok{autoencoder=}\NormalTok{autoencoder, }\DataTypeTok{vae=}\NormalTok{vae, }\DataTypeTok{class_weight=}\NormalTok{class_weight)} -\end{Highlighting} -\end{Shaded} - -No we define our modelling parameters. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{testFraction <-}\StringTok{ }\FloatTok{0.2} -\NormalTok{testSplit <-}\StringTok{ 'person'} -\NormalTok{nfold <-}\StringTok{ }\DecValTok{3} -\NormalTok{splitSeed <-}\StringTok{ }\DecValTok{1000} -\end{Highlighting} -\end{Shaded} - -And we train and internally validate the model. 
- -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{results <-}\StringTok{ }\NormalTok{PatientLevelPrediction}\OperatorTok{::}\KeywordTok{runPlp}\NormalTok{(}\DataTypeTok{population =}\NormalTok{ population, } - \DataTypeTok{plpData =}\NormalTok{ plpData, } - \DataTypeTok{modelSettings =}\NormalTok{ model,} - \DataTypeTok{testSplit=}\NormalTok{testSplit,} - \DataTypeTok{testFraction=}\NormalTok{testFraction,} - \DataTypeTok{nfold=}\NormalTok{nfold, } - \DataTypeTok{splitSeed=}\NormalTok{splitSeed) } -\end{Highlighting} -\end{Shaded} - -\hypertarget{temporal-architectures}{% -\section{Temporal Architectures}\label{temporal-architectures}} - -Several architectures are implemented that can handle temporal data in -PyTorch and R Keras. - -\hypertarget{pytorch-cnn}{% -\subsection{PyTorch CNN}\label{pytorch-cnn}} - -We implemented the following \textbf{convolutional} models described in -\url{https://github.com/clinicalml/deepDiagnosis} in CNNTorch: - -\begin{enumerate} -\def\labelenumi{\arabic{enumi})} -\item - Temporal Convolutional neural network over a backward window (type = - cnn) - - \includegraphics{arch1.png} -\item - Convolutional neural network over input and time dimension (type = - mix) - - \includegraphics{conv_arch2.png} -\item - Multi-resolution temporal convolutional neural network (type = multi) - - \includegraphics{conv_arch1.png} -\end{enumerate} - -Furthermore, we added the following achitectures: - -\begin{enumerate} -\def\labelenumi{\arabic{enumi})} -\setcounter{enumi}{3} -\item - CNN with filters with three different parallel kernel sizes (3,4,5) - and a fully connected layers (type = mlf) - - \includegraphics{cnn_mlf2.png} -\item - LSTM network over the backward window (type = lstm) - - \includegraphics{cnn_lstm.png} -\item - Residual Learning Network as described in: - \url{https://arxiv.org/abs/1512.03385} (type = resnet) - - This a very big network, see the paper for the topology. 
-\end{enumerate} - -\begin{longtable}[]{@{}ll@{}} -\toprule -\begin{minipage}[b]{0.26\columnwidth}\raggedright -parameter\strut -\end{minipage} & \begin{minipage}[b]{0.68\columnwidth}\raggedright -description\strut -\end{minipage}\tabularnewline -\midrule -\endhead -\begin{minipage}[t]{0.26\columnwidth}\raggedright -nbfilters\strut -\end{minipage} & \begin{minipage}[t]{0.68\columnwidth}\raggedright -The number of convolution filters\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.26\columnwidth}\raggedright -epochs\strut -\end{minipage} & \begin{minipage}[t]{0.68\columnwidth}\raggedright -The number of epochs\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.26\columnwidth}\raggedright -seed\strut -\end{minipage} & \begin{minipage}[t]{0.68\columnwidth}\raggedright -Random seed\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.26\columnwidth}\raggedright -class\_weight\strut -\end{minipage} & \begin{minipage}[t]{0.68\columnwidth}\raggedright -The class weight used for imbalanced data (0: Inverse ratio between -positives and negatives, -1: Focal loss, or number)\strut -\end{minipage}\tabularnewline -\bottomrule -\end{longtable} - -\hypertarget{pytorch-rnn}{% -\subsection{PyTorch RNN}\label{pytorch-rnn}} - -The following \textbf{recurrent neural network} models are implemented -in RNNTorch: - -\begin{enumerate} -\def\labelenumi{\arabic{enumi})} -\item - RNN with one LSTM layer fed into one fully connected layer (type = - RNN) - - \includegraphics{lstm_last.png} -\item - RNN with one bidirectional LSTM layer fed into one fully connected - layer (type = BiRNN) - - This network looks the same as above but then as a bi-directional - version -\item - One Gated Recurrent Unit layer fed into one fully connected layers - (type = GRU) - - This network looks the same as above but then implemented as GRU -\end{enumerate} - -The following hyper-parameters can be set for these PyTorch models: - -\begin{longtable}[]{@{}ll@{}} -\toprule -\begin{minipage}[b]{0.26\columnwidth}\raggedright -parameter\strut -\end{minipage} & \begin{minipage}[b]{0.68\columnwidth}\raggedright -description\strut -\end{minipage}\tabularnewline -\midrule -\endhead -\begin{minipage}[t]{0.26\columnwidth}\raggedright -hidden\_size\strut -\end{minipage} & \begin{minipage}[t]{0.68\columnwidth}\raggedright -The number of features in hidden state\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.26\columnwidth}\raggedright -epochs\strut -\end{minipage} & \begin{minipage}[t]{0.68\columnwidth}\raggedright -The number of epochs\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.26\columnwidth}\raggedright -seed\strut -\end{minipage} & \begin{minipage}[t]{0.68\columnwidth}\raggedright -Random seed\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.26\columnwidth}\raggedright -class\_weight\strut -\end{minipage} & \begin{minipage}[t]{0.68\columnwidth}\raggedright -The class weight used for imbalanced data (0: Inverse ratio between -positives and negatives, -1: Focal loss, or number)\strut -\end{minipage}\tabularnewline -\bottomrule -\end{longtable} - -\newpage - -\hypertarget{r-keras-cnn}{% -\subsection{R Keras CNN}\label{r-keras-cnn}} - -The following temporal architectures as described in -\url{https://arxiv.org/pdf/1608.00647.pdf} were implemented using R -Keras: - -\begin{enumerate} -\def\labelenumi{\arabic{enumi}.} -\item - Multi-resolution CovNN model (CovNN.R) - - \includegraphics{covcnn.png} -\item - Convolution across data and time according(CovNN2.R) - - 
\includegraphics{covcnn2.png} - - \newpage -\end{enumerate} - -Furthermore, a custom build RNN is added that uses a variational -autoencoder. - -\begin{enumerate} -\def\labelenumi{\arabic{enumi}.} -\setcounter{enumi}{2} -\item - Clinically Informing application based on Recurrent Neural Network - (CIReNN.R) - - \includegraphics{cirenn.png} -\end{enumerate} - -Table 2: Temporal Deep Learning Models - -\begin{longtable}[]{@{}ll@{}} -\toprule -\begin{minipage}[b]{0.11\columnwidth}\raggedright -Model\strut -\end{minipage} & \begin{minipage}[b]{0.83\columnwidth}\raggedright -Hyper-parameters\strut -\end{minipage}\tabularnewline -\midrule -\endhead -\begin{minipage}[t]{0.11\columnwidth}\raggedright -CovNN\strut -\end{minipage} & \begin{minipage}[t]{0.83\columnwidth}\raggedright -batchSize (The number of samples to used in each batch during model -training), outcomeWeight (The weight assigned to the outcome), lr (The -learning rate), decay (The decay of the learning rate), dropout -({[}currently not used{]} the dropout rate for regularization), epochs -(The number of times data is used to train the model, e.g., epoches=1 -means data only used once to train), filters (The number of columns -output by each convolution), kernelSize (The number of time dimensions -used for each convolution), loss (The loss function implemented), seed -(The random seed)\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.11\columnwidth}\raggedright -CovNN2\strut -\end{minipage} & \begin{minipage}[t]{0.83\columnwidth}\raggedright -batchSize (The number of samples to used in each batch during model -training), outcomeWeight (The weight assigned to the outcome), lr (The -learning rate), decay (The decay of the learning rate), dropout -({[}currently not used{]} the dropout rate for regularization), epochs -(The number of times data is used to train the model, e.g., epoches=1 -means data only used once to train), filters (The number of columns -output by each convolution), kernelSize (The number of time dimensions -used for each convolution), loss (The loss function implemented), seed -(The random seed)\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.11\columnwidth}\raggedright -CIReNN\strut -\end{minipage} & \begin{minipage}[t]{0.83\columnwidth}\raggedright -units (The number of units of RNN layer - as a list of vectors), -recurrentDropout (The reccurrent dropout rate), layerDropout (The layer -dropout rate), lr (Learning rate), decay (Learning rate decay over each -update), outcomeWeight (The weight of the outcome class in the loss -function), batchSize (The number of data points to use per training -batch), epochs (Number of times to iterate over data set), -earlyStoppingMinDelta (Minimum change in the monitored quantity to -qualify as an improvement for early stopping, i.e.~an absolute change of -less than min\_delta in loss of validation data, will count as no -improvement), earlyStoppingPatience (Number of epochs with no -improvement after which training will be stopped), seed (Random seed -used by Deep Learning model)\strut -\end{minipage}\tabularnewline -\bottomrule -\end{longtable} - -\hypertarget{example}{% -\subsection{Example}\label{example}} - -We will now show how to use the temporal models by using CNNTorch as an -example. - -You need to generate a \texttt{population} and \texttt{plpData} object -as described in more detail in -\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/BuildingPredictiveModels.pdf}{\texttt{BuildingPredictiveModels} -vignette}. 
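-
-For reference, a minimal sketch of creating the population once the
-temporal plpData has been extracted (as shown in the next code block) is
-given below; the settings used here (e.g.~an outcome id of 25 and a
-365-day time-at-risk) are placeholders that mirror other examples in this
-document and should be adapted to your own study:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\CommentTok{# placeholder settings; adapt the outcome id and time-at-risk to your study}
-\NormalTok{population <- createStudyPopulation(plpData,}
-\NormalTok{                                    outcomeId = 25,}
-\NormalTok{                                    requireTimeAtRisk = TRUE,}
-\NormalTok{                                    minTimeAtRisk = 364,}
-\NormalTok{                                    riskWindowStart = 1,}
-\NormalTok{                                    riskWindowEnd = 365,}
-\NormalTok{                                    verbosity = "INFO")}
-\end{Highlighting}
-\end{Shaded}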
- 
-Note that for these algorithms you need to extract temporal data as
-described in the
-\href{https://github.com/OHDSI/FeatureExtraction/blob/master/inst/doc/UsingFeatureExtraction.pdf}{FeatureExtraction
-vignette} as follows:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{settings <-}\StringTok{ }\KeywordTok{createTemporalCovariateSettings}\NormalTok{(}\DataTypeTok{useConditionEraStart =} \OtherTok{FALSE}\NormalTok{,}
-                                    \DataTypeTok{useConditionEraOverlap =} \OtherTok{FALSE}\NormalTok{,}
-                                    \DataTypeTok{useConditionOccurrence =} \OtherTok{FALSE}\NormalTok{,}
-                                    \DataTypeTok{useConditionEraGroupStart =} \OtherTok{FALSE}\NormalTok{,}
-                                    \DataTypeTok{useConditionEraGroupOverlap =} \OtherTok{FALSE}\NormalTok{,}
-                                    \DataTypeTok{useDrugExposure =} \OtherTok{FALSE}\NormalTok{,}
-                                    \DataTypeTok{useDrugEraStart =} \OtherTok{FALSE}\NormalTok{,}
-                                    \DataTypeTok{useDrugEraOverlap =} \OtherTok{FALSE}\NormalTok{,}
-                                    \DataTypeTok{useMeasurement =} \OtherTok{FALSE}\NormalTok{,}
-                                    \DataTypeTok{useMeasurementValue =} \OtherTok{TRUE}\NormalTok{,}
-                                    \DataTypeTok{useMeasurementRangeGroup =} \OtherTok{FALSE}\NormalTok{,}
-                                    \DataTypeTok{useProcedureOccurrence =} \OtherTok{FALSE}\NormalTok{,}
-                                    \DataTypeTok{useDeviceExposure =} \OtherTok{FALSE}\NormalTok{,}
-                                    \DataTypeTok{useObservation =} \OtherTok{FALSE}\NormalTok{,}
-                                    \DataTypeTok{excludedCovariateConceptIds =} \KeywordTok{c}\NormalTok{(}\DecValTok{316866}\NormalTok{),}
-                                    \DataTypeTok{addDescendantsToExclude =} \OtherTok{TRUE}\NormalTok{,}
-                                    \DataTypeTok{temporalStartDays =} \KeywordTok{seq}\NormalTok{(}\DataTypeTok{from =} \DecValTok{-365}\NormalTok{, }
-                                                              \DataTypeTok{to =} \DecValTok{-1}\NormalTok{, }\DataTypeTok{by =} \DecValTok{12}\NormalTok{), }
-                                    \DataTypeTok{temporalEndDays =} \KeywordTok{c}\NormalTok{(}\KeywordTok{seq}\NormalTok{(}\DataTypeTok{from =} \DecValTok{-353}\NormalTok{, }
-                                                            \DataTypeTok{to =} \DecValTok{0}\NormalTok{, }\DataTypeTok{by =} \DecValTok{12}\NormalTok{), }\DecValTok{0}\NormalTok{))}
-
-\NormalTok{plpData <-}\StringTok{ }\KeywordTok{getPlpData}\NormalTok{(}\DataTypeTok{connectionDetails =}\NormalTok{ connectionDetails,}
-                      \DataTypeTok{cdmDatabaseSchema =}\NormalTok{ cdmDatabaseSchema,}
-                      \DataTypeTok{cohortDatabaseSchema =} \StringTok{"results"}\NormalTok{,}
-                      \DataTypeTok{cohortTable =} \StringTok{"cohort"}\NormalTok{,}
-                      \DataTypeTok{cohortId =} \DecValTok{11}\NormalTok{,}
-                      \DataTypeTok{covariateSettings =}\NormalTok{ settings,}
-                      \DataTypeTok{outcomeDatabaseSchema =}\NormalTok{ resultsDatabaseSchema,}
-                      \DataTypeTok{outcomeTable =} \StringTok{"cohort"}\NormalTok{,}
-                      \DataTypeTok{outcomeIds =} \DecValTok{25}\NormalTok{,}
-                      \DataTypeTok{cdmVersion =} \DecValTok{5}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-Each CNN/RNN has several hyper-parameters that can be set as shown in
-the Tables above, but for this example we take the defaults. 
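-
-If you would rather set the hyper-parameters explicitly instead of
-taking the defaults, a sketch is shown below. It assumes that the
-hyper-parameters listed in the table above map to identically named
-arguments of \texttt{setCNNTorch} and the values are arbitrary examples;
-check \texttt{?setCNNTorch} for the exact signature before running it.
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\CommentTok{# sketch only: argument names assumed to match the hyper-parameter table}
-\NormalTok{model <- setCNNTorch(cnn_type = 'CNN',}
-\NormalTok{                     nbfilters = c(16, 32),}
-\NormalTok{                     epochs = c(20, 50),}
-\NormalTok{                     seed = 42,}
-\NormalTok{                     class_weight = 0)}
-\end{Highlighting}
-\end{Shaded}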
- 
-\begin{Shaded}
-\begin{Highlighting}[]
-\CommentTok{# specify the CNN}
-\NormalTok{model <-}\StringTok{ }\KeywordTok{setCNNTorch}\NormalTok{(}\DataTypeTok{cnn_type=}\StringTok{'CNN'}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-Run the model training, for example with a testFraction = 0.2 and a
-split by person:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{results <-}\StringTok{ }\NormalTok{PatientLevelPrediction}\OperatorTok{::}\KeywordTok{runPlp}\NormalTok{(population, plpData, model,}
-                                              \DataTypeTok{testSplit=}\StringTok{'person'}\NormalTok{,}
-                                              \DataTypeTok{testFraction=}\FloatTok{0.2}\NormalTok{,}
-                                              \DataTypeTok{nfold=}\DecValTok{3}\NormalTok{, }
-                                              \DataTypeTok{splitSeed=}\DecValTok{1000}\NormalTok{) }
-\end{Highlighting}
-\end{Shaded}
-
-\hypertarget{apply-the-trained-deep-learning-model}{%
-\section{Apply the trained Deep Learning
-model}\label{apply-the-trained-deep-learning-model}}
-
-Applying a trained Deep Learning model is identical to the other models
-in the package:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\CommentTok{# load the trained model}
-\NormalTok{plpModel <-}\StringTok{ }\KeywordTok{loadPlpModel}\NormalTok{(}\KeywordTok{getwd}\NormalTok{(), }\StringTok{""}\NormalTok{)}
-
-\CommentTok{# load the new plpData (should have the same temporal features!) and create the population}
-\NormalTok{plpData <-}\StringTok{ }\KeywordTok{loadPlpData}\NormalTok{(}\KeywordTok{getwd}\NormalTok{(), }\StringTok{""}\NormalTok{)}
-
-\NormalTok{populationSettings <-}\StringTok{ }\NormalTok{plpModel}\OperatorTok{$}\NormalTok{populationSettings}
-\NormalTok{populationSettings}\OperatorTok{$}\NormalTok{plpData <-}\StringTok{ }\NormalTok{plpData}
-\NormalTok{population <-}\StringTok{ }\KeywordTok{do.call}\NormalTok{(createStudyPopulation, populationSettings) }
-
-\CommentTok{# apply the trained model on the new data}
-\NormalTok{validationResults <-}\StringTok{ }\KeywordTok{applyModel}\NormalTok{(population, plpData, plpModel)}
-\end{Highlighting}
-\end{Shaded}
-
-\hypertarget{adding-new-architectures}{%
-\section{Adding new architectures}\label{adding-new-architectures}}
-
-It is possible to add new architectures in our framework using PyTorch
-or R Keras. We are happy to help you with this; please post your
-questions on the
-\href{https://github.com/OHDSI/PatientLevelPrediction/issues}{issue tracker}
-of the package.
-
-\hypertarget{acknowledgments}{%
-\section{Acknowledgments}\label{acknowledgments}}
-
-Considerable work has been dedicated to provide the
-\texttt{PatientLevelPrediction} package.
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\KeywordTok{citation}\NormalTok{(}\StringTok{"PatientLevelPrediction"}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-## 
-## To cite PatientLevelPrediction in publications use:
-## 
-## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design and
-## implementation of a standardized framework to generate and evaluate patient-level
-## prediction models using observational healthcare data." _Journal of the American
-## Medical Informatics Association_, *25*(8), 969-975. .
-## 
-## A BibTeX entry for LaTeX users is
-## 
-## @Article{,
-##   author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. 
Rijnbeek}, -## title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data}, -## journal = {Journal of the American Medical Informatics Association}, -## volume = {25}, -## number = {8}, -## pages = {969-975}, -## year = {2018}, -## url = {https://doi.org/10.1093/jamia/ocy032}, -## } -\end{verbatim} - -\textbf{Please reference this paper if you use the PLP Package in your -work:} - -\href{http://dx.doi.org/10.1093/jamia/ocy032}{Reps JM, Schuemie MJ, -Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a -standardized framework to generate and evaluate patient-level prediction -models using observational healthcare data. J Am Med Inform Assoc. -2018;25(8):969-975.} - -\end{document} diff --git a/inst/doc/BuildingEnsembleModels.tex b/inst/doc/BuildingEnsembleModels.tex deleted file mode 100644 index 85bc73667..000000000 --- a/inst/doc/BuildingEnsembleModels.tex +++ /dev/null @@ -1,369 +0,0 @@ -% Options for packages loaded elsewhere -\PassOptionsToPackage{unicode}{hyperref} -\PassOptionsToPackage{hyphens}{url} -% -\documentclass[ -]{article} -\usepackage{lmodern} -\usepackage{amssymb,amsmath} -\usepackage{ifxetex,ifluatex} -\ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex - \usepackage[T1]{fontenc} - \usepackage[utf8]{inputenc} - \usepackage{textcomp} % provide euro and other symbols -\else % if luatex or xetex - \usepackage{unicode-math} - \defaultfontfeatures{Scale=MatchLowercase} - \defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1} -\fi -% Use upquote if available, for straight quotes in verbatim environments -\IfFileExists{upquote.sty}{\usepackage{upquote}}{} -\IfFileExists{microtype.sty}{% use microtype if available - \usepackage[]{microtype} - \UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts -}{} -\makeatletter -\@ifundefined{KOMAClassName}{% if non-KOMA class - \IfFileExists{parskip.sty}{% - \usepackage{parskip} - }{% else - \setlength{\parindent}{0pt} - \setlength{\parskip}{6pt plus 2pt minus 1pt}} -}{% if KOMA class - \KOMAoptions{parskip=half}} -\makeatother -\usepackage{xcolor} -\IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available -\IfFileExists{bookmark.sty}{\usepackage{bookmark}}{\usepackage{hyperref}} -\hypersetup{ - pdftitle={Building Ensemble Models}, - pdfauthor={Xiaoyong Pan, Jenna Reps, Peter R. 
Rijnbeek}, - hidelinks, - pdfcreator={LaTeX via pandoc}} -\urlstyle{same} % disable monospaced font for URLs -\usepackage[margin=1in]{geometry} -\usepackage{color} -\usepackage{fancyvrb} -\newcommand{\VerbBar}{|} -\newcommand{\VERB}{\Verb[commandchars=\\\{\}]} -\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}} -% Add ',fontsize=\small' for more characters per line -\usepackage{framed} -\definecolor{shadecolor}{RGB}{248,248,248} -\newenvironment{Shaded}{\begin{snugshade}}{\end{snugshade}} -\newcommand{\AlertTok}[1]{\textcolor[rgb]{0.94,0.16,0.16}{#1}} -\newcommand{\AnnotationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\AttributeTok}[1]{\textcolor[rgb]{0.77,0.63,0.00}{#1}} -\newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\BuiltInTok}[1]{#1} -\newcommand{\CharTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\CommentTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}} -\newcommand{\CommentVarTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\ConstantTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\ControlFlowTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}} -\newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{#1}} -\newcommand{\DecValTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\DocumentationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\ErrorTok}[1]{\textcolor[rgb]{0.64,0.00,0.00}{\textbf{#1}}} -\newcommand{\ExtensionTok}[1]{#1} -\newcommand{\FloatTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\ImportTok}[1]{#1} -\newcommand{\InformationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}} -\newcommand{\NormalTok}[1]{#1} -\newcommand{\OperatorTok}[1]{\textcolor[rgb]{0.81,0.36,0.00}{\textbf{#1}}} -\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{#1}} -\newcommand{\PreprocessorTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}} -\newcommand{\RegionMarkerTok}[1]{#1} -\newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\SpecialStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\StringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\VariableTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\VerbatimStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\WarningTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\usepackage{graphicx,grffile} -\makeatletter -\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi} -\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi} -\makeatother -% Scale images if necessary, so that they will not overflow the page -% margins by default, and it is still possible to overwrite the defaults -% using explicit options in \includegraphics[width, height, ...]{} -\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio} -% Set default figure placement to htbp -\makeatletter -\def\fps@figure{htbp} -\makeatother -\setlength{\emergencystretch}{3em} % prevent overfull lines -\providecommand{\tightlist}{% - \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}} -\setcounter{secnumdepth}{5} -\usepackage{fancyhdr} -\pagestyle{fancy} -\fancyhead{} -\fancyhead[CO,CE]{Installation Guide} -\fancyfoot[CO,CE]{PatientLevelPrediction Package Version 3.1.0} -\fancyfoot[LE,RO]{\thepage} 
-\renewcommand{\headrulewidth}{0.4pt}
-\renewcommand{\footrulewidth}{0.4pt}
-
-\title{Building Ensemble Models}
-\author{Xiaoyong Pan, Jenna Reps, Peter R. Rijnbeek}
-\date{2020-06-03}
-
-\begin{document}
-\maketitle
-
-{
-\setcounter{tocdepth}{2}
-\tableofcontents
-}
-\hypertarget{introduction}{%
-\section{Introduction}\label{introduction}}
-
-Ensemble models combine several models to improve the overall
-performance. Traditionally, weak learners were combined to boost
-performance, but recent results show that combining several strong
-approaches can also result in better performance. There are many
-examples in the literature where ensemble models outperform individual
-models using stacking, i.e.~a final logistic regression layer across
-the individual model outputs, but other approaches such as weighting
-have also shown promising results.
-
-This vignette describes how you can use the Observational Health Data
-Sciences and Informatics (OHDSI)
-\href{http://github.com/OHDSI/PatientLevelPrediction}{\texttt{PatientLevelPrediction}}
-package to build ensemble models. This vignette assumes you have read
-and are comfortable with building single patient-level prediction models
-as described in the
-\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/BuildingPredictiveModels.pdf}{\texttt{BuildingPredictiveModels}
-vignette}.
-
-This will enable studying ensemble methods at scale in the OHDSI data
-network.
-
-\begin{figure}
-\centering
-\includegraphics{ensemble.png}
-\caption{Ensemble model}
-\end{figure}
-
-In the PatientLevelPrediction package, four ensemble strategies have
-been implemented:
-
-\begin{enumerate}
-\def\labelenumi{\arabic{enumi}.}
-\tightlist
-\item
-  average ensemble: Calculate the average probability from the
-  individual models.
-\item
-  product ensemble: Calculate the product of probabilities from the
-  individual models.
-\item
-  weighted ensemble: Calculate the weighted average probability from
-  the individual models using the train AUC as weights.
-\item
-  stacked ensemble: Train a logistic regression on the outputs from the
-  individual models.
-\end{enumerate}
-
-\hypertarget{usage}{%
-\section{Usage}\label{usage}}
-
-Use the
-\href{http://github.com/OHDSI/PatientLevelPrediction}{\texttt{PatientLevelPrediction}}
-package to generate a \texttt{population} and \texttt{plpData} object.
-Alternatively, you can make use of the data simulator. The following
-code snippet creates a population of 2000 patients. 
- 
-\begin{Shaded}
-\begin{Highlighting}[]
-\KeywordTok{data}\NormalTok{(plpDataSimulationProfile)}
-\KeywordTok{set.seed}\NormalTok{(}\DecValTok{1234}\NormalTok{)}
-\NormalTok{sampleSize <-}\StringTok{ }\DecValTok{2000}
-\NormalTok{plpData <-}\StringTok{ }\KeywordTok{simulatePlpData}\NormalTok{(}
-\NormalTok{  plpDataSimulationProfile,}
-  \DataTypeTok{n =}\NormalTok{ sampleSize}
-\NormalTok{)}
-
-\NormalTok{population <-}\StringTok{ }\KeywordTok{createStudyPopulation}\NormalTok{(}
-\NormalTok{  plpData,}
-  \DataTypeTok{outcomeId =} \DecValTok{2}\NormalTok{,}
-  \DataTypeTok{binary =} \OtherTok{TRUE}\NormalTok{,}
-  \DataTypeTok{firstExposureOnly =} \OtherTok{FALSE}\NormalTok{,}
-  \DataTypeTok{washoutPeriod =} \DecValTok{0}\NormalTok{,}
-  \DataTypeTok{removeSubjectsWithPriorOutcome =} \OtherTok{FALSE}\NormalTok{,}
-  \DataTypeTok{priorOutcomeLookback =} \DecValTok{99999}\NormalTok{,}
-  \DataTypeTok{requireTimeAtRisk =} \OtherTok{FALSE}\NormalTok{,}
-  \DataTypeTok{minTimeAtRisk =} \DecValTok{0}\NormalTok{,}
-  \DataTypeTok{riskWindowStart =} \DecValTok{0}\NormalTok{,}
-  \DataTypeTok{addExposureDaysToStart =} \OtherTok{FALSE}\NormalTok{,}
-  \DataTypeTok{riskWindowEnd =} \DecValTok{365}\NormalTok{,}
-  \DataTypeTok{addExposureDaysToEnd =} \OtherTok{FALSE}\NormalTok{,}
-  \DataTypeTok{verbosity =} \StringTok{"INFO"}
-\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-Specify the prediction algorithms to be combined.
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\CommentTok{# Use LASSO logistic regression and Random Forest as base predictors}
-\NormalTok{model1 <-}\StringTok{ }\KeywordTok{setLassoLogisticRegression}\NormalTok{()}
-\NormalTok{model2 <-}\StringTok{ }\KeywordTok{setRandomForest}\NormalTok{()}
-\end{Highlighting}
-\end{Shaded}
-
-Specify the test fraction, i.e.~the fraction of the data that will be
-held out for testing.
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{testFraction <-}\StringTok{ }\FloatTok{0.2}
-\end{Highlighting}
-\end{Shaded}
-
-Specify an ensembleStrategy to combine the multiple predictors. The
-strategy used for ensembling the outputs from the different models can
-be `mean' (the average probability from the different models),
-`product' (the product rule), `weighted' (the weighted average
-probability from the different models using the train AUC as weights),
-or `stacked' (the stacked ensemble trains a logistic regression on the
-outputs of the different models).
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{ensembleStrategy <-}\StringTok{ 'stacked'}
-\end{Highlighting}
-\end{Shaded}
-
-Specify the test split to be used.
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\CommentTok{# Use a split by person; alternatively a time split is possible}
-\NormalTok{testSplit <-}\StringTok{ 'person'}
-\end{Highlighting}
-\end{Shaded}
-
-Run the ensemble learning to combine model1 and model2. You can also use
-different plpData for different models. 
- -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{ensembleResults <-}\StringTok{ }\NormalTok{PatientLevelPrediction}\OperatorTok{::}\KeywordTok{runEnsembleModel}\NormalTok{(population, } - \DataTypeTok{dataList =} \KeywordTok{list}\NormalTok{(plpData, plpData), } - \DataTypeTok{modelList =} \KeywordTok{list}\NormalTok{(model1, model2),} - \DataTypeTok{testSplit=}\NormalTok{testSplit,} - \DataTypeTok{testFraction=}\NormalTok{testFraction,} - \DataTypeTok{nfold=}\DecValTok{3}\NormalTok{, }\DataTypeTok{splitSeed=}\DecValTok{1000}\NormalTok{, } - \DataTypeTok{ensembleStrategy =}\NormalTok{ ensembleStrategy) } -\end{Highlighting} -\end{Shaded} - -\hypertarget{saving-and-loading-the-ensemble-model}{% -\subsection{Saving and loading the ensemble -model}\label{saving-and-loading-the-ensemble-model}} - -You can save and load the model using: - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{saveEnsemblePlpModel}\NormalTok{(ensembleResults}\OperatorTok{$}\NormalTok{model, }\DataTypeTok{dirPath =} \KeywordTok{file.path}\NormalTok{(}\KeywordTok{getwd}\NormalTok{(), }\StringTok{"model"}\NormalTok{))} -\NormalTok{ensembleModel <-}\StringTok{ }\KeywordTok{loadEnsemblePlpModel}\NormalTok{(}\KeywordTok{getwd}\NormalTok{(), }\StringTok{"model"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\hypertarget{apply-ensemble-model}{% -\section{Apply Ensemble model}\label{apply-ensemble-model}} - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{plpData <-}\StringTok{ }\KeywordTok{loadPlpData}\NormalTok{(}\StringTok{""}\NormalTok{)} -\NormalTok{populationSettings <-}\StringTok{ }\NormalTok{ensembleModel}\OperatorTok{$}\NormalTok{populationSettings} -\NormalTok{populationSettings}\OperatorTok{$}\NormalTok{plpData <-}\StringTok{ }\NormalTok{plpData} -\NormalTok{population <-}\StringTok{ }\KeywordTok{do.call}\NormalTok{(createStudyPopulation, populationSettings)} -\end{Highlighting} -\end{Shaded} - -Load the model. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{ensembleModel <-}\StringTok{ }\KeywordTok{loadEnsemblePlpModel}\NormalTok{(}\StringTok{""}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -Get the predictions by applying the model: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{prediction <-}\StringTok{ }\KeywordTok{applyEnsembleModel}\NormalTok{(population,} - \DataTypeTok{dataList =} \KeywordTok{list}\NormalTok{(plpData, plpData),} - \DataTypeTok{ensembleModel =}\NormalTok{ ensembleModel)}\OperatorTok{$}\NormalTok{prediction} -\end{Highlighting} -\end{Shaded} - -\hypertarget{demo}{% -\section{Demo}\label{demo}} - -We have added a demo of the ensemble training: - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{# Show all demos in our package: } - \KeywordTok{demo}\NormalTok{(}\DataTypeTok{package =} \StringTok{"PatientLevelPrediction"}\NormalTok{)} - -\CommentTok{# Run the learning curve} - \KeywordTok{demo}\NormalTok{(}\StringTok{"EnsembleModelDemo"}\NormalTok{, }\DataTypeTok{package =} \StringTok{"PatientLevelPrediction"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\hypertarget{acknowledgments}{% -\section{Acknowledgments}\label{acknowledgments}} - -Considerable work has been dedicated to provide the -\texttt{PatientLevelPrediction} package. - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{citation}\NormalTok{(}\StringTok{"PatientLevelPrediction"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -## -## To cite PatientLevelPrediction in publications use: -## -## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). 
"Design and -## implementation of a standardized framework to generate and evaluate patient-level -## prediction models using observational healthcare data." _Journal of the American -## Medical Informatics Association_, *25*(8), 969-975. . -## -## A BibTeX entry for LaTeX users is -## -## @Article{, -## author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek}, -## title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data}, -## journal = {Journal of the American Medical Informatics Association}, -## volume = {25}, -## number = {8}, -## pages = {969-975}, -## year = {2018}, -## url = {https://doi.org/10.1093/jamia/ocy032}, -## } -\end{verbatim} - -\textbf{Please reference this paper if you use the PLP Package in your -work:} - -\href{http://dx.doi.org/10.1093/jamia/ocy032}{Reps JM, Schuemie MJ, -Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a -standardized framework to generate and evaluate patient-level prediction -models using observational healthcare data. J Am Med Inform Assoc. -2018;25(8):969-975.} - -\end{document} diff --git a/inst/doc/BuildingMultiplePredictiveModels.tex b/inst/doc/BuildingMultiplePredictiveModels.tex deleted file mode 100644 index a4e00e6a2..000000000 --- a/inst/doc/BuildingMultiplePredictiveModels.tex +++ /dev/null @@ -1,449 +0,0 @@ -% Options for packages loaded elsewhere -\PassOptionsToPackage{unicode}{hyperref} -\PassOptionsToPackage{hyphens}{url} -% -\documentclass[ -]{article} -\usepackage{lmodern} -\usepackage{amssymb,amsmath} -\usepackage{ifxetex,ifluatex} -\ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex - \usepackage[T1]{fontenc} - \usepackage[utf8]{inputenc} - \usepackage{textcomp} % provide euro and other symbols -\else % if luatex or xetex - \usepackage{unicode-math} - \defaultfontfeatures{Scale=MatchLowercase} - \defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1} -\fi -% Use upquote if available, for straight quotes in verbatim environments -\IfFileExists{upquote.sty}{\usepackage{upquote}}{} -\IfFileExists{microtype.sty}{% use microtype if available - \usepackage[]{microtype} - \UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts -}{} -\makeatletter -\@ifundefined{KOMAClassName}{% if non-KOMA class - \IfFileExists{parskip.sty}{% - \usepackage{parskip} - }{% else - \setlength{\parindent}{0pt} - \setlength{\parskip}{6pt plus 2pt minus 1pt}} -}{% if KOMA class - \KOMAoptions{parskip=half}} -\makeatother -\usepackage{xcolor} -\IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available -\IfFileExists{bookmark.sty}{\usepackage{bookmark}}{\usepackage{hyperref}} -\hypersetup{ - pdftitle={Automatically Build Multiple Patient-Level Predictive Models}, - pdfauthor={Jenna Reps, Martijn J. Schuemie, Patrick B. Ryan, Peter R. 
Rijnbeek}, - hidelinks, - pdfcreator={LaTeX via pandoc}} -\urlstyle{same} % disable monospaced font for URLs -\usepackage[margin=1in]{geometry} -\usepackage{color} -\usepackage{fancyvrb} -\newcommand{\VerbBar}{|} -\newcommand{\VERB}{\Verb[commandchars=\\\{\}]} -\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}} -% Add ',fontsize=\small' for more characters per line -\usepackage{framed} -\definecolor{shadecolor}{RGB}{248,248,248} -\newenvironment{Shaded}{\begin{snugshade}}{\end{snugshade}} -\newcommand{\AlertTok}[1]{\textcolor[rgb]{0.94,0.16,0.16}{#1}} -\newcommand{\AnnotationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\AttributeTok}[1]{\textcolor[rgb]{0.77,0.63,0.00}{#1}} -\newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\BuiltInTok}[1]{#1} -\newcommand{\CharTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\CommentTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}} -\newcommand{\CommentVarTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\ConstantTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\ControlFlowTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}} -\newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{#1}} -\newcommand{\DecValTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\DocumentationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\ErrorTok}[1]{\textcolor[rgb]{0.64,0.00,0.00}{\textbf{#1}}} -\newcommand{\ExtensionTok}[1]{#1} -\newcommand{\FloatTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\ImportTok}[1]{#1} -\newcommand{\InformationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}} -\newcommand{\NormalTok}[1]{#1} -\newcommand{\OperatorTok}[1]{\textcolor[rgb]{0.81,0.36,0.00}{\textbf{#1}}} -\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{#1}} -\newcommand{\PreprocessorTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}} -\newcommand{\RegionMarkerTok}[1]{#1} -\newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\SpecialStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\StringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\VariableTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\VerbatimStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\WarningTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\usepackage{graphicx,grffile} -\makeatletter -\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi} -\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi} -\makeatother -% Scale images if necessary, so that they will not overflow the page -% margins by default, and it is still possible to overwrite the defaults -% using explicit options in \includegraphics[width, height, ...]{} -\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio} -% Set default figure placement to htbp -\makeatletter -\def\fps@figure{htbp} -\makeatother -\setlength{\emergencystretch}{3em} % prevent overfull lines -\providecommand{\tightlist}{% - \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}} -\setcounter{secnumdepth}{5} -\usepackage{fancyhdr} -\pagestyle{fancy} -\fancyhead{} -\fancyhead[CO,CE]{Automatically Build Multiple Patient-Level Predictive Models} -\fancyfoot[CO,CE]{PatientLevelPrediction Package Version 3.1.0} 
-\fancyfoot[LE,RO]{\thepage}
-\renewcommand{\headrulewidth}{0.4pt}
-\renewcommand{\footrulewidth}{0.4pt}
-
-\title{Automatically Build Multiple Patient-Level Predictive Models}
-\author{Jenna Reps, Martijn J. Schuemie, Patrick B. Ryan, Peter R. Rijnbeek}
-\date{2020-06-03}
-
-\begin{document}
-\maketitle
-
-{
-\setcounter{tocdepth}{2}
-\tableofcontents
-}
-\hypertarget{introduction}{%
-\section{Introduction}\label{introduction}}
-
-In our
-\href{https://academic.oup.com/jamia/article/25/8/969/4989437}{\texttt{paper}},
-we propose a standardised framework for patient-level prediction that
-utilizes the OMOP CDM and standardized vocabularies, and describe the
-open-source software that we developed implementing the framework's
-pipeline. The framework is the first to enforce existing best practice
-guidelines and will enable open dissemination of models that can be
-extensively validated across the network of OHDSI collaborators.
-
-One of our best practices is that we see the selection of models and all
-study settings as an empirical question, i.e.~we should use a data-driven
-approach in which we try many settings. This vignette describes how you
-can use the Observational Health Data Sciences and Informatics (OHDSI)
-\href{http://github.com/OHDSI/PatientLevelPrediction}{\texttt{PatientLevelPrediction}}
-package to automatically build multiple patient-level predictive models,
-e.g.~for different population settings, covariate settings, and model
-settings. This vignette assumes you have read and are comfortable with
-building single patient-level prediction models as described in the
-\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/BuildingPredictiveModels.pdf}{\texttt{BuildingPredictiveModels}
-vignette}.
-
-Note that it is also possible to generate a Study Package directly in
-Atlas that allows for multiple patient-level prediction analyses, but
-this is out of scope for this vignette.
-
-\hypertarget{creating-the-setting-lists}{%
-\section{Creating the setting lists}\label{creating-the-setting-lists}}
-
-To develop multiple models the user has to create a list of Study
-Population Settings, Covariate Settings, and Model Settings. These
-lists will then be combined in a Model Analysis List and all
-combinations of the elements in this list will be automatically run by
-the package.
-
-\hypertarget{study-population-settings}{%
-\subsection{Study population settings}\label{study-population-settings}}
-
-Suppose we would like to create the following three population settings:
-
-\begin{itemize}
-\tightlist
-\item
-  study population 1: allows persons who have the outcome to leave the
-  database before the end of the time-at-risk period, and only includes
-  persons without the outcome if they are observed for the whole
-  time-at-risk period (requireTimeAtRisk = T).
-\item
-  study population 2: does not impose the restriction that persons who
-  do not experience the outcome need to be observed for the full
-  time-at-risk period (requireTimeAtRisk = F). 
-\item - study population 3: does impose the restriction that persons who do - not experience the outcome need to be observed for the full - time-at-risk period (requireTimeAtRisk = T) and allows persons that - had the outcome before (removeSubjectsWithPriorOutcome = F) -\end{itemize} - -The create a study population setting list we use the function -\texttt{createStudyPopulationSettings} as described below: - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{# define all study population settings} -\NormalTok{studyPop1 <-}\StringTok{ }\KeywordTok{createStudyPopulationSettings}\NormalTok{(}\DataTypeTok{binary =}\NormalTok{ T,} - \DataTypeTok{includeAllOutcomes =}\NormalTok{ F,} - \DataTypeTok{removeSubjectsWithPriorOutcome =}\NormalTok{ T,} - \DataTypeTok{priorOutcomeLookback =} \DecValTok{99999}\NormalTok{,} - \DataTypeTok{requireTimeAtRisk =}\NormalTok{ T,} - \DataTypeTok{minTimeAtRisk=}\DecValTok{364}\NormalTok{,} - \DataTypeTok{riskWindowStart =} \DecValTok{1}\NormalTok{,} - \DataTypeTok{riskWindowEnd =} \DecValTok{365}\NormalTok{,} - \DataTypeTok{verbosity =} \StringTok{"INFO"}\NormalTok{)} - -\NormalTok{studyPop2 <-}\StringTok{ }\KeywordTok{createStudyPopulationSettings}\NormalTok{(}\DataTypeTok{binary =}\NormalTok{ T,} - \DataTypeTok{includeAllOutcomes =}\NormalTok{ F,} - \DataTypeTok{removeSubjectsWithPriorOutcome =}\NormalTok{ T,} - \DataTypeTok{priorOutcomeLookback =} \DecValTok{99999}\NormalTok{,} - \DataTypeTok{requireTimeAtRisk =}\NormalTok{ F,} - \DataTypeTok{minTimeAtRisk=}\DecValTok{364}\NormalTok{,} - \DataTypeTok{riskWindowStart =} \DecValTok{1}\NormalTok{,} - \DataTypeTok{riskWindowEnd =} \DecValTok{365}\NormalTok{,} - \DataTypeTok{verbosity =} \StringTok{"INFO"}\NormalTok{)} - -\NormalTok{studyPop3 <-}\StringTok{ }\KeywordTok{createStudyPopulationSettings}\NormalTok{(}\DataTypeTok{binary =}\NormalTok{ T,} - \DataTypeTok{includeAllOutcomes =}\NormalTok{ F,} - \DataTypeTok{removeSubjectsWithPriorOutcome =}\NormalTok{ F,} - \DataTypeTok{priorOutcomeLookback =} \DecValTok{99999}\NormalTok{,} - \DataTypeTok{requireTimeAtRisk =}\NormalTok{ T,} - \DataTypeTok{minTimeAtRisk=}\DecValTok{364}\NormalTok{,} - \DataTypeTok{riskWindowStart =} \DecValTok{1}\NormalTok{,} - \DataTypeTok{riskWindowEnd =} \DecValTok{365}\NormalTok{,} - \DataTypeTok{verbosity =} \StringTok{"INFO"}\NormalTok{)} - -\CommentTok{# combine these in a population setting list} -\NormalTok{populationSettingList <-}\StringTok{ }\KeywordTok{list}\NormalTok{(studyPop1,studyPop2,studyPop3)} -\end{Highlighting} -\end{Shaded} - -\hypertarget{covariate-settings}{% -\subsection{Covariate settings}\label{covariate-settings}} - -The covariate settings are created using -\texttt{createCovariateSettings}. 
We can create multiple covariate -settings and then combine them in a list: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{covSet1 <-}\StringTok{ }\KeywordTok{createCovariateSettings}\NormalTok{(}\DataTypeTok{useDemographicsGender =}\NormalTok{ T, } - \DataTypeTok{useDemographicsAgeGroup =}\NormalTok{ T, } - \DataTypeTok{useConditionGroupEraAnyTimePrior =}\NormalTok{ T,} - \DataTypeTok{useDrugGroupEraAnyTimePrior =}\NormalTok{ T)} - -\NormalTok{covSet2 <-}\StringTok{ }\KeywordTok{createCovariateSettings}\NormalTok{(}\DataTypeTok{useDemographicsGender =}\NormalTok{ T, } - \DataTypeTok{useDemographicsAgeGroup =}\NormalTok{ T, } - \DataTypeTok{useConditionGroupEraAnyTimePrior =}\NormalTok{ T,} - \DataTypeTok{useDrugGroupEraAnyTimePrior =}\NormalTok{ F)} - -\NormalTok{covariateSettingList <-}\StringTok{ }\KeywordTok{list}\NormalTok{(covSet1, covSet2)} -\end{Highlighting} -\end{Shaded} - -\hypertarget{algorithm-settings}{% -\subsection{Algorithm settings}\label{algorithm-settings}} - -The model settings requires running the setModel functions for the -machine learning algorithms of interest and specifying the -hyper-parameter search and then combining these into a list. For -example, if we wanted to try a logistic regression, gradient boosting -machine and ada boost model then: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{gbm <-}\StringTok{ }\KeywordTok{setGradientBoostingMachine}\NormalTok{()} -\NormalTok{lr <-}\StringTok{ }\KeywordTok{setLassoLogisticRegression}\NormalTok{()} -\NormalTok{ada <-}\StringTok{ }\KeywordTok{setAdaBoost}\NormalTok{()} - -\NormalTok{modelList <-}\StringTok{ }\KeywordTok{list}\NormalTok{(gbm, lr, ada)} -\end{Highlighting} -\end{Shaded} - -\hypertarget{model-analysis-list}{% -\subsection{Model analysis list}\label{model-analysis-list}} - -To create the complete plp model settings use -\texttt{createPlpModelSettings} to combine the population, covariate and -model settings. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{modelAnalysisList <-}\StringTok{ }\KeywordTok{createPlpModelSettings}\NormalTok{(}\DataTypeTok{modelList =}\NormalTok{ modelList, } - \DataTypeTok{covariateSettingList =}\NormalTok{ covariateSettingList,} - \DataTypeTok{populationSettingList =}\NormalTok{ populationSettingList)} -\end{Highlighting} -\end{Shaded} - -\hypertarget{running-multiple-models}{% -\section{Running multiple models}\label{running-multiple-models}} - -As we will be downloading loads of data in the multiple plp analysis it -is useful to set the Andromeda temp folder to a directory with write -access and plenty of space. 
-\texttt{options(andromedaTempFolder\ =\ "c:/andromedaTemp")} - -To run the study requires setting up a connectionDetails object - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{dbms <-}\StringTok{ "your dbms"} -\NormalTok{user <-}\StringTok{ "your username"} -\NormalTok{pw <-}\StringTok{ "your password"} -\NormalTok{server <-}\StringTok{ "your server"} -\NormalTok{port <-}\StringTok{ "your port"} - -\NormalTok{connectionDetails <-}\StringTok{ }\NormalTok{DatabaseConnector}\OperatorTok{::}\KeywordTok{createConnectionDetails}\NormalTok{(}\DataTypeTok{dbms =}\NormalTok{ dbms,} - \DataTypeTok{server =}\NormalTok{ server,} - \DataTypeTok{user =}\NormalTok{ user,} - \DataTypeTok{password =}\NormalTok{ pw,} - \DataTypeTok{port =}\NormalTok{ port)} -\end{Highlighting} -\end{Shaded} - -Next you need to specify the cdmDatabaseSchema where your cdm database -is found and workDatabaseSchema where your target population and outcome -cohorts are and you need to specify a label for the database name: a -string with a shareable name of the database (this will be shown to -OHDSI researchers if the results get transported). - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{cdmDatabaseSchema <-}\StringTok{ "your cdmDatabaseSchema"} -\NormalTok{workDatabaseSchema <-}\StringTok{ "your workDatabaseSchema"} -\NormalTok{cdmDatabaseName <-}\StringTok{ "your cdmDatabaseName"} -\end{Highlighting} -\end{Shaded} - -Now you can run the multiple patient-level prediction analysis by -specifying the target cohort ids and outcome ids - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{allresults <-}\StringTok{ }\KeywordTok{runPlpAnalyses}\NormalTok{(}\DataTypeTok{connectionDetails =}\NormalTok{ connectionDetails,} - \DataTypeTok{cdmDatabaseSchema =}\NormalTok{ cdmDatabaseSchema,} - \DataTypeTok{cdmDatabaseName =}\NormalTok{ cdmDatabaseName,} - \DataTypeTok{oracleTempSchema =}\NormalTok{ cdmDatabaseSchema,} - \DataTypeTok{cohortDatabaseSchema =}\NormalTok{ workDatabaseSchema,} - \DataTypeTok{cohortTable =} \StringTok{"your cohort table"}\NormalTok{,} - \DataTypeTok{outcomeDatabaseSchema =}\NormalTok{ workDatabaseSchema,} - \DataTypeTok{outcomeTable =} \StringTok{"your cohort table"}\NormalTok{,} - \DataTypeTok{cdmVersion =} \DecValTok{5}\NormalTok{,} - \DataTypeTok{outputFolder =} \StringTok{"./PlpMultiOutput"}\NormalTok{,} - \DataTypeTok{modelAnalysisList =}\NormalTok{ modelAnalysisList,} - \DataTypeTok{cohortIds =} \KeywordTok{c}\NormalTok{(}\DecValTok{2484}\NormalTok{,}\DecValTok{6970}\NormalTok{),} - \DataTypeTok{cohortNames =} \KeywordTok{c}\NormalTok{(}\StringTok{'visit 2010'}\NormalTok{,}\StringTok{'test cohort'}\NormalTok{),} - \DataTypeTok{outcomeIds =} \KeywordTok{c}\NormalTok{(}\DecValTok{7331}\NormalTok{,}\DecValTok{5287}\NormalTok{),} - \DataTypeTok{outcomeNames =} \KeywordTok{c}\NormalTok{(}\StringTok{'outcome 1'}\NormalTok{,}\StringTok{'outcome 2'}\NormalTok{),} - \DataTypeTok{maxSampleSize =} \OtherTok{NULL}\NormalTok{,} - \DataTypeTok{minCovariateFraction =} \DecValTok{0}\NormalTok{,} - \DataTypeTok{normalizeData =}\NormalTok{ T,} - \DataTypeTok{testSplit =} \StringTok{"stratified"}\NormalTok{,} - \DataTypeTok{testFraction =} \FloatTok{0.25}\NormalTok{,} - \DataTypeTok{splitSeed =} \OtherTok{NULL}\NormalTok{,} - \DataTypeTok{nfold =} \DecValTok{3}\NormalTok{,} - \DataTypeTok{verbosity =} \StringTok{"INFO"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -This will then save all the plpData objects from the study into -``./PlpMultiOutput/plpData'', the populations for the analysis into 
-``./PlpMultiOutput/population'' and the results into -``./PlpMultiOutput/Result''. The csv named settings.csv found in -``./PlpMultiOutput'' has a row for each prediction model developed and -points to the plpData and population used for the model development, it -also has descriptions of the cohorts and settings if these are input by -the user. - -Note that if for some reason the run is interrupted, e.g.~because of an -error, a new call to \texttt{RunPlpAnalyses} will continue and not -restart until you remove the output folder. - -\hypertarget{validating-multiple-models}{% -\section{Validating multiple models}\label{validating-multiple-models}} - -If you have access to multiple databases on the same server in different -schemas you could evaluate accross these using this call: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{val <-}\StringTok{ }\KeywordTok{evaluateMultiplePlp}\NormalTok{(}\DataTypeTok{analysesLocation =} \StringTok{"./PlpMultiOutput"}\NormalTok{,} - \DataTypeTok{outputLocation =} \StringTok{"./PlpMultiOutput/validation"}\NormalTok{,} - \DataTypeTok{connectionDetails =}\NormalTok{ connectionDetails, } - \DataTypeTok{validationSchemaTarget =} \KeywordTok{list}\NormalTok{(}\StringTok{'new_database_1.dbo'}\NormalTok{,} - \StringTok{'new_database_2.dbo'}\NormalTok{),} - \DataTypeTok{validationSchemaOutcome =} \KeywordTok{list}\NormalTok{(}\StringTok{'new_database_1.dbo'}\NormalTok{,} - \StringTok{'new_database_2.dbo'}\NormalTok{),} - \DataTypeTok{validationSchemaCdm =} \KeywordTok{list}\NormalTok{(}\StringTok{'new_database_1.dbo'}\NormalTok{,} - \StringTok{'new_database_2.dbo'}\NormalTok{), } - \DataTypeTok{databaseNames =} \KeywordTok{c}\NormalTok{(}\StringTok{'database1'}\NormalTok{,}\StringTok{'database2'}\NormalTok{),} - \DataTypeTok{validationTableTarget =} \StringTok{'your new cohort table'}\NormalTok{,} - \DataTypeTok{validationTableOutcome =} \StringTok{'your new cohort table'}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -This then saves the external validation results in the validation folder -of the main study (the outputLocation you used in runPlpAnalyses). - -\hypertarget{viewing-the-results}{% -\section{Viewing the results}\label{viewing-the-results}} - -To view the results for the multiple prediction analysis: - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{viewMultiplePlp}\NormalTok{(}\DataTypeTok{analysesLocation=}\StringTok{"./PlpMultiOutput"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -If the validation directory in ``./PlpMultiOutput'' has results, the -external validation will also be displayed. - -\hypertarget{acknowledgments}{% -\section{Acknowledgments}\label{acknowledgments}} - -Considerable work has been dedicated to provide the -\texttt{PatientLevelPrediction} package. - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{citation}\NormalTok{(}\StringTok{"PatientLevelPrediction"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -## -## To cite PatientLevelPrediction in publications use: -## -## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design and -## implementation of a standardized framework to generate and evaluate patient-level -## prediction models using observational healthcare data." _Journal of the American -## Medical Informatics Association_, *25*(8), 969-975. . -## -## A BibTeX entry for LaTeX users is -## -## @Article{, -## author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. 
Rijnbeek}, -## title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data}, -## journal = {Journal of the American Medical Informatics Association}, -## volume = {25}, -## number = {8}, -## pages = {969-975}, -## year = {2018}, -## url = {https://doi.org/10.1093/jamia/ocy032}, -## } -\end{verbatim} - -\textbf{Please reference this paper if you use the PLP Package in your -work:} - -\href{http://dx.doi.org/10.1093/jamia/ocy032}{Reps JM, Schuemie MJ, -Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a -standardized framework to generate and evaluate patient-level prediction -models using observational healthcare data. J Am Med Inform Assoc. -2018;25(8):969-975.} - -\end{document} diff --git a/inst/doc/BuildingPredictiveModels.tex b/inst/doc/BuildingPredictiveModels.tex deleted file mode 100644 index 8a29c5af4..000000000 --- a/inst/doc/BuildingPredictiveModels.tex +++ /dev/null @@ -1,2409 +0,0 @@ -% Options for packages loaded elsewhere -\PassOptionsToPackage{unicode}{hyperref} -\PassOptionsToPackage{hyphens}{url} -% -\documentclass[ -]{article} -\usepackage{lmodern} -\usepackage{amssymb,amsmath} -\usepackage{ifxetex,ifluatex} -\ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex - \usepackage[T1]{fontenc} - \usepackage[utf8]{inputenc} - \usepackage{textcomp} % provide euro and other symbols -\else % if luatex or xetex - \usepackage{unicode-math} - \defaultfontfeatures{Scale=MatchLowercase} - \defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1} -\fi -% Use upquote if available, for straight quotes in verbatim environments -\IfFileExists{upquote.sty}{\usepackage{upquote}}{} -\IfFileExists{microtype.sty}{% use microtype if available - \usepackage[]{microtype} - \UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts -}{} -\makeatletter -\@ifundefined{KOMAClassName}{% if non-KOMA class - \IfFileExists{parskip.sty}{% - \usepackage{parskip} - }{% else - \setlength{\parindent}{0pt} - \setlength{\parskip}{6pt plus 2pt minus 1pt}} -}{% if KOMA class - \KOMAoptions{parskip=half}} -\makeatother -\usepackage{xcolor} -\IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available -\IfFileExists{bookmark.sty}{\usepackage{bookmark}}{\usepackage{hyperref}} -\hypersetup{ - pdftitle={Building patient-level predictive models}, - pdfauthor={Jenna Reps, Martijn J. Schuemie, Patrick B. Ryan, Peter R. 
Rijnbeek}, - hidelinks, - pdfcreator={LaTeX via pandoc}} -\urlstyle{same} % disable monospaced font for URLs -\usepackage[margin=1in]{geometry} -\usepackage{color} -\usepackage{fancyvrb} -\newcommand{\VerbBar}{|} -\newcommand{\VERB}{\Verb[commandchars=\\\{\}]} -\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}} -% Add ',fontsize=\small' for more characters per line -\usepackage{framed} -\definecolor{shadecolor}{RGB}{248,248,248} -\newenvironment{Shaded}{\begin{snugshade}}{\end{snugshade}} -\newcommand{\AlertTok}[1]{\textcolor[rgb]{0.94,0.16,0.16}{#1}} -\newcommand{\AnnotationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\AttributeTok}[1]{\textcolor[rgb]{0.77,0.63,0.00}{#1}} -\newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\BuiltInTok}[1]{#1} -\newcommand{\CharTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\CommentTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}} -\newcommand{\CommentVarTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\ConstantTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\ControlFlowTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}} -\newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{#1}} -\newcommand{\DecValTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\DocumentationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\ErrorTok}[1]{\textcolor[rgb]{0.64,0.00,0.00}{\textbf{#1}}} -\newcommand{\ExtensionTok}[1]{#1} -\newcommand{\FloatTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\ImportTok}[1]{#1} -\newcommand{\InformationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}} -\newcommand{\NormalTok}[1]{#1} -\newcommand{\OperatorTok}[1]{\textcolor[rgb]{0.81,0.36,0.00}{\textbf{#1}}} -\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{#1}} -\newcommand{\PreprocessorTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}} -\newcommand{\RegionMarkerTok}[1]{#1} -\newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\SpecialStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\StringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\VariableTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\VerbatimStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\WarningTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\usepackage{longtable,booktabs} -% Correct order of tables after \paragraph or \subparagraph -\usepackage{etoolbox} -\makeatletter -\patchcmd\longtable{\par}{\if@noskipsec\mbox{}\fi\par}{}{} -\makeatother -% Allow footnotes in longtable head/foot -\IfFileExists{footnotehyper.sty}{\usepackage{footnotehyper}}{\usepackage{footnote}} -\makesavenoteenv{longtable} -\usepackage{graphicx,grffile} -\makeatletter -\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi} -\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi} -\makeatother -% Scale images if necessary, so that they will not overflow the page -% margins by default, and it is still possible to overwrite the defaults -% using explicit options in \includegraphics[width, height, ...]{} -\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio} -% Set default figure placement to htbp -\makeatletter -\def\fps@figure{htbp} -\makeatother 
-\setlength{\emergencystretch}{3em} % prevent overfull lines -\providecommand{\tightlist}{% - \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}} -\setcounter{secnumdepth}{5} -\usepackage{fancyhdr} -\pagestyle{fancy} -\fancyhead{} -\fancyhead[CO,CE]{Installation Guide} -\fancyfoot[CO,CE]{PatientLevelPrediction Package Version 3.1.0} -\fancyfoot[LE,RO]{\thepage} -\renewcommand{\headrulewidth}{0.4pt} -\renewcommand{\footrulewidth}{0.4pt} - -\title{Building patient-level predictive models} -\author{Jenna Reps, Martijn J. Schuemie, Patrick B. Ryan, Peter R. Rijnbeek} -\date{2020-06-03} - -\begin{document} -\maketitle - -{ -\setcounter{tocdepth}{3} -\tableofcontents -} -\hypertarget{introduction}{% -\section{Introduction}\label{introduction}} - -Observational healthcare data, such as administrative claims and -electronic health records, are increasingly used for clinical -characterization of disease progression, quality improvement, and -population-level effect estimation for medical product safety -surveillance and comparative effectiveness. Advances in machine learning -for large dataset analysis have led to increased interest in applying -patient-level prediction on this type of data. Patient-level prediction -offers the potential for medical practice to move beyond average -treatment effects and to consider personalized risks as part of clinical -decision-making. However, many published efforts in -patient-level-prediction do not follow the model development guidelines, -fail to perform extensive external validation, or provide insufficient -model details that limits the ability of independent researchers to -reproduce the models and perform external validation. This makes it hard -to fairly evaluate the predictive performance of the models and reduces -the likelihood of the model being used appropriately in clinical -practice. To improve standards, several papers have been written -detailing guidelines for best practices in developing and reporting -prediction models. - -The Transparent Reporting of a multivariable prediction model for -\href{https://www.equator-network.org/reporting-guidelines/tripod-statement/}{\texttt{Individual\ Prognosis\ Or\ Diagnosis\ (TRIPOD)\ statement}} -provides clear recommendations for reporting prediction model -development and validation and addresses some of the concerns related to -transparency. However, data structure heterogeneity and inconsistent -terminologies still make collaboration and model sharing difficult as -different researchers are often required to write new code to extract -the data from their databases and may define variables differently. - -In our -\href{https://academic.oup.com/jamia/article/25/8/969/4989437}{\texttt{paper}}, -we propose a standardised framework for patient-level prediction that -utilizes the OMOP Common Data Model (CDM) and standardized vocabularies, -and describe the open-source software that we developed implementing the -framework's pipeline. The framework is the first to support existing -best practice guidelines and will enable open dissemination of models -that can be extensively validated across the network of OHDSI -collaborators. - -Figure 1, illustrates the prediction problem we address. Among a -population at risk, we aim to predict which patients at a defined moment -in time (t = 0) will experience some outcome during a time-at-risk. -Prediction is done using only information about the patients in an -observation window prior to that moment in time. 
- 
-\begin{figure}
-\centering
-\includegraphics{Figure1.png}
-\caption{The prediction problem}
-\end{figure}
-
-As shown in Figure 2, to define a prediction problem we have to define
-t=0 by a Target Cohort (T), the outcome we would like to predict by an
-outcome cohort (O), and the time-at-risk (TAR). Furthermore, we have to
-make design choices for the model we would like to develop, and determine
-the observational datasets to perform internal and external validation.
-This conceptual framework works for all types of prediction problems, for
-example those presented in Figure 3.
-
-\begin{figure}
-\centering
-\includegraphics{studydesign.png}
-\caption{Design choices}
-\end{figure}
-
-\begin{figure}
-\centering
-\includegraphics{problems.png}
-\caption{Examples of prediction problems}
-\end{figure}
-
-This vignette describes how you can use the
-\texttt{PatientLevelPrediction} package to build patient-level
-predictive models. The package enables data extraction, model building,
-and model evaluation using data from databases that are translated into
-the OMOP CDM. In this vignette we assume you have installed the package
-correctly using the
-\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/InstallationGuide.pdf}{\texttt{InstallationGuide}}.
-
-\hypertarget{study-specification}{%
-\section{Study specification}\label{study-specification}}
-
-We have to clearly specify our study upfront to be able to implement it.
-This means we need to define the prediction problem we would like to
-address, in which population we will build the model, which model we
-will build and how we will evaluate its performance. To guide you
-through this process we will use a ``Disease onset and progression''
-prediction type as an example.
-
-\hypertarget{problem-definition-1-stroke-in-afibrilation-patients}{%
-\subsection{Problem definition 1: Stroke in atrial fibrillation
-patients}\label{problem-definition-1-stroke-in-afibrilation-patients}}
-
-Atrial fibrillation is a disease characterized by an irregular heart
-rate that can cause poor blood flow. Patients with atrial fibrillation
-are at increased risk of ischemic stroke. Anticoagulation is a
-recommended prophylaxis treatment strategy for patients at high risk of
-stroke, though the underuse of anticoagulants and persistent severity of
-ischemic stroke represent a substantial unmet medical need. Various
-strategies have been developed to predict risk of ischemic stroke in
-patients with atrial fibrillation. CHADS2 (Gage JAMA 2001) was developed
-as a risk score based on history of congestive heart failure,
-hypertension, age\textgreater=75, diabetes and stroke. CHADS2 was
-initially derived using Medicare claims data, where it achieved good
-discrimination (AUC=0.82). However, subsequent external validation
-studies revealed that CHADS2 had substantially lower predictive accuracy
-(Keogh Thromb Haemost 2011). Subsequent stroke risk calculators have
-been developed and evaluated, including the CHADS2Vasc extension of
-CHADS2. The management of atrial fibrillation has evolved substantially
-over the last decade, for various reasons that include the introduction
-of novel oral anticoagulants. With these innovations has come a renewed
-interest in greater precision medicine for stroke prevention. 
- -We will apply the PatientLevelPrediction package to observational -healthcare data to address the following patient-level prediction -question: - -Amongst patients who are newly diagnosed with Atrial Fibrillation, which -patients will go on to have Ischemic Stroke within 1 year? - -We will define `patients who are newly diagnosed with Atrial -Fibrillation' as the first condition record of cardiac arrhythmia, which -is followed by another cardiac arrhythmia condition record, at least two -drug records for a drug used to treat arrhythmias, or a procedure to -treat arrhythmias. We will define `Ischemic stroke events' as ischemic -stroke condition records during an inpatient or ER visit; successive -records with \textgreater{} 180 day gap are considered independent -episodes. - -\hypertarget{problem-definition-2-angioedema-in-ace-inhibitor-users}{% -\subsection{Problem definition 2: Angioedema in ACE inhibitor -users}\label{problem-definition-2-angioedema-in-ace-inhibitor-users}} - -Angiotensin converting enzyme inhibitors (ACE inhibitors) are -medications used by patients with hypertension that widen the blood -vessles and therefore increse the amount of blood pumped by the heart -and decreases blood pressure. Ace inhibitors reduce a patients risk of -cardiovasular disease but can lead to drug-induced angioedema. - -We will apply the PatientLevelPrediction package to observational -healthcare data to address the following patient-level prediction -question: - -Amongst patients who are newly dispensed an ACE inhibitor, which -patients will go on to have angioedema within 1 year? - -We will define `patients who are newly dispensed an ACE inhibitor' as -the first drug record of sny ACE inhibitor, {[}\ldots{]}which is -followed by another cardiac arrhythmia condition record, at least two -drug records for a drug used to treat arrhythmias, or a procedure to -treat arrhythmias. We will define `angioedema' as an angioedema -condition record. - -\hypertarget{study-population-definition}{% -\subsection{Study population -definition}\label{study-population-definition}} - -The final study population in which we will develop our model is often a -subset of the Target population, because we will e.g.~apply criteria -that are dependent on T and O or we want to do sensitivity analyses with -subpopulations of T. For this we have to answer the following questions: - -\begin{itemize} -\item - \emph{What is the minimum amount of observation time we require before - the start of the target cohort?} This choice could depend on the - available patient time in your training data, but also on the time you - expect to be available in the data sources you want to apply the model - on in the future. The longer the minimum observation time, the more - baseline history time is available for each person to use for feature - extraction, but the fewer patients will qualify for analysis. - Moreover, there could be clinical reasons to choose a short or longer - lookback period. For our example, we will use a prior history as - lookback period (washout period). -\item - \emph{Can patients enter the target cohort multiple times?} In the - target cohort definition, a person may qualify for the cohort multiple - times during different spans of time, for example if they had - different episodes of a disease or separate periods of exposure to a - medical product. 
The cohort definition does not necessarily apply a - restriction to only let the patients enter once, but in the context of - a particular patient-level prediction problem, a user may want to - restrict the cohort to the first qualifying episode. In our example, a - person could only enter the target cohort once since our criteria was - based on first occurrence of atrial fibrillation. -\item - \emph{Do we allow persons to enter the cohort if they experienced the - outcome before?} Do we allow persons to enter the target cohort if - they experienced the outcome before qualifying for the target cohort? - Depending on the particular patient-level prediction problem, there - may be a desire to predict `incident' first occurrence of an outcome, - in which case patients who have previously experienced the outcome are - not `at-risk' for having a first occurrence and therefore should be - excluded from the target cohort. In other circumstances, there may be - a desire to predict `prevalent' episodes, whereby patients with prior - outcomes can be included in the analysis and the prior outcome itself - can be a predictor of future outcomes. For our prediction example, the - answer to this question is `Yes, allow persons with prior outcomes' - because we know from the CHADS2 score that prior strokes are very - predictive of future strokes. If this answer would have been `No' we - also have to decide how long we would look back for previous - occurrences of the outcome. -\item - \emph{How do we define the period in which we will predict our outcome - relative to the target cohort start?} We actually have to make two - decisions to answer that question. First, does the time-at-risk window - start at the date of the start of the target cohort or later? - Arguments to make it start later could be that you want to avoid - outcomes that were entered late in the record that actually occurred - before the start of the target cohort or you want to leave a gap where - interventions to prevent the outcome could theoretically be - implemented. Second, you need to define the time-at-risk by setting - the risk window end, as some specification of days offset relative to - the target cohort start or end dates. For our problem we will predict - in a `time-at-risk' window starting 1 day after the start of the - target cohort up to 365 days later (to look for 1-year risk following - atrial fibrillation diagnosis). -\item - \emph{Do we require a minimum amount of time-at-risk?} We have to - decide if we want to include patients that did not experience the - outcome but did leave the database earlier than the end of our - time-at-risk period. These patients may experience the outcome when we - do not observe them. For our prediction problem we decide to answer - this question with `Yes, require a mimimum time-at-risk' for that - reason. Furthermore, we have to decide if this constraint also applies - to persons who experienced the outcome or we will include all persons - with the outcome irrespective of their total time at risk. For - example, if the outcome is death, then persons with the outcome are - likely censored before the full time-at-risk period is complete. -\end{itemize} - -\hypertarget{model-development-settings}{% -\subsection{Model development -settings}\label{model-development-settings}} - -To develop the model we have to decide which algorithm(s) we like to -train. 
We see the selection of the best algorithm for a certain -prediction problem as an empirical question, i.e.~you need to let the -data speak for itself and try different approaches to find the best one. -There is no algorithm that will work best for all problems (no free -lunch). In our package we therefore aim to implement many algorithms. -Furthermore, we made the system modular so you can add your own custom -algorithms as described in more detail in the -\href{Link\%20to\%20be\%20added}{\texttt{AddingCustomAlgorithms}} -vignette. - -Our package currently contains the following algorithms to choose from: - -\begin{longtable}[]{@{}lll@{}} -\toprule -\begin{minipage}[b]{0.11\columnwidth}\raggedright -Algorihm\strut -\end{minipage} & \begin{minipage}[b]{0.55\columnwidth}\raggedright -Description\strut -\end{minipage} & \begin{minipage}[b]{0.25\columnwidth}\raggedright -Hyper-parameters\strut -\end{minipage}\tabularnewline -\midrule -\endhead -\begin{minipage}[t]{0.11\columnwidth}\raggedright -Regularized Logistic Regression\strut -\end{minipage} & \begin{minipage}[t]{0.55\columnwidth}\raggedright -Lasso logistic regression belongs to the family of generalized linear -models, where a linear combination of the variables is learned and -finally a logistic function maps the linear combination to a value -between 0 and 1. The lasso regularization adds a cost based on model -complexity to the objective function when training the model. This cost -is the sum of the absolute values of the linear combination of the -coefficients. The model automatically performs feature selection by -minimizing this cost. We use the Cyclic coordinate descent for logistic, -Poisson and survival analysis (Cyclops) package to perform large-scale -regularized logistic regression: -\url{https://github.com/OHDSI/Cyclops}\strut -\end{minipage} & \begin{minipage}[t]{0.25\columnwidth}\raggedright -var (starting variance), seed\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.11\columnwidth}\raggedright -Gradient boosting machines\strut -\end{minipage} & \begin{minipage}[t]{0.55\columnwidth}\raggedright -Gradient boosting machines is a boosting ensemble technique and in our -framework it combines multiple decision trees. Boosting works by -iteratively adding decision trees but adds more weight to the -data-points that are misclassified by prior decision trees in the cost -function when training the next tree. We use Extreme Gradient Boosting, -which is an efficient implementation of the gradient boosting framework -implemented in the xgboost R package available from CRAN.\strut -\end{minipage} & \begin{minipage}[t]{0.25\columnwidth}\raggedright -ntree (number of trees), max depth (max levels in tree), min rows -(minimum data points in in node), learning rate, balance (balance class -labels), seed\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.11\columnwidth}\raggedright -Random forest\strut -\end{minipage} & \begin{minipage}[t]{0.55\columnwidth}\raggedright -Random forest is a bagging ensemble technique that combines multiple -decision trees. The idea behind bagging is to reduce the likelihood of -overfitting, by using weak classifiers, but combining multiple diverse -weak classifiers into a strong classifier. Random forest accomplishes -this by training multiple decision trees but only using a subset of the -variables in each tree and the subset of variables differ between trees. 
-Our packages uses the sklearn learn implementation of Random Forest in -python.\strut -\end{minipage} & \begin{minipage}[t]{0.25\columnwidth}\raggedright -mtry (number of features in each tree),ntree (number of trees), maxDepth -(max levels in tree), minRows (minimum data points in in node),balance -(balance class labels), seed\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.11\columnwidth}\raggedright -K-nearest neighbors\strut -\end{minipage} & \begin{minipage}[t]{0.55\columnwidth}\raggedright -K-nearest neighbors (KNN) is an algorithm that uses some metric to find -the K closest labelled data-points, given the specified metric, to a new -unlabelled data-point. The prediction of the new data-points is then the -most prevalent class of the K-nearest labelled data-points. There is a -sharing limitation of KNN, as the model requires labelled data to -perform the prediction on new data, and it is often not possible to -share this data across data sites.We included the BigKnn classifier -developed in OHDSI which is a large scale k-nearest neighbor classifier -using the Lucene search engine: -\url{https://github.com/OHDSI/BigKnn}\strut -\end{minipage} & \begin{minipage}[t]{0.25\columnwidth}\raggedright -k (number of neighbours),weighted (weight by inverse frequency)\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.11\columnwidth}\raggedright -Naive Bayes\strut -\end{minipage} & \begin{minipage}[t]{0.55\columnwidth}\raggedright -The Naive Bayes algorithm applies the Bayes theorem with the `naive' -assumption of conditional independence between every pair of features -given the value of the class variable. Based on the likelihood the data -belongs to a class and the prior distribution of the class, a posterior -distribution is obtained.\strut -\end{minipage} & \begin{minipage}[t]{0.25\columnwidth}\raggedright -none\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.11\columnwidth}\raggedright -AdaBoost\strut -\end{minipage} & \begin{minipage}[t]{0.55\columnwidth}\raggedright -AdaBoost is a boosting ensemble technique. Boosting works by iteratively -adding classifiers but adds more weight to the data-points that are -misclassified by prior classifiers in the cost function when training -the next classifier. We use the sklearn `AdaboostClassifier' -implementation in Python.\strut -\end{minipage} & \begin{minipage}[t]{0.25\columnwidth}\raggedright -nEstimators (the maximum number of estimators at which boosting is -terminated), learningRate (learning rate shrinks the contribution of -each classifier by learning\_rate. There is a trade-off between -learningRate and nEstimators)\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.11\columnwidth}\raggedright -Decision Tree\strut -\end{minipage} & \begin{minipage}[t]{0.55\columnwidth}\raggedright -A decision tree is a classifier that partitions the variable space using -individual tests selected using a greedy approach. It aims to find -partitions that have the highest information gain to separate the -classes. The decision tree can easily overfit by enabling a large number -of partitions (tree depth) and often needs some regularization (e.g., -pruning or specifying hyper-parameters that limit the complexity of the -model). We use the sklearn `DecisionTreeClassifier' implementation in -Python.\strut -\end{minipage} & \begin{minipage}[t]{0.25\columnwidth}\raggedright -maxDepth (the maximum depth of the tree), -minSamplesSplit,minSamplesLeaf, minImpuritySplit (threshold for early -stopping in tree growth. 
A node will split if its impurity is above the
-threshold, otherwise it is a leaf.), seed, classWeight (`Balance' or
-`None')\strut
-\end{minipage}\tabularnewline
-\begin{minipage}[t]{0.11\columnwidth}\raggedright
-Multilayer Perceptron\strut
-\end{minipage} & \begin{minipage}[t]{0.55\columnwidth}\raggedright
-Neural networks contain multiple layers that weight their inputs using a
-non-linear function. The first layer is the input layer, the last layer
-is the output layer, and the layers in between are the hidden layers.
-Neural networks are generally trained using feed forward
-back-propagation. This is when you go through the network with a
-data-point and calculate the error between the true label and predicted
-label, then go backwards through the network and update the linear
-function weights based on the error. This can also be performed as a
-batch, where multiple data-points are fed through the network at
-once.\strut
-\end{minipage} & \begin{minipage}[t]{0.25\columnwidth}\raggedright
-size (the number of hidden nodes), alpha (the l2 regularisation),
-seed\strut
-\end{minipage}\tabularnewline
-\begin{minipage}[t]{0.11\columnwidth}\raggedright
-Deep Learning\strut
-\end{minipage} & \begin{minipage}[t]{0.55\columnwidth}\raggedright
-Deep learning such as deep nets, convolutional neural networks or
-recurrent neural networks are similar to a neural network but have
-multiple hidden layers that aim to learn latent representations useful
-for prediction. In the separate BuildingDeepLearningModels vignette we
-describe these models and hyper-parameters in more detail.\strut
-\end{minipage} & \begin{minipage}[t]{0.25\columnwidth}\raggedright
-see BuildingDeepLearningModels vignette\strut
-\end{minipage}\tabularnewline
-\bottomrule
-\end{longtable}
-
-Furthermore, we have to decide on the \textbf{covariates} that we will
-use to train our model. This choice can be driven by domain knowledge or
-by the available computational resources. In our example, we would like
-to add Gender, Age, Conditions, Drugs Groups, and Visit Count. We also
-have to specify in which time windows we will look and we decide to look
-in the year before and any time prior.
-
-Finally, we have to define how we will train and test our model on our
-data, i.e.~how we perform \textbf{internal validation}. For this we have
-to decide how we divide our dataset into a training and a testing
-dataset and how we randomly assign patients to these two sets. Depending
-on the size of the dataset, we can decide how much data we would like to
-use for training; typically this is a 75\%-25\% split. If you have very
-large datasets you can use more data for training. To randomly assign
-patients to the training and testing set, there are two commonly used
-approaches:
-
-\begin{enumerate}
-\def\labelenumi{\arabic{enumi}.}
-\tightlist
-\item
-  split by person. In this case a random seed is used to assign each
-  patient to either set.
-\item
-  split by time. In this case a time point is used to split the persons,
-  e.g.~75\% of the data is before and 25\% is after this date. The
-  advantage of this is that you take into consideration that the health
-  care system has changed over time.
-\end{enumerate} - -We now completely defined our studies and implement them: - -\begin{itemize} -\tightlist -\item - \protect\hyperlink{example1}{See example 1: Stroke in afibrilation - patients} -\item - \protect\hyperlink{example2}{See example 2: Agioedema in ACE inhibitor - new users} -\end{itemize} - -\hypertarget{example1}{% -\section{Example 1: Stroke in afibrilation patients}\label{example1}} - -\hypertarget{study-specification-1}{% -\subsection{Study Specification}\label{study-specification-1}} - -For our first prediction model we decide to start with a Regularized -Logistic Regression and will use the default parameters. We will do a -75\%-25\% split by person. - -\begin{longtable}[]{@{}ll@{}} -\toprule -\begin{minipage}[b]{0.42\columnwidth}\raggedright -Definition\strut -\end{minipage} & \begin{minipage}[b]{0.52\columnwidth}\raggedright -Value\strut -\end{minipage}\tabularnewline -\midrule -\endhead -\begin{minipage}[t]{0.42\columnwidth}\raggedright -\textbf{Problem Definition}\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Target Cohort (T)\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -`Patients who are newly diagnosed with Atrial Fibrillation' defined as -the first condition record of cardiac arrhythmia, which is followed by -another cardiac arrhythmia condition record, at least two drug records -for a drug used to treat arrhythmias, or a procedure to treat -arrhythmias.\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Outcome Cohort (O)\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -`Ischemic stroke events' defined as ischemic stroke condition records -during an inpatient or ER visit; successive records with \textgreater{} -180 day gap are considered independent episodes.\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Time-at-risk (TAR)\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -1 day till 365 days from cohort start\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -\textbf{Population Definition}\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Washout Period\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -1095\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Enter the target cohort multiple times?\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -No\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Allow prior outcomes?\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -Yes\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Start of time-at-risk\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -1 day\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -End of time-at-risk\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -365 days\strut -\end{minipage}\tabularnewline 
-\begin{minipage}[t]{0.42\columnwidth}\raggedright -Require a minimum amount of time-at-risk?\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -Yes (364 days)\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -\textbf{Model Development}\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Algorithm\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -Regularized Logistic Regression\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Hyper-parameters\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -variance = 0.01 (Default)\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Covariates\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -Gender, Age, Conditions (ever before, \textless365), Drugs Groups (ever -before, \textless365), and Visit Count\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Data split\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -75\% train, 25\% test. Randomly assigned by person\strut -\end{minipage}\tabularnewline -\bottomrule -\end{longtable} - -According to the best practices we need to make a protocol that -completely specifies how we plan to execute our study. This protocol -will be assessed by the governance boards of the participating data -sources in your network study. For this a template could be used but we -prefer to automate this process as much as possible by adding -functionality to automatically generate study protocol from a study -specification. We will discuss this in more detail later. - -\hypertarget{study-implementation}{% -\subsection{Study implementation}\label{study-implementation}} - -Now we have completely design our study we have to implement the study. -We have to generate the target and outcome cohorts and we need to -develop the R code to run against our CDM that will execute the full -study. - -\hypertarget{cohort-instantiation}{% -\subsubsection{Cohort instantiation}\label{cohort-instantiation}} - -For our study we need to know when a person enters the target and -outcome cohorts. This is stored in a table on the server that contains -the cohort start date and cohort end date for all subjects for a -specific cohort definition. This cohort table has a very simple -structure as shown below: - -\begin{itemize} -\tightlist -\item - \texttt{cohort\_definition\_id}, a unique identifier for - distinguishing between different types of cohorts, e.g.~cohorts of - interest and outcome cohorts. -\item - \texttt{subject\_id}, a unique identifier corresponding to the - \texttt{person\_id} in the CDM. -\item - \texttt{cohort\_start\_date}, the date the subject enters the cohort. -\item - \texttt{cohort\_end\_date}, the date the subject leaves the cohort. -\end{itemize} - -How do we fill this table according to our cohort definitions? 
There are -two options for this: - -\begin{enumerate} -\def\labelenumi{\arabic{enumi})} -\item - use the interactive cohort builder tool in - \href{www.github.com/OHDSI/ATLAS}{ATLAS} which can be used to create - cohorts based on inclusion criteria and will automatically populate - this cohort table. -\item - write your own custom SQL statements to fill the cohort table. -\end{enumerate} - -Both methods are described below for our example prediction problem. - -\hypertarget{atlas-cohort-builder}{% -\subsubsection{ATLAS cohort builder}\label{atlas-cohort-builder}} - -\begin{figure} -\centering -\includegraphics{example1/ATLAS_T.png} -\caption{Target Cohort Atrial Fibrillation} -\end{figure} - -ATLAS allows you to define cohorts interactively by specifying cohort -entry and cohort exit criteria. Cohort entry criteria involve selecting -one or more initial events, which determine the start date for cohort -entry, and optionally specifying additional inclusion criteria which -filter to the qualifying events. Cohort exit criteria are applied to -each cohort entry record to determine the end date when the person's -episode no longer qualifies for the cohort. For the outcome cohort the -end date is less relevant. As an example, Figure 4 shows how we created -the Atrial Fibrillation cohort and Figure 5 shows how we created the -stroke cohort in ATLAS. - -\begin{figure} -\centering -\includegraphics{example1/ATLAS_O.png} -\caption{Outcome Cohort Stroke} -\end{figure} - -The T and O cohorts can be found here: - -\begin{itemize} -\tightlist -\item - Atrial Fibrillaton (T): - \url{http://www.ohdsi.org/web/atlas/\#/cohortdefinition/1769447} -\item - Stroke (O) : - \url{http://www.ohdsi.org/web/atlas/\#/cohortdefinition/1769448} -\end{itemize} - -In depth explanation of cohort creation in ATLAS is out of scope of this -vignette but can be found on the OHDSI wiki pages -\href{http://www.ohdsi.org/web/wiki/doku.php?id=documentation:software:atlas}{(link)}. - -Note that when a cohort is created in ATLAS the cohortid is needed to -extract the data in R. The cohortid can be found at the top of the ATLAS -screen, e.g.~1769447 in Figure 4. - -\hypertarget{custom-cohorts}{% -\subsubsection{Custom cohorts}\label{custom-cohorts}} - -It is also possible to create cohorts without the use of ATLAS. Using -custom cohort code (SQL) you can make more advanced cohorts if needed. - -For our example study, we need to create at table to hold the cohort -data and we need to create SQL code to instantiate this table for both -the AF and Stroke cohorts. 
Therefore, we create a file called -\emph{AfStrokeCohorts.sql} with the following contents: - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{/***********************************} -\CommentTok{File AfStrokeCohorts.sql } -\CommentTok{***********************************/} -\CommentTok{/*} -\CommentTok{Create a table to store the persons in the T and C cohort} -\CommentTok{*/} - -\ControlFlowTok{IF}\NormalTok{ OBJECT_ID(}\StringTok{'@resultsDatabaseSchema.PLPAFibStrokeCohort'}\NormalTok{, }\StringTok{'U'}\NormalTok{) }\KeywordTok{IS} \KeywordTok{NOT} \KeywordTok{NULL} -\KeywordTok{DROP} \KeywordTok{TABLE}\NormalTok{ @resultsDatabaseSchema.PLPAFibStrokeCohort;} - -\KeywordTok{CREATE} \KeywordTok{TABLE}\NormalTok{ @resultsDatabaseSchema.PLPAFibStrokeCohort } -\NormalTok{( } -\NormalTok{cohort_definition_id }\DataTypeTok{INT}\NormalTok{, } -\NormalTok{subject_id BIGINT,} -\NormalTok{cohort_start_date }\DataTypeTok{DATE}\NormalTok{, } -\NormalTok{cohort_end_date }\DataTypeTok{DATE} -\NormalTok{);} - - -\CommentTok{/*} -\CommentTok{T cohort: [PatientLevelPrediction vignette]: T : patients who are newly } -\CommentTok{diagnosed with Atrial fibrillation} -\CommentTok{- persons with a condition occurrence record of 'Atrial fibrillation' or } -\CommentTok{any descendants, indexed at the first diagnosis} -\CommentTok{- who have >1095 days of prior observation before their first diagnosis} -\CommentTok{- and have no warfarin exposure any time prior to first AFib diagnosis} -\CommentTok{*/} -\KeywordTok{INSERT} \KeywordTok{INTO}\NormalTok{ @resultsDatabaseSchema.AFibStrokeCohort (cohort_definition_id, } -\NormalTok{subject_id, } -\NormalTok{cohort_start_date, } -\NormalTok{cohort_end_date)} -\KeywordTok{SELECT} \DecValTok{1} \KeywordTok{AS}\NormalTok{ cohort_definition_id,} -\NormalTok{AFib.person_id }\KeywordTok{AS}\NormalTok{ subject_id,} -\NormalTok{AFib.condition_start_date }\KeywordTok{AS}\NormalTok{ cohort_start_date,} -\NormalTok{observation_period.observation_period_end_date }\KeywordTok{AS}\NormalTok{ cohort_end_date} -\KeywordTok{FROM} -\NormalTok{(} - \KeywordTok{SELECT}\NormalTok{ person_id, }\FunctionTok{min}\NormalTok{(condition_start_date) }\KeywordTok{as}\NormalTok{ condition_start_date} - \KeywordTok{FROM}\NormalTok{ @cdmDatabaseSchema.condition_occurrence} - \KeywordTok{WHERE}\NormalTok{ condition_concept_id }\KeywordTok{IN}\NormalTok{ (}\KeywordTok{SELECT}\NormalTok{ descendant_concept_id }\KeywordTok{FROM} -\NormalTok{ @cdmDatabaseSchema.concept_ancestor }\KeywordTok{WHERE}\NormalTok{ ancestor_concept_id }\KeywordTok{IN} -\NormalTok{ (}\DecValTok{313217} \CommentTok{/*atrial fibrillation*/}\NormalTok{))} - \KeywordTok{GROUP} \KeywordTok{BY}\NormalTok{ person_id} -\NormalTok{) AFib} - \KeywordTok{INNER} \KeywordTok{JOIN}\NormalTok{ @cdmDatabaseSchema.observation_period} - \KeywordTok{ON}\NormalTok{ AFib.person_id }\OperatorTok{=}\NormalTok{ observation_period.person_id} - \KeywordTok{AND}\NormalTok{ AFib.condition_start_date }\OperatorTok{>=}\NormalTok{ dateadd(dd,}\DecValTok{1095}\NormalTok{, } -\NormalTok{ observation_period.observation_period_start_date)} - \KeywordTok{AND}\NormalTok{ AFib.condition_start_date }\OperatorTok{<=}\NormalTok{ observation_period.observation_period_end_date} - \KeywordTok{LEFT} \KeywordTok{JOIN} -\NormalTok{ (} - \KeywordTok{SELECT}\NormalTok{ person_id, }\FunctionTok{min}\NormalTok{(drug_exposure_start_date) }\KeywordTok{as}\NormalTok{ drug_exposure_start_date} - \KeywordTok{FROM}\NormalTok{ @cdmDatabaseSchema.drug_exposure} - 
\KeywordTok{WHERE}\NormalTok{ drug_concept_id }\KeywordTok{IN}\NormalTok{ (}\KeywordTok{SELECT}\NormalTok{ descendant_concept_id }\KeywordTok{FROM} -\NormalTok{ @cdmDatabaseSchema.concept_ancestor }\KeywordTok{WHERE}\NormalTok{ ancestor_concept_id }\KeywordTok{IN} -\NormalTok{ (}\DecValTok{1310149} \CommentTok{/*warfarin*/}\NormalTok{))} - \KeywordTok{GROUP} \KeywordTok{BY}\NormalTok{ person_id} -\NormalTok{ ) warfarin} - \KeywordTok{ON}\NormalTok{ Afib.person_id }\OperatorTok{=}\NormalTok{ warfarin.person_id} - \KeywordTok{AND}\NormalTok{ Afib.condition_start_date }\OperatorTok{>}\NormalTok{ warfarin.drug_exposure_start_date} - \KeywordTok{WHERE}\NormalTok{ warfarin.person_id }\KeywordTok{IS} \KeywordTok{NULL} -\NormalTok{ ;} - - \CommentTok{/*} -\CommentTok{ C cohort: [PatientLevelPrediction vignette]: O: Ischemic stroke events} -\CommentTok{ - inpatient visits that include a condition occurrence record for } -\CommentTok{ 'cerebral infarction' and descendants, 'cerebral thrombosis', } -\CommentTok{ 'cerebral embolism', 'cerebral artery occlusion' } -\CommentTok{ */} - \KeywordTok{INSERT} \KeywordTok{INTO}\NormalTok{ @resultsDatabaseSchema.AFibStrokeCohort (cohort_definition_id, } -\NormalTok{ subject_id, } -\NormalTok{ cohort_start_date, } -\NormalTok{ cohort_end_date)} - \KeywordTok{SELECT} \DecValTok{2} \KeywordTok{AS}\NormalTok{ cohort_definition_id,} -\NormalTok{ visit_occurrence.person_id }\KeywordTok{AS}\NormalTok{ subject_id,} -\NormalTok{ visit_occurrence.visit_start_date }\KeywordTok{AS}\NormalTok{ cohort_start_date,} -\NormalTok{ visit_occurrence.visit_end_date }\KeywordTok{AS}\NormalTok{ cohort_end_date} - \KeywordTok{FROM} -\NormalTok{ (} - \KeywordTok{SELECT}\NormalTok{ person_id, condition_start_date} - \KeywordTok{FROM}\NormalTok{ @cdmDatabaseSchema.condition_occurrence} - \KeywordTok{WHERE}\NormalTok{ condition_concept_id }\KeywordTok{IN}\NormalTok{ (}\KeywordTok{SELECT} \KeywordTok{DISTINCT}\NormalTok{ descendant_concept_id }\KeywordTok{FROM} -\NormalTok{ @cdmDatabaseSchema.concept_ancestor }\KeywordTok{WHERE}\NormalTok{ ancestor_concept_id }\KeywordTok{IN} -\NormalTok{ (}\DecValTok{443454} \CommentTok{/*cerebral infarction*/}\NormalTok{) }\KeywordTok{OR}\NormalTok{ descendant_concept_id }\KeywordTok{IN} -\NormalTok{ (}\DecValTok{441874} \CommentTok{/*cerebral thrombosis*/}\NormalTok{, }\DecValTok{375557} \CommentTok{/*cerebral embolism*/}\NormalTok{, } - \DecValTok{372924} \CommentTok{/*cerebral artery occlusion*/}\NormalTok{))} -\NormalTok{ ) stroke} - \KeywordTok{INNER} \KeywordTok{JOIN}\NormalTok{ @cdmDatabaseSchema.visit_occurrence} - \KeywordTok{ON}\NormalTok{ stroke.person_id }\OperatorTok{=}\NormalTok{ visit_occurrence.person_id} - \KeywordTok{AND}\NormalTok{ stroke.condition_start_date }\OperatorTok{>=}\NormalTok{ visit_occurrence.visit_start_date} - \KeywordTok{AND}\NormalTok{ stroke.condition_start_date }\OperatorTok{<=}\NormalTok{ visit_occurrence.visit_end_date} - \KeywordTok{AND}\NormalTok{ visit_occurrence.visit_concept_id }\KeywordTok{IN}\NormalTok{ (}\DecValTok{9201}\NormalTok{, }\DecValTok{262} \CommentTok{/*'Inpatient Visit' or } -\CommentTok{ 'Emergency Room and Inpatient Visit'*/}\NormalTok{)} - \KeywordTok{GROUP} \KeywordTok{BY}\NormalTok{ visit_occurrence.person_id, visit_occurrence.visit_start_date, } -\NormalTok{ visit_occurrence.visit_end_date} -\NormalTok{ ;} - -\end{Highlighting} -\end{Shaded} - -This is parameterized SQL which can be used by the -\href{http://github.com/OHDSI/SqlRender}{\texttt{SqlRender}} package. 
We -use parameterized SQL so we do not have to pre-specify the names of the -CDM and result schemas. That way, if we want to run the SQL on a -different schema, we only need to change the parameter values; we do not -have to change the SQL code. By also making use of translation -functionality in \texttt{SqlRender}, we can make sure the SQL code can -be run in many different environments. - -To execute this sql against our CDM we first need to tell R how to -connect to the server. \texttt{PatientLevelPrediction} uses the -\href{http://github.com/ohdsi/DatabaseConnector}{\texttt{DatabaseConnector}} -package, which provides a function called -\texttt{createConnectionDetails}. Type \texttt{?createConnectionDetails} -for the specific settings required for the various database management -systems (DBMS). For example, one might connect to a PostgreSQL database -using this code: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{ connectionDetails <-}\StringTok{ }\KeywordTok{createConnectionDetails}\NormalTok{(}\DataTypeTok{dbms =} \StringTok{"postgresql"}\NormalTok{, } - \DataTypeTok{server =} \StringTok{"localhost/ohdsi"}\NormalTok{, } - \DataTypeTok{user =} \StringTok{"joe"}\NormalTok{, } - \DataTypeTok{password =} \StringTok{"supersecret"}\NormalTok{)} - -\NormalTok{ cdmDatabaseSchema <-}\StringTok{ "my_cdm_data"} -\NormalTok{ cohortsDatabaseSchema <-}\StringTok{ "my_results"} -\NormalTok{ cdmVersion <-}\StringTok{ "5"} -\end{Highlighting} -\end{Shaded} - -The last three lines define the \texttt{cdmDatabaseSchema} and -\texttt{cohortsDatabaseSchema} variables, as well as the CDM version. We -will use these later to tell R where the data in CDM format live, where -we want to create the cohorts of interest, and what version CDM is used. -Note that for Microsoft SQL Server, databaseschemas need to specify both -the database and the schema, so for example -\texttt{cdmDatabaseSchema\ \textless{}-\ "my\_cdm\_data.dbo"}. - -\begin{Shaded} -\begin{Highlighting}[] - \KeywordTok{library}\NormalTok{(SqlRender)} -\NormalTok{ sql <-}\StringTok{ }\KeywordTok{readSql}\NormalTok{(}\StringTok{"AfStrokeCohorts.sql"}\NormalTok{)} -\NormalTok{ sql <-}\StringTok{ }\KeywordTok{renderSql}\NormalTok{(sql,} - \DataTypeTok{cdmDatabaseSchema =}\NormalTok{ cdmDatabaseSchema,} - \DataTypeTok{cohortsDatabaseSchema =}\NormalTok{ cohortsDatabaseSchema,} - \DataTypeTok{post_time =} \DecValTok{30}\NormalTok{,} - \DataTypeTok{pre_time =} \DecValTok{365}\NormalTok{)}\OperatorTok{$}\NormalTok{sql} -\NormalTok{ sql <-}\StringTok{ }\KeywordTok{translateSql}\NormalTok{(sql, }\DataTypeTok{targetDialect =}\NormalTok{ connectionDetails}\OperatorTok{$}\NormalTok{dbms)}\OperatorTok{$}\NormalTok{sql} - -\NormalTok{ connection <-}\StringTok{ }\KeywordTok{connect}\NormalTok{(connectionDetails)} - \KeywordTok{executeSql}\NormalTok{(connection, sql)} -\end{Highlighting} -\end{Shaded} - -In this code, we first read the SQL from the file into memory. In the -next line, we replace four parameter names with the actual values. We -then translate the SQL into the dialect appropriate for the DBMS we -already specified in the \texttt{connectionDetails}. Next, we connect to -the server, and submit the rendered and translated SQL. - -If all went well, we now have a table with the events of interest. 
We -can see how many events per type: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{ sql <-}\StringTok{ }\KeywordTok{paste}\NormalTok{(}\StringTok{"SELECT cohort_definition_id, COUNT(*) AS count"}\NormalTok{,} - \StringTok{"FROM @cohortsDatabaseSchema.AFibStrokeCohort"}\NormalTok{,} - \StringTok{"GROUP BY cohort_definition_id"}\NormalTok{)} -\NormalTok{ sql <-}\StringTok{ }\KeywordTok{renderSql}\NormalTok{(sql, }\DataTypeTok{cohortsDatabaseSchema =}\NormalTok{ cohortsDatabaseSchema)}\OperatorTok{$}\NormalTok{sql} -\NormalTok{ sql <-}\StringTok{ }\KeywordTok{translateSql}\NormalTok{(sql, }\DataTypeTok{targetDialect =}\NormalTok{ connectionDetails}\OperatorTok{$}\NormalTok{dbms)}\OperatorTok{$}\NormalTok{sql} - - \KeywordTok{querySql}\NormalTok{(connection, sql)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -## cohort_definition_id count -## 1 1 527616 -## 2 2 221555 -\end{verbatim} - -\hypertarget{study-script-creation}{% -\subsubsection{Study script creation}\label{study-script-creation}} - -In this section we assume that our cohorts have been created either by -using ATLAS or a custom SQL script. We will first explain how to create -an R script yourself that will execute our study as we have defined -earlier. - -\hypertarget{data-extraction}{% -\subsubsection{Data extraction}\label{data-extraction}} - -Now we can tell \texttt{PatientLevelPrediction} to extract all necessary -data for our analysis. This is done using the -\href{https://github.com/OHDSI/FeatureExtration}{\texttt{FeatureExtractionPackage}}. -In short the FeatureExtractionPackage allows you to specify which -features (covariates) need to be extracted, e.g.~all conditions and drug -exposures. It also supports the creation of custom covariates. For more -detailed information on the FeatureExtraction package see its -\href{https://github.com/OHDSI/FeatureExtration}{vignettes}. For our -example study we decided to use these settings: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{ covariateSettings <-}\StringTok{ }\KeywordTok{createCovariateSettings}\NormalTok{(}\DataTypeTok{useDemographicsGender =} \OtherTok{TRUE}\NormalTok{,} - \DataTypeTok{useDemographicsAge =} \OtherTok{TRUE}\NormalTok{,} - \DataTypeTok{useConditionGroupEraLongTerm =} \OtherTok{TRUE}\NormalTok{,} - \DataTypeTok{useConditionGroupEraAnyTimePrior =} \OtherTok{TRUE}\NormalTok{,} - \DataTypeTok{useDrugGroupEraLongTerm =} \OtherTok{TRUE}\NormalTok{,} - \DataTypeTok{useDrugGroupEraAnyTimePrior =} \OtherTok{TRUE}\NormalTok{,} - \DataTypeTok{useVisitConceptCountLongTerm =} \OtherTok{TRUE}\NormalTok{,} - \DataTypeTok{longTermStartDays =} \DecValTok{-365}\NormalTok{,} - \DataTypeTok{endDays =} \DecValTok{-1}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -The final step for extracting the data is to run the \texttt{getPlpData} -function and input the connection details, the database schema where the -cohorts are stored, the cohort definition ids for the cohort and -outcome, and the washoutPeriod which is the minimum number of days prior -to cohort index date that the person must have been observed to be -included into the data, and finally input the previously constructed -covariate settings. 
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{ plpData <-}\StringTok{ }\KeywordTok{getPlpData}\NormalTok{(}\DataTypeTok{connectionDetails =}\NormalTok{ connectionDetails,}
-                       \DataTypeTok{cdmDatabaseSchema =}\NormalTok{ cdmDatabaseSchema,}
-                       \DataTypeTok{cohortDatabaseSchema =}\NormalTok{ cohortsDatabaseSchema,}
-                       \DataTypeTok{cohortTable =} \StringTok{'AFibStrokeCohort'}\NormalTok{,}
-                       \DataTypeTok{cohortId =} \DecValTok{1}\NormalTok{,}
-                       \DataTypeTok{covariateSettings =}\NormalTok{ covariateSettings,}
-                       \DataTypeTok{outcomeDatabaseSchema =}\NormalTok{ cohortsDatabaseSchema,}
-                       \DataTypeTok{outcomeTable =} \StringTok{'AFibStrokeCohort'}\NormalTok{,}
-                       \DataTypeTok{outcomeIds =} \DecValTok{2}\NormalTok{,}
-                       \DataTypeTok{sampleSize =} \DecValTok{10000}
-\NormalTok{ )}
-\end{Highlighting}
-\end{Shaded}
-
-Note that if the cohorts are created in ATLAS, the corresponding cohort
-database schema needs to be selected. There are many additional
-parameters for the \texttt{getPlpData} function which are all documented
-in the \texttt{PatientLevelPrediction} manual. The resulting
-\texttt{plpData} object uses the package \texttt{ff} to store
-information in a way that ensures R does not run out of memory, even
-when the data are large.
-
-Creating the \texttt{plpData} object can take considerable computing
-time, and it is probably a good idea to save it for future sessions.
-Because \texttt{plpData} uses \texttt{ff}, we cannot use R's regular
-save function. Instead, we'll have to use the \texttt{savePlpData()}
-function:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\KeywordTok{savePlpData}\NormalTok{(plpData, }\StringTok{"stroke_in_af_data"}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-We can use the \texttt{loadPlpData()} function to load the data in a
-future session.
-
-\hypertarget{additional-inclusion-criteria}{%
-\subsubsection{Additional inclusion
-criteria}\label{additional-inclusion-criteria}}
-
-To completely define the prediction problem the final study population
-is obtained by applying additional constraints on the two earlier
-defined cohorts, e.g., a minimum time at risk can be enforced
-(\texttt{requireTimeAtRisk,\ minTimeAtRisk}) and we can specify if this
-also applies to patients with the outcome (\texttt{includeAllOutcomes}).
-Here we also specify the start and end of the risk window relative to
-the target cohort start. For example, if we would like the risk window
-to start 30 days after the at-risk cohort start and end a year later we
-can set \texttt{riskWindowStart\ =\ 30} and
-\texttt{riskWindowEnd\ =\ 365}. In some cases the risk window needs to
-start at the cohort end date. This can be achieved by setting
-\texttt{addExposureDaysToStart\ =\ TRUE}, which adds the cohort
-(exposure) time to the start date.
-
-In Appendix 1, we demonstrate the effect of these settings on the subset
-of the persons in the target cohort that end up in the final study
-population.
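-
-For instance, a risk window that runs from the cohort end date could be
-requested along the following lines (a sketch only: the object name
-\texttt{populationCohortEnd} is illustrative and the remaining settings
-shown in the full example below would still need to be supplied):
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\CommentTok{# sketch: anchor both ends of the time-at-risk to the cohort end date}
-\NormalTok{ populationCohortEnd <-}\StringTok{ }\KeywordTok{createStudyPopulation}\NormalTok{(}\DataTypeTok{plpData =}\NormalTok{ plpData,}
-                                             \DataTypeTok{outcomeId =} \DecValTok{2}\NormalTok{,}
-                                             \DataTypeTok{riskWindowStart =} \DecValTok{1}\NormalTok{,}
-                                             \DataTypeTok{riskWindowEnd =} \DecValTok{365}\NormalTok{,}
-                                             \DataTypeTok{addExposureDaysToStart =} \OtherTok{TRUE}\NormalTok{,}
-                                             \DataTypeTok{addExposureDaysToEnd =} \OtherTok{TRUE}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}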
-
-In the example below all the settings we defined for our study are
-imposed:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{ population <-}\StringTok{ }\KeywordTok{createStudyPopulation}\NormalTok{(}\DataTypeTok{plpData =}\NormalTok{ plpData,}
-                                    \DataTypeTok{outcomeId =} \DecValTok{2}\NormalTok{,}
-                                    \DataTypeTok{washoutPeriod =} \DecValTok{1095}\NormalTok{,}
-                                    \DataTypeTok{firstExposureOnly =} \OtherTok{FALSE}\NormalTok{,}
-                                    \DataTypeTok{removeSubjectsWithPriorOutcome =} \OtherTok{FALSE}\NormalTok{,}
-                                    \DataTypeTok{priorOutcomeLookback =} \DecValTok{1}\NormalTok{,}
-                                    \DataTypeTok{riskWindowStart =} \DecValTok{1}\NormalTok{,}
-                                    \DataTypeTok{riskWindowEnd =} \DecValTok{365}\NormalTok{,}
-                                    \DataTypeTok{addExposureDaysToStart =} \OtherTok{FALSE}\NormalTok{,}
-                                    \DataTypeTok{addExposureDaysToEnd =} \OtherTok{FALSE}\NormalTok{,}
-                                    \DataTypeTok{minTimeAtRisk =} \DecValTok{364}\NormalTok{,}
-                                    \DataTypeTok{requireTimeAtRisk =} \OtherTok{TRUE}\NormalTok{,}
-                                    \DataTypeTok{includeAllOutcomes =} \OtherTok{TRUE}\NormalTok{,}
-                                    \DataTypeTok{verbosity =} \StringTok{"DEBUG"}
-\NormalTok{ )}
-\end{Highlighting}
-\end{Shaded}
-
-\hypertarget{model-development}{%
-\subsubsection{Model Development}\label{model-development}}
-
-In the set function of an algorithm the user can specify a list of
-eligible values for each hyper-parameter. All possible combinations of
-the hyper-parameters are included in a so-called grid search using
-cross-validation on the training set. If a user does not specify any
-value then the default value is used instead.
-
-For example, if we use the following settings for the gradient boosting
-machine, ntrees=c(100,200) and maxDepth=4, the grid search will apply
-the gradient boosting machine algorithm with ntrees=100 and maxDepth=4
-plus the default settings for the other hyper-parameters, and with
-ntrees=200 and maxDepth=4 plus the default settings for the other
-hyper-parameters. The hyper-parameters that lead to the best
-cross-validation performance will then be chosen for the final model.
-For our problem we choose to build a logistic regression model with the
-default hyper-parameters:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{lrModel <-}\StringTok{ }\KeywordTok{setLassoLogisticRegression}\NormalTok{()}
-\end{Highlighting}
-\end{Shaded}
-
-The \texttt{runPlp} function uses the population, \texttt{plpData}, and
-model settings to train and evaluate the model. We can use the testSplit
-(person/time) and testFraction parameters to split the data into a
-75\%-25\% split and run the patient-level prediction pipeline:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{ lrResults <-}\StringTok{ }\KeywordTok{runPlp}\NormalTok{(population, plpData, }\DataTypeTok{modelSettings =}\NormalTok{ lrModel, }\DataTypeTok{testSplit=}\StringTok{'stratified'}\NormalTok{, }
-                       \DataTypeTok{testFraction=}\FloatTok{0.25}\NormalTok{, }\DataTypeTok{nfold=}\DecValTok{2}\NormalTok{, }\DataTypeTok{splitSeed =} \DecValTok{1234}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-Under the hood the package will now use the
-\href{www.github.com/OHDSI/Cyclops}{\texttt{Cyclops}} package to fit a
-large-scale regularized regression using 75\% of the data and will
-evaluate the model on the remaining 25\%. A results data structure is
-returned containing information about the model, its performance, etc.
-
-In the runPlp function there are several parameters to save the plpData,
-plpResults, plpPlots, evaluation, etc., which are all set to TRUE by
-default. However, there is also functionality to do this manually.
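-
-For example, the automatic saving could be switched off and the objects
-stored by hand with the functions shown next. The following is a sketch
-only: we assume the saving arguments are called \texttt{savePlpData},
-\texttt{savePlpResult}, \texttt{savePlpPlots} and
-\texttt{saveEvaluation}; check \texttt{?runPlp} for the exact argument
-names in your installed version.
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\CommentTok{# sketch: saving argument names assumed, verify with ?runPlp}
-\NormalTok{ lrResults <-}\StringTok{ }\KeywordTok{runPlp}\NormalTok{(population, plpData, }\DataTypeTok{modelSettings =}\NormalTok{ lrModel,}
-                       \DataTypeTok{testSplit =} \StringTok{'stratified'}\NormalTok{, }\DataTypeTok{testFraction =} \FloatTok{0.25}\NormalTok{,}
-                       \DataTypeTok{nfold =} \DecValTok{2}\NormalTok{, }\DataTypeTok{splitSeed =} \DecValTok{1234}\NormalTok{,}
-                       \DataTypeTok{savePlpData =} \OtherTok{FALSE}\NormalTok{, }\DataTypeTok{savePlpResult =} \OtherTok{FALSE}\NormalTok{,}
-                       \DataTypeTok{savePlpPlots =} \OtherTok{FALSE}\NormalTok{, }\DataTypeTok{saveEvaluation =} \OtherTok{FALSE}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}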
- -You can save the model using: - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{savePlpModel}\NormalTok{(lrResults}\OperatorTok{$}\NormalTok{model, }\DataTypeTok{dirPath =} \KeywordTok{file.path}\NormalTok{(}\KeywordTok{getwd}\NormalTok{(), }\StringTok{"model"}\NormalTok{))} -\end{Highlighting} -\end{Shaded} - -You can load the model using: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{plpModel <-}\StringTok{ }\KeywordTok{loadPlpModel}\NormalTok{(}\KeywordTok{file.path}\NormalTok{(}\KeywordTok{getwd}\NormalTok{(), }\StringTok{"model"}\NormalTok{))} -\end{Highlighting} -\end{Shaded} - -You can also save the full results structure using: - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{savePlpResult}\NormalTok{(lrResults, }\DataTypeTok{location =} \KeywordTok{file.path}\NormalTok{(}\KeywordTok{getwd}\NormalTok{(), }\StringTok{"lr"}\NormalTok{))} -\end{Highlighting} -\end{Shaded} - -To load the full results structure use: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{lrResults <-}\StringTok{ }\KeywordTok{loadPlpResult}\NormalTok{(}\KeywordTok{file.path}\NormalTok{(}\KeywordTok{getwd}\NormalTok{(), }\StringTok{"lr"}\NormalTok{))} -\end{Highlighting} -\end{Shaded} - -\newpage - -\hypertarget{example2}{% -\section{Example 2: Angioedema in ACE inhibitor users}\label{example2}} - -\hypertarget{study-specification-2}{% -\subsection{Study Specification}\label{study-specification-2}} - -\begin{longtable}[]{@{}ll@{}} -\toprule -\begin{minipage}[b]{0.42\columnwidth}\raggedright -Definition\strut -\end{minipage} & \begin{minipage}[b]{0.52\columnwidth}\raggedright -Value\strut -\end{minipage}\tabularnewline -\midrule -\endhead -\begin{minipage}[t]{0.42\columnwidth}\raggedright -\textbf{Problem Definition}\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Target Cohort (T)\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -`Patients who are newly dispensed an ACE inhibitor' defined as the first -drug record of any ACE inhibitor\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Outcome Cohort (O)\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -`Angioedema' defined as an angioedema condition record during an -inpatient or ER visit\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Time-at-risk (TAR)\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -1 day till 365 days from cohort start\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -\textbf{Population Definition}\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Washout Period\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -365\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Enter the target cohort multiple times?\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -No\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Allow prior outcomes?\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -No\strut 
-\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Start of time-at-risk\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -1 day\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -End of time-at-risk\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -365 days\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Require a minimum amount of time-at-risk?\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -Yes (364 days)\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -\textbf{Model Development}\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Algorithm\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -Gradient Boosting Machine\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Hyper-parameters\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -ntree:5000, max depth:4 or 7 or 10 and learning rate: 0.001 or 0.01 or -0.1 or 0.9\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Covariates\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -Gender, Age, Conditions (ever before, \textless365), Drugs Groups (ever -before, \textless365), and Visit Count\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Data split\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -75\% train, 25\% test. Randomly assigned by person\strut -\end{minipage}\tabularnewline -\bottomrule -\end{longtable} - -According to the best practices we need to make a protocol that -completely specifies how we plan to execute our study. This protocol -will be assessed by the governance boards of the participating data -sources in your network study. For this a template could be used but we -prefer to automate this process as much as possible by adding -functionality to automatically generate study protocol from a study -specification. We will discuss this in more detail later. - -\hypertarget{study-implementation-1}{% -\subsection{Study implementation}\label{study-implementation-1}} - -Now we have completely design our study we have to implement the study. -We have to generate the target and outcome cohorts and we need to -develop the R code to run against our CDM that will execute the full -study. - -\hypertarget{cohort-instantiation-1}{% -\subsubsection{Cohort instantiation}\label{cohort-instantiation-1}} - -For our study we need to know when a person enters the target and -outcome cohorts. This is stored in a table on the server that contains -the cohort start date and cohort end date for all subjects for a -specific cohort definition. This cohort table has a very simple -structure as shown below: - -\begin{itemize} -\tightlist -\item - \texttt{cohort\_definition\_id}, a unique identifier for - distinguishing between different types of cohorts, e.g.~cohorts of - interest and outcome cohorts. -\item - \texttt{subject\_id}, a unique identifier corresponding to the - \texttt{person\_id} in the CDM. 
-\item
-  \texttt{cohort\_start\_date}, the date the subject enters the cohort.
-\item
-  \texttt{cohort\_end\_date}, the date the subject leaves the cohort.
-\end{itemize}
-
-How do we fill this table according to our cohort definitions? There are
-two options for this:
-
-\begin{enumerate}
-\def\labelenumi{\arabic{enumi})}
-\item
-  use the interactive cohort builder tool in
-  \href{www.github.com/OHDSI/ATLAS}{ATLAS} which can be used to create
-  cohorts based on inclusion criteria and will automatically populate
-  this cohort table.
-\item
-  write your own custom SQL statements to fill the cohort table.
-\end{enumerate}
-
-Both methods are described below for our example prediction problem.
-
-\hypertarget{atlas-cohort-builder-1}{%
-\subsubsection{ATLAS cohort builder}\label{atlas-cohort-builder-1}}
-
-\begin{figure}
-\centering
-\includegraphics{example2/aceinhibitors.png}
-\caption{Target Cohort ACE inhibitors}
-\end{figure}
-
-ATLAS allows you to define cohorts interactively by specifying cohort
-entry and cohort exit criteria. Cohort entry criteria involve selecting
-one or more initial events, which determine the start date for cohort
-entry, and optionally specifying additional inclusion criteria which
-filter to the qualifying events. Cohort exit criteria are applied to
-each cohort entry record to determine the end date when the person's
-episode no longer qualifies for the cohort. For the outcome cohort the
-end date is less relevant. As an example, Figure 6 shows how we created
-the ACE inhibitors cohort and Figure 7 shows how we created the
-angioedema cohort in ATLAS.
-
-\begin{figure}
-\centering
-\includegraphics{example2/angioedema.png}
-\caption{Outcome Cohort Angioedema}
-\end{figure}
-
-The T and O cohorts can be found here:
-
-\begin{itemize}
-\tightlist
-\item
-  ACE inhibitors (T):
-  \url{http://www.ohdsi.org/web/atlas/\#/cohortdefinition/1770617}
-\item
-  Angioedema (O):
-  \url{http://www.ohdsi.org/web/atlas/\#/cohortdefinition/1770616}
-\end{itemize}
-
-An in-depth explanation of cohort creation in ATLAS is out of scope of
-this vignette but can be found on the OHDSI wiki pages
-\href{http://www.ohdsi.org/web/wiki/doku.php?id=documentation:software:atlas}{(link)}.
-
-Note that when a cohort is created in ATLAS the cohortid is needed to
-extract the data in R. The cohortid can be found at the top of the ATLAS
-screen, e.g.~1770617 in Figure 6.
-
-\hypertarget{custom-cohorts-1}{%
-\subsubsection{Custom cohorts}\label{custom-cohorts-1}}
-
-It is also possible to create cohorts without the use of ATLAS. Using
-custom cohort code (SQL) you can make more advanced cohorts if needed.
-
-For our example study, we need to create a table to hold the cohort
-data and we need to create SQL code to instantiate this table for both
-the ACE inhibitor and angioedema cohorts.
Therefore, we create a file called -\emph{AceAngioCohorts.sql} with the following contents: - -\begin{Shaded} -\begin{Highlighting}[] - \CommentTok{/***********************************} -\CommentTok{ File AceAngioCohorts.sql } -\CommentTok{ ***********************************/} - \CommentTok{/*} -\CommentTok{ Create a table to store the persons in the T and C cohort} -\CommentTok{ */} - - \ControlFlowTok{IF}\NormalTok{ OBJECT_ID(}\StringTok{'@resultsDatabaseSchema.PLPAceAngioCohort'}\NormalTok{, }\StringTok{'U'}\NormalTok{) }\KeywordTok{IS} \KeywordTok{NOT} \KeywordTok{NULL} - \KeywordTok{DROP} \KeywordTok{TABLE}\NormalTok{ @resultsDatabaseSchema.PLPAceAngioCohort;} - - \KeywordTok{CREATE} \KeywordTok{TABLE}\NormalTok{ @resultsDatabaseSchema.PLPAceAngioCohort } -\NormalTok{ ( } -\NormalTok{ cohort_definition_id }\DataTypeTok{INT}\NormalTok{, } -\NormalTok{ subject_id BIGINT,} -\NormalTok{ cohort_start_date }\DataTypeTok{DATE}\NormalTok{, } -\NormalTok{ cohort_end_date }\DataTypeTok{DATE} -\NormalTok{ );} - - - \CommentTok{/*} -\CommentTok{ T cohort: [PatientLevelPrediction vignette]: T : patients who are newly } -\CommentTok{ dispensed an ACE inhibitor} -\CommentTok{ - persons with a drug exposure record of any 'ACE inhibitor' or } -\CommentTok{ any descendants, indexed at the first diagnosis} -\CommentTok{ - who have >364 days of prior observation before their first dispensing} -\CommentTok{ */} - \KeywordTok{INSERT} \KeywordTok{INTO}\NormalTok{ @resultsDatabaseSchema.AceAngioCohort (cohort_definition_id, } -\NormalTok{ subject_id, } -\NormalTok{ cohort_start_date, } -\NormalTok{ cohort_end_date)} - \KeywordTok{SELECT} \DecValTok{1} \KeywordTok{AS}\NormalTok{ cohort_definition_id,} -\NormalTok{ Ace.person_id }\KeywordTok{AS}\NormalTok{ subject_id,} -\NormalTok{ Ace.drug_start_date }\KeywordTok{AS}\NormalTok{ cohort_start_date,} -\NormalTok{ observation_period.observation_period_end_date }\KeywordTok{AS}\NormalTok{ cohort_end_date} - \KeywordTok{FROM} -\NormalTok{ (} - \KeywordTok{SELECT}\NormalTok{ person_id, }\FunctionTok{min}\NormalTok{(drug_exposure_date) }\KeywordTok{as}\NormalTok{ drug_start_date} - \KeywordTok{FROM}\NormalTok{ @cdmDatabaseSchema.drug_exposure} - \KeywordTok{WHERE}\NormalTok{ drug_concept_id }\KeywordTok{IN}\NormalTok{ (}\KeywordTok{SELECT}\NormalTok{ descendant_concept_id }\KeywordTok{FROM} -\NormalTok{ @cdmDatabaseSchema.concept_ancestor }\KeywordTok{WHERE}\NormalTok{ ancestor_concept_id }\KeywordTok{IN} -\NormalTok{ (}\DecValTok{1342439}\NormalTok{,}\DecValTok{1334456}\NormalTok{, }\DecValTok{1331235}\NormalTok{, }\DecValTok{1373225}\NormalTok{, }\DecValTok{1310756}\NormalTok{, }\DecValTok{1308216}\NormalTok{, }\DecValTok{1363749}\NormalTok{, }\DecValTok{1341927}\NormalTok{, }\DecValTok{1340128}\NormalTok{, }\DecValTok{1335471} \CommentTok{/*ace inhibitors*/}\NormalTok{))} - \KeywordTok{GROUP} \KeywordTok{BY}\NormalTok{ person_id} -\NormalTok{ ) Ace} - \KeywordTok{INNER} \KeywordTok{JOIN}\NormalTok{ @cdmDatabaseSchema.observation_period} - \KeywordTok{ON}\NormalTok{ Ace.person_id }\OperatorTok{=}\NormalTok{ observation_period.person_id} - \KeywordTok{AND}\NormalTok{ Ace.drug_start_date }\OperatorTok{>=}\NormalTok{ dateadd(dd,}\DecValTok{364}\NormalTok{, } -\NormalTok{ observation_period.observation_period_start_date)} - \KeywordTok{AND}\NormalTok{ Ace.drug_start_date }\OperatorTok{<=}\NormalTok{ observation_period.observation_period_end_date} -\NormalTok{ ;} - - \CommentTok{/*} -\CommentTok{ C cohort: [PatientLevelPrediction vignette]: O: Angioedema} -\CommentTok{ 
*/} - \KeywordTok{INSERT} \KeywordTok{INTO}\NormalTok{ @resultsDatabaseSchema.AceAngioCohort (cohort_definition_id, } -\NormalTok{ subject_id, } -\NormalTok{ cohort_start_date, } -\NormalTok{ cohort_end_date)} - \KeywordTok{SELECT} \DecValTok{2} \KeywordTok{AS}\NormalTok{ cohort_definition_id,} -\NormalTok{ angioedema.person_id }\KeywordTok{AS}\NormalTok{ subject_id,} -\NormalTok{ angioedema.condition_start_date }\KeywordTok{AS}\NormalTok{ cohort_start_date,} -\NormalTok{ angioedema.condition_start_date }\KeywordTok{AS}\NormalTok{ cohort_end_date} - \KeywordTok{FROM} -\NormalTok{ (} - \KeywordTok{SELECT}\NormalTok{ person_id, condition_start_date} - \KeywordTok{FROM}\NormalTok{ @cdmDatabaseSchema.condition_occurrence} - \KeywordTok{WHERE}\NormalTok{ condition_concept_id }\KeywordTok{IN}\NormalTok{ (}\KeywordTok{SELECT} \KeywordTok{DISTINCT}\NormalTok{ descendant_concept_id }\KeywordTok{FROM} -\NormalTok{ @cdmDatabaseSchema.concept_ancestor }\KeywordTok{WHERE}\NormalTok{ ancestor_concept_id }\KeywordTok{IN} -\NormalTok{ (}\DecValTok{432791} \CommentTok{/*angioedema*/}\NormalTok{) }\KeywordTok{OR}\NormalTok{ descendant_concept_id }\KeywordTok{IN} -\NormalTok{ (}\DecValTok{432791} \CommentTok{/*angioedema*/}\NormalTok{)} -\NormalTok{ ) angioedema} - -\NormalTok{ ;} - -\end{Highlighting} -\end{Shaded} - -This is parameterized SQL which can be used by the -\href{http://github.com/OHDSI/SqlRender}{\texttt{SqlRender}} package. We -use parameterized SQL so we do not have to pre-specify the names of the -CDM and result schemas. That way, if we want to run the SQL on a -different schema, we only need to change the parameter values; we do not -have to change the SQL code. By also making use of translation -functionality in \texttt{SqlRender}, we can make sure the SQL code can -be run in many different environments. - -To execute this sql against our CDM we first need to tell R how to -connect to the server. \texttt{PatientLevelPrediction} uses the -\href{http://github.com/ohdsi/DatabaseConnector}{\texttt{DatabaseConnector}} -package, which provides a function called -\texttt{createConnectionDetails}. Type \texttt{?createConnectionDetails} -for the specific settings required for the various database management -systems (DBMS). For example, one might connect to a PostgreSQL database -using this code: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{ connectionDetails <-}\StringTok{ }\KeywordTok{createConnectionDetails}\NormalTok{(}\DataTypeTok{dbms =} \StringTok{"postgresql"}\NormalTok{, } - \DataTypeTok{server =} \StringTok{"localhost/ohdsi"}\NormalTok{, } - \DataTypeTok{user =} \StringTok{"joe"}\NormalTok{, } - \DataTypeTok{password =} \StringTok{"supersecret"}\NormalTok{)} - -\NormalTok{ cdmDatabaseSchema <-}\StringTok{ "my_cdm_data"} -\NormalTok{ cohortsDatabaseSchema <-}\StringTok{ "my_results"} -\NormalTok{ cdmVersion <-}\StringTok{ "5"} -\end{Highlighting} -\end{Shaded} - -The last three lines define the \texttt{cdmDatabaseSchema} and -\texttt{cohortsDatabaseSchema} variables, as well as the CDM version. We -will use these later to tell R where the data in CDM format live, where -we want to create the cohorts of interest, and what version CDM is used. -Note that for Microsoft SQL Server, databaseschemas need to specify both -the database and the schema, so for example -\texttt{cdmDatabaseSchema\ \textless{}-\ "my\_cdm\_data.dbo"}. 
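-
-As a side note, hard-coding credentials as in the example above is convenient
-for illustration, but in practice you may prefer to keep them out of your
-scripts. A minimal sketch using environment variables (the variable names
-\texttt{OHDSI\_USER} and \texttt{OHDSI\_PASSWORD} are arbitrary examples, not a
-package convention) could look like this:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-# read the credentials from environment variables instead of the script
-connectionDetails <- createConnectionDetails(dbms = "postgresql",
-                                             server = "localhost/ohdsi",
-                                             user = Sys.getenv("OHDSI_USER"),
-                                             password = Sys.getenv("OHDSI_PASSWORD"))
-\end{Highlighting}
-\end{Shaded}
-
-The rest of this vignette is unaffected by how the credentials are supplied.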
-
-The following code reads the SQL, fills in the parameters, translates it to the
-correct dialect, and executes it against the database:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-  \KeywordTok{library}\NormalTok{(SqlRender)}
-\NormalTok{  sql <-}\StringTok{ }\KeywordTok{readSql}\NormalTok{(}\StringTok{"AceAngioCohorts.sql"}\NormalTok{)}
-\NormalTok{  sql <-}\StringTok{ }\KeywordTok{render}\NormalTok{(sql,}
-                \DataTypeTok{cdmDatabaseSchema =}\NormalTok{ cdmDatabaseSchema,}
-                \DataTypeTok{resultsDatabaseSchema =}\NormalTok{ cohortsDatabaseSchema)}
-\NormalTok{  sql <-}\StringTok{ }\KeywordTok{translate}\NormalTok{(sql, }\DataTypeTok{targetDialect =}\NormalTok{ connectionDetails}\OperatorTok{$}\NormalTok{dbms)}
-
-\NormalTok{  connection <-}\StringTok{ }\KeywordTok{connect}\NormalTok{(connectionDetails)}
-  \KeywordTok{executeSql}\NormalTok{(connection, sql)}
-\end{Highlighting}
-\end{Shaded}
-
-In this code, we first read the SQL from the file into memory. In the
-next line, we replace the two schema parameter names with the actual values. We
-then translate the SQL into the dialect appropriate for the DBMS we
-already specified in the \texttt{connectionDetails}. Next, we connect to
-the server and submit the rendered and translated SQL.
-
-If all went well, we now have a table with the events of interest. We
-can see how many events there are per cohort:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{  sql <-}\StringTok{ }\KeywordTok{paste}\NormalTok{(}\StringTok{"SELECT cohort_definition_id, COUNT(*) AS count"}\NormalTok{,}
-              \StringTok{"FROM @cohortsDatabaseSchema.AceAngioCohort"}\NormalTok{,}
-              \StringTok{"GROUP BY cohort_definition_id"}\NormalTok{)}
-\NormalTok{  sql <-}\StringTok{ }\KeywordTok{render}\NormalTok{(sql, }\DataTypeTok{cohortsDatabaseSchema =}\NormalTok{ cohortsDatabaseSchema)}
-\NormalTok{  sql <-}\StringTok{ }\KeywordTok{translate}\NormalTok{(sql, }\DataTypeTok{targetDialect =}\NormalTok{ connectionDetails}\OperatorTok{$}\NormalTok{dbms)}
-
-  \KeywordTok{querySql}\NormalTok{(connection, sql)}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-##   cohort_definition_id count
-## 1                    1     0
-## 2                    2     0
-\end{verbatim}
-
-\hypertarget{study-script-creation-1}{%
-\subsubsection{Study script creation}\label{study-script-creation-1}}
-
-In this section we assume that our cohorts have been created, either by
-using ATLAS or with a custom SQL script. We will first explain how to create
-an R script yourself that will execute the study as we defined it
-earlier.
-
-\hypertarget{data-extraction-1}{%
-\subsubsection{Data extraction}\label{data-extraction-1}}
-
-Now we can tell \texttt{PatientLevelPrediction} to extract all necessary
-data for our analysis. This is done using the
-\href{https://github.com/OHDSI/FeatureExtraction}{\texttt{FeatureExtraction}}
-package. In short, the FeatureExtraction package allows you to specify which
-features (covariates) need to be extracted, e.g.~all conditions and drug
-exposures. It also supports the creation of custom covariates. For more
-detailed information on the FeatureExtraction package see its
-\href{https://github.com/OHDSI/FeatureExtraction}{vignettes}.
For our -example study we decided to use these settings: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{ covariateSettings <-}\StringTok{ }\KeywordTok{createCovariateSettings}\NormalTok{(}\DataTypeTok{useDemographicsGender =} \OtherTok{TRUE}\NormalTok{,} - \DataTypeTok{useDemographicsAge =} \OtherTok{TRUE}\NormalTok{,} - \DataTypeTok{useConditionGroupEraLongTerm =} \OtherTok{TRUE}\NormalTok{,} - \DataTypeTok{useConditionGroupEraAnyTimePrior =} \OtherTok{TRUE}\NormalTok{,} - \DataTypeTok{useDrugGroupEraLongTerm =} \OtherTok{TRUE}\NormalTok{,} - \DataTypeTok{useDrugGroupEraAnyTimePrior =} \OtherTok{TRUE}\NormalTok{,} - \DataTypeTok{useVisitConceptCountLongTerm =} \OtherTok{TRUE}\NormalTok{,} - \DataTypeTok{longTermStartDays =} \DecValTok{-365}\NormalTok{,} - \DataTypeTok{endDays =} \DecValTok{-1}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -The final step for extracting the data is to run the \texttt{getPlpData} -function and input the connection details, the database schema where the -cohorts are stored, the cohort definition ids for the cohort and -outcome, and the washoutPeriod which is the minimum number of days prior -to cohort index date that the person must have been observed to be -included into the data, and finally input the previously constructed -covariate settings. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{ plpData <-}\StringTok{ }\KeywordTok{getPlpData}\NormalTok{(}\DataTypeTok{connectionDetails =}\NormalTok{ connectionDetails,} - \DataTypeTok{cdmDatabaseSchema =}\NormalTok{ cdmDatabaseSchema,} - \DataTypeTok{cohortDatabaseSchema =}\NormalTok{ resultsDatabaseSchema,} - \DataTypeTok{cohortTable =} \StringTok{'AceAngioCohort'}\NormalTok{,} - \DataTypeTok{cohortId =} \DecValTok{1}\NormalTok{,} - \DataTypeTok{covariateSettings =}\NormalTok{ covariateSettings,} - \DataTypeTok{outcomeDatabaseSchema =}\NormalTok{ resultsDatabaseSchema,} - \DataTypeTok{outcomeTable =} \StringTok{'AceAngioCohort'}\NormalTok{,} - \DataTypeTok{outcomeIds =} \DecValTok{2}\NormalTok{,} - \DataTypeTok{sampleSize =} \DecValTok{10000} -\NormalTok{ )} -\end{Highlighting} -\end{Shaded} - -Note that if the cohorts are created in ATLAS its corresponding cohort -database schema needs to be selected. There are many additional -parameters for the \texttt{getPlpData} function which are all documented -in the \texttt{PatientLevelPrediction} manual. The resulting -\texttt{plpData} object uses the package \texttt{ff} to store -information in a way that ensures R does not run out of memory, even -when the data are large. - -Creating the \texttt{plpData} object can take considerable computing -time, and it is probably a good idea to save it for future sessions. -Because \texttt{plpData} uses \texttt{ff}, we cannot use R's regular -save function. Instead, we'll have to use the \texttt{savePlpData()} -function: - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{savePlpData}\NormalTok{(plpData, }\StringTok{"angio_in_ace_data"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -We can use the \texttt{loadPlpData()} function to load the data in a -future session. 
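-
-For example, a later session could pick up where we left off by loading the
-saved object (assuming the \texttt{angio\_in\_ace\_data} folder created above is
-in the working directory):
-
-\begin{Shaded}
-\begin{Highlighting}[]
-# load the plpData object that was saved with savePlpData()
-plpData <- loadPlpData("angio_in_ace_data")
-\end{Highlighting}
-\end{Shaded}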
-
-\hypertarget{additional-inclusion-criteria-1}{%
-\subsubsection{Additional inclusion
-criteria}\label{additional-inclusion-criteria-1}}
-
-To completely define the prediction problem, the final study population
-is obtained by applying additional constraints to the two cohorts defined
-earlier. For example, a minimum time at risk can be enforced
-(\texttt{requireTimeAtRisk,\ minTimeAtRisk}) and we can specify whether this
-also applies to patients with the outcome (\texttt{includeAllOutcomes}).
-Here we also specify the start and end of the risk window relative to
-the target cohort start. For example, if we want the risk window to start 30
-days after the at-risk cohort start and end a year later, we can set
-\texttt{riskWindowStart\ =\ 30} and \texttt{riskWindowEnd\ =\ 365}. In
-some cases the risk window needs to start at the cohort end date. This
-can be achieved by setting \texttt{addExposureDaysToStart\ =\ TRUE}, which
-adds the cohort (exposure) time to the start date.
-
-In Appendix 1, we demonstrate the effect of these settings on the subset
-of the persons in the target cohort that end up in the final study
-population.
-
-In the example below all the settings we defined for our study are
-imposed:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{  population <-}\StringTok{ }\KeywordTok{createStudyPopulation}\NormalTok{(}\DataTypeTok{plpData =}\NormalTok{ plpData,}
-                                    \DataTypeTok{outcomeId =} \DecValTok{2}\NormalTok{,}
-                                    \DataTypeTok{washoutPeriod =} \DecValTok{364}\NormalTok{,}
-                                    \DataTypeTok{firstExposureOnly =} \OtherTok{FALSE}\NormalTok{,}
-                                    \DataTypeTok{removeSubjectsWithPriorOutcome =} \OtherTok{TRUE}\NormalTok{,}
-                                    \DataTypeTok{priorOutcomeLookback =} \DecValTok{9999}\NormalTok{,}
-                                    \DataTypeTok{riskWindowStart =} \DecValTok{1}\NormalTok{,}
-                                    \DataTypeTok{riskWindowEnd =} \DecValTok{365}\NormalTok{,}
-                                    \DataTypeTok{addExposureDaysToStart =} \OtherTok{FALSE}\NormalTok{,}
-                                    \DataTypeTok{addExposureDaysToEnd =} \OtherTok{FALSE}\NormalTok{,}
-                                    \DataTypeTok{minTimeAtRisk =} \DecValTok{364}\NormalTok{,}
-                                    \DataTypeTok{requireTimeAtRisk =} \OtherTok{TRUE}\NormalTok{,}
-                                    \DataTypeTok{includeAllOutcomes =} \OtherTok{TRUE}\NormalTok{,}
-                                    \DataTypeTok{verbosity =} \StringTok{"DEBUG"}
-\NormalTok{  )}
-\end{Highlighting}
-\end{Shaded}
-
-\hypertarget{model-development-1}{%
-\subsubsection{Model Development}\label{model-development-1}}
-
-In the set function of an algorithm the user can specify a list of
-eligible values for each hyper-parameter. All possible combinations of
-the hyper-parameters are included in a so-called grid search using
-cross-validation on the training set. If a user does not specify any
-value, the default value is used instead.
-
-For example, if we use the following settings for the
-gradient boosting machine, ntrees = c(100, 200) and maxDepth = 4, the grid search
-will apply the gradient boosting machine algorithm with ntrees = 100 and
-maxDepth = 4, and with ntrees = 200 and maxDepth = 4, in both cases combined with
-the default settings for the other hyper-parameters. The hyper-parameters that
-lead to the best cross-validation performance will then be chosen for the final
-model.
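-
-As an illustration only, the two-point grid from the example above could be
-specified as follows (this is a sketch of the text's example, not the settings
-used for our study, which follow below):
-
-\begin{Shaded}
-\begin{Highlighting}[]
-# grid of ntrees = 100 or 200, with maxDepth fixed at 4;
-# all other hyper-parameters keep their default values
-exampleModelSettings <- setGradientBoostingMachine(ntrees = c(100, 200), maxDepth = 4)
-\end{Highlighting}
-\end{Shaded}
-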
For our problem we choose to build a gradient boosting machine model and
-search over several hyper-parameter values:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{gbmModel <-}\StringTok{ }\KeywordTok{setGradientBoostingMachine}\NormalTok{(}\DataTypeTok{ntrees =} \DecValTok{5000}\NormalTok{, }\DataTypeTok{maxDepth =} \KeywordTok{c}\NormalTok{(}\DecValTok{4}\NormalTok{, }\DecValTok{7}\NormalTok{, }\DecValTok{10}\NormalTok{), }\DataTypeTok{learnRate =} \KeywordTok{c}\NormalTok{(}\FloatTok{0.001}\NormalTok{, }
-    \FloatTok{0.01}\NormalTok{, }\FloatTok{0.1}\NormalTok{, }\FloatTok{0.9}\NormalTok{))}
-\end{Highlighting}
-\end{Shaded}
-
-The \texttt{runPlp} function uses the population, \texttt{plpData}, and
-model settings to train and evaluate the model. We can use the \texttt{testSplit}
-and \texttt{testFraction} parameters to split the data into a
-75\%-25\% train-test split and run the patient-level prediction pipeline:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{  gbmResults <-}\StringTok{ }\KeywordTok{runPlp}\NormalTok{(population, plpData, }\DataTypeTok{modelSettings =}\NormalTok{ gbmModel, }\DataTypeTok{testSplit=}\StringTok{'stratified'}\NormalTok{, }
-                       \DataTypeTok{testFraction=}\FloatTok{0.25}\NormalTok{, }\DataTypeTok{nfold=}\DecValTok{2}\NormalTok{, }\DataTypeTok{splitSeed =} \DecValTok{1234}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-Under the hood the package will now use the R xgboost package to fit a
-gradient boosting machine model using 75\% of the data and will evaluate
-the model on the remaining 25\%. A results data structure is returned
-containing information about the model, its performance, etc.
-
-In the runPlp function there are several parameters to save the plpData,
-plpResults, plpPlots, evaluation, etc., which are all set to TRUE by
-default. However, there is also functionality to do this manually.
-
-You can save the model using:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\KeywordTok{savePlpModel}\NormalTok{(gbmResults}\OperatorTok{$}\NormalTok{model, }\DataTypeTok{dirPath =} \KeywordTok{file.path}\NormalTok{(}\KeywordTok{getwd}\NormalTok{(), }\StringTok{"model"}\NormalTok{))}
-\end{Highlighting}
-\end{Shaded}
-
-You can load the model using:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{plpModel <-}\StringTok{ }\KeywordTok{loadPlpModel}\NormalTok{(}\KeywordTok{file.path}\NormalTok{(}\KeywordTok{getwd}\NormalTok{(), }\StringTok{"model"}\NormalTok{))}
-\end{Highlighting}
-\end{Shaded}
-
-You can also save the full results structure using:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\KeywordTok{savePlpResult}\NormalTok{(gbmResults, }\DataTypeTok{location =} \KeywordTok{file.path}\NormalTok{(}\KeywordTok{getwd}\NormalTok{(), }\StringTok{"gbm"}\NormalTok{))}
-\end{Highlighting}
-\end{Shaded}
-
-To load the full results structure use:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{gbmResults <-}\StringTok{ }\KeywordTok{loadPlpResult}\NormalTok{(}\KeywordTok{file.path}\NormalTok{(}\KeywordTok{getwd}\NormalTok{(), }\StringTok{"gbm"}\NormalTok{))}
-\end{Highlighting}
-\end{Shaded}
-
-\newpage
-
-\hypertarget{study-package-creation}{%
-\section{Study package creation}\label{study-package-creation}}
-
-The script we created manually above can also be automatically created
-using a powerful feature in ATLAS. By creating a new prediction study
-(left menu) you can select the Target and Outcome as created in ATLAS,
-set all the study parameters, and then download an R package that
-you can use to execute your study.
What is really powerful is that you
-can add multiple Ts, Os, covariate settings, etc. The package will then
-run all the combinations automatically as separate analyses. The
-screenshots below explain this process.
-
-\begin{enumerate}
-\def\labelenumi{\arabic{enumi})}
-\item
-  Create a new prediction study and select your target and outcome
-  cohorts.
-
-  \includegraphics{atlasplp1.png}
-\item
-  Specify one or more analysis settings.
-
-  \includegraphics{atlasplp2.png}
-
-  \newpage
-\item
-  Specify the training settings.
-
-  \includegraphics{atlasplp3.png}
-\item
-  Specify the execution settings.
-
-  \includegraphics{atlasplp4.png}
-\end{enumerate}
-
-\newpage
-
-ATLAS can build an R package for you that will execute the full study
-against your CDM. The steps below explain how to do this in ATLAS.
-
-\begin{enumerate}
-\def\labelenumi{\arabic{enumi})}
-\item
-  Under Utilities you can find the download section. Click on the button to review
-  the full study specification.
-
-  \begin{figure}
-  \centering
-  \includegraphics{atlasdownload1.png}
-  \caption{R package download functionality in ATLAS}
-  \end{figure}
-\item
-  You now have to confirm that you indeed want to run all these analyses
-  (the Cartesian product of all the settings for each T and O combination).
-
-  \begin{figure}
-  \centering
-  \includegraphics{atlasdownload2.png}
-  \caption{R package download functionality in ATLAS}
-  \end{figure}
-\item
-  If you agree, you give the package a name, and download the package as
-  a zip file.
-\item
-  By opening the R package in RStudio and building the package, you can
-  run the study using the \texttt{execute} function. There is also an
-  example CodeToRun.R script available in the extras folder of the
-  package with extra instructions.
-\end{enumerate}
-
-\hypertarget{internal-validation}{%
-\section{Internal validation}\label{internal-validation}}
-
-Once we execute the study, the \texttt{runPlp()} function returns the trained
-model and the evaluation of the model on the train/test sets.
-
-You can interactively view the results by running:
-\texttt{viewPlp(runPlp=gbmResults)}. This will generate a Shiny App in
-your browser in which you can view all performance measures created by
-the framework, as shown in the figure below.
-
-\includegraphics{shinysummary.png}
-
-Furthermore, many interactive plots are available in the Shiny App, for
-example the ROC curve, in which you can hover over the plot to see the
-threshold and the corresponding sensitivity and specificity values.
-
-\includegraphics{shinyroc.png}
-
-To generate and save all the evaluation plots to a folder, run the
-following code:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\KeywordTok{plotPlp}\NormalTok{(gbmResults, }\DataTypeTok{dirPath =} \KeywordTok{getwd}\NormalTok{())}
-\end{Highlighting}
-\end{Shaded}
-
-The plots are described in more detail in the next sections.
-
-\newpage
-
-\hypertarget{discrimination}{%
-\subsection{Discrimination}\label{discrimination}}
-
-The Receiver Operating Characteristics (ROC) plot shows the sensitivity
-against 1-specificity on the test set. The plot illustrates how well the
-model is able to discriminate between the people with the outcome and
-those without. The dashed diagonal line is the performance of a model
-that randomly assigns predictions. The higher the area under the ROC
-curve, the better the discrimination of the model. The plot is created by
-varying the probability threshold used to assign the positive class.
-
-\includegraphics{sparseRoc.png}
-
-\newpage
-
-\hypertarget{calibration}{%
-\subsection{Calibration}\label{calibration}}
-
-The calibration plot shows how close the predicted risk is to the
-observed risk. The diagonal dashed line thus indicates a perfectly
-calibrated model. The ten (or fewer) dots represent the mean predicted
-values for each quantile plotted against the observed fraction of people
-in that quantile who had the outcome (observed fraction). The straight
-black line is the linear regression using these 10 plotted quantile mean
-predicted vs observed fraction points. The straight vertical lines
-represent the 95\% lower and upper confidence intervals of the slope
-of the fitted line.
-
-\includegraphics{sparseCalibration.png}
-
-\newpage
-
-\hypertarget{smooth-calibration}{%
-\subsection{Smooth Calibration}\label{smooth-calibration}}
-
-Similar to the traditional calibration plot shown above, the smooth
-calibration plot shows the relationship between predicted and observed
-risk. The major difference is that the smooth fit allows for a more
-fine-grained examination of this relationship. Whereas the traditional plot will
-be heavily influenced by the areas with the highest density of data, the
-smooth plot will provide the same information for this region as well as
-a more accurate interpretation of areas with lower density. The plot
-also contains information on the distribution of the outcomes relative
-to predicted risk.
-
-However, the increased information gain comes at a computational cost.
-It is recommended to use the traditional plot for examination and then
-to produce the smooth plot for final versions. To create the smooth
-calibration plot you have to run the following command:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\KeywordTok{plotSmoothCalibration}\NormalTok{(gbmResults)}
-\end{Highlighting}
-\end{Shaded}
-
-See the help page of the function for more information, e.g.~on how to set the
-smoothing method.
-
-The example below is from another study that better demonstrates the
-impact of using a smooth calibration plot. The default line fit would
-not highlight the miscalibration at the lower predicted probability
-levels that well.
-
-\includegraphics{smoothCalibration.jpeg}
-
-\newpage
-
-\hypertarget{preference-distribution}{%
-\subsection{Preference distribution}\label{preference-distribution}}
-
-The preference distribution plots show the preference score distributions
-corresponding to i) people in the test set with the outcome (red) and
-ii) people in the test set without the outcome (blue).
-
-\includegraphics{preferencePDF.png}
-
-\newpage
-
-\hypertarget{predicted-probability-distribution}{%
-\subsection{Predicted probability distribution}\label{predicted-probability-distribution}}
-
-The prediction distribution box plots are for the predicted risks of the
-people in the test set with the outcome (class 1: blue) and without the
-outcome (class 0: red).
-
-The box plots in the figure show that the predicted probability of the
-outcome is indeed higher for those with the outcome, but there is also
-overlap between the two distributions, which leads to imperfect
-discrimination.
-
-\includegraphics{predictionDistribution.png}
-
-\newpage
-
-\hypertarget{test-train-similarity}{%
-\subsection{Test-Train similarity}\label{test-train-similarity}}
-
-The test-train similarity is assessed by plotting the mean covariate
-values in the train set against those in the test set for people with
-and without the outcome.
-
-The results for our example look very promising since the mean values
-of the covariates are on the diagonal.
-
-\includegraphics{generalizability.png}
-
-\newpage
-
-\hypertarget{variable-scatter-plot}{%
-\subsection{Variable scatter plot}\label{variable-scatter-plot}}
-
-The variable scatter plot shows the mean covariate value for the people
-with the outcome against the mean covariate value for the people without
-the outcome.
The color of the dots corresponds to inclusion (green)
-or exclusion (blue) of the covariate in the model. It is highly recommended
-to use the Shiny App, since this allows you to hover over a covariate to
-show more details (name, value, etc.).
-
-The plot shows that the mean values of most of the covariates are higher for
-subjects with the outcome compared to those without.
-
-\includegraphics{variableScatterplot.png}
-
-\newpage
-
-\hypertarget{precision-recall}{%
-\subsection{Precision recall}\label{precision-recall}}
-
-Precision (P) is defined as the number of true positives (Tp) over the
-number of true positives plus the number of false positives (Fp).
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{P <-}\StringTok{ }\NormalTok{Tp}\OperatorTok{/}\NormalTok{(Tp }\OperatorTok{+}\StringTok{ }\NormalTok{Fp)}
-\end{Highlighting}
-\end{Shaded}
-
-Recall (R) is defined as the number of true positives (Tp) over the
-number of true positives plus the number of false negatives (Fn).
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{R <-}\StringTok{ }\NormalTok{Tp}\OperatorTok{/}\NormalTok{(Tp }\OperatorTok{+}\StringTok{ }\NormalTok{Fn)}
-\end{Highlighting}
-\end{Shaded}
-
-These quantities are also related to the F1 score, which is defined as
-the harmonic mean of precision and recall.
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{F1 <-}\StringTok{ }\DecValTok{2} \OperatorTok{*}\StringTok{ }\NormalTok{P }\OperatorTok{*}\StringTok{ }\NormalTok{R}\OperatorTok{/}\NormalTok{(P }\OperatorTok{+}\StringTok{ }\NormalTok{R)}
-\end{Highlighting}
-\end{Shaded}
-
-Note that the precision can either decrease or increase if the threshold
-is lowered. Lowering the threshold of a classifier may increase the
-denominator, by increasing the number of results returned. If the
-threshold was previously set too high, the new results may all be true
-positives, which will increase precision. If the previous threshold was
-about right or too low, further lowering the threshold will introduce
-false positives, decreasing precision.
-
-For recall the denominator does not depend on the classifier threshold
-(Tp + Fn is a constant). This means that lowering the classifier threshold
-may increase recall, by increasing the number of true positive results.
-It is also possible that lowering the threshold may leave recall
-unchanged, while the precision fluctuates.
-
-\includegraphics{precisionRecall.png}
-
-\newpage
-
-\hypertarget{demographic-summary}{%
-\subsection{Demographic summary}\label{demographic-summary}}
-
-This plot shows, for females and males, the expected and observed risk in
-different age groups together with a confidence area.
-
-The results show that our model is well calibrated across gender and age
-groups.
-
-\includegraphics{demographicSummary.png}
-
-\newpage
-
-\hypertarget{external-validation}{%
-\section{External validation}\label{external-validation}}
-
-We recommend always performing external validation, i.e.~applying the final
-model to as many new datasets as feasible and evaluating its performance.
- -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{# load the trained model} -\NormalTok{plpModel <-}\StringTok{ }\KeywordTok{loadPlpModel}\NormalTok{(}\KeywordTok{getwd}\NormalTok{(),}\StringTok{'model'}\NormalTok{)} - -\CommentTok{#load the new plpData and create the population} -\NormalTok{plpData <-}\StringTok{ }\KeywordTok{loadPlpData}\NormalTok{(}\KeywordTok{getwd}\NormalTok{(),}\StringTok{'data'}\NormalTok{)} -\NormalTok{population <-}\StringTok{ }\KeywordTok{createStudyPopulation}\NormalTok{(plpData, } - \DataTypeTok{outcomeId =} \DecValTok{2}\NormalTok{, } - \DataTypeTok{includeAllOutcomes =} \OtherTok{TRUE}\NormalTok{, } - \DataTypeTok{firstExposureOnly =} \OtherTok{TRUE}\NormalTok{, } - \DataTypeTok{washoutPeriod =} \DecValTok{365}\NormalTok{, } - \DataTypeTok{removeSubjectsWithPriorOutcome =} \OtherTok{TRUE}\NormalTok{, } - \DataTypeTok{priorOutcomeLookback =} \DecValTok{365}\NormalTok{,} - \DataTypeTok{riskWindowStart =} \DecValTok{1}\NormalTok{,} - \DataTypeTok{requireTimeAtRisk =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{riskWindowEnd =} \DecValTok{365} -\NormalTok{)} - -\CommentTok{# apply the trained model on the new data} -\NormalTok{validationResults <-}\StringTok{ }\KeywordTok{applyModel}\NormalTok{(population,plpData,plpModel)} -\end{Highlighting} -\end{Shaded} - -To make things easier we also provide a function for performing external -validation of a model across one or multiple datasets: - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{# load the trained model} -\NormalTok{plpResult <-}\StringTok{ }\KeywordTok{loadPlpResult}\NormalTok{(}\KeywordTok{getwd}\NormalTok{(),}\StringTok{'plpResult'}\NormalTok{)} - -\NormalTok{connectionDetails <-}\StringTok{ }\KeywordTok{createConnectionDetails}\NormalTok{(}\DataTypeTok{dbms =} \StringTok{"postgresql"}\NormalTok{, } - \DataTypeTok{server =} \StringTok{"localhost/ohdsi"}\NormalTok{, } - \DataTypeTok{user =} \StringTok{"joe"}\NormalTok{, } - \DataTypeTok{password =} \StringTok{"supersecret"}\NormalTok{)} - -\NormalTok{validation <-}\StringTok{ }\KeywordTok{externalValidatePlp}\NormalTok{(}\DataTypeTok{plpResult =}\NormalTok{ plpResult, } - \DataTypeTok{connectionDetails =}\NormalTok{ connectionDetails,} - \DataTypeTok{validationSchemaTarget =} \StringTok{'new_cohort_schema'}\NormalTok{,} - \DataTypeTok{validationSchemaOutcome =} \StringTok{'new_cohort_schema'}\NormalTok{,} - \DataTypeTok{validationSchemaCdm =} \StringTok{'new_cdm_schema'}\NormalTok{, } - \DataTypeTok{validationTableTarget =} \StringTok{'cohort_table'}\NormalTok{,} - \DataTypeTok{validationTableOutcome =} \StringTok{'cohort_table'}\NormalTok{, } - \DataTypeTok{validationIdTarget =} \StringTok{'cohort_id'}\NormalTok{, } - \DataTypeTok{validationIdOutcome =} \StringTok{'outcome_id'}\NormalTok{, } - \DataTypeTok{keepPrediction =}\NormalTok{ T} -\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -This will extract the new plpData from the specified schemas and cohort -tables. It will then apply the same population settings and the trained -plp model. Finally, it will evaluate the performance and return the -standard output as \texttt{validation\$performance} and if -keepPrediction is TRUE then it will also return the prediction on the -population as \texttt{validation\$prediction}. They can be inserted into -the shiny app for viewing the model and validation by running: -\texttt{viewPlp(runPlp=plpResult,\ validatePlp=validation\ )}. 
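-
-For completeness, the call just mentioned can be run as follows, assuming the
-\texttt{plpResult} and \texttt{validation} objects created above:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-# open the Shiny App with both the development results and the external validation
-viewPlp(runPlp = plpResult, validatePlp = validation)
-\end{Highlighting}
-\end{Shaded}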
-
-If you want to validate the model on multiple available databases, you can
-provide the new schemas and cohort tables as lists:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\CommentTok{# load the trained model}
-\NormalTok{plpResult <-}\StringTok{ }\KeywordTok{loadPlpResult}\NormalTok{(}\KeywordTok{getwd}\NormalTok{(),}\StringTok{'plpResult'}\NormalTok{)}
-
-\NormalTok{connectionDetails <-}\StringTok{ }\KeywordTok{createConnectionDetails}\NormalTok{(}\DataTypeTok{dbms =} \StringTok{"postgresql"}\NormalTok{, }
-                                             \DataTypeTok{server =} \StringTok{"localhost/ohdsi"}\NormalTok{, }
-                                             \DataTypeTok{user =} \StringTok{"joe"}\NormalTok{, }
-                                             \DataTypeTok{password =} \StringTok{"supersecret"}\NormalTok{)}
-
-\NormalTok{validation <-}\StringTok{ }\KeywordTok{externalValidatePlp}\NormalTok{(}\DataTypeTok{plpResult =}\NormalTok{ plpResult, }
-                                    \DataTypeTok{connectionDetails =}\NormalTok{ connectionDetails,}
-                                    \DataTypeTok{validationSchemaTarget =} \KeywordTok{list}\NormalTok{(}\StringTok{'new_cohort_schema1'}\NormalTok{,}
-                                                                  \StringTok{'new_cohort_schema2'}\NormalTok{),}
-                                    \DataTypeTok{validationSchemaOutcome =} \KeywordTok{list}\NormalTok{(}\StringTok{'new_cohort_schema1'}\NormalTok{,}
-                                                                   \StringTok{'new_cohort_schema2'}\NormalTok{),}
-                                    \DataTypeTok{validationSchemaCdm =} \KeywordTok{list}\NormalTok{(}\StringTok{'new_cdm_schema1'}\NormalTok{,}
-                                                               \StringTok{'new_cdm_schema2'}\NormalTok{), }
-                                    \DataTypeTok{validationTableTarget =} \KeywordTok{list}\NormalTok{(}\StringTok{'new_cohort_table1'}\NormalTok{,}
-                                                                 \StringTok{'new_cohort_table2'}\NormalTok{),}
-                                    \DataTypeTok{validationTableOutcome =} \KeywordTok{list}\NormalTok{(}\StringTok{'new_cohort_table1'}\NormalTok{,}
-                                                                  \StringTok{'new_cohort_table2'}\NormalTok{),}
-                                    \DataTypeTok{validationIdTarget =} \StringTok{'cohort_id'}\NormalTok{, }
-                                    \DataTypeTok{validationIdOutcome =} \StringTok{'outcome_id'}\NormalTok{, }
-                                    \DataTypeTok{keepPrediction =}\NormalTok{ T}
-\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-\hypertarget{journal-paper-generation}{%
-\section{Journal paper generation}\label{journal-paper-generation}}
-
-We have added functionality to automatically generate a Word document
-that you can use as the start of a journal paper. It contains many of the
-generated study details and results. If you have performed external
-validation, these results can be added as well. Optionally, you can
-add a ``Table 1'' that contains data on many covariates for the target
-population.
-
-You can create the draft journal paper by running this function:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\KeywordTok{createPlpJournalDocument}\NormalTok{(}\DataTypeTok{plpResult =} \OperatorTok{<}\NormalTok{your plp results}\OperatorTok{>}\NormalTok{, }
-                         \DataTypeTok{plpValidation =} \OperatorTok{<}\NormalTok{your validation results}\OperatorTok{>}\NormalTok{,}
-                         \DataTypeTok{plpData =} \OperatorTok{<}\NormalTok{your plp data}\OperatorTok{>}\NormalTok{, }
-                         \DataTypeTok{targetName =} \StringTok{""}\NormalTok{,}
-                         \DataTypeTok{outcomeName =} \StringTok{""}\NormalTok{, }
-                         \DataTypeTok{table1 =}\NormalTok{ F, }
-                         \DataTypeTok{connectionDetails =} \OtherTok{NULL}\NormalTok{,}
-                         \DataTypeTok{includeTrain =} \OtherTok{FALSE}\NormalTok{,}
-                         \DataTypeTok{includeTest =} \OtherTok{TRUE}\NormalTok{,}
-                         \DataTypeTok{includePredictionPicture =} \OtherTok{TRUE}\NormalTok{,}
-                         \DataTypeTok{includeAttritionPlot =} \OtherTok{TRUE}\NormalTok{,}
-                         \DataTypeTok{outputLocation =} \StringTok{""}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-For more details see the help page of the function.
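-
-As a sketch of how the placeholders might be filled in for our example study
-(the target and outcome names and the output location below are illustrative
-assumptions, not values required by the package):
-
-\begin{Shaded}
-\begin{Highlighting}[]
-# hypothetical example: reuses the gbmResults and validation objects created earlier
-createPlpJournalDocument(plpResult = gbmResults,
-                         plpValidation = validation,
-                         plpData = plpData,
-                         targetName = "New users of ACE inhibitors",
-                         outcomeName = "Angioedema",
-                         table1 = F,
-                         connectionDetails = NULL,
-                         includeTrain = FALSE,
-                         includeTest = TRUE,
-                         includePredictionPicture = TRUE,
-                         includeAttritionPlot = TRUE,
-                         outputLocation = file.path(getwd(), "plpJournalDoc"))
-\end{Highlighting}
-\end{Shaded}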
- -\newpage - -\hypertarget{other-functionality}{% -\section{Other functionality}\label{other-functionality}} - -The package has much more functionality than described in this vignette -and contributions have been made my many persons in the OHDSI community. -The table below provides an overview: - -\begin{longtable}[]{@{}lll@{}} -\toprule -\begin{minipage}[b]{0.18\columnwidth}\raggedright -Functionality\strut -\end{minipage} & \begin{minipage}[b]{0.55\columnwidth}\raggedright -Description\strut -\end{minipage} & \begin{minipage}[b]{0.18\columnwidth}\raggedright -Vignette\strut -\end{minipage}\tabularnewline -\midrule -\endhead -\begin{minipage}[t]{0.18\columnwidth}\raggedright -Builing Multiple Models\strut -\end{minipage} & \begin{minipage}[t]{0.55\columnwidth}\raggedright -This vignette describes how you can run multiple models -automatically\strut -\end{minipage} & \begin{minipage}[t]{0.18\columnwidth}\raggedright -\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/BuildingMultiplePredictiveModels.pdf}{\texttt{Vignette}}\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.18\columnwidth}\raggedright -Custom algorithms\strut -\end{minipage} & \begin{minipage}[t]{0.55\columnwidth}\raggedright -This vignette describes how you can add your own custom algorithms in -the framework\strut -\end{minipage} & \begin{minipage}[t]{0.18\columnwidth}\raggedright -\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/AddingCustomAlgorithms.pdf}{\texttt{Vignette}}\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.18\columnwidth}\raggedright -Ensemble models\strut -\end{minipage} & \begin{minipage}[t]{0.55\columnwidth}\raggedright -This vignette describes how you can use the framework to build ensemble -models, i.e combine multiple models in a super learner\strut -\end{minipage} & \begin{minipage}[t]{0.18\columnwidth}\raggedright -\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/BuildingEnsembleModels.pdf}{\texttt{Vignette}}\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.18\columnwidth}\raggedright -Deep Learning Models\strut -\end{minipage} & \begin{minipage}[t]{0.55\columnwidth}\raggedright -We have added extensive functionality for Deep Learning including -several architectures in both pyTorch and Keras. These algorithms can be -trained using GPU power\strut -\end{minipage} & \begin{minipage}[t]{0.18\columnwidth}\raggedright -\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/BuildingDeepLearningModels.pdf}{\texttt{Vignette}}\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.18\columnwidth}\raggedright -Learning curves\strut -\end{minipage} & \begin{minipage}[t]{0.55\columnwidth}\raggedright -Learning curves assess the effect of training set size on model -performance by training a sequence of prediction models on successively -larger subsets of the training set. 
A learning curve plot can also help -in diagnosing a bias or variance problem as explained below.\strut -\end{minipage} & \begin{minipage}[t]{0.18\columnwidth}\raggedright -\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/GeneratingLearningCurves.pdf}{\texttt{Vignette}}\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.18\columnwidth}\raggedright -Implementing existing models\strut -\end{minipage} & \begin{minipage}[t]{0.55\columnwidth}\raggedright -This vignette describes how you can implement existing logistic -regression models in the framework, e.g.~as found in literature\strut -\end{minipage} & \begin{minipage}[t]{0.18\columnwidth}\raggedright -\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/ImplementingExistingModels.pdf}{\texttt{Vignette}}\strut -\end{minipage}\tabularnewline -\bottomrule -\end{longtable} - -\hypertarget{demos}{% -\section{Demos}\label{demos}} - -We have added several demos in the package that run on simulated data: - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{# Show all demos in our package: } -\KeywordTok{demo}\NormalTok{(}\DataTypeTok{package =} \StringTok{"PatientLevelPrediction"}\NormalTok{)} - -\CommentTok{# For example, to run the SingleModelDemo that runs Lasso and shows you how to run the Shiny App use this call} -\KeywordTok{demo}\NormalTok{(}\StringTok{"SingleModelDemo"}\NormalTok{, }\DataTypeTok{package =} \StringTok{"PatientLevelPrediction"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\newpage - -\hypertarget{acknowledgments}{% -\section{Acknowledgments}\label{acknowledgments}} - -Considerable work has been dedicated to provide the -\texttt{PatientLevelPrediction} package. - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{citation}\NormalTok{(}\StringTok{"PatientLevelPrediction"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -## -## To cite PatientLevelPrediction in publications use: -## -## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design and -## implementation of a standardized framework to generate and evaluate patient-level -## prediction models using observational healthcare data." _Journal of the American -## Medical Informatics Association_, *25*(8), 969-975. . -## -## A BibTeX entry for LaTeX users is -## -## @Article{, -## author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek}, -## title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data}, -## journal = {Journal of the American Medical Informatics Association}, -## volume = {25}, -## number = {8}, -## pages = {969-975}, -## year = {2018}, -## url = {https://doi.org/10.1093/jamia/ocy032}, -## } -\end{verbatim} - -Further, \texttt{PatientLevelPrediction} makes extensive use of the -\texttt{Cyclops} package. - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{citation}\NormalTok{(}\StringTok{"Cyclops"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -## -## To cite Cyclops in publications use: -## -## Suchard MA, Simpson SE, Zorych I, Ryan P, Madigan D (2013). "Massive -## parallelization of serial inference algorithms for complex generalized linear -## models." _ACM Transactions on Modeling and Computer Simulation_, *23*, 10. . -## -## A BibTeX entry for LaTeX users is -## -## @Article{, -## author = {M. A. Suchard and S. E. Simpson and I. Zorych and P. Ryan and D. 
Madigan}, -## title = {Massive parallelization of serial inference algorithms for complex generalized linear models}, -## journal = {ACM Transactions on Modeling and Computer Simulation}, -## volume = {23}, -## pages = {10}, -## year = {2013}, -## url = {http://dl.acm.org/citation.cfm?id=2414791}, -## } -\end{verbatim} - -\textbf{Please reference this paper if you use the PLP Package in your -work:} - -\href{http://dx.doi.org/10.1093/jamia/ocy032}{Reps JM, Schuemie MJ, -Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a -standardized framework to generate and evaluate patient-level prediction -models using observational healthcare data. J Am Med Inform Assoc. -2018;25(8):969-975.} - -This work is supported in part through the National Science Foundation -grant IIS 1251151. - -\newpage - -\hypertarget{appendix-1-study-population-settings-details}{% -\section*{Appendix 1: Study population settings -details}\label{appendix-1-study-population-settings-details}} -\addcontentsline{toc}{section}{Appendix 1: Study population settings -details} - -In the figures below the effect is shown of the -removeSubjectsWithPriorOutcome, requireTimAtRisk, and includeAllOutcomes -booleans on the final study population. We start with a Target Cohort -with firstExposureOnly = false and we require a washout period = 1095. -We then subset the target cohort based on additional constraints. The -final study population in the Venn diagrams below are colored green. - -\begin{enumerate} -\def\labelenumi{\arabic{enumi})} -\item - Require minimum time-at-risk for all person in the target cohort - - \includegraphics{popdef1.png} -\item - Require minumum time-at-risk for target cohort, except for persons - with outcomes during time-at-risk. - - \includegraphics{popdef2.png} -\end{enumerate} - -\newpage 3) - -Include all persons in the target cohort exclude persons with prior -outcomes - -\includegraphics{popdef3.png} - -\begin{enumerate} -\def\labelenumi{\arabic{enumi})} -\setcounter{enumi}{3} -\item - Require minimum time-at-risk for target cohort, except for persons - with outcomes during time-at-risk, exclude persons with prior outcomes - - \includegraphics{popdef4.png} -\end{enumerate} - -\newpage - -\begin{enumerate} -\def\labelenumi{\arabic{enumi})} -\setcounter{enumi}{4} -\item - Include all persons in target cohort exclude persons with prior - outcomes - - \includegraphics{popdef5.png} -\item - Include all persons in target cohort - - \includegraphics{popdef6.png} -\end{enumerate} - -\end{document} diff --git a/inst/doc/CreatingLearningCurves.tex b/inst/doc/CreatingLearningCurves.tex deleted file mode 100644 index 79283a598..000000000 --- a/inst/doc/CreatingLearningCurves.tex +++ /dev/null @@ -1,408 +0,0 @@ -% Options for packages loaded elsewhere -\PassOptionsToPackage{unicode}{hyperref} -\PassOptionsToPackage{hyphens}{url} -% -\documentclass[ -]{article} -\usepackage{lmodern} -\usepackage{amssymb,amsmath} -\usepackage{ifxetex,ifluatex} -\ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex - \usepackage[T1]{fontenc} - \usepackage[utf8]{inputenc} - \usepackage{textcomp} % provide euro and other symbols -\else % if luatex or xetex - \usepackage{unicode-math} - \defaultfontfeatures{Scale=MatchLowercase} - \defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1} -\fi -% Use upquote if available, for straight quotes in verbatim environments -\IfFileExists{upquote.sty}{\usepackage{upquote}}{} -\IfFileExists{microtype.sty}{% use microtype if available - \usepackage[]{microtype} - 
\UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts -}{} -\makeatletter -\@ifundefined{KOMAClassName}{% if non-KOMA class - \IfFileExists{parskip.sty}{% - \usepackage{parskip} - }{% else - \setlength{\parindent}{0pt} - \setlength{\parskip}{6pt plus 2pt minus 1pt}} -}{% if KOMA class - \KOMAoptions{parskip=half}} -\makeatother -\usepackage{xcolor} -\IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available -\IfFileExists{bookmark.sty}{\usepackage{bookmark}}{\usepackage{hyperref}} -\hypersetup{ - pdftitle={Creating Learning Curves}, - pdfauthor={Luis H. John, Jenna M. Reps, Peter R. Rijnbeek}, - hidelinks, - pdfcreator={LaTeX via pandoc}} -\urlstyle{same} % disable monospaced font for URLs -\usepackage[margin=1in]{geometry} -\usepackage{color} -\usepackage{fancyvrb} -\newcommand{\VerbBar}{|} -\newcommand{\VERB}{\Verb[commandchars=\\\{\}]} -\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}} -% Add ',fontsize=\small' for more characters per line -\usepackage{framed} -\definecolor{shadecolor}{RGB}{248,248,248} -\newenvironment{Shaded}{\begin{snugshade}}{\end{snugshade}} -\newcommand{\AlertTok}[1]{\textcolor[rgb]{0.94,0.16,0.16}{#1}} -\newcommand{\AnnotationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\AttributeTok}[1]{\textcolor[rgb]{0.77,0.63,0.00}{#1}} -\newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\BuiltInTok}[1]{#1} -\newcommand{\CharTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\CommentTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}} -\newcommand{\CommentVarTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\ConstantTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\ControlFlowTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}} -\newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{#1}} -\newcommand{\DecValTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\DocumentationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\ErrorTok}[1]{\textcolor[rgb]{0.64,0.00,0.00}{\textbf{#1}}} -\newcommand{\ExtensionTok}[1]{#1} -\newcommand{\FloatTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\ImportTok}[1]{#1} -\newcommand{\InformationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}} -\newcommand{\NormalTok}[1]{#1} -\newcommand{\OperatorTok}[1]{\textcolor[rgb]{0.81,0.36,0.00}{\textbf{#1}}} -\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{#1}} -\newcommand{\PreprocessorTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}} -\newcommand{\RegionMarkerTok}[1]{#1} -\newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\SpecialStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\StringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\VariableTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\VerbatimStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\WarningTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\usepackage{graphicx,grffile} -\makeatletter -\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi} -\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi} -\makeatother -% Scale images if necessary, so that they will not overflow the page -% margins by default, and it is still possible 
to overwrite the defaults -% using explicit options in \includegraphics[width, height, ...]{} -\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio} -% Set default figure placement to htbp -\makeatletter -\def\fps@figure{htbp} -\makeatother -\setlength{\emergencystretch}{3em} % prevent overfull lines -\providecommand{\tightlist}{% - \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}} -\setcounter{secnumdepth}{-\maxdimen} % remove section numbering -\usepackage{fancyhdr} -\pagestyle{fancy} -\fancyhead{} -\fancyhead[CO,CE]{Generating Learning Curves} -\fancyfoot[LE,RO]{\thepage} -\renewcommand{\headrulewidth}{0.4pt} -\renewcommand{\footrulewidth}{0.4pt} -\fancyfoot[CO,CE]{PatientLevelPrediction Package Version 4.0.6} - -\title{Creating Learning Curves} -\author{Luis H. John, Jenna M. Reps, Peter R. Rijnbeek} -\date{2020-08-19} - -\begin{document} -\maketitle - -\hypertarget{introduction}{% -\section{Introduction}\label{introduction}} - -This vignette describes how you can use the Observational Health Data -Sciences and Informatics (OHDSI) -\href{http://github.com/OHDSI/PatientLevelPrediction}{\texttt{PatientLevelPrediction}} -package to create learning curves. This vignette assumes you have read -and are comfortable with building patient level prediction models as -described in the -\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/BuildingPredictiveModels.pdf}{\texttt{BuildingPredictiveModels} -vignette}. - -Prediction models will show overly-optimistic performance when -predicting on the same data as used for training. Therefore, -best-practice is to partition our data into a training set and testing -set. We then train our prediction model on the training set portion and -asses its ability to generalize to unseen data by measuring its -performance on the testing set. - -Learning curves assess the effect of training set size on model -performance by training a sequence of prediction models on successively -larger subsets of the training set. A learning curve plot can also help -in diagnosing a bias or variance problem as explained below. - -\begin{figure} -\centering -\includegraphics{learningCurve.png} -\caption{Learning curve example.} -\end{figure} - -Figure 1, shows an example of learning curve plot in which the vertical -axis represents the model performance and the horizontal axis the -training set size. If training set size is small, the performance on the -training set is high, because a model can often be fitted well to a -limited number of training examples. At the same time, the performance -on the testing set will be poor, because the model trained on such a -limited number of training examples will not generalize well to unseen -data in the testing set. As the training set size increases, the -performance of the model on the training set will decrease. It becomes -more difficult for the model to find a good fit through all the training -examples. Also, the model will be trained on a more representative -portion of training examples, making it generalize better to unseen -data. This can be observed by the increasin testing set performance. - -The learning curve can help us in diagnosing bias and variance problems -with our classifier which will provide guidance on how to further -improve our model. We can observe high variance (overfitting) in a -prediction model if it performs well on the training set, but poorly on -the testing set (Figure 2). Adding additional data is a common approach -to counteract high variance. 
From the learning curve it becomes -apparent, that adding additional data may improve performance on the -testing set a little further, as the learning curve has not yet -plateaued and, thus, the model is not saturated yet. Therefore, adding -more data will decrease the gap between training set and testing set, -which is the main indicator for a high variance problem. - -\begin{figure} -\centering -\includegraphics{learningCurveVariance.png} -\caption{Prediction model suffering from high variance.} -\end{figure} - -Furthermore, we can observe high bias (underfitting) if a prediction -model performs poorly on the training set as well as on the testing set -(Figure 3). The learning curves of training set and testing set have -flattened on a low performance with only a small gap in between them. -Adding additional data will in this case have little to no impact on the -model performance. Choosing another prediction algorithm that can find -more complex (for example non-linear) relationships in the data may be -an alternative approach to consider in this high bias situation. - -\begin{figure} -\centering -\includegraphics{learningCurveBias.png} -\caption{Prediction model suffering from high bias.} -\end{figure} - -\hypertarget{creating-the-learning-curve}{% -\section{Creating the learning -curve}\label{creating-the-learning-curve}} - -Use the -\href{http://github.com/OHDSI/PatientLevelPrediction}{\texttt{PatientLevelPrediction}} -package to create a \texttt{population} and \texttt{plpData} object. -Alternatively, you can make use of the data simulator. The following -code snippet creates a population of 12000 patients. - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{set.seed}\NormalTok{(}\DecValTok{1234}\NormalTok{)} -\KeywordTok{data}\NormalTok{(plpDataSimulationProfile)} -\NormalTok{sampleSize <-}\StringTok{ }\DecValTok{12000} -\NormalTok{plpData <-}\StringTok{ }\KeywordTok{simulatePlpData}\NormalTok{(} -\NormalTok{ plpDataSimulationProfile,} - \DataTypeTok{n =}\NormalTok{ sampleSize} -\NormalTok{)} - -\NormalTok{population <-}\StringTok{ }\KeywordTok{createStudyPopulation}\NormalTok{(} -\NormalTok{ plpData,} - \DataTypeTok{outcomeId =} \DecValTok{2}\NormalTok{,} - \DataTypeTok{binary =} \OtherTok{TRUE}\NormalTok{,} - \DataTypeTok{firstExposureOnly =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{washoutPeriod =} \DecValTok{0}\NormalTok{,} - \DataTypeTok{removeSubjectsWithPriorOutcome =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{priorOutcomeLookback =} \DecValTok{99999}\NormalTok{,} - \DataTypeTok{requireTimeAtRisk =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{minTimeAtRisk =} \DecValTok{0}\NormalTok{,} - \DataTypeTok{riskWindowStart =} \DecValTok{0}\NormalTok{,} - \DataTypeTok{riskWindowEnd =} \DecValTok{365}\NormalTok{,} - \DataTypeTok{verbosity =} \StringTok{"INFO"} -\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -Specify the prediction algorithm to be used. - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{# Use LASSO logistic regression} -\NormalTok{modelSettings <-}\StringTok{ }\KeywordTok{setLassoLogisticRegression}\NormalTok{()} -\end{Highlighting} -\end{Shaded} - -Specify a test fraction and a sequence of training set fractions. -Alternatively, you can provide a sequence of training events instead of -the training set fractions. This is recommended, because events are -determinant of model performance. Make sure that your training set can -provide the number of events specified. 
- -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{testFraction <-}\StringTok{ }\FloatTok{0.2} -\NormalTok{trainFractions <-}\StringTok{ }\KeywordTok{seq}\NormalTok{(}\FloatTok{0.1}\NormalTok{, }\FloatTok{0.8}\NormalTok{, }\FloatTok{0.1}\NormalTok{) }\CommentTok{# Create eight training set fractions} - -\CommentTok{# alternatively use a sequence of training events by uncommenting the line below.} -\CommentTok{# trainEvents <- seq(100, 5000, 100)} -\end{Highlighting} -\end{Shaded} - -Specify the test split to be used. - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{# Use a split by person, alternatively a time split is possible} -\NormalTok{testSplit <-}\StringTok{ 'stratified'} -\end{Highlighting} -\end{Shaded} - -Create the learning curve object. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{learningCurve <-}\StringTok{ }\KeywordTok{createLearningCurve}\NormalTok{(population,} - \DataTypeTok{plpData =}\NormalTok{ plpData,} - \DataTypeTok{modelSettings =}\NormalTok{ modelSettings,} - \DataTypeTok{testFraction =} \FloatTok{0.2}\NormalTok{,} - \DataTypeTok{verbosity =} \StringTok{"TRACE"}\NormalTok{,} - \DataTypeTok{trainFractions =}\NormalTok{ trainFractions,} - \CommentTok{# trainEvents = trainEvents,} - \DataTypeTok{splitSeed =} \DecValTok{1000}\NormalTok{,} - \DataTypeTok{saveModel =} \OtherTok{TRUE}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -Plot the learning curve object (Figure 4). Specify one of the available -metrics: \texttt{AUROC}, \texttt{AUPRC}, \texttt{sBrier}. Moreover, you -can specify what metric to put on the abscissa, number of -\texttt{observations} or number of \texttt{events}. We recommend the -latter, because \texttt{events} are determinant of model performance and -allow you to better compare learning curves across different prediction -problems and databases. - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{plotLearningCurve}\NormalTok{(} -\NormalTok{ learningCurve,} - \DataTypeTok{metric =} \StringTok{'AUROC'}\NormalTok{,} - \DataTypeTok{abscissa =} \StringTok{'events'}\NormalTok{,} - \DataTypeTok{plotTitle =} \StringTok{'Learning Curve'}\NormalTok{,} - \DataTypeTok{plotSubtitle =} \StringTok{'AUROC performance'} -\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{figure} -\centering -\includegraphics{learningCurvePlot.png} -\caption{Learning curve plot.} -\end{figure} - -\hypertarget{parallel-processing}{% -\section{Parallel processing}\label{parallel-processing}} - -The learning curve object can be created in parallel, which can reduce -computation time significantly. Currently this functionality is only -available for LASSO logistic regression and gradient boosting machines. -Depending on the number of parallel workers it may require a significant -amount of memory. We advise to use the parallelized learning curve -function for parameter search and exploratory data analysis. - -Use the parallelized version of the learning curve function to create -the learning curve object in parallel. R will find the number of -available processing cores automatically and register the required -parallel backend. Alternatively, you can provide the number of cores you -wish to use. 
- -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{learningCurvePar <-}\StringTok{ }\KeywordTok{createLearningCurvePar}\NormalTok{(} -\NormalTok{ population,} - \DataTypeTok{plpData =}\NormalTok{ plpData,} - \DataTypeTok{modelSettings =}\NormalTok{ modelSettings,} - \DataTypeTok{testSplit =}\NormalTok{ testSplit,} - \DataTypeTok{testFraction =}\NormalTok{ testFraction,} - \DataTypeTok{trainEvents =}\NormalTok{ trainEvents,} - \DataTypeTok{cores =} \DecValTok{4}\NormalTok{,} - \DataTypeTok{splitSeed =} \DecValTok{1000} -\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\hypertarget{demo}{% -\section{Demo}\label{demo}} - -We have added a demo of the learningcurve: - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{# Show all demos in our package: } - \KeywordTok{demo}\NormalTok{(}\DataTypeTok{package =} \StringTok{"PatientLevelPrediction"}\NormalTok{)} - -\CommentTok{# Run the learning curve} - \KeywordTok{demo}\NormalTok{(}\StringTok{"LearningCurveDemo"}\NormalTok{, }\DataTypeTok{package =} \StringTok{"PatientLevelPrediction"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -Do note that running this demo can take a considerable amount of time -(15 min on Quad core running in parallel)! - -\hypertarget{publication}{% -\section{Publication}\label{publication}} - -A publication titled `How little data do we need for patient-level -prediction?' uses the learning curve functionality in this package and -can be accessed as preprint in the arXiv archives at -\url{https://arxiv.org/abs/2008.07361}. - -\hypertarget{acknowledgments}{% -\section{Acknowledgments}\label{acknowledgments}} - -Considerable work has been dedicated to provide the -\texttt{PatientLevelPrediction} package. - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{citation}\NormalTok{(}\StringTok{"PatientLevelPrediction"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -## -## To cite PatientLevelPrediction in publications use: -## -## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design and implementation of a standardized framework -## to generate and evaluate patient-level prediction models using observational healthcare data." _Journal of the -## American Medical Informatics Association_, *25*(8), 969-975. . -## -## A BibTeX entry for LaTeX users is -## -## @Article{, -## author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek}, -## title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data}, -## journal = {Journal of the American Medical Informatics Association}, -## volume = {25}, -## number = {8}, -## pages = {969-975}, -## year = {2018}, -## url = {https://doi.org/10.1093/jamia/ocy032}, -## } -\end{verbatim} - -\textbf{Please reference this paper if you use the PLP Package in your -work:} - -\href{http://dx.doi.org/10.1093/jamia/ocy032}{Reps JM, Schuemie MJ, -Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a -standardized framework to generate and evaluate patient-level prediction -models using observational healthcare data. J Am Med Inform Assoc. 
-2018;25(8):969-975.} - -\end{document} diff --git a/inst/doc/CreatingNetworkstudies.tex b/inst/doc/CreatingNetworkstudies.tex deleted file mode 100644 index 62ec2f7e2..000000000 --- a/inst/doc/CreatingNetworkstudies.tex +++ /dev/null @@ -1,420 +0,0 @@ -% Options for packages loaded elsewhere -\PassOptionsToPackage{unicode}{hyperref} -\PassOptionsToPackage{hyphens}{url} -% -\documentclass[ -]{article} -\usepackage{lmodern} -\usepackage{amssymb,amsmath} -\usepackage{ifxetex,ifluatex} -\ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex - \usepackage[T1]{fontenc} - \usepackage[utf8]{inputenc} - \usepackage{textcomp} % provide euro and other symbols -\else % if luatex or xetex - \usepackage{unicode-math} - \defaultfontfeatures{Scale=MatchLowercase} - \defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1} -\fi -% Use upquote if available, for straight quotes in verbatim environments -\IfFileExists{upquote.sty}{\usepackage{upquote}}{} -\IfFileExists{microtype.sty}{% use microtype if available - \usepackage[]{microtype} - \UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts -}{} -\makeatletter -\@ifundefined{KOMAClassName}{% if non-KOMA class - \IfFileExists{parskip.sty}{% - \usepackage{parskip} - }{% else - \setlength{\parindent}{0pt} - \setlength{\parskip}{6pt plus 2pt minus 1pt}} -}{% if KOMA class - \KOMAoptions{parskip=half}} -\makeatother -\usepackage{xcolor} -\IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available -\IfFileExists{bookmark.sty}{\usepackage{bookmark}}{\usepackage{hyperref}} -\hypersetup{ - pdftitle={Making patient-level predictive network study packages}, - pdfauthor={Jenna Reps, Martijn J. Schuemie, Patrick B. Ryan, Peter R. Rijnbeek}, - hidelinks, - pdfcreator={LaTeX via pandoc}} -\urlstyle{same} % disable monospaced font for URLs -\usepackage[margin=1in]{geometry} -\usepackage{color} -\usepackage{fancyvrb} -\newcommand{\VerbBar}{|} -\newcommand{\VERB}{\Verb[commandchars=\\\{\}]} -\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}} -% Add ',fontsize=\small' for more characters per line -\usepackage{framed} -\definecolor{shadecolor}{RGB}{248,248,248} -\newenvironment{Shaded}{\begin{snugshade}}{\end{snugshade}} -\newcommand{\AlertTok}[1]{\textcolor[rgb]{0.94,0.16,0.16}{#1}} -\newcommand{\AnnotationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\AttributeTok}[1]{\textcolor[rgb]{0.77,0.63,0.00}{#1}} -\newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\BuiltInTok}[1]{#1} -\newcommand{\CharTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\CommentTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}} -\newcommand{\CommentVarTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\ConstantTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\ControlFlowTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}} -\newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{#1}} -\newcommand{\DecValTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\DocumentationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\ErrorTok}[1]{\textcolor[rgb]{0.64,0.00,0.00}{\textbf{#1}}} -\newcommand{\ExtensionTok}[1]{#1} -\newcommand{\FloatTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\ImportTok}[1]{#1} -\newcommand{\InformationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} 
-\newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}} -\newcommand{\NormalTok}[1]{#1} -\newcommand{\OperatorTok}[1]{\textcolor[rgb]{0.81,0.36,0.00}{\textbf{#1}}} -\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{#1}} -\newcommand{\PreprocessorTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}} -\newcommand{\RegionMarkerTok}[1]{#1} -\newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\SpecialStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\StringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\VariableTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\VerbatimStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\WarningTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\usepackage{longtable,booktabs} -% Correct order of tables after \paragraph or \subparagraph -\usepackage{etoolbox} -\makeatletter -\patchcmd\longtable{\par}{\if@noskipsec\mbox{}\fi\par}{}{} -\makeatother -% Allow footnotes in longtable head/foot -\IfFileExists{footnotehyper.sty}{\usepackage{footnotehyper}}{\usepackage{footnote}} -\makesavenoteenv{longtable} -\usepackage{graphicx,grffile} -\makeatletter -\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi} -\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi} -\makeatother -% Scale images if necessary, so that they will not overflow the page -% margins by default, and it is still possible to overwrite the defaults -% using explicit options in \includegraphics[width, height, ...]{} -\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio} -% Set default figure placement to htbp -\makeatletter -\def\fps@figure{htbp} -\makeatother -\setlength{\emergencystretch}{3em} % prevent overfull lines -\providecommand{\tightlist}{% - \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}} -\setcounter{secnumdepth}{5} - -\title{Making patient-level predictive network study packages} -\author{Jenna Reps, Martijn J. Schuemie, Patrick B. Ryan, Peter R. Rijnbeek} -\date{2020-06-03} - -\begin{document} -\maketitle - -{ -\setcounter{tocdepth}{2} -\tableofcontents -} -\newpage - -\hypertarget{introduction}{% -\section{Introduction}\label{introduction}} - -The OHDSI Patient Level Prediction (PLP) package provides the framework -to implement prediction models at scale. This can range from developing -a large number of models across sites (methodology and study design -insight) to extensive external validation of existing models in the -OHDSI PLP framework (model insight). This vignette describes how you can -use the \texttt{PatientLevelPrediction} package to create a network -study package. - -\hypertarget{running-a-network-study-process}{% -\section{Running a Network study -Process}\label{running-a-network-study-process}} - -\hypertarget{step-1-developing-the-study}{% -\subsection{Step 1 -- developing the -study}\label{step-1-developing-the-study}} - -\begin{itemize} -\tightlist -\item - Design the study: target/outcome cohort logic, concept sets for - medical definitions, settings for developing new model or validation - of adding existing models to framework. Suggestion: look in literature - for validated definitions. -\item - Write a protocol that motivates the study and provides full details - (sufficient for people to replicate the study in the future). 
-\item
-  Write an R package for implementing the study across diverse
-  computational environments {[}see guidance below for the structure of
-  the package and use the skeleton github package here: \ldots{} {]}
-\end{itemize}
-
-\hypertarget{step-2-implementing-the-study-part-1}{%
-\subsection{Step 2 -- implementing the study part
-1}\label{step-2-implementing-the-study-part-1}}
-
-\begin{itemize}
-\tightlist
-\item
-  Get contributors to install the package and dependencies. Ensure the
-  package is installed correctly by running the checkInstall functions.
-\item
-  Get contributors to run the createCohort function to inspect the
-  target/outcome definitions. If the definitions are not suitable for a
-  site, go back to step 1 and revise the cohort definitions.
-\end{itemize}
-
-\hypertarget{step-3-implementing-the-study-part-2-make-sure-package-checks-outputs-the-package-is-functioning-as-planned-and-the-definitions-are-valid-across-sites}{%
-\subsection{Step 3 -- implementing the study part 2 {[}make sure the
-package check outputs show the package is functioning as planned and
-the definitions are valid across
-sites{]}}\label{step-3-implementing-the-study-part-2-make-sure-package-checks-outputs-the-package-is-functioning-as-planned-and-the-definitions-are-valid-across-sites}}
-
-\begin{itemize}
-\tightlist
-\item
-  Get contributors to run main.R with the settings configured to
-  their environment
-\item
-  Get the contributors to submit the results
-\end{itemize}
-
-\hypertarget{step-4-publication}{%
-\subsection{Step 4 -- Publication}\label{step-4-publication}}
-
-\begin{itemize}
-\tightlist
-\item
-  The study creator has the first option to be first author; if he/she
-  does not wish to be first author, he/she can pick the most suitable
-  person from the contributors. All contributors will be listed as
-  authors on the paper. The last author will be the person who
-  led/managed the study; if this was the first author, the first author
-  can pick the most suitable last author. All authors between the first
-  and last author will be listed alphabetically by last name.
-\end{itemize}
-
-\hypertarget{package-skeleton---file-structure}{%
-\section{Package Skeleton - File
-Structure}\label{package-skeleton---file-structure}}
-
-\begin{itemize}
-\tightlist
-\item
-  DESCRIPTION -- This file describes the R package and the dependencies
-\item
-  NAMESPACE -- This file is created automatically by Roxygen
-\item
-  Readme.md -- This file should provide the step-by-step guidance on
-  implementing the package
-\item
-  R
-\item
-  helpers.r -- all the custom functions used by the package should be in
-  this file (e.g., checkInstall)
-\item
-  main.r -- this file will call the functions in helpers.r to execute
-  the full study
-\item
-  submit.r -- this file will be called at the end to submit the
-  compressed folder to the study creator/manager.
-\item
-  Man -- this folder will contain the documentation for the functions in
-  helpers.r (this should be automatically generated by roxygen)
-\item
-  Inst
-\item
-  sql/sql\_server
-
-  \begin{itemize}
-  \tightlist
-  \item
-    targetCohort -- the target cohort parameterised sql code
-  \item
-    outcomeCohort -- the outcome cohort parameterised sql code
-  \end{itemize}
-\item
-  extdata -- place any data required for the package here
-\item
-  plp\_models -- place any PLP models here
-\item
-  existing\_models -- place the files for existing models here
-\item
-  Extras
-\end{itemize}
-
-\hypertarget{package-skeleton---output-of-running-package}{%
-\section{Package Skeleton - Output of Running
-Package}\label{package-skeleton---output-of-running-package}}
-
-The output should contain three folders inside the study output
-directory, for example
-\texttt{outputLoc\ \textless{}-\ (file.path(getwd(),\ paste0(studyName\_database\_date)))}:
-
-\begin{itemize}
-\tightlist
-\item
-  Plots -- containing the test/train or validation ROC plot, calibration
-  plot, precision recall plot and optionally the demographic calibration
-  plot
-\item
-  Results -- the output of running savePlpResult
-\item
-  Summary -- a summary csv of performance and the table 1 csv
-\end{itemize}
-
-Then there should also be a zip file of the folder in the working
-directory containing the same folders and files but with sensitive
-results removed (this will be created using the packageResults
-function). Once the contributor has inspected the zipped file and is
-happy with the content being shared, he/she can then finally run the
-submit function with the details provided in the readme.md.
-
-\hypertarget{example-code-to-make-package-for-external-validation-of-plp-model}{%
-\section{Example Code To Make Package For External Validation of PLP
-Model}\label{example-code-to-make-package-for-external-validation-of-plp-model}}
-
-First you need to make a copy of the PatientLevelPrediction skeleton
-package found here:
-
-Assuming you ran a successful PatientLevelPrediction model development
-and saved the output of \texttt{runPlp()} to the location `goodModel' in
-your working directory, then:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\KeywordTok{library}\NormalTok{(PatientLevelPrediction)}
-\NormalTok{plpResult <-}\StringTok{ }\KeywordTok{loadPlpResult}\NormalTok{(}\StringTok{"goodModel"}\NormalTok{)}
-
-\CommentTok{# add the model to the skeleton package with sensitive information removed}
-\KeywordTok{exportPlpResult}\NormalTok{(}\DataTypeTok{plpResult =}\NormalTok{ plpResult, }\DataTypeTok{modelName =} \StringTok{"Model Name"}\NormalTok{, }\DataTypeTok{packageName =} \StringTok{"Your Package Name"}\NormalTok{, }
-    \DataTypeTok{gitHubLocation =} \StringTok{"location/of/github"}\NormalTok{, }\DataTypeTok{includeEvaluationStatistics =}\NormalTok{ T, }\DataTypeTok{includeThresholdSummary =}\NormalTok{ T, }
-    \DataTypeTok{includeDemographicSummary =}\NormalTok{ T, }\DataTypeTok{includeCalibrationSummary =}\NormalTok{ T, }\DataTypeTok{includePredictionDistribution =}\NormalTok{ T, }
-    \DataTypeTok{includeCovariateSummary =}\NormalTok{ F)}
-\end{Highlighting}
-\end{Shaded}
-
-Now you want to add the cohorts (generally the parameterized sql
-required to create one or more target and outcome cohorts). This should
-be added into the inst/sql/sql\_server directory of your package. If you
-are using Atlas to create the cohorts then you can use:
-\texttt{OhdsiRTools::insertCirceDefinitionInPackage()}. The settings for
-the cohort creation are defined in the inst/extdata directory in the
-file cohort\_details.csv. This file contains two columns: cohortName and
-cohortId. The cohortName should contain the name of the sql file of the
-cohort in inst/sql/sql\_server (e.g., a file called ``targetCohort.sql''
-has the name ``targetCohort'') and the cohortId is the default
-cohort\_definition\_id that will be used when people run the study
-corresponding to this cohort. The main.R file in the extras directory
-contains the vanilla code to run a study with the model exported into
-the package and the cohort files added.
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\KeywordTok{library}\NormalTok{(PatientLevelPrediction)}
-\CommentTok{# input settings for person running the study}
-\NormalTok{connectionDetails <-}\StringTok{ " "}
-\NormalTok{cdmDatabaseSchema <-}\StringTok{ "their_cdm_database"}
-\NormalTok{databaseName <-}\StringTok{ "Name for database"}
-\NormalTok{cohortDatabaseSchema <-}\StringTok{ "a_database_with_write_priv"}
-\NormalTok{cohortTable <-}\StringTok{ "package_table"}
-\NormalTok{outputLocation <-}\StringTok{ "location to save results"}
-
-\NormalTok{cohortDetails <-}\StringTok{ }\KeywordTok{createCohort}\NormalTok{(}\DataTypeTok{connectionDetails =}\NormalTok{ connectionDetails, }\DataTypeTok{cdmDatabaseSchema =}\NormalTok{ cdmDatabaseSchema, }
-    \DataTypeTok{cohortDatabaseSchema =}\NormalTok{ cohortDatabaseSchema, }\DataTypeTok{cohortTable =}\NormalTok{ cohortTable, }\DataTypeTok{package =} \StringTok{"Your Package Name"}\NormalTok{)}
-
-\NormalTok{plpResult <-}\StringTok{ }\KeywordTok{loadPlpResult}\NormalTok{(}\KeywordTok{system.file}\NormalTok{(}\StringTok{"model"}\NormalTok{, }\DataTypeTok{package =} \StringTok{"Your Package Name"}\NormalTok{))}
-\NormalTok{result <-}\StringTok{ }\KeywordTok{externalValidatePlp}\NormalTok{(}\DataTypeTok{plpResult =}\NormalTok{ plpResult, }\DataTypeTok{connectionDetails =}\NormalTok{ connectionDetails, }\DataTypeTok{validationSchemaTarget =}\NormalTok{ cohortDatabaseSchema, }
-    \DataTypeTok{validationSchemaOutcome =}\NormalTok{ cohortDatabaseSchema, }\DataTypeTok{validationSchemaCdm =}\NormalTok{ cdmDatabaseSchema, }\DataTypeTok{validationTableTarget =}\NormalTok{ cohortTable, }
-    \DataTypeTok{validationTableOutcome =}\NormalTok{ cohortTable, }\DataTypeTok{validationIdTarget =}\NormalTok{ target_cohort_id, }\DataTypeTok{validationIdOutcome =}\NormalTok{ outcome_cohort_id)}
-
-\CommentTok{# save results to standard output}
-\NormalTok{resultLoc <-}\StringTok{ }\KeywordTok{standardOutput}\NormalTok{(}\DataTypeTok{result =}\NormalTok{ result, }\DataTypeTok{outputLocation =}\NormalTok{ outputLocation, }\DataTypeTok{studyName =} \StringTok{"external validation of ... 
model"}\NormalTok{, } - \DataTypeTok{databaseName =}\NormalTok{ databaseName, }\DataTypeTok{cohortName =} \StringTok{"your cohortName"}\NormalTok{, }\DataTypeTok{outcomeName =} \StringTok{"your outcomeName"}\NormalTok{)} - -\CommentTok{# package results ready to submit} -\KeywordTok{packageResults}\NormalTok{(}\DataTypeTok{mainFolder =}\NormalTok{ resultLoc, }\DataTypeTok{includeROCplot =}\NormalTok{ T, }\DataTypeTok{includeCalibrationPlot =}\NormalTok{ T, }\DataTypeTok{includePRPlot =}\NormalTok{ T, } - \DataTypeTok{includeTable1 =}\NormalTok{ F, }\DataTypeTok{includeThresholdSummary =}\NormalTok{ T, }\DataTypeTok{includeDemographicSummary =}\NormalTok{ T, }\DataTypeTok{includeCalibrationSummary =}\NormalTok{ T, } - \DataTypeTok{includePredictionDistribution =}\NormalTok{ T, }\DataTypeTok{includeCovariateSummary =}\NormalTok{ F, }\DataTypeTok{removeLessThanN =}\NormalTok{ F, }\DataTypeTok{N =} \DecValTok{10}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -Where the target\_cohort\_id and outcome\_cohort\_id should correspond -to the cohort\_details.csv file. - -We recommend getting the network implementors to submit their results of -\texttt{createCohort()} before continuing with the study to ensure -definitions run across the network. After running the rest of main.R the -implementor should inspect the files in the export folder created by the -package to ensure there isn't sensitive data remaining. Once checked the -implementor can run submit.R to send the results to the study organisor. -The submit.R file is: - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{submitResults}\NormalTok{(}\DataTypeTok{exportFolder =}\NormalTok{ outputLocation, }\DataTypeTok{dbName =}\NormalTok{ databaseName, key, secret)} -\end{Highlighting} -\end{Shaded} - -\hypertarget{useful-patientlevelprediction-functions}{% -\section{Useful PatientLevelPrediction -Functions}\label{useful-patientlevelprediction-functions}} - -The functions to aid the creation of a network study are: - -\begin{longtable}[]{@{}lll@{}} -\toprule -\begin{minipage}[b]{0.26\columnwidth}\raggedright -Function\strut -\end{minipage} & \begin{minipage}[b]{0.42\columnwidth}\raggedright -Description\strut -\end{minipage} & \begin{minipage}[b]{0.23\columnwidth}\raggedright -Usage\strut -\end{minipage}\tabularnewline -\midrule -\endhead -\begin{minipage}[t]{0.26\columnwidth}\raggedright -\texttt{checkPlpInstall()}\strut -\end{minipage} & \begin{minipage}[t]{0.42\columnwidth}\raggedright -This function checks the connection, and various aspects of the PLP -package to check it is set up correctly\strut -\end{minipage} & \begin{minipage}[t]{0.23\columnwidth}\raggedright -This should be run with the appropriate settings to check the -contributor is set up correctly for the study\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.26\columnwidth}\raggedright -\texttt{getPlpData()}\strut -\end{minipage} & \begin{minipage}[t]{0.42\columnwidth}\raggedright -This function extracts the data from the cdm for model development\strut -\end{minipage} & \begin{minipage}[t]{0.23\columnwidth}\raggedright -This should be used if developing new models\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.26\columnwidth}\raggedright -\texttt{runPlp()}\strut -\end{minipage} & \begin{minipage}[t]{0.42\columnwidth}\raggedright -This function trains and tests a new PLP model\strut -\end{minipage} & \begin{minipage}[t]{0.23\columnwidth}\raggedright -This should be used if developing new models\strut -\end{minipage}\tabularnewline 
-\begin{minipage}[t]{0.26\columnwidth}\raggedright -\texttt{transportPlp()}\strut -\end{minipage} & \begin{minipage}[t]{0.42\columnwidth}\raggedright -This function exports the output of runPlp into an R package while -removing sensitive objects\strut -\end{minipage} & \begin{minipage}[t]{0.23\columnwidth}\raggedright -This should be used when saving a model into a study package to validate -the model\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.26\columnwidth}\raggedright -\texttt{externalValidatePlp()}\strut -\end{minipage} & \begin{minipage}[t]{0.42\columnwidth}\raggedright -This function requires the user to inpute an existing model and then -extracts the required data on a new database and applies/evaluates the -model.\strut -\end{minipage} & \begin{minipage}[t]{0.23\columnwidth}\raggedright -This should be used if validating a PLP model\strut -\end{minipage}\tabularnewline -\bottomrule -\end{longtable} - -\end{document} diff --git a/inst/doc/CreatingShinyApp.tex b/inst/doc/CreatingShinyApp.tex deleted file mode 100644 index 1e6884a8a..000000000 --- a/inst/doc/CreatingShinyApp.tex +++ /dev/null @@ -1,502 +0,0 @@ -% Options for packages loaded elsewhere -\PassOptionsToPackage{unicode}{hyperref} -\PassOptionsToPackage{hyphens}{url} -% -\documentclass[ -]{article} -\usepackage{lmodern} -\usepackage{amssymb,amsmath} -\usepackage{ifxetex,ifluatex} -\ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex - \usepackage[T1]{fontenc} - \usepackage[utf8]{inputenc} - \usepackage{textcomp} % provide euro and other symbols -\else % if luatex or xetex - \usepackage{unicode-math} - \defaultfontfeatures{Scale=MatchLowercase} - \defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1} -\fi -% Use upquote if available, for straight quotes in verbatim environments -\IfFileExists{upquote.sty}{\usepackage{upquote}}{} -\IfFileExists{microtype.sty}{% use microtype if available - \usepackage[]{microtype} - \UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts -}{} -\makeatletter -\@ifundefined{KOMAClassName}{% if non-KOMA class - \IfFileExists{parskip.sty}{% - \usepackage{parskip} - }{% else - \setlength{\parindent}{0pt} - \setlength{\parskip}{6pt plus 2pt minus 1pt}} -}{% if KOMA class - \KOMAoptions{parskip=half}} -\makeatother -\usepackage{xcolor} -\IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available -\IfFileExists{bookmark.sty}{\usepackage{bookmark}}{\usepackage{hyperref}} -\hypersetup{ - pdftitle={Creating Shiny App}, - pdfauthor={Jenna Reps}, - hidelinks, - pdfcreator={LaTeX via pandoc}} -\urlstyle{same} % disable monospaced font for URLs -\usepackage[margin=1in]{geometry} -\usepackage{color} -\usepackage{fancyvrb} -\newcommand{\VerbBar}{|} -\newcommand{\VERB}{\Verb[commandchars=\\\{\}]} -\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}} -% Add ',fontsize=\small' for more characters per line -\usepackage{framed} -\definecolor{shadecolor}{RGB}{248,248,248} -\newenvironment{Shaded}{\begin{snugshade}}{\end{snugshade}} -\newcommand{\AlertTok}[1]{\textcolor[rgb]{0.94,0.16,0.16}{#1}} -\newcommand{\AnnotationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\AttributeTok}[1]{\textcolor[rgb]{0.77,0.63,0.00}{#1}} -\newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\BuiltInTok}[1]{#1} -\newcommand{\CharTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\CommentTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}} 
-\newcommand{\CommentVarTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
-\newcommand{\ConstantTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
-\newcommand{\ControlFlowTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}}
-\newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{#1}}
-\newcommand{\DecValTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}}
-\newcommand{\DocumentationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
-\newcommand{\ErrorTok}[1]{\textcolor[rgb]{0.64,0.00,0.00}{\textbf{#1}}}
-\newcommand{\ExtensionTok}[1]{#1}
-\newcommand{\FloatTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}}
-\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
-\newcommand{\ImportTok}[1]{#1}
-\newcommand{\InformationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
-\newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}}
-\newcommand{\NormalTok}[1]{#1}
-\newcommand{\OperatorTok}[1]{\textcolor[rgb]{0.81,0.36,0.00}{\textbf{#1}}}
-\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{#1}}
-\newcommand{\PreprocessorTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}}
-\newcommand{\RegionMarkerTok}[1]{#1}
-\newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
-\newcommand{\SpecialStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}}
-\newcommand{\StringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}}
-\newcommand{\VariableTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
-\newcommand{\VerbatimStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}}
-\newcommand{\WarningTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
-\usepackage{graphicx,grffile}
-\makeatletter
-\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi}
-\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi}
-\makeatother
-% Scale images if necessary, so that they will not overflow the page
-% margins by default, and it is still possible to overwrite the defaults
-% using explicit options in \includegraphics[width, height, ...]{}
-\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio}
-% Set default figure placement to htbp
-\makeatletter
-\def\fps@figure{htbp}
-\makeatother
-\setlength{\emergencystretch}{3em} % prevent overfull lines
-\providecommand{\tightlist}{%
-  \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}}
-\setcounter{secnumdepth}{5}
-\usepackage{fancyhdr}
-\pagestyle{fancy}
-\fancyhead{}
-\fancyhead[CO,CE]{Creating Shiny App}
-\fancyfoot[CO,CE]{PatientLevelPrediction Package Version 3.1.0}
-\fancyfoot[LE,RO]{\thepage}
-\renewcommand{\headrulewidth}{0.4pt}
-\renewcommand{\footrulewidth}{0.4pt}
-
-\title{Creating Shiny App}
-\author{Jenna Reps}
-\date{2020-06-03}
-
-\begin{document}
-\maketitle
-
-{
-\setcounter{tocdepth}{2}
-\tableofcontents
-}
-\hypertarget{introduction}{%
-\section{Introduction}\label{introduction}}
-
-In this vignette we will show, with example code, how to create a shiny
-app and make the shiny app available online for other researchers around
-the world to explore.
-
-There are two ways to create the shiny app: 1) using the Atlas-generated
-prediction R package, or 2) manually using the PatientLevelPrediction
-functions in a script.
-
-We assume you have experience using the OHDSI PatientLevelPrediction
-package to develop and externally validate prediction models using data
-in the OMOP CDM.
If you do not have experience with this, then please first read our
-general vignette
-\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/BuildingPredictiveModels.pdf}{\texttt{BuildingPredictiveModels}
-vignette}.
-
-\hypertarget{atlas-development-shiny-app}{%
-\section{Atlas Development Shiny
-App}\label{atlas-development-shiny-app}}
-
-\hypertarget{step-1-run-the-model-development-package-to-get-results}{%
-\subsection{Step 1: Run the model development package to get
-results}\label{step-1-run-the-model-development-package-to-get-results}}
-
-To create a shiny app project via the Atlas auto-generated prediction R
-package that you named `myPackage', you need to run the execute
-function:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\KeywordTok{library}\NormalTok{(myPackage)}
-\NormalTok{myPackage}\OperatorTok{::}\KeywordTok{execute}\NormalTok{(}\DataTypeTok{connectionDetails =}\NormalTok{ connectionDetails,}
-                   \DataTypeTok{cdmDatabaseSchema =} \StringTok{'myDatabaseSchema.dbo'}\NormalTok{,}
-                   \DataTypeTok{cdmDatabaseName =} \StringTok{'MyDatabase'}\NormalTok{,}
-                   \DataTypeTok{cohortDatabaseSchema =} \StringTok{'myDatabaseSchema.ohdsi_results'}\NormalTok{,}
-                   \DataTypeTok{cohortTable =} \StringTok{'cohort'}\NormalTok{,}
-                   \DataTypeTok{outputFolder =} \StringTok{'C:/myResults'}\NormalTok{,}
-                   \DataTypeTok{createProtocol =}\NormalTok{ F,}
-                   \DataTypeTok{createCohorts =}\NormalTok{ F,}
-                   \DataTypeTok{runAnalyses =}\NormalTok{ T,}
-                   \DataTypeTok{createResultsDoc =}\NormalTok{ F,}
-                   \DataTypeTok{packageResults =}\NormalTok{ F,}
-                   \DataTypeTok{createValidationPackage =}\NormalTok{ F, }
-                   \DataTypeTok{minCellCount=} \DecValTok{5}\NormalTok{,}
-                   \DataTypeTok{createShiny =}\NormalTok{ F,}
-                   \DataTypeTok{createJournalDocument =}\NormalTok{ F,}
-                   \DataTypeTok{analysisIdDocument =} \DecValTok{1}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-This will extract data based on the settings you supplied in the Atlas
-prediction design from cohort tables already generated in your CDM
-database schema. The PatientLevelPrediction framework will then run and
-develop/evaluate models, saving the results to the location specified by
-outputFolder (e.g., `C:/myResults').
-
-\hypertarget{step-2-create-the-shiny-app}{%
-\subsection{Step 2: Create the shiny
-app}\label{step-2-create-the-shiny-app}}
-
-To create a shiny app project with these results you can then simply
-run:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{myPackage}\OperatorTok{::}\KeywordTok{execute}\NormalTok{(}\DataTypeTok{connectionDetails =}\NormalTok{ connectionDetails,}
-                   \DataTypeTok{cdmDatabaseSchema =} \StringTok{'myDatabaseSchema.dbo'}\NormalTok{,}
-                   \DataTypeTok{cdmDatabaseName =} \StringTok{'MyDatabase'}\NormalTok{,}
-                   \DataTypeTok{cohortDatabaseSchema =} \StringTok{'myDatabaseSchema.ohdsi_results'}\NormalTok{,}
-                   \DataTypeTok{cohortTable =} \StringTok{'cohort'}\NormalTok{,}
-                   \DataTypeTok{outputFolder =} \StringTok{'C:/myResults'}\NormalTok{,}
-                   \DataTypeTok{minCellCount=} \DecValTok{5}\NormalTok{,}
-                   \DataTypeTok{createShiny =}\NormalTok{ T)}
-\end{Highlighting}
-\end{Shaded}
-
-making sure the outputFolder is the same location used when you ran the
-analysis. This code populates a shiny app project with the results but
-removes sensitive data such as connection settings, the
-cdmDatabaseSchema setting, the prediction matrix and any sensitive
-counts less than `minCellCount' from the covariate summary and
-performance evaluation.
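-
-If you prefer to launch the populated app from an R console rather than
-from RStudio (the RStudio route is described in the Testing section
-below), a minimal sketch is shown here. The path is the example
-outputFolder used above and is only illustrative; adjust it to your own
-location.
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\CommentTok{# launch the generated shiny app project directly (path is illustrative)}
-\NormalTok{shiny}\OperatorTok{::}\KeywordTok{runApp}\NormalTok{(}\StringTok{'C:/myResults/ShinyApp'}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}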
-
-The shiny app project populated with the model development results can
-then be found at `{[}outputFolder{]}/ShinyApp', e.g.,
-`C:/myResults/ShinyApp'.
-
-\hypertarget{testing-optional-but-recommended}{%
-\subsubsection{Testing (Optional but
-recommended)}\label{testing-optional-but-recommended}}
-
-You can test the app by opening the shiny project within the
-`{[}outputFolder{]}/ShinyApp' folder: double click on the file named
-`PLPViewer.Rproj'. This will open an RStudio session with the shiny app
-project loaded. Now open the `ui.R' file within this RStudio session
-and you will see a green arrow with the words `Run App' at the top right
-of the script. Click on this and the shiny app will open. Note: You may
-need to install some R package dependencies for the shiny app to work.
-
-\hypertarget{step-3-sharing-the-shiny-app}{%
-\subsection{Step 3: Sharing the shiny
-app}\label{step-3-sharing-the-shiny-app}}
-
-Once you are happy with your app, you can publish it to
-\url{https://data.ohdsi.org} by adding the folder `ShinyApp' to the
-OHDSI github ShinyDeploy repository
-(\url{https://github.com/OHDSI/ShinyDeploy/}). Continuing the example,
-we would copy the folder `{[}outputFolder{]}/ShinyApp' and paste it into
-the local github clone of ShinyDeploy. We recommend renaming the folder
-from `ShinyApp' to a name that describes your prediction, e.g.,
-`StrokeInAF'. Then commit the changes and make a pull request to
-ShinyDeploy. Once accepted, your shiny app will be viewable at
-`\url{https://data.ohdsi.org}'. If you committed the folder named
-`StrokeInAF' then the shiny app will be hosted at
-`\url{https://data.ohdsi.org/StrokeInAF}'.
-
-\hypertarget{atlas-external-validation}{%
-\section{Atlas External Validation}\label{atlas-external-validation}}
-
-To include external validation results you can use the Atlas-generated R
-study package to create the external validation package:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{myPackage}\OperatorTok{::}\KeywordTok{execute}\NormalTok{(}\DataTypeTok{connectionDetails =}\NormalTok{ connectionDetails,}
-                   \DataTypeTok{cdmDatabaseSchema =} \StringTok{'myDatabaseSchema.dbo'}\NormalTok{,}
-                   \DataTypeTok{cdmDatabaseName =} \StringTok{'MyDatabase'}\NormalTok{,}
-                   \DataTypeTok{cohortDatabaseSchema =} \StringTok{'myDatabaseSchema.ohdsi_results'}\NormalTok{,}
-                   \DataTypeTok{cohortTable =} \StringTok{'cohort'}\NormalTok{,}
-                   \DataTypeTok{outputFolder =} \StringTok{'C:/myResults'}\NormalTok{,}
-                   \DataTypeTok{createValidationPackage =}\NormalTok{ T)}
-\end{Highlighting}
-\end{Shaded}
-
-This will create a new R package inside the `outputFolder' location with
-the word `Validation' appended to the name of your development package.
-For example, if your `outputFolder' was `C:/myResults' and your
-development package was named `myPackage' then the validation package
-will be found
-at: `C:/myResults/myPackageValidation'. 
When running the valdiation -package make sure to set the `outputFolder' to the Validation folder -within your model development outputFolder location: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{myPackageValidation}\OperatorTok{::}\KeywordTok{execute}\NormalTok{(}\DataTypeTok{connectionDetails =}\NormalTok{ connectionDetails,} - \DataTypeTok{databaseName =}\NormalTok{ databaseName,} - \DataTypeTok{cdmDatabaseSchema =}\NormalTok{ cdmDatabaseSchema,} - \DataTypeTok{cohortDatabaseSchema =}\NormalTok{ cohortDatabaseSchema,} - \DataTypeTok{oracleTempSchema =}\NormalTok{ oracleTempSchema,} - \DataTypeTok{cohortTable =}\NormalTok{ cohortTable,} - \DataTypeTok{outputFolder =} \StringTok{'C:/myResults/Validation'}\NormalTok{,} - \DataTypeTok{createCohorts =}\NormalTok{ T,} - \DataTypeTok{runValidation =}\NormalTok{ T,} - \DataTypeTok{packageResults =}\NormalTok{ F,} - \DataTypeTok{minCellCount =} \DecValTok{5}\NormalTok{,} - \DataTypeTok{sampleSize =} \OtherTok{NULL}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -Now you can rerun Steps 2-3 to populate the shiny app project that will -also include the validation results (as long as the validation results -are in the Validation folder found in the Step 1 outputFolder location -e.g., in `C:/myResults/Validation'). - -\hypertarget{combining-multiple-atlas-results-into-one-shiny-app}{% -\section{Combining multiple atlas results into one shiny -app:}\label{combining-multiple-atlas-results-into-one-shiny-app}} - -The code below can be used to combine multiple Atlas packages' results -into one shiny app: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{populateMultipleShinyApp <-}\StringTok{ }\ControlFlowTok{function}\NormalTok{(shinyDirectory,} -\NormalTok{ resultDirectory,} - \DataTypeTok{minCellCount =} \DecValTok{10}\NormalTok{,} - \DataTypeTok{databaseName =} \StringTok{'sharable name of development data'}\NormalTok{)\{} - - \CommentTok{#check inputs} - \ControlFlowTok{if}\NormalTok{(}\KeywordTok{missing}\NormalTok{(shinyDirectory))\{} -\NormalTok{ shinyDirectory <-}\StringTok{ }\KeywordTok{system.file}\NormalTok{(}\StringTok{"shiny"}\NormalTok{, }\StringTok{"PLPViewer"}\NormalTok{, }\DataTypeTok{package =} \StringTok{"SkeletonPredictionStudy"}\NormalTok{)} -\NormalTok{ \}} - \ControlFlowTok{if}\NormalTok{(}\KeywordTok{missing}\NormalTok{(resultDirectory))\{} - \KeywordTok{stop}\NormalTok{(}\StringTok{'Need to enter the resultDirectory'}\NormalTok{)} -\NormalTok{ \}} - - - \ControlFlowTok{for}\NormalTok{(i }\ControlFlowTok{in} \DecValTok{1}\OperatorTok{:}\KeywordTok{length}\NormalTok{(resultDirectory))\{} - \ControlFlowTok{if}\NormalTok{(}\OperatorTok{!}\KeywordTok{dir.exists}\NormalTok{(resultDirectory[i]))\{} - \KeywordTok{stop}\NormalTok{(}\KeywordTok{paste}\NormalTok{(}\StringTok{'resultDirectory '}\NormalTok{,i,}\StringTok{' does not exist'}\NormalTok{))} -\NormalTok{ \}} -\NormalTok{ \}} - -\NormalTok{ outputDirectory <-}\StringTok{ }\KeywordTok{file.path}\NormalTok{(shinyDirectory,}\StringTok{'data'}\NormalTok{)} - - \CommentTok{# create the shiny data folder} - \ControlFlowTok{if}\NormalTok{(}\OperatorTok{!}\KeywordTok{dir.exists}\NormalTok{(outputDirectory))\{} - \KeywordTok{dir.create}\NormalTok{(outputDirectory, }\DataTypeTok{recursive =}\NormalTok{ T)} -\NormalTok{ \}} - - - \CommentTok{# need to edit settings ...} -\NormalTok{ files <-}\StringTok{ }\KeywordTok{c}\NormalTok{()} - \ControlFlowTok{for}\NormalTok{(i }\ControlFlowTok{in} \DecValTok{1}\OperatorTok{:}\KeywordTok{length}\NormalTok{(resultDirectory))\{} - 
\CommentTok{# copy the settings csv} -\NormalTok{ file <-}\StringTok{ }\NormalTok{utils}\OperatorTok{::}\KeywordTok{read.csv}\NormalTok{(}\KeywordTok{file.path}\NormalTok{(resultDirectory[i],}\StringTok{'settings.csv'}\NormalTok{))} -\NormalTok{ file}\OperatorTok{$}\NormalTok{analysisId <-}\StringTok{ }\DecValTok{1000}\OperatorTok{*}\KeywordTok{as.double}\NormalTok{(file}\OperatorTok{$}\NormalTok{analysisId)}\OperatorTok{+}\NormalTok{i} -\NormalTok{ files <-}\StringTok{ }\KeywordTok{rbind}\NormalTok{(files, file)} -\NormalTok{ \}} -\NormalTok{ utils}\OperatorTok{::}\KeywordTok{write.csv}\NormalTok{(files, }\KeywordTok{file.path}\NormalTok{(outputDirectory,}\StringTok{'settings.csv'}\NormalTok{), }\DataTypeTok{row.names =}\NormalTok{ F)} - - \ControlFlowTok{for}\NormalTok{(i }\ControlFlowTok{in} \DecValTok{1}\OperatorTok{:}\KeywordTok{length}\NormalTok{(resultDirectory))\{} - \CommentTok{# copy each analysis as a rds file and copy the log} -\NormalTok{ files <-}\StringTok{ }\KeywordTok{dir}\NormalTok{(resultDirectory[i], }\DataTypeTok{full.names =}\NormalTok{ F)} -\NormalTok{ files <-}\StringTok{ }\NormalTok{files[}\KeywordTok{grep}\NormalTok{(}\StringTok{'Analysis'}\NormalTok{, files)]} - \ControlFlowTok{for}\NormalTok{(file }\ControlFlowTok{in}\NormalTok{ files)\{} - - \ControlFlowTok{if}\NormalTok{(}\OperatorTok{!}\KeywordTok{dir.exists}\NormalTok{(}\KeywordTok{file.path}\NormalTok{(outputDirectory,}\KeywordTok{paste0}\NormalTok{(}\StringTok{'Analysis_'}\NormalTok{,}\DecValTok{1000}\OperatorTok{*}\KeywordTok{as.double}\NormalTok{(}\KeywordTok{gsub}\NormalTok{(}\StringTok{'Analysis_'}\NormalTok{,}\StringTok{''}\NormalTok{,file))}\OperatorTok{+}\NormalTok{i))))\{} - \KeywordTok{dir.create}\NormalTok{(}\KeywordTok{file.path}\NormalTok{(outputDirectory,}\KeywordTok{paste0}\NormalTok{(}\StringTok{'Analysis_'}\NormalTok{,}\DecValTok{1000}\OperatorTok{*}\KeywordTok{as.double}\NormalTok{(}\KeywordTok{gsub}\NormalTok{(}\StringTok{'Analysis_'}\NormalTok{,}\StringTok{''}\NormalTok{,file))}\OperatorTok{+}\NormalTok{i)))} -\NormalTok{ \}} - - \ControlFlowTok{if}\NormalTok{(}\KeywordTok{dir.exists}\NormalTok{(}\KeywordTok{file.path}\NormalTok{(resultDirectory[i],file, }\StringTok{'plpResult'}\NormalTok{)))\{} -\NormalTok{ res <-}\StringTok{ }\NormalTok{PatientLevelPrediction}\OperatorTok{::}\KeywordTok{loadPlpResult}\NormalTok{(}\KeywordTok{file.path}\NormalTok{(resultDirectory[i],file, }\StringTok{'plpResult'}\NormalTok{))} -\NormalTok{ res <-}\StringTok{ }\NormalTok{PatientLevelPrediction}\OperatorTok{::}\KeywordTok{transportPlp}\NormalTok{(res, }\DataTypeTok{n=}\NormalTok{ minCellCount, } - \DataTypeTok{save =}\NormalTok{ F, }\DataTypeTok{dataName =}\NormalTok{ databaseName[i])} - -\NormalTok{ res}\OperatorTok{$}\NormalTok{covariateSummary <-}\StringTok{ }\NormalTok{res}\OperatorTok{$}\NormalTok{covariateSummary[res}\OperatorTok{$}\NormalTok{covariateSummary}\OperatorTok{$}\NormalTok{covariateValue}\OperatorTok{!=}\DecValTok{0}\NormalTok{,]} -\NormalTok{ covSet <-}\StringTok{ }\NormalTok{res}\OperatorTok{$}\NormalTok{model}\OperatorTok{$}\NormalTok{metaData}\OperatorTok{$}\NormalTok{call}\OperatorTok{$}\NormalTok{covariateSettings} -\NormalTok{ res}\OperatorTok{$}\NormalTok{model}\OperatorTok{$}\NormalTok{metaData <-}\StringTok{ }\OtherTok{NULL} -\NormalTok{ res}\OperatorTok{$}\NormalTok{model}\OperatorTok{$}\NormalTok{metaData}\OperatorTok{$}\NormalTok{call}\OperatorTok{$}\NormalTok{covariateSettings <-}\StringTok{ }\NormalTok{covSet} -\NormalTok{ 
res}\OperatorTok{$}\NormalTok{model}\OperatorTok{$}\NormalTok{predict <-}\StringTok{ }\OtherTok{NULL} - \ControlFlowTok{if}\NormalTok{(}\OperatorTok{!}\KeywordTok{is.null}\NormalTok{(res}\OperatorTok{$}\NormalTok{performanceEvaluation}\OperatorTok{$}\NormalTok{evaluationStatistics))\{} -\NormalTok{ res}\OperatorTok{$}\NormalTok{performanceEvaluation}\OperatorTok{$}\NormalTok{evaluationStatistics[,}\DecValTok{1}\NormalTok{] <-}\StringTok{ }\KeywordTok{paste0}\NormalTok{(}\StringTok{'Analysis_'}\NormalTok{,}\DecValTok{1000}\OperatorTok{*}\KeywordTok{as.double}\NormalTok{(}\KeywordTok{gsub}\NormalTok{(}\StringTok{'Analysis_'}\NormalTok{,}\StringTok{''}\NormalTok{,file))}\OperatorTok{+}\NormalTok{i)} -\NormalTok{ \} }\ControlFlowTok{else}\NormalTok{\{} - \KeywordTok{writeLines}\NormalTok{(}\KeywordTok{paste0}\NormalTok{(resultDirectory[i],file, }\StringTok{'-ev'}\NormalTok{))} -\NormalTok{ \}} - \ControlFlowTok{if}\NormalTok{(}\OperatorTok{!}\KeywordTok{is.null}\NormalTok{(res}\OperatorTok{$}\NormalTok{performanceEvaluation}\OperatorTok{$}\NormalTok{thresholdSummary))\{} -\NormalTok{ res}\OperatorTok{$}\NormalTok{performanceEvaluation}\OperatorTok{$}\NormalTok{thresholdSummary[,}\DecValTok{1}\NormalTok{] <-}\StringTok{ }\KeywordTok{paste0}\NormalTok{(}\StringTok{'Analysis_'}\NormalTok{,}\DecValTok{1000}\OperatorTok{*}\KeywordTok{as.double}\NormalTok{(}\KeywordTok{gsub}\NormalTok{(}\StringTok{'Analysis_'}\NormalTok{,}\StringTok{''}\NormalTok{,file))}\OperatorTok{+}\NormalTok{i)} -\NormalTok{ \}}\ControlFlowTok{else}\NormalTok{\{} - \KeywordTok{writeLines}\NormalTok{(}\KeywordTok{paste0}\NormalTok{(resultDirectory[i],file, }\StringTok{'-thres'}\NormalTok{))} -\NormalTok{ \}} - \ControlFlowTok{if}\NormalTok{(}\OperatorTok{!}\KeywordTok{is.null}\NormalTok{(res}\OperatorTok{$}\NormalTok{performanceEvaluation}\OperatorTok{$}\NormalTok{demographicSummary))\{} -\NormalTok{ res}\OperatorTok{$}\NormalTok{performanceEvaluation}\OperatorTok{$}\NormalTok{demographicSummary[,}\DecValTok{1}\NormalTok{] <-}\StringTok{ }\KeywordTok{paste0}\NormalTok{(}\StringTok{'Analysis_'}\NormalTok{,}\DecValTok{1000}\OperatorTok{*}\KeywordTok{as.double}\NormalTok{(}\KeywordTok{gsub}\NormalTok{(}\StringTok{'Analysis_'}\NormalTok{,}\StringTok{''}\NormalTok{,file))}\OperatorTok{+}\NormalTok{i)} -\NormalTok{ \} }\ControlFlowTok{else}\NormalTok{\{} - \KeywordTok{writeLines}\NormalTok{(}\KeywordTok{paste0}\NormalTok{(resultDirectory[i],file, }\StringTok{'-dem'}\NormalTok{))} -\NormalTok{ \}} - \ControlFlowTok{if}\NormalTok{(}\OperatorTok{!}\KeywordTok{is.null}\NormalTok{(res}\OperatorTok{$}\NormalTok{performanceEvaluation}\OperatorTok{$}\NormalTok{calibrationSummary))\{} -\NormalTok{ res}\OperatorTok{$}\NormalTok{performanceEvaluation}\OperatorTok{$}\NormalTok{calibrationSummary[,}\DecValTok{1}\NormalTok{] <-}\StringTok{ }\KeywordTok{paste0}\NormalTok{(}\StringTok{'Analysis_'}\NormalTok{,}\DecValTok{1000}\OperatorTok{*}\KeywordTok{as.double}\NormalTok{(}\KeywordTok{gsub}\NormalTok{(}\StringTok{'Analysis_'}\NormalTok{,}\StringTok{''}\NormalTok{,file))}\OperatorTok{+}\NormalTok{i)} -\NormalTok{ \}}\ControlFlowTok{else}\NormalTok{\{} - \KeywordTok{writeLines}\NormalTok{(}\KeywordTok{paste0}\NormalTok{(resultDirectory[i],file, }\StringTok{'-cal'}\NormalTok{))} -\NormalTok{ \}} - \ControlFlowTok{if}\NormalTok{(}\OperatorTok{!}\KeywordTok{is.null}\NormalTok{(res}\OperatorTok{$}\NormalTok{performanceEvaluation}\OperatorTok{$}\NormalTok{predictionDistribution))\{} -\NormalTok{ 
res}\OperatorTok{$}\NormalTok{performanceEvaluation}\OperatorTok{$}\NormalTok{predictionDistribution[,}\DecValTok{1}\NormalTok{] <-}\StringTok{ }\KeywordTok{paste0}\NormalTok{(}\StringTok{'Analysis_'}\NormalTok{,}\DecValTok{1000}\OperatorTok{*}\KeywordTok{as.double}\NormalTok{(}\KeywordTok{gsub}\NormalTok{(}\StringTok{'Analysis_'}\NormalTok{,}\StringTok{''}\NormalTok{,file))}\OperatorTok{+}\NormalTok{i)} -\NormalTok{ \}}\ControlFlowTok{else}\NormalTok{\{} - \KeywordTok{writeLines}\NormalTok{(}\KeywordTok{paste0}\NormalTok{(resultDirectory[i],file, }\StringTok{'-dist'}\NormalTok{))} -\NormalTok{ \}} - \KeywordTok{saveRDS}\NormalTok{(res, }\KeywordTok{file.path}\NormalTok{(outputDirectory,}\KeywordTok{paste0}\NormalTok{(}\StringTok{'Analysis_'}\NormalTok{,}\DecValTok{1000}\OperatorTok{*}\KeywordTok{as.double}\NormalTok{(}\KeywordTok{gsub}\NormalTok{(}\StringTok{'Analysis_'}\NormalTok{,}\StringTok{''}\NormalTok{,file))}\OperatorTok{+}\NormalTok{i), }\StringTok{'plpResult.rds'}\NormalTok{))} -\NormalTok{ \}} - \ControlFlowTok{if}\NormalTok{(}\KeywordTok{file.exists}\NormalTok{(}\KeywordTok{file.path}\NormalTok{(resultDirectory[i],file, }\StringTok{'plpLog.txt'}\NormalTok{)))\{} - \KeywordTok{file.copy}\NormalTok{(}\DataTypeTok{from =} \KeywordTok{file.path}\NormalTok{(resultDirectory[i],file, }\StringTok{'plpLog.txt'}\NormalTok{), } - \DataTypeTok{to =} \KeywordTok{file.path}\NormalTok{(outputDirectory,}\KeywordTok{paste0}\NormalTok{(}\StringTok{'Analysis_'}\NormalTok{,}\DecValTok{1000}\OperatorTok{*}\KeywordTok{as.double}\NormalTok{(}\KeywordTok{gsub}\NormalTok{(}\StringTok{'Analysis_'}\NormalTok{,}\StringTok{''}\NormalTok{,file))}\OperatorTok{+}\NormalTok{i), }\StringTok{'plpLog.txt'}\NormalTok{))} -\NormalTok{ \}} -\NormalTok{ \}} -\NormalTok{ \}} - - - - \ControlFlowTok{for}\NormalTok{(i }\ControlFlowTok{in} \DecValTok{1}\OperatorTok{:}\KeywordTok{length}\NormalTok{(resultDirectory))\{} - \CommentTok{# copy any validation results} - \ControlFlowTok{if}\NormalTok{(}\KeywordTok{dir.exists}\NormalTok{(}\KeywordTok{file.path}\NormalTok{(resultDirectory[i],}\StringTok{'Validation'}\NormalTok{)))\{} -\NormalTok{ valFolders <-}\StringTok{ }\KeywordTok{dir}\NormalTok{(}\KeywordTok{file.path}\NormalTok{(resultDirectory[i],}\StringTok{'Validation'}\NormalTok{), }\DataTypeTok{full.names =}\NormalTok{ F)} - - \ControlFlowTok{if}\NormalTok{(}\KeywordTok{length}\NormalTok{(valFolders)}\OperatorTok{>}\DecValTok{0}\NormalTok{)\{} - \CommentTok{# move each of the validation rds} - \ControlFlowTok{for}\NormalTok{(valFolder }\ControlFlowTok{in}\NormalTok{ valFolders)\{} - - \CommentTok{# get the analysisIds} -\NormalTok{ valSubfolders <-}\StringTok{ }\KeywordTok{dir}\NormalTok{(}\KeywordTok{file.path}\NormalTok{(resultDirectory[i],}\StringTok{'Validation'}\NormalTok{,valFolder), }\DataTypeTok{full.names =}\NormalTok{ F)} - \ControlFlowTok{if}\NormalTok{(}\KeywordTok{length}\NormalTok{(valSubfolders)}\OperatorTok{!=}\DecValTok{0}\NormalTok{)\{} - \ControlFlowTok{for}\NormalTok{(valSubfolder }\ControlFlowTok{in}\NormalTok{ valSubfolders )\{} -\NormalTok{ valSubfolderUpdate <-}\StringTok{ }\KeywordTok{paste0}\NormalTok{(}\StringTok{'Analysis_'}\NormalTok{, }\KeywordTok{as.double}\NormalTok{(}\KeywordTok{gsub}\NormalTok{(}\StringTok{'Analysis_'}\NormalTok{,}\StringTok{''}\NormalTok{, valSubfolder))}\OperatorTok{*}\DecValTok{1000}\OperatorTok{+}\NormalTok{i)} -\NormalTok{ valOut <-}\StringTok{ }\KeywordTok{file.path}\NormalTok{(valFolder,valSubfolderUpdate)} -\NormalTok{ valOutOld <-}\StringTok{ 
}\KeywordTok{file.path}\NormalTok{(valFolder,valSubfolder)} - \ControlFlowTok{if}\NormalTok{(}\OperatorTok{!}\KeywordTok{dir.exists}\NormalTok{(}\KeywordTok{file.path}\NormalTok{(outputDirectory,}\StringTok{'Validation'}\NormalTok{,valOut)))\{} - \KeywordTok{dir.create}\NormalTok{(}\KeywordTok{file.path}\NormalTok{(outputDirectory,}\StringTok{'Validation'}\NormalTok{,valOut), }\DataTypeTok{recursive =}\NormalTok{ T)} -\NormalTok{ \}} - - - \ControlFlowTok{if}\NormalTok{(}\KeywordTok{file.exists}\NormalTok{(}\KeywordTok{file.path}\NormalTok{(resultDirectory[i],}\StringTok{'Validation'}\NormalTok{,valOutOld, }\StringTok{'validationResult.rds'}\NormalTok{)))\{} -\NormalTok{ res <-}\StringTok{ }\KeywordTok{readRDS}\NormalTok{(}\KeywordTok{file.path}\NormalTok{(resultDirectory[i],}\StringTok{'Validation'}\NormalTok{,valOutOld, }\StringTok{'validationResult.rds'}\NormalTok{))} -\NormalTok{ res <-}\StringTok{ }\NormalTok{PatientLevelPrediction}\OperatorTok{::}\KeywordTok{transportPlp}\NormalTok{(res, }\DataTypeTok{n=}\NormalTok{ minCellCount, } - \DataTypeTok{save =}\NormalTok{ F, }\DataTypeTok{dataName =}\NormalTok{ databaseName[i])} -\NormalTok{ res}\OperatorTok{$}\NormalTok{covariateSummary <-}\StringTok{ }\NormalTok{res}\OperatorTok{$}\NormalTok{covariateSummary[res}\OperatorTok{$}\NormalTok{covariateSummary}\OperatorTok{$}\NormalTok{covariateValue}\OperatorTok{!=}\DecValTok{0}\NormalTok{,]} - \KeywordTok{saveRDS}\NormalTok{(res, }\KeywordTok{file.path}\NormalTok{(outputDirectory,}\StringTok{'Validation'}\NormalTok{,valOut, }\StringTok{'validationResult.rds'}\NormalTok{))} -\NormalTok{ \}} -\NormalTok{ \}} -\NormalTok{ \}} - -\NormalTok{ \}} - -\NormalTok{ \}} - -\NormalTok{ \}} -\NormalTok{ \}} - - \KeywordTok{return}\NormalTok{(outputDirectory)} - -\NormalTok{\}} -\end{Highlighting} -\end{Shaded} - -\hypertarget{example-code-to-combine-multiple-results}{% -\subsection{Example code to combine multiple -results}\label{example-code-to-combine-multiple-results}} - -The following code will combine the results found in `C:/myResults', -`C:/myResults2' and `C:/myResults3' into the shiny project at -`C:/R/library/myPackage/shiny/PLPViewer': - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{populateMultipleShinyApp}\NormalTok{(}\DataTypeTok{shinyDirectory =} \StringTok{'C:/R/library/myPackage/shiny/PLPViewer'}\NormalTok{,} - \DataTypeTok{resultDirectory =} \KeywordTok{c}\NormalTok{(}\StringTok{'C:/myResults'}\NormalTok{,} - \StringTok{'C:/myResults2'}\NormalTok{,} - \StringTok{'C:/myResults3'}\NormalTok{),} - \DataTypeTok{minCellCount =} \DecValTok{0}\NormalTok{,} - \DataTypeTok{databaseName =} \KeywordTok{c}\NormalTok{(}\StringTok{'database1'}\NormalTok{,}\StringTok{'database2'}\NormalTok{,}\StringTok{'database3'}\NormalTok{))} -\end{Highlighting} -\end{Shaded} - -\hypertarget{manual-app-creation}{% -\section{Manual App Creation}\label{manual-app-creation}} - -{[}instructions coming soon{]} - -\hypertarget{acknowledgments}{% -\section{Acknowledgments}\label{acknowledgments}} - -Considerable work has been dedicated to provide the -\texttt{PatientLevelPrediction} package. - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{citation}\NormalTok{(}\StringTok{"PatientLevelPrediction"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -## -## To cite PatientLevelPrediction in publications use: -## -## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). 
"Design and -## implementation of a standardized framework to generate and evaluate patient-level -## prediction models using observational healthcare data." _Journal of the American -## Medical Informatics Association_, *25*(8), 969-975. . -## -## A BibTeX entry for LaTeX users is -## -## @Article{, -## author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek}, -## title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data}, -## journal = {Journal of the American Medical Informatics Association}, -## volume = {25}, -## number = {8}, -## pages = {969-975}, -## year = {2018}, -## url = {https://doi.org/10.1093/jamia/ocy032}, -## } -\end{verbatim} - -\textbf{Please reference this paper if you use the PLP Package in your -work:} - -\href{http://dx.doi.org/10.1093/jamia/ocy032}{Reps JM, Schuemie MJ, -Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a -standardized framework to generate and evaluate patient-level prediction -models using observational healthcare data. J Am Med Inform Assoc. -2018;25(8):969-975.} - -\end{document} diff --git a/inst/doc/Figure1.png b/inst/doc/Figure1.png deleted file mode 100644 index 878a509e2..000000000 Binary files a/inst/doc/Figure1.png and /dev/null differ diff --git a/inst/doc/InstallationGuide.pdf b/inst/doc/InstallationGuide.pdf index 4734063a1..605054361 100644 Binary files a/inst/doc/InstallationGuide.pdf and b/inst/doc/InstallationGuide.pdf differ diff --git a/inst/doc/InstallationGuide.tex b/inst/doc/InstallationGuide.tex deleted file mode 100644 index 937f4c3e7..000000000 --- a/inst/doc/InstallationGuide.tex +++ /dev/null @@ -1,369 +0,0 @@ -% Options for packages loaded elsewhere -\PassOptionsToPackage{unicode}{hyperref} -\PassOptionsToPackage{hyphens}{url} -% -\documentclass[ -]{article} -\usepackage{lmodern} -\usepackage{amssymb,amsmath} -\usepackage{ifxetex,ifluatex} -\ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex - \usepackage[T1]{fontenc} - \usepackage[utf8]{inputenc} - \usepackage{textcomp} % provide euro and other symbols -\else % if luatex or xetex - \usepackage{unicode-math} - \defaultfontfeatures{Scale=MatchLowercase} - \defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1} -\fi -% Use upquote if available, for straight quotes in verbatim environments -\IfFileExists{upquote.sty}{\usepackage{upquote}}{} -\IfFileExists{microtype.sty}{% use microtype if available - \usepackage[]{microtype} - \UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts -}{} -\makeatletter -\@ifundefined{KOMAClassName}{% if non-KOMA class - \IfFileExists{parskip.sty}{% - \usepackage{parskip} - }{% else - \setlength{\parindent}{0pt} - \setlength{\parskip}{6pt plus 2pt minus 1pt}} -}{% if KOMA class - \KOMAoptions{parskip=half}} -\makeatother -\usepackage{xcolor} -\IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available -\IfFileExists{bookmark.sty}{\usepackage{bookmark}}{\usepackage{hyperref}} -\hypersetup{ - pdftitle={Patient-Level Prediction Installation Guide}, - pdfauthor={Jenna Reps, Peter R. 
Rijnbeek}, - hidelinks, - pdfcreator={LaTeX via pandoc}} -\urlstyle{same} % disable monospaced font for URLs -\usepackage[margin=1in]{geometry} -\usepackage{color} -\usepackage{fancyvrb} -\newcommand{\VerbBar}{|} -\newcommand{\VERB}{\Verb[commandchars=\\\{\}]} -\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}} -% Add ',fontsize=\small' for more characters per line -\usepackage{framed} -\definecolor{shadecolor}{RGB}{248,248,248} -\newenvironment{Shaded}{\begin{snugshade}}{\end{snugshade}} -\newcommand{\AlertTok}[1]{\textcolor[rgb]{0.94,0.16,0.16}{#1}} -\newcommand{\AnnotationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\AttributeTok}[1]{\textcolor[rgb]{0.77,0.63,0.00}{#1}} -\newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\BuiltInTok}[1]{#1} -\newcommand{\CharTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\CommentTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}} -\newcommand{\CommentVarTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\ConstantTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\ControlFlowTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}} -\newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{#1}} -\newcommand{\DecValTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\DocumentationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\ErrorTok}[1]{\textcolor[rgb]{0.64,0.00,0.00}{\textbf{#1}}} -\newcommand{\ExtensionTok}[1]{#1} -\newcommand{\FloatTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\ImportTok}[1]{#1} -\newcommand{\InformationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}} -\newcommand{\NormalTok}[1]{#1} -\newcommand{\OperatorTok}[1]{\textcolor[rgb]{0.81,0.36,0.00}{\textbf{#1}}} -\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{#1}} -\newcommand{\PreprocessorTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}} -\newcommand{\RegionMarkerTok}[1]{#1} -\newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\SpecialStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\StringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\VariableTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\VerbatimStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\WarningTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\usepackage{graphicx,grffile} -\makeatletter -\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi} -\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi} -\makeatother -% Scale images if necessary, so that they will not overflow the page -% margins by default, and it is still possible to overwrite the defaults -% using explicit options in \includegraphics[width, height, ...]{} -\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio} -% Set default figure placement to htbp -\makeatletter -\def\fps@figure{htbp} -\makeatother -\setlength{\emergencystretch}{3em} % prevent overfull lines -\providecommand{\tightlist}{% - \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}} -\setcounter{secnumdepth}{5} -\usepackage{fancyhdr} -\pagestyle{fancy} -\fancyhead{} -\fancyhead[CO,CE]{Installation Guide} -\fancyfoot[CO,CE]{PatientLevelPrediction Package Version 3.1.0} -\fancyfoot[LE,RO]{\thepage} 
-\renewcommand{\headrulewidth}{0.4pt} -\renewcommand{\footrulewidth}{0.4pt} - -\title{Patient-Level Prediction Installation Guide} -\author{Jenna Reps, Peter R. Rijnbeek} -\date{2020-06-03} - -\begin{document} -\maketitle - -{ -\setcounter{tocdepth}{2} -\tableofcontents -} -\hypertarget{introduction}{% -\section{Introduction}\label{introduction}} - -This vignette describes how you need to install the Observational Health -Data Sciencs and Informatics (OHDSI) -\href{http://github.com/OHDSI/PatientLevelPrediction}{\texttt{PatientLevelPrediction}} -package under Windows, Mac, and Linux. - -\hypertarget{software-prerequisites}{% -\section{Software Prerequisites}\label{software-prerequisites}} - -\hypertarget{windows-users}{% -\subsection{Windows Users}\label{windows-users}} - -Under Windows the OHDSI Patient Level Prediction (PLP) package requires -installing: - -\begin{itemize} -\tightlist -\item - R (\url{https://cran.cnr.berkeley.edu/} ) - (R \textgreater= 3.3.0, - but latest is recommended) -\item - Rstudio (\url{https://www.rstudio.com/} ) -\item - Java (\url{http://www.java.com} ) -\item - RTools (\url{https://cran.r-project.org/bin/windows/Rtools/}) -\end{itemize} - -\hypertarget{maclinux-users}{% -\subsection{Mac/Linux Users}\label{maclinux-users}} - -Under Mac and Linux the OHDSI Patient Level Prediction (PLP) package -requires installing: - -\begin{itemize} -\tightlist -\item - R (\url{https://cran.cnr.berkeley.edu/} ) - (R \textgreater= 3.3.0, - but latest is recommended) -\item - Rstudio (\url{https://www.rstudio.com/} ) -\item - Java (\url{http://www.java.com} ) -\item - Xcode command line tools(run in terminal: xcode-select --install) - {[}MAC USERS ONLY{]} -\end{itemize} - -\hypertarget{installing-the-package}{% -\section{Installing the Package}\label{installing-the-package}} - -The preferred way to install the package is by using drat, which will -automatically install the latest release and all the latest -dependencies. If the drat code fails or you do not want the official -release you could use devtools to install the bleading edge version of -the package (latest master). Note that the latest master could contain -bugs, please report them to us if you experience problems. - -\hypertarget{installing-patientlevelprediction-using-drat}{% -\subsection{Installing PatientLevelPrediction using -drat}\label{installing-patientlevelprediction-using-drat}} - -To install using drat run: - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{install.packages}\NormalTok{(}\StringTok{"drat"}\NormalTok{)} -\NormalTok{drat}\OperatorTok{::}\KeywordTok{addRepo}\NormalTok{(}\StringTok{"OHDSI"}\NormalTok{)} -\KeywordTok{install.packages}\NormalTok{(}\StringTok{"PatientLevelPrediction"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\hypertarget{installing-patientlevelprediction-using-devtools}{% -\subsection{Installing PatientLevelPrediction using -devtools}\label{installing-patientlevelprediction-using-devtools}} - -To install using devtools run: - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{install.packages}\NormalTok{(}\StringTok{'devtools'}\NormalTok{)} -\NormalTok{devtools}\OperatorTok{::}\KeywordTok{install_github}\NormalTok{(}\StringTok{"OHDSI/FeatureExtraction"}\NormalTok{)} -\NormalTok{devtools}\OperatorTok{::}\KeywordTok{install_github}\NormalTok{(}\StringTok{'ohdsi/PatientLevelPrediction'}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -When installing using devtools make sure to close any other Rstudio -sessions that are using PatientLevelPrediction or any dependency. 
-Keeping Rstudio sessions open can cause locks that prevent the package -installing. - -\hypertarget{creating-python-reticulate-environment}{% -\section{Creating Python Reticulate -Environment}\label{creating-python-reticulate-environment}} - -Many of the classifiers in the PatientLevelPrediction use a Python back -end. To set up a python environment run: - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{library}\NormalTok{(PatientLevelPrediction)} -\NormalTok{reticulate}\OperatorTok{::}\KeywordTok{install_miniconda}\NormalTok{()} -\KeywordTok{configurePython}\NormalTok{(}\DataTypeTok{envname=}\StringTok{'r-reticulate'}\NormalTok{, }\DataTypeTok{envtype=}\StringTok{'conda'}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -To add the R keras interface, in Rstudio run: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{devtools}\OperatorTok{::}\KeywordTok{install_github}\NormalTok{(}\StringTok{"rstudio/keras"}\NormalTok{)} -\KeywordTok{library}\NormalTok{(keras)} -\KeywordTok{install_keras}\NormalTok{()} -\end{Highlighting} -\end{Shaded} - -Some of the less frequently used classifiers are not installed during -this set-up to add them run: - -For GBM survival: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{reticulate}\OperatorTok{::}\KeywordTok{conda_install}\NormalTok{(}\DataTypeTok{envname=}\StringTok{'r-reticulate'}\NormalTok{, }\DataTypeTok{packages =} \KeywordTok{c}\NormalTok{(}\StringTok{'scikit-survival'}\NormalTok{), }\DataTypeTok{forge =} \OtherTok{TRUE}\NormalTok{, }\DataTypeTok{pip =} \OtherTok{FALSE}\NormalTok{, }\DataTypeTok{pip_ignore_installed =} \OtherTok{TRUE}\NormalTok{, }\DataTypeTok{conda =} \StringTok{"auto"}\NormalTok{, }\DataTypeTok{channel =} \StringTok{'sebp'}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -For any of the torch models: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{reticulate}\OperatorTok{::}\KeywordTok{conda_install}\NormalTok{(}\DataTypeTok{envname=}\StringTok{'r-reticulate'}\NormalTok{, }\DataTypeTok{packages =} \KeywordTok{c}\NormalTok{(}\StringTok{'pytorch'}\NormalTok{, }\StringTok{'torchvision'}\NormalTok{, }\StringTok{'cpuonly'}\NormalTok{), }\DataTypeTok{forge =} \OtherTok{TRUE}\NormalTok{, }\DataTypeTok{pip =} \OtherTok{FALSE}\NormalTok{, }\DataTypeTok{channel =} \StringTok{'pytorch'}\NormalTok{, }\DataTypeTok{pip_ignore_installed =} \OtherTok{TRUE}\NormalTok{, }\DataTypeTok{conda =} \StringTok{'auto'}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\hypertarget{testing-installation}{% -\section{Testing installation}\label{testing-installation}} - -To test whether the package is installed correctly run: - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{library}\NormalTok{(DatabaseConnector)} -\NormalTok{connectionDetails <-}\StringTok{ }\KeywordTok{createConnectionDetails}\NormalTok{(}\DataTypeTok{dbms =} \StringTok{'sql_server'}\NormalTok{, } - \DataTypeTok{user =} \StringTok{'username'}\NormalTok{, } - \DataTypeTok{password =} \StringTok{'hidden'}\NormalTok{, } - \DataTypeTok{server =} \StringTok{'your server'}\NormalTok{, } - \DataTypeTok{port =} \StringTok{'your port'}\NormalTok{)} -\NormalTok{PatientLevelPrediction}\OperatorTok{::}\KeywordTok{checkPlpInstallation}\NormalTok{(}\DataTypeTok{connectionDetails =}\NormalTok{ connectionDetails, } - \DataTypeTok{python =}\NormalTok{ T)} -\end{Highlighting} -\end{Shaded} - -To test the installation (excluding python) run: - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{library}\NormalTok{(DatabaseConnector)} -\NormalTok{connectionDetails <-}\StringTok{ 
}\KeywordTok{createConnectionDetails}\NormalTok{(}\DataTypeTok{dbms =} \StringTok{'sql_server'}\NormalTok{, } - \DataTypeTok{user =} \StringTok{'username'}\NormalTok{, } - \DataTypeTok{password =} \StringTok{'hidden'}\NormalTok{, } - \DataTypeTok{server =} \StringTok{'your server'}\NormalTok{, } - \DataTypeTok{port =} \StringTok{'your port'}\NormalTok{)} -\NormalTok{PatientLevelPrediction}\OperatorTok{::}\KeywordTok{checkPlpInstallation}\NormalTok{(}\DataTypeTok{connectionDetails =}\NormalTok{ connectionDetails, } - \DataTypeTok{python =}\NormalTok{ F)} -\end{Highlighting} -\end{Shaded} - -The check can take a while to run since it will build the following -models in sequence on simulated \url{data:Logistic} Regression, -RandomForest, MLP, AdaBoost, Decision Tree, Naive Bayes, KNN, Gradient -Boosting. Moreover, it will test the database connection. - -\hypertarget{installation-issues}{% -\section{Installation issues}\label{installation-issues}} - -Installation issues need to be posted in our issue tracker: -\url{http://github.com/OHDSI/PatientLevelPrediction/issues} - -The list below provides solutions for some common issues: - -\begin{enumerate} -\def\labelenumi{\arabic{enumi}.} -\item - If you have an error when trying to install a package in R saying - \textbf{`Dependancy X not available \ldots{}'} then this can sometimes - be fixed by running - \texttt{install.packages(\textquotesingle{}X\textquotesingle{})} and - then once that completes trying to reinstall the package that had the - error. -\item - I have found that using the github devtools to install packages can be - impacted if you have \textbf{multiple R sessions} open as one session - with a library open can causethe library to be locked and this can - prevent an install of a package that depends on that library. -\end{enumerate} - -\hypertarget{acknowledgments}{% -\section{Acknowledgments}\label{acknowledgments}} - -Considerable work has been dedicated to provide the -\texttt{PatientLevelPrediction} package. - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{citation}\NormalTok{(}\StringTok{"PatientLevelPrediction"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -## -## To cite PatientLevelPrediction in publications use: -## -## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design and -## implementation of a standardized framework to generate and evaluate patient-level -## prediction models using observational healthcare data." _Journal of the American -## Medical Informatics Association_, *25*(8), 969-975. . -## -## A BibTeX entry for LaTeX users is -## -## @Article{, -## author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek}, -## title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data}, -## journal = {Journal of the American Medical Informatics Association}, -## volume = {25}, -## number = {8}, -## pages = {969-975}, -## year = {2018}, -## url = {https://doi.org/10.1093/jamia/ocy032}, -## } -\end{verbatim} - -\textbf{Please reference this paper if you use the PLP Package in your -work:} - -\href{http://dx.doi.org/10.1093/jamia/ocy032}{Reps JM, Schuemie MJ, -Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a -standardized framework to generate and evaluate patient-level prediction -models using observational healthcare data. J Am Med Inform Assoc. 
-2018;25(8):969-975.} - -This work is supported in part through the National Science Foundation -grant IIS 1251151. - -\end{document} diff --git a/man/PatientLevelPrediction.Rd b/man/PatientLevelPrediction.Rd index 560e9be95..c3d7c5c47 100644 --- a/man/PatientLevelPrediction.Rd +++ b/man/PatientLevelPrediction.Rd @@ -7,3 +7,4 @@ \description{ A package for running predictions using data in the OMOP CDM } +\keyword{internal} diff --git a/man/createPreprocessSettings.Rd b/man/createPreprocessSettings.Rd index 17ae28208..6ce8ff1f7 100644 --- a/man/createPreprocessSettings.Rd +++ b/man/createPreprocessSettings.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/PreprocessingData.R \name{createPreprocessSettings} \alias{createPreprocessSettings} -\title{Create the settings for preprocessing the trainData using \code{ }.} +\title{Create the settings for preprocessing the trainData.} \usage{ createPreprocessSettings( minFraction = 0.001, @@ -21,7 +21,7 @@ createPreprocessSettings( An object of class \code{preprocessingSettings} } \description{ -Create the settings for preprocessing the trainData using \code{ }. +Create the settings for preprocessing the trainData. } \details{ Returns an object of class \code{preprocessingSettings} that specifies how to preprocess the training data diff --git a/man/fitPlp.Rd b/man/fitPlp.Rd index 6ff464174..84ef48595 100644 --- a/man/fitPlp.Rd +++ b/man/fitPlp.Rd @@ -4,7 +4,7 @@ \alias{fitPlp} \title{fitPlp} \usage{ -fitPlp(trainData, modelSettings, search = "grid", analysisId) +fitPlp(trainData, modelSettings, search = "grid", analysisId, analysisPath) } \arguments{ \item{trainData}{An object of type \code{TrainData} created using \code{splitData} @@ -22,6 +22,8 @@ data extracted from the CDM.} \item{search}{The search strategy for the hyper-parameter selection (currently not used)} \item{analysisId}{The id of the analysis} + +\item{analysisPath}{The path of the analysis} } \value{ An object of class \code{plpModel} containing: diff --git a/tests/testthat/test-KNN.R b/tests/testthat/test-KNN.R index 28280ca87..421a55fc2 100644 --- a/tests/testthat/test-KNN.R +++ b/tests/testthat/test-KNN.R @@ -6,7 +6,8 @@ test_that('KNN fit works', { plpModel <- fitPlp( trainData = tinyTrainData, modelSettings = modelSettings, - analysisId = 'KNN' + analysisId = 'KNN', + analysisPath = tempdir() ) expect_correct_fitPlp(plpModel, tinyTrainData) diff --git a/tests/testthat/test-LightGBM.R b/tests/testthat/test-LightGBM.R index 21fc5867b..f3ddef8f6 100644 --- a/tests/testthat/test-LightGBM.R +++ b/tests/testthat/test-LightGBM.R @@ -96,7 +96,8 @@ test_that("LightGBM working checks", { fitModel <- fitPlp( trainData = trainData, modelSettings = modelSettings, - analysisId = 'lgbmTest' + analysisId = 'lgbmTest', + analysisPath = tempdir() ) expect_equal(nrow(fitModel$prediction), nrow(trainData$labels)*2) diff --git a/tests/testthat/test-UploadToDatabase.R b/tests/testthat/test-UploadToDatabase.R index a6e960665..3184ddf13 100644 --- a/tests/testthat/test-UploadToDatabase.R +++ b/tests/testthat/test-UploadToDatabase.R @@ -167,7 +167,6 @@ test_that("results uploaded to database", { }) - test_that("database deletion", { skip_if(Sys.getenv('CI') != 'true', 'not run locally') createPlpResultTables( @@ -406,4 +405,52 @@ test_that("import from csv", { }) +# new - check null model just reports message +test_that("message if model is null", { + + model2 <- list(noModel = T) + attr(model2, "predictionFunction") <- 'noModel' + attr(model2, "saveType") <- 'RtoJson' + class(model2) <- 'plpModel' + 
+ plpResult2 <- plpResult + plpResult2$model <- model2 + + savePlpResult(plpResult2, file.path(tempdir(), 'null_model', 'Analysis_1', 'plpResult')) + + nullModelServerLoc <- file.path(tempdir(), 'nullModelDatabase') + if(!dir.exists(file.path(tempdir(), 'nullModelDatabase'))){ + dir.create(file.path(tempdir(), 'nullModelDatabase'), recursive = T) + } + nullModelResultConnDetails <- DatabaseConnector::createConnectionDetails( + dbms = 'sqlite', + server = file.path(nullModelServerLoc,'sqlite.sqlite') + ) + nullModelDatabaseSchemaSettings <- createDatabaseSchemaSettings( + resultSchema = 'main', + tablePrefix = '', + targetDialect = 'sqlite', + tempEmulationSchema = NULL + ) + + createPlpResultTables( + connectionDetails = nullModelResultConnDetails, + targetDialect = 'sqlite', + resultSchema = 'main', + deleteTables = T, + createTables = T, + tablePrefix = '' + ) + + testthat::expect_message( + addMultipleRunPlpToDatabase( + connectionDetails = nullModelResultConnDetails, + databaseSchemaSettings = nullModelDatabaseSchemaSettings, + resultLocation = file.path(tempdir(), 'null_model'), + modelSaveLocation = file.path(tempdir(), 'null_model', 'models') + ) + ) + +}) + diff --git a/tests/testthat/test-cyclopsModels.R b/tests/testthat/test-cyclopsModels.R index 5d83890e9..0bbf6d779 100644 --- a/tests/testthat/test-cyclopsModels.R +++ b/tests/testthat/test-cyclopsModels.R @@ -244,7 +244,8 @@ fitModel <- fitPlp( trainData = trainData, modelSettings = modelSettings, search = "grid", - analysisId = 'lrTest' + analysisId = 'lrTest', + analysisPath = tempdir() ) expect_equal(length(unique(fitModel$prediction$evaluationType)),2) diff --git a/tests/testthat/test-fitting.R b/tests/testthat/test-fitting.R index dbd9a7328..ad22e92bf 100644 --- a/tests/testthat/test-fitting.R +++ b/tests/testthat/test-fitting.R @@ -26,7 +26,8 @@ plpModel <- fitPlp( trainData = trainData, modelSettings = modelSettings, search = "grid", - analysisId = 'fitting' + analysisId = 'fitting', + analysisPath = tempdir() ) expect_is(plpModel, 'plpModel') @@ -38,7 +39,8 @@ test_that("fitPlp input errors", { expect_error( fitPlp( trainData = trainData, - modelSettings = modelSettings + modelSettings = modelSettings, + analysisPath = tempDir() ) ) @@ -46,7 +48,8 @@ test_that("fitPlp input errors", { fitPlp( trainData = list(covariateData = NULL), modelSettings = modelSettings, - analysisId = 'fitting' + analysisId = 'fitting', + analysisPath = tempDir() ) ) @@ -54,6 +57,15 @@ test_that("fitPlp input errors", { fitPlp( trainData = trainData, modelSettings = NULL, + analysisId = 'fitting', + analysisPath = tempDir() + ) + ) + + expect_error( + fitPlp( + trainData = trainData, + modelSettings = modelSettings, analysisId = 'fitting' ) ) diff --git a/tests/testthat/test-rclassifier.R b/tests/testthat/test-rclassifier.R index f1536f890..54195ca08 100644 --- a/tests/testthat/test-rclassifier.R +++ b/tests/testthat/test-rclassifier.R @@ -93,7 +93,8 @@ test_that("GBM working checks", { fitModel <- fitPlp( trainData = trainData, modelSettings = modelSettings, - analysisId = 'gbmTest' + analysisId = 'gbmTest', + analysisPath = tempdir() ) expect_equal(nrow(fitModel$prediction), nrow(trainData$labels)*2) diff --git a/tests/testthat/test-sklearnClassifier.R b/tests/testthat/test-sklearnClassifier.R index 620cb9a0f..fb000a7b9 100644 --- a/tests/testthat/test-sklearnClassifier.R +++ b/tests/testthat/test-sklearnClassifier.R @@ -64,7 +64,8 @@ test_that("check fit of DecisionTree", { plpModel <- fitPlp( trainData = tinyTrainData, modelSettings = 
modelSettings, - analysisId = 'DecisionTree' + analysisId = 'DecisionTree', + analysisPath = tempdir() ) expect_correct_fitPlp(plpModel, trainData) @@ -95,7 +96,8 @@ test_that('AdaBoost fit works', { plpModel <- fitPlp( trainData = tinyTrainData, modelSettings = modelSettings, - analysisId = 'Adaboost' + analysisId = 'Adaboost', + analysisPath = tempdir() ) expect_correct_fitPlp(plpModel, trainData) @@ -117,7 +119,8 @@ test_that('RandomForest fit works', { plpModel <- fitPlp( trainData = tinyTrainData, modelSettings = modelSettings, - analysisId = 'RandomForest' + analysisId = 'RandomForest', + analysisPath = tempdir() ) expect_correct_fitPlp(plpModel, trainData) @@ -139,7 +142,8 @@ test_that('MLP fit works', { plpModel <- fitPlp( trainData = tinyTrainData, modelSettings = modelSettings, - analysisId = 'MLP' + analysisId = 'MLP', + analysisPath = tempdir() ) expect_correct_fitPlp(plpModel, trainData) @@ -154,7 +158,8 @@ test_that('Naive bayes fit works', { plpModel <- fitPlp( trainData = tinyTrainData, modelSettings = modelSettings, - analysisId = 'Naive bayes' + analysisId = 'Naive bayes', + analysisPath = tempdir() ) expect_correct_fitPlp(plpModel, trainData) @@ -172,7 +177,8 @@ test_that('Support vector machine fit works', { plpModel <- fitPlp( trainData = tinyTrainData, modelSettings = modelSettings, - analysisId = 'SVM' + analysisId = 'SVM', + analysisPath = tempdir() ) expect_correct_fitPlp(plpModel, trainData) @@ -189,7 +195,8 @@ test_that('Sklearn predict works', { plpModel <- fitPlp( trainData = tinyTrainData, modelSettings = modelSettings, - analysisId = 'Adaboost' + analysisId = 'Adaboost', + analysisPath = tempdir() ) predictions <- predictPythonSklearn(plpModel, diff --git a/vignettes/AddingCustomFeatureEngineering.Rmd b/vignettes/AddingCustomFeatureEngineering.Rmd index e07587829..62feedd35 100644 --- a/vignettes/AddingCustomFeatureEngineering.Rmd +++ b/vignettes/AddingCustomFeatureEngineering.Rmd @@ -1,6 +1,6 @@ --- title: "Adding Custom Feature Engineering Functions" -author: "Jenna Reps" +author: "Jenna Reps, Egill Fridgeirsson" date: "`r Sys.Date()`" header-includes: - \usepackage{fancyhdr} @@ -19,61 +19,63 @@ output: number_sections: yes toc: yes --- + +```{=html} - +``` ```{r, echo = FALSE, message = FALSE, warning = FALSE} library(PatientLevelPrediction) ``` + # Introduction -This vignette describes how you can add your own custom function for feature engineering in the Observational Health Data Sciencs and Informatics (OHDSI) [`PatientLevelPrediction`](http://github.com/OHDSI/PatientLevelPrediction) package. This vignette assumes you have read and are comfortable with building single patient level prediction models as described in the [`BuildingPredictiveModels` vignette](https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/BuildingPredictiveModels.pdf). +This vignette describes how you can add your own custom function for feature engineering in the Observational Health Data Sciences and Informatics (OHDSI) [`PatientLevelPrediction`](http://github.com/OHDSI/PatientLevelPrediction) package. This vignette assumes you have read and are comfortable with building single patient level prediction models as described in the [`BuildingPredictiveModels` vignette](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/BuildingPredictiveModels.pdf). 
**We invite you to share your new feature engineering functions with the OHDSI community through our [GitHub repository](http://github.com/OHDSI/PatientLevelPrediction).** -# Feature Engineering Function Code Structure +# Feature Engineering Function Code Structure -To make a custom feature engineering function that can be used within PatientLevelPrediction you need to write two different functions. The 'create' function and the 'implement' function. +To make a custom feature engineering function that can be used within PatientLevelPrediction you need to write two different functions. The 'create' function and the 'implement' function. The 'create' function, e.g., create\, takes the parameters of the feature engineering 'implement' function as input, checks these are valid and outputs these as a list of class 'featureEngineeringSettings' with the 'fun' attribute specifying the 'implement' function to call. -The 'implement' function, e.g., implement\, must take as input: - * trainData - a list containing: - - covariateData: the plpData$covariateData restricted to the training patients - - labels: a data frame that contain rowId (patient identifier) and outcomeCount (the class labels) - - folds: a data.frame that contains rowId (patient identifier) and index (the cross validation fold) - * featureEngineeringSettings - the output of your create\ - -The 'implement' function can then do any manipulation of the trainData (adding new features or removing features) but must output a trainData object containing the new covariateData, labels and folds for the training data patients. +The 'implement' function, e.g., implement\, must take as input: + +- `trainData` - a list containing: + + - `covariateData`: the `plpData$covariateData`restricted to the training patients + + - `labels`: a data frame that contain `rowId`(patient identifier) and `outcomeCount` (the class labels) + + - `folds`: a data.frame that contains `rowId` (patient identifier) and `index` (the cross validation fold) + +- `featureEngineeringSettings` - the output of your create\ + +The 'implement' function can then do any manipulation of the `trainData` (adding new features or removing features) but must output a `trainData` object containing the new `covariateData`, `labels` and `folds` for the training data patients. # Example -Let's consider the situation where we wish to create an age spline feature. To make this custom feature engineering function we need to write the 'create' and 'implement' R functions. +Let's consider the situation where we wish to create an age spline feature. To make this custom feature engineering function we need to write the 'create' and 'implement' R functions. ## Create function -Our age spline feature function will create a new feature using the plpData$cohorts ageYear column. We will implement a restricted cubic spline that requires specifying the number of knots. -. Therefore, the inputs for this are: - * `knots` an integer/double specifying the number of knots - +Our age spline feature function will create a new feature using the `plpData$cohorts$ageYear` column. We will implement a restricted cubic spline that requires specifying the number of knots. Therefore, the inputs for this are: `knots` - an integer/double specifying the number of knots. 
+ ```{r, echo = TRUE, eval=FALSE} -createAgeSpine <- function( +createAgeSpline <- function( knots = 5 ){ - # add input checks - checkIsClass(knots, c('numeric','integer')) - checkHigher(knots,0) - # create list of inputs to implement function featureEngineeringSettings <- list( knots = knots ) # specify the function that will implement the sampling - attr(featureEngineeringSettings, "fun") <- "implementAgeSpine" + attr(featureEngineeringSettings, "fun") <- "implementAgeSplines" # make sure the object returned is of class "sampleSettings" class(featureEngineeringSettings) <- "featureEngineeringSettings" @@ -82,75 +84,76 @@ createAgeSpine <- function( } ``` -We now need to create the 'implement' function `implementAgeSpine()` +We now need to create the 'implement' function `implementAgeSplines()` ## Implement function -All 'implement' functions must take as input the trainData and the featureEngineeringSettings (this is the output of the 'create' function). They must return a trainData object containing the new covariateData, labels and folds. +All 'implement' functions must take as input the `trainData` and the `featureEngineeringSettings` (this is the output of the 'create' function). They must return a `trainData` object containing the new `covariateData`, `labels` and `folds`. -In our example, the `createAgeSpine()` will return a list with 'knots'. The featureEngineeringSettings therefore contains this. +In our example, the `createAgeSpline()` will return a list with 'knots'. The `featureEngineeringSettings` therefore contains this. ```{r tidy=FALSE,eval=FALSE} -implementAgeSpine <- function(trainData, featureEngineeringSettings){ - - # currently not used - knots <- featureEngineeringSettings$knots - - - # age in in trainData$labels as ageYear - ageData <- trainData$labels +implementAgeSplines <- function(trainData, featureEngineeringSettings, model=NULL) { + # if there is a model, it means this function is called through applyFeatureengineering, meaning it # should apply the model fitten on training data to the test data + if (is.null(model)) { + knots <- featureEngineeringSettings$knots + ageData <- trainData$labels + y <- ageData$outcomeCount + X <- ageData$ageYear + model <- mgcv::gam( + y ~ s(X, bs='cr', k=knots, m=2) + ) + newData <- data.frame( + rowId = ageData$rowId, + covariateId = 2002, + covariateValue = model$fitted.values + ) + } + else { + ageData <- trainData$labels + X <- trainData$labels$ageYear + y <- ageData$outcomeCount + newData <- data.frame(y=y, X=X) + yHat <- predict(model, newData) + newData <- data.frame( + rowId = trainData$labels$rowId, + covariateId = 2002, + covariateValue = yHat + ) + } - # now implement the code to do your desired feature engineering + # remove existing age if in covariates + trainData$covariateData$covariates <- trainData$covariateData$covariates |> + dplyr::filter(!covariateId %in% c(1002)) - data <- Matrix::sparseMatrix( - i = 1:length(ageData$rowId), - j = rep(1, length(ageData$rowId)), - x = ageData$ageYear, - dims=c(length(ageData$rowId),1) - ) + # update covRef + Andromeda::appendToTable(trainData$covariateData$covariateRef, + data.frame(covariateId=2002, + covariateName='Cubic restricted age splines', + analysisId=2, + conceptId=2002)) - data <- as.matrix(data) - x <- data[,1] - y <- ageData$outcomeCount + # update covariates + Andromeda::appendToTable(trainData$covariateData$covariates, newData) -mRCS <- rms::ols( - y~rms::rcs(x, - stats::quantile( - x, - c(0, .05, .275, .5, .775, .95, 1), - include.lowest = TRUE - ) - ) - ) - 
-newData <- data.frame( - rowId = ageData$rowId, - covariateId = 2002, - covariateValue = mRCS$fitted.values - ) - -# add new data -Andromeda::appendToTable(tbl = trainData$covariateData$covariates, - data = newData) - -featureEngeering <- list( - funct = 'implementAgeSpine', + featureEngineering <- list( + funct = 'implementAgeSplines', settings = list( - featureEngineeringSettings = featureEngineeringSettings + featureEngineeringSettings = featureEngineeringSettings, + model = model ) ) - attr(trainData, 'metaData')$featureEngineering = listAppend( - attr(trainData, 'metaData')$featureEngineering, - featureEngeering - ) - # return the updated trainData + attr(trainData$covariateData, 'metaData')$featureEngineering = listAppend( + attr(trainData$covariateData, 'metaData')$featureEngineering, + featureEngineering + ) + return(trainData) } ``` - # Acknowledgments Considerable work has been dedicated to provide the `PatientLevelPrediction` package. @@ -163,8 +166,4 @@ citation("PatientLevelPrediction") [Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.](http://dx.doi.org/10.1093/jamia/ocy032) - This work is supported in part through the National Science Foundation grant IIS 1251151. - - - diff --git a/vignettes/BuildingPredictiveModels.Rmd b/vignettes/BuildingPredictiveModels.Rmd index 7249f6421..b7bfbb3e8 100644 --- a/vignettes/BuildingPredictiveModels.Rmd +++ b/vignettes/BuildingPredictiveModels.Rmd @@ -368,7 +368,7 @@ In this section we assume that our cohorts have been created either by using ATL ### Data extraction -Now we can tell `PatientLevelPrediction` to extract all necessary data for our analysis. This is done using the [`FeatureExtractionPackage`](https://github.com/OHDSI/FeatureExtration). In short the FeatureExtractionPackage allows you to specify which features (covariates) need to be extracted, e.g. all conditions and drug exposures. It also supports the creation of custom covariates. For more detailed information on the FeatureExtraction package see its [vignettes](https://github.com/OHDSI/FeatureExtration). For our example study we decided to use these settings: +Now we can tell `PatientLevelPrediction` to extract all necessary data for our analysis. This is done using the [`FeatureExtractionPackage`](https://github.com/OHDSI/FeatureExtraction). In short the FeatureExtractionPackage allows you to specify which features (covariates) need to be extracted, e.g. all conditions and drug exposures. It also supports the creation of custom covariates. For more detailed information on the FeatureExtraction package see its [vignettes](https://github.com/OHDSI/FeatureExtraction). For our example study we decided to use these settings: ```{r tidy=FALSE,eval=FALSE} covariateSettings <- createCovariateSettings(useDemographicsGender = TRUE, @@ -763,7 +763,7 @@ In this section we assume that our cohorts have been created either by using ATL ### Data extraction -Now we can tell `PatientLevelPrediction` to extract all necessary data for our analysis. This is done using the [`FeatureExtractionPackage`](https://github.com/OHDSI/FeatureExtration). In short the FeatureExtractionPackage allows you to specify which features (covariates) need to be extracted, e.g. all conditions and drug exposures. It also supports the creation of custom covariates. 
For more detailed information on the FeatureExtraction package see its [vignettes](https://github.com/OHDSI/FeatureExtration). For our example study we decided to use these settings: +Now we can tell `PatientLevelPrediction` to extract all necessary data for our analysis. This is done using the [`FeatureExtractionPackage`](https://github.com/OHDSI/FeatureExtraction). In short the FeatureExtractionPackage allows you to specify which features (covariates) need to be extracted, e.g. all conditions and drug exposures. It also supports the creation of custom covariates. For more detailed information on the FeatureExtraction package see its [vignettes](https://github.com/OHDSI/FeatureExtraction). For our example study we decided to use these settings: ```{r tidy=FALSE,eval=FALSE} covariateSettings <- createCovariateSettings(useDemographicsGender = TRUE, diff --git a/vignettes/CovCNN.png b/vignettes/CovCNN.png deleted file mode 100644 index 82dd2832f..000000000 Binary files a/vignettes/CovCNN.png and /dev/null differ diff --git a/vignettes/InstallationGuide.Rmd b/vignettes/InstallationGuide.Rmd index 0fd3338c9..0b362487b 100644 --- a/vignettes/InstallationGuide.Rmd +++ b/vignettes/InstallationGuide.Rmd @@ -1,6 +1,6 @@ --- -title: "Patient-Level Prediction V5.1.0 Installation Guide" -author: "Jenna Reps, Peter R. Rijnbeek" +title: "Patient-Level Prediction Installation Guide" +author: "Jenna Reps, Peter R. Rijnbeek, Egill Fridgeirsson" date: '`r Sys.Date()`' header-includes: - \usepackage{fancyhdr} @@ -36,7 +36,7 @@ This vignette describes how you need to install the Observational Health Data Sc ## Windows Users Under Windows the OHDSI Patient Level Prediction (PLP) package requires installing: -* R (https://cran.cnr.berkeley.edu/ ) \- (R >= 3.3.0, but latest is recommended) +* R (https://cran.cnr.berkeley.edu/ ) \- (R >= 4.0.0, but latest is recommended) * Rstudio (https://www.rstudio.com/ ) * Java (http://www.java.com ) * RTools (https://cran.r-project.org/bin/windows/Rtools/) @@ -44,7 +44,7 @@ Under Windows the OHDSI Patient Level Prediction (PLP) package requires installi ## Mac/Linux Users Under Mac and Linux the OHDSI Patient Level Prediction (PLP) package requires installing: -* R (https://cran.cnr.berkeley.edu/ ) \- (R >= 3.3.0, but latest is recommended) +* R (https://cran.cnr.berkeley.edu/ ) \- (R >= 4.0.0, but latest is recommended) * Rstudio (https://www.rstudio.com/ ) * Java (http://www.java.com ) * Xcode command line tools(run in terminal: xcode-select --install) [MAC USERS ONLY] @@ -53,7 +53,7 @@ Under Mac and Linux the OHDSI Patient Level Prediction (PLP) package requires in # Installing the Package The preferred way to install the package is by using `remotes`, which will automatically install the latest release and all the latest dependencies. -If you do not want the official release you could install the bleading edge version of the package (latest develop branch). +If you do not want the official release you could install the bleeding edge version of the package (latest develop branch). Note that the latest develop branch could contain bugs, please report them to us if you experience problems. 
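As a quick illustration of the install options described in the hunk above (this sketch is an editor-added aside, not part of the diff), installing either the release or the development version with `remotes` could look like the following; the `develop` branch name is an assumption based on the branches referenced elsewhere in this changeset:

```r
# latest release (what the vignette recommends)
install.packages("remotes")
remotes::install_github("OHDSI/PatientLevelPrediction")

# bleeding edge: point remotes at the develop branch instead
remotes::install_github("OHDSI/PatientLevelPrediction", ref = "develop")
```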
@@ -61,7 +61,6 @@ Note that the latest develop branch could contain bugs, please report them to us To install using `remotes` run: ```{r, echo = TRUE, message = FALSE, warning = FALSE,tidy=FALSE,eval=FALSE} install.packages("remotes") -remotes::install_github("OHDSI/FeatureExtraction") remotes::install_github("OHDSI/PatientLevelPrediction") ``` @@ -69,7 +68,7 @@ When installing make sure to close any other Rstudio sessions that are using `Pa # Creating Python Reticulate Environment -Many of the classifiers in the `PatientLevelPrediction` use a Python back end. To set up a python environment run: +Many of the classifiers in the `PatientLevelPrediction` use a Python backend. To set up a python environment run: ```{r, echo = TRUE, message = FALSE, warning = FALSE,tidy=FALSE,eval=FALSE} library(PatientLevelPrediction) @@ -78,50 +77,6 @@ configurePython(envname='r-reticulate', envtype='conda') ``` -Some of the less frequently used classifiers are not installed during this set-up to add them run: - -For GBM survival: -```{r, echo = TRUE, message = FALSE, warning = FALSE,tidy=FALSE,eval=FALSE} -reticulate::conda_install(envname='r-reticulate', packages = c('scikit-survival'), forge = TRUE, pip = FALSE, pip_ignore_installed = TRUE, conda = "auto", channel = 'sebp') - -``` - -# Testing installation -To test whether the package is installed correctly, using the test script in '/extras', run: -```{r, echo = TRUE, message = FALSE, warning = FALSE,tidy=FALSE,eval=FALSE} - -# load the checkPlpInstallation function -library(devtools) -source_url('https://raw.github.com/OHDSI/PatientLevelPrediction/issue242/extras/checkPlpInstallation.R') - -# set up the database connection details -library(DatabaseConnector) -connectionDetails <- createConnectionDetails( - dbms = 'sql_server', - user = 'username', - password = 'hidden', - server = 'your server', - port = 'your port' - ) - -# run the test -checkPlpInstallation( - connectionDetails = connectionDetails, - python = T - ) -``` - -To test the installation (excluding python) run: -```{r, echo = TRUE, message = FALSE, warning = FALSE,tidy=FALSE,eval=FALSE} - -checkPlpInstallation( - connectionDetails = connectionDetails, - python = F - ) -``` - -The check can take a while to run since it will build the following models in sequence on simulated data: Logistic Regression, RandomForest, MLP, AdaBoost, Decision Tree, Naive Bayes, KNN, Gradient Boosting. -Moreover, it will test the database connection. # Installation issues Installation issues need to be posted in our issue tracker: @@ -131,12 +86,12 @@ The list below provides solutions for some common issues: 1. If you have an error when trying to install a package in R saying **'Dependancy X not available ...'** then this can sometimes be fixed by running `install.packages('X')` and then once that completes trying to reinstall the package that had the error. -2. I have found that using the github `remotes`` to install packages can be impacted if you have **multiple R sessions** open as one session with a library open can cause the library to be locked and this can prevent an install of a package that depends on that library. +2. I have found that using the github `remotes` to install packages can be impacted if you have **multiple R sessions** open as one session with a library open can cause the library to be locked and this can prevent an install of a package that depends on that library. 
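To make the first troubleshooting step in the list above concrete, a minimal sketch follows (not part of the diff; `"X"` is a placeholder for whichever dependency the error message names):

```r
# install the dependency reported as unavailable ("X" is a placeholder name)
install.packages("X")

# then retry the install that originally failed
remotes::install_github("OHDSI/PatientLevelPrediction")
```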
 ## Common issues
 ### python environment
 Mac/linux users:
-to make sure R uses the r-reticulate python environment you may need to edit your .Rprofile with the location of the python binary for the PLP environment.  Edit the .Rprofile by running:
+to make sure R uses the r-reticulate python environment you may need to edit your .Rprofile with the location of the python binary for the PLP environment. Edit the .Rprofile by running:
 ```{r eval=FALSE}
 usethis::edit_r_profile()
 ```
diff --git a/vignettes/Videos.rmd b/vignettes/Videos.rmd
index 410697d7f..d651c0e01 100644
--- a/vignettes/Videos.rmd
+++ b/vignettes/Videos.rmd
@@ -23,184 +23,46 @@ output:
   number_sections: yes
   toc: yes
 ---
+
+```{=html}
-[removed: HTML comment block; markup not recoverable]
+```
 ## What is a cohort table?
-[removed: legacy HTML table markup; same content as the markdown table added below]
+| Click To Launch | Description of Demo |
+|------------------------------------------------|------------------------|
+| [![Video Vignette PLP Package](http://img.youtube.com/vi/BEukCbT8UoA/0.jpg){alt="Video Vignette PLP Package"}](https://youtu.be/GY2ZTcizY90) | Learn what a cohort table looks like and what columns are required. |
 ## Setting up a connection between your database and R
-[removed: legacy HTML table markup; same content as the markdown table added below]
+| Click To Launch | Description of Demo |
+|----------------------------------------|--------------------------------|
+| [![Video Vignette PLP Package](http://img.youtube.com/vi/BEukCbT8UoA/0.jpg){alt="Video Vignette PLP Package"}](https://youtu.be/8F2X5SKN64w) | Learn how to configure the connection to your OMOP CDM data from R using the OHDSI DatabaseConnector package. |
 ## Running a single PatientLevelPrediction model
-[removed: legacy HTML table markup; same content as the markdown table added below]
+
+| Click To Launch | Description of Demo |
+|-----------------------------------------------|-------------------------|
+| [![Video Vignette PLP Package](http://img.youtube.com/vi/7AraOsTynD4/0.jpg){alt="Video Vignette PLP Package"}](https://youtu.be/7AraOsTynD4) | Learn how to develop and validate a single PatientLevelPrediction model. |
+
 ## Running multiple PatientLevelPrediction models study
-[removed: legacy HTML table markup; same content as the markdown table added after the removed Atlas sections below]
-## Designing a study in Atlas
-[removed: legacy HTML table markup for the video link]
-Learn how to design a multiple or single PatientLevelPrediction study using Atlas. Atlas creates an R package that just needs to be built and then you're on your way to developing multiple models!
-## Building and running the Atlas study
-[removed: legacy HTML table markup for the video link]
-Learn how to build the R package generated by Atlas and how to then run the study.
+
+| Click To Launch | Description of Demo |
+|-----------------------------------------------|-------------------------|
+| [![Video Vignette PLP Package](http://img.youtube.com/vi/7wUilx580PE/0.jpg){alt="Video Vignette PLP Package"}](https://youtu.be/7wUilx580PE) | Learn how to develop and validate multiple PatientLevelPrediction models. |
+
 ## Exploring the results in the shiny app
-[removed: legacy HTML table markup; same content as the markdown table added below]
+
+| Click To Launch | Description of Demo |
+|---------------------------------------|---------------------------------|
+| [![Video Vignette PLP Package](http://img.youtube.com/vi/BulmuH32y_Y/0.jpg){alt="Video Vignette PLP Package"}](https://youtu.be/BulmuH32y_Y) | Learn how to interactively explore the model performance and model via the shiny apps viewPlp() and viewMultiplePlp() |
+
 ## Validating existing models on OMOP CDM data
-[removed: legacy HTML table markup for the video link]
-This demo shows how you can add any existing score or logistic model and valdiate the model on new OMOP CDM data. This is useful for benchmarking when developing new models or to perform extensive external validation of a model across the OHDSI network.
\ No newline at end of file
+
+| Click To Launch | Description of Demo |
+|--------------------------|----------------------------------------------|
+| [![Video Vignette PLP Package](http://img.youtube.com/vi/BEukCbT8UoA/0.jpg){alt="Video Vignette PLP Package"}](https://youtu.be/oBsfg9hfrpI) | This demo shows how you can add any existing score or logistic model and validate the model on new OMOP CDM data. This is useful for benchmarking when developing new models or to perform extensive external validation of a model across the OHDSI network. |
diff --git a/vignettes/arch1.png b/vignettes/arch1.png
deleted file mode 100644
index e4846e56f..000000000
Binary files a/vignettes/arch1.png and /dev/null differ
diff --git a/vignettes/cirenn.png b/vignettes/cirenn.png
deleted file mode 100644
index f4e8ed054..000000000
Binary files a/vignettes/cirenn.png and /dev/null differ
diff --git a/vignettes/cnn_lstm.png b/vignettes/cnn_lstm.png
deleted file mode 100644
index a16e1417d..000000000
Binary files a/vignettes/cnn_lstm.png and /dev/null differ
diff --git a/vignettes/cnn_mlf2.png b/vignettes/cnn_mlf2.png
deleted file mode 100644
index 2b69c159b..000000000
Binary files a/vignettes/cnn_mlf2.png and /dev/null differ
diff --git a/vignettes/conv_arch1.png b/vignettes/conv_arch1.png
deleted file mode 100644
index 5970b3f1c..000000000
Binary files a/vignettes/conv_arch1.png and /dev/null differ
diff --git a/vignettes/conv_arch2.png b/vignettes/conv_arch2.png
deleted file mode 100644
index a51ccf08e..000000000
Binary files a/vignettes/conv_arch2.png and /dev/null differ
diff --git a/vignettes/covcnn2.png b/vignettes/covcnn2.png
deleted file mode 100644
index 0734a49eb..000000000
Binary files a/vignettes/covcnn2.png and /dev/null differ
diff --git a/vignettes/lstm_last.png b/vignettes/lstm_last.png
deleted file mode 100644
index 3e6fc16e5..000000000
Binary files a/vignettes/lstm_last.png and /dev/null differ