Merge pull request #1 from alethere/newcalc

Newcalc
alethere · Apr 1, 2022 · ddfe161 · ddfe161
2 parents 0bcda3e + 598ebaf
commit ddfe161
Show file tree

Hide file tree

Showing 11 changed files with 838 additions and 134 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -2,8 +2,12 @@
 S3method(predict_IBD,list)
 S3method(predict_IBD,matrix)
 S3method(predict_IBD,numeric)
-S3method(calc_IBD,numeric)
-S3method(calc_IBD,matrix)
+S3method(naive_IBD,numeric)
+S3method(naive_IBD,matrix)
+S3method(hmm_IBD,numeric)
+S3method(hmm_IBD,matrix)
+S3method(heur_IBD,numeric)
+S3method(heur_IBD,matrix)
 S3method(rec_count,matrix)
 S3method(rec_count,list)
 S3method(graphical_genotype,matrix)

diff --git a/R/calc_IBD.R b/R/calc_IBD.R
diff --git a/R/mapping.R b/R/mapping.R
@@ -13,19 +13,18 @@
 #' @return Estimated linkage map, including position and other parameters.
 #' @keywords internal
 mdsmap <- function(linkdf,ndim = 2){
-  linkdf <- linkdf[order(linkdf$marker_a,linkdf$marker_b),]
-  linkdf <- linkdf[!duplicated(paste0(linkdf$marker_a,linkdf$marker_b)),]
-  text <- paste(c(length(unique(unlist(linkdf[,1:2]))),
-                  apply(linkdf[,c(1,2,3,4)],1,paste,collapse = " ")),
-                collapse = "\n")
+  linkdf <- polymapR:::prepare_pwd(linkdf)
   file <- paste0("tmp.",paste0(sample(c(letters,LETTERS),10),collapse = ""),".map")
-  write(x = text,file = file)
-  if(ndim == 2){
-    newmap <- MDSMap::calc.maps.pc(file,weightfn = "lod2")
-  }else if(ndim == 3){
-    newmap <- MDSMap::calc.maps.sphere(file,weightfn = "lod2",p= 200)
-  }
+  count <- length(unique(unlist(linkdf[,1:2])))
+  write(count,file = file)
+  write.table(linkdf, file = file, append = T,
+              col.names = F, row.names = F, quote = F)
+
+  newmap <- MDSMap::calc.maps.pc(file,weightfn = "lod2",ndim = ndim)
   rem <- file.remove(file)
+  newmap <- newmap$locimap
+  colnames(newmap)[2] <- "marker"
+
   return(newmap)
 }
 
@@ -58,7 +57,7 @@ linkdf_shortcut <- function(geno,ploidy,p1name,p2name,ncores = 1){
                                                  ncores = ncores,
                                                  marker_assignment = ma,
                                                  LG_number = 1,verbose = F,
-                                                 convert_palindrome_markers = F)$LG1
+                                                 convert_palindrome_markers = T)$LG1
   linkdf_p2 <- polymapR::finish_linkage_analysis(dosage_matrix = as.matrix(geno),
                                                  target_parent = p2name,
                                                  other_parent = p1name,
@@ -67,7 +66,7 @@ linkdf_shortcut <- function(geno,ploidy,p1name,p2name,ncores = 1){
                                                  ncores = ncores,
                                                  marker_assignment = ma,
                                                  LG_number = 1,verbose = F,
-                                                 convert_palindrome_markers = F)$LG1
+                                                 convert_palindrome_markers = T)$LG1
 
   linkdf <- rbind(linkdf_p1,linkdf_p2)
   return(linkdf)

diff --git a/R/pred_IBD.R b/R/pred_IBD.R
@@ -56,7 +56,7 @@ predict_IBD <- function(IBD,map,interval = 10, method = "kosambi",
     }
   }else if(type[1] == "data.frame"){
     predict_IBD(IBD,map,interval, method = method,
-                       non_inf = non_inf,pred_points = pred_points)
+                non_inf = non_inf,pred_points = pred_points)
   }
 
   UseMethod("predict_IBD",IBD)

diff --git a/R/smooth_wrapper.R b/R/smooth_wrapper.R
@@ -39,6 +39,15 @@
 #' IBD probability non-informative (if they fall within the threshold they will be
 #' ignored during prediction). Defaults to 0.3 - 0.7. Symmetrical boundaries are
 #' recommended but not necessary.
+#' @param obs.method character, either "naive" or "heuristic" (or substrings). This parameter allows to
+#' switch between using the IBD calculation (for observed IBDs) described in the Smooth Descent paper, or the
+#' heuristic method from `polyqtlR`. However, our research has shown better results with the
+#' naive method.
+#' @param pred.method character, either "prediction" or "hmm" (or substrings). This parameter
+#' allows to switch between using the IBD calculation (for predicted IBDs) between the weighted
+#' average method or the Hidden Markov model implemented in `polyqtlR`. Our research shows better
+#' results in polyploids with the HMM, although for high marker densities the
+#' weighted average method is faster (specially if `prediction_points` is used)
 #' @param verbose logical, should smooth descent report the steps it takes?
 #'
 #'
@@ -74,9 +83,12 @@ smooth_descent <- function(geno,
                            prediction_points = NULL,
                            error_threshold = 0.8,
                            non_inf = c(0.3,0.7),
-                           verbose = T){
-  talk <- function(msg){
-    if(verbose) cat(msg)
+                           verbose = T,
+                           obs.method = "naive",
+                           pred.method = "prediction",
+                           hmm.error = 0.01){
+  talk <- function(...){
+    if(verbose) cat(...)
   }
   ## INPUT CHECK ##
   assertthat::assert_that(!is.null(rownames(geno)))
@@ -105,6 +117,11 @@ smooth_descent <- function(geno,
   geno <- geno[map$marker,]
   homologue <- homologue[map$marker,]
 
+  #Matching parameters
+  obs.method <- match.arg(obs.method,c("naive","heuristic"))
+  pred.method <- match.arg(pred.method,c("prediction","hmm"))
+  talk("Will estimate errors with",obs.method,"-",pred.method,"combination\n")
+
 
   ## Error estimation ##
   if(is.null(p1name)) p1name <- colnames(geno)[1]
@@ -114,15 +131,24 @@ smooth_descent <- function(geno,
   genocols <- which(colnames(geno) %in% c(p1name,p2name))
 
   talk("Obtaining IBD\n")
-  #The parents should not be included in the IBD calculation
-  obsIBD <- calc_IBD(geno = as.matrix(geno[,-genocols]),
-                          p1hom = as.matrix(homologue[,parentcols$p1]),
-                          p2hom = as.matrix(homologue[,parentcols$p2]),
-                          ploidy = ploidy)
+  obsIBD <- calc_IBD(geno = as.matrix(geno),
+                     p1hom = as.matrix(homologue[,parentcols$p1]),
+                     p2hom = as.matrix(homologue[,parentcols$p2]),
+                     ploidy = ploidy,map = map, p1name = p1name, p2name = p2name,
+                     method = obs.method)
 
   talk("Predicting IBD\n")
-  predIBD <- predict_IBD(obsIBD, map, interval = prediction_interval,non_inf = non_inf,
-                         pred_points = prediction_points)
+  if(pred.method == "hmm"){
+    predIBD <- calc_IBD(geno = as.matrix(geno),
+                        p1hom = as.matrix(homologue[,parentcols$p1]),
+                        p2hom = as.matrix(homologue[,parentcols$p2]),
+                        ploidy = ploidy,map = map, p1name = p1name, p2name = p2name,
+                        method = "hmm",hmm.error = hmm.error)
+  }else if(pred.method == "prediction"){
+    predIBD <- predict_IBD(obsIBD, map, interval = prediction_interval,non_inf = non_inf,
+                           pred_points = prediction_points)
+  }
+
   talk("Detecting errors\n")
   errors <- lapply(1:length(obsIBD),function(i){
     err <- abs(obsIBD[[i]] - predIBD[[i]]) > error_threshold
@@ -143,7 +169,9 @@ smooth_descent <- function(geno,
                 error = errors,
                 newIBD = obsIBD,
                 oldgeno = geno,
-                newgeno = geno)
+                newgeno = geno,
+                obs.method = obs.method,
+                pred.method = pred.method)
 
   }else{
     newIBD <- lapply(1:length(obsIBD),function(i){
@@ -164,18 +192,19 @@ smooth_descent <- function(geno,
     new_geno <- new_geno[rownames(geno),colnames(geno[,-genocols])]
     new_geno[is.na(new_geno)] <- geno[,-genocols][is.na(new_geno)]
     new_geno <- cbind(geno[,genocols],new_geno)
-    newIBD <- calc_IBD(geno = as.matrix(new_geno[,-genocols]),
+    newIBD <- calc_IBD(geno = as.matrix(new_geno),
                        p1hom = as.matrix(homologue[,parentcols$p1]),
                        p2hom = as.matrix(homologue[,parentcols$p2]),
-                       ploidy = ploidy)
-
+                       ploidy = ploidy,method = obs.method, map = map)
     res <- list(obsIBD = obsIBD,
                 predIBD = predIBD,
                 oldmap = map,
                 error = errors,
                 newIBD = newIBD,
                 oldgeno = geno,
-                newgeno = new_geno)
+                newgeno = new_geno,
+                obs.method = obs.method,
+                pred.method = pred.method)
   }
 
   #Diagnostics
@@ -190,7 +219,7 @@ smooth_descent <- function(geno,
                   pred = pred_rec,
                   new = new_rec)
 
-  res <- res[c("obsIBD","predIBD","error","newIBD","oldgeno","newgeno","oldmap","rec")]
+  res <- res[c("obsIBD","predIBD","error","newIBD","oldgeno","newgeno","oldmap","rec","obs.method","pred.method")]
   return(res)
 }
 
@@ -238,6 +267,15 @@ smooth_descent <- function(geno,
 #' IBD probability non-informative (if they fall within the threshold they will be
 #' ignored during prediction). Defaults to 0.3 - 0.7. Symmetrical boundaries are
 #' recommended but not necessary.
+#' @param obs.method character, either "naive" or "heuristic" (or substrings). This parameter allows to
+#' switch between using the IBD calculation (for observed IBDs) described in the Smooth Descent paper, or the
+#' heuristic method from `polyqtlR`. However, our research has shown better results with the
+#' naive method.
+#' @param pred.method character, either "prediction" or "hmm" (or substrings). This parameter
+#' allows to switch between using the IBD calculation (for predicted IBDs) between the weighted
+#' average method or the Hidden Markov model implemented in `polyqtlR`. Our research shows better
+#' results in polyploids with the HMM, although for high marker densities the
+#' weighted average method is faster (specially if `prediction_points` is used)
 #' @param verbose logical, should smooth descent report the steps it takes?
 #'
 #' @return list containing the following items:
@@ -271,21 +309,24 @@ smooth_descent <- function(geno,
 #'    estimate_premap = F, mapping_ndim = 3, ncores = 1)
 #' }
 smooth_map <- function(geno,
-                      homologue,
-                      map = NULL,
-                      ploidy = 2,
-                      p1name = NULL,
-                      p2name = NULL,
-                      prediction_interval = 10,
-                      prediction_threshold = 0.8,
-                      prediction_points = NULL,
-                      error_threshold = 0.8,
-                      ncores = 1,
-                      mapping_ndim = 2,
-                      estimate_premap = F,
-                      max_distance = 10,
-                      non_inf = c(0.3,0.7),
-                      verbose = T){
+                       homologue,
+                       map = NULL,
+                       ploidy = 2,
+                       p1name = NULL,
+                       p2name = NULL,
+                       prediction_interval = 10,
+                       prediction_threshold = 0.8,
+                       prediction_points = NULL,
+                       error_threshold = 0.8,
+                       ncores = 1,
+                       mapping_ndim = 2,
+                       estimate_premap = F,
+                       max_distance = 10,
+                       non_inf = c(0.3,0.7),
+                       verbose = T,
+                       obs.method = "naive",
+                       pred.method = "prediction",
+                       hmm.error = 0.01){
   talk <- function(msg){
     if(verbose) cat(msg)
   }
@@ -301,8 +342,6 @@ smooth_map <- function(geno,
                               p1name = p1name, p2name = p2name,
                               ncores = ncores)
     map <- mdsmap(linkdf,mapping_ndim)
-    map <- map$locimap
-    colnames(map)[2] <- "marker"
   }
 
   #We should eliminate markers that are distant from everything
@@ -318,12 +357,13 @@ smooth_map <- function(geno,
   map <- map[!far_marks,]
 
   res <- smooth_descent(geno = geno, homologue = homologue, map = map,
-                 ploidy = ploidy, p1name = p1name, p2name = p2name,
-                 prediction_interval = prediction_interval,
-                 prediction_threshold = prediction_threshold,
-                 prediction_points = prediction_points,
-                 error_threshold = error_threshold,
-                 verbose = verbose)
+                        ploidy = ploidy, p1name = p1name, p2name = p2name,
+                        prediction_interval = prediction_interval,
+                        prediction_threshold = prediction_threshold,
+                        prediction_points = prediction_points,
+                        error_threshold = error_threshold,
+                        verbose = verbose, obs.method = obs.method,
+                        pred.method = pred.method,hmm.error = hmm.error)
 
   talk("Estimating new linkage\n")
   geno <- as.matrix(res$newgeno)
@@ -333,8 +373,6 @@ smooth_map <- function(geno,
 
   talk("Mapping new genotypes\n")
   new_map <- mdsmap(linkdf,mapping_ndim)
-  new_map <- new_map$locimap
-  colnames(new_map)[2] <- "marker"
 
   #Diagnostics and output formatting
   res$oldmap = map
@@ -393,6 +431,15 @@ smooth_map <- function(geno,
 #' IBD probability non-informative (if they fall within the threshold they will be
 #' ignored during prediction). Defaults to 0.3 - 0.7. Symmetrical boundaries are
 #' recommended but not necessary.
+#' @param obs.method character, either "naive" or "heuristic" (or substrings). This parameter allows to
+#' switch between using the IBD calculation (for observed IBDs) described in the Smooth Descent paper, or the
+#' heuristic method from `polyqtlR`. However, our research has shown better results with the
+#' naive method.
+#' @param pred.method character, either "prediction" or "hmm" (or substrings). This parameter
+#' allows to switch between using the IBD calculation (for predicted IBDs) between the weighted
+#' average method or the Hidden Markov model implemented in `polyqtlR`. Our research shows better
+#' results in polyploids with the HMM, although for high marker densities the
+#' weighted average method is faster (specially if `prediction_points` is used)
 #' @param verbose logical, should smooth descent report the steps it takes?
 #'
 #' @return list of lists, where each element contains the following items:
@@ -442,7 +489,10 @@ smooth_map_iter <- function(geno,
                             estimate_premap = F,
                             max_distance = 10,
                             non_inf = c(0.3,0.7),
-                            verbose = T){
+                            verbose = T,
+                            obs.method = "naive",
+                            pred.method = "prediction",
+                            hmm.error = 0.01){
 
   talk <- function(msg){
     if(verbose) cat(msg)
@@ -464,41 +514,50 @@ smooth_map_iter <- function(geno,
   res <- list()
   for(i in 1:iters){
     talk(paste0("Iteration ",i," ----------\n"))
+    tic <- Sys.time()
     if(i == 1){
       this_iter <- try(smooth_map(geno = geno, homologue = homologue,
-                 map = map, ploidy = ploidy, p1name = p1name,
-                 p2name = p2name, prediction_interval = prediction_interval,
-                 prediction_threshold = prediction_threshold,
-                 error_threshold = err[i],
-                 ncores = ncores,
-                 mapping_ndim = mapping_ndim, estimate_premap = estimate_premap,
-                 max_distance = max_distance, verbose = verbose,
-                 prediction_points = prediction_points))
+                                  map = map, ploidy = ploidy, p1name = p1name,
+                                  p2name = p2name, prediction_interval = prediction_interval,
+                                  prediction_threshold = prediction_threshold,
+                                  error_threshold = err[i],
+                                  ncores = ncores,
+                                  mapping_ndim = mapping_ndim, estimate_premap = estimate_premap,
+                                  max_distance = max_distance, verbose = verbose,
+                                  prediction_points = prediction_points, obs.method = obs.method,
+                                  pred.method = pred.method,hmm.error = hmm.error))
     }else{
       #In the next iterations, a new starting map and new genotype is used
-      if(class(this_iter) == "try-error") next
+      if(class(this_iter) == "try-error"){
+        this_iter <- list()
+        next
+      }
       m <- this_iter$newmap
       g <- this_iter$newgeno
       this_iter <- try(smooth_map(geno = g,
-                              map = m,
-                              homologue = homologue,
-                              ploidy = ploidy,
-                              p1name = p1name,
-                              p2name = p2name,
-                              prediction_interval = prediction_interval,
-                              prediction_threshold = prediction_threshold,
-                              prediction_points = prediction_points,
-                              error_threshold = err[i],
-                              ncores = ncores,
-                              mapping_ndim = mapping_ndim,
-                              max_distance = max_distance,
-                              verbose = verbose))
+                                  map = m,
+                                  homologue = homologue,
+                                  ploidy = ploidy,
+                                  p1name = p1name,
+                                  p2name = p2name,
+                                  prediction_interval = prediction_interval,
+                                  prediction_threshold = prediction_threshold,
+                                  prediction_points = prediction_points,
+                                  error_threshold = err[i],
+                                  ncores = ncores,
+                                  mapping_ndim = mapping_ndim,
+                                  max_distance = max_distance,
+                                  verbose = verbose,
+                                  obs.method = obs.method,
+                                  pred.method = pred.method,
+                                  hmm.error = hmm.error))
     }
+    toc <- Sys.time()
+    this_iter$error_threshold <- err[i]
+    this_iter$time <- difftime(toc,tic, units = "secs")
     res[[i]] <- this_iter
-    res[[i]]$error_threshold <- err[i]
   }
 
   names(res) <- paste0("iter",1:length(res))
   return(res)
 }
-