From 0633a4b8156ff72d98e5cab361201a7864a04df9 Mon Sep 17 00:00:00 2001 From: Philippine Louail <127301965+philouail@users.noreply.github.com> Date: Wed, 4 Dec 2024 15:16:25 +0100 Subject: [PATCH] fail --- R/Spectra.R | 4 +- man/Spectra.Rd | 1376 +----------------------------------------- man/filterMsLevel.Rd | 64 ++ 3 files changed, 68 insertions(+), 1376 deletions(-) diff --git a/R/Spectra.R b/R/Spectra.R index 0d9217ef..8bf0565a 100644 --- a/R/Spectra.R +++ b/R/Spectra.R @@ -2034,8 +2034,8 @@ setMethod("combinePeaks", "Spectra", function(object, tolerance = 0, ppm = 20, #' should be explored and ideally be removed using for #' `QFeatures::reduceDataFrame()`, `PMS::reducePSMs()` or similar #' functions. -#' For a more general function that allows to append `data.frame`, -#' `DataFrame` and `matrix` see `cbind2()`. +#' For a more general function that allows to append `data.frame`, +#' `DataFrame` and `matrix` see `cbind2()`. #' #' @section Filter content of `peaksData()`: #' diff --git a/man/Spectra.Rd b/man/Spectra.Rd index 6f97d33a..5e4baaf7 100644 --- a/man/Spectra.Rd +++ b/man/Spectra.Rd @@ -11,91 +11,9 @@ \alias{Spectra,ANY-method} \alias{setBackend,Spectra,MsBackend-method} \alias{export,Spectra-method} -\alias{acquisitionNum,Spectra-method} -\alias{peaksData,Spectra-method} -\alias{peaksVariables,Spectra-method} -\alias{centroided,Spectra-method} -\alias{centroided<-,Spectra-method} -\alias{collisionEnergy,Spectra-method} -\alias{collisionEnergy<-,Spectra-method} -\alias{dataOrigin,Spectra-method} -\alias{dataOrigin<-,Spectra-method} -\alias{dataStorage,Spectra-method} -\alias{dropNaSpectraVariables,Spectra-method} -\alias{intensity,Spectra-method} -\alias{ionCount,Spectra-method} -\alias{isCentroided,Spectra-method} -\alias{isEmpty,Spectra-method} -\alias{isolationWindowLowerMz,Spectra-method} -\alias{isolationWindowLowerMz<-,Spectra-method} -\alias{isolationWindowTargetMz,Spectra-method} -\alias{isolationWindowTargetMz<-,Spectra-method} 
-\alias{isolationWindowUpperMz,Spectra-method} -\alias{isolationWindowUpperMz<-,Spectra-method} -\alias{containsMz,Spectra-method} -\alias{containsNeutralLoss,Spectra-method} -\alias{spectrapply,Spectra-method} -\alias{length,Spectra-method} -\alias{msLevel,Spectra-method} -\alias{mz,Spectra-method} -\alias{lengths,Spectra-method} -\alias{polarity,Spectra-method} -\alias{polarity<-,Spectra-method} -\alias{precScanNum,Spectra-method} -\alias{precursorCharge,Spectra-method} -\alias{precursorIntensity,Spectra-method} -\alias{precursorMz,Spectra-method} -\alias{rtime,Spectra-method} -\alias{rtime<-,Spectra-method} -\alias{scanIndex,Spectra-method} -\alias{selectSpectraVariables,Spectra-method} -\alias{smoothed,Spectra-method} -\alias{smoothed<-,Spectra-method} -\alias{spectraData,Spectra-method} -\alias{spectraData<-,Spectra-method} -\alias{spectraNames,Spectra-method} -\alias{spectraNames<-,Spectra-method} -\alias{spectraVariables,Spectra-method} -\alias{tic,Spectra-method} -\alias{$,Spectra-method} -\alias{$<-,Spectra-method} -\alias{[[,Spectra-method} -\alias{[[<-,Spectra-method} -\alias{cbind2,Spectra,dataframeOrDataFrame-method} -\alias{filterAcquisitionNum,Spectra-method} -\alias{filterEmptySpectra,Spectra-method} -\alias{filterDataOrigin,Spectra-method} -\alias{filterDataStorage,Spectra-method} -\alias{filterFourierTransformArtefacts,Spectra-method} -\alias{filterIntensity,Spectra-method} -\alias{filterIsolationWindow,Spectra-method} -\alias{filterMsLevel,Spectra-method} -\alias{filterMzRange,Spectra-method} -\alias{filterMzValues,Spectra-method} -\alias{filterPolarity,Spectra-method} -\alias{filterPrecursorMz,Spectra-method} -\alias{filterPrecursorMzRange,Spectra-method} -\alias{filterPrecursorMzValues,Spectra-method} -\alias{filterPrecursorCharge,Spectra-method} -\alias{filterPrecursorScan,Spectra-method} -\alias{filterRt,Spectra-method} -\alias{reset,Spectra-method} -\alias{filterRanges,Spectra-method} -\alias{filterValues,Spectra-method} 
-\alias{bin,Spectra-method} -\alias{compareSpectra,Spectra,Spectra-method} -\alias{compareSpectra,Spectra,missing-method} -\alias{pickPeaks,Spectra-method} -\alias{replaceIntensitiesBelow,Spectra-method} -\alias{smooth,Spectra-method} -\alias{addProcessing,Spectra-method} -\alias{coreSpectraVariables} -\alias{backendBpparam,Spectra-method} -\alias{combinePeaks,Spectra-method} -\alias{entropy,Spectra-method} -\alias{entropy,ANY-method} \alias{dataStorageBasePath,Spectra-method} \alias{dataStorageBasePath<-,Spectra-method} +\alias{cbind2,Spectra,dataframeOrDataFrame-method} \title{The Spectra class to manage and access MS data} \usage{ \S4method{Spectra}{missing}( @@ -147,413 +65,9 @@ \S4method{dataStorageBasePath}{Spectra}(object) -\S4method{peaksData}{Spectra}( - object, - columns = c("mz", "intensity"), - f = processingChunkFactor(object), - ..., - BPPARAM = bpparam() -) - -\S4method{peaksVariables}{Spectra}(object) - -\S4method{centroided}{Spectra}(object) - -\S4method{centroided}{Spectra}(object) <- value - -\S4method{collisionEnergy}{Spectra}(object) - -\S4method{collisionEnergy}{Spectra}(object) <- value - -\S4method{dataOrigin}{Spectra}(object) - -\S4method{dataOrigin}{Spectra}(object) <- value - -\S4method{dataStorage}{Spectra}(object) - -\S4method{dropNaSpectraVariables}{Spectra}(object) - -\S4method{intensity}{Spectra}(object, f = processingChunkFactor(object), ...) - -\S4method{ionCount}{Spectra}(object) - -\S4method{isCentroided}{Spectra}(object, ...) 
- -\S4method{isEmpty}{Spectra}(x) - -\S4method{isolationWindowLowerMz}{Spectra}(object) - -\S4method{isolationWindowLowerMz}{Spectra}(object) <- value - -\S4method{isolationWindowTargetMz}{Spectra}(object) - -\S4method{isolationWindowTargetMz}{Spectra}(object) <- value - -\S4method{isolationWindowUpperMz}{Spectra}(object) - -\S4method{isolationWindowUpperMz}{Spectra}(object) <- value - -\S4method{containsMz}{Spectra}( - object, - mz = numeric(), - tolerance = 0, - ppm = 20, - which = c("any", "all"), - BPPARAM = bpparam() -) - -\S4method{containsNeutralLoss}{Spectra}( - object, - neutralLoss = 0, - tolerance = 0, - ppm = 20, - BPPARAM = bpparam() -) - -\S4method{spectrapply}{Spectra}( - object, - FUN, - ..., - chunkSize = integer(), - f = factor(), - BPPARAM = SerialParam() -) - -\S4method{length}{Spectra}(x) - -\S4method{msLevel}{Spectra}(object) - -\S4method{mz}{Spectra}(object, f = processingChunkFactor(object), ...) - -\S4method{lengths}{Spectra}(x, use.names = FALSE) - -\S4method{polarity}{Spectra}(object) - -\S4method{polarity}{Spectra}(object) <- value - -\S4method{precScanNum}{Spectra}(object) - -\S4method{precursorCharge}{Spectra}(object) - -\S4method{precursorIntensity}{Spectra}(object) - -\S4method{precursorMz}{Spectra}(object) - -\S4method{rtime}{Spectra}(object) - -\S4method{rtime}{Spectra}(object) <- value - -\S4method{scanIndex}{Spectra}(object) - -\S4method{selectSpectraVariables}{Spectra}( - object, - spectraVariables = union(spectraVariables(object), peaksVariables(object)) -) - -\S4method{smoothed}{Spectra}(object) - -\S4method{smoothed}{Spectra}(object) <- value - -\S4method{spectraData}{Spectra}(object, columns = spectraVariables(object)) - -\S4method{spectraData}{Spectra}(object) <- value - -\S4method{spectraNames}{Spectra}(object) - -\S4method{spectraNames}{Spectra}(object) <- value - -\S4method{spectraVariables}{Spectra}(object) - -\S4method{tic}{Spectra}(object, initial = TRUE) - -\S4method{$}{Spectra}(x, name) - -\S4method{$}{Spectra}(x, 
name) <- value - -\S4method{[[}{Spectra}(x, i, j, ...) - -\S4method{[[}{Spectra}(x, i, j, ...) <- value - -\S4method{[}{Spectra}(x, i, j, ..., drop = FALSE) +\S4method{dataStorageBasePath}{Spectra}(object) <- value \S4method{cbind2}{Spectra,dataframeOrDataFrame}(x, y, ...) - -\S4method{filterAcquisitionNum}{Spectra}( - object, - n = integer(), - dataStorage = character(), - dataOrigin = character() -) - -\S4method{filterEmptySpectra}{Spectra}(object) - -\S4method{filterDataOrigin}{Spectra}(object, dataOrigin = character()) - -\S4method{filterDataStorage}{Spectra}(object, dataStorage = character()) - -\S4method{filterFourierTransformArtefacts}{Spectra}( - object, - halfWindowSize = 0.05, - threshold = 0.2, - keepIsotopes = TRUE, - maxCharge = 5, - isotopeTolerance = 0.005 -) - -\S4method{filterIntensity}{Spectra}( - object, - intensity = c(0, Inf), - msLevel. = uniqueMsLevels(object), - ... -) - -\S4method{filterIsolationWindow}{Spectra}(object, mz = numeric()) - -\S4method{filterMsLevel}{Spectra}(object, msLevel. = integer()) - -\S4method{filterMzRange}{Spectra}( - object, - mz = numeric(), - msLevel. = uniqueMsLevels(object), - keep = TRUE -) - -\S4method{filterMzValues}{Spectra}( - object, - mz = numeric(), - tolerance = 0, - ppm = 20, - msLevel. = uniqueMsLevels(object), - keep = TRUE -) - -\S4method{filterPolarity}{Spectra}(object, polarity = integer()) - -\S4method{filterPrecursorMz}{Spectra}(object, mz = numeric()) - -\S4method{filterPrecursorMzRange}{Spectra}(object, mz = numeric()) - -\S4method{filterPrecursorMzValues}{Spectra}(object, mz = numeric(), ppm = 20, tolerance = 0) - -\S4method{filterPrecursorCharge}{Spectra}(object, z = integer()) - -\S4method{filterPrecursorScan}{Spectra}(object, acquisitionNum = integer(), f = dataOrigin(object)) - -\S4method{filterRt}{Spectra}(object, rt = numeric(), msLevel. = uniqueMsLevels(object)) - -\S4method{reset}{Spectra}(object, ...) 
- -\S4method{filterRanges}{Spectra}( - object, - spectraVariables = character(), - ranges = numeric(), - match = c("all", "any") -) - -\S4method{filterValues}{Spectra}( - object, - spectraVariables = character(), - values = numeric(), - ppm = 0, - tolerance = 0, - match = c("all", "any") -) - -\S4method{bin}{Spectra}( - x, - binSize = 1L, - breaks = NULL, - msLevel. = uniqueMsLevels(x), - FUN = sum, - zero.rm = TRUE -) - -\S4method{compareSpectra}{Spectra,Spectra}( - x, - y, - MAPFUN = joinPeaks, - tolerance = 0, - ppm = 20, - FUN = ndotproduct, - ..., - SIMPLIFY = TRUE -) - -\S4method{compareSpectra}{Spectra,missing}( - x, - y = NULL, - MAPFUN = joinPeaks, - tolerance = 0, - ppm = 20, - FUN = ndotproduct, - ..., - SIMPLIFY = TRUE -) - -\S4method{pickPeaks}{Spectra}( - object, - halfWindowSize = 2L, - method = c("MAD", "SuperSmoother"), - snr = 0, - k = 0L, - descending = FALSE, - threshold = 0, - msLevel. = uniqueMsLevels(object), - ... -) - -\S4method{replaceIntensitiesBelow}{Spectra}( - object, - threshold = min, - value = 0, - msLevel. = uniqueMsLevels(object) -) - -\S4method{smooth}{Spectra}( - x, - halfWindowSize = 2L, - method = c("MovingAverage", "WeightedMovingAverage", "SavitzkyGolay"), - msLevel. = uniqueMsLevels(x), - ... -) - -\S4method{addProcessing}{Spectra}(object, FUN, ..., spectraVariables = character()) - -coreSpectraVariables() - -\S4method{uniqueMsLevels}{Spectra}(object, ...) - -\S4method{backendBpparam}{Spectra}(object, BPPARAM = bpparam()) - -\S4method{combinePeaks}{Spectra}( - object, - tolerance = 0, - ppm = 20, - intensityFun = base::mean, - mzFun = base::mean, - weighted = TRUE, - msLevel. = uniqueMsLevels(object), - ... -) - -\S4method{entropy}{Spectra}(object, normalized = TRUE) - -\S4method{entropy}{ANY}(object, ...) -} -\arguments{ -\item{object}{For \code{Spectra()}: either a \code{DataFrame} or \code{missing}. See -section on creation of \code{Spectra} objects for details. 
For all other -methods a \code{Spectra} object.} - -\item{f}{For \code{split()}: factor defining how to split \code{x}. See \code{\link[base:split]{base::split()}} -for details. For \code{setBackend()}: factor defining how to split the data -for parallelized copying of the spectra data to the new backend. For some -backends changing this parameter can lead to errors. -For \code{combineSpectra()}: \code{factor} defining the grouping of the spectra -that should be combined. For \code{spectrapply()}: \code{factor} how \code{object} -should be splitted. For \code{filterPrecursorScan()}: defining which spectra -belong to the same original data file (sample): Defaults to -\code{f = dataOrigin(x)}. -For \code{intensity()}, \code{mz()} and \code{peaksData()}: factor defining how data -should be chunk-wise loaded an processed. Defaults to -\code{\link[=processingChunkFactor]{processingChunkFactor()}}.} - -\item{BPPARAM}{Parallel setup configuration. See \code{\link[=bpparam]{bpparam()}} for more -information. This is passed directly to the \code{\link[=backendInitialize]{backendInitialize()}} method -of the \linkS4class{MsBackend}.} - -\item{...}{Additional arguments.} - -\item{x}{A \code{Spectra} object.} - -\item{p}{For \code{combineSpectra()}: \code{factor} defining how to split the input -\code{Spectra} for parallel processing. Defaults to \code{x$dataStorage}, i.e., -depending on the used backend, per-file parallel processing will be -performed.} - -\item{FUN}{For \code{addProcessing()}: function to be applied to the peak matrix -of each spectrum in \code{object}. For \code{compareSpectra()}: function to compare -intensities of peaks between two spectra with each other. -For \code{combineSpectra()}: function to combine the (peak matrices) of the -spectra. See section \emph{Data manipulations} and examples below for more -details. -For \code{bin()}: function to aggregate intensity values of peaks falling -into the same bin. 
Defaults to \code{FUN = sum} thus summing up intensities. -For \code{spectrapply()} and \code{chunkapply()}: function to be applied to -\code{Spectra}.} - -\item{y}{A \code{Spectra} object. -- For \code{joinSpectraData()}: a \code{DataFrame}. -- For \code{cbind2()} a \code{data.frame}, \code{DataFrame} or \code{matrix}.} - -\item{by.x}{A \code{character(1)} specifying the spectra variable used -for merging. Default is \code{"spectrumId"}.} - -\item{by.y}{A \code{character(1)} specifying the column used for -merging. Set to \code{by.x} if missing.} - -\item{suffix.y}{A \code{character(1)} specifying the suffix to be used -for making the names of columns in the merged spectra variables -unique. This suffix will be used to amend \code{names(y)}, while -\code{spectraVariables(x)} will remain unchanged.} - -\item{substDefinition}{For \code{deisotopeSpectra()} and -\code{filterPrecursorIsotopes()}: \code{matrix} or \code{data.frame} with definitions -of isotopic substitutions. Uses by default isotopic substitutions -defined from all compounds in the Human Metabolome Database (HMDB). See -\code{\link[=isotopologues]{isotopologues()}} or \code{\link[=isotopicSubstitutionMatrix]{isotopicSubstitutionMatrix()}} for details.} - -\item{tolerance}{For \code{compareSpectra()}, \code{containsMz()}, -\code{deisotopeSpectra()}, \code{filterMzValues()} and \code{reduceSpectra()}: -\code{numeric(1)} allowing to define a constant maximal accepted difference -between m/z values for peaks to be matched (or grouped). For -\code{containsMz()} it can also be of length equal \code{mz} to specify a different -tolerance for each m/z value. -For \code{filterPrecursorMaxIntensity()}: \code{numeric(1)} defining the -(constant) maximal accepted difference of precursor m/z values of -spectra for grouping them into \emph{precursor groups}. For -\code{filterPrecursorIsotopes()}: passed directly to the \code{\link[=isotopologues]{isotopologues()}} -function. 
For \code{filterValues()}: \code{numeric} of any length allowing to -define a maximal accepted difference between user input \code{values} and the -\code{spectraVariables} values. If it is not equal to the length of the -value provided with parameter \code{spectraVariables}, \code{tolerance[1]} will be -recycled. Default is \code{tolerance = 0}} - -\item{ppm}{For \code{compareSpectra()}, \code{containsMz()}, \code{deisotopeSpectra()}, -\code{filterMzValues()} and \code{reduceSpectra()}: \code{numeric(1)} -defining a relative, m/z-dependent, maximal accepted difference between -m/z values for peaks to be matched (or grouped). -For \code{filterPrecursorMaxIntensity()}: \code{numeric(1)} defining the relative -maximal accepted difference of precursor m/z values of spectra for -grouping them into \emph{precursor groups}. For \code{filterPrecursorIsotopes()}: -passed directly to the \code{\link[=isotopologues]{isotopologues()}} function. -For \code{filterValues()}: \code{numeric} of any length allowing to define -a maximal accepted difference between user input \code{values} and the -\code{spectraVariables} values. If it is not equal to the length of the -value provided with parameter \code{spectraVariables}, \code{ppm[1]} will be -recycled.} - -\item{charge}{For \code{deisotopeSpectra()}: expected charge of the ionized -compounds. See \code{\link[=isotopologues]{isotopologues()}} for details.} - -\item{by}{For \code{scalePeaks()}: function to calculate a single \code{numeric} from -intensity values of a spectrum by which all intensities (of -that spectrum) should be divided by. The default \code{by = sum} will -divide intensities of each spectrum by the sum of intensities of that -spectrum.} - -\item{msLevel.}{\code{integer} defining the MS level(s) of the spectra to which -the function should be applied (defaults to all MS levels of \code{object}. 
-For \code{filterMsLevel()}: the MS level to which \code{object} should be -subsetted.} - -\item{mz}{For \code{filterIsolationWindow()}: \code{numeric(1)} with the m/z value to -filter the object. For \code{filterPrecursorMz()} and \code{filterMzRange()}: -\code{numeric(2)} defining the lower and upper m/z boundary. -For \code{filterMzValues()} and \code{filterPrecursorMzValues()}: \code{numeric} with -the m/z values to match peaks or precursor m/z against.} - -\S4method{dataStorageBasePath}{Spectra}(object) <- value } \arguments{ \item{object}{For \code{Spectra()}: an object to instantiate the \code{Spectra} @@ -774,584 +288,6 @@ parameter \code{backend}. } } -\section{Accessing spectra data}{ - -\itemize{ -\item \code{$}, \verb{$<-}: gets (or sets) a spectra variable for all spectra in \code{object}. -See examples for details. Note that replacing values of a peaks variable -is not supported with a non-empty processing queue, i.e. if any filtering -or data manipulations on the peaks data was performed. In these cases -\code{\link[=applyProcessing]{applyProcessing()}} needs to be called first to apply all cached data -operations. -\item \code{[[}, \verb{[[<-}: access or set/add a single spectrum variable (column) in the -backend. -\item \code{acquisitionNum()}: returns the acquisition number of each -spectrum. Returns an \code{integer} of length equal to the number of -spectra (with \code{NA_integer_} if not available). -\item \code{centroided()}, \verb{centroided<-}: gets or sets the centroiding -information of the spectra. \code{centroided()} returns a \code{logical} -vector of length equal to the number of spectra with \code{TRUE} if a -spectrum is centroided, \code{FALSE} if it is in profile mode and \code{NA} -if it is undefined. See also \code{isCentroided()} for estimating from -the spectrum data whether the spectrum is centroided. 
\code{value} -for \verb{centroided<-} is either a single \code{logical} or a \code{logical} of -length equal to the number of spectra in \code{object}. -\item \code{collisionEnergy()}, \verb{collisionEnergy<-}: gets or sets the -collision energy for all spectra in \code{object}. \code{collisionEnergy()} -returns a \code{numeric} with length equal to the number of spectra -(\code{NA_real_} if not present/defined), \verb{collisionEnergy<-} takes a -\code{numeric} of length equal to the number of spectra in \code{object}. -\item \code{coreSpectraVariables()}: returns the \emph{core} spectra variables along with -their expected data type. -\item \code{dataOrigin()}, \verb{dataOrigin<-}: gets or sets the \emph{data origin} for each -spectrum. \code{dataOrigin()} returns a \code{character} vector (same length than -\code{object}) with the origin of the spectra. \verb{dataOrigin<-} expects a -\code{character} vector (same length than \code{object}) with the replacement -values for the data origin of each spectrum. -\item \code{dataStorage()}: returns a \code{character} vector (same length than \code{object}) -with the data storage location of each spectrum. -\item \code{intensity()}: gets the intensity values from the spectra. Returns -a \code{\link[=NumericList]{NumericList()}} of \code{numeric} vectors (intensity values for each -spectrum). The length of the list is equal to the number of -\code{spectra} in \code{object}. -\item \code{ionCount()}: returns a \code{numeric} with the sum of intensities for -each spectrum. If the spectrum is empty (see \code{isEmpty()}), -\code{NA_real_} is returned. -\item \code{isCentroided()}: a heuristic approach assessing if the spectra in -\code{object} are in profile or centroided mode. The function takes -the \code{qtl}th quantile top peaks, then calculates the difference -between adjacent m/z value and returns \code{TRUE} if the first -quartile is greater than \code{k}. (See \code{Spectra:::.isCentroided()} for -the code.) 
-\item \code{isEmpty()}: checks whether a spectrum in \code{object} is empty -(i.e. does not contain any peaks). Returns a \code{logical} vector of -length equal number of spectra. -\item \code{isolationWindowLowerMz()}, \verb{isolationWindowLowerMz<-}: gets or sets the -lower m/z boundary of the isolation window. -\item \code{isolationWindowTargetMz()}, \verb{isolationWindowTargetMz<-}: gets or sets the -target m/z of the isolation window. -\item \code{isolationWindowUpperMz()}, \verb{isolationWindowUpperMz<-}: gets or sets the -upper m/z boundary of the isolation window. -\item \code{containsMz()}: checks for each of the spectra whether they contain mass -peaks with an m/z equal to \code{mz} (given acceptable difference as defined by -parameters \code{tolerance} and \code{ppm} - see \code{\link[=common]{common()}} for details). Parameter -\code{which} allows to define whether any (\code{which = "any"}, the default) or -all (\code{which = "all"}) of the \code{mz} have to match. The function returns -\code{NA} if \code{mz} is of length 0 or is \code{NA}. -\item \code{containsNeutralLoss()}: checks for each spectrum in \code{object} if it has a -peak with an m/z value equal to its precursor m/z - \code{neutralLoss} (given -acceptable difference as defined by parameters \code{tolerance} and \code{ppm}). -Returns \code{NA} for MS1 spectra (or spectra without a precursor m/z). -\item \code{length()}: gets the number of spectra in the object. -\item \code{lengths()}: gets the number of peaks (m/z-intensity values) per -spectrum. Returns an \code{integer} vector (length equal to the -number of spectra). For empty spectra, \code{0} is returned. -\item \code{msLevel()}: gets the spectra's MS level. Returns an integer vector (names -being spectrum names, length equal to the number of spectra) with the MS -level for each spectrum. -\item \code{mz()}: gets the mass-to-charge ratios (m/z) from the -spectra. 
Returns a \code{\link[=NumericList]{NumericList()}} or length equal to the number of -spectra, each element a \code{numeric} vector with the m/z values of -one spectrum. -\item \code{peaksData()}: gets the \emph{peaks} data for all spectra in \code{object}. Peaks -data consist of the m/z and intensity values as well as possible additional -annotations (variables) of all peaks of each spectrum. The function -returns a \code{\link[=SimpleList]{SimpleList()}} of two dimensional arrays (either \code{matrix} or -\code{data.frame}), with each array providing the values for the requested -\emph{peak variables} (by default \code{"mz"} and \code{"intensity"}). Optional parameter -\code{columns} is passed to the backend's \code{peaksData()} function to allow -the selection of specific (or additional) peaks variables (columns) that -should be extracted (if available). Importantly, -it is \strong{not} guaranteed that each backend supports this parameter (while -each backend must support extraction of \code{"mz"} and \code{"intensity"} columns). -Parameter \code{columns} defaults to \code{c("mz", "intensity")} but any value -returned by \code{peaksVariables(object)} is supported. -Note also that it is possible to extract the peak data with -\code{as(x, "list")} and \code{as(x, "SimpleList")} as a \code{list} and \code{SimpleList}, -respectively. Note however that, in contrast to \code{peaksData()}, \code{as()} -does not support the parameter \code{columns}. -\item \code{peaksVariables()}: lists the available variables for mass peaks provided -by the backend. Default peak variables are \code{"mz"} and \code{"intensity"} (which -all backends need to support and provide), but some backends might provide -additional variables. -These variables correspond to the column names of the peak data array -returned by \code{peaksData()}. -\item \code{polarity()}, \verb{polarity<-}: gets or sets the polarity for each -spectrum. 
\code{polarity()} returns an \code{integer} vector (length equal -to the number of spectra), with \code{0} and \code{1} representing negative -and positive polarities, respectively. \verb{polarity<-} expects an -\code{integer} vector of length 1 or equal to the number of spectra. -\item \code{precursorCharge()}, \code{precursorIntensity()}, \code{precursorMz()}, -\code{precScanNum()}, \code{precAcquisitionNum()}: gets the charge (\code{integer}), -intensity (\code{numeric}), m/z (\code{numeric}), scan index (\code{integer}) -and acquisition number (\code{interger}) of the precursor for MS level > -2 spectra from the object. Returns a vector of length equal to -the number of spectra in \code{object}. \code{NA} are reported for MS1 -spectra of if no precursor information is available. -\item \code{rtime()}, \verb{rtime<-}: gets or sets the retention times (in seconds) -for each spectrum. \code{rtime()} returns a \code{numeric} vector (length -equal to the number of spectra) with the retention time for each -spectrum. \verb{rtime<-} expects a numeric vector with length equal -to the number of spectra. -\item \code{scanIndex()}: returns an \code{integer} vector with the \emph{scan index} -for each spectrum. This represents the relative index of the -spectrum within each file. Note that this can be different to the -\code{acquisitionNum} of the spectrum which represents the index of the -spectrum during acquisition/measurement (as reported in the mzML file). -\item \code{smoothed()},\verb{smoothed<-}: gets or sets whether a spectrum is -\emph{smoothed}. \code{smoothed()} returns a \code{logical} vector of length equal -to the number of spectra. \verb{smoothed<-} takes a \code{logical} vector -of length 1 or equal to the number of spectra in \code{object}. -\item \code{spectraData()}: gets general spectrum metadata (annotation, also called -header). \code{spectraData()} returns a \code{DataFrame}. 
Note that this -method does by default \strong{not} return m/z or intensity values. -\item \verb{spectraData<-}: \strong{replaces} the full spectra data of the \code{Spectra} -object with the one provided with \code{value}. The \verb{spectraData<-} function -expects a \code{DataFrame} to be passed as value with the same number of rows -as there a spectra in \code{object}. Note that replacing values of -peaks variables is not supported with a non-empty processing queue, i.e. -if any filtering or data manipulations on the peaks data was performed. -In these cases \code{\link[=applyProcessing]{applyProcessing()}} needs to be called first to apply all -cached data operations and empty the processing queue. -\item \code{spectraNames()}, \verb{spectraNames<-}: gets or sets the spectra names. -\item \code{spectraVariables()}: returns a \code{character} vector with the -available spectra variables (columns, fields or attributes of each -spectrum) available in \code{object}. Note that \code{spectraVariables()} does not -list the \emph{peak variables} (\code{"mz"}, \code{"intensity"} and eventual additional -annotations for each MS peak). Peak variables are returned by -\code{peaksVariables()}. -\item \code{tic()}: gets the total ion current/count (sum of signal of a -spectrum) for all spectra in \code{object}. By default, the value -reported in the original raw data file is returned. For an empty -spectrum, \code{0} is returned. -\item \code{uniqueMsLevels()}: get the unique MS levels available in \code{object}. This -function is supposed to be more efficient than \code{unique(msLevel(object))}. -} -} - -\section{Data subsetting, filtering and merging}{ - - -Subsetting and filtering of \code{Spectra} objects can be performed with the below -listed methods. -\itemize{ -\item \code{[}: subsets the spectra keeping only selected elements (\code{i}). The method -\strong{always} returns a \code{Spectra} object. 
-\item \code{cbind2()}: Appends multiple spectra variables from a \code{data.frame}, -\code{DataFrame} or \code{matrix} to the \code{Spectra} object at once. It does so -\emph{blindly} (e.g. do not check rownames compatibility) and is therefore at -the risk of the user. For a more controlled way of adding spectra -variables, the \code{joinSpectraData()} should be used. It will return a -\code{Spectra} object with the appended spectra variables. \code{cbind2()} does -check however that the number of rows of the \code{data.frame} or \code{DataFrame} -matches the number of spectra in the \code{Spectra} object. -\item \code{deisotopeSpectra()}: \emph{deisotopes} each spectrum keeping only the -monoisotopic peak for groups of isotopologues. Isotopologues are -estimated using the \code{\link[=isotopologues]{isotopologues()}} function from the -\emph{MetaboCoreUtils} package. Note that -the default parameters for isotope prediction/detection have been -determined using data from the Human Metabolome Database (HMDB) and -isotopes for elements other than CHNOPS might not be detected. See -parameter \code{substDefinition} in the documentation of \code{\link[=isotopologues]{isotopologues()}} for -more information. The approach and code to define the parameters for -isotope prediction is described -\href{https://github.com/EuracBiomedicalResearch/isotopologues}{here}. -\item \code{dropNaSpectraVariables()}: removes spectra variables (i.e. columns in the -object's \code{spectraData} that contain only missing values (\code{NA}). Note that -while columns with only \code{NA}s are removed, a \code{spectraData()} call after -\code{dropNaSpectraVariables()} might still show columns containing \code{NA} values -for \emph{core} spectra variables. -\item \code{filterAcquisitionNum()}: filters the object keeping only spectra matching -the provided acquisition numbers (argument \code{n}). 
If \code{dataOrigin} or -\code{dataStorage} is also provided, \code{object} is subsetted to the spectra with -an acquisition number equal to \code{n} \strong{in spectra with matching dataOrigin -or dataStorage values} retaining all other spectra. -Returns the filtered \code{Spectra}. -\item \code{filterDataOrigin()}: filters the object retaining spectra matching the -provided \code{dataOrigin}. Parameter \code{dataOrigin} has to be of type -\code{character} and needs to match exactly the data origin value of the -spectra to subset. -Returns the filtered \code{Spectra} object (with spectra ordered according to -the provided \code{dataOrigin} parameter). -\item \code{filterDataStorage()}: filters the object retaining spectra stored in the -specified \code{dataStorage}. Parameter \code{dataStorage} has to be of type -\code{character} and needs to match exactly the data storage value of the -spectra to subset. -Returns the filtered \code{Spectra} object (with spectra ordered according to -the provided \code{dataStorage} parameter). -\item \code{filterEmptySpectra()}: removes empty spectra (i.e. spectra without peaks). -Returns the filtered \code{Spectra} object (with spectra in their -original order). -\item \code{filterFourierTransformArtefacts()}: removes (Orbitrap) fast fourier -artefact peaks from spectra (see examples below). The function iterates -through all intensity ordered peaks in a spectrum and removes all peaks -with an m/z within +/- \code{halfWindowSize} of the current peak if their -intensity is lower than \code{threshold} times the current peak's intensity. -Additional parameters \code{keepIsotopes}, \code{maxCharge} and \code{isotopeTolerance} -allow to avoid removing of potential \verb{[13]C} isotope peaks (\code{maxCharge} -being the maximum charge that should be considered and \code{isotopeTolerance} -the absolute acceptable tolerance for matching their m/z). 
-See \code{\link[=filterFourierTransformArtefacts]{filterFourierTransformArtefacts()}} for details and background and -\code{deisotopeSpectra()} for an alternative. -\item \code{filterIntensity()}: filters each spectrum keeping only peaks with -intensities that are within the provided range or match the criteria of -the provided function. For the former, parameter \code{intensity} has to be a -\code{numeric} defining the intensity range, for the latter a \code{function} that -takes the intensity values of the spectrum and returns a \code{logical} whether -the peak should be retained or not (see examples below for details) - -additional parameters to the function can be passed with \code{...}. To -remove only peaks with intensities below a certain threshold, say 100, use -\code{intensity = c(100, Inf)}. Note: also a single value can be passed with -the \code{intensity} parameter in which case an upper limit of \code{Inf} is used. -Note that this function removes also peaks with missing intensities -(i.e. an intensity of \code{NA}). Parameter \code{msLevel.} allows to restrict the -filtering to spectra of the specified MS level(s). -\item \code{filterIsolationWindow()}: retains spectra that contain \code{mz} in their -isolation window m/z range (i.e. with an \code{isolationWindowLowerMz} <= \code{mz} -and \code{isolationWindowUpperMz} >= \code{mz}). Returns the filtered \code{Spectra} -object (with spectra in their original order). -\item \code{filterMsLevel()}: filters object by MS level keeping only spectra matching -the MS level specified with argument \code{msLevel}. Returns the filtered -\code{Spectra} (with spectra in their original order). -\item \code{filterMzRange()}: filters the object keeping or removing peaks in each -spectrum that are within the provided m/z range. Whether peaks are -retained or removed can be configured with parameter \code{keep} (default -\code{keep = TRUE}).
-\item \code{filterMzValues()}: filters the object keeping \strong{all} peaks in each -spectrum that match the provided m/z value(s) (for \code{keep = TRUE}, the -default) or removing \strong{all} of them (for \code{keep = FALSE}). The m/z -matching considers also the absolute \code{tolerance} and m/z-relative -\code{ppm} values. \code{tolerance} and \code{ppm} have to be of length 1. -\item \code{filterPolarity()}: filters the object keeping only spectra matching the -provided polarity. Returns the filtered \code{Spectra} (with spectra in their -original order). -\item \code{filterPrecursorCharge()}: retains spectra with the defined precursor -charge(s). -\item \code{filterPrecursorIsotopes()}: groups MS2 spectra based on their precursor -m/z and precursor intensity into predicted isotope groups and keep for each -only the spectrum representing the monoisotopic precursor. MS1 spectra -are returned as is. See documentation for \code{deisotopeSpectra()} below for -details on isotope prediction and parameter description. -\item \code{filterPrecursorMaxIntensity()}: filters the \code{Spectra} keeping for groups -of (MS2) spectra with similar precursor m/z values (given parameters -\code{ppm} and \code{tolerance}) the one with the highest precursor intensity. The -function filters only MS2 spectra and returns all MS1 spectra. If -precursor intensities are \code{NA} for all spectra within a spectra group, the -first spectrum of that groups is returned. -Note: some manufacturers don't provide precursor intensities. These can -however also be estimated with \code{\link[=estimatePrecursorIntensity]{estimatePrecursorIntensity()}}. -\item \code{filterPrecursorMzRange()} (previously \code{filterPrecursorMz()} which is now -deprecated): retains spectra with a precursor m/z within the -provided m/z range. See examples for details on selecting spectra with -a precursor m/z for a target m/z accepting a small difference in \emph{ppm}. 
-\item \code{filterPrecursorMzValues()}: retains spectra with precursor m/z matching -any of the provided m/z values (given \code{ppm} and \code{tolerance}). Spectra with -missing precursor m/z value (e.g. MS1 spectra) are dropped. -\item \code{filterPrecursorPeaks()}: removes peaks from each spectrum in \code{object} with -an m/z equal or larger than the m/z of the precursor, depending on the -value of parameter \code{mz}: for \code{mz = "=="} (the default) peaks with matching m/z (considering an absolute and relative acceptable difference depending on \code{tolerance} and \code{ppm}, respectively) are removed. For \code{mz = ">="} all peaks with an m/z larger or equal to the precursor m/z (minus \code{tolerance} and the \code{ppm} of the precursor m/z) are removed. Parameter \code{msLevel.} allows to restrict the filter to certain MS levels (by default the filter is applied to all MS levels). Note that no peaks are removed if the precursor m/z is \code{NA} (e.g. typically for MS1 spectra). -\item \code{filterPrecursorScan()}: retains parent (e.g. MS1) and children scans (e.g. -MS2) of acquisition number \code{acquisitionNum}. Returns the filtered -\code{Spectra} (with spectra in their original order). Parameter \code{f} allows to -define which spectra belong to the same sample or original data file ( -defaults to \code{f = dataOrigin(object)}). -\item \code{filterRt()}: retains spectra of MS level \code{msLevel} with retention -times (in seconds) within (\code{>=}) \code{rt[1]} and (\code{<=}) -\code{rt[2]}. Returns the filtered \code{Spectra} (with spectra in their -original order). -\item \code{filterRanges()}: allows filtering of the \code{Spectra} object based on user -defined \emph{numeric} ranges (parameter \code{ranges}) for one or more available -spectra variables in object (spectra variable names can be specified with -parameter \code{spectraVariables}). Spectra for which the value of a spectra -variable is within its defined range are retained.
If multiple -ranges/spectra variables are defined, the \code{match} parameter can be used -to specify whether all conditions (\code{match = "all"}; the default) or if -any of the conditions must match (\code{match = "any"}; all spectra for which -values are within any of the provided ranges are retained). -\item \code{filterValues()}: allows filtering of the \code{Spectra} object based on -similarities of \emph{numeric} values of one or more \code{spectraVariables(object)} -(parameter \code{spectraVariables}) to provided values (parameter \code{values}) -given acceptable differences (parameters tolerance and ppm). If multiple -values/spectra variables are defined, the \code{match} parameter can be used -to specify whether all conditions (\code{match = "all"}; the default) or if -any of the conditions must match (\code{match = "any"}; all spectra for which -values are within any of the provided ranges are retained). -\item \code{reduceSpectra()}: for groups of peaks with highly similar m/z values -within each spectrum (given \code{ppm} and \code{tolerance}), this function keeps -only the peak with the highest intensity removing all other peaks hence -\emph{reducing} each spectrum to the highest intensity peaks per \emph{peak group}. -Peak groups are defined using the \code{\link[=group]{group()}} function from the -\emph{MsCoreUtils} package. -\item \code{reset()}: restores the data to its original state (as much as possible): -removes any processing steps from the lazy processing queue and calls -\code{reset()} on the backend which, depending on the backend, can also undo -e.g. data filtering operations. Note that a \code{reset()} call after -\code{applyProcessing()} will not have any effect. See examples below for more -information. -\item \code{selectSpectraVariables()}: reduces the information within the object to -the selected spectra variables: all data for variables not specified will -be dropped.
For mandatory columns (i.e., those listed by -\code{\link[=coreSpectraVariables]{coreSpectraVariables()}}, such as \emph{msLevel}, \emph{rtime} ...) only -the values will be dropped but not the variable itself. Additional (or -user defined) spectra variables will be completely removed. -Returns the filtered \code{Spectra}. -\item \code{split()}: splits the \code{Spectra} object based on parameter \code{f} into a \code{list} -of \code{Spectra} objects. -\item \code{joinSpectraData()}: Individual spectra variables can be directly -added with the \verb{$<-} or \verb{[[<-} syntax. The \code{joinSpectraData()} -function allows to merge a \code{DataFrame} to the existing spectra -data. This function diverges from the \code{\link[=merge]{merge()}} method in two -main ways: -\itemize{ -\item The \code{by.x} and \code{by.y} column names must be of length 1. -\item If variable names are shared in \code{x} and \code{y}, the spectra -variables of \code{x} are not modified. It's only the \code{y} -variables that are appended the suffix defined in -\code{suffix.y}. This is to avoid modifying any core spectra -variables that would lead to an invalid object. -\item Duplicated Spectra keys (i.e. \code{x[[by.x]]}) are not -allowed. Duplicated keys in the \code{DataFrame} (i.e \code{y[[by.y]]}) -throw a warning and only the last occurrence is kept. These -should be explored and ideally be removed using for -\code{QFeatures::reduceDataFrame()}, \code{PMS::reducePSMs()} or similar -functions. -For a more general function that allows to append \code{data.frame}, -\code{DataFrame} and \code{matrix} see \code{cbind2()}. -} -} - -Several \code{Spectra} objects can be concatenated into a single object with the -\code{c()} or the \code{concatenateSpectra()} function. Concatenation will fail if the -processing queue of any of the \code{Spectra} objects is not empty or if -different backends are used in the \code{Spectra} objects. 
The spectra variables -of the resulting \code{Spectra} object is the union of the spectra variables of -the individual \code{Spectra} objects. -} - -\section{Data manipulation and analysis methods}{ - - -Many data manipulation operations, such as those listed in this section, are -not applied immediately to the spectra, but added to a -\emph{lazy processing/manipulation queue}. Operations stored in this queue are -applied on-the-fly to spectra data each time it is accessed. This lazy -execution guarantees the same functionality for \code{Spectra} objects with -any backend, i.e. backends supporting to save changes to spectrum data -(\code{\link[=MsBackendMemory]{MsBackendMemory()}}, \code{\link[=MsBackendDataFrame]{MsBackendDataFrame()}} or \code{\link[=MsBackendHdf5Peaks]{MsBackendHdf5Peaks()}}) as -well as read-only backends (such as the \code{\link[=MsBackendMzR]{MsBackendMzR()}}). -Note that for the former it is possible to apply the processing queue and -write the modified peak data back to the data storage with the -\code{applyProcessing()} function. -\itemize{ -\item \code{addProcessing()}: adds an arbitrary function that should be applied to the -peaks matrix of every spectrum in \code{object}. The function (can be passed -with parameter \code{FUN}) is expected to take a peaks matrix as input and to -return a peaks matrix. A peaks matrix is a numeric matrix with two columns, -the first containing the m/z values of the peaks and the second the -corresponding intensities. The function has to have \code{...} in its -definition. Additional arguments can be passed with \code{...}. With parameter -\code{spectraVariables} it is possible to define additional spectra variables -from \code{object} that should be passed to the function \code{FUN}. These will be -passed by their name (e.g. specifying \code{spectraVariables = "precursorMz"} -will pass the spectra's precursor m/z as a parameter named \code{precursorMz} -to the function. 
The only exception is the spectra's MS level, these will -be passed to the function as a parameter called \code{spectrumMsLevel} (i.e. -with \code{spectraVariables = "msLevel"} the MS levels of each spectrum will be -submitted to the function as a parameter called \code{spectrumMsLevel}). -Examples are provided in the package vignette. -\item \code{applyProcessing()}: for \code{Spectra} objects that use a \strong{writeable} backend -only: apply all steps from the lazy processing queue to the peak data and -write it back to the data storage. Parameter \code{f} allows to specify how -\code{object} should be split for parallel processing. This should either be -equal to the \code{dataStorage}, or \code{f = rep(1, length(object))} to disable -parallel processing alltogether. Other partitionings might result in -errors (especially if a \code{MsBackendHdf5Peaks} backend is used). -\item \code{bin()}: aggregates individual spectra into discrete (m/z) bins. Binning is -performed only on spectra of the specified MS level(s) (parameter -\code{msLevel}, by default all MS levels of \code{x}). The bins can be defined with -parameter \code{breaks} which by default are equally sized bins, with size -being defined by parameter \code{binSize}, from the minimal to the maximal m/z -of all spectra (of MS level \code{msLevel}) within \code{x}. The same bins are used -for all spectra in \code{x}. All intensity values for peaks falling into the -same bin are aggregated using the function provided with parameter \code{FUN} -(defaults to \code{FUN = sum}, i.e. all intensities are summed up). Note that -the binning operation is applied to the peak data on-the-fly upon data -access and it is possible to \emph{revert} the operation with the \code{reset()} -function (see description of \code{reset()} above). 
-\item \code{combinePeaks()}: combines mass peaks within each spectrum with a -difference in their m/z values that is smaller than the maximal -acceptable difference defined by \code{ppm} and \code{tolerance}. Parameters -\code{intensityFun} and \code{mzFun} allow to define functions to aggregate the -intensity and m/z values for each such group of peaks. With -\code{weighted = TRUE} (the default), the m/z value of the combined peak is -calculated using an intensity-weighted mean and parameter \code{mzFun} is -ignored. The \code{\link[MsCoreUtils:group]{MsCoreUtils::group()}} function is used for the grouping of -mass peaks. Parameter \code{msLevel.} allows to define selected MS levels for -which peaks should be combined. This function returns a \code{Spectra} with -the same number of spectra as the input object, but with possibly -combined peaks within each spectrum. -Additional peak variables (other than m/z and intensity) are -dropped (i.e. their values are replaced with \code{NA}) for combined peaks -unless they are constant across the combined peaks. See also -\code{reduceSpectra()} for a function to select a single \emph{representative} -mass peak for each peak group. -\item \code{combineSpectra()}: combines sets of spectra into a single spectrum per -set. For each spectrum group (set), spectra variables from the first -spectrum are used and the peak matrices are combined using the function -specified with \code{FUN}, which defaults to \code{\link[=combinePeaksData]{combinePeaksData()}}. Please -refer to the \code{\link[=combinePeaksData]{combinePeaksData()}} help page for details and options of -the actual combination of peaks across the sets of spectra and to the -package vignette for examples and alternative ways to aggregate spectra. -The sets of spectra can be specified with parameter \code{f}. -In addition it is possible to define, with parameter \code{p} if and how to -split the input data for parallel processing.
-This defaults to \code{p = x$dataStorage} and hence a per-file parallel -processing is applied for \code{Spectra} with file-based backends (such as the -\code{\link[=MsBackendMzR]{MsBackendMzR()}}). -Prior combination of the spectra all processings queued in the lazy -evaluation queue are applied. Be aware that calling \code{combineSpectra()} on a -\code{Spectra} object with certain backends that allow modifications might -\strong{overwrite} the original data. This does not happen with a -\code{MsBackendMemory} or \code{MsBackendDataFrame} backend, but with a -\code{MsBackendHdf5Peaks} backend the m/z and intensity values in the original -hdf5 file(s) will be overwritten. -The function returns a \code{Spectra} of length equal to the unique levels -of \code{f}. -\item \code{compareSpectra()}: compares each spectrum in \code{x} with each spectrum in \code{y} -using the function provided with \code{FUN} (defaults to \code{\link[=ndotproduct]{ndotproduct()}}). If -\code{y} is missing, each spectrum in \code{x} is compared with each other spectrum -in \code{x}. -The matching/mapping of peaks between the compared spectra is done with the -\code{MAPFUN} function. The default \code{\link[=joinPeaks]{joinPeaks()}} matches peaks of both spectra -and allows to keep all peaks from the first spectrum (\code{type = "left"}), -from the second (\code{type = "right"}), from both (\code{type = "outer"}) and to -keep only matching peaks (\code{type = "inner"}); see \code{\link[=joinPeaks]{joinPeaks()}} for more -information and examples). The \code{MAPFUN} function should have parameters -\code{x}, \code{y}, \code{xPrecursorMz} and \code{yPrecursorMz} as these values are passed to -the function. In addition to \code{joinPeaks()} also \code{\link[=joinPeaksGnps]{joinPeaksGnps()}} is -supported for GNPS-like similarity score calculations. 
Note that -\code{joinPeaksGnps()} should only be used in combination with -\code{FUN = MsCoreUtils::gnps} (see \code{\link[=joinPeaksGnps]{joinPeaksGnps()}} for more information and -details). Use \code{MAPFUN = joinPeaksNone} to disable internal peak -matching/mapping if a similarity scoring function is used that performs -the matching internally. -\code{FUN} is supposed to be a function to compare intensities of (matched) -peaks of the two spectra that are compared. The function needs to take two -matrices with columns \code{"mz"} and \code{"intensity"} as input and is supposed -to return a single numeric as result. In addition to the two peak matrices -the spectra's precursor m/z values are passed to the function as parameters -\code{xPrecursorMz} (precursor m/z of the \code{x} peak matrix) and \code{yPrecursorMz} -(precursor m/z of the \code{y} peak matrix). Additional parameters to functions -\code{FUN} and \code{MAPFUN} can be passed with \code{...}. Parameters \code{ppm} and -\code{tolerance} are passed to both \code{MAPFUN} and \code{FUN}. -The function returns a \code{matrix} with the results of \code{FUN} for each -comparison, number of rows equal to \code{length(x)} and number of columns -equal \code{length(y)} (i.e. element in row 2 and column 3 is the result from -the comparison of \code{x[2]} with \code{y[3]}). If \code{SIMPLIFY = TRUE} the \code{matrix} -is \emph{simplified} to a \code{numeric} if length of \code{x} or \code{y} is one. See also -the vignette for additional examples, such as using spectral entropy -similarity in the scoring. -\item \code{deisotopeSpectra()}: \emph{deisotopes} each spectrum keeping only the -monoisotopic peak for groups of isotopologues. Isotopologues are -estimated using the \code{\link[=isotopologues]{isotopologues()}} function from the \emph{MetaboCoreUtils} -package. 
Note that the default parameters for isotope -prediction/detection have been determined using data from the Human -Metabolome Database (HMDB) and isotopes for elements other than CHNOPS -might not be detected. See parameter \code{substDefinition} in the -documentation of \code{\link[=isotopologues]{isotopologues()}} for more information. The approach -and code to define the parameters for isotope prediction is described -\href{https://github.com/EuracBiomedicalResearch/isotopologues}{here}. -\item \code{entropy()}: calculates the entropy of each spectra based on the metrics -suggested by Li et al. (https://doi.org/10.1038/s41592-021-01331-z). -See also \code{\link[=nentropy]{nentropy()}} in the \emph{MsCoreUtils} package for details. -\item \code{estimatePrecursorIntensity()}: defines the precursor intensities for MS2 -spectra using the intensity of the matching MS1 peak from the -closest MS1 spectrum (i.e. the last MS1 spectrum measured before the -respective MS2 spectrum). With \code{method = "interpolation"} it is also -possible to calculate the precursor intensity based on an interpolation of -intensity values (and retention times) of the matching MS1 peaks from the -previous and next MS1 spectrum. See \code{\link[=estimatePrecursorIntensity]{estimatePrecursorIntensity()}} for -examples and more details. -\item \code{estimatePrecursorMz()}: \strong{for DDA data}: allows to estimate a fragment -spectra's precursor m/z based on the reported precursor m/z and the data -from the previous MS1 spectrum. See \code{\link[=estimatePrecursorMz]{estimatePrecursorMz()}} for details. -\item \code{neutralLoss()}: calculates neutral loss spectra for fragment spectra. See -\code{\link[=neutralLoss]{neutralLoss()}} for detailed documentation. -\item \code{processingLog()}: returns a \code{character} vector with the processing log -messages. 
-\item \code{reduceSpectra()}: keeps for groups of peaks with similar m/z values in -(given \code{ppm} and \code{tolerance}) in each spectrum only the peak with the -highest intensity removing all other peaks hence \emph{reducing} each -spectrum to the highest intensity peaks per \emph{peak group}. -Peak groups are defined using the \code{\link[=group]{group()}} function from the -\emph{MsCoreUtils} package. See also the \code{combinePeaks()} function for an -alternative function to combine peaks within each spectrum. -\item \code{scalePeaks()}: scales intensities of peaks within each spectrum depending -on parameter \code{by}. With \code{by = sum} (the default) peak intensities are -divided by the sum of peak intensities within each spectrum. The sum of -intensities is thus 1 for each spectrum after scaling. Parameter -\code{msLevel.} allows to apply the scaling of spectra of a certain MS level. -By default (\code{msLevel. = uniqueMsLevels(x)}) intensities for all -spectra will be scaled. -\item \code{spectrapply()}: applies a given function to each individual spectrum or -sets of a \code{Spectra} object. By default, the \code{Spectra} is split into -individual spectra (i.e. \code{Spectra} of length 1) and the function \code{FUN} -is applied to each of them. An alternative splitting can be defined with -parameter \code{f}. Parameters for \code{FUN} can be passed using \code{...}. -The returned result and its order depend on the function \code{FUN} and how -\code{object} is split (hence on \code{f}, if provided). Parallel processing is -supported and can be configured with parameter \code{BPPARAM}, is however only -suggested for computational intense \code{FUN}. -As an alternative to the (eventual parallel) processing of the full -\code{Spectra}, \code{spectrapply()} supports also a chunk-wise processing. For this, -parameter \code{chunkSize} needs to be specified. 
\code{object} is then split into -chunks of size \code{chunkSize} which are then (stepwise) processed by \code{FUN}. -This guarantees a lower memory demand (especially for on-disk backends) -since only the data for one chunk needs to be loaded into memory in each -iteration. Note that by specifying \code{chunkSize}, parameters \code{f} and -\code{BPPARAM} will be ignored. -See also \code{\link[=chunkapply]{chunkapply()}} or examples below for details on chunk-wise -processing. -\item \code{smooth()}: smooths individual spectra using a moving window-based approach -(window size = \code{2 * halfWindowSize}). Currently, the -Moving-Average- (\code{method = "MovingAverage"}), -Weighted-Moving-Average- (\verb{method = "WeightedMovingAverage")}, -weights depending on the distance of the center and calculated -\code{1/2^(-halfWindowSize:halfWindowSize)}) and -Savitzky-Golay-Smoothing (\code{method = "SavitzkyGolay"}) are supported. -For details how to choose the correct \code{halfWindowSize} please see -\code{\link[MsCoreUtils:smooth]{MsCoreUtils::smooth()}}. -\item \code{pickPeaks()}: picks peaks on individual spectra using a moving -window-based approach (window size = \code{2 * halfWindowSize}). For noisy -spectra there are currently two different noise estimators available, -the \emph{M}edian \emph{A}bsolute \emph{D}eviation (\code{method = "MAD"}) and -Friedman's Super Smoother (\code{method = "SuperSmoother"}), -as implemented in the \code{\link[MsCoreUtils:noise]{MsCoreUtils::noise()}}. -The method supports also to optionally \emph{refine} the m/z value of -the identified centroids by considering data points that belong (most -likely) to the same mass peak. Therefore the m/z value is calculated as an -intensity weighted average of the m/z values within the peak region. 
-The peak region is defined as the m/z values (and their respective -intensities) of the \code{2 * k} closest signals to the centroid or the closest -valleys (\code{descending = TRUE}) in the \code{2 * k} region. For the latter the \code{k} -has to be chosen general larger. See \code{\link[MsCoreUtils:refineCentroids]{MsCoreUtils::refineCentroids()}} for -details. -If the ratio of the signal to the highest intensity of the peak is below -\code{threshold} it will be ignored for the weighted average. -\item \code{replaceIntensitiesBelow()}: replaces intensities below a specified -threshold with the provided \code{value}. Parameter \code{threshold} can be either -a single numeric value or a function which is applied to all non-\code{NA} -intensities of each spectrum to determine a threshold value for each -spectrum. The default is \code{threshold = min} which replaces all values -which are <= the minimum intensity in a spectrum with \code{value} (the -default for \code{value} is \code{0}). Note that the function specified with -\code{threshold} is expected to have a parameter \code{na.rm} since \code{na.rm = TRUE} -will be passed to the function. If the spectrum is in profile mode, -ranges of successive non-0 peaks <= \code{threshold} are set to 0. -Parameter \code{msLevel.} allows to apply this to only spectra of certain MS -level(s). -} -} - \examples{ ## -------- CREATION OF SPECTRA OBJECTS -------- @@ -1410,314 +346,6 @@ head(dataOrigin(sciex)) head(dataOrigin(sciex_im)) -## ---- ACCESSING AND ADDING DATA ---- - -## Get the MS level for each spectrum. -msLevel(data) - -## Alternatively, we could also use $ to access a specific spectra variable. -## This could also be used to add additional spectra variables to the -## object (see further below). -data$msLevel - -## Get the intensity and m/z values. 
-intensity(data) -mz(data) - -## Determine whether one of the spectra has a specific m/z value -containsMz(data, mz = 120.4) - -## Accessing spectra variables works for all backends: -intensity(sciex) -intensity(sciex_im) - -## Get the m/z for the first spectrum. -mz(data)[[1]] - -## Get the peak data (m/z and intensity values). -pks <- peaksData(data) -pks -pks[[1]] -pks[[2]] - -## Note that we could get the same result by coercing the `Spectra` to -## a `list` or `SimpleList`: -as(data, "list") -as(data, "SimpleList") - -## List all available spectra variables (i.e. spectrum data and metadata). -spectraVariables(data) - -## For all *core* spectrum variables accessor functions are available. These -## return NA if the variable was not set. -centroided(data) -dataStorage(data) -rtime(data) -precursorMz(data) - -## The core spectra variables are: -coreSpectraVariables() - -## Add an additional metadata column. -data$spectrum_id <- c("sp_1", "sp_2") - -## List spectra variables, "spectrum_id" is now also listed -spectraVariables(data) - -## Get the values for the new spectra variable -data$spectrum_id - -## Extract specific spectra variables. -spectraData(data, columns = c("spectrum_id", "msLevel")) - -## Drop spectra variable data and/or columns. -res <- selectSpectraVariables(data, c("mz", "intensity")) - -## This removed the additional columns "spectrum_id" and deleted all values -## for all spectra variables, except "mz" and "intensity". -spectraData(res) - -## Compared to the data before selectSpectraVariables. -spectraData(data) - - -## ---- SUBSETTING, FILTERING AND COMBINING - -## Subset to all MS2 spectra. -data[msLevel(data) == 2] - -## Append new `spectraVariables` to the `spectraData` -df <- data.frame(cola = 4:5, colb = "b") -data_append <- cbind2(data, df) - -## Same with the filterMsLevel function -filterMsLevel(data, 2) - -## Below we combine the `data` and `sciex_im` objects into a single one.
-data_comb <- c(data, sciex_im) - -## The combined Spectra contains a union of all spectra variables: -head(data_comb$spectrum_id) -head(data_comb$rtime) -head(data_comb$dataStorage) -head(data_comb$dataOrigin) - -## Filter a Spectra for a target precursor m/z with a tolerance of 10ppm -spd$precursorMz <- c(323.4, 543.2302) -data_filt <- Spectra(spd) -filterPrecursorMzRange(data_filt, mz = 543.23 + ppm(c(-543.23, 543.23), 10)) - -## Filter a Spectra keeping only peaks matching certain m/z values -sps_sub <- filterMzValues(data, mz = c(103, 104), tolerance = 0.3) -mz(sps_sub) - -## This function can also be used to remove specific peaks from a spectrum -## by setting `keep = FALSE`. -sps_sub <- filterMzValues(data, mz = c(103, 104), - tolerance = 0.3, keep = FALSE) -mz(sps_sub) - -## Note that `filterMzValues()` keeps or removes all peaks with a matching -## m/z given the provided `ppm` and `tolerance` parameters. - -## Filter a Spectra keeping only peaks within a m/z range -sps_sub <- filterMzRange(data, mz = c(100, 300)) -mz(sps_sub) - -## Remove empty spectra variables -sciex_noNA <- dropNaSpectraVariables(sciex) - -## Available spectra variables before and after `dropNaSpectraVariables()` -spectraVariables(sciex) -spectraVariables(sciex_noNA) - - -## Adding new spectra variables -sciex1 <- filterDataOrigin(sciex, dataOrigin(sciex)[1]) -spv <- DataFrame(spectrumId = sciex1$spectrumId[3:12], ## used for merging - var1 = rnorm(10), - var2 = sample(letters, 10)) -spv - -sciex2 <- joinSpectraData(sciex1, spv, by.y = "spectrumId") - -spectraVariables(sciex2) -spectraData(sciex2)[1:13, c("spectrumId", "var1", "var2")] - -## Removing fourier transform artefacts seen in Orbitrap data. - -## Loading an Orbitrap spectrum with artefacts.
-data(fft_spectrum) -plotSpectra(fft_spectrum, xlim = c(264.5, 265.5)) -plotSpectra(fft_spectrum, xlim = c(264.5, 265.5), ylim = c(0, 5e6)) - -fft_spectrum <- filterFourierTransformArtefacts(fft_spectrum) -fft_spectrum -plotSpectra(fft_spectrum, xlim = c(264.5, 265.5), ylim = c(0, 5e6)) - -## Using a few examples peaks in your data you can optimize the parameters -fft_spectrum_filtered <- filterFourierTransformArtefacts(fft_spectrum, - halfWindowSize = 0.2, - threshold = 0.005, - keepIsotopes = TRUE, - maxCharge = 5, - isotopeTolerance = 0.005 - ) - -fft_spectrum_filtered -length(mz(fft_spectrum_filtered)[[1]]) -plotSpectra(fft_spectrum_filtered, xlim = c(264.5, 265.5), ylim = c(0, 5e6)) - -## Using filterRanges to filter spectra object based on variables available -## in `spectraData`. -## First, determine the variable(s) on which to base the filtering: -sv <- c("rtime", "precursorMz", "peaksCount") -## Note that ANY variables can be chosen here, and as many as wanted. - -## Define the ranges (pairs of values with lower and upper boundary) to be -## used for the individual spectra variables. The first two values will be -## used for the first spectra variable (e.g., rtime here), the next two for -## the second (e.g. precursorMz here) and so on: -ranges <- c(30, 350, 200,500, 350, 600) - -## Input the parameters within the filterRanges function: -filt_spectra <- filterRanges(sciex, spectraVariables = sv, - ranges = ranges) - -## Using `filterRanges()` to filter spectra object with multiple ranges for -## the same `spectraVariable` (e.g, here rtime) -sv <- c("rtime", "rtime") -ranges <- c(30, 100, 200, 300) -filt_spectra <- filterRanges(sciex, spectraVariables = sv, - ranges = ranges, match = "any") - -## Using filterValues in a similar way to a filter spectra object based on -## variables available in `spectraData`. 
However, this time not based on -## ranges but similarities to user input single values with given -## tolerance/ppm -## First determine the variable(s) on which to base the filtering: -sv <- c("rtime", "precursorMz") -## Note that ANY variables can be chosen here, and as many as wanted. - -## Define the values that will be used to filter the spectra based on their -## similarities to their respective spectraVariables. -## The first values in the parameters values, tolerance and ppm will be -## used for the first spectra variable (e.g. rtime here), the next for the -## second (e.g. precursorMz here) and so on: -values <- c(350, 400) -tolerance <- c(100, 0) -ppm <- c(0,50) - -## Input the parameters within the `filterValues()` function: -filt_spectra <- filterValues(sciex, spectraVariables = sv, - values = values, tolerance = tolerance, ppm = ppm) - -## ---- DATA MANIPULATIONS AND OTHER OPERATIONS ---- - -## Set the data to be centroided -centroided(data) <- TRUE - -## Replace peak intensities below 40 with 3. -res <- replaceIntensitiesBelow(data, threshold = 40, value = 3) -res - -## Get the intensities of the first and second spectrum. -intensity(res)[[1]] -intensity(res)[[2]] - -## Remove all peaks with an intensity below 40. -res <- filterIntensity(res, intensity = c(40, Inf)) - -## Get the intensities of the first and second spectrum. -intensity(res)[[1]] -intensity(res)[[2]] - -## Lengths of spectra is now different -lengths(mz(res)) -lengths(mz(data)) - -## In addition it is possible to pass a function to `filterIntensity()`: in -## the example below we want to keep only peaks that have an intensity which -## is larger than one third of the maximal peak intensity in that spectrum. -keep_peaks <- function(x, prop = 3) { - x > max(x, na.rm = TRUE) / prop -} -res2 <- filterIntensity(data, intensity = keep_peaks) -intensity(res2)[[1L]] -intensity(data)[[1L]] - -## We can also change the proportion by simply passing the `prop` parameter -## to the function. 
To keep only peaks that have an intensity which is -## larger than half of the maximum intensity: -res2 <- filterIntensity(data, intensity = keep_peaks, prop = 2) -intensity(res2)[[1L]] -intensity(data)[[1L]] - -## Since data manipulation operations are by default not directly applied to -## the data but only added to the internal lazy evaluation queue, it is also -## possible to remove these data manipulations with the `reset()` function: -res_rest <- reset(res) -res_rest -lengths(mz(res_rest)) -lengths(mz(res)) -lengths(mz(data)) - -## `reset()` after a `applyProcessing()` can not restore the data, because -## the data in the backend was changed. Similarly, `reset()` after any -## filter operations can not restore data for a `Spectra` with a -## `MsBackendMemory` or `MsBackendDataFrame`. -res_2 <- applyProcessing(res) -res_rest <- reset(res_2) -lengths(mz(res)) -lengths(mz(res_rest)) - - -## Compare spectra: comparing spectra 2 and 3 against spectra 10:20 using -## the normalized dotproduct method. -res <- compareSpectra(sciex_im[2:3], sciex_im[10:20]) -## first row contains comparisons of spectrum 2 with spectra 10 to 20 and -## the second row comparisons of spectrum 3 with spectra 10 to 20 -res - -## To use a simple Pearson correlation instead we can define a function -## that takes the two peak matrices and calculates the correlation for -## their second columns (containing the intensity values). -correlateSpectra <- function(x, y, use = "pairwise.complete.obs", ...) { - cor(x[, 2], y[, 2], use = use) -} -res <- compareSpectra(sciex_im[2:3], sciex_im[10:20], - FUN = correlateSpectra) -res - -## Use compareSpectra to determine the number of common (matching) peaks -## with a ppm of 10: -## type = "inner" uses a *inner join* to match peaks, i.e. keeps only -## peaks that can be mapped betwen both spectra. The provided FUN returns -## simply the number of matching peaks. 
-compareSpectra(sciex_im[2:3], sciex_im[10:20], ppm = 10, type = "inner", - FUN = function(x, y, ...) nrow(x)) - -## Apply an arbitrary function to each spectrum in a Spectra. -## In the example below we calculate the mean intensity for each spectrum -## in a subset of the sciex_im data. Note that we can access all variables -## of each individual spectrum either with the `$` operator or the -## corresponding method. -res <- spectrapply(sciex_im[1:20], FUN = function(x) mean(x$intensity[[1]])) -head(res) - -## It is however important to note that dedicated methods to access the -## data (such as `intensity`) are much more efficient than using `lapply()`: -res <- lapply(intensity(sciex_im[1:20]), mean) -head(res) - -## As an alternative, applying a function `FUN` to a `Spectra` can be -## performed *chunk-wise*. The advantage of this is, that only the data for -## one chunk at a time needs to be loaded into memory reducing the memory -## demand. This type of processing can be performed by specifying the size -## of the chunks (i.e. number of spectra per chunk) with the `chunkSize` -## parameter -spectrapply(sciex_im[1:20], lengths, chunkSize = 5L) - ## -------- DATA EXPORT -------- ## Some `MsBackend` classes provide an `export()` method to export the data diff --git a/man/filterMsLevel.Rd b/man/filterMsLevel.Rd index 0ea3698b..09ed5b29 100644 --- a/man/filterMsLevel.Rd +++ b/man/filterMsLevel.Rd @@ -312,6 +312,18 @@ defining whether the condition has to match for all provided \item{values}{for \code{filterValues()}: A \code{numeric} vector that define the values to filter the Spectra data. These values need to be in the same order as the \code{spectraVariables} parameter.} + +\item{weighted}{For \code{combinePeaks()}: \code{logical(1)} whether m/z values of +peaks within each peak group should be aggregated into a single m/z +value using an intensity-weighted mean. 
Defaults to \code{weighted = TRUE}.} + +\item{which}{for \code{containsMz()}: either \code{"any"} or \code{"all"} defining whether +any (the default) or all provided \code{mz} have to be present in the +spectrum.} + +\item{y}{A \code{Spectra} object. +- For \code{joinSpectraData()}: a \code{DataFrame}. +- For \code{cbind2()}: a \code{data.frame}, \code{DataFrame} or \code{matrix}.} } \description{ A variety of functions to filter or subset \code{Spectra} objects are available. @@ -336,6 +348,30 @@ a subset of the original object without affecting its content. \itemize{ \item \code{[}: subsets the spectra keeping only selected elements (\code{i}). The method \strong{always} returns a \code{Spectra} object. +\item \code{cbind2()}: Appends multiple spectra variables from a \code{data.frame}, +\code{DataFrame} or \code{matrix} to the \code{Spectra} object at once. It does so +\emph{blindly} (e.g. it does not check rownames compatibility) and its use is therefore at +the user's own risk. For a more controlled way of adding spectra +variables, \code{joinSpectraData()} should be used. It will return a +\code{Spectra} object with the appended spectra variables. \code{cbind2()} does +check however that the number of rows of the \code{data.frame} or \code{DataFrame} +matches the number of spectra in the \code{Spectra} object. +\item \code{deisotopeSpectra()}: \emph{deisotopes} each spectrum keeping only the +monoisotopic peak for groups of isotopologues. Isotopologues are +estimated using the \code{\link[=isotopologues]{isotopologues()}} function from the +\emph{MetaboCoreUtils} package. Note that +the default parameters for isotope prediction/detection have been +determined using data from the Human Metabolome Database (HMDB) and +isotopes for elements other than CHNOPS might not be detected. See +parameter \code{substDefinition} in the documentation of \code{\link[=isotopologues]{isotopologues()}} for +more information.
The approach and code to define the parameters for +isotope prediction are described +\href{https://github.com/EuracBiomedicalResearch/isotopologues}{here}. +\item \code{dropNaSpectraVariables()}: removes spectra variables (i.e. columns in the +object's \code{spectraData}) that contain only missing values (\code{NA}). Note that +while columns with only \code{NA}s are removed, a \code{spectraData()} call after +\code{dropNaSpectraVariables()} might still show columns containing \code{NA} values +for \emph{core} spectra variables. \item \code{filterAcquisitionNum()}: filters the object keeping only spectra matching the provided acquisition numbers (argument \code{n}). If \code{dataOrigin} or \code{dataStorage} is also provided, \code{object} is subsetted to the spectra with @@ -438,6 +474,27 @@ be dropped. For mandatory columns (i.e., those listed by the values will be dropped but not the variable itself. Additional (or user defined) spectra variables will be completely removed. Returns the filtered \code{Spectra}. +\item \code{joinSpectraData()}: Individual spectra variables can be directly +added with the \verb{$<-} or \verb{[[<-} syntax. The \code{joinSpectraData()} +function allows merging a \code{DataFrame} into the existing spectra +data. This function diverges from the \code{\link[=merge]{merge()}} method in two +main ways: +\itemize{ +\item The \code{by.x} and \code{by.y} column names must be of length 1. +\item If variable names are shared in \code{x} and \code{y}, the spectra +variables of \code{x} are not modified. It's only the \code{y} +variables that get the suffix defined in +\code{suffix.y} appended. This is to avoid modifying any core spectra +variables that would lead to an invalid object. +\item Duplicated Spectra keys (i.e. \code{x[[by.x]]}) are not +allowed. Duplicated keys in the \code{DataFrame} (i.e. \code{y[[by.y]]}) +throw a warning and only the last occurrence is kept.
These +should be explored and ideally be removed using, for example, +\code{QFeatures::reduceDataFrame()}, \code{PSMatch::reducePSMs()} or similar +functions. +For a more general function that allows appending a \code{data.frame}, +\code{DataFrame} or \code{matrix} see \code{cbind2()}. +} } } @@ -595,6 +652,13 @@ filt_spectra ## Remove spectra variables without content (i.e. with only missing values) sps_noNA <- dropNaSpectraVariables(sps_dda) +## Append new `spectraVariables` to the `spectraData` +df <- data.frame(cola = 4:5, colb = "b") +data_append <- cbind2(data, df) + +## Same with the filterMsLevel function +filterMsLevel(data, 2) + ## This reduced the size of the object slightly print(object.size(sps_dda), unit = "MB") print(object.size(sps_noNA), unit = "MB")