blog on Miao Yu | 于淼

Reactomics analysis for MS-only data

Wed, 29 May 2024 00:00:00 +0000

Recently, I received multiple requests for reactomics analysis of MS-only data such as FT-ICR MS or MS imaging data. In this case, it’s better to summarize the answer with an example as a reference. Here you are!

When retention time is not provided, the m/z vector can still be used to check reaction-level changes. To apply this analysis, you need to install the development version (>= 0.2.6) of the pmd package:

# install the development version of pmd from GitHub (needs the remotes package)
remotes::install_github('yufree/pmd')

## Using github PAT from envvar GITHUB_PAT. Use `gitcreds::gitcreds_set()` and unset GITHUB_PAT in .Renviron (or elsewhere) if you want to use the more secure git credential store instead.

## Downloading GitHub repo yufree/pmd@HEAD

## 
## ── R CMD build ─────────────────────────────────────────────────────────────────
## * checking for file ‘/private/var/folders/nj/68q18qjd2x1cb8my282c58cr0000gn/T/Rtmpx52AfQ/remotes44f5531ee188/yufree-pmd-87e8de1/DESCRIPTION’ ... OK
## * preparing ‘pmd’:
## * checking DESCRIPTION meta-information ... OK
## * checking for LF line-endings in source and make files and shell scripts
## * checking for empty or unneeded directories
## * building ‘pmd_0.2.6.tar.gz’

You can still use getrda to find the high frequency PMDs.

# load the pmd package and its demo dataset
library(pmd)
data(spmeinvivo)
# get the m/z vector of all the peaks
mz <- spmeinvivo$mz
# get the intensity of all the peaks; the row order is the same as in mz
insms <- spmeinvivo$data
# find the high frequency PMDs from the m/z vector alone
sda <- getrda(mz)

## 164462 pmd found.

## 20 pmd used.

# the column names of the result are the high frequency PMD values
colnames(sda)

##  [1] "0"       "1.001"   "1.002"   "1.003"   "1.004"   "2.015"   "2.016"  
##  [8] "14.015"  "17.026"  "18.011"  "21.982"  "28.031"  "28.032"  "44.026" 
## [15] "67.987"  "67.988"  "88.052"  "116.192" "135.974" "135.975"

# save them as a numeric vector for later use
hfpmd <- as.numeric(colnames(sda))

Then the getpmddf function can be used to extract all the paired ions for a certain PMD.

# get the paired ions for one PMD (here 18.011)
pmddf <- getpmddf(mz, pmd = 18.011, digits = 3)
# look up the intensity of both ions of every pair;
# drop = FALSE keeps a matrix even when only a single pair is found,
# otherwise the result collapses to a vector and the column sums below fail
mz1ins <- insms[match(pmddf$ms1, mz), , drop = FALSE]
mz2ins <- insms[match(pmddf$ms2, mz), , drop = FALSE]
# get the pmd pair intensity
pmdins <- mz1ins + mz2ins
# get the pmd total intensity across samples
pmdinsall <- colSums(pmdins)
# show the PMD intensity
pmdinsall

## 1405_Fish1_F1 1405_Fish1_F2 1405_Fish1_F3 1405_Fish2_F1 1405_Fish2_F2 
##       9898514       7801273      10363201       5847334      10479551 
## 1405_Fish2_F3 1405_Fish3_F1 1405_Fish3_F2 1405_Fish3_F3 
##       7021375      10584976      12989961      12559649

You can also calculate the static or dynamic PMD intensity for m/z only data.

# ratio of the larger m/z ion over the smaller m/z ion for every pair
ratio <- mz2ins / mz1ins
# a pair counts as static when the RSD of its ratio across samples
# is below the cutoff (30%)
cutoff <- 0.3
# RSD of the ratio for every pair
rsd <- apply(ratio, 1, function(x) sd(x) / mean(x))
# logical index of the static PMD pairs
rsdidx <- rsd < cutoff
# static pairs and their pair intensity
pmddfstatic <- pmddf[rsdidx, ]
pmdinsstatic <- pmdins[rsdidx, ]
# scale every pair across samples so that ions with large response
# factors do not dominate the sum
pmdinsstaticscale <- t(scale(t(pmdinsstatic)))
# sum the scaled static intensity for every sample
pmdinsstaticall <- colSums(pmdinsstaticscale)
# show the PMD static intensity for each sample
pmdinsstaticall

## 1405_Fish1_F1 1405_Fish1_F2 1405_Fish1_F3 1405_Fish2_F1 1405_Fish2_F2 
##         1.027       -16.704         2.374       -27.241        12.434 
## 1405_Fish2_F3 1405_Fish3_F1 1405_Fish3_F2 1405_Fish3_F3 
##       -17.758         7.924        19.803        18.142

# a pair counts as dynamic when the RSD of its ratio across samples
# reaches the cutoff
rsd <- apply(ratio, 1, function(x) sd(x) / mean(x))
rsdidx <- rsd >= cutoff
# dynamic pairs
pmddfdynamic <- pmddf[rsdidx, ]
# RSD across samples of each ion of the dynamic pairs
pmdinsdynamicms1 <- apply(mz1ins[rsdidx, ], 1, function(x) sd(x) / mean(x))
pmdinsdynamicms2 <- apply(mz2ins[rsdidx, ], 1, function(x) sd(x) / mean(x))
# pairs whose smaller m/z ion varies more than the larger one
idx <- pmdinsdynamicms1 > pmdinsdynamicms2
# use the ratio as intensity; for the pairs flagged above the ratio is
# inverted so that the more stable ion is the denominator
dynratio <- ratio[rsdidx, ]
pmdinsdynamic <- dynratio
pmdinsdynamic[idx, ] <- 1 / dynratio[idx, ]
# sum the dynamic intensity for every sample
pmdinsdynamicall <- colSums(pmdinsdynamic)
# show the PMD dynamic intensity for each sample
pmdinsdynamicall

## 1405_Fish1_F1 1405_Fish1_F2 1405_Fish1_F3 1405_Fish2_F1 1405_Fish2_F2 
##         374.2         315.6         388.0         207.8         233.4 
## 1405_Fish2_F3 1405_Fish3_F1 1405_Fish3_F2 1405_Fish3_F3 
##         199.9         283.5         328.0         256.2

You can also use the getpmddf function to extract all the paired ions for multiple PMDs. Then you can generate a network based on the output.

# get the paired ions for all the high frequency PMDs at once
pmddf <- getpmddf(mz,pmd=hfpmd,digits = 3)
# load igraph to build and plot the network
library(igraph)

## 
## Attaching package: 'igraph'

## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum

## The following object is masked from 'package:base':
## 
##     union

# build an undirected network from the paired ions table
net <- graph_from_data_frame(pmddf, directed = FALSE)
# PMD attached to every edge
edgepmd <- E(net)$diff2
# one rainbow colour per distinct PMD, looked up for every edge
pal <- grDevices::rainbow(length(unique(edgepmd)))
edgecol <- pal[as.numeric(as.factor(edgepmd))]
plot(net,
     vertex.label = NA,
     vertex.size = 5,
     edge.width = 3,
     edge.color = edgecol,
     main = 'PMD network')
# legend entries and colours are both listed in order of first appearance
legend("topright",
       bty = "n",
       legend = unique(edgepmd),
       fill = unique(edgecol),
       border = NA,
       horiz = FALSE)

If you prefer to get a PMD network for a specific mass, you can still use the getchain function.

data(spmeinvivo)
# remove the retention time so that only m/z is used
spmeinvivo$rt <- NULL
chain <- getchain(spmeinvivo,
                  diff = c(2.02, 14.02, 15.99, 58.04, 13.98),
                  mass = 286.3101,
                  digits = 2,
                  corcutoff = 0)
# show as network
net <- graph_from_data_frame(chain$sdac, directed = FALSE)
pal <- grDevices::rainbow(5)
# node names converted back to numbers for labelling and highlighting
nodemz <- as.numeric(V(net)$name)
# colour of every edge according to its PMD
edgecol <- pal[as.numeric(as.factor(E(net)$diff2))]
plot(net,
     vertex.label = round(nodemz, 2),
     vertex.size = 5,
     edge.width = 3,
     edge.color = edgecol,
     vertex.label.dist = 1,
     # the node of the requested mass is drawn in red
     vertex.color = ifelse(round(nodemz, 4) %in% 286.3101, 'red', 'black'),
     main = 'PMD network')
legend("topright",
       bty = "n",
       legend = unique(E(net)$diff2),
       fill = unique(edgecol),
       border = NA,
       horiz = FALSE)

Call for Papers: Artificial Intelligence and Machine Learning for Environmental & Health

Sun, 10 Mar 2024 00:00:00 +0000

Artificial intelligence (AI) and machine learning (ML) are transformative fields of computer science. They empower researchers to develop algorithms and models capable of extracting meaningful insights, making predictions, and automating tasks by analyzing and learning from data. Due to the intricate nature of environmental health issues, the integration of AI and machine learning is imperative. These advanced computational technologies have the potential to revolutionize environmental health studies by fundamentally improving and advancing environmental exposure assessment, environmental health risk assessment, and environmental policy development.

This Virtual Special Issue from Environment & Health extends an invitation to scientists to share their innovative work on leveraging AI and ML for environmental health studies.

We welcome contributions of Articles, Reviews, Perspectives or Viewpoints that delve into topics including, but not limited to:

- Source apportionment
- Chemical toxicity prediction
- Identification or screening of unknown pollutants
- Human exposure assessment
- Molecular mechanisms between exposure and disease
- Data compliance and ethics

By sharing these findings or perspectives, we hope to spur further innovation and advancements in this critical and rapidly evolving field.

Organizing Editors Miao Yu, Ph.D., Guest Editor The Jackson Laboratory, USA

Mingliang Fang, Ph.D., Guest Editor Fudan University, China

Zhenyu Tian, Ph.D., Guest Editor Northeastern University, USA

Bin Wang, Ph.D., Guest Editor Peking University, China

Douglas Walker, Ph.D., Guest Editor Emory University, USA

Yuming Guo, Ph.D., Associate Editor, Environment & Health Monash University, Australia

Molecular networking in R

Sun, 25 Jun 2023 00:00:00 +0000

I found lots of research using molecular networking in this year’s ASMS annual meeting. However, I didn’t find R code or package for molecular networking. It seems most people using molecular networking are using GNPS and don’t talk too much about the algorithm behind molecular networking. In this post, I will make a brief introduction about molecular networking and show some dirty code to perform molecular networking in R.

What is molecular networking?

Molecular networking is more about molecular network linked by MS2 similarity. In the network, nodes represent compounds with different MS2 spectra and edges represent the similarity of their MS2 spectra. When two compounds are connected by edge, they should have structure similarity and potential biological functional similarity.

From this definition, we know the precursors of connected compounds should be different. This is the major difference between molecular networking and MS2 spectra matching. In MS2 spectra matching, the purpose is identification of unknown MS2 spectra. In molecular networking, the purpose is classification of similar compounds. If one node in the molecular network is a known compound, we could infer that the other nodes connected with it should also be similar compounds, such as metabolites or congeners. Though most of the users of GNPS use molecular networking as an annotation tool, the most unique feature of molecular networking is to interpret the network for biological purposes. In the original publication of molecular networking, the tool was designed to find new natural products, not for identification purposes only. This post is also not focused on identification and cares more about the relation network among molecules.

How to define MS2 similarity?

If you are familiar with MS2 spectra matching, you might realize that the precursors of two matching spectra should be the same or show an isotopologue shift. However, molecular networking will consider the spectra similarity with different precursors, which is called modified cosine similarity in their original paper.

Before we discuss the modified cosine similarity, let’s review cosine similarity. Cosine similarity is very straightforward. If we have two vectors like [1,10,1] and [10,100,10], the cosine similarity is the normalized dot product, which can also be interpreted as the cosine of the angle between the two vectors. For vectors [1,10,1] and [10,100,10], the value should be:

$$ \cos(\theta) = \frac{1 \times 10 + 10 \times 100 + 1 \times 10}{\sqrt{1 \times 1 + 10 \times 10 + 1 \times 1} \times \sqrt{10 \times 10 + 100 \times 100 + 10 \times 10}} = 1 $$

In this case, the cosine value is 1 and angle should be 0. Those two vectors are similar in terms of cosine similarity.

For MS2 spectra matching, such two vectors should be the intensities with same m/z. In this case, you need to define the tolerance of m/z shifts to align two MS2 spectra before the calculation of cosine similarity.

OK, I hope you understand the regular way to compare two MS2 spectra now. Now we need to modify this algorithm for molecular networking:

Step 1: calculate paired mass distance between precursors
Step 2: Apply this mass distance to all the query MS2 spectra to generate a shifted version of MS2 spectra with the same intensity profile
Step 3: Align the m/z between query MS2 spectra (both the original and the shifted version of the query MS2 spectra) and target MS2 spectra
Step 4: Calculate the cosine similarity between the aligned intensity as modified cosine similarity

I know you might still be confused about the algorithm, so I will give an example. Compound A has m/z 300 as precursor and m/z [100,200,250] as fragment ions with intensity [100,200,300]. Compound B has m/z 315.995 as precursor and m/z [100, 200, 265.995] as fragment ions with intensity [10,20,30].

Step 1: the paired mass distance of precursors is 15.995
Step 2: we generate a shift version of spectra A with m/z [115.995, 215.995, 265.995] with intensity [100,200,300]
Step 3: Align both the original and shifted spectra A with spectra B. We got aligned m/z[100,200,265.995] with intensity[100,200,300] for A and [10,20,30] for B
Step 4: the cosine similarity between A and B is 1, which means A and B are structurally similar to each other

In the above example, compound B is the oxidized metabolite of compound A. One fragment ion shows the mass shift of oxidation while the smaller ions do not contain the oxidized part. In our example, m/z 250 from A is aligned to m/z 265.995 from B considering the mass shift of precursors while the other two ions (m/z 100 and 200) are still aligned between the raw spectra of A and B. All three fragment ions are aligned for the modified cosine similarity calculation. Such a scenario is common for real-world compounds, and it is a smart way to link compounds with similar MS2 spectra. Now we can infer that compound B should be a metabolite of compound A by checking and interpreting the mass shift of precursors.

Difference between molecular networking and PMD network

I also published tools to construct paired mass distance (PMD) networks from MS1-only data. You might ask about the differences between molecular networking and PMD network. Here are the similarities and differences:

In both PMD network and molecular networking, nodes are different compounds and the connections could be displayed as paired mass distances or mass shifts. Both of them could be used to interpret relations among the compounds found in certain samples.

In PMD network, the paired mass distance is defined by paired mass distance of two MS1 ions. To perform PMD network analysis, you need to remove the redundant peaks from the same compounds by GlobalStd algorithm. Only the predefined PMDs will be used for connection. Such a PMD list could be generated based on domain knowledge or purely based on the frequency of PMDs among ions. When some PMDs are always found, the corresponding reactions should be considered as important relations.

In molecular networking, the paired mass distance is calculated between two precursors of two MS2 spectra. Modified cosine similarity is used to define the connection, which can also be interpreted by mass shifts. Here, you don’t need to tell the mass shifts of precursors and the algorithm will do this job. The only issue is that you need high quality MS2 data. In my experience, MS2 data collected for certain projects are always ‘identify’ the similar compounds profile and DDA mode usually only cover 10-20% of the MS1 ions found in corresponding MS1 full scan data.

In my opinion, if you prefer a high coverage of compounds in the samples, try PMD network on MS1 data first and then collect pseudotargeted MS2 data based on your PMD network results with modified cosine similarity matching. On the other hand, if you prefer a high confidence of identification at the very beginning, try molecular networking directly.

R code for molecular networking

Here are two functions for molecular networking and I read the python code of matchms package to write those R functions.

find_matches is used to align the m/z of two MS2 spectra and mnmatch is used to perform molecular networking for certain MS2 spectra files. This function will only calculate the modified cosine similarity for all the MS2 spectra in one file and return a list object with two elements: one is the data table for network and another is also list object with matched spectra for detailed check.

# Align the peaks of two spectra by m/z.
#
# A peak of spec1_mz matches a peak of spec2_mz when the (shifted) m/z of the
# second peak lies strictly inside (mz - tolerance, mz + tolerance).
#
# Arguments:
#   spec1_mz   numeric vector, m/z values of the first spectrum
#   spec2_mz   numeric vector, m/z values of the second spectrum
#   tolerance  half width of the m/z matching window
#   shift      mass shift added to every m/z of spec2_mz before matching
#
# Returns a data frame with one row per matched peak pair, holding the index
# into spec1_mz ('query') and the index into spec2_mz ('query2'); rows are
# ordered by 'query' and then by 'query2'. Returns NULL when nothing matches,
# including when either spectrum is empty.
find_matches <- function(spec1_mz, spec2_mz, tolerance, shift = 0) {
        # an empty spectrum cannot match anything
        if (length(spec1_mz) == 0 || length(spec2_mz) == 0) {
                return(NULL)
        }
        # shift the second spectrum once instead of once per comparison
        shifted_mz <- unname(spec2_mz) + shift
        # for every peak of the first spectrum, the matching peaks of the second
        hits <- lapply(spec1_mz, function(mz) {
                which(shifted_mz < mz + tolerance & shifted_mz > mz - tolerance)
        })
        n_hits <- lengths(hits)
        if (sum(n_hits) == 0) {
                return(NULL)
        }
        data.frame(
                query = rep(seq_along(spec1_mz), times = n_hits),
                query2 = unlist(hits, use.names = FALSE)
        )
}
# Molecular networking: pairwise (modified) cosine similarity of MS2 spectra.
#
# Every pair of spectra (i, j) with i < j is scored when their retention
# times differ by more than 10 (in the unit returned by rtime()):
#   * precursor m/z difference below binstep: plain cosine similarity of the
#     fragment peaks aligned without any shift;
#   * otherwise: modified cosine similarity, i.e. the peaks aligned without
#     shift plus the peaks aligned after shifting spectrum j by the precursor
#     m/z difference.
# Intensities are scaled to sum to 1 within each spectrum before scoring.
#
# Arguments:
#   spectra  MS2 spectra; must support length(), mz(), intensity(),
#            precursorMz() and rtime() (e.g. a Spectra object)
#   binstep  m/z tolerance used to align peaks and to decide whether two
#            precursors have the same mass
#   cf       cosine cutoff; only pairs scoring above it are kept
#   npeaks   a pair is scored only when more than npeaks peaks are aligned
#
# Returns an unnamed list of two elements:
#   [[1]] data frame with columns 'query', 'query2' (spectrum indices), 'cos'
#         and 'diff' (precursor m/z of 'query' minus that of 'query2'), or
#         NULL when no pair passes the cutoffs
#   [[2]] list with one entry per kept pair, each a list of: aligned peaks of
#         spectrum i, aligned peaks of spectrum j, all peaks of spectrum i,
#         all peaks of spectrum j (data frames with columns 'mz' and 'ins'),
#         and the precursor m/z difference
mnmatch <- function(spectra,
                    binstep,
                    cf,
                    npeaks) {
        matches <- list()
        edges <- list()
        n_spectra <- length(spectra)
        # a network needs at least two spectra
        if (n_spectra < 2) {
                return(list(NULL, matches))
        }
        # extract everything from the spectra object once, not once per pair
        mz_all <- mz(spectra)
        ins_all <- intensity(spectra)
        pmz_all <- precursorMz(spectra)
        rt_all <- rtime(spectra)
        for (i in seq_len(n_spectra - 1)) {
                query <- mz_all[[i]]
                ins <- ins_all[[i]]
                ins <- ins / sum(ins)
                for (j in (i + 1):n_spectra) {
                        # only spectra well separated in retention time are compared
                        if (!(abs(rt_all[i] - rt_all[j]) > 10)) {
                                next
                        }
                        pdiff <- pmz_all[i] - pmz_all[j]
                        same_precursor <- abs(pdiff) < binstep
                        query2 <- mz_all[[j]]
                        insn <- ins_all[[j]]
                        insn <- insn / sum(insn)
                        # peaks aligned without any shift
                        re <- find_matches(query, query2, binstep, shift = 0)
                        idx1 <- re$query
                        idx2 <- re$query2
                        if (!same_precursor) {
                                # plus the peaks aligned after the precursor shift
                                re2 <- find_matches(query, query2, binstep, shift = pdiff)
                                idx1 <- c(idx1, re2$query)
                                idx2 <- c(idx2, re2$query2)
                        }
                        if (!(length(idx1) > npeaks)) {
                                next
                        }
                        insx <- ins[idx1]
                        insnx <- insn[idx2]
                        cos <- crossprod(insx, insnx) /
                                sqrt(crossprod(insx) * crossprod(insnx))
                        if (!(c(cos) > cf)) {
                                next
                        }
                        edges[[length(edges) + 1]] <- c(i, j, as.numeric(cos), pdiff)
                        ms1 <- query[idx1]
                        ms2 <- query2[idx2]
                        if (same_precursor) {
                                # unshifted matches are reported in matching order
                                ord1 <- seq_along(ms1)
                                ord2 <- seq_along(ms2)
                        } else {
                                # combined matches are reported sorted by m/z
                                ord1 <- order(ms1)
                                ord2 <- order(ms2)
                        }
                        matches[[length(matches) + 1]] <- list(
                                data.frame(mz = ms1[ord1], ins = insx[ord1]),
                                data.frame(mz = ms2[ord2], ins = insnx[ord2]),
                                data.frame(mz = query, ins = ins),
                                data.frame(mz = query2, ins = insn),
                                pdiff
                        )
                }
        }
        # no pair passed the cutoffs: no edge table
        if (length(edges) == 0) {
                return(list(NULL, matches))
        }
        intersected_indices <- do.call(rbind, edges)
        colnames(intersected_indices) <- c('query', 'query2', 'cos', 'diff')
        intersected_indices <- as.data.frame(intersected_indices)
        return(list(intersected_indices, matches))
}

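The usage is simple. You need to prepare a file with the MS2 spectra (the example below reads an msp file with MsBackendMsp; an mgf file needs a matching backend such as MsBackendMgf instead). Here we use a binstep of 0.001 Da to align two m/z values, require more than 5 matched peaks, and apply a cutoff of 0.6 for the cosine similarity: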

library(Spectra)
library(igraph)
# read the MS2 spectra; replace 'YOUFILE.msp' with the path to your own file
specs <- Spectra('YOUFILE.msp', source = MsBackendMsp::MsBackendMsp())
# more than 5 aligned peaks and a cosine similarity above 0.6 are required
result <- mnmatch(specs,binstep=0.001,cf=0.6,npeaks=5)
# the first element is the edge table of the network
table <- result[[1]]
net <- igraph::graph_from_data_frame(table, directed = FALSE)
# display molecular networking
plot(net)

Invitation to Submit Manuscripts for a Special Issue of Chemosphere

Mon, 11 Jul 2022 00:00:00 +0000

The prestigious journal Chemosphere is currently running a special issue entitled " Human Health Effects of Chemical Mixture Exposures". As we are acting as guest editors for this issue, we would like to welcome contributions from various disciplines. We kindly invite you to consider submitting your full paper to this special issue.

Guest editors

Prof. Dr. Peng Gao University of Pittsburgh School of Public Health peg47@pitt.edu
Prof. Dr. Hui Peng University of Toronto hui.peng@utoronto.ca
Dr. Miao Yu The Jackson Laboratory miao.yu@jax.org

Special issue information

Current environmental chemistry and toxicology studies mainly focus on a single stressor or single group of stressors, which does not reflect the multiple stressors in the dynamic exposome that humans are facing. Usually, human exposures are presented as cocktails with thousands of organic chemicals and dozens of inorganic chemicals being presented. However, the significant relationships and interactions among those stressors in the environment and their holistic human health effects remain unclear. Fortunately, the rapid developments of various techniques provide us with the possibility of revealing these mixture exposures. This Chemosphere special issue aims to provide a platform to dissect the complexity of chemical mixture exposures from experimental, analytical, and computational perspectives.

Manuscript submission information: The submission website for this journal is located at here. Author guidelines and manuscript submission to Chemosphere can be found here. To ensure that your manuscript is correctly submitted to the special issue, please select ‘‘VSI: Exposure of Mixture” when you reach the step of “Article Type” during the submission process.

Keywords

Human exposure, Chemical mixture, Health effects

Learn more about the benefits of publishing in a special issue.

Using xcmsrocker on HPC via Singularity

Thu, 26 May 2022 00:00:00 +0000

Docker should be the most popular container platform. Container distribution via Docker Hub makes it easy to provide an all-in-one development/data analysis environment for scientists. It’s always a good idea to use containers on a high performance computing (HPC) cluster to accelerate data processing. Since Docker provides root access to the system it is running on, it is usually not allowed on HPC. On the other hand, Singularity is friendlier to scientific research, with MPI support as well as security restrictions.

I released the xcmsrocker image for metabolomics data analysis a long time ago and always said that it should be easy to deploy on HPC or cloud computing platforms. That is true for the latter, and you can use the Docker image on the most popular clouds. However, you will need some extra work for HPC.

The first issue is to build a Singularity image from a docker image hosted on Docker Hub. You need to load singularity module after login on HPC:

# load the Singularity module after logging in to the HPC
ml singularity

Then pull the xcmsrocker image

# build a Singularity image from the xcmsrocker image hosted on Docker Hub
singularity pull docker://yufree/xcmsrocker:latest

Now you will find a file named ‘xcmsrocker_latest.sif’ in your home folder. If your HPC uses Slurm for job management, you can use the following job script and save it as a file called “rstudio-server.job”:

#!/bin/sh
#SBATCH --time=05:00:00
#SBATCH --signal=USR2
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=2
#SBATCH --mem=8192
#SBATCH --output=/home/%u/rstudio-server.job.%j

# Slurm batch job that starts RStudio Server from the xcmsrocker Singularity
# image (xcmsrocker_latest.sif, resolved relative to the job's working
# directory) and prints connection instructions: the SSH tunnel command, the
# user name and a freshly generated password.

# Create temporary directory to be populated with directories to bind-mount in the container
# where writable file systems are necessary. Adjust path as appropriate for your computing environment.
workdir=$(python -c 'import tempfile; print(tempfile.mkdtemp())')

mkdir -p -m 700 ${workdir}/run ${workdir}/tmp ${workdir}/var/lib/rstudio-server
# Generated database.conf: SQLite provider with its directory set to
# /var/lib/rstudio-server, which is one of the bind-mounted paths below.
cat > ${workdir}/database.conf <<END
provider=sqlite
directory=/var/lib/rstudio-server
END

# Set OMP_NUM_THREADS to prevent OpenBLAS (and any other OpenMP-enhanced
# libraries used by R) from spawning more threads than the number of processors
# allocated to the job.
#
# Set R_LIBS_USER to a path specific to rocker/rstudio to avoid conflicts with
# personal libraries from any R installation in the host environment

# The here-document delimiter is unquoted, so ${SLURM_JOB_CPUS_PER_NODE} and
# ${HOME} are expanded while the wrapper is written; the escaped \${@} is kept
# literal so the wrapper forwards its own arguments to rsession.
cat > ${workdir}/rsession.sh <<END
#!/bin/sh
export OMP_NUM_THREADS=${SLURM_JOB_CPUS_PER_NODE}
export R_LIBS_USER=${HOME}/R/xcmsrocker
exec rsession "\${@}"
END

chmod +x ${workdir}/rsession.sh

# Map the temporary directories and the two generated files onto their
# in-container paths (comma-separated host:container pairs).
export SINGULARITY_BIND="${workdir}/run:/run,${workdir}/tmp:/tmp,${workdir}/database.conf:/etc/rstudio/database.conf,${workdir}/rsession.sh:/etc/rstudio/rsession.sh,${workdir}/var/lib/rstudio-server:/var/lib/rstudio-server"

# Do not suspend idle sessions.
# Alternative to setting session-timeout-minutes=0 in /etc/rstudio/rsession.conf
# https://github.com/rstudio/rstudio/blob/v1.4.1106/src/cpp/server/ServerSessionManager.cpp#L126
export SINGULARITYENV_RSTUDIO_SESSION_TIMEOUT=0

# Login credentials: the submitting user's name and a new random password
# generated on every run of this script.
export SINGULARITYENV_USER=$(id -un)
export SINGULARITYENV_PASSWORD=$(openssl rand -base64 15)
# get unused socket per https://unix.stackexchange.com/a/132524
# tiny race condition between the python & singularity commands
readonly PORT=$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1]); s.close()')
# Print the connection instructions to stderr with host, port and credentials
# already substituted.
cat 1>&2 <<END
1. SSH tunnel from your workstation using the following command:

   ssh -N -L 8787:${HOSTNAME}:${PORT} ${SINGULARITYENV_USER}@LOGIN-HOST

   and point your web browser to http://localhost:8787

2. log in to RStudio Server using the following credentials:

   user: ${SINGULARITYENV_USER}
   password: ${SINGULARITYENV_PASSWORD}

When done using RStudio Server, terminate the job by:

1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
2. Issue the following command on the login node:

      scancel -f ${SLURM_JOB_ID}
END

# Start rserver inside the container on the port picked above, using the
# rsession wrapper bound to /etc/rstudio/rsession.sh.
# XXX after --server-user is a placeholder: replace it with your own HPC user
# name before submitting the job.
singularity exec --cleanenv xcmsrocker_latest.sif \
    rserver --www-port ${PORT} \
            --auth-none=0 \
            --auth-pam-helper-path=pam-helper \
            --auth-stay-signed-in-days=30 \
            --auth-timeout-minutes=0 \
            --server-user XXX \
            --rsession-path=/etc/rstudio/rsession.sh
# Runs only after the singularity command above has returned.
printf 'rserver exited' 1>&2

This file is modified from Rocker’s singularity tutorial.

Here, you need to change --server-user XXX to your user name on the HPC. For example, my user name to log in to the HPC is ‘yufree’, so I set --server-user yufree. This option makes sure that you can log in to your RStudio Server and that the default user doesn’t have access.

Then submit this job to HPC:

$ sbatch rstudio-server.job

Then you should see a file with job ID as extension such as ‘rstudio-server.job.xxxxxxx’ in your HPC home folder. ‘xxxxxxx’ is your job ID. Then you can check the content in this file:

cat rstudio-server.job.xxxxxxx

You will find the user name, password and HPC port information in this file. The user name should be the same as your HPC user name, and the password changes every time you submit this job.

To access RStudio on your local computer, you need to bind your local port to the running HPC port. You need to open a new terminal to establish the SSH tunnel:

ssh -N -L 8787:[YOUR_PORT_INFORMATION] [HPC_USERNAME]@[HPC domain]

Here the port information (the compute-node host name and port, in the form HOST:PORT, as printed in the ssh command) comes from rstudio-server.job.xxxxxxx. [HPC_USERNAME]@[HPC domain] is the same as the regular ssh information you use to log in to the HPC. This command will forward the HPC’s port to port 8787 on your local computer. After you open the SSH tunnel, you can access RStudio from xcmsrocker via your own browser: http://localhost:8787

Now you can enjoy your xcmsrocker image on HPC. Keep in mind that only the packages supporting parallel computing would get benefits from HPC resources. If the software doesn’t support parallel computing, you will need to modify their source code or it will be a waste of time to run them on HPC.

I am looking for a faculty position

Thu, 23 Sep 2021 00:00:00 +0000

I am looking for a faculty position on earth. It’s always right to fit the position with your skill sets. However, after sending a dozen of applications with tailored resume or research statements, I decide to leave my cover letter online with my desired research interests.

I am trained as an environmental analytical chemist from a state key laboratory under the supervision of Prof. Guibin Jiang in China. Then I worked with Prof. Janusz Pawliszyn in University of Waterloo, Canada for projects about in vivo SPME based metabolomics data analysis as a PostDoc. After two years’ training, I joined Institute for Exposomic Research at Mount Sinai for environmental exposure related bioinformatics studies and worked with Dr. Lauren Petrick. I have published 37 peer reviewed journal papers with 9 first author or co-first author papers. My publications have more than 800 citations and a h-index of 17. I have two papers selected as journal cover (AC and ES&T letter) and one paper selected as ES&T Letter 2018 best paper. I authored three R packages on CRAN and developed shiny applications for my research. More details can be found in my CV.

My research interests are the assessment of environmental exposures and impacts on humans through high resolution mass spectrometry based metabolomics analysis. I can apply in vivo SPME technique to capture real-time changes in living organisms. I proposed the concept of “reactomics” based on paired mass distances to retrieve the changes of general chemical relationship in the samples and developed related software and database. Besides, I proposed a concept called “gatekeeper” to explain the influence of multiple exposures or exposome on health outcomes at molecular levels by metabolomics or other omics data. Those techniques and models can be used to understand the health impact of general environmental exposures. I can be either an experimental or bioinformatic scientist. However, I will treat myself as a mass spectrometry guy to solve various environmental related scientific problems by both dry and wet lab skills.

I hope to continuously develop reactomics tools to investigate the influences of certain exposure and perform gatekeeper discovery for population-based exposure studies. I am planning to introduce machine learning into the biomarker reaction discovery based on reactomics and gatekeeper model for certain diseases. I am willing to collaborate with other researchers for multidisciplinary research projects.

Feel free to contact me if you need extra information. Thank you for your consideration.

Correlation coefficients cutoff to generate network in metabolomics

Wed, 28 Jul 2021 00:00:00 +0000

One common research purpose in metabolomics is to check the relations among the metabolites. A correlation network is one of the most popular ways to show such relations. However, such a network will change with the choice of the correlation coefficient cutoff.

Let’s check some real world data.

library(pmd)
library(enviGCMS)
data(spmeinvivo)
# remove redundant peaks
newmet <- globalstd(spmeinvivo)

## 75 retention time cluster found.

## 369 paired masses found

## 5 unique within RT clusters high frequency PMD(s) used for further investigation.

## The unique within RT clusters high frequency PMD(s) is(are)  28.03 21.98 44.03 17.03 18.01.

## 719 isotopologue(s) related paired mass found.

## 492 multi-charger(s) related paired mass found.

## 8 retention group(s) have single peaks. 14 23 32 33 54 55 56 75

## 11 group(s) with multiple peaks while no isotope/paired relationship 4 5 7 8 11 41 42 49 68 72 73

## 9 group(s) with multiple peaks with isotope without paired relationship 2 9 22 26 52 62 64 66 70

## 4 group(s) with paired relationship without isotope 1 10 15 18

## 43 group(s) with paired relationship and isotope 3 6 12 13 16 17 19 20 21 24 25 27 28 29 30 31 34 35 36 37 38 39 40 43 44 45 46 47 48 50 51 53 57 58 59 60 61 63 65 67 69 71 74

## 291 std mass found.

metabolites <- getfilter(spmeinvivo,rowindex = newmet$stdmassindex)

Originally we have 1459 peaks. After removal of redundant peaks such as isotope, adducts and Neutral losses by globalstd algorithm, we have 291 peaks as the number of potential metabolites. To check their relations, we will calculate the paired correlation coefficients among their intensities.

metcor <- cor(t(metabolites$data))

Let’s check the distribution of correlation coefficients:

hist(metcor)

Since correlation coefficients are also associated with a p value, we can also check the distribution of p values.

# Pairwise correlation-test p values between the columns of `x`.
#
# x: a numeric matrix (or data frame) with one variable per column.
# Returns a square matrix whose [i, j] entry is the p value of
# cor.test(x[, i], x[, j]); row and column names are taken from colnames(x).
#
# Columns are addressed by position rather than by name, so the function also
# works when `x` has no column names or has duplicated column names (indexing
# by name would return an empty result or always pick the first match).
cor.test.p <- function(x) {
  col_idx <- seq_len(ncol(x))
  pair_p <- function(i, j) cor.test(x[, i], x[, j])[["p.value"]]
  # outer() passes whole index vectors, so the scalar helper is vectorized.
  z <- outer(col_idx, col_idx, Vectorize(pair_p))
  dimnames(z) <- list(colnames(x), colnames(x))
  z
}

pmat <- cor.test.p(t(metabolites$data))
hist(pmat)

sum(pmat<0.05)/length(pmat)

## [1] 0.4145

41% of the original p values are less than 0.05. We can filter the correlation coefficients based on this rule.

metcor2 <- metcor[pmat<0.05]
hist(metcor2)

range(abs(metcor2))

## [1] 0.6664 1.0000

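Here we can find the cutoff is around +/-0.67. However, we didn’t correct for multiple comparisons. If we adjust the p values, we will have a different cutoff. Note that `p.adjust()` uses the Holm method by default, which is what the code below applies; pass `method = "BH"` to use the Benjamini-Hochberg FDR correction instead.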

pmat_adj <- p.adjust(pmat)
metcor3 <- metcor[pmat_adj<0.05]
range(abs(metcor3))

## [1] 0.9881 1.0000

Now the cutoff is 0.99. We can display the data as network:

metcor[pmat>=0.05] <- 0
library(igraph)

## 
## Attaching package: 'igraph'

## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum

## The following object is masked from 'package:base':
## 
##     union

net <- graph.adjacency(metcor,weighted=TRUE,diag=FALSE,mode = 'undirected')
plot(net,vertex.size=1,edge.width=1,vertex.label="")

Here it seems all metabolites are connected and FDR control will solve this issue.

metcor <- cor(t(metabolites$data))
metcor[pmat_adj>=0.05] <- 0
net <- graph.adjacency(metcor,weighted=TRUE,diag=FALSE,mode = 'undirected')
plot(net,vertex.size=1,edge.width=1,vertex.label="")

Here we will see the networks with few large clusters and lots of single metabolites without any association with each other.

If we didn’t consider the p values, we can also check the networks with different cutoffs.

# Count the network clusters with more than one member for each correlation
# cutoff between 0 and 1.
cutoffs <- seq(0, 1, 0.1)
# The correlation matrix does not depend on the cutoff, so compute it once.
metcor_all <- cor(t(metabolites$data))
# Preallocate the result instead of growing it inside the loop.
n <- integer(length(cutoffs))
for (k in seq_along(cutoffs)) {
        metcor <- metcor_all
        metcor[metcor < cutoffs[k]] <- 0
        net <- graph.adjacency(metcor, weighted = TRUE, diag = FALSE, mode = 'undirected')
        # plot(net,vertex.size=1,edge.width=1,vertex.label="")
        cn <- components(net)
        # check the numbers of cluster: csize holds the size of every
        # component, so this counts the components with at least two members
        n[k] <- sum(cn$csize > 1)
}
plot(cutoffs, n, xlab = 'cutoff', ylab = 'cluster number', type = 'l')

Here we can see the cluster numbers will firstly increase and then decrease. Let’s check \[0.8,1\] carefully.

# Same scan as before, restricted to [0.8, 1] with a step of 0.001.
cutoffs <- seq(0.8, 1, 0.001)
# The correlation matrix does not depend on the cutoff, so compute it once
# instead of once per iteration.
metcor_all <- cor(t(metabolites$data))
# Preallocate the result instead of growing it inside the loop.
n <- integer(length(cutoffs))
for (k in seq_along(cutoffs)) {
        metcor <- metcor_all
        metcor[metcor < cutoffs[k]] <- 0
        net <- graph.adjacency(metcor, weighted = TRUE, diag = FALSE, mode = 'undirected')
        # plot(net,vertex.size=1,edge.width=1,vertex.label="")
        cn <- components(net)
        # check the numbers of cluster: csize holds the size of every
        # component, so this counts the components with at least two members
        n[k] <- sum(cn$csize > 1)
}
plot(cutoffs, n, xlab = 'cutoff', ylab = 'cluster number', type = 'l')

# display the cutoff
seq(0.8,1,0.001)[which.max(n)]

## [1] 0.988

Here we find that the cutoff giving the maximum number of network clusters is similar to the cutoff obtained from the adjusted p values. However, the computation is much faster. When the cutoff is small, all metabolites are connected. When the cutoff is large, few metabolites will be covered. In physical terms, the largest number of network clusters means covering the largest number of connected metabolites while keeping the clusters as separated as possible. I think this should be the fastest way to select a cutoff from real world data.

Actually, I added a function called `getcf()` to the `enet` package to automatically find this cutoff for correlation network analysis. Here is the network for our demo data:

metcor <- cor(t(metabolites$data))
metcor[metcor<seq(0.8,1,0.001)[which.max(n)]] <- 0
net <- graph.adjacency(metcor,weighted=TRUE,diag=FALSE,mode = 'undirected')
plot(net,vertex.size=1,edge.width=1,vertex.label="")

The reason to avoid using p values or adjusted p values from correlation tests is not only the slow computation, but also that the cutoff for p values or adjusted p values is chosen by the researcher instead of by the data themselves. A p value cutoff will not help us find biologically functional modules when all the metabolites are connected. In my opinion, each data set can speak for itself through an automated cutoff selection process, and I think the number of network clusters can do this job.

PS. I actually use the same idea to generate PMD metabolites network, which can be treated as another relation among metabolites with chemical meanings.

reactomics data analysis template within rmwf package

Tue, 09 Feb 2021 00:00:00 +0000

To make reactomics data analysis more transparent and reproducible, I included one template in rmwf package. You could install the package from Github.

install.packages('remotes')
remotes::install_github("yufree/rmwf")

If you use RStudio, you could try:

File-New file-R Markdown-from template

Then select ‘reactomics’ to use template for reactomics analysis. Here is a preview for data analysis of this study:

Demo data

path <- system.file("demodata/untarget", package = "rmwf")
files <- list.files(path,recursive = T,full.names = T)
ST000560pos <- enviGCMS::getmzrtcsv(files[grepl('ST000560mzrt',files)])

Remove the redundant peaks

# check the paired mass distance relationship
pmd <- pmd::getpaired(ST000560pos)

## 56 retention time cluster found.

## 826 paired masses found

## 23 unique within RT clusters high frequency PMD(s) used for further investigation.

## The unique within RT clusters high frequency PMD(s) is(are)  12 2.02 26.02 28.03 14.02 26.01 54.05 24 9.99 40.03 44.04 2.01 15.01 30.01 27.02 44.03 14.01 21.98 30.05 42.05 29.02 4.03 66.02.

## 182 isotopologue(s) related paired mass found.

## 1145 multi-charger(s) related paired mass found.

pmd::plotpaired(pmd)

Here we could see some common PMDs within the same retention time bins like 21.98Da for the mass differences between [M+Na] and [M+H]. Other PMDs might refer to in-source reactions such as PMD 2.02Da for opening or forming of a double bond. Another common kind of PMD should be the homologous series compounds which could not be separated by the column, such as PMD 14.02Da for CH2, PMD 28.03Da for C2H4, PMD 42.05Da for C3H6, and 56.06Da for C4H8, as well as 58.04Da for C3H6O. There are also some PMDs which highly depend on the samples’ matrix. Anyway, we will check those high frequency PMDs considering isotopes, as well as multiple chargers, to extract one peak for one potential compound. Such an algorithm is called GlobalStd. The advantage of GlobalStd is that no pre-defined paired mass distances list is needed to remove redundant peaks. When a PMD appears with high frequency in certain samples, it will be treated as a potential adduct to be removed.

std <- pmd::getstd(pmd)

## 4 retention group(s) have single peaks. 52 54 55 56

## 10 group(s) with multiple peaks while no isotope/paired relationship 26 30 33 35 45 47 48 49 51 53

## 2 group(s) with multiple peaks with isotope without paired relationship 29 32

## 20 group(s) with paired relationship without isotope 10 11 13 14 15 16 18 20 23 27 28 31 36 38 40 42 43 44 46 50

## 20 group(s) with paired relationship and isotope 1 2 3 4 5 6 7 8 9 12 17 19 21 22 24 25 34 37 39 41

## 196 std mass found.

pmd::plotstd(std)

In this case, we get 205 peaks for 205 potential compounds. Now we could retain those peaks for reactomics analysis.

# generate new peak list and matrix sample
peakstd <- enviGCMS::getfilter(std,rowindex = std$stdmassindex)

GlobalStd algorithm was originally designed to retrieve independent peaks by the paired mass distances relationship among features without a predefined adducts or neutral loss list. However, the peaks from the same compounds should also be correlated with each other. Meanwhile, the independent peaks selection might still have peaks from the same compounds when the peaks’ high frequency PMDs are not independent. In this case, the GlobalStd algorithm could set a cutoff to re-check the independent peaks by their relationship with potential PMDs groups and select the base peaks for the clusters of peaks.

Meanwhile, network analysis could be used for PMDDA workflow to select precursor ion for MS/MS annotation. Such precursor ions was selected by checking the peak with highest intensity of each independent peaks’ high frequency PMD network cluster, which could be treated as pseudo spectra.

Extract high frequency PMDs

To retrieve the general chemical relationship, we will focus on high frequency PMDs within a certain metabolic profile. If one PMD occur multiple times among peaks from a snapshot of samples, certain reactions or bio-process should be important or occur multiple times compared with rarely PMD, which could be a random differences among compounds. In this case, extraction of high frequency PMDs will refine the investigation on a few active reactions instead of treating each peak individually, which is almost impossible for untargeted analysis.

Such PMDs frequency analysis should be performed on the data set with the redundant peaks removal. Otherwise, the high frequency PMD among compounds will be immersed by PMD with from isotopes, adducts or other common PMDs from the backgrounds.

You could define the cutoff of frequency while the default setting using the largest PMD network cluster numbers to determine the cutoff, which try to capture more information. Here we will retrieve high frequency PMDs from the demo data using a larger cutoff to reduce the complexity:

hfp <- pmd::getsda(std,freqcutoff = 8)

## 8 groups were found as high frequency PMD group.

## 0 was found as high frequency PMD. 
## 2.02 was found as high frequency PMD. 
## 12 was found as high frequency PMD. 
## 14.02 was found as high frequency PMD. 
## 24 was found as high frequency PMD. 
## 26.02 was found as high frequency PMD. 
## 28.03 was found as high frequency PMD. 
## 50.01 was found as high frequency PMD.

pmd::plotstdsda(hfp)

Here we could find 8 PMDs were selected as high frequency PMDs. PMD 0 Da could be some isomers, PMD 2.02 Da could be reduction reactions, etc. Some PMDs can be the combination of other PMDs, which could be a chain reactions. From the plot you might also identify the homologous series by the retention times relations.

When you have the list of high frequency PMDs, you could check the PMD changes among groups. Here we will quantitatively analyze a certain PMD to show the reaction level changes.

# remove QC sample
hfp2 <- enviGCMS::getfilter(hfp,colindex = !grepl('QC',hfp$group$sample_group))
# check pmd 14.02
qreact <- pmd::getreact(hfp2,pmd = 14.02)
qreactsum <- apply(qreact$data,2,sum)
t.test(qreactsum~qreact$group$sample_group)

## 
##  Welch Two Sample t-test
## 
## data:  qreactsum by qreact$group$sample_group
## t = 2.2, df = 17, p-value = 0.04
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   2939 95129
## sample estimates:
## mean in group control    mean in group IgAN 
##               1247817               1198783

par(mfrow=c(1,1))
boxplot(qreactsum~qreact$group$sample_group,xlab='',ylab = 'intensity', main='PMD 14.02Da')

Here we could find PMD 14.02Da could be a biomarker reaction for case and control. Meanwhile, paired relationship could be connected into network to show the overall relationship within the samples.

Reactomics network analysis

The relation among those high frequency PMDs peaks could be further checked in two ways by network analysis: one from the correlation analysis and another from the PMD analysis. If we combined them together, reactomics network could be generated to capture the major reaction network within the samples. We will check them step by step.

Build the correlation network

library(igraph)

## 
## Attaching package: 'igraph'

## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum

## The following object is masked from 'package:base':
## 
##     union

# Build an undirected correlation network among the retained peaks: every pair
# of peaks whose absolute correlation reaches the cutoff becomes an edge.
cutoff <- 0.9
metacor <- stats::cor(t(peakstd$data))
# Zero out weak correlations of either sign.
metacor[abs(metacor) < cutoff] <- 0
# Locate the lower-triangle cells once. which() and metacor[lower] both walk
# the matrix in column-major order, so row k of pair_idx matches cor value k.
lower <- lower.tri(metacor)
pair_idx <- which(lower, arr.ind = TRUE)
peak_names <- rownames(peakstd$data)
df <- data.frame(
  from = peak_names[pair_idx[, 1]],
  to = peak_names[pair_idx[, 2]],
  cor = metacor[lower]
)
# Keep only the pairs that survived the cutoff.
df <- df[abs(df$cor) > 0, ]
df$direction <- ifelse(df$cor > 0, 'positive', 'negative')
net <- igraph::graph_from_data_frame(df, directed = FALSE)
netc <- igraph::components(net)
message(paste(netc$no, 'metabolites correlation network clusters found'))

## 16 metabolites correlation network clusters found

index <- rep(NA,length(rownames(peakstd$data)))
index[match(names(netc$membership),rownames(peakstd$data))] <- netc$membership
message(paste(sum(is.na(index)), 'out of', length(rownames(peakstd$data)), 'metabolites have no correlation with other metabolites'))

## 88 out of 197 metabolites have no correlation with other metabolites

plot(net,vertex.label=NA,vertex.size =5,edge.width = 3, main = 'Correlation network')

Here we could see the correlation among those peaks as network. 109 peaks have relations with each others and 88 peaks were single.

Build the PMD network

peaksda <- pmd::getsda(std,freqcutoff = 8)

## 8 groups were found as high frequency PMD group.

## 0 was found as high frequency PMD. 
## 2.02 was found as high frequency PMD. 
## 12 was found as high frequency PMD. 
## 14.02 was found as high frequency PMD. 
## 24 was found as high frequency PMD. 
## 26.02 was found as high frequency PMD. 
## 28.03 was found as high frequency PMD. 
## 50.01 was found as high frequency PMD.

df <- peaksda$sda
df$from <- paste0('M',round(df$ms1,4),'T',round(df$rt1,1))
df$to <- paste0('M',round(df$ms2,4),'T',round(df$rt2,1))
net <- graph_from_data_frame(df[,c('from','to','diff2')],directed = F)
netc <- igraph::components(net)
message(paste(netc$no, 'metabolites PMD network clusters found'))

## 15 metabolites PMD network clusters found

index <- rep(NA,length(rownames(peakstd$data)))
index[match(names(netc$membership),rownames(peakstd$data))] <- netc$membership
message(paste(sum(is.na(index)), 'out of', length(rownames(peakstd$data)), 'metabolites have no PMD relations with other metabolites'))

## 143 out of 197 metabolites have no PMD relations with other metabolites

pal <- grDevices::rainbow(8)
plot(net,vertex.label=NA,vertex.size =5,edge.width = 3,edge.color = pal[as.numeric(as.factor(E(net)$diff2))], main = 'PMD network')
legend("topright",bty = "n",
       legend=unique(E(net)$diff2),
       fill=unique(pal[as.numeric(as.factor(E(net)$diff2))]), border=NA,horiz = F)

unique(E(net)$diff2)

## [1] 24.00  0.00 50.01  2.02 12.00 28.03 26.02 14.02

By checking the high frequency PMD relations, we see similar but not identical results. Those high frequency PMDs could also be linked to potential reactions such as 0Da for isomers, 2.02Da for double bonds breaking/forming. Such PMDs could reveal the major reactions found among the metabolites. 54 peaks have PMD relations with each other and 143 peaks were single.

Here we need to define a frequency cutoff. With the increasing number of high frequency PMDs cutoff, the ions cluster numbers would firstly increase then decrease. At the very beginning, the increasing numbers will include more information because high frequency PMDs always capture real reactions or structures relationships among compounds. Low frequency PMDs will introduce limited information as they might be generated by random differences among ions. In terms of network analysis, when the high frequency PMD cutoff is small, the network clusters will be small. However, when the numbers of network clusters are not increasing any more with more PMDs included, the relationship information among ions will not increase and the cutoff could be automated detected by GlobalStd algorithm. In detail, the algorithm will try to include PMDs one by one starting from the highest frequency PMDs. Meanwhile, the ions cluster numbers were recorded for the generated network among independent peaks and the cutoff will be the PMDs list with the largest number of independent peaks’ network cluster.

Build the PMD network with correlation

We could combine the PMD relation with correlation together to show the quantitative reactomics networks within the samples. Those metabolites could be quantitatively checked among different samples.

peaksda <- pmd::getsda(std,freqcutoff = 8,corcutoff = 0.6)

## 8 groups were found as high frequency PMD group.

## 0 was found as high frequency PMD. 
## 2.02 was found as high frequency PMD. 
## 12 was found as high frequency PMD. 
## 14.02 was found as high frequency PMD. 
## 24 was found as high frequency PMD. 
## 26.02 was found as high frequency PMD. 
## 28.03 was found as high frequency PMD. 
## 50.01 was found as high frequency PMD.

df <- peaksda$sda
df$from <- paste0('M',round(df$ms1,4),'T',round(df$rt1,1))
df$to <- paste0('M',round(df$ms2,4),'T',round(df$rt2,1))
net <- graph_from_data_frame(df[,c('from','to','diff2')],directed = F)
netc <- igraph::components(net)
message(paste(netc$no, 'metabolites quantitative reactomics network clusters found'))

## 10 metabolites quantitative reactomics network clusters found

index <- rep(NA,length(rownames(peakstd$data)))
index[match(names(netc$membership),rownames(peakstd$data))] <- netc$membership
message(paste(sum(is.na(index)), 'out of', length(rownames(peakstd$data)), 'metabolites have no PMD&correlation relations with other metabolites'))

## 162 out of 197 metabolites have no PMD&correlation relations with other metabolites

net <- graph_from_data_frame(peaksda$sda,directed = F)
pal <- grDevices::rainbow(21)
plot(net,vertex.label=NA,vertex.size =5,edge.width = 3,edge.color = pal[as.numeric(as.factor(E(net)$diff2))], main = 'Quantitative reactomics network')
legend("topright",bty = "n",
       legend=unique(E(net)$diff2),
       fill=unique(pal[as.numeric(as.factor(E(net)$diff2))]), border=NA,horiz = F)

MS/MS annotation by paired mass distances analysis

Sun, 17 Jan 2021 00:00:00 +0000

Last year I made a poster presentation on MS/MS annotation by paired mass distance (PMD) analysis. It has already been included as the pmdanno function in the pmd package. Here I will explain the principle of PMD annotation.

Firstly, you need a spectra database. Here I use the HMDB MS/MS spectra database as an example. Then you will get a list with each compound as an element. The list should have an element of spectra with mz and ins, an element of name, and an element of prec for precursor ions. I have included this database in the rmwf package.

# remotes::install_github('yufree/rmwf')
# remotes::install_github('yufree/pmd')
library(rmwf)
data("qtof")
str(qtof)

## List of 4
##  $ name   : chr [1:5062] "HMDB0000014" "HMDB0000014" "HMDB0000014" "HMDB0000014" ...
##  $ mz     : num [1:5062] 227 227 227 227 227 ...
##  $ msms   :List of 5062
##   ..$ : num 116
##   ..$ : num [1:3] 5 111 116
##   ..$ : num [1:15] 0.07 16.03 16.1 27.01 42.01 ...
##   ..$ : num [1:15] 0.07 16.03 16.1 27.01 42.01 ...
##   ..$ : num 116
##   ..$ : num [1:3] 5 111 116
##   ..$ : num [1:136] 1.98 2.01 2.01 2.02 2.02 2.02 3.99 3.99 4.03 4.03 ...
##   ..$ : num [1:36] 1 3.99 3.99 5 8.01 ...
##   ..$ : num [1:3] 30 44 74
##   ..$ : num [1:6] 18 18 36 83 101 ...
##   ..$ : num [1:15] 1.98 9.98 11.96 18.01 26.02 ...
##   ..$ : num [1:6] 18 18 36 83 101 ...
##   ..$ : num [1:15] 1.98 9.98 11.96 18.01 26.02 ...
##   ..$ : num [1:3] 30 44 74
##   ..$ : num 1
##   ..$ : num [1:10] 1 10 34 43 44 ...
##   ..$ : num [1:3] 1 18 19
##   ..$ : num [1:45] 0.98 1 9.98 15.01 16.03 ...
##   ..$ : num [1:190] 0.93 0.98 1 1 1.06 2.02 2.02 2.04 2.95 2.95 ...
##   ..$ : num [1:105] 1 1 1.01 1.06 2 2.01 2.02 2.95 2.98 3.01 ...
##   ..$ : num [1:703] 0.03 0.04 0.62 0.93 0.93 0.97 0.99 1 1 1 ...
##   ..$ : num [1:3] 46 71 117
##   ..$ : num [1:36] 2.02 8.01 9.98 12 12 ...
##   ..$ : num [1:21] 0.08 9.04 12 17.03 29.03 ...
##   ..$ : num [1:21] 0.08 9.04 12 17.03 29.03 ...
##   ..$ : num [1:3] 46 71 117
##   ..$ : num [1:36] 2.02 8.01 9.98 12 12 ...
##   ..$ : num 17
##   ..$ : num 27
##   ..$ : num [1:6] 15 27 27 42 42 ...
##   ..$ : num 27
##   ..$ : num [1:6] 15 27 27 42 42 ...
##   ..$ : num 17
##   ..$ : num [1:3] 1.01 59.01 60.02
##   ..$ : num [1:3] 1.01 59.01 60.02
##   ..$ : num [1:6] 18 37.1 55.1 212 249.1 ...
##   ..$ : num 212
##   ..$ : num 212
##   ..$ : num 212
##   ..$ : num 212
##   ..$ : num [1:6] 18 37.1 55.1 212 249.1 ...
##   ..$ : num 132
##   ..$ : num 132
##   ..$ : num [1:3] 27 132 159
##   ..$ : num 132
##   ..$ : num [1:3] 27 132 159
##   ..$ : num 132
##   ..$ : num 17
##   ..$ : num 132
##   ..$ : num 132
##   ..$ : num 132
##   ..$ : num 132
##   ..$ : num [1:3] 1 17 18
##   ..$ : num 9.45
##   ..$ : num 9.45
##   ..$ : num 194
##   ..$ : num [1:3] 55.1 194 249.1
##   ..$ : num [1:3] 55.1 194 249.1
##   ..$ : num 194
##   ..$ : num [1:10] 0.95 17.06 18.01 24.95 42.01 ...
##   ..$ : num [1:15] 0.95 1.01 1.01 2.02 17.06 ...
##   ..$ : num [1:10] 0.95 17.06 18.01 24.95 42.01 ...
##   ..$ : num [1:3] 1 36 37
##   ..$ : num [1:2485] 0.03 0.03 0.03 0.04 0.04 0.04 0.04 0.04 0.04 0.04 ...
##   ..$ : num 46
##   ..$ : num 46
##   ..$ : num [1:3] 18 26 44
##   ..$ : num [1:3] 18 26 44
##   ..$ : num 17
##   ..$ : num [1:3] 18 28 46
##   ..$ : num [1:3] 18 28 46
##   ..$ : num 26
##   ..$ : num 26
##   ..$ : num [1:6] 2.02 17.03 25.98 27.99 43.01 ...
##   ..$ : num [1:6] 2.02 17.03 25.98 27.99 43.01 ...
##   ..$ : num [1:3] 1 180 181
##   ..$ : num [1:16836] 0.02 0.03 0.03 0.03 0.04 0.04 0.05 0.05 0.05 0.06 ...
##   ..$ : num [1:15] 6.09 18.01 19.12 20.88 21.91 ...
##   ..$ : num 46
##   ..$ : num 46
##   ..$ : num 46
##   ..$ : num 46
##   ..$ : num 46
##   ..$ : num 46
##   ..$ : num 46
##   ..$ : num 46
##   ..$ : num [1:28] 1.01 1.98 8.06 15.94 15.99 ...
##   ..$ : num [1:28] 1.01 1.98 8.06 15.94 15.99 ...
##   ..$ : num 212
##   ..$ : num 212
##   ..$ : num [1:3] 18 225 243
##   ..$ : num [1:3] 18 225 243
##   ..$ : num [1:3] 18 225 243
##   ..$ : num [1:3] 18 225 243
##   ..$ : num 212
##   ..$ : num 212
##   ..$ : num [1:3] 2.02 44.03 46.04
##   ..$ : num [1:3] 2.02 44.03 46.04
##   ..$ : num 16
##   .. [list output truncated]
##  $ msmsraw:List of 5062
##   ..$ :'data.frame': 2 obs. of  2 variables:
##   .. ..$ masscharge: num [1:2] 112 228
##   .. ..$ intensity : num [1:2] 70 100
##   ..$ :'data.frame': 3 obs. of  2 variables:
##   .. ..$ masscharge: num [1:3] 112 117 228
##   .. ..$ intensity : num [1:3] 100 25.8 50.5
##   ..$ :'data.frame': 6 obs. of  2 variables:
##   .. ..$ masscharge: num [1:6] 66 93 135 210 226 ...
##   .. ..$ intensity : num [1:6] 15.5 100 15.2 12 14.2 ...
##   ..$ :'data.frame': 6 obs. of  2 variables:
##   .. ..$ masscharge: num [1:6] 66 93 135 210 226 ...
##   .. ..$ intensity : num [1:6] 15.5 100 15.2 12 14.3 ...
##   ..$ :'data.frame': 2 obs. of  2 variables:
##   .. ..$ masscharge: num [1:2] 112 228
##   .. ..$ intensity : num [1:2] 70 100
##   ..$ :'data.frame': 3 obs. of  2 variables:
##   .. ..$ masscharge: num [1:3] 112 117 228
##   .. ..$ intensity : num [1:3] 100 25.8 50.5
##   ..$ :'data.frame': 814 obs. of  2 variables:
##   .. ..$ masscharge: num [1:814] 45.2 45.4 45.4 45.8 46 ...
##   .. ..$ intensity : num [1:814] 2.75 2.38 1.81 2.23 2.07 ...
##   ..$ :'data.frame': 890 obs. of  2 variables:
##   .. ..$ masscharge: num [1:890] 44.9 44.9 44.9 45.7 45.7 ...
##   .. ..$ intensity : num [1:890] 0.927 1.514 1.947 0.402 0.34 ...
##   ..$ :'data.frame': 3 obs. of  2 variables:
##   .. ..$ masscharge: num [1:3] 108 138 182
##   .. ..$ intensity : num [1:3] 72.5 100 48.7
##   ..$ :'data.frame': 7 obs. of  2 variables:
##   .. ..$ masscharge: num [1:7] 65 92.1 120 138.1 148 ...
##   .. ..$ intensity : num [1:7] 12.29 8.8 6.49 7.67 100 ...
##   ..$ :'data.frame': 6 obs. of  2 variables:
##   .. ..$ masscharge: num [1:6] 65 92 110 120 122 ...
##   .. ..$ intensity : num [1:6] 88.3 42.8 11.8 24.6 13.5 ...
##   ..$ :'data.frame': 7 obs. of  2 variables:
##   .. ..$ masscharge: num [1:7] 65 92.1 120 138.1 148 ...
##   .. ..$ intensity : num [1:7] 12.31 8.81 6.51 7.71 100 ...
##   ..$ :'data.frame': 6 obs. of  2 variables:
##   .. ..$ masscharge: num [1:6] 65 92 110 120 122 ...
##   .. ..$ intensity : num [1:6] 88.3 42.8 11.8 24.6 13.5 ...
##   ..$ :'data.frame': 3 obs. of  2 variables:
##   .. ..$ masscharge: num [1:3] 108 138 182
##   .. ..$ intensity : num [1:3] 72.6 100 48.6
##   ..$ :'data.frame': 6 obs. of  2 variables:
##   .. ..$ masscharge: num [1:6] 166 200 209 243 244 ...
##   .. ..$ intensity : num [1:6] 1.05 3.46 1.31 100 12.76 ...
##   ..$ :'data.frame': 14 obs. of  2 variables:
##   .. ..$ masscharge: num [1:14] 122 156 165 166 167 ...
##   .. ..$ intensity : num [1:14] 7.75 10.08 4.93 30.33 3.05 ...
##   ..$ :'data.frame': 6 obs. of  2 variables:
##   .. ..$ masscharge: num [1:6] 227 228 229 245 246 ...
##   .. ..$ intensity : num [1:6] 59.52 5.77 2.16 100 10.01 ...
##   ..$ :'data.frame': 53 obs. of  2 variables:
##   .. ..$ masscharge: num [1:53] 97 98.1 100 101 105.1 ...
##   .. ..$ intensity : num [1:53] 13.87 1.2 1.36 1.07 8.73 ...
##   ..$ :'data.frame': 72 obs. of  2 variables:
##   .. ..$ masscharge: num [1:72] 79.1 81.1 82 85 91.1 ...
##   .. ..$ intensity : num [1:72] 2.94 2.13 3.41 3.7 6.6 ...
##   ..$ :'data.frame': 26 obs. of  2 variables:
##   .. ..$ masscharge: num [1:26] 91.1 93.1 94.1 97 99 ...
##   .. ..$ intensity : num [1:26] 18.02 8.07 8.51 81.98 9.6 ...
##   ..$ :'data.frame': 228 obs. of  2 variables:
##   .. ..$ masscharge: num [1:228] 95.1 95.1 95.3 96.2 96.9 ...
##   .. ..$ intensity : num [1:228] 11.61 7.12 2.62 8.8 1.87 ...
##   ..$ :'data.frame': 4 obs. of  2 variables:
##   .. ..$ masscharge: num [1:4] 110 156 210 227
##   .. ..$ intensity : num [1:4] 25.43 43.54 9.51 100
##   ..$ :'data.frame': 10 obs. of  2 variables:
##   .. ..$ masscharge: num [1:10] 83.1 93 95.1 110.1 122.1 ...
##   .. ..$ intensity : num [1:10] 12.8 10.7 11 100 13.1 ...
##   ..$ :'data.frame': 7 obs. of  2 variables:
##   .. ..$ masscharge: num [1:7] 81 93 110 154 163 ...
##   .. ..$ intensity : num [1:7] 14.4 11.1 100 79.3 11 ...
##   ..$ :'data.frame': 7 obs. of  2 variables:
##   .. ..$ masscharge: num [1:7] 81 93 110 154 163 ...
##   .. ..$ intensity : num [1:7] 14.4 11.1 100 79.3 11 ...
##   ..$ :'data.frame': 4 obs. of  2 variables:
##   .. ..$ masscharge: num [1:4] 110 156 210 227
##   .. ..$ intensity : num [1:4] 25.46 43.52 9.52 100
##   ..$ :'data.frame': 10 obs. of  2 variables:
##   .. ..$ masscharge: num [1:10] 83.1 93 95.1 110.1 122.1 ...
##   .. ..$ intensity : num [1:10] 12.8 10.7 11 100 13.1 ...
##   ..$ :'data.frame': 2 obs. of  2 variables:
##   .. ..$ masscharge: num [1:2] 119 136
##   .. ..$ intensity : num [1:2] 37.4 100
##   ..$ :'data.frame': 3 obs. of  2 variables:
##   .. ..$ masscharge: num [1:3] 92 107 134
##   .. ..$ intensity : num [1:3] 6.71 26.83 100
##   ..$ :'data.frame': 8 obs. of  2 variables:
##   .. ..$ masscharge: num [1:8] 64 65 68 90 92 ...
##   .. ..$ intensity : num [1:8] 6.91 37.74 8.31 8.91 74.77 ...
##   ..$ :'data.frame': 3 obs. of  2 variables:
##   .. ..$ masscharge: num [1:3] 92 107 134
##   .. ..$ intensity : num [1:3] 6.69 26.8 100
##   ..$ :'data.frame': 8 obs. of  2 variables:
##   .. ..$ masscharge: num [1:8] 64 65 68 90 92 ...
##   .. ..$ intensity : num [1:8] 6.91 37.73 8.33 8.95 74.78 ...
##   ..$ :'data.frame': 2 obs. of  2 variables:
##   .. ..$ masscharge: num [1:2] 119 136
##   .. ..$ intensity : num [1:2] 37.4 100
##   ..$ :'data.frame': 3 obs. of  2 variables:
##   .. ..$ masscharge: num [1:3] 58.1 59.1 118.1
##   .. ..$ intensity : num [1:3] 100 30.7 83
##   ..$ :'data.frame': 3 obs. of  2 variables:
##   .. ..$ masscharge: num [1:3] 58.1 59.1 118.1
##   .. ..$ intensity : num [1:3] 100 30.7 82.9
##   ..$ :'data.frame': 4 obs. of  2 variables:
##   .. ..$ masscharge: num [1:4] 79 97 134 346
##   .. ..$ intensity : num [1:4] 100 37.9 29.9 66.8
##   ..$ :'data.frame': 2 obs. of  2 variables:
##   .. ..$ masscharge: num [1:2] 136 348
##   .. ..$ intensity : num [1:2] 20.5 100
##   ..$ :'data.frame': 2 obs. of  2 variables:
##   .. ..$ masscharge: num [1:2] 136 348
##   .. ..$ intensity : num [1:2] 83.8 100
##   ..$ :'data.frame': 2 obs. of  2 variables:
##   .. ..$ masscharge: num [1:2] 136 348
##   .. ..$ intensity : num [1:2] 20.4 100
##   ..$ :'data.frame': 2 obs. of  2 variables:
##   .. ..$ masscharge: num [1:2] 136 348
##   .. ..$ intensity : num [1:2] 83.8 100
##   ..$ :'data.frame': 4 obs. of  2 variables:
##   .. ..$ masscharge: num [1:4] 79 97 134 346
##   .. ..$ intensity : num [1:4] 100 37.9 29.8 66.8
##   ..$ :'data.frame': 2 obs. of  2 variables:
##   .. ..$ masscharge: num [1:2] 136 268
##   .. ..$ intensity : num [1:2] 31.2 100
##   ..$ :'data.frame': 2 obs. of  2 variables:
##   .. ..$ masscharge: num [1:2] 136 268
##   .. ..$ intensity : num [1:2] 100 57.1
##   ..$ :'data.frame': 3 obs. of  2 variables:
##   .. ..$ masscharge: num [1:3] 107 134 266
##   .. ..$ intensity : num [1:3] 12.8 100 15.9
##   ..$ :'data.frame': 3 obs. of  2 variables:
##   .. ..$ masscharge: num [1:3] 119 136 268
##   .. ..$ intensity : num [1:3] 7.81 100 10.61
##   ..$ :'data.frame': 3 obs. of  2 variables:
##   .. ..$ masscharge: num [1:3] 107 134 266
##   .. ..$ intensity : num [1:3] 12.8 100 15.9
##   ..$ :'data.frame': 5 obs. of  2 variables:
##   .. ..$ masscharge: num [1:5] 136 137 268 269 270
##   .. ..$ intensity : num [1:5] 47.69 2.16 100 9.67 1.29
##   ..$ :'data.frame': 5 obs. of  2 variables:
##   .. ..$ masscharge: num [1:5] 94 119 120 136 137
##   .. ..$ intensity : num [1:5] 1.52 17.18 1.37 100 5.09
##   ..$ :'data.frame': 6 obs. of  2 variables:
##   .. ..$ masscharge: num [1:6] 119 136 137 268 269 ...
##   .. ..$ intensity : num [1:6] 0.3 100 5.31 68.77 6.91 ...
##   ..$ :'data.frame': 2 obs. of  2 variables:
##   .. ..$ masscharge: num [1:2] 136 268
##   .. ..$ intensity : num [1:2] 31.2 100
##   ..$ :'data.frame': 2 obs. of  2 variables:
##   .. ..$ masscharge: num [1:2] 136 268
##   .. ..$ intensity : num [1:2] 100 57
##   ..$ :'data.frame': 3 obs. of  2 variables:
##   .. ..$ masscharge: num [1:3] 119 136 268
##   .. ..$ intensity : num [1:3] 7.77 100 10.65
##   ..$ :'data.frame': 242 obs. of  2 variables:
##   .. ..$ masscharge: num [1:242] 46.8 47 47.9 48.7 48.7 ...
##   .. ..$ intensity : num [1:242] 1.127 0.999 0.384 0.973 1.434 ...
##   ..$ :'data.frame': 3 obs. of  2 variables:
##   .. ..$ masscharge: num [1:3] 90.1 99.5 111
##   .. ..$ intensity : num [1:3] 100 14.41 9.51
##   ..$ :'data.frame': 3 obs. of  2 variables:
##   .. ..$ masscharge: num [1:3] 90.1 99.5 111
##   .. ..$ intensity : num [1:3] 100 14.42 9.54
##   ..$ :'data.frame': 4 obs. of  2 variables:
##   .. ..$ masscharge: num [1:4] 136 330 330 330
##   .. ..$ intensity : num [1:4] 33.33 5.51 9.21 100
##   ..$ :'data.frame': 4 obs. of  2 variables:
##   .. ..$ masscharge: num [1:4] 79 107 134 328
##   .. ..$ intensity : num [1:4] 26.43 9.91 100 53.85
##   ..$ :'data.frame': 4 obs. of  2 variables:
##   .. ..$ masscharge: num [1:4] 79 107 134 328
##   .. ..$ intensity : num [1:4] 26.41 9.89 100 53.89
##   ..$ :'data.frame': 4 obs. of  2 variables:
##   .. ..$ masscharge: num [1:4] 136 330 330 330
##   .. ..$ intensity : num [1:4] 33.32 5.47 9.22 100
##   ..$ :'data.frame': 5 obs. of  2 variables:
##   .. ..$ masscharge: num [1:5] 60.1 85 102.1 103 162.1
##   .. ..$ intensity : num [1:5] 12.9 25.8 20.4 47.1 100
##   ..$ :'data.frame': 9 obs. of  2 variables:
##   .. ..$ masscharge: num [1:9] 57 58.1 59.1 60.1 61 ...
##   .. ..$ intensity : num [1:9] 5.23 42.66 14.31 39.56 2.91 ...
##   ..$ :'data.frame': 5 obs. of  2 variables:
##   .. ..$ masscharge: num [1:5] 60.1 85 102.1 103 162.1
##   .. ..$ intensity : num [1:5] 13.3 26.1 19.8 47.4 100
##   ..$ :'data.frame': 485 obs. of  2 variables:
##   .. ..$ masscharge: num [1:485] 46.3 46.5 47.5 47.7 48.8 ...
##   .. ..$ intensity : num [1:485] 0.3 0.343 0.314 0.279 0.414 ...
##   ..$ :'data.frame': 1000 obs. of  2 variables:
##   .. ..$ masscharge: num [1:1000] 45.1 45.2 45.2 45.3 45.6 ...
##   .. ..$ intensity : num [1:1000] 1.61 1.78 4.23 1.78 1.01 ...
##   ..$ :'data.frame': 2 obs. of  2 variables:
##   .. ..$ masscharge: num [1:2] 84.1 130.1
##   .. ..$ intensity : num [1:2] 100 38.1
##   ..$ :'data.frame': 2 obs. of  2 variables:
##   .. ..$ masscharge: num [1:2] 84.1 130.1
##   .. ..$ intensity : num [1:2] 100 38.1
##   ..$ :'data.frame': 4 obs. of  2 variables:
##   .. ..$ masscharge: num [1:4] 85 111 129 173
##   .. ..$ intensity : num [1:4] 100 10.71 26.83 8.71
##   ..$ :'data.frame': 4 obs. of  2 variables:
##   .. ..$ masscharge: num [1:4] 85 111 129 173
##   .. ..$ intensity : num [1:4] 100 10.71 26.79 8.71
##   ..$ :'data.frame': 2 obs. of  2 variables:
##   .. ..$ masscharge: num [1:2] 137 154
##   .. ..$ intensity : num [1:2] 100 95.4
##   ..$ :'data.frame': 4 obs. of  2 variables:
##   .. ..$ masscharge: num [1:4] 91.1 109.1 119 137.1
##   .. ..$ intensity : num [1:4] 14.92 1.38 16.93 100
##   ..$ :'data.frame': 8 obs. of  2 variables:
##   .. ..$ masscharge: num [1:8] 65 79.1 81.1 91.1 94 ...
##   .. ..$ intensity : num [1:8] 6.071 1.277 2.066 100 0.501 ...
##   ..$ :'data.frame': 11 obs. of  2 variables:
##   .. ..$ masscharge: num [1:11] 41 53 63 65 77 ...
##   .. ..$ intensity : num [1:11] 0.67 0.579 1.187 49.753 0.563 ...
##   ..$ :'data.frame': 16 obs. of  2 variables:
##   .. ..$ masscharge: num [1:16] 39 41 51 53 55 ...
##   .. ..$ intensity : num [1:16] 3.35 2.27 3.34 1.22 0.634 ...
##   ..$ :'data.frame': 9 obs. of  2 variables:
##   .. ..$ masscharge: num [1:9] 55 70 72 73 73.9 ...
##   .. ..$ intensity : num [1:9] 4.8 10.91 11.81 3.3 6.31 ...
##   ..$ :'data.frame': 9 obs. of  2 variables:
##   .. ..$ masscharge: num [1:9] 55 70 72 73 73.9 ...
##   .. ..$ intensity : num [1:9] 4.83 10.89 11.76 3.3 6.26 ...
##   ..$ :'data.frame': 447 obs. of  2 variables:
##   .. ..$ masscharge: num [1:447] 45.3 45.9 46.1 46.6 47 ...
##   .. ..$ intensity : num [1:447] 0.144 0.126 0.12 0.117 0.111 ...
##   ..$ :'data.frame': 286 obs. of  2 variables:
##   .. ..$ masscharge: num [1:286] 45.3 45.8 48.5 48.6 49.4 ...
##   .. ..$ intensity : num [1:286] 11.93 13.92 8.81 10.23 12.22 ...
##   ..$ :'data.frame': 8 obs. of  2 variables:
##   .. ..$ masscharge: num [1:8] 59 76 96.9 116 137.9 ...
##   .. ..$ intensity : num [1:8] 9.45 100 16.33 21.13 29.02 ...
##   ..$ :'data.frame': 2 obs. of  2 variables:
##   .. ..$ masscharge: num [1:2] 58.1 104.1
##   .. ..$ intensity : num [1:2] 67.1 100
##   ..$ :'data.frame': 2 obs. of  2 variables:
##   .. ..$ masscharge: num [1:2] 58.1 104.1
##   .. ..$ intensity : num [1:2] 100 14.4
##   ..$ :'data.frame': 2 obs. of  2 variables:
##   .. ..$ masscharge: num [1:2] 58.1 104.1
##   .. ..$ intensity : num [1:2] 60.9 100
##   ..$ :'data.frame': 2 obs. of  2 variables:
##   .. ..$ masscharge: num [1:2] 58.1 104.1
##   .. ..$ intensity : num [1:2] 100 25.5
##   ..$ :'data.frame': 2 obs. of  2 variables:
##   .. ..$ masscharge: num [1:2] 58.1 104.1
##   .. ..$ intensity : num [1:2] 67.1 100
##   ..$ :'data.frame': 2 obs. of  2 variables:
##   .. ..$ masscharge: num [1:2] 58.1 104.1
##   .. ..$ intensity : num [1:2] 100 14.4
##   ..$ :'data.frame': 2 obs. of  2 variables:
##   .. ..$ masscharge: num [1:2] 58.1 104.1
##   .. ..$ intensity : num [1:2] 60.8 100
##   ..$ :'data.frame': 2 obs. of  2 variables:
##   .. ..$ masscharge: num [1:2] 58.1 104.1
##   .. ..$ intensity : num [1:2] 100 25.5
##   ..$ :'data.frame': 14 obs. of  2 variables:
##   .. ..$ masscharge: num [1:14] 85 86.1 87 102.9 111 ...
##   .. ..$ intensity : num [1:14] 30 4.2 40.2 11.7 100 ...
##   ..$ :'data.frame': 14 obs. of  2 variables:
##   .. ..$ masscharge: num [1:14] 85 86.1 87 102.9 111 ...
##   .. ..$ intensity : num [1:14] 30.06 4.24 40.25 11.67 100 ...
##   ..$ :'data.frame': 2 obs. of  2 variables:
##   .. ..$ masscharge: num [1:2] 112 324
##   .. ..$ intensity : num [1:2] 50.7 100
##   ..$ :'data.frame': 2 obs. of  2 variables:
##   .. ..$ masscharge: num [1:2] 112 324
##   .. ..$ intensity : num [1:2] 100 33.2
##   ..$ :'data.frame': 4 obs. of  2 variables:
##   .. ..$ masscharge: num [1:4] 79 97 139 322
##   .. ..$ intensity : num [1:4] 100 52.45 7.51 60.36
##   ..$ :'data.frame': 4 obs. of  2 variables:
##   .. ..$ masscharge: num [1:4] 79 97 139 322
##   .. ..$ intensity : num [1:4] 100 51.85 6.81 58.96
##   ..$ :'data.frame': 4 obs. of  2 variables:
##   .. ..$ masscharge: num [1:4] 79 97 139 322
##   .. ..$ intensity : num [1:4] 100 52.48 7.46 60.4
##   ..$ :'data.frame': 4 obs. of  2 variables:
##   .. ..$ masscharge: num [1:4] 79 97 139 322
##   .. ..$ intensity : num [1:4] 100 51.86 6.83 58.94
##   ..$ :'data.frame': 2 obs. of  2 variables:
##   .. ..$ masscharge: num [1:2] 112 324
##   .. ..$ intensity : num [1:2] 50.6 100
##   ..$ :'data.frame': 2 obs. of  2 variables:
##   .. ..$ masscharge: num [1:2] 112 324
##   .. ..$ intensity : num [1:2] 100 33.3
##   ..$ :'data.frame': 3 obs. of  2 variables:
##   .. ..$ masscharge: num [1:3] 58.1 60.1 104.1
##   .. ..$ intensity : num [1:3] 19.1 38.1 100
##   ..$ :'data.frame': 3 obs. of  2 variables:
##   .. ..$ masscharge: num [1:3] 58.1 60.1 104.1
##   .. ..$ intensity : num [1:3] 19.1 38.1 100
##   ..$ :'data.frame': 4 obs. of  2 variables:
##   .. ..$ masscharge: num [1:4] 134 207 223 223
##   .. ..$ intensity : num [1:4] 4.2 13 3 100
##   .. [list output truncated]

This database includes all of the 5062 Q-ToF spectra from 1259 compounds in HMDB. We only considered the peaks larger than 10% of the base peak and calculated all of the paired mass distances within each spectrum. For example, for compound HMDB0000014, the MS/MS spectrum should be (112.1, 228.1) with intensities (69.97, 100). Then the PMD spectrum for annotation should be 116 for this compound.

For the PMD annotation, we will also compute the PMDs of the input spectra. Then we compare the input PMDs with the database. Here we need four parameters to refine the candidates. The first parameter is the ppm for mass accuracy of precursor ions. The second parameter is the range of precursor ions; the default setting is 1.1 to include M+H or M-H. The third parameter is the pmd length percentage cutoff for annotation: 0.6 (default) means 60 percent of the pmds in your sample could be found in the pmd database of a certain compound. The fourth parameter is the relative intensity cutoff of the input spectra for pmd analysis; the default is 0.1, i.e. 10% of the base peak.

# this is the spectra of HMDB0034004
file <- system.file("extdata", "challenge-msms.mgf", package = "rmwf")
# pmd msms annotation
anno <- pmd::pmdanno(file,db=qtof)
# show the distinct values of anno$name
unique(anno$name)

## [1] "HMDB0034004" "HMDB0003217"

# hand the annotation result to enviGCMS::plotanno for visualization
enviGCMS::plotanno(anno)

The scoring rule for pmd annotation is that the candidates are ordered according to the number of overlapped pmds. In this case, if two candidates have 3 and 4 pmds overlapping with the input spectra, the latter one will be the first candidate.

Such annotation could also be used for MS1 annotation. However, without a precursor ion to refine the candidates, it’s better to find the M+H or M-H in advance. In this case, the input spectra should first be processed by isotope, adduct or neutral loss detection using pmds of 1.006Da, 22.98Da, etc. Then the following steps are the same as for MS2 pmd annotation.

Retention time alignment for peaks list

Wed, 16 Dec 2020 00:00:00 +0000

A regular open source metabolomics workflow could start from an open source format of the RAW data. For xcms or other software, an algorithm like obiwarp could be used to align the peaks into features. However, some workflows start from csv files exported from the instruments. The major issue is that a peaks list is not a features table, and the peaks from multiple samples still need to be aligned. Here I will show a native method to align peaks across samples in R considering the mass accuracy and a pre-defined retention time shift.

Firstly, the input object should be a list with elements as peaks list from single samples. It should be a data.frame with retention time, mass to charge ratio and intensities.

Then we need to assign a sample as the template for alignment. The output of the alignment function should use m/z and rt data from this sample as the features property.

Now we could align the peaks across samples. Each sample's peaks list is aligned with the peaks list of the template sample, one by one. Here the alignment considers the ppm of m/z and the delta retention time. Each alignment will decrease the number of features when no peaks could be aligned to certain peaks in the template sample. This is a recursive process: the aligned features' intensities are saved and the reduced feature list is used for the next paired alignment.

When one peak from template sample could be aligned to multiple peaks in other peaks list, you need to define a function to deal with the intensity of feature in other samples. For example, you could use mean/median/sum to generate the features' intensities for the other samples. Since our inputs are peaks lists, no peaks properties such as peak width, peak shape could be checked. It’s better to control this when you output the peaks list for single samples.

If you are familiar with minfrac in xcms, you might find that such alignment actually sets minfrac to 1. In this case, you should perform this alignment on samples from the same group, and you should not use this alignment for samples without group information.

The final output should show the features' m/z, retention time and intensity across samples. However, this is alignment instead of correction. Such alignment will not correct a retention time shift larger than a certain cutoff, for example, 5s. The advantage of this alignment is that the concept is clear and easy to explain. The aligned peaks should be of high quality. This method could also be used to find the common ions across samples for quality control purposes.

Here is the code (I will put this function in enviGCMS package later):

# check input to make sure each peaks list contains the columns 'mz', 'rt' and 'ins': m/z, retention time in seconds and intensity of each peak.
data1 <- read.csv('sample1.csv')
data2 <- read.csv('sample2.csv')
data3 <- read.csv('sample3.csv')
# generate the list as input
li <- list(data1,data2,data3)
# define the function to align peaks list
#
# Align the peaks lists of several samples against one template sample.
#
# Arguments:
#   list    a list of data.frames, one per sample; each must contain the
#           columns 'mz', 'rt' and 'ins'
#   cs      index of the template sample within `list` (default 1)
#   ppm     m/z tolerance, passed on to enviGCMS::getalign
#   deltart retention time tolerance, passed on to enviGCMS::getalign
#   FUN     function used to merge the intensities when several peaks of a
#           sample align to the same template peak (e.g. mean, median, sum)
#
# Value:
#   a data.frame with the template's 'mz' and 'rt' for every feature found in
#   all samples, followed by one intensity column per sample: 'ins1' holds the
#   template intensities, 'ins2', 'ins3', ... hold the remaining samples in
#   their input order.
getretcor <- function(list, cs = 1, ppm = 10, deltart = 5, FUN) {
  stopifnot(
    is.list(list), length(list) >= 1,
    is.numeric(cs), length(cs) == 1, cs >= 1, cs <= length(list)
  )
  out_names <- c('mz', 'rt', paste0('ins', seq_along(list)))
  template <- list[[cs]]
  # every sample except the template, kept in input order
  others <- list[-cs]
  # features still alive and their intensities collected so far
  feat <- data.frame(mz = template$mz, rt = template$rt)
  ins <- data.frame(ins1 = template$ins)
  for (k in seq_along(others)) {
    smp <- others[[k]]
    al <- enviGCMS::getalign(feat$mz, smp$mz, feat$rt, smp$rt,
                             ppm = ppm, deltart = deltart)
    if (is.null(al) || nrow(al) == 0) {
      # nothing aligned: no feature can be shared by all samples
      empty <- as.data.frame(
        matrix(numeric(0), nrow = 0, ncol = length(out_names))
      )
      colnames(empty) <- out_names
      return(empty)
    }
    # look up the intensity of every aligned peak of this sample
    key_smp <- paste0(smp$mz, '@', smp$rt)
    key_al <- paste0(al$mz2, '@', al$rt2)
    al$ins2 <- smp$ins[match(key_al, key_smp)]
    # one row per template feature, in order of first appearance
    first <- !duplicated(al$xid)
    keep <- al$xid[first]
    # aggregate() returns its groups sorted, so map them back onto `keep`
    agg <- stats::aggregate(al$ins2, by = list(al$xid), FUN)
    merged <- agg[match(keep, agg[, 1]), 2]
    # drop the features without a match and append the new intensity column
    feat <- data.frame(mz = al$mz1[first], rt = al$rt1[first])
    ins <- ins[keep, , drop = FALSE]
    ins[[paste0('ins', k + 1)]] <- merged
  }
  re <- cbind.data.frame(feat, ins)
  colnames(re) <- out_names
  rownames(re) <- NULL
  re
}
# usage: pass the list of peaks lists built above (`li`), not base R's `list` function
re <- getretcor(li,FUN=mean)