diff --git a/.Rbuildignore b/.Rbuildignore index bcf1535..c84c888 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -3,3 +3,5 @@ ^data-raw$ ^README\.Rmd$ ^README-.*\.png$ +^doc$ +^Meta$ diff --git a/.gitignore b/.gitignore index a17274e..377f270 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,7 @@ temp*.R inst/doc data-raw +doc +Meta +/doc/ +/Meta/ diff --git a/DESCRIPTION b/DESCRIPTION index c5a5e69..8554336 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,27 +1,27 @@ -Package: EPIC -Type: Package -Title: Estimate the Proportion of Immune and Cancer cells -Version: 1.1.5 -Authors@R: as.person(c( - "Julien Racle [aut, cre]", - "David Gfeller [aut]" - )) -Description: Package implementing EPIC method to estimate the proportion of - immune, stromal, endothelial and cancer or other cells from bulk gene - expression data. - It is based on reference gene expression profiles for the main non-malignant - cell types and it predicts the proportion of these cells and of the - remaining "other cells" (that are mostly cancer cells) for which no - reference profile is given. -Depends: - R (>= 3.2.0) -License: file LICENSE -LazyData: TRUE -RoxygenNote: 6.1.0 -Suggests: - testthat, - knitr, - rmarkdown -Imports: - stats -VignetteBuilder: knitr +Package: EPIC +Type: Package +Title: Estimate the Proportion of Immune and Cancer cells +Version: 1.1.6 +Authors@R: as.person(c( + "Julien Racle [aut, cre]", + "David Gfeller [aut]" + )) +Description: Package implementing EPIC method to estimate the proportion of + immune, stromal, endothelial and cancer or other cells from bulk gene + expression data. + It is based on reference gene expression profiles for the main non-malignant + cell types and it predicts the proportion of these cells and of the + remaining "other cells" (that are mostly cancer cells) for which no + reference profile is given. 
+Depends: + R (>= 3.2.0) +License: file LICENSE +LazyData: TRUE +RoxygenNote: 7.2.1 +Suggests: + testthat, + knitr, + rmarkdown +Imports: + stats +VignetteBuilder: knitr diff --git a/EPIC.Rproj b/EPIC.Rproj index 497f8bf..fc42c37 100644 --- a/EPIC.Rproj +++ b/EPIC.Rproj @@ -18,3 +18,4 @@ StripTrailingWhitespace: Yes BuildType: Package PackageUseDevtools: Yes PackageInstallArgs: --no-multiarch --with-keep.source +PackageRoxygenize: rd,vignette diff --git a/NEWS b/NEWS index 7d1afbd..5d60baf 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,7 @@ +Version 1.1.6 +------------------------------------------------------------------------ +* Changed person of contact for commercial licenses to Nadette Bulgin. + Version 1.1.5 ------------------------------------------------------------------------ * Renamed EPIC's vignette to call it through "vignette('EPIC')" diff --git a/README.Rmd b/README.Rmd index 551ffbf..c109ad3 100644 --- a/README.Rmd +++ b/README.Rmd @@ -36,7 +36,7 @@ out <- EPIC(bulk = bulkSamplesMatrix) out <- EPIC(bulk = bulkSamplesMatrix, reference = referenceCellsList) ``` -`out` is a list containing the various mRNA and cell fractions in each samples as well as some *data.frame* of the goodness of fit. +`out` is a list containing the various mRNA and cell fractions in each sample as well as some *data.frame* of the goodness of fit. Values of mRNA per cell and signature genes to use can also be changed: ```{r, eval = FALSE} @@ -73,8 +73,8 @@ without warranty of any kind. Please read the file "*LICENSE*" for details. If you plan to use EPIC (version 1.1) in any for-profit application, you are required to obtain a separate license. -To do so, please contact Ece Auffarth -([eauffarth@licr.org](mailto:eauffarth@licr.org)) at the Ludwig Institute for +To do so, please contact Nadette Bulgin +([nbulgin@lcr.org](mailto:nbulgin@lcr.org)) at the Ludwig Institute for Cancer Research Ltd. 
diff --git a/README.md b/README.md index 65f4baf..f00e7f0 100644 --- a/README.md +++ b/README.md @@ -2,19 +2,28 @@ EPIC package ================ -Description ------------ -Package implementing EPIC method to estimate the proportion of immune, stromal, endothelial and cancer or other cells from bulk gene expression data. It is based on reference gene expression profiles for the main non-malignant cell types and it predicts the proportion of these cells and of the remaining "*other cells*" (that are mostly cancer cells) for which no reference profile is given. +## Description -This method is described in the publication from *Racle et al., 2017* available at . +Package implementing EPIC method to estimate the proportion of immune, +stromal, endothelial and cancer or other cells from bulk gene expression +data. It is based on reference gene expression profiles for the main +non-malignant cell types and it predicts the proportion of these cells +and of the remaining “*other cells*” (that are mostly cancer cells) for +which no reference profile is given. -EPIC is also available as a web application: . +This method is described in the publication from *Racle et al., 2017* +available at . -Usage ------ +EPIC is also available as a web application: +. -The main function in this package is `EPIC`. It needs as input a matrix of the TPM (or RPKM) gene expression from the samples for which to estimate cell proportions. One can also define the reference cells to use +## Usage + +The main function in this package is `EPIC`. It needs as input a matrix +of the TPM (or RPKM) gene expression from the samples for which to +estimate cell proportions. One can also define the reference cells to +use ``` r # library(EPIC) ## If the package isn't loaded (or use EPIC::EPIC and so on). 
@@ -22,7 +31,8 @@ out <- EPIC(bulk = bulkSamplesMatrix) out <- EPIC(bulk = bulkSamplesMatrix, reference = referenceCellsList) ``` -`out` is a list containing the various mRNA and cell fractions in each samples as well as some *data.frame* of the goodness of fit. +`out` is a list containing the various mRNA and cell fractions in each +sample as well as some *data.frame* of the goodness of fit. Values of mRNA per cell and signature genes to use can also be changed: @@ -31,70 +41,138 @@ out <- EPIC(bulk = bulkSamplesMatrix, reference = referenceCellsList, mRNA_cell out <- EPIC(bulk = bulkSamplesMatrix, reference = referenceCellsList, mRNA_cell_sub = mRNA_cell_sub_vector) ``` -Various other options are available and are well documented in the help pages from EPIC: +Various other options are available and are well documented in the help +pages from EPIC: ``` r ?EPIC::EPIC ?EPIC::EPIC.package ``` -Installation ------------- +## Installation ``` r install.packages("devtools") devtools::install_github("GfellerLab/EPIC", build_vignettes=TRUE) ``` -Web application ---------------- - -EPIC is also available as a web application: . - -Python wrapper --------------- - -A pyhton wrapper has been written by Stephen C. Van Nostrand from MIT and is available at . - -License -------- - -EPIC can be used freely by academic groups for non-commercial purposes. The product is provided free of charge, and, therefore, on an "*as is*" basis, without warranty of any kind. Please read the file "*LICENSE*" for details. - -If you plan to use EPIC (version 1.1) in any for-profit application, you are required to obtain a separate license. To do so, please contact Ece Auffarth () at the Ludwig Institute for Cancer Research Ltd. - -Contact information -------------------- - -Julien Racle (), and David Gfeller (). - -FAQ ---- - -##### What do the "*other cells*" represent? 
- -- EPIC predicts the proportions of the various cell types for which we have gene expression reference profiles (and corresponding gene signatures). But, depending on the bulk sample, it is possible that some other cell types are present for which we don't have any reference profile. EPIC returns the proportion of these remaining cells under the name "*other cells*". In the case of tumor samples, most of these other cells would certainly correspond to the cancer cells, but it could be that there are also some stromal cells or epithelial cells for example. - -##### I receive an error message "*attempt to set 'colnames' on an object with less than two dimensions*". What can I do? - -- This is certainly that some of your data is a vector instead of a matrix. Please make sure that your bulk data is in the form of a matrix (and also your reference gene expression profiles if using custom ones). - -##### What is the meaning of the warning message telling that some mRNA\_cell values are unknown? - -- As described in our manuscript, EPIC first estimates the proportion of mRNA per cell type in the bulk and then it uses the fact that some cell types have more mRNA copies per cell than other to normalize this and obtain an estimate of the proportion of cells instead of mRNA (EPIC function returns both information if you need the one or the other). For this normalization we had either measured the amount of mRNA per cell or found it in the literature (fig. 1 – fig. supplement 2 of our paper). However we don’t currently have such values for the endothelial cells and CAFs. Therefore for these two cell types, we use an average value, which might not reflect their true value and this is the reason why we output this message. If you have some values for these mRNA/cell abundances, you can also add them into EPIC, with help of the parameter "*mRNA\_cell*" or “*mRNA\_cell\_sub*” (and that would be great to share these values). 
- - If the mRNA proportions of these cell types are low, then even if you don't correct the results with their true mRNA/cell abundances, it would not really have a big impact on the results. On the other side, if there are many of these cells in your bulk sample, the results might be a little bit biased, but the effect should be similar for all samples and thus not have a too big importance (maybe you wouldn’t be fully able to tell if there are more CAFs than Tcells for example, but you should still have a good estimate of which sample has more CAFs (or Tcells) than which other sample for example). - -##### I receive a warning message that "*the optimization didn't fully converge for some samples*". What does it mean? - -- When estimating the cell proportions EPIC performs a least square regression between the observed expression of the signature genes and the expression of these genes predicted based on the estimated proportions and gene expression reference profiles of the various cell types. - - When such a warning message appears, it means that the optimization didn’t manage to fully converge for this regression, for some of the samples. You can then check the "*fit.gof$convergeCode*" (and possibly also "*fit.gof$convergeMessage*") that is outputted by EPIC alongside the cell proportions. This will tell you which samples had issue with the convergence (a value of 0 means it converged ok, while other values are errors/warnings, their meaning can be found in the help of "*optim*" (or "*constrOptim*") function from R (from "*stats*" package) which is used during the optimization and we simply forward the message it returns). - - The error code that usually comes is a "1" which means that the maximum number of iterations has been reached in the optimization. This could mean there is an issue with the bulk gene expression data that maybe don’t completely follow the assumption of equation (1) from our manuscript. 
From our experience, it seems in practice that even when there was such a warning message the proportions were predicted well, it is maybe that the optimization just wants to be *too precise*, or maybe few of the signature genes didn’t match well but the rest of signature genes could be used to have a good estimate of the proportions. - - If you have some samples that seem to have strange results, it could however be useful to check that the issue is not that these samples didn’t converge well. To be more conservative you could also remove all the samples that didn't converge well as these are maybe outliers, if it is only a small fraction from your original samples. Another possibility would be to change the parameters of the optim/constrOptim function to allow for more iterations or maybe a weaker tolerance for the convergence, but for this you would need to tweak it directly in the code of EPIC, I didn't implement such option for EPIC. +## Web application + +EPIC is also available as a web application: +. + +## Python wrapper + +A Python wrapper has been written by Stephen C. Van Nostrand from MIT +and is available at . + +## License + +EPIC can be used freely by academic groups for non-commercial purposes. +The product is provided free of charge, and, therefore, on an “*as is*” +basis, without warranty of any kind. Please read the file “*LICENSE*” +for details. + +If you plan to use EPIC (version 1.1) in any for-profit application, you +are required to obtain a separate license. To do so, please contact +Nadette Bulgin () at the Ludwig Institute for Cancer +Research Ltd. + +## Contact information + +Julien Racle (), and David Gfeller +(). + +## FAQ + +##### What do the “*other cells*” represent? + +- EPIC predicts the proportions of the various cell types for which we + have gene expression reference profiles (and corresponding gene + signatures). 
But, depending on the bulk sample, it is possible that + some other cell types are present for which we don’t have any + reference profile. EPIC returns the proportion of these remaining + cells under the name “*other cells*”. In the case of tumor samples, + most of these other cells would certainly correspond to the cancer + cells, but it could be that there are also some stromal cells or + epithelial cells for example. + +##### I receive an error message “*attempt to set ‘colnames’ on an object with less than two dimensions*”. What can I do? + +- This is certainly that some of your data is a vector instead of a + matrix. Please make sure that your bulk data is in the form of a + matrix (and also your reference gene expression profiles if using + custom ones). + +##### What is the meaning of the warning message telling that some mRNA_cell values are unknown? + +- As described in our manuscript, EPIC first estimates the proportion of + mRNA per cell type in the bulk and then it uses the fact that some + cell types have more mRNA copies per cell than other to normalize this + and obtain an estimate of the proportion of cells instead of mRNA + (EPIC function returns both information if you need the one or the + other). For this normalization we had either measured the amount of + mRNA per cell or found it in the literature (fig. 1 – fig. supplement + 2 of our paper). However we don’t currently have such values for the + endothelial cells and CAFs. Therefore for these two cell types, we use + an average value, which might not reflect their true value and this is + the reason why we output this message. If you have some values for + these mRNA/cell abundances, you can also add them into EPIC, with help + of the parameter “*mRNA_cell*” or “*mRNA_cell_sub*” (and that would be + great to share these values). 
+ + If the mRNA proportions of these cell types are low, then even if you + don’t correct the results with their true mRNA/cell abundances, it + would not really have a big impact on the results. On the other side, + if there are many of these cells in your bulk sample, the results + might be a little bit biased, but the effect should be similar for all + samples and thus not have a too big importance (maybe you wouldn’t be + fully able to tell if there are more CAFs than Tcells for example, but + you should still have a good estimate of which sample has more CAFs + (or Tcells) than which other sample for example). + +##### I receive a warning message that “*the optimization didn’t fully converge for some samples*”. What does it mean? + +- When estimating the cell proportions EPIC performs a least square + regression between the observed expression of the signature genes and + the expression of these genes predicted based on the estimated + proportions and gene expression reference profiles of the various cell + types. + + When such a warning message appears, it means that the optimization + didn’t manage to fully converge for this regression, for some of the + samples. You can then check the “*fit.gof\$convergeCode*” (and + possibly also “*fit.gof\$convergeMessage*”) that is outputted by EPIC + alongside the cell proportions. This will tell you which samples had + issue with the convergence (a value of 0 means it converged ok, while + other values are errors/warnings, their meaning can be found in the + help of “*optim*” (or “*constrOptim*”) function from R (from “*stats*” + package) which is used during the optimization and we simply forward + the message it returns). + + The error code that usually comes is a “1” which means that the + maximum number of iterations has been reached in the optimization. + This could mean there is an issue with the bulk gene expression data + that maybe don’t completely follow the assumption of equation (1) from + our manuscript. 
From our experience, it seems in practice that even + when there was such a warning message the proportions were predicted + well, it is maybe that the optimization just wants to be *too + precise*, or maybe few of the signature genes didn’t match well but + the rest of signature genes could be used to have a good estimate of + the proportions. + + If you have some samples that seem to have strange results, it could + however be useful to check that the issue is not that these samples + didn’t converge well. To be more conservative you could also remove + all the samples that didn’t converge well as these are maybe outliers, + if it is only a small fraction from your original samples. Another + possibility would be to change the parameters of the optim/constrOptim + function to allow for more iterations or maybe a weaker tolerance for + the convergence, but for this you would need to tweak it directly in + the code of EPIC, I didn’t implement such option for EPIC. ##### Who should I contact in case of a technical or other issue? -- Julien Racle (). Please provide as much details as possible and ideally send also an example input file (and/or reference profiles) that is causing the issue. +- Julien Racle (). Please provide as many details + as possible and ideally send also an example input file (and/or + reference profiles) that is causing the issue. 
diff --git a/man/BRef.Rd b/man/BRef.Rd index 40b2988..3bae7bd 100644 --- a/man/BRef.Rd +++ b/man/BRef.Rd @@ -4,11 +4,13 @@ \name{BRef} \alias{BRef} \title{Reference profiles from circulating immune cells.} -\format{A list of 3 elements: \describe{ \item{$refProfiles, +\format{ +A list of 3 elements: \describe{ \item{$refProfiles, $refProfiles.var}{Matrices (nGenes x nRefCells) of the gene expression (in TPM counts) from the reference cells and the variability of this gene expression for each gene and each cell type} \item{$sigGenes}{A list of - signature genes used to deconvolve the cell proportions} }} + signature genes used to deconvolve the cell proportions} } +} \source{ \enumerate{ \item \url{https://www.ebi.ac.uk/arrayexpress/experiments/E-GEOD-64655}, diff --git a/man/EPIC.Rd b/man/EPIC.Rd index c7f0b01..9b34e44 100644 --- a/man/EPIC.Rd +++ b/man/EPIC.Rd @@ -4,9 +4,17 @@ \alias{EPIC} \title{Estimate the proportion of immune and cancer cells.} \usage{ -EPIC(bulk, reference = NULL, mRNA_cell = NULL, mRNA_cell_sub = NULL, - sigGenes = NULL, scaleExprs = TRUE, withOtherCells = TRUE, - constrainedSum = TRUE, rangeBasedOptim = FALSE) +EPIC( + bulk, + reference = NULL, + mRNA_cell = NULL, + mRNA_cell_sub = NULL, + sigGenes = NULL, + scaleExprs = TRUE, + withOtherCells = TRUE, + constrainedSum = TRUE, + rangeBasedOptim = FALSE +) } \arguments{ \item{bulk}{A matrix (\code{nGenes} x \code{nSamples}) of the genes diff --git a/man/EPIC.package.Rd b/man/EPIC.package.Rd index 99de536..ba8bd20 100644 --- a/man/EPIC.package.Rd +++ b/man/EPIC.package.Rd @@ -3,7 +3,6 @@ \docType{package} \name{EPIC.package} \alias{EPIC.package} -\alias{EPIC.package-package} \title{EPIC: a package to Estimate the Proportion of Immune and Cancer cells from tumor gene expression data.} \description{ diff --git a/man/TRef.Rd b/man/TRef.Rd index cf2e256..80d6827 100644 --- a/man/TRef.Rd +++ b/man/TRef.Rd @@ -5,11 +5,13 @@ \alias{TRef} \title{Reference profiles obtained from single cell data of 
tumor infiltrating cells.} -\format{A list of 3 elements: \describe{ \item{$refProfiles, +\format{ +A list of 3 elements: \describe{ \item{$refProfiles, $refProfiles.var}{Matrices (nGenes x nRefCells) of the gene expression (in TPM counts) from the reference cells and the variability of this gene expression for each gene and each cell type} \item{$sigGenes}{A list of - signature genes used to deconvolve the cell proportions} }} + signature genes used to deconvolve the cell proportions} } +} \source{ \url{http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE72056} } diff --git a/man/mRNA_cell_default.Rd b/man/mRNA_cell_default.Rd index a96d2b0..4b04587 100644 --- a/man/mRNA_cell_default.Rd +++ b/man/mRNA_cell_default.Rd @@ -4,12 +4,14 @@ \name{mRNA_cell_default} \alias{mRNA_cell_default} \title{Values of mRNA / cell for the main cell types.} -\format{A named numeric vector of the relative amount of mRNA per cell type. +\format{ +A named numeric vector of the relative amount of mRNA per cell type. There are two additional "special cell types": the \emph{otherCells} which correspond to the uncharacterized cells present in the sample but without any reference profile and the \emph{default} which is the default value used for cells with reference profiles but without a value specified in the - \code{mRNA_cell_default} vector.} + \code{mRNA_cell_default} vector. +} \source{ \url{https://elifesciences.org/articles/26476} } diff --git a/man/melanoma_data.Rd b/man/melanoma_data.Rd index 634b7ea..5c13734 100644 --- a/man/melanoma_data.Rd +++ b/man/melanoma_data.Rd @@ -5,7 +5,8 @@ \alias{melanoma_data} \title{Example dataset containing data from lymph nodes from patients with metastatic melanoma.} -\format{This is a list of 3 elements: \describe{ +\format{ +This is a list of 3 elements: \describe{ \item{$counts}{(matrix of 49902 genes x 4 donors) The TPM normalized counts from the four donors. 
It has been obtained by mapping RNA-seq data to \emph{hg19} genome with help of \emph{RSEM}. Ensembl ID were then @@ -18,7 +19,8 @@ metastatic melanoma.} \item{$cellFractions.pred}{(matrix of 4 donors x 8 cell types) The proportions of the different cell types, as predicted by EPIC based on the reference profiles \code{TRef}.} -}} +} +} \source{ \url{https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE93722} } diff --git a/man/scaleCounts.Rd b/man/scaleCounts.Rd index f01a06f..b51d804 100644 --- a/man/scaleCounts.Rd +++ b/man/scaleCounts.Rd @@ -4,8 +4,7 @@ \alias{scaleCounts} \title{Scaling raw counts from each sample.} \usage{ -scaleCounts(counts, sigGenes = NULL, renormGenes = NULL, - normFact = NULL) +scaleCounts(counts, sigGenes = NULL, renormGenes = NULL, normFact = NULL) } \description{ Normalizing the sum of counts from each sample to 1e6. diff --git a/vignettes/EPIC.Rmd b/vignettes/EPIC.Rmd index 33f30f6..f57435b 100644 --- a/vignettes/EPIC.Rmd +++ b/vignettes/EPIC.Rmd @@ -69,8 +69,8 @@ without warranty of any kind. Please read the file "*LICENSE*" for details. If you plan to use EPIC (version 1.1) in any for-profit application, you are required to obtain a separate license. -To do so, please contact Ece Auffarth -([eauffarth@licr.org](mailto:eauffarth@licr.org)) at the Ludwig Institute for +To do so, please contact Nadette Bulgin +([nbulgin@lcr.org](mailto:nbulgin@lcr.org)) at the Ludwig Institute for Cancer Research Ltd.