cleaned up sample mapping more, got dates to be correct

PNNL-CompBio · Oct 29, 2024 · aae1c01 · aae1c01
1 parent 2066932
commit aae1c01
Show file tree

Hide file tree

Showing 8 changed files with 111 additions and 24 deletions.
diff --git a/build_script.py b/build_script.py
@@ -162,7 +162,7 @@ def main():
         #add chemical BMDS, fits, curves to existing data
         chemfiles=[]
         sampfiles=[]
-        print(fses)
+        #print(fses)
         for st in ['chemical','extract']:
             for dt in ['bmd','fit','dose']:
                 fdf = combineFiles(df.loc[df.sample_type==st].loc[df.data_type==dt],dt)

diff --git a/data/envSampCleanMapping.xlsx b/data/envSampCleanMapping.xlsx
diff --git a/exposome/README.md b/exposome/README.md
@@ -0,0 +1,9 @@
+## Exposome data processing
+This module contains the scripts and docker image to build the
+exposome data.
+
+
+```
+python build_script.py --expo
+
+```
diff --git a/sampleChemMapping/README.md b/sampleChemMapping/README.md
@@ -10,6 +10,13 @@ Rscript sampleChemMapping/mapSamplesToChems.R --sampId="+smap+' --chemId='+cid+\
             ' --sampMap='+smap
 ```
 
+To run/test docker image (from root of repo):
+
+```
+docker build . -t srp-samplechem -f sampleChemMapping/Dockerfile
+docker run -v $PWD:/tmp srp-samplechem
+```
+
 ### Chemical identifiers
 
 All chemicals must have a `cas_number` that has data download from the [EPA Comptox website](https://comptox.epa.gov/dashboard/batch-search). 
@@ -18,6 +25,11 @@ All cas numbers must have a `Chemical_ID` - these either come from the Tanguay l
 
 ### Sample measurements
 
-All sample measurements must comply with our pre-determined sample schema.
+All sample measurements must comply with our pre-determined sample
+schema. There must be a `Sample_ID` mapping to `SampleNumber` in the
+sample mapping file.
+
+### Benchmark dose values
 
-### Benchmark dose values
+These are processed from the stored BMD files and the recalculated
+BMD files passed in as arguments.
diff --git a/sampleChemMapping/mapSamplesToChems.R b/sampleChemMapping/mapSamplesToChems.R
@@ -332,7 +332,7 @@ buildSampleData<-function(fses_files, #files from barton that contain sample inf
             subset(!measurement_value_molar%in%c('0'))%>%
             subset(!measurement_value%in%c("0","NULL",""))#%>%
 #        select(-c(Sample_ID))#,Chemical_ID)) ##These two are added in the 4/27 version of the file
-
+       # print(head(sc))
     ##data added 1/19/2022
         #fses2<-subset(sampTab,name=='fses2')[['location']]
         #newSamp <- rio::import(fses2)|>#paste0(data.dir,'/fses/FSES_indoor_outdoor_study.xlsx'))%>%
@@ -424,7 +424,7 @@ buildSampleData<-function(fses_files, #files from barton that contain sample inf
 
     ##now we have one more rename of samples and metadata
     sampleNameRemap<-rio::import(sampMapping,which=1)|>#paste0(data.dir,'/envSampCleanMapping.xlsx'),which=1)%>%
-      dplyr::select(Sample_ID,date_sampled,sample_matrix,technology,#projectName='ProjectName',SampleName='NewSampleName',
+      dplyr::select(Sample_ID,#sample_matrix,technology,#date_sampled,projectName='ProjectName',SampleName='NewSampleName',
                     #LocationName='NewLocationName')%>%
                     ProjectName,NewSampleName,NewLocationName)%>%
       distinct()
@@ -440,12 +440,12 @@ buildSampleData<-function(fses_files, #files from barton that contain sample inf
     finalSampChem$projectName[nas]<-finalSampChem$ProjectName[nas]
     finalSampChem$LocationName[nas]<-finalSampChem$NewLocationName[nas]
     finalSampChem$SampleName[nas]<-finalSampChem$NewSampleName[nas]
-    finalSampChem$date_sampled.x[nas]<-finalSampChem$date_sampled.y[nas]
-    finalSampChem$sample_matrix.x[nas]<-finalSampChem$sample_matrix.y[nas]
-    finalSampChem$technology.x[nas]<-finalSampChem$technology.y[nas]
+    #finalSampChem$date_sampled.x[nas]<-finalSampChem$date_sampled.y[nas]
+    #finalSampChem$sample_matrix.x[nas]<-finalSampChem$sample_matrix.y[nas]
+    #finalSampChem$technology.x[nas]<-finalSampChem$technology.y[nas]
 
-    finalSampChem<-dplyr::select(finalSampChem,-c(ProjectName,NewSampleName,NewLocationName,date_sampled.y,sample_matrix.y,technology.y))%>%
-      dplyr::rename(sample_matrix='sample_matrix.x',date_sampled='date_sampled.x',technology='technology.x')%>%
+    finalSampChem<-finalSampChem|>#dplyr::select(finalSampChem,-c(ProjectName,NewSampleName,NewLocationName,date_sampled,sample_matrix,technology))%>%
+    #  dplyr::rename(sample_matrix='sample_matrix.x',date_sampled='date_sampled.x',technology='technology.x')%>%
       distinct()%>%
       subset(cas_number!='N/A')
 
@@ -857,7 +857,7 @@ buildDB<-function(chem.files=c(),extract.files=c()){
   samps<-sampChem%>%
     select(samp_columns)%>%
     distinct()
-
+#  print(head(samps))
   write.csv(samps,file=paste0(out.dir,'samples.csv'),quote=T,row.names=FALSE)
 
   ##bmds

diff --git a/srpAnalytics.yaml b/srpAnalytics.yaml
@@ -250,17 +250,36 @@ classes:
       - endPointLink
   allGeneEx:
     description: List of all experiments that measure gene expression changes in zebrafish upon chemical treatment changes
+    slots:
+      - Project
+      - cas_number
+      - Chemical_ID
+      - concentration
+    attributes:
+      Gene:
+        description:
+        range: integer
+      GeneID:
+        description:
+      adj_p_value:
+        description:
+        range: float
+      indication:
+        description:
+        range: integer
+      Log2FoldChange:
+        description: fold change
+        range: float
   srpDEGStats:
     description: Summary statistics
     slots:
       - Project
       - cas_number
       - Chemical_ID
+      - concentration
     attributes:
       link:
         description: Link to experiment?
-      concentration:
-        description: concentration at which drug was administered. includes number and unit
       DownRegulatedGenes:
         description: number of genes down-regulated upon chemical treatment
         range: integer
@@ -269,6 +288,29 @@ classes:
         range: integer
   srpDEGPathways:
     description: Pathways that are enriched in zebrafish genes that are differentially expressed upon treatment with a chemical
+    slots:
+      - Chemical_ID
+      - concentration
+    attributes:
+      term:
+        description:
+      adj_p_value:
+        description:
+        range: float
+      p_value:
+        description:
+        range: float
+      Genes:
+        description:      
+      enrichment_score:
+        description:
+        range: float
+      z_score:
+        description:
+        range: float
+      overlap:
+        description:
+
   exposomeGeneStats:
     description: Summary and link to exposome measurements of human genes that are differentially expressed upon chemical treatment
 
diff --git a/zfExp/README.md b/zfExp/README.md
@@ -0,0 +1,20 @@
+## ZF Expression parsing
+
+This module parses the ZF expression data into files amenable to the
+data portal.
+
+
+The zf expression data is built by running the `build_script.py`
+script with the `--zfExp` flag:
+
+```
+python build_script.py --zfExp
+```
+
+However, the build script is also copied into the Docker image and run as below:
+
+```
+docker build . -t srp-zfexp -f zfExp/Dockerfile
+docker run srp-zfexp
+
+```
diff --git a/zfExp/parseGexData.R b/zfExp/parseGexData.R
@@ -62,19 +62,19 @@ generateChemicalExamples<-function(genelist,deglist){
 
   for(chem in testchems){
     genes<-subset(genelist,Chemical_ID==chem)|>
-      mutate(Pvalue=-1*log10(adj_p_value),Significant=ifelse(indication==1,TRUE,FALSE),Concentration=as.factor(Concentration))|>
-      ggplot(aes(x=Log2FoldChange,y=Pvalue,col=Significant,alpha=0.5,shape=Concentration))+geom_point()
+      mutate(Pvalue=-1*log10(adj_p_value),Significant=ifelse(indication==1,TRUE,FALSE),concentration=as.factor(concentration))|>
+      ggplot(aes(x=Log2FoldChange,y=Pvalue,col=Significant,alpha=0.5,shape=concentration))+geom_point()
 
     genecount<-subset(deglist,Chemical_ID==chem)|>
-      mutate(Direction=ifelse(Log2FoldChange>0,"Up","Down"),Significant=ifelse(indication==1,TRUE,FALSE),Concentration=as.factor(Concentration))|>
+      mutate(Direction=ifelse(Log2FoldChange>0,"Up","Down"),Significant=ifelse(indication==1,TRUE,FALSE),concentration=as.factor(concentration))|>
       subset(Significant)|>
       subset(!is.na(Direction))|>
-      ggplot(aes(x=Direction,fill=Concentration))+geom_bar(position='dodge')
+      ggplot(aes(x=Direction,fill=concentration))+geom_bar(position='dodge')
 
     paths<-subset(deglist,Chemical_ID==chem)|>
       subset(toPlot==1)|>
       tidyr::separate(Overlap,into=c('top','bottom'),sep='/')|>
-      mutate(geneCount=as.numeric(top)/as.numeric(bottom),Concentration=as.factor(Concentration))|>
+      mutate(geneCount=as.numeric(top)/as.numeric(bottom),concentration=as.factor(concentration))|>
       ggplot(aes(x=reorder(Term,geneCount),y=geneCount,fill=-1*log10(Adjusted.P.value)))+geom_bar(stat='identity',position='dodge')+coord_flip()
     ##volcano plot for genes
     ##barplot for pathways
@@ -105,7 +105,7 @@ enrichSelectTop<-function(genelist,path,pvalue=0.05,top=20){
 }
 ##call functional enrichment and store results
 doEnrich<-function(genelist){
-    condition<-genelist|>dplyr::select(Chemical_ID,Concentration)|>
+    condition<-genelist|>dplyr::select(Chemical_ID,concentration)|>
       distinct()
     setEnrichrSite("FishEnrichr")
 #    dbs<-listEnrichrDbs()$libraryName
@@ -114,12 +114,14 @@ doEnrich<-function(genelist){
     allpaths<-genelist|>
       subset(indication==1)|>
     #  subset(Chemical_ID%in%c(3138,3130,3148))|>
-      group_by(Chemical_ID,Concentration)|>
+      group_by(Chemical_ID,concentration)|>
       summarize(enrich=enrichSelectTop(Gene,path,0.05,20))
 
     ##now unnest and filter
     sigpaths<-allpaths|>
-      unnest(cols=c(enrich))
+        unnest(cols=c(enrich))|>
+        dplyr::rename(adj_p_value='Adjusted.P.Value',p_value='P.Value',enrichment_score='Combined.Score',z_score='Z.Score')|>
+        dplyr::select(Chemical_ID,Term,concentration,z_score,enrichment_score,overlap,p_value,adj_p_value,Genes,toPlot)
 
     ##filter for significance, then move to long form table
     return(sigpaths)
@@ -200,8 +202,10 @@ main<-function(args=c()){
     dplyr::select(-c(control,treatment))|>
     mutate(Project='Zebrafish',Link='')|>
     left_join(chem)|>
-    mutate(Concentration=as.numeric(stringr::str_replace(Conc,'uM','')))|>
-    select(-Conc)
+    mutate(concentration=as.numeric(stringr::str_replace(Conc,'uM','')))|>
+      select(-Conc)|>
+      subset(!is.na(Log2FoldChange))|>
+      subset(!is.na(concentration))
 
 
   diffex <- allgenes|>
@@ -210,7 +214,7 @@ main<-function(args=c()){
   res<-allgenes|>
     subset(indication!=0)|>
     mutate(up=Log2FoldChange>0)|>
-    group_by(Project,cas_number,Concentration,Link,Chemical_ID,up)|>
+    group_by(Project,cas_number,concentration,Link,Chemical_ID,up)|>
     summarize(nGenes=n())|>subset(!is.na(up))