Skip to content

Commit

Permalink
cleaned up sample mapping more, got dates to be correct
Browse files Browse the repository at this point in the history
  • Loading branch information
sgosline committed Oct 29, 2024
1 parent 2066932 commit aae1c01
Show file tree
Hide file tree
Showing 8 changed files with 111 additions and 24 deletions.
2 changes: 1 addition & 1 deletion build_script.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ def main():
#add chemical BMDS, fits, curves to existing data
chemfiles=[]
sampfiles=[]
print(fses)
#print(fses)
for st in ['chemical','extract']:
for dt in ['bmd','fit','dose']:
fdf = combineFiles(df.loc[df.sample_type==st].loc[df.data_type==dt],dt)
Expand Down
Binary file modified data/envSampCleanMapping.xlsx
Binary file not shown.
9 changes: 9 additions & 0 deletions exposome/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
## Exposome data processing
This module contains the scripts and docker image to build the
exposome data.


```
python build_script.py --expo
```
16 changes: 14 additions & 2 deletions sampleChemMapping/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,13 @@ Rscript sampleChemMapping/mapSamplesToChems.R --sampId="+smap+' --chemId='+cid+\
' --sampMap='+smap
```

To run/test docker image (from root of repo):

```
docker build . -t srp-samplechem -f sampleChemMapping/Dockerfile
docker run -v $PWD:/tmp srp-samplechem
```

### Chemical identifiers

All chemicals must have a `cas_number` that has data downloaded from the [EPA Comptox website](https://comptox.epa.gov/dashboard/batch-search).
Expand All @@ -18,6 +25,11 @@ All cas numbers must have a `Chemical_ID` - these either come from the Tanguay l

### Sample measurements

All sample measurements must comply with our pre-determined sample schema.
All sample measurements must comply with our pre-determined sample
schema. There must be a `Sample_ID` mapping to `SampleNumber` in the
sample mapping file.

### Benchmark dose values

### Benchmark dose values
These are processed from the stored BMD files and the recalculated
ones passed into the argument.
16 changes: 8 additions & 8 deletions sampleChemMapping/mapSamplesToChems.R
Original file line number Diff line number Diff line change
Expand Up @@ -332,7 +332,7 @@ buildSampleData<-function(fses_files, #files from barton that contain sample inf
subset(!measurement_value_molar%in%c('0'))%>%
subset(!measurement_value%in%c("0","NULL",""))#%>%
# select(-c(Sample_ID))#,Chemical_ID)) ##These two are added in the 4/27 version of the file

# print(head(sc))
##data added 1/19/2022
#fses2<-subset(sampTab,name=='fses2')[['location']]
#newSamp <- rio::import(fses2)|>#paste0(data.dir,'/fses/FSES_indoor_outdoor_study.xlsx'))%>%
Expand Down Expand Up @@ -424,7 +424,7 @@ buildSampleData<-function(fses_files, #files from barton that contain sample inf

##now we have one more rename of samples and metadata
sampleNameRemap<-rio::import(sampMapping,which=1)|>#paste0(data.dir,'/envSampCleanMapping.xlsx'),which=1)%>%
dplyr::select(Sample_ID,date_sampled,sample_matrix,technology,#projectName='ProjectName',SampleName='NewSampleName',
dplyr::select(Sample_ID,#sample_matrix,technology,#date_sampled,projectName='ProjectName',SampleName='NewSampleName',
#LocationName='NewLocationName')%>%
ProjectName,NewSampleName,NewLocationName)%>%
distinct()
Expand All @@ -440,12 +440,12 @@ buildSampleData<-function(fses_files, #files from barton that contain sample inf
finalSampChem$projectName[nas]<-finalSampChem$ProjectName[nas]
finalSampChem$LocationName[nas]<-finalSampChem$NewLocationName[nas]
finalSampChem$SampleName[nas]<-finalSampChem$NewSampleName[nas]
finalSampChem$date_sampled.x[nas]<-finalSampChem$date_sampled.y[nas]
finalSampChem$sample_matrix.x[nas]<-finalSampChem$sample_matrix.y[nas]
finalSampChem$technology.x[nas]<-finalSampChem$technology.y[nas]
#finalSampChem$date_sampled.x[nas]<-finalSampChem$date_sampled.y[nas]
#finalSampChem$sample_matrix.x[nas]<-finalSampChem$sample_matrix.y[nas]
#finalSampChem$technology.x[nas]<-finalSampChem$technology.y[nas]

finalSampChem<-dplyr::select(finalSampChem,-c(ProjectName,NewSampleName,NewLocationName,date_sampled.y,sample_matrix.y,technology.y))%>%
dplyr::rename(sample_matrix='sample_matrix.x',date_sampled='date_sampled.x',technology='technology.x')%>%
finalSampChem<-finalSampChem|>#dplyr::select(finalSampChem,-c(ProjectName,NewSampleName,NewLocationName,date_sampled,sample_matrix,technology))%>%
# dplyr::rename(sample_matrix='sample_matrix.x',date_sampled='date_sampled.x',technology='technology.x')%>%
distinct()%>%
subset(cas_number!='N/A')

Expand Down Expand Up @@ -857,7 +857,7 @@ buildDB<-function(chem.files=c(),extract.files=c()){
samps<-sampChem%>%
select(samp_columns)%>%
distinct()

# print(head(samps))
write.csv(samps,file=paste0(out.dir,'samples.csv'),quote=T,row.names=FALSE)

##bmds
Expand Down
46 changes: 44 additions & 2 deletions srpAnalytics.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -250,17 +250,36 @@ classes:
- endPointLink
allGeneEx:
description: List of all experiments that measure gene expression changes in zebrafish upon chemical treatment changes
slots:
- Project
- cas_number
- Chemical_ID
- concentration
attributes:
Gene:
description:
range: integer
GeneID:
description:
adj_p_value:
description:
range: float
indication:
description:
range: integer
Log2FoldChange:
description: fold change
range: float
srpDEGStats:
description: Summary statistics
slots:
- Project
- cas_number
- Chemical_ID
- concentration
attributes:
link:
description: Link to experiment?
concentration:
description: concentration at which drug was administered. includes number and unit
DownRegulatedGenes:
description: number of genes down-regulated upon chemical treatment
range: integer
Expand All @@ -269,6 +288,29 @@ classes:
range: integer
srpDEGPathways:
description: Pathways that are enriched in zebrafish genes that are differentially expressed upon treatment with a chemical
slots:
- Chemical_ID
- concentration
attributes:
term:
description:
adj_p_value:
description:
range: float
p_value:
description:
range: float
Genes:
description:
enrichment_score:
description:
range: float
z_score:
description:
range: float
overlap:
description:

exposomeGeneStats:
description: Summary and link to exposome measurements of human genes that are differentially expressed upon chemical treatment

20 changes: 20 additions & 0 deletions zfExp/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
## ZF Expression parsing

This module parses the ZF expression data into files amenable to the
data portal.


This module is also run by the `build_script.py` script to build the zf
expression data.

```
python build_script.py --zfExp
```

However the build script is also copied into the Dockerfile and run as below:

```
docker build . -t srp-zfexp -f zfExp/Dockerfile
docker run srp-zfexp
```
26 changes: 15 additions & 11 deletions zfExp/parseGexData.R
Original file line number Diff line number Diff line change
Expand Up @@ -62,19 +62,19 @@ generateChemicalExamples<-function(genelist,deglist){

for(chem in testchems){
genes<-subset(genelist,Chemical_ID==chem)|>
mutate(Pvalue=-1*log10(adj_p_value),Significant=ifelse(indication==1,TRUE,FALSE),Concentration=as.factor(Concentration))|>
ggplot(aes(x=Log2FoldChange,y=Pvalue,col=Significant,alpha=0.5,shape=Concentration))+geom_point()
mutate(Pvalue=-1*log10(adj_p_value),Significant=ifelse(indication==1,TRUE,FALSE),concentration=as.factor(concentration))|>
ggplot(aes(x=Log2FoldChange,y=Pvalue,col=Significant,alpha=0.5,shape=concentration))+geom_point()

genecount<-subset(deglist,Chemical_ID==chem)|>
mutate(Direction=ifelse(Log2FoldChange>0,"Up","Down"),Significant=ifelse(indication==1,TRUE,FALSE),Concentration=as.factor(Concentration))|>
mutate(Direction=ifelse(Log2FoldChange>0,"Up","Down"),Significant=ifelse(indication==1,TRUE,FALSE),concentration=as.factor(concentration))|>
subset(Significant)|>
subset(!is.na(Direction))|>
ggplot(aes(x=Direction,fill=Concentration))+geom_bar(position='dodge')
ggplot(aes(x=Direction,fill=concentration))+geom_bar(position='dodge')

paths<-subset(deglist,Chemical_ID==chem)|>
subset(toPlot==1)|>
tidyr::separate(Overlap,into=c('top','bottom'),sep='/')|>
mutate(geneCount=as.numeric(top)/as.numeric(bottom),Concentration=as.factor(Concentration))|>
mutate(geneCount=as.numeric(top)/as.numeric(bottom),concentration=as.factor(concentration))|>
ggplot(aes(x=reorder(Term,geneCount),y=geneCount,fill=-1*log10(Adjusted.P.value)))+geom_bar(stat='identity',position='dodge')+coord_flip()
##volcano plot for genes
##barplot for pathways
Expand Down Expand Up @@ -105,7 +105,7 @@ enrichSelectTop<-function(genelist,path,pvalue=0.05,top=20){
}
##call functional enrichment and store results
doEnrich<-function(genelist){
condition<-genelist|>dplyr::select(Chemical_ID,Concentration)|>
condition<-genelist|>dplyr::select(Chemical_ID,concentration)|>
distinct()
setEnrichrSite("FishEnrichr")
# dbs<-listEnrichrDbs()$libraryName
Expand All @@ -114,12 +114,14 @@ doEnrich<-function(genelist){
allpaths<-genelist|>
subset(indication==1)|>
# subset(Chemical_ID%in%c(3138,3130,3148))|>
group_by(Chemical_ID,Concentration)|>
group_by(Chemical_ID,concentration)|>
summarize(enrich=enrichSelectTop(Gene,path,0.05,20))

##now unnest and filter
sigpaths<-allpaths|>
unnest(cols=c(enrich))
unnest(cols=c(enrich))|>
dplyr::rename(adj_p_value='Adjusted.P.Value',p_value='P.Value',enrichment_score='Combined.Score',z_score='Z.Score')|>
dplyr::select(Chemical_ID,Term,concentration,z_score,enrichment_score,overlap,p_value,adj_p_value,Genes,toPlot)

##filter for signifiance, then move to long form table
return(sigpaths)
Expand Down Expand Up @@ -200,8 +202,10 @@ main<-function(args=c()){
dplyr::select(-c(control,treatment))|>
mutate(Project='Zebrafish',Link='')|>
left_join(chem)|>
mutate(Concentration=as.numeric(stringr::str_replace(Conc,'uM','')))|>
select(-Conc)
mutate(concentration=as.numeric(stringr::str_replace(Conc,'uM','')))|>
select(-Conc)|>
subset(!is.na(Log2FoldChange))|>
subset(!is.na(concentration))


diffex <- allgenes|>
Expand All @@ -210,7 +214,7 @@ main<-function(args=c()){
res<-allgenes|>
subset(indication!=0)|>
mutate(up=Log2FoldChange>0)|>
group_by(Project,cas_number,Concentration,Link,Chemical_ID,up)|>
group_by(Project,cas_number,concentration,Link,Chemical_ID,up)|>
summarize(nGenes=n())|>subset(!is.na(up))


Expand Down

0 comments on commit aae1c01

Please sign in to comment.