Skip to content
This repository has been archived by the owner on Aug 31, 2022. It is now read-only.

add script to edit grants and theme in project-split manifests #116

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 65 additions & 0 deletions python/grant_dictionary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import synapseclient
import argparse
import pandas as pd


### Login to Synapse ###
def login():
    """Create a Synapse client, log it in, and return it.

    ``login()`` is called on the client without arguments, so no
    credentials are passed from this script.
    """
    client = synapseclient.Synapse()
    client.login()
    return client


### Get arguments ###
def get_args():
    """Parse the command line for the grants table id.

    Returns:
        argparse.Namespace whose only attribute, ``table_id``, holds the
        positional argument as a string.
    """
    cli = argparse.ArgumentParser(
        description='Get synapse grants table id')
    cli.add_argument(
        'table_id',
        type=str,
        help='Synapse grants merged table id',
    )
    parsed = cli.parse_args()
    return parsed


### Retrieve grants merged table and turn into data frame ###
def get_grant_table(syn, table):
    """Query the grants table and return the result as a data frame.

    Args:
        syn: object exposing ``tableQuery(query)``; the query result must
            expose ``asDataFrame()``.
        table: table id, interpolated as-is into the query's FROM clause.

    Returns:
        Whatever ``asDataFrame()`` yields for the grantNumber, theme,
        consortium and grantInstitution columns of *table*.
    """
    query = f"SELECT grantNumber, theme, consortium, grantInstitution FROM {table}"
    query_result = syn.tableQuery(query)
    return query_result.asDataFrame()


def _theme_to_string(value):
    """Flatten one ``theme`` cell into a comma-separated string."""
    # Real sequences are joined directly so characters inside a theme name
    # (apostrophes, quotes, brackets) are never stripped by accident.
    if isinstance(value, (list, tuple)):
        return ", ".join(str(theme) for theme in value)
    # Anything else keeps the original textual clean-up of its str() form.
    return str(value).strip('["').strip('"]').replace("'", "")


def grant_dictionary(grants_df):
    """Build and print grant-number lookups for consortium, theme, institution.

    Args:
        grants_df: data frame with ``grantNumber``, ``consortium``, ``theme``
            and ``grantInstitution`` columns.

    Returns:
        Tuple ``(consortium_dict, theme_dict, institution_dict)``, each keyed
        by grant number. Theme values are flattened to strings. The three
        dicts are also printed, one per line, in that order.
    """
    grant_numbers = grants_df.grantNumber

    consortium_dict = dict(zip(grant_numbers, grants_df.consortium))
    # Built in one pass instead of updating the dict while iterating over it.
    theme_dict = {
        grant: _theme_to_string(value)
        for grant, value in zip(grant_numbers, grants_df.theme)
    }
    institution_dict = dict(zip(grant_numbers, grants_df.grantInstitution))

    print(consortium_dict)
    print(theme_dict)
    print(institution_dict)

    return consortium_dict, theme_dict, institution_dict


def main():
    """Log in, read the table id from the command line, print the lookups."""
    syn = login()
    table_id = get_args().table_id
    grant_dictionary(get_grant_table(syn, table_id))


if __name__ == "__main__":
    main()
196 changes: 196 additions & 0 deletions python/grant_dicts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
# Lookup of grant number -> consortium name.
# split_manifest.py maps each manifest row's grant number through this dict
# to fill the "<type> Consortium Name" column; a grant number that is not a
# key here maps to NaN there.
# NOTE(review): presumably pasted from the output of grant_dictionary.py;
# confirm before editing by hand.
CONSORTIUM = {
'CA209891': 'CSBC',
'CA225088': 'CSBC',
'CA217655': 'PS-ON',
'CA210152': 'PS-ON',
'CA214282': 'PS-ON',
'CA214354': 'PS-ON',
'CA215798': 'CSBC',
'CA215709': 'CSBC',
'CA215794': 'CSBC',
'CA215848': 'CSBC',
'CA220378': 'CSBC',
'CA227550': 'CSBC',
'CA232137': 'CSBC',
'CA232161': 'CSBC',
'CA232382': 'CSBC',
'CA217378': 'CSBC',
'CA217376': 'CSBC',
'CA217450': 'CSBC',
'CA231978': 'CSBC',
'CA238720': 'CSBC',
'CA232216': 'CSBC',
'CA243007': 'CSBC',
'CA227544': 'CSBC',
'CA243073': 'CSBC',
'CA238475': 'CSBC',
'CA209923': 'CSBC',
'CA209997': 'CSBC',
'CA209975': 'CSBC',
'CA209971': 'CSBC',
'CA209992': 'CSBC',
'CA193419': 'PS-ON',
'CA210190': 'PS-ON',
'CA193417': 'PS-ON',
'CA193489': 'PS-ON',
'CA193461': 'PS-ON',
'CA193313': 'PS-ON',
'CA210181': 'PS-ON',
'CA210180': 'PS-ON',
'CA210173': 'PS-ON',
'CA210184': 'PS-ON',
'CA202123': 'PS-ON',
'CA202177': 'PS-ON',
'CA202241': 'PS-ON',
'CA202144': 'PS-ON',
'CA202229': 'PS-ON',
'CA209978': 'CSBC',
'CA184898': 'ICBP',
'CA217377': 'CSBC',
'CA209988': 'CSBC',
'CA217297': 'CSBC',
'CA199315': 'ICBP',
'CA188388': 'ICBP',
'CA215845': 'CSBC',
'CA184897': 'ICBP',
'CA195469': 'ICBP',
'CA250046': 'PS-ON',
'CA250040': 'PS-ON',
'CA214297': 'PS-ON',
'CA244107': 'PS-ON',
'CA244101': 'PS-ON',
'CA244100': 'PS-ON',
'CA244109': 'PS-ON',
'CA225566': 'PS-ON',
'CA214292': 'TEC',
'CA214411': 'TEC',
'CA214300': 'TEC',
'CA214369': 'TEC',
'CA214381': 'TEC',
'CA227136': 'TEC',
'CA241927': 'TEC',
'CA240301': 'TEC',
'CA232209': 'TEC',
'CA232517': 'TEC',
'CA245313': 'TEC',
'CA243072': 'CSBC',
'CA243075': 'CSBC',
'CA238728': 'CSBC',
'CA243004': 'CSBC',
'CA253553': 'CSBC',
'CA253472': 'CSBC',
'CA253540': 'CSBC',
'CA253547': 'CSBC',
'CA228963': 'PS-ON',
'CA261842': 'PS-ON',
'CA234787':
'NCI Clinical and Translational Exploratory/Developmental Studies',
'CA264610': 'CSBC',
'CA264620': 'CSBC',
'CA261822': 'PS-ON',
'CA250044': 'PS-ON',
'CA261841': 'PS-ON',
'CA254886': 'PS-ON',
'CA250481': 'PS-ON',
'CA260432': 'CSBC'
}

# Lookup of grant number -> comma-separated theme names.
# split_manifest.py maps each manifest row's grant number through this dict
# to fill the "<type> Theme Name" column; a grant number that is not a key
# here maps to NaN there.
THEME = {
    'CA209891': 'Heterogeneity, Evolution, Drug Resistance/Sensitivity',
    'CA225088': 'Drug Resistance/Sensitivity, Microenvironment',
    'CA217655': 'Drug Resistance/Sensitivity, Tumor-Immune, Metabolism',
    'CA210152': 'Metastasis, Microenvironment',
    'CA214282': 'Heterogeneity, Evolution',
    'CA214354': 'Tumor-Immune, Microenvironment',
    'CA215798': 'Drug Resistance/Sensitivity',
    'CA215709': 'Heterogeneity, Drug Resistance/Sensitivity',
    'CA215794': 'Heterogeneity, Evolution',
    'CA215848': 'Metabolism, Drug Resistance/Sensitivity',
    'CA220378': 'Heterogeneity, Evolution',
    'CA227550': 'Drug Resistance/Sensitivity, Heterogeneity',
    'CA232137': 'Metabolism, Heterogeneity, Microenvironment',
    'CA232161': 'Heterogeneity',
    'CA232382': 'Tumor-Immune, Drug Resistance/Sensitivity',
    'CA217378': 'Heterogeneity, Evolution, Drug Resistance/Sensitivity',
    'CA217376': 'Evolution, Heterogeneity, Epigenetics, Microenvironment',
    'CA217450': 'Heterogeneity, Drug Resistance/Sensitivity, Microenvironment',
    'CA231978': 'Drug Resistance/Sensitivity, Heterogeneity',
    'CA238720': 'Drug Resistance/Sensitivity',
    'CA232216': 'Tumor-Immune, Heterogeneity, Microenvironment',
    'CA243007': 'Drug Resistance/Sensitivity',
    'CA227544': 'Drug Resistance/Sensitivity, Heterogeneity',
    'CA243073': 'Heterogeneity, Evolution',
    'CA238475': 'Drug Resistance/Sensitivity, Heterogeneity',
    'CA209923': 'Computational Resource',
    'CA209997': 'Heterogeneity, Drug Resistance/Sensitivity',
    'CA209975':
        'Tumor-Immune, Microenvironment, Drug Resistance/Sensitivity, Metastasis',
    'CA209971': 'Tumor-Immune, Metastasis, Microenvironment',
    'CA209992': 'Metastasis, Microenvironment',
    'CA193419': 'Metabolism, Heterogeneity, Evolution',
    'CA210190': 'Microenvironment, Metastasis, Tumor-Immune',
    'CA193417': 'Microenvironment, Evolution, Metastasis',
    'CA193489':
        'Evolution, Drug Resistance/Sensitivity, Microenvironment, Tumor-Immune',
    'CA193461':
        'Evolution, Drug Resistance/Sensitivity, Microenvironment, Heterogeneity',
    'CA193313': 'Heterogeneity, Evolution, Metastasis',
    'CA210181': 'Heterogeneity, Tumor-Immune, Microenvironment',
    'CA210180': 'Heterogeneity, Microenvironment, Evolution',
    'CA210173': 'Microenvironment, Metastasis, Metabolism',
    'CA210184': 'Metabolism, Microenvironment, Metastasis',
    'CA202123': 'Metastasis, Heterogeneity',
    'CA202177': 'Microenvironment, Metastasis',
    'CA202241': 'Heterogeneity, Microenvironment',
    'CA202144': 'Heterogeneity, Evolution',
    'CA202229': 'Microenvironment, Heterogeneity',
    'CA209978':
        'Drug Resistance/Sensitivity, Heterogeneity, Evolution, Metastasis',
    'CA184898': 'Heterogeneity',
    'CA217377': 'Drug Resistance/Sensitivity, Microenvironment',
    'CA209988': 'Drug Resistance/Sensitivity, Heterogeneity, Microenvironment',
    'CA217297': 'Drug Resistance/Sensitivity, Epigenetics',
    'CA199315': 'Heterogeneity, Microenvironment',
    'CA188388': 'Heterogeneity, Microenvironment',
    'CA215845': 'Drug Resistance/Sensitivity, Heterogeneity',
    'CA184897': 'Metastasis, Heterogeneity',
    'CA195469': 'Heterogeneity, Evolution',
    'CA250046': 'Evolution, Heterogeneity, Epigenetics',
    'CA250040': 'Tumor-Immune',
    'CA214297': 'Metastasis, Microenvironment',
    'CA244107': 'Metastasis, Microenvironment',
    'CA244101': 'Microenvironment, Metastasis, Heterogeneity, Evolution',
    'CA244100': 'Tumor-Immune, Drug Resistance/Sensitivity',
    'CA244109': 'Heterogeneity, Microenvironment',
    'CA225566': 'Microenvironment',
    'CA214292': 'Metastasis, Microenvironment',
    'CA214411': 'Microenvironment, Drug Resistance/Sensitivity, Heterogeneity',
    'CA214300': 'Metastasis, Microenvironment, Tumor-Immune',
    'CA214369': 'Tumor-Immune, Microenvironment, Drug Resistance/Sensitivity',
    'CA214381': 'Tumor-Immune, Microenvironment, Drug Resistance/Sensitivity',
    'CA227136': 'Microenvironment, Drug Resistance/Sensitivity',
    'CA241927': 'Metastasis, Microenvironment',
    'CA240301': 'Metastasis, Microenvironment',
    'CA232209': 'Microenvironment, Heterogeneity',
    'CA232517': 'Metastasis, Microenvironment',
    'CA245313': 'Microenvironment, Metastasis, Drug Resistance/Sensitivity',
    'CA243072': 'Drug Resistance/Sensitivity',
    'CA243075': 'Drug Resistance/Sensitivity',
    'CA238728': 'Microenvironment, Tumor-Immune',
    'CA243004': 'Microenvironment, Metastasis',
    'CA253553': 'Metabolism, Tumor-Immune',
    # Was 'Drug Resistance/Sensitivit' (missing final "y").
    'CA253472': 'Tumor-Immune, Drug Resistance/Sensitivity',
    'CA253540': 'Drug Resistance/Sensitivity, Heterogeneity',
    'CA253547': 'Tumor-Immune',
    'CA228963': 'Tumor-Immune',
    'CA261842': 'Drug Resistance/Sensitivity',
    'CA234787': 'Drug Resistance/Sensitivity',
    'CA264610': 'Heterogeneity',
    'CA264620': 'Drug Resistance/Sensitivity',
    'CA261822': 'Drug Resistance/Sensitivity',
    'CA250044': 'Metastasis',
    'CA261841': 'Microenvironment',
    'CA254886': 'Mechano-genetics',
    'CA250481': 'Tumor-Immune',
    'CA260432': 'Heterogeneity'
}
69 changes: 69 additions & 0 deletions python/split_manifest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# Preliminaries
import pandas as pd
import argparse
import os
import glob
from grant_dicts import CONSORTIUM, THEME
vpchung marked this conversation as resolved.
Show resolved Hide resolved


### Get arguments ###
def get_args():
    """Parse the command line for the manifest directory.

    Returns:
        argparse.Namespace whose only attribute, ``directory_path``, holds
        the positional argument as a string.
    """
    cli = argparse.ArgumentParser(
        description='Get file path of manifest csv')
    cli.add_argument(
        'directory_path',
        type=str,
        help='Path to directory that houses the manifest csvs',
    )
    parsed = cli.parse_args()
    return parsed


### Get list of csv files ###
def get_files(directory):
    """Return the paths of all ``.csv`` files at any depth under *directory*.

    The pattern is built with ``os.path.join`` so the result is the same
    with or without a trailing separator on *directory*, and sibling folders
    that merely share its name as a prefix are not searched.

    Args:
        directory: folder to search.

    Returns:
        Sorted list of matching paths (empty if nothing matches).
    """
    # glob.escape keeps characters such as "[" in the folder name literal.
    pattern = os.path.join(glob.escape(directory), '**', '*.csv')
    # "**" only spans directory levels when recursive=True is passed.
    return sorted(glob.glob(pattern, recursive=True))


def split_manifests(files, directory):
    """Split each manifest csv into one csv per grant number.

    For every data type (Publication, Dataset, File, Tool) a
    ``<directory>/<type>sSplit`` folder is created. Every path in *files*
    that contains the type name is read, expanded to one row per grant
    number, given the consortium and theme looked up for that grant in
    ``CONSORTIUM`` / ``THEME``, and written to
    ``<directory>/<type>sSplit/<grant number>.csv``.

    Args:
        files: paths of the manifest csvs to consider.
        directory: folder under which the ``<type>sSplit`` folders are made.
    """
    data_types = ['Publication', 'Dataset', 'File', 'Tool']
    output_dirs = {
        item: os.path.join(directory, f'{item}sSplit') for item in data_types
    }
    # Files already inside an output folder are results of an earlier run,
    # not manifests, so they must not be split again.
    generated_dirs = {os.path.abspath(path) for path in output_dirs.values()}

    for item in data_types:
        out_dir = output_dirs[item]
        # exist_ok: running the script a second time must not crash here.
        os.makedirs(out_dir, exist_ok=True)

        grant_col = f'{item} Grant Number'
        consortium_col = f'{item} Consortium Name'
        theme_col = f'{item} Theme Name'

        for file in files:
            if item not in file:
                continue
            if os.path.dirname(os.path.abspath(file)) in generated_dirs:
                continue

            manifest = pd.read_csv(file, index_col=0, keep_default_na=False)
            # "A, B" -> ["A", "B"], then one row per grant number.
            manifest[grant_col] = manifest[grant_col].str.split(', ')
            manifest = manifest.explode(grant_col)
            # Make consortium and theme match the row's (single) grant;
            # grants missing from the lookups come out as NaN.
            manifest[consortium_col] = manifest[grant_col].map(CONSORTIUM)
            manifest[theme_col] = manifest[grant_col].map(THEME)

            # Group by the column name itself (not a one-element list) so the
            # group keys are plain grant numbers on every pandas version.
            grouped = manifest.groupby(grant_col)
            print(f"Found {grouped.ngroups} grant numbers in table "
                  "- splitting now...")
            for grant_number, group in grouped:
                group.to_csv(os.path.join(out_dir, f'{grant_number}.csv'))


def main():
    """Read the directory from the command line and split its manifests."""
    directory = get_args().directory_path
    split_manifests(get_files(directory), directory)
    print("Done. Manifests split by grant number.")


if __name__ == "__main__":
    main()
Loading