-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathKompThroughput-01.py
197 lines (156 loc) · 7.56 KB
/
KompThroughput-01.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
#### This script pulls in progress strains from Jax Lims and Core PFS and calculates a combined count
# Import all the required libraries
import mysql.connector as sql
import pandas as pd
import requests
import json
import sys
from datetime import datetime
# Connect to the Lims database
rslims_conn = sql.connect(
host="rslims.jax.org",
user="dba",
password="rsdba",
database="rslims"
)
# Query the database (sql)
rslims_data_table = pd.read_sql("""
SELECT t3.StockNumber, t4.LineName, t5.Sex, t7.GenotypeSymbol, t8.OrganismStatus, t9.ExitReason
FROM organismstudy t1
INNER JOIN organism t2 USING (_Organism_key)
INNER JOIN cv_organismstatus t8 USING (_OrganismStatus_key)
LEFT OUTER JOIN cv_exitreason t9 USING (_ExitReason_key)
INNER JOIN line t3 USING (_Line_key)
INNER JOIN linename t4 USING (_Line_key)
INNER JOIN cv_sex t5 USING (_Sex_key)
INNER JOIN genotype t6 USING (_Organism_key)
INNER JOIN cv_genotypesymbol t7 USING (_GenotypeSymbol_key)
WHERE
t1._Study_key = '27'
AND t3._LineStatus_key != '13'
AND t4.IsPrimaryName = '1'
AND t8._OrganismStatus_key != 14;
""",
rslims_conn)
# Close connection to the database
rslims_conn.close()
# Clean up the data table for Lims to match with Core PFS
# Clean the stock numbers to remove 0
rslims_data_table['StockNumber'] = rslims_data_table['StockNumber'].str.lstrip('0')
# Remove whitespaces
rslims_data_table = rslims_data_table.map(lambda x: x.strip() if isinstance(x, str) else x)
# Connect to the PFS via OData API
# Set your username and password for basic authentication
username = "[email protected]"
password = "vA&ce3(ROzAL"
# Query using the credentials
auth = (username, password)
queryString = "https://jacksonlabs.platformforscience.com/PROD/odata/KOMP_REQUEST?$expand=REV_MOUSESAMPLELOT_KOMPREQUEST($expand=SAMPLE/pfs.MOUSE_SAMPLE)&$count=true"
result = requests.get(queryString, auth=auth,headers = {"Prefer": "odata.maxpagesize=5000"})
# The query outputs in JSON format, clean and parse the JSON
# Parse the string data into a dictionary
content = json.loads(result.content)
# Get list of experiment entities
exp_ls = content["value"]
# Initialize a variable to store all the print results
output = ""
# For each experiment in the list, get the list EXPERIMENT_SAMPLES
for exp in exp_ls:
sample_lots = exp["REV_MOUSESAMPLELOT_KOMPREQUEST"] # Could be empty
# for each experiment_sample,
for sample_lot in sample_lots:
mouse_sample = sample_lot["SAMPLE"]
output += 'Mouse Name:' + mouse_sample['JAX_SAMPLE_EXTERNALID'] + "\n"
output += 'Mouse Sex:' + mouse_sample['JAX_MOUSESAMPLE_SEX'] + "\n"
idx = mouse_sample['JAX_MOUSESAMPLE_ALLELE'].index(' ')
output += 'Mouse Line:' + mouse_sample['JAX_MOUSESAMPLE_ALLELE'][0:idx] + "\n"
output += 'Mouse StockNumber:' + mouse_sample['JAX_MOUSESAMPLE_ALLELE'][idx+4:-1] + "\n" # 4 lets us skip the "JR"
output += 'Mouse Genotype:' + mouse_sample['JAX_MOUSESAMPLE_GENOTYPE'] + "\n"
#output += 'Mouse life status:' + mouse_sample['JAX_SAMPLELOT_STATUS'] + "\n" # Note this a sample_lot!
if sample_lot['JAX_SAMPLELOT_STATUS'] is None:
output += 'Mouse life status:' + "null" + "\n"
else:
output += 'Mouse life status:' + sample_lot['JAX_SAMPLELOT_STATUS'] + "\n"
if mouse_sample['JAX_MOUSESAMPLE_EXITREASON'] is None:
output += 'Mouse exit reason:' + "null" + "\n"
else:
output += 'Mouse exit reason:' + mouse_sample['JAX_MOUSESAMPLE_EXITREASON'] + "\n"
if mouse_sample['JAX_MOUSESAMPLE_ISFILLER'] is None:
output += 'Mouse filler:' + "null" + "\n"
else:
output += 'Mouse filler:' + mouse_sample['JAX_MOUSESAMPLE_ISFILLER'] + "\n"
output += "\n"
#print(output)
# Split the data into individual entries
entries = output.strip().split('\n\n')
# Initialize a list to store parsed entries
mouse_data = []
# Parse each entry and extract information
for entry in entries:
lines = entry.split('\n')
mouse_info = {}
for line in lines:
key, value = line.split(':')
mouse_info[key.strip()] = value.strip()
mouse_data.append(mouse_info)
# Store in a dataframe
pfs_data_parsed = pd.DataFrame(mouse_data)
# Check for duplicates and remove those
if pfs_data_parsed['Mouse Name'].duplicated().any():
pfs_data_parsed.drop_duplicates(subset=['Mouse Name'], inplace=True)
# print("Duplicates have been dropped from the DataFrame.")
#else:
# print("No duplicates found in the DataFrame.")
# Clean the data table from PFS to match with Lims
# Drop the column 'Number of experiments' and 'Mouse Name' if it exists
if 'Number of experiments' in pfs_data_parsed.columns:
pfs_data_parsed.drop(columns=['Number of experiments'], inplace=True)
if 'Mouse Name' in pfs_data_parsed.columns:
pfs_data_parsed.drop(columns=['Mouse Name'], inplace=True)
# Drop rows with 'Yes' in the Mouse filler column - we do not need to count filler mice
pfs_data_parsed = pfs_data_parsed[pfs_data_parsed["Mouse filler"].str.contains("Yes") == False]
# Drop the Mouse filler column
pfs_data_parsed.drop(['Mouse filler'], axis=1, inplace=True)
# Replace 'M' with 'Male' and 'F' with 'Female' in Mouse Sex column
pfs_data_parsed['Mouse Sex'] = pfs_data_parsed['Mouse Sex'].replace({'M': 'Male', 'F': 'Female'})
# Replace 'Discarded' with 'Euthanized' and 'Received' with 'Alive' in Mouse life status column
pfs_data_parsed['Mouse life status'] = pfs_data_parsed['Mouse life status'].replace({'Discarded': 'Euthanized', 'Received': 'Alive'})
# Rename the columns
pfs_data_parsed.rename(columns={
'Mouse Sex': 'Sex',
'Mouse Line': 'LineName',
'Mouse StockNumber': 'StockNumber',
'Mouse Genotype': 'GenotypeSymbol',
'Mouse life status': 'OrganismStatus',
'Mouse exit reason': 'ExitReason'}, inplace=True)
# Clean the stock numbers to remove 0
pfs_data_parsed['StockNumber'] = pfs_data_parsed['StockNumber'].str.lstrip('0')
# Remove whitespaces
pfs_data_parsed = pfs_data_parsed.map(lambda x: x.strip() if isinstance(x, str) else x)
# Modify the 'Exit Reason' column from 'null' to 'None'
pfs_data_parsed['ExitReason'] = pfs_data_parsed['ExitReason'].replace({'null': 'None'})
# Print the data tables from Lims and PFS
#print(rslims_data_table)
#print(pfs_data_parsed)
# Export the PFS data to check with the excel export directly from PFS
#pfs_data_parsed.to_csv("pfs_data_python_04_22.csv", index=False)
# Concatenate the two tables
# Merge the two tables
lims_pfs_merge = pd.concat([rslims_data_table, pfs_data_parsed], ignore_index=True)
# Empty rows break the code while grouping, replace with 'None' strings
# Convert blank values to None
lims_pfs_merge['ExitReason'] = lims_pfs_merge['ExitReason'].fillna('None').astype(str)
# Group by and count
grouped_lims_pfs_df = lims_pfs_merge.groupby(['StockNumber', 'LineName', 'Sex', 'GenotypeSymbol', 'OrganismStatus', 'ExitReason'], as_index=False).size()
# Drop rows for Stock Number 5304
grouped_lims_pfs_df = grouped_lims_pfs_df[grouped_lims_pfs_df["StockNumber"].str.contains("5304") == False]
# Get the current date and time
current_datetime = datetime.now()
# Format the date and time
date_string = current_datetime.strftime("%Y-%m-%d")
# Define the filename with the current date
filename = sys.argv[1]
#filename = f'lims_pfs_strain_inprogress_{date_string}.csv'
# Export as a csv file with date
grouped_lims_pfs_df.to_csv(filename, index=False)
#print(f"CSV file '{filename}' has been created.")