-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_all_children_from_tax.py
51 lines (38 loc) · 1.46 KB
/
get_all_children_from_tax.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import pandas as pd
from tqdm import tqdm
import numpy as np
import ast
import time
# Wall-clock start; the elapsed time is printed at the end of the script.
start_time = time.time()

df = pd.read_csv('taxonomy_with_direct_children.csv')
# Missing cells (NaN) are replaced with an empty-list literal so that every
# 'children_ids' value can be parsed with ast.literal_eval.
df['children_ids'] = df['children_ids'].fillna('[]')

# The row labelled 0 is assumed to be the root node (ID 1). If it lists
# itself as a child, drop that entry to avoid an infinite loop when
# descendants are collected. The membership check keeps the script from
# crashing with ValueError when the input does not contain the self-reference.
root_children = ast.literal_eval(df.at[0, 'children_ids'])
if 1 in root_children:
    root_children.remove(1)
    df.at[0, 'children_ids'] = str(root_children)
# Memo of already-computed results: tax_id -> set of all descendant IDs.
dict_children = {}


def get_all_children_ids(df, tax_id):
    """Return the set of all descendant IDs of ``tax_id``.

    Direct children are read from the ``children_ids`` cell of the first row
    whose ``tax_id`` column equals ``tax_id``. The cell must hold the string
    form of a Python list, which is parsed with ``ast.literal_eval``.
    Descendants are collected recursively and every result is memoized in
    the module-level ``dict_children``.

    Args:
        df: DataFrame with ``tax_id`` and ``children_ids`` columns.
        tax_id: ID whose descendants are wanted.

    Returns:
        A set of descendant IDs; empty when ``tax_id`` has no row or no
        children. The returned set is the cached object itself, so callers
        should not mutate it.
    """
    if tax_id in dict_children:
        return dict_children[tax_id]

    descendants = set()
    # Boolean-mask lookup: scans the whole 'tax_id' column on each uncached call.
    cells = df.loc[df['tax_id'] == tax_id, 'children_ids']
    if not cells.empty:
        # A falsy parse result is treated as "no children".
        direct_children_ids = ast.literal_eval(cells.iloc[0]) or ()
        for child_id in direct_children_ids:
            if child_id == tax_id:
                # A node listing itself as a child would otherwise recurse
                # forever. Longer cycles (A -> B -> A) are not detected.
                continue
            descendants.add(child_id)
            descendants.update(get_all_children_ids(df, child_id))

    dict_children[tax_id] = descendants
    return descendants
# Start with an empty-string column that is filled in row by row below.
df['all_children_ids'] = ''

# Walk the rows in reverse order, storing each row's full descendant set.
progress = tqdm(
    df[::-1].iterrows(),
    total=len(df),
    desc='Calculating children IDs',
)
for row_label, record in progress:
    df.at[row_label, 'all_children_ids'] = get_all_children_ids(df, record['tax_id'])

df.to_csv('taxonomy_with_all_children.csv', index=False)

end_time = time.time()
print(f'Execution time: {end_time - start_time} seconds')
print('Done')