-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit c1228d7
Showing
30 changed files
with
1,163 additions
and
0 deletions.
There are no files selected for viewing
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
# coding=utf-8 | ||
import multiprocessing | ||
from time import sleep | ||
import requests | ||
import bs4 | ||
import sys | ||
from w3lib.html import remove_tags | ||
import json | ||
|
||
|
||
def get_mesh(phenotype, timeout=60):
    """Look up one phenotype in the 2016 NLM MeSH browser and scrape the result.

    Simulates the form submission of
    https://www.nlm.nih.gov/mesh/2016/mesh_browser/MBrowser.html and reads the
    ``<tr>`` rows of the returned page.

    Args:
        phenotype: Either a search term, which is posted as an exact-term
            query, or a full ``https://`` URL, which is fetched directly
            (used when following a "Heading Mapped to" link).
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block a worker forever.

    Returns:
        A dict with the keys ``phenotype``, ``mesh_heading``,
        ``tree_numbers`` (list), ``note`` (all note cells joined with ``;``),
        ``unique_id`` and ``heading_mapped_to`` (dict mapping the mapped
        heading text to its own record, fetched recursively).  If anything
        fails, every field except ``phenotype`` is the string ``'error'``.
    """
    try:
        if phenotype.startswith('https://'):
            r = requests.get(phenotype, timeout=timeout)
        else:
            url = 'https://www.nlm.nih.gov/cgi/mesh/2016/MB_cgi'
            exact_submit = {
                'term': phenotype,
                'exact': 'Find Exact Term'
            }
            r = requests.post(url, data=exact_submit, timeout=timeout)
        # Treat HTTP error pages as failures instead of parsing them as an
        # empty (but seemingly valid) record.
        r.raise_for_status()

        soup = bs4.BeautifulSoup(r.text, 'html.parser')
        mesh_heading = ''
        tree_numbers = []
        notes = []
        unique_id = ''
        heading_mapped_to = {}
        for tag in soup.find_all("tr"):
            th = remove_tags(str(tag.find('th')))
            td = remove_tags(str(tag.find('td')))
            label = th.strip()
            if label == 'MeSH Heading' or label == 'Name of Substance':
                mesh_heading = td
            elif label == 'Tree Number':
                tree_numbers.append(td)
            elif 'Note' in th:
                notes.append(td)
            elif label == 'Heading Mapped to':
                # Follow the link of the mapped heading rather than searching
                # for its text again.
                heading_mapped_to[td.strip()] = get_mesh(
                    'https://www.nlm.nih.gov%s' % tag.find('a')['href'],
                    timeout
                )
            elif label == 'Unique ID':
                unique_id = td
        return {
            'phenotype': phenotype,
            'mesh_heading': mesh_heading,
            'tree_numbers': tree_numbers,
            'note': ';'.join(notes),
            'unique_id': unique_id,
            'heading_mapped_to': heading_mapped_to
        }
    except Exception:
        # ``Exception`` (not a bare ``except``) so Ctrl-C and SystemExit still
        # stop the worker.  The pause is presumably a back-off before this
        # worker sends its next request.
        sleep(10)
        return {
            'phenotype': phenotype,
            'mesh_heading': 'error',
            'tree_numbers': 'error',
            'note': 'error',
            'unique_id': 'error',
            'heading_mapped_to': 'error',
        }
|
||
|
||
def read_search_phenotypes(fp):
    """Return the non-blank lines of the file at ``fp``, whitespace-stripped.

    Lines that are empty after stripping are dropped; the order of the
    remaining lines is preserved.
    """
    with open(fp) as handle:
        stripped_lines = (raw.strip() for raw in handle)
        return [term for term in stripped_lines if term != '']
|
||
|
||
def batch(fp):
    """Look up every phenotype listed in ``fp`` and dump the records as JSON.

    The phenotypes are read with ``read_search_phenotypes``, resolved in
    parallel with ``get_mesh`` on a pool of 100 worker processes, and the
    resulting list is written to ``output/mesh_code_result.json``.

    Args:
        fp: Path of a text file with one phenotype per line.
    """
    phenotypes = read_search_phenotypes(fp)
    pool = multiprocessing.Pool(100)
    try:
        result = pool.map(get_mesh, phenotypes)
    finally:
        # Always reap the worker processes, even if a lookup raised.
        pool.terminate()
        pool.join()
    # Use a context manager so the file is flushed and closed deterministically.
    with open('output/mesh_code_result.json', 'w') as fo:
        json.dump(result, fo)
|
||
|
||
if __name__ == '__main__':
    # The only argument is the path of the phenotype list file; report a
    # usage message instead of an IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit('usage: %s PHENOTYPE_FILE' % sys.argv[0])
    batch(sys.argv[1])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
import os | ||
import multiprocessing | ||
|
||
|
||
def run(gene):
    """Download the OMIM gene-map search result for ``gene`` unless cached.

    The result is stored as ``result/<gene>.tsv``; if that file already
    exists nothing is downloaded.

    Args:
        gene: Search string inserted into the OMIM gene-map query URL.

    Raises:
        OSError: If the ``aria2c`` executable cannot be started.
    """
    import subprocess

    target = 'result/%s.tsv' % gene
    if os.path.exists(target):
        return
    # Pass an argument list (no shell) so a gene name containing quotes or
    # shell metacharacters cannot alter the command.
    # NOTE(review): the session cookie below is hard-coded in the source and
    # has presumably expired - confirm and load it from configuration.
    subprocess.call([
        'aria2c',
        '--header=Cookie:donation-popup=true; '
        'sessionid=o5tznyomgtr5x70wmej8hlyg4sck0k8z; _ga=GA1.2.801700580.1476441682',
        'https://omim.org/search/?index=geneMap&search=%s&start=1&limit=10000&format=tab' % gene,
        '-o', target,
    ])
|
||
|
||
def get_genes(path='/home/jjiang/omim.txt'):
    """Read the list of gene names, one per line.

    Args:
        path: File to read.  Defaults to the location the script was
            originally written for.

    Returns:
        The whitespace-stripped lines of the file, in order.  Blank lines are
        skipped: an empty gene name would otherwise match every record in
        ``get_all_records`` and trigger a download for an empty search.
    """
    with open(path) as fi:
        stripped = (line.strip() for line in fi)
        return [gene for gene in stripped if gene]
|
||
|
||
def multi_run():
    """Download the OMIM result of every gene using 100 worker processes."""
    genes = get_genes()
    pool = multiprocessing.Pool(100)
    try:
        pool.map(run, genes)
    finally:
        # Always reap the worker processes, even if a download raised.
        pool.terminate()
        pool.join()
|
||
|
||
def get_all_records():
    """Yield the matching lines of every downloaded ``result/<gene>.tsv``.

    For each gene from ``get_genes``, lines starting with
    ``'Gene Map Search'`` are skipped, and a line is yielded only when it has
    at least three tab-separated fields and the third one contains the gene
    name.
    """
    for gene in get_genes():
        with open('result/%s.tsv' % gene) as fi:
            for line in fi:
                if line.startswith('Gene Map Search'):
                    continue
                columns = line.split('\t')
                # Lines with fewer than three columns cannot match.
                if len(columns) > 2 and gene in columns[2]:
                    yield line
|
||
|
||
def generate_all():
    """Write every line from ``get_all_records`` to ``all_record.tsv``."""
    with open('all_record.tsv', 'w') as fo:
        fo.writelines(get_all_records())
|
||
|
||
if __name__ == '__main__':
    # Only merges the already-downloaded files; multi_run() is not called here.
    generate_all()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
import os | ||
import urllib2 | ||
|
||
import bs4 | ||
|
||
|
||
def get_ENST(enst_number):
    """Download the Ensembl FASTA export of a transcript and return its text.

    ``wget`` saves the export to ``<enst_number>.fa`` in the current
    directory; the file is read back and then deleted.

    Args:
        enst_number: Transcript identifier inserted into the export URL and
            used as the temporary file name.

    Returns:
        The content of the downloaded file as a string.
    """
    import subprocess

    fasta_path = '%s.fa' % enst_number
    url = (
        'http://www.ensembl.org/Homo_sapiens/Export/Output/Transcript?db=core;flank3_display=0;'
        'flank5_display=0;output=fasta;t=%s;param=cdna;param=coding;param=peptide;param=utr5;param=utr3;'
        'param=exon;param=intron;genomic=unmasked;_format=Text' % enst_number
    )
    try:
        # Argument list instead of a shell string: the identifier cannot be
        # interpreted by a shell.
        subprocess.call(['wget', url, '-O', fasta_path])
        with open(fasta_path) as fi:
            return fi.read()
    finally:
        # Remove the temporary file even if the download or the read failed.
        if os.path.exists(fasta_path):
            os.remove(fasta_path)
|
||
|
||
def get_uuid(nm_number):
    """Resolve an accession to its NCBI nucleotide ID via E-utilities esearch.

    Args:
        nm_number: Search term appended to the esearch URL.

    Returns:
        The content of the single ``<id>`` element of the response, or
        ``None`` when the response contains zero or several IDs.
    """
    from contextlib import closing

    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=nucleotide&term=%s' % nm_number
    # Close the HTTP response explicitly instead of leaving it to the
    # garbage collector.
    with closing(urllib2.urlopen(url)) as response:
        payload = response.read()
    soup = bs4.BeautifulSoup(payload, 'html.parser')
    id_list = [str(s).replace('<id>', '').replace('</id>', '') for s in soup('id')]
    if len(id_list) == 1:
        return id_list[0]
    return None
|
||
|
||
def get_NM(nm_number):
    """Download the NCBI FASTA record of an accession and return its text.

    The accession is first resolved with ``get_uuid``; ``wget`` then saves
    the record to ``<nm_number>.fa`` in the current directory, which is read
    back and deleted.

    Args:
        nm_number: Accession to look up; also used as the temporary file name.

    Returns:
        The content of the downloaded file, or ``''`` when ``get_uuid``
        returns ``None``.
    """
    import subprocess

    uuid = get_uuid(nm_number)
    if uuid is None:
        return ''
    fasta_path = '%s.fa' % nm_number
    url = (
        'https://www.ncbi.nlm.nih.gov//sviewer/viewer.cgi?tool=portal&save=file&log$=seqview&db=nuccore&'
        'report=fasta&sort=&id=%s&from=begin&to=end&maxplex=1' % uuid
    )
    try:
        # Argument list instead of a shell string: neither the accession nor
        # the ``$`` in the URL is interpreted by a shell.
        subprocess.call(['wget', url, '-O', fasta_path])
        with open(fasta_path) as fi:
            return fi.read()
    finally:
        # Remove the temporary file even if the download or the read failed.
        if os.path.exists(fasta_path):
            os.remove(fasta_path)
|
||
|
||
def get_transcripts(accession_list):
    """Fetch the sequences of several accessions and join them with newlines.

    Accessions starting with ``ENST`` are fetched with ``get_ENST``, those
    starting with ``NM`` with ``get_NM``; any other accession is ignored.
    """
    fetched = []
    for acc in accession_list:
        if acc.startswith('ENST'):
            fetcher = get_ENST
        elif acc.startswith('NM'):
            fetcher = get_NM
        else:
            # Unknown accession type: nothing to download.
            continue
        fetched.append(fetcher(acc))
    return '\n'.join(fetched)
|
||
|
||
if __name__ == '__main__':
    # Manual smoke test: print the ID resolved for one known accession.
    print(get_uuid('NM_002944'))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
import json | ||
import csv | ||
import sys | ||
|
||
# Python 2 only: make implicit str/unicode conversions use UTF-8 so that
# non-ASCII text can be written by the csv module.  Python 3 has no built-in
# ``reload`` and no ``sys.setdefaultencoding``, so the hack is skipped there
# instead of failing at import time.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 - brings back setdefaultencoding, deleted at startup
    sys.setdefaultencoding('utf8')
|
||
|
||
def get_tree_numbers(rec):
    """Return the tree numbers of a record, following mapped headings.

    A record's own non-empty ``tree_numbers`` win.  Otherwise the tree
    numbers of every record under ``heading_mapped_to`` are collected
    recursively.  Fields holding the string ``'error'`` count as missing, so
    the result may be an empty list.
    """
    own = rec['tree_numbers']
    if own != 'error' and len(own) > 0:
        return own

    mapped = rec['heading_mapped_to']
    if mapped == 'error':
        return []

    collected = []
    for target in mapped.values():
        collected.extend(get_tree_numbers(target))
    return collected
|
||
|
||
def process():
    """Convert ``output/mesh_code_result.json`` into a CSV table.

    Each record becomes one row of ``output/mesh_code_result.csv``:
    phenotype, MeSH heading, one column per tree number (padded with empty
    cells up to the largest count over all records), the notes, and the
    ``;``-joined "Heading Mapped to" names (or ``'error'``).
    """
    with open('output/mesh_code_result.json') as fi:
        result = json.load(fi)

    # Resolve the tree numbers once per record; they are needed both for the
    # column count and for the rows.
    tree_numbers_by_record = [get_tree_numbers(record) for record in result]
    # The leading 0 keeps max() from raising ValueError on an empty result.
    tree_number_max_count = max(
        [0] + [len(tree_numbers) for tree_numbers in tree_numbers_by_record]
    )

    with open('output/mesh_code_result.csv', 'w') as fo:
        writer = csv.writer(fo)
        header_row = ['Phenotype', 'MeSH Heading/Name of Substance']
        header_row.extend(['Tree Number'] * tree_number_max_count)
        header_row.append('Notes')
        header_row.append('Heading Mapped to')
        writer.writerow(header_row)
        for record, tree_numbers in zip(result, tree_numbers_by_record):
            mapped = record['heading_mapped_to']
            row = [record['phenotype'].strip(), record['mesh_heading'].strip()]
            row.extend(tn.strip() for tn in tree_numbers)
            # Pad so that the notes column lines up across all rows.
            row.extend([''] * (tree_number_max_count - len(tree_numbers)))
            row.append(record['note'].strip())
            row.append(';'.join(mapped.keys()) if mapped != 'error' else 'error')
            writer.writerow(row)
Oops, something went wrong.