Skip to content

Commit

Permalink
Init repo
Browse files Browse the repository at this point in the history
  • Loading branch information
semal committed Nov 6, 2017
0 parents commit c1228d7
Show file tree
Hide file tree
Showing 30 changed files with 1,163 additions and 0 deletions.
Empty file added Controller/__init__.py
Empty file.
88 changes: 88 additions & 0 deletions Controller/get_mesh_code.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# coding=utf-8
import multiprocessing
from time import sleep
import requests
import bs4
import sys
from w3lib.html import remove_tags
import json


def get_mesh(phenotype):
    """
    Fetch the MeSH record for a phenotype from the 2016 NLM MeSH Browser.

    Simulates the form submission of the
    https://www.nlm.nih.gov/mesh/2016/mesh_browser/MBrowser.html page and
    scrapes the ``<tr>`` rows of the returned HTML.

    :param phenotype: either a search term (submitted as an exact-term
        search) or a full ``https://`` URL, which is fetched directly. The
        URL form is what the function passes to itself when it follows a
        "Heading Mapped to" link.
    :return: dict with the keys ``phenotype``, ``mesh_heading``,
        ``tree_numbers`` (list), ``note`` (all note cells joined with ';'),
        ``unique_id`` and ``heading_mapped_to`` (dict mapping the stripped
        cell text to the nested record of the linked page). If any step
        fails, every field except ``phenotype`` is the string ``'error'``.
    """
    # Without a timeout a single stalled connection blocks its worker forever.
    timeout = 60
    try:
        if phenotype.startswith('https://'):
            r = requests.get(phenotype, timeout=timeout)
        else:
            url = 'https://www.nlm.nih.gov/cgi/mesh/2016/MB_cgi'
            exact_submit = {
                'term': '%s' % phenotype,
                'exact': 'Find Exact Term'
            }
            r = requests.post(url, data=exact_submit, timeout=timeout)
        soup = bs4.BeautifulSoup(r.text, 'html.parser')
        tags = soup.find_all("tr")
        mesh_heading = ''
        tree_numbers = []
        notes = []
        unique_id = ''
        heading_mapped_to = {}
        for tag in tags:
            # A missing <th>/<td> is stringified as 'None' and simply matches
            # none of the labels below.
            th = remove_tags(str(tag.find('th')))
            td = remove_tags(str(tag.find('td')))
            label = th.strip()
            if label in ('MeSH Heading', 'Name of Substance'):
                mesh_heading = td
            elif label == 'Tree Number':
                tree_numbers.append(td)
            elif 'Note' in th:
                notes.append(td)
            elif label == 'Heading Mapped to':
                # Follow the link in the row and store the linked page's
                # record under the cell text.
                heading_mapped_to[td.strip()] = get_mesh(
                    'https://www.nlm.nih.gov%s' % tag.find('a')['href']
                )
            elif label == 'Unique ID':
                unique_id = td
        record = {
            'phenotype': phenotype,
            'mesh_heading': mesh_heading,
            'tree_numbers': tree_numbers,
            'note': ';'.join(notes),
            'unique_id': unique_id,
            'heading_mapped_to': heading_mapped_to
        }
        return record
    except Exception:
        # Only ordinary failures are converted into an 'error' record;
        # KeyboardInterrupt/SystemExit propagate so the run can be aborted.
        # NOTE(review): the pause is presumably a crude back-off for the
        # remote server -- confirm before changing it.
        sleep(10)
        return {
            'phenotype': phenotype,
            'mesh_heading': 'error',
            'tree_numbers': 'error',
            'note': 'error',
            'unique_id': 'error',
            'heading_mapped_to': 'error',
        }


def read_search_phenotypes(fp):
    """
    Read the phenotype search terms from a text file.

    :param fp: path of a text file holding one phenotype per line.
    :return: list of the lines with surrounding whitespace removed, in file
        order; lines that are empty after stripping are left out.
    """
    with open(fp) as handle:
        stripped_lines = (raw.strip() for raw in handle)
        return [term for term in stripped_lines if term != '']


def batch(fp, processes=100):
    """
    Look up every phenotype listed in a file and dump the records as JSON.

    :param fp: path of the phenotype list, read with
        ``read_search_phenotypes``.
    :param processes: number of worker processes used for the lookups
        (defaults to 100, the previously hard-coded value).
    :return: None. The list of records returned by ``get_mesh`` is written
        to ``output/mesh_code_result.json``.
    """
    phenotypes = read_search_phenotypes(fp)
    pool = multiprocessing.Pool(processes)
    try:
        result = pool.map(get_mesh, phenotypes)
    finally:
        # map() has either returned or raised by now, so the workers are no
        # longer needed; release them instead of leaving them to linger.
        pool.terminate()
        pool.join()
    # The context manager guarantees the JSON is flushed and the file closed.
    with open('output/mesh_code_result.json', 'w') as fo:
        json.dump(result, fo)


# Script entry point: the first command-line argument is handed to batch().
if __name__ == '__main__':
    phenotype_file = sys.argv[1]
    batch(phenotype_file)
51 changes: 51 additions & 0 deletions Controller/get_omim_tables.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import os
import multiprocessing


def run(gene):
    """
    Download the OMIM gene-map search table for one gene with aria2c.

    Nothing is downloaded when ``<gene>.tsv`` is already listed in the
    ``result`` directory.

    :param gene: search term substituted into the OMIM search URL; it is
        also used as the base name of the output file.
    :return: None. The table is saved as ``result/<gene>.tsv``.
    """
    import subprocess

    if '%s.tsv' % gene in os.listdir('result'):
        return

    # NOTE(review): a session cookie is hard-coded in this header; it looks
    # account-specific and may have expired -- consider moving it to config.
    cookie_header = (
        '--header=Cookie:donation-popup=true; '
        'sessionid=o5tznyomgtr5x70wmej8hlyg4sck0k8z; _ga=GA1.2.801700580.1476441682'
    )
    url = 'https://omim.org/search/?index=geneMap&search=%s&start=1&limit=10000&format=tab' % gene
    # An argv list (no shell) means characters in `gene` can never be
    # interpreted as shell syntax.
    subprocess.call(['aria2c', cookie_header, url, '-o', 'result/%s.tsv' % gene])


def get_genes(path='/home/jjiang/omim.txt'):
    """
    Read the list of gene names to process.

    :param path: text file with one gene name per line; defaults to the
        location that was previously hard-coded.
    :return: list of gene names with surrounding whitespace removed. Blank
        lines are skipped, because an empty name would be treated as a gene
        (an empty string is a substring of every value it is matched
        against).
    """
    genes = []
    with open(path) as fi:
        for line in fi:
            gene = line.strip()
            if gene:
                genes.append(gene)
    return genes


def multi_run(processes=100):
    """
    Download the OMIM table of every gene from ``get_genes`` in parallel.

    :param processes: number of worker processes (defaults to 100, the
        previously hard-coded value).
    :return: None; each worker calls ``run`` for one gene.
    """
    genes = get_genes()
    pool = multiprocessing.Pool(processes)
    try:
        pool.map(run, genes)
    finally:
        # Release the workers once map() has returned or raised.
        pool.terminate()
        pool.join()


def get_all_records():
    """
    Yield the matching rows of every downloaded gene table.

    For each gene from ``get_genes`` the file ``result/<gene>.tsv`` is read
    line by line. Lines starting with 'Gene Map Search' and lines with fewer
    than three tab-separated columns are skipped; a line is yielded
    (unchanged, including its newline) when its third column contains the
    gene name as a substring.
    """
    for symbol in get_genes():
        with open('result/%s.tsv' % symbol) as table:
            for row in table:
                if row.startswith('Gene Map Search'):
                    continue
                columns = row.split('\t')
                # Rows without a third column cannot match.
                if len(columns) > 2 and symbol in columns[2]:
                    yield row


def generate_all():
    """
    Write every row produced by ``get_all_records`` to ``all_record.tsv``.

    The output file is created or truncated in the current directory; rows
    are written exactly as yielded.
    """
    with open('all_record.tsv', 'w') as sink:
        sink.writelines(get_all_records())


# Script entry point: running the module directly only calls generate_all();
# multi_run() is not invoked from here.
if __name__ == '__main__':
    generate_all()
53 changes: 53 additions & 0 deletions Controller/get_transcripts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import os
import urllib2

import bs4


def get_ENST(enst_number):
    """
    Download the FASTA export of an Ensembl transcript with wget.

    :param enst_number: transcript accession substituted into the ``t=``
        parameter of the Ensembl export URL.
    :return: the downloaded text as a string.
    """
    import subprocess
    import tempfile

    url = ('http://www.ensembl.org/Homo_sapiens/Export/Output/Transcript?db=core;flank3_display=0;'
           'flank5_display=0;output=fasta;t=%s;param=cdna;param=coding;param=peptide;param=utr5;param=utr3;'
           'param=exon;param=intron;genomic=unmasked;_format=Text' % enst_number)
    # A unique scratch file avoids collisions between concurrent calls for
    # the same accession and keeps the working directory clean.
    fd, path = tempfile.mkstemp(suffix='.fa')
    os.close(fd)
    try:
        # An argv list (no shell) means the accession is never parsed as
        # shell syntax.
        subprocess.call(['wget', url, '-O', path])
        with open(path) as fi:
            return fi.read()
    finally:
        # Remove the scratch file even when the download or read fails.
        os.remove(path)


def get_uuid(nm_number):
    """
    Resolve an accession to its NCBI nucleotide id via the eutils esearch API.

    :param nm_number: search term sent as ``term`` to esearch with
        ``db=nucleotide``.
    :return: the text of the single ``<id>`` element in the response, or
        ``None`` when the response contains zero or several ids.
    """
    from contextlib import closing

    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=nucleotide&term=%s' % nm_number
    # closing() releases the connection as soon as the body has been read.
    with closing(urllib2.urlopen(url)) as response:
        payload = response.read()
    soup = bs4.BeautifulSoup(payload, 'html.parser')
    id_list = [str(s).replace('<id>', '').replace('</id>', '') for s in soup('id')]
    if len(id_list) == 1:
        return id_list[0]
    return None


def get_NM(nm_number):
    """
    Download the FASTA record of an NCBI accession with wget.

    :param nm_number: accession that ``get_uuid`` resolves to an NCBI id.
    :return: the downloaded text as a string, or ``''`` when ``get_uuid``
        returns ``None``.
    """
    import subprocess
    import tempfile

    uuid = get_uuid(nm_number)
    if uuid is None:
        return ''
    url = ('https://www.ncbi.nlm.nih.gov//sviewer/viewer.cgi?tool=portal&save=file&log$=seqview&db=nuccore&'
           'report=fasta&sort=&id=%s&from=begin&to=end&maxplex=1' % uuid)
    # A unique scratch file avoids collisions between concurrent calls for
    # the same accession and keeps the working directory clean.
    fd, path = tempfile.mkstemp(suffix='.fa')
    os.close(fd)
    try:
        # An argv list (no shell) passes the URL verbatim, including its
        # '$' and '&' characters.
        subprocess.call(['wget', url, '-O', path])
        with open(path) as fi:
            return fi.read()
    finally:
        # Remove the scratch file even when the download or read fails.
        os.remove(path)


def get_transcripts(accession_list):
    """
    Fetch the sequences for a list of transcript accessions.

    Accessions starting with 'ENST' are fetched with ``get_ENST``, those
    starting with 'NM' with ``get_NM``; anything else is ignored.

    :param accession_list: iterable of accession strings.
    :return: the fetched texts joined with newlines, in input order.
    """
    sequences = []
    for acc in accession_list:
        if acc.startswith('ENST'):
            fetch = get_ENST
        elif acc.startswith('NM'):
            fetch = get_NM
        else:
            # Unsupported accession prefix: contributes nothing.
            continue
        sequences.append(fetch(acc))
    return '\n'.join(sequences)


# Manual check when run as a script: print the id that esearch returns for
# one sample accession.
if __name__ == '__main__':
    print(get_uuid('NM_002944'))
47 changes: 47 additions & 0 deletions Controller/process_mesh_code_result.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import json
import csv
import sys

# Python 2-only workaround: reload(sys) makes sys.setdefaultencoding reachable
# again, and the call then sets the interpreter-wide default string encoding
# to UTF-8.
# NOTE(review): presumably needed so non-ASCII text loaded from the JSON can
# be written by csv.writer without UnicodeEncodeError -- confirm before
# removing; neither call works on Python 3.
reload(sys)
sys.setdefaultencoding('utf8')


def get_tree_numbers(rec):
    """
    Return the tree numbers that apply to a MeSH record.

    :param rec: record dict with the keys ``tree_numbers`` and
        ``heading_mapped_to``; either value may be the string ``'error'``.
    :return: the record's own ``tree_numbers`` when that list is usable and
        non-empty; otherwise the concatenated tree numbers of all records
        under ``heading_mapped_to`` (resolved recursively); otherwise an
        empty list.
    """
    own = rec['tree_numbers']
    if own != 'error' and len(own) > 0:
        return own

    mapped = rec['heading_mapped_to']
    if mapped == 'error':
        return []

    # Fall back to the headings this record is mapped to.
    collected = []
    for target in mapped.values():
        collected.extend(get_tree_numbers(target))
    return collected


def process():
    """
    Convert ``output/mesh_code_result.json`` into a CSV table.

    Reads the list of records from the JSON file and writes
    ``output/mesh_code_result.csv`` with one row per record: phenotype, MeSH
    heading, one 'Tree Number' column per tree number (padded with empty
    cells up to the largest count among all records), notes, and the
    ';'-joined keys of ``heading_mapped_to`` (or 'error').

    :return: None.
    """
    with open('output/mesh_code_result.json') as fi:
        result = json.load(fi)

    # Resolve the tree numbers once per record; they are needed both for the
    # column count and for the rows themselves.
    tree_numbers_per_record = [get_tree_numbers(record) for record in result]
    # "or [0]" keeps max() from raising ValueError when there are no records,
    # so an empty input still yields a header-only CSV.
    tree_number_max_count = max([len(tns) for tns in tree_numbers_per_record] or [0])

    with open('output/mesh_code_result.csv', 'w') as fo:
        writer = csv.writer(fo)
        header_row = ['Phenotype', 'MeSH Heading/Name of Substance']
        header_row.extend(['Tree Number'] * tree_number_max_count)
        header_row.append('Notes')
        header_row.append('Heading Mapped to')
        writer.writerow(header_row)
        for record, tree_numbers in zip(result, tree_numbers_per_record):
            row = [record['phenotype'].strip(), record['mesh_heading'].strip()]
            row.extend(tn.strip() for tn in tree_numbers)
            # Pad so the trailing columns line up with the header.
            row.extend([''] * (tree_number_max_count - len(tree_numbers)))
            row.append(record['note'].strip())
            mapped = record['heading_mapped_to']
            row.append(';'.join(mapped.keys()) if mapped != 'error' else 'error')
            writer.writerow(row)
Loading

0 comments on commit c1228d7

Please sign in to comment.