-
Notifications
You must be signed in to change notification settings - Fork 0
/
Genemap2ToPheno.py
146 lines (116 loc) · 4.76 KB
/
Genemap2ToPheno.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# This is a simple script to parse the genemap2.txt file that
# can be downloaded from https://omim.org/
#
# The file can be downloaded from https://omim.org/downloads
# (registration required).
#
# Imports
import sys
import re
# Column names of the tab-separated output, printed once as the first row.
_HEADER = ["approvedGeneSymbol", "geneSymbols", "mimNumber",
           "phenotypeText", "phenotypeMimNumber", "inheritances"]

# 0-based positions of the genemap2.txt columns this script reads.
_MIM_NUMBER_COLUMN = 5
_GENE_SYMBOLS_COLUMN = 6
_APPROVED_GENE_SYMBOL_COLUMN = 8
_PHENOTYPES_COLUMN = 12

# Substrings replaced when abbreviating an inheritance term,
# e.g. "Autosomal recessive" -> "AR", "X-linked dominant" -> "XLD".
_INHERITANCE_ABBREVIATIONS = {
    'Autosomal': 'A',
    ' recessive': 'R',
    ' dominant': 'D',
    'Y-linked': 'YL',
    'X-linked': 'XL',
}
_INHERITANCE_RE = re.compile(
    '({})'.format('|'.join(map(re.escape, _INHERITANCE_ABBREVIATIONS))))

# Long phenotype: "<text>, <6-digit MIM number> (<mapping key>)", optionally
# followed by ", <inheritance>[, <inheritance>...]" (captured in group 5).
_LONG_PHENOTYPE_RE = re.compile(r'^(.*),\s(\d{6})\s\((\d)\)(|, (.*))$')
# Short phenotype: "<text>(<mapping key>)" without a MIM number, with the same
# optional inheritance suffix (captured in group 4, without the leading ", ").
_SHORT_PHENOTYPE_RE = re.compile(r'^(.*)\((\d)\)(|, (.*))$')


def my_inherintance(inheritance):
    """Return *inheritance* stripped and with known terms abbreviated.

    Only the substrings listed in ``_INHERITANCE_ABBREVIATIONS`` are
    replaced; any other text is returned unchanged.
    """
    return _INHERITANCE_RE.sub(
        lambda m: _INHERITANCE_ABBREVIATIONS[m.group()], inheritance.strip())


def _abbreviate_inheritances(inheritances):
    """Abbreviate a comma-separated inheritance list; '' if it is missing."""
    if not inheritances:
        return ""
    return ",".join(my_inherintance(term) for term in inheritances.split(','))


def _parse_phenotype(phenotype):
    """Split one phenotype entry into (text, MIM number, inheritances).

    Missing parts are returned as empty strings.  An entry that matches
    neither phenotype pattern is returned whole as the text, so that every
    entry always contributes exactly three values.
    """
    matcher = _LONG_PHENOTYPE_RE.match(phenotype)
    if matcher:
        # The MIM number is kept whether or not an inheritance is present.
        return (matcher.group(1).strip(),
                matcher.group(2),
                _abbreviate_inheritances(matcher.group(5)))
    matcher = _SHORT_PHENOTYPE_RE.match(phenotype)
    if matcher:
        # group(4) is the inheritance list itself; group(3) would also
        # include the ", " separator and yield a leading comma.
        return (matcher.group(1).strip(),
                "",
                _abbreviate_inheritances(matcher.group(4)))
    return (phenotype, "", "")


def genemap2_to_pheno(lines):
    """Yield the output header, then one tab-separated row per gene line.

    *lines* is any iterable of genemap2.txt lines.  Lines starting with '#',
    lines too short to contain the phenotype column, and lines whose
    phenotype column is empty are skipped.

    Each row holds the approved gene symbol, the gene symbols, the gene MIM
    number, and three phenotype columns (text, MIM number, inheritances).
    When a gene has several ';'-separated phenotypes, the non-empty values
    of each phenotype column are joined with '/'.
    """
    yield "\t".join(_HEADER)
    for line in lines:
        # Skip comments
        if line.startswith('#'):
            continue
        fields = line.rstrip('\r\n').split('\t')
        # Blank or truncated lines have no phenotype column to report.
        if len(fields) <= _PHENOTYPES_COLUMN:
            continue
        phenotype_string = fields[_PHENOTYPES_COLUMN]
        # Skip empty phenotypes
        if not phenotype_string:
            continue
        entries = [entry.strip() for entry in phenotype_string.split(';')]
        parsed = [_parse_phenotype(entry) for entry in entries if entry]
        if not parsed:
            continue
        # Group the texts, then the MIM numbers, then the inheritances;
        # empty values are left out so no column gets stray '/' separators.
        columns = ["/".join(value for value in column if value)
                   for column in zip(*parsed)]
        # Export selected fields
        yield "\t".join([fields[_APPROVED_GENE_SYMBOL_COLUMN],
                         fields[_GENE_SYMBOLS_COLUMN],
                         fields[_MIM_NUMBER_COLUMN]] + columns)


def main():
    """Convert genemap2.txt read from stdin and print the rows to stdout."""
    for row in genemap2_to_pheno(sys.stdin):
        print(row)


if __name__ == "__main__":
    main()