-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathfetch_mRNA_from_gtf.py
34 lines (28 loc) · 1.12 KB
/
fetch_mRNA_from_gtf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# -*- coding: utf-8 -*-
"""
Created on 2021-10-27
@author: Yudongcai
@Email: [email protected]
"""
import typer
import numpy as np
import pandas as pd
import pyranges as pr
from pyfaidx import Fasta
def main(fastafile: str = typer.Option(..., help="fasta file"),
         gtffile: str = typer.Option(..., help="gtf file"),
         outfile: str = typer.Option(..., help="output mRNA fasta file")):
    """Extract one merged mRNA sequence per gene from a GTF annotation.

    Exon intervals belonging to the same gene (across all of its transcripts)
    are merged, then concatenated in ascending start order and written as a
    single FASTA record named after the gene_id.
    """
    gr = pr.read_gtf(gtffile)
    # Merge intervals within each (Feature, gene_id) pair so exons shared
    # between transcripts of one gene are emitted only once.
    # NOTE(review): strand is ignored here (strand=False) and the slices below
    # are not reverse-complemented, so minus-strand genes appear to be written
    # in forward-strand orientation - confirm this is intended.
    df = gr.merge(by=["Feature", "gene_id"], strand=False).as_df()
    # Open the genome by the path given on the command line (the parameter is
    # `fastafile`; the previous code referenced an undefined name `fasta`).
    seq = Fasta(fastafile)
    exons = df.loc[df['Feature'] == 'exon', :]
    with open(outfile, 'w') as f:
        for gene, gdf in exons.groupby('gene_id'):
            ordered = gdf.sort_values('Start')[['Chromosome', 'Start', 'End']]
            # Start/End are used directly as a Python slice on the chromosome
            # record; assumes the 0-based half-open coordinates produced by
            # pr.read_gtf - TODO confirm for the pyranges version in use.
            parts = [seq[chrom][start:end].seq
                     for chrom, start, end in ordered.values]
            f.write(f'>{gene}\n')
            f.write(''.join(parts) + '\n')
# Run the command-line interface only when executed as a script; typer builds
# the option parser from the signature of `main`.
if __name__ == "__main__":
    typer.run(main)