Format with black (#4645)
* Format with black

* style: Add pyproject.toml to match tools' ruff config

* style: ruff format

* style: pyproject.toml => ruff.toml

Don't want people getting the wrong idea that this is a python project
or something.

* Update cellranger_count.py

* style: Clean up whitespace

---------

Co-authored-by: Simon Pearce <[email protected]>
Co-authored-by: Edmund Miller <[email protected]>
Co-authored-by: Edmund Miller <[email protected]>
4 people authored Jun 14, 2024
1 parent 612271e commit 82024cf
Showing 9 changed files with 170 additions and 96 deletions.
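
All of the visible hunks below come from running ruff's black-compatible formatter: single quotes become double quotes, calls and signatures that exceed the configured line length are split one argument per line with a trailing comma, and top-level definitions get two blank lines between them. The sketch below shows the effect on a hypothetical function (annotate_genes is illustrative only, not a file touched by this commit); the line-length setting is presumably configured in the ruff.toml added by the commit, whose contents are not among the visible hunks.

# Hypothetical snippet, for illustration only; not part of this commit.

# Before formatting:
def annotate_genes(query, species=None, scopes=None, entrezonly=False, ensemblonly=False, go_category=None, go_evidence=None):
    return {'query': query, 'species': species}


# After `ruff format` (black-compatible): double quotes, one argument per line
# with a trailing comma, and two blank lines between top-level definitions.
def annotate_genes(
    query,
    species=None,
    scopes=None,
    entrezonly=False,
    ensemblonly=False,
    go_category=None,
    go_evidence=None,
):
    return {"query": query, "species": species}
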
11 changes: 6 additions & 5 deletions modules/nf-core/cellranger/count/templates/cellranger_count.py
@@ -4,6 +4,7 @@
Copyright (c) Gregor Sturm 2023 - MIT License
"""

from subprocess import run
from pathlib import Path
from textwrap import dedent
@@ -34,11 +35,11 @@ def chunk_iter(seq, size):
# Match R1 in the filename, but only if it is followed by a non-digit or non-character
# match "file_R1.fastq.gz", "file.R1_000.fastq.gz", etc. but
# do not match "SRR12345", "file_INFIXR12", etc
filename_pattern = r'([^a-zA-Z0-9])R1([^a-zA-Z0-9])'
filename_pattern = r"([^a-zA-Z0-9])R1([^a-zA-Z0-9])"

for i, (r1, r2) in enumerate(chunk_iter(fastqs, 2), start=1):
# double escapes are required because nextflow processes this python 'template'
if re.sub(filename_pattern, r'\\1R2\\2', r1.name) != r2.name:
if re.sub(filename_pattern, r"\\1R2\\2", r1.name) != r2.name:
raise AssertionError(
dedent(
f"""\
@@ -55,20 +56,20 @@ def chunk_iter(seq, size):
r1.rename(fastq_all / f"{sample_id}_S1_L{i:03d}_R1_001.fastq.gz")
r2.rename(fastq_all / f"{sample_id}_S1_L{i:03d}_R2_001.fastq.gz")

# fmt: off
run(
# fmt: off
[
"cellranger", "count",
"--id", "${prefix}",
"--fastqs", str(fastq_all),
"--transcriptome", "${reference.name}",
"--localcores", "${task.cpus}",
"--localmem", "${task.memory.toGiga()}",
*shlex.split("""${args}""")
*shlex.split("""${args}"""),
],
# fmt: on
check=True,
)
# fmt: on

# Output version information
version = run(
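
The cellranger_count.py hunk above is not purely mechanical: the # fmt: off / # fmt: on markers move inside the run(...) call so that only the hand-aligned flag/value list is exempt from formatting, while the rest of the call (including the new trailing comma after the shlex.split(...) expansion) stays formatter-managed. A minimal sketch of that marker placement, using a placeholder command instead of the real cellranger invocation:

from subprocess import run

# Placeholder command and flags; the real module passes the cellranger
# arguments here. Only the bracketed list between the markers keeps its
# hand alignment (flag and value on one line); the surrounding call is
# still formatted normally.
run(
    # fmt: off
    [
        "echo",
        "--id", "sample1",
        "--cores", "4",
    ],
    # fmt: on
    check=True,
)
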
12 changes: 2 additions & 10 deletions modules/nf-core/custom/catadditionalfasta/templates/fasta2gtf.py
@@ -114,16 +114,8 @@ def main() -> None:
fasta_to_gtf("$add_fasta", f"{add_name}.gtf", "$biotype")

# Concatenate new fasta to existing fasta, and the GTF we just generated to the GTF
genome_name = (
"$params.genome"
if "$params.genome" != "null"
else os.path.splitext(os.path.basename("$fasta"))[0]
)
output_prefix = (
"$task.ext.prefix"
if "$task.ext.prefix" != "null"
else f"{genome_name}_{add_name}"
)
genome_name = "$params.genome" if "$params.genome" != "null" else os.path.splitext(os.path.basename("$fasta"))[0]
output_prefix = "$task.ext.prefix" if "$task.ext.prefix" != "null" else f"{genome_name}_{add_name}"

os.mkdir("out")
os.system(f"cat $fasta $add_fasta > out/{output_prefix}.fasta")
@@ -3,7 +3,6 @@

"""Provide functions to merge multiple versions.yml files."""


import yaml
import platform
from textwrap import dedent
24 changes: 17 additions & 7 deletions modules/nf-core/custom/tx2gene/templates/tx2gene.py
@@ -17,6 +17,7 @@
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


def format_yaml_like(data: dict, indent: int = 0) -> str:
"""Formats a dictionary to a YAML-like string.
@@ -36,6 +37,7 @@ def format_yaml_like(data: dict, indent: int = 0) -> str:
yaml_str += f"{spaces}{key}: {value}\\n"
return yaml_str


def read_top_transcripts(quant_dir: str, file_pattern: str) -> Set[str]:
"""
Read the top 100 transcripts from the quantification file.
@@ -123,7 +125,12 @@ def parse_attributes(attributes_text: str) -> Dict[str, str]:


def map_transcripts_to_gene(
quant_type: str, gtf_file: str, quant_dir: str, gene_id: str, extra_id_field: str, output_file: str
quant_type: str,
gtf_file: str,
quant_dir: str,
gene_id: str,
extra_id_field: str,
output_file: str,
) -> bool:
"""
Map transcripts to gene names and write the output to a file.
@@ -156,7 +163,10 @@ def map_transcripts_to_gene(
attr_dict = parse_attributes(cols[8])
if gene_id in attr_dict and transcript_attribute in attr_dict:
# Create a unique identifier for the transcript-gene combination
transcript_gene_pair = (attr_dict[transcript_attribute], attr_dict[gene_id])
transcript_gene_pair = (
attr_dict[transcript_attribute],
attr_dict[gene_id],
)

# Check if the combination has already been seen
if transcript_gene_pair not in seen:
@@ -170,14 +180,14 @@

# Main function to parse arguments and call the mapping function
if __name__ == "__main__":
if '${task.ext.prefix}' != "null":
if "${task.ext.prefix}" != "null":
prefix = "${task.ext.prefix}."
elif '$meta.id' != "null":
prefix = '${meta.id}.'
elif "$meta.id" != "null":
prefix = "${meta.id}."
else:
prefix = ''
prefix = ""

if not map_transcripts_to_gene('$quant_type', '$gtf', 'quants', '$id', '$extra', f"{prefix}tx2gene.tsv"):
if not map_transcripts_to_gene("$quant_type", "$gtf", "quants", "$id", "$extra", f"{prefix}tx2gene.tsv"):
logger.error("Failed to map transcripts to genes.")

# Write the versions
166 changes: 103 additions & 63 deletions modules/nf-core/mygene/templates/mygene.py
@@ -23,13 +23,10 @@ class Arguments:
"""
Parses the argments, including the ones coming from $task.ext.args.
"""

def __init__(self) -> None:
self.input = "$gene_list"
self.prefix = (
"$task.ext.prefix"
if "$task.ext.prefix" != "null"
else "$meta.id"
)
self.prefix = "$task.ext.prefix" if "$task.ext.prefix" != "null" else "$meta.id"
self.output_gmt = self.prefix + ".gmt"
self.output_tsv = self.prefix + ".tsv"
self.parse_ext_args("$task.ext.args")
@@ -46,17 +43,49 @@ def parse_ext_args(self, args_string: str) -> None:
args_list = shlex.split(args_string) # Split the string into a list of arguments
parser = argparse.ArgumentParser()
# input parameters
parser.add_argument('--columname', default='gene_id', help='Name of the column where the gene ids are stored in the input file. Default: gene_id')
parser.add_argument(
"--columname",
default="gene_id",
help="Name of the column where the gene ids are stored in the input file. Default: gene_id",
)
# filtering parameters
parser.add_argument('--species', default=None, help="Comma separated of common name of the species or taxon ids")
parser.add_argument('--go_category', default=None, help="Comma separated list of GO categories to keep. Default: all")
parser.add_argument('--go_evidence', default=None, help="Comma separated list of GO evidence codes to keep. Default: all")
parser.add_argument(
"--species",
default=None,
help="Comma separated of common name of the species or taxon ids",
)
parser.add_argument(
"--go_category",
default=None,
help="Comma separated list of GO categories to keep. Default: all",
)
parser.add_argument(
"--go_evidence",
default=None,
help="Comma separated list of GO evidence codes to keep. Default: all",
)
# additional parameters for querymany
parser.add_argument('--scopes', default=None, help="Comma separated list of scopes to search for.")
parser.add_argument('--entrezonly', default=False, help="When true, the query returns only the hits with valid Entrez gene ids. Default: false.")
parser.add_argument('--ensemblonly', default=False, help="When true, the query returns only the hits with valid Ensembl gene ids. Default: False")
parser.add_argument(
"--scopes",
default=None,
help="Comma separated list of scopes to search for.",
)
parser.add_argument(
"--entrezonly",
default=False,
help="When true, the query returns only the hits with valid Entrez gene ids. Default: false.",
)
parser.add_argument(
"--ensemblonly",
default=False,
help="When true, the query returns only the hits with valid Ensembl gene ids. Default: False",
)
# output parameters
parser.add_argument('--generate_tsv', default=False, help="Also generate a tsv file with the gene based information. Default: False")
parser.add_argument(
"--generate_tsv",
default=False,
help="Also generate a tsv file with the gene based information. Default: False",
)
args = parser.parse_args(args_list)

# Convert "null" values to default values
@@ -135,7 +164,17 @@ class MyGene:
of the query gene. Then, it queries for the annotations, and parses the go
terms all together with all the other information.
"""
def __init__(self, query: list, species: str, scopes: str, entrezonly: bool, ensemblonly: bool, go_category: str = None, go_evidence: str = None) -> None:

def __init__(
self,
query: list,
species: str,
scopes: str,
entrezonly: bool,
ensemblonly: bool,
go_category: str = None,
go_evidence: str = None,
) -> None:
self.query = query
self.fields = "go,symbol,name,taxid"
self.species = species
@@ -158,9 +197,9 @@ def query2idmap(self) -> dict:
species=self.species,
entrezonly=self.entrezonly,
ensemblonly=self.ensemblonly,
returnall=True
returnall=True,
)
return {dic['_id']: dic['query'] for dic in q['out'] if '_id' in dic}
return {dic["_id"]: dic["query"] for dic in q["out"] if "_id" in dic}

def id2info(self) -> list:
"""
@@ -177,22 +216,23 @@ def parse_go_based_info(self) -> dict:
"""
info = {}
for dic in self.id2info():

if 'go' not in dic:
if "go" not in dic:
continue
if self.go_category:
dic['go'] = {category: dic['go'][category] for category in self.go_category.split(",") if category in dic['go']}
for category, go_list in dic['go'].items():
dic["go"] = {
category: dic["go"][category] for category in self.go_category.split(",") if category in dic["go"]
}
for category, go_list in dic["go"].items():
if not isinstance(go_list, list):
go_list = [go_list]
for go in go_list:
if (self.go_evidence) and (go['evidence'] not in self.go_evidence.split(",")):
if (self.go_evidence) and (go["evidence"] not in self.go_evidence.split(",")):
continue

if go['id'] not in info:
info[go['id']] = [go['term'], self.idmap[dic['_id']]]
if go["id"] not in info:
info[go["id"]] = [go["term"], self.idmap[dic["_id"]]]
else:
info[go['id']].append(self.idmap[dic['_id']])
info[go["id"]].append(self.idmap[dic["_id"]])
return info

def parse_gene_based_info(self) -> dict:
@@ -206,30 +246,31 @@ def parse_gene_based_info(self) -> dict:
"""
info = {}
for dic in self.id2info():

if 'go' not in dic:
if "go" not in dic:
continue
if self.go_category:
dic['go'] = {category: dic['go'][category] for category in self.go_category.split(",") if category in dic['go']}
for category, go_list in dic['go'].items():
dic["go"] = {
category: dic["go"][category] for category in self.go_category.split(",") if category in dic["go"]
}
for category, go_list in dic["go"].items():
if not isinstance(go_list, list):
go_list = [go_list]
for go in go_list:
if (self.go_evidence) and (go['evidence'] not in self.go_evidence.split(",")):
if (self.go_evidence) and (go["evidence"] not in self.go_evidence.split(",")):
continue

current_info = {
'query': self.idmap[dic['_id']],
'mygene_id': dic['_id'],
'go_id': go['id'],
'go_term': go['term'],
'go_evidence': go['evidence'],
'go_category': category,
'symbol': dic['symbol'],
'name': dic['name'],
'taxid': dic['taxid']
"query": self.idmap[dic["_id"]],
"mygene_id": dic["_id"],
"go_id": go["id"],
"go_term": go["term"],
"go_evidence": go["evidence"],
"go_category": category,
"symbol": dic["symbol"],
"name": dic["name"],
"taxid": dic["taxid"],
}
info[self.idmap[dic['_id']]] = current_info
info[self.idmap[dic["_id"]]] = current_info
return info

def parse_and_save_to_gmt(self, filename: str) -> list:
Expand All @@ -239,7 +280,7 @@ def parse_and_save_to_gmt(self, filename: str) -> list:
"""
info = self.parse_go_based_info()
info = dict(sorted(info.items(), key=lambda x: x[0]))
with open(filename, 'w') as f:
with open(filename, "w") as f:
for go_id, go_list in info.items():
tmp = sorted(go_list[1:])
f.write(go_id + "\\t" + go_list[0] + "\\t" + "\\t".join(tmp) + "\\n")
@@ -251,7 +292,7 @@ def parse_and_save_to_tsv(self, filename: str) -> None:
The final tsv output will be sorted following the input query gene list order.
"""
info = self.parse_gene_based_info()
with open(filename, 'w') as f:
with open(filename, "w") as f:
f.write("\\t".join(info[self.query[0]].keys()) + "\\n")
for gene in self.query: # sorted by query gene list
if gene in info:
@@ -260,23 +301,22 @@ def parse_and_save_to_tsv(self, filename: str) -> None:


def load_list(filename: str, columname: str) -> list:
"""
It loads the list of gene ids from a file.
The columname is the name of the column where the gene ids are stored.
"""
if filename.split('.')[-1] == 'tsv':
sep = "\\t"
elif filename.split('.')[-1] == 'csv':
sep = ","
else:
raise ValueError("The input file extension should be either tsv or csv.")
with open(filename, 'r') as f:
idx = f.readline().strip().split(sep).index(columname)
return [line.strip().split(sep)[idx] for line in f]
"""
It loads the list of gene ids from a file.
The columname is the name of the column where the gene ids are stored.
"""
if filename.split(".")[-1] == "tsv":
sep = "\\t"
elif filename.split(".")[-1] == "csv":
sep = ","
else:
raise ValueError("The input file extension should be either tsv or csv.")
with open(filename, "r") as f:
idx = f.readline().strip().split(sep).index(columname)
return [line.strip().split(sep)[idx] for line in f]


if __name__ == "__main__":

# parse and print arguments
args = Arguments()
args.print_args()
@@ -286,14 +326,14 @@ def load_list(filename: str, columname: str) -> list:

# run mygene api
mg = MyGene(
gene_list,
species=args.species,
scopes=args.scopes,
entrezonly=args.entrezonly,
ensemblonly=args.ensemblonly,
go_category=args.go_category,
go_evidence=args.go_evidence
)
gene_list,
species=args.species,
scopes=args.scopes,
entrezonly=args.entrezonly,
ensemblonly=args.ensemblonly,
go_category=args.go_category,
go_evidence=args.go_evidence,
)

# parse annotations and save output files
mg.parse_and_save_to_gmt(args.output_gmt)