Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: presets for cubi-tk sodar ingest-fastq (#232) #235

Merged
112 changes: 84 additions & 28 deletions cubi_tk/sodar/ingest_fastq.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,17 +30,41 @@
formatter = logzero.LogFormatter(fmt="%(message)s")
output_logger = logzero.setup_logger(formatter=formatter)

#: Default value for --src-regex.
#: Pieces, in order: optional directory prefix, lazily matched 'sample',
#: optional "_S<digits>", optional "_L<digits>" ('lane'), optional
#: "_R<digits>" ('mate'), optional "_<digits>" ('batch'), then ".fastq.gz"
#: or ".fq.gz".
DEFAULT_SRC_REGEX = (
r"(.*/)?(?P<sample>.+?)"
r"(?:_S[0-9]+)?"
r"(?:_(?P<lane>L[0-9]+?))?"
r"(?:_(?P<mate>R[0-9]+?))?"
r"(?:_(?P<batch>[0-9]+?))?"
r"\.f(?:ast)?q\.gz"
)

#: Default value for --remote-dir-pattern (a str.format template).
DEFAULT_DEST_PATTERN = r"{collection_name}/raw_data/{date}/{filename}"
#: Source-file regular expressions, keyed by the value of --preset.
#: The selected entry is compiled and applied with re.match() to each
#: candidate path whenever --src-regex is not given.
#: NOTE(review): re.match() anchors only at the start of the path and none of
#: the patterns end in "$", so trailing characters after the file extension
#: do not prevent a match - confirm this is intended.
SRC_REGEX_PRESETS = {
# Generic fastq naming: optional directory prefix, lazily matched 'sample',
# optional "_S<digits>", optional 'lane' ("_L<digits>"), optional 'mate'
# ("_R<digits>"), optional 'batch' ("_<digits>"), then ".fastq.gz"/".fq.gz".
"default": (
r"(.*/)?(?P<sample>.+?)"
r"(?:_S[0-9]+)?"
r"(?:_(?P<lane>L[0-9]+?))?"
r"(?:_(?P<mate>R[0-9]+?))?"
r"(?:_(?P<batch>[0-9]+?))?"
r"\.f(?:ast)?q\.gz"
),
# Layout <flowcell>/<lane>/<file>: 'flowcell' is 9-10 characters of A-Z/0-9,
# 'lane' is "L" plus three digits. The file name is an optional 'project'
# prefix (one uppercase letter, digits, optional "_"), then 'sample', then
# "_S<digits>_L<3 digits>_R<digit>_<3 digits>.fastq.gz".
"digestiflow": (
r"(.*/)?(?P<flowcell>[A-Z0-9]{9,10}?)/"
r"(?P<lane>L[0-9]{3}?)/"
r"(?:(?P<project>[A-Z][0-9]+_?))?"
r"(?P<sample>.+?)_"
r"S[0-9]+_L[0-9]{3}_R[0-9]_[0-9]{3}"
r"\.fastq\.gz"
),
# Layout <8 digits>_<sample>/<RunID>/[<subfolder>/]<file>, where the file
# name contains ".bam", ".pod5", ".txt" or ".json".
"ONT": (
r"(.*/)?"
r"[0-9]{8}_" # Date
# Sample could be <ProjectID>_<LibID>_<SampleID>, but this is not given and may change between projects
r"(?P<sample>[a-zA-Z0-9_-]+?)/"
# RunID is <date>_<time>_<position>_<flowcellID>_<hash>
# Flowcells can be re-used, so taking the whole thing for uniqueness is best
r"(?P<RunID>[0-9]{8}_[0-9]+_[A-Z0-9]+_[A-Z0-9]+_[0-9a-z]+?)/"
# The optional 'subfolder' group captures its trailing "/" as part of the value.
r"(?:(?P<subfolder>[a-z0-9_]+/))?"
r".+\.(bam|pod5|txt|json)"
),
}

#: Remote path templates (str.format syntax), keyed by the value of --preset.
#: The selected entry is used whenever --remote-dir-pattern is not given.
#: The keys of this dict are also the allowed choices of --preset, so they
#: have to stay in sync with the keys of SRC_REGEX_PRESETS.
#: Placeholders other than 'collection_name', 'date' and 'filename' refer to
#: named capture groups of the source regex with the same preset name.
DEST_PATTERN_PRESETS = {
"default": r"{collection_name}/raw_data/{date}/{filename}",
"digestiflow": r"{collection_name}/raw_data/{flowcell}/{filename}",
# No "/" between {subfolder} and {filename}: the 'subfolder' capture group of
# the "ONT" source regex already includes its trailing slash.
"ONT": r"{collection_name}/raw_data/{RunID}/{subfolder}{filename}",
}

#: Default number of parallel transfers.
DEFAULT_NUM_TRANSFERS = 8
Expand Down Expand Up @@ -82,7 +106,11 @@ class SodarIngestFastq(SnappyItransferCommandBase):

def __init__(self, args):
super().__init__(args)
self.dest_pattern_fields = set(re.findall(r"(?<={).+?(?=})", self.args.remote_dir_pattern))
if self.args.remote_dir_pattern:
self.remote_dir_pattern = self.args.remote_dir_pattern
else:
self.remote_dir_pattern = DEST_PATTERN_PRESETS[self.args.preset]
self.dest_pattern_fields = set(re.findall(r"(?<={).+?(?=})", self.remote_dir_pattern))

@classmethod
def setup_argparse(cls, parser: argparse.ArgumentParser) -> None:
Expand Down Expand Up @@ -128,20 +156,29 @@ def setup_argparse(cls, parser: argparse.ArgumentParser) -> None:
action="store_true",
help="After files are transferred to SODAR, it will proceed with validation and move.",
)
parser.add_argument(
"--preset",
default="default",
choices=DEST_PATTERN_PRESETS.keys(),
help=f"Use predefined values for regular expression to find local files (--src-regex) and pattern to for "
f"constructing remote file paths.\nDefault src-regex: {SRC_REGEX_PRESETS['default']}.\n"
f"Default --remote-dir-pattern: {DEST_PATTERN_PRESETS['default']}.",
)
parser.add_argument(
"--src-regex",
default=DEFAULT_SRC_REGEX,
help=f"Regular expression to use for matching input fastq files, default: {DEFAULT_SRC_REGEX}. "
"All capture groups can be used for --remote-dir-pattern, but only 'sample' is used by default. "
"Only this regex controls which files are ingested, so other files than fastq.gz can be used too.",
default=None,
help="Manually defined regular expression to use for matching input fastq files. Takes precedence over "
"--preset. This regex controls which files are ingested, so it can be used for any file type. "
"Any named capture group in the regex can be used with --remote-dir-pattern. The 'sample' group is "
"used to set irods collection names (as-is or via --match-column).",
)
parser.add_argument(
"--remote-dir-pattern",
default=DEFAULT_DEST_PATTERN,
help=f"Pattern to use for constructing remote pattern, default: {DEFAULT_DEST_PATTERN}. "
"'collection_name' is the target iRODS collection and will be filled with the (-m regex modified) "
"'sample' unless --match-column is not used to fill it from the assay table. Any capture group of the "
"src-regex ('sample', 'lane', ...) can be used along with 'date' and 'filename'.",
default=None,
help="Manually defined pattern to use for constructing remote file paths. Takes precedence over "
"--preset. 'collection_name' is the target iRODS collection and will be filled with the (-m regex "
tedil marked this conversation as resolved.
Show resolved Hide resolved
"modified) 'sample', or if --match-column is used with the corresponding value from the assay table. "
"Any capture group of the src-regex ('sample', 'lane', ...) can be used along with 'date' and 'filename'.",
)
parser.add_argument(
"--match-column",
Expand Down Expand Up @@ -223,6 +260,13 @@ def check_args(self, args):
)
res = 1

if args.src_regex and args.remote_dir_pattern and args.preset != "default":
logger.error(
"Using both --src-regex and --remote-dir-pattern at the same time overwrites all values defined "
"by --preset. Please drop the use of --preset or at least one of the other manual definitions."
)
res = 1

return res

def get_project_uuid(self, lz_uuid: str):
Expand Down Expand Up @@ -417,6 +461,12 @@ def build_jobs(self, library_names=None):
folders = self.download_webdav(self.args.sources)
transfer_jobs = []

if self.args.src_regex:
use_regex = re.compile(self.args.src_regex)
else:
use_regex = re.compile(SRC_REGEX_PRESETS[self.args.preset])
# logger.debug(f"Using regex: {use_regex}")

for folder in folders:
for path in glob.iglob(f"{folder}/**/*", recursive=True):
real_path = os.path.realpath(path)
Expand All @@ -432,11 +482,10 @@ def build_jobs(self, library_names=None):
): # pragma: nocover
raise MissingFileException("Missing file %s" % (real_path + ".md5"))

m = re.match(self.args.src_regex, path)
# logger.debug(f"Checking file: {path}")
m = re.match(use_regex, path)
if m:
logger.debug(
"Matched %s with regex %s: %s", path, self.args.src_regex, m.groupdict()
)
logger.debug("Matched %s with regex %s: %s", path, use_regex, m.groupdict())
match_wildcards = dict(
item
for item in m.groupdict(default="").items()
Expand All @@ -449,9 +498,7 @@ def build_jobs(self, library_names=None):
sample_name = re.sub(m_pat, r_pat, sample_name)

try:
remote_file = pathlib.Path(
lz_irods_path
) / self.args.remote_dir_pattern.format(
remote_file = pathlib.Path(lz_irods_path) / self.remote_dir_pattern.format(
# Removed the `+ self.args.add_suffix` here, since adding anything after the file extension is a bad idea
filename=pathlib.Path(path).name,
date=self.args.remote_dir_date,
Expand Down Expand Up @@ -495,6 +542,15 @@ def execute(self) -> typing.Optional[int]:

lz_uuid, transfer_jobs = self.build_jobs()
transfer_jobs = sorted(transfer_jobs, key=lambda x: x.path_local)
# Exit early if no files were found/matched
if not transfer_jobs:
if self.args.src_regex:
used_regex = self.args.src_regex
else:
used_regex = SRC_REGEX_PRESETS[self.args.preset]

logger.warning("No matching files were found!\nUsed regex: %s", used_regex)
return None

if self.fix_md5_files:
transfer_jobs = self._execute_md5_files_fix(transfer_jobs)
Expand Down
2 changes: 1 addition & 1 deletion requirements/test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -44,4 +44,4 @@ pyflakes
setuptools ==65.6.3

# needed for testing snappy workflow methods
snappy-pipeline @ git+https://github.com/bihealth/snappy-pipeline
snappy-pipeline @ git+https://github.com/bihealth/snappy-pipeline@v0.1.1
Loading
Loading