From 982832ba6b048b52b36fe36613ec3fa40750a1c0 Mon Sep 17 00:00:00 2001
From: Olivier Gayot <olivier.gayot@canonical.com>
Date: Sun, 26 Nov 2023 20:50:18 +0100
Subject: [PATCH] Ensure that Python scripts close files that they open for
 writing

Python scripts under scripts/ in the source tree do not consistently
close files that they open for writing. While some of the scripts use
context managers, most of them do not (or do so inconsistently).  In
previous releases of Python, this apparently was not much of a concern.
However, Python 3.12 seems to be much less forgiving when files are not
properly closed. When running the test suite, many of the files that are
not explicitly closed appear truncated. This leads to various tests
failing or hanging.

Furthermore, khmer defines the get_file_writer() function, but it cannot
be consistently used as a context manager because it sometimes closes
the underlying file descriptor ; and sometimes does not depending on the
arguments.

Fixed by defining a new FileWriter context manager and ensuring that
each call to open() / get_file_writer() frees up resources properly.

Signed-off-by: Olivier Gayot <olivier.gayot@canonical.com>
---
 khmer/kfile.py                    |  35 +++++
 scripts/abundance-dist-single.py  |   4 +
 scripts/abundance-dist.py         |  33 ++---
 scripts/do-partition.py           |   4 +-
 scripts/extract-long-sequences.py |  14 +-
 scripts/extract-paired-reads.py   |  58 +++++----
 scripts/extract-partitions.py     |  21 ++-
 scripts/fastq-to-fasta.py         |  26 ++--
 scripts/filter-abund-single.py    |  35 ++---
 scripts/filter-abund.py           |  60 +++++----
 scripts/filter-stoptags.py        |   7 +-
 scripts/interleave-reads.py       |  59 +++++----
 scripts/normalize-by-median.py    |  59 +++++----
 scripts/partition-graph.py        |   3 +-
 scripts/sample-reads-randomly.py  |  33 ++---
 scripts/split-paired-reads.py     |  56 ++++----
 scripts/trim-low-abund.py         | 206 +++++++++++++++---------------
 17 files changed, 396 insertions(+), 317 deletions(-)

diff --git a/khmer/kfile.py b/khmer/kfile.py
index a3a8170627..7d0ccd72de 100755
--- a/khmer/kfile.py
+++ b/khmer/kfile.py
@@ -35,6 +35,7 @@
 """File handling/checking utilities for command-line scripts."""
 
 
+import contextlib
 import os
 import sys
 import errno
@@ -246,3 +247,37 @@ def get_file_writer(file_handle, do_gzip, do_bzip):
         ofile = file_handle
 
     return ofile
+
+
+@contextlib.contextmanager
+def FileWriter(file_handle, do_gzip, do_bzip, *, steal_ownership=False):
+    """Alternative to get_file_writer that requires the use of a with block.
+    The intent is to address an inherent problem with get_file_writer() that
+    makes it difficult to use as a context manager. When get_file_writer() is
+    called with both gzip=False and bzip=False, the underlying file handle is
+    returned. As a consequence, doing:
+    >  with get_file_writer(sys.stdout, bzip=False, gzip=False) as fh:
+    >      pass
+    ends up closing sys.stdout when the with block is exited. Using the
+    function without a context manager avoids the issue, but then it results in
+    leaked open files when either bzip=True or gzip=True.
+    FileWriter must be used as a context manager, but it ensures that resources
+    are closed upon exiting the with block. Furthermore, it can be explicitly
+    requested to close the underlying file_handle."""
+    ofile = None
+
+    if do_gzip and do_bzip:
+        raise ValueError("Cannot specify both bzip and gzip compression!")
+
+    if do_gzip:
+        ofile = gzip.GzipFile(fileobj=file_handle, mode='w')
+    elif do_bzip:
+        ofile = bz2.open(file_handle, mode='w')
+    else:
+        ofile = contextlib.nullcontext(enter_result=file_handle)
+
+    with ofile as x:
+        yield x
+
+    if steal_ownership:
+        file_handle.close()
diff --git a/scripts/abundance-dist-single.py b/scripts/abundance-dist-single.py
index 7fcf580276..409c7e7827 100755
--- a/scripts/abundance-dist-single.py
+++ b/scripts/abundance-dist-single.py
@@ -218,6 +218,10 @@ def __do_abundance_dist__(read_parser):
 
     log_info('wrote to: {output}', output=args.output_histogram_filename)
 
+    # Ensure that the output files are properly written. Python 3.12 seems to
+    # be less forgiving here ..
+    hist_fp.close()
+
 
 if __name__ == '__main__':
     main()
diff --git a/scripts/abundance-dist.py b/scripts/abundance-dist.py
index af8b6c5e8e..7076f6e253 100755
--- a/scripts/abundance-dist.py
+++ b/scripts/abundance-dist.py
@@ -42,6 +42,7 @@
 Use '-h' for parameter help.
 """
 
+import contextlib
 import sys
 import csv
 import khmer
@@ -143,26 +144,28 @@ def main():
         sys.exit(1)
 
     if args.output_histogram_filename in ('-', '/dev/stdout'):
-        countgraph_fp = sys.stdout
+        countgraph_ctx = contextlib.nullcontext(enter_result=sys.stdout)
     else:
-        countgraph_fp = open(args.output_histogram_filename, 'w')
-    countgraph_fp_csv = csv.writer(countgraph_fp)
-    # write headers:
-    countgraph_fp_csv.writerow(['abundance', 'count', 'cumulative',
-                                'cumulative_fraction'])
+        countgraph_ctx = open(args.output_histogram_filename, 'w')
 
-    sofar = 0
-    for _, i in enumerate(abundances):
-        if i == 0 and not args.output_zero:
-            continue
+    with countgraph_ctx as countgraph_fp:
+        countgraph_fp_csv = csv.writer(countgraph_fp)
+        # write headers:
+        countgraph_fp_csv.writerow(['abundance', 'count', 'cumulative',
+                                    'cumulative_fraction'])
 
-        sofar += i
-        frac = sofar / float(total)
+        sofar = 0
+        for _, i in enumerate(abundances):
+            if i == 0 and not args.output_zero:
+                continue
 
-        countgraph_fp_csv.writerow([_, i, sofar, round(frac, 3)])
+            sofar += i
+            frac = sofar / float(total)
 
-        if sofar == total:
-            break
+            countgraph_fp_csv.writerow([_, i, sofar, round(frac, 3)])
+
+            if sofar == total:
+                break
 
 
 if __name__ == '__main__':
diff --git a/scripts/do-partition.py b/scripts/do-partition.py
index 0027270b58..53552be442 100755
--- a/scripts/do-partition.py
+++ b/scripts/do-partition.py
@@ -168,8 +168,8 @@ def main():  # pylint: disable=too-many-locals,too-many-statements
         worker_q.put((nodegraph, _, start, end))
 
     print('enqueued %d subset tasks' % n_subsets, file=sys.stderr)
-    open('%s.info' % args.graphbase, 'w').write('%d subsets total\n'
-                                                % (n_subsets))
+    with open('%s.info' % args.graphbase, 'w') as info_fp:
+        info_fp.write('%d subsets total\n' % (n_subsets))
 
     if n_subsets < args.threads:
         args.threads = n_subsets
diff --git a/scripts/extract-long-sequences.py b/scripts/extract-long-sequences.py
index 7526c4aedf..c84e5d5d68 100755
--- a/scripts/extract-long-sequences.py
+++ b/scripts/extract-long-sequences.py
@@ -52,7 +52,7 @@
 import sys
 from khmer import __version__
 from khmer.utils import write_record
-from khmer.kfile import add_output_compression_type, get_file_writer
+from khmer.kfile import add_output_compression_type, FileWriter
 from khmer.khmer_args import sanitize_help, KhmerArgumentParser
 
 
@@ -81,12 +81,12 @@ def get_parser():
 
 def main():
     args = sanitize_help(get_parser()).parse_args()
-    outfp = get_file_writer(args.output, args.gzip, args.bzip)
-    for filename in args.input_filenames:
-        for record in screed.open(filename):
-            if len(record['sequence']) >= args.length:
-                write_record(record, outfp)
-    print('wrote to: ' + args.output.name, file=sys.stderr)
+    with FileWriter(args.output, args.gzip, args.bzip) as outfp:
+        for filename in args.input_filenames:
+            for record in screed.open(filename):
+                if len(record['sequence']) >= args.length:
+                    write_record(record, outfp)
+        print('wrote to: ' + args.output.name, file=sys.stderr)
 
 
 if __name__ == '__main__':
diff --git a/scripts/extract-paired-reads.py b/scripts/extract-paired-reads.py
index 29d7cbe3cb..8335cf893d 100755
--- a/scripts/extract-paired-reads.py
+++ b/scripts/extract-paired-reads.py
@@ -44,6 +44,7 @@
 
 Reads FASTQ and FASTA input, retains format for output.
 """
+from contextlib import nullcontext
 import sys
 import os.path
 import textwrap
@@ -53,7 +54,7 @@
 from khmer.khmer_args import sanitize_help, KhmerArgumentParser
 from khmer.khmer_args import FileType as khFileType
 from khmer.kfile import add_output_compression_type
-from khmer.kfile import get_file_writer
+from khmer.kfile import FileWriter
 
 from khmer.utils import broken_paired_reader, write_record, write_record_pair
 
@@ -132,39 +133,40 @@ def main():
 
     # OVERRIDE default output file locations with -p, -s
     if args.output_paired:
-        paired_fp = get_file_writer(args.output_paired, args.gzip, args.bzip)
-        out2 = paired_fp.name
+        paired_ctx = FileWriter(args.output_paired, args.gzip, args.bzip)
+        out2 = args.output_paired.name
     else:
         # Don't override, just open the default filename from above
-        paired_fp = get_file_writer(open(out2, 'wb'), args.gzip, args.bzip)
+        paired_ctx = FileWriter(open(out2, 'wb'), args.gzip, args.bzip,
+                                steal_ownership=True)
     if args.output_single:
-        single_fp = get_file_writer(args.output_single, args.gzip, args.bzip)
+        single_ctx = FileWriter(args.output_single, args.gzip, args.bzip)
         out1 = args.output_single.name
     else:
         # Don't override, just open the default filename from above
-        single_fp = get_file_writer(open(out1, 'wb'), args.gzip, args.bzip)
-
-    print('reading file "%s"' % infile, file=sys.stderr)
-    print('outputting interleaved pairs to "%s"' % out2, file=sys.stderr)
-    print('outputting orphans to "%s"' % out1, file=sys.stderr)
-
-    n_pe = 0
-    n_se = 0
-
-    reads = ReadParser(infile)
-    for index, is_pair, read1, read2 in broken_paired_reader(reads):
-        if index % 100000 == 0 and index > 0:
-            print('...', index, file=sys.stderr)
-
-        if is_pair:
-            write_record_pair(read1, read2, paired_fp)
-            n_pe += 1
-        else:
-            write_record(read1, single_fp)
-            n_se += 1
-
-    single_fp.close()
-    paired_fp.close()
+        single_ctx = FileWriter(open(out1, 'wb'), args.gzip, args.bzip,
+                                steal_ownership=True)
+
+    with paired_ctx as paired_fp, single_ctx as single_fp:
+        print('reading file "%s"' % infile, file=sys.stderr)
+        print('outputting interleaved pairs to "%s"' % out2,
+              file=sys.stderr)
+        print('outputting orphans to "%s"' % out1, file=sys.stderr)
+
+        n_pe = 0
+        n_se = 0
+
+        reads = ReadParser(infile)
+        for index, is_pair, read1, read2 in broken_paired_reader(reads):
+            if index % 100000 == 0 and index > 0:
+                print('...', index, file=sys.stderr)
+
+            if is_pair:
+                write_record_pair(read1, read2, paired_fp)
+                n_pe += 1
+            else:
+                write_record(read1, single_fp)
+                n_se += 1
 
     if n_pe == 0:
         raise Exception("no paired reads!? check file formats...")
diff --git a/scripts/extract-partitions.py b/scripts/extract-partitions.py
index a1d25dcdd6..b196c100c4 100755
--- a/scripts/extract-partitions.py
+++ b/scripts/extract-partitions.py
@@ -301,10 +301,10 @@ def main():
                                    args.max_size)
 
     if args.output_unassigned:
-        ofile = open('%s.unassigned.%s' % (args.prefix, suffix), 'wb')
-        unassigned_fp = get_file_writer(ofile, args.gzip, args.bzip)
-        extractor.process_unassigned(unassigned_fp)
-        unassigned_fp.close()
+        with open('%s.unassigned.%s' % (args.prefix, suffix), 'wb') as ofile:
+            unassigned_fp = get_file_writer(ofile, args.gzip, args.bzip)
+            extractor.process_unassigned(unassigned_fp)
+            unassigned_fp.close()
     else:
         extractor.process_unassigned()
 
@@ -320,13 +320,21 @@ def main():
         print('nothing to output; exiting!', file=sys.stderr)
         return
 
+    to_close = []
     # open a bunch of output files for the different groups
     group_fps = {}
     for index in range(extractor.group_n):
         fname = '%s.group%04d.%s' % (args.prefix, index, suffix)
-        group_fp = get_file_writer(open(fname, 'wb'), args.gzip,
+        back_fp = open(fname, 'wb')
+        group_fp = get_file_writer(back_fp, args.gzip,
                                    args.bzip)
         group_fps[index] = group_fp
+        # It feels more natural to close the writer before closing the
+        # underlying file. fp.close() is theoretically idempotent, so it should
+        # be fine even though sometimes get_file_writer "steals" ownership of
+        # the underlying stream.
+        to_close.append(group_fp)
+        to_close.append(back_fp)
 
     # write 'em all out!
     # refresh the generator
@@ -351,6 +359,9 @@ def main():
            args.prefix,
            suffix), file=sys.stderr)
 
+    for fp in to_close:
+        fp.close()
+
 
 if __name__ == '__main__':
     main()
diff --git a/scripts/fastq-to-fasta.py b/scripts/fastq-to-fasta.py
index ef597d9c39..c41d1e7ab6 100755
--- a/scripts/fastq-to-fasta.py
+++ b/scripts/fastq-to-fasta.py
@@ -45,7 +45,7 @@
 import sys
 import screed
 from khmer import __version__
-from khmer.kfile import (add_output_compression_type, get_file_writer,
+from khmer.kfile import (add_output_compression_type, FileWriter,
                          describe_file_handle)
 from khmer.utils import write_record
 from khmer.khmer_args import sanitize_help, KhmerArgumentParser
@@ -74,21 +74,21 @@ def main():
     args = sanitize_help(get_parser()).parse_args()
 
     print('fastq from ', args.input_sequence, file=sys.stderr)
-    outfp = get_file_writer(args.output, args.gzip, args.bzip)
-    n_count = 0
-    for n, record in enumerate(screed.open(args.input_sequence)):
-        if n % 10000 == 0:
-            print('...', n, file=sys.stderr)
+    with FileWriter(args.output, args.gzip, args.bzip) as outfp:
+        n_count = 0
+        for n, record in enumerate(screed.open(args.input_sequence)):
+            if n % 10000 == 0:
+                print('...', n, file=sys.stderr)
 
-        sequence = record['sequence']
+            sequence = record['sequence']
 
-        if 'N' in sequence:
-            if not args.n_keep:
-                n_count += 1
-                continue
+            if 'N' in sequence:
+                if not args.n_keep:
+                    n_count += 1
+                    continue
 
-        del record['quality']
-        write_record(record, outfp)
+            del record['quality']
+            write_record(record, outfp)
 
     print('\n' + 'lines from ' + args.input_sequence, file=sys.stderr)
 
diff --git a/scripts/filter-abund-single.py b/scripts/filter-abund-single.py
index 3edcef86ec..14e2944add 100755
--- a/scripts/filter-abund-single.py
+++ b/scripts/filter-abund-single.py
@@ -60,7 +60,7 @@
 from khmer.kfile import (check_input_files, check_space,
                          check_space_for_graph,
                          add_output_compression_type,
-                         get_file_writer)
+                         FileWriter)
 from khmer.khmer_logger import (configure_logging, log_info, log_error,
                                 log_warn)
 from khmer.trimming import (trim_record)
@@ -160,22 +160,23 @@ def main():
         outfile = os.path.basename(args.datafile) + '.abundfilt'
     else:
         outfile = args.outfile
-    outfp = open(outfile, 'wb')
-    outfp = get_file_writer(outfp, args.gzip, args.bzip)
-
-    paired_iter = broken_paired_reader(ReadParser(args.datafile),
-                                       min_length=graph.ksize(),
-                                       force_single=True)
-
-    for n, is_pair, read1, read2 in paired_iter:
-        assert not is_pair
-        assert read2 is None
-
-        trimmed_record, _ = trim_record(graph, read1, args.cutoff,
-                                        args.variable_coverage,
-                                        args.normalize_to)
-        if trimmed_record:
-            write_record(trimmed_record, outfp)
+
+    with FileWriter(open(outfile, 'wb'), args.gzip, args.bzip,
+                    steal_ownership=True) as outfp:
+
+        paired_iter = broken_paired_reader(ReadParser(args.datafile),
+                                           min_length=graph.ksize(),
+                                           force_single=True)
+
+        for n, is_pair, read1, read2 in paired_iter:
+            assert not is_pair
+            assert read2 is None
+
+            trimmed_record, _ = trim_record(graph, read1, args.cutoff,
+                                            args.variable_coverage,
+                                            args.normalize_to)
+            if trimmed_record:
+                write_record(trimmed_record, outfp)
 
     log_info('output in {outfile}', outfile=outfile)
 
diff --git a/scripts/filter-abund.py b/scripts/filter-abund.py
index cb729c9b77..1d32455edb 100755
--- a/scripts/filter-abund.py
+++ b/scripts/filter-abund.py
@@ -44,6 +44,7 @@
 
 Use '-h' for parameter help.
 """
+from contextlib import nullcontext
 import sys
 import os
 import textwrap
@@ -56,7 +57,7 @@
                               sanitize_help, check_argument_range)
 from khmer.khmer_args import FileType as khFileType
 from khmer.kfile import (check_input_files, check_space,
-                         add_output_compression_type, get_file_writer)
+                         add_output_compression_type, FileWriter)
 from khmer.khmer_logger import (configure_logging, log_info, log_error,
                                 log_warn)
 from khmer.trimming import (trim_record)
@@ -137,31 +138,38 @@ def main():
 
     if args.single_output_file:
         outfile = args.single_output_file.name
-        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)
-
-    # the filtering loop
-    for infile in infiles:
-        log_info('filtering {infile}', infile=infile)
-        if not args.single_output_file:
-            outfile = os.path.basename(infile) + '.abundfilt'
-            outfp = open(outfile, 'wb')
-            outfp = get_file_writer(outfp, args.gzip, args.bzip)
-
-        paired_iter = broken_paired_reader(ReadParser(infile),
-                                           min_length=ksize,
-                                           force_single=True)
-
-        for n, is_pair, read1, read2 in paired_iter:
-            assert not is_pair
-            assert read2 is None
-
-            trimmed_record, _ = trim_record(countgraph, read1, args.cutoff,
-                                            args.variable_coverage,
-                                            args.normalize_to)
-            if trimmed_record:
-                write_record(trimmed_record, outfp)
-
-        log_info('output in {outfile}', outfile=outfile)
+        out_single_ctx = FileWriter(args.single_output_file, args.gzip,
+                                    args.bzip)
+    else:
+        out_single_ctx = nullcontext()
+
+    with out_single_ctx as out_single_fp:
+        # the filtering loop
+        for infile in infiles:
+            log_info('filtering {infile}', infile=infile)
+            if not args.single_output_file:
+                outfile = os.path.basename(infile) + '.abundfilt'
+                out_ctx = FileWriter(open(outfile, 'wb'), args.gzip,
+                                     args.bzip, steal_ownership=True)
+            else:
+                out_ctx = nullcontext(enter_result=out_single_fp)
+
+            paired_iter = broken_paired_reader(ReadParser(infile),
+                                               min_length=ksize,
+                                               force_single=True)
+
+            with out_ctx as outfp:
+                for n, is_pair, read1, read2 in paired_iter:
+                    assert not is_pair
+                    assert read2 is None
+
+                    trimmed_record, _ = trim_record(countgraph, read1, args.cutoff,
+                                                    args.variable_coverage,
+                                                    args.normalize_to)
+                    if trimmed_record:
+                        write_record(trimmed_record, outfp)
+
+                log_info('output in {outfile}', outfile=outfile)
 
 
 if __name__ == '__main__':
diff --git a/scripts/filter-stoptags.py b/scripts/filter-stoptags.py
index a2bc48f170..b97910eafb 100755
--- a/scripts/filter-stoptags.py
+++ b/scripts/filter-stoptags.py
@@ -108,10 +108,9 @@ def process_fn(record):
         print('filtering', infile, file=sys.stderr)
         outfile = os.path.basename(infile) + '.stopfilt'
 
-        outfp = open(outfile, 'w')
-
-        tsp = ThreadedSequenceProcessor(process_fn)
-        tsp.start(verbose_loader(infile), outfp)
+        with open(outfile, 'w') as outfp:
+            tsp = ThreadedSequenceProcessor(process_fn)
+            tsp.start(verbose_loader(infile), outfp)
 
         print('output in', outfile, file=sys.stderr)
 
diff --git a/scripts/interleave-reads.py b/scripts/interleave-reads.py
index 65c557d5f2..4c02591157 100755
--- a/scripts/interleave-reads.py
+++ b/scripts/interleave-reads.py
@@ -52,7 +52,7 @@
 from khmer.kfile import check_input_files, check_space
 from khmer.khmer_args import sanitize_help, KhmerArgumentParser
 from khmer.khmer_args import FileType as khFileType
-from khmer.kfile import (add_output_compression_type, get_file_writer,
+from khmer.kfile import (add_output_compression_type, FileWriter,
                          describe_file_handle)
 from khmer.utils import (write_record_pair, check_is_left, check_is_right,
                          check_is_pair)
@@ -109,42 +109,41 @@ def main():
 
     print("Interleaving:\n\t%s\n\t%s" % (s1_file, s2_file), file=sys.stderr)
 
-    outfp = get_file_writer(args.output, args.gzip, args.bzip)
-
-    counter = 0
-    screed_iter_1 = screed.open(s1_file)
-    screed_iter_2 = screed.open(s2_file)
-    for read1, read2 in zip_longest(screed_iter_1, screed_iter_2):
-        if read1 is None or read2 is None:
-            print(("ERROR: Input files contain different number"
-                   " of records."), file=sys.stderr)
-            sys.exit(1)
+    with FileWriter(args.output, args.gzip, args.bzip) as outfp:
+        counter = 0
+        screed_iter_1 = screed.open(s1_file)
+        screed_iter_2 = screed.open(s2_file)
+        for read1, read2 in zip_longest(screed_iter_1, screed_iter_2):
+            if read1 is None or read2 is None:
+                print(("ERROR: Input files contain different number"
+                       " of records."), file=sys.stderr)
+                sys.exit(1)
 
-        if counter % 100000 == 0:
-            print('...', counter, 'pairs', file=sys.stderr)
-        counter += 1
+            if counter % 100000 == 0:
+                print('...', counter, 'pairs', file=sys.stderr)
+            counter += 1
 
-        name1 = read1.name
-        name2 = read2.name
+            name1 = read1.name
+            name2 = read2.name
 
-        if not args.no_reformat:
-            if not check_is_left(name1):
-                name1 += '/1'
-            if not check_is_right(name2):
-                name2 += '/2'
+            if not args.no_reformat:
+                if not check_is_left(name1):
+                    name1 += '/1'
+                if not check_is_right(name2):
+                    name2 += '/2'
 
-            read1.name = name1
-            read2.name = name2
+                read1.name = name1
+                read2.name = name2
 
-            if not check_is_pair(read1, read2):
-                print("ERROR: This doesn't look like paired data! "
-                      "%s %s" % (read1.name, read2.name), file=sys.stderr)
-                sys.exit(1)
+                if not check_is_pair(read1, read2):
+                    print("ERROR: This doesn't look like paired data! "
+                          "%s %s" % (read1.name, read2.name), file=sys.stderr)
+                    sys.exit(1)
 
-        write_record_pair(read1, read2, outfp)
+            write_record_pair(read1, read2, outfp)
 
-    print('final: interleaved %d pairs' % counter, file=sys.stderr)
-    print('output written to', describe_file_handle(outfp), file=sys.stderr)
+        print('final: interleaved %d pairs' % counter, file=sys.stderr)
+        print('output written to', describe_file_handle(outfp), file=sys.stderr)
 
 
 if __name__ == '__main__':
diff --git a/scripts/normalize-by-median.py b/scripts/normalize-by-median.py
index 39e387663e..ac3c84540f 100755
--- a/scripts/normalize-by-median.py
+++ b/scripts/normalize-by-median.py
@@ -46,6 +46,7 @@
 Use '-h' for parameter help.
 """
 
+from contextlib import nullcontext
 import sys
 import screed
 import os
@@ -60,7 +61,7 @@
 import argparse
 from khmer.kfile import (check_space, check_space_for_graph,
                          check_valid_file_exists, add_output_compression_type,
-                         get_file_writer, describe_file_handle)
+                         FileWriter, describe_file_handle)
 from khmer.utils import (write_record, broken_paired_reader, ReadBundle,
                          clean_input_reads)
 from khmer.khmer_logger import (configure_logging, log_info, log_error)
@@ -360,39 +361,43 @@ def main():  # pylint: disable=too-many-branches,too-many-statements
     output_name = None
 
     if args.single_output_file:
-        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)
+        out_single_ctx = FileWriter(args.single_output_file, args.gzip, args.bzip)
     else:
+        out_single_ctx = nullcontext()
         if '-' in filenames or '/dev/stdin' in filenames:
             print("Accepting input from stdin; output filename must "
                   "be provided with '-o'.", file=sys.stderr)
             sys.exit(1)
 
-    #
-    # main loop: iterate over all files given, do diginorm.
-    #
-
-    for filename, require_paired in files:
-        if not args.single_output_file:
-            output_name = os.path.basename(filename) + '.keep'
-            outfp = open(output_name, 'wb')
-            outfp = get_file_writer(outfp, args.gzip, args.bzip)
-
-        # failsafe context manager in case an input file breaks
-        with catch_io_errors(filename, outfp, args.single_output_file,
-                             args.force, corrupt_files):
-            screed_iter = clean_input_reads(screed.open(filename))
-            reader = broken_paired_reader(screed_iter, min_length=args.ksize,
-                                          force_single=force_single,
-                                          require_paired=require_paired)
-
-            # actually do diginorm
-            for record in with_diagnostics(reader, filename):
-                if record is not None:
-                    write_record(record, outfp)
-
-            log_info('output in {name}', name=describe_file_handle(outfp))
+    with out_single_ctx as out_single_fp:
+        #
+        # main loop: iterate over all files given, do diginorm.
+        #
+        for filename, require_paired in files:
             if not args.single_output_file:
-                outfp.close()
+                output_name = os.path.basename(filename) + '.keep'
+                out_ctx = FileWriter(open(output_name, 'wb'), args.gzip,
+                                     args.bzip, steal_ownership=True)
+            else:
+                out_ctx = nullcontext(enter_result=out_single_fp)
+
+            with out_ctx as outfp:
+                # failsafe context manager in case an input file breaks
+                with catch_io_errors(filename, outfp, args.single_output_file,
+                                     args.force, corrupt_files):
+                    screed_iter = clean_input_reads(screed.open(filename))
+                    reader = broken_paired_reader(screed_iter,
+                                                  min_length=args.ksize,
+                                                  force_single=force_single,
+                                                  require_paired=require_paired)
+
+                    # actually do diginorm
+                    for record in with_diagnostics(reader, filename):
+                        if record is not None:
+                            write_record(record, outfp)
+
+                    log_info('output in {name}',
+                             name=describe_file_handle(outfp))
 
     # finished - print out some diagnostics.
 
diff --git a/scripts/partition-graph.py b/scripts/partition-graph.py
index f841fe4848..82e2bf705e 100755
--- a/scripts/partition-graph.py
+++ b/scripts/partition-graph.py
@@ -143,7 +143,8 @@ def main():
         worker_q.put((nodegraph, _, start, end))
 
     print('enqueued %d subset tasks' % n_subsets, file=sys.stderr)
-    open('%s.info' % basename, 'w').write('%d subsets total\n' % (n_subsets))
+    with open('%s.info' % basename, 'w') as info_fp:
+        info_fp.write('%d subsets total\n' % (n_subsets))
 
     n_threads = args.threads
     if n_subsets < n_threads:
diff --git a/scripts/sample-reads-randomly.py b/scripts/sample-reads-randomly.py
index 79b02d764e..c9ded92600 100755
--- a/scripts/sample-reads-randomly.py
+++ b/scripts/sample-reads-randomly.py
@@ -47,6 +47,7 @@
 """
 
 import argparse
+from contextlib import nullcontext
 import os.path
 import random
 import textwrap
@@ -55,7 +56,7 @@
 from khmer import __version__
 from khmer import ReadParser
 from khmer.kfile import (check_input_files, add_output_compression_type,
-                         get_file_writer)
+                         FileWriter)
 from khmer.khmer_args import sanitize_help, KhmerArgumentParser
 from khmer.utils import write_record, broken_paired_reader
 
@@ -201,27 +202,27 @@ def main():
         print('Writing %d sequences to %s' %
               (len(reads[0]), output_filename), file=sys.stderr)
 
-        output_file = args.output_file
-        if not output_file:
-            output_file = open(output_filename, 'wb')
+        output_back_ctx = nullcontext(args.output_file)
+        if not args.output_file:
+            output_back_ctx = open(output_filename, 'wb')
 
-        output_file = get_file_writer(output_file, args.gzip, args.bzip)
-
-        for records in reads[0]:
-            write_record(records[0], output_file)
-            if records[1] is not None:
-                write_record(records[1], output_file)
+        with output_back_ctx as output_back_fp:
+            with FileWriter(output_back_fp, args.gzip, args.bzip) as output_fp:
+                for records in reads[0]:
+                    write_record(records[0], output_fp)
+                    if records[1] is not None:
+                        write_record(records[1], output_fp)
     else:
         for n in range(num_samples):
             n_filename = output_filename + '.%d' % n
             print('Writing %d sequences to %s' %
                   (len(reads[n]), n_filename), file=sys.stderr)
-            output_file = get_file_writer(open(n_filename, 'wb'), args.gzip,
-                                          args.bzip)
-            for records in reads[n]:
-                write_record(records[0], output_file)
-                if records[1] is not None:
-                    write_record(records[1], output_file)
+            with FileWriter(open(n_filename, 'wb'), args.gzip, args.bzip,
+                            steal_ownership=True) as output_fp:
+                for records in reads[n]:
+                    write_record(records[0], output_fp)
+                    if records[1] is not None:
+                        write_record(records[1], output_fp)
 
 
 if __name__ == '__main__':
diff --git a/scripts/split-paired-reads.py b/scripts/split-paired-reads.py
index 5750100312..cd3dd06bc1 100755
--- a/scripts/split-paired-reads.py
+++ b/scripts/split-paired-reads.py
@@ -44,6 +44,7 @@
 
 Reads FASTQ and FASTA input, retains format for output.
 """
+from contextlib import nullcontext
 import sys
 import os
 import textwrap
@@ -56,7 +57,7 @@
                          UnpairedReadsError)
 from khmer.kfile import (check_input_files, check_space,
                          add_output_compression_type,
-                         get_file_writer, describe_file_handle)
+                         FileWriter, describe_file_handle)
 
 
 def get_parser():
@@ -145,22 +146,26 @@ def main():
 
     # OVERRIDE output file locations with -1, -2
     if args.output_first:
-        fp_out1 = get_file_writer(args.output_first, args.gzip, args.bzip)
-        out1 = fp_out1.name
+        out1_ctx = FileWriter(args.output_first, args.gzip, args.bzip)
+        out1 = args.output_first.name
     else:
         # Use default filename created above
-        fp_out1 = get_file_writer(open(out1, 'wb'), args.gzip, args.bzip)
+        out1_ctx = FileWriter(open(out1, 'wb'), args.gzip, args.bzip,
+                              steal_ownership=True)
     if args.output_second:
-        fp_out2 = get_file_writer(args.output_second, args.gzip, args.bzip)
-        out2 = fp_out2.name
+        out2_ctx = FileWriter(args.output_second, args.gzip, args.bzip)
+        out2 = args.output_second.name
     else:
         # Use default filename created above
-        fp_out2 = get_file_writer(open(out2, 'wb'), args.gzip, args.bzip)
+        out2_ctx = FileWriter(open(out2, 'wb'), args.gzip, args.bzip,
+                              steal_ownership=True)
 
     # put orphaned reads here, if -0!
     if args.output_orphaned:
-        fp_out0 = get_file_writer(args.output_orphaned, args.gzip, args.bzip)
+        out0_ctx = FileWriter(args.output_orphaned, args.gzip, args.bzip)
         out0 = describe_file_handle(args.output_orphaned)
+    else:
+        out0_ctx = nullcontext()
 
     counter1 = 0
     counter2 = 0
@@ -171,23 +176,24 @@ def main():
     paired_iter = broken_paired_reader(ReadParser(infile),
                                        require_paired=not args.output_orphaned)
 
-    try:
-        for index, is_pair, record1, record2 in paired_iter:
-            if index % 10000 == 0:
-                print('...', index, file=sys.stderr)
-
-            if is_pair:
-                write_record(record1, fp_out1)
-                counter1 += 1
-                write_record(record2, fp_out2)
-                counter2 += 1
-            elif args.output_orphaned:
-                write_record(record1, fp_out0)
-                counter3 += 1
-    except UnpairedReadsError as e:
-        print("Unpaired reads found starting at {name}; exiting".format(
-            name=e.read1.name), file=sys.stderr)
-        sys.exit(1)
+    with out0_ctx as fp_out0, out1_ctx as fp_out1, out2_ctx as fp_out2:
+        try:
+            for index, is_pair, record1, record2 in paired_iter:
+                if index % 10000 == 0:
+                    print('...', index, file=sys.stderr)
+
+                if is_pair:
+                    write_record(record1, fp_out1)
+                    counter1 += 1
+                    write_record(record2, fp_out2)
+                    counter2 += 1
+                elif args.output_orphaned:
+                    write_record(record1, fp_out0)
+                    counter3 += 1
+        except UnpairedReadsError as e:
+            print("Unpaired reads found starting at {name}; exiting".format(
+                name=e.read1.name), file=sys.stderr)
+            sys.exit(1)
 
     print("DONE; split %d sequences (%d left, %d right, %d orphans)" %
           (counter1 + counter2, counter1, counter2, counter3), file=sys.stderr)
diff --git a/scripts/trim-low-abund.py b/scripts/trim-low-abund.py
index 8572575111..48b1fa296b 100755
--- a/scripts/trim-low-abund.py
+++ b/scripts/trim-low-abund.py
@@ -43,6 +43,7 @@
 
 Use -h for parameter help.
 """
+from contextlib import nullcontext
 import csv
 import sys
 import os
@@ -63,7 +64,7 @@
 from khmer.utils import write_record, broken_paired_reader, ReadBundle
 from khmer.kfile import (check_space, check_space_for_graph,
                          check_valid_file_exists, add_output_compression_type,
-                         get_file_writer)
+                         get_file_writer, FileWriter)
 from khmer.khmer_logger import configure_logging, log_info, log_error
 from khmer.trimming import trim_record
 
@@ -374,108 +375,111 @@ def main():
     # only create the file writer once if outfp is specified; otherwise,
     # create it for each file.
     if args.output:
-        trimfp = get_file_writer(args.output, args.gzip, args.bzip)
+        trim_ctx = FileWriter(args.output, args.gzip, args.bzip)
+    else:
+        trim_ctx = nullcontext()
 
     pass2list = []
-    for filename in args.input_filenames:
-        # figure out temporary filename for 2nd pass
-        pass2filename = filename.replace(os.path.sep, '-') + '.pass2'
-        pass2filename = os.path.join(tempdir, pass2filename)
-        pass2fp = open(pass2filename, 'w')
-
-        # construct output filenames
-        if args.output is None:
-            # note: this will be saved in trimfp.
-            outfp = open(os.path.basename(filename) + '.abundtrim', 'wb')
-
-            # get file handle w/gzip, bzip
-            trimfp = get_file_writer(outfp, args.gzip, args.bzip)
-
-        # record all this info
-        pass2list.append((filename, pass2filename, trimfp))
-
-        # input file stuff: get a broken_paired reader.
-        paired_iter = broken_paired_reader(ReadParser(filename), min_length=K,
-                                           force_single=args.ignore_pairs)
-
-        # main loop through the file.
-        n_start = trimmer.n_reads
-        save_start = trimmer.n_saved
-
-        watermark = REPORT_EVERY_N_READS
-        for read in trimmer.pass1(paired_iter, pass2fp):
-            if (trimmer.n_reads - n_start) > watermark:
-                log_info("... {filename} {n_saved} {n_reads} {n_bp} "
-                         "{w_reads} {w_bp}", filename=filename,
-                         n_saved=trimmer.n_saved, n_reads=trimmer.n_reads,
-                         n_bp=trimmer.n_bp, w_reads=written_reads,
-                         w_bp=written_bp)
-                watermark += REPORT_EVERY_N_READS
-
-            # write out the trimmed/etc sequences that AREN'T going to be
-            # revisited in a 2nd pass.
-            write_record(read, trimfp)
-            written_bp += len(read)
-            written_reads += 1
-        pass2fp.close()
-
-        log_info("{filename}: kept aside {kept} of {total} from first pass",
-                 filename=filename, kept=trimmer.n_saved - save_start,
-                 total=trimmer.n_reads - n_start)
-
-    # first pass goes across all the data, so record relevant stats...
-    n_reads = trimmer.n_reads
-    n_bp = trimmer.n_bp
-    n_skipped = trimmer.n_skipped
-    bp_skipped = trimmer.bp_skipped
-    save_pass2_total = trimmer.n_saved
-
-    # ### SECOND PASS. ###
-
-    # nothing should have been skipped yet!
-    assert trimmer.n_skipped == 0
-    assert trimmer.bp_skipped == 0
-
-    if args.single_pass:
-        pass2list = []
-
-    # go back through all the files again.
-    for _, pass2filename, trimfp in pass2list:
-        log_info('second pass: looking at sequences kept aside in {pass2}',
-                 pass2=pass2filename)
-
-        # note that for this second pass, we don't care about paired
-        # reads - they will be output in the same order they're read in,
-        # so pairs will stay together if not orphaned.  This is in contrast
-        # to the first loop.  Hence, force_single=True below.
-
-        read_parser = ReadParser(pass2filename)
-        paired_iter = broken_paired_reader(read_parser,
-                                           min_length=K,
-                                           force_single=True)
-
-        watermark = REPORT_EVERY_N_READS
-        for read in trimmer.pass2(paired_iter):
-            if (trimmer.n_reads - n_start) > watermark:
-                log_info('... x 2 {a} {b} {c} {d} {e} {f} {g}',
-                         a=trimmer.n_reads - n_start,
-                         b=pass2filename, c=trimmer.n_saved,
-                         d=trimmer.n_reads, e=trimmer.n_bp,
-                         f=written_reads, g=written_bp)
-                watermark += REPORT_EVERY_N_READS
-
-            write_record(read, trimfp)
-            written_reads += 1
-            written_bp += len(read)
-
-        read_parser.close()
-
-        log_info('removing {pass2}', pass2=pass2filename)
-        os.unlink(pass2filename)
-
-        # if we created our own trimfps, close 'em.
-        if not args.output:
-            trimfp.close()
+    with trim_ctx as trimfp:
+        for filename in args.input_filenames:
+            # figure out temporary filename for 2nd pass
+            pass2filename = filename.replace(os.path.sep, '-') + '.pass2'
+            pass2filename = os.path.join(tempdir, pass2filename)
+            pass2fp = open(pass2filename, 'w')
+
+            # construct output filenames
+            if args.output is None:
+                # note: this will be saved in trimfp.
+                outfp = open(os.path.basename(filename) + '.abundtrim', 'wb')
+
+                # get file handle w/gzip, bzip
+                trimfp = get_file_writer(outfp, args.gzip, args.bzip)
+
+            # record all this info
+            pass2list.append((filename, pass2filename, trimfp))
+
+            # input file stuff: get a broken_paired reader.
+            paired_iter = broken_paired_reader(ReadParser(filename), min_length=K,
+                                               force_single=args.ignore_pairs)
+
+            # main loop through the file.
+            n_start = trimmer.n_reads
+            save_start = trimmer.n_saved
+
+            watermark = REPORT_EVERY_N_READS
+            for read in trimmer.pass1(paired_iter, pass2fp):
+                if (trimmer.n_reads - n_start) > watermark:
+                    log_info("... {filename} {n_saved} {n_reads} {n_bp} "
+                             "{w_reads} {w_bp}", filename=filename,
+                             n_saved=trimmer.n_saved, n_reads=trimmer.n_reads,
+                             n_bp=trimmer.n_bp, w_reads=written_reads,
+                             w_bp=written_bp)
+                    watermark += REPORT_EVERY_N_READS
+
+                # write out the trimmed/etc sequences that AREN'T going to be
+                # revisited in a 2nd pass.
+                write_record(read, trimfp)
+                written_bp += len(read)
+                written_reads += 1
+            pass2fp.close()
+
+            log_info("{filename}: kept aside {kept} of {total} from first pass",
+                     filename=filename, kept=trimmer.n_saved - save_start,
+                     total=trimmer.n_reads - n_start)
+
+        # first pass goes across all the data, so record relevant stats...
+        n_reads = trimmer.n_reads
+        n_bp = trimmer.n_bp
+        n_skipped = trimmer.n_skipped
+        bp_skipped = trimmer.bp_skipped
+        save_pass2_total = trimmer.n_saved
+
+        # ### SECOND PASS. ###
+
+        # nothing should have been skipped yet!
+        assert trimmer.n_skipped == 0
+        assert trimmer.bp_skipped == 0
+
+        if args.single_pass:
+            pass2list = []
+
+        # go back through all the files again.
+        for _, pass2filename, trimfp in pass2list:
+            log_info('second pass: looking at sequences kept aside in {pass2}',
+                     pass2=pass2filename)
+
+            # note that for this second pass, we don't care about paired
+            # reads - they will be output in the same order they're read in,
+            # so pairs will stay together if not orphaned.  This is in contrast
+            # to the first loop.  Hence, force_single=True below.
+
+            read_parser = ReadParser(pass2filename)
+            paired_iter = broken_paired_reader(read_parser,
+                                               min_length=K,
+                                               force_single=True)
+
+            watermark = REPORT_EVERY_N_READS
+            for read in trimmer.pass2(paired_iter):
+                if (trimmer.n_reads - n_start) > watermark:
+                    log_info('... x 2 {a} {b} {c} {d} {e} {f} {g}',
+                             a=trimmer.n_reads - n_start,
+                             b=pass2filename, c=trimmer.n_saved,
+                             d=trimmer.n_reads, e=trimmer.n_bp,
+                             f=written_reads, g=written_bp)
+                    watermark += REPORT_EVERY_N_READS
+
+                write_record(read, trimfp)
+                written_reads += 1
+                written_bp += len(read)
+
+            read_parser.close()
+
+            log_info('removing {pass2}', pass2=pass2filename)
+            os.unlink(pass2filename)
+
+            # if we created our own trimfps, close 'em.
+            if not args.output:
+                trimfp.close()
 
     try:
         log_info('removing temp directory & contents ({temp})', temp=tempdir)