SalomonisLab · michael-kotliar · Apr 7, 2023
diff --git a/altanalyze3/components/aggregate/main.py b/altanalyze3/components/aggregate/main.py
@@ -66,14 +66,20 @@ def add(self, sa, ea):
                 else:
                     assert(False), "Not implemented logic"
             else:                                                                                                    #    partial match
-                if sa.strand == "+":
-                    self.__partial_match.append((f"""{sa.gene}:{sa.exon}{sa_shift}-{ea.exon}{ea_shift}""", sa.strand))  #     start - end exons order
-                elif sa.strand == "-":
-                    self.__partial_match.append((f"""{sa.gene}:{ea.exon}{ea_shift}-{sa.exon}{sa_shift}""", sa.strand))  #     end - start exons order
+                if sa.strand == "+":                                                                                 #        start - end exons order
+                    self.__partial_match.append(
+                        (f"""{sa.gene}:{sa.exon}{sa_shift}-{ea.exon}{ea_shift}""", sa.strand)
+                    )
+                elif sa.strand == "-":                                                                               #        end - start exons order
+                    self.__partial_match.append(
+                        (f"""{sa.gene}:{ea.exon}{ea_shift}-{sa.exon}{sa_shift}""", sa.strand)
+                    )
                 else:
                     assert(False), "Not implemented logic"
         else:                                                                                                        # different gene and/or strand
-            self.__distant_match.append((f"""{sa.gene}:{sa.exon}{sa_shift}-{ea.gene}:{ea.exon}{ea_shift}""", "."))   #    distant match
+            self.__distant_match.append(                                                                             #    distant match
+                (f"""{sa.gene}:{sa.exon}{sa_shift}-{ea.gene}:{ea.exon}{ea_shift}""", ".")
+            )
 
     def best(self):
         all_annotations = self.__exact_match + self.__partial_match + self.__distant_match
@@ -348,7 +354,7 @@ def collect_results(args):
         logging.info(f"""Loading pickled introns counts from {args.int_counts_location}""")
         int_counts_df = pandas.read_pickle(args.int_counts_location)
         int_counts_df["annotation"] = int_counts_df.index.to_frame(index=False).name.values
-        int_counts_df.reset_index(level="strand", inplace=True)                                     # need to move "strand" from the index to a column
+        int_counts_df.reset_index(level="strand", inplace=True)                                # need to move "strand" from the index to a column
 
     if counts_df is None:
         counts_df = int_counts_df
@@ -358,9 +364,12 @@ def collect_results(args):
 
     counts_df.sort_index(ascending=True, inplace=True)
 
+    logging.debug("Adding column with gene name")
+    counts_df["gene"] = counts_df["annotation"].str.split(":", expand=True).loc[:, 0]          # in case of a distant match when we have two genes, we take the first one
+
     adata_location = args.output.with_suffix(".h5ad")
     logging.info(f"""Exporting aggregated counts to {adata_location}""")
-    metadata_columns = ["annotation", "strand"]
+    metadata_columns = ["chr", "start", "end", "gene", "strand", "annotation"]                 # may include column names from the index
     export_counts_to_anndata(
         counts_df=counts_df,
         location=adata_location,

diff --git a/altanalyze3/utilities/io.py b/altanalyze3/utilities/io.py
@@ -147,7 +147,8 @@ def __get_name(series, strand_coords):
     csr_matrix = counts_df.loc[:, counts_columns].astype(pandas.SparseDtype(sparse_dtype, fill_value)).T.sparse.to_coo().tocsr()
     adata = anndata.AnnData(csr_matrix, dtype=sparse_dtype)
     adata.obs_names = counts_columns
-    adata_var = counts_df.copy().loc[:, metadata_columns].astype(str)    # can be empty df if metadata_columns is []
-    adata_var.index = adata_var.reset_index().agg(__get_name, axis="columns", strand_coords=strand_coords)
+    counts_df_no_index = counts_df.copy().reset_index()                    # index is moved to columns
+    adata_var = counts_df_no_index.loc[:, metadata_columns]                # can be empty df if metadata_columns is []
+    adata_var.index = counts_df_no_index.agg(__get_name, axis="columns", strand_coords=strand_coords)
     adata.var = adata_var
     adata.write(location)