Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Export gene name, chr, start, end as separate columns in var of anndata file #21

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 16 additions & 7 deletions altanalyze3/components/aggregate/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,14 +66,20 @@ def add(self, sa, ea):
else:
assert(False), "Not implemented logic"
else: # partial match
if sa.strand == "+":
self.__partial_match.append((f"""{sa.gene}:{sa.exon}{sa_shift}-{ea.exon}{ea_shift}""", sa.strand)) # start - end exons order
elif sa.strand == "-":
self.__partial_match.append((f"""{sa.gene}:{ea.exon}{ea_shift}-{sa.exon}{sa_shift}""", sa.strand)) # end - start exons order
if sa.strand == "+": # start - end exons order
self.__partial_match.append(
(f"""{sa.gene}:{sa.exon}{sa_shift}-{ea.exon}{ea_shift}""", sa.strand)
)
elif sa.strand == "-": # end - start exons order
self.__partial_match.append(
(f"""{sa.gene}:{ea.exon}{ea_shift}-{sa.exon}{sa_shift}""", sa.strand)
)
else:
assert(False), "Not implemented logic"
else: # different gene and/or strand
self.__distant_match.append((f"""{sa.gene}:{sa.exon}{sa_shift}-{ea.gene}:{ea.exon}{ea_shift}""", ".")) # distant match
self.__distant_match.append( # distant match
(f"""{sa.gene}:{sa.exon}{sa_shift}-{ea.gene}:{ea.exon}{ea_shift}""", ".")
)

def best(self):
all_annotations = self.__exact_match + self.__partial_match + self.__distant_match
Expand Down Expand Up @@ -348,7 +354,7 @@ def collect_results(args):
logging.info(f"""Loading pickled introns counts from {args.int_counts_location}""")
int_counts_df = pandas.read_pickle(args.int_counts_location)
int_counts_df["annotation"] = int_counts_df.index.to_frame(index=False).name.values
int_counts_df.reset_index(level="strand", inplace=True) # need to move "strand" from the index to a column
int_counts_df.reset_index(level="strand", inplace=True) # need to move "strand" from the index to a column

if counts_df is None:
counts_df = int_counts_df
Expand All @@ -358,9 +364,12 @@ def collect_results(args):

counts_df.sort_index(ascending=True, inplace=True)

logging.debug("Adding column with gene name")
counts_df["gene"] = counts_df["annotation"].str.split(":", expand=True).loc[:, 0] # in case of a distant match when we have two genes, we take the first one

adata_location = args.output.with_suffix(".h5ad")
logging.info(f"""Exporting aggregated counts to {adata_location}""")
metadata_columns = ["annotation", "strand"]
metadata_columns = ["chr", "start", "end", "gene", "strand", "annotation"] # may include column names from the index
export_counts_to_anndata(
counts_df=counts_df,
location=adata_location,
Expand Down
5 changes: 3 additions & 2 deletions altanalyze3/utilities/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,8 @@ def __get_name(series, strand_coords):
csr_matrix = counts_df.loc[:, counts_columns].astype(pandas.SparseDtype(sparse_dtype, fill_value)).T.sparse.to_coo().tocsr()
adata = anndata.AnnData(csr_matrix, dtype=sparse_dtype)
adata.obs_names = counts_columns
adata_var = counts_df.copy().loc[:, metadata_columns].astype(str) # can be empty df if metadata_columns is []
adata_var.index = adata_var.reset_index().agg(__get_name, axis="columns", strand_coords=strand_coords)
counts_df_no_index = counts_df.copy().reset_index() # index is moved to columns
adata_var = counts_df_no_index.loc[:, metadata_columns] # can be empty df if metadata_columns is []
adata_var.index = counts_df_no_index.agg(__get_name, axis="columns", strand_coords=strand_coords)
adata.var = adata_var
adata.write(location)