Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Added new action create-feature-table #20

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions q2_amrfinderplus/feature_table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import pandas as pd

from q2_amrfinderplus.types import AMRFinderPlusAnnotationsDirFmt


def create_feature_table(
    annotations: AMRFinderPlusAnnotationsDirFmt,
) -> pd.DataFrame:
    """Build a gene-per-contig frequency table from AMRFinderPlus annotations.

    Every file listed by ``annotations.annotation_dict()`` is read as a
    tab-separated table. Hits that are identical in contig, gene symbol, start,
    stop and strand are collapsed into one, and the remaining hits are counted
    per contig and gene symbol.

    Parameters
    ----------
    annotations
        Object whose ``annotation_dict()`` returns either a flat mapping of
        ID -> file path or a nested mapping of sample ID -> {ID -> file path}.

    Returns
    -------
    pd.DataFrame
        Cross-tabulation indexed by "Contig id" with one column per
        "Gene symbol"; each cell is the number of distinct hits.

    Raises
    ------
    ValueError
        If there are no annotation files at all, if a file is completely
        empty, or if a file cannot be parsed with the required positional
        columns (e.g. annotations created solely from protein data).
    """
    sample_dict = annotations.annotation_dict()

    # A flat mapping (ID -> file path) has string values. Wrap it in a single
    # placeholder sample so both layouts can be traversed the same way.
    if isinstance(next(iter(sample_dict.values()), None), str):
        sample_dict = {"": sample_dict}

    required_columns = ["Contig id", "Gene symbol", "Start", "Stop", "Strand"]

    # Collect the per-file frames and concatenate once at the end; calling
    # pd.concat inside the loop re-copies the accumulated data for every file.
    frames = []
    for file_dict in sample_dict.values():
        for file_fp in file_dict.values():
            try:
                file_df = pd.read_csv(
                    filepath_or_buffer=file_fp,
                    sep="\t",
                    usecols=required_columns,
                )
            # EmptyDataError is itself a ValueError, so it has to be handled
            # before the generic ValueError branch below.
            except pd.errors.EmptyDataError as e:
                raise ValueError(
                    "File is empty. All mutations output is empty if no organism was "
                    f"specified.\n\nOriginal error: {e}"
                ) from e
            except ValueError as e:
                raise ValueError(
                    "If the annotations were created solely from protein data, there "
                    "is no positional information and no gene abundance per contig "
                    f"can be calculated.\n\nOriginal error: {e}"
                ) from e
            frames.append(file_df)

    # Without any file there are no columns to tabulate; fail with a clear
    # message instead of an opaque KeyError on the missing "Contig id" column.
    if not frames:
        raise ValueError("No annotation files were found in the provided annotations.")

    # The same hit can be reported more than once; keep one row per distinct
    # (contig, gene symbol, start, stop, strand) combination before counting.
    df = pd.concat(frames, ignore_index=True).drop_duplicates(keep="first")

    return pd.crosstab(df["Contig id"], df["Gene symbol"])
15 changes: 15 additions & 0 deletions q2_amrfinderplus/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from q2_types.feature_data import FeatureData
from q2_types.feature_data_mag import MAG
from q2_types.feature_table import FeatureTable, Frequency
from q2_types.genome_data import Genes, GenomeData, Loci, Proteins
from q2_types.per_sample_sequences import Contigs, MAGs
from q2_types.sample_data import SampleData
Expand All @@ -18,6 +19,7 @@
from q2_amrfinderplus import __version__
from q2_amrfinderplus.annotate import annotate
from q2_amrfinderplus.database import fetch_amrfinderplus_db
from q2_amrfinderplus.feature_table import create_feature_table
from q2_amrfinderplus.types._format import (
AMRFinderPlusAnnotationFormat,
AMRFinderPlusAnnotationsDirFmt,
Expand Down Expand Up @@ -255,6 +257,19 @@
citations=[citations["feldgarden2021amrfinderplus"]],
)

# Register create_feature_table as a plugin method: it takes AMRFinderPlus
# annotations as its only input, has no parameters, and produces a single
# frequency feature table.
plugin.methods.register_function(
    name="Gene per contig frequency table",
    description=(
        "Create a gene per contig frequency table from AMRFinderPlus annotations."
    ),
    function=create_feature_table,
    inputs={"annotations": GenomeData[AMRFinderPlusAnnotations]},
    input_descriptions={"annotations": "AMR annotations."},
    parameters={},
    parameter_descriptions={},
    outputs=[("table", FeatureTable[Frequency])],
    output_descriptions={"table": "Frequency of AMR genes per contig."},
)

plugin.register_semantic_type_to_format(
AMRFinderPlusDatabase,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Protein identifier Contig id Start Stop Strand Gene symbol Sequence name Scope Element type Element subtype Class Subclass Method Target length Reference sequence length % Coverage of reference sequence % Identity to reference sequence Alignment length Accession of closest sequence Name of closest sequence HMM id HMM description Hierarchy node
blaTEM-156 contig01 101 961 + blaTEM-156 class A beta-lactamase TEM-156 core AMR AMR BETA-LACTAM BETA-LACTAM ALLELEP 286 286 100.00 100.00 286 WP_061158039.1 class A beta-lactamase TEM-156 NF000531.2 TEM family class A beta-lactamase blaTEM-156
blaPDC-114_blast contig02 1 1191 + blaPDC PDC family class C beta-lactamase core AMR AMR BETA-LACTAM CEPHALOSPORIN INTERNAL_STOP 397 397 100.00 99.75 397 WP_061189306.1 class C beta-lactamase PDC-114 NF000422.6 PDC family class C beta-lactamase blaPDC
blaOXA-436_partial contig03 101 802 + blaOXA OXA-48 family class D beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM INTERNAL_STOP 233 265 87.92 100.00 233 WP_058842180.1 OXA-48 family carbapenem-hydrolyzing class D beta-lactamase OXA-436 NF012161.0 class D beta-lactamase blaOXA-48_fam
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Protein identifier Contig id Start Stop Strand Gene symbol Sequence name Scope Element type Element subtype Class Subclass Method Target length Reference sequence length % Coverage of reference sequence % Identity to reference sequence Alignment length Accession of closest sequence Name of closest sequence HMM id HMM description Hierarchy node
NA contig08 101 700 + blaTEM TEM family class A beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM INTERNAL_STOP 200 286 69.93 98.00 200 WP_110174956.1 class A beta-lactamase TEM-235 NA NA blaTEM
NA contig08 101 700 + blaTEM TEM family class A beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM INTERNAL_STOP 200 286 69.93 98.00 200 WP_122630841.1 class A beta-lactamase TEM-237 NA NA blaTEM
emrD3-suppressed-in-vibrio contig13 1 1137 + emrD3 multidrug efflux MFS transporter EmrD-3 plus AMR AMR EFFLUX EFFLUX EXACTP 379 379 100.00 100.00 379 ABQ18953.1 multidrug efflux MFS transporter EmrD-3 NA NA emrD3
arsR-suppressed-in-escherichia contig13 1141 1491 + arsR As(III)-sensing metalloregulatory transcriptional repressor ArsR plus STRESS METAL ARSENIC ARSENIC EXACTP 117 117 100.00 100.00 117 BAE77793.1 As(III)-sensing metalloregulatory transcriptional repressor ArsR NA NA arsR_K-12
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Protein identifier Gene symbol Sequence name Scope Element type Element subtype Class Subclass Method Target length Reference sequence length % Coverage of reference sequence % Identity to reference sequence Alignment length Accession of closest sequence Name of closest sequence HMM id HMM description Hierarchy node
aph3pp-Ib_partial_5p_neg aph(3'')-Ib aminoglycoside O-phosphotransferase APH(3'')-Ib core AMR AMR AMINOGLYCOSIDE STREPTOMYCIN PARTIALP 225 267 81.27 100.00 217 WP_001082319.1 aminoglycoside O-phosphotransferase APH(3'')-Ib NF032896.1 APH(3'') family aminoglycoside O-phosphotransferase aph(3'')-Ib
blaOXA-436_partial blaOXA OXA-48 family class D beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM PARTIALP 233 265 87.92 100.00 233 WP_058842180.1 OXA-48 family carbapenem-hydrolyzing class D beta-lactamase OXA-436 NF012161.0 class D beta-lactamase blaOXA-48_fam
43 changes: 43 additions & 0 deletions q2_amrfinderplus/tests/test_feature_table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import pandas as pd
from qiime2.plugin.testing import TestPluginBase

from q2_amrfinderplus.feature_table import create_feature_table
from q2_amrfinderplus.types import AMRFinderPlusAnnotationsDirFmt


class TestFetchAMRFinderPlusDB(TestPluginBase):
    """Tests for ``create_feature_table``.

    NOTE(review): the class name refers to the database fetch action although
    every test here exercises ``create_feature_table`` -- presumably a
    copy-paste leftover; consider renaming.
    """

    package = "q2_amrfinderplus.tests"

    def test_create_feature_table(self):
        """Contig annotations yield the expected gene-per-contig counts."""
        contig_ids = pd.Index(
            ["contig01", "contig02", "contig03", "contig08", "contig13"],
            name="Contig id",
        )
        gene_counts = {
            "arsR": [0, 0, 0, 0, 1],
            "blaOXA": [0, 0, 1, 0, 0],
            "blaPDC": [0, 1, 0, 0, 0],
            "blaTEM": [0, 0, 0, 1, 0],
            "blaTEM-156": [1, 0, 0, 0, 0],
            "emrD3": [0, 0, 0, 0, 1],
        }
        expected = pd.DataFrame(gene_counts, index=contig_ids)
        expected.columns.name = "Gene symbol"

        data_dir = self.get_data_path("annotations_contigs")
        observed = create_feature_table(
            AMRFinderPlusAnnotationsDirFmt(data_dir, mode="r")
        )

        pd.testing.assert_frame_equal(expected, observed)

    def test_value_error(self):
        """Protein-only annotations lack the positional columns and are rejected."""
        data_dir = self.get_data_path("annotations_protein")
        protein_annotations = AMRFinderPlusAnnotationsDirFmt(data_dir, mode="r")

        self.assertRaisesRegex(
            ValueError,
            "solely from protein data",
            create_feature_table,
            protein_annotations,
        )

    def test_empty_data_error(self):
        """A zero-byte annotation file is reported as an empty file."""
        empty_annotations = AMRFinderPlusAnnotationsDirFmt()
        # Create a zero-byte file inside the freshly created directory format.
        open(empty_annotations.path / "sample1_amr_all_mutations.tsv", "w").close()

        self.assertRaisesRegex(
            ValueError,
            "File is empty",
            create_feature_table,
            empty_annotations,
        )
Loading