Skip to content

Commit

Permalink
Changes to QC pdf
Browse files Browse the repository at this point in the history
Fixes and changes:
- Removed "number of novel genes based on isoform number" heatmap in QC pdf (previously page 9)
- Increased font size of all plots in QC pdf
- Changed colours and added patterns to "Isoforms/Exons proportion" and "Number of 5' and 3' gene extensions" plots to match colours of other plots (Page 5, 9 and 14 of new pdf)
- Added ggpattern to environment dependencies
  • Loading branch information
Nicolaï Hoffmann committed Jul 10, 2024
1 parent 07aa9c1 commit 4492703
Show file tree
Hide file tree
Showing 4 changed files with 23 additions and 31 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ ANNEXA works by using only three parameter files (a reference genome, a referenc
2. Transcriptome reconstruction and quantification with [bambu](https://github.com/GoekeLab/bambu) or [StringTie](https://github.com/gpertea/stringtie).
3. Novel classification with [FEELnc](https://github.com/tderrien/FEELnc).
4. Retrieve information from input annotation and format final gtf with 3 level structure: gene -> transcript -> exon.
5. Predict the CDS of novel protein-coding transcripts with [Transdecoder](https://github.com/TransDecoder/TransDecoder).
5. Predict the CDS of novel protein-coding transcripts with [TransDecoder](https://github.com/TransDecoder/TransDecoder).
6. Filter novel transcripts based on [bambu NDR (Novel Discovery Rates)](https://github.com/GoekeLab/bambu) and/or [TransforKmers TSS validation](https://github.com/IGDRion/transforkmers) to assess fulllength transcripts.
7. Perform a quality control of both the full and filtered extended annotations (see [example](https://github.com/igdrion/ANNEXA/blob/master/examples/results/qc_gtf.pdf)).
8. Optional: Check gene body coverage with [RSeQC](http://rseqc.sourceforge.net/#genebody-coverage-py).
Expand Down
Binary file modified assets/metro_map.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
51 changes: 21 additions & 30 deletions bin/qc.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,13 @@ library(RColorBrewer)
library(grid)
library(gridExtra)
library(viridis)
library(ggpattern)

brew = c("#008387", "#a0d3db", "#ad8500", "#d9cb98")
palette = c("#00AFBB", "#FC4E07", "#E7B800")

theme_set(
theme_pubr() +
theme_pubr(base_size = 15) +
theme(
panel.grid.major = element_blank(),
axis.line.y = element_blank(),
Expand Down Expand Up @@ -74,17 +75,18 @@ len = gene %>%
# Nb isoformes
iso = gene %>%
mutate(isoformes = ifelse(nb_transcripts == 1, "1", "2+")) %>%
ggplot(aes(x = discovery, fill = isoformes)) +
ggplot(aes(x = discovery, fill = interaction(discovery,gene_biotype), pattern = isoformes)) +
ggtitle("Proportion of mono versus multi-isoform genes") +
geom_bar(position = "fill",
geom_bar_pattern(position = "fill",
alpha = 0.8,
colour = "black") +
scale_y_continuous(labels = scales::percent) +
facet_grid(~ gene_biotype) +
scale_fill_manual(values = palette) +
scale_fill_manual(values = brew) +
scale_pattern_manual(values = c("1" = "stripe", "2+" = "none")) +
guides(fill = "none", pattern = guide_legend(title = "Isoforms")) +
theme(axis.title.x = element_blank())


# Nombre de gène chaque catégorie
count = ggplot(data = gene, aes(x = gene_biotype, fill = paste(gene_biotype, discovery))) +
ggtitle("Number of genes") +
Expand Down Expand Up @@ -170,40 +172,28 @@ gene_counts_sample = gene %>%
.x))
)

# Heatmap Nb_isoforms and samples
gene_tx_samples = gene %>%
filter(discovery == "novel") %$%
as.data.frame(table(.$presents_in_sample, .$nb_transcripts)) %>%
ggplot(aes(
x = Var1,
y = Var2,
fill = log(Freq + 1)
)) +
guides(fill=FALSE) +
ggtitle("Number of novel genes (log) based on isoform\nnumber and number of samples with at least 1 count") +
geom_tile() +
labs(x = "Number of samples with at least 1 count", y = "Number of isoforms", fill =
"log(count)") +
scale_fill_viridis()

# 5'-3' extensions count
gene_ext = gene %>%
filter(ext_5 > 0 | ext_3 > 0) %>%
mutate(ext = ifelse(ext_5 > 0 & ext_3 > 0, "5'-3'",
ifelse(ext_5 > 0, "5'", "3'"))) %>%
ggplot(aes(x = gene_biotype, fill = ext)) +
ggplot(aes(x = gene_biotype, fill = gene_biotype, pattern = ext, pattern_angle = ext)) +
ggtitle("Number of 5' and 3' gene extensions") +
geom_bar(colour = "black",
geom_bar_pattern(colour = "black",
alpha = 0.8,
position = position_dodge()) +
scale_fill_manual(values = palette) +
scale_fill_manual(values = brew[c(2,4)]) +
scale_pattern_manual(values = c("stripe","stripe","crosshatch")) +
scale_pattern_angle_manual(values = c(45, -45, 45)) +
theme(legend.position = "top") +
theme(axis.title.x = element_blank()) +
labs(fill = "Gene extension", y = "Count") +
labs(y = "Count") +
guides(fill = "none",
pattern = guide_legend(title = "Gene extension"),
pattern_angle = guide_legend(title = "Gene extension")) +
scale_y_log10() +
annotation_logticks(sides = "l")


# 5'-3' extensions distribution
labels = c("5'", "3'")
names(labels) = c("ext_5", "ext_3")
Expand Down Expand Up @@ -285,14 +275,16 @@ tx_len = transcript %>%
# Mono vs multiexonique
tx_ex = transcript %>%
mutate(exons = ifelse(nb_exons == 1, "1", "2+")) %>%
ggplot(aes(x = tx_discovery, fill = exons)) +
ggplot(aes(x = tx_discovery, fill = interaction(tx_discovery, transcript_biotype), pattern = exons)) +
ggtitle("Proportion of mono versus multi exonic transcripts") +
geom_bar(position = "fill",
geom_bar_pattern(position = "fill",
alpha = 0.8,
colour = "black") +
scale_y_continuous(labels = scales::percent) +
facet_grid(~ transcript_biotype) +
scale_fill_manual(values = palette) +
scale_fill_manual(values = brew) +
scale_pattern_manual(values = c("1" = "stripe", "2+" = "none")) +
guides(fill = "none", pattern = guide_legend(title = "Exons")) +
theme(axis.title.x = element_blank())

# Count
Expand Down Expand Up @@ -422,7 +414,6 @@ iso
gene_counts
gene_validate
gene_counts_sample
gene_tx_samples
gene_ext
gene_ext_dist

Expand Down
1 change: 1 addition & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ dependencies:
- conda-forge::r-ggpubr
- conda-forge::r-ggridges
- conda-forge::r-viridis
- conda-forge::r-ggpattern

- conda-forge::procps-ng
- conda-forge::mkl<=2024.0
Expand Down

0 comments on commit 4492703

Please sign in to comment.