Skip to content

Commit

Permalink
unpackaging all scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
carolinamonzo committed Apr 22, 2024
1 parent 268ccaa commit 4b3f4e5
Show file tree
Hide file tree
Showing 77 changed files with 469,580 additions and 0 deletions.
212 changes: 212 additions & 0 deletions IsoAnnot/InterproScan_install.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
#!/bin/bash
# Title: InterProScan installation script for IsoAnnot
# Author: Darío González
# Description: This file installs InterProScan within IsoAnnot
#
# All paths used further down (software/interproscan, software/temp) are
# relative, so the script is meant to be started from the IsoAnnot folder.


# Variables
# Release archive of InterProScan and the MD5 file published next to it.
INTERPROSCAN="https://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/5.62-94.0/interproscan-5.62-94.0-64-bit.tar.gz"
INTERPROSCAN_HASH="https://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/5.62-94.0/interproscan-5.62-94.0-64-bit.tar.gz.md5"
INTERPROSCAN_DOWNLOAD=$(basename "$INTERPROSCAN") # name of the downloaded file
INTERPROSCAN_HASH_DOWNLOAD=$(basename "$INTERPROSCAN_HASH") # name of the hash file

# License/request pages shown to the user; the actual download links arrive by e-mail.
SIGNALP_DOWNLOAD_PAGE="https://services.healthtech.dtu.dk/cgi-bin/sw_request?software=signalp&version=4.1&packageversion=4.1g&platform=Linux"
TMHMM_DOWNLOAD_PAGE="https://services.healthtech.dtu.dk/cgi-bin/sw_request?software=tmhmm&version=2.0c&packageversion=2.0c&platform=Linux"

# Show the banner and ask the user which mode to run.
# The answer is stored in execution_option and evaluated right below:
#   a -> full installation, b -> only re-point SignalP to the new folder.
printf '%s\n' \
  " ###################################################" \
  " ## InterProScan installation script for IsoAnnot ##" \
  " ###################################################" \
  "" \
  "Please choose one option of the following: " \
  " a) I want to INSTALL InterProScan from scratch." \
  " b) I moved the location of the IsoAnnot folder and want to MODIFY InterProScan installation PATHS." \
  ""
# -r keeps backslashes in the typed answer literal.
read -r -p "Enter option a or b: " execution_option


# ---------------------------------------------------------------------------
# Helpers used by the two execution modes below.
# They read the variables defined at the top of the script (INTERPROSCAN,
# INTERPROSCAN_HASH, INTERPROSCAN_DOWNLOAD, INTERPROSCAN_HASH_DOWNLOAD,
# SIGNALP_DOWNLOAD_PAGE, TMHMM_DOWNLOAD_PAGE) and execution_option.
# ---------------------------------------------------------------------------

# Print a message on stderr and abort the script with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Download the InterProScan archive and its MD5 file and verify the checksum.
# Retries up to 3 times; exits with status 1 when every attempt fails.
download_interproscan() {
  local max_retries=3
  local try=0
  local expected_hash downloaded_hash

  while (( try < max_retries )); do
    echo "# Downloading the file #"
    echo "This may take a while"
    # -O pins the output name, so the file verified below is always the one
    # just fetched (without it wget writes "<name>.1" next to a stale copy).
    wget -O "$INTERPROSCAN_DOWNLOAD" "$INTERPROSCAN"
    wget -O "$INTERPROSCAN_HASH_DOWNLOAD" "$INTERPROSCAN_HASH"

    echo "# Checking integrity of the download #"
    expected_hash=$(awk '{print $1}' "$INTERPROSCAN_HASH_DOWNLOAD")
    downloaded_hash=$(md5sum "$INTERPROSCAN_DOWNLOAD" | awk '{print $1}')

    # Both values are quoted and the expected hash must be non-empty:
    # two empty strings (both downloads failed) must not count as a match.
    if [[ -n "$expected_hash" && "$downloaded_hash" == "$expected_hash" ]]; then
      echo "File downloaded successfully and verified!"
      return 0
    fi

    echo "The downloaded file is corrupted!"
    echo "Retrying..."
    try=$((try + 1))
    rm -f -- "$INTERPROSCAN_DOWNLOAD" "$INTERPROSCAN_HASH_DOWNLOAD"
  done

  echo "Failed to download and verify the file after $try attempts."
  echo "Aborting."
  exit 1
}

# Unpack the verified archive into software/interproscan and run the
# setup script shipped with InterProScan.
setup_interproscan() {
  # Extract the files directly to the folder IsoAnnot needs
  echo "# Decompressing the download #"
  tar -pxvzf "$INTERPROSCAN_DOWNLOAD" --one-top-level=software/interproscan --strip-components 1 \
    || die "Could not extract $INTERPROSCAN_DOWNLOAD."

  echo "## Configuring InterProScan ##"
  # Stop if the folder is missing: continuing would run setup.py somewhere
  # else and the "cd ../.." afterwards would leave the IsoAnnot folder.
  cd software/interproscan/ || die "Cannot enter software/interproscan/."
  python3 setup.py -f interproscan.properties
  cd ../.. || die "Cannot return to the IsoAnnot folder."
}

# Ask for the e-mailed download link until the user stops answering "no".
# Sets the globals LINK and ANSWER.
prompt_for_link() {
  ANSWER="No" # set the variable for input loop
  while [[ $ANSWER =~ ^[Nn][Oo]$ ]]; do
    read -r -p "Paste the link from the email here: " LINK
    read -r -p "Check the link is correct. Do you confirm the link? [yes/no]: " ANSWER
  done
}

# Rewrite line 13 of the `signalp` script so that $ENV{SIGNALP} points to the
# current absolute location of software/interproscan/bin/signalp/4.1.
# The previous script is kept as signalp_backup. Sets the global PATH_TO_SIGNALP.
configure_signalp_path() {
  local signalp_dir=software/interproscan/bin/signalp/4.1

  echo "Configuring SignalP"
  [[ -f "$signalp_dir/signalp" ]] || die "SignalP script not found in $signalp_dir."
  PATH_TO_SIGNALP=$(realpath "$signalp_dir")
  mv "$signalp_dir/signalp" "$signalp_dir/signalp_backup"
  awk -v var=" \$ENV{SIGNALP} = '$PATH_TO_SIGNALP'" '{ if (NR == 13) print var; else print $0}' "$signalp_dir/signalp_backup" > "$signalp_dir/signalp"
  # The redirection creates a new file, so the executable bit of the original
  # script is not carried over; restore it because the file is registered as
  # binary.signalp.path in interproscan.properties.
  chmod +x "$signalp_dir/signalp"
}

# Guide the user through the licensed SignalP download, unpack it into the
# InterProScan tree and register it in interproscan.properties.
install_signalp() {
  local props=software/interproscan/interproscan.properties
  local signalp_dir=software/interproscan/bin/signalp/4.1

  # Print SignalP manual installation message
  echo
  echo "# Installing SignalP #"
  echo "Follow the link bellow to the download page for SignalP, read and accept the license."
  echo "You will receive an email with the download link for the software when you accept."
  echo
  echo "$SIGNALP_DOWNLOAD_PAGE"
  echo

  prompt_for_link

  # Download and decompress the program
  echo "Downloading file"
  wget -P software/temp -nd -np -r -nH -l1 -A "signalp*.tar.gz" "$LINK"

  # The archive is unpacked straight into its final location (top-level
  # folder stripped), so no separate "move" step is needed: nothing is ever
  # unpacked under software/temp for SignalP.
  mkdir -p "$signalp_dir"
  # The glob is intentionally unquoted: it resolves to the downloaded tarball.
  tar -xvzf software/temp/signalp-* -C "$signalp_dir/" --strip-components=1

  # Modify the `signalp` script so it can execute correctly
  configure_signalp_path
  echo "$PATH_TO_SIGNALP"

  # Add lines to InterProScan config file to activate SignalP
  echo "Adding SignalP to InterProScan"
  {
    printf '\n\n'
    echo "# SignalP"
    echo "signalp_euk.signature.library.release=4.1"
    echo "signalp_gram_positive.signature.library.release=4.1"
    echo "signalp_gram_negative.signature.library.release=4.1"
    echo "binary.signalp.path=bin/signalp/4.1/signalp"
    echo "signalp.perl.library.dir=bin/signalp/4.1/lib"
  } >> "$props"

  echo "Finished installing SignalP"
}

# Guide the user through the licensed TMHMM download, copy its bin/ and lib/
# content into the InterProScan tree and register it in interproscan.properties.
install_tmhmm() {
  local props=software/interproscan/interproscan.properties

  # Print TMHMM manual installation message
  echo
  echo "# Installing TMHMM #"
  echo "Follow the link bellow to the download page for TMHMM, read and accept the license."
  echo "You will receive an email with the download link for the software when you accept."
  echo
  echo "$TMHMM_DOWNLOAD_PAGE"
  echo

  prompt_for_link

  # Download and decompress the program
  echo "Downloading file"
  wget -P software/temp -nd -np -r -nH -l1 -A "tmhmm*.tar.gz" "$LINK"

  tar -xvzf software/temp/tmhmm-*.tar.gz -C software/temp/

  # Move files to InterProScan folder
  echo "Moving required files to InterProScan file tree"
  # Make sure both targets exist so mv treats them as directories.
  mkdir -p software/interproscan/bin/tmhmm/2.0c software/interproscan/data/tmhmm/2.0c
  mv software/temp/tmhmm*/bin/* software/interproscan/bin/tmhmm/2.0c/
  mv software/temp/tmhmm*/lib/* software/interproscan/data/tmhmm/2.0c/

  # Add lines to InterProScan config file to activate TMHMM
  echo "Adding TMHMM to InterProScan"
  {
    printf '\n\n'
    echo "# TMHMM"
    echo "tmhmm.signature.library.release=2.0c"
    echo "binary.tmhmm.path=bin/tmhmm/2.0c/decodeanhmm.Linux_x86_64"
    echo "tmhmm.model.path=data/tmhmm/2.0c/TMHMM2.0.model"
  } >> "$props"

  echo "Finished installing TMHMM"
}

# ---------------------------------------------------------------------------
# Dispatch on the option chosen by the user.
# ---------------------------------------------------------------------------
case "$execution_option" in
  a)
    echo "# Downloading InterProScan #"
    download_interproscan
    setup_interproscan

    # Install the proprietary databases in InterProScan
    echo
    echo "###############################################################"
    echo "## INSTALL LICENSED DATABASES ##"
    echo "## ##"
    echo "## IsoAnnot uses 2 licensed databases from InterProScan. ##"
    echo "## You can download them for free for academic purposes. ##"
    echo "## ##"
    echo "## This script helps you automate the installation for each ##"
    echo "## database, but you will need to register and accept the ##"
    echo "## licenses on your own. ##"
    echo "## ##"
    echo "## IF YOU DON'T INSTALL THIS DATABASES ISOANNOT WON'T WORK!! ##"
    echo "###############################################################"
    echo
    read -r -p "Continue with the installation? [yes/no]: " ANSWER

    # Exit the program if the user requested it
    if [[ $ANSWER =~ ^[Nn][Oo]$ ]]; then
      echo "Installation cancelled."
      exit 1
    fi

    # Create a temporary folder to store the downloaded files
    mkdir -p software/temp

    install_signalp
    install_tmhmm

    echo "# Removing leftover files #"
    rm -rf software/temp

    printf '\n\n'
    echo "Your installation of InterProScan is complete!"
    ;;
  b)
    # The IsoAnnot folder was moved: only the absolute path stored inside
    # the `signalp` script has to be refreshed.
    configure_signalp_path
    ;;
  *)
    echo "Try again and choose option a or b."
    exit 1
    ;;
esac
3 changes: 3 additions & 0 deletions IsoAnnot/config/ensembl/athaliana/Snakefile.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Entry Snakefile for Arabidopsis thaliana (Ensembl): load this species'
# config.yaml, then include the generic Ensembl Snakefile.
configfile: "config/ensembl/athaliana/config.yaml"

include: "../../generic/Snakefile_ensembl.smk"
29 changes: 29 additions & 0 deletions IsoAnnot/config/ensembl/athaliana/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# IsoAnnot configuration for Arabidopsis thaliana using Ensembl as the
# reference database (see `db` below).
species: Arabidopsis thaliana
species_name: arabidopsis

# BioMart host and dataset.
# NOTE(review): the host has no URL scheme here, while the other Ensembl
# species configs use "http://www.ensembl.org" - confirm the consumer
# accepts a bare host name.
biomart_host: plants.ensembl.org
biomart_dataset: athaliana_eg_gene

# Ensembl Genomes (plants) release 45 downloads for assembly TAIR10:
# cDNA FASTA, peptide FASTA, GTF annotation and top-level genome FASTA.
ensembl_cdna: ftp://ftp.ensemblgenomes.org/pub/release-45/plants/fasta/arabidopsis_thaliana/cdna/Arabidopsis_thaliana.TAIR10.cdna.all.fa.gz
ensembl_proteins: ftp://ftp.ensemblgenomes.org/pub/release-45/plants/fasta/arabidopsis_thaliana/pep/Arabidopsis_thaliana.TAIR10.pep.all.fa.gz
ensembl_gtf: ftp://ftp.ensemblgenomes.org/pub/release-45/plants/gtf/arabidopsis_thaliana/Arabidopsis_thaliana.TAIR10.45.gtf.gz
ensembl_reference: ftp://ftp.ensemblgenomes.org/pub/release-45/plants/fasta/arabidopsis_thaliana/dna/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.gz

# presumably `prefix` names the generated outputs and `transcript_versioned`
# tells whether transcript IDs carry a version suffix - TODO confirm against
# the generic Snakefile.
prefix: Athaliana
db: ensembl
transcript_versioned: True

# NCBI RefSeq resources for assembly GCF_000001735.4 (TAIR10.1).
# NOTE(review): refseq_protein_dir points to latest_assembly_versions/ while
# the three URLs below it use reference/ - confirm this is intended.
refseq_protein_dir: ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/plant/Arabidopsis_thaliana/latest_assembly_versions/GCF_000001735.4_TAIR10.1/
refseq_protein_fasta: ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/plant/Arabidopsis_thaliana/reference/GCF_000001735.4_TAIR10.1/GCF_000001735.4_TAIR10.1_protein.faa.gz
refseq_chr_accessions: ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/plant/Arabidopsis_thaliana/reference/GCF_000001735.4_TAIR10.1/GCF_000001735.4_TAIR10.1_assembly_structure/Primary_Assembly/assembled_chromosomes/chr2acc
refseq_gtf: ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/plant/Arabidopsis_thaliana/reference/GCF_000001735.4_TAIR10.1/GCF_000001735.4_TAIR10.1_genomic.gtf.gz

# UniProt reference proteome UP000006548 (taxon 3702), current release:
# main and "additional" sequence sets in FASTA format.
uniprot_fasta:
- ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000006548/UP000006548_3702.fasta.gz
- ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000006548/UP000006548_3702_additional.fasta.gz

# Same proteome, UniProt flat-file (.dat) format.
uniprot_dat:
- ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000006548/UP000006548_3702.dat.gz
- ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000006548/UP000006548_3702_additional.dat.gz

# Ensembl-to-Reactome mapping, taken from Plant Reactome for this species.
reactome: https://plantreactome.gramene.org/download/current/Ensembl2PlantReactome_All_Levels.txt
3 changes: 3 additions & 0 deletions IsoAnnot/config/ensembl/dmelanogaster/Snakefile.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Entry Snakefile for Drosophila melanogaster (Ensembl): load this species'
# config.yaml, then include the generic Ensembl Snakefile.
configfile: "config/ensembl/dmelanogaster/config.yaml"

include: "../../generic/Snakefile_ensembl.smk"
29 changes: 29 additions & 0 deletions IsoAnnot/config/ensembl/dmelanogaster/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# IsoAnnot configuration for Drosophila melanogaster using Ensembl as the
# reference database (see `db` below).
species: Drosophila melanogaster
species_name: Drosophila

# BioMart host and dataset.
biomart_host: http://www.ensembl.org
biomart_dataset: dmelanogaster_gene_ensembl

# Ensembl release 108 downloads for assembly BDGP6.32:
# cDNA FASTA, peptide FASTA, GTF annotation and top-level genome FASTA.
ensembl_cdna: ftp://ftp.ensembl.org/pub/release-108/fasta/drosophila_melanogaster/cdna/Drosophila_melanogaster.BDGP6.32.cdna.all.fa.gz
ensembl_proteins: ftp://ftp.ensembl.org/pub/release-108/fasta/drosophila_melanogaster/pep/Drosophila_melanogaster.BDGP6.32.pep.all.fa.gz
ensembl_gtf: ftp://ftp.ensembl.org/pub/release-108/gtf/drosophila_melanogaster/Drosophila_melanogaster.BDGP6.32.108.chr.gtf.gz
ensembl_reference: ftp://ftp.ensembl.org/pub/release-108/fasta/drosophila_melanogaster/dna/Drosophila_melanogaster.BDGP6.32.dna.toplevel.fa.gz

# presumably `prefix` names the generated outputs and `transcript_versioned`
# tells whether transcript IDs carry a version suffix - TODO confirm against
# the generic Snakefile.
prefix: Dmelanogaster
db: ensembl
transcript_versioned: False

# NCBI RefSeq resources for assembly GCF_000001215.4 (Release 6 plus ISO1 MT).
refseq_protein_dir: ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/invertebrate/Drosophila_melanogaster/latest_assembly_versions/GCF_000001215.4_Release_6_plus_ISO1_MT/
refseq_protein_fasta: ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/invertebrate/Drosophila_melanogaster/latest_assembly_versions/GCF_000001215.4_Release_6_plus_ISO1_MT/GCF_000001215.4_Release_6_plus_ISO1_MT_protein.faa.gz
refseq_gtf: ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/invertebrate/Drosophila_melanogaster/latest_assembly_versions/GCF_000001215.4_Release_6_plus_ISO1_MT/GCF_000001215.4_Release_6_plus_ISO1_MT_genomic.gtf.gz
refseq_chr_accessions: ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/invertebrate/Drosophila_melanogaster/latest_assembly_versions/GCF_000001215.4_Release_6_plus_ISO1_MT/GCF_000001215.4_Release_6_plus_ISO1_MT_assembly_structure/Primary_Assembly/assembled_chromosomes/chr2acc

# UniProt reference proteome UP000000803 (taxon 7227), current release:
# main and "additional" sequence sets in FASTA format.
uniprot_fasta:
- ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000000803/UP000000803_7227.fasta.gz
- ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000000803/UP000000803_7227_additional.fasta.gz

# Same proteome, UniProt flat-file (.dat) format.
uniprot_dat:
- ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000000803/UP000000803_7227.dat.gz
- ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000000803/UP000000803_7227_additional.dat.gz

# Ensembl-to-Reactome mapping file.
reactome: https://reactome.org/download/current/Ensembl2Reactome_All_Levels.txt
3 changes: 3 additions & 0 deletions IsoAnnot/config/ensembl/drerio/Snakefile.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Entry Snakefile for Danio rerio (Ensembl): load this species'
# config.yaml, then include the generic Ensembl Snakefile.
configfile: "config/ensembl/drerio/config.yaml"

include: "../../generic/Snakefile_ensembl.smk"
28 changes: 28 additions & 0 deletions IsoAnnot/config/ensembl/drerio/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# IsoAnnot configuration for Danio rerio (zebrafish) using Ensembl as the
# reference database (see `db` below).
species: Danio rerio
species_name: zebrafish

# BioMart host and dataset.
biomart_host: http://www.ensembl.org
biomart_dataset: drerio_gene_ensembl

# Ensembl release 108 downloads for assembly GRCz11:
# cDNA FASTA, peptide FASTA, GTF annotation and primary-assembly genome FASTA.
ensembl_cdna: ftp://ftp.ensembl.org/pub/release-108/fasta/danio_rerio/cdna/Danio_rerio.GRCz11.cdna.all.fa.gz
ensembl_proteins: ftp://ftp.ensembl.org/pub/release-108/fasta/danio_rerio/pep/Danio_rerio.GRCz11.pep.all.fa.gz
ensembl_gtf: ftp://ftp.ensembl.org/pub/release-108/gtf/danio_rerio/Danio_rerio.GRCz11.108.chr.gtf.gz
ensembl_reference: ftp://ftp.ensembl.org/pub/release-108/fasta/danio_rerio/dna/Danio_rerio.GRCz11.dna.primary_assembly.fa.gz

# presumably `prefix` names the generated outputs and `transcript_versioned`
# tells whether transcript IDs carry a version suffix - TODO confirm against
# the generic Snakefile.
prefix: Drerio
db: ensembl
transcript_versioned: False

# NCBI RefSeq resources for assembly GCF_000002035.6 (GRCz11).
# NOTE(review): refseq_protein_dir points to the species-level
# refseq/D_rerio/mRNA_Prot/ tree rather than the assembly folder used by the
# other three URLs - confirm this is intended.
refseq_protein_dir: ftp://ftp.ncbi.nlm.nih.gov/refseq/D_rerio/mRNA_Prot/
refseq_protein_fasta: ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_other/Danio_rerio/all_assembly_versions/GCF_000002035.6_GRCz11/GCF_000002035.6_GRCz11_protein.faa.gz
refseq_gtf: ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_other/Danio_rerio/all_assembly_versions/GCF_000002035.6_GRCz11/GCF_000002035.6_GRCz11_genomic.gtf.gz
refseq_chr_accessions: ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_other/Danio_rerio/all_assembly_versions/GCF_000002035.6_GRCz11/GCF_000002035.6_GRCz11_assembly_structure/Primary_Assembly/assembled_chromosomes/chr2acc

# UniProt reference proteome UP000000437 (taxon 7955), current release:
# main and "additional" sequence sets in FASTA format.
uniprot_fasta:
- ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000000437/UP000000437_7955.fasta.gz
- ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000000437/UP000000437_7955_additional.fasta.gz
# Same proteome, UniProt flat-file (.dat) format.
uniprot_dat:
- ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000000437/UP000000437_7955.dat.gz
- ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000000437/UP000000437_7955_additional.dat.gz

# Ensembl-to-Reactome mapping file.
reactome: https://reactome.org/download/current/Ensembl2Reactome_All_Levels.txt
3 changes: 3 additions & 0 deletions IsoAnnot/config/ensembl/hsapiens/Snakefile.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Entry Snakefile for Homo sapiens (Ensembl): load this species'
# config.yaml, then include the generic Ensembl Snakefile.
configfile: "config/ensembl/hsapiens/config.yaml"

include: "../../generic/Snakefile_ensembl.smk"
30 changes: 30 additions & 0 deletions IsoAnnot/config/ensembl/hsapiens/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# IsoAnnot configuration for Homo sapiens using Ensembl as the reference
# database (see `db` below).
species: Homo sapiens
species_name: human

# BioMart host and dataset.
biomart_host: http://www.ensembl.org
biomart_dataset: hsapiens_gene_ensembl

# Ensembl release 108 downloads for assembly GRCh38:
# cDNA FASTA, peptide FASTA, GTF annotation and primary-assembly genome FASTA.
ensembl_cdna: ftp://ftp.ensembl.org/pub/release-108/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz
ensembl_proteins: ftp://ftp.ensembl.org/pub/release-108/fasta/homo_sapiens/pep/Homo_sapiens.GRCh38.pep.all.fa.gz
ensembl_gtf: ftp://ftp.ensembl.org/pub/release-108/gtf/homo_sapiens/Homo_sapiens.GRCh38.108.chr.gtf.gz
ensembl_reference: ftp://ftp.ensembl.org/pub/release-108/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz

# presumably `prefix` names the generated outputs and `transcript_versioned`
# tells whether transcript IDs carry a version suffix - TODO confirm against
# the generic Snakefile.
prefix: Hsapiens
db: ensembl
transcript_versioned: False

# NCBI RefSeq resources for assembly GCF_000001405.40 (GRCh38.p14).
# NOTE(review): refseq_protein_dir points to the species-level
# refseq/H_sapiens/mRNA_Prot/ tree rather than the assembly folder used by
# the other three URLs - confirm this is intended.
refseq_protein_dir: ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/
refseq_protein_fasta: ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_protein.faa.gz
refseq_gtf: ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.gtf.gz
refseq_chr_accessions: ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_assembly_structure/Primary_Assembly/assembled_chromosomes/chr2acc

# UniProt reference proteome UP000005640 (taxon 9606), current release:
# main and "additional" sequence sets in FASTA format.
uniprot_fasta:
- ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640/UP000005640_9606.fasta.gz
- ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640/UP000005640_9606_additional.fasta.gz

# Same proteome, UniProt flat-file (.dat) format.
uniprot_dat:
- ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640/UP000005640_9606.dat.gz
- ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640/UP000005640_9606_additional.dat.gz

# Ensembl-to-Reactome mapping file.
reactome: https://reactome.org/download/current/Ensembl2Reactome_All_Levels.txt
# NOTE(review): this key only appears in the human config and its value "si"
# looks like Spanish for "yes" - confirm how the consumer interprets it.
layer_go: si
3 changes: 3 additions & 0 deletions IsoAnnot/config/ensembl/mmusculus/Snakefile.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Entry Snakefile for Mus musculus (Ensembl): load this species'
# config.yaml, then include the generic Ensembl Snakefile.
configfile: "config/ensembl/mmusculus/config.yaml"

include: "../../generic/Snakefile_ensembl.smk"
Loading

0 comments on commit 4b3f4e5

Please sign in to comment.