Merge pull request #20 from ylab-hi/dev

feat: fix detection of mps backend
ylab-hi · Oct 14, 2024 · bb18ace · bb18ace
2 parents 6134798 + a0e68af
commit bb18ace
Show file tree

Hide file tree

Showing 6 changed files with 780 additions and 25 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -4,9 +4,9 @@ version = "1.2.3"
 edition = "2021"
 authors = ["Yangyang Li <[email protected]>"]
 description = "A Genomic Language Model for Chimera Artifact Detection in Nanopore Direct RNA Sequencing"
-documentation = "https://docs.rs/bitflags"
+documentation = "https://github.com/ylab-hi/DeepChopper"
 homepage = "https://serde.rs"
-repository = "https://github.com/rust-lang/cargo"
+repository = "https://github.com/ylab-hi/DeepChopper"
 license = "Apache-2.0"
 readme = "README.md"
 

diff --git a/deepchopper/cli.py b/deepchopper/cli.py
@@ -129,9 +129,19 @@ def predict(
     output_path = Path(output_path or "predictions")
     callbacks = [deepchopper.models.callbacks.CustomWriter(output_dir=output_path, write_interval="batch")]
 
-    accelerator, devices = (
-        ("gpu", min(gpus, torch.cuda.device_count())) if gpus > 0 and torch.cuda.is_available() else ("cpu", "auto")
-    )
+    if gpus > 0:
+        if torch.cuda.is_available():
+            accelerator = "gpu"
+            devices = min(gpus, torch.cuda.device_count())
+        elif torch.backends.mps.is_available():
+            accelerator = "mps"
+            devices = "auto"  # MPS currently supports only one device
+        else:
+            accelerator = "cpu"
+            devices = "auto"
+    else:
+        accelerator = "cpu"
+        devices = "auto"
 
     trainer = lightning.pytorch.trainer.Trainer(
         accelerator=accelerator,

diff --git a/deepchopper/data/fq_datamodule.py b/deepchopper/data/fq_datamodule.py
@@ -126,7 +126,7 @@ def prepare_data(self) -> None:
             if Path(data_path).suffix == ".parquet":
                 pass
             else:
-                msg = f"Data file {data_path} is not in FastQ or Parquet format."
+                msg = f"Data file {data_path} is not in Parquet format."
                 raise ValueError(msg)
 
         self.hparams.train_data_path = Path(self.hparams.train_data_path).with_suffix(".parquet").as_posix()

diff --git a/documentation/cli.md b/documentation/cli.md
diff --git a/documentation/tutorial.md b/documentation/tutorial.md
@@ -11,7 +11,7 @@ Whether you're new to bioinformatics or an experienced researcher, this tutorial
   - [1. Data Acquisition](#1-data-acquisition)
   - [2. Basecall Using Dorado](#2-basecall-using-dorado)
   - [3. Encoding Data with DeepChopper](#3-encoding-data-with-deepchopper)
-  - [4. Predicting Chimeric Reads](#4-predicting-chimeric-reads)
+  - [4. Predicting Adapter to Detect Artificial Chimeric Reads](#4-predicting-adapter-to-detect-artificial-chimeric-reads)
   - [5. Chopping Artificial Sequences](#5-chopping-artificial-sequences)
   - [Next Steps](#next-steps)
   - [Troubleshooting](#troubleshooting)
@@ -22,6 +22,7 @@ Before we begin, ensure you have the following installed:
 
 - DeepChopper (latest version)
 - Dorado (Oxford Nanopore's basecaller)
+- Samtools (for BAM to FASTQ conversion)
 - Sufficient storage space for Nanopore data
 
 ## 1. Data Acquisition
@@ -42,53 +43,57 @@ Convert raw signal data to nucleotide sequences using Dorado.
 ```bash
 # Install Dorado (if not already installed)
 # Run Dorado without trimming
-dorado basecaller \
-    --model [email protected] \
-    --device cuda:all \
-    --not_trim \
-    path/to/your/pod5/files/ \
-    > raw.fastq
+dorado basecaller --no-trim rna002_70bps_hac@v3 200cases.pod5 > raw_no_trim.bam
+
+# Convert BAM to FASTQ
+samtools view raw_no_trim.bam -d dx:0 | samtools fastq > raw_no_trim.fastq
 ```
 
 ⚠️ **Important**: Use the `--not_trim` option to preserve potential chimeric sequences.
 
-Replace `path/to/your/pod5/files/` with the directory containing your POD5 files.
+Replace `200cases.pod5` with the directory containing your POD5 files.
 The output will be a FASTQ file containing the basecalled sequences.
 
+**Note**: For convenience, you can download a pre-prepared FASTQ file directly:
+
+```bash
+wget https://raw.githubusercontent.com/ylab-hi/DeepChopper/refs/heads/main/tests/data/raw_no_trim.fastq
+```
+
 ## 3. Encoding Data with DeepChopper
 
 Prepare your data for the prediction model:
 
 ```bash
 # Encode the FASTQ file
-deepchopper encode raw.fastq
+deepchopper encode raw_no_trim.fastq
 ```
 
 For large datasets, use chunking to avoid memory issues:
 
 ```bash
-deepchopper encode raw.fastq --chunk --chunk-size  100000
+deepchopper encode raw_no_trim.fastq --chunk --chunk-size  100000
 ```
 
-🔍 **Output**: Look for `encoded_data.parquet` or multiple `.parquet` files if chunking.
+🔍 **Output**: Look for `raw_no_trim.parquet` or multiple `.parquet` files under `raw_no_trim.fq_chunks` if chunking.
 
-## 4. Predicting Chimeric Reads
+## 4. Predicting Adapter to Detect Artificial Chimeric Reads
 
 Analyze the encoded data to identify potential chimeric reads:
 
 ```bash
 # Predict artifical sequences for reads
-deepchopper predict raw.parquet --ouput predictions
+deepchopper predict raw_no_trim.parquet --ouput predictions
 
 # Predict artifical sequences for reads using GPU
-deepchopper predict raw.parquet --ouput predictions --gpus 2
+deepchopper predict raw_no_trim.parquet --ouput predictions --gpus 2
 ```
 
 For chunked data:
 
 ```bash
-deepchopper predict raw_chunk1.parquet --output predictions_chunk1
-deepchopper predict raw_chunk2.parquet --output predictions_chunk2
+deepchopper predict raw_no_trim.fq_chunks/raw_no_trim.fq_0.parquet --output predictions_chunk1
+deepchopper predict raw_no_trim.fq_chunks/raw_no_trim.fq_1.parquet --output predictions_chunk2
 ```
 
 📊 **Results**: Check the `predictions` folder for output files.
@@ -101,18 +106,18 @@ Remove identified artificial sequences:
 
 ```bash
 # Chop artificial sequences
-deepchopper chop predictions/0 raw.fastq
+deepchopper chop predictions/0 raw_no_trim.fastq
 ```
 
 For chunked predictions:
 
 ```bash
-deepchopper chop predictions_chunk1/0 predictions_chunk2/0 raw.fastq
+deepchopper chop predictions_chunk1/0 predictions_chunk2/0 raw_no_trim.fastq
 ```
 
 🎉 **Success**: Look for the output file with the `.chop.fq.bgz` suffix.
 
-This command takes the original FASTQ file (`raw.fastq`) and the predictions (`predictions`), and produces a new FASTQ file (with suffix `.chop.fq.bgz`) with the chimeric reads chopped.
+This command takes the original FASTQ file (`raw_no_trim.fastq`) and the predictions (`predictions`), and produces a new FASTQ file (with suffix `.chop.fq.bgz`) with the chimeric reads chopped.
 
 ## Next Steps
 
@@ -123,15 +128,19 @@ This command takes the original FASTQ file (`raw.fastq`) and the predictions (`p
 ## Troubleshooting
 
 - **Issue**: Out of memory errors
+
   **Solution**: Try using the `--chunk` option in the encode step
 
 - **Issue**: Slow processing
+
   **Solution**: Ensure you're using GPU acceleration if available
 
 - **Issue**: Unexpected results
+
   **Solution**: Verify input data quality and check DeepChopper version
 
 - **Issue**: GPU driver compatibility error
+
   **Solution**: Update your GPU driver or install a compatible PyTorch version e.g., `pip install torch --force-reinstall --index-url https://download.pytorch.org/whl/cu118` to install a CUDA 11.8 compatible version.
 
 For more help, visit our [GitHub Issues](https://github.com/ylab-hi/DeepChopper/issues) page.