From 8d9d75626f4c1abdf3d987154d69e21399183edb Mon Sep 17 00:00:00 2001 From: fmaccha Date: Thu, 11 Jul 2024 15:21:36 +0900 Subject: [PATCH] #1 #2 add support for STDIN and compressed format (gzip, bz2) --- src/args.rs | 5 +- src/buffered_read_seek.rs | 11 +- src/edam.rs | 1 + src/ext_tools.rs | 2 +- src/module.rs | 244 ++++++++++++-- src/source.rs | 304 +++++++++++++----- tests/inputs/toy_invalid_flag.sam | 14 + tests/integration_tests.rs | 100 +++++- tests/outputs/expected_output.csv | 6 +- tests/outputs/expected_output.json | 2 +- tests/outputs/expected_output.yaml | 6 + .../outputs/expected_output_module_order.csv | 6 +- tests/outputs/expected_output_num_records.csv | 2 + tests/outputs/expected_output_run_cwl.csv | 6 +- 14 files changed, 585 insertions(+), 124 deletions(-) create mode 100644 tests/inputs/toy_invalid_flag.sam create mode 100644 tests/outputs/expected_output_num_records.csv diff --git a/src/args.rs b/src/args.rs index 51a3148..7b16573 100644 --- a/src/args.rs +++ b/src/args.rs @@ -49,12 +49,11 @@ pub struct Args { #[clap(short, long, conflicts_with_all = ["num_records"])] pub tidy: bool, - // TODO /// Do not try to decompress the input file when detecting the file format. - #[clap(long, hide = true)] + #[clap(long)] pub no_decompress: bool, - /// Number of records to read from the input file. Conflicts with `--tidy` option. + /// Number of records to read from the input file. Recommended to set it to a multiple of 4 to prevent false negatives. Conflicts with `--tidy` option. 
#[clap(short, long, default_value = "100000", value_parser = validate_num_records_greater_than_zero)] pub num_records: usize, diff --git a/src/buffered_read_seek.rs b/src/buffered_read_seek.rs index 5d68e86..0db6a58 100644 --- a/src/buffered_read_seek.rs +++ b/src/buffered_read_seek.rs @@ -19,21 +19,20 @@ impl OnetimeRewindableReader { impl Read for OnetimeRewindableReader { fn read(&mut self, buf: &mut [u8]) -> io::Result { if let Some(ref mut buffer) = self.buffer { - // 既にバッファがある場合はそこから読み取る + // read from the buffer if it exists let count = buffer.read(buf)?; if count == 0 { - // バッファが空になった場合は、元の入力から読み取る - let innercount = self.inner.read(buf); - innercount + // if the buffer is empty, read from the original input + self.inner.read(buf) } else { Ok(count) } } else { - // 初回の読み取りでバッファを準備 + // Prepare the buffer for the first read let mut temp_buffer = vec![0; buf.len()]; let count = self.inner.read(&mut temp_buffer)?; if count > 0 { - // 読み取ったデータをバッファとして保存 + // save the read data as a buffer self.buffer = Some(Cursor::new(temp_buffer)); self.read(buf) } else { diff --git a/src/edam.rs b/src/edam.rs index b8aaac5..1f75f1f 100644 --- a/src/edam.rs +++ b/src/edam.rs @@ -63,6 +63,7 @@ impl EdamMap { } } +// An internal struct to deserialize EDAM table. 
#[derive(Debug, Clone, Deserialize)] struct Edam { #[serde(rename = "Class ID")] diff --git a/src/ext_tools.rs b/src/ext_tools.rs index b528e9b..44c2afa 100644 --- a/src/ext_tools.rs +++ b/src/ext_tools.rs @@ -17,7 +17,7 @@ pub fn invoke( cwl_file_path: &Path, target_file_path: &Path, cwl_input_file_path: &NamedTempFile, - options: &InvokeOptions, + _options: &InvokeOptions, ) -> Result { info!("Invoking ext_tools {}", cwl_file_path.display()); diff --git a/src/module.rs b/src/module.rs index eb03f22..4b7ddb1 100644 --- a/src/module.rs +++ b/src/module.rs @@ -1,5 +1,5 @@ use anyhow::{anyhow, bail, Result}; -use log::{debug, error, info}; +use log::{debug, info, warn}; use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::path::{Path, PathBuf}; @@ -7,7 +7,7 @@ use tempfile::{NamedTempFile, TempDir}; use url::Url; use crate::args::{Args, OutputFormat}; -use crate::source::Source; +use crate::source::{Source, CompressedFormat}; // Struct to store the result of Parser invocation and ExtTools invocation. 
#[derive(Debug)] @@ -17,6 +17,21 @@ pub struct ModuleResult { label: Option, id: Option, error_message: Option, + decompressed: Option, +} + +impl From<&CompressedFormat> for ModuleResult { + fn from(compressed_format: &CompressedFormat) -> Self { + match compressed_format { + CompressedFormat::Bgzf => ModuleResult::with_result(None, None), + CompressedFormat::GZ => ModuleResult::with_result( + Some("GZIP format".to_string()), + Some("http://edamontology.org/format_3989".to_string()), + ), + CompressedFormat::BZ2 => ModuleResult::with_result(None, None), + CompressedFormat::None => ModuleResult::with_result(None, None), + } + } } impl ModuleResult { @@ -27,6 +42,7 @@ impl ModuleResult { label, id, error_message: None, + decompressed: None, } } @@ -54,6 +70,20 @@ impl ModuleResult { self.input = input; } + fn swap_edam_of_module_result_and_compressed_format(&mut self, compressed_format_edam: Self) { + let tmp_label = self.label.to_owned(); + let tmp_id = self.id.to_owned(); + + self.label = compressed_format_edam.label; + self.id = compressed_format_edam.id; + + let tmp_decompressed = DecompressedFormat { + label: tmp_label, + id: tmp_id, + }; + self.decompressed = Some(tmp_decompressed); + } + pub fn create_module_results_string( module_results: &[ModuleResult], format: OutputFormat, @@ -65,13 +95,15 @@ impl ModuleResult { .delimiter(delimiter) .from_writer(&mut data); - writer.write_record(["File Path", "Edam ID", "Label"])?; + writer.write_record(["File Path", "Edam ID", "Label", "Decompressed ID", "Decompressed Label"])?; for module_result in module_results.iter() { writer.serialize(( &module_result.input, &module_result.id, &module_result.label, + &module_result.decompressed.as_ref().and_then(|d| d.id.as_ref()), + &module_result.decompressed.as_ref().and_then(|d| d.label.as_ref()), ))?; } } @@ -82,12 +114,71 @@ impl ModuleResult { match format { OutputFormat::Yaml => { - let mut serialized_map = HashMap::new(); + let mut serialized_map: HashMap = 
HashMap::new(); for module_result in module_results { let target_file_path = &module_result.input; + + // create yaml map for decompressed field + let mut de_map : HashMap = HashMap::new(); + match &module_result.decompressed { + Some(decompressed) => { + match &decompressed.id { + Some(id) => { + de_map.insert("id".to_string(), serde_yaml::Value::String(id.clone())); + }, + None => { + de_map.insert("id".to_string(), serde_yaml::Value::Null); + } + } + match &decompressed.label { + Some(label) => { + de_map.insert("label".to_string(), serde_yaml::Value::String(label.clone())); + }, + None => { + de_map.insert("label".to_string(), serde_yaml::Value::Null); + } + } + }, + None => { + de_map.insert("id".to_string(), serde_yaml::Value::Null); + de_map.insert("label".to_string(), serde_yaml::Value::Null); + } + } + + // create yaml map for label and id fields + let mut comp_map : HashMap = HashMap::new(); + match &module_result.id { + Some(id) => { + comp_map.insert("id".to_string(), serde_yaml::Value::String(id.clone())); + }, + None => { + comp_map.insert("id".to_string(), serde_yaml::Value::Null); + } + } + match &module_result.label { + Some(label) => { + comp_map.insert("label".to_string(), serde_yaml::Value::String(label.clone())); + }, + None => { + comp_map.insert("label".to_string(), serde_yaml::Value::Null); + } + } + + // add decompressed field to the yaml map + comp_map.insert("decompressed".to_string(), serde_yaml::to_value(de_map)?); + // match &module_result.decompressed { + // Some(decompressed) => { + // comp_map.insert("decompressed".to_string(), serde_yaml::to_value(decompressed)?); + // }, + // None => { + // comp_map.insert("decompressed".to_string(), serde_yaml::Value::Null); + // } + // } + + serialized_map.insert( target_file_path.clone(), - HashMap::from([("id", &module_result.id), ("label", &module_result.label)]), + serde_yaml::to_value(comp_map)?, ); } @@ -97,12 +188,73 @@ impl ModuleResult { OutputFormat::Tsv => 
csv_serialize(module_results, b'\t'), OutputFormat::Csv => csv_serialize(module_results, b','), OutputFormat::Json => { - let mut serialized_map = HashMap::new(); + let mut serialized_map: HashMap = HashMap::new(); for module_result in module_results { let target_file_path = &module_result.input; + + // create json map for decompressed field + let mut de_map : HashMap = HashMap::new(); + match &module_result.decompressed { + Some(decompressed) => { + match &decompressed.id { + Some(id) => { + de_map.insert("id".to_string(), serde_json::Value::String(id.clone())); + }, + None => { + de_map.insert("id".to_string(), serde_json::Value::Null); + } + } + match &decompressed.label { + Some(label) => { + de_map.insert("label".to_string(), serde_json::Value::String(label.clone())); + }, + None => { + de_map.insert("label".to_string(), serde_json::Value::Null); + } + } + }, + None => { + de_map.insert("id".to_string(), serde_json::Value::Null); + de_map.insert("label".to_string(), serde_json::Value::Null); + } + } + + // create json map for label and id fields + let mut comp_map : HashMap = HashMap::new(); + match &module_result.id { + Some(id) => { + comp_map.insert("id".to_string(), serde_json::Value::String(id.clone())); + }, + None => { + comp_map.insert("id".to_string(), serde_json::Value::Null); + } + } + match &module_result.label { + Some(label) => { + comp_map.insert("label".to_string(), serde_json::Value::String(label.clone())); + }, + None => { + comp_map.insert("label".to_string(), serde_json::Value::Null); + } + } + + // add components field to the json map + comp_map.insert("decompressed".to_string(), serde_json::to_value(de_map)?); + // match &module_result.decompressed { + // Some(decompressed) => { + // comp_map.insert("decompressed".to_string(), serde_json::to_value(decompressed)?); + // }, + // None => { + // comp_map.insert("decompressed".to_string(), serde_json::Value::Null); + // } + // } + + + + serialized_map.insert( target_file_path.clone(), - 
HashMap::from([("id", &module_result.id), ("label", &module_result.label)]), + serde_json::to_value(comp_map)?, ); } @@ -113,6 +265,12 @@ impl ModuleResult { } } +#[derive(Debug, Serialize, Deserialize)] +pub struct DecompressedFormat { + label: Option, + id: Option, +} + // Struct to deserialize the contents of the conf file. #[derive(Debug, Serialize, Deserialize)] pub struct Config { @@ -135,18 +293,25 @@ impl From<&Args> for InvokeOptions { } } + + pub fn run(config: Config, args: Args) -> Result<()> { crate::logger::init_logger(args.verbose, args.quiet); info!("tataki started"); debug!("Args: {:?}", args); debug!("Output format: {:?}", args.get_output_format()); - // let invoke_options = InvokeOptions::from(&args); let invoke_options = InvokeOptions::from(&args); + let cwl_module_exists = cwl_module_exists(&config)?; + + // validate the user-provided options and input arguments to ensure they are suitable for execution. + check_run_condition_cwl_module(&args.input, cwl_module_exists, &invoke_options)?; + let temp_dir = crate::fetch::create_temporary_dir(&args.cache_dir)?; info!("Created temporary directory: {}", temp_dir.path().display()); + // create an empty vector to store the results of each module invocation. let mut module_results: Vec = Vec::new(); // insert "empty" module at the beginning of the module order, so that the empty module is always invoked first. @@ -154,12 +319,11 @@ pub fn run(config: Config, args: Args) -> Result<()> { config.order.insert(0, "empty".to_string()); for input in &args.input { + let mut input = input.clone(); info!("Processing input: {}", input); - // TODO: ここで、inputがstdinだったらtidyゆるさないよ、とかやる必要がある。 - // TODO: --tidy + 圧縮path の場合はreaderで渡したほうがいい気がする。 // Check if the input is stdin or path. If path, download the file if it is a url. - let target_source = match input.parse::()? { + let (target_source , compressed_format) = match input.parse::()? { Source::FilePath(p) => { // Prepare input file path from url or local file path. 
// Download the file and store it in the specified cache directory if input is url. @@ -171,7 +335,7 @@ pub fn run(config: Config, args: Args) -> Result<()> { path } None => { - let path = PathBuf::from(input); + let path = PathBuf::from(&input); if !path.exists() { bail!("The specified target file does not exist. Please check the path. : {}" ,path.display() ); @@ -179,11 +343,22 @@ pub fn run(config: Config, args: Args) -> Result<()> { path } }; - Source::FilePath(target_file_path) - // TODO 必要だったらtempfileにせなかん + + let (source, compressed_format) = Source::decompress_into_tempfile_from_filepath_if_needed( + &target_file_path, + &invoke_options, + &temp_dir, + cwl_module_exists, + )?; + + match source { + Some(source) => {(source, compressed_format)}, + None => { ( Source::FilePath(target_file_path), compressed_format)}, + } } Source::Stdin => { - unimplemented!("Stdin is not supported yet. Will be implemented in upcoming versions."); + info!("Reading from STDIN..."); + input = "STDIN".to_string(); Source::convert_into_tempfile_from_stdin(&invoke_options, &temp_dir)? }, Source::TempFile(_) => unreachable!(), @@ -192,13 +367,19 @@ pub fn run(config: Config, args: Args) -> Result<()> { let mut module_result = run_modules(target_source, &config, &temp_dir, &invoke_options)?; + let compressed_format_edam = ModuleResult::from(&compressed_format); + // must swap the edam of the module result and the compressed format if decompress has been done. + match compressed_format { + CompressedFormat::None =>{}, + CompressedFormat::Bgzf => {}, + _ => { + module_result.swap_edam_of_module_result_and_compressed_format(compressed_format_edam); + } + } module_result.set_input(input.clone()); module_results.push(module_result); - // TODO ここでSource::TempFileを使った場合は消す。 - - } // if args.cache_dir is Some, keep the temporary directory. 
@@ -269,7 +450,7 @@ fn run_modules( let result = match module_extension { "" => crate::parser::invoke(module, &target_source, invoke_options), "cwl" => { - // TODO ここ、match使わずに書けないだろうか + // TODO might want to refactor this without using match statement. // CWL module invocation is skipped if the input is not a file path or URL. match target_source.as_path() { Some(target_file_path) => { @@ -304,7 +485,8 @@ fn run_modules( } }, Err(e) => { - error!("An error occurred while trying to invoke the \'{}\' module. Reason:\n{}", module, e); + // TODO fix an issue that a error here is absorbed by the find_map function. + warn!("An error occurred while trying to invoke the \'{}\' module. Reason:\n{}", module, e); None }, } @@ -338,3 +520,25 @@ fn cwl_module_exists(config: &Config) -> Result { } Ok(false) } + +fn check_run_condition_cwl_module(inputs: &[String], cwl_module_exists: bool, invoke_options: &InvokeOptions) -> Result<()> { + // whether stdin is included in the input + let stdin_exists = inputs.iter().any(|input| input == "-"); + + /* + cwl + - filepath + - plain, --no-decompress + - ok + - compressed + - tidy needed + - stdin + - tidy needed + */ + + if cwl_module_exists && stdin_exists && !invoke_options.tidy { + bail!("The `--tidy` option is required when reading from STDIN and invoking CWL modules."); + } + + Ok(()) +} diff --git a/src/source.rs b/src/source.rs index ef43e68..be35c8d 100644 --- a/src/source.rs +++ b/src/source.rs @@ -1,6 +1,7 @@ use anyhow::{bail, Result}; use bzip2::read::BzDecoder; use flate2::read::GzDecoder; +use log::{debug, warn}; use std::fs::File; use std::io::{BufRead, BufReader, Read, Seek, Write}; use std::path::{Path, PathBuf}; @@ -57,115 +58,272 @@ impl Source { } } + /* pub fn as_memory(&self) -> Option<&Vec> { match self { Self::Memory(m) => Some(m), _ => None, } } + */ - pub fn convert_into_tempfile_from_file( + // if the input is compressed and no_decompress is false, decompress the input, save it into tempfile and return 
Source::FilePath(tempfile path) + // if the input is either not compressed or no_decompress is true, return None + pub fn decompress_into_tempfile_from_filepath_if_needed( input_path: &Path, options: &InvokeOptions, temp_dir: &TempDir, - ) -> Result { - let file = File::open(input_path)?; - Self::convert_into_tempfile(file, options, temp_dir) + cwl_modules_exists: bool, + ) -> Result<(Option, CompressedFormat)> { + let mut file = File::open(input_path)?; + + // read first 100 bytes from reader in order to infer compression format + let mut buffer = [0; 100]; + let bytes_read = file.read(&mut buffer)?; + + // rewind the reader to the beginning + file.rewind()?; + + // TODO check + // if not inferrable, then None. + let compressed_format: CompressedFormat = + if let Some(inferred_type) = infer::get(&buffer[..bytes_read]) { + let extension = inferred_type.extension(); + match extension { + "gz" => { + // check if the gz file is in BGZF format + if is_gzfile_in_bgzf(&buffer) { + debug!("Provided input is in BGZF format"); + CompressedFormat::Bgzf + } else { + debug!("Provided input is in GZ format"); + CompressedFormat::GZ + } + } + "bz2" => { + debug!("Provided input is in BZ2 format"); + CompressedFormat::BZ2 + } + _ => { + warn!("Provided input is in compressed format not supported by this tool"); + CompressedFormat::None + } + } + } else { + // type was not inferred, return None + warn!("Provided input is in compressed format not supported by this tool"); + CompressedFormat::None + }; + + // if input is plain text or in BGZF, we are not going to decompress it + let mut inferred_reader: Box = match compressed_format { + CompressedFormat::Bgzf => { + // if BGZF, save the reader as is + return Ok((None, CompressedFormat::Bgzf)); + } + CompressedFormat::GZ => { + // if GZ, save the GzDecoder reader + let decoder = GzDecoder::new(file); + Box::new(decoder) + } + CompressedFormat::BZ2 => { + // if BZ2, save the BzDecoder reader + let decoder = BzDecoder::new(file); + 
Box::new(decoder) + } + CompressedFormat::None => { + // if None, save the reader as is + return Ok((None, CompressedFormat::None)); + } + }; + + // here the input is compressed and not BGZF + if options.no_decompress { + Ok((None, compressed_format)) + } else { + if cwl_modules_exists && !options.tidy { + bail!("The `--tidy` options is required when using CWL modules with compressed input files. If you want to treat the input file as is and not decompress it, please use the `--no-decompress` option."); + } + let decompressed_tempfile = Self::read_numrecords_save_into_tempfile( + &mut inferred_reader, + options, + temp_dir, + false, + )?; + + Ok((Some(decompressed_tempfile), compressed_format)) + } } + // if the input from stdin is compressed and no_decompress is false, decompress the input, save it into tempfile and return Source::FilePath(tempfile path) + // if the input from stdin is either not compressed or no_decompress is true, save it into tempfile and return Source::FilePath(tempfile path) pub fn convert_into_tempfile_from_stdin( options: &InvokeOptions, temp_dir: &TempDir, - ) -> Result { + ) -> Result<(Self, CompressedFormat)> { let stdin = std::io::stdin(); let handle = stdin.lock(); - Self::convert_into_tempfile(handle, options, temp_dir) - } - fn convert_into_tempfile( - readable: R, - options: &InvokeOptions, - temp_dir: &TempDir, - ) -> Result { - // create a writer for tempfile - let mut tempfile = NamedTempFile::new_in(temp_dir)?; - // let mut writer = BufWriter::new(&tempfile); + let mut onetime_reader = OnetimeRewindableReader::new(handle); - let mut tmp_reader = OnetimeRewindableReader::new(readable); - - // read first 10 bytes from reader in order to infer compression format + // read first 100 bytes from reader in order to infer compression format let mut buffer = [0; 100]; - let bytes_read = tmp_reader.read(&mut buffer)?; - - println!("buffer: {:?}", String::from_utf8(buffer.to_vec())); + let bytes_read = onetime_reader.read(&mut buffer)?; // 
rewind the reader to the beginning - tmp_reader.rewind()?; - - // let mut buffer = [0; 10]; - // let bytes_read = tmp_reader.read(&mut buffer)?; - // println!("buffer: {:?}", String::from_utf8(buffer.to_vec())); - - // let bytes_read = tmp_reader.read(&mut buffer)?; - // println!("buffer: {:?}", String::from_utf8(buffer.to_vec())); - - // let bytes_read = tmp_reader.read(&mut buffer)?; - // println!("buffer: {:?}", String::from_utf8(buffer.to_vec())); - // std::process::exit(1); - - let reader: Box = if options.no_decompress { - Box::new(tmp_reader) - } else if let Some(inferred_type) = infer::get(&buffer[..bytes_read]) { - let extension = inferred_type.extension(); - match extension { - "gz" => { - // TODO BAMのBGZFがここで誤認されないようにする - let decoder = GzDecoder::new(tmp_reader); - println!("gz!!!"); + onetime_reader.rewind()?; + + let mut is_bgzf = false; + + // if not inferrable, then None. + // TODO refactor into a function + let compressed_format: CompressedFormat = + if let Some(inferred_type) = infer::get(&buffer[..bytes_read]) { + let extension = inferred_type.extension(); + match extension { + "gz" => { + // check if the gz file is in BGZF format + if is_gzfile_in_bgzf(&buffer) { + is_bgzf = true; + debug!("Provided input is in BGZF format"); + CompressedFormat::Bgzf + } else { + debug!("Provided input is in GZ format"); + CompressedFormat::GZ + } + } + "bz2" => { + debug!("Provided input is in BZ2 format"); + CompressedFormat::BZ2 + } + _ => { + warn!("Provided input is in compressed format not supported by this tool"); + CompressedFormat::None + } + } + } else { + // type was not inferred, return None + warn!("Provided input is in compressed format not supported by this tool"); + CompressedFormat::None + }; + + // use the reader as is if no_decompress is true + let mut inferred_reader: Box = if options.no_decompress { + Box::new(onetime_reader) + } else { + match compressed_format { + CompressedFormat::Bgzf => { + // if BGZF, save the reader as is + 
Box::new(onetime_reader) + } + CompressedFormat::GZ => { + // if GZ, save the GzDecoder reader + let decoder = GzDecoder::new(onetime_reader); Box::new(decoder) } - "bz2" => { - let decoder = BzDecoder::new(tmp_reader); - println!("bz2!!!"); + CompressedFormat::BZ2 => { + // if BZ2, save the BzDecoder reader + let decoder = BzDecoder::new(onetime_reader); Box::new(decoder) } - _ => { - panic!("Unexpected infer result"); + CompressedFormat::None => { + // if None, save the reader as is + Box::new(onetime_reader) } } - } else { - Box::new(tmp_reader) }; - let mut bufreader = BufReader::new(reader); + let tempfile_from_stdin = Self::read_numrecords_save_into_tempfile( + &mut inferred_reader, + options, + temp_dir, + is_bgzf || options.no_decompress, + )?; - // TODO tmpに保存する行数を4倍くらいにする。memo参照 - let mut line_buffer = String::new(); - let mut count = 0; - while options.tidy || count < options.num_records { - line_buffer.clear(); - let bytes_read = bufreader.read_line(&mut line_buffer)?; - if bytes_read == 0 { - break; - } - count += 1; - // write line into tempfile - tempfile.write_all(line_buffer.as_bytes())?; + Ok((tempfile_from_stdin, compressed_format)) + } + + fn read_numrecords_save_into_tempfile( + reader: &mut R, + options: &InvokeOptions, + temp_dir: &TempDir, + is_binary: bool, + ) -> Result { + // create a writer for tempfile + let mut tempfile = NamedTempFile::new_in(temp_dir)?; + + // if the input is binary, such as BGZF, read (100 * num_records) bytes and save it into tempfile + if is_binary { + let total_bytes_copied: u64 = if options.tidy { + std::io::copy(reader, &mut tempfile)? + } else { + let bytes_to_copy = 100 * options.num_records; + let mut limited_src = reader.take(bytes_to_copy as u64); + std::io::copy(&mut limited_src, &mut tempfile)? 
+ }; + debug!("Bytes read from STDIN: {}", total_bytes_copied); } + // if not in binary, read the first 4 * num_records lines (plus header) and save it into tempfile + else { + let mut bufreader = BufReader::new(reader); - // for (count, line) in bufreader.lines().enumerate() { - // if !options.tidy && count >= options.num_records { - // break; - // } - // let line = line?; - // // write line into tempfile - // tempfile.write_all(line.as_bytes())?; + let mut line_buffer = String::new(); + let mut count = 0; + let mut header_count = 0; + let numlines_to_read = 4 * options.num_records; + let max_header_lines = 20; + while options.tidy || count < numlines_to_read { + line_buffer.clear(); - // // writeln!(writer, "{}", line)?; - // } + let bytes_read = bufreader.read_line(&mut line_buffer)?; + if bytes_read == 0 { + break; + } - println!("tempfile: {:?}", tempfile.path()); + // if the line read is presumably a comment line, do not count it as a read line until the count reaches the max_header_lines + if (line_buffer.starts_with('#') || line_buffer.starts_with('@')) + && header_count < max_header_lines + { + header_count += 1; + } else { + count += 1 + } + + // write line into tempfile + tempfile.write_all(line_buffer.as_bytes())?; + } + } Ok(Self::TempFile(tempfile)) } } + +// check if the gz file is particularly in BGZF format +fn is_gzfile_in_bgzf(header_buffer: &[u8]) -> bool { + // check if the header is BGZF + + // check if the header is long enough + let header_buffer_length = header_buffer.len() >= 15; + + // GZ Flag = 4, means that there is an extra field + let flag = header_buffer[3] == 0x04; + + // SI1 field of the extra field is 66 + let si1 = header_buffer[12] == 0x42; + + // SI2 field of the extra field is 67 + let si2 = header_buffer[13] == 0x43; + + // subfield length in the extra field is 2 + let slen = header_buffer[14] == 0x02; + + header_buffer_length && flag && si1 && si2 && slen +} + +#[derive(Debug)] +pub enum CompressedFormat { + Bgzf, + GZ, + 
BZ2, + None, +} diff --git a/tests/inputs/toy_invalid_flag.sam b/tests/inputs/toy_invalid_flag.sam new file mode 100644 index 0000000..cc9c127 --- /dev/null +++ b/tests/inputs/toy_invalid_flag.sam @@ -0,0 +1,14 @@ +@SQ SN:ref LN:45 +@SQ SN:ref2 LN:40 +r001 163 ref 7 30 8M4I4M1D3M = 37 39 TTAGATAAAGAGGATACTG * XX:B:S,12561,2,20,112 +r002 A ref 9 30 1S2I6M1P1I1P1I4M2I * 0 0 AAAAGATAAGGGATAAA * +r003 0 ref 9 30 5H6M * 0 0 AGCTAA * +r004 0 ref 16 30 6M14N1I5M * 0 0 ATAGCTCTCAGC * +r003 16 ref 29 30 6H5M * 0 0 TAGGC * +r001 83 ref 37 30 9M = 7 -39 CAGCGCCAT * +x1 0 ref2 1 30 20M * 0 0 aggttttataaaacaaataa ???????????????????? +x2 0 ref2 2 30 21M * 0 0 ggttttataaaacaaataatt ????????????????????? +x3 0 ref2 6 30 9M4I13M * 0 0 ttataaaacAAATaattaagtctaca ?????????????????????????? +x4 0 ref2 10 30 25M * 0 0 CaaaTaattaagtctacagagcaac ????????????????????????? +x5 0 ref2 12 30 24M * 0 0 aaTaattaagtctacagagcaact ???????????????????????? +x6 0 ref2 14 30 23M * 0 0 Taattaagtctacagagcaacta ??????????????????????? diff --git a/tests/integration_tests.rs b/tests/integration_tests.rs index 3f5d24c..5be3a79 100644 --- a/tests/integration_tests.rs +++ b/tests/integration_tests.rs @@ -7,18 +7,39 @@ use common::{calculate_checksum, tataki}; /* test cases: -- default -- -f yaml -- -f json --cache-dir -- -o file -- -c conf -- --dry-run -- --quiet -- --verbose -- -c cwl.conf +1. default +2. -f yaml +3. -f json --cache-dir +4. -o file +5. -c conf +6. --dry-run +7. --quiet +8. --verbose +9. -c cwl.conf + +new test cases involving new features: +10. --num-records +11. --tidy +12. read STDIN + 1. --num-records - (check #lines of a tempfile) + 2. gzipped stdin + 3. conflicts w/ `--tidy` + 4. conflictl w/ cwl extension mode +13. read compressed file + 1. types + 1. read gzipped file + 2. read bz2 file + 3. read bzgf file + 2. --no-decompress + 3. --tidy + 4. conflicts w/ cwl extension mode + + +13. --no-decompress */ #[test] +// 1. 
default fn output_in_csv() { let out = tataki(&["./inputs/toy.sam", "./inputs/toy.fa"], &[]); @@ -42,6 +63,7 @@ fn output_in_csv() { } #[test] +// 2. -f yaml fn output_in_yaml() { let out = tataki(&["./inputs/toy.sam", "./inputs/toy.fa"], &["-f", "yaml"]); @@ -62,6 +84,7 @@ fn output_in_yaml() { } #[test] +// 3. -f json --cache-dir fn output_in_json_and_can_keep_cache() { let out = tataki( &[ @@ -110,7 +133,7 @@ fn output_in_json_and_can_keep_cache() { } #[test] - +// 4. -o file fn can_output_to_file() { let _ = tataki( &["./inputs/toy.sam", "./inputs/toy.fa"], @@ -125,11 +148,12 @@ fn can_output_to_file() { assert_eq!( output_sha256, - "81afa82dcd25f408d0f9a1e3ef01f360c158bb3cdbe2e59e8b6f648a34c8972c" + "b60ee9b2d903fa08aa29575e2a1b719ce3b678b374e2fb57ee64355e10534840" ); } #[test] +// 5. -c conf // Check if the output becomes null when a conf without sam and fasta is specified. fn can_use_config_file() { let out = tataki( @@ -157,6 +181,7 @@ fn can_use_config_file() { } #[test] +// 6. --dry-run fn can_dry_run() { let out = tataki(&["./inputs/toy.sam", "./inputs/toy.fa"], &["--dry-run"]); @@ -177,6 +202,7 @@ fn can_dry_run() { } #[test] +// 7. --quiet fn can_be_quiet() { let out = tataki(&["./inputs/toy.sam", "./inputs/toy.fa"], &["--quiet"]); @@ -186,6 +212,7 @@ fn can_be_quiet() { } #[test] +// 8. --verbose fn can_be_verbose() { let out = tataki(&["./inputs/toy.sam", "./inputs/toy.fa"], &["--verbose"]); @@ -195,6 +222,7 @@ fn can_be_verbose() { } #[test] +// 9. -c cwl.conf fn can_run_cwl() { let out = tataki( &["./inputs/toy.py", "./inputs/toy.fa"], @@ -219,3 +247,53 @@ fn can_run_cwl() { assert_eq!(output_records, expected_output_records); } + +#[test] +// 10. --num-records +// Check if tataki only reads a single records. The second line of the input file is in abnormal format. If tataki reads more than one record, this assert fails. 
+fn can_limit_the_number_of_output_records() { + let out = tataki(&["./inputs/toy_invalid_flag.sam"], &["--num-records", "1"]); + + let stdout = out.stdout; + + let mut rdr = csv::Reader::from_reader(stdout.as_bytes()); + let output_records = rdr + .records() + .collect::, csv::Error>>() + .expect("Failed to parse the output as CSV"); + + let mut expected_output_rdr = + csv::Reader::from_path(Path::new("tests/outputs/expected_output_num_records.csv")) + .expect("Failed to read the expected output file"); + let expected_output_records = expected_output_rdr + .records() + .collect::, csv::Error>>() + .expect("Failed to parse the expected output as CSV"); + + assert_eq!(output_records, expected_output_records); +} + +#[test] +// 11. --tidy +// Check if tataki attempts to read the whole lines of the input file and fails when parsing the line right after `--num-records` lines. +fn can_read_entirety_of_input_file() { + let out = tataki(&["./inputs/toy.sam", "./inputs/toy.fa"], &["--tidy"]); + + let stdout = out.stdout; + + let mut rdr = csv::Reader::from_reader(stdout.as_bytes()); + let output_records = rdr + .records() + .collect::, csv::Error>>() + .expect("Failed to parse the output as CSV"); + + let mut expected_output_rdr = + csv::Reader::from_path(Path::new("tests/outputs/expected_output.csv")) + .expect("Failed to read the expected output file"); + let expected_output_records = expected_output_rdr + .records() + .collect::, csv::Error>>() + .expect("Failed to parse the expected output as CSV"); + + assert_eq!(output_records, expected_output_records); +} diff --git a/tests/outputs/expected_output.csv b/tests/outputs/expected_output.csv index 7d9b373..e3743fb 100644 --- a/tests/outputs/expected_output.csv +++ b/tests/outputs/expected_output.csv @@ -1,3 +1,3 @@ -File Path,Edam ID,Label -./inputs/toy.sam,http://edamontology.org/format_2573,SAM -./inputs/toy.fa,http://edamontology.org/format_1929,FASTA +File Path,Edam ID,Label,Decompressed ID,Decompressed Label 
+./inputs/toy.sam,http://edamontology.org/format_2573,SAM,, +./inputs/toy.fa,http://edamontology.org/format_1929,FASTA,, diff --git a/tests/outputs/expected_output.json b/tests/outputs/expected_output.json index 5fd3516..993a395 100644 --- a/tests/outputs/expected_output.json +++ b/tests/outputs/expected_output.json @@ -1 +1 @@ -{"./inputs/toy.sam":{"id":"http://edamontology.org/format_2573","label":"SAM"},"https://github.com/sapporo-wes/tataki/raw/main/tests/inputs/toy.fa":{"id":"http://edamontology.org/format_1929","label":"FASTA"}} \ No newline at end of file +{"./inputs/toy.sam":{"id":"http://edamontology.org/format_2573","label":"SAM","decompressed":{"label":null,"id":null}},"https://github.com/sapporo-wes/tataki/raw/main/tests/inputs/toy.fa":{"id":"http://edamontology.org/format_1929","label":"FASTA","decompressed":{"label":null,"id":null}}} \ No newline at end of file diff --git a/tests/outputs/expected_output.yaml b/tests/outputs/expected_output.yaml index 068ab8c..b53f516 100644 --- a/tests/outputs/expected_output.yaml +++ b/tests/outputs/expected_output.yaml @@ -1,6 +1,12 @@ ./inputs/toy.sam: label: SAM id: http://edamontology.org/format_2573 + decompressed: + label: null + id: null ./inputs/toy.fa: label: FASTA id: http://edamontology.org/format_1929 + decompressed: + label: null + id: null diff --git a/tests/outputs/expected_output_module_order.csv b/tests/outputs/expected_output_module_order.csv index 75b9d67..872c406 100644 --- a/tests/outputs/expected_output_module_order.csv +++ b/tests/outputs/expected_output_module_order.csv @@ -1,3 +1,3 @@ -File Path,Edam ID,Label -./inputs/toy.sam,, -./inputs/toy.fa,, +File Path,Edam ID,Label,Decompressed ID,Decompressed Label +./inputs/toy.sam,,,, +./inputs/toy.fa,,,, diff --git a/tests/outputs/expected_output_num_records.csv b/tests/outputs/expected_output_num_records.csv new file mode 100644 index 0000000..615fd60 --- /dev/null +++ b/tests/outputs/expected_output_num_records.csv @@ -0,0 +1,2 @@ +File Path,Edam 
ID,Label,Decompressed ID,Decompressed Label +./inputs/toy_invalid_flag.sam,http://edamontology.org/format_2573,SAM,, diff --git a/tests/outputs/expected_output_run_cwl.csv b/tests/outputs/expected_output_run_cwl.csv index d643b7d..2c93f1f 100644 --- a/tests/outputs/expected_output_run_cwl.csv +++ b/tests/outputs/expected_output_run_cwl.csv @@ -1,3 +1,3 @@ -File Path,Edam ID,Label -./inputs/toy.py,http://edamontology.org/format_3996,Python script -./inputs/toy.fa,, +File Path,Edam ID,Label,Decompressed ID,Decompressed Label +./inputs/toy.py,http://edamontology.org/format_3996,Python script,, +./inputs/toy.fa,,,,