From 8d9d75626f4c1abdf3d987154d69e21399183edb Mon Sep 17 00:00:00 2001 From: fmaccha Date: Thu, 11 Jul 2024 15:21:36 +0900 Subject: [PATCH] #1 #2 add support for STDIN and compressed format (gzip, bz2) --- src/args.rs | 5 +- src/buffered_read_seek.rs | 11 +- src/edam.rs | 1 + src/ext_tools.rs | 2 +- src/module.rs | 244 ++++++++++++-- src/source.rs | 304 +++++++++++++----- tests/inputs/toy_invalid_flag.sam | 14 + tests/integration_tests.rs | 100 +++++- tests/outputs/expected_output.csv | 6 +- tests/outputs/expected_output.json | 2 +- tests/outputs/expected_output.yaml | 6 + .../outputs/expected_output_module_order.csv | 6 +- tests/outputs/expected_output_num_records.csv | 2 + tests/outputs/expected_output_run_cwl.csv | 6 +- 14 files changed, 585 insertions(+), 124 deletions(-) create mode 100644 tests/inputs/toy_invalid_flag.sam create mode 100644 tests/outputs/expected_output_num_records.csv diff --git a/src/args.rs b/src/args.rs index 51a3148..7b16573 100644 --- a/src/args.rs +++ b/src/args.rs @@ -49,12 +49,11 @@ pub struct Args { #[clap(short, long, conflicts_with_all = ["num_records"])] pub tidy: bool, - // TODO /// Do not try to decompress the input file when detecting the file format. - #[clap(long, hide = true)] + #[clap(long)] pub no_decompress: bool, - /// Number of records to read from the input file. Conflicts with `--tidy` option. + /// Number of records to read from the input file. Recommended to set it to a multiple of 4 to prevent false negatives. Conflicts with `--tidy` option. 
#[clap(short, long, default_value = "100000", value_parser = validate_num_records_greater_than_zero)] pub num_records: usize, diff --git a/src/buffered_read_seek.rs b/src/buffered_read_seek.rs index 5d68e86..0db6a58 100644 --- a/src/buffered_read_seek.rs +++ b/src/buffered_read_seek.rs @@ -19,21 +19,20 @@ impl OnetimeRewindableReader { impl Read for OnetimeRewindableReader { fn read(&mut self, buf: &mut [u8]) -> io::Result { if let Some(ref mut buffer) = self.buffer { - // 既にバッファがある場合はそこから読み取る + // read from the buffer if it exists let count = buffer.read(buf)?; if count == 0 { - // バッファが空になった場合は、元の入力から読み取る - let innercount = self.inner.read(buf); - innercount + // if the buffer is empty, read from the original input + self.inner.read(buf) } else { Ok(count) } } else { - // 初回の読み取りでバッファを準備 + // Prepare the buffer for the first read let mut temp_buffer = vec![0; buf.len()]; let count = self.inner.read(&mut temp_buffer)?; if count > 0 { - // 読み取ったデータをバッファとして保存 + // save the read data as a buffer self.buffer = Some(Cursor::new(temp_buffer)); self.read(buf) } else { diff --git a/src/edam.rs b/src/edam.rs index b8aaac5..1f75f1f 100644 --- a/src/edam.rs +++ b/src/edam.rs @@ -63,6 +63,7 @@ impl EdamMap { } } +// An internal struct to deserialize EDAM table. 
#[derive(Debug, Clone, Deserialize)] struct Edam { #[serde(rename = "Class ID")] diff --git a/src/ext_tools.rs b/src/ext_tools.rs index b528e9b..44c2afa 100644 --- a/src/ext_tools.rs +++ b/src/ext_tools.rs @@ -17,7 +17,7 @@ pub fn invoke( cwl_file_path: &Path, target_file_path: &Path, cwl_input_file_path: &NamedTempFile, - options: &InvokeOptions, + _options: &InvokeOptions, ) -> Result { info!("Invoking ext_tools {}", cwl_file_path.display()); diff --git a/src/module.rs b/src/module.rs index eb03f22..4b7ddb1 100644 --- a/src/module.rs +++ b/src/module.rs @@ -1,5 +1,5 @@ use anyhow::{anyhow, bail, Result}; -use log::{debug, error, info}; +use log::{debug, info, warn}; use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::path::{Path, PathBuf}; @@ -7,7 +7,7 @@ use tempfile::{NamedTempFile, TempDir}; use url::Url; use crate::args::{Args, OutputFormat}; -use crate::source::Source; +use crate::source::{Source, CompressedFormat}; // Struct to store the result of Parser invocation and ExtTools invocation. 
#[derive(Debug)] @@ -17,6 +17,21 @@ pub struct ModuleResult { label: Option, id: Option, error_message: Option, + decompressed: Option, +} + +impl From<&CompressedFormat> for ModuleResult { + fn from(compressed_format: &CompressedFormat) -> Self { + match compressed_format { + CompressedFormat::Bgzf => ModuleResult::with_result(None, None), + CompressedFormat::GZ => ModuleResult::with_result( + Some("GZIP format".to_string()), + Some("http://edamontology.org/format_3989".to_string()), + ), + CompressedFormat::BZ2 => ModuleResult::with_result(None, None), + CompressedFormat::None => ModuleResult::with_result(None, None), + } + } } impl ModuleResult { @@ -27,6 +42,7 @@ impl ModuleResult { label, id, error_message: None, + decompressed: None, } } @@ -54,6 +70,20 @@ impl ModuleResult { self.input = input; } + fn swap_edam_of_module_result_and_compressed_format(&mut self, compressed_format_edam: Self) { + let tmp_label = self.label.to_owned(); + let tmp_id = self.id.to_owned(); + + self.label = compressed_format_edam.label; + self.id = compressed_format_edam.id; + + let tmp_decompressed = DecompressedFormat { + label: tmp_label, + id: tmp_id, + }; + self.decompressed = Some(tmp_decompressed); + } + pub fn create_module_results_string( module_results: &[ModuleResult], format: OutputFormat, @@ -65,13 +95,15 @@ impl ModuleResult { .delimiter(delimiter) .from_writer(&mut data); - writer.write_record(["File Path", "Edam ID", "Label"])?; + writer.write_record(["File Path", "Edam ID", "Label", "Decompressed ID", "Decompressed Label"])?; for module_result in module_results.iter() { writer.serialize(( &module_result.input, &module_result.id, &module_result.label, + &module_result.decompressed.as_ref().and_then(|d| d.id.as_ref()), + &module_result.decompressed.as_ref().and_then(|d| d.label.as_ref()), ))?; } } @@ -82,12 +114,71 @@ impl ModuleResult { match format { OutputFormat::Yaml => { - let mut serialized_map = HashMap::new(); + let mut serialized_map: HashMap = 
HashMap::new(); for module_result in module_results { let target_file_path = &module_result.input; + + // create yaml map for decompressed field + let mut de_map : HashMap = HashMap::new(); + match &module_result.decompressed { + Some(decompressed) => { + match &decompressed.id { + Some(id) => { + de_map.insert("id".to_string(), serde_yaml::Value::String(id.clone())); + }, + None => { + de_map.insert("id".to_string(), serde_yaml::Value::Null); + } + } + match &decompressed.label { + Some(label) => { + de_map.insert("label".to_string(), serde_yaml::Value::String(label.clone())); + }, + None => { + de_map.insert("label".to_string(), serde_yaml::Value::Null); + } + } + }, + None => { + de_map.insert("id".to_string(), serde_yaml::Value::Null); + de_map.insert("label".to_string(), serde_yaml::Value::Null); + } + } + + // create yaml map for label and id fields + let mut comp_map : HashMap = HashMap::new(); + match &module_result.id { + Some(id) => { + comp_map.insert("id".to_string(), serde_yaml::Value::String(id.clone())); + }, + None => { + comp_map.insert("id".to_string(), serde_yaml::Value::Null); + } + } + match &module_result.label { + Some(label) => { + comp_map.insert("label".to_string(), serde_yaml::Value::String(label.clone())); + }, + None => { + comp_map.insert("label".to_string(), serde_yaml::Value::Null); + } + } + + // add decompressed field to the yaml map + comp_map.insert("decompressed".to_string(), serde_yaml::to_value(de_map)?); + // match &module_result.decompressed { + // Some(decompressed) => { + // comp_map.insert("decompressed".to_string(), serde_yaml::to_value(decompressed)?); + // }, + // None => { + // comp_map.insert("decompressed".to_string(), serde_yaml::Value::Null); + // } + // } + + serialized_map.insert( target_file_path.clone(), - HashMap::from([("id", &module_result.id), ("label", &module_result.label)]), + serde_yaml::to_value(comp_map)?, ); } @@ -97,12 +188,73 @@ impl ModuleResult { OutputFormat::Tsv => 
csv_serialize(module_results, b'\t'), OutputFormat::Csv => csv_serialize(module_results, b','), OutputFormat::Json => { - let mut serialized_map = HashMap::new(); + let mut serialized_map: HashMap = HashMap::new(); for module_result in module_results { let target_file_path = &module_result.input; + + // create json map for decompressed field + let mut de_map : HashMap = HashMap::new(); + match &module_result.decompressed { + Some(decompressed) => { + match &decompressed.id { + Some(id) => { + de_map.insert("id".to_string(), serde_json::Value::String(id.clone())); + }, + None => { + de_map.insert("id".to_string(), serde_json::Value::Null); + } + } + match &decompressed.label { + Some(label) => { + de_map.insert("label".to_string(), serde_json::Value::String(label.clone())); + }, + None => { + de_map.insert("label".to_string(), serde_json::Value::Null); + } + } + }, + None => { + de_map.insert("id".to_string(), serde_json::Value::Null); + de_map.insert("label".to_string(), serde_json::Value::Null); + } + } + + // create json map for label and id fields + let mut comp_map : HashMap = HashMap::new(); + match &module_result.id { + Some(id) => { + comp_map.insert("id".to_string(), serde_json::Value::String(id.clone())); + }, + None => { + comp_map.insert("id".to_string(), serde_json::Value::Null); + } + } + match &module_result.label { + Some(label) => { + comp_map.insert("label".to_string(), serde_json::Value::String(label.clone())); + }, + None => { + comp_map.insert("label".to_string(), serde_json::Value::Null); + } + } + + // add components field to the json map + comp_map.insert("decompressed".to_string(), serde_json::to_value(de_map)?); + // match &module_result.decompressed { + // Some(decompressed) => { + // comp_map.insert("decompressed".to_string(), serde_json::to_value(decompressed)?); + // }, + // None => { + // comp_map.insert("decompressed".to_string(), serde_json::Value::Null); + // } + // } + + + + serialized_map.insert( target_file_path.clone(), - 
HashMap::from([("id", &module_result.id), ("label", &module_result.label)]), + serde_json::to_value(comp_map)?, ); } @@ -113,6 +265,12 @@ impl ModuleResult { } } +#[derive(Debug, Serialize, Deserialize)] +pub struct DecompressedFormat { + label: Option, + id: Option, +} + // Struct to deserialize the contents of the conf file. #[derive(Debug, Serialize, Deserialize)] pub struct Config { @@ -135,18 +293,25 @@ impl From<&Args> for InvokeOptions { } } + + pub fn run(config: Config, args: Args) -> Result<()> { crate::logger::init_logger(args.verbose, args.quiet); info!("tataki started"); debug!("Args: {:?}", args); debug!("Output format: {:?}", args.get_output_format()); - // let invoke_options = InvokeOptions::from(&args); let invoke_options = InvokeOptions::from(&args); + let cwl_module_exists = cwl_module_exists(&config)?; + + // validate the user-provided options and input arguments to ensure they are suitable for execution. + check_run_condition_cwl_module(&args.input, cwl_module_exists, &invoke_options)?; + let temp_dir = crate::fetch::create_temporary_dir(&args.cache_dir)?; info!("Created temporary directory: {}", temp_dir.path().display()); + // create an empty vector to store the results of each module invocation. let mut module_results: Vec = Vec::new(); // insert "empty" module at the beginning of the module order, so that the empty module is always invoked first. @@ -154,12 +319,11 @@ pub fn run(config: Config, args: Args) -> Result<()> { config.order.insert(0, "empty".to_string()); for input in &args.input { + let mut input = input.clone(); info!("Processing input: {}", input); - // TODO: ここで、inputがstdinだったらtidyゆるさないよ、とかやる必要がある。 - // TODO: --tidy + 圧縮path の場合はreaderで渡したほうがいい気がする。 // Check if the input is stdin or path. If path, download the file if it is a url. - let target_source = match input.parse::()? { + let (target_source , compressed_format) = match input.parse::()? { Source::FilePath(p) => { // Prepare input file path from url or local file path. 
// Download the file and store it in the specified cache directory if input is url. @@ -171,7 +335,7 @@ pub fn run(config: Config, args: Args) -> Result<()> { path } None => { - let path = PathBuf::from(input); + let path = PathBuf::from(&input); if !path.exists() { bail!("The specified target file does not exist. Please check the path. : {}" ,path.display() ); @@ -179,11 +343,22 @@ pub fn run(config: Config, args: Args) -> Result<()> { path } }; - Source::FilePath(target_file_path) - // TODO 必要だったらtempfileにせなかん + + let (source, compressed_format) = Source::decompress_into_tempfile_from_filepath_if_needed( + &target_file_path, + &invoke_options, + &temp_dir, + cwl_module_exists, + )?; + + match source { + Some(source) => {(source, compressed_format)}, + None => { ( Source::FilePath(target_file_path), compressed_format)}, + } } Source::Stdin => { - unimplemented!("Stdin is not supported yet. Will be implemented in upcoming versions."); + info!("Reading from STDIN..."); + input = "STDIN".to_string(); Source::convert_into_tempfile_from_stdin(&invoke_options, &temp_dir)? }, Source::TempFile(_) => unreachable!(), @@ -192,13 +367,19 @@ pub fn run(config: Config, args: Args) -> Result<()> { let mut module_result = run_modules(target_source, &config, &temp_dir, &invoke_options)?; + let compressed_format_edam = ModuleResult::from(&compressed_format); + // must swap the edam of the module result and the compressed format if decompress has been done. + match compressed_format { + CompressedFormat::None =>{}, + CompressedFormat::Bgzf => {}, + _ => { + module_result.swap_edam_of_module_result_and_compressed_format(compressed_format_edam); + } + } module_result.set_input(input.clone()); module_results.push(module_result); - // TODO ここでSource::TempFileを使った場合は消す。 - - } // if args.cache_dir is Some, keep the temporary directory. 
@@ -269,7 +450,7 @@ fn run_modules( let result = match module_extension { "" => crate::parser::invoke(module, &target_source, invoke_options), "cwl" => { - // TODO ここ、match使わずに書けないだろうか + // TODO might want to refactor this without using match statement. // CWL module invocation is skipped if the input is not a file path or URL. match target_source.as_path() { Some(target_file_path) => { @@ -304,7 +485,8 @@ fn run_modules( } }, Err(e) => { - error!("An error occurred while trying to invoke the \'{}\' module. Reason:\n{}", module, e); + // TODO fix an issue that a error here is absorbed by the find_map function. + warn!("An error occurred while trying to invoke the \'{}\' module. Reason:\n{}", module, e); None }, } @@ -338,3 +520,25 @@ fn cwl_module_exists(config: &Config) -> Result { } Ok(false) } + +fn check_run_condition_cwl_module(inputs: &[String], cwl_module_exists: bool, invoke_options: &InvokeOptions) -> Result<()> { + // whether stdin is included in the input + let stdin_exists = inputs.iter().any(|input| input == "-"); + + /* + cwl + - filepath + - plain, --no-decompress + - ok + - compressed + - tidy needed + - stdin + - tidy needed + */ + + if cwl_module_exists && stdin_exists && !invoke_options.tidy { + bail!("The `--tidy` option is required when reading from STDIN and invoking CWL modules."); + } + + Ok(()) +} diff --git a/src/source.rs b/src/source.rs index ef43e68..be35c8d 100644 --- a/src/source.rs +++ b/src/source.rs @@ -1,6 +1,7 @@ use anyhow::{bail, Result}; use bzip2::read::BzDecoder; use flate2::read::GzDecoder; +use log::{debug, warn}; use std::fs::File; use std::io::{BufRead, BufReader, Read, Seek, Write}; use std::path::{Path, PathBuf}; @@ -57,115 +58,272 @@ impl Source { } } + /* pub fn as_memory(&self) -> Option<&Vec> { match self { Self::Memory(m) => Some(m), _ => None, } } + */ - pub fn convert_into_tempfile_from_file( + // if the input is compressed and no_decompress is false, decompress the input, save it into tempfile and return 
Source::FilePath(tempfile path) + // if the input is either not compressed or no_decompress is true, return None + pub fn decompress_into_tempfile_from_filepath_if_needed( input_path: &Path, options: &InvokeOptions, temp_dir: &TempDir, - ) -> Result { - let file = File::open(input_path)?; - Self::convert_into_tempfile(file, options, temp_dir) + cwl_modules_exists: bool, + ) -> Result<(Option, CompressedFormat)> { + let mut file = File::open(input_path)?; + + // read first 100 bytes from reader in order to infer compression format + let mut buffer = [0; 100]; + let bytes_read = file.read(&mut buffer)?; + + // rewind the reader to the beginning + file.rewind()?; + + // TODO check + // if not inferrable, then None. + let compressed_format: CompressedFormat = + if let Some(inferred_type) = infer::get(&buffer[..bytes_read]) { + let extension = inferred_type.extension(); + match extension { + "gz" => { + // check if the gz file is in BGZF format + if is_gzfile_in_bgzf(&buffer) { + debug!("Provided input is in BGZF format"); + CompressedFormat::Bgzf + } else { + debug!("Provided input is in GZ format"); + CompressedFormat::GZ + } + } + "bz2" => { + debug!("Provided input is in BZ2 format"); + CompressedFormat::BZ2 + } + _ => { + warn!("Provided input is in compressed format not supported by this tool"); + CompressedFormat::None + } + } + } else { + // type was not inferred, return None + warn!("Provided input is in compressed format not supported by this tool"); + CompressedFormat::None + }; + + // if input is plain text or in BGZF, we are not going to decompress it + let mut inferred_reader: Box = match compressed_format { + CompressedFormat::Bgzf => { + // if BGZF, save the reader as is + return Ok((None, CompressedFormat::Bgzf)); + } + CompressedFormat::GZ => { + // if GZ, save the GzDecoder reader + let decoder = GzDecoder::new(file); + Box::new(decoder) + } + CompressedFormat::BZ2 => { + // if BZ2, save the BzDecoder reader + let decoder = BzDecoder::new(file); + 
Box::new(decoder) + } + CompressedFormat::None => { + // if None, save the reader as is + return Ok((None, CompressedFormat::None)); + } + }; + + // here the input is compressed and not BGZF + if options.no_decompress { + Ok((None, compressed_format)) + } else { + if cwl_modules_exists && !options.tidy { + bail!("The `--tidy` options is required when using CWL modules with compressed input files. If you want to treat the input file as is and not decompress it, please use the `--no-decompress` option."); + } + let decompressed_tempfile = Self::read_numrecords_save_into_tempfile( + &mut inferred_reader, + options, + temp_dir, + false, + )?; + + Ok((Some(decompressed_tempfile), compressed_format)) + } } + // if the input from stdin is compressed and no_decompress is false, decompress the input, save it into tempfile and return Source::FilePath(tempfile path) + // if the input from stdin is either not compressed or no_decompress is true, save it into tempfile and return Source::FilePath(tempfile path) pub fn convert_into_tempfile_from_stdin( options: &InvokeOptions, temp_dir: &TempDir, - ) -> Result { + ) -> Result<(Self, CompressedFormat)> { let stdin = std::io::stdin(); let handle = stdin.lock(); - Self::convert_into_tempfile(handle, options, temp_dir) - } - fn convert_into_tempfile( - readable: R, - options: &InvokeOptions, - temp_dir: &TempDir, - ) -> Result { - // create a writer for tempfile - let mut tempfile = NamedTempFile::new_in(temp_dir)?; - // let mut writer = BufWriter::new(&tempfile); + let mut onetime_reader = OnetimeRewindableReader::new(handle); - let mut tmp_reader = OnetimeRewindableReader::new(readable); - - // read first 10 bytes from reader in order to infer compression format + // read first 100 bytes from reader in order to infer compression format let mut buffer = [0; 100]; - let bytes_read = tmp_reader.read(&mut buffer)?; - - println!("buffer: {:?}", String::from_utf8(buffer.to_vec())); + let bytes_read = onetime_reader.read(&mut buffer)?; // 
rewind the reader to the beginning - tmp_reader.rewind()?; - - // let mut buffer = [0; 10]; - // let bytes_read = tmp_reader.read(&mut buffer)?; - // println!("buffer: {:?}", String::from_utf8(buffer.to_vec())); - - // let bytes_read = tmp_reader.read(&mut buffer)?; - // println!("buffer: {:?}", String::from_utf8(buffer.to_vec())); - - // let bytes_read = tmp_reader.read(&mut buffer)?; - // println!("buffer: {:?}", String::from_utf8(buffer.to_vec())); - // std::process::exit(1); - - let reader: Box = if options.no_decompress { - Box::new(tmp_reader) - } else if let Some(inferred_type) = infer::get(&buffer[..bytes_read]) { - let extension = inferred_type.extension(); - match extension { - "gz" => { - // TODO BAMのBGZFがここで誤認されないようにする - let decoder = GzDecoder::new(tmp_reader); - println!("gz!!!"); + onetime_reader.rewind()?; + + let mut is_bgzf = false; + + // if not inferrable, then None. + // TODO refactor into a function + let compressed_format: CompressedFormat = + if let Some(inferred_type) = infer::get(&buffer[..bytes_read]) { + let extension = inferred_type.extension(); + match extension { + "gz" => { + // check if the gz file is in BGZF format + if is_gzfile_in_bgzf(&buffer) { + is_bgzf = true; + debug!("Provided input is in BGZF format"); + CompressedFormat::Bgzf + } else { + debug!("Provided input is in GZ format"); + CompressedFormat::GZ + } + } + "bz2" => { + debug!("Provided input is in BZ2 format"); + CompressedFormat::BZ2 + } + _ => { + warn!("Provided input is in compressed format not supported by this tool"); + CompressedFormat::None + } + } + } else { + // type was not inferred, return None + warn!("Provided input is in compressed format not supported by this tool"); + CompressedFormat::None + }; + + // use the reader as is if no_decompress is true + let mut inferred_reader: Box = if options.no_decompress { + Box::new(onetime_reader) + } else { + match compressed_format { + CompressedFormat::Bgzf => { + // if BGZF, save the reader as is + 
Box::new(onetime_reader) + } + CompressedFormat::GZ => { + // if GZ, save the GzDecoder reader + let decoder = GzDecoder::new(onetime_reader); Box::new(decoder) } - "bz2" => { - let decoder = BzDecoder::new(tmp_reader); - println!("bz2!!!"); + CompressedFormat::BZ2 => { + // if BZ2, save the BzDecoder reader + let decoder = BzDecoder::new(onetime_reader); Box::new(decoder) } - _ => { - panic!("Unexpected infer result"); + CompressedFormat::None => { + // if None, save the reader as is + Box::new(onetime_reader) } } - } else { - Box::new(tmp_reader) }; - let mut bufreader = BufReader::new(reader); + let tempfile_from_stdin = Self::read_numrecords_save_into_tempfile( + &mut inferred_reader, + options, + temp_dir, + is_bgzf || options.no_decompress, + )?; - // TODO tmpに保存する行数を4倍くらいにする。memo参照 - let mut line_buffer = String::new(); - let mut count = 0; - while options.tidy || count < options.num_records { - line_buffer.clear(); - let bytes_read = bufreader.read_line(&mut line_buffer)?; - if bytes_read == 0 { - break; - } - count += 1; - // write line into tempfile - tempfile.write_all(line_buffer.as_bytes())?; + Ok((tempfile_from_stdin, compressed_format)) + } + + fn read_numrecords_save_into_tempfile( + reader: &mut R, + options: &InvokeOptions, + temp_dir: &TempDir, + is_binary: bool, + ) -> Result { + // create a writer for tempfile + let mut tempfile = NamedTempFile::new_in(temp_dir)?; + + // if the input is binary, such as BGZF, read (100 * num_records) bytes and save it into tempfile + if is_binary { + let total_bytes_copied: u64 = if options.tidy { + std::io::copy(reader, &mut tempfile)? + } else { + let bytes_to_copy = 100 * options.num_records; + let mut limited_src = reader.take(bytes_to_copy as u64); + std::io::copy(&mut limited_src, &mut tempfile)? 
+ }; + debug!("Bytes read from STDIN: {}", total_bytes_copied); } + // if not in binary, read the first 4 * num_records lines (plus header) and save it into tempfile + else { + let mut bufreader = BufReader::new(reader); - // for (count, line) in bufreader.lines().enumerate() { - // if !options.tidy && count >= options.num_records { - // break; - // } - // let line = line?; - // // write line into tempfile - // tempfile.write_all(line.as_bytes())?; + let mut line_buffer = String::new(); + let mut count = 0; + let mut header_count = 0; + let numlines_to_read = 4 * options.num_records; + let max_header_lines = 20; + while options.tidy || count < numlines_to_read { + line_buffer.clear(); - // // writeln!(writer, "{}", line)?; - // } + let bytes_read = bufreader.read_line(&mut line_buffer)?; + if bytes_read == 0 { + break; + } - println!("tempfile: {:?}", tempfile.path()); + // if the line read is presumably a comment line, do not count it as a read line until the count reaches the max_header_lines + if (line_buffer.starts_with('#') || line_buffer.starts_with('@')) + && header_count < max_header_lines + { + header_count += 1; + } else { + count += 1 + } + + // write line into tempfile + tempfile.write_all(line_buffer.as_bytes())?; + } + } Ok(Self::TempFile(tempfile)) } } + +// check if the gz file is particularly in BGZF format +fn is_gzfile_in_bgzf(header_buffer: &[u8]) -> bool { + // check if the header is BGZF + + // check if the header is long enough + let header_buffer_length = header_buffer.len() >= 15; + + // GZ Flag = 4, means that there is an extra field + let flag = header_buffer[3] == 0x04; + + // SI1 field of the extra field is 66 + let si1 = header_buffer[12] == 0x42; + + // SI2 field of the extra field is 67 + let si2 = header_buffer[13] == 0x43; + + // subfield length in the extra field is 2 + let slen = header_buffer[14] == 0x02; + + header_buffer_length && flag && si1 && si2 && slen +} + +#[derive(Debug)] +pub enum CompressedFormat { + Bgzf, + GZ, + 
BZ2, + None, +} diff --git a/tests/inputs/toy_invalid_flag.sam b/tests/inputs/toy_invalid_flag.sam new file mode 100644 index 0000000..cc9c127 --- /dev/null +++ b/tests/inputs/toy_invalid_flag.sam @@ -0,0 +1,14 @@ +@SQ SN:ref LN:45 +@SQ SN:ref2 LN:40 +r001 163 ref 7 30 8M4I4M1D3M = 37 39 TTAGATAAAGAGGATACTG * XX:B:S,12561,2,20,112 +r002 A ref 9 30 1S2I6M1P1I1P1I4M2I * 0 0 AAAAGATAAGGGATAAA * +r003 0 ref 9 30 5H6M * 0 0 AGCTAA * +r004 0 ref 16 30 6M14N1I5M * 0 0 ATAGCTCTCAGC * +r003 16 ref 29 30 6H5M * 0 0 TAGGC * +r001 83 ref 37 30 9M = 7 -39 CAGCGCCAT * +x1 0 ref2 1 30 20M * 0 0 aggttttataaaacaaataa ???????????????????? +x2 0 ref2 2 30 21M * 0 0 ggttttataaaacaaataatt ????????????????????? +x3 0 ref2 6 30 9M4I13M * 0 0 ttataaaacAAATaattaagtctaca ?????????????????????????? +x4 0 ref2 10 30 25M * 0 0 CaaaTaattaagtctacagagcaac ????????????????????????? +x5 0 ref2 12 30 24M * 0 0 aaTaattaagtctacagagcaact ???????????????????????? +x6 0 ref2 14 30 23M * 0 0 Taattaagtctacagagcaacta ??????????????????????? diff --git a/tests/integration_tests.rs b/tests/integration_tests.rs index 3f5d24c..5be3a79 100644 --- a/tests/integration_tests.rs +++ b/tests/integration_tests.rs @@ -7,18 +7,39 @@ use common::{calculate_checksum, tataki}; /* test cases: -- default -- -f yaml -- -f json --cache-dir -- -o file -- -c conf -- --dry-run -- --quiet -- --verbose -- -c cwl.conf +1. default +2. -f yaml +3. -f json --cache-dir +4. -o file +5. -c conf +6. --dry-run +7. --quiet +8. --verbose +9. -c cwl.conf + +new test cases involving new features: +10. --num-records +11. --tidy +12. read STDIN + 1. --num-records - (check #lines of a tempfile) + 2. gzipped stdin + 3. conflicts w/ `--tidy` + 4. conflictl w/ cwl extension mode +13. read compressed file + 1. types + 1. read gzipped file + 2. read bz2 file + 3. read bzgf file + 2. --no-decompress + 3. --tidy + 4. conflicts w/ cwl extension mode + + +13. --no-decompress */ #[test] +// 1. 
default fn output_in_csv() { let out = tataki(&["./inputs/toy.sam", "./inputs/toy.fa"], &[]); @@ -42,6 +63,7 @@ fn output_in_csv() { } #[test] +// 2. -f yaml fn output_in_yaml() { let out = tataki(&["./inputs/toy.sam", "./inputs/toy.fa"], &["-f", "yaml"]); @@ -62,6 +84,7 @@ fn output_in_yaml() { } #[test] +// 3. -f json --cache-dir fn output_in_json_and_can_keep_cache() { let out = tataki( &[ @@ -110,7 +133,7 @@ fn output_in_json_and_can_keep_cache() { } #[test] - +// 4. -o file fn can_output_to_file() { let _ = tataki( &["./inputs/toy.sam", "./inputs/toy.fa"], @@ -125,11 +148,12 @@ fn can_output_to_file() { assert_eq!( output_sha256, - "81afa82dcd25f408d0f9a1e3ef01f360c158bb3cdbe2e59e8b6f648a34c8972c" + "b60ee9b2d903fa08aa29575e2a1b719ce3b678b374e2fb57ee64355e10534840" ); } #[test] +// 5. -c conf // Check if the output becomes null when a conf without sam and fasta is specified. fn can_use_config_file() { let out = tataki( @@ -157,6 +181,7 @@ fn can_use_config_file() { } #[test] +// 6. --dry-run fn can_dry_run() { let out = tataki(&["./inputs/toy.sam", "./inputs/toy.fa"], &["--dry-run"]); @@ -177,6 +202,7 @@ fn can_dry_run() { } #[test] +// 7. --quiet fn can_be_quiet() { let out = tataki(&["./inputs/toy.sam", "./inputs/toy.fa"], &["--quiet"]); @@ -186,6 +212,7 @@ fn can_be_quiet() { } #[test] +// 8. --verbose fn can_be_verbose() { let out = tataki(&["./inputs/toy.sam", "./inputs/toy.fa"], &["--verbose"]); @@ -195,6 +222,7 @@ fn can_be_verbose() { } #[test] +// 9. -c cwl.conf fn can_run_cwl() { let out = tataki( &["./inputs/toy.py", "./inputs/toy.fa"], @@ -219,3 +247,53 @@ fn can_run_cwl() { assert_eq!(output_records, expected_output_records); } + +#[test] +// 10. --num-records +// Check if tataki only reads a single records. The second line of the input file is in abnormal format. If tataki reads more than one record, this assert fails. 
+fn can_limit_the_number_of_output_records() { + let out = tataki(&["./inputs/toy_invalid_flag.sam"], &["--num-records", "1"]); + + let stdout = out.stdout; + + let mut rdr = csv::Reader::from_reader(stdout.as_bytes()); + let output_records = rdr + .records() + .collect::, csv::Error>>() + .expect("Failed to parse the output as CSV"); + + let mut expected_output_rdr = + csv::Reader::from_path(Path::new("tests/outputs/expected_output_num_records.csv")) + .expect("Failed to read the expected output file"); + let expected_output_records = expected_output_rdr + .records() + .collect::, csv::Error>>() + .expect("Failed to parse the expected output as CSV"); + + assert_eq!(output_records, expected_output_records); +} + +#[test] +// 11. --tidy +// Check if tataki attempts to read the whole lines of the input file and fails when parsing the line right after `--num-records` lines. +fn can_read_entirety_of_input_file() { + let out = tataki(&["./inputs/toy.sam", "./inputs/toy.fa"], &["--tidy"]); + + let stdout = out.stdout; + + let mut rdr = csv::Reader::from_reader(stdout.as_bytes()); + let output_records = rdr + .records() + .collect::, csv::Error>>() + .expect("Failed to parse the output as CSV"); + + let mut expected_output_rdr = + csv::Reader::from_path(Path::new("tests/outputs/expected_output.csv")) + .expect("Failed to read the expected output file"); + let expected_output_records = expected_output_rdr + .records() + .collect::, csv::Error>>() + .expect("Failed to parse the expected output as CSV"); + + assert_eq!(output_records, expected_output_records); +} diff --git a/tests/outputs/expected_output.csv b/tests/outputs/expected_output.csv index 7d9b373..e3743fb 100644 --- a/tests/outputs/expected_output.csv +++ b/tests/outputs/expected_output.csv @@ -1,3 +1,3 @@ -File Path,Edam ID,Label -./inputs/toy.sam,http://edamontology.org/format_2573,SAM -./inputs/toy.fa,http://edamontology.org/format_1929,FASTA +File Path,Edam ID,Label,Decompressed ID,Decompressed Label 
+./inputs/toy.sam,http://edamontology.org/format_2573,SAM,, +./inputs/toy.fa,http://edamontology.org/format_1929,FASTA,, diff --git a/tests/outputs/expected_output.json b/tests/outputs/expected_output.json index 5fd3516..993a395 100644 --- a/tests/outputs/expected_output.json +++ b/tests/outputs/expected_output.json @@ -1 +1 @@ -{"./inputs/toy.sam":{"id":"http://edamontology.org/format_2573","label":"SAM"},"https://github.com/sapporo-wes/tataki/raw/main/tests/inputs/toy.fa":{"id":"http://edamontology.org/format_1929","label":"FASTA"}} \ No newline at end of file +{"./inputs/toy.sam":{"id":"http://edamontology.org/format_2573","label":"SAM","decompressed":{"label":null,"id":null}},"https://github.com/sapporo-wes/tataki/raw/main/tests/inputs/toy.fa":{"id":"http://edamontology.org/format_1929","label":"FASTA","decompressed":{"label":null,"id":null}}} \ No newline at end of file diff --git a/tests/outputs/expected_output.yaml b/tests/outputs/expected_output.yaml index 068ab8c..b53f516 100644 --- a/tests/outputs/expected_output.yaml +++ b/tests/outputs/expected_output.yaml @@ -1,6 +1,12 @@ ./inputs/toy.sam: label: SAM id: http://edamontology.org/format_2573 + decompressed: + label: null + id: null ./inputs/toy.fa: label: FASTA id: http://edamontology.org/format_1929 + decompressed: + label: null + id: null diff --git a/tests/outputs/expected_output_module_order.csv b/tests/outputs/expected_output_module_order.csv index 75b9d67..872c406 100644 --- a/tests/outputs/expected_output_module_order.csv +++ b/tests/outputs/expected_output_module_order.csv @@ -1,3 +1,3 @@ -File Path,Edam ID,Label -./inputs/toy.sam,, -./inputs/toy.fa,, +File Path,Edam ID,Label,Decompressed ID,Decompressed Label +./inputs/toy.sam,,,, +./inputs/toy.fa,,,, diff --git a/tests/outputs/expected_output_num_records.csv b/tests/outputs/expected_output_num_records.csv new file mode 100644 index 0000000..615fd60 --- /dev/null +++ b/tests/outputs/expected_output_num_records.csv @@ -0,0 +1,2 @@ +File Path,Edam 
ID,Label,Decompressed ID,Decompressed Label +./inputs/toy_invalid_flag.sam,http://edamontology.org/format_2573,SAM,, diff --git a/tests/outputs/expected_output_run_cwl.csv b/tests/outputs/expected_output_run_cwl.csv index d643b7d..2c93f1f 100644 --- a/tests/outputs/expected_output_run_cwl.csv +++ b/tests/outputs/expected_output_run_cwl.csv @@ -1,3 +1,3 @@ -File Path,Edam ID,Label -./inputs/toy.py,http://edamontology.org/format_3996,Python script -./inputs/toy.fa,, +File Path,Edam ID,Label,Decompressed ID,Decompressed Label +./inputs/toy.py,http://edamontology.org/format_3996,Python script,, +./inputs/toy.fa,,,,