diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs
index 09087ca31958..9721349b0184 100644
--- a/arrow-csv/src/reader/mod.rs
+++ b/arrow-csv/src/reader/mod.rs
@@ -230,6 +230,7 @@ pub struct Format {
     escape: Option<u8>,
     quote: Option<u8>,
     terminator: Option<u8>,
+    comment: Option<u8>,
     null_regex: NullRegex,
     truncated_rows: bool,
 }
@@ -260,6 +261,14 @@ impl Format {
         self
     }
 
+    /// Specify the byte that introduces a comment line, e.g. `b'#'`
+    ///
+    /// Lines starting with this byte are ignored by the CSV parser
+    pub fn with_comment(mut self, comment: u8) -> Self {
+        self.comment = Some(comment);
+        self
+    }
+
     /// Provide a regex to match null values, defaults to `^$`
     pub fn with_null_regex(mut self, null_regex: Regex) -> Self {
         self.null_regex = NullRegex(Some(null_regex));
@@ -353,6 +362,7 @@ impl Format {
         if let Some(t) = self.terminator {
             builder.terminator(csv::Terminator::Any(t));
         }
+        builder.comment(self.comment);
         builder.from_reader(reader)
     }
 
@@ -360,6 +370,7 @@ impl Format {
     fn build_parser(&self) -> csv_core::Reader {
         let mut builder = csv_core::ReaderBuilder::new();
         builder.escape(self.escape);
+        builder.comment(self.comment);
 
         if let Some(c) = self.delimiter {
             builder.delimiter(c);
@@ -1109,6 +1120,14 @@ impl ReaderBuilder {
         self
     }
 
+    /// Specify the byte that introduces a comment line, e.g. `b'#'`
+    ///
+    /// Lines starting with this byte are ignored by the CSV parser
+    pub fn with_comment(mut self, comment: u8) -> Self {
+        self.format.comment = Some(comment);
+        self
+    }
+
     /// Provide a regex to match null values, defaults to `^$`
     pub fn with_null_regex(mut self, null_regex: Regex) -> Self {
         self.format.null_regex = NullRegex(Some(null_regex));
@@ -2536,4 +2555,41 @@ mod tests {
             assert_eq!(&t.get(), expected, "{values:?}")
         }
     }
+
+    #[test]
+    fn test_comment() {
+        let schema = Schema::new(vec![
+            Field::new("a", DataType::Int8, false),
+            Field::new("b", DataType::Int8, false),
+        ]);
+
+        // The 1st and 3rd input lines start with `#` and must be skipped entirely
+        let csv = "# comment1 \n1,2\n#comment2\n11,22";
+        let mut read = Cursor::new(csv.as_bytes());
+        let reader = ReaderBuilder::new(Arc::new(schema))
+            .with_comment(b'#')
+            .build(&mut read)
+            .unwrap();
+
+        let batches = reader.collect::<Result<Vec<_>, _>>().unwrap();
+        assert_eq!(batches.len(), 1);
+        let b = batches.first().unwrap();
+        assert_eq!(b.num_columns(), 2);
+        assert_eq!(
+            b.column(0)
+                .as_any()
+                .downcast_ref::<Int8Array>()
+                .unwrap()
+                .values(),
+            &vec![1, 11]
+        );
+        assert_eq!(
+            b.column(1)
+                .as_any()
+                .downcast_ref::<Int8Array>()
+                .unwrap()
+                .values(),
+            &vec![2, 22]
+        );
+    }
 }