diff --git a/src/cfb.rs b/src/cfb.rs index a5f644e..62ed5a4 100644 --- a/src/cfb.rs +++ b/src/cfb.rs @@ -421,6 +421,11 @@ pub struct XlsEncoding { } impl XlsEncoding { + pub fn unicode() -> XlsEncoding { + XlsEncoding { + encoding: encoding_rs::UTF_16LE, + } + } pub fn from_codepage(codepage: u16) -> Result { let e = codepage::to_encoding(codepage).ok_or(CfbError::CodePageNotFound(codepage))?; Ok(XlsEncoding { encoding: e }) diff --git a/src/lib.rs b/src/lib.rs index 3365096..a14a176 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -461,7 +461,8 @@ impl Range { /// /// panics when a `Cell` row is lower than the first `Cell` row or /// bigger than the last `Cell` row. - pub fn from_sparse(cells: Vec>) -> Range { + pub fn from_sparse(mut cells: Vec>) -> Range { + cells.sort_by_key(|cell| (cell.pos.0, cell.pos.1)); if cells.is_empty() { Range::empty() } else { diff --git a/src/xls.rs b/src/xls.rs index fe19c35..88b2ecd 100644 --- a/src/xls.rs +++ b/src/xls.rs @@ -342,7 +342,7 @@ impl Xls { let cch = r.data[3] as usize; let cce = read_u16(&r.data[4..]) as usize; let mut name = String::new(); - read_unicode_string_no_cch(&encoding, &r.data[14..], &cch, &mut name); + read_unicode_string_no_cch(&r.data[14..], &cch, &mut name); let rgce = &r.data[r.data.len() - cce..]; let formula = parse_defined_names(rgce)?; defined_names.push((name, formula)); @@ -450,24 +450,27 @@ impl Xls { // it will appear in 0x0207 record coming next cells.push(Cell::new(fmla_pos, val)); } - let fmla = parse_formula( - &r.data[20..], - &fmla_sheet_names, - &defined_names, - &xtis, - &encoding, - ) - .unwrap_or_else(|e| { - debug!("{}", e); - format!( - "Unrecognised formula \ + let fmla = + parse_formula(&r.data[20..], &fmla_sheet_names, &defined_names, &xtis) + .unwrap_or_else(|e| { + debug!("{}", e); + format!( + "Unrecognised formula \ for cell ({}, {}): {:?}", - row, col, e - ) - }); + row, col, e + ) + }); formulas.push(Cell::new(fmla_pos, fmla)); } - _ => (), + // tests/high_byte_string.xls 
contains a record type that + // cannot be found in the "By Number" 2.3.2 table + 0x00D6 => { + let Ok(s) = parse_label(r.data, &encoding, biff) else { + continue; + }; + cells.extend(s); + } + _ => {} } } let range = Range::from_sparse(cells); @@ -756,14 +759,18 @@ fn parse_short_string( /// XLUnicodeString [MS-XLS 2.5.294] fn parse_string(r: &[u8], encoding: &XlsEncoding, biff: Biff) -> Result { - if r.len() < 4 { + if r.len() < 2 { return Err(XlsError::Len { typ: "string", - expected: 4, + expected: 2, found: r.len(), }); } let cch = read_u16(r) as usize; + if cch == 0 { + // empty string, nothing left to decode; see tests/high_byte_string.xls + return Ok(String::new()); + } let (high_byte, start) = match biff { Biff::Biff2 | Biff::Biff3 | Biff::Biff4 | Biff::Biff5 => (None, 2), @@ -819,7 +826,7 @@ fn parse_label_sst(r: &[u8], strings: &[String]) -> Result>, X } fn parse_dimensions(r: &[u8]) -> Result { - let (rf, rl, cf, cl) = match r.len() { + let (rf, rl, mut cf, cl) = match r.len() { 10 => ( read_u16(&r[0..2]) as u32, read_u16(&r[2..4]) as u32, @@ -840,6 +847,12 @@ fn parse_dimensions(r: &[u8]) -> Result { }); } }; + // [MS-XLS 2.5.53] ColU must be <= 0xFF; if the first column is larger + // than that, or lies past the last column, assume it starts at 0. + // See tests/OOM_alloc2.xls. + if 0xFF < cf || cl < cf { + cf = 0; + } if 1 <= rl && 1 <= cl { Ok(Dimensions { start: (rf, cf), @@ -984,8 +997,10 @@ fn read_dbcs( Ok(s) } -fn read_unicode_string_no_cch(encoding: &XlsEncoding, buf: &[u8], len: &usize, s: &mut String) { - encoding.decode_to(&buf[1..=*len], *len, s, Some(buf[0] & 0x1 != 0)); +fn read_unicode_string_no_cch(buf: &[u8], len: &usize, s: &mut String) -> usize { + XlsEncoding::unicode() + .decode_to(&buf[1..], *len, s, Some(buf[0] & 0x1 != 0)) + .1 } struct Record<'a> { @@ -1126,7 +1141,6 @@ fn parse_formula( sheets: &[String], names: &[(String, String)], xtis: &[Xti], - encoding: &XlsEncoding, ) -> Result { let mut stack = Vec::new(); let mut formula = String::with_capacity(rgce.len()); @@ -1245,9 +1259,9 @@ fn parse_formula( 
stack.push(formula.len()); formula.push('\"'); let cch = rgce[0] as usize; - read_unicode_string_no_cch(encoding, &rgce[1..], &cch, &mut formula); + let l = read_unicode_string_no_cch(&rgce[1..], &cch, &mut formula); formula.push('\"'); - rgce = &rgce[2 + cch..]; + rgce = &rgce[2 + l..]; } 0x18 => { rgce = &rgce[5..]; diff --git a/tests/OOM_alloc.xls b/tests/OOM_alloc.xls new file mode 100644 index 0000000..860c2ac Binary files /dev/null and b/tests/OOM_alloc.xls differ diff --git a/tests/OOM_alloc2.xls b/tests/OOM_alloc2.xls new file mode 100644 index 0000000..b6e8d6b Binary files /dev/null and b/tests/OOM_alloc2.xls differ diff --git a/tests/high_byte_string.xls b/tests/high_byte_string.xls new file mode 100644 index 0000000..4b4ba95 Binary files /dev/null and b/tests/high_byte_string.xls differ diff --git a/tests/test.rs b/tests/test.rs index d880f25..e2a5498 100644 --- a/tests/test.rs +++ b/tests/test.rs @@ -1900,6 +1900,29 @@ fn test_ref_xlsb() { ); } +#[test] +fn test_high_byte_strings_and_unicode_strings_without_reserved_tags() { + // file contains XLUnicodeStrings with cch = 0 that do not have a reserved byte tag + // as well as record types that do not seem to be present in the spec + let mut xls: Xls<_> = wb("high_byte_string.xls"); + for (_, ws) in xls.worksheets() { + for (row, _, cell) in ws.used_cells() { + if row == 3 { + assert_eq!( + cell.as_string().unwrap(), + "Inside FERC's Gas Market Report monthly bidweek price file. " + ); + } + } + } +} + +#[test] +fn test_oom_allocation() { + let _xls: Xls<_> = wb("OOM_alloc.xls"); + let _xls: Xls<_> = wb("OOM_alloc2.xls"); +} + #[rstest] #[case("single-empty.ods")] #[case("multi-empty.ods")]