From d6e5e8747e3d3196af84e25dabb16bc7583eeeae Mon Sep 17 00:00:00 2001 From: nameexhaustion Date: Fri, 26 Jul 2024 22:36:35 +1000 Subject: [PATCH] fix: Scanning '%' from cloud (#17890) --- crates/polars-io/src/cloud/glob.rs | 35 +++++++++++++++++-- crates/polars-io/src/cloud/options.rs | 6 +++- .../polars-io/src/path_utils/hugging_face.rs | 7 +++- crates/polars-io/src/path_utils/mod.rs | 11 +++--- 4 files changed, 51 insertions(+), 8 deletions(-) diff --git a/crates/polars-io/src/cloud/glob.rs b/crates/polars-io/src/cloud/glob.rs index b4d74d093a8d..12ac8081a3b8 100644 --- a/crates/polars-io/src/cloud/glob.rs +++ b/crates/polars-io/src/cloud/glob.rs @@ -6,7 +6,7 @@ use polars_error::PolarsResult; use regex::Regex; use url::Url; -use super::CloudOptions; +use super::{parse_url, CloudOptions}; const DELIMITER: char = '/'; @@ -129,7 +129,7 @@ impl CloudLocation { /// Parse a CloudLocation from an url. pub fn new(url: &str, glob: bool) -> PolarsResult<Self> { - let parsed = Url::parse(url).map_err(to_compute_err)?; + let parsed = parse_url(url).map_err(to_compute_err)?; Self::from_url(&parsed, glob) } } @@ -316,4 +316,35 @@ mod test { }, ) } + + #[test] + fn test_cloud_location_percentages() { + use super::CloudLocation; + + let path = "s3://bucket/%25"; + let cloud_location = CloudLocation::new(path, true).unwrap(); + + assert_eq!( + cloud_location, + CloudLocation { + scheme: "s3".into(), + bucket: "bucket".into(), + prefix: "%25".into(), + expansion: None, + } + ); + + let path = "https://pola.rs/%25"; + let cloud_location = CloudLocation::new(path, true).unwrap(); + + assert_eq!( + cloud_location, + CloudLocation { + scheme: "https".into(), + bucket: "".into(), + prefix: "".into(), + expansion: None, + } + ); + } } diff --git a/crates/polars-io/src/cloud/options.rs b/crates/polars-io/src/cloud/options.rs index 0f48688051f7..de0968a80da0 100644 --- a/crates/polars-io/src/cloud/options.rs +++ b/crates/polars-io/src/cloud/options.rs @@ -154,7 +154,11 @@ impl CloudType { 
#[cfg(feature = "cloud")] pub(crate) fn parse_url(input: &str) -> std::result::Result<url::Url, url::ParseError> { Ok(if input.contains("://") { - url::Url::parse(input)? + if input.starts_with("http://") || input.starts_with("https://") { + url::Url::parse(input) + } else { + url::Url::parse(&input.replace("%", "%25")) + }? } else { let path = std::path::Path::new(input); let mut tmp; diff --git a/crates/polars-io/src/path_utils/hugging_face.rs b/crates/polars-io/src/path_utils/hugging_face.rs index cc396dc928fc..3e8f701a21ed 100644 --- a/crates/polars-io/src/path_utils/hugging_face.rs +++ b/crates/polars-io/src/path_utils/hugging_face.rs @@ -214,6 +214,7 @@ pub(super) async fn expand_paths_hf( paths: &[PathBuf], check_directory_level: bool, cloud_options: Option<&CloudOptions>, + glob: bool, ) -> PolarsResult<(usize, Vec<PathBuf>)> { assert!(!paths.is_empty()); @@ -251,7 +252,11 @@ pub(super) async fn expand_paths_hf( ); let rel_path = path_parts.path.as_str(); - let (prefix, expansion) = extract_prefix_expansion(rel_path)?; + let (prefix, expansion) = if glob { + extract_prefix_expansion(rel_path)? + } else { + (path_parts.path.clone(), None) + }; let expansion_matcher = &if expansion.is_some() { Some(Matcher::new(prefix.clone(), expansion.as_deref())?) 
} else { diff --git a/crates/polars-io/src/path_utils/mod.rs b/crates/polars-io/src/path_utils/mod.rs index 749e248881c6..c7a751b5bd24 100644 --- a/crates/polars-io/src/path_utils/mod.rs +++ b/crates/polars-io/src/path_utils/mod.rs @@ -151,10 +151,13 @@ pub fn expand_paths_hive( use crate::cloud::object_path_from_str; if first_path.starts_with("hf://") { - let (expand_start_idx, paths) = - crate::pl_async::get_runtime().block_on_potential_spawn( - hugging_face::expand_paths_hf(paths, check_directory_level, cloud_options), - )?; + let (expand_start_idx, paths) = crate::pl_async::get_runtime() + .block_on_potential_spawn(hugging_face::expand_paths_hf( + paths, + check_directory_level, + cloud_options, + glob, + ))?; return Ok((Arc::from(paths), expand_start_idx)); }