Skip to content

Commit

Permalink
fix: Scanning '%' from cloud (#17890)
Browse files Browse the repository at this point in the history
  • Loading branch information
nameexhaustion authored Jul 26, 2024
1 parent 43bf944 commit d6e5e87
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 8 deletions.
35 changes: 33 additions & 2 deletions crates/polars-io/src/cloud/glob.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use polars_error::PolarsResult;
use regex::Regex;
use url::Url;

use super::CloudOptions;
use super::{parse_url, CloudOptions};

const DELIMITER: char = '/';

Expand Down Expand Up @@ -129,7 +129,7 @@ impl CloudLocation {

/// Parse a CloudLocation from an url.
///
/// The `url` string is parsed first; a parse failure is converted with
/// `to_compute_err` and propagated to the caller through `?`. The parsed url
/// and the `glob` flag are then forwarded to `Self::from_url`, whose result
/// is returned unchanged.
pub fn new(url: &str, glob: bool) -> PolarsResult<CloudLocation> {
// NOTE(review): `parsed` is bound twice and the second binding shadows the
// first. In this diff view the `Url::parse` line looks like the pre-change
// version and the `parse_url` line like its replacement — confirm.
let parsed = Url::parse(url).map_err(to_compute_err)?;
let parsed = parse_url(url).map_err(to_compute_err)?;
Self::from_url(&parsed, glob)
}
}
Expand Down Expand Up @@ -316,4 +316,35 @@ mod test {
},
)
}

#[test]
fn test_cloud_location_percentages() {
    use super::CloudLocation;

    // Each row: (input url, expected scheme, expected bucket, expected prefix).
    // Both inputs contain the literal `%25` sequence in their path. For the
    // `s3` url the prefix is expected to keep it verbatim; for the `https`
    // url both bucket and prefix are expected to be empty.
    let cases = [
        ("s3://bucket/%25", "s3", "bucket", "%25"),
        ("https://pola.rs/%25", "https", "", ""),
    ];

    for (url, scheme, bucket, prefix) in cases {
        let expected = CloudLocation {
            scheme: scheme.into(),
            bucket: bucket.into(),
            prefix: prefix.into(),
            expansion: None,
        };

        // Globbing is enabled for every case; no expansion is expected.
        let actual = CloudLocation::new(url, true).unwrap();
        assert_eq!(actual, expected);
    }
}
}
6 changes: 5 additions & 1 deletion crates/polars-io/src/cloud/options.rs
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,11 @@ impl CloudType {
#[cfg(feature = "cloud")]
pub(crate) fn parse_url(input: &str) -> std::result::Result<url::Url, url::ParseError> {
Ok(if input.contains("://") {
url::Url::parse(input)?
if input.starts_with("http://") || input.starts_with("https://") {
url::Url::parse(input)
} else {
url::Url::parse(&input.replace("%", "%25"))
}?
} else {
let path = std::path::Path::new(input);
let mut tmp;
Expand Down
7 changes: 6 additions & 1 deletion crates/polars-io/src/path_utils/hugging_face.rs
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,7 @@ pub(super) async fn expand_paths_hf(
paths: &[PathBuf],
check_directory_level: bool,
cloud_options: Option<&CloudOptions>,
glob: bool,
) -> PolarsResult<(usize, Vec<PathBuf>)> {
assert!(!paths.is_empty());

Expand Down Expand Up @@ -251,7 +252,11 @@ pub(super) async fn expand_paths_hf(
);
let rel_path = path_parts.path.as_str();

let (prefix, expansion) = extract_prefix_expansion(rel_path)?;
let (prefix, expansion) = if glob {
extract_prefix_expansion(rel_path)?
} else {
(path_parts.path.clone(), None)
};
let expansion_matcher = &if expansion.is_some() {
Some(Matcher::new(prefix.clone(), expansion.as_deref())?)
} else {
Expand Down
11 changes: 7 additions & 4 deletions crates/polars-io/src/path_utils/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -151,10 +151,13 @@ pub fn expand_paths_hive(
use crate::cloud::object_path_from_str;

if first_path.starts_with("hf://") {
let (expand_start_idx, paths) =
crate::pl_async::get_runtime().block_on_potential_spawn(
hugging_face::expand_paths_hf(paths, check_directory_level, cloud_options),
)?;
let (expand_start_idx, paths) = crate::pl_async::get_runtime()
.block_on_potential_spawn(hugging_face::expand_paths_hf(
paths,
check_directory_level,
cloud_options,
glob,
))?;

return Ok((Arc::from(paths), expand_start_idx));
}
Expand Down

0 comments on commit d6e5e87

Please sign in to comment.