Skip to content

Commit

Permalink
fix: ignore crc files when checking if provided path correspond to a …
Browse files Browse the repository at this point in the history
…valid delta table
  • Loading branch information
guillotjulien committed Jan 12, 2025
1 parent b831543 commit 48d4fb5
Show file tree
Hide file tree
Showing 2 changed files with 80 additions and 7 deletions.
7 changes: 7 additions & 0 deletions crates/core/src/kernel/snapshot/log_segment.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ lazy_static! {
static ref CHECKPOINT_FILE_PATTERN: Regex =
Regex::new(r"\d+\.checkpoint(\.\d+\.\d+)?\.parquet").unwrap();
static ref DELTA_FILE_PATTERN: Regex = Regex::new(r"^\d+\.json$").unwrap();
static ref CRC_FILE_PATTERN: Regex = Regex::new(r"^(\.)?\d+(\.crc|\.json)?\.crc$").unwrap();
pub(super) static ref TOMBSTONE_SCHEMA: StructType =
StructType::new(vec![ActionType::Remove.schema_field().clone(),]);
}
Expand Down Expand Up @@ -61,6 +62,12 @@ pub(crate) trait PathExt {
.map(|name| DELTA_FILE_PATTERN.captures(name).is_some())
.unwrap_or(false)
}

fn is_crc_file(&self) -> bool {
self.filename()
.map(|name| CRC_FILE_PATTERN.captures(name).is_some())
.unwrap()
}
}

impl PathExt for Path {
Expand Down
80 changes: 73 additions & 7 deletions crates/core/src/logstore/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -265,20 +265,23 @@ pub trait LogStore: Send + Sync + AsAny {

/// Check if the location is a delta table location
async fn is_delta_table_location(&self) -> DeltaResult<bool> {
// TODO We should really be using HEAD here, but this fails in windows tests
let object_store = self.object_store(None);
let mut stream = object_store.list(Some(self.log_path()));
if let Some(res) = stream.next().await {
while let Some(res) = stream.next().await {
match res {
Ok(meta) => {
Ok(meta.location.is_commit_file() || meta.location.is_checkpoint_file())
// crc files are valid files according to the protocol
if meta.location.is_crc_file() {
continue;
}
return Ok(meta.location.is_commit_file() || meta.location.is_checkpoint_file());
}
Err(ObjectStoreError::NotFound { .. }) => Ok(false),
Err(err) => Err(err)?,
Err(ObjectStoreError::NotFound { .. }) => return Ok(false),
Err(err) => return Err(err.into()),
}
} else {
Ok(false)
}

Ok(false)
}

#[cfg(feature = "datafusion")]
Expand Down Expand Up @@ -679,6 +682,69 @@ mod tests {
.await
.expect("Failed to identify table"));
}

#[tokio::test]
async fn test_is_location_a_table_crc() {
use object_store::path::Path;
use object_store::{PutOptions, PutPayload};
let location = Url::parse("memory://table").unwrap();
let store =
logstore_for(location, HashMap::default(), None).expect("Failed to get logstore");
assert!(!store
.is_delta_table_location()
.await
.expect("Failed to identify table"));

// Save .crc files to the transaction log directory (all 3 formats)
let payload = PutPayload::from_static(b"test");

let _put = store
.object_store(None)
.put_opts(
&Path::from("_delta_log/.0.crc.crc"),
payload.clone(),
PutOptions::default(),
)
.await
.expect("Failed to put");

let _put = store
.object_store(None)
.put_opts(
&Path::from("_delta_log/.0.json.crc"),
payload.clone(),
PutOptions::default(),
)
.await
.expect("Failed to put");

let _put = store
.object_store(None)
.put_opts(
&Path::from("_delta_log/0.crc"),
payload.clone(),
PutOptions::default(),
)
.await
.expect("Failed to put");

// Now add a commit
let _put = store
.object_store(None)
.put_opts(
&Path::from("_delta_log/0.json"),
payload.clone(),
PutOptions::default(),
)
.await
.expect("Failed to put");

// The table should be considered a delta table
assert!(store
.is_delta_table_location()
.await
.expect("Failed to identify table"));
}
}

#[cfg(feature = "datafusion")]
Expand Down

0 comments on commit 48d4fb5

Please sign in to comment.