From fdcb676f8ed403b9751fad415ef180c501373d5b Mon Sep 17 00:00:00 2001
From: Zhao Yuan <1627990440@qq.com>
Date: Fri, 27 Oct 2023 02:21:15 +0000
Subject: [PATCH 01/11] Add algorithms "Dbscan cluster" and "exponential_smoothing"
 and introduce documentation

Signed-off-by: Zhao Yuan <1627990440@qq.com>
---
 docs/chunk-deduplication.md        | 117 ++++
 src/bin/nydus-image/deduplicate.rs | 977 ++++++++++++++++++++++++++++-
 src/bin/nydus-image/main.rs        |  89 ++-
 3 files changed, 1157 insertions(+), 26 deletions(-)
 create mode 100644 docs/chunk-deduplication.md

diff --git a/docs/chunk-deduplication.md b/docs/chunk-deduplication.md
new file mode 100644
index 00000000000..97a8db2c3f4
--- /dev/null
+++ b/docs/chunk-deduplication.md
@@ -0,0 +1,117 @@
+# Notice: [WIP] Pending further revisions
+# Introduction
+Container images often contain a large number of duplicate files or duplicate content, and these duplicate parts occupy a large amount of storage space, especially in high-density deployment scenarios. As the number of Nydus images grows, this causes problems such as low storage space utilization and excessive consumption of bandwidth. An effective deduplication mechanism therefore needs to be designed to solve this problem.
+
+Unlike traditional OCI, which distributes images at layer granularity, the smallest unit of a Nydus image is a chunk, so the deduplication algorithm must work at chunk granularity. We also want to deduplicate the Nydus image along several dimensions, including between different Nydus images and between different versions of the same Nydus image. Whichever dimension is chosen, the essence is the same: keep only one copy of each repeated chunk and replace all other occurrences with references to it. This reduces storage consumption, maximizes the data transmission and storage capabilities of Nydus, and improves image access speed and efficiency.
+# General idea
+The deduplication algorithm first selects the duplicate chunks in the images based on image information such as how often a chunk occurs, the chunk size, and the image and version a chunk belongs to, and then generates a chunkdict. The chunkdict records the unique identifier (fingerprint) of each selected chunk; only the chunkdict needs to be stored, and other images can refer to its chunks by reference (a simplified view of this per-chunk record is sketched at the end of this section).
+
+The deduplication algorithm is divided into two parts: the first part is the DBSCAN clustering algorithm, which deduplicates across different images; the second part is the exponential smoothing algorithm, which deduplicates across different versions of the same image.
+
+**The general process is as follows:**
+1. Store the image information in a local database.
+2. Extract the image information and run the DBSCAN clustering algorithm to deduplicate across different images.
+3. Remove the chunks already covered by the dictionary from step 2, then run the exponential smoothing algorithm on each image separately to deduplicate across image versions.
+4. Merge the deduplication dictionaries produced by the two algorithms and persist the result to disk.
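+
+The per-chunk record that both algorithms operate on can be pictured as the following struct (a simplified sketch; the struct name `ChunkRecord` is illustrative, while the actual code in this patch stores these fields in the database's `chunk` table and in its `Chunk`/`ChunkdictChunkInfo` structs):
+
+```rust
+/// Simplified view of one row of the `chunk` table.
+pub struct ChunkRecord {
+    pub image_name: String,            // image the chunk belongs to, e.g. "redis"
+    pub version_name: String,          // image version, e.g. "nydus_7.0.1"
+    pub chunk_blob_id: String,         // blob that currently holds the chunk
+    pub chunk_digest: String,          // fingerprint used to detect duplicates
+    pub chunk_compressed_size: u32,
+    pub chunk_uncompressed_size: u32,
+    pub chunk_compressed_offset: u64,
+    pub chunk_uncompressed_offset: u64,
+}
+```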
+
+# Algorithm detailed process
+## Overall Input
+
+```shell
+nydusify chunkdict generate --sources \
+    localhost:5000:redis:nydus_7.0.1, \
+    localhost:5000:redis:nydus_7.0.2, \
+    localhost:5000:redis:nydus_7.0.3 \
+```
+***
+`nydusify chunkdict generate` calls the two commands `nydus-image chunkdict save` and `nydus-image chunkdict generate` to store the image information into the database and to generate the list of chunks to be deduplicated.
+
+Download multiple Nydus images in advance and put them into the repository as the dataset, for example 10 consecutive versions of redis and alpine. Then execute the command `nydus-image chunkdict save` to store the chunk and blob information in the chunk and blob tables of the database.
+
+```shell
+# Store multiple images into the database
+nydus-image chunkdict save --bootstrap \
+    ./output/localhost:5000:redis:nydus_7.0.1/nydus_bootstrap, \
+    ./output/localhost:5000:redis:nydus_7.0.2/nydus_bootstrap, \
+    ./output/localhost:5000:redis:nydus_7.0.3/nydus_bootstrap \
+```
+Execute the command `nydus-image chunkdict generate` to access the database and call the deduplication algorithm to generate the chunk list.
+```shell
+# Call the deduplication algorithm to generate the chunk list
+nydus-image chunkdict generate --database \
+    sqlite:///path/imageservice/contrib/nydusify/chunkdict.db
+```
+
+***
+### Deduplication algorithm
+#### Algorithm 1: Deduplication between different images (DBSCAN clustering algorithm)
+***
+**Basic principle:** DBSCAN is a density-based clustering algorithm that judges the connectivity between samples by their density: samples of the same category are closely connected, i.e. for any sample of a category there are other samples of that category not far from it. It can therefore group objects that are dense and close to each other, can find clusters of arbitrary shape, and does not need the number of clusters to be specified in advance, which makes it suitable for high-density deployment scenarios.
+
+**Input:** The chunk information read from the database and stored in a chunk list. Chunk information includes image_name, version, chunk_blob_id, chunk_digest, chunk_compressed_size, and so on.
+
+**Output:** The chunk dictionary corresponding to each image cluster.
+
+**Basic steps:**
+**1.** For every image, select part of its versions as the training set and the rest as the test set according to a fixed proportion.
+
+**2.** Divide all chunks in the training set into separate lists according to image_name, so that each list corresponds to one image and contains all chunks of that image.
+
+**3.** Cluster these images with the DBSCAN (Density-Based Spatial Clustering of Applications with Noise) algorithm.
+
+***
+3.1 Initialize the core point set $\Omega$ as an empty set, and set the clustering radius $\gamma = 0.5$ and the sample number threshold $MinPts = 10$.
+
+3.2 Loop through each image and its corresponding chunk list, and calculate its distance to every other image according to the following formula:
+$$distance(x,y) = \frac{\lvert C(R_x) \cup C(R_y) \rvert - \lvert C(R_x) \cap C(R_y) \rvert}{\lvert C(R_x) \cup C(R_y) \rvert}$$
+where $C(R_x)$ denotes the set of unique chunks of image $x$ in the training set. Count the images $y$ with $distance(x,y) \leq \gamma$; if there are $M$ such images and $M \geq MinPts$, add image $x$ to the core point set, and each such image $y$ is said to lie in the neighborhood of the core image $x$. (A code sketch of this distance computation is shown after this algorithm's description.)
+
+3.3 Initialize the number of clusters $k = 0$, then iterate over the core point set. For each core image, add all images in its neighborhood to a queue; if an image in the neighborhood is itself a core image, add all images in its neighborhood to the queue as well. Classify all images in this queue into one cluster, and continue traversing the core point set until all core images have been processed.
+
+3.4 For each cluster, calculate the frequency with which each chunk appears across the cluster's images. Add every chunk that appears in more than $90\%$ of the cluster's training-set images to the dictionary of that cluster, producing a set of <cluster, dictionary> pairs.
+***
+**4.** Adjust the neighborhood radius and repeat step 3 to obtain multiple deduplication dictionaries.
+
+**5.** Use the test set to evaluate the dictionaries obtained in step 4, and select the chunk dictionary that leaves the test set with the smallest remaining storage footprint.
+
+**6.** Remove the chunks contained in the dictionary selected in step 5 from all images (training set and test set), then repeat steps 1-5 to generate further dictionaries until the maximum number of iterations (7) is reached or the proportion of discrete (unclustered) images exceeds 80% of all images.
+
+How the DBSCAN algorithm divides points into clusters is shown in the following diagram:
+![DBSCAN clustering illustration](https://img-blog.csdnimg.cn/5fba149720a34620873a5a2cb304d668.png#pic_center)
+In this diagram, minPts = 4. Point A and the other red points are core points, because the area surrounding these points in an ε radius contains at least 4 points (including the point itself). Because they are all reachable from one another, they form a single cluster. Points B and C are not core points, but are reachable from A (via other core points) and thus belong to the cluster as well. Point N is a noise point that is neither a core point nor directly reachable.
+
+**Remark:** The figure and the associated DBSCAN algorithm description are referenced from: [https://en.wikipedia.org/wiki/DBSCAN](https://en.wikipedia.org/wiki/DBSCAN)
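+
+As a concrete illustration of the distance above, here is a minimal sketch of its set-based form (the function name `image_distance` and the digest-list representation are illustrative; the `Algorithm::distance` implementation in this patch follows the same idea but weights chunks by their compressed size):
+
+```rust
+use std::collections::HashSet;
+
+/// Jaccard-style distance from the formula above:
+/// distance = (|union| - |intersection|) / |union|, computed over chunk digests.
+fn image_distance(chunks_x: &[String], chunks_y: &[String]) -> f64 {
+    let set_x: HashSet<&String> = chunks_x.iter().collect();
+    let set_y: HashSet<&String> = chunks_y.iter().collect();
+    let union = set_x.union(&set_y).count() as f64;
+    if union == 0.0 {
+        return 0.0; // treat two empty images as identical
+    }
+    let intersection = set_x.intersection(&set_y).count() as f64;
+    (union - intersection) / union
+}
+
+fn main() {
+    // Two toy images sharing one chunk digest out of three in total.
+    let x = vec!["d1".to_string(), "d2".to_string()];
+    let y = vec!["d2".to_string(), "d3".to_string()];
+    println!("distance = {:.2}", image_distance(&x, &y)); // (3 - 1) / 3 ≈ 0.67
+}
+```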
+#### Algorithm 2: Deduplication between different versions of an image (exponential smoothing algorithm)
+***
+**Basic principle:** Exponential smoothing is a method for smoothing and forecasting time-series data. Its basic principle is to take a weighted average of the data, giving higher weight to more recently repeated chunks and continuously updating the smoothed value, so newer chunks have a greater impact on the score while the influence of older data gradually weakens.
+
+**Input:** The training set and test set left after the deduplication in algorithm 1.
+
+**Output:** The chunk dictionary corresponding to each image.
+
+**Basic steps:**
+**1.** Divide all chunks in the training set into separate lists according to image_name, so that each list corresponds to one image and contains all chunks of that image.
+
+**2.** Sort the versions inside each image chronologically and score each chunk according to the exponential smoothing formula (a code sketch of this scoring follows the step list):
+$$S_0 = 0,\ S_t = \alpha Y_{t-1} + (1 - \alpha) S_{t-1}$$
+where $\alpha = 0.5$ and $Y_{t-1}$ indicates whether the chunk appeared in the previous version: 1 if it did, otherwise 0.
+
+**3.** Compute the score of each chunk and select all chunks with a score greater than the threshold $THs$ as the chunk dictionary. Deduplicate the image versions in the test set against this dictionary and calculate the storage space they still occupy.
+
+**4.** Vary $THs$ from 0.8 down to 0.5 in steps of 0.05 and repeat steps 2 and 3 to generate multiple chunk dictionaries.
+
+**5.** Choose the chunk dictionary that minimizes the test set's storage space.
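+
+The scoring in step 2 can be sketched as follows (a minimal example that assumes each version has already been reduced to the set of chunk digests it contains; the helper name `smoothing_score` is illustrative, while the patch's `exponential_smoothing` walks the chunk table row by row and compares each chunk against the previous version's chunks using the same formula):
+
+```rust
+use std::collections::HashSet;
+
+/// Score one chunk digest over chronologically sorted versions:
+/// S_0 = 0, S_t = alpha * Y_{t-1} + (1 - alpha) * S_{t-1},
+/// where Y_{t-1} is 1 if the digest appeared in the previous version, else 0.
+fn smoothing_score(versions: &[HashSet<String>], digest: &str, alpha: f64) -> f64 {
+    let mut score = 0.0; // S_0
+    for t in 1..versions.len() {
+        let y = if versions[t - 1].contains(digest) { 1.0 } else { 0.0 };
+        score = alpha * y + (1.0 - alpha) * score;
+    }
+    score
+}
+
+fn main() {
+    let v = |digests: &[&str]| digests.iter().map(|d| d.to_string()).collect::<HashSet<_>>();
+    let versions = vec![v(&["a", "b"]), v(&["a"]), v(&["a", "c"])];
+    // A digest present in every version accumulates weight ...
+    println!("score(a) = {:.2}", smoothing_score(&versions, "a", 0.5)); // 0.75
+    // ... while one seen only in the oldest version decays.
+    println!("score(b) = {:.2}", smoothing_score(&versions, "b", 0.5)); // 0.25
+}
+```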
+***
+### Exponential smoothing algorithm test table
+
+| image_name | version number | total_size | train_size | test_size | test_size after deduplicating | chunkdict_size | deduplicating rate | threshold |
+|------------|----------------|------------|------------|-----------|-------------------------------|----------------|--------------------|-----------|
+| redis | 10 | 382.03 | 266.7 | 115.33 | 31.56 | 42.33 | 72.63% | 0.8-0.5 |
+| python | 10 | 3509.91 | 2095.37 | 1414.54 | 123.33 | 588.61 | 91.28% | 0.8-0.5 |
+| ubuntu | 10 | 317.33 | 222.11 | 95.22 | 12.27 | 39.61 | 87.11% | 0.8-0.5 |
+| nginx | 10 | 396.86 | 284.4 | 112.46 | 50.54 | 83.54 | 55.06% | 0.8-0.5 |
+| postgres | 10 | 1360.31 | 956.42 | 403.89 | 381.54 | 19.66 | 5.53% | 0.8-0.5 |
+| alpine | 10 | 27.23 | 19.04 | 8.19 | 5.62 | 4.7 | 31.29% | 0.8-0.5 |
+| node | 10 | 3698.44 | 2598.59 | 1099.85 | 429.39 | 649.42 | 60.96% | 0.8-0.5 |
+| httpd | 10 | 561.99 | 385.79 | 176.2 | 85.7 | 54.15 | 51.36% | 0.8-0.5 |
+***
diff --git a/src/bin/nydus-image/deduplicate.rs b/src/bin/nydus-image/deduplicate.rs
index 83de9188940..ccac5ce31d1 100644
--- a/src/bin/nydus-image/deduplicate.rs
+++ b/src/bin/nydus-image/deduplicate.rs
@@ -4,13 +4,17 @@
 
 //! Deduplicate for Chunk.
 use anyhow::{Context, Result};
+use core::cmp::Ordering;
 use nydus_api::ConfigV2;
 use nydus_builder::Tree;
 use nydus_rafs::metadata::RafsSuper;
 use nydus_storage::device::BlobInfo;
 use rusqlite::{params, Connection};
+use std::collections::HashSet;
+use std::collections::{BTreeMap, HashMap};
 use std::fs;
 use std::path::Path;
+use std::result::Result::Ok;
 use std::sync::{Arc, Mutex};
 
 #[derive(Debug)]
@@ -66,15 +70,11 @@ pub struct SqliteDatabase {
 
 impl SqliteDatabase {
     pub fn new(database_url: &str) -> Result<Self> {
-        // Delete the database file if it exists.
+ // Connect to a database that already exists if let Ok(metadata) = fs::metadata(database_url) { if metadata.is_file() { - if let Err(err) = fs::remove_file(database_url) { - warn!( - "Warning: Unable to delete existing database file: {:?}.", - err - ); - } + } else { + panic!("Warning: Unable to find existing database file."); } } @@ -147,12 +147,14 @@ impl Deduplicate { &mut self, bootstrap_path: &Path, config: Arc, + image_name: String, + version_name: String, ) -> anyhow::Result>> { let (sb, _) = RafsSuper::load_from_file(bootstrap_path, config, false)?; self.create_tables()?; let blob_infos = sb.superblock.get_blob_infos(); self.insert_blobs(&blob_infos)?; - self.insert_chunks(&blob_infos, &sb)?; + self.insert_chunks(&blob_infos, &sb, image_name, version_name)?; Ok(blob_infos) } @@ -183,6 +185,8 @@ impl Deduplicate { &mut self, blob_infos: &[Arc], sb: &RafsSuper, + image_name: String, + version_name: String, ) -> anyhow::Result<()> { let process_chunk = &mut |t: &Tree| -> Result<()> { let node = t.lock_node(); @@ -191,6 +195,8 @@ impl Deduplicate { let chunk_blob_id = blob_infos[index as usize].blob_id(); self.db .insert_chunk(&Chunk { + image_name: image_name.to_string(), + version_name: version_name.to_string(), chunk_blob_id, chunk_digest: chunk.inner.id().to_string(), chunk_compressed_size: chunk.inner.compressed_size(), @@ -209,6 +215,565 @@ impl Deduplicate { } } +pub struct Algorithm { + algorithm_name: String, + db: D, +} + +// Generate deduplicated chunkdict by exponential_smoothing algorithm +type Versiondic = HashMap>; +// Generate deduplicated chunkdict by cluster algorithm +type Imagedic = Vec, Vec>>; + +impl Algorithm { + pub fn new(algorithm: String, db_url: &str) -> anyhow::Result { + let algorithm_name = algorithm; + let db = SqliteDatabase::new(db_url)?; + Ok(Self { algorithm_name, db }) + } + + // Call the algorithm to generate a dictionary + pub fn chunkdict_generate(&mut self) -> anyhow::Result<(Vec, Vec)> { + let all_chunks = self.db.chunk_table.list_all()?; + let mut chunkdict: Vec = Vec::new(); + let mut core_image = Vec::new(); + let mut noise_points = Vec::new(); + let (chunkdict_version, chunkdict_image) = match &self.algorithm_name as &str { + "exponential_smoothing" => Self::deduplicate_version(&all_chunks)?, + _ => { + bail!("Unsupported algorithm name:, please use a valid algorithm name, such as exponential_smoothing") + } + }; + for single_clustering in chunkdict_image { + for (image_list, cluster_dictionary) in single_clustering { + core_image.extend(image_list); + chunkdict.extend(cluster_dictionary); + } + } + for (_, dictionary) in chunkdict_version { + chunkdict.extend(dictionary); + } + let mut chunkdict_size = 0; + for i in &chunkdict { + chunkdict_size += i.chunk_compressed_size; + } + info!( + "Chunkdict size is {}", + chunkdict_size as f64 / 1024 as f64 / 1024 as f64 + ); + for chunk in all_chunks { + if !core_image.contains(&chunk.image_name) && !noise_points.contains(&chunk.image_name) + { + noise_points.push(chunk.image_name.clone()); + } + } + Ok((chunkdict, noise_points)) + } + + // Algorithm "exponential_smoothing" + // List all chunk and sort them by the order in chunk table + // Score each chunk by "exponential_smoothing" formula + // Select chunks whose score is greater than threshold and generate chunk dictionary + fn exponential_smoothing(all_chunks: Vec, threshold: f64) -> anyhow::Result> { + let alpha = 0.5; + let mut smoothed_data = Vec::new(); + + let mut last_start_version_index = 0; + let mut start_version_index = 0; + let 
mut last_end_version_index = 0; + + for (chunk_index, chunk) in all_chunks.iter().enumerate() { + let mut is_duplicate: f64 = 0.0; + if chunk.version_name == all_chunks[0].version_name { + let smoothed_score: f64 = 0.0; + smoothed_data.push(smoothed_score); + } else { + if all_chunks[chunk_index - 1].version_name != all_chunks[chunk_index].version_name + { + last_start_version_index = start_version_index; + start_version_index = chunk_index; + last_end_version_index = chunk_index - 1; + } + for last_chunk in all_chunks + .iter() + .take(last_end_version_index + 1) + .skip(last_start_version_index) + { + if chunk.chunk_digest == last_chunk.chunk_digest { + is_duplicate = 1.0; + break; + } + } + let smoothed_score: f64 = + alpha * is_duplicate + (1.0 - alpha) * smoothed_data[chunk_index - 1]; + smoothed_data.push(smoothed_score); + } + } + + let mut chunkdict: Vec = Vec::new(); + for i in 0..smoothed_data.len() { + let chunk = Chunk { + image_name: all_chunks[i].image_name.clone(), + version_name: all_chunks[i].version_name.clone(), + chunk_blob_id: all_chunks[i].chunk_blob_id.clone(), + chunk_digest: all_chunks[i].chunk_digest.clone(), + chunk_compressed_offset: all_chunks[i].chunk_compressed_offset, + chunk_uncompressed_offset: all_chunks[i].chunk_uncompressed_offset, + chunk_compressed_size: all_chunks[i].chunk_compressed_size, + chunk_uncompressed_size: all_chunks[i].chunk_uncompressed_size, + }; + if smoothed_data[i] > threshold { + chunkdict.push(chunk); + } + } + + // Deduplicate chunk dictionary + let mut unique_chunks: BTreeMap = BTreeMap::new(); + for chunk in &chunkdict { + if !unique_chunks.contains_key(&chunk.chunk_digest) { + unique_chunks.insert(chunk.chunk_digest.clone(), chunk.clone()); + } + } + let unique_chunk_list: Vec = unique_chunks.values().cloned().collect(); + Ok(unique_chunk_list) + } + + // Calculate the distance between two images + fn distance(image1: &[Chunk], image2: &[Chunk]) -> anyhow::Result { + // The total size of all chunks in both images + let mut image1_size: u64 = 0; + let mut image2_size: u64 = 0; + + for chunk1 in image1 { + image1_size += chunk1.chunk_compressed_size as u64; + } + for chunk2 in image2 { + image2_size += chunk2.chunk_compressed_size as u64; + } + + // The total size of the chunk repeated between two images + let all_chunks: Vec<&Chunk> = image1.iter().chain(image2.iter()).collect(); + let mut compressed_size_map: std::collections::HashMap = + std::collections::HashMap::new(); + let mut processed_digests: HashSet<&String> = HashSet::new(); + + for chunk in all_chunks { + if processed_digests.contains(&chunk.chunk_digest) { + let size = compressed_size_map + .entry(chunk.chunk_digest.clone()) + .or_insert(0); + *size += chunk.chunk_compressed_size as u64; + } + processed_digests.insert(&chunk.chunk_digest); + } + + let repeat_size: u64 = compressed_size_map.values().cloned().sum(); + let distance: f64 = 1.0 - (repeat_size as f64 / ((image1_size + image2_size) as f64)); + Ok(distance) + } + + // Divide the chunk list into sublists by image name + fn divide_by_image(all_chunks: &[Chunk]) -> anyhow::Result> { + let mut image_chunks: std::collections::HashMap> = + std::collections::HashMap::new(); + let mut datadict: Vec = Vec::new(); + for chunk in all_chunks { + image_chunks + .entry(chunk.image_name.clone()) + .or_insert(Vec::new()) + .push(chunk.clone()); + } + for (index, chunks) in image_chunks { + let data_point = DataPoint { + image_name: index, + chunk_list: chunks, + visited: false, + clustered: false, + cluster_id: 0, + }; + 
datadict.push(data_point); + } + Ok(datadict) + } + + fn divide_set( + chunks: &[Chunk], + train_percentage: f64, + ) -> anyhow::Result<(Vec, Vec)> { + // Create a HashMap to store the list of chunks for each image_name + let mut image_chunks: BTreeMap> = BTreeMap::new(); + + // Group chunks into image_name + for chunk in chunks { + let entry = image_chunks + .entry(chunk.image_name.clone()) + .or_insert(Vec::new()); + entry.push(chunk.clone()); + } + + // Create the final training and testing sets + let mut train_set: Vec = Vec::new(); + let mut test_set: Vec = Vec::new(); + + // Iterate through the list of Chunks for each image_name + for (_, chunk_list) in image_chunks.iter_mut() { + let mut version_chunks: BTreeMap> = BTreeMap::new(); + // Group the chunks in the image into version_name + for chunk in chunk_list { + let entry = version_chunks + .entry(CustomString(chunk.version_name.clone())) + .or_insert(Vec::new()); + entry.push(chunk.clone()); + } + + let num_version_groups = version_chunks.len(); + let num_train_groups = (num_version_groups as f64 * train_percentage) as usize; + let version_groups = version_chunks.into_iter().collect::>(); + let (train_version_groups, test_version_groups) = + version_groups.split_at(num_train_groups); + + for (_, train_chunks) in train_version_groups { + for chunk in train_chunks { + train_set.push(chunk.clone()); + } + } + + for (_, test_chunks) in test_version_groups { + for chunk in test_chunks { + test_set.push(chunk.clone()); + } + } + } + Ok((train_set, test_set)) + } + + // Dbscan clustering algorithm + fn dbsacn(data_point: &mut Vec, radius: f64) -> anyhow::Result<&Vec> { + let min_points = 10; + let mut cluster_id = 1; + + for i in 0..data_point.len() { + if data_point[i].visited { + continue; + } + if data_point[i].clustered { + continue; + } + + let mut neighbors = Vec::new(); + for j in 0..data_point.len() { + let distance = + Self::distance(&data_point[i].chunk_list, &data_point[j].chunk_list)?; + if !data_point[j].visited && distance <= radius { + neighbors.push(j); + } + } + if neighbors.len() < min_points { + data_point[i].clustered = false; + } else { + Self::expand_cluster(data_point, i, cluster_id, radius, min_points)?; + cluster_id += 1; + } + } + Ok(data_point) + } + + // Core point expansion cluster in dbscan algorithm + fn expand_cluster( + data_point: &mut Vec, + i: usize, + cluster_id: i32, + radius: f64, + min_points: usize, + ) -> anyhow::Result<()> { + data_point[i].clustered = true; + data_point[i].cluster_id = cluster_id; + + let mut stack = vec![i]; + while let Some(q) = stack.pop() { + if data_point[q].visited { + continue; + } + data_point[q].visited = true; + let mut q_neighbors = Vec::new(); + for j in 0..data_point.len() { + let distance = + Self::distance(&data_point[q].chunk_list, &data_point[j].chunk_list)?; + if !data_point[j].visited && distance <= radius { + q_neighbors.push(j); + } + } + if q_neighbors.len() >= min_points { + for &r_index in &q_neighbors { + if !data_point[r_index].visited { + data_point[r_index].visited = true; + stack.push(r_index) + } + if !data_point[r_index].clustered { + data_point[r_index].clustered = true; + data_point[r_index].cluster_id = cluster_id; + } + } + } else { + data_point[i].clustered = false; + } + } + Ok(()) + } + + // Aggregate the chunks in each cluster into a dictionary + fn aggregate_chunk( + data_point: &[DataPoint], + ) -> anyhow::Result, Vec>> { + // Divide chunk list according to clusters + let mut cluster_map: HashMap> = HashMap::new(); + for (index, point) 
in data_point.iter().enumerate() { + if point.clustered { + let cluster_id = point.cluster_id; + cluster_map + .entry(cluster_id) + .or_insert(Vec::new()) + .push(index); + } + } + + // Iterate through each cluster + let mut dictionary: HashMap, Vec> = HashMap::new(); + for (_, cluster_points) in cluster_map.iter() { + let mut image_total_counts: HashMap<&str, usize> = HashMap::new(); + let mut image_list: Vec = Vec::new(); + // Count the total number of images in the cluster + for &point_index in cluster_points { + let point = &data_point[point_index]; + // let all_count = 0; + let image_total_count = image_total_counts.entry(&point.image_name).or_insert(0); + *image_total_count += 1; + + image_list.push(point.image_name.clone()); + } + + // Count the number of images in which chunks appear in the cluster + let mut chunk_digest_counts: HashMap = HashMap::new(); + for &point_index in cluster_points { + let point = &data_point[point_index]; + let chunk_digest_set: HashSet = point + .chunk_list + .iter() + .map(|chunk| chunk.chunk_digest.clone()) + .collect(); + for chunk_digest in chunk_digest_set { + let count = chunk_digest_counts + .entry(chunk_digest.to_string()) + .or_insert(0); + *count += 1; + } + } + + let mut chunk_list: Vec = Vec::new(); + let mut added_chunk_digests: HashSet = HashSet::new(); + for &point_index in cluster_points { + let point = &data_point[point_index]; + for chunk in &point.chunk_list { + let chunk_digest = &chunk.chunk_digest; + if !added_chunk_digests.contains(chunk_digest) { + let count = chunk_digest_counts.get(chunk_digest).unwrap_or(&0); + if *count as f64 / image_total_counts.len() as f64 >= 0.9 { + chunk_list.push(chunk.clone()); + added_chunk_digests.insert(chunk_digest.to_string()); + } + } + } + } + dictionary.insert(image_list, chunk_list); + } + Ok(dictionary) + } + + fn deduplicate_image( + all_chunks: Vec, + ) -> anyhow::Result, Vec>>> { + let train_percentage = 0.7; + let max_cluster_count = 7; + let mut counter = 0; + let all_chunks_clone = all_chunks; + let mut data_dict: Vec, Vec>> = Vec::new(); + + let (mut train, mut test) = Self::divide_set(&all_chunks_clone, train_percentage)?; + while counter < max_cluster_count { + // Parameter settings + let mut data_point = Self::divide_by_image(&train)?; + let all_train_length = data_point.len(); + let mut radius = 0.5; + let max_radius = 0.9; + let mut test_chunk_sizes = Vec::new(); + let mut min_test_size: u64 = std::u64::MAX; + let mut min_data_dict = HashMap::new(); + let mut data_cluster_length = 0; + + // Adjust the radius size to select the dictionary that tests best + while radius <= max_radius { + let data_cluster = Self::dbsacn(&mut data_point, radius)?; + data_cluster_length = data_cluster.len(); + + let data_dict = Self::aggregate_chunk(data_cluster)?; + + let all_chunks: HashSet<&Chunk> = + data_dict.values().flat_map(|v| v.iter()).collect(); + let mut total_test_set_size: u64 = 0; + + for chunk in test.iter() { + if !all_chunks.contains(chunk) { + total_test_set_size += chunk.chunk_compressed_size as u64; + } + } + test_chunk_sizes.push((radius, total_test_set_size)); + min_test_size = total_test_set_size; + if total_test_set_size <= min_test_size { + min_test_size = total_test_set_size; + min_data_dict = data_dict; + } + radius += 0.05; + } + debug!("test set size is {}", min_test_size); + + let min_chunk_list: Vec = min_data_dict + .values() + .flat_map(|chunk_list| chunk_list.iter()) + .cloned() + .collect(); + let mut to_remove = Vec::new(); + for chunk in train.iter() { + if 
min_chunk_list.contains(chunk) { + to_remove.push(chunk.clone()); + } + } + for chunk in &to_remove { + train.retain(|c| c.chunk_digest != chunk.chunk_digest); + } + for chunk in &to_remove { + test.retain(|c| c.chunk_digest != chunk.chunk_digest); + } + if (data_cluster_length as f64 / all_train_length as f64) < 0.2 { + break; + } + data_dict.push(min_data_dict); + counter += 1; + } + Ok(data_dict) + } + + pub fn deduplicate_version(all_chunks: &[Chunk]) -> anyhow::Result<(Versiondic, Imagedic)> { + let mut all_chunks_size = 0; + for i in all_chunks { + all_chunks_size += i.chunk_compressed_size; + } + info!( + "All chunk size is {}", + all_chunks_size as f64 / 1024 as f64 / 1024 as f64 + ); + + let train_percentage = 0.7; + let datadict = Self::deduplicate_image(all_chunks.to_owned())?; + let (train, test) = Self::divide_set(all_chunks, train_percentage)?; + let mut train_set_size = 0; + for i in &train { + train_set_size += i.chunk_compressed_size; + } + info!( + "Train set size is {}", + train_set_size as f64 / 1024 as f64 / 1024 as f64 + ); + + let mut test_set_size = 0; + for i in &test { + test_set_size += i.chunk_compressed_size; + } + info!( + "Test set size is {}", + test_set_size as f64 / 1024 as f64 / 1024 as f64 + ); + + let mut version_datadict: HashMap> = HashMap::new(); + let mut data_point = Self::divide_by_image(&train)?; + + let mut threshold = 0.5; + let max_threshold = 0.8; + + let mut test_total_size = 0; + let mut min_test_size: u32 = std::u32::MAX; + let mut min_data_dict = HashMap::new(); + + while threshold <= max_threshold { + version_datadict.clear(); + for point in data_point.iter_mut() { + for single_dictionary in &datadict { + for (key, value) in single_dictionary.iter() { + if key.contains(&point.image_name) { + let mut to_remove = Vec::new(); + for chunk in point.chunk_list.iter() { + if value.contains(chunk) { + to_remove.push(chunk.clone()); + } + } + for chunk in to_remove { + point.chunk_list.retain(|c| c != &chunk); + } + } + } + } + let chunk_dict = Self::exponential_smoothing(point.chunk_list.clone(), threshold)?; + version_datadict.insert(point.image_name.clone(), chunk_dict); + } + + let mut test_by_image = Self::divide_by_image(&test)?; + for point in test_by_image.iter_mut() { + if version_datadict.contains_key(&point.image_name.clone()) { + let mut to_remove = Vec::new(); + let mut vec_string = Vec::new(); + let chunkdict_option = version_datadict.get(&point.image_name); + if let Some(chunkdict) = chunkdict_option { + for i in chunkdict { + vec_string.push(i.chunk_digest.clone()); + } + } + for chunk in point.chunk_list.iter() { + if vec_string.contains(&chunk.chunk_digest) { + to_remove.push(chunk.clone()); + } + } + for chunk in to_remove { + point.chunk_list.retain(|c| c != &chunk); + } + } + for chunk in point.chunk_list.iter() { + test_total_size += chunk.chunk_compressed_size; + } + } + if test_total_size <= min_test_size { + min_test_size = test_total_size; + min_data_dict = version_datadict.clone(); + } + threshold += 0.05; + } + info!( + "After deduplicating test set size is {} and deduplicating rate is {} ", + min_test_size as f64 / 1024 as f64 / 1024 as f64, + 1.0 - (min_test_size as f64) / (test_set_size as f64) + ); + Ok((min_data_dict, datadict)) + } +} + +#[allow(dead_code)] +#[derive(Debug)] +struct DataPoint { + image_name: String, + chunk_list: Vec, + visited: bool, + clustered: bool, + cluster_id: i32, +} + pub trait Table: Sync + Send + Sized + 'static where Err: std::error::Error + 'static, @@ -229,7 +794,7 @@ where fn 
list_paged(&self, offset: i64, limit: i64) -> Result, Err>; } -#[derive(Debug)] +#[derive()] pub struct ChunkTable { conn: Arc>, } @@ -250,8 +815,73 @@ impl ChunkTable { } } -#[derive(Debug)] +#[derive(Debug, Clone)] +struct CustomString(String); + +impl Ord for CustomString { + // Extract the numbers in the string + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + let mut current_number = String::new(); + + // Parse numbers in strings + let mut numbers1 = Vec::new(); + let mut numbers2 = Vec::new(); + + for ch in self.0.chars() { + if ch.is_ascii_digit() { + current_number.push(ch); + } else if !current_number.is_empty() { + if let Ok(number) = current_number.parse::() { + numbers1.push(number); + } + current_number.clear(); + } + } + if !current_number.is_empty() { + if let Ok(number) = current_number.parse::() { + numbers1.push(number); + } + } + current_number.clear(); + + for ch in other.0.chars() { + if ch.is_ascii_digit() { + current_number.push(ch); + } else if !current_number.is_empty() { + if let Ok(number) = current_number.parse::() { + numbers2.push(number); + } + current_number.clear(); + } + } + if !current_number.is_empty() { + if let Ok(number) = current_number.parse::() { + numbers2.push(number); + } + } + current_number.clear(); + numbers1.cmp(&numbers2) + } +} + +impl PartialOrd for CustomString { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl PartialEq for CustomString { + fn eq(&self, other: &Self) -> bool { + self.0 == other.0 + } +} + +impl Eq for CustomString {} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct Chunk { + image_name: String, + version_name: String, chunk_blob_id: String, chunk_digest: String, chunk_compressed_size: u32, @@ -277,6 +907,8 @@ impl Table for ChunkTable { .execute( "CREATE TABLE IF NOT EXISTS chunk ( id INTEGER PRIMARY KEY, + image_name TEXT, + version_name TEXT, chunk_blob_id TEXT NOT NULL, chunk_digest TEXT, chunk_compressed_size INT, @@ -296,6 +928,8 @@ impl Table for ChunkTable { .map_err(|e| DatabaseError::PoisonError(e.to_string()))? 
.execute( "INSERT INTO chunk( + image_name, + version_name, chunk_blob_id, chunk_digest, chunk_compressed_size, @@ -303,9 +937,11 @@ impl Table for ChunkTable { chunk_compressed_offset, chunk_uncompressed_offset ) - VALUES (?1, ?2, ?3, ?4, ?5, ?6); + VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8); ", rusqlite::params![ + chunk.image_name, + chunk.version_name, chunk.chunk_blob_id, chunk.chunk_digest, chunk.chunk_compressed_size, @@ -343,18 +979,20 @@ impl Table for ChunkTable { .map_err(|e| DatabaseError::PoisonError(e.to_string()))?; let mut stmt: rusqlite::Statement<'_> = conn_guard .prepare( - "SELECT id, chunk_blob_id, chunk_digest, chunk_compressed_size, + "SELECT id, image_name, version_name, chunk_blob_id, chunk_digest, chunk_compressed_size, chunk_uncompressed_size, chunk_compressed_offset, chunk_uncompressed_offset from chunk ORDER BY id LIMIT ?1 OFFSET ?2", )?; let chunk_iterator = stmt.query_map(params![limit, offset], |row| { Ok(Chunk { - chunk_blob_id: row.get(1)?, - chunk_digest: row.get(2)?, - chunk_compressed_size: row.get(3)?, - chunk_uncompressed_size: row.get(4)?, - chunk_compressed_offset: row.get(5)?, - chunk_uncompressed_offset: row.get(6)?, + image_name: row.get(1)?, + version_name: row.get(2)?, + chunk_blob_id: row.get(3)?, + chunk_digest: row.get(4)?, + chunk_compressed_size: row.get(5)?, + chunk_uncompressed_size: row.get(6)?, + chunk_compressed_offset: row.get(7)?, + chunk_uncompressed_offset: row.get(8)?, }) })?; let mut chunks = Vec::new(); @@ -488,6 +1126,33 @@ mod tests { use super::*; use rusqlite::Result; + #[test] + fn test_partial_cmp() -> Result<(), Box> { + let custom_string1 = CustomString("nydus_1.2.3".to_string()); + let custom_string2 = CustomString("nydus_1.2.10".to_string()); + let custom_string3 = CustomString("nydus_2.0".to_string()); + + assert!(custom_string1 < custom_string2); + assert!(custom_string2 < custom_string3); + assert!(custom_string1 < custom_string3); + + assert!(custom_string1 <= custom_string2); + assert!(custom_string2 <= custom_string3); + assert!(custom_string1 <= custom_string3); + + assert!(custom_string2 > custom_string1); + assert!(custom_string3 > custom_string2); + assert!(custom_string3 > custom_string1); + + assert!(custom_string2 >= custom_string1); + assert!(custom_string3 >= custom_string2); + assert!(custom_string3 >= custom_string1); + + assert_eq!(custom_string1, CustomString("nydus_1.2.3".to_string())); + assert_ne!(custom_string1, custom_string2); + Ok(()) + } + #[test] fn test_blob_table() -> Result<(), Box> { let blob_table = BlobTable::new_in_memory()?; @@ -511,6 +1176,8 @@ mod tests { let chunk_table = ChunkTable::new_in_memory()?; chunk_table.create()?; let chunk = Chunk { + image_name: "REDIS".to_string(), + version_name: "1.0.0".to_string(), chunk_blob_id: "BLOB123".to_string(), chunk_digest: "DIGEST123".to_string(), chunk_compressed_size: 512, @@ -520,6 +1187,8 @@ mod tests { }; chunk_table.insert(&chunk)?; let chunks = chunk_table.list_all()?; + assert_eq!(chunks[0].image_name, chunk.image_name); + assert_eq!(chunks[0].version_name, chunk.version_name); assert_eq!(chunks.len(), 1); assert_eq!(chunks[0].chunk_blob_id, chunk.chunk_blob_id); assert_eq!(chunks[0].chunk_digest, chunk.chunk_digest); @@ -566,6 +1235,8 @@ mod tests { for i in 0..200 { let i64 = i as u64; let chunk = Chunk { + image_name: format!("REDIS{}", i), + version_name: format!("1.0.0{}", i), chunk_blob_id: format!("BLOB{}", i), chunk_digest: format!("DIGEST{}", i), chunk_compressed_size: i, @@ -577,6 +1248,8 @@ mod tests { } let chunks = 
chunk_table.list_paged(100, 100)?; assert_eq!(chunks.len(), 100); + assert_eq!(chunks[0].image_name, "REDIS100"); + assert_eq!(chunks[0].version_name, "1.0.0100"); assert_eq!(chunks[0].chunk_blob_id, "BLOB100"); assert_eq!(chunks[0].chunk_digest, "DIGEST100"); assert_eq!(chunks[0].chunk_compressed_size, 100); @@ -585,4 +1258,272 @@ mod tests { assert_eq!(chunks[0].chunk_uncompressed_offset, 400); Ok(()) } + + #[test] + fn test_algorithm_exponential_smoothing() -> Result<(), Box> { + let threshold = 0.1; + let mut all_chunk: Vec = Vec::new(); + for i in 0..199 { + let i64 = i as u64; + let chunk = Chunk { + image_name: format!("REDIS{}", 0), + version_name: format!("1.0.0{}", (i + 1) / 100), + chunk_blob_id: format!("BLOB{}", i), + chunk_digest: format!("DIGEST{}", (i + 1) % 2), + chunk_compressed_size: i, + chunk_uncompressed_size: i * 2, + chunk_compressed_offset: i64 * 3, + chunk_uncompressed_offset: i64 * 4, + }; + all_chunk.push(chunk); + } + let chunkdict = Algorithm::::exponential_smoothing(all_chunk, threshold)?; + assert_eq!(chunkdict.len(), 2); + assert_eq!(chunkdict[0].image_name, "REDIS0"); + assert_eq!(chunkdict[0].version_name, "1.0.01"); + assert_eq!(chunkdict[0].chunk_blob_id, "BLOB99"); + assert_eq!(chunkdict[0].chunk_digest, "DIGEST0"); + assert_eq!(chunkdict[0].chunk_compressed_size, 99); + assert_eq!(chunkdict[0].chunk_uncompressed_size, 198); + assert_eq!(chunkdict[0].chunk_compressed_offset, 297); + assert_eq!(chunkdict[0].chunk_uncompressed_offset, 396); + Ok(()) + } + + #[test] + fn test_divide_by_image() -> Result<(), Box> { + let db_url = "./metadata.db"; + let chunk_table = ChunkTable::new(db_url)?; + chunk_table.create()?; + for i in 0..200 { + let i64 = i as u64; + let chunk = Chunk { + image_name: format!("REDIS{}", i / 50), + version_name: format!("1.0.0{}", (i + 1) / 100), + chunk_blob_id: format!("BLOB{}", i), + chunk_digest: format!("DIGEST{}", (i + 1) % 2), + chunk_compressed_size: i, + chunk_uncompressed_size: i * 2, + chunk_compressed_offset: i64 * 3, + chunk_uncompressed_offset: i64 * 4, + }; + chunk_table.insert(&chunk)?; + } + let algorithm = String::from("exponential_smoothing"); + let algorithm = Algorithm::::new(algorithm, db_url)?; + let all_chunks = algorithm.db.chunk_table.list_all()?; + assert_eq!(all_chunks.len(), 200); + let datadict = Algorithm::::divide_by_image(&all_chunks)?; + assert_eq!(datadict.len(), 4); + assert_eq!(datadict[3].cluster_id, 0); + assert_eq!(datadict[3].chunk_list.len(), 50); + chunk_table.clear()?; + Ok(()) + } + + #[test] + fn test_distance() -> Result<(), Box> { + let mut all_chunks1: Vec = Vec::new(); + for i in 0..200 { + let i64 = i as u64; + let chunk = Chunk { + image_name: format!("REDIS{}", 0), + version_name: format!("1.0.0{}", (i + 1) / 100), + chunk_blob_id: format!("BLOB{}", i), + chunk_digest: format!("DIGEST{}", (i + 1) % 4), + chunk_compressed_size: 1, + chunk_uncompressed_size: 1, + chunk_compressed_offset: i64 * 3, + chunk_uncompressed_offset: i64 * 4, + }; + all_chunks1.push(chunk); + } + let mut all_chunks2: Vec = Vec::new(); + for i in 0..200 { + let i64 = i as u64; + let chunk = Chunk { + image_name: format!("REDIS{}", 1), + version_name: format!("1.0.0{}", (i + 1) / 100), + chunk_blob_id: format!("BLOB{}", i), + chunk_digest: format!("DIGEST{}", (i + 1) % 4), + chunk_compressed_size: 1, + chunk_uncompressed_size: 1, + chunk_compressed_offset: i64 * 3, + chunk_uncompressed_offset: i64 * 4, + }; + all_chunks2.push(chunk); + } + let datadict = Algorithm::::distance(&all_chunks1, &all_chunks2)?; + 
assert!( + (datadict - 0.01).abs() <= 0.0001, + "Expected {} to be approximately equal to {} with tolerance {}", + datadict, + 0.01, + 0.0001 + ); + Ok(()) + } + + #[test] + fn test_divide_set() -> Result<(), Box> { + let mut all_chunks: Vec = Vec::new(); + for i in 0..200 { + for j in 0..100 { + let chunk = Chunk { + image_name: format!("REDIS{}", i), + version_name: format!("1.0.0{}", j / 10), + chunk_blob_id: format!("BLOB{}", j), + chunk_digest: format!("DIGEST{}", j + (i / 100) * 100), + chunk_compressed_size: 1, + chunk_uncompressed_size: 1, + chunk_compressed_offset: 1, + chunk_uncompressed_offset: 1, + }; + all_chunks.push(chunk); + } + } + assert_eq!(all_chunks.len(), 20000); + let (train, test) = Algorithm::::divide_set(&all_chunks, 0.7)?; + assert_eq!(train.len(), 14000); + assert_eq!(train[0].image_name, "REDIS0"); + assert_eq!(train[0].version_name, "1.0.00"); + assert_eq!(test.len(), 6000); + assert_eq!(test[0].image_name, "REDIS0"); + assert_eq!(test[0].version_name, "1.0.07"); + Ok(()) + } + + #[test] + fn test_dbscan() -> Result<(), Box> { + let mut all_chunks: Vec = Vec::new(); + let radius = 0.6; + for i in 0..200 { + for j in 0..100 { + let chunk = Chunk { + image_name: format!("REDIS{}", i), + version_name: format!("1.0.0{}", j / 10), + chunk_blob_id: format!("BLOB{}", j), + chunk_digest: format!("DIGEST{}", j + (i / 100) * 100), + chunk_compressed_size: 1, + chunk_uncompressed_size: 1, + chunk_compressed_offset: 1, + chunk_uncompressed_offset: 1, + }; + all_chunks.push(chunk); + } + } + assert_eq!(all_chunks.len(), 20000); + let mut data_point = Algorithm::::divide_by_image(&all_chunks)?; + let datadict = Algorithm::::dbsacn(&mut data_point, radius)?; + assert_eq!(datadict.len(), 200); + if datadict[150].chunk_list[0].chunk_digest == datadict[0].chunk_list[0].chunk_digest { + assert_eq!(datadict[150].cluster_id, 1); + } else { + assert_eq!(datadict[150].cluster_id, 2); + } + assert_eq!(datadict[0].cluster_id, 1); + assert!(datadict[150].clustered); + assert!(datadict[150].visited); + assert_eq!(datadict[0].chunk_list.len(), 100); + Ok(()) + } + + #[test] + fn test_aggregate_chunk() -> Result<(), Box> { + let mut all_chunks: Vec = Vec::new(); + let radius = 0.6; + for i in 0..200 { + for j in 0..100 { + let chunk = Chunk { + image_name: format!("REDIS{}", i), + version_name: format!("1.0.0{}", (j + 1) / 100), + chunk_blob_id: format!("BLOB{}", j), + chunk_digest: format!("DIGEST{}", j + (i / 100) * 100), + chunk_compressed_size: 1, + chunk_uncompressed_size: 1, + chunk_compressed_offset: 1, + chunk_uncompressed_offset: 1, + }; + all_chunks.push(chunk); + } + } + assert_eq!(all_chunks.len(), 20000); + let mut data_point = Algorithm::::divide_by_image(&all_chunks)?; + let data_cluster = Algorithm::::dbsacn(&mut data_point, radius)?; + let datadict = Algorithm::::aggregate_chunk(&data_cluster)?; + assert_eq!(datadict.len(), 2); + Ok(()) + } + + #[test] + fn test_deduplicate_image() -> Result<(), Box> { + let mut all_chunks: Vec = Vec::new(); + for i in 0..200 { + for j in 0..100 { + let chunk = Chunk { + image_name: format!("REDIS{}", i), + version_name: format!("1.0.0{}", j / 10), + chunk_blob_id: format!("BLOB{}", j), + chunk_digest: format!("DIGEST{}", j + (i / 100) * 100), + chunk_compressed_size: 1, + chunk_uncompressed_size: 1, + chunk_compressed_offset: 1, + chunk_uncompressed_offset: 1, + }; + all_chunks.push(chunk); + } + } + assert_eq!(all_chunks.len(), 20000); + let datadict = Algorithm::::deduplicate_image(all_chunks)?; + for i in datadict.clone() { + for (_, 
b) in i { + if !b.is_empty() { + assert_eq!(b.len(), 70); + } + } + } + assert_eq!(datadict[0].len(), 2); + assert_eq!(datadict[0].values().len(), 2); + assert_eq!(datadict[1].len(), 0); + assert_eq!(datadict[1].values().len(), 0); + assert_eq!(datadict.len(), 7); + Ok(()) + } + + #[test] + fn test_deduplicate_version() -> Result<(), Box> { + let mut all_chunks: Vec = Vec::new(); + let mut chunkdict: Vec = Vec::new(); + for i in 0..200 { + let i64 = i as u64; + let chunk = Chunk { + image_name: format!("REDIS{}", 0), + version_name: format!("1.0.0{}", (i + 1) / 20), + chunk_blob_id: format!("BLOB{}", i), + chunk_digest: format!("DIGEST{}", (i + 1) % 2), + chunk_compressed_size: i, + chunk_uncompressed_size: i * 2, + chunk_compressed_offset: i64 * 3, + chunk_uncompressed_offset: i64 * 4, + }; + all_chunks.push(chunk); + } + let (chunkdict_version, chunkdict_image) = + Algorithm::::deduplicate_version(&all_chunks)?; + for (_, dictionary) in chunkdict_version { + chunkdict.extend(dictionary); + } + + assert_eq!(chunkdict[0].image_name, "REDIS0"); + assert_eq!(chunkdict[0].chunk_compressed_size, 21); + assert_eq!(chunkdict.len(), 2); + + for single_clustering in chunkdict_image { + for (_, cluster_dictionary) in single_clustering { + chunkdict.extend(cluster_dictionary); + } + } + assert_eq!(chunkdict.len(), 2); + Ok(()) + } } diff --git a/src/bin/nydus-image/main.rs b/src/bin/nydus-image/main.rs index 29dd8a0e072..9ac94fd86d8 100644 --- a/src/bin/nydus-image/main.rs +++ b/src/bin/nydus-image/main.rs @@ -380,14 +380,14 @@ fn prepare_cmd_args(bti_string: &'static str) -> App { Arg::new("bootstrap") .short('B') .long("bootstrap") - .help("File path of RAFS meta blob/bootstrap") + .help("File path of RAFS meta blob/bootstrap, e.g. /path/output/localhost:5000:redis:nydus_7.0.1/nydus_bootstrap") .required(false), ) .arg( Arg::new("database") .long("database") - .help("Database connection URI for assisting chunk dict generation, e.g. sqlite:///path/to/database.db") - .default_value("sqlite://:memory:") + .help("Database connection URI for assisting chunk dict generation, e.g. sqlite:///path/chunkdict.db") + .default_value("sqlite:///home/runner/work/image-service/chunkdict/image-service/contrib/nydusify/chunkdict.db") .required(false), ) .arg( @@ -409,7 +409,25 @@ fn prepare_cmd_args(bti_string: &'static str) -> App { .required(false), ) .arg(arg_output_json.clone()) - ) + ) + .subcommand( + App::new("generate") + .about("generate chunk dictionary based on database") + .arg( + Arg::new("database") + .long("database") + .help("Database connection address for assisting chunk dictionary generation, e.g. 
sqlite:///path/chunkdict.db") + .required(true), + ) + .arg( + Arg::new("verbose") + .long("verbose") + .short('v') + .help("Output message in verbose mode") + .action(ArgAction::SetTrue) + .required(false), + ) + ) ); let app = app.subcommand( @@ -764,6 +782,9 @@ fn main() -> Result<()> { } else if let Some(matches) = cmd.subcommand_matches("chunkdict") { match matches.subcommand_name() { Some("save") => Command::chunkdict_save(matches.subcommand_matches("save").unwrap()), + Some("generate") => { + Command::chunkdict_generate(matches.subcommand_matches("generate").unwrap()) + } _ => { println!("{}", usage); Ok(()) @@ -1183,15 +1204,34 @@ impl Command { } fn chunkdict_save(matches: &ArgMatches) -> Result<()> { + // Parse the directory name of bootstrap and obtain the image name and version name let bootstrap_path = Self::get_bootstrap(matches)?; + let path = bootstrap_path.display().to_string(); + info!("Bootstrap path is {}", path); + let path_name: Vec<&str> = path.split('/').collect(); + + // Extract the image name and version name from the bootstrap directory + let bootstrap_dir = match path_name.get(path_name.len() - 2) { + Some(&bootstrap_dir) => bootstrap_dir.to_string(), + None => bail!("Invalid Bootstrap directory name"), + }; + let full_image_name: Vec<&str> = bootstrap_dir.split(':').collect(); + let image_name = match full_image_name.get(full_image_name.len() - 2) { + Some(&second_last) => second_last.to_string(), + None => bail!("Invalid image name"), + }; + let version_name = match full_image_name.last() { + Some(&last) => last.to_string(), + None => bail!("Invalid version name"), + }; + let config = Self::get_configuration(matches)?; let db_url: &String = matches.get_one::("database").unwrap(); - debug!("db_url: {}", db_url); + // For backward compatibility with v2.1. config .internal .set_blob_accessible(matches.get_one::("bootstrap").is_none()); - let db_strs: Vec<&str> = db_url.split("://").collect(); if db_strs.len() != 2 || (!db_strs[1].starts_with('/') && !db_strs[1].starts_with(':')) { bail!("Invalid database URL: {}", db_url); @@ -1201,16 +1241,49 @@ impl Command { "sqlite" => { let mut deduplicate: Deduplicate = Deduplicate::::new(db_strs[1])?; - deduplicate.save_metadata(bootstrap_path, config)? + deduplicate.save_metadata(bootstrap_path, config, image_name, version_name)? 
} _ => { - bail!("Unsupported database type: {}, please use a valid database URI, such as 'sqlite:///path/to/database.db'.", db_strs[0]) + bail!("Unsupported database type: {}, please use a valid database URI, such as 'sqlite:///path/to/chunkdict.db'.", db_strs[0]) } }; info!("Chunkdict metadata is saved at: {:?}", db_url); Ok(()) } + fn chunkdict_generate(matches: &ArgMatches) -> Result<()> { + // Connecting database and Generating chunk dictionary by algorithm "exponential_smoothing" + let db_url: &String = matches.get_one::("database").unwrap(); + debug!("db_url: {}", db_url); + let db_strs: Vec<&str> = db_url.split("://").collect(); + if db_strs.len() != 2 || (!db_strs[1].starts_with('/') && !db_strs[1].starts_with(':')) { + bail!("Invalid database URL: {}", db_url); + } + let algorithm = String::from("exponential_smoothing"); + + match db_strs[0] { + "sqlite" => { + let mut algorithm: deduplicate::Algorithm = + deduplicate::Algorithm::::new(algorithm, db_strs[1])?; + let (chunkdict, noise_points) = algorithm.chunkdict_generate()?; + info!( + "The length of chunkdict is {}", + Vec::::len(&chunkdict) + ); + info!("It is not recommended to use image deduplication"); + for image_name in noise_points { + info!("{}", image_name); + } + } + _ => { + bail!("Unsupported database type: {}, please use a valid database URI, such as 'sqlite:///path/to/chunkdict.db'.", db_strs[0]) + } + }; + + // To be continued, dump chunk of "chunk dictionary" ... + Ok(()) + } + fn merge(matches: &ArgMatches, build_info: &BuildTimeInfo) -> Result<()> { let source_bootstrap_paths: Vec = matches .get_many::("SOURCE") From db7d3952642f85021b43d9e414f3a323a5e93ff5 Mon Sep 17 00:00:00 2001 From: Lin Wang Date: Tue, 28 Nov 2023 16:22:46 +0800 Subject: [PATCH 02/11] nydus-image: Store chunk and blob metadata Signed-off-by: Lin Wang --- builder/src/core/context.rs | 23 +++ builder/src/generate.rs | 257 +++++++++++++++++++++++++++++ builder/src/lib.rs | 3 + src/bin/nydus-image/deduplicate.rs | 227 +++++++++++++++++-------- src/bin/nydus-image/main.rs | 216 +++++++++++++++++++++--- utils/src/digest.rs | 12 ++ 6 files changed, 644 insertions(+), 94 deletions(-) create mode 100644 builder/src/generate.rs diff --git a/builder/src/core/context.rs b/builder/src/core/context.rs index b9290d7015f..49d59734d32 100644 --- a/builder/src/core/context.rs +++ b/builder/src/core/context.rs @@ -955,6 +955,29 @@ impl BlobManager { } } + /// Get or cerate blob for chunkdict, this is used for chunk deduplication. + pub fn get_or_cerate_blob_for_chunkdict( + &mut self, + ctx: &BuildContext, + id: &str, + ) -> Result<(u32, &mut BlobContext)> { + if self.get_blob_idx_by_id(id).is_none() { + let blob_ctx = Self::new_blob_ctx(ctx)?; + self.current_blob_index = Some(self.alloc_index()?); + self.add_blob(blob_ctx); + } else { + self.current_blob_index = self.get_blob_idx_by_id(id); + } + + // Safe to unwrap because the blob context has been added. + Ok(self.get_current_blob().unwrap()) + } + + /// Determine if the given blob has been created. + pub fn has_blob(&self, blob_id: &str) -> bool { + self.get_blob_idx_by_id(blob_id).is_some() + } + /// Set the global chunk dictionary for chunk deduplication. pub fn set_chunk_dict(&mut self, dict: Arc) { self.global_chunk_dict = dict diff --git a/builder/src/generate.rs b/builder/src/generate.rs new file mode 100644 index 00000000000..576142b926c --- /dev/null +++ b/builder/src/generate.rs @@ -0,0 +1,257 @@ +// Copyright (C) 2022 Nydus Developers. All rights reserved. 
+// +// SPDX-License-Identifier: Apache-2.0 + +//! Generate Chunkdict RAFS bootstrap. +//! Bug 1: Inconsistent Chunk Size Leading to Blob Size Less Than 4K +//! Description: The size of chunks is not consistent, which results in the possibility that a blob, composed of a group of these chunks, may be less than 4K in size. This inconsistency leads to a failure in passing the size check. +//! Bug 2: Incorrect Chunk Number Calculation Due to Premature Check Logic +//! Description: The current logic for calculating the chunk number is based on the formula size/chunk size. However, this approach is flawed as it precedes the actual check which accounts for chunk statistics. Consequently, this leads to inaccurate counting of chunk numbers. + +use super::core::node::{ChunkSource, NodeInfo}; +use super::{BlobManager, Bootstrap, BootstrapManager, BuildContext, BuildOutput, Tree}; +use crate::core::node::Node; +use crate::NodeChunk; +use anyhow::Result; +use nydus_rafs::metadata::chunk::ChunkWrapper; +use nydus_rafs::metadata::inode::InodeWrapper; +use nydus_rafs::metadata::layout::RafsXAttrs; +use nydus_rafs::metadata::RafsVersion; +use nydus_storage::meta::BlobChunkInfoV1Ondisk; +use nydus_utils::digest::RafsDigest; +use nydus_utils::lazy_drop; +use std::ffi::OsString; +use std::mem::size_of; +use std::path::PathBuf; +use std::sync::Arc; +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct ChunkdictChunkInfo { + pub image_name: String, + pub version_name: String, + pub chunk_blob_id: String, + pub chunk_digest: String, + pub chunk_compressed_size: u32, + pub chunk_uncompressed_size: u32, + pub chunk_compressed_offset: u64, + pub chunk_uncompressed_offset: u64, +} + +/// Struct to Generater chunkdict RAFS bootstrap. +pub struct Generater {} + +impl Generater { + // Generate chunkdict RAFS bootstrap. + #[allow(clippy::too_many_arguments)] + pub fn generate( + ctx: &mut BuildContext, + bootstrap_mgr: &mut BootstrapManager, + blob_mgr: &mut BlobManager, + chunkdict_origin: Vec, + ) -> Result { + // validate and remove chunks which bloned blob size is smaller than block. 
+ let mut chunkdict = chunkdict_origin.to_vec(); + Self::validate_and_remove_chunks(&mut chunkdict, ctx); + + // build root tree + let mut tree = Self::build_root_tree()?; + + // build child tree + let child = Self::build_child_tree(ctx, blob_mgr, &chunkdict)?; + let result = vec![child]; + tree.children = result; + tree.lock_node() + .v5_set_dir_size(ctx.fs_version, &tree.children); + + Self::validate_tree(&tree)?; + + // build bootstrap + let mut bootstrap_ctx = bootstrap_mgr.create_ctx()?; + let mut bootstrap = Bootstrap::new(tree)?; + bootstrap.build(ctx, &mut bootstrap_ctx)?; + + let blob_table = blob_mgr.to_blob_table(ctx)?; + let storage = &mut bootstrap_mgr.bootstrap_storage; + bootstrap.dump(ctx, storage, &mut bootstrap_ctx, &blob_table)?; + + lazy_drop(bootstrap_ctx); + + BuildOutput::new(blob_mgr, &bootstrap_mgr.bootstrap_storage) + } + + /// validate tree + fn validate_tree(tree: &Tree) -> Result<()> { + let pre = &mut |t: &Tree| -> Result<()> { + let node = t.lock_node(); + debug!("chunkdict tree: "); + debug!("inode: {}", node); + for chunk in &node.chunks { + debug!("\t chunk: {}", chunk); + } + Ok(()) + }; + tree.walk_dfs_pre(pre)?; + debug!("chunkdict tree is valid."); + Ok(()) + } + + /// check blob uncompressed size is bigger than block + fn validate_and_remove_chunks(chunkdict: &mut Vec, ctx: &mut BuildContext) { + let mut chunk_sizes = std::collections::HashMap::new(); + + // Accumulate the uncompressed size for each chunk_blob_id + for chunk in chunkdict.iter() { + *chunk_sizes.entry(chunk.chunk_blob_id.clone()).or_insert(0) += + chunk.chunk_uncompressed_size as u64; + } + + // Find all chunk_blob_ids with a total uncompressed size > 4096 + let small_chunks: Vec = chunk_sizes + .into_iter() + .filter(|&(_, size)| size < ctx.v6_block_size()) + .inspect(|(id, _)| { + eprintln!( + "Warning: Blob with id '{}' is smaller than {} bytes.", + id, + ctx.v6_block_size() + ) + }) + .map(|(id, _)| id) + .collect(); + + // Retain only chunks with chunk_blob_id that has a total uncompressed size > 4096 + chunkdict.retain(|chunk| !small_chunks.contains(&chunk.chunk_blob_id)); + } + + /// Build root tree + pub fn build_root_tree() -> Result { + // inode + let mut inode = InodeWrapper::new(RafsVersion::V6); + inode.set_ino(0); + inode.set_uid(1000); + inode.set_gid(1000); + inode.set_projid(0); + inode.set_mode(0o660 | libc::S_IFDIR as u32); + inode.set_nlink(1); + inode.set_name_size("/".len()); + inode.set_rdev(0); + inode.set_blocks(256); + let node_info = NodeInfo { + explicit_uidgid: true, + src_dev: 66305, + src_ino: 24772610, + rdev: 0, + source: PathBuf::from("/"), + path: PathBuf::from("/"), + target: PathBuf::from("/"), + target_vec: vec![OsString::from("/")], + symlink: None, + xattrs: RafsXAttrs::default(), + v6_force_extended_inode: true, + }; + let root_node = Node::new(inode, node_info, 0); + let tree = Tree::new(root_node); + Ok(tree) + } + + /// Build child tree + fn build_child_tree( + ctx: &mut BuildContext, + blob_mgr: &mut BlobManager, + chunkdict: &[ChunkdictChunkInfo], + ) -> Result { + // node + let mut inode = InodeWrapper::new(RafsVersion::V6); + inode.set_ino(1); + inode.set_uid(0); + inode.set_gid(0); + inode.set_projid(0); + inode.set_mode(0o660 | libc::S_IFREG as u32); + inode.set_nlink(1); + inode.set_name_size("chunkdict".len()); + inode.set_rdev(0); + inode.set_blocks(256); + let node_info = NodeInfo { + explicit_uidgid: true, + src_dev: 66305, + src_ino: 24775126, + rdev: 0, + source: PathBuf::from("/"), + path: PathBuf::from("/chunkdict"), + target: 
PathBuf::from("/chunkdict"), + target_vec: vec![OsString::from("/"), OsString::from("/chunkdict")], + symlink: None, + xattrs: RafsXAttrs::new(), + v6_force_extended_inode: true, + }; + let mut node = Node::new(inode, node_info, 0); + + // insert chunks + Self::insert_chunks(ctx, blob_mgr, &mut node, chunkdict)?; + + let node_size: u64 = node + .chunks + .iter() + .map(|chunk| chunk.inner.uncompressed_size() as u64) + .sum(); + node.inode.set_size(node_size); + + // update child count + node.inode.set_child_count(node.chunks.len() as u32); + + let child = Tree::new(node); + child + .lock_node() + .v5_set_dir_size(ctx.fs_version, &child.children); + Ok(child) + } + + /// Insert chunks + fn insert_chunks( + ctx: &mut BuildContext, + blob_mgr: &mut BlobManager, + node: &mut Node, + chunkdict: &[ChunkdictChunkInfo], + ) -> Result<()> { + for chunk_info in chunkdict.iter() { + let chunk_size: u32 = chunk_info.chunk_compressed_size; + let file_offset = 1 as u64 * chunk_size as u64; + ctx.fs_version = RafsVersion::V6; + let mut chunk = ChunkWrapper::new(RafsVersion::V6); + + // update blob context + let (blob_index, blob_ctx) = + blob_mgr.get_or_cerate_blob_for_chunkdict(ctx, &chunk_info.chunk_blob_id)?; + if blob_ctx.blob_id.is_empty() { + blob_ctx.blob_id = chunk_info.chunk_blob_id.clone(); + } + let chunk_uncompressed_size = chunk_info.chunk_uncompressed_size; + let pre_d_offset = blob_ctx.current_uncompressed_offset; + blob_ctx.uncompressed_blob_size = pre_d_offset + chunk_uncompressed_size as u64; + blob_ctx.current_uncompressed_offset += chunk_uncompressed_size as u64; + + blob_ctx.blob_meta_header.set_ci_uncompressed_size( + blob_ctx.blob_meta_header.ci_uncompressed_size() + + size_of::() as u64, + ); + + // update chunk + let chunk_index = blob_ctx.alloc_chunk_index()?; + chunk.set_blob_index(blob_index); + chunk.set_index(chunk_index); + chunk.set_file_offset(file_offset); + chunk.set_compressed_size(chunk_info.chunk_compressed_size); + chunk.set_compressed_offset(chunk_info.chunk_compressed_offset); + chunk.set_uncompressed_size(chunk_info.chunk_uncompressed_size); + chunk.set_uncompressed_offset(chunk_info.chunk_uncompressed_offset); + chunk.set_id(RafsDigest::from_string(&chunk_info.chunk_digest)); + + debug!("chunk id: {}", chunk.id()); + + node.chunks.push(NodeChunk { + source: ChunkSource::Build, + inner: Arc::new(chunk.clone()), + }); + } + Ok(()) + } +} diff --git a/builder/src/lib.rs b/builder/src/lib.rs index 7d785ea3f88..bf18b43cec3 100644 --- a/builder/src/lib.rs +++ b/builder/src/lib.rs @@ -36,6 +36,8 @@ pub use self::core::overlay::{Overlay, WhiteoutSpec}; pub use self::core::prefetch::{Prefetch, PrefetchPolicy}; pub use self::core::tree::{MetadataTreeBuilder, Tree, TreeNode}; pub use self::directory::DirectoryBuilder; +pub use self::generate::ChunkdictChunkInfo; +pub use self::generate::Generater; pub use self::merge::Merger; pub use self::stargz::StargzBuilder; pub use self::tarball::TarballBuilder; @@ -43,6 +45,7 @@ pub use self::tarball::TarballBuilder; mod compact; mod core; mod directory; +mod generate; mod merge; mod stargz; mod tarball; diff --git a/src/bin/nydus-image/deduplicate.rs b/src/bin/nydus-image/deduplicate.rs index ccac5ce31d1..a3cbd0f9222 100644 --- a/src/bin/nydus-image/deduplicate.rs +++ b/src/bin/nydus-image/deduplicate.rs @@ -6,6 +6,7 @@ use anyhow::{Context, Result}; use core::cmp::Ordering; use nydus_api::ConfigV2; +use nydus_builder::ChunkdictChunkInfo; use nydus_builder::Tree; use nydus_rafs::metadata::RafsSuper; use 
nydus_storage::device::BlobInfo; @@ -51,13 +52,16 @@ pub trait Database { fn create_blob_table(&self) -> Result<()>; /// Inserts chunk information into the database. - fn insert_chunk(&self, chunk_info: &Chunk) -> Result<()>; + fn insert_chunk(&self, chunk_info: &ChunkdictChunkInfo) -> Result<()>; /// Inserts blob information into the database. fn insert_blob(&self, blob_info: &Blob) -> Result<()>; /// Retrieves all chunk information from the database. - fn get_chunks(&self) -> Result>; + fn get_chunks(&self) -> Result>; + + /// Retrieves all chunk information from the database filtered by blob ID. + fn get_chunks_by_blob_id(&self, blob_id: &str) -> Result>; /// Retrieves all blob information from the database. fn get_blobs(&self) -> Result>; @@ -106,7 +110,7 @@ impl Database for SqliteDatabase { BlobTable::create(&self.blob_table).context("Failed to create blob table") } - fn insert_chunk(&self, chunk: &Chunk) -> Result<()> { + fn insert_chunk(&self, chunk: &ChunkdictChunkInfo) -> Result<()> { self.chunk_table .insert(chunk) .context("Failed to insert chunk") @@ -118,10 +122,14 @@ impl Database for SqliteDatabase { .context("Failed to insert blob") } - fn get_chunks(&self) -> Result> { + fn get_chunks(&self) -> Result> { ChunkTable::list_all(&self.chunk_table).context("Failed to get chunks") } + fn get_chunks_by_blob_id(&self, blob_id: &str) -> Result> { + ChunkTable::list_all_by_blob_id(&self.chunk_table, blob_id).context("Failed to get chunks") + } + fn get_blobs(&self) -> Result> { BlobTable::list_all(&self.blob_table).context("Failed to get blobs") } @@ -194,7 +202,7 @@ impl Deduplicate { let index = chunk.inner.blob_index(); let chunk_blob_id = blob_infos[index as usize].blob_id(); self.db - .insert_chunk(&Chunk { + .insert_chunk(&ChunkdictChunkInfo { image_name: image_name.to_string(), version_name: version_name.to_string(), chunk_blob_id, @@ -221,9 +229,9 @@ pub struct Algorithm { } // Generate deduplicated chunkdict by exponential_smoothing algorithm -type Versiondic = HashMap>; +type Versiondic = HashMap>; // Generate deduplicated chunkdict by cluster algorithm -type Imagedic = Vec, Vec>>; +type Imagedic = Vec, Vec>>; impl Algorithm { pub fn new(algorithm: String, db_url: &str) -> anyhow::Result { @@ -233,9 +241,9 @@ impl Algorithm { } // Call the algorithm to generate a dictionary - pub fn chunkdict_generate(&mut self) -> anyhow::Result<(Vec, Vec)> { + pub fn chunkdict_generate(&mut self) -> anyhow::Result<(Vec, Vec)> { let all_chunks = self.db.chunk_table.list_all()?; - let mut chunkdict: Vec = Vec::new(); + let mut chunkdict: Vec = Vec::new(); let mut core_image = Vec::new(); let mut noise_points = Vec::new(); let (chunkdict_version, chunkdict_image) = match &self.algorithm_name as &str { @@ -274,7 +282,10 @@ impl Algorithm { // List all chunk and sort them by the order in chunk table // Score each chunk by "exponential_smoothing" formula // Select chunks whose score is greater than threshold and generate chunk dictionary - fn exponential_smoothing(all_chunks: Vec, threshold: f64) -> anyhow::Result> { + fn exponential_smoothing( + all_chunks: Vec, + threshold: f64, + ) -> anyhow::Result> { let alpha = 0.5; let mut smoothed_data = Vec::new(); @@ -310,9 +321,9 @@ impl Algorithm { } } - let mut chunkdict: Vec = Vec::new(); + let mut chunkdict: Vec = Vec::new(); for i in 0..smoothed_data.len() { - let chunk = Chunk { + let chunk = ChunkdictChunkInfo { image_name: all_chunks[i].image_name.clone(), version_name: all_chunks[i].version_name.clone(), chunk_blob_id: 
all_chunks[i].chunk_blob_id.clone(), @@ -328,18 +339,21 @@ impl Algorithm { } // Deduplicate chunk dictionary - let mut unique_chunks: BTreeMap = BTreeMap::new(); + let mut unique_chunks: BTreeMap = BTreeMap::new(); for chunk in &chunkdict { if !unique_chunks.contains_key(&chunk.chunk_digest) { unique_chunks.insert(chunk.chunk_digest.clone(), chunk.clone()); } } - let unique_chunk_list: Vec = unique_chunks.values().cloned().collect(); + let unique_chunk_list: Vec = unique_chunks.values().cloned().collect(); Ok(unique_chunk_list) } // Calculate the distance between two images - fn distance(image1: &[Chunk], image2: &[Chunk]) -> anyhow::Result { + fn distance( + image1: &[ChunkdictChunkInfo], + image2: &[ChunkdictChunkInfo], + ) -> anyhow::Result { // The total size of all chunks in both images let mut image1_size: u64 = 0; let mut image2_size: u64 = 0; @@ -352,7 +366,7 @@ impl Algorithm { } // The total size of the chunk repeated between two images - let all_chunks: Vec<&Chunk> = image1.iter().chain(image2.iter()).collect(); + let all_chunks: Vec<&ChunkdictChunkInfo> = image1.iter().chain(image2.iter()).collect(); let mut compressed_size_map: std::collections::HashMap = std::collections::HashMap::new(); let mut processed_digests: HashSet<&String> = HashSet::new(); @@ -373,8 +387,8 @@ impl Algorithm { } // Divide the chunk list into sublists by image name - fn divide_by_image(all_chunks: &[Chunk]) -> anyhow::Result> { - let mut image_chunks: std::collections::HashMap> = + fn divide_by_image(all_chunks: &[ChunkdictChunkInfo]) -> anyhow::Result> { + let mut image_chunks: std::collections::HashMap> = std::collections::HashMap::new(); let mut datadict: Vec = Vec::new(); for chunk in all_chunks { @@ -397,11 +411,11 @@ impl Algorithm { } fn divide_set( - chunks: &[Chunk], + chunks: &[ChunkdictChunkInfo], train_percentage: f64, - ) -> anyhow::Result<(Vec, Vec)> { + ) -> anyhow::Result<(Vec, Vec)> { // Create a HashMap to store the list of chunks for each image_name - let mut image_chunks: BTreeMap> = BTreeMap::new(); + let mut image_chunks: BTreeMap> = BTreeMap::new(); // Group chunks into image_name for chunk in chunks { @@ -412,12 +426,13 @@ impl Algorithm { } // Create the final training and testing sets - let mut train_set: Vec = Vec::new(); - let mut test_set: Vec = Vec::new(); + let mut train_set: Vec = Vec::new(); + let mut test_set: Vec = Vec::new(); // Iterate through the list of Chunks for each image_name for (_, chunk_list) in image_chunks.iter_mut() { - let mut version_chunks: BTreeMap> = BTreeMap::new(); + let mut version_chunks: BTreeMap> = + BTreeMap::new(); // Group the chunks in the image into version_name for chunk in chunk_list { let entry = version_chunks @@ -524,7 +539,7 @@ impl Algorithm { // Aggregate the chunks in each cluster into a dictionary fn aggregate_chunk( data_point: &[DataPoint], - ) -> anyhow::Result, Vec>> { + ) -> anyhow::Result, Vec>> { // Divide chunk list according to clusters let mut cluster_map: HashMap> = HashMap::new(); for (index, point) in data_point.iter().enumerate() { @@ -538,7 +553,7 @@ impl Algorithm { } // Iterate through each cluster - let mut dictionary: HashMap, Vec> = HashMap::new(); + let mut dictionary: HashMap, Vec> = HashMap::new(); for (_, cluster_points) in cluster_map.iter() { let mut image_total_counts: HashMap<&str, usize> = HashMap::new(); let mut image_list: Vec = Vec::new(); @@ -569,7 +584,7 @@ impl Algorithm { } } - let mut chunk_list: Vec = Vec::new(); + let mut chunk_list: Vec = Vec::new(); let mut added_chunk_digests: HashSet = 
HashSet::new(); for &point_index in cluster_points { let point = &data_point[point_index]; @@ -590,13 +605,13 @@ impl Algorithm { } fn deduplicate_image( - all_chunks: Vec, - ) -> anyhow::Result, Vec>>> { + all_chunks: Vec, + ) -> anyhow::Result, Vec>>> { let train_percentage = 0.7; let max_cluster_count = 7; let mut counter = 0; let all_chunks_clone = all_chunks; - let mut data_dict: Vec, Vec>> = Vec::new(); + let mut data_dict: Vec, Vec>> = Vec::new(); let (mut train, mut test) = Self::divide_set(&all_chunks_clone, train_percentage)?; while counter < max_cluster_count { @@ -617,7 +632,7 @@ impl Algorithm { let data_dict = Self::aggregate_chunk(data_cluster)?; - let all_chunks: HashSet<&Chunk> = + let all_chunks: HashSet<&ChunkdictChunkInfo> = data_dict.values().flat_map(|v| v.iter()).collect(); let mut total_test_set_size: u64 = 0; @@ -636,7 +651,7 @@ impl Algorithm { } debug!("test set size is {}", min_test_size); - let min_chunk_list: Vec = min_data_dict + let min_chunk_list: Vec = min_data_dict .values() .flat_map(|chunk_list| chunk_list.iter()) .cloned() @@ -662,7 +677,9 @@ impl Algorithm { Ok(data_dict) } - pub fn deduplicate_version(all_chunks: &[Chunk]) -> anyhow::Result<(Versiondic, Imagedic)> { + pub fn deduplicate_version( + all_chunks: &[ChunkdictChunkInfo], + ) -> anyhow::Result<(Versiondic, Imagedic)> { let mut all_chunks_size = 0; for i in all_chunks { all_chunks_size += i.chunk_compressed_size; @@ -693,7 +710,7 @@ impl Algorithm { test_set_size as f64 / 1024 as f64 / 1024 as f64 ); - let mut version_datadict: HashMap> = HashMap::new(); + let mut version_datadict: HashMap> = HashMap::new(); let mut data_point = Self::divide_by_image(&train)?; let mut threshold = 0.5; @@ -768,7 +785,7 @@ impl Algorithm { #[derive(Debug)] struct DataPoint { image_name: String, - chunk_list: Vec, + chunk_list: Vec, visited: bool, clustered: bool, cluster_id: i32, @@ -813,6 +830,62 @@ impl ChunkTable { conn: Arc::new(Mutex::new(conn)), }) } + + /// select all data filtered by blob ID. + fn list_all_by_blob_id(&self, blob_id: &str) -> Result, DatabaseError> { + let mut offset = 0; + let limit: i64 = 100; + let mut all_chunks_by_blob_id = Vec::new(); + + loop { + let chunks = self.list_paged_by_blob_id(blob_id, offset, limit)?; + if chunks.is_empty() { + break; + } + + all_chunks_by_blob_id.extend(chunks); + offset += limit; + } + + Ok(all_chunks_by_blob_id) + } + + /// select data with offset and limit filtered by blob ID. 
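+    /// A single page holds at most `limit` rows; `list_all_by_blob_id` above calls this
+    /// repeatedly, advancing `offset` by `limit` until an empty page is returned, so the
+    /// whole chunk table never has to be loaded by one query.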
+ fn list_paged_by_blob_id( + &self, + blob_id: &str, + offset: i64, + limit: i64, + ) -> Result, DatabaseError> { + let conn_guard = self + .conn + .lock() + .map_err(|e| DatabaseError::PoisonError(e.to_string()))?; + let mut stmt: rusqlite::Statement<'_> = conn_guard + .prepare( + "SELECT id, image_name, version_name, chunk_blob_id, chunk_digest, chunk_compressed_size, + chunk_uncompressed_size, chunk_compressed_offset, chunk_uncompressed_offset from chunk + WHERE chunk_blob_id = ?1 + ORDER BY id LIMIT ?2 OFFSET ?3", + )?; + let chunk_iterator = stmt.query_map(params![blob_id, limit, offset], |row| { + Ok(ChunkdictChunkInfo { + image_name: row.get(1)?, + version_name: row.get(2)?, + chunk_blob_id: row.get(3)?, + chunk_digest: row.get(4)?, + chunk_compressed_size: row.get(5)?, + chunk_uncompressed_size: row.get(6)?, + chunk_compressed_offset: row.get(7)?, + chunk_uncompressed_offset: row.get(8)?, + }) + })?; + let mut chunks = Vec::new(); + for chunk in chunk_iterator { + chunks.push(chunk.map_err(DatabaseError::SqliteError)?); + } + Ok(chunks) + } } #[derive(Debug, Clone)] @@ -878,19 +951,7 @@ impl PartialEq for CustomString { impl Eq for CustomString {} -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub struct Chunk { - image_name: String, - version_name: String, - chunk_blob_id: String, - chunk_digest: String, - chunk_compressed_size: u32, - chunk_uncompressed_size: u32, - chunk_compressed_offset: u64, - chunk_uncompressed_offset: u64, -} - -impl Table for ChunkTable { +impl Table for ChunkTable { fn clear(&self) -> Result<(), DatabaseError> { self.conn .lock() @@ -922,7 +983,7 @@ impl Table for ChunkTable { Ok(()) } - fn insert(&self, chunk: &Chunk) -> Result<(), DatabaseError> { + fn insert(&self, chunk: &ChunkdictChunkInfo) -> Result<(), DatabaseError> { self.conn .lock() .map_err(|e| DatabaseError::PoisonError(e.to_string()))? 
@@ -954,7 +1015,7 @@ impl Table for ChunkTable { Ok(()) } - fn list_all(&self) -> Result, DatabaseError> { + fn list_all(&self) -> Result, DatabaseError> { let mut offset = 0; let limit: i64 = 100; let mut all_chunks = Vec::new(); @@ -972,7 +1033,11 @@ impl Table for ChunkTable { Ok(all_chunks) } - fn list_paged(&self, offset: i64, limit: i64) -> Result, DatabaseError> { + fn list_paged( + &self, + offset: i64, + limit: i64, + ) -> Result, DatabaseError> { let conn_guard = self .conn .lock() @@ -984,7 +1049,7 @@ impl Table for ChunkTable { ORDER BY id LIMIT ?1 OFFSET ?2", )?; let chunk_iterator = stmt.query_map(params![limit, offset], |row| { - Ok(Chunk { + Ok(ChunkdictChunkInfo { image_name: row.get(1)?, version_name: row.get(2)?, chunk_blob_id: row.get(3)?, @@ -1175,7 +1240,7 @@ mod tests { fn test_chunk_table() -> Result<(), Box> { let chunk_table = ChunkTable::new_in_memory()?; chunk_table.create()?; - let chunk = Chunk { + let chunk = ChunkdictChunkInfo { image_name: "REDIS".to_string(), version_name: "1.0.0".to_string(), chunk_blob_id: "BLOB123".to_string(), @@ -1186,10 +1251,21 @@ mod tests { chunk_uncompressed_offset: 0, }; chunk_table.insert(&chunk)?; + let chunk2 = ChunkdictChunkInfo { + image_name: "REDIS".to_string(), + version_name: "1.0.0".to_string(), + chunk_blob_id: "BLOB456".to_string(), + chunk_digest: "DIGEST123".to_string(), + chunk_compressed_size: 512, + chunk_uncompressed_size: 1024, + chunk_compressed_offset: 0, + chunk_uncompressed_offset: 0, + }; + chunk_table.insert(&chunk2)?; let chunks = chunk_table.list_all()?; assert_eq!(chunks[0].image_name, chunk.image_name); assert_eq!(chunks[0].version_name, chunk.version_name); - assert_eq!(chunks.len(), 1); + assert_eq!(chunks.len(), 2); assert_eq!(chunks[0].chunk_blob_id, chunk.chunk_blob_id); assert_eq!(chunks[0].chunk_digest, chunk.chunk_digest); assert_eq!(chunks[0].chunk_compressed_size, chunk.chunk_compressed_size); @@ -1205,6 +1281,11 @@ mod tests { chunks[0].chunk_uncompressed_offset, chunk.chunk_uncompressed_offset ); + + let chunks = chunk_table.list_all_by_blob_id(&chunk.chunk_blob_id)?; + assert_eq!(chunks[0].chunk_blob_id, chunk.chunk_blob_id); + assert_eq!(chunks.len(), 1); + Ok(()) } @@ -1234,7 +1315,7 @@ mod tests { chunk_table.create()?; for i in 0..200 { let i64 = i as u64; - let chunk = Chunk { + let chunk = ChunkdictChunkInfo { image_name: format!("REDIS{}", i), version_name: format!("1.0.0{}", i), chunk_blob_id: format!("BLOB{}", i), @@ -1262,10 +1343,10 @@ mod tests { #[test] fn test_algorithm_exponential_smoothing() -> Result<(), Box> { let threshold = 0.1; - let mut all_chunk: Vec = Vec::new(); + let mut all_chunk: Vec = Vec::new(); for i in 0..199 { let i64 = i as u64; - let chunk = Chunk { + let chunk = ChunkdictChunkInfo { image_name: format!("REDIS{}", 0), version_name: format!("1.0.0{}", (i + 1) / 100), chunk_blob_id: format!("BLOB{}", i), @@ -1297,7 +1378,7 @@ mod tests { chunk_table.create()?; for i in 0..200 { let i64 = i as u64; - let chunk = Chunk { + let chunk = ChunkdictChunkInfo { image_name: format!("REDIS{}", i / 50), version_name: format!("1.0.0{}", (i + 1) / 100), chunk_blob_id: format!("BLOB{}", i), @@ -1323,10 +1404,10 @@ mod tests { #[test] fn test_distance() -> Result<(), Box> { - let mut all_chunks1: Vec = Vec::new(); + let mut all_chunks1: Vec = Vec::new(); for i in 0..200 { let i64 = i as u64; - let chunk = Chunk { + let chunk = ChunkdictChunkInfo { image_name: format!("REDIS{}", 0), version_name: format!("1.0.0{}", (i + 1) / 100), chunk_blob_id: format!("BLOB{}", i), @@ 
-1338,10 +1419,10 @@ mod tests { }; all_chunks1.push(chunk); } - let mut all_chunks2: Vec = Vec::new(); + let mut all_chunks2: Vec = Vec::new(); for i in 0..200 { let i64 = i as u64; - let chunk = Chunk { + let chunk = ChunkdictChunkInfo { image_name: format!("REDIS{}", 1), version_name: format!("1.0.0{}", (i + 1) / 100), chunk_blob_id: format!("BLOB{}", i), @@ -1366,10 +1447,10 @@ mod tests { #[test] fn test_divide_set() -> Result<(), Box> { - let mut all_chunks: Vec = Vec::new(); + let mut all_chunks: Vec = Vec::new(); for i in 0..200 { for j in 0..100 { - let chunk = Chunk { + let chunk = ChunkdictChunkInfo { image_name: format!("REDIS{}", i), version_name: format!("1.0.0{}", j / 10), chunk_blob_id: format!("BLOB{}", j), @@ -1395,11 +1476,11 @@ mod tests { #[test] fn test_dbscan() -> Result<(), Box> { - let mut all_chunks: Vec = Vec::new(); + let mut all_chunks: Vec = Vec::new(); let radius = 0.6; for i in 0..200 { for j in 0..100 { - let chunk = Chunk { + let chunk = ChunkdictChunkInfo { image_name: format!("REDIS{}", i), version_name: format!("1.0.0{}", j / 10), chunk_blob_id: format!("BLOB{}", j), @@ -1430,11 +1511,11 @@ mod tests { #[test] fn test_aggregate_chunk() -> Result<(), Box> { - let mut all_chunks: Vec = Vec::new(); + let mut all_chunks: Vec = Vec::new(); let radius = 0.6; for i in 0..200 { for j in 0..100 { - let chunk = Chunk { + let chunk = ChunkdictChunkInfo { image_name: format!("REDIS{}", i), version_name: format!("1.0.0{}", (j + 1) / 100), chunk_blob_id: format!("BLOB{}", j), @@ -1457,10 +1538,10 @@ mod tests { #[test] fn test_deduplicate_image() -> Result<(), Box> { - let mut all_chunks: Vec = Vec::new(); + let mut all_chunks: Vec = Vec::new(); for i in 0..200 { for j in 0..100 { - let chunk = Chunk { + let chunk = ChunkdictChunkInfo { image_name: format!("REDIS{}", i), version_name: format!("1.0.0{}", j / 10), chunk_blob_id: format!("BLOB{}", j), @@ -1492,11 +1573,11 @@ mod tests { #[test] fn test_deduplicate_version() -> Result<(), Box> { - let mut all_chunks: Vec = Vec::new(); - let mut chunkdict: Vec = Vec::new(); + let mut all_chunks: Vec = Vec::new(); + let mut chunkdict: Vec = Vec::new(); for i in 0..200 { let i64 = i as u64; - let chunk = Chunk { + let chunk = ChunkdictChunkInfo { image_name: format!("REDIS{}", 0), version_name: format!("1.0.0{}", (i + 1) / 20), chunk_blob_id: format!("BLOB{}", i), diff --git a/src/bin/nydus-image/main.rs b/src/bin/nydus-image/main.rs index 9ac94fd86d8..09d789bf874 100644 --- a/src/bin/nydus-image/main.rs +++ b/src/bin/nydus-image/main.rs @@ -28,9 +28,9 @@ use nydus::{get_build_time_info, setup_logging}; use nydus_api::{BuildTimeInfo, ConfigV2, LocalFsConfig}; use nydus_builder::{ parse_chunk_dict_arg, ArtifactStorage, BlobCacheGenerator, BlobCompactor, BlobManager, - BootstrapManager, BuildContext, BuildOutput, Builder, ConversionType, DirectoryBuilder, - Feature, Features, HashChunkDict, Merger, Prefetch, PrefetchPolicy, StargzBuilder, - TarballBuilder, WhiteoutSpec, + BootstrapManager, BuildContext, BuildOutput, Builder, ChunkdictChunkInfo, ConversionType, + DirectoryBuilder, Feature, Features, Generater, HashChunkDict, Merger, Prefetch, + PrefetchPolicy, StargzBuilder, TarballBuilder, WhiteoutSpec, }; use nydus_rafs::metadata::{MergeError, RafsSuper, RafsSuperConfig, RafsVersion}; use nydus_storage::backend::localfs::LocalFs; @@ -416,8 +416,75 @@ fn prepare_cmd_args(bti_string: &'static str) -> App { .arg( Arg::new("database") .long("database") - .help("Database connection address for assisting chunk dictionary 
generation, e.g. sqlite:///path/chunkdict.db") - .required(true), + .help("Database connection address for assisting chunk dictionary generation, e.g. /path/database.db") + .default_value("sqlite:///home/runner/output/database.db") + .required(false), + ) + .arg( + Arg::new("parent-bootstrap") + .long("parent-bootstrap") + .help("File path of the parent/referenced RAFS metadata blob (optional)") + .required(false), + ) + .arg( + Arg::new("bootstrap") + .long("bootstrap") + .short('B') + .help("Output path of nydus overlaid bootstrap"), + ) + .arg( + Arg::new("blob-dir") + .long("blob-dir") + .short('D') + .help("Directory path to save generated RAFS metadata and data blobs"), + ) + .arg(arg_chunk_dict.clone()) + .arg(arg_prefetch_policy.clone()) + .arg(arg_output_json.clone()) + .arg( + Arg::new("blob-digests") + .long("blob-digests") + .required(false) + .help("RAFS blob digest list separated by comma"), + ) + .arg( + Arg::new("original-blob-ids") + .long("original-blob-ids") + .required(false) + .help("original blob id list separated by comma, it may usually be a sha256 hex string"), + ) + .arg( + Arg::new("blob-sizes") + .long("blob-sizes") + .required(false) + .help("RAFS blob size list separated by comma"), + ) + .arg( + Arg::new("blob-toc-digests") + .long("blob-toc-digests") + .required(false) + .help("RAFS blob toc digest list separated by comma"), + ) + .arg( + Arg::new("blob-toc-sizes") + .long("blob-toc-sizes") + .required(false) + .help("RAFS blob toc size list separated by comma"), + ) + .arg(arg_config.clone()) + .arg( + Arg::new("SOURCE") + .help("bootstrap paths (allow one or more)") + .required(true) + .num_args(1..), + ) + .arg( + Arg::new("digester") + .long("digester") + .help("Algorithm to digest data chunks:") + .required(false) + .default_value("blake3") + .value_parser(["blake3", "sha256"]), ) .arg( Arg::new("verbose") @@ -427,6 +494,12 @@ fn prepare_cmd_args(bti_string: &'static str) -> App { .action(ArgAction::SetTrue) .required(false), ) + .arg( + Arg::new("features") + .long("features") + .value_parser(["blob-toc"]) + .help("Enable/disable features") + ) ) ); @@ -782,9 +855,10 @@ fn main() -> Result<()> { } else if let Some(matches) = cmd.subcommand_matches("chunkdict") { match matches.subcommand_name() { Some("save") => Command::chunkdict_save(matches.subcommand_matches("save").unwrap()), - Some("generate") => { - Command::chunkdict_generate(matches.subcommand_matches("generate").unwrap()) - } + Some("generate") => Command::chunkdict_generate( + matches.subcommand_matches("generate").unwrap(), + &build_info, + ), _ => { println!("{}", usage); Ok(()) @@ -1251,36 +1325,136 @@ impl Command { Ok(()) } - fn chunkdict_generate(matches: &ArgMatches) -> Result<()> { - // Connecting database and Generating chunk dictionary by algorithm "exponential_smoothing" + fn chunkdict_generate(matches: &ArgMatches, build_info: &BuildTimeInfo) -> Result<()> { let db_url: &String = matches.get_one::("database").unwrap(); - debug!("db_url: {}", db_url); + // save chunk and blob info to database + let source_bootstrap_paths: Vec = matches + .get_many::("SOURCE") + .map(|paths| paths.map(PathBuf::from).collect()) + .unwrap(); + for (_, bootstrap_path) in source_bootstrap_paths.iter().enumerate() { + let path = bootstrap_path.display().to_string(); + info!("Bootstrap path is {}", path); + let path_name: Vec<&str> = path.split('/').collect(); + + // Extract the image name and version name from the bootstrap directory + let bootstrap_dir = match path_name.get(path_name.len() - 2) { + 
Some(&bootstrap_dir) => bootstrap_dir.to_string(), + None => bail!("Invalid Bootstrap directory name"), + }; + let full_image_name: Vec<&str> = bootstrap_dir.split(':').collect(); + let image_name = match full_image_name.get(full_image_name.len() - 2) { + Some(&second_last) => second_last.to_string(), + None => bail!("Invalid image name"), + }; + let version_name = match full_image_name.last() { + Some(&last) => last.to_string(), + None => bail!("Invalid version name"), + }; + // For backward compatibility with v2.1. + let config = Self::get_configuration(matches)?; + config + .internal + .set_blob_accessible(matches.get_one::("bootstrap").is_none()); + let db_strs: Vec<&str> = db_url.split("://").collect(); + if db_strs.len() != 2 || (!db_strs[1].starts_with('/') && !db_strs[1].starts_with(':')) + { + bail!("Invalid database URL: {}", db_url); + } + match db_strs[0] { + "sqlite" => { + let mut deduplicate: Deduplicate = + Deduplicate::::new(db_strs[1])?; + deduplicate.save_metadata(bootstrap_path, config, image_name, version_name)? + } + _ => { + bail!("Unsupported database type: {}, please use a valid database URI, such as 'sqlite:///path/to/chunkdict.db'.", db_strs[0]) + } + }; + } + info!("Chunkdict metadata is saved at: {:?}", db_url); + + // Connecting database and Generating chunk dictionary by algorithm "exponential_smoothing" let db_strs: Vec<&str> = db_url.split("://").collect(); if db_strs.len() != 2 || (!db_strs[1].starts_with('/') && !db_strs[1].starts_with(':')) { bail!("Invalid database URL: {}", db_url); } let algorithm = String::from("exponential_smoothing"); + let _source_bootstrap_paths: Vec = matches + .get_many::("SOURCE") + .map(|paths| paths.map(PathBuf::from).collect()) + .unwrap(); + + let (chunkdict, noise_points): (Vec, Vec); match db_strs[0] { "sqlite" => { let mut algorithm: deduplicate::Algorithm = deduplicate::Algorithm::::new(algorithm, db_strs[1])?; - let (chunkdict, noise_points) = algorithm.chunkdict_generate()?; - info!( - "The length of chunkdict is {}", - Vec::::len(&chunkdict) - ); - info!("It is not recommended to use image deduplication"); - for image_name in noise_points { - info!("{}", image_name); - } + let result = algorithm.chunkdict_generate()?; + chunkdict = result.0; + noise_points = result.1; } _ => { bail!("Unsupported database type: {}, please use a valid database URI, such as 'sqlite:///path/to/chunkdict.db'.", db_strs[0]) } }; - // To be continued, dump chunk of "chunk dictionary" ... 
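+        // Report how many chunks ended up in the dictionary and list the images that the
+        // DBSCAN pass treated as noise points (deduplication is not recommended for them).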
+ info!( + "The length of chunkdict is {}", + Vec::::len(&chunkdict) + ); + info!("It is not recommended to use image deduplication"); + for image_name in noise_points { + info!("{}", image_name); + } + + // dump chunkdict to bootstrap + let features = Features::try_from( + matches + .get_one::("features") + .map(|s| s.as_str()) + .unwrap_or_default(), + )?; + let chunkdict_bootstrap_path = Self::get_bootstrap_storage(matches)?; + let config = + Self::get_configuration(matches).context("failed to get configuration information")?; + config + .internal + .set_blob_accessible(matches.get_one::("config").is_some()); + let mut build_ctx = BuildContext { + prefetch: Self::get_prefetch(matches)?, + ..Default::default() + }; + build_ctx.configuration = config; + build_ctx.blob_storage = Some(chunkdict_bootstrap_path); + build_ctx.blob_features = BlobFeatures::CAP_TAR_TOC; + build_ctx.blob_features.insert(BlobFeatures::ALIGNED); + // build_ctx.blob_features.insert(BlobFeatures::CHUNK_INFO_V2); + // build_ctx.blob_features.insert(BlobFeatures::ENCRYPTED); + build_ctx.features = features; + + let digester = matches + .get_one::("digester") + .map(|s| s.as_str()) + .unwrap_or_default() + .parse()?; + let mut blob_mgr = BlobManager::new(digester); + + let bootstrap_path = Self::get_bootstrap_storage(matches)?; + let mut bootstrap_mgr = BootstrapManager::new(Some(bootstrap_path), None); + + let output = + Generater::generate(&mut build_ctx, &mut bootstrap_mgr, &mut blob_mgr, chunkdict)?; + OutputSerializer::dump(matches, output, build_info).unwrap(); + info!( + "Chunkdict metadata is saved at: {:?}", + matches + .get_one::("bootstrap") + .map(|s| s.as_str()) + .unwrap_or_default(), + ); + Ok(()) } diff --git a/utils/src/digest.rs b/utils/src/digest.rs index 99d28d0ffed..510327e23e5 100644 --- a/utils/src/digest.rs +++ b/utils/src/digest.rs @@ -176,6 +176,18 @@ impl RafsDigest { } } + /// According to the format of sha256. 
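+    /// Decodes a hex-encoded digest string (two characters per byte, e.g. a 64-character
+    /// sha256 hex string) into a `RafsDigest`. Note that malformed or over-long input will
+    /// panic on the `unwrap()` calls below.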
+ pub fn from_string(input: &str) -> Self { + let mut digest = RafsDigest::default(); + + for (i, byte) in input.as_bytes().chunks(2).enumerate() { + let hex_str = std::str::from_utf8(byte).unwrap(); + digest.data[i] = u8::from_str_radix(hex_str, 16).unwrap(); + } + + digest + } + pub fn hasher(algorithm: Algorithm) -> RafsDigestHasher { match algorithm { Algorithm::Blake3 => RafsDigestHasher::Blake3(Box::new(blake3::Hasher::new())), From b153bf6f4129dbd7ae60557d8013172924f276d9 Mon Sep 17 00:00:00 2001 From: Zhao Yuan <1627990440@qq.com> Date: Mon, 20 Nov 2023 15:28:48 +0000 Subject: [PATCH 03/11] Add function that nydusify invoke "nydus-image chunkdict generate" and add smoke test Signed-off-by: Zhao Yuan <1627990440@qq.com> --- contrib/nydusify/pkg/build/builder.go | 21 +++++++++ .../pkg/chunkdict/generator/generator.go | 38 +++++++++++++--- smoke/tests/image_test.go | 44 +++++++++++++++++++ 3 files changed, 96 insertions(+), 7 deletions(-) diff --git a/contrib/nydusify/pkg/build/builder.go b/contrib/nydusify/pkg/build/builder.go index 564f57454ae..4c8ec68e13f 100644 --- a/contrib/nydusify/pkg/build/builder.go +++ b/contrib/nydusify/pkg/build/builder.go @@ -43,6 +43,11 @@ type CompactOption struct { type SaveOption struct { BootstrapPath string + DatabasePath string +} + +type GenerateOption struct { + DatabasePath string } type Builder struct { @@ -157,6 +162,22 @@ func (builder *Builder) Save(option SaveOption) error { "warn", "--bootstrap", option.BootstrapPath, + "--database", + option.DatabasePath, + } + return builder.run(args, "") +} + +// Generate calls `nydus-image chunkdict generate` to get chunkdict +func (builder *Builder) Generate(option GenerateOption) error { + logrus.Infof("Invoking 'nydus-image chunkdict generate' subcommand") + args := []string{ + "chunkdict", + "generate", + "--log-level", + "warn", + "--database", + option.DatabasePath, } return builder.run(args, "") } diff --git a/contrib/nydusify/pkg/chunkdict/generator/generator.go b/contrib/nydusify/pkg/chunkdict/generator/generator.go index c9472c459cc..dd059b6ce4e 100644 --- a/contrib/nydusify/pkg/chunkdict/generator/generator.go +++ b/contrib/nydusify/pkg/chunkdict/generator/generator.go @@ -69,11 +69,15 @@ func (generator *Generator) Generate(ctx context.Context) error { } } } + if err := generator.deduplicating(ctx); err != nil { + return err + } return nil } // "save" stores information of chunk and blob of a Nydus Image in the database func (generator *Generator) save(ctx context.Context, index int) error { + currentDir, _ := os.Getwd() sourceParsed, err := generator.sourcesParser[index].Parse(ctx) if err != nil { return errors.Wrap(err, "parse Nydus image") @@ -81,26 +85,46 @@ func (generator *Generator) save(ctx context.Context, index int) error { // Create a directory to store the image bootstrap nydusImageName := strings.Replace(generator.Sources[index], "/", ":", -1) - folderPath := filepath.Join(generator.WorkDir, nydusImageName) - if err := os.MkdirAll(folderPath, fs.ModePerm); err != nil { + bootstrapFolderPath := filepath.Join(currentDir, generator.WorkDir, nydusImageName) + if err := os.MkdirAll(bootstrapFolderPath, fs.ModePerm); err != nil { return errors.Wrap(err, "creat work directory") } - if err := generator.Output(ctx, sourceParsed, folderPath, index); err != nil { + if err := generator.Output(ctx, sourceParsed, bootstrapFolderPath, index); err != nil { return errors.Wrap(err, "output image information") } + databaseName := "chunkdict.db" + databaseType := "sqlite" + DatabasePath := 
databaseType + "://" + filepath.Join(currentDir, generator.WorkDir, databaseName) + // Invoke "nydus-image save" command builder := build.NewBuilder(generator.NydusImagePath) if err := builder.Save(build.SaveOption{ - BootstrapPath: filepath.Join(folderPath, "nydus_bootstrap"), + BootstrapPath: filepath.Join(bootstrapFolderPath, "nydus_bootstrap"), + DatabasePath: DatabasePath, }); err != nil { return errors.Wrap(err, "invalid nydus bootstrap format") } - logrus.Infof("Save chunk information from image %s", generator.sourcesParser[index].Remote.Ref) + logrus.Infof("Saving chunk information from image %s", generator.sourcesParser[index].Remote.Ref) + + // if err := os.RemoveAll(folderPath); err != nil { + // return errors.Wrap(err, "remove work directory") + // } + return nil +} - if err := os.RemoveAll(folderPath); err != nil { - return errors.Wrap(err, "remove work directory") +func (generator *Generator) deduplicating(ctx context.Context) error { + builder := build.NewBuilder(generator.NydusImagePath) + currentDir, _ := os.Getwd() + + databaseName := "chunkdict.db" + databaseType := "sqlite" + DatabasePath := databaseType + "://" + filepath.Join(currentDir, generator.WorkDir, databaseName) + if err := builder.Generate(build.GenerateOption{ + DatabasePath: DatabasePath, + }); err != nil { + return errors.Wrap(err, "invalid nydus bootstrap format") } return nil } diff --git a/smoke/tests/image_test.go b/smoke/tests/image_test.go index d8f06bc41ef..75f9cd12be2 100644 --- a/smoke/tests/image_test.go +++ b/smoke/tests/image_test.go @@ -140,6 +140,50 @@ func (i *ImageTestSuite) TestConvertAndCopyImage(t *testing.T, ctx tool.Context, tool.RunWithoutOutput(t, checkCmd) } +func (i *ImageTestSuite) TestGenerate() test.Generator { + + scenarios := tool.DescartesIterator{} + scenarios.Dimension(paramImage, []interface{}{"nginx:latest"}) + + return func() (name string, testCase test.Case) { + if !scenarios.HasNext() { + return + } + scenario := scenarios.Next() + ctx := tool.DefaultContext(i.T) + + image := i.prepareImage(i.T, scenario.GetString(paramImage)) + return scenario.Str(), func(t *testing.T) { + i.TestGenerateChunkDict(t, *ctx, image) + } + } +} + +func (i *ImageTestSuite) TestGenerateChunkDict(t *testing.T, ctx tool.Context, source string) { + + // Prepare work directory + ctx.PrepareWorkDir(t) + defer ctx.Destroy(t) + + target := fmt.Sprintf("%s-nydus-%s", source, uuid.NewString()) + logLevel := "--log-level warn" + + // Convert image + convertCmd := fmt.Sprintf( + "%s %s convert --source %s --target %s --nydus-image %s --work-dir %s", + ctx.Binary.Nydusify, logLevel, source, target, ctx.Binary.Builder, ctx.Env.WorkDir, + ) + tool.RunWithoutOutput(t, convertCmd) + + nydusifyPath := ctx.Binary.Nydusify + generateCmd := fmt.Sprintf( + "%s %s chunkdict generate --sources %s --nydus-image %s --work-dir %s", + nydusifyPath, logLevel, target, ctx.Binary.Builder, ctx.Env.WorkDir, + ) + tool.RunWithoutOutput(t, generateCmd) + +} + func (i *ImageTestSuite) prepareImage(t *testing.T, image string) string { if i.preparedImages == nil { i.preparedImages = make(map[string]string) From d104c1263618c9b0c24f1d370bb309442be4b1b6 Mon Sep 17 00:00:00 2001 From: Zhao Yuan <1627990440@qq.com> Date: Fri, 8 Dec 2023 05:21:32 +0000 Subject: [PATCH 04/11] Merge invoked subcomand in nydusify and add smoke test Signed-off-by: Zhao Yuan <1627990440@qq.com> --- contrib/nydusify/pkg/build/builder.go | 33 ++--- .../pkg/chunkdict/generator/generator.go | 96 ++++++------- smoke/tests/image_test.go | 86 ++++++------ 
smoke/tests/tool/context.go | 2 + src/bin/nydus-image/deduplicate.rs | 6 +- src/bin/nydus-image/main.rs | 130 +----------------- 6 files changed, 112 insertions(+), 241 deletions(-) diff --git a/contrib/nydusify/pkg/build/builder.go b/contrib/nydusify/pkg/build/builder.go index 4c8ec68e13f..177c0b9a209 100644 --- a/contrib/nydusify/pkg/build/builder.go +++ b/contrib/nydusify/pkg/build/builder.go @@ -41,13 +41,11 @@ type CompactOption struct { CompactConfigPath string } -type SaveOption struct { - BootstrapPath string - DatabasePath string -} - type GenerateOption struct { - DatabasePath string + BootstrapPaths []string + DatabasePath string + ChunkdictBootstrapPath string + OutputPath string } type Builder struct { @@ -153,31 +151,22 @@ func (builder *Builder) Run(option BuilderOption) error { return builder.run(args, option.PrefetchPatterns) } -// Save calls `nydus-image chunkdict save` to parse Nydus bootstrap -func (builder *Builder) Save(option SaveOption) error { - args := []string{ - "chunkdict", - "save", - "--log-level", - "warn", - "--bootstrap", - option.BootstrapPath, - "--database", - option.DatabasePath, - } - return builder.run(args, "") -} - // Generate calls `nydus-image chunkdict generate` to get chunkdict func (builder *Builder) Generate(option GenerateOption) error { - logrus.Infof("Invoking 'nydus-image chunkdict generate' subcommand") + logrus.Infof("Invoking 'nydus-image chunkdict generate' command") args := []string{ "chunkdict", "generate", "--log-level", "warn", + "--bootstrap", + option.ChunkdictBootstrapPath, "--database", option.DatabasePath, + "--output-json", + option.OutputPath, } + args = append(args, option.BootstrapPaths...) + return builder.run(args, "") } diff --git a/contrib/nydusify/pkg/chunkdict/generator/generator.go b/contrib/nydusify/pkg/chunkdict/generator/generator.go index dd059b6ce4e..1de160295f6 100644 --- a/contrib/nydusify/pkg/chunkdict/generator/generator.go +++ b/contrib/nydusify/pkg/chunkdict/generator/generator.go @@ -59,72 +59,72 @@ func New(opt Opt) (*Generator, error) { // Generate saves multiple Nydus bootstraps into the database one by one. 
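+// It pulls the bootstrap of every source image and then generates the chunk dictionary
+// from all of them in a single "nydus-image chunkdict generate" invocation.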
func (generator *Generator) Generate(ctx context.Context) error { - for index := range generator.Sources { - if err := generator.save(ctx, index); err != nil { - if utils.RetryWithHTTP(err) { + var bootstrapPaths []string + bootstrapPaths, err := generator.pull(ctx) + + if err != nil { + if utils.RetryWithHTTP(err) { + for index := range generator.Sources { generator.sourcesParser[index].Remote.MaybeWithHTTP(err) } - if err := generator.save(ctx, index); err != nil { - return err - } + } + bootstrapPaths, err = generator.pull(ctx) + if err != nil { + return err } } - if err := generator.deduplicating(ctx); err != nil { + + if err := generator.generate(ctx, bootstrapPaths); err != nil { return err } return nil } -// "save" stores information of chunk and blob of a Nydus Image in the database -func (generator *Generator) save(ctx context.Context, index int) error { - currentDir, _ := os.Getwd() - sourceParsed, err := generator.sourcesParser[index].Parse(ctx) - if err != nil { - return errors.Wrap(err, "parse Nydus image") - } - - // Create a directory to store the image bootstrap - nydusImageName := strings.Replace(generator.Sources[index], "/", ":", -1) - bootstrapFolderPath := filepath.Join(currentDir, generator.WorkDir, nydusImageName) - if err := os.MkdirAll(bootstrapFolderPath, fs.ModePerm); err != nil { - return errors.Wrap(err, "creat work directory") - } - if err := generator.Output(ctx, sourceParsed, bootstrapFolderPath, index); err != nil { - return errors.Wrap(err, "output image information") - } - - databaseName := "chunkdict.db" - databaseType := "sqlite" - DatabasePath := databaseType + "://" + filepath.Join(currentDir, generator.WorkDir, databaseName) +// Pull the bootstrap of nydus image +func (generator *Generator) pull(ctx context.Context) ([]string, error) { + var bootstrapPaths []string + for index := range generator.Sources { + sourceParsed, err := generator.sourcesParser[index].Parse(ctx) + if err != nil { + return nil, errors.Wrap(err, "parse Nydus image") + } - // Invoke "nydus-image save" command - builder := build.NewBuilder(generator.NydusImagePath) - if err := builder.Save(build.SaveOption{ - BootstrapPath: filepath.Join(bootstrapFolderPath, "nydus_bootstrap"), - DatabasePath: DatabasePath, - }); err != nil { - return errors.Wrap(err, "invalid nydus bootstrap format") + // Create a directory to store the image bootstrap + nydusImageName := strings.Replace(generator.Sources[index], "/", ":", -1) + bootstrapDirPath := filepath.Join(generator.WorkDir, nydusImageName) + if err := os.MkdirAll(bootstrapDirPath, fs.ModePerm); err != nil { + return nil, errors.Wrap(err, "creat work directory") + } + if err := generator.Output(ctx, sourceParsed, bootstrapDirPath, index); err != nil { + return nil, errors.Wrap(err, "output image information") + } + bootstrapPath := filepath.Join(bootstrapDirPath, "nydus_bootstrap") + bootstrapPaths = append(bootstrapPaths, bootstrapPath) } - - logrus.Infof("Saving chunk information from image %s", generator.sourcesParser[index].Remote.Ref) - - // if err := os.RemoveAll(folderPath); err != nil { - // return errors.Wrap(err, "remove work directory") - // } - return nil + return bootstrapPaths, nil } -func (generator *Generator) deduplicating(ctx context.Context) error { - builder := build.NewBuilder(generator.NydusImagePath) +func (generator *Generator) generate(ctx context.Context, bootstrapPaths []string) error { + // Invoke "nydus-image generate" command currentDir, _ := os.Getwd() - - databaseName := "chunkdict.db" + builder := 
build.NewBuilder(generator.NydusImagePath) databaseType := "sqlite" - DatabasePath := databaseType + "://" + filepath.Join(currentDir, generator.WorkDir, databaseName) + var databasePath string + if strings.HasPrefix(generator.WorkDir, "/") { + databasePath = databaseType + "://" + filepath.Join(generator.WorkDir, "database.db") + } else { + databasePath = databaseType + "://" + filepath.Join(currentDir, generator.WorkDir, "database.db") + } if err := builder.Generate(build.GenerateOption{ - DatabasePath: DatabasePath, + BootstrapPaths: bootstrapPaths, + ChunkdictBootstrapPath: filepath.Join(generator.WorkDir, "chunkdict_bootstrap"), + DatabasePath: databasePath, + OutputPath: filepath.Join(generator.WorkDir, "nydus_bootstrap_output.json"), }); err != nil { return errors.Wrap(err, "invalid nydus bootstrap format") } + + logrus.Infof("Successfully generate image chunk dictionary") + return nil } diff --git a/smoke/tests/image_test.go b/smoke/tests/image_test.go index 75f9cd12be2..9665f73c198 100644 --- a/smoke/tests/image_test.go +++ b/smoke/tests/image_test.go @@ -140,48 +140,50 @@ func (i *ImageTestSuite) TestConvertAndCopyImage(t *testing.T, ctx tool.Context, tool.RunWithoutOutput(t, checkCmd) } -func (i *ImageTestSuite) TestGenerate() test.Generator { - - scenarios := tool.DescartesIterator{} - scenarios.Dimension(paramImage, []interface{}{"nginx:latest"}) - - return func() (name string, testCase test.Case) { - if !scenarios.HasNext() { - return - } - scenario := scenarios.Next() - ctx := tool.DefaultContext(i.T) - - image := i.prepareImage(i.T, scenario.GetString(paramImage)) - return scenario.Str(), func(t *testing.T) { - i.TestGenerateChunkDict(t, *ctx, image) - } - } -} - -func (i *ImageTestSuite) TestGenerateChunkDict(t *testing.T, ctx tool.Context, source string) { - - // Prepare work directory - ctx.PrepareWorkDir(t) - defer ctx.Destroy(t) - - target := fmt.Sprintf("%s-nydus-%s", source, uuid.NewString()) - logLevel := "--log-level warn" - - // Convert image - convertCmd := fmt.Sprintf( - "%s %s convert --source %s --target %s --nydus-image %s --work-dir %s", - ctx.Binary.Nydusify, logLevel, source, target, ctx.Binary.Builder, ctx.Env.WorkDir, - ) - tool.RunWithoutOutput(t, convertCmd) - - nydusifyPath := ctx.Binary.Nydusify - generateCmd := fmt.Sprintf( - "%s %s chunkdict generate --sources %s --nydus-image %s --work-dir %s", - nydusifyPath, logLevel, target, ctx.Binary.Builder, ctx.Env.WorkDir, - ) - tool.RunWithoutOutput(t, generateCmd) - +func (i *ImageTestSuite) TestGenerateChunkdict() test.Generator { + return func() (name string, testCase test.Case) { + imagename1 := "redis:7.0.1" + imagename2 := "redis:7.0.2" + imagename3 := "redis:7.0.3" + image1 := i.prepareImage(i.T, imagename1) + image2 := i.prepareImage(i.T, imagename2) + image3 := i.prepareImage(i.T, imagename3) + ctx := tool.DefaultContext(i.T) + + // Prepare work directory + ctx.PrepareWorkDir(i.T) + defer ctx.Destroy(i.T) + + logLevel := "--log-level warn" + nydusifyPath := ctx.Binary.Nydusify + + target1 := fmt.Sprintf("%s-nydus-%s", image1, uuid.NewString()) + target2 := fmt.Sprintf("%s-nydus-%s", image2, uuid.NewString()) + target3 := fmt.Sprintf("%s-nydus-%s", image3, uuid.NewString()) + convertCmd1 := fmt.Sprintf( + "%s %s convert --source %s --target %s --nydus-image %s --work-dir %s", + ctx.Binary.Nydusify, logLevel, image1, target1, ctx.Binary.Builder, ctx.Env.TempDir, + ) + tool.RunWithoutOutput(i.T, convertCmd1) + convertCmd2 := fmt.Sprintf( + "%s %s convert --source %s --target %s --nydus-image %s 
--work-dir %s", + ctx.Binary.Nydusify, logLevel, image1, target2, ctx.Binary.Builder, ctx.Env.TempDir, + ) + tool.RunWithoutOutput(i.T, convertCmd2) + convertCmd3 := fmt.Sprintf( + "%s %s convert --source %s --target %s --nydus-image %s --work-dir %s", + ctx.Binary.Nydusify, logLevel, image1, target3, ctx.Binary.Builder, ctx.Env.TempDir, + ) + tool.RunWithoutOutput(i.T, convertCmd3) + target := fmt.Sprintf("%s,%s,%s", target1, target2, target3) + + generateCmd := fmt.Sprintf( + "%s %s chunkdict generate --sources %s --nydus-image %s --work-dir %s", + nydusifyPath, logLevel, target, ctx.Binary.Builder, ctx.Env.TempDir, + ) + tool.RunWithoutOutput(i.T, generateCmd) + return "generateChunkdict", nil + } } func (i *ImageTestSuite) prepareImage(t *testing.T, image string) string { diff --git a/smoke/tests/tool/context.go b/smoke/tests/tool/context.go index 2d74ac02e97..174b913ef42 100644 --- a/smoke/tests/tool/context.go +++ b/smoke/tests/tool/context.go @@ -42,6 +42,7 @@ type RuntimeContext struct { } type EnvContext struct { + TempDir string WorkDir string BlobDir string CacheDir string @@ -99,6 +100,7 @@ func (ctx *Context) PrepareWorkDir(t *testing.T) { require.NoError(t, err) ctx.Env = EnvContext{ + TempDir: tempDir, WorkDir: workDir, BlobDir: blobDir, CacheDir: cacheDir, diff --git a/src/bin/nydus-image/deduplicate.rs b/src/bin/nydus-image/deduplicate.rs index a3cbd0f9222..4b013daba5e 100644 --- a/src/bin/nydus-image/deduplicate.rs +++ b/src/bin/nydus-image/deduplicate.rs @@ -229,9 +229,9 @@ pub struct Algorithm { } // Generate deduplicated chunkdict by exponential_smoothing algorithm -type Versiondic = HashMap>; +type VersionMap = HashMap>; // Generate deduplicated chunkdict by cluster algorithm -type Imagedic = Vec, Vec>>; +type ImageMap = Vec, Vec>>; impl Algorithm { pub fn new(algorithm: String, db_url: &str) -> anyhow::Result { @@ -679,7 +679,7 @@ impl Algorithm { pub fn deduplicate_version( all_chunks: &[ChunkdictChunkInfo], - ) -> anyhow::Result<(Versiondic, Imagedic)> { + ) -> anyhow::Result<(VersionMap, ImageMap)> { let mut all_chunks_size = 0; for i in all_chunks { all_chunks_size += i.chunk_compressed_size; diff --git a/src/bin/nydus-image/main.rs b/src/bin/nydus-image/main.rs index 09d789bf874..d1cb8d47eb9 100644 --- a/src/bin/nydus-image/main.rs +++ b/src/bin/nydus-image/main.rs @@ -373,43 +373,6 @@ fn prepare_cmd_args(bti_string: &'static str) -> App { let app = app.subcommand( App::new("chunkdict") .about("deduplicate RAFS filesystem metadata") - .subcommand( - App::new("save") - .about("Save chunk info to a database") - .arg( - Arg::new("bootstrap") - .short('B') - .long("bootstrap") - .help("File path of RAFS meta blob/bootstrap, e.g. /path/output/localhost:5000:redis:nydus_7.0.1/nydus_bootstrap") - .required(false), - ) - .arg( - Arg::new("database") - .long("database") - .help("Database connection URI for assisting chunk dict generation, e.g. 
sqlite:///path/chunkdict.db") - .default_value("sqlite:///home/runner/work/image-service/chunkdict/image-service/contrib/nydusify/chunkdict.db") - .required(false), - ) - .arg( - Arg::new("blob-dir") - .long("blob-dir") - .short('D') - .conflicts_with("config") - .help( - "Directory for localfs storage backend, hosting data blobs and cache files", - ), - ) - .arg(arg_config.clone()) - .arg( - Arg::new("verbose") - .long("verbose") - .short('v') - .help("Output message in verbose mode") - .action(ArgAction::SetTrue) - .required(false), - ) - .arg(arg_output_json.clone()) - ) .subcommand( App::new("generate") .about("generate chunk dictionary based on database") @@ -420,12 +383,6 @@ fn prepare_cmd_args(bti_string: &'static str) -> App { .default_value("sqlite:///home/runner/output/database.db") .required(false), ) - .arg( - Arg::new("parent-bootstrap") - .long("parent-bootstrap") - .help("File path of the parent/referenced RAFS metadata blob (optional)") - .required(false), - ) .arg( Arg::new("bootstrap") .long("bootstrap") @@ -438,39 +395,8 @@ fn prepare_cmd_args(bti_string: &'static str) -> App { .short('D') .help("Directory path to save generated RAFS metadata and data blobs"), ) - .arg(arg_chunk_dict.clone()) .arg(arg_prefetch_policy.clone()) .arg(arg_output_json.clone()) - .arg( - Arg::new("blob-digests") - .long("blob-digests") - .required(false) - .help("RAFS blob digest list separated by comma"), - ) - .arg( - Arg::new("original-blob-ids") - .long("original-blob-ids") - .required(false) - .help("original blob id list separated by comma, it may usually be a sha256 hex string"), - ) - .arg( - Arg::new("blob-sizes") - .long("blob-sizes") - .required(false) - .help("RAFS blob size list separated by comma"), - ) - .arg( - Arg::new("blob-toc-digests") - .long("blob-toc-digests") - .required(false) - .help("RAFS blob toc digest list separated by comma"), - ) - .arg( - Arg::new("blob-toc-sizes") - .long("blob-toc-sizes") - .required(false) - .help("RAFS blob toc size list separated by comma"), - ) .arg(arg_config.clone()) .arg( Arg::new("SOURCE") @@ -854,7 +780,6 @@ fn main() -> Result<()> { Command::create(matches, &build_info) } else if let Some(matches) = cmd.subcommand_matches("chunkdict") { match matches.subcommand_name() { - Some("save") => Command::chunkdict_save(matches.subcommand_matches("save").unwrap()), Some("generate") => Command::chunkdict_generate( matches.subcommand_matches("generate").unwrap(), &build_info, @@ -1277,54 +1202,6 @@ impl Command { OutputSerializer::dump(matches, build_output, build_info) } - fn chunkdict_save(matches: &ArgMatches) -> Result<()> { - // Parse the directory name of bootstrap and obtain the image name and version name - let bootstrap_path = Self::get_bootstrap(matches)?; - let path = bootstrap_path.display().to_string(); - info!("Bootstrap path is {}", path); - let path_name: Vec<&str> = path.split('/').collect(); - - // Extract the image name and version name from the bootstrap directory - let bootstrap_dir = match path_name.get(path_name.len() - 2) { - Some(&bootstrap_dir) => bootstrap_dir.to_string(), - None => bail!("Invalid Bootstrap directory name"), - }; - let full_image_name: Vec<&str> = bootstrap_dir.split(':').collect(); - let image_name = match full_image_name.get(full_image_name.len() - 2) { - Some(&second_last) => second_last.to_string(), - None => bail!("Invalid image name"), - }; - let version_name = match full_image_name.last() { - Some(&last) => last.to_string(), - None => bail!("Invalid version name"), - }; - - let config = 
Self::get_configuration(matches)?; - let db_url: &String = matches.get_one::("database").unwrap(); - - // For backward compatibility with v2.1. - config - .internal - .set_blob_accessible(matches.get_one::("bootstrap").is_none()); - let db_strs: Vec<&str> = db_url.split("://").collect(); - if db_strs.len() != 2 || (!db_strs[1].starts_with('/') && !db_strs[1].starts_with(':')) { - bail!("Invalid database URL: {}", db_url); - } - - match db_strs[0] { - "sqlite" => { - let mut deduplicate: Deduplicate = - Deduplicate::::new(db_strs[1])?; - deduplicate.save_metadata(bootstrap_path, config, image_name, version_name)? - } - _ => { - bail!("Unsupported database type: {}, please use a valid database URI, such as 'sqlite:///path/to/chunkdict.db'.", db_strs[0]) - } - }; - info!("Chunkdict metadata is saved at: {:?}", db_url); - Ok(()) - } - fn chunkdict_generate(matches: &ArgMatches, build_info: &BuildTimeInfo) -> Result<()> { let db_url: &String = matches.get_one::("database").unwrap(); // save chunk and blob info to database @@ -1400,6 +1277,7 @@ impl Command { } }; + // Output noise point in DBSCAN clustering algorithm info!( "The length of chunkdict is {}", Vec::::len(&chunkdict) @@ -1409,7 +1287,7 @@ impl Command { info!("{}", image_name); } - // dump chunkdict to bootstrap + // Dump chunkdict to bootstrap let features = Features::try_from( matches .get_one::("features") @@ -1430,8 +1308,8 @@ impl Command { build_ctx.blob_storage = Some(chunkdict_bootstrap_path); build_ctx.blob_features = BlobFeatures::CAP_TAR_TOC; build_ctx.blob_features.insert(BlobFeatures::ALIGNED); - // build_ctx.blob_features.insert(BlobFeatures::CHUNK_INFO_V2); - // build_ctx.blob_features.insert(BlobFeatures::ENCRYPTED); + // Build_ctx.blob_features.insert(BlobFeatures::CHUNK_INFO_V2); + // Build_ctx.blob_features.insert(BlobFeatures::ENCRYPTED); build_ctx.features = features; let digester = matches From 7cd363106567f4600ab2908a568ef965da7734d4 Mon Sep 17 00:00:00 2001 From: Lin Wang Date: Tue, 28 Nov 2023 16:22:46 +0800 Subject: [PATCH 05/11] nydus-image: Store chunk and blob metadata Signed-off-by: Lin Wang --- builder/src/chunkdict_generator.rs | 255 +++++++++++++++++++++++++++++ builder/src/lib.rs | 6 +- src/bin/nydus-image/deduplicate.rs | 151 ++++++++++------- src/bin/nydus-image/main.rs | 111 +++++++++++-- 4 files changed, 452 insertions(+), 71 deletions(-) create mode 100644 builder/src/chunkdict_generator.rs diff --git a/builder/src/chunkdict_generator.rs b/builder/src/chunkdict_generator.rs new file mode 100644 index 00000000000..ed2b4b01d87 --- /dev/null +++ b/builder/src/chunkdict_generator.rs @@ -0,0 +1,255 @@ +// Copyright (C) 2023 Nydus Developers. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +//! Generate Chunkdict RAFS bootstrap. +//! ------------------------------------------------------------------------------------------------- +//! Bug 1: Inconsistent Chunk Size Leading to Blob Size Less Than 4K(v6_block_size) +//! Description: The size of chunks is not consistent, which results in the possibility that a blob, +//! composed of a group of these chunks, may be less than 4K(v6_block_size) in size. +//! This inconsistency leads to a failure in passing the size check. +//! ------------------------------------------------------------------------------------------------- +//! Bug 2: Incorrect Chunk Number Calculation Due to Premature Check Logic +//! Description: The current logic for calculating the chunk number is based on the formula size/chunk size. +//! 
However, this approach is flawed as it precedes the actual check which accounts for chunk statistics. +//! Consequently, this leads to inaccurate counting of chunk numbers. + +use super::core::node::{ChunkSource, NodeInfo}; +use super::{BlobManager, Bootstrap, BootstrapManager, BuildContext, BuildOutput, Tree}; +use crate::core::node::Node; +use crate::NodeChunk; +use anyhow::Result; +use nydus_rafs::metadata::chunk::ChunkWrapper; +use nydus_rafs::metadata::inode::InodeWrapper; +use nydus_rafs::metadata::layout::RafsXAttrs; +use nydus_storage::meta::BlobChunkInfoV1Ondisk; +use nydus_utils::digest::RafsDigest; +use std::ffi::OsString; +use std::mem::size_of; +use std::path::PathBuf; +use std::sync::Arc; + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct ChunkdictChunkInfo { + pub image_reference: String, + pub version: String, + pub chunk_blob_id: String, + pub chunk_digest: String, + pub chunk_compressed_size: u32, + pub chunk_uncompressed_size: u32, + pub chunk_compressed_offset: u64, + pub chunk_uncompressed_offset: u64, +} + +/// Struct to generate chunkdict RAFS bootstrap. +pub struct Generator {} + +impl Generator { + // Generate chunkdict RAFS bootstrap. + pub fn generate( + ctx: &mut BuildContext, + bootstrap_mgr: &mut BootstrapManager, + blob_mgr: &mut BlobManager, + chunkdict_origin: Vec, + ) -> Result { + // validate and remove chunks which bloned blob size is smaller than block. + let mut chunkdict = chunkdict_origin.to_vec(); + Self::validate_and_remove_chunks(ctx, &mut chunkdict); + + // build root tree + let mut tree = Self::build_root_tree(ctx)?; + + // build child tree + let child = Self::build_child_tree(ctx, blob_mgr, &chunkdict)?; + let result = vec![child]; + tree.children = result; + + Self::validate_tree(&tree)?; + + // build bootstrap + let mut bootstrap_ctx = bootstrap_mgr.create_ctx()?; + let mut bootstrap = Bootstrap::new(tree)?; + bootstrap.build(ctx, &mut bootstrap_ctx)?; + + let blob_table = blob_mgr.to_blob_table(ctx)?; + let storage = &mut bootstrap_mgr.bootstrap_storage; + bootstrap.dump(ctx, storage, &mut bootstrap_ctx, &blob_table)?; + + BuildOutput::new(blob_mgr, &bootstrap_mgr.bootstrap_storage) + } + + /// validate tree + fn validate_tree(tree: &Tree) -> Result<()> { + let pre = &mut |t: &Tree| -> Result<()> { + let node = t.lock_node(); + debug!("chunkdict tree: "); + debug!("inode: {}", node); + for chunk in &node.chunks { + debug!("\t chunk: {}", chunk); + } + Ok(()) + }; + tree.walk_dfs_pre(pre)?; + debug!("chunkdict tree is valid."); + Ok(()) + } + + /// check blob uncompressed size is bigger than block + fn validate_and_remove_chunks(ctx: &mut BuildContext, chunkdict: &mut Vec) { + let mut chunk_sizes = std::collections::HashMap::new(); + + // Accumulate the uncompressed size for each chunk_blob_id + for chunk in chunkdict.iter() { + *chunk_sizes.entry(chunk.chunk_blob_id.clone()).or_insert(0) += + chunk.chunk_uncompressed_size as u64; + } + // Find all chunk_blob_ids with a total uncompressed size > v6_block_size + let small_chunks: Vec = chunk_sizes + .into_iter() + .filter(|&(_, size)| size < ctx.v6_block_size()) + .inspect(|(id, _)| { + eprintln!( + "Warning: Blob with id '{}' is smaller than {} bytes.", + id, + ctx.v6_block_size() + ) + }) + .map(|(id, _)| id) + .collect(); + + // Retain only chunks with chunk_blob_id that has a total uncompressed size > v6_block_size + chunkdict.retain(|chunk| !small_chunks.contains(&chunk.chunk_blob_id)); + } + + /// Build root tree + pub fn build_root_tree(ctx: &mut BuildContext) -> Result { + // 
inode + let mut inode = InodeWrapper::new(ctx.fs_version); + inode.set_ino(1); + inode.set_uid(1000); + inode.set_gid(1000); + inode.set_projid(0); + inode.set_mode(0o660 | libc::S_IFDIR as u32); + inode.set_nlink(3); + inode.set_name_size("/".len()); + inode.set_rdev(0); + inode.set_blocks(256); + let node_info = NodeInfo { + explicit_uidgid: true, + src_dev: 66305, + src_ino: 24772610, + rdev: 0, + source: PathBuf::from("/"), + path: PathBuf::from("/"), + target: PathBuf::from("/"), + target_vec: vec![OsString::from("/")], + symlink: None, + xattrs: RafsXAttrs::default(), + v6_force_extended_inode: true, + }; + let root_node = Node::new(inode, node_info, 0); + let tree = Tree::new(root_node); + Ok(tree) + } + + /// Build child tree + fn build_child_tree( + ctx: &mut BuildContext, + blob_mgr: &mut BlobManager, + chunkdict: &[ChunkdictChunkInfo], + ) -> Result { + // node + let mut inode = InodeWrapper::new(ctx.fs_version); + inode.set_ino(2); + inode.set_uid(0); + inode.set_gid(0); + inode.set_projid(0); + inode.set_mode(0o660 | libc::S_IFREG as u32); + inode.set_nlink(1); + inode.set_name_size("chunkdict".len()); + inode.set_rdev(0); + inode.set_blocks(256); + let node_info = NodeInfo { + explicit_uidgid: true, + src_dev: 66305, + src_ino: 24775126, + rdev: 0, + source: PathBuf::from("/"), + path: PathBuf::from("/chunkdict"), + target: PathBuf::from("/chunkdict"), + target_vec: vec![OsString::from("/"), OsString::from("/chunkdict")], + symlink: None, + xattrs: RafsXAttrs::new(), + v6_force_extended_inode: true, + }; + let mut node = Node::new(inode, node_info, 0); + + // insert chunks + Self::insert_chunks(ctx, blob_mgr, &mut node, chunkdict)?; + + let node_size: u64 = node + .chunks + .iter() + .map(|chunk| chunk.inner.uncompressed_size() as u64) + .sum(); + node.inode.set_size(node_size); + + // update child count + node.inode.set_child_count(node.chunks.len() as u32); + + let child = Tree::new(node); + child + .lock_node() + .v5_set_dir_size(ctx.fs_version, &child.children); + Ok(child) + } + + /// Insert chunks + fn insert_chunks( + ctx: &mut BuildContext, + blob_mgr: &mut BlobManager, + node: &mut Node, + chunkdict: &[ChunkdictChunkInfo], + ) -> Result<()> { + for chunk_info in chunkdict.iter() { + let chunk_size: u32 = chunk_info.chunk_compressed_size; + let file_offset = 1 as u64 * chunk_size as u64; + let mut chunk = ChunkWrapper::new(ctx.fs_version); + + // update blob context + let (blob_index, blob_ctx) = + blob_mgr.get_or_cerate_blob_for_chunkdict(ctx, &chunk_info.chunk_blob_id)?; + if blob_ctx.blob_id.is_empty() { + blob_ctx.blob_id = chunk_info.chunk_blob_id.clone(); + } + let chunk_uncompressed_size = chunk_info.chunk_uncompressed_size; + let pre_d_offset = blob_ctx.current_uncompressed_offset; + blob_ctx.uncompressed_blob_size = pre_d_offset + chunk_uncompressed_size as u64; + blob_ctx.current_uncompressed_offset += chunk_uncompressed_size as u64; + + blob_ctx.blob_meta_header.set_ci_uncompressed_size( + blob_ctx.blob_meta_header.ci_uncompressed_size() + + size_of::() as u64, + ); + + // update chunk + let chunk_index = blob_ctx.alloc_chunk_index()?; + chunk.set_blob_index(blob_index); + chunk.set_index(chunk_index); + chunk.set_file_offset(file_offset); + chunk.set_compressed_size(chunk_info.chunk_compressed_size); + chunk.set_compressed_offset(chunk_info.chunk_compressed_offset); + chunk.set_uncompressed_size(chunk_info.chunk_uncompressed_size); + chunk.set_uncompressed_offset(chunk_info.chunk_uncompressed_offset); + 
chunk.set_id(RafsDigest::from_string(&chunk_info.chunk_digest)); + + debug!("chunk id: {}", chunk.id()); + + node.chunks.push(NodeChunk { + source: ChunkSource::Build, + inner: Arc::new(chunk.clone()), + }); + } + Ok(()) + } +} diff --git a/builder/src/lib.rs b/builder/src/lib.rs index bf18b43cec3..50d50b61115 100644 --- a/builder/src/lib.rs +++ b/builder/src/lib.rs @@ -36,8 +36,8 @@ pub use self::core::overlay::{Overlay, WhiteoutSpec}; pub use self::core::prefetch::{Prefetch, PrefetchPolicy}; pub use self::core::tree::{MetadataTreeBuilder, Tree, TreeNode}; pub use self::directory::DirectoryBuilder; -pub use self::generate::ChunkdictChunkInfo; -pub use self::generate::Generater; +pub use self::chunkdict_generator::ChunkdictChunkInfo; +pub use self::chunkdict_generator::Generator; pub use self::merge::Merger; pub use self::stargz::StargzBuilder; pub use self::tarball::TarballBuilder; @@ -45,7 +45,7 @@ pub use self::tarball::TarballBuilder; mod compact; mod core; mod directory; -mod generate; +mod chunkdict_generator; mod merge; mod stargz; mod tarball; diff --git a/src/bin/nydus-image/deduplicate.rs b/src/bin/nydus-image/deduplicate.rs index 4b013daba5e..6a5de6b5f83 100644 --- a/src/bin/nydus-image/deduplicate.rs +++ b/src/bin/nydus-image/deduplicate.rs @@ -6,15 +6,17 @@ use anyhow::{Context, Result}; use core::cmp::Ordering; use nydus_api::ConfigV2; +use nydus_builder::BuildContext; use nydus_builder::ChunkdictChunkInfo; use nydus_builder::Tree; -use nydus_rafs::metadata::RafsSuper; +use nydus_rafs::metadata::{RafsSuper, RafsVersion}; use nydus_storage::device::BlobInfo; use rusqlite::{params, Connection}; use std::collections::HashSet; use std::collections::{BTreeMap, HashMap}; +use std::convert::TryFrom; use std::fs; -use std::path::Path; +use std::path::{Path, PathBuf}; use std::result::Result::Ok; use std::sync::{Arc, Mutex}; @@ -135,6 +137,41 @@ impl Database for SqliteDatabase { } } +/// Get fs version from bootstrap file. +fn get_fs_version(bootstrap_path: &Path) -> Result { + let (sb, _) = RafsSuper::load_from_file(bootstrap_path, Arc::new(ConfigV2::default()), false)?; + RafsVersion::try_from(sb.meta.version).context("Failed to get RAFS version number") +} + +/// Checks if all Bootstrap versions are consistent. +/// If they are inconsistent, returns an error and prints the version of each Bootstrap. +pub fn check_bootstrap_versions_consistency( + ctx: &mut BuildContext, + bootstrap_paths: &[PathBuf], +) -> Result<()> { + let mut versions = Vec::new(); + + for bootstrap_path in bootstrap_paths { + let version = get_fs_version(bootstrap_path)?; + versions.push((bootstrap_path.clone(), version)); + } + + if !versions.is_empty() { + let first_version = versions[0].1; + ctx.fs_version = first_version; + if versions.iter().any(|(_, v)| *v != first_version) { + for (path, version) in &versions { + println!("Bootstrap path {:?} has version {:?}", path, version); + } + return Err(anyhow!( + "Bootstrap versions are inconsistent, cannot use chunkdict." 
+ )); + } + } + + Ok(()) +} + pub struct Deduplicate { db: D, } @@ -203,8 +240,8 @@ impl Deduplicate { let chunk_blob_id = blob_infos[index as usize].blob_id(); self.db .insert_chunk(&ChunkdictChunkInfo { - image_name: image_name.to_string(), - version_name: version_name.to_string(), + image_reference: image_name.to_string(), + version: version_name.to_string(), chunk_blob_id, chunk_digest: chunk.inner.id().to_string(), chunk_compressed_size: chunk.inner.compressed_size(), @@ -229,9 +266,9 @@ pub struct Algorithm { } // Generate deduplicated chunkdict by exponential_smoothing algorithm -type VersionMap = HashMap>; +type Versiondic = HashMap>; // Generate deduplicated chunkdict by cluster algorithm -type ImageMap = Vec, Vec>>; +type Imagedic = Vec, Vec>>; impl Algorithm { pub fn new(algorithm: String, db_url: &str) -> anyhow::Result { @@ -270,9 +307,9 @@ impl Algorithm { chunkdict_size as f64 / 1024 as f64 / 1024 as f64 ); for chunk in all_chunks { - if !core_image.contains(&chunk.image_name) && !noise_points.contains(&chunk.image_name) + if !core_image.contains(&chunk.image_reference) && !noise_points.contains(&chunk.image_reference) { - noise_points.push(chunk.image_name.clone()); + noise_points.push(chunk.image_reference.clone()); } } Ok((chunkdict, noise_points)) @@ -295,11 +332,11 @@ impl Algorithm { for (chunk_index, chunk) in all_chunks.iter().enumerate() { let mut is_duplicate: f64 = 0.0; - if chunk.version_name == all_chunks[0].version_name { + if chunk.version == all_chunks[0].version { let smoothed_score: f64 = 0.0; smoothed_data.push(smoothed_score); } else { - if all_chunks[chunk_index - 1].version_name != all_chunks[chunk_index].version_name + if all_chunks[chunk_index - 1].version != all_chunks[chunk_index].version { last_start_version_index = start_version_index; start_version_index = chunk_index; @@ -324,8 +361,8 @@ impl Algorithm { let mut chunkdict: Vec = Vec::new(); for i in 0..smoothed_data.len() { let chunk = ChunkdictChunkInfo { - image_name: all_chunks[i].image_name.clone(), - version_name: all_chunks[i].version_name.clone(), + image_reference: all_chunks[i].image_reference.clone(), + version: all_chunks[i].version.clone(), chunk_blob_id: all_chunks[i].chunk_blob_id.clone(), chunk_digest: all_chunks[i].chunk_digest.clone(), chunk_compressed_offset: all_chunks[i].chunk_compressed_offset, @@ -393,7 +430,7 @@ impl Algorithm { let mut datadict: Vec = Vec::new(); for chunk in all_chunks { image_chunks - .entry(chunk.image_name.clone()) + .entry(chunk.image_reference.clone()) .or_insert(Vec::new()) .push(chunk.clone()); } @@ -420,7 +457,7 @@ impl Algorithm { // Group chunks into image_name for chunk in chunks { let entry = image_chunks - .entry(chunk.image_name.clone()) + .entry(chunk.image_reference.clone()) .or_insert(Vec::new()); entry.push(chunk.clone()); } @@ -436,7 +473,7 @@ impl Algorithm { // Group the chunks in the image into version_name for chunk in chunk_list { let entry = version_chunks - .entry(CustomString(chunk.version_name.clone())) + .entry(CustomString(chunk.version.clone())) .or_insert(Vec::new()); entry.push(chunk.clone()); } @@ -679,7 +716,7 @@ impl Algorithm { pub fn deduplicate_version( all_chunks: &[ChunkdictChunkInfo], - ) -> anyhow::Result<(VersionMap, ImageMap)> { + ) -> anyhow::Result<(Versiondic, Imagedic)> { let mut all_chunks_size = 0; for i in all_chunks { all_chunks_size += i.chunk_compressed_size; @@ -870,8 +907,8 @@ impl ChunkTable { )?; let chunk_iterator = stmt.query_map(params![blob_id, limit, offset], |row| { Ok(ChunkdictChunkInfo { 
- image_name: row.get(1)?, - version_name: row.get(2)?, + image_reference: row.get(1)?, + version: row.get(2)?, chunk_blob_id: row.get(3)?, chunk_digest: row.get(4)?, chunk_compressed_size: row.get(5)?, @@ -1001,8 +1038,8 @@ impl Table for ChunkTable { VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8); ", rusqlite::params![ - chunk.image_name, - chunk.version_name, + chunk.image_reference, + chunk.version, chunk.chunk_blob_id, chunk.chunk_digest, chunk.chunk_compressed_size, @@ -1050,8 +1087,8 @@ impl Table for ChunkTable { )?; let chunk_iterator = stmt.query_map(params![limit, offset], |row| { Ok(ChunkdictChunkInfo { - image_name: row.get(1)?, - version_name: row.get(2)?, + image_reference: row.get(1)?, + version: row.get(2)?, chunk_blob_id: row.get(3)?, chunk_digest: row.get(4)?, chunk_compressed_size: row.get(5)?, @@ -1241,8 +1278,8 @@ mod tests { let chunk_table = ChunkTable::new_in_memory()?; chunk_table.create()?; let chunk = ChunkdictChunkInfo { - image_name: "REDIS".to_string(), - version_name: "1.0.0".to_string(), + image_reference: "REDIS".to_string(), + version: "1.0.0".to_string(), chunk_blob_id: "BLOB123".to_string(), chunk_digest: "DIGEST123".to_string(), chunk_compressed_size: 512, @@ -1252,8 +1289,8 @@ mod tests { }; chunk_table.insert(&chunk)?; let chunk2 = ChunkdictChunkInfo { - image_name: "REDIS".to_string(), - version_name: "1.0.0".to_string(), + image_reference: "REDIS".to_string(), + version: "1.0.0".to_string(), chunk_blob_id: "BLOB456".to_string(), chunk_digest: "DIGEST123".to_string(), chunk_compressed_size: 512, @@ -1263,8 +1300,8 @@ mod tests { }; chunk_table.insert(&chunk2)?; let chunks = chunk_table.list_all()?; - assert_eq!(chunks[0].image_name, chunk.image_name); - assert_eq!(chunks[0].version_name, chunk.version_name); + assert_eq!(chunks[0].image_reference, chunk.image_reference); + assert_eq!(chunks[0].version, chunk.version); assert_eq!(chunks.len(), 2); assert_eq!(chunks[0].chunk_blob_id, chunk.chunk_blob_id); assert_eq!(chunks[0].chunk_digest, chunk.chunk_digest); @@ -1316,8 +1353,8 @@ mod tests { for i in 0..200 { let i64 = i as u64; let chunk = ChunkdictChunkInfo { - image_name: format!("REDIS{}", i), - version_name: format!("1.0.0{}", i), + image_reference: format!("REDIS{}", i), + version: format!("1.0.0{}", i), chunk_blob_id: format!("BLOB{}", i), chunk_digest: format!("DIGEST{}", i), chunk_compressed_size: i, @@ -1329,8 +1366,8 @@ mod tests { } let chunks = chunk_table.list_paged(100, 100)?; assert_eq!(chunks.len(), 100); - assert_eq!(chunks[0].image_name, "REDIS100"); - assert_eq!(chunks[0].version_name, "1.0.0100"); + assert_eq!(chunks[0].image_reference, "REDIS100"); + assert_eq!(chunks[0].version, "1.0.0100"); assert_eq!(chunks[0].chunk_blob_id, "BLOB100"); assert_eq!(chunks[0].chunk_digest, "DIGEST100"); assert_eq!(chunks[0].chunk_compressed_size, 100); @@ -1347,8 +1384,8 @@ mod tests { for i in 0..199 { let i64 = i as u64; let chunk = ChunkdictChunkInfo { - image_name: format!("REDIS{}", 0), - version_name: format!("1.0.0{}", (i + 1) / 100), + image_reference: format!("REDIS{}", 0), + version: format!("1.0.0{}", (i + 1) / 100), chunk_blob_id: format!("BLOB{}", i), chunk_digest: format!("DIGEST{}", (i + 1) % 2), chunk_compressed_size: i, @@ -1360,8 +1397,8 @@ mod tests { } let chunkdict = Algorithm::::exponential_smoothing(all_chunk, threshold)?; assert_eq!(chunkdict.len(), 2); - assert_eq!(chunkdict[0].image_name, "REDIS0"); - assert_eq!(chunkdict[0].version_name, "1.0.01"); + assert_eq!(chunkdict[0].image_reference, "REDIS0"); + 
assert_eq!(chunkdict[0].version, "1.0.01"); assert_eq!(chunkdict[0].chunk_blob_id, "BLOB99"); assert_eq!(chunkdict[0].chunk_digest, "DIGEST0"); assert_eq!(chunkdict[0].chunk_compressed_size, 99); @@ -1379,8 +1416,8 @@ mod tests { for i in 0..200 { let i64 = i as u64; let chunk = ChunkdictChunkInfo { - image_name: format!("REDIS{}", i / 50), - version_name: format!("1.0.0{}", (i + 1) / 100), + image_reference: format!("REDIS{}", i / 50), + version: format!("1.0.0{}", (i + 1) / 100), chunk_blob_id: format!("BLOB{}", i), chunk_digest: format!("DIGEST{}", (i + 1) % 2), chunk_compressed_size: i, @@ -1408,8 +1445,8 @@ mod tests { for i in 0..200 { let i64 = i as u64; let chunk = ChunkdictChunkInfo { - image_name: format!("REDIS{}", 0), - version_name: format!("1.0.0{}", (i + 1) / 100), + image_reference: format!("REDIS{}", 0), + version: format!("1.0.0{}", (i + 1) / 100), chunk_blob_id: format!("BLOB{}", i), chunk_digest: format!("DIGEST{}", (i + 1) % 4), chunk_compressed_size: 1, @@ -1423,8 +1460,8 @@ mod tests { for i in 0..200 { let i64 = i as u64; let chunk = ChunkdictChunkInfo { - image_name: format!("REDIS{}", 1), - version_name: format!("1.0.0{}", (i + 1) / 100), + image_reference: format!("REDIS{}", 1), + version: format!("1.0.0{}", (i + 1) / 100), chunk_blob_id: format!("BLOB{}", i), chunk_digest: format!("DIGEST{}", (i + 1) % 4), chunk_compressed_size: 1, @@ -1451,8 +1488,8 @@ mod tests { for i in 0..200 { for j in 0..100 { let chunk = ChunkdictChunkInfo { - image_name: format!("REDIS{}", i), - version_name: format!("1.0.0{}", j / 10), + image_reference: format!("REDIS{}", i), + version: format!("1.0.0{}", j / 10), chunk_blob_id: format!("BLOB{}", j), chunk_digest: format!("DIGEST{}", j + (i / 100) * 100), chunk_compressed_size: 1, @@ -1466,11 +1503,11 @@ mod tests { assert_eq!(all_chunks.len(), 20000); let (train, test) = Algorithm::::divide_set(&all_chunks, 0.7)?; assert_eq!(train.len(), 14000); - assert_eq!(train[0].image_name, "REDIS0"); - assert_eq!(train[0].version_name, "1.0.00"); + assert_eq!(train[0].image_reference, "REDIS0"); + assert_eq!(train[0].version, "1.0.00"); assert_eq!(test.len(), 6000); - assert_eq!(test[0].image_name, "REDIS0"); - assert_eq!(test[0].version_name, "1.0.07"); + assert_eq!(test[0].image_reference, "REDIS0"); + assert_eq!(test[0].version, "1.0.07"); Ok(()) } @@ -1481,8 +1518,8 @@ mod tests { for i in 0..200 { for j in 0..100 { let chunk = ChunkdictChunkInfo { - image_name: format!("REDIS{}", i), - version_name: format!("1.0.0{}", j / 10), + image_reference: format!("REDIS{}", i), + version: format!("1.0.0{}", j / 10), chunk_blob_id: format!("BLOB{}", j), chunk_digest: format!("DIGEST{}", j + (i / 100) * 100), chunk_compressed_size: 1, @@ -1516,8 +1553,8 @@ mod tests { for i in 0..200 { for j in 0..100 { let chunk = ChunkdictChunkInfo { - image_name: format!("REDIS{}", i), - version_name: format!("1.0.0{}", (j + 1) / 100), + image_reference: format!("REDIS{}", i), + version: format!("1.0.0{}", (j + 1) / 100), chunk_blob_id: format!("BLOB{}", j), chunk_digest: format!("DIGEST{}", j + (i / 100) * 100), chunk_compressed_size: 1, @@ -1542,8 +1579,8 @@ mod tests { for i in 0..200 { for j in 0..100 { let chunk = ChunkdictChunkInfo { - image_name: format!("REDIS{}", i), - version_name: format!("1.0.0{}", j / 10), + image_reference: format!("REDIS{}", i), + version: format!("1.0.0{}", j / 10), chunk_blob_id: format!("BLOB{}", j), chunk_digest: format!("DIGEST{}", j + (i / 100) * 100), chunk_compressed_size: 1, @@ -1578,8 +1615,8 @@ mod tests { for i in 0..200 
{ let i64 = i as u64; let chunk = ChunkdictChunkInfo { - image_name: format!("REDIS{}", 0), - version_name: format!("1.0.0{}", (i + 1) / 20), + image_reference: format!("REDIS{}", 0), + version: format!("1.0.0{}", (i + 1) / 20), chunk_blob_id: format!("BLOB{}", i), chunk_digest: format!("DIGEST{}", (i + 1) % 2), chunk_compressed_size: i, @@ -1595,7 +1632,7 @@ mod tests { chunkdict.extend(dictionary); } - assert_eq!(chunkdict[0].image_name, "REDIS0"); + assert_eq!(chunkdict[0].image_reference, "REDIS0"); assert_eq!(chunkdict[0].chunk_compressed_size, 21); assert_eq!(chunkdict.len(), 2); diff --git a/src/bin/nydus-image/main.rs b/src/bin/nydus-image/main.rs index d1cb8d47eb9..ab32e99feb9 100644 --- a/src/bin/nydus-image/main.rs +++ b/src/bin/nydus-image/main.rs @@ -29,7 +29,7 @@ use nydus_api::{BuildTimeInfo, ConfigV2, LocalFsConfig}; use nydus_builder::{ parse_chunk_dict_arg, ArtifactStorage, BlobCacheGenerator, BlobCompactor, BlobManager, BootstrapManager, BuildContext, BuildOutput, Builder, ChunkdictChunkInfo, ConversionType, - DirectoryBuilder, Feature, Features, Generater, HashChunkDict, Merger, Prefetch, + DirectoryBuilder, Feature, Features, Generator, HashChunkDict, Merger, Prefetch, PrefetchPolicy, StargzBuilder, TarballBuilder, WhiteoutSpec, }; use nydus_rafs::metadata::{MergeError, RafsSuper, RafsSuperConfig, RafsVersion}; @@ -45,7 +45,7 @@ use nydus_utils::{ }; use serde::{Deserialize, Serialize}; -use crate::deduplicate::Deduplicate; +use crate::deduplicate::{check_bootstrap_versions_consistency, Deduplicate}; use crate::unpack::{OCIUnpacker, Unpacker}; use crate::validator::Validator; @@ -383,6 +383,12 @@ fn prepare_cmd_args(bti_string: &'static str) -> App { .default_value("sqlite:///home/runner/output/database.db") .required(false), ) + .arg( + Arg::new("parent-bootstrap") + .long("parent-bootstrap") + .help("File path of the parent/referenced RAFS metadata blob (optional)") + .required(false), + ) .arg( Arg::new("bootstrap") .long("bootstrap") @@ -395,8 +401,39 @@ fn prepare_cmd_args(bti_string: &'static str) -> App { .short('D') .help("Directory path to save generated RAFS metadata and data blobs"), ) + .arg(arg_chunk_dict.clone()) .arg(arg_prefetch_policy.clone()) .arg(arg_output_json.clone()) + .arg( + Arg::new("blob-digests") + .long("blob-digests") + .required(false) + .help("RAFS blob digest list separated by comma"), + ) + .arg( + Arg::new("original-blob-ids") + .long("original-blob-ids") + .required(false) + .help("original blob id list separated by comma, it may usually be a sha256 hex string"), + ) + .arg( + Arg::new("blob-sizes") + .long("blob-sizes") + .required(false) + .help("RAFS blob size list separated by comma"), + ) + .arg( + Arg::new("blob-toc-digests") + .long("blob-toc-digests") + .required(false) + .help("RAFS blob toc digest list separated by comma"), + ) + .arg( + Arg::new("blob-toc-sizes") + .long("blob-toc-sizes") + .required(false) + .help("RAFS blob toc size list separated by comma"), + ) .arg(arg_config.clone()) .arg( Arg::new("SOURCE") @@ -780,6 +817,7 @@ fn main() -> Result<()> { Command::create(matches, &build_info) } else if let Some(matches) = cmd.subcommand_matches("chunkdict") { match matches.subcommand_name() { + Some("save") => Command::chunkdict_save(matches.subcommand_matches("save").unwrap()), Some("generate") => Command::chunkdict_generate( matches.subcommand_matches("generate").unwrap(), &build_info, @@ -1251,6 +1289,62 @@ impl Command { } info!("Chunkdict metadata is saved at: {:?}", db_url); + fn chunkdict_generate(matches: 
&ArgMatches, build_info: &BuildTimeInfo) -> Result<()> { + let mut build_ctx = BuildContext { + prefetch: Self::get_prefetch(matches)?, + ..Default::default() + }; + let db_url: &String = matches.get_one::("database").unwrap(); + // save chunk and blob info to database + let source_bootstrap_paths: Vec = matches + .get_many::("SOURCE") + .map(|paths| paths.map(PathBuf::from).collect()) + .unwrap(); + + check_bootstrap_versions_consistency(&mut build_ctx, &source_bootstrap_paths)?; + + for (_, bootstrap_path) in source_bootstrap_paths.iter().enumerate() { + let path = bootstrap_path.display().to_string(); + info!("Bootstrap path is {}", path); + let path_name: Vec<&str> = path.split('/').collect(); + + // Extract the image name and version name from the bootstrap directory + let bootstrap_dir = match path_name.get(path_name.len() - 2) { + Some(&bootstrap_dir) => bootstrap_dir.to_string(), + None => bail!("Invalid Bootstrap directory name"), + }; + let full_image_name: Vec<&str> = bootstrap_dir.split(':').collect(); + let image_name = match full_image_name.get(full_image_name.len() - 2) { + Some(&second_last) => second_last.to_string(), + None => bail!("Invalid image name"), + }; + let version_name = match full_image_name.last() { + Some(&last) => last.to_string(), + None => bail!("Invalid version name"), + }; + // For backward compatibility with v2.1. + let config = Self::get_configuration(matches)?; + config + .internal + .set_blob_accessible(matches.get_one::("bootstrap").is_none()); + let db_strs: Vec<&str> = db_url.split("://").collect(); + if db_strs.len() != 2 || (!db_strs[1].starts_with('/') && !db_strs[1].starts_with(':')) + { + bail!("Invalid database URL: {}", db_url); + } + match db_strs[0] { + "sqlite" => { + let mut deduplicate: Deduplicate = + Deduplicate::::new(db_strs[1])?; + deduplicate.save_metadata(bootstrap_path, config, image_name, version_name)? 
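                    // At this point the chunks and blobs of this bootstrap have been recorded in the
                    // SQLite chunk/blob tables, tagged with the image and version names parsed above.
                    // For example, assuming a bootstrap directory named "localhost:5000:redis:nydus_7.0.1"
                    // (registry:port:image:version layout), the rows are stored with image name "redis"
                    // and version name "nydus_7.0.1".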
+ } + _ => { + bail!("Unsupported database type: {}, please use a valid database URI, such as 'sqlite:///path/to/chunkdict.db'.", db_strs[0]) + } + }; + } + info!("Chunkdict metadata is saved at: {:?}", db_url); + // Connecting database and Generating chunk dictionary by algorithm "exponential_smoothing" let db_strs: Vec<&str> = db_url.split("://").collect(); if db_strs.len() != 2 || (!db_strs[1].starts_with('/') && !db_strs[1].starts_with(':')) { @@ -1277,7 +1371,6 @@ impl Command { } }; - // Output noise point in DBSCAN clustering algorithm info!( "The length of chunkdict is {}", Vec::::len(&chunkdict) @@ -1287,7 +1380,7 @@ impl Command { info!("{}", image_name); } - // Dump chunkdict to bootstrap + // dump chunkdict to bootstrap let features = Features::try_from( matches .get_one::("features") @@ -1300,16 +1393,12 @@ impl Command { config .internal .set_blob_accessible(matches.get_one::("config").is_some()); - let mut build_ctx = BuildContext { - prefetch: Self::get_prefetch(matches)?, - ..Default::default() - }; build_ctx.configuration = config; build_ctx.blob_storage = Some(chunkdict_bootstrap_path); build_ctx.blob_features = BlobFeatures::CAP_TAR_TOC; build_ctx.blob_features.insert(BlobFeatures::ALIGNED); - // Build_ctx.blob_features.insert(BlobFeatures::CHUNK_INFO_V2); - // Build_ctx.blob_features.insert(BlobFeatures::ENCRYPTED); + // build_ctx.blob_features.insert(BlobFeatures::CHUNK_INFO_V2); + // build_ctx.blob_features.insert(BlobFeatures::ENCRYPTED); build_ctx.features = features; let digester = matches @@ -1323,7 +1412,7 @@ impl Command { let mut bootstrap_mgr = BootstrapManager::new(Some(bootstrap_path), None); let output = - Generater::generate(&mut build_ctx, &mut bootstrap_mgr, &mut blob_mgr, chunkdict)?; + Generator::generate(&mut build_ctx, &mut bootstrap_mgr, &mut blob_mgr, chunkdict)?; OutputSerializer::dump(matches, output, build_info).unwrap(); info!( "Chunkdict metadata is saved at: {:?}", From 7de48359896df4738dbd0ad5be0939f8b3606ab0 Mon Sep 17 00:00:00 2001 From: Zhao Yuan <1627990440@qq.com> Date: Fri, 8 Dec 2023 05:21:32 +0000 Subject: [PATCH 06/11] Merge invoked subcomand in nydusify and add smoke test Signed-off-by: Zhao Yuan <1627990440@qq.com> --- src/bin/nydus-image/deduplicate.rs | 6 +- src/bin/nydus-image/main.rs | 94 ++---------------------------- 2 files changed, 7 insertions(+), 93 deletions(-) diff --git a/src/bin/nydus-image/deduplicate.rs b/src/bin/nydus-image/deduplicate.rs index 6a5de6b5f83..b2313297f21 100644 --- a/src/bin/nydus-image/deduplicate.rs +++ b/src/bin/nydus-image/deduplicate.rs @@ -266,9 +266,9 @@ pub struct Algorithm { } // Generate deduplicated chunkdict by exponential_smoothing algorithm -type Versiondic = HashMap>; +type VersionMap = HashMap>; // Generate deduplicated chunkdict by cluster algorithm -type Imagedic = Vec, Vec>>; +type ImageMap = Vec, Vec>>; impl Algorithm { pub fn new(algorithm: String, db_url: &str) -> anyhow::Result { @@ -716,7 +716,7 @@ impl Algorithm { pub fn deduplicate_version( all_chunks: &[ChunkdictChunkInfo], - ) -> anyhow::Result<(Versiondic, Imagedic)> { + ) -> anyhow::Result<(VersionMap, ImageMap)> { let mut all_chunks_size = 0; for i in all_chunks { all_chunks_size += i.chunk_compressed_size; diff --git a/src/bin/nydus-image/main.rs b/src/bin/nydus-image/main.rs index ab32e99feb9..b2cf763320d 100644 --- a/src/bin/nydus-image/main.rs +++ b/src/bin/nydus-image/main.rs @@ -383,12 +383,6 @@ fn prepare_cmd_args(bti_string: &'static str) -> App { .default_value("sqlite:///home/runner/output/database.db") 
.required(false), ) - .arg( - Arg::new("parent-bootstrap") - .long("parent-bootstrap") - .help("File path of the parent/referenced RAFS metadata blob (optional)") - .required(false), - ) .arg( Arg::new("bootstrap") .long("bootstrap") @@ -401,39 +395,8 @@ fn prepare_cmd_args(bti_string: &'static str) -> App { .short('D') .help("Directory path to save generated RAFS metadata and data blobs"), ) - .arg(arg_chunk_dict.clone()) .arg(arg_prefetch_policy.clone()) .arg(arg_output_json.clone()) - .arg( - Arg::new("blob-digests") - .long("blob-digests") - .required(false) - .help("RAFS blob digest list separated by comma"), - ) - .arg( - Arg::new("original-blob-ids") - .long("original-blob-ids") - .required(false) - .help("original blob id list separated by comma, it may usually be a sha256 hex string"), - ) - .arg( - Arg::new("blob-sizes") - .long("blob-sizes") - .required(false) - .help("RAFS blob size list separated by comma"), - ) - .arg( - Arg::new("blob-toc-digests") - .long("blob-toc-digests") - .required(false) - .help("RAFS blob toc digest list separated by comma"), - ) - .arg( - Arg::new("blob-toc-sizes") - .long("blob-toc-sizes") - .required(false) - .help("RAFS blob toc size list separated by comma"), - ) .arg(arg_config.clone()) .arg( Arg::new("SOURCE") @@ -817,7 +780,6 @@ fn main() -> Result<()> { Command::create(matches, &build_info) } else if let Some(matches) = cmd.subcommand_matches("chunkdict") { match matches.subcommand_name() { - Some("save") => Command::chunkdict_save(matches.subcommand_matches("save").unwrap()), Some("generate") => Command::chunkdict_generate( matches.subcommand_matches("generate").unwrap(), &build_info, @@ -1240,55 +1202,6 @@ impl Command { OutputSerializer::dump(matches, build_output, build_info) } - fn chunkdict_generate(matches: &ArgMatches, build_info: &BuildTimeInfo) -> Result<()> { - let db_url: &String = matches.get_one::("database").unwrap(); - // save chunk and blob info to database - let source_bootstrap_paths: Vec = matches - .get_many::("SOURCE") - .map(|paths| paths.map(PathBuf::from).collect()) - .unwrap(); - for (_, bootstrap_path) in source_bootstrap_paths.iter().enumerate() { - let path = bootstrap_path.display().to_string(); - info!("Bootstrap path is {}", path); - let path_name: Vec<&str> = path.split('/').collect(); - - // Extract the image name and version name from the bootstrap directory - let bootstrap_dir = match path_name.get(path_name.len() - 2) { - Some(&bootstrap_dir) => bootstrap_dir.to_string(), - None => bail!("Invalid Bootstrap directory name"), - }; - let full_image_name: Vec<&str> = bootstrap_dir.split(':').collect(); - let image_name = match full_image_name.get(full_image_name.len() - 2) { - Some(&second_last) => second_last.to_string(), - None => bail!("Invalid image name"), - }; - let version_name = match full_image_name.last() { - Some(&last) => last.to_string(), - None => bail!("Invalid version name"), - }; - // For backward compatibility with v2.1. - let config = Self::get_configuration(matches)?; - config - .internal - .set_blob_accessible(matches.get_one::("bootstrap").is_none()); - let db_strs: Vec<&str> = db_url.split("://").collect(); - if db_strs.len() != 2 || (!db_strs[1].starts_with('/') && !db_strs[1].starts_with(':')) - { - bail!("Invalid database URL: {}", db_url); - } - match db_strs[0] { - "sqlite" => { - let mut deduplicate: Deduplicate = - Deduplicate::::new(db_strs[1])?; - deduplicate.save_metadata(bootstrap_path, config, image_name, version_name)? 
- } - _ => { - bail!("Unsupported database type: {}, please use a valid database URI, such as 'sqlite:///path/to/chunkdict.db'.", db_strs[0]) - } - }; - } - info!("Chunkdict metadata is saved at: {:?}", db_url); - fn chunkdict_generate(matches: &ArgMatches, build_info: &BuildTimeInfo) -> Result<()> { let mut build_ctx = BuildContext { prefetch: Self::get_prefetch(matches)?, @@ -1371,6 +1284,7 @@ impl Command { } }; + // Output noise point in DBSCAN clustering algorithm info!( "The length of chunkdict is {}", Vec::::len(&chunkdict) @@ -1380,7 +1294,7 @@ impl Command { info!("{}", image_name); } - // dump chunkdict to bootstrap + // Dump chunkdict to bootstrap let features = Features::try_from( matches .get_one::("features") @@ -1397,8 +1311,8 @@ impl Command { build_ctx.blob_storage = Some(chunkdict_bootstrap_path); build_ctx.blob_features = BlobFeatures::CAP_TAR_TOC; build_ctx.blob_features.insert(BlobFeatures::ALIGNED); - // build_ctx.blob_features.insert(BlobFeatures::CHUNK_INFO_V2); - // build_ctx.blob_features.insert(BlobFeatures::ENCRYPTED); + // Build_ctx.blob_features.insert(BlobFeatures::CHUNK_INFO_V2); + // Build_ctx.blob_features.insert(BlobFeatures::ENCRYPTED); build_ctx.features = features; let digester = matches From 135779b478ef841507951c4dd34231088a04d507 Mon Sep 17 00:00:00 2001 From: Zhao Yuan <1627990440@qq.com> Date: Tue, 12 Dec 2023 04:46:35 +0000 Subject: [PATCH 07/11] Modify database fields synchronously Signed-off-by: Zhao Yuan <1627990440@qq.com> --- src/bin/nydus-image/deduplicate.rs | 52 ++++++++++++++++-------------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/src/bin/nydus-image/deduplicate.rs b/src/bin/nydus-image/deduplicate.rs index b2313297f21..fa090cf648b 100644 --- a/src/bin/nydus-image/deduplicate.rs +++ b/src/bin/nydus-image/deduplicate.rs @@ -192,14 +192,14 @@ impl Deduplicate { &mut self, bootstrap_path: &Path, config: Arc, - image_name: String, - version_name: String, + image_reference: String, + version: String, ) -> anyhow::Result>> { let (sb, _) = RafsSuper::load_from_file(bootstrap_path, config, false)?; self.create_tables()?; let blob_infos = sb.superblock.get_blob_infos(); self.insert_blobs(&blob_infos)?; - self.insert_chunks(&blob_infos, &sb, image_name, version_name)?; + self.insert_chunks(&blob_infos, &sb, image_reference, version)?; Ok(blob_infos) } @@ -230,8 +230,8 @@ impl Deduplicate { &mut self, blob_infos: &[Arc], sb: &RafsSuper, - image_name: String, - version_name: String, + image_reference: String, + version: String, ) -> anyhow::Result<()> { let process_chunk = &mut |t: &Tree| -> Result<()> { let node = t.lock_node(); @@ -240,8 +240,8 @@ impl Deduplicate { let chunk_blob_id = blob_infos[index as usize].blob_id(); self.db .insert_chunk(&ChunkdictChunkInfo { - image_reference: image_name.to_string(), - version: version_name.to_string(), + image_reference: image_reference.to_string(), + version: version.to_string(), chunk_blob_id, chunk_digest: chunk.inner.id().to_string(), chunk_compressed_size: chunk.inner.compressed_size(), @@ -436,7 +436,7 @@ impl Algorithm { } for (index, chunks) in image_chunks { let data_point = DataPoint { - image_name: index, + image_reference: index, chunk_list: chunks, visited: false, clustered: false, @@ -451,10 +451,10 @@ impl Algorithm { chunks: &[ChunkdictChunkInfo], train_percentage: f64, ) -> anyhow::Result<(Vec, Vec)> { - // Create a HashMap to store the list of chunks for each image_name + // Create a HashMap to store the list of chunks for each image_reference let mut image_chunks: 
BTreeMap> = BTreeMap::new(); - // Group chunks into image_name + // Group chunks into image_reference for chunk in chunks { let entry = image_chunks .entry(chunk.image_reference.clone()) @@ -466,11 +466,11 @@ impl Algorithm { let mut train_set: Vec = Vec::new(); let mut test_set: Vec = Vec::new(); - // Iterate through the list of Chunks for each image_name + // Iterate through the list of Chunks for each image_reference for (_, chunk_list) in image_chunks.iter_mut() { let mut version_chunks: BTreeMap> = BTreeMap::new(); - // Group the chunks in the image into version_name + // Group the chunks in the image into version for chunk in chunk_list { let entry = version_chunks .entry(CustomString(chunk.version.clone())) @@ -598,10 +598,12 @@ impl Algorithm { for &point_index in cluster_points { let point = &data_point[point_index]; // let all_count = 0; - let image_total_count = image_total_counts.entry(&point.image_name).or_insert(0); + let image_total_count = image_total_counts + .entry(&point.image_reference) + .or_insert(0); *image_total_count += 1; - image_list.push(point.image_name.clone()); + image_list.push(point.image_reference.clone()); } // Count the number of images in which chunks appear in the cluster @@ -762,7 +764,7 @@ impl Algorithm { for point in data_point.iter_mut() { for single_dictionary in &datadict { for (key, value) in single_dictionary.iter() { - if key.contains(&point.image_name) { + if key.contains(&point.image_reference) { let mut to_remove = Vec::new(); for chunk in point.chunk_list.iter() { if value.contains(chunk) { @@ -776,15 +778,15 @@ impl Algorithm { } } let chunk_dict = Self::exponential_smoothing(point.chunk_list.clone(), threshold)?; - version_datadict.insert(point.image_name.clone(), chunk_dict); + version_datadict.insert(point.image_reference.clone(), chunk_dict); } let mut test_by_image = Self::divide_by_image(&test)?; for point in test_by_image.iter_mut() { - if version_datadict.contains_key(&point.image_name.clone()) { + if version_datadict.contains_key(&point.image_reference.clone()) { let mut to_remove = Vec::new(); let mut vec_string = Vec::new(); - let chunkdict_option = version_datadict.get(&point.image_name); + let chunkdict_option = version_datadict.get(&point.image_reference); if let Some(chunkdict) = chunkdict_option { for i in chunkdict { vec_string.push(i.chunk_digest.clone()); @@ -821,7 +823,7 @@ impl Algorithm { #[allow(dead_code)] #[derive(Debug)] struct DataPoint { - image_name: String, + image_reference: String, chunk_list: Vec, visited: bool, clustered: bool, @@ -900,7 +902,7 @@ impl ChunkTable { .map_err(|e| DatabaseError::PoisonError(e.to_string()))?; let mut stmt: rusqlite::Statement<'_> = conn_guard .prepare( - "SELECT id, image_name, version_name, chunk_blob_id, chunk_digest, chunk_compressed_size, + "SELECT id, image_reference, version, chunk_blob_id, chunk_digest, chunk_compressed_size, chunk_uncompressed_size, chunk_compressed_offset, chunk_uncompressed_offset from chunk WHERE chunk_blob_id = ?1 ORDER BY id LIMIT ?2 OFFSET ?3", @@ -1005,8 +1007,8 @@ impl Table for ChunkTable { .execute( "CREATE TABLE IF NOT EXISTS chunk ( id INTEGER PRIMARY KEY, - image_name TEXT, - version_name TEXT, + image_reference TEXT, + version TEXT, chunk_blob_id TEXT NOT NULL, chunk_digest TEXT, chunk_compressed_size INT, @@ -1026,8 +1028,8 @@ impl Table for ChunkTable { .map_err(|e| DatabaseError::PoisonError(e.to_string()))? 
.execute( "INSERT INTO chunk( - image_name, - version_name, + image_reference, + version, chunk_blob_id, chunk_digest, chunk_compressed_size, @@ -1081,7 +1083,7 @@ impl Table for ChunkTable { .map_err(|e| DatabaseError::PoisonError(e.to_string()))?; let mut stmt: rusqlite::Statement<'_> = conn_guard .prepare( - "SELECT id, image_name, version_name, chunk_blob_id, chunk_digest, chunk_compressed_size, + "SELECT id, image_reference, version, chunk_blob_id, chunk_digest, chunk_compressed_size, chunk_uncompressed_size, chunk_compressed_offset, chunk_uncompressed_offset from chunk ORDER BY id LIMIT ?1 OFFSET ?2", )?; From eb211888c0f611228e0c3a74c5bfb04013d219e7 Mon Sep 17 00:00:00 2001 From: Zhao Yuan <1627990440@qq.com> Date: Fri, 22 Dec 2023 08:33:53 +0000 Subject: [PATCH 08/11] Add push chunkdict to registry and add smoke test(support v5 and v6) Signed-off-by: Zhao Yuan <1627990440@qq.com> --- builder/src/lib.rs | 6 +- contrib/nydusify/cmd/nydusify.go | 69 +++- .../pkg/chunkdict/generator/generator.go | 389 +++++++++++++++++- smoke/tests/image_test.go | 102 ++++- src/bin/nydus-image/deduplicate.rs | 6 +- 5 files changed, 535 insertions(+), 37 deletions(-) diff --git a/builder/src/lib.rs b/builder/src/lib.rs index 50d50b61115..d09c2d09f7e 100644 --- a/builder/src/lib.rs +++ b/builder/src/lib.rs @@ -23,6 +23,8 @@ use sha2::Digest; use self::core::node::{Node, NodeInfo}; +pub use self::chunkdict_generator::ChunkdictChunkInfo; +pub use self::chunkdict_generator::Generator; pub use self::compact::BlobCompactor; pub use self::core::bootstrap::Bootstrap; pub use self::core::chunk_dict::{parse_chunk_dict_arg, ChunkDict, HashChunkDict}; @@ -36,16 +38,14 @@ pub use self::core::overlay::{Overlay, WhiteoutSpec}; pub use self::core::prefetch::{Prefetch, PrefetchPolicy}; pub use self::core::tree::{MetadataTreeBuilder, Tree, TreeNode}; pub use self::directory::DirectoryBuilder; -pub use self::chunkdict_generator::ChunkdictChunkInfo; -pub use self::chunkdict_generator::Generator; pub use self::merge::Merger; pub use self::stargz::StargzBuilder; pub use self::tarball::TarballBuilder; +mod chunkdict_generator; mod compact; mod core; mod directory; -mod chunkdict_generator; mod merge; mod stargz; mod tarball; diff --git a/contrib/nydusify/cmd/nydusify.go b/contrib/nydusify/cmd/nydusify.go index 517a6588529..ff00fde1528 100644 --- a/contrib/nydusify/cmd/nydusify.go +++ b/contrib/nydusify/cmd/nydusify.go @@ -656,12 +656,50 @@ func main() { Usage: "One or more Nydus image reference(Multiple images should be split by commas)", EnvVars: []string{"SOURCES"}, }, + &cli.StringFlag{ + Name: "target", + Required: false, + Usage: "Target chunkdict image (Nydus) reference", + EnvVars: []string{"TARGET"}, + }, &cli.BoolFlag{ Name: "source-insecure", Required: false, Usage: "Skip verifying server certs for HTTPS source registry", EnvVars: []string{"SOURCE_INSECURE"}, }, + &cli.BoolFlag{ + Name: "target-insecure", + Required: false, + Usage: "Skip verifying server certs for HTTPS target registry", + EnvVars: []string{"TARGET_INSECURE"}, + }, + + &cli.StringFlag{ + Name: "backend-type", + Value: "", + Usage: "Type of storage backend, possible values: 'oss', 's3'", + EnvVars: []string{"BACKEND_TYPE"}, + }, + &cli.StringFlag{ + Name: "backend-config", + Value: "", + Usage: "Json configuration string for storage backend", + EnvVars: []string{"BACKEND_CONFIG"}, + }, + &cli.PathFlag{ + Name: "backend-config-file", + Value: "", + TakesFile: true, + Usage: "Json configuration file for storage backend", + EnvVars: 
[]string{"BACKEND_CONFIG_FILE"}, + }, + &cli.StringFlag{ + Name: "push-chunk-size", + Value: "0MB", + Usage: "Chunk size for pushing a blob layer in chunked", + }, + &cli.StringFlag{ Name: "work-dir", Value: "./output", @@ -674,6 +712,12 @@ func main() { Usage: "Path to the nydus-image binary, default to search in PATH", EnvVars: []string{"NYDUS_IMAGE"}, }, + + &cli.BoolFlag{ + Name: "all-platforms", + Value: false, + Usage: "Generate chunkdict image for all platforms, conflicts with --platform", + }, &cli.StringFlag{ Name: "platform", Value: "linux/" + runtime.GOARCH, @@ -683,17 +727,40 @@ func main() { Action: func(c *cli.Context) error { setupLogLevel(c) + backendType, backendConfig, err := getBackendConfig(c, "", false) + if err != nil { + return err + } + pushChunkSize, err := humanize.ParseBytes(c.String("push-chunk-size")) + if err != nil { + return errors.Wrap(err, "invalid --push-chunk-size option") + } + if pushChunkSize > 0 { + logrus.Infof("will copy layer with chunk size %s", c.String("push-chunk-size")) + } + _, arch, err := provider.ExtractOsArch(c.String("platform")) if err != nil { return err } generator, err := generator.New(generator.Opt{ - WorkDir: c.String("work-dir"), Sources: c.StringSlice("sources"), + Target: c.String("target"), SourceInsecure: c.Bool("source-insecure"), + TargetInsecure: c.Bool("target-insecure"), + + BackendType: backendType, + BackendConfig: backendConfig, + BackendForcePush: c.Bool("backend-force-push"), + + WorkDir: c.String("work-dir"), NydusImagePath: c.String("nydus-image"), ExpectedArch: arch, + AllPlatforms: c.Bool("all-platforms"), + Platforms: c.String("platform"), + + PushChunkSize: int64(pushChunkSize), }) if err != nil { return err diff --git a/contrib/nydusify/pkg/chunkdict/generator/generator.go b/contrib/nydusify/pkg/chunkdict/generator/generator.go index 1de160295f6..5e91e9390a4 100644 --- a/contrib/nydusify/pkg/chunkdict/generator/generator.go +++ b/contrib/nydusify/pkg/chunkdict/generator/generator.go @@ -1,7 +1,10 @@ package generator import ( + "compress/gzip" "context" + "encoding/json" + "io" "io/fs" "os" "path/filepath" @@ -10,20 +13,49 @@ import ( "github.com/pkg/errors" "github.com/sirupsen/logrus" + "github.com/containerd/containerd/namespaces" + "github.com/dragonflyoss/nydus/contrib/nydusify/pkg/backend" "github.com/dragonflyoss/nydus/contrib/nydusify/pkg/build" "github.com/dragonflyoss/nydus/contrib/nydusify/pkg/parser" - "github.com/dragonflyoss/nydus/contrib/nydusify/pkg/provider" + originprovider "github.com/dragonflyoss/nydus/contrib/nydusify/pkg/provider" + "github.com/goharbor/acceleration-service/pkg/remote" + + "github.com/containerd/nydus-snapshotter/pkg/converter" + "github.com/dragonflyoss/nydus/contrib/nydusify/pkg/converter/provider" "github.com/dragonflyoss/nydus/contrib/nydusify/pkg/utils" + "github.com/dustin/go-humanize" + "github.com/goharbor/acceleration-service/pkg/platformutil" + serverutils "github.com/goharbor/acceleration-service/pkg/utils" + "github.com/opencontainers/go-digest" + "golang.org/x/sync/errgroup" + "golang.org/x/sync/semaphore" + + "github.com/containerd/containerd/content" + containerdErrdefs "github.com/containerd/containerd/errdefs" + "github.com/goharbor/acceleration-service/pkg/errdefs" + ocispec "github.com/opencontainers/image-spec/specs-go/v1" ) // Opt defines Chunkdict generate options. // Note: sources is one or more Nydus image references. 
type Opt struct { - WorkDir string Sources []string + Target string SourceInsecure bool + TargetInsecure bool + + BackendType string + BackendConfig string + BackendForcePush bool + + WorkDir string NydusImagePath string ExpectedArch string + + AllPlatforms bool + Platforms string + + PushChunkSize int64 } // Generator generates chunkdict by deduplicating multiple nydus images @@ -33,12 +65,16 @@ type Generator struct { sourcesParser []*parser.Parser } +type output struct { + Blobs []string +} + // New creates Generator instance. func New(opt Opt) (*Generator, error) { // TODO: support sources image resolver var sourcesParser []*parser.Parser for _, source := range opt.Sources { - sourcesRemote, err := provider.DefaultRemote(source, opt.SourceInsecure) + sourcesRemote, err := originprovider.DefaultRemote(source, opt.SourceInsecure) if err != nil { return nil, errors.Wrap(err, "Init source image parser") } @@ -74,10 +110,16 @@ func (generator *Generator) Generate(ctx context.Context) error { } } - if err := generator.generate(ctx, bootstrapPaths); err != nil { + chunkdictBootstrapPath, outputPath, err := generator.generate(ctx, bootstrapPaths) + if err != nil { return err } - return nil + + if err := generator.push(ctx, chunkdictBootstrapPath, outputPath); err != nil { + return err + } + + return os.RemoveAll(generator.WorkDir) } // Pull the bootstrap of nydus image @@ -104,10 +146,12 @@ func (generator *Generator) pull(ctx context.Context) ([]string, error) { return bootstrapPaths, nil } -func (generator *Generator) generate(ctx context.Context, bootstrapPaths []string) error { +func (generator *Generator) generate(_ context.Context, bootstrapSlice []string) (string, string, error) { // Invoke "nydus-image generate" command currentDir, _ := os.Getwd() builder := build.NewBuilder(generator.NydusImagePath) + + chunkdictBootstrapPath := filepath.Join(generator.WorkDir, "chunkdict_bootstrap") databaseType := "sqlite" var databasePath string if strings.HasPrefix(generator.WorkDir, "/") { @@ -115,16 +159,341 @@ func (generator *Generator) generate(ctx context.Context, bootstrapPaths []strin } else { databasePath = databaseType + "://" + filepath.Join(currentDir, generator.WorkDir, "database.db") } + outputPath := filepath.Join(generator.WorkDir, "nydus_bootstrap_output.json") + if err := builder.Generate(build.GenerateOption{ - BootstrapPaths: bootstrapPaths, - ChunkdictBootstrapPath: filepath.Join(generator.WorkDir, "chunkdict_bootstrap"), + BootstrapPaths: bootstrapSlice, + ChunkdictBootstrapPath: chunkdictBootstrapPath, DatabasePath: databasePath, - OutputPath: filepath.Join(generator.WorkDir, "nydus_bootstrap_output.json"), + OutputPath: outputPath, }); err != nil { - return errors.Wrap(err, "invalid nydus bootstrap format") + return "", "", errors.Wrap(err, "invalid nydus bootstrap format") } logrus.Infof("Successfully generate image chunk dictionary") + return chunkdictBootstrapPath, outputPath, nil +} +func hosts(generator *Generator) remote.HostFunc { + maps := make(map[string]bool) + for _, source := range generator.Sources { + maps[source] = generator.SourceInsecure + } + + maps[generator.Target] = generator.TargetInsecure + return func(ref string) (remote.CredentialFunc, bool, error) { + return remote.NewDockerConfigCredFunc(), maps[ref], nil + } +} + +func (generator *Generator) push(ctx context.Context, chunkdictBootstrapPath string, outputPath string) error { + // Basic configuration + ctx = namespaces.WithNamespace(ctx, "nydusify") + platformMC, err := 
platformutil.ParsePlatforms(generator.AllPlatforms, generator.Platforms) + if err != nil { + return err + } + + pvd, err := provider.New(generator.WorkDir, hosts(generator), 200, "v1", platformMC, generator.PushChunkSize) + if err != nil { + return err + } + + var bkd backend.Backend + if generator.BackendType != "" { + bkd, err = backend.NewBackend(generator.BackendType, []byte(generator.BackendConfig), nil) + if err != nil { + return errors.Wrapf(err, "new backend") + } + } + + // Pull a source image as a template + if err := pvd.Pull(ctx, generator.Sources[0]); err != nil { + if errdefs.NeedsRetryWithHTTP(err) { + pvd.UsePlainHTTP() + if err := pvd.Pull(ctx, generator.Sources[0]); err != nil { + return errors.Wrap(err, "try to pull image") + } + } else { + return errors.Wrap(err, "pull source image") + } + } + logrus.Infof("pulled source image %s", generator.Sources[0]) + sourceImage, err := pvd.Image(ctx, generator.Sources[0]) + if err != nil { + return errors.Wrap(err, "find image from store") + } + sourceDescs, err := serverutils.GetManifests(ctx, pvd.ContentStore(), *sourceImage, platformMC) + if err != nil { + return errors.Wrap(err, "get image manifests") + } + + targetDescs := make([]ocispec.Descriptor, len(sourceDescs)) + + sem := semaphore.NewWeighted(1) + eg := errgroup.Group{} + for idx := range sourceDescs { + func(idx int) { + eg.Go(func() error { + sem.Acquire(context.Background(), 1) + defer sem.Release(1) + sourceDesc := sourceDescs[idx] + targetDesc := &sourceDesc + // Get the blob from backend + if bkd != nil { + descs, _targetDesc, err := pushBlobFromBackend(ctx, pvd, bkd, sourceDesc, *generator, chunkdictBootstrapPath, outputPath) + if err != nil { + return errors.Wrap(err, "get resolver") + } + if _targetDesc != nil { + targetDesc = _targetDesc + store := newStore(pvd.ContentStore(), descs) + pvd.SetContentStore(store) + } + } + targetDescs[idx] = *targetDesc + + if err := pvd.Push(ctx, *targetDesc, generator.Target); err != nil { + if errdefs.NeedsRetryWithHTTP(err) { + pvd.UsePlainHTTP() + if err := pvd.Push(ctx, *targetDesc, generator.Target); err != nil { + return errors.Wrap(err, "try to push image manifest") + } + } else { + return errors.Wrap(err, "push target image manifest") + } + } + return nil + }) + }(idx) + } + if err := eg.Wait(); err != nil { + return errors.Wrap(err, "push image manifests") + } return nil } + +func pushBlobFromBackend( + ctx context.Context, pvd *provider.Provider, bkd backend.Backend, src ocispec.Descriptor, generator Generator, bootstrapPath string, outputPath string, +) ([]ocispec.Descriptor, *ocispec.Descriptor, error) { + manifest := ocispec.Manifest{} + if _, err := serverutils.ReadJSON(ctx, pvd.ContentStore(), &manifest, src); err != nil { + return nil, nil, errors.Wrap(err, "read manifest from store") + } + fsversion := src.Annotations["containerd.io/snapshot/nydus-fs-version"] + // Read the Nydusify output JSON to get the list of blobs + var out output + bytes, err := os.ReadFile(outputPath) + if err != nil { + return nil, nil, errors.Wrap(err, "read output file") + } + if err := json.Unmarshal(bytes, &out); err != nil { + return nil, nil, errors.Wrap(err, "unmarshal output json") + } + + blobIDs := []string{} + blobIDMap := map[string]bool{} + for _, blobID := range out.Blobs { + if blobIDMap[blobID] { + continue + } + blobIDs = append(blobIDs, blobID) + blobIDMap[blobID] = true + } + blobDescs := make([]ocispec.Descriptor, len(blobIDs)) + + eg, ctx := errgroup.WithContext(ctx) + sem := 
semaphore.NewWeighted(int64(provider.LayerConcurrentLimit)) + for idx := range blobIDs { + func(idx int) { + eg.Go(func() error { + sem.Acquire(context.Background(), 1) + defer sem.Release(1) + blobID := blobIDs[idx] + blobDigest := digest.Digest("sha256:" + blobID) + blobSize, err := bkd.Size(blobID) + if err != nil { + return errors.Wrap(err, "get blob size") + } + blobSizeStr := humanize.Bytes(uint64(blobSize)) + + logrus.WithField("digest", blobDigest).WithField("size", blobSizeStr).Infof("pushing blob from backend") + rc, err := bkd.Reader(blobID) + if err != nil { + return errors.Wrap(err, "get blob reader") + } + defer rc.Close() + blobDescs[idx] = ocispec.Descriptor{ + Digest: blobDigest, + Size: blobSize, + MediaType: converter.MediaTypeNydusBlob, + Annotations: map[string]string{ + converter.LayerAnnotationNydusBlob: "true", + }, + } + writer, err := getPushWriter(ctx, pvd, blobDescs[idx], generator.Opt) + if err != nil { + if errdefs.NeedsRetryWithHTTP(err) { + pvd.UsePlainHTTP() + writer, err = getPushWriter(ctx, pvd, blobDescs[idx], generator.Opt) + } + if err != nil { + return errors.Wrap(err, "get push writer") + } + } + if writer != nil { + defer writer.Close() + return content.Copy(ctx, writer, rc, blobSize, blobDigest) + } + + logrus.WithField("digest", blobDigest).WithField("size", blobSizeStr).Infof("pushed blob from backend") + + return nil + }) + }(idx) + } + + if err := eg.Wait(); err != nil { + return nil, nil, errors.Wrap(err, "push blobs") + } + + // Update manifest blob layers + manifest.Layers = nil + manifest.Layers = append(blobDescs, manifest.Layers...) + + // Update bootstrap + cw, err := content.OpenWriter(ctx, pvd.ContentStore(), content.WithRef("merge-bootstrap")) + if err != nil { + return nil, nil, errors.Wrap(err, "open content store writer") + } + defer cw.Close() + + bootstrapPathTar := "image/image.boot" + rc, err := utils.PackTargz(bootstrapPath, bootstrapPathTar, false) + if err != nil { + return nil, nil, errors.Wrap(err, "get bootstrap reader") + } + defer rc.Close() + + gw := gzip.NewWriter(cw) + uncompressedDgst := digest.SHA256.Digester() + compressed := io.MultiWriter(gw, uncompressedDgst.Hash()) + + buffer := make([]byte, 32*1024) + if _, err := io.CopyBuffer(compressed, rc, buffer); err != nil { + return nil, nil, errors.Wrapf(err, "copy bootstrap targz into content store") + } + if err := gw.Close(); err != nil { + return nil, nil, errors.Wrap(err, "close gzip writer") + } + + compressedDgst := cw.Digest() + if err := cw.Commit(ctx, 0, compressedDgst, content.WithLabels(map[string]string{ + "containerd.io/uncompressed": uncompressedDgst.Digest().String(), + })); err != nil { + if !containerdErrdefs.IsAlreadyExists(err) { + return nil, nil, errors.Wrap(err, "commit to content store") + } + } + if err := cw.Close(); err != nil { + return nil, nil, errors.Wrap(err, "close content store writer") + } + + bootstrapInfo, err := pvd.ContentStore().Info(ctx, compressedDgst) + if err != nil { + return nil, nil, errors.Wrap(err, "get info from content store") + } + bootstrapSize := bootstrapInfo.Size + + bootstrapDesc := ocispec.Descriptor{ + Digest: compressedDgst, + Size: bootstrapSize, + MediaType: "application/vnd.docker.image.rootfs.diff.tar.gzip", + Annotations: map[string]string{ + "containerd.io/snapshot/nydus-bootstrap": "true", + "containerd.io/snapshot/nydus-fs-version": fsversion, + }, + } + manifest.Layers = append(manifest.Layers, bootstrapDesc) + + // Update image config + blobDigests := []digest.Digest{} + for idx := range blobDescs 
{ + blobDigests = append(blobDigests, blobDescs[idx].Digest) + } + + config := ocispec.Image{} + if _, err := serverutils.ReadJSON(ctx, pvd.ContentStore(), &config, manifest.Config); err != nil { + return nil, nil, errors.Wrap(err, "read config json") + } + config.RootFS.DiffIDs = nil + config.RootFS.DiffIDs = append(blobDigests, config.RootFS.DiffIDs...) + config.RootFS.DiffIDs = append(config.RootFS.DiffIDs, digest.Digest(uncompressedDgst.Digest().String())) + configDesc, err := serverutils.WriteJSON(ctx, pvd.ContentStore(), config, manifest.Config, generator.Target, nil) + if err != nil { + return nil, nil, errors.Wrap(err, "write config json") + } + manifest.Config = *configDesc + target, err := serverutils.WriteJSON(ctx, pvd.ContentStore(), &manifest, src, generator.Target, nil) + if err != nil { + return nil, nil, errors.Wrap(err, "write manifest json") + } + + return blobDescs, target, nil +} + +func getPushWriter(ctx context.Context, pvd *provider.Provider, desc ocispec.Descriptor, opt Opt) (content.Writer, error) { + resolver, err := pvd.Resolver(opt.Target) + if err != nil { + return nil, errors.Wrap(err, "get resolver") + } + + ref := opt.Target + if !strings.Contains(ref, "@") { + ref = ref + "@" + desc.Digest.String() + } + pusher, err := resolver.Pusher(ctx, ref) + if err != nil { + return nil, errors.Wrap(err, "create pusher") + } + writer, err := pusher.Push(ctx, desc) + if err != nil { + if containerdErrdefs.IsAlreadyExists(err) { + return nil, nil + } + return nil, err + } + + return writer, nil +} + +type store struct { + content.Store + remotes []ocispec.Descriptor +} + +func newStore(base content.Store, remotes []ocispec.Descriptor) *store { + return &store{ + Store: base, + remotes: remotes, + } +} + +func (s *store) Info(ctx context.Context, dgst digest.Digest) (content.Info, error) { + info, err := s.Store.Info(ctx, dgst) + if err != nil { + if !containerdErrdefs.IsNotFound(err) { + return content.Info{}, err + } + for _, desc := range s.remotes { + if desc.Digest == dgst { + return content.Info{ + Digest: desc.Digest, + Size: desc.Size, + }, nil + } + } + return content.Info{}, err + } + return info, nil +} diff --git a/smoke/tests/image_test.go b/smoke/tests/image_test.go index 9665f73c198..922da1464d3 100644 --- a/smoke/tests/image_test.go +++ b/smoke/tests/image_test.go @@ -5,7 +5,9 @@ package tests import ( + "encoding/json" "fmt" + "os" "path/filepath" "testing" @@ -141,49 +143,109 @@ func (i *ImageTestSuite) TestConvertAndCopyImage(t *testing.T, ctx tool.Context, } func (i *ImageTestSuite) TestGenerateChunkdict() test.Generator { - return func() (name string, testCase test.Case) { + return func() (name string, testCase test.Case) { imagename1 := "redis:7.0.1" imagename2 := "redis:7.0.2" imagename3 := "redis:7.0.3" image1 := i.prepareImage(i.T, imagename1) image2 := i.prepareImage(i.T, imagename2) image3 := i.prepareImage(i.T, imagename3) - ctx := tool.DefaultContext(i.T) + ctx := tool.DefaultContext(i.T) - // Prepare work directory - ctx.PrepareWorkDir(i.T) - defer ctx.Destroy(i.T) + // Prepare work directory + ctx.PrepareWorkDir(i.T) + defer ctx.Destroy(i.T) - logLevel := "--log-level warn" - nydusifyPath := ctx.Binary.Nydusify + logLevel := "--log-level warn" + nydusifyPath := ctx.Binary.Nydusify - target1 := fmt.Sprintf("%s-nydus-%s", image1, uuid.NewString()) - target2 := fmt.Sprintf("%s-nydus-%s", image2, uuid.NewString()) - target3 := fmt.Sprintf("%s-nydus-%s", image3, uuid.NewString()) - convertCmd1 := fmt.Sprintf( + // Test v6 + target1v6 := 
fmt.Sprintf("%s-nydus-%s", image1, uuid.NewString()) + target2v6 := fmt.Sprintf("%s-nydus-%s", image2, uuid.NewString()) + target3v6 := fmt.Sprintf("%s-nydus-%s", image3, uuid.NewString()) + convertCmd1 := fmt.Sprintf( "%s %s convert --source %s --target %s --nydus-image %s --work-dir %s", - ctx.Binary.Nydusify, logLevel, image1, target1, ctx.Binary.Builder, ctx.Env.TempDir, + ctx.Binary.Nydusify, logLevel, image1, target1v6, ctx.Binary.Builder, ctx.Env.TempDir, ) tool.RunWithoutOutput(i.T, convertCmd1) convertCmd2 := fmt.Sprintf( "%s %s convert --source %s --target %s --nydus-image %s --work-dir %s", - ctx.Binary.Nydusify, logLevel, image1, target2, ctx.Binary.Builder, ctx.Env.TempDir, + ctx.Binary.Nydusify, logLevel, image2, target2v6, ctx.Binary.Builder, ctx.Env.TempDir, ) tool.RunWithoutOutput(i.T, convertCmd2) convertCmd3 := fmt.Sprintf( "%s %s convert --source %s --target %s --nydus-image %s --work-dir %s", - ctx.Binary.Nydusify, logLevel, image1, target3, ctx.Binary.Builder, ctx.Env.TempDir, + ctx.Binary.Nydusify, logLevel, image3, target3v6, ctx.Binary.Builder, ctx.Env.TempDir, ) tool.RunWithoutOutput(i.T, convertCmd3) - target := fmt.Sprintf("%s,%s,%s", target1, target2, target3) - + + backendtype := "--backend-type oss" + sourceinsecure := "--source-insecure" + targetinsecure := "--target-insecure" + + jsonData := `{ + "endpoint": "oss-cn-zhangjiakou.aliyuncs.com", + "access_key_id": "LTAI5tKHuSQQXVjSE7PgKYhf", + "access_key_secret": "FBYp1JDxlIZt8cCpFWpq3j9HYokw8a", + "bucket_name": "testcompact1" + }` + + formattedData, err := json.MarshalIndent(json.RawMessage(jsonData), "", " ") + if err != nil { + fmt.Println("Error marshalling JSON:", err) + return + } + os.WriteFile("output.json", formattedData, 0644) + + backendconfigfile := "--backend-config-file output.json" + + targetv6 := fmt.Sprintf("%s,%s,%s", target1v6, target2v6, target3v6) + chunkdictv6 := fmt.Sprintf("%s-nydus-%s", image1, uuid.NewString()) + generateCmd := fmt.Sprintf( - "%s %s chunkdict generate --sources %s --nydus-image %s --work-dir %s", - nydusifyPath, logLevel, target, ctx.Binary.Builder, ctx.Env.TempDir, + "%s %s chunkdict generate --sources %s --target %s %s %s %s %s --nydus-image %s --work-dir %s", + nydusifyPath, logLevel, targetv6, chunkdictv6, sourceinsecure, targetinsecure, backendtype, backendconfigfile, ctx.Binary.Builder, filepath.Join(ctx.Env.WorkDir, "generate"), ) tool.RunWithoutOutput(i.T, generateCmd) - return "generateChunkdict", nil - } + + checkCmd := fmt.Sprintf( + "%s %s check --target %s --nydus-image %s --nydusd %s --work-dir %s", + nydusifyPath, logLevel, chunkdictv6, ctx.Binary.Builder, ctx.Binary.Nydusd, filepath.Join(ctx.Env.WorkDir, "check"), + ) + tool.RunWithoutOutput(i.T, checkCmd) + + // Test v5 + fsversion := "--fs-version 5" + target1v5 := fmt.Sprintf("%s-nydus5-%s", image1, uuid.NewString()) + target2v5 := fmt.Sprintf("%s-nydus5-%s", image2, uuid.NewString()) + target3v5 := fmt.Sprintf("%s-nydus5-%s", image3, uuid.NewString()) + convertCmd4 := fmt.Sprintf( + "%s %s convert --source %s --target %s --nydus-image %s %s --work-dir %s", + ctx.Binary.Nydusify, logLevel, image1, target1v5, ctx.Binary.Builder, fsversion, ctx.Env.TempDir, + ) + tool.RunWithoutOutput(i.T, convertCmd4) + convertCmd5 := fmt.Sprintf( + "%s %s convert --source %s --target %s --nydus-image %s %s --work-dir %s", + ctx.Binary.Nydusify, logLevel, image2, target2v5, ctx.Binary.Builder, fsversion, ctx.Env.TempDir, + ) + tool.RunWithoutOutput(i.T, convertCmd5) + convertCmd6 := fmt.Sprintf( + "%s %s convert 
--source %s --target %s --nydus-image %s %s --work-dir %s", + ctx.Binary.Nydusify, logLevel, image3, target3v5, ctx.Binary.Builder, fsversion, ctx.Env.TempDir, + ) + tool.RunWithoutOutput(i.T, convertCmd6) + + targetv5 := fmt.Sprintf("%s,%s,%s", target1v5, target2v5, target3v5) + chunkdictv5 := fmt.Sprintf("%s-nydus5-%s", image1, uuid.NewString()) + + generateCmd2 := fmt.Sprintf( + "%s %s chunkdict generate --sources %s --target %s %s %s %s %s --nydus-image %s --work-dir %s", + nydusifyPath, logLevel, targetv5, chunkdictv5, sourceinsecure, targetinsecure, backendtype, backendconfigfile, ctx.Binary.Builder, filepath.Join(ctx.Env.WorkDir, "generate"), + ) + tool.RunWithoutOutput(i.T, generateCmd2) + + return "generateChunkdict", nil + } } func (i *ImageTestSuite) prepareImage(t *testing.T, image string) string { diff --git a/src/bin/nydus-image/deduplicate.rs b/src/bin/nydus-image/deduplicate.rs index fa090cf648b..e27d3359b41 100644 --- a/src/bin/nydus-image/deduplicate.rs +++ b/src/bin/nydus-image/deduplicate.rs @@ -307,7 +307,8 @@ impl Algorithm { chunkdict_size as f64 / 1024 as f64 / 1024 as f64 ); for chunk in all_chunks { - if !core_image.contains(&chunk.image_reference) && !noise_points.contains(&chunk.image_reference) + if !core_image.contains(&chunk.image_reference) + && !noise_points.contains(&chunk.image_reference) { noise_points.push(chunk.image_reference.clone()); } @@ -336,8 +337,7 @@ impl Algorithm { let smoothed_score: f64 = 0.0; smoothed_data.push(smoothed_score); } else { - if all_chunks[chunk_index - 1].version != all_chunks[chunk_index].version - { + if all_chunks[chunk_index - 1].version != all_chunks[chunk_index].version { last_start_version_index = start_version_index; start_version_index = chunk_index; last_end_version_index = chunk_index - 1; From eae2b55996709f995a9da9bde8e55b9934ab5f25 Mon Sep 17 00:00:00 2001 From: Lin Wang Date: Tue, 9 Jan 2024 14:59:47 +0800 Subject: [PATCH 09/11] Revise based on comments --- builder/src/chunkdict_generator.rs | 17 +- builder/src/generate.rs | 257 ------------------ contrib/nydusify/cmd/nydusify.go | 14 - .../pkg/chunkdict/generator/generator.go | 92 ++++--- docs/chunk-deduplication.md | 76 +++--- smoke/tests/image_test.go | 74 +++-- src/bin/nydus-image/deduplicate.rs | 21 +- src/bin/nydus-image/main.rs | 71 +++-- 8 files changed, 215 insertions(+), 407 deletions(-) delete mode 100644 builder/src/generate.rs diff --git a/builder/src/chunkdict_generator.rs b/builder/src/chunkdict_generator.rs index ed2b4b01d87..354b6014e08 100644 --- a/builder/src/chunkdict_generator.rs +++ b/builder/src/chunkdict_generator.rs @@ -52,7 +52,7 @@ impl Generator { blob_mgr: &mut BlobManager, chunkdict_origin: Vec, ) -> Result { - // validate and remove chunks which bloned blob size is smaller than block. + // Validate and remove chunks whose belonged blob sizes are smaller than a block. 
let mut chunkdict = chunkdict_origin.to_vec(); Self::validate_and_remove_chunks(ctx, &mut chunkdict); @@ -136,8 +136,8 @@ impl Generator { inode.set_blocks(256); let node_info = NodeInfo { explicit_uidgid: true, - src_dev: 66305, - src_ino: 24772610, + src_dev: 0, + src_ino: 0, rdev: 0, source: PathBuf::from("/"), path: PathBuf::from("/"), @@ -171,8 +171,8 @@ impl Generator { inode.set_blocks(256); let node_info = NodeInfo { explicit_uidgid: true, - src_dev: 66305, - src_ino: 24775126, + src_dev: 0, + src_ino: 1, rdev: 0, source: PathBuf::from("/"), path: PathBuf::from("/chunkdict"), @@ -211,17 +211,14 @@ impl Generator { node: &mut Node, chunkdict: &[ChunkdictChunkInfo], ) -> Result<()> { - for chunk_info in chunkdict.iter() { + for (i, chunk_info) in chunkdict.iter().enumerate() { let chunk_size: u32 = chunk_info.chunk_compressed_size; - let file_offset = 1 as u64 * chunk_size as u64; + let file_offset = i as u64 * chunk_size as u64; let mut chunk = ChunkWrapper::new(ctx.fs_version); // update blob context let (blob_index, blob_ctx) = blob_mgr.get_or_cerate_blob_for_chunkdict(ctx, &chunk_info.chunk_blob_id)?; - if blob_ctx.blob_id.is_empty() { - blob_ctx.blob_id = chunk_info.chunk_blob_id.clone(); - } let chunk_uncompressed_size = chunk_info.chunk_uncompressed_size; let pre_d_offset = blob_ctx.current_uncompressed_offset; blob_ctx.uncompressed_blob_size = pre_d_offset + chunk_uncompressed_size as u64; diff --git a/builder/src/generate.rs b/builder/src/generate.rs deleted file mode 100644 index 576142b926c..00000000000 --- a/builder/src/generate.rs +++ /dev/null @@ -1,257 +0,0 @@ -// Copyright (C) 2022 Nydus Developers. All rights reserved. -// -// SPDX-License-Identifier: Apache-2.0 - -//! Generate Chunkdict RAFS bootstrap. -//! Bug 1: Inconsistent Chunk Size Leading to Blob Size Less Than 4K -//! Description: The size of chunks is not consistent, which results in the possibility that a blob, composed of a group of these chunks, may be less than 4K in size. This inconsistency leads to a failure in passing the size check. -//! Bug 2: Incorrect Chunk Number Calculation Due to Premature Check Logic -//! Description: The current logic for calculating the chunk number is based on the formula size/chunk size. However, this approach is flawed as it precedes the actual check which accounts for chunk statistics. Consequently, this leads to inaccurate counting of chunk numbers. - -use super::core::node::{ChunkSource, NodeInfo}; -use super::{BlobManager, Bootstrap, BootstrapManager, BuildContext, BuildOutput, Tree}; -use crate::core::node::Node; -use crate::NodeChunk; -use anyhow::Result; -use nydus_rafs::metadata::chunk::ChunkWrapper; -use nydus_rafs::metadata::inode::InodeWrapper; -use nydus_rafs::metadata::layout::RafsXAttrs; -use nydus_rafs::metadata::RafsVersion; -use nydus_storage::meta::BlobChunkInfoV1Ondisk; -use nydus_utils::digest::RafsDigest; -use nydus_utils::lazy_drop; -use std::ffi::OsString; -use std::mem::size_of; -use std::path::PathBuf; -use std::sync::Arc; -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub struct ChunkdictChunkInfo { - pub image_name: String, - pub version_name: String, - pub chunk_blob_id: String, - pub chunk_digest: String, - pub chunk_compressed_size: u32, - pub chunk_uncompressed_size: u32, - pub chunk_compressed_offset: u64, - pub chunk_uncompressed_offset: u64, -} - -/// Struct to Generater chunkdict RAFS bootstrap. -pub struct Generater {} - -impl Generater { - // Generate chunkdict RAFS bootstrap. 
- #[allow(clippy::too_many_arguments)] - pub fn generate( - ctx: &mut BuildContext, - bootstrap_mgr: &mut BootstrapManager, - blob_mgr: &mut BlobManager, - chunkdict_origin: Vec, - ) -> Result { - // validate and remove chunks which bloned blob size is smaller than block. - let mut chunkdict = chunkdict_origin.to_vec(); - Self::validate_and_remove_chunks(&mut chunkdict, ctx); - - // build root tree - let mut tree = Self::build_root_tree()?; - - // build child tree - let child = Self::build_child_tree(ctx, blob_mgr, &chunkdict)?; - let result = vec![child]; - tree.children = result; - tree.lock_node() - .v5_set_dir_size(ctx.fs_version, &tree.children); - - Self::validate_tree(&tree)?; - - // build bootstrap - let mut bootstrap_ctx = bootstrap_mgr.create_ctx()?; - let mut bootstrap = Bootstrap::new(tree)?; - bootstrap.build(ctx, &mut bootstrap_ctx)?; - - let blob_table = blob_mgr.to_blob_table(ctx)?; - let storage = &mut bootstrap_mgr.bootstrap_storage; - bootstrap.dump(ctx, storage, &mut bootstrap_ctx, &blob_table)?; - - lazy_drop(bootstrap_ctx); - - BuildOutput::new(blob_mgr, &bootstrap_mgr.bootstrap_storage) - } - - /// validate tree - fn validate_tree(tree: &Tree) -> Result<()> { - let pre = &mut |t: &Tree| -> Result<()> { - let node = t.lock_node(); - debug!("chunkdict tree: "); - debug!("inode: {}", node); - for chunk in &node.chunks { - debug!("\t chunk: {}", chunk); - } - Ok(()) - }; - tree.walk_dfs_pre(pre)?; - debug!("chunkdict tree is valid."); - Ok(()) - } - - /// check blob uncompressed size is bigger than block - fn validate_and_remove_chunks(chunkdict: &mut Vec, ctx: &mut BuildContext) { - let mut chunk_sizes = std::collections::HashMap::new(); - - // Accumulate the uncompressed size for each chunk_blob_id - for chunk in chunkdict.iter() { - *chunk_sizes.entry(chunk.chunk_blob_id.clone()).or_insert(0) += - chunk.chunk_uncompressed_size as u64; - } - - // Find all chunk_blob_ids with a total uncompressed size > 4096 - let small_chunks: Vec = chunk_sizes - .into_iter() - .filter(|&(_, size)| size < ctx.v6_block_size()) - .inspect(|(id, _)| { - eprintln!( - "Warning: Blob with id '{}' is smaller than {} bytes.", - id, - ctx.v6_block_size() - ) - }) - .map(|(id, _)| id) - .collect(); - - // Retain only chunks with chunk_blob_id that has a total uncompressed size > 4096 - chunkdict.retain(|chunk| !small_chunks.contains(&chunk.chunk_blob_id)); - } - - /// Build root tree - pub fn build_root_tree() -> Result { - // inode - let mut inode = InodeWrapper::new(RafsVersion::V6); - inode.set_ino(0); - inode.set_uid(1000); - inode.set_gid(1000); - inode.set_projid(0); - inode.set_mode(0o660 | libc::S_IFDIR as u32); - inode.set_nlink(1); - inode.set_name_size("/".len()); - inode.set_rdev(0); - inode.set_blocks(256); - let node_info = NodeInfo { - explicit_uidgid: true, - src_dev: 66305, - src_ino: 24772610, - rdev: 0, - source: PathBuf::from("/"), - path: PathBuf::from("/"), - target: PathBuf::from("/"), - target_vec: vec![OsString::from("/")], - symlink: None, - xattrs: RafsXAttrs::default(), - v6_force_extended_inode: true, - }; - let root_node = Node::new(inode, node_info, 0); - let tree = Tree::new(root_node); - Ok(tree) - } - - /// Build child tree - fn build_child_tree( - ctx: &mut BuildContext, - blob_mgr: &mut BlobManager, - chunkdict: &[ChunkdictChunkInfo], - ) -> Result { - // node - let mut inode = InodeWrapper::new(RafsVersion::V6); - inode.set_ino(1); - inode.set_uid(0); - inode.set_gid(0); - inode.set_projid(0); - inode.set_mode(0o660 | libc::S_IFREG as u32); - 
inode.set_nlink(1); - inode.set_name_size("chunkdict".len()); - inode.set_rdev(0); - inode.set_blocks(256); - let node_info = NodeInfo { - explicit_uidgid: true, - src_dev: 66305, - src_ino: 24775126, - rdev: 0, - source: PathBuf::from("/"), - path: PathBuf::from("/chunkdict"), - target: PathBuf::from("/chunkdict"), - target_vec: vec![OsString::from("/"), OsString::from("/chunkdict")], - symlink: None, - xattrs: RafsXAttrs::new(), - v6_force_extended_inode: true, - }; - let mut node = Node::new(inode, node_info, 0); - - // insert chunks - Self::insert_chunks(ctx, blob_mgr, &mut node, chunkdict)?; - - let node_size: u64 = node - .chunks - .iter() - .map(|chunk| chunk.inner.uncompressed_size() as u64) - .sum(); - node.inode.set_size(node_size); - - // update child count - node.inode.set_child_count(node.chunks.len() as u32); - - let child = Tree::new(node); - child - .lock_node() - .v5_set_dir_size(ctx.fs_version, &child.children); - Ok(child) - } - - /// Insert chunks - fn insert_chunks( - ctx: &mut BuildContext, - blob_mgr: &mut BlobManager, - node: &mut Node, - chunkdict: &[ChunkdictChunkInfo], - ) -> Result<()> { - for chunk_info in chunkdict.iter() { - let chunk_size: u32 = chunk_info.chunk_compressed_size; - let file_offset = 1 as u64 * chunk_size as u64; - ctx.fs_version = RafsVersion::V6; - let mut chunk = ChunkWrapper::new(RafsVersion::V6); - - // update blob context - let (blob_index, blob_ctx) = - blob_mgr.get_or_cerate_blob_for_chunkdict(ctx, &chunk_info.chunk_blob_id)?; - if blob_ctx.blob_id.is_empty() { - blob_ctx.blob_id = chunk_info.chunk_blob_id.clone(); - } - let chunk_uncompressed_size = chunk_info.chunk_uncompressed_size; - let pre_d_offset = blob_ctx.current_uncompressed_offset; - blob_ctx.uncompressed_blob_size = pre_d_offset + chunk_uncompressed_size as u64; - blob_ctx.current_uncompressed_offset += chunk_uncompressed_size as u64; - - blob_ctx.blob_meta_header.set_ci_uncompressed_size( - blob_ctx.blob_meta_header.ci_uncompressed_size() - + size_of::() as u64, - ); - - // update chunk - let chunk_index = blob_ctx.alloc_chunk_index()?; - chunk.set_blob_index(blob_index); - chunk.set_index(chunk_index); - chunk.set_file_offset(file_offset); - chunk.set_compressed_size(chunk_info.chunk_compressed_size); - chunk.set_compressed_offset(chunk_info.chunk_compressed_offset); - chunk.set_uncompressed_size(chunk_info.chunk_uncompressed_size); - chunk.set_uncompressed_offset(chunk_info.chunk_uncompressed_offset); - chunk.set_id(RafsDigest::from_string(&chunk_info.chunk_digest)); - - debug!("chunk id: {}", chunk.id()); - - node.chunks.push(NodeChunk { - source: ChunkSource::Build, - inner: Arc::new(chunk.clone()), - }); - } - Ok(()) - } -} diff --git a/contrib/nydusify/cmd/nydusify.go b/contrib/nydusify/cmd/nydusify.go index ff00fde1528..94d06fe70b0 100644 --- a/contrib/nydusify/cmd/nydusify.go +++ b/contrib/nydusify/cmd/nydusify.go @@ -694,11 +694,6 @@ func main() { Usage: "Json configuration file for storage backend", EnvVars: []string{"BACKEND_CONFIG_FILE"}, }, - &cli.StringFlag{ - Name: "push-chunk-size", - Value: "0MB", - Usage: "Chunk size for pushing a blob layer in chunked", - }, &cli.StringFlag{ Name: "work-dir", @@ -731,13 +726,6 @@ func main() { if err != nil { return err } - pushChunkSize, err := humanize.ParseBytes(c.String("push-chunk-size")) - if err != nil { - return errors.Wrap(err, "invalid --push-chunk-size option") - } - if pushChunkSize > 0 { - logrus.Infof("will copy layer with chunk size %s", c.String("push-chunk-size")) - } _, arch, err := 
provider.ExtractOsArch(c.String("platform")) if err != nil { @@ -759,8 +747,6 @@ func main() { ExpectedArch: arch, AllPlatforms: c.Bool("all-platforms"), Platforms: c.String("platform"), - - PushChunkSize: int64(pushChunkSize), }) if err != nil { return err diff --git a/contrib/nydusify/pkg/chunkdict/generator/generator.go b/contrib/nydusify/pkg/chunkdict/generator/generator.go index 5e91e9390a4..15effe4ec21 100644 --- a/contrib/nydusify/pkg/chunkdict/generator/generator.go +++ b/contrib/nydusify/pkg/chunkdict/generator/generator.go @@ -54,8 +54,6 @@ type Opt struct { AllPlatforms bool Platforms string - - PushChunkSize int64 } // Generator generates chunkdict by deduplicating multiple nydus images @@ -119,7 +117,8 @@ func (generator *Generator) Generate(ctx context.Context) error { return err } - return os.RemoveAll(generator.WorkDir) + // return os.RemoveAll(generator.WorkDir) + return nil } // Pull the bootstrap of nydus image @@ -147,7 +146,7 @@ func (generator *Generator) pull(ctx context.Context) ([]string, error) { } func (generator *Generator) generate(_ context.Context, bootstrapSlice []string) (string, string, error) { - // Invoke "nydus-image generate" command + // Invoke "nydus-image chunkdict generate" command currentDir, _ := os.Getwd() builder := build.NewBuilder(generator.NydusImagePath) @@ -194,7 +193,7 @@ func (generator *Generator) push(ctx context.Context, chunkdictBootstrapPath str return err } - pvd, err := provider.New(generator.WorkDir, hosts(generator), 200, "v1", platformMC, generator.PushChunkSize) + pvd, err := provider.New(generator.WorkDir, hosts(generator), 200, "v1", platformMC, 0) if err != nil { return err } @@ -207,17 +206,20 @@ func (generator *Generator) push(ctx context.Context, chunkdictBootstrapPath str } } - // Pull a source image as a template - if err := pvd.Pull(ctx, generator.Sources[0]); err != nil { - if errdefs.NeedsRetryWithHTTP(err) { - pvd.UsePlainHTTP() - if err := pvd.Pull(ctx, generator.Sources[0]); err != nil { - return errors.Wrap(err, "try to pull image") + // Pull source image + for index := range generator.Sources { + if err := pvd.Pull(ctx, generator.Sources[index]); err != nil { + if errdefs.NeedsRetryWithHTTP(err) { + pvd.UsePlainHTTP() + if err := pvd.Pull(ctx, generator.Sources[index]); err != nil { + return errors.Wrap(err, "try to pull image") + } + } else { + return errors.Wrap(err, "pull source image") } - } else { - return errors.Wrap(err, "pull source image") } } + logrus.Infof("pulled source image %s", generator.Sources[0]) sourceImage, err := pvd.Image(ctx, generator.Sources[0]) if err != nil { @@ -239,18 +241,18 @@ func (generator *Generator) push(ctx context.Context, chunkdictBootstrapPath str defer sem.Release(1) sourceDesc := sourceDescs[idx] targetDesc := &sourceDesc + // Get the blob from backend - if bkd != nil { - descs, _targetDesc, err := pushBlobFromBackend(ctx, pvd, bkd, sourceDesc, *generator, chunkdictBootstrapPath, outputPath) - if err != nil { - return errors.Wrap(err, "get resolver") - } - if _targetDesc != nil { - targetDesc = _targetDesc - store := newStore(pvd.ContentStore(), descs) - pvd.SetContentStore(store) - } + descs, _targetDesc, err := pushBlobFromBackend(ctx, pvd, bkd, sourceDesc, *generator, chunkdictBootstrapPath, outputPath) + if err != nil { + return errors.Wrap(err, "get resolver") } + if _targetDesc != nil { + targetDesc = _targetDesc + store := newStore(pvd.ContentStore(), descs) + pvd.SetContentStore(store) + } + targetDescs[idx] = *targetDesc if err := pvd.Push(ctx, *targetDesc, 
generator.Target); err != nil { @@ -309,20 +311,45 @@ func pushBlobFromBackend( eg.Go(func() error { sem.Acquire(context.Background(), 1) defer sem.Release(1) + blobID := blobIDs[idx] blobDigest := digest.Digest("sha256:" + blobID) - blobSize, err := bkd.Size(blobID) - if err != nil { - return errors.Wrap(err, "get blob size") - } - blobSizeStr := humanize.Bytes(uint64(blobSize)) - logrus.WithField("digest", blobDigest).WithField("size", blobSizeStr).Infof("pushing blob from backend") - rc, err := bkd.Reader(blobID) - if err != nil { - return errors.Wrap(err, "get blob reader") + var blobSize int64 + var rc io.ReadCloser + + if bkd != nil { + rc, err = bkd.Reader(blobID) + if err != nil { + return errors.Wrap(err, "get blob reader") + } + blobSize, err = bkd.Size(blobID) + if err != nil { + return errors.Wrap(err, "get blob size") + } + } else { + imageDesc, err := generator.sourcesParser[0].Remote.Resolve(ctx) + if err != nil { + if strings.Contains(err.Error(), "x509: certificate signed by unknown authority") { + logrus.Warningln("try to enable \"--source-insecure\" / \"--target-insecure\" option") + } + return errors.Wrap(err, "resolve image") + } + rc, err = generator.sourcesParser[0].Remote.Pull(ctx, *imageDesc, true) + if err != nil { + return errors.Wrap(err, "get blob reader") + } + blobInfo, err := pvd.ContentStore().Info(ctx, blobDigest) + if err != nil { + return errors.Wrap(err, "get info from content store") + } + blobSize = blobInfo.Size } defer rc.Close() + + blobSizeStr := humanize.Bytes(uint64(blobSize)) + logrus.WithField("digest", blobDigest).WithField("size", blobSizeStr).Infof("pushing blob from backend") + blobDescs[idx] = ocispec.Descriptor{ Digest: blobDigest, Size: blobSize, @@ -349,6 +376,7 @@ func pushBlobFromBackend( logrus.WithField("digest", blobDigest).WithField("size", blobSizeStr).Infof("pushed blob from backend") return nil + }) }(idx) } diff --git a/docs/chunk-deduplication.md b/docs/chunk-deduplication.md index 97a8db2c3f4..259169551d1 100644 --- a/docs/chunk-deduplication.md +++ b/docs/chunk-deduplication.md @@ -1,8 +1,8 @@ -# Notice [WIP] Pending further revisionsNotice -# Probntroduction +# Probntroduction In container images, there are often a large number of duplicate files or content, and these duplicate parts occupy a large amount of storage space, especially in high-density deployment scenarios. As the number of Nydus images grows, it will bring many problems such as low storage space utilization and excessive consumption of bandwidth resources. To do this, an effective deduplication mechanism (deduplication) needs to be designed to solve this problem. Unlike traditional OCI, which distributes images at a layer-granular level, the smallest unit of a Nydus image is a chunk, so the deduplication algorithm needs to be deduplicated in chunk units. At the same time, we want to deduplicate multiple aspects of the Nydus image, including between Nydus images and between different versions of the same Nydus image. No matter which deduplication method is essentially to deduplicate the repeated chunks in the image, only one duplicate chunk is retained, and the reference to the chunk is used instead of other duplicate chunks to reduce the storage space occupation, so as to maximize the data transmission and storage capabilities of Nydus and improve the access speed and efficiency of the image. 
+ # General idea The deduplication algorithm first needs to select the duplicate chunk in the image according to the image information such as the number of occurrences of chunk, chunk size, chunk image to which the chunk belongs and the corresponding version, and generate chunkdict, chunkdict records the unique identifier or fingerprint of chunk, only need to store chunkdict, other images can refer to chunk in chunkdict by reference. @@ -13,32 +13,43 @@ The deduplication algorithm is divided into two parts, the first part is the DBS 2. Extract the image information and call the DBSCAN clustering algorithm to deduplicate different images. 3. Deduplicate the dictionary content in 2, and call the exponential smoothing algorithm for each image separately for image version deduplication. 4. Get the deduplication dictionary generated by running the two algorithms and drop the disk. +5. Generate a chunkdict image and push it to the remote repository. # Algorithm detailed process ## Overall Input ```shell nydusify chunkdict generate --sources \ - localhost:5000:redis:nydus_7.0.1, \ - localhost:5000:redis:nydus_7.0.2,\ - localhost:5000:redis:nydus_7.0.3 \ + registry.com/redis:nydus_7.0.1, \ + registry.com/redis:nydus_7.0.2, \ + registry.com/redis:nydus_7.0.3 \ + --target registry.com/redis:nydus_chunkdict \ + --source-insecure --target-insecure + # Optional + --backend-config-file /path/to/backend-config.json \ + --backend-type oss +``` + +# Use the chunkdict image to reduce the incremental size of the new image +```shell +nydusify convert \ + --source registry.com/redis:OCI_7.0.4 \ + --target registry.com/redis:nydus_7.0.4 \ + --chunk-dict registry.com/redis:nydus_chunkdict ``` -*** -`nydusify chunkdict generate` calls two commands `nydus-image chunkdict save` and `nydus-image chunkdict generate` to store image information into the database and generate a list of chunks to be deduplicated -Download multiple Nydus images in advance and put them into the repository as datasets, such as selecting 10 consecutive versions of redis and alpine as the image dataset, and execute the command `nydus-image chunkdict save` to store the information of the chunk and blob in the chunk and blob table of the database. +*** +`nydusify chunkdict generate` calls the subcommand `nydus-image chunkdict generate` to store image information into the database and generate a new bootstrap as the chunkdict bootstrap. +Download multiple Nydus images in advance and put them into the repository as datasets, such as selecting 10 consecutive versions of redis and alpine as the image dataset, and execute the command `nydus-image chunkdict generate` to store the information of the chunk and blob in the chunk and blob table of the database.
```shell # Deposit multiple images into the database -nydus-image chunkdict save --bootstrap \ - ./output/localhost:5000:redis:nydus_7.0.1/nydus_bootstrap, \ - ./output/localhost:5000:redis:nydus_7.0.2/nydus_bootstrap, \ - ./output/localhost:5000:redis:nydus_7.0.3/nydus_bootstrap \ -``` -Execute the command `nydus-image chunkdict generate` to access the database and call the deduplication algorithm to generate the chunk list -```shell -# Call the deduplication algorithm to generate chunk list -nydus-image chunkdict generate --database \ - sqlite:///path/imageservice/contrib/nydusify/chunkdict.db +nydus-image chunkdict generate --source \ + /path/localhost:5000:redis:nydus_7.0.1/nydus_bootstrap, \ + /path/localhost:5000:redis:nydus_7.0.2/nydus_bootstrap, \ + /path/localhost:5000:redis:nydus_7.0.3/nydus_bootstrap \ + --bootstrap /path/to/chunkdict_bootstrap\ + --database /path/to/database.db\ + --output-json /path/to/nydus_bootstrap_output.json ``` *** @@ -77,10 +88,9 @@ where $C(R_x)$ represents the unique chunk set of all training set images in the **6.** Remove the chunk in the chunk dictionary selected in 5 for all images (training set and test set), and then repeat the operation 1-5 to generate the chunk dictionary until the maximum number of cycles is reached 7, or the discrete image ratio is greater than 80% of the total number of images. The principle of DBSCAN algorithm how to divide the cluster is shown in the diagram: -![在这里插入图片描述](https://img-blog.csdnimg.cn/5fba149720a34620873a5a2cb304d668.png#pic_center) -In this diagram, minPts = 4. Point A and the other red points are core points, because the area surrounding these points in an ε radius contain at least 4 points (including the point itself). Because they are all reachable from one another, they form a single cluster. Points B and C are not core points, but are reachable from A (via other core points) and thus belong to the cluster as well. Point N is a noise point that is neither a core point nor directly-reachable. - +![](https://img-blog.csdnimg.cn/5fba149720a34620873a5a2cb304d668.png#pic_center) **Remark:** This section of the picture and the associated DBSCAN algorithm description are referenced from : [https://en.wikipedia.org/wiki/DBSCAN](https://en.wikipedia.org/wiki/DBSCAN) + #### Algorithm 2 Deduplication between different versions of the image (exponential smoothing algorithm) *** **Basic principle:** Exponential smoothing algorithm is a method for time series data prediction and smoothing, the basic principle is to weighted average the data, give higher weight to the more recent repeated chunks, and constantly update the smoothing value, so the newer chunk has a greater impact on future forecasts, and the impact of older data will gradually weaken. @@ -102,16 +112,20 @@ where, $\alpha=0.5$ , $Y_{t-1}$ indicates whether the chunk appeared in the prev **5.** Choose a chunk dictionary that minimizes the test set's storage space. 
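The following is a minimal, self-contained sketch of the smoothing update used in the steps above, assuming $\alpha = 0.5$ and that $Y_{t-1}$ is 1 when a chunk appeared in the previous version and 0 otherwise. The `VersionedChunk` struct and `smooth_scores` helper are illustrative names only, not types from the nydus-image code base; the real implementation operates on `ChunkdictChunkInfo` records loaded from the database in `deduplicate.rs`.

```rust
// Illustrative sketch of the exponential smoothing score; not the actual
// nydus-image implementation.
#[derive(Debug)]
struct VersionedChunk {
    chunk_digest: String,
    version: String,
}

/// For each distinct chunk digest, walk the ordered version list and update
/// S_t = alpha * Y_{t-1} + (1 - alpha) * S_{t-1}, where Y_{t-1} is 1 if the
/// chunk appeared in the previous version and 0 otherwise.
fn smooth_scores(chunks: &[VersionedChunk], versions: &[&str], alpha: f64) -> Vec<(String, f64)> {
    use std::collections::{HashMap, HashSet};

    // digest -> set of versions the chunk appears in
    let mut appears: HashMap<&str, HashSet<&str>> = HashMap::new();
    for c in chunks {
        appears
            .entry(c.chunk_digest.as_str())
            .or_default()
            .insert(c.version.as_str());
    }

    let mut scores = Vec::new();
    for (digest, seen) in appears {
        let mut s = 0.0; // S_0
        for win in versions.windows(2) {
            let y_prev = if seen.contains(win[0]) { 1.0 } else { 0.0 };
            s = alpha * y_prev + (1.0 - alpha) * s;
        }
        scores.push((digest.to_string(), s));
    }
    // A higher score means the chunk kept reappearing in recent versions,
    // so it is a better chunk dictionary candidate.
    scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
    scores
}

fn main() {
    let versions = ["nydus_7.0.1", "nydus_7.0.2", "nydus_7.0.3"];
    let chunks = vec![
        VersionedChunk { chunk_digest: "sha256:aaa".into(), version: "nydus_7.0.1".into() },
        VersionedChunk { chunk_digest: "sha256:aaa".into(), version: "nydus_7.0.2".into() },
        VersionedChunk { chunk_digest: "sha256:aaa".into(), version: "nydus_7.0.3".into() },
        VersionedChunk { chunk_digest: "sha256:bbb".into(), version: "nydus_7.0.1".into() },
    ];
    for (digest, score) in smooth_scores(&chunks, &versions, 0.5) {
        println!("{digest}: {score:.3}");
    }
}
```

Chunks whose smoothed score exceeds the threshold form the candidate dictionary; the threshold is then tuned so that the chosen dictionary minimizes the test set's storage space, as in step 5 above.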
*** + + ### Exponential smoothing algorithm test table +Step 1: Download 10 OCI versions of an image and count the total size +Step 2: Convert OCI to nydus image, and then count the total size after conversion +Step 3: Select three versions of the image to generate chunkdict, use chunkdict to convert the remaining seven versions of the image, and then count the total size +dedulicating rate = (total_image_size(nydus) - total_image_size (nydus after dedulicating))/total_image_size(nydus) + + + +| image_name | version number | total_image_size(OCI) | total_image_size(nydus) | total_image_size (nydus after dedulicating) | chunkdict_image_size | dedulicating rate | +|------------|----------------|-----------------------|-------------------------|---------------------------------------------|----------------------|-------------------| +| redis | 10 | 341.78 | 419.37 | 319.48 | 41.87 | 23.82% | +| ubuntu | 10 | 290.26 | 308.59 | 140.28 | 30.8 | 54.54% | +| alpine | 10 | 26.9 | 27.55 | 24.7 | 2.74 | 10.34% | -| image_name | version number | total_size | train_size | test_size | test_size after dedulicating | chunkdict_size | dedulicating rate | threshold | -|------------|----------------|------------|------------|-----------|------------------------------|----------------|-------------------|-----------| -| redis | 10 | 382.03 | 266.7 | 115.33 | 31.56 | 42.33 | 72.63% | 0.8-0.5 | -| python | 10 | 3509.91 | 2095.37 | 1414.54 | 123.33 | 588.61 | 91.28% | 0.8-0.5 | -| ubuntu | 10 | 317.33 | 222.11 | 95.22 | 12.27 | 39.61 | 87.11% | 0.8-0.5 | -| nginx | 10 | 396.86 | 284.4 | 112.46 | 50.54 | 83.54 | 55.06% | 0.8-0.5 | -| postgres | 10 | 1360.31 | 956.42 | 403.89 | 381.54 | 19.66 | 5.53% | 0.8-0.5 | -| alpine | 10 | 27.23 | 19.04 | 8.19 | 5.62 | 4.7 | 31.29% | 0.8-0.5 | -| node | 10 | 3698.44 | 2598.59 | 1099.85 | 429.39 | 649.42 | 60.96% | 0.8-0.5 | -| httpd | 10 | 561.99 | 385.79 | 176.2 | 85.7 | 54.15 | 51.36% | 0.8-0.5 | *** diff --git a/smoke/tests/image_test.go b/smoke/tests/image_test.go index 922da1464d3..69117ef3f39 100644 --- a/smoke/tests/image_test.go +++ b/smoke/tests/image_test.go @@ -5,9 +5,7 @@ package tests import ( - "encoding/json" "fmt" - "os" "path/filepath" "testing" @@ -159,7 +157,7 @@ func (i *ImageTestSuite) TestGenerateChunkdict() test.Generator { logLevel := "--log-level warn" nydusifyPath := ctx.Binary.Nydusify - // Test v6 + // Generate v6 chunkdcit target1v6 := fmt.Sprintf("%s-nydus-%s", image1, uuid.NewString()) target2v6 := fmt.Sprintf("%s-nydus-%s", image2, uuid.NewString()) target3v6 := fmt.Sprintf("%s-nydus-%s", image3, uuid.NewString()) @@ -179,32 +177,32 @@ func (i *ImageTestSuite) TestGenerateChunkdict() test.Generator { ) tool.RunWithoutOutput(i.T, convertCmd3) - backendtype := "--backend-type oss" + // backendtype := "--backend-type oss" sourceinsecure := "--source-insecure" targetinsecure := "--target-insecure" - jsonData := `{ - "endpoint": "oss-cn-zhangjiakou.aliyuncs.com", - "access_key_id": "LTAI5tKHuSQQXVjSE7PgKYhf", - "access_key_secret": "FBYp1JDxlIZt8cCpFWpq3j9HYokw8a", - "bucket_name": "testcompact1" - }` + // jsonData := `{ + // "endpoint": "oss-cn-zhangjiakou.aliyuncs.com", + // "access_key_id": "LTAI5tKHuSQQXVjSE7PgKYhf", + // "access_key_secret": "FBYp1JDxlIZt8cCpFWpq3j9HYokw8a", + // "bucket_name": "testcompact1" + // }` - formattedData, err := json.MarshalIndent(json.RawMessage(jsonData), "", " ") - if err != nil { - fmt.Println("Error marshalling JSON:", err) - return - } - os.WriteFile("output.json", formattedData, 0644) + // formattedData, err 
:= json.MarshalIndent(json.RawMessage(jsonData), "", " ") + // if err != nil { + // fmt.Println("Error marshalling JSON:", err) + // return + // } + // os.WriteFile("output.json", formattedData, 0644) - backendconfigfile := "--backend-config-file output.json" + // backendconfigfile := "--backend-config-file output.json" targetv6 := fmt.Sprintf("%s,%s,%s", target1v6, target2v6, target3v6) chunkdictv6 := fmt.Sprintf("%s-nydus-%s", image1, uuid.NewString()) generateCmd := fmt.Sprintf( - "%s %s chunkdict generate --sources %s --target %s %s %s %s %s --nydus-image %s --work-dir %s", - nydusifyPath, logLevel, targetv6, chunkdictv6, sourceinsecure, targetinsecure, backendtype, backendconfigfile, ctx.Binary.Builder, filepath.Join(ctx.Env.WorkDir, "generate"), + "%s %s chunkdict generate --sources %s --target %s %s %s --nydus-image %s --work-dir %s", + nydusifyPath, logLevel, targetv6, chunkdictv6, sourceinsecure, targetinsecure, ctx.Binary.Builder, filepath.Join(ctx.Env.WorkDir, "generate"), ) tool.RunWithoutOutput(i.T, generateCmd) @@ -214,7 +212,7 @@ func (i *ImageTestSuite) TestGenerateChunkdict() test.Generator { ) tool.RunWithoutOutput(i.T, checkCmd) - // Test v5 + // Generate v5 chunkdcit fsversion := "--fs-version 5" target1v5 := fmt.Sprintf("%s-nydus5-%s", image1, uuid.NewString()) target2v5 := fmt.Sprintf("%s-nydus5-%s", image2, uuid.NewString()) @@ -239,11 +237,43 @@ func (i *ImageTestSuite) TestGenerateChunkdict() test.Generator { chunkdictv5 := fmt.Sprintf("%s-nydus5-%s", image1, uuid.NewString()) generateCmd2 := fmt.Sprintf( - "%s %s chunkdict generate --sources %s --target %s %s %s %s %s --nydus-image %s --work-dir %s", - nydusifyPath, logLevel, targetv5, chunkdictv5, sourceinsecure, targetinsecure, backendtype, backendconfigfile, ctx.Binary.Builder, filepath.Join(ctx.Env.WorkDir, "generate"), + "%s %s chunkdict generate --sources %s --target %s %s %s --nydus-image %s --work-dir %s", + nydusifyPath, logLevel, targetv5, chunkdictv5, sourceinsecure, targetinsecure, ctx.Binary.Builder, filepath.Join(ctx.Env.WorkDir, "generate"), ) tool.RunWithoutOutput(i.T, generateCmd2) + // Test v6 chunkdict convert + target4v6 := fmt.Sprintf("%s-nydus-chunkdict-%s", image1, uuid.NewString()) + chunkdict1v6 := fmt.Sprintf("bootstrap:registry:%s", chunkdictv6) + convertCmd7 := fmt.Sprintf( + "%s %s convert --source %s --target %s --chunk-dict %s --nydus-image %s --work-dir %s", + ctx.Binary.Nydusify, logLevel, image1, target4v6, chunkdict1v6, ctx.Binary.Builder, ctx.Env.TempDir, + ) + tool.RunWithoutOutput(i.T, convertCmd7) + + checkCmd1 := fmt.Sprintf( + "%s %s check --target %s --nydus-image %s --nydusd %s --work-dir %s", + nydusifyPath, logLevel, target4v6, ctx.Binary.Builder, ctx.Binary.Nydusd, filepath.Join(ctx.Env.WorkDir, "check"), + ) + tool.RunWithoutOutput(i.T, checkCmd1) + + // Test v5 chunkdict convert + target4v5 := fmt.Sprintf("%s-nydus5-chunkdict-%s", image1, uuid.NewString()) + chunkdict1v5 := fmt.Sprintf("bootstrap:registry:%s", chunkdictv5) + + convertCmd8 := fmt.Sprintf( + "%s %s convert --source %s --target %s --chunk-dict %s --nydus-image %s %s --work-dir %s", + ctx.Binary.Nydusify, logLevel, image1, target4v5, chunkdict1v5, ctx.Binary.Builder, fsversion, ctx.Env.TempDir, + ) + tool.RunWithoutOutput(i.T, convertCmd8) + + + checkCmd2 := fmt.Sprintf( + "%s %s check --target %s --nydus-image %s --nydusd %s --work-dir %s", + nydusifyPath, logLevel, target4v5, ctx.Binary.Builder, ctx.Binary.Nydusd, filepath.Join(ctx.Env.WorkDir, "check"), + ) + tool.RunWithoutOutput(i.T, checkCmd2) + 
return "generateChunkdict", nil } } diff --git a/src/bin/nydus-image/deduplicate.rs b/src/bin/nydus-image/deduplicate.rs index e27d3359b41..8518d953948 100644 --- a/src/bin/nydus-image/deduplicate.rs +++ b/src/bin/nydus-image/deduplicate.rs @@ -8,9 +8,10 @@ use core::cmp::Ordering; use nydus_api::ConfigV2; use nydus_builder::BuildContext; use nydus_builder::ChunkdictChunkInfo; +use nydus_builder::ConversionType; use nydus_builder::Tree; use nydus_rafs::metadata::{RafsSuper, RafsVersion}; -use nydus_storage::device::BlobInfo; +use nydus_storage::device::{BlobFeatures, BlobInfo}; use rusqlite::{params, Connection}; use std::collections::HashSet; use std::collections::{BTreeMap, HashMap}; @@ -172,6 +173,24 @@ pub fn check_bootstrap_versions_consistency( Ok(()) } +// Get parent bootstrap context for chunkdict bootstrap. +pub fn update_ctx_from_parent_bootstrap( + ctx: &mut BuildContext, + bootstrap_path: &PathBuf, +) -> Result<()> { + let (sb, _) = RafsSuper::load_from_file(bootstrap_path, Arc::new(ConfigV2::default()), false)?; + + let config = sb.meta.get_config(); + config.check_compatibility(&sb.meta)?; + + if config.is_tarfs_mode { + ctx.conversion_type = ConversionType::TarToTarfs; + ctx.blob_features |= BlobFeatures::TARFS; + } + + Ok(()) +} + pub struct Deduplicate { db: D, } diff --git a/src/bin/nydus-image/main.rs b/src/bin/nydus-image/main.rs index b2cf763320d..cebdbf90cf3 100644 --- a/src/bin/nydus-image/main.rs +++ b/src/bin/nydus-image/main.rs @@ -13,7 +13,7 @@ extern crate log; extern crate serde_json; #[macro_use] extern crate lazy_static; -use crate::deduplicate::SqliteDatabase; +use crate::deduplicate::{update_ctx_from_parent_bootstrap, SqliteDatabase}; use std::convert::TryFrom; use std::fs::{self, metadata, DirEntry, File, OpenOptions}; use std::os::unix::fs::FileTypeExt; @@ -404,14 +404,6 @@ fn prepare_cmd_args(bti_string: &'static str) -> App { .required(true) .num_args(1..), ) - .arg( - Arg::new("digester") - .long("digester") - .help("Algorithm to digest data chunks:") - .required(false) - .default_value("blake3") - .value_parser(["blake3", "sha256"]), - ) .arg( Arg::new("verbose") .long("verbose") @@ -420,12 +412,6 @@ fn prepare_cmd_args(bti_string: &'static str) -> App { .action(ArgAction::SetTrue) .required(false), ) - .arg( - Arg::new("features") - .long("features") - .value_parser(["blob-toc"]) - .help("Enable/disable features") - ) ) ); @@ -1215,25 +1201,30 @@ impl Command { .unwrap(); check_bootstrap_versions_consistency(&mut build_ctx, &source_bootstrap_paths)?; + update_ctx_from_parent_bootstrap(&mut build_ctx, &source_bootstrap_paths[0])?; for (_, bootstrap_path) in source_bootstrap_paths.iter().enumerate() { - let path = bootstrap_path.display().to_string(); - info!("Bootstrap path is {}", path); - let path_name: Vec<&str> = path.split('/').collect(); + let path_name = bootstrap_path.as_path(); // Extract the image name and version name from the bootstrap directory - let bootstrap_dir = match path_name.get(path_name.len() - 2) { - Some(&bootstrap_dir) => bootstrap_dir.to_string(), + let bootstrap_dir = match path_name + .parent() + .and_then(|p| p.file_name().and_then(|f| f.to_str())) + { + Some(dir_str) => dir_str.to_string(), None => bail!("Invalid Bootstrap directory name"), }; let full_image_name: Vec<&str> = bootstrap_dir.split(':').collect(); let image_name = match full_image_name.get(full_image_name.len() - 2) { Some(&second_last) => second_last.to_string(), - None => bail!("Invalid image name"), + None => bail!( + "Invalid image name {:?}", + 
full_image_name.get(full_image_name.len() - 2) + ), }; - let version_name = match full_image_name.last() { + let image_tag = match full_image_name.last() { Some(&last) => last.to_string(), - None => bail!("Invalid version name"), + None => bail!("Invalid version name {:?}", full_image_name.last()), }; // For backward compatibility with v2.1. let config = Self::get_configuration(matches)?; @@ -1249,7 +1240,7 @@ impl Command { "sqlite" => { let mut deduplicate: Deduplicate = Deduplicate::::new(db_strs[1])?; - deduplicate.save_metadata(bootstrap_path, config, image_name, version_name)? + deduplicate.save_metadata(bootstrap_path, config, image_name, image_tag)? } _ => { bail!("Unsupported database type: {}, please use a valid database URI, such as 'sqlite:///path/to/chunkdict.db'.", db_strs[0]) @@ -1295,12 +1286,12 @@ impl Command { } // Dump chunkdict to bootstrap - let features = Features::try_from( - matches - .get_one::("features") - .map(|s| s.as_str()) - .unwrap_or_default(), - )?; + // let features = Features::try_from( + // matches + // .get_one::("features") + // .map(|s| s.as_str()) + // .unwrap_or_default(), + // )?; let chunkdict_bootstrap_path = Self::get_bootstrap_storage(matches)?; let config = Self::get_configuration(matches).context("failed to get configuration information")?; @@ -1309,18 +1300,18 @@ impl Command { .set_blob_accessible(matches.get_one::("config").is_some()); build_ctx.configuration = config; build_ctx.blob_storage = Some(chunkdict_bootstrap_path); - build_ctx.blob_features = BlobFeatures::CAP_TAR_TOC; - build_ctx.blob_features.insert(BlobFeatures::ALIGNED); + // build_ctx.blob_features = BlobFeatures::CAP_TAR_TOC; + // build_ctx.blob_features.insert(BlobFeatures::ALIGNED); // Build_ctx.blob_features.insert(BlobFeatures::CHUNK_INFO_V2); // Build_ctx.blob_features.insert(BlobFeatures::ENCRYPTED); - build_ctx.features = features; - - let digester = matches - .get_one::("digester") - .map(|s| s.as_str()) - .unwrap_or_default() - .parse()?; - let mut blob_mgr = BlobManager::new(digester); + // build_ctx.features = features; + + // let digester = matches + // .get_one::("digester") + // .map(|s| s.as_str()) + // .unwrap_or_default() + // .parse()?; + let mut blob_mgr = BlobManager::new(build_ctx.digester); let bootstrap_path = Self::get_bootstrap_storage(matches)?; let mut bootstrap_mgr = BootstrapManager::new(Some(bootstrap_path), None); From 18c77635ed1f4f6454147f37af385113df3eebe0 Mon Sep 17 00:00:00 2001 From: Lin Wang Date: Thu, 7 Mar 2024 23:19:20 +0800 Subject: [PATCH 10/11] Fixed bugs in chunkdict and added blobinfo support to chunkdict. 
--- builder/src/chunkdict_generator.rs | 63 ++++++++-- builder/src/core/context.rs | 14 +++ builder/src/lib.rs | 1 + rafs/src/metadata/cached_v5.rs | 1 + rafs/src/metadata/layout/v5.rs | 4 + rafs/src/metadata/layout/v6.rs | 8 +- src/bin/nydus-image/deduplicate.rs | 180 +++++++++++++++++++++++------ src/bin/nydus-image/main.rs | 70 ++++++----- storage/src/cache/filecache/mod.rs | 4 +- storage/src/device.rs | 17 +++ storage/src/meta/mod.rs | 25 +++- 11 files changed, 306 insertions(+), 81 deletions(-) diff --git a/builder/src/chunkdict_generator.rs b/builder/src/chunkdict_generator.rs index 354b6014e08..db2f5d5f88a 100644 --- a/builder/src/chunkdict_generator.rs +++ b/builder/src/chunkdict_generator.rs @@ -23,6 +23,7 @@ use nydus_rafs::metadata::chunk::ChunkWrapper; use nydus_rafs::metadata::inode::InodeWrapper; use nydus_rafs::metadata::layout::RafsXAttrs; use nydus_storage::meta::BlobChunkInfoV1Ondisk; +use nydus_utils::compress::Algorithm; use nydus_utils::digest::RafsDigest; use std::ffi::OsString; use std::mem::size_of; @@ -41,6 +42,16 @@ pub struct ChunkdictChunkInfo { pub chunk_uncompressed_offset: u64, } +pub struct ChunkdictBlobInfo { + pub blob_id: String, + pub blob_compressed_size: u64, + pub blob_uncompressed_size: u64, + pub blob_compressor: String, + pub blob_meta_ci_compressed_size: u64, + pub blob_meta_ci_uncompressed_size: u64, + pub blob_meta_ci_offset: u64, +} + /// Struct to generate chunkdict RAFS bootstrap. pub struct Generator {} @@ -50,17 +61,17 @@ impl Generator { ctx: &mut BuildContext, bootstrap_mgr: &mut BootstrapManager, blob_mgr: &mut BlobManager, - chunkdict_origin: Vec, + chunkdict_chunks_origin: Vec, + chunkdict_blobs: Vec, ) -> Result { // Validate and remove chunks whose belonged blob sizes are smaller than a block. 
- let mut chunkdict = chunkdict_origin.to_vec(); - Self::validate_and_remove_chunks(ctx, &mut chunkdict); - + let mut chunkdict_chunks = chunkdict_chunks_origin.to_vec(); + Self::validate_and_remove_chunks(ctx, &mut chunkdict_chunks); // build root tree let mut tree = Self::build_root_tree(ctx)?; // build child tree - let child = Self::build_child_tree(ctx, blob_mgr, &chunkdict)?; + let child = Self::build_child_tree(ctx, blob_mgr, &chunkdict_chunks, &chunkdict_blobs)?; let result = vec![child]; tree.children = result; @@ -156,7 +167,8 @@ impl Generator { fn build_child_tree( ctx: &mut BuildContext, blob_mgr: &mut BlobManager, - chunkdict: &[ChunkdictChunkInfo], + chunkdict_chunks: &[ChunkdictChunkInfo], + chunkdict_blobs: &[ChunkdictBlobInfo], ) -> Result { // node let mut inode = InodeWrapper::new(ctx.fs_version); @@ -185,7 +197,7 @@ impl Generator { let mut node = Node::new(inode, node_info, 0); // insert chunks - Self::insert_chunks(ctx, blob_mgr, &mut node, chunkdict)?; + Self::insert_chunks(ctx, blob_mgr, &mut node, chunkdict_chunks, chunkdict_blobs)?; let node_size: u64 = node .chunks @@ -209,16 +221,22 @@ impl Generator { ctx: &mut BuildContext, blob_mgr: &mut BlobManager, node: &mut Node, - chunkdict: &[ChunkdictChunkInfo], + chunkdict_chunks: &[ChunkdictChunkInfo], + chunkdict_blobs: &[ChunkdictBlobInfo], ) -> Result<()> { - for (i, chunk_info) in chunkdict.iter().enumerate() { + for (index, chunk_info) in chunkdict_chunks.iter().enumerate() { let chunk_size: u32 = chunk_info.chunk_compressed_size; - let file_offset = i as u64 * chunk_size as u64; + let file_offset = index as u64 * chunk_size as u64; let mut chunk = ChunkWrapper::new(ctx.fs_version); // update blob context let (blob_index, blob_ctx) = blob_mgr.get_or_cerate_blob_for_chunkdict(ctx, &chunk_info.chunk_blob_id)?; + if blob_ctx.blob_id.is_empty() { + blob_ctx.blob_id = chunk_info.chunk_blob_id.clone(); + } + + // blob_ctx. 
let chunk_uncompressed_size = chunk_info.chunk_uncompressed_size; let pre_d_offset = blob_ctx.current_uncompressed_offset; blob_ctx.uncompressed_blob_size = pre_d_offset + chunk_uncompressed_size as u64; @@ -228,6 +246,31 @@ impl Generator { blob_ctx.blob_meta_header.ci_uncompressed_size() + size_of::() as u64, ); + blob_ctx.blob_meta_header.set_ci_compressed_size( + blob_ctx.blob_meta_header.ci_uncompressed_size() + + size_of::() as u64, + ); + let chunkdict_blob_info = chunkdict_blobs + .iter() + .find(|blob| blob.blob_id == chunk_info.chunk_blob_id) + .unwrap(); + blob_ctx.blob_compressor = match chunkdict_blob_info.blob_compressor.as_str() { + "None" => Algorithm::None, + "Lz4Block" => Algorithm::Lz4Block, + "GZip" => Algorithm::GZip, + "Zstd" => Algorithm::Zstd, + _ => Algorithm::None, + }; + blob_ctx + .blob_meta_header + .set_ci_uncompressed_size(chunkdict_blob_info.blob_meta_ci_uncompressed_size); + blob_ctx + .blob_meta_header + .set_ci_compressed_size(chunkdict_blob_info.blob_meta_ci_compressed_size); + blob_ctx + .blob_meta_header + .set_ci_compressed_offset(chunkdict_blob_info.blob_meta_ci_offset); + blob_ctx.blob_meta_header.set_ci_compressor(Algorithm::Zstd); // update chunk let chunk_index = blob_ctx.alloc_chunk_index()?; diff --git a/builder/src/core/context.rs b/builder/src/core/context.rs index 49d59734d32..5a55a372cb4 100644 --- a/builder/src/core/context.rs +++ b/builder/src/core/context.rs @@ -597,6 +597,9 @@ impl BlobContext { blob_ctx .blob_meta_header .set_encrypted(features.contains(BlobFeatures::ENCRYPTED)); + blob_ctx + .blob_meta_header + .set_is_chunkdict_generated(features.contains(BlobFeatures::IS_CHUNKDICT_GENERATED)); blob_ctx } @@ -1120,6 +1123,7 @@ impl BlobManager { compressed_blob_size, blob_features, flags, + build_ctx.is_chunkdict_generated, ); } RafsBlobTable::V6(table) => { @@ -1139,6 +1143,7 @@ impl BlobManager { ctx.blob_toc_digest, ctx.blob_meta_size, ctx.blob_toc_size, + build_ctx.is_chunkdict_generated, ctx.blob_meta_header, ctx.cipher_object.clone(), ctx.cipher_ctx.clone(), @@ -1316,6 +1321,9 @@ pub struct BuildContext { pub configuration: Arc, /// Generate the blob cache and blob meta pub blob_cache_generator: Option, + + /// Whether is chunkdict. 
+ pub is_chunkdict_generated: bool, } impl BuildContext { @@ -1384,6 +1392,7 @@ impl BuildContext { features, configuration: Arc::new(ConfigV2::default()), blob_cache_generator: None, + is_chunkdict_generated: false, } } @@ -1402,6 +1411,10 @@ impl BuildContext { pub fn set_configuration(&mut self, config: Arc) { self.configuration = config; } + + pub fn set_is_chunkdict(&mut self, is_chunkdict: bool) { + self.is_chunkdict_generated = is_chunkdict; + } } impl Default for BuildContext { @@ -1434,6 +1447,7 @@ impl Default for BuildContext { features: Features::new(), configuration: Arc::new(ConfigV2::default()), blob_cache_generator: None, + is_chunkdict_generated: false, } } } diff --git a/builder/src/lib.rs b/builder/src/lib.rs index d09c2d09f7e..54f47e264a7 100644 --- a/builder/src/lib.rs +++ b/builder/src/lib.rs @@ -23,6 +23,7 @@ use sha2::Digest; use self::core::node::{Node, NodeInfo}; +pub use self::chunkdict_generator::ChunkdictBlobInfo; pub use self::chunkdict_generator::ChunkdictChunkInfo; pub use self::chunkdict_generator::Generator; pub use self::compact::BlobCompactor; diff --git a/rafs/src/metadata/cached_v5.rs b/rafs/src/metadata/cached_v5.rs index db38d80a497..d8f915bbe13 100644 --- a/rafs/src/metadata/cached_v5.rs +++ b/rafs/src/metadata/cached_v5.rs @@ -990,6 +990,7 @@ mod cached_tests { 0, BlobFeatures::_V5_NO_EXT_BLOB_TABLE, meta.flags, + false, ); let mut cached_inode = CachedInodeV5::new(blob_table, meta.clone()); cached_inode.load(&meta, &mut reader).unwrap(); diff --git a/rafs/src/metadata/layout/v5.rs b/rafs/src/metadata/layout/v5.rs index 859e56ee989..aa8c2feb984 100644 --- a/rafs/src/metadata/layout/v5.rs +++ b/rafs/src/metadata/layout/v5.rs @@ -563,6 +563,7 @@ impl RafsV5BlobTable { compressed_size: u64, blob_features: BlobFeatures, flags: RafsSuperFlags, + is_chunkdict: bool, ) -> u32 { let blob_index = self.entries.len() as u32; let mut blob_info = BlobInfo::new( @@ -578,6 +579,9 @@ impl RafsV5BlobTable { blob_info.set_compressor(flags.into()); blob_info.set_digester(flags.into()); blob_info.set_prefetch_info(prefetch_offset as u64, prefetch_size as u64); + if is_chunkdict { + blob_info.set_chunkdict_generated(true); + } self.entries.push(Arc::new(blob_info)); self.extended.add( diff --git a/rafs/src/metadata/layout/v6.rs b/rafs/src/metadata/layout/v6.rs index 59ba6e8a396..980e77cbee7 100644 --- a/rafs/src/metadata/layout/v6.rs +++ b/rafs/src/metadata/layout/v6.rs @@ -1754,7 +1754,8 @@ impl RafsV6Blob { blob_features.bits() ); return false; - } else if !tarfs_mode + } else if !blob_features.contains(BlobFeatures::IS_CHUNKDICT_GENERATED) + && !tarfs_mode && ci_uncompr_size != count * size_of::() as u64 { error!( @@ -1819,6 +1820,7 @@ impl RafsV6BlobTable { blob_toc_digest: [u8; 32], blob_meta_size: u64, blob_toc_size: u32, + is_chunkdict: bool, header: BlobCompressionContextHeader, cipher_object: Arc, cipher_context: Option, @@ -1851,6 +1853,8 @@ impl RafsV6BlobTable { blob_info.set_blob_toc_size(blob_toc_size); blob_info.set_cipher_info(flags.into(), cipher_object, cipher_context); + blob_info.set_chunkdict_generated(is_chunkdict); + self.entries.push(Arc::new(blob_info)); blob_index @@ -2726,6 +2730,7 @@ mod tests { [0; 32], 0, 0, + false, BlobCompressionContextHeader::default(), Arc::new(crypt::Algorithm::Aes128Xts.new_cipher().unwrap()), Some(CipherContext::default()), @@ -2768,6 +2773,7 @@ mod tests { [0; 32], 0, 0, + false, BlobCompressionContextHeader::default(), Arc::new(crypt::Algorithm::Aes128Xts.new_cipher().unwrap()), Some(CipherContext::default()), 
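As context for the `deduplicate.rs` changes below, which add a textual `blob_compressor` column to the blob table, here is a small self-contained sketch of the string round-trip this relies on. The `Compressor` enum, `as_str`, and `from_str_lossy` are illustrative stand-ins, not the `nydus_utils::compress::Algorithm` API; they only mirror the fact that the database stores `blob.compressor().to_string()` and that `chunkdict_generator.rs` maps the string back with a `match` whose fallback arm is `Algorithm::None`.

```rust
// Illustrative stand-ins for nydus_utils::compress::Algorithm and the
// compressor `match` in chunkdict_generator.rs; this only demonstrates the
// round-trip between the textual `blob_compressor` column and an enum.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Compressor {
    None,
    Lz4Block,
    GZip,
    Zstd,
}

impl Compressor {
    /// Serialize the way the blob table stores it (a plain TEXT column).
    fn as_str(self) -> &'static str {
        match self {
            Compressor::None => "None",
            Compressor::Lz4Block => "Lz4Block",
            Compressor::GZip => "GZip",
            Compressor::Zstd => "Zstd",
        }
    }

    /// Parse the stored string back; anything unrecognized degrades to `None`,
    /// mirroring the `_ => Algorithm::None` fallback arm in the patch.
    fn from_str_lossy(s: &str) -> Self {
        match s {
            "Lz4Block" => Compressor::Lz4Block,
            "GZip" => Compressor::GZip,
            "Zstd" => Compressor::Zstd,
            _ => Compressor::None,
        }
    }
}

fn main() {
    for c in [Compressor::None, Compressor::Lz4Block, Compressor::GZip, Compressor::Zstd] {
        assert_eq!(Compressor::from_str_lossy(c.as_str()), c);
    }
    // An unexpected value (e.g. a lower-cased name) silently falls back to None.
    assert_eq!(Compressor::from_str_lossy("lz4_block"), Compressor::None);
    println!("compressor round-trip ok");
}
```

The silent fallback means an unrecognized or differently cased compressor name degrades to no compression rather than failing loudly, which is worth keeping in mind when extending the supported algorithms.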
diff --git a/src/bin/nydus-image/deduplicate.rs b/src/bin/nydus-image/deduplicate.rs index 8518d953948..8ce4b46fa4c 100644 --- a/src/bin/nydus-image/deduplicate.rs +++ b/src/bin/nydus-image/deduplicate.rs @@ -7,9 +7,9 @@ use anyhow::{Context, Result}; use core::cmp::Ordering; use nydus_api::ConfigV2; use nydus_builder::BuildContext; -use nydus_builder::ChunkdictChunkInfo; use nydus_builder::ConversionType; use nydus_builder::Tree; +use nydus_builder::{ChunkdictBlobInfo, ChunkdictChunkInfo}; use nydus_rafs::metadata::{RafsSuper, RafsVersion}; use nydus_storage::device::{BlobFeatures, BlobInfo}; use rusqlite::{params, Connection}; @@ -58,7 +58,7 @@ pub trait Database { fn insert_chunk(&self, chunk_info: &ChunkdictChunkInfo) -> Result<()>; /// Inserts blob information into the database. - fn insert_blob(&self, blob_info: &Blob) -> Result<()>; + fn insert_blob(&self, blob_info: &ChunkdictBlobInfo) -> Result<()>; /// Retrieves all chunk information from the database. fn get_chunks(&self) -> Result>; @@ -67,7 +67,10 @@ pub trait Database { fn get_chunks_by_blob_id(&self, blob_id: &str) -> Result>; /// Retrieves all blob information from the database. - fn get_blobs(&self) -> Result>; + fn get_blobs(&self) -> Result>; + + /// Retrieves blob information from the database filtered by blob ID. + fn get_blob_by_id(&self, blob_id: &str) -> Result; } pub struct SqliteDatabase { @@ -119,7 +122,7 @@ impl Database for SqliteDatabase { .context("Failed to insert chunk") } - fn insert_blob(&self, blob: &Blob) -> Result<()> { + fn insert_blob(&self, blob: &ChunkdictBlobInfo) -> Result<()> { self.blob_table .insert(blob) .context("Failed to insert blob") @@ -133,9 +136,13 @@ impl Database for SqliteDatabase { ChunkTable::list_all_by_blob_id(&self.chunk_table, blob_id).context("Failed to get chunks") } - fn get_blobs(&self) -> Result> { + fn get_blobs(&self) -> Result> { BlobTable::list_all(&self.blob_table).context("Failed to get blobs") } + + fn get_blob_by_id(&self, blob_id: &str) -> Result { + BlobTable::list_by_id(&self.blob_table, blob_id).context("Failed to get blob") + } } /// Get fs version from bootstrap file. 
@@ -235,10 +242,14 @@ impl Deduplicate { fn insert_blobs(&mut self, blob_infos: &[Arc]) -> anyhow::Result<()> { for blob in blob_infos { self.db - .insert_blob(&Blob { + .insert_blob(&ChunkdictBlobInfo { blob_id: blob.blob_id().to_string(), blob_compressed_size: blob.compressed_size(), blob_uncompressed_size: blob.uncompressed_size(), + blob_compressor: blob.compressor().to_string(), + blob_meta_ci_compressed_size: blob.meta_ci_compressed_size(), + blob_meta_ci_uncompressed_size: blob.meta_ci_uncompressed_size(), + blob_meta_ci_offset: blob.meta_ci_offset(), }) .context("Failed to insert blob")?; } @@ -297,11 +308,15 @@ impl Algorithm { } // Call the algorithm to generate a dictionary - pub fn chunkdict_generate(&mut self) -> anyhow::Result<(Vec, Vec)> { - let all_chunks = self.db.chunk_table.list_all()?; - let mut chunkdict: Vec = Vec::new(); + pub fn chunkdict_generate( + &mut self, + ) -> anyhow::Result<(Vec, Vec, Vec)> { + let all_chunks: Vec = self.db.chunk_table.list_all()?; + let mut chunkdict_chunks: Vec = Vec::new(); + let mut chunkdict_blobs: Vec = Vec::new(); let mut core_image = Vec::new(); let mut noise_points = Vec::new(); + let (chunkdict_version, chunkdict_image) = match &self.algorithm_name as &str { "exponential_smoothing" => Self::deduplicate_version(&all_chunks)?, _ => { @@ -311,14 +326,14 @@ impl Algorithm { for single_clustering in chunkdict_image { for (image_list, cluster_dictionary) in single_clustering { core_image.extend(image_list); - chunkdict.extend(cluster_dictionary); + chunkdict_chunks.extend(cluster_dictionary); } } for (_, dictionary) in chunkdict_version { - chunkdict.extend(dictionary); + chunkdict_chunks.extend(dictionary); } let mut chunkdict_size = 0; - for i in &chunkdict { + for i in &chunkdict_chunks { chunkdict_size += i.chunk_compressed_size; } info!( @@ -332,7 +347,35 @@ impl Algorithm { noise_points.push(chunk.image_reference.clone()); } } - Ok((chunkdict, noise_points)) + Self::fill_chunkdict(self, &mut chunkdict_chunks, &mut chunkdict_blobs)?; + Ok((chunkdict_chunks, chunkdict_blobs, noise_points)) + } + + /// Baseed chunk list to fill chunkdict, including all chunks in the same blob and all blobs in the chunkdict. + fn fill_chunkdict( + &mut self, + chunkdict_chunks: &mut Vec, + chunkdict_blobs: &mut Vec, + ) -> Result<()> { + let mut blob_ids = std::collections::HashSet::new(); + for chunk in chunkdict_chunks.iter() { + blob_ids.insert(chunk.chunk_blob_id.clone()); + } + for blob_id in blob_ids { + let mut chunks = self.db.get_chunks_by_blob_id(&blob_id)?; + chunks = chunks + .into_iter() + .collect::>() + .into_iter() + .collect::>(); + for chunk in chunks { + if !chunkdict_chunks.contains(&chunk) { + chunkdict_chunks.push(chunk); + } + } + chunkdict_blobs.push(self.db.get_blob_by_id(&blob_id)?); + } + Ok(()) } // Algorithm "exponential_smoothing" @@ -774,7 +817,7 @@ impl Algorithm { let mut threshold = 0.5; let max_threshold = 0.8; - let mut test_total_size = 0; + let mut test_total_size: u32 = 0; let mut min_test_size: u32 = std::u32::MAX; let mut min_data_dict = HashMap::new(); @@ -821,7 +864,9 @@ impl Algorithm { } } for chunk in point.chunk_list.iter() { - test_total_size += chunk.chunk_compressed_size; + test_total_size = test_total_size + .checked_add(chunk.chunk_compressed_size) + .unwrap_or(test_total_size); } } if test_total_size <= min_test_size { @@ -889,7 +934,7 @@ impl ChunkTable { }) } - /// select all data filtered by blob ID. + /// Select all data filtered by blob ID. 
fn list_all_by_blob_id(&self, blob_id: &str) -> Result, DatabaseError> { let mut offset = 0; let limit: i64 = 100; @@ -908,7 +953,7 @@ impl ChunkTable { Ok(all_chunks_by_blob_id) } - /// select data with offset and limit filtered by blob ID. + /// Select data with offset and limit filtered by blob ID. fn list_paged_by_blob_id( &self, blob_id: &str, @@ -1145,15 +1190,38 @@ impl BlobTable { conn: Arc::new(Mutex::new(conn)), }) } -} -pub struct Blob { - blob_id: String, - blob_compressed_size: u64, - blob_uncompressed_size: u64, + pub fn list_by_id(&self, blob_id: &str) -> Result { + let conn_guard = self + .conn + .lock() + .map_err(|e| DatabaseError::PoisonError(e.to_string()))?; + let mut stmt = conn_guard.prepare( + "SELECT blob_id, blob_compressed_size, blob_uncompressed_size, blob_compressor, blob_meta_ci_compressed_size, blob_meta_ci_uncompressed_size, blob_meta_ci_offset FROM blob WHERE blob_id = ?1", + )?; + let mut blob_iterator = stmt.query_map([blob_id], |row| { + Ok(ChunkdictBlobInfo { + blob_id: row.get(0)?, + blob_compressed_size: row.get(1)?, + blob_uncompressed_size: row.get(2)?, + blob_compressor: row.get(3)?, + blob_meta_ci_compressed_size: row.get(4)?, + blob_meta_ci_uncompressed_size: row.get(5)?, + blob_meta_ci_offset: row.get(6)?, + }) + })?; + + if let Some(blob) = blob_iterator.next() { + blob.map_err(DatabaseError::SqliteError) + } else { + Err(DatabaseError::SqliteError( + rusqlite::Error::QueryReturnedNoRows, + )) + } + } } -impl Table for BlobTable { +impl Table for BlobTable { fn clear(&self) -> Result<(), DatabaseError> { self.conn .lock() @@ -1169,10 +1237,14 @@ impl Table for BlobTable { .map_err(|e| DatabaseError::PoisonError(e.to_string()))? .execute( "CREATE TABLE IF NOT EXISTS blob ( - id INTEGER PRIMARY KEY, - blob_id TEXT NOT NULL, - blob_compressed_size INT, - blob_uncompressed_size INT + id INTEGER PRIMARY KEY, + blob_id TEXT NOT NULL, + blob_compressed_size INT, + blob_uncompressed_size INT, + blob_compressor TEXT, + blob_meta_ci_compressed_size INT, + blob_meta_ci_uncompressed_size INT, + blob_meta_ci_offset INT )", [], ) @@ -1180,7 +1252,7 @@ impl Table for BlobTable { Ok(()) } - fn insert(&self, blob: &Blob) -> Result<(), DatabaseError> { + fn insert(&self, blob: &ChunkdictBlobInfo) -> Result<(), DatabaseError> { self.conn .lock() .map_err(|e| DatabaseError::PoisonError(e.to_string()))? 
@@ -1188,21 +1260,29 @@ impl Table for BlobTable { "INSERT INTO blob ( blob_id, blob_compressed_size, - blob_uncompressed_size + blob_uncompressed_size, + blob_compressor, + blob_meta_ci_compressed_size, + blob_meta_ci_uncompressed_size, + blob_meta_ci_offset ) - VALUES (?1, ?2, ?3); + VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7); ", rusqlite::params![ blob.blob_id, blob.blob_compressed_size, - blob.blob_uncompressed_size + blob.blob_uncompressed_size, + blob.blob_compressor, + blob.blob_meta_ci_compressed_size, + blob.blob_meta_ci_uncompressed_size, + blob.blob_meta_ci_offset, ], ) .map_err(DatabaseError::SqliteError)?; Ok(()) } - fn list_all(&self) -> Result, DatabaseError> { + fn list_all(&self) -> Result, DatabaseError> { let mut offset = 0; let limit: i64 = 100; let mut all_blobs = Vec::new(); @@ -1220,7 +1300,7 @@ impl Table for BlobTable { Ok(all_blobs) } - fn list_paged(&self, offset: i64, limit: i64) -> Result, DatabaseError> { + fn list_paged(&self, offset: i64, limit: i64) -> Result, DatabaseError> { let conn_guard = self .conn .lock() @@ -1230,10 +1310,14 @@ impl Table for BlobTable { ORDER BY id LIMIT ?1 OFFSET ?2", )?; let blob_iterator = stmt.query_map(params![limit, offset], |row| { - Ok(Blob { + Ok(ChunkdictBlobInfo { blob_id: row.get(0)?, blob_compressed_size: row.get(1)?, blob_uncompressed_size: row.get(2)?, + blob_compressor: row.get(3)?, + blob_meta_ci_compressed_size: row.get(4)?, + blob_meta_ci_uncompressed_size: row.get(5)?, + blob_meta_ci_offset: row.get(6)?, }) })?; let mut blobs = Vec::new(); @@ -1280,10 +1364,14 @@ mod tests { fn test_blob_table() -> Result<(), Box> { let blob_table = BlobTable::new_in_memory()?; blob_table.create()?; - let blob = Blob { + let blob = ChunkdictBlobInfo { blob_id: "BLOB123".to_string(), blob_compressed_size: 1024, blob_uncompressed_size: 2048, + blob_compressor: "zstd".to_string(), + blob_meta_ci_compressed_size: 1024, + blob_meta_ci_uncompressed_size: 2048, + blob_meta_ci_offset: 0, }; blob_table.insert(&blob)?; let blobs = blob_table.list_all()?; @@ -1291,6 +1379,16 @@ mod tests { assert_eq!(blobs[0].blob_id, blob.blob_id); assert_eq!(blobs[0].blob_compressed_size, blob.blob_compressed_size); assert_eq!(blobs[0].blob_uncompressed_size, blob.blob_uncompressed_size); + assert_eq!(blobs[0].blob_compressor, blob.blob_compressor); + assert_eq!( + blobs[0].blob_meta_ci_compressed_size, + blob.blob_meta_ci_compressed_size + ); + assert_eq!( + blobs[0].blob_meta_ci_uncompressed_size, + blob.blob_meta_ci_uncompressed_size + ); + assert_eq!(blobs[0].blob_meta_ci_offset, blob.blob_meta_ci_offset); Ok(()) } @@ -1310,7 +1408,7 @@ mod tests { }; chunk_table.insert(&chunk)?; let chunk2 = ChunkdictChunkInfo { - image_reference: "REDIS".to_string(), + image_reference: "REDIS02".to_string(), version: "1.0.0".to_string(), chunk_blob_id: "BLOB456".to_string(), chunk_digest: "DIGEST123".to_string(), @@ -1352,10 +1450,14 @@ mod tests { let blob_table = BlobTable::new_in_memory()?; blob_table.create()?; for i in 0..200 { - let blob = Blob { + let blob = ChunkdictBlobInfo { blob_id: format!("BLOB{}", i), blob_compressed_size: i, blob_uncompressed_size: i * 2, + blob_compressor: "zstd".to_string(), + blob_meta_ci_compressed_size: i, + blob_meta_ci_uncompressed_size: i * 2, + blob_meta_ci_offset: i * 3, }; blob_table.insert(&blob)?; } @@ -1364,6 +1466,10 @@ mod tests { assert_eq!(blobs[0].blob_id, "BLOB100"); assert_eq!(blobs[0].blob_compressed_size, 100); assert_eq!(blobs[0].blob_uncompressed_size, 200); + assert_eq!(blobs[0].blob_compressor, "zstd"); + 
assert_eq!(blobs[0].blob_meta_ci_compressed_size, 100); + assert_eq!(blobs[0].blob_meta_ci_uncompressed_size, 200); + assert_eq!(blobs[0].blob_meta_ci_offset, 300); Ok(()) } diff --git a/src/bin/nydus-image/main.rs b/src/bin/nydus-image/main.rs index cebdbf90cf3..00f1e05256e 100644 --- a/src/bin/nydus-image/main.rs +++ b/src/bin/nydus-image/main.rs @@ -13,7 +13,10 @@ extern crate log; extern crate serde_json; #[macro_use] extern crate lazy_static; -use crate::deduplicate::{update_ctx_from_parent_bootstrap, SqliteDatabase}; +use crate::deduplicate::{ + check_bootstrap_versions_consistency, update_ctx_from_parent_bootstrap, Deduplicate, + SqliteDatabase, +}; use std::convert::TryFrom; use std::fs::{self, metadata, DirEntry, File, OpenOptions}; use std::os::unix::fs::FileTypeExt; @@ -28,9 +31,9 @@ use nydus::{get_build_time_info, setup_logging}; use nydus_api::{BuildTimeInfo, ConfigV2, LocalFsConfig}; use nydus_builder::{ parse_chunk_dict_arg, ArtifactStorage, BlobCacheGenerator, BlobCompactor, BlobManager, - BootstrapManager, BuildContext, BuildOutput, Builder, ChunkdictChunkInfo, ConversionType, - DirectoryBuilder, Feature, Features, Generator, HashChunkDict, Merger, Prefetch, - PrefetchPolicy, StargzBuilder, TarballBuilder, WhiteoutSpec, + BootstrapManager, BuildContext, BuildOutput, Builder, ChunkdictBlobInfo, ChunkdictChunkInfo, + ConversionType, DirectoryBuilder, Feature, Features, Generator, HashChunkDict, Merger, + Prefetch, PrefetchPolicy, StargzBuilder, TarballBuilder, WhiteoutSpec, }; use nydus_rafs::metadata::{MergeError, RafsSuper, RafsSuperConfig, RafsVersion}; use nydus_storage::backend::localfs::LocalFs; @@ -45,7 +48,6 @@ use nydus_utils::{ }; use serde::{Deserialize, Serialize}; -use crate::deduplicate::{check_bootstrap_versions_consistency, Deduplicate}; use crate::unpack::{OCIUnpacker, Unpacker}; use crate::validator::Validator; @@ -1260,15 +1262,20 @@ impl Command { .map(|paths| paths.map(PathBuf::from).collect()) .unwrap(); - let (chunkdict, noise_points): (Vec, Vec); + let (chunkdict_chunks, chunkdict_blobs, noise_points): ( + Vec, + Vec, + Vec, + ); match db_strs[0] { "sqlite" => { let mut algorithm: deduplicate::Algorithm = deduplicate::Algorithm::::new(algorithm, db_strs[1])?; let result = algorithm.chunkdict_generate()?; - chunkdict = result.0; - noise_points = result.1; + chunkdict_chunks = result.0; + chunkdict_blobs = result.1; + noise_points = result.2; } _ => { bail!("Unsupported database type: {}, please use a valid database URI, such as 'sqlite:///path/to/chunkdict.db'.", db_strs[0]) @@ -1278,7 +1285,7 @@ impl Command { // Output noise point in DBSCAN clustering algorithm info!( "The length of chunkdict is {}", - Vec::::len(&chunkdict) + Vec::::len(&chunkdict_chunks) ); info!("It is not recommended to use image deduplication"); for image_name in noise_points { @@ -1286,12 +1293,12 @@ impl Command { } // Dump chunkdict to bootstrap - // let features = Features::try_from( - // matches - // .get_one::("features") - // .map(|s| s.as_str()) - // .unwrap_or_default(), - // )?; + let features = Features::try_from( + matches + .get_one::("features") + .map(|s| s.as_str()) + .unwrap_or_default(), + )?; let chunkdict_bootstrap_path = Self::get_bootstrap_storage(matches)?; let config = Self::get_configuration(matches).context("failed to get configuration information")?; @@ -1300,24 +1307,31 @@ impl Command { .set_blob_accessible(matches.get_one::("config").is_some()); build_ctx.configuration = config; build_ctx.blob_storage = Some(chunkdict_bootstrap_path); - // 
build_ctx.blob_features = BlobFeatures::CAP_TAR_TOC;
-        // build_ctx.blob_features.insert(BlobFeatures::ALIGNED);
-        // Build_ctx.blob_features.insert(BlobFeatures::CHUNK_INFO_V2);
-        // Build_ctx.blob_features.insert(BlobFeatures::ENCRYPTED);
-        // build_ctx.features = features;
-
-        // let digester = matches
-        //     .get_one::<String>("digester")
-        //     .map(|s| s.as_str())
-        //     .unwrap_or_default()
-        //     .parse()?;
+        build_ctx.blob_features = BlobFeatures::CAP_TAR_TOC;
+        build_ctx.blob_features.insert(BlobFeatures::ALIGNED);
+        build_ctx
+            .blob_features
+            .insert(BlobFeatures::IS_CHUNKDICT_GENERATED);
+        build_ctx
+            .blob_features
+            .insert(BlobFeatures::INLINED_CHUNK_DIGEST);
+        build_ctx.blob_features.insert(BlobFeatures::HAS_TAR_HEADER);
+        build_ctx.blob_features.insert(BlobFeatures::HAS_TOC);
+        build_ctx.features = features;
+        build_ctx.is_chunkdict_generated = true;
+
         let mut blob_mgr = BlobManager::new(build_ctx.digester);
         let bootstrap_path = Self::get_bootstrap_storage(matches)?;
         let mut bootstrap_mgr = BootstrapManager::new(Some(bootstrap_path), None);
-        let output =
-            Generator::generate(&mut build_ctx, &mut bootstrap_mgr, &mut blob_mgr, chunkdict)?;
+        let output = Generator::generate(
+            &mut build_ctx,
+            &mut bootstrap_mgr,
+            &mut blob_mgr,
+            chunkdict_chunks,
+            chunkdict_blobs,
+        )?;
         OutputSerializer::dump(matches, output, build_info).unwrap();
         info!(
             "Chunkdict metadata is saved at: {:?}",
diff --git a/storage/src/cache/filecache/mod.rs b/storage/src/cache/filecache/mod.rs
index d42ad388fcf..d1d338ad459 100644
--- a/storage/src/cache/filecache/mod.rs
+++ b/storage/src/cache/filecache/mod.rs
@@ -266,7 +269,9 @@ impl FileCacheEntry {
             );
             return Err(einval!(msg));
         }
-        let meta = if blob_info.meta_ci_is_valid() {
+        let meta = if blob_info.meta_ci_is_valid()
+            || blob_info.has_feature(BlobFeatures::IS_CHUNKDICT_GENERATED)
+        {
             let meta = FileCacheMeta::new(
                 blob_file_path,
                 blob_info.clone(),
diff --git a/storage/src/device.rs b/storage/src/device.rs
index 5151169e058..5bd78f1076a 100644
--- a/storage/src/device.rs
+++ b/storage/src/device.rs
@@ -77,6 +77,8 @@ bitflags! {
         const CAP_TAR_TOC = 0x4000_0000;
         /// Rafs V5 image without extended blob table, this is an internal flag.
         const _V5_NO_EXT_BLOB_TABLE = 0x8000_0000;
+        /// Blob is generated with a chunkdict.
+        const IS_CHUNKDICT_GENERATED = 0x0000_0200;
     }
 }
 
@@ -172,6 +174,9 @@ pub struct BlobInfo {
     cipher_object: Arc<Cipher>,
     /// Cipher context for encryption.
     cipher_ctx: Option<CipherContext>,
+
+    /// Whether the blob is generated from a chunkdict.
+    is_chunkdict_generated: bool,
 }
 
 impl BlobInfo {
@@ -215,6 +220,8 @@
             meta_path: Arc::new(Mutex::new(String::new())),
             cipher_object: Default::default(),
             cipher_ctx: None,
+
+            is_chunkdict_generated: false,
         };
 
         blob_info.compute_features();
@@ -222,6 +229,16 @@
         blob_info
     }
 
+    /// Set the is_chunkdict_generated flag.
+    pub fn set_chunkdict_generated(&mut self, is_chunkdict_generated: bool) {
+        self.is_chunkdict_generated = is_chunkdict_generated;
+    }
+
+    /// Get the is_chunkdict_generated flag.
+    pub fn is_chunkdict_generated(&self) -> bool {
+        self.is_chunkdict_generated
+    }
+
     /// Get the blob index in the blob array.
     pub fn blob_index(&self) -> u32 {
         self.blob_index
diff --git a/storage/src/meta/mod.rs b/storage/src/meta/mod.rs
index eff935cda17..d37713e9154 100644
--- a/storage/src/meta/mod.rs
+++ b/storage/src/meta/mod.rs
@@ -354,6 +354,15 @@ impl BlobCompressionContextHeader {
             )
         }
     }
+
+    /// Set flag indicating whether the blob is generated from a chunkdict.
+ pub fn set_is_chunkdict_generated(&mut self, enable: bool) { + if enable { + self.s_features |= BlobFeatures::IS_CHUNKDICT_GENERATED.bits(); + } else { + self.s_features &= !BlobFeatures::IS_CHUNKDICT_GENERATED.bits(); + } + } } /// Struct to manage blob chunk compression information, a wrapper over [BlobCompressionContext]. @@ -851,7 +860,8 @@ impl BlobCompressionContextInfo { if u32::from_le(header.s_magic) != BLOB_CCT_MAGIC || u32::from_le(header.s_magic2) != BLOB_CCT_MAGIC - || u32::from_le(header.s_ci_entries) != blob_info.chunk_count() + || (!blob_info.has_feature(BlobFeatures::IS_CHUNKDICT_GENERATED) + && u32::from_le(header.s_ci_entries) != blob_info.chunk_count()) || u32::from_le(header.s_ci_compressor) != blob_info.meta_ci_compressor() as u32 || u64::from_le(header.s_ci_offset) != blob_info.meta_ci_offset() || u64::from_le(header.s_ci_compressed_size) != blob_info.meta_ci_compressed_size() @@ -887,8 +897,9 @@ impl BlobCompressionContextInfo { || blob_info.has_feature(BlobFeatures::BATCH) { return Err(einval!("invalid feature flags in blob meta header!")); - } else if info_size != (chunk_count as usize) * (size_of::()) - || (aligned_info_size as u64) > BLOB_CCT_V1_MAX_SIZE + } else if !blob_info.has_feature(BlobFeatures::IS_CHUNKDICT_GENERATED) + && (info_size != (chunk_count as usize) * (size_of::()) + || (aligned_info_size as u64) > BLOB_CCT_V1_MAX_SIZE) { return Err(einval!("uncompressed size in blob meta header is invalid!")); } @@ -1770,7 +1781,10 @@ impl BlobMetaChunkArray { ) -> Result<&'a T> { assert!(index < chunk_info_array.len()); let entry = &chunk_info_array[index]; - entry.validate(state)?; + // If the chunk belongs to a chunkdict, skip the validation check. + if state.blob_features & BlobFeatures::IS_CHUNKDICT_GENERATED.bits() == 0 { + entry.validate(state)?; + } Ok(entry) } } @@ -1983,6 +1997,9 @@ pub fn format_blob_features(features: BlobFeatures) -> String { if features.contains(BlobFeatures::ENCRYPTED) { output += "encrypted "; } + if features.contains(BlobFeatures::IS_CHUNKDICT_GENERATED) { + output += "is-chunkdict-generated "; + } output.trim_end().to_string() } From 60933093c318b080b894bf4a5928a0dd3a557fe7 Mon Sep 17 00:00:00 2001 From: Lin Wang Date: Tue, 12 Mar 2024 00:08:44 +0800 Subject: [PATCH 11/11] nydus-image: merge main branch and remove unnecessary output. 
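The `IS_CHUNKDICT_GENERATED` bit introduced in the preceding patch is an ordinary `BlobFeatures` flag, so it composes with the existing bits; a minimal standalone sketch (illustration only, assuming `BlobFeatures` from storage/src/device.rs and `format_blob_features` from storage/src/meta/mod.rs are in scope):

```rust
// Illustration only, not patch code: mark a blob as chunkdict-generated and
// report its feature bits.
fn describe_chunkdict_blob() -> String {
    let mut features = BlobFeatures::CAP_TAR_TOC | BlobFeatures::ALIGNED;
    features.insert(BlobFeatures::IS_CHUNKDICT_GENERATED);
    assert!(features.contains(BlobFeatures::IS_CHUNKDICT_GENERATED));
    // With the hunks above, the report now includes "is-chunkdict-generated".
    format_blob_features(features)
}
```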
--- src/bin/nydus-image/deduplicate.rs | 5 ++++- src/bin/nydus-image/main.rs | 9 ++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/bin/nydus-image/deduplicate.rs b/src/bin/nydus-image/deduplicate.rs index 8ce4b46fa4c..db6dcc15a53 100644 --- a/src/bin/nydus-image/deduplicate.rs +++ b/src/bin/nydus-image/deduplicate.rs @@ -194,6 +194,9 @@ pub fn update_ctx_from_parent_bootstrap( ctx.conversion_type = ConversionType::TarToTarfs; ctx.blob_features |= BlobFeatures::TARFS; } + ctx.fs_version = + RafsVersion::try_from(sb.meta.version).context("Failed to get RAFS version")?; + ctx.compressor = config.compressor; Ok(()) } @@ -1306,7 +1309,7 @@ impl Table for BlobTable { .lock() .map_err(|e| DatabaseError::PoisonError(e.to_string()))?; let mut stmt: rusqlite::Statement<'_> = conn_guard.prepare( - "SELECT blob_id, blob_compressed_size, blob_uncompressed_size from blob + "SELECT blob_id, blob_compressed_size, blob_uncompressed_size, blob_compressor, blob_meta_ci_compressed_size, blob_meta_ci_uncompressed_size, blob_meta_ci_offset from blob ORDER BY id LIMIT ?1 OFFSET ?2", )?; let blob_iterator = stmt.query_map(params![limit, offset], |row| { diff --git a/src/bin/nydus-image/main.rs b/src/bin/nydus-image/main.rs index 1caca4180e5..577b7da1227 100644 --- a/src/bin/nydus-image/main.rs +++ b/src/bin/nydus-image/main.rs @@ -1344,7 +1344,14 @@ impl Command { chunkdict_chunks, chunkdict_blobs, )?; - OutputSerializer::dump(matches, output, build_info).unwrap(); + OutputSerializer::dump( + matches, + output, + build_info, + build_ctx.compressor, + build_ctx.fs_version, + ) + .unwrap(); info!( "Chunkdict metadata is saved at: {:?}", matches