diff --git a/builder/src/chunkdict_generator.rs b/builder/src/chunkdict_generator.rs index ed2b4b01d87..354b6014e08 100644 --- a/builder/src/chunkdict_generator.rs +++ b/builder/src/chunkdict_generator.rs @@ -52,7 +52,7 @@ impl Generator { blob_mgr: &mut BlobManager, chunkdict_origin: Vec<ChunkdictChunkInfo>, ) -> Result<BuildOutput> { - // validate and remove chunks which bloned blob size is smaller than block. + // Validate and remove chunks whose owning blob's total uncompressed size is smaller than a block. let mut chunkdict = chunkdict_origin.to_vec(); Self::validate_and_remove_chunks(ctx, &mut chunkdict); @@ -136,8 +136,8 @@ impl Generator { inode.set_blocks(256); let node_info = NodeInfo { explicit_uidgid: true, - src_dev: 66305, - src_ino: 24772610, + src_dev: 0, + src_ino: 0, rdev: 0, source: PathBuf::from("/"), path: PathBuf::from("/"), @@ -171,8 +171,8 @@ impl Generator { inode.set_blocks(256); let node_info = NodeInfo { explicit_uidgid: true, - src_dev: 66305, - src_ino: 24775126, + src_dev: 0, + src_ino: 1, rdev: 0, source: PathBuf::from("/"), path: PathBuf::from("/chunkdict"), @@ -211,17 +211,14 @@ impl Generator { node: &mut Node, chunkdict: &[ChunkdictChunkInfo], ) -> Result<()> { - for chunk_info in chunkdict.iter() { + for (i, chunk_info) in chunkdict.iter().enumerate() { let chunk_size: u32 = chunk_info.chunk_compressed_size; - let file_offset = 1 as u64 * chunk_size as u64; + let file_offset = i as u64 * chunk_size as u64; let mut chunk = ChunkWrapper::new(ctx.fs_version); // update blob context let (blob_index, blob_ctx) = blob_mgr.get_or_cerate_blob_for_chunkdict(ctx, &chunk_info.chunk_blob_id)?; - if blob_ctx.blob_id.is_empty() { - blob_ctx.blob_id = chunk_info.chunk_blob_id.clone(); - } let chunk_uncompressed_size = chunk_info.chunk_uncompressed_size; let pre_d_offset = blob_ctx.current_uncompressed_offset; blob_ctx.uncompressed_blob_size = pre_d_offset + chunk_uncompressed_size as u64; diff --git a/builder/src/generate.rs b/builder/src/generate.rs deleted file mode 100644 index 576142b926c..00000000000 --- a/builder/src/generate.rs +++ /dev/null @@ -1,257 +0,0 @@ -// Copyright (C) 2022 Nydus Developers. All rights reserved. -// -// SPDX-License-Identifier: Apache-2.0 - -//! Generate Chunkdict RAFS bootstrap. -//! Bug 1: Inconsistent Chunk Size Leading to Blob Size Less Than 4K -//! Description: The size of chunks is not consistent, which results in the possibility that a blob, composed of a group of these chunks, may be less than 4K in size. This inconsistency leads to a failure in passing the size check. -//! Bug 2: Incorrect Chunk Number Calculation Due to Premature Check Logic -//! Description: The current logic for calculating the chunk number is based on the formula size/chunk size. However, this approach is flawed as it precedes the actual check which accounts for chunk statistics. Consequently, this leads to inaccurate counting of chunk numbers.
- -use super::core::node::{ChunkSource, NodeInfo}; -use super::{BlobManager, Bootstrap, BootstrapManager, BuildContext, BuildOutput, Tree}; -use crate::core::node::Node; -use crate::NodeChunk; -use anyhow::Result; -use nydus_rafs::metadata::chunk::ChunkWrapper; -use nydus_rafs::metadata::inode::InodeWrapper; -use nydus_rafs::metadata::layout::RafsXAttrs; -use nydus_rafs::metadata::RafsVersion; -use nydus_storage::meta::BlobChunkInfoV1Ondisk; -use nydus_utils::digest::RafsDigest; -use nydus_utils::lazy_drop; -use std::ffi::OsString; -use std::mem::size_of; -use std::path::PathBuf; -use std::sync::Arc; -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub struct ChunkdictChunkInfo { - pub image_name: String, - pub version_name: String, - pub chunk_blob_id: String, - pub chunk_digest: String, - pub chunk_compressed_size: u32, - pub chunk_uncompressed_size: u32, - pub chunk_compressed_offset: u64, - pub chunk_uncompressed_offset: u64, -} - -/// Struct to Generater chunkdict RAFS bootstrap. -pub struct Generater {} - -impl Generater { - // Generate chunkdict RAFS bootstrap. - #[allow(clippy::too_many_arguments)] - pub fn generate( - ctx: &mut BuildContext, - bootstrap_mgr: &mut BootstrapManager, - blob_mgr: &mut BlobManager, - chunkdict_origin: Vec, - ) -> Result { - // validate and remove chunks which bloned blob size is smaller than block. - let mut chunkdict = chunkdict_origin.to_vec(); - Self::validate_and_remove_chunks(&mut chunkdict, ctx); - - // build root tree - let mut tree = Self::build_root_tree()?; - - // build child tree - let child = Self::build_child_tree(ctx, blob_mgr, &chunkdict)?; - let result = vec![child]; - tree.children = result; - tree.lock_node() - .v5_set_dir_size(ctx.fs_version, &tree.children); - - Self::validate_tree(&tree)?; - - // build bootstrap - let mut bootstrap_ctx = bootstrap_mgr.create_ctx()?; - let mut bootstrap = Bootstrap::new(tree)?; - bootstrap.build(ctx, &mut bootstrap_ctx)?; - - let blob_table = blob_mgr.to_blob_table(ctx)?; - let storage = &mut bootstrap_mgr.bootstrap_storage; - bootstrap.dump(ctx, storage, &mut bootstrap_ctx, &blob_table)?; - - lazy_drop(bootstrap_ctx); - - BuildOutput::new(blob_mgr, &bootstrap_mgr.bootstrap_storage) - } - - /// validate tree - fn validate_tree(tree: &Tree) -> Result<()> { - let pre = &mut |t: &Tree| -> Result<()> { - let node = t.lock_node(); - debug!("chunkdict tree: "); - debug!("inode: {}", node); - for chunk in &node.chunks { - debug!("\t chunk: {}", chunk); - } - Ok(()) - }; - tree.walk_dfs_pre(pre)?; - debug!("chunkdict tree is valid."); - Ok(()) - } - - /// check blob uncompressed size is bigger than block - fn validate_and_remove_chunks(chunkdict: &mut Vec, ctx: &mut BuildContext) { - let mut chunk_sizes = std::collections::HashMap::new(); - - // Accumulate the uncompressed size for each chunk_blob_id - for chunk in chunkdict.iter() { - *chunk_sizes.entry(chunk.chunk_blob_id.clone()).or_insert(0) += - chunk.chunk_uncompressed_size as u64; - } - - // Find all chunk_blob_ids with a total uncompressed size > 4096 - let small_chunks: Vec = chunk_sizes - .into_iter() - .filter(|&(_, size)| size < ctx.v6_block_size()) - .inspect(|(id, _)| { - eprintln!( - "Warning: Blob with id '{}' is smaller than {} bytes.", - id, - ctx.v6_block_size() - ) - }) - .map(|(id, _)| id) - .collect(); - - // Retain only chunks with chunk_blob_id that has a total uncompressed size > 4096 - chunkdict.retain(|chunk| !small_chunks.contains(&chunk.chunk_blob_id)); - } - - /// Build root tree - pub fn build_root_tree() -> Result { - // inode 
- let mut inode = InodeWrapper::new(RafsVersion::V6); - inode.set_ino(0); - inode.set_uid(1000); - inode.set_gid(1000); - inode.set_projid(0); - inode.set_mode(0o660 | libc::S_IFDIR as u32); - inode.set_nlink(1); - inode.set_name_size("/".len()); - inode.set_rdev(0); - inode.set_blocks(256); - let node_info = NodeInfo { - explicit_uidgid: true, - src_dev: 66305, - src_ino: 24772610, - rdev: 0, - source: PathBuf::from("/"), - path: PathBuf::from("/"), - target: PathBuf::from("/"), - target_vec: vec![OsString::from("/")], - symlink: None, - xattrs: RafsXAttrs::default(), - v6_force_extended_inode: true, - }; - let root_node = Node::new(inode, node_info, 0); - let tree = Tree::new(root_node); - Ok(tree) - } - - /// Build child tree - fn build_child_tree( - ctx: &mut BuildContext, - blob_mgr: &mut BlobManager, - chunkdict: &[ChunkdictChunkInfo], - ) -> Result { - // node - let mut inode = InodeWrapper::new(RafsVersion::V6); - inode.set_ino(1); - inode.set_uid(0); - inode.set_gid(0); - inode.set_projid(0); - inode.set_mode(0o660 | libc::S_IFREG as u32); - inode.set_nlink(1); - inode.set_name_size("chunkdict".len()); - inode.set_rdev(0); - inode.set_blocks(256); - let node_info = NodeInfo { - explicit_uidgid: true, - src_dev: 66305, - src_ino: 24775126, - rdev: 0, - source: PathBuf::from("/"), - path: PathBuf::from("/chunkdict"), - target: PathBuf::from("/chunkdict"), - target_vec: vec![OsString::from("/"), OsString::from("/chunkdict")], - symlink: None, - xattrs: RafsXAttrs::new(), - v6_force_extended_inode: true, - }; - let mut node = Node::new(inode, node_info, 0); - - // insert chunks - Self::insert_chunks(ctx, blob_mgr, &mut node, chunkdict)?; - - let node_size: u64 = node - .chunks - .iter() - .map(|chunk| chunk.inner.uncompressed_size() as u64) - .sum(); - node.inode.set_size(node_size); - - // update child count - node.inode.set_child_count(node.chunks.len() as u32); - - let child = Tree::new(node); - child - .lock_node() - .v5_set_dir_size(ctx.fs_version, &child.children); - Ok(child) - } - - /// Insert chunks - fn insert_chunks( - ctx: &mut BuildContext, - blob_mgr: &mut BlobManager, - node: &mut Node, - chunkdict: &[ChunkdictChunkInfo], - ) -> Result<()> { - for chunk_info in chunkdict.iter() { - let chunk_size: u32 = chunk_info.chunk_compressed_size; - let file_offset = 1 as u64 * chunk_size as u64; - ctx.fs_version = RafsVersion::V6; - let mut chunk = ChunkWrapper::new(RafsVersion::V6); - - // update blob context - let (blob_index, blob_ctx) = - blob_mgr.get_or_cerate_blob_for_chunkdict(ctx, &chunk_info.chunk_blob_id)?; - if blob_ctx.blob_id.is_empty() { - blob_ctx.blob_id = chunk_info.chunk_blob_id.clone(); - } - let chunk_uncompressed_size = chunk_info.chunk_uncompressed_size; - let pre_d_offset = blob_ctx.current_uncompressed_offset; - blob_ctx.uncompressed_blob_size = pre_d_offset + chunk_uncompressed_size as u64; - blob_ctx.current_uncompressed_offset += chunk_uncompressed_size as u64; - - blob_ctx.blob_meta_header.set_ci_uncompressed_size( - blob_ctx.blob_meta_header.ci_uncompressed_size() - + size_of::() as u64, - ); - - // update chunk - let chunk_index = blob_ctx.alloc_chunk_index()?; - chunk.set_blob_index(blob_index); - chunk.set_index(chunk_index); - chunk.set_file_offset(file_offset); - chunk.set_compressed_size(chunk_info.chunk_compressed_size); - chunk.set_compressed_offset(chunk_info.chunk_compressed_offset); - chunk.set_uncompressed_size(chunk_info.chunk_uncompressed_size); - chunk.set_uncompressed_offset(chunk_info.chunk_uncompressed_offset); - 
chunk.set_id(RafsDigest::from_string(&chunk_info.chunk_digest)); - - debug!("chunk id: {}", chunk.id()); - - node.chunks.push(NodeChunk { - source: ChunkSource::Build, - inner: Arc::new(chunk.clone()), - }); - } - Ok(()) - } -} diff --git a/contrib/nydusify/cmd/nydusify.go b/contrib/nydusify/cmd/nydusify.go index ff00fde1528..94d06fe70b0 100644 --- a/contrib/nydusify/cmd/nydusify.go +++ b/contrib/nydusify/cmd/nydusify.go @@ -694,11 +694,6 @@ func main() { Usage: "Json configuration file for storage backend", EnvVars: []string{"BACKEND_CONFIG_FILE"}, }, - &cli.StringFlag{ - Name: "push-chunk-size", - Value: "0MB", - Usage: "Chunk size for pushing a blob layer in chunked", - }, &cli.StringFlag{ Name: "work-dir", @@ -731,13 +726,6 @@ func main() { if err != nil { return err } - pushChunkSize, err := humanize.ParseBytes(c.String("push-chunk-size")) - if err != nil { - return errors.Wrap(err, "invalid --push-chunk-size option") - } - if pushChunkSize > 0 { - logrus.Infof("will copy layer with chunk size %s", c.String("push-chunk-size")) - } _, arch, err := provider.ExtractOsArch(c.String("platform")) if err != nil { @@ -759,8 +747,6 @@ func main() { ExpectedArch: arch, AllPlatforms: c.Bool("all-platforms"), Platforms: c.String("platform"), - - PushChunkSize: int64(pushChunkSize), }) if err != nil { return err diff --git a/contrib/nydusify/pkg/chunkdict/generator/generator.go b/contrib/nydusify/pkg/chunkdict/generator/generator.go index 5e91e9390a4..15effe4ec21 100644 --- a/contrib/nydusify/pkg/chunkdict/generator/generator.go +++ b/contrib/nydusify/pkg/chunkdict/generator/generator.go @@ -54,8 +54,6 @@ type Opt struct { AllPlatforms bool Platforms string - - PushChunkSize int64 } // Generator generates chunkdict by deduplicating multiple nydus images @@ -119,7 +117,8 @@ func (generator *Generator) Generate(ctx context.Context) error { return err } - return os.RemoveAll(generator.WorkDir) + // return os.RemoveAll(generator.WorkDir) + return nil } // Pull the bootstrap of nydus image @@ -147,7 +146,7 @@ func (generator *Generator) pull(ctx context.Context) ([]string, error) { } func (generator *Generator) generate(_ context.Context, bootstrapSlice []string) (string, string, error) { - // Invoke "nydus-image generate" command + // Invoke "nydus-image chunkdict generate" command currentDir, _ := os.Getwd() builder := build.NewBuilder(generator.NydusImagePath) @@ -194,7 +193,7 @@ func (generator *Generator) push(ctx context.Context, chunkdictBootstrapPath str return err } - pvd, err := provider.New(generator.WorkDir, hosts(generator), 200, "v1", platformMC, generator.PushChunkSize) + pvd, err := provider.New(generator.WorkDir, hosts(generator), 200, "v1", platformMC, 0) if err != nil { return err } @@ -207,17 +206,20 @@ func (generator *Generator) push(ctx context.Context, chunkdictBootstrapPath str } } - // Pull a source image as a template - if err := pvd.Pull(ctx, generator.Sources[0]); err != nil { - if errdefs.NeedsRetryWithHTTP(err) { - pvd.UsePlainHTTP() - if err := pvd.Pull(ctx, generator.Sources[0]); err != nil { - return errors.Wrap(err, "try to pull image") + // Pull source image + for index := range generator.Sources { + if err := pvd.Pull(ctx, generator.Sources[index]); err != nil { + if errdefs.NeedsRetryWithHTTP(err) { + pvd.UsePlainHTTP() + if err := pvd.Pull(ctx, generator.Sources[index]); err != nil { + return errors.Wrap(err, "try to pull image") + } + } else { + return errors.Wrap(err, "pull source image") } - } else { - return errors.Wrap(err, "pull source image") } } + 
logrus.Infof("pulled source image %s", generator.Sources[0]) sourceImage, err := pvd.Image(ctx, generator.Sources[0]) if err != nil { @@ -239,18 +241,18 @@ func (generator *Generator) push(ctx context.Context, chunkdictBootstrapPath str defer sem.Release(1) sourceDesc := sourceDescs[idx] targetDesc := &sourceDesc + // Get the blob from backend - if bkd != nil { - descs, _targetDesc, err := pushBlobFromBackend(ctx, pvd, bkd, sourceDesc, *generator, chunkdictBootstrapPath, outputPath) - if err != nil { - return errors.Wrap(err, "get resolver") - } - if _targetDesc != nil { - targetDesc = _targetDesc - store := newStore(pvd.ContentStore(), descs) - pvd.SetContentStore(store) - } + descs, _targetDesc, err := pushBlobFromBackend(ctx, pvd, bkd, sourceDesc, *generator, chunkdictBootstrapPath, outputPath) + if err != nil { + return errors.Wrap(err, "get resolver") } + if _targetDesc != nil { + targetDesc = _targetDesc + store := newStore(pvd.ContentStore(), descs) + pvd.SetContentStore(store) + } + targetDescs[idx] = *targetDesc if err := pvd.Push(ctx, *targetDesc, generator.Target); err != nil { @@ -309,20 +311,45 @@ func pushBlobFromBackend( eg.Go(func() error { sem.Acquire(context.Background(), 1) defer sem.Release(1) + blobID := blobIDs[idx] blobDigest := digest.Digest("sha256:" + blobID) - blobSize, err := bkd.Size(blobID) - if err != nil { - return errors.Wrap(err, "get blob size") - } - blobSizeStr := humanize.Bytes(uint64(blobSize)) - logrus.WithField("digest", blobDigest).WithField("size", blobSizeStr).Infof("pushing blob from backend") - rc, err := bkd.Reader(blobID) - if err != nil { - return errors.Wrap(err, "get blob reader") + var blobSize int64 + var rc io.ReadCloser + + if bkd != nil { + rc, err = bkd.Reader(blobID) + if err != nil { + return errors.Wrap(err, "get blob reader") + } + blobSize, err = bkd.Size(blobID) + if err != nil { + return errors.Wrap(err, "get blob size") + } + } else { + imageDesc, err := generator.sourcesParser[0].Remote.Resolve(ctx) + if err != nil { + if strings.Contains(err.Error(), "x509: certificate signed by unknown authority") { + logrus.Warningln("try to enable \"--source-insecure\" / \"--target-insecure\" option") + } + return errors.Wrap(err, "resolve image") + } + rc, err = generator.sourcesParser[0].Remote.Pull(ctx, *imageDesc, true) + if err != nil { + return errors.Wrap(err, "get blob reader") + } + blobInfo, err := pvd.ContentStore().Info(ctx, blobDigest) + if err != nil { + return errors.Wrap(err, "get info from content store") + } + blobSize = blobInfo.Size } defer rc.Close() + + blobSizeStr := humanize.Bytes(uint64(blobSize)) + logrus.WithField("digest", blobDigest).WithField("size", blobSizeStr).Infof("pushing blob from backend") + blobDescs[idx] = ocispec.Descriptor{ Digest: blobDigest, Size: blobSize, @@ -349,6 +376,7 @@ func pushBlobFromBackend( logrus.WithField("digest", blobDigest).WithField("size", blobSizeStr).Infof("pushed blob from backend") return nil + }) }(idx) } diff --git a/docs/chunk-deduplication.md b/docs/chunk-deduplication.md index 97a8db2c3f4..259169551d1 100644 --- a/docs/chunk-deduplication.md +++ b/docs/chunk-deduplication.md @@ -1,8 +1,8 @@ -# Notice [WIP] Pending further revisionsNotice -# Probntroduction +# Probntroduction In container images, there are often a large number of duplicate files or content, and these duplicate parts occupy a large amount of storage space, especially in high-density deployment scenarios. 
As the number of Nydus images grows, it will bring many problems such as low storage space utilization and excessive consumption of bandwidth resources. To solve this problem, an effective deduplication mechanism needs to be designed. Unlike traditional OCI, which distributes images at a layer-granular level, the smallest unit of a Nydus image is a chunk, so the deduplication algorithm needs to operate in chunk units. At the same time, we want to deduplicate multiple aspects of the Nydus image, including between Nydus images and between different versions of the same Nydus image. Whichever method is used, the essence is to deduplicate the repeated chunks in the image: only one copy of each duplicate chunk is retained, and references to that chunk replace the other duplicates to reduce storage space occupation, so as to maximize the data transmission and storage capabilities of Nydus and improve the access speed and efficiency of the image. + # General idea The deduplication algorithm first needs to select the duplicate chunks in the image according to image information such as the number of occurrences of a chunk, the chunk size, the image to which the chunk belongs and the corresponding version, and generate a chunkdict. The chunkdict records the unique identifier or fingerprint of each chunk; only the chunkdict needs to be stored, and other images can refer to its chunks by reference. @@ -13,32 +13,43 @@ The deduplication algorithm is divided into two parts, the first part is the DBS 2. Extract the image information and call the DBSCAN clustering algorithm to deduplicate different images. 3. Deduplicate the dictionary content in 2, and call the exponential smoothing algorithm for each image separately for image version deduplication. 4. Get the deduplication dictionary generated by running the two algorithms and write it to disk. +5. Generate a chunkdict image and push it to the remote repository. # Algorithm detailed process ## Overall Input ```shell nydusify chunkdict generate --sources \ - localhost:5000:redis:nydus_7.0.1, \ - localhost:5000:redis:nydus_7.0.2,\ - localhost:5000:redis:nydus_7.0.3 \ + registry.com/redis:nydus_7.0.1, \ + registry.com/redis:nydus_7.0.2, \ + registry.com/redis:nydus_7.0.3 \ + --target registry.com/redis:nydus_chunkdict \ + --source-insecure --target-insecure + # Optional + --backend-config-file /path/to/backend-config.json \ + --backend-type oss ``` + +# Use the chunk dict image to reduce the incremental size of the new image +``` +nydusify convert \ + --source registry.com/redis:OCI_7.0.4 \ + --target registry.com/redis:nydus_7.0.4 \ + --chunk-dict registry.com/redis:nydus_chunkdict ``` -*** -`nydusify chunkdict generate` calls two commands `nydus-image chunkdict save` and `nydus-image chunkdict generate` to store image information into the database and generate a list of chunks to be deduplicated -Download multiple Nydus images in advance and put them into the repository as datasets, such as selecting 10 consecutive versions of redis and alpine as the image dataset, and execute the command `nydus-image chunkdict save` to store the information of the chunk and blob in the chunk and blob table of the database. +*** +`nydusify chunkdict generate` calls the subcommand `nydus-image chunkdict generate` to store image information into the database and generate a new bootstrap as the chunkdict bootstrap.
+Download multiple Nydus images in advance and put them into the repository as datasets, such as selecting 10 consecutive versions of redis and alpine as the image dataset, and execute the command `nydus-image chunkdict generate` to store the information of the chunk and blob in the chunk and blob table of the database. ```shell # Deposit multiple images into the database -nydus-image chunkdict save --bootstrap \ - ./output/localhost:5000:redis:nydus_7.0.1/nydus_bootstrap, \ - ./output/localhost:5000:redis:nydus_7.0.2/nydus_bootstrap, \ - ./output/localhost:5000:redis:nydus_7.0.3/nydus_bootstrap \ -``` -Execute the command `nydus-image chunkdict generate` to access the database and call the deduplication algorithm to generate the chunk list -```shell -# Call the deduplication algorithm to generate chunk list -nydus-image chunkdict generate --database \ - sqlite:///path/imageservice/contrib/nydusify/chunkdict.db +nydus-image chunkdict generate --source \ + /path/localhost:5000:redis:nydus_7.0.1/nydus_bootstrap, \ + /path/localhost:5000:redis:nydus_7.0.2/nydus_bootstrap, \ + /path/localhost:5000:redis:nydus_7.0.3/nydus_bootstrap \ + --bootstrap /path/to/chunkdict_bootstrap\ + --database /path/to/database.db\ + --output-json /path/to/nydus_bootstrap_output.json ``` *** @@ -77,10 +88,9 @@ where $C(R_x)$ represents the unique chunk set of all training set images in the **6.** Remove the chunk in the chunk dictionary selected in 5 for all images (training set and test set), and then repeat the operation 1-5 to generate the chunk dictionary until the maximum number of cycles is reached 7, or the discrete image ratio is greater than 80% of the total number of images. The principle of DBSCAN algorithm how to divide the cluster is shown in the diagram: -![在这里插入图片描述](https://img-blog.csdnimg.cn/5fba149720a34620873a5a2cb304d668.png#pic_center) -In this diagram, minPts = 4. Point A and the other red points are core points, because the area surrounding these points in an ε radius contain at least 4 points (including the point itself). Because they are all reachable from one another, they form a single cluster. Points B and C are not core points, but are reachable from A (via other core points) and thus belong to the cluster as well. Point N is a noise point that is neither a core point nor directly-reachable. - +![](https://img-blog.csdnimg.cn/5fba149720a34620873a5a2cb304d668.png#pic_center) **Remark:** This section of the picture and the associated DBSCAN algorithm description are referenced from : [https://en.wikipedia.org/wiki/DBSCAN](https://en.wikipedia.org/wiki/DBSCAN) + #### Algorithm 2 Deduplication between different versions of the image (exponential smoothing algorithm) *** **Basic principle:** Exponential smoothing algorithm is a method for time series data prediction and smoothing, the basic principle is to weighted average the data, give higher weight to the more recent repeated chunks, and constantly update the smoothing value, so the newer chunk has a greater impact on future forecasts, and the impact of older data will gradually weaken. @@ -102,16 +112,20 @@ where, $\alpha=0.5$ , $Y_{t-1}$ indicates whether the chunk appeared in the prev **5.** Choose a chunk dictionary that minimizes the test set's storage space. 
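To make the scoring rule above concrete, the sketch below computes the smoothed value of a single chunk from its per-version appearance history and keeps the chunks whose score clears a threshold. It is a minimal illustration only: the tuple type, the `0.5` value for alpha, and the `threshold` parameter are assumptions for the example and do not mirror the builder's real data structures.

```rust
/// S_t = alpha * Y_{t-1} + (1 - alpha) * S_{t-1}, with S_0 = 0.
/// `history[t]` is true when the chunk appears in version t (oldest first).
fn smoothed_score(history: &[bool], alpha: f64) -> f64 {
    let mut s = 0.0;
    for &appeared in history {
        let y = if appeared { 1.0 } else { 0.0 };
        s = alpha * y + (1.0 - alpha) * s;
    }
    s
}

/// Keep the chunks whose smoothed score reaches the threshold; `chunks`
/// maps a chunk digest to its appearance history across image versions.
fn select_for_chunkdict(chunks: &[(String, Vec<bool>)], threshold: f64) -> Vec<String> {
    chunks
        .iter()
        .filter(|(_, history)| smoothed_score(history, 0.5) >= threshold)
        .map(|(digest, _)| digest.clone())
        .collect()
}
```

Newer versions therefore dominate the score: a chunk that keeps reappearing in recent versions converges toward 1, while one that disappeared several versions ago decays toward 0 and falls out of the candidate dictionary.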
*** + + ### Exponential smoothing algorithm test table +Step 1: Download 10 OCI versions of an image and count their total size +Step 2: Convert the OCI images to Nydus images, then count the total size after conversion +Step 3: Select three versions of the image to generate the chunkdict, use the chunkdict to convert the remaining seven versions, then count the total size +deduplicating rate = (total_image_size(nydus) - total_image_size(nydus after deduplicating)) / total_image_size(nydus) + + + +| image_name | version number | total_image_size(OCI) | total_image_size(nydus) | total_image_size(nydus after deduplicating) | chunkdict_image_size | deduplicating rate | +|------------|----------------|-----------------------|-------------------------|---------------------------------------------|----------------------|-------------------| +| redis | 10 | 341.78 | 419.37 | 319.48 | 41.87 | 23.82% | +| ubuntu | 10 | 290.26 | 308.59 | 140.28 | 30.8 | 54.54% | +| alpine | 10 | 26.9 | 27.55 | 24.7 | 2.74 | 10.34% | -| image_name | version number | total_size | train_size | test_size | test_size after dedulicating | chunkdict_size | dedulicating rate | threshold | -|------------|----------------|------------|------------|-----------|------------------------------|----------------|-------------------|-----------| -| redis | 10 | 382.03 | 266.7 | 115.33 | 31.56 | 42.33 | 72.63% | 0.8-0.5 | -| python | 10 | 3509.91 | 2095.37 | 1414.54 | 123.33 | 588.61 | 91.28% | 0.8-0.5 | -| ubuntu | 10 | 317.33 | 222.11 | 95.22 | 12.27 | 39.61 | 87.11% | 0.8-0.5 | -| nginx | 10 | 396.86 | 284.4 | 112.46 | 50.54 | 83.54 | 55.06% | 0.8-0.5 | -| postgres | 10 | 1360.31 | 956.42 | 403.89 | 381.54 | 19.66 | 5.53% | 0.8-0.5 | -| alpine | 10 | 27.23 | 19.04 | 8.19 | 5.62 | 4.7 | 31.29% | 0.8-0.5 | -| node | 10 | 3698.44 | 2598.59 | 1099.85 | 429.39 | 649.42 | 60.96% | 0.8-0.5 | -| httpd | 10 | 561.99 | 385.79 | 176.2 | 85.7 | 54.15 | 51.36% | 0.8-0.5 | *** diff --git a/smoke/tests/image_test.go b/smoke/tests/image_test.go index 922da1464d3..69117ef3f39 100644 --- a/smoke/tests/image_test.go +++ b/smoke/tests/image_test.go @@ -5,9 +5,7 @@ package tests import ( - "encoding/json" "fmt" - "os" "path/filepath" "testing" @@ -159,7 +157,7 @@ func (i *ImageTestSuite) TestGenerateChunkdict() test.Generator { logLevel := "--log-level warn" nydusifyPath := ctx.Binary.Nydusify - // Test v6 + // Generate v6 chunkdict target1v6 := fmt.Sprintf("%s-nydus-%s", image1, uuid.NewString()) target2v6 := fmt.Sprintf("%s-nydus-%s", image2, uuid.NewString()) target3v6 := fmt.Sprintf("%s-nydus-%s", image3, uuid.NewString()) @@ -179,32 +177,32 @@ func (i *ImageTestSuite) TestGenerateChunkdict() test.Generator { ) tool.RunWithoutOutput(i.T, convertCmd3) - backendtype := "--backend-type oss" + // backendtype := "--backend-type oss" sourceinsecure := "--source-insecure" targetinsecure := "--target-insecure" - jsonData := `{ - "endpoint": "oss-cn-zhangjiakou.aliyuncs.com", - "access_key_id": "LTAI5tKHuSQQXVjSE7PgKYhf", - "access_key_secret": "FBYp1JDxlIZt8cCpFWpq3j9HYokw8a", - "bucket_name": "testcompact1" - }` + // jsonData := `{ + // "endpoint": "oss-cn-zhangjiakou.aliyuncs.com", + // "access_key_id": "LTAI5tKHuSQQXVjSE7PgKYhf", + // "access_key_secret": "FBYp1JDxlIZt8cCpFWpq3j9HYokw8a", + // "bucket_name": "testcompact1" + // }` - formattedData, err := json.MarshalIndent(json.RawMessage(jsonData), "", " ") - if err != nil { - fmt.Println("Error marshalling JSON:", err) - return - } - os.WriteFile("output.json", formattedData, 0644) + // formattedData, err
:= json.MarshalIndent(json.RawMessage(jsonData), "", " ") + // if err != nil { + // fmt.Println("Error marshalling JSON:", err) + // return + // } + // os.WriteFile("output.json", formattedData, 0644) - backendconfigfile := "--backend-config-file output.json" + // backendconfigfile := "--backend-config-file output.json" targetv6 := fmt.Sprintf("%s,%s,%s", target1v6, target2v6, target3v6) chunkdictv6 := fmt.Sprintf("%s-nydus-%s", image1, uuid.NewString()) generateCmd := fmt.Sprintf( - "%s %s chunkdict generate --sources %s --target %s %s %s %s %s --nydus-image %s --work-dir %s", - nydusifyPath, logLevel, targetv6, chunkdictv6, sourceinsecure, targetinsecure, backendtype, backendconfigfile, ctx.Binary.Builder, filepath.Join(ctx.Env.WorkDir, "generate"), + "%s %s chunkdict generate --sources %s --target %s %s %s --nydus-image %s --work-dir %s", + nydusifyPath, logLevel, targetv6, chunkdictv6, sourceinsecure, targetinsecure, ctx.Binary.Builder, filepath.Join(ctx.Env.WorkDir, "generate"), ) tool.RunWithoutOutput(i.T, generateCmd) @@ -214,7 +212,7 @@ func (i *ImageTestSuite) TestGenerateChunkdict() test.Generator { ) tool.RunWithoutOutput(i.T, checkCmd) - // Test v5 + // Generate v5 chunkdcit fsversion := "--fs-version 5" target1v5 := fmt.Sprintf("%s-nydus5-%s", image1, uuid.NewString()) target2v5 := fmt.Sprintf("%s-nydus5-%s", image2, uuid.NewString()) @@ -239,11 +237,43 @@ func (i *ImageTestSuite) TestGenerateChunkdict() test.Generator { chunkdictv5 := fmt.Sprintf("%s-nydus5-%s", image1, uuid.NewString()) generateCmd2 := fmt.Sprintf( - "%s %s chunkdict generate --sources %s --target %s %s %s %s %s --nydus-image %s --work-dir %s", - nydusifyPath, logLevel, targetv5, chunkdictv5, sourceinsecure, targetinsecure, backendtype, backendconfigfile, ctx.Binary.Builder, filepath.Join(ctx.Env.WorkDir, "generate"), + "%s %s chunkdict generate --sources %s --target %s %s %s --nydus-image %s --work-dir %s", + nydusifyPath, logLevel, targetv5, chunkdictv5, sourceinsecure, targetinsecure, ctx.Binary.Builder, filepath.Join(ctx.Env.WorkDir, "generate"), ) tool.RunWithoutOutput(i.T, generateCmd2) + // Test v6 chunkdict convert + target4v6 := fmt.Sprintf("%s-nydus-chunkdict-%s", image1, uuid.NewString()) + chunkdict1v6 := fmt.Sprintf("bootstrap:registry:%s", chunkdictv6) + convertCmd7 := fmt.Sprintf( + "%s %s convert --source %s --target %s --chunk-dict %s --nydus-image %s --work-dir %s", + ctx.Binary.Nydusify, logLevel, image1, target4v6, chunkdict1v6, ctx.Binary.Builder, ctx.Env.TempDir, + ) + tool.RunWithoutOutput(i.T, convertCmd7) + + checkCmd1 := fmt.Sprintf( + "%s %s check --target %s --nydus-image %s --nydusd %s --work-dir %s", + nydusifyPath, logLevel, target4v6, ctx.Binary.Builder, ctx.Binary.Nydusd, filepath.Join(ctx.Env.WorkDir, "check"), + ) + tool.RunWithoutOutput(i.T, checkCmd1) + + // Test v5 chunkdict convert + target4v5 := fmt.Sprintf("%s-nydus5-chunkdict-%s", image1, uuid.NewString()) + chunkdict1v5 := fmt.Sprintf("bootstrap:registry:%s", chunkdictv5) + + convertCmd8 := fmt.Sprintf( + "%s %s convert --source %s --target %s --chunk-dict %s --nydus-image %s %s --work-dir %s", + ctx.Binary.Nydusify, logLevel, image1, target4v5, chunkdict1v5, ctx.Binary.Builder, fsversion, ctx.Env.TempDir, + ) + tool.RunWithoutOutput(i.T, convertCmd8) + + + checkCmd2 := fmt.Sprintf( + "%s %s check --target %s --nydus-image %s --nydusd %s --work-dir %s", + nydusifyPath, logLevel, target4v5, ctx.Binary.Builder, ctx.Binary.Nydusd, filepath.Join(ctx.Env.WorkDir, "check"), + ) + tool.RunWithoutOutput(i.T, checkCmd2) + 
return "generateChunkdict", nil } } diff --git a/src/bin/nydus-image/deduplicate.rs b/src/bin/nydus-image/deduplicate.rs index e27d3359b41..8518d953948 100644 --- a/src/bin/nydus-image/deduplicate.rs +++ b/src/bin/nydus-image/deduplicate.rs @@ -8,9 +8,10 @@ use core::cmp::Ordering; use nydus_api::ConfigV2; use nydus_builder::BuildContext; use nydus_builder::ChunkdictChunkInfo; +use nydus_builder::ConversionType; use nydus_builder::Tree; use nydus_rafs::metadata::{RafsSuper, RafsVersion}; -use nydus_storage::device::BlobInfo; +use nydus_storage::device::{BlobFeatures, BlobInfo}; use rusqlite::{params, Connection}; use std::collections::HashSet; use std::collections::{BTreeMap, HashMap}; @@ -172,6 +173,24 @@ pub fn check_bootstrap_versions_consistency( Ok(()) } +// Get parent bootstrap context for chunkdict bootstrap. +pub fn update_ctx_from_parent_bootstrap( + ctx: &mut BuildContext, + bootstrap_path: &PathBuf, +) -> Result<()> { + let (sb, _) = RafsSuper::load_from_file(bootstrap_path, Arc::new(ConfigV2::default()), false)?; + + let config = sb.meta.get_config(); + config.check_compatibility(&sb.meta)?; + + if config.is_tarfs_mode { + ctx.conversion_type = ConversionType::TarToTarfs; + ctx.blob_features |= BlobFeatures::TARFS; + } + + Ok(()) +} + pub struct Deduplicate { db: D, } diff --git a/src/bin/nydus-image/main.rs b/src/bin/nydus-image/main.rs index b2cf763320d..cebdbf90cf3 100644 --- a/src/bin/nydus-image/main.rs +++ b/src/bin/nydus-image/main.rs @@ -13,7 +13,7 @@ extern crate log; extern crate serde_json; #[macro_use] extern crate lazy_static; -use crate::deduplicate::SqliteDatabase; +use crate::deduplicate::{update_ctx_from_parent_bootstrap, SqliteDatabase}; use std::convert::TryFrom; use std::fs::{self, metadata, DirEntry, File, OpenOptions}; use std::os::unix::fs::FileTypeExt; @@ -404,14 +404,6 @@ fn prepare_cmd_args(bti_string: &'static str) -> App { .required(true) .num_args(1..), ) - .arg( - Arg::new("digester") - .long("digester") - .help("Algorithm to digest data chunks:") - .required(false) - .default_value("blake3") - .value_parser(["blake3", "sha256"]), - ) .arg( Arg::new("verbose") .long("verbose") @@ -420,12 +412,6 @@ fn prepare_cmd_args(bti_string: &'static str) -> App { .action(ArgAction::SetTrue) .required(false), ) - .arg( - Arg::new("features") - .long("features") - .value_parser(["blob-toc"]) - .help("Enable/disable features") - ) ) ); @@ -1215,25 +1201,30 @@ impl Command { .unwrap(); check_bootstrap_versions_consistency(&mut build_ctx, &source_bootstrap_paths)?; + update_ctx_from_parent_bootstrap(&mut build_ctx, &source_bootstrap_paths[0])?; for (_, bootstrap_path) in source_bootstrap_paths.iter().enumerate() { - let path = bootstrap_path.display().to_string(); - info!("Bootstrap path is {}", path); - let path_name: Vec<&str> = path.split('/').collect(); + let path_name = bootstrap_path.as_path(); // Extract the image name and version name from the bootstrap directory - let bootstrap_dir = match path_name.get(path_name.len() - 2) { - Some(&bootstrap_dir) => bootstrap_dir.to_string(), + let bootstrap_dir = match path_name + .parent() + .and_then(|p| p.file_name().and_then(|f| f.to_str())) + { + Some(dir_str) => dir_str.to_string(), None => bail!("Invalid Bootstrap directory name"), }; let full_image_name: Vec<&str> = bootstrap_dir.split(':').collect(); let image_name = match full_image_name.get(full_image_name.len() - 2) { Some(&second_last) => second_last.to_string(), - None => bail!("Invalid image name"), + None => bail!( + "Invalid image name {:?}", + 
full_image_name.get(full_image_name.len() - 2) + ), }; - let version_name = match full_image_name.last() { + let image_tag = match full_image_name.last() { Some(&last) => last.to_string(), - None => bail!("Invalid version name"), + None => bail!("Invalid version name {:?}", full_image_name.last()), }; // For backward compatibility with v2.1. let config = Self::get_configuration(matches)?; @@ -1249,7 +1240,7 @@ impl Command { "sqlite" => { let mut deduplicate: Deduplicate<SqliteDatabase> = Deduplicate::<SqliteDatabase>::new(db_strs[1])?; - deduplicate.save_metadata(bootstrap_path, config, image_name, version_name)? + deduplicate.save_metadata(bootstrap_path, config, image_name, image_tag)? } _ => { bail!("Unsupported database type: {}, please use a valid database URI, such as 'sqlite:///path/to/chunkdict.db'.", db_strs[0]) } @@ -1295,12 +1286,12 @@ impl Command { } // Dump chunkdict to bootstrap - let features = Features::try_from( - matches - .get_one::<String>("features") - .map(|s| s.as_str()) - .unwrap_or_default(), - )?; + // let features = Features::try_from( + // matches + // .get_one::<String>("features") + // .map(|s| s.as_str()) + // .unwrap_or_default(), + // )?; let chunkdict_bootstrap_path = Self::get_bootstrap_storage(matches)?; let config = Self::get_configuration(matches).context("failed to get configuration information")?; @@ -1309,18 +1300,18 @@ impl Command { .set_blob_accessible(matches.get_one::<String>("config").is_some()); build_ctx.configuration = config; build_ctx.blob_storage = Some(chunkdict_bootstrap_path); - build_ctx.blob_features = BlobFeatures::CAP_TAR_TOC; - build_ctx.blob_features.insert(BlobFeatures::ALIGNED); + // build_ctx.blob_features = BlobFeatures::CAP_TAR_TOC; + // build_ctx.blob_features.insert(BlobFeatures::ALIGNED); // Build_ctx.blob_features.insert(BlobFeatures::CHUNK_INFO_V2); // Build_ctx.blob_features.insert(BlobFeatures::ENCRYPTED); - build_ctx.features = features; - - let digester = matches - .get_one::<String>("digester") - .map(|s| s.as_str()) - .unwrap_or_default() - .parse()?; - let mut blob_mgr = BlobManager::new(digester); + // build_ctx.features = features; + + // let digester = matches + // .get_one::<String>("digester") + // .map(|s| s.as_str()) + // .unwrap_or_default() + // .parse()?; + let mut blob_mgr = BlobManager::new(build_ctx.digester); let bootstrap_path = Self::get_bootstrap_storage(matches)?; let mut bootstrap_mgr = BootstrapManager::new(Some(bootstrap_path), None);
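For reviewers tracing the new flow end to end, the condensed sketch below shows how the pieces changed in this diff are intended to fit together once CLI parsing succeeds. It is a hand-written summary under stated assumptions, not code from the patch: the helper function, its signature, and the `chunkdict` argument (the deduplicated `Vec<ChunkdictChunkInfo>` coming out of the database step) are illustrative, and error handling and imports are omitted.

```rust
// Assumed wiring of the chunkdict build after this change; names follow the
// diff, but this helper itself is illustrative only.
fn build_chunkdict_bootstrap(
    build_ctx: &mut BuildContext,
    bootstrap_mgr: &mut BootstrapManager,
    source_bootstrap_paths: &[PathBuf],
    chunkdict: Vec<ChunkdictChunkInfo>,
) -> Result<BuildOutput> {
    // Inherit conversion type and blob features from the first source bootstrap.
    update_ctx_from_parent_bootstrap(build_ctx, &source_bootstrap_paths[0])?;
    // The blob manager reuses the digester carried by the build context,
    // replacing the removed --digester CLI flag.
    let mut blob_mgr = BlobManager::new(build_ctx.digester);
    // Build the chunkdict bootstrap from the deduplicated chunk list.
    Generator::generate(build_ctx, bootstrap_mgr, &mut blob_mgr, chunkdict)
}
```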