diff --git a/doc/object_deduplication.md b/doc/object_deduplication.md new file mode 100644 index 0000000000000000000000000000000000000000..86b5127ee5847fe1ffa33f845b66cb63661e2897 --- /dev/null +++ b/doc/object_deduplication.md @@ -0,0 +1,94 @@ +# Design notes for Git object deduplication + +## Problem description + +Forking projects is a common workflow in GitLab. When a user forks a +project, GitLab creates a full clone of the repository associated with +the fork parent. If the repository is large and gets forked often, this +leads to a lot of duplicated Git objects which take up disk space. + +We are adding Git object deduplication to GitLab to address this +problem. + +## Solution overview + +We have [chosen](https://gitlab.com/gitlab-org/gitaly/issues/1331) a +design where Git objects shared between repositories on the same Gitaly +storage shard can be shared via a **pool repository**. For each member +of the pool, the pool repository has a git remote pointing to it. +Conversely, each member points to the pool repository as an **alternate +object directory** using the `objects/info/alternates` file in the +repository. + +From a Gitaly point of view this is a very transparent solution. Almost +all RPCs should continue to work without modification on repositories +that are linked to a pool. This is because Git natively supports +`objects/info/alternates`. + +What is new is that GitLab must manage the pool repository and the pool +relations in Gitaly. + +## Limitations + +- Pools are local to Gitaly storage shards. +- Repositories in a pool can see all objects in the pool repository if + they know the object ID (SHA1). + +The second property means that we should not mix repositories of +projects with different visibility scope (e.g. public vs private) in the +same storage pool. + +This also means that we cannot conflate the project fork network +relation with the repository pool relation. 
Storage pools will be +restricted within fork networks to public projects that live on the same +Gitaly shard. + +If a project in a storage pool changes visibility from public to private +we must extract it from the storage pool. + +## First iteration + +The first iteration of object deduplication is limited to the following scope: + +- Only works with new forked projects +- The parent of the forked project is using hashed storage +- The new forked project is using hashed storage +- Pool repositories are created once and do not pull in new git + objects. This means that the deduplication percentage will fall over + time as new objects get pushed to the repositories in a pool. We + will address this in a later iteration ("pool grooming") +### Scenarios + +#### Create pool from existing repo + +- SQL: create pool object +- Gitaly: create pool repo from existing repo. Create remote pointing + to existing repo, and clear top level refs in pool. +- SQL: link project to pool +- Gitaly: finalize link: create (remote and) objects/info/alternates + connection for existing repo + +If this fails in the middle there is no data loss in the existing repo. + +#### Clone new repo from origin in pool (e.g. a fork) + +- SQL: create project linked to pool. Project is in "being cloned" + state +- Gitaly: create new repo with local disk clone from origin +- Gitaly: create remote and objects/info/alternates connection for new + repo +- SQL: clear project "being cloned" state + +#### Project leaves pool (e.g. fork taken private) + +- SQL: mark project as "repo transitioning to private". git pushes are + blocked +- Gitaly: copy needed objects from pool with git repack -a +- Gitaly: remove objects/info/alternates link and pool remote +- SQL: unmark "repo transitioning to private". git pushes no longer + blocked + +This is problematic. If we fail in the middle, git pushes remain +blocked. Do we really need to block git pushes during this operation? 
If +we fail during the Gitaly parts we can re-create the pool links and +restart the repack. diff --git a/helper.rb b/helper.rb new file mode 100644 index 0000000000000000000000000000000000000000..eb022451ad9dfdfab18a0af0312438866bcf82b2 --- /dev/null +++ b/helper.rb @@ -0,0 +1,30 @@ +require 'open3' + +def run_pipeline(pipeline, dir) + warn "#{File.basename(dir)}$ #{pipeline.map { |c| c.join(' ') }.join(' | ')}" + + statuses = Open3.pipeline(*pipeline, chdir: dir) + + statuses.all? { |s| s && s.success? } +end + +def run_pipeline!(pipeline, dir) + abort "failed" unless run_pipeline(pipeline, dir) +end + +# Note: tricks with the 'dir' argument and File.basename are there only +# to make the script output prettier. +def run!(cmd, dir=nil, env={}) + abort "failed" unless run(cmd, dir, env) +end + +def run(cmd, dir=nil, env={}) + dir ||= Dir.pwd + cmd_s = cmd.join(' ') + warn "#{File.basename(dir)}$ #{cmd_s}" + start = Time.now + status = system(env, *cmd, chdir: dir) + delta = Time.now - start + warn sprintf("time: %.3fs\n", delta) if delta > 1.0 + status +end diff --git a/test-dedup b/test-dedup new file mode 100755 index 0000000000000000000000000000000000000000..42130c94d63189429e60be5c5cea77b84b71ebe9 --- /dev/null +++ b/test-dedup @@ -0,0 +1,158 @@ +#!/usr/bin/env ruby +require 'tempfile' +require 'fileutils' + +require_relative 'helper' + +TEST_REPO = ENV.fetch('TEST_REPO') +TMP_ROOT = File.absolute_path(Dir.mktmpdir) + +at_exit do + warn 'deleting tmp data' + FileUtils.rm_rf(TMP_ROOT) +end + +def main + children = {} + %w[repo1 repo2 repo3].each { |name| children[name] = create_child(name) } + + # Create a pool based on repo1 + # + pool = File.join(TMP_ROOT, 'pool.git') + source_name, source_path = 'repo1', children['repo1'] + create_pool_repository(source_name, source_path, pool) + link_repository_to_pool(pool, source_name, source_path) + show_sizes([pool, children['repo1']]) + + # Clone a new repo, repo4, from repo1 (fork parent) within the pool + # + 
target_name, target_path = 'repo4', child_path('repo4') + children[target_name] = target_path + prepare_clone_in_pool(pool, source_path, target_name, target_path) + link_repository_to_pool(pool, target_name, target_path) + show_sizes([pool, source_path, target_path]) + + # Repo4 leaves pool (e.g. fork switches to private) + # + repo_name, repo_path = 'repo4', child_path('repo4') + # To make things interesting, ensure repo4 is deduplicated + run!(%w[git repack --quiet -Ald], repo_path) + run!(%w[find objects -type f], repo_path) + show_sizes([repo_path]) + + # Enter critical section where repo may not receive pushes (???) + prepare_leave_pool(repo_path) + unlink_repository_from_pool(pool, repo_name, repo_path) + # Exit critical section + + # Sanity checks + run!(%w[find objects -type f], repo_path) + run!(%w[git fsck --connectivity-only], repo_path) + show_sizes([repo_path]) +end + +def create_pool_repository(source_name, source_path, pool) + puts '--- CreatePoolRepository' + # Use --local to get Git to use hardlinks, which avoids copying objects and packfiles + run!(%W[git clone --quiet --bare --local #{File.basename(source_path)} #{File.basename(pool)}], File.dirname(pool)) + + # After the initial clone, the pool repository has top-level refs. This + # is not what we want, we want these refs to live under refs/remotes. + create_remote_in_pool(pool, source_name, source_path) + run!(%W[git fetch --quiet #{source_name}], pool) + + # Now we can delete the top-level refs in the pool + run!(%w[git remote remove origin], pool) + delete_top_level_refs(pool) + puts '---' +end + +def create_remote_in_pool(pool, child_name, child_path) + # It is important that the remote is a relative path, so that it is + # stable across changing git data mountpoints. 
+ unless run(%W[git remote add #{child_name} ../#{File.basename(child_path)}], pool) + run!(%W[git remote set-url #{child_name} ../#{File.basename(child_path)}], pool) + end + + # this ensures every ref, including remote refs/tags, gets fetched + run!(%W[git config remote.#{child_name}.fetch +refs/*:refs/remotes/#{child_name}/*], pool) + + # this prevents remote tags being fetched into the top-level refs/tags namespace + run!(%W[git config remote.#{child_name}.tagopt --no-tags], pool) +end + +def link_repository_to_pool(pool, child_name, child_path) + puts '--- LinkRepositoryToPool' + create_remote_in_pool(pool, child_name, child_path) + + open(File.join(child_path, 'objects/info/alternates'), 'w') do |f| + # It is important that this is a relative path, so that it is stable across changing git data mountpoints. + f.puts File.join('..', '..', File.basename(pool), 'objects') + end + + puts '---' +end + +def delete_top_level_refs(repo) + # Delete every ref that does not start with 'refs/remotes/' + run_pipeline!([ + %w[git for-each-ref --format=delete\ %(refname)], + %w[grep -v ^delete\ refs/remotes/], + %w[git update-ref --stdin], + ], repo) +end + +def show_sizes(paths) + paths.each do |p| + run!(%W[du -sh #{File.basename(p)}], File.dirname(p)) + end +end + +def create_child(name) + path = child_path(name) + run!(%W[git clone --quiet --bare #{TEST_REPO} #{File.basename(path)}], File.dirname(path)) + clean_child(path) + path +end + +def clean_child(path) + run!(%W[git remote remove origin], path) + run!(%W[rm -rf hooks], path) +end + +def child_path(name) + File.join(TMP_ROOT, name + '.git') +end + +def prepare_clone_in_pool(pool, source_path, target_name, target_path) + puts '--- PrepareCloneInPool' + + # After this clone, objects/info/alternates is already set up but it + # points to an absolute path. This is not what we want. This gets fixed + # later by link_repository_to_pool. 
+ # + run!(%W[git clone --quiet --bare --local #{File.basename(source_path)} #{File.basename(target_path)}], File.dirname(target_path)) + clean_child(target_path) + + puts '---' +end + +def prepare_leave_pool(repo_path) + puts '--- PrepareLeavePool' + + # Because we are running 'git repack' _without_ '-l', Git will gather + # all the objects it needs from the pool repository into a new + # non-deduplicated packfile. + run!(%w[git repack --quiet -a], repo_path) + + puts '---' +end + +def unlink_repository_from_pool(pool, repo_name, repo_path) + puts '--- UnlinkRepositoryFromPool' + run!(%w[rm objects/info/alternates], repo_path) + run!(%W[git remote remove #{repo_name}], pool) + puts '---' +end + +main diff --git a/test-push-alternates b/test-push-alternates new file mode 100755 index 0000000000000000000000000000000000000000..94d52a71c8f08d35ae160b5baf528fa5649b6f42 --- /dev/null +++ b/test-push-alternates @@ -0,0 +1,95 @@ +#!/usr/bin/env ruby +require 'tempfile' +require 'fileutils' +require 'securerandom' + +require_relative 'helper' + +TEST_REPO = ENV.fetch('TEST_REPO') +TMP_ROOT = File.absolute_path(Dir.mktmpdir) + +at_exit do + warn '... 
deleting tmp data' + FileUtils.rm_rf(TMP_ROOT) +end + +def main + pool = File.join(TMP_ROOT, 'pool.git') + run!(%W[git clone --quiet --bare #{TEST_REPO} #{pool}]) + + child = File.join(TMP_ROOT, 'child.git') + run!(%W[git clone --quiet --bare --shared #{pool} #{child}]) + run!(%w[git remote remove origin], child) + + # Verify that child shares objects with pool + run!(%w[du -sh] + [pool, child]) + run!(%w[cat objects/info/alternates], child) + + # Create a dangling object in the pool repository + commit_id = new_commit(pool) + + # Make objects old; to see if mtimes get updated later + backdate_objects(pool) + run!(%w[find objects -type f -exec ls -l {} ;], pool) + + # Make the child refer to the dangling object + ref = new_test_ref + run!(%W[git update-ref #{ref} #{commit_id}], child) + run!(%W[git cat-file -p #{ref}], child) + + # Print mtimes + puts "--- looking for: #{commit_id}" + run!(%w[find objects -type f -exec ls -l {} ;], pool) + run!(%w[find objects -type f -exec ls -l {} ;], child) + + # Does not look good, no sign of mtime change. How will the pool know the object is used? + # Let's try freshening the commit. + run_pipeline!([%W[git cat-file commit #{commit_id}], %w[git hash-object -t commit -w --stdin]], child) + run!(%w[find objects -type f -exec ls -l {} ;], pool) + run!(%w[find objects -type f -exec ls -l {} ;], child) + # Freshening works! 
+ + # Different scenario: 'git push' + + pusher = File.join(TMP_ROOT, 'pusher.git') + run!(%W[git clone --quiet --bare #{TEST_REPO} #{pusher}]) + commit_id = new_commit(pusher) + ref = new_test_ref + run!(%W[git update-ref #{ref} #{commit_id}], pusher) + + obj_name = "objects/#{commit_id[0,2]}/#{commit_id[2, commit_id.size]}" + FileUtils.mkdir_p(File.dirname(File.join(pool, obj_name))) + run!(%W[cp #{obj_name} #{pool}/#{obj_name}], pusher) + run!(%w[git repack --quiet -k -d -a], pool) + backdate_objects(pool) + + run!(%w[git config advice.objectNameWarning false], pusher) + run!(%w[git config advice.objectNameWarning false], child) + + run!(%W[git push #{child} #{ref}], pusher) + puts "--- looking for: #{commit_id}" + run!(%w[find objects -type f -exec ls -l {} ;], pool) + run!(%w[find objects -type f -exec ls -l {} ;], child) + run!(%W[git show #{commit_id}], pool) +end + +def new_commit(repo) + commit_id = IO.popen( + %W[git commit-tree -p HEAD -m hello#{SecureRandom.hex(2)} HEAD^{tree}], + chdir: repo, + &:read + ).chomp + abort "git commit-tree failed" unless $?.success? + + commit_id +end + +def new_test_ref + "refs/heads/test-ref-#{SecureRandom.hex(4)}" +end + +def backdate_objects(repo) + run!(%w[find objects -exec touch -t 200001010101 {} ;], repo) +end + +main