From ede6d31b4b88135f124032a8c3544b1640e50584 Mon Sep 17 00:00:00 2001 From: Jacob Vosmaer Date: Mon, 24 Sep 2018 18:36:29 +0200 Subject: [PATCH 01/23] Add script that explores dedup --- test-dedup | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 test-dedup diff --git a/test-dedup b/test-dedup new file mode 100644 index 00000000000..451bdefb36a --- /dev/null +++ b/test-dedup @@ -0,0 +1,70 @@ +#!/usr/bin/env ruby +require 'securerandom' +require 'tempfile' +require 'fileutils' + +TEST_REPO = ENV.fetch('TEST_REPO') +TMP_ROOT = Dir.mktmpdir + +def main + at_exit { FileUtils.rm_rf(TMP_ROOT) } + + children = {} + %w[repo1 repo2].each { |name| children[name] = create_child(name) } + + pool = File.join(TMP_ROOT, 'pool.git') + run!(%W[git init --bare #{pool}]) + show_sizes([pool]) + + children.each do |name, path| + run!(%W[git remote add #{name} #{path}], pool) + + # this ensures every ref, including remote refs/tags, gets fetched + run!(%W[git config remote.#{name}.fetch +refs/*:refs/remotes/#{name}/*], pool) + + # this prevents remote tags being fetched into the top-level refs/tags namespace + run!(%W[git config remote.#{name}.tagopt --no-tags], pool) + + open(File.join(path, 'objects/info/alternates'), 'w') do |f| + f.puts File.join(pool, 'objects') + end + end + run!(%W[git fetch --all --quiet], pool) + + show_sizes([pool] + children.values) + + children.each do |name, path| + # -l is important. 
Not sure about -A vs -a + run!(%w[git repack -Ald], path) + end + + show_sizes([pool] + children.values) +end + +def show_sizes(paths) + paths.each do |p| + run!(%W[du -sh #{p}]) + #run!(%W[find #{p}/objects -type f]) + end +end + +def create_child(name) + path = File.join(TMP_ROOT, name + '.git') + run!(%W[git clone --quiet --bare #{TEST_REPO} #{path}]) + run!(%W[git remote remove origin], path) + FileUtils.rm_rf(File.join(path, 'hooks')) + path +end + + +def run!(cmd, dir=nil) + dir ||= Dir.pwd + cmd_s = cmd.join(' ') + warn "#{File.basename(dir)}$ #{cmd_s}" + start = Time.now + abort "failed" unless system(*cmd, chdir: dir) + delta = Time.now - start + warn sprintf("time: %.3fs\n", delta) if delta > 1.0 +end + +main -- GitLab From febace6513e159a8946aa4f1af7a8c82796526a2 Mon Sep 17 00:00:00 2001 From: Jacob Vosmaer Date: Mon, 24 Sep 2018 18:46:16 +0200 Subject: [PATCH 02/23] Make output nicer --- test-dedup | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) mode change 100644 => 100755 test-dedup diff --git a/test-dedup b/test-dedup old mode 100644 new mode 100755 index 451bdefb36a..1d1d9146435 --- a/test-dedup +++ b/test-dedup @@ -13,7 +13,8 @@ def main %w[repo1 repo2].each { |name| children[name] = create_child(name) } pool = File.join(TMP_ROOT, 'pool.git') - run!(%W[git init --bare #{pool}]) + run!(%W[git init --bare #{File.basename(pool)}], File.dirname(pool)) + run!(%w[git config advice.objectNameWarning false], pool) show_sizes([pool]) children.each do |name, path| @@ -43,15 +44,15 @@ end def show_sizes(paths) paths.each do |p| - run!(%W[du -sh #{p}]) - #run!(%W[find #{p}/objects -type f]) + run!(%W[du -sh #{File.basename(p)}], File.dirname(p)) end end def create_child(name) path = File.join(TMP_ROOT, name + '.git') - run!(%W[git clone --quiet --bare #{TEST_REPO} #{path}]) + run!(%W[git clone --quiet --bare #{TEST_REPO} #{File.basename(path)}], File.dirname(path)) run!(%W[git remote remove origin], path) + run!(%w[git config 
advice.objectNameWarning false], path) FileUtils.rm_rf(File.join(path, 'hooks')) path end -- GitLab From 3365f91cd60556e3de2da3f060c9b76eaecb8f76 Mon Sep 17 00:00:00 2001 From: Jacob Vosmaer Date: Mon, 24 Sep 2018 18:49:21 +0200 Subject: [PATCH 03/23] Move alternates --- test-dedup | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test-dedup b/test-dedup index 1d1d9146435..4eead3cf02f 100755 --- a/test-dedup +++ b/test-dedup @@ -25,16 +25,16 @@ def main # this prevents remote tags being fetched into the top-level refs/tags namespace run!(%W[git config remote.#{name}.tagopt --no-tags], pool) - - open(File.join(path, 'objects/info/alternates'), 'w') do |f| - f.puts File.join(pool, 'objects') - end end run!(%W[git fetch --all --quiet], pool) show_sizes([pool] + children.values) children.each do |name, path| + open(File.join(path, 'objects/info/alternates'), 'w') do |f| + f.puts File.join(pool, 'objects') + end + # -l is important. Not sure about -A vs -a run!(%w[git repack -Ald], path) end -- GitLab From a1890c7997c99d10a27d8c3d90af1b017c4f01c0 Mon Sep 17 00:00:00 2001 From: Jacob Vosmaer Date: Wed, 26 Sep 2018 12:41:43 +0200 Subject: [PATCH 04/23] Make pool creation efficient --- test-dedup | 51 ++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 40 insertions(+), 11 deletions(-) diff --git a/test-dedup b/test-dedup index 4eead3cf02f..349a8f31338 100755 --- a/test-dedup +++ b/test-dedup @@ -1,5 +1,5 @@ #!/usr/bin/env ruby -require 'securerandom' +require 'open3' require 'tempfile' require 'fileutils' @@ -10,24 +10,27 @@ def main at_exit { FileUtils.rm_rf(TMP_ROOT) } children = {} - %w[repo1 repo2].each { |name| children[name] = create_child(name) } + %w[repo1 repo2 repo3].each { |name| children[name] = create_child(name) } pool = File.join(TMP_ROOT, 'pool.git') - run!(%W[git init --bare #{File.basename(pool)}], File.dirname(pool)) + source_name, source_path = children.first + + # This creates a bunch of refs in the top-level 
namespace we don't want. + # However, they speed up the first fetch, so we keep them for now. + run!(%W[git clone --bare --local #{source_path} #{File.basename(pool)}], File.dirname(pool)) + run!(%w[git config advice.objectNameWarning false], pool) show_sizes([pool]) children.each do |name, path| - run!(%W[git remote add #{name} #{path}], pool) - - # this ensures every ref, including remote refs/tags, gets fetched - run!(%W[git config remote.#{name}.fetch +refs/*:refs/remotes/#{name}/*], pool) - - # this prevents remote tags being fetched into the top-level refs/tags namespace - run!(%W[git config remote.#{name}.tagopt --no-tags], pool) + add_child_to_pool(pool, name, path) end + run!(%W[git fetch --all --quiet], pool) + # Now we can get rid of the top-level refs in pool.git. + delete_top_level_refs(pool) + show_sizes([pool] + children.values) children.each do |name, path| @@ -36,12 +39,38 @@ def main end # -l is important. Not sure about -A vs -a - run!(%w[git repack -Ald], path) + run!(%w[git repack -Ald --quiet], path) end show_sizes([pool] + children.values) end +def add_child_to_pool(pool, child_name, child_path) + run!(%W[git remote add #{child_name} #{child_path}], pool) + + # this ensures every ref, including remote refs/tags, gets fetched + run!(%W[git config remote.#{child_name}.fetch +refs/*:refs/remotes/#{child_name}/*], pool) + + # this prevents remote tags being fetched into the top-level refs/tags namespace + run!(%W[git config remote.#{child_name}.tagopt --no-tags], pool) +end + +def delete_top_level_refs(repo) + run_pipeline!([ + %w[git for-each-ref --format=delete\ %(refname)], + %w[grep -v ^delete\ refs/remotes/], + %w[git update-ref --stdin], + ], repo) +end + +def run_pipeline!(pipeline, dir) + warn "#{File.basename(dir)}$ #{pipeline.map { |c| c.join(' ') }.join(' | ')}" + + statuses = Open3.pipeline(*pipeline, chdir: dir) + + abort "failed" unless statuses.all? { |s| s && s.success? 
} +end + def show_sizes(paths) paths.each do |p| run!(%W[du -sh #{File.basename(p)}], File.dirname(p)) -- GitLab From 349b02228592f20525e749776b3c65413cbc2048 Mon Sep 17 00:00:00 2001 From: Jacob Vosmaer Date: Wed, 26 Sep 2018 12:59:44 +0200 Subject: [PATCH 05/23] clean up script output --- test-dedup | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/test-dedup b/test-dedup index 349a8f31338..c3a1820b359 100755 --- a/test-dedup +++ b/test-dedup @@ -7,7 +7,10 @@ TEST_REPO = ENV.fetch('TEST_REPO') TMP_ROOT = Dir.mktmpdir def main - at_exit { FileUtils.rm_rf(TMP_ROOT) } + at_exit do + warn 'deleting tmp data' + FileUtils.rm_rf(TMP_ROOT) + end children = {} %w[repo1 repo2 repo3].each { |name| children[name] = create_child(name) } @@ -17,7 +20,7 @@ def main # This creates a bunch of refs in the top-level namespace we don't want. # However, they speed up the first fetch, so we keep them for now. - run!(%W[git clone --bare --local #{source_path} #{File.basename(pool)}], File.dirname(pool)) + run!(%W[git clone --quiet --bare --local #{source_path} #{File.basename(pool)}], File.dirname(pool)) run!(%w[git config advice.objectNameWarning false], pool) show_sizes([pool]) -- GitLab From 0092c4cfa3f6d2b6d4305dbb7ed100460c08afe3 Mon Sep 17 00:00:00 2001 From: Jacob Vosmaer Date: Wed, 26 Sep 2018 18:24:05 +0200 Subject: [PATCH 06/23] Move at_exit --- test-dedup | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/test-dedup b/test-dedup index c3a1820b359..f0a429dfa10 100755 --- a/test-dedup +++ b/test-dedup @@ -6,11 +6,12 @@ require 'fileutils' TEST_REPO = ENV.fetch('TEST_REPO') TMP_ROOT = Dir.mktmpdir +at_exit do + warn 'deleting tmp data' + FileUtils.rm_rf(TMP_ROOT) +end + def main - at_exit do - warn 'deleting tmp data' - FileUtils.rm_rf(TMP_ROOT) - end children = {} %w[repo1 repo2 repo3].each { |name| children[name] = create_child(name) } -- GitLab From 554ba7f45cbd3a26ea92d832278ca5ee8d656c65 Mon Sep 17 00:00:00 2001 From: Jacob 
Vosmaer Date: Wed, 26 Sep 2018 18:26:51 +0200 Subject: [PATCH 07/23] Use relative paths --- test-dedup | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/test-dedup b/test-dedup index f0a429dfa10..52422088724 100755 --- a/test-dedup +++ b/test-dedup @@ -12,7 +12,6 @@ at_exit do end def main - children = {} %w[repo1 repo2 repo3].each { |name| children[name] = create_child(name) } @@ -39,7 +38,7 @@ def main children.each do |name, path| open(File.join(path, 'objects/info/alternates'), 'w') do |f| - f.puts File.join(pool, 'objects') + f.puts File.join('..', '..', File.basename(pool), 'objects') end # -l is important. Not sure about -A vs -a @@ -50,13 +49,13 @@ def main end def add_child_to_pool(pool, child_name, child_path) - run!(%W[git remote add #{child_name} #{child_path}], pool) + run!(%W[git remote add #{child_name} ../#{File.basename(child_path)}], pool) - # this ensures every ref, including remote refs/tags, gets fetched - run!(%W[git config remote.#{child_name}.fetch +refs/*:refs/remotes/#{child_name}/*], pool) + # this ensures every ref, including remote refs/tags, gets fetched + run!(%W[git config remote.#{child_name}.fetch +refs/*:refs/remotes/#{child_name}/*], pool) - # this prevents remote tags being fetched into the top-level refs/tags namespace - run!(%W[git config remote.#{child_name}.tagopt --no-tags], pool) + # this prevents remote tags being fetched into the top-level refs/tags namespace + run!(%W[git config remote.#{child_name}.tagopt --no-tags], pool) end def delete_top_level_refs(repo) -- GitLab From 0a379911eeb52b7571318e5f4371fa58bce7d099 Mon Sep 17 00:00:00 2001 From: Jacob Vosmaer Date: Fri, 28 Sep 2018 18:29:23 +0200 Subject: [PATCH 08/23] Add efficient "clone in pool" --- test-dedup | 64 ++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 50 insertions(+), 14 deletions(-) diff --git a/test-dedup b/test-dedup index 52422088724..c63da899a57 100755 --- a/test-dedup +++ b/test-dedup @@ -4,7 +4,7 
@@ require 'tempfile' require 'fileutils' TEST_REPO = ENV.fetch('TEST_REPO') -TMP_ROOT = Dir.mktmpdir +TMP_ROOT = File.absolute_path(Dir.mktmpdir) at_exit do warn 'deleting tmp data' @@ -21,31 +21,48 @@ def main # This creates a bunch of refs in the top-level namespace we don't want. # However, they speed up the first fetch, so we keep them for now. run!(%W[git clone --quiet --bare --local #{source_path} #{File.basename(pool)}], File.dirname(pool)) - + run!(%w[git remote remove origin], pool) run!(%w[git config advice.objectNameWarning false], pool) show_sizes([pool]) - children.each do |name, path| - add_child_to_pool(pool, name, path) - end - + add_child_to_pool(pool, source_name, source_path) run!(%W[git fetch --all --quiet], pool) # Now we can get rid of the top-level refs in pool.git. delete_top_level_refs(pool) + children.each do |name, path| + next if name == source_name + + add_child_to_pool(pool, name, path) + end + + run!(%w[git remote -v], pool) show_sizes([pool] + children.values) - children.each do |name, path| - open(File.join(path, 'objects/info/alternates'), 'w') do |f| - f.puts File.join('..', '..', File.basename(pool), 'objects') - end + children.each do |_, path| + set_child_alternates(path, pool) # -l is important. 
Not sure about -A vs -a run!(%w[git repack -Ald --quiet], path) end show_sizes([pool] + children.values) + + target_name, target_path = 'repo4', child_path('repo4') + children[target_name] = target_path + clone_in_pool(pool, children['repo2'], target_name, target_path) + + show_sizes([pool] + children.values) + + run_pipeline([%w[git for-each-ref], %w[head -30]], target_path) + run!(%w[cat objects/info/alternates], target_path) +end + +def set_child_alternates(child, pool) + open(File.join(child, 'objects/info/alternates'), 'w') do |f| + f.puts File.join('..', '..', File.basename(pool), 'objects') + end end def add_child_to_pool(pool, child_name, child_path) @@ -66,14 +83,19 @@ def delete_top_level_refs(repo) ], repo) end -def run_pipeline!(pipeline, dir) +def run_pipeline(pipeline, dir) warn "#{File.basename(dir)}$ #{pipeline.map { |c| c.join(' ') }.join(' | ')}" statuses = Open3.pipeline(*pipeline, chdir: dir) - abort "failed" unless statuses.all? { |s| s && s.success? } + statuses.all? { |s| s && s.success? 
} end +def run_pipeline!(pipeline, dir) + abort "failed" unless run_pipeline(pipeline, dir) +end + + def show_sizes(paths) paths.each do |p| run!(%W[du -sh #{File.basename(p)}], File.dirname(p)) @@ -81,14 +103,28 @@ def show_sizes(paths) end def create_child(name) - path = File.join(TMP_ROOT, name + '.git') + path = child_path(name) run!(%W[git clone --quiet --bare #{TEST_REPO} #{File.basename(path)}], File.dirname(path)) + clean_child(path) + path +end + +def clean_child(path) run!(%W[git remote remove origin], path) run!(%w[git config advice.objectNameWarning false], path) FileUtils.rm_rf(File.join(path, 'hooks')) - path end +def child_path(name) + File.join(TMP_ROOT, name + '.git') +end + +def clone_in_pool(pool, source_path, target_name, target_path) + run!(%W[git clone --quiet --bare --reference #{pool} #{source_path} #{target_path}]) + clean_child(target_path) + add_child_to_pool(pool, target_name, target_path) + set_child_alternates(target_path, pool) +end def run!(cmd, dir=nil) dir ||= Dir.pwd -- GitLab From b872d04529c46f431266f1b62d27fcdcd8a87c24 Mon Sep 17 00:00:00 2001 From: Jacob Vosmaer Date: Wed, 3 Oct 2018 12:11:20 +0200 Subject: [PATCH 09/23] readability --- test-dedup | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test-dedup b/test-dedup index c63da899a57..c0154247dc3 100755 --- a/test-dedup +++ b/test-dedup @@ -20,7 +20,7 @@ def main # This creates a bunch of refs in the top-level namespace we don't want. # However, they speed up the first fetch, so we keep them for now. 
- run!(%W[git clone --quiet --bare --local #{source_path} #{File.basename(pool)}], File.dirname(pool)) + run!(%W[git clone --quiet --bare --local #{File.basename(source_path)} #{File.basename(pool)}], File.dirname(pool)) run!(%w[git remote remove origin], pool) run!(%w[git config advice.objectNameWarning false], pool) show_sizes([pool]) @@ -112,7 +112,7 @@ end def clean_child(path) run!(%W[git remote remove origin], path) run!(%w[git config advice.objectNameWarning false], path) - FileUtils.rm_rf(File.join(path, 'hooks')) + run!(%W[rm -rf hooks], path) end def child_path(name) -- GitLab From 8472b41423814a4c100b93e8cba6e6fe4e33950c Mon Sep 17 00:00:00 2001 From: Jacob Vosmaer Date: Wed, 3 Oct 2018 16:52:30 +0200 Subject: [PATCH 10/23] readability --- test-dedup | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test-dedup b/test-dedup index c0154247dc3..553d05a00ba 100755 --- a/test-dedup +++ b/test-dedup @@ -120,7 +120,7 @@ def child_path(name) end def clone_in_pool(pool, source_path, target_name, target_path) - run!(%W[git clone --quiet --bare --reference #{pool} #{source_path} #{target_path}]) + run!(%W[git clone --quiet --bare --reference #{File.basename(pool)} #{File.basename(source_path)} #{File.basename(target_path)}], File.dirname(target_path)) clean_child(target_path) add_child_to_pool(pool, target_name, target_path) set_child_alternates(target_path, pool) -- GitLab From 462739b57a8aab504132785102942f90e60aac40 Mon Sep 17 00:00:00 2001 From: Jacob Vosmaer Date: Fri, 12 Oct 2018 18:08:00 +0200 Subject: [PATCH 11/23] Add some methods that correspond to rpcs --- test-dedup | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/test-dedup b/test-dedup index 553d05a00ba..d24a49c0d92 100755 --- a/test-dedup +++ b/test-dedup @@ -20,16 +20,14 @@ def main # This creates a bunch of refs in the top-level namespace we don't want. # However, they speed up the first fetch, so we keep them for now. 
- run!(%W[git clone --quiet --bare --local #{File.basename(source_path)} #{File.basename(pool)}], File.dirname(pool)) - run!(%w[git remote remove origin], pool) - run!(%w[git config advice.objectNameWarning false], pool) + create_pool_repository(source_path, pool) show_sizes([pool]) add_child_to_pool(pool, source_name, source_path) run!(%W[git fetch --all --quiet], pool) # Now we can get rid of the top-level refs in pool.git. - delete_top_level_refs(pool) + clean_pool_repository_post_create(pool) children.each do |name, path| next if name == source_name @@ -59,6 +57,16 @@ def main run!(%w[cat objects/info/alternates], target_path) end +def create_pool_repository(source, pool) + run!(%W[git clone --quiet --bare --local #{File.basename(source)} #{File.basename(pool)}], File.dirname(pool)) +end + +def clean_pool_repository_post_create(pool) + run!(%w[git remote remove origin], pool) + run!(%w[git config advice.objectNameWarning false], pool) + delete_top_level_refs(pool) +end + def set_child_alternates(child, pool) open(File.join(child, 'objects/info/alternates'), 'w') do |f| f.puts File.join('..', '..', File.basename(pool), 'objects') @@ -126,6 +134,8 @@ def clone_in_pool(pool, source_path, target_name, target_path) set_child_alternates(target_path, pool) end +# Note: tricks with the 'dir' argument and File.basename are there only +# to make the script output prettier. 
def run!(cmd, dir=nil) dir ||= Dir.pwd cmd_s = cmd.join(' ') -- GitLab From a91a7297be033f109b23f9e6a6948bb9d8a93dd5 Mon Sep 17 00:00:00 2001 From: Jacob Vosmaer Date: Fri, 12 Oct 2018 18:11:57 +0200 Subject: [PATCH 12/23] illustrate link_repository_to_pool --- test-dedup | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/test-dedup b/test-dedup index d24a49c0d92..5112f1cbbf7 100755 --- a/test-dedup +++ b/test-dedup @@ -23,7 +23,7 @@ def main create_pool_repository(source_path, pool) show_sizes([pool]) - add_child_to_pool(pool, source_name, source_path) + link_repository_to_pool(pool, source_name, source_path) run!(%W[git fetch --all --quiet], pool) # Now we can get rid of the top-level refs in pool.git. @@ -32,15 +32,13 @@ def main children.each do |name, path| next if name == source_name - add_child_to_pool(pool, name, path) + link_repository_to_pool(pool, name, path) end run!(%w[git remote -v], pool) show_sizes([pool] + children.values) children.each do |_, path| - set_child_alternates(path, pool) - # -l is important. 
Not sure about -A vs -a run!(%w[git repack -Ald --quiet], path) end @@ -67,13 +65,7 @@ def clean_pool_repository_post_create(pool) delete_top_level_refs(pool) end -def set_child_alternates(child, pool) - open(File.join(child, 'objects/info/alternates'), 'w') do |f| - f.puts File.join('..', '..', File.basename(pool), 'objects') - end -end - -def add_child_to_pool(pool, child_name, child_path) +def link_repository_to_pool(pool, child_name, child_path) run!(%W[git remote add #{child_name} ../#{File.basename(child_path)}], pool) # this ensures every ref, including remote refs/tags, gets fetched @@ -81,6 +73,10 @@ def add_child_to_pool(pool, child_name, child_path) # this prevents remote tags being fetched into the top-level refs/tags namespace run!(%W[git config remote.#{child_name}.tagopt --no-tags], pool) + + open(File.join(child_path, 'objects/info/alternates'), 'w') do |f| + f.puts File.join('..', '..', File.basename(pool), 'objects') + end end def delete_top_level_refs(repo) @@ -130,8 +126,7 @@ end def clone_in_pool(pool, source_path, target_name, target_path) run!(%W[git clone --quiet --bare --reference #{File.basename(pool)} #{File.basename(source_path)} #{File.basename(target_path)}], File.dirname(target_path)) clean_child(target_path) - add_child_to_pool(pool, target_name, target_path) - set_child_alternates(target_path, pool) + link_repository_to_pool(pool, target_name, target_path) end # Note: tricks with the 'dir' argument and File.basename are there only -- GitLab From 9431282bf1cf8853814c735414bc64aef4dc0070 Mon Sep 17 00:00:00 2001 From: Jacob Vosmaer Date: Fri, 12 Oct 2018 18:17:33 +0200 Subject: [PATCH 13/23] More renames and comments --- test-dedup | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/test-dedup b/test-dedup index 5112f1cbbf7..3a88f46873d 100755 --- a/test-dedup +++ b/test-dedup @@ -47,7 +47,8 @@ def main target_name, target_path = 'repo4', child_path('repo4') children[target_name] = target_path - 
clone_in_pool(pool, children['repo2'], target_name, target_path) + prepare_clone_in_pool(pool, children['repo2'], target_name, target_path) + link_repository_to_pool(pool, target_name, target_path) show_sizes([pool] + children.values) @@ -66,6 +67,8 @@ def clean_pool_repository_post_create(pool) end def link_repository_to_pool(pool, child_name, child_path) + # It is important that the remote is a relative path, so that it is + # stable across changing git data mountpoints. run!(%W[git remote add #{child_name} ../#{File.basename(child_path)}], pool) # this ensures every ref, including remote refs/tags, gets fetched @@ -75,6 +78,7 @@ def link_repository_to_pool(pool, child_name, child_path) run!(%W[git config remote.#{child_name}.tagopt --no-tags], pool) open(File.join(child_path, 'objects/info/alternates'), 'w') do |f| + # It is important that this is a relative path, so that it is stable across changing git data mountpoints. f.puts File.join('..', '..', File.basename(pool), 'objects') end end @@ -123,10 +127,15 @@ def child_path(name) File.join(TMP_ROOT, name + '.git') end -def clone_in_pool(pool, source_path, target_name, target_path) +def prepare_clone_in_pool(pool, source_path, target_name, target_path) + # The --reference options is the secret sauce that prevents copying + # objects that exist in the pool. After this clone, + # objects/info/alternates is already set up but it points to an absolute + # path. This is not what we want. This gets fixed later by + # link_repository_to_pool. 
+ # run!(%W[git clone --quiet --bare --reference #{File.basename(pool)} #{File.basename(source_path)} #{File.basename(target_path)}], File.dirname(target_path)) clean_child(target_path) - link_repository_to_pool(pool, target_name, target_path) end # Note: tricks with the 'dir' argument and File.basename are there only -- GitLab From e0ca8638f7aed76e56a9699ad0d6d4d60271c934 Mon Sep 17 00:00:00 2001 From: Jacob Vosmaer Date: Fri, 12 Oct 2018 18:20:59 +0200 Subject: [PATCH 14/23] remove fetch that doesn't explain anything --- test-dedup | 1 - 1 file changed, 1 deletion(-) diff --git a/test-dedup b/test-dedup index 3a88f46873d..0d2bb57e6ed 100755 --- a/test-dedup +++ b/test-dedup @@ -24,7 +24,6 @@ def main show_sizes([pool]) link_repository_to_pool(pool, source_name, source_path) - run!(%W[git fetch --all --quiet], pool) # Now we can get rid of the top-level refs in pool.git. clean_pool_repository_post_create(pool) -- GitLab From 0d041c63a9091a0364bb80ad7a88c8db638f846e Mon Sep 17 00:00:00 2001 From: Jacob Vosmaer Date: Tue, 16 Oct 2018 15:17:18 +0200 Subject: [PATCH 15/23] Ensure we have refs at all in pool --- test-dedup | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/test-dedup b/test-dedup index 0d2bb57e6ed..ade1b1ef77c 100755 --- a/test-dedup +++ b/test-dedup @@ -20,7 +20,7 @@ def main # This creates a bunch of refs in the top-level namespace we don't want. # However, they speed up the first fetch, so we keep them for now. 
- create_pool_repository(source_path, pool) + create_pool_repository(source_name, source_path, pool) show_sizes([pool]) link_repository_to_pool(pool, source_name, source_path) @@ -55,8 +55,10 @@ def main run!(%w[cat objects/info/alternates], target_path) end -def create_pool_repository(source, pool) - run!(%W[git clone --quiet --bare --local #{File.basename(source)} #{File.basename(pool)}], File.dirname(pool)) +def create_pool_repository(source_name, source_path, pool) + run!(%W[git clone --quiet --bare --local #{File.basename(source_path)} #{File.basename(pool)}], File.dirname(pool)) + create_remote_in_pool(pool, source_name, source_path) + run!(%W[git fetch --quiet #{source_name}], pool) end def clean_pool_repository_post_create(pool) @@ -65,16 +67,22 @@ def clean_pool_repository_post_create(pool) delete_top_level_refs(pool) end -def link_repository_to_pool(pool, child_name, child_path) +def create_remote_in_pool(pool, child_name, child_path) # It is important that the remote is a relative path, so that it is # stable across changing git data mountpoints. - run!(%W[git remote add #{child_name} ../#{File.basename(child_path)}], pool) + unless run(%W[git remote add #{child_name} ../#{File.basename(child_path)}], pool) + run!(%W[git remote set-url #{child_name} ../#{File.basename(child_path)}], pool) + end # this ensures every ref, including remote refs/tags, gets fetched run!(%W[git config remote.#{child_name}.fetch +refs/*:refs/remotes/#{child_name}/*], pool) # this prevents remote tags being fetched into the top-level refs/tags namespace run!(%W[git config remote.#{child_name}.tagopt --no-tags], pool) +end + +def link_repository_to_pool(pool, child_name, child_path) + create_remote_in_pool(pool, child_name, child_path) open(File.join(child_path, 'objects/info/alternates'), 'w') do |f| # It is important that this is a relative path, so that it is stable across changing git data mountpoints. 
@@ -140,13 +148,18 @@ end # Note: tricks with the 'dir' argument and File.basename are there only # to make the script output prettier. def run!(cmd, dir=nil) + abort "failed" unless run(cmd, dir) +end + +def run(cmd, dir=nil) dir ||= Dir.pwd cmd_s = cmd.join(' ') warn "#{File.basename(dir)}$ #{cmd_s}" start = Time.now - abort "failed" unless system(*cmd, chdir: dir) + status = system(*cmd, chdir: dir) delta = Time.now - start warn sprintf("time: %.3fs\n", delta) if delta > 1.0 + status end main -- GitLab From 0eb740be6489c8b881820f99b8468705c2581a50 Mon Sep 17 00:00:00 2001 From: Jacob Vosmaer Date: Tue, 16 Oct 2018 16:30:05 +0200 Subject: [PATCH 16/23] Add rpc annotations --- test-dedup | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/test-dedup b/test-dedup index ade1b1ef77c..6f63bd460fa 100755 --- a/test-dedup +++ b/test-dedup @@ -25,9 +25,6 @@ def main link_repository_to_pool(pool, source_name, source_path) - # Now we can get rid of the top-level refs in pool.git. - clean_pool_repository_post_create(pool) - children.each do |name, path| next if name == source_name @@ -50,21 +47,22 @@ def main link_repository_to_pool(pool, target_name, target_path) show_sizes([pool] + children.values) - - run_pipeline([%w[git for-each-ref], %w[head -30]], target_path) - run!(%w[cat objects/info/alternates], target_path) end def create_pool_repository(source_name, source_path, pool) + puts '--- CreatePoolRepository' + # Use --local to get Git to use hardlinks, which avoids copying objects and packfiles run!(%W[git clone --quiet --bare --local #{File.basename(source_path)} #{File.basename(pool)}], File.dirname(pool)) + + # After the initial clone, the pool repository has top-level refs. This + # is not what we want, we want these refs to live under refs/remotes. 
create_remote_in_pool(pool, source_name, source_path) run!(%W[git fetch --quiet #{source_name}], pool) -end -def clean_pool_repository_post_create(pool) + # Now we can delete the top-level refs in the pool run!(%w[git remote remove origin], pool) - run!(%w[git config advice.objectNameWarning false], pool) delete_top_level_refs(pool) + puts '---' end def create_remote_in_pool(pool, child_name, child_path) @@ -82,12 +80,15 @@ def create_remote_in_pool(pool, child_name, child_path) end def link_repository_to_pool(pool, child_name, child_path) + puts '--- LinkRepositoryToPool' create_remote_in_pool(pool, child_name, child_path) open(File.join(child_path, 'objects/info/alternates'), 'w') do |f| # It is important that this is a relative path, so that it is stable across changing git data mountpoints. f.puts File.join('..', '..', File.basename(pool), 'objects') end + + puts '---' end def delete_top_level_refs(repo) @@ -126,7 +127,6 @@ end def clean_child(path) run!(%W[git remote remove origin], path) - run!(%w[git config advice.objectNameWarning false], path) run!(%W[rm -rf hooks], path) end @@ -135,6 +135,8 @@ def child_path(name) end def prepare_clone_in_pool(pool, source_path, target_name, target_path) + puts '--- PrepareCloneInPool' + # The --reference options is the secret sauce that prevents copying # objects that exist in the pool. 
After this clone, # objects/info/alternates is already set up but it points to an absolute @@ -143,6 +145,8 @@ def prepare_clone_in_pool(pool, source_path, target_name, target_path) # run!(%W[git clone --quiet --bare --reference #{File.basename(pool)} #{File.basename(source_path)} #{File.basename(target_path)}], File.dirname(target_path)) clean_child(target_path) + + puts '---' end # Note: tricks with the 'dir' argument and File.basename are there only -- GitLab From 0fa8caf1a1d393bb4dcfb1ef82483e0c2985a544 Mon Sep 17 00:00:00 2001 From: Jacob Vosmaer Date: Tue, 16 Oct 2018 18:44:41 +0200 Subject: [PATCH 17/23] Add example for "leave pool" scenario --- helper.rb | 28 ++++++++++++++++ test-dedup | 98 +++++++++++++++++++++++------------------------------- 2 files changed, 69 insertions(+), 57 deletions(-) create mode 100644 helper.rb diff --git a/helper.rb b/helper.rb new file mode 100644 index 00000000000..5700ede3840 --- /dev/null +++ b/helper.rb @@ -0,0 +1,28 @@ +def run_pipeline(pipeline, dir) + warn "#{File.basename(dir)}$ #{pipeline.map { |c| c.join(' ') }.join(' | ')}" + + statuses = Open3.pipeline(*pipeline, chdir: dir) + + statuses.all? { |s| s && s.success? } +end + +def run_pipeline!(pipeline, dir) + abort "failed" unless run_pipeline(pipeline, dir) +end + +# Note: tricks with the 'dir' argument and File.basename are there only +# to make the script output prettier. 
+def run!(cmd, dir=nil) + abort "failed" unless run(cmd, dir) +end + +def run(cmd, dir=nil) + dir ||= Dir.pwd + cmd_s = cmd.join(' ') + warn "#{File.basename(dir)}$ #{cmd_s}" + start = Time.now + status = system(*cmd, chdir: dir) + delta = Time.now - start + warn sprintf("time: %.3fs\n", delta) if delta > 1.0 + status +end diff --git a/test-dedup b/test-dedup index 6f63bd460fa..15962a1150e 100755 --- a/test-dedup +++ b/test-dedup @@ -3,6 +3,8 @@ require 'open3' require 'tempfile' require 'fileutils' +require_relative 'helper' + TEST_REPO = ENV.fetch('TEST_REPO') TMP_ROOT = File.absolute_path(Dir.mktmpdir) @@ -15,38 +17,39 @@ def main children = {} %w[repo1 repo2 repo3].each { |name| children[name] = create_child(name) } + # Create a pool based on repo1 + # pool = File.join(TMP_ROOT, 'pool.git') - source_name, source_path = children.first - - # This creates a bunch of refs in the top-level namespace we don't want. - # However, they speed up the first fetch, so we keep them for now. + source_name, source_path = 'repo1', children['repo1'] create_pool_repository(source_name, source_path, pool) - show_sizes([pool]) - link_repository_to_pool(pool, source_name, source_path) + show_sizes([pool, children['repo1']]) - children.each do |name, path| - next if name == source_name - - link_repository_to_pool(pool, name, path) - end - - run!(%w[git remote -v], pool) - show_sizes([pool] + children.values) - - children.each do |_, path| - # -l is important. 
Not sure about -A vs -a - run!(%w[git repack -Ald --quiet], path) - end - - show_sizes([pool] + children.values) - + # Clone a new repo, repo4, from repo1 (fork parent) within the pool + # target_name, target_path = 'repo4', child_path('repo4') children[target_name] = target_path - prepare_clone_in_pool(pool, children['repo2'], target_name, target_path) + prepare_clone_in_pool(pool, source_path, target_name, target_path) link_repository_to_pool(pool, target_name, target_path) + show_sizes([pool, source_path, target_path]) - show_sizes([pool] + children.values) + # Repo4 leaves pool (e.g. fork switches to private) + # + repo_name, repo_path = 'repo4', child_path('repo4') + # To make things interesting, ensure repo4 is deduplicated + run!(%w[git repack --quiet -Ald], repo_path) + run!(%w[find objects -type f], repo_path) + show_sizes([repo_path]) + + # Enter critical section where repo may not receive pushes (???) + prepare_leave_pool(repo_path) + unlink_repository_from_pool(pool, repo_name, repo_path) + # Exit critical section + + # Sanity checks + run!(%w[find objects -type f], repo_path) + run!(%w[git fsck --connectivity-only], repo_path) + show_sizes([repo_path]) end def create_pool_repository(source_name, source_path, pool) @@ -99,19 +102,6 @@ def delete_top_level_refs(repo) ], repo) end -def run_pipeline(pipeline, dir) - warn "#{File.basename(dir)}$ #{pipeline.map { |c| c.join(' ') }.join(' | ')}" - - statuses = Open3.pipeline(*pipeline, chdir: dir) - - statuses.all? { |s| s && s.success? } -end - -def run_pipeline!(pipeline, dir) - abort "failed" unless run_pipeline(pipeline, dir) -end - - def show_sizes(paths) paths.each do |p| run!(%W[du -sh #{File.basename(p)}], File.dirname(p)) @@ -137,33 +127,27 @@ end def prepare_clone_in_pool(pool, source_path, target_name, target_path) puts '--- PrepareCloneInPool' - # The --reference options is the secret sauce that prevents copying - # objects that exist in the pool. 
After this clone, - # objects/info/alternates is already set up but it points to an absolute - # path. This is not what we want. This gets fixed later by - # link_repository_to_pool. - # - run!(%W[git clone --quiet --bare --reference #{File.basename(pool)} #{File.basename(source_path)} #{File.basename(target_path)}], File.dirname(target_path)) + # After this clone, objects/info/alternates is already set up but it + # points to an absolute path. This is not what we want. This gets fixed + # later by link_repository_to_pool. + # + run!(%W[git clone --quiet --bare --local #{File.basename(source_path)} #{File.basename(target_path)}], File.dirname(target_path)) clean_child(target_path) puts '---' end -# Note: tricks with the 'dir' argument and File.basename are there only -# to make the script output prettier. -def run!(cmd, dir=nil) - abort "failed" unless run(cmd, dir) +def prepare_leave_pool(repo_path) + puts '--- PrepareLeavePool' + run!(%w[git repack --quiet -a], repo_path) + puts '---' end -def run(cmd, dir=nil) - dir ||= Dir.pwd - cmd_s = cmd.join(' ') - warn "#{File.basename(dir)}$ #{cmd_s}" - start = Time.now - status = system(*cmd, chdir: dir) - delta = Time.now - start - warn sprintf("time: %.3fs\n", delta) if delta > 1.0 - status +def unlink_repository_from_pool(pool, repo_name, repo_path) + puts '--- UnlinkRepositoryFromPool' + run!(%w[rm objects/info/alternates], repo_path) + run!(%W[git remote remove #{repo_name}], pool) + puts '---' end main -- GitLab From d92600173ef1a75fc12fd76d85f9c7ad72b78355 Mon Sep 17 00:00:00 2001 From: Jacob Vosmaer Date: Wed, 17 Oct 2018 13:29:46 +0200 Subject: [PATCH 18/23] comments --- test-dedup | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test-dedup b/test-dedup index 15962a1150e..5e9b7417037 100755 --- a/test-dedup +++ b/test-dedup @@ -95,6 +95,7 @@ def link_repository_to_pool(pool, child_name, child_path) end def delete_top_level_refs(repo) + # Delete every ref that does not start with 'refs/remotes/' 
run_pipeline!([ %w[git for-each-ref --format=delete\ %(refname)], %w[grep -v ^delete\ refs/remotes/], @@ -139,7 +140,12 @@ end def prepare_leave_pool(repo_path) puts '--- PrepareLeavePool' + + # Because we are running 'git repack' _without_ '-l', Git will gather + # all the objects it needs from the pool repository into a new + # non-deduplicated packfile. run!(%w[git repack --quiet -a], repo_path) + puts '---' end -- GitLab From 0839dcc6b2c4adc5742250af11fd662418e64921 Mon Sep 17 00:00:00 2001 From: Jacob Vosmaer Date: Wed, 17 Oct 2018 16:02:32 +0200 Subject: [PATCH 19/23] Start exploring push edge cases --- helper.rb | 2 ++ test-dedup | 1 - test-push-alternates | 52 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 54 insertions(+), 1 deletion(-) create mode 100755 test-push-alternates diff --git a/helper.rb b/helper.rb index 5700ede3840..6ac62d13d2c 100644 --- a/helper.rb +++ b/helper.rb @@ -1,3 +1,5 @@ +require 'open3' + def run_pipeline(pipeline, dir) warn "#{File.basename(dir)}$ #{pipeline.map { |c| c.join(' ') }.join(' | ')}" diff --git a/test-dedup b/test-dedup index 5e9b7417037..42130c94d63 100755 --- a/test-dedup +++ b/test-dedup @@ -1,5 +1,4 @@ #!/usr/bin/env ruby -require 'open3' require 'tempfile' require 'fileutils' diff --git a/test-push-alternates b/test-push-alternates new file mode 100755 index 00000000000..98bea5e4446 --- /dev/null +++ b/test-push-alternates @@ -0,0 +1,52 @@ +#!/usr/bin/env ruby +require 'tempfile' +require 'fileutils' +require 'securerandom' + +require_relative 'helper' + +TEST_REPO = ENV.fetch('TEST_REPO') +TMP_ROOT = File.absolute_path(Dir.mktmpdir) +TEST_REF = "refs/heads/test-ref-#{SecureRandom.hex(4)}" + +at_exit do + warn '... 
deleting tmp data' + FileUtils.rm_rf(TMP_ROOT) +end + +def main + pool = File.join(TMP_ROOT, 'pool.git') + run!(%W[git clone --quiet --bare #{TEST_REPO} #{pool}]) + + child = File.join(TMP_ROOT, 'child.git') + run!(%W[git clone --quiet --bare --shared #{pool} #{child}]) + run!(%w[git remote remove origin], child) + + # Verify that child shares objects with pool + run!(%w[du -sh] + [pool, child]) + run!(%w[cat objects/info/alternates], child) + + # Create a dangling object in the pool repository + commit_id = IO.popen( + %w[git commit-tree -p HEAD -m hello HEAD^{tree}], + chdir: pool, + &:read + ).chomp + abort "git commit-tree failed" unless $?.success? + + # Make objects old; to see if mtimes get updated later + run!(%w[find objects -exec touch -t 200001010101 {} ;], pool) + run!(%w[find objects -type f -exec ls -l {} ;], pool) + + # Make the child refer to the dangling object + run!(%W[git update-ref #{TEST_REF} #{commit_id}], child) + run!(%W[git cat-file -p #{TEST_REF}], child) + + # Print mtimes + run!(%w[find objects -type f -exec ls -l {} ;], pool) + run!(%w[find objects -type f -exec ls -l {} ;], child) + + # Does not look good, no sign of mtime change. How will the pool know the object is used? +end + +main -- GitLab From e98131b9861c01ca8a26b3d16737aa119c5ed677 Mon Sep 17 00:00:00 2001 From: Jacob Vosmaer Date: Thu, 18 Oct 2018 11:19:03 +0200 Subject: [PATCH 20/23] try actual push --- test-push-alternates | 56 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 46 insertions(+), 10 deletions(-) diff --git a/test-push-alternates b/test-push-alternates index 98bea5e4446..fb42e822c47 100755 --- a/test-push-alternates +++ b/test-push-alternates @@ -7,7 +7,6 @@ require_relative 'helper' TEST_REPO = ENV.fetch('TEST_REPO') TMP_ROOT = File.absolute_path(Dir.mktmpdir) -TEST_REF = "refs/heads/test-ref-#{SecureRandom.hex(4)}" at_exit do warn '... 
deleting tmp data' @@ -27,26 +26,63 @@ def main run!(%w[cat objects/info/alternates], child) # Create a dangling object in the pool repository - commit_id = IO.popen( - %w[git commit-tree -p HEAD -m hello HEAD^{tree}], - chdir: pool, - &:read - ).chomp - abort "git commit-tree failed" unless $?.success? + commit_id = new_commit(pool) # Make objects old; to see if mtimes get updated later - run!(%w[find objects -exec touch -t 200001010101 {} ;], pool) + backdate_objects(pool) run!(%w[find objects -type f -exec ls -l {} ;], pool) # Make the child refer to the dangling object - run!(%W[git update-ref #{TEST_REF} #{commit_id}], child) - run!(%W[git cat-file -p #{TEST_REF}], child) + ref = new_test_ref + run!(%W[git update-ref #{ref} #{commit_id}], child) + run!(%W[git cat-file -p #{ref}], child) # Print mtimes + puts "--- looking for: #{commit_id}" run!(%w[find objects -type f -exec ls -l {} ;], pool) run!(%w[find objects -type f -exec ls -l {} ;], child) # Does not look good, no sign of mtime change. How will the pool know the object is used? + + # Try pushing... 
+ + pusher = File.join(TMP_ROOT, 'pusher.git') + run!(%W[git clone --quiet --bare #{TEST_REPO} #{pusher}]) + commit_id = new_commit(pusher) + ref = new_test_ref + run!(%W[git update-ref #{ref} #{commit_id}], pusher) + + obj_name = "objects/#{commit_id[0,2]}/#{commit_id[2, commit_id.size]}" + FileUtils.mkdir_p(File.dirname(File.join(pool, obj_name))) + run!(%W[cp #{obj_name} #{pool}/#{obj_name}], pusher) + backdate_objects(pool) + + run!(%w[git config advice.objectNameWarning false], pusher) + run!(%w[git config advice.objectNameWarning false], child) + + run!(%W[git push #{child} #{ref}], pusher) + puts "--- looking for: #{commit_id}" + run!(%w[find objects -type f -exec ls -l {} ;], pool) + run!(%w[find objects -type f -exec ls -l {} ;], child) +end + +def new_commit(repo) + commit_id = IO.popen( + %W[git commit-tree -p HEAD -m hello#{SecureRandom.hex(2)} HEAD^{tree}], + chdir: repo, + &:read + ).chomp + abort "git commit-tree failed" unless $?.success? + + commit_id +end + +def new_test_ref + "refs/heads/test-ref-#{SecureRandom.hex(4)}" +end + +def backdate_objects(repo) + run!(%w[find objects -exec touch -t 200001010101 {} ;], repo) end main -- GitLab From bce083f7f2f078a5b49f7d0a7515c1bc1626befc Mon Sep 17 00:00:00 2001 From: Jacob Vosmaer Date: Fri, 19 Oct 2018 14:25:57 +0200 Subject: [PATCH 21/23] Add design overview doc --- doc/object_deduplication.md | 87 +++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 doc/object_deduplication.md diff --git a/doc/object_deduplication.md b/doc/object_deduplication.md new file mode 100644 index 00000000000..8e64c21ed6d --- /dev/null +++ b/doc/object_deduplication.md @@ -0,0 +1,87 @@ +# Design notes for Git object deduplication + +## Problem description + +Forking projects is a common workflow in GitLab. When a user forks a +project, GitLab creates a full clone of the repository associated with +the fork parent. 
If the repository is large and gets forked often, this +leads to a lot of duplicated Git objects which take up disk space. + +We are adding Git object deduplication to GitLab to address this +problem. + +## Solution overview + +We have [chosen](https://gitlab.com/gitlab-org/gitaly/issues/1331) a +design where Git objects shared between repositories on the same Gitaly +storage shard can be shared via a **pool repository**. For each member +of the pool, the pool repository has a git remote pointing to it. +Conversely, each member points to the pool repository as an **alternate +object directory** using the `objects/info/alternates` in the +repository. + +From a Gitaly point of view this is a very transparent solution. Almost +all RPC's should continue to work without modification on repositories +that are linked to a pool. + +What is new is that GitLab must manage the pool repository and the pool +relations in Gitaly. + +## Limitations + +- Pools are local to Gitaly storage shards. +- Repositories in a pool can see all objects in the pool repository if + they know the object ID (SHA1). + +The second property means that we should not mix repositories of +projects with different visibility scope (e.g. public vs private) in the +same storage pool. + +This also means that we cannot conflate the project fork network +relation with the repository pool relation. Storage pools will be +restricted within fork networks to public projects that live on the same +Gitaly shard. + +If a project in a storage pool changes visibility from public to private +we must extract it from the storage pool. + +## First iteration + +The first iteration of object deduplication is limited to the following scope: + +- Only works with new forked projects +- The parent of the forked project is using hashed storage +- The new forked project is using hashed storage +- Pool repositories are created once and do not pull in new git + objects.
This means that the deduplication percentage will fall over + time as new objects get pushed to the repositories in a pool. We + will address this in a later iteration ("pool grooming") + +### Scenarios + +#### Create pool from existing repo + +- SQL: create pool object +- Gitaly: create pool repo from existing repo. Create remote pointing to existing repo, and clear top level refs in pool. +- SQL: link project to pool +- Gitaly: finalize link: create (remote and) objects/info/alternates connection for existing repo + +If this fails in the middle there is no data loss in the existing repo. + +#### Clone new repo from origin in pool (e.g. a fork) + +- SQL: create project linked to pool. Project is in "being cloned" state +- Gitaly: create new repo with local disk clone from origin +- Gitaly: create remote and objects/info/alternates connection for new repo +- SQL: clear project "being cloned" state + +#### Project leaves pool (e.g. fork taken private) + +- SQL: mark project as "repo transitioning to private". git pushes are blocked +- Gitaly: copy needed objects from pool with git repack -a +- Gitaly: remove objects/info/alternates link and pool remote +- SQL: unmark "repo transitioning to private". git pushes no longer blocked + +This is problematic. If we fail in the middle, git pushes remain +blocked. Do we really need to block git pushes during this operation? +If we fail during the Gitaly parts we can re-create the pool links and restart the repack. 
\ No newline at end of file -- GitLab From d830cbd1595eeabe705abe583c7ae5d2bae5526a Mon Sep 17 00:00:00 2001 From: Jacob Vosmaer Date: Fri, 19 Oct 2018 14:28:17 +0200 Subject: [PATCH 22/23] update design notes --- doc/object_deduplication.md | 43 +++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/doc/object_deduplication.md b/doc/object_deduplication.md index 8e64c21ed6d..86b5127ee58 100644 --- a/doc/object_deduplication.md +++ b/doc/object_deduplication.md @@ -22,7 +22,8 @@ repository. From a Gitaly point of view this is a very transparent solution. Almost all RPC's should continue to work without modification on repositories -that are linked to a pool. +that are linked to a pool. This is because Git natively supports +`objects/info/alternates`. What is new is that GitLab must manage the pool repository and the pool relations in Gitaly. @@ -56,32 +57,38 @@ The first iteration of object deduplication is limited to the following scope: objects. This means that the deduplication percentage will fall over time as new objects get pushed to the repositories in a pool. We will address this in a later iteration ("pool grooming") - ### Scenarios #### Create pool from existing repo -- SQL: create pool object -- Gitaly: create pool repo from existing repo. Create remote pointing to existing repo, and clear top level refs in pool. -- SQL: link project to pool -- Gitaly: finalize link: create (remote and) objects/info/alternates connection for existing repo +- SQL: create pool object +- Gitaly: create pool repo from existing repo. Create remote pointing + to existing repo, and clear top level refs in pool. +- SQL: link project to pool +- Gitaly: finalize link: create (remote and) objects/info/alternates + connection for existing repo If this fails in the middle there is no data loss in the existing repo. -#### Clone new repo from origin in pool (e.g. a fork) +#### Clone new repo from origin in pool (e.g. 
a fork) -- SQL: create project linked to pool. Project is in "being cloned" state -- Gitaly: create new repo with local disk clone from origin -- Gitaly: create remote and objects/info/alternates connection for new repo -- SQL: clear project "being cloned" state +- SQL: create project linked to pool. Project is in "being cloned" + state +- Gitaly: create new repo with local disk clone from origin +- Gitaly: create remote and objects/info/alternates connection for new + repo +- SQL: clear project "being cloned" state -#### Project leaves pool (e.g. fork taken private) +#### Project leaves pool (e.g. fork taken private) -- SQL: mark project as "repo transitioning to private". git pushes are blocked -- Gitaly: copy needed objects from pool with git repack -a -- Gitaly: remove objects/info/alternates link and pool remote -- SQL: unmark "repo transitioning to private". git pushes no longer blocked +- SQL: mark project as "repo transitioning to private". git pushes are + blocked +- Gitaly: copy needed objects from pool with git repack -a +- Gitaly: remove objects/info/alternates link and pool remote +- SQL: unmark "repo transitioning to private". git pushes no longer + blocked This is problematic. If we fail in the middle, git pushes remain -blocked. Do we really need to block git pushes during this operation? -If we fail during the Gitaly parts we can re-create the pool links and restart the repack. \ No newline at end of file +blocked. Do we really need to block git pushes during this operation? If +we fail during the Gitaly parts we can re-create the pool links and +restart the repack. 
-- GitLab From 78bd08c46657c4bb87a9860736940381e381c772 Mon Sep 17 00:00:00 2001 From: Jacob Vosmaer Date: Thu, 25 Oct 2018 17:26:41 +0200 Subject: [PATCH 23/23] Investigate freshening --- helper.rb | 8 ++++---- test-push-alternates | 9 ++++++++- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/helper.rb b/helper.rb index 6ac62d13d2c..eb022451ad9 100644 --- a/helper.rb +++ b/helper.rb @@ -14,16 +14,16 @@ end # Note: tricks with the 'dir' argument and File.basename are there only # to make the script output prettier. -def run!(cmd, dir=nil) - abort "failed" unless run(cmd, dir) +def run!(cmd, dir=nil, env={}) + abort "failed" unless run(cmd, dir, env) end -def run(cmd, dir=nil) +def run(cmd, dir=nil, env={}) dir ||= Dir.pwd cmd_s = cmd.join(' ') warn "#{File.basename(dir)}$ #{cmd_s}" start = Time.now - status = system(*cmd, chdir: dir) + status = system(env, *cmd, chdir: dir) delta = Time.now - start warn sprintf("time: %.3fs\n", delta) if delta > 1.0 status diff --git a/test-push-alternates b/test-push-alternates index fb42e822c47..94d52a71c8f 100755 --- a/test-push-alternates +++ b/test-push-alternates @@ -43,8 +43,13 @@ def main run!(%w[find objects -type f -exec ls -l {} ;], child) # Does not look good, no sign of mtime change. How will the pool know the object is used? + # Let's try freshening the commit. + run_pipeline!([%W[git cat-file commit #{commit_id}], %w[git hash-object -t commit -w --stdin]], child) + run!(%w[find objects -type f -exec ls -l {} ;], pool) + run!(%w[find objects -type f -exec ls -l {} ;], child) + # Freshening works! - # Try pushing... 
+ # Different scenario: 'git push' pusher = File.join(TMP_ROOT, 'pusher.git') run!(%W[git clone --quiet --bare #{TEST_REPO} #{pusher}]) @@ -55,6 +60,7 @@ def main obj_name = "objects/#{commit_id[0,2]}/#{commit_id[2, commit_id.size]}" FileUtils.mkdir_p(File.dirname(File.join(pool, obj_name))) run!(%W[cp #{obj_name} #{pool}/#{obj_name}], pusher) + run!(%w[git repack --quiet -k -d -a], pool) backdate_objects(pool) run!(%w[git config advice.objectNameWarning false], pusher) @@ -64,6 +70,7 @@ def main puts "--- looking for: #{commit_id}" run!(%w[find objects -type f -exec ls -l {} ;], pool) run!(%w[find objects -type f -exec ls -l {} ;], child) + run!(%W[git show #{commit_id}], pool) end def new_commit(repo) -- GitLab