diff --git a/changelogs/unreleased/zj-linguist-ruby-sidecar.yml b/changelogs/unreleased/zj-linguist-ruby-sidecar.yml new file mode 100644 index 0000000000000000000000000000000000000000..bae12a105998e1d8066d81dc95f04b766c062c8c --- /dev/null +++ b/changelogs/unreleased/zj-linguist-ruby-sidecar.yml @@ -0,0 +1,5 @@ +--- +title: Use caching for linguist results +merge_request: 751 +author: +type: performance diff --git a/cmd/gitaly/main.go b/cmd/gitaly/main.go index 591387a590abf3c28c76105167612e073601e8e7..22764f14cab5891dbb11e17c113e7039a5c1b80d 100644 --- a/cmd/gitaly/main.go +++ b/cmd/gitaly/main.go @@ -14,7 +14,6 @@ import ( "gitlab.com/gitlab-org/gitaly/internal/config" "gitlab.com/gitlab-org/gitaly/internal/connectioncounter" "gitlab.com/gitlab-org/gitaly/internal/git" - "gitlab.com/gitlab-org/gitaly/internal/linguist" "gitlab.com/gitlab-org/gitaly/internal/rubyserver" "gitlab.com/gitlab-org/gitaly/internal/server" "gitlab.com/gitlab-org/gitaly/internal/tempdir" @@ -43,10 +42,6 @@ func loadConfig(configPath string) error { return err } - if err := linguist.LoadColors(); err != nil { - return fmt.Errorf("load linguist colors: %v", err) - } - return nil } diff --git a/doc/configuration/README.md b/doc/configuration/README.md index 91755e4f558c0a4e91dfe16972bcc7289bd0d401..eef0cf297b5fa0102d1a694bd3f5c3d578b05769 100644 --- a/doc/configuration/README.md +++ b/doc/configuration/README.md @@ -111,7 +111,6 @@ max\_rss limit. |graceful_restart_timeout|string|no|Grace period to allow a gitaly-ruby process to finish ongoing requests. Default 10 minutes ("10m").| |restart_delay|string|no|Time memory must be high before a restart is triggered, in seconds. Default 5 minutes ("5m").| |num_workers|integer|no|Number of gitaly-ruby worker processes. Try increasing this number in case of ResourceExhausted errors. Default 2, minimum 2.| -|linguist_languages_path|string|no|Override for dynamic languages.json discovery. Default: "" (use dynamic discovery).| ### Logging diff --git a/internal/linguist/linguist.go b/internal/linguist/linguist.go deleted file mode 100644 index 9c7feeb2f78c9d5b9e47a5365a93bbf193646346..0000000000000000000000000000000000000000 --- a/internal/linguist/linguist.go +++ /dev/null @@ -1,98 +0,0 @@ -package linguist - -import ( - "context" - "crypto/sha256" - "encoding/json" - "fmt" - "io" - "io/ioutil" - "os" - "os/exec" - "path" - - "gitlab.com/gitlab-org/gitaly/internal/command" - "gitlab.com/gitlab-org/gitaly/internal/config" -) - -var ( - colorMap = make(map[string]Language) -) - -// Language is used to parse Linguist's language.json file. -type Language struct { - Color string `json:"color"` -} - -// Stats returns the repository's language stats as reported by 'git-linguist'. -func Stats(ctx context.Context, repoPath string, commitID string) (map[string]int, error) { - cmd := exec.Command("bundle", "exec", "bin/ruby-cd", repoPath, "git-linguist", "--commit="+commitID, "stats") - cmd.Dir = config.Config.Ruby.Dir - reader, err := command.New(ctx, cmd, nil, nil, nil, os.Environ()...) - if err != nil { - return nil, err - } - - data, err := ioutil.ReadAll(reader) - if err != nil { - return nil, err - } - - stats := make(map[string]int) - return stats, json.Unmarshal(data, &stats) -} - -// Color returns the color Linguist has assigned to language. -func Color(language string) string { - if color := colorMap[language].Color; color != "" { - return color - } - - colorSha := sha256.Sum256([]byte(language)) - return fmt.Sprintf("#%x", colorSha[0:3]) -} - -// LoadColors loads the name->color map from the Linguist gem. -func LoadColors() error { - jsonReader, err := openLanguagesJSON() - if err != nil { - return err - } - defer jsonReader.Close() - - return json.NewDecoder(jsonReader).Decode(&colorMap) -} - -func openLanguagesJSON() (io.ReadCloser, error) { - if jsonPath := config.Config.Ruby.LinguistLanguagesPath; jsonPath != "" { - // This is a fallback for environments where dynamic discovery of the - // linguist path via Bundler is not working for some reason, for example - // https://gitlab.com/gitlab-org/gitaly/issues/1119. - return os.Open(jsonPath) - } - - linguistPathSymlink, err := ioutil.TempFile("", "gitaly-linguist-path") - if err != nil { - return nil, err - } - defer os.Remove(linguistPathSymlink.Name()) - - if err := linguistPathSymlink.Close(); err != nil { - return nil, err - } - - // We use a symlink because we cannot trust Bundler to not print garbage - // on its stdout. - rubyScript := `FileUtils.ln_sf(Bundler.rubygems.find_name('github-linguist').first.full_gem_path, ARGV.first)` - cmd := exec.Command("bundle", "exec", "ruby", "-rfileutils", "-e", rubyScript, linguistPathSymlink.Name()) - cmd.Dir = config.Config.Ruby.Dir - - if err := cmd.Run(); err != nil { - if exitError, ok := err.(*exec.ExitError); ok { - err = fmt.Errorf("%v; stderr: %q", exitError, exitError.Stderr) - } - return nil, err - } - - return os.Open(path.Join(linguistPathSymlink.Name(), "lib/linguist/languages.json")) -} diff --git a/internal/linguist/linguist_test.go b/internal/linguist/linguist_test.go deleted file mode 100644 index aac91d866d0991e85b0feeeffbc082628be307c1..0000000000000000000000000000000000000000 --- a/internal/linguist/linguist_test.go +++ /dev/null @@ -1,32 +0,0 @@ -package linguist - -import ( - "path/filepath" - "testing" - - "github.com/stretchr/testify/require" - "gitlab.com/gitlab-org/gitaly/internal/config" - "gitlab.com/gitlab-org/gitaly/internal/testhelper" -) - -func TestLoadLanguages(t *testing.T) { - testhelper.ConfigureRuby() - - colorMap = make(map[string]Language) - require.NoError(t, LoadColors(), "load colors") - - require.Equal(t, "#701516", Color("Ruby"), "color value for 'Ruby'") -} - -func TestLoadLanguagesCustomPath(t *testing.T) { - jsonPath, err := filepath.Abs("testdata/fake-languages.json") - require.NoError(t, err) - - testhelper.ConfigureRuby() - config.Config.Ruby.LinguistLanguagesPath = jsonPath - - colorMap = make(map[string]Language) - require.NoError(t, LoadColors(), "load colors") - - require.Equal(t, "foo color", Color("FooBar")) -} diff --git a/internal/linguist/testdata/fake-languages.json b/internal/linguist/testdata/fake-languages.json deleted file mode 100644 index 83f7687f8e62b1fd69efd562b5eb7319d236cd14..0000000000000000000000000000000000000000 --- a/internal/linguist/testdata/fake-languages.json +++ /dev/null @@ -1,3 +0,0 @@ -{ -"FooBar": { "color": "foo color" } -} diff --git a/internal/service/commit/languages.go b/internal/service/commit/languages.go index 5d8227a6a321061ba7500ea876d8ba74c89fef08..5662d03c8bd1b92f3e22096387a9520a8529d2e2 100644 --- a/internal/service/commit/languages.go +++ b/internal/service/commit/languages.go @@ -1,93 +1,34 @@ package commit import ( - "io/ioutil" - "sort" - "strings" - - "google.golang.org/grpc/codes" - "google.golang.org/grpc/status" - - "gitlab.com/gitlab-org/gitaly/internal/git" - "gitlab.com/gitlab-org/gitaly/internal/helper" - "gitlab.com/gitlab-org/gitaly/internal/linguist" - "gitlab.com/gitlab-org/gitaly/internal/service/ref" - pb "gitlab.com/gitlab-org/gitaly-proto/go" + "gitlab.com/gitlab-org/gitaly/internal/rubyserver" + "gitlab.com/gitlab-org/gitaly/internal/service/ref" "golang.org/x/net/context" ) -func (*server) CommitLanguages(ctx context.Context, req *pb.CommitLanguagesRequest) (*pb.CommitLanguagesResponse, error) { +func (s *server) CommitLanguages(ctx context.Context, req *pb.CommitLanguagesRequest) (*pb.CommitLanguagesResponse, error) { repo := req.Repository - revision := string(req.Revision) - if revision == "" { + if len(req.Revision) == 0 { defaultBranch, err := ref.DefaultBranchName(ctx, req.Repository) if err != nil { return nil, err } - revision = string(defaultBranch) - } - commitID, err := lookupRevision(ctx, repo, revision) - if err != nil { - return nil, err + req.Revision = defaultBranch } - repoPath, err := helper.GetRepoPath(repo) + client, err := s.CommitServiceClient(ctx) if err != nil { return nil, err } - stats, err := linguist.Stats(ctx, repoPath, commitID) - if err != nil { - return nil, err - } - - resp := &pb.CommitLanguagesResponse{} - if len(stats) == 0 { - return resp, nil - } - - total := 0 - for _, count := range stats { - total += count - } - if total == 0 { - return nil, status.Errorf(codes.Internal, "linguist stats added up to zero: %v", stats) - } - - for lang, count := range stats { - l := &pb.CommitLanguagesResponse_Language{ - Name: lang, - Share: float32(100*count) / float32(total), - Color: linguist.Color(lang), - } - resp.Languages = append(resp.Languages, l) - } - - sort.Sort(languageSorter(resp.Languages)) - - return resp, nil -} - -type languageSorter []*pb.CommitLanguagesResponse_Language - -func (ls languageSorter) Len() int { return len(ls) } -func (ls languageSorter) Swap(i, j int) { ls[i], ls[j] = ls[j], ls[i] } -func (ls languageSorter) Less(i, j int) bool { return ls[i].Share > ls[j].Share } - -func lookupRevision(ctx context.Context, repo *pb.Repository, revision string) (string, error) { - revParse, err := git.Command(ctx, repo, "rev-parse", revision) + clientCtx, err := rubyserver.SetHeaders(ctx, repo) if err != nil { - return "", err - } - - revParseBytes, err := ioutil.ReadAll(revParse) - if err != nil { - return "", err + return nil, err } - return strings.TrimSpace(string(revParseBytes)), nil + return client.CommitLanguages(clientCtx, req) } diff --git a/internal/service/commit/languages_test.go b/internal/service/commit/languages_test.go index 0deeb7ca06170e608c7ad22270adce51cf0a57a4..b810108f1b252d5c1999d409137973aac1625027 100644 --- a/internal/service/commit/languages_test.go +++ b/internal/service/commit/languages_test.go @@ -33,9 +33,9 @@ func TestLanguages(t *testing.T) { require.NotZero(t, len(resp.Languages), "number of languages in response") expectedLanguages := []pb.CommitLanguagesResponse_Language{ - {Name: "Ruby", Share: 66, Color: "#701516"}, + {Name: "Ruby", Share: 65, Color: "#701516"}, {Name: "JavaScript", Share: 22, Color: "#f1e05a"}, - {Name: "HTML", Share: 7, Color: "#e34c26"}, + {Name: "HTML", Share: 8, Color: "#e34c26"}, {Name: "CoffeeScript", Share: 2, Color: "#244776"}, // Modula-2 is a special case because Linguist has no color for it. This // test case asserts that we invent a color for it (SHA256 of the name). diff --git a/internal/service/commit/testhelper_test.go b/internal/service/commit/testhelper_test.go index fe939f3f9274f9445836a1de89dc6e0cb09b58c9..f7f4551430a58c2390d01ba202172968933d5306 100644 --- a/internal/service/commit/testhelper_test.go +++ b/internal/service/commit/testhelper_test.go @@ -6,7 +6,6 @@ import ( "testing" "time" - "gitlab.com/gitlab-org/gitaly/internal/linguist" "gitlab.com/gitlab-org/gitaly/internal/rubyserver" "gitlab.com/gitlab-org/gitaly/internal/testhelper" @@ -18,8 +17,6 @@ import ( pb "gitlab.com/gitlab-org/gitaly-proto/go" ) -var () - func TestMain(m *testing.M) { os.Exit(testMain(m)) } @@ -30,9 +27,6 @@ func testMain(m *testing.M) int { defer testhelper.MustHaveNoChildProcess() testhelper.ConfigureRuby() - if err := linguist.LoadColors(); err != nil { - log.Fatal(err) - } var err error rubyServer, err = rubyserver.Start() diff --git a/ruby/Gemfile b/ruby/Gemfile index e06683b615d8fe3b1cc9b31daba4df8ebaf314e8..1a7b87dfb74153e46ae448dc3e94241e8d1f1ba9 100644 --- a/ruby/Gemfile +++ b/ruby/Gemfile @@ -1,7 +1,7 @@ source 'https://rubygems.org' gem 'rugged', '~> 0.27.2' -gem 'github-linguist', '~> 5.3.3', require: 'linguist' +gem 'github-linguist', '~> 6.1', require: 'linguist' gem 'gitlab-markup', '~> 1.6.2' gem 'gitaly-proto', '~> 0.107.0', require: 'gitaly' gem 'activesupport', '~> 5.0.2' diff --git a/ruby/Gemfile.lock b/ruby/Gemfile.lock index ab61f1041381479795e3e4e90a0033440632cc6f..ee7caf8d1dcd12244075910e5f203cba575af987 100644 --- a/ruby/Gemfile.lock +++ b/ruby/Gemfile.lock @@ -12,7 +12,7 @@ GEM charlock_holmes (0.7.6) concurrent-ruby (1.0.5) diff-lcs (1.3) - escape_utils (1.1.1) + escape_utils (1.2.1) faraday (0.12.2) multipart-post (>= 1.2, < 3) gemojione (3.3.0) @@ -20,9 +20,9 @@ GEM gitaly-proto (0.107.0) google-protobuf (~> 3.1) grpc (~> 1.10) - github-linguist (5.3.3) - charlock_holmes (~> 0.7.5) - escape_utils (~> 1.1.0) + github-linguist (6.2.0) + charlock_holmes (~> 0.7.6) + escape_utils (~> 1.2.0) mime-types (>= 1.19) rugged (>= 0.25.1) github-markup (1.7.0) @@ -143,7 +143,7 @@ DEPENDENCIES activesupport (~> 5.0.2) faraday (~> 0.12) gitaly-proto (~> 0.107.0) - github-linguist (~> 5.3.3) + github-linguist (~> 6.1) gitlab-gollum-lib (~> 4.2) gitlab-gollum-rugged_adapter (~> 0.4.4) gitlab-markup (~> 1.6.2) diff --git a/ruby/lib/gitaly_server.rb b/ruby/lib/gitaly_server.rb index a03f0dab6bb595ff76a511b799ab3a3dbfeb855e..1866d183f9c22277f1574486921b9e7587ba67f5 100644 --- a/ruby/lib/gitaly_server.rb +++ b/ruby/lib/gitaly_server.rb @@ -1,6 +1,8 @@ require 'gitaly' require_relative 'gitlab/git.rb' +require_relative 'gitlab/linguist/repository_languages.rb' +require_relative 'gitlab/linguist/cache.rb' require_relative 'gitaly_server/client.rb' require_relative 'gitaly_server/utils.rb' diff --git a/ruby/lib/gitaly_server/commit_service.rb b/ruby/lib/gitaly_server/commit_service.rb index 9e4ccd26177cf0533b7a859510d52ba8fdd30c55..dea4355810883069cd77d08ffd6bf73e22e1e907 100644 --- a/ruby/lib/gitaly_server/commit_service.rb +++ b/ruby/lib/gitaly_server/commit_service.rb @@ -1,3 +1,5 @@ +require 'linguist' + module GitalyServer class CommitService < Gitaly::CommitService::Service include Utils @@ -158,6 +160,31 @@ module GitalyServer end end + def commit_languages(request, call) + bridge_exceptions do + repo = Gitlab::Git::Repository.from_gitaly(request.repository, call) + + commit = Gitlab::Git::Commit.find(repo, request.revision) + raise GRPC::InvalidArgument, 'revision could not be resolved' unless commit + + languages = + Gitlab::Linguist::RepositoryLanguages.new(repo, commit) + .detect + + total_bytes = languages.sum(&:last).to_f + + languages.map! do |name, bytes| + Gitaly::CommitLanguagesResponse::Language.new( + name: name.to_s, + color: ::Linguist::Language.find_by_name(name)&.color || "##{Digest::SHA256.hexdigest(name)[0..5]}", + share: ((bytes / total_bytes) * 100).round, + ) + end + + Gitaly::CommitLanguagesResponse.new(languages: languages) + end + end + private # yields either signature chunks or signed_text chunks to the passed block diff --git a/ruby/lib/gitlab/linguist/cache.rb b/ruby/lib/gitlab/linguist/cache.rb new file mode 100644 index 0000000000000000000000000000000000000000..292e281f77e4c2c784cb626f4253074eee7a8761 --- /dev/null +++ b/ruby/lib/gitlab/linguist/cache.rb @@ -0,0 +1,50 @@ +module Gitlab + module Linguist + class Cache + OLD_STATS_KEY = 'old_stats'.freeze + OLD_COMMIT_OID_KEY = 'old_commit_oid'.freeze + + def initialize(repo_path) + @path = repo_path + end + + def write(linguist, commit_oid) + return if old_commit_oid == commit_oid + + FileUtils.mkdir_p(linguist_cache_directory) unless Dir.exist?(linguist_cache_directory) + + new_cache = { OLD_STATS_KEY => linguist.cache, OLD_COMMIT_OID_KEY => commit_oid } + + File.write(cache_path, Marshal.dump(new_cache)) + end + + def old_stats + cache[OLD_STATS_KEY] + end + + def old_commit_oid + cache[OLD_COMMIT_OID_KEY] + end + + private + + def cache + @cache ||= if File.exist?(cache_path) + Marshal.load(File.binread(cache_path)) + else + {} + end + rescue ArgumentError + @cache = {} + end + + def cache_path + File.join(linguist_cache_directory, 'linguist-cache') + end + + def linguist_cache_directory + File.join(@path, 'gitaly') + end + end + end +end diff --git a/ruby/lib/gitlab/linguist/repository_languages.rb b/ruby/lib/gitlab/linguist/repository_languages.rb new file mode 100644 index 0000000000000000000000000000000000000000..8ddbb32a2e6d8b5b11afd688dd8e8b96ab7d521c --- /dev/null +++ b/ruby/lib/gitlab/linguist/repository_languages.rb @@ -0,0 +1,24 @@ +module Gitlab + module Linguist + class RepositoryLanguages + def initialize(repo, commit) + @repo = repo + @commit = commit + @cache = Gitlab::Linguist::Cache.new(repo.path) + end + + def detect + linguist = ::Linguist::Repository.incremental(@repo.rugged, @commit.id, @cache.old_commit_oid, @cache.old_stats) + + languages = linguist + .languages + .sort_by { |_k, v| v } + .reverse + + @cache.write(linguist, @commit.id) + + languages + end + end + end +end diff --git a/ruby/spec/lib/gitlab/linguist/cache_spec.rb b/ruby/spec/lib/gitlab/linguist/cache_spec.rb new file mode 100644 index 0000000000000000000000000000000000000000..d5f0d997f4c70e63910d4d1c236d8d7ce0d8a4ce --- /dev/null +++ b/ruby/spec/lib/gitlab/linguist/cache_spec.rb @@ -0,0 +1,37 @@ +require 'spec_helper' + +describe Gitlab::Linguist::Cache do + include TestRepo + + let(:repository) { gitlab_git_from_gitaly(new_mutable_test_repo) } + let(:old_stats) { [{ 'foo.rb' => 'Ruby'}, { 'bar.go' => 'Go' }] } + let(:linguist) { double('linguist', cache: old_stats) } + + subject { described_class.new(repository.path) } + + describe '#write' do + it 'writes the cache in the Gitaly cache directory' do + subject.write(linguist, '0' * 40) + + expect(File.exist?(File.join(repository.path, 'gitaly', 'linguist-cache'))).to be(true) + end + end + + describe 'old_stats' do + context 'when there is no cache yet' do + it 'returns nil' do + expect(subject.old_stats).to be_nil + end + + end + + context 'when the cache has been written' do + before do + subject.write(linguist, '0' * 40) + + expect(subject.old_stats).not_to be_nil + expect(subject.old_stats).to eq('0' * 40) + end + end + end +end