diff --git a/.gitignore b/.gitignore index 08ae671a8d708dba05f91310a1a09979a7ccd22b..daa66f775543f140e15faeed4a45929bdce93a7c 100644 --- a/.gitignore +++ b/.gitignore @@ -17,3 +17,4 @@ cmd/gitaly-ssh/gitaly-ssh git-env /gitaly-debug /praefect +/gitaly-pack-objects \ No newline at end of file diff --git a/cmd/gitaly-pack-objects/README.md b/cmd/gitaly-pack-objects/README.md new file mode 100644 index 0000000000000000000000000000000000000000..757ede509c40b434b963aa6f65fe8e48b2be334d --- /dev/null +++ b/cmd/gitaly-pack-objects/README.md @@ -0,0 +1,15 @@ +# gitaly-pack-objects + +This is a **beta** pack-objects hook that can speed up a Git clone when +installed on a server. The only type of clone we can speed up is a full +clone. + +Also see https://gitlab.com/groups/gitlab-org/-/epics/1117. + +- compile the executable and install at some chosen path +- `git config --global uploadpack.packObjectsHook /path/to/gitaly-pack-objects` + (config has to be global for some reason) +- in the bare repo you want to speed up, run + `mkdir -p gitaly && git bundle create gitaly/clone.bundle --branches --tags` +- now do a full clone from that repo. 
If it is a local clone, use + `git clone --no-local` to see the effect diff --git a/cmd/gitaly-pack-objects/main.go b/cmd/gitaly-pack-objects/main.go new file mode 100644 index 0000000000000000000000000000000000000000..a33b537377867e5b5ae39109e8557bad2fa1d6d8 --- /dev/null +++ b/cmd/gitaly-pack-objects/main.go @@ -0,0 +1,31 @@ +package main + +import ( + "context" + "log" + "os" + + "gitlab.com/gitlab-org/gitaly/internal/packobjects" +) + +// main exits via log.Fatal when no argument follows the program name or when +// _main returns an error. +func main() { + if len(os.Args) < 2 { + log.Fatal("not enough argument to pack-objects hook") + } + + if err := _main(); err != nil { + log.Fatal(err) + } +} + +// _main hands the working directory, the arguments after the program name and +// the standard streams to packobjects.PackObjects under a cancellable context. +func _main() error { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + wd, err := os.Getwd() + if err != nil { + return err + } + + return packobjects.PackObjects(ctx, wd, os.Args[1:], os.Stdin, os.Stdout, os.Stderr) +} diff --git a/internal/git/packfile.go b/internal/git/packfile.go new file mode 100644 index 0000000000000000000000000000000000000000..7d5551d984cdf436677e799071d1097f66a63a35 --- /dev/null +++ b/internal/git/packfile.go @@ -0,0 +1,145 @@ +package git + +import ( + "bytes" + "crypto/sha1" + "encoding/binary" + "fmt" + "hash" + "io" +) + +// sumSize is the length of a SHA-1 digest; packBufferSize is the size of +// PackReader's internal buffer. +const ( + sumSize = sha1.Size + packBufferSize = 4096 +) + +// PackReader reads pack data from an underlying io.Reader through a fixed-size +// buffer, keeping a running hash and the object count taken from the header. +type PackReader struct { + buf [packBufferSize]byte + avail []byte + reader io.Reader + readErr error + sum hash.Hash + numObjects uint32 +} + +// packMagic holds the leading header bytes NewPackReader requires; the full +// header is packHeaderSize bytes long. +const ( + packMagic = "PACK\x00\x00\x00\x02" + packHeaderSize = 12 +) + +// NewPackReader blocks until it has read the packfile header from r. 
+func NewPackReader(r io.Reader) (*PackReader, error) { + pr := &PackReader{ + reader: r, + sum: sha1.New(), + } + + header := make([]byte, packHeaderSize) + if _, err := io.ReadFull(pr.reader, header); err != nil { + return nil, err + } + + if magic := string(header[:len(packMagic)]); magic != packMagic { + return nil, fmt.Errorf("bad pack header: %q", magic) + } + + pr.numObjects = binary.BigEndian.Uint32(header[len(packMagic):]) + + if _, err := pr.sum.Write(header); err != nil { + return nil, err + } + + return pr, nil +} + +func (pr *PackReader) NumObjects() uint32 { return pr.numObjects } + +func (pr *PackReader) numBytesAvailable() int { return len(pr.avail) - sumSize } + +func (pr *PackReader) Read(p []byte) (int, error) { + if pr.numBytesAvailable() <= 0 && pr.readErr == nil { + copy(pr.buf[:], pr.avail) + + var nRead int + nRead, pr.readErr = pr.reader.Read(pr.buf[len(pr.avail):]) + if pr.readErr != nil && pr.readErr != io.EOF { + return 0, pr.readErr + } + + pr.avail = pr.buf[:len(pr.avail)+nRead] + + if n := pr.numBytesAvailable(); n > 0 { + if _, err := pr.sum.Write(pr.avail[:n]); err != nil { + return 0, err + } + } + } + + if pr.numBytesAvailable() <= 0 { + if pr.readErr == io.EOF && !bytes.Equal(pr.sum.Sum(nil), pr.avail) { + return 0, fmt.Errorf("packfile checksum mismatch") + } + + return 0, pr.readErr + } + + nYielded := copy(p, pr.avail[:pr.numBytesAvailable()]) + pr.avail = pr.avail[nYielded:] + return nYielded, nil +} + +type PackWriter struct { + w io.Writer + summer hash.Hash + flushed bool +} + +// NewWriter creates a new PackWriter, writes its header, and returns the +// PackWriter. The caller must call Flush() when done, or else the +// packfile written to w will be invalid. 
+func NewPackWriter(w io.Writer, numObjects uint32) (*PackWriter, error) { + pw := &PackWriter{ + summer: sha1.New(), + } + // Writes to pw.w go to both w and the running checksum. + pw.w = io.MultiWriter(w, pw.summer) + + if _, err := pw.w.Write([]byte(packMagic)); err != nil { + return nil, err + } + + // The object count follows the magic as a big-endian uint32. + size := make([]byte, 4) + binary.BigEndian.PutUint32(size, numObjects) + if _, err := pw.w.Write(size); err != nil { + return nil, err + } + + return pw, nil +} + +// alreadyFlushedError is returned by Write and Flush once Flush has been called. +type alreadyFlushedError struct{} + +func (alreadyFlushedError) Error() string { return "PackWriter already flushed" } + +// Write forwards p to the underlying writer and the checksum. After Flush it +// returns alreadyFlushedError without writing anything. +func (pw *PackWriter) Write(p []byte) (int, error) { + if pw.flushed { + return 0, alreadyFlushedError{} + } + + return pw.w.Write(p) +} + +// Flush finalizes the packfile by writing its trailing checksum. +// A second call returns alreadyFlushedError. +func (pw *PackWriter) Flush() error { + if pw.flushed { + return alreadyFlushedError{} + } + pw.flushed = true + + sum := pw.summer.Sum(nil) + + // Feeding the checksum back into pw.w messes up the state of pw.summer + // but we will not use it again so it's OK. 
+ _, err := pw.w.Write(sum) + return err +} diff --git a/internal/git/packfile_test.go b/internal/git/packfile_test.go new file mode 100644 index 0000000000000000000000000000000000000000..68bcfa773795de4fdd223a62ced5805e3d419f01 --- /dev/null +++ b/internal/git/packfile_test.go @@ -0,0 +1,75 @@ +package git + +import ( + "bytes" + "io/ioutil" + "testing" + + "github.com/stretchr/testify/require" +) + +// packExample pairs a complete raw packfile with the object count and payload +// the tests expect for it. +type packExample struct { + desc string + n uint32 + raw []byte + content []byte +} + +var packExamples = []packExample{ + { + desc: "small, 3 objects", + n: 3, + raw: []byte("PACK\x00\x00\x00\x02\x00\x00\x00\x03hello=?\x1A$\xB3\x8F\xCC\x96\xE0\xB0\xAC\xF0\x93\t\x85\xD8\x87K\xC5p"), + content: []byte("hello"), + }, +} + +// TestPackReader checks the object count and content PackReader yields for each example. +func TestPackReader(t *testing.T) { + for _, tc := range packExamples { + t.Run(tc.desc, func(t *testing.T) { + pr, err := NewPackReader(bytes.NewReader(tc.raw)) + require.NoError(t, err) + + require.Equal(t, tc.n, pr.NumObjects(), "number of objects in packfile") + + out, err := ioutil.ReadAll(pr) + require.NoError(t, err, "read all data") + + require.Equal(t, string(tc.content), string(out), "packfile content") + }) + } +} + +// TODO add more PackReader tests: invalid header, length < 32, invalid checksum + +// TestPackWriter checks that writing each example's content reproduces its raw bytes. +func TestPackWriter(t *testing.T) { + for _, tc := range packExamples { + t.Run(tc.desc, func(t *testing.T) { + out := &bytes.Buffer{} + pw, err := NewPackWriter(out, tc.n) + require.NoError(t, err) + + in := tc.content + nBytes, err := pw.Write(in) + require.NoError(t, err) + require.Equal(t, nBytes, len(in), "bytes written") + + require.NoError(t, pw.Flush(), "flush") + require.Equal(t, string(tc.raw), out.String()) + }) + } +} + +// TestPackWriterFlush checks that Write and Flush return alreadyFlushedError +// after a first successful Flush. +func TestPackWriterFlush(t *testing.T) { + out := &bytes.Buffer{} + pw, err := NewPackWriter(out, 123) + require.NoError(t, err) + + require.NoError(t, pw.Flush()) + + n, err := pw.Write([]byte("hello")) + require.Equal(t, 0, n, "bytes written should be 0") + require.IsType(t, alreadyFlushedError{}, err, "write error 
should be 'already flushed'") + + require.IsType(t, alreadyFlushedError{}, pw.Flush(), "flush error should be 'already flushed'") +} diff --git a/internal/packobjects/pack-objects-bundle.go b/internal/packobjects/pack-objects-bundle.go new file mode 100644 index 0000000000000000000000000000000000000000..5cfcd5467b8c5ef0cae4cba646b1e6576eb11f42 --- /dev/null +++ b/internal/packobjects/pack-objects-bundle.go @@ -0,0 +1,165 @@ +package packobjects + +import ( + "bufio" + "bytes" + "context" + "fmt" + "io" + "os" + "os/exec" + "path/filepath" + "regexp" + "strings" + + "gitlab.com/gitlab-org/gitaly/internal/command" + "gitlab.com/gitlab-org/gitaly/internal/git" +) + +// bundleFileName is the path, relative to cwd, of the bundle PackObjects looks for. +const bundleFileName = "gitaly/clone.bundle" + +// shaRegex matches exactly 40 lowercase hexadecimal characters. +var shaRegex = regexp.MustCompile(`\A[0-9a-f]{40}\z`) + +// PackObjects buffers stdin and decides whether the request is a full clone. +// If it is not, or the bundle file under cwd cannot be opened, it delegates to +// fallback. Otherwise it appends the object IDs from the bundle's header to the +// buffered request, starts the command in args with it, and writes a single +// packfile to stdout: the command's pack data followed by the bundle's pack +// data, with the two object counts added together. +func PackObjects(ctx context.Context, cwd string, args []string, stdin io.Reader, stdout, stderr io.Writer) error { + request := &bytes.Buffer{} + scanner := bufio.NewScanner(io.TeeReader(stdin, request)) + seenNot := false + isClone := true + // The request counts as a clone unless a non-empty line follows "--not". + for scanner.Scan() { + if !seenNot && scanner.Text() == "--not" { + seenNot = true + continue + } + + if seenNot && scanner.Text() != "" { + isClone = false + } + } + + if err := scanner.Err(); err != nil { + return err + } + + // TODO check args. If unexpected, return fallback. 
+ + if !isClone { + return fallback(ctx, args, request, stdout, stderr) + } + + bundleFile, err := os.Open(filepath.Join(cwd, bundleFileName)) + if err != nil { + return fallback(ctx, args, request, stdout, stderr) + } + defer bundleFile.Close() + + bundle := bufio.NewReader(bundleFile) + + // Trim the buffered request and end it with a single newline before the + // bundle's object IDs are appended. + request = bytes.NewBuffer(bytes.TrimSpace(request.Bytes())) + if _, err := request.WriteString("\n"); err != nil { + return err + } + + if err := addBundleRefsToRequest(request, bundle); err != nil { + return err + } + + bundleReader, err := git.NewPackReader(bundle) + if err != nil { + return err + } + + cmd, err := command.New(ctx, exec.Command(args[0], args[1:]...), request, nil, stderr) + if err != nil { + return err + } + + packObjectsReader, err := git.NewPackReader(cmd) + if err != nil { + return err + } + + // TODO check for overflow + // NOTE(review): NumObjects returns uint32, so this sum wraps silently on overflow. + totalObjects := packObjectsReader.NumObjects() + bundleReader.NumObjects() + + w, err := git.NewPackWriter(stdout, totalObjects) + if err != nil { + return err + } + + if _, err := io.Copy(w, packObjectsReader); err != nil { + return err + } + + if err := cmd.Wait(); err != nil { + return err + } + + fmt.Fprintf(stderr, "Pre-computed packfile: %d objects\n", bundleReader.NumObjects()) + + if _, err := io.Copy(w, bundleReader); err != nil { + return err + } + + if err := w.Flush(); err != nil { + return err + } + + return nil +} + +// fallback starts the command given by args via command.New, handing it +// request, stdout and stderr, and waits for it to finish. +func fallback(ctx context.Context, args []string, request io.Reader, stdout, stderr io.Writer) error { + cmd, err := command.New(ctx, exec.Command(args[0], args[1:]...), request, stdout, stderr) + if err != nil { + return err + } + + return cmd.Wait() +} + +// readLine reads up to and including the next '\n' and returns the text before +// it. It returns the error from ReadBytes if no newline is found. +func readLine(r *bufio.Reader) (string, error) { + line, err := r.ReadBytes('\n') + if err != nil { + return "", err + } + + return string(line[:len(line)-1]), nil +} + +// BundleHeader is the first line addBundleRefsToRequest expects in a bundle file. +const BundleHeader = "# v2 git bundle" + +// addBundleRefsToRequest checks the bundle's header line, then reads lines up +// to the first empty one and writes the first space-separated field of each to +// request, one per line, after validating it against shaRegex. +func addBundleRefsToRequest(request io.Writer, bundle *bufio.Reader) error { + bundleHeader, err := readLine(bundle) + if err != nil { + return err + 
} + if bundleHeader != BundleHeader { + return fmt.Errorf("unexpected bundle header: %q", bundleHeader) + } + + for { + refLine, err := readLine(bundle) + if err != nil { + return err + } + + // An empty line ends the list of refs. + if refLine == "" { + break + } + + split := strings.SplitN(refLine, " ", 2) + if len(split) != 2 { + return fmt.Errorf("invalid ref line: %q", refLine) + } + + id := split[0] + if !shaRegex.MatchString(id) { + return fmt.Errorf("invalid object ID: %q", id) + } + + if _, err := fmt.Fprintln(request, id); err != nil { + return err + } + } + + return nil +}