From 68212d36361aa36d31ce1d765c883129bf35e82a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kamil=20Trzci=C5=84ski?= Date: Mon, 12 Oct 2020 13:17:17 +0200 Subject: [PATCH 1/2] Measure size of data --- internal/vfs/zip/archive.go | 46 ++++- internal/vfs/zip/archive_test.go | 6 +- internal/vfs/zip/big_archive_test.go | 242 +++++++++++++++++++++++++++ internal/vfs/zip/sizeof.go | 76 +++++++++ 4 files changed, 368 insertions(+), 2 deletions(-) create mode 100644 internal/vfs/zip/big_archive_test.go create mode 100644 internal/vfs/zip/sizeof.go diff --git a/internal/vfs/zip/archive.go b/internal/vfs/zip/archive.go index 548ba6518..b2e302df4 100644 --- a/internal/vfs/zip/archive.go +++ b/internal/vfs/zip/archive.go @@ -35,6 +35,15 @@ var ( errNotFile = errors.New("not a file") ) +type zipFile struct { + compressedSize64 uint64 + uncompressedSize64 uint64 + headerOffset int64 + Modified time.Time + CreatorVersion uint16 + ExternalAttrs uint32 // Meaning depends on CreatorVersion +} + // zipArchive implements the vfs.Root interface. // It represents a zip archive saving all its files in memory. // It holds an httprange.Resource that can be read with httprange.RangedReader in chunks. @@ -53,7 +62,8 @@ type zipArchive struct { archive *zip.Reader err error - files map[string]*zip.File + files map[string]*zip.File + zipFiles map[string]zipFile } func newArchive(fs *zipVFS, path string, openTimeout time.Duration) *zipArchive { @@ -62,6 +72,7 @@ func newArchive(fs *zipVFS, path string, openTimeout time.Duration) *zipArchive path: path, done: make(chan struct{}), files: make(map[string]*zip.File), + zipFiles: make(map[string]zipFile), openTimeout: openTimeout, cacheNamespace: strconv.FormatInt(atomic.AddInt64(&fs.archiveCount, 1), 10) + ":", } @@ -128,12 +139,17 @@ func (a *zipArchive) readArchive() { return } + const emptyComment = "" + // TODO: Improve preprocessing of zip archives https://gitlab.com/gitlab-org/gitlab-pages/-/issues/432 for _, file := range a.archive.File { if !strings.HasPrefix(file.Name, dirPrefix) { continue } + file.Comment = emptyComment + file.Extra = nil a.files[file.Name] = file + a.zipFiles[file.Name] = zipFile{} } // recycle memory @@ -159,6 +175,34 @@ func (a *zipArchive) findFile(name string) *zip.File { return nil } +func (a *zipArchive) Size() int64 { + visited := make(map[interface{}]struct{}) + size := sizeOf(a.files, visited) + sizeOf(*a.archive, visited) + + for _, file := range a.files { + size += sizeOf(*file, visited) + } + return size +} + +func (a *zipArchive) FileCount() int64 { + return int64(len(a.files)) +} + +func (a *zipArchive) SizePerFile() int64 { + return a.Size() / a.FileCount() +} + +func (a *zipArchive) ZipSize() int64 { + visited := make(map[interface{}]struct{}) + size := sizeOf(a.zipFiles, visited) + return size +} + +func (a *zipArchive) ZipSizePerFile() int64 { + return a.ZipSize() / a.FileCount() +} + // Open finds the file by name inside the zipArchive and returns a reader that can be served by the VFS func (a *zipArchive) Open(ctx context.Context, name string) (vfs.File, error) { file := a.findFile(name) diff --git a/internal/vfs/zip/archive_test.go b/internal/vfs/zip/archive_test.go index bd7627b10..553926889 100644 --- a/internal/vfs/zip/archive_test.go +++ b/internal/vfs/zip/archive_test.go @@ -259,13 +259,17 @@ func TestReadArchiveFails(t *testing.T) { } func openZipArchive(t *testing.T, requests *int64) (*zipArchive, func()) { + return openZipArchiveCustom(t, "group/zip.gitlab.io/public.zip", requests) +} + +func openZipArchiveCustom(t *testing.T, path string, requests *int64) (*zipArchive, func()) { t.Helper() if requests == nil { requests = new(int64) } - testServerURL, cleanup := newZipFileServerURL(t, "group/zip.gitlab.io/public.zip", requests) + testServerURL, cleanup := newZipFileServerURL(t, path, requests) fs := New().(*zipVFS) zip := newArchive(fs, testServerURL+"/public.zip", time.Second) diff --git a/internal/vfs/zip/big_archive_test.go b/internal/vfs/zip/big_archive_test.go new file mode 100644 index 000000000..2903d5836 --- /dev/null +++ b/internal/vfs/zip/big_archive_test.go @@ -0,0 +1,242 @@ +package zip + +import ( + "archive/zip" + "context" + "net/http" + "net/http/httptest" + "reflect" + "sort" + "sync/atomic" + "testing" + "time" + + "github.com/stretchr/testify/require" + + "gitlab.com/gitlab-org/gitlab-pages/internal/testhelpers" +) + +func TestBigArchive(t *testing.T) { + archive, cleanup := openZipArchiveCustom(t, "../../tmp/docs.zip", nil) + defer cleanup() + + logAll = true + t.Log("BaseFile", sizeOf(zip.File{}, nil)) + logAll = false + + t.Log("FileCount:", archive.FileCount()) + t.Log("Size:", archive.Size()) + t.Log("SizePerFile:", archive.SizePerFile()) + t.Log("ZipSize:", archive.ZipSize()) + t.Log("ZipSizePerFile:", archive.ZipSizePerFile()) +} + +func TestString(t *testing.T) { + value := reflect.ValueOf("test") + t.Log("kind", value.Kind()) + t.Log("pointer", value.Pointer()) +} + +type zipFiles []*zip.File + +func (p zipFiles) Len() int { return len(p) } +func (p zipFiles) Less(i, j int) bool { return p[i].Name < p[j].Name } +func (p zipFiles) Swap(i, j int) { p[i], p[j] = p[j], p[i] } + +const hashMod = 32 + +func hash(s string) uint32 { + var hash uint32 + hash = 2166136261 + for _, c := range s { + hash *= 16777619 + hash ^= uint32(c) + } + return hash % hashMod +} + +type zipFilePtr struct { + *zip.File +} + +func BenchmarkTestAccess(t *testing.B) { + archive, cleanup := openZipArchiveCustomB(t, "../../tmp/docs.zip", nil) + defer cleanup() + + var lastMapPtr map[string]*zip.File + var lastMapStruct map[string]zip.File + var lastMapStructPtr map[string]zipFilePtr + var lastMapCopyPtr map[string]*zip.File + var lastMapHashedPtr []map[string]*zip.File + var lastSlice zipFiles + + t.Run("create Map Ptr", func(t *testing.B) { + for i := 0; i < t.N; i++ { + lastMapPtr = make(map[string]*zip.File) + + for _, file := range archive.files { + lastMapPtr[file.Name] = file + } + } + }) + + t.Run("create Map Struct", func(t *testing.B) { + for i := 0; i < t.N; i++ { + lastMapStruct = make(map[string]zip.File) + + for _, file := range archive.files { + lastMapStruct[file.Name] = *file + } + } + }) + + t.Run("create Map StructPtr", func(t *testing.B) { + for i := 0; i < t.N; i++ { + lastMapStructPtr = make(map[string]zipFilePtr) + + for _, file := range archive.files { + lastMapStructPtr[file.Name] = zipFilePtr{File: file} + } + } + }) + + t.Run("create Map Hashed", func(t *testing.B) { + for i := 0; i < t.N; i++ { + lastMapHashedPtr = make([]map[string]*zip.File, hashMod) + + for i := 0; i < hashMod; i++ { + lastMapHashedPtr[i] = make(map[string]*zip.File) + } + + for _, file := range archive.files { + lastMapHashedPtr[hash(file.Name)][file.Name] = file + } + } + }) + + t.Run("create Map Copy Pointer", func(t *testing.B) { + for i := 0; i < t.N; i++ { + lastMapCopyPtr = make(map[string]*zip.File) + + for _, file := range archive.files { + newFile := &zip.File{} + *newFile = *file + lastMapCopyPtr[file.Name] = newFile + } + } + }) + + t.Run("create slice", func(t *testing.B) { + for i := 0; i < t.N; i++ { + lastSlice = make(zipFiles, len(archive.files)) + idx := 0 + + for _, file := range archive.files { + lastSlice[idx] = file + idx++ + } + + sort.Sort(lastSlice) + } + }) + + tests := []string{ + lastSlice[0].Name, + lastSlice[len(lastSlice)/2].Name, + lastSlice[len(lastSlice)-1].Name, + "not/existing", + } + + for _, test := range tests { + t.Run("file: "+test, func(t *testing.B) { + t.Run("map ptr", func(t *testing.B) { + for i := 0; i < t.N; i++ { + _ = lastMapPtr[test] + } + }) + + t.Run("map struct", func(t *testing.B) { + for i := 0; i < t.N; i++ { + _ = lastMapStruct[test] + } + }) + + t.Run("map struct ptr", func(t *testing.B) { + for i := 0; i < t.N; i++ { + _ = lastMapStructPtr[test] + } + }) + + t.Run("map copy ptr", func(t *testing.B) { + for i := 0; i < t.N; i++ { + _ = lastMapCopyPtr[test] + } + }) + + t.Run("map hashed ptr", func(t *testing.B) { + for i := 0; i < t.N; i++ { + _ = lastMapHashedPtr[hash(test)][test] + } + }) + + t.Run("binary search", func(t *testing.B) { + for i := 0; i < t.N; i++ { + idx := sort.Search(len(lastSlice), func(i int) bool { return lastSlice[i].Name >= test }) + if idx >= 0 { + item := lastSlice[idx] + if item.Name == test { + // no-op + } + } + } + }) + }) + } +} + +func openZipArchiveCustomB(t *testing.B, path string, requests *int64) (*zipArchive, func()) { + t.Helper() + + if requests == nil { + requests = new(int64) + } + + testServerURL, cleanup := newZipFileServerURLB(t, path, requests) + + fs := New().(*zipVFS) + zip := newArchive(fs, testServerURL+"/public.zip", time.Second) + + err := zip.openArchive(context.Background()) + require.NoError(t, err) + + // public/ public/index.html public/404.html public/symlink.html + // public/subdir/ public/subdir/hello.html public/subdir/linked.html + // public/bad_symlink.html public/subdir/2bp3Qzs... + require.NotZero(t, zip.files) + require.Equal(t, int64(3), atomic.LoadInt64(requests), "we expect three requests to open ZIP archive: size and two to seek central directory") + + return zip, func() { + cleanup() + } +} + +func newZipFileServerURLB(t *testing.B, zipFilePath string, requests *int64) (string, func()) { + t.Helper() + + chdir := testhelpers.ChdirInPath(t, "../../../shared/pages", &chdirSet) + + m := http.NewServeMux() + m.HandleFunc("/public.zip", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + http.ServeFile(w, r, zipFilePath) + if requests != nil { + atomic.AddInt64(requests, 1) + } + })) + + testServer := httptest.NewServer(m) + + return testServer.URL, func() { + chdir() + testServer.Close() + } +} diff --git a/internal/vfs/zip/sizeof.go b/internal/vfs/zip/sizeof.go new file mode 100644 index 000000000..c8ac22e7c --- /dev/null +++ b/internal/vfs/zip/sizeof.go @@ -0,0 +1,76 @@ +package zip + +import ( + "fmt" + "reflect" +) + +var logAll = false + +func logSizeOf(args ...interface{}) { + if !logAll { + return + } + fmt.Println(args...) +} + +func sizeOf(v interface{}, visited map[interface{}]struct{}) int64 { + if visited == nil { + visited = make(map[interface{}]struct{}) + } + + return sizeOf2(reflect.ValueOf(v), visited) +} + +func sizeOf2(s reflect.Value, visited map[interface{}]struct{}) int64 { + return int64(s.Type().Size()) + internalSizeOf(s, visited) +} + +// sizeOf reworked from https://stackoverflow.com/a/51432438 +func internalSizeOf(s reflect.Value, visited map[interface{}]struct{}) int64 { + var size int64 + + switch s.Kind() { + case reflect.Slice: + logSizeOf("Slice:", size) + for i := 0; i < s.Len(); i++ { + extra := sizeOf(s.Index(i).Interface(), visited) + logSizeOf("Slice", i, ":", extra) + size += extra + } + + case reflect.Map: + keys := s.MapKeys() + size += int64(float64(len(keys)) * 10.79) // approximation from https://golang.org/src/runtime/hashmap.go + logSizeOf("Map:", size) + for i := range keys { + keySize := sizeOf(keys[i].Interface(), visited) + valueSize := sizeOf(s.MapIndex(keys[i]).Interface(), visited) + logSizeOf("MapKey", i, ":", keySize, valueSize) + size += keySize + valueSize + } + + case reflect.String: + if _, ok := visited[s.String()]; ok { + break + } + visited[s.String()] = struct{}{} + size += int64(s.Len()) + logSizeOf("String", size) + + case reflect.Struct: + logSizeOf("Struct:", size, s.Type().Name()) + for i := 0; i < s.NumField(); i++ { + //FieldByName("headerOffset").Int() + extra := internalSizeOf(s.Field(i), visited) + logSizeOf("Struct Field", i, ":", s.Type().Field(i).Name, extra) + size += extra + } + + // case reflect.Ptr: + // logSizeOf("Pointer:", size) + // s = reflect.Indirect(s) + // return sizeOf2(s, visited) + } + return size +} -- GitLab From c94cfe14a401d74bea417d6ec6457f30013a18b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kamil=20Trzci=C5=84ski?= Date: Tue, 13 Oct 2020 18:19:42 +0200 Subject: [PATCH 2/2] WIP --- internal/vfs/zip/big_archive_test.go | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/internal/vfs/zip/big_archive_test.go b/internal/vfs/zip/big_archive_test.go index 2903d5836..b4c425a8e 100644 --- a/internal/vfs/zip/big_archive_test.go +++ b/internal/vfs/zip/big_archive_test.go @@ -68,6 +68,7 @@ func BenchmarkTestAccess(t *testing.B) { var lastMapStructPtr map[string]zipFilePtr var lastMapCopyPtr map[string]*zip.File var lastMapHashedPtr []map[string]*zip.File + var lastMapCopyFlatPtr map[string]*zip.File var lastSlice zipFiles t.Run("create Map Ptr", func(t *testing.B) { @@ -126,6 +127,19 @@ func BenchmarkTestAccess(t *testing.B) { } }) + t.Run("create Map Copy Flat Pointer", func(t *testing.B) { + for i := 0; i < t.N; i++ { + ptrs := make([]zip.File, len(archive.files)) + lastMapCopyFlatPtr = make(map[string]*zip.File) + + for _, file := range archive.files { + newFile := &ptrs[len(lastMapCopyFlatPtr)] + *newFile = *file + lastMapCopyFlatPtr[file.Name] = newFile + } + } + }) + t.Run("create slice", func(t *testing.B) { for i := 0; i < t.N; i++ { lastSlice = make(zipFiles, len(archive.files)) -- GitLab