From 704cc0f3fed34c4c05c3fdc7bb0bdd3e68e3ebb7 Mon Sep 17 00:00:00 2001
From: Jacob Vosmaer
Date: Mon, 17 Aug 2020 11:36:08 +0200
Subject: [PATCH 1/3] WIP zipfs index on open

---
 internal/vfs/zipfs/attributes.go | 119 ++++++++++++++++++++
 internal/vfs/zipfs/entry.go      |  89 +++++++++++++++
 internal/vfs/zipfs/zipfs.go      | 183 +++++++++++++++++++++++++++++++
 3 files changed, 391 insertions(+)
 create mode 100644 internal/vfs/zipfs/attributes.go
 create mode 100644 internal/vfs/zipfs/entry.go
 create mode 100644 internal/vfs/zipfs/zipfs.go

diff --git a/internal/vfs/zipfs/attributes.go b/internal/vfs/zipfs/attributes.go
new file mode 100644
index 000000000..812fe5a0a
--- /dev/null
+++ b/internal/vfs/zipfs/attributes.go
@@ -0,0 +1,119 @@
+package zipfs
+
+import (
+	"fmt"
+	"math"
+	"os"
+	"path/filepath"
+	"sort"
+	"strings"
+)
+
+// Attributes describes one archive entry as recorded in the zip central
+// directory, or a directory synthesized by getAttributes.
+type Attributes struct {
+	Name           string // entry path with the "public/" prefix stripped
+	Size           int64  // uncompressed size in bytes
+	CompressedSize int64  // compressed size in bytes
+	Method         uint16 // zip compression method
+	os.FileMode
+	Idx int // index of the entry in the zip reader's File slice
+}
+
+// readAttributesFromZip fills fs.files from the entries of fs.zipReader.
+// Only regular files and symlinks whose cleaned name starts with "public/"
+// are indexed; the prefix is stripped from the stored name. The result is
+// sorted by name so that getAttributes can use binary search. It returns an
+// error if an entry's size does not fit in an int64.
+func (fs *VFS) readAttributesFromZip() error {
+	for i, zf := range fs.zipReader.File {
+		name := filepath.Clean(zf.Name)
+		const publicPrefix = "public/"
+		if !strings.HasPrefix(name, publicPrefix) {
+			continue
+		}
+		name = strings.TrimPrefix(name, publicPrefix)
+
+		mode := zf.FileInfo().Mode()
+		switch mode & os.ModeType {
+		case 0: // regular file
+		case os.ModeSymlink:
+		default: // directories and other special entries are not indexed
+			continue
+		}
+
+		attr := &Attributes{
+			Name:     name,
+			Method:   zf.Method,
+			FileMode: mode,
+			Idx:      i,
+		}
+
+		var err error
+		attr.Size, err = fitInt64(zf.UncompressedSize64)
+		if err != nil {
+			return err
+		}
+
+		attr.CompressedSize, err = fitInt64(zf.CompressedSize64)
+		if err != nil {
+			return err
+		}
+
+		fs.files = append(fs.files, attr)
+	}
+
+	// Sort the attributes so we can use binary search later. The comparison
+	// must be strict ("<", not "<="): sort.Slice requires that two equal
+	// names never both report as less than each other.
+	sort.Slice(fs.files, func(i, j int) bool { return fs.files[i].Name < fs.files[j].Name })
+
+	return nil
+}
+
+// fitInt64 converts u to int64. It returns an error if u is larger than
+// math.MaxInt64.
+func fitInt64(u uint64) (int64, error) {
+	if u > math.MaxInt64 {
+		return 0, fmt.Errorf("uint64 too large for int64")
+	}
+	return int64(u), nil
+}
+
+// getAttributes looks up name in the sorted fs.files index. An exact match
+// returns the stored attributes. Otherwise, if some entry lives below
+// name + "/", it returns synthetic directory attributes (os.ModeDir | 0755).
+// The boolean result reports whether anything was found.
+func (fs *VFS) getAttributes(name string) (*Attributes, bool) {
+	name = filepath.Clean(name)
+
+	i := sort.Search(len(fs.files), func(j int) bool { return fs.files[j].Name >= name })
+	if i == len(fs.files) {
+		return nil, false
+	}
+
+	if attr := fs.files[i]; attr.Name == name {
+		return attr, true
+	}
+
+	// If name is "foo/bar", see if anything like "foo/bar/X" exists. We
+	// could use binary search again but i should be close to a match, if
+	// there is one.
+	dirPrefix := name + "/"
+	for j := i; j < len(fs.files); j++ {
+		attr := fs.files[j]
+		if n := len(dirPrefix); len(attr.Name) < n || attr.Name[:n] > dirPrefix {
+			break
+		}
+
+		if strings.HasPrefix(attr.Name, dirPrefix) {
+			// "foo/bar/X" exists: return "foo/bar" as a directory.
+			return &Attributes{
+				Name:     name,
+				FileMode: os.ModeDir | 0755,
+			}, true
+		}
+	}
+
+	return nil, false
+}
diff --git a/internal/vfs/zipfs/entry.go b/internal/vfs/zipfs/entry.go
new file mode 100644
index 000000000..05702f909
--- /dev/null
+++ b/internal/vfs/zipfs/entry.go
@@ -0,0 +1,89 @@
+package zipfs
+
+import (
+	"archive/zip"
+	"compress/flate"
+	"context"
+	"fmt"
+	"io"
+)
+
+// ZipEntry represents an open entry in a zip archive. Either a regular file or a symlink.
+type ZipEntry struct {
+	fs *VFS
+	*Attributes
+	ctx context.Context
+
+	r io.Reader // set by open(); nil until the first successful Read
+	c io.Closer // the zip file handle obtained by open(); nil until then
+}
+
+// Seek implements enough of io.Seeker to support size lookups but
+// nothing more.
+func (zipEntry *ZipEntry) Seek(offset int64, whence int) (int64, error) {
+	if zipEntry.r != nil { // r is only set by open(), i.e. once reading has started
+		return 0, fmt.Errorf("seek after read")
+	}
+
+	if offset != 0 {
+		return 0, fmt.Errorf("unsupported offset")
+	}
+
+	switch whence {
+	case io.SeekStart:
+		return 0, nil
+	case io.SeekEnd:
+		return zipEntry.Size, nil // the size lookup: Size comes from the embedded Attributes
+	}
+
+	return 0, fmt.Errorf("unsupported whence")
+}
+
+func (zipEntry *ZipEntry) Close() error {
+	if zipEntry.c == nil { // open() never ran or did not succeed: nothing to close
+		return nil
+	}
+
+	return zipEntry.c.Close()
+}
+
+func (zipEntry *ZipEntry) Read(p []byte) (int, error) {
+	if zipEntry.r == nil { // first Read: open the archive lazily
+		if err := zipEntry.open(); err != nil {
+			return 0, err
+		}
+	}
+
+	return zipEntry.r.Read(p)
+}
+
+func (zipEntry *ZipEntry) open() error {
+	offset, err := zipEntry.fs.offset(zipEntry.Idx) // where this entry's data starts in the archive
+	if err != nil {
+		return err
+	}
+
+	zipFile, err := zipEntry.fs.opener(zipEntry.ctx) // a separate handle for this entry
+	if err != nil {
+		return err
+	}
+
+	if _, err := zipFile.Seek(offset, io.SeekStart); err != nil {
+		zipFile.Close()
+		return err
+	}
+
+	limitReader := io.LimitReader(zipFile, zipEntry.CompressedSize) // stop after this entry's compressed bytes
+	switch zipEntry.Method {
+	case zip.Store:
+		zipEntry.r = limitReader // stored entries are read as-is
+	case zip.Deflate:
+		zipEntry.r = flate.NewReader(limitReader)
+	default:
+		zipFile.Close()
+		return fmt.Errorf("invalid zip method")
+	}
+
+	zipEntry.c = zipFile // kept so that Close can release the handle
+	return nil
+}
diff --git a/internal/vfs/zipfs/zipfs.go b/internal/vfs/zipfs/zipfs.go
new file mode 100644
index 000000000..d91523dce
--- /dev/null
+++ b/internal/vfs/zipfs/zipfs.go
@@ -0,0 +1,183 @@
+package zipfs
+
+import (
+	"archive/zip"
+	"context"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"os"
+	"sync"
+	"time"
+
+	"gitlab.com/gitlab-org/gitlab-pages/internal/vfs"
+)
+
+var _ vfs.VFS = &VFS{} // compile-time check that *VFS implements vfs.VFS
+
+type OpenZipFile interface {
+	io.ReadSeeker
+	io.ReaderAt
+	io.Closer
+}
+
+// Opener is a factory that returns a new open zip file. We need this
+// because httprs is not thread safe.
+type Opener func(ctx context.Context) (OpenZipFile, error)
+
+type VFS struct {
+	opener Opener    // opens a fresh handle on the zip archive
+	expiry time.Time // compared against the clock by HasExpired
+
+	files   []*Attributes // filled and sorted by readAttributesFromZip
+	modTime time.Time     // reported by FileInfo.ModTime; not assigned anywhere in this patch
+	zipReader
+	offsetCache
+}
+
+// zipReader is an open zipfile. We keep it around to lazily look up entry data offsets.
+type zipReader struct {
+	*zip.Reader
+	c io.Closer // the file Reader was created from; closed by Close
+	sync.Mutex
+}
+
+func (zr *zipReader) DataOffset(i int) (int64, error) {
+	zr.Lock() // only one offset lookup at a time
+	defer zr.Unlock()
+	return zr.Reader.File[i].DataOffset()
+}
+
+func (zr *zipReader) Close() error { return zr.c.Close() }
+
+// offsetCache stores zip entry data offsets. We cache these offsets
+// because each lookup requires an HTTP request and we can only look up
+// one offset at a time with httprs.
+type offsetCache struct {
+	offsets map[int]int64
+	sync.RWMutex
+}
+
+func seekerSize(s io.Seeker) (int64, error) {
+	size, err := s.Seek(0, io.SeekEnd) // the end offset is the total size
+	if err != nil {
+		return 0, err
+	}
+
+	_, err = s.Seek(0, io.SeekStart) // rewind so later reads start at the beginning
+	return size, err
+}
+
+func Open(ctx context.Context, opener Opener, expiry time.Time) (*VFS, error) {
+	fs := &VFS{
+		opener: opener,
+		expiry: expiry,
+	}
+
+	f, err := fs.opener(ctx)
+	if err != nil {
+		return nil, err
+	}
+
+	size, err := seekerSize(f) // zip.NewReader needs the archive size
+	if err != nil {
+		f.Close()
+		return nil, err
+	}
+
+	// This will do IO: it reads and parses the zip central directory
+	fs.zipReader.Reader, err = zip.NewReader(f, size)
+	if err != nil {
+		f.Close()
+		return nil, err
+	}
+	fs.zipReader.c = f // f stays open on success: zipReader keeps it for later offset lookups
+
+	if err := fs.readAttributesFromZip(); err != nil {
+		f.Close()
+		return nil, err
+	}
+
+	return fs, nil
+}
+
+// offset returns the offset in the zip archive where the data of entry i starts.
+func (fs *VFS) offset(i int) (int64, error) {
+	fs.offsetCache.RLock() // fast path: look for a cached offset under the read lock
+	offset, ok := fs.offsetCache.offsets[i]
+	fs.offsetCache.RUnlock()
+	if ok {
+		return offset, nil
+	}
+
+	offset, err := fs.zipReader.DataOffset(i) // cache miss: ask the zip reader
+	if err != nil {
+		return 0, err
+	}
+
+	fs.offsetCache.Lock()
+	fs.offsetCache.offsets[i] = offset
+	fs.offsetCache.Unlock()
+
+	return offset, nil
+}
+
+func (fs *VFS) HasExpired() bool { return time.Now().Add(10 * time.Minute).After(fs.expiry) } // true once fs.expiry is less than 10 minutes away (or already past)
+
+type FileInfo struct {
+	*Attributes
+	modTime time.Time
+}
+
+func (fi FileInfo) Name() string       { return fi.Attributes.Name }
+func (fi FileInfo) Size() int64        { return fi.Attributes.Size }
+func (fi FileInfo) Mode() os.FileMode  { return fi.Attributes.FileMode }
+func (fi FileInfo) ModTime() time.Time { return fi.modTime }
+func (fi FileInfo) IsDir() bool        { return fi.Attributes.FileMode.IsDir() }
+func (fi FileInfo) Sys() interface{}   { return nil }
+
+type notFound struct{ name string }
+
+func (nf notFound) Error() string { return fmt.Sprintf("not found: %s", nf.name) }
+
+func (fs *VFS) Lstat(ctx context.Context, name string) (os.FileInfo, error) {
+	attr, ok := fs.getAttributes(name)
+	if !ok {
+		return nil, notFound{name}
+	}
+
+	return FileInfo{Attributes: attr, modTime: fs.modTime}, nil
+}
+
+func (fs *VFS) Readlink(ctx context.Context, name string) (string, error) {
+	attr, ok := fs.getAttributes(name)
+	if !ok {
+		return "", notFound{name}
+	}
+	if attr.FileMode&os.ModeSymlink == 0 {
+		return "", fmt.Errorf("not a symlink: %s", name)
+	}
+
+	f := fs.zipEntry(ctx, attr)
+	defer f.Close()
+
+	data, err := ioutil.ReadAll(f) // the entry's content is returned as the link target
+	return string(data), err
+}
+
+func (fs *VFS) Open(ctx context.Context, name string) (vfs.File, error) {
+	attr, ok := fs.getAttributes(name)
+	if !ok {
+		return nil, notFound{name}
+	}
+
+	return fs.zipEntry(ctx, attr), nil
+}
+
+func (fs *VFS) zipEntry(ctx context.Context, attr *Attributes) *ZipEntry {
+	return &ZipEntry{ // only builds the struct; the archive is opened on the entry's first Read
+		fs:         fs,
+		Attributes: attr,
+		ctx:        ctx,
+	}
+}
-- 
GitLab


From 0f2aa74d4a3014aa8fc1db7fa6b5b09b1756cfad Mon Sep 17 00:00:00 2001
From: Jacob Vosmaer
Date: Mon, 17 Aug 2020 16:20:56 +0200
Subject: [PATCH 2/3] Initialize map

---
 internal/vfs/zipfs/zipfs.go | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/internal/vfs/zipfs/zipfs.go b/internal/vfs/zipfs/zipfs.go
index d91523dce..36da3f78a 100644
--- a/internal/vfs/zipfs/zipfs.go
+++ b/internal/vfs/zipfs/zipfs.go
@@ -116,6 +116,9 @@ func (fs *VFS) offset(i int) (int64, error) {
 	}
 
 	fs.offsetCache.Lock()
+	if fs.offsetCache.offsets == nil {
+		fs.offsetCache.offsets = make(map[int]int64)
+	}
 	fs.offsetCache.offsets[i] = offset
 	fs.offsetCache.Unlock()
 
-- 
GitLab


From bf336db1794eda93c7175f36eff5fe5f66ff9a57 Mon Sep 17 00:00:00 2001
From: Jacob Vosmaer
Date: Mon, 17 Aug 2020 18:57:26 +0200
Subject: [PATCH 3/3] Move map initialization

---
 internal/vfs/zipfs/zipfs.go | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/internal/vfs/zipfs/zipfs.go b/internal/vfs/zipfs/zipfs.go
index 36da3f78a..734ca1f01 100644
--- a/internal/vfs/zipfs/zipfs.go
+++ b/internal/vfs/zipfs/zipfs.go
@@ -73,6 +73,7 @@ func Open(ctx context.Context, opener Opener, expiry time.Time) (*VFS, error) {
 		opener: opener,
 		expiry: expiry,
 	}
+	fs.offsetCache.offsets = make(map[int]int64) // allocate up front: offset() writes to this map
 
 	f, err := fs.opener(ctx)
 	if err != nil {
@@ -116,9 +117,6 @@ func (fs *VFS) offset(i int) (int64, error) {
 	}
 
 	fs.offsetCache.Lock()
-	if fs.offsetCache.offsets == nil {
-		fs.offsetCache.offsets = make(map[int]int64)
-	}
 	fs.offsetCache.offsets[i] = offset
 	fs.offsetCache.Unlock()
 
-- 
GitLab