diff --git a/_support/benchmarking/experiments/mb-tran/config.yml b/_support/benchmarking/experiments/mb-tran/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..4ee1b8700c9b5ad944050eb9c562fed76647d98b --- /dev/null +++ b/_support/benchmarking/experiments/mb-tran/config.yml @@ -0,0 +1,164 @@ +--- +project: "gitaly-benchmark-0150d6cf" +benchmark_region: "us-central1" +benchmark_zone: "us-central1-a" + +# Enable to use regional persistent disk https://cloud.google.com/compute/docs/disks/regional-persistent-disk +# Regional PD is supported on only E2, N1, N2, and N2D machine type VMs. +use_regional_disk: false +# One of these zones must be the zone for benchmark_zone +regional_disk_replica_zones: ["us-central1-a", "us-central1-b"] + +# The image to use for the client node and all Gitaly nodes. +os_image: "ubuntu-os-cloud/ubuntu-2204-lts" + +# Configuration parameters for the sole client instance. +client: + machine_type: "n2d-standard-4" + boot_disk_size: 20 + boot_disk_type: "pd-balanced" + # The client clones Gitaly in order to consume the protobuf definitions. + # This revision should be set to the latest of the gitaly_revision values + # specified for the gitaly_instances below. + gitaly_revision: HEAD + +# Configuration parameters for a collection of N Gitaly nodes. Benchmarks will execute for each Gitaly +# node. +gitaly_instances: + # Try to use a short name, otherwise we'll exceed the GCP resource name length of 63 characters + - name: "test" + machine_type: "n2d-standard-16" + boot_disk_type: "pd-balanced" + boot_disk_size: 20 + gitaly_revision: HEAD + disk_size: 100 + disk_type: "pd-balanced" + # ==== Standard Filesystem settings ==== + # Btrfs setup + # filesystem: "btrfs" # Options: ext4, xfs, btrfs + # fs_mount_opts: "noatime,compress=zstd,space_cache=v2,ssd,discard=async" + # fs_format_opts: "-f -n 16k" + # XFS setup + # filesystem: "xfs" # Options: ext4, xfs, btrfs + # fs_mount_opts: "defaults,discard" + # fs_format_opts: "-f" + # ext4 setup + filesystem: "ext4" # Options: ext4, xfs, btrfs + fs_mount_opts: "defaults,discard" + fs_format_opts: "" + + # These are manually templated and don't translate directly to config.toml entries. + config: + # Whether transactions should be enabled. + transactions: true + environment: + # Arbitrary environment variables. Note that SNAPSHOT_DRIVER doesn't actually do anything; it + # just serves as an example. + SNAPSHOT_DRIVER: deepclone + - name: "base" + machine_type: "n2d-standard-16" + boot_disk_type: "pd-balanced" + boot_disk_size: 20 + gitaly_revision: HEAD + disk_size: 100 + disk_type: "pd-balanced" + # ==== Standard Filesystem settings ==== + # Btrfs setup + # filesystem: "btrfs" # Options: ext4, xfs, btrfs + # fs_mount_opts: "noatime,compress=zstd,space_cache=v2,ssd,discard=async" + # fs_format_opts: "-f -n 16k" + # XFS setup + # filesystem: "xfs" # Options: ext4, xfs, btrfs + # fs_mount_opts: "defaults,discard" + # fs_format_opts: "-f" + # ext4 setup + filesystem: "ext4" # Options: ext4, xfs, btrfs + fs_mount_opts: "defaults,discard" + fs_format_opts: "" + + # These are manually templated and don't translate directly to config.toml entries. + config: + # Whether transactions should be enabled. + transactions: false + environment: + # Arbitrary environment variables. Note that SNAPSHOT_DRIVER doesn't actually do anything; it + # just serves as an example. + SNAPSHOT_DRIVER: deepclone + +# A list of repositories to be cloned onto the repositories disk, along with test inputs to be used +# for RPC calls. 
This section of the configuration is re-serialised into JSON and provided to the +# K6 script. +# +# NOTE: you may wish to delete some of these entries, otherwise benchmarking setup may take a while +# to clone each repo. +repositories: + - name: git + # Whether this repository should be tested. This toggle is read by the K6 script. + include_in_test: true + # Which reference backend to use. This defines how the repository will be cloned. + reference_backend: files + remote: "https://gitlab.com/gitlab-org/git.git" + revision: "2462961280690837670d997bde64bd4ebf8ae66d" + # Test data to be used as RPC inputs. The K6 script will randomly choose out of these available + # inputs. + testdata: + commits: + - "fea9d18c534a445ef6e488d8ee711fa92fa0e6bd" + - "0a15bb634cf005a0266ee1108ac31aa75649a61c" + - "217e4a23d76fe95a0f6ab0f6159de2460db6fcd9" + refs: + - "refs/heads/master" + - "refs/heads/next" + - "refs/tags/v2.50.0" + files: + - "README.md" + - "t/lib-diff.sh" + - "packfile.c" + directories: + - "t" + - "Documentation" + - "git-gui" + - name: gitlab + include_in_test: true + reference_backend: files + remote: "https://gitlab.com/gitlab-org/gitlab.git" + revision: "8f3978675aa4df643cff5a01a8e1896ae754685a" + testdata: + commits: + - "875ffb690e25eb8c98797b5641c6c16c71454b73" + - "3074e43761003e2566ea604053fe4988774d2896" + - "9ace97496c56335c5739c226853b468afd962830" + refs: + - "refs/heads/master" + - "refs/tags/v18.1.5-ee" + - "refs/tags/v17.11.6-ee" + files: + - "README.md" + - "lib/gitaly/server.rb" + - "ee/app/graphql/resolvers/epics_resolver.rb" + directories: + - "app" + - "rubocop" + - "qa" + - name: gitaly + include_in_test: true + reference_backend: files + remote: "https://gitlab.com/gitlab-org/gitaly.git" + revision: "4d78df8cb5c6b3abfef5530830dba6c67d9d4c53" + testdata: + commits: + - "765d81272feb53bcc0c50199183b4514c5ef7a73" + - "25965387d1a0a91d226912649180f38c04d89a36" + - "552d12d94dd24ad8dff93856e77a08b6a96f1d3e" + refs: + - "refs/heads/master" + - "refs/tags/v18.1.5" + - "refs/tags/v17.11.7" + files: + - "README.md" + - "internal/gitaly/rangediff/range_diff_test.go" + - "proto/go/gitalypb/blob.pb.go" + directories: + - "internal" + - "proto" + - "internal/gitaly/service/raft" diff --git a/_support/benchmarking/experiments/mb-tran/k6-benchmark.js b/_support/benchmarking/experiments/mb-tran/k6-benchmark.js new file mode 100644 index 0000000000000000000000000000000000000000..264d4ffde33c4fbf8c56a8dc74010f723c42c35f --- /dev/null +++ b/_support/benchmarking/experiments/mb-tran/k6-benchmark.js @@ -0,0 +1,300 @@ +import { Client, Stream, StatusOK } from 'k6/net/grpc' +import encoding from 'k6/encoding' +import { check } from 'k6' +import exec from 'k6/x/exec' + +// Consume the environment variables we set in the Ansible task. 
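+// GITALY_ADDRESS is the Gitaly node under test, GITALY_PROTO_DIR is the directory
+// the .proto definitions are loaded from, RUN_NAME namespaces the setup sentinel
+// file, and WORKLOAD_DURATION sets how long the constant-rate scenarios run.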
+const gitalyAddress = __ENV.GITALY_ADDRESS +const gitalyProtoDir = __ENV.GITALY_PROTO_DIR +const runName = __ENV.RUN_NAME +const workloadDuration = __ENV.WORKLOAD_DURATION + + +// optionsStatic returns a test scenario where constant load is offered to Gitaly +const optionsStatic = () => { + const SCENARIO_DEFAULTS = { + executor: 'constant-arrival-rate', + duration: workloadDuration, + timeUnit: '1s', + gracefulStop: '0s', + preAllocatedVUs: 40 + } + + return { + scenarios: { + findCommit: { ...SCENARIO_DEFAULTS, rate: 200, exec: 'findCommit' }, + getBlobs: { ...SCENARIO_DEFAULTS, rate: 200, exec: 'getBlobs' }, + getTreeEntries: { ...SCENARIO_DEFAULTS, rate: 200, exec: 'getTreeEntries' }, + treeEntry: { ...SCENARIO_DEFAULTS, rate: 100, exec: 'treeEntry' }, + listCommitsByOid: { ...SCENARIO_DEFAULTS, rate: 200, exec: 'listCommitsByOid' } + // writeAndDeleteRefs: { ...SCENARIO_DEFAULTS, rate: 100, exec: 'writeAndDeleteRefs' } + }, + setupTimeout: '5m' + } +} + +// optionsRamping returns a test scenario where a ramping workload is offered to Gitaly +const optionsRamping = () => { + const SCENARIO_DEFAULTS = { + executor: 'ramping-arrival-rate', + timeUnit: '1s', + preAllocatedVUs: 40 + } + + const stages_read = [{target: 50, duration: '100s'}, {target: 100, duration: '50s'}, {target: 200, duration: '100s'}, {target: 50, duration: '50s'}] + // const stages_write = [{target: 25, duration: '100s'}, {target: 50, duration: '50s'}, {target: 100, duration: '100s'}, {target: 25, duration: '50s'}] + + return { + scenarios: { + findCommit: { + ...SCENARIO_DEFAULTS, + stages: stages_read, + exec: 'findCommit' + }, + getBlobs: { + ...SCENARIO_DEFAULTS, + stages: stages_read, + exec: 'getBlobs' + }, + getTreeEntries: { + ...SCENARIO_DEFAULTS, + stages: stages_read, + exec: 'getTreeEntries' + }, + treeEntry: { + ...SCENARIO_DEFAULTS, + stages: stages_read, + exec: 'treeEntry' + }, + listCommitsByOid: { + ...SCENARIO_DEFAULTS, + stages: stages_read, + exec: 'listCommitsByOid' + } + // writeAndDeleteRefs: { + // ...SCENARIO_DEFAULTS, + // stages: stages_write, + // exec: 'writeAndDeleteRefs' + // } + }, + setupTimeout: '5m' + } + +} + +export const options = optionsRamping() + +const repos = JSON.parse(open("/opt/benchmark-gitaly/repositories.json")); + +const selectTestRepo = () => { + const active = repos.filter(r => r.include_in_test); + const repo = active[Math.floor(Math.random() * active.length)]; + + return { + repository: { + storageName: 'default', + relativePath: `${repo.name}`, + glRepository: repo.name, // irrelevant but mandatory + glProjectPath: `foo/bar/${repo.name}`, // irrelevant but mandatory + }, + commit: repo.testdata.commits[Math.floor(Math.random() * repo.testdata.commits.length)], + ref: repo.testdata.refs[Math.floor(Math.random() * repo.testdata.refs.length)], + file: repo.testdata.files[Math.floor(Math.random() * repo.testdata.files.length)], + directory: repo.testdata.directories[Math.floor(Math.random() * repo.testdata.directories.length)], + } +} + +const generateRandom = () => Math.random().toString(36).substring(2, 15) + Math.random().toString(23).substring(2, 5) + +export function setup () { + const setupCompletionSentinel = `/tmp/${runName}-setup-complete` + // Signal to Ansible that setup is complete, in a very hacky way. 
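+  // The sentinel is written by shelling out through the k6/x/exec extension, since
+  // k6 itself offers no straightforward way to write files from a script. The
+  // Ansible play presumably polls for this path before starting the measurement
+  // window; teardown() removes it again afterwards.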
+ exec.command('touch', [setupCompletionSentinel]) + + return { + setupCompletionSentinel + } +} + +export function teardown (context) { + exec.command('rm', [context.setupCompletionSentinel]) +} + +const client = new Client() +// k6 provides no easy way to list directory contents. +client.load([gitalyProtoDir], 'commit.proto', 'blob.proto', 'ref.proto', 'repository.proto') + +export function findCommit () { + client.connect(gitalyAddress, { + plaintext: true + }) + + const testRepo = selectTestRepo(); + const req = { + repository: testRepo.repository, + revision: encoding.b64encode(testRepo.commit) + } + + const res = client.invoke('gitaly.CommitService/FindCommit', req) + check(res, { + 'FindCommit - StatusOK': r => r && r.status === StatusOK + }) + + client.close() +} + +export function getBlobs () { + client.connect(gitalyAddress, { + plaintext: true + }) + + const testRepo = selectTestRepo(); + const req = { + repository: testRepo.repository, + revision_paths: [ + { + revision: testRepo.commit, + path: encoding.b64encode(testRepo.file) + } + ], + limit: -1 + } + + const stream = new Stream(client, 'gitaly.BlobService/GetBlobs') + stream.on('data', data => { + check(data, { + 'GetBlobs - data present in response': r => r && r.data + }) + }) + + stream.on('end', function () { + client.close() + }) + + stream.on('error', function(err) { + console.error(err) + }) + + stream.write(req) +} + +export function getTreeEntries () { + client.connect(gitalyAddress, { + plaintext: true + }) + + const testRepo = selectTestRepo(); + const req = { + repository: testRepo.repository, + revision: encoding.b64encode(testRepo.commit), + path: encoding.b64encode(testRepo.directory) + } + + const stream = new Stream(client, 'gitaly.CommitService/GetTreeEntries') + stream.on('data', data => { + check(data, { + 'GetTreeEntries - entries present in response': r => r && r.entries + }) + }) + + stream.on('end', function () { + client.close() + }) + + stream.on('error', function(err) { + console.error(err) + }) + + stream.write(req) +} + +export function treeEntry () { + client.connect(gitalyAddress, { + plaintext: true + }) + + const testRepo = selectTestRepo(); + const req = { + repository: testRepo.repository, + revision: encoding.b64encode(testRepo.ref), + path: encoding.b64encode(testRepo.file) + } + + const stream = new Stream(client, 'gitaly.CommitService/TreeEntry') + stream.on('data', data => { + check(data, { + 'TreeEntry - data present in response': r => r && r.data + }) + }) + + stream.on('end', function () { + client.close() + }) + + stream.on('error', function(err) { + console.error(err) + }) + + stream.write(req) +} + +export function listCommitsByOid () { + client.connect(gitalyAddress, { + plaintext: true + }) + + const testRepo = selectTestRepo(); + const req = { + repository: testRepo.repository, + oid: [testRepo.commit] + } + + const stream = new Stream(client, 'gitaly.CommitService/ListCommitsByOid') + stream.on('data', data => { + check(data, { + 'ListCommitsByOid - commits present in response': r => r && r.commits + }) + }) + + stream.on('end', function () { + client.close() + }) + + stream.on('error', function(err) { + console.error(err) + }) + + stream.write(req) +} + +export function writeAndDeleteRefs () { + client.connect(gitalyAddress, { + plaintext: true + }) + + const testRepo = selectTestRepo(); + const generatedRef = 'refs/test/' + generateRandom() + + const writeRefReq = { + repository: testRepo.repository, + ref: encoding.b64encode(generatedRef), + revision: 
encoding.b64encode(testRepo.commit) + } + + const writeRefRes = client.invoke('gitaly.RepositoryService/WriteRef', writeRefReq) + check(writeRefRes, { + 'WriteRef - StatusOK': r => r && r.status === StatusOK + }) + + const deleteRefsReq = { + repository: testRepo.repository, + refs: [encoding.b64encode(generatedRef)] + } + + const deleteRefsRes = client.invoke('gitaly.RefService/DeleteRefs', deleteRefsReq) + check(deleteRefsRes, { + 'DeleteRefs - StatusOK': r => r && r.status === StatusOK + }) + + client.close() +} diff --git a/_support/benchmarking/experiments/mb-tran/plot.py b/_support/benchmarking/experiments/mb-tran/plot.py new file mode 100644 index 0000000000000000000000000000000000000000..b5a7377e872148520b179925d80a3c56bb986ab2 --- /dev/null +++ b/_support/benchmarking/experiments/mb-tran/plot.py @@ -0,0 +1,337 @@ +import pandas as pd +from plotnine import * +import sys +import json + +# Define custom color palette +custom_colors = [ + "#ffd700", + "#fa8775", + "#ffb14e", + "#ea5f94", + "#cd34b5", + "#9d02d7", + "#0000ff", +] + +def load(fname): + # The log file is a newline-separated collection of JSON objects, each of which can + # be nested and needs to be flattened. + df = pd.json_normalize(pd.Series(open(fname).readlines()).apply(json.loads)) + + # The time column is often used for aggregations. + df["time"] = pd.to_datetime(df["time"]) + return df + + +def stats_rpc_count(df, outdir): + df = df[df["grpc.request.glRepository"].str.len() > 0] + df = df[df["grpc.method"].str.len() > 0] + + df = ( + df.groupby(["time_interval", "grpc.request.glRepository", "grpc.method", "grpc.code"]) + .size() + .reset_index(name="request_count") + ) + + with open(f"{outdir}/rpc_count_by_repo.txt", "w") as f: + f.write(df.to_string(index=False)) + + p = ( + ggplot( + df, + aes( + x="time_interval", + y="request_count", + color="grpc.method", + shape="grpc.request.glRepository", + ), + ) + + geom_line() + + scale_x_datetime(date_labels="%H:%M:%S", date_breaks="5 seconds") + + theme_seaborn( + style="darkgrid", context="notebook", font="sans-serif", font_scale=1) + + theme( + axis_text_x=element_text(rotation=45, hjust=1), figure_size=(12, 8), dpi=200 + ) + + labs( + title="gRPC Request Count", + x="Time", + y="Count", + color="Method", + shape="Repository", + ) + + facet_grid("grpc.request.glRepository", "grpc.code") + ) + + p.save(f"{outdir}/rpc_count_by_repo.png") + + +def stats_rpc_latency(df, outdir): + df = df[df["grpc.request.glRepository"].str.len() > 0] + df = df[df["grpc.method"].str.len() > 0] + df = df[df["grpc.time_ms"].notna()] + + df = ( + df.groupby(["time_interval", "grpc.request.glRepository", "grpc.method", "grpc.code"])[ + "grpc.time_ms" + ] + .quantile(0.95) + .reset_index() + ) + with open(f"{outdir}/rpc_latency_by_repo.txt", "w") as f: + f.write(df.to_string(index=False)) + + p = ( + ggplot( + df, + aes( + x="time_interval", + y="grpc.time_ms", + color="grpc.method", + shape="grpc.request.glRepository", + ), + ) + + geom_line() + + scale_x_datetime(date_labels="%H:%M:%S", date_breaks="5 seconds") + + scale_y_continuous(limits=(0, 12000)) + + theme_seaborn( + style="darkgrid", context="notebook", font="sans-serif", font_scale=1) + + theme( + axis_text_x=element_text(rotation=45, hjust=1), figure_size=(12, 16), dpi=200 + ) + + labs( + title="gRPC Response Latency", + x="Time", + y="Latency (ms, p95)", + color="Method", + shape="Repository", + ) + + facet_grid("grpc.request.glRepository", "grpc.code") + ) + + p.save(f"{outdir}/rpc_latency_by_repo.png") + + +def 
stats_snapshot(df, outdir): + if "snapshot.duration_ms" not in df.columns: + print("No snapshot creation events found in the log") + return + + df = df[df["snapshot.duration_ms"].notna()] + df = df[df["grpc.request.glRepository"].notna()] + + df = ( + df.groupby(["time_interval", "grpc.request.glRepository"])[ + "snapshot.duration_ms" + ] + .quantile(0.95) + .reset_index() + ) + with open(f"{outdir}/snapshot_creation_latency_by_repo.txt", "w") as f: + f.write(df.to_string(index=False)) + + p = ( + ggplot( + df, + aes( + x="time_interval", + y="snapshot.duration_ms", + color="grpc.request.glRepository", + ), + ) + + geom_line() + + scale_x_datetime(date_labels="%H:%M:%S", date_breaks="5 seconds") + + theme_seaborn( + style="darkgrid", context="notebook", font="sans-serif", font_scale=1) + + theme( + axis_text_x=element_text(rotation=45, hjust=1), figure_size=(12, 8), dpi=200 + ) + + labs( + title="Snapshot Creation Latency", + x="Time", + y="Latency (ms, p95)", + color="Repository", + ) + ) + + p.save(f"{outdir}/snapshot_creation_latency_by_repo.png") + + +def analyze_snapshot_creation_rate(df, outdir): + if "snapshot.duration_ms" not in df.columns: + print("No snapshot creation events found in the log") + return + + # Filter for snapshot creation events only + snapshots = df[df["snapshot.duration_ms"].notna()] + + interval = "1s" # 1 second windows + snapshots = with_interval(snapshots, interval) + + metrics = [] + + # Group by both time_interval AND snapshot.exclusive + for time_window in snapshots["time_interval"].unique(): + for exclusive_value in snapshots["snapshot.exclusive"].unique(): + window_data = snapshots[ + (snapshots["time_interval"] == time_window) & + (snapshots["snapshot.exclusive"] == exclusive_value) + ] + + count = len(window_data) + print(f"There are {count} snapshots (exclusive={exclusive_value}) in window {time_window}") + + if count > 0: + creation_rate = count / pd.Timedelta(interval).total_seconds() + p95_latency = window_data["snapshot.duration_ms"].quantile(0.95) + else: + creation_rate = 0 + p95_latency = None + + # Calculate latency percentiles + metrics.append( + { + "time_interval": time_window, + "exclusive": exclusive_value, + "count": count, + "creation_rate_per_sec": creation_rate, + "p95_latency_ms": p95_latency, + } + ) + + metrics_df = pd.DataFrame(metrics) + + # Remove rows with no data for cleaner plotting + plot_data = metrics_df[metrics_df["p95_latency_ms"].notna()] + + # Plot :: Latency vs Creation Rate (Throughput) grouped by exclusive flag + p = ( + ggplot(plot_data, aes(x="creation_rate_per_sec", color="exclusive")) + + geom_point(aes(y="p95_latency_ms"), size=3, alpha=0.7) + + geom_smooth( + aes(y="p95_latency_ms"), method="lm", se=False, size=1 + ) + + scale_color_manual(values=custom_colors, name="Exclusive") + + theme_seaborn( + style="darkgrid", context="notebook", font="sans-serif", font_scale=1) + + theme(figure_size=(12, 7), dpi=200) + + labs( + title="Impact of Creation Rate on Snapshot P95 Duration in 1s interval", + subtitle="P95 latencies vs actual snapshot throughput, grouped by exclusive flag", + x="Creation Rate - Throughput (snapshots completed/second)", + y="Snapshot P95 Duration Latency (ms)", + ) + ) + p.save(f"{outdir}/latency_vs_creation_rate.png") + print(f"Saved: {outdir}/latency_vs_creation_rate.png") + + +def analyze_snapshot_duration_by_repository(df, outdir): + if "snapshot.duration_ms" not in df.columns: + print("No snapshot creation events found in the log") + return + + # Filter for snapshot events AND valid 
repository paths
+    snapshots = df[
+        (df["snapshot.duration_ms"].notna())
+        & (df["grpc.request.glProjectPath"].notna())
+    ]
+
+    # Get repository counts
+    repo_counts = snapshots["grpc.request.glProjectPath"].value_counts()
+
+    # Plot :: Overlapping histograms of snapshot duration by repository, log scale on X axis
+    p = (
+        ggplot(
+            snapshots, aes(x="snapshot.duration_ms", fill="grpc.request.glProjectPath")
+        )
+        + geom_histogram(bins=30, alpha=0.5, position="identity")
+        + theme_seaborn(
+            style="darkgrid", context="notebook", font="sans-serif", font_scale=1)
+        + theme(
+            figure_size=(16, 10),
+            dpi=200,
+            legend_position="right",
+            legend_title=element_text(size=10, weight="bold"),
+            legend_text=element_text(size=8),
+        )
+        + labs(
+            title="Snapshot Duration Distribution by Repository",
+            subtitle="Overlapping histograms show how snapshot duration varies across repositories",
+            x="Duration (ms) - Log Scale",
+            y="Count",
+        )
+        + scale_x_log10()
+        + scale_fill_manual(
+            values=custom_colors * 3, name="Repository"
+        )  # *3 to ensure enough colors, though they will start repeating
+    )
+    p.save(f"{outdir}/snapshot_duration_by_repository.png")
+    print(f"\nSaved: {outdir}/snapshot_duration_by_repository.png")
+
+def analyze_snapshot_by_files_dirs(df, outdir):
+    if "snapshot.duration_ms" not in df.columns:
+        print("No snapshot creation events found in the log")
+        return
+
+    # Filter for snapshot events AND valid repository paths
+    snapshots = df[
+        (df["snapshot.duration_ms"].notna())
+        & (df["grpc.request.glProjectPath"].notna())
+    ]
+
+    relevant_cols = ['snapshot.directory_count', 'snapshot.file_count', 'snapshot.duration_ms']
+
+    # Plot :: Scatter plot: dirs (x) vs files (y), colored by duration
+    p = (
+        ggplot(snapshots, aes(x='snapshot.directory_count', y='snapshot.file_count', color='snapshot.duration_ms'))
+        + geom_point(size=3, alpha=0.6)
+        + scale_color_gradient2(
+            low=custom_colors[0],    # yellow for fast
+            mid=custom_colors[3],    # pink for medium
+            high=custom_colors[-1],  # blue for slow
+            midpoint=snapshots['snapshot.duration_ms'].median(),
+            name='Duration (ms)'
+        )
+        + theme_seaborn(
+            style="darkgrid", context="notebook", font="sans-serif", font_scale=1)
+        + theme(
+            figure_size=(12, 10),
+            dpi=200,
+            legend_position='right'
+        )
+        + labs(
+            title="Snapshot Duration By Files X Directories",
+            subtitle="Each dot represents a snapshot operation",
+            x="Directory Count",
+            y="File Count"
+        )
+        + facet_wrap("grpc.request.glRepository", ncol=1)
+    )
+
+    p.save(f"{outdir}/snapshot_files_dirs_duration.png")
+    print(f"\nSaved: {outdir}/snapshot_files_dirs_duration.png")
+
+def with_interval(df, interval):
+    df["time_interval"] = df["time"].dt.floor(interval)
+    return df
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 3:
+        print("Usage: plot.py <log_file> <output_dir>")
+        sys.exit(1)
+
+    log_filename = sys.argv[1]
+    output_directory = sys.argv[2]
+
+    df = load(log_filename)
+
+    with_interval(df, "1s")
+    stats_snapshot(df, output_directory)
+    stats_rpc_latency(df, output_directory)
+    stats_rpc_count(df, output_directory)
+    analyze_snapshot_creation_rate(df, output_directory)
+    analyze_snapshot_duration_by_repository(df, output_directory)
+    analyze_snapshot_by_files_dirs(df, output_directory)
diff --git a/_support/benchmarking/experiments/mb-tran/profile-gitaly.sh b/_support/benchmarking/experiments/mb-tran/profile-gitaly.sh
new file mode 100755
index 0000000000000000000000000000000000000000..2a37ecf93cfba405cfd6ba01f7759257a1ff97df
--- /dev/null
+++ b/_support/benchmarking/experiments/mb-tran/profile-gitaly.sh
@@ -0,0 +1,103 @@
+#!/bin/sh
+#
+# profile-gitaly: Profile host with perf and bpftrace.
+# Must be run as root.
+#
+# Mandatory arguments:
+#   -d <seconds>: Number of seconds to profile for
+#   -g <repo>: Name of Git repository being used
+#   -o <out_dir>: Directory to write output to
+#   -r <rpc>: Name of RPC being executed
+
+set -e
+
+usage() {
+    echo "Usage: $0 -d <seconds> -o <out_dir> -r <rpc> \
+-g <repo>"
+    exit 1
+}
+
+profile() {
+    # Profile on-CPU time for Gitaly and child processes
+    perf record --freq=99 -g --pid="$(pidof -s gitaly)" \
+        --output="${gitaly_perf_data}" -- sleep "${seconds}" &
+
+    # Profile on-CPU time for whole system
+    perf record --freq=97 -g --all-cpus \
+        --output="${all_perf_data}" -- sleep "${seconds}" &
+
+    # Profile off-CPU time for whole system (with filtering as a post-processing step)
+    min_stall_duration_us=1000
+    offcpu_profile_raw_output_file="${out_dir}/offcpu_profile.raw.txt.gz"
+    bpftrace /usr/local/gitaly_offcpu_profiler/offcpu_profile.bt "${seconds}" "${min_stall_duration_us}" \
+        | gzip > "${offcpu_profile_raw_output_file}" &
+
+    wait
+}
+
+generate_flamegraphs() {
+    gitaly_perf_txt="${out_dir}/gitaly-perf.txt.gz"
+    gitaly_perf_svg="${out_dir}/gitaly-perf.svg"
+    perf script --header --input="${gitaly_perf_data}" \
+        | gzip > "${gitaly_perf_txt}"
+    zcat "${gitaly_perf_txt}" \
+        | stackcollapse-perf --kernel \
+        | flamegraph --hash --colors=perl > "${gitaly_perf_svg}"
+
+    all_perf_txt="${out_dir}/all-perf.txt.gz"
+    all_perf_svg="${out_dir}/all-perf.svg"
+    perf script --header --input="${all_perf_data}" \
+        | gzip > "${all_perf_txt}"
+    zcat "${all_perf_txt}" \
+        | stackcollapse-perf --kernel \
+        | flamegraph --hash --colors=perl > "${all_perf_svg}"
+
+    /usr/local/gitaly_offcpu_profiler/offcpu_profile_postprocessing.sh "${offcpu_profile_raw_output_file}"
+}
+
+main() {
+    if [ "$(id -u)" -ne 0 ]; then
+        echo "$0 must be run as root" >&2
+        exit 1
+    fi
+
+    while getopts "hd:g:o:r:" arg; do
+        case "${arg}" in
+            d) seconds=${OPTARG} ;;
+            g) repo=${OPTARG} ;;
+            o) out_dir=${OPTARG} ;;
+            r) rpc=${OPTARG} ;;
+            h|*) usage ;;
+        esac
+    done
+
+    if [ "${seconds}" -le 0 ] \
+        || [ -z "${out_dir}" ] \
+        || [ -z "${rpc}" ] \
+        || [ -z "${repo}" ]; then
+        usage
+    fi
+
+    if ! pidof gitaly > /dev/null; then
+        echo "Gitaly is not running, aborting" >&2
+        exit 1
+    fi
+
+    # Ansible's minimal shell may not include /usr/local/bin in $PATH
+    if ! printenv PATH | grep "/usr/local/bin" > /dev/null; then
+        export PATH="${PATH}:/usr/local/bin"
+    fi
+
+    perf_tmp_dir=$(mktemp -d "/tmp/gitaly-perf-${repo}-${rpc}.XXXXXX")
+    gitaly_perf_data="${perf_tmp_dir}/gitaly-perf.out"
+    all_perf_data="${perf_tmp_dir}/all-perf.out"
+
+    profile
+
+    generate_flamegraphs
+
+    chown -R git:git "${out_dir}"
+    rm -rf "${perf_tmp_dir}"
+}
+
+main "$@"
diff --git a/_support/benchmarking/roles/benchmark/vars/main.yml b/_support/benchmarking/roles/benchmark/vars/main.yml
index f90aa2f05e18757e2bc01eb5a1b32d54dbe813fd..a9f2439ee8bc92ed4e56c178ed8300a4f9dbcd09 100644
--- a/_support/benchmarking/roles/benchmark/vars/main.yml
+++ b/_support/benchmarking/roles/benchmark/vars/main.yml
@@ -2,6 +2,6 @@
 profile: true
 clear_page_cache: true
 # Profiling and the workload will begin concurrently.
-profile_duration: 60
+profile_duration: 300
 workload_duration: "60s"
-workload_wait_duration: 120
+workload_wait_duration: 360
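+# Note: the ramping scenarios in the mb-tran k6 script run 100s + 50s + 100s + 50s
+# of stages (300s in total), which is presumably why profile_duration now matches
+# that window and workload_wait_duration was extended beyond it.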