From 4a0950bb55f6d36999e0678501f214b40d052d92 Mon Sep 17 00:00:00 2001 From: Karthik Nayak Date: Wed, 15 Jan 2025 11:32:28 +0100 Subject: [PATCH 1/5] reftable: write correct max_update_index to header In 297c09eabb (refs: allow multiple reflog entries for the same refname, 2024-12-16), the reftable backend learned to handle multiple reflog entries within the same transaction. This was done by modifying the `update_index` for reflogs with multiple indices. During writing the logs, the `max_update_index` of the writer was modified to ensure the limits were raised to the modified `update_index`s. However, since ref entries are written before the modification to the `max_update_index`, if there are multiple blocks to be written, the reftable backend writes the header with the old `max_update_index`. When all logs are finally written, the footer will be written with the new `min_update_index`. This causes a mismatch between the header and the footer and causes the reftable file to be corrupted. The existing tests only spawn a single block and since headers are lazily written with the first block, the tests didn't capture this bug. To fix the issue, the appropriate `max_update_index` limit must be set even before the first block is written. Add a `max_index` field to the transaction which holds the `max_index` within all its updates, then propagate this value to the reftable backend, wherein this is used to the set the `max_update_index` correctly. The first 2 patches are cleanups in this context: Commit 1: marks `ref_transaction_update_reflog()` as static since it is only used internally in refs.c Commit 2: marks 'ref_update.index' as 'uint64_t' to safeguard against large repositories with millions of refs. Commit 3: contains the actual fix for the bug. Commit 4: adds a safeguard, so such bugs don't occur in the future. --- b4-submit-tracking --- { "series": { "revision": 2, "change-id": "20250115-461-corrupted-reftable-when-migrating-reflogs-63a0974301d8", "prefixes": [] } } -- GitLab From 3a9ae81532355acd5fe63bf025d87eb34dc5a161 Mon Sep 17 00:00:00 2001 From: Karthik Nayak Date: Thu, 16 Jan 2025 07:37:20 +0100 Subject: [PATCH 2/5] refs: mark `ref_transaction_update_reflog()` as static The `ref_transaction_update_reflog()` function is only used within 'refs.c', so mark it as static. Reported-by: Junio C Hamano Signed-off-by: Karthik Nayak --- refs.c | 16 +++++++++------- refs.h | 14 -------------- 2 files changed, 9 insertions(+), 21 deletions(-) diff --git a/refs.c b/refs.c index 0f41b2fd4a6..f63f0411016 100644 --- a/refs.c +++ b/refs.c @@ -1318,13 +1318,15 @@ int ref_transaction_update(struct ref_transaction *transaction, return 0; } -int ref_transaction_update_reflog(struct ref_transaction *transaction, - const char *refname, - const struct object_id *new_oid, - const struct object_id *old_oid, - const char *committer_info, unsigned int flags, - const char *msg, unsigned int index, - struct strbuf *err) +static int ref_transaction_update_reflog(struct ref_transaction *transaction, + const char *refname, + const struct object_id *new_oid, + const struct object_id *old_oid, + const char *committer_info, + unsigned int flags, + const char *msg, + unsigned int index, + struct strbuf *err) { struct ref_update *update; diff --git a/refs.h b/refs.h index a0cdd99250e..09be47afbee 100644 --- a/refs.h +++ b/refs.h @@ -771,20 +771,6 @@ int ref_transaction_update(struct ref_transaction *transaction, unsigned int flags, const char *msg, struct strbuf *err); -/* - * Similar to`ref_transaction_update`, but this function is only for adding - * a reflog update. Supports providing custom committer information. The index - * field can be utiltized to order updates as desired. When not used, the - * updates default to being ordered by refname. - */ -int ref_transaction_update_reflog(struct ref_transaction *transaction, - const char *refname, - const struct object_id *new_oid, - const struct object_id *old_oid, - const char *committer_info, unsigned int flags, - const char *msg, unsigned int index, - struct strbuf *err); - /* * Add a reference creation to transaction. new_oid is the value that * the reference should have after the update; it must not be -- GitLab From 96e03c74c786a883178ae59e431adaa06d3516fb Mon Sep 17 00:00:00 2001 From: Karthik Nayak Date: Thu, 16 Jan 2025 07:48:57 +0100 Subject: [PATCH 3/5] refs: use 'uint64_t' for 'ref_update.index' The 'ref_update.index' variable is used to store an index for a given reference update. This index is used to order the updates in a predetermined order, while the default ordering is alphabetical as per the refname. For large repositories with millions of references, it should be safer to use 'uint64_t'. Let's do that. Reported-by: brian m. carlson Signed-off-by: Karthik Nayak --- refs.c | 4 ++-- refs/refs-internal.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/refs.c b/refs.c index f63f0411016..ff4f1639161 100644 --- a/refs.c +++ b/refs.c @@ -1325,7 +1325,7 @@ static int ref_transaction_update_reflog(struct ref_transaction *transaction, const char *committer_info, unsigned int flags, const char *msg, - unsigned int index, + uint64_t index, struct strbuf *err) { struct ref_update *update; @@ -2800,7 +2800,7 @@ static int migrate_one_ref(const char *refname, const char *referent UNUSED, con } struct reflog_migration_data { - unsigned int index; + uint64_t index; const char *refname; struct ref_store *old_refs; struct ref_transaction *transaction; diff --git a/refs/refs-internal.h b/refs/refs-internal.h index 16550862d3e..f4c52201662 100644 --- a/refs/refs-internal.h +++ b/refs/refs-internal.h @@ -120,7 +120,7 @@ struct ref_update { * when migrating reflogs and we want to ensure we carry over the * same order. */ - unsigned int index; + uint64_t index; /* * If this ref_update was split off of a symref update via -- GitLab From 64fd6586e00ea3a8556f79423db5a4038c4e1c24 Mon Sep 17 00:00:00 2001 From: Karthik Nayak Date: Wed, 15 Jan 2025 11:33:21 +0100 Subject: [PATCH 4/5] reftable: write correct max_update_index to header In 297c09eabb (refs: allow multiple reflog entries for the same refname, 2024-12-16), the reftable backend learned to handle multiple reflog entries within the same transaction. This was done by modifying the `update_index` for reflogs with multiple indices. During writing the logs, the `max_update_index` of the writer was modified to ensure the limits were raised to the modified `update_index`s. However, since ref entries are written before the modification to the `max_update_index`, if there are multiple blocks to be written, the reftable backend writes the header with the old `max_update_index`. When all logs are finally written, the footer will be written with the new `min_update_index`. This causes a mismatch between the header and the footer and causes the reftable file to be corrupted. The existing tests only spawn a single block and since headers are lazily written with the first block, the tests didn't capture this bug. To fix the issue, the appropriate `max_update_index` limit must be set even before the first block is written. Add a `max_index` field to the transaction which holds the `max_index` within all its updates, then propagate this value to the reftable backend, wherein this is used to the set the `max_update_index` correctly. Add a test which creates a few thousand reference updates with multiple reflog entries, which should trigger the bug. Reported-by: brian m. carlson Signed-off-by: Karthik Nayak --- refs.c | 15 ++++++++++++++- refs/refs-internal.h | 1 + refs/reftable-backend.c | 20 ++++++++++---------- t/t1460-refs-migrate.sh | 12 ++++++++++++ 4 files changed, 37 insertions(+), 11 deletions(-) diff --git a/refs.c b/refs.c index ff4f1639161..b182d89c3f7 100644 --- a/refs.c +++ b/refs.c @@ -1318,6 +1318,19 @@ int ref_transaction_update(struct ref_transaction *transaction, return 0; } +static void ref_transaction_update_index(struct ref_transaction *transaction, + struct ref_update *update, + uint64_t index) +{ + update->index = index; + /* + * Reference backends may need to know the max index to optimize + * their writes. So we store the max_index on the transaction level. + */ + if (index > transaction->max_index) + transaction->max_index = index; +} + static int ref_transaction_update_reflog(struct ref_transaction *transaction, const char *refname, const struct object_id *new_oid, @@ -1345,7 +1358,7 @@ static int ref_transaction_update_reflog(struct ref_transaction *transaction, * old_oid verification which only makes sense for refs. */ update->flags &= ~REF_HAVE_OLD; - update->index = index; + ref_transaction_update_index(transaction, update, index); return 0; } diff --git a/refs/refs-internal.h b/refs/refs-internal.h index f4c52201662..8894b43d1d1 100644 --- a/refs/refs-internal.h +++ b/refs/refs-internal.h @@ -203,6 +203,7 @@ struct ref_transaction { enum ref_transaction_state state; void *backend_data; unsigned int flags; + uint64_t max_index; }; /* diff --git a/refs/reftable-backend.c b/refs/reftable-backend.c index 00d95a9a2f4..6814c87bc61 100644 --- a/refs/reftable-backend.c +++ b/refs/reftable-backend.c @@ -942,6 +942,7 @@ struct write_transaction_table_arg { size_t updates_nr; size_t updates_alloc; size_t updates_expected; + uint64_t max_index; }; struct reftable_transaction_data { @@ -1428,7 +1429,6 @@ static int write_transaction_table(struct reftable_writer *writer, void *cb_data struct reftable_log_record *logs = NULL; struct ident_split committer_ident = {0}; size_t logs_nr = 0, logs_alloc = 0, i; - uint64_t max_update_index = ts; const char *committer_info; int ret = 0; @@ -1438,7 +1438,12 @@ static int write_transaction_table(struct reftable_writer *writer, void *cb_data QSORT(arg->updates, arg->updates_nr, transaction_update_cmp); - reftable_writer_set_limits(writer, ts, ts); + /* + * During reflog migration, we add indexes for a single reflog with + * multiple entries. Each entry will contain a different update_index, + * so set the limits accordingly. + */ + reftable_writer_set_limits(writer, ts, ts + arg->max_index); for (i = 0; i < arg->updates_nr; i++) { struct reftable_transaction_update *tx_update = &arg->updates[i]; @@ -1540,12 +1545,6 @@ static int write_transaction_table(struct reftable_writer *writer, void *cb_data */ log->update_index = ts + u->index; - /* - * Note the max update_index so the limit can be set later on. - */ - if (log->update_index > max_update_index) - max_update_index = log->update_index; - log->refname = xstrdup(u->refname); memcpy(log->value.update.new_hash, u->new_oid.hash, GIT_MAX_RAWSZ); @@ -1609,8 +1608,6 @@ static int write_transaction_table(struct reftable_writer *writer, void *cb_data * and log blocks. */ if (logs) { - reftable_writer_set_limits(writer, ts, max_update_index); - ret = reftable_writer_add_logs(writer, logs, logs_nr); if (ret < 0) goto done; @@ -1631,6 +1628,9 @@ static int reftable_be_transaction_finish(struct ref_store *ref_store UNUSED, struct reftable_transaction_data *tx_data = transaction->backend_data; int ret = 0; + if (tx_data->args) + tx_data->args->max_index = transaction->max_index; + for (size_t i = 0; i < tx_data->args_nr; i++) { ret = reftable_addition_add(tx_data->args[i].addition, write_transaction_table, &tx_data->args[i]); diff --git a/t/t1460-refs-migrate.sh b/t/t1460-refs-migrate.sh index f59bc4860f1..307b2998efe 100755 --- a/t/t1460-refs-migrate.sh +++ b/t/t1460-refs-migrate.sh @@ -227,6 +227,18 @@ do done done +test_expect_success 'multiple reftable blocks with multiple entries' ' + test_when_finished "rm -rf repo" && + git init --ref-format=files repo && + test_commit -C repo first && + printf "create refs/heads/ref-%d HEAD\n" $(test_seq 5000) >stdin && + git -C repo update-ref --stdin stdin && + git -C repo update-ref --stdin Date: Thu, 16 Jan 2025 07:19:55 +0100 Subject: [PATCH 5/5] reftable: prevent 'update_index' changes after first block The previous commit fixes a bug wherein the 'max_update_index' of the reftable writer was modified after the first block was written. This caused a mismatch between the header and the footer since both store the 'max_update_index'. While, the bug itself was fixed, let's prevent further issues by adding a safeguard to avoid such issues in the future. We do this by adding a check in `reftable_writer_set_limits()` to ensure that the 'update_index' is only modified before the first block is written. Signed-off-by: Karthik Nayak --- reftable/writer.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/reftable/writer.c b/reftable/writer.c index 740c98038ea..c602b873543 100644 --- a/reftable/writer.c +++ b/reftable/writer.c @@ -182,6 +182,13 @@ int reftable_writer_new(struct reftable_writer **out, void reftable_writer_set_limits(struct reftable_writer *w, uint64_t min, uint64_t max) { + /* + * The limits shouldn't be modified post writing the first block, else + * it would cause a mismatch between the header and the footer. + */ + if (w->next) + BUG("update index modified after writing first block"); + w->min_update_index = min; w->max_update_index = max; } -- GitLab