From a5aa44e7930761cb900813d971b4105f901818fb Mon Sep 17 00:00:00 2001 From: Karthik Nayak Date: Wed, 15 Jan 2025 11:54:51 +0000 Subject: [PATCH 1/5] reftable: write correct max_update_index to header In 297c09eabb (refs: allow multiple reflog entries for the same refname, 2024-12-16), the reftable backend learned to handle multiple reflog entries within the same transaction. This was done modifying the `update_index` for reflogs with multiple indices. During writing the logs, the `max_update_index` of the writer was modified to ensure the limits were raised to the modified `update_index`s. However, since ref entries are written before the modification to the `max_update_index`, if there are multiple blocks to be written, the reftable backend writes the header with the old `max_update_index`. When all logs are finally written, the footer will be written with the new `min_update_index`. This causes a mismatch between the header and the footer and causes the reftable file to be corrupted. The existing tests only spawn a single block and since headers are lazily written with the first block, the tests didn't capture this bug. To fix the issue, the appropriate `max_update_index` limit must be set even before the first block is written. Add a `max_index` field to the transaction which holds the `max_index` within all its updates, then propagate this value to the reftable backend, wherein this is used to the set the `max_update_index` correctly. Add a test which creates a few thousand reference updates with multiple reflog entries, which should trigger the bug. Reported-by: brian m. carlson Signed-off-by: Karthik Nayak Signed-off-by: Junio C Hamano --- refs.c | 7 +++++++ refs/refs-internal.h | 1 + refs/reftable-backend.c | 20 ++++++++++---------- t/t1460-refs-migrate.sh | 12 ++++++++++++ 4 files changed, 30 insertions(+), 10 deletions(-) diff --git a/refs.c b/refs.c index 0f41b2fd4a6..f7b6f0f897e 100644 --- a/refs.c +++ b/refs.c @@ -1345,6 +1345,13 @@ int ref_transaction_update_reflog(struct ref_transaction *transaction, update->flags &= ~REF_HAVE_OLD; update->index = index; + /* + * Reference backends may need to know the max index to optimize + * their writes. So we store the max_index on the transaction level. + */ + if (index > transaction->max_index) + transaction->max_index = index; + return 0; } diff --git a/refs/refs-internal.h b/refs/refs-internal.h index 16550862d3e..aaab711bb96 100644 --- a/refs/refs-internal.h +++ b/refs/refs-internal.h @@ -203,6 +203,7 @@ struct ref_transaction { enum ref_transaction_state state; void *backend_data; unsigned int flags; + unsigned int max_index; }; /* diff --git a/refs/reftable-backend.c b/refs/reftable-backend.c index 00d95a9a2f4..289496058e6 100644 --- a/refs/reftable-backend.c +++ b/refs/reftable-backend.c @@ -942,6 +942,7 @@ struct write_transaction_table_arg { size_t updates_nr; size_t updates_alloc; size_t updates_expected; + unsigned int max_index; }; struct reftable_transaction_data { @@ -1428,7 +1429,6 @@ static int write_transaction_table(struct reftable_writer *writer, void *cb_data struct reftable_log_record *logs = NULL; struct ident_split committer_ident = {0}; size_t logs_nr = 0, logs_alloc = 0, i; - uint64_t max_update_index = ts; const char *committer_info; int ret = 0; @@ -1438,7 +1438,12 @@ static int write_transaction_table(struct reftable_writer *writer, void *cb_data QSORT(arg->updates, arg->updates_nr, transaction_update_cmp); - reftable_writer_set_limits(writer, ts, ts); + /* + * During reflog migration, we add indexes for a single reflog with + * multiple entries. Each entry will contain a different update_index, + * so set the limits accordingly. + */ + reftable_writer_set_limits(writer, ts, ts + arg->max_index); for (i = 0; i < arg->updates_nr; i++) { struct reftable_transaction_update *tx_update = &arg->updates[i]; @@ -1540,12 +1545,6 @@ static int write_transaction_table(struct reftable_writer *writer, void *cb_data */ log->update_index = ts + u->index; - /* - * Note the max update_index so the limit can be set later on. - */ - if (log->update_index > max_update_index) - max_update_index = log->update_index; - log->refname = xstrdup(u->refname); memcpy(log->value.update.new_hash, u->new_oid.hash, GIT_MAX_RAWSZ); @@ -1609,8 +1608,6 @@ static int write_transaction_table(struct reftable_writer *writer, void *cb_data * and log blocks. */ if (logs) { - reftable_writer_set_limits(writer, ts, max_update_index); - ret = reftable_writer_add_logs(writer, logs, logs_nr); if (ret < 0) goto done; @@ -1631,6 +1628,9 @@ static int reftable_be_transaction_finish(struct ref_store *ref_store UNUSED, struct reftable_transaction_data *tx_data = transaction->backend_data; int ret = 0; + if (tx_data->args) + tx_data->args->max_index = transaction->max_index; + for (size_t i = 0; i < tx_data->args_nr; i++) { ret = reftable_addition_add(tx_data->args[i].addition, write_transaction_table, &tx_data->args[i]); diff --git a/t/t1460-refs-migrate.sh b/t/t1460-refs-migrate.sh index f59bc4860f1..307b2998efe 100755 --- a/t/t1460-refs-migrate.sh +++ b/t/t1460-refs-migrate.sh @@ -227,6 +227,18 @@ do done done +test_expect_success 'multiple reftable blocks with multiple entries' ' + test_when_finished "rm -rf repo" && + git init --ref-format=files repo && + test_commit -C repo first && + printf "create refs/heads/ref-%d HEAD\n" $(test_seq 5000) >stdin && + git -C repo update-ref --stdin stdin && + git -C repo update-ref --stdin Date: Fri, 17 Jan 2025 07:28:09 +0100 Subject: [PATCH 2/5] refs: small followups to the migration corruption fix This is a follow up to the bug that was reported [1] around `git refs migrate --ref-format=reftable` where the migration would fail for repositories with reflogs with lots of entries. This was caused due to a mismatch in the reftable's header and footer, specifically WRT the 'max_update_index'. While there was a fix posted. This series is a small followup to fix some of the topics discussed there: 1. To mark `ref_transaction_update_reflog()` as static since it is only used internally within 'refs.c'. 2. To change the type of 'max_index' from 'unsigned int' to 'uint64_t'. This would be much safer for large repositories with millions of files and on 32bit systems. 3. To add a safeguard to prevent 'update_index' changes post any record addition. This is a preventive measure to ensure such bugs don't arise in the future. This is based on top of master 757161efcc (Sync with Git 2.48.1, 2025-01-13) with 'kn/reflog-migration-fix' merged in. [1]: https://lore.kernel.org/r/Z4UbkcmJAU1MT-Rs@tapette.crustytoothpaste.net Signed-off-by: Karthik Nayak --- Changes in v3: - Commit 3: Modify the commit message to make it clearer that this is a preventive measure. Modify the condition to check for both `last_key` and `next`. Also add a unit test. - Link to v2: https://lore.kernel.org/r/20250121-461-corrupted-reftable-followup-v2-0-37e26c7a79b4@gmail.com Changes in v2: - Commit 1: Keep the function comments. - Commit 3: Modify the requirement to be for any record writes instead of the first block write. This ensures that the limits need to be set before any records being added. Instead of using `BUG()`, return `REFTABLE_API_ERROR`. Handle all callers as needed. - Link to v1: https://lore.kernel.org/r/20250117-461-corrupted-reftable-followup-v1-0-70ee605ae3fe@gmail.com --- b4-submit-tracking --- # This section is used internally by b4 prep for tracking purposes. { "series": { "revision": 3, "change-id": "20250117-461-corrupted-reftable-followup-eb0e4fd1a723", "prefixes": [], "history": { "v1": [ "20250117-461-corrupted-reftable-followup-v1-0-70ee605ae3fe@gmail.com" ], "v2": [ "20250121-461-corrupted-reftable-followup-v2-0-37e26c7a79b4@gmail.com" ] } } } -- GitLab From cafede9b1055e6c886af033693c67948379337a6 Mon Sep 17 00:00:00 2001 From: Karthik Nayak Date: Thu, 16 Jan 2025 07:37:20 +0100 Subject: [PATCH 3/5] refs: mark `ref_transaction_update_reflog()` as static The `ref_transaction_update_reflog()` function is only used within 'refs.c', so mark it as static. Reported-by: Junio C Hamano Signed-off-by: Karthik Nayak --- refs.c | 22 +++++++++++++++------- refs.h | 14 -------------- 2 files changed, 15 insertions(+), 21 deletions(-) diff --git a/refs.c b/refs.c index f7b6f0f897e..ad6d7747171 100644 --- a/refs.c +++ b/refs.c @@ -1318,13 +1318,21 @@ int ref_transaction_update(struct ref_transaction *transaction, return 0; } -int ref_transaction_update_reflog(struct ref_transaction *transaction, - const char *refname, - const struct object_id *new_oid, - const struct object_id *old_oid, - const char *committer_info, unsigned int flags, - const char *msg, unsigned int index, - struct strbuf *err) +/* + * Similar to`ref_transaction_update`, but this function is only for adding + * a reflog update. Supports providing custom committer information. The index + * field can be utiltized to order updates as desired. When not used, the + * updates default to being ordered by refname. + */ +static int ref_transaction_update_reflog(struct ref_transaction *transaction, + const char *refname, + const struct object_id *new_oid, + const struct object_id *old_oid, + const char *committer_info, + unsigned int flags, + const char *msg, + unsigned int index, + struct strbuf *err) { struct ref_update *update; diff --git a/refs.h b/refs.h index a0cdd99250e..09be47afbee 100644 --- a/refs.h +++ b/refs.h @@ -771,20 +771,6 @@ int ref_transaction_update(struct ref_transaction *transaction, unsigned int flags, const char *msg, struct strbuf *err); -/* - * Similar to`ref_transaction_update`, but this function is only for adding - * a reflog update. Supports providing custom committer information. The index - * field can be utiltized to order updates as desired. When not used, the - * updates default to being ordered by refname. - */ -int ref_transaction_update_reflog(struct ref_transaction *transaction, - const char *refname, - const struct object_id *new_oid, - const struct object_id *old_oid, - const char *committer_info, unsigned int flags, - const char *msg, unsigned int index, - struct strbuf *err); - /* * Add a reference creation to transaction. new_oid is the value that * the reference should have after the update; it must not be -- GitLab From 737360d88391febf6065dc19e6244bb588138b00 Mon Sep 17 00:00:00 2001 From: Karthik Nayak Date: Fri, 17 Jan 2025 07:39:44 +0100 Subject: [PATCH 4/5] refs: use 'uint64_t' for 'ref_update.index' The 'ref_update.index' variable is used to store an index for a given reference update. This index is used to order the updates in a predetermined order, while the default ordering is alphabetical as per the refname. For large repositories with millions of references, it should be safer to use 'uint64_t'. Let's do that. This also is applied for all other code sections where we store 'index' and pass it around. Reported-by: brian m. carlson Signed-off-by: Karthik Nayak --- refs.c | 4 ++-- refs/refs-internal.h | 4 ++-- refs/reftable-backend.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/refs.c b/refs.c index ad6d7747171..ef04f403a6a 100644 --- a/refs.c +++ b/refs.c @@ -1331,7 +1331,7 @@ static int ref_transaction_update_reflog(struct ref_transaction *transaction, const char *committer_info, unsigned int flags, const char *msg, - unsigned int index, + uint64_t index, struct strbuf *err) { struct ref_update *update; @@ -2813,7 +2813,7 @@ static int migrate_one_ref(const char *refname, const char *referent UNUSED, con } struct reflog_migration_data { - unsigned int index; + uint64_t index; const char *refname; struct ref_store *old_refs; struct ref_transaction *transaction; diff --git a/refs/refs-internal.h b/refs/refs-internal.h index aaab711bb96..8894b43d1d1 100644 --- a/refs/refs-internal.h +++ b/refs/refs-internal.h @@ -120,7 +120,7 @@ struct ref_update { * when migrating reflogs and we want to ensure we carry over the * same order. */ - unsigned int index; + uint64_t index; /* * If this ref_update was split off of a symref update via @@ -203,7 +203,7 @@ struct ref_transaction { enum ref_transaction_state state; void *backend_data; unsigned int flags; - unsigned int max_index; + uint64_t max_index; }; /* diff --git a/refs/reftable-backend.c b/refs/reftable-backend.c index 289496058e6..6814c87bc61 100644 --- a/refs/reftable-backend.c +++ b/refs/reftable-backend.c @@ -942,7 +942,7 @@ struct write_transaction_table_arg { size_t updates_nr; size_t updates_alloc; size_t updates_expected; - unsigned int max_index; + uint64_t max_index; }; struct reftable_transaction_data { -- GitLab From 01bf1d765f8a6c037850c3c8827f799ff4e62817 Mon Sep 17 00:00:00 2001 From: Karthik Nayak Date: Mon, 20 Jan 2025 13:32:35 +0100 Subject: [PATCH 5/5] reftable: prevent 'update_index' changes after adding records The function `reftable_writer_set_limits()` allows updating the 'min_update_index' and 'max_update_index' of a reftable writer. These values are written to both the writer's header and footer. Since the header is written during the first block write, any subsequent changes to the update index would create a mismatch between the header and footer values. The footer would contain the newer values while the header retained the original ones. To protect against this bug, prevent callers from updating these values after any record is written. To do this, modify the function to return an error whenever the limits are modified after any record adds. Check for record adds within `reftable_writer_set_limits()` by checking the `last_key` and `next` variable. The former is updated after each record added, but is reset at certain points. The latter is set after writing the first block. Modify all callers of the function to anticipate a return type and handle it accordingly. Add a unit test to also ensure the function returns the error as expected. Helped-by: Patrick Steinhardt Signed-off-by: Karthik Nayak --- refs/reftable-backend.c | 20 +++++++++--- reftable/reftable-error.h | 1 + reftable/reftable-writer.h | 24 +++++++++------ reftable/stack.c | 6 ++-- reftable/writer.c | 15 ++++++++- t/unit-tests/t-reftable-stack.c | 54 +++++++++++++++++++++++++++++++-- 6 files changed, 99 insertions(+), 21 deletions(-) diff --git a/refs/reftable-backend.c b/refs/reftable-backend.c index 6814c87bc61..9cfb0cb2672 100644 --- a/refs/reftable-backend.c +++ b/refs/reftable-backend.c @@ -1443,7 +1443,9 @@ static int write_transaction_table(struct reftable_writer *writer, void *cb_data * multiple entries. Each entry will contain a different update_index, * so set the limits accordingly. */ - reftable_writer_set_limits(writer, ts, ts + arg->max_index); + ret = reftable_writer_set_limits(writer, ts, ts + arg->max_index); + if (ret < 0) + goto done; for (i = 0; i < arg->updates_nr; i++) { struct reftable_transaction_update *tx_update = &arg->updates[i]; @@ -1766,7 +1768,9 @@ static int write_copy_table(struct reftable_writer *writer, void *cb_data) deletion_ts = creation_ts = reftable_stack_next_update_index(arg->be->stack); if (arg->delete_old) creation_ts++; - reftable_writer_set_limits(writer, deletion_ts, creation_ts); + ret = reftable_writer_set_limits(writer, deletion_ts, creation_ts); + if (ret < 0) + goto done; /* * Add the new reference. If this is a rename then we also delete the @@ -2298,7 +2302,9 @@ static int write_reflog_existence_table(struct reftable_writer *writer, if (ret <= 0) goto done; - reftable_writer_set_limits(writer, ts, ts); + ret = reftable_writer_set_limits(writer, ts, ts); + if (ret < 0) + goto done; /* * The existence entry has both old and new object ID set to the @@ -2357,7 +2363,9 @@ static int write_reflog_delete_table(struct reftable_writer *writer, void *cb_da uint64_t ts = reftable_stack_next_update_index(arg->stack); int ret; - reftable_writer_set_limits(writer, ts, ts); + ret = reftable_writer_set_limits(writer, ts, ts); + if (ret < 0) + goto out; ret = reftable_stack_init_log_iterator(arg->stack, &it); if (ret < 0) @@ -2434,7 +2442,9 @@ static int write_reflog_expiry_table(struct reftable_writer *writer, void *cb_da if (arg->records[i].value_type == REFTABLE_LOG_UPDATE) live_records++; - reftable_writer_set_limits(writer, ts, ts); + ret = reftable_writer_set_limits(writer, ts, ts); + if (ret < 0) + return ret; if (!is_null_oid(&arg->update_oid)) { struct reftable_ref_record ref = {0}; diff --git a/reftable/reftable-error.h b/reftable/reftable-error.h index f4048265629..a7e33d964d0 100644 --- a/reftable/reftable-error.h +++ b/reftable/reftable-error.h @@ -30,6 +30,7 @@ enum reftable_error { /* Misuse of the API: * - on writing a record with NULL refname. + * - on writing a record before setting the writer limits. * - on writing a reftable_ref_record outside the table limits * - on writing a ref or log record before the stack's * next_update_inde*x diff --git a/reftable/reftable-writer.h b/reftable/reftable-writer.h index 5f9afa620bb..1ea014d389c 100644 --- a/reftable/reftable-writer.h +++ b/reftable/reftable-writer.h @@ -124,17 +124,21 @@ int reftable_writer_new(struct reftable_writer **out, int (*flush_func)(void *), void *writer_arg, const struct reftable_write_options *opts); -/* Set the range of update indices for the records we will add. When writing a - table into a stack, the min should be at least - reftable_stack_next_update_index(), or REFTABLE_API_ERROR is returned. - - For transactional updates to a stack, typically min==max, and the - update_index can be obtained by inspeciting the stack. When converting an - existing ref database into a single reftable, this would be a range of - update-index timestamps. +/* + * Set the range of update indices for the records we will add. When writing a + * table into a stack, the min should be at least + * reftable_stack_next_update_index(), or REFTABLE_API_ERROR is returned. + * + * For transactional updates to a stack, typically min==max, and the + * update_index can be obtained by inspeciting the stack. When converting an + * existing ref database into a single reftable, this would be a range of + * update-index timestamps. + * + * The function should be called before adding any records to the writer. If not + * it will fail with REFTABLE_API_ERROR. */ -void reftable_writer_set_limits(struct reftable_writer *w, uint64_t min, - uint64_t max); +int reftable_writer_set_limits(struct reftable_writer *w, uint64_t min, + uint64_t max); /* Add a reftable_ref_record. The record should have names that come after diff --git a/reftable/stack.c b/reftable/stack.c index 531660a49f0..9649dbbb04c 100644 --- a/reftable/stack.c +++ b/reftable/stack.c @@ -1058,8 +1058,10 @@ static int stack_write_compact(struct reftable_stack *st, for (size_t i = first; i <= last; i++) st->stats.bytes += st->readers[i]->size; - reftable_writer_set_limits(wr, st->readers[first]->min_update_index, - st->readers[last]->max_update_index); + err = reftable_writer_set_limits(wr, st->readers[first]->min_update_index, + st->readers[last]->max_update_index); + if (err < 0) + goto done; err = reftable_merged_table_new(&mt, st->readers + first, subtabs_len, st->opts.hash_id); diff --git a/reftable/writer.c b/reftable/writer.c index 740c98038ea..76e24018172 100644 --- a/reftable/writer.c +++ b/reftable/writer.c @@ -179,11 +179,24 @@ int reftable_writer_new(struct reftable_writer **out, return 0; } -void reftable_writer_set_limits(struct reftable_writer *w, uint64_t min, +int reftable_writer_set_limits(struct reftable_writer *w, uint64_t min, uint64_t max) { + /* + * Set the min/max update index limits for the reftable writer. + * This must be called before adding any records, since: + * - The 'next' field gets set after writing the first block. + * - The 'last_key' field updates with each new record (but resets + * after sections). + * Returns REFTABLE_API_ERROR if called after writing has begun. + */ + if (w->next || w->last_key.len) + return REFTABLE_API_ERROR; + w->min_update_index = min; w->max_update_index = max; + + return 0; } static void writer_release(struct reftable_writer *w) diff --git a/t/unit-tests/t-reftable-stack.c b/t/unit-tests/t-reftable-stack.c index aeec195b2b1..c3f0059c346 100644 --- a/t/unit-tests/t-reftable-stack.c +++ b/t/unit-tests/t-reftable-stack.c @@ -103,7 +103,8 @@ static void t_read_file(void) static int write_test_ref(struct reftable_writer *wr, void *arg) { struct reftable_ref_record *ref = arg; - reftable_writer_set_limits(wr, ref->update_index, ref->update_index); + check(!reftable_writer_set_limits(wr, ref->update_index, + ref->update_index)); return reftable_writer_add_ref(wr, ref); } @@ -143,7 +144,8 @@ static int write_test_log(struct reftable_writer *wr, void *arg) { struct write_log_arg *wla = arg; - reftable_writer_set_limits(wr, wla->update_index, wla->update_index); + check(!reftable_writer_set_limits(wr, wla->update_index, + wla->update_index)); return reftable_writer_add_log(wr, wla->log); } @@ -961,7 +963,7 @@ static void t_reflog_expire(void) static int write_nothing(struct reftable_writer *wr, void *arg UNUSED) { - reftable_writer_set_limits(wr, 1, 1); + check(!reftable_writer_set_limits(wr, 1, 1)); return 0; } @@ -1369,11 +1371,57 @@ static void t_reftable_stack_reload_with_missing_table(void) clear_dir(dir); } +static int write_limits_after_ref(struct reftable_writer *wr, void *arg) +{ + struct reftable_ref_record *ref = arg; + check(!reftable_writer_set_limits(wr, ref->update_index, ref->update_index)); + check(!reftable_writer_add_ref(wr, ref)); + return reftable_writer_set_limits(wr, ref->update_index, ref->update_index); +} + +static void t_reftable_invalid_limit_updates(void) +{ + struct reftable_ref_record ref = { + .refname = (char *) "HEAD", + .update_index = 1, + .value_type = REFTABLE_REF_SYMREF, + .value.symref = (char *) "master", + }; + struct reftable_write_options opts = { + .default_permissions = 0660, + }; + struct reftable_addition *add = NULL; + char *dir = get_tmp_dir(__LINE__); + struct reftable_stack *st = NULL; + int err; + + err = reftable_new_stack(&st, dir, &opts); + check(!err); + + reftable_addition_destroy(add); + + err = reftable_stack_new_addition(&add, st, 0); + check(!err); + + /* + * write_limits_after_ref also updates the update indexes after adding + * the record. This should cause an err to be returned, since the limits + * must be set at the start. + */ + err = reftable_addition_add(add, write_limits_after_ref, &ref); + check_int(err, ==, REFTABLE_API_ERROR); + + reftable_addition_destroy(add); + reftable_stack_destroy(st); + clear_dir(dir); +} + int cmd_main(int argc UNUSED, const char *argv[] UNUSED) { TEST(t_empty_add(), "empty addition to stack"); TEST(t_read_file(), "read_lines works"); TEST(t_reflog_expire(), "expire reflog entries"); + TEST(t_reftable_invalid_limit_updates(), "prevent limit updates after adding records"); TEST(t_reftable_stack_add(), "add multiple refs and logs to stack"); TEST(t_reftable_stack_add_one(), "add a single ref record to stack"); TEST(t_reftable_stack_add_performs_auto_compaction(), "addition to stack triggers auto-compaction"); -- GitLab