From d55fc5128b26a64c2e7b6612d0442c9e924696e8 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Thu, 1 Feb 2024 08:51:56 +0100 Subject: [PATCH 1/5] reftable/reader: be more careful about errors in indexed seeks When doing an indexed seek we first need to do a linear seek in order to find the index block for our wanted key. We do not check the returned error of the linear seek though. This is likely not an issue because the next call to `table_iter_next()` would return error, too. But it very much is a code smell when an error variable is being assigned to without actually checking it. Safeguard the code by checking for errors. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- reftable/reader.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/reftable/reader.c b/reftable/reader.c index 64dc366fb1..278f727a3d 100644 --- a/reftable/reader.c +++ b/reftable/reader.c @@ -509,6 +509,9 @@ static int reader_seek_indexed(struct reftable_reader *r, goto done; err = reader_seek_linear(&index_iter, &want_index); + if (err < 0) + goto done; + while (1) { err = table_iter_next(&index_iter, &index_result); table_iter_block_done(&index_iter); From 9ebb2d7b08c9f17a846b7c90082c9d15b9f6c9d2 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Thu, 1 Feb 2024 08:52:00 +0100 Subject: [PATCH 2/5] reftable/writer: use correct type to iterate through index entries The reftable writer is tracking the number of blocks it has to index via the `index_len` variable. But while this variable is of type `size_t`, some sites use an `int` to loop through the index entries. Convert the code to consistently use `size_t`. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- reftable/writer.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/reftable/writer.c b/reftable/writer.c index 92935baa70..5a0b87b406 100644 --- a/reftable/writer.c +++ b/reftable/writer.c @@ -379,20 +379,21 @@ int reftable_writer_add_logs(struct reftable_writer *w, static int writer_finish_section(struct reftable_writer *w) { + struct reftable_block_stats *bstats = NULL; uint8_t typ = block_writer_type(w->block_writer); uint64_t index_start = 0; int max_level = 0; - int threshold = w->opts.unpadded ? 1 : 3; + size_t threshold = w->opts.unpadded ? 1 : 3; int before_blocks = w->stats.idx_stats.blocks; - int err = writer_flush_block(w); - int i = 0; - struct reftable_block_stats *bstats = NULL; + int err; + + err = writer_flush_block(w); if (err < 0) return err; while (w->index_len > threshold) { struct reftable_index_record *idx = NULL; - int idx_len = 0; + size_t i, idx_len; max_level++; index_start = w->next; @@ -630,11 +631,8 @@ done: static void writer_clear_index(struct reftable_writer *w) { - int i = 0; - for (i = 0; i < w->index_len; i++) { + for (size_t i = 0; i < w->index_len; i++) strbuf_release(&w->index[i].last_key); - } - FREE_AND_NULL(w->index); w->index_len = 0; w->index_cap = 0; From b66e006ff534c72c70132105fff06601aa60a625 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Thu, 1 Feb 2024 08:52:04 +0100 Subject: [PATCH 3/5] reftable/writer: simplify writing index records When finishing the current section some index records might be written for the section to the table. The logic that adds these records to the writer duplicates what we already have in `writer_add_record()`, making this more complicated than it really has to be. Simplify the code by using `writer_add_record()` instead. While at it, drop the unneeded braces around a loop to make the code conform to our code style better. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- reftable/writer.c | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/reftable/writer.c b/reftable/writer.c index 5a0b87b406..b5bcce0292 100644 --- a/reftable/writer.c +++ b/reftable/writer.c @@ -412,26 +412,14 @@ static int writer_finish_section(struct reftable_writer *w) .idx = idx[i], }, }; - if (block_writer_add(w->block_writer, &rec) == 0) { - continue; - } - err = writer_flush_block(w); + err = writer_add_record(w, &rec); if (err < 0) return err; - - writer_reinit_block_writer(w, BLOCK_TYPE_INDEX); - - err = block_writer_add(w->block_writer, &rec); - if (err != 0) { - /* write into fresh block should always succeed - */ - abort(); - } } - for (i = 0; i < idx_len; i++) { + + for (i = 0; i < idx_len; i++) strbuf_release(&idx[i].last_key); - } reftable_free(idx); } From e7485601ca4da6a09c6488add181609cffec5799 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Thu, 1 Feb 2024 08:52:08 +0100 Subject: [PATCH 4/5] reftable/writer: fix writing multi-level indices When finishing a section we will potentially write an index that makes it more efficient to look up relevant blocks. The index records written will encode, for each block of the indexed section, what the offset of that block is as well as the last key of that block. Thus, the reader would iterate through the index records to find the first key larger or equal to the wanted key and then use the encoded offset to look up the desired block. When there are a lot of blocks to index though we may end up writing multiple index blocks, too. To not require a linear search across all index blocks we instead end up writing a multi-level index. Instead of referring to the block we are after, an index record may point to another index block. The reader will then access the highest-level index and follow down the chain of index blocks until it hits the sought-after block. It has been observed though that it is impossible to seek ref records of the last ref block when using a multi-level index. While the multi-level index exists and looks fine for most of the part, the highest-level index was missing an index record pointing to the last block of the next index. Thus, every additional level made more refs become unseekable at the end of the ref section. The root cause is that we are not flushing the last block of the current level once done writing the level. Consequently, it wasn't recorded in the blocks that need to be indexed by the next-higher level and thus we forgot about it. Fix this bug by flushing blocks after we have written all index records. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- reftable/readwrite_test.c | 56 +++++++++++++++++++++++++++++++++++++++ reftable/writer.c | 8 +++--- 2 files changed, 60 insertions(+), 4 deletions(-) diff --git a/reftable/readwrite_test.c b/reftable/readwrite_test.c index 6b99daeaf2..853923397e 100644 --- a/reftable/readwrite_test.c +++ b/reftable/readwrite_test.c @@ -866,6 +866,61 @@ static void test_write_multiple_indices(void) strbuf_release(&buf); } +static void test_write_multi_level_index(void) +{ + struct reftable_write_options opts = { + .block_size = 100, + }; + struct strbuf writer_buf = STRBUF_INIT, buf = STRBUF_INIT; + struct reftable_block_source source = { 0 }; + struct reftable_iterator it = { 0 }; + const struct reftable_stats *stats; + struct reftable_writer *writer; + struct reftable_reader *reader; + int err; + + writer = reftable_new_writer(&strbuf_add_void, &noop_flush, &writer_buf, &opts); + reftable_writer_set_limits(writer, 1, 1); + for (size_t i = 0; i < 200; i++) { + struct reftable_ref_record ref = { + .update_index = 1, + .value_type = REFTABLE_REF_VAL1, + .value.val1 = {i}, + }; + + strbuf_reset(&buf); + strbuf_addf(&buf, "refs/heads/%03" PRIuMAX, (uintmax_t)i); + ref.refname = buf.buf, + + err = reftable_writer_add_ref(writer, &ref); + EXPECT_ERR(err); + } + reftable_writer_close(writer); + + /* + * The written refs should be sufficiently large to result in a + * multi-level index. + */ + stats = reftable_writer_stats(writer); + EXPECT(stats->ref_stats.max_index_level == 2); + + block_source_from_strbuf(&source, &writer_buf); + err = reftable_new_reader(&reader, &source, "filename"); + EXPECT_ERR(err); + + /* + * Seeking the last ref should work as expected. + */ + err = reftable_reader_seek_ref(reader, &it, "refs/heads/199"); + EXPECT_ERR(err); + + reftable_iterator_destroy(&it); + reftable_writer_free(writer); + reftable_reader_free(reader); + strbuf_release(&writer_buf); + strbuf_release(&buf); +} + static void test_corrupt_table_empty(void) { struct strbuf buf = STRBUF_INIT; @@ -916,5 +971,6 @@ int readwrite_test_main(int argc, const char *argv[]) RUN_TEST(test_write_object_id_length); RUN_TEST(test_write_object_id_min_length); RUN_TEST(test_write_multiple_indices); + RUN_TEST(test_write_multi_level_index); return 0; } diff --git a/reftable/writer.c b/reftable/writer.c index b5bcce0292..0349094d29 100644 --- a/reftable/writer.c +++ b/reftable/writer.c @@ -418,15 +418,15 @@ static int writer_finish_section(struct reftable_writer *w) return err; } + err = writer_flush_block(w); + if (err < 0) + return err; + for (i = 0; i < idx_len; i++) strbuf_release(&idx[i].last_key); reftable_free(idx); } - err = writer_flush_block(w); - if (err < 0) - return err; - writer_clear_index(w); bstats = writer_reftable_block_stats(w, typ); From 4950acae7d0db40c327003eff2621aaa2172322c Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Thu, 1 Feb 2024 08:52:12 +0100 Subject: [PATCH 5/5] reftable: document reading and writing indices The way the index gets written and read is not trivial at all and requires the reader to piece together a bunch of parts to figure out how it works. Add some documentation to hopefully make this easier to understand for the next reader. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- reftable/reader.c | 27 +++++++++++++++++++++++++++ reftable/writer.c | 23 +++++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/reftable/reader.c b/reftable/reader.c index 278f727a3d..6011d0aa04 100644 --- a/reftable/reader.c +++ b/reftable/reader.c @@ -508,11 +508,38 @@ static int reader_seek_indexed(struct reftable_reader *r, if (err < 0) goto done; + /* + * The index may consist of multiple levels, where each level may have + * multiple index blocks. We start by doing a linear search in the + * highest layer that identifies the relevant index block as well as + * the record inside that block that corresponds to our wanted key. + */ err = reader_seek_linear(&index_iter, &want_index); if (err < 0) goto done; + /* + * Traverse down the levels until we find a non-index entry. + */ while (1) { + /* + * In case we seek a record that does not exist the index iter + * will tell us that the iterator is over. This works because + * the last index entry of the current level will contain the + * last key it knows about. So in case our seeked key is larger + * than the last indexed key we know that it won't exist. + * + * There is one subtlety in the layout of the index section + * that makes this work as expected: the highest-level index is + * at end of the section and will point backwards and thus we + * start reading from the end of the index section, not the + * beginning. + * + * If that wasn't the case and the order was reversed then the + * linear seek would seek into the lower levels and traverse + * all levels of the index only to find out that the key does + * not exist. + */ err = table_iter_next(&index_iter, &index_result); table_iter_block_done(&index_iter); if (err != 0) diff --git a/reftable/writer.c b/reftable/writer.c index 0349094d29..e23953e498 100644 --- a/reftable/writer.c +++ b/reftable/writer.c @@ -391,6 +391,24 @@ static int writer_finish_section(struct reftable_writer *w) if (err < 0) return err; + /* + * When the section we are about to index has a lot of blocks then the + * index itself may span across multiple blocks, as well. This would + * require a linear scan over index blocks only to find the desired + * indexed block, which is inefficient. Instead, we write a multi-level + * index where index records of level N+1 will refer to index blocks of + * level N. This isn't constant time, either, but at least logarithmic. + * + * This loop handles writing this multi-level index. Note that we write + * the lowest-level index pointing to the indexed blocks first. We then + * continue writing additional index levels until the current level has + * less blocks than the threshold so that the highest level will be at + * the end of the index section. + * + * Readers are thus required to start reading the index section from + * its end, which is why we set `index_start` to the beginning of the + * last index section. + */ while (w->index_len > threshold) { struct reftable_index_record *idx = NULL; size_t i, idx_len; @@ -427,6 +445,11 @@ static int writer_finish_section(struct reftable_writer *w) reftable_free(idx); } + /* + * The index may still contain a number of index blocks lower than the + * threshold. Clear it so that these entries don't leak into the next + * index section. + */ writer_clear_index(w); bstats = writer_reftable_block_stats(w, typ);