Writing a sqlite clone from scratch in C
Part 16 - Rebalancing the B-Tree After Deletion
Part 18 - A Page Cache and Buffer Pool
Up until now, select dumps every row in the table. That’s fine for debugging, but a real database lets you ask for specific rows. Time to add a WHERE clause.
We’ll support filtering on the primary key (id), with five operators: =, >, <, >=, and <=. This is where our B-tree starts to really shine – instead of scanning every row, we can jump directly to the one we want.
First, we need a way to represent the filter condition. We’ll add a WhereOp enum and two new fields to Statement:
+typedef enum {
+ WHERE_NONE,
+ WHERE_EQ,
+ WHERE_GT,
+ WHERE_LT,
+ WHERE_GTE,
+ WHERE_LTE,
+} WhereOp;
+
typedef struct {
StatementType type;
Row row_to_insert; // only used by insert statement
+ WhereOp where_op; // only used by select statement
+ uint32_t where_id; // only used by select statement
} Statement;
WHERE_NONE means “no filter” – a full table scan, just like before.
The syntax is select where id <op> <value>. We parse it by tokenizing after the select keyword:
- if (strcmp(input_buffer->buffer, "select") == 0) {
+ if (strncmp(input_buffer->buffer, "select", 6) == 0) {
statement->type = STATEMENT_SELECT;
+ statement->where_op = WHERE_NONE;
+
+ if (strlen(input_buffer->buffer) > 6) {
+ char* token = strtok(input_buffer->buffer, " "); // "select"
+ token = strtok(NULL, " "); // "where"
+ if (token == NULL || strcmp(token, "where") != 0) {
+ return PREPARE_SYNTAX_ERROR;
+ }
+ token = strtok(NULL, " "); // "id"
+ if (token == NULL || strcmp(token, "id") != 0) {
+ return PREPARE_SYNTAX_ERROR;
+ }
+ token = strtok(NULL, " "); // operator
+ if (token == NULL) {
+ return PREPARE_SYNTAX_ERROR;
+ }
+ if (strcmp(token, "=") == 0) {
+ statement->where_op = WHERE_EQ;
+ } else if (strcmp(token, ">") == 0) {
+ statement->where_op = WHERE_GT;
+ } else if (strcmp(token, "<") == 0) {
+ statement->where_op = WHERE_LT;
+ } else if (strcmp(token, ">=") == 0) {
+ statement->where_op = WHERE_GTE;
+ } else if (strcmp(token, "<=") == 0) {
+ statement->where_op = WHERE_LTE;
+ } else {
+ return PREPARE_SYNTAX_ERROR;
+ }
+ token = strtok(NULL, " "); // value
+ if (token == NULL) {
+ return PREPARE_SYNTAX_ERROR;
+ }
+ statement->where_id = (uint32_t)atoi(token);
+ }
+
return PREPARE_SUCCESS;
}
If there’s nothing after select, we keep WHERE_NONE and do a full scan as before. If there is, we expect the exact pattern where id <op> <value>.
Here’s where it gets interesting. We rewrite execute_select() to use a switch on the operator:
For equality, we use table_find() to jump directly to the key. This is an O(log n) lookup – the whole reason we built a B-tree:
+ case WHERE_EQ: {
+ cursor = table_find(table, statement->where_id);
+ void* node = get_page(table->pager, cursor->page_num);
+ uint32_t num_cells = *leaf_node_num_cells(node);
+ if (cursor->cell_num < num_cells) {
+ uint32_t key = *leaf_node_key(node, cursor->cell_num);
+ if (key == statement->where_id) {
+ deserialize_row(cursor_value(cursor), &row);
+ print_row(&row);
+ }
+ }
+ free(cursor);
+ return EXECUTE_SUCCESS;
+ }
table_find() returns the position where the key should be. We still have to verify it’s actually there, since the key might not exist.
For greater-than queries, we position the cursor at the first qualifying key and scan forward through the sibling chain:
+ case WHERE_GT:
+ case WHERE_GTE: {
+ uint32_t start_key = (statement->where_op == WHERE_GT)
+ ? statement->where_id + 1
+ : statement->where_id;
+ cursor = table_find(table, start_key);
+ while (!(cursor->end_of_table)) {
+ deserialize_row(cursor_value(cursor), &row);
+ print_row(&row);
+ cursor_advance(cursor);
+ }
+ free(cursor);
+ return EXECUTE_SUCCESS;
+ }
For WHERE id > 5, we search for key 6. table_find() positions us at the first key >= 6, and we scan to the end. The next_leaf pointers we added in Part 12 make this traversal seamless across leaf node boundaries.
For less-than, we start at the beginning and stop when we hit the boundary:
+ case WHERE_LT:
+ case WHERE_LTE: {
+ cursor = table_start(table);
+ uint32_t limit = statement->where_id;
+ while (!(cursor->end_of_table)) {
+ void* node = get_page(table->pager, cursor->page_num);
+ uint32_t key = *leaf_node_key(node, cursor->cell_num);
+ if (statement->where_op == WHERE_LT && key >= limit) break;
+ if (statement->where_op == WHERE_LTE && key > limit) break;
+ deserialize_row(cursor_value(cursor), &row);
+ print_row(&row);
+ cursor_advance(cursor);
+ }
+ free(cursor);
+ return EXECUTE_SUCCESS;
+ }
Because our keys are stored in sorted order, we can stop early the moment we see a key that’s too large. We don’t have to scan the whole table.
Let’s try some queries:
db > insert 1 user1 person1@example.com
Executed.
db > insert 2 user2 person2@example.com
Executed.
db > insert 3 user3 person3@example.com
Executed.
db > select where id = 2
(2, user2, person2@example.com)
Executed.
db > select where id > 1
(2, user2, person2@example.com)
(3, user3, person3@example.com)
Executed.
db > select where id < 3
(1, user1, person1@example.com)
(2, user2, person2@example.com)
Executed.
And the automated tests:
+ it 'selects a single row with where id =' do
+ script = (1..5).map do |i|
+ "insert #{i} user#{i} person#{i}@example.com"
+ end
+ script << "select where id = 3"
+ script << ".exit"
+ result = run_script(script)
+ expect(result).to include("(3, user3, person3@example.com)")
+ expect(result).not_to include("(1, user1, person1@example.com)")
+ expect(result).not_to include("(5, user5, person5@example.com)")
+ end
+
+ it 'selects rows with where id >' do
+ script = (1..5).map do |i|
+ "insert #{i} user#{i} person#{i}@example.com"
+ end
+ script << "select where id > 3"
+ script << ".exit"
+ result = run_script(script)
+ expect(result).to include("(4, user4, person4@example.com)")
+ expect(result).to include("(5, user5, person5@example.com)")
+ expect(result).not_to include("(3, user3, person3@example.com)")
+ end
+
+ it 'selects rows with where id <' do
+ script = (1..5).map do |i|
+ "insert #{i} user#{i} person#{i}@example.com"
+ end
+ script << "select where id < 3"
+ script << ".exit"
+ result = run_script(script)
+ expect(result).to include("(1, user1, person1@example.com)")
+ expect(result).to include("(2, user2, person2@example.com)")
+ expect(result).not_to include("(3, user3, person3@example.com)")
+ end
+
+ it 'returns nothing for where clause with no matches' do
+ script = [
+ "insert 1 user1 person1@example.com",
+ "select where id = 5",
+ ".exit",
+ ]
+ result = run_script(script)
+ expect(result).to match_array([
+ "db > Executed.",
+ "db > Executed.",
+ "db > ",
+ ])
+ end
Notice how the equality query uses table_find() – a single O(log n) B-tree traversal – rather than scanning every row. This is the payoff for all that B-tree work. A full table scan touches every row. A point query touches only the pages on the path from root to the target leaf. For a table with millions of rows, that’s the difference between milliseconds and minutes.
Next time we’ll overhaul our pager into a proper buffer pool with dirty page tracking and LRU eviction.