# Block level access lists
TL;DR: It may be possible to speed up the IO part of block verification by 40-50x by adding access lists to blocks.
## Motivation
1. Full statelessness will take longer to implement
2. Statelessness using verkle or a binary Merkle tree will require a transition
3. Bandwidth is currently a bigger concern than execution
But there may be a simpler alternative in the form of block-level access lists. An access list is significantly smaller than a full state witness (estimated ~3x smaller than a verkle witness and ~30x smaller than an MPT witness), so it saves bandwidth, while still allowing much faster verification by massively reducing IO overhead.
## More details
* We add to the block a list of all addresses that were accessed by any transaction in it
* The list is verified during execution: if an accessed address is missing from the list, or the list contains addresses that were never accessed, the block is invalid
* When verifying a block, a node first fetches the state for all addresses in the access list (in parallel) and then executes all transactions; see the sketch after this list
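To make this concrete, here is a minimal sketch of the verification flow in C (the language of the benchmark further down). The state database, the execution step, and all names (`state_entry_t`, `state_db_read`, `execute_transactions`) are hypothetical stand-ins for client internals; only the prefetch-in-parallel-then-execute structure is the point.

```c
#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

#define ADDR_LEN 20
#define MAX_ACCESSES 4096

typedef struct {
    unsigned char addr[ADDR_LEN];
    unsigned char payload[128]; /* account data, code hash, ... (placeholder) */
    bool loaded;
} state_entry_t;

/* Hypothetical stand-in for one random read from the state database. */
static void state_db_read(state_entry_t *entry) {
    memset(entry->payload, 0, sizeof entry->payload);
    entry->loaded = true;
}

static void *prefetch_worker(void *arg) {
    state_db_read((state_entry_t *)arg);
    return NULL;
}

/* Verify a block: warm the cache by reading every entry named in the
 * access list concurrently, then execute the transactions. Execution
 * must touch exactly the listed addresses, otherwise the block is invalid. */
bool verify_block(state_entry_t *access_list, size_t n) {
    pthread_t threads[MAX_ACCESSES];
    if (n > MAX_ACCESSES)
        return false;

    /* 1. Issue all state reads in parallel -- this is where the 40-50x
     *    IO speedup measured in the benchmark below comes from. */
    for (size_t i = 0; i < n; i++)
        pthread_create(&threads[i], NULL, prefetch_worker, &access_list[i]);
    for (size_t i = 0; i < n; i++)
        pthread_join(threads[i], NULL);

    /* 2. Execute all transactions against the warmed cache and check that
     *    the set of touched addresses equals the access list.
     *    (execute_transactions() would be a real client's EVM loop.) */
    /* return execute_transactions(access_list, n); */
    return true;
}

int main(void) {
    state_entry_t list[2] = {0};
    printf("block valid: %s\n", verify_block(list, 2) ? "yes" : "no");
    return 0;
}
```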
Properties:
* This adds essentially no overhead for the block producer, as they only need to record the state they are accessing anyway (see the sketch after this list)
* Even self-builders get the benefit if they keep the state for all mempool transactions in a cache while building
* Fetching all the state in parallel is 40-50x faster than fetching it serially, so block verification could become significantly cheaper
* Even HDDs benefit from this, as they can reorder reads according to the current disk/head position; it might even be possible to run an EL client on an HDD again, though this is probably no longer relevant as SSDs have become sufficiently cheap
* Maybe SATA SSDs and external USB drives become more viable?
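A sketch of what producer-side recording could look like: every state access during execution goes through a hook that records the address, and the deduplicated set becomes the block's access list. The hook name and data structure are assumptions, and a real client would use a hash set rather than a linear scan; this only illustrates why the overhead is negligible.

```c
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

#define ADDR_LEN 20
#define MAX_ACCESSES 4096

typedef struct {
    unsigned char addrs[MAX_ACCESSES][ADDR_LEN];
    size_t count;
} access_list_t;

/* Called from the (hypothetical) EVM state-access hook for every account
 * or storage read/write; appends the address if it has not been seen yet. */
bool record_access(access_list_t *list, const unsigned char addr[ADDR_LEN]) {
    for (size_t i = 0; i < list->count; i++)
        if (memcmp(list->addrs[i], addr, ADDR_LEN) == 0)
            return true;              /* already recorded */
    if (list->count == MAX_ACCESSES)
        return false;                 /* list full */
    memcpy(list->addrs[list->count++], addr, ADDR_LEN);
    return true;
}

int main(void) {
    access_list_t list = {0};
    unsigned char a[ADDR_LEN] = {1}, b[ADDR_LEN] = {2};
    record_access(&list, a);
    record_access(&list, a);          /* duplicate, recorded only once */
    record_access(&list, b);
    printf("recorded %zu unique addresses\n", list.count);
    return 0;
}
```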
## Next steps
It would be great to get real-world performance data on this hypothesis by modifying a client to implement it on actual Ethereum state.
## Possible improvements
* Adding information about which transactions access which state elements is easy, and would allow parallelizing block verification (see the sketch after this list).
* Adding the state content rather than just the addresses would allow transaction execution to be completely independent of state fetching. However, this is currently not practical because, pre-verkle, transactions can cheaply touch very large amounts of state via contract code.
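A minimal sketch of what the first improvement could enable, assuming per-transaction access lists are available. `execute_tx` and all data structures are illustrative placeholders, and a real scheduler would be more sophisticated; the point is that transactions whose access lists do not overlap can run concurrently while dependent transactions keep their block order.

```c
#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

#define ADDR_LEN 20
#define MAX_TX 512
#define MAX_TX_ACCESSES 256

typedef struct {
    unsigned char addrs[MAX_TX_ACCESSES][ADDR_LEN];
    size_t count;
} tx_access_list_t;

/* Two transactions conflict if their access lists share any address. */
static bool conflicts(const tx_access_list_t *a, const tx_access_list_t *b) {
    for (size_t i = 0; i < a->count; i++)
        for (size_t j = 0; j < b->count; j++)
            if (memcmp(a->addrs[i], b->addrs[j], ADDR_LEN) == 0)
                return true;
    return false;
}

/* Hypothetical stand-in for executing one transaction. */
static void *execute_tx(void *arg) {
    (void)arg;
    return NULL;
}

/* Greedy scheduler: in each round, run every pending transaction that does
 * not touch state shared with an earlier, still-pending transaction. */
void execute_block_parallel(tx_access_list_t *txs, size_t n) {
    bool done[MAX_TX] = {false};
    size_t finished = 0;
    while (finished < n) {
        pthread_t threads[MAX_TX];
        size_t batch[MAX_TX], batch_size = 0;
        for (size_t i = 0; i < n; i++) {
            if (done[i]) continue;
            bool ready = true;
            for (size_t k = 0; k < i && ready; k++)
                if (!done[k] && conflicts(&txs[k], &txs[i]))
                    ready = false;    /* must wait for an earlier tx */
            if (ready) batch[batch_size++] = i;
        }
        /* Execute the whole batch concurrently, then mark it done. */
        for (size_t j = 0; j < batch_size; j++)
            pthread_create(&threads[j], NULL, execute_tx, &txs[batch[j]]);
        for (size_t j = 0; j < batch_size; j++) {
            pthread_join(threads[j], NULL);
            done[batch[j]] = true;
        }
        finished += batch_size;
    }
}

int main(void) {
    /* Three toy transactions: 0 and 1 share an address, 2 is independent. */
    tx_access_list_t txs[3] = {0};
    txs[0].count = txs[1].count = txs[2].count = 1;
    txs[0].addrs[0][0] = 0xAA;
    txs[1].addrs[0][0] = 0xAA;        /* conflicts with tx 0 */
    txs[2].addrs[0][0] = 0xBB;        /* independent */
    execute_block_parallel(txs, 3);
    printf("executed 3 transactions in dependency order\n");
    return 0;
}
```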
## Validation
The C program below creates a large file (128 GB), accesses it with O_DIRECT to bypass the page cache, and compares random serial read performance to parallel random reads.
On the two machines tested (both with NVMe SSDs), parallel random reads were about 40-50x faster than serial reads.
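The program only depends on pthreads and can be built with something like `gcc -O2 -pthread bench.c -o bench` (filename and flags are illustrative); note that `O_DIRECT` requires `_GNU_SOURCE`, defined at the top of the file.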
```c
#define _GNU_SOURCE // needed for O_DIRECT on Linux/glibc
#define _FILE_OFFSET_BITS 64
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h> // for uint64_t
#include <fcntl.h> // for open()
#include <unistd.h> // for lseek(), read(), write()
#include <time.h> // for clock_gettime()
#include <sys/types.h> // for off_t
#include <sys/stat.h> // for open()
#include <errno.h>
#include <malloc.h> // for posix_memalign
#include <string.h> // for memset
#include <pthread.h> // for threading
#define FILE_NAME "largefile.dat"
#define FILE_SIZE ((uint64_t) 128 * 1024 * 1024 * 1024) // 128 GB
#define NUM_READS 1000000
#define BLOCK_SIZE 4096 // File system block size (commonly 4096 bytes)
#define BUFFER_SIZE (BLOCK_SIZE * 256) // 1 MB buffer for writing
#define READ_SIZE BLOCK_SIZE // Read size must be multiple of BLOCK_SIZE
#define NUM_THREADS 400
#define READS_PER_THREAD (NUM_READS / NUM_THREADS)
uint64_t rand_u64(FILE *fp) {
uint64_t r;
if (fread(&r, sizeof(r), 1, fp) != 1) {
perror("Failed to read random data");
fclose(fp);
exit(1);
}
return r;
}
typedef struct {
int thread_id;
uint64_t file_size;
FILE *urandom;
double thread_time;
} thread_data_t;
void *thread_read(void *arg) {
thread_data_t *data = (thread_data_t *)arg;
int fd;
ssize_t bytes_read;
void *read_buffer;
struct timespec start_time, end_time;
int i;
// Open the file with O_DIRECT
fd = open(FILE_NAME, O_RDONLY | O_DIRECT);
if (fd < 0) {
perror("Failed to open file in thread");
pthread_exit(NULL);
}
// Allocate aligned read buffer
if (posix_memalign(&read_buffer, BLOCK_SIZE, READ_SIZE) != 0) {
perror("Failed to allocate aligned read buffer in thread");
close(fd);
pthread_exit(NULL);
}
// Start timing
if (clock_gettime(CLOCK_MONOTONIC, &start_time) != 0) {
perror("Failed to get start time in thread");
free(read_buffer);
close(fd);
pthread_exit(NULL);
}
for (i = 0; i < READS_PER_THREAD; i++) {
uint64_t offset = rand_u64(data->urandom) % (data->file_size - READ_SIZE);
// Align offset to BLOCK_SIZE
offset = offset & ~(BLOCK_SIZE - 1);
if (lseek(fd, (off_t)offset, SEEK_SET) == (off_t)-1) {
perror("Failed to seek in thread");
free(read_buffer);
close(fd);
pthread_exit(NULL);
}
bytes_read = read(fd, read_buffer, READ_SIZE);
if (bytes_read != READ_SIZE) {
perror("Failed to read in thread");
free(read_buffer);
close(fd);
pthread_exit(NULL);
}
}
// End timing
if (clock_gettime(CLOCK_MONOTONIC, &end_time) != 0) {
perror("Failed to get end time in thread");
free(read_buffer);
close(fd);
pthread_exit(NULL);
}
data->thread_time = (end_time.tv_sec - start_time.tv_sec) +
(end_time.tv_nsec - start_time.tv_nsec) / 1e9;
free(read_buffer);
close(fd);
pthread_exit(NULL);
}
int main() {
int fd;
uint64_t file_size = FILE_SIZE;
ssize_t bytes_written;
void *write_buffer;
void *read_buffer;
struct timespec start_time, end_time;
double total_time_serial, total_time_parallel;
int i;
FILE *urandom;
// Open /dev/urandom for random numbers
urandom = fopen("/dev/urandom", "rb");
if (urandom == NULL) {
perror("Failed to open /dev/urandom");
return 1;
}
// Create the large file and disable caching using O_DIRECT
fd = open(FILE_NAME, O_RDWR | O_CREAT | O_DIRECT, S_IRUSR | S_IWUSR);
if (fd < 0) {
perror("Failed to open file");
fclose(urandom);
return 1;
}
// Set the file size to 128 GB
if (ftruncate(fd, (off_t)file_size) != 0) {
perror("Failed to set file size");
close(fd);
fclose(urandom);
return 1;
}
// Allocate aligned write buffer
if (posix_memalign(&write_buffer, BLOCK_SIZE, BUFFER_SIZE) != 0) {
perror("Failed to allocate aligned write buffer");
close(fd);
fclose(urandom);
return 1;
}
// Write random data to the file
printf("Writing random data to the file...\n");
uint64_t total_written = 0;
while (total_written < file_size) {
size_t to_write = BUFFER_SIZE;
if (file_size - total_written < BUFFER_SIZE)
to_write = file_size - total_written;
// Read random data into write_buffer
if (fread(write_buffer, 1, to_write, urandom) != to_write) {
perror("Failed to read random data");
free(write_buffer);
close(fd);
fclose(urandom);
return 1;
}
// Write data to file
bytes_written = write(fd, write_buffer, to_write);
if (bytes_written != to_write) {
perror("Failed to write to file");
free(write_buffer);
close(fd);
fclose(urandom);
return 1;
}
total_written += bytes_written;
}
free(write_buffer);
printf("Finished writing random data.\n");
// Close and reopen the file to reset file offsets and cache state
close(fd);
// Reopen the file for reading with O_DIRECT
fd = open(FILE_NAME, O_RDONLY | O_DIRECT);
if (fd < 0) {
perror("Failed to reopen file for reading");
fclose(urandom);
return 1;
}
// Allocate aligned read buffer
if (posix_memalign(&read_buffer, BLOCK_SIZE, READ_SIZE) != 0) {
perror("Failed to allocate aligned read buffer");
close(fd);
fclose(urandom);
return 1;
}
// Serial Read Benchmark
printf("Starting serial read benchmark...\n");
if (clock_gettime(CLOCK_MONOTONIC, &start_time) != 0) {
perror("Failed to get start time");
free(read_buffer);
close(fd);
fclose(urandom);
return 1;
}
for (i = 0; i < NUM_READS; i++) {
uint64_t offset = rand_u64(urandom) % (file_size - READ_SIZE);
// Align offset to BLOCK_SIZE
offset = offset & ~(BLOCK_SIZE - 1);
if (lseek(fd, (off_t)offset, SEEK_SET) == (off_t)-1) {
perror("Failed to seek");
free(read_buffer);
close(fd);
fclose(urandom);
return 1;
}
ssize_t bytes_read = read(fd, read_buffer, READ_SIZE);
if (bytes_read != READ_SIZE) {
perror("Failed to read");
free(read_buffer);
close(fd);
fclose(urandom);
return 1;
}
}
if (clock_gettime(CLOCK_MONOTONIC, &end_time) != 0) {
perror("Failed to get end time");
free(read_buffer);
close(fd);
fclose(urandom);
return 1;
}
total_time_serial = (end_time.tv_sec - start_time.tv_sec) +
(end_time.tv_nsec - start_time.tv_nsec) / 1e9;
printf("Serial read benchmark completed.\n");
printf("Total time for %d random reads (serial): %f seconds\n", NUM_READS, total_time_serial);
printf("Average time per read (serial): %f milliseconds\n", (total_time_serial / NUM_READS) * 1000);
free(read_buffer);
close(fd);
// Parallel Read Benchmark
printf("Starting parallel read benchmark with %d threads...\n", NUM_THREADS);
pthread_t threads[NUM_THREADS];
thread_data_t thread_data[NUM_THREADS];
int rc;
// Rewind urandom to ensure different random numbers for the parallel benchmark
fseek(urandom, 0, SEEK_SET);
// Start timing
if (clock_gettime(CLOCK_MONOTONIC, &start_time) != 0) {
perror("Failed to get start time for parallel benchmark");
fclose(urandom);
return 1;
}
for (i = 0; i < NUM_THREADS; i++) {
thread_data[i].thread_id = i;
thread_data[i].file_size = file_size;
thread_data[i].urandom = fopen("/dev/urandom", "rb");
if (thread_data[i].urandom == NULL) {
perror("Failed to open /dev/urandom in thread");
return 1;
}
rc = pthread_create(&threads[i], NULL, thread_read, (void *)&thread_data[i]);
if (rc) {
fprintf(stderr, "Error: Unable to create thread %d, rc = %d\n", i, rc);
fclose(thread_data[i].urandom);
return 1;
}
}
// Wait for all threads to complete
for (i = 0; i < NUM_THREADS; i++) {
pthread_join(threads[i], NULL);
fclose(thread_data[i].urandom);
}
// End timing
if (clock_gettime(CLOCK_MONOTONIC, &end_time) != 0) {
perror("Failed to geWriting random data to the file...
Finished writing random data.
Starting serial read benchmark...
Serial read benchmark completed.
Total time for 1000000 random reads (serial): 108.443336 seconds
Average time per read (serial): 0.108443 milliseconds
Starting parallel read benchmark with 400 threads...
Parallel read benchmark completed.
Total time for 1000000 random reads (parallel): 2.348680 seconds
Average time per read (parallel): 0.002349 milliseconds
Comparison:
Serial total time: 108.443336 seconds
Parallel total time: 2.348680 seconds
Speedup: 46.172040x
t end time for parallel benchmark");
return 1;
}
total_time_parallel = (end_time.tv_sec - start_time.tv_sec) +
(end_time.tv_nsec - start_time.tv_nsec) / 1e9;
printf("Parallel read benchmark completed.\n");
printf("Total time for %d random reads (parallel): %f seconds\n", NUM_READS, total_time_parallel);
printf("Average time per read (parallel): %f milliseconds\n", (total_time_parallel / NUM_READS) * 1000);
// Comparison
printf("\nComparison:\n");
printf("Serial total time: %f seconds\n", total_time_serial);
printf("Parallel total time: %f seconds\n", total_time_parallel);
printf("Speedup: %fx\n", total_time_serial / total_time_parallel);
fclose(urandom);
return 0;
}
```
Sample output:
```
Writing random data to the file...
Finished writing random data.
Starting serial read benchmark...
Serial read benchmark completed.
Total time for 1000000 random reads (serial): 108.443336 seconds
Average time per read (serial): 0.108443 milliseconds
Starting parallel read benchmark with 400 threads...
Parallel read benchmark completed.
Total time for 1000000 random reads (parallel): 2.348680 seconds
Average time per read (parallel): 0.002349 milliseconds
Comparison:
Serial total time: 108.443336 seconds
Parallel total time: 2.348680 seconds
Speedup: 46.172040x
```