Often when an interpolated search guesses wrong, the interpolation tends to pin you against one side or the other of the range, so the worst case is linear. By allowing only a finite number of failed probes (ones that move the same boundary as the previous probe did; a well-behaved interpolation search will on average alternate between moving hi and lo) before taking a bisection step, you can maintain the log guarantee of bisection.
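One way to read that guard, as a sketch: count consecutive probes that move the same boundary, and once a small budget is used up, take an ordinary bisection step. The function name and the budget below are made up purely for illustration, assuming a sorted uint16_t array:

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical sketch: interpolation search that falls back to a bisection
 * step after MAX_SAME_SIDE consecutive probes have moved the same boundary,
 * which restores the logarithmic guarantee. */
enum { MAX_SAME_SIDE = 3 }; /* made-up budget */

bool interp_search_guarded(const uint16_t *a, int32_t n, uint16_t key) {
  int32_t lo = 0, hi = n - 1;
  int same_side = 0; /* consecutive probes that moved the same boundary */
  int last_side = 0; /* +1 = moved lo up, -1 = moved hi down, 0 = none yet */
  while (lo <= hi && key >= a[lo] && key <= a[hi]) {
    int32_t mid;
    if (same_side >= MAX_SAME_SIDE || a[hi] == a[lo]) {
      mid = lo + (hi - lo) / 2; /* bisection step halves the range */
    } else {
      /* interpolation: guess where key sits between a[lo] and a[hi] */
      mid = lo + (int32_t)(((int64_t)(key - a[lo]) * (hi - lo)) / (a[hi] - a[lo]));
    }
    if (a[mid] == key) return true;
    int side = (a[mid] < key) ? +1 : -1;
    same_side = (side == last_side) ? same_side + 1 : 1;
    last_side = side;
    if (side > 0) lo = mid + 1; else hi = mid - 1;
  }
  return false;
}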
[...]
The binary search checks one value at a time. However, recent processors can load and check more than one value at once. They have excellent memory-level parallelism. This suggests that instead of a binary search, we might want to try a quaternary search..."
First of all, brilliant observations! (Overall, a great article too!)
Yes, today's processors indeed have a level of parallelism that was inconceivable at the time the original Mathematicians, then-to-be Computer Scientists, conceived of Binary Search...
Now I myself wonder whether these ideas might be extended to GPUs: that is, whether the massively parallel execution capability of a GPU could be used to search data the way Binary Search does, and what such an appropriately parallelized algorithm/data structure would look like. Keep in mind that if we consider an updateable data structure, parts of it may need to be locked appropriately while multiple searches and updates are occurring simultaneously. So which data structure/algorithm would be most efficient for a massively parallel scenario like that?
Anyway, great article and brilliant observations!
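For what it's worth, the idea in the quoted passage can be sketched even without SIMD: a quaternary search issues three independent probes per step, which an out-of-order core can keep in flight at once, and it cuts the range to roughly a quarter. A rough, hypothetical sketch over a sorted uint16_t array (the name and structure are mine, not from the article):

#include <stdbool.h>
#include <stdint.h>

bool quaternary_search(const uint16_t *a, int32_t len, uint16_t key) {
  int32_t base = 0;
  int32_t n = len;
  while (n > 3) {
    int32_t quarter = n >> 2;
    /* three probes at the 1/4, 2/4 and 3/4 marks; the loads do not depend
     * on each other, so they can overlap in the pipeline */
    int c1 = a[base + quarter] < key;
    int c2 = a[base + 2 * quarter] < key;
    int c3 = a[base + 3 * quarter] < key;
    base += (c1 + c2 + c3) * quarter; /* branchless advance */
    n -= 3 * quarter;
  }
  /* the candidate index now lies in [base, base + n]; scan the remainder */
  for (int32_t j = 0; j <= n && base + j < len; j++) {
    if (a[base + j] == key) return true;
    if (a[base + j] > key) break;
  }
  return false;
}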
40x Faster Binary Search - This talk will first expose the lie that binary search takes O(lg n) time — it very much does not! Instead, we will see that binary search has only constant overhead compared to an oracle. Then, we will exploit everything that modern CPUs have to offer (SIMD, ILP, prefetching, efficient caching) in order to gain 40x increased throughput over the Rust standard library implementation.
#include <stdbool.h>
#include <stdint.h>
#include <arm_neon.h>

/*
 * Author: aam@fastmail.fm
 *
 * Apple M4 Max (P-core) variant of simd_quad which uses a key spine
 * to great effect (blog post summary incoming!)
 */
bool simd_quad_m4(const uint16_t *carr, int32_t cardinality, uint16_t pos) {
enum { gap = 64 };
if (cardinality < gap) {
if (cardinality >= 32) {
// 32 <= n < 64: NEON-compare the first 32 as a single x4 load,
// sweep the remainder.
uint16x8_t needle = vdupq_n_u16(pos);
uint16x8x4_t v = vld1q_u16_x4(carr);
uint16x8_t hit = vorrq_u16(
vorrq_u16(vceqq_u16(v.val[0], needle), vceqq_u16(v.val[1], needle)),
vorrq_u16(vceqq_u16(v.val[2], needle), vceqq_u16(v.val[3], needle)));
if (vmaxvq_u16(hit) != 0) return true;
for (int32_t j = 32; j < cardinality; j++) {
uint16_t x = carr[j];
if (x >= pos) return x == pos;
}
return false;
}
if (cardinality >= 16) {
// 16 <= n < 32: paired x2 load + sweep tail.
uint16x8_t needle = vdupq_n_u16(pos);
uint16x8x2_t v = vld1q_u16_x2(carr);
uint16x8_t hit = vorrq_u16(vceqq_u16(v.val[0], needle),
vceqq_u16(v.val[1], needle));
if (vmaxvq_u16(hit) != 0) return true;
for (int32_t j = 16; j < cardinality; j++) {
uint16_t x = carr[j];
if (x >= pos) return x == pos;
}
return false;
}
if (cardinality >= 8) {
// 8 <= n < 16: single 128-bit compare + sweep tail.
uint16x8_t needle = vdupq_n_u16(pos);
uint16x8_t v = vld1q_u16(carr);
if (vmaxvq_u16(vceqq_u16(v, needle)) != 0) return true;
for (int32_t j = 8; j < cardinality; j++) {
uint16_t x = carr[j];
if (x >= pos) return x == pos;
}
return false;
}
for (int32_t j = 0; j < cardinality; j++) {
uint16_t v = carr[j];
if (v >= pos) return v == pos;
}
return false;
}
int32_t num_blocks = cardinality / gap;
int32_t base = 0;
int32_t n = num_blocks;
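// Quaternary descent over 64-key blocks: probe the last key of the blocks
// at the 1/4, 2/4 and 3/4 marks and advance base branchlessly by how many
// of those keys are still below pos. The three loads are independent, so
// they can be in flight at the same time.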
while (n > 3) {
int32_t quarter = n >> 2;
int32_t k1 = carr[(base + quarter + 1) * gap - 1];
int32_t k2 = carr[(base + 2 * quarter + 1) * gap - 1];
int32_t k3 = carr[(base + 3 * quarter + 1) * gap - 1];
int32_t c1 = (k1 < pos);
int32_t c2 = (k2 < pos);
int32_t c3 = (k3 < pos);
base += (c1 + c2 + c3) * quarter;
n -= 3 * quarter;
}
while (n > 1) {
int32_t half = n >> 1;
base = (carr[(base + half + 1) * gap - 1] < pos) ? base + half : base;
n -= half;
}
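// One last probe picks the candidate block; anything past the final full
// block falls through to the scalar tail scan below.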
int32_t lo = (carr[(base + 1) * gap - 1] < pos) ? base + 1 : base;
if (lo < num_blocks) {
const uint16_t *blk = carr + lo * gap;
uint16x8_t needle = vdupq_n_u16(pos);
uint16x8x4_t a = vld1q_u16_x4(blk);
uint16x8x4_t b = vld1q_u16_x4(blk + 32);
uint16x8_t h0 = vorrq_u16(
vorrq_u16(vceqq_u16(a.val[0], needle), vceqq_u16(a.val[1], needle)),
vorrq_u16(vceqq_u16(a.val[2], needle), vceqq_u16(a.val[3], needle)));
uint16x8_t h1 = vorrq_u16(
vorrq_u16(vceqq_u16(b.val[0], needle), vceqq_u16(b.val[1], needle)),
vorrq_u16(vceqq_u16(b.val[2], needle), vceqq_u16(b.val[3], needle)));
return vmaxvq_u16(vorrq_u16(h0, h1)) != 0;
}
for (int32_t j = num_blocks * gap; j < cardinality; j++) {
uint16_t v = carr[j];
if (v >= pos) return v == pos;
}
return false;
}

/*
 * Spine variant, M4 edition.
 *
 * Pack the interpolation probe keys into a dense contiguous region so the
 * cold-cache pointer chase streams through consecutive cache lines:
 *
 *   n=4096 -> 64 spine keys -> 128 B = 1 M4 cache line
 *   n=2048 -> 32 spine keys ->  64 B = half a line
 *   n=1024 -> 16 spine keys ->  32 B
 *
 * The entire interpolation phase for a max-sized Roaring container now
 * lives in one cache line. The final SIMD block check still loads from
 * carr.
 *
 * The num_blocks <= 3 fallback: with very few blocks the carr-based probes
 * accidentally prime the final block's lines, which the spine path disrupts.
 */
bool simd_quad_m4_spine(const uint16_t *carr, const uint16_t *spine,
                        int32_t cardinality, uint16_t pos) {
enum { gap = 64 };
if (cardinality < gap) {
// Same fast paths as simd_quad_m4 -- spine is irrelevant here.
if (cardinality >= 32) {
uint16x8_t needle = vdupq_n_u16(pos);
uint16x8x4_t v = vld1q_u16_x4(carr);
uint16x8_t hit = vorrq_u16(
vorrq_u16(vceqq_u16(v.val[0], needle), vceqq_u16(v.val[1], needle)),
vorrq_u16(vceqq_u16(v.val[2], needle), vceqq_u16(v.val[3], needle)));
if (vmaxvq_u16(hit) != 0) return true;
for (int32_t j = 32; j < cardinality; j++) {
uint16_t x = carr[j];
if (x >= pos) return x == pos;
}
return false;
}
if (cardinality >= 16) {
uint16x8_t needle = vdupq_n_u16(pos);
uint16x8x2_t v = vld1q_u16_x2(carr);
uint16x8_t hit = vorrq_u16(vceqq_u16(v.val[0], needle),
vceqq_u16(v.val[1], needle));
if (vmaxvq_u16(hit) != 0) return true;
for (int32_t j = 16; j < cardinality; j++) {
uint16_t x = carr[j];
if (x >= pos) return x == pos;
}
return false;
}
if (cardinality >= 8) {
uint16x8_t needle = vdupq_n_u16(pos);
uint16x8_t v = vld1q_u16(carr);
if (vmaxvq_u16(vceqq_u16(v, needle)) != 0) return true;
for (int32_t j = 8; j < cardinality; j++) {
uint16_t x = carr[j];
if (x >= pos) return x == pos;
}
return false;
}
for (int32_t j = 0; j < cardinality; j++) {
uint16_t v = carr[j];
if (v >= pos) return v == pos;
}
return false;
}
int32_t num_blocks = cardinality / gap;
if (num_blocks <= 3) {
return simd_quad_m4(carr, cardinality, pos);
}
int32_t base = 0;
int32_t n = num_blocks;
// Pull the whole spine into L1 up front. For n in [256, 4096] this is
// 1 line (128 B); for smaller n it is a partial line. Cheap on cold.
__builtin_prefetch(spine);
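// Same quaternary descent as simd_quad_m4, but the probe keys now come
// from the densely packed spine (spine[i] == last key of block i).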
while (n > 3) {
int32_t quarter = n >> 2;
int32_t k1 = spine[base + quarter];
int32_t k2 = spine[base + 2 * quarter];
int32_t k3 = spine[base + 3 * quarter];
int32_t c1 = (k1 < pos);
int32_t c2 = (k2 < pos);
int32_t c3 = (k3 < pos);
base += (c1 + c2 + c3) * quarter;
n -= 3 * quarter;
}
while (n > 1) {
int32_t half = n >> 1;
base = (spine[base + half] < pos) ? base + half : base;
n -= half;
}
int32_t lo = (spine[base] < pos) ? base + 1 : base;
if (lo < num_blocks) {
const uint16_t *blk = carr + lo * gap;
uint16x8_t needle = vdupq_n_u16(pos);
uint16x8x4_t a = vld1q_u16_x4(blk);
uint16x8x4_t b = vld1q_u16_x4(blk + 32);
uint16x8_t h0 = vorrq_u16(
vorrq_u16(vceqq_u16(a.val[0], needle), vceqq_u16(a.val[1], needle)),
vorrq_u16(vceqq_u16(a.val[2], needle), vceqq_u16(a.val[3], needle)));
uint16x8_t h1 = vorrq_u16(
vorrq_u16(vceqq_u16(b.val[0], needle), vceqq_u16(b.val[1], needle)),
vorrq_u16(vceqq_u16(b.val[2], needle), vceqq_u16(b.val[3], needle)));
return vmaxvq_u16(vorrq_u16(h0, h1)) != 0;
}
for (int32_t j = num_blocks * gap; j < cardinality; j++) {
uint16_t v = carr[j];
if (v >= pos) return v == pos;
}
return false;
}

// Build the spine for a given carr. Caller allocates cardinality/64 u16s.
void simd_quad_m4_build_spine(const uint16_t *carr, int32_t cardinality,
                              uint16_t *spine) {
  enum { gap = 64 };
  int32_t num_blocks = cardinality / gap;
  for (int32_t i = 0; i < num_blocks; i++) {
    spine[i] = carr[(i + 1) * gap - 1];
  }
}
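For completeness, a small hypothetical driver (not part of the original code, assumed to be appended to the same file) showing how the pieces fit together: fill a sorted container, build its spine once, then query through either entry point:

#include <stdio.h>
#include <stdlib.h>

int main(void) {
  enum { n = 4096 }; /* a full-sized array container */
  uint16_t *carr = malloc(n * sizeof *carr);
  uint16_t *spine = malloc((n / 64) * sizeof *spine);
  if (!carr || !spine) return 1;
  for (int32_t i = 0; i < n; i++) carr[i] = (uint16_t)(3 * i); /* sorted, step 3 */

  simd_quad_m4_build_spine(carr, n, spine);

  printf("%d %d\n", simd_quad_m4(carr, n, 300), simd_quad_m4(carr, n, 301));
  printf("%d %d\n", simd_quad_m4_spine(carr, spine, n, 300),
                    simd_quad_m4_spine(carr, spine, n, 301));
  /* expect "1 0" twice: 300 is a multiple of 3, 301 is not */

  free(carr);
  free(spine);
  return 0;
}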
[0] A nonsense thing to ask people to implement in an interview
You see this in Rust, where they replaced the hash table implementation many years ago, the channel implementation a couple of years ago, and most recently the sort implementations for both stable and unstable sort. I expect other languages/runtimes do similar things over time as CPUs change and new approaches are discovered.
It is looking like C++26 will get compile-time reflection, which would make things like this even more feasible.