diff options
-rw-r--r-- | inc/hash.h | 4 | ||||
-rw-r--r-- | readme.md | 17 | ||||
-rw-r--r-- | src/hash.c | 80 |
3 files changed, 59 insertions, 42 deletions
@@ -7,8 +7,8 @@ #include <stdint.h> #include <stddef.h> -typedef uint32_t hash_t; +typedef uint64_t hash_t; -hash_t hash(const uint8_t *key, size_t len, uint32_t seed); +hash_t hash(const void *key, int len, uint64_t seed); #endif @@ -10,12 +10,17 @@ BLoC and heavily reduces its size. Before explaining the format or its optimization techniques, let me first show you some results: -1. The [bruijn](https://github.com/marvinborner/bruijn) expression +1. x86 C compiler [8cc](https://github.com/rui314/8cc) translated [to + lambda calculus](https://github.com/woodrush/lambda-8cc): + - the original expression takes ~5M bytes in bit-encoded BLC + - the same expression in BLoC needs only ~650K bytes (which is + around 2x the original 8cc!) +2. The [bruijn](https://github.com/marvinborner/bruijn) expression `fac (+30)`, where `fac` is the factorial implementation from `std/Math`: - the original expression takes 1200 bytes in bit-encoded BLC - the same expression in BLoC needs only 348 bytes -2. [My +3. [My solution](https://github.com/marvinborner/bruijn/blob/main/samples/aoc/2022/01/solve.bruijn) for the “Advent of Code” challenge [2022/01](https://adventofcode.com/2022/day/1) in @@ -94,11 +99,3 @@ As of right now, expressions **don’t** get beta-reduced or manipulated in any other way. As an idea for the future, long expressions could get reduced using different techniques/depths and then get replaced with the shortest one (as fully reduced expressions aren’t necessarily shorter). - -## Improvements - -There seem to be problems with *very* big files: -[8cc](https://github.com/woodrush/lambda-8cc) does not pass the bloc-blc -comparison test. I’ve not been able to reproduce this bug with any other -file and 8cc itself is too huge to comfortably debug the issue. If -you’re reading this: Please help me :( @@ -1,42 +1,62 @@ // murmur3 originally by Austin Appleby +// Copyright (c) 2013, ksss // Copyright (c) 2023, Marvin Borner <dev@marvinborner.de> // SPDX-License-Identifier: MIT #include <stdint.h> -#include <string.h> #include <hash.h> -static inline uint32_t murmur_32_scramble(uint32_t k) +hash_t hash(const void *key, int len, uint64_t seed) { - k *= 0xcc9e2d51; - k = (k << 15) | (k >> 17); - k *= 0x1b873593; - return k; -} + const uint64_t m = 0xc6a4a7935bd1e995ULL; + const int r = 47; + + uint64_t h = seed ^ (len * m); + + const uint64_t *data = (const uint64_t *)key; + const uint64_t *end = data + (len / 8); + + while (data != end) { + uint64_t k = *data++; + + k *= m; + k ^= k >> r; + k *= m; + + h ^= k; + h *= m; + } + + const unsigned char *data2 = (const unsigned char *)data; + + int b = len & 7; + if (b >= 7) { + h ^= ((uint64_t)data2[6]) << 48; + } + if (b >= 6) { + h ^= ((uint64_t)data2[5]) << 40; + } + if (b >= 5) { + h ^= ((uint64_t)data2[4]) << 32; + } + if (b >= 4) { + h ^= ((uint64_t)data2[3]) << 24; + } + if (b >= 3) { + h ^= ((uint64_t)data2[2]) << 16; + } + if (b >= 2) { + h ^= ((uint64_t)data2[1]) << 8; + } + if (b >= 1) { + h ^= ((uint64_t)data2[0]); + h *= m; + } + + h ^= h >> r; + h *= m; + h ^= h >> r; -hash_t hash(const uint8_t *key, size_t len, uint32_t seed) -{ - uint32_t h = seed; - uint32_t k; - for (size_t i = len >> 2; i; i--) { - memcpy(&k, key, sizeof(uint32_t)); - key += sizeof(uint32_t); - h ^= murmur_32_scramble(k); - h = (h << 13) | (h >> 19); - h = h * 5 + 0xe6546b64; - } - k = 0; - for (size_t i = len & 3; i; i--) { - k <<= 8; - k |= key[i - 1]; - } - h ^= murmur_32_scramble(k); - h ^= len; - h ^= h >> 16; - h *= 0x85ebca6b; - h ^= h >> 13; - h *= 0xc2b2ae35; - h ^= h >> 16; return h; } |