diff options
-rw-r--r-- | inc/spec.h | 6 | ||||
-rw-r--r-- | inc/tree.h | 46 | ||||
-rw-r--r-- | readme.md | 12 | ||||
-rw-r--r-- | src/build.c | 1 | ||||
-rw-r--r-- | src/free.c | 2 | ||||
-rw-r--r-- | src/main.c | 50 | ||||
-rw-r--r-- | src/parse.c | 54 | ||||
-rw-r--r-- | src/tree.c | 363 |
8 files changed, 461 insertions, 73 deletions
@@ -10,15 +10,11 @@ struct bloc_header { char identifier[BLOC_IDENTIFIER_LENGTH]; short length; - void *data; + void *entries; } __attribute__((packed)); struct bloc_entry { void *expression; } __attribute__((packed)); -struct bloc_structure { - void *expression; -} __attribute__((packed)); - #endif diff --git a/inc/tree.h b/inc/tree.h new file mode 100644 index 0000000..8f11f1b --- /dev/null +++ b/inc/tree.h @@ -0,0 +1,46 @@ +// Copyright (c) 2023, Marvin Borner <dev@marvinborner.de> +// SPDX-License-Identifier: MIT + +#ifndef BLOC_TREE_H +#define BLOC_TREE_H + +#include <stdint.h> + +#include <term.h> + +#define VALIDATED_TREE ((int)0x0) +#define INVALIDATED_TREE ((int)0xffffffff) +#define FREEABLE_TREE(t) \ + ((t)->state != VALIDATED_TREE && (t)->state != INVALIDATED_TREE) + +struct tree { + term_type type; + uint32_t hash; + int state; // zero or index to ref + int size; // blc length + union { + struct { + struct tree *term; + } abs; + struct { + struct tree *lhs; + struct tree *rhs; + } app; + struct { + int index; + } var; + struct { + size_t index; + } ref; + } u; +}; + +struct list { + int val; // length or priority + void *data; + struct list *next; +}; + +struct list *tree_merge_duplicates(struct term *term); + +#endif @@ -31,6 +31,8 @@ following derivation of normal bit-encoded BLC: | 1<sup>i+1</sup>0 | bruijn index `i` | | 011I | 2 byte index to an entry | +The final program will be in the last entry. + ## Example Let `E` be some kind of expression like `E=\x.(((M (\y.N)) M) N)`, where @@ -44,13 +46,13 @@ A possible encoding in BLoC: | from | to | content | |:-----|:-----|:--------------------------------------------------------------------------------------| | 0x00 | 0x04 | “BLoC” | -| 0x04 | 0x06 | number of entries: 2 | -| 0x06 | 0x16 | encoded `M` | -| 0x16 | 0x26 | encoded `N` | -| 0x26 | 0x33 | `00010010010011<M>00011<N>011<M>011<N>`, where `<M>=0` and `<N>=1` are 2 byte indices | +| 0x04 | 0x06 | number of entries: 3 | +| 0x06 | 0x17 | encoded `M`: gets a bit longer due to different encoding | +| 0x17 | 0x28 | encoded `N`: gets a bit longer due to different encoding | +| 0x28 | 0x35 | `00010010010011<M>00011<N>011<M>011<N>`, where `<M>=0` and `<N>=1` are 2 byte indices | Even in this small example BLoC uses less space than BLC (0x33 vs. 0x42 -bytes). Depending on the values of `M` and `N`, this could have +bytes). Depending on the content of `M` and `N`, this could have potentially been compressed even more. The compressor in this project uses Merkle trees to accomplish this. diff --git a/src/build.c b/src/build.c index 510afe0..d415c29 100644 --- a/src/build.c +++ b/src/build.c @@ -74,7 +74,6 @@ void write_bloc(struct term *term, const char *path) write_bblc(N, file); // TODO - short table_index = 0; char byte = 0; int bit = 0; write_bit(0, file, &byte, &bit); @@ -25,7 +25,7 @@ void free_term(struct term *term) free(term); break; default: - fprintf(stderr, "Invalid type %d\n", term->type); + fprintf(stderr, "invalid type %d\n", term->type); } } @@ -8,6 +8,7 @@ #include <term.h> #include <print.h> +#include <tree.h> #include <parse.h> #include <build.h> #include <free.h> @@ -69,30 +70,31 @@ static char *read_file(const char *path) int main(int argc, char **argv) { - /* if (argc < 2) { */ - /* fprintf(stderr, "Invalid arguments\n"); */ - /* return 1; */ - /* } */ - - /* char *input; */ - /* if (argv[1][0] == '-') { */ - /* input = read_stdin(); */ - /* } else { */ - /* input = read_file(argv[1]); */ - /* } */ - - /* if (!input) */ - /* return 1; */ - - /* struct term *parsed = parse_blc(input); */ - /* print_bruijn(parsed); */ - - write_bloc(0, "test.bloc"); - - const char *input = read_file("test.bloc"); - struct bloc_parsed *bloc = parse_bloc(input); - struct term *term = from_bloc(bloc); - print_blc(term); + if (argc < 2) { + fprintf(stderr, "Invalid arguments\n"); + return 1; + } + + char *input; + if (argv[1][0] == '-') { + input = read_stdin(); + } else { + input = read_file(argv[1]); + } + + if (!input) + return 1; + + struct term *parsed = parse_blc(input); + print_bruijn(parsed); + + tree_merge_duplicates(parsed); + + /* write_bloc(0, "test.bloc"); */ + /* const char *input = read_file("test.bloc"); */ + /* struct bloc_parsed *bloc = parse_bloc(input); */ + /* struct term *term = from_bloc(bloc); */ + /* print_blc(term); */ /* free_term(term); // TODO: Fix sharing user-after-free */ diff --git a/src/parse.c b/src/parse.c index 8c69056..5233354 100644 --- a/src/parse.c +++ b/src/parse.c @@ -44,19 +44,23 @@ struct term *parse_blc(const char *term) #define BIT_AT(i) (term[(i) / 8] & (1 << (7 - ((i) % 8)))) -// parses normal bit-encoded blc -static struct term *parse_bblc(const char *term, size_t *bit) +// parses bloc's bit-encoded blc +// 00M -> abstraction of M +// 010MN -> application of M and N +// 1X0 -> bruijn index, amount of 1s in X +// 011I -> 2B index to entry +static struct term *parse_bloc_bblc(const char *term, size_t *bit) { struct term *res = 0; if (!BIT_AT(*bit) && !BIT_AT(*bit + 1)) { (*bit) += 2; res = new_term(ABS); - res->u.abs.term = parse_bblc(term, bit); - } else if (!BIT_AT(*bit) && BIT_AT(*bit + 1)) { - (*bit) += 2; + res->u.abs.term = parse_bloc_bblc(term, bit); + } else if (!BIT_AT(*bit) && BIT_AT(*bit + 1) && !BIT_AT(*bit)) { + (*bit) += 3; res = new_term(APP); - res->u.app.lhs = parse_bblc(term, bit); - res->u.app.rhs = parse_bblc(term, bit); + res->u.app.lhs = parse_bloc_bblc(term, bit); + res->u.app.rhs = parse_bloc_bblc(term, bit); } else if (BIT_AT(*bit)) { const size_t cur = *bit; while (BIT_AT(*bit)) @@ -64,28 +68,8 @@ static struct term *parse_bblc(const char *term, size_t *bit) res = new_term(VAR); res->u.var.index = *bit - cur - 1; (*bit)++; - } else { - (*bit)++; - res = parse_bblc(term, bit); - } - return res; -} - -// parses bloc's bit-encoded blc (1I => 2B index) -static struct term *parse_bloc_bblc(const char *term, size_t *bit) -{ - struct term *res = 0; - if (!BIT_AT(*bit) && !BIT_AT(*bit + 1)) { - (*bit) += 2; - res = new_term(ABS); - res->u.abs.term = parse_bloc_bblc(term, bit); - } else if (!BIT_AT(*bit) && BIT_AT(*bit + 1)) { - (*bit) += 2; - res = new_term(APP); - res->u.app.lhs = parse_bloc_bblc(term, bit); - res->u.app.rhs = parse_bloc_bblc(term, bit); - } else if (BIT_AT(*bit)) { - (*bit) += 1; + } else if (!BIT_AT(*bit) && BIT_AT(*bit + 1) && BIT_AT(*bit)) { + (*bit) += 3; res = new_term(REF); short index = 0; for (int i = 0; i < 16; i++) @@ -112,19 +96,16 @@ struct bloc_parsed *parse_bloc(const void *bloc) parsed->length = header->length; parsed->entries = malloc(header->length * sizeof(struct term *)); - const struct bloc_entry *current = (const void *)&header->data; + const struct bloc_entry *current = (const void *)&header->entries; for (size_t i = 0; i < parsed->length; i++) { size_t len = 0; - parsed->entries[i] = parse_bblc((const char *)current, &len); + parsed->entries[i] = + parse_bloc_bblc((const char *)current, &len); current = (const struct bloc_entry *)(((const char *)current) + (len / 8) + (len % 8 != 0)); } - size_t len = 0; - const char *term = (const char *)current; - parsed->term = parse_bloc_bblc(term, &len); - return parsed; } @@ -139,8 +120,7 @@ static struct term *rec_bloc(struct term *term, struct bloc_parsed *bloc) rec_bloc(term->u.app.rhs, bloc); break; case VAR: - fprintf(stderr, "bloc can't have vars\n"); - return 0; + break; case REF: if (term->u.ref.index >= bloc->length) { fprintf(stderr, "invalid entry reference\n"); diff --git a/src/tree.c b/src/tree.c new file mode 100644 index 0000000..b129f9a --- /dev/null +++ b/src/tree.c @@ -0,0 +1,363 @@ +// Copyright (c) 2023, Marvin Borner <dev@marvinborner.de> +// SPDX-License-Identifier: MIT + +// We need to find the longest repeating subexpressions. +// We do this by creating a kind-of merkle tree out of the expressions +// and finding the largest repeating subtrees. + +#include <stdio.h> +#include <search.h> +#include <string.h> +#include <stdlib.h> + +#include <tree.h> + +static inline uint32_t murmur_32_scramble(uint32_t k) +{ + k *= 0xcc9e2d51; + k = (k << 15) | (k >> 17); + k *= 0x1b873593; + return k; +} + +// TODO: I'm really unsure whether murmur3 is appropriate for this. +static uint32_t murmur3_32(const uint8_t *key, size_t len, uint32_t seed) +{ + uint32_t h = seed; + uint32_t k; + for (size_t i = len >> 2; i; i--) { + memcpy(&k, key, sizeof(uint32_t)); + key += sizeof(uint32_t); + h ^= murmur_32_scramble(k); + h = (h << 13) | (h >> 19); + h = h * 5 + 0xe6546b64; + } + k = 0; + for (size_t i = len & 3; i; i--) { + k <<= 8; + k |= key[i - 1]; + } + h ^= murmur_32_scramble(k); + h ^= len; + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + return h; +} + +static struct list *list_end = 0; +static struct list *list_add(struct list *list, void *data) +{ + struct list *new = malloc(sizeof(*new)); + new->next = list; + new->data = data; + new->val = list ? list->val + 1 : 1; // amount of trees in list + return new; +} + +static struct list *list_add_prioritized(struct list *list, struct list *sub) +{ + struct list *new = malloc(sizeof(*new)); + /* new->val = ((struct tree *)sub->data)->size * sub->val; */ + new->val = ((struct tree *)sub->data)->size; + new->data = sub; + + if (!list) { + new->next = list; + return new; + } + + if (new->val >= list->val) { // insert at head + new->next = list; + return new; + } + + struct list *iterator = list; + while (iterator->next && new->val < iterator->next->val) { + iterator = iterator->next; + } + + new->next = iterator->next; + iterator->next = new; + + return list; +} + +// element of the tsearch tree +struct set_element { + uint32_t hash; + struct list *list; +}; + +// comparison_fn_t for tsearch +static int hash_compare(const void *_a, const void *_b) +{ + const struct set_element *a = _a; + const struct set_element *b = _b; + + if (a->hash < b->hash) + return -1; + if (a->hash > b->hash) + return 1; + return 0; +} + +// applies the hash function to the tree's elements (similar to merkle trees) +// TODO: as above: rethink hash choice +static struct tree *build_tree(struct term *term, void **set) +{ + struct tree *tree = malloc(sizeof(*tree)); + tree->type = term->type; + tree->state = VALIDATED_TREE; + + switch (term->type) { + case ABS: + tree->u.abs.term = build_tree(term->u.abs.term, set); + tree->hash = + murmur3_32((const uint8_t *)&tree->type, + sizeof(tree->type), tree->u.abs.term->hash); + tree->size = tree->u.abs.term->size + 2; + break; + case APP: + tree->u.app.lhs = build_tree(term->u.app.lhs, set); + tree->u.app.rhs = build_tree(term->u.app.rhs, set); + tree->hash = + murmur3_32((const uint8_t *)&tree->type, + sizeof(tree->type), tree->u.app.lhs->hash); + tree->hash = + murmur3_32((const uint8_t *)&tree->hash, + sizeof(tree->hash), tree->u.app.rhs->hash); + tree->size = tree->u.app.lhs->size + tree->u.app.rhs->size + 3; + break; + case VAR: + tree->u.var.index = term->u.var.index; + tree->hash = murmur3_32((const uint8_t *)&tree->type, + sizeof(tree->type), tree->u.var.index); + tree->size = term->u.var.index; + break; + default: + fprintf(stderr, "invalid type %d\n", term->type); + return 0; + } + + if (tree->size < 10) // not suitable for deduplication + return tree; + + struct set_element *element = malloc(sizeof(*element)); + element->hash = tree->hash; + + struct set_element **handle = tsearch(element, set, hash_compare); + if (*handle == element) { // first of its kind + element->list = list_add(list_end, tree); + return tree; + } + + free(element); // already exists, not needed + (*handle)->list = list_add((*handle)->list, tree); + + fprintf(stderr, "found suitable duplicate! %x %d\n", tree->hash, + tree->size); + return tree; +} + +static struct term *tree_to_term(struct tree *tree) +{ + struct term *term = new_term(tree->type); + + switch (tree->type) { + case ABS: + term->u.abs.term = tree_to_term(tree->u.abs.term); + break; + case APP: + term->u.app.lhs = tree_to_term(tree->u.app.lhs); + term->u.app.rhs = tree_to_term(tree->u.app.rhs); + break; + case VAR: + term->u.var.index = tree->u.var.index; + break; + case REF: + term->u.ref.index = tree->u.ref.index; + break; + default: + fprintf(stderr, "invalid type %d\n", term->type); + } + return term; +} + +static struct tree *clone_tree_root(struct tree *tree) +{ + struct tree *new = malloc(sizeof(*new)); + new->type = tree->type; + new->hash = tree->hash; + + switch (tree->type) { + case ABS: + new->u.abs.term = tree->u.abs.term; + break; + case APP: + new->u.app.lhs = tree->u.app.lhs; + new->u.app.rhs = tree->u.app.rhs; + break; + case VAR: + new->u.var.index = tree->u.var.index; + break; + default: + fprintf(stderr, "invalid type %d\n", tree->type); + free(new); + return 0; + } + + return new; +} + +static void invalidate_tree(struct tree *tree) +{ + tree->state = INVALIDATED_TREE; + switch (tree->type) { + case ABS: + invalidate_tree(tree->u.abs.term); + break; + case APP: + invalidate_tree(tree->u.app.lhs); + invalidate_tree(tree->u.app.rhs); + break; + case VAR: + break; + default: + fprintf(stderr, "invalid type %d\n", tree->type); + } +} + +static void free_tree(struct tree *tree) +{ + switch (tree->type) { + case ABS: + free_tree(tree->u.abs.term); + break; + case APP: + free_tree(tree->u.app.lhs); + free_tree(tree->u.app.rhs); + break; + case VAR: + break; + case REF: + break; + default: + fprintf(stderr, "invalid type %d\n", tree->type); + return; + } + + if (FREEABLE_TREE(tree)) + free(tree); +} + +static void ref_invalidated_tree(struct tree *tree) +{ + switch (tree->type) { + case ABS: + free_tree(tree->u.abs.term); + break; + case APP: + free_tree(tree->u.app.lhs); + free_tree(tree->u.app.rhs); + break; + case VAR: + break; + default: + fprintf(stderr, "invalid type %d\n", tree->type); + } + if (tree->state != INVALIDATED_TREE && + tree->state != VALIDATED_TREE) { // is reffed + tree->type = REF; + tree->u.ref.index = tree->state - 1; + tree->state = VALIDATED_TREE; + } +} + +struct list *tree_merge_duplicates(struct term *term) +{ + void *set = 0; + build_tree(term, &set); + if (!set) + return 0; + + // construct priority list while deleting set + struct list *prioritized = list_end; + while (set) { + struct set_element *element = *(struct set_element **)set; + prioritized = list_add_prioritized(prioritized, element->list); + tdelete(element, &set, hash_compare); + free(element); + } + + struct list *final = list_end; + struct list *invalidated = list_end; + + struct list *iterator = prioritized; + + // add longest (=> blueprint/structure of expression) + final = list_add(final, ((struct list *)iterator->data)->data); + iterator = iterator->next; + + while (iterator) { + struct list *list = iterator->data; + struct tree *head = list->data; + + print_bruijn(tree_to_term(head)); + printf("\n%x: %d\n\n", head->hash, list->val); + + if (head->state != VALIDATED_TREE) { + printf("skipping invalidated: %x\n", head->hash); + iterator = iterator->next; + continue; + } + + if (list->val > 1) { // needs merging + // clone root so it doesn't get replaced by a ref to itself + struct tree *cloned_head = clone_tree_root(head); + cloned_head->state = INVALIDATED_TREE; + final = list_add(final, cloned_head); + + // invalidate all subtrees + while (list) { + invalidate_tree(list->data); + ((struct tree *)list->data)->state = + final->val - 1; + + invalidated = list_add(invalidated, list->data); + list = list->next; + } + } + + iterator = iterator->next; + } + + printf("\n---\n"); + + iterator = invalidated; + while (iterator) { + struct tree *huh = iterator->data; + printf("%x %d\n", huh->hash, huh->state); + print_bruijn(tree_to_term(huh)); + printf("\n"); + ref_invalidated_tree(iterator->data); + struct list *temp = iterator->next; + free(iterator); + iterator = temp; + } + + printf("\n---\n"); + + iterator = final; + while (iterator) { + struct tree *huh = iterator->data; + struct term *foo = tree_to_term(huh); + print_bruijn(foo); + printf("\n%x\n\n", huh->hash); + iterator = iterator->next; + } + + return final; +} |