aboutsummaryrefslogtreecommitdiff
path: root/src/tree.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/tree.c')
-rw-r--r--src/tree.c363
1 files changed, 363 insertions, 0 deletions
diff --git a/src/tree.c b/src/tree.c
new file mode 100644
index 0000000..b129f9a
--- /dev/null
+++ b/src/tree.c
@@ -0,0 +1,363 @@
+// Copyright (c) 2023, Marvin Borner <dev@marvinborner.de>
+// SPDX-License-Identifier: MIT
+
+// We need to find the longest repeating subexpressions.
+// We do this by creating a kind-of merkle tree out of the expressions
+// and finding the largest repeating subtrees.
+
+#include <stdio.h>
+#include <search.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include <tree.h>
+
+static inline uint32_t murmur_32_scramble(uint32_t k)
+{
+ k *= 0xcc9e2d51;
+ k = (k << 15) | (k >> 17);
+ k *= 0x1b873593;
+ return k;
+}
+
+// TODO: I'm really unsure whether murmur3 is appropriate for this.
+static uint32_t murmur3_32(const uint8_t *key, size_t len, uint32_t seed)
+{
+ uint32_t h = seed;
+ uint32_t k;
+ for (size_t i = len >> 2; i; i--) {
+ memcpy(&k, key, sizeof(uint32_t));
+ key += sizeof(uint32_t);
+ h ^= murmur_32_scramble(k);
+ h = (h << 13) | (h >> 19);
+ h = h * 5 + 0xe6546b64;
+ }
+ k = 0;
+ for (size_t i = len & 3; i; i--) {
+ k <<= 8;
+ k |= key[i - 1];
+ }
+ h ^= murmur_32_scramble(k);
+ h ^= len;
+ h ^= h >> 16;
+ h *= 0x85ebca6b;
+ h ^= h >> 13;
+ h *= 0xc2b2ae35;
+ h ^= h >> 16;
+ return h;
+}
+
+static struct list *list_end = 0;
+static struct list *list_add(struct list *list, void *data)
+{
+ struct list *new = malloc(sizeof(*new));
+ new->next = list;
+ new->data = data;
+ new->val = list ? list->val + 1 : 1; // amount of trees in list
+ return new;
+}
+
+static struct list *list_add_prioritized(struct list *list, struct list *sub)
+{
+ struct list *new = malloc(sizeof(*new));
+ /* new->val = ((struct tree *)sub->data)->size * sub->val; */
+ new->val = ((struct tree *)sub->data)->size;
+ new->data = sub;
+
+ if (!list) {
+ new->next = list;
+ return new;
+ }
+
+ if (new->val >= list->val) { // insert at head
+ new->next = list;
+ return new;
+ }
+
+ struct list *iterator = list;
+ while (iterator->next && new->val < iterator->next->val) {
+ iterator = iterator->next;
+ }
+
+ new->next = iterator->next;
+ iterator->next = new;
+
+ return list;
+}
+
+// element of the tsearch tree
+struct set_element {
+ uint32_t hash;
+ struct list *list;
+};
+
+// comparison_fn_t for tsearch
+static int hash_compare(const void *_a, const void *_b)
+{
+ const struct set_element *a = _a;
+ const struct set_element *b = _b;
+
+ if (a->hash < b->hash)
+ return -1;
+ if (a->hash > b->hash)
+ return 1;
+ return 0;
+}
+
+// applies the hash function to the tree's elements (similar to merkle trees)
+// TODO: as above: rethink hash choice
+static struct tree *build_tree(struct term *term, void **set)
+{
+ struct tree *tree = malloc(sizeof(*tree));
+ tree->type = term->type;
+ tree->state = VALIDATED_TREE;
+
+ switch (term->type) {
+ case ABS:
+ tree->u.abs.term = build_tree(term->u.abs.term, set);
+ tree->hash =
+ murmur3_32((const uint8_t *)&tree->type,
+ sizeof(tree->type), tree->u.abs.term->hash);
+ tree->size = tree->u.abs.term->size + 2;
+ break;
+ case APP:
+ tree->u.app.lhs = build_tree(term->u.app.lhs, set);
+ tree->u.app.rhs = build_tree(term->u.app.rhs, set);
+ tree->hash =
+ murmur3_32((const uint8_t *)&tree->type,
+ sizeof(tree->type), tree->u.app.lhs->hash);
+ tree->hash =
+ murmur3_32((const uint8_t *)&tree->hash,
+ sizeof(tree->hash), tree->u.app.rhs->hash);
+ tree->size = tree->u.app.lhs->size + tree->u.app.rhs->size + 3;
+ break;
+ case VAR:
+ tree->u.var.index = term->u.var.index;
+ tree->hash = murmur3_32((const uint8_t *)&tree->type,
+ sizeof(tree->type), tree->u.var.index);
+ tree->size = term->u.var.index;
+ break;
+ default:
+ fprintf(stderr, "invalid type %d\n", term->type);
+ return 0;
+ }
+
+ if (tree->size < 10) // not suitable for deduplication
+ return tree;
+
+ struct set_element *element = malloc(sizeof(*element));
+ element->hash = tree->hash;
+
+ struct set_element **handle = tsearch(element, set, hash_compare);
+ if (*handle == element) { // first of its kind
+ element->list = list_add(list_end, tree);
+ return tree;
+ }
+
+ free(element); // already exists, not needed
+ (*handle)->list = list_add((*handle)->list, tree);
+
+ fprintf(stderr, "found suitable duplicate! %x %d\n", tree->hash,
+ tree->size);
+ return tree;
+}
+
+static struct term *tree_to_term(struct tree *tree)
+{
+ struct term *term = new_term(tree->type);
+
+ switch (tree->type) {
+ case ABS:
+ term->u.abs.term = tree_to_term(tree->u.abs.term);
+ break;
+ case APP:
+ term->u.app.lhs = tree_to_term(tree->u.app.lhs);
+ term->u.app.rhs = tree_to_term(tree->u.app.rhs);
+ break;
+ case VAR:
+ term->u.var.index = tree->u.var.index;
+ break;
+ case REF:
+ term->u.ref.index = tree->u.ref.index;
+ break;
+ default:
+ fprintf(stderr, "invalid type %d\n", term->type);
+ }
+ return term;
+}
+
+static struct tree *clone_tree_root(struct tree *tree)
+{
+ struct tree *new = malloc(sizeof(*new));
+ new->type = tree->type;
+ new->hash = tree->hash;
+
+ switch (tree->type) {
+ case ABS:
+ new->u.abs.term = tree->u.abs.term;
+ break;
+ case APP:
+ new->u.app.lhs = tree->u.app.lhs;
+ new->u.app.rhs = tree->u.app.rhs;
+ break;
+ case VAR:
+ new->u.var.index = tree->u.var.index;
+ break;
+ default:
+ fprintf(stderr, "invalid type %d\n", tree->type);
+ free(new);
+ return 0;
+ }
+
+ return new;
+}
+
+static void invalidate_tree(struct tree *tree)
+{
+ tree->state = INVALIDATED_TREE;
+ switch (tree->type) {
+ case ABS:
+ invalidate_tree(tree->u.abs.term);
+ break;
+ case APP:
+ invalidate_tree(tree->u.app.lhs);
+ invalidate_tree(tree->u.app.rhs);
+ break;
+ case VAR:
+ break;
+ default:
+ fprintf(stderr, "invalid type %d\n", tree->type);
+ }
+}
+
+static void free_tree(struct tree *tree)
+{
+ switch (tree->type) {
+ case ABS:
+ free_tree(tree->u.abs.term);
+ break;
+ case APP:
+ free_tree(tree->u.app.lhs);
+ free_tree(tree->u.app.rhs);
+ break;
+ case VAR:
+ break;
+ case REF:
+ break;
+ default:
+ fprintf(stderr, "invalid type %d\n", tree->type);
+ return;
+ }
+
+ if (FREEABLE_TREE(tree))
+ free(tree);
+}
+
+static void ref_invalidated_tree(struct tree *tree)
+{
+ switch (tree->type) {
+ case ABS:
+ free_tree(tree->u.abs.term);
+ break;
+ case APP:
+ free_tree(tree->u.app.lhs);
+ free_tree(tree->u.app.rhs);
+ break;
+ case VAR:
+ break;
+ default:
+ fprintf(stderr, "invalid type %d\n", tree->type);
+ }
+ if (tree->state != INVALIDATED_TREE &&
+ tree->state != VALIDATED_TREE) { // is reffed
+ tree->type = REF;
+ tree->u.ref.index = tree->state - 1;
+ tree->state = VALIDATED_TREE;
+ }
+}
+
+struct list *tree_merge_duplicates(struct term *term)
+{
+ void *set = 0;
+ build_tree(term, &set);
+ if (!set)
+ return 0;
+
+ // construct priority list while deleting set
+ struct list *prioritized = list_end;
+ while (set) {
+ struct set_element *element = *(struct set_element **)set;
+ prioritized = list_add_prioritized(prioritized, element->list);
+ tdelete(element, &set, hash_compare);
+ free(element);
+ }
+
+ struct list *final = list_end;
+ struct list *invalidated = list_end;
+
+ struct list *iterator = prioritized;
+
+ // add longest (=> blueprint/structure of expression)
+ final = list_add(final, ((struct list *)iterator->data)->data);
+ iterator = iterator->next;
+
+ while (iterator) {
+ struct list *list = iterator->data;
+ struct tree *head = list->data;
+
+ print_bruijn(tree_to_term(head));
+ printf("\n%x: %d\n\n", head->hash, list->val);
+
+ if (head->state != VALIDATED_TREE) {
+ printf("skipping invalidated: %x\n", head->hash);
+ iterator = iterator->next;
+ continue;
+ }
+
+ if (list->val > 1) { // needs merging
+ // clone root so it doesn't get replaced by a ref to itself
+ struct tree *cloned_head = clone_tree_root(head);
+ cloned_head->state = INVALIDATED_TREE;
+ final = list_add(final, cloned_head);
+
+ // invalidate all subtrees
+ while (list) {
+ invalidate_tree(list->data);
+ ((struct tree *)list->data)->state =
+ final->val - 1;
+
+ invalidated = list_add(invalidated, list->data);
+ list = list->next;
+ }
+ }
+
+ iterator = iterator->next;
+ }
+
+ printf("\n---\n");
+
+ iterator = invalidated;
+ while (iterator) {
+ struct tree *huh = iterator->data;
+ printf("%x %d\n", huh->hash, huh->state);
+ print_bruijn(tree_to_term(huh));
+ printf("\n");
+ ref_invalidated_tree(iterator->data);
+ struct list *temp = iterator->next;
+ free(iterator);
+ iterator = temp;
+ }
+
+ printf("\n---\n");
+
+ iterator = final;
+ while (iterator) {
+ struct tree *huh = iterator->data;
+ struct term *foo = tree_to_term(huh);
+ print_bruijn(foo);
+ printf("\n%x\n\n", huh->hash);
+ iterator = iterator->next;
+ }
+
+ return final;
+}