aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorMarvin Borner2023-04-13 13:08:21 +0200
committerMarvin Borner2023-04-13 13:08:21 +0200
commit3fec61c52b2636397012b82219e1ae5d19fc9fc0 (patch)
tree5789dbd0d8058fe885a797ea59c5209e91ea7597 /src
parent2ca9fd69c9b60c54316dfb25ba45e64fee0de968 (diff)
Basically working
many memory leaks, will fix kthxbye
Diffstat (limited to 'src')
-rw-r--r--src/build.c1
-rw-r--r--src/free.c2
-rw-r--r--src/main.c50
-rw-r--r--src/parse.c54
-rw-r--r--src/tree.c363
5 files changed, 407 insertions, 63 deletions
diff --git a/src/build.c b/src/build.c
index 510afe0..d415c29 100644
--- a/src/build.c
+++ b/src/build.c
@@ -74,7 +74,6 @@ void write_bloc(struct term *term, const char *path)
write_bblc(N, file);
// TODO
- short table_index = 0;
char byte = 0;
int bit = 0;
write_bit(0, file, &byte, &bit);
diff --git a/src/free.c b/src/free.c
index b22089e..1d512fa 100644
--- a/src/free.c
+++ b/src/free.c
@@ -25,7 +25,7 @@ void free_term(struct term *term)
free(term);
break;
default:
- fprintf(stderr, "Invalid type %d\n", term->type);
+ fprintf(stderr, "invalid type %d\n", term->type);
}
}
diff --git a/src/main.c b/src/main.c
index 454c9d9..5f82a3b 100644
--- a/src/main.c
+++ b/src/main.c
@@ -8,6 +8,7 @@
#include <term.h>
#include <print.h>
+#include <tree.h>
#include <parse.h>
#include <build.h>
#include <free.h>
@@ -69,30 +70,31 @@ static char *read_file(const char *path)
int main(int argc, char **argv)
{
- /* if (argc < 2) { */
- /* fprintf(stderr, "Invalid arguments\n"); */
- /* return 1; */
- /* } */
-
- /* char *input; */
- /* if (argv[1][0] == '-') { */
- /* input = read_stdin(); */
- /* } else { */
- /* input = read_file(argv[1]); */
- /* } */
-
- /* if (!input) */
- /* return 1; */
-
- /* struct term *parsed = parse_blc(input); */
- /* print_bruijn(parsed); */
-
- write_bloc(0, "test.bloc");
-
- const char *input = read_file("test.bloc");
- struct bloc_parsed *bloc = parse_bloc(input);
- struct term *term = from_bloc(bloc);
- print_blc(term);
+ if (argc < 2) {
+ fprintf(stderr, "Invalid arguments\n");
+ return 1;
+ }
+
+ char *input;
+ if (argv[1][0] == '-') {
+ input = read_stdin();
+ } else {
+ input = read_file(argv[1]);
+ }
+
+ if (!input)
+ return 1;
+
+ struct term *parsed = parse_blc(input);
+ print_bruijn(parsed);
+
+ tree_merge_duplicates(parsed);
+
+ /* write_bloc(0, "test.bloc"); */
+ /* const char *input = read_file("test.bloc"); */
+ /* struct bloc_parsed *bloc = parse_bloc(input); */
+ /* struct term *term = from_bloc(bloc); */
+ /* print_blc(term); */
/* free_term(term); // TODO: Fix sharing user-after-free */
diff --git a/src/parse.c b/src/parse.c
index 8c69056..5233354 100644
--- a/src/parse.c
+++ b/src/parse.c
@@ -44,19 +44,23 @@ struct term *parse_blc(const char *term)
#define BIT_AT(i) (term[(i) / 8] & (1 << (7 - ((i) % 8))))
-// parses normal bit-encoded blc
-static struct term *parse_bblc(const char *term, size_t *bit)
+// parses bloc's bit-encoded blc
+// 00M -> abstraction of M
+// 010MN -> application of M and N
+// 1X0 -> bruijn index, amount of 1s in X
+// 011I -> 2B index to entry
+static struct term *parse_bloc_bblc(const char *term, size_t *bit)
{
struct term *res = 0;
if (!BIT_AT(*bit) && !BIT_AT(*bit + 1)) {
(*bit) += 2;
res = new_term(ABS);
- res->u.abs.term = parse_bblc(term, bit);
- } else if (!BIT_AT(*bit) && BIT_AT(*bit + 1)) {
- (*bit) += 2;
+ res->u.abs.term = parse_bloc_bblc(term, bit);
+ } else if (!BIT_AT(*bit) && BIT_AT(*bit + 1) && !BIT_AT(*bit)) {
+ (*bit) += 3;
res = new_term(APP);
- res->u.app.lhs = parse_bblc(term, bit);
- res->u.app.rhs = parse_bblc(term, bit);
+ res->u.app.lhs = parse_bloc_bblc(term, bit);
+ res->u.app.rhs = parse_bloc_bblc(term, bit);
} else if (BIT_AT(*bit)) {
const size_t cur = *bit;
while (BIT_AT(*bit))
@@ -64,28 +68,8 @@ static struct term *parse_bblc(const char *term, size_t *bit)
res = new_term(VAR);
res->u.var.index = *bit - cur - 1;
(*bit)++;
- } else {
- (*bit)++;
- res = parse_bblc(term, bit);
- }
- return res;
-}
-
-// parses bloc's bit-encoded blc (1I => 2B index)
-static struct term *parse_bloc_bblc(const char *term, size_t *bit)
-{
- struct term *res = 0;
- if (!BIT_AT(*bit) && !BIT_AT(*bit + 1)) {
- (*bit) += 2;
- res = new_term(ABS);
- res->u.abs.term = parse_bloc_bblc(term, bit);
- } else if (!BIT_AT(*bit) && BIT_AT(*bit + 1)) {
- (*bit) += 2;
- res = new_term(APP);
- res->u.app.lhs = parse_bloc_bblc(term, bit);
- res->u.app.rhs = parse_bloc_bblc(term, bit);
- } else if (BIT_AT(*bit)) {
- (*bit) += 1;
+ } else if (!BIT_AT(*bit) && BIT_AT(*bit + 1) && BIT_AT(*bit)) {
+ (*bit) += 3;
res = new_term(REF);
short index = 0;
for (int i = 0; i < 16; i++)
@@ -112,19 +96,16 @@ struct bloc_parsed *parse_bloc(const void *bloc)
parsed->length = header->length;
parsed->entries = malloc(header->length * sizeof(struct term *));
- const struct bloc_entry *current = (const void *)&header->data;
+ const struct bloc_entry *current = (const void *)&header->entries;
for (size_t i = 0; i < parsed->length; i++) {
size_t len = 0;
- parsed->entries[i] = parse_bblc((const char *)current, &len);
+ parsed->entries[i] =
+ parse_bloc_bblc((const char *)current, &len);
current =
(const struct bloc_entry *)(((const char *)current) +
(len / 8) + (len % 8 != 0));
}
- size_t len = 0;
- const char *term = (const char *)current;
- parsed->term = parse_bloc_bblc(term, &len);
-
return parsed;
}
@@ -139,8 +120,7 @@ static struct term *rec_bloc(struct term *term, struct bloc_parsed *bloc)
rec_bloc(term->u.app.rhs, bloc);
break;
case VAR:
- fprintf(stderr, "bloc can't have vars\n");
- return 0;
+ break;
case REF:
if (term->u.ref.index >= bloc->length) {
fprintf(stderr, "invalid entry reference\n");
diff --git a/src/tree.c b/src/tree.c
new file mode 100644
index 0000000..b129f9a
--- /dev/null
+++ b/src/tree.c
@@ -0,0 +1,363 @@
+// Copyright (c) 2023, Marvin Borner <dev@marvinborner.de>
+// SPDX-License-Identifier: MIT
+
+// We need to find the longest repeating subexpressions.
+// We do this by creating a kind-of merkle tree out of the expressions
+// and finding the largest repeating subtrees.
+
+#include <stdio.h>
+#include <search.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include <tree.h>
+
+static inline uint32_t murmur_32_scramble(uint32_t k)
+{
+ k *= 0xcc9e2d51;
+ k = (k << 15) | (k >> 17);
+ k *= 0x1b873593;
+ return k;
+}
+
+// TODO: I'm really unsure whether murmur3 is appropriate for this.
+static uint32_t murmur3_32(const uint8_t *key, size_t len, uint32_t seed)
+{
+ uint32_t h = seed;
+ uint32_t k;
+ for (size_t i = len >> 2; i; i--) {
+ memcpy(&k, key, sizeof(uint32_t));
+ key += sizeof(uint32_t);
+ h ^= murmur_32_scramble(k);
+ h = (h << 13) | (h >> 19);
+ h = h * 5 + 0xe6546b64;
+ }
+ k = 0;
+ for (size_t i = len & 3; i; i--) {
+ k <<= 8;
+ k |= key[i - 1];
+ }
+ h ^= murmur_32_scramble(k);
+ h ^= len;
+ h ^= h >> 16;
+ h *= 0x85ebca6b;
+ h ^= h >> 13;
+ h *= 0xc2b2ae35;
+ h ^= h >> 16;
+ return h;
+}
+
+static struct list *list_end = 0;
+static struct list *list_add(struct list *list, void *data)
+{
+ struct list *new = malloc(sizeof(*new));
+ new->next = list;
+ new->data = data;
+ new->val = list ? list->val + 1 : 1; // amount of trees in list
+ return new;
+}
+
+static struct list *list_add_prioritized(struct list *list, struct list *sub)
+{
+ struct list *new = malloc(sizeof(*new));
+ /* new->val = ((struct tree *)sub->data)->size * sub->val; */
+ new->val = ((struct tree *)sub->data)->size;
+ new->data = sub;
+
+ if (!list) {
+ new->next = list;
+ return new;
+ }
+
+ if (new->val >= list->val) { // insert at head
+ new->next = list;
+ return new;
+ }
+
+ struct list *iterator = list;
+ while (iterator->next && new->val < iterator->next->val) {
+ iterator = iterator->next;
+ }
+
+ new->next = iterator->next;
+ iterator->next = new;
+
+ return list;
+}
+
+// element of the tsearch tree
+struct set_element {
+ uint32_t hash;
+ struct list *list;
+};
+
+// comparison_fn_t for tsearch
+static int hash_compare(const void *_a, const void *_b)
+{
+ const struct set_element *a = _a;
+ const struct set_element *b = _b;
+
+ if (a->hash < b->hash)
+ return -1;
+ if (a->hash > b->hash)
+ return 1;
+ return 0;
+}
+
+// applies the hash function to the tree's elements (similar to merkle trees)
+// TODO: as above: rethink hash choice
+static struct tree *build_tree(struct term *term, void **set)
+{
+ struct tree *tree = malloc(sizeof(*tree));
+ tree->type = term->type;
+ tree->state = VALIDATED_TREE;
+
+ switch (term->type) {
+ case ABS:
+ tree->u.abs.term = build_tree(term->u.abs.term, set);
+ tree->hash =
+ murmur3_32((const uint8_t *)&tree->type,
+ sizeof(tree->type), tree->u.abs.term->hash);
+ tree->size = tree->u.abs.term->size + 2;
+ break;
+ case APP:
+ tree->u.app.lhs = build_tree(term->u.app.lhs, set);
+ tree->u.app.rhs = build_tree(term->u.app.rhs, set);
+ tree->hash =
+ murmur3_32((const uint8_t *)&tree->type,
+ sizeof(tree->type), tree->u.app.lhs->hash);
+ tree->hash =
+ murmur3_32((const uint8_t *)&tree->hash,
+ sizeof(tree->hash), tree->u.app.rhs->hash);
+ tree->size = tree->u.app.lhs->size + tree->u.app.rhs->size + 3;
+ break;
+ case VAR:
+ tree->u.var.index = term->u.var.index;
+ tree->hash = murmur3_32((const uint8_t *)&tree->type,
+ sizeof(tree->type), tree->u.var.index);
+ tree->size = term->u.var.index;
+ break;
+ default:
+ fprintf(stderr, "invalid type %d\n", term->type);
+ return 0;
+ }
+
+ if (tree->size < 10) // not suitable for deduplication
+ return tree;
+
+ struct set_element *element = malloc(sizeof(*element));
+ element->hash = tree->hash;
+
+ struct set_element **handle = tsearch(element, set, hash_compare);
+ if (*handle == element) { // first of its kind
+ element->list = list_add(list_end, tree);
+ return tree;
+ }
+
+ free(element); // already exists, not needed
+ (*handle)->list = list_add((*handle)->list, tree);
+
+ fprintf(stderr, "found suitable duplicate! %x %d\n", tree->hash,
+ tree->size);
+ return tree;
+}
+
+static struct term *tree_to_term(struct tree *tree)
+{
+ struct term *term = new_term(tree->type);
+
+ switch (tree->type) {
+ case ABS:
+ term->u.abs.term = tree_to_term(tree->u.abs.term);
+ break;
+ case APP:
+ term->u.app.lhs = tree_to_term(tree->u.app.lhs);
+ term->u.app.rhs = tree_to_term(tree->u.app.rhs);
+ break;
+ case VAR:
+ term->u.var.index = tree->u.var.index;
+ break;
+ case REF:
+ term->u.ref.index = tree->u.ref.index;
+ break;
+ default:
+ fprintf(stderr, "invalid type %d\n", term->type);
+ }
+ return term;
+}
+
+static struct tree *clone_tree_root(struct tree *tree)
+{
+ struct tree *new = malloc(sizeof(*new));
+ new->type = tree->type;
+ new->hash = tree->hash;
+
+ switch (tree->type) {
+ case ABS:
+ new->u.abs.term = tree->u.abs.term;
+ break;
+ case APP:
+ new->u.app.lhs = tree->u.app.lhs;
+ new->u.app.rhs = tree->u.app.rhs;
+ break;
+ case VAR:
+ new->u.var.index = tree->u.var.index;
+ break;
+ default:
+ fprintf(stderr, "invalid type %d\n", tree->type);
+ free(new);
+ return 0;
+ }
+
+ return new;
+}
+
+static void invalidate_tree(struct tree *tree)
+{
+ tree->state = INVALIDATED_TREE;
+ switch (tree->type) {
+ case ABS:
+ invalidate_tree(tree->u.abs.term);
+ break;
+ case APP:
+ invalidate_tree(tree->u.app.lhs);
+ invalidate_tree(tree->u.app.rhs);
+ break;
+ case VAR:
+ break;
+ default:
+ fprintf(stderr, "invalid type %d\n", tree->type);
+ }
+}
+
+static void free_tree(struct tree *tree)
+{
+ switch (tree->type) {
+ case ABS:
+ free_tree(tree->u.abs.term);
+ break;
+ case APP:
+ free_tree(tree->u.app.lhs);
+ free_tree(tree->u.app.rhs);
+ break;
+ case VAR:
+ break;
+ case REF:
+ break;
+ default:
+ fprintf(stderr, "invalid type %d\n", tree->type);
+ return;
+ }
+
+ if (FREEABLE_TREE(tree))
+ free(tree);
+}
+
+static void ref_invalidated_tree(struct tree *tree)
+{
+ switch (tree->type) {
+ case ABS:
+ free_tree(tree->u.abs.term);
+ break;
+ case APP:
+ free_tree(tree->u.app.lhs);
+ free_tree(tree->u.app.rhs);
+ break;
+ case VAR:
+ break;
+ default:
+ fprintf(stderr, "invalid type %d\n", tree->type);
+ }
+ if (tree->state != INVALIDATED_TREE &&
+ tree->state != VALIDATED_TREE) { // is reffed
+ tree->type = REF;
+ tree->u.ref.index = tree->state - 1;
+ tree->state = VALIDATED_TREE;
+ }
+}
+
+struct list *tree_merge_duplicates(struct term *term)
+{
+ void *set = 0;
+ build_tree(term, &set);
+ if (!set)
+ return 0;
+
+ // construct priority list while deleting set
+ struct list *prioritized = list_end;
+ while (set) {
+ struct set_element *element = *(struct set_element **)set;
+ prioritized = list_add_prioritized(prioritized, element->list);
+ tdelete(element, &set, hash_compare);
+ free(element);
+ }
+
+ struct list *final = list_end;
+ struct list *invalidated = list_end;
+
+ struct list *iterator = prioritized;
+
+ // add longest (=> blueprint/structure of expression)
+ final = list_add(final, ((struct list *)iterator->data)->data);
+ iterator = iterator->next;
+
+ while (iterator) {
+ struct list *list = iterator->data;
+ struct tree *head = list->data;
+
+ print_bruijn(tree_to_term(head));
+ printf("\n%x: %d\n\n", head->hash, list->val);
+
+ if (head->state != VALIDATED_TREE) {
+ printf("skipping invalidated: %x\n", head->hash);
+ iterator = iterator->next;
+ continue;
+ }
+
+ if (list->val > 1) { // needs merging
+ // clone root so it doesn't get replaced by a ref to itself
+ struct tree *cloned_head = clone_tree_root(head);
+ cloned_head->state = INVALIDATED_TREE;
+ final = list_add(final, cloned_head);
+
+ // invalidate all subtrees
+ while (list) {
+ invalidate_tree(list->data);
+ ((struct tree *)list->data)->state =
+ final->val - 1;
+
+ invalidated = list_add(invalidated, list->data);
+ list = list->next;
+ }
+ }
+
+ iterator = iterator->next;
+ }
+
+ printf("\n---\n");
+
+ iterator = invalidated;
+ while (iterator) {
+ struct tree *huh = iterator->data;
+ printf("%x %d\n", huh->hash, huh->state);
+ print_bruijn(tree_to_term(huh));
+ printf("\n");
+ ref_invalidated_tree(iterator->data);
+ struct list *temp = iterator->next;
+ free(iterator);
+ iterator = temp;
+ }
+
+ printf("\n---\n");
+
+ iterator = final;
+ while (iterator) {
+ struct tree *huh = iterator->data;
+ struct term *foo = tree_to_term(huh);
+ print_bruijn(foo);
+ printf("\n%x\n\n", huh->hash);
+ iterator = iterator->next;
+ }
+
+ return final;
+}