diff options
-rw-r--r-- | inc/free.h | 13 | ||||
-rw-r--r-- | inc/parse.h | 1 | ||||
-rw-r--r-- | inc/term.h | 1 | ||||
-rw-r--r-- | makefile | 4 | ||||
-rw-r--r-- | readme.md | 68 | ||||
-rw-r--r-- | src/free.c | 41 | ||||
-rw-r--r-- | src/log.c | 4 | ||||
-rw-r--r-- | src/main.c | 5 | ||||
-rw-r--r-- | src/parse.c | 14 | ||||
-rw-r--r-- | src/term.c | 25 | ||||
-rw-r--r-- | src/tree.c | 61 |
11 files changed, 128 insertions, 109 deletions
diff --git a/inc/free.h b/inc/free.h deleted file mode 100644 index 34c82c7..0000000 --- a/inc/free.h +++ /dev/null @@ -1,13 +0,0 @@ -// Copyright (c) 2023, Marvin Borner <dev@marvinborner.de> -// SPDX-License-Identifier: MIT - -#ifndef BLOC_FREE_H -#define BLOC_FREE_H - -#include <parse.h> -#include <term.h> - -void free_term(struct term *term); -void free_bloc(struct bloc_parsed *bloc); - -#endif diff --git a/inc/parse.h b/inc/parse.h index 2071f65..205f204 100644 --- a/inc/parse.h +++ b/inc/parse.h @@ -16,6 +16,7 @@ struct bloc_parsed { struct term *parse_blc(const char *term); struct bloc_parsed *parse_bloc(const void *bloc); +void free_bloc(struct bloc_parsed *bloc); struct term *from_bloc(struct bloc_parsed *bloc); #endif @@ -28,5 +28,6 @@ struct term { }; struct term *new_term(term_type type); +void free_term(struct term *term); #endif @@ -22,7 +22,9 @@ ifeq ($(PREFIX),) PREFIX := /usr/local endif -all: genopts compile sync +all: genopts compile + +full: all sync genopts: @gengetopt -i ${CURDIR}/options.ggo -G --output-dir=$(SRC) @@ -10,13 +10,16 @@ BLoC and heavily reduces its size. Before explaining the format or its optimization techniques, let me first show you some results: -1. The bruijn expression `fac (+30)`, where `fac` is the factorial - implementation from `std/Math`: +1. The [bruijn](https://github.com/marvinborner/bruijn) expression + `fac (+30)`, where `fac` is the factorial implementation from + `std/Math`: - the original expression takes 2000 bytes in bit-encoded BLC - the same expression in BLoC needs only 423 bytes 2. [My solution](https://github.com/marvinborner/bruijn/blob/main/samples/aoc/2022/01/solve.bruijn) - for the “Advent of Code” challenge 2022/01 in bruijn: + for the “Advent of Code” challenge + [2022/01](https://adventofcode.com/2022/day/1) in + [bruijn](https://github.com/marvinborner/bruijn): - the original expression takes 6258 bytes in bit-encoded BLC - the same expression in BLoC needs only 1169 bytes @@ -43,15 +46,40 @@ following derivation of normal bit-encoded BLC: | 00M | abstraction of expression `M` | | 010MN | application of `M` and `N` | | 1<sup>i+1</sup>0 | bruijn index `i` | -| 011I | 2 byte index to an entry | +| 011I | index\* to an entry | + +(\*): The encoding of indices is quite special: $I=XA$, where +$X\in\\{00,01,10,11\\}$ and length of binary index $A$ is +$L(A)\in\\{1,2,4,8\\}$ byte respectively. The final program will be in the last entry. The indices start counting from the number of entries down to 0. +## Example + +Let `E` be some kind of expression like `E=\x.(((M (\y.N)) M) N)`, where +`M` and `N` are both arbitrary expressions of length 16. + +The raw BLC expression of `E` would then be `E=00010101M00NMN`. This +obviously has the drawback of redundant repetition of `M` and `N`. + +A possible encoding in BLoC: + +| from | to | content | +|:-----|:-----|:----------------------------------------------------------------------------------------------------------------| +| 0x00 | 0x04 | “BLoC” | +| 0x04 | 0x06 | number of entries: 3 | +| 0x06 | 0x17 | encoded `M`: gets a bit longer due to different encoding | +| 0x17 | 0x28 | encoded `N`: gets a bit longer due to different encoding | +| 0x28 | 0x34 | `00010010010011<M>00011<N>011<M>011<N>`, where `<M>=00{1}` and `<N>=00{1}` are indices with length of 1.25 byte | + +Even in this small example BLoC uses less space than BLC (0x34 vs. 0x42 +bytes). Depending on the content of `M` and `N`, this could have +potentially been compressed even more. + ## Optimizer -The optimizer converts a normal BLC expression to the BLoC format. Its -strategies are similar to compression using finite state entropy. +The optimizer converts a normal BLC expression to the BLoC format. In order to find the largest repeating sub-expressions, the expression first gets converted to a hashed tree (similar to a Merkle tree). @@ -65,24 +93,14 @@ in any other way. As an idea for the future, long expressions could get reduced using different techniques/depths and then get replaced with the shortest one (as fully reduced expressions aren’t necessarily shorter). -## Example +## Improvements -Let `E` be some kind of expression like `E=\x.(((M (\y.N)) M) N)`, where -`M` and `N` are both arbitrary expressions of length 16. +The current optimizer does not always make the best deduplication +choices. It seems like finding the optimal deduplications requires quite +complex algorithms which would probably be rather inefficient. -The raw BLC expression of `E` would then be `E=00010101M00NMN`. This -obviously has the drawback of redundant repetition of `M` and `N`. - -A possible encoding in BLoC: - -| from | to | content | -|:-----|:-----|:--------------------------------------------------------------------------------------| -| 0x00 | 0x04 | “BLoC” | -| 0x04 | 0x06 | number of entries: 3 | -| 0x06 | 0x17 | encoded `M`: gets a bit longer due to different encoding | -| 0x17 | 0x28 | encoded `N`: gets a bit longer due to different encoding | -| 0x28 | 0x35 | `00010010010011<M>00011<N>011<M>011<N>`, where `<M>=1` and `<N>=0` are 2 byte indices | - -Even in this small example BLoC uses less space than BLC (0x35 vs. 0x42 -bytes). Depending on the content of `M` and `N`, this could have -potentially been compressed even more. +For example, as of right now the length of an expression as seen by the +deduplicator doesn’t consider the change of length when sub-expressions +get replaced by a reference (of unknown bit length!) to another +expression. This results in entries like `(0 <1234>)` that would not +have needed to be deduplicated. diff --git a/src/free.c b/src/free.c deleted file mode 100644 index ecf1d9f..0000000 --- a/src/free.c +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2023, Marvin Borner <dev@marvinborner.de> -// SPDX-License-Identifier: MIT - -#include <stdio.h> -#include <stdlib.h> - -#include <free.h> -#include <log.h> - -void free_term(struct term *term) -{ - switch (term->type) { - case ABS: - free_term(term->u.abs.term); - free(term); - break; - case APP: - free_term(term->u.app.lhs); - free_term(term->u.app.rhs); - free(term); - break; - case VAR: - free(term); - break; - case REF: - free(term); - break; - default: - fatal("invalid type %d\n", term->type); - } -} - -void free_bloc(struct bloc_parsed *bloc) -{ - for (size_t i = 0; i < bloc->length; i++) { - free_term(bloc->entries[i]); - } - - free(bloc->entries); - free(bloc); -} @@ -14,6 +14,8 @@ void debug(const char *format, ...) if (!debug_enabled) return; + fprintf(stderr, "[DEBUG] "); + va_list ap; va_start(ap, format); vfprintf(stderr, format, ap); @@ -27,6 +29,8 @@ void debug_enable(int enable) void fatal(const char *format, ...) { + fprintf(stderr, "[FATAL] "); + va_list ap; va_start(ap, format); vfprintf(stderr, format, ap); @@ -12,7 +12,6 @@ #include <tree.h> #include <parse.h> #include <build.h> -#include <free.h> // automatically generated using gengetopt #include "cmdline.h" @@ -25,7 +24,7 @@ static char *read_stdin(void) size_t size = 1; char *string = malloc(sizeof(char) * BUF_SIZE); if (!string) - return 0; + fatal("out of memory!\n"); string[0] = '\0'; while (fgets(buffer, BUF_SIZE, stdin)) { char *old = string; @@ -57,6 +56,8 @@ static char *read_file(const char *path) fseek(f, 0, SEEK_SET); char *string = malloc(fsize + 1); + if (!string) + fatal("out of memory!\n"); int ret = fread(string, fsize, 1, f); fclose(f); diff --git a/src/parse.c b/src/parse.c index b6645fd..9a8af45 100644 --- a/src/parse.c +++ b/src/parse.c @@ -111,6 +111,16 @@ struct bloc_parsed *parse_bloc(const void *bloc) return parsed; } +void free_bloc(struct bloc_parsed *bloc) +{ + for (size_t i = 0; i < bloc->length; i++) { + free_term(bloc->entries[i]); + } + + free(bloc->entries); + free(bloc); +} + static struct term *rec_bloc(struct term *term, struct bloc_parsed *bloc) { switch (term->type) { @@ -124,10 +134,8 @@ static struct term *rec_bloc(struct term *term, struct bloc_parsed *bloc) case VAR: break; case REF: - if (term->u.ref.index >= bloc->length) { + if (term->u.ref.index >= bloc->length) fatal("invalid entry reference\n"); - return 0; - } memcpy(term, bloc->entries[bloc->length - term->u.ref.index - 1], sizeof(*term)); @@ -12,7 +12,30 @@ struct term *new_term(term_type type) { struct term *term = malloc(sizeof(*term)); if (!term) - fatal("Out of memory!\n"); + fatal("out of memory!\n"); term->type = type; return term; } + +void free_term(struct term *term) +{ + switch (term->type) { + case ABS: + free_term(term->u.abs.term); + free(term); + break; + case APP: + free_term(term->u.app.lhs); + free_term(term->u.app.rhs); + free(term); + break; + case VAR: + free(term); + break; + case REF: + free(term); + break; + default: + fatal("invalid type %d\n", term->type); + } +} @@ -53,6 +53,8 @@ static struct list *list_end = 0; static struct list *list_add(struct list *list, void *data) { struct list *new = malloc(sizeof(*new)); + if (!new) + fatal("out of memory!\n"); new->next = list; new->data = data; new->val = list ? list->val + 1 : 1; // amount of trees in list @@ -79,10 +81,13 @@ static int hash_compare(const void *_a, const void *_b) } // applies the hash function to the tree's elements (similar to merkle trees) +// also creates a set of lists with deduplication candidates // TODO: as above: rethink hash choice static struct tree *build_tree(struct term *term, void **set) { struct tree *tree = malloc(sizeof(*tree)); + if (!tree) + fatal("out of memory!\n"); tree->type = term->type; tree->state = VALIDATED_TREE; tree->duplication_count = 1; @@ -120,6 +125,8 @@ static struct tree *build_tree(struct term *term, void **set) return tree; struct set_element *element = malloc(sizeof(*element)); + if (!element) + fatal("out of memory!\n"); element->hash = tree->hash; struct set_element **handle = tsearch(element, set, hash_compare); @@ -137,6 +144,8 @@ static struct tree *build_tree(struct term *term, void **set) static struct tree *clone_tree_root(struct tree *tree) { struct tree *new = malloc(sizeof(*new)); + if (!new) + fatal("out of memory!\n"); new->type = tree->type; new->hash = tree->hash; new->duplication_count = tree->duplication_count; @@ -226,6 +235,8 @@ static void ref_invalidated_tree(struct tree *tree) } } +// priority of candidate -> length of expression +// TODO: What about occurrence count (list length)? static pqueue_pri_t get_pri(void *a) { return ((struct tree *)((struct list *)a)->data)->size; @@ -240,14 +251,18 @@ struct list *tree_merge_duplicates(struct term *term) { debug("building the merkle tree and deduplication set\n"); + // get the deduplication candidates void *set = 0; build_tree(term, &set); if (!set) - return 0; + fatal("term too short\n"); // construct priority queue while deleting set + // ~> sorts the candidates by get_pri debug("constructing priority queue\n"); struct pqueue *prioritized = pqueue_init(2 << 15, cmp_pri, get_pri); + if (!prioritized) + fatal("can't create pqueue\n"); while (set) { struct set_element *element = *(struct set_element **)set; pqueue_insert(prioritized, element->list); @@ -265,33 +280,33 @@ struct list *tree_merge_duplicates(struct term *term) debug("iterating priority queue, invalidating duplicates\n"); struct list *iterator; while ((iterator = pqueue_pop(prioritized))) { + // only consider merging if they occur >1 times + if (iterator->val <= 1) + continue; + struct tree *head = iterator->data; + // skip if invalidated and not duplicated enough if (head->state != VALIDATED_TREE && - head->duplication_count >= - iterator->val) { // skip invalidated + head->duplication_count >= iterator->val) continue; - } - if (iterator->val > 1) { // needs merging - // clone root so it doesn't get replaced by a ref to itself - struct tree *cloned_head = clone_tree_root(head); - cloned_head->state = INVALIDATED_TREE; - final = list_add(final, cloned_head); - - // invalidate all subtrees - // invalidated trees will be replaced with a reference - struct list *list = iterator; - while (list) { - invalidate_tree(list->data, list->val); - - // keep a ref for later replacement - ((struct tree *)list->data)->state = - final->val - 1; - - invalidated = list_add(invalidated, list->data); - list = list->next; - } + // clone root so it doesn't get replaced by a ref to itself + struct tree *cloned_head = clone_tree_root(head); + cloned_head->state = INVALIDATED_TREE; + final = list_add(final, cloned_head); + + // invalidate all subtrees + // invalidated trees will be replaced with a reference + struct list *list = iterator; + while (list) { + invalidate_tree(list->data, list->val); + + // keep a ref for later replacement + ((struct tree *)list->data)->state = final->val - 1; + + invalidated = list_add(invalidated, list->data); + list = list->next; } } |