aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--inc/free.h13
-rw-r--r--inc/parse.h1
-rw-r--r--inc/term.h1
-rw-r--r--makefile4
-rw-r--r--readme.md68
-rw-r--r--src/free.c41
-rw-r--r--src/log.c4
-rw-r--r--src/main.c5
-rw-r--r--src/parse.c14
-rw-r--r--src/term.c25
-rw-r--r--src/tree.c61
11 files changed, 128 insertions, 109 deletions
diff --git a/inc/free.h b/inc/free.h
deleted file mode 100644
index 34c82c7..0000000
--- a/inc/free.h
+++ /dev/null
@@ -1,13 +0,0 @@
-// Copyright (c) 2023, Marvin Borner <dev@marvinborner.de>
-// SPDX-License-Identifier: MIT
-
-#ifndef BLOC_FREE_H
-#define BLOC_FREE_H
-
-#include <parse.h>
-#include <term.h>
-
-void free_term(struct term *term);
-void free_bloc(struct bloc_parsed *bloc);
-
-#endif
diff --git a/inc/parse.h b/inc/parse.h
index 2071f65..205f204 100644
--- a/inc/parse.h
+++ b/inc/parse.h
@@ -16,6 +16,7 @@ struct bloc_parsed {
struct term *parse_blc(const char *term);
struct bloc_parsed *parse_bloc(const void *bloc);
+void free_bloc(struct bloc_parsed *bloc);
struct term *from_bloc(struct bloc_parsed *bloc);
#endif
diff --git a/inc/term.h b/inc/term.h
index a90bdd3..1fe5a91 100644
--- a/inc/term.h
+++ b/inc/term.h
@@ -28,5 +28,6 @@ struct term {
};
struct term *new_term(term_type type);
+void free_term(struct term *term);
#endif
diff --git a/makefile b/makefile
index c7ade35..5575ae5 100644
--- a/makefile
+++ b/makefile
@@ -22,7 +22,9 @@ ifeq ($(PREFIX),)
PREFIX := /usr/local
endif
-all: genopts compile sync
+all: genopts compile
+
+full: all sync
genopts:
@gengetopt -i ${CURDIR}/options.ggo -G --output-dir=$(SRC)
diff --git a/readme.md b/readme.md
index 0818a16..d42b2ab 100644
--- a/readme.md
+++ b/readme.md
@@ -10,13 +10,16 @@ BLoC and heavily reduces its size.
Before explaining the format or its optimization techniques, let me
first show you some results:
-1. The bruijn expression `fac (+30)`, where `fac` is the factorial
- implementation from `std/Math`:
+1. The [bruijn](https://github.com/marvinborner/bruijn) expression
+ `fac (+30)`, where `fac` is the factorial implementation from
+ `std/Math`:
- the original expression takes 2000 bytes in bit-encoded BLC
- the same expression in BLoC needs only 423 bytes
2. [My
solution](https://github.com/marvinborner/bruijn/blob/main/samples/aoc/2022/01/solve.bruijn)
- for the “Advent of Code” challenge 2022/01 in bruijn:
+ for the “Advent of Code” challenge
+ [2022/01](https://adventofcode.com/2022/day/1) in
+ [bruijn](https://github.com/marvinborner/bruijn):
- the original expression takes 6258 bytes in bit-encoded BLC
- the same expression in BLoC needs only 1169 bytes
@@ -43,15 +46,40 @@ following derivation of normal bit-encoded BLC:
| 00M | abstraction of expression `M` |
| 010MN | application of `M` and `N` |
| 1<sup>i+1</sup>0 | bruijn index `i` |
-| 011I | 2 byte index to an entry |
+| 011I | index\* to an entry |
+
+(\*): The encoding of indices is quite special: $I=XA$, where
+$X\in\\{00,01,10,11\\}$ and length of binary index $A$ is
+$L(A)\in\\{1,2,4,8\\}$ byte respectively.
The final program will be in the last entry. The indices start counting
from the number of entries down to 0.
+## Example
+
+Let `E` be some kind of expression like `E=\x.(((M (\y.N)) M) N)`, where
+`M` and `N` are both arbitrary expressions of length 16.
+
+The raw BLC expression of `E` would then be `E=00010101M00NMN`. This
+obviously has the drawback of redundant repetition of `M` and `N`.
+
+A possible encoding in BLoC:
+
+| from | to | content |
+|:-----|:-----|:----------------------------------------------------------------------------------------------------------------|
+| 0x00 | 0x04 | “BLoC” |
+| 0x04 | 0x06 | number of entries: 3 |
+| 0x06 | 0x17 | encoded `M`: gets a bit longer due to different encoding |
+| 0x17 | 0x28 | encoded `N`: gets a bit longer due to different encoding |
+| 0x28 | 0x34 | `00010010010011<M>00011<N>011<M>011<N>`, where `<M>=00{1}` and `<N>=00{1}` are indices with length of 1.25 byte |
+
+Even in this small example BLoC uses less space than BLC (0x34 vs. 0x42
+bytes). Depending on the content of `M` and `N`, this could have
+potentially been compressed even more.
+
## Optimizer
-The optimizer converts a normal BLC expression to the BLoC format. Its
-strategies are similar to compression using finite state entropy.
+The optimizer converts a normal BLC expression to the BLoC format.
In order to find the largest repeating sub-expressions, the expression
first gets converted to a hashed tree (similar to a Merkle tree).
@@ -65,24 +93,14 @@ in any other way. As an idea for the future, long expressions could get
reduced using different techniques/depths and then get replaced with the
shortest one (as fully reduced expressions aren’t necessarily shorter).
-## Example
+## Improvements
-Let `E` be some kind of expression like `E=\x.(((M (\y.N)) M) N)`, where
-`M` and `N` are both arbitrary expressions of length 16.
+The current optimizer does not always make the best deduplication
+choices. It seems like finding the optimal deduplications requires quite
+complex algorithms which would probably be rather inefficient.
-The raw BLC expression of `E` would then be `E=00010101M00NMN`. This
-obviously has the drawback of redundant repetition of `M` and `N`.
-
-A possible encoding in BLoC:
-
-| from | to | content |
-|:-----|:-----|:--------------------------------------------------------------------------------------|
-| 0x00 | 0x04 | “BLoC” |
-| 0x04 | 0x06 | number of entries: 3 |
-| 0x06 | 0x17 | encoded `M`: gets a bit longer due to different encoding |
-| 0x17 | 0x28 | encoded `N`: gets a bit longer due to different encoding |
-| 0x28 | 0x35 | `00010010010011<M>00011<N>011<M>011<N>`, where `<M>=1` and `<N>=0` are 2 byte indices |
-
-Even in this small example BLoC uses less space than BLC (0x35 vs. 0x42
-bytes). Depending on the content of `M` and `N`, this could have
-potentially been compressed even more.
+For example, as of right now the length of an expression as seen by the
+deduplicator doesn’t consider the change of length when sub-expressions
+get replaced by a reference (of unknown bit length!) to another
+expression. This results in entries like `(0 <1234>)` that would not
+have needed to be deduplicated.
diff --git a/src/free.c b/src/free.c
deleted file mode 100644
index ecf1d9f..0000000
--- a/src/free.c
+++ /dev/null
@@ -1,41 +0,0 @@
-// Copyright (c) 2023, Marvin Borner <dev@marvinborner.de>
-// SPDX-License-Identifier: MIT
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#include <free.h>
-#include <log.h>
-
-void free_term(struct term *term)
-{
- switch (term->type) {
- case ABS:
- free_term(term->u.abs.term);
- free(term);
- break;
- case APP:
- free_term(term->u.app.lhs);
- free_term(term->u.app.rhs);
- free(term);
- break;
- case VAR:
- free(term);
- break;
- case REF:
- free(term);
- break;
- default:
- fatal("invalid type %d\n", term->type);
- }
-}
-
-void free_bloc(struct bloc_parsed *bloc)
-{
- for (size_t i = 0; i < bloc->length; i++) {
- free_term(bloc->entries[i]);
- }
-
- free(bloc->entries);
- free(bloc);
-}
diff --git a/src/log.c b/src/log.c
index 06c3614..a34c8e8 100644
--- a/src/log.c
+++ b/src/log.c
@@ -14,6 +14,8 @@ void debug(const char *format, ...)
if (!debug_enabled)
return;
+ fprintf(stderr, "[DEBUG] ");
+
va_list ap;
va_start(ap, format);
vfprintf(stderr, format, ap);
@@ -27,6 +29,8 @@ void debug_enable(int enable)
void fatal(const char *format, ...)
{
+ fprintf(stderr, "[FATAL] ");
+
va_list ap;
va_start(ap, format);
vfprintf(stderr, format, ap);
diff --git a/src/main.c b/src/main.c
index b2d86eb..a5faa80 100644
--- a/src/main.c
+++ b/src/main.c
@@ -12,7 +12,6 @@
#include <tree.h>
#include <parse.h>
#include <build.h>
-#include <free.h>
// automatically generated using gengetopt
#include "cmdline.h"
@@ -25,7 +24,7 @@ static char *read_stdin(void)
size_t size = 1;
char *string = malloc(sizeof(char) * BUF_SIZE);
if (!string)
- return 0;
+ fatal("out of memory!\n");
string[0] = '\0';
while (fgets(buffer, BUF_SIZE, stdin)) {
char *old = string;
@@ -57,6 +56,8 @@ static char *read_file(const char *path)
fseek(f, 0, SEEK_SET);
char *string = malloc(fsize + 1);
+ if (!string)
+ fatal("out of memory!\n");
int ret = fread(string, fsize, 1, f);
fclose(f);
diff --git a/src/parse.c b/src/parse.c
index b6645fd..9a8af45 100644
--- a/src/parse.c
+++ b/src/parse.c
@@ -111,6 +111,16 @@ struct bloc_parsed *parse_bloc(const void *bloc)
return parsed;
}
+void free_bloc(struct bloc_parsed *bloc)
+{
+ for (size_t i = 0; i < bloc->length; i++) {
+ free_term(bloc->entries[i]);
+ }
+
+ free(bloc->entries);
+ free(bloc);
+}
+
static struct term *rec_bloc(struct term *term, struct bloc_parsed *bloc)
{
switch (term->type) {
@@ -124,10 +134,8 @@ static struct term *rec_bloc(struct term *term, struct bloc_parsed *bloc)
case VAR:
break;
case REF:
- if (term->u.ref.index >= bloc->length) {
+ if (term->u.ref.index >= bloc->length)
fatal("invalid entry reference\n");
- return 0;
- }
memcpy(term,
bloc->entries[bloc->length - term->u.ref.index - 1],
sizeof(*term));
diff --git a/src/term.c b/src/term.c
index 50333c7..63bbb05 100644
--- a/src/term.c
+++ b/src/term.c
@@ -12,7 +12,30 @@ struct term *new_term(term_type type)
{
struct term *term = malloc(sizeof(*term));
if (!term)
- fatal("Out of memory!\n");
+ fatal("out of memory!\n");
term->type = type;
return term;
}
+
+void free_term(struct term *term)
+{
+ switch (term->type) {
+ case ABS:
+ free_term(term->u.abs.term);
+ free(term);
+ break;
+ case APP:
+ free_term(term->u.app.lhs);
+ free_term(term->u.app.rhs);
+ free(term);
+ break;
+ case VAR:
+ free(term);
+ break;
+ case REF:
+ free(term);
+ break;
+ default:
+ fatal("invalid type %d\n", term->type);
+ }
+}
diff --git a/src/tree.c b/src/tree.c
index 87219ae..755b5e2 100644
--- a/src/tree.c
+++ b/src/tree.c
@@ -53,6 +53,8 @@ static struct list *list_end = 0;
static struct list *list_add(struct list *list, void *data)
{
struct list *new = malloc(sizeof(*new));
+ if (!new)
+ fatal("out of memory!\n");
new->next = list;
new->data = data;
new->val = list ? list->val + 1 : 1; // amount of trees in list
@@ -79,10 +81,13 @@ static int hash_compare(const void *_a, const void *_b)
}
// applies the hash function to the tree's elements (similar to merkle trees)
+// also creates a set of lists with deduplication candidates
// TODO: as above: rethink hash choice
static struct tree *build_tree(struct term *term, void **set)
{
struct tree *tree = malloc(sizeof(*tree));
+ if (!tree)
+ fatal("out of memory!\n");
tree->type = term->type;
tree->state = VALIDATED_TREE;
tree->duplication_count = 1;
@@ -120,6 +125,8 @@ static struct tree *build_tree(struct term *term, void **set)
return tree;
struct set_element *element = malloc(sizeof(*element));
+ if (!element)
+ fatal("out of memory!\n");
element->hash = tree->hash;
struct set_element **handle = tsearch(element, set, hash_compare);
@@ -137,6 +144,8 @@ static struct tree *build_tree(struct term *term, void **set)
static struct tree *clone_tree_root(struct tree *tree)
{
struct tree *new = malloc(sizeof(*new));
+ if (!new)
+ fatal("out of memory!\n");
new->type = tree->type;
new->hash = tree->hash;
new->duplication_count = tree->duplication_count;
@@ -226,6 +235,8 @@ static void ref_invalidated_tree(struct tree *tree)
}
}
+// priority of candidate -> length of expression
+// TODO: What about occurrence count (list length)?
static pqueue_pri_t get_pri(void *a)
{
return ((struct tree *)((struct list *)a)->data)->size;
@@ -240,14 +251,18 @@ struct list *tree_merge_duplicates(struct term *term)
{
debug("building the merkle tree and deduplication set\n");
+ // get the deduplication candidates
void *set = 0;
build_tree(term, &set);
if (!set)
- return 0;
+ fatal("term too short\n");
// construct priority queue while deleting set
+ // ~> sorts the candidates by get_pri
debug("constructing priority queue\n");
struct pqueue *prioritized = pqueue_init(2 << 15, cmp_pri, get_pri);
+ if (!prioritized)
+ fatal("can't create pqueue\n");
while (set) {
struct set_element *element = *(struct set_element **)set;
pqueue_insert(prioritized, element->list);
@@ -265,33 +280,33 @@ struct list *tree_merge_duplicates(struct term *term)
debug("iterating priority queue, invalidating duplicates\n");
struct list *iterator;
while ((iterator = pqueue_pop(prioritized))) {
+ // only consider merging if they occur >1 times
+ if (iterator->val <= 1)
+ continue;
+
struct tree *head = iterator->data;
+ // skip if invalidated and not duplicated enough
if (head->state != VALIDATED_TREE &&
- head->duplication_count >=
- iterator->val) { // skip invalidated
+ head->duplication_count >= iterator->val)
continue;
- }
- if (iterator->val > 1) { // needs merging
- // clone root so it doesn't get replaced by a ref to itself
- struct tree *cloned_head = clone_tree_root(head);
- cloned_head->state = INVALIDATED_TREE;
- final = list_add(final, cloned_head);
-
- // invalidate all subtrees
- // invalidated trees will be replaced with a reference
- struct list *list = iterator;
- while (list) {
- invalidate_tree(list->data, list->val);
-
- // keep a ref for later replacement
- ((struct tree *)list->data)->state =
- final->val - 1;
-
- invalidated = list_add(invalidated, list->data);
- list = list->next;
- }
+ // clone root so it doesn't get replaced by a ref to itself
+ struct tree *cloned_head = clone_tree_root(head);
+ cloned_head->state = INVALIDATED_TREE;
+ final = list_add(final, cloned_head);
+
+ // invalidate all subtrees
+ // invalidated trees will be replaced with a reference
+ struct list *list = iterator;
+ while (list) {
+ invalidate_tree(list->data, list->val);
+
+ // keep a ref for later replacement
+ ((struct tree *)list->data)->state = final->val - 1;
+
+ invalidated = list_add(invalidated, list->data);
+ list = list->next;
}
}