diff options
-rw-r--r-- | experiments.md | 270 | ||||
-rw-r--r-- | inc/impl.h | 5 | ||||
-rw-r--r-- | makefile | 17 | ||||
-rw-r--r-- | notes.txt | 111 | ||||
-rw-r--r-- | readme.md | 147 | ||||
-rw-r--r-- | src/impl/abs.c (renamed from src/abs.c) | 0 | ||||
-rw-r--r-- | src/impl/abs_app_right.c | 104 | ||||
-rw-r--r-- | src/impl/app_both.c | 112 | ||||
-rw-r--r-- | src/impl/app_left.c (renamed from src/app.c) | 4 | ||||
-rw-r--r-- | src/impl/app_right.c | 89 | ||||
-rw-r--r-- | src/impl/merged_abs.c | 0 | ||||
-rw-r--r-- | src/impl/merged_app.c | 0 | ||||
-rw-r--r-- | src/impl/merged_both.c | 0 | ||||
-rw-r--r-- | src/main.c | 68 |
14 files changed, 773 insertions, 154 deletions
diff --git a/experiments.md b/experiments.md new file mode 100644 index 0000000..d6c64c9 --- /dev/null +++ b/experiments.md @@ -0,0 +1,270 @@ +# BLC Experiments + +Small experiment to find the smallest binary encoding of lambda terms. I +use an empirical approach with the terms located in `samples/`. The +source also serves as a reference C implementation of BLC encodings. + +Of course, encodings that allow sharing of duplicate terms (e.g. in a +binary encoded DAG) can be even smaller. I’ve done that in the +[BLoC](https://github.com/marvinborner/BLoC) encoding and I will apply +the insights from this experiments there as well. + +NOTE: Most of the test files don’t ever use larger indices than 9, so +don’t try to generalize the results below. BLC2, for example, only makes +sense when using many large indices. + +## TL;DR + +Optimizations for application chains don’t seem worth it. Optimizations +for abstraction chains give 3-5% space improvement for most larger +terms. Tricks to omit bits in closed terms always give smaller or +equivalent encodings. + + --- test/small --- + blc: avg. 0.000000% reduction + blc2: avg. -0.162328% reduction + closed: avg. 32.382648% reduction + abs: avg. -13.120658% reduction + app_left: avg. -5.775075% reduction + app_right: avg. 2.095019% reduction + app_both: avg. 2.696779% reduction + abs_app_left: avg. -18.895734% reduction + + --- test/medium --- + blc: avg. 0.000000% reduction + blc2: avg. -4.829910% reduction + closed: avg. 0.160938% reduction + abs: avg. 4.089551% reduction + app_left: avg. -0.507696% reduction + app_right: avg. -6.291403% reduction + app_both: avg. -5.881188% reduction + abs_app_left: avg. 3.581855% reduction + + --- test/large --- + blc: avg. 0.000000% reduction + blc2: avg. -4.473726% reduction + closed: avg. 0.009495% reduction + abs: avg. 3.859261% reduction + app_left: avg. -2.043374% reduction + app_right: avg. -4.231048% reduction + app_both: avg. -3.729422% reduction + abs_app_left: avg. 1.815887% reduction + +## Traditional BLC + +$$ +\begin{aligned} +f(\lambda m) &= 00\ f(M)\\ +f(M N) &= 01\ f(M)\ f(N)\\ +f(i) &= 1^i\ 0 +\end{aligned} +$$ + +**Problem**: de Bruijn indices are encoded in *unary* ($O(n)$). + +### Statistics + + test/medium/fib_rec42.blc: 14167 + test/medium/euler001.blc: 68690 + test/medium/harmonic_series42.blc: 38338 + test/medium/jottary.blc 4364 + test/medium/mutual_recusion.blc 9700 + test/medium/fac42.blc: 8681 + +### Resources: + +- [Binary Lambda + Calculus](https://tromp.github.io/cl/Binary_lambda_calculus.html) + (John Tromp) + +## Levenshtein Indices (`BLC2`) + +$$ +\begin{aligned} +f(\lambda M) &= 00\ f(M)\\ +f(M N) &= 01\ f(M)\ f(N)\\ +f(i) &= \mathrm{code}(i) +\end{aligned} +$$ + +### Statistics + + test/medium/fib_rec42.blc: 14167 -> 14868 (-4.948119% reduction) + test/medium/euler001.blc: 68690 -> 72011 (-4.834765% reduction) + test/medium/harmonic_series42.blc: 38338 -> 40278 (-5.060254% reduction) + test/medium/jottary.blc 4364 -> 4538 (-3.987168% reduction) + test/medium/mutual_recusion.blc 9700 -> 10186 (-5.010309% reduction) + test/medium/fac42.blc: 8681 -> 9121 (-5.068540% reduction) + +### Resources + +- [Levenshtein Coding](https://en.wikipedia.org/wiki/Levenshtein_coding) + (Wikipedia) + +## Mateusz (`closed`) + +Mateusz Naściszewski’s encoding is a slight variation of the normal +encoding. The insight is, that by ignoring open terms, you can skip +writing a bit in several cases: + +- for abstractions/applications in de Bruijn level 0 +- in indices pointing to the outermost abstraction (we can end parsing + at the $n$th `1`, otherwise the term would be open!) + +The encoding is guaranteed to have less or equal bit than normal BLC +(again, this only works for closed terms). + +$$ +\begin{aligned} +f(t) &= g(0, t)\\ +g(0, i) &= \text{error: open}\\ +g(1, 0) &= 1\\ +g(n, 0) &= 10\\ +g(n, i) &= 1\ g(n-1, i-1)\\ +g(0, \lambda M) &= 0\ g(1, M)\\ +g(n, \lambda M) &= 00\ g(n+1, M)\\ +g(0, M\ N) &= 1\ g(1, M)\\ +g(n, M\ N) &= 01\ g(n, M)\ g(n, N) +\end{aligned} +$$ + +(it looks more complicated than it is) + +### Statistics + + test/medium/fib_rec42.blc: 14167 -> 14157 (0.070587% reduction) + test/medium/euler001.blc: 68690 -> 68685 (0.007279% reduction) + test/medium/harmonic_series42.blc: 38338 -> 38292 (0.119985% reduction) + test/medium/jottary.blc 4364 -> 4349 (0.343721% reduction) + test/medium/mutual_recusion.blc 9700 -> 9680 (0.206186% reduction) + test/medium/fac42.blc: 8681 -> 8671 (0.115194% reduction) + +### Resources + +- [Implementation](https://github.com/tromp/AIT/commit/f8c7d191519bc8df4380d49934f2c9b9bdfeef19) + (John Tromp) + +## Merged Left-Apps (`app_left`) + +Here I “merge” multiple left applications with a single prefix. This +uses one bit more for binary applications. However, practical lambda +terms often use many consecutive left applications! + +$$ +\begin{aligned} +f(\lambda M) &= 01\ f(M)\\ +f(M_1\dots M_n) &= 0^n\ 1\ f(M_1)\dots f(M_n)\\ +f(i) &= 1^i 0 +\end{aligned} +$$ + +### Statistics + + test/medium/fib_rec42.blc: 14167 -> 14215 (-0.338816% reduction) + test/medium/euler001.blc: 68690 -> 68919 (-0.333382% reduction) + test/medium/harmonic_series42.blc: 38338 -> 38450 (-0.292138% reduction) + test/medium/jottary.blc 4364 -> 4459 (-2.176902% reduction) + test/medium/mutual_recusion.blc 9700 -> 9738 (-0.391753% reduction) + test/medium/fac42.blc: 8681 -> 8699 (-0.207349% reduction) + +## Merged Right-Apps (`app_right`) + +Same as previous, but for right associative application. Useful for +Church encodings. + +$$ +\begin{aligned} +f(\lambda M) &= 01\ f(M)\\ +f(M_1(\dots M_n)) &= 0^n\ 1\ f(M_1)\dots f(M_n)\\ +f(i) &= 1^i 0 +\end{aligned} +$$ + +### Statistics + + test/medium/fib_rec42.blc: 14167 -> 15029 (-6.084563% reduction) + test/medium/euler001.blc: 68690 -> 73035 (-6.325520% reduction) + test/medium/harmonic_series42.blc: 38338 -> 40606 (-5.915802% reduction) + test/medium/jottary.blc 4364 -> 4719 (-8.134739% reduction) + test/medium/mutual_recusion.blc 9700 -> 10314 (-6.329897% reduction) + test/medium/fac42.blc: 8681 -> 9207 (-6.059210% reduction) + +## Merged Bidirectional Apps (`app_both`) + +One extra bit to indicate whether to use `app_left` or `app_right`: + +$$ +\begin{aligned} +f(\lambda M) &= 01\ f(M)\\ +f(M_1\dots M_n) &= 0^n\ 10\ f(M_1)\dots f(M_n)\\ +f(M_1(\dots M_n)) &= 0^n\ 11\ f(M_1)\dots f(M_n)\\ +f(i) &= 1^i 0 +\end{aligned} +$$ + +### Statistics + + test/medium/fib_rec42.blc: 14167 -> 14962 (-5.611633% reduction) + test/medium/euler001.blc: 68690 -> 72681 (-5.810162% reduction) + test/medium/harmonic_series42.blc: 38338 -> 40381 (-5.328916% reduction) + test/medium/jottary.blc 4364 -> 4760 (-9.074244% reduction) + test/medium/mutual_recusion.blc 9700 -> 10260 (-5.773196% reduction) + test/medium/fac42.blc: 8681 -> 9150 (-5.402603% reduction) + +## Merged Multi-Abs + +Here I “merge” multiple abstractions with a single prefix. (`abs`) + +$$ +\begin{aligned} +f(\lambda_1\dots \lambda_n M) &= 0^{n+1}\ 1\ f(M)\\ +f(M\ N) &= 01\ f(M)\ f(N)\\ +f(i) &= 1^i 0 +\end{aligned} +$$ + +Slightly better, I can combine this with the previous trick for +applications! (`abs_app_left`) + +$$ +\begin{aligned} +f(\lambda_1\dots \lambda_n M) &= 0^n\ 10\ f(M)\\ +f(M_1\ M_n) &= 0^{n-1}\ 11\ f(M_1)\dots f(M_n)\\ +f(i) &= 1^i 0 +\end{aligned} +$$ + +Theoretically, you could also merge them in a single step. The *trick* +would be to see applications as zeroth abstractions. Any immediate +applications inside abstractions would be handled directly by the +abstraction constructor. + +However, unabstracted applications would get quite large so this +optimization doesn’t seem worth it. + +$$ +\begin{aligned} +f(\lambda_1\dots \lambda_n (M_1\dots M_m)) &= 0^{n+1}\ 1\ 0^{m+1}\ 1\ f(M_1)\dots f(M_n)\\ +f(i) &= 1^i 0 +\end{aligned} +$$ + +### Statistics + +abs: + + test/medium/fib_rec42.blc: 14167 -> 13519 (4.574010% reduction) + test/medium/euler001.blc: 68690 -> 65627 (4.459164% reduction) + test/medium/harmonic_series42.blc: 38338 -> 36557 (4.645521% reduction) + test/medium/jottary.blc 4364 -> 4295 (1.581118% reduction) + test/medium/mutual_recusion.blc 9700 -> 9285 (4.278351% reduction) + test/medium/fac42.blc: 8681 -> 8278 (4.642322% reduction) + +abs_app_left: + + test/medium/fib_rec42.blc: 14167 -> 13567 (4.235194% reduction) + test/medium/euler001.blc: 68690 -> 65856 (4.125783% reduction) + test/medium/harmonic_series42.blc: 38338 -> 36669 (4.353383% reduction) + test/medium/jottary.blc 4364 -> 4390 (-0.595784% reduction) + test/medium/mutual_recusion.blc 9700 -> 9323 (3.886598% reduction) + test/medium/fac42.blc: 8681 -> 8296 (4.434973% reduction) @@ -17,7 +17,10 @@ typedef struct { Impl impl_blc(void); Impl impl_blc2(void); Impl impl_closed(void); -Impl impl_app(void); +Impl impl_app_left(void); +Impl impl_app_right(void); +Impl impl_app_both(void); Impl impl_abs(void); +Impl impl_abs_app_left(void); #endif @@ -4,17 +4,20 @@ CC = gcc TG = ctags +# e.g. make full DEBUG=1 +DEBUG=0 + BUILD = ${CURDIR}/build SRC = ${CURDIR}/src INC = ${CURDIR}/inc SRCS = $(wildcard $(SRC)/*.c) $(wildcard $(SRC)/impl/*.c) OBJS = $(patsubst $(SRC)/%.c, $(BUILD)/%.o, $(SRCS)) -CFLAGS_DEBUG = -fsanitize=address,leak,undefined -g -O0 -CFLAGS_WARNINGS = -Wall -Wextra -Wshadow -Wpointer-arith -Wwrite-strings -Wredundant-decls -Wnested-externs -Wmissing-declarations -Wstrict-prototypes -Wmissing-prototypes -Wcast-qual -Wswitch-default -Wswitch-enum -Wunreachable-code -Wundef -Wold-style-definition -pedantic -Wno-switch-enum +#CFLAGS_DEBUG = -fsanitize=address,leak,undefined -g -O0 +CFLAGS_WARNINGS = -Wall -Wextra -Wshadow -Wpointer-arith -Wwrite-strings -Wredundant-decls -Wnested-externs -Wmissing-declarations -Wstrict-prototypes -Wmissing-prototypes -Wcast-qual -Wswitch-default -Wswitch-enum -Wunreachable-code -Wundef -Wold-style-definition -pedantic -Wno-switch-enum -DDEBUG=$(DEBUG) CFLAGS = $(CFLAGS_WARNINGS) -std=c99 -Ofast -I$(INC) -ifdef DEBUG # TODO: Somehow clean automagically +ifeq ($(DEBUG),1) # TODO: Somehow clean automagically CFLAGS += $(CFLAGS_DEBUG) endif @@ -26,22 +29,22 @@ all: compile full: all sync -compile: $(BUILD) $(OBJS) $(BUILD)/blc +compile: $(BUILD) $(OBJS) $(BUILD)/blc2blc clean: @rm -rf $(BUILD)/ install: - @install -m 755 $(BUILD)/blc $(DESTDIR)$(PREFIX)/bin/ + @install -m 755 $(BUILD)/blc2blc $(DESTDIR)$(PREFIX)/bin/ sync: # Ugly hack - @$(MAKE) $(BUILD)/blc --always-make --dry-run | grep -wE 'gcc|g\+\+' | grep -w '\-c' | jq -nR '[inputs|{directory:".", command:., file: match(" [^ ]+$$").string[1:]}]' >compile_commands.json + @$(MAKE) $(BUILD)/blc2blc --always-make --dry-run | grep -wE 'gcc|g\+\+' | grep -w '\-c' | jq -nR '[inputs|{directory:".", command:., file: match(" [^ ]+$$").string[1:]}]' >compile_commands.json @$(TG) -R --exclude=.git --exclude=build . $(BUILD)/%.o: $(SRC)/%.c @$(CC) -c -o $@ $(CFLAGS) $< -$(BUILD)/blc: $(OBJS) +$(BUILD)/blc2blc: $(OBJS) @$(CC) -o $@ $(CFLAGS) $^ .PHONY: all compile clean sync diff --git a/notes.txt b/notes.txt new file mode 100644 index 0000000..8bab649 --- /dev/null +++ b/notes.txt @@ -0,0 +1,111 @@ +# normal BLC + +- blc(\M) = 00 blc(M) +- blc(M N) = 01 blc(M) blc(N) +- blc(i) = 1^i 0 + +e.g. +- [[0]] = 00 00 10 +- [(0 0)] = 01 00 10 10 + +# my BLC + +- blc(\M) = 01 blc(M) +- blc(M1 .. Mn) = 0^n 1 blc(M1) .. blc(Mn) +- blc(i) = 1^i 0 + +e.g. +- [[0]] = 01 01 10 +- [(0 0)] = 01 001 10 10 + +# my BLC 2 + +- blc(\.n.\M) = 0^{n+1} 1 blc(M) +- blc(M N) = 01 blc(M) blc(N) +- blc(i) = 1^i 0 + +e.g. +- [[0]] = 0001 10 +- [(0 0)] = 001 01 10 10 + +# my BLC 2.5 LEFT + +- blc(\M) = 01 blc(M) +- blc(M1 .. Mn) = 0^n 1 blc(M1) .. blc(Mn) +- blc(i) = 1^i 0 + +e.g. +- (M1 M2) = 001 M1 M2 +- (((M1 M2) M3) M4) = 0001 M1 M2 M3 M4 +- (0 0) = 0011010 + +# my BLC 2.7 RIGHT + +- blc(\M) = 01 blc(M) +- blc(M1 (.. Mn)) = 0^n 1 blc(M1) .. blc(Mn) +- blc(i) = 1^i 0 + +e.g. +- (M1 M2) = 001 M1 M2 +- (M1 (M2 (M3 M4))) = 0001 M1 M2 M3 M4 +- (0 0) = 0011010 + +# my BLC 2.9 BOTH?! + +- blc(\M) = 01 blc(M) +- blc(M1 .. Mn) = 0^n 10 blc(M1) .. blc(Mn) +- blc(M1 (.. Mn)) = 0^n 11 blc(M1) .. blc(Mn) +- blc(i) = 1^i 0 + +e.g. +- (M1 M2) = 001 M1 M2 +- (((M1 M2) M3) M4) = 00010 M1 M2 M3 M4 +- (M1 (M2 (M3 M4))) = 00011 M1 M2 M3 M4 +- (0 0) = 0011010 + +# my BLC 3 ABSAPP MERGE + +- blc(\.n.\M) = 0^n 10 blc(M) +- blc(M1 .. Mn) = 0^{n-1} 11 blc(M1) .. blc(Mn) +- blc(i) = 1^i 0 + +e.g. +- [[0]] = 0010 10 +- [(0 0)] = 010 011 10 10 + +# my BLC 4 + +- blc(\.n.\(M1 .. Mm)) = 0^{n+1} 1 0^m 1 blc(M1) .. blc(Mn) +- blc(i) = 1^i 0 + +e.g. +- [[0]] = 000101 10 +- [(0 0)] = 001001 10 10 +- (0 0) = 011001 10 10 + +# my BLC 5 + +- blc(\.n.\(M1 .. Mm)) = 0^{n+1} 1^{m+1} 0 blc(M1) .. blc(Mn) +- blc(i) = 1^i 0 + +e.g. +- [[0]] = 00010 10 +- [(0 0)] = 00110 10 10 +- (0 0) = 001110 10 10 + +# Mateusz + +- [0] = 01 +- [[0]] = 00010 +- [[1]] = 00011 +- [[[0]]] = 0000010 +- [[[1]]] = 00000110 +- [[[2]]] = 00000111 + +- [(0 0)] = 00111 +- [[(0 1)]] = 000011011 +- [[(1 0)]] = 000011110 + +- [[(1 1)]] = 000011111 +- [[[(1 1)]]] = 0000001110110 +- [[[[(1 1)]]]] = 000000001110110 @@ -1,139 +1,24 @@ -# BLC Experiments +# BLC2BLC -Small experiment to find the smallest binary encoding of lambda terms. I -use an empirical approach with the terms located in `samples/`. The -source also serves as a reference C implementation of BLC encodings. +Easily convert binary lambda calculus encodings to other binary lambda +calculus encodings! -Of course, encodings that allow sharing of duplicate terms (e.g. in a -binary encoded DAG) can yield even smaller encodings. I’ve done that in -the [BLoC](https://github.com/marvinborner/BLoC) encoding and I will -apply the insights from this experiments there as well. +## Encodings -## Traditional BLC +See [`experiments.md`](experiments.md) for detailed explanations and +some comparisons. -$$ -\begin{aligned} -f(\lambda m) &= 00\ f(M)\\ -f(M N) &= 01\ f(M)\ f(N)\\ -f(i) &= 1^i\ 0 -\end{aligned} -$$ +Please create a PR if you know of other (better?) encodings! -**Problem**: de Bruijn indices are encoded in *unary* ($O(n)$). +## Usage -### Statistics +``` bash +$ make install # or make run +``` -TODO +Then use `blc2blc <from> <to>`. For example: -### Resources: - -- [Binary Lambda - Calculus](https://tromp.github.io/cl/Binary_lambda_calculus.html) - (John Tromp) - -## Levenshtein Indices (BLC2) - -$$ -\begin{aligned} -f(\lambda M) &= 00\ f(M)\\ -f(M N) &= 01\ f(M)\ f(N)\\ -f(i) &= \mathrm{code}(i) -\end{aligned} -$$ - -### Statistics - -TODO (compare with blc) - -### Resources - -- [Levenshtein Coding](https://en.wikipedia.org/wiki/Levenshtein_coding) - (Wikipedia) - -## Mateusz Naściszewski - -$$ -\begin{aligned} -f(t) &= g(0, t)\\ -g(0, i) &= \text{undefined}\\ -g(1, 0) &= 1\\ -g(n, 0) &= 10\\ -g(n, i) &= 1\ g(n-1, i-1)\\ -g(0, \lambda M) &= 0\ g(1, M)\\ -g(n, \lambda M) &= 00\ g(n+1, M)\\ -g(0, M\ N) &= 1\ g(1, M)\\ -g(n, M\ N) &= 01\ g(n, M)\ g(n, N) -\end{aligned} -$$ - -### Statistics - -TODO - -### Resources - -- [Implementation](https://github.com/tromp/AIT/commit/f8c7d191519bc8df4380d49934f2c9b9bdfeef19) - (John Tromp) - -## Merged Left-Apps - -Here I “merge” multiple left applications with a single prefix. This -uses one bit more for binary applications. However, practical lambda -terms often use many consecutive left applications! - -$$ -\begin{aligned} -f(\lambda M) &= 01\ f(M)\\ -f(M_1\dots M_n) &= 0^n\ 1\ f(M_1)\dots f(M_n)\\ -f(i) &= 1^i 0 -\end{aligned} -$$ - -### Statistics - -TODO - -### Resources - -- None? - -## Merged Multi-Abs - -Here I “merge” multiple abstractions with a single prefix. - -$$ -\begin{aligned} -f(\lambda_1\dots \lambda_n M) &= 0^{n+1}\ 1\ f(M)\\ -f(M\ N) &= 01\ f(M)\ f(N)\\ -f(i) &= 1^i 0 -\end{aligned} -$$ - -Slightly better, I can combine this with the previous technique! - -$$ -\begin{aligned} -f(\lambda_1\dots \lambda_n M) &= 0^n\ 11\ f(M)\\ -f(M_1\ M_n) &= 0^n\ 10\ f(M_1)\dots f(M_n)\\ -f(i) &= 1^i 0 -\end{aligned} -$$ - -Or, merge it directly! The *trick* is to see applications as zeroth -abstractions. Any immediate applications inside abstractions are handled -directly by the abstraction constructor. - -$$ -\begin{aligned} -f(\lambda_1\dots \lambda_n (M_1\dots M_m)) &= 0^n\ 1\ 0^m\ 1\ f(M_1)\dots f(M_n)\\ -f(i) &= 1^i 0 -\end{aligned} -$$ - -### Statistics - -TODO - -### Resources - -- None? +``` bash +$ echo 001110 | blc2blc blc blc2 +0011100 +``` diff --git a/src/abs.c b/src/impl/abs.c index a6aa76d..a6aa76d 100644 --- a/src/abs.c +++ b/src/impl/abs.c diff --git a/src/impl/abs_app_right.c b/src/impl/abs_app_right.c new file mode 100644 index 0000000..a346013 --- /dev/null +++ b/src/impl/abs_app_right.c @@ -0,0 +1,104 @@ +// Copyright (c) 2024, Marvin Borner <dev@marvinborner.de> +// SPDX-License-Identifier: MIT + +#include <log.h> +#include <impl.h> + +// my blc 3 for now! + +static void encode(Term *term, FILE *fp) +{ + switch (term->type) { + case ABS: + while (1) { + fprintf(fp, "0"); + if (term->u.abs.body->type != ABS) + break; + term = term->u.abs.body; + } + fprintf(fp, "10"); + encode(term->u.abs.body, fp); + break; + case APP:; + Term *stack[128] = { 0 }; // TODO: HEAP! + int stacked = 0; + + while (1) { + fprintf(fp, "0"); + stack[stacked++] = term->u.app.rhs; + term = term->u.app.lhs; + if (term->type != APP) + break; + } + + fprintf(fp, "11"); + encode(term, fp); + for (int i = stacked - 1; i >= 0; i--) + encode(stack[i], fp); + break; + case IDX: + for (size_t i = 0; i <= term->u.index; i++) + fprintf(fp, "1"); + fprintf(fp, "0"); + break; + default: + fatal("invalid type %d\n", term->type); + } +} + +static Term *decode(FILE *fp) +{ + Term *res = 0; + + char a = getc(fp); + + if (a == '0') { + size_t count = 0; + while (getc(fp) == '0') + count++; + + char b = getc(fp); + + if (b == '0') { + Term *head = term_new(ABS); + + res = head; + for (size_t i = 0; i < count; i++) { + res->u.abs.body = term_new(ABS); + res = res->u.abs.body; + } + res->u.abs.body = decode(fp); + res = head; + } else if (b == '1') { + res = term_new(APP); + res->u.app.lhs = decode(fp); + res->u.app.rhs = decode(fp); + + for (size_t i = 0; i < count; i++) { + Term *prev = res; + res = term_new(APP); + res->u.app.lhs = prev; + res->u.app.rhs = decode(fp); + } + } + } else if (a == '1') { + res = term_new(IDX); + res->u.index = 0; + while (getc(fp) == '1') + res->u.index++; + } + + if (!res) + fatal("invalid parsing state!\n"); + + return res; +} + +Impl impl_abs_app_left(void) +{ + return (Impl){ + .name = "abs_app_left", + .encode = encode, + .decode = decode, + }; +} diff --git a/src/impl/app_both.c b/src/impl/app_both.c new file mode 100644 index 0000000..47f78aa --- /dev/null +++ b/src/impl/app_both.c @@ -0,0 +1,112 @@ +// Copyright (c) 2024, Marvin Borner <dev@marvinborner.de> +// SPDX-License-Identifier: MIT + +#include <log.h> +#include <impl.h> + +static void encode(Term *term, FILE *fp) +{ + switch (term->type) { + case ABS: + fprintf(fp, "01"); + encode(term->u.abs.body, fp); + break; + case APP: + fprintf(fp, "0"); + + Term *stack[128] = { 0 }; // TODO: HEAP! + int stacked = 0; + + // TODO: there's probably something here about finding optimal + // left/right folds but this is ok for now + + int fold_right = term->u.app.rhs->type == APP; + + while (1) { + fprintf(fp, "0"); + stack[stacked++] = + fold_right ? term->u.app.lhs : term->u.app.rhs; + term = fold_right ? term->u.app.rhs : term->u.app.lhs; + if (term->type != APP) + break; + } + + fprintf(fp, "1%d", fold_right); + if (fold_right) { + for (int i = 0; i < stacked; i++) + encode(stack[i], fp); + encode(term, fp); + } else { + encode(term, fp); + for (int i = stacked - 1; i >= 0; i--) + encode(stack[i], fp); + } + break; + case IDX: + for (size_t i = 0; i <= term->u.index; i++) + fprintf(fp, "1"); + fprintf(fp, "0"); + break; + default: + fatal("invalid type %d\n", term->type); + } +} + +static Term *decode(FILE *fp) +{ + Term *res = 0; + + char a = getc(fp); + + if (a == '0') { + size_t count = 0; + while (getc(fp) == '0') + count++; + + if (count == 0) { + res = term_new(ABS); + res->u.abs.body = decode(fp); + } else if (getc(fp) == '1') { // fold right + Term *head = term_new(APP); + res = head; + for (size_t i = 0; i < count - 1; i++) { + res->u.app.lhs = decode(fp); + res->u.app.rhs = term_new(APP); + res = res->u.app.rhs; + } + res->u.app.lhs = decode(fp); + res->u.app.rhs = decode(fp); + res = head; + } else { // fold left + res = term_new(APP); + res->u.app.lhs = decode(fp); + res->u.app.rhs = decode(fp); + + for (size_t i = 0; i < count - 1; i++) { + Term *prev = res; + res = term_new(APP); + res->u.app.lhs = prev; + res->u.app.rhs = decode(fp); + } + } + } else if (a == '1') { + res = term_new(IDX); + res->u.index = 0; + while (getc(fp) == '1') + res->u.index++; + } + + if (!res) + fatal("invalid parsing state!\n"); + + return res; +} + +Impl impl_app_both(void) +{ + return (Impl){ + .name = "app_both", + .encode = encode, + .decode = decode, + }; +} diff --git a/src/app.c b/src/impl/app_left.c index cc7eda9..dcc62cd 100644 --- a/src/app.c +++ b/src/impl/app_left.c @@ -79,10 +79,10 @@ static Term *decode(FILE *fp) return res; } -Impl impl_app(void) +Impl impl_app_left(void) { return (Impl){ - .name = "app", + .name = "app_left", .encode = encode, .decode = decode, }; diff --git a/src/impl/app_right.c b/src/impl/app_right.c new file mode 100644 index 0000000..2583741 --- /dev/null +++ b/src/impl/app_right.c @@ -0,0 +1,89 @@ +// Copyright (c) 2024, Marvin Borner <dev@marvinborner.de> +// SPDX-License-Identifier: MIT + +#include <log.h> +#include <impl.h> + +static void encode(Term *term, FILE *fp) +{ + switch (term->type) { + case ABS: + fprintf(fp, "01"); + encode(term->u.abs.body, fp); + break; + case APP: + fprintf(fp, "0"); + + Term *stack[128] = { 0 }; // TODO: HEAP! + int stacked = 0; + + while (1) { + fprintf(fp, "0"); + stack[stacked++] = term->u.app.lhs; + term = term->u.app.rhs; + if (term->type != APP) + break; + } + + fprintf(fp, "1"); + for (int i = 0; i < stacked; i++) + encode(stack[i], fp); + encode(term, fp); + break; + case IDX: + for (size_t i = 0; i <= term->u.index; i++) + fprintf(fp, "1"); + fprintf(fp, "0"); + break; + default: + fatal("invalid type %d\n", term->type); + } +} + +static Term *decode(FILE *fp) +{ + Term *res = 0; + + char a = getc(fp); + + if (a == '0') { + size_t count = 0; + while (getc(fp) == '0') + count++; + + if (count == 0) { + res = term_new(ABS); + res->u.abs.body = decode(fp); + } else { + Term *head = term_new(APP); + res = head; + for (size_t i = 0; i < count - 1; i++) { + res->u.app.lhs = decode(fp); + res->u.app.rhs = term_new(APP); + res = res->u.app.rhs; + } + res->u.app.lhs = decode(fp); + res->u.app.rhs = decode(fp); + res = head; + } + } else if (a == '1') { + res = term_new(IDX); + res->u.index = 0; + while (getc(fp) == '1') + res->u.index++; + } + + if (!res) + fatal("invalid parsing state!\n"); + + return res; +} + +Impl impl_app_right(void) +{ + return (Impl){ + .name = "app_right", + .encode = encode, + .decode = decode, + }; +} diff --git a/src/impl/merged_abs.c b/src/impl/merged_abs.c deleted file mode 100644 index e69de29..0000000 --- a/src/impl/merged_abs.c +++ /dev/null diff --git a/src/impl/merged_app.c b/src/impl/merged_app.c deleted file mode 100644 index e69de29..0000000 --- a/src/impl/merged_app.c +++ /dev/null diff --git a/src/impl/merged_both.c b/src/impl/merged_both.c deleted file mode 100644 index e69de29..0000000 --- a/src/impl/merged_both.c +++ /dev/null @@ -5,13 +5,15 @@ #include <stdio.h> #include <dirent.h> #include <sys/stat.h> +#include <string.h> #include <term.h> #include <parse.h> +#include <log.h> #include <print.h> #include <impl.h> -#define N_IMPLS 5 +#define N_IMPLS 8 static size_t file_size(FILE *fp) { @@ -36,6 +38,8 @@ static size_t test_impl(Impl impl, Term *term, FILE *out) static void test_all(Impl impls[N_IMPLS], const char *dir_path, FILE *out) { + double results[N_IMPLS] = { 0 }; + char path[1024]; struct dirent *dir_entry; DIR *dir = opendir(dir_path); @@ -44,7 +48,7 @@ static void test_all(Impl impls[N_IMPLS], const char *dir_path, FILE *out) continue; sprintf(path, "%s/%s", dir_path, dir_entry->d_name); - printf("%s\n", path); + debug("%s\n", path); FILE *fp = fopen(path, "rb"); @@ -53,33 +57,71 @@ static void test_all(Impl impls[N_IMPLS], const char *dir_path, FILE *out) for (Impl *impl = impls; impl != impls + N_IMPLS; impl++) { size_t prev = file_size(fp); size_t after = test_impl(*impl, term, out); - printf("%s: %lu -> %lu (%f%% reduction)\n", impl->name, - prev, after, - (double)((long)prev - (long)after) / - (double)prev * 100); + double change = (double)((long)prev - (long)after) / + (double)prev * 100; + debug("%s: %lu -> %lu (%f%% reduction)\n", impl->name, + prev, after, change); + results[impl - impls] += change; + results[impl - impls] /= 2.0; } term_free(term); fclose(fp); - printf("\n"); + debug("\n"); } closedir(dir); + + printf("\n--- %s ---\n", dir_path); + for (size_t i = 0; i < N_IMPLS; i++) + printf("%s: avg. %f%% reduction\n", impls[i].name, results[i]); } -int main(void) +static void test(Impl impls[N_IMPLS]) { - Impl impls[N_IMPLS] = { impl_blc(), impl_blc2(), impl_closed(), - impl_abs(), impl_app() }; - FILE *out = tmpfile(); const char *dir_path = "test/small"; test_all(impls, dir_path, out); dir_path = "test/medium"; test_all(impls, dir_path, out); - /* dir_path = "test/large"; */ - /* test_all(impls, dir_path, out); */ + dir_path = "test/large"; + test_all(impls, dir_path, out); fclose(out); } + +static Impl find_impl(const char *impl, Impl impls[N_IMPLS]) +{ + for (int i = 0; i < N_IMPLS; i++) { + if (!strcmp(impl, impls[i].name)) + return impls[i]; + } + fatal("encoding '%s' not found!\n", impl); +} + +int main(int argc, char *argv[]) +{ +#ifdef DEBUG + debug_enable(DEBUG); +#endif + Impl impls[N_IMPLS] = { impl_blc(), impl_blc2(), + impl_closed(), impl_abs(), + impl_app_left(), impl_app_right(), + impl_app_both(), impl_abs_app_left() }; + if (argc == 1) { + test(impls); + return 0; + } + + if (argc != 3) { + fatal("invalid arguments, please use `blc2blc <from> <to>`\n"); + } + + Impl from = find_impl(argv[1], impls); + Impl to = find_impl(argv[2], impls); + Term *term = from.decode(stdin); + to.encode(term, stdout); + + return 0; +} |