From b517b450316ffd1b6ee9cb0c25085ff3b1d4b75b Mon Sep 17 00:00:00 2001
From: Marvin Borner
Date: Sun, 8 Sep 2024 02:59:29 +0200
Subject: Some implementations

---
 inc/impl.h         |  23 ++++++++++++
 makefile           |   6 ++--
 src/abs.c          |  81 +++++++++++++++++++++++++++++++++++++++++++
 src/app.c          |  89 +++++++++++++++++++++++++++++++++++++++++++++++
 src/impl/blc.c     |  65 ++++++++++++++++++++++++++++++++++
 src/impl/blc2.c    |  93 +++++++++++++++++++++++++++++++++++++++++++++++++
 src/impl/closed.c  | 100 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/impl/mateusz.c |   0
 src/main.c         |  76 ++++++++++++++++++++++++++++++++++++++--
 src/parse.c        |   7 ++--
 src/print.c        |  22 ------------
 src/term.c         |   3 +-
 12 files changed, 531 insertions(+), 34 deletions(-)
 create mode 100644 inc/impl.h
 create mode 100644 src/abs.c
 create mode 100644 src/app.c
 create mode 100644 src/impl/closed.c
 delete mode 100644 src/impl/mateusz.c

diff --git a/inc/impl.h b/inc/impl.h
new file mode 100644
index 0000000..115468b
--- /dev/null
+++ b/inc/impl.h
@@ -0,0 +1,23 @@
+// Copyright (c) 2024, Marvin Borner <dev@marvinborner.de>
+// SPDX-License-Identifier: MIT
+
+#ifndef IMPL_H
+#define IMPL_H
+
+#include <stdio.h>
+
+#include <term.h>
+
+typedef struct {
+	const char *name;
+	void (*encode)(Term *, FILE *);
+	Term *(*decode)(FILE *);
+} Impl;
+
+Impl impl_blc(void);
+Impl impl_blc2(void);
+Impl impl_closed(void);
+Impl impl_app(void);
+Impl impl_abs(void);
+
+#endif
diff --git a/makefile b/makefile
index bcd8d97..e755b25 100644
--- a/makefile
+++ b/makefile
@@ -7,7 +7,7 @@ TG = ctags
 BUILD = ${CURDIR}/build
 SRC = ${CURDIR}/src
 INC = ${CURDIR}/inc
-SRCS = $(wildcard $(SRC)/*.c)
+SRCS = $(wildcard $(SRC)/*.c) $(wildcard $(SRC)/impl/*.c)
 OBJS = $(patsubst $(SRC)/%.c, $(BUILD)/%.o, $(SRCS))
 
 CFLAGS_DEBUG = -fsanitize=address,leak,undefined -g -O0
@@ -29,7 +29,7 @@ full: all sync
 compile: $(BUILD) $(OBJS) $(BUILD)/blc
 
 clean:
-	@rm -rf $(BUILD)/*
+	@rm -rf $(BUILD)/
 
 install:
 	@install -m 755 $(BUILD)/blc $(DESTDIR)$(PREFIX)/bin/
@@ -47,4 +47,4 @@ $(BUILD)/blc: $(OBJS)
 .PHONY: all compile clean sync
 
 $(BUILD):
-	@mkdir -p $@
+	@mkdir -p $@ $@/impl/
diff --git a/src/abs.c b/src/abs.c
new file mode 100644
index 0000000..a6aa76d
--- /dev/null
+++ b/src/abs.c
@@ -0,0 +1,81 @@
+// Copyright (c) 2024, Marvin Borner <dev@marvinborner.de>
+// SPDX-License-Identifier: MIT
+
+#include <log.h>
+#include <impl.h>
+
+static void encode(Term *term, FILE *fp)
+{
+	switch (term->type) {
+	case ABS:
+		while (1) {
+			fprintf(fp, "0");
+			if (term->u.abs.body->type != ABS)
+				break;
+			term = term->u.abs.body;
+		}
+		fprintf(fp, "01");
+		encode(term->u.abs.body, fp);
+		break;
+	case APP:
+		fprintf(fp, "01");
+		encode(term->u.app.lhs, fp);
+		encode(term->u.app.rhs, fp);
+		break;
+	case IDX:
+		for (size_t i = 0; i <= term->u.index; i++)
+			fprintf(fp, "1");
+		fprintf(fp, "0");
+		break;
+	default:
+		fatal("invalid type %d\n", term->type);
+	}
+}
+
+static Term *decode(FILE *fp)
+{
+	Term *res = 0;
+
+	char a = getc(fp);
+
+	if (a == '0') {
+		size_t count = 0;
+		while (getc(fp) == '0')
+			count++;
+
+		if (count == 0) {
+			res = term_new(APP);
+			res->u.app.lhs = decode(fp);
+			res->u.app.rhs = decode(fp);
+		} else {
+			Term *head = term_new(ABS);
+
+			res = head;
+			for (size_t i = 0; i < count - 1; i++) {
+				res->u.abs.body = term_new(ABS);
+				res = res->u.abs.body;
+			}
+			res->u.abs.body = decode(fp);
+			res = head;
+		}
+	} else if (a == '1') {
+		res = term_new(IDX);
+		res->u.index = 0;
+		while (getc(fp) == '1')
+			res->u.index++;
+	}
+
+	if (!res)
+		fatal("invalid parsing state!\n");
+
+	return res;
+}
+
+Impl impl_abs(void)
+{
+	return (Impl){
+		.name = "abs",
+		.encode = encode,
+		.decode = decode,
+	};
+}
diff --git a/src/app.c b/src/app.c
new file mode 100644
index 0000000..cc7eda9
--- /dev/null
+++ b/src/app.c
@@ -0,0 +1,89 @@
+// Copyright (c) 2024, Marvin Borner <dev@marvinborner.de>
+// SPDX-License-Identifier: MIT
+
+#include <log.h>
+#include <impl.h>
+
+static void encode(Term *term, FILE *fp)
+{
+	switch (term->type) {
+	case ABS:
+		fprintf(fp, "01");
+		encode(term->u.abs.body, fp);
+		break;
+	case APP:
+		fprintf(fp, "0");
+
+		Term *stack[128] = { 0 }; // TODO: HEAP!
+		int stacked = 0;
+
+		while (1) {
+			fprintf(fp, "0");
+			stack[stacked++] = term->u.app.rhs;
+			term = term->u.app.lhs;
+			if (term->type != APP)
+				break;
+		}
+
+		fprintf(fp, "1");
+		encode(term, fp);
+		for (int i = stacked - 1; i >= 0; i--)
+			encode(stack[i], fp);
+		break;
+	case IDX:
+		for (size_t i = 0; i <= term->u.index; i++)
+			fprintf(fp, "1");
+		fprintf(fp, "0");
+		break;
+	default:
+		fatal("invalid type %d\n", term->type);
+	}
+}
+
+static Term *decode(FILE *fp)
+{
+	Term *res = 0;
+
+	char a = getc(fp);
+
+	if (a == '0') {
+		size_t count = 0;
+		while (getc(fp) == '0')
+			count++;
+
+		if (count == 0) {
+			res = term_new(ABS);
+			res->u.abs.body = decode(fp);
+		} else { // foldl1
+			res = term_new(APP);
+			res->u.app.lhs = decode(fp);
+			res->u.app.rhs = decode(fp);
+
+			for (size_t i = 0; i < count - 1; i++) {
+				Term *prev = res;
+				res = term_new(APP);
+				res->u.app.lhs = prev;
+				res->u.app.rhs = decode(fp);
+			}
+		}
+	} else if (a == '1') {
+		res = term_new(IDX);
+		res->u.index = 0;
+		while (getc(fp) == '1')
+			res->u.index++;
+	}
+
+	if (!res)
+		fatal("invalid parsing state!\n");
+
+	return res;
+}
+
+Impl impl_app(void)
+{
+	return (Impl){
+		.name = "app",
+		.encode = encode,
+		.decode = decode,
+	};
+}
diff --git a/src/impl/blc.c b/src/impl/blc.c
index e69de29..893b856 100644
--- a/src/impl/blc.c
+++ b/src/impl/blc.c
@@ -0,0 +1,65 @@
+// Copyright (c) 2024, Marvin Borner <dev@marvinborner.de>
+// SPDX-License-Identifier: MIT
+
+#include <log.h>
+#include <impl.h>
+
+static void encode(Term *term, FILE *fp)
+{
+	switch (term->type) {
+	case ABS:
+		fprintf(fp, "00");
+		encode(term->u.abs.body, fp);
+		break;
+	case APP:
+		fprintf(fp, "01");
+		encode(term->u.app.lhs, fp);
+		encode(term->u.app.rhs, fp);
+		break;
+	case IDX:
+		for (size_t i = 0; i <= term->u.index; i++)
+			fprintf(fp, "1");
+		fprintf(fp, "0");
+		break;
+	default:
+		fatal("invalid type %d\n", term->type);
+	}
+}
+
+static Term *decode(FILE *fp)
+{
+	Term *res = 0;
+
+	char a = getc(fp);
+
+	if (a == '0') {
+		char b = getc(fp);
+		if (b == '0') {
+			res = term_new(ABS);
+			res->u.abs.body = decode(fp);
+		} else if (b == '1') {
+			res = term_new(APP);
+			res->u.app.lhs = decode(fp);
+			res->u.app.rhs = decode(fp);
+		}
+	} else if (a == '1') {
+		res = term_new(IDX);
+		res->u.index = 0;
+		while (getc(fp) == '1')
+			res->u.index++;
+	}
+
+	if (!res)
+		fatal("invalid parsing state!\n");
+
+	return res;
+}
+
+Impl impl_blc(void)
+{
+	return (Impl){
+		.name = "blc",
+		.encode = encode,
+		.decode = decode,
+	};
+}
diff --git a/src/impl/blc2.c b/src/impl/blc2.c
index e69de29..3cf7ff8 100644
--- a/src/impl/blc2.c
+++ b/src/impl/blc2.c
@@ -0,0 +1,93 @@
+// Copyright (c) 2024, Marvin Borner <dev@marvinborner.de>
+// SPDX-License-Identifier: MIT
+
+#include <log.h>
+#include <impl.h>
+#include <stdint.h>
+
+// haskell by John Tromp
+static void levenshtein_encode(size_t n, FILE *fp)
+{
+	if (n == 0) {
+		fprintf(fp, "10");
+		return;
+	}
+
+	fprintf(fp, "1");
+
+	size_t length = (8 * sizeof(n)) - __builtin_clzll(n);
+	levenshtein_encode(length - 1, fp);
+
+	if (length == 1)
+		return;
+
+	for (size_t i = 1 << (length - 2); i != 0; i >>= 1)
+		fprintf(fp, "%d", (i & n) != 0);
+}
+
+static void encode(Term *term, FILE *fp)
+{
+	switch (term->type) {
+	case ABS:
+		fprintf(fp, "00");
+		encode(term->u.abs.body, fp);
+		break;
+	case APP:
+		fprintf(fp, "01");
+		encode(term->u.app.lhs, fp);
+		encode(term->u.app.rhs, fp);
+		break;
+	case IDX:
+		levenshtein_encode(term->u.index, fp);
+		break;
+	default:
+		fatal("invalid type %d\n", term->type);
+	}
+}
+
+// by John Tromp
+static size_t levenshtein_decode(FILE *fp)
+{
+	if (getc(fp) == '0')
+		return 0;
+	size_t x, l = levenshtein_decode(fp);
+	for (x = 1; l--;)
+		x = 2 * x + (getc(fp) != '0');
+	return x;
+}
+
+static Term *decode(FILE *fp)
+{
+	Term *res = 0;
+
+	char a = getc(fp);
+
+	if (a == '0') {
+		char b = getc(fp);
+		if (b == '0') {
+			res = term_new(ABS);
+			res->u.abs.body = decode(fp);
+		} else if (b == '1') {
+			res = term_new(APP);
+			res->u.app.lhs = decode(fp);
+			res->u.app.rhs = decode(fp);
+		}
+	} else if (a == '1') {
+		res = term_new(IDX);
+		res->u.index = levenshtein_decode(fp);
+	}
+
+	if (!res)
+		fatal("invalid parsing state!\n");
+
+	return res;
+}
+
+Impl impl_blc2(void)
+{
+	return (Impl){
+		.name = "blc2",
+		.encode = encode,
+		.decode = decode,
+	};
+}
diff --git a/src/impl/closed.c b/src/impl/closed.c
new file mode 100644
index 0000000..5b607d9
--- /dev/null
+++ b/src/impl/closed.c
@@ -0,0 +1,100 @@
+// Copyright (c) 2024, Marvin Borner <dev@marvinborner.de>
+// SPDX-License-Identifier: MIT
+
+// only works for closed terms!
+
+// technique proposed by Mateusz Naƛciszewski (afaik)
+// implemented by John Tromp in Haskell
+
+#include <log.h>
+#include <impl.h>
+#include <stdint.h>
+
+static void encode_index(size_t n, size_t i, FILE *fp)
+{
+	if (n == 0)
+		fatal("invalid index (must be closed)\n");
+
+	if (i == 0) {
+		fprintf(fp, "%s", n == 1 ? "1" : "10");
+	} else {
+		fprintf(fp, "1");
+		encode_index(n - 1, i - 1, fp);
+	}
+}
+
+static void encode_level(size_t n, Term *term, FILE *fp)
+{
+	switch (term->type) {
+	case ABS:
+		fprintf(fp, "%s", n == 0 ? "0" : "00");
+		encode_level(n + 1, term->u.abs.body, fp);
+		break;
+	case APP:
+		fprintf(fp, "%s", n == 0 ? "1" : "01");
+		encode_level(n, term->u.app.lhs, fp);
+		encode_level(n, term->u.app.rhs, fp);
+		break;
+	case IDX:
+		encode_index(n, term->u.index, fp);
+		break;
+	default:
+		fatal("invalid type %d\n", term->type);
+	}
+}
+
+static void encode(Term *term, FILE *fp)
+{
+	encode_level(0, term, fp);
+}
+
+static Term *decode_level(size_t n, FILE *fp)
+{
+	Term *res = 0;
+
+	char a = getc(fp);
+
+	if (a == '0' && n == 0) {
+		res = term_new(ABS);
+		res->u.abs.body = decode_level(n + 1, fp);
+	} else if (a == '0') {
+		char b = getc(fp);
+
+		if (b == '0') {
+			res = term_new(ABS);
+			res->u.abs.body = decode_level(n + 1, fp);
+		} else if (b == '1') {
+			res = term_new(APP);
+			res->u.app.lhs = decode_level(n, fp);
+			res->u.app.rhs = decode_level(n, fp);
+		}
+	} else if (a == '1' && n == 0) {
+		res = term_new(APP);
+		res->u.app.lhs = decode_level(n, fp);
+		res->u.app.rhs = decode_level(n, fp);
+	} else if (a == '1') {
+		res = term_new(IDX);
+		res->u.index = 0;
+		while (--n != 0 && getc(fp) == '1')
+			res->u.index++;
+	}
+
+	if (!res)
+		fatal("invalid parsing state!\n");
+
+	return res;
+}
+
+static Term *decode(FILE *fp)
+{
+	return decode_level(0, fp);
+}
+
+Impl impl_closed(void)
+{
+	return (Impl){
+		.name = "closed",
+		.encode = encode,
+		.decode = decode,
+	};
+}
diff --git a/src/impl/mateusz.c b/src/impl/mateusz.c
deleted file mode 100644
index e69de29..0000000
diff --git a/src/main.c b/src/main.c
index 802bfca..75dbb6a 100644
--- a/src/main.c
+++ b/src/main.c
@@ -1,15 +1,85 @@
 // Copyright (c) 2024, Marvin Borner <dev@marvinborner.de>
 // SPDX-License-Identifier: MIT
 
+#define _DEFAULT_SOURCE
 #include <stdio.h>
+#include <dirent.h>
+#include <sys/stat.h>
 
 #include <term.h>
 #include <parse.h>
 #include <print.h>
+#include <impl.h>
+
+#define N_IMPLS 5
+
+static size_t file_size(FILE *fp)
+{
+	struct stat st;
+	fstat(fileno(fp), &st);
+	return st.st_size;
+}
+
+static size_t test_impl(Impl impl, Term *term, FILE *out)
+{
+	rewind(out);
+	impl.encode(term, out);
+	size_t size = ftell(out);
+	rewind(out);
+
+	Term *recon = impl.decode(out);
+	term_diff(term, recon);
+	term_free(recon);
+
+	return size;
+}
+
+static void test_all(Impl impls[N_IMPLS], const char *dir_path, FILE *out)
+{
+	char path[1024];
+	struct dirent *dir_entry;
+	DIR *dir = opendir(dir_path);
+	while ((dir_entry = readdir(dir))) {
+		if (dir_entry->d_type != DT_REG)
+			continue;
+
+		sprintf(path, "%s/%s", dir_path, dir_entry->d_name);
+		printf("%s\n", path);
+
+		FILE *fp = fopen(path, "rb");
+
+		Term *term = parse_blc_fp(fp);
+
+		for (Impl *impl = impls; impl != impls + N_IMPLS; impl++) {
+			size_t prev = file_size(fp);
+			size_t after = test_impl(*impl, term, out);
+			printf("%s: %lu -> %lu (%f%% reduction)\n", impl->name,
+			       prev, after,
+			       (double)((long)prev - (long)after) /
+				       (double)prev * 100);
+		}
+
+		term_free(term);
+		fclose(fp);
+
+		printf("\n");
+	}
+	closedir(dir);
+}
 
 int main(void)
 {
-	Term *term = parse_blc_fp(stdin);
-	print_bruijn(term);
-	printf("\n");
+	Impl impls[N_IMPLS] = { impl_blc(), impl_blc2(), impl_closed(),
+				impl_abs(), impl_app() };
+
+	FILE *out = tmpfile();
+
+	const char *dir_path = "test/small";
+	test_all(impls, dir_path, out);
+	dir_path = "test/medium";
+	test_all(impls, dir_path, out);
+	/* dir_path = "test/large"; */
+	/* test_all(impls, dir_path, out); */
+
+	fclose(out);
 }
diff --git a/src/parse.c b/src/parse.c
index f2f89cd..86b084f 100644
--- a/src/parse.c
+++ b/src/parse.c
@@ -16,15 +16,12 @@ Term *parse_blc_fp(FILE *fp)
 		if (b == '0') {
 			res = term_new(ABS);
 			res->u.abs.body = parse_blc_fp(fp);
-		}
-		if (b == '1') {
+		} else if (b == '1') {
 			res = term_new(APP);
 			res->u.app.lhs = parse_blc_fp(fp);
 			res->u.app.rhs = parse_blc_fp(fp);
 		}
-	}
-
-	if (a == '1') {
+	} else if (a == '1') {
 		res = term_new(IDX);
 		res->u.index = 0;
 		while (getc(fp) == '1')
diff --git a/src/print.c b/src/print.c
index b6493e1..35278a1 100644
--- a/src/print.c
+++ b/src/print.c
@@ -28,25 +28,3 @@ void print_bruijn(Term *term)
 		fatal("invalid type %d\n", term->type);
 	}
 }
-
-void print_blc(Term *term)
-{
-	switch (term->type) {
-	case ABS:
-		printf("00");
-		print_blc(term->u.abs.body);
-		break;
-	case APP:
-		printf("01");
-		print_blc(term->u.app.lhs);
-		print_blc(term->u.app.rhs);
-		break;
-	case IDX:
-		for (size_t i = 0; i <= term->u.index; i++)
-			printf("1");
-		printf("0");
-		break;
-	default:
-		fatal("invalid type %d\n", term->type);
-	}
-}
diff --git a/src/term.c b/src/term.c
index e68611e..3074bfc 100644
--- a/src/term.c
+++ b/src/term.c
@@ -44,7 +44,8 @@ void term_diff(Term *a, Term *b)
 		print_bruijn(a);
 		fprintf(stderr, "\nTerm b: ");
 		print_bruijn(b);
-		fatal("\ntype mismatch %d %d\n", a->type, b->type);
+		fprintf(stderr, "\n");
+		fatal("type mismatch %d %d\n", a->type, b->type);
 	}
 
 	switch (a->type) {
-- 
cgit v1.2.3