Added testing flag

still not able to find the 8cc bug but idc because I tested it with many programs and it probably won't be an issue for now
author: Marvin Borner 2023-05-20 14:13:45 +0200
committer: Marvin Borner 2023-05-20 14:13:45 +0200
commit: 8499010b91a2c7496d6af74cce35a6b4e0378633 (patch)
tree: c57a6a6fb12f4653928662e93d4470453068e361
parent: f640ceee89836b56ac95c4eb1b0a43d1171c3354 (diff)
9 files changed, 154 insertions, 83 deletions
diff --git a/inc/build.h b/inc/build.h
index 2d99f9d..0b76768 100644
--- a/inc/build.h
+++ b/inc/build.h
@@ -4,11 +4,13 @@
 #ifndef BLOC_BUILD_H
 #define BLOC_BUILD_H
 
+#include <stdio.h>
+
 #include <spec.h>
 #include <tree.h>
 #include <parse.h>
 
-void write_bloc(struct list *table, const char *path);
-void write_blc(struct bloc_parsed *bloc, const char *path);
+void write_bloc(struct list *table, FILE *file);
+void write_blc(struct bloc_parsed *bloc, FILE *file);
 
 #endif
diff --git a/inc/parse.h b/inc/parse.h
index 205f204..8eba970 100644
--- a/inc/parse.h
+++ b/inc/parse.h
@@ -17,6 +17,5 @@ struct bloc_parsed {
 struct term *parse_blc(const char *term);
 struct bloc_parsed *parse_bloc(const void *bloc);
 void free_bloc(struct bloc_parsed *bloc);
-struct term *from_bloc(struct bloc_parsed *bloc);
 
 #endif
diff --git a/inc/term.h b/inc/term.h
index 1fe5a91..d368f0e 100644
--- a/inc/term.h
+++ b/inc/term.h
@@ -29,5 +29,6 @@ struct term {
 
 struct term *new_term(term_type type);
 void free_term(struct term *term);
+void diff_term(struct term *a, struct term *b);
 
 #endif
diff --git a/options.ggo b/options.ggo
index 11626b3..7cf6b77 100644
--- a/options.ggo
+++ b/options.ggo
@@ -8,3 +8,4 @@ option "verbose" v "enable debug logging output" flag off
 option "from-blc" b "convert from BLC to BLoC" flag off
 option "from-bloc" B "convert from BLoC to BLC" flag off
 option "dump" d "dump bloc file" dependon="from-bloc" flag off
+option "test" t "compare BLC with generated BLoC" dependon="from-blc" flag off
diff --git a/readme.md b/readme.md
index d42b2ab..80da044 100644
--- a/readme.md
+++ b/readme.md
@@ -71,7 +71,7 @@ A possible encoding in BLoC:
 | 0x04 | 0x06 | number of entries: 3                                                                                            |
 | 0x06 | 0x17 | encoded `M`: gets a bit longer due to different encoding                                                        |
 | 0x17 | 0x28 | encoded `N`: gets a bit longer due to different encoding                                                        |
-| 0x28 | 0x34 | `00010010010011<M>00011<N>011<M>011<N>`, where `<M>=00{1}` and `<N>=00{1}` are indices with length of 1.25 byte |
+| 0x28 | 0x34 | `00010010010011<M>00011<N>011<M>011<N>`, where `<M>=00{1}` and `<N>=00{0}` are indices with length of 1.25 byte |
 
 Even in this small example BLoC uses less space than BLC (0x34 vs. 0x42
 bytes). Depending on the content of `M` and `N`, this could have
@@ -95,12 +95,18 @@ shortest one (as fully reduced expressions aren’t necessarily shorter).
 
 ## Improvements
 
-The current optimizer does not always make the best deduplication
+There seem to be problems with *very* big files:
+[8cc](https://github.com/woodrush/lambda-8cc) does not pass the bloc-blc
+comparison test. I’ve not been able to reproduce this bug with any other
+file and 8cc itself is too huge to comfortably debug the issue. If
+you’re reading this: Please help me :(
+
+Also the current optimizer does not always make the best deduplication
 choices. It seems like finding the optimal deduplications requires quite
 complex algorithms which would probably be rather inefficient.
 
 For example, as of right now the length of an expression as seen by the
-deduplicator doesn’t consider the change of length when sub-expressions
-get replaced by a reference (of unknown bit length!) to another
-expression. This results in entries like `(0 <1234>)` that would not
-have needed to be deduplicated.
+deduplicator doesn’t consider the change of occurrence count when
+sub-expressions get replaced by a reference to another expression. This
+results in entries like `(0 <1234>)` that would not have needed to be
+deduplicated.
diff --git a/src/build.c b/src/build.c
index fff5651..9fdcad4 100644
--- a/src/build.c
+++ b/src/build.c
@@ -67,12 +67,9 @@ static void write_bblc(struct tree *tree, FILE *file)
 		fwrite(&byte, 1, 1, file);
 }
 
-void write_bloc(struct list *table, const char *path)
+static void write_bloc_file(struct list *table, FILE *file)
 {
 	short length = table->val;
-	debug("writing bloc with %ld elements to %s\n", length, path);
-
-	FILE *file = fopen(path, "wb");
 	fwrite(BLOC_IDENTIFIER, BLOC_IDENTIFIER_LENGTH, 1, file);
 	fwrite(&length, 2, 1, file);
 
@@ -81,8 +78,14 @@ void write_bloc(struct list *table, const char *path)
 		write_bblc(iterator->data, file);
 		iterator = iterator->next;
 	}
+}
 
-	fclose(file);
+void write_bloc(struct list *table, FILE *file)
+{
+	short length = table->val;
+	debug("writing bloc with %ld elements\n", length);
+
+	write_bloc_file(table, file);
 }
 
 static void fprint_bloc_blc(struct term *term, struct bloc_parsed *bloc,
@@ -115,12 +118,8 @@ static void fprint_bloc_blc(struct term *term, struct bloc_parsed *bloc,
 	}
 }
 
-void write_blc(struct bloc_parsed *bloc, const char *path)
+void write_blc(struct bloc_parsed *bloc, FILE *file)
 {
-	FILE *file = fopen(path, "wb");
-
 	fprint_bloc_blc(bloc->entries[bloc->length - 1], bloc, file);
 	fprintf(file, "\n");
-
-	fclose(file);
 }
diff --git a/src/main.c b/src/main.c
index a5faa80..4258281 100644
--- a/src/main.c
+++ b/src/main.c
@@ -44,13 +44,8 @@ static char *read_stdin(void)
 	return string;
 }
 
-static char *read_file(const char *path)
+static char *read_file(FILE *f)
 {
-	debug("reading from %s\n", path);
-	FILE *f = fopen(path, "rb");
-	if (!f)
-		fatal("can't open file %s: %s\n", path, strerror(errno));
-
 	fseek(f, 0, SEEK_END);
 	long fsize = ftell(f);
 	fseek(f, 0, SEEK_SET);
@@ -59,17 +54,104 @@ static char *read_file(const char *path)
 	if (!string)
 		fatal("out of memory!\n");
 	int ret = fread(string, fsize, 1, f);
-	fclose(f);
 
 	if (ret != 1) {
 		free(string);
-		fatal("can't read file %s: %s\n", path, strerror(errno));
+		fatal("can't read file: %s\n", strerror(errno));
 	}
 
 	string[fsize] = 0;
 	return string;
 }
 
+static char *read_path(const char *path)
+{
+	debug("reading from %s\n", path);
+	FILE *f = fopen(path, "rb");
+	if (!f)
+		fatal("can't open file %s: %s\n", path, strerror(errno));
+	char *string = read_file(f);
+	fclose(f);
+	return string;
+}
+
+static void test(char *input)
+{
+	debug("parsing as blc\n");
+
+	struct term *parsed_1 = parse_blc(input);
+	free(input);
+	debug("parsed original blc\n");
+
+	debug("merging duplicates\n");
+	struct list *table = tree_merge_duplicates(parsed_1);
+
+	FILE *temp_bloc = tmpfile();
+	write_bloc(table, temp_bloc);
+	tree_destroy(table);
+
+	debug("parsing as bloc\n");
+	char *temp = read_file(temp_bloc);
+	struct bloc_parsed *bloc = parse_bloc(temp);
+	fseek(temp_bloc, 0, SEEK_END);
+	fprintf(stderr, "size bloc: %lu\n", ftell(temp_bloc));
+	fclose(temp_bloc);
+
+	FILE *temp_blc = tmpfile();
+	write_blc(bloc, temp_blc);
+	char *input_2 = read_file(temp_blc);
+	struct term *parsed_2 = parse_blc(input_2);
+	fseek(temp_blc, 0, SEEK_END);
+	fprintf(stderr, "size blc: ~%lu\n", ftell(temp_blc) / 8);
+	fclose(temp_blc);
+	free(input_2);
+	debug("parsed reconstructed blc\n");
+
+	diff_term(parsed_1, parsed_2);
+	debug("diffed two terms\n");
+
+	free_term(parsed_1);
+	free_term(parsed_2);
+	debug("done!\n");
+}
+
+static void from_blc(char *input, char *output_path)
+{
+	debug("parsing as blc\n");
+
+	struct term *parsed = parse_blc(input);
+	debug("parsed blc\n");
+
+	debug("merging duplicates\n");
+	struct list *table = tree_merge_duplicates(parsed);
+
+	FILE *file = fopen(output_path, "wb");
+	write_bloc(table, file);
+	fclose(file);
+
+	tree_destroy(table);
+	free_term(parsed);
+	free(input);
+
+	debug("done!\n");
+}
+
+static void from_bloc(char *input, char *output_path, int dump)
+{
+	debug("parsing as bloc\n");
+
+	struct bloc_parsed *bloc = parse_bloc(input);
+	if (dump)
+		print_bloc(bloc);
+
+	FILE *file = fopen(output_path, "wb");
+	write_blc(bloc, file);
+	fclose(file);
+
+	free(input);
+	free_bloc(bloc);
+}
+
 int main(int argc, char **argv)
 {
 	struct gengetopt_args_info args;
@@ -82,40 +164,24 @@ int main(int argc, char **argv)
 	if (args.input_arg[0] == '-') {
 		input = read_stdin();
 	} else {
-		input = read_file(args.input_arg);
+		input = read_path(args.input_arg);
 	}
 
 	if (!input)
 		return 1;
 
-	if (args.from_blc_flag && !args.from_bloc_flag) {
-		debug("parsing as blc\n");
-
-		struct term *parsed = parse_blc(input);
-		debug("parsed blc\n");
-
-		debug("merging duplicates\n");
-		struct list *table = tree_merge_duplicates(parsed);
-
-		write_bloc(table, args.output_arg);
-
-		tree_destroy(table);
-		free_term(parsed);
-		free(input);
+	if (args.test_flag && args.from_blc_flag && !args.from_bloc_flag) {
+		test(input);
+		return 0;
+	}
 
-		debug("done!\n");
+	if (args.from_blc_flag && !args.from_bloc_flag) {
+		from_blc(input, args.output_arg);
 		return 0;
 	}
 
 	if (args.from_bloc_flag && !args.from_blc_flag) {
-		debug("parsing as bloc\n");
-
-		struct bloc_parsed *bloc = parse_bloc(input);
-		if (args.dump_flag)
-			print_bloc(bloc);
-		write_blc(bloc, args.output_arg);
-		free(input);
-		free_bloc(bloc);
+		from_bloc(input, args.output_arg, args.dump_flag);
 		return 0;
 	}
 
diff --git a/src/parse.c b/src/parse.c
index 9a8af45..f6e047d 100644
--- a/src/parse.c
+++ b/src/parse.c
@@ -120,35 +120,3 @@ void free_bloc(struct bloc_parsed *bloc)
 	free(bloc->entries);
 	free(bloc);
 }
-
-static struct term *rec_bloc(struct term *term, struct bloc_parsed *bloc)
-{
-	switch (term->type) {
-	case ABS:
-		rec_bloc(term->u.abs.term, bloc);
-		break;
-	case APP:
-		rec_bloc(term->u.app.lhs, bloc);
-		rec_bloc(term->u.app.rhs, bloc);
-		break;
-	case VAR:
-		break;
-	case REF:
-		if (term->u.ref.index >= bloc->length)
-			fatal("invalid entry reference\n");
-		memcpy(term,
-		       bloc->entries[bloc->length - term->u.ref.index - 1],
-		       sizeof(*term));
-		break;
-	default:
-		fatal("invalid type %d\n", term->type);
-		return 0;
-	}
-	return term;
-}
-
-struct term *from_bloc(struct bloc_parsed *bloc)
-{
-	struct term *last = bloc->entries[bloc->length - 1];
-	return rec_bloc(last, bloc);
-}
diff --git a/src/term.c b/src/term.c
index 63bbb05..2ffc1c8 100644
--- a/src/term.c
+++ b/src/term.c
@@ -6,6 +6,7 @@
 #include <string.h>
 
 #include <term.h>
+#include <print.h>
 #include <log.h>
 
 struct term *new_term(term_type type)
@@ -39,3 +40,31 @@ void free_term(struct term *term)
 		fatal("invalid type %d\n", term->type);
 	}
 }
+
+void diff_term(struct term *a, struct term *b)
+{
+	if (a->type != b->type) {
+		fprintf(stderr, "Term a: ");
+		print_bruijn(a);
+		fprintf(stderr, "\nTerm b: ");
+		print_bruijn(b);
+		fatal("\ntype mismatch %d %d\n", a->type, b->type);
+	}
+
+	switch (a->type) {
+	case ABS:
+		diff_term(a->u.abs.term, b->u.abs.term);
+		break;
+	case APP:
+		diff_term(a->u.app.lhs, b->u.app.lhs);
+		diff_term(a->u.app.rhs, b->u.app.rhs);
+		break;
+	case VAR:
+		if (a->u.var.index != b->u.var.index)
+			fatal("var mismatch %d=%d\n", a->u.var.index,
+			      b->u.var.index);
+		break;
+	default:
+		fatal("invalid type %d\n", a->type);
+	}
+}
author	Marvin Borner	2023-05-20 14:13:45 +0200
committer	Marvin Borner	2023-05-20 14:13:45 +0200
commit	8499010b91a2c7496d6af74cce35a6b4e0378633 (patch)
tree	c57a6a6fb12f4653928662e93d4470453068e361
parent	f640ceee89836b56ac95c4eb1b0a43d1171c3354 (diff)