about summary refs log tree commit diff
path: root/src/tokenize.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/tokenize.c')
-rw-r--r--src/tokenize.c175
1 files changed, 175 insertions, 0 deletions
diff --git a/src/tokenize.c b/src/tokenize.c
new file mode 100644
index 0000000..3424454
--- /dev/null
+++ b/src/tokenize.c
@@ -0,0 +1,175 @@
+#include <assert.h>
+#include <ctype.h>
+#include <log.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <tokenize.h>
+
+static char next_non_alnum(struct ctx *ctx, size_t start)
+{
+ for (size_t i = start; i < ctx->size; i++)
+ if (!isalnum(ctx->data[i]))
+ return ctx->data[i];
+
+ errln(ctx, "Unexpected end of buffer");
+}
+
+static bool peek_to_is_alnum(struct ctx *ctx, size_t start, char ch)
+{
+ for (size_t i = start; i < ctx->size; i++) {
+ char cur = ctx->data[i];
+
+ if (cur == ch || cur == ';' || cur == ')')
+ return true;
+
+ if (!isalnum(cur))
+ return false;
+ }
+
+ errln(ctx, "Unexpected end of buffer");
+}
+
+static size_t peek_alnum_to(struct ctx *ctx, size_t start, char ch)
+{
+ for (size_t i = start; i < ctx->size; i++) {
+ char cur = ctx->data[i];
+
+ if (cur == ch || cur == ';' || cur == ')')
+ return i;
+
+ if (!isalnum(cur))
+ errln(ctx, "'%c' is not alpha-numeric", cur);
+ }
+
+ errln(ctx, "Unexpected end of buffer");
+}
+
+static size_t peek_special_to(struct ctx *ctx, size_t start, char ch)
+{
+ for (size_t i = start; i < ctx->size; i++) {
+ char cur = ctx->data[i];
+
+ if (cur == ch || cur == ';' || cur == ')')
+ return i;
+
+ if (isalnum(cur) || cur < '!' || cur > '~')
+ errln(ctx, "'%c' is not special", cur);
+ }
+
+ errln(ctx, "Unexpected end of buffer");
+}
+
+static size_t peek_to(struct ctx *ctx, size_t start, char ch)
+{
+ for (size_t i = start; i < ctx->size; i++) {
+ char cur = ctx->data[i];
+
+ if (cur == ch || cur == ';' || cur == ')')
+ return i;
+ }
+
+ errln(ctx, "Unexpected end of buffer");
+}
+
+static void token_add(struct ctx *ctx, enum token_type type, size_t start, size_t end)
+{
+ struct token token = { 0 };
+ token.type = type;
+ token.start = start;
+ token.end = end;
+
+ assert(++ctx->token_count < TOKENS_MAX);
+ ctx->tokens[ctx->token_count] = token;
+
+ if (type == NEWLINE) {
+ ctx->line++;
+ ctx->column = 0;
+ } else {
+ ctx->column += end - start;
+ }
+}
+
+static void token_print(struct ctx *ctx, struct token *token)
+{
+ assert(token->type != UNKNOWN);
+
+ printf("[token type=%d] '", token->type);
+ if (token->type == NEWLINE || token->type == END) {
+ printf("' (Unprintable)\n");
+ return;
+ }
+
+ for (size_t i = token->start; i < token->end; i++)
+ printf("%c", ctx->data[i]);
+ printf("'\n");
+}
+
+void tokens_print(struct ctx *ctx)
+{
+ for (size_t i = 1; i < ctx->token_count; i++)
+ token_print(ctx, &ctx->tokens[i]);
+}
+
/*
 * Tokenize ctx->data into ctx->tokens.
 *
 * Single-character tokens ('\n', ';', '(', ')', '=') are handled by
 * the switch; '\0' emits END and stops; a space only advances the
 * column counter. Everything else is classified by lookahead:
 *   - if the next non-alphanumeric character is ':', the run is a
 *     "type:param" pair -> TYPE, TYPEDELIM, PARAM (a lone '_' is an
 *     allowed unnamed param);
 *   - a purely alphanumeric run up to ' '/';'/')' -> IDENT;
 *   - otherwise a run of printable non-alphanumeric chars -> OPERATOR.
 * The peek helpers errln() out on malformed input or end of buffer,
 * so this loop never sees a partial match.
 */
void tokenize(struct ctx *ctx)
{
	for (size_t i = 0; i < ctx->size; i++) {
		const char cur = ctx->data[i];

		switch (cur) {
		case '\0':
			/* NOTE(review): this path returns without the
			 * context_rewind() done below — confirm END is
			 * meant to skip the rewind. */
			token_add(ctx, END, i, i + 1);
			return;
		case '\n':
			token_add(ctx, NEWLINE, i, i + 1);
			continue;
		case ';':
			token_add(ctx, EOL, i, i + 1);
			continue;
		case '(':
			token_add(ctx, LPAREN, i, i + 1);
			continue;
		case ')':
			token_add(ctx, RPAREN, i, i + 1);
			continue;
		case '=':
			token_add(ctx, EQUAL, i, i + 1);
			continue;
		case ' ':
			/* Whitespace produces no token, only position. */
			ctx->column++;
			continue;
		default:
			break;
		}

		if (next_non_alnum(ctx, i) == ':') { // Type with param identifier
			/* start_param = index just past the ':' delimiter. */
			size_t start_param = peek_alnum_to(ctx, i, ':') + 1;
			size_t end_param;
			if (peek_to_is_alnum(ctx, start_param, ' ')) {
				end_param = peek_alnum_to(ctx, start_param, ' ');
			} else { // Unnamed identifier ('_')
				end_param = peek_to(ctx, start_param, ' ');
				if (end_param - start_param != 1 || ctx->data[start_param] != '_')
					errln(ctx, "Invalid param identifier");
			}

			/* Emit the three pieces: type name, the ':' itself,
			 * then the param name. */
			token_add(ctx, TYPE, i, start_param - 1);
			token_add(ctx, TYPEDELIM, start_param - 1, start_param);
			token_add(ctx, PARAM, start_param, end_param);

			/* Resume at the delimiter; the loop's i++ lands on it. */
			i = end_param - 1;
			continue;
		}

		if (peek_to_is_alnum(ctx, i, ' ')) { // General identifier
			size_t end_ident = peek_alnum_to(ctx, i, ' ');
			token_add(ctx, IDENT, i, end_ident);
			i = end_ident - 1;
		} else { // Special/custom operator
			size_t end_operator = peek_special_to(ctx, i, ' ');
			token_add(ctx, OPERATOR, i, end_operator);
			i = end_operator - 1;
		}
	}

	context_rewind(ctx);
}