From f181a8f04dfdfd8829861e0d0d549f39e40081e6 Mon Sep 17 00:00:00 2001 From: Marvin Borner Date: Thu, 13 May 2021 12:03:43 +0200 Subject: New layout --- src/tokenize.c | 175 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 175 insertions(+) create mode 100644 src/tokenize.c (limited to 'src/tokenize.c') diff --git a/src/tokenize.c b/src/tokenize.c new file mode 100644 index 0000000..3424454 --- /dev/null +++ b/src/tokenize.c @@ -0,0 +1,175 @@ +#include +#include +#include +#include +#include +#include + +static char next_non_alnum(struct ctx *ctx, size_t start) +{ + for (size_t i = start; i < ctx->size; i++) + if (!isalnum(ctx->data[i])) + return ctx->data[i]; + + errln(ctx, "Unexpected end of buffer"); +} + +static bool peek_to_is_alnum(struct ctx *ctx, size_t start, char ch) +{ + for (size_t i = start; i < ctx->size; i++) { + char cur = ctx->data[i]; + + if (cur == ch || cur == ';' || cur == ')') + return true; + + if (!isalnum(cur)) + return false; + } + + errln(ctx, "Unexpected end of buffer"); +} + +static size_t peek_alnum_to(struct ctx *ctx, size_t start, char ch) +{ + for (size_t i = start; i < ctx->size; i++) { + char cur = ctx->data[i]; + + if (cur == ch || cur == ';' || cur == ')') + return i; + + if (!isalnum(cur)) + errln(ctx, "'%c' is not alpha-numeric", cur); + } + + errln(ctx, "Unexpected end of buffer"); +} + +static size_t peek_special_to(struct ctx *ctx, size_t start, char ch) +{ + for (size_t i = start; i < ctx->size; i++) { + char cur = ctx->data[i]; + + if (cur == ch || cur == ';' || cur == ')') + return i; + + if (isalnum(cur) || cur < '!' || cur > '~') + errln(ctx, "'%c' is not special", cur); + } + + errln(ctx, "Unexpected end of buffer"); +} + +static size_t peek_to(struct ctx *ctx, size_t start, char ch) +{ + for (size_t i = start; i < ctx->size; i++) { + char cur = ctx->data[i]; + + if (cur == ch || cur == ';' || cur == ')') + return i; + } + + errln(ctx, "Unexpected end of buffer"); +} + +static void token_add(struct ctx *ctx, enum token_type type, size_t start, size_t end) +{ + struct token token = { 0 }; + token.type = type; + token.start = start; + token.end = end; + + assert(++ctx->token_count < TOKENS_MAX); + ctx->tokens[ctx->token_count] = token; + + if (type == NEWLINE) { + ctx->line++; + ctx->column = 0; + } else { + ctx->column += end - start; + } +} + +static void token_print(struct ctx *ctx, struct token *token) +{ + assert(token->type != UNKNOWN); + + printf("[token type=%d] '", token->type); + if (token->type == NEWLINE || token->type == END) { + printf("' (Unprintable)\n"); + return; + } + + for (size_t i = token->start; i < token->end; i++) + printf("%c", ctx->data[i]); + printf("'\n"); +} + +void tokens_print(struct ctx *ctx) +{ + for (size_t i = 1; i < ctx->token_count; i++) + token_print(ctx, &ctx->tokens[i]); +} + +void tokenize(struct ctx *ctx) +{ + for (size_t i = 0; i < ctx->size; i++) { + const char cur = ctx->data[i]; + + switch (cur) { + case '\0': + token_add(ctx, END, i, i + 1); + return; + case '\n': + token_add(ctx, NEWLINE, i, i + 1); + continue; + case ';': + token_add(ctx, EOL, i, i + 1); + continue; + case '(': + token_add(ctx, LPAREN, i, i + 1); + continue; + case ')': + token_add(ctx, RPAREN, i, i + 1); + continue; + case '=': + token_add(ctx, EQUAL, i, i + 1); + continue; + case ' ': + ctx->column++; + continue; + default: + break; + } + + if (next_non_alnum(ctx, i) == ':') { // Type with param identifier + size_t start_param = peek_alnum_to(ctx, i, ':') + 1; + size_t end_param; + if (peek_to_is_alnum(ctx, start_param, ' ')) { + end_param = peek_alnum_to(ctx, start_param, ' '); + } else { // Unnamed identifier ('_') + end_param = peek_to(ctx, start_param, ' '); + if (end_param - start_param != 1 || ctx->data[start_param] != '_') + errln(ctx, "Invalid param identifier"); + } + + token_add(ctx, TYPE, i, start_param - 1); + token_add(ctx, TYPEDELIM, start_param - 1, start_param); + token_add(ctx, PARAM, start_param, end_param); + + i = end_param - 1; + continue; + } + + if (peek_to_is_alnum(ctx, i, ' ')) { // General identifier + size_t end_ident = peek_alnum_to(ctx, i, ' '); + token_add(ctx, IDENT, i, end_ident); + i = end_ident - 1; + } else { // Special/custom operator + size_t end_operator = peek_special_to(ctx, i, ' '); + token_add(ctx, OPERATOR, i, end_operator); + i = end_operator - 1; + } + } + + context_rewind(ctx); +} -- cgit v1.2.3