diff options
Diffstat (limited to 'src/tokenize.c')
-rw-r--r-- | src/tokenize.c | 175 |
1 files changed, 175 insertions, 0 deletions
diff --git a/src/tokenize.c b/src/tokenize.c new file mode 100644 index 0000000..3424454 --- /dev/null +++ b/src/tokenize.c @@ -0,0 +1,175 @@ +#include <assert.h> +#include <ctype.h> +#include <log.h> +#include <stdbool.h> +#include <stdio.h> +#include <tokenize.h> + +static char next_non_alnum(struct ctx *ctx, size_t start) +{ + for (size_t i = start; i < ctx->size; i++) + if (!isalnum(ctx->data[i])) + return ctx->data[i]; + + errln(ctx, "Unexpected end of buffer"); +} + +static bool peek_to_is_alnum(struct ctx *ctx, size_t start, char ch) +{ + for (size_t i = start; i < ctx->size; i++) { + char cur = ctx->data[i]; + + if (cur == ch || cur == ';' || cur == ')') + return true; + + if (!isalnum(cur)) + return false; + } + + errln(ctx, "Unexpected end of buffer"); +} + +static size_t peek_alnum_to(struct ctx *ctx, size_t start, char ch) +{ + for (size_t i = start; i < ctx->size; i++) { + char cur = ctx->data[i]; + + if (cur == ch || cur == ';' || cur == ')') + return i; + + if (!isalnum(cur)) + errln(ctx, "'%c' is not alpha-numeric", cur); + } + + errln(ctx, "Unexpected end of buffer"); +} + +static size_t peek_special_to(struct ctx *ctx, size_t start, char ch) +{ + for (size_t i = start; i < ctx->size; i++) { + char cur = ctx->data[i]; + + if (cur == ch || cur == ';' || cur == ')') + return i; + + if (isalnum(cur) || cur < '!' || cur > '~') + errln(ctx, "'%c' is not special", cur); + } + + errln(ctx, "Unexpected end of buffer"); +} + +static size_t peek_to(struct ctx *ctx, size_t start, char ch) +{ + for (size_t i = start; i < ctx->size; i++) { + char cur = ctx->data[i]; + + if (cur == ch || cur == ';' || cur == ')') + return i; + } + + errln(ctx, "Unexpected end of buffer"); +} + +static void token_add(struct ctx *ctx, enum token_type type, size_t start, size_t end) +{ + struct token token = { 0 }; + token.type = type; + token.start = start; + token.end = end; + + assert(++ctx->token_count < TOKENS_MAX); + ctx->tokens[ctx->token_count] = token; + + if (type == NEWLINE) { + ctx->line++; + ctx->column = 0; + } else { + ctx->column += end - start; + } +} + +static void token_print(struct ctx *ctx, struct token *token) +{ + assert(token->type != UNKNOWN); + + printf("[token type=%d] '", token->type); + if (token->type == NEWLINE || token->type == END) { + printf("' (Unprintable)\n"); + return; + } + + for (size_t i = token->start; i < token->end; i++) + printf("%c", ctx->data[i]); + printf("'\n"); +} + +void tokens_print(struct ctx *ctx) +{ + for (size_t i = 1; i < ctx->token_count; i++) + token_print(ctx, &ctx->tokens[i]); +} + +void tokenize(struct ctx *ctx) +{ + for (size_t i = 0; i < ctx->size; i++) { + const char cur = ctx->data[i]; + + switch (cur) { + case '\0': + token_add(ctx, END, i, i + 1); + return; + case '\n': + token_add(ctx, NEWLINE, i, i + 1); + continue; + case ';': + token_add(ctx, EOL, i, i + 1); + continue; + case '(': + token_add(ctx, LPAREN, i, i + 1); + continue; + case ')': + token_add(ctx, RPAREN, i, i + 1); + continue; + case '=': + token_add(ctx, EQUAL, i, i + 1); + continue; + case ' ': + ctx->column++; + continue; + default: + break; + } + + if (next_non_alnum(ctx, i) == ':') { // Type with param identifier + size_t start_param = peek_alnum_to(ctx, i, ':') + 1; + size_t end_param; + if (peek_to_is_alnum(ctx, start_param, ' ')) { + end_param = peek_alnum_to(ctx, start_param, ' '); + } else { // Unnamed identifier ('_') + end_param = peek_to(ctx, start_param, ' '); + if (end_param - start_param != 1 || ctx->data[start_param] != '_') + errln(ctx, "Invalid param identifier"); + } + + token_add(ctx, TYPE, i, start_param - 1); + token_add(ctx, TYPEDELIM, start_param - 1, start_param); + token_add(ctx, PARAM, start_param, end_param); + + i = end_param - 1; + continue; + } + + if (peek_to_is_alnum(ctx, i, ' ')) { // General identifier + size_t end_ident = peek_alnum_to(ctx, i, ' '); + token_add(ctx, IDENT, i, end_ident); + i = end_ident - 1; + } else { // Special/custom operator + size_t end_operator = peek_special_to(ctx, i, ' '); + token_add(ctx, OPERATOR, i, end_operator); + i = end_operator - 1; + } + } + + context_rewind(ctx); +} |