diff options
Diffstat (limited to 'src/tokenize.c')
-rw-r--r-- | src/tokenize.c | 241 |
1 files changed, 154 insertions, 87 deletions
diff --git a/src/tokenize.c b/src/tokenize.c index 2a0eea6..0fa58fe 100644 --- a/src/tokenize.c +++ b/src/tokenize.c @@ -1,89 +1,91 @@ #include <assert.h> #include <ctype.h> +#include <stdarg.h> #include <stdbool.h> #include <stddef.h> #include <stdio.h> #include <log.h> +#include <preprocess.h> #include <tokenize.h> -static char next_non_alnum(struct ctx *ctx, size_t start) -{ - for (size_t i = start; i < ctx->size; i++) - if (!isalnum(ctx->data[i])) - return ctx->data[i]; - - errln(&ctx->location, "Unexpected end of buffer"); -} +// TODO: Do some different limitations for identifiers/types -static bool peek_to_is_alnum(struct ctx *ctx, size_t start, char ch) +static size_t peek_identifier(struct ctx *ctx, size_t start, size_t opt_count, ...) { - for (size_t i = start; i < ctx->size; i++) { - char cur = ctx->data[i]; - - if (cur == ch || cur == ';' || cur == ')') - return true; - - if (!isalnum(cur)) - return false; - } - - errln(&ctx->location, "Unexpected end of buffer"); -} + if (isdigit(context_getch(ctx, start))) + errln(&ctx->location, "Identifiers can't start with numbers"); -static size_t peek_alnum_to(struct ctx *ctx, size_t start, char ch) -{ for (size_t i = start; i < ctx->size; i++) { - char cur = ctx->data[i]; + char cur = context_getch(ctx, i); + + // Check for every option in variadic argument + va_list ap; + va_start(ap, opt_count); + for (size_t j = 0; j < opt_count; j++) { + char ch = va_arg(ap, int); + if (cur == ch) { + va_end(ap); + return i; + } + } + va_end(ap); - if (cur == ch || cur == ';' || cur == ')') - return i; + if (cur == '\n') + errln(&ctx->location, "Unexpected end of line while scanning"); - if (!isalnum(cur)) - errln(&ctx->location, "'%c' is not alpha-numeric", cur); + if (!isalnum(cur) && (cur < '!' || cur > '~')) + errln(&ctx->location, "'%c' is not an identifier", cur); } - errln(&ctx->location, "Unexpected end of buffer"); + errln(&ctx->location, "Unexpected end of buffer while scanning"); } -static size_t peek_identifier(struct ctx *ctx, size_t start, char ch) +static size_t peek_type(struct ctx *ctx, size_t start, size_t opt_count, ...) { + if (isdigit(context_getch(ctx, start))) + errln(&ctx->location, "Types can't start with numbers"); + for (size_t i = start; i < ctx->size; i++) { - char cur = ctx->data[i]; + char cur = context_getch(ctx, i); + + // Check for every option in variadic argument + va_list ap; + va_start(ap, opt_count); + for (size_t j = 0; j < opt_count; j++) { + char ch = va_arg(ap, int); + if (cur == ch) { + va_end(ap); + return i; + } + } + va_end(ap); - if (cur == ch || cur == ';' || cur == ')') - return i; + if (cur == '\n') + errln(&ctx->location, "Unexpected end of line while scanning"); if (!isalnum(cur) && (cur < '!' || cur > '~')) errln(&ctx->location, "'%c' is not an identifier", cur); } - errln(&ctx->location, "Unexpected end of buffer"); -} - -static size_t peek_to(struct ctx *ctx, size_t start, char ch) -{ - for (size_t i = start; i < ctx->size; i++) { - char cur = ctx->data[i]; - - if (cur == ch || cur == ';' || cur == ')') - return i; - } - - errln(&ctx->location, "Unexpected end of buffer"); + errln(&ctx->location, "Unexpected end of buffer while scanning"); } static void token_add(struct ctx *ctx, enum token_type type, size_t start, size_t end) { + assert(type != UNKNOWN); + struct token token = { 0 }; token.type = type; token.string.start = start; token.string.end = end; token.location = ctx->location; - assert(++ctx->token_count < TOKENS_MAX); ctx->tokens[ctx->token_count] = token; + ctx->token_count++; + assert(ctx->token_count < TOKENS_MAX); + if (type == NEWLINE) { ctx->location.line++; ctx->location.column = 0; @@ -98,7 +100,7 @@ void token_print(struct ctx *ctx, struct token *token) printf("[token type=%d] ", token->type); if (token->type == NEWLINE || token->type == END) { - printf("' (Unprintable)\n"); + printf("(Unprintable)\n"); return; } @@ -108,60 +110,125 @@ void token_print(struct ctx *ctx, struct token *token) void tokenize(struct ctx *ctx) { + enum { + PARSE_DECLARATION, + PARSE_DEFINITION, + PARSE_NUMBER, + PARSE_BODY, + PARSE_STRING, + } state = PARSE_DECLARATION, + prev = PARSE_DECLARATION; + + // TODO: Clean this loop up (move into seperate tokenizing functions) + + size_t start; for (size_t i = 0; i < ctx->size; i++) { - const char cur = ctx->data[i]; - - switch (cur) { - case '\0': - token_add(ctx, END, i, i + 1); - return; - case '\n': - token_add(ctx, NEWLINE, i, i + 1); - continue; - case ';': - token_add(ctx, EOL, i, i + 1); - continue; - case '(': - token_add(ctx, LPAREN, i, i + 1); - continue; - case ')': - token_add(ctx, RPAREN, i, i + 1); + const char cur = context_getch(ctx, i); + + // String parsing + if (cur == '"') { + if (state == PARSE_STRING) { + token_add(ctx, STRING, start, i + 1); + state = prev; + } else { + state = PARSE_STRING; + start = i; + } continue; - case '=': - token_add(ctx, EQUAL, i, i + 1); + } else if (state == PARSE_STRING) { continue; - case ' ': - ctx->location.column++; + } + + if (state != PARSE_BODY) { + switch (cur) { + case '\0': + errln(&ctx->location, "Unexpected end of buffer"); + case '\n': + token_add(ctx, NEWLINE, i, i + 1); + continue; + case MACRO_SKIP: + ctx->location.column++; + continue; + case MACRO_NEWLINE: + ctx->location.line++; + continue; + default: + break; + } + } + + if (state == PARSE_BODY) { + switch (cur) { + case '(': + token_add(ctx, LPAREN, i, i + 1); + continue; + case ')': + token_add(ctx, RPAREN, i, i + 1); + continue; + case '\n': + token_add(ctx, NEWLINE, i, i + 1); + state = PARSE_DECLARATION; + continue; + default: + break; + } + + size_t end_ident = peek_identifier(ctx, i, 3, ' ', ')', '\n'); + token_add(ctx, IDENT, i, end_ident); + i = end_ident - (context_getch(ctx, end_ident) != ' '); continue; - default: - break; } - if (next_non_alnum(ctx, i) == ':') { // Type with param identifier - size_t start_param = peek_alnum_to(ctx, i, ':') + 1; - size_t end_param; - if (peek_to_is_alnum(ctx, start_param, ' ')) { - end_param = peek_alnum_to(ctx, start_param, ' '); - } else { // Unnamed identifier ('_') - end_param = peek_to(ctx, start_param, ' '); - if (end_param - start_param != 1 || ctx->data[start_param] != '_') - errln(&ctx->location, "Invalid param identifier"); + if (state == PARSE_DECLARATION) { + size_t end_ident = peek_identifier(ctx, i, 1, ' '); + token_add(ctx, IDENT, i, end_ident); + + size_t start_type = end_ident + 1; + while (context_getch(ctx, start_type) != '-' || + context_getch(ctx, start_type + 1) != '>') { + size_t end_type = peek_type(ctx, start_type, 1, ' '); + token_add(ctx, TYPE, start_type, end_type); + start_type = end_type + 1; } - token_add(ctx, TYPE, i, start_param - 1); - token_add(ctx, TYPEDELIM, start_param - 1, start_param); - token_add(ctx, PARAM, start_param, end_param); + if (context_getch(ctx, start_type + 2) != ' ') + errln(&ctx->location, "Missing space"); + token_add(ctx, TYPEDELIM, start_type, start_type + 2); + + start_type += 3; + size_t final_type = peek_type(ctx, start_type, 1, '\n'); + token_add(ctx, TYPE, start_type, final_type); - i = end_param - 1; + i = final_type - 1; + state = PARSE_DEFINITION; continue; } - if (peek_identifier(ctx, i, ' ')) { // General identifier - size_t end_ident = peek_to(ctx, i, ' '); + if (state == PARSE_DEFINITION) { + size_t end_ident = peek_identifier(ctx, i, 1, ' '); token_add(ctx, IDENT, i, end_ident); - i = end_ident - 1; + + size_t start_parameter = end_ident + 1; + while (context_getch(ctx, start_parameter) != ':') { + size_t end_parameter = + peek_identifier(ctx, start_parameter, 1, ' '); + token_add(ctx, PARAM, start_parameter, end_parameter); + start_parameter = end_parameter + 1; + } + + if (context_getch(ctx, start_parameter + 1) != ' ') + errln(&ctx->location, "Missing space"); + token_add(ctx, IDENTDELIM, start_parameter, start_parameter + 1); + + i = start_parameter + 1; + state = PARSE_BODY; + continue; } } + /* for (size_t i = 0; i < ctx->token_count; i++) */ + /* token_print(ctx, &ctx->tokens[i]); */ + + token_add(ctx, END, ctx->size, ctx->size); context_rewind(ctx); } |