about summary refs log tree commit diff
path: root/src/tokenize.c
diff options
context:
space:
mode:
author Marvin Borner 2022-02-16 20:44:26 +0100
committer Marvin Borner 2022-02-16 20:44:26 +0100
commit 5cc450b6e8554f5d982f444b9026447971c94024 (patch)
tree 7602f936718bc051b6901580b815cf7f53732f5e /src/tokenize.c
parent 51c4defc436c0d119941eb6d5b953d27b5b8e6f7 (diff)
Huh
Diffstat (limited to 'src/tokenize.c')
-rw-r--r-- src/tokenize.c 241
1 files changed, 154 insertions, 87 deletions
diff --git a/src/tokenize.c b/src/tokenize.c
index 2a0eea6..0fa58fe 100644
--- a/src/tokenize.c
+++ b/src/tokenize.c
@@ -1,89 +1,91 @@
#include <assert.h>
#include <ctype.h>
+#include <stdarg.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <log.h>
+#include <preprocess.h>
#include <tokenize.h>
-static char next_non_alnum(struct ctx *ctx, size_t start)
-{
- for (size_t i = start; i < ctx->size; i++)
- if (!isalnum(ctx->data[i]))
- return ctx->data[i];
-
- errln(&ctx->location, "Unexpected end of buffer");
-}
+// TODO: Do some different limitations for identifiers/types
-static bool peek_to_is_alnum(struct ctx *ctx, size_t start, char ch)
+static size_t peek_identifier(struct ctx *ctx, size_t start, size_t opt_count, ...)
{
- for (size_t i = start; i < ctx->size; i++) {
- char cur = ctx->data[i];
-
- if (cur == ch || cur == ';' || cur == ')')
- return true;
-
- if (!isalnum(cur))
- return false;
- }
-
- errln(&ctx->location, "Unexpected end of buffer");
-}
+ if (isdigit(context_getch(ctx, start)))
+ errln(&ctx->location, "Identifiers can't start with numbers");
-static size_t peek_alnum_to(struct ctx *ctx, size_t start, char ch)
-{
for (size_t i = start; i < ctx->size; i++) {
- char cur = ctx->data[i];
+ char cur = context_getch(ctx, i);
+
+ // Check for every option in variadic argument
+ va_list ap;
+ va_start(ap, opt_count);
+ for (size_t j = 0; j < opt_count; j++) {
+ char ch = va_arg(ap, int);
+ if (cur == ch) {
+ va_end(ap);
+ return i;
+ }
+ }
+ va_end(ap);
- if (cur == ch || cur == ';' || cur == ')')
- return i;
+ if (cur == '\n')
+ errln(&ctx->location, "Unexpected end of line while scanning");
- if (!isalnum(cur))
- errln(&ctx->location, "'%c' is not alpha-numeric", cur);
+ if (!isalnum(cur) && (cur < '!' || cur > '~'))
+ errln(&ctx->location, "'%c' is not an identifier", cur);
}
- errln(&ctx->location, "Unexpected end of buffer");
+ errln(&ctx->location, "Unexpected end of buffer while scanning");
}
-static size_t peek_identifier(struct ctx *ctx, size_t start, char ch)
+static size_t peek_type(struct ctx *ctx, size_t start, size_t opt_count, ...)
{
+ if (isdigit(context_getch(ctx, start)))
+ errln(&ctx->location, "Types can't start with numbers");
+
for (size_t i = start; i < ctx->size; i++) {
- char cur = ctx->data[i];
+ char cur = context_getch(ctx, i);
+
+ // Check for every option in variadic argument
+ va_list ap;
+ va_start(ap, opt_count);
+ for (size_t j = 0; j < opt_count; j++) {
+ char ch = va_arg(ap, int);
+ if (cur == ch) {
+ va_end(ap);
+ return i;
+ }
+ }
+ va_end(ap);
- if (cur == ch || cur == ';' || cur == ')')
- return i;
+ if (cur == '\n')
+ errln(&ctx->location, "Unexpected end of line while scanning");
if (!isalnum(cur) && (cur < '!' || cur > '~'))
errln(&ctx->location, "'%c' is not an identifier", cur);
}
- errln(&ctx->location, "Unexpected end of buffer");
-}
-
-static size_t peek_to(struct ctx *ctx, size_t start, char ch)
-{
- for (size_t i = start; i < ctx->size; i++) {
- char cur = ctx->data[i];
-
- if (cur == ch || cur == ';' || cur == ')')
- return i;
- }
-
- errln(&ctx->location, "Unexpected end of buffer");
+ errln(&ctx->location, "Unexpected end of buffer while scanning");
}
static void token_add(struct ctx *ctx, enum token_type type, size_t start, size_t end)
{
+ assert(type != UNKNOWN);
+
struct token token = { 0 };
token.type = type;
token.string.start = start;
token.string.end = end;
token.location = ctx->location;
- assert(++ctx->token_count < TOKENS_MAX);
ctx->tokens[ctx->token_count] = token;
+ ctx->token_count++;
+ assert(ctx->token_count < TOKENS_MAX);
+
if (type == NEWLINE) {
ctx->location.line++;
ctx->location.column = 0;
@@ -98,7 +100,7 @@ void token_print(struct ctx *ctx, struct token *token)
printf("[token type=%d] ", token->type);
if (token->type == NEWLINE || token->type == END) {
- printf("' (Unprintable)\n");
+ printf("(Unprintable)\n");
return;
}
@@ -108,60 +110,125 @@ void token_print(struct ctx *ctx, struct token *token)
void tokenize(struct ctx *ctx)
{
+ enum {
+ PARSE_DECLARATION,
+ PARSE_DEFINITION,
+ PARSE_NUMBER,
+ PARSE_BODY,
+ PARSE_STRING,
+ } state = PARSE_DECLARATION,
+ prev = PARSE_DECLARATION;
+
+ // TODO: Clean this loop up (move into separate tokenizing functions)
+
+ size_t start;
for (size_t i = 0; i < ctx->size; i++) {
- const char cur = ctx->data[i];
-
- switch (cur) {
- case '\0':
- token_add(ctx, END, i, i + 1);
- return;
- case '\n':
- token_add(ctx, NEWLINE, i, i + 1);
- continue;
- case ';':
- token_add(ctx, EOL, i, i + 1);
- continue;
- case '(':
- token_add(ctx, LPAREN, i, i + 1);
- continue;
- case ')':
- token_add(ctx, RPAREN, i, i + 1);
+ const char cur = context_getch(ctx, i);
+
+ // String parsing
+ if (cur == '"') {
+ if (state == PARSE_STRING) {
+ token_add(ctx, STRING, start, i + 1);
+ state = prev;
+ } else {
+ state = PARSE_STRING;
+ start = i;
+ }
continue;
- case '=':
- token_add(ctx, EQUAL, i, i + 1);
+ } else if (state == PARSE_STRING) {
continue;
- case ' ':
- ctx->location.column++;
+ }
+
+ if (state != PARSE_BODY) {
+ switch (cur) {
+ case '\0':
+ errln(&ctx->location, "Unexpected end of buffer");
+ case '\n':
+ token_add(ctx, NEWLINE, i, i + 1);
+ continue;
+ case MACRO_SKIP:
+ ctx->location.column++;
+ continue;
+ case MACRO_NEWLINE:
+ ctx->location.line++;
+ continue;
+ default:
+ break;
+ }
+ }
+
+ if (state == PARSE_BODY) {
+ switch (cur) {
+ case '(':
+ token_add(ctx, LPAREN, i, i + 1);
+ continue;
+ case ')':
+ token_add(ctx, RPAREN, i, i + 1);
+ continue;
+ case '\n':
+ token_add(ctx, NEWLINE, i, i + 1);
+ state = PARSE_DECLARATION;
+ continue;
+ default:
+ break;
+ }
+
+ size_t end_ident = peek_identifier(ctx, i, 3, ' ', ')', '\n');
+ token_add(ctx, IDENT, i, end_ident);
+ i = end_ident - (context_getch(ctx, end_ident) != ' ');
continue;
- default:
- break;
}
- if (next_non_alnum(ctx, i) == ':') { // Type with param identifier
- size_t start_param = peek_alnum_to(ctx, i, ':') + 1;
- size_t end_param;
- if (peek_to_is_alnum(ctx, start_param, ' ')) {
- end_param = peek_alnum_to(ctx, start_param, ' ');
- } else { // Unnamed identifier ('_')
- end_param = peek_to(ctx, start_param, ' ');
- if (end_param - start_param != 1 || ctx->data[start_param] != '_')
- errln(&ctx->location, "Invalid param identifier");
+ if (state == PARSE_DECLARATION) {
+ size_t end_ident = peek_identifier(ctx, i, 1, ' ');
+ token_add(ctx, IDENT, i, end_ident);
+
+ size_t start_type = end_ident + 1;
+ while (context_getch(ctx, start_type) != '-' ||
+ context_getch(ctx, start_type + 1) != '>') {
+ size_t end_type = peek_type(ctx, start_type, 1, ' ');
+ token_add(ctx, TYPE, start_type, end_type);
+ start_type = end_type + 1;
}
- token_add(ctx, TYPE, i, start_param - 1);
- token_add(ctx, TYPEDELIM, start_param - 1, start_param);
- token_add(ctx, PARAM, start_param, end_param);
+ if (context_getch(ctx, start_type + 2) != ' ')
+ errln(&ctx->location, "Missing space");
+ token_add(ctx, TYPEDELIM, start_type, start_type + 2);
+
+ start_type += 3;
+ size_t final_type = peek_type(ctx, start_type, 1, '\n');
+ token_add(ctx, TYPE, start_type, final_type);
- i = end_param - 1;
+ i = final_type - 1;
+ state = PARSE_DEFINITION;
continue;
}
- if (peek_identifier(ctx, i, ' ')) { // General identifier
- size_t end_ident = peek_to(ctx, i, ' ');
+ if (state == PARSE_DEFINITION) {
+ size_t end_ident = peek_identifier(ctx, i, 1, ' ');
token_add(ctx, IDENT, i, end_ident);
- i = end_ident - 1;
+
+ size_t start_parameter = end_ident + 1;
+ while (context_getch(ctx, start_parameter) != ':') {
+ size_t end_parameter =
+ peek_identifier(ctx, start_parameter, 1, ' ');
+ token_add(ctx, PARAM, start_parameter, end_parameter);
+ start_parameter = end_parameter + 1;
+ }
+
+ if (context_getch(ctx, start_parameter + 1) != ' ')
+ errln(&ctx->location, "Missing space");
+ token_add(ctx, IDENTDELIM, start_parameter, start_parameter + 1);
+
+ i = start_parameter + 1;
+ state = PARSE_BODY;
+ continue;
}
}
+ /* for (size_t i = 0; i < ctx->token_count; i++) */
+ /* token_print(ctx, &ctx->tokens[i]); */
+
+ token_add(ctx, END, ctx->size, ctx->size);
context_rewind(ctx);
}