2020-10-28 05:16:30 +00:00
|
|
|
#include "clex.h"
|
|
|
|
#include "cmem.h"
|
|
|
|
|
|
|
|
#include <string.h>
|
|
|
|
|
|
|
|
CReservedWord reservedWords[] = {
|
|
|
|
{TOKEN_AND, "and", 3},
|
2020-12-24 06:41:00 +00:00
|
|
|
{TOKEN_BREAK, "break", 5},
|
|
|
|
{TOKEN_CONTINUE, "continue", 8},
|
2020-10-28 05:16:30 +00:00
|
|
|
{TOKEN_DO, "do", 2},
|
|
|
|
{TOKEN_ELSE, "else", 4},
|
|
|
|
{TOKEN_ELSEIF, "elseif", 6},
|
|
|
|
{TOKEN_END, "end", 3},
|
|
|
|
{TOKEN_FALSE, "false", 5},
|
|
|
|
{TOKEN_FOR, "for", 3},
|
|
|
|
{TOKEN_FUNCTION, "function", 8},
|
|
|
|
{TOKEN_IF, "if", 2},
|
2020-12-16 03:21:51 +00:00
|
|
|
{TOKEN_IN, "in", 2},
|
2020-10-28 05:16:30 +00:00
|
|
|
{TOKEN_LOCAL, "local", 5},
|
|
|
|
{TOKEN_NIL, "nil", 3},
|
|
|
|
{TOKEN_NOT, "not", 3},
|
|
|
|
{TOKEN_OR, "or", 2},
|
2020-12-05 23:58:56 +00:00
|
|
|
{TOKEN_PROTO, "proto", 5},
|
2020-10-28 05:16:30 +00:00
|
|
|
{TOKEN_RETURN, "return", 6},
|
|
|
|
{TOKEN_THEN, "then", 4},
|
|
|
|
{TOKEN_TRUE, "true", 4},
|
|
|
|
{TOKEN_VAR, "var", 3},
|
|
|
|
{TOKEN_WHILE, "while", 5}
|
|
|
|
};
|
|
|
|
|
2020-11-26 05:34:02 +00:00
|
|
|
// returns true if current token is a heap allocated buffer
|
|
|
|
static bool isBuffer(CLexState *state) {
|
|
|
|
return state->buffer != NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
// marks the current token as heap allocated & allocates the buffer
|
|
|
|
static void makeBuffer(CLexState *state) {
|
|
|
|
state->buffer = cosmoM_xmalloc(state->cstate, sizeof(char) * 32); // start with a 32 character long buffer
|
|
|
|
state->bufCount = 0;
|
|
|
|
state->bufCap = 32;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Clears the lexer's buffer bookkeeping WITHOUT freeing anything; the previous
// buffer pointer (if any) is simply forgotten. Use freeBuffer() to release it.
static void resetBuffer(CLexState *state) {
    state->bufCap = 0;
    state->bufCount = 0;
    state->buffer = NULL;
}
|
|
|
|
|
2020-12-19 19:32:43 +00:00
|
|
|
// cancels the token heap buffer and frees it
|
2020-11-26 05:34:02 +00:00
|
|
|
static void freeBuffer(CLexState *state) {
|
|
|
|
cosmoM_freearray(state->cstate, char, state->buffer, state->bufCap);
|
|
|
|
|
|
|
|
resetBuffer(state);
|
|
|
|
}
|
|
|
|
|
|
|
|
// adds character to buffer
|
|
|
|
static void appendBuffer(CLexState *state, char c) {
|
|
|
|
cosmoM_growarray(state->cstate, char, state->buffer, state->bufCount, state->bufCap);
|
|
|
|
|
|
|
|
state->buffer[state->bufCount++] = c;
|
|
|
|
}
|
|
|
|
|
|
|
|
// saves the current character to the buffer, grows the buffer as needed
|
|
|
|
static void saveBuffer(CLexState *state) {
|
|
|
|
appendBuffer(state, *state->currentChar);
|
|
|
|
}
|
|
|
|
|
|
|
|
// resets the lex state buffer & returns the allocated buffer as a null terminated string
|
2020-11-28 01:34:54 +00:00
|
|
|
static char *cutBuffer(CLexState *state, int *length) {
|
2020-11-26 05:34:02 +00:00
|
|
|
// append the null terminator
|
|
|
|
appendBuffer(state, '\0');
|
|
|
|
|
|
|
|
// cache buffer info
|
|
|
|
char *buf = state->buffer;
|
|
|
|
size_t count = state->bufCount;
|
|
|
|
size_t cap = state->bufCap;
|
|
|
|
|
2020-11-28 01:34:54 +00:00
|
|
|
*length = count - 1;
|
|
|
|
|
2020-11-26 05:34:02 +00:00
|
|
|
// reset lex state buffer!
|
|
|
|
resetBuffer(state);
|
|
|
|
|
|
|
|
// shrink the buffer to only use what we need
|
|
|
|
return cosmoM_reallocate(state->cstate, buf, cap, count);
|
|
|
|
}
|
|
|
|
|
2020-10-28 05:16:30 +00:00
|
|
|
// Builds a token of the given type for the text lexed so far and records the type
// as the last one produced.
// In buffer mode the token text is the detached heap buffer; otherwise it is the
// slice of source between startChar and currentChar.
static CToken makeToken(CLexState *state, CTokenType type) {
    CToken result;

    result.line = state->line;
    result.type = type;

    if (!isBuffer(state)) {
        // text lives in the source: span from the token start to the cursor
        result.start = state->startChar;
        result.length = state->currentChar - state->startChar;
    } else {
        // text was accumulated on the heap: take ownership of it
        result.start = cutBuffer(state, &result.length);
    }

    state->lastType = type;
    return result;
}
|
|
|
|
|
|
|
|
// Builds a TOKEN_ERROR token whose text is the given message (the token points at
// msg directly; nothing is copied) and discards any half-built string buffer.
static CToken makeError(CLexState *state, const char *msg) {
    // a failed token must not leave its heap buffer behind
    if (isBuffer(state))
        freeBuffer(state);

    CToken errorToken;

    errorToken.line = state->line;
    errorToken.length = strlen(msg);
    errorToken.start = (char*)msg;
    errorToken.type = TOKEN_ERROR;

    return errorToken;
}
|
|
|
|
|
|
|
|
// True once the cursor sits on the source's null terminator.
static inline bool isEnd(CLexState *state) {
    return state->currentChar[0] == '\0';
}
|
|
|
|
|
|
|
|
// True for the ASCII decimal digits '0' through '9'.
static inline bool isNumerical(char c) {
    if (c < '0')
        return false;

    return c <= '9';
}
|
|
|
|
|
|
|
|
// True for characters allowed in an identifier besides digits:
// ASCII letters of either case, and the underscore.
static bool isAlpha(char c) {
    if (c == '_')
        return true;

    if (c >= 'A' && c <= 'Z')
        return true;

    return c >= 'a' && c <= 'z';
}
|
|
|
|
|
|
|
|
// Conditionally consumes one character: if the cursor is not at the end of the
// source and sits on `expected`, steps past it and returns true. Otherwise the
// cursor is left where it is and false is returned.
static bool match(CLexState *state, char expected) {
    if (!isEnd(state) && *state->currentChar == expected) {
        state->currentChar++;
        return true;
    }

    return false;
}
|
|
|
|
|
|
|
|
// Returns the character under the cursor without consuming it.
char peek(CLexState *state) {
    return state->currentChar[0];
}
|
|
|
|
|
|
|
|
// Returns the character one past the cursor without consuming anything.
// When the cursor is already on the terminator, '\0' is returned instead of
// reading beyond the end of the source.
static char peekNext(CLexState *state) {
    return isEnd(state) ? '\0' : state->currentChar[1];
}
|
|
|
|
|
|
|
|
// Consumes one character: advances the cursor and returns the character it was on.
// No end-of-source check is made here; callers are expected to test for that first.
char next(CLexState *state) {
    char consumed = *state->currentChar;

    state->currentChar++;
    return consumed;
}
|
|
|
|
|
|
|
|
// Classifies the word spanning startChar..currentChar: returns the matching
// reserved-word token type if the spelling is in reservedWords, otherwise
// TOKEN_IDENTIFIER.
CTokenType identifierType(CLexState *state) {
    const size_t wordCount = sizeof(reservedWords) / sizeof(reservedWords[0]);
    int length = state->currentChar - state->startChar;

    for (size_t i = 0; i < wordCount; i++) {
        const CReservedWord *entry = &reservedWords[i];

        // cheap length comparison first; only same-length words need a byte compare
        if (entry->len != length)
            continue;

        if (memcmp(state->startChar, entry->word, length) == 0)
            return entry->type;
    }

    // not reserved, so it is a plain identifier
    return TOKEN_IDENTIFIER;
}
|
|
|
|
|
|
|
|
// Advances the cursor past everything that is not part of a token: spaces, tabs,
// carriage returns, newlines (bumping state->line for each one) and `//` comments.
// Returns with the cursor on the first character of the next token, or on the
// terminator at the end of the source.
void skipWhitespace(CLexState *state) {
    for (;;) {
        char c = peek(state);

        if (c == '\n') {
            // count the line, then consume the newline like any other whitespace
            state->line++;
            next(state);
            continue;
        }

        if (c == ' ' || c == '\r' || c == '\t') {
            next(state);
            continue;
        }

        // anything other than the start of a `//` comment belongs to a token;
        // a lone '/' is a TOKEN_SLASH and is left for the main scanner
        if (c != '/' || peekNext(state) != '/')
            return;

        // skip the comment body but stop ON the newline, so the next iteration
        // consumes it and updates the line counter
        while (peek(state) != '\n' && !isEnd(state))
            next(state);
    }
}
|
|
|
|
|
|
|
|
// Lexes the body of a double-quoted string literal. The caller (cosmoL_scanToken)
// has already consumed the opening '"'.
//
// The string's contents are accumulated in a heap buffer because escape sequences
// make the token text differ from the raw source. Supported escapes:
//   \n \r   -> newline        \t -> tab
//   \\      -> backslash      \" -> double quote
//   \DDD    -> the character with decimal code DDD (at most 255)
//
// Returns a TOKEN_STRING token on success. Returns a TOKEN_ERROR token (and frees
// the buffer, via makeError) when the string hits a newline or the end of the
// source before its closing quote, on an unknown escape, or when a numeric escape
// exceeds 255.
CToken parseString(CLexState *state) {
    makeBuffer(state); // buffer mode

    while (peek(state) != '"' && !isEnd(state)) {
        switch (peek(state)) {
            case '\n': // strings can't stretch across lines
                return makeError(state, "Unterminated string!");
            case '\\': { // escape sequence
                next(state); // consume the '\' character

                switch (peek(state)) {
                    // NOTE(review): "\r" is stored as '\n', not '\r' -- kept as-is since
                    // existing scripts may rely on it; confirm whether this is intended
                    case 'r':
                    case 'n':
                        appendBuffer(state, '\n');
                        break;
                    case 't':
                        appendBuffer(state, '\t');
                        break;
                    case '\\':
                        appendBuffer(state, '\\');
                        break;
                    case '"': // escaped quote: part of the string, does not terminate it
                        appendBuffer(state, '"');
                        break;
                    default: {
                        if (isNumerical(peek(state))) {
                            char *numStart = state->currentChar;

                            // consume the number
                            while (isNumerical(peek(state)))
                                next(state);
                            state->currentChar--; // step back onto the last digit, it is consumed below

                            // keep the full range of strtol: an overlong digit run saturates at
                            // LONG_MAX and is rejected here, instead of being narrowed to int
                            // first and possibly slipping past the range check
                            long num = strtol(numStart, NULL, 10);

                            if (num > 255) // sanity check
                                return makeError(state, "Character out of range! > 255!");

                            appendBuffer(state, (char)num);
                            break;
                        }

                        return makeError(state, "Unknown special character!"); // TODO: maybe a more descriptive error?
                    }
                }

                next(state); // consume the escape's (last) character
                break;
            }
            default: {
                saveBuffer(state); // save the character!
                next(state); // consume
            }
        }
    }

    // the loop ended either on the closing quote or on the end of the source
    if (isEnd(state))
        return makeError(state, "Unterminated string!");

    next(state); // consume closing quote
    return makeToken(state, TOKEN_STRING);
}
|
|
|
|
|
|
|
|
// Lexes the remainder of a number literal whose first digit was already consumed by
// the caller: any further digits, optionally followed by a '.' and a fractional
// part. A '.' that is not followed by a digit is left unconsumed.
CToken parseNumber(CLexState *state) {
    // integer part
    while (isNumerical(peek(state))) {
        next(state);
    }

    // fractional part, only when a digit follows the '.'
    if (peek(state) == '.' && isNumerical(peekNext(state))) {
        // the first pass consumes the '.', the following ones consume the digits
        do {
            next(state);
        } while (isNumerical(peek(state)));
    }

    return makeToken(state, TOKEN_NUMBER);
}
|
|
|
|
|
|
|
|
// Lexes the remainder of a word whose first character was already consumed by the
// caller, then emits either a reserved-word token or TOKEN_IDENTIFIER.
CToken parseIdentifier(CLexState *state) {
    // consume letters, underscores and digits; the terminator is none of these,
    // so the scan also stops at the end of the source
    for (;;) {
        char c = peek(state);

        if (!isAlpha(c) && !isNumerical(c))
            break;

        next(state);
    }

    // decide between keyword and identifier based on the consumed spelling
    CTokenType type = identifierType(state);
    return makeToken(state, type);
}
|
|
|
|
|
|
|
|
// Allocates a lexer over `source` and positions it at the first character.
// The source text is referenced, not copied. The lexer starts on line 1, outside
// buffer mode, with TOKEN_ERROR recorded as the last token type.
CLexState *cosmoL_newLexState(CState *cstate, const char *source) {
    CLexState *lex = cosmoM_xmalloc(cstate, sizeof(CLexState));

    lex->cstate = cstate;

    // cursor and token start both begin at the head of the source
    lex->currentChar = (char*)source;
    lex->startChar = (char*)source;

    lex->lastType = TOKEN_ERROR;
    lex->lastLine = 0;
    lex->line = 1;

    // no heap buffer until a string literal needs one
    resetBuffer(lex);

    return lex;
}
|
|
|
|
|
|
|
|
// Releases a lexer created by cosmoL_newLexState(). Only the CLexState itself is
// freed here; the source text it points at is left alone.
void cosmoL_freeLexState(CState *state, CLexState *lstate) {
    cosmoM_free(state, CLexState, lstate);
}
|
|
|
|
|
|
|
|
// Produces the next token from the source.
// Whitespace and `//` comments are skipped first. At the end of the source a
// TOKEN_EOF token is returned; a character that starts no known token yields a
// TOKEN_ERROR token carrying the message "Unknown symbol!".
CToken cosmoL_scanToken(CLexState *state) {
    skipWhitespace(state);

    // the token begins wherever the whitespace ended
    state->startChar = state->currentChar;

    if (isEnd(state))
        return makeToken(state, TOKEN_EOF);

    char c = next(state);

    // literals that are recognised by character class rather than exact character
    if (isNumerical(c))
        return parseNumber(state);

    if (isAlpha(c))
        return parseIdentifier(state);

    switch (c) {
        // single character tokens
        case '(': return makeToken(state, TOKEN_LEFT_PAREN);
        case ')': return makeToken(state, TOKEN_RIGHT_PAREN);
        case '{': return makeToken(state, TOKEN_LEFT_BRACE);
        case '}': return makeToken(state, TOKEN_RIGHT_BRACE);
        case '[': return makeToken(state, TOKEN_LEFT_BRACKET);
        case ']': return makeToken(state, TOKEN_RIGHT_BRACKET);
        case ';': return makeToken(state, TOKEN_EOS);
        case ',': return makeToken(state, TOKEN_COMMA);
        case ':': return makeToken(state, TOKEN_COLON);
        case '*': return makeToken(state, TOKEN_STAR);
        case '%': return makeToken(state, TOKEN_PERCENT);
        case '#': return makeToken(state, TOKEN_POUND);
        case '/': return makeToken(state, TOKEN_SLASH);

        // one or two character tokens: match() consumes the second character if present
        case '+':
            return makeToken(state, match(state, '+') ? TOKEN_PLUS_PLUS : TOKEN_PLUS);
        case '-':
            return makeToken(state, match(state, '-') ? TOKEN_MINUS_MINUS : TOKEN_MINUS);
        case '!':
            return makeToken(state, match(state, '=') ? TOKEN_BANG_EQUAL : TOKEN_BANG);
        case '=':
            return makeToken(state, match(state, '=') ? TOKEN_EQUAL_EQUAL : TOKEN_EQUAL);
        case '>':
            return makeToken(state, match(state, '=') ? TOKEN_GREATER_EQUAL : TOKEN_GREATER);
        case '<':
            return makeToken(state, match(state, '=') ? TOKEN_LESS_EQUAL : TOKEN_LESS);

        // '.', '..' or '...'
        case '.':
            if (!match(state, '.'))
                return makeToken(state, TOKEN_DOT);

            return makeToken(state, match(state, '.') ? TOKEN_DOT_DOT_DOT : TOKEN_DOT_DOT);

        // string literal (opening quote already consumed)
        case '"':
            return parseString(state);

        default:
            return makeError(state, "Unknown symbol!");
    }
}
|