Cosmo/src/clex.c

484 lines
14 KiB
C

#include "clex.h"
#include "cmem.h"
#include <string.h>
CReservedWord reservedWords[] = {
{ TOKEN_AND, "and", 3},
{ TOKEN_BREAK, "break", 5},
{TOKEN_CONTINUE, "continue", 8},
{ TOKEN_DO, "do", 2},
{ TOKEN_ELSE, "else", 4},
{ TOKEN_ELSEIF, "elseif", 6},
{ TOKEN_END, "end", 3},
{ TOKEN_FALSE, "false", 5},
{ TOKEN_FOR, "for", 3},
{ TOKEN_FUNC, "func", 4},
{ TOKEN_IF, "if", 2},
{ TOKEN_IN, "in", 2},
{ TOKEN_LOCAL, "local", 5},
{ TOKEN_NIL, "nil", 3},
{ TOKEN_NOT, "not", 3},
{ TOKEN_OR, "or", 2},
{ TOKEN_PROTO, "proto", 5},
{ TOKEN_RETURN, "return", 6},
{ TOKEN_THEN, "then", 4},
{ TOKEN_TRUE, "true", 4},
{ TOKEN_LET, "let", 3},
{ TOKEN_WHILE, "while", 5}
};
// returns true if current token is a heap allocated buffer
static bool isBuffer(CLexState *state)
{
return state->buffer != NULL;
}
// marks the current token as heap allocated & allocates the buffer
static void makeBuffer(CLexState *state)
{
state->buffer =
cosmoM_xmalloc(state->cstate, sizeof(char) * 32); // start with a 32 character long buffer
state->bufCount = 0;
state->bufCap = 32;
}
static void resetBuffer(CLexState *state)
{
state->buffer = NULL;
state->bufCount = 0;
state->bufCap = 0;
}
// cancels the token heap buffer and frees it
static void freeBuffer(CLexState *state)
{
cosmoM_freeArray(state->cstate, char, state->buffer, state->bufCap);
resetBuffer(state);
}
// adds character to buffer
static void appendBuffer(CLexState *state, char c)
{
cosmoM_growArray(state->cstate, char, state->buffer, state->bufCount, state->bufCap);
state->buffer[state->bufCount++] = c;
}
// saves the current character to the buffer, grows the buffer as needed
static void saveBuffer(CLexState *state)
{
appendBuffer(state, *state->currentChar);
}
// resets the lex state buffer & returns the allocated buffer as a null terminated string
static char *cutBuffer(CLexState *state, int *length)
{
// append the null terminator
appendBuffer(state, '\0');
// cache buffer info
char *buf = state->buffer;
size_t count = state->bufCount;
size_t cap = state->bufCap;
*length = count - 1;
// reset lex state buffer!
resetBuffer(state);
// shrink the buffer to only use what we need
return cosmoM_reallocate(state->cstate, buf, cap, count);
}
static CToken makeToken(CLexState *state, CTokenType type)
{
CToken token;
token.type = type;
token.line = state->line;
if (isBuffer(state)) { // is the buffer heap-allocated?
token.start = cutBuffer(state, &token.length);
} else {
token.start = state->startChar;
token.length = state->currentChar - state->startChar; // delta between start & current
}
state->lastType = type;
return token;
}
static CToken makeError(CLexState *state, const char *msg)
{
CToken token;
token.type = TOKEN_ERROR;
token.start = (char *)msg;
token.length = strlen(msg);
token.line = state->line;
if (isBuffer(state))
freeBuffer(state);
return token;
}
static inline bool isEnd(CLexState *state)
{
return *state->currentChar == '\0';
}
static inline bool isNumerical(char c)
{
return c >= '0' && c <= '9';
}
static bool isAlpha(char c)
{
return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_'; // identifiers can have '_'
}
static bool match(CLexState *state, char expected)
{
if (isEnd(state) || *state->currentChar != expected)
return false;
// it matched, so increment the currentChar and return true
state->currentChar++;
return true;
}
static char peek(CLexState *state)
{
return *state->currentChar;
}
static char peekNext(CLexState *state)
{
if (isEnd(state))
return '\0';
return state->currentChar[1];
}
static char next(CLexState *state)
{
if (isEnd(state))
return '\0'; // return a null terminator
state->currentChar++;
return state->currentChar[-1];
}
static bool isHex(char c)
{
return isNumerical(c) || ('A' <= c && 'F' >= c) || ('a' <= c && 'f' >= c);
}
static CTokenType identifierType(CLexState *state)
{
int length = state->currentChar - state->startChar;
// check against reserved word list
for (size_t i = 0; i < sizeof(reservedWords) / sizeof(CReservedWord); i++) {
// it matches the reserved word
if (reservedWords[i].len == length &&
memcmp(state->startChar, reservedWords[i].word, length) == 0)
return reservedWords[i].type;
}
// else, it's an identifier
return TOKEN_IDENTIFIER;
}
static void skipWhitespace(CLexState *state)
{
while (true) {
char c = peek(state);
switch (c) {
case '\n': // mark new line
state->line++;
case ' ':
case '\r':
case '\t':
next(state); // consume the whitespace
break;
case '/': // consume comments
if (peekNext(state) == '/') {
// skip to next line (also let \n be consumed on the next iteration to properly
// handle that)
while (!isEnd(state) &&
peek(state) != '\n') // if it's not a newline or the end of the source
next(state);
// keep consuming whitespace
break;
} else if (peekNext(state) == '*') { // multiline comments
while (!isEnd(state) &&
!(peek(state) == '*' &&
peekNext(state) ==
'/')) // if it's the end of the comment or the end of the source
next(state);
// consume the '*/'
next(state);
next(state);
// keep consuming whitespace
break;
}
return; // it's a TOKEN_SLASH, let the main body handle that
default: // it's no longer whitespace, return!
return;
}
}
}
static CToken parseString(CLexState *state)
{
makeBuffer(state); // buffer mode
while (peek(state) != '"' && !isEnd(state)) {
switch (peek(state)) {
case '\n': // strings can't stretch across lines
return makeError(state, "Unterminated string!");
case '\\': { // special character
next(state); // consume the '\' character
switch (peek(state)) {
case 'r':
case 'n':
appendBuffer(state, '\n');
break;
case 't':
appendBuffer(state, '\t');
break;
case '\\':
appendBuffer(state, '\\');
break;
case '"':
appendBuffer(state, '"');
break;
case 'x': // hexadecimal character encoding
next(state); // skip 'x'
if (isHex(peek(state))) {
char *numStart = state->currentChar;
// consume the hexnum
while (isHex(peek(state)))
next(state);
state->currentChar--; // since next() is called after
unsigned int num = (unsigned int)strtoul(numStart, NULL, 16);
if (num > 255) // sanity check
return makeError(state, "Character out of range! > 255!");
appendBuffer(state, num);
break;
}
return makeError(state, "Unknown hexadecimal character encoding!");
case 'b': // binary character encoding
next(state); // skip 'b'
if (peek(state) == '0' || peek(state) == '1') {
char *numStart = state->currentChar;
// consume the bin
while (peek(state) == '0' || peek(state) == '1')
next(state);
state->currentChar--; // since next() is called after
unsigned int num = (unsigned int)strtoul(numStart, NULL, 2);
if (num > 255) // sanity check
return makeError(state, "Character out of range! > 255!");
appendBuffer(state, num);
break;
}
return makeError(state, "Unknown binary character encoding!");
default: {
if (isNumerical(peek(state))) {
char *numStart = state->currentChar;
// consume the number
while (isNumerical(peek(state)))
next(state);
state->currentChar--; // since next() is called after
unsigned int num = (unsigned int)strtoul(numStart, NULL, 10);
if (num > 255) // sanity check
return makeError(state, "Character out of range! > 255!");
appendBuffer(state, num);
break;
}
return makeError(
state, "Unknown special character!"); // TODO: maybe a more descriptive error?
}
}
next(state); // consume special character
break;
}
default: {
saveBuffer(state); // save the character!
next(state); // consume
}
}
}
if (isEnd(state))
return makeError(state, "Unterminated string!");
next(state); // consume closing quote
return makeToken(state, TOKEN_STRING);
}
static CToken parseNumber(CLexState *state)
{
switch (peek(state)) {
case 'x': // hexadecimal number
next(state);
while (isHex(peek(state)))
next(state);
return makeToken(state, TOKEN_HEXNUMBER);
case 'b': // binary number
next(state);
while (peek(state) == '0' || peek(state) == '1')
next(state);
return makeToken(state, TOKEN_BINNUMBER);
default: // it's a one digit number!!!!!
if (!isNumerical(peek(state)) && !(peek(state) == '.'))
return makeToken(state, TOKEN_NUMBER);
// if it is a number, fall through and parse normally
}
// consume number
while (isNumerical(peek(state))) {
next(state);
}
if (peek(state) == '.' && isNumerical(peekNext(state))) {
next(state); // consume '.'
// consume number
while (isNumerical(peek(state)))
next(state);
}
return makeToken(state, TOKEN_NUMBER);
}
static CToken parseIdentifier(CLexState *state)
{
// read literal
while ((isAlpha(peek(state)) || isNumerical(peek(state))) && !isEnd(state))
next(state);
return makeToken(state, identifierType(state)); // is it a reserved word?
}
void cosmoL_initLexState(CState *cstate, CLexState *state, const char *source)
{
state->startChar = (char *)source;
state->currentChar = (char *)source;
state->line = 1;
state->lastLine = 0;
state->lastType = TOKEN_ERROR;
state->cstate = cstate;
resetBuffer(state);
}
void cosmoL_cleanupLexState(CState *state, CLexState *lstate)
{
// stubbed
}
CToken cosmoL_scanToken(CLexState *state)
{
skipWhitespace(state);
state->startChar = state->currentChar;
if (isEnd(state))
return makeToken(state, TOKEN_EOF);
char c = next(state);
switch (c) {
// single character tokens
case '(':
return makeToken(state, TOKEN_LEFT_PAREN);
case ')':
return makeToken(state, TOKEN_RIGHT_PAREN);
case '{':
return makeToken(state, TOKEN_LEFT_BRACE);
case '}':
return makeToken(state, TOKEN_RIGHT_BRACE);
case '[':
return makeToken(state, TOKEN_LEFT_BRACKET);
case ']':
return makeToken(state, TOKEN_RIGHT_BRACKET);
case ';':
return makeToken(state, TOKEN_EOS);
case ',':
return makeToken(state, TOKEN_COMMA);
case ':':
return makeToken(state, TOKEN_COLON);
case '*':
return makeToken(state, TOKEN_STAR);
case '%':
return makeToken(state, TOKEN_PERCENT);
case '^':
return makeToken(state, TOKEN_CARROT);
case '#':
return makeToken(state, TOKEN_POUND);
case '/':
return makeToken(state, TOKEN_SLASH);
// two character tokens
case '+':
return match(state, '+') ? makeToken(state, TOKEN_PLUS_PLUS) : makeToken(state, TOKEN_PLUS);
case '-':
return match(state, '-') ? makeToken(state, TOKEN_MINUS_MINUS)
: makeToken(state, TOKEN_MINUS);
case '.':
return match(state, '.') ? (match(state, '.') ? makeToken(state, TOKEN_DOT_DOT_DOT)
: makeToken(state, TOKEN_DOT_DOT))
: makeToken(state, TOKEN_DOT);
case '!':
return match(state, '=') ? makeToken(state, TOKEN_BANG_EQUAL)
: makeToken(state, TOKEN_BANG);
case '=':
return match(state, '=') ? makeToken(state, TOKEN_EQUAL_EQUAL)
: makeToken(state, TOKEN_EQUAL);
case '>':
return match(state, '=') ? makeToken(state, TOKEN_GREATER_EQUAL)
: makeToken(state, TOKEN_GREATER);
case '<':
return match(state, '=') ? makeToken(state, TOKEN_LESS_EQUAL)
: makeToken(state, TOKEN_LESS);
// literals
case '"':
return parseString(state);
default:
if (isNumerical(c))
return parseNumber(state);
if (isAlpha(c))
return parseIdentifier(state);
}
return makeError(state, "Unknown symbol!");
}