Cosmo/src/clex.c

#include "clex.h"
#include "cmem.h"

#include <string.h>

CReservedWord reservedWords[] = {
    {TOKEN_AND, "and", 3},
    {TOKEN_DO, "do", 2},
    {TOKEN_ELSE, "else", 4},
    {TOKEN_ELSEIF, "elseif", 6},
    {TOKEN_END, "end", 3},
    {TOKEN_FALSE, "false", 5},
    {TOKEN_FOR, "for", 3},
    {TOKEN_FUNCTION, "function", 8},
    {TOKEN_IF, "if", 2},
    {TOKEN_LOCAL, "local", 5},
    {TOKEN_NIL, "nil", 3},
    {TOKEN_NOT, "not", 3},
    {TOKEN_OR, "or", 2},
    {TOKEN_PROTO, "proto", 5},
    {TOKEN_RETURN, "return", 6},
    {TOKEN_THEN, "then", 4},
    {TOKEN_TRUE, "true", 4},
    {TOKEN_VAR, "var", 3},
    {TOKEN_WHILE, "while", 5}
};

// returns true if current token is a heap allocated buffer
static bool isBuffer(CLexState *state) {
    return state->buffer !=  NULL;
}

// marks the current token as heap allocated & allocates the buffer
static void makeBuffer(CLexState *state) {
    state->buffer = cosmoM_xmalloc(state->cstate, sizeof(char) * 32); // start with a 32 character long buffer
    state->bufCount = 0;
    state->bufCap = 32;
}

static void resetBuffer(CLexState *state) {
    state->buffer = NULL;
    state->bufCount = 0;
    state->bufCap = 0;
}

// cancels the token heap buffer and free's it
static void freeBuffer(CLexState *state) {
    cosmoM_freearray(state->cstate, char, state->buffer, state->bufCap);

    resetBuffer(state);
}

// adds character to buffer
static void appendBuffer(CLexState *state, char c) {
    cosmoM_growarray(state->cstate, char, state->buffer, state->bufCount, state->bufCap);

    state->buffer[state->bufCount++] = c;
}

// saves the current character to the buffer, grows the buffer as needed
static void saveBuffer(CLexState *state) {
    appendBuffer(state, *state->currentChar);
}

// resets the lex state buffer & returns the allocated buffer as a null terminated string
static char *cutBuffer(CLexState *state, int *length) {
    // append the null terminator
    appendBuffer(state, '\0');

    // cache buffer info
    char *buf = state->buffer;
    size_t count = state->bufCount;
    size_t cap = state->bufCap;

    *length = count - 1;

    // reset lex state buffer!
    resetBuffer(state);

    // shrink the buffer to only use what we need
    return cosmoM_reallocate(state->cstate, buf, cap, count);
}

static CToken makeToken(CLexState *state, CTokenType type) {
    CToken token;
    token.type = type;
    token.line = state->line;

    if (isBuffer(state)) { // is the buffer heap-allocated?
        token.start = cutBuffer(state, &token.length);
    } else {
        token.start = state->startChar;
        token.length = state->currentChar - state->startChar; // delta between start & current
    }
    
    state->lastType = type;

    return token;
}

static CToken makeError(CLexState *state, const char *msg) {
    CToken token;
    token.type = TOKEN_ERROR;
    token.start = (char*)msg;
    token.length = strlen(msg);
    token.line = state->line;

    if (isBuffer(state))
        freeBuffer(state);
        
    return token;
}

static inline bool isEnd(CLexState *state) {
    return *state->currentChar == '\0';
}

static inline bool isNumerical(char c) {
    return c >= '0' && c <= '9';
}

static bool isAlpha(char c) {
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_'; // identifiers can have '_'
}

static bool match(CLexState *state, char expected) {
    if (isEnd(state) || *state->currentChar != expected)
        return false;

    // it matched, so increment the currentChar and return true
    state->currentChar++;
    return true;
}

char peek(CLexState *state) {
    return *state->currentChar;
}

static char peekNext(CLexState *state) {
    if (isEnd(state)) 
        return '\0';
    
    return state->currentChar[1];
}

char next(CLexState *state) {
    state->currentChar++;
    return state->currentChar[-1];
}

CTokenType identifierType(CLexState *state) {
    int length = state->currentChar - state->startChar;

    // check against reserved word list
    for (int i = 0; i < sizeof(reservedWords) / sizeof(CReservedWord); i++) {
        // it matches the reserved word
        if (reservedWords[i].len == length && memcmp(state->startChar, reservedWords[i].word, length) == 0) 
            return reservedWords[i].type;
    }

    // else, it's an identifier
    return TOKEN_IDENTIFIER;
}

void skipWhitespace(CLexState *state) {
    while (true) {
        char c = peek(state);
        switch (c) {
            case '\n': // mark new line
                state->line++;
            case ' ':
            case '\r':
            case '\t':
                next(state); // consume the whitespace
                break;
            case '/': // consume comments
                if (peekNext(state) == '/') {
                    
                    // skip to next line (also let \n be consumed on the next iteration to properly handle that)
                    while (!isEnd(state) && peek(state) != '\n' && peek(state) != '\0') // if it's not a newline or null terminator 
                        next(state);
                    
                    break;
                }
                return; // it's a TOKEN_SLASH, let the main body handle that
            default: // it's no longer whitespace, return!
                return;
        }
    }
}

CToken parseString(CLexState *state) {
    makeBuffer(state); // buffer mode
    while (peek(state) != '"' && !isEnd(state)) {
        switch (peek(state)) {
            case '\n': // strings can't stretch across lines
                return makeError(state, "Unterminated string!");
            case '\\': { // special character
                next(state); // consume the '\' character

                switch (peek(state)) {
                    case 'r': case 'n': appendBuffer(state, '\n'); break;
                    case 't': appendBuffer(state, '\t'); break;
                    case '\\': appendBuffer(state, '\\'); break;
                    default: {
                        if (isNumerical(peek(state))) {
                            char *numStart = state->currentChar;

                            // consume the number
                            while (isNumerical(peek(state)))
                                next(state);
                            state->currentChar--; // since next() is called after
                            
                            int num = (int)strtol(numStart, NULL, 10);

                            if (num > 255) // sanity check
                                return makeError(state, "Character out of range! > 255!");

                            appendBuffer(state, num);
                            break;
                        }

                        return makeError(state, "Unknown special character!"); // TODO: maybe a more descriptive error?
                    }
                }

                next(state); // consume special character
                break;
            }
            default: {
                saveBuffer(state); // save the character!
                next(state); // consume
            }
        }
    }

    if (isEnd(state))
        return makeError(state, "Unterminated string!");
    
    next(state); // consume closing quote
    return makeToken(state, TOKEN_STRING);
}

CToken parseNumber(CLexState *state) {
    // consume number
    while (isNumerical(peek(state)))
        next(state);
    
    if (peek(state) == '.' && isNumerical(peekNext(state))) {
        next(state); // consume '.'

        // consume number
        while (isNumerical(peek(state)))
            next(state);
    }

    return makeToken(state, TOKEN_NUMBER);
}

CToken parseIdentifier(CLexState *state) {
    // read literal
    while ((isAlpha(peek(state)) || isNumerical(peek(state))) && !isEnd(state)) 
        next(state);
    
    return makeToken(state, identifierType(state)); // is it a reserved word?
}

CLexState *cosmoL_newLexState(CState *cstate, const char *source) {
    CLexState *state = cosmoM_xmalloc(cstate, sizeof(CLexState));
    state->startChar = (char*)source;
    state->currentChar = (char*)source;
    state->line = 1;
    state->lastLine = 0;
    state->lastType = TOKEN_ERROR;
    state->cstate = cstate;

    resetBuffer(state);

    return state;
}

void cosmoL_freeLexState(CState *state, CLexState *lstate) {
    cosmoM_free(state, CLexState, lstate);
}

CToken cosmoL_scanToken(CLexState *state) {
    skipWhitespace(state);

    state->startChar = state->currentChar;

    if (isEnd(state))
        return makeToken(state, TOKEN_EOF);
    
    char c = next(state);

    switch (c) {
        // single character tokens
        case '(': return makeToken(state, TOKEN_LEFT_PAREN);
        case ')': return makeToken(state, TOKEN_RIGHT_PAREN);
        case '{': return makeToken(state, TOKEN_LEFT_BRACE);
        case '}': return makeToken(state, TOKEN_RIGHT_BRACE);
        case '[': return makeToken(state, TOKEN_LEFT_BRACKET);
        case ']': return makeToken(state, TOKEN_RIGHT_BRACKET);
        case ';': return makeToken(state, TOKEN_EOS);
        case ',': return makeToken(state, TOKEN_COMMA);
        case '*': return makeToken(state, TOKEN_STAR);
        case '#': return makeToken(state, TOKEN_POUND);
        case '/': return makeToken(state, TOKEN_SLASH);
        // two character tokens
        case '+': 
            return match(state, '+') ? makeToken(state, TOKEN_PLUS_PLUS) : makeToken(state, TOKEN_PLUS);
        case '-': 
            return match(state, '-') ? makeToken(state, TOKEN_MINUS_MINUS) : makeToken(state, TOKEN_MINUS);
        case '.': 
            return match(state, '.') ? makeToken(state, TOKEN_DOT_DOT) : makeToken(state, TOKEN_DOT);
        case '!':
            return match(state, '=') ? makeToken(state, TOKEN_BANG_EQUAL) : makeToken(state, TOKEN_BANG);
        case '=':
            return match(state, '=') ? makeToken(state, TOKEN_EQUAL_EQUAL) : makeToken(state, TOKEN_EQUAL);
        case '>':
            return match(state, '=') ? makeToken(state, TOKEN_GREATER_EQUAL) : makeToken(state, TOKEN_GREATER);
        case '<':
            return match(state, '=') ? makeToken(state, TOKEN_LESS_EQUAL) : makeToken(state, TOKEN_LESS);
        // literals
        case '"': return parseString(state);
        default:
            if (isNumerical(c))
                return parseNumber(state);
            if (isAlpha(c))
                return parseIdentifier(state);
    }

    return makeError(state, "Unknown symbol!");
}
Initial commit 2020-10-28 05:16:30 +00:00			`#include "clex.h"`
			`#include "cmem.h"`

			`#include <string.h>`

			`CReservedWord reservedWords[] = {`
			`{TOKEN_AND, "and", 3},`
			`{TOKEN_DO, "do", 2},`
			`{TOKEN_ELSE, "else", 4},`
			`{TOKEN_ELSEIF, "elseif", 6},`
			`{TOKEN_END, "end", 3},`
			`{TOKEN_FALSE, "false", 5},`
			`{TOKEN_FOR, "for", 3},`
			`{TOKEN_FUNCTION, "function", 8},`
			`{TOKEN_IF, "if", 2},`
			`{TOKEN_LOCAL, "local", 5},`
			`{TOKEN_NIL, "nil", 3},`
			`{TOKEN_NOT, "not", 3},`
			`{TOKEN_OR, "or", 2},`
changed class -> proto 2020-12-05 23:58:56 +00:00			`{TOKEN_PROTO, "proto", 5},`
Initial commit 2020-10-28 05:16:30 +00:00			`{TOKEN_RETURN, "return", 6},`
			`{TOKEN_THEN, "then", 4},`
			`{TOKEN_TRUE, "true", 4},`
			`{TOKEN_VAR, "var", 3},`
			`{TOKEN_WHILE, "while", 5}`
			`};`

extended lexer, fixed table shrinking 2020-11-26 05:34:02 +00:00			`// returns true if current token is a heap allocated buffer`
			`static bool isBuffer(CLexState *state) {`
			`return state->buffer != NULL;`
			`}`

			`// marks the current token as heap allocated & allocates the buffer`
			`static void makeBuffer(CLexState *state) {`
			`state->buffer = cosmoM_xmalloc(state->cstate, sizeof(char) * 32); // start with a 32 character long buffer`
			`state->bufCount = 0;`
			`state->bufCap = 32;`
			`}`

			`static void resetBuffer(CLexState *state) {`
			`state->buffer = NULL;`
			`state->bufCount = 0;`
			`state->bufCap = 0;`
			`}`

			`// cancels the token heap buffer and free's it`
			`static void freeBuffer(CLexState *state) {`
			`cosmoM_freearray(state->cstate, char, state->buffer, state->bufCap);`

			`resetBuffer(state);`
			`}`

			`// adds character to buffer`
			`static void appendBuffer(CLexState *state, char c) {`
			`cosmoM_growarray(state->cstate, char, state->buffer, state->bufCount, state->bufCap);`

			`state->buffer[state->bufCount++] = c;`
			`}`

			`// saves the current character to the buffer, grows the buffer as needed`
			`static void saveBuffer(CLexState *state) {`
			`appendBuffer(state, *state->currentChar);`
			`}`

			`// resets the lex state buffer & returns the allocated buffer as a null terminated string`
fixed GC bug, extended strings 2020-11-28 01:34:54 +00:00			`static char cutBuffer(CLexState state, int *length) {`
extended lexer, fixed table shrinking 2020-11-26 05:34:02 +00:00			`// append the null terminator`
			`appendBuffer(state, '\0');`

			`// cache buffer info`
			`char *buf = state->buffer;`
			`size_t count = state->bufCount;`
			`size_t cap = state->bufCap;`

fixed GC bug, extended strings 2020-11-28 01:34:54 +00:00			`*length = count - 1;`

extended lexer, fixed table shrinking 2020-11-26 05:34:02 +00:00			`// reset lex state buffer!`
			`resetBuffer(state);`

			`// shrink the buffer to only use what we need`
			`return cosmoM_reallocate(state->cstate, buf, cap, count);`
			`}`

Initial commit 2020-10-28 05:16:30 +00:00			`static CToken makeToken(CLexState *state, CTokenType type) {`
			`CToken token;`
			`token.type = type;`
			`token.line = state->line;`
extended lexer, fixed table shrinking 2020-11-26 05:34:02 +00:00
			`if (isBuffer(state)) { // is the buffer heap-allocated?`
fixed GC bug, extended strings 2020-11-28 01:34:54 +00:00			`token.start = cutBuffer(state, &token.length);`
extended lexer, fixed table shrinking 2020-11-26 05:34:02 +00:00			`} else {`
			`token.start = state->startChar;`
			`token.length = state->currentChar - state->startChar; // delta between start & current`
			`}`
Initial commit 2020-10-28 05:16:30 +00:00
			`state->lastType = type;`

			`return token;`
			`}`

			`static CToken makeError(CLexState state, const char msg) {`
			`CToken token;`
			`token.type = TOKEN_ERROR;`
			`token.start = (char*)msg;`
			`token.length = strlen(msg);`
			`token.line = state->line;`

extended lexer, fixed table shrinking 2020-11-26 05:34:02 +00:00			`if (isBuffer(state))`
			`freeBuffer(state);`

Initial commit 2020-10-28 05:16:30 +00:00			`return token;`
			`}`

			`static inline bool isEnd(CLexState *state) {`
Major refactoring, added classes, many bug fixes 2020-11-10 01:44:12 +00:00			`return *state->currentChar == '\0';`
Initial commit 2020-10-28 05:16:30 +00:00			`}`

			`static inline bool isNumerical(char c) {`
			`return c >= '0' && c <= '9';`
			`}`

			`static bool isAlpha(char c) {`
			`return (c >= 'a' && c <= 'z') \|\| (c >= 'A' && c <= 'Z') \|\| c == '_'; // identifiers can have '_'`
			`}`

			`static bool match(CLexState *state, char expected) {`
			`if (isEnd(state) \|\| *state->currentChar != expected)`
			`return false;`

			`// it matched, so increment the currentChar and return true`
			`state->currentChar++;`
			`return true;`
			`}`

			`char peek(CLexState *state) {`
			`return *state->currentChar;`
			`}`

			`static char peekNext(CLexState *state) {`
			`if (isEnd(state))`
			`return '\0';`

			`return state->currentChar[1];`
			`}`

			`char next(CLexState *state) {`
			`state->currentChar++;`
			`return state->currentChar[-1];`
			`}`

			`CTokenType identifierType(CLexState *state) {`
			`int length = state->currentChar - state->startChar;`

			`// check against reserved word list`
			`for (int i = 0; i < sizeof(reservedWords) / sizeof(CReservedWord); i++) {`
			`// it matches the reserved word`
			`if (reservedWords[i].len == length && memcmp(state->startChar, reservedWords[i].word, length) == 0)`
			`return reservedWords[i].type;`
			`}`

			`// else, it's an identifier`
			`return TOKEN_IDENTIFIER;`
			`}`

			`void skipWhitespace(CLexState *state) {`
			`while (true) {`
			`char c = peek(state);`
			`switch (c) {`
Major refactoring, added classes, many bug fixes 2020-11-10 01:44:12 +00:00			`case '\n': // mark new line`
			`state->line++;`
Initial commit 2020-10-28 05:16:30 +00:00			`case ' ':`
			`case '\r':`
			`case '\t':`
			`next(state); // consume the whitespace`
			`break;`
minor refactoring, added inc and dec operators 2020-11-19 20:41:21 +00:00			`case '/': // consume comments`
			`if (peekNext(state) == '/') {`
Initial commit 2020-10-28 05:16:30 +00:00
			`// skip to next line (also let \n be consumed on the next iteration to properly handle that)`
			`while (!isEnd(state) && peek(state) != '\n' && peek(state) != '\0') // if it's not a newline or null terminator`
			`next(state);`

			`break;`
			`}`
			`return; // it's a TOKEN_SLASH, let the main body handle that`
			`default: // it's no longer whitespace, return!`
			`return;`
			`}`
			`}`
			`}`

			`CToken parseString(CLexState *state) {`
extended lexer, fixed table shrinking 2020-11-26 05:34:02 +00:00			`makeBuffer(state); // buffer mode`
Initial commit 2020-10-28 05:16:30 +00:00			`while (peek(state) != '"' && !isEnd(state)) {`
added special character support to strings 2020-11-26 18:48:36 +00:00			`switch (peek(state)) {`
			`case '\n': // strings can't stretch across lines`
			`return makeError(state, "Unterminated string!");`
			`case '\\': { // special character`
			`next(state); // consume the '\' character`

			`switch (peek(state)) {`
			`case 'r': case 'n': appendBuffer(state, '\n'); break;`
			`case 't': appendBuffer(state, '\t'); break;`
			`case '\\': appendBuffer(state, '\\'); break;`
fixed GC bug, extended strings 2020-11-28 01:34:54 +00:00			`default: {`
			`if (isNumerical(peek(state))) {`
			`char *numStart = state->currentChar;`

			`// consume the number`
			`while (isNumerical(peek(state)))`
			`next(state);`
			`state->currentChar--; // since next() is called after`

			`int num = (int)strtol(numStart, NULL, 10);`

			`if (num > 255) // sanity check`
			`return makeError(state, "Character out of range! > 255!");`

			`appendBuffer(state, num);`
			`break;`
			`}`

added special character support to strings 2020-11-26 18:48:36 +00:00			`return makeError(state, "Unknown special character!"); // TODO: maybe a more descriptive error?`
fixed GC bug, extended strings 2020-11-28 01:34:54 +00:00			`}`
added special character support to strings 2020-11-26 18:48:36 +00:00			`}`

			`next(state); // consume special character`
			`break;`
			`}`
			`default: {`
			`saveBuffer(state); // save the character!`
			`next(state); // consume`
			`}`
			`}`
Initial commit 2020-10-28 05:16:30 +00:00			`}`

			`if (isEnd(state))`
			`return makeError(state, "Unterminated string!");`

			`next(state); // consume closing quote`
			`return makeToken(state, TOKEN_STRING);`
			`}`

			`CToken parseNumber(CLexState *state) {`
			`// consume number`
			`while (isNumerical(peek(state)))`
			`next(state);`

			`if (peek(state) == '.' && isNumerical(peekNext(state))) {`
			`next(state); // consume '.'`

			`// consume number`
			`while (isNumerical(peek(state)))`
			`next(state);`
			`}`

			`return makeToken(state, TOKEN_NUMBER);`
			`}`

			`CToken parseIdentifier(CLexState *state) {`
			`// read literal`
			`while ((isAlpha(peek(state)) \|\| isNumerical(peek(state))) && !isEnd(state))`
			`next(state);`

			`return makeToken(state, identifierType(state)); // is it a reserved word?`
			`}`

			`CLexState cosmoL_newLexState(CState cstate, const char *source) {`
			`CLexState *state = cosmoM_xmalloc(cstate, sizeof(CLexState));`
			`state->startChar = (char*)source;`
			`state->currentChar = (char*)source;`
			`state->line = 1;`
			`state->lastLine = 0;`
			`state->lastType = TOKEN_ERROR;`
extended lexer, fixed table shrinking 2020-11-26 05:34:02 +00:00			`state->cstate = cstate;`

			`resetBuffer(state);`
Initial commit 2020-10-28 05:16:30 +00:00
			`return state;`
			`}`

			`void cosmoL_freeLexState(CState state, CLexState lstate) {`
			`cosmoM_free(state, CLexState, lstate);`
			`}`

			`CToken cosmoL_scanToken(CLexState *state) {`
			`skipWhitespace(state);`

			`state->startChar = state->currentChar;`

			`if (isEnd(state))`
			`return makeToken(state, TOKEN_EOF);`

			`char c = next(state);`

			`switch (c) {`
			`// single character tokens`
Major refactoring, added classes, many bug fixes 2020-11-10 01:44:12 +00:00			`case '(': return makeToken(state, TOKEN_LEFT_PAREN);`
			`case ')': return makeToken(state, TOKEN_RIGHT_PAREN);`
			`case '{': return makeToken(state, TOKEN_LEFT_BRACE);`
			`case '}': return makeToken(state, TOKEN_RIGHT_BRACE);`
			`case '[': return makeToken(state, TOKEN_LEFT_BRACKET);`
			`case ']': return makeToken(state, TOKEN_RIGHT_BRACKET);`
Initial commit 2020-10-28 05:16:30 +00:00			`case ';': return makeToken(state, TOKEN_EOS);`
			`case ',': return makeToken(state, TOKEN_COMMA);`
			`case '*': return makeToken(state, TOKEN_STAR);`
added # operator, improved error messages 2020-11-30 18:32:04 +00:00			`case '#': return makeToken(state, TOKEN_POUND);`
Initial commit 2020-10-28 05:16:30 +00:00			`case '/': return makeToken(state, TOKEN_SLASH);`
			`// two character tokens`
minor refactoring, added inc and dec operators 2020-11-19 20:41:21 +00:00			`case '+':`
			`return match(state, '+') ? makeToken(state, TOKEN_PLUS_PLUS) : makeToken(state, TOKEN_PLUS);`
			`case '-':`
			`return match(state, '-') ? makeToken(state, TOKEN_MINUS_MINUS) : makeToken(state, TOKEN_MINUS);`
Initial commit 2020-10-28 05:16:30 +00:00			`case '.':`
			`return match(state, '.') ? makeToken(state, TOKEN_DOT_DOT) : makeToken(state, TOKEN_DOT);`
			`case '!':`
			`return match(state, '=') ? makeToken(state, TOKEN_BANG_EQUAL) : makeToken(state, TOKEN_BANG);`
			`case '=':`
			`return match(state, '=') ? makeToken(state, TOKEN_EQUAL_EQUAL) : makeToken(state, TOKEN_EQUAL);`
			`case '>':`
			`return match(state, '=') ? makeToken(state, TOKEN_GREATER_EQUAL) : makeToken(state, TOKEN_GREATER);`
			`case '<':`
			`return match(state, '=') ? makeToken(state, TOKEN_LESS_EQUAL) : makeToken(state, TOKEN_LESS);`
			`// literals`
			`case '"': return parseString(state);`
			`default:`
			`if (isNumerical(c))`
			`return parseNumber(state);`
			`if (isAlpha(c))`
			`return parseIdentifier(state);`
			`}`

			`return makeError(state, "Unknown symbol!");`
			`}`