#include "clex.h" #include "cmem.h" #include CReservedWord reservedWords[] = { {TOKEN_AND, "and", 3}, {TOKEN_DO, "do", 2}, {TOKEN_ELSE, "else", 4}, {TOKEN_ELSEIF, "elseif", 6}, {TOKEN_END, "end", 3}, {TOKEN_FALSE, "false", 5}, {TOKEN_FOR, "for", 3}, {TOKEN_FUNCTION, "function", 8}, {TOKEN_CLASS, "class", 5}, {TOKEN_IF, "if", 2}, {TOKEN_LOCAL, "local", 5}, {TOKEN_NIL, "nil", 3}, {TOKEN_NOT, "not", 3}, {TOKEN_OR, "or", 2}, {TOKEN_RETURN, "return", 6}, {TOKEN_THEN, "then", 4}, {TOKEN_TRUE, "true", 4}, {TOKEN_VAR, "var", 3}, {TOKEN_WHILE, "while", 5} }; static CToken makeToken(CLexState *state, CTokenType type) { CToken token; token.type = type; token.start = state->startChar; token.length = state->currentChar - state->startChar; // delta between start & current token.line = state->line; state->lastType = type; return token; } static CToken makeError(CLexState *state, const char *msg) { CToken token; token.type = TOKEN_ERROR; token.start = (char*)msg; token.length = strlen(msg); token.line = state->line; return token; } static inline bool isEnd(CLexState *state) { return *state->currentChar == '\0'; } static inline bool isNumerical(char c) { return c >= '0' && c <= '9'; } static bool isAlpha(char c) { return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_'; // identifiers can have '_' } static bool match(CLexState *state, char expected) { if (isEnd(state) || *state->currentChar != expected) return false; // it matched, so increment the currentChar and return true state->currentChar++; return true; } char peek(CLexState *state) { return *state->currentChar; } static char peekNext(CLexState *state) { if (isEnd(state)) return '\0'; return state->currentChar[1]; } char next(CLexState *state) { state->currentChar++; return state->currentChar[-1]; } CTokenType identifierType(CLexState *state) { int length = state->currentChar - state->startChar; // check against reserved word list for (int i = 0; i < sizeof(reservedWords) / sizeof(CReservedWord); i++) { // it matches the reserved word if (reservedWords[i].len == length && memcmp(state->startChar, reservedWords[i].word, length) == 0) return reservedWords[i].type; } // else, it's an identifier return TOKEN_IDENTIFIER; } void skipWhitespace(CLexState *state) { while (true) { char c = peek(state); switch (c) { case '\n': // mark new line state->line++; case ' ': case '\r': case '\t': next(state); // consume the whitespace break; case '/': // consume comments if (peekNext(state) == '/') { // skip to next line (also let \n be consumed on the next iteration to properly handle that) while (!isEnd(state) && peek(state) != '\n' && peek(state) != '\0') // if it's not a newline or null terminator next(state); break; } return; // it's a TOKEN_SLASH, let the main body handle that default: // it's no longer whitespace, return! return; } } } CToken parseString(CLexState *state) { while (peek(state) != '"' && !isEnd(state)) { if (peek(state) == '\n') // strings can't stretch across lines return makeError(state, "Unterminated string!"); next(state); // consume } if (isEnd(state)) return makeError(state, "Unterminated string!"); next(state); // consume closing quote return makeToken(state, TOKEN_STRING); } CToken parseNumber(CLexState *state) { // consume number while (isNumerical(peek(state))) next(state); if (peek(state) == '.' && isNumerical(peekNext(state))) { next(state); // consume '.' // consume number while (isNumerical(peek(state))) next(state); } return makeToken(state, TOKEN_NUMBER); } CToken parseIdentifier(CLexState *state) { // read literal while ((isAlpha(peek(state)) || isNumerical(peek(state))) && !isEnd(state)) next(state); return makeToken(state, identifierType(state)); // is it a reserved word? } CLexState *cosmoL_newLexState(CState *cstate, const char *source) { CLexState *state = cosmoM_xmalloc(cstate, sizeof(CLexState)); state->startChar = (char*)source; state->currentChar = (char*)source; state->line = 1; state->lastLine = 0; state->lastType = TOKEN_ERROR; return state; } void cosmoL_freeLexState(CState *state, CLexState *lstate) { cosmoM_free(state, CLexState, lstate); } CToken cosmoL_scanToken(CLexState *state) { skipWhitespace(state); state->startChar = state->currentChar; if (isEnd(state)) return makeToken(state, TOKEN_EOF); char c = next(state); switch (c) { // single character tokens case '(': return makeToken(state, TOKEN_LEFT_PAREN); case ')': return makeToken(state, TOKEN_RIGHT_PAREN); case '{': return makeToken(state, TOKEN_LEFT_BRACE); case '}': return makeToken(state, TOKEN_RIGHT_BRACE); case '[': return makeToken(state, TOKEN_LEFT_BRACKET); case ']': return makeToken(state, TOKEN_RIGHT_BRACKET); case ';': return makeToken(state, TOKEN_EOS); case ',': return makeToken(state, TOKEN_COMMA); case '*': return makeToken(state, TOKEN_STAR); case '/': return makeToken(state, TOKEN_SLASH); // two character tokens case '+': return match(state, '+') ? makeToken(state, TOKEN_PLUS_PLUS) : makeToken(state, TOKEN_PLUS); case '-': return match(state, '-') ? makeToken(state, TOKEN_MINUS_MINUS) : makeToken(state, TOKEN_MINUS); case '.': return match(state, '.') ? makeToken(state, TOKEN_DOT_DOT) : makeToken(state, TOKEN_DOT); case '!': return match(state, '=') ? makeToken(state, TOKEN_BANG_EQUAL) : makeToken(state, TOKEN_BANG); case '=': return match(state, '=') ? makeToken(state, TOKEN_EQUAL_EQUAL) : makeToken(state, TOKEN_EQUAL); case '>': return match(state, '=') ? makeToken(state, TOKEN_GREATER_EQUAL) : makeToken(state, TOKEN_GREATER); case '<': return match(state, '=') ? makeToken(state, TOKEN_LESS_EQUAL) : makeToken(state, TOKEN_LESS); // literals case '"': return parseString(state); default: if (isNumerical(c)) return parseNumber(state); if (isAlpha(c)) return parseIdentifier(state); } return makeError(state, "Unknown symbol!"); }