/**
 * ============================================================================
 *                                   (/^▽^)/
 *                                THE TOKENIZER!
 * ============================================================================
 */

/**
 * We're gonna start off with our first phase of parsing, lexical analysis, with
 * the tokenizer.
 *
 * We're just going to take our string of code and break it down into an array
 * of tokens.
 *
 *   (add 2 (subtract 4 2))   =>   [{ type: 'paren', value: '(' }, ...]
 */

// We start by accepting an input string of code, and we're gonna set up two
// things...
function tokenizer(input) {

  // A `current` variable for tracking our position in the code like a cursor.
  let current = 0;

  // And a `tokens` array for pushing our tokens to.
  let tokens = [];

  // We start by creating a `while` loop where we are setting up our `current`
  // variable to be incremented as much as we want `inside` the loop.
  //
  // We do this because we may want to increment `current` many times within a
  // single loop because our tokens can be any length.
  while (current < input.length) {

    // We're also going to store the `current` character in the `input`.
    let char = input[current];

    // The first thing we want to check for is an open parenthesis. This will
    // later be used for `CallExpression` but for now we only care about the
    // character.
    //
    // We check to see if we have an open parenthesis:
    if (char === '(') {

      // If we do, we push a new token with the type `paren` and set the value
      // to an open parenthesis.
      tokens.push({
        type: 'paren',
        value: '(',
      });

      // Then we increment `current`
      current++;

      // And we `continue` onto the next cycle of the loop.
      continue;
    }

    // Next we're going to check for a closing parenthesis. We do the same exact
    // thing as before: Check for a closing parenthesis, add a new token,
    // increment `current`, and `continue`.
    if (char === ')') {
      tokens.push({
        type: 'paren',
        value: ')',
      });
      current++;
      continue;
    }

    // Moving on, we're now going to check for whitespace. This is interesting
    // because we care that whitespace exists to separate characters, but it
    // isn't actually important for us to store as a token. We would only throw
    // it out later.
    //
    // So here we're just going to test for existence and if it does exist we're
    // going to just `continue` on.
    let WHITESPACE = /\s/;
    if (WHITESPACE.test(char)) {
      current++;
      continue;
    }

    // The next type of token is a number. This is different than what we have
    // seen before because a number could be any number of characters and we
    // want to capture the entire sequence of characters as one token.
    //
    //   (add 123 456)
    //        ^^^ ^^^
    //        Only two separate tokens
    //
    // So we start this off when we encounter the first number in a sequence.
    let NUMBERS = /[0-9]/;
    if (NUMBERS.test(char)) {

      // We're going to create a `value` string that we are going to push
      // characters to.
      let value = '';

      // Then we're going to loop through each character in the sequence until
      // we encounter a character that is not a number, pushing each character
      // that is a number to our `value` and incrementing `current` as we go.
      while (NUMBERS.test(char)) {
        value += char;
        char = input[++current];
      }

      // After that we push our `number` token to the `tokens` array.
      tokens.push({ type: 'number', value });

      // And we continue on.
      continue;
    }
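
    // So for an input like `(add 123 456)` this branch runs twice as the outer
    // loop sweeps across the input, producing { type: 'number', value: '123' }
    // and { type: 'number', value: '456' } as two separate tokens.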
    // We'll also add support for strings in our language which will be any
    // text surrounded by double quotes (").
    //
    //   (concat "foo" "bar")
    //            ^^^   ^^^ string tokens
    //
    // We'll start by checking for the opening quote:
    if (char === '"') {
      // Keep a `value` variable for building up our string token.
      let value = '';

      // We'll skip the opening double quote in our token.
      char = input[++current];

      // Then we'll iterate through each character until we reach another
      // double quote.
      while (char !== '"') {
        value += char;
        char = input[++current];
      }

      // Skip the closing double quote.
      char = input[++current];

      // And add our `string` token to the `tokens` array.
      tokens.push({ type: 'string', value });

      continue;
    }

    // The last type of token will be a `name` token. This is a sequence of
    // letters rather than numbers, which are the names of functions in our lisp
    // syntax.
    //
    //   (add 2 4)
    //    ^^^
    //    Name token
    //
    let LETTERS = /[a-z]/i;
    if (LETTERS.test(char)) {
      let value = '';

      // Again we're just going to loop through all the letters pushing them to
      // a value.
      while (LETTERS.test(char)) {
        value += char;
        char = input[++current];
      }

      // And pushing that value as a token with the type `name` and continuing.
      tokens.push({ type: 'name', value });

      continue;
    }

    // Finally if we have not matched a character by now, we're going to throw
    // an error and completely exit.
    throw new TypeError('I don\'t know what this character is: ' + char);
  }

  // Then at the end of our `tokenizer` we simply return the tokens array.
  return tokens;
}

// Just exporting our tokenizer to be used in the final compiler...
module.exports = tokenizer;
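
// A quick, illustrative sketch of the tokenizer in action (not part of the
// compiler itself): running this file directly with Node prints the tokens
// for the example program used in the comments above.
if (require.main === module) {
  console.log(tokenizer('(add 2 (subtract 4 2))'));
  // => roughly:
  //    [
  //      { type: 'paren',  value: '('        },
  //      { type: 'name',   value: 'add'      },
  //      { type: 'number', value: '2'        },
  //      { type: 'paren',  value: '('        },
  //      { type: 'name',   value: 'subtract' },
  //      { type: 'number', value: '4'        },
  //      { type: 'number', value: '2'        },
  //      { type: 'paren',  value: ')'        },
  //      { type: 'paren',  value: ')'        },
  //    ]
}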