From 8466b6a67df9d815f1b7a8b5d4eec5abca0aea44 Mon Sep 17 00:00:00 2001
From: James Kyle
Date: Wed, 30 Mar 2016 19:27:29 -0700
Subject: [PATCH] Add tokenizer inline annotations

---
 super-tiny-compiler.js | 91 ++++++++++++++++++++++++++++++++++++++----
 1 file changed, 83 insertions(+), 8 deletions(-)

diff --git a/super-tiny-compiler.js b/super-tiny-compiler.js
index 1181a6c..6be8b64 100644
--- a/super-tiny-compiler.js
+++ b/super-tiny-compiler.js
@@ -321,36 +321,65 @@
  * So let's begin...
  */
 
-/**
- * -----------------------------------------------------------------------------
- * *Note:* This is all I've written so far, so the code below isn't annnotated
- * yet. You can still read it all and it totally works, but I plan on improving
- * this in the near future
- * -----------------------------------------------------------------------------
- */
-
 /**
  * ============================================================================
  *                                   (/^▽^)/
  *                                THE TOKENIZER!
  * ============================================================================
  */
+
+/**
+ * We're gonna start off with our first phase of parsing, lexical analysis, with the tokenizer.
+ *
+ * We're just going to take our string of code and break it down into an array of tokens.
+ *
+ * (add 2 (subtract 4 2)) => [{ type: 'paren', value: '(' }, ...]
+ */
+
+// We start by accepting an input string of code, and we're gonna set up two
+// things...
 function tokenizer(input) {
+
+  // A `current` variable for tracking our position in the code like a cursor.
   var current = 0;
+
+  // And a `tokens` array for pushing our tokens to.
   var tokens = [];
 
+  // We start by creating a `while` loop where we are setting up our `current`
+  // variable to be incremented as much as we want `inside` the loop.
+  //
+  // We do this because we may want to increment `current` many times within a
+  // single loop because our tokens can be any length.
   while (current < input.length) {
+
+    // We're also going to store the `current` character in the `input`.
     var char = input[current];
 
+    // The first thing we want to check for is an open parenthesis. This will
+    // later be used for `CallExpressions` but for now we only care about the
+    // character.
+    //
+    // We check to see if we have an open parenthesis:
     if (char === '(') {
+
+      // If we do, we push a new token with the type `paren` and set the value
+      // to an open parenthesis.
       tokens.push({
         type: 'paren',
         value: '('
       });
+
+      // Then we increment `current`
       current++;
+
+      // And we `continue` onto the next cycle of the loop.
       continue;
     }
 
+    // Next we're going to check for a closing parenthesis. We do the same exact
+    // thing as before: Check for a closing parenthesis, add a new token,
+    // increment current, and `continue`.
     if (char === ')') {
       tokens.push({
         type: 'paren',
@@ -360,38 +389,73 @@ function tokenizer(input) {
       continue;
     }
 
+    // Moving on we're now going to check for whitespace. This is interesting
+    // because we care that whitespace exists to separate characters, but it
+    // isn't actually important for us to store as a token. We would only throw
+    // it out later.
+    //
+    // So here we're just going to test for existence and if it does exist we're
+    // going to just `continue` on.
     var WHITESPACE = /\s/;
     if (WHITESPACE.test(char)) {
       current++;
       continue;
     }
 
+    // The next type of token is a number. This is different than what we have
+    // seen before because a number could be any number of characters and we
+    // want to capture the entire sequence of characters as one token.
+    //
+    //   (add 123 456)
+    //        ^^^ ^^^
+    //        Only two separate tokens
+    //
+    // So we start this off when we encounter the first number in a sequence.
     var NUMBERS = /[0-9]/;
     if (NUMBERS.test(char)) {
+
+      // We're going to create a `value` string that we are going to push
+      // characters to.
       var value = '';
 
+      // Then we're going to loop through each character in the sequence until
+      // we encounter a character that is not a number, pushing each character
+      // that is a number to our `value` and incrementing `current` as we go.
       while (NUMBERS.test(char)) {
         value += char;
         char = input[++current];
       }
 
+      // After that we push our `number` token to the `tokens` array.
       tokens.push({
         type: 'number',
         value: value
       });
 
+      // And we continue on.
       continue;
     }
 
+    // The last type of token will be a `name` token. This is a sequence of
+    // letters instead of numbers, that are the names of functions in our lisp
+    // syntax.
+    //
+    //   (add 2 4)
+    //    ^^^
+    //    Name token
+    //
     var LETTERS = /[a-zA-Z]/;
     if (LETTERS.test(char)) {
       var value = '';
 
+      // Again we're just going to loop through all the letters pushing them to
+      // a value.
       while (LETTERS.test(char)) {
         value += char;
         char = input[++current];
       }
 
+      // And pushing that value as a token with the type `name` and continuing.
       tokens.push({
         type: 'name',
         value: value
@@ -400,12 +464,23 @@ function tokenizer(input) {
       continue;
     }
 
+    // Finally if we have not matched a character by now, we're going to throw
+    // an error and completely exit.
    throw new TypeError('I dont know what this character is: ' + char);
   }
 
+  // Then at the end of our `tokenizer` we simply return the tokens array.
   return tokens;
 }
 
+/**
+ * -----------------------------------------------------------------------------
+ * *Note:* This is all I've written so far, so the code below isn't annotated
+ * yet. You can still read it all and it totally works, but I plan on improving
+ * this in the near future
+ * -----------------------------------------------------------------------------
+ */
+
 /**
  * ============================================================================
  *                             ヽ/❀o ل͜ o\ノ