mirror of
https://github.com/jamiebuilds/the-super-tiny-compiler.git
synced 2024-10-27 20:34:08 +00:00
Add tokenizer inline annotations
This commit is contained in:
parent
07aea0e7e6
commit
8466b6a67d
@ -321,36 +321,65 @@
|
|||||||
* So let's begin...
|
* So let's begin...
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/**
|
|
||||||
* -----------------------------------------------------------------------------
|
|
||||||
* *Note:* This is all I've written so far, so the code below isn't annotated
|
|
||||||
* yet. You can still read it all and it totally works, but I plan on improving
|
|
||||||
* this in the near future
|
|
||||||
* -----------------------------------------------------------------------------
|
|
||||||
*/
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* ============================================================================
|
* ============================================================================
|
||||||
* (/^▽^)/
|
* (/^▽^)/
|
||||||
* THE TOKENIZER!
|
* THE TOKENIZER!
|
||||||
* ============================================================================
|
* ============================================================================
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* We're gonna start off with our first phase of parsing, lexical analysis, with the tokenizer.
|
||||||
|
*
|
||||||
|
* We're just going to take our string of code and break it down into an array of tokens.
|
||||||
|
*
|
||||||
|
* (add 2 (subtract 4 2)) => [{ type: 'paren', value: '(' }, ...]
|
||||||
|
*/
|
||||||
|
|
||||||
|
// We start by accepting an input string of code, and we're gonna set up two
|
||||||
|
// things...
|
||||||
function tokenizer(input) {
|
function tokenizer(input) {
|
||||||
|
|
||||||
|
// A `current` variable for tracking our position in the code like a cursor.
|
||||||
var current = 0;
|
var current = 0;
|
||||||
|
|
||||||
|
// And a `tokens` array for pushing our tokens to.
|
||||||
var tokens = [];
|
var tokens = [];
|
||||||
|
|
||||||
|
// We start by creating a `while` loop where we are setting up our `current`
|
||||||
|
// variable to be incremented as much as we want `inside` the loop.
|
||||||
|
//
|
||||||
|
// We do this because we may want to increment `current` many times within a
|
||||||
|
// single loop because our tokens can be any length.
|
||||||
while (current < input.length) {
|
while (current < input.length) {
|
||||||
|
|
||||||
|
// We're also going to store the `current` character in the `input`.
|
||||||
var char = input[current];
|
var char = input[current];
|
||||||
|
|
||||||
|
// The first thing we want to check for is an open parenthesis. This will
|
||||||
|
// later be used for `CallExpressions` but for now we only care about the
|
||||||
|
// character.
|
||||||
|
//
|
||||||
|
// We check to see if we have an open parenthesis:
|
||||||
if (char === '(') {
|
if (char === '(') {
|
||||||
|
|
||||||
|
// If we do, we push a new token with the type `paren` and set the value
|
||||||
|
// to an open parenthesis.
|
||||||
tokens.push({
|
tokens.push({
|
||||||
type: 'paren',
|
type: 'paren',
|
||||||
value: '('
|
value: '('
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Then we increment `current`
|
||||||
current++;
|
current++;
|
||||||
|
|
||||||
|
// And we `continue` onto the next cycle of the loop.
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Next we're going to check for a closing parenthesis. We do the same exact
|
||||||
|
// thing as before: Check for a closing parenthesis, add a new token,
|
||||||
|
// increment current, and `continue`.
|
||||||
if (char === ')') {
|
if (char === ')') {
|
||||||
tokens.push({
|
tokens.push({
|
||||||
type: 'paren',
|
type: 'paren',
|
||||||
@ -360,38 +389,73 @@ function tokenizer(input) {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Moving on we're now going to check for whitespace. This is interesting
|
||||||
|
// because we care that whitespace exists to separate characters, but it
|
||||||
|
// isn't actually important for us to store as a token. We would only throw
|
||||||
|
// it out later.
|
||||||
|
//
|
||||||
|
// So here we're just going to test for existence and if it does exist we're
|
||||||
|
// going to just `continue` on.
|
||||||
var WHITESPACE = /\s/;
|
var WHITESPACE = /\s/;
|
||||||
if (WHITESPACE.test(char)) {
|
if (WHITESPACE.test(char)) {
|
||||||
current++;
|
current++;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// The next type of token is a number. This is different than what we have
|
||||||
|
// seen before because a number could be any number of characters and we
|
||||||
|
// want to capture the entire sequence of characters as one token.
|
||||||
|
//
|
||||||
|
// (add 123 456)
|
||||||
|
// ^^^ ^^^
|
||||||
|
// Only two separate tokens
|
||||||
|
//
|
||||||
|
// So we start this off when we encounter the first number in a sequence.
|
||||||
var NUMBERS = /[0-9]/;
|
var NUMBERS = /[0-9]/;
|
||||||
if (NUMBERS.test(char)) {
|
if (NUMBERS.test(char)) {
|
||||||
|
|
||||||
|
// We're going to create a `value` string that we are going to push
|
||||||
|
// characters to.
|
||||||
var value = '';
|
var value = '';
|
||||||
|
|
||||||
|
// Then we're going to loop through each character in the sequence until
|
||||||
|
// we encounter a character that is not a number, pushing each character
|
||||||
|
// that is a number to our `value` and incrementing `current` as we go.
|
||||||
while (NUMBERS.test(char)) {
|
while (NUMBERS.test(char)) {
|
||||||
value += char;
|
value += char;
|
||||||
char = input[++current];
|
char = input[++current];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// After that we push our `number` token to the `tokens` array.
|
||||||
tokens.push({
|
tokens.push({
|
||||||
type: 'number',
|
type: 'number',
|
||||||
value: value
|
value: value
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// And we continue on.
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// The last type of token will be a `name` token. This is a sequence of
|
||||||
|
// letters instead of numbers, that are the names of functions in our lisp
|
||||||
|
// syntax.
|
||||||
|
//
|
||||||
|
// (add 2 4)
|
||||||
|
// ^^^
|
||||||
|
// Name token
|
||||||
|
//
|
||||||
var LETTERS = /[a-zA-Z]/;
|
var LETTERS = /[a-zA-Z]/;
|
||||||
if (LETTERS.test(char)) {
|
if (LETTERS.test(char)) {
|
||||||
var value = '';
|
var value = '';
|
||||||
|
|
||||||
|
// Again we're just going to loop through all the letters pushing them to
|
||||||
|
// a value.
|
||||||
while (LETTERS.test(char)) {
|
while (LETTERS.test(char)) {
|
||||||
value += char;
|
value += char;
|
||||||
char = input[++current];
|
char = input[++current];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// And pushing that value as a token with the type `name` and continuing.
|
||||||
tokens.push({
|
tokens.push({
|
||||||
type: 'name',
|
type: 'name',
|
||||||
value: value
|
value: value
|
||||||
@ -400,12 +464,23 @@ function tokenizer(input) {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Finally if we have not matched a character by now, we're going to throw
|
||||||
|
// an error and completely exit.
|
||||||
throw new TypeError('I dont know what this character is: ' + char);
|
throw new TypeError('I dont know what this character is: ' + char);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Then at the end of our `tokenizer` we simply return the tokens array.
|
||||||
return tokens;
|
return tokens;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* -----------------------------------------------------------------------------
|
||||||
|
* *Note:* This is all I've written so far, so the code below isn't annotated
|
||||||
|
* yet. You can still read it all and it totally works, but I plan on improving
|
||||||
|
* this in the near future
|
||||||
|
* -----------------------------------------------------------------------------
|
||||||
|
*/
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* ============================================================================
|
* ============================================================================
|
||||||
* ヽ/❀o ل͜ o\ノ
|
* ヽ/❀o ل͜ o\ノ
|
||||||
|
Loading…
Reference in New Issue
Block a user