diff --git a/README.md b/README.md index 27cdc65..b8935f2 100755 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -THE SUPER TINY COMPILER +THE SUPER TINY COMPILER ***Welcome to The Super Tiny Compiler!*** @@ -8,7 +8,7 @@ written in easy to read JavaScript. Reading through the guided code will help you learn about how *most* compilers work from end to end. -### [Want to jump into the code? Click here](super-tiny-compiler.js) +### [Want to jump into the code? Click here](the-super-tiny-compiler.js) --- @@ -27,7 +27,7 @@ the nerds are able to understand. ### Okay so where do I begin? -Awesome! Head on over to the [super-tiny-compiler.js](super-tiny-compiler.js) +Awesome! Head on over to the [the-super-tiny-compiler.js](the-super-tiny-compiler.js) file. ### I'm back, that didn't make sense diff --git a/package.json b/package.json new file mode 100644 index 0000000..21de19b --- /dev/null +++ b/package.json @@ -0,0 +1,7 @@ +{ + "name": "the-super-tiny-compiler", + "version": "0.0.0", + "author": "James Kyle (thejameskyle.com)", + "license": "CC-BY-4.0", + "main": "./the-super-tiny-compiler.js" +} diff --git a/test.js b/test.js index ef351a8..3b49c2c 100755 --- a/test.js +++ b/test.js @@ -1,16 +1,16 @@ -var superTinyCompiler = require('./super-tiny-compiler'); -var assert = require('assert'); +const { + tokenizer, + parser, + transformer, + codeGenerator, + compiler, +} = require('./super-tiny-compiler'); +const assert = require('assert'); -var tokenizer = superTinyCompiler.tokenizer; -var parser = superTinyCompiler.parser; -var transformer = superTinyCompiler.transformer; -var codeGenerator = superTinyCompiler.codeGenerator; -var compiler = superTinyCompiler.compiler; +const input = '(add 2 (subtract 4 2))'; +const output = 'add(2, subtract(4, 2));'; -var input = '(add 2 (subtract 4 2))'; -var output = 'add(2, subtract(4, 2));'; - -var tokens = [ +const tokens = [ { type: 'paren', value: '(' }, { type: 'name', value: 'add' }, { type: 'number', value: '2' }, @@ -22,7 
+22,7 @@ var tokens = [ { type: 'paren', value: ')' } ]; -var ast = { +const ast = { type: 'Program', body: [{ type: 'CallExpression', @@ -44,7 +44,7 @@ var ast = { }] }; -var newAst = { +const newAst = { type: 'Program', body: [{ type: 'ExpressionStatement', diff --git a/super-tiny-compiler.js b/the-super-tiny-compiler.js similarity index 83% rename from super-tiny-compiler.js rename to the-super-tiny-compiler.js index e4e214d..0d62591 100755 --- a/super-tiny-compiler.js +++ b/the-super-tiny-compiler.js @@ -1,3 +1,5 @@ +'use strict'; + /** * TTTTTTTTTTTTTTTTTTTTTTTHHHHHHHHH HHHHHHHHHEEEEEEEEEEEEEEEEEEEEEE * T:::::::::::::::::::::TH:::::::H H:::::::HE::::::::::::::::::::E @@ -150,7 +152,7 @@ * { type: 'number', value: '4' }, * { type: 'number', value: '2' }, * { type: 'paren', value: ')' }, - * { type: 'paren', value: ')' } + * { type: 'paren', value: ')' }, * ] * * And an Abstract Syntax Tree (AST) might look like this: @@ -162,16 +164,16 @@ * name: 'add', * params: [{ * type: 'NumberLiteral', - * value: '2' + * value: '2', * }, { * type: 'CallExpression', * name: 'subtract', * params: [{ * type: 'NumberLiteral', - * value: '4' + * value: '4', * }, { * type: 'NumberLiteral', - * value: '2' + * value: '2', * }] * }] * }] @@ -198,7 +200,7 @@ * * { * type: 'NumberLiteral', - * value: '2' + * value: '2', * } * * Or maybe a node for a "CallExpression": @@ -206,7 +208,7 @@ * { * type: 'CallExpression', * name: 'subtract', - * params: [...nested nodes go here...] + * params: [...nested nodes go here...], * } * * When transforming the AST we can manipulate nodes by @@ -259,7 +261,7 @@ * we would likely introduce all sorts of abstractions here. But just visiting * each node in the tree is enough. * - * The reason I use the word “visiting” is because there is this pattern of how + * The reason I use the word "visiting" is because there is this pattern of how * to represent operations on elements of an object structure. 
* * Visitors @@ -270,7 +272,7 @@ * * var visitor = { * NumberLiteral() {}, - * CallExpression() {} + * CallExpression() {}, * }; * * When we traverse our AST we will call the methods on this visitor whenever we @@ -281,7 +283,45 @@ * * var visitor = { * NumberLiteral(node, parent) {}, - * CallExpression(node, parent) {} + * CallExpression(node, parent) {}, + * }; + * + * We call these functions when we "enter" the node. But there is also the + * possibility of calling things on "exit". + * + * Imagine our tree structure from before in list form: + * + * - Program + * - CallExpression + * - NumberLiteral + * - CallExpression + * - NumberLiteral + * - NumberLiteral + * + * As we traverse down, we're going to reach branches with dead ends. As we + * finish each branch of the tree we "exit" it. So going down the tree we + * "enter" each node, and going back up we "exit". + * + * -> Program (enter) + * -> CallExpression (enter) + * -> Number Literal (enter) + * <- Number Literal (exit) + * -> Call Expression (enter) + * -> Number Literal (enter) + * <- Number Literal (exit) + * -> Number Literal (enter) + * <- Number Literal (exit) + * <- CallExpression (exit) + * <- CallExpression (exit) + * <- Program (exit) + * + * In order to support that, our visitors will look like this: + * + * var visitor = { + * NumberLiteral: { + * enter(node, parent) {}, + * exit(node, parent) {}, + * } * }; */ @@ -343,10 +383,10 @@ function tokenizer(input) { // A `current` variable for tracking our position in the code like a cursor. - var current = 0; + let current = 0; // And a `tokens` array for pushing our tokens to. - var tokens = []; + let tokens = []; // We start by creating a `while` loop where we are setting up our `current` // variable to be incremented as much as we want `inside` the loop. @@ -356,10 +396,10 @@ function tokenizer(input) { while (current < input.length) { // We're also going to store the `current` character in the `input`. 
- var char = input[current]; + let char = input[current]; // The first thing we want to check for is an open parenthesis. This will - // later be used for `CallExpressions` but for now we only care about the + // later be used for `CallExpression` but for now we only care about the // character. // // We check to see if we have an open parenthesis: @@ -369,7 +409,7 @@ function tokenizer(input) { // to an open parenthesis. tokens.push({ type: 'paren', - value: '(' + value: '(', }); // Then we increment `current` @@ -385,7 +425,7 @@ function tokenizer(input) { if (char === ')') { tokens.push({ type: 'paren', - value: ')' + value: ')', }); current++; continue; @@ -398,7 +438,7 @@ function tokenizer(input) { // // So here we're just going to test for existence and if it does exist we're // going to just `continue` on. - var WHITESPACE = /\s/; + let WHITESPACE = /\s/; if (WHITESPACE.test(char)) { current++; continue; @@ -413,12 +453,12 @@ function tokenizer(input) { // Only two separate tokens // // So we start this off when we encounter the first number in a sequence. - var NUMBERS = /[0-9]/; + let NUMBERS = /[0-9]/; if (NUMBERS.test(char)) { // We're going to create a `value` string that we are going to push // characters to. - var value = ''; + let value = ''; // Then we're going to loop through each character in the sequence until // we encounter a character that is not a number, pushing each character @@ -429,15 +469,42 @@ function tokenizer(input) { } // After that we push our `number` token to the `tokens` array. - tokens.push({ - type: 'number', - value: value - }); + tokens.push({ type: 'number', value }); // And we continue on. continue; } + // We'll also add support for strings in our language which will be any + // text surrounded by double quotes ("). + // + // (concat "foo" "bar") + // ^^^ ^^^ string tokens + // + // We'll start by checking for the opening quote: + if (char === '"') { + // Keep a `value` variable for building up our string token. 
+ let value = ''; + + // We'll skip the opening double quote in our token. + char = input[++current]; + + // Then we'll iterate through each character until we reach another + // double quote. + while (char !== '"') { + value += char; + char = input[++current]; + } + + // Skip the closing double quote. + char = input[++current]; + + // And add our `string` token to the `tokens` array. + tokens.push({ type: 'string', value }); + + continue; + } + // The last type of token will be a `name` token. This is a sequence of // letters instead of numbers, that are the names of functions in our lisp // syntax. @@ -446,9 +513,9 @@ function tokenizer(input) { // ^^^ // Name token // - var LETTERS = /[a-z]/i; + let LETTERS = /[a-z]/i; if (LETTERS.test(char)) { - var value = ''; + let value = ''; // Again we're just going to loop through all the letters pushing them to // a value. @@ -458,10 +525,7 @@ function tokenizer(input) { } // And pushing that value as a token with the type `name` and continuing. - tokens.push({ - type: 'name', - value: value - }); + tokens.push({ type: 'name', value }); continue; } @@ -493,14 +557,14 @@ function tokenizer(input) { function parser(tokens) { // Again we keep a `current` variable that we will use as a cursor. - var current = 0; + let current = 0; // But this time we're going to use recursion instead of a `while` loop. So we // define a `walk` function. function walk() { // Inside the walk function we start by grabbing the `current` token. - var token = tokens[current]; + let token = tokens[current]; // We're going to split each type of token off into a different code path, // starting off with `number` tokens. @@ -515,7 +579,18 @@ function parser(tokens) { // value to the value of our token. return { type: 'NumberLiteral', - value: token.value + value: token.value, + }; + } + + // If we have a string we will do the same as number and create a + // `StringLiteral` node. 
+ if (token.type === 'string') { + current++; + + return { + type: 'StringLiteral', + value: token.value, }; } @@ -533,10 +608,10 @@ function parser(tokens) { // We create a base node with the type `CallExpression`, and we're going // to set the name as the current token's value since the next token after // the open parenthesis is the name of the function. - var node = { + let node = { type: 'CallExpression', name: token.value, - params: [] + params: [], }; // We increment `current` *again* to skip the name token. @@ -567,11 +642,11 @@ function parser(tokens) { // { type: 'number', value: '4' }, // { type: 'number', value: '2' }, // { type: 'paren', value: ')' }, <<< Closing parenthesis - // { type: 'paren', value: ')' } <<< Closing parenthesis + // { type: 'paren', value: ')' }, <<< Closing parenthesis // ] // // We're going to rely on the nested `walk` function to increment our - // `current` variable past any nested `CallExpressions`. + // `current` variable past any nested `CallExpression`. // So we create a `while` loop that will continue until it encounters a // token with a `type` of `'paren'` and a `value` of a closing @@ -601,16 +676,16 @@ function parser(tokens) { // Now, we're going to create our AST which will have a root which is a // `Program` node. - var ast = { + let ast = { type: 'Program', - body: [] + body: [], }; // And we're going to kickstart our `walk` function, pushing nodes to our // `ast.body` array. // // The reason we are doing this inside a loop is because our program can have - // `CallExpressions` after one another instead of being nested. + // `CallExpression` after one another instead of being nested. // // (add 2 2) // (subtract 4 2) @@ -646,7 +721,7 @@ function parser(tokens) { * * NumberLiteral(node, parent) { * // ... 
- * } + * }, * }); */ @@ -657,7 +732,7 @@ function traverser(ast, visitor) { // A `traverseArray` function that will allow us to iterate over an array and // call the next function that we will define: `traverseNode`. function traverseArray(array, parent) { - array.forEach(function(child) { + array.forEach(child => { traverseNode(child, parent); }); } @@ -668,11 +743,12 @@ function traverser(ast, visitor) { // We start by testing for the existence of a method on the visitor with a // matching `type`. - var method = visitor[node.type]; + let methods = visitor[node.type]; - // If it exists we'll call it with the `node` and its `parent`. - if (method) { - method(node, parent); + // If there is an `enter` method for this node type we'll call it with the + // `node` and its `parent`. + if (methods && methods.enter) { + methods.enter(node, parent); } // Next we are going to split things up by the current node type. @@ -688,14 +764,15 @@ function traverser(ast, visitor) { traverseArray(node.body, node); break; - // Next we do the same with `CallExpressions` and traverse their `params`. + // Next we do the same with `CallExpression` and traverse their `params`. case 'CallExpression': traverseArray(node.params, node); break; - // In the case of `NumberLiterals` we don't have any child nodes to visit, - // so we'll just break. + // In the cases of `NumberLiteral` and `StringLiteral` we don't have any + // child nodes to visit, so we'll just break. case 'NumberLiteral': + case 'StringLiteral': break; // And again, if we haven't recognized the node type then we'll throw an @@ -703,6 +780,12 @@ function traverser(ast, visitor) { default: throw new TypeError(node.type); } + + // If there is an `exit` method for this node type we'll call it with the + // `node` and its `parent`. 
+ if (methods && methods.exit) { + methods.exit(node, parent); + } } // Finally we kickstart the traverser by calling `traverseNode` with our ast @@ -763,9 +846,9 @@ function transformer(ast) { // We'll create a `newAst` which like our previous AST will have a program // node. - var newAst = { + let newAst = { type: 'Program', - body: [] + body: [], }; // Next I'm going to cheat a little and create a bit of a hack. We're going to @@ -780,51 +863,66 @@ function transformer(ast) { // We'll start by calling the traverser function with our ast and a visitor. traverser(ast, { - // The first visitor method accepts `NumberLiterals` - NumberLiteral: function(node, parent) { - // We'll create a new node also named `NumberLiteral` that we will push to - // the parent context. - parent._context.push({ - type: 'NumberLiteral', - value: node.value - }); + // The first visitor method accepts any `NumberLiteral` + NumberLiteral: { + // We'll visit them on enter. + enter(node, parent) { + // We'll create a new node also named `NumberLiteral` that we will push to + // the parent context. + parent._context.push({ + type: 'NumberLiteral', + value: node.value, + }); + }, }, - // Next up, `CallExpressions`. - CallExpression: function(node, parent) { - - // We start creating a new node `CallExpression` with a nested - // `Identifier`. - var expression = { - type: 'CallExpression', - callee: { - type: 'Identifier', - name: node.name - }, - arguments: [] - }; + // Next we have `StringLiteral` + StringLiteral: { + enter(node, parent) { + parent._context.push({ + type: 'StringLiteral', + value: node.value, + }); + }, + }, - // Next we're going to define a new context on the original - // `CallExpression` node that will reference the `expression`'s arguments - // so that we can push arguments. - node._context = expression.arguments; - - // Then we're going to check if the parent node is a `CallExpression`. - // If it is not... 
- if (parent.type !== 'CallExpression') { - - // We're going to wrap our `CallExpression` node with an - // `ExpressionStatement`. We do this because the top level - // `CallExpressions` in JavaScript are actually statements. - expression = { - type: 'ExpressionStatement', - expression: expression + // Next up, `CallExpression`. + CallExpression: { + enter(node, parent) { + + // We start creating a new node `CallExpression` with a nested + // `Identifier`. + let expression = { + type: 'CallExpression', + callee: { + type: 'Identifier', + name: node.name, + }, + arguments: [], }; - } - // Last, we push our (possibly wrapped) `CallExpression` to the `parent`'s - // `context`. - parent._context.push(expression); + // Next we're going to define a new context on the original + // `CallExpression` node that will reference the `expression`'s arguments + // so that we can push arguments. + node._context = expression.arguments; + + // Then we're going to check if the parent node is a `CallExpression`. + // If it is not... + if (parent.type !== 'CallExpression') { + + // We're going to wrap our `CallExpression` node with an + // `ExpressionStatement`. We do this because the top level + // `CallExpression` in JavaScript are actually statements. + expression = { + type: 'ExpressionStatement', + expression: expression, + }; + } + + // Last, we push our (possibly wrapped) `CallExpression` to the `parent`'s + // `context`. + parent._context.push(expression); + }, } }); @@ -858,7 +956,7 @@ function codeGenerator(node) { return node.body.map(codeGenerator) .join('\n'); - // For `ExpressionStatements` we'll call the code generator on the nested + // For `ExpressionStatement` we'll call the code generator on the nested // expression and we'll add a semicolon... 
case 'ExpressionStatement': return ( @@ -866,7 +964,7 @@ function codeGenerator(node) { ';' // << (...because we like to code the *correct* way) ); - // For `CallExpressions` we will print the `callee`, add an open + // For `CallExpression` we will print the `callee`, add an open // parenthesis, we'll map through each node in the `arguments` array and run // them through the code generator, joining them with a comma, and then // we'll add a closing parenthesis. @@ -879,14 +977,18 @@ function codeGenerator(node) { ')' ); - // For `Identifiers` we'll just return the `node`'s name. + // For `Identifier` we'll just return the `node`'s name. case 'Identifier': return node.name; - // For `NumberLiterals` we'll just return the `node`'s value. + // For `NumberLiteral` we'll just return the `node`'s value. case 'NumberLiteral': return node.value; + // For `StringLiteral` we'll add quotations around the `node`'s value. + case 'StringLiteral': + return '"' + node.value + '"'; + // And if we haven't recognized the node, we'll throw an error. default: throw new TypeError(node.type); @@ -911,10 +1013,10 @@ function codeGenerator(node) { */ function compiler(input) { - var tokens = tokenizer(input); - var ast = parser(tokens); - var newAst = transformer(ast); - var output = codeGenerator(newAst); + let tokens = tokenizer(input); + let ast = parser(tokens); + let newAst = transformer(ast); + let output = codeGenerator(newAst); // and simply return the output! return output; @@ -929,9 +1031,10 @@ function compiler(input) { // Now I'm just exporting everything... module.exports = { - tokenizer: tokenizer, - parser: parser, - transformer: transformer, - codeGenerator: codeGenerator, - compiler: compiler + tokenizer, + parser, + traverser, + transformer, + codeGenerator, + compiler, };