jamiebuilds_the-super-tiny-.../super-tiny-compiler.js
2016-03-30 17:39:16 -07:00

632 lines
22 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* TTTTTTTTTTTTTTTTTTTTTTTHHHHHHHHH HHHHHHHHHEEEEEEEEEEEEEEEEEEEEEE
* T:::::::::::::::::::::TH:::::::H H:::::::HE::::::::::::::::::::E
* T:::::::::::::::::::::TH:::::::H H:::::::HE::::::::::::::::::::E
* T:::::TT:::::::TT:::::THH::::::H H::::::HHEE::::::EEEEEEEEE::::E
* TTTTTT T:::::T TTTTTT H:::::H H:::::H E:::::E EEEEEE
* T:::::T H:::::H H:::::H E:::::E
* T:::::T H::::::HHHHH::::::H E::::::EEEEEEEEEE
* T:::::T H:::::::::::::::::H E:::::::::::::::E
* T:::::T H:::::::::::::::::H E:::::::::::::::E
* T:::::T H::::::HHHHH::::::H E::::::EEEEEEEEEE
* T:::::T H:::::H H:::::H E:::::E
* T:::::T H:::::H H:::::H E:::::E EEEEEE
* TT:::::::TT HH::::::H H::::::HHEE::::::EEEEEEEE:::::E
* T:::::::::T H:::::::H H:::::::HE::::::::::::::::::::E
* T:::::::::T H:::::::H H:::::::HE::::::::::::::::::::E
* TTTTTTTTTTT HHHHHHHHH HHHHHHHHHEEEEEEEEEEEEEEEEEEEEEE
*
* SSSSSSSSSSSSSSS UUUUUUUU UUUUUUUUPPPPPPPPPPPPPPPPP EEEEEEEEEEEEEEEEEEEEEERRRRRRRRRRRRRRRRR
* SS:::::::::::::::SU::::::U U::::::UP::::::::::::::::P E::::::::::::::::::::ER::::::::::::::::R
* S:::::SSSSSS::::::SU::::::U U::::::UP::::::PPPPPP:::::P E::::::::::::::::::::ER::::::RRRRRR:::::R
* S:::::S SSSSSSSUU:::::U U:::::UUPP:::::P P:::::PEE::::::EEEEEEEEE::::ERR:::::R R:::::R
* S:::::S U:::::U U:::::U P::::P P:::::P E:::::E EEEEEE R::::R R:::::R
* S:::::S U:::::U U:::::U P::::P P:::::P E:::::E R::::R R:::::R
* S::::SSSS U:::::U U:::::U P::::PPPPPP:::::P E::::::EEEEEEEEEE R::::RRRRRR:::::R
* SS::::::SSSSS U:::::U U:::::U P:::::::::::::PP E:::::::::::::::E R:::::::::::::RR
* SSS::::::::SS U:::::U U:::::U P::::PPPPPPPPP E:::::::::::::::E R::::RRRRRR:::::R
* SSSSSS::::S U:::::U U:::::U P::::P E::::::EEEEEEEEEE R::::R R:::::R
* S:::::S U:::::U U:::::U P::::P E:::::E R::::R R:::::R
* S:::::S U::::::U U::::::U P::::P E:::::E EEEEEE R::::R R:::::R
* SSSSSSS S:::::S U:::::::UUU:::::::U PP::::::PP EE::::::EEEEEEEE:::::ERR:::::R R:::::R
* S::::::SSSSSS:::::S UU:::::::::::::UU P::::::::P E::::::::::::::::::::ER::::::R R:::::R
* S:::::::::::::::SS UU:::::::::UU P::::::::P E::::::::::::::::::::ER::::::R R:::::R
* SSSSSSSSSSSSSSS UUUUUUUUU PPPPPPPPPP EEEEEEEEEEEEEEEEEEEEEERRRRRRRR RRRRRRR
*
* TTTTTTTTTTTTTTTTTTTTTTTIIIIIIIIIINNNNNNNN NNNNNNNNYYYYYYY YYYYYYY
* T:::::::::::::::::::::TI::::::::IN:::::::N N::::::NY:::::Y Y:::::Y
* T:::::::::::::::::::::TI::::::::IN::::::::N N::::::NY:::::Y Y:::::Y
* T:::::TT:::::::TT:::::TII::::::IIN:::::::::N N::::::NY::::::Y Y::::::Y
* TTTTTT T:::::T TTTTTT I::::I N::::::::::N N::::::NYYY:::::Y Y:::::YYY
* T:::::T I::::I N:::::::::::N N::::::N Y:::::Y Y:::::Y
* T:::::T I::::I N:::::::N::::N N::::::N Y:::::Y:::::Y
* T:::::T I::::I N::::::N N::::N N::::::N Y:::::::::Y
* T:::::T I::::I N::::::N N::::N:::::::N Y:::::::Y
* T:::::T I::::I N::::::N N:::::::::::N Y:::::Y
* T:::::T I::::I N::::::N N::::::::::N Y:::::Y
* T:::::T I::::I N::::::N N:::::::::N Y:::::Y
* TT:::::::TT II::::::IIN::::::N N::::::::N Y:::::Y
* T:::::::::T I::::::::IN::::::N N:::::::N YYYY:::::YYYY
* T:::::::::T I::::::::IN::::::N N::::::N Y:::::::::::Y
* TTTTTTTTTTT IIIIIIIIIINNNNNNNN NNNNNNN YYYYYYYYYYYYY
*
* CCCCCCCCCCCCC OOOOOOOOO MMMMMMMM MMMMMMMMPPPPPPPPPPPPPPPPP IIIIIIIIIILLLLLLLLLLL EEEEEEEEEEEEEEEEEEEEEERRRRRRRRRRRRRRRRR
* CCC::::::::::::C OO:::::::::OO M:::::::M M:::::::MP::::::::::::::::P I::::::::IL:::::::::L E::::::::::::::::::::ER::::::::::::::::R
* CC:::::::::::::::C OO:::::::::::::OO M::::::::M M::::::::MP::::::PPPPPP:::::P I::::::::IL:::::::::L E::::::::::::::::::::ER::::::RRRRRR:::::R
* C:::::CCCCCCCC::::CO:::::::OOO:::::::OM:::::::::M M:::::::::MPP:::::P P:::::PII::::::IILL:::::::LL EE::::::EEEEEEEEE::::ERR:::::R R:::::R
* C:::::C CCCCCCO::::::O O::::::OM::::::::::M M::::::::::M P::::P P:::::P I::::I L:::::L E:::::E EEEEEE R::::R R:::::R
* C:::::C O:::::O O:::::OM:::::::::::M M:::::::::::M P::::P P:::::P I::::I L:::::L E:::::E R::::R R:::::R
* C:::::C O:::::O O:::::OM:::::::M::::M M::::M:::::::M P::::PPPPPP:::::P I::::I L:::::L E::::::EEEEEEEEEE R::::RRRRRR:::::R
* C:::::C O:::::O O:::::OM::::::M M::::M M::::M M::::::M P:::::::::::::PP I::::I L:::::L E:::::::::::::::E R:::::::::::::RR
* C:::::C O:::::O O:::::OM::::::M M::::M::::M M::::::M P::::PPPPPPPPP I::::I L:::::L E:::::::::::::::E R::::RRRRRR:::::R
* C:::::C O:::::O O:::::OM::::::M M:::::::M M::::::M P::::P I::::I L:::::L E::::::EEEEEEEEEE R::::R R:::::R
* C:::::C O:::::O O:::::OM::::::M M:::::M M::::::M P::::P I::::I L:::::L E:::::E R::::R R:::::R
* C:::::C CCCCCCO::::::O O::::::OM::::::M MMMMM M::::::M P::::P I::::I L:::::L LLLLLL E:::::E EEEEEE R::::R R:::::R
* C:::::CCCCCCCC::::CO:::::::OOO:::::::OM::::::M M::::::MPP::::::PP II::::::IILL:::::::LLLLLLLLL:::::LEE::::::EEEEEEEE:::::ERR:::::R R:::::R
* CC:::::::::::::::C OO:::::::::::::OO M::::::M M::::::MP::::::::P I::::::::IL::::::::::::::::::::::LE::::::::::::::::::::ER::::::R R:::::R
* CCC::::::::::::C OO:::::::::OO M::::::M M::::::MP::::::::P I::::::::IL::::::::::::::::::::::LE::::::::::::::::::::ER::::::R R:::::R
* CCCCCCCCCCCCC OOOOOOOOO MMMMMMMM MMMMMMMMPPPPPPPPPP IIIIIIIIIILLLLLLLLLLLLLLLLLLLLLLLLEEEEEEEEEEEEEEEEEEEEEERRRRRRRR RRRRRRR
*
* =======================================================================================================================================================================
* =======================================================================================================================================================================
* =======================================================================================================================================================================
* =======================================================================================================================================================================
*/
/**
* Today we're going write a compiler together. But not just any compiler... A
* super duper tiny teeny compiler! A compiler that is so small that if you
* remove all the comments this file would only be ~200 lines of actual code.
*
* We're going to compile some lisp-like function calls into some C-like
* function calls.
*
* If you are familiar with one or the other. I'll just give you a quick intro.
*
* If we had two functions `add` and `subtract` they would be written like this:
*
* LISP C
*
* 2 + 2 (add 2 2) add(2, 2)
* 4 - 2 (subtract 4 2) subtract(4, 2)
* 2 + (4 - 2) (add 2 (subtract 4 2)) add(2, subtract(4, 2))
*
* Easy peezy right?
*
* Well good, because this is exactly what we are going to compile. While this
* is neither a complete LISP or C syntax, it will be enough of the syntax to
* demonstrate many of major pieces of a modern compiler.
*/
/**
* Most compiler break down into three primary stages: Parsing, Transformation,
* and Code Generation
*
* 1. *Parsing* is taking raw code and turning it into a more abstract
* representation of the code.
*
* 2. *Transformation* takes this abstract representation and manipulates to do
* whatever the compiler wants it to.
*
* 3. *Code Generation* takes the transformed representation of the code and
* turns it into new code.
*/
/**
* Parsing
* -------
*
* Parsing typically gets broken down into two phases: Lexical Analysis and
* Syntactic Analysis.
*
* 1. *Lexical Analysis* takes the raw code and splits it apart into these things
* called tokens by a thing called a tokenizer (or lexer).
*
* Tokens are an array of tiny little objects that describe an isolated piece
* of the syntax. They could be numbers, labels, punctuation, operators,
* whatever.
*
* 2. *Syntactic Analysis* takes the tokens and reformats them into a
* representation that describes each part of the syntax and their relation
* to one another. This is known as an intermediate representation or
* Abstract Syntax Tree.
*
* An Abstract Syntax Tree or AST for short is a deeply nested object that
* represents code in a way that is both easy to work with and tells us a lot
* of information.
*
* For the following syntax:
*
* (add 2 (subtract 4 2))
*
* Tokens might look something like this:
*
* [
* { type: 'paren', value: '(' },
* { type: 'name', value: 'add' },
* { type: 'number', value: '2' },
* { type: 'paren', value: '(' },
* { type: 'name', value: 'subtract' },
* { type: 'number', value: '4' },
* { type: 'number', value: '2' },
* { type: 'paren', value: ')' },
* { type: 'paren', value: ')' }
* ]
*
* And an Abstract Syntax Tree (AST) might look like this:
*
* {
* type: 'Program',
* body: [{
* type: 'CallExpression',
* name: 'add',
* params: [{
* type: 'NumberLiteral',
* value: '2'
* }, {
* type: 'CallExpression',
* name: 'subtract',
* params: [{
* type: 'NumberLiteral',
* value: '4'
* }, {
* type: 'NumberLiteral',
* value: '2'
* }]
* }]
* }]
* }
*/
/**
* Transformation
* --------------
*
* The next type of stage of a compiler is transformation. Again, this just
* takes the AST from the last step and makes changes to it. It can manipulate
* the AST in the same language or it can translate it into an entirely new
* language.
*
* Lets look at how we would transform an AST.
*
* You might notice that our AST has elements within it that look very similar.
* There are these objects with a type property. Each of these are known as an
* AST Node. These nodes have defined properties on them that describe one
* isolated part of the tree.
*
* We can have a node for a "NumberLiteral":
*
* {
* type: 'NumberLiteral',
* value: '2'
* }
*
* Or maybe a node for a "CallExpression":
*
* {
* type: 'CallExpression',
* name: 'subtract',
* params: [...nested nodes go here...]
* }
*
* When transforming the AST we can manipulate nodes by
* adding/removing/replacing properties, we can add new nodes, remove nodes, or
* we could leave the existing AST alone and create and entirely new one based
* on it.
*
* Since were targeting a new language, were going to focus on creating an
* entirely new AST that is specific to the target language.
*
* Traversal
* ---------
*
* In order to navigate through all of these nodes, we need to be able to
* traverse through them. This traversal process goes to each node in the AST
* depth-first.
*
* {
* type: 'Program',
* body: [{
* type: 'CallExpression',
* name: 'add',
* params: [{
* type: 'NumberLiteral',
* value: '2'
* }, {
* type: 'CallExpression',
* name: 'subtract',
* params: [{
* type: 'NumberLiteral',
* value: '4'
* }, {
* type: 'NumberLiteral',
* value: '2'
* }]
* }]
* }]
* }
*
* So for the above AST we would go:
*
* 1. Program - Starting at the top level of the AST
* 2. CallExpression (add) - Moving to the first element of the Program's body
* 3. NumberLiteral (2) - Moving to the first element of CallExpression's params
* 4. CallExpression (subtract) - Moving to the second element of CallExpression's params
* 5. NumberLiteral (4) - Moving to the first element of CallExpression's params
* 6. NumberLiteral (2) - Moving to the second element of CallExpression's params
*
* If we were manipulating this AST directly instead of creating a separate AST
* we would likely introduce all sorts of abstractions here. But just visiting
* each node in the tree is enough.
*
* The reason I use the word “visiting” is because there is this pattern of how
* to represent operations on elements of an object structure.
*
* Visitors
* --------
*
* The basic idea here is that we are going to create a “visitor” object that
* has methods that will accept different node types.
*
* var visitor = {
* NumberLiteral() {},
* CallExpression() {}
* };
*
* When we traverse our AST we will call the methods on this visitor whenever we
* encounter a node of a matching type.
*
* In order to make this useful we will also pass the node and a reference to
* the parent node.
*
* var visitor = {
* NumberLiteral(node, parent) {},
* CallExpression(node, parent) {}
* };
*/
/**
* Code Generation
* ---------------
*
* The final phase of a compiler is code generation. Sometimes compilers will do
* things that overlap with transformation, but for the most part code
* generation just means take our AST and string-ify code back out.
*
* Code generators work several different ways, some compilers will reuse the
* tokens from earlier, others will have created a separate representation of
* the code so that they can print node linearly, but from what I can tell most
* will use the same AST we just created which is what were going to focus on.
*
* Effectively our code generator will know how to “print” all of the different
* node types of the AST, and it will recursively call itself to print nested
* nodes until everything is printed into one long string of code.
*/
/**
* And that's it! That's all the different pieces of a compiler.
*
* Now that isnt to say every compiler looks exactly like I described here.
* Compilers serve many different purposes, and they might need more steps than
* I have detailed.
*
* But now you should have a general high-level idea of what most compilers look
* like.
*
* Now that Ive explained all of this, youre all good to go write your own
* compilers right?
*
* Just kidding, that's what I'm here to help with :P
*
* So let's begin...
*/
/**
* -----------------------------------------------------------------------------
* *Note:* This is all I've written so far, so the code below isn't annnotated
* yet. You can still read it all and it totally works, but I plan on improving
* this in the near future
* -----------------------------------------------------------------------------
*/
/**
* ============================================================================
* (/^▽^)/
* THE TOKENIZER!
* ============================================================================
*/
function tokenizer(input) {
var current = 0;
var tokens = [];
while (current < input.length) {
var char = input[current];
if (char === '(') {
tokens.push({
type: 'paren',
value: '('
});
current++;
continue;
}
if (char === ')') {
tokens.push({
type: 'paren',
value: ')'
});
current++;
continue;
}
var WHITESPACE = /\s/;
if (WHITESPACE.test(char)) {
current++;
continue;
}
var NUMBERS = /[0-9]/;
if (NUMBERS.test(char)) {
var value = '';
while (NUMBERS.test(char)) {
value += char;
char = input[++current];
}
tokens.push({
type: 'number',
value: value
});
continue;
}
var LETTERS = /[a-zA-Z]/;
if (LETTERS.test(char)) {
var value = '';
while (LETTERS.test(char)) {
value += char;
char = input[++current];
}
tokens.push({
type: 'name',
value: value
});
continue;
}
throw new TypeError('I dont know what this character is: ' + char);
}
return tokens;
}
/**
* ============================================================================
* ヽ/❀o ل͜ o\ノ
* THE PARSER!!!
* ============================================================================
*/
function parser(tokens) {
var current = 0;
function walk() {
var token = tokens[current];
if (token.type === 'number') {
current++;
return {
type: 'NumberLiteral',
value: token.value
};
}
if (
token.type === 'paren' &&
token.value === '('
) {
current++;
var node = {
type: 'CallExpression',
name: tokens[current].value,
params: []
};
current++;
while (
token.type !== 'paren' ||
token.value !== ')'
) {
node.params.push(walk());
token = tokens[current];
}
current++;
return node;
}
throw new TypeError(token.type);
}
var program = {
type: 'Program',
body: []
};
while (current < tokens.length) {
program.body.push(walk());
}
return program;
}
/**
* ============================================================================
* ⌒(❀>◞౪◟<❀)⌒
* THE TRAVERSER!!!
* ============================================================================
*/
function traverser(program, visitor) {
function traverseArray(array, parent) {
array.forEach(function(child) {
traverseNode(child, parent);
});
}
function traverseNode(node, parent) {
var method = visitor[node.type];
if (method) {
method(node, parent);
}
switch (node.type) {
case 'Program':
traverseArray(node.body, node);
break;
case 'CallExpression':
traverseArray(node.params, node);
break;
case 'NumberLiteral':
break;
default:
throw new TypeError(node.type);
}
}
traverseNode(program, null);
}
/**
* ============================================================================
* ⁽(◍˃̵͈̑ᴗ˂̵͈̑)⁽
* THE TRANSFORMER!!!
* ============================================================================
*/
function transformer(program) {
var ast = {
type: 'Program',
body: []
};
program._context = ast.body;
traverser(program, {
NumberLiteral: function(node, parent) {
parent._context.push({
type: 'NumberLiteral',
value: node.value
});
},
CallExpression: function(node, parent) {
var expression = {
type: 'CallExpression',
callee: {
type: 'Identifier',
name: node.name
},
arguments: []
};
node._context = expression.arguments;
if (parent.type !== 'CallExpression') {
expression = {
type: 'ExpressionStatement',
expression: expression
};
}
parent._context.push(expression);
}
});
return ast;
}
/**
* ============================================================================
* ヾ(〃^∇^)ノ♪
* THE CODE GENERATOR!!!!
* ============================================================================
*/
function codeGenerator(node) {
switch (node.type) {
case 'Program':
return node.body.map(codeGenerator)
.join('\n');
case 'ExpressionStatement':
return (
codeGenerator(node.expression) +
';'
);
case 'CallExpression':
return (
codeGenerator(node.callee) +
'(' +
node.arguments.map(codeGenerator)
.join(', ') +
')'
);
case 'Identifier':
return node.name;
case 'NumberLiteral':
return node.value;
default:
throw new TypeError(node.type);
}
}
/**
* ============================================================================
* (۶* ‘ヮ’)۶”
* !!!!!!!!THE COMPILER!!!!!!!!
* ============================================================================
*/
function compiler(input) {
var tokens = tokenizer(input);
var ast = parser(tokens);
var newAst = transformer(ast);
var output = codeGenerator(newAst);
return output;
}
/**
* ============================================================================
* (๑˃̵ᴗ˂̵)و
* !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!YOU MADE IT!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
* ============================================================================
*/
// Now I'm just exporting everything...
module.exports = {
tokenizer: tokenizer,
parser: parser,
transformer: transformer,
codeGenerator: codeGenerator,
compiler: compiler
};