// Source file: super-tiny-compiler.php
//
// This section contains the six stages of the compiler:
// tokenizer, parser, traverser, transformer, codeGenerator and compiler.

/**
 * THE TOKENIZER
 *
 * Breaks the source string down into a flat array of tokens:
 *
 *   '(add 2 2)'   =>   [{ type: 'paren', value: '(' }, ...]
 *
 * Each token is an associative array with two string entries, `type`
 * ('paren', 'number' or 'name') and `value` (the matched text). Whitespace
 * separates tokens but is never stored.
 *
 * @param string $input Source code to tokenize.
 * @return array List of tokens in source order.
 * @throws Exception When a character is not a parenthesis, whitespace,
 *                   a digit or an ASCII letter.
 */
function tokenizer($input)
{
    // A `current` variable for tracking our position in the code like a cursor.
    $current = 0;

    // The input never changes while we scan it, so we measure it only once.
    $length = strlen($input);

    // And a `tokens` array for pushing our tokens to.
    $tokens = [];

    // Single-character patterns used to classify the character under the
    // cursor. They are loop-invariant, so they are defined before the loop.
    $WHITESPACE = '/\s/';
    $NUMBERS = '/[0-9]/';
    $LETTERS = '/[a-zA-Z]/';

    // We use a `while` loop (rather than a `for`) because we may want to
    // advance `current` many times within a single pass: our tokens can be
    // any length.
    while ($current < $length) {

        // We're also going to store the `current` character in the `input`.
        $char = $input[$current];

        // The first thing we want to check for is a parenthesis. These will
        // later be used for `CallExpressions` but for now we only care about
        // the character. Opening and closing parentheses are handled the same
        // way: push a `paren` token holding the character, step past it, and
        // `continue` onto the next cycle of the loop.
        if ($char === '(' || $char === ')') {
            $tokens[] = [
                'type' => 'paren',
                'value' => $char,
            ];
            $current++;
            continue;
        }

        // Moving on, we're now going to check for whitespace. This is
        // interesting because we care that whitespace exists to separate
        // characters, but it isn't actually important for us to store as a
        // token. We would only throw it out later.
        //
        // So here we're just going to test for existence and if it does exist
        // we're going to just `continue` on.
        if (preg_match($WHITESPACE, $char)) {
            $current++;
            continue;
        }

        // The next type of token is a number. This is different than what we
        // have seen before because a number could be any number of characters
        // and we want to capture the entire sequence of characters as one
        // token.
        //
        //   (add 123 456)
        //        ^^^ ^^^
        //        Only two separate tokens
        //
        // So we start this off when we encounter the first number in a
        // sequence.
        if (preg_match($NUMBERS, $char)) {

            // We're going to create a `value` string that we are going to push
            // characters to.
            $value = '';

            // Then we loop through each character in the sequence until we
            // encounter a character that is not a number, pushing each digit
            // to our `value` and incrementing `current` as we go. The length
            // check stops us from reading past the end of the input when the
            // source ends with a number.
            while ($current < $length && preg_match($NUMBERS, $input[$current])) {
                $value .= $input[$current];
                $current++;
            }

            // After that we push our `number` token to the `tokens` array.
            $tokens[] = [
                'type' => 'number',
                'value' => $value,
            ];

            // And we continue on.
            continue;
        }

        // The last type of token will be a `name` token. This is a sequence of
        // letters instead of numbers, that are the names of functions in our
        // lisp syntax.
        //
        //   (add 2 4)
        //    ^^^
        //    Name token
        //
        if (preg_match($LETTERS, $char)) {
            $value = '';

            // Again we're just going to loop through all the letters pushing
            // them to a value, with the same end-of-input guard as above.
            while ($current < $length && preg_match($LETTERS, $input[$current])) {
                $value .= $input[$current];
                $current++;
            }

            // And pushing that value as a token with the type `name` and
            // continuing.
            $tokens[] = [
                'type' => 'name',
                'value' => $value,
            ];

            continue;
        }

        // Finally if we have not matched a character by now, we're going to
        // throw an error and completely exit.
        throw new Exception('I dont know what this character is: ' . $char);
    }

    // Then at the end of our `tokenizer` we simply return the tokens array.
    return $tokens;
}

/**
 * ============================================================================
 *                                 ヽ/❀o ل͜ o\ノ
 *                                THE PARSER!!!
 * ============================================================================
 */

/**
 * For our parser we're going to take our array of tokens and turn it into an
 * AST.
 *
 *   [{ type: 'paren', value: '(' }, ...]   =>   { type: 'Program', body: [...] }
 *
 * @param array $tokens Tokens as produced by `tokenizer`.
 * @return array Root node: ['type' => 'Program', 'body' => [...]], where each
 *               body entry is a `CallExpression` or `NumberLiteral` node.
 * @throws Exception When the tokens end before an expression is closed
 *                   ('Unexpected end of input'), or when a token cannot start
 *                   an expression (the message is the token's type).
 */
function parser($tokens)
{
    // Again we keep a `current` variable that we will use as a cursor.
    $current = 0;
    $count = count($tokens);

    // But this time we're going to use recursion instead of a `while` loop. So
    // we define a `walk` function.
    //
    // It is a closure rather than a named function declared inside `parser`:
    // a nested named function is registered globally the first time `parser`
    // runs, so calling `parser` a second time would be a fatal redeclaration
    // error. The closure captures itself by reference so it can recurse, and
    // shares the `current` cursor by reference with the loop further down.
    $walk = function () use (&$walk, &$current, $tokens, $count) {

        // Running out of tokens here means an expression was never closed.
        if ($current >= $count) {
            throw new Exception('Unexpected end of input');
        }

        // Inside the walk function we start by grabbing the `current` token.
        $token = $tokens[$current];

        // We're going to split each type of token off into a different code
        // path, starting off with `number` tokens.
        //
        // We test to see if we have a `number` token.
        if ($token['type'] === 'number') {

            // If we have one, we'll increment `current`.
            $current++;

            // And we'll return a new AST node called `NumberLiteral` and
            // setting its value to the value of our token.
            return [
                'type' => 'NumberLiteral',
                'value' => $token['value'],
            ];
        }

        // Next we're going to look for CallExpressions. We start this off when
        // we encounter an open parenthesis.
        if ($token['type'] === 'paren' && $token['value'] === '(') {

            // We'll increment `current` to skip the parenthesis since we don't
            // care about it in our AST.
            $current++;
            if ($current >= $count) {
                throw new Exception('Unexpected end of input');
            }

            // We create a base node with the type `CallExpression`, and we're
            // going to set the name as the current token's value since the
            // next token after the open parenthesis is the name of the
            // function.
            $node = [
                'type' => 'CallExpression',
                'name' => $tokens[$current]['value'],
                'params' => [],
            ];

            // We increment `current` *again* to skip the name token.
            $current++;

            // And now we want to loop through each token that will be the
            // `params` of our `CallExpression` until we encounter a closing
            // parenthesis.
            //
            // Now this is where recursion comes in. Instead of trying to parse
            // a potentially infinitely nested set of nodes we're going to rely
            // on recursion to resolve things.
            //
            // To explain this, let's take our Lisp code. You can see that the
            // parameters of the `add` are a number and a nested
            // `CallExpression` that includes its own numbers.
            //
            //   (add 2 (subtract 4 2))
            //
            // You'll also notice that in our tokens array we have multiple
            // closing parenthesis.
            //
            //   [
            //     { type: 'paren',  value: '('        },
            //     { type: 'name',   value: 'add'      },
            //     { type: 'number', value: '2'        },
            //     { type: 'paren',  value: '('        },
            //     { type: 'name',   value: 'subtract' },
            //     { type: 'number', value: '4'        },
            //     { type: 'number', value: '2'        },
            //     { type: 'paren',  value: ')'        }, <<< Closing parenthesis
            //     { type: 'paren',  value: ')'        }  <<< Closing parenthesis
            //   ]
            //
            // We're going to rely on the nested `walk` call to increment our
            // `current` variable past any nested `CallExpressions`.
            while (true) {

                // The closing parenthesis never arrived.
                if ($current >= $count) {
                    throw new Exception('Unexpected end of input');
                }

                // We stop as soon as we see a token with a `type` of `'paren'`
                // and a `value` of a closing parenthesis.
                $token = $tokens[$current];
                if ($token['type'] === 'paren' && $token['value'] === ')') {
                    break;
                }

                // Otherwise we call `walk`, which returns a node (and moves
                // the cursor past it), and push it into our `node.params`.
                $node['params'][] = $walk();
            }

            // Finally we will increment `current` one last time to skip the
            // closing parenthesis.
            $current++;

            // And return the node.
            return $node;
        }

        // Again, if we haven't recognized the token type by now we're going to
        // throw an error.
        throw new Exception($token['type']);
    };

    // Now, we're going to create our AST which will have a root which is a
    // `Program` node.
    $ast = [
        'type' => 'Program',
        'body' => [],
    ];

    // And we're going to kickstart our `walk` function, pushing nodes to our
    // `ast.body` array.
    //
    // The reason we are doing this inside a loop is because our program can
    // have `CallExpressions` after one another instead of being nested.
    //
    //   (add 2 2)
    //   (subtract 4 2)
    //
    while ($current < $count) {
        $ast['body'][] = $walk();
    }

    // At the end of our parser we'll return the AST.
    return $ast;
}

/**
 * ============================================================================
 *                                 ⌒(❀>◞౪◟<❀)⌒
 *                               THE TRAVERSER!!!
 * ============================================================================
 */

/**
 * So now we have our AST, and we want to be able to visit different nodes with
 * a visitor. We need to be able to call the methods on the visitor whenever we
 * encounter a node with a matching type.
 *
 *   traverse(ast, {
 *     Program(node, parent) {
 *       // ...
 *     },
 *
 *     CallExpression(node, parent) {
 *       // ...
 *     },
 *
 *     NumberLiteral(node, parent) {
 *       // ...
 *     }
 *   });
 *
 * In this PHP version the visitor is an associative array mapping a node type
 * to a callable that receives `($node, $parent)`. A callable may declare
 * `&$node`; changes it makes to the node are then seen by that node's
 * children, which receive the node as their `$parent`.
 *
 * @param array $ast     Root node to walk (depth first, parents before children).
 * @param array $visitor Map of node type => callable($node, $parent).
 * @return void
 * @throws Exception When a node's type is not `Program`, `CallExpression` or
 *                   `NumberLiteral` (the message is the node's type).
 */
function traverser($ast, $visitor)
{
    // Inside we define two functions. Both are closures rather than named
    // functions declared inside `traverser`, because nested named functions
    // would make a second call to `traverser` fail with a redeclaration error.
    // `traverseNode` is captured by reference because it is only assigned
    // after `traverseArray` has been created.
    $traverseNode = null;

    // A `traverseArray` function that will allow us to iterate over an array
    // and call the next function that we will define: `traverseNode`.
    $traverseArray = function ($array, $parent) use (&$traverseNode) {
        foreach ($array as $child) {
            $traverseNode($child, $parent);
        }
    };

    // `traverseNode` will accept a `node` and its `parent` node. So that it
    // can pass both to our visitor methods.
    $traverseNode = function ($node, $parent) use ($traverseArray, $visitor) {

        // We start by testing for the existence of a method on the visitor
        // with a matching `type`.
        if (isset($visitor[$node['type']])) {
            $method = $visitor[$node['type']];

            // If it exists we'll call it with the `node` and its `parent`.
            // `$node` is passed as a variable so that a visitor declared with
            // `&$node` can modify it before its children are traversed.
            $method($node, $parent);
        }

        // Next we are going to split things up by the current node type.
        switch ($node['type']) {

            // We'll start with our top level `Program`. Since Program nodes
            // have a property named body that has an array of nodes, we will
            // call `traverseArray` to traverse down into them.
            //
            // (Remember that `traverseArray` will in turn call `traverseNode`
            // so we are causing the tree to be traversed recursively)
            case 'Program':
                $traverseArray($node['body'], $node);
                break;

            // Next we do the same with `CallExpressions` and traverse their
            // `params`.
            case 'CallExpression':
                $traverseArray($node['params'], $node);
                break;

            // In the case of `NumberLiterals` we don't have any child nodes to
            // visit, so we'll just break.
            case 'NumberLiteral':
                break;

            // And again, if we haven't recognized the node type then we'll
            // throw an error.
            default:
                throw new Exception($node['type']);
        }
    };

    // Finally we kickstart the traverser by calling `traverseNode` with our
    // ast with no `parent` because the top level of the AST doesn't have a
    // parent.
    $traverseNode($ast, null);
}

/**
 * ============================================================================
 *                                   ⁽(◍˃̵͈̑ᴗ˂̵͈̑)⁽
 *                              THE TRANSFORMER!!!
 * ============================================================================
 */

/**
 * Next up, the transformer. Our transformer is going to take the AST that we
 * have built and pass it to our traverser function with a visitor and will
 * create a new ast.
 *
 * ----------------------------------------------------------------------------
 *   Original AST                     |   Transformed AST
 * ----------------------------------------------------------------------------
 *   {                                |   {
 *     type: 'Program',               |     type: 'Program',
 *     body: [{                       |     body: [{
 *       type: 'CallExpression',      |       type: 'ExpressionStatement',
 *       name: 'add',                 |       expression: {
 *       params: [{                   |         type: 'CallExpression',
 *         type: 'NumberLiteral',     |         callee: {
 *         value: '2'                 |           type: 'Identifier',
 *       }, {                         |           name: 'add'
 *         type: 'CallExpression',    |         },
 *         name: 'subtract',          |         arguments: [{
 *         params: [{                 |           type: 'NumberLiteral',
 *           type: 'NumberLiteral',   |           value: '2'
 *           value: '4'               |         }, {
 *         }, {                       |           type: 'CallExpression',
 *           type: 'NumberLiteral',   |           callee: {
 *           value: '2'               |             type: 'Identifier',
 *         }]                         |             name: 'subtract'
 *       }]                           |           },
 *     }]                             |           arguments: [{
 *   }                                |             type: 'NumberLiteral',
 *                                    |             value: '4'
 * ---------------------------------- |           }, {
 *                                    |             type: 'NumberLiteral',
 *                                    |             value: '2'
 *                                    |           }]
 *  (sorry the other one is longer.)  |         }
 *                                    |       }
 *                                    |     }]
 *                                    |   }
 * ----------------------------------------------------------------------------
 *
 * @param array $ast The lisp AST produced by `parser`.
 * @return array The new `Program` node described in the right-hand column.
 * @throws Exception Propagated from `traverser` for unknown node types.
 */
function transformer($ast)
{
    // We'll create a `newAst` which like our previous AST will have a program
    // node.
    $newAst = [
        'type' => 'Program',
        'body' => [],
    ];

    // Next I'm going to cheat a little and create a bit of a hack. We're going
    // to use a property named `_context` on our parent nodes, and each node
    // pushes its output to its parent's `_context`. Normally you would have a
    // better abstraction than this, but for our purposes this keeps things
    // simple.
    //
    // Just take note that the context is a reference *from* the old ast *to*
    // the new ast. PHP arrays are copied by value, so the entry is bound with
    // `=&`: a reference stored inside an array stays a reference when that
    // array is copied, which is what lets writes made deep inside the
    // traverser land in `$newAst`.
    $ast['_context'] = &$newAst['body'];

    // We'll start by calling the traverser function with our ast and a visitor.
    traverser($ast, [

        // The first visitor method accepts `NumberLiterals`
        'NumberLiteral' => function ($node, $parent) {
            // We'll create a new node also named `NumberLiteral` that we will
            // push to the parent context.
            $parent['_context'][] = [
                'type' => 'NumberLiteral',
                'value' => $node['value'],
            ];
        },

        // Next up, `CallExpressions`. The node is taken by reference so that
        // the `_context` we attach below is visible to the traverser, which
        // hands this node to its children as their `parent`.
        'CallExpression' => function (&$node, $parent) {

            // We start creating a new node `CallExpression` with a nested
            // `Identifier`.
            $expression = [
                'type' => 'CallExpression',
                'callee' => [
                    'type' => 'Identifier',
                    'name' => $node['name'],
                ],
                'arguments' => [],
            ];

            // Next we're going to define a new context on the original
            // `CallExpression` node that will reference the `expression`'s
            // arguments so that we can push arguments.
            $node['_context'] = &$expression['arguments'];

            // Then we're going to check if the parent node is a
            // `CallExpression`. If it is not...
            if ($parent['type'] !== 'CallExpression') {

                // We're going to wrap our `CallExpression` node with an
                // `ExpressionStatement`. We do this because the top level
                // `CallExpressions` in JavaScript are actually statements.
                $expression = [
                    'type' => 'ExpressionStatement',
                    'expression' => $expression,
                ];
            }

            // Last, we push our (possibly wrapped) `CallExpression` to the
            // `parent`'s `context`.
            $parent['_context'][] = $expression;
        },
    ]);

    // At the end of our transformer function we'll return the new ast that we
    // just created.
    return $newAst;
}

/**
 * ============================================================================
 *                               ヾ(〃^∇^)ノ♪
 *                            THE CODE GENERATOR!!!!
 * ============================================================================
 */

/**
 * Now let's move onto our last phase: The Code Generator.
 *
 * Our code generator is going to recursively call itself to print each node in
 * the tree into one giant string.
 *
 * @param array $node A node of the transformed AST (`Program`,
 *                    `ExpressionStatement`, `CallExpression`, `Identifier`
 *                    or `NumberLiteral`).
 * @return string The generated source text for that node.
 * @throws Exception When the node's type is not one of the above (the message
 *                   is the node's type).
 */
function codeGenerator($node)
{
    // We'll break things down by the `type` of the `node`.
    switch ($node['type']) {

        // If we have a `Program` node. We will map through each node in the
        // `body` and run them through the code generator and join them with a
        // newline.
        case 'Program':
            return implode(PHP_EOL, array_map('codeGenerator', $node['body']));

        // For `ExpressionStatements` we'll call the code generator on the
        // nested expression and we'll add a semicolon...
        case 'ExpressionStatement':
            return (
                codeGenerator($node['expression']) .
                ';' // << (...because we like to code the *correct* way)
            );

        // For `CallExpressions` we will print the `callee`, add an open
        // parenthesis, we'll map through each node in the `arguments` array
        // and run them through the code generator, joining them with a comma,
        // and then we'll add a closing parenthesis.
        case 'CallExpression':
            return (
                codeGenerator($node['callee']) .
                '(' .
                implode(', ', array_map('codeGenerator', $node['arguments'])) .
                ')'
            );

        // For `Identifiers` we'll just return the `node`'s name.
        case 'Identifier':
            return $node['name'];

        // For `NumberLiterals` we'll just return the `node`'s value.
        case 'NumberLiteral':
            return $node['value'];

        // And if we haven't recognized the node, we'll throw an error.
        default:
            throw new Exception($node['type']);
    }
}

/**
 * ============================================================================
 *                                  (۶* ‘ヮ’)۶”
 *                         !!!!!!!!THE COMPILER!!!!!!!!
 * ============================================================================
 */

/**
 * FINALLY! We'll create our `compiler` function. Here we will link together
 * every part of the pipeline.
 *
 *   1. input  => tokenizer   => tokens
 *   2. tokens => parser      => ast
 *   3. ast    => transformer => newAst
 *   4. newAst => generator   => output
 *
 * @param string $input Lisp-style source, e.g. '(add 2 (subtract 4 2))'.
 * @return string Generated C-style source, e.g. 'add(2, subtract(4, 2));'.
 * @throws Exception Propagated from any of the pipeline stages.
 */
function compiler($input)
{
    $tokens = tokenizer($input);
    $ast = parser($tokens);
    $newAst = transformer($ast);
    $output = codeGenerator($newAst);

    // and simply return the output!
    return $output;
}