From 110068e7a5b647bf3e98badcb4b9e20cbb784f15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Cha=CC=81varri?= Date: Tue, 19 Sep 2017 21:08:09 +0200 Subject: [PATCH 1/3] Configuration --- .gitignore | 5 +++++ bsconfig.json | 14 ++++++++++++++ package.json | 11 ++++++++--- .../the-super-tiny-compiler.re | 0 4 files changed, 27 insertions(+), 3 deletions(-) create mode 100644 .gitignore create mode 100644 bsconfig.json rename the-super-tiny-compiler.js => src/the-super-tiny-compiler.re (100%) diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..77bd7bc --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +.DS_Store +.merlin +npm-debug.log +/lib/bs/ +/node_modules/ diff --git a/bsconfig.json b/bsconfig.json new file mode 100644 index 0000000..889b32d --- /dev/null +++ b/bsconfig.json @@ -0,0 +1,14 @@ +// This is the configuration file used by BuckleScript's build system bsb. Its documentation lives here: http://bucklescript.github.io/bucklescript/docson/#build-schema.json +// BuckleScript comes with its own parser for bsconfig.json, which is normal JSON, with the extra support of comments and trailing commas. +{ + "name": "the-super-tiny-compiler", + "version": "0.1.0", + "bsc-flags": ["-bs-super-errors"], + "sources": [ + "src" + ], + "bs-dependencies" : [ + // add your dependencies here. You'd usually install them normally through `npm install my-dependency`. If my-dependency has a bsconfig.json too, then everything will work seamlessly. + ], + "namespace": true +} diff --git a/package.json b/package.json index b3df43d..15aa46f 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,12 @@ { "name": "the-super-tiny-compiler", "version": "1.0.0", - "author": "James Kyle (thejameskyle.com)", - "license": "CC-BY-4.0", - "main": "./the-super-tiny-compiler.js" + "scripts": { + "build": "bsb -make-world", + "start": "bsb -make-world -w", + "clean": "bsb -clean-world" + }, + "devDependencies": { + "bs-platform": "1.9.2" + } } diff --git a/the-super-tiny-compiler.js b/src/the-super-tiny-compiler.re similarity index 100% rename from the-super-tiny-compiler.js rename to src/the-super-tiny-compiler.re From b14161b025f894b9276d21762e24a96f6a0000ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Cha=CC=81varri?= Date: Sat, 30 Sep 2017 19:21:36 +0200 Subject: [PATCH 2/3] Part 1: the lexer --- lib/js/src/compiler.js | 282 +++++++++ src/compiler.re | 59 ++ src/the-super-tiny-compiler.re | 1038 -------------------------------- 3 files changed, 341 insertions(+), 1038 deletions(-) create mode 100644 lib/js/src/compiler.js create mode 100755 src/compiler.re delete mode 100755 src/the-super-tiny-compiler.re diff --git a/lib/js/src/compiler.js b/lib/js/src/compiler.js new file mode 100644 index 0000000..04727cf --- /dev/null +++ b/lib/js/src/compiler.js @@ -0,0 +1,282 @@ +// Generated by BUCKLESCRIPT VERSION 1.9.3, PLEASE EDIT WITH CARE +'use strict'; + +var List = require("bs-platform/lib/js/list.js"); +var Block = require("bs-platform/lib/js/block.js"); +var $$String = require("bs-platform/lib/js/string.js"); +var Caml_string = require("bs-platform/lib/js/caml_string.js"); +var Js_primitive = require("bs-platform/lib/js/js_primitive.js"); + +function explode(s) { + var _i = s.length - 1 | 0; + var _l = /* [] */0; + while(true) { + var l = _l; + var i = _i; + if (i < 0) { + return l; + } else { + _l = /* :: */[ + Caml_string.get(s, i), + l + ]; + _i = i - 1 | 0; + continue ; + + } + }; +} + +function tokenizer(input) { + var _input = explode(input); + var _current = /* None */0; + var _tokens = /* [] */0; + while(true) { + var tokens = _tokens; + var current = _current; + var input$1 = _input; + if (input$1) { + var head = List.hd(input$1); + var tail = List.tl(input$1); + var exit = 0; + var exit$1 = 0; + var exit$2 = 0; + var switcher = head - 32 | 0; + if (switcher > 9 || switcher < 0) { + exit$2 = 3; + } else { + switch (switcher) { + case 0 : + if (current) { + var match = current[0]; + if (typeof match === "number") { + exit$1 = 2; + } else { + switch (match.tag | 0) { + case 0 : + _tokens = /* :: */[ + /* Number */Block.__(0, [match[0]]), + tokens + ]; + _current = /* None */0; + _input = tail; + continue ; + case 1 : + exit$2 = 3; + break; + case 2 : + _tokens = /* :: */[ + /* Name */Block.__(2, [match[0]]), + tokens + ]; + _current = /* None */0; + _input = tail; + continue ; + + } + } + } else { + exit$1 = 2; + } + break; + case 2 : + if (current) { + var match$1 = current[0]; + if (typeof match$1 === "number") { + return List.rev(tokens); + } else if (match$1.tag === 1) { + _tokens = /* :: */[ + /* String */Block.__(1, [match$1[0]]), + tokens + ]; + _current = /* None */0; + _input = tail; + continue ; + + } else { + return List.rev(tokens); + } + } else { + _current = /* Some */[/* String */Block.__(1, [""])]; + _input = tail; + continue ; + + } + break; + case 1 : + case 3 : + case 4 : + case 5 : + case 6 : + case 7 : + exit$2 = 3; + break; + case 8 : + if (current) { + exit$2 = 3; + } else { + _tokens = /* :: */[ + /* OpenParen */0, + tokens + ]; + _current = /* None */0; + _input = tail; + continue ; + + } + break; + case 9 : + if (current) { + var match$2 = current[0]; + if (typeof match$2 === "number") { + return List.rev(tokens); + } else { + switch (match$2.tag | 0) { + case 0 : + _tokens = /* :: */[ + /* CloseParen */1, + /* :: */[ + /* Number */Block.__(0, [match$2[0]]), + tokens + ] + ]; + _current = /* None */0; + _input = tail; + continue ; + case 1 : + exit$2 = 3; + break; + case 2 : + _tokens = /* :: */[ + /* CloseParen */1, + /* :: */[ + /* Name */Block.__(2, [match$2[0]]), + tokens + ] + ]; + _current = /* None */0; + _input = tail; + continue ; + + } + } + } else { + _tokens = /* :: */[ + /* CloseParen */1, + tokens + ]; + _current = /* None */0; + _input = tail; + continue ; + + } + break; + + } + } + if (exit$2 === 3) { + if (current) { + var match$3 = current[0]; + if (typeof match$3 === "number") { + exit$1 = 2; + } else if (match$3.tag === 1) { + _current = /* Some */[/* String */Block.__(1, [match$3[0] + $$String.make(1, head)])]; + _input = tail; + continue ; + + } else { + exit$1 = 2; + } + } else { + exit$1 = 2; + } + } + if (exit$1 === 2) { + if (head >= 32) { + if (head < 58) { + if (head >= 33) { + if (head >= 48) { + if (current) { + var match$4 = current[0]; + if (typeof match$4 === "number") { + return List.rev(tokens); + } else if (match$4.tag) { + return List.rev(tokens); + } else { + _current = /* Some */[/* Number */Block.__(0, [match$4[0] + $$String.make(1, head)])]; + _input = tail; + continue ; + + } + } else { + _current = /* Some */[/* Number */Block.__(0, [$$String.make(1, head)])]; + _input = tail; + continue ; + + } + } else { + return List.rev(tokens); + } + } else { + exit = 1; + } + } else if (head > 122 || head < 97) { + return List.rev(tokens); + } else if (current) { + var match$5 = current[0]; + if (typeof match$5 === "number") { + return List.rev(tokens); + } else if (match$5.tag === 2) { + _current = /* Some */[/* Name */Block.__(2, [match$5[0] + $$String.make(1, head)])]; + _input = tail; + continue ; + + } else { + return List.rev(tokens); + } + } else { + _current = /* Some */[/* Name */Block.__(2, [$$String.make(1, head)])]; + _input = tail; + continue ; + + } + } else if (head >= 11) { + if (head !== 13) { + return List.rev(tokens); + } else { + exit = 1; + } + } else if (head >= 9) { + exit = 1; + } else { + return List.rev(tokens); + } + } + if (exit === 1) { + if (current) { + return List.rev(tokens); + } else { + _current = /* None */0; + _input = tail; + continue ; + + } + } + + } else { + return List.rev(tokens); + } + }; +} + +console.log(Js_primitive.undefined_to_opt(JSON.stringify(tokenizer("(add 2 (subtract 4 2))")))); + +var machine = /* record */[ + /* current : None */0, + /* parsed : [] */0 +]; + +exports.machine = machine; +exports.explode = explode; +exports.tokenizer = tokenizer; +/* Not a pure module */ diff --git a/src/compiler.re b/src/compiler.re new file mode 100755 index 0000000..c1a0ddc --- /dev/null +++ b/src/compiler.re @@ -0,0 +1,59 @@ +type token = + | OpenParen + | CloseParen + | Number string + | String string + | Name string; + +type tokenMachine = { + current: option token, + parsed: list token +}; + +let machine = {current: None, parsed: []}; + +let explode s => { + let rec exp i l => + if (i < 0) { + l + } else { + exp (i - 1) [s.[i], ...l] + }; + exp (String.length s - 1) [] +}; + +let tokenizer input => { + let rec tok input current tokens => + switch input { + | [] => List.rev tokens + | _ => + let head = List.hd input; + let tail = List.tl input; + let next = tok tail; + switch (head, current, tokens) { + /* State: None */ + | ('(', None, t) => next None [OpenParen, ...t] + | (')', None, t) => next None [CloseParen, ...t] + | (' ' | '\t' | '\r' | '\n', None, t) => next None t + | ('"', None, t) => next (Some (String "")) t + | ('0'..'9' as i, None, t) => next (Some (Number (String.make 1 i))) t + | ('a'..'z' as i, None, t) => next (Some (Name (String.make 1 i))) t + /* State: String */ + | ('"', Some (String c), t) => next None [String c, ...t] /* TODO allow escaped double quotes :) */ + | (i, Some (String c), t) => next (Some (String (c ^ String.make 1 i))) t + /* State: Number */ + | ('0'..'9' as i, Some (Number c), t) => next (Some (Number (c ^ String.make 1 i))) t + | (')', Some (Number c), t) => next None [CloseParen, Number c, ...t] + | (' ', Some (Number c), t) => next None [Number c, ...t] + /* State: Name */ + | ('a'..'z' as i, Some (Name c), t) => next (Some (Name (c ^ String.make 1 i))) t + | (')', Some (Name c), t) => next None [CloseParen, Name c, ...t] + | (' ', Some (Name c), t) => next None [Name c, ...t] + /* Errors */ + | (_, _, t) => List.rev t /* TODO: handle errors */ + } + }; + tok (explode input) machine.current machine.parsed +}; + +Js.log @@ Js.Json.stringifyAny (tokenizer "(add 2 (subtract 4 2))"); \ No newline at end of file diff --git a/src/the-super-tiny-compiler.re b/src/the-super-tiny-compiler.re deleted file mode 100755 index c57f1ac..0000000 --- a/src/the-super-tiny-compiler.re +++ /dev/null @@ -1,1038 +0,0 @@ -'use strict'; - -/** - * TTTTTTTTTTTTTTTTTTTTTTTHHHHHHHHH HHHHHHHHHEEEEEEEEEEEEEEEEEEEEEE - * T:::::::::::::::::::::TH:::::::H H:::::::HE::::::::::::::::::::E - * T:::::::::::::::::::::TH:::::::H H:::::::HE::::::::::::::::::::E - * T:::::TT:::::::TT:::::THH::::::H H::::::HHEE::::::EEEEEEEEE::::E - * TTTTTT T:::::T TTTTTT H:::::H H:::::H E:::::E EEEEEE - * T:::::T H:::::H H:::::H E:::::E - * T:::::T H::::::HHHHH::::::H E::::::EEEEEEEEEE - * T:::::T H:::::::::::::::::H E:::::::::::::::E - * T:::::T H:::::::::::::::::H E:::::::::::::::E - * T:::::T H::::::HHHHH::::::H E::::::EEEEEEEEEE - * T:::::T H:::::H H:::::H E:::::E - * T:::::T H:::::H H:::::H E:::::E EEEEEE - * TT:::::::TT HH::::::H H::::::HHEE::::::EEEEEEEE:::::E - * T:::::::::T H:::::::H H:::::::HE::::::::::::::::::::E - * T:::::::::T H:::::::H H:::::::HE::::::::::::::::::::E - * TTTTTTTTTTT HHHHHHHHH HHHHHHHHHEEEEEEEEEEEEEEEEEEEEEE - * - * SSSSSSSSSSSSSSS UUUUUUUU UUUUUUUUPPPPPPPPPPPPPPPPP EEEEEEEEEEEEEEEEEEEEEERRRRRRRRRRRRRRRRR - * SS:::::::::::::::SU::::::U U::::::UP::::::::::::::::P E::::::::::::::::::::ER::::::::::::::::R - * S:::::SSSSSS::::::SU::::::U U::::::UP::::::PPPPPP:::::P E::::::::::::::::::::ER::::::RRRRRR:::::R - * S:::::S SSSSSSSUU:::::U U:::::UUPP:::::P P:::::PEE::::::EEEEEEEEE::::ERR:::::R R:::::R - * S:::::S U:::::U U:::::U P::::P P:::::P E:::::E EEEEEE R::::R R:::::R - * S:::::S U:::::U U:::::U P::::P P:::::P E:::::E R::::R R:::::R - * S::::SSSS U:::::U U:::::U P::::PPPPPP:::::P E::::::EEEEEEEEEE R::::RRRRRR:::::R - * SS::::::SSSSS U:::::U U:::::U P:::::::::::::PP E:::::::::::::::E R:::::::::::::RR - * SSS::::::::SS U:::::U U:::::U P::::PPPPPPPPP E:::::::::::::::E R::::RRRRRR:::::R - * SSSSSS::::S U:::::U U:::::U P::::P E::::::EEEEEEEEEE R::::R R:::::R - * S:::::S U:::::U U:::::U P::::P E:::::E R::::R R:::::R - * S:::::S U::::::U U::::::U P::::P E:::::E EEEEEE R::::R R:::::R - * SSSSSSS S:::::S U:::::::UUU:::::::U PP::::::PP EE::::::EEEEEEEE:::::ERR:::::R R:::::R - * S::::::SSSSSS:::::S UU:::::::::::::UU P::::::::P E::::::::::::::::::::ER::::::R R:::::R - * S:::::::::::::::SS UU:::::::::UU P::::::::P E::::::::::::::::::::ER::::::R R:::::R - * SSSSSSSSSSSSSSS UUUUUUUUU PPPPPPPPPP EEEEEEEEEEEEEEEEEEEEEERRRRRRRR RRRRRRR - * - * TTTTTTTTTTTTTTTTTTTTTTTIIIIIIIIIINNNNNNNN NNNNNNNNYYYYYYY YYYYYYY - * T:::::::::::::::::::::TI::::::::IN:::::::N N::::::NY:::::Y Y:::::Y - * T:::::::::::::::::::::TI::::::::IN::::::::N N::::::NY:::::Y Y:::::Y - * T:::::TT:::::::TT:::::TII::::::IIN:::::::::N N::::::NY::::::Y Y::::::Y - * TTTTTT T:::::T TTTTTT I::::I N::::::::::N N::::::NYYY:::::Y Y:::::YYY - * T:::::T I::::I N:::::::::::N N::::::N Y:::::Y Y:::::Y - * T:::::T I::::I N:::::::N::::N N::::::N Y:::::Y:::::Y - * T:::::T I::::I N::::::N N::::N N::::::N Y:::::::::Y - * T:::::T I::::I N::::::N N::::N:::::::N Y:::::::Y - * T:::::T I::::I N::::::N N:::::::::::N Y:::::Y - * T:::::T I::::I N::::::N N::::::::::N Y:::::Y - * T:::::T I::::I N::::::N N:::::::::N Y:::::Y - * TT:::::::TT II::::::IIN::::::N N::::::::N Y:::::Y - * T:::::::::T I::::::::IN::::::N N:::::::N YYYY:::::YYYY - * T:::::::::T I::::::::IN::::::N N::::::N Y:::::::::::Y - * TTTTTTTTTTT IIIIIIIIIINNNNNNNN NNNNNNN YYYYYYYYYYYYY - * - * CCCCCCCCCCCCC OOOOOOOOO MMMMMMMM MMMMMMMMPPPPPPPPPPPPPPPPP IIIIIIIIIILLLLLLLLLLL EEEEEEEEEEEEEEEEEEEEEERRRRRRRRRRRRRRRRR - * CCC::::::::::::C OO:::::::::OO M:::::::M M:::::::MP::::::::::::::::P I::::::::IL:::::::::L E::::::::::::::::::::ER::::::::::::::::R - * CC:::::::::::::::C OO:::::::::::::OO M::::::::M M::::::::MP::::::PPPPPP:::::P I::::::::IL:::::::::L E::::::::::::::::::::ER::::::RRRRRR:::::R - * C:::::CCCCCCCC::::CO:::::::OOO:::::::OM:::::::::M M:::::::::MPP:::::P P:::::PII::::::IILL:::::::LL EE::::::EEEEEEEEE::::ERR:::::R R:::::R - * C:::::C CCCCCCO::::::O O::::::OM::::::::::M M::::::::::M P::::P P:::::P I::::I L:::::L E:::::E EEEEEE R::::R R:::::R - * C:::::C O:::::O O:::::OM:::::::::::M M:::::::::::M P::::P P:::::P I::::I L:::::L E:::::E R::::R R:::::R - * C:::::C O:::::O O:::::OM:::::::M::::M M::::M:::::::M P::::PPPPPP:::::P I::::I L:::::L E::::::EEEEEEEEEE R::::RRRRRR:::::R - * C:::::C O:::::O O:::::OM::::::M M::::M M::::M M::::::M P:::::::::::::PP I::::I L:::::L E:::::::::::::::E R:::::::::::::RR - * C:::::C O:::::O O:::::OM::::::M M::::M::::M M::::::M P::::PPPPPPPPP I::::I L:::::L E:::::::::::::::E R::::RRRRRR:::::R - * C:::::C O:::::O O:::::OM::::::M M:::::::M M::::::M P::::P I::::I L:::::L E::::::EEEEEEEEEE R::::R R:::::R - * C:::::C O:::::O O:::::OM::::::M M:::::M M::::::M P::::P I::::I L:::::L E:::::E R::::R R:::::R - * C:::::C CCCCCCO::::::O O::::::OM::::::M MMMMM M::::::M P::::P I::::I L:::::L LLLLLL E:::::E EEEEEE R::::R R:::::R - * C:::::CCCCCCCC::::CO:::::::OOO:::::::OM::::::M M::::::MPP::::::PP II::::::IILL:::::::LLLLLLLLL:::::LEE::::::EEEEEEEE:::::ERR:::::R R:::::R - * CC:::::::::::::::C OO:::::::::::::OO M::::::M M::::::MP::::::::P I::::::::IL::::::::::::::::::::::LE::::::::::::::::::::ER::::::R R:::::R - * CCC::::::::::::C OO:::::::::OO M::::::M M::::::MP::::::::P I::::::::IL::::::::::::::::::::::LE::::::::::::::::::::ER::::::R R:::::R - * CCCCCCCCCCCCC OOOOOOOOO MMMMMMMM MMMMMMMMPPPPPPPPPP IIIIIIIIIILLLLLLLLLLLLLLLLLLLLLLLLEEEEEEEEEEEEEEEEEEEEEERRRRRRRR RRRRRRR - * - * ======================================================================================================================================================================= - * ======================================================================================================================================================================= - * ======================================================================================================================================================================= - * ======================================================================================================================================================================= - */ - -/** - * Today we're going to write a compiler together. But not just any compiler... A - * super duper teeny tiny compiler! A compiler that is so small that if you - * remove all the comments this file would only be ~200 lines of actual code. - * - * We're going to compile some lisp-like function calls into some C-like - * function calls. - * - * If you are not familiar with one or the other. I'll just give you a quick intro. - * - * If we had two functions `add` and `subtract` they would be written like this: - * - * LISP C - * - * 2 + 2 (add 2 2) add(2, 2) - * 4 - 2 (subtract 4 2) subtract(4, 2) - * 2 + (4 - 2) (add 2 (subtract 4 2)) add(2, subtract(4, 2)) - * - * Easy peezy right? - * - * Well good, because this is exactly what we are going to compile. While this - * is neither a complete LISP or C syntax, it will be enough of the syntax to - * demonstrate many of the major pieces of a modern compiler. - */ - -/** - * Most compilers break down into three primary stages: Parsing, Transformation, - * and Code Generation - * - * 1. *Parsing* is taking raw code and turning it into a more abstract - * representation of the code. - * - * 2. *Transformation* takes this abstract representation and manipulates to do - * whatever the compiler wants it to. - * - * 3. *Code Generation* takes the transformed representation of the code and - * turns it into new code. - */ - -/** - * Parsing - * ------- - * - * Parsing typically gets broken down into two phases: Lexical Analysis and - * Syntactic Analysis. - * - * 1. *Lexical Analysis* takes the raw code and splits it apart into these things - * called tokens by a thing called a tokenizer (or lexer). - * - * Tokens are an array of tiny little objects that describe an isolated piece - * of the syntax. They could be numbers, labels, punctuation, operators, - * whatever. - * - * 2. *Syntactic Analysis* takes the tokens and reformats them into a - * representation that describes each part of the syntax and their relation - * to one another. This is known as an intermediate representation or - * Abstract Syntax Tree. - * - * An Abstract Syntax Tree, or AST for short, is a deeply nested object that - * represents code in a way that is both easy to work with and tells us a lot - * of information. - * - * For the following syntax: - * - * (add 2 (subtract 4 2)) - * - * Tokens might look something like this: - * - * [ - * { type: 'paren', value: '(' }, - * { type: 'name', value: 'add' }, - * { type: 'number', value: '2' }, - * { type: 'paren', value: '(' }, - * { type: 'name', value: 'subtract' }, - * { type: 'number', value: '4' }, - * { type: 'number', value: '2' }, - * { type: 'paren', value: ')' }, - * { type: 'paren', value: ')' }, - * ] - * - * And an Abstract Syntax Tree (AST) might look like this: - * - * { - * type: 'Program', - * body: [{ - * type: 'CallExpression', - * name: 'add', - * params: [{ - * type: 'NumberLiteral', - * value: '2', - * }, { - * type: 'CallExpression', - * name: 'subtract', - * params: [{ - * type: 'NumberLiteral', - * value: '4', - * }, { - * type: 'NumberLiteral', - * value: '2', - * }] - * }] - * }] - * } - */ - -/** - * Transformation - * -------------- - * - * The next type of stage for a compiler is transformation. Again, this just - * takes the AST from the last step and makes changes to it. It can manipulate - * the AST in the same language or it can translate it into an entirely new - * language. - * - * Let’s look at how we would transform an AST. - * - * You might notice that our AST has elements within it that look very similar. - * There are these objects with a type property. Each of these are known as an - * AST Node. These nodes have defined properties on them that describe one - * isolated part of the tree. - * - * We can have a node for a "NumberLiteral": - * - * { - * type: 'NumberLiteral', - * value: '2', - * } - * - * Or maybe a node for a "CallExpression": - * - * { - * type: 'CallExpression', - * name: 'subtract', - * params: [...nested nodes go here...], - * } - * - * When transforming the AST we can manipulate nodes by - * adding/removing/replacing properties, we can add new nodes, remove nodes, or - * we could leave the existing AST alone and create an entirely new one based - * on it. - * - * Since we’re targeting a new language, we’re going to focus on creating an - * entirely new AST that is specific to the target language. - * - * Traversal - * --------- - * - * In order to navigate through all of these nodes, we need to be able to - * traverse through them. This traversal process goes to each node in the AST - * depth-first. - * - * { - * type: 'Program', - * body: [{ - * type: 'CallExpression', - * name: 'add', - * params: [{ - * type: 'NumberLiteral', - * value: '2' - * }, { - * type: 'CallExpression', - * name: 'subtract', - * params: [{ - * type: 'NumberLiteral', - * value: '4' - * }, { - * type: 'NumberLiteral', - * value: '2' - * }] - * }] - * }] - * } - * - * So for the above AST we would go: - * - * 1. Program - Starting at the top level of the AST - * 2. CallExpression (add) - Moving to the first element of the Program's body - * 3. NumberLiteral (2) - Moving to the first element of CallExpression's params - * 4. CallExpression (subtract) - Moving to the second element of CallExpression's params - * 5. NumberLiteral (4) - Moving to the first element of CallExpression's params - * 6. NumberLiteral (2) - Moving to the second element of CallExpression's params - * - * If we were manipulating this AST directly, instead of creating a separate AST, - * we would likely introduce all sorts of abstractions here. But just visiting - * each node in the tree is enough for what we're trying to do. - * - * The reason I use the word "visiting" is because there is this pattern of how - * to represent operations on elements of an object structure. - * - * Visitors - * -------- - * - * The basic idea here is that we are going to create a “visitor” object that - * has methods that will accept different node types. - * - * var visitor = { - * NumberLiteral() {}, - * CallExpression() {}, - * }; - * - * When we traverse our AST, we will call the methods on this visitor whenever we - * "enter" a node of a matching type. - * - * In order to make this useful we will also pass the node and a reference to - * the parent node. - * - * var visitor = { - * NumberLiteral(node, parent) {}, - * CallExpression(node, parent) {}, - * }; - * - * However, there also exists the possibility of calling things on "exit". Imagine - * our tree structure from before in list form: - * - * - Program - * - CallExpression - * - NumberLiteral - * - CallExpression - * - NumberLiteral - * - NumberLiteral - * - * As we traverse down, we're going to reach branches with dead ends. As we - * finish each branch of the tree we "exit" it. So going down the tree we - * "enter" each node, and going back up we "exit". - * - * -> Program (enter) - * -> CallExpression (enter) - * -> Number Literal (enter) - * <- Number Literal (exit) - * -> Call Expression (enter) - * -> Number Literal (enter) - * <- Number Literal (exit) - * -> Number Literal (enter) - * <- Number Literal (exit) - * <- CallExpression (exit) - * <- CallExpression (exit) - * <- Program (exit) - * - * In order to support that, the final form of our visitor will look like this: - * - * var visitor = { - * NumberLiteral: { - * enter(node, parent) {}, - * exit(node, parent) {}, - * } - * }; - */ - -/** - * Code Generation - * --------------- - * - * The final phase of a compiler is code generation. Sometimes compilers will do - * things that overlap with transformation, but for the most part code - * generation just means take our AST and string-ify code back out. - * - * Code generators work several different ways, some compilers will reuse the - * tokens from earlier, others will have created a separate representation of - * the code so that they can print node linearly, but from what I can tell most - * will use the same AST we just created, which is what we’re going to focus on. - * - * Effectively our code generator will know how to “print” all of the different - * node types of the AST, and it will recursively call itself to print nested - * nodes until everything is printed into one long string of code. - */ - -/** - * And that's it! That's all the different pieces of a compiler. - * - * Now that isn’t to say every compiler looks exactly like I described here. - * Compilers serve many different purposes, and they might need more steps than - * I have detailed. - * - * But now you should have a general high-level idea of what most compilers look - * like. - * - * Now that I’ve explained all of this, you’re all good to go write your own - * compilers right? - * - * Just kidding, that's what I'm here to help with :P - * - * So let's begin... - */ - -/** - * ============================================================================ - * (/^▽^)/ - * THE TOKENIZER! - * ============================================================================ - */ - -/** - * We're gonna start off with our first phase of parsing, lexical analysis, with - * the tokenizer. - * - * We're just going to take our string of code and break it down into an array - * of tokens. - * - * (add 2 (subtract 4 2)) => [{ type: 'paren', value: '(' }, ...] - */ - -// We start by accepting an input string of code, and we're gonna set up two -// things... -function tokenizer(input) { - - // A `current` variable for tracking our position in the code like a cursor. - let current = 0; - - // And a `tokens` array for pushing our tokens to. - let tokens = []; - - // We start by creating a `while` loop where we are setting up our `current` - // variable to be incremented as much as we want `inside` the loop. - // - // We do this because we may want to increment `current` many times within a - // single loop because our tokens can be any length. - while (current < input.length) { - - // We're also going to store the `current` character in the `input`. - let char = input[current]; - - // The first thing we want to check for is an open parenthesis. This will - // later be used for `CallExpression` but for now we only care about the - // character. - // - // We check to see if we have an open parenthesis: - if (char === '(') { - - // If we do, we push a new token with the type `paren` and set the value - // to an open parenthesis. - tokens.push({ - type: 'paren', - value: '(', - }); - - // Then we increment `current` - current++; - - // And we `continue` onto the next cycle of the loop. - continue; - } - - // Next we're going to check for a closing parenthesis. We do the same exact - // thing as before: Check for a closing parenthesis, add a new token, - // increment `current`, and `continue`. - if (char === ')') { - tokens.push({ - type: 'paren', - value: ')', - }); - current++; - continue; - } - - // Moving on, we're now going to check for whitespace. This is interesting - // because we care that whitespace exists to separate characters, but it - // isn't actually important for us to store as a token. We would only throw - // it out later. - // - // So here we're just going to test for existence and if it does exist we're - // going to just `continue` on. - let WHITESPACE = /\s/; - if (WHITESPACE.test(char)) { - current++; - continue; - } - - // The next type of token is a number. This is different than what we have - // seen before because a number could be any number of characters and we - // want to capture the entire sequence of characters as one token. - // - // (add 123 456) - // ^^^ ^^^ - // Only two separate tokens - // - // So we start this off when we encounter the first number in a sequence. - let NUMBERS = /[0-9]/; - if (NUMBERS.test(char)) { - - // We're going to create a `value` string that we are going to push - // characters to. - let value = ''; - - // Then we're going to loop through each character in the sequence until - // we encounter a character that is not a number, pushing each character - // that is a number to our `value` and incrementing `current` as we go. - while (NUMBERS.test(char)) { - value += char; - char = input[++current]; - } - - // After that we push our `number` token to the `tokens` array. - tokens.push({ type: 'number', value }); - - // And we continue on. - continue; - } - - // We'll also add support for strings in our language which will be any - // text surrounded by double quotes ("). - // - // (concat "foo" "bar") - // ^^^ ^^^ string tokens - // - // We'll start by checking for the opening quote: - if (char === '"') { - // Keep a `value` variable for building up our string token. - let value = ''; - - // We'll skip the opening double quote in our token. - char = input[++current]; - - // Then we'll iterate through each character until we reach another - // double quote. - while (char !== '"') { - value += char; - char = input[++current]; - } - - // Skip the closing double quote. - char = input[++current]; - - // And add our `string` token to the `tokens` array. - tokens.push({ type: 'string', value }); - - continue; - } - - // The last type of token will be a `name` token. This is a sequence of - // letters instead of numbers, that are the names of functions in our lisp - // syntax. - // - // (add 2 4) - // ^^^ - // Name token - // - let LETTERS = /[a-z]/i; - if (LETTERS.test(char)) { - let value = ''; - - // Again we're just going to loop through all the letters pushing them to - // a value. - while (LETTERS.test(char)) { - value += char; - char = input[++current]; - } - - // And pushing that value as a token with the type `name` and continuing. - tokens.push({ type: 'name', value }); - - continue; - } - - // Finally if we have not matched a character by now, we're going to throw - // an error and completely exit. - throw new TypeError('I dont know what this character is: ' + char); - } - - // Then at the end of our `tokenizer` we simply return the tokens array. - return tokens; -} - -/** - * ============================================================================ - * ヽ/❀o ل͜ o\ノ - * THE PARSER!!! - * ============================================================================ - */ - -/** - * For our parser we're going to take our array of tokens and turn it into an - * AST. - * - * [{ type: 'paren', value: '(' }, ...] => { type: 'Program', body: [...] } - */ - -// Okay, so we define a `parser` function that accepts our array of `tokens`. -function parser(tokens) { - - // Again we keep a `current` variable that we will use as a cursor. - let current = 0; - - // But this time we're going to use recursion instead of a `while` loop. So we - // define a `walk` function. - function walk() { - - // Inside the walk function we start by grabbing the `current` token. - let token = tokens[current]; - - // We're going to split each type of token off into a different code path, - // starting off with `number` tokens. - // - // We test to see if we have a `number` token. - if (token.type === 'number') { - - // If we have one, we'll increment `current`. - current++; - - // And we'll return a new AST node called `NumberLiteral` and setting its - // value to the value of our token. - return { - type: 'NumberLiteral', - value: token.value, - }; - } - - // If we have a string we will do the same as number and create a - // `StringLiteral` node. - if (token.type === 'string') { - current++; - - return { - type: 'StringLiteral', - value: token.value, - }; - } - - // Next we're going to look for CallExpressions. We start this off when we - // encounter an open parenthesis. - if ( - token.type === 'paren' && - token.value === '(' - ) { - - // We'll increment `current` to skip the parenthesis since we don't care - // about it in our AST. - token = tokens[++current]; - - // We create a base node with the type `CallExpression`, and we're going - // to set the name as the current token's value since the next token after - // the open parenthesis is the name of the function. - let node = { - type: 'CallExpression', - name: token.value, - params: [], - }; - - // We increment `current` *again* to skip the name token. - token = tokens[++current]; - - // And now we want to loop through each token that will be the `params` of - // our `CallExpression` until we encounter a closing parenthesis. - // - // Now this is where recursion comes in. Instead of trying to parse a - // potentially infinitely nested set of nodes we're going to rely on - // recursion to resolve things. - // - // To explain this, let's take our Lisp code. You can see that the - // parameters of the `add` are a number and a nested `CallExpression` that - // includes its own numbers. - // - // (add 2 (subtract 4 2)) - // - // You'll also notice that in our tokens array we have multiple closing - // parenthesis. - // - // [ - // { type: 'paren', value: '(' }, - // { type: 'name', value: 'add' }, - // { type: 'number', value: '2' }, - // { type: 'paren', value: '(' }, - // { type: 'name', value: 'subtract' }, - // { type: 'number', value: '4' }, - // { type: 'number', value: '2' }, - // { type: 'paren', value: ')' }, <<< Closing parenthesis - // { type: 'paren', value: ')' }, <<< Closing parenthesis - // ] - // - // We're going to rely on the nested `walk` function to increment our - // `current` variable past any nested `CallExpression`. - - // So we create a `while` loop that will continue until it encounters a - // token with a `type` of `'paren'` and a `value` of a closing - // parenthesis. - while ( - (token.type !== 'paren') || - (token.type === 'paren' && token.value !== ')') - ) { - // we'll call the `walk` function which will return a `node` and we'll - // push it into our `node.params`. - node.params.push(walk()); - token = tokens[current]; - } - - // Finally we will increment `current` one last time to skip the closing - // parenthesis. - current++; - - // And return the node. - return node; - } - - // Again, if we haven't recognized the token type by now we're going to - // throw an error. - throw new TypeError(token.type); - } - - // Now, we're going to create our AST which will have a root which is a - // `Program` node. - let ast = { - type: 'Program', - body: [], - }; - - // And we're going to kickstart our `walk` function, pushing nodes to our - // `ast.body` array. - // - // The reason we are doing this inside a loop is because our program can have - // `CallExpression` after one another instead of being nested. - // - // (add 2 2) - // (subtract 4 2) - // - while (current < tokens.length) { - ast.body.push(walk()); - } - - // At the end of our parser we'll return the AST. - return ast; -} - -/** - * ============================================================================ - * ⌒(❀>◞౪◟<❀)⌒ - * THE TRAVERSER!!! - * ============================================================================ - */ - -/** - * So now we have our AST, and we want to be able to visit different nodes with - * a visitor. We need to be able to call the methods on the visitor whenever we - * encounter a node with a matching type. - * - * traverse(ast, { - * Program(node, parent) { - * // ... - * }, - * - * CallExpression(node, parent) { - * // ... - * }, - * - * NumberLiteral(node, parent) { - * // ... - * }, - * }); - */ - -// So we define a traverser function which accepts an AST and a -// visitor. Inside we're going to define two functions... -function traverser(ast, visitor) { - - // A `traverseArray` function that will allow us to iterate over an array and - // call the next function that we will define: `traverseNode`. - function traverseArray(array, parent) { - array.forEach(child => { - traverseNode(child, parent); - }); - } - - // `traverseNode` will accept a `node` and its `parent` node. So that it can - // pass both to our visitor methods. - function traverseNode(node, parent) { - - // We start by testing for the existence of a method on the visitor with a - // matching `type`. - let methods = visitor[node.type]; - - // If there is an `enter` method for this node type we'll call it with the - // `node` and its `parent`. - if (methods && methods.enter) { - methods.enter(node, parent); - } - - // Next we are going to split things up by the current node type. - switch (node.type) { - - // We'll start with our top level `Program`. Since Program nodes have a - // property named body that has an array of nodes, we will call - // `traverseArray` to traverse down into them. - // - // (Remember that `traverseArray` will in turn call `traverseNode` so we - // are causing the tree to be traversed recursively) - case 'Program': - traverseArray(node.body, node); - break; - - // Next we do the same with `CallExpression` and traverse their `params`. - case 'CallExpression': - traverseArray(node.params, node); - break; - - // In the cases of `NumberLiteral` and `StringLiteral` we don't have any - // child nodes to visit, so we'll just break. - case 'NumberLiteral': - case 'StringLiteral': - break; - - // And again, if we haven't recognized the node type then we'll throw an - // error. - default: - throw new TypeError(node.type); - } - - // If there is an `exit` method for this node type we'll call it with the - // `node` and its `parent`. - if (methods && methods.exit) { - methods.exit(node, parent); - } - } - - // Finally we kickstart the traverser by calling `traverseNode` with our ast - // with no `parent` because the top level of the AST doesn't have a parent. - traverseNode(ast, null); -} - -/** - * ============================================================================ - * ⁽(◍˃̵͈̑ᴗ˂̵͈̑)⁽ - * THE TRANSFORMER!!! - * ============================================================================ - */ - -/** - * Next up, the transformer. Our transformer is going to take the AST that we - * have built and pass it to our traverser function with a visitor and will - * create a new ast. - * - * ---------------------------------------------------------------------------- - * Original AST | Transformed AST - * ---------------------------------------------------------------------------- - * { | { - * type: 'Program', | type: 'Program', - * body: [{ | body: [{ - * type: 'CallExpression', | type: 'ExpressionStatement', - * name: 'add', | expression: { - * params: [{ | type: 'CallExpression', - * type: 'NumberLiteral', | callee: { - * value: '2' | type: 'Identifier', - * }, { | name: 'add' - * type: 'CallExpression', | }, - * name: 'subtract', | arguments: [{ - * params: [{ | type: 'NumberLiteral', - * type: 'NumberLiteral', | value: '2' - * value: '4' | }, { - * }, { | type: 'CallExpression', - * type: 'NumberLiteral', | callee: { - * value: '2' | type: 'Identifier', - * }] | name: 'subtract' - * }] | }, - * }] | arguments: [{ - * } | type: 'NumberLiteral', - * | value: '4' - * ---------------------------------- | }, { - * | type: 'NumberLiteral', - * | value: '2' - * | }] - * (sorry the other one is longer.) | } - * | } - * | }] - * | } - * ---------------------------------------------------------------------------- - */ - -// So we have our transformer function which will accept the lisp ast. -function transformer(ast) { - - // We'll create a `newAst` which like our previous AST will have a program - // node. - let newAst = { - type: 'Program', - body: [], - }; - - // Next I'm going to cheat a little and create a bit of a hack. We're going to - // use a property named `context` on our parent nodes that we're going to push - // nodes to their parent's `context`. Normally you would have a better - // abstraction than this, but for our purposes this keeps things simple. - // - // Just take note that the context is a reference *from* the old ast *to* the - // new ast. - ast._context = newAst.body; - - // We'll start by calling the traverser function with our ast and a visitor. - traverser(ast, { - - // The first visitor method accepts any `NumberLiteral` - NumberLiteral: { - // We'll visit them on enter. - enter(node, parent) { - // We'll create a new node also named `NumberLiteral` that we will push to - // the parent context. - parent._context.push({ - type: 'NumberLiteral', - value: node.value, - }); - }, - }, - - // Next we have `StringLiteral` - StringLiteral: { - enter(node, parent) { - parent._context.push({ - type: 'StringLiteral', - value: node.value, - }); - }, - }, - - // Next up, `CallExpression`. - CallExpression: { - enter(node, parent) { - - // We start creating a new node `CallExpression` with a nested - // `Identifier`. - let expression = { - type: 'CallExpression', - callee: { - type: 'Identifier', - name: node.name, - }, - arguments: [], - }; - - // Next we're going to define a new context on the original - // `CallExpression` node that will reference the `expression`'s arguments - // so that we can push arguments. - node._context = expression.arguments; - - // Then we're going to check if the parent node is a `CallExpression`. - // If it is not... - if (parent.type !== 'CallExpression') { - - // We're going to wrap our `CallExpression` node with an - // `ExpressionStatement`. We do this because the top level - // `CallExpression` in JavaScript are actually statements. - expression = { - type: 'ExpressionStatement', - expression: expression, - }; - } - - // Last, we push our (possibly wrapped) `CallExpression` to the `parent`'s - // `context`. - parent._context.push(expression); - }, - } - }); - - // At the end of our transformer function we'll return the new ast that we - // just created. - return newAst; -} - -/** - * ============================================================================ - * ヾ(〃^∇^)ノ♪ - * THE CODE GENERATOR!!!! - * ============================================================================ - */ - -/** - * Now let's move onto our last phase: The Code Generator. - * - * Our code generator is going to recursively call itself to print each node in - * the tree into one giant string. - */ - -function codeGenerator(node) { - - // We'll break things down by the `type` of the `node`. - switch (node.type) { - - // If we have a `Program` node. We will map through each node in the `body` - // and run them through the code generator and join them with a newline. - case 'Program': - return node.body.map(codeGenerator) - .join('\n'); - - // For `ExpressionStatement` we'll call the code generator on the nested - // expression and we'll add a semicolon... - case 'ExpressionStatement': - return ( - codeGenerator(node.expression) + - ';' // << (...because we like to code the *correct* way) - ); - - // For `CallExpression` we will print the `callee`, add an open - // parenthesis, we'll map through each node in the `arguments` array and run - // them through the code generator, joining them with a comma, and then - // we'll add a closing parenthesis. - case 'CallExpression': - return ( - codeGenerator(node.callee) + - '(' + - node.arguments.map(codeGenerator) - .join(', ') + - ')' - ); - - // For `Identifier` we'll just return the `node`'s name. - case 'Identifier': - return node.name; - - // For `NumberLiteral` we'll just return the `node`'s value. - case 'NumberLiteral': - return node.value; - - // For `StringLiteral` we'll add quotations around the `node`'s value. - case 'StringLiteral': - return '"' + node.value + '"'; - - // And if we haven't recognized the node, we'll throw an error. - default: - throw new TypeError(node.type); - } -} - -/** - * ============================================================================ - * (۶* ‘ヮ’)۶” - * !!!!!!!!THE COMPILER!!!!!!!! - * ============================================================================ - */ - -/** - * FINALLY! We'll create our `compiler` function. Here we will link together - * every part of the pipeline. - * - * 1. input => tokenizer => tokens - * 2. tokens => parser => ast - * 3. ast => transformer => newAst - * 4. newAst => generator => output - */ - -function compiler(input) { - let tokens = tokenizer(input); - let ast = parser(tokens); - let newAst = transformer(ast); - let output = codeGenerator(newAst); - - // and simply return the output! - return output; -} - -/** - * ============================================================================ - * (๑˃̵ᴗ˂̵)و - * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!YOU MADE IT!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - * ============================================================================ - */ - -// Now I'm just exporting everything... -module.exports = { - tokenizer, - parser, - traverser, - transformer, - codeGenerator, - compiler, -}; From 81b5f5d8c0f5b01d63b10d56925fddce7a11200d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Cha=CC=81varri?= Date: Sat, 30 Sep 2017 20:07:30 +0200 Subject: [PATCH 3/3] Add debug printer --- lib/js/src/compiler.js | 41 ++++++++++++++++++++++++++++++++--------- src/compiler.re | 11 ++++++++++- 2 files changed, 42 insertions(+), 10 deletions(-) diff --git a/lib/js/src/compiler.js b/lib/js/src/compiler.js index 04727cf..19f3fc7 100644 --- a/lib/js/src/compiler.js +++ b/lib/js/src/compiler.js @@ -1,11 +1,10 @@ // Generated by BUCKLESCRIPT VERSION 1.9.3, PLEASE EDIT WITH CARE 'use strict'; -var List = require("bs-platform/lib/js/list.js"); -var Block = require("bs-platform/lib/js/block.js"); -var $$String = require("bs-platform/lib/js/string.js"); -var Caml_string = require("bs-platform/lib/js/caml_string.js"); -var Js_primitive = require("bs-platform/lib/js/js_primitive.js"); +var List = require("bs-platform/lib/js/list.js"); +var Block = require("bs-platform/lib/js/block.js"); +var $$String = require("bs-platform/lib/js/string.js"); +var Caml_string = require("bs-platform/lib/js/caml_string.js"); function explode(s) { var _i = s.length - 1 | 0; @@ -269,14 +268,38 @@ function tokenizer(input) { }; } -console.log(Js_primitive.undefined_to_opt(JSON.stringify(tokenizer("(add 2 (subtract 4 2))")))); +function printToken(token) { + if (typeof token === "number") { + if (token) { + return "CloseParen"; + } else { + return "OpenParen"; + } + } else { + switch (token.tag | 0) { + case 0 : + return "Number " + token[0]; + case 1 : + return "String " + token[0]; + case 2 : + return "Name " + token[0]; + + } + } +} + +List.iter((function (k) { + console.log(printToken(k)); + return /* () */0; + }), tokenizer("(add 2 (subtract 4 2))")); var machine = /* record */[ /* current : None */0, /* parsed : [] */0 ]; -exports.machine = machine; -exports.explode = explode; -exports.tokenizer = tokenizer; +exports.machine = machine; +exports.explode = explode; +exports.tokenizer = tokenizer; +exports.printToken = printToken; /* Not a pure module */ diff --git a/src/compiler.re b/src/compiler.re index c1a0ddc..f4eeef2 100755 --- a/src/compiler.re +++ b/src/compiler.re @@ -56,4 +56,13 @@ let tokenizer input => { tok (explode input) machine.current machine.parsed }; -Js.log @@ Js.Json.stringifyAny (tokenizer "(add 2 (subtract 4 2))"); \ No newline at end of file +let printToken token => + switch token { + | OpenParen => "OpenParen" + | CloseParen => "CloseParen" + | Number s => "Number " ^ s + | String s => "String " ^ s + | Name s => "Name " ^ s + }; + +List.iter (fun k => Js.log (printToken k)) (tokenizer "(add 2 (subtract 4 2))"); \ No newline at end of file