"""A super tiny compiler, ported to Python 3.

Compiles parenthesised prefix calls into C-style call statements::

    (add 2 (subtract 4 2))  ->  add(2, subtract(4, 2));

The pipeline is ``tokenize`` -> ``parse`` -> ``Transformer.transform`` ->
``CodeGenerator.generate_code``; ``compile_`` chains all four stages.

Provenance: reconstructed from the patch "super tiny compiler ported to
Python 3" (commit 5da9921, Brian Edwards, 2016-04-01), which added
``super_tiny_compiler_unannotated.py``, ``test.py`` and a ``.gitignore``
listing ``.cache`` and ``*.pyc``.  The checks from ``test.py`` are kept at
the bottom of this module as plain ``test_*`` functions so they can be run by
pointing pytest at this file.
"""

from copy import deepcopy
from itertools import chain
from string import ascii_letters, digits, whitespace

# Token types.
NAME = 'name'
NUMBER = 'number'
PAREN = 'paren'

# (token type, characters allowed in the token, converter for the raw text)
MULTICHAR_TYPES = [(NUMBER, digits, int),
                   (NAME, ascii_letters, lambda x: x)]


class BacktrackingGenerator():
    """Iterator over ``input_`` whose most recent item can be pushed back.

    The push-back protocol lives on ``self.generator``: calling
    ``send(<truthy>)`` right after receiving an item makes the next ``next()``
    deliver that same item again.
    """

    def __init__(self, input_):

        def generator():
            """Yield each item; use send(True-ish) to backtrack."""
            for char in input_:
                # A plain next() resumes ``yield char`` with None and moves
                # on.  A truthy send() means "give me that item again": the
                # bare yield below is the value send() returns, and looping
                # re-delivers the item.  Looping (rather than re-yielding
                # once) matters: a backtrack requested on the re-delivered
                # item is honoured instead of being silently dropped.
                while (yield char):
                    if (yield):
                        raise ValueError('Cannot backtrack twice in a row.')

        self.generator = generator()

    def __iter__(self):
        return self.generator

    def __next__(self):
        return next(self.generator)

    def take_while(self, predicate):
        """Lazily yield items while ``predicate(item)`` is true.

        The first item that fails the predicate is pushed back, so it is
        delivered again by the next read from this object.
        """
        for char in self.generator:
            if not predicate(char):
                self.generator.send(1)  # backtrack
                break
            yield char


def Token(type_, value):
    """Build a token dict with ``type`` and ``value`` keys."""
    return {'type': type_, 'value': value}


def tokenize(input_):
    """Lazily split ``input_`` into tokens.

    Whitespace is skipped, ``(`` and ``)`` become PAREN tokens, and runs of
    characters from one ``MULTICHAR_TYPES`` charset become a single token
    whose value is passed through that entry's converter.

    Raises:
        TypeError: on a character that starts no known token (raised when
            the generator reaches that character).
    """
    char_gen = BacktrackingGenerator(input_)

    def tokenize_multiple_chars(char):
        """Return the multi-character token that starts with ``char``."""
        for type_, charset, convert in MULTICHAR_TYPES:
            if char in charset:
                # The tail is consumed immediately by join(), so the lambda's
                # reference to the loop variable ``charset`` is safe.
                tail = char_gen.take_while(lambda c: c in charset)
                return Token(type_, convert(''.join(chain([char], tail))))
        raise TypeError('I dont know what this character is: ' + char)

    for char in char_gen:
        if char in whitespace:
            continue
        if char in '()':
            yield Token(PAREN, char)
        else:
            yield tokenize_multiple_chars(char)


def NumberLiteral(value):
    """Build a NumberLiteral node (same shape in both ASTs)."""
    return {'type': 'NumberLiteral', 'value': value}


def CallExpression(name, params):
    """Build a CallExpression node of the parser's (source) AST."""
    return {'type': 'CallExpression', 'name': name, 'params': params}


def parse(tokens):
    """Turn tokens into the source AST.

    Args:
        tokens: any iterable of token dicts (a list or a generator).

    Returns:
        ``{'type': 'Program', 'body': [...]}`` holding NumberLiteral and
        CallExpression nodes.

    Raises:
        TypeError: on an unexpected token (message is the token type), when
            the token after ``(`` is not a NAME, or when the input ends
            inside a call expression.
    """
    # One shared iterator: walk() consumes tokens from the same stream as the
    # top-level loop, which would not work if a list were iterated twice.
    tokens = iter(tokens)
    open_paren = Token(PAREN, '(')
    close_paren = Token(PAREN, ')')

    def walk(token):
        if token['type'] == NUMBER:
            return NumberLiteral(token['value'])
        if token == open_paren:
            callee = next(tokens, None)
            if callee is None:
                raise TypeError('Unexpected end of input after "("')
            if callee['type'] != NAME:
                raise TypeError(callee['type'])
            node = CallExpression(callee['value'], [])
            for param_token in tokens:
                if param_token == close_paren:
                    return node
                node['params'].append(walk(param_token))
            raise TypeError('Unexpected end of input: missing ")"')
        raise TypeError(token['type'])

    ast = {'type': 'Program', 'body': []}
    for token in tokens:
        ast['body'].append(walk(token))
    return ast


class Traverser():
    """Depth-first walk over the source AST that calls visitor hooks.

    For each node, the visitor attribute named after ``node['type']`` (if
    any) is called with ``(node, parent)`` before the node's children are
    visited.
    """

    def __init__(self, visitor):
        self.visitor = visitor

    def traverse(self, ast):
        """Walk ``ast`` starting at its root, which has no parent."""
        self.traverse_node(ast)

    def traverse_node(self, node, parent=None):
        """Call the visitor hook for ``node``, then descend into it."""
        method = getattr(self.visitor, node['type'], None)
        if method:
            method(node, parent)
        # Dispatch on node type; an unknown type raises AttributeError.
        getattr(self, node['type'])(node)

    def Program(self, node):
        for expression in node['body']:
            self.traverse_node(expression, node)

    def CallExpression(self, node):
        for param in node['params']:
            self.traverse_node(param, node)

    @staticmethod
    def NumberLiteral(node):
        # Leaf node: nothing to descend into.
        pass


def ExpressionStatement(expression):
    """Build an ExpressionStatement node of the target AST."""
    return {'type': 'ExpressionStatement', 'expression': expression}


def Identifier(name):
    """Build an Identifier node of the target AST."""
    return {'type': 'Identifier', 'name': name}


def NewCallExpression(callee, arguments):
    """Build a CallExpression node of the target AST."""
    return {'type': 'CallExpression', 'callee': callee, 'arguments': arguments}


class Transformer():
    """Visitor that builds the target AST from the source AST.

    While traversing, every source node that can have children carries a
    ``_context`` key pointing at the list in the target AST that its
    children must append to.
    """

    @classmethod
    def transform(cls, ast):
        """Return the target AST for ``ast`` without modifying ``ast``."""
        # The ``_context`` bookkeeping is written onto a private copy so the
        # caller's tree is left untouched.
        ast = deepcopy(ast)
        new_ast = {'type': 'Program', 'body': []}
        ast['_context'] = new_ast['body']
        Traverser(cls).traverse(ast)
        return new_ast

    @staticmethod
    def NumberLiteral(node, parent):
        parent['_context'].append(NumberLiteral(node['value']))

    @staticmethod
    def CallExpression(node, parent):
        expression = NewCallExpression(Identifier(node['name']), [])
        node['_context'] = expression['arguments']
        # Only calls that are not arguments of another call become
        # statements.
        if parent['type'] != 'CallExpression':
            expression = ExpressionStatement(expression)
        parent['_context'].append(expression)


class CodeGenerator():
    """Render a target AST as source text, dispatching on ``node['type']``."""

    @classmethod
    def generate_code(cls, node):
        """Return the text for ``node`` using the method named after its type."""
        return getattr(cls, node['type'])(node)

    @classmethod
    def Program(cls, node):
        return '\n'.join(map(cls.generate_code, node['body']))

    @classmethod
    def ExpressionStatement(cls, node):
        return cls.generate_code(node['expression']) + ';'

    @classmethod
    def CallExpression(cls, node):
        return '{}({})'.format(
            cls.generate_code(node['callee']),
            ', '.join(map(cls.generate_code, node['arguments'])))

    @staticmethod
    def Identifier(node):
        return node['name']

    @staticmethod
    def NumberLiteral(node):
        return str(node['value'])


def compile_(input_):
    """Compile source text by running all four stages in order."""
    tokens = tokenize(input_)
    ast = parse(tokens)
    new_ast = Transformer.transform(ast)
    return CodeGenerator.generate_code(new_ast)


# ---------------------------------------------------------------------------
# Tests (originally test.py).
# ---------------------------------------------------------------------------

TEST_CASES = [
    {
        'input': '(add 2 (subtract 4 2))',
        'output': 'add(2, subtract(4, 2));',
        'tokens': [Token(PAREN, '('),
                   Token(NAME, 'add'),
                   Token(NUMBER, 2),
                   Token(PAREN, '('),
                   Token(NAME, 'subtract'),
                   Token(NUMBER, 4),
                   Token(NUMBER, 2),
                   Token(PAREN, ')'),
                   Token(PAREN, ')')],
        'ast': {'type': 'Program',
                'body': [CallExpression('add',
                                        [NumberLiteral(2),
                                         CallExpression('subtract',
                                                        [NumberLiteral(4),
                                                         NumberLiteral(2)])])]
                },
        'new_ast': {
            'type': 'Program',
            'body': [
                ExpressionStatement(
                    NewCallExpression(
                        Identifier('add'),
                        [NumberLiteral(2),
                         NewCallExpression(
                             Identifier('subtract'),
                             [NumberLiteral(4),
                              NumberLiteral(2)])
                         ])
                )
            ]}}
]


def test_BacktrackingGenerator():
    g = BacktrackingGenerator('abcde')
    assert next(g) == 'a'
    assert ''.join(g.take_while(lambda c: c != 'e')) == 'bcd'
    assert next(g) == 'e'


def test_BacktrackingGenerator_backtracks_redelivered_item():
    g = BacktrackingGenerator('abc')
    assert list(g.take_while(lambda c: False)) == []
    assert list(g.take_while(lambda c: False)) == []
    assert list(g) == ['a', 'b', 'c']


def test_tokenize():
    for case in TEST_CASES:
        assert list(tokenize(case['input'])) == case['tokens']


def test_parse():
    for case in TEST_CASES:
        assert parse(iter(case['tokens'])) == case['ast']
        assert parse(case['tokens']) == case['ast']


def test_parse_rejects_malformed_input():
    for tokens in ([Token(PAREN, '(')],
                   [Token(PAREN, '('), Token(NAME, 'add'), Token(NUMBER, 1)],
                   [Token(PAREN, '('), Token(NUMBER, 1), Token(PAREN, ')')]):
        try:
            parse(tokens)
        except TypeError:
            pass
        else:
            raise AssertionError('parse accepted {!r}'.format(tokens))


def test_transform():
    for case in TEST_CASES:
        assert Transformer.transform(case['ast']) == case['new_ast']


def test_transform_leaves_input_untouched():
    for case in TEST_CASES:
        ast = parse(case['tokens'])
        Transformer.transform(ast)
        assert ast == parse(case['tokens'])


def test_generate_code():
    for case in TEST_CASES:
        assert CodeGenerator.generate_code(case['new_ast']) == case['output']


def test_compile():
    for case in TEST_CASES:
        assert compile_(case['input']) == case['output']