mirror of
https://github.com/jamiebuilds/the-super-tiny-compiler.git
synced 2024-10-27 20:34:08 +00:00
733 lines
24 KiB
C#
733 lines
24 KiB
C#
using System;
|
|
using System.Collections.Generic;
|
|
using System.Linq;
|
|
using System.Text;
|
|
using System.Text.RegularExpressions;
|
|
using System.Threading.Tasks;
|
|
|
|
namespace TheSuperThinyCompiler
|
|
{
|
|
public class TheSuperThinyCompiler
|
|
{
|
|
/**
|
|
* =============================================================================
|
|
* The Compiler: turns lisp-like function calls into c-like function calls
|
|
* =============================================================================
|
|
*/
|
|
#region Compiler
|
|
|
|
/**
|
|
* ============================================================================
|
|
* (/^▽^)/
|
|
* THE TOKENIZER!
|
|
* ============================================================================
|
|
*/
|
|
/// <summary>
/// Splits lisp-like source text into a flat list of tokens.
/// </summary>
/// <param name="input">Source text, e.g. <c>(add 2 (subtract 4 2))</c>.</param>
/// <returns>
/// Tokens in source order: one <c>paren</c> token per '(' or ')', one
/// <c>number</c> token per run of digits 0-9 and one <c>name</c> token per run
/// of letters a-z (case-insensitive). Whitespace is skipped and never emitted.
/// </returns>
/// <exception cref="Exception">A character matches none of the classes above.</exception>
public List<Token> tokenizer(string input)
{
    // Build the character classifiers once instead of once per character.
    var whitespace = new Regex(@"\s");
    var numbers = new Regex(@"[0-9]");
    var letters = new Regex(@"[a-z]", RegexOptions.IgnoreCase);

    var tokens = new List<Token>();
    var current = 0;

    while (current < input.Length)
    {
        var @char = input[current];
        var text = @char.ToString();

        if (@char == '(' || @char == ')')
        {
            tokens.Push(new Token()
            {
                Type = TokenTypeEnum.paren,
                Value = text
            });
            current++;
            continue;
        }

        if (whitespace.IsMatch(text))
        {
            current++;
            continue;
        }

        if (numbers.IsMatch(text))
        {
            var value = new StringBuilder();

            // The bounds check lets a number be the very last thing in the
            // input; without it the scan would index one past the end.
            while (current < input.Length && numbers.IsMatch(input[current].ToString()))
            {
                value.Append(input[current]);
                current++;
            }

            tokens.Push(new Token()
            {
                Type = TokenTypeEnum.number,
                Value = value.ToString()
            });
            continue;
        }

        if (letters.IsMatch(text))
        {
            var value = new StringBuilder();

            // Same end-of-input guard as for numbers.
            while (current < input.Length && letters.IsMatch(input[current].ToString()))
            {
                value.Append(input[current]);
                current++;
            }

            tokens.Push(new Token()
            {
                Type = TokenTypeEnum.name,
                Value = value.ToString()
            });
            continue;
        }

        throw new Exception($"I dont know what this character is: '{@char}'");
    }

    return tokens;
}
|
|
|
|
/**
|
|
* ============================================================================
|
|
* ヽ/❀o ل͜ o\ノ
|
|
* THE PARSER!!!
|
|
* ============================================================================
|
|
*/
|
|
/// <summary>
/// Builds the lisp-style AST for a token list.
/// </summary>
/// <param name="tokens">Tokens to parse, in source order.</param>
/// <returns>
/// A <c>Program</c> node whose <c>Body</c> holds one node per top-level
/// expression, in the order they appear in <paramref name="tokens"/>.
/// </returns>
public LispAstNode parser(List<Token> tokens)
{
    var topLevel = new List<LispAstNode>();

    var root = new LispAstNode();
    root.Type = LispAstTypeEnum.Program;
    root.Body = topLevel;

    // walk() advances the cursor itself, so the loop has no increment step.
    for (var position = 0; position < tokens.Count;)
    {
        topLevel.Add(walk(tokens, ref position));
    }

    return root;
}
|
|
/// <summary>
/// Parses one lisp-style expression starting at <c>tokens[current]</c> and
/// advances <paramref name="current"/> past it.
/// </summary>
/// <param name="tokens">The full token list.</param>
/// <param name="current">Index of the first token of the expression; on return, the index just after it.</param>
/// <returns>
/// A <c>NumberLiteral</c> node for a number token, or a <c>CallExpression</c>
/// node for a parenthesised call with its parameters parsed recursively.
/// </returns>
/// <exception cref="Exception">
/// The token cannot start an expression, or the token list ends before a call
/// is closed with ')'.
/// </exception>
protected LispAstNode walk(List<Token> tokens, ref int current)
{
    var token = tokens[current];

    if (token.Type == TokenTypeEnum.number)
    {
        current++;
        return new LispAstNode()
        {
            Type = LispAstTypeEnum.NumberLiteral,
            Value = token.Value
        };
    }

    if (token.Type == TokenTypeEnum.paren && token.Value == "(")
    {
        // The token right after '(' supplies the name of the call.
        current++;
        if (current >= tokens.Count)
        {
            throw new Exception("Unexpected end of input after '('");
        }
        token = tokens[current];

        var node = new LispAstNode()
        {
            Type = LispAstTypeEnum.CallExpression,
            Name = token.Value,
            Params = new List<LispAstNode>()
        };

        // Move on to the first parameter (or the closing paren).
        current++;

        while (true)
        {
            // Report an unterminated call as a parse error instead of
            // running off the end of the token list.
            if (current >= tokens.Count)
            {
                throw new Exception($"Unexpected end of input: missing ')' for call to '{node.Name}'");
            }

            token = tokens[current];
            if (token.Type == TokenTypeEnum.paren && token.Value == ")")
            {
                break;
            }

            node.Params.Push(walk(tokens, ref current));
        }

        // skip the ')'
        current++;
        return node;
    }

    throw new Exception($"{token.Type}");
}
|
|
|
|
/**
|
|
* ============================================================================
|
|
* ⌒(❀>◞౪◟<❀)⌒
|
|
* THE TRAVERSER!!!
|
|
* ============================================================================
|
|
*/
|
|
/// <summary>
/// Walks a lisp-style AST from its root, which is visited with a null parent.
/// </summary>
/// <param name="lispAst">Root node of the tree to walk.</param>
/// <param name="lispVisitor">Callbacks keyed by node type.</param>
public void traverser(LispAstNode lispAst, LispVisitorType lispVisitor) =>
    traverseNode(node: lispAst, parent: null, lispVisitor: lispVisitor);
|
|
/// <summary>
/// Visits every node of a list, in list order, under the same parent.
/// </summary>
/// <param name="array">Child nodes to visit.</param>
/// <param name="parent">Node that owns <paramref name="array"/>.</param>
/// <param name="lispVisitor">Callbacks keyed by node type.</param>
protected void traverseArray(List<LispAstNode> array, LispAstNode parent, LispVisitorType lispVisitor)
{
    foreach (var child in array)
    {
        traverseNode(child, parent, lispVisitor);
    }
}
|
|
/// <summary>
/// Visits one lisp-style node and then its children.
/// </summary>
/// <param name="node">Node to visit.</param>
/// <param name="parent">Parent of <paramref name="node"/>; null for the root.</param>
/// <param name="lispVisitor">Callbacks keyed by node type; a missing entry means "no callback".</param>
/// <exception cref="Exception">The node type is not one of the known lisp AST types.</exception>
protected void traverseNode(LispAstNode node, LispAstNode parent, LispVisitorType lispVisitor)
{
    // The callback for this node type (if any) runs before the children are visited.
    Action<LispAstNode, LispAstNode> enter;
    if (lispVisitor.TryGetValue(node.Type, out enter) && enter != null)
    {
        enter(node, parent);
    }

    var kind = node.Type;

    if (kind == LispAstTypeEnum.NumberLiteral)
    {
        // Leaf node: nothing below it.
        return;
    }

    if (kind == LispAstTypeEnum.Program)
    {
        traverseArray(node.Body, node, lispVisitor);
        return;
    }

    if (kind == LispAstTypeEnum.CallExpression)
    {
        traverseArray(node.Params, node, lispVisitor);
        return;
    }

    throw new Exception($"{node.Type}");
}
|
|
|
|
/**
|
|
* ============================================================================
|
|
* ⁽(◍˃̵͈̑ᴗ˂̵͈̑)⁽
|
|
* THE TRANSFORMER!!!
|
|
* ============================================================================
|
|
*/
|
|
/// <summary>
/// Converts a lisp-style AST into the equivalent C-style AST.
/// </summary>
/// <remarks>
/// Each lisp node's <c>Context</c> is pointed at the C-side list that its
/// children must be appended to: the new program body for the root, and the
/// new call's argument list for every call expression.
/// </remarks>
/// <param name="lispAst">Root of the lisp-style tree; its <c>Context</c> properties are overwritten.</param>
/// <returns>A C-style <c>Program</c> node.</returns>
public CAstNode transformer(LispAstNode lispAst)
{
    var program = new CAstNode();
    program.Type = CAstTypeEnum.Program;
    program.Body = new List<CAstNode>();

    // Top-level expressions land directly in the new program body.
    lispAst.Context = program.Body;

    // Number literals are copied across as-is.
    Action<LispAstNode, LispAstNode> onNumber = (node, parent) =>
    {
        var target = parent.Context;
        target.Add(new CAstNode()
        {
            Type = CAstTypeEnum.NumberLiteral,
            Value = node.Value,
        });
    };

    // Call expressions become a callee identifier plus an argument list.
    Action<LispAstNode, LispAstNode> onCall = (node, parent) =>
    {
        var callee = new CAstNode()
        {
            Type = CAstTypeEnum.Identifier,
            Name = node.Name,
        };
        var call = new CAstNode()
        {
            Type = CAstTypeEnum.CallExpression,
            Callee = callee,
            arguments = new List<CAstNode>()
        };

        // Children of this lisp call are appended to the new call's arguments.
        node.Context = call.arguments;

        // A call that is not nested inside another call is a statement of its
        // own, so it gets wrapped in an ExpressionStatement.
        var nested = parent.Type == LispAstTypeEnum.CallExpression;
        var emitted = nested
            ? call
            : new CAstNode()
            {
                Type = CAstTypeEnum.ExpressionStatement,
                Expression = call
            };

        parent.Context.Add(emitted);
    };

    var visitor = new LispVisitorType();
    visitor[LispAstTypeEnum.NumberLiteral] = onNumber;
    visitor[LispAstTypeEnum.CallExpression] = onCall;

    traverser(lispAst, visitor);

    return program;
}
|
|
|
|
/**
|
|
* ============================================================================
|
|
* ヾ(〃^∇^)ノ♪
|
|
* THE CODE GENERATOR!!!!
|
|
* ============================================================================
|
|
*/
|
|
/// <summary>
/// Renders a C-style AST node (and everything below it) as source text.
/// </summary>
/// <param name="node">Node to render.</param>
/// <returns>
/// Program: its statements joined by newlines. ExpressionStatement: the
/// expression followed by ';'. CallExpression: <c>callee(arg,arg,...)</c>.
/// Identifier: its name. NumberLiteral: its value.
/// </returns>
/// <exception cref="Exception">The node type is not one of the known C AST types.</exception>
public string codeGenerator(CAstNode node)
{
    var kind = node.Type;

    if (kind == CAstTypeEnum.Program)
    {
        var statements = node.Body.Map(child => codeGenerator(child));
        return string.Join("\n", statements);
    }

    if (kind == CAstTypeEnum.ExpressionStatement)
    {
        return string.Concat(codeGenerator(node.Expression), ";");
    }

    if (kind == CAstTypeEnum.CallExpression)
    {
        var callee = codeGenerator(node.Callee);
        var args = string.Join(",", node.arguments.Map(child => codeGenerator(child)));
        return string.Concat(callee, "(", args, ")");
    }

    if (kind == CAstTypeEnum.Identifier)
    {
        return node.Name;
    }

    if (kind == CAstTypeEnum.NumberLiteral)
    {
        return node.Value;
    }

    throw new Exception($"{node.Type}");
}
|
|
|
|
/**
|
|
* ============================================================================
|
|
* (۶* ‘ヮ’)۶”
|
|
* !!!!!!!!THE COMPILER!!!!!!!!
|
|
* ============================================================================
|
|
*/
|
|
/// <summary>
/// Compiles lisp-like source text to C-like source text by running the four
/// stages in order: tokenizer, parser, transformer, codeGenerator.
/// </summary>
/// <param name="input">Lisp-like source text.</param>
/// <returns>The generated C-like source text.</returns>
public string compiler(string input)
{
    return codeGenerator(transformer(parser(tokenizer(input))));
}
|
|
#endregion
|
|
|
|
/**
|
|
* =============================================================================
|
|
* The Decompiler: turns c-like function calls back into lisp-like function calls
|
|
* =============================================================================
|
|
*/
|
|
#region DeCompiler
|
|
/**
|
|
* ============================================================================
|
|
* (/^▽^)/
|
|
* THE TOKENIZER!
|
|
* ============================================================================
|
|
*/
|
|
/// <summary>
/// Splits C-like source text into a flat list of tokens.
/// </summary>
/// <param name="input">Source text, e.g. <c>add(2,subtract(4,2));</c>.</param>
/// <returns>
/// Tokens in source order: one <c>name</c> token per run of letters a-z
/// (case-insensitive), one <c>number</c> token per run of digits 0-9, and one
/// <c>paren</c> token for each of '(', ')', ',' and ';'. Whitespace is skipped.
/// </returns>
/// <exception cref="Exception">A character matches none of the classes above.</exception>
public List<Token> detokenizer(string input)
{
    // Build the character classifiers once instead of once per character.
    var whitespace = new Regex(@"\s");
    var letter = new Regex(@"[a-z]", RegexOptions.IgnoreCase);
    var number = new Regex(@"[0-9]");

    var tokens = new List<Token>();
    var current = 0;

    while (current < input.Length)
    {
        var szChar = input[current].ToString();

        if (whitespace.IsMatch(szChar))
        {
            current++;
            continue;
        }

        if (letter.IsMatch(szChar))
        {
            var value = new StringBuilder();

            // The bounds check lets a name be the very last thing in the
            // input; without it the scan would index one past the end.
            while (current < input.Length && letter.IsMatch(input[current].ToString()))
            {
                value.Append(input[current]);
                current++;
            }

            tokens.Push(new Token()
            {
                Type = TokenTypeEnum.name,
                Value = value.ToString()
            });
            continue;
        }

        if (number.IsMatch(szChar))
        {
            var value = new StringBuilder();

            // Same end-of-input guard as for names.
            while (current < input.Length && number.IsMatch(input[current].ToString()))
            {
                value.Append(input[current]);
                current++;
            }

            tokens.Push(new Token()
            {
                Type = TokenTypeEnum.number,
                Value = value.ToString()
            });
            continue;
        }

        // All punctuation, separators included, is emitted with the paren
        // token type; the parser tells the characters apart by Value.
        if (szChar == "(" || szChar == ")" || szChar == "," || szChar == ";")
        {
            tokens.Push(new Token()
            {
                Type = TokenTypeEnum.paren,
                Value = szChar,
            });
            current++;
            continue;
        }

        throw new Exception($"I dont know what this character is: {szChar}");
    }

    return tokens;
}
|
|
|
|
/**
|
|
* ============================================================================
|
|
* ヽ/❀o ل͜ o\ノ
|
|
* THE PARSER!!!
|
|
* ============================================================================
|
|
*/
|
|
/// <summary>
/// Builds the C-style AST for a token list.
/// </summary>
/// <param name="tokens">Tokens to parse, in source order.</param>
/// <returns>
/// A <c>Program</c> node whose <c>Body</c> holds one node per top-level
/// expression, in the order they appear in <paramref name="tokens"/>.
/// </returns>
public CAstNode deparser(List<Token> tokens)
{
    var topLevel = new List<CAstNode>();

    var root = new CAstNode();
    root.Type = CAstTypeEnum.Program;
    root.Body = topLevel;

    // dewalk() advances the cursor itself, so the loop has no increment step.
    for (var position = 0; position < tokens.Count;)
    {
        topLevel.Add(dewalk(tokens, ref position));
    }

    return root;
}
|
|
/// <summary>
/// Parses one C-style expression starting at <c>tokens[current]</c> and
/// advances <paramref name="current"/> past it.
/// </summary>
/// <param name="tokens">The full token list.</param>
/// <param name="current">Index of the first token of the expression; on return, the index just after it.</param>
/// <returns>
/// A <c>NumberLiteral</c> node for a number token, or a <c>CallExpression</c>
/// node for a name followed by a parenthesised argument list. A ';' directly
/// after the closing ')' is consumed as part of the call.
/// </returns>
/// <exception cref="Exception">
/// The token cannot start an expression, or the token list ends before a call
/// is closed with ')'.
/// </exception>
protected CAstNode dewalk(List<Token> tokens, ref int current)
{
    var token = tokens[current];

    // A number is returned directly as a leaf node.
    if (token.Type == TokenTypeEnum.number)
    {
        current++;
        return new CAstNode()
        {
            Type = CAstTypeEnum.NumberLiteral,
            Value = token.Value
        };
    }

    // A name starts a call expression.
    if (token.Type == TokenTypeEnum.name)
    {
        var expression = new CAstNode()
        {
            Type = CAstTypeEnum.CallExpression,
            Callee = new CAstNode()
            {
                Type = CAstTypeEnum.Identifier,
                Name = token.Value,
            },
            arguments = new List<CAstNode>()
        };

        // Step over the name and the token after it, which is taken to be the
        // opening parenthesis (it is skipped without being inspected).
        current += 2;

        while (true)
        {
            // Report an unterminated call as a parse error instead of
            // running off the end of the token list.
            if (current >= tokens.Count)
            {
                throw new Exception($"Unexpected end of input: missing ')' for call to '{expression.Callee.Name}'");
            }

            token = tokens[current];
            var isParen = token.Type == TokenTypeEnum.paren;

            if (isParen && token.Value == ")")
            {
                break;
            }

            // Separators between arguments carry no meaning of their own.
            if (isParen && (token.Value == "," || token.Value == ";"))
            {
                current++;
                continue;
            }

            expression.arguments.Push(dewalk(tokens, ref current));
        }

        // Skip the closing parenthesis.
        current++;

        // Skip a trailing semicolon, if there is one. The call may also be
        // the last token of the input, so check the bounds first.
        if (current < tokens.Count)
        {
            token = tokens[current];
            if (token.Type == TokenTypeEnum.paren && token.Value == ";")
            {
                current++;
            }
        }

        return expression;
    }

    throw new Exception($"{token.Type}");
}
|
|
|
|
/**
|
|
* ============================================================================
|
|
* ⌒(❀>◞౪◟<❀)⌒
|
|
* THE TRAVERSER!!!
|
|
* ============================================================================
|
|
*/
|
|
/// <summary>
/// Walks a C-style AST from its root, which is visited with a null parent.
/// </summary>
/// <param name="ast">Root node of the tree to walk.</param>
/// <param name="visitor">Callbacks keyed by node type.</param>
public void detraverser(CAstNode ast, CVisitorType visitor) =>
    detraverseNode(node: ast, parent: null, visitor: visitor);
|
|
|
|
/// <summary>
/// Visits every node of a list, in list order, under the same parent.
/// </summary>
/// <param name="array">Child nodes to visit.</param>
/// <param name="parent">Node that owns <paramref name="array"/>.</param>
/// <param name="visitor">Callbacks keyed by node type.</param>
protected void detraverseArray(List<CAstNode> array, CAstNode parent, CVisitorType visitor)
{
    foreach (var child in array)
    {
        detraverseNode(child, parent, visitor);
    }
}
|
|
|
|
/// <summary>
/// Visits one C-style node and then its children.
/// </summary>
/// <param name="node">Node to visit.</param>
/// <param name="parent">Parent of <paramref name="node"/>; null for the root.</param>
/// <param name="visitor">Callbacks keyed by node type; a missing entry means "no callback".</param>
protected void detraverseNode(CAstNode node, CAstNode parent, CVisitorType visitor)
{
    // The callback for this node type (if any) runs before the children are visited.
    Action<CAstNode, CAstNode> enter;
    if (visitor.TryGetValue(node.Type, out enter) && enter != null)
    {
        enter(node, parent);
    }

    var kind = node.Type;

    if (kind == CAstTypeEnum.Program)
    {
        detraverseArray(node.Body, node, visitor);
    }
    else if (kind == CAstTypeEnum.ExpressionStatement)
    {
        detraverseNode(node.Expression, node, visitor);
    }
    else if (kind == CAstTypeEnum.CallExpression)
    {
        // Callee first, then the arguments in order.
        detraverseNode(node.Callee, node, visitor);
        detraverseArray(node.arguments, node, visitor);
    }

    // Identifier and NumberLiteral are leaves; any other type is ignored.
}
|
|
|
|
/**
|
|
* ============================================================================
|
|
* ⁽(◍˃̵͈̑ᴗ˂̵͈̑)⁽
|
|
* THE TRANSFORMER!!!
|
|
* ============================================================================
|
|
*/
|
|
/// <summary>
/// Converts a C-style AST into the equivalent lisp-style AST.
/// </summary>
/// <remarks>
/// Each C node's <c>Context</c> is pointed at the lisp-side list that its
/// children must be appended to: the new program body for the root, and the
/// new call's parameter list for every call expression.
/// </remarks>
/// <param name="ast">Root of the C-style tree; its <c>Context</c> properties are overwritten.</param>
/// <returns>A lisp-style <c>Program</c> node.</returns>
public LispAstNode detransformer(CAstNode ast)
{
    var program = new LispAstNode();
    program.Type = LispAstTypeEnum.Program;
    program.Body = new List<LispAstNode>();

    // Top-level expressions land directly in the new program body.
    ast.Context = program.Body;

    // Number literals are copied across as-is.
    Action<CAstNode, CAstNode> onNumber = (node, parent) =>
    {
        var target = parent.Context;
        target.Add(new LispAstNode()
        {
            Type = LispAstTypeEnum.NumberLiteral,
            Value = node.Value,
        });
    };

    // A C call becomes a lisp call named after the callee.
    Action<CAstNode, CAstNode> onCall = (node, parent) =>
    {
        var call = new LispAstNode()
        {
            Type = LispAstTypeEnum.CallExpression,
            Name = node.Callee.Name,
            Params = new List<LispAstNode>(),
        };

        // Arguments of this C call are appended to the new call's parameters.
        node.Context = call.Params;
        parent.Context.Add(call);
    };

    var visitor = new CVisitorType();
    visitor[CAstTypeEnum.NumberLiteral] = onNumber;
    visitor[CAstTypeEnum.CallExpression] = onCall;

    detraverser(ast, visitor);

    return program;
}
|
|
|
|
/**
|
|
* ============================================================================
|
|
* ヾ(〃^∇^)ノ♪
|
|
* THE CODE GENERATOR!!!!
|
|
* ============================================================================
|
|
*/
|
|
/// <summary>
/// Renders a lisp-style AST node (and everything below it) as source text.
/// </summary>
/// <param name="node">Node to render.</param>
/// <returns>
/// Program: its expressions joined by newlines. NumberLiteral: its value.
/// CallExpression: <c>(name param param ...)</c>.
/// </returns>
/// <exception cref="Exception">The node type is not one of the known lisp AST types.</exception>
public string decodeGenerator(LispAstNode node)
{
    var kind = node.Type;

    if (kind == LispAstTypeEnum.Program)
    {
        var expressions = node.Body.Map(child => decodeGenerator(child));
        return string.Join("\n", expressions);
    }

    if (kind == LispAstTypeEnum.NumberLiteral)
    {
        return node.Value;
    }

    if (kind == LispAstTypeEnum.CallExpression)
    {
        var name = node.Name;
        var parameters = string.Join(" ", node.Params.Map(child => decodeGenerator(child)));
        return string.Concat("(", name, " ", parameters, ")");
    }

    throw new Exception($"{node.Type}");
}
|
|
|
|
/**
|
|
* ============================================================================
|
|
* (۶* ‘ヮ’)۶”
|
|
* !!!!!!!!THE DECOMPILER!!!!!!!!
|
|
* ============================================================================
|
|
*/
|
|
|
|
/// <summary>
/// Decompiles C-like source text to lisp-like source text by running the four
/// stages in order: detokenizer, deparser, detransformer, decodeGenerator.
/// </summary>
/// <param name="input">C-like source text.</param>
/// <returns>The generated lisp-like source text.</returns>
public string decompiler(string input)
{
    return decodeGenerator(detransformer(deparser(detokenizer(input))));
}
|
|
#endregion
|
|
}
|
|
|
|
#region Models & Extenses
|
|
|
|
/// <summary>
/// JavaScript-flavoured helpers (<c>Push</c>, <c>Join</c>, <c>Map</c>) for
/// lists and string arrays.
/// </summary>
public static class TypeFunctionWrapperExtenses
{
    /// <summary>Appends <paramref name="value"/> to the end of <paramref name="source"/>.</summary>
    public static void Push<T>(this List<T> source, T value)
    {
        source.Add(value);
    }

    /// <summary>Concatenates <paramref name="strs"/>, placing <paramref name="s"/> between consecutive items.</summary>
    public static string Join(this string[] strs, string s)
    {
        return string.Join(s, strs);
    }

    /// <summary>
    /// Applies <paramref name="codeGenerator"/> to every item of
    /// <paramref name="source"/> and returns the results as an array.
    /// A null or empty list yields an empty array.
    /// </summary>
    public static V[] Map<T, V>(this List<T> source, Func<T, V> codeGenerator)
    {
        if (source == null || source.Count == 0)
        {
            return new V[0];
        }

        IEnumerable<V> projected = source.Select(codeGenerator);
        return projected.ToArray();
    }
}
|
|
|
|
#region Token Model
|
|
|
|
/// <summary>
/// One lexical token produced by the tokenizers.
/// </summary>
public class Token
{
    /// <summary>Category of the token.</summary>
    public TokenTypeEnum Type { get; set; }

    /// <summary>Text of the token as the tokenizer recorded it.</summary>
    public string Value { get; set; }
}
|
|
|
|
/// <summary>
/// Categories of token the tokenizers emit.
/// </summary>
public enum TokenTypeEnum
{
    /// <summary>Punctuation: '(' and ')'; the C-side tokenizer also uses it for ',' and ';'.</summary>
    paren = 0,

    /// <summary>A run of letters.</summary>
    name = 1,

    /// <summary>A run of digits.</summary>
    number = 2,
}
|
|
#endregion
|
|
|
|
#region Lisp Style AST Model
|
|
/// <summary>
/// A node of the lisp-style AST. One class serves every node type; which
/// properties are meaningful depends on <see cref="Type"/>.
/// </summary>
public class LispAstNode
{
    /// <summary>Kind of node.</summary>
    public LispAstTypeEnum Type { get; set; }

    /// <summary>Function name; set on call expressions.</summary>
    public string Name { get; set; }

    /// <summary>Literal text; set on number literals.</summary>
    public string Value { get; set; }

    /// <summary>Parameters of a call expression.</summary>
    public List<LispAstNode> Params { get; set; } = new List<LispAstNode>();

    /// <summary>Top-level expressions of a program node.</summary>
    public List<LispAstNode> Body { get; set; } = new List<LispAstNode>();

    /// <summary>
    /// C-side list that the transformer appends this node's translated
    /// children to.
    /// </summary>
    public List<CAstNode> Context { get; set; } = new List<CAstNode>();
}
|
|
|
|
/// <summary>
/// Node kinds of the lisp-style AST.
/// </summary>
public enum LispAstTypeEnum
{
    /// <summary>Root node; holds the top-level expressions.</summary>
    Program = 0,

    /// <summary>A named call with parameters.</summary>
    CallExpression = 1,

    /// <summary>A numeric literal.</summary>
    NumberLiteral = 2
}
|
|
#endregion
|
|
|
|
#region C Style AST Model
|
|
|
|
/// <summary>
/// A node of the C-style AST. One class serves every node type; which
/// properties are meaningful depends on <see cref="Type"/>.
/// </summary>
public class CAstNode
{
    /// <summary>Kind of node.</summary>
    public CAstTypeEnum Type { get; set; }

    /// <summary>Identifier text; set on identifier nodes.</summary>
    public string Name { get; set; }

    /// <summary>Literal text; set on number literals.</summary>
    public string Value { get; set; }

    /// <summary>Wrapped expression of an expression statement.</summary>
    public CAstNode Expression { get; set; }

    /// <summary>Identifier node naming the function of a call expression.</summary>
    public CAstNode Callee { get; set; }

    /// <summary>Arguments of a call expression.</summary>
    public List<CAstNode> arguments { get; set; } = new List<CAstNode>();

    /// <summary>Top-level statements of a program node.</summary>
    public List<CAstNode> Body { get; set; } = new List<CAstNode>();

    /// <summary>
    /// Lisp-side list that the detransformer appends this node's translated
    /// children to.
    /// </summary>
    public List<LispAstNode> Context { get; set; } = new List<LispAstNode>();
}
|
|
|
|
/// <summary>
/// Node kinds of the C-style AST.
/// </summary>
public enum CAstTypeEnum
{
    /// <summary>Root node; holds the top-level statements.</summary>
    Program = 0,

    /// <summary>A statement wrapping a single expression.</summary>
    ExpressionStatement = 1,

    /// <summary>A call with a callee and arguments.</summary>
    CallExpression = 2,

    /// <summary>A bare name, used as the callee of a call.</summary>
    Identifier = 3,

    /// <summary>A numeric literal.</summary>
    NumberLiteral = 4,
}
|
|
|
|
#endregion
|
|
|
|
#region VisitorWrap
|
|
|
|
// I hate writing long types over and over……
|
|
|
|
/// <summary>
/// Visitor table for the lisp-style AST: maps a node type to the callback
/// invoked with (node, parent) when a node of that type is reached.
/// Exists only as a short name for the long dictionary type.
/// </summary>
public class LispVisitorType : Dictionary<LispAstTypeEnum, Action<LispAstNode, LispAstNode>>
{
}
|
|
|
|
/// <summary>
/// Visitor table for the C-style AST: maps a node type to the callback
/// invoked with (node, parent) when a node of that type is reached.
/// Exists only as a short name for the long dictionary type.
/// </summary>
public class CVisitorType : Dictionary<CAstTypeEnum, Action<CAstNode, CAstNode>>
{
}
|
|
|
|
#endregion
|
|
|
|
#endregion
|
|
|
|
}
|