C++实现编译器(8): 符号表、字符串、数组与字典¶
参照《Writing An Interpreter/Compiler In Go》,改用C++实现。
引言¶
本篇对应的源码位于目录: src/07/
src
|07
| |token
| | |token.hpp
| |evaluator
| | |evaluator.hpp
| | |builtins.hpp
| |CMakeLists.txt
| |test
| | |lexer_test.hpp
| | |parser_test.hpp
| | |evaluator_test.hpp
| | |symbol_table_test.hpp
| | |vm_test.hpp
| | |objects_test.hpp
| | |ast_test.hpp
| | |code_test.hpp
| | |compiler_test.hpp
| | |main.cpp
| |lexer
| | |lexer.hpp
| |repl
| | |repl.hpp
| |code
| | |code.hpp
| |objects
| | |objects.hpp
| | |environment.hpp
| |parser
| | |parser.hpp
| | |parser_tracing.hpp
| |vm
| | |vm.hpp
| |ast
| | |ast.hpp
| |main
| | |monkey.cpp
| |compiler
| | |symbol_table.hpp
| | |compiler.hpp
这里为虚拟机添加一个符号表
,用于绑定变量。
因为字节码每个操作码的操作数只能是整数,因此在符号表里面将变量和一个整数建立映射关系,然后通过这个整数来代表变量写到字节码。等到虚拟机解码执行的时候,这个数字就是这个变量解码后存储的索引,可以直接访问。
目前这个符号表是全局的,后续再处理非全局的。
增加对字符串、数组和哈希表的编译支持。
符号表¶
符号表是解释器和编译器中用于将标志符和与之对应数据结构关联。
using SymbolScope = std::string;
namespace SymbolScopeType
{
const SymbolScope GlobalScope = "GLOBAL";
}
struct Symbol{
std::string Name;
SymbolScope Scope;
int Index;
Symbol(std::string name, SymbolScope scope, int idx): Name(name), Scope(scope), Index(idx){}
};
struct SymbolTable{
std::map<std::string, std::shared_ptr<Symbol>> store;
int numDefinitions;
std::shared_ptr<Symbol> Define(std::string name)
{
auto symbol = std::make_shared<Symbol>(name, SymbolScopeType::GlobalScope, numDefinitions);
store[name] = symbol;
numDefinitions++;
return symbol;
}
std::shared_ptr<Symbol> Resolve(std::string name)
{
auto fit = store.find(name);
if(fit != store.end())
{
return fit->second;
}
else
{
return nullptr;
}
}
};
每个标志符都有一个对应的数字标记。
编译期绑定¶
编译器遇到变量就在符号表中为其指定一个整数,而执行时根据这个整数去动态构建的全局变量表中拿到对应的值:
struct Compiler
{
std::shared_ptr<compiler::SymbolTable> symbolTable;
//...
};
else if(node->GetNodeType() == ast::NodeType::LetStatement)
{
std::shared_ptr<ast::LetStatement> letObj = std::dynamic_pointer_cast<ast::LetStatement>(node);
auto resultObj = Compile(letObj->pValue);
if (evaluator::isError(resultObj))
{
return resultObj;
}
auto symbol = symbolTable->Define(letObj->pName->Value);
emit(bytecode::OpcodeType::OpSetGlobal, {symbol->Index});
}
else if(node->GetNodeType() == ast::NodeType::Identifier)
{
std::shared_ptr<ast::Identifier> identObj = std::dynamic_pointer_cast<ast::Identifier>(node);
auto symbol = symbolTable->Resolve(identObj->Value);
if(symbol == nullptr)
{
return evaluator::newError("undefined variable " + identObj->Value);
}
emit(bytecode::OpcodeType::OpGetGlobal, {symbol->Index});
}
遇到let
语句定义标志符或使用标志符就在字节码中写入它对应的数字。
虚拟机运行时支持全局变量¶
虚拟机解码字节码的时候,遇到变量定义时则自动生成并维护一份全局变量清单:
const int GlobalsSize = 65536;
struct VM{
std::vector<std::shared_ptr<objects::Object>> globals;
//...
VM(std::vector<std::shared_ptr<objects::Object>>& objs, bytecode::Instructions& ins):
constants(objs),
instructions(ins)
{
globals.resize(GlobalsSize);
//...
}
//...
};
case bytecode::OpcodeType::OpSetGlobal:
{
uint16_t globalIndex;
bytecode::ReadUint16(instructions, ip+1, globalIndex);
ip += 2;
globals[globalIndex] = Pop();
}
break;
case bytecode::OpcodeType::OpGetGlobal:
{
uint16_t globalIndex;
bytecode::ReadUint16(instructions, ip+1, globalIndex);
ip += 2;
auto result = Push(globals[globalIndex]);
if(evaluator::isError(result))
{
return result;
}
}
break;
因为全局变量清单是提前准备好内存大小的,因此基于索引来访问和更新基本是没有开销的。
REPL保持符号表状态¶
在REPL中要维护一份全局的状态,每次执行语句时候共用这一份状态:
编译器传入状态:
std::shared_ptr<Compiler> New()
{
return std::make_shared<Compiler>();
}
std::shared_ptr<Compiler> NewWithState(std::shared_ptr<compiler::SymbolTable> symbolTable,
std::vector<std::shared_ptr<objects::Object>>& constants)
{
std::shared_ptr<Compiler> compiler = New();
compiler->symbolTable = symbolTable;
compiler->constants = constants;
return compiler;
}
虚拟机传入状态:
std::shared_ptr<VM> New(std::shared_ptr<compiler::ByteCode> bytecode)
{
return std::make_shared<VM>(bytecode->Constants, bytecode->Instructions);
}
std::shared_ptr<VM> NewWithGlobalsStore(std::shared_ptr<compiler::ByteCode> bytecode,
std::vector<std::shared_ptr<objects::Object>>& s)
{
std::shared_ptr<VM> vm = New(bytecode);
vm->globals = s;
return vm;
}
REPL使用全局状态:
std::vector<std::shared_ptr<objects::Object>> constants{};
std::vector<std::shared_ptr<objects::Object>> globals(vm::GlobalsSize);
auto symbolTable = compiler::NewSymbolTable();
//auto comp = compiler::New();
auto comp = compiler::NewWithState(symbolTable, constants);
//auto machine = vm::New(comp->Bytecode());
auto code = comp->Bytecode();
auto machine = vm::NewWithGlobalsStore(code, globals);
// 更新状态为下一个命令使用
constants = code->Constants;
globals = machine->globals;
效果:
./monkey
Hello lesliezhu! This is the Monkey-CPP programming language!
Feel free to type in commands
>> let a = 1; let b = 2;
2
>> a
1
>> b
2
>> let c = a + b; c
3
>>
字符串¶
字符串是作为常量表达式来处理的,因此不需要添加新操作码:
else if(node->GetNodeType() == ast::NodeType::StringLiteral)
{
auto stringLiteral = std::dynamic_pointer_cast<ast::StringLiteral>(node);
auto strObj = std::make_shared<objects::String>(stringLiteral->Value);
auto pos = addConstant(strObj);
emit(bytecode::OpcodeType::OpConstant, {pos});
}
为字符串类型添加拼接操作:
std::shared_ptr<objects::Object> executeBInaryStringOperaction(bytecode::OpcodeType op,
std::shared_ptr<objects::Object> left,
std::shared_ptr<objects::Object> right)
{
auto rightObj = std::dynamic_pointer_cast<objects::String>(right);
auto leftObj = std::dynamic_pointer_cast<objects::String>(left);
std::string result;
switch (op)
{
case bytecode::OpcodeType::OpAdd:
result = leftObj->Value + rightObj->Value;
break;
default:
return evaluator::newError("unknow string operator: " + bytecode::OpcodeTypeStr(op));
break;
}
return Push(std::make_shared<objects::String>(result));
}
数组¶
数组的元素可以是任意表达式,因此在编译期是无法知道具体的求值结果的,但可以知道有多少个元素。因此只需要添加一个操作码,运行时通过把前面一定数量的元素解码组成数组即可。
else if(node->GetNodeType() == ast::NodeType::ArrayLiteral)
{
std::shared_ptr<ast::ArrayLiteral> arrayLiteral = std::dynamic_pointer_cast<ast::ArrayLiteral>(node);
for(auto &stmt: arrayLiteral->Elements)
{
auto resultObj = Compile(stmt);
if (evaluator::isError(resultObj))
{
return resultObj;
}
}
emit(bytecode::OpcodeType::OpArray, {static_cast<int>(arrayLiteral->Elements.size())});
}
虚拟机解码时重新构建数组并进行替换:
case bytecode::OpcodeType::OpArray:
{
uint16_t numElements;
bytecode::ReadUint16(instructions, ip+1, numElements);
ip += 2;
// 读取对应个数的元素组成数组
auto arrayObj = buildArray(sp - numElements, sp);
sp -= numElements; // 移出元素
auto result = Push(arrayObj); // 压入数组
if(evaluator::isError(result))
{
return result;
}
}
break;
执行效果:
>> [1,2,3]
[1, 2, 3]
>>
>> [1+2,2*3,3-4,if(true){false}]
[3, 6, -1, false]
>>
>> [1+2,2*3,3-4,if(false){false}]
[3, 6, -1, null]
哈希表或字典¶
和数组的方法类似,数组是有N个元素,那么字典就是有N个元素对,虚拟机解码时重新构建这个字典结构:
else if(node->GetNodeType() == ast::NodeType::HashLiteral)
{
std::shared_ptr<ast::HashLiteral> hashLiteral = std::dynamic_pointer_cast<ast::HashLiteral>(node);
std::vector<std::shared_ptr<ast::Expression>> keys{};
for(auto &pair: hashLiteral->Pairs)
{
keys.push_back(pair.first);
}
std::sort(keys.begin(), keys.end(), [](const auto &lhs, const auto& rhs){ return lhs->String() < rhs->String(); });
for(auto &key: keys)
{
auto resultObj = Compile(key);
if (evaluator::isError(resultObj))
{
return resultObj;
}
resultObj = Compile(hashLiteral->Pairs[key]);
if (evaluator::isError(resultObj))
{
return resultObj;
}
}
emit(bytecode::OpcodeType::OpHash, {2 * static_cast<int>(hashLiteral->Pairs.size())});
}
虚拟机重新组织字典对象:
case bytecode::OpcodeType::OpHash:
{
uint16_t numElements;
bytecode::ReadUint16(instructions, ip+1, numElements);
ip += 2;
auto hashObj = buildHash(sp - numElements, sp);
if(evaluator::isError(hashObj))
{
return hashObj;
}
sp -= numElements;
auto result = Push(hashObj);
if(evaluator::isError(result))
{
return result;
}
}
break;
数字和字典的两个辅助函数:
std::shared_ptr<objects::Object> buildArray(const int& startIndex, const int& endIndex)
{
std::vector<std::shared_ptr<objects::Object>> elements(endIndex - startIndex);
for(int i=startIndex; i < endIndex; i++)
{
elements[i - startIndex] = stack[i];
}
return std::make_shared<objects::Array>(elements);
}
std::shared_ptr<objects::Object> buildHash(const int& startIndex, const int& endIndex)
{
std::map<objects::HashKey, std::shared_ptr<objects::HashPair>> hashPairs;
for(int i=startIndex; i < endIndex; i += 2)
{
auto key = stack[i];
auto value = stack[i+1];
auto pair = std::make_shared<objects::HashPair>(key, value);
if(!key->Hashable())
{
return evaluator::newError("unusable as hash type: " + key->TypeStr());
}
hashPairs[key->GetHashKey()] = pair;
}
return std::make_shared<objects::Hash>(hashPairs);
}
效果:
>> let a = "Name"
"Name"
>> let b = {a: "Leslie", "age": 20 + 5}
{"age": 25, "Name": "Leslie"}
>> {a: b}
{"Name": {"age": 25, "Name": "Leslie"}}
>>
索引操作符¶
索引操作符语法是:
需要定义一个操作码,然后获取前两个元素分别作为操作数和索引即可:
else if(node->GetNodeType() == ast::NodeType::IndexExpression)
{
std::shared_ptr<ast::IndexExpression> indexObj = std::dynamic_pointer_cast<ast::IndexExpression>(node);
auto resultObj = Compile(indexObj->Left);
if (evaluator::isError(resultObj))
{
return resultObj;
}
resultObj = Compile(indexObj->Index);
if (evaluator::isError(resultObj))
{
return resultObj;
}
emit(bytecode::OpcodeType::OpIndex);
}
虚拟机这边直接复用求值器部分的代码进行处理:
case bytecode::OpcodeType::OpIndex:
{
auto index = Pop();
auto left = Pop();
auto result = executeIndexExpression(left, index);
if(evaluator::isError(result))
{
return result;
}
}
break;
std::shared_ptr<objects::Object> executeIndexExpression(std::shared_ptr<objects::Object> left,
std::shared_ptr<objects::Object> index)
{
if(left->Type() == objects::ObjectType::ARRAY && index->Type() == objects::ObjectType::INTEGER)
{
auto result = evaluator::evalArrayIndexExpression(left, index);
if (evaluator::isError(result))
{
return result;
}
return Push(result);
}
else if(left->Type() == objects::ObjectType::HASH)
{
auto result = evaluator::evalHashIndexExpression(left, index);
if (evaluator::isError(result))
{
return result;
}
return Push(result);
}
else
{
return evaluator::newError("index operator not supported: " + left->TypeStr());
}
}
执行效果:
总结¶
所有的一切都发生在调用栈,数组和哈希表等只需要知道元素数量,然后从调用栈出栈再构建恢复数据结构。
- 微信搜索: 「 MinYiLife 」, 关注公众号!
- 本文链接: https://www.lesliezhu.com/blog/2022/12/13/writing_an_interpreter_in_cpp_8/
- 版权声明: 原创文章,如需转载请注明文章作者和出处。谢谢!