C++实现编译器(8): 符号表、字符串、数组与字典¶

参照《Writing An Interpreter/Compiler In Go》，改用C++实现。

项目源码: https://github.com/LeslieZhu/monkey-cpp

引言¶

本篇对应的源码位于目录: src/07/

src
 |07
 | |token
 | | |token.hpp
 | |evaluator
 | | |evaluator.hpp
 | | |builtins.hpp
 | |CMakeLists.txt
 | |test
 | | |lexer_test.hpp
 | | |parser_test.hpp
 | | |evaluator_test.hpp
 | | |symbol_table_test.hpp
 | | |vm_test.hpp
 | | |objects_test.hpp
 | | |ast_test.hpp
 | | |code_test.hpp
 | | |compiler_test.hpp
 | | |main.cpp
 | |lexer
 | | |lexer.hpp
 | |repl
 | | |repl.hpp
 | |code
 | | |code.hpp
 | |objects
 | | |objects.hpp
 | | |environment.hpp
 | |parser
 | | |parser.hpp
 | | |parser_tracing.hpp
 | |vm
 | | |vm.hpp
 | |ast
 | | |ast.hpp
 | |main
 | | |monkey.cpp
 | |compiler
 | | |symbol_table.hpp
 | | |compiler.hpp

这里为虚拟机添加一个符号表，用于绑定变量。

因为字节码每个操作码的操作数只能是整数，因此在符号表里面将变量和一个整数建立映射关系，然后通过这个整数来代表变量写到字节码。等到虚拟机解码执行的时候，这个数字就是这个变量解码后存储的索引，可以直接访问。

目前这个符号表是全局的，后续再处理非全局的。

增加对字符串、数组和哈希表的编译支持。

符号表¶

符号表是解释器和编译器中用于将标志符和与之对应数据结构关联。

using SymbolScope = std::string;

namespace SymbolScopeType
{
    const SymbolScope GlobalScope = "GLOBAL";
}

struct Symbol{
    std::string Name;
    SymbolScope Scope;
    int Index;

    Symbol(std::string name, SymbolScope scope, int idx): Name(name), Scope(scope), Index(idx){}
};

struct SymbolTable{
    std::map<std::string, std::shared_ptr<Symbol>> store;
    int numDefinitions;

    std::shared_ptr<Symbol> Define(std::string name)
    {
        auto symbol = std::make_shared<Symbol>(name, SymbolScopeType::GlobalScope, numDefinitions);
        store[name] = symbol;
        numDefinitions++;
        return symbol;
    }

    std::shared_ptr<Symbol> Resolve(std::string name)
    {
        auto fit = store.find(name);
        if(fit != store.end())
        {
            return fit->second;
        }
        else
        {
            return nullptr;
        }
    }
};

每个标志符都有一个对应的数字标记。

编译期绑定¶

编译器遇到变量就在符号表中为其指定一个整数，而执行时根据这个整数去动态构建的全局变量表中拿到对应的值:

struct Compiler
{
    std::shared_ptr<compiler::SymbolTable> symbolTable;

    //...
};

else if(node->GetNodeType() == ast::NodeType::LetStatement)
{
    std::shared_ptr<ast::LetStatement> letObj = std::dynamic_pointer_cast<ast::LetStatement>(node);
    auto resultObj = Compile(letObj->pValue);
    if (evaluator::isError(resultObj))
    {
        return resultObj;
    }

    auto symbol = symbolTable->Define(letObj->pName->Value);
    emit(bytecode::OpcodeType::OpSetGlobal, {symbol->Index});
}
else if(node->GetNodeType() == ast::NodeType::Identifier)
{
    std::shared_ptr<ast::Identifier> identObj = std::dynamic_pointer_cast<ast::Identifier>(node);
    auto symbol = symbolTable->Resolve(identObj->Value);
    if(symbol == nullptr)
    {
        return evaluator::newError("undefined variable " + identObj->Value);
    }
    emit(bytecode::OpcodeType::OpGetGlobal, {symbol->Index});
}

遇到let语句定义标志符或使用标志符就在字节码中写入它对应的数字。

虚拟机运行时支持全局变量¶

虚拟机解码字节码的时候，遇到变量定义时则自动生成并维护一份全局变量清单：

const int GlobalsSize = 65536;

struct VM{
    std::vector<std::shared_ptr<objects::Object>> globals;
    //...

    VM(std::vector<std::shared_ptr<objects::Object>>& objs, bytecode::Instructions& ins):
    constants(objs),
    instructions(ins)
    {
        globals.resize(GlobalsSize);
        //...
    }

    //...
};


case bytecode::OpcodeType::OpSetGlobal:
    {
        uint16_t globalIndex;
        bytecode::ReadUint16(instructions, ip+1, globalIndex);
        ip += 2;
        globals[globalIndex] = Pop();
    }
    break;
case bytecode::OpcodeType::OpGetGlobal:
    {
        uint16_t globalIndex;
        bytecode::ReadUint16(instructions, ip+1, globalIndex);
        ip += 2;
        auto result = Push(globals[globalIndex]);
        if(evaluator::isError(result))
        {
           return result;
        }
    }
    break;

因为全局变量清单是提前准备好内存大小的，因此基于索引来访问和更新基本是没有开销的。

REPL保持符号表状态¶

在REPL中要维护一份全局的状态，每次执行语句时候共用这一份状态:

编译器传入状态:

std::shared_ptr<Compiler> New()
{
    return std::make_shared<Compiler>();
}

std::shared_ptr<Compiler> NewWithState(std::shared_ptr<compiler::SymbolTable> symbolTable,
                                       std::vector<std::shared_ptr<objects::Object>>& constants)
{
    std::shared_ptr<Compiler> compiler = New();
    compiler->symbolTable = symbolTable;
    compiler->constants = constants;
    return compiler;
}

虚拟机传入状态:

std::shared_ptr<VM> New(std::shared_ptr<compiler::ByteCode> bytecode)
{
    return std::make_shared<VM>(bytecode->Constants, bytecode->Instructions);
}

std::shared_ptr<VM> NewWithGlobalsStore(std::shared_ptr<compiler::ByteCode> bytecode,
                                        std::vector<std::shared_ptr<objects::Object>>& s)
{
    std::shared_ptr<VM> vm = New(bytecode);
    vm->globals = s;
    return vm;
}

REPL使用全局状态:

std::vector<std::shared_ptr<objects::Object>> constants{};
std::vector<std::shared_ptr<objects::Object>> globals(vm::GlobalsSize);
auto symbolTable = compiler::NewSymbolTable();

//auto comp = compiler::New();
auto comp = compiler::NewWithState(symbolTable, constants);

//auto machine = vm::New(comp->Bytecode());
auto code = comp->Bytecode();
auto machine = vm::NewWithGlobalsStore(code, globals);

// 更新状态为下一个命令使用
constants = code->Constants;
globals = machine->globals;

效果:

./monkey
Hello lesliezhu! This is the Monkey-CPP programming language!
Feel free to type in commands
>> let a = 1; let b = 2;
2
>> a
1
>> b
2
>> let c = a + b; c
3
>>

字符串¶

字符串是作为常量表达式来处理的，因此不需要添加新操作码：

else if(node->GetNodeType() == ast::NodeType::StringLiteral)
{
    auto stringLiteral = std::dynamic_pointer_cast<ast::StringLiteral>(node);
    auto strObj = std::make_shared<objects::String>(stringLiteral->Value);
    auto pos = addConstant(strObj);
    emit(bytecode::OpcodeType::OpConstant, {pos});
}

为字符串类型添加拼接操作:

std::shared_ptr<objects::Object> executeBInaryStringOperaction(bytecode::OpcodeType op,
                                                                std::shared_ptr<objects::Object> left,
                                                                std::shared_ptr<objects::Object> right)
{
    auto rightObj = std::dynamic_pointer_cast<objects::String>(right);
    auto leftObj = std::dynamic_pointer_cast<objects::String>(left);

    std::string result;

    switch (op)
    {
    case bytecode::OpcodeType::OpAdd:
        result = leftObj->Value + rightObj->Value;
        break;

    default:
        return evaluator::newError("unknow string operator: " + bytecode::OpcodeTypeStr(op));
        break;
    }

    return Push(std::make_shared<objects::String>(result));
}

数组¶

数组的元素可以是任意表达式，因此在编译期是无法知道具体的求值结果的，但可以知道有多少个元素。因此只需要添加一个操作码，运行时通过把前面一定数量的元素解码组成数组即可。

else if(node->GetNodeType() == ast::NodeType::ArrayLiteral)
{
    std::shared_ptr<ast::ArrayLiteral> arrayLiteral = std::dynamic_pointer_cast<ast::ArrayLiteral>(node);
    for(auto &stmt: arrayLiteral->Elements)
    {
        auto resultObj = Compile(stmt);
        if (evaluator::isError(resultObj))
        {
            return resultObj;
        }
    }

    emit(bytecode::OpcodeType::OpArray, {static_cast<int>(arrayLiteral->Elements.size())});
}

虚拟机解码时重新构建数组并进行替换:

case bytecode::OpcodeType::OpArray:
    {
        uint16_t numElements;
        bytecode::ReadUint16(instructions, ip+1, numElements);
        ip += 2;

        // 读取对应个数的元素组成数组
        auto arrayObj = buildArray(sp - numElements, sp);
        sp -= numElements; // 移出元素

        auto result = Push(arrayObj); // 压入数组

        if(evaluator::isError(result))
        {
           return result;
        }
    }
    break;

执行效果:

>> [1,2,3]
[1, 2, 3]
>>
>> [1+2,2*3,3-4,if(true){false}]
[3, 6, -1, false]
>>
>> [1+2,2*3,3-4,if(false){false}]
[3, 6, -1, null]

哈希表或字典¶

和数组的方法类似，数组是有N个元素，那么字典就是有N个元素对，虚拟机解码时重新构建这个字典结构:

else if(node->GetNodeType() == ast::NodeType::HashLiteral)
{
    std::shared_ptr<ast::HashLiteral> hashLiteral = std::dynamic_pointer_cast<ast::HashLiteral>(node);

    std::vector<std::shared_ptr<ast::Expression>> keys{};
    for(auto &pair: hashLiteral->Pairs)
    {
        keys.push_back(pair.first);
    }

    std::sort(keys.begin(), keys.end(), [](const auto &lhs, const auto& rhs){ return lhs->String() < rhs->String(); });

    for(auto &key: keys)
    {
        auto resultObj = Compile(key);
        if (evaluator::isError(resultObj))
        {
            return resultObj;
        }

        resultObj = Compile(hashLiteral->Pairs[key]);
        if (evaluator::isError(resultObj))
        {
            return resultObj;
        }
    }  

    emit(bytecode::OpcodeType::OpHash, {2 * static_cast<int>(hashLiteral->Pairs.size())});              
}

虚拟机重新组织字典对象:

 case bytecode::OpcodeType::OpHash:
     {
         uint16_t numElements;
         bytecode::ReadUint16(instructions, ip+1, numElements);
         ip += 2;

         auto hashObj = buildHash(sp - numElements, sp);
         if(evaluator::isError(hashObj))
         {
            return hashObj;
         }

         sp -= numElements;

         auto result = Push(hashObj);
         if(evaluator::isError(result))
         {
            return result;
         }
     }
     break;

数字和字典的两个辅助函数:

 std::shared_ptr<objects::Object> buildArray(const int& startIndex, const int& endIndex)
 {
     std::vector<std::shared_ptr<objects::Object>> elements(endIndex - startIndex);
     for(int i=startIndex; i < endIndex; i++)
     {
         elements[i - startIndex] = stack[i];
     }

     return std::make_shared<objects::Array>(elements);
 }

 std::shared_ptr<objects::Object> buildHash(const int& startIndex, const int& endIndex)
 {
     std::map<objects::HashKey, std::shared_ptr<objects::HashPair>> hashPairs;

     for(int i=startIndex; i < endIndex; i += 2)
     {
         auto key = stack[i];
         auto value = stack[i+1];

         auto pair = std::make_shared<objects::HashPair>(key, value);

         if(!key->Hashable())
         {
             return evaluator::newError("unusable as hash type: " + key->TypeStr());
         }

         hashPairs[key->GetHashKey()] = pair;
     }

     return std::make_shared<objects::Hash>(hashPairs);
 }

效果:

>> let a = "Name"
"Name"
>> let b = {a: "Leslie", "age": 20 + 5}
{"age": 25, "Name": "Leslie"}
>> {a: b}
{"Name": {"age": 25, "Name": "Leslie"}}
>>

索引操作符¶

索引操作符语法是:

<表达式>[表达式]

需要定义一个操作码，然后获取前两个元素分别作为操作数和索引即可:

else if(node->GetNodeType() == ast::NodeType::IndexExpression)
{
    std::shared_ptr<ast::IndexExpression> indexObj = std::dynamic_pointer_cast<ast::IndexExpression>(node);

    auto resultObj = Compile(indexObj->Left);
    if (evaluator::isError(resultObj))
    {
        return resultObj;
    }

    resultObj = Compile(indexObj->Index);
    if (evaluator::isError(resultObj))
    {
        return resultObj;
    }

    emit(bytecode::OpcodeType::OpIndex);
}

虚拟机这边直接复用求值器部分的代码进行处理:

case bytecode::OpcodeType::OpIndex:
    {
        auto index = Pop();
        auto left = Pop();

        auto result = executeIndexExpression(left, index);
        if(evaluator::isError(result))
        {
           return result;
        }
    }
    break;

std::shared_ptr<objects::Object> executeIndexExpression(std::shared_ptr<objects::Object> left,
                                                        std::shared_ptr<objects::Object> index)
{
    if(left->Type() == objects::ObjectType::ARRAY && index->Type() == objects::ObjectType::INTEGER)
    {
        auto result = evaluator::evalArrayIndexExpression(left, index);
        if (evaluator::isError(result))
        {
            return result;
        }

        return Push(result);

    }
    else if(left->Type() == objects::ObjectType::HASH)
    {
        auto result = evaluator::evalHashIndexExpression(left, index);
        if (evaluator::isError(result))
        {
            return result;
        }

        return Push(result);
    }
    else 
    {
        return evaluator::newError("index operator not supported: " + left->TypeStr());
    }
}

执行效果：

>> [1, 2, 3][1]
2
>> {"one": 1, "two": 2, "three": 3}["o" + "ne"]
1
>>

总结¶

所有的一切都发生在调用栈，数组和哈希表等只需要知道元素数量，然后从调用栈出栈再构建恢复数据结构。

本文作者：「吴羽舒(LeslieZhu) 」创作于Tuesday, December 13, 2022，共1961字，需要10分钟阅读时间
微信搜索： 「 MinYiLife 」, 关注公众号!
本文链接： https://www.lesliezhu.com/blog/2022/12/13/writing_an_interpreter_in_cpp_8/
版权声明： 原创文章，如需转载请注明文章作者和出处。谢谢！