参照《Writing An Interpreter/Compiler In Go》,改用C++实现。

项目源码: https://github.com/LeslieZhu/monkey-cpp

引言

本篇对应的源码位于目录: src/04/

04
 |token
 | |token.hpp
 |evaluator
 | |evaluator.hpp
 | |builtins.hpp
 |CMakeLists.txt
 |test
 | |lexer_test.hpp
 | |parser_test.hpp
 | |evaluator_test.hpp
 | |objects_test.hpp
 | |ast_test.hpp
 | |main.cpp
 |lexer
 | |lexer.hpp
 |repl
 | |repl.hpp
 |objects
 | |objects.hpp
 | |environment.hpp
 |parser
 | |parser.hpp
 | |parser_tracing.hpp
 |ast
 | |ast.hpp
 |main
 | |monkey.cpp

这篇主要在现有框架上扩展一些功能:

  • 字符串
  • 内置函数: len,first,rest,push,puts
  • 数组
  • 字典或哈希表
  • 索引表达式

字符串

以双引号包裹起来的就当做字符串,词法分析阶段添加:

case '"':
{
    tok.Type = token::types::STRING;
    tok.Literal = readString();
}
break;

暂时不考虑转义字符等:

std::string readString()
{
    int oldPosition = position + 1;
    while (true)
    {
        readChar();
        if(ch == '"' || ch == 0)
        {
            break;
        }
    }

    return input.substr(oldPosition, position - oldPosition);
}

其AST节点比较简单:

struct StringLiteral : Expression
{
    token::Token Token;
    std::string Value;
    //...
    virtual NodeType GetNodeType() { return ast::NodeType::StringLiteral; }
};

为字符串"这个Token绑定对应操作:

pParser->registerPrefix(token::types::STRING, &Parser::parseStringLiteral);

std::shared_ptr<ast::Expression> parseStringLiteral()
{
    std::shared_ptr<ast::StringLiteral> pStr = std::make_shared<ast::StringLiteral>(curToken);

    return pStr;
}

字符串对象表示为宿主语言里面的字符串类型包裹一下:

struct String : Object
{
    std::string Value;

    String(): Value(""){}
    String(std::string val) : Value(val) {}

    virtual ~String() {}
    virtual ObjectType Type() { return ObjectType::STRING; }
    virtual std::string Inspect()
    {
    	return "\"" + Value + "\"";
    }
};

求值就直接是创建字符串对象:

else if(node->GetNodeType() == ast::NodeType::StringLiteral)
{
    std::shared_ptr<ast::StringLiteral> stringLiteral = std::dynamic_pointer_cast<ast::StringLiteral>(node);
    return std::make_shared<objects::String>(stringLiteral->Value);
}

字符串拼接操作

拼接字符串和之前数字加法类似:

std::shared_ptr<objects::Object> evalStringInfixExpression(std::string ops, 
                                                           std::shared_ptr<objects::Object> left, 
                                                           std::shared_ptr<objects::Object> right)
{
    if(ops != "+")
    {
    	return newError("unknown operator: " + left->TypeStr() + " " + ops + " " + right->TypeStr());
    }
    std::string leftValue = std::dynamic_pointer_cast<objects::String>(left)->Value;
    std::string rightValue = std::dynamic_pointer_cast<objects::String>(right)->Value;

    std::shared_ptr<objects::String> result = std::make_shared<objects::String>(leftValue + rightValue);
    return result;
}

//...    
else if (left->Type() == objects::ObjectType::STRING && right->Type() == objects::ObjectType::STRING)
{
    return evalStringInfixExpression(ops, left, right);
}

执行效果:

>> let a = "你好"
>> a
"你好"
>> a + " this is " + "Monkey"
"你好 this is Monkey"

内置函数

一些常用的、效率高要求的函数,可以内置到解释器里面:

using BuiltinFunction = std::shared_ptr<objects::Object> (*)(std::vector<std::shared_ptr<objects::Object>>& args);

struct Builtin: Object
{
    BuiltinFunction Fn;

    Builtin(BuiltinFunction fn): Fn(fn){}
    virtual ~Builtin(){}
    virtual ObjectType Type() { return ObjectType::BUILTIN; }
    virtual std::string Inspect() { return "builltin function"; }
};

就用一个字典保存内置函数的清单:

std::map<std::string, std::shared_ptr<objects::Builtin>> builtins
{
    {
        "len", std::make_shared<objects::Builtin>(&BuiltinFunc_Len)
    },
    {
        "first", std::make_shared<objects::Builtin>(&BuiltinFunc_First)
    },
    {
        "last", std::make_shared<objects::Builtin>(&BuiltinFunc_Last)
    },
    {
        "rest", std::make_shared<objects::Builtin>(&BuiltinFunc_Rest)
    },
    {
        "push", std::make_shared<objects::Builtin>(&BuiltinFunc_Push)
    }
};

后续添加函数就往这里加就行,比如:

std::shared_ptr<objects::Object> BuiltinFunc_Len([[maybe_unused]] std::vector<std::shared_ptr<objects::Object>>& args)
{
    if(args.size() != 1)
    {
        return newError("wrong number of arguments. got=" + std::to_string(args.size()) + ", want=1");
    }

    if(std::shared_ptr<objects::String> obj = std::dynamic_pointer_cast<objects::String>(args[0]); obj != nullptr)
    {
        return std::make_shared<objects::Integer>(obj->Value.size());
    }
    else if(std::shared_ptr<objects::Array> obj = std::dynamic_pointer_cast<objects::Array>(args[0]); obj != nullptr)
    {
        return std::make_shared<objects::Integer>(obj->Elements.size());
    }
    else
    {
        return newError("argument to 'len' not supported, got " + args[0]->TypeStr());
    }
}

数组

数组定义为可以包括任何类型,但内容不可以改变。

词法Token新增:

const TokenType LBRACKET = "[";
const TokenType RBRACKET = "]";

//...

case '[':
    tok = newToken(token::types::LBRACKET, ch);
    break;
case ']':
    tok = newToken(token::types::RBRACKET, ch);
    break;

构建AST节点:

struct ArrayLiteral : Expression
    {
        token::Token Token; // '[' Token
        std::vector<std::shared_ptr<Expression>> Elements;

        ArrayLiteral(token::Token tok) : Token(tok) {}
        virtual ~ArrayLiteral() {}

        virtual void ExpressionNode() {}
        virtual std::string TokenLiteral() { return Token.Literal; }
        virtual std::string String() { 
            std::stringstream oss;
            std::vector<std::string> items{};
            for (auto &item : Elements)
            {
                items.push_back(item->String());
            }
            oss << "[" << Join(items, ", ") << "]";
            return oss.str();
        }
        virtual NodeType GetNodeType() { return ast::NodeType::ArrayLiteral; }
    };

解析数组AST会稍微复杂点:

pParser->registerPrefix(token::types::LBRACKET, &Parser::parseArrayLiteral);

std::shared_ptr<ast::Expression> parseArrayLiteral()
{
    std::shared_ptr<ast::ArrayLiteral> pArray = std::make_shared<ast::ArrayLiteral>(curToken);
    pArray->Elements = parseExpressionList(token::types::RBRACKET);

    return pArray;

}

std::vector<std::shared_ptr<ast::Expression>> parseExpressionList(token::TokenType tokEnd)
{
    std::vector<std::shared_ptr<ast::Expression>> list{};

    if(peekTokenIs(tokEnd))
    {
        nextToken();
        return list;
    }

    nextToken();
    list.push_back(parseExpression(Priority::LOWEST));

    while(peekTokenIs(token::types::COMMA))
    {
        nextToken();
        nextToken();
        list.push_back(parseExpression(Priority::LOWEST));
    }

    if(!expectPeek(tokEnd))
    {
        return std::vector<std::shared_ptr<ast::Expression>>{};
    }

    return list;
}

以逗号分隔的数组元素挨个读取,直到遇到]符号。

数组在内部的对象表示方法:

struct Array : Object
{
    std::vector<std::shared_ptr<Object>> Elements;

    Array(){}
    Array(std::vector<std::shared_ptr<Object>>& elements): Elements(elements){}
    virtual ~Array() {}
    virtual ObjectType Type() { return ObjectType::ARRAY; }
    virtual std::string Inspect()
    {
    	std::stringstream oss;
    	std::vector<std::string> items{};
    	for (auto &item : Elements)
    	{
    		items.push_back(item->Inspect());
    	}
    	oss << "[" << ast::Join(items, ", ") << "]";
    	return oss.str();
    }
};

// ...
else if(node->GetNodeType() == ast::NodeType::ArrayLiteral)
{
    std::shared_ptr<ast::ArrayLiteral> arrayObj = std::dynamic_pointer_cast<ast::ArrayLiteral>(node);
    auto elements = evalExpressions(arrayObj->Elements, env);
    if(elements.size() == 1 && isError(elements[0]))
    {
    	return elements[0];
    }

    return std::make_shared<objects::Array>(elements);
}

它的元素类型是可以任意类型的。

数组索引运算符表达式

<表达式>[<表达式>]

通过索引运算符来读取数组元素:

struct IndexExpression : Expression
{
    token::Token Token; // '[' Token
    std::shared_ptr<Expression> Left;
    std::shared_ptr<Expression> Index;

    // ...
    virtual std::string String() { 
        std::stringstream oss;
        oss << "(" << Left->String() << "[" << Index->String() << "])";
        return oss.str();
    }
    virtual NodeType GetNodeType() { return ast::NodeType::IndexExpression; }
};

索引运算符左边也是一个表达式,但要求必须是数组:

pParser->registerInfix(token::types::LBRACKET, &Parser::parseIndexExpression);

std::shared_ptr<ast::Expression> parseIndexExpression(std::shared_ptr<ast::Expression> left)
{
    std::shared_ptr<ast::IndexExpression> pExp = std::make_shared<ast::IndexExpression>(curToken, left);
    nextToken();
    pExp->Index = parseExpression(Priority::LOWEST);

    if(!expectPeek(token::types::RBRACKET))
    {
        return nullptr;
    }

    return pExp;
}

数组以索引求值:

else if(node->GetNodeType() == ast::NodeType::IndexExpression)
{
    std::shared_ptr<ast::IndexExpression> idxObj = std::dynamic_pointer_cast<ast::IndexExpression>(node);

    auto left = Eval(idxObj->Left, env);
    if(isError(left))
    {
    	return left;
    }

    auto index = Eval(idxObj->Index, env);
    if(isError(index))
    {
    	return index;
    }

    return evalIndexExpression(left, index);
}


//...

std::shared_ptr<objects::Object> evalArrayIndexExpression(std::shared_ptr<objects::Object> left, 
                                                          std::shared_ptr<objects::Object> index)
{
    std::shared_ptr<objects::Array> arrayObj = std::dynamic_pointer_cast<objects::Array>(left);
    auto idx = std::dynamic_pointer_cast<objects::Integer>(index)->Value;
    auto max = static_cast<int64_t>(arrayObj->Elements.size() - 1);

    if(idx < 0 || idx > max)
    {
    	return objects::NULL_OBJ;
    }

    return arrayObj->Elements[idx];
}

std::shared_ptr<objects::Object> evalIndexExpression(std::shared_ptr<objects::Object> left, 
                                                     std::shared_ptr<objects::Object> index)
{
    if(left->Type() == objects::ObjectType::ARRAY && index->Type() == objects::ObjectType::INTEGER)
    {
    	return evalArrayIndexExpression(left, index);
    } else {
    	return newError("index operator not supported: " + left->TypeStr());
    }
}

主要是做类型检查和索引下标检查,效果如下:

>> let myArray = [1, 2, 3, 4, "你好"]
>> myArray
[1, 2, 3, 4, "你好"]
>> len(myArray)
5
>> myArray[0]
1
>> myArray[4]
"你好"
>> myArray[1+3]
"你好"
>> myArray[5]
null

字典或哈希表

字典语法如下:

{<表达式>:<表达式>, <表达式>:<表达式>, ... }

每个键值对都使用冒号:来区分键和值,字典也使用索引操作符访问,只有字符串、整数、布尔值可以作为键值。

词法Token与AST节点:

const TokenType COLON = ":";

//...
case ':':
     tok = newToken(token::types::COLON, ch);
     break;

//...
struct HashLiteral : Expression
{
    token::Token Token; // '{' Token
    std::map<std::shared_ptr<Expression>, std::shared_ptr<Expression>> Pairs;

    //...
    virtual std::string String() { 
        std::stringstream oss;
        std::vector<std::string> items{};
        for (auto &[key, val] : Pairs)
        {
            items.push_back(key->String() + ":" + val->String());
        }
        oss << "{" << Join(items, ", ") << "}";
        return oss.str();
    }
    virtual NodeType GetNodeType() { return ast::NodeType::HashLiteral; }
};

解析哈希AST:

pParser->registerPrefix(token::types::LBRACE, &Parser::parseHashLiteral);

std::shared_ptr<ast::Expression> parseHashLiteral()
{
    std::shared_ptr<ast::HashLiteral> pHash = std::make_shared<ast::HashLiteral>(curToken);

    while(!peekTokenIs(token::types::RBRACE))
    {
        nextToken();
        auto key = parseExpression(Priority::LOWEST);

        if(!expectPeek(token::types::COLON))
        {
            return nullptr;
        }

        nextToken();

        auto value = parseExpression(Priority::LOWEST);

        pHash->Pairs[key] = value;

        if(!peekTokenIs(token::types::RBRACE) && !expectPeek(token::types::COMMA))
        {
            return nullptr;
        }
    }

    if(!expectPeek(token::types::RBRACE))
    {
        return nullptr;
    }

    return pHash;
}

哈希值

为了实现任何对象都可以作为哈希表的键,建立一个键的专门对象:

struct HashKey{
    ObjectType Type;
    uint64_t Value;

    HashKey(ObjectType type, uint64_t val): Type(type), Value(val){}

    bool operator==(const HashKey& rhs) const{
    	return (Type == rhs.Type && Value == rhs.Value);
    }

    bool operator!=(const HashKey& rhs) const{
    	return (Type != rhs.Type || Value != rhs.Value);
    }

    bool operator<(const HashKey &rhs) const
    {
    	return (Value < rhs.Value);
    }
};

struct Object
{
    virtual ~Object() {}
    virtual ObjectType Type() { return ObjectType::Null; }
    virtual std::string Inspect() { return ""; }

    virtual bool Hashable(){ return false; }
    virtual HashKey GetHashKey() {
    	return HashKey(Type(), 0);
    }
    //...
};

通过Hashable()来控制一个对象是否可以作为哈希表的键,同时通过统一的GetHashKey()生成键值,比如字符串求哈希值:

struct String : Object
{
    std::string Value;

    //...
    virtual bool Hashable(){ return true; }

    virtual HashKey GetHashKey() {
    	auto intVal = std::hash<std::string>{}(Value);
    	return HashKey(Type(), static_cast<uint64_t>(intVal));
    }
};

哈希表的键值对象

struct HashPair
{
    std::shared_ptr<Object> Key;
    std::shared_ptr<Object> Value;
};

struct Hash: Object
{
    std::map<HashKey, std::shared_ptr<HashPair>> Pairs;

    virtual ~Hash(){}
    virtual ObjectType Type() { return ObjectType::HASH; }
    virtual std::string Inspect() 
    { 
    	std::stringstream oss;
    	std::vector<std::string> items{};
    	for (auto &[key, pair] : Pairs)
    	{
    		items.push_back(pair->Key->Inspect() + ": " + pair->Value->Inspect());
    	}
    	oss << "{" << ast::Join(items, ", ") << "}";
    	return oss.str(); 
    }
};

通过HashPair保存了哈希表键的原始信息,为的是后面可以打印和遍历。

// ...
else if(node->GetNodeType() == ast::NodeType::HashLiteral)
{
    std::shared_ptr<ast::HashLiteral> hashObj = std::dynamic_pointer_cast<ast::HashLiteral>(node);
    return evalHashLiteral(hashObj, env);
}

std::shared_ptr<objects::Object> evalHashLiteral(std::shared_ptr<ast::HashLiteral> hashNode, 
                                                 std::shared_ptr<objects::Environment> env)
{
    std::map<objects::HashKey, std::shared_ptr<objects::HashPair>> pairs{};

    for(auto &[keyNode, valueNode]: hashNode->Pairs)
    {
    	auto key = Eval(keyNode, env);
    	if(isError(key))
    	{
    		return key;
    	}

    	if(!key->Hashable())
    	{
    		return newError("unusable as hash key: " + key->TypeStr());
    	}

    	auto value = Eval(valueNode, env);
    	if(isError(value))
    	{
    		return value;
    	}

    	auto hashed = key->GetHashKey();
    	pairs[hashed] = std::make_shared<objects::HashPair>(key, value);
    }

    return std::make_shared<objects::Hash>(pairs);
}

执行效果:

>> {"name": "Leslie", "age": 18, "male": true, 22: 9/3, true: "Mocker"}
{true: "Mocker", 22: 3, "name": "Leslie", "age": 18, "male": true}
>>

字典索引表达式

std::shared_ptr<objects::Object> evalHashIndexExpression(std::shared_ptr<objects::Object> left, 
                                                         std::shared_ptr<objects::Object> index)
{
    std::shared_ptr<objects::Hash> hashObj = std::dynamic_pointer_cast<objects::Hash>(left);

    if(!index->Hashable())
    {
    	return newError("unusable as hash key: " + index->TypeStr());
    }

    auto hashed = index->GetHashKey();

    auto fit = hashObj->Pairs.find(hashed);
    if(fit == hashObj->Pairs.end())
    {
    	return objects::NULL_OBJ;
    }

    return fit->second->Value;
}

执行效果:

>> let people = [{"name": "Leslie", "age": 24}, {"name": "羽舒", "age": 28}]
>> people
[{"name": "Leslie", "age": 24}, {"name": "羽舒", "age": 28}]
>> people[0]["name"]
"Leslie"
>> people[1]["age"]
28
>> people[1]["age"] + people[0]["age"]
52
>> let getName = fn(person){person["name"]}
>> getName(people[0])
"Leslie"
>> getName(people[1])
"羽舒"
>>

打印函数puts

>> puts("Hello World!")
"Hello World!"
null
>> let putsReturn = puts("Hello World!")
"Hello World!"
>> putsReturn
null
>>

只打印并返回null

总结

纯解释器的工作告一段落,后面是加入字节码等功能,内容偏向编译器了。