DuckDB中的Data Representation基本上由Val->Value->Vector->Data Chunk的结构实现。

Data Chunk

DataChunk是DuckDB中操作数据的逻辑单元(logical unit)。data_chunk.hpp中的注释对它的解释如下:

A Data Chunk represents a set of vectors.

The data chunk class is the intermediate representation used by the execution engine of DuckDB. It effectively represents a subset of a relation. It holds a set of vectors that all have the same length.

DataChunk类的成员变量包括data、count、capacity和vector_caches:

//! A DataChunk groups a set of Vectors together with the tuple count, the
//! capacity and the vector caches declared below. (Excerpt: data members only.)
class DataChunk {
public:
	//! The vectors owned by the DataChunk.
	vector<Vector> data;
private:
	//! The number of tuples currently stored in the data chunk
	idx_t count;
	//! The maximum number of tuples that can be stored in the data chunk
	idx_t capacity;
	//! Vector caches, used to store data when ::Initialize is called
	vector<VectorCache> vector_caches;
};

每个Vector表示一个column的数据,并且同一个DataChunk中所有Vector的长度必须相同,所以vector<Vector> data就用来存储一个至多个column的数据。

Vector

DataChunk中的vector<Vector> data依赖于vector.hpp中的Vector

根据vector.hpp中的定义,Vector的成员变量包括:

//! Excerpt of the Vector class: only the protected data members are shown,
//! the rest of the class is elided ("...").
class Vector {
...
protected:
	//! The vector type specifies how the data of the vector is physically stored
	//! i.e. if it is a single repeated constant, if it is compressed
	VectorType vector_type;
	//! The type of the elements stored in the vector (e.g. integer, float)
	LogicalType type;
	//! A raw byte pointer (data_ptr_t) to the data.
	data_ptr_t data;
	//! The validity mask of the vector
	ValidityMask validity;
	//! The main buffer holding the data of the vector
	buffer_ptr<VectorBuffer> buffer;
	//! The buffer holding auxiliary data of the vector
	//! e.g. a string vector uses this to store strings
	buffer_ptr<VectorBuffer> auxiliary;
};

其中的data_ptr_t data,本质上就是一个uint8_t *,即指向具体数据的字节指针:

//! data pointers: data_t is a single unsigned byte (uint8_t),
//! data_ptr_t is a pointer to such bytes
typedef uint8_t data_t;
typedef data_t *data_ptr_t; 

Value

前述data_ptr_t data所指向的数组,其中每个元素的内容都来自Value。比如vector.cpp中的其中一个构造方法为Vector::Vector(const Value &value),它会调用Vector::Reference(const Value &value);而从Vector::Reference所调用的Vector::SetValue中也能看出,正是通过((T *)data)[index] = val.GetValueUnsafe<T>();来完成对data这个数组的赋值。

//! Constructs a vector from a single Value: the vector adopts the value's type
//! and then delegates all remaining setup to Reference(value).
Vector::Vector(const Value &value) : type(value.type()) {
    Reference(value);
}

//! Turns this vector into a CONSTANT_VECTOR that holds the given value.
//! The vector's type id must already equal the value's type id (checked by D_ASSERT).
//! A fresh constant-vector buffer is created, then the value is stored according to
//! its internal (physical) type: STRUCT, LIST, or everything else.
void Vector::Reference(const Value &value) {
    D_ASSERT(GetType().id() == value.type().id());
    this->vector_type = VectorType::CONSTANT_VECTOR;
    buffer = VectorBuffer::CreateConstantVector(value.type());
    auto internal_type = value.type().InternalType();
    if (internal_type == PhysicalType::STRUCT) {
        // STRUCT: build one child vector per child type and keep them in the auxiliary buffer.
        // Note that `data` is not assigned in this branch.
        auto struct_buffer = make_unique<VectorStructBuffer>();
        auto &child_types = StructType::GetChildTypes(value.type());
        auto &child_vectors = struct_buffer->GetChildren();
        auto &value_children = StructValue::GetChildren(value);
        for (idx_t i = 0; i < child_types.size(); i++) {
            // for a NULL struct the child vector is built from a Value constructed from the
            // child type alone; otherwise it is built from the corresponding child value
            auto vector = make_unique<Vector>(value.IsNull() ? Value(child_types[i].second) : value_children[i]);
            child_vectors.push_back(std::move(vector));
        }
        auxiliary = std::move(struct_buffer);
        // only a NULL struct goes through SetValue here; the children of a non-NULL
        // struct were already copied into the child vectors above
        if (value.IsNull()) {
            SetValue(0, value);
        }
    } else if (internal_type == PhysicalType::LIST) {
        // LIST: the auxiliary buffer is a VectorListBuffer; `data` points at the main
        // buffer's data and the value is written to slot 0 via SetValue
        auto list_buffer = make_unique<VectorListBuffer>(value.type());
        auxiliary = std::move(list_buffer);
        data = buffer->GetData();
        SetValue(0, value);
    } else {
        // every other type: no auxiliary buffer; `data` points at the main buffer's
        // data and the value is written to slot 0 via SetValue
        auxiliary.reset();
        data = buffer->GetData();
        SetValue(0, value);
    }
}

//! Writes `val` into position `index` of this vector, dispatching on the vector's
//! internal (physical) type. This is an excerpt: parts of the function, including
//! further cases of the switch, are elided ("...").
void Vector::SetValue(idx_t index, const Value &val) {
    ...
    switch (GetType().InternalType()) {
    // fixed-size types: reinterpret `data` as an array of the matching C++ type and
    // store the value unwrapped from `val` at `index`
    case PhysicalType::BOOL:
        ((bool *)data)[index] = val.GetValueUnsafe<bool>();
        break;
    case PhysicalType::INT8:
        ((int8_t *)data)[index] = val.GetValueUnsafe<int8_t>();
        break;
    ...
    case PhysicalType::INT128:
        ((hugeint_t *)data)[index] = val.GetValueUnsafe<hugeint_t>();
        break;
    ...
    case PhysicalType::FLOAT:
        ((float *)data)[index] = val.GetValueUnsafe<float>();
        break;
    case PhysicalType::DOUBLE:
        ((double *)data)[index] = val.GetValueUnsafe<double>();
        break;
    case PhysicalType::INTERVAL:
        ((interval_t *)data)[index] = val.GetValueUnsafe<interval_t>();
        break;
    case PhysicalType::VARCHAR:
        // the string is added through StringVector::AddStringOrBlob and the string_t
        // it returns is what gets stored in the slot
        ((string_t *)data)[index] = StringVector::AddStringOrBlob(*this, StringValue::Get(val));
        break;
    case PhysicalType::STRUCT: {
        D_ASSERT(GetVectorType() == VectorType::CONSTANT_VECTOR || GetVectorType() == VectorType::FLAT_VECTOR);

        // a struct is stored column-wise: write each child value into the matching
        // child vector at the same index
        auto &children = StructVector::GetEntries(*this);
        auto &val_children = StructValue::GetChildren(val);
        D_ASSERT(val.IsNull() || children.size() == val_children.size());
        for (size_t i = 0; i < children.size(); i++) {
            auto &vec_child = children[i];
            if (!val.IsNull()) {
                auto &struct_child = val_children[i];
                vec_child->SetValue(index, struct_child);
            } else {
                // NULL struct: every child receives a default-constructed Value()
                vec_child->SetValue(index, Value());
            }
        }
        break;
    }
    case PhysicalType::LIST: {
        // remember the list size before appending: it becomes this entry's offset
        auto offset = ListVector::GetListSize(*this);
        auto &val_children = ListValue::GetChildren(val);
        if (!val_children.empty()) {
            for (idx_t i = 0; i < val_children.size(); i++) {
                ListVector::PushBack(*this, val_children[i]);
            }
        }
        //! now record the (offset, length) pair describing this list in its slot
        auto &entry = ((list_entry_t *)data)[index];
        entry.length = val_children.size();
        entry.offset = offset;
        break;
    }
    ...
}

Value的具体定义为,

//! The Value object holds a single arbitrary value of any type that can be
//! stored in the database.
//! (Excerpt: only the data members are shown, the rest is elided with "...".)
class Value {
...
private:
    //! The logical type of the value
    LogicalType type_;
...
public:
    //! Whether or not the value is NULL
    bool is_null;

    //! The value of the object, if it is of a constant size Type.
    //! This is a union, so all members below share the same storage.
    union Val {
        int8_t boolean;
        int8_t tinyint;
        int16_t smallint;
        int32_t integer;
        int64_t bigint;
        uint8_t utinyint;
        uint16_t usmallint;
        uint32_t uinteger;
        uint64_t ubigint;
        hugeint_t hugeint;
        float float_;
        double double_;
        uintptr_t pointer;
        uint64_t hash;
        date_t date;
        dtime_t time;
        timestamp_t timestamp;
        interval_t interval;
    } value_;

    //! The value of the object, if it is of a variable size type
    string str_value;

    //! Nested values; presumably the children of a STRUCT value (TODO confirm)
    vector<Value> struct_value;
    //! Nested values; presumably the children of a LIST value (TODO confirm)
    vector<Value> list_value;
};

Val

从Value的成员变量可以看出,DuckDB中的数据存储方式还是封装了不同的基础数据类型变量:一般的定长数据存放在union Val中,字符串存放在string str_value中,更复杂的struct_value和list_value则通过vector<Value>来实现。

VectorBuffer

Vector中最重要的buffer_ptr<VectorBuffer> buffer就是依赖于vector_buffer.hpp中定义的VectorBuffer

//! The VectorBuffer is a class used by the vector to hold its data
//! (Excerpt: only the protected data members are shown.)
class VectorBuffer {
...
protected:
	//! The kind of buffer this is
	VectorBufferType buffer_type;
	//! Uniquely-owned auxiliary data attached to the buffer
	unique_ptr<VectorAuxiliaryData> aux_data;
	//! Uniquely-owned array of raw bytes (data_t) holding the buffer's data
	unique_ptr<data_t[]> data;
};

这里可以看出真正的数据是由unique_ptr<data_t[]> data来维持。

VectorCache

DataChunk中除了关键的vector<Vector> data还有vector<VectorCache> vector_caches,其中VectorCache定义在vector_cache.hpp

//! The VectorCache holds cached data that allows for re-use of the same memory by vectors
class VectorCache {
public:
	//! Instantiate a vector cache with the given type and capacity
	//! (capacity defaults to STANDARD_VECTOR_SIZE)
	DUCKDB_API explicit VectorCache(Allocator &allocator, 
                                    const LogicalType &type,
	                                idx_t capacity = STANDARD_VECTOR_SIZE);

	//! The buffer held by this cache
	buffer_ptr<VectorBuffer> buffer;

public:
	//! Presumably resets `result` so that it re-uses the cached buffer; the
	//! implementation is not shown here (TODO confirm)
	void ResetFromCache(Vector &result) const;

	//! Returns a const reference to a LogicalType (presumably the type this cache was created with)
	const LogicalType &GetType() const;
};