DuckDB--Data Representation
DuckDB中的Data Representation基本上由Val->Value->Vector->Data Chunk的结构实现。
Data Chunk
DataChunk
是DuckDB中操作数据的logical unit。根据data_chunk.hpp
中的解释
A Data Chunk represents a set of vectors.
The data chunk class is the intermediate representation used by the execution engine of DuckDB. It effectively represents a subset of a relation. It holds a set of vectors that all have the same length.
DataChunk
类的成员变量包括data
,count
,capacity
,vector_caches
。
//! Excerpt of DataChunk: a set of vectors plus tuple-count bookkeeping.
//! Only the member variables are shown here.
class DataChunk {
public:
//! The vectors owned by the DataChunk.
vector<Vector> data;
private:
//! The amount of tuples stored in the data chunk
idx_t count;
//! The amount of tuples that can be stored in the data chunk
idx_t capacity;
//! Vector caches, used to store data when ::Initialize is called
//! (element type is VectorCache; presumably one cache per vector in `data` -- confirm)
vector<VectorCache> vector_caches;
};
每个Vector
长度必须相同,表示一个Column的data,所以vector<Vector> data
就用来存储一个至多个column的数据。
Vector
DataChunk
中的vector<Vector> data
依赖于vector.hpp
中的Vector
。
Vector的成员变量包括:
//! Excerpt of the Vector class; the public interface is elided (`...`) and only
//! the protected member variables are shown.
class Vector {
...
protected:
//! The vector type specifies how the data of the vector is physically stored
//! i.e. if it is a single repeated constant, if it is compressed
VectorType vector_type;
//! The type of the elements stored in the vector (e.g. integer, float)
LogicalType type;
//! A pointer to the data.
//! Declared as a byte pointer (data_ptr_t); callers cast it to the element type.
data_ptr_t data;
//! The validity mask of the vector
ValidityMask validity;
//! The main buffer holding the data of the vector
buffer_ptr<VectorBuffer> buffer;
//! The buffer holding auxiliary data of the vector
//! e.g. a string vector uses this to store strings
buffer_ptr<VectorBuffer> auxiliary;
};
其中的data_ptr_t data
,其本质就是一个uint8_t
类型的指向具体数据的指针
//! data pointers
//! data_t is an alias for a single byte (uint8_t);
//! data_ptr_t is a pointer to data_t, i.e. a raw byte pointer.
typedef uint8_t data_t;
typedef data_t *data_ptr_t;
Value
前述data_ptr_t data
,其所指向的数组中存放的具体数据来自Value
,比如vector.cpp
中的其中一个构造方法为Vector::Vector(const Value &value)
,并且从Vector::Reference(const Value &value)
中的Vector::SetValue
也能看出通过((T *)data)[index] = val.GetValueUnsafe<T>();
来完成对data
这个数组的赋值。
//! Constructs a vector from a single Value: the vector's type is initialised
//! from value.type(), and the rest of the setup is delegated to Reference(value).
Vector::Vector(const Value &value) : type(value.type()) {
Reference(value);
}
//! Makes this vector refer to a single Value.
//! Asserts that the vector's type id equals the value's type id, marks the vector
//! as a CONSTANT_VECTOR, creates a constant-vector buffer for the value's type and
//! then fills in the value depending on its physical (internal) type.
void Vector::Reference(const Value &value) {
D_ASSERT(GetType().id() == value.type().id());
this->vector_type = VectorType::CONSTANT_VECTOR;
buffer = VectorBuffer::CreateConstantVector(value.type());
auto internal_type = value.type().InternalType();
if (internal_type == PhysicalType::STRUCT) {
// STRUCT: build one child vector per struct field and store them in a
// VectorStructBuffer, which becomes the auxiliary buffer.
auto struct_buffer = make_unique<VectorStructBuffer>();
auto &child_types = StructType::GetChildTypes(value.type());
auto &child_vectors = struct_buffer->GetChildren();
auto &value_children = StructValue::GetChildren(value);
for (idx_t i = 0; i < child_types.size(); i++) {
// If the struct value is NULL, the child is built from a Value constructed
// from the child's type (presumably a NULL of that type -- confirm);
// otherwise it is built from the corresponding child value.
auto vector = make_unique<Vector>(value.IsNull() ? Value(child_types[i].second) : value_children[i]);
child_vectors.push_back(std::move(vector));
}
auxiliary = std::move(struct_buffer);
// Note: `data` is not assigned in this branch, and SetValue is only
// called when the struct value itself is NULL.
if (value.IsNull()) {
SetValue(0, value);
}
} else if (internal_type == PhysicalType::LIST) {
// LIST: the auxiliary buffer is a VectorListBuffer for the value's type;
// `data` points at the main buffer's data and slot 0 is filled via SetValue.
auto list_buffer = make_unique<VectorListBuffer>(value.type());
auxiliary = std::move(list_buffer);
data = buffer->GetData();
SetValue(0, value);
} else {
// All other physical types: no auxiliary buffer; slot 0 of the main
// buffer is filled via SetValue.
auxiliary.reset();
data = buffer->GetData();
SetValue(0, value);
}
}
//! Writes `val` into position `index` of this vector, dispatching on the
//! vector's physical (internal) type.
//! NOTE: this is an excerpt -- the `...` lines stand for code that is elided
//! (the preamble, further cases, and the end of the function).
void Vector::SetValue(idx_t index, const Value &val) {
...
switch (GetType().InternalType()) {
// Fixed-width cases: reinterpret `data` as an array of the C++ type and
// store the value obtained through GetValueUnsafe<T>() at `index`.
case PhysicalType::BOOL:
((bool *)data)[index] = val.GetValueUnsafe<bool>();
break;
case PhysicalType::INT8:
((int8_t *)data)[index] = val.GetValueUnsafe<int8_t>();
break;
...
case PhysicalType::INT128:
((hugeint_t *)data)[index] = val.GetValueUnsafe<hugeint_t>();
break;
...
case PhysicalType::FLOAT:
((float *)data)[index] = val.GetValueUnsafe<float>();
break;
case PhysicalType::DOUBLE:
((double *)data)[index] = val.GetValueUnsafe<double>();
break;
case PhysicalType::INTERVAL:
((interval_t *)data)[index] = val.GetValueUnsafe<interval_t>();
break;
case PhysicalType::VARCHAR:
// Strings: the string is added through StringVector::AddStringOrBlob and
// the string_t it returns is what gets stored in `data`.
((string_t *)data)[index] = StringVector::AddStringOrBlob(*this, StringValue::Get(val));
break;
case PhysicalType::STRUCT: {
// Structs: the value is set field by field on the child vectors.
D_ASSERT(GetVectorType() == VectorType::CONSTANT_VECTOR || GetVectorType() == VectorType::FLAT_VECTOR);
auto &children = StructVector::GetEntries(*this);
auto &val_children = StructValue::GetChildren(val);
D_ASSERT(val.IsNull() || children.size() == val_children.size());
for (size_t i = 0; i < children.size(); i++) {
auto &vec_child = children[i];
if (!val.IsNull()) {
auto &struct_child = val_children[i];
vec_child->SetValue(index, struct_child);
} else {
// NULL struct: every child receives a default-constructed Value.
vec_child->SetValue(index, Value());
}
}
break;
}
case PhysicalType::LIST: {
// Lists: remember the current list size as the offset, append the child
// values, then record (offset, length) in the list_entry_t at `index`.
auto offset = ListVector::GetListSize(*this);
auto &val_children = ListValue::GetChildren(val);
if (!val_children.empty()) {
for (idx_t i = 0; i < val_children.size(); i++) {
ListVector::PushBack(*this, val_children[i]);
}
}
//! now set the pointer
auto &entry = ((list_entry_t *)data)[index];
entry.length = val_children.size();
entry.offset = offset;
break;
}
...
}
Value
的具体定义为:
//! The Value object holds a single arbitrary value of any type that can be
//! stored in the database.
//! (Excerpt: the `...` lines stand for members that are elided.)
class Value {
...
private:
//! The logical type of the value
LogicalType type_;
...
public:
//! Whether or not the value is NULL
bool is_null;
//! The value of the object, if it is of a constant size Type
//! All fixed-size representations share storage in this union;
//! note that `boolean` is declared as int8_t.
union Val {
int8_t boolean;
int8_t tinyint;
int16_t smallint;
int32_t integer;
int64_t bigint;
uint8_t utinyint;
uint16_t usmallint;
uint32_t uinteger;
uint64_t ubigint;
hugeint_t hugeint;
float float_;
double double_;
uintptr_t pointer;
uint64_t hash;
date_t date;
dtime_t time;
timestamp_t timestamp;
interval_t interval;
} value_;
//! The value of the object, if it is of a variable size type
//! (a string, or nested values held as vectors of Value)
string str_value;
vector<Value> struct_value;
vector<Value> list_value;
};
Val
从Value
的成员变量可以看出,DuckDB中的数据存储方式还是封装了不同的基础数据类型变量,一般的定长数据就是union Val
,字符串就是string str_value
,再复杂的struct_value
和list_value
就是通过vector<Value>
来实现了。
VectorBuffer
Vector
中最重要的buffer_ptr<VectorBuffer> buffer
就是依赖于vector_buffer.hpp
中定义的VectorBuffer
。
//! The VectorBuffer is a class used by the vector to hold its data
//! (Excerpt: the `...` line stands for the elided interface.)
class VectorBuffer {
...
protected:
//! Tag identifying the kind of buffer (presumably used to distinguish subclasses -- confirm)
VectorBufferType buffer_type;
//! Optional auxiliary data owned by the buffer
unique_ptr<VectorAuxiliaryData> aux_data;
//! The owned array of raw bytes (data_t) backing the buffer
unique_ptr<data_t[]> data;
};
这里可以看出真正的数据是由unique_ptr<data_t[]> data
来持有。
VectorCache
DataChunk
中除了关键的vector<Vector> data
还有vector<VectorCache> vector_caches
,其中VectorCache
定义在vector_cache.hpp
。
//! The VectorCache holds cached data that allows for re-use of the same memory by vectors
class VectorCache {
public:
//! Instantiate a vector cache with the given type and capacity
//! `capacity` defaults to STANDARD_VECTOR_SIZE when not supplied.
DUCKDB_API explicit VectorCache(Allocator &allocator,
const LogicalType &type,
idx_t capacity = STANDARD_VECTOR_SIZE);
//! The buffer held by the cache
buffer_ptr<VectorBuffer> buffer;
public:
//! Const operation taking the target vector by reference
//! (presumably points `result` back at the cached buffer -- confirm against vector_cache.cpp)
void ResetFromCache(Vector &result) const;
//! Returns a const reference to a LogicalType (presumably the type the cache was created with -- confirm)
const LogicalType &GetType() const;
};