71 changes: 50 additions & 21 deletions ggml/src/ggml-backend-meta.cpp
@@ -16,6 +16,7 @@
 #include <cstring>
 #include <map>
 #include <memory>
+#include <regex>
 #include <string>
 #include <tuple>
 #include <utility>
@@ -452,9 +453,12 @@ static struct ggml_tensor * ggml_backend_meta_buffer_simple_tensor(const struct
     return it->second[index];
 }
 
-static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(const struct ggml_tensor * tensor, bool assume_sync) {
-    const size_t n_bufs = ggml_backend_meta_buffer_n_bufs(tensor->buffer);
-    ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) tensor->buffer->context;
+static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state_impl(
+        const struct ggml_tensor * tensor, bool assume_sync, ggml_backend_meta_buffer_context * buf_ctx,
+        ggml_backend_meta_get_split_state_t get_split_state, void * get_split_state_ud) {
+    const size_t n_bufs = buf_ctx->buf_configs.size();
+    const std::regex pattern_sched_input(".*#.*#.*");
+    const bool tensor_is_sched_input = tensor->op == GGML_OP_NONE && std::regex_match(tensor->name, pattern_sched_input); // FIXME
 
     auto split_states_equal = [&](const ggml_backend_meta_split_state & a, const ggml_backend_meta_split_state & b) -> bool {
         if (a.axis != b.axis) {
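Note: the ".*#.*#.*" pattern keys on the scheduler's copy-naming scheme. ggml_backend_sched_split_graph names input copies via ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c) (see the ggml-backend.cpp hunks below), so a GGML_OP_NONE tensor whose name contains two '#' separators is treated as a scheduler input; the FIXME marks this name-based detection as a stopgap. A minimal standalone sketch of the same check (the helper name is ours, not from the PR):

#include <regex>
#include <string>

// Matches names like "CUDA0#inp_tokens#0" produced by the scheduler's
// "%s#%s#%d" (backend#tensor#copy) naming convention.
static bool is_sched_input_name(const std::string & name) {
    static const std::regex pattern(".*#.*#.*");
    return std::regex_match(name, pattern);
}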
@@ -765,9 +769,7 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co
         return {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1};
     }
     if (ggml_backend_buffer_get_usage(tensor->buffer) != GGML_BACKEND_BUFFER_USAGE_COMPUTE && tensor->view_src == nullptr) {
-        ggml_backend_dev_t dev = ggml_backend_buft_get_device(ggml_backend_buffer_get_type(tensor->buffer));
-        const ggml_backend_meta_device_context * dev_ctx = (const ggml_backend_meta_device_context *) dev->context;
-        ggml_backend_meta_split_state ret = dev_ctx->get_split_state(tensor, dev_ctx->get_split_state_ud);
+        ggml_backend_meta_split_state ret = get_split_state(tensor_is_sched_input ? tensor->src[0] : tensor, get_split_state_ud);
         if (ret.axis >= 0 && ret.axis <= GGML_MAX_DIMS) {
             const int64_t granularity = ret.axis == GGML_BACKEND_SPLIT_AXIS_0 ? ggml_blck_size(tensor->type) : 1;
             int64_t ne_sum = 0;
@@ -786,13 +788,17 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co
             src_ss[i] = {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1};
             continue;
         }
-        src_ss[i] = ggml_backend_meta_get_split_state(tensor->src[i], /*assume_sync =*/ true);
+        src_ss[i] = ggml_backend_meta_get_split_state_impl(tensor->src[i], /*assume_sync =*/ true, buf_ctx, get_split_state, get_split_state_ud);
         GGML_ASSERT(src_ss[i].axis != GGML_BACKEND_SPLIT_AXIS_UNKNOWN);
     }
 
     ggml_backend_meta_split_state split_state;
     switch (tensor->op) {
         case GGML_OP_NONE: {
+            if (tensor_is_sched_input) {
+                split_state = src_ss[0];
+                break;
+            }
             split_state = {GGML_BACKEND_SPLIT_AXIS_MIRRORED, {0}, 1};
         } break;
         case GGML_OP_DUP: {
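Note: with the new branch above, a scheduler input copy no longer falls through to GGML_BACKEND_SPLIT_AXIS_MIRRORED; it inherits the split state of the tensor it was copied from through the src[0] back-pointer added in ggml-backend.cpp below. A sketch of the resulting behaviour (tensor and backend names are illustrative):

// "inp" lives on a meta buffer, split along axis 1 across two devices.
// The scheduler creates an input copy and (with this PR) links it back:
//   copy->op     == GGML_OP_NONE
//   copy->name   == "CUDA0#inp#0"  // matches pattern_sched_input
//   copy->src[0] == inp
// ggml_backend_meta_get_split_state(copy, ...) now reports the split
// state of "inp" instead of GGML_BACKEND_SPLIT_AXIS_MIRRORED.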
@@ -985,7 +991,6 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co
     }
     if (split_state.axis >= 0 && split_state.axis < GGML_MAX_DIMS) {
         bool first_src_split_by_axis = true;
-        const size_t n_bufs = ggml_backend_meta_buffer_n_bufs(tensor->buffer);
 
         for (size_t i = 0; i < GGML_MAX_SRC; i++) {
             if (tensor->src[i] == nullptr || src_ss[i].axis < 0 || src_ss[i].axis >= GGML_MAX_DIMS) {
@@ -1043,7 +1048,8 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co
             if (!srcs_info.empty()) {
                 srcs_info += ", ";
             }
-            const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(tensor->src[0], true);
+            const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state_impl(
+                tensor->src[0], true, buf_ctx, get_split_state, get_split_state_ud);
             const char * axis_name = ggml_backend_meta_split_axis_name(split_state.axis);
             std::string ne_info;
             for (size_t j = 0; j < n_bufs; j++) {
@@ -1080,6 +1086,18 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co
     return ret;
 }
 
+static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(const struct ggml_tensor * tensor, bool assume_sync) {
+    GGML_ASSERT(ggml_backend_buffer_is_meta(tensor->buffer));
+    ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) tensor->buffer->context;
+    ggml_backend_dev_t dev = ggml_backend_buft_get_device(ggml_backend_buffer_get_type(tensor->buffer));
+    const ggml_backend_meta_device_context * dev_ctx = (const ggml_backend_meta_device_context *) dev->context;
+
+    // The implementation called below is recursive and works for any tensor shape.
+    // However, some of the recursively visited source tensors may not be on a meta buffer,
+    // so the split-state callback needs to be retrieved once from the first tensor.
+    return ggml_backend_meta_get_split_state_impl(tensor, assume_sync, buf_ctx, dev_ctx->get_split_state, dev_ctx->get_split_state_ud);
+}
+
 static void * ggml_backend_meta_buffer_get_base(ggml_backend_buffer_t buffer) {
     GGML_UNUSED(buffer);
     return (void *) 0x1000000000000000; // FIXME
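Note: the wrapper resolves buf_ctx and the device's split-state callback once at the entry tensor and threads them through the recursion, because recursively visited sources may live on non-meta buffers whose buffer->context is not a meta context. The callback type itself is not shown in this diff; judging from the call sites it presumably has a shape like this sketch (the typedef is reconstructed, not quoted from the PR):

// Assumed signature of the callback threaded through the recursion:
typedef struct ggml_backend_meta_split_state (*ggml_backend_meta_get_split_state_t)(
    const struct ggml_tensor * tensor, void * userdata);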
@@ -1178,9 +1196,20 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer
     return GGML_STATUS_SUCCESS;
 }
 
+static size_t get_next_highest_nb(const ggml_tensor * tensor, const int axis) {
+    const size_t nb_axis = tensor->nb[axis];
+    size_t next_highest_nb = ggml_nbytes(tensor);
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        if (tensor->nb[i] > nb_axis && tensor->nb[i] < next_highest_nb) {
+            next_highest_nb = tensor->nb[i];
+        }
+    }
+    return next_highest_nb;
+}
+
 static void ggml_backend_meta_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     const size_t n_bufs = ggml_backend_meta_buffer_n_bufs(buffer);
-    GGML_ASSERT(ggml_is_contiguous(tensor));
+    GGML_ASSERT(ggml_is_contiguously_allocated(tensor));
 
     const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(tensor, /*assume_sync =*/ false);
 
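Note: get_next_highest_nb generalizes the former tensor->nb[axis + 1] lookup. It returns the smallest stride strictly greater than nb[axis] and falls back to ggml_nbytes(tensor) when no larger stride exists, which also covers permuted layouts where the strides are not in ascending order. A worked example (values assume an unpadded F32 tensor):

// Contiguous F32 tensor, ne = {4, 3, 2, 1} => nb = {4, 16, 48, 96}, ggml_nbytes == 96:
//   get_next_highest_nb(t, 1) == 48   // same as the old nb[axis + 1]
//   get_next_highest_nb(t, 3) == 96   // no larger stride -> total size
// Permuted layout, nb = {16, 4, 48, 96}: the next-highest stride after
// nb[1] == 4 is nb[0] == 16, which the old nb[axis + 1] lookup would miss.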
@@ -1228,15 +1257,15 @@ static void ggml_backend_meta_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
         case GGML_BACKEND_SPLIT_AXIS_1:
         case GGML_BACKEND_SPLIT_AXIS_2: {
             // Exploit that the tensor is contiguously allocated to splice it with the simple tensors in "chunks".
-            const size_t chunk_size_full = tensor->nb[split_state.axis + 1];
+            const size_t chunk_size_full = get_next_highest_nb(tensor, split_state.axis);
             GGML_ASSERT(offset % chunk_size_full == 0);
             GGML_ASSERT(size % chunk_size_full == 0);
             const int64_t i_start = offset /chunk_size_full;
             const int64_t i_stop = (offset + size)/chunk_size_full;
             size_t offset_j = 0;
             for (size_t j = 0; j < n_bufs; j++) {
                 ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
-                const size_t chunk_size_j = simple_tensor->nb[split_state.axis + 1];
+                const size_t chunk_size_j = get_next_highest_nb(simple_tensor, split_state.axis);
                 const size_t simple_offset = i_start * chunk_size_j;
                 ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_j, simple_offset, chunk_size_j, i_stop - i_start, chunk_size_j, chunk_size_full);
                 offset_j += chunk_size_j;
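Note: the splice walks whole slices along the split axis and hands each buffer its share. A worked example, assuming two buffers that split the axis 3:1 so that chunk_size_full == 64 bytes, with chunk_size_j == 48 for buffer 0 and 16 for buffer 1:

// For each slice i in [i_start, i_stop):
//   host bytes [i*64,      i*64 + 48) -> buffer 0 at offset i*48
//   host bytes [i*64 + 48, i*64 + 64) -> buffer 1 at offset i*16
// offset_j tracks where each buffer's share begins within one full slice.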
@@ -1270,7 +1299,7 @@ static void ggml_backend_meta_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
 
 static void ggml_backend_meta_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     const size_t n_bufs = ggml_backend_meta_buffer_n_bufs(buffer);
-    GGML_ASSERT(ggml_is_contiguous(tensor));
+    GGML_ASSERT(ggml_is_contiguously_allocated(tensor));
 
     const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(tensor, /*assume_sync =*/ false);
     GGML_ASSERT(split_state.n_segments == 1);
@@ -1280,15 +1309,15 @@ static void ggml_backend_meta_buffer_get_tensor(ggml_backend_buffer_t buffer, co
         case GGML_BACKEND_SPLIT_AXIS_1:
         case GGML_BACKEND_SPLIT_AXIS_2: {
             // Exploit that the tensor is contiguously allocated to splice it with the simple tensors in "chunks".
-            const size_t chunk_size_full = tensor->nb[split_state.axis + 1];
+            const size_t chunk_size_full = get_next_highest_nb(tensor, split_state.axis);
             GGML_ASSERT(offset % chunk_size_full == 0);
             GGML_ASSERT(size % chunk_size_full == 0);
             const int64_t i_start = offset /chunk_size_full;
             const int64_t i_stop = (offset + size)/chunk_size_full;
             size_t offset_j = 0;
             for (size_t j = 0; j < n_bufs; j++){
                 const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
-                const size_t chunk_size_j = simple_tensor->nb[split_state.axis + 1];
+                const size_t chunk_size_j = get_next_highest_nb(simple_tensor, split_state.axis);
                 const size_t simple_offset = i_start * chunk_size_j;
                 ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_j, simple_offset, chunk_size_j, i_stop - i_start, chunk_size_j, chunk_size_full);
                 offset_j += chunk_size_j;
@@ -1464,7 +1493,7 @@ static void ggml_backend_meta_free(ggml_backend_t backend) {
 static void ggml_backend_meta_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     const size_t n_backends = ggml_backend_meta_n_backends(backend);
     GGML_ASSERT(offset == 0);
-    GGML_ASSERT(ggml_is_contiguous(tensor));
+    GGML_ASSERT(ggml_is_contiguously_allocated(tensor));
 
     const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(tensor, /*assume_sync =*/ false);
     GGML_ASSERT(split_state.n_segments == 1);
@@ -1474,7 +1503,7 @@ static void ggml_backend_meta_set_tensor_async(ggml_backend_t backend, ggml_tens
         case GGML_BACKEND_SPLIT_AXIS_1:
         case GGML_BACKEND_SPLIT_AXIS_2: {
             // Exploit that the tensor is contiguously allocated to splice it with the simple tensors in "chunks".
-            const size_t chunk_size_full = tensor->nb[split_state.axis + 1];
+            const size_t chunk_size_full = get_next_highest_nb(tensor, split_state.axis);
             GGML_ASSERT(offset % chunk_size_full == 0);
             GGML_ASSERT(size % chunk_size_full == 0);
             const int64_t i_start = offset /chunk_size_full;
@@ -1483,7 +1512,7 @@ static void ggml_backend_meta_set_tensor_async(ggml_backend_t backend, ggml_tens
             for (size_t j = 0; j < n_backends; j++){
                 ggml_backend_t simple_backend = ggml_backend_meta_simple_backend(backend, j);
                 ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
-                const size_t chunk_size_j = simple_tensor->nb[split_state.axis + 1];
+                const size_t chunk_size_j = get_next_highest_nb(simple_tensor, split_state.axis);
                 ggml_backend_tensor_set_2d_async(simple_backend, simple_tensor, (const char *) data + offset_j, offset, chunk_size_j,
                     i_stop - i_start, chunk_size_j, chunk_size_full);
                 offset_j += chunk_size_j;
@@ -1505,7 +1534,7 @@ static void ggml_backend_meta_set_tensor_async(ggml_backend_t backend, ggml_tens
 static void ggml_backend_meta_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     const size_t n_backends = ggml_backend_meta_n_backends(backend);
     GGML_ASSERT(offset == 0);
-    GGML_ASSERT(ggml_is_contiguous(tensor));
+    GGML_ASSERT(ggml_is_contiguously_allocated(tensor));
 
     const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(tensor, /*assume_sync =*/ false);
     GGML_ASSERT(split_state.n_segments == 1);
@@ -1515,7 +1544,7 @@ static void ggml_backend_meta_get_tensor_async(ggml_backend_t backend, const ggm
         case GGML_BACKEND_SPLIT_AXIS_1:
         case GGML_BACKEND_SPLIT_AXIS_2: {
             // Exploit that the tensor is contiguously allocated to splice it with the simple tensors in "chunks".
-            const size_t chunk_size_full = tensor->nb[split_state.axis + 1];
+            const size_t chunk_size_full = get_next_highest_nb(tensor, split_state.axis);
             GGML_ASSERT(offset % chunk_size_full == 0);
             GGML_ASSERT(size % chunk_size_full == 0);
             const int64_t i_start = offset /chunk_size_full;
@@ -1524,7 +1553,7 @@ static void ggml_backend_meta_get_tensor_async(ggml_backend_t backend, const ggm
             for (size_t j = 0; j < n_backends; j++){
                 ggml_backend_t simple_backend = ggml_backend_meta_simple_backend(backend, j);
                 const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
-                const size_t chunk_size_j = simple_tensor->nb[split_state.axis + 1];
+                const size_t chunk_size_j = get_next_highest_nb(simple_tensor, split_state.axis);
                 ggml_backend_tensor_get_2d_async(simple_backend, simple_tensor, (char *) data + offset_j, offset, chunk_size_j,
                     i_stop - i_start, chunk_size_j, chunk_size_full);
                 offset_j += chunk_size_j;
2 changes: 2 additions & 0 deletions ggml/src/ggml-backend.cpp
@@ -1334,6 +1334,7 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
                         tensor_copy = src; // use the original tensor as the current copy
                     } else {
                         tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
+                        tensor_copy->src[0] = src;
                         ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
                     }
                     ggml_set_input(tensor_copy);
@@ -1353,6 +1354,7 @@
                 ggml_backend_t backend = sched->backends[cur_backend_id];
                 for (int c = 0; c < sched->n_copies; c++) {
                     struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
+                    tensor_copy->src[0] = src;
                     ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
                     if (sched->n_copies > 1) {
                         ggml_set_input(tensor_copy);
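Note: these two one-line additions give scheduler-created input copies a back-pointer to their source tensor; together with the "%s#%s#%d" name they are what the meta backend's pattern_sched_input handling above relies on. The fields on a resulting copy look like this (backend name illustrative):

//   src->name           == "inp_embd"
//   tensor_copy->src[0] == src                 // added by this PR
//   tensor_copy->name   == "CUDA0#inp_embd#0"  // "%s#%s#%d": backend, name, copy index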