diff --git a/ggml/src/ggml-backend-meta.cpp b/ggml/src/ggml-backend-meta.cpp
index a2ab8872c4a..5ce8228bc96 100644
--- a/ggml/src/ggml-backend-meta.cpp
+++ b/ggml/src/ggml-backend-meta.cpp
@@ -16,6 +16,7 @@
 #include
 #include
 #include
+#include <regex>
 #include
 #include
 #include
@@ -452,9 +453,12 @@ static struct ggml_tensor * ggml_backend_meta_buffer_simple_tensor(const struct
     return it->second[index];
 }
 
-static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(const struct ggml_tensor * tensor, bool assume_sync) {
-    const size_t n_bufs = ggml_backend_meta_buffer_n_bufs(tensor->buffer);
-    ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) tensor->buffer->context;
+static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state_impl(
+        const struct ggml_tensor * tensor, bool assume_sync, ggml_backend_meta_buffer_context * buf_ctx,
+        ggml_backend_meta_get_split_state_t get_split_state, void * get_split_state_ud) {
+    const size_t n_bufs = buf_ctx->buf_configs.size();
+    const std::regex pattern_sched_input(".*#.*#.*");
+    const bool tensor_is_sched_input = tensor->op == GGML_OP_NONE && std::regex_match(tensor->name, pattern_sched_input); // FIXME
 
     auto split_states_equal = [&](const ggml_backend_meta_split_state & a, const ggml_backend_meta_split_state & b) -> bool {
         if (a.axis != b.axis) {
@@ -765,9 +769,7 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co
         return {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1};
     }
     if (ggml_backend_buffer_get_usage(tensor->buffer) != GGML_BACKEND_BUFFER_USAGE_COMPUTE && tensor->view_src == nullptr) {
-        ggml_backend_dev_t dev = ggml_backend_buft_get_device(ggml_backend_buffer_get_type(tensor->buffer));
-        const ggml_backend_meta_device_context * dev_ctx = (const ggml_backend_meta_device_context *) dev->context;
-        ggml_backend_meta_split_state ret = dev_ctx->get_split_state(tensor, dev_ctx->get_split_state_ud);
+        ggml_backend_meta_split_state ret = get_split_state(tensor_is_sched_input ? tensor->src[0] : tensor, get_split_state_ud);
         if (ret.axis >= 0 && ret.axis <= GGML_MAX_DIMS) {
             const int64_t granularity = ret.axis == GGML_BACKEND_SPLIT_AXIS_0 ? ggml_blck_size(tensor->type) : 1;
             int64_t ne_sum = 0;
@@ -786,13 +788,17 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co
             src_ss[i] = {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1};
             continue;
         }
-        src_ss[i] = ggml_backend_meta_get_split_state(tensor->src[i], /*assume_sync =*/ true);
+        src_ss[i] = ggml_backend_meta_get_split_state_impl(tensor->src[i], /*assume_sync =*/ true, buf_ctx, get_split_state, get_split_state_ud);
         GGML_ASSERT(src_ss[i].axis != GGML_BACKEND_SPLIT_AXIS_UNKNOWN);
     }
 
     ggml_backend_meta_split_state split_state;
     switch (tensor->op) {
         case GGML_OP_NONE: {
+            if (tensor_is_sched_input) {
+                split_state = src_ss[0];
+                break;
+            }
             split_state = {GGML_BACKEND_SPLIT_AXIS_MIRRORED, {0}, 1};
         } break;
         case GGML_OP_DUP: {
@@ -985,7 +991,6 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co
     }
     if (split_state.axis >= 0 && split_state.axis < GGML_MAX_DIMS) {
         bool first_src_split_by_axis = true;
-        const size_t n_bufs = ggml_backend_meta_buffer_n_bufs(tensor->buffer);
 
         for (size_t i = 0; i < GGML_MAX_SRC; i++) {
             if (tensor->src[i] == nullptr || src_ss[i].axis < 0 || src_ss[i].axis >= GGML_MAX_DIMS) {
@@ -1043,7 +1048,8 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co
             if (!srcs_info.empty()) {
                 srcs_info += ", ";
             }
-            const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(tensor->src[0], true);
+            const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state_impl(
+                tensor->src[0], true, buf_ctx, get_split_state, get_split_state_ud);
             const char * axis_name = ggml_backend_meta_split_axis_name(split_state.axis);
             std::string ne_info;
             for (size_t j = 0; j < n_bufs; j++) {
@@ -1080,6 +1086,18 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co
     return ret;
 }
 
+static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(const struct ggml_tensor * tensor, bool assume_sync) {
+    GGML_ASSERT(ggml_backend_buffer_is_meta(tensor->buffer));
+    ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) tensor->buffer->context;
+    ggml_backend_dev_t dev = ggml_backend_buft_get_device(ggml_backend_buffer_get_type(tensor->buffer));
+    const ggml_backend_meta_device_context * dev_ctx = (const ggml_backend_meta_device_context *) dev->context;
+
+    // The function below is recursive and works for any tensor shapes.
+    // However, some of the (recursively found) source tensors may not be on a meta buffer
+    // so the corresponding functionality needs to be retrieved from the first tensor.
+    return ggml_backend_meta_get_split_state_impl(tensor, assume_sync, buf_ctx, dev_ctx->get_split_state, dev_ctx->get_split_state_ud);
+}
+
 static void * ggml_backend_meta_buffer_get_base(ggml_backend_buffer_t buffer) {
     GGML_UNUSED(buffer);
 
     return (void *) 0x1000000000000000; // FIXME
@@ -1178,9 +1196,20 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer
     return GGML_STATUS_SUCCESS;
 }
 
+static size_t get_next_highest_nb(const ggml_tensor * tensor, const int axis) {
+    const size_t nb_axis = tensor->nb[axis];
+    size_t next_highest_nb = ggml_nbytes(tensor);
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        if (tensor->nb[i] > nb_axis && tensor->nb[i] < next_highest_nb) {
+            next_highest_nb = tensor->nb[i];
+        }
+    }
+    return next_highest_nb;
+}
+
 static void ggml_backend_meta_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     const size_t n_bufs = ggml_backend_meta_buffer_n_bufs(buffer);
-    GGML_ASSERT(ggml_is_contiguous(tensor));
+    GGML_ASSERT(ggml_is_contiguously_allocated(tensor));
 
     const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(tensor, /*assume_sync =*/ false);
@@ -1228,7 +1257,7 @@ static void ggml_backend_meta_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
         case GGML_BACKEND_SPLIT_AXIS_1:
         case GGML_BACKEND_SPLIT_AXIS_2: {
             // Exploit that tensors are contiguous to splice it with simple tensors as "chunks".
-            const size_t chunk_size_full = tensor->nb[split_state.axis + 1];
+            const size_t chunk_size_full = get_next_highest_nb(tensor, split_state.axis);
             GGML_ASSERT(offset % chunk_size_full == 0);
             GGML_ASSERT(size % chunk_size_full == 0);
             const int64_t i_start = offset /chunk_size_full;
@@ -1236,7 +1265,7 @@ static void ggml_backend_meta_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
             size_t offset_j = 0;
             for (size_t j = 0; j < n_bufs; j++) {
                 ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
-                const size_t chunk_size_j = simple_tensor->nb[split_state.axis + 1];
+                const size_t chunk_size_j = get_next_highest_nb(simple_tensor, split_state.axis);
                 const size_t simple_offset = i_start * chunk_size_j;
                 ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_j, simple_offset, chunk_size_j, i_stop - i_start, chunk_size_j, chunk_size_full);
                 offset_j += chunk_size_j;
@@ -1270,7 +1299,7 @@ static void ggml_backend_meta_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
 
 static void ggml_backend_meta_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     const size_t n_bufs = ggml_backend_meta_buffer_n_bufs(buffer);
-    GGML_ASSERT(ggml_is_contiguous(tensor));
+    GGML_ASSERT(ggml_is_contiguously_allocated(tensor));
 
     const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(tensor, /*assume_sync =*/ false);
     GGML_ASSERT(split_state.n_segments == 1);
@@ -1280,7 +1309,7 @@ static void ggml_backend_meta_buffer_get_tensor(ggml_backend_buffer_t buffer, co
         case GGML_BACKEND_SPLIT_AXIS_1:
         case GGML_BACKEND_SPLIT_AXIS_2: {
             // Exploit that tensors are contiguous to splice it with simple tensors as "chunks".
-            const size_t chunk_size_full = tensor->nb[split_state.axis + 1];
+            const size_t chunk_size_full = get_next_highest_nb(tensor, split_state.axis);
             GGML_ASSERT(offset % chunk_size_full == 0);
             GGML_ASSERT(size % chunk_size_full == 0);
             const int64_t i_start = offset /chunk_size_full;
@@ -1288,7 +1317,7 @@ static void ggml_backend_meta_buffer_get_tensor(ggml_backend_buffer_t buffer, co
             size_t offset_j = 0;
             for (size_t j = 0; j < n_bufs; j++){
                 const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
-                const size_t chunk_size_j = simple_tensor->nb[split_state.axis + 1];
+                const size_t chunk_size_j = get_next_highest_nb(simple_tensor, split_state.axis);
                 const size_t simple_offset = i_start * chunk_size_j;
                 ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_j, simple_offset, chunk_size_j, i_stop - i_start, chunk_size_j, chunk_size_full);
                 offset_j += chunk_size_j;
@@ -1464,7 +1493,7 @@ static void ggml_backend_meta_free(ggml_backend_t backend) {
 
 static void ggml_backend_meta_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     const size_t n_backends = ggml_backend_meta_n_backends(backend);
     GGML_ASSERT(offset == 0);
-    GGML_ASSERT(ggml_is_contiguous(tensor));
+    GGML_ASSERT(ggml_is_contiguously_allocated(tensor));
 
     const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(tensor, /*assume_sync =*/ false);
     GGML_ASSERT(split_state.n_segments == 1);
@@ -1474,7 +1503,7 @@ static void ggml_backend_meta_set_tensor_async(ggml_backend_t backend, ggml_tens
         case GGML_BACKEND_SPLIT_AXIS_1:
         case GGML_BACKEND_SPLIT_AXIS_2: {
             // Exploit that tensors are contiguous to splice it with simple tensors as "chunks".
-            const size_t chunk_size_full = tensor->nb[split_state.axis + 1];
+            const size_t chunk_size_full = get_next_highest_nb(tensor, split_state.axis);
             GGML_ASSERT(offset % chunk_size_full == 0);
             GGML_ASSERT(size % chunk_size_full == 0);
             const int64_t i_start = offset /chunk_size_full;
@@ -1483,7 +1512,7 @@ static void ggml_backend_meta_set_tensor_async(ggml_backend_t backend, ggml_tens
             for (size_t j = 0; j < n_backends; j++){
                 ggml_backend_t simple_backend = ggml_backend_meta_simple_backend(backend, j);
                 ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
-                const size_t chunk_size_j = simple_tensor->nb[split_state.axis + 1];
+                const size_t chunk_size_j = get_next_highest_nb(simple_tensor, split_state.axis);
                 ggml_backend_tensor_set_2d_async(simple_backend, simple_tensor, (const char *) data + offset_j, offset, chunk_size_j, i_stop - i_start, chunk_size_j, chunk_size_full);
                 offset_j += chunk_size_j;
@@ -1505,7 +1534,7 @@ static void ggml_backend_meta_set_tensor_async(ggml_backend_t backend, ggml_tens
 
 static void ggml_backend_meta_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     const size_t n_backends = ggml_backend_meta_n_backends(backend);
     GGML_ASSERT(offset == 0);
-    GGML_ASSERT(ggml_is_contiguous(tensor));
+    GGML_ASSERT(ggml_is_contiguously_allocated(tensor));
 
     const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(tensor, /*assume_sync =*/ false);
     GGML_ASSERT(split_state.n_segments == 1);
@@ -1515,7 +1544,7 @@ static void ggml_backend_meta_get_tensor_async(ggml_backend_t backend, const ggm
         case GGML_BACKEND_SPLIT_AXIS_1:
         case GGML_BACKEND_SPLIT_AXIS_2: {
             // Exploit that tensors are contiguous to splice it with simple tensors as "chunks".
-            const size_t chunk_size_full = tensor->nb[split_state.axis + 1];
+            const size_t chunk_size_full = get_next_highest_nb(tensor, split_state.axis);
             GGML_ASSERT(offset % chunk_size_full == 0);
             GGML_ASSERT(size % chunk_size_full == 0);
             const int64_t i_start = offset /chunk_size_full;
@@ -1524,7 +1553,7 @@ static void ggml_backend_meta_get_tensor_async(ggml_backend_t backend, const ggm
             for (size_t j = 0; j < n_backends; j++){
                 ggml_backend_t simple_backend = ggml_backend_meta_simple_backend(backend, j);
                 const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
-                const size_t chunk_size_j = simple_tensor->nb[split_state.axis + 1];
+                const size_t chunk_size_j = get_next_highest_nb(simple_tensor, split_state.axis);
                 ggml_backend_tensor_get_2d_async(simple_backend, simple_tensor, (char *) data + offset_j, offset, chunk_size_j, i_stop - i_start, chunk_size_j, chunk_size_full);
                 offset_j += chunk_size_j;
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 1a555bf2a4d..bb4a7a84018 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1334,6 +1334,7 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
                                 tensor_copy = src; // use the original tensor as the current copy
                             } else {
                                 tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
+                                tensor_copy->src[0] = src;
                                 ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
                             }
                             ggml_set_input(tensor_copy);
@@ -1353,6 +1354,7 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
                         ggml_backend_t backend = sched->backends[cur_backend_id];
                         for (int c = 0; c < sched->n_copies; c++) {
                             struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
+                            tensor_copy->src[0] = src;
                             ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
                             if (sched->n_copies > 1) {
                                 ggml_set_input(tensor_copy);
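
For reference, below is a minimal standalone sketch of the chunk-stride selection that get_next_highest_nb() performs, illustrating why the patch replaces tensor->nb[split_state.axis + 1]: once the assertion is relaxed from ggml_is_contiguous() to ggml_is_contiguously_allocated(), the strides of a tensor need not be in canonical order, so the size of one chunk along the split axis is the next-larger stride rather than nb[axis + 1]. The toy_tensor struct, next_highest_nb() helper, and the example stride values are hypothetical stand-ins (not part of the patch or of ggml), used only to show the selection logic.

#include <cassert>
#include <cstdio>

// Hypothetical stand-in for the ggml_tensor fields the helper reads;
// "nbytes" plays the role of ggml_nbytes(tensor), and GGML_MAX_DIMS is 4.
struct toy_tensor {
    size_t nb[4];   // byte stride per dimension
    size_t nbytes;  // total allocated size of the tensor data
};

// Same selection logic as get_next_highest_nb() in the patch:
// the smallest stride strictly greater than nb[axis],
// or the full tensor size if no larger stride exists.
static size_t next_highest_nb(const toy_tensor & t, int axis) {
    const size_t nb_axis = t.nb[axis];
    size_t next = t.nbytes;
    for (int i = 0; i < 4; i++) {
        if (t.nb[i] > nb_axis && t.nb[i] < next) {
            next = t.nb[i];
        }
    }
    return next;
}

int main() {
    // Canonical layout (strides increase with the dimension index):
    // the helper and the old expression nb[axis + 1] agree.
    toy_tensor canonical = { { 4, 16, 64, 256 }, 256 };
    assert(next_highest_nb(canonical, 1) == 64);    // == canonical.nb[1 + 1]

    // Contiguously allocated but permuted layout (axes 0 and 1 swapped):
    // nb[axis + 1] would pick the smaller stride of the swapped pair (4),
    // while the helper still returns the chunk size along the split axis (64).
    toy_tensor permuted = { { 16, 4, 64, 256 }, 256 };
    assert(next_highest_nb(permuted, 0) == 64);

    printf("ok\n");
    return 0;
}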