/**
* Copyright 2020 Damir Valiev
*
* This file is part of JEN framework.
*
* JEN framework is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
 * JEN framework is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
 * along with JEN framework. If not, see <https://www.gnu.org/licenses/>.
*/
#include <jen/compute.h>
#include "../device/device.h"
#include <cstdio>  // fprintf in check_computeInfo
#include <cstring> // memcpy
using namespace jen;
using namespace jen::compute;
struct jen::ModuleCompute::Data {
Device device;
};
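// Records a layout transition barrier covering every mip level and layer
// of the image. Access masks are left empty here: the layout change and
// the caller-provided stage masks are the only dependencies recorded.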
static void
transitionLayout(Image *p, vkw::CmdBuffer *p_cmd,
vkw::ImLayout layout, vkw::StageMaskChange stages) {
vkw::BarrierImMem barrier; {
barrier.access_change.src = vkw::AccessMask();
barrier.access_change.dst = vkw::AccessMask();
barrier.layout_change.src = p->layout;
barrier.layout_change.dst = layout;
barrier.queue_family_change.set_both(VK_QUEUE_FAMILY_IGNORED);
barrier.image = p->image.image;
barrier.range.mip_levels_offset = 0;
barrier.range.mip_levels_count = p->mip_level_count;
barrier.range.layers_offset = 0;
barrier.range.layers_count = p->layer_count;
barrier.range.aspect = vkw::ImAspect::COLOR;
}
p_cmd->cmd_barriers(stages, {}, {}, barrier);
}
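// Validates a host-side transfer: the region must fit inside the buffer
// part, and the part must be mapped and host-coherent (this path does
// not flush non-coherent memory).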
static void
check_transfer(const jen::DeviceBufferPart &part,
vkw::DeviceSize offset, vkw::DeviceSize size) {
jassert(offset + size <= part.size(), "region exceeds buffer");
jassert(part.is_mapped(), "cannot access memory");
jassert(not part.is_flush_needed(), "flush not supported");
}
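// Copies user memory into a mapped buffer part.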
static void
write_to_allocation(void *p_src, jen::DeviceBufferPart *p_dst,
vkw::DeviceSize dst_offset, vkw::DeviceSize size) {
check_transfer(*p_dst, dst_offset, size);
memcpy(p_dst->p_data() + dst_offset, p_src, size);
}
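// Copies a mapped buffer part into user memory.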
static void
read_from_allocation(jen::DeviceBufferPart *p_src, void *p_dst,
vkw::DeviceSize src_offset, vkw::DeviceSize size) {
check_transfer(*p_src, src_offset, size);
memcpy(p_dst, p_src->p_data() + src_offset, size);
}
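// Per-unit state. Synchronization layout:
//   fences[0]     - signaled by the compute submission (which waits on
//                   the transfer-write, so it covers both)
//   fences[1]     - signaled by the read-back transfer submission
//   semaphores[0] - transfer-write -> compute
//   semaphores[1] - compute -> transfer-read
// transfer_cmds.primary[0] records writes, primary[1] records read-backs.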
struct jen::ComputeCmdUnit::Data {
[[nodiscard]] Result
init(Device*);
void
destroy();
[[nodiscard]] jen::Result
wait();
[[nodiscard]] Result
proceed_writes(BufferTransfers, ImagesTransfers);
[[nodiscard]] Result
proceed_staging_reads(BufferTransfers, ImagesTransfers);
    struct SyncCounts : vk::SyncContainerCounts {
        static constexpr uint32_t FENCES     = 2;
        static constexpr uint32_t SEMAPHORES = 2;
    };
Device *p_dev;
vk::CmdPoolContainer<1, 0> compute_cmds;
vk::CmdPoolContainer<2, 0> transfer_cmds;
vk::SyncContainer<SyncCounts> syncs;
bool wait_transfer_write;
bool wait_compute;
bool wait_transfer_read;
jl::array<bool, SyncCounts::FENCES> reset_fence;
};
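// Creates the command pools and synchronization objects; on failure,
// members that were already initialized are destroyed again before
// returning.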
[[nodiscard]] Result ComputeCmdUnit::Data::
init(Device *p_d) {
p_dev = p_d;
Result res;
res = compute_cmds
.init(*p_dev, p_dev->queue_indices.compute.family,
vkw::CmdPoolFlag::MANUAL_CMD_RESET);
if (res != VK_SUCCESS)
return res;
res = transfer_cmds
.init(*p_dev, p_dev->queue_indices.transfer.family,
vkw::CmdPoolFlag::MANUAL_CMD_RESET);
if (res != VK_SUCCESS)
goto CCC;
res = syncs.init(*p_dev);
if (res != VK_SUCCESS)
goto CTC;
wait_transfer_write = wait_transfer_read = wait_compute = false;
reset_fence = {};
return VK_SUCCESS;
CTC:
transfer_cmds.destroy(*p_dev);
CCC:
compute_cmds.destroy(*p_dev);
return res;
}
[[nodiscard]] Result ComputeCmdUnit::
init(ModuleCompute mc) {
if (not jl::allocate(&p))
return VK_ERROR_OUT_OF_HOST_MEMORY;
Result res = p->init(&mc.p->device);
if (res != VK_SUCCESS)
jl::deallocate(&p);
return res;
}
void ComputeCmdUnit::destroy() {
p->destroy();
jl::deallocate(&p);
}
void ComputeCmdUnit::Data::
destroy() {
transfer_cmds.destroy(*p_dev);
compute_cmds.destroy(*p_dev);
syncs.destroy(*p_dev);
}
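// Blocks until all previously submitted work has finished, then performs
// the deferred fence resets recorded in reset_fence.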
[[nodiscard]] Result ComputeCmdUnit::Data::
wait() {
jen::Result res;
if (wait_transfer_write or wait_compute) {
res = syncs.fences[0].wait(*p_dev, vkw::TIMEOUT_INFINITE);
if (res != VK_SUCCESS)
return res;
wait_compute = false;
wait_transfer_write = false;
}
if (wait_transfer_read) {
res = syncs.fences[1].wait_and_reset(*p_dev, vkw::TIMEOUT_INFINITE);
if (res != VK_SUCCESS)
return res;
wait_transfer_read = false;
}
for (uint32_t i = 0; i < reset_fence.count(); ++i) {
if (reset_fence[i]) {
res = syncs.fences[i].reset(*p_dev);
if (res != VK_SUCCESS)
return res;
reset_fence[i] = false;
}
}
return VK_SUCCESS;
}
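// Host writes: data is copied straight into mapped memory, or into a
// staging part followed by a GPU-side copy. The transfer command buffer
// is begun lazily, only once a GPU copy is actually needed, and is
// submitted signaling semaphores[0] for the compute submission.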
[[nodiscard]] Result ComputeCmdUnit::Data::
proceed_writes(BufferTransfers buffer_writes,
ImagesTransfers images_writes)
{
auto &cmd = transfer_cmds.primary[0];
auto begin = [&cmd, this]() -> jen::Result {
if (not wait_transfer_write) {
jen::Result res;
res = cmd.begin(vkw::CmdUsage::ONE_TIME_SUBMIT);
if (res != VK_SUCCESS)
return res;
wait_transfer_write = true;
}
return VK_SUCCESS;
};
for (uint32_t i = 0; i < buffer_writes.count(); ++i) {
auto &write = buffer_writes[i];
auto &buffer = *write.p_buffer;
jen::DeviceBufferPart *p_part;
if (buffer.use_staging)
p_part = &buffer.staging;
else
p_part = &buffer.part;
write_to_allocation(write.p_data, p_part, write.offset, write.size);
if (buffer.use_staging) {
vkw::BufferChange bs;
bs.src = buffer.staging.buffer;
bs.dst = buffer.part.buffer;
            vkw::BufferRegion region;
            // Both offsets must include the write offset: the data was
            // staged at write.offset and must land at the same position
            // in the device-local part.
            region.offsets.src = buffer.staging.offset() + write.offset;
            region.offsets.dst = buffer.part.offset() + write.offset;
            region.size = write.size;
auto res = begin();
if (res != VK_SUCCESS)
return res;
cmd.cmd_cp_buffer(bs, region);
}
}
for (uint32_t i = 0; i < images_writes.count(); ++i) {
auto res = begin();
if (res != VK_SUCCESS)
return res;
auto &w = images_writes[i];
auto &im = *w.p_image;
if (im.layout != vkw::ImLayout::TRANSFER_DST) {
vkw::StageMaskChange stages;
stages.src = vkw::StageFlag::TOP_OF_PIPE;
stages.dst = vkw::StageFlag::TRANSFER;
transitionLayout(&im, &cmd, vkw::ImLayout::TRANSFER_DST, stages);
}
uint32_t x_size = vkw::format_size(im.format);
for (auto &r : w.transfers) {
auto ext = im.extent;
            // Byte offset of this mip level within the tightly packed
            // staging layout. Mip levels are 0-based, so accumulate the
            // sizes of all levels preceding r.mip_level.
            uint64_t moffset = 0;
            for (uint32_t j = 0; j < r.mip_level; ++j) {
                moffset += ext.all_scale() * x_size;
                ext /= 2;
                ext.x = jl::max(ext.x, 1u);
                ext.y = jl::max(ext.y, 1u);
                ext.z = jl::max(ext.z, 1u);
            }
            // Row, slice and layer pitches of this mip level in bytes.
            uint64_t y_size = ext.x * x_size;
            uint64_t z_size = ext.y * y_size;
            uint64_t l_size = ext.z * z_size;
            // r.extent is the size of the region, so one of its rows is
            // r.extent.x texels wide.
            uint64_t write_size = r.extent.x * x_size;
            uint8_t *p_user = reinterpret_cast<uint8_t*>(r.p_data);
            // Stage the rows at the addresses the buffer-to-image copy
            // below will read from, including the mip level offset.
            uint64_t l_offset = moffset + r.layer_offset * l_size;
            uint64_t z_offset = l_offset + r.offset.z * z_size;
for (uint32_t z = 0; z < r.extent.z; ++z) {
uint64_t y_offset = z_offset + r.offset.y * y_size;
for (uint32_t y = 0; y < r.extent.y; ++y) {
write_to_allocation(p_user, &im.staging, y_offset, write_size);
p_user += write_size;
y_offset += y_size;
}
z_offset += z_size;
}
uint64_t offset = moffset;
offset += r.layer_offset * l_size;
offset += r.offset.z * z_size;
offset += r.offset.y * y_size;
vkw::BufferAndImageRegion region; {
region.bufferOffset = im.staging.offset() + offset;
region.bufferRowLength = ext.x;
region.bufferImageHeight = ext.y;
region.imageSubresource = {
vkw::ImAspect::COLOR, r.mip_level, r.layer_offset, 1
};
region.imageOffset.x = int32_t(r.offset.x);
region.imageOffset.y = int32_t(r.offset.y);
region.imageOffset.z = int32_t(r.offset.z);
region.imageExtent.width = r.extent.x;
region.imageExtent.height = r.extent.y;
region.imageExtent.depth = r.extent.z;
}
cmd.cmd_cp_buffer_to_image({im.staging.buffer, im.image.image},
region, vkw::ImLayout::TRANSFER_DST);
}
}
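    // Submit the recorded copies, if any; image layouts are only updated
    // once the submission has succeeded.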
if (wait_transfer_write) {
jen::Result res;
res = cmd.end();
if (res != VK_SUCCESS)
return res;
vkw::QueueSignal signal(syncs.semaphores[0].p_vk);
vkw::QueueSubmit submit(cmd, {}, signal);
res = p_dev->queues.transfer.submit_locked(submit);
if (res != VK_SUCCESS)
return res;
for (uint32_t i = 0; i < images_writes.count(); ++i)
images_writes[i].p_image->layout = vkw::ImLayout::TRANSFER_DST;
}
return VK_SUCCESS;
}
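// Records GPU -> staging copies for buffers and images that need them.
// The submission waits on semaphores[1] (signaled by compute) and signals
// fences[1]; the host-side copy out of staging happens in read_result().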
[[nodiscard]] Result ComputeCmdUnit::Data::
proceed_staging_reads(BufferTransfers buffer_reads,
ImagesTransfers images_reads)
{
auto &cmd = transfer_cmds.primary[1];
auto begin = [&cmd, this]() -> jen::Result {
if (not wait_transfer_read) {
jen::Result res;
res = cmd.begin(vkw::CmdUsage::ONE_TIME_SUBMIT);
if (res != VK_SUCCESS)
return res;
wait_transfer_read = true;
}
return VK_SUCCESS;
};
for (uint32_t i = 0; i < buffer_reads.count(); ++i) {
auto &read = buffer_reads[i];
auto &buffer = *read.p_buffer;
if (buffer.use_staging) {
vkw::BufferChange bs;
bs.src = buffer.part.buffer;
bs.dst = buffer.staging.buffer;
vkw::BufferRegion region;
            // Include the read offset so the staged bytes line up with
            // what read_result() later reads back at the same offset.
            region.offsets.src = buffer.part.offset() + read.offset;
            region.offsets.dst = buffer.staging.offset() + read.offset;
            region.size = read.size;
auto res = begin();
if (res != VK_SUCCESS)
return res;
cmd.cmd_cp_buffer(bs, region);
}
}
for (uint32_t i = 0; i < images_reads.count(); ++i) {
auto res = begin();
if (res != VK_SUCCESS)
return res;
auto &w = images_reads[i];
auto &im = *w.p_image;
if (im.layout != vkw::ImLayout::TRANSFER_SRC) {
vkw::StageMaskChange stages;
stages.src = vkw::StageFlag::TOP_OF_PIPE;
stages.dst = vkw::StageFlag::TRANSFER;
transitionLayout(&im, &cmd, vkw::ImLayout::TRANSFER_SRC, stages);
}
uint32_t x_size = vkw::format_size(im.format);
for (auto &r : w.transfers) {
auto ext = im.extent;
            uint64_t moffset = 0;
            // 0-based mip levels: accumulate the sizes of all preceding
            // levels (same packed layout as in proceed_writes).
            for (uint32_t j = 0; j < r.mip_level; ++j) {
                moffset += ext.all_scale() * x_size;
                ext /= 2;
                ext.x = jl::max(ext.x, 1u);
                ext.y = jl::max(ext.y, 1u);
                ext.z = jl::max(ext.z, 1u);
            }
uint64_t y_size = ext.x * x_size;
uint64_t z_size = ext.y * y_size;
uint64_t l_size = ext.z * z_size;
uint64_t offset = moffset;
offset += r.layer_offset * l_size;
offset += r.offset.z * z_size;
offset += r.offset.y * y_size;
vkw::BufferAndImageRegion region; {
region.bufferOffset = im.staging.offset() + offset;
region.bufferRowLength = ext.x;
region.bufferImageHeight = ext.y;
region.imageSubresource = {
vkw::ImAspect::COLOR, r.mip_level, r.layer_offset, 1
};
region.imageOffset.x = int32_t(r.offset.x);
region.imageOffset.y = int32_t(r.offset.y);
region.imageOffset.z = int32_t(r.offset.z);
region.imageExtent.width = r.extent.x;
region.imageExtent.height = r.extent.y;
region.imageExtent.depth = r.extent.z;
}
cmd.cmd_cp_image_to_buffer({im.image.image, im.staging.buffer},
region, vkw::ImLayout::TRANSFER_SRC);
}
}
if (wait_transfer_read) {
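        // The read-back waits on the compute semaphore, so waiting on
        // fences[1] also implies compute completion; fences[0] then only
        // needs its deferred reset, not an explicit wait.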
wait_compute = false;
jen::Result res;
        res = cmd.end();
if (res != VK_SUCCESS)
return res;
        // The copies execute at the transfer stage, so the semaphore wait
        // must happen there (COMPUTE_SHADER is not a supported wait stage
        // on a dedicated transfer queue).
        vkw::StageMask stage_mask = vkw::StageFlag::TRANSFER;
vkw::QueueWait wait;
wait.semaphores = syncs.semaphores[1].p_vk;
wait.stage_masks = stage_mask;
vkw::QueueSubmit submit(cmd, wait);
res = p_dev->queues.transfer.submit_locked(submit, syncs.fences[1]);
if (res != VK_SUCCESS)
return res;
reset_fence[1] = true;
for (uint32_t i = 0; i < images_reads.count(); ++i)
images_reads[i].p_image->layout = vkw::ImLayout::TRANSFER_SRC;
}
return VK_SUCCESS;
}
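// Rejects dispatch sizes above the device limit before anything is recorded.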
[[nodiscard]] static Result
check_computeInfo(const Device &device, const ComputeInfo &info) {
    for (int i = 0; i < 3; ++i) {
        if (info.group_count[i] >
            device.properties.limits.maxComputeWorkGroupCount[i]) {
            fprintf(stderr, "ComputeInfo.group_count[%i] exceeds the device "
                            "limit maxComputeWorkGroupCount[%i] = %u. "
                            "Values up to 65535 are always supported, since "
                            "that is the minimum limit the Vulkan "
                            "specification requires.\n", i, i,
                    device.properties.limits.maxComputeWorkGroupCount[i]);
            return vkw::ERROR_INVALID_USAGE;
        }
    }
return VK_SUCCESS;
}
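// Non-blocking status poll: returns VK_NOT_READY while submitted work is
// still in flight.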
[[nodiscard]] Result ComputeCmdUnit::
compute_status() {
jen::Result res;
if (p->wait_compute) {
res = p->syncs.fences[0].status(*p->p_dev);
if (res != VK_SUCCESS)
return res;
}
if (p->wait_transfer_read) {
res = p->syncs.fences[1].status(*p->p_dev);
if (res != VK_SUCCESS)
return res;
}
return VK_SUCCESS;
}
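// One full round: wait for previous work, upload the inputs, record and
// submit the dispatch (transitioning storage images to GENERAL first),
// then record the staging read-back.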
[[nodiscard]] Result ComputeCmdUnit::
compute(const ComputeInfo &info)
{
Result res;
res = check_computeInfo(*p->p_dev, info);
if (res != VK_SUCCESS)
return res;
res = p->wait();
if (res != VK_SUCCESS)
return res;
res = p->proceed_writes(info.buffer_writes, info.images_writes);
if (res != VK_SUCCESS)
return res;
auto &syncs = p->syncs;
auto &cmds = p->compute_cmds;
auto &pipeline = info.p_pipeline->pipeline;
auto &pipelineLayout = info.p_pipeline->layout;
auto &set = info.p_binding_set->set;
auto &cmd = cmds.primary[0];
res = cmd.begin(vkw::CmdUsage::ONE_TIME_SUBMIT);
if (res != VK_SUCCESS)
return res;
for (auto &im : info.p_bindings->storage_image) {
auto l = vkw::ImLayout::GENERAL;
if (im.p_image->layout == l)
continue;
vkw::StageMaskChange stages;
stages.src = vkw::StageFlag::TOP_OF_PIPE;
stages.dst = vkw::StageFlag::COMPUTE_SHADER;
transitionLayout(im.p_image, &cmd, l, stages);
}
cmd.cmd_set_pipeline(pipeline, vkw::BindPoint::COMPUTE);
cmd.cmd_set_descr_sets(vkw::BindPoint::COMPUTE, pipelineLayout, set, 0);
auto &gc = info.group_count;
cmd.cmd_dispatch({gc.x,gc.y,gc.z});
res = cmd.end();
if (res != VK_SUCCESS)
return res;
    // The compute submission only needs to signal the read semaphore if a
    // GPU-side read-back will actually be recorded afterwards.
    bool use_read_semaphore = info.images_reads.count() > 0;
    if (not use_read_semaphore) {
        for (uint32_t i = 0; i < info.buffer_reads.count(); ++i) {
            if (info.buffer_reads[i].p_buffer->use_staging) {
                use_read_semaphore = true;
                break;
            }
        }
    }
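    // Chain the submissions: wait for the transfer-write (if one was
    // submitted) and signal the read-back transfer (if one will follow).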
vkw::QueueWait wait;
    // The dispatch consumes the uploaded data in the compute shader stage,
    // so that stage must wait on the transfer-write semaphore (waiting at
    // TRANSFER would not block the dispatch itself).
    vkw::StageMask wait_mask = vkw::StageFlag::COMPUTE_SHADER;
if (p->wait_transfer_write) {
wait.semaphores = syncs.semaphores[0].p_vk;
wait.stage_masks = wait_mask;
}
else
wait = {};
vkw::QueueSignal signal;
if (use_read_semaphore)
signal = syncs.semaphores[1].p_vk;
else
signal = {};
vkw::QueueSubmit submit(cmd, wait, signal);
res = p->p_dev->queues.compute.submit_locked(submit, syncs.fences[0]);
if (res != VK_SUCCESS)
return res;
p->reset_fence[0] = true;
for (auto &im : info.p_bindings->storage_image) {
auto l = vkw::ImLayout::GENERAL;
im.p_image->layout = l;
}
p->wait_compute = true;
return p->proceed_staging_reads(info.buffer_reads, info.images_reads);
}
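// Copies finished results out of mapped/staging memory into the user
// pointers from the transfer descriptions; wait() guarantees the
// read-back transfer has completed before the host copies begin.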
[[nodiscard]] Result ComputeCmdUnit::
read_result(BufferTransfers buffer_reads, ImagesTransfers images_reads) {
Result res;
res = p->wait();
if (res != VK_SUCCESS)
return res;
for (uint32_t i = 0; i < buffer_reads.count(); ++i) {
auto &read = buffer_reads[i];
auto &buffer = *read.p_buffer;
jen::DeviceBufferPart *p_part;
if (buffer.use_staging)
p_part = &buffer.staging;
else
p_part = &buffer.part;
read_from_allocation(p_part, read.p_data, read.offset, read.size);
}
for (uint32_t i = 0; i < images_reads.count(); ++i) {
auto &read = images_reads[i];
auto &im = *read.p_image;
auto p_part = &im.staging;
uint32_t x_size = vkw::format_size(im.format);
for (auto &r : read.transfers) {
auto ext = im.extent;
uint64_t moffset = 0;
            // Same packed-mip addressing as in proceed_writes: 0-based
            // levels, so accumulate all preceding level sizes.
            for (uint32_t j = 0; j < r.mip_level; ++j) {
                moffset += ext.all_scale() * x_size;
                ext /= 2;
                ext.x = jl::max(ext.x, 1u);
                ext.y = jl::max(ext.y, 1u);
                ext.z = jl::max(ext.z, 1u);
            }
uint64_t y_size = ext.x * x_size;
uint64_t z_size = ext.y * y_size;
uint64_t l_size = ext.z * z_size;
            // One row of the region is r.extent.x texels wide.
            uint64_t read_size = r.extent.x * x_size;
uint8_t *p_user = reinterpret_cast<uint8_t*>(r.p_data);
uint64_t l_offset = moffset + r.layer_offset * l_size;
uint64_t z_offset = l_offset + r.offset.z * z_size;
for (uint32_t z = 0; z < r.extent.z; ++z) {
uint64_t y_offset = z_offset + r.offset.y * y_size;
for (uint32_t y = 0; y < r.extent.y; ++y) {
read_from_allocation(p_part, p_user, y_offset, read_size);
p_user += read_size;
y_offset += y_size;
}
z_offset += z_size;
}
}
}
return VK_SUCCESS;
}
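/* Usage sketch (illustrative only: the ComputeInfo field names, the
 * pipeline/binding-set setup and the transfer lists below are assumptions
 * based on the types used in this file, not a verified public API):
 *
 *     jen::ComputeCmdUnit unit;
 *     jen::Result res = unit.init(module_compute);    // hypothetical module
 *     if (res != VK_SUCCESS)
 *         return res;
 *     jen::ComputeInfo info;
 *     info.p_pipeline    = &my_pipeline;              // hypothetical
 *     info.p_binding_set = &my_binding_set;           // hypothetical
 *     info.p_bindings    = &my_bindings;              // hypothetical
 *     info.group_count   = {64, 64, 1};
 *     info.buffer_writes = my_writes;                 // hypothetical
 *     info.buffer_reads  = my_reads;                  // hypothetical
 *     res = unit.compute(info);                       // one dispatch
 *     if (res == VK_SUCCESS)
 *         res = unit.read_result(my_reads, {});       // results to host
 *     unit.destroy();
 */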