Yes, the magic numbers also adjust the video levels (limited range).
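For Y, the scale works out to F_USHRT_MAX / Y_HEAD = 65535 * 219 / 255 = 56283 and the offset to Y_FOOT * F_USHRT_MAX / Y_HEAD = 16 * 65535 / 255 = 4112, so a full-range Y of 0.0 maps to 4112 (16 * 257) and 1.0 maps to 60395 (235 * 257), i.e. the 8-bit limited-range levels 16..235 scaled up to 16 bits.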
I have changed the code like this:
// convert.h
#pragma once

#include "cuda_runtime.h"

class CudaConversionService
{
public:
    CudaConversionService(const int width, const int height);
    virtual ~CudaConversionService();
    bool Init();
    bool ConvertToAYUV64(const float* src, char* dst);

private:
    const int width;
    const int height;
    const dim3 threadsPerBlock;
    const dim3 numBlocks;
    size_t sizeSrc;
    size_t sizeDst;
};
// convert.cu
#include <math.h>
#include "convert.h"
#define BLOCK_SIZE 32
#define Y_FOOT (16.0f / 219.0f)
#define Y_HEAD (255.0f / 219.0f)
#define CBCR_FOOT (128.0f / 224.0f)
#define CBCR_HEAD (256.0f / 224.0f)
#define F_USHRT_MAX 65535.0f
// Device buffers, allocated in Init(). Note: these must be ordinary host-side
// pointers, not __device__ variables, because cudaMalloc assigns them from host code.
float4* d_src = nullptr;
ushort4* d_dst = nullptr;
__global__ void ayuv64_kernel(const float4* __restrict__ d_src, ushort4* __restrict__ d_dst, const int width, const int height)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height)
        return;

    const int p = y * width + x;
    const int p2 = (height - 1 - y) * width + x; // output is flipped vertically

    // Reverse the channel order and apply video levels (limited range)
    d_dst[p2] = make_ushort4(
        (unsigned short)(d_src[p].w * F_USHRT_MAX), // alpha stays full range
        (unsigned short)fmaf(d_src[p].z, F_USHRT_MAX / Y_HEAD, Y_FOOT * F_USHRT_MAX / Y_HEAD),
        (unsigned short)fmaf(d_src[p].y, F_USHRT_MAX / CBCR_HEAD, CBCR_FOOT * F_USHRT_MAX / CBCR_HEAD),
        (unsigned short)fmaf(d_src[p].x, F_USHRT_MAX / CBCR_HEAD, CBCR_FOOT * F_USHRT_MAX / CBCR_HEAD)
    );
}
CudaConversionService::CudaConversionService(const int width, const int height) :
    width(width),
    height(height),
    threadsPerBlock(BLOCK_SIZE, BLOCK_SIZE),
    numBlocks(
        (width + BLOCK_SIZE - 1) / BLOCK_SIZE,  // integer ceil division
        (height + BLOCK_SIZE - 1) / BLOCK_SIZE)
{}

CudaConversionService::~CudaConversionService()
{
    if (d_src)
        cudaFree(d_src);
    if (d_dst)
        cudaFree(d_dst);
}
bool CudaConversionService::Init()
{
    // Define buffer sizes
    sizeSrc = width * height * sizeof(float4);
    sizeDst = width * height * sizeof(ushort4);

    // Reserve memory on the GPU
    if (cudaMalloc((void**)&d_src, sizeSrc) == cudaSuccess)
    {
        if (cudaMalloc((void**)&d_dst, sizeDst) == cudaSuccess)
            return true;

        cudaFree(d_src);
        d_src = nullptr;
    }

    return false;
}
bool CudaConversionService::ConvertToAYUV64(const float* src, char* dst)
{
    // Copy the frame buffer to the device
    if (cudaMemcpy(d_src, src, sizeSrc, cudaMemcpyHostToDevice) == cudaSuccess)
    {
        ayuv64_kernel<<<numBlocks, threadsPerBlock>>>(d_src, d_dst, width, height);

        // The device-to-host copy synchronizes with the kernel and reports launch errors
        if (cudaMemcpy(dst, d_dst, sizeDst, cudaMemcpyDeviceToHost) == cudaSuccess)
            return true;
    }

    return false;
}
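For reference, the call sequence I have in mind looks roughly like this (a host-side sketch; the resolution and the std::vector buffers are just placeholders):

#include <vector>
#include "convert.h"

// Hypothetical usage; the dimensions are placeholders, and the input
// frame is expected to hold width * height float4 values.
void ConvertOneFrame(const std::vector<float>& frame)
{
    CudaConversionService service(1920, 1080);
    if (!service.Init())
        return; // device allocation failed

    std::vector<char> dst(1920 * 1080 * sizeof(ushort4)); // one ushort4 per pixel
    if (!service.ConvertToAYUV64(frame.data(), dst.data()))
    {
        // handle the conversion error
    }
}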
I guess that to optimize for speed, it will be necessary to process multiple frames at once (with a single cudaMemcpy) to reduce the transfer overhead.
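A large part of that overhead probably also comes from pageable host memory. A rough sketch of what the transfer path could look like with pinned buffers and a CUDA stream (untested; assumes d_src/d_dst are allocated as before and the caller writes the frame into h_src):

// Sketch: pinned host buffers and asynchronous copies (untested)
float* h_src = nullptr;
char* h_dst = nullptr;
cudaStream_t stream;

// One-time setup: pinned (page-locked) memory enables true async DMA transfers
cudaHostAlloc((void**)&h_src, sizeSrc, cudaHostAllocDefault);
cudaHostAlloc((void**)&h_dst, sizeDst, cudaHostAllocDefault);
cudaStreamCreate(&stream);

// Per frame: fill h_src, then queue everything on the stream
cudaMemcpyAsync(d_src, h_src, sizeSrc, cudaMemcpyHostToDevice, stream);
ayuv64_kernel<<<numBlocks, threadsPerBlock, 0, stream>>>(d_src, d_dst, width, height);
cudaMemcpyAsync(h_dst, d_dst, sizeDst, cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream); // with two streams and double buffering, uploads could overlap kernels

// Teardown
cudaStreamDestroy(stream);
cudaFreeHost(h_src);
cudaFreeHost(h_dst);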
All my GPUs are using PCIe 3.0 x16 (verified with GPU-Z).
About the value for “Code Generation”: currently it is set to “compute_52,sm_52”, which provides the best backward compatibility, but I’d also like to take advantage of more modern GPUs. How can I configure this in Visual Studio?
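From what I’ve read, the field accepts a semicolon-separated list, so something like the following should embed native code for each listed architecture plus PTX for forward compatibility (not sure if the syntax is exactly right):

compute_52,sm_52;compute_61,sm_61;compute_75,sm_75;compute_86,sm_86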