Hi,
I'm working with CUDA 10.2 and I'm new to CUDA development.
My goal is to convert a float16 buffer (mapped from a DirectX 12 texture) into a float32 buffer.
(I based my code on the vectorAdd sample.)
// Detect a CUDA device, compile the conversion kernel from source at run time,
// load the resulting PTX as a module and look up the kernel entry point.
int numDevice = 0;
// A failing query (for example no driver installed) is treated like "no device"
// instead of being ignored.
const cudaError_t countStatus = cudaGetDeviceCount(&numDevice);
bAvailable = (countStatus == cudaSuccess) && (numDevice > 0);
if (bAvailable)
{
    // Selecting the device can fail too; do not go on to compile in that case.
    bAvailable = (cudaSetDevice(0) == cudaSuccess);
}
if (bAvailable)
{
    char* ptx = nullptr;
    size_t ptxSize = 0;
    const char* kernel_file = "D:/Dev/Optro/features/debug_imGui/OptroView/Optroview/OptroViewCore/interop/float16_to_float32.cu";
    /* compileFileToPTX and loadPTX are inspired by the nvrtc_helper.h sample. */
    // NOTE(review): the sample's loadPTX creates its own driver context with
    // cuCtxCreate. If this loadPTX does the same, the module (and m_kernelFun)
    // belongs to that context, not to the primary context the runtime API uses
    // after cudaSetDevice. Launching the function while a different context is
    // current fails with CUDA_ERROR_INVALID_HANDLE ("invalid resource handle").
    // Confirm, and if so load the PTX with cuModuleLoadDataEx in the context
    // that is current at launch time.
    if (compileFileToPTX(kernel_file, 0, nullptr, &ptx, &ptxSize, true))
    {
        m_cudaModule = loadPTX(ptx, 0, nullptr);
        if (m_cudaModule)
        {
            checkCudaErrors(cuModuleGetFunction(&m_kernelFun, m_cudaModule, "convertFloat16ToFloat32"));
        }
    }
}
The variable m_cudaExternalMemoryPtr is the float16 memory mapped from the DirectX texture.
The variable m_cudaExternalFloat32MemoryPtr is the float32 memory.
In the following code, m_cudaExternalMemoryPtr is not null.
// (Re)allocate the float32 destination buffer and launch the conversion kernel.
// The destination holds one float per source half, so it needs twice the bytes.

// Drop a destination buffer whose size no longer matches the source.
if (m_cudaExternalFloat32MemoryPtr)
{
    if (m_memoryFloat32Size != 2 * m_memorySize)
    {
        cudaFree(m_cudaExternalFloat32MemoryPtr);
        m_cudaExternalFloat32MemoryPtr = nullptr;
        m_memoryFloat32Size = 0;
    }
}
if (m_cudaExternalFloat32MemoryPtr == nullptr)
{
    m_memoryFloat32Size = 2 * m_memorySize;
    if (cudaMalloc(&m_cudaExternalFloat32MemoryPtr, m_memoryFloat32Size) != cudaSuccess)
    {
        // Allocation failed: stay in the "no buffer" state so that nothing is
        // launched on an invalid pointer and a later call can retry.
        m_cudaExternalFloat32MemoryPtr = nullptr;
        m_memoryFloat32Size = 0;
    }
}
size_t numElements = m_memoryFloat32Size / sizeof(float);
// Define block and grid dimensions
dim3 blockDim(256, 1, 1);
dim3 gridDim(static_cast<unsigned int>((numElements + blockDim.x - 1) / blockDim.x), 1, 1);
// A grid dimension of zero is rejected by cuLaunchKernel, so skip empty buffers.
if (m_cudaExternalFloat32MemoryPtr != nullptr && numElements > 0)
{
    // kernelParams must hold the ADDRESS of each argument value, in the order of
    // the kernel signature (const half*, float*, size_t). The element count is
    // passed by value, so no device allocation is needed for it.
    void* kernelArgs[] = { &m_cudaExternalMemoryPtr, &m_cudaExternalFloat32MemoryPtr, &numElements };
    // NOTE(review): the handle is looked up into m_kernelFun when the module is
    // loaded, but a_kernelFun is launched here. Confirm both refer to the same
    // valid handle, obtained in the context that is current at this point.
    checkCudaErrors(cuLaunchKernel(a_kernelFun,
                                   gridDim.x, gridDim.y, gridDim.z,    /* grid dim */
                                   blockDim.x, blockDim.y, blockDim.z, /* block dim */
                                   0, 0,                               /* shared mem, stream */
                                   kernelArgs,                         /* arguments */
                                   0));
}
The kernel code is:
#include <cuda_fp16.h>
/**
 * Converts an array of half-precision values to single precision.
 *
 * Launch layout: 1D grid of 1D blocks. Each thread strides over the array by
 * the total number of launched threads, so any grid size covers every element.
 * No shared memory is used.
 *
 * @param input        pointer to numElements half values (read only)
 * @param output       pointer to numElements float values (written)
 * @param numElements  number of elements to convert
 */
extern "C" __global__ void convertFloat16ToFloat32(const half* input, float* output, size_t numElements) {
    // Index with size_t: an int index overflows for very large launches and
    // forces a signed/unsigned comparison against numElements.
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
         i < numElements;
         i += stride) {
        output[i] = __half2float(input[i]);
    }
}
I get an "invalid resource handle" error from cuLaunchKernel.
Do you have any idea what could cause it?
Thanks,
Sorry for my English.