I’ve tried making a minimal program that I can run via nvcc bc4_surface.cu && ./a.out
:
// bc4_surface.cu
#include <cuda.h>
#include <stdio.h>
////////////////////////////////////////////////////////////////////////////////
// ERROR HANDLING
////////////////////////////////////////////////////////////////////////////////
static const char *_cudaGetErrorEnum(cudaError_t error)
{
return cudaGetErrorName(error);
}
template <typename T>
void check(T result, char const *const func, const char *const file,
int const line)
{
if (result)
{
fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
exit(EXIT_FAILURE);
}
}
#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
////////////////////////////////////////////////////////////////////////////////
// MAIN
////////////////////////////////////////////////////////////////////////////////
int main(int, char **)
{
auto chDescRgba =
cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned);
auto chDescBC4 = cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed4>();
cudaArray_t cuArrayRgba;
cudaArray_t cuArrayBC4;
cudaExtent extent = {
.width = 64,
.height = 64,
.depth = 1,
};
checkCudaErrors(cudaMalloc3DArray(&cuArrayRgba, &chDescRgba, extent, cudaArraySurfaceLoadStore));
printf("RGBA ok\n");
checkCudaErrors(cudaMalloc3DArray(&cuArrayBC4, &chDescBC4, extent, cudaArraySurfaceLoadStore));
printf("BC4 ok\n");
}
This leads to:
RGBA ok
CUDA error at bc4_surface.cu:48 code=801(cudaErrorNotSupported) "cudaMalloc3DArray(&cuArrayBC4, &chDescBC4, extent, cudaArraySurfaceLoadStore)"
I have successfully managed to cudaMemcpyToArray
from host to device into a BC4 texture which works.
I wanted to use a surface because then my BC4 encoder kernel can directly write into a surface.
If I cannot use a surface I believe I will instead need to encode into a normal buffer, then do a device-to-device copy into the texture memory. Probably not a huge perf loss but still.
Are surface writes into BCx just not supported?
Thanks.
EDIT: Some sysinfo if relevant:
- Arch linux
- Driver Version: 565.77
- CUDA Version: 12.7
- NVCC info:
Built on Tue_Oct_29_23:50:19_PDT_2024
Cuda compilation tools, release 12.6, V12.6.85
Build cuda_12.6.r12.6/compiler.35059454_0