Using BC4 in a surface object

I’ve tried making a minimal program that I can run via nvcc bc4_surface.cu && ./a.out:

// bc4_surface.cu

#include <cuda.h>
#include <stdio.h>

////////////////////////////////////////////////////////////////////////////////
// ERROR HANDLING
////////////////////////////////////////////////////////////////////////////////
static const char *_cudaGetErrorEnum(cudaError_t error)
{
    return cudaGetErrorName(error);
}

template <typename T>
void check(T result, char const *const func, const char *const file,
           int const line)
{
    if (result)
    {
        fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
                static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
        exit(EXIT_FAILURE);
    }
}

#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)

////////////////////////////////////////////////////////////////////////////////
// MAIN
////////////////////////////////////////////////////////////////////////////////

int main(int, char **)
{

    auto chDescRgba =
        cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned);
    auto chDescBC4 = cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed4>();
    cudaArray_t cuArrayRgba;
    cudaArray_t cuArrayBC4;

    cudaExtent extent = {
        .width = 64,
        .height = 64,
        .depth = 1,
    };

    checkCudaErrors(cudaMalloc3DArray(&cuArrayRgba, &chDescRgba, extent, cudaArraySurfaceLoadStore));
    printf("RGBA ok\n");

    checkCudaErrors(cudaMalloc3DArray(&cuArrayBC4, &chDescBC4, extent, cudaArraySurfaceLoadStore));
    printf("BC4 ok\n");
}

This leads to:

RGBA ok
CUDA error at bc4_surface.cu:48 code=801(cudaErrorNotSupported) "cudaMalloc3DArray(&cuArrayBC4, &chDescBC4, extent, cudaArraySurfaceLoadStore)"

I have successfully managed to cudaMemcpyToArray from host to device into a BC4 texture which works.

I wanted to use a surface because then my BC4 encoder kernel can directly write into a surface.
If I cannot use a surface I believe I will instead need to encode into a normal buffer, then do a device-to-device copy into the texture memory. Probably not a huge perf loss but still.

Are surface writes into BCx just not supported?

Thanks.

EDIT: Some sysinfo if relevant:

  • Arch linux
  • Driver Version: 565.77
  • CUDA Version: 12.7
  • NVCC info:
    Built on Tue_Oct_29_23:50:19_PDT_2024
    Cuda compilation tools, release 12.6, V12.6.85
    Build cuda_12.6.r12.6/compiler.35059454_0

It is possible to write to an OpenGL texture from Cuda, no need for a surface. You have to register and map the resource.

Thanks for the tip.

Are there any performace implications to consider when using textures via graphics interop instead of “directly” via the CUDA APIs?