cudaErrorInvalidValue when using float4* and cudaMemcpyToSymbol

Hi, I am trying to create a device pointer to some data using float4; however, I get an invalid value error (cudaErrorInvalidValue) when copying the pointer value over with cudaMemcpyToSymbol (the call inside CopyToFunction below).

I have the same code working with ushort4; however, it fails when I use float4 instead.

Here is the relevant GPU code:

__device__ float4 *d_Ptr;   // device-side global pointer; set from the host with cudaMemcpyToSymbol
float4 *d_HostPtr;          // host-side variable holding the device buffer address returned by cudaMalloc
texture<float4, 2, cudaReadModeElementType> texDummy;   // 2D float4 texture reference; not used in the code shown
cudaChannelFormatDesc ushortDummyDesc = cudaCreateChannelDesc<float4>();   // float4 channel descriptor, despite the "ushort" in its name

/**
 * Allocates a 5-element float4 device buffer, stores its address in the
 * __device__ pointer d_Ptr, and uploads 5 float4 values from h_Ptr into it.
 *
 * NOTE(review): every call allocates a new buffer and overwrites d_HostPtr;
 * nothing in this function frees a previous allocation.
 *
 * @param h_Ptr Host pointer to at least 5 float4 values to upload.
 * @return cudaSuccess if every step succeeded; otherwise the error code of
 *         the first CUDA call that failed (the remaining steps are skipped).
 */
extern "C"
cudaError_t CopyToFunction(float4* h_Ptr)
{
    const int elementCount = 5;
    cudaError_t error;

    error = cudaMalloc(&d_HostPtr, elementCount * sizeof(float4));
    if (error != cudaSuccess)
    {
        return error;
    }

    // What is copied into the symbol is the pointer VALUE held in d_HostPtr,
    // so the byte count must be sizeof(float4 *). The original code passed
    // sizeof(float4) (16 bytes), which is larger than the float4 * symbol
    // d_Ptr and makes cudaMemcpyToSymbol fail with cudaErrorInvalidValue.
    error = cudaMemcpyToSymbol(d_Ptr, &d_HostPtr, sizeof(float4 *));
    if (error != cudaSuccess)
    {
        return error;
    }

    // Upload the payload itself into the freshly allocated buffer.
    error = cudaMemcpy(d_HostPtr, h_Ptr, elementCount * sizeof(float4), cudaMemcpyHostToDevice);

    return error;
}

/**
 * Downloads the 5 float4 values stored in the device buffer d_HostPtr into
 * the host array at h_Ptr.
 *
 * @param h_Ptr Host destination with room for at least 5 float4 values.
 * @return The status code returned by cudaMemcpy.
 */
extern "C"
cudaError_t CopyFromFunction(float4* h_Ptr)
{
    return cudaMemcpy(h_Ptr, d_HostPtr, 5 * sizeof(float4), cudaMemcpyDeviceToHost);
}

And here is the relevant CPU side code:

// Host-side round trip: fill five float4 values, upload them through
// CopyToFunction, read them back through CopyFromFunction, and print each .w.
const int kNumElements = 5;

float4 cpuInputReferenceArray [kNumElements];
float4 *cpuInputPtr = cpuInputReferenceArray;

float4 cpuOutputReferenceArray [kNumElements];
float4 *cpuOutputPtr = cpuOutputReferenceArray;

// Element n is initialised to (n, 2n, 3n, 4n).
for (int n = 0; n < kNumElements; ++n)
{
    float4 &elem = cpuInputPtr[n];
    elem.x = n;
    elem.y = n * 2;
    elem.z = n * 3;
    elem.w = n * 4;
}

checkCudaErrors(CopyToFunction(cpuInputPtr));
getLastCudaError("Test Copy To Failed");

// Zero the host INPUT array after the upload; the output array is not touched here.
for (int n = 0; n < kNumElements; ++n)
{
    float4 &elem = cpuInputPtr[n];
    elem.x = 0;
    elem.y = 0;
    elem.z = 0;
    elem.w = 0;
}

checkCudaErrors(cudaDeviceSynchronize());

checkCudaErrors(CopyFromFunction(cpuOutputPtr));
getLastCudaError("Test Copy From Failed");

// Only the w component of each element read back is printed.
for (int n = 0; n < kNumElements; ++n)
{
    printf("Ptr at %d is %f\n", n, cpuOutputPtr[n].w);
}

Any help would be greatly appreciated.

You’re copying a pointer. This:

// Wrong: the size argument is sizeof(float4), the pointed-to type, not the size of the pointer being copied.
error = cudaMemcpyToSymbol(d_Ptr, &d_HostPtr, sizeof(float4));

should be this:

// Right: the size argument is the size of the float4 * value being copied into d_Ptr.
error = cudaMemcpyToSymbol(d_Ptr, &d_HostPtr, sizeof(float4 *));

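sizeof(float4) is 16 bytes.
sizeof(float4 *) is 8 bytes (on a 64-bit platform), and that is the size of the d_Ptr symbol you are copying into.
sizeof(ushort4) is 8 bytes, which happens to equal the pointer size, which is why the ushort4 version seems to work.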

I would never have figured that out.

Thank you very much, I appreciate it immensely!