__const__ writing with cudaMemcpyToSymbol

My code:

__device__ __constant__ float3 a[2048];

__device__ __constant__ float3 b[2048];

SAFE_CALL ( cudaMemcpyToSymbol(a,(const void*)hostData1,sizeof(float3)*nElem) )

SAFE_CALL ( cudaMemcpyToSymbol(b,(const void*)hostData2,sizeof(float3)*nElem) )

The nElem = 2048

The .ptx says:

.const .align 4 .b8 a[24576];

.const .align 4 .b8 b[24576];

The cudaMemcpyToSymbol does not fail because does not assert at all ( yes, i’m running debug mode ). With -deviceemu works ok too. However, when I erase the -deviceemu to run in a GF8500GT with CUDA 1.0 does not copy the data ( does not assert, returns cudaSuccess error ). Changing to Release mode yields the same results. Any idea why it does not copy the data?

I also tried to put an "align" qualifier on it:

__device__ __constant__ __align__(16) float3 a[2048];

but same result…

I was desperated and also removed the device from the start:

__constant__ float3 a[2048];

same result…

Also is curious but if I do

__device__ /*__constant__*/ float3 a[2048];

commenting the constant works ok using HW-acceleration.

Any idea why cudaMemcpyToSymbol cannot copy data from host to HW constant memory pls?

thx

I had the same problem… Adding a __syncthreads; as first instruction of the kernel solved it :

// NOTE(review): illustrative forum snippet, not compilable as-is:
//  - the "..." inside the struct is a placeholder for further members;
//  - `__global__ kernel()` is missing the `void` return type;
//  - `ptr` and `threadId` are not declared in this scope (presumably the
//    struct member and a thread-index expression — TODO confirm).
__constant__ struct parameters { float* ptr; ...};

// NOTE(review): `__syncthreads;` without parentheses is not a function call —
// it merely names the function and discards the value, so this line compiles
// but performs no synchronization. The author presumably meant __syncthreads().
__global__ kernel()

{

    __syncthreads;       // CRASH if this line is removed..

    ptr[threadId]++;

}

My kernel does not crash! The memory is simply not copied to the constant variable and does not return error ( returns cudaSuccess, but does not copy the host data to the constant buffer )

How did you verify that the data was not copied?

Did you call cudaMemcpyFromSymbol() to see if your data successfully made a round-trip?

Hi i also had some problems at first coping data, was the host data also allocated with float3 ? First thing you can try is:

__device__ __constant__ float a[2048*3];
__device__ __constant__ float b[2048*3];

if this works, it means that you need to allocate the host memory differently.
I found that a malloc with size of works best (and not new). Second thing i can think of is the size of const memory, if you try a smaller array dose it work ?

but my guess is the first thing i mentioned …

good luck !

Because I just passed the data through the kernel, processed it and then copied the results back ( and then compared results on the host ). The results weren’t touched by the kernel at all ( the kernel was supposed to add +1.0f to each input value ).

Yep, I’m using cudaMemcpyToSymbol() to copy data from the host to device constant memory. I’m using the CUDA_SAFE_CALL macro but it returns cudaSuccess. No exception is thrown. No assert is fired. The data is simply not copied to the device constant buffer.

The constant buffer I employed is less than 64k btw ( in fact very few less, 6k ), so is in the correct limits. The kernel registers used is 16, so the 16*threadBlocks<8192 as required. The grid size is 8x1 blocks ( so is in the 65k x 65k limit ). The number of thread blocks is 128 ( so is less than the 512 limit too )… I don’t see the 5 second watchdog monitor small icon on the left-top part of the screen.

Also tried with a float4-aligned variable, but same result.

The host data was allocated with malloc first ( hostData1 = (float3*)malloc(2048*sizeof(float3)) ), then I tried with cudaMallocHost page-locked memory … same result.

Of course with -deviceemu works ok…So I don’t understand why it fails. I bet is a driver bug.

Constant memory cannot be changed by the kernel. You need to put your data in the global memory instead.

Constant memory is useful for storing parameters needed by the kernel, but not for store the actual data itself.

I know, I do:

outData[idx].x = a[idx] + 1.0f;

I was asking earlier if you had issued a cudaMemcpyFromSymbol() to copy from “a” to the host. This will let you confirm if “a” really contains the data that you expected.

I think I just had a similar problem.

What I did was quite stupid actually: I created the constant memory, copied everything there and then passed a pointer to the kernel. That did not work - the data is not accessible from within the kernel via that pointer.
What I realized then is that the constant memory is accessible in the scope of the kernel anyway - I don’t have to pass a reference of the data to the kernel. So now I access just the variable I defined outside the kernel and that works.

Maybe you made the same error in reasoning as I did.

The complete code is pretty simple:

#include <assert.h>

#include <stdio.h>

#define NELEMENTS 2048  // element count of the constant array

// Fix: `__const__` is not a documented CUDA storage qualifier — the
// constant-memory specifier is `__constant__` (as used in the working
// version posted later in this thread). cudaMemcpyToSymbol targets this
// symbol by name.
// (the post also tried __const__ alone and __align__(16) — nothing...)
__device__ __constant__ float gc_flA[NELEMENTS];

// Allocates NELEMENTS floats in device global memory and returns the raw
// pointer. The caller owns the buffer and must release it with cudaFree().
// Fix: statement-terminating ';' after the CUDA_SAFE_CALL macro — one of
// the compile errors reported in the reply further down the thread.
void* allocateDeviceData ()
{
    void *ptr;
    CUDA_SAFE_CALL ( cudaMalloc(&ptr,NELEMENTS*sizeof(float)) );
    return ptr;
}

// Allocates NELEMENTS floats of page-locked (pinned) host memory and returns
// the raw pointer. The caller owns the buffer and must release it with
// cudaFreeHost(). Fix: statement-terminating ';' after the macro call.
void* allocateHostData ()
{
    void *ptr;
    // tried a plain malloc too (same result) — pinned memory is kept because
    // it gives faster host<->device transfers
    CUDA_SAFE_CALL ( cudaMallocHost(&ptr,NELEMENTS*sizeof(float)) );
    return ptr;
}

// Adds the __constant__ array gc_flA element-wise into `data`
// (data[i] += gc_flA[i], one thread per element, 1-D launch).
// Fix: bounds-guard the index so a launch whose thread count exceeds
// NELEMENTS cannot read/write out of range. The 16x128 launch used in
// DoIT covers NELEMENTS exactly, so results there are unchanged.
__global__ void myKernel ( float* data )
{
    const int idx = threadIdx.x + (blockIdx.x*blockDim.x);
    if ( idx < NELEMENTS )
    {
        data[idx] += gc_flA[idx];
    }
}

// Host-side driver: uploads NELEMENTS ones into the __constant__ array
// gc_flA, runs myKernel over a zeroed device buffer (so each output element
// should become 1.0f), copies the result back and verifies it on the host.
// Fixes vs. the posted version (the compile errors the reply names):
//  - statement-terminating ';' after each CUDA_SAFE_CALL macro,
//  - cudaThreadsSyncronize  -> cudaThreadSynchronize (real CUDA 1.x API name),
//  - cudaMallocFree         -> cudaFreeHost (the counterpart of cudaMallocHost).
extern "C" void DoIT ()
{
    int i;

    float *hd = (float*)allocateHostData();
    for ( i = 0; i < NELEMENTS; ++i )
    {
        hd[i] = 1.0f;
    }

    CUDA_SAFE_CALL(cudaMemcpyToSymbol(gc_flA, hd, NELEMENTS*sizeof(float)));

    float *devd = (float*)allocateDeviceData();
    CUDA_SAFE_CALL ( cudaMemset(devd, 0, NELEMENTS*sizeof(float)) );

    // 16 blocks x 128 threads = exactly NELEMENTS threads
    dim3 grid_size, block_size;
    grid_size.x  = static_cast<unsigned int>(NELEMENTS/128); // nBlocks
    block_size.x = static_cast<unsigned int>(128);           // threads per block

    myKernel<<<grid_size, block_size>>>(devd);

    // kernel launches are asynchronous — synchronize so any execution
    // error surfaces here
    cudaError_t kernelError = cudaThreadSynchronize();
    assert ( cudaSuccess == kernelError ); // to see if the kernel failed

    CUDA_SAFE_CALL(cudaMemcpy(hd, devd, NELEMENTS*sizeof(float), cudaMemcpyDeviceToHost));

    cudaFree(devd);

    // now test if returned data is ok: 0 + 1.0f must be 1.0f everywhere
    for ( i = 0; i < NELEMENTS; ++i )
    {
        if ( hd[i] != 1.0f )
        {
            printf("bad data!");
            break;
        }
    }

    cudaFreeHost(hd); // was cudaMallocFree(), which does not exist
}

I got garbage in hd after the kernel call, so the constant memory is not copied correctly ( curiously, being in debug mode I don’t get any asserts fired on the cudaMemcpyToSymbol call, so I assume there is no internal error? ). If I remove the __constant__ thing it works ok and I get "1.0f" in the hd elements.

Have you tried calling cudaMemcpyToSymbol() with ASCII name of symbol? I.e. this way:

cudaMemcpyToSymbol("gc_flA",hd,NELEMENTS*sizeof(float));

And, btw, your for loops increment i before iteration (but after checking loop condition), so your code never modifies or accesses first element (hd[0]) but in last iteration it accesses element hd[NELEMENTS] which is beyond array boundaries (and this can easily trigger “bad data!” message).

Yep, I tried the "gc_flA" string form… it causes a crash. And the kernel and app work ok if I just remove the __constant__ qualifier ( using only __device__ ).

The ++i thing is an optimization — an STL/C++ trick. The post-increment operator returns a copy of the object, while the pre-increment one returns a reference (so it is more efficient). See this:

http://magnetiq.com/2006/08/26/opt-for-pre…ting-iterators/

http://discuss.fogcreek.com/joelonsoftware…81&ixReplies=17

On the other hand the

for ( i=0; i<NELEMENTS; ++i )

{

}

works ok ( you can debug it and see the element 0 and the NELEMENTS-1 is well set ). The “bad data!” text is never triggered. The NELEMENTS element is never assigned ( so no buffer overrun ).

Do this test to be sure:

#include <stdio.h>

/* Demonstrates that `for (i=0; i<100; ++i)` visits every index 0..99:
 * the (pre-)increment runs after the body, so data[0] IS assigned and
 * data[100] is never touched.
 * Fix: `void main` is nonstandard C/C++ — main must return int.
 * (envp is a common extension parameter; kept as posted.) */
int main ( int argc, char *argv[], char *envp[] )
{
    int i;
    int data[100];

    for ( i = 0; i < 100; ++i )
    {
        data[i] = i;
        printf("%i\n",data[i]);
    }

    return 0;
}

will show the data correctly in range [0,99]. Enabling the buffer overrun VS2005 feature or showing assembly you can see it never assigns data[100] and the data[0] is assigned well.

Hi santyhyammer,

Your code had a number of compile errors, like missing “;” and wrong function names. I fixed all of them, got it to compile and ran it. It worked fine for me. Did not see any “bad data” messages. I’m using Linux with CUDA 1.0. I’ve attached the modified code below.

#include <assert.h>

#include <stdio.h>

#include <cutil.h>

#define NELEMENTS 2048

__device__ __constant__ float gc_flA[NELEMENTS];

// Reserve a device-side buffer large enough for NELEMENTS floats.
// Ownership passes to the caller, who releases it via cudaFree().
void* allocateDeviceData ()
{
    const size_t nBytes = NELEMENTS * sizeof(float);
    void *devPtr;
    CUDA_SAFE_CALL ( cudaMalloc(&devPtr, nBytes) );
    return devPtr;
}

// Reserve a page-locked (pinned) host buffer for NELEMENTS floats.
// Ownership passes to the caller, who releases it via cudaFreeHost().
void* allocateHostData ()
{
    const size_t nBytes = NELEMENTS * sizeof(float);
    void *hostPtr;
    CUDA_SAFE_CALL ( cudaMallocHost(&hostPtr, nBytes) );
    return hostPtr;
}

// Adds the __constant__ array gc_flA element-wise into `data`
// (data[i] += gc_flA[i]). 1-D launch, one thread per element; there is no
// tail guard, so the caller must launch exactly NELEMENTS threads in total.
__global__ void myKernel ( float* data )
{
    const int element = blockIdx.x * blockDim.x + threadIdx.x;
    data[element] += gc_flA[element];
}

// Host driver: fills a pinned host buffer with 1.0f, uploads it into the
// __constant__ array gc_flA, runs myKernel over a zeroed device buffer
// (each output element should become 1.0f), copies the result back and
// verifies it element by element.
// Fix: the forum transcript truncated the device-to-host copy mid-token
// ("cudaMemcpyDeviceTo"); restored to a complete cudaMemcpy(...) statement.
int main(void)
{
    int i;

    float *hd = (float*)allocateHostData();
    for ( i = 0; i < NELEMENTS; ++i )
    {
        hd[i] = 1.0f;
    }

    CUDA_SAFE_CALL(cudaMemcpyToSymbol(gc_flA, hd, NELEMENTS*sizeof(float)));

    float *devd = (float*)allocateDeviceData();
    CUDA_SAFE_CALL ( cudaMemset(devd, 0, NELEMENTS*sizeof(float)) );

    // 16 blocks x 128 threads = exactly NELEMENTS threads (no tail guard needed)
    dim3 grid_size, block_size;
    grid_size.x  = static_cast<unsigned int>(NELEMENTS/128); // nBlocks
    block_size.x = static_cast<unsigned int>(128);           // threads per block

    myKernel<<<grid_size, block_size>>>(devd);

    // kernel launches are asynchronous — synchronize so any execution
    // error surfaces here
    cudaError_t kernelError = cudaThreadSynchronize();
    assert ( cudaSuccess == kernelError ); // to see if the kernel failed

    CUDA_SAFE_CALL(cudaMemcpy(hd, devd, NELEMENTS*sizeof(float), cudaMemcpyDeviceToHost));

    cudaFree(devd);

    // now test if returned data is ok: 0 + 1.0f must be 1.0f everywhere
    for ( i = 0; i < NELEMENTS; ++i )
    {
        if ( hd[i] != 1.0f )
        {
            printf("bad data!");
            break;
        }
    }

    cudaFreeHost(hd);
    return 0;
}

Thx for testing sphyraena.
Btw, i’m using GF8500 256Mb, FW162.18 over WinXP SP2 x86, 2Gb RAM and E2140.

I’m not familiar with the 8500. Could it be that it has too little constant memory to hold your array? Maybe you could try with a smaller array.