OK, this has gotten a little off topic since my original post. Check the manual section 4.2.2.2 (CUDA 1.1 at least). It specifically states that the use of device with constant is optional. In both cases, the declared variable resides in the constant memory space on the device.
To get back on topic for the OP, you are going to need to narrow your problem down to a minimal reproduction and post the code here. I just wrote a test using an initialized constant array and had no problems whatsoever.
#include <stdio.h>
# define CUDA_SAFE_CALL( call) do { \
cudaError err = call; \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} } while (0)
#ifdef NDEBUG
#define CUT_CHECK_ERROR(errorMessage)
#else
# define CUT_CHECK_ERROR(errorMessage) do { \
cudaThreadSynchronize(); \
cudaError_t err = cudaGetLastError(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
exit(EXIT_FAILURE); \
} } while (0)
#endif
__device__ __constant__ int constA[32] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
__constant__ int constB[32] = {10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131};
__global__ void copy_gmemA(int* g_odata)
{
const unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
g_odata[idx] = constA[threadIdx.x];
}
__global__ void copy_gmemB(int* g_odata)
{
const unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
g_odata[idx] = constB[threadIdx.x];
}
int main()
{
int *d_odata, *h_odata;
int len = 32;
int num_threads = 32;
CUDA_SAFE_CALL( cudaMalloc((void**)&d_odata, sizeof(int)*(len)) );
h_odata = (int *)malloc(sizeof(int) * len);
dim3 threads(num_threads, 1, 1);
dim3 grid(1, 1, 1);
copy_gmemA<<< grid, threads >>>(d_odata);
CUDA_SAFE_CALL( cudaMemcpy(h_odata, d_odata, sizeof(int)*len, cudaMemcpyDeviceToHost) );
printf("A: ");
for (int i = 0; i < 32; i++)
printf("%d ", h_odata[i]);
printf("\n\n");
copy_gmemB<<< grid, threads >>>(d_odata);
CUDA_SAFE_CALL( cudaMemcpy(h_odata, d_odata, sizeof(int)*len, cudaMemcpyDeviceToHost) );
printf("A: ");
for (int i = 0; i < 32; i++)
printf("%d ", h_odata[i]);
return 0;
}
When I run this, I get the expected output:
$ ./constant_test
A: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
A: 10 11 12 13 14 15 16 17 18 19 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131