This works. I had made an algorithm bug. Also, the `extern` declaration should be on the kernel side, not on the global definition.
// Host side: definition of the constant-memory array (1024 ints) that is
// filled below and read by testKernel.
__constant__ int c_idata[1024]; // Open question: could this be an int* that is cudaMalloc'ed later?
..
// Earlier attempt, kept for reference: a plain cudaMemcpy targeting the symbol.
//CUDA_SAFE_CALL( cudaMemcpy( c_idata, h_idata, sizeof(int) * nR, cudaMemcpyHostToDevice) );
// Copy nR ints from h_idata into c_idata, starting at byte offset 0.
// NOTE(review): c_idata holds 1024 ints, so this presumably requires nR <= 1024 - confirm at the call site.
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_idata, h_idata, sizeof(int) * nR, 0));
// Author's note: both copy variants above hung the system.
// Launch configuration: Dg is the grid (2 blocks), Db is the block (2 threads).
dim3 Dg(2, 1, 1);
dim3 Db(2, 1, 1);
testKernel<<<Dg, Db>>>(d_odata, nR);
// Presumably reports the given message if the launch failed (project macro - verify).
CUT_CHECK_ERROR("Kernel partitioning execution failed");
// Kernel side: extern declaration of the 1024-int constant-memory array
// that testKernel reads from.
extern __constant__ int c_idata[1024];
/*
 * testKernel: copy the first nR ints of the constant-memory array c_idata
 * into d_odata, element for element (d_odata[i] = c_idata[i]).
 *
 * Parameters:
 *   d_odata - output buffer; must be writable by the device and hold at
 *             least min(nR, capacity of c_idata) ints.
 *   nR      - number of elements to copy. Values <= 0 copy nothing; values
 *             larger than the declared size of c_idata are clamped so the
 *             constant array is never read out of bounds.
 *
 * Launch layout: 1-D grid of 1-D blocks (only the .x dimensions are used).
 * Any grid/block size is valid: the loop strides by the total thread count,
 * so nR does not have to be a multiple of gridDim.x * blockDim.x. (The
 * previous version derived per-block and per-thread counts with integer
 * division and silently skipped the remainder elements.)
 */
__global__ void
testKernel(int* d_odata, const int nR)
{
// Capacity is taken from the declaration itself, not a repeated literal.
const size_t capacity = sizeof(c_idata) / sizeof(c_idata[0]);
// Clamp the element count to [0, capacity].
const size_t count = (nR <= 0) ? 0 : (((size_t)nR < capacity) ? (size_t)nR : capacity);
// Total number of threads in the grid; size_t avoids overflow in the product.
const size_t stride = (size_t)gridDim.x * blockDim.x;
// Each element is written exactly once; the former inner loop that repeated
// the identical store 1023 times had no effect on the result and is gone.
for( size_t dataIdx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; dataIdx < count; dataIdx += stride )
{
d_odata[dataIdx] = c_idata[dataIdx];
}
}