I have a simple test program that should be out-puting two zeros, but instead is out-puting two large numbers.
output:
1921233664.000000 1921233664.000000
but it should output:
0.000000 0.000000
Judging by the execution speed, i don’t think the kernel is even running. But i would expect an error then, which is not the case. The relevant parts of the code are below. If anyone has any idea why this isn’t working, that would be great. thanks.
#define THREADS_PER_BLOCK 128
#define BLOCKS 24
#define THREADS THREADS_PER_BLOCK*BLOCKS
__device float d_sum_squared_error[THREADS];
__device float d_sum_squared_error_probe[THREADS];
void init(int argc, char** argv) {
CUT_DEVICE_INIT(argc,argv);
}
global void zero_rmse(float *g_sum_squared_error, float *g_sum_squared_error_probe) {
g_sum_squared_error[blockIdx.x * THREADS_PER_BLOCK + threadIdx.x] = 0;
g_sum_squared_error_probe[blockIdx.x * THREADS_PER_BLOCK + threadIdx.x] = 0;
}
global void sum_rmse(float *g_sum_squared_error, float *g_sum_squared_error_probe) {
shared float sum_squared_error[THREADS_PER_BLOCK];
shared float sum_squared_error_probe[THREADS_PER_BLOCK];
if( blockIdx.x == 0) {
float c = 0;
c = 0;
for( int i = threadIdx.x; i < NUM_THREADS; i += THREADS_PER_BLOCK)
c += g_sum_squared_error[i];
sum_squared_error[threadIdx.x] = c;
__syncthreads();
if( threadIdx.x < warpSize ) {
c = 0;
for( int i = threadIdx.x; i < THREADS_PER_BLOCK; i += warpSize)
c += sum_squared_error[i];
sum_squared_error[threadIdx.x] = c;
}
if( threadIdx.x == 0 ) {
for( int i = 1; i < warpSize; i ++)
c += sum_squared_error[i];
g_sum_squared_error[0] = c;
}
}
if( blockIdx.x == 1) {
float c = 0;
c = 0;
for( int i = threadIdx.x; i < NUM_THREADS; i += THREADS_PER_BLOCK)
c += g_sum_squared_error_probe[i];
sum_squared_error_probe[threadIdx.x] = c;
__syncthreads();
if( threadIdx.x < warpSize ) {
c = 0;
for( int i = threadIdx.x; i < THREADS_PER_BLOCK; i += warpSize)
c += sum_squared_error_probe[i];
sum_squared_error_probe[threadIdx.x] = c;
}
if( threadIdx.x == 0 ) {
for( int i = 1; i < warpSize; i ++)
c += sum_squared_error_probe[i];
g_sum_squared_error_probe[0] = c;
}
}
}
void exec() {
float sum_squared_error, sum_squared_error_probe;
...
dim3 dimGrid(BLOCKS,1);
dim3 dimBlock(THREADS_PER_BLOCK,1);
zero_rmse<<<dimGrid,dimBlock>>>(d_sum_squared_error,d_sum_squared_error_probe);
CUT_CHECK_ERROR("error in zero_rmse");
CUDA_SAFE_CALL(cudaThreadSynchronize());
sum_rmse<<<dimGrid,dimBlock>>>(d_sum_squared_error,d_sum_squared_error_probe);
CUT_CHECK_ERROR("error in sum_rmse");
CUDA_SAFE_CALL(cudaThreadSynchronize());
CUDA_SAFE_CALL(cudaMemcpy(&sum_squared_error,d_sum_squared_error,sizeof(float), cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaMemcpy(&sum_squared_error_probe,d_sum_squared_error_probe,sizeof(float), cudaMemcpyDeviceToHost));
printf("%f %f\n",sum_squared_error,sum_squared_error_probe);
...
}
void main( int argc, char** argv) {
…
init(argc,argv);
…
exec();
…
}