Hello ,
I’ve copied a reduction kernel example from
I made a complete program out of this example. But it doesn’t work.
I want to compute the sum of a 256 long array of integers. Just to see how this reduction works.
This is the extracted kernel of the example with a small workaround.
#include <stdio.h>
#include <stdlib.h>
#include <cutil.h>
#define N 256
unsigned int gpuBytes;
__global__ void reduce0(int *g_idata, int *g_odata) {
extern __shared__ int sdata[];
// each thread loads one element from global to shared mem
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
printf("in kernel: i = %d\n", i);
sdata[tid] = g_idata[i];
printf("sdata[%d] = g_idata[%d] --- %d = %d \n", tid, i, sdata[tid], g_idata[i]);
__syncthreads();
// do reduction in shared mem
for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
if (tid < s) {
printf("sdata[%d], sdata[%d] = %d, %d \n", tid, tid + s, sdata[tid], sdata[tid + s]);
//sdata[tid] += sdata[tid + s];
}
printf("s = %d\n", s);
__syncthreads();
}
// write result for this block to global mem
if (tid == 0)
g_odata[blockIdx.x] = sdata[0];
}
int main() {
int j[N];
int k[N];
int sum = 0;
for (int i = 0; i < N; i++) {
sum += i;
j[i] = i;
}
printf("sum on host: %d\n", sum);
int *gpu__j;
int *gpu__k;
gpuBytes = N * sizeof(int);
CUDA_SAFE_CALL(cudaMalloc(((void * * )( & gpu__j)), gpuBytes));
CUDA_SAFE_CALL(cudaMemcpy(gpu__j, j, gpuBytes, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMalloc(((void * * )( & gpu__k)), gpuBytes));
CUDA_SAFE_CALL(cudaMemcpy(gpu__k, k, gpuBytes, cudaMemcpyHostToDevice));
reduce0<<<1, N, 0, 0>>>(gpu__j, gpu__k);
gpuBytes = N * sizeof(int);
CUDA_SAFE_CALL(cudaMemcpy(j, gpu__j, gpuBytes, cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaFree(gpu__j));
CUDA_SAFE_CALL(cudaMemcpy(k, gpu__k, gpuBytes, cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaFree(gpu__k));
printf("sum on device : %d\n", k[0]);
}
In the kernel I have outputs like this:
sdata[64], sdata[192] = 0, 0
sdata[65], sdata[193] = 0, 0
sdata[66], sdata[194] = 0, 0
sdata[67], sdata[195] = 0, 0
…
…
…
Seems that the external shared variable doesn’t have the correct values. They are all 0. Running the program also implies a "unspecified launch failure" in the first cudamemcpy after the kernel launch. Can someone try to run this? Would be a pleasure.
Thnaks alot
sw