Hi there,

I read the paper “Parallel Prefix Sum (Scan) with CUDA” by Mark Harris.

I tried the up-sweep phase with an array of 32 elements and block size 8. The kernel is mostly the same as the example in the paper except that I used statically allocated shared memory. See the code below.

[codebox]#include

#include <cutil.h>

using namespace std;

**global** void sumkernel(float *Array_d, int n)

{

```
__shared__ float temp[2];
int Idx = threadIdx.x;
int offset = 1;
temp[2 * Idx] = Array_d[blockIdx.x * blockDim.x + 2 * Idx];
temp[2 * Idx + 1] = Array_d[blockIdx.x * blockDim.x + 2 * Idx + 1];
for(int d = 8 >> 1; d > 0; d >>= 1)
{
__syncthreads();
if(Idx < d)
{
int ai = offset * (2 * Idx + 1) -1;
int bi = offset * (2 * Idx + 2) -1;
temp[bi] = temp[bi] + temp[ai];
}
offset = offset * 2;
}
__syncthreads();
Array_d[blockIdx.x * blockDim.x + 2 * Idx] = temp[2 * Idx];
Array_d[blockIdx.x * blockDim.x + 2 * Idx + 1] = temp[2 * Idx + 1];
```

}

int main()

{

```
int i = 0;
const int n = 32;
float ArrayIn[n];
float sum = 0.0f;
for(i = 0; i < n; i++)
{
ArrayIn[i] = (float)i / n;
sum = sum + ArrayIn[i];
cout << ArrayIn[i] << " ";
}
cout << endl;
cout << "sum is " << sum << endl;
float Array_h[n];
float *Array_d = NULL;
CUDA_SAFE_CALL(cudaMalloc((void **)&Array_d, n * sizeof(float)));
CUDA_SAFE_CALL(cudaMemcpy(Array_d, ArrayIn, n * sizeof(float), cudaMemcpyHostToDevice));
int blocksize = 8;
int n_blocks = n / blocksize;
sumkernel <<< n_blocks, blocksize >>> (Array_d, n);
CUDA_SAFE_CALL(cudaMemcpy(Array_h, Array_d, n * sizeof(float), cudaMemcpyDeviceToHost));
for(i = 0; i < n; i++)
cout << Array_h[i] << " ";
cout << endl;
cout << "final sum is " << Array_h[7] + Array_h[15] + Array_h[23] + Array_h[31] << endl;
CUDA_SAFE_CALL(cudaFree(Array_d));
```

}[/codebox]

This program rarely works fine, most of time, the last block does not work.

I also tried an array of 125 elements with block size of 25. It didn’t work either. For convenience, I post the code of 125 elements here too.

[codebox]#include

#include <cutil.h>

using namespace std;

**global** void sumkernel(float *Array_d, int n)

{

```
__shared__ float temp[25];
int Idx = threadIdx.x;
int offset = 1;
//for(int i = 0; i < 16; i++)
// temp[16 * Idx + i] = Array_d[16 * Idx + i];
temp[5 * Idx] = Array_d[blockIdx.x * blockDim.x + 5 * Idx];
temp[5 * Idx + 1] = Array_d[blockIdx.x * blockDim.x + 5 * Idx + 1];
temp[5 * Idx + 2] = Array_d[blockIdx.x * blockDim.x + 5 * Idx + 2];
temp[5 * Idx + 3] = Array_d[blockIdx.x * blockDim.x + 5 * Idx + 3];
temp[5 * Idx + 4] = Array_d[blockIdx.x * blockDim.x + 5 * Idx + 4];
for(int d = 5; d > 0; d /= 5)
{
__syncthreads();
if(Idx < d)
{
//for(int j = 0; j < 16; j++)
// temp[offset * (16 * Idx + 16) - 1] = temp[offset * (16 * Idx + 16) - 1] + temp[offset * (16 * Idx + j + 1) - 1];
int ai = offset * (5 * Idx + 1) -1;
int bi = offset * (5 * Idx + 2) -1;
int ci = offset * (5 * Idx + 3) -1;
int di = offset * (5 * Idx + 4) -1;
int ei = offset * (5 * Idx + 5) -1;
temp[ei] = temp[ei] + temp[di] + temp[ci] + temp[bi] + temp[ai];
}
offset = offset * 5;
}
__syncthreads();
//for(int i = 0; i < 16; i++)
// Array_d[16 * Idx + i] = temp[16 * Idx + i];
Array_d[blockIdx.x * blockDim.x + 5 * Idx] = temp[5 * Idx];
Array_d[blockIdx.x * blockDim.x + 5 * Idx + 1] = temp[5 * Idx + 1];
Array_d[blockIdx.x * blockDim.x + 5 * Idx + 2] = temp[5 * Idx + 2];
Array_d[blockIdx.x * blockDim.x + 5 * Idx + 3] = temp[5 * Idx + 3];
Array_d[blockIdx.x * blockDim.x + 5 * Idx + 4] = temp[5 * Idx + 4];
```

}

int main()

{

```
const int n = 125;
float ArrayIn[n];
float sum = 0.0f;
for(int i = 0; i < n; i++)
{
ArrayIn[i] = (float)i / n;
sum = sum + ArrayIn[i];
}
cout << endl;
cout << "sum is " << sum << endl;
float Array_h[n];
float *Array_d = NULL;
CUDA_SAFE_CALL(cudaMalloc((void **)&Array_d, n * sizeof(float)));
CUDA_SAFE_CALL(cudaMemcpy(Array_d, ArrayIn, n * sizeof(float), cudaMemcpyHostToDevice));
int blocksize = 25;
int n_blocks = n / blocksize;
sumkernel <<< n_blocks, blocksize >>> (Array_d, n);
CUDA_SAFE_CALL(cudaMemcpy(Array_h, Array_d, n * sizeof(float), cudaMemcpyDeviceToHost));
for(int i = 0; i < n; i++)
cout << Array_h[i] << " ";
cout << endl;
cout << "final sum is " << Array_h[24] + Array_h[49] + Array_h[74] + Array_h[99] + Array_h[124] << endl;
CUDA_SAFE_CALL(cudaFree(Array_d));
```

} [/codebox]

I tried different size of “temp”, it does not help.

Can anybody help solve this problem?

Also, when should I use “extern **shared**” and how to use it?

Any idea will be highly appreciated!!

Yuping