I am referring to the sample code given in Example 39-1 here - Chapter 39. Parallel Prefix Sum (Scan) with CUDA which I have copied over.
This is my sample code to test the given -
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <iostream>
__global__
void scan(float *g_odata, float *g_idata, int n) {
extern __shared__ float temp[]; // allocated on invocation
int thid = threadIdx.x;
int pout = 0, pin = 1;
// load input into shared memory.
// Exclusive scan: shift right by one and set first element to 0
temp[pout * n + thid] = (thid > 0) ? g_idata[thid - 1] : 0;
__syncthreads();
for (int offset = 1; offset < n; offset *= 2)
{
pout = 1 - pout; // swap double buffer indices
pin = 1 - pout;
if (thid >= offset)
temp[pout * n + thid] += temp[pin * n + thid - offset];
else
temp[pout * n + thid] = temp[pin * n + thid];
__syncthreads();
}
g_odata[thid] = temp[pout * n + thid]; // write output
}
int main() {
float output[6];
float input[] = { 1, 2, 3, 4, 5 };
float* d_output, * d_input;
cudaMalloc((void**)&d_output, 6 * sizeof(float));
cudaMalloc((void**)&d_input, 5 * sizeof(float));
cudaMemcpy(d_input, input, 5 * sizeof(float), cudaMemcpyHostToDevice);
scan << <1, 6 >> > (d_output, d_input, 5);
cudaMemcpy(output, d_output, 6 * sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(d_output);
cudaFree(d_input);
for (int i = 0; i < 6; i++) {
std::cout << output[i] << "\n";
}
return 0;
}
But this does not work. The answer I get is -
-2.15633e-10
4.59037e-41
4.34403e-44
0
1.4013e-45
0
What is going wrong?