Hi,
I am new to CUDA and C++ Programming. Currently I am working on 1D stencil and coded it as follows:
#include <iostream>
#define ARRAY_SIZE 30
#define THREADS_PER_BLOCK 512
#define RADIUS 3
void CreateInputArray(int* array)
{
for(int i=0; i<ARRAY_SIZE; i++)
{
array[i] = 1;
}
}
void PrintArray(int* array)
{
for(int i=0; i<ARRAY_SIZE - 2*RADIUS; i++)
{
if(i<5)
{
std::cout<<array[i]<<std::endl;
}
}
}
__global__ void Stencil1D(int* inp, int* out)
{
__shared__ int temp[THREADS_PER_BLOCK+2*RADIUS];
int global_index = (blockIdx.x * blockDim.x) + threadIdx.x;
int local_index = threadIdx.x + RADIUS;
temp[local_index] = inp[global_index];
if(threadIdx.x < RADIUS)
{
temp[local_index-RADIUS] = inp[global_index-RADIUS];
temp[local_index+blockDim.x] = inp[global_index+blockDim.x];
}
__syncthreads();
int temp_sum = 0;
for(int offset=-RADIUS; offset<=RADIUS; offset++)
{
temp_sum += temp[local_index+offset];
}
out[global_index] = temp_sum;
}
int main()
{
int* input = new int[ARRAY_SIZE];
int* output = new int[ARRAY_SIZE - 2*RADIUS];
CreateInputArray(input);
int *d_input, *d_output;
int size_to_be_alloted_input = ARRAY_SIZE * sizeof(int);
int size_to_be_alloted_output = (ARRAY_SIZE - 2*RADIUS) * sizeof(int);
cudaMalloc((void**) &d_input, size_to_be_alloted_input);
cudaMalloc((void**) &d_output, size_to_be_alloted_output);
cudaMemcpy(d_input, input, size_to_be_alloted_input, cudaMemcpyHostToDevice);
int block_count = ((ARRAY_SIZE - 2*RADIUS)+THREADS_PER_BLOCK-1)/THREADS_PER_BLOCK;
Stencil1D<<<block_count, THREADS_PER_BLOCK>>>(d_input, d_output);
cudaMemcpy(output, d_output, size_to_be_alloted_output, cudaMemcpyDeviceToHost);
PrintArray(output);
// Cleaning
delete[] input;
delete[] output;
cudaFree(d_input), cudaFree(d_output);
return 0;
}
The anticipated outcome dictates that each element in the output array should be 7. However, the initial three values are 4, 5, and 6, while the subsequent values are 7. I am currently attempting to identify the bug in the code but have been unsuccessful thus far. Could someone provide guidance on identifying the issue with the code?