It’s needed. The kernel accumulates into d_n with +=, so d_n has to be zeroed before the kernel runs; that is what the cudaMemset(d_n, 0, bytes_n) call below is for.
Let’s cut the problem down to a tractable size:
$ cat t41.cu
// include necessary libs
#include <cuda.h>
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
// CUDA kernel function
__global__ void convolution_2D_Kernel(float* d_m, float* d_mask, float* d_n, size_t a, size_t b, size_t maskWidth)
{
    // define and initialize the variable that will be used for indexing
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    // define and initialize the variable that will allow the mask to align correctly with input array d_m
    // integer division is fine because it always truncates
    // we will only be using odd values for maskWidth anyway
    // the truncation of integer division will provide the perfect offset for aligning the centre element of the mask with the target d_m element
    int m_row = j - maskWidth / 2;
    int m_col = i - maskWidth / 2;
    // only have threads within the range of the length of the result array calculate an element of it
    // this is to prevent global memory segfaults
    if(i < b && j < a)
    {
        // iterate through all the elements in the mask
        // here's where the actual convolution operation will occur
        for(int k = 0; k < maskWidth; ++k)
        {
            for(int l = 0; l < maskWidth; ++l)
            {
                // this if statement is a boundary condition check
                // it checks whether the indexes needed for the convolution operation are within the bounds of the input array
                // if they are not
                // extra "ghost" elements past the beginning and end of the input array are used
                // these elements are set to 0 in our code
                // this ghost element stuff is done implicitly
                // as there is no need to do it explicitly, since 0 does not change the resulting element of the convolution operation on a specific element
                // we just leave the accumulating result of the convolution operation alone if the indexes are out of bounds
                if(m_row + l >= 0 && m_row + l < a && m_col + k >= 0 && m_col + k < b)
                {
                    // if the boundary check is satisfied
                    // then accumulate one part of the convolution operation to the result element of the result d_n array
                    // the convolution operation is done column by column to allow the global memory accesses to be coalesced
                    // this allows us to increase memory bandwidth
                    d_n[j * b + i] += d_m[(m_row + l) * b + m_col + k] * d_mask[l * maskWidth + k];
                }
            }
        }
    }
}
// error checking function - checks for CUDA errors
void errorCheck(unsigned int line)
{
    // get most recent CUDA error
    cudaError_t cudaError = cudaGetLastError();
    // if error code wasn't a code describing success
    if(cudaError != cudaSuccess)
    {
        // output that there has been a CUDA error in the line of the CUDA function call
        // and exit the program
        printf("CUDA error in line %u in file %s: %s\n", line - 1, __FILE__, cudaGetErrorString(cudaError));
        exit(EXIT_FAILURE);
    }
}
// host function that calls the CUDA kernel
void convolution_2D(float* m, float* mask, float* n, size_t a, size_t b, size_t maskWidth)
{
    // define and initialize dimension variables containing data regarding the dimensions of the grid and the dimensions of each block
    dim3 numOfBlocks(ceil(b / 32.0), ceil(a / 32.0), 1);
    dim3 numOfThreads(32, 32, 1);
    // define and initialize variables containing the number of bytes in each array
    size_t bytes_m = a * b * sizeof(float);
    size_t bytes_mask = maskWidth * maskWidth * sizeof(float);
    size_t bytes_n = a * b * sizeof(float);
    // define the pointers that will point to the start of allocated device memory for each array
    float* d_m;
    float* d_mask;
    float* d_n;
    // allocate global memory for each array on the device and check for CUDA errors
    cudaMalloc((void**) &d_m, bytes_m);
    errorCheck(__LINE__);
    cudaMalloc((void**) &d_mask, bytes_mask);
    errorCheck(__LINE__);
    cudaMalloc((void**) &d_n, bytes_n);
    errorCheck(__LINE__);
    cudaMemset(d_n, 0, bytes_n);
    // copy the data of each array to allocated global memory on the device and check for CUDA errors
    cudaMemcpy(d_m, m, bytes_m, cudaMemcpyHostToDevice);
    errorCheck(__LINE__);
    cudaMemcpy(d_mask, mask, bytes_mask, cudaMemcpyHostToDevice);
    errorCheck(__LINE__);
    // call the CUDA kernel and check for CUDA errors
    convolution_2D_Kernel<<<numOfBlocks, numOfThreads>>>(d_m, d_mask, d_n, a, b, maskWidth);
    errorCheck(__LINE__);
    // copy the data of the result array from global memory to host DRAM and check for CUDA errors
    cudaMemcpy(n, d_n, bytes_n, cudaMemcpyDeviceToHost);
    errorCheck(__LINE__);
    // free the allocated global memory and check for CUDA errors
    cudaFree(d_m);
    errorCheck(__LINE__);
    cudaFree(d_mask);
    errorCheck(__LINE__);
    cudaFree(d_n);
    errorCheck(__LINE__);
}
int main()
{
    // define structs that will enable us to get the exec time of the program
    struct timespec start, end;
    // get the details regarding the start time of this program and store it in the start struct
    clock_gettime(CLOCK_REALTIME, &start);
    // initialize pseudo-random number generator with seed of current seconds since 01/01/1970
    srand(time(NULL));
    // define and initialize dimension variables for each array
    // the input and result arrays have the same dimensions and thus share dimension variables
    size_t a = 16; // rand() % 257 + 3840;
    size_t b = 16; // rand() % 257 + 3840;
    size_t maskWidth = 3; // 2 * (rand() % 7 + 1) + 1;
    // dynamically allocate DRAM memory for the arrays to account for them perhaps being too big to be statically allocated
    float* m = (float*) malloc(a * b * sizeof(float));
    float* mask = (float*) malloc(maskWidth * maskWidth * sizeof(float));
    float* n = (float*) malloc(a * b * sizeof(float));
    // assign a pseudo-random integer value from -64 to 64 for each element in input array m
    for(int i = 0; i < a * b; ++i)
    {
        m[i] = rand() % 129 - 64;
    }
    //for (int i = 0; i < 64; i++) printf("%f\n", m[i]);
    // assign a pseudo-random float value from 0 to 1 with a precision of 3 decimal places for each element in mask array
    for(int j = 0; j < maskWidth; ++j)
    {
        mask[j] = rand() % 1001 / 1000.0;
    }
    //for (int i = 0; i < maskWidth; i++) printf("%f\n", mask[i]);
    // perform 1D convolution operation on input array m using a given mask array
    convolution_2D(m, mask, n, a, b, maskWidth);
    // print the result array row by row
    for (int i = 0; i < a; i++) {
        for (int j = 0; j < b; j++) printf("%f ", n[i*b+j]);
        printf("\n");
    }
    // get the details regarding the end time of this program and store it in the end struct
    clock_gettime(CLOCK_REALTIME, &end);
    // calculate exec time in microseconds
    time_t execTime = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;
    // output exec time
    printf("Execution time: %ld microseconds.", execTime);
    // exit program
    return 0;
}
$ nvcc -o t41 t41.cu
$ ./t41
0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
48.500999 93.016998 23.474001 25.264999 -51.390995 -25.866003 -63.832001 -1.214000 -11.368999 -39.022999 -64.384003 -58.870998 -45.981998 8.000998 51.223003 60.824001
-72.143997 -46.075001 31.052999 40.350002 -2.884000 -35.884998 -1.950001 -15.802999 -57.754002 -48.172997 5.657001 0.490000 -45.434002 -36.531002 43.243999 62.040001
71.361000 99.926003 88.692001 85.273994 19.278999 -53.289001 -81.881004 -57.788998 -41.424000 -0.513001 34.188000 58.087002 53.602997 15.667000 20.857998 4.264002
-9.909000 -18.561001 8.526000 45.807003 -14.290999 -58.541000 -30.635998 -28.140999 -73.406998 -96.243004 -87.130997 -60.412003 -85.866997 -16.799002 -3.896997 34.264000
32.849998 7.374000 -34.928001 -11.090002 5.765003 28.620001 -21.675999 -1.659001 -26.674000 33.047997 -4.176995 48.229996 -11.025997 44.829998 29.500002 51.855999
-47.736000 -26.362999 17.132999 9.173001 -45.007999 -83.903000 -76.955002 -59.887001 -34.689999 -26.269999 11.693000 11.435001 -2.877000 -62.577000 -102.214996 -83.736000
-72.890999 -86.353996 -4.018001 34.586002 81.169998 13.437003 -44.457001 -116.700996 -124.083000 -115.026001 -115.316002 -95.717003 -19.379999 58.885002 69.897003 21.431999
-36.530998 -72.321999 -50.295002 -25.194998 -10.768999 10.566000 47.674999 87.397003 18.331003 -12.661002 -60.196999 -45.186001 -24.449003 -15.758998 -30.797001 -52.903999
-55.737000 -83.475998 -30.650000 -40.563995 -41.098003 -24.219000 -11.512998 -14.333001 5.482997 11.752003 22.805998 27.022999 6.384003 -59.009003 -132.190994 -81.560005
-55.035000 -44.670002 14.526999 35.748005 6.050998 -2.915000 68.625000 75.963005 24.419003 -62.873001 -29.567001 -41.778999 -45.360001 -109.025002 -107.014999 -76.823997
-14.975999 7.134998 30.659000 50.734001 23.134998 -46.293999 -23.079002 -12.402997 4.849999 -48.142002 -57.705002 -62.141998 -62.951000 -44.107002 -9.585000 9.144000
-44.621998 -54.351002 -23.397999 15.194001 -17.271000 -26.074001 -67.871994 -52.727001 -43.413998 36.145000 44.334000 50.129997 -26.863998 -28.249002 -58.535000 -14.232000
-54.423000 -94.081001 -54.442005 4.904001 -1.470998 -38.390003 -84.883003 -63.013000 -34.692001 8.260001 19.573000 47.057999 5.236003 43.960999 -10.396997 15.896000
-46.107002 -5.646001 51.701000 51.906994 2.451000 11.212999 50.292000 5.080002 -24.967003 -77.434998 -11.281000 0.145000 46.656998 -22.909000 -11.479001 -9.527998
-5.813998 -19.582003 -37.142002 -39.360001 -12.476004 16.815001 36.530998 -1.152000 -45.298000 -3.835001 -0.802998 -5.297001 -13.791002 10.064001 29.554001 7.552000
Execution time: 260895 microseconds.$
It is now evident that your first row is all zeros (for a maskWidth of 3), but after that things are computed. In your original problem, where each row is roughly 4000 elements wide, this would mean that at least the first 4000 elements or so (more, depending on maskWidth) would be zero. After that you would get nonzero values, and that is exactly what I saw as well.
Think about the behavior of your if-statement:
if(m_row + l >= 0 && m_row + l < a && m_col + k >= 0 && m_col + k < b)
when j is zero and l is zero. Also consider the effect of varying maskWidth.
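To make the arithmetic in that hint concrete, here is a minimal standalone C sketch (an illustration only, not part of t41.cu) that evaluates just the row half of that bounds check for the top output row, using the same a = 16 and maskWidth = 3 as the cut-down test case:
#include <stdio.h>

int main()
{
    int a = 16;          // number of rows, matching the test case above
    int maskWidth = 3;
    int j = 0;           // the top output row
    int m_row = j - maskWidth / 2;   // same arithmetic as the kernel: -1 here
    // evaluate the row portion of the kernel's boundary check for each mask row l
    for (int l = 0; l < maskWidth; ++l)
        printf("l = %d: m_row + l = %d -> %s\n", l, m_row + l,
               (m_row + l >= 0 && m_row + l < a) ? "in bounds" : "skipped");
    // prints:
    // l = 0: m_row + l = -1 -> skipped
    // l = 1: m_row + l = 0  -> in bounds
    // l = 2: m_row + l = 1  -> in bounds
    return 0;
}
For j = 0 the l = 0 term is always rejected by that check, and as maskWidth grows, maskWidth / 2 of the leading mask rows are excluded along the top edge of the output.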