OK both approaches appear to be producing the same result (approximately). Good!
When I compare the performance of the 2D tiled convolution vs. the 2D non-tiled for the same dimensions, I always see that the tiled case is 2-3x faster than the untiled case.
Here is an example:
$ cat t42.cu
// include necessary libs
#include <cuda.h>
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
// define constant BLOCK_WIDTH
#define BLOCK_WIDTH 32
#define TOL 0.002
// Naive (non-tiled) 2D convolution kernel.
// Each thread computes one element of the result array d_n by applying the
// maskWidth x maskWidth mask centred on the corresponding element of d_m.
// d_m:       input array, a rows x b columns, row-major
// d_mask:    square convolution mask, maskWidth x maskWidth (maskWidth odd)
// d_n:       result array, same dimensions as d_m
// Out-of-range input positions act as "ghost" elements with value 0: they are
// simply skipped, since adding 0 would not change the sum.
// Expected launch: 2D grid of 2D blocks covering at least b x a threads.
__global__ void convolution_2D_Kernel(float* d_m, float* d_mask, float* d_n, size_t a, size_t b, size_t maskWidth)
{
// global column (i) and row (j) of the result element this thread computes
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
// offset from the result element to the top-left corner of the mask footprint
// integer division truncates, which is exactly the wanted offset for an
// odd maskWidth (centre element of the mask lands on the target element)
// NOTE: maskWidth is size_t; cast to int BEFORE subtracting, otherwise the
// usual arithmetic conversions would promote the whole expression to
// unsigned and the "negative" halo indices would wrap around
int half = (int)(maskWidth / 2);
int m_row = j - half;
int m_col = i - half;
int mw = (int)maskWidth;
// only threads inside the result array do any work
// this prevents out-of-bounds global memory accesses
if(i < b && j < a)
{
// accumulate the convolution sum in a register and store it once at the
// end: the original version did "d_n[...] += ..." straight into global
// memory, which both wasted bandwidth and read uninitialized memory
// unless the caller cudaMemset d_n to zero beforehand
float result = 0.0f;
// iterate over every mask element; k indexes mask columns, l mask rows
// adjacent threads (adjacent i) read adjacent d_m elements, so the
// global loads are coalesced within a warp
for(int k = 0; k < mw; ++k)
{
for(int l = 0; l < mw; ++l)
{
// boundary check: out-of-bounds positions are ghost elements
// (implicitly 0), so their contribution is skipped entirely
if(m_row + l >= 0 && m_row + l < a && m_col + k >= 0 && m_col + k < b)
{
result += d_m[(m_row + l) * b + m_col + k] * d_mask[l * mw + k];
}
}
}
// single store of the finished element - no pre-zeroed d_n required
d_n[j * b + i] = result;
}
}
// Tiled 2D convolution kernel.
// Each block stages a BLOCK_WIDTH x BLOCK_WIDTH tile of the input array d_m
// into shared memory, then the first N_TILE_WIDTH x N_TILE_WIDTH threads of
// the block each compute one output element from that tile.
// d_m:          input array, a rows x b columns, row-major
// d_mask:       square convolution mask, maskWidth x maskWidth (maskWidth odd);
//               marked const __restrict__ so reads go through the read-only cache
// d_n:          result array, same dimensions as d_m
// N_TILE_WIDTH: width of the output tile per block; the caller must ensure
//               N_TILE_WIDTH == BLOCK_WIDTH - (maskWidth - 1) so the halo fits
// Expected launch: grid of ceil(b/N_TILE_WIDTH) x ceil(a/N_TILE_WIDTH) blocks,
// each BLOCK_WIDTH x BLOCK_WIDTH threads.
__global__ void tiledConvolution_2D_Kernel(float* d_m, const float* __restrict__ d_mask, float* d_n, size_t a, size_t b, size_t maskWidth, int N_TILE_WIDTH)
{
// per-thread accumulator kept in a register to minimize global memory writes
float result = 0;
// output element coordinates for this thread (tile-based, NOT blockDim-based:
// consecutive blocks overlap in the input by the mask halo)
int n_row = blockIdx.y * N_TILE_WIDTH + threadIdx.y;
int n_col = blockIdx.x * N_TILE_WIDTH + threadIdx.x;
// input element this thread loads: shifted up-left by half the mask width
// cast maskWidth to int before subtracting so the expression stays signed
// (mixing int with size_t would promote it to unsigned and break the
// negative halo indices)
int half = (int)(maskWidth / 2);
int m_row = n_row - half;
int m_col = n_col - half;
int mw = (int)maskWidth;
// shared-memory staging tile: one element per thread in the block
__shared__ float tile_m[BLOCK_WIDTH][BLOCK_WIDTH];
// load this thread's input element, substituting 0 for out-of-bounds
// ("ghost") positions so the compute loop needs no boundary checks
if(m_row >= 0 && m_row < a && m_col >= 0 && m_col < b)
{
tile_m[threadIdx.y][threadIdx.x] = d_m[m_row * b + m_col];
}
else
{
tile_m[threadIdx.y][threadIdx.x] = 0;
}
// barrier: every thread must finish writing its tile element before any
// thread reads neighbouring elements (placed outside divergent code so all
// threads in the block reach it)
__syncthreads();
// only the first N_TILE_WIDTH x N_TILE_WIDTH threads produce output; the
// remaining threads exist solely to load the halo. Also guard against
// threads past the edge of the result array.
if(threadIdx.y < N_TILE_WIDTH && threadIdx.x < N_TILE_WIDTH && n_row < a && n_col < b)
{
// full mask sweep over the shared tile - all reads hit shared memory
for(int i = 0; i < mw; ++i)
{
for(int j = 0; j < mw; ++j)
{
result += d_mask[i * mw + j] * tile_m[threadIdx.y + i][threadIdx.x + j];
}
}
// one coalesced store of the finished element
d_n[n_row * b + n_col] = result;
}
}
// error checking helper - reports the most recent CUDA error and aborts.
// Call convention used throughout this file: errorCheck(__LINE__) is placed
// on the line immediately AFTER the CUDA call being checked, hence the
// "line - 1" in the report.
void errorCheck(unsigned int line)
{
// fetch (and clear) the most recent CUDA error code
cudaError_t status = cudaGetLastError();
// nothing to report on success
if(status == cudaSuccess)
{
return;
}
// print the offending call's location and a readable error string, then bail
printf("CUDA error in line %u in file %s: %s\n", line - 1, __FILE__, cudaGetErrorString(status));
exit(EXIT_FAILURE);
}
// host function that runs BOTH the tiled and the non-tiled 2D convolution
// kernels on the same inputs and compares their results element by element.
// m:            host input array, a rows x b columns
// mask:         host mask array, maskWidth x maskWidth
// n:            host result array (receives the tiled kernel's output)
// N_TILE_WIDTH: output-tile width per block for the tiled kernel
// On a mismatch beyond TOL the first differing element is reported; all
// device and host allocations are released on every path.
void convolution_2D(float* m, float* mask, float* n, size_t a, size_t b, size_t maskWidth, int N_TILE_WIDTH)
{
// one block per N_TILE_WIDTH x N_TILE_WIDTH output tile; each block carries
// BLOCK_WIDTH x BLOCK_WIDTH threads (the extra threads load the halo)
dim3 numOfBlocks(ceil(b / (float) N_TILE_WIDTH), ceil(a / (float) N_TILE_WIDTH), 1);
dim3 numOfThreads(BLOCK_WIDTH, BLOCK_WIDTH, 1);
// byte counts for the input/result arrays and the mask
size_t bytes_m = a * b * sizeof(float);
size_t bytes_mask = maskWidth * maskWidth * sizeof(float);
// device pointers
float* d_m;
float* d_mask;
float* d_n;
// host buffer holding the non-tiled (reference) result for comparison
float* h_n = new float[a * b];
// allocate device memory and check each call; the result array has the same
// size as the input array, so bytes_m is reused for it
cudaMalloc((void**) &d_m, bytes_m);
errorCheck(__LINE__);
cudaMalloc((void**) &d_mask, bytes_mask);
errorCheck(__LINE__);
cudaMalloc((void**) &d_n, bytes_m);
errorCheck(__LINE__);
// zero the result array before the first kernel: the non-tiled kernel as
// originally written accumulates with +=, so without this memset it reads
// uninitialized device memory
cudaMemset(d_n, 0, bytes_m);
errorCheck(__LINE__);
// copy the input data to the device
cudaMemcpy(d_m, m, bytes_m, cudaMemcpyHostToDevice);
errorCheck(__LINE__);
cudaMemcpy(d_mask, mask, bytes_mask, cudaMemcpyHostToDevice);
errorCheck(__LINE__);
// run the tiled kernel (checking for launch errors, which the original
// omitted here) and fetch its result into n
tiledConvolution_2D_Kernel<<<numOfBlocks, numOfThreads>>>(d_m, d_mask, d_n, a, b, maskWidth, N_TILE_WIDTH);
errorCheck(__LINE__);
cudaMemcpy(n, d_n, bytes_m, cudaMemcpyDeviceToHost);
errorCheck(__LINE__);
// re-zero d_n, then run the reference (non-tiled) kernel and fetch its result
cudaMemset(d_n, 0, bytes_m);
errorCheck(__LINE__);
convolution_2D_Kernel<<<numOfBlocks, numOfThreads>>>(d_m, d_mask, d_n, a, b, maskWidth);
errorCheck(__LINE__);
cudaMemcpy(h_n, d_n, bytes_m, cudaMemcpyDeviceToHost);
errorCheck(__LINE__);
// compare the two results within tolerance; report only the first mismatch,
// then fall through to cleanup (the original returned here, leaking all
// three device allocations and h_n)
for(size_t i = 0; i < a * b; i++)
{
if(fabs(h_n[i] - n[i]) > TOL)
{
printf("mismatch at %d, was: %f, should be: %f\n", (int)i, n[i], h_n[i]);
break;
}
}
// release device and host memory on every path
cudaFree(d_m);
errorCheck(__LINE__);
cudaFree(d_mask);
errorCheck(__LINE__);
cudaFree(d_n);
errorCheck(__LINE__);
delete[] h_n;
}
int main()
{
// wall-clock timing of the whole program
struct timespec start, end;
clock_gettime(CLOCK_REALTIME, &start);
// seed the pseudo-random number generator with the current time
srand(time(NULL));
// randomized array dimensions: a, b in [7680, 8192];
// maskWidth is always odd, in [3, 15]
size_t a = rand() % 513 + 7680;
size_t b = rand() % 513 + 7680;
size_t maskWidth = 2 * (rand() % 7 + 1) + 1;
// output-tile width per block: the BLOCK_WIDTH x BLOCK_WIDTH block must also
// hold the (maskWidth - 1)-wide halo around the tile
// cast to int so the subtraction is done in signed arithmetic
int N_TILE_WIDTH = BLOCK_WIDTH - (int)(maskWidth - 1);
// heap-allocate the host arrays - they are far too large for the stack
float* m = (float*) malloc(a * b * sizeof(float));
float* mask = (float*) malloc(maskWidth * maskWidth * sizeof(float));
float* n = (float*) malloc(a * b * sizeof(float));
// ~235 MB per big array: check that the allocations actually succeeded
if(m == NULL || mask == NULL || n == NULL)
{
printf("host memory allocation failed\n");
return EXIT_FAILURE;
}
// fill the input array with pseudo-random integers in [-64, 64]
for(size_t i = 0; i < a * b; ++i)
{
m[i] = rand() % 129 - 64;
}
// fill the mask with pseudo-random values in [0, 1] (3 decimal places)
for(size_t j = 0; j < maskWidth * maskWidth; ++j)
{
mask[j] = rand() % 1001 / 1000.0;
}
// run both convolution kernels and compare their results
convolution_2D(m, mask, n, a, b, maskWidth, N_TILE_WIDTH);
clock_gettime(CLOCK_REALTIME, &end);
// elapsed time in microseconds, computed in long to match the %ld below
// (the original passed a time_t to %d, which nvcc warned about and which
// is undefined behavior where time_t is wider than int)
long execTime = (end.tv_sec - start.tv_sec) * 1000000L + (end.tv_nsec - start.tv_nsec) / 1000;
// trailing newline so the message doesn't run into subsequent output
printf("Execution time: %ld microseconds.\n", execTime);
// release the host arrays (the original leaked all three)
free(m);
free(mask);
free(n);
return 0;
}
$ nvcc -o t42 t42.cu
$ nvprof ./t42
==14044== NVPROF is profiling process 14044, command: ./t42
Execution time: 2277845 microseconds.==14044== Profiling application: ./t42
==14044== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 61.50% 219.53ms 2 109.76ms 109.45ms 110.08ms [CUDA memcpy DtoH]
33.94% 121.15ms 2 60.576ms 1.4400us 121.15ms [CUDA memcpy HtoD]
3.19% 11.373ms 1 11.373ms 11.373ms 11.373ms convolution_2D_Kernel(float*, float*, float*, unsigned long, unsigned long, unsigned long)
1.21% 4.3155ms 1 4.3155ms 4.3155ms 4.3155ms tiledConvolution_2D_Kernel(float*, float const *, float*, unsigned long, unsigned long, unsigned long, int)
0.16% 574.48us 2 287.24us 285.29us 289.19us [CUDA memset]
API calls: 50.66% 378.85ms 3 126.28ms 205.55us 378.14ms cudaMalloc
48.06% 359.39ms 4 89.848ms 62.204us 121.90ms cudaMemcpy
0.67% 5.0275ms 4 1.2569ms 591.32us 3.2430ms cuDeviceTotalMem
0.34% 2.5055ms 404 6.2010us 400ns 277.83us cuDeviceGetAttribute
0.20% 1.4733ms 3 491.10us 192.02us 879.13us cudaFree
0.05% 347.65us 4 86.911us 58.006us 168.14us cuDeviceGetName
0.01% 97.830us 2 48.915us 32.447us 65.383us cudaMemset
0.01% 78.585us 2 39.292us 14.549us 64.036us cudaLaunchKernel
0.00% 21.160us 4 5.2900us 2.9520us 7.7450us cuDeviceGetPCIBusId
0.00% 9.0290us 8 1.1280us 461ns 1.8180us cuDeviceGet
0.00% 5.6670us 10 566ns 215ns 1.9210us cudaGetLastError
0.00% 3.3180us 3 1.1060us 495ns 1.7900us cuDeviceGetCount
0.00% 3.1570us 4 789ns 620ns 1.1030us cuDeviceGetUuid
$ nvprof ./t42
==14058== NVPROF is profiling process 14058, command: ./t42
Execution time: 2216424 microseconds.==14058== Profiling application: ./t42
==14058== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 57.34% 216.99ms 2 108.50ms 108.40ms 108.59ms [CUDA memcpy DtoH]
19.42% 73.478ms 2 36.739ms 1.4720us 73.477ms [CUDA memcpy HtoD]
17.78% 67.285ms 1 67.285ms 67.285ms 67.285ms convolution_2D_Kernel(float*, float*, float*, unsigned long, unsigned long, unsigned long)
5.31% 20.112ms 1 20.112ms 20.112ms 20.112ms tiledConvolution_2D_Kernel(float*, float const *, float*, unsigned long, unsigned long, unsigned long, int)
0.15% 564.65us 2 282.33us 280.74us 283.91us [CUDA memset]
API calls: 52.67% 380.47ms 4 95.117ms 39.263us 176.67ms cudaMemcpy
46.03% 332.50ms 3 110.83ms 202.67us 331.80ms cudaMalloc
0.69% 4.9995ms 4 1.2499ms 593.62us 3.2076ms cuDeviceTotalMem
0.35% 2.5077ms 404 6.2070us 463ns 269.83us cuDeviceGetAttribute
0.20% 1.4137ms 3 471.22us 177.27us 845.00us cudaFree
0.04% 262.79us 4 65.696us 59.990us 70.600us cuDeviceGetName
0.01% 97.407us 2 48.703us 33.199us 64.208us cudaMemset
0.01% 81.192us 2 40.596us 15.260us 65.932us cudaLaunchKernel
0.00% 19.065us 4 4.7660us 3.1460us 7.8380us cuDeviceGetPCIBusId
0.00% 7.6230us 8 952ns 504ns 1.5880us cuDeviceGet
0.00% 6.9240us 10 692ns 244ns 2.2470us cudaGetLastError
0.00% 5.9100us 3 1.9700us 485ns 4.1660us cuDeviceGetCount
0.00% 3.4370us 4 859ns 647ns 1.0070us cuDeviceGetUuid
$
I still consider your code to be broken without the cudaMemset
operations I added above.