Below is a code snippet that computes the maxpool2D. Input is of the form HXWXC. Each channel is passed separately to the maxpool kernel. When the input size is 768x768x3 , there is no segmentation fault. But 768x768x4 gives a seg fault. There are no issues with smaller input sizes. Can somebody tell me what the issue could be?
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <iostream>
#include <vector>
using namespace std;
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "maxpool.h"
__global__ void maxpool_2d(int * data , int* out , int n , int out_width , int out_height , int channels)
{
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
int value = 0;
int max_value = 0;
if((row < out_height) && (col < out_width))
{
for (int i = 0; i < filter_size; i++)
{
for (int j = 0; j < filter_size; j++)
{
value = data[((stride * row + i) * n + (stride * col + j))] ;
max_value = (max_value > value) ? max_value : value ;
}
}
out[row * out_width + col] = max_value;
//printf("row = %d , col = %d , val = %d \n" , row , col, out[row * out_width + col]);
}
}
int main()
{
int array_size =768;
int channels = 4;
unsigned long array_byte_size = sizeof(int) * array_size * array_size ;
int h_data[array_size*array_size*channels] = {0};
//int h_data[3][N][N];
initialiseMatrix3D(h_data , array_size , channels);
// printMatrix3D(h_data , array_size , channels);
int maxpool_height = ((array_size - filter_size) / stride + 1) ;
int maxpool_width = ((array_size - filter_size ) / stride + 1) ;
unsigned long out_byte_size = maxpool_height * maxpool_width * sizeof(int) ;
cout << "Maxpool Size " << maxpool_width << endl;
int *h_out;
h_out = (int *)malloc(out_byte_size * channels);
int * d_data ;
int *d_out;
cudaMalloc((void**)&d_data, array_byte_size * channels);
cudaMalloc((void**)&d_out, out_byte_size * channels);
dim3 block(2,2);
dim3 grid(array_size/block.x,array_size/block.y);
for(int i = 0 ; i < channels ; i++)
{
cudaMemcpy((d_data + array_size * array_size*i)[0], (h_data + array_size * array_size*i), array_byte_size, cudaMemcpyHostToDevice);
maxpool_2d << < grid, block >> > ((d_data + array_size * array_size*i) ,( d_out + maxpool_height * maxpool_width * i ), array_size , maxpool_width , maxpool_height , channels);
cudaMemcpy((h_out + maxpool_height * maxpool_width * i),(d_out + maxpool_height * maxpool_width * i), out_byte_size, cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
}
cudaFree(d_data);
cudaFree(d_out);
free(h_out);
cudaDeviceReset();
return 0;
}