Segmentation faults on increasing input size

Below is a code snippet that computes a 2D max pool. The input has the form H×W×C, and each channel is passed separately to the maxpool kernel. With an input size of 768×768×3 there is no segmentation fault, but 768×768×4 gives a segfault. There are no issues with smaller input sizes. Can somebody tell me what the issue could be?

#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <iostream>
#include <vector>

using namespace std;

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "maxpool.h"




__global__ void maxpool_2d(int *data, int *out, int n, int out_width, int out_height, int channels)
{
    // Each thread computes one output element.
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    int value = 0;
    int max_value = 0;
    if ((row < out_height) && (col < out_width))
    {
        // Scan the filter_size x filter_size window and keep the maximum.
        for (int i = 0; i < filter_size; i++)
        {
            for (int j = 0; j < filter_size; j++)
            {
                value = data[(stride * row + i) * n + (stride * col + j)];
                max_value = (max_value > value) ? max_value : value;
            }
        }

        out[row * out_width + col] = max_value;

        //printf("row = %d , col = %d , val = %d \n", row, col, out[row * out_width + col]);
    }
}

int main()
{
    int array_size = 768;
    int channels = 4;

    unsigned long array_byte_size = sizeof(int) * array_size * array_size;
    int h_data[array_size * array_size * channels] = {0};
    initialiseMatrix3D(h_data, array_size, channels);
    //printMatrix3D(h_data, array_size, channels);

    int maxpool_height = (array_size - filter_size) / stride + 1;
    int maxpool_width = (array_size - filter_size) / stride + 1;
    unsigned long out_byte_size = maxpool_height * maxpool_width * sizeof(int);
    cout << "Maxpool Size " << maxpool_width << endl;

    int *h_out = (int *)malloc(out_byte_size * channels);

    int *d_data;
    int *d_out;
    cudaMalloc((void**)&d_data, array_byte_size * channels);
    cudaMalloc((void**)&d_out, out_byte_size * channels);

    dim3 block(2, 2);
    dim3 grid(array_size / block.x, array_size / block.y);

    // Copy each channel to the device, pool it, and copy the result back.
    for (int i = 0; i < channels; i++)
    {
        cudaMemcpy(d_data + array_size * array_size * i,
                   h_data + array_size * array_size * i,
                   array_byte_size, cudaMemcpyHostToDevice);
        maxpool_2d<<<grid, block>>>(d_data + array_size * array_size * i,
                                    d_out + maxpool_height * maxpool_width * i,
                                    array_size, maxpool_width, maxpool_height, channels);
        cudaMemcpy(h_out + maxpool_height * maxpool_width * i,
                   d_out + maxpool_height * maxpool_width * i,
                   out_byte_size, cudaMemcpyDeviceToHost);
        cudaDeviceSynchronize();
    }

    cudaFree(d_data);
    cudaFree(d_out);
    free(h_out);
    cudaDeviceReset();

    return 0;
}

The problem is this line:

int h_data[array_size * array_size * channels] = {0};

It declares a variable-length array (a compiler extension, not standard C++) that places the whole input on the stack. That's generally not a good idea with larger array sizes. Use a dynamic allocation instead:

int *h_data = new int[array_size*array_size*channels]();
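If you go with new[], pair it with a matching delete[] once the data is no longer needed:

delete[] h_data;

(A std::vector<int> would handle this automatically; see the sketch further below.)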

The typical stack size on modern Linux systems is 8 MB. 768 * 768 * 3 * 4 bytes = 7,077,888 bytes < 2^23 bytes = 8 MB, while 768 * 768 * 4 * 4 bytes = 9,437,184 bytes > 2^23 bytes = 8 MB.
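You can check the limit on your system with ulimit -s, or programmatically. A minimal sketch (Linux-only, using getrlimit() from <sys/resource.h>):

#include <sys/resource.h>
#include <cstdio>

int main()
{
    rlimit rl{};
    if (getrlimit(RLIMIT_STACK, &rl) == 0)
    {
        // rlim_cur is the soft limit; typically 8388608 (8 MB) on desktop Linux.
        // It may also be RLIM_INFINITY if the limit has been raised.
        std::printf("stack soft limit: %llu bytes\n", (unsigned long long)rl.rlim_cur);
    }
    return 0;
}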

Therefore the stack is not the right place for large data objects, as Robert Crovella already pointed out. Use the heap instead, i.e. dynamic allocation with malloc() or new.
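To make the fix concrete, here is a minimal, self-contained sketch (names taken from the question) showing the offending buffer moved to the heap; everything else in the original program stays the same, just pass h_data.data() wherever h_data was used before:

#include <vector>
#include <iostream>

int main()
{
    const int array_size = 768;
    const int channels = 4;

    // 768 * 768 * 4 ints = 9,437,184 bytes -- too big for an 8 MB stack,
    // but perfectly fine on the heap.
    std::vector<int> h_data(static_cast<size_t>(array_size) * array_size * channels, 0);

    std::cout << "allocated " << h_data.size() * sizeof(int) << " bytes on the heap\n";
    return 0;
}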