cuBLAS program crashes

Hey, I’m relatively new to CUDA development. However, I managed to get a simple matrix-vector multiplication program that uses cuBLAS to work. Here’s the code snippet:

// includes, system

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <iostream>

/* Using updated (v2) interfaces to cublas and cusparse */

#include <cuda_runtime_api.h>

#include <cublas_v2.h>

int main(void){

	const int M = 40000 ;

	const int N = 6 ;

	// variables declaration

	float x[N], c[M] ;

	float *dev_x, *dev_c ;

	float matrix[M][N] ;

	float *dev_matrix ;

	// allocate the memory on the GPU

	cudaMalloc( (void**)&dev_x, N*sizeof(float) ) ;

	cudaMalloc( (void**)&dev_c, M*sizeof(float) ) ;

	cudaMalloc( (void**)&dev_matrix, M*N*sizeof(float) ) ;

	// fill the array x and initiate c by zero on the CPU

	for (int i=0; i<N; i++) {

		x[i] = (float)(0.1*i) ;

		c[i] = 0 ;

	}

	// fill in the matrix

	for(int i=0; i<M; i++){

		for(int j=0; j<N; j++){

			//matrix[i][j] = (float)((0.6*i+0.4*j)/(0.8*i+0.2*j)) ;

			matrix[i][j] = (float)(0.1*i*j) ;

		}

	}

	// copy the arrays 'a' and 'b' to the GPU

	cudaMemcpy( dev_x, x, N*sizeof(float), cudaMemcpyHostToDevice ) ;

	cudaMemcpy( dev_c, c, M*sizeof(float), cudaMemcpyHostToDevice ) ;

	cudaMemcpy( dev_matrix, matrix, M*N*sizeof(float), cudaMemcpyHostToDevice ) ;

/* Get handle to the CUBLAS context */

        cublasHandle_t cublasHandle = 0 ;

        cublasStatus_t cublasStatus ;

        cublasStatus = cublasCreate(&cublasHandle) ;

	cublasOperation_t trans = CUBLAS_OP_N ;

	float alpha = 1.0 ;

	float beta = 0.0 ;

	cublasStatus = cublasSgemv(cublasHandle, trans, M, N, &alpha, dev_matrix, M, dev_x, 1, &beta, dev_c, 1) ;

	if(cublasStatus != CUBLAS_STATUS_SUCCESS) printf("%s \n","cuBLAS SGEMV Error") ;

	// copy result back to host memory

	cudaMemcpy( c, dev_c, M*sizeof(float), cudaMemcpyDeviceToHost ) ;   // c = alpha*matrix*a + beta*c

	// free device memory

	cudaFree(dev_x);

        cudaFree(dev_matrix);

	cudaFree(dev_c);

	return 0 ;

}

The program WORKS FINE and gives correct results for small matrix dimensions (around 250,000 elements); however, when I try to increase the matrix dimensions, the program crashes on execution: I get the Windows message "PROGRAM_NAME.exe has stopped working" and the console shows a blank screen. Is this a limitation of my GPU memory (GeForce GT 240), or is something else wrong?