Error while running 1024x1024 matrix multiplication code on the GPU.

I have written the following code for matrix multiplication.
It fails with two errors on a GTX 480:

  1. Device driver stopped working.
  2. Device driver sync error.

Can anybody explain to me what is wrong?

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <conio.h>
#include <stdlib.h>
#define size 1024
#define bsize 32
 
// Each thread computes one element of c; the running sum is kept in a
// shared-memory cell (the tiles of a and b are not staged in shared memory).
__global__ void matrixmul(float *c, float *a, float *b)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;   // column index
    int j = blockIdx.y*blockDim.y + threadIdx.y;   // row index
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    __shared__ float temp[bsize][bsize];
    temp[tx][ty] = 0;
    for (int k = 0; k < size; k++) {
        temp[tx][ty] += a[k + j*size] * b[i + k*size];
    }
    c[i + j*size] = temp[tx][ty];
}

int main()
{
  float *a;
  float *b;
  float *c;
  dim3 blocksize, gridsize;
  float *dev_a, *dev_b, *dev_c;
  cudaError_t cudaStatus;

    // Host memory allocation
    a = (float *) malloc(size*size * sizeof(float));
    b = (float *) malloc(size*size * sizeof(float));
    c = (float *) malloc(size*size * sizeof(float));

    // Initialization
    for (int i = 0; i < size*size; i++)
    {
        a[i] = 1;
        b[i] = 1;
    }

    // Allocate GPU buffers for the three matrices (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, size*size * sizeof(float));
    if (cudaStatus != cudaSuccess) {
        printf("cudaMalloc failed!");
    }

    cudaStatus = cudaMalloc((void**)&dev_a, size*size * sizeof(float));
    if (cudaStatus != cudaSuccess) {
        printf("cudaMalloc failed!");
    }

    cudaStatus = cudaMalloc((void**)&dev_b, size*size * sizeof(float));
    if (cudaStatus != cudaSuccess) {
        printf("cudaMalloc failed!");
    }

    // Copy input matrices from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size*size * sizeof(float), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        printf("cudaMemcpy failed host to device a!");
    }

    cudaStatus = cudaMemcpy(dev_b, b, size*size * sizeof(float), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        printf("cudaMemcpy failed host to device b!");
    }

    cudaMemset(dev_c, 0, size*size * sizeof(float));

    // Launch the kernel on the GPU with one thread for each output element.
    blocksize = dim3(bsize, bsize, 1);
    gridsize  = dim3(size/bsize, size/bsize, 1);
    matrixmul<<<gridsize, blocksize>>>(dev_c, dev_a, dev_b);

    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        printf("cudaDeviceSynchronize error after launching matrixmul!\n");
        return 1;
    }

	
    // Copy the output matrix from the GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size*size * sizeof(float), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        printf("cudaMemcpy failed device to host!");
    }

    /*for (int i = 0; i < size*size; i++)
    {
        printf("%0.0f  ", c[i]);
        if ((i+1) % size == 0)
            printf("\n");
    }*/

    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);


    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        printf("cudaDeviceReset failed!");
        return 1;
    }

	//getch();
    
    return 0;
}

I was able to compile and run this code without any errors. I assume you have a problem with your configuration; maybe try updating your driver?

You are probably triggering the driver watchdog. Try reducing the size to see if this is the problem (the time for SGEMM is proportional to 2*N*M*K). While writing matrix multiplication code is a good exercise, for production code you should consider calling CUBLAS.
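
A minimal sketch of what the CUBLAS call could look like, assuming the same dev_a, dev_b, dev_c device buffers as in your code (the helper name sgemm_cublas is just for illustration). CUBLAS stores matrices column-major, so the two inputs are passed swapped to get the row-major product C = A*B:

#include <cuda_runtime.h>
#include <cublas_v2.h>

// Computes C = A * B for n x n row-major matrices already resident on the GPU.
// Column-major cuBLAS sees our arrays as transposed, so C^T = B^T * A^T,
// which is exactly the row-major C = A * B when B and A are passed swapped.
void sgemm_cublas(float *dev_c, const float *dev_a, const float *dev_b, int n)
{
    cublasHandle_t handle;
    cublasCreate(&handle);

    const float alpha = 1.0f;
    const float beta  = 0.0f;

    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                n, n, n,
                &alpha,
                dev_b, n,    // first operand of sgemm = our B
                dev_a, n,    // second operand of sgemm = our A
                &beta,
                dev_c, n);

    cudaDeviceSynchronize();  // sgemm is asynchronous; wait for the result
    cublasDestroy(handle);
}

You would call this in place of the matrixmul launch and link with -lcublas. Since CUBLAS SGEMM runs much closer to peak throughput than a naive kernel, a 1024x1024 multiply should finish in a few milliseconds and stay well clear of the watchdog.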

Thank you for your reply @mfactia.
I tried reducing the size: the program runs fine up to 256, but it gives the error for anything larger.
Can I increase the watchdog time, or must I use CUBLAS?
I also get the error when running under the Visual Profiler, so is there any way to solve that problem and measure the execution time?
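
In case it helps, this is a rough sketch of what I have in mind for timing just the kernel with cudaEvent (reusing gridsize, blocksize, and the device pointers from my code above), if that is an acceptable alternative to the profiler:

    // Time only the kernel with CUDA events (fragment to drop into main above).
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, 0);
    matrixmul<<<gridsize, blocksize>>>(dev_c, dev_a, dev_b);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);        // wait until the kernel has finished

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    printf("matrixmul took %f ms\n", ms);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

That way the kernel time comes from the events themselves rather than from the profiler.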