I have written the following code for matrix multiplication.
I get two errors when running it on a GTX 480:
- The device driver stopped working.
- A device driver sync error (cudaDeviceSynchronize fails after the kernel launch).
Can anybody explain what is wrong?
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include<conio.h>
#include<stdlib.h>
#define size 1024
#define bsize 32
// Computes c = a * b for row-major size x size float matrices.
//
// Launch layout: 2D grid of 2D blocks. The x dimension selects the output
// column (i) and the y dimension selects the output row (j); each thread
// produces exactly one element c[j][i]. Threads whose (i, j) fall outside the
// matrix do nothing, so the grid may overshoot the matrix edge.
//
// No shared memory is used: the running sum is private to each thread, so it
// lives in a register. (The previous version kept it in a __shared__ array
// indexed [threadIdx.x][threadIdx.y], which places every thread of a warp in
// the same shared-memory bank and serialises each accumulate step whenever
// the compiler does not promote it to a register, e.g. in debug builds.)
__global__ void matrixmul(float *c, float *a, float *b)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;  // output column
    const int j = blockIdx.y * blockDim.y + threadIdx.y;  // output row

    // Guard the grid tail so an oversized launch cannot read/write out of bounds.
    if (i >= size || j >= size)
        return;

    float sum = 0.0f;
    for (int k = 0; k < size; k++) {
        // Row j of a times column i of b. Across a warp (consecutive i) the
        // a load is a broadcast and the b load is contiguous.
        sum += a[k + j * size] * b[i + k * size];
    }
    c[i + j * size] = sum;
}
// Reports a failed CUDA runtime call together with the runtime's own error
// text. Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    printf("%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host driver: fills two size x size matrices with ones, multiplies them on
// the GPU with matrixmul, and copies the product back to the host.
// Returns 0 on success and 1 if any host allocation or CUDA call fails.
// All host and device buffers are released on every exit path.
int main()
{
    const size_t bytes = (size_t)size * size * sizeof(float);
    float *a = NULL, *b = NULL, *c = NULL;
    float *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;
    // One thread per output element; size is assumed to be a multiple of
    // bsize so that the grid covers the matrix exactly.
    dim3 blocksize(bsize, bsize, 1);
    dim3 gridsize(size / bsize, size / bsize, 1);
    int exitCode = 1;  // pessimistic until every step has succeeded

    // Host memory allocation
    a = (float *) malloc(bytes);
    b = (float *) malloc(bytes);
    c = (float *) malloc(bytes);
    if (a == NULL || b == NULL || c == NULL) {
        printf("host malloc failed!\n");
        goto cleanup;
    }

    // Initialization: with all-ones inputs every element of the product
    // should equal size.
    for (int i = 0; i < size * size; i++) {
        a[i] = 1.0f;
        b[i] = 1.0f;
    }

    // Allocate GPU buffers for the three matrices (two input, one output).
    if (!cudaOk(cudaMalloc((void**)&dev_c, bytes), "cudaMalloc dev_c")) goto cleanup;
    if (!cudaOk(cudaMalloc((void**)&dev_a, bytes), "cudaMalloc dev_a")) goto cleanup;
    if (!cudaOk(cudaMalloc((void**)&dev_b, bytes), "cudaMalloc dev_b")) goto cleanup;

    // Copy input matrices from host memory to GPU buffers.
    if (!cudaOk(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice),
                "cudaMemcpy host to device a")) goto cleanup;
    if (!cudaOk(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice),
                "cudaMemcpy host to device b")) goto cleanup;
    if (!cudaOk(cudaMemset(dev_c, 0, bytes), "cudaMemset dev_c")) goto cleanup;

    // Launch the kernel on the GPU with one thread for each output element.
    matrixmul <<<gridsize, blocksize>>> (dev_c, dev_a, dev_b);

    // A launch returns no status itself: an invalid configuration is only
    // visible through cudaGetLastError().
    if (!cudaOk(cudaGetLastError(), "matrixmul launch")) goto cleanup;

    // Faults that happen while the kernel runs surface at the next sync.
    // NOTE(review): on Windows a kernel that runs longer than the display
    // watchdog (TDR) limit is killed by the driver and is reported here --
    // presumably the "driver stopped working" symptom; confirm on the target.
    if (!cudaOk(cudaDeviceSynchronize(),
                "cudaDeviceSynchronize after launching matrixmul")) goto cleanup;

    // Copy output matrix from GPU buffer to host memory.
    if (!cudaOk(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost),
                "cudaMemcpy device to host")) goto cleanup;

    /*for(int i=0;i<size*size;i++)
    {
    printf("%0.0f ", c[i]);
    if((i+1)%size ==0)
    printf("\n");
    } */

    exitCode = 0;

cleanup:
    // cudaFree and free both accept null pointers, so this is safe no matter
    // how far the setup got. Their statuses are deliberately ignored: after a
    // failed kernel they would only repeat the error already reported.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    free(a);
    free(b);
    free(c);

    // cudaDeviceReset must be called before exiting in order for profiling
    // and tracing tools to show complete traces.
    if (!cudaOk(cudaDeviceReset(), "cudaDeviceReset"))
        exitCode = 1;

    //getch();
    return exitCode;
}