unspecified launch failure kernel executes fine , but it gives error while copying back from devi

#include<stdio.h>
#include<stdlib.h>
#include<cuda.h>
//#include “/usr/cuda/common/inc/cutil.h”
//#include “/usr/cuda/common/inc/cutil_inline.h”
#include<time.h>

#define blocksize 16

void cudaErrorTest(char *msg);

global void mul(float *d_a,float *d_b,float *d_c,int size) {

int Row = blockIdx.y * blockDim.x + threadIdx.y;
int Col = blockIdx.x * blockDim.x + threadIdx.x;

for (int k = 0; k < blockDim.x; ++k)
{
d_c[RowCol+k] += d_a[Rowk+Col] * d_b[k*Col+Row];
}

}

int main(int argc,char *argv[2]) {

float *h_a,*h_b,*h_c,*d_a,*d_b,d_c,duration;
int size=atoi(argv[1]);
int ms=size
size,i,j;
clock_t start,end;

h_a=(float )malloc(mssizeof(float));
h_b=(float )malloc(mssizeof(float));
h_c=(float )malloc(mssizeof(float));

cudaMalloc((void**)&d_a,mssizeof(float));
cudaMalloc((void**)&d_b,ms
sizeof(float));
cudaMalloc((void**)&d_c,ms*sizeof(float));
cudaErrorTest(“Memory Allocation on Device”);

for(i=0;i<size;i++) {
for(j=0;j<size;j++) {
h_a[i*size+j]=rand()%1000;
h_b[i*size+j]=rand()%1000;
h_c[i*size+j]=0;
}
}

cudaMemcpy(d_a,h_a,mssizeof(float),cudaMemcpyHostToDevice);
cudaMemcpy(d_b,h_b,ms
sizeof(float),cudaMemcpyHostToDevice);
cudaMemcpy(d_c,h_c,ms*sizeof(float),cudaMemcpyHostToDevice);
cudaErrorTest(“Memory Copy to device failed”);

// create and start timer
// cudaEvent_t start;
start=clock();

dim3 dimblock(blocksize,blocksize),dimgrid((int)(ms/256 + (ms%256 == 0 ?0:1)),(int)(ms/256 + (ms%256 == 0 ?0:1)));

mul<<<dimgrid,dimblock>>>(d_a,d_b,d_c,size);
cudaErrorTest(“Kernel Invocation failed”);
cudaThreadSynchronize();
cudaErrorTest(“Kernel Invocation failed”);
//cudaEvent_t end;
end=clock();

//cudaError_t cudaEventElapsedTime(duration,start,end);
duration=difftime(end,start)/CLOCKS_PER_SEC;
printf(“Processing time: %f (s) \n”,duration);

cudaMemcpy(h_c,d_c,ms*sizeof(float),cudaMemcpyDeviceToHost);
cudaErrorTest(“Memory Copy to device failed 2”);

printf(“Done !!! Success!! \n”);

cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
free(h_a);
free(h_b);
free(h_c);

}

void cudaErrorTest(char *msg) {
cudaError_t err=cudaGetLastError();

if(cudaSuccess != err) {
printf("cuda error : %s : %s ",msg,cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
}

am i making any logical errors , 'm new to CUDA , i need some help from u guys to get started
it runs fine and for matrix multiplication using global memory , its taking about 1.16 seconds for a 4000x4000 matrix , i’m not sure how fast this is
but my cpu on parallelisation takes just .46 seconds

Thanks in advance