#include<stdio.h>
#include<stdlib.h>
#include<cuda.h>
//#include “/usr/cuda/common/inc/cutil.h”
//#include “/usr/cuda/common/inc/cutil_inline.h”
#include<time.h>
#define blocksize 16
void cudaErrorTest(char *msg);
global void mul(float *d_a,float *d_b,float *d_c,int size) {
int Row = blockIdx.y * blockDim.x + threadIdx.y;
int Col = blockIdx.x * blockDim.x + threadIdx.x;
for (int k = 0; k < blockDim.x; ++k)
{
d_c[RowCol+k] += d_a[Rowk+Col] * d_b[k*Col+Row];
}
}
int main(int argc,char *argv[2]) {
float *h_a,*h_b,*h_c,*d_a,*d_b,d_c,duration;
int size=atoi(argv[1]);
int ms=sizesize,i,j;
clock_t start,end;
h_a=(float )malloc(mssizeof(float));
h_b=(float )malloc(mssizeof(float));
h_c=(float )malloc(mssizeof(float));
cudaMalloc((void**)&d_a,mssizeof(float));
cudaMalloc((void**)&d_b,mssizeof(float));
cudaMalloc((void**)&d_c,ms*sizeof(float));
cudaErrorTest(“Memory Allocation on Device”);
for(i=0;i<size;i++) {
for(j=0;j<size;j++) {
h_a[i*size+j]=rand()%1000;
h_b[i*size+j]=rand()%1000;
h_c[i*size+j]=0;
}
}
cudaMemcpy(d_a,h_a,mssizeof(float),cudaMemcpyHostToDevice);
cudaMemcpy(d_b,h_b,mssizeof(float),cudaMemcpyHostToDevice);
cudaMemcpy(d_c,h_c,ms*sizeof(float),cudaMemcpyHostToDevice);
cudaErrorTest(“Memory Copy to device failed”);
// create and start timer
// cudaEvent_t start;
start=clock();
dim3 dimblock(blocksize,blocksize),dimgrid((int)(ms/256 + (ms%256 == 0 ?0:1)),(int)(ms/256 + (ms%256 == 0 ?0:1)));
mul<<<dimgrid,dimblock>>>(d_a,d_b,d_c,size);
cudaErrorTest(“Kernel Invocation failed”);
cudaThreadSynchronize();
cudaErrorTest(“Kernel Invocation failed”);
//cudaEvent_t end;
end=clock();
//cudaError_t cudaEventElapsedTime(duration,start,end);
duration=difftime(end,start)/CLOCKS_PER_SEC;
printf(“Processing time: %f (s) \n”,duration);
cudaMemcpy(h_c,d_c,ms*sizeof(float),cudaMemcpyDeviceToHost);
cudaErrorTest(“Memory Copy to device failed 2”);
printf(“Done !!! Success!! \n”);
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
free(h_a);
free(h_b);
free(h_c);
}
void cudaErrorTest(char *msg) {
cudaError_t err=cudaGetLastError();
if(cudaSuccess != err) {
printf("cuda error : %s : %s ",msg,cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
}
am i making any logical errors , 'm new to CUDA , i need some help from u guys to get started
it runs fine and for matrix multiplication using global memory , its taking about 1.16 seconds for a 4000x4000 matrix , i’m not sure how fast this is
but my cpu on parallelisation takes just .46 seconds
Thanks in advance