Hi all,

I am new to CUDA, so bear with me. I am using a Quadro NVS 290 card to do a simple matrix multiplication. I am timing both my kernel execution and an equivalent serial algorithm running on the CPU with CUDA event timers. The problem is that my serial CPU code is actually running faster than my kernel code. Any help would be greatly appreciated. The following are my kernel invocation, my kernel code, and my serial CPU code:

//kernel invoke:

// Launch layout: 2 blocks of 500 threads, i.e. one thread per element of a
// 1000-element row.
dim3 dimBlock(500);
dim3 dimGrid(2);

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start, 0);

// One launch per row: the kernel's third argument selects the row it scales.
for (int x = 0; x < 1000; x++) matrix_Mult<<<dimGrid, dimBlock>>>(d_Matrix, d_Multiplier, x);

// Record the stop event directly after the launches so that the device-to-host
// copy below is not part of the measured interval.
cudaEventRecord(stop, 0);

// Block the host until every launch queued before the stop event has finished.
cudaEventSynchronize(stop);

float elapsedTime;  // milliseconds between the start and stop events
cudaEventElapsedTime(&elapsedTime, start, stop);

// Fetch the result only after timing is complete.
cudaMemcpy(h_Matrix, d_Matrix, matrix_Size, cudaMemcpyDeviceToHost);

// Release the timing events now that the elapsed time has been read.
cudaEventDestroy(start);
cudaEventDestroy(stop);

//kernel code:

/*********************************************
 * Kernel to run Matrix Multiply
 *
 * matrix is a 1000 X 1000 array
 *
 * Scales one row of the matrix in place: every element of row x is
 * multiplied by the multiplier entry for its column.
 *
 * matrix     - 1000 X 1000 array, updated in place
 * multiplier - one factor per column
 * x          - index of the row processed by this launch
 *
 * Expects a 1D launch with at least 1000 threads in total
 * (for example 2 blocks of 500 threads); surplus threads do nothing.
 *********************************************/
__global__
void matrix_Mult(float* matrix, float* multiplier, int x)
{
    // Column handled by this thread, derived from the actual block width
    // rather than a hard-coded 500.
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard against launches whose total thread count exceeds the row length.
    if (col >= 1000) return;

    const int idx = x * 1000 + col;

    // Store the product (not the bare multiplier) back into the matrix.
    matrix[idx] = matrix[idx] * multiplier[col];
}

//serial cpu code:

// Serial reference: scales every row of a 1000 x 1000 matrix in place,
// multiplying the element in column c by multiplier[c].
void matrixMultiply( float* matrix, float* multiplier)
{
    for (int r = 0; r < 1000; ++r)
    {
        // Start of the current row inside the flat array.
        float* row = matrix + r * 1000;

        for (int c = 0; c < 1000; ++c)
        {
            row[c] *= multiplier[c];
        }
    }
}