Hello there.
I am using an 8800 GTX to test a matrix multiplication application. I wrote my first version without using shared memory, and I cannot get the matrix multiplication result back.
The surprising thing is that I have tested my program on a Tesla C870 and it runs perfectly there.
Please help, this kind of thing is driving me crazy :(
Does anyone know of a bug related to this?
This is my kernel
// Naive matrix multiplication C = A * B using global memory only.
//
// Each thread computes a single element of C: the one at
// row = blockIdx.y * BLOCK_SIZE + threadIdx.y and
// col = blockIdx.x * BLOCK_SIZE + threadIdx.x.
//
// Parameters:
//   C  - output matrix, indexed row-major with wB floats per row
//   A  - left operand, indexed row-major with wA floats per row
//   B  - right operand, indexed row-major with wB floats per row
//   wA - width of A; also the number of terms in each dot product,
//        so B must have wA rows
//   wB - width of B (and of C)
//
// There is no bounds check: the launch grid must cover C exactly, i.e.
// the matrix dimensions are assumed to be multiples of BLOCK_SIZE.
__global__ void matrixMul (float* C, float* A, float* B, int wA, int wB)
{
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int bx = blockIdx.x;
    int by = blockIdx.y;

    // First element of this thread's row of A.
    int indexA = by * BLOCK_SIZE * wA + ty * wA;
    // First element of this thread's column of B.
    int indexB = bx * BLOCK_SIZE + tx;
    // Top-left element of the C tile handled by this block.
    int indexC = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;

    // Float literal: a bare 0.0 would be a double.
    float aux = 0.0f;

    // Walk along the row of A (stride 1) and down the column of B
    // (stride wB). The trip count is the shared dimension wA rather than
    // a hard-coded 4096, so the kernel is correct for any matrix size.
    for (int i = 0; i < wA; i++) {
        aux += A[indexA] * B[indexB];
        indexA++;
        indexB += wB;
    }

    // No barrier is needed before the store: no shared memory is used and
    // every thread writes only its own element of C.
    C[indexC + wB * ty + tx] = aux;
}
And this is the call to the kernel
// copy host memory to device
CUDA_SAFE_CALL(cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL(cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice) );
// allocate host and device memory for the result matrix C (WC * HC floats)
unsigned int size_C = WC * HC;
unsigned int mem_size_C = sizeof(float) * size_C;
float * h_C = (float *) malloc (mem_size_C);
float* d_C;
CUDA_SAFE_CALL(cudaMalloc((void**) &d_C, mem_size_C));
// setup execution parameters
// One thread per element of C. The integer divisions truncate, so WC and HC
// are assumed to be multiples of BLOCK_SIZE (the kernel has no bounds check).
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
dim3 grid(WC / threads.x, HC / threads.y);
// create and start timer
unsigned int timer = 0;
CUT_SAFE_CALL(cutCreateTimer(&timer));
CUT_SAFE_CALL(cutStartTimer(timer));
// execute the kernel
matrixMul<<< grid, threads >>>(d_C, d_A, d_B, WA, WB);
// Wait for the kernel to finish and check the result of the wait, so that
// errors raised during execution are reported here instead of being dropped.
// NOTE(review): on a GPU that also drives a display, a long-running kernel
// may be aborted by the driver watchdog; if so, this is presumably where the
// launch-timeout error would show up - confirm on the failing machine.
CUDA_SAFE_CALL(cudaThreadSynchronize());
// check if kernel execution generated an error
CUT_CHECK_ERROR("Kernel execution failed");
// copy result from device to host
// cudaMemcpy returns a cudaError_t, so it must be checked with
// CUDA_SAFE_CALL; CUT_SAFE_CALL is for the cut* utility functions.
CUDA_SAFE_CALL(cudaMemcpy(h_C, d_C, mem_size_C,
                          cudaMemcpyDeviceToHost) );
Cheers