anybody can help me with the example in chaper 6? do follow the example on p&G but failed

i copy the example in chapter6 in Programming Guide but can not get the right result…
anybody can tell me where i am wrong?

template.cu file is as follow:

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <cutil.h>
#include “cublas.h”
#include <template_kernel.cu>
#define n2 275275
extern “C”
void Mul(const float
A,const float* B,int hA,int wA,int wB,float* C);
global void Muld(float*,float*,int,int,float*);
int
main( int argc, char** argv)
{
int wA=275;
int hA=275;
int wB=275;
int wC=275;

FILE fp,p,q;
fp=fopen(“c://datatest4.txt”,“rb”);
p=fopen(“c://datatest5.txt”,“rb”);
q=fopen(“c://datatest6.txt”,“rb”);
float
h_A;
float
h_B;
float
h_C;
h_A = (float*)malloc(n2 * sizeof(h_A[0]));
h_B = (float*)malloc(n2 * sizeof(h_B[0]));
h_C = (float*)malloc(n2 * sizeof(h_C[0]));
/* Fill the matrices with test data /
for (int i = 0; i < 275
275; i++)
{
fscanf(fp,“%f”,&h_A[i]);
fscanf(p,“%f”,&h_B[i]);
fscanf(q,“%f”,&h_C[i]);
}
Mul(h_A,h_B, hA, wA, wB,h_C);
printf(“TEST PASS”);
getchar();

}

void Mul(const float* A,const float* B,int hA,int wA,int wB,float* C)
{
int size;
float* Ad;
size=hAwAsizeof(float);
cudaMalloc((void**)&Ad,size);
cudaMemcpy(Ad,A,size,cudaMemcpyHostToDevice);
float* Bd;
size=wAwBsizeof(float);
cudaMalloc((void**)&Bd,size);
cudaMemcpy(Bd,B,size,cudaMemcpyHostToDevice);
float Cd;
size=hA
wB*sizeof(float);
cudaMalloc((void**)&Cd,size);
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
dim3 dimGrid(wB/dimBlock.x,hA/dimBlock.y);
Muld<<<dimGrid,dimBlock>>>(Ad,Bd,wA,wB,Cd);
cudaMemcpy(C,Cd,size,cudaMemcpyDeviceToHost);
cudaFree(Ad);
cudaFree(Bd);
cudaFree(Cd);
}

template_kernel.cu is as this:

#ifndef TEMPLATE_KERNEL_H
#define TEMPLATE_KERNEL_H
#define BLOCK_SIZE 16
#include <stdio.h>
#define SDATA( index) CUT_BANK_CHECKER(sdata, index)
global void
Muld(floatA,float B,int wA,int wB,float* C)
{
int bx=blockIdx.x;
int by=blockIdx.y;

int tx=threadIdx.x;
int ty=threadIdx.y;

int aBegin=wABLOCK_SIZEby;
int aEnd=aBegin+wA-1;
int aStep=BLOCK_SIZE;
int bBegin=BLOCK_SIZEbx;
int bStep=BLOCK_SIZE
wB;
float Csub=0;

for(int a=aBegin,b=bBegin;a<aEnd;a+=aStep,b+=bStep)
{

__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

As[ty][tx]=A[a+wA*ty+tx];
Bs[ty][tx]=B[b+wB*ty+tx];
__syncthreads();

for(int k=0;k<BLOCK_SIZE;++k)
Csub+=As[ty][k]*Bs[k][tx];
__syncthreads();
}

int c=wBBLOCK_SIZEby+BLOCK_SIZEbx;
C[c+wB
ty+tx]=Csub;
}
#endif // #ifndef TEMPLATE_KERNEL_H

this programme can pass the compile ,but the result is wrong!

anybody can help me?

anybody can help me?

you can try emudebug, see where goes wrong.
Also be more specific of what’s wrong…

Please note that the example in Chapter 6 is really for the transposes

of the matrices. So, C = B * A, rather than A * B.