anybody can help me with the example in chaper 6? anybody can help me with the example?

i copy the example in chapter6 in Programming Guide but can not get the right result…
anybody can tell me where i am wrong?

template.cu file is as follow:

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <cutil.h>
#include “cublas.h”
#include <template_kernel.cu>
#define n2 275275
extern “C”
void Mul(const float
A,const float* B,int hA,int wA,int wB,float* C);
global void Muld(float*,float*,int,int,float*);
int
main( int argc, char** argv)
{
int wA=275;
int hA=275;
int wB=275;
int wC=275;

FILE fp,p,q;
fp=fopen(“c://datatest4.txt”,“rb”);
p=fopen(“c://datatest5.txt”,“rb”);
q=fopen(“c://datatest6.txt”,“rb”);
float
h_A;
float
h_B;
float
h_C;
h_A = (float*)malloc(n2 * sizeof(h_A[0]));
h_B = (float*)malloc(n2 * sizeof(h_B[0]));
h_C = (float*)malloc(n2 * sizeof(h_C[0]));
/* Fill the matrices with test data /
for (int i = 0; i < 275
275; i++)
{
fscanf(fp,"%f",&h_A[i]);
fscanf(p,"%f",&h_B[i]);
fscanf(q,"%f",&h_C[i]);
}
Mul(h_A,h_B, hA, wA, wB,h_C);
printf(“TEST PASS”);
getchar();

}

void Mul(const float* A,const float* B,int hA,int wA,int wB,float* C)
{
int size;
float* Ad;
size=hAwAsizeof(float);
cudaMalloc((void**)&Ad,size);
cudaMemcpy(Ad,A,size,cudaMemcpyHostToDevice);
float* Bd;
size=wAwBsizeof(float);
cudaMalloc((void**)&Bd,size);
cudaMemcpy(Bd,B,size,cudaMemcpyHostToDevice);
float Cd;
size=hA
wB*sizeof(float);
cudaMalloc((void**)&Cd,size);
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
dim3 dimGrid(wB/dimBlock.x,hA/dimBlock.y);
Muld<<<dimGrid,dimBlock>>>(Ad,Bd,wA,wB,Cd);
cudaMemcpy(C,Cd,size,cudaMemcpyDeviceToHost);
cudaFree(Ad);
cudaFree(Bd);
cudaFree(Cd);
}

template_kernel.cu is as this:

#ifndef TEMPLATE_KERNEL_H
#define TEMPLATE_KERNEL_H
#define BLOCK_SIZE 16
#include <stdio.h>
#define SDATA( index) CUT_BANK_CHECKER(sdata, index)
global void
Muld(floatA,float B,int wA,int wB,float* C)
{
int bx=blockIdx.x;
int by=blockIdx.y;

int tx=threadIdx.x;
int ty=threadIdx.y;

int aBegin=wABLOCK_SIZEby;
int aEnd=aBegin+wA-1;
int aStep=BLOCK_SIZE;
int bBegin=BLOCK_SIZEbx;
int bStep=BLOCK_SIZE
wB;
float Csub=0;

for(int a=aBegin,b=bBegin;a<aEnd;a+=aStep,b+=bStep)
{

shared float As[BLOCK_SIZE][BLOCK_SIZE];
shared float Bs[BLOCK_SIZE][BLOCK_SIZE];

As[ty][tx]=A[a+wA*ty+tx];
Bs[ty][tx]=B[b+wB*ty+tx];
__syncthreads();

for(int k=0;k<BLOCK_SIZE;++k)
Csub+=As[ty][k]*Bs[k][tx];
__syncthreads();
}

int c=wBBLOCK_SIZEby+BLOCK_SIZEbx;
C[c+wB
ty+tx]=Csub;
}
#endif // #ifndef TEMPLATE_KERNEL_H

this programme can pass the compile ,but the result is wrong!

anybody can help me?

u already posted this AND quoted yourself in reply to it on the other post

http://forums.nvidia.com/index.php?showtopic=62085&hl=

dont double post

you mite want to give more info that wrong result, give some sample data and results.