i copy the example in chapter6 in Programming Guide but can not get the right result…
anybody can tell me where i am wrong?
template.cu file is as follow:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <cutil.h>
#include “cublas.h”
#include <template_kernel.cu>
#define n2 275275
extern “C”
void Mul(const float A,const float* B,int hA,int wA,int wB,float* C);
global void Muld(float*,float*,int,int,float*);
int
main( int argc, char** argv)
{
int wA=275;
int hA=275;
int wB=275;
int wC=275;
FILE fp,p,q;
fp=fopen(“c://datatest4.txt”,“rb”);
p=fopen(“c://datatest5.txt”,“rb”);
q=fopen(“c://datatest6.txt”,“rb”);
float h_A;
float h_B;
float h_C;
h_A = (float*)malloc(n2 * sizeof(h_A[0]));
h_B = (float*)malloc(n2 * sizeof(h_B[0]));
h_C = (float*)malloc(n2 * sizeof(h_C[0]));
/* Fill the matrices with test data /
for (int i = 0; i < 275275; i++)
{
fscanf(fp,“%f”,&h_A[i]);
fscanf(p,“%f”,&h_B[i]);
fscanf(q,“%f”,&h_C[i]);
}
Mul(h_A,h_B, hA, wA, wB,h_C);
printf(“TEST PASS”);
getchar();
}
void Mul(const float* A,const float* B,int hA,int wA,int wB,float* C)
{
int size;
float* Ad;
size=hAwAsizeof(float);
cudaMalloc((void**)&Ad,size);
cudaMemcpy(Ad,A,size,cudaMemcpyHostToDevice);
float* Bd;
size=wAwBsizeof(float);
cudaMalloc((void**)&Bd,size);
cudaMemcpy(Bd,B,size,cudaMemcpyHostToDevice);
float Cd;
size=hAwB*sizeof(float);
cudaMalloc((void**)&Cd,size);
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
dim3 dimGrid(wB/dimBlock.x,hA/dimBlock.y);
Muld<<<dimGrid,dimBlock>>>(Ad,Bd,wA,wB,Cd);
cudaMemcpy(C,Cd,size,cudaMemcpyDeviceToHost);
cudaFree(Ad);
cudaFree(Bd);
cudaFree(Cd);
}
template_kernel.cu is as this:
#ifndef TEMPLATE_KERNEL_H
#define TEMPLATE_KERNEL_H
#define BLOCK_SIZE 16
#include <stdio.h>
#define SDATA( index) CUT_BANK_CHECKER(sdata, index)
global void
Muld(floatA,float B,int wA,int wB,float* C)
{
int bx=blockIdx.x;
int by=blockIdx.y;
int tx=threadIdx.x;
int ty=threadIdx.y;
int aBegin=wABLOCK_SIZEby;
int aEnd=aBegin+wA-1;
int aStep=BLOCK_SIZE;
int bBegin=BLOCK_SIZEbx;
int bStep=BLOCK_SIZEwB;
float Csub=0;
for(int a=aBegin,b=bBegin;a<aEnd;a+=aStep,b+=bStep)
{
shared float As[BLOCK_SIZE][BLOCK_SIZE];
shared float Bs[BLOCK_SIZE][BLOCK_SIZE];
As[ty][tx]=A[a+wA*ty+tx];
Bs[ty][tx]=B[b+wB*ty+tx];
__syncthreads();
for(int k=0;k<BLOCK_SIZE;++k)
Csub+=As[ty][k]*Bs[k][tx];
__syncthreads();
}
int c=wBBLOCK_SIZEby+BLOCK_SIZEbx;
C[c+wBty+tx]=Csub;
}
#endif // #ifndef TEMPLATE_KERNEL_H
this programme can pass the compile ,but the result is wrong!
anybody can help me?