anybody can help me with the example in chaper 6? do follow the example on p&G but failed

dadada · March 12, 2008, 1:36pm

i copy the example in chapter6 in Programming Guide but can not get the right result…
anybody can tell me where i am wrong?

template.cu file is as follow:

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <cutil.h>
#include “cublas.h”
#include <template_kernel.cu>
#define n2 275275
extern “C”
void Mul(const float A,const float* B,int hA,int wA,int wB,float* C);
global void Muld(float*,float*,int,int,float*);
int
main( int argc, char** argv)
{
int wA=275;
int hA=275;
int wB=275;
int wC=275;

FILE fp,p,q;
fp=fopen(“c://datatest4.txt”,“rb”);
p=fopen(“c://datatest5.txt”,“rb”);
q=fopen(“c://datatest6.txt”,“rb”);
float h_A;
float h_B;
float h_C;
h_A = (float*)malloc(n2 * sizeof(h_A[0]));
h_B = (float*)malloc(n2 * sizeof(h_B[0]));
h_C = (float*)malloc(n2 * sizeof(h_C[0]));
/* Fill the matrices with test data /
for (int i = 0; i < 275275; i++)
{
fscanf(fp,“%f”,&h_A[i]);
fscanf(p,“%f”,&h_B[i]);
fscanf(q,“%f”,&h_C[i]);
}
Mul(h_A,h_B, hA, wA, wB,h_C);
printf(“TEST PASS”);
getchar();

}

void Mul(const float* A,const float* B,int hA,int wA,int wB,float* C)
{
int size;
float* Ad;
size=hAwAsizeof(float);
cudaMalloc((void**)&Ad,size);
cudaMemcpy(Ad,A,size,cudaMemcpyHostToDevice);
float* Bd;
size=wAwBsizeof(float);
cudaMalloc((void**)&Bd,size);
cudaMemcpy(Bd,B,size,cudaMemcpyHostToDevice);
float Cd;
size=hAwB*sizeof(float);
cudaMalloc((void**)&Cd,size);
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
dim3 dimGrid(wB/dimBlock.x,hA/dimBlock.y);
Muld<<<dimGrid,dimBlock>>>(Ad,Bd,wA,wB,Cd);
cudaMemcpy(C,Cd,size,cudaMemcpyDeviceToHost);
cudaFree(Ad);
cudaFree(Bd);
cudaFree(Cd);
}

template_kernel.cu is as this:

#ifndef TEMPLATE_KERNEL_H
#define TEMPLATE_KERNEL_H
#define BLOCK_SIZE 16
#include <stdio.h>
#define SDATA( index) CUT_BANK_CHECKER(sdata, index)
global void
Muld(floatA,float B,int wA,int wB,float* C)
{
int bx=blockIdx.x;
int by=blockIdx.y;

int tx=threadIdx.x;
int ty=threadIdx.y;

int aBegin=wABLOCK_SIZEby;
int aEnd=aBegin+wA-1;
int aStep=BLOCK_SIZE;
int bBegin=BLOCK_SIZEbx;
int bStep=BLOCK_SIZEwB;
float Csub=0;

for(int a=aBegin,b=bBegin;a<aEnd;a+=aStep,b+=bStep)
{

__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

As[ty][tx]=A[a+wA*ty+tx];
Bs[ty][tx]=B[b+wB*ty+tx];
__syncthreads();

for(int k=0;k<BLOCK_SIZE;++k)
Csub+=As[ty][k]*Bs[k][tx];
__syncthreads();
}

int c=wBBLOCK_SIZEby+BLOCK_SIZEbx;
C[c+wBty+tx]=Csub;
}
#endif // #ifndef TEMPLATE_KERNEL_H

this programme can pass the compile ,but the result is wrong!

anybody can help me?

dadada · March 13, 2008, 12:59am

anybody can help me?

i copy the example in chapter6 in Programming Guide but can not get the right result…

anybody can tell me where i am wrong?

template.cu file is as follow:

include <stdlib.h>

include <stdio.h>

include <string.h>

include <math.h>

include <cutil.h>

include “cublas.h”

include <template_kernel.cu>

define n2 275*275

extern “C”

void Mul(const float* A,const float* B,int hA,int wA,int wB,float* C);

global void Muld(float*,float*,int,int,float*);

int

main( int argc, char** argv)

{

int wA=275;

int hA=275;

int wB=275;

int wC=275;

FILE *fp,*p,*q;

fp=fopen(“c://datatest4.txt”,“rb”);

p=fopen(“c://datatest5.txt”,“rb”);

q=fopen(“c://datatest6.txt”,“rb”);

float* h_A;

float* h_B;

float* h_C;

h_A = (float*)malloc(n2 * sizeof(h_A[0]));

h_B = (float*)malloc(n2 * sizeof(h_B[0]));

h_C = (float*)malloc(n2 * sizeof(h_C[0]));

/* Fill the matrices with test data */

for (int i = 0; i < 275*275; i++)

{

fscanf(fp,“%f”,&h_A[i]);

fscanf(p,“%f”,&h_B[i]);

fscanf(q,“%f”,&h_C[i]);

}

Mul(h_A,h_B, hA, wA, wB,h_C);

printf(“TEST PASS”);

getchar();

}

void Mul(const float* A,const float* B,int hA,int wA,int wB,float* C)

{

int size;

float* Ad;

size=hAwAsizeof(float);

cudaMalloc((void**)&Ad,size);

cudaMemcpy(Ad,A,size,cudaMemcpyHostToDevice);

float* Bd;

size=wAwBsizeof(float);

cudaMalloc((void**)&Bd,size);

cudaMemcpy(Bd,B,size,cudaMemcpyHostToDevice);

float *Cd;

size=hAwBsizeof(float);

cudaMalloc((void**)&Cd,size);

dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);

dim3 dimGrid(wB/dimBlock.x,hA/dimBlock.y);

Muld<<<dimGrid,dimBlock>>>(Ad,Bd,wA,wB,Cd);

cudaMemcpy(C,Cd,size,cudaMemcpyDeviceToHost);

cudaFree(Ad);

cudaFree(Bd);

cudaFree(Cd);

}

template_kernel.cu is as this:

#ifndef TEMPLATE_KERNEL_H

define TEMPLATE_KERNEL_H

define BLOCK_SIZE 16

include <stdio.h>

define SDATA( index) CUT_BANK_CHECKER(sdata, index)

global void

Muld(floatA,float B,int wA,int wB,float* C)

{

int bx=blockIdx.x;

int by=blockIdx.y;

int tx=threadIdx.x;

int ty=threadIdx.y;

int aBegin=wABLOCK_SIZEby;

int aEnd=aBegin+wA-1;

int aStep=BLOCK_SIZE;

int bBegin=BLOCK_SIZE*bx;

int bStep=BLOCK_SIZE*wB;

float Csub=0;

for(int a=aBegin,b=bBegin;a<aEnd;a+=aStep,b+=bStep)

{
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
shared float Bs[BLOCK_SIZE][BLOCK_SIZE];
As[ty][tx]=A[a+wA*ty+tx];

Bs[ty][tx]=B[b+wB*ty+tx];

__syncthreads();
for(int k=0;k<BLOCK_SIZE;++k)

Csub+=As[ty][k]*Bs[k][tx];

__syncthreads();

}

int c=wBBLOCK_SIZEby+BLOCK_SIZE*bx;

C[c+wB*ty+tx]=Csub;

}

endif // #ifndef TEMPLATE_KERNEL_H

this programme can pass the compile ,but the result is wrong!

anybody can help me?

[snapback]341693[/snapback]

Hella_Yu · March 13, 2008, 1:21am

you can try emudebug, see where goes wrong.
Also be more specific of what’s wrong…

nasacort · March 13, 2008, 6:19pm

i copy the example in chapter6 in Programming Guide but can not get the right result…

anybody can tell me where i am wrong?

template.cu file is as follow:

include <stdlib.h>

include <stdio.h>

include <string.h>

include <math.h>

include <cutil.h>

include “cublas.h”

include <template_kernel.cu>

define n2 275*275

extern “C”

void Mul(const float* A,const float* B,int hA,int wA,int wB,float* C);

global void Muld(float*,float*,int,int,float*);

int

main( int argc, char** argv)

{

int wA=275;

int hA=275;

int wB=275;

int wC=275;

FILE *fp,*p,*q;

fp=fopen(“c://datatest4.txt”,“rb”);

p=fopen(“c://datatest5.txt”,“rb”);

q=fopen(“c://datatest6.txt”,“rb”);

float* h_A;

float* h_B;

float* h_C;

h_A = (float*)malloc(n2 * sizeof(h_A[0]));

h_B = (float*)malloc(n2 * sizeof(h_B[0]));

h_C = (float*)malloc(n2 * sizeof(h_C[0]));

/* Fill the matrices with test data */

for (int i = 0; i < 275*275; i++)

{

fscanf(fp,“%f”,&h_A[i]);

fscanf(p,“%f”,&h_B[i]);

fscanf(q,“%f”,&h_C[i]);

}

Mul(h_A,h_B, hA, wA, wB,h_C);

printf(“TEST PASS”);

getchar();

}

void Mul(const float* A,const float* B,int hA,int wA,int wB,float* C)

{

int size;

float* Ad;

size=hAwAsizeof(float);

cudaMalloc((void**)&Ad,size);

cudaMemcpy(Ad,A,size,cudaMemcpyHostToDevice);

float* Bd;

size=wAwBsizeof(float);

cudaMalloc((void**)&Bd,size);

cudaMemcpy(Bd,B,size,cudaMemcpyHostToDevice);

float *Cd;

size=hAwBsizeof(float);

cudaMalloc((void**)&Cd,size);

dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);

dim3 dimGrid(wB/dimBlock.x,hA/dimBlock.y);

Muld<<<dimGrid,dimBlock>>>(Ad,Bd,wA,wB,Cd);

cudaMemcpy(C,Cd,size,cudaMemcpyDeviceToHost);

cudaFree(Ad);

cudaFree(Bd);

cudaFree(Cd);

}

template_kernel.cu is as this:

#ifndef TEMPLATE_KERNEL_H

define TEMPLATE_KERNEL_H

define BLOCK_SIZE 16

include <stdio.h>

define SDATA( index) CUT_BANK_CHECKER(sdata, index)

global void

Muld(floatA,float B,int wA,int wB,float* C)

{

int bx=blockIdx.x;

int by=blockIdx.y;

int tx=threadIdx.x;

int ty=threadIdx.y;

int aBegin=wABLOCK_SIZEby;

int aEnd=aBegin+wA-1;

int aStep=BLOCK_SIZE;

int bBegin=BLOCK_SIZE*bx;

int bStep=BLOCK_SIZE*wB;

float Csub=0;

for(int a=aBegin,b=bBegin;a<aEnd;a+=aStep,b+=bStep)

{
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
shared float Bs[BLOCK_SIZE][BLOCK_SIZE];
As[ty][tx]=A[a+wA*ty+tx];

Bs[ty][tx]=B[b+wB*ty+tx];

__syncthreads();
for(int k=0;k<BLOCK_SIZE;++k)

Csub+=As[ty][k]*Bs[k][tx];

__syncthreads();

}

int c=wBBLOCK_SIZEby+BLOCK_SIZE*bx;

C[c+wB*ty+tx]=Csub;

}

endif // #ifndef TEMPLATE_KERNEL_H

this programme can pass the compile ,but the result is wrong!

anybody can help me?

[snapback]341693[/snapback]

Please note that the example in Chapter 6 is really for the transposes

of the matrices. So, C = B * A, rather than A * B.