hi…
I am sorry if I sound so dumb. Here is my trouble: I understand that we cannot compare float in C/C++. But how can I verify my CUDA output against CPU computed result? My second CUDA program is matrix multiplication (duh?) I compared results (host and CUDA output) and got out unscathed. But when I crank the input matrices size to 2 ^ 7 my results break apart :( . Here is the listing. I know it is not a great code. Don’t flame me, I am learning XD. I have also attached the diff file. Thanks for your time.
__global__ void MatMul(float *a, float *b, float *c, const unsigned int N)
{
unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int j = threadIdx.y + blockIdx.y * blockDim.y;
//if(i < N && j < N) really required ??
{
unsigned int k = 0;//blockIdx.y * blockDim.y, kEnd = blockIdx.y * (1 + blockDim.y);
unsigned int temp = i * N;
for(k = 0; k < N; ++k) c[temp + j] += a[temp + k] * b[k * N + j];
}
return;
}
void HostMatMul(float *a, float *b, float *c, const unsigned int N)
{
for(unsigned int i = 0; i < N; ++i)
{
unsigned int temp = i * N;
for(unsigned int j = 0; j < N; ++j)
{
unsigned int idx = temp + j;
c[idx] = 0.0;
for(unsigned int k = 0; k < N; ++k)
{
c[idx] += a[temp + k] * b[k * N + j];
}
//file >> c[idx];
}
}
//file.close();
return;
}
int HostVerifyResult(float *a, float *b, const unsigned int N)
{
float iRet = 0.0f;
unsigned int i, j, temp;
for(i = 0; i < N; ++i)
for(j = 0, temp = i * N; j < N; ++j)
{
iRet = a[temp + j] - b[temp +j];
if(iRet != 0.0f) cout << i << " x " << j << " : " << b[i * N + j] << " vs " << a[i * N + j] << " = " << iRet << endl;
}
return 0;
}
int main()
{
float *aD, *bD, *cD, *aH, *bH, *cH, *aVerify;
const unsigned int shiftFactor = 7; // breaks at 7. anything < 7 passes
const unsigned int N = 1 << shiftFactor;
const unsigned int matN = N * N;
const unsigned int memSize = sizeof(float) * N * N;
aH = (float*) malloc(memSize);
bH = (float*) malloc(memSize);
cH = (float*) malloc(memSize);
aVerify = (float*) malloc(memSize);
if(aH && bH && cH)
{
unsigned int i = N * N - 1;
do
{
bH[i] = aH[i] = (float) i;
cH[i] = 0.0f;
}while(i--);
const unsigned int n_threadsx = warpSize / 2;
const unsigned int n_threadsy = warpSize / 2;
const unsigned int n_threads = n_threadsx * n_threadsy;
const unsigned int n_blocks = matN / n_threads + ((matN % n_threads) ? 1 : 0);
cout <<"blocks : " << n_blocks << "\n";
dim3 dimBlocks (n_threadsx, n_threadsy);
dim3 dimGrid (1, n_blocks);
if(n_blocks > 1)
{
if(n_blocks > maxGridDim) cout << "WARNING thread per element policy cannot be satisfied!!\n";
else
{
dimGrid.y = 2 * N / warpSize;
dimGrid.x = 2 * N / WarpSize;
}
}
cout << "Grid Dim : " << dimGrid.x << " " << dimGrid.y << " " << dimGrid.z << endl;
cout << "Block Dim : " << dimBlocks.x << " " << dimBlocks.y << " " << dimBlocks.z << endl;
cudaMalloc((void**) &aD, memSize);
cudaMalloc((void**) &bD, memSize);
cudaMalloc((void**) &cD, memSize);
cudaMemcpy(aD, aH, memSize, cudaMemcpyHostToDevice);
cudaMemcpy(bD, bH, memSize, cudaMemcpyHostToDevice);
cudaMemcpy(cD, cH, memSize, cudaMemcpyHostToDevice);
MatMul <<<dimGrid, dimBlocks>>> (aD, bD, cD, N);
cudaThreadSynchronize(); // a necessary ?
cudaMemcpy(aVerify, cD, memSize, cudaMemcpyDeviceToHost);
HostMatMul(aH, bH, cH, N);
HostVerifyResult(cH, aVerify, N);
free(aH);
free(bH);
free(cH);
cudaFree(aD);
cudaFree(bD);
cudaFree(cD);
}
return 0;
}
EDIT:
It’s just a single bit difference. May be I should use smaller numbers… Will try that soon :)
res.zip (37.2 KB)