I did not find a separate “newbie” section. So I am posting this here. Hope I don’t offend the masters with my silly question.
I am new to CUDA programming. I started out by implementing a simple matrix multiplication algorithm. When I compile it for “device emulation” (-deviceemul) in my laptop(withiut graphics card), the binary is executing correctly and I am getting the correct answers. But when I compile it for my Desktop, installed with GeForce 8400 GS, with the option ‘-arch=sm_11’ and execute the binary I am getting junk values as answer. Am I missing something here?
These are the relevent details from PC with graphics card:
root@mtech-desktop:/home/mtech# nvcc -V
nvcc: NVIDIA ® Cuda compiler driver
Copyright © 2005-2010 NVIDIA Corporation
Built on Mon_Jun__7_18:10:28_PDT_2010
Cuda compilation tools, release 3.1, V0.2.1221
root@mtech-desktop:/home/mtech# uname -a
Linux mtech-desktop 2.6.32-21-generic #32-Ubuntu SMP Fri Apr 16 08:10:02 UTC 2010 i686 GNU/Linux
Graphics Processor:GeForce 8400 GS
CUDA Cores:8
VBIOS Version:62.98.3c.00.00
Memory:512 MB
Memory Interface:64-bit
Bus Type:PCI Express x16 Gen1
These are the relevent details from my laptop
nvcc: NVIDIA ® Cuda compiler driver
Copyright © 2005-2009 NVIDIA Corporation
Built on Thu_Jul__2_10:56:25_PDT_2009
Cuda compilation tools, release 2.3, V0.2.1221
This is the code I am trying to execute:
#include <stdio.h>
#include <cuda.h>
global void myKernel(int* deviceArrayPtrA,int* deviceArrayPtrB,int* deviceArrayPtrC, int pitchA,int pitchB,int pitchC,int k)
{
int i = threadIdx.x;
int j = threadIdx.y;
//
//int k=blockDim.x;
int* rowA=(int*)((char*)deviceArrayPtrA+jpitchA);
int rowC=(int*)((char*)deviceArrayPtrC+jpitchC);
int rowB;
/**/
for(int r=0;r<k;r++)
{
rowB=(int*)((char*)deviceArrayPtrB+r*pitchB);
rowC[i]=rowC[i] +(rowA[r]*rowB[i]);
}
}
main()
{
int* deviceArrayPtrA;
int* deviceArrayPtrB;
int* deviceArrayPtrC;
int cmn;
int wA,wB,hA,hB; //widths and heights of two matrices
size_t devicePitchA,devicePitchB,devicePitchC, hostPitchA,hostPitchB,hostPitchC, width, height;
int hostArrayA[50][50],hostArrayB[50][50],hostArrayC[50][50];
//width = 3;
//height = 3;
printf(“\nEnter Matrix A dimensions (Width Height):”);
scanf(“%d %d”,&wA,&hA);
printf(“\nEnter Matrix B dimensions(Width Height):”);
scanf(“%d %d”,&wB,&hB);
if(wA!=hB)
{
printf(“\n!!!Cannot muliply matrix!!!”);
printf(“\nExiting…”);
exit(1);
}
cmn=wA;
//initialize array A
for(int i = 0; i < hA; i++)
for(int j = 0; j < wA; j++)
{
printf(“Enter array element A[%d][%d]”,i,j);
scanf(“%d”,&hostArrayA[i][j]) ;
}
//initialize array B
for(int i = 0; i < hB; i++)
for(int j = 0; j < wB; j++)
{
printf(“Enter array element B[%d][%d]”,i,j);
scanf(“%d”,&hostArrayB[i][j]) ;
}
cudaMallocPitch((void**)&deviceArrayPtrA, &devicePitchA, wA * sizeof(int), hA);
cudaMallocPitch((void**)&deviceArrayPtrB, &devicePitchB, wB * sizeof(int), hB);
cudaMallocPitch((void**)&deviceArrayPtrC, &devicePitchC, wB * sizeof(int), hA);
printf(“\ndims %u %d”,width * sizeof(int), height);
printf(“\npitches are : %d %d %d\n”,devicePitchA,devicePitchB,devicePitchC);
hostPitchA = devicePitchA;
hostPitchB = devicePitchB;
hostPitchB = devicePitchB;
//Copies hostArray onto the pre-allocated device memory
cudaMemcpy2D(deviceArrayPtrA, devicePitchA, &hostArrayA, 50 * sizeof(int), wA * sizeof(int), hA, cudaMemcpyHostToDevice);
cudaMemcpy2D(deviceArrayPtrB, devicePitchB, &hostArrayB, 50 * sizeof(int), wB * sizeof(int), hB, cudaMemcpyHostToDevice);
dim3 threadperblock(wB,hA);
myKernel <<< 1,threadperblock >>> (deviceArrayPtrA,deviceArrayPtrB,deviceArrayPtrC, devicePitchA,devicePitchB,devicePitchC,cmn);
cudaMemcpy2D( &hostArrayC, 50 * sizeof(int),deviceArrayPtrC, devicePitchC, wB * sizeof(int), hA, cudaMemcpyDeviceToHost);
for(int i = 0; i < hA; i++) {
printf(“\n”);
for(int j = 0; j < wB; j++)
printf("%d ",hostArrayC[i][j]);
}
}
==========================================
AN this is the output:
Enter Matrix A dimensions (Width Height):2
2
Enter Matrix B dimensions(Width Height):2
2
Enter array element A[0][0]1
Enter array element A[0][1]2
Enter array element A[1][0]3
Enter array element A[1][1]4
Enter array element B[0][0]1
Enter array element B[0][1]2
Enter array element B[1][0]3
Enter array element B[1][1]4
dims 36424340 10314532
pitches are : 256 256 256
Answer:
27 30
35 42
=====================
Any help would be appreciated…