Drivers and Test Program? Do I need special drivers because of my OS?

I was informed by my instructor that any driver newer than a certain point should be sufficient to run CUDA applications, but I’m not sure if he was taking the fact that I’m running Linux into account. According to nvidia-settings, the version of the nVidia drivers I’m running is 173.14.09. My card is a GeForce 8600M GT, so it’s one of the supported cards AFAIK. Do I need to download a special Linux CUDA driver or should this be sufficient?

I ask because I’ve implemented a simple array addition program based on a template I had and it doesn’t appear that the nVidia card is actually doing any work; the array is all zeros when it’s done. If I compile using device emulation, the program works as expected. Is there a simple test binary out there I can run to see if my card is working at all?

Thanks!

[codebox]#include <stdio.h>

#include <stdlib.h>

using namespace std;

/**
 * Element-wise vector addition: pVectorC[i] = pVectorA[i] + pVectorB[i].
 *
 * Launch layout: 1D grid of 1D blocks. Each thread processes the single
 * element at index blockIdx.x * blockDim.x + threadIdx.x.
 *
 * Precondition: the kernel performs no bounds check, so every launched
 * thread reads and writes its element. All three pointers must therefore
 * reference device-accessible memory holding at least
 * gridDim.x * blockDim.x floats.
 *
 * @param pVectorA  first input vector (read-only)
 * @param pVectorB  second input vector (read-only)
 * @param pVectorC  output vector receiving the sums
 */
__global__ void vectorAdditionKernel( const float* pVectorA, const float* pVectorB, float* pVectorC)
{
    // Flat global thread index for a 1D launch.
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; // usually should use __mul24

    pVectorC[i] = pVectorA[i] + pVectorB[i];
}

// Prints a diagnostic to stderr when a CUDA runtime call did not return
// cudaSuccess. Returns true on success, false on failure.
static bool cudaCallSucceeded(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
        return true;

    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

/**
 * Adds two host vectors on the GPU and stores the result in a third host
 * vector, printing debug information along the way.
 *
 * Steps: allocate three device buffers, copy A and B to the device, launch
 * vectorAdditionKernel with 512-thread blocks, copy C back, free the device
 * buffers. Every 1024th element is printed before and after the computation.
 *
 * Any CUDA failure is reported on stderr; in that case the device buffers are
 * released, pHostVectorC may be left unmodified, and the result dump is
 * skipped.
 *
 * @param N             number of elements in each vector
 * @param pHostVectorA  first input vector (host memory, N floats)
 * @param pHostVectorB  second input vector (host memory, N floats)
 * @param pHostVectorC  output vector (host memory, N floats)
 */
void vectorAddition(unsigned N, const float* pHostVectorA, const float* pHostVectorB, float* pHostVectorC)
{
    printf("%u\n", N);
    printf("%p %p %p\n", (const void*)pHostVectorA, (const void*)pHostVectorB, (const void*)pHostVectorC);

    const unsigned BLOCKSIZE = 512;

    // Round the block count up so a trailing partial block is still launched.
    const unsigned BlockCount = N / BLOCKSIZE + (N % BLOCKSIZE != 0 ? 1u : 0u);

    // The kernel has no bounds check, so the device buffers are padded to one
    // element per launched thread; only the first N elements hold real data.
    const size_t ThreadCount = (size_t)BlockCount * BLOCKSIZE;
    const size_t HostBytes = (size_t)N * sizeof(float);
    const size_t VectorSize = ThreadCount * sizeof(float);

    // Sample every 1024th input pair.
    for (size_t i = 0; i < N; i += 1024)
    {
        printf("%f %f\n", pHostVectorA[i], pHostVectorB[i]);
    }

    printf("%u %lu %u %lu\n", BLOCKSIZE, (unsigned long)ThreadCount, BlockCount, (unsigned long)VectorSize);

    if (N == 0)
        return; // nothing to add; a zero-block launch is an invalid configuration

    float* pDeviceVectorA = 0;
    float* pDeviceVectorB = 0;
    float* pDeviceVectorC = 0;

    bool ok = cudaCallSucceeded(cudaMalloc((void**)&pDeviceVectorA, VectorSize), "cudaMalloc(A)")
           && cudaCallSucceeded(cudaMalloc((void**)&pDeviceVectorB, VectorSize), "cudaMalloc(B)")
           && cudaCallSucceeded(cudaMalloc((void**)&pDeviceVectorC, VectorSize), "cudaMalloc(C)");

    // Zero the padded tail of the inputs so the extra threads read defined values.
    if (ok && VectorSize > HostBytes)
    {
        const size_t TailBytes = VectorSize - HostBytes;
        ok = cudaCallSucceeded(cudaMemset(pDeviceVectorA + N, 0, TailBytes), "cudaMemset(A tail)")
          && cudaCallSucceeded(cudaMemset(pDeviceVectorB + N, 0, TailBytes), "cudaMemset(B tail)");
    }

    ok = ok
      && cudaCallSucceeded(cudaMemcpy(pDeviceVectorA, pHostVectorA, HostBytes, cudaMemcpyHostToDevice), "cudaMemcpy(A to device)")
      && cudaCallSucceeded(cudaMemcpy(pDeviceVectorB, pHostVectorB, HostBytes, cudaMemcpyHostToDevice), "cudaMemcpy(B to device)");

    if (ok)
    {
        vectorAdditionKernel<<<BlockCount,BLOCKSIZE>>>(pDeviceVectorA, pDeviceVectorB, pDeviceVectorC); // blocks

        // A launch does not return a status: configuration errors show up via
        // cudaGetLastError(), execution errors via the blocking copy that follows.
        ok = cudaCallSucceeded(cudaGetLastError(), "vectorAdditionKernel launch")
          && cudaCallSucceeded(cudaMemcpy(pHostVectorC, pDeviceVectorC, HostBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(C to host)");
    }

    // Release device memory on every path; cudaFree accepts a null pointer.
    cudaFree(pDeviceVectorA);
    cudaFree(pDeviceVectorB);
    cudaFree(pDeviceVectorC);

    if (!ok)
        return; // the error was already reported; do not print an invalid result

    // Sample every 1024th input pair together with its computed sum.
    for (size_t i = 0; i < N; i += 1024)
    {
        printf("%f %f %f\n", pHostVectorA[i], pHostVectorB[i], pHostVectorC[i]);
    }
}

/**
 * Test driver: fills two vectors of 1048576 floats with a[i] = b[i] = i,
 * passes them to vectorAddition() together with an output vector, and then
 * releases the host memory.
 *
 * @return 0 on normal completion, 1 if a host allocation fails
 */
int main(int argc, char** args)
{
    const unsigned SIZE = 1048576;

    float* a = (float*)malloc(sizeof(float)*SIZE);
    float* b = (float*)malloc(sizeof(float)*SIZE);
    float* c = (float*)malloc(sizeof(float)*SIZE);

    if (!a || !b || !c)
    {
        fprintf(stderr, "host allocation failed\n");
        // free(NULL) is a no-op, so releasing all three is safe here.
        free(a);
        free(b);
        free(c);
        return 1;
    }

    for (unsigned i = 0; i < SIZE; i++)
    {
        a[i] = (float)i;
        b[i] = (float)i;
    }

    vectorAddition(SIZE, a, b, c);

    /* for (int i=0;i<SIZE;i++)
    {
        printf("%f\n", c[i]);
    }*/

    free(a);
    free(b);
    free(c);

    return 0;
}[/codebox]