Drivers and Test Program? Do I need special drivers because of my OS?

I was informed by my instructor that any driver newer than a certain point should be sufficient to run CUDA applications, but I’m not sure if he was taking the fact that I’m running Linux into account. According to nvidia-settings, the version of the nVidia drivers I’m running is 173.14.09. My card is a GeForce 8600M GT, so it’s one of the supported cards AFAIK. Do I need to download a special Linux CUDA driver or should this be sufficient?

I ask because I’ve implemented a simple array-addition program based on a template I had, and the nVidia card doesn’t appear to be doing any work: the output array is all zeros when the program finishes. If I compile using device emulation, the program works as expected. Is there a simple test binary out there that I can run to check whether my card is working at all?


[codebox]#include <stdio.h>

#include <stdlib.h>

using namespace std;

/*
 * Element-wise vector addition: pVectorC[i] = pVectorA[i] + pVectorB[i].
 *
 * Launch layout: 1-D grid of 1-D blocks; each thread handles exactly one
 * element, at index blockIdx.x * blockDim.x + threadIdx.x. No shared memory
 * is used.
 *
 * Precondition: the signature carries no element count, so there is no
 * bounds guard in here. Every buffer must therefore hold at least
 * gridDim.x * blockDim.x floats, i.e. the caller has to size (or pad) the
 * device allocations to the full launch.
 *
 * pVectorA, pVectorB  device pointers to the input vectors (read only)
 * pVectorC            device pointer to the output vector
 */
__global__ void vectorAdditionKernel( const float* pVectorA, const float* pVectorB, float* pVectorC)
{
    // Flat global thread index. The original note suggests __mul24 for this
    // product; that is only a potential win on old compute-1.x parts, so the
    // plain multiply is kept.
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

    pVectorC[i] = pVectorA[i] + pVectorB[i];
}


// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaCallSucceeded(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
        return true;

    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Adds two host vectors on the GPU and stores the sum in pHostVectorC,
 * printing a sample (every 1024th element) of the inputs before and of the
 * inputs/result after the computation.
 *
 * N             number of elements in each vector
 * pHostVectorA  host pointer to the first input  (N floats)
 * pHostVectorB  host pointer to the second input (N floats)
 * pHostVectorC  host pointer receiving the sum   (N floats)
 *
 * Every CUDA runtime call is checked. On any failure the error is printed to
 * stderr, the device buffers are released, pHostVectorC is left as it was
 * (unless the failure is in the final device-to-host copy itself), and the
 * result sample is not printed. Without these checks a missing or too-old
 * driver shows up only as an output vector that was never written.
 */
void vectorAddition(unsigned N, const float* pHostVectorA, const float* pHostVectorB, float* pHostVectorC)
{
    printf("%u\n", N);

    // %p is the only portable way to print a pointer; %d truncates on 64-bit hosts.
    printf("%p %p %p\n", (const void*)pHostVectorA, (const void*)pHostVectorB, (const void*)pHostVectorC);

    if (N == 0)
        return; // nothing to add, and a zero-block launch is an invalid configuration

    const unsigned BLOCKSIZE = 512;

    // Ceiling division (written so that N + BLOCKSIZE cannot wrap around).
    const unsigned BlockCount = N / BLOCKSIZE + ((N % BLOCKSIZE != 0) ? 1u : 0u);

    // The kernel has no bounds guard, so the device buffers are padded up to
    // a whole number of blocks; only the first N elements are ever copied.
    const size_t ThreadCount = (size_t)BlockCount * BLOCKSIZE;
    const size_t VectorSize  = (size_t)N * sizeof(float);   // bytes moved host <-> device
    const size_t PaddedSize  = ThreadCount * sizeof(float); // bytes allocated per device buffer

    for (size_t i = 0; i < N; i += 1024)
        printf("%f %f\n", pHostVectorA[i], pHostVectorB[i]);

    printf("%u %lu %u %lu\n", BLOCKSIZE, (unsigned long)ThreadCount, BlockCount, (unsigned long)VectorSize);

    float* pDeviceVectorA = 0;
    float* pDeviceVectorB = 0;
    float* pDeviceVectorC = 0;

    // && short-circuits, so the first failing call stops the sequence.
    bool ok = cudaCallSucceeded(cudaMalloc((void**)&pDeviceVectorA, PaddedSize), "cudaMalloc(A)")
           && cudaCallSucceeded(cudaMalloc((void**)&pDeviceVectorB, PaddedSize), "cudaMalloc(B)")
           && cudaCallSucceeded(cudaMalloc((void**)&pDeviceVectorC, PaddedSize), "cudaMalloc(C)")
           && cudaCallSucceeded(cudaMemcpy(pDeviceVectorA, pHostVectorA, VectorSize, cudaMemcpyHostToDevice), "copy of A to device")
           && cudaCallSucceeded(cudaMemcpy(pDeviceVectorB, pHostVectorB, VectorSize, cudaMemcpyHostToDevice), "copy of B to device");

    if (ok && PaddedSize > VectorSize)
    {
        // Zero the padding so the surplus threads of the last block read defined values.
        ok = cudaCallSucceeded(cudaMemset(pDeviceVectorA + N, 0, PaddedSize - VectorSize), "cudaMemset(A padding)")
          && cudaCallSucceeded(cudaMemset(pDeviceVectorB + N, 0, PaddedSize - VectorSize), "cudaMemset(B padding)");
    }

    if (ok)
    {
        vectorAdditionKernel<<<BlockCount,BLOCKSIZE>>>(pDeviceVectorA, pDeviceVectorB, pDeviceVectorC); // blocks

        // A launch does not return a status; a bad configuration or an
        // unusable driver/device is reported through cudaGetLastError().
        ok = cudaCallSucceeded(cudaGetLastError(), "kernel launch");
    }

    if (ok)
    {
        // Blocking copy: waits for the kernel and surfaces any execution error.
        ok = cudaCallSucceeded(cudaMemcpy(pHostVectorC, pDeviceVectorC, VectorSize, cudaMemcpyDeviceToHost), "copy of C to host");
    }

    // Release device memory on every path; cudaFree on a null pointer is a no-op.
    cudaFree(pDeviceVectorA);
    cudaFree(pDeviceVectorB);
    cudaFree(pDeviceVectorC);

    if (!ok)
        return; // pHostVectorC holds no valid result, so do not print it

    for (size_t i = 0; i < N; i += 1024)
        printf("%f %f %f\n", pHostVectorA[i], pHostVectorB[i], pHostVectorC[i]);
}



int main(int argc, char** args)


const unsigned SIZE = 1048576;

float* a = (float*)malloc(sizeof(float)*SIZE);

float* b = (float*)malloc(sizeof(float)*SIZE);

float* c = (float*)malloc(sizeof(float)*SIZE);

for (int i=0;i<SIZE;i++)


	a[i] = i;

	b[i] = i;


vectorAddition(SIZE, a, b, c);

/* for (int i=0;i<SIZE;i++)


    printf("%f\n", c[i]);