I’m trying to convince a GeForce FX 5200 to work with CUDA and am having a bit of trouble, presumably because I can’t use the latest driver for Linux. It seems as though I am forced to use the driver contained in NVIDIA-Linux-x86-173.14.18-pkg1. However, when I compile a simple CUDA program with CUDA 2.1, the program gets a “feature is not yet implemented” error on the first cudaMalloc() call. I get the same result with CUDA 2.0. When I try to use CUDA 1.1, it spits out lots of C++-related errors even though it is a C program.
Is there a particular version of cuda that I should be using with this card? If the answer is cuda 1.1, is there a version of gcc or other compile tricks that are known to work with it? I am using Fedora 9.
Results with 2.1 in emulator mode - everything seems to work:
[codebox]$ make simple
/tmp/cuda-2.1/cuda/bin/nvcc -V
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2007 NVIDIA Corporation
Built on Wed_Dec__3_18:29:25_PST_2008
Cuda compilation tools, release 2.1, V0.2.1221
/tmp/cuda-2.1/cuda/bin/nvcc -deviceemu -g -c simple.o simple.cu
gcc -L/tmp/cuda-2.1/cuda/lib -Wl,-R/tmp/cuda-2.1/cuda/lib -lcudart -o simple simple.o
$ ./simple
1 + 3 = 4
2 + 4 = 6
[/codebox]
Results with 2.1 trying to use the GPU:
[codebox]$ make simple
/tmp/cuda-2.1/cuda/bin/nvcc -V
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2007 NVIDIA Corporation
Built on Wed_Dec__3_18:29:25_PST_2008
Cuda compilation tools, release 2.1, V0.2.1221
/tmp/cuda-2.1/cuda/bin/nvcc -g -c simple.o simple.cu
gcc -L/tmp/cuda-2.1/cuda/lib -Wl,-R/tmp/cuda-2.1/cuda/lib -lcudart -o simple simple.o
$ ./simple
cudaMalloc a: feature is not yet implemented
[/codebox]
Results with 2.0 trying to use the GPU:
[codebox]$ make simple
/tmp/cuda-2.0/cuda/bin/nvcc -V
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2007 NVIDIA Corporation
Built on Thu_Jun_19_04:48:21_PDT_2008
Cuda compilation tools, release 2.0, V0.2.1221
/tmp/cuda-2.0/cuda/bin/nvcc -g -c simple.o simple.cu
gcc -L/tmp/cuda-2.0/cuda/lib -Wl,-R/tmp/cuda-2.0/cuda/lib -lcudart -o simple simple.o
$ ./simple
cudaMalloc a: feature is not yet implemented
[/codebox]
However, 1.1 won’t even compile:
[codebox]$ make simple
/tmp/cuda-1.1/cuda/bin/nvcc -V
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2006 NVIDIA Corporation
Built on Fri_Nov_30_02:31:29_PST_2007
Cuda compilation tools, release 1.1, V0.2.1221
/tmp/cuda-1.1/cuda/bin/nvcc -deviceemu -g -c simple.o simple.cu
"/usr/lib/gcc/i386-redhat-linux/4.3.0/../../../../include/c++/4.3.0/i386-redhat-linux/bits/c++config.h", line 233: error:
expected a "{"
namespace std __attribute__ ((__visibility__ ("default"))) {
^
…
namespace __gnu_cxx __attribute__ ((__visibility__ ("default"))) {
^
Error limit reached.
100 errors detected in the compilation of "/tmp/tmpxft_0000218f_00000000-2.ii".
Compilation terminated.
Compilation terminated.
make: *** [simple.o] Error 255
[/codebox]
The source code is about as simple of an example as I can come up with that does proper error checking.
[codebox]#include <stdio.h>
#include <builtin_types.h>
/*
 * Element-wise byte addition kernel: g_res[i] = g_ia[i] + g_ib[i].
 *
 * Each thread handles exactly one element, selected by its flat 1D global
 * index (blockIdx.x * blockDim.x + threadIdx.x).
 *
 * Launch precondition: there is no bounds check, so the total number of
 * launched threads must not exceed the number of elements in each of the
 * three arrays. Only the x dimension of the grid/block is used for indexing.
 *
 * g_ia, g_ib  input arrays (dereferenced on the device)
 * g_res       output array (written on the device)
 *
 * The sum is stored into an unsigned char, so results above 255 are
 * truncated to their low 8 bits.
 */
__global__ void
add(unsigned char *g_ia, unsigned char *g_ib, unsigned char *g_res)
{
    const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

    g_res[i] = g_ia[i] + g_ib[i];
}
/*
 * Host driver: copies two small byte vectors to the device, runs the add
 * kernel with one thread per element, copies the result back and prints
 * each "a + b = r" line.
 *
 * Returns 0 on success. On any CUDA failure it prints the failing step and
 * the CUDA error string to stderr, releases whatever device memory was
 * already allocated, and returns 1.
 */
int main(int argc, char **argv) {
    unsigned char h_ia[] = {1,2};           /* host input */
    unsigned char h_ib[] = {3,4};           /* host input */
    unsigned char h_res[2] = {0,0};         /* host result */
    unsigned char *d_ia = NULL, *d_ib = NULL; /* device input data */
    unsigned char *d_res = NULL;            /* device result */
    const unsigned int count = sizeof(h_ia) / sizeof(h_ia[0]);
    const unsigned int memsize = count * sizeof(unsigned char);
    cudaError_t err;
    int status = 1;                         /* pessimistic until the end */

    /*
     * Execution configuration. Declared up front so that no "goto cleanup"
     * below jumps over an initialised object. The kernel has no bounds
     * check, so the thread count is tied to the element count.
     */
    dim3 grid(1, 1, 1);
    dim3 threads(count, 1, 1);

    (void) argc;
    (void) argv;

    /*
     * Allocate space for input data and copy to device
     */
    if ((err=cudaMalloc((void**) &d_ia, memsize)) != cudaSuccess) {
        fprintf(stderr, "cudaMalloc a: %s\n", cudaGetErrorString(err));
        goto cleanup;
    }
    if ((err=cudaMemcpy(d_ia, h_ia, memsize, cudaMemcpyHostToDevice)) !=
        cudaSuccess) {
        fprintf(stderr, "cudaMemcpy a: %s\n", cudaGetErrorString(err));
        goto cleanup;
    }
    /* err must be assigned here, otherwise the message reports the stale
     * (successful) status of the previous call. */
    if ((err=cudaMalloc((void**) &d_ib, memsize)) != cudaSuccess) {
        fprintf(stderr, "cudaMalloc b: %s\n", cudaGetErrorString(err));
        goto cleanup;
    }
    if ((err=cudaMemcpy(d_ib, h_ib, memsize, cudaMemcpyHostToDevice)) !=
        cudaSuccess) {
        fprintf(stderr, "cudaMemcpy b: %s\n", cudaGetErrorString(err));
        goto cleanup;
    }

    /*
     * Allocate space for output data
     */
    if ((err=cudaMalloc((void**) &d_res, memsize)) != cudaSuccess) {
        fprintf(stderr, "cudaMalloc r: %s\n", cudaGetErrorString(err));
        goto cleanup;
    }

    /*
     * Execute. The kernel uses no dynamic shared memory, so the third
     * launch argument is omitted.
     */
    add<<< grid, threads >>>(d_ia, d_ib, d_res);

    /* A launch does not return a status; a bad configuration is only
     * visible through cudaGetLastError(). */
    if ((err=cudaGetLastError()) != cudaSuccess) {
        fprintf(stderr, "add kernel launch: %s\n", cudaGetErrorString(err));
        goto cleanup;
    }

    /*
     * Copy result from device. This blocking copy is also the point at
     * which an error raised while the kernel was running is reported.
     */
    if ((err=cudaMemcpy(h_res, d_res, memsize, cudaMemcpyDeviceToHost))
        != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy r (d->h): %s\n",
                cudaGetErrorString(err));
        goto cleanup;
    }

    for (unsigned int i=0; i<count; i++) {
        printf("%d + %d = %d\n", h_ia[i], h_ib[i], h_res[i]);
    }
    status = 0;

cleanup:
    /*
     * Release device memory on every path. Pointers still NULL were never
     * allocated and are skipped.
     */
    if (d_res != NULL && (err=cudaFree(d_res)) != cudaSuccess) {
        fprintf(stderr, "cudaFree r: %s\n", cudaGetErrorString(err));
        status = 1;
    }
    if (d_ib != NULL && (err=cudaFree(d_ib)) != cudaSuccess) {
        fprintf(stderr, "cudaFree b: %s\n", cudaGetErrorString(err));
        status = 1;
    }
    if (d_ia != NULL && (err=cudaFree(d_ia)) != cudaSuccess) {
        fprintf(stderr, "cudaFree a: %s\n", cudaGetErrorString(err));
        status = 1;
    }
    return(status);
}
[/codebox]