I am running the following simple program:
#include "cuda.h"
#include "stdio.h"
#define SIZE 10
// Demo kernel: each thread fills one slot of A with zero and one slot of B
// with its own thread index, then stores their sum in the matching slot of C.
//
// Launch layout: indexes solely by threadIdx.x, so it is meant for a single
// one-dimensional block. There is no bounds check; the caller must launch no
// more threads than A, B and C have elements.
__global__ void vecAdd(float* A, float* B, float* C) {
    // threadIdx.x is a built-in variable provided by CUDA at runtime
    const int tid = threadIdx.x;

    A[tid] = 0.0f;
    B[tid] = static_cast<float>(tid);

    // Re-read both operands from memory, exactly as stored above.
    C[tid] = A[tid] + B[tid];
}
// Host driver for vecAdd: allocates three SIZE-element device buffers, copies
// the host inputs over, launches one block of SIZE threads, copies the result
// back and prints it.
//
// Returns 0 on success and 1 if any CUDA call or the kernel launch failed.
int main() {
    const int N = SIZE;

    // The host buffers must be arrays of SIZE floats: they are the source and
    // destination of SIZE * sizeof(float)-byte copies and C is indexed below.
    // Zero-initialise them so no indeterminate values are sent to the device.
    float A[SIZE] = {0};
    float B[SIZE] = {0};
    float C[SIZE] = {0};

    // Start as NULL so the unconditional cudaFree calls at the end are safe
    // even when an allocation never happened.
    float *devPtrA = NULL;
    float *devPtrB = NULL;
    float *devPtrC = NULL;

    const size_t memsize = SIZE * sizeof(float);
    cudaError_t cudaResult;

    // Each step runs only if every previous step succeeded, so cudaResult
    // always holds the first error encountered.
    cudaResult = cudaMalloc((void**)&devPtrA, memsize);
    if (cudaResult == cudaSuccess)
        cudaResult = cudaMalloc((void**)&devPtrB, memsize);
    if (cudaResult == cudaSuccess)
        cudaResult = cudaMalloc((void**)&devPtrC, memsize);
    if (cudaResult == cudaSuccess)
        cudaResult = cudaMemcpy(devPtrA, A, memsize, cudaMemcpyHostToDevice);
    if (cudaResult == cudaSuccess)
        cudaResult = cudaMemcpy(devPtrB, B, memsize, cudaMemcpyHostToDevice);

    if (cudaResult == cudaSuccess) {
        // __global__ functions are called: Func<<< Dg, Db, Ns >>>(parameter);
        vecAdd<<<1, N>>>(devPtrA, devPtrB, devPtrC);
        // A launch returns no status itself; configuration errors are
        // reported through cudaGetLastError().
        cudaResult = cudaGetLastError();
    }

    // The blocking copy also waits for the kernel, so its return code reports
    // failures that occurred while the kernel was executing.
    if (cudaResult == cudaSuccess)
        cudaResult = cudaMemcpy(C, devPtrC, memsize, cudaMemcpyDeviceToHost);

    if (cudaResult != cudaSuccess) {
        printf( "LAST ERROR: %s\n", cudaGetErrorString(cudaResult) );
    } else {
        for (int i = 0; i < SIZE; i++)
            printf("C[%d]=%f\n", i, C[i]);
    }

    // cudaFree on a NULL pointer is a no-op, so this is safe on every path.
    cudaFree(devPtrA);
    cudaFree(devPtrB);
    cudaFree(devPtrC);

    return (cudaResult == cudaSuccess) ? 0 : 1;
}
The program runs without errors if I compile like this:
nvcc -o simpleadd simpleadd.cu
However, I am running on a 64 bit Mac Pro and need to be able to specify my 64-bit architecture like this:
nvcc -Xcompiler "-arch x86_64" -o simpleadd simpleadd.cu
This gives the error:
error: -malign-double makes no sense in the 64bit mode
which through exhaustive searching I realized could be “fixed” by compiling like this:
nvcc -Xcompiler "-arch x86_64" --no-align-double -o simpleadd simpleadd.cu
So now I run using
./simpleadd
and see this error in my output:
LAST ERROR: too many resources requested for launch
Supposedly, this error refers to using too many registers, but I can’t imagine this program is stretching any resources of the Quadro 4000 for mac in this machine. Does anyone have any idea what might be causing this? In my real project, I am calling CUDA functions from MEX files, and their compilation requires that I compile with the “arch x86_64” flag because the linker auto-detects my system architecture, so I really need to solve this.
People on this forum indicated that the “--ptxas-options=-v” option could be used to get more information about resource usage. My results were:
ptxas info : Compiling entry function ‘_Z6vecAddPfS_S_’ for ‘sm_10’
ptxas info : Used 4 registers, 12+16 bytes smem
Thanks for any help.