Hello,
I am working my way through Cuda By Example and am having trouble with what should be a relatively straightforward computation. It seems as if when my arrays get to a certain size, I segfault immediately. Here is the code that segfaults:
#define N 700000
global void add( int* a, int* b, int* c) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
while (tid < N) {
c[tid] = a[tid] + b[tid];
tid += blockDim.x * gridDim.x;
}
}
int main (void) {
int devCount;
cudaGetDeviceCount(&devCount);
std::cout << "Device count: " << devCount << std::endl;
cudaDeviceProp prop;
for (int i = 0; i < devCount; i++) {
cudaGetDeviceProperties( &prop, i);
std::cout << "Name: " << prop.name << std::endl;
std::cout << "Clock rate: " << prop.clockRate << std::endl;
std::cout << "Multiprocessor count: " << prop.multiProcessorCount
<< std::endl;
std::cout << "Max threads per block: " << prop.maxThreadsPerBlock
<< std::endl;
}
int a[N], b[N], c[N];
int *dev_a, *dev_b, *dev_c;
auto tCuda = std::chrono::high_resolution_clock::now();
int status = cudaMalloc((void**)&dev_a, N * sizeof(int));
std::cout << status << std::endl;
status = cudaMalloc((void**)&dev_b, N * sizeof(int));
std::cout << status << std::endl;
cudaMalloc((void**)&dev_c, N * sizeof(int));
std::cout << status << std::endl;
for (int i = 0; i < N; i++) {
a[i] = -i;
b[i] = i * 2;
//c[i] = 0;
}
cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice);
add<<<200,200>>>(dev_a, dev_b, dev_c);
cudaThreadSynchronize();
cudaMemcpy(c,
dev_c,
N * sizeof(int),
cudaMemcpyDeviceToHost);
auto t = std::chrono::high_resolution_clock::now();
std::cout << "Time to run on gpu: "
<< std::chrono::duration<double,std::milli>(t-tCuda).count()
<< std::endl;
bool success = true;
for (int i = 0; i < N; i++) {
if ((a[i] + b[i]) != c[i]) {
std::cout << "Error: " << a[i] << " + " << b[i] << " != "
<< c[i] << std::endl;
success = false;
}
}
if (success) {
std::cout << “Success!” << std::endl;
}
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
return 0;
}
However, if I change N to 600000, then it works just fine. Is there a limit to an cuda memory allocation or copy? I am running on Ubuntu 14.04, using c++11, and my cuda-enable graphics card is a GeForce GTX 950. Any help or advice would be greatly appreciated.
Thanks,
Scott