I'm trying to get my recursive program to work on a machine with a C2050 and CUDA Toolkit 3.1. After debugging it with printf statements, I see that after about 13 or so iterations (with correct behavior), the kernel halts and execution returns to the CPU. I think the problem lies with the depth of recursion that is allowed. So, I wrote a simple program (code shown below). With n=66, everything works fine. With n=67, all results are 0. (Can anyone replicate the output to confirm?) This recursion depth seems surprisingly small. Is there a way to set a bigger stack (by means of flags, etc.)? Any help is appreciated.
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
// Recursive device helper used to exercise call depth.
// Despite the name it adds rather than multiplies: it returns 1 when
// fact <= 1, and fact + recFact(fact - 1) otherwise. One nested call is
// made per decrement of `fact`, so the recursion depth grows with `fact`.
__device__ int recFact(int fact) {
    // Base case ends the recursion; otherwise add this level to the rest.
    return (fact <= 1) ? 1 : fact + recFact(fact - 1);
}
// Kernel: each thread computes recFact of its own global index and stores
// the result in B at that index.
//
//   A - accepted for interface compatibility; not read by this kernel.
//   B - output buffer; must hold at least N ints.
//   N - number of valid elements in B.
//
// Uses 1D indexing (blockIdx.x / blockDim.x / threadIdx.x only). Threads
// whose index falls at or beyond N do nothing, so the launch may contain
// more threads than elements without writing past the end of B.
__global__ void vecMult_d(int *A, int *B, int N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N) {
        B[i] = recFact(i);
    }
}
// Prints a readable message and terminates when a CUDA runtime call fails.
// `what` names the operation being checked so the failure can be located.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Host driver: fills an input array, launches vecMult_d with one thread per
// element, copies the results back and prints them.
// Returns 0 on success, 1 if host allocation fails; any CUDA failure
// terminates the process via checkCuda.
int main() {
    const int n = 66;
    const size_t bytes = n * sizeof(int);

    // The kernel recurses roughly n levels deep in its highest-indexed
    // thread, so ask for a larger per-thread device stack before launching.
    // NOTE(review): 64 bytes per frame is a guess, not a measured value;
    // raise it if the kernel still faults for larger n.
    const size_t stackBytes = 1024 + (size_t) n * 64;
    checkCuda(cudaThreadSetLimit(cudaLimitStackSize, stackBytes),
              "cudaThreadSetLimit(cudaLimitStackSize)");

    int *a_h = (int*) malloc(bytes);
    int *b_h = (int*) malloc(bytes);
    if (a_h == NULL || b_h == NULL) {
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long) bytes);
        free(a_h);
        free(b_h);
        return 1;
    }
    for (int j = 0; j < n; j++) a_h[j] = j;

    int *a_d = NULL, *b_d = NULL;
    checkCuda(cudaMalloc((void**)&a_d, bytes), "cudaMalloc(a_d)");
    checkCuda(cudaMalloc((void**)&b_d, bytes), "cudaMalloc(b_d)");
    checkCuda(cudaMemcpy(a_d, a_h, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(a_d <- a_h)");

    // One block of exactly n threads: every element of b_d gets written and
    // no thread indexes past the buffers. The block size follows n, so
    // changing n cannot leave trailing elements unwritten. If n exceeds the
    // device's per-block thread limit the launch check below reports it.
    dim3 dimBlock(n); // Number of threads per block
    dim3 dimGrid(1);  // Number of blocks
    vecMult_d<<<dimGrid,dimBlock>>>(a_d,b_d,n);
    checkCuda(cudaGetLastError(), "vecMult_d launch");
    // Faults inside the kernel only surface at the next synchronizing call.
    checkCuda(cudaThreadSynchronize(), "vecMult_d execution");

    checkCuda(cudaMemcpy(b_h, b_d, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(b_h <- b_d)");
    checkCuda(cudaFree(a_d), "cudaFree(a_d)");
    checkCuda(cudaFree(b_d), "cudaFree(b_d)");

    for (int k = 0; k < n; k++) printf("B[%d] = %d\n", k, b_h[k]);

    free(a_h);
    free(b_h);
    return(0);
}