I am using malloc() inside kernels to allocate memory, but I found that the number of blocks that can be allocated is very limited, so I wrote the following code to test it.
With blocksize=4/8/16/32/64, the number of successful mallocs is 88K; with blocksize=128, it is 44K. Any further malloc() calls fail.
Is there a bug in the library, or am I doing something wrong?
I am using GTX480, NVCC 3.2, Ubuntu 10.10 x86_64.
#include <stdio.h>
#include "cuda.h"
#include <stdlib.h>
#include <iostream>
using namespace std;
// Launch configuration for the foo kernel.
const int grid = 1;    // number of thread blocks in the launch
const int block = 32;  // number of threads per block
// Device-heap probe: every thread calls device-side malloc() up to num_alloc
// times, requesting blocksize bytes each time, and prints the first iteration
// at which an allocation fails.
//
// Launch layout: 1-D grid of 1-D blocks; no shared memory is used.
// Requirements: device-side malloc() and printf() need compute capability
// 2.0 or newer.
// Note: the returned pointers are deliberately never freed, so that every
// allocation stays live while the probe runs. Device malloc() draws from a
// fixed-size heap whose size is controlled from the host through
// cudaLimitMallocHeapSize, which has to be set before the first kernel that
// uses malloc() is launched.
//
// num_alloc  maximum number of allocations attempted by each thread
// blocksize  size in bytes of each allocation request
__global__ void foo(int num_alloc, int blocksize){
    // Derive the global thread ID from the real launch configuration
    // (blockDim.x) rather than a host-side constant, so the reported ID is
    // correct for any block size the kernel is launched with.
    const int threadID = blockIdx.x * blockDim.x + threadIdx.x;

    for (int i = 0; i < num_alloc; i++) {
        void *ptr = malloc(blocksize);
        if (!ptr) {
            // Report only the first failure for this thread, then stop.
            printf("thread %d malloc failed when i=%d\n", threadID, i);
            break;
        }
    }
}
// Entry point for the device-malloc probe.
//
// Usage: <prog> n_allocs blocksize [heap_bytes]
//   n_allocs    allocations attempted by each thread (positive integer)
//   blocksize   bytes requested per allocation (positive integer)
//   heap_bytes  optional size of the device malloc heap in bytes; when
//               omitted the runtime's default heap size is used
//
// Launches foo<<<grid, block>>> and waits for it to finish.
// Returns 0 on success, 1 on bad usage or on any CUDA error.
//
// The cudaThread* calls match the toolkit generation this file was written
// for; on CUDA 4.0 and later the equivalent calls are cudaDeviceSetLimit and
// cudaDeviceSynchronize.
int main(int argc, char ** argv){
    if (argc != 3 && argc != 4) {
        cout << "usage: " << argv[0] << " n_allocs blocksize [heap_bytes]" << endl;
        return 1;
    }

    int n_alloc = atoi(argv[1]);
    int blocksize = atoi(argv[2]);
    // atoi() yields 0 for non-numeric text; a negative blocksize would be
    // converted to a huge size_t by malloc() inside the kernel.
    if (n_alloc <= 0 || blocksize <= 0) {
        cerr << "n_allocs and blocksize must be positive integers" << endl;
        return 1;
    }

    cudaError_t err;

    if (argc == 4) {
        size_t heap_bytes = strtoul(argv[3], NULL, 10);
        if (heap_bytes == 0) {
            cerr << "heap_bytes must be a positive integer" << endl;
            return 1;
        }
        // The heap size must be set before the first kernel that uses
        // device-side malloc() is launched.
        err = cudaThreadSetLimit(cudaLimitMallocHeapSize, heap_bytes);
        if (err != cudaSuccess) {
            cerr << "setting malloc heap size failed: "
                 << cudaGetErrorString(err) << endl;
            return 1;
        }
    }

    foo<<<grid,block>>>(n_alloc, blocksize);

    // A kernel launch returns no status; configuration errors show up here.
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        cerr << "kernel launch failed: " << cudaGetErrorString(err) << endl;
        return 1;
    }

    // Wait for the kernel; this also flushes device-side printf output and
    // reports errors raised while the kernel was running.
    err = cudaThreadSynchronize();
    if (err != cudaSuccess) {
        cerr << "kernel execution failed: " << cudaGetErrorString(err) << endl;
        return 1;
    }

    return 0;
}