#include <stdio.h>
#include <cuda.h>
#include <curand.h>
/*
 * Reproducer for device-memory usage around a cuRAND XORWOW generator.
 *
 * Steps: allocate a large float buffer on the device, fill it with uniform
 * pseudo-random numbers, destroy the generator, then attempt a second,
 * smaller device allocation.
 *
 * Every CUDA / cuRAND return code is checked. The first failure is reported
 * on stderr and the program skips straight to cleanup, so a failed step never
 * feeds an invalid pointer or generator handle into a later call.
 *
 * Returns 0 when every call succeeded, 1 otherwise.
 */
int main(void){
    /* All locals are declared up front so that `goto cleanup` never jumps
       over an initialization. */
    const size_t n1 = 150000000; // can go to 350000000 without curand
    const size_t n2 = 13000000;
    float *v1_d = NULL;          /* NULL so cleanup can free unconditionally */
    float *v2_d = NULL;
    curandGenerator_t gen1;
    int gen_alive = 0;           /* non-zero while gen1 holds a live generator */
    int rc = 1;                  /* pessimistic; set to 0 only on full success */
    cudaError_t cerr;
    curandStatus_t rerr;

    cerr = cudaMalloc((void **)&v1_d, n1 * sizeof(float));
    if (cerr != cudaSuccess) {
        fprintf(stderr, "error on malloc: %d (%s)\n", (int)cerr, cudaGetErrorString(cerr));
        v1_d = NULL;
        goto cleanup;
    }

    rerr = curandCreateGenerator(&gen1, CURAND_RNG_PSEUDO_XORWOW);
    if (rerr != CURAND_STATUS_SUCCESS) {
        fprintf(stderr, "error on CreateGenerator: %d\n", (int)rerr);
        goto cleanup;
    }
    gen_alive = 1;

    rerr = curandSetPseudoRandomGeneratorSeed(gen1, 1777ULL);
    if (rerr != CURAND_STATUS_SUCCESS) {
        fprintf(stderr, "error on seed setting: %d\n", (int)rerr);
        goto cleanup;
    }

    rerr = curandGenerateUniform(gen1, v1_d, n1);
    if (rerr != CURAND_STATUS_SUCCESS) {
        fprintf(stderr, "error on rand: %d\n", (int)rerr);
        goto cleanup;
    }

    /* Destroy the generator BEFORE the second allocation: the point of the
       experiment is to see how much device memory is available afterwards. */
    rerr = curandDestroyGenerator(gen1);
    gen_alive = 0;
    if (rerr != CURAND_STATUS_SUCCESS) {
        fprintf(stderr, "error on DestroyGenerator: %d\n", (int)rerr);
        goto cleanup;
    }

    /* recover some space */
    cerr = cudaMalloc((void **)&v2_d, n2 * sizeof(float));
    if (cerr != cudaSuccess) {
        fprintf(stderr, "error on second malloc: %d (%s)\n", (int)cerr, cudaGetErrorString(cerr));
        v2_d = NULL;
        goto cleanup;
    }

    /* launch kernel here */
    /* kernel done */

    rc = 0;

cleanup:
    if (gen_alive) {
        /* Only reached on an error path; rc is already 1. */
        rerr = curandDestroyGenerator(gen1);
        if (rerr != CURAND_STATUS_SUCCESS)
            fprintf(stderr, "error on DestroyGenerator: %d\n", (int)rerr);
    }

    /* cudaFree(NULL) is a no-op, so both frees are safe on every path. */
    cerr = cudaFree(v1_d);
    if (cerr != cudaSuccess) {
        fprintf(stderr, "error on free: %d (%s)\n", (int)cerr, cudaGetErrorString(cerr));
        rc = 1;
    }
    cerr = cudaFree(v2_d);
    if (cerr != cudaSuccess) {
        fprintf(stderr, "error on second free: %d (%s)\n", (int)cerr, cudaGetErrorString(cerr));
        rc = 1;
    }
    return rc;
}
#include <stdio.h>
#include <cuda.h>
#include <curand.h>
/*
 * Reproducer for device-memory usage around a cuRAND XORWOW generator.
 *
 * Steps: allocate a large float buffer on the device, fill it with uniform
 * pseudo-random numbers, destroy the generator, then attempt a second,
 * smaller device allocation.
 *
 * Every CUDA / cuRAND return code is checked. The first failure is reported
 * on stderr and the program skips straight to cleanup, so a failed step never
 * feeds an invalid pointer or generator handle into a later call.
 *
 * Returns 0 when every call succeeded, 1 otherwise.
 */
int main(void){
    /* All locals are declared up front so that `goto cleanup` never jumps
       over an initialization. */
    const size_t n1 = 150000000; // can go to 350000000 without curand
    const size_t n2 = 13000000;
    float *v1_d = NULL;          /* NULL so cleanup can free unconditionally */
    float *v2_d = NULL;
    curandGenerator_t gen1;
    int gen_alive = 0;           /* non-zero while gen1 holds a live generator */
    int rc = 1;                  /* pessimistic; set to 0 only on full success */
    cudaError_t cerr;
    curandStatus_t rerr;

    cerr = cudaMalloc((void **)&v1_d, n1 * sizeof(float));
    if (cerr != cudaSuccess) {
        fprintf(stderr, "error on malloc: %d (%s)\n", (int)cerr, cudaGetErrorString(cerr));
        v1_d = NULL;
        goto cleanup;
    }

    rerr = curandCreateGenerator(&gen1, CURAND_RNG_PSEUDO_XORWOW);
    if (rerr != CURAND_STATUS_SUCCESS) {
        fprintf(stderr, "error on CreateGenerator: %d\n", (int)rerr);
        goto cleanup;
    }
    gen_alive = 1;

    rerr = curandSetPseudoRandomGeneratorSeed(gen1, 1777ULL);
    if (rerr != CURAND_STATUS_SUCCESS) {
        fprintf(stderr, "error on seed setting: %d\n", (int)rerr);
        goto cleanup;
    }

    rerr = curandGenerateUniform(gen1, v1_d, n1);
    if (rerr != CURAND_STATUS_SUCCESS) {
        fprintf(stderr, "error on rand: %d\n", (int)rerr);
        goto cleanup;
    }

    /* Destroy the generator BEFORE the second allocation: the point of the
       experiment is to see how much device memory is available afterwards. */
    rerr = curandDestroyGenerator(gen1);
    gen_alive = 0;
    if (rerr != CURAND_STATUS_SUCCESS) {
        fprintf(stderr, "error on DestroyGenerator: %d\n", (int)rerr);
        goto cleanup;
    }

    /* recover some space */
    cerr = cudaMalloc((void **)&v2_d, n2 * sizeof(float));
    if (cerr != cudaSuccess) {
        fprintf(stderr, "error on second malloc: %d (%s)\n", (int)cerr, cudaGetErrorString(cerr));
        v2_d = NULL;
        goto cleanup;
    }

    /* launch kernel here */
    /* kernel done */

    rc = 0;

cleanup:
    if (gen_alive) {
        /* Only reached on an error path; rc is already 1. */
        rerr = curandDestroyGenerator(gen1);
        if (rerr != CURAND_STATUS_SUCCESS)
            fprintf(stderr, "error on DestroyGenerator: %d\n", (int)rerr);
    }

    /* cudaFree(NULL) is a no-op, so both frees are safe on every path. */
    cerr = cudaFree(v1_d);
    if (cerr != cudaSuccess) {
        fprintf(stderr, "error on free: %d (%s)\n", (int)cerr, cudaGetErrorString(cerr));
        rc = 1;
    }
    cerr = cudaFree(v2_d);
    if (cerr != cudaSuccess) {
        fprintf(stderr, "error on second free: %d (%s)\n", (int)cerr, cudaGetErrorString(cerr));
        rc = 1;
    }
    return rc;
}
I was about to post the exact same thing. Generating ~ 2^24 pseudo-random numbers, and then destroying the generator causes around a 600 MB memory leak on my GTX 480. Considering I only have 1535 MB, that’s quite substantial. As you said, destroying the generator only frees up a couple megs. Not really sure what to do, other than go back to generating random numbers on the CPU…
before the random number generator…
Device Memory - Free: 1170, Total: 1535
during the random number generator…
Device Memory - Free: 449, Total: 1535
after the random number generator…
Device Memory - Free: 452, Total: 1535
I was about to post the exact same thing. Generating ~ 2^24 pseudo-random numbers, and then destroying the generator causes around a 600 MB memory leak on my GTX 480. Considering I only have 1535 MB, that’s quite substantial. As you said, destroying the generator only frees up a couple megs. Not really sure what to do, other than go back to generating random numbers on the CPU…
before the random number generator…
Device Memory - Free: 1170, Total: 1535
during the random number generator…
Device Memory - Free: 449, Total: 1535
after the random number generator…
Device Memory - Free: 452, Total: 1535
Thanks fna.
I should add that the same experiment on a GTX460 (with 2GB) reveals a much smaller leak - around 350MB
This is less than half the GTX480, probably because GTX460 has 7 SM and GTX480 has 15 SM
However on a 8600GT there is no leak at all. For filling a 490MB array the generator needs only 67MB of internal storage and frees it completely after DestroyGenerator.
So this seems a Fermi related thing, although a severe one.
As usual, there is no direct way to report this bug to the developers at nVidia, is there?
Let's just hope that they read this :-D
Thanks fna.
I should add that the same experiment on a GTX460 (with 2GB) reveals a much smaller leak - around 350MB
This is less than half the GTX480, probably because GTX460 has 7 SM and GTX480 has 15 SM
However on a 8600GT there is no leak at all. For filling a 490MB array the generator needs only 67MB of internal storage and frees it completely after DestroyGenerator.
So this seems a Fermi related thing, although a severe one.
As usual, there is no direct way to report this bug to the developers at nVidia, is there?
Let's just hope that they read this :-D
If you are a registered developer, there is a way to file bug reports directly to NVIDIA from the developer site. If you are not a registered developer, you can become one by filling out this form:
If you are a registered developer, there is a way to file bug reports directly to NVIDIA from the developer site. If you are not a registered developer, you can become one by filling out this form:
jmcarval2 - Thanks for pointing this out. I think the issue is that CURAND is setting a larger stack size for SM_20 and above. The larger stack ties up more device memory for the stack and leaves less for the heap. This should be fixed for the next release of CURAND.
In the meantime, you can reset the stack size back to the default size after calling curandDestroyGenerator() with this code:
I need to generate random numbers at multiple times during the course of my application (due to space concerns), and I’m losing almost 800MB of global memory space simply by creating the generator, leaving me with only a small amount of memory left over.
In other words, call curandGenerateSeeds() before you reset the stack size to a small size. That is the function that needs the large stack, it sets up the generator state. You can call it manually. If you don’t call it yourself, it will automatically be called by the first generation function. The code above should let the generator setup use the large stack, then reset the stack back smaller as soon as possible.
The above code is for illustrative purposes only - you should ALWAYS check EVERY return value and do proper error handling/reporting.