How to avoid calling cudaMalloc inside a function?

Hi,

In the example code below, func1 is called multiple times with a different array size each time. This takes a long time because every call to func1 allocates memory with cudaMalloc; the first cudaMalloc alone takes about 100 ms, which is a huge amount of time. How can I avoid calling cudaMalloc inside the function?

/*
 * Element-wise addition: c[i] = a[i] + b[i], one element per thread.
 *
 * The element index is threadIdx.x only (the block index is not used) and
 * there is no bounds check, so the kernel expects to be launched as a single
 * block whose thread count equals the number of elements to add. All three
 * pointers must be device pointers.
 */
__global__ void add_kernel(float *a, float *b, float *c)
{
   const unsigned int tid = threadIdx.x;   // this thread's element
   const float sum = a[tid] + b[tid];
   c[tid] = sum;
}
/*
 * Adds two host arrays on the GPU: c[i] = a[i] + b[i] for i in [0, N).
 *
 * a, b : host input arrays with at least N elements each
 * c    : host output array with room for at least N elements
 * N    : element count; the kernel is launched as one block of N threads,
 *        so N must not exceed the device's per-block thread limit
 *
 * Temporary device buffers are allocated on every call and released before
 * returning. cudaMalloc is synchronous and comparatively slow, so callers
 * that invoke this repeatedly are better off allocating the device buffers
 * once and reusing them.
 *
 * If any CUDA call fails, the remaining steps are skipped and c is left
 * untouched; the device buffers are still released.
 */
void func1(float *a, float *b, float* c, int N)
{
   if (N <= 0)
      return;   // nothing to add; a launch with 0 threads is an invalid configuration

   const size_t bytes = (size_t)N * sizeof(float);
   float *d_a = 0, *d_b = 0, *d_c = 0;

   // Short-circuit chain: the first failing call stops everything after it.
   if (cudaMalloc(&d_a, bytes) == cudaSuccess &&
       cudaMalloc(&d_b, bytes) == cudaSuccess &&
       cudaMalloc(&d_c, bytes) == cudaSuccess &&
       cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice) == cudaSuccess &&
       cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice) == cudaSuccess)
   {
      add_kernel<<<1, N>>>(d_a, d_b, d_c);

      // Launch-configuration errors are only visible through cudaGetLastError().
      if (cudaGetLastError() == cudaSuccess)
         cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost);
   }

   // Release the per-call buffers; the original leaked all three on every
   // call. cudaFree on a null pointer is a no-op, so this is safe on the
   // failure paths as well.
   cudaFree(d_a);
   cudaFree(d_b);
   cudaFree(d_c);
}
/*
 * Driver: calls func1 with array sizes 1, 2, ..., max_N.
 *
 * The host buffers are allocated once, sized for the largest call, and
 * reused by every iteration. The original passed uninitialized pointers
 * to randinit and func1.
 */
int main()
{
   const int max_N = 1000;   // largest array length used by the loop

   float *a = new float[max_N];
   float *b = new float[max_N];
   float *c = new float[max_N];

   for (int i = 0; i < max_N; i++)
   {
      // NOTE(review): assumes randinit(p, n) fills the first n elements of an
      // already-allocated buffer; its definition is not shown here - confirm.
      randinit(a, i+1);
      randinit(b, i+1);
      func1(a,b,c,i+1);//Calling the function with different size every time.
   }

   delete[] a;
   delete[] b;
   delete[] c;
   return 0;
}

Just perform your cudaMalloc and cudaMemcpy outside the function and pass the device pointers in.

If I perform cudaMalloc outside the function, then I need to pass the device pointers as arguments to func1 (which is a host function). Will that work?

I don’t understand your question.
Why don’t you try allocating the maximum array size once? Then each call to func1 simply overwrites the values left by the previous call.

NOTE: Code not tested.

/*
 * Per-thread vector add: thread t writes c[t] = a[t] + b[t].
 *
 * Only threadIdx.x is used for indexing and nothing guards the array
 * bounds, so the launch must be a single block with exactly as many
 * threads as there are elements. a, b and c must be device pointers.
 */
__global__ void add_kernel(float *a, float *b, float *c)
{
   const unsigned int t = threadIdx.x;
   const float lhs = a[t];
   const float rhs = b[t];
   c[t] = lhs + rhs;
}

/*
 * Computes c[i] = a[i] + b[i] for i in [0, N) on the GPU using device
 * buffers supplied by the caller, so no allocation happens here.
 *
 * a, b          : host input arrays, at least N elements each
 * c             : host output array, at least N elements
 * d_a, d_b, d_c : device buffers, each able to hold at least N floats;
 *                 their previous contents are overwritten
 * N             : element count; also used as the thread count of the
 *                 single block that is launched
 */
void func1(float *a, float *b, float* c, float* d_a, float* d_b, float* d_c, int N)
{
   const size_t nbytes = N*sizeof(float);

   // Upload both operands.
   cudaMemcpy(d_a, a, nbytes, cudaMemcpyHostToDevice);
   cudaMemcpy(d_b, b, nbytes, cudaMemcpyHostToDevice);

   // One block, one thread per element.
   const dim3 grid(1);
   const dim3 block(N);
   add_kernel<<<grid, block>>>(d_a, d_b, d_c);

   // Download the result into the caller's host array.
   cudaMemcpy(c, d_c, nbytes, cudaMemcpyDeviceToHost);
}

/*
 * Driver: allocates host and device buffers once, sized for the largest
 * array, then calls func1 with sizes 1, 2, ..., max_N, reusing the same
 * buffers each time.
 *
 * Returns 0 on success, 1 if any allocation failed.
 */
int main()
{
   const int max_N = 1000;   // largest array length used by the loop
   const size_t max_bytes = max_N * sizeof(float);
   int status = 0;

   // Host buffers. The original never allocated these but still passed
   // them to free().
   float *a = (float*)malloc(max_bytes);
   float *b = (float*)malloc(max_bytes);
   float *c = (float*)malloc(max_bytes);

   // Device buffers start out null so the cleanup below is safe on every path.
   float *d_a = 0, *d_b = 0, *d_c = 0;

   if (!a || !b || !c ||
       cudaMalloc(&d_a, max_bytes) != cudaSuccess ||
       cudaMalloc(&d_b, max_bytes) != cudaSuccess ||
       cudaMalloc(&d_c, max_bytes) != cudaSuccess)
   {
      status = 1;   // allocation failed; skip the work and fall through to cleanup
   }
   else
   {
      for (int i = 0; i < max_N; i++)
      {
         // NOTE(review): assumes randinit(p, n) fills the first n elements of
         // an already-allocated buffer; its definition is not shown - confirm.
         randinit(a, i+1);
         randinit(b, i+1);
         func1(a,b,c,d_a,d_b,d_c,i+1);//Calling the function with different size every time.
      }
   }

   // free(NULL) and cudaFree on a null pointer are both no-ops.
   free(a);
   free(b);
   free(c);
   cudaFree(d_a);
   cudaFree(d_b);
   cudaFree(d_c);
   return status;
}