Dynamic memory allocation inside the kernel

Hello,
(I am not ashamed of asking again this question)
Is it possible to allocate global memory dynamically inside the CUDA kernel?
If it is possible, how far is it a good practice ? (is it efficient ?)
Thank you.

hi, yes, it possible. We use this to do reduction on the kernel like this:

__global__ void   loop_red_arn_47_cuda_kernel(float a[], CudaIndexType a0004, CudaIndexType a0003, CudaIndexType a0002, DvmType dim1_s, DvmType dim2_s, DvmType dim3_s, float s_grid[], float *s_init, float s1__1, float s1__2, float s1__3, float s1__4, float s1__5, float s1__6, float s1__7, float s1__8, float s1__9, float s1_grid[], DvmType dim1_s2, float s2_grid[], float *s2_init, float mm, float mm_grid[], CudaIndexType blocks[], int red_count, DvmType overall_blocks, long int n2, long int n1){

// Private variables
         long int k2;
         long int k1;
         long int cond_1;
         long int _k1;
         long int cond_0;
         long int _k2;

// Local needs
         CudaIndexType j, i;
         int ibof;
         float s2[17];
         float s1[9];
         int k_k3;
         int k_k2;
         int k_k1;
         float  *s;
         __shared__ DvmType __addr_s;

 // Allocate memory for reduction
         i = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
         if (i == 0) 
         {
            s = new float [dim1_s * dim2_s * dim3_s * blockDim.x * blockDim.y * blockDim.z];
            __addr_s = (DvmType &)s;
         }
         __syncthreads();
         s = (float *)__addr_s;
         s = s + i;

// Fill local variable with passed values
         for (k_k3 = 0 ; k_k3 < dim3_s ; k_k3 = k_k3 + 1)
         {
            for (k_k2 = 0 ; k_k2 < dim2_s ; k_k2 = k_k2 + 1)
            {
               for (k_k1 = 0 ; k_k1 < dim1_s ; k_k1 = k_k1 + 1)
               {
                  s[(k_k1 + k_k2 * dim1_s + k_k3 * (dim1_s * dim2_s)) * (blockDim.x * blockDim.y * blockDim.z)] = s_init[(k_k1 + k_k2 * dim1_s + k_k3 * (dim1_s * dim2_s)) * (blockDim.x * blockDim.y * blockDim.z)];
               }
            }
         }
         s1[0] = s1__1;
         s1[1] = s1__2;
         s1[2] = s1__3;
         s1[3] = s1__4;
         s1[4] = s1__5;
         s1[5] = s1__6;
         s1[6] = s1__7;
         s1[7] = s1__8;
         s1[8] = s1__9;
         for (k_k1 = 0 ; k_k1 < dim1_s2 ; k_k1 = k_k1 + 1)
         {
            s2[k_k1] = s2_init[k_k1];
         }

// Calculate each thread's loop variables' values
         ibof = blockIdx.x * 4;
         j = blocks[ibof + 0] + threadIdx.y;
         if (j <= blocks[ibof + 1]) 
         {
            i = blocks[ibof + 2] + threadIdx.x;
            if (i <= blocks[ibof + 3]) 
            {

// Loop body
               for (k2 = 1, cond_0 = abs(1 - n2) + abs(1), _k2 = 0 ; _k2 < cond_0 ; k2 = k2 + 1, _k2 = _k2 + 1)
               {
                  for (k1 = 1, cond_1 = abs(1 - n1) + abs(1), _k1 = 0 ; _k1 < cond_1 ; k1 = k1 + 1, _k1 = _k1 + 1)
                  {
                     s[(k1 - 1 + (k2 - 1) * dim1_s + (0 - 1) * (dim1_s * dim2_s)) * (blockDim.x * blockDim.y * blockDim.z)] = s[(k1 - 1 + (k2 - 1) * dim1_s + (0 - 1) * (dim1_s * dim2_s)) * (blockDim.x * blockDim.y * blockDim.z)] + a[k1 + a0004 * k2 + a0003 * i + a0002 * j];
                  }
               }
            }
         }

// Reduction
         i = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y);
         __dvmh_blockReduceSumN(s, blockDim.x * blockDim.y * blockDim.z, dim1_s * dim2_s * dim3_s);
         __dvmh_blockReduceMaxN<float, 9 >(s1);
         __dvmh_blockReduceMinN<float, 17 >(s2);
         mm = __dvmh_blockReduceSum(mm);
         if (i % warpSize == 0) 
         {
            mm_grid[blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = mm;
            for (k_k3 = 0 ; k_k3 < dim3_s ; k_k3 = k_k3 + 1)
            {
               for (k_k2 = 0 ; k_k2 < dim2_s ; k_k2 = k_k2 + 1)
               {
                  for (k_k1 = 0 ; k_k1 < dim1_s ; k_k1 = k_k1 + 1)
                  {
                     s_grid[overall_blocks * (k_k1 + k_k2 * dim1_s + k_k3 * (dim1_s * dim2_s)) + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s[(k_k1 + k_k2 * dim1_s + k_k3 * (dim1_s * dim2_s)) * (blockDim.x * blockDim.y * blockDim.z)];
                  }
               }
            }
            s1_grid[overall_blocks * 0 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s1[0];
            s1_grid[overall_blocks * 1 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s1[1];
            s1_grid[overall_blocks * 2 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s1[2];
            s1_grid[overall_blocks * 3 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s1[3];
            s1_grid[overall_blocks * 4 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s1[4];
            s1_grid[overall_blocks * 5 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s1[5];
            s1_grid[overall_blocks * 6 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s1[6];
            s1_grid[overall_blocks * 7 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s1[7];
            s1_grid[overall_blocks * 8 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s1[8];
            s2_grid[overall_blocks * 0 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s2[0];
            s2_grid[overall_blocks * 1 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s2[1];
            s2_grid[overall_blocks * 2 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s2[2];
            s2_grid[overall_blocks * 3 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s2[3];
            s2_grid[overall_blocks * 4 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s2[4];
            s2_grid[overall_blocks * 5 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s2[5];
            s2_grid[overall_blocks * 6 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s2[6];
            s2_grid[overall_blocks * 7 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s2[7];
            s2_grid[overall_blocks * 8 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s2[8];
            s2_grid[overall_blocks * 9 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s2[9];
            s2_grid[overall_blocks * 10 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s2[10];
            s2_grid[overall_blocks * 11 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s2[11];
            s2_grid[overall_blocks * 12 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s2[12];
            s2_grid[overall_blocks * 13 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s2[13];
            s2_grid[overall_blocks * 14 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s2[14];
            s2_grid[overall_blocks * 15 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s2[15];
            s2_grid[overall_blocks * 16 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s2[16];
         }

 // Deallocate memory for reduction
         __syncthreads();
         if (i == 0) 
         {
            delete  s;
         }
      }

memory allocation once on the block gives the best performance, but you must understand that it gives overheads.

dynamic memory allocation is also covered in the programming guide:

https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#dynamic-global-memory-allocation-and-operations

both malloc/free and new/delete are natively supported on devices of cc 2.0 and greater.