Hello,
i have 2 kernels:
- Used 10 registers, 4128+16 bytes smem, 8 bytes cmem[1]
__global__ void merge_oddeven_256_s_new(uint* d_block,uint n,uint pipeSize,int shift,uint gridDimx,uint gridDimy,uint gridDimz){
__shared__ uint result[512];
__shared__ uint blockA[256];
__shared__ uint blockB[256];
uint* d_blockA;
uint* d_blockB;
uint k,p;
uint blockId=blockIdx.x * gridDim.y + blockIdx.y;
uint blockId_z=blockId%( gridDimx * gridDimy );
uint blockId_x=blockId_z%( gridDimx );
uint blockId_y=blockId_z/( gridDimx );
blockId_z=blockId/( gridDimx * gridDimy );
p=(blockId_x);
p+=(2*(blockId_y+1)*gridDimx) + (blockId_z*n/gridDimz);
p+=shift;
d_blockB=d_block + pipeSize*p;
d_blockA=d_blockB - pipeSize*gridDimx;
if(threadIdx.y==0){
blockA[threadIdx.x]=d_blockA[threadIdx.x];
}else{
blockB[threadIdx.x]=d_blockB[threadIdx.x];
}
__syncthreads();
if(threadIdx.y==0){
k=blockA[threadIdx.x];
p=binarySearchLeft_new(blockB,pipeSize,k);
result[threadIdx.x+p]=k;
}else{
k=blockB[threadIdx.x];
p=binarySearchRight_new(blockA,pipeSize,k);
result[threadIdx.x+p]=k;
}
__syncthreads();
if(threadIdx.y==0){
d_blockA[threadIdx.x]=result[threadIdx.x];
}else{
d_blockB[threadIdx.x]=result[threadIdx.x+pipeSize];
}
}
- Used 11 registers, 3104+16 bytes smem, 4 bytes cmem[1]
__global__ void merge_oddeven_256_smin_new(uint* d_block,uint n,uint pipeSize,int shift,uint gridDimx,uint gridDimy,uint gridDimz){
__shared__ uint result[512];
__shared__ uint blockX[256];
uint* d_blockA;
uint* d_blockB;
uint k,p;
uint blockId=blockIdx.x * gridDim.y + blockIdx.y;
uint blockId_z=blockId%( gridDimx * gridDimy );
uint blockId_x=blockId_z%( gridDimx );
uint blockId_y=blockId_z/( gridDimx );
blockId_z=blockId/( gridDimx * gridDimy );
p=(blockId_x);
p+=(2*(blockId_y+1)*gridDimx) + (blockId_z*n/gridDimz);
//p+=(blockId_z*n/gridDimz);
p+=shift;
d_blockB=d_block + pipeSize*p;
d_blockA=d_blockB - pipeSize*gridDimx;
k=d_blockA[threadIdx.x];
blockX[threadIdx.x]=d_blockB[threadIdx.x];
__syncthreads();
p=binarySearchLeft_new(blockX,pipeSize,k);
result[threadIdx.x+p]=k;
__syncthreads();
k=d_blockB[threadIdx.x];
blockX[threadIdx.x]=d_blockA[threadIdx.x];
__syncthreads();
p=binarySearchRight_new(blockX,pipeSize,k);
result[threadIdx.x+p]=k;
__syncthreads();
d_blockA[threadIdx.x]=result[threadIdx.x];
d_blockB[threadIdx.x]=result[threadIdx.x+pipeSize];
}
i would like minimize registers usage in second kernel to 10 per thread, because in CUDA GPU Occupancy Calculator i can see now occupancy=67%, when registers per block=10 then occupancy=100%
Thanks :)