cuda accessing global memory slow

I have a cuda kernel doing some computation on a local variable (in register), and after it gets computed, its value gets written into a global array p:

// Computes one value per (i, j, k) grid point and stores it into the global
// output array p, laid out as [z][y][x] with x fastest-varying.
//
// Expected launch: a 3D grid/block covering at least (dimX, dimY, dimZ)
// threads; the bounds check below handles any overhang from ceil-divided
// grid dimensions.
__global__ void dd(float* p, int dimX, int dimY, int dimZ)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;  // x index (fastest-varying)
    int j = blockIdx.y * blockDim.y + threadIdx.y;  // y index
    int k = blockIdx.z * blockDim.z + threadIdx.z;  // z index

    // Guard threads that fall past the edge of the volume.
    if (i >= dimX || j >= dimY || k >= dimZ)
        return;

    // BUG FIX: the original computed idx = j*dimX*dimY + j*dimX + i, using
    // `j` for the z term. That collapses every z-slice onto the same few
    // rows, so many threads race to write the same elements and most of p
    // is never written at all. The z term must use `k`.
    int idx = k * dimX * dimY + j * dimX + i;

    float val = SomeComputationOnVal();

    // Adjacent threads in x write adjacent addresses, so this store is
    // coalesced.
    p[idx] = val;

    // NOTE: the original ended with __syncthreads() after the divergent
    // early return above. A barrier that is not reached by every thread in
    // the block is undefined behavior, and this kernel uses no shared
    // memory, so the barrier is removed.
}

Unfortunately, this function executes very slow.

However, it runs very fast if I do this:

// Experiment variant: the computation is commented out, so every surviving
// thread stores 0.0f into p. (With the expensive call removed, the kernel
// runs fast — which is the observation being discussed.)
__global__ void dd(float* p, int dimX, int dimY, int dimZ)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;  // x index (fastest-varying)
    int j = blockIdx.y * blockDim.y + threadIdx.y;  // y index
    int k = blockIdx.z * blockDim.z + threadIdx.z;  // z index

    // Guard threads that fall past the edge of the volume.
    if (i >= dimX || j >= dimY || k >= dimZ)
        return;

    // BUG FIX: the original used `j` in the z term (j*dimX*dimY); it must
    // be `k`, otherwise z-slices collapse onto each other and threads race
    // on the same elements.
    int idx = k * dimX * dimY + j * dimX + i;

    float val = 0;

    //val = SomeComputationOnVal();

    p[idx] = val;

    // NOTE: the original's trailing __syncthreads() is removed — it sat
    // after a divergent early return (undefined behavior) and the kernel
    // uses no shared memory.
}

It also runs very fast if I do this:

// Experiment variant: the global store is commented out. With no observable
// side effect, nvcc dead-code-eliminates the SomeComputationOnVal() call
// (and effectively the whole kernel), which is why this version appears to
// run fast — it does no work.
__global__ void dd(float* p, int dimX, int dimY, int dimZ)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;  // x index (fastest-varying)
    int j = blockIdx.y * blockDim.y + threadIdx.y;  // y index
    int k = blockIdx.z * blockDim.z + threadIdx.z;  // z index

    // Guard threads that fall past the edge of the volume.
    if (i >= dimX || j >= dimY || k >= dimZ)
        return;

    // BUG FIX: the original used `j` in the z term (j*dimX*dimY); it must
    // be `k` for a [z][y][x] layout.
    int idx = k * dimX * dimY + j * dimX + i;

    float val = 0;

    val = SomeComputationOnVal();

    //   p[idx ]=  val;

    // NOTE: the original's trailing __syncthreads() is removed — it sat
    // after a divergent early return (undefined behavior) and the kernel
    // uses no shared memory.
}

So I am confused, and have no idea how to solve this problem. I stepped through the kernel with NSight, and I did not find any access violation yet.

Here is how I launch the kernel (dimX: 924; dimY: 16; dimZ: 1120):

// Launch configuration for dd over a dimX x dimY x dimZ volume.
//
// FIX: use ceil-division for every grid dimension. The original used
// dimX/blockSize.x+1 (one wasted block whenever dimX divides evenly) and
// plain dimY/blockSize.y, dimZ/blockSize.z (silently under-covers the
// volume whenever those dims are NOT multiples of the block — the kernel's
// bounds check cannot rescue threads that are never launched).
dim3
      blockSize(8, 16, 2),
      gridSize((dimX + blockSize.x - 1) / blockSize.x,
               (dimY + blockSize.y - 1) / blockSize.y,
               (dimZ + blockSize.z - 1) / blockSize.z);

// Compute the byte count in size_t to avoid int overflow for large volumes,
// and check the allocation result instead of ignoring it.
float* dev_p = nullptr;
cudaError_t allocErr =
    cudaMalloc((void**)&dev_p, (size_t)dimX * dimY * dimZ * sizeof(float));
if (allocErr != cudaSuccess)
{
    fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(allocErr));
}

dd<<<gridSize, blockSize>>>(dev_p, dimX, dimY, dimZ);
// Kernel launches don't return errors directly; pick up launch-config
// errors here (execution errors surface at the next synchronizing call).
cudaError_t launchErr = cudaGetLastError();
if (launchErr != cudaSuccess)
{
    fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(launchErr));
}

Can anyone please give some pointers? Thanks a lot.

this code

float val = 0;
val = SomeComputationOnVal();
//   p[idx ]=  val;

does nothing — the call to SomeComputationOnVal() is optimized out by the compiler, since its result is never stored to global memory. Yes, nvcc is smarter than you think! :) So the "fast" versions are fast because they do no work, not because the global store itself is slow.

idx = j*dimX*dimY + j*dimX + i;

Here `j` appears twice — the first term should almost certainly be `k*dimX*dimY` for a 3D index. As written, every z-slice maps to the same addresses, so threads with different `k` race to write the same elements. This is probably a bug.