C++ code lines convert Cuda

I When I run the following code lines, I have the result of [-19, -18, -17, -16, -15, -14, -13, -12, -11, -10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20;…

 void ndgridFunction(int height, int width, Mat *rs, Mat *cs){
    for(int i = -(height-1); i<= height; i++){
        for(int j= -(width - 1);j <= width; j++){
            cs->at<float>(i+(height - 1 ),j+(width - 1)) = float(j);
            rs->at<float>(i+(height - 1 ),j+(width - 1)) = float(i);
        }
    }
}

    int main()
{


    int height = 15;
    int width = 20;

    Mat cs      = Mat(height*2, width*2, CV_32F);
    Mat rs      = Mat(height*2, width*2, CV_32F);

    ndgridFunction(height,width, &rs, &cs);
    cout<<cs<<endl;

        }
    }

I convert it to Cuda codes in the following but I reach a different result. Here is my Cuda kernel and result.

__global__ void ndgridFunctionDev ( float *rs, float *cs,int height ,int width){

     int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
     int yIndex = blockIdx.y * blockDim.y + threadIdx.y;

    if((xIndex<=width) && (yIndex<height)){
    
        //const int tid = yIndex * width +  xIndex;

        for(int i = -(height-1); i<= height; i++){
            for(int j= -(width - 1);j <= width; j++){       
              
                
                cs[(xIndex+i-1)*width+(yIndex+j-1)]  = float(j);
                rs[(xIndex+i-1)*width+(yIndex+j-1)]  = float(i);
            }
        
        }
        
        
    }
    
}

void equalImagesFunc(cv::Mat& input,cv::Mat& gray, int height, int width){


    const int inputBytes = input.step * input.rows;
    const int grayBytes = gray.step * gray.rows;
    float *d_input, *d_gray;

    cudaMalloc((void**)&d_input,inputBytes);
    cudaMalloc((void**)&d_gray,grayBytes);

    const dim3 block(16,16);
    const dim3 grid(8,8);
    //cout << "row = " << input.rows << "cols= " << input.cols << endl; 
    ndgridFunctionDev<<<grid,block>>>(d_input,d_gray,width,height);
    cudaDeviceSynchronize();

    cudaMemcpy(input.ptr(),d_input,inputBytes,cudaMemcpyDeviceToHost);
    cudaMemcpy(gray.ptr(),d_gray,grayBytes,cudaMemcpyDeviceToHost);

    cudaFree(d_input);
    cudaFree(d_gray);
}

int main(void){

    int height = 15;
    int width = 20;

    Mat cs      = Mat(height*2, width*2, CV_32F);
    Mat rs      = Mat(height*2, width*2, CV_32F);

    equalImagesFunc(cs,rs,height,width);

    cout<<cs<<endl;

}

And result is :[1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4; 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7;…

How can I implement it to give me the same result ?