Hello,

I am writing a CUDA kernel to create a 3x3 covariance matrix for each location in the rows*cols main matrix. So that 3D matrix is rows*cols*9 in size, which i allocated in a single malloc accordingly. I need to access this in a single index value

the 9 values of the 3x3 covariance matrix get their values set according to the appropriate row r and column c from some other 2D arrays.

In other words - I need to calculate the appropriate index to access the 9 elements of the 3x3 covariance matrix, as well as the row and column offset of the 2D matrices that are inputs to the value, as well as the appropriate index for the storage array.

i have tried to simplify it down to the following:

extern “C” cuComplex* calc_cov_wrapper(int rows, int cols){

//set up kernel dimensions

int TILE_WIDTH = 512;

int num_blocks_x = (int)(cols+TILE_WIDTH)/TILE_WIDTH;

int num_blocks_y = rows/3;

dim3 dimBlock(TILE_WIDTH, 1,9);

dim3 dimGrid(num_blocks_x, num_blocks_y);

```
int memsize = (rows/3)*cols*9*sizeof(cuComplex);
cutilSafeCall( cudaMalloc((void **)&d_cov,memsize));
h_cov = (cuComplex *)malloc(rows*cols*9*sizeof(cuComplex));
calc_covMatrix<<<dimGrid, dimBlock>>>(d_cov, d_buffer1, d_buffer3, d_buffer2, (rows/3), cols, TILE_WIDTH, 0, memsize);
cutilSafeCall(cudaMemcpy(&h_cov[0], d_cov, memsize, cudaMemcpyDeviceToHost));
//calc 2nd third
calc_covMatrix<<<dimGrid, dimBlock>>>(d_cov, d_buffer1, d_buffer3, d_buffer2, (rows/3), cols, TILE_WIDTH, (rows/3), memsize);
cutilSafeCall(cudaMemcpy(&h_cov[rows/3], d_cov, memsize, cudaMemcpyDeviceToHost));
//calc last third
cudaFree(d_cov);
memsize = (rows/3 + rows%3)*cols*9*sizeof(cuComplex);
cutilSafeCall( cudaMalloc((void **)&d_cov,memsize));
num_blocks_y = rows/3 + rows%3;
dim3 dimGrid3(num_blocks_x, num_blocks_y);
calc_covMatrix<<<dimGrid3, dimBlock>>>(d_cov, d_buffer1, d_buffer3, d_buffer2, (rows/3) + (rows%3), cols, TILE_WIDTH, 2*(rows/3), memsize);
cutilSafeCall(cudaMemcpy(&h_cov[2*rows/3], d_cov, memsize, cudaMemcpyDeviceToHost));
cudaFree(d_cov);
return h_cov;
```

}

**global** void calc_covMatrix(cuComplex* d_cov, cuComplex* d_buffer1,cuComplex* d_buffer3,cuComplex* d_buffer2, int rows, int cols, int TILE_WIDTH, int rowoffset, int dcovSize)

{

int bx = blockIdx.x;

int by = blockIdx.y;

int tx = threadIdx.x;

int ty = threadIdx.y;

int tz = threadIdx.z;

int r = rowoffset + by + ty;

int c = bx*TILE_WIDTH + tx;
int offset = r*cols+c;

```
int ndx = (tz*cols+c)*9 + r;
```

if((r < rows) && (c < cols) && (ndx < dcovSize)){

cuComplex hhConj = cuConjf(d_buffer1[offset]);

cuComplex vvConj = cuConjf(d_buffer3[offset]);

const float sqrt2 = 1.414213562f;

cuComplex two_root = make_cuComplex(sqrt2, 0.0);

switch(tz)

{

case 0:

d_cov[ndx] = make_cuComplex(cuCabsf(d_buffer1[offset])*cuCabsf(d_buffer1[offset]), 0.0);

break;

case 1:

```
d_cov[ndx] = cuCmulf(two_root, cuCmulf(d_buffer1[offset], hhConj));
break;
case 2:
d_cov[ndx] = cuCmulf(d_buffer1[offset], vvConj);
break;
case 3:
d_cov[ndx] = cuCmulf(two_root, cuCmulf(d_buffer2[offset], hhConj));
break;
case 4:
d_cov[ndx] = make_cuComplex(cuCabsf(d_buffer2[offset])*cuCabsf(d_buffer2[offset]), 0.0);
break;
case 5:
d_cov[ndx] = cuCmulf(two_root, cuCmulf(d_buffer2[offset], vvConj));
break;
case 6:
d_cov[ndx] = cuCmulf(d_buffer3[offset], hhConj);
break;
case 7:
d_cov[ndx] = cuCmulf(two_root, cuCmulf(d_buffer3[offset], cuConjf(d_buffer2[offset])));
break;
case 8:
d_cov[ndx] = make_cuComplex(cuCabsf(d_buffer3[offset])*cuCabsf(d_buffer3[offset]), 0.0);
break;
}
```

}

}

I have added the full host and kernel code here. As you can see, I am trying to split the problem into 3rds based on the number of rows, because the full amount will not allocate on the GPU.

Any help is very much appreciated! I Basically can’t seem to figure out what my ndx values need to be within the kernel, or what they need to be in the host array returned by the wrapper. Are the GPU and CPU both memory aligned the same way? That is - they are both row major?