I have written a kernel that works fine and gives a good performance boost compared to the CPU, I use a GTX 480. The problem is though that my kernel uses 38 registers per thread, and thereby I can not achieve full occupancy (1536 threads on each MP) but only about half occupancy. According to my calculations I have to get down to 21 registers per thread to get full occupancy, is it possible to optimize the code that much? I can provide the kernel code tomorrow.

What makes you think full occupancy will perform any better? There is plenty of sound evidence to suggest that trying to optimize for full occupancy isn’t a such a good idea.

What makes you think full occupancy will perform any better? There is plenty of sound evidence to suggest that trying to optimize for full occupancy isn’t a such a good idea.

Interesting, I always thought that higher occupancy was better. Well I can still post my kernel tomorrow :)

Interesting, I always thought that higher occupancy was better. Well I can still post my kernel tomorrow :)

Here is my kernel code

```
#define IMUL(a, b) __mul24(a, b)
__constant__ float c_xtxxt_GLM[320];
__constant__ float c_X_GLM[320];
__global__ void CalculateActivityMap(float* activity_map, float* fMRI_Volumes, float* Brain_Voxels, float ctxtxc, int DATA_W, int DATA_H, int DATA_D, int DATA_T, int blocksInY, float invBlocksInY, int timeMultiples, int timeRest)
{
unsigned int blockIdxz = __float2uint_rd(blockIdx.y * invBlocksInY);
unsigned int blockIdxy = blockIdx.y - __umul24(blockIdxz,blocksInY);
unsigned int x = __umul24(blockIdx.x,blockDim.x) + threadIdx.x;
unsigned int y = __umul24(blockIdxy ,blockDim.y) + threadIdx.y;
unsigned int z = __umul24(blockIdxz ,blockDim.z) + threadIdx.z;
if (x >= DATA_W || y >= DATA_H || z >= DATA_D)
return;
if ( Brain_Voxels[x + y * DATA_W + z * DATA_W * DATA_H] == 1.0f )
{
__shared__ float s_Y[8][8][32]; // y,t,x
int t_offset = 0;
float beta1 = 0.0f;
float beta2 = 0.0f;
float eps, meaneps, vareps;
// Calculate betahat, i.e. multiply (x^T x)^(-1) x^T with Y
for (t_offset = 0; t_offset < timeMultiples * 8; t_offset += 8)
{
// Read data into shared memory
s_Y[threadIdx.y][0][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 0, DATA_W, DATA_H, DATA_D)];
s_Y[threadIdx.y][1][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 1, DATA_W, DATA_H, DATA_D)];
s_Y[threadIdx.y][2][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 2, DATA_W, DATA_H, DATA_D)];
s_Y[threadIdx.y][3][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 3, DATA_W, DATA_H, DATA_D)];
s_Y[threadIdx.y][4][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 4, DATA_W, DATA_H, DATA_D)];
s_Y[threadIdx.y][5][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 5, DATA_W, DATA_H, DATA_D)];
s_Y[threadIdx.y][6][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 6, DATA_W, DATA_H, DATA_D)];
s_Y[threadIdx.y][7][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 7, DATA_W, DATA_H, DATA_D)];
__syncthreads();
// Sum and multiply the values in shared memory
beta1 += s_Y[threadIdx.y][0][threadIdx.x] * c_xtxxt_GLM[t_offset + 0];
beta1 += s_Y[threadIdx.y][1][threadIdx.x] * c_xtxxt_GLM[t_offset + 1];
beta1 += s_Y[threadIdx.y][2][threadIdx.x] * c_xtxxt_GLM[t_offset + 2];
beta1 += s_Y[threadIdx.y][3][threadIdx.x] * c_xtxxt_GLM[t_offset + 3];
beta1 += s_Y[threadIdx.y][4][threadIdx.x] * c_xtxxt_GLM[t_offset + 4];
beta1 += s_Y[threadIdx.y][5][threadIdx.x] * c_xtxxt_GLM[t_offset + 5];
beta1 += s_Y[threadIdx.y][6][threadIdx.x] * c_xtxxt_GLM[t_offset + 6];
beta1 += s_Y[threadIdx.y][7][threadIdx.x] * c_xtxxt_GLM[t_offset + 7];
beta2 += s_Y[threadIdx.y][0][threadIdx.x] * c_xtxxt_GLM[DATA_T + t_offset + 0];
beta2 += s_Y[threadIdx.y][1][threadIdx.x] * c_xtxxt_GLM[DATA_T + t_offset + 1];
beta2 += s_Y[threadIdx.y][2][threadIdx.x] * c_xtxxt_GLM[DATA_T + t_offset + 2];
beta2 += s_Y[threadIdx.y][3][threadIdx.x] * c_xtxxt_GLM[DATA_T + t_offset + 3];
beta2 += s_Y[threadIdx.y][4][threadIdx.x] * c_xtxxt_GLM[DATA_T + t_offset + 4];
beta2 += s_Y[threadIdx.y][5][threadIdx.x] * c_xtxxt_GLM[DATA_T + t_offset + 5];
beta2 += s_Y[threadIdx.y][6][threadIdx.x] * c_xtxxt_GLM[DATA_T + t_offset + 6];
beta2 += s_Y[threadIdx.y][7][threadIdx.x] * c_xtxxt_GLM[DATA_T + t_offset + 7];
}
t_offset = timeMultiples * 8;
for (int t = 0; t < timeRest; t++)
{
s_Y[threadIdx.y][0][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + t, DATA_W, DATA_H, DATA_D)];
beta1 += s_Y[threadIdx.y][0][threadIdx.x] * c_xtxxt_Detrend[t_offset + t];
beta2 += s_Y[threadIdx.y][0][threadIdx.x] * c_xtxxt_Detrend[DATA_T + t_offset + t];
}
// Calculate the mean of the error eps
meaneps = 0.0f;
for (t_offset = 0; t_offset < timeMultiples * 8; t_offset += 8)
{
// Read data into shared memory again
s_Y[threadIdx.y][0][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 0, DATA_W, DATA_H, DATA_D)];
s_Y[threadIdx.y][1][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 1, DATA_W, DATA_H, DATA_D)];
s_Y[threadIdx.y][2][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 2, DATA_W, DATA_H, DATA_D)];
s_Y[threadIdx.y][3][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 3, DATA_W, DATA_H, DATA_D)];
s_Y[threadIdx.y][4][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 4, DATA_W, DATA_H, DATA_D)];
s_Y[threadIdx.y][5][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 5, DATA_W, DATA_H, DATA_D)];
s_Y[threadIdx.y][6][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 6, DATA_W, DATA_H, DATA_D)];
s_Y[threadIdx.y][7][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 7, DATA_W, DATA_H, DATA_D)];
__syncthreads();
meaneps += s_Y[threadIdx.y][0][threadIdx.x] - beta1 * c_X_GLM[t_offset + 0] - beta2 * c_X_GLM[DATA_T + t_offset + 0];
meaneps += s_Y[threadIdx.y][1][threadIdx.x] - beta1 * c_X_GLM[t_offset + 1] - beta2 * c_X_GLM[DATA_T + t_offset + 1];
meaneps += s_Y[threadIdx.y][2][threadIdx.x] - beta1 * c_X_GLM[t_offset + 2] - beta2 * c_X_GLM[DATA_T + t_offset + 2];
meaneps += s_Y[threadIdx.y][3][threadIdx.x] - beta1 * c_X_GLM[t_offset + 3] - beta2 * c_X_GLM[DATA_T + t_offset + 3];
meaneps += s_Y[threadIdx.y][4][threadIdx.x] - beta1 * c_X_GLM[t_offset + 4] - beta2 * c_X_GLM[DATA_T + t_offset + 4];
meaneps += s_Y[threadIdx.y][5][threadIdx.x] - beta1 * c_X_GLM[t_offset + 5] - beta2 * c_X_GLM[DATA_T + t_offset + 5];
meaneps += s_Y[threadIdx.y][6][threadIdx.x] - beta1 * c_X_GLM[t_offset + 6] - beta2 * c_X_GLM[DATA_T + t_offset + 6];
meaneps += s_Y[threadIdx.y][7][threadIdx.x] - beta1 * c_X_GLM[t_offset + 7] - beta2 * c_X_GLM[DATA_T + t_offset + 7];
}
t_offset = timeMultiples * 8;
for (int t = 0; t < timeRest; t++)
{
s_Y[threadIdx.y][0][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + t, DATA_W, DATA_H, DATA_D)];
meaneps += s_Y[threadIdx.y][0][threadIdx.x] - beta1 * c_X_GLM[t_offset + t] - beta2 * c_X_GLM[DATA_T + t_offset + t];
}
meaneps /= (float)DATA_T;
vareps = 0.0f;
t_offset = 0;
// Now calculate the variance of eps
for (t_offset = 0; t_offset < timeMultiples * 8; t_offset += 8)
{
// Read data into shared memory
s_Y[threadIdx.y][0][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 0, DATA_W, DATA_H, DATA_D)];
s_Y[threadIdx.y][1][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 1, DATA_W, DATA_H, DATA_D)];
s_Y[threadIdx.y][2][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 2, DATA_W, DATA_H, DATA_D)];
s_Y[threadIdx.y][3][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 3, DATA_W, DATA_H, DATA_D)];
s_Y[threadIdx.y][4][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 4, DATA_W, DATA_H, DATA_D)];
s_Y[threadIdx.y][5][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 5, DATA_W, DATA_H, DATA_D)];
s_Y[threadIdx.y][6][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 6, DATA_W, DATA_H, DATA_D)];
s_Y[threadIdx.y][7][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 7, DATA_W, DATA_H, DATA_D)];
__syncthreads();
eps = s_Y[threadIdx.y][0][threadIdx.x] - beta1 * c_X_GLM[t_offset + 0] - beta2 * c_X_GLM[DATA_T + t_offset + 0];
vareps += (eps - meaneps) * (eps - meaneps);
eps = s_Y[threadIdx.y][1][threadIdx.x] - beta1 * c_X_GLM[t_offset + 1] - beta2 * c_X_GLM[DATA_T + t_offset + 1];
vareps += (eps - meaneps) * (eps - meaneps);
eps = s_Y[threadIdx.y][2][threadIdx.x] - beta1 * c_X_GLM[t_offset + 2] - beta2 * c_X_GLM[DATA_T + t_offset + 2];
vareps += (eps - meaneps) * (eps - meaneps);
eps = s_Y[threadIdx.y][3][threadIdx.x] - beta1 * c_X_GLM[t_offset + 3] - beta2 * c_X_GLM[DATA_T + t_offset + 3];
vareps += (eps - meaneps) * (eps - meaneps);
eps = s_Y[threadIdx.y][4][threadIdx.x] - beta1 * c_X_GLM[t_offset + 4] - beta2 * c_X_GLM[DATA_T + t_offset + 4];
vareps += (eps - meaneps) * (eps - meaneps);
eps = s_Y[threadIdx.y][5][threadIdx.x] - beta1 * c_X_GLM[t_offset + 5] - beta2 * c_X_GLM[DATA_T + t_offset + 5];
vareps += (eps - meaneps) * (eps - meaneps);
eps = s_Y[threadIdx.y][6][threadIdx.x] - beta1 * c_X_GLM[t_offset + 6] - beta2 * c_X_GLM[DATA_T + t_offset + 6];
vareps += (eps - meaneps) * (eps - meaneps);
eps = s_Y[threadIdx.y][7][threadIdx.x] - beta1 * c_X_GLM[t_offset + 7] - beta2 * c_X_GLM[DATA_T + t_offset + 7];
vareps += (eps - meaneps) * (eps - meaneps);
}
t_offset = timeMultiples * 8;
for (int t = 0; t < timeRest; t++)
{
s_Y[threadIdx.y][0][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + t, DATA_W, DATA_H, DATA_D)];
eps = s_Y[threadIdx.y][0][threadIdx.x] - beta1 * c_X_GLM[t_offset + t] - beta2 * c_X_GLM[DATA_T + t_offset + t];
vareps += (eps - meaneps) * (eps - meaneps);
}
vareps /= ((float)DATA_T - 1);
activity_map[x + y * DATA_W + z * DATA_W * DATA_H] = (beta1 - beta2) / (sqrtf(vareps * ctxtxc));
}
else
{
activity_map[x + y * DATA_W + z * DATA_W * DATA_H] = 0.0f;
}
}
```

and here is the call from the host

```
#include "GLM_kernel.cu"
// activity_volume = fMRI_analysis(fMRI_volumes, X, xtxxt, ctxtxc)
void GLM(float* d_Activity_Volume, float* d_fMRI_Volumes, float* d_Brain_Voxels, float h_ctxtxc, int DATA_W, int DATA_H, int DATA_D, int DATA_T)
{
// Calculate the brain activity
int threadsInX = 32;
int threadsInY = 8;
int threadsInZ = 1;
int blocksInX = (DATA_W+threadsInX-1)/threadsInX;
int blocksInY = (DATA_H+threadsInY-1)/threadsInY;
int blocksInZ = (DATA_D+threadsInZ-1)/threadsInZ;
dim3 dimGrid = dim3(blocksInX, blocksInY*blocksInZ);
dim3 dimBlock = dim3(threadsInX, threadsInY, threadsInZ);
// Calculate how many time multiples there are
int timeMultiples = DATA_T / threadsInY;
int timeRest = DATA_T - timeMultiples * threadsInY;
CalculateActivityMap<<<dimGrid, dimBlock>>>(d_Activity_Volume, d_fMRI_Volumes, d_Brain_Voxels, h_ctxtxc, DATA_W, DATA_H, DATA_D, DATA_T, blocksInY, 1.0f/(float)blocksInY, timeMultiples, timeRest);
cutilCheckMsg(" ");
}
```

How can I optimize my code? I tried with “normal” for-loops (instead of 8 calculations in each loop) but I think it was slower.

Here is my kernel code

```
#define IMUL(a, b) __mul24(a, b)
__constant__ float c_xtxxt_GLM[320];
__constant__ float c_X_GLM[320];
__global__ void CalculateActivityMap(float* activity_map, float* fMRI_Volumes, float* Brain_Voxels, float ctxtxc, int DATA_W, int DATA_H, int DATA_D, int DATA_T, int blocksInY, float invBlocksInY, int timeMultiples, int timeRest)
{
unsigned int blockIdxz = __float2uint_rd(blockIdx.y * invBlocksInY);
unsigned int blockIdxy = blockIdx.y - __umul24(blockIdxz,blocksInY);
unsigned int x = __umul24(blockIdx.x,blockDim.x) + threadIdx.x;
unsigned int y = __umul24(blockIdxy ,blockDim.y) + threadIdx.y;
unsigned int z = __umul24(blockIdxz ,blockDim.z) + threadIdx.z;
if (x >= DATA_W || y >= DATA_H || z >= DATA_D)
return;
if ( Brain_Voxels[x + y * DATA_W + z * DATA_W * DATA_H] == 1.0f )
{
__shared__ float s_Y[8][8][32]; // y,t,x
int t_offset = 0;
float beta1 = 0.0f;
float beta2 = 0.0f;
float eps, meaneps, vareps;
// Calculate betahat, i.e. multiply (x^T x)^(-1) x^T with Y
for (t_offset = 0; t_offset < timeMultiples * 8; t_offset += 8)
{
// Read data into shared memory
s_Y[threadIdx.y][0][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 0, DATA_W, DATA_H, DATA_D)];
s_Y[threadIdx.y][1][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 1, DATA_W, DATA_H, DATA_D)];
s_Y[threadIdx.y][2][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 2, DATA_W, DATA_H, DATA_D)];
s_Y[threadIdx.y][3][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 3, DATA_W, DATA_H, DATA_D)];
s_Y[threadIdx.y][4][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 4, DATA_W, DATA_H, DATA_D)];
s_Y[threadIdx.y][5][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 5, DATA_W, DATA_H, DATA_D)];
s_Y[threadIdx.y][6][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 6, DATA_W, DATA_H, DATA_D)];
s_Y[threadIdx.y][7][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 7, DATA_W, DATA_H, DATA_D)];
__syncthreads();
// Sum and multiply the values in shared memory
beta1 += s_Y[threadIdx.y][0][threadIdx.x] * c_xtxxt_GLM[t_offset + 0];
beta1 += s_Y[threadIdx.y][1][threadIdx.x] * c_xtxxt_GLM[t_offset + 1];
beta1 += s_Y[threadIdx.y][2][threadIdx.x] * c_xtxxt_GLM[t_offset + 2];
beta1 += s_Y[threadIdx.y][3][threadIdx.x] * c_xtxxt_GLM[t_offset + 3];
beta1 += s_Y[threadIdx.y][4][threadIdx.x] * c_xtxxt_GLM[t_offset + 4];
beta1 += s_Y[threadIdx.y][5][threadIdx.x] * c_xtxxt_GLM[t_offset + 5];
beta1 += s_Y[threadIdx.y][6][threadIdx.x] * c_xtxxt_GLM[t_offset + 6];
beta1 += s_Y[threadIdx.y][7][threadIdx.x] * c_xtxxt_GLM[t_offset + 7];
beta2 += s_Y[threadIdx.y][0][threadIdx.x] * c_xtxxt_GLM[DATA_T + t_offset + 0];
beta2 += s_Y[threadIdx.y][1][threadIdx.x] * c_xtxxt_GLM[DATA_T + t_offset + 1];
beta2 += s_Y[threadIdx.y][2][threadIdx.x] * c_xtxxt_GLM[DATA_T + t_offset + 2];
beta2 += s_Y[threadIdx.y][3][threadIdx.x] * c_xtxxt_GLM[DATA_T + t_offset + 3];
beta2 += s_Y[threadIdx.y][4][threadIdx.x] * c_xtxxt_GLM[DATA_T + t_offset + 4];
beta2 += s_Y[threadIdx.y][5][threadIdx.x] * c_xtxxt_GLM[DATA_T + t_offset + 5];
beta2 += s_Y[threadIdx.y][6][threadIdx.x] * c_xtxxt_GLM[DATA_T + t_offset + 6];
beta2 += s_Y[threadIdx.y][7][threadIdx.x] * c_xtxxt_GLM[DATA_T + t_offset + 7];
}
t_offset = timeMultiples * 8;
for (int t = 0; t < timeRest; t++)
{
s_Y[threadIdx.y][0][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + t, DATA_W, DATA_H, DATA_D)];
beta1 += s_Y[threadIdx.y][0][threadIdx.x] * c_xtxxt_Detrend[t_offset + t];
beta2 += s_Y[threadIdx.y][0][threadIdx.x] * c_xtxxt_Detrend[DATA_T + t_offset + t];
}
// Calculate the mean of the error eps
meaneps = 0.0f;
for (t_offset = 0; t_offset < timeMultiples * 8; t_offset += 8)
{
// Read data into shared memory again
s_Y[threadIdx.y][0][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 0, DATA_W, DATA_H, DATA_D)];
s_Y[threadIdx.y][1][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 1, DATA_W, DATA_H, DATA_D)];
s_Y[threadIdx.y][2][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 2, DATA_W, DATA_H, DATA_D)];
s_Y[threadIdx.y][3][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 3, DATA_W, DATA_H, DATA_D)];
s_Y[threadIdx.y][4][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 4, DATA_W, DATA_H, DATA_D)];
s_Y[threadIdx.y][5][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 5, DATA_W, DATA_H, DATA_D)];
s_Y[threadIdx.y][6][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 6, DATA_W, DATA_H, DATA_D)];
s_Y[threadIdx.y][7][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 7, DATA_W, DATA_H, DATA_D)];
__syncthreads();
meaneps += s_Y[threadIdx.y][0][threadIdx.x] - beta1 * c_X_GLM[t_offset + 0] - beta2 * c_X_GLM[DATA_T + t_offset + 0];
meaneps += s_Y[threadIdx.y][1][threadIdx.x] - beta1 * c_X_GLM[t_offset + 1] - beta2 * c_X_GLM[DATA_T + t_offset + 1];
meaneps += s_Y[threadIdx.y][2][threadIdx.x] - beta1 * c_X_GLM[t_offset + 2] - beta2 * c_X_GLM[DATA_T + t_offset + 2];
meaneps += s_Y[threadIdx.y][3][threadIdx.x] - beta1 * c_X_GLM[t_offset + 3] - beta2 * c_X_GLM[DATA_T + t_offset + 3];
meaneps += s_Y[threadIdx.y][4][threadIdx.x] - beta1 * c_X_GLM[t_offset + 4] - beta2 * c_X_GLM[DATA_T + t_offset + 4];
meaneps += s_Y[threadIdx.y][5][threadIdx.x] - beta1 * c_X_GLM[t_offset + 5] - beta2 * c_X_GLM[DATA_T + t_offset + 5];
meaneps += s_Y[threadIdx.y][6][threadIdx.x] - beta1 * c_X_GLM[t_offset + 6] - beta2 * c_X_GLM[DATA_T + t_offset + 6];
meaneps += s_Y[threadIdx.y][7][threadIdx.x] - beta1 * c_X_GLM[t_offset + 7] - beta2 * c_X_GLM[DATA_T + t_offset + 7];
}
t_offset = timeMultiples * 8;
for (int t = 0; t < timeRest; t++)
{
s_Y[threadIdx.y][0][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + t, DATA_W, DATA_H, DATA_D)];
meaneps += s_Y[threadIdx.y][0][threadIdx.x] - beta1 * c_X_GLM[t_offset + t] - beta2 * c_X_GLM[DATA_T + t_offset + t];
}
meaneps /= (float)DATA_T;
vareps = 0.0f;
t_offset = 0;
// Now calculate the variance of eps
for (t_offset = 0; t_offset < timeMultiples * 8; t_offset += 8)
{
// Read data into shared memory
s_Y[threadIdx.y][0][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 0, DATA_W, DATA_H, DATA_D)];
s_Y[threadIdx.y][1][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 1, DATA_W, DATA_H, DATA_D)];
s_Y[threadIdx.y][2][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 2, DATA_W, DATA_H, DATA_D)];
s_Y[threadIdx.y][3][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 3, DATA_W, DATA_H, DATA_D)];
s_Y[threadIdx.y][4][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 4, DATA_W, DATA_H, DATA_D)];
s_Y[threadIdx.y][5][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 5, DATA_W, DATA_H, DATA_D)];
s_Y[threadIdx.y][6][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 6, DATA_W, DATA_H, DATA_D)];
s_Y[threadIdx.y][7][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + 7, DATA_W, DATA_H, DATA_D)];
__syncthreads();
eps = s_Y[threadIdx.y][0][threadIdx.x] - beta1 * c_X_GLM[t_offset + 0] - beta2 * c_X_GLM[DATA_T + t_offset + 0];
vareps += (eps - meaneps) * (eps - meaneps);
eps = s_Y[threadIdx.y][1][threadIdx.x] - beta1 * c_X_GLM[t_offset + 1] - beta2 * c_X_GLM[DATA_T + t_offset + 1];
vareps += (eps - meaneps) * (eps - meaneps);
eps = s_Y[threadIdx.y][2][threadIdx.x] - beta1 * c_X_GLM[t_offset + 2] - beta2 * c_X_GLM[DATA_T + t_offset + 2];
vareps += (eps - meaneps) * (eps - meaneps);
eps = s_Y[threadIdx.y][3][threadIdx.x] - beta1 * c_X_GLM[t_offset + 3] - beta2 * c_X_GLM[DATA_T + t_offset + 3];
vareps += (eps - meaneps) * (eps - meaneps);
eps = s_Y[threadIdx.y][4][threadIdx.x] - beta1 * c_X_GLM[t_offset + 4] - beta2 * c_X_GLM[DATA_T + t_offset + 4];
vareps += (eps - meaneps) * (eps - meaneps);
eps = s_Y[threadIdx.y][5][threadIdx.x] - beta1 * c_X_GLM[t_offset + 5] - beta2 * c_X_GLM[DATA_T + t_offset + 5];
vareps += (eps - meaneps) * (eps - meaneps);
eps = s_Y[threadIdx.y][6][threadIdx.x] - beta1 * c_X_GLM[t_offset + 6] - beta2 * c_X_GLM[DATA_T + t_offset + 6];
vareps += (eps - meaneps) * (eps - meaneps);
eps = s_Y[threadIdx.y][7][threadIdx.x] - beta1 * c_X_GLM[t_offset + 7] - beta2 * c_X_GLM[DATA_T + t_offset + 7];
vareps += (eps - meaneps) * (eps - meaneps);
}
t_offset = timeMultiples * 8;
for (int t = 0; t < timeRest; t++)
{
s_Y[threadIdx.y][0][threadIdx.x] = fMRI_Volumes[Calculate_4D_Index(x, y, z, t_offset + t, DATA_W, DATA_H, DATA_D)];
eps = s_Y[threadIdx.y][0][threadIdx.x] - beta1 * c_X_GLM[t_offset + t] - beta2 * c_X_GLM[DATA_T + t_offset + t];
vareps += (eps - meaneps) * (eps - meaneps);
}
vareps /= ((float)DATA_T - 1);
activity_map[x + y * DATA_W + z * DATA_W * DATA_H] = (beta1 - beta2) / (sqrtf(vareps * ctxtxc));
}
else
{
activity_map[x + y * DATA_W + z * DATA_W * DATA_H] = 0.0f;
}
}
```

and here is the call from the host

```
#include "GLM_kernel.cu"
// activity_volume = fMRI_analysis(fMRI_volumes, X, xtxxt, ctxtxc)
void GLM(float* d_Activity_Volume, float* d_fMRI_Volumes, float* d_Brain_Voxels, float h_ctxtxc, int DATA_W, int DATA_H, int DATA_D, int DATA_T)
{
// Calculate the brain activity
int threadsInX = 32;
int threadsInY = 8;
int threadsInZ = 1;
int blocksInX = (DATA_W+threadsInX-1)/threadsInX;
int blocksInY = (DATA_H+threadsInY-1)/threadsInY;
int blocksInZ = (DATA_D+threadsInZ-1)/threadsInZ;
dim3 dimGrid = dim3(blocksInX, blocksInY*blocksInZ);
dim3 dimBlock = dim3(threadsInX, threadsInY, threadsInZ);
// Calculate how many time multiples there are
int timeMultiples = DATA_T / threadsInY;
int timeRest = DATA_T - timeMultiples * threadsInY;
CalculateActivityMap<<<dimGrid, dimBlock>>>(d_Activity_Volume, d_fMRI_Volumes, d_Brain_Voxels, h_ctxtxc, DATA_W, DATA_H, DATA_D, DATA_T, blocksInY, 1.0f/(float)blocksInY, timeMultiples, timeRest);
cutilCheckMsg(" ");
}
```

How can I optimize my code? I tried with “normal” for-loops (instead of 8 calculations in each loop) but I think it was slower.

You can try the -maxregcount key to specify the desired number of registers. Fermi hardware is very efficient in spilling registers to cache so forced number of registers should not slow things down. Experiment will show whether lower number of registers will give more flops.

You can try the -maxregcount key to specify the desired number of registers. Fermi hardware is very efficient in spilling registers to cache so forced number of registers should not slow things down. Experiment will show whether lower number of registers will give more flops.

I think I tried this but the code became slower, but I can try it again.

I think I tried this but the code became slower, but I can try it again.

A couple of general recommendations:

(1) Avoid the use of __umul24() and __mul24(). Even on sm_1x platforms, the performance advantage tends to be minimal, and the use of these intrinsics may inhibit certain compiler optimizations.

(2) Use rsqrtf() instead of division by sqrtf(), unless your code requires IEEE-754 semantics.

As for the register pressure, as avidday points out, this may or may not impact the performance of this code. It seems that for best performance we would want to check all array accesses to make sure they follow a base+TID pattern (I can’t tell in the case of fMRI_Volumes, for example). Here is one thing you could try that should reduce register pressure (a little):

(3) Use the compiler flags -prev-div=false -prev-sqrt=false. The compiler default for sm_2x platforms is IEEE-compliant single-precision division and square root. While this is generally a good thing, the implementation of the IEEE-compliant operations eats a few more registers than the approximate versions. By passing these two flags, you instruct the compiler to emit code for approximate division and square root, potentially saving some registers. Obviously this change may also cause some slight reduction in the accuracy of the results, but the differenece may not matter in practical terms.

The following is probably going to result in increased register usage, but can also enable more aggressive optimizations, and may thus lead to improved performance overall. It may therefore be worth a try:

(4) Presumably the arrays activity_map, fMRI_Volumes, Brain_Voxels are all independent and non-overlapping areas of storage. If so, mark these pointers as restricted pointers to let the compiler know. Also, it seems only activity_map is written to. IF so, mark the other arrays as constants. This would give us:

float* **restrict** activity_map, const float* **restrict** fMRI_Volumes, const float* **restrict** Brain_Voxels

A couple of general recommendations:

(1) Avoid the use of __umul24() and __mul24(). Even on sm_1x platforms, the performance advantage tends to be minimal, and the use of these intrinsics may inhibit certain compiler optimizations.

(2) Use rsqrtf() instead of division by sqrtf(), unless your code requires IEEE-754 semantics.

As for the register pressure, as avidday points out, this may or may not impact the performance of this code. It seems that for best performance we would want to check all array accesses to make sure they follow a base+TID pattern (I can’t tell in the case of fMRI_Volumes, for example). Here is one thing you could try that should reduce register pressure (a little):

(3) Use the compiler flags -prev-div=false -prev-sqrt=false. The compiler default for sm_2x platforms is IEEE-compliant single-precision division and square root. While this is generally a good thing, the implementation of the IEEE-compliant operations eats a few more registers than the approximate versions. By passing these two flags, you instruct the compiler to emit code for approximate division and square root, potentially saving some registers. Obviously this change may also cause some slight reduction in the accuracy of the results, but the differenece may not matter in practical terms.

The following is probably going to result in increased register usage, but can also enable more aggressive optimizations, and may thus lead to improved performance overall. It may therefore be worth a try:

(4) Presumably the arrays activity_map, fMRI_Volumes, Brain_Voxels are all independent and non-overlapping areas of storage. If so, mark these pointers as restricted pointers to let the compiler know. Also, it seems only activity_map is written to. IF so, mark the other arrays as constants. This would give us:

float* **restrict** activity_map, const float* **restrict** fMRI_Volumes, const float* **restrict** Brain_Voxels

I’ve learned a thing or two from your response. Thank you.

Christian

I’ve learned a thing or two from your response. Thank you.

Christian

Can you pleaase provide your definitions of Calculate_4D_Index and c_xtxxt_Detrend please so I can compile this thing?

I love to crack register challenges, but I really need to run this through the compiler.

And for which architecture (sm_10, sm_11, sm_12, sm_13, sm_20, sm_21) are you compiling this, what is your CUDA SDK version?

Christian

Can you pleaase provide your definitions of Calculate_4D_Index and c_xtxxt_Detrend please so I can compile this thing?

I love to crack register challenges, but I really need to run this through the compiler.

And for which architecture (sm_10, sm_11, sm_12, sm_13, sm_20, sm_21) are you compiling this, what is your CUDA SDK version?

Christian

The detrend is not supposed to be there…D’oh…that explains some small errors that I get, thanks for finding the bug! Replace c_xtxxt_Detrend with c_xtxxt_GLM.

```
__device__ int Calculate_4D_Index(int x, int y, int z, int t, int DATA_W, int DATA_H, int DATA_D)
{
return x + y * DATA_W + z * DATA_W * DATA_H + t * DATA_W * DATA_H * DATA_D;
}
```

For my test dataset,

DATA_W = 64

DATA_H = 64

DATA_D = 22

DATA_T = 80

I compile for sm_20 (GTX 480), right now CUDA 3.0. I use Linux 64 bit so all pointers cost 2 registers?

Btw, what is sm_21 ?

The detrend is not supposed to be there…D’oh…that explains some small errors that I get, thanks for finding the bug! Replace c_xtxxt_Detrend with c_xtxxt_GLM.

```
__device__ int Calculate_4D_Index(int x, int y, int z, int t, int DATA_W, int DATA_H, int DATA_D)
{
return x + y * DATA_W + z * DATA_W * DATA_H + t * DATA_W * DATA_H * DATA_D;
}
```

For my test dataset,

DATA_W = 64

DATA_H = 64

DATA_D = 22

DATA_T = 80

I compile for sm_20 (GTX 480), right now CUDA 3.0. I use Linux 64 bit so all pointers cost 2 registers?

Btw, what is sm_21 ?

I think compute capability 2.1 is new to GF104, but I am actually not sure if there is a sm_21 architecture switch.

This is what I am getting now (CUDA SDK 2.3, compiled for sm_10) and I can start optimizing from here.

buchner@athlonx2:~/wanderine> nvcc --ptxas-options=-v -arch sm_10 GLM.cu -I /home/buchner/NVIDIA_CUDA_SDK/C/common/inc/

ptxas info : Compiling entry function ‘_Z20CalculateActivityMapPfS_S_fiiiiifii’

ptxas info : Used 30 registers, 8240+16 bytes smem, 2560 bytes cmem[0], 16 bytes cmem[1]

The question is: Is it any useful for you if I start optimizing registers for sm_1x compute models?

There should be a switch so you can use mixed architecture models (host 64 bit, GPU 32 bit), so this will save you some registers on pointers. The downside is that you will be limited to 4 GB addressable memory - and starting with CUDA 3.2 this model will become deprecated (read latest architecture brief on the registered developers site)