/*
 * kernel_elmcol_m_unrolled
 *
 * Each thread with a flat index below *num_threads updates one element of y:
 *
 *   y[IDX2C(i, i1 + col, c_num_equations) + row * incy] +=
 *       alpha[col] * x[IDX2C(i, i, c_num_equations) + row * incx]
 *
 * where row = tid % *num_row_elements and col = tid / *num_row_elements.
 *
 * Parameters (all scalars are passed by pointer and read once per thread):
 *   num_threads       - number of flat thread indices that do work
 *   alpha             - per-column scale factors, indexed by col
 *   x, incx           - source vector base pointer and its element stride
 *   y, incy           - destination base pointer and its element stride
 *   i, i1             - row/column arguments forwarded to IDX2C
 *   num_row_elements  - number of consecutive tids mapped to one column
 *
 * Launch layout: 1D thread blocks (only threadIdx.x is used) in a 1D or 2D
 * grid. IDX2C and c_num_equations are defined elsewhere in the project.
 *
 * NOTE(review): x and y are not marked __restrict__ because the indexing
 * suggests they may point into the same matrix - confirm with the caller.
 * NOTE(review): every scalar is dereferenced from a pointer by each thread;
 * passing them by value would avoid those loads, but that would change the
 * kernel's interface, so it is left as is.
 */
__global__ void kernel_elmcol_m_unrolled(int *num_threads,
                                         float *alpha,
                                         const float *x,
                                         int *incx,
                                         float *y,
                                         int *incy,
                                         int *i,
                                         int *i1,
                                         int *num_row_elements)
{
    // Linearise the block index over the (possibly 2D) grid first, then scale
    // by the block size. The y term must also be multiplied by blockDim.x;
    // otherwise blocks in different grid rows produce overlapping tids and
    // several threads race on the same y element.
    const int block_id = blockIdx.y * gridDim.x + blockIdx.x;
    const int tid = block_id * blockDim.x + threadIdx.x;

    const int k_num_threads = *num_threads;
    if (tid >= k_num_threads) {
        return;
    }

    const int k_num_row_elements = *num_row_elements;
    if (k_num_row_elements <= 0) {
        // Nothing to update, and this avoids a division by zero below.
        return;
    }

    const int k_incx = *incx;
    const int k_incy = *incy;
    const int k_i = *i;
    const int k_i1 = *i1;

    // Consecutive tids walk along one column before moving to the next.
    const int row_offset = tid % k_num_row_elements;
    const int col_offset = tid / k_num_row_elements;

    const int x_index = IDX2C(k_i, k_i, c_num_equations) + row_offset * k_incx;
    const int y_index =
        IDX2C(k_i, k_i1 + col_offset, c_num_equations) + row_offset * k_incy;

    // `+=` is a read-modify-write: y is loaded as well as stored.
    y[y_index] += alpha[col_offset] * x[x_index];
}

[/codebox]

Now, this takes a really long time to run. Also, curiously, when I use `=` instead of `+=` it runs about 3 times faster.

What could be going wrong here?