[quote name=‘marcof’ post=‘554063’ date=‘Jun 18 2009, 06:48 AM’]
Hi, I am having some troubles using the CSR kernels into my CG solver: the results are ok at the first iteration, but they become wrong afterwards.
- The scalar version of the kernel runs fine if I replace
[codebox]template
global void
spmv_csr_vector_kernel(const IndexType num_rows,
const IndexType * Ap,
const IndexType * Aj,
const ValueType * Ax,
const ValueType * x,
ValueType * y)
{
#ifdef DEVICE_EMULATION
const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index
const IndexType thread_lane = threadIdx.x & (WARP_SIZE-1); // thread index within the warp
const IndexType warp_id = thread_id / WARP_SIZE; // global warp index
const IndexType num_warps = (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps
for(IndexType row = warp_id; row < num_rows; row += num_warps){
// cheat and use one thread to process the whole row
if (thread_lane == 0){
ValueType sum = y[row];
const IndexType row_start = Ap[row];
const IndexType row_end = Ap[row+1];
for (IndexType jj = row_start; jj < row_end; jj++){
sum += Ax[jj] * fetch_x<UseCache>(Aj[jj], x);
}
y[row] = sum;
}
}
#else
__shared__ ValueType sdata[BLOCK_SIZE];
__shared__ IndexType ptrs[BLOCK_SIZE/WARP_SIZE][2];
const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index
const IndexType thread_lane = threadIdx.x & (WARP_SIZE-1); // thread index within the warp
const IndexType warp_id = thread_id / WARP_SIZE; // global warp index
const IndexType warp_lane = threadIdx.x / WARP_SIZE; // warp index within the CTA
const IndexType num_warps = (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps
for(IndexType row = warp_id; row < num_rows; row += num_warps){
// use two threads to fetch Ap[row] and Ap[row+1]
// this is considerably faster than the more straightforward option
if(thread_lane < 2)
ptrs[warp_lane][thread_lane] = Ap[row + thread_lane];
const IndexType row_start = ptrs[warp_lane][0]; //same as: row_start = Ap[row];
const IndexType row_end = ptrs[warp_lane][1]; //same as: row_end = Ap[row+1];
// compute local sum
sdata[threadIdx.x] = 0;
for(IndexType jj = row_start + thread_lane; jj < row_end; jj += WARP_SIZE)
sdata[threadIdx.x] += Ax[jj] * fetch_x<UseCache>(Aj[jj], x);
// reduce local sums to row sum (ASSUME: warpsize 32)
if (thread_lane < 16) { sdata[threadIdx.x] += sdata[threadIdx.x + 16]; }
if (thread_lane < 8) { sdata[threadIdx.x] += sdata[threadIdx.x + 8]; }
if (thread_lane < 4) { sdata[threadIdx.x] += sdata[threadIdx.x + 4]; }
if (thread_lane < 2) { sdata[threadIdx.x] += sdata[threadIdx.x + 2]; }
if (thread_lane < 1) { sdata[threadIdx.x] += sdata[threadIdx.x + 1]; }
// first thread writes warp result
if (thread_lane == 0)
y[row] += sdata[threadIdx.x];
}
#endif
}[/codebox]