Hello,
I have a kernel for matrix-vector multiplication. When I time that kernel in a separate sample.cu file, the profiled time is around 100-130 microseconds.
If I include the same kernel in my project, with the same launch configuration and input sizes, it takes more than 260 microseconds.
Can anyone suggest why I am getting different timings?
This is the kernel code:
/**
 * Tiled matrix-vector product: each block accumulates BLOCK_SIZE consecutive
 * entries of the output vector.
 *
 * Indexing, as used by this kernel:
 *   - dA is read as dA[r + c * NS] (row index r, column index c).
 *   - dx is read as dx[c].
 *   - dev_ptr_y is written at dev_ptr_y[blockIdx.x * BLOCK_SIZE + threadIdx.x].
 *
 * NS and BLOCK_SIZE are compile-time constants defined outside this block.
 *
 * Launch requirements:
 *   - 1D grid; block `b` produces outputs [b * BLOCK_SIZE, (b + 1) * BLOCK_SIZE).
 *   - blockDim.x must be at least BLOCK_SIZE (threadIdx.x selects the row).
 *   - blockDim.y may be any value >= 1; threads along y only cooperate in
 *     loading the tile of A into shared memory.
 *   - Static shared memory: (BLOCK_SIZE * BLOCK_SIZE + BLOCK_SIZE) * sizeof(T).
 *
 * Limitations:
 *   - Only NS / BLOCK_SIZE full column tiles are processed; if NS is not a
 *     multiple of BLOCK_SIZE the trailing columns are ignored.
 *   - No bounds check is made on the row index, so the grid size must match
 *     the extent of dA and dev_ptr_y.
 *     NOTE(review): presumably gridDim.x == NS / BLOCK_SIZE -- confirm at the
 *     launch site.
 *
 * @param dA         device pointer to the matrix (read-only here)
 * @param dx         device pointer to the input vector (read-only here)
 * @param dev_ptr_y  device pointer to the output vector
 */
template<typename T>
__global__ void matvec_kernel(
        T * dA,
        T * dx,
        T * dev_ptr_y)
{
    // The tiles are reused by every loop iteration, so declare them once.
    __shared__ T A_shared[BLOCK_SIZE * BLOCK_SIZE];
    __shared__ T x_shared[BLOCK_SIZE];

    const unsigned int bid = blockIdx.x;
    const unsigned int row = threadIdx.x;
    const unsigned int col = threadIdx.y;

    const bool row_in_tile = row < BLOCK_SIZE;
    // Exactly one thread per output row accumulates and stores the result.
    // Without this, every threadIdx.y slice would repeat the same dot product
    // and write the same value to the same address.
    const bool owns_row = row_in_tile && (col == 0);

    T * y_sub = dev_ptr_y + bid * BLOCK_SIZE;
    T y_val = T(0);

    for (unsigned int m = 0; m < (NS / BLOCK_SIZE); ++m) {
        // Compute the tile offset in 64-bit so that m * NS * BLOCK_SIZE cannot
        // wrap around for large NS.
        const T * Asub = dA
            + static_cast<unsigned long long>(BLOCK_SIZE)
              * (bid + static_cast<unsigned long long>(m) * NS);
        const T * xsub = dx + m * BLOCK_SIZE;

        /* x_shared <--- xsub */
        if (owns_row)
            x_shared[row] = xsub[row];

        /* A_shared <--- Asub
         * Stride over the tile columns by blockDim.y so the whole tile is
         * filled whatever the block height is. With blockDim.y == BLOCK_SIZE
         * each thread loads exactly one element. */
        if (row_in_tile) {
            for (unsigned int c = col; c < BLOCK_SIZE; c += blockDim.y)
                A_shared[row + c * BLOCK_SIZE] = Asub[row + c * NS];
        }

        // Both tiles must be complete before any thread reads them.
        __syncthreads();

        if (owns_row) {
            for (unsigned int e = 0; e < BLOCK_SIZE; ++e)
                y_val += A_shared[row + e * BLOCK_SIZE] * x_shared[e];
        }

        // Do not overwrite the tiles while another thread is still reading.
        __syncthreads();
    }

    if (owns_row)
        y_sub[row] = y_val;
}