I have a main.cu file with two marginally different kernels (and nothing else). When I run nvcc -cubin main.cu and inspect the resulting .cubin file, only one of the kernels is present. nvcc does not throw any errors or warnings. I made sure the .cubin file was getting recompiled. This is on Mac OS X 10.5, using version 2.0 of the CUDA toolchain.
I tried asking my cuda-using friend. He says that due to the cubin embargo, kernels in the cubin army need special passports. Anyone have a better explanation?
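For reference, a stripped-down main.cu along these lines reproduces the setup (the kernel names here are placeholders, not my real kernels). In the 2.0 toolchain the .cubin is plain text, so searching it for the "name =" entries should list every kernel that made it in:

__global__ void kernelA(float* out) { out[threadIdx.x] = 1.0f; }
__global__ void kernelB(float* out) { out[threadIdx.x] = 2.0f; }

// nvcc -cubin main.cu
// grep "name =" main.cubin    <- should show an entry for each kernel

Below is the second of my two kernels: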
__global__ void MatrixMultiplierKernel2(float* C, float* A, float* D, float* B, int wA, int wB)
{
// wD = hD = wA
// Block index
int bx = blockIdx.x; // blocks
int by = blockIdx.y; // blocks
// Thread index
int tx = threadIdx.x; // bytes
int ty = threadIdx.y; // bytes
// Index of the first sub-matrix of A processed by the block
int aBegin = wA * KERNEL2_BLOCK_SIZE * by; // (KERNEL2_BLOCK_SIZE*by) is the number of rows down, wA is the pitch.
// Index of the last sub-matrix of A processed by the block
int aEnd = aBegin + wA - 1;
// Step size used to iterate through the sub-matrices of A
int aStep = KERNEL2_BLOCK_SIZE;
// Index of the first sub-matrix of B processed by the block
int bBegin = KERNEL2_BLOCK_SIZE * bx;
// Step size used to iterate through the sub-matrices of B
int bStep = KERNEL2_BLOCK_SIZE * wB;
// Index of the first sub-Matrix of D processed by the block.
// int dBegin = KERNEL2_BLOCK_SIZE *
// Csub is used to store the element of the block sub-matrix
// that is computed by the thread
float Csub = 0;
// Declaration of the shared memory array As used to
// store the sub-matrix of A
__shared__ float As[KERNEL2_BLOCK_SIZE][KERNEL2_BLOCK_SIZE];
// Declaration of the shared memory array Bs used to
// store the sub-matrix of B
__shared__ float Bs[KERNEL2_BLOCK_SIZE][KERNEL2_BLOCK_SIZE];
// Hold a subvector of the diagonal of D in shared memory.
__shared__ float Ds[KERNEL2_BLOCK_SIZE][KERNEL2_BLOCK_SIZE];
// Loop over all the sub-matrices of A and B
// required to compute the block sub-matrix
for (int a = aBegin, b = bBegin, d = 0;
a <= aEnd;
a += aStep, b += bStep, d+=KERNEL2_BLOCK_SIZE) {
// Load the matrices from device memory
// to shared memory; each thread loads
// one element of each matrix
As[ty][tx] = A[a + wA * ty + tx];
Bs[ty][tx] = B[b + wB * ty + tx];
Ds[ty][tx] = D[d + ty];
// Note: Should we multiply the rows of B by the values in the appropriate block of D in a separate kernel? Another Kernel call vs. More multiplications...
// Note: This implementation results in duplication of data (Each row is the same). Is this cheaper than branching using an if statement?
// The reads from global memory are cached
// Note: There is thread read overlap reading from device memory, but no overlap writing into shared memory. -> Could save some shared memory at the cost of overlapping writes.
// Synchronize to make sure the matrices are loaded
__syncthreads();
// Multiply the two matrices together;
// each thread computes one element
// of the block sub-matrix
for (int k = 0; k < KERNEL2_BLOCK_SIZE; ++k) {
//Csub += As[ty][k] * Bs[k][tx]; // Debugging version: check that everything but D is working...
//Csub = Ds[ty][tx]; // Debugging: check that D is getting filled in...
Csub += As[ty][k] * Ds[k][tx] * Bs[k][tx];
}
// Note: Does this result in a bank conflict? Or is the same value broadcast to the 16 thread half-warp?
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
__syncthreads();
}
// Write the block sub-matrix to device memory;
// each thread writes one element
int c = wB * KERNEL2_BLOCK_SIZE * by + KERNEL2_BLOCK_SIZE * bx;
C[c + wB * ty + tx] = Csub;
}
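For context, a host-side launch would look roughly like the sketch below. The wrapper name and the hA argument are just for illustration (they are not my real calling code), KERNEL2_BLOCK_SIZE is assumed to be defined in the same file, and D is a device pointer to a vector of length wA holding just the diagonal entries, matching the D[d + ty] indexing above:

#include <cuda_runtime.h>

// Illustrative host-side wrapper (assumed names and dimensions), not the real calling code.
void launchMatrixMultiplierKernel2(float* d_C, float* d_A, float* d_D,
                                   float* d_B, int wA, int hA, int wB)
{
    // One thread per element of C; wB and hA are assumed to be multiples
    // of KERNEL2_BLOCK_SIZE, since the kernel does no bounds checking.
    dim3 threads(KERNEL2_BLOCK_SIZE, KERNEL2_BLOCK_SIZE);
    dim3 grid(wB / KERNEL2_BLOCK_SIZE, hA / KERNEL2_BLOCK_SIZE);
    // d_D holds only the wA diagonal entries of D.
    MatrixMultiplierKernel2<<<grid, threads>>>(d_C, d_A, d_D, d_B, wA, wB);
}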