Missing kernels in a .cubin file? Not all of my kernels are showing up

Hello-

I have a main.cu file with two marginally different kernels (and nothing else). When I run nvcc -cubin main.cu and inspect the resulting .cubin file, only one of the kernels is present. nvcc does not throw any errors or warnings, and I made sure the .cubin file was actually getting recompiled. This is on Mac OS X 10.5, using version 2.0 of the CUDA toolkit.

I tried asking my cuda-using friend. He says that due to the cubin embargo, kernels in the cubin army need special passports. Anyone have a better explanation?

Cheers,
Drew

Could you post the function headers for the kernels?

Also, the compiler inlines all __device__ functions. So if one of them is qualified with __device__ only, it probably won't get a separate entry in the cubin.
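For example, in a sketch like this (hypothetical names), only the __global__ entry point would get an entry in the cubin; the __device__ helper is inlined into its caller:

[codebox]// helper: inlined into its callers, so it gets no entry of its own in the cubin
__device__ float scale(float x) {
    return 2.0f * x;
}

// entry point: this is what shows up in the cubin and gets launched from the host
__global__ void scaleKernel(float* data) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    data[i] = scale(data[i]);
}[/codebox]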

These are the prototypes for the two functions I have in my .cu file:

__global__ void MatrixMultiplierKernel1( float* C, float* A, float* B, int hA, int wB, int wA);

__global__ void MatrixMultiplierKernel2( float* C, float* A, float* D, float* B, int wA, int wB);

[codebox]// nvcc -cubin main.cu

/* Matrix multiplication: C = A * B.
 * Host code.
 *
 * This sample implements matrix multiplication and is exactly the same as
 * Chapter 7 of the programming guide.
 * It has been written for clarity of exposition to illustrate various CUDA
 * programming principles, not with the goal of providing the most
 * performant generic kernel for matrix multiplication.
 * CUBLAS provides high-performance matrix multiplication.
 */

// includes, system
//#include <stdlib.h>
//#include <stdio.h>
//#include <string.h>
//#include <math.h>
//#include <cutil.h>

#define KERNEL1_BLOCK_SIZE 16
#define KERNEL2_BLOCK_SIZE 16

/*
__global__ void trivial_kernel(int* a) {
    // (*a)++;
    __shared__ int a_shared;
    a_shared = a[0];
    a_shared++;
    a[0] = a_shared;
}
*/

////////////////////////////////////////////////////////////////////////////////
//! Matrix multiplication on the device: C = A * B
//! wA is A's width and wB is B's width
////////////////////////////////////////////////////////////////////////////////
__global__ void MatrixMultiplierKernel1( float* C, float* A, float* B, int hA, int wB, int wA)
{
    // Block index
    int bx = blockIdx.x;
    int by = blockIdx.y;

    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Index of the first sub-matrix of A processed by the block
    int aBegin = wA * KERNEL1_BLOCK_SIZE * by;

    // Index of the last sub-matrix of A processed by the block
    int aEnd   = aBegin + wA - 1;

    // Step size used to iterate through the sub-matrices of A
    int aStep  = KERNEL1_BLOCK_SIZE;

    // Index of the first sub-matrix of B processed by the block
    int bBegin = KERNEL1_BLOCK_SIZE * bx;

    // Step size used to iterate through the sub-matrices of B
    int bStep  = KERNEL1_BLOCK_SIZE * wB;

    // Csub is used to store the element of the block sub-matrix
    // that is computed by the thread
    float Csub = 0;

    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (int a = aBegin, b = bBegin;
             a <= aEnd;
             a += aStep, b += bStep) {

        // Declaration of the shared memory array As used to
        // store the sub-matrix of A
        __shared__ float As[KERNEL1_BLOCK_SIZE][KERNEL1_BLOCK_SIZE];

        // Declaration of the shared memory array Bs used to
        // store the sub-matrix of B
        __shared__ float Bs[KERNEL1_BLOCK_SIZE][KERNEL1_BLOCK_SIZE];

        // Load the matrices from device memory
        // to shared memory; each thread loads
        // one element of each matrix
        As[ty][tx] = A[a + wA * ty + tx];
        Bs[ty][tx] = B[b + wB * ty + tx];

        // Synchronize to make sure the matrices are loaded
        __syncthreads();

        // Multiply the two matrices together;
        // each thread computes one element
        // of the block sub-matrix
        for (int k = 0; k < KERNEL1_BLOCK_SIZE; ++k)
            Csub += As[ty][k] * Bs[k][tx];

        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        __syncthreads();
    }

    // Write the block sub-matrix to device memory;
    // each thread writes one element
    int c = wB * KERNEL1_BLOCK_SIZE * by + KERNEL1_BLOCK_SIZE * bx;
    C[c + wB * ty + tx] = Csub;
}

#if 0
////////////////////////////////////////////////////////////////////////////////
//! Matrix multiplication on the device: C = A * D * B
//! wA is A's width and wB is B's width
////////////////////////////////////////////////////////////////////////////////
__global__ void MatrixMultiplierKernel2( float* C, float* A, float* D, float* B, int wA, int wB)
{
    // wD = hD = wA

    // Block index
    int bx = blockIdx.x;
    int by = blockIdx.y;

    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Index of the first sub-matrix of A processed by the block:
    // (KERNEL2_BLOCK_SIZE * by) is the number of rows down, wA is the pitch.
    int aBegin = wA * KERNEL2_BLOCK_SIZE * by;

    // Index of the last sub-matrix of A processed by the block
    int aEnd   = aBegin + wA - 1;

    // Step size used to iterate through the sub-matrices of A
    int aStep  = KERNEL2_BLOCK_SIZE;

    // Index of the first sub-matrix of B processed by the block
    int bBegin = KERNEL2_BLOCK_SIZE * bx;

    // Step size used to iterate through the sub-matrices of B
    int bStep  = KERNEL2_BLOCK_SIZE * wB;

    // Index of the first sub-matrix of D processed by the block.
    // int dBegin = KERNEL2_BLOCK_SIZE *

    // Csub is used to store the element of the block sub-matrix
    // that is computed by the thread
    float Csub = 0;

    // Declaration of the shared memory array As used to
    // store the sub-matrix of A
    __shared__ float As[KERNEL2_BLOCK_SIZE][KERNEL2_BLOCK_SIZE];

    // Declaration of the shared memory array Bs used to
    // store the sub-matrix of B
    __shared__ float Bs[KERNEL2_BLOCK_SIZE][KERNEL2_BLOCK_SIZE];

    // Hold a subvector of the diagonal of D in shared memory.
    __shared__ float Ds[KERNEL2_BLOCK_SIZE][KERNEL2_BLOCK_SIZE];

    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (int a = aBegin, b = bBegin, d = 0;
             a <= aEnd;
             a += aStep, b += bStep, d += KERNEL2_BLOCK_SIZE) {

        // Load the matrices from device memory
        // to shared memory; each thread loads
        // one element of each matrix
        As[ty][tx] = A[a + wA * ty + tx];
        Bs[ty][tx] = B[b + wB * ty + tx];
        Ds[ty][tx] = D[d + ty];

        // Note: Should we multiply the rows of B by the values in the
        // appropriate block of D in a separate kernel? Another kernel call
        // vs. more multiplications...
        // Note: This implementation results in duplication of data (each row
        // is the same). Is this cheaper than branching using an if statement?
        // The reads from global memory are cached.
        // Note: There is thread read overlap reading from device memory, but
        // no overlap writing into shared memory. -> Could save some shared
        // memory at the cost of overlapping writes.

        // Synchronize to make sure the matrices are loaded
        __syncthreads();

        // Multiply the two matrices together;
        // each thread computes one element
        // of the block sub-matrix
        for (int k = 0; k < KERNEL2_BLOCK_SIZE; ++k) {
            //Csub += As[ty][k] * Bs[k][tx];  // Debugging: check if everything but D is working...
            //Csub = Ds[ty][tx];              // Debugging: check if D is getting filled in...
            Csub += As[ty][k] * Ds[k][tx] * Bs[k][tx];
        }
        // Note: Does this result in a bank conflict? Or is the same value
        // broadcast to the 16-thread half-warp?

        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        __syncthreads();
    }

    // Write the block sub-matrix to device memory;
    // each thread writes one element
    int c = wB * KERNEL2_BLOCK_SIZE * by + KERNEL2_BLOCK_SIZE * bx;
    C[c + wB * ty + tx] = Csub;
}
#endif
[/codebox]

Thanks!

I decided to work around this by using the runtime API instead of trying to load .cubin modules with the driver API.
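For reference, the runtime-API version of the launch looks roughly like this (a minimal sketch; d_A, d_B, d_C are assumed to be device pointers from cudaMalloc, and error checking is omitted):

[codebox]// Runtime API: the kernel is compiled into the executable, so no cubin
// loading is needed; launch with <<< >>> instead of
// cuModuleLoad()/cuModuleGetFunction().
dim3 threads(KERNEL1_BLOCK_SIZE, KERNEL1_BLOCK_SIZE);
dim3 grid(wB / KERNEL1_BLOCK_SIZE, hA / KERNEL1_BLOCK_SIZE);
MatrixMultiplierKernel1<<<grid, threads>>>(d_C, d_A, d_B, hA, wB, wA);
cudaThreadSynchronize();  // wait for the kernel to finish[/codebox]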

nvcc defaults to C++ now, so the names are probably getting mangled.

All kernels you want exported in a cubin must be prefixed with extern "C" __global__ for C++, and just __global__ for C. (basic C++)

Unless the compiler is spitting out any other errors/warnings, they are being exported properly.
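Applied to the prototypes above, that means something like this (the mangled name in the comment is just illustrative):

[codebox]// Compiled as C++, a kernel shows up in the cubin under a mangled name,
// e.g. something like _Z23MatrixMultiplierKernel1PfS_S_iii, so
// cuModuleGetFunction(&f, module, "MatrixMultiplierKernel1") fails to find it.
// Wrapping the declaration in extern "C" keeps the plain name.
extern "C" __global__ void MatrixMultiplierKernel1(float* C, float* A, float* B,
                                                   int hA, int wB, int wA);

extern "C" __global__ void MatrixMultiplierKernel2(float* C, float* A, float* D,
                                                   float* B, int wA, int wB);[/codebox]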

thanks Smokey!