Alright so I went back and checked to make sure that we hadn’t missed a symbol table.
A fat binary contains the following fields:
typedef struct __cudaFatCudaBinaryRec {
unsigned long magic;
unsigned long version;
unsigned long gpuInfoVersion;
char* key;
char* ident;
char* usageMode;
__cudaFatPtxEntry *ptx;
__cudaFatCubinEntry *cubin;
__cudaFatDebugEntry *debug;
void* debugInfo;
unsigned int flags;
__cudaFatSymbol *exported;
__cudaFatSymbol *imported;
struct __cudaFatCudaBinaryRec *dependends;
unsigned int characteristic;
__cudaFatElfEntry *elf;
} __cudaFatCudaBinary;
binary->ptx is the actual PTX source. It looks like binary->exported and binary->imported would be symbol tables. However, they are always NULL pointers at runtime.
The ELF section is a little bit more informative. I wrote a simple program to extract all of the elf sections from a cuda binary as it executes. For this simple module:
#include <stdio.h>
extern "C" __global__ void sequence(int *A, int N) {
int i = threadIdx.x + blockIdx.x * blockDim.x;
if (i < N) {
A[i] = 2*i;
}
}
extern "C" __global__ void testShr(int *A, const int *B) {
int i = threadIdx.x + blockIdx.x * blockDim.x;
int b;
__shared__ int storage[256];
storage[threadIdx.x] = B[i];
__syncthreads();
if (i & 1) {
b = storage[threadIdx.x ^ 1] * 2;
}
else {
b = storage[threadIdx.x ^ 1] * 3;
}
A[i] = b;
}
int main(int argc, char *arg[]) {
const int N = 1024;
int *A_host, *A_gpu =0;
int errors = 0;
size_t bytes = sizeof(int)*N;
if (cudaMalloc((void **)&A_gpu, bytes) != cudaSuccess) {
printf("cudaMalloc() - failed to allocate %d bytes on device\n", (int)bytes);
return -1;
}
A_host = (int *)malloc(bytes);
for (int i = 0; i < N; i++) {
A_host[i] = -1;
}
cudaMemcpy(A_gpu, A_host, bytes, cudaMemcpyHostToDevice);
dim3 grid((N+31)/32,1);
dim3 block(32, 1);
sequence<<< grid, block >>>(A_gpu, N);
cudaMemcpy(A_host, A_gpu, bytes, cudaMemcpyDeviceToHost);
for (int i = 0; i < N; i++) {
if (A_host[i] != 2*i) {
++errors;
}
}
int *B_gpu = 0;
if (cudaMalloc((void **)&B_gpu, bytes) != cudaSuccess) {
printf("cudaMalloc() - failed to allocate %d bytes on device\n", (int)bytes);
cudaFree(A_gpu);
free(A_host);
return -1;
}
sequence<<< grid, block >>>(A_gpu, N);
testShr<<< grid, block >>>(B_gpu, A_gpu);
if (cudaMemcpy(A_host, B_gpu, bytes, cudaMemcpyDeviceToHost) != cudaSuccess) {
printf("cudaMemcpy(A, B) - failed to copy %d bytes from device to host\n", (int)bytes);
cudaFree(A_gpu);
cudaFree(B_gpu);
free(A_host);
}
for (int i = 0; (errors < 5) && i < N; ++i) {
int b;
if (i & 1) {
b = (i ^ 1) * 2 * 2;
}
else {
b = (i ^ 1) * 2 * 3;
}
int got = A_host[i];
if (b != got) {
printf("ERROR 1 [%d] - expected: %d, got: %d\n", i, b, got);
++errors;
}
}
cudaFree(B_gpu);
cudaFree(A_gpu);
free(A_host);
if (errors) {
printf("Pass/Fail : Fail\n");
}
else {
printf("Pass/Fail : Pass\n");
}
return 0;
}
I get two ELF binaries, the second of which is blank. When I look at the symbol tables using objdump I get the following:
SYMBOL TABLE:
00000000 l d *ABS* 00000000 .shstrtab
00000000 l d *ABS* 00000000 .strtab
00000000 l d *ABS* 00000000 .symtab
00000000 l d *UND* 00000000
00000000 l d *UND* 00000000
00000000 l d .text.testShr 000000a0 .text.testShr
00000000 l d .nv.info.testShr 00000000 .nv.info.testShr
00000000 l d .nv.info 00000000 .nv.info
00000000 l d .text.sequence 00000050 .text.sequence
00000000 l d .nv.info.sequence 00000000 .nv.info.sequence
00000000 l d .nv.shared.testShr 00000000 .nv.shared.testShr
00000000 l d .nv.constant16.testShr 00000000 .nv.constant16.testShr
00000000 l d .nv.constant0.testShr 00000000 .nv.constant0.testShr
00000000 l d .nv.constant0.sequence 00000000 .nv.constant0.sequence
00000000 g F .text.testShr 000000a0 0x10 testShr
00000000 g F .text.sequence 00000050 0x10 sequence
The two kernel names (testShr and sequence) are listed prominently as global symbols. So this could definitely be a way to get this information. This may appear in a future release of ocelot… Thanks avidday for the suggestion.