cudaGetExportTable a total hack

Possibly, but it doesn’t look like it from what I can tell. It seems like the device code is already available via cudaRegisterFatBinary that is contained within libcufft.so.

Also, it looks like the hack that I posted before fails for CUFFT on 64-bit machines. Here is a bit more information in case other people want to take this further.

It looks like the export table ID is always the same for CUFFT and CUBLAS ( 0x11df21116e3393c6 0x9395d855f368c3a8 ), and that the table contains only three function pointers.

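Here is an assembly dump of the third function returned, including the call graph reachable from it in libcuda.so (note that the begin_function_1 listing stops at the je at 0x00007fffecea241c, so its fall-through path is not shown):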

exported_function_3_of_3:

=> 0x00007fffecf17190:	push   %rbp

   0x00007fffecf17191:	mov    %rdi,%rbp

   0x00007fffecf17194:	xor    %edi,%edi

   0x00007fffecf17196:	push   %rbx

   0x00007fffecf17197:	mov    %rdx,%rbx

   0x00007fffecf1719a:	sub    $0x8,%rsp

   0x00007fffecf1719e:	test   %rsi,%rsi

   0x00007fffecf171a1:	cmove  %rsp,%rdi

   0x00007fffecf171a5:	mov    %rsi,(%rsp)

   0x00007fffecf171a9:	callq  0x7fffecea23a0   # begin_function_1

   0x00007fffecf171ae:	test   %eax,%eax

   0x00007fffecf171b0:	mov    %eax,%edx

   0x00007fffecf171b2:	jne    0x7fffecf171dd

   0x00007fffecf171b4:	mov    (%rsp),%rcx

   0x00007fffecf171b8:	mov    0x88(%rcx),%rax

   0x00007fffecf171bf:	test   %rax,%rax

   0x00007fffecf171c2:	jne    0x7fffecf171cf

   0x00007fffecf171c4:	jmp    0x7fffecf171e6

   0x00007fffecf171c6:	mov    0x28(%rax),%rax

   0x00007fffecf171ca:	test   %rax,%rax

   0x00007fffecf171cd:	je     0x7fffecf171e6

   0x00007fffecf171cf:	cmp    %rbx,0x8(%rax)

   0x00007fffecf171d3:	jne    0x7fffecf171c6

   0x00007fffecf171d5:	mov    0x10(%rax),%rbx

   0x00007fffecf171d9:	mov    %rbx,0x0(%rbp)

   0x00007fffecf171dd:	add    $0x8,%rsp

   0x00007fffecf171e1:	mov    %edx,%eax

   0x00007fffecf171e3:	pop    %rbx

   0x00007fffecf171e4:	pop    %rbp

   0x00007fffecf171e5:	retq   

   0x00007fffecf171e6:	add    $0x8,%rsp

   0x00007fffecf171ea:	mov    $0x190,%edx

   0x00007fffecf171ef:	pop    %rbx

   0x00007fffecf171f0:	pop    %rbp

   0x00007fffecf171f1:	mov    %edx,%eax

   0x00007fffecf171f3:	retq  

begin_function_1:   

   0x00007fffecea23a0:	mov    %rbp,-0x10(%rsp)

   0x00007fffecea23a5:	mov    %r12,-0x8(%rsp)

   0x00007fffecea23aa:	mov    %rdi,%r12

   0x00007fffecea23ad:	mov    %rbx,-0x18(%rsp)

   0x00007fffecea23b2:	sub    $0x18,%rsp

   0x00007fffecea23b6:	mov    0x9706db(%rip),%rbp        # 0x7fffed812a98

   0x00007fffecea23bd:	mov    0x578(%rbp),%ebx

   0x00007fffecea23c3:	callq  0x7fffed4b39b0       # begin_function_2

   0x00007fffecea23c8:	cmp    %eax,%ebx

   0x00007fffecea23ca:	jne    0x7fffecea23e7

   0x00007fffecea23cc:	mov    0x1d0(%rbp),%edx

   0x00007fffecea23d2:	mov    $0x4,%ecx

   0x00007fffecea23d7:	cmp    $0x321cba00,%edx

   0x00007fffecea23dd:	je     0x7fffecea23ec

   0x00007fffecea23df:	cmp    $0xabc123,%edx

   0x00007fffecea23e5:	je     0x7fffecea2401

   0x00007fffecea23e7:	mov    $0x3,%ecx

   0x00007fffecea23ec:	mov    (%rsp),%rbx

   0x00007fffecea23f0:	mov    0x8(%rsp),%rbp

   0x00007fffecea23f5:	mov    %ecx,%eax

   0x00007fffecea23f7:	mov    0x10(%rsp),%r12

   0x00007fffecea23fc:	add    $0x18,%rsp

   0x00007fffecea2400:	retq   

   0x00007fffecea2401:	xor    %cl,%cl

   0x00007fffecea2403:	test   %r12,%r12

   0x00007fffecea2406:	je     0x7fffecea23ec

   0x00007fffecea2408:	mov    0x0(%rbp),%edi

   0x00007fffecea240b:	callq  0x7fffed4b3aa0     # begin_function_3

   0x00007fffecea2410:	test   %rax,%rax

   0x00007fffecea2413:	mov    %rax,(%r12)

   0x00007fffecea2417:	mov    $0xc9,%ecx

   0x00007fffecea241c:	je     0x7fffecea23ec

begin_function_2:

   0x00007fffed4b39b0:	jmpq   0x7fffece88064 <getpid@plt>

   0x00007fffed4b39b5:	data32 nopw %cs:0x0(%rax,%rax,1)

   0x00007fffed4b39c0:	jmpq   0x7fffece887e4 <pthread_mutex_unlock@plt>

   0x00007fffed4b39c5:	data32 nopw %cs:0x0(%rax,%rax,1)

   0x00007fffed4b39d0:	jmpq   0x7fffece886e4 <pthread_mutex_lock@plt>

   0x00007fffed4b39d5:	data32 nopw %cs:0x0(%rax,%rax,1)

   0x00007fffed4b39e0:	jmpq   0x7fffece88784 <pthread_mutex_destroy@plt>

   0x00007fffed4b39e5:	data32 nopw %cs:0x0(%rax,%rax,1)

   0x00007fffed4b39f0:	xor    %esi,%esi

   0x00007fffed4b39f2:	jmpq   0x7fffece88074 <pthread_mutex_init@plt>

   0x00007fffed4b39f7:	nopw   0x0(%rax,%rax,1)

   0x00007fffed4b3a00:	push   %r12

   0x00007fffed4b3a02:	mov    %edi,%eax

   0x00007fffed4b3a04:	push   %rbp

   0x00007fffed4b3a05:	push   %rbx

   0x00007fffed4b3a06:	mov    $0x10624dd3,%ebx

   0x00007fffed4b3a0b:	mul    %ebx

   0x00007fffed4b3a0d:	sub    $0x20,%rsp

   0x00007fffed4b3a11:	lea    0x10(%rsp),%rbp

   0x00007fffed4b3a16:	mov    %rsp,%rsi

   0x00007fffed4b3a19:	shr    $0x6,%edx

   0x00007fffed4b3a1c:	mov    %edx,%ecx

   0x00007fffed4b3a1e:	imul   $0x3e8,%edx,%edx

   0x00007fffed4b3a24:	mov    %rcx,0x10(%rsp)

   0x00007fffed4b3a29:	sub    %edx,%edi

   0x00007fffed4b3a2b:	imul   $0xf4240,%edi,%eax

   0x00007fffed4b3a31:	mov    %eax,%edi

   0x00007fffed4b3a33:	mov    %rdi,0x18(%rsp)

   0x00007fffed4b3a38:	mov    %rbp,%rdi

   0x00007fffed4b3a3b:	callq  0x7fffece88514 <nanosleep@plt>

   0x00007fffed4b3a40:	test   %eax,%eax

   0x00007fffed4b3a42:	je     0x7fffed4b3a79

   0x00007fffed4b3a44:	callq  0x7fffece88424 <__errno_location@plt>

   0x00007fffed4b3a49:	mov    %rax,%rbx

   0x00007fffed4b3a4c:	nopl   0x0(%rax)

   0x00007fffed4b3a50:	jmp    0x7fffed4b3a74

   0x00007fffed4b3a52:	mov    0x8(%rsp),%rsi

   0x00007fffed4b3a57:	mov    (%rsp),%r8

   0x00007fffed4b3a5b:	mov    %rbp,%rdi

   0x00007fffed4b3a5e:	mov    %rsi,0x18(%rsp)

   0x00007fffed4b3a63:	mov    %rsp,%rsi

   0x00007fffed4b3a66:	mov    %r8,0x10(%rsp)

   0x00007fffed4b3a6b:	callq  0x7fffece88514 <nanosleep@plt>

   0x00007fffed4b3a70:	test   %eax,%eax

   0x00007fffed4b3a72:	je     0x7fffed4b3a79

   0x00007fffed4b3a74:	cmpl   $0x4,(%rbx)

   0x00007fffed4b3a77:	je     0x7fffed4b3a52

   0x00007fffed4b3a79:	add    $0x20,%rsp

   0x00007fffed4b3a7d:	pop    %rbx

   0x00007fffed4b3a7e:	pop    %rbp

   0x00007fffed4b3a7f:	pop    %r12

   0x00007fffed4b3a81:	retq   

begin_function_3:

   0x00007fffed4b3aa0:	dec    %edi

   0x00007fffed4b3aa2:	jmpq   0x7fffece87db4 <pthread_getspecific@plt>

   0x00007fffed4b3aa7:	nopw   0x0(%rax,%rax,1)

   0x00007fffed4b3ab0:	jmpq   0x7fffece88194 <pthread_key_delete@plt>

   0x00007fffed4b3ab5:	data32 nopw %cs:0x0(%rax,%rax,1)

   0x00007fffed4b3ac0:	sub    $0x18,%rsp

   0x00007fffed4b3ac4:	mov    %rdi,%rsi

   0x00007fffed4b3ac7:	lea    0x14(%rsp),%rax

   0x00007fffed4b3acc:	mov    %rax,%rdi

   0x00007fffed4b3acf:	callq  0x7fffece88754 <pthread_key_create@plt>

   0x00007fffed4b3ad4:	mov    0x14(%rsp),%ecx

   0x00007fffed4b3ad8:	mov    %eax,%edx

   0x00007fffed4b3ada:	xor    %eax,%eax

   0x00007fffed4b3adc:	inc    %ecx

   0x00007fffed4b3ade:	test   %edx,%edx

   0x00007fffed4b3ae0:	cmove  %ecx,%eax

   0x00007fffed4b3ae3:	add    $0x18,%rsp

   0x00007fffed4b3ae7:	retq

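This assembly uses the GNU AT&T syntax. Since this is 64-bit Linux code, these functions follow the System V AMD64 calling convention rather than stdcall: the arguments arrive in %rdi, %rsi and %rdx, which makes life slightly easier. Each function returns a single result in %eax, similar to the cudaError_t style of most other API calls. The exported function returns 0 (probably something like cuda_success), 400 (0x190) when the list walk finds no matching entry (not sure what this means), or whatever non-zero value begin_function_1 returned, which is 3, 4 or 0xc9 in the code shown. At first glance begin_function_2 seems mutex related because of the nanosleep and mutex calls listed under it, but the function itself is only a tail jump to getpid; the pthread_mutex_* thunks and the nanosleep loop are adjacent routines that nothing in this listing calls, so they do not show that this path takes a lock. Similarly, begin_function_3 is just a decrement of %edi followed by a tail jump to pthread_getspecific.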

Anyone want to take a crack at reverse engineering this one?