cudaGetExportTable a total hack

Gregory_Diamos · December 29, 2010, 10:27pm

Possibly, but it doesn’t look like it from what I can tell. It seems like the device code is already available via cudaRegisterFatBinary that is contained within libcufft.so.

Also, it looks like the hack that I posted before fails for CUFFT on 64-bit machines. Here is a bit more information in case other people want to try to take this further.

It looks like the export table ID is always the same for CUFFT and CUBLAS ( 0x11df21116e3393c6 0x9395d855f368c3a8 ), and that the table contains only three function pointers.

Here is a complete assembly dump of the third function in the returned table, including its entire reachable call graph within libcuda.so:

exported_function_3_of_3:

=> 0x00007fffecf17190:	push   %rbp

   0x00007fffecf17191:	mov    %rdi,%rbp

   0x00007fffecf17194:	xor    %edi,%edi

   0x00007fffecf17196:	push   %rbx

   0x00007fffecf17197:	mov    %rdx,%rbx

   0x00007fffecf1719a:	sub    $0x8,%rsp

   0x00007fffecf1719e:	test   %rsi,%rsi

   0x00007fffecf171a1:	cmove  %rsp,%rdi

   0x00007fffecf171a5:	mov    %rsi,(%rsp)

   0x00007fffecf171a9:	callq  0x7fffecea23a0   # begin_function_1

   0x00007fffecf171ae:	test   %eax,%eax

   0x00007fffecf171b0:	mov    %eax,%edx

   0x00007fffecf171b2:	jne    0x7fffecf171dd

   0x00007fffecf171b4:	mov    (%rsp),%rcx

   0x00007fffecf171b8:	mov    0x88(%rcx),%rax

   0x00007fffecf171bf:	test   %rax,%rax

   0x00007fffecf171c2:	jne    0x7fffecf171cf

   0x00007fffecf171c4:	jmp    0x7fffecf171e6

   0x00007fffecf171c6:	mov    0x28(%rax),%rax

   0x00007fffecf171ca:	test   %rax,%rax

   0x00007fffecf171cd:	je     0x7fffecf171e6

   0x00007fffecf171cf:	cmp    %rbx,0x8(%rax)

   0x00007fffecf171d3:	jne    0x7fffecf171c6

   0x00007fffecf171d5:	mov    0x10(%rax),%rbx

   0x00007fffecf171d9:	mov    %rbx,0x0(%rbp)

   0x00007fffecf171dd:	add    $0x8,%rsp

   0x00007fffecf171e1:	mov    %edx,%eax

   0x00007fffecf171e3:	pop    %rbx

   0x00007fffecf171e4:	pop    %rbp

   0x00007fffecf171e5:	retq   

   0x00007fffecf171e6:	add    $0x8,%rsp

   0x00007fffecf171ea:	mov    $0x190,%edx

   0x00007fffecf171ef:	pop    %rbx

   0x00007fffecf171f0:	pop    %rbp

   0x00007fffecf171f1:	mov    %edx,%eax

   0x00007fffecf171f3:	retq  

begin_function_1:   

   0x00007fffecea23a0:	mov    %rbp,-0x10(%rsp)

   0x00007fffecea23a5:	mov    %r12,-0x8(%rsp)

   0x00007fffecea23aa:	mov    %rdi,%r12

   0x00007fffecea23ad:	mov    %rbx,-0x18(%rsp)

   0x00007fffecea23b2:	sub    $0x18,%rsp

   0x00007fffecea23b6:	mov    0x9706db(%rip),%rbp        # 0x7fffed812a98

   0x00007fffecea23bd:	mov    0x578(%rbp),%ebx

   0x00007fffecea23c3:	callq  0x7fffed4b39b0       # begin_function_2

   0x00007fffecea23c8:	cmp    %eax,%ebx

   0x00007fffecea23ca:	jne    0x7fffecea23e7

   0x00007fffecea23cc:	mov    0x1d0(%rbp),%edx

   0x00007fffecea23d2:	mov    $0x4,%ecx

   0x00007fffecea23d7:	cmp    $0x321cba00,%edx

   0x00007fffecea23dd:	je     0x7fffecea23ec

   0x00007fffecea23df:	cmp    $0xabc123,%edx

   0x00007fffecea23e5:	je     0x7fffecea2401

   0x00007fffecea23e7:	mov    $0x3,%ecx

   0x00007fffecea23ec:	mov    (%rsp),%rbx

   0x00007fffecea23f0:	mov    0x8(%rsp),%rbp

   0x00007fffecea23f5:	mov    %ecx,%eax

   0x00007fffecea23f7:	mov    0x10(%rsp),%r12

   0x00007fffecea23fc:	add    $0x18,%rsp

   0x00007fffecea2400:	retq   

   0x00007fffecea2401:	xor    %cl,%cl

   0x00007fffecea2403:	test   %r12,%r12

   0x00007fffecea2406:	je     0x7fffecea23ec

   0x00007fffecea2408:	mov    0x0(%rbp),%edi

   0x00007fffecea240b:	callq  0x7fffed4b3aa0     # begin_function_3

   0x00007fffecea2410:	test   %rax,%rax

   0x00007fffecea2413:	mov    %rax,(%r12)

   0x00007fffecea2417:	mov    $0xc9,%ecx

   0x00007fffecea241c:	je     0x7fffecea23ec

begin_function_2:

   0x00007fffed4b39b0:	jmpq   0x7fffece88064 <getpid@plt>

   0x00007fffed4b39b5:	data32 nopw %cs:0x0(%rax,%rax,1)

   0x00007fffed4b39c0:	jmpq   0x7fffece887e4 <pthread_mutex_unlock@plt>

   0x00007fffed4b39c5:	data32 nopw %cs:0x0(%rax,%rax,1)

   0x00007fffed4b39d0:	jmpq   0x7fffece886e4 <pthread_mutex_lock@plt>

   0x00007fffed4b39d5:	data32 nopw %cs:0x0(%rax,%rax,1)

   0x00007fffed4b39e0:	jmpq   0x7fffece88784 <pthread_mutex_destroy@plt>

   0x00007fffed4b39e5:	data32 nopw %cs:0x0(%rax,%rax,1)

   0x00007fffed4b39f0:	xor    %esi,%esi

   0x00007fffed4b39f2:	jmpq   0x7fffece88074 <pthread_mutex_init@plt>

   0x00007fffed4b39f7:	nopw   0x0(%rax,%rax,1)

   0x00007fffed4b3a00:	push   %r12

   0x00007fffed4b3a02:	mov    %edi,%eax

   0x00007fffed4b3a04:	push   %rbp

   0x00007fffed4b3a05:	push   %rbx

   0x00007fffed4b3a06:	mov    $0x10624dd3,%ebx

   0x00007fffed4b3a0b:	mul    %ebx

   0x00007fffed4b3a0d:	sub    $0x20,%rsp

   0x00007fffed4b3a11:	lea    0x10(%rsp),%rbp

   0x00007fffed4b3a16:	mov    %rsp,%rsi

   0x00007fffed4b3a19:	shr    $0x6,%edx

   0x00007fffed4b3a1c:	mov    %edx,%ecx

   0x00007fffed4b3a1e:	imul   $0x3e8,%edx,%edx

   0x00007fffed4b3a24:	mov    %rcx,0x10(%rsp)

   0x00007fffed4b3a29:	sub    %edx,%edi

   0x00007fffed4b3a2b:	imul   $0xf4240,%edi,%eax

   0x00007fffed4b3a31:	mov    %eax,%edi

   0x00007fffed4b3a33:	mov    %rdi,0x18(%rsp)

   0x00007fffed4b3a38:	mov    %rbp,%rdi

   0x00007fffed4b3a3b:	callq  0x7fffece88514 <nanosleep@plt>

   0x00007fffed4b3a40:	test   %eax,%eax

   0x00007fffed4b3a42:	je     0x7fffed4b3a79

   0x00007fffed4b3a44:	callq  0x7fffece88424 <__errno_location@plt>

   0x00007fffed4b3a49:	mov    %rax,%rbx

   0x00007fffed4b3a4c:	nopl   0x0(%rax)

   0x00007fffed4b3a50:	jmp    0x7fffed4b3a74

   0x00007fffed4b3a52:	mov    0x8(%rsp),%rsi

   0x00007fffed4b3a57:	mov    (%rsp),%r8

   0x00007fffed4b3a5b:	mov    %rbp,%rdi

   0x00007fffed4b3a5e:	mov    %rsi,0x18(%rsp)

   0x00007fffed4b3a63:	mov    %rsp,%rsi

   0x00007fffed4b3a66:	mov    %r8,0x10(%rsp)

   0x00007fffed4b3a6b:	callq  0x7fffece88514 <nanosleep@plt>

   0x00007fffed4b3a70:	test   %eax,%eax

   0x00007fffed4b3a72:	je     0x7fffed4b3a79

   0x00007fffed4b3a74:	cmpl   $0x4,(%rbx)

   0x00007fffed4b3a77:	je     0x7fffed4b3a52

   0x00007fffed4b3a79:	add    $0x20,%rsp

   0x00007fffed4b3a7d:	pop    %rbx

   0x00007fffed4b3a7e:	pop    %rbp

   0x00007fffed4b3a7f:	pop    %r12

   0x00007fffed4b3a81:	retq   

begin_function_3:

   0x00007fffed4b3aa0:	dec    %edi

   0x00007fffed4b3aa2:	jmpq   0x7fffece87db4 <pthread_getspecific@plt>

   0x00007fffed4b3aa7:	nopw   0x0(%rax,%rax,1)

   0x00007fffed4b3ab0:	jmpq   0x7fffece88194 <pthread_key_delete@plt>

   0x00007fffed4b3ab5:	data32 nopw %cs:0x0(%rax,%rax,1)

   0x00007fffed4b3ac0:	sub    $0x18,%rsp

   0x00007fffed4b3ac4:	mov    %rdi,%rsi

   0x00007fffed4b3ac7:	lea    0x14(%rsp),%rax

   0x00007fffed4b3acc:	mov    %rax,%rdi

   0x00007fffed4b3acf:	callq  0x7fffece88754 <pthread_key_create@plt>

   0x00007fffed4b3ad4:	mov    0x14(%rsp),%ecx

   0x00007fffed4b3ad8:	mov    %eax,%edx

   0x00007fffed4b3ada:	xor    %eax,%eax

   0x00007fffed4b3adc:	inc    %ecx

   0x00007fffed4b3ade:	test   %edx,%edx

   0x00007fffed4b3ae0:	cmove  %ecx,%eax

   0x00007fffed4b3ae3:	add    $0x18,%rsp

   0x00007fffed4b3ae7:	retq

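This assembly uses the GNU AT&T syntax (source operand first, destination last). It looks like all of these functions obey the standard System V AMD64 calling convention (the arguments arrive in %rdi, %rsi and %rdx), which makes life slightly easier. So there is a single result of each function that is returned in %eax, similar to the cudaError_t style of most other API calls. The returned values of this function are 0 (probably something like cuda_success), 400 (0x190; not sure what this means, although it is returned when the linked-list walk finds no matching entry, and it possibly corresponds to CUDA_ERROR_INVALID_HANDLE, which has the same value in the driver API), or any non-zero code passed through unchanged from begin_function_1 (3, 4 or 0xc9 = 201 in the dump above). Note that the listings for begin_function_2 and begin_function_3 run past the end of the code that is actually called: begin_function_2 itself is just a tail call to getpid (begin_function_1 compares its result against a value stored at offset 0x578 of a global structure), and begin_function_3 is just a tail call to pthread_getspecific with the key decremented by one. The code that follows them in the dump is mutex related (thin wrappers around pthread_mutex_lock/unlock/init/destroy) together with a nanosleep-based sleep routine that retries while errno is 4, but none of it is called or jumped to from the code shown here, so it is probably not a spin lock or condition variable used by this function.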

Anyone want to take a crack at reverse engineering this one?

Topic		Replies	Views
cuGetExportTable explanation CUDA Programming and Performance	1	1306	July 7, 2023
cudaGetExportTable/cuGetExportTable behavior CUDA Programming and Performance	1	13975	October 30, 2010
Simple Example to create DLL with NVCC CUDA Programming and Performance	38	51762	April 18, 2012
CUDA Toolkit 3.0 beta released now with public downloads CUDA Programming and Performance	104	430272	March 25, 2010
CUBLAS/CUFFT with Driver API CUDA Programming and Performance	8	5263	May 23, 2010
cuda obj files in a dll trying to create a dll using .cu files CUDA Programming and Performance	15	49516	July 1, 2019
How to hook CUDA runtime API in CUDA 11.4 CUDA Programming and Performance cuda	15	5380	July 12, 2023
Get device side function pointer of cuda kernel (without the signature) using just the kernel's symbol name as a string may be CUDA Programming and Performance	5	102	September 27, 2024
Runtime API to Driver API : translation pbs ? CUDA Programming and Performance	11	6954	November 6, 2008
Possible bug on beta 3.0 when using cufft and driver api CUDA Programming and Performance	4	2490	February 4, 2010

cudaGetExportTable a total hack

Related topics