Possibly, but it doesn’t look like it from what I can tell. It seems like the device code is already available via cudaRegisterFatBinary that is contained within libcufft.so.
Also, it looks like the hack that I posted before fails for CUFFT on 64-bit machines. Here is a bit more information in case other people want to try to take this further.
It looks like the export table ID is always the same for CUFFT and CUBLAS ( 0x11df21116e3393c6 0x9395d855f368c3a8 ), and that the table contains only three function pointers.
Here is a complete assembly dump of the third function returned including the entire reachable call graph in libcuda.so:
# exported_function_3_of_3 -- third entry of the export table (gdb listing,
# AT&T operand order: source first, destination second).
# Inputs, judged by which registers are read before they are written:
#   %rdi = address that receives the 8-byte result
#   %rsi = pointer to a record, or zero to let begin_function_1 supply one
#   %rdx = value to search for
# Result in %eax:
#   0      a matching entry was found and its value stored through %rdi
#   0x190  no entry matched
#   other  nonzero value returned by begin_function_1, passed through as-is
exported_function_3_of_3:
=> 0x00007fffecf17190: push %rbp
0x00007fffecf17191: mov %rdi,%rbp  # rbp = result address, kept across the call
0x00007fffecf17194: xor %edi,%edi  # default argument for begin_function_1: zero
0x00007fffecf17196: push %rbx
0x00007fffecf17197: mov %rdx,%rbx  # rbx = value to search for
0x00007fffecf1719a: sub $0x8,%rsp  # one 8-byte stack slot
0x00007fffecf1719e: test %rsi,%rsi
0x00007fffecf171a1: cmove %rsp,%rdi  # only when rsi is zero: pass the slot's address instead
0x00007fffecf171a5: mov %rsi,(%rsp)  # slot = caller's record pointer (possibly zero)
0x00007fffecf171a9: callq 0x7fffecea23a0 # begin_function_1
0x00007fffecf171ae: test %eax,%eax
0x00007fffecf171b0: mov %eax,%edx  # edx carries the status to the shared return path
0x00007fffecf171b2: jne 0x7fffecf171dd  # nonzero status: return it unchanged
0x00007fffecf171b4: mov (%rsp),%rcx  # reload the slot: caller's pointer, or whatever the callee stored there
0x00007fffecf171b8: mov 0x88(%rcx),%rax  # rax = first entry, read from offset 0x88 of the record
0x00007fffecf171bf: test %rax,%rax
0x00007fffecf171c2: jne 0x7fffecf171cf  # non-null: start comparing
0x00007fffecf171c4: jmp 0x7fffecf171e6  # no entries at all: not found
# Loop: step to the next entry through the pointer at offset 0x28.
0x00007fffecf171c6: mov 0x28(%rax),%rax
0x00007fffecf171ca: test %rax,%rax
0x00007fffecf171cd: je 0x7fffecf171e6  # end of chain: not found
0x00007fffecf171cf: cmp %rbx,0x8(%rax)  # does the field at offset 0x8 equal the search value?
0x00007fffecf171d3: jne 0x7fffecf171c6
0x00007fffecf171d5: mov 0x10(%rax),%rbx  # match: fetch the field at offset 0x10
0x00007fffecf171d9: mov %rbx,0x0(%rbp)  # store it through the result address
# Shared return path: eax = edx (0 after a match, or the callee's status).
0x00007fffecf171dd: add $0x8,%rsp
0x00007fffecf171e1: mov %edx,%eax
0x00007fffecf171e3: pop %rbx
0x00007fffecf171e4: pop %rbp
0x00007fffecf171e5: retq
# Not-found return path: eax = 0x190 (400 decimal).
0x00007fffecf171e6: add $0x8,%rsp
0x00007fffecf171ea: mov $0x190,%edx
0x00007fffecf171ef: pop %rbx
0x00007fffecf171f0: pop %rbp
0x00007fffecf171f1: mov %edx,%eax
0x00007fffecf171f3: retq
# begin_function_1 -- status check called at the top of exported_function_3_of_3.
# Input: %rdi = address of an 8-byte output slot, or zero for "no output".
# Result in %eax, as far as this listing shows:
#   3     the value at offset 0x578 of the global block differs from what
#         begin_function_2 returns, or the value at offset 0x1d0 is neither
#         magic constant
#   4     the value at offset 0x1d0 is 0x321cba00
#   0     the value at offset 0x1d0 is 0xabc123 and %rdi was zero
#   0xc9  begin_function_3 returned a null pointer
# NOTE(review): the listing stops after the last je, so the path taken when
# begin_function_3 returns a non-null pointer is not shown here.
begin_function_1:
# rbp, r12 and rbx are stored below %rsp first; the sub that follows turns
# that area into this function's frame.
0x00007fffecea23a0: mov %rbp,-0x10(%rsp)
0x00007fffecea23a5: mov %r12,-0x8(%rsp)
0x00007fffecea23aa: mov %rdi,%r12  # r12 = output slot address (may be zero)
0x00007fffecea23ad: mov %rbx,-0x18(%rsp)
0x00007fffecea23b2: sub $0x18,%rsp
0x00007fffecea23b6: mov 0x9706db(%rip),%rbp # 0x7fffed812a98
0x00007fffecea23bd: mov 0x578(%rbp),%ebx  # ebx = 32-bit value at offset 0x578 of the block rbp points to
0x00007fffecea23c3: callq 0x7fffed4b39b0 # begin_function_2
0x00007fffecea23c8: cmp %eax,%ebx  # the call target is a jump to getpid@plt, so eax is its result
0x00007fffecea23ca: jne 0x7fffecea23e7  # mismatch: return 3
0x00007fffecea23cc: mov 0x1d0(%rbp),%edx  # edx = 32-bit value at offset 0x1d0
0x00007fffecea23d2: mov $0x4,%ecx  # provisional return value 4
0x00007fffecea23d7: cmp $0x321cba00,%edx
0x00007fffecea23dd: je 0x7fffecea23ec  # first magic value: return 4
0x00007fffecea23df: cmp $0xabc123,%edx
0x00007fffecea23e5: je 0x7fffecea2401  # second magic value: continue below
0x00007fffecea23e7: mov $0x3,%ecx  # anything else: return 3
# Shared epilogue: restore rbx, rbp and r12, return ecx in eax.
0x00007fffecea23ec: mov (%rsp),%rbx
0x00007fffecea23f0: mov 0x8(%rsp),%rbp
0x00007fffecea23f5: mov %ecx,%eax
0x00007fffecea23f7: mov 0x10(%rsp),%r12
0x00007fffecea23fc: add $0x18,%rsp
0x00007fffecea2400: retq
0x00007fffecea2401: xor %cl,%cl  # ecx was 4, so clearing its low byte leaves 0
0x00007fffecea2403: test %r12,%r12
0x00007fffecea2406: je 0x7fffecea23ec  # no output slot requested: return 0
0x00007fffecea2408: mov 0x0(%rbp),%edi  # edi = 32-bit value at offset 0 of the block
0x00007fffecea240b: callq 0x7fffed4b3aa0 # begin_function_3
0x00007fffecea2410: test %rax,%rax
0x00007fffecea2413: mov %rax,(%r12)  # store the returned pointer in the output slot (mov leaves the flags alone)
0x00007fffecea2417: mov $0xc9,%ecx  # return value 0xc9 (201 decimal) if the pointer was null
0x00007fffecea241c: je 0x7fffecea23ec  # still testing the flags set by test %rax,%rax above
# begin_function_2 -- the address called by begin_function_1 (0x7fffed4b39b0)
# holds a single instruction: a jump to getpid@plt.
# What follows in this listing is a run of separate small routines that sit
# at the next 16-byte-aligned addresses, with nop padding between them. None
# of the call instructions in the other functions of this listing targets them.
begin_function_2:
0x00007fffed4b39b0: jmpq 0x7fffece88064 <getpid@plt>
0x00007fffed4b39b5: data32 nopw %cs:0x0(%rax,%rax,1)
# 0x...39c0: jump to pthread_mutex_unlock@plt, arguments untouched.
0x00007fffed4b39c0: jmpq 0x7fffece887e4 <pthread_mutex_unlock@plt>
0x00007fffed4b39c5: data32 nopw %cs:0x0(%rax,%rax,1)
# 0x...39d0: jump to pthread_mutex_lock@plt, arguments untouched.
0x00007fffed4b39d0: jmpq 0x7fffece886e4 <pthread_mutex_lock@plt>
0x00007fffed4b39d5: data32 nopw %cs:0x0(%rax,%rax,1)
# 0x...39e0: jump to pthread_mutex_destroy@plt, arguments untouched.
0x00007fffed4b39e0: jmpq 0x7fffece88784 <pthread_mutex_destroy@plt>
0x00007fffed4b39e5: data32 nopw %cs:0x0(%rax,%rax,1)
# 0x...39f0: jump to pthread_mutex_init@plt with %esi (second argument) zeroed.
0x00007fffed4b39f0: xor %esi,%esi
0x00007fffed4b39f2: jmpq 0x7fffece88074 <pthread_mutex_init@plt>
0x00007fffed4b39f7: nopw 0x0(%rax,%rax,1)
# 0x...3a00: sleep routine. %edi is split into a quotient and a remainder by
# 1000, which become the two 8-byte fields of the nanosleep request (the
# remainder scaled by 1,000,000). That is consistent with %edi being a time in
# milliseconds. The call is repeated while it fails with errno == 4.
0x00007fffed4b3a00: push %r12
0x00007fffed4b3a02: mov %edi,%eax
0x00007fffed4b3a04: push %rbp
0x00007fffed4b3a05: push %rbx
0x00007fffed4b3a06: mov $0x10624dd3,%ebx  # reciprocal constant for unsigned division by 1000
0x00007fffed4b3a0b: mul %ebx  # edx:eax = edi * constant
0x00007fffed4b3a0d: sub $0x20,%rsp  # room for two 16-byte structures
0x00007fffed4b3a11: lea 0x10(%rsp),%rbp  # rbp = request structure at rsp+0x10
0x00007fffed4b3a16: mov %rsp,%rsi  # second nanosleep argument: remaining-time structure at rsp
0x00007fffed4b3a19: shr $0x6,%edx  # edx = edi / 1000
0x00007fffed4b3a1c: mov %edx,%ecx
0x00007fffed4b3a1e: imul $0x3e8,%edx,%edx  # edx = quotient * 1000
0x00007fffed4b3a24: mov %rcx,0x10(%rsp)  # request, first field = quotient
0x00007fffed4b3a29: sub %edx,%edi  # edi = edi % 1000
0x00007fffed4b3a2b: imul $0xf4240,%edi,%eax  # eax = remainder * 1,000,000
0x00007fffed4b3a31: mov %eax,%edi
0x00007fffed4b3a33: mov %rdi,0x18(%rsp)  # request, second field
0x00007fffed4b3a38: mov %rbp,%rdi  # first nanosleep argument: request
0x00007fffed4b3a3b: callq 0x7fffece88514 <nanosleep@plt>
0x00007fffed4b3a40: test %eax,%eax
0x00007fffed4b3a42: je 0x7fffed4b3a79  # returned 0: done
0x00007fffed4b3a44: callq 0x7fffece88424 <__errno_location@plt>
0x00007fffed4b3a49: mov %rax,%rbx  # rbx = address of errno
0x00007fffed4b3a4c: nopl 0x0(%rax)
0x00007fffed4b3a50: jmp 0x7fffed4b3a74  # enter the retry loop at its condition
# Retry: copy the remaining-time structure over the request and sleep again.
0x00007fffed4b3a52: mov 0x8(%rsp),%rsi
0x00007fffed4b3a57: mov (%rsp),%r8
0x00007fffed4b3a5b: mov %rbp,%rdi
0x00007fffed4b3a5e: mov %rsi,0x18(%rsp)
0x00007fffed4b3a63: mov %rsp,%rsi
0x00007fffed4b3a66: mov %r8,0x10(%rsp)
0x00007fffed4b3a6b: callq 0x7fffece88514 <nanosleep@plt>
0x00007fffed4b3a70: test %eax,%eax
0x00007fffed4b3a72: je 0x7fffed4b3a79
0x00007fffed4b3a74: cmpl $0x4,(%rbx)  # errno == 4 (EINTR on Linux)?
0x00007fffed4b3a77: je 0x7fffed4b3a52  # yes: retry; any other errno falls out
0x00007fffed4b3a79: add $0x20,%rsp
0x00007fffed4b3a7d: pop %rbx
0x00007fffed4b3a7e: pop %rbp
0x00007fffed4b3a7f: pop %r12
0x00007fffed4b3a81: retq
# begin_function_3 -- the address called by begin_function_1 (0x7fffed4b3aa0):
# subtracts 1 from %edi and jumps to pthread_getspecific@plt, so the value
# returned to the caller is pthread_getspecific's.
# The listing then continues into two neighbouring routines that are not
# called from the other functions shown.
begin_function_3:
0x00007fffed4b3aa0: dec %edi  # the argument is a key plus one; see the create wrapper at 0x...3ac0
0x00007fffed4b3aa2: jmpq 0x7fffece87db4 <pthread_getspecific@plt>
0x00007fffed4b3aa7: nopw 0x0(%rax,%rax,1)
# 0x...3ab0: jump to pthread_key_delete@plt, argument untouched.
0x00007fffed4b3ab0: jmpq 0x7fffece88194 <pthread_key_delete@plt>
0x00007fffed4b3ab5: data32 nopw %cs:0x0(%rax,%rax,1)
# 0x...3ac0: wrapper around pthread_key_create. Its own %rdi is forwarded as
# the second argument. It returns the new key plus one in %eax, or 0 when
# pthread_key_create returns nonzero.
0x00007fffed4b3ac0: sub $0x18,%rsp
0x00007fffed4b3ac4: mov %rdi,%rsi  # second argument = this wrapper's first argument
0x00007fffed4b3ac7: lea 0x14(%rsp),%rax
0x00007fffed4b3acc: mov %rax,%rdi  # first argument = address of a 4-byte stack local for the key
0x00007fffed4b3acf: callq 0x7fffece88754 <pthread_key_create@plt>
0x00007fffed4b3ad4: mov 0x14(%rsp),%ecx  # ecx = key written by the call
0x00007fffed4b3ad8: mov %eax,%edx  # edx = pthread_key_create's return code
0x00007fffed4b3ada: xor %eax,%eax  # default result 0
0x00007fffed4b3adc: inc %ecx  # key + 1
0x00007fffed4b3ade: test %edx,%edx
0x00007fffed4b3ae0: cmove %ecx,%eax  # return code 0: result = key + 1
0x00007fffed4b3ae3: add $0x18,%rsp
0x00007fffed4b3ae7: retq
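This assembly is in AT&T syntax (the GNU assembler default, as printed by gdb). Because this is 64-bit Linux code, the functions follow the System V AMD64 calling convention rather than stdcall: integer arguments arrive in %rdi, %rsi, %rdx, and so on, which makes life slightly easier. Each function returns a single result in %eax, similar to the cudaError_t style of most other API calls. The exported function returns either 0 (probably something like cuda_success) or 400 (0x190 in the listing; not sure what this means), or it passes through a nonzero status from begin_function_1. The listing under begin_function_2 seems mutex related, possibly a spin lock or condition variable due to the nanosleep and mutex related calls, although the address that is actually called (0x7fffed4b39b0) is only a jump to getpid@plt.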
Anyone want to take a crack at reverse engineering this one?