I include a simple program (does vector addition). It is written in Python, but the API calls are similar to the original CUDA API.
#!/bin/env python
# coding:utf-8: © Arno Pähler, 2007-09
from ctypes import *
from time import time
from cuda_defs import *
from cuda_api import *
from cuda_utils import *
from gpuFunctions import gpuVADD
BLOCK_SIZE = 320
GRID_SIZE = 1024
# demo zero-copy of CUDA 2.2
def hostAlloc(n,dtype=t_si32):
    """Allocate a mapped host buffer of n elements and get its device pointer.

    n     -- number of elements to allocate.
    dtype -- element type; it must provide ``dtype().itemsize`` and be a key
             of ``numpy_to_ctypes`` (presumably a NumPy scalar type -- confirm
             against cuda_utils).

    Returns a tuple ``(r, d)``:
    r -- host-side view of the buffer as produced by ``nc_a`` (presumably a
         NumPy array wrapping the ctypes array; callers use ``.fill()`` and
         indexing on it).
    d -- the address returned by cudaHostGetDevicePointer, as a plain integer.

    Raises MemoryError if cudaHostAlloc hands back a null pointer.
    """
    # Only the "mapped" flag is active; cudaHostAllocPortable and
    # cudaHostAllocWriteCombined were left disabled by the author.
    flags1 = cudaHostAllocMapped
    flags2 = 0
    p = p_void()
    size = n*dtype().itemsize
    c_type = numpy_to_ctypes[dtype]
    cudaHostAlloc(byref(p),size,flags1)
    getLastError()  # NOTE(review): presumably reports/raises on a CUDA error -- confirm
    if not p.value:
        # from_address(None) would otherwise fail with an unrelated TypeError.
        raise MemoryError('cudaHostAlloc returned a null pointer for %d bytes' % size)
    # Overlay a ctypes array on the allocated memory (no copy is made).
    r = nc_a((c_type*n).from_address(p.value))
    d = p_void()
    cudaHostGetDevicePointer(byref(d),p,flags2)
    getLastError()
    return r,d.value
def main(vlength = 128,loops = 1):
    """Run the zero-copy vector-add benchmark for one vector length.

    vlength -- number of elements in each vector.
    loops   -- number of times gpuVADD is launched.

    Two buffers are obtained from hostAlloc, X is filled with 1 and Y with
    ``loops``; gpuVADD(d_X, d_Y, n) is then launched ``loops`` times.

    Prints one line: X[0] and Y[0] before the run, vlength/2**20, elapsed
    seconds, 1e-9*vlength*loops/elapsed, and X[0] and Y[0] after the run.
    Returns None.
    """
    n2 = vlength ## Vector length

    # hostAlloc supplies both the host view (h_*) and the device pointer (d_*)
    # of each buffer, so no separate host arrays are created here.
    h_X,d_X = hostAlloc(n2,t_fp32)
    h_Y,d_Y = hostAlloc(n2,t_fp32)
    h_X.fill(1)
    h_Y.fill(loops)
    # Format the initial values now; they are emitted together with the
    # results below so that the whole row goes out in a single print.
    before = '%6.0f%6.0f' % (h_X[0],h_Y[0])

    blockDim = dim3(BLOCK_SIZE,1,1)
    gridDim = dim3(GRID_SIZE,1,1)

    t0 = time()
    cudaThreadSynchronize()
    for i in range(loops):
        cudaConfigureCall(gridDim,blockDim,0,0)
        ## d_Y = d_Y + d_X
        ## note, that neither d_Y nor d_X
        ## have ever been set directly
        ## addition takes place on the GPU
        ## with data residing in main memory
        gpuVADD(d_X,d_Y,n2)
    # Wait for all launches to finish before stopping the timer.
    cudaThreadSynchronize()
    t0 = time()-t0

    flops = (1.e-9*n2)*float(loops)
    # A coarse timer can report 0 elapsed seconds for tiny runs.
    if t0 > 0:
        rate = flops/t0
    else:
        rate = float('inf')

    ## h_Y (aka d_Y) has been altered
    ## without device-to-host copy
    v2MB = float(vlength)/float(1<<20)
    print('%s %10.3f%10.3f%8.3f%6.0f%6.0f' % (before,v2MB,t0,rate,h_X[0],h_Y[0]))

    freeHost(h_X)
    freeHost(h_Y)
if __name__ == '__main__':
    # Usage: script [log2_length [loops]]
    #   no arguments  -- sweep vector lengths 2**18 .. 2**26
    #   log2_length   -- run a single vector length of 2**log2_length
    #   loops         -- explicit number of kernel launches per run
    import sys

    cudaSetDevice(0)
    cudaSetDeviceFlags(cudaDeviceMapHost)

    xmax = 26
    LOOP = 2048
    lmin,lmax = 18,xmax
    if len(sys.argv) > 1:
        lmin = lmax = int(sys.argv[1])
    loopx = -1
    if len(sys.argv) > 2:
        loopx = int(sys.argv[2])
    # Clamp the exponent range to 0..xmax with lmin <= lmax.
    lmax = min(max(0,lmax),xmax)
    lmin = min(max(0,lmin),lmax)
    if lmin == lmax and loopx <= 0:
        # Single size without an explicit loop count: use the count the
        # default sweep would use for this size. Exponents below 18 get
        # LOOP itself (a negative shift count is an error).
        loopx = LOOP >> max(lmin-18,0)
    for l in range(lmin,lmax+1):
        # Halve the launch count each time the vector length doubles.
        loops = max(LOOP >> (l-lmin),1)
        vlength = 1 << l
        if loopx > 0:
            loops = loopx
        # Row prefix; main() prints the rest of the line and the newline.
        sys.stdout.write('%5d %5d ' % (l,loops))
        main(vlength,loops)
    cudaThreadExit()