python-cuda bindings for CUDPP 1.1 ctypes Python bindings for CUDPP 1.1

As I have not updated my python-cuda bindings (I am working on more complete support for Cuda 2.3 beta and have now tested them both under 32- and 64-bit Linux - Jon Wright jonathan dot wrigth at gmail dot com also mailed me a brief patch getting this to work under Windows (? I have not tested it ? - e-mail him, if you want to know more), I attach four small Python script to create Python bindings for CUDPP - one of them a test script doing radix sorts ( my main interest in CUDPP). I wrote the compileCUDPP script, since the Makefile that came with CUDPP 1.1, in my case generated an archive rather than a dynamic lib. Tested this on OpenSuSE 11.1 64-bit with Python 2.6.2. I attached them inline - since I seem unable to attach files via the UPLOAD button. Sorry for the resulting lengthy post - aa_types.py is just a collection of aliases for ctypes and numpy types. The more interesting files would be compileCUDPP and cudpp.py. I’d be happy to post the sources of this and my recent python-cuda bindings on my ftp/htpp server - unadorned (no rpm or tgz, just the sources in a directory) if there is acute interest.

aa_types.py

coding:utf-8

############################################################
###############

Last modified: 2009.07.02

Created : 2008.??.??

Author : Arno Pähler

Version : 1.0

References : none

############################################################
###############

############################################################
###############

Copyright © 2008-2009 by Global Phasing Limited

All rights reserved.

This software is proprietary to and embodies the confidential

technology of Global Phasing Limited (GPhL). Possession,use,

duplication or dissemination of the software is authorised

only pursuant to a valid written licence from GPhL.

Contact: ??tobedetermined??@GlobalPhasing.com

############################################################
###############

import ctypes as ct
import numpy as np
import numpy.linalg as nl
import numpy.ctypeslib as nc

##############################

GLOBAL contants

##############################

pi = np.pi
degree2radian = d2r = pi/180.
radian2degree = r2d = 180./pi

##############################

generic ctypes.style types

##############################

c_si08 = ct.c_byte
c_si16 = ct.c_short
c_si32 = ct.c_int
c_si64 = ct.c_longlong

c_ui08 = ct.c_ubyte
c_ui16 = ct.c_ushort
c_ui32 = ct.c_uint
c_ui64 = ct.c_ulonglong

c_fp32 = ct.c_float
c_fp64 = ct.c_double

c_size = ct.c_size_t
p_char = ct.c_char_p
p_void = ct.c_void_p

p_si08 = ct.POINTER(c_si08)
p_si16 = ct.POINTER(c_si16)
p_si32 = ct.POINTER(c_si32)
p_si64 = ct.POINTER(c_si64)

p_ui08 = ct.POINTER(c_ui08)
p_ui16 = ct.POINTER(c_ui16)
p_ui32 = ct.POINTER(c_ui32)
p_ui64 = ct.POINTER(c_ui64)

p_fp32 = ct.POINTER(c_fp32)
p_fp64 = ct.POINTER(c_fp64)

size_t = ct.c_long

DLL = ct.CDLL

##############################

numpy.ctypeslibs types

##############################

t_si08 = np.int8
t_si16 = np.int16
t_si32 = np.int32
t_si64 = np.int64

t_ui08 = np.uint8
t_ui16 = np.uint16
t_ui32 = np.uint32
t_ui64 = np.uint64

t_fp32 = np.float32
t_fp64 = np.float64

t_cx32 = np.complex64 # each component is t_fp32
t_cx64 = np.complex128 # each component is t_fp64

n_char = nc.ndpointer()
n_void = nc.ndpointer()

n_ui08 = nc.ndpointer(t_ui08)
n_ui16 = nc.ndpointer(t_ui16)
n_ui32 = nc.ndpointer(t_ui32)
n_ui64 = nc.ndpointer(t_ui64)

n_si08 = nc.ndpointer(t_si08)
n_si16 = nc.ndpointer(t_si16)
n_si32 = nc.ndpointer(t_si32)
n_si64 = nc.ndpointer(t_si64)

n_fp32 = nc.ndpointer(t_fp32)
n_fp64 = nc.ndpointer(t_fp64)

n_cx32 = nc.ndpointer(t_cx32)
n_cx64 = nc.ndpointer(t_cx64)

nc_a = nc.as_array
nc_c = nc.as_ctypes

numpy_to_ctypes = {
t_si08 : c_si08,
t_si16 : c_si16,
t_si32 : c_si32,
t_si64 : c_si64,

t_ui08 : c_ui08,
t_ui16 : c_ui16,
t_ui32 : c_ui32,
t_ui64 : c_ui64,

t_fp32 : c_fp32,
t_fp64 : c_fp64,

}

def spo(n):
np.set_printoptions(precision=n,suppress=True)

def address(array):
return array.array_interface[‘data’][0]

cudpp.py

coding:utf-8

############################################################
###############

Last modified: 2009.07.20

Created : 2009.07.20

Author : Arno Pähler

Version : 1.0

References : none

############################################################
###############

#####################################################

Author: Arno Pähler, Berlin,DE

Sasayama,JP

© 2007 - 2009

#####################################################

for details see cudpp.h and cuddp documentation

from aa_types import *

CUDPP: cudpp.h

#enum CUDPPResult
#{
#CUDPP_SUCCESS = 0, /< No error. */
#CUDPP_ERROR_INVALID_HANDLE, /
< Specified handle (for example,
#to a plan) is invalid. /
#CUDPP_ERROR_ILLEGAL_CONFIGURATION, /
< Specified configuration is
#illegal. For example, an
#invalid or illogical
#combination of options. */
#CUDPP_ERROR_UNKNOWN = 9999 /**< Unknown or untraceable error. */
#};
CUDPPResult = c_si32

(
CUDPP_SUCCESS,
CUDPP_ERROR_INVALID_HANDLE,
CUDPP_ERROR_ILLEGAL_CONFIGURATION,
CUDPP_ERROR_UNKNOWN
) = range(3)+[9999]

#enum CUDPPOption
#{
#CUDPP_OPTION_FORWARD = 0x1, /< Algorithms operate forward:
#* from start to end of input
#* array */
#CUDPP_OPTION_BACKWARD = 0x2, /
< Algorithms operate backward:
#* from end to start of array /
#CUDPP_OPTION_EXCLUSIVE = 0x4, /**< Exclusive (for scans) - scan
#
includes all elements up to (but
#* not including) the current
#* element /
#CUDPP_OPTION_INCLUSIVE = 0x8, /**< Inclusive (for scans) - scan
#
includes all elements up to and
#* including the current element /
#CUDPP_OPTION_CTA_LOCAL = 0x10, /**< Algorithm performed only on
#
the CTAs (blocks) with no
#* communication between blocks.
#* @todo Currently ignored. /
#CUDPP_OPTION_KEYS_ONLY = 0x20, /**< No associated value to a key
#
(for global radix sort) */
#CUDPP_OPTION_KEY_VALUE_PAIRS = 0x40, /**< Each key has an associated value */
#};
CUDPPOption = c_si32

CUDPP_OPTION_FORWARD = 0x1
CUDPP_OPTION_BACKWARD = 0x2
CUDPP_OPTION_EXCLUSIVE = 0x4
CUDPP_OPTION_INCLUSIVE = 0x8
CUDPP_OPTION_CTA_LOCAL = 0x10
CUDPP_OPTION_KEYS_ONLY = 0x20
CUDPP_OPTION_KEY_VALUE_PAIRS = 0x40

#enum CUDPPDatatype
#{
#CUDPP_CHAR, //!< Character type (C char)
#CUDPP_UCHAR, //!< Unsigned character (byte) type (C unsigned char)
#CUDPP_INT, //!< Integer type (C int)
#CUDPP_UINT, //!< Unsigned integer type (C unsigned int)
#CUDPP_FLOAT //!< Float type (C float)
#};
CUDPPDatatype = c_si32

(
CUDPP_CHAR,
CUDPP_UCHAR,
CUDPP_INT,
CUDPP_UINT,
CUDPP_FLOAT
) = range(5)

#enum CUDPPOperator
#{
#CUDPP_ADD, //!< Addition of two operands
#CUDPP_MULTIPLY, //!< Multiplication of two operands
#CUDPP_MIN, //!< Minimum of two operands
#CUDPP_MAX //!< Maximum of two operands
#};
CUDPPOperator = c_si32
(
CUDPP_ADD,
CUDPP_MULTIPLY,
CUDPP_MIN,
CUDPP_MAX
) = range(4)

#enum CUDPPAlgorithm
#{
#CUDPP_SCAN,
#CUDPP_SEGMENTED_SCAN,
#CUDPP_COMPACT,
#CUDPP_REDUCE,
#CUDPP_SORT_RADIX,
#CUDPP_SPMVMULT, /< Sparse matrix-dense vector multiplication */
#CUDPP_RAND_MD5, /
< Pseudo Random Number Generator using MD5 hash algorithm*/
#CUDPP_ALGORITHM_INVALID, /**< Placeholder at end of enum */
#};

CUDPPAlgorithm = c_si32
(
CUDPP_SCAN,
CUDPP_SEGMENTED_SCAN,
CUDPP_COMPACT,
CUDPP_REDUCE,
CUDPP_SORT_RADIX,
CUDPP_SPMVMULT,
CUDPP_RAND_MD5,
CUDPP_ALGORITHM_INVALID
) = range(8)

#struct CUDPPConfiguration
#{
#CUDPPAlgorithm algorithm; //!< The algorithm to be used
#CUDPPOperator op; //!< The numerical operator to be applied
#CUDPPDatatype datatype; //!< The datatype of the input arrays
#unsigned int options; //!< Options to configure the algorithm
#};
class CUDPPConfiguration(ct.Structure):
fields = [(‘algorithm’, CUDPPAlgorithm),
(‘op’, CUDPPOperator),
(‘datatype’, CUDPPDatatype),
(‘options’, c_ui32)]

def __repr__(self):
    '''Print structured objects'''
    res = []
    for field in self._fields_:
        res.append('%s=%s' % (field[0], repr(getattr(self, field[0]))))
    return self.__class__.__name__ + '(' + ','.join(res) + ')'

def __str__(self):
    '''Print structured objects'''
    res = []
    for field in self._fields_:
        data = getattr(self,field[0])
        if field[0] == "maxThreadsDim":
            data = "%d %d %d" % (data[0],data[1],data[2])
        if field[0] == "maxGridSize":
            data = "%d %d %d" % (data[0],data[1],data[2])
        res.append('%-25s = %s' % (field[0], data))
    return "\n".join(res)

##define CUDPP_INVALID_HANDLE 0xC0DABAD1
#typedef size_t CUDPPHandle;
CUDPP_INVALID_HANDLE = 0xC0DABAD1
CUDPPHandle = c_size
CUDPPHandle_p = ct.POINTER(CUDPPHandle)

class cudppException(Exception):
pass

from cuda_libs import CUDPP as cudpp

#CUDPPResult cudppPlan(CUDPPHandle *planHandle,
#CUDPPConfiguration config,
#size_t n,
#size_t rows,
#size_t rowPitch);
cudppPlan = cudpp.cudppPlan
cudppPlan.restype = CUDPPResult
cudppPlan.argtypes = [ CUDPPHandle_p, CUDPPConfiguration,
c_size, c_size, c_size ]

#CUDPPResult cudppDestroyPlan(CUDPPHandle plan);
cudppDestroyPlan = cudpp.cudppDestroyPlan
cudppDestroyPlan.restype = CUDPPResult
cudppDestroyPlan.argtypes = [ CUDPPHandle ]

#CUDPPResult cudppScan(CUDPPHandle planHandle,
#void *d_out,
#const void *d_in,
#size_t numElements);
cudppScan = cudpp.cudppScan
cudppScan.restype = CUDPPResult
cudppScan.argtypes = [ CUDPPHandle,
p_void, p_void, c_size ]

#CUDPPResult cudppMultiScan(CUDPPHandle planHandle,
#void *d_out,
#const void *d_in,
#size_t numElements,
#size_t numRows);
cudppMultiScan = cudpp.cudppMultiScan
cudppMultiScan.restype = CUDPPResult
cudppMultiScan.argtypes = [ CUDPPHandle,
p_void, p_void, c_size, c_size ]

#CUDPPResult cudppSegmentedScan(CUDPPHandle planHandle,
#void *d_out,
#const void *d_idata,
#const unsigned int *d_iflags,
#size_t numElements);
cudppSegmentedScan = cudpp.cudppSegmentedScan
cudppSegmentedScan.restype = CUDPPResult
cudppSegmentedScan.argtypes = [ CUDPPHandle,
p_void, p_void, c_ui32, c_size ]

#CUDPPResult cudppCompact(CUDPPHandle planHandle,
#void *d_out,
#size_t *d_numValidElements,
#const void *d_in,
#const unsigned int *d_isValid,
#size_t numElements);
cudppCompact = cudpp.cudppCompact
cudppCompact.restype = CUDPPResult
cudppCompact.argtypes = [ CUDPPHandle,
p_void, c_size, p_void, c_ui32, c_size ]

#CUDPPResult cudppSort(CUDPPHandle planHandle,
#void *d_keys,
#void *d_values,
#int keybits,
#size_t numElements);
cudppSort = cudpp.cudppSort
cudppSort.restype = CUDPPResult
cudppSort.argtypes = [ CUDPPHandle,
p_void, p_void, c_si32, c_size ]

#CUDPPResult cudppSparseMatrix(CUDPPHandle *sparseMatrixHandle,
#CUDPPConfiguration config,
#size_t n,
#size_t rows,
#const void *A,
#const unsigned int *h_rowIndices,
#const unsigned int *h_indices);
cudppSparseMatrix = cudpp.cudppSparseMatrix
cudppSparseMatrix.restype = CUDPPResult
cudppSparseMatrix.argtypes = [ CUDPPHandle_p, CUDPPConfiguration,
c_size, c_size, p_void, p_ui32, p_ui32 ]

#CUDPPResult cudppDestroySparseMatrix(CUDPPHandle sparseMatrixHandle);
cudppDestroySparseMatrix = cudpp.cudppDestroySparseMatrix
cudppDestroySparseMatrix.restype = CUDPPResult
cudppDestroySparseMatrix.argtypes = [ CUDPPHandle ]

#CUDPPResult cudppSparseMatrixVectorMultiply(CUDPPHandle sparseMatrixHandle,
#void *d_y,
#const void *d_x);
cudppSparseMatrixVectorMultiply = cudpp.cudppSparseMatrixVectorMultiply
cudppSparseMatrixVectorMultiply.restype = CUDPPResult
cudppSparseMatrixVectorMultiply.argtypes = [ CUDPPHandle_p,
p_void, p_void ]

#CUDPPResult cudppRand(CUDPPHandle planHandle,void * d_out, size_t numElements);
cudppRand = cudpp.cudppRand
cudppRand.restype = CUDPPResult
cudppRand.argtypes = [ CUDPPHandle_p, p_void, c_size ]

#CUDPPResult cudppRandSeed(const CUDPPHandle planHandle, unsigned int seed);
cudppRandSeed = cudpp.cudppRandSeed
cudppRandSeed.restype = CUDPPResult
cudppRandSeed.argtypes = [ CUDPPHandle_p, c_ui32 ]

test_cudpp.py

#!/usr/bin/env python

coding:utf-8

############################################################
###############

Last modified: 2009.07.20

Created : 2009.07.20

Author : Arno Pähler

Version : 1.0

References : none

############################################################
###############

#####################################################

Author: Arno Pähler, Berlin,DE

Sasayama,JP

© 2007 - 2009

#####################################################

import time

from aa_types import *
from cudpp import *
from cuda_utils import *

def verifySort(keysSorted,valuesSorted,keysUnsorted,length):
for i in range(length-1):
if keysSorted[i]>keysSorted[i+1]:
print ‘Unordered key[%u]:%u > key[%u]:%u’ % (
i,keysSorted[i],i+1,keysSorted[i+1])
return -1
if isinstance(valuesSorted,int):
return 0
for i in range(length):
if keysUnsorted[(valuesSorted[i])] != keysSorted[i]:
print ‘Incorrectly sorted value[%u] (%u) %u != %u’ % (
i,valuesSorted[i],keysUnsorted[valuesSorted[i]],keysSorted[i ])
return -2

import numpy.random as nr
spo(3)

def radixSortTest(plan,config,tests,numTests,numElements,keybits
,
numIterations,quiet):

outString = '%s %s' % (
'float' if config.datatype == CUDPP_FLOAT else 'unsigned int',
'keys' if config.options == CUDPP_OPTION_KEYS_ONLY else 'key-value pairs')
data_type = t_fp32 if config.datatype == CUDPP_FLOAT else t_ui32

if data_type == t_ui32:
    h_keys         = nr.random_integers(1,999,numElements).astype(t_ui32)
else:
    h_keys         = nr.rand(numElements).astype(t_fp32)+.1
h_keysSorted   = np.zeros((numElements,),dtype=data_type)
h_values       = 0
h_valuesSorted = 0

c_keys = np.array(h_keys)
tc =time.time()
ix = np.argsort(c_keys,kind='quicksort')
tc =time.time()-tc
print c_keys[ix[:5]]
print 'CPU argsort = %.3f msec for %d keys' % (1.e3*tc,c_keys.size)
if config.options&CUDPP_OPTION_KEY_VALUE_PAIRS:
    h_values       = np.arange(numElements,dtype=t_ui32)
    h_valuesSorted = np.zeros((numElements,),dtype=t_ui32)

d_keys = gpuAlloc(numElements,dtype=data_type)
get_last_cuda_error('d_keys')
if config.options&CUDPP_OPTION_KEY_VALUE_PAIRS:
    d_values = gpuAlloc(numElements,dtype=t_ui32)
    get_last_cuda_error('d_values')
else:
    d_values = 0

start_event = cudaEvent_t()
stop_event = cudaEvent_t()
cudaEventCreate(ct.byref(start_event))
cudaEventCreate(ct.byref(stop_event))

for k in range(numTests):
    if numTests == 1:
        tests[0] = numElements
    if not quiet:
        print '\nRunning a sort of %d %s' % (tests[k], outString)
    totalTime = 0.
    wallTime = 0.

    len_test = tests[k]
    for i in range(numIterations):
        write_device(d_keys,h_keys[:len_test])
        if config.options&CUDPP_OPTION_KEY_VALUE_PAIRS:
            write_device(d_values,h_values[:len_test])

        t0 = time.time()
        cudaThreadSynchronize()

        cudaEventRecord(start_event,0)
        cudppSort(plan,d_keys,d_values,keybits,len_test)

        cudaEventRecord(stop_event, 0)
        cudaEventSynchronize(stop_event)

        etime = c_fp32()
        cudaEventElapsedTime(byref(etime),start_event,stop_event)
        totalTime += etime.value

        cudaThreadSynchronize()
        t0 = time.time()-t0
        wallTime += t0

    read_device(h_keysSorted[:len_test],d_keys)
    print h_keys[:5]
    print h_keysSorted[:5]
    print
    print h_keys[len_test-5:len_test]
    print h_keysSorted[len_test-5:len_test]
    if config.options&CUDPP_OPTION_KEY_VALUE_PAIRS:
        read_device(h_valuesSorted[:len_test],d_values)
        print
        print h_values[:5]
        print h_valuesSorted[:5]
        print
        print h_keys[h_valuesSorted[:5]]
        print h_keys[h_valuesSorted[len_test-5:len_test]]
    else:
        h_values = 0
    print

    verifySort(h_keysSorted,h_valuesSorted,h_keys,tests[k])

    if not quiet:
        print 'Average execution time: %0.2f ms' % (totalTime/numIterations)
        print 'Average wallclock time: %0.2f ms' % (1.e3*wallTime/numIterations)
    else:
        print '\t%10d\t%0.2f' % (tests[k],totalTime/numIterations)
        print '\t%10d\t%0.2f' % (tests[k],1.e3*wallTime/numIterations)
    print
    print 'Speedup GPU vs CPU: %0.2fx' % (tc*numIterations/wallTime)
cudaFree(d_keys);
cudaFree(d_values);

def testRadixSort(argv,configPtr=None):
argc = len(argv)

keybits,retval = 32,0
numIterations = 10

config = CUDPPConfiguration()
config.algorithm = CUDPP_SORT_RADIX
config.datatype = CUDPP_UINT
#config.datatype = CUDPP_FLOAT
config.options = CUDPP_OPTION_KEY_VALUE_PAIRS
#config.options = CUDPP_OPTION_KEYS_ONLY

if configPtr is not None:
   config = configPtr
else:
    config.datatype = CUDPP_UINT
    config.datatype = CUDPP_FLOAT

test = [39, 128, 256, 512, 513, 1000, 1024, 1025, 32768,
        45537, 65536, 131072, 262144, 500001, 524288,
        1048577, 1048576, 1048581, 2097152, 4194304,
        8388608]
test =  [4194304,6214149,8388608]
test =  [6214149]

numTests = len(test)
numElements = test[-1]

keysOnly = 'keysonly' in argv
keyValue = 'keyval' in argv
quiet = 'quiet' in argv

if 'float' in argv:
    config.datatype = CUDPP_FLOAT
elif 'uint' in argv:
    config.datatype = CUDPP_UINT

if config.options == CUDPP_OPTION_KEYS_ONLY|keysOnly:
    keysOnly = True
    config.options = CUDPP_OPTION_KEYS_ONLY
else:
    keysOnly = False
    config.options = CUDPP_OPTION_KEY_VALUE_PAIRS

plan = CUDPPHandle()
result = cudppPlan(ct.byref(plan),config,numElements,1,0)

if result != CUDPP_SUCCESS:
    print 'Error in plan creation'
    cudppDestroyPlan(plan)

#numTests = 1
if config.datatype == CUDPP_UINT:
    radixSortTest(plan,config,test,numTests,numElements,
        keybits,numIterations,quiet)
elif config.datatype == CUDPP_FLOAT:
    radixSortTest(plan,config,test,numTests,numElements,
        keybits,numIterations,quiet)

result = cudppDestroyPlan(plan)

if result != CUDPP_SUCCESS:
    print 'Error destroying CUDPPPlan for Scan'

if name == ‘main’:
from sys import argv
testRadixSort(argv)

compileCUDPP

#!/bin/env python

coding:utf-8: © Arno Pähler, 2007-09

script to create dynamic .so library for CUDPP

copy this script to the source directory

/where/it/was/unpacked/cudpp_1.1/cudpp/src

then execute it to create _cuddp%s.so % arch

where arch can be one of ‘32’ or ‘64’ and

where sm_level is a tuple (major,minor)

describing the compute capabilty level.

cuda_root should equal CUDA_INSTALL_PATH

cudpp.py contains the ctpyes Python bindings

import sys,os,user

arch = ‘64’
sm_level = (1,3)

cuda_root = ‘/usr/local/cuda%s/cuda’ % arch
cuda_inc = ‘%s/include’ % cuda_root
cuda_lib = ‘%s/lib’ % cuda_root

src_root = ‘.’

smx = ‘sm_%d%d’ % sm_level
vnn = ‘architecture’

ccompiler = ‘g++’
flags_cxx = ‘-fPIC -O3 -msse3 -malign-double’
flags = ’ ‘.join([’-m%s -c’ % arch,flags_cxx,])
includes = ‘-I%s -I%s -I%s’ % (cuda_inc,’…/include’,’…/…/common/inc’)

targets_cc = [
‘cudpp.cpp’,‘cudpp_maximal_launch.cpp’,
‘cudpp_plan.cpp’,‘cudpp_plan_manager.cpp’]

targets_cu = [
‘compact_app.cu’,‘radixsort_app.cu’,‘rand_app.cu’,
‘scan_app.cu’,‘segmented_scan_app.cu’,‘spmvmult_app.cu’]

targets = ’ '.join(targets_cc)

cmd1 = ‘%s %s %s %s’ % (ccompiler,flags,targets,includes)
print cmd1
os.system(cmd1)

flags_common = [’–use_fast_math’,
‘–gpu-%s %s’ % (vnn,smx),]

targets = ’ '.join(‘app/%s’ %x for x in targets_cu)

vcompiler = ‘nvcc’
ccompiler = ‘g++ -fPIC -shared -malign-double’

prefix = ‘_’
lib_name = ‘cudpp’

flags_so = [’-m%s’ % arch,’–compiler-options -fPIC’,]

flags = ’ '.join(flags_common+flags_so)

libraries = ‘-L%s -lcudart -lcuda’ % cuda_lib

so_file = ‘%s%s%s.so’ % (prefix,lib_name,arch)

cmd1 = ‘nvcc -c %s %s %s’ % (flags,includes,targets)
cmd2 = ‘%s -m%s *.o %s -o %s’ % (ccompiler,arch,libraries,so_file)
command = ‘%s ; %s’ % (cmd1,cmd2)
print cmd1
print cmd2
os.system(command)