#include #include #include #include typedef float2 Complex; float ***PyArray_to_FCArray(PyArrayObject *arrayin); Complex ***PyArray_to_CCArray(PyArrayObject *arrayin); cufftComplex *PyArray_to_CuArray(PyArrayObject *arrayin); float *** PyArray_to_FCArray(PyArrayObject *arrayin) { float ***arrayout, *a; int i, j, m, n, o; m = arrayin->dimensions[0]; n = arrayin->dimensions[1]; o = arrayin->dimensions[2]; a = (float *) arrayin->data; arrayout = (float ***)malloc(sizeof(float **) *m); for (i = 0; i < m; i++) { arrayout[i] = (float **)malloc(sizeof(float *) *n); for (j = 0; j < n; j++) arrayout[i][j] = a + i*m*o + j*o; } return arrayout; } Complex *** PyArray_to_CCArray(PyArrayObject *arrayin) { Complex ***arrayout, *a; int i, j, m, n, o; m = arrayin->dimensions[0]; n = arrayin->dimensions[1]; o = arrayin->dimensions[2]; a = (Complex *) arrayin->data; arrayout = (Complex ***)malloc(sizeof(Complex **) *m); for (i = 0; i < m; i++) { arrayout[i] = (Complex **)malloc(sizeof(Complex *) *n); for (j = 0; j < n; j++) arrayout[i][j] = a + i*m*o + j*o; } return arrayout; } cufftComplex * FPyArray_to_CuArray(PyArrayObject *arrayin) { cufftComplex *arrayout, *cudaarray; float ***arraytemp; int i, j, k, m, n, o; m = arrayin->dimensions[0]; n = arrayin->dimensions[1]; o = arrayin->dimensions[2]; arraytemp = PyArray_to_FCArray(arrayin); arrayout = (cufftComplex *)malloc(sizeof(cufftComplex)*m*n*o); for (i = 0; i < m; i++) for (j = 0; j < n; j++) for (k = 0; k < o; k++) { arrayout[i*m + j*n + k].x = arraytemp[i][j][k]; arrayout[i*m + j*n + k].y = 0.0; } cudaMalloc((void**)&cudaarray, sizeof(cufftComplex)*m*n*o); cudaMemcpy(cudaarray, arrayout, sizeof(cufftComplex)*m*n*o, cudaMemcpyHostToDevice); free(arrayout); return cudaarray; } cufftComplex * CPyArray_to_CuArray(PyArrayObject *arrayin) { cufftComplex *arrayout; Complex ***arraytemp; int i, j, k, m, n, o; m = arrayin->dimensions[0]; n = arrayin->dimensions[1]; o = arrayin->dimensions[2]; arraytemp = PyArray_to_CCArray(arrayin); arrayout = (cufftComplex *)malloc(sizeof(cufftComplex)*m*n*o); for (i = 0; i < m; i++) for (j = 0; j < n; j++) for (k = 0; k < o; k++) { arrayout[i*m + j*n + k].x = arraytemp[i][j][k].x; arrayout[i*m + j*n + k].y = arraytemp[i][j][k].y; } return arrayout; } cufftComplex *** CPyArray_to_CArray(PyArrayObject *arrayin) { cufftComplex ***arrayout, *a; int i, j, m, n, o; m = arrayin->dimensions[0]; n = arrayin->dimensions[1]; o = arrayin->dimensions[2]; a = (cufftComplex *) arrayin->data; arrayout = (cufftComplex ***)malloc(sizeof(cufftComplex **) * m); for (i = 0; i < m; i++) { arrayout[i] = (cufftComplex **)malloc(sizeof(cufftComplex *) * n); for (j = 0; j < n; j++) arrayout[i][j] = a + i*m*o + j*o; } return arrayout; } void free_CArray(double ***v) { free((char*) v); } void free_CFArray(float ***v) { free((char*) v); } void free_CCArray(Complex ***v) { free((char*) v); } static PyObject* test(PyObject* self, PyObject* args) { if (!PyArg_ParseTuple(args, "")) return 0; return Py_BuildValue("s", "Cuda works!"); } extern "C" static PyObject* fft3(PyObject* self, PyObject* args) { PyObject *input; PyArrayObject *a, *b; cufftComplex *cin, *cout, *cudaarrayf; Complex ***final; cufftHandle plan; int i, j, k, m, n, o, dim[3], FFT_SIZE; if (!PyArg_ParseTuple(args, "O", &input)) return NULL; a = (PyArrayObject *)PyArray_ContiguousFromObject(input, NPY_CFLOAT, 3, 3); m = dim[0] = a->dimensions[0]; n = dim[1] = a->dimensions[1]; o = dim[2] = a->dimensions[2]; FFT_SIZE = m*n*o; b = (PyArrayObject* ) PyArray_FromDims(3, dim, NPY_CFLOAT); // Convert PyArrayObjects to C pointers cin = CPyArray_to_CuArray(a); cout = (cufftComplex *)malloc(sizeof(cufftComplex)*FFT_SIZE); final = PyArray_to_CCArray(b); //Add cuda stuff here cufftPlan3d(&plan, m, n, o, CUFFT_C2C); cudaMalloc((void**)&cudaarrayf, sizeof(cufftComplex) * FFT_SIZE); cudaMemcpy(cudaarrayf, cin, sizeof(cufftComplex) * FFT_SIZE, cudaMemcpyHostToDevice); cufftExecC2C(plan, cudaarrayf, cudaarrayf, -1); cudaMemcpy(cout, cudaarrayf, sizeof(cufftComplex) * FFT_SIZE, cudaMemcpyDeviceToHost); cufftDestroy(plan); //cudaFree(cudaarrayf); //free(a); //Copy CudaArray to CArray for (i = 0; i < m; i++) for (j = 0; j < n; j++) for (k = 0; k < o; k++) { final[i][j][k].x = cout[i*m+j*n+k].x; final[i][j][k].y = cout[i*m+j*n+k].y; //fprintf(stderr, "%f+%fj\n", cout[i*m+j*n+k].x, cout[i*m+j*n+k].y); } return PyArray_Return(b); } extern "C" static PyObject* ifft3(PyObject* self, PyObject* args) { PyObject *input; PyArrayObject *a, *b; cufftComplex *cin, *cout, *cudaarrayi; Complex ***final; cufftHandle plan; int i, j, k, m, n, o, dim[3], FFT_SIZE; if (!PyArg_ParseTuple(args, "O", &input)) return NULL; a = (PyArrayObject *)PyArray_ContiguousFromObject(input, NPY_CFLOAT, 3, 3); m = dim[0] = a->dimensions[0]; n = dim[1] = a->dimensions[1]; o = dim[2] = a->dimensions[2]; FFT_SIZE = m*n*o; b = (PyArrayObject* ) PyArray_FromDims(3, dim, NPY_CFLOAT); // Convert PyArrayObjects to C pointers cin = CPyArray_to_CuArray(a); cout = (cufftComplex *)malloc(sizeof(cufftComplex)*m*n*o); final = PyArray_to_CCArray(b); //Add cuda stuff here cufftPlan3d(&plan, m, n, o, CUFFT_C2C); cudaMalloc((void**)&cudaarrayi, sizeof(cufftComplex) * FFT_SIZE); cudaMemcpy(cudaarrayi, cin, sizeof(cufftComplex) * FFT_SIZE, cudaMemcpyHostToDevice); cufftExecC2C(plan, cudaarrayi, cudaarrayi, 1); cudaMemcpy(cout, cudaarrayi, sizeof(cufftComplex) * FFT_SIZE, cudaMemcpyDeviceToHost); cufftDestroy(plan); //cudaFree(cudaarrayi); //free(a); for (i = 0; i < m; i++) for (j = 0; j < n; j++) for (k = 0; k < o; k++) { final[i][j][k].x = cout[i*m+j*n+k].x; final[i][j][k].y = cout[i*m+j*n+k].y; //fprintf(stderr, "%f+%fj\n", cout[i*m+j*n+k].x, cout[i*m+j*n+k].y); //fprintf(stderr, "%f+%fj\n", final[i][j][k].x, final[i][j][k].y); } return PyArray_Return(b); } static PyMethodDef cudaMethods[] = { {"test", test, METH_VARARGS, "Cuda test."}, {"fft3", fft3, METH_VARARGS, "Cuda 3D FFT."}, {"ifft3", ifft3, METH_VARARGS, "Cuda 3D IFFT."}, {NULL, NULL, 0, NULL} }; extern "C" void initcuda() { Py_InitModule("cuda", cudaMethods); import_array(); //Needed for Numpy }