Hello everyone.
I recently started porting a C++ matrix library to CUDA in order to accelerate the matrix operations.
In the old CPU-based code, I had a member function in the matrix class, .apply(double (*func)(double)), that applied a function to each element of the matrix:
Matrix Matrix::apply(double (*func)(double)) const
{
    Matrix applied(this->rows, this->cols); // result matrix of the same shape
    for (int i = 0; i < this->rows; i++)
        for (int j = 0; j < this->cols; j++)
            applied.data[i][j] = func(this->data[i][j]);
    return applied;
}
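
For reference, a typical call looks like this (square is just a hypothetical element-wise function; the Matrix constructor taking rows and cols is assumed from the snippet above):

double square(double x) { return x * x; }

Matrix m(3, 3);
Matrix squared = m.apply(square); // every element replaced by its square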
Now, how would I be able to sort of “copy” the function pointer from host to device, in order to pass it to a kernel that does the exact same thing, only parallelized across multiple threads, like this:
void applyWrapper(double** A, double (*hostFuncPtr)(double), size_t rows, size_t cols)
{
    auto deviceFuncPtr = /* copy the function pointed to by hostFuncPtr into a device function pointer */;
    applyKernel<<<1, 256>>>(A, deviceFuncPtr, rows, cols); // one block of 256 threads, matching the stride loop below
}
__global__ void applyKernel(double** A, double (*devFuncPtr)(double), size_t rows, size_t cols)
{
    size_t idx = threadIdx.x;
    size_t stride = blockDim.x;
    // each thread processes every stride-th column
    for (size_t i = idx; i < cols; i += stride)
        for (size_t j = 0; j < rows; j++)
            A[j][i] = devFuncPtr(A[j][i]);
}
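
The closest pattern I've come across so far defines the function as __device__ and reads its address back through a __device__ function-pointer symbol with cudaMemcpyFromSymbol, roughly like this (square and d_squarePtr are just names I made up for the sketch):

__device__ double square(double x) { return x * x; }

// device-side symbol initialized with the device address of square
__device__ double (*d_squarePtr)(double) = square;

void applyWrapperSketch(double** A, size_t rows, size_t cols)
{
    double (*deviceFuncPtr)(double);
    // copy the function's device address from the symbol into a host variable
    cudaMemcpyFromSymbol(&deviceFuncPtr, d_squarePtr, sizeof(deviceFuncPtr));
    applyKernel<<<1, 256>>>(A, deviceFuncPtr, rows, cols);
}

But that only works because square is already compiled as a __device__ function. What I can't figure out is how (or whether) an arbitrary host double (*)(double) pointer can be translated at run time, since host and device code live in separate address spaces.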