I practically copied the example code for computing the sum and the sum of squares from the Thrust documentation, but doing it naively on the CPU finishes WAY faster (at least 10x: the CPU is almost instant, while on the GPU I can count the seconds). The input array “arr” is roughly 5000 x 6, if that means anything; I figured the GPU should be faster with that many elements.
I’m new to GPU programming and my C++ is rusty (not that I was ever very good; I had one class and have barely touched it since). I turned GPU debug information (the -G flag) off in the compile options.
If it’s not an issue with the code or the use case, the only other thing worth mentioning is that I’m using a laptop with an integrated GeForce MX330, and I could not find anything on the website stating which sm_xx architecture I should compile for (the default was 50).
I wrote a small main program to show the timing. I got “Time taken CPU: 2845100 nanos” and “Time taken GPU: 639698400 nanos”, and in the actual program it’s even worse.
#include <omp.h>

#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <iterator>
#include <random>
#include <vector>

#include "cuda_runtime.h"
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/reduce.h>
#include <thrust/transform_reduce.h>
/// Unary function object that returns its argument multiplied by itself.
///
/// The call operator is marked `__host__ __device__`, so it can be invoked
/// from both host and device code.
template <class Value>
struct square
{
    __host__ __device__
    Value operator()(const Value& value) const
    {
        return value * value;
    }
};
int biggerKernel(std::vector<std::vector<double>>& arr, int n1, int n2, double constXKSquared, double cosineConst, double* tx1arr) {
square<double> unary_op;
thrust::plus<double> binary_op;
double init = 0;
int size = n1;
for (int i = 0; i < n2; i++) {
thrust::device_vector<double> d_x(arr[i].begin(), arr[i].end());
// compute square of sums
double ss1 = thrust::transform_reduce(d_x.begin(), d_x.end(), unary_op, init, binary_op);
double term = thrust::reduce(d_x.begin(), d_x.end());
//some other stuff here, i commented it out for testing, it's just standard math stuff.
}
return 1;
}
/// Benchmark driver: fills a small matrix with pseudo-random values, then
/// times a CPU loop computing the per-row sum and sum of squares against the
/// same work done through biggerKernel().
int main() {
    constexpr int kRows = 6;
    constexpr int kCols = 1920 * 3;

    std::vector<std::vector<double>> test(kRows);
    for (auto& row : test) {
        row.resize(kCols, 0.0);
        std::generate(row.begin(), row.end(), []() {
            return rand() % 100;
        });
    }

    // --- CPU reference (note: the timed region includes the printf calls) ---
    auto start = std::chrono::steady_clock::now();
    for (const auto& row : test) {
        double sum = 0;
        double sos = 0;
        for (const double value : row) {
            sos += value * value;  // x*x instead of the general-purpose pow(x, 2)
            sum += value;
        }
        printf("Sum = %.4f sum-of-squares = %.4f\n", sum, sos);
    }
    auto end = std::chrono::steady_clock::now();
    auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);
    // Print the tick count, not the duration object: passing a class type to
    // printf for %d is undefined behaviour, and the count is wider than int.
    printf("Time taken CPU: %lld\n", static_cast<long long>(elapsed.count()));

    // --- GPU ---
    // Untimed warm-up call. NOTE(review): the first CUDA/Thrust use in a
    // process presumably pays one-time initialization costs (context creation,
    // kernel loading); running once beforehand keeps that out of the
    // measurement below — confirm with a profiler.
    biggerKernel(test, kCols, kRows, 0.0, 0.0, nullptr);

    start = std::chrono::steady_clock::now();
    biggerKernel(test, kCols, kRows, 0.0, 0.0, nullptr);
    end = std::chrono::steady_clock::now();
    elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);
    printf("Time taken GPU: %lld\n", static_cast<long long>(elapsed.count()));
}