When starting a CUDA vector addition with full GPU capabilities, i recieve an error:
Unhandled exception at 0x7641C52F in Simple_mases.exe: Microsoft C++ exception: std::bad_alloc at memory location 0x003BF6F4.
Device:
Quadro 3000M
Max threads per block = 1024
Max grid size = 65535
#include "math.h"
#include "iostream"
#include <ctime>
#include <fstream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda.h>
#include <device_functions.h>
#include <cuda_runtime_api.h>
using namespace std;
__global__ void Simple(double *d_in_first, double *d_in_second, double *d_out) {
int indx = threadIdx.x + blockIdx.x * blockDim.x;
d_out[indx] = d_in_first[indx] + d_in_second[indx];
}
double random(double min, double max) {
return (double)(rand()) / RAND_MAX*(max - min) + min;
}
int main(int argc, char ** argv)
{
ofstream output("mass.txt", ios_base::app);
srand(time(NULL));
const int Block_Number = 65535;
const int Threads_per_Block = 1024;
const int Threads_Number = Block_Number * Threads_per_Block;
const int Bytes_for_Threads = (Threads_Number) * sizeof(double);
double *h_in_first = new double[Threads_Number];
double *h_in_second = new double[Threads_Number];
double *h_out = new double[Threads_Number];
for (int i = 0; i < Threads_Number; i++) {
h_in_first[i] = random(0.0000001, 0.0000002);
h_in_second[i] = random(0.0000003, 0.0000004);
}
double *d_in_first;
double *d_in_second;
double *d_out;
cudaMalloc((void**)&d_in_first, Bytes_for_Threads);
cudaMalloc((void**)&d_in_second, Bytes_for_Threads);
cudaMalloc((void**)&d_out, Bytes_for_Threads);
cudaMemcpy(d_in_first, h_in_first, Bytes_for_Threads, cudaMemcpyHostToDevice);
cudaMemcpy(d_in_second, h_in_second, Bytes_for_Threads, cudaMemcpyHostToDevice);
Simple<<<Block_Number, Threads_per_Block>>>(d_in_first, d_in_second, d_out);
cudaMemcpy(h_out, d_out, Bytes_for_Threads, cudaMemcpyDeviceToHost);
double sum_cpu = 0;
double sum_gpu = 0;
for (int i = 0; i < Threads_Number; i++) {
sum_cpu += h_in_first[i] + h_in_second[i];
sum_gpu += h_out[i];
}
output << "ArraySize: " << Threads_Number << " " << "CPU sum: " << sum_cpu << " " << "GPU sum: " << sum_gpu << endl;
output << endl;
cudaFree(d_in_first);
cudaFree(d_in_second);
cudaFree(d_out);
return 0;
}