CUDA Vector addition error.

When starting a CUDA vector addition with full GPU capabilities, i recieve an error:

Unhandled exception at 0x7641C52F in Simple_mases.exe: Microsoft C++ exception: std::bad_alloc at memory location 0x003BF6F4.

Device:
Quadro 3000M
Max threads per block = 1024
Max grid size = 65535

#include "math.h"
#include "iostream"
#include <ctime>
#include <fstream>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda.h>

#include <device_functions.h>
#include <cuda_runtime_api.h>

using namespace std;

__global__ void Simple(double *d_in_first, double *d_in_second, double *d_out) {

	int indx = threadIdx.x + blockIdx.x * blockDim.x;
	d_out[indx] = d_in_first[indx] + d_in_second[indx];
}

double random(double min, double max) {
	return (double)(rand()) / RAND_MAX*(max - min) + min;
}


int main(int argc, char ** argv)
{
	ofstream output("mass.txt", ios_base::app);
	srand(time(NULL));

	const int Block_Number = 65535;
	const int Threads_per_Block = 1024;
	const int Threads_Number = Block_Number * Threads_per_Block;      
	const int Bytes_for_Threads = (Threads_Number) * sizeof(double); 

	double *h_in_first  = new double[Threads_Number];
	double *h_in_second = new double[Threads_Number];
	double *h_out = new double[Threads_Number];

	for (int i = 0; i < Threads_Number; i++) {
		h_in_first[i]  = random(0.0000001, 0.0000002);
		h_in_second[i] = random(0.0000003, 0.0000004);
	}


	double *d_in_first;
	double *d_in_second;
	double *d_out;

	cudaMalloc((void**)&d_in_first, Bytes_for_Threads);
	cudaMalloc((void**)&d_in_second, Bytes_for_Threads);
	cudaMalloc((void**)&d_out, Bytes_for_Threads);
	
	cudaMemcpy(d_in_first, h_in_first, Bytes_for_Threads, cudaMemcpyHostToDevice);
	cudaMemcpy(d_in_second, h_in_second, Bytes_for_Threads, cudaMemcpyHostToDevice);

	Simple<<<Block_Number, Threads_per_Block>>>(d_in_first, d_in_second, d_out);

	cudaMemcpy(h_out, d_out, Bytes_for_Threads, cudaMemcpyDeviceToHost);

	double sum_cpu = 0;
	double sum_gpu = 0;
	for (int i = 0; i < Threads_Number; i++) {
		sum_cpu += h_in_first[i] + h_in_second[i];
		sum_gpu += h_out[i];
	}
	output << "ArraySize: " << Threads_Number << "	" << "CPU sum: " << sum_cpu << "		" << "GPU sum: " << sum_gpu << endl;
	output << endl;

	cudaFree(d_in_first);
	cudaFree(d_in_second);
	cudaFree(d_out);
    return 0;
}

Looks like an exception is being thrown by your new calls. You could catch the exception and try to get more info from it.

I also highly recommend you consider refactoring your code to something like:

thrust::device_vector<int> a, b, c;
// fill vectors
thrust::transform(
  // input
  a.begin(), a.end(), 
  b.begin(), 
  // output
  c.begin(), 
  // binary callable
  thrust::plus<int>{});

65535 * 1024 * sizeof(double) * 3 = 1.5GB acquired. does your host/device have enough memory available ?
did you build it as a 64-bit application? and how about smaller Block_Number ?

You should execute your code with cuda memcheck if it is memory issue. Or you can easily compute free device memory.