Cuda code exited without printing the results and no error

The code from the tutorial:
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

#include <stdlib.h>
#include <time.h>
#include <stdio.h>

global void mem_transfer_test(int* input) {
int tid = threadIdx.x;
int offset = blockDim.x * blockIdx.x;

int gid = offset + tid;
printf("threadIdx_X: %d, value : %d, GlobalIdx_X: %d", tid, input[gid], gid);


int main() {
int size = 128;
int array_byte_size = sizeof(int) * size;

int* h_input;
h_input = (int*)malloc(array_byte_size);

time_t t;
//	srand(time(NULL));

for (int i = 0; i < array_byte_size; i++) {
	h_input[i] = (int)(rand() & 0xff);

int* d_input;
cudaMalloc((void**)&d_input, array_byte_size);

cudaMemcpy(d_input, h_input, array_byte_size, cudaMemcpyHostToDevice);

dim3 block(64);
dim3 grid(2);

mem_transfer_test << <grid, block >> > (d_input);



return 0;


and the console had the following lines:
…\CudaRuntime2\x64\Debug\CudaRuntime2.exe (process 1016) exited with code -1073740940.
Press any key to close this window . . .

I couldn’t figure out why it isn’t giving the desired result.

Your for loop is broken. it should be:

for (int i = 0; i < size; i++) {
1 Like

A non-zero exit code indicates an error. In this case: STATUS_HEAP_CORRUPTION. That is Windows complaining about the host code misbehaving. An out-of-bounds write access to a dynamically allocated array would be my guess as to the root cause.

1 Like