Problem with malloc() and cudaMalloc() on Jetson TX1

Hi everyone.
I have a problem with the size of the allocations made by malloc() or cudaMalloc().

When I try to execute this simple code on Jetson TX1:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

// Each block transposes/copies a tile of TILE_DIM x TILE_DIM elements
// using TILE_DIM x BLOCK_ROWS threads, so that each thread transposes
// TILE_DIM/BLOCK_ROWS elements.  TILE_DIM must be an integral multiple of BLOCK_ROWS
#define TILE_DIM 64
#define BLOCK_ROWS 4


#define SIZE 16384

// Out-of-place matrix transpose through a shared-memory tile.
//
//   odata  : output matrix, `width` rows by `height` columns, row-major
//   idata  : input matrix, `height` rows by `width` columns, row-major
//   width  : number of columns of the input
//   height : number of rows of the input
//
// Launch requirements:
//   - blockDim must be (TILE_DIM, BLOCK_ROWS); each thread handles
//     TILE_DIM / BLOCK_ROWS elements of its tile.
//   - gridDim must cover the input with TILE_DIM x TILE_DIM tiles, i.e. at
//     least ceil(width / TILE_DIM) by ceil(height / TILE_DIM) blocks.
//   - odata and idata must not overlap.
//   - Static shared memory: TILE_DIM * (TILE_DIM + 1) floats per block.
//
// Partial edge tiles are handled: every global access is bounds-checked, so
// width and height do not have to be multiples of TILE_DIM.
__global__ void Transpose(float *odata, const float *idata, int width, int height)
{
	// The extra column of padding shifts each row by one bank. Without it the
	// column-wise read below has a stride of TILE_DIM floats, which maps every
	// lane of a warp to the same shared-memory bank and serializes the access.
	__shared__ float tile[TILE_DIM][TILE_DIM + 1];

	// Coordinates of this thread's first element in the input matrix.
	const int xIn = blockIdx.x * TILE_DIM + threadIdx.x;
	const int yIn = blockIdx.y * TILE_DIM + threadIdx.y;

	// Coordinates in the output matrix: the block indices are swapped so the
	// tile lands at its transposed position while threadIdx.x still walks
	// along a row, keeping the global stores contiguous.
	const int xOut = blockIdx.y * TILE_DIM + threadIdx.x;
	const int yOut = blockIdx.x * TILE_DIM + threadIdx.y;

	for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
	{
		// Guard the load so edge blocks do not read past the input.
		if (xIn < width && yIn + i < height)
		{
			// size_t arithmetic keeps the flat index from overflowing int.
			tile[threadIdx.y + i][threadIdx.x] = idata[(size_t)(yIn + i) * width + xIn];
		}
	}

	// Every thread of the block must reach this barrier, which is why the
	// bounds checks live inside the loops rather than around them.
	__syncthreads();

	for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
	{
		// The output has `width` rows of `height` elements. Any tile entry
		// read here was written above, because the two guards test the same
		// input coordinates.
		if (xOut < height && yOut + i < width)
		{
			odata[(size_t)(yOut + i) * height + xOut] = tile[threadIdx.x][threadIdx.y + i];
		}
	}

}


// Stops the program with a readable message when a CUDA runtime call fails.
// `what` names the operation that was attempted.
static void checkCuda(cudaError_t status, const char *what)
{
	if (status != cudaSuccess)
	{
		fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
		exit(EXIT_FAILURE);
	}
}

// Fills a SIZE x SIZE matrix on the host, transposes it on the device 30
// times and copies the result back. Returns 0 on success; any allocation or
// CUDA failure is reported on stderr and ends the program with EXIT_FAILURE.
int main()
{
	// Element and byte counts are computed in size_t so the products cannot
	// overflow int. With SIZE 16384 each buffer is 1 GiB, and the program
	// allocates four of them (two on the host, two on the device).
	const size_t count = (size_t)SIZE * SIZE;
	const size_t bytes = count * sizeof(float);

	// execution configuration parameters
	// SIZE is an exact multiple of TILE_DIM, so the grid covers the matrix exactly.
	dim3 grid(SIZE / TILE_DIM, SIZE / TILE_DIM), threads(TILE_DIM, BLOCK_ROWS);

	float *dev_A = 0;
	float *dev_B = 0;

	float *A = (float*)calloc(count, sizeof(float));
	float *B = (float*)calloc(count, sizeof(float));
	if (A == NULL || B == NULL)
	{
		// Stop here: continuing would dereference a null pointer below.
		fprintf(stderr, "Malloc Failure!\n");
		free(A);
		free(B);
		return EXIT_FAILURE;
	}

	// Each element holds its own flat index.
	for (size_t k = 0; k < count; k++)
		A[k] = (float)k;

	// A device out-of-memory condition is reported here instead of being ignored.
	checkCuda(cudaMalloc((void**)&dev_A, bytes), "cudaMalloc(dev_A)");
	checkCuda(cudaMalloc((void**)&dev_B, bytes), "cudaMalloc(dev_B)");

	checkCuda(cudaMemcpy(dev_A, A, bytes, cudaMemcpyHostToDevice), "cudaMemcpy host to device");

	for (int i = 0; i < 30; i++)
	{
		Transpose<<<grid, threads>>>(dev_B, dev_A, SIZE, SIZE);
		// Kernel launches return nothing; a bad configuration shows up here.
		checkCuda(cudaGetLastError(), "Transpose launch");
	}
	// Faults raised while the kernels run surface at the next synchronizing call.
	checkCuda(cudaDeviceSynchronize(), "Transpose execution");

	checkCuda(cudaMemcpy(B, dev_B, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy device to host");

	checkCuda(cudaFree(dev_A), "cudaFree(dev_A)");
	checkCuda(cudaFree(dev_B), "cudaFree(dev_B)");
	free(A);
	free(B);
	checkCuda(cudaDeviceReset(), "cudaDeviceReset");

	return 0;
}

The OS kills the process and prints “Killed”.

If I change the SIZE variable:

#define SIZE 8192

everything works. For this reason I think that the problem is in the malloc() or cudaMalloc() call.

Is there some flag in the project properties (something like “Enable LARGEADDRESSES” in Visual Studio) that I can set in order to solve this problem?

Thanks.

OK, no problem, I found it. Each buffer is 16384 × 16384 × 4 bytes = 1 GB, so the 2 host allocations take 2 GB and the 2 cudaMalloc() calls take another 2 GB.

2 + 2 = 4 GB and Jetson TX1 does not have this amount of free memory.

If you did proper CUDA error checking, I’m pretty sure at least one of those cudaMalloc() calls would have returned an “out of memory” error.