Problem with malloc() and cudaMalloc() on Jetson TX1

Mungio · March 21, 2017, 1:55pm

Hi to everyone.
I have a problem with malloc() or cudaMalloc() size.

When I try to execute this simple code on Jetson TX1:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

// Each block transposes/copies a tile of TILE_DIM x TILE_DIM elements
// using TILE_DIM x BLOCK_ROWS threads, so that each thread transposes
// TILE_DIM/BLOCK_ROWS elements.  TILE_DIM must be an integral multiple of BLOCK_ROWS
#define TILE_DIM 64
#define BLOCK_ROWS 4


#define SIZE 16384

// Writes the transpose of `idata` into `odata` using a shared-memory tile.
//
// Parameters:
//   odata  - device pointer to the output; it is indexed as rows of `height`
//            elements and receives width * height elements in total.
//   idata  - device pointer to the input; it is indexed as rows of `width`
//            elements (element (row, col) lives at row * width + col).
//   width  - number of elements per input row.
//   height - number of input rows.
//
// Launch requirements:
//   * blockDim must be (TILE_DIM, BLOCK_ROWS); each thread then handles
//     TILE_DIM / BLOCK_ROWS elements of its tile.
//   * The grid must supply at least ceil(width / TILE_DIM) blocks in x and
//     ceil(height / TILE_DIM) blocks in y. Threads that fall outside the
//     matrix are masked off, so dimensions need not be multiples of TILE_DIM.
//   * Static shared memory: TILE_DIM * (TILE_DIM + 1) floats per block.
__global__ void Transpose(float *odata, float *idata, int width, int height)
{
	// The extra column of padding shifts each tile row by one bank, so the
	// column-wise read in the store phase does not make every lane of a warp
	// hit the same shared-memory bank.
	__shared__ float tile[TILE_DIM][TILE_DIM + 1];

	// Coordinates of this thread's first element in the input matrix.
	const int xIn = blockIdx.x * TILE_DIM + threadIdx.x;
	const int yIn = blockIdx.y * TILE_DIM + threadIdx.y;

	for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
	{
		if (xIn < width && (yIn + i) < height)
		{
			// Flat index is formed in size_t so that large matrices cannot
			// overflow a 32-bit int.
			tile[threadIdx.y + i][threadIdx.x] = idata[(size_t)(yIn + i) * width + xIn];
		}
	}

	// Every thread of the block reaches this barrier (it is outside the bounds
	// guards), so the whole tile is visible before any thread reads it back.
	__syncthreads();

	// Coordinates in the output matrix: block indices are swapped so that the
	// tile lands at its transposed position.
	const int xOut = blockIdx.y * TILE_DIM + threadIdx.x;
	const int yOut = blockIdx.x * TILE_DIM + threadIdx.y;

	for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
	{
		// This guard selects exactly the tile entries that the load phase wrote,
		// so no uninitialized shared memory is ever read.
		if (xOut < height && (yOut + i) < width)
		{
			odata[(size_t)(yOut + i) * height + xOut] = tile[threadIdx.x][threadIdx.y + i];
		}
	}
}


// The launch in main() uses SIZE / TILE_DIM blocks per axis and the kernel
// steps through each tile in strides of BLOCK_ROWS, so both divisions must be exact.
#if (SIZE % TILE_DIM) != 0
#error "SIZE must be an integral multiple of TILE_DIM"
#endif
#if (TILE_DIM % BLOCK_ROWS) != 0
#error "TILE_DIM must be an integral multiple of BLOCK_ROWS"
#endif

// Evaluates a CUDA runtime call; if it returns anything other than
// cudaSuccess, prints the source location and the runtime's error string to
// stderr and terminates the process with a failure status.
#define CUDA_CHECK(call)                                              \
	do {                                                              \
		cudaError_t status_ = (call);                                 \
		if (status_ != cudaSuccess) {                                 \
			fprintf(stderr, "CUDA error at %s:%d: %s\n",              \
				__FILE__, __LINE__, cudaGetErrorString(status_));     \
			exit(EXIT_FAILURE);                                       \
		}                                                             \
	} while (0)

// Fills a SIZE x SIZE host matrix, copies it to the device, runs the Transpose
// kernel 30 times, and copies the result back to the host.
//
// Memory footprint: two host buffers and two device buffers, each of
// SIZE * SIZE * sizeof(float) bytes.
//
// Returns 0 on success. Returns EXIT_FAILURE if a host allocation fails; any
// failing CUDA call terminates the process through CUDA_CHECK.
int main()
{
	// Computed in size_t so the element and byte counts cannot overflow int.
	const size_t numElems = (size_t)SIZE * SIZE;
	const size_t numBytes = numElems * sizeof(float);

	// execution configuration parameters
	dim3 grid(SIZE / TILE_DIM, SIZE / TILE_DIM), threads(TILE_DIM, BLOCK_ROWS);

	float *dev_A = 0;
	float *dev_B = 0;

	float *A = (float*)calloc(numElems, sizeof(float));
	float *B = (float*)calloc(numElems, sizeof(float));
	if (A == NULL || B == NULL)
	{
		// Stop here instead of continuing with a NULL buffer.
		fprintf(stderr, "Malloc Failure!\n");
		free(A);
		free(B);
		return EXIT_FAILURE;
	}

	// Each element holds its own flat index.
	for (size_t k = 0; k < numElems; k++)
		A[k] = (float)k;

	CUDA_CHECK(cudaMalloc((void**)&dev_A, numBytes));
	CUDA_CHECK(cudaMalloc((void**)&dev_B, numBytes));

	CUDA_CHECK(cudaMemcpy(dev_A, A, numBytes, cudaMemcpyHostToDevice));

	for (int i = 0; i < 30; i++)
	{
		Transpose<<<grid, threads>>>(dev_B, dev_A, SIZE, SIZE);
		// A kernel launch returns no status; configuration errors are only
		// visible through cudaGetLastError().
		CUDA_CHECK(cudaGetLastError());
	}

	// Surfaces any fault raised while the kernels were executing.
	CUDA_CHECK(cudaDeviceSynchronize());

	CUDA_CHECK(cudaMemcpy(B, dev_B, numBytes, cudaMemcpyDeviceToHost));

	CUDA_CHECK(cudaFree(dev_A));
	CUDA_CHECK(cudaFree(dev_B));
	free(A);
	free(B);
	CUDA_CHECK(cudaDeviceReset());

	return 0;
}

The OS terminates the program and prints “Killed”.

If I change the SIZE variable:

#define SIZE 8192

everything works. For this reason I think that the problem is in the malloc() or cudaMalloc() function.

Is there some flag in the project properties (something like “Enable LARGEADDRESSES” in Visual Studio) that I can set in order to solve this problem?

Thanks.

Mungio · March 21, 2017, 2:05pm

Ok. No problem. 2 malloc take 2 GB and 2 cudaMalloc() take 2 GB.

2 + 2 = 4 GB and Jetson TX1 does not have this amount of free memory.

Robert_Crovella · March 21, 2017, 2:17pm

If you did proper CUDA error checking, I’m pretty sure at least one of those cudaMalloc calls would have returned an “out of memory” error.