Hi to everyone.
I have a problem with malloc() or cudaMalloc() size.
When I try to execute this simple code on Jetson TX1:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
// Each block transposes/copies a tile of TILE_DIM x TILE_DIM elements
// using TILE_DIM x BLOCK_ROWS threads, so that each thread transposes
// TILE_DIM/BLOCK_ROWS elements. TILE_DIM must be an integral multiple of BLOCK_ROWS
#define TILE_DIM 64
#define BLOCK_ROWS 4
#define SIZE 16384
__global__ void Transpose(float *odata, float *idata, int width, int height)
{
__shared__ float tile[TILE_DIM][TILE_DIM];
int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;
int index_in = xIndex + (yIndex)*width;
xIndex = blockIdx.y * TILE_DIM + threadIdx.x;
yIndex = blockIdx.x * TILE_DIM + threadIdx.y;
int index_out = xIndex + (yIndex)*height;
for (int i = 0; i<TILE_DIM; i += BLOCK_ROWS)
{
tile[threadIdx.y + i][threadIdx.x] = idata[index_in + i*width];
}
__syncthreads();
for (int i = 0; i<TILE_DIM; i += BLOCK_ROWS)
{
odata[index_out + i*height] = tile[threadIdx.x][threadIdx.y + i];
}
}
int main()
{
// execution configuration parameters
dim3 grid(SIZE / TILE_DIM, SIZE / TILE_DIM), threads(TILE_DIM, BLOCK_ROWS);
float *dev_A = 0;
float *dev_B = 0;
float *A = (float*)calloc(SIZE * SIZE,sizeof(float));
//float *A = (float*)malloc(SIZE * SIZE * sizeof(float));
if (A == NULL)
printf("Malloc Failure!");
//float *B = (float*)malloc(SIZE * SIZE * sizeof(float));
float *B = (float*)calloc(SIZE * SIZE, sizeof(float));
if (B == NULL)
printf("Malloc Failure!");
for (int i = 0; i < SIZE; i++)
for (int j = 0; j < SIZE; j++){
A[j + i * SIZE] = j + i * SIZE;
}
cudaMalloc((void**)&dev_A, SIZE * SIZE * sizeof(float));
cudaMalloc((void**)&dev_B, SIZE * SIZE * sizeof(float));
cudaMemcpy(dev_A, A, SIZE * SIZE * sizeof(float), cudaMemcpyHostToDevice);
for(int i = 0; i < 30; i++)
Transpose << <grid, threads >> >(dev_B, dev_A, SIZE, SIZE);
cudaMemcpy(B, dev_B, SIZE * SIZE * sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(dev_A);
cudaFree(dev_B);
free(A);
free(B);
cudaDeviceReset();
return 0;
}
The OS answer me “Killed”.
If I change the SIZE variable:
#define SIZE 8192
everything work. For this reason I think that the problem is malloc or cudaMalloc function.
There is some flag in project properties (something like “Enable LARGEADDRESSES” on Visual Studio) that I can check in order to solve this problem?
Thanks.