Need help on big data file

I am dealing with a big data file whose size is up to several terabytes. Allocating that much memory with malloc is impossible, so I can’t cache everything in host memory.

What should I do to maximize the performance? Can anybody give me some code samples or hints?

Below is my attempt at the goal; any suggestions?

[codebox]
// Streams a multi-terabyte key file from disk in fixed-size chunks,
// double-buffered across two CUDA streams so that pinned-host->device
// uploads overlap with FindKey kernel execution. device_result holds a
// single DWORD: 0xFFFFFFFF means "not found", otherwise the in-chunk
// index of the matching key.
int main(int argc, char** argv)  // BUG FIX: was `char* argv`; CUT_EXIT needs the real argv array
{
    if (!InitCUDA()) {
        return 0;
    }

    // Cipher text the kernel searches against; uploaded to __constant__
    // memory (CipherText symbol) for broadcast reads.
    DWORD cipherText[32] = {
        0x48236065, 0x13DE2D16,
        0x9152846E, 0x9F686492,
        0xBAED17F2, 0xE8958B0C,
        0x478623B1, 0xAC4BF0AA,
        0x23F2DD3C, 0x7E2F6F5B,
        0x084FEDEB, 0x02D216EE,
        0x0CAC6E9B, 0x437E47C0,
        0x431146B4, 0x6BECB0D7,
        0x5916460A, 0xB83A4951,
        0x73F62882, 0x33CFCF62,
        0xBD21E797, 0x1D49C596,
        0xB200228A, 0x9B4E3DBE,
        0xDCD06586, 0x65A73A2E,
        0xB0D6E10A, 0xF4B34A77,
        0xB43D1316, 0xFC683C8B,
    };
    CUDA_SAFE_CALL( cudaMemcpyToSymbol(CipherText, cipherText, sizeof(cipherText)) );

    // Launch configuration: 32768 blocks x 128 threads = 4M keys per chunk.
    const int numBlocks  = 32768;
    const int numThreads = 128;

    DWORD memsize = (DWORD)numBlocks * numThreads << 4;  // 16 bytes (one ulong4) per key
    DWORD realNum = (DWORD)numBlocks * numThreads;       // keys actually present in the chunk

    // Double-buffered input: per-stream device buffers plus pinned host
    // buffers, so cudaMemcpyAsync can genuinely overlap with the kernel.
    ulong4 *device_keys[2];
    char   *host_keys[2];
    CUDA_SAFE_CALL( cudaMalloc((void**)&device_keys[0], memsize) );
    CUDA_SAFE_CALL( cudaMalloc((void**)&device_keys[1], memsize) );
    CUDA_SAFE_CALL( cudaMallocHost((void**)&host_keys[0], memsize) );
    CUDA_SAFE_CALL( cudaMallocHost((void**)&host_keys[1], memsize) );

    // Single-word result slot, initialized to the "not found" sentinel on
    // both sides (cudaMemset is byte-wise, so 0xFF fills give 0xFFFFFFFF).
    DWORD *device_result, *host_result;
    CUDA_SAFE_CALL( cudaMalloc((void**)&device_result, sizeof(DWORD)) );
    CUDA_SAFE_CALL( cudaMemset(device_result, 0xFF, sizeof(DWORD)) );
    CUDA_SAFE_CALL( cudaMallocHost((void**)&host_result, sizeof(DWORD)) );
    *host_result = 0xFFFFFFFF;

    // One stream per buffer. BUG FIX: the streams were never created in the
    // original code -- the handles were uninitialized garbage, which is why
    // the cudaStreamDestroy calls at the end "triggered an error". (Also:
    // they were wrapped in CUT_SAFE_CALL, which checks a cutil CUTBoolean,
    // not the cudaError_t that cudaStreamDestroy returns.)
    cudaStream_t stream[2];
    CUDA_SAFE_CALL( cudaStreamCreate(&stream[0]) );
    CUDA_SAFE_CALL( cudaStreamCreate(&stream[1]) );

    // Open the input file for binary reading.
    FILE *fp = fopen("1.dat", "rb");
    if (fp == NULL) {
        printf("Error: open 1.dat");
        CUT_EXIT(argc, argv);
    }

    // Wall-clock timer around the whole streaming loop.
    unsigned int timer = 0;
    CUT_SAFE_CALL( cutCreateTimer(&timer) );
    CUT_SAFE_CALL( cutStartTimer(timer) );

    int loopTimes = 0, readLen = 0, streamId = 0;

    do {
        // Read the next chunk into the pinned buffer of the idle stream.
        readLen = fread(host_keys[streamId], 1, memsize, fp);
        realNum = readLen >> 4;  // 16 bytes per key

        // Queue a result read-back behind this stream's previous kernel,
        // then upload the fresh chunk. The synchronize makes host_result
        // valid to inspect and guarantees host_keys[streamId] is free for
        // the next fread into this buffer.
        CUDA_SAFE_CALL( cudaMemcpyAsync(host_result, device_result, sizeof(DWORD), cudaMemcpyDeviceToHost, stream[streamId]) );
        CUDA_SAFE_CALL( cudaMemcpyAsync(device_keys[streamId], host_keys[streamId], readLen, cudaMemcpyHostToDevice, stream[streamId]) );
        CUDA_SAFE_CALL( cudaStreamSynchronize(stream[streamId]) );

        // An earlier chunk already produced the key -- stop streaming.
        if (*host_result != 0xFFFFFFFF) {
            break;
        }

        // Search this chunk asynchronously on the current stream.
        FindKey<<<numBlocks, numThreads, 0, stream[streamId]>>>(device_keys[streamId], device_result, realNum);
        CUT_CHECK_ERROR("Kernel execution failed\n");

        // Alternate buffers/streams so the next fread + upload overlaps
        // with the kernel just launched.
        streamId ^= 1;
        ++loopTimes;  // BUG FIX: was never incremented, breaking the final index math
    } while (readLen >= (int)memsize);

    // Drain all outstanding work, then report elapsed time.
    CUDA_SAFE_CALL( cudaThreadSynchronize() );
    CUT_SAFE_CALL( cutStopTimer(timer) );
    printf("Processing time: %f (ms)\n", cutGetTimerValue(timer));
    CUT_SAFE_CALL( cutDeleteTimer(timer) );

    // If the loop ended on a short (final) chunk, that last kernel's result
    // was never copied back inside the loop -- fetch it now.
    if (readLen < (int)memsize) {
        CUDA_SAFE_CALL( cudaMemcpy(host_result, device_result, sizeof(DWORD), cudaMemcpyDeviceToHost) );
    }

    if (*host_result == 0xFFFFFFFF) {
        printf("no result found!");
    } else {
        // NOTE(review): host_result holds the in-chunk index and loopTimes
        // counts chunks handed to the kernel. If the early-exit path above
        // fired, the match came from an earlier chunk than loopTimes
        // suggests, so this offset can be off by a chunk or two -- verify
        // against FindKey's index convention.
        printf("result = %lu\n", (unsigned long)(loopTimes * (memsize >> 4) + *host_result));
    }

    fclose(fp);  // BUG FIX: the file handle was leaked

    // Release everything; stream destruction works now that the streams
    // were actually created, and uses the CUDA (not cutil) check macro.
    CUDA_SAFE_CALL( cudaStreamDestroy(stream[0]) );
    CUDA_SAFE_CALL( cudaStreamDestroy(stream[1]) );
    CUDA_SAFE_CALL( cudaFree(device_result) );
    CUDA_SAFE_CALL( cudaFreeHost(host_result) );
    CUDA_SAFE_CALL( cudaFree(device_keys[0]) );
    CUDA_SAFE_CALL( cudaFree(device_keys[1]) );
    CUDA_SAFE_CALL( cudaFreeHost(host_keys[0]) );
    CUDA_SAFE_CALL( cudaFreeHost(host_keys[1]) );

    //CUT_EXIT(argc, argv);
    return 0;
}


If you are working with a proprietary data format, then on a POSIX system, I would suggest using mmap(). The Windows API doesn’t have mmap(), but it does have MapViewOfFile(), which might make the file IO side of things a little easier. You will probably want to overlap host site disk IO and GPU side computation to improve performance.