I am dealing with a big data file whose size can be up to several terabytes. Allocating that much memory is impossible, so I can’t cache the whole file in host memory.
What should I do to maximize performance? Can anybody give me some code samples or hints?
Below is my attempt at this; any suggestions?
[codebox]// Host driver: streams "1.dat" through the GPU in fixed-size batches of
// 16-byte keys, double-buffered over two CUDA streams so that reading and
// uploading batch k overlaps with the FindKey kernel still running on
// batch k-1 in the other stream.
//
// Each stream owns one pinned host buffer, one device buffer and one result
// slot. Keeping a result slot per stream (instead of one shared word) lets a
// hit be attributed to the exact batch that produced it.
//
// NOTE(review): assumes FindKey stores the batch-local index of the matching
// key into *result and leaves *result untouched when nothing matches --
// confirm against the kernel, which is not part of this listing.
//
// Returns 0 on normal completion, 1 if CUDA initialisation fails.
int main(int argc, char** argv)
{
    if (!InitCUDA()) {
        return 1;   // initialisation failure must not look like success
    }

    //----------------------------------------------------------------------
    // Copy cipher text to constant memory
    DWORD cipherText[32] =
    {
        0x48236065, 0x13DE2D16,
        0x9152846E, 0x9F686492,
        0xBAED17F2, 0xE8958B0C,
        0x478623B1, 0xAC4BF0AA,
        0x23F2DD3C, 0x7E2F6F5B,
        0x084FEDEB, 0x02D216EE,
        0x0CAC6E9B, 0x437E47C0,
        0x431146B4, 0x6BECB0D7,
        0x5916460A, 0xB83A4951,
        0x73F62882, 0x33CFCF62,
        0xBD21E797, 0x1D49C596,
        0xB200228A, 0x9B4E3DBE,
        0xDCD06586, 0x65A73A2E,
        0xB0D6E10A, 0xF4B34A77,
        0xB43D1316, 0xFC683C8B,
        0,0
    };
    CUDA_SAFE_CALL( cudaMemcpyToSymbol(CipherText, cipherText, sizeof(cipherText)) );

    //----------------------------------------------------------------------
    // Launch configuration and batch geometry
    const int    numBlocks    = 32768;
    const int    numThreads   = 128;
    const DWORD  NOT_FOUND    = 0xFFFFFFFF;                     // sentinel: slot holds no hit
    const size_t keysPerBatch = (size_t)numBlocks * numThreads; // one key per launched thread
    const size_t memsize      = keysPerBatch << 4;              // 16 bytes per key

    //----------------------------------------------------------------------
    // Per-stream input buffers (device + pinned host)
    ulong4 *device_keys[2];
    char   *host_keys[2];
    CUDA_SAFE_CALL( cudaMalloc((void**) &device_keys[0], memsize) );
    CUDA_SAFE_CALL( cudaMalloc((void**) &device_keys[1], memsize) );
    CUDA_SAFE_CALL( cudaMallocHost((void**) &host_keys[0], memsize) );
    CUDA_SAFE_CALL( cudaMallocHost((void**) &host_keys[1], memsize) );

    // Per-stream result slots, all initialised to NOT_FOUND (0xFF bytes)
    DWORD *device_result, *host_result;
    CUDA_SAFE_CALL( cudaMalloc((void**) &device_result, 2 * sizeof(DWORD)) );
    CUDA_SAFE_CALL( cudaMemset(device_result, 0xFF, 2 * sizeof(DWORD)) );
    CUDA_SAFE_CALL( cudaMallocHost((void**) &host_result, 2 * sizeof(DWORD)) );
    host_result[0] = NOT_FOUND;
    host_result[1] = NOT_FOUND;

    // create work streams
    cudaStream_t stream[2];
    CUDA_SAFE_CALL( cudaStreamCreate(&stream[0]) );
    CUDA_SAFE_CALL( cudaStreamCreate(&stream[1]) );

    // open file for read
    FILE *fp = fopen("1.dat", "rb");
    if (fp == NULL)
    {
        printf("Error: open 1.dat");
        CUT_EXIT(argc, argv);
    }

    // make timer
    unsigned int timer = 0;
    CUT_SAFE_CALL( cutCreateTimer( &timer));
    CUT_SAFE_CALL( cutStartTimer( timer));

    // 64-bit bookkeeping: a multi-terabyte file holds far more than 2^32 keys.
    unsigned long long batchIndex = 0;          // index of the batch being read
    unsigned long long batchOf[2] = { 0, 0 };   // batch last launched on each stream
    unsigned long long foundIndex = 0;          // global index of the matching key
    bool   found    = false;
    size_t readLen  = 0;
    int    streamId = 0;

    do
    {
        // Read the next batch. This host buffer is free again: the upload that
        // last used it was synchronised two iterations ago.
        readLen = fread(host_keys[streamId], 1, memsize, fp);
        const DWORD realNum = (DWORD)(readLen >> 4);    // whole 16-byte keys only

        // Within this stream, first fetch the result of the batch it processed
        // previously, then upload the new batch. Stream ordering guarantees
        // both wait for that earlier kernel; the other stream keeps computing.
        CUDA_SAFE_CALL( cudaMemcpyAsync(&host_result[streamId], &device_result[streamId], sizeof(DWORD),
                                        cudaMemcpyDeviceToHost, stream[streamId]) );
        CUDA_SAFE_CALL( cudaMemcpyAsync(device_keys[streamId], host_keys[streamId], readLen,
                                        cudaMemcpyHostToDevice, stream[streamId]) );
        CUDA_SAFE_CALL( cudaStreamSynchronize(stream[streamId]) );

        // Did the previous batch on this stream find the key?
        if (host_result[streamId] != NOT_FOUND)
        {
            found      = true;
            foundIndex = batchOf[streamId] * keysPerBatch + host_result[streamId];
            break;
        }

        // Process the batch just uploaded (skip an empty trailing read).
        if (realNum > 0)
        {
            FindKey<<<numBlocks, numThreads, 0, stream[streamId]>>>(device_keys[streamId],
                                                                    device_result + streamId,
                                                                    realNum);
            CUT_CHECK_ERROR("Kernel execution failed\n");
            batchOf[streamId] = batchIndex;
        }

        // switch stream, advance to next batch
        streamId ^= 1;
        ++batchIndex;
    } while (readLen == memsize);   // a short read means end of file

    // stop timer once all outstanding GPU work has drained
    CUDA_SAFE_CALL( cudaThreadSynchronize() );
    CUT_SAFE_CALL( cutStopTimer( timer));
    printf("Processing time: %f (ms)\n", cutGetTimerValue( timer));
    CUT_SAFE_CALL( cutDeleteTimer( timer));

    // The last batch on each stream has not been polled yet: check both slots
    // and keep the hit with the smallest global index.
    if (!found)
    {
        CUDA_SAFE_CALL( cudaMemcpy(host_result, device_result, 2 * sizeof(DWORD), cudaMemcpyDeviceToHost) );
        for (int s = 0; s < 2; ++s)
        {
            if (host_result[s] == NOT_FOUND)
                continue;
            const unsigned long long idx = batchOf[s] * keysPerBatch + host_result[s];
            if (!found || idx < foundIndex)
            {
                found      = true;
                foundIndex = idx;
            }
        }
    }

    if (!found)
        printf("no result found!");
    else
        printf("result = %llu\n", foundIndex);

    // free all
    fclose(fp);
    // cudaStreamDestroy returns cudaError_t, so it must be checked with
    // CUDA_SAFE_CALL. CUT_SAFE_CALL is the cutil checker for the boolean
    // cut* helpers; it reads cudaSuccess (0) as a failure, which is the
    // spurious error seen when these calls were wrapped in it.
    CUDA_SAFE_CALL( cudaStreamDestroy(stream[0]) );
    CUDA_SAFE_CALL( cudaStreamDestroy(stream[1]) );
    CUDA_SAFE_CALL( cudaFree(device_result));
    CUDA_SAFE_CALL( cudaFreeHost(host_result));
    CUDA_SAFE_CALL( cudaFree(device_keys[0]));
    CUDA_SAFE_CALL( cudaFree(device_keys[1]));
    CUDA_SAFE_CALL( cudaFreeHost(host_keys[0]) );
    CUDA_SAFE_CALL( cudaFreeHost(host_keys[1]) );
    //CUT_EXIT(argc, argv);
    return 0;
}[/codebox]