Cuda multithreading and stream problems generic system issues

I upgraded from an 8800 GT to 2 x 9800GTX+ last week, and I’ve since encountered a couple of puzzling performance issues.

Originally on the 8800GT I had my kernel and memory calls organized into streams, and the code worked fine and was slightly faster to boot. However, in the new system, although the same code is still functionally correct, it runs like a dog (100x slowdown). Removing the streams seems to make the code happy, and it runs correctly and quickly. Does anybody have any idea of what could be going on?

So then, I tried to run my code on multiple GPUs using pthreads. No luck there either. The multi-threaded code on two GPUs runs at half the speed of the original code on one GPU. I’m not really sure why this is, other than possibly the threads are being serialized/incurring other overhead somewhere. My kernel also runs in about 1.5 seconds, so pthread overhead may not be sufficiently amortized (this seems like a stretch).

I was wondering if anyone had encountered/can explain either of these issues.

I can post code if this would help.

Not sure–can you post code? What does the simpleStreams benchmark show? What driver are you running?

pthread overhead is a stretch (for two threads? as slow as pthreads is, that shouldn’t be an overhead problem). Code would help.

The streams benchmark shows:

memcopy: 21.02

kernel: 34.03

non-streamed: 54.96 (55.05 expected)

8 streams: 47.37 (36.66 expected with compute capability 1.1 or later)


Test PASSED

Press ENTER to exit…

I just updated the cuda toolkit/driver to the most recent version, and that seems to have fixed the stream issue. Now the streamed version runs a bit faster than the non-streamed version.

Here’s code.

// Number of CUDA devices detected at startup (set in main via cudaGetDeviceCount).
int GPU_N;
// Per-thread control record: each worker thread hashes its own device's
// share of every tree level up to (but not including) max_ell.
typedef struct {
int dev;      // CUDA device ordinal this thread drives (also its slice index)
int max_ell;  // one past the highest tree level this thread processes
} MD6Ctrl;

/*
 * Hash the word range [lo, hi) of tree level `ell` on the GPU, recursing
 * until each leaf call covers exactly 512*PAR_KERNELS input words (one
 * batched kernel launch). Smaller remainders fall back to the scalar
 * hash_segment path. cudaDest/cudaSrc are in/out device-buffer handles,
 * lazily allocated on the first leaf call and reused thereafter.
 *
 * NOTE(review): reconstructed from forum-mangled text — the markdown
 * stripped '*' characters (multiplications, pointer derefs) and broke a
 * comment opener; verify against the original source.
 */
void hash_segment_par(int ell, size_t lo, size_t hi, md6_word **cudaDest, md6_word **cudaSrc)
{
    if ((hi - lo) == 512 * PAR_KERNELS) { /* compression input block size */

        /* Lazily allocate the reusable device buffers on first use.
         * The original posted code tested `cudaDest == NULL`, but that
         * parameter is &callerLocal and can never be NULL — the intended
         * (asterisk-eaten) test is on the pointed-to device pointer. */
        if (*cudaDest == NULL) {
            /* TODO(review): cudaMalloc/cudaMemcpy return codes are
             * unchecked — a failed alloc makes the later copies write
             * through a bad device pointer. Wrap with an error check. */
            cudaMalloc((void **) cudaDest, PIPESTEP * 16 * PAR_KERNELS * sizeof(md6_word));
            cudaMalloc((void **) cudaSrc,  PIPESTEP * 64 * PAR_KERNELS * sizeof(md6_word));
        }

        /* Upload this segment's input words (64 per compression). */
        cudaMemcpy((void *) *cudaSrc, (void *) &(A[ell][lo]),
                   64 * PAR_KERNELS * sizeof(md6_word), cudaMemcpyHostToDevice);

        dim3 dimBlock(PAR_THREAD_X, PAR_THREAD_Y, 16);
        dim3 dimGrid(PAR_GRID_X, PAR_GRID_Y);
        compress_block_par<<<dimGrid, dimBlock>>>(ell, lo, *cudaDest, *cudaSrc, zk);
        /* TODO(review): no cudaGetLastError() after the launch — config
         * errors would pass silently. */

        /* Each compression shrinks 4:1, hence lo/4 and 16 output words. */
        cudaMemcpy(&(A[ell + 1][lo / 4]), *cudaDest,
                   16 * PAR_KERNELS * sizeof(md6_word), cudaMemcpyDeviceToHost);
    }
    else if (hi - lo < 512 * PAR_KERNELS) {
        /* Remainder smaller than one GPU batch: do it on the CPU path,
         * splitting until we reach single 512-word compression inputs. */
        if (hi - lo == 512) {
            hash_segment(ell, lo, hi);
        } else {
            size_t mid = (lo + hi) / 2;
            hash_segment(ell, lo, mid);
            hash_segment(ell, mid, hi);
        }
    }
    else {
        /* Larger than one batch: divide and conquer on the GPU path. */
        size_t mid = (lo + hi) / 2;
        hash_segment_par(ell, lo, mid, cudaDest, cudaSrc);
        hash_segment_par(ell, mid, hi, cudaDest, cudaSrc);
    }
}

// need to drill through mallocs…
// need to figure out how to fix the IDs
/*
 * pthread entry point: bind this thread to its CUDA device, then hash this
 * device's 1/GPU_N slice of every level up to ctrl->max_ell. The device
 * buffer handles start NULL so hash_segment_par allocates them once on
 * the correct device (the one selected by cudaSetDevice below).
 *
 * NOTE(review): reconstructed from forum-mangled text — pointer '*'s,
 * the loop condition, and the multiplications were eaten by markdown.
 */
void hash_thread(MD6Ctrl* ctrl) {
    md6_word *cudaDest = NULL, *cudaSrc = NULL;
    int ell;
    /* Pitfall: assert() disappears under NDEBUG, taking the cudaSetDevice
     * call with it. Prefer an explicit check in production builds. */
    assert(cudaSuccess == cudaSetDevice(ctrl->dev));
    printf("In hash thread: %d\n", ctrl->dev);
    for (ell = 0; ell < ctrl->max_ell; ell++)
    {
        /* Slice [N[ell]/GPU_N * dev, N[ell]/GPU_N * (dev+1)) of level ell.
         * NOTE(review): assumes N[ell] divides evenly by GPU_N — the last
         * words are dropped otherwise; confirm against setup(). */
        hash_segment_par(ell,
                         N[ell] / GPU_N * ctrl->dev,
                         N[ell] / GPU_N * (ctrl->dev + 1),
                         &cudaDest, &cudaSrc);
    }
    pthread_exit(NULL);
}

/* main */

/*
 * Driver: build the MD6-style hash tree level by level. Levels large
 * enough to split across all GPUs are skipped until the first level whose
 * successor fits (GPU_N*512 words each side); that level and everything
 * above it is handed to one worker thread per GPU. Levels too small to
 * split are hashed serially on the CPU path.
 *
 * NOTE(review): reconstructed from forum-mangled text — '*'s in pointer
 * declarations, casts, and multiplications were stripped, and string
 * literals acquired smart quotes; verify against the original source.
 */
int main(int argc, char **argv)
{
    int ell;
    int i;

    //Cilk_time start_time, elapsed_time;
    double seconds;  /* retained for the commented-out Cilk timing code */

    /* setup */
    zk = 11; /* zn = 29 for half a gig a level 0; a gig total */
    assert(cudaSuccess == cudaGetDeviceCount(&GPU_N));
    //GPU_N = 1;
    setup(zk);
    printf("Input file has 2**%d = %g bytes.\n", 7 + 2 * zk, (double)N[0]);

    /* do computation */
    //start_time = Cilk_get_wall_time();

    /* TODO(review): malloc results are unchecked and never freed (benign
     * here since main exits, but worth fixing). */
    pthread_t *threads = (pthread_t *) malloc(GPU_N * sizeof(pthread_t));
    MD6Ctrl   *ctrls   = (MD6Ctrl *)   malloc(GPU_N * sizeof(MD6Ctrl));

    start_timer();
    for (ell = 0; ell < zk; ell++)
    {
        if ((N[ell] >= GPU_N * 512) && (N[ell + 1] <= GPU_N * 512)) {
            /* Lowest level still splittable across all GPUs: spawn one
             * worker per device; each hashes levels [0, ell] for its slice. */
            for (i = 0; i < GPU_N; i++) {
                ctrls[i].dev = i;
                ctrls[i].max_ell = ell + 1;
                pthread_create(&(threads[i]), NULL,
                               (void *(*)(void *)) hash_thread,
                               (void *) &ctrls[i]);
            }
            for (i = 0; i < GPU_N; i++) {
                pthread_join(threads[i], NULL);
            }
        } else if (N[ell] >= GPU_N * 512) {
            /* Still splittable and so is the next level — the threads will
             * cover this level too; keep searching for the lowest one. */
            continue;
        } else {
            /* Too small to split across GPUs: serial CPU fallback. */
            hash_segment(ell, 0, N[ell]);
        }
    }

    end_timer();

    print_time();

    /* print out result, time used, and hashing speed */
    printf("\n");
    for (i = 96; i < 128; i += 8)
        /* NOTE(review): passing a pointer to a PR_MD6_WORD format looks
         * like another mangling casualty — likely meant A[zk][i]; confirm. */
        printf(PR_MD6_WORD " ", (md6_word *) &(A[zk][i]));

    return 0;
}

Oh, here’s a performance tidbit:

One thread, one GPU:
Input file has 2**29 = 5.36871e+08 bytes.
Hitting continue N[0] = 536870912, next [134217728]
Hitting continue N[1] = 134217728, next [33554432]
Hitting continue N[2] = 33554432, next [8388608]
Hitting continue N[3] = 8388608, next [2097152]
Hitting continue N[4] = 2097152, next [524288]
Hitting continue N[5] = 524288, next [131072]
Hitting continue N[6] = 131072, next [32768]
Hitting continue N[7] = 32768, next [8192]
Hitting continue N[8] = 8192, next [2048]
Hitting continue N[9] = 2048, next [512]
lowest level: 10
– Length = 5.368e+08 bytes
– Elapsed time = 1.720 seconds.
– Megabytes per second = 312.093.
– Total clock ticks = 5498550848
– Clock ticks / byte = 10

Two threads, two GPU:

Hitting continue N[0] = 536870912, next [134217728]
Hitting continue N[1] = 134217728, next [33554432]
Hitting continue N[2] = 33554432, next [8388608]
Hitting continue N[3] = 8388608, next [2097152]
Hitting continue N[4] = 2097152, next [524288]
Hitting continue N[5] = 524288, next [131072]
Hitting continue N[6] = 131072, next [32768]
Hitting continue N[7] = 32768, next [8192]
Hitting continue N[8] = 8192, next [2048]
lowest level: 9
Call hash seg N[10] = 512, next [128]
– Length = 5.368e+08 bytes
– Elapsed time = 2.440 seconds.
– Megabytes per second = 220.
– Total clock ticks = 4266843680
– Clock ticks / byte = 7