Here’s code.
int GPU_N;
typedef struct {
int dev;
int max_ell;
} MD6Ctrl;
void hash_segment_par(int ell, size_t lo, size_t hi, md6_word **cudaDest, md6_word **cudaSrc)
{
int i;
//try to handle streams of stuff…
//have to malloc more memory by default
if ((hi-lo)==512PAR_KERNELS) { / compression input block size */
// use threadIdx.y
if(cudaDest == NULL) {
cudaMalloc((void*) cudaDest,PIPESTEP16PAR_KERNELSsizeof(md6_word));
cudaMalloc((void**) cudaSrc,PIPESTEP64PAR_KERNELSsizeof(md6_word));
}
//printf("A[%d]: %x, ell: %d, lo: %d, hi: %d size: %d\n", ell, A[ell], ell, lo, hi,64*PAR_KERNELS*sizeof(md6_word) );
cudaMemcpy((void*) *cudaSrc, (void*)&(A[ell][lo]), 64*PAR_KERNELS*sizeof(md6_word) ,cudaMemcpyHostToDevice);
dim3 dimBlock(PAR_THREAD_X,PAR_THREAD_Y,16);
dim3 dimGrid(PAR_GRID_X,PAR_GRID_Y);
compress_block_par<<<dimGrid,dimBlock>>>(ell,lo,*cudaDest,*cudaSrc,zk);
cudaMemcpy(&(A[ell+1][lo/4]), *cudaDest, 16*PAR_KERNELS*sizeof(md6_word) ,cudaMemcpyDeviceToHost);
//printf("\n");
//for(i = 0; i < PAR_KERNELS; i++) {
// print_some("Result",16, (md6_word *)&(A[ell+1][lo/4+16*i]));
//}
}
else if( hi-lo < 512*PAR_KERNELS) {
// printf(“Calling compress par hi: %d, lo: %d par_size: %d\n”, hi, lo, 512*PAR_KERNELS);
if(hi - lo == 512) {
hash_segment(ell,lo,hi);
} else {
size_t mid = (lo+hi)/2;
hash_segment(ell,lo,mid);
hash_segment(ell,mid,hi);
}
}
else
{
size_t mid = (lo+hi)/2;
hash_segment_par(ell,lo,mid,cudaDest,cudaSrc);
hash_segment_par(ell,mid,hi,cudaDest,cudaSrc);
}
}
// need to drill through mallocs…
// need to figure out how to fix the IDs
void hash_thread(MD6Ctrl* ctrl) {
md6_word cudaDest=NULL, cudaSrc=NULL;
int ell;
assert(cudaSuccess == cudaSetDevice(ctrl->dev));
printf(“In hash thread: %d\n”, ctrl->dev);
for (ell=0;ellmax_ell;ell++)
{
//printf(“Level: %d, start: %d, finish: %d\n”, ell, N[ell]/GPU_Nctrl->dev,N[ell]/GPU_N(ctrl->dev+1));
hash_segment_par(ell,N[ell]/GPU_Nctrl->dev,N[ell]/GPU_N(ctrl->dev+1),&cudaDest,&cudaSrc);
}
pthread_exit(NULL);
}
/* main */
int main(int argc, char **argv)
{ int ell;
int i;
//Cilk_time start_time, elapsed_time;
double seconds;
/* setup /
zk = 11; / zn = 29 for half a gig a level 0; a gig total /
assert(cudaSuccess == cudaGetDeviceCount(&GPU_N));
//GPU_N = 1;
setup(zk);
printf("Input file has 2*%d = %g bytes.\n",7+2*zk,(double)N[0]);
/* do computation */
//start_time = Cilk_get_wall_time();
pthread_t threads = (pthread_t) malloc(GPU_Nsizeof(pthread_t));
MD6Ctrl ctrls = (MD6Ctrl) malloc(GPU_N*sizeof(MD6Ctrl));
start_timer();
//printf(“GPU_N: %d, md6_b: %d\n”, GPU_N, md6_b);
for (ell=0;ell<zk;ell++)
{
if((N[ell] >= GPU_N512) && (N[ell+1] <= GPU_N512)) {
// printf(“lowest level: %d\n”, ell);
for(i=0;i<GPU_N;i++) {
ctrls[i].dev = i;
ctrls[i].max_ell = ell+1;
pthread_create(&(threads[i]), NULL, (void * () (void))hash_thread, (void*)&ctrls[i]);
}
for(i=0;i<GPU_N;i++) {
pthread_join(threads[i],NULL);
}
} else if(N[ell] >= GPU_N*512) {
// printf(“Hitting continue N[%d] = %d, next [%d]\n”, ell,N[ell],N[ell+1]);
// find the lowest level that can be taken by the threads
continue;
} else {
// printf(“Call hash seg N[%d] = %d, next [%d]\n”, ell,N[ell],N[ell+1]);
hash_segment(ell,0,N[ell]);
} // printf(“Done\n”);
}
end_timer();
print_time();
/* print out result, time used, and hashing speed /
printf(“\n”);
for (i=96;i<128;i+=8)
printf( PR_MD6_WORD " ",(md6_word *)&(A[zk][i]));
return 0;
}