Dear all
this is code fragment for my program
I made this function as a recursive function, it compiles correctly but don’t give the right answer.
please help me.
//assume P=8
__device__ void FinalMerge(int *a, int *temp, int rec_num)
{
int count = 1;
int threadId = threadIdx.x + blockIdx.x*blockDim.x;
if (0 <= threadId && threadId < (P/2)){
int Lindex1 = rec_num * (threadId);
int LendIndex1 = Lindex1 + rec_num/2-1;
int Lindex2 = LendIndex1 + 1;
int LendIndex2 = Lindex2 + rec_num/2-1;
//---------------
int LtargetIndex = Lindex1;
int LsortedSize = 0;
while (Lindex1 <= LendIndex1 && Lindex2 <= LendIndex2){
if (LsortedSize == rec_num/2)
break;
if (a[Lindex1] <= a[Lindex2]){
temp[LtargetIndex] = a[Lindex1];
++LsortedSize;
++Lindex1;
++LtargetIndex;
}
else{
temp[LtargetIndex] = a[Lindex2];
++Lindex2;
++LsortedSize;
++LtargetIndex;
}
}
++count;
__syncthreads();
if(count<=(P/2)){
rec_num = (2 ^ count)*1024;
FinalMerge(temp, temp,temp10,(rec_num));
}
]
}
FinalMerge<< <P, 1024 >> >(dev_a, dev_temp);
cudaMemcpy(a, dev_temp, N*sizeof(int), cudaMemcpyDeviceToHost);