recusion in cuda

Dear all
this is code fragment for my program
I made this function as a recursive function, it compiles correctly but don’t give the right answer.
please help me.

//assume P=8
__device__ void FinalMerge(int *a, int *temp, int rec_num)
{

	int count = 1;
	int threadId = threadIdx.x + blockIdx.x*blockDim.x;
	
	if (0 <= threadId && threadId < (P/2)){
	
		int Lindex1 =  rec_num * (threadId);
		int LendIndex1 = Lindex1 + rec_num/2-1;
                int Lindex2 = LendIndex1 + 1; 
                int LendIndex2 = Lindex2 + rec_num/2-1;
		//---------------
		int LtargetIndex = Lindex1; 
		int LsortedSize = 0;

		
		while (Lindex1 <= LendIndex1 && Lindex2 <= LendIndex2){
			if (LsortedSize == rec_num/2)
				break;

			if (a[Lindex1] <= a[Lindex2]){
				temp[LtargetIndex] = a[Lindex1];
	                        ++LsortedSize; 		
                                ++Lindex1; 	
                        	++LtargetIndex;
			}

			else{
				temp[LtargetIndex] = a[Lindex2]; 
	                       ++Lindex2;  	
                              ++LsortedSize; 		
                              ++LtargetIndex;
			}


		}
	


	++count;
	__syncthreads();
	if(count<=(P/2)){
		rec_num = (2 ^ count)*1024;
		FinalMerge(temp, temp,temp10,(rec_num));
		
	}
	]
	
}
FinalMerge<< <P, 1024 >> >(dev_a, dev_temp);
cudaMemcpy(a, dev_temp, N*sizeof(int), cudaMemcpyDeviceToHost);

please define “the right answer”.

What you have shown here doesn’t make sense. A device function cannot be configured:

FinalMerge<< <P, 1024 >> >(dev_a, dev_temp);

You’re likely to get better help if you post a short, complete program, rather than fragments or snippets.

my program shall made merge sort
the function here is the final stage of sorting which merges the sorted array from each block

here my function can’t merge .
the final array not sorted.