New to CUDA, need some help

Hello guys,
I am still new to CUDA, and I wrote a small program to understand it better. The program is a simple compression code that reads its input from a file, XORs each pair of adjacent bytes (byte1 ^ byte2, byte3 ^ byte4, etc.), and then repeatedly adds adjacent results modulo 256, like a reduction tree. I am having trouble choosing the launch configuration (number of blocks in the grid and threads per block) for the job, and with the frequent copying of data between host and device for the second stage. I look forward to your comments and advice. :">

Thanks
-------------------------------------not sure how to post the code, so I will paste it all here------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <ctype.h>
#include <cuda.h>
#include <cuda_runtime.h>

//Kernel

// XOR stage: d_b[i] = d_a[2*i] ^ d_a[2*i + 1].
//
// Parameters:
//   d_a     - device pointer to the input bytes
//   d_b     - device pointer to the output bytes
//   nBytes  - number of valid input bytes in d_a
//   nBytes2 - number of output bytes to produce in d_b (the caller passes
//             nBytes / 2 rounded up)
//
// Launch layout: 1-D grid of 1-D blocks, one thread per output byte. The grid
// may contain more threads than outputs; surplus threads return immediately.
// When nBytes is odd, the last input byte has no partner and is copied
// through unchanged (the same as XOR with 0) instead of reading past the
// end of d_a.
__global__ void compression(unsigned char *d_a, unsigned char *d_b, int nBytes, int nBytes2)
{
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    int first = 2 * idx;

    // Guard both the output range and the input range.
    if (idx >= nBytes2 || first >= nBytes)
    {
        return;
    }

    unsigned char partner = 0;
    if (first + 1 < nBytes)
    {
        partner = d_a[first + 1];
    }
    d_b[idx] = d_a[first] ^ partner;
}
// Addition stage (one level of the tree): d_a[i] = (d_b[2*i] + d_b[2*i + 1]) % 256.
//
// Note the direction: this kernel reads from d_b and writes to d_a.
//
// Parameters:
//   d_a     - device pointer to the output bytes
//   d_b     - device pointer to the input bytes
//   nBytes  - number of output bytes to produce in d_a (the caller passes
//             nBytes2 / 2 rounded up)
//   nBytes2 - number of valid input bytes in d_b
//
// Launch layout: 1-D grid of 1-D blocks, one thread per output byte. The grid
// may contain more threads than outputs; surplus threads return immediately.
// When nBytes2 is odd, the last input byte has no partner and is carried
// over unchanged (the same as adding 0) instead of reading past the end
// of d_b.
__global__ void compression2(unsigned char *d_a, unsigned char *d_b, int nBytes, int nBytes2)
{
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    int first = 2 * idx;

    // Guard both the output range and the input range.
    if (idx >= nBytes || first >= nBytes2)
    {
        return;
    }

    int sum = d_b[first];
    if (first + 1 < nBytes2)
    {
        sum += d_b[first + 1];
    }
    d_a[idx] = (unsigned char)(sum % 256);
}

// Print a message and terminate the program if a CUDA runtime call failed.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Number of outputs produced when n inputs are combined in pairs: n / 2 rounded up.
static int pairCount(int n)
{
    return n / 2 + n % 2;
}

// Number of blocks of `threads` threads needed to cover `count` elements.
static int blocksFor(int count, int threads)
{
    return (count + threads - 1) / threads;
}

// Reads the file named by argv[1], XORs adjacent byte pairs on the GPU, then
// repeatedly adds adjacent results modulo 256 until a single byte remains,
// and prints the intermediate XOR result and the final byte.
//
// Returns 0 on success and -1 if the input cannot be used (missing argument,
// unreadable or empty file, allocation failure). A failing CUDA call
// terminates the program through checkCuda().
//
// The grids below are rounded up to whole blocks, so the kernels are given
// both element counts and must ignore threads whose index is past them.
int main(int argc, char **argv)
{
    //Declarations
    const int threadsPerBlock = 256;   // multiple of the warp size (32)
    int i, num_char;
    long file_size;
    FILE *input_file;                  //input file

    unsigned char *h_a;                //host pointer
    unsigned char *d_a, *d_b;          //device pointers
    unsigned char *src, *dst, *tmp;    //ping-pong views of d_a / d_b
    int nBytes, nBytes2;
    int srcCount, dstCount;

    if (argc < 2)
    {
        fprintf(stderr, "usage: %s <input file>\n", argv[0]);
        return -1;
    }

    //get the size of the file
    //binary mode so that every byte of the file is counted and read as-is
    input_file = fopen(argv[1], "rb");
    if (input_file == NULL) return -1; //check if it is available
    if (fseek(input_file, 0, SEEK_END) != 0 || (file_size = ftell(input_file)) < 0)
    {
        fprintf(stderr, "cannot determine the size of %s\n", argv[1]);
        fclose(input_file);
        return -1;
    }
    rewind(input_file);

    num_char = (int)file_size;         //number of char in the file
    if ((long)num_char != file_size)
    {
        fprintf(stderr, "%s is too large\n", argv[1]);
        fclose(input_file);
        return -1;
    }
    printf("The number of char is: %d\n", num_char);
    if (num_char == 0)
    {
        fprintf(stderr, "%s is empty, nothing to compress\n", argv[1]);
        fclose(input_file);
        return -1;
    }

    //setting the data in the host memory
    nBytes = num_char;
    h_a = (unsigned char *) malloc(nBytes);
    if (h_a == NULL)
    {
        fprintf(stderr, "out of host memory\n");
        fclose(input_file);
        return -1;
    }
    if (fread(h_a, 1, nBytes, input_file) != (size_t)nBytes)
    {
        fprintf(stderr, "cannot read %s\n", argv[1]);
        fclose(input_file);
        free(h_a);
        return -1;
    }
    fclose(input_file);

    //check the input
    printf("The data:\n");
    for (i = 0; i < num_char; i++) printf("%u", h_a[i]);
    printf("\n");
    //end check

    //the XOR result holds one byte per input pair
    nBytes2 = pairCount(nBytes);

    //now allocate the memory in the device
    //and copy the data from Host to Device
    checkCuda(cudaMalloc((void**)&d_a, nBytes), "cudaMalloc d_a");
    checkCuda(cudaMemcpy(d_a, h_a, nBytes, cudaMemcpyHostToDevice), "copy input to device");
    //now d_b where the result is going to be stored
    //it has smaller array = 1/2 of d_a
    checkCuda(cudaMalloc((void**)&d_b, nBytes2), "cudaMalloc d_b");

    //XOR stage: one thread per output byte
    compression<<<blocksFor(nBytes2, threadsPerBlock), threadsPerBlock>>>(d_a, d_b, nBytes, nBytes2);
    checkCuda(cudaGetLastError(), "compression launch");

    //copy results back (only the nBytes2 bytes that d_b actually holds);
    //this blocking copy also waits for the kernel to finish
    checkCuda(cudaMemcpy(h_a, d_b, nBytes2, cudaMemcpyDeviceToHost), "copy XOR result to host");
    //check the result
    printf("The result of XOR:\n");
    for (i = 0; i < nBytes2; i++)
        printf("%d: %u\n", i, h_a[i]);
    printf("\n");
    //end check

    //Addition stage: halve the data level by level until one byte is left.
    //The data stays on the device; the two buffers swap roles every level.
    //d_a (capacity nBytes) and d_b (capacity nBytes2) are both large enough
    //for every level because the element count only shrinks.
    src = d_b;
    dst = d_a;
    srcCount = nBytes2;
    while (srcCount > 1)
    {
        dstCount = pairCount(srcCount);
        //compression2 writes through its first pointer and reads from its second
        compression2<<<blocksFor(dstCount, threadsPerBlock), threadsPerBlock>>>(dst, src, dstCount, srcCount);
        checkCuda(cudaGetLastError(), "compression2 launch");

        tmp = src;
        src = dst;
        dst = tmp;
        srcCount = dstCount;
    }

    //src now points at the buffer holding the single remaining byte
    checkCuda(cudaMemcpy(h_a, src, 1, cudaMemcpyDeviceToHost), "copy final result to host");

    //check the result
    printf("The result of Addition:\n");
    printf("%u", h_a[0]);
    printf("\n");

    //free pointers
    free(h_a);
    checkCuda(cudaFree(d_a), "cudaFree d_a");
    checkCuda(cudaFree(d_b), "cudaFree d_b");
    return 0;
}