Bitmap editing

Hi, I need to write a program that make some changes in Bitmap. I wrote something like this, but it won’t work. It should reverse green colour, but new bitmap (fsz) is exacly the same as original.

include <cstdlib>
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
//#include "bitmap_image.hpp"
using namespace std;


#define THREADS_PER_BLOCK 3

FILE*forg=fopen("C:\Users\szparson\Documents\Visual Studio 2010\Projects\bitmapa\bitmapa\input.bmp", "rb");		//Uchwyt do orginalnego pliku
FILE*fsz=fopen("C:\Users\szparson\Documents\Visual Studio 2010\Projects\bitmapa\bitmapa\output.bmp", "wb");			//Uchwyt do nowego pliku

struct FileHeader {
      short bfType;
      int bfSize;
      short bfReserved1;
      short bfReserved2;
      short bfOffBits;
    };
FileHeader File;
 
struct PictureHeader {
      int biSize;
      int biWidth;
      int biHeight;
      short biPlanes;
      short biBitCount;
      int biCompression;
      int biSizeImage;
      int biXPelsPerMeter;
      int biYPelsPerMeter;
      int biClrUsed;
      int biClrImportant;
     };
PictureHeader Picture;
     

	
void header()
{
		
	 fread(&File.bfType,sizeof(File.bfType),1,forg);
	 cout<<"Typ:"<< hex << File.bfType <<endl;

	 fread(&File.bfSize,sizeof(File.bfSize),1,forg);
     cout << "Rozmiar pliku: " << dec << File.bfSize << " bajtow" << endl;
 
     fread(&File.bfReserved1,sizeof(File.bfReserved1),1,forg);
     cout << "Zarezerwowane1: " << File.bfReserved1 << endl; 
 
     fread(&File.bfReserved2,sizeof(File.bfReserved2),1,forg);
     cout << "Zarezerwowane2: " << File.bfReserved2 << endl;
 
     fread(&File.bfOffBits,sizeof(File.bfOffBits),1,forg);
     cout << "Pozycja danych obrazkowych: " << File.bfOffBits << endl;
	  
     printf("\n"); 
 
     fseek(forg,14,SEEK_SET);
     fread(&Picture.biSize,sizeof(Picture.biSize),1,forg);
     cout <<"Wielkosc naglowka informacyjnego: " << Picture.biSize << endl;   
 
     fread(&Picture.biWidth,sizeof(Picture.biWidth),1,forg);
     cout <<"Szerokosc: " << Picture.biWidth << " pikseli "<<endl; 
 
     fread(&Picture.biHeight,sizeof(Picture.biHeight),1,forg);
     cout << "Wysokosc: " << Picture.biHeight << " pikseli "<<  endl;    
 
     fread(&Picture.biPlanes,sizeof(Picture.biPlanes),1,forg);
     cout << "Liczba platow (zwykle 0): " << Picture.biPlanes << endl;  
 
     fread(&Picture.biBitCount,sizeof(Picture.biBitCount),1,forg);
     cout << "Liczba bitow na piksel:  (1, 4, 8, or 24)" << Picture.biBitCount << endl;    
 
     fread(&Picture.biCompression,sizeof(Picture.biCompression),1,forg);
     cout <<"Kompresja: " << Picture.biCompression << "(0=none, 1=RLE-8, 2=RLE-4)" << endl; 
 
     fread(&Picture.biSizeImage,sizeof(Picture.biSizeImage),1,forg);
      cout <<"Rozmiar samego rysunku: " << Picture.biSizeImage << endl;  
 
     fread(&Picture.biXPelsPerMeter,sizeof(Picture.biXPelsPerMeter),1,forg);
     cout <<"Rozdzielczosc pozioma: " << Picture.biXPelsPerMeter << endl;      
 
     fread(&Picture.biYPelsPerMeter,sizeof(Picture.biYPelsPerMeter),1,forg);
     cout <<"Rozdzielczosc pionowa: " << Picture.biYPelsPerMeter << endl; 
 
     fread(&Picture.biClrUsed,sizeof(Picture.biClrUsed),1,forg);
     cout <<"Liczba kolorow w palecie: "<< Picture.biClrUsed << endl; 
 
     fread(&Picture.biClrImportant,sizeof(Picture.biClrImportant),1,forg);
     cout <<"Wazne kolory w palecie: " << Picture.biClrImportant  << endl;
}



char z;


__global__ void ReadImage(int *B, int *G, int *R)
{
	int i = blockIdx.x + blockDim.x + threadIdx.x;
	B[i]= B[i];
	G[i]= 255-G[i];
	R[i]= R[i];
}

int main()
{
	header();

	int *B, *G, *R;
	int *d_B, *d_G, *d_R;


	B=new int[File.bfSize];
	G=new int[File.bfSize];
	R=new int[File.bfSize];
	

	cudaMalloc(&d_B, File.bfSize);	
	cudaMalloc(&d_G, File.bfSize);
	cudaMalloc(&d_R, File.bfSize);
	
	fseek(forg,0,SEEK_SET);					
    for(int i=0; i<File.bfOffBits; i++)
	{
		z=fgetc(forg);
		fprintf(fsz, "%c",z);			//Header for the new Bitmap
	}


	for ( int i = File.bfOffBits; i < File.bfSize; i++) 
	{
		B[i]=fgetc(forg);
		G[i]=fgetc(forg);
		R[i]=fgetc(forg);
    }
	
	
	cudaMemcpy(d_B, B, File.bfSize*sizeof(int), cudaMemcpyHostToDevice); 
	cudaMemcpy(d_G, G, File.bfSize*sizeof(int), cudaMemcpyHostToDevice); 	cudaMemcpy(d_R, R, File.bfSize*sizeof(int), cudaMemcpyHostToDevice); 
	
	
	 ReadImage<<< Picture.biWidth*Picture.biHeight, THREADS_PER_BLOCK>>>(d_B, d_G, d_R);

	 	
	cudaMemcpy(B, d_B, File.bfSize*sizeof(int), cudaMemcpyDeviceToHost);
	cudaMemcpy(G, d_G, File.bfSize*sizeof(int), cudaMemcpyDeviceToHost);
	cudaMemcpy(R, d_R, File.bfSize*sizeof(int), cudaMemcpyDeviceToHost);

	fseek(fsz,54,SEEK_SET);	
	 for (int i = File.bfOffBits; i < File.bfSize; i++) 
	 {
			fprintf(fsz, "%c", (int)(B[i]));
			fprintf(fsz, "%c", (int)(G[i]));
			fprintf(fsz, "%c", (int)(R[i]));
	}

	 
	 delete[] B;
	 delete[] G;
	 delete[] R;
	 cudaFree(d_B); cudaFree(d_G); cudaFree(d_R);


	 system("PAUSE");
}

First part of the code is just reading a header from bitmap. So you can skip right to the
global void ReadImage(int *B, int *G, int *R)

int i = blockIdx.x + blockDim.x + threadIdx.x;

should be

int i = blockIdx.x * blockDim.x + threadIdx.x;

(changed one + for a *)

Although the functionality of the code should now be correct, you would like to use more than 3 threads per block to get any performance out of your GPU, e.g. 256 would be a good number to start with.