Issue with GPU_Anim

Hello Im using 9800GT on Windows7 OS. The following code says, I have a invalid kernel image at line 77 of gpu_anim.h. gpu_anim.h is a header file which displays image using data directly from gpu. This code was provided from NVIDIA and I have never touched anything on such headerfile. Does anyone recognize the problem? Thank you.

//	Test program for animated display of oct .unp data files

//	Has gpu computation and direct rendering of results via OpenGL

//	compile with: nvcc -Llib -m32 src\octprocessor.cu -o src\octprocessor.exe

#define SWAP(a,b) tempr=(a);(a)=(b);(b)=tempr

#include "../common/book.h"

#include "../common/gpu_anim.h"

#include "device_fourier.h"

#define XDIM 1000

#define YDIM 1024

#define FRAMES 50

// Parameters from Coulter Cornea SDOCT system

#define START_WAVELENGTH 7.5727E+02

#define WAVELENGTH_SPACING 8.6911E-02

#define SECOND_ORDER_CORRECTION -5.7555E-06

#define THIRD_ORDER_CORRECTION 2.2916E-9

#define FOURTH_ORDER_CORRECTION -5.6882E-13

#define CAPTURE_LINE_OFFSET 450

#define RESAMPLE_POINTS 1024

#define D1 3.3e-5

unsigned short a[XDIM*YDIM*FRAMES]; // raw input data from file or device

int b[XDIM*YDIM*FRAMES];            // DC subtracted, integer input data sent to gpu (real)

int c[2*XDIM*YDIM*FRAMES];          // processed data returned from gpu (complex)

int sum_spectrum[YDIM];

float resamp[YDIM*2];

float k_resampledspacing;

int *dev_b, *dev_c, *dev_sum_spectrum;

float *dev_resamp;

char inputfile[] = "joecornearadialvolume1.unp";

bool dcsubtract = true;

bool fft = true;

bool resample = true;

void calc_resample_coefficients() {

	double lambda[YDIM];

	double k_sampled[YDIM];

	double k_resampled[YDIM];

	double lambda0 = START_WAVELENGTH;

	double lambda1 = WAVELENGTH_SPACING;

	double lambda2 = SECOND_ORDER_CORRECTION;

	double lambda3 = THIRD_ORDER_CORRECTION;

	double lambda4 = FOURTH_ORDER_CORRECTION;

	double kmax;

	double kmin;

	for (int y=0; y<YDIM; y++) {

		lambda[y] = lambda0 + lambda1*(y+CAPTURE_LINE_OFFSET) +

				lambda2*pow(double(y+CAPTURE_LINE_OFFSET), 2) +

				lambda3*pow(double(y+CAPTURE_LINE_OFFSET), 3) +

				lambda4*pow(double(y+CAPTURE_LINE_OFFSET), 4);

		k_sampled[y] = 1000000./lambda[y];

		}

	kmin = 1000000./lambda[0];		//wavenumber in mm^-1

	kmax = 1000000./lambda[YDIM-1];

	k_resampledspacing = (kmin - kmax)/(YDIM-1);

	for (int y=0; y<YDIM; y++) {

		k_resampled[y] = kmin + (double(y)/(YDIM - 1))*(kmax - kmin);

//		if (y%1 == 0) printf ("lambda[%d]=%f, k_sampled[%d]=%f, k_resampled[%d]=%f\n",y,lambda[y],y,k_sampled[y],y,k_resampled[y]);

	}

	for (int y=0; y<YDIM; y++) {

		for (int i=0; i<YDIM; i++) {

			if (k_resampled[y] >= k_sampled[i] && k_resampled[y+1] < k_sampled[i]) {

				resamp[y+0*YDIM] = double(i);

				resamp[y+1*YDIM] = k_resampled[y] - k_sampled[i];

			break;

			}

		}

	}

}

void readData( ) {

	FILE *fp;

	long size;

	fp=fopen(inputfile, "rb");

	if (fp==NULL) perror ("Error opening file");

	else {

		fseek (fp, 0, SEEK_END);

		size=ftell (fp);

		printf ("Size of inputfile: %ld bytes.\n",size);

		rewind (fp);

	}

	fread(a, 2, XDIM*YDIM*FRAMES, fp);

	fclose(fp);

	

	for (int f=0; f<FRAMES; f++) {

	  for (int x=0; x<XDIM; x++) {

      for (int y=0; y<YDIM; y++) {

        b[y+x*YDIM+f*YDIM*XDIM] = (int) a[y+x*YDIM+f*YDIM*XDIM];

      }

    }

  }

}

void calc_sum_spectrum(int f) {     //should move this onto gpu but would require thread synchronization

if (dcsubtract == true) {		// Do DC subtract

		for (int y=0; y<YDIM; y++) { sum_spectrum[y]=0;}

    for (int x=0; x<XDIM; x++) {

      for (int y=0; y<YDIM; y++) {

        sum_spectrum[y] += b[y+x*YDIM+f*YDIM*XDIM];

      }

		}

	}

}

	

__global__ void kernel( int *b, int *c, int *sum_spectrum, float *resamp, bool dcsubtract, bool resample, bool fft, float k_resampledspacing, uchar4 *ptr ){

	int x = blockIdx.x*XDIM/2 + threadIdx.x;                       // XDIM>>1 = XDIM/2

	int i, offset;

  float data[2*YDIM];

  float phase, temp, datatemp;

	if (fft == true) { 		                                                // Do FFT processing

    for (int y=0; y<YDIM; y++) {

      if (dcsubtract == true) { b[y+x*YDIM] -= sum_spectrum[y]/XDIM;}   // Do DC subtract                            

			if (resample == true) {                                           // Do resampling

        i = int(resamp[y+0*YDIM]); 

        datatemp = b[x*YDIM+i] - resamp[y+1*YDIM]*(b[x*YDIM+i+1]-b[x*YDIM+i])/k_resampledspacing;

			}

      else {

        datatemp = b[x*YDIM+y];

      }

      phase = D1*(y-YDIM/2)*(y-YDIM/2);  // second order dispersion correction

			data[2*y] = datatemp * __cosf(phase);

			data[2*y+1] = datatemp * __sinf(phase);

    }

    four1(data, YDIM, 1);

    for (int y=0; y<YDIM/2; y++) {

      c[x*2*YDIM+2*y] = data[2*y];

      c[x*2*YDIM+2*y+1] = data[2*y+1];

      temp = __log10f(data[2*y]*data[2*y] + data[2*y+1]*data[2*y+1])/12.;

      offset = x + (YDIM/2 - y) * XDIM;

      ptr[offset].x = 255 * temp - 100;

      ptr[offset].y = 255 * temp - 100;

      ptr[offset].z = 255 * temp - 100;

      ptr[offset].w = 255;

    }

  }

	else {				// Don't do FFT processing

		for (int y=0; y<YDIM; y++) {

      if (dcsubtract == true) {b[y+x*YDIM] -= sum_spectrum[y]/YDIM;}     // Do DC subtract

      int offset = x + y * XDIM;

			ptr[offset].x = 255 * (float)b[x*YDIM + y]/(65535);

			ptr[offset].y = 255 * (float)b[x*YDIM + y]/(65535);

			ptr[offset].z = 255 * (float)b[x*YDIM + y]/(65535);

			ptr[offset].w = 255;

		}

	}

}

double get_framerate ( ) {

	static clock_t newticks, oldticks;

	double framerate;

	newticks = clock( );

	framerate = CLOCKS_PER_SEC/( newticks - oldticks );

	oldticks = newticks;

	return framerate;

}

void generate_frame( uchar4 *pixels, void*, int ticks) {

// Note ticks variable generated by cpu_anim.h not used

//	dim3 grid(XDIM);

	static int frame;

	frame++;

	if ( frame >= FRAMES ) frame = 1;

	

  calc_sum_spectrum(frame);

	

	HANDLE_ERROR( cudaMemcpy( dev_b, b + frame*XDIM*YDIM, XDIM*YDIM*sizeof(int), cudaMemcpyHostToDevice ) );

	HANDLE_ERROR( cudaMemcpy( dev_sum_spectrum, sum_spectrum, YDIM*sizeof(int), cudaMemcpyHostToDevice ) );

	kernel<<<2,XDIM/2>>>( dev_b, dev_c, dev_sum_spectrum, dev_resamp, dcsubtract, resample, fft, k_resampledspacing, pixels );

	

	HANDLE_ERROR( cudaMemcpy( c + frame*XDIM*2*YDIM, dev_b, XDIM*YDIM*sizeof(int), cudaMemcpyDeviceToHost ) );

	printf ("Frame: %3d; Frames/sec: %4.1f\r", frame, get_framerate ( ) );

}

int main(int argc, char* argv[]) {

	GPUAnimBitmap bitmap( XDIM, YDIM/2, NULL );

	printf("Usage: no arguments for default values or add -f inputfile for input file or\n");

	printf("-nodcsubtract, -nofft, or -noresample to disable any of these functions.\n");

	printf("argc = %d, argv[0] = %s, argv[1] = %s\n",argc, argv[0], argv[1]);

for (int i = 1; i < argc; i++) { //Iterate over argv[] to get the parameters stored inside.

		if (strcmp(argv[i],"-f") == 0) {   // Next argument is filename to process

			strcpy(inputfile,argv[i + 1]); }

		if (strcmp(argv[i],"-nodcsubtract") == 0) {

			dcsubtract = false;

			printf("No DC subtraction.\n"); } 

		else if (strcmp(argv[i],"-nofft") == 0) {

			fft = false;

			printf("No fft.\n"); } 

		else if (strcmp(argv[i],"-noresample") == 0) {

			resample = false;

			printf("No resampling.\n"); } 

  }

	calc_resample_coefficients();

	HANDLE_ERROR( cudaMalloc( (void**)&dev_b, XDIM*YDIM*sizeof(int) ) );	

	HANDLE_ERROR( cudaMalloc( (void**)&dev_c, 2*XDIM*YDIM*sizeof(int) ) );	

  HANDLE_ERROR( cudaMalloc( (void**)&dev_sum_spectrum, YDIM*sizeof(int) ) );	

  HANDLE_ERROR( cudaMalloc( (void**)&dev_resamp, 2*YDIM*sizeof(float) ) );	

HANDLE_ERROR( cudaMemcpy( 	dev_resamp, resamp, 2*YDIM*sizeof(float), cudaMemcpyHostToDevice ) );

readData( );

	bitmap.anim_and_exit( (void (*)(uchar4*, void*, int))generate_frame, NULL );

}