Basic example: doesn't work! help me please !

Hi everybody External Media ,

I encounter a big problem with the update of new values in a very very basic API.

Indeed, I try to simulate a basc circuit with a not operator. I realize that with a class “signal” which have two values:

one for the current cycle (t0)

another for the nex cycle (t1)

Between each cycle, the current state of the signal is updated owing to the function maj( ). It just do t0=t1

This precisely where I have my problem: the change seems to be ignored! => t0 gives its value! External Media

This is the code:

#include <stdio.h>

#include <stdlib.h>

#include <cuda.h>

#include <cuda_runtime.h>

class  signal

{

	bool _t0,_t1;

  public:

	__device__

	signal()

	{

	}

	__device__

	void init(bool valeur)

	{

	  _t0=valeur;

	  _t1=valeur;

	}

	__device__

	bool t1()

	{

	  return _t1;

	}

	__device__

	bool t0()

	{

	  return _t0;

	}

	

	__device__

	void w(bool val)

	{

	  _t1=val;

	}

	

	__device__

	void maj()

	{

	  _t0=_t1;

	threadfence();

	};

};

__global__

void kernel(int n_cycles,int t_cycle,int* retour)

{

	signal s;

	s.init(true);

	int i;

  for(i=0;i<n_cycles;i++)

  {

	retour[t_cycle*i]=i;

	retour[t_cycle*i+1]=s.t0();

	retour[t_cycle*i+2]=s.t1();

	retour[t_cycle*i+3]=~s.t0();

	s.w(~s.t0());

	threadfence();

	retour[t_cycle*i+4]=s.t1();

	s.maj();

	threadfence();

	

  }

}

//============================================================

host

	

int main()

{

  int t_num_cycle	=1;

  int t_cycle		=4+t_num_cycle;

  int n_cycles		=100000/t_cycle;

  int t_buffer		=t_cycle*n_cycles;

  int t_bytes_buffer	=t_buffer*sizeof(int);

FILE* pFile;

  pFile = fopen ("simu2.trace","w");

int retour[t_buffer];

  int* retour_device;

cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferL1);

	

  CUevent start, stop;

cudaEventCreate(&start);

  cudaEventCreate(&stop);

	

	

	

	cudaMalloc ( (void**) &retour_device, t_bytes_buffer);

	cudaEventRecord(start, 0);

	kernel<<<1,100>>>(n_cycles,t_cycle,retour_device);

	

	cudaThreadSynchronize();

	cudaMemcpy(retour,retour_device,t_bytes_buffer,cudaMemcpyDeviceToHo

st);

	cudaEventRecord(stop, 0);

  int j;

	for(j=0;j<n_cycles;j++)

	{

	  fprintf (pFile, "cycle %5i   t0:%i	t1:%i	~t0:%i	t1=~t0:%i \n",retour[j*t_cycle],retour[j*t_cycle+1],retour[j*t_cycle+2],retour[j*t_cycle+3],retour[j*t_cycle+4]);

 }

   fclose (pFile);

	

	

cudaEventSynchronize(stop);

float elapsedTime;

cudaEventElapsedTime(&elapsedTime, start, stop);

printf("===> %f  ms",elapsedTime);

  printf("\nnombre de cycles: %i \ntaille d'un cycle: %i \ntaille du buffer en int: %i \ntaille du buffer en octets: %i \n" ,n_cycles,t_cycle,t_buffer,t_bytes_buffer);

	return 0;

}

and the trace:

So I do not understand anything: do you any idea please? :)