Global device pointer access using cudaMemcpyToSymbol and cudaMemcpyFromSymbol

I am trying to use global device pointer variables that can be accessed from any device and global subroutine calls. The following example works fine when creating the device arrays d_a, d_b and d_c and operating on them by calling the sum_device() routine. The problem arises when I try to access these variables globally by calling alter_data without parameters. Here is the code:

// Kernel: for every index i below SIZE, stores d_a[i] + d_b[i] into d_c[i],
// then overwrites d_a[i] with i and d_b[i] with i + 1.
// Each thread starts at its flat global index and advances by the total number
// of launched threads (blockDim.x * gridDim.x) until it reaches SIZE.
__global__ void sum_device(double *d_a, double *d_b, double *d_c)
{
  const int stride = blockDim.x * gridDim.x;

  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < SIZE; i += stride)
  {
    // The sum is taken before the inputs are overwritten below.
    d_c[i] = d_a[i] + d_b[i];
    d_a[i] = i;
    d_b[i] = i + 1;
  }
}

// Kernel taking no arguments: writes through the file-scope pointers
// d_Sc, d_Sa and d_Sb, setting every element below SIZE to 2.0, 0.0 and 1.0
// respectively. Each thread starts at its flat global index and advances by
// the total number of launched threads.
__global__ void alter_data(void)
{
  const int stride = blockDim.x * gridDim.x;

  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < SIZE; i += stride)
  {
    d_Sc[i] = 2.0;
    d_Sa[i] = 0.0;
    d_Sb[i] = 1.0;
  }
}

// Original driver from the question (the version reported as not working).
// It runs sum_device on d_a/d_b/d_c, then tries to publish those pointers
// through the symbols d_Sa/d_Sb/d_Sc so that alter_data can be launched with
// no arguments, and finally reads the results back through the same symbols.
int main( int argc, char **argv)
{ double *a, *b, *c;

// host allocation memory

  a	= (double *)malloc(sizeof(double)*SIZE);
  b	= (double *)malloc(sizeof(double)*SIZE);
  c	= (double *)malloc(sizeof(double)*SIZE);

  // NOTE(review): (a || b || c) is a boolean, so this condition is true only
  // when all three pointers are null; a single failed malloc is not detected.
  if ( (a || b || c) == NULL)
  { cout<<"Error: Not enough memory on host."<<endl;
    // NOTE(review): status 0 is returned even though this is the error path.
    exit(0);
  }
  // init_host_data, print_vectors, copy_host_to_device and copy_device_to_host
  // are helpers not shown in this post; presumably copy_host_to_device also
  // sets up d_a/d_b/d_c on the device -- TODO confirm.
  init_host_data(a, b, c);
  cout<<"Initial Values before calls:"<<endl;
  print_vectors(a, b, c);
  copy_host_to_device(a, b);
  // One block of SIZE threads.
  sum_device<<<1,SIZE>>>(d_a, d_b, d_c);
  copy_device_to_host(a, b, c);
  cout<<endl<<"Final Values after sum call:"<<endl;
  print_vectors(a, b, c);
  // NOTE(review): d_Sa/d_Sb/d_Sc are declared further down in this thread as
  // single `__device__ double *` pointers, and &d_a is the address of one
  // host-side pointer variable, yet the size passed is that of a whole
  // SIZE-element array of doubles. The corrected listing at the end of the
  // thread passes sizeof(d_a) here instead.
  checkCudaErrors(cudaMemcpyToSymbol(d_Sa, &d_a, sizeof(double)*SIZE));
  checkCudaErrors(cudaMemcpyToSymbol(d_Sb, &d_b, sizeof(double)*SIZE));
  checkCudaErrors(cudaMemcpyToSymbol(d_Sc, &d_c, sizeof(double)*SIZE));
  alter_data<<<1,SIZE>>>();
  // NOTE(review): same size mismatch on the way back: sizeof(double)*SIZE bytes
  // are requested from a symbol that holds one pointer. The corrected listing
  // replaces these three calls with copy_device_to_host(a, b, c).
  checkCudaErrors(cudaMemcpyFromSymbol(a, d_Sa, sizeof(double)*SIZE));
  checkCudaErrors(cudaMemcpyFromSymbol(b, d_Sb, sizeof(double)*SIZE));
  checkCudaErrors(cudaMemcpyFromSymbol(c, d_Sc, sizeof(double)*SIZE));
  cout<<endl<<"Final Values after modifying vars:"<<endl;
  print_vectors(a, b, c);
  return 0;
}

For some reason, either the changes made to the variables by the alter_data<<<1,SIZE>>>() routine or the copy back to the host are not being done properly (L36-L43)…

The code compiles fine.

Any help is appreciated.

PS: when I hover the mouse over the cudaMemcpyToSymbol and cudaMemcpyFromSymbol functions, VS shows the usage of the older, deprecated forms of these calls…

My system: Windows 8.1, VS 2010 Express and CUDA 5.5.

Are you trying to write into a constant variable within a kernel?
Constant variables are read-only once the kernel is executing.
Kernels without parameters don’t make sense to me.

I am so sorry… I forgot to mention how the variables are declared. They are declared as:

// File-scope pointer variables that live on the device (one pointer each);
// the kernels index through them.
__device__ double *d_Sa;
__device__ double *d_Sb;
__device__ double *d_Sc;

// Host-side pointer variables; these are what main() passes to the kernels
// and to cudaMemcpyToSymbol.
double *d_a;
double *d_b;
double *d_c;

I believe that the global variables d_Sa, d_Sb and d_Sc are all read-write variables or am I wrong?

The problem I am trying to solve is how to access global read/write pointers to double on the device without passing them as parameters to the device functions. I have many global variables in an equivalent host code that are accessed by many subroutines, and I would just like to do the same on the GPU, but I can't figure out how to do it…

Here is what I have so far to make it run:

// Device function taking no arguments: writes through the file-scope pointers
// d_Sc, d_Sa and d_Sb, setting every element below SIZE to 5.0, 3.0 and 4.0
// respectively. The calling thread starts at its flat global index and
// advances by the total number of launched threads.
__device__ void alter_data_once_again(void)
{
  const int stride = blockDim.x * gridDim.x;

  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < SIZE; i += stride)
  {
    d_Sc[i] = 5.0;
    d_Sa[i] = 3.0;
    d_Sb[i] = 4.0;
  }
}


// Kernel: stores its three pointer arguments into the file-scope pointers
// d_Sa/d_Sb/d_Sc, fills every element below SIZE with 0.0 / 1.0 / 2.0 through
// them, and then calls alter_data_once_again(), which uses the same file-scope
// pointers without receiving any arguments.
__global__ void alter_data(double *d_a, double *d_b, double *d_c)
{
  // Every thread performs these three stores, each writing the same value.
  d_Sa = d_a;
  d_Sb = d_b;
  d_Sc = d_c;

  const int stride = blockDim.x * gridDim.x;

  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < SIZE; i += stride)
  {
    d_Sc[i] = 2.0;
    d_Sa[i] = 0.0;
    d_Sb[i] = 1.0;
  }

  alter_data_once_again();
}

// Driver for the pointer-passing variant.
// Flow: allocate three host buffers of SIZE doubles, run sum_device on
// d_a/d_b/d_c, print the result, then launch alter_data with the same three
// pointers (the kernel itself stores them into d_Sa/d_Sb/d_Sc) and print again.
// Returns 0 on success; exits with EXIT_FAILURE if a host allocation or a
// kernel launch fails.
int main( int argc, char **argv)
{
  double *a, *b, *c;

  // Host buffers, SIZE doubles each.
  a = (double *)malloc(sizeof(double)*SIZE);
  b = (double *)malloc(sizeof(double)*SIZE);
  c = (double *)malloc(sizeof(double)*SIZE);

  // Fail if ANY allocation failed. The earlier test, (a || b || c) == NULL,
  // compared a boolean with NULL and fired only when all three were null.
  if (a == NULL || b == NULL || c == NULL)
  { cout<<"Error: Not enough memory on host."<<endl;
    free(a);   // free(NULL) is a no-op, so this is safe for whichever failed
    free(b);
    free(c);
    exit(EXIT_FAILURE);   // report the failure instead of exiting with 0
  }

  // init_host_data, print_vectors, copy_host_to_device and copy_device_to_host
  // are helpers defined elsewhere in the file.
  init_host_data(a, b, c);
  cout<<"Initial Values before calls:"<<endl;
  print_vectors(a, b, c);
  copy_host_to_device(a, b);

  // One block of SIZE threads. A launch does not return a status, so a bad
  // configuration has to be fetched with cudaGetLastError().
  sum_device<<<1,SIZE>>>(d_a, d_b, d_c);
  cudaError_t status = cudaGetLastError();
  if (status != cudaSuccess)
  { cout<<"Error: sum_device launch failed: "<<cudaGetErrorString(status)<<endl;
    exit(EXIT_FAILURE);
  }
  copy_device_to_host(a, b, c);
  cout<<endl<<"Final Values after sum call:"<<endl;
  print_vectors(a, b, c);

  alter_data<<<1,SIZE>>>(d_a, d_b, d_c);
  status = cudaGetLastError();
  if (status != cudaSuccess)
  { cout<<"Error: alter_data launch failed: "<<cudaGetErrorString(status)<<endl;
    exit(EXIT_FAILURE);
  }
  copy_device_to_host(a, b, c);
  cout<<endl<<"Final Values after modifying vars:"<<endl;
  print_vectors(a, b, c);

  // Release the host buffers. d_a/d_b/d_c are not allocated in this function,
  // so they are deliberately left to whoever owns them.
  free(a);
  free(b);
  free(c);
  return 0;
}

That solved it, but I still had to pass the variables to a function in order to copy their addresses… Am I not able to make it work using cudaMemcpyToSymbol? Does this mean that I will have to call a global function, passing all my desired global variables, and then copy all their addresses?

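Here is the code, working fine now. No parameters are passed to the alter_data() kernel. The change is that cudaMemcpyToSymbol now copies only the pointer value (sizeof(d_a) bytes) into each __device__ pointer, instead of sizeof(double)*SIZE bytes, and the results are read back with copy_device_to_host() rather than cudaMemcpyFromSymbol: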

// File-scope pointer variables that live on the device (one pointer each);
// alter_data indexes through them.
__device__ double *d_Sa;
__device__ double *d_Sb;
__device__ double *d_Sc;
// Host-side pointer variables; main() passes them to sum_device and copies
// their values into the symbols above.
double *d_a;
double *d_b;
double *d_c;

// Kernel taking no arguments: writes through the file-scope pointers
// d_Sa, d_Sb and d_Sc, setting every element below SIZE to 0.0, 1.0 and 2.0
// respectively. Each thread starts at its flat global index and advances by
// the total number of launched threads.
__global__ void alter_data(void)
{
  const int stride = blockDim.x * gridDim.x;

  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < SIZE; i += stride)
  {
    d_Sa[i] = 0.0;
    d_Sb[i] = 1.0;
    d_Sc[i] = 2.0;
  }
}


// Driver for the parameterless alter_data kernel.
// Flow: allocate three host buffers of SIZE doubles, run sum_device on
// d_a/d_b/d_c, copy those three pointer values into the __device__ pointers
// d_Sa/d_Sb/d_Sc, launch alter_data with no arguments, then copy the arrays
// back and print them.
// Returns 0 on success; exits with EXIT_FAILURE if a host allocation fails.
// CUDA failures are routed through checkCudaErrors.
int main( int argc, char **argv)
{
  double *a, *b, *c;

  // Host buffers, SIZE doubles each.
  a = (double *)malloc(sizeof(double)*SIZE);
  b = (double *)malloc(sizeof(double)*SIZE);
  c = (double *)malloc(sizeof(double)*SIZE);

  // Fail if ANY allocation failed. The earlier test, (a || b || c) == NULL,
  // compared a boolean with NULL and fired only when all three were null.
  if (a == NULL || b == NULL || c == NULL)
  { cout<<"Error: Not enough memory on host."<<endl;
    free(a);   // free(NULL) is a no-op, so this is safe for whichever failed
    free(b);
    free(c);
    exit(EXIT_FAILURE);   // report the failure instead of exiting with 0
  }

  // init_host_data, print_vectors, copy_host_to_device and copy_device_to_host
  // are helpers defined elsewhere in the file.
  init_host_data(a, b, c);
  cout<<"Initial Values before calls:"<<endl;
  print_vectors(a, b, c);
  copy_host_to_device(a, b);

  // One block of SIZE threads. A launch does not return a status, so a bad
  // configuration has to be fetched with cudaGetLastError().
  sum_device<<<1,SIZE>>>(d_a, d_b, d_c);
  checkCudaErrors(cudaGetLastError());
  copy_device_to_host(a, b, c);
  cout<<endl<<"Final Values after sum call:"<<endl;
  print_vectors(a, b, c);

  // Each symbol is a single pointer, so exactly one pointer's worth of bytes
  // is copied: the value stored in the host variable d_a (and d_b, d_c).
  checkCudaErrors(cudaMemcpyToSymbol(d_Sa, &d_a, sizeof(d_a)));
  checkCudaErrors(cudaMemcpyToSymbol(d_Sb, &d_b, sizeof(d_b)));
  checkCudaErrors(cudaMemcpyToSymbol(d_Sc, &d_c, sizeof(d_c)));

  alter_data<<<1,SIZE>>>();
  checkCudaErrors(cudaGetLastError());
  // d_Sa/d_Sb/d_Sc now hold the same addresses as d_a/d_b/d_c, so the ordinary
  // array copy-back picks up what alter_data wrote.
  copy_device_to_host(a, b, c);
  cout<<endl<<"Final Values after modifying vars:"<<endl;
  print_vectors(a, b, c);

  // Release the host buffers. d_a/d_b/d_c are not allocated in this function,
  // so they are deliberately left to whoever owns them.
  free(a);
  free(b);
  free(c);
  return 0;
}