cudaMalloc'ed pointer should have global scope, but doesn't?

Hi All,

I’m trying to incorporate CUDA into an existing C project. I have a problem with the cudaMalloc’ed pointer getting lost. Here’s a trimmed-down version of the problem:

Makefile

##############################################################################
# Build description for the `pcl` test program: two objects, both compiled
# through nvcc, linked against the static CUDA runtime.

# Library to link and the directory it is searched for in.
LIBS = -lcudart_static
LFLAGS = -L /Developer/NVIDIA/CUDA-6.5/lib
# Include path for the CUDA headers (only used for ParticleDbg.c below).
CUDAINC = -I /Developer/NVIDIA/CUDA-6.5/include
# Host-compiler options, handed over via nvcc's -Xcompiler.
# Only the DbgMain.o rule uses this variable.
CFLAGS = -Xcompiler="-g, -O0, -Wall, -Wshadow, -Wno-deprecated, -arch x86_64" 
PCLOBJ = DbgMain.o ParticleDbg.o

# Default target: link the executable from the object files.
pcl : $(PCLOBJ)
	nvcc $(LFLAGS) -o pcl $(PCLOBJ) $(LIBS)

# Plain C translation unit.
DbgMain.o	: DbgMain.c ParticleDbg.h
	nvcc $(CFLAGS) -c DbgMain.c -o DbgMain.o
# `-x cu` makes nvcc treat the .c file as CUDA source.
# NOTE(review): $(CFLAGS) is not on this command line, so -Wall/-Wshadow
# are not requested for ParticleDbg.c.
ParticleDbg.o    : ParticleDbg.c ParticleDbg.h
	nvcc -x cu -g $(CUDAINC) -c ParticleDbg.c -o ParticleDbg.o
# Removes the object files; the `pcl` executable is left in place.
clean:
	rm *.o
##############################################################################

Header

/****************************************************************************/
/* Shared header for the particle debug example: the particle record, the
   program-wide state variables, and the two entry points implemented in the
   CUDA source file.  Wrapped in extern "C" when compiled as C++. */
#ifndef __PCL_H__
#define __PCL_H__

#ifdef __cplusplus
   extern "C" {
#endif

/* One particle: three 3-component double vectors (presumably position,
   velocity and force -- inferred from the field names). */
struct ParticleType {
   double Pos[3];
   double Vel[3];
   double Frc[3];
};

/* NOTE(review): none of the following carries `extern`, so this header
   defines the variables instead of only declaring them, and it is included
   by both DbgMain.c and ParticleDbg.c. */

/* Particle count; assigned in InitParticleSystem and used to size the
   arrays and the kernel launch. */
long Npcl;
/* Not referenced by the code shown in this example. */
double PclMass, PclRadius;
long PclGeomTag;
/* Host-side particle array (allocated with malloc in InitParticleSystem). */
struct ParticleType *Pcl;
/* Pointer used as the device-side array in the cudaMemcpy calls and the
   kernel launch in ParticleSimStep. */
struct ParticleType *CudaPcl;

/* Allocates the particle storage; `center` is a 3-component vector. */
void InitParticleSystem(double center[3]);
/* Copies the particles to the device, launches the kernel, copies back. */
void ParticleSimStep(double dt);

#ifdef __cplusplus
   } /* extern "C" */
#endif

#endif /* __PCL_H__ */
/****************************************************************************/

CUDA Functions

/*********************************************************************/
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include "ParticleDbg.h"

#ifdef __cplusplus
   extern "C" {
#endif

size_t SizeOfPcl;

/*********************************************************************/
/* Selects CUDA device 0, sets Npcl to 100, allocates the particle array on
   the host (Pcl, via malloc) and on the device (via cudaMalloc), then prints
   the numeric value of cudaSuccess, the cudaMalloc status and the pointer
   cudaMalloc produced.  The `center` argument is not used. */
void InitParticleSystem(double center[3])
{
      /* NOTE(review): this declares a LOCAL CudaPcl that shadows the
         file-scope CudaPcl from ParticleDbg.h.  Every CudaPcl below in this
         function is the local one, so the global is never assigned here and
         the value goes out of scope on return.  P is never used. */
      struct ParticleType *P,*CudaPcl;
      cudaError_t err = cudaSuccess;

      /* Return value is not checked. */
      cudaSetDevice(0);

      Npcl = 100;
      SizeOfPcl = Npcl*sizeof(struct ParticleType);
      /* Result of malloc is not checked. */
      Pcl = (struct ParticleType *) malloc(SizeOfPcl);
      /* &CudaPcl is the address of the local pointer declared above. */
      err = cudaMalloc((void **)&CudaPcl,SizeOfPcl);
      printf("cudaSuccess = %i\n",(int) cudaSuccess);
      printf("err = %i\n",(int) err);
      /* Prints the local pointer, not the global one. */
      printf("In Init: CudaPcl = %p\n",CudaPcl);
}
/*********************************************************************/
/* Kernel skeleton for the per-particle equations of motion.
 *
 * Launch layout: 1-D grid of 1-D blocks, one thread per particle; threads
 * whose index is >= Npcl do nothing.  No shared memory is used.
 *
 *   CP    array of particles; indexed by the global thread index
 *   Npcl  number of entries in CP (this parameter hides the host global of
 *         the same name inside the kernel)
 */
__global__ void CudaParticleEOM(struct ParticleType *CP,long Npcl)
{
      long long Ip;
      struct ParticleType *P;

      /* blockIdx.x and blockDim.x are unsigned int, so their product would
         be evaluated in 32 bits and could wrap before being widened.
         Convert first, then multiply. */
      Ip = (long long) blockIdx.x * (long long) blockDim.x
         + (long long) threadIdx.x;

      if (Ip < Npcl) {
         P = &CP[Ip];
         /* EOM would go here */
      }
}
/*********************************************************************/
/* Performs one simulation step on the device.
 *
 * Prints the current value of CudaPcl, copies the SizeOfPcl bytes at Pcl to
 * CudaPcl, launches CudaParticleEOM with one thread per block and enough
 * blocks to cover Npcl particles, then copies the array back into Pcl.
 *
 * Every CUDA call is checked; on the first failure a message is written to
 * stderr and the function returns without running the remaining stages.
 *
 *   dt  time step; not used by the current kernel skeleton
 */
void ParticleSimStep(double dt)
{
      cudaError_t err;
      int ThreadsPerBlock = 1;
      /* Ceiling division so a partial last block is still launched. */
      int BlocksPerGrid = (int) ((Npcl+ThreadsPerBlock-1)/ThreadsPerBlock);

      (void) dt;

      printf("In SimStep: CudaPcl = %p\n",CudaPcl);

      err = cudaMemcpy(CudaPcl,Pcl,SizeOfPcl,cudaMemcpyHostToDevice);
      if (err != cudaSuccess) {
         fprintf(stderr,"ParticleSimStep: host-to-device copy failed: %s\n",
            cudaGetErrorString(err));
         return;
      }

      CudaParticleEOM<<<BlocksPerGrid,ThreadsPerBlock>>>(CudaPcl,Npcl);
      /* A launch does not return a status; bad launch configurations are
         reported through cudaGetLastError. */
      err = cudaGetLastError();
      if (err != cudaSuccess) {
         fprintf(stderr,"ParticleSimStep: kernel launch failed: %s\n",
            cudaGetErrorString(err));
         return;
      }

      /* Blocking copy: it waits for the kernel, so a fault raised while the
         kernel was running is reported here as well. */
      err = cudaMemcpy(Pcl,CudaPcl,SizeOfPcl,cudaMemcpyDeviceToHost);
      if (err != cudaSuccess) {
         fprintf(stderr,"ParticleSimStep: device-to-host copy failed: %s\n",
            cudaGetErrorString(err));
         return;
      }
}
#ifdef __cplusplus
   } /* extern "C" */
#endif
/*********************************************************************/

Main

/*********************************************************************/
#include <stdio.h>
#include "ParticleDbg.h"

#ifdef __cplusplus
   extern "C" {
#endif

/*********************************************************************/
/* Debug driver: initialises the particle system, prints the global CudaPcl
   as seen from this translation unit, then runs a single simulation step.
   The command-line arguments are not used. */
int main(int argc, char **argv)
{
      const double step = 0.1;
      double origin[3];

      origin[0] = 1.0;
      origin[1] = 2.0;
      origin[2] = 3.0;

      InitParticleSystem(origin);
      printf("Back in Main: CudaPcl = %p\n",CudaPcl);
      ParticleSimStep(step);

      return 0;
}
#ifdef __cplusplus
   } /* extern "C" */
#endif
/*********************************************************************/

Output

cudaSuccess = 0
err = 0
In Init: CudaPcl = 0x500b80000
Back in Main: CudaPcl = 0x0
In SimStep: CudaPcl = 0x0

So why does the CudaPcl pointer appear as null outside the function that
cudaMalloc’ed it?

Thanks,
-DwP

Because you shouldn’t define global variables in a header file that gets included from several modules.

Define the variables once in the main program, and declare these as “extern” in a header file visible to other modules. This makes sure only one instance of the variable exists in the entire binary.

Christian

Okay, I get what you’re saying (to a limited extent, I’ll admit). So I modified the header and source as seen below. No change in the output though, so I think I have another underlying issue.

Here’s the modified header:

/* Shared header for the particle debug example: the particle record, the
   program-wide state variables, and the two entry points implemented in the
   CUDA source file.  Wrapped in extern "C" when compiled as C++.

   Storage of the globals is selected with the EXTERN macro:
     - exactly one source file does `#define EXTERN` (empty) before including
       this header, which turns the lines below into the definitions;
     - any other file may `#define EXTERN extern`, or define nothing at all,
       in which case the header falls back to `extern` declarations. */
#ifndef __PCL_H__
#define __PCL_H__

/* Fallback so the header also compiles when the includer did not define
   EXTERN.  The marker lets the header undo only its own definition. */
#ifndef EXTERN
#define EXTERN extern
#define PCL_EXTERN_DEFAULTED
#endif

#ifdef __cplusplus
   extern "C" {
#endif

/* One particle: three 3-component double vectors (presumably position,
   velocity and force -- inferred from the field names). */
struct ParticleType {
   double Pos[3];
   double Vel[3];
   double Frc[3];
};

/* Particle count. */
EXTERN long Npcl;
EXTERN double PclMass, PclRadius;
EXTERN long PclGeomTag;
/* Host-side particle array. */
EXTERN struct ParticleType *Pcl;
/* Device-side particle array. */
EXTERN struct ParticleType *CudaPcl;

void InitParticleSystem(double center[3]);
void ParticleSimStep(double dt);

#ifdef __cplusplus
   } /* extern "C" */
#endif

/* Do not leak the fallback definition into the including file. */
#ifdef PCL_EXTERN_DEFAULTED
#undef EXTERN
#undef PCL_EXTERN_DEFAULTED
#endif

#endif /* __PCL_H__ */

and the main file:

#include <stdio.h>
#define EXTERN
#include "ParticleDbg.h"
#undef EXTERN

#ifdef __cplusplus
   extern "C" {
#endif

/* Debug driver: sets up the particle system, reports the value of the
   global CudaPcl visible from this file, and performs one simulation step.
   argc and argv are ignored. */
int main(int argc, char **argv)
{
      const double step = 0.1;
      double origin[3];

      origin[0] = 1.0;
      origin[1] = 2.0;
      origin[2] = 3.0;

      InitParticleSystem(origin);
      printf("Back in Main: CudaPcl = %p\n",CudaPcl);
      ParticleSimStep(step);

      return 0;
}
#ifdef __cplusplus
   } /* extern "C" */
#endif

and the other (CUDA-containing) source file:

/*********************************************************************/
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
#define EXTERN extern
#include "ParticleDbg.h"
#undef EXTERN

#ifdef __cplusplus
   extern "C" {
#endif

size_t SizeOfPcl;

/*********************************************************************/
/* Selects CUDA device 0, sets Npcl to 100, allocates the particle array on
   the host (Pcl, via malloc) and on the device (via cudaMalloc), then prints
   the numeric value of cudaSuccess, the cudaMalloc status and the pointer
   cudaMalloc produced.  The `center` argument is not used. */
void InitParticleSystem(double center[3])
{
      /* NOTE(review): this is still a LOCAL CudaPcl.  It hides the
         `extern` CudaPcl declared through ParticleDbg.h, so all uses of
         CudaPcl in this function refer to the local and the global keeps
         whatever value it had before the call.  P is never used. */
      struct ParticleType *P,*CudaPcl;
      cudaError_t err = cudaSuccess;

      /* Return value is not checked. */
      cudaSetDevice(0);

      Npcl = 100;
      SizeOfPcl = Npcl*sizeof(struct ParticleType);
      /* Result of malloc is not checked. */
      Pcl = (struct ParticleType *) malloc(SizeOfPcl);
      /* &CudaPcl is the address of the local pointer declared above. */
      err = cudaMalloc((void **)&CudaPcl,SizeOfPcl);
      printf("cudaSuccess = %i\n",(int) cudaSuccess);
      printf("err = %i\n",(int) err);
      /* Prints the local pointer, not the global one. */
      printf("In Init: CudaPcl = %p\n",CudaPcl);
}
/*********************************************************************/
/* Kernel skeleton for the per-particle equations of motion.
 *
 * Launch layout: 1-D grid of 1-D blocks, one thread per particle; threads
 * whose index is >= Npcl do nothing.  No shared memory is used.
 *
 *   CP    array of particles; indexed by the global thread index
 *   Npcl  number of entries in CP (this parameter hides the host global of
 *         the same name inside the kernel)
 */
__global__ void CudaParticleEOM(struct ParticleType *CP,long Npcl)
{
      long long Ip;
      struct ParticleType *P;

      /* blockIdx.x and blockDim.x are unsigned int, so their product would
         be evaluated in 32 bits and could wrap before being widened.
         Convert first, then multiply. */
      Ip = (long long) blockIdx.x * (long long) blockDim.x
         + (long long) threadIdx.x;

      if (Ip < Npcl) {
         P = &CP[Ip];
         /* EOM would go here */
      }
}
/*********************************************************************/
/* Performs one simulation step on the device.
 *
 * Prints the current value of CudaPcl, copies the SizeOfPcl bytes at Pcl to
 * CudaPcl, launches CudaParticleEOM with one thread per block and enough
 * blocks to cover Npcl particles, then copies the array back into Pcl.
 *
 * Every CUDA call is checked; on the first failure a message is written to
 * stderr and the function returns without running the remaining stages.
 *
 *   dt  time step; not used by the current kernel skeleton
 */
void ParticleSimStep(double dt)
{
      cudaError_t err;
      int ThreadsPerBlock = 1;
      /* Ceiling division so a partial last block is still launched. */
      int BlocksPerGrid = (int) ((Npcl+ThreadsPerBlock-1)/ThreadsPerBlock);

      (void) dt;

      printf("In SimStep: CudaPcl = %p\n",CudaPcl);

      err = cudaMemcpy(CudaPcl,Pcl,SizeOfPcl,cudaMemcpyHostToDevice);
      if (err != cudaSuccess) {
         fprintf(stderr,"ParticleSimStep: host-to-device copy failed: %s\n",
            cudaGetErrorString(err));
         return;
      }

      CudaParticleEOM<<<BlocksPerGrid,ThreadsPerBlock>>>(CudaPcl,Npcl);
      /* A launch does not return a status; bad launch configurations are
         reported through cudaGetLastError. */
      err = cudaGetLastError();
      if (err != cudaSuccess) {
         fprintf(stderr,"ParticleSimStep: kernel launch failed: %s\n",
            cudaGetErrorString(err));
         return;
      }

      /* Blocking copy: it waits for the kernel, so a fault raised while the
         kernel was running is reported here as well. */
      err = cudaMemcpy(Pcl,CudaPcl,SizeOfPcl,cudaMemcpyDeviceToHost);
      if (err != cudaSuccess) {
         fprintf(stderr,"ParticleSimStep: device-to-host copy failed: %s\n",
            cudaGetErrorString(err));
         return;
      }
}
#ifdef __cplusplus
   } /* extern "C" */
#endif

and the disappointing output:

cudaSuccess = 0
err = 0
In Init: CudaPcl = 0x500b80000
Back in Main: CudaPcl = 0x0
In SimStep: CudaPcl = 0x0

Did I do the EXTERN trick wrong? Or is something else going on?

Thanks,
-DwP

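Never mind. Found the bug: a duplicate definition of CudaPcl. InitParticleSystem declares its own local CudaPcl, which shadows the global one, so cudaMalloc only ever set the local pointer and the global stayed null. End thread.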

extern is a C/C++ keyword and has to be lowercase.