Hi,
I want to use a pointer in global device memory to store matrix values. I want it in global memory because, ultimately, I want the data to be persistent between kernel calls (in a loop) without having to transfer the result back to the host at each loop iteration.
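To make the goal concrete, this is roughly the pattern I am trying to get working (a stripped-down sketch; StepKernel, WIDTH, HEIGHT, h_data and nbIterations are placeholders, not my real code):

__device__ float* d_data; // pointer living in device global memory

// host side: allocate and upload once, before the loop
size_t pitch;
cudaMallocPitch(&d_data, &pitch, WIDTH * sizeof(float), HEIGHT);
cudaMemcpy2D(d_data, pitch, h_data, WIDTH * sizeof(float),
             WIDTH * sizeof(float), HEIGHT, cudaMemcpyHostToDevice);
// iterate entirely on the device; each launch reads/writes d_data directly
for (int iter = 0; iter < nbIterations; iter++)
    StepKernel<<<grid, block>>>(HEIGHT, pitch / sizeof(float));
// transfer the result back to the host only once, after the loop
cudaMemcpy2D(h_data, WIDTH * sizeof(float), d_data, pitch,
             WIDTH * sizeof(float), HEIGHT, cudaMemcpyDeviceToHost);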
I can successfully allocate the device memory (with cudaMallocPitch or cudaMalloc), transfer data to the device and even back to the host (with cudaMemcpy2D or cudaMemcpy). But what I cannot do is access or change this data inside a kernel. There is no compilation error, just no effect when reading or writing the data at run time…
I have found two workarounds to the problem:
WA1: declare the device data as a statically sized __device__ array, transfer the data to the device with cudaMemcpyToSymbol and back to the host with cudaMemcpyFromSymbol.
The problem here is that I cannot use pitched memory, and that is precisely what I want to do.
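Stripped down to its essentials, WA1 looks like this (again with placeholder names):

__device__ float d_data[HEIGHT * WIDTH]; // statically sized, so no pitch possible

// host side:
cudaMemcpyToSymbol(d_data, h_data, HEIGHT * WIDTH * sizeof(float));
MyKernel<<<grid, block>>>(HEIGHT); // the kernel indexes d_data directly
cudaMemcpyFromSymbol(h_data, d_data, HEIGHT * WIDTH * sizeof(float));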
WA2: pass a pointer as an argument of the kernel (memory allocated with cudaMallocPitch or cudaMalloc, data copied with cudaMemcpy2D or cudaMemcpy).
Solutions based on this approach seem to be the ones most people favour; I guess that is because the first examples in the CUDA_C_Programming_Guide are built that way.
It seems to work fine, but I do not actually understand why, because the CUDA_C_Best_Practices_Guide v3.2 (section 3.2.2.4) says: “Shared memory holds the parameters or arguments that are passed to kernels at launch”. Why would the data be persistent between kernel launches then? Is that not what global memory is for? Also, pitched memory is supposed to help coalescing of global memory accesses, so there must be a way to achieve what I am trying to do.
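And WA2, stripped down (MyKernel2 and the other names are again just placeholders; MyKernel2 takes the device pointer as an argument):

float* devPtr;
size_t pitch;
cudaMallocPitch((void**)&devPtr, &pitch, WIDTH * sizeof(float), HEIGHT);
cudaMemcpy2D(devPtr, pitch, h_data, WIDTH * sizeof(float),
             WIDTH * sizeof(float), HEIGHT, cudaMemcpyHostToDevice);
// the pointer is passed again at every launch, and the data it points to
// apparently keeps its values from one launch to the next
MyKernel2<<<grid, block>>>(devPtr, HEIGHT, pitch / sizeof(float));
MyKernel2<<<grid, block>>>(devPtr, HEIGHT, pitch / sizeof(float));
cudaMemcpy2D(h_data, WIDTH * sizeof(float), devPtr, pitch,
             WIDTH * sizeof(float), HEIGHT, cudaMemcpyDeviceToHost);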
Here is a minimal code sample that reproduces my problem:
My config: Ubuntu 10.10, CUDA 3.2, GeForce 9500 (1.0)
Your help will be much appreciated.
#include <stdio.h>
#include <cutil.h>
#include <iostream>
#include <assert.h>
#include <vector>
#include <fstream>
#include <cutil_inline.h>
#define NBPOP 1024
#define NBDIM 20
//--------------------------------------------------------------------------
__device__ float d_Population[NBPOP*NBDIM];
__device__ float* d_Population2;
__device__ float d_E1;
__device__ float d_E2;
__device__ float d_E3;
__device__ float d_E4;
//--------------------------------------------------------------------------
//GPU KERNELS
__global__ void TestKernel(int count)
{
    int tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
    if(tid<count)
    {
        if(tid==0)
        {
            d_E1=d_Population[tid*NBDIM+0];
            d_E2=d_Population[tid*NBDIM+1];
        }
        for(int j=0;j<NBDIM;j++)
            d_Population[tid*NBDIM+j]+=3;
        if(tid==0)
        {
            d_E3=d_Population[tid*NBDIM+0];
            d_E4=d_Population[tid*NBDIM+1];
        }
    }
}
//--------------------------------------------------------------------------
__global__ void TestKernel2(int count,size_t pitch)
{
    int tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
    if(tid<count)
    {
        if(tid==0)
        {
            d_E1=d_Population2[tid*pitch+0];
            d_E2=d_Population2[tid*pitch+1];
        }
        for(int j=0;j<NBDIM;j++)
            d_Population2[tid*pitch+j]+=3;
        if(tid==0)
        {
            d_E3=d_Population2[tid*pitch+0];
            d_E4=d_Population2[tid*pitch+1];
        }
    }
}
//--------------------------------------------------------------------------
__global__ void TestKernel3(float* devPtr,int count,size_t pitch)
{
    int tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
    if(tid<count)
    {
        if(tid==0)
        {
            d_E1=devPtr[tid*pitch+0];
            d_E2=devPtr[tid*pitch+1];
        }
        for(int j=0;j<NBDIM;j++)
            devPtr[tid*pitch+j]+=3;
        if(tid==0)
        {
            d_E3=devPtr[tid*pitch+0];
            d_E4=devPtr[tid*pitch+1];
        }
    }
}
//--------------------------------------------------------------------------
////////////////////////////////////////////////////////////////////////////////
// Main program
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    std::cout << "Initializing device data...\n";
    CUT_DEVICE_INIT(argc, argv);
    //grid dimension
    int dataAmount = NBPOP;
    dim3 dimBlocs(64,1,1);
    int nbblocs=(dataAmount+dimBlocs.x-1)/dimBlocs.x;
    dim3 dimGrid;
    if (nbblocs>65535)
    {
        dimGrid.x=65535;
        dimGrid.y=(nbblocs+65535-1)/65535;
    }
    else
    {
        dimGrid.x=nbblocs;
        dimGrid.y=1;
    }
    float h_Population[NBPOP*NBDIM];
    float h_Population2[NBPOP*NBDIM];
    for(int i=0;i<NBPOP;i++)
    {
        for(int j=0;j<NBDIM;j++)
        {
            h_Population[i*NBDIM+j]=i*NBDIM+j;
            h_Population2[i*NBDIM+j]=-1;
        }
    }
    float h_E1,h_E2,h_E3,h_E4;
    h_E1=0;
    h_E2=0;
    h_E3=0;
    h_E4=0;
    //COPY TO DEVICE
    cudaError_t cerr;
    std::cout << "sizeof(h_Population) " << sizeof(h_Population) << "\n";
    std::cout << "sizeof(float) * NBPOP * NBDIM " << sizeof(float) * NBPOP * NBDIM << "\n";
    //DESIRED VERSION
    //h_E1,h_E2,h_E3,h_E4 come back as rubbish; h_Population2 holds the values of h_Population (i.e. the data transfer worked fine but the kernel had no effect)
    size_t pitch;
    cerr=cudaMallocPitch(&d_Population2, &pitch, NBDIM * sizeof(float), NBPOP); // can I do this directly?
    std::cout << "pitch " << pitch << "\n";
    std::cout << "pitch/sizeof(float) " << pitch/sizeof(float) << "\n";
    if( cerr != cudaSuccess ) puts(cudaGetErrorString( cerr ));
    //Copy the host array onto the pre-allocated device memory
    cerr=cudaMemcpy2D(d_Population2, pitch, &h_Population[0], NBDIM * sizeof(float) , NBDIM * sizeof(float), NBPOP, cudaMemcpyHostToDevice); //ok
    if( cerr != cudaSuccess ) puts(cudaGetErrorString( cerr ));
    // kernel call GPU
    TestKernel2<<<dimGrid, dimBlocs>>>(NBPOP,pitch/sizeof(float));
    CUT_CHECK_ERROR("TestKernel2() execution failed\n");
    CUDA_SAFE_CALL( cudaThreadSynchronize() );
    // Copy the data back to the host
    cerr=cudaMemcpyFromSymbol(&h_E1,d_E1,sizeof(float));
    if( cerr != cudaSuccess ) puts(cudaGetErrorString( cerr ));
    cerr=cudaMemcpyFromSymbol(&h_E2,d_E2,sizeof(float));
    if( cerr != cudaSuccess ) puts(cudaGetErrorString( cerr ));
    cerr=cudaMemcpyFromSymbol(&h_E3,d_E3,sizeof(float));
    if( cerr != cudaSuccess ) puts(cudaGetErrorString( cerr ));
    cerr=cudaMemcpyFromSymbol(&h_E4,d_E4,sizeof(float));
    if( cerr != cudaSuccess ) puts(cudaGetErrorString( cerr ));
    std::cout << "E1 : " << h_E1 << " E2 : " << h_E2 << " E3 : " << h_E3 << " E4 : " << h_E4 << "\n";
    std::cout << "h_Population2[0] before: " << h_Population2[0] << "\n";
    cerr=cudaMemcpy2D(&h_Population2[0], NBDIM * sizeof(float), d_Population2, pitch , NBDIM * sizeof(float), NBPOP, cudaMemcpyDeviceToHost); //ok
    if( cerr != cudaSuccess ) puts(cudaGetErrorString( cerr ));
    std::cout << "h_Population2[0] after: " << h_Population2[0] << "\n";
    cudaFree(d_Population2);
    /*
    //WORK AROUND 1
    //h_E1=0,h_E2=1,h_E3=3,h_E4=4. h_Population2 holds values of h_Population shifted by 3. Everything as expected.
    cerr=cudaMemcpyToSymbol(d_Population,&h_Population[0],sizeof(h_Population));
    if( cerr != cudaSuccess ) puts(cudaGetErrorString( cerr ));
    // kernel call GPU
    TestKernel<<<dimGrid, dimBlocs>>>(NBPOP);
    CUT_CHECK_ERROR("TestKernel() execution failed\n");
    CUDA_SAFE_CALL( cudaThreadSynchronize() );
    // Copy the data back to the host
    cerr=cudaMemcpyFromSymbol(&h_E1,d_E1,sizeof(float));
    if( cerr != cudaSuccess ) puts(cudaGetErrorString( cerr ));
    cerr=cudaMemcpyFromSymbol(&h_E2,d_E2,sizeof(float));
    if( cerr != cudaSuccess ) puts(cudaGetErrorString( cerr ));
    cerr=cudaMemcpyFromSymbol(&h_E3,d_E3,sizeof(float));
    if( cerr != cudaSuccess ) puts(cudaGetErrorString( cerr ));
    cerr=cudaMemcpyFromSymbol(&h_E4,d_E4,sizeof(float));
    if( cerr != cudaSuccess ) puts(cudaGetErrorString( cerr ));
    std::cout << "E1 : " << h_E1 << " E2 : " << h_E2 << " E3 : " << h_E3 << " E4 : " << h_E4 << "\n";
    cerr=cudaMemcpyFromSymbol(&h_Population2[0],d_Population,sizeof(h_Population2));
    if( cerr != cudaSuccess ) puts(cudaGetErrorString( cerr ));
    */
    /*
    //WORK AROUND 2
    //h_E1=0,h_E2=1,h_E3=3,h_E4=4. h_Population2 holds values of h_Population shifted by 3. Everything as expected.
    size_t pitch2;
    float* devPtr;
    cerr=cudaMallocPitch(&devPtr, &pitch2, NBDIM * sizeof(float), NBPOP);
    std::cout << "pitch2 " << pitch2 << "\n";
    if( cerr != cudaSuccess ) puts(cudaGetErrorString( cerr ));
    //Copy the host array onto the pre-allocated device memory
    cerr=cudaMemcpy2D(devPtr, pitch2, &h_Population[0], NBDIM * sizeof(float) , NBDIM * sizeof(float), NBPOP, cudaMemcpyHostToDevice); // this one instead
    if( cerr != cudaSuccess ) puts(cudaGetErrorString( cerr ));
    // kernel call GPU
    TestKernel3<<<dimGrid, dimBlocs>>>(devPtr,NBPOP,pitch2/sizeof(float));
    CUT_CHECK_ERROR("TestKernel3() execution failed\n");
    CUDA_SAFE_CALL( cudaThreadSynchronize() );
    //TestKernel3<<<dimGrid, dimBlocs>>>(devPtr,NBPOP,pitch2/sizeof(float));
    //CUT_CHECK_ERROR("TestKernel3() execution failed\n");
    //CUDA_SAFE_CALL( cudaThreadSynchronize() );
    // Copy the data back to the host
    cerr=cudaMemcpyFromSymbol(&h_E1,d_E1,sizeof(float));
    if( cerr != cudaSuccess ) puts(cudaGetErrorString( cerr ));
    cerr=cudaMemcpyFromSymbol(&h_E2,d_E2,sizeof(float));
    if( cerr != cudaSuccess ) puts(cudaGetErrorString( cerr ));
    cerr=cudaMemcpyFromSymbol(&h_E3,d_E3,sizeof(float));
    if( cerr != cudaSuccess ) puts(cudaGetErrorString( cerr ));
    cerr=cudaMemcpyFromSymbol(&h_E4,d_E4,sizeof(float));
    if( cerr != cudaSuccess ) puts(cudaGetErrorString( cerr ));
    std::cout << "E1 : " << h_E1 << " E2 : " << h_E2 << " E3 : " << h_E3 << " E4 : " << h_E4 << "\n";
    cerr=cudaMemcpy2D(&h_Population2[0], NBDIM * sizeof(float), devPtr, pitch2 , NBDIM * sizeof(float), NBPOP, cudaMemcpyDeviceToHost); //???
    if( cerr != cudaSuccess ) puts(cudaGetErrorString( cerr ));
    cudaFree(devPtr);
    */
    //logfile
    std::string filename="/home/ubuntu/NVIDIA_GPU_Computing_SDK/logGPU.txt";
    std::ofstream fichier;
    fichier.open(filename.c_str());
    fichier.precision(15);
    for (int i=0; i < NBPOP; i++)
    {
        for (int j=0; j < NBDIM; j++)
        {
            fichier << i << "\t" << j << "\t" << h_Population2[i*NBDIM+j] << "\n" ;
        }
    }
    fichier.close();
    //cleaning
    std::cout << "Cleaning up...\n";
    cudaThreadExit();
    cutilExit(argc,argv);
}