I am using Tesla K40 (Compute Capability 3.5) and latest CUDA toolkit (7.5). I am running Ubuntu.
The following code is broken in release build but works under debug build:
To test the code just copy and paste the underlying code and on Nvidia Nsight and build it and run. Also ensure that the compiler is enabled for C++11.
#include<iostream>
#include<cuda_fp16.h>
#define Pindex 0
#define gpuE(ans) { gpuAssert((ans), __FILE__, __LINE__);}
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true){
if (code != cudaSuccess){
std::cout<<"GpuError : "<<cudaGetErrorString(code)<<" in "<<file<<" at : "<<line<<std::endl;
if(abort){
//exit(code);
}
}
}
using namespace std;
typedef struct {
unsigned short year;
unsigned char month;
unsigned char day;
unsigned char hour;
} fileData;
typedef struct {
unsigned int packeddateandtime;
} packedfileData;
__device__ __host__ fileData unpackdata(packedfileData pfd){
fileData fd;
unsigned int pfdDT = pfd.packeddateandtime;
fd.hour = pfdDT & 31;
fd.day = (pfdDT & (31 << 5)) >> 5;
fd.month = (pfdDT & (15 << 10)) >> 10;
fd.year = pfdDT >> 14;
return fd;
}
__device__ __host__ packedfileData bitpackdata(fileData file_Data){
unsigned int packeddateandtime = 0;
packeddateandtime |= file_Data.hour;
packeddateandtime |= file_Data.day << 5;
packeddateandtime |= file_Data.month << 10;
packeddateandtime |= file_Data.year << 14;
packedfileData pf;
pf.packeddateandtime = packeddateandtime;
return pf;
}
__device__ __host__ void extract(unsigned int index, packedfileData* pfd,fileData* unpackeddata) {
auto i = index;
if (i == Pindex){
auto m = pfd->packeddateandtime ;
printf("Got:%d \n", m);
}
unpackeddata[i] = unpackdata(pfd[i]);
if (i == Pindex) {
auto m = (int) unpackeddata[i].month;
printf("month:%d \n", m);
}
}
__global__ void extractGpu(packedfileData* pfd ,fileData* unpackeddata,unsigned int size) {
unsigned idx = blockIdx.x*blockDim.x+threadIdx.x;
if (idx < size){
extract(idx,pfd,unpackeddata);
}
}
int main(void)
{
packedfileData pfd[1];
fileData fd;
fileData fda[1];
fd.day =21;
fd.month = 4;
fd.year = 2017;
pfd[0] = bitpackdata(fd);
auto H2D = cudaMemcpyHostToDevice;
auto D2H = cudaMemcpyDeviceToHost;
packedfileData* d_pfd;
fileData* d_unpackeddata;
auto totalsize = 1;
gpuE(cudaMalloc(&d_pfd, totalsize * sizeof(packedfileData)));
gpuE(cudaMalloc(&d_unpackeddata, totalsize * sizeof(fileData)));
gpuE(cudaMemcpy(d_pfd, pfd, totalsize * sizeof(packedfileData), H2D));
int extractblocks = totalsize / 64 + 1;
cout << (int) unpackdata(pfd[Pindex]).month << endl;
cout << pfd[Pindex].packeddateandtime <<endl;
extractGpu<<<extractblocks, 64>>>(d_pfd, d_unpackeddata, totalsize);
gpuE(cudaPeekAtLastError());
cudaDeviceSynchronize();
gpuE(cudaFree(d_pfd));
gpuE(cudaFree(d_unpackeddata));
extract(0,pfd,fda);
}