Hello
My question is how is extern shared memory actually allocated. My problem’s arisen when I found execution of following code written in driver api by my student destroys driver and totally screw linux kernel. So my question is exactly how does the kernel allocate memory for extern shared arrays and then how it clears that memory. Does it do that with delete operator from the transpose.h file?
file : transpose.h
#ifndef _TRANSPOSE_H
#define _TRANSPOSE_H
#ifndef CUDA_KERNEL_SRC
int * transpose(int *, int, int);
#ifndef _TRANSPOSE_CPP
void operator delete[](void * x) { }
#endif
#endif
#define THREAD_ROWS 8
#define BLOCKDIM 32
#endif /* _TRANSPOSE_H */
file transpose.cu
#define CUDA_KERNEL_SRC
#include "transpose.h"
extern "C" {
__global__ void transpose(int * src, int * dst) {
extern __shared__ int sh[];
int posx = (BLOCKDIM * blockIdx.x) + threadIdx.x;
for (int yoff = threadIdx.y; yoff < BLOCKDIM; yoff += THREAD_ROWS)
//do sth unimportant
}
}
finally transpose.cpp
#define _TRANSPOSE_CPP
#include <cstdio>
#include <cuda.h>
#include "transpose.h"
int * transpose(int * host_src, int m, int n)
{
static CUresult res;
static CUdevice device;
static CUcontext context;
static CUmodule module = (CUmodule) 0;
static CUfunction function;
cuInit(0);
res = cuDeviceGet(&device, 0);
res = cuCtxCreate(&context, CU_CTX_SCHED_SPIN | CU_CTX_MAP_HOST, device);
res = cuModuleLoad(&module, "transpose.ptx");
res = cuModuleGetFunction(&function, module, "transpose");
int gridX = (n + BLOCKDIM - 1) / BLOCKDIM;
int gridY = (m + BLOCKDIM - 1) / BLOCKDIM;
CUdeviceptr dev_src;
cuMemAlloc(&dev_src, gridX * BLOCKDIM * gridY * BLOCKDIM * sizeof(int));
cuMemcpyHtoD(dev_src, host_src, gridX * BLOCKDIM * gridY * BLOCKDIM * sizeof(int));
CUdeviceptr dev_dst;
int * host_dst;
cuMemHostAlloc((void**) &host_dst, gridX * BLOCKDIM * gridY * BLOCKDIM * sizeof(int), CU_MEMHOSTALLOC_DEVICEMAP);
cuMemHostGetDevicePointer(&dev_dst, host_dst, 0);
void * args[] = { &dev_src, &dev_dst};
cuLaunchKernel(function,
gridX, gridY, 1,
BLOCKDIM, THREAD_ROWS, 1,
THREAD_ROWS * BLOCKDIM * sizeof(int), NULL, args, NULL);
cuCtxSynchronize();
cuMemFree(dev_dst);
cuMemFree(dev_src);
return host_dst;
}
and main file
#include "transpose.h"
int main() {
int *c, *d;
int n = 32, m = 32;
c = new int[m*n];
for(int j = 0; j < n; ++j) {
for(int i = 0; i < m; ++i) {
c[j*m + i] = j*m + i;
}
}
d = transpose(c, n, m);
delete [] c;
delete [] d;
return 0;
}