extern shared: how is it actually allocated?


My question is how extern shared memory is actually allocated. The problem arose when I found that running the following code, written against the driver API by my student, crashes the driver and completely screws up the Linux kernel. So my question is exactly how the kernel allocates memory for extern shared arrays, and how it then clears that memory. Does it do that with the delete operator from the transpose.h file?

file : transpose.h

#ifndef _TRANSPOSE_H
#define _TRANSPOSE_H

int * transpose(int *, int, int);

void operator delete[](void * x) { }

#define THREAD_ROWS 8
#define BLOCKDIM 32

#endif /* _TRANSPOSE_H */

file : transpose.cu


#include "transpose.h"

extern "C" {

__global__ void transpose(int * src, int * dst) {
	extern __shared__ int sh[];
	int posx = (BLOCKDIM * blockIdx.x) + threadIdx.x;

	for (int yoff = threadIdx.y; yoff < BLOCKDIM; yoff += THREAD_ROWS) {
		// do sth unimportant
	}
}

} /* extern "C" */

finally, transpose.cpp


#include <cstdio>
#include <cuda.h>
#include "transpose.h"

int * transpose(int * host_src, int m, int n) {
	static CUresult res;
	static CUdevice device;
	static CUcontext context;
	static CUmodule module = (CUmodule) 0;
	static CUfunction function;

	res = cuInit(0);
	res = cuDeviceGet(&device, 0);
	res = cuCtxCreate(&context, CU_CTX_SCHED_SPIN | CU_CTX_MAP_HOST, device);
	res = cuModuleLoad(&module, "transpose.ptx");
	res = cuModuleGetFunction(&function, module, "transpose");

	int gridX = (n + BLOCKDIM - 1) / BLOCKDIM;
	int gridY = (m + BLOCKDIM - 1) / BLOCKDIM;

	CUdeviceptr dev_src;
	cuMemAlloc(&dev_src, gridX * BLOCKDIM * gridY * BLOCKDIM * sizeof(int));
	cuMemcpyHtoD(dev_src, host_src, gridX * BLOCKDIM * gridY * BLOCKDIM * sizeof(int));

	CUdeviceptr dev_dst;
	int * host_dst;
	cuMemHostAlloc((void**) &host_dst, gridX * BLOCKDIM * gridY * BLOCKDIM * sizeof(int), CU_MEMHOSTALLOC_DEVICEMAP);
	cuMemHostGetDevicePointer(&dev_dst, host_dst, 0);

	void * args[] = { &dev_src, &dev_dst };

	cuLaunchKernel(function,
		gridX, gridY, 1,
		BLOCKDIM, THREAD_ROWS, 1,
		THREAD_ROWS * BLOCKDIM * sizeof(int), NULL, args, NULL);

	cuMemFree(dev_src);
	cuMemFree(dev_dst);

	return host_dst;
}


and main file

#include "transpose.h"

int main() {
    int *c, *d;
    int n = 32, m = 32;

    c = new int[m*n];
    for (int j = 0; j < n; ++j) {
        for (int i = 0; i < m; ++i) {
            c[j*m + i] = j*m + i;
        }
    }

    d = transpose(c, n, m);

    delete [] c;
    delete [] d;

    return 0;
}

The memory allocated by cuMemHostAlloc must be freed by a call to cuMemFreeHost, passing the address returned by the cuMemHostAlloc call. So the call to cuMemFree(dev_dst) is probably bad: it frees who knows what, or returns an error code if you’re lucky.

As far as the extern shared memory goes, I thought that was all Nvidia device memory, partitionable between L1 and shared memory on the latest chips. If you find out otherwise, please share, because I’ve wondered the same thing myself and came to my conclusion from a bunch of reading.

Generally I thought exactly like you, but main also includes that twisted delete operator, which means

delete [] c; delete [] d;

never actually free anything.

I also suspected that several runs of this program depleted the number of available contexts, but that wasn’t the case either.

Yes, I would agree the delete operator was redefined as a no-op, but that’s not the free I was referring to. I’m talking about the call “cuMemFree(dev_dst);”. That looks like it could do some damage, and it should have some error checking applied, as should all the other calls.