This code seems to work for me:
$ cat t7.cpp
#include <CL/opencl.h>
#include <stdio.h>
#include <stdlib.h>
#define TILE_WIDTH 16
#define DS 16384
// OpenCL C source for a classic tiled matrix multiply, C = A * B, for square
// width x width matrices using TILE_WIDTH x TILE_WIDTH work-groups.
// TILE_WIDTH inside the kernel comes from the "-D TILE_WIDTH=16" build option
// passed to clBuildProgram below, and must match the host-side TILE_WIDTH.
// Each work-item also performs one atomic_add on the zero-copy counter p so
// the host can poll progress while the kernel is running.
const char source[] =
"__kernel void matrix_multiply(__global float *A, __global float *B,"
" __global float *C, volatile __global int *p, int width)"
"{"
// Per-work-group staging tiles in on-chip __local memory.
" __local float Ashare[TILE_WIDTH][TILE_WIDTH];"
" __local float Bshare[TILE_WIDTH][TILE_WIDTH];"
" int bx = get_group_id(0);"
" int by = get_group_id(1);"
" int tx = get_local_id(0);"
" int ty = get_local_id(1);"
// Global row/column of the output element this work-item computes.
" int row = by * TILE_WIDTH + ty;"
" int col = bx * TILE_WIDTH + tx;"
" float result = 0;"
// March the two tiles across the shared dimension (assumes width is a
// multiple of TILE_WIDTH; true here since DS = 16384 = 1024 * 16).
" for (int m = 0; m < width / TILE_WIDTH; m++) {"
// Cooperative load: each work-item loads one element of each tile.
" Ashare[ty][tx] = A[(row * width) + (m * TILE_WIDTH) + tx];"
" Bshare[ty][tx] = B[(((m * TILE_WIDTH) + ty) * width) + col];"
// Wait until the whole group has filled the tiles before consuming them.
" barrier(CLK_LOCAL_MEM_FENCE); "
" for (int k = 0; k < TILE_WIDTH; k++) {"
" result += Ashare[ty][k] * Bshare[k][tx];"
" }"
// Wait until the whole group is done reading before the next load pass.
" barrier(CLK_LOCAL_MEM_FENCE); "
" }"
// Progress tick: one atomic increment per work-item on the host-visible
// (zero-copy) counter; the fence pushes it out to global memory promptly.
" atomic_add(p, 1);"
" mem_fence(CLK_GLOBAL_MEM_FENCE);"
" C[(row * width) + col] = result;"
" };"
;
/*
 * Demo host driver: runs the tiled matrix-multiply kernel on a DS x DS
 * matrix of ones while the host polls a zero-copy progress counter that
 * the kernel atomically increments once per work-item.
 *
 * The counter buffer is allocated in host (pinned) memory via NVIDIA's
 * clCreateBufferNV extension (CL_MEM_LOCATION_HOST_NV), so the mapped
 * pointer observes device writes while the kernel is still executing.
 *
 * Returns 0 on success, -1 on any OpenCL setup error, 1 on a result
 * verification failure.
 */
int main(int argc, char *argv[])
{
  cl_platform_id platform;
  cl_device_id device;
  cl_context context;
  cl_command_queue queue1;   // (an unused second queue was removed)
  cl_program program;
  cl_mem mem1, mem2, mem3, mem4;
  cl_kernel kernel;
  cl_int err;

  err = clGetPlatformIDs(1, &platform, NULL);
  if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}
  err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
  if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}

  // Check creation errors (previously the error out-params were NULL and
  // failures here would surface later as confusing INVALID_* codes).
  context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
  if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}
  queue1 = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
  if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}

  const char *sources[1] = {source};
  program = clCreateProgramWithSource(context, 1, sources, NULL, &err);
  if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}
  // TILE_WIDTH is injected into the kernel source here; keep it equal to
  // the host-side TILE_WIDTH macro.
  err = clBuildProgram(program, 1, &device, "-D TILE_WIDTH=16", NULL, NULL);
  if (err == CL_BUILD_PROGRAM_FAILURE) {
    // Fetch and print the build log so compile errors are visible.
    size_t log_size;
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
    char *log = (char *) malloc(log_size);
    if (log != NULL) {
      clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
      printf("%s\n", log);
      free(log);   // was leaked before
    }
  }
  if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}

  // A, B, C: DS x DS float matrices on the device (checked; previously not).
  mem1 = clCreateBuffer(context, CL_MEM_READ_WRITE, DS*DS*sizeof(float), NULL, &err);
  if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}
  mem2 = clCreateBuffer(context, CL_MEM_READ_WRITE, DS*DS*sizeof(float), NULL, &err);
  if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}
  mem3 = clCreateBuffer(context, CL_MEM_READ_WRITE, DS*DS*sizeof(float), NULL, &err);
  if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}

  // The NV extension entry point must be fetched at runtime; it is not part
  // of the core OpenCL headers' exported functions.
  cl_mem (*clCreateBufferNV)(cl_context, cl_mem_flags, cl_mem_flags_NV, size_t, void*, cl_int*) =
      (cl_mem (*)(cl_context, cl_mem_flags, cl_mem_flags_NV, size_t, void*, cl_int*))
      clGetExtensionFunctionAddressForPlatform(platform, "clCreateBufferNV");
  if (clCreateBufferNV == NULL) {printf("invalid function pointer request\n"); return -1;}
  // 4-byte progress counter placed in host memory (zero-copy).
  mem4 = clCreateBufferNV(context, CL_MEM_READ_WRITE, CL_MEM_LOCATION_HOST_NV, (size_t)4, NULL, &err);
  if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}

  float *hdata = new float[DS*DS];
  for (int i = 0; i < DS*DS; i++) hdata[i] = 1;

  kernel = clCreateKernel(program, "matrix_multiply", &err);
  if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}
  const size_t gwork_size[2] = {DS, DS};
  const size_t lwork_size[2] = {TILE_WIDTH, TILE_WIDTH};
  int msize = DS;

  // Map the counter once, persistently; the mapped pointer stays valid (and
  // coherent) while the kernel runs because the buffer lives in host memory.
  void *progress = clEnqueueMapBuffer(queue1, mem4, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, 4, 0, NULL, NULL, &err);
  if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}
  volatile int *iprogress = (volatile int *)progress;
  iprogress[0] = 0;

  err = clSetKernelArg(kernel, 0, sizeof(mem1), &mem1);
  if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}
  err = clSetKernelArg(kernel, 1, sizeof(mem2), &mem2);
  if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}
  err = clSetKernelArg(kernel, 2, sizeof(mem3), &mem3);
  if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}
  err = clSetKernelArg(kernel, 3, sizeof(mem4), &mem4);
  if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}
  err = clSetKernelArg(kernel, 4, sizeof(msize), &msize);
  if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}

  // Blocking writes: the host data is fully transferred before these return.
  err = clEnqueueWriteBuffer(queue1, mem1, CL_TRUE, 0, DS*DS*sizeof(float), hdata, 0, NULL, NULL);
  if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}
  err = clEnqueueWriteBuffer(queue1, mem2, CL_TRUE, 0, DS*DS*sizeof(float), hdata, 0, NULL, NULL);
  if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}

  // The queue is out-of-order, so commands are NOT implicitly ordered.
  // Capture the kernel's completion event and make the later read depend on
  // it; previously the blocking read could legally begin before the kernel
  // finished.
  cl_event kdone;
  err = clEnqueueNDRangeKernel(queue1, kernel, 2, NULL, gwork_size, lwork_size, 0, NULL, &kdone);
  if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}

  // Poll the zero-copy counter from the host while the kernel executes.
  for (int i = 0; i < 1000000; i++) printf("%d,%d\n", i, iprogress[0]);

  err = clEnqueueReadBuffer(queue1, mem3, CL_TRUE, 0, DS*DS*sizeof(float), hdata, 1, &kdone, NULL);
  if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}

  // A and B are all-ones, so every element of C must equal DS.
  for (int i = 0; i < DS*DS; i++)
    if (hdata[i] != DS) {printf("error at %d, was %f, should be %f\n", i, hdata[i], (float)DS); return 1;}
  printf("success!\n");

  // Clean up (previously everything leaked, harmless here only because the
  // process exits immediately after).
  clEnqueueUnmapMemObject(queue1, mem4, progress, 0, NULL, NULL);
  clFinish(queue1);
  clReleaseEvent(kdone);
  clReleaseKernel(kernel);
  clReleaseProgram(program);
  clReleaseMemObject(mem1);
  clReleaseMemObject(mem2);
  clReleaseMemObject(mem3);
  clReleaseMemObject(mem4);
  clReleaseCommandQueue(queue1);
  clReleaseContext(context);
  delete [] hdata;
  return 0;
}
$ nvcc -o t7 t7.cpp -lOpenCL
$ ./t7
0,0
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0
10,0
11,0
12,0
13,0
14,0
15,0
16,0
17,0
18,0
19,0
20,0
21,0
22,0
23,0
24,0
25,0
26,0
27,0
28,0
29,0
30,0
31,0
32,0
33,0
34,0
35,0
36,0
37,0
38,0
39,0
40,0
41,0
42,0
43,0
44,0
45,0
46,0
47,0
48,0
49,0
50,0
51,0
52,0
53,0
54,0
55,0
56,0
57,0
58,0
59,0
60,0
61,0
62,0
63,0
64,0
65,0
66,0
67,0
68,0
69,0
70,0
71,0
72,0
73,0
74,0
75,0
76,0
77,0
78,0
79,0
80,0
81,0
82,0
83,0
84,0
85,0
86,0
87,0
88,0
89,0
90,0
91,0
92,0
93,0
94,0
95,0
96,0
97,0
98,0
99,0
100,0
101,0
102,0
103,0
104,0
105,0
106,0
107,0
108,0
109,0
110,0
111,0
112,0
113,0
114,0
115,0
116,0
117,0
118,0
119,0
120,0
121,0
122,0
123,0
124,0
125,0
126,0
127,0
128,0
129,0
130,0
131,0
132,0
133,0
134,0
135,0
136,0
137,0
138,0
139,0
140,0
141,0
142,0
143,0
144,0
145,0
146,0
147,0
148,0
149,0
150,0
151,0
152,0
153,0
154,0
155,0
156,0
157,0
158,0
159,0
160,0
161,0
162,0
163,0
164,0
165,0
166,0
167,0
168,0
169,0
170,0
171,0
172,0
173,0
174,0
175,0
176,0
177,0
178,0
179,0
180,0
181,0
182,0
183,0
184,0
185,0
186,0
187,0
188,0
189,0
190,0
191,0
192,0
193,0
194,0
195,0
196,0
197,0
198,0
199,0
200,0
201,0
202,0
203,0
204,0
205,0
206,0
207,0
208,0
209,0
210,0
211,0
212,0
213,0
214,0
215,0
216,0
217,0
218,0
219,0
220,0
221,0
222,0
223,0
224,0
225,0
226,0
227,0
228,0
229,0
230,0
231,0
232,0
233,0
234,0
235,0
236,0
237,0
238,0
239,0
240,0
241,0
242,0
243,0
244,0
245,0
246,0
247,0
248,0
249,0
250,0
251,0
252,0
253,0
254,0
255,0
256,0
257,0
258,0
259,0
260,0
261,0
262,0
263,0
264,0
265,0
266,0
267,0
268,0
269,0
270,0
271,0
272,0
273,0
274,0
275,0
276,0
277,0
278,0
279,0
280,0
281,0
282,0
283,0
284,0
285,0
286,0
287,0
288,0
289,0
290,0
291,0
292,0
293,0
294,0
295,0
296,0
297,0
298,0
299,0
300,0
301,0
302,0
303,0
304,0
305,0
306,0
307,0
308,0
309,0
310,0
311,0
312,0
313,0
314,0
315,0
316,0
317,0
318,0
319,0
320,0
321,0
322,0
323,0
324,0
325,0
326,0
327,0
328,0
329,0
330,0
331,0
332,0
333,0
334,0
335,0
336,0
337,0
338,0
339,0
340,0
341,0
342,0
343,0
344,0
345,0
346,0
347,0
348,0
349,32
350,160
351,256
352,384
353,512
354,640
355,736
356,864
357,1056
358,1376
359,1472
360,1728
361,1856
362,1984
363,2080
364,2208
365,2304
366,2432
367,2528
368,2656
369,2752
370,2880
371,3008
372,3456
373,3584
374,3776
375,3904
376,4032
377,4128
378,4256
379,4384
380,4480
381,4736
382,4864
383,4992
384,5120
385,5248
386,5344
387,5472
388,5824
389,5952
390,6080
391,6304
392,6432
393,6560
394,6656
395,6880
396,7008
397,7136
398,7232
399,7360
400,7456
401,7584
402,7712
403,8096
404,8224
405,8384
406,8608
407,8736
... (output truncated)
999990,268435456
999991,268435456
999992,268435456
999993,268435456
999994,268435456
999995,268435456
999996,268435456
999997,268435456
999998,268435456
999999,268435456
success!
$
The final value of the zero-copy counter iprogress[0] matches the expected total number of work-items, since each work-item performs exactly one atomic_add (16384*16384 = 268435456).
The clCreateBufferNV extension function, along with some of the rationale for it, is described here:
http://on-demand.gputechconf.com/gtc/2018/presentation/s8837-opencl-nvidia-recent-improvements-future-plans.pdf
CUDA 10.1, Tesla V100, CentOS 7, 418.67