Hello, I am using nvc++ (HPC SDK 23.3.0) to compile C++ code that uses OpenMP target offload, and I received this message:
NVC++-F-0000-Internal compiler error. no type conversion available 0 (FDTD3dGPU.cpp: 127)
NVC++/x86-64 Linux 23.3-0: compilation aborted
make: *** [FDTD3dGPU.o] Error 2
The code in question is:
mtml@gpnpusc3000003[pts/1]fdtd3d-omp $ cat -n FDTD3dGPU.cpp
1 /*
2 * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
3 *
4 * Please refer to the NVIDIA end user license agreement (EULA) associated
5 * with this source code for terms and conditions that govern your use of
6 * this software. Any use, reproduction, disclosure, or distribution of
7 * this software and related documentation outside the terms of the EULA
8 * is strictly prohibited.
9 *
10 */
11
12 #include <iostream>
13 #include <algorithm>
14 #include <cstring>
15 #include <cmath>
16 #include <chrono>
17 #include <omp.h>
18 #include "FDTD3dGPU.h"
19 #include "shrUtils.h"
20
21 bool fdtdGPU(float *output, float *input, const float *coeff,
22 const int dimx, const int dimy, const int dimz, const int radius,
23 const int timesteps, const int argc, const char **argv)
24 {
25 bool ok = true;
26 const int outerDimx = dimx + 2 * radius;
27 const int outerDimy = dimy + 2 * radius;
28 const int outerDimz = dimz + 2 * radius;
29 const size_t volumeSize = outerDimx * outerDimy * outerDimz;
30 size_t teamSize[2];
31 size_t threadSize[2];
32
33 // Ensure that the inner data starts on a 128B boundary
34 const int padding = (128 / sizeof(float)) - radius;
35 const size_t paddedVolumeSize = volumeSize + padding;
36
37 float* bufferOut = (float*) malloc (paddedVolumeSize * sizeof(float));
38 float* bufferIn = (float*) malloc (paddedVolumeSize * sizeof(float));
39
40 memcpy(bufferIn + padding, input, volumeSize * sizeof(float));
41 memcpy(bufferOut + padding, input, volumeSize * sizeof(float));
42
43 // Get the maximum work group size
44 size_t userWorkSize = 256;
45
46 // Set the work group size
47 threadSize[0] = k_localWorkX;
48 threadSize[1] = userWorkSize / k_localWorkX;
49 teamSize[0] = (unsigned int)ceil((float)dimx / threadSize[0]);
50 teamSize[1] = (unsigned int)ceil((float)dimy / threadSize[1]);
51
52 int teamX = teamSize[0];
53 int teamY = teamSize[1];
54 int numTeam = teamX * teamY;
55
56 shrLog(" set thread size to %dx%d\n", threadSize[0], threadSize[1]);
57 shrLog(" set team size to %dx%d\n", teamSize[0], teamSize[1]);
58
59 // Execute the FDTD
60 shrLog(" GPU FDTD loop\n");
61
62 #pragma omp target data map(to: bufferIn[0:paddedVolumeSize], \
63 bufferOut[0:paddedVolumeSize], \
64 coeff[0:radius+1])
65 {
66 auto start = std::chrono::steady_clock::now();
67
68 for (int it = 0 ; it < timesteps ; it++)
69 {
70 #pragma omp target teams num_teams(numTeam) thread_limit(userWorkSize)
71 {
72 float tile [localWorkMaxY + 2*k_radius_default][localWorkMaxX + 2*k_radius_default];
73 #pragma omp parallel
74 {
75 bool valid = true;
76 const int ltidx = omp_get_thread_num() % k_localWorkX;
77 const int ltidy = omp_get_thread_num() / k_localWorkX;
78 const int workx = k_localWorkX;
79 const int worky = userWorkSize / k_localWorkX;
80 const int gtidx = (omp_get_team_num() % teamX) * workx + ltidx;
81 const int gtidy = (omp_get_team_num() / teamX) * worky + ltidy;
82
83 const int stride_y = dimx + 2 * k_radius_default;
84 const int stride_z = stride_y * (dimy + 2 * k_radius_default);
85
86 int inputIndex = 0;
87 int outputIndex = 0;
88
89 // Advance inputIndex to start of inner volume
90 inputIndex += k_radius_default * stride_y + k_radius_default + padding;
91
92 // Advance inputIndex to target element
93 inputIndex += gtidy * stride_y + gtidx;
94
95 float infront[k_radius_default];
96 float behind[k_radius_default];
97 float current;
98
99 const int tx = ltidx + k_radius_default;
100 const int ty = ltidy + k_radius_default;
101
102 if (gtidx >= dimx) valid = false;
103 if (gtidy >= dimy) valid = false;
104
105 // For simplicity we assume that the global size is equal to the actual
106 // problem size; since the global size must be a multiple of the local size
107 // this means the problem size must be a multiple of the local size (or
108 // padded to meet this constraint).
109 // Preload the "infront" and "behind" data
110 for (int i = k_radius_default - 2 ; i >= 0 ; i--)
111 {
112 behind[i] = bufferIn[inputIndex];
113 inputIndex += stride_z;
114 }
115
116 current = bufferIn[inputIndex];
117 outputIndex = inputIndex;
118 inputIndex += stride_z;
119
120 for (int i = 0 ; i < k_radius_default ; i++)
121 {
122 infront[i] = bufferIn[inputIndex];
123 inputIndex += stride_z;
124 }
125
126 // Step through the xy-planes
**127 for (int iz = 0 ; iz < dimz ; iz++)**
128 {
129 // Advance the slice (move the thread-front)
130 for (int i = k_radius_default - 1 ; i > 0 ; i--)
131 behind[i] = behind[i - 1];
132 behind[0] = current;
133 current = infront[0];
134 for (int i = 0 ; i < k_radius_default - 1 ; i++)
135 infront[i] = infront[i + 1];
136 infront[k_radius_default - 1] = bufferIn[inputIndex];
137
138 inputIndex += stride_z;
139 outputIndex += stride_z;
140 #pragma omp barrier
141
142 // Note that for the work items on the boundary of the problem, the
143 // supplied index when reading the halo (below) may wrap to the
144 // previous/next row or even the previous/next xy-plane. This is
145 // acceptable since a) we disable the output write for these work
146 // items and b) there is at least one xy-plane before/after the
147 // current plane, so the access will be within bounds.
148
149 // Update the data slice in the local tile
150 // Halo above & below
151 if (ltidy < k_radius_default)
152 {
153 tile[ltidy][tx] = bufferIn[outputIndex - k_radius_default * stride_y];
154 tile[ltidy + worky + k_radius_default][tx] = bufferIn[outputIndex + worky * stride_y];
155 }
156 // Halo left & right
157 if (ltidx < k_radius_default)
158 {
159 tile[ty][ltidx] = bufferIn[outputIndex - k_radius_default];
160 tile[ty][ltidx + workx + k_radius_default] = bufferIn[outputIndex + workx];
161 }
162 tile[ty][tx] = current;
163 #pragma omp barrier
164
165 // Compute the output value
166 float value = coeff[0] * current;
167 for (int i = 1 ; i <= k_radius_default ; i++)
168 {
169 value += coeff[i] * (infront[i-1] + behind[i-1] + tile[ty - i][tx] +
170 tile[ty + i][tx] + tile[ty][tx - i] + tile[ty][tx + i]);
171 }
172
173 // Store the output value
174 if (valid) bufferOut[outputIndex] = value;
175 }
176 }
177 }
178
179 // Toggle the buffers
180 float* tmp = bufferIn;
181 bufferIn = bufferOut;
182 bufferOut = tmp;
183 }
184
185 auto end = std::chrono::steady_clock::now();
186 auto time = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count();
187 printf("Average kernel execution time %f (s)\n", (time * 1e-9f) / timesteps);
188
189 #pragma omp target update from (bufferIn[0:paddedVolumeSize])
190 }
191
192 memcpy(output, bufferIn+padding, volumeSize*sizeof(float));
193 free(bufferIn);
194 free(bufferOut);
195 return ok;
196 }