Yesterday I was fixing a bunch of misalignment errors (nvcc -arch sm_20 causes access violations in shared memory - CUDA Programming and Performance - NVIDIA Developer Forums). With those all corrected, I am now left with a runtime error which only occurs when not using the Nsight memory checker.
At runtime the error causes a break with Unknown Error (30, i.e. cudaErrorUnknown) at the CUT_CHECK_ERROR after executing the kernel which I was fixing yesterday (the >> below marks the breaking line):
CUT_CHECK_ERROR("avoid_pedestrians0 failed");
GPUFLAME_avoid_pedestrians<<<grid, threads, sm_size>>>(d_agents, d_pedestrian_locations, d_pedestrian_location_partition_matrix, d_rand48);
>>CUT_CHECK_ERROR("avoid_pedestrians1 failed");
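As far as I can tell, CUT_CHECK_ERROR (from the old cutil library) just wraps cudaGetLastError (plus a synchronise in debug builds), so an asynchronous kernel fault surfaces at whichever check happens to run next. A stricter check that forces the error to surface at the launch that caused it, at the cost of serialising the stream, is sketched below; this is my own helper, not cutil's macro:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Sketch of a stricter kernel check (not part of cutil): synchronising after
// every launch pins asynchronous faults to the launch that caused them.
#define CHECK_KERNEL(msg) do {                                              \
        cudaError_t err = cudaGetLastError();        /* launch errors */    \
        if (err == cudaSuccess)                                             \
            err = cudaDeviceSynchronize();           /* execution errors */ \
        if (err != cudaSuccess) {                                           \
            fprintf(stderr, "%s: %s (%d) at %s:%d\n", msg,                  \
                    cudaGetErrorString(err), (int)err, __FILE__, __LINE__); \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// usage:
//   GPUFLAME_avoid_pedestrians<<<grid, threads, sm_size>>>(d_agents, d_pedestrian_locations, d_pedestrian_location_partition_matrix, d_rand48);
//   CHECK_KERNEL("avoid_pedestrians failed");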
Using the Nsight analysis (rather than the memory debugger), it picks up a single error:
Call ID|Name |CudaError|Start Time (µs)|Duration (µs)|Context ID|Process ID|Thread ID
2263 |cudaDeviceSynchronize|30 |12,862,697.811 |4,551.503 |31 |6244 |6384
I'm unsure how to trace that error back to its source, and from looking through the Nsight guide it doesn't make this clear either.
Furthermore, Visual Studio reports the error as:
First-chance exception at 0x753dc41f (KernelBase.dll) in PedestrianFLAMEGPU.exe: Microsoft C++ exception: cudaError_enum at memory location 0x0018f778.
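(For reference, the standalone cuda-memcheck tool, run from a command prompt as cuda-memcheck PedestrianFLAMEGPU.exe, should report the kernel, thread and address of the first bad access, though like the Nsight memory checker it changes the timing enough that it may hide this particular error.)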
Thanks
So far I've had no luck finding an automated debugging method to isolate the cause. However, I have found that commenting out the call to this method stops the runtime error from occurring.
/*
 * get first non partitioned pedestrian_location message (first batch load into shared memory)
 */
__device__ xmachine_message_pedestrian_location* get_first_pedestrian_location_message(xmachine_message_pedestrian_location_list* messages, xmachine_message_pedestrian_location_PBM* partition_matrix, float x, float y, float z){
    extern __shared__ int sm_data [];
    char4* message_share = (char4*)&sm_data[0];

    int3 relative_cell = make_int3(-2, -1, -1);
    int cell_index_max = 0;
    int cell_index = 0;
    float3 position = make_float3(x, y, z);
    int3 agent_grid_cell = message_pedestrian_location_grid_position(position);

    if (load_next_pedestrian_location_message(messages, partition_matrix, relative_cell, cell_index_max, agent_grid_cell, cell_index))
    {
        int message_index = SHARE_INDEX(threadIdx.x, sizeof(xmachine_message_pedestrian_location));
        return ((xmachine_message_pedestrian_location*)&message_share[message_index]);
    }
    else
    {
        return NULL; //no messages in range
    }
}
In the above code, each occurrence of char4* was previously char* before the alignment fixes I made yesterday; however, changing them back does not prevent the error.
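For anyone puzzling over the difference: pointer arithmetic scales by the element size, so the same SHARE_INDEX value lands four times further into the buffer through a char4* than through a char*. A host-side sketch of the effect (the SHARE_INDEX definition and message size here are assumptions for illustration; the real ones are in the generated headers):

#include <cstdio>

// Stand-in for CUDA's 4-byte char4 so this compiles as plain C++.
struct char4_t { char x, y, z, w; };

// Assumed definition for illustration only; the real SHARE_INDEX lives in the
// FLAME GPU generated code.
#define SHARE_INDEX(i, s) ((i) * (int)(s))

int main(void)
{
    int sm_data[1024];                    // stand-in for extern __shared__ int sm_data[]
    const int tid = 3;                    // example thread index
    const int msg_size = 48;              // assumed message struct size in bytes
    int idx = SHARE_INDEX(tid, msg_size); // same index value either way

    char*    as_char  = (char*)&sm_data[0];
    char4_t* as_char4 = (char4_t*)&sm_data[0];

    // Pointer arithmetic scales by element size, so the same index lands
    // 4x further into the buffer through the char4 pointer.
    printf("char  offset: %d bytes\n", (int)((char*)&as_char[idx]  - (char*)sm_data)); // 144
    printf("char4 offset: %d bytes\n", (int)((char*)&as_char4[idx] - (char*)sm_data)); // 576
    return 0;
}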
Isolating further by commenting out code, the cause comes down to the call to load_next_pedestrian_location_message (above):
/** load_next_pedestrian_location_message
 * Used to load the next message data to shared memory
 * Idea is to check the current cell index to see if we can simply get a message from the current cell
 * If we are at the end of the current cell then loop until we find the next cell with messages (this way we ignore cells with no messages)
 * @param messages the message list
 * @param partition_matrix the PBM
 * @param relative_cell the relative partition cell position from the agent position
 * @param cell_index_max the maximum index of the current partition cell
 * @param agent_grid_cell the agent's partition cell position
 * @param cell_index the current cell index in agent_grid_cell+relative_cell
 * @return true if a message has been loaded into shared memory, false otherwise
 */
__device__ int load_next_pedestrian_location_message(xmachine_message_pedestrian_location_list* messages, xmachine_message_pedestrian_location_PBM* partition_matrix, int3 relative_cell, int cell_index_max, int3 agent_grid_cell, int cell_index)
{
    extern __shared__ int sm_data [];
    char4* message_share = (char4*)&sm_data[0];

    int move_cell = true;
    cell_index++;

    //see if we need to move to a new partition cell
    if (cell_index < cell_index_max)
        move_cell = false;

    while (move_cell)
    {
        //get the next relative grid position
        if (next_cell2D(&relative_cell))
        {
            //calculate the next cell's grid position and hash
            int3 next_cell_position = agent_grid_cell + relative_cell;
            int next_cell_hash = message_pedestrian_location_hash(next_cell_position);
            //use the hash to calculate the start index
            int cell_index_min = tex1Dfetch(tex_xmachine_message_pedestrian_location_pbm_start, next_cell_hash + d_tex_xmachine_message_pedestrian_location_pbm_start_offset);
            //check for messages in the cell (empty cells have a start index of 0xffffffff)
            if (cell_index_min != 0xffffffff)
            {
                //if there are messages in the cell then update the cell index max value
                cell_index_max = tex1Dfetch(tex_xmachine_message_pedestrian_location_pbm_end, next_cell_hash + d_tex_xmachine_message_pedestrian_location_pbm_end_offset);
                //start from the cell index min
                cell_index = cell_index_min;
                //exit the loop as we have found a valid cell with message data
                move_cell = false;
            }
        }
        else
        {
            //we have exhausted all the neighbouring cells so there are no more messages
            return false;
        }
    }

    //get message data using texture fetch
    xmachine_message_pedestrian_location temp_message;
    temp_message._relative_cell = relative_cell;
    temp_message._cell_index_max = cell_index_max;
    temp_message._cell_index = cell_index;
    temp_message._agent_grid_cell = agent_grid_cell;

    //Using texture cache
    temp_message.x = tex1Dfetch(tex_xmachine_message_pedestrian_location_x, cell_index + d_tex_xmachine_message_pedestrian_location_x_offset);
    temp_message.y = tex1Dfetch(tex_xmachine_message_pedestrian_location_y, cell_index + d_tex_xmachine_message_pedestrian_location_y_offset);
    temp_message.z = tex1Dfetch(tex_xmachine_message_pedestrian_location_z, cell_index + d_tex_xmachine_message_pedestrian_location_z_offset);

    //load it into shared memory (no sync needed as there is no sharing between threads)
    int message_index = SHARE_INDEX(threadIdx.x, sizeof(xmachine_message_pedestrian_location));
    xmachine_message_pedestrian_location* sm_message = ((xmachine_message_pedestrian_location*)&message_share[message_index]);
    sm_message[0] = temp_message;

    return true;
}
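For context, these helpers are consumed from the agent function in the usual FLAME GPU iteration pattern, roughly as follows (a sketch; get_next_pedestrian_location_message is the sibling helper not shown here, and the variable names are illustrative):

// Sketch of the consuming loop inside the agent function. Assumes the
// generated sibling helper get_next_pedestrian_location_message (not shown).
xmachine_message_pedestrian_location* current_message =
    get_first_pedestrian_location_message(pedestrian_location_messages,
                                          partition_matrix,
                                          agent_x, agent_y, agent_z);
while (current_message)
{
    //...accumulate avoidance behaviour from current_message->x/y/z...
    current_message = get_next_pedestrian_location_message(current_message,
                                                           pedestrian_location_messages,
                                                           partition_matrix);
}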
If I remove the second-to-last line of load_next_pedestrian_location_message
sm_message[0] = temp_message;
the runtime error no longer occurs, so I'll assume that my error is caused by a memory exception in 'thread-local memory' on the last few lines, because the CUDA memory checker doesn't check that region.
As message_share is derived from shared memory:
extern __shared__ int sm_data [];
char4* message_share = (char4*)&sm_data[0];
...
xmachine_message_pedestrian_location* sm_message = ((xmachine_message_pedestrian_location*)&message_share[message_index]);
sm_message[0] = temp_message;
Tracing with the memory watcher shows the addresses of the first two sm_data elements (their difference giving my estimate of the buffer length) and the value of message_index:
&sm_data[0]|&sm_data[1]|difference|message_index
0x100      |0x8ad      |1965      |0x04 (4)
0x106      |0x40a      |772       |0x04 (4)
0x0fe      |0x106      |8         |0x124 (292)
From this I can see that the third call (which matches the number of outputs I get from a printf at this line when not running through the debugger) would go out of my estimated bounds: message_index counts char4 elements, so 292 corresponds to a byte offset of 292 * 4 = 1168 bytes, far beyond the 8 bytes between the first two addresses. That appears to be my problem.
However, the address from sm_data[3] at this point matches cell_index_max from message_share, which would leave the write inside the bounds the original author was probably intending; the actual kernel launch requests 3076 bytes of shared memory, and 1168 bytes falls well within that.
So I think I need to work out why the shared memory appears to be 'shrinking', and whether there is a legal way to carry out the existing bypass.
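As a stop-gap while I work that out, a guard like the following (a sketch only; it assumes the launch's sm_size is made visible to the device, e.g. as an extra kernel argument, which the generated code doesn't currently do) would at least turn the silent corruption into a printed diagnostic on sm_20:

// Sketch: bounds-checked version of the final store. Assumes sm_size is
// passed down to the device; device printf requires sm_20 or newer.
__device__ bool store_message_checked(char4* message_share,
                                      int message_index,
                                      const xmachine_message_pedestrian_location* msg,
                                      int sm_size)
{
    int byte_offset = message_index * (int)sizeof(char4);
    if (byte_offset + (int)sizeof(*msg) > sm_size)
    {
        printf("OOB shared write: thread %d, byte offset %d, sm_size %d\n",
               threadIdx.x, byte_offset, sm_size);
        return false;
    }
    *((xmachine_message_pedestrian_location*)&message_share[message_index]) = *msg;
    return true;
}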
I solved this by coincidence while starting from scratch and applying my fixes in order so I could explain them to the original author. Correcting all instances of char* message_share to the actual message type, and then removing the shareOffset method from every one of its uses, has stopped the runtime errors from occurring.
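For anyone hitting the same thing later, the before/after of the final store is roughly this (a sketch; the exact generated code differs):

// Before: raw char4 buffer plus macro-computed element offsets
char4* message_share = (char4*)&sm_data[0];
int message_index = SHARE_INDEX(threadIdx.x, sizeof(xmachine_message_pedestrian_location));
((xmachine_message_pedestrian_location*)&message_share[message_index])[0] = temp_message;

// After: the buffer is typed as the message struct itself, so each thread
// indexes its own slot directly and the compiler handles size and alignment
xmachine_message_pedestrian_location* message_share =
    (xmachine_message_pedestrian_location*)&sm_data[0];
message_share[threadIdx.x] = temp_message;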