Hi, I am having trouble debugging a simple kernel I wrote. This is the first kernel I've written in CUDA, so maybe it's an obvious mistake on my part. Here's the code:
Code
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <cuda_runtime.h>
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

#define SHMEM_SIZE 256*6
__global__ void BtdB(const float* __restrict__ input, float* __restrict__ output){
    int tid = (blockIdx.x * blockDim.x * 6) + threadIdx.x * 6;

    __shared__ float s_d[SHMEM_SIZE];

    if (tid < 338682) {
        for(int i=0; i<6; i++){
            s_d[tid] = input[tid];
        }
        __syncthreads();

        output[tid]   =  4*s_d[tid]   - 5*s_d[tid+2] + s_d[tid+4];
        output[tid+1] = -4*s_d[tid+1] - 4*s_d[tid+2] + s_d[tid+3] + s_d[tid+4];
        output[tid+2] =  4*s_d[tid+1] - 4*s_d[tid+2] - s_d[tid+3] + s_d[tid+4];
        output[tid+3] = -2*s_d[tid+1] -   s_d[tid+2] + 2*s_d[tid+3] + s_d[tid+4];
        output[tid+4] =  2*s_d[tid+1] -   s_d[tid+2] - 2*s_d[tid+3] + s_d[tid+4];
        output[tid+5] =  4*s_d[tid+1] - 5*s_d[tid+3] + s_d[tid+5];
    }
}
void verify(float* input, float* output_gpu){
    float B[36] = { 4, 0,-5, 0, 1, 0,
                    0,-4,-4, 1, 1, 0,
                    0, 4,-4,-1, 1, 0,
                    0,-2,-1, 2, 1, 0,
                    0, 2,-1,-2, 1, 0,
                    0, 4, 0,-5, 0, 1 };

    for(int i=0; i<6; i++){
        for(int j=0; j<6; j++){
            float tmp = 0;
            for(int k=0; k<6; k++){
                tmp = tmp + B[j*6+k]*input[i*6+k];
            }
            assert(output_gpu[i*6+j] == tmp);
        }
    }
}
int main(){
    float *input;
    float *output;

    cudaMallocHost((void**) &input, 338688*sizeof(float));
    cudaMallocHost((void**) &output, 338688*sizeof(float));

    for(int i=0; i<338688; i++){
        input[i] = (float)(rand() % 100);
    }

    float* d_input;
    float* d_output;
    cudaMalloc((void **) &d_input, 338688*sizeof(float));
    cudaMalloc((void **) &d_output, 338688*sizeof(float));

    cudaMemcpy(d_input, input, 338688*sizeof(float), cudaMemcpyHostToDevice);

    int THREADS = 256;
    int BLOCKS = (56448 + THREADS - 1) / THREADS;

    dim3 threads(THREADS);
    dim3 blocks(BLOCKS);

    BtdB<<<blocks, threads>>>(d_input, d_output);
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );

    gpuErrchk( cudaMemcpy(output, d_output, 338688*sizeof(float), cudaMemcpyDeviceToHost) );

    verify(input, output);

    cudaFree(d_input);
    cudaFree(d_output);
    cudaFreeHost(input);
    cudaFreeHost(output);

    return 0;
}
The problem arises when I use shared memory. If I read straight from VRAM without writing to shared memory, then the code executes correctly, so the indexing seems fine.
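For reference, this is roughly the variant that runs cleanly (a sketch for this post; BtdB_noShmem is just a placeholder name, and the indexing is identical to the kernel above):

// Sketch of the VRAM-only variant (no shared-memory staging) that passes verify().
// BtdB_noShmem is a placeholder name for illustration.
__global__ void BtdB_noShmem(const float* __restrict__ input, float* __restrict__ output){
    int tid = (blockIdx.x * blockDim.x * 6) + threadIdx.x * 6;
    if (tid < 338682) {
        output[tid]   =  4*input[tid]   - 5*input[tid+2] + input[tid+4];
        output[tid+1] = -4*input[tid+1] - 4*input[tid+2] + input[tid+3] + input[tid+4];
        output[tid+2] =  4*input[tid+1] - 4*input[tid+2] - input[tid+3] + input[tid+4];
        output[tid+3] = -2*input[tid+1] -   input[tid+2] + 2*input[tid+3] + input[tid+4];
        output[tid+4] =  2*input[tid+1] -   input[tid+2] - 2*input[tid+3] + input[tid+4];
        output[tid+5] =  4*input[tid+1] - 5*input[tid+3] + input[tid+5];
    }
}

There might be some shared memory restriction that I am not aware of. Here is the output of cuda-memcheck: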
cuda-memcheck output
========= Invalid __global__ write of size 4
========= at 0x000003b0 in /*****/Btdb.cu:29:BtdB(float const *, float*)
========= by thread (1,0,0) in block (1,0,0)
========= Address 0x01001818 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2b8) [0x1e5cc8]
========= Host Frame:./Btdb [0xc7bb]
========= Host Frame:./Btdb [0x52495]
========= Host Frame:./Btdb [0x843f]
========= Host Frame:./Btdb [0x82e4]
========= Host Frame:./Btdb [0x8333]
========= Host Frame:./Btdb [0x808b]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf3) [0x270b3]
========= Host Frame:./Btdb [0x7aee]
There are a lot of similar errors (different threads, blocks, etc.). The culprit seems to be this line of code:
for(int i=0; i<6; i++){
    s_d[tid] = input[tid]; <------
}
Memcheck seems confused, though, as it should be a shared write instead of a global one.
Some last things to note: I am using a GTX 860M (compute capability 5.0), driver 455.23.05, on Ubuntu 20.04.1 with CUDA 11.1.
I am using these flags to compile: -G -gencode arch=compute_50,code=sm_50
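For completeness, the full command is essentially this (assuming the source file is named Btdb.cu, matching the memcheck backtrace):

nvcc -G -gencode arch=compute_50,code=sm_50 -o Btdb Btdb.cu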
Thank you for your time.