2D Gaussian on Image returns weird blotches

I’ve recently started writing CUDA code and have been having some issues with a 2D gaussian kernel. I’ve gotten the gaussian to work however there are blotches in the result.

Note: The returned image is at the bottom and I have a second createTexture function that takes in float* rather than uint8_t (since the gaussian kernel is a float) I omitted to keep the code blocks short

__global__ void d_GaussianBlurTexture(cudaTextureObject_t src, unsigned char* dst, int src_width, int src_height , cudaTextureObject_t k, int ksize)
{
  int col = blockIdx.x*blockDim.x + threadIdx.x;
  int row = blockIdx.y*blockDim.y + threadIdx.y;

  //checks if gaussian kernel extends < 0 or > src_width or > src_height
  if (row < (ksize-1)/2 || col < (ksize-1)/2 || row >= src_height-((ksize-1)/2) || col >= src_width-((ksize-1)/2))
    {
      dst[row*src_width+col] = tex1Dfetch<uint8_t>(src,row*src_width+col);
      return;
    }
  
  //if too many blocks called return if row*src_width+col > src_width*src_height
  if (row > src_height || col > src_width) return;
  
  float sum = 0.0;
  
  for(int j = -(ksize-1)/2; j <= (ksize-1)/2; j++)
    {
      for (int i = -(ksize-1)/2; i <= (ksize-1)/2; i++)
	{
	  int color = tex1Dfetch<uint8_t>(src, (row+j)*src_width + (col+i));
	  sum += (color * tex1Dfetch<float>(k,j+((ksize-1)/2)*ksize + i + ((ksize-1)/2)) );
	}
    }

  dst[row*src_width + col] = sum;

}

  cudaTextureObject_t createTexture(uint8_t* data, int N, int bits)
  {
    //Create texture object
    cudaResourceDesc resDesc;
    memset(&resDesc,0,sizeof(resDesc));
    resDesc.resType = cudaResourceTypeLinear;
    resDesc.res.linear.devPtr = data;
    resDesc.res.linear.desc.f = cudaChannelFormatKindUnsigned;
        
    resDesc.res.linear.desc.x = bits;
    resDesc.res.linear.sizeInBytes = N*sizeof(uint8_t);
    
    cudaTextureDesc texDesc;
    memset(&texDesc,0,sizeof(texDesc));
    texDesc.readMode = cudaReadModeElementType;
    
    //create texture object
    cudaTextureObject_t tex;
    cudaCreateTextureObject(&tex, &resDesc,&texDesc,NULL);
    return tex;
  }

Here are my grid and block dimensions and kernel call…

int main()
{

//..memory allocations and memcopy

cudaTextureObject_t k, src;
k = createTexture(d_gaussKernel, gauss_ksize*gauss_ksize,32); //Gaussian kernel of type float
src = createTexture(d_pixels, N, 8); //Grayscale of original image of type uint8_t

dim3 block(16,16);
  dim3 grid(ceil(width/16),ceil(height/16));

d_GaussianBlurTexture<<<grid, block>>>(src,d_resultPixels,width,height,k,gauss_ksize);

//..copy back to CPU and view
}

URLs to images:
https://ibb.co/BL17zCM <–Original
https://ibb.co/s12ttfY <–Result (You can see the blotches on the rails)

These blotches were either the subtle implicit conversion from float to int or overflow. Hard to tell looking back.