I am working on a kernel that merges a set of images into one base image by taking the per-pixel max across the set of images and writing the result into the final image. I have managed to get the runtime down to 8 ms, which is great; however, when I profile it, Nsight Compute suggests that a further 77% improvement is possible, attributing it to excessive DRAM read sectors.
Here is the kernel code:
/// Folds a stack of images into baseImage by taking the per-channel (byte-wise)
/// maximum. Each thread owns one 4-byte group of channels; the host launch is
/// assumed to use blockDim.x = row width in pixel groups and gridDim.x = rows.
///
/// @param baseImage  in/out accumulator image (device memory, 4-byte aligned)
/// @param images     contiguous array of numImages source images (device memory)
/// @param imageInfo  per-launch metadata: numImages and imageSize (bytes per image)
__global__ void max_subpixel_coaleseced(unsigned char* baseImage, unsigned char* images, imageInfo* imageInfo) {
const int width = blockDim.x;
const int height = gridDim.x;
// Widen to size_t BEFORE multiplying: blockIdx.x * width in 32-bit
// arithmetic can overflow for large images.
const size_t offset = (static_cast<size_t>(blockIdx.x) * width + threadIdx.x) * 4;
const int imageSize = imageInfo->imageSize;
// Bounds check also done in size_t to avoid int overflow of width*height*4.
if (offset < static_cast<size_t>(width) * height * 4) {
    // Load 4 channels in one 32-bit transaction.
    // NOTE(review): requires the buffers to be 4-byte aligned — cudaMalloc
    // returns at least 256-byte-aligned pointers, so this holds for offset 0.
    unsigned int pixel = *(reinterpret_cast<unsigned int*>(baseImage + offset));
    const size_t upperLimit = static_cast<size_t>(imageInfo->numImages) * imageSize;
    // Visit the same pixel group in every image of the stack.
    for (size_t vecOffset = offset; vecOffset < upperLimit; vecOffset += imageSize) {
        unsigned int compareto = *(reinterpret_cast<unsigned int*>(images + vecOffset));
        // Byte-wise max via masking: each channel occupies disjoint bits, so
        // taking the max of the masked values and OR-ing the four results
        // yields the per-channel maximum. (The __vmaxu4 SIMD intrinsic
        // performs this whole expression in a single instruction.)
        pixel = (max((pixel & 0xff000000), (compareto & 0xff000000)) |
                 max((pixel & 0x00ff0000), (compareto & 0x00ff0000)) |
                 max((pixel & 0x0000ff00), (compareto & 0x0000ff00)) |
                 max((pixel & 0x000000ff), (compareto & 0x000000ff))
                );
    }
    // Write the accumulated max back to the base image.
    *reinterpret_cast<unsigned int*>(baseImage + offset) = pixel;
}
}
and here is the memory chart:
Fundamentally, my question is: are these suggested performance gains really achievable? Or is it simply that Nsight Compute recognizes this is a memory-bound task, and the 77% figure is theoretical — i.e., if memory could keep up with the kernel running at 100% compute, it would be that much faster?