Hi there, a question related to cudaMallocPitch(), and a question about device global versus device local memory, and about CPU utilization:
- I am reading an image into gpu global memory (see below, error checking elided for brevity). The kernel does a very simple 3x3 blur on it, placing the blurred pixels in a second chunk of allocated gpu memory. The blurred pixels are read out to the CPU memory and written to a file.
If I run the program once, then comment out the kernel and compile - so the program cannot be putting data into the output device buffer- and run it again, I still get the processed image data back from the device.
It seems that successive calls to cudaMallocPitch() for same number of bytes - in successive runs of the program - return memory at the same base addresses.
Is this correct analysis?
-
I could pull the required rows of the image for a block from gpu global memory into block shared memory and do the processing there. I am thinking of the fast N body simulation presented in one of the CUDA programming PDF files. That seems like a lot of data copying - is this worth the trouble given my one row per thread approach? Would it make more sense if I were processing the image data in subrows (tiles) as is presented in the N body example?
-
The code appears to work, and it appears to do the blur on the GPU (GPU temperature rises when the code is modified to run the kernel in a loop). Yet the CPU utilization is 100% for the program throughout it’s execution. I rigged it such that the jpeg is read in to host memory, copied to device global memory, and then had the kernel execute 100 times, after which the result is copied back to host memory and written to a file. On an 8192x6144 image 100 kernel invocations takes a while, but top showed the program at 100% CPU the entire time.
Why would the CPU be busy ?
Sorry for so many questions, and thanks in advance for any insight.
- Eric
#define ONE_NINTH 0.111111f
int read_jpeg(char *filename, unsigned int *height, unsigned int *width,
void **devinputmem, void **devfiltermem, size_t *pitch) {
JSAMPLE *workdata, *tmpworkdata;
FILE * infile;
void *tmp;
struct jpeg_decompress_struct cinfo;
infile = fopen(filename, “rb”)
jpeg_stdio_src(&cinfo, infile);
jpeg_read_header(&cinfo, TRUE);
jpeg_start_decompress(&cinfo);
*width = cinfo.output_width;
height = cinfo.output_height;
size_t imgbytesize = cinfo.output_width * cinfo.output_components * cinfo.output_height;
size_t imgbytewidth = cinfo.output_width * cinfo.output_components;
// Allocate copy-buffer
cudaMallocHost(&tmp, imgbytesize);
workdata = (JSAMPLE) tmp;
// Allocate space device memory for input and filtered image
cudaMallocPitch(devinputmem, pitch, imgbytewidth, *height);
cudaMemset2D(*devinputmem, *pitch, 0, imgbytewidth, *height);
cudaMallocPitch(devfiltermem, pitch, imgbytewidth, *height);
cudaMemset2D(*devfiltermem, *pitch, 0, imgbytewidth, *height); // ensure previous run’s data is not available
// Samples are got from libjpeg as RGBRGBRGBRGBRGB…
// Ask for the entire image.
tmpworkdata = workdata;
while (cinfo.output_scanline < cinfo.output_height) {
jpeg_read_scanlines(&cinfo, &workdata, 1);
workdata += imgbytewidth;
}
// copy it to the video card
workdata = tmpworkdata;
cudaMemcpy2D( *devinputmem, pitch, (void)workdata, imgbytewidth, imgbytewidth, cinfo.output_height, cudaMemcpyHostToDevice);
// Now the image data is in the device.
jpeg_finish_decompress(&cinfo);
cudaFreeHost( workdata );
jpeg_destroy_decompress(&cinfo);
fclose(infile);
return 0;
}
int save_jpeg(char *filename, void *devfiltermem, size_t pitch, unsigned int W, unsigned int H) {
void * tmp;
JSAMPLE *workdata, tmpworkdata;
FILE outfile;
struct jpeg_compress_struct cinfo;
unsigned int imgbytesize = W * 3 * H;
unsigned int imgbytewidth = W * 3;
// Allocate copy-buffer
cudaMallocHost(&tmp, imgbytesize);
workdata = (JSAMPLE) tmp;
cinfo.err = jpeg_std_error(&jerr);
jpeg_create_compress(&cinfo);
outfile = fopen(filename, “wb”);
jpeg_stdio_dest(&cinfo, outfile);
cinfo.image_width = W; / image width and height, in pixels /
cinfo.image_height = H;
cinfo.input_components = 3; / # of color components per pixel /
cinfo.in_color_space = JCS_RGB; / colorspace of input image /
jpeg_set_defaults(&cinfo);
jpeg_start_compress(&cinfo, TRUE);
cudaMemcpy2D( (void)workdata, cinfo.image_width * cinfo.input_components, devfiltermem, pitch, cinfo.image_width * cinfo.input_components, cinfo.image_height, cudaMemcpyDeviceToHost);
tmpworkdata = workdata;
while (cinfo.next_scanline < cinfo.image_height) {
jpeg_write_scanlines(&cinfo, &workdata, 1);
workdata += imgbytewidth;
}
workdata = tmpworkdata;
cudaFreeHost(workdata);
jpeg_finish_compress(&cinfo);
jpeg_destroy_compress(&cinfo);
fclose(outfile);
return 0;
}
int main(int argc, char **argv) {
void *devin=NULL, *devout=NULL;
unsigned int width=0, height=0;
size_t pitch=0;
char *infile;
char outfile[256];
dim3 g, b;
size_t ns = 0;
if (argc != 3) {
cout << “Usage” << endl << “\t” << argv[0] << " infile.jpg outfile.jpg" << endl << endl;
exit(1);
}
infile = argv[1];
strcpy(outfile, “out-”);
strncat(outfile, argv[2], 251);
// read_jpeg() initializes the target memory (devout) to zero.
read_jpeg(infile, &height, &width, &devin, &devout, &pitch);
b.x = 448; b.y = 1; b.z = 1;
g.x = 2;
// int loop = 0;
// for (loop=0;loop<100;loop++) {
kernel_img_proc<<< g, b, ns >>>(devin, pitch, devout, pitch, width, height);
cudaThreadSynchronize();
// }
save_jpeg(outfile, devout, pitch, width, height);
cudaFree(devin);
cudaFree(devout);
return 0;
}
global void kernel_img_proc(void *in, size_t srcpitch, void *out, size_t destpitch,
unsigned int width, unsigned int height) {
float radius = 3;
unsigned char pin = (unsigned char) in;
unsigned char pout = (unsigned char) out;
// Copy by indexing rows using block index of the block,
// and thread index in the block.
// A processing function needs to know what row is being
// crunched so it can avoid referencing pixels outside the
// edges of the image.
int rownumber = (blockIdx.x*blockDim.x + threadIdx.x);
if (rownumber >= height)
return;
blur_integral_row(pin, srcpitch, pout, destpitch, width, height, rownumber, radius);
}
device void blur_integral_row( unsigned char *in, size_t srcpitch,
unsigned char *out, size_t destpitch,
unsigned int width, unsigned int height,
unsigned int rownumber, float radius) {
in += rownumber * srcpitch;
out += rownumber * destpitch;
if ((rownumber == 0) || (rownumber == (height-1)) ) { // just copy the pixels, do not blur them
for (size_t j=0; j<width; j++) {
( out + 3j ) = ( in + 3j );
( out + 3j + 1 ) = ( in + 3j + 1 );
( out + 3j + 2 ) = ( in + 3j + 2 );
}
return;
}
unsigned char *prev_row_in = in - srcpitch;
unsigned char *next_row_in = in + srcpitch;
for (size_t j=0; j<width; j++) {
( out + 3j ) = blur_integral_pixel(in, j, 0, prev_row_in, next_row_in );
( out + 3j + 1 ) = blur_integral_pixel(in, j, 1, prev_row_in, next_row_in );
( out + 3j + 2 ) = blur_integral_pixel(in, j, 2, prev_row_in, next_row_in );
}
}
// rgb param is 0 or 1 or 2… the offset from the pixel address of the R, G or B component.
device char blur_integral_pixel(unsigned char *row, size_t row_offset,
size_t rgb, unsigned char *prev, unsigned char *next) {
float sum = 0.0f;
// previous row
sum += ( prev + 3row_offset + rgb - 3 );
sum += ( prev + 3row_offset + rgb );
sum += ( prev + 3row_offset + rgb + 3 );
// current row
sum += ( row + 3row_offset + rgb - 3 );
sum += ( row + 3row_offset + rgb );
sum += ( row + 3row_offset + rgb + 3 );
// next row
sum += ( next + 3row_offset + rgb - 3 );
sum += ( next + 3row_offset + rgb );
sum += ( next + 3row_offset + rgb + 3 );
sum *= ONE_NINTH;
return (unsigned char) sum;
}