Image processing (divide image)

Welcome,

I want to add value to, a pixel in my image with size 8601x7561 but I can’t load such larg matrix in cuda. Could any one help me?

My code is:

#define ROZMIAR_BLOKU 512

__global__ void addValue(char *cuda_raster1, int height1, int width1,

char *cuda_raster_wyjsciowy)

{

		

	__shared__ char smem[ROZMIAR_BLOKU];

	int x = blockIdx.x + threadIdx.x;

	int y = blockIdx.y + threadIdx.y;

	

	unsigned int index = y*width1 + x;

	unsigned int bindex = threadIdx.y*blockDim.y+threadIdx.x;

	

	smem[bindex] = cuda_raster1[index];

	__syncthreads();

	

	if ( index < (x * y))

   {

		cuda_raster_wyjsciowy[index] = smem[bindex]+15; 

   }

	

}

sorry for my english.

Perhaps I am missing something, but why can’t you operate on a matrix that size?

An array of 8601x7561 char is only 65Mb in size, which should fit in the memory of just about any CUDA capable GPU, and 8601x7561 is much smaller than the block/grid size limits that CUDA imposes.

cuda return empty image.

So what arguments are you using to launch that kernel, and what is the error status from CUDA after the kernel has been launched?

I don’t have any errors. Only error that I have is empty image.

OK. Last try : could you post the host side code you use to call the kernel?

int main(int argc, char * argv[])

{

	TIFF *image1, *image2;

	uint32 width, height, *raster1;

	char *raster2; //raster wejsciowy przekazujemy do CUDA

	char *cuda_raster; //wewnetrzny raster CUDA

	char *cuda_raster_wyjsciowy; //raster wyjsciowy z CUDA

	char *raster_wyj;

	tsize_t stripSize;

	int R, G, B;

	unsigned long imagesize, c, e, n = -1;

	

	image1 = TIFFOpen("lena.tif","r");

	

	TIFFGetField(image1, TIFFTAG_IMAGEWIDTH, &width);

	TIFFGetField(image1, TIFFTAG_IMAGELENGTH, &height);

	

	imagesize = height * width + 1;

	

	raster1 = (uint32 *)malloc(sizeof(uint32) * imagesize);

	TIFFReadRGBAImage(image1, width, height, raster1, 0);

	

	raster2 = (char *)malloc(sizeof(char) * imagesize);

	for(e = height - 1l; e!=-1; e--)

	{

	for(c=0; c < width; c++)

	{

		n++;

		R = TIFFGetR(raster1[e * width + c]);

		G = TIFFGetG(raster1[e * width + c]);

		B = TIFFGetB(raster1[e * width + c]);

		raster2[n] = R;

		/*raster2[4*n+1] = G;

		raster2[4*n+2] = B;

		raster2[4*n+3] = 255;*/

	}

	}

	

	raster_wyj = (char *)calloc(imagesize, sizeof(char));

	cudaMalloc((void**)&cuda_raster, imagesize);

	cudaMalloc((void**)&cuda_raster_wyjsciowy, imagesize);

	int nBlocks = imagesize/ROZMIAR_BLOKU + (imagesize%ROZMIAR_BLOKU == 0 ? 0 : 1);

	cudaMemcpy(cuda_raster, raster2, imagesize, cudaMemcpyHostToDevice);

	addValue<<<nBlocks, ROZMIAR_BLOKU>>>(cuda_raster, height, width, cuda_raster_wyjsciowy);

	cudaThreadSynchronize();

	cudaMemcpy(raster_wyj, cuda_raster_wyjsciowy, imagesize, cudaMemcpyDeviceToHost);

			

	image2 = TIFFOpen("output18.tif", "w");

	

	TIFFSetField(image2, TIFFTAG_IMAGEWIDTH, width);

	TIFFSetField(image2, TIFFTAG_IMAGELENGTH, height);

	TIFFSetField(image2, TIFFTAG_PLANARCONFIG, PLANARCONFIG_CONTIG);

	TIFFSetField(image2, TIFFTAG_PHOTOMETRIC, PHOTOMETRIC_RGB);

	TIFFSetField(image2, TIFFTAG_BITSPERSAMPLE, 8);

	TIFFSetField(image2, TIFFTAG_SAMPLESPERPIXEL, 1);

	TIFFSetField(image2, TIFFTAG_ROWSPERSTRIP, height);

	

	TIFFWriteEncodedStrip(image2, 0 , raster_wyj, width * height);

	

		

	TIFFClose(image2);	

	TIFFClose(image1);

	return 0;

}

by now I’m testing this code for lena.tif

You say you get no errors, but that code contains no error checking at all. You should check the return status of each CUDA API function in this code section:

cudaMalloc((void**)&cuda_raster, imagesize);

	cudaMalloc((void**)&cuda_raster_wyjsciowy, imagesize);

	int nBlocks = imagesize/ROZMIAR_BLOKU + (imagesize%ROZMIAR_BLOKU == 0 ? 0 : 1);

	cudaMemcpy(cuda_raster, raster2, imagesize, cudaMemcpyHostToDevice);

	addValue<<<nBlocks, ROZMIAR_BLOKU>>>(cuda_raster, height, width, cuda_raster_wyjsciowy);

	cudaThreadSynchronize();

	cudaMemcpy(raster_wyj, cuda_raster_wyjsciowy, imagesize, cudaMemcpyDeviceToHost);

My guess is that the kernel is never running at all, but without error checking it is impossible to tell…

Application return : no error. This is a new code:

#include <cuda.h>

#include <stdio.h>

#include <sys/time.h>

#include <tiffio.h>

#define ROZMIAR_BLOKU 64

__global__ void AddValue(char *cuda_raster1, int height1, int width1,

char *cuda_raster_wyjsciowy)

{

	__shared__ char smem[ROZMIAR_BLOKU];

	int x = blockIdx.x + threadIdx.x;

	int y = blockIdx.y + threadIdx.y;

	

	unsigned int index = y*width1 + x;

	unsigned int bindex = threadIdx.y*blockDim.y+threadIdx.x;

	

	smem[bindex] = cuda_raster1[index];

	__syncthreads();

	

	if ( index < (x * y))

   {

		cuda_raster_wyjsciowy[index] = smem[bindex]+15;

   } 

}

int main(int argc, char * argv[])

{

	TIFF *image1, *image2;

	uint32 width, height, *raster1;

	char *raster2; //raster wejsciowy przekazujemy do CUDA

	char *cuda_raster; //wewnetrzny raster CUDA

	char *cuda_raster_wyjsciowy; //raster wyjsciowy z CUDA

	char *raster_wyj;

	tsize_t stripSize;

	int R, G, B;

	unsigned long imagesize, c, e, n = -1;

	

	image1 = TIFFOpen("lena.tif","r");

	

	TIFFGetField(image1, TIFFTAG_IMAGEWIDTH, &width);

	TIFFGetField(image1, TIFFTAG_IMAGELENGTH, &height);

	

	imagesize = height * width + 1;

	

	raster1 = (uint32 *)malloc(sizeof(uint32) * imagesize);

	TIFFReadRGBAImage(image1, width, height, raster1, 0);

	

	raster2 = (char *)malloc(sizeof(char) * imagesize);

	for(e = height - 1l; e!=-1; e--)

	{

	for(c=0; c < width; c++)

	{

		n++;

		R = TIFFGetR(raster1[e * width + c]);

		G = TIFFGetG(raster1[e * width + c]);

		B = TIFFGetB(raster1[e * width + c]);

		raster2[n] = R;

		/*raster2[4*n+1] = G;

		raster2[4*n+2] = B;

		raster2[4*n+3] = 255;*/

	}

	}

	

	raster_wyj = (char *)calloc(imagesize, sizeof(char));

	cudaMalloc((void**)&cuda_raster, imagesize);

	cudaMalloc((void**)&cuda_raster_wyjsciowy, imagesize);

	int nBlocks = imagesize/ROZMIAR_BLOKU + (imagesize%ROZMIAR_BLOKU == 0 ? 0 : 1);

	cudaMemcpy(cuda_raster, raster2, imagesize, cudaMemcpyHostToDevice);

	AddValue<<<nBlocks, ROZMIAR_BLOKU>>>(cuda_raster, height, width, cuda_raster_wyjsciowy);

	cudaThreadSynchronize();

	

	fprintf(stderr, "%s\n", cudaGetErrorString(cudaGetLastError()));

	cudaMemcpy(raster_wyj, cuda_raster_wyjsciowy, imagesize, cudaMemcpyDeviceToHost);

			

	image2 = TIFFOpen("output18.tif", "w");

	

	TIFFSetField(image2, TIFFTAG_IMAGEWIDTH, width);

	TIFFSetField(image2, TIFFTAG_IMAGELENGTH, height);

	TIFFSetField(image2, TIFFTAG_PLANARCONFIG, PLANARCONFIG_CONTIG);

	TIFFSetField(image2, TIFFTAG_PHOTOMETRIC, PHOTOMETRIC_RGB);

	TIFFSetField(image2, TIFFTAG_BITSPERSAMPLE, 8);

	TIFFSetField(image2, TIFFTAG_SAMPLESPERPIXEL, 1);

	TIFFSetField(image2, TIFFTAG_ROWSPERSTRIP, height);

	

	if(TIFFWriteEncodedStrip(image2, 0 , raster_wyj, width * height) == 0)

	{

	fprintf(stderr,"Error writing tif.\n");

	}

	

		

	TIFFClose(image2);	

	TIFFClose(image1);

	return 0;

}

Input image:

input

Output image:

output

That single error checking statement you put in doesn’t help. Check the return status from cudaThreadSynchronize().

How should I check the status? Sample code?

You are starting only 1D blocks, the y value is 0. The if statement doesn’t execute.

Also you should take into account the block dimension.

Ex:[codebox]x=blockIdx.x*blockDim.x+threadIdx.x[/codebox]

How about something like:

cudaError_t eval;

if ( (eval= cudaThreadSynchronize()) !=  cudaSuccess ) {

	fprintf( stderr, "\n Error %s in %s, line %d\n", cudaGetErrorString(eval), __FILE__, __LINE__ );

}

Disclaimer: that code was written out of my head and straight into the post, it may or may not contain typos or small errors that won’t compile. Everything necessary to fix it is in the documentation.

Cuda return “no error”;

doesn’t help

Your sole criteria for whether the CUDA code works or not seems to be the final tiff is correct. Consider this code:

#include <cuda.h>

#include <stdio.h>

#include <sys/time.h>

#include <tiffio.h>

#define ROZMIAR_BLOKU 64

#ifndef gquQ

#define gpuQ( condition ) {if( (condition) != 0 ) { fprintf( stderr, "\n FAILURE %d in %s, line %d\n", condition, __FILE__, __LINE__ );exit( 1 );}}

#endif

int main(int argc, char * argv[])

{

	TIFF *image1, *image2;

	uint32 width, height, *raster1;

	char *raster2; //raster wejsciowy przekazujemy do CUDA

	char *cuda_raster; //wewnetrzny raster CUDA

	char *cuda_raster_wyjsciowy; //raster wyjsciowy z CUDA

	char *raster_wyj;

	tsize_t stripSize;

	int R, G, B;

	long imagesize, c, e, n = -1;

	

	image1 = TIFFOpen("lena.tif","r");

	

	TIFFGetField(image1, TIFFTAG_IMAGEWIDTH, &width);

	TIFFGetField(image1, TIFFTAG_IMAGELENGTH, &height);

	

	imagesize = height * width + 1;

	

	raster1 = (uint32 *)malloc(sizeof(uint32) * imagesize);

	TIFFReadRGBAImage(image1, width, height, raster1, 0);

	

	raster2 = (char *)malloc(sizeof(char) * imagesize);

	for(e = height - 1l; e!=-1; e--)

	{

	for(c=0; c < width; c++)

	{

		n++;

		R = TIFFGetR(raster1[e * width + c]);

		G = TIFFGetG(raster1[e * width + c]);

		B = TIFFGetB(raster1[e * width + c]);

		raster2[n] = R;

		/*raster2[4*n+1] = G;

		raster2[4*n+2] = B;

		raster2[4*n+3] = 255;*/

	}

	}

	

	raster_wyj = (char *)calloc(imagesize, sizeof(char));

	gpuQ( cudaMalloc((void**)&cuda_raster, imagesize) );

	gpuQ( cudaMalloc((void**)&cuda_raster_wyjsciowy, imagesize) );

	gpuQ( cudaMemcpy(cuda_raster, raster2, imagesize, cudaMemcpyHostToDevice) );

	gpuQ( cudaMemcpy(cuda_raster_wyjsciowy, raster2, imagesize, cudaMemcpyHostToDevice) );

	gpuQ( cudaMemcpy(raster_wyj, cuda_raster_wyjsciowy, imagesize, cudaMemcpyDeviceToHost) );

			

	image2 = TIFFOpen("output18.tif", "w");

	

	TIFFSetField(image2, TIFFTAG_IMAGEWIDTH, width);

	TIFFSetField(image2, TIFFTAG_IMAGELENGTH, height);

	TIFFSetField(image2, TIFFTAG_PLANARCONFIG, PLANARCONFIG_CONTIG);

	TIFFSetField(image2, TIFFTAG_PHOTOMETRIC, PHOTOMETRIC_RGB);

	TIFFSetField(image2, TIFFTAG_BITSPERSAMPLE, 8);

	TIFFSetField(image2, TIFFTAG_SAMPLESPERPIXEL, 1);

	TIFFSetField(image2, TIFFTAG_ROWSPERSTRIP, height);

	

	if(TIFFWriteEncodedStrip(image2, 0 , raster_wyj, width * height) == 0)

	{

	fprintf(stderr,"Error writing tif.\n");

	}

	

		

	TIFFClose(image2);	

	TIFFClose(image1);

	return 0;

}

This is directly taken from your earlier post. You would agree that all it does is take the input tiff, copy it to GPU memory, copy it back, and the use your code for writing it to file. I would expect to wind up with a copy of the original image. Would you agree? Does it work for you? (I think it doesn’t).

Code that you write in the last post, return image that is the same like input image. But I want to change pixel value in cuda kernel. When cuda change value i want to write new tif file with changed values.

Yes I understand that. But your image reading/writing code is broken. It does not work. Please compile and run the code I posted above. You will see it produces an unreadable tiff file. Until you fix this, how can you possibly determine whether your CUDA code works or not?

I can open this file in IrfanView because it is monochromatic tif.

I compile you code posted above and I open output file in IrfanView .If you want normal tif you should:

for(e = height - 1l; e!=-1; e--)

	{

	for(c=0; c < width; c++)

	{

		n++;

		R = TIFFGetR(raster1[e * width + c]);

		G = TIFFGetG(raster1[e * width + c]);

		B = TIFFGetB(raster1[e * width + c]);

		raster2[n] = R;

		raster2[4*n+1] = G;

		raster2[4*n+2] = B;

		raster2[4*n+3] = 255;

	}

	}

and

TIFFSetField(image2, TIFFTAG_SAMPLESPERPIXEL, 4);

As clearly pointed out by avidday above, your image reading/writing code is wrong. I happen to have a skeleton for alike code around, so I’m attaching it here (changed so that it demonstrate what avidday suggested you to get right first, and that would be sequence of reading the image file from disk to host memory, copying it over to device memory, copying it back to host memory, and writing it down to disk again):

#include <assert.h>

#include <stdlib.h>

#include <tiffio.h>

#include <cuda.h>

int

main(int argc, char **argv)

{

	assert(argc == 3);

	TIFF		   *iimage = TIFFOpen(argv[1], "r");

	assert(iimage);

	uint32		  width;

	assert(TIFFGetField(iimage, TIFFTAG_IMAGEWIDTH, &width));

	assert(width > 0);

	uint32		  length;

	assert(TIFFGetField(iimage, TIFFTAG_IMAGELENGTH, &length));

	assert(length > 0);

	uint16		  bits_per_sample;

	assert(TIFFGetField(iimage, TIFFTAG_BITSPERSAMPLE, &bits_per_sample) != 0);

	assert(bits_per_sample == 8);

	uint16		  photometric;

	assert(TIFFGetField(iimage, TIFFTAG_PHOTOMETRIC, &photometric));

	assert(photometric == PHOTOMETRIC_RGB);

	uint16		  planar_config;

	assert(TIFFGetField(iimage, TIFFTAG_PLANARCONFIG, &planar_config) != 0);

	uint16		  samples_per_pixel;

	assert(TIFFGetField(iimage, TIFFTAG_SAMPLESPERPIXEL, &samples_per_pixel));

	assert(samples_per_pixel == 3);

	div_t		   pair = std::div(bits_per_sample, 8);

	int			 mul = (pair.rem == 0) ? pair.quot : pair.quot + 1;

	int			 size = width * length * samples_per_pixel * mul * sizeof(char);

	char		   *idata = (char *) malloc(size);

	assert(idata != NULL);

	char		   *curr = idata;

	int			 count = TIFFNumberOfStrips(iimage);

	for (int i = 0; i < count; ++i) {

		tsize_t		 in = TIFFReadEncodedStrip(iimage, i, curr, -1);

		assert(in != -1);

		curr += in;

	}

	TIFFClose(iimage);

	void		   *idata_d;

	assert(cudaMalloc(&idata_d, size) == cudaSuccess);

	assert(cudaMemcpy(idata_d, idata, size, cudaMemcpyHostToDevice) == cudaSuccess);

	void		   *odata_d;

	assert(cudaMalloc(&odata_d, size) == cudaSuccess);

	/* Replace following statement with calling corresponding CUDA kernel. */

	assert(cudaMemcpy(odata_d, idata_d, size, cudaMemcpyDeviceToDevice) == cudaSuccess);

	char		   *odata = (char *) malloc(size);

	assert(odata != NULL);

	assert(cudaMemcpy(odata, odata_d, size, cudaMemcpyDeviceToHost) == cudaSuccess);

	TIFF		   *oimage = TIFFOpen(argv[2], "w");

	assert(oimage);

	assert(TIFFSetField(oimage, TIFFTAG_IMAGEWIDTH, width));

	assert(TIFFSetField(oimage, TIFFTAG_IMAGELENGTH, length));

	assert(TIFFSetField(oimage, TIFFTAG_BITSPERSAMPLE, bits_per_sample));

	assert(TIFFSetField(oimage, TIFFTAG_COMPRESSION, COMPRESSION_DEFLATE));

	assert(TIFFSetField(oimage, TIFFTAG_PHOTOMETRIC, photometric));

	assert(TIFFSetField(oimage, TIFFTAG_SAMPLESPERPIXEL, samples_per_pixel));

	assert(TIFFSetField(oimage, TIFFTAG_PLANARCONFIG, planar_config));

	assert(TIFFSetField(oimage, TIFFTAG_ROWSPERSTRIP, length));

	tsize_t		 on = size;

	assert(TIFFWriteEncodedStrip(oimage, 0, odata, on) == on);

	TIFFClose(oimage);

	free(idata);

	free(odata);

	assert(cudaFree(idata_d) == cudaSuccess);

	assert(cudaFree(odata_d) == cudaSuccess);

}

Now, save this code say into foo.cu, and then compile with (this is for Linux):

nvcc -o foo foo.cu -ltiff

and then run with:

./foo lena.tif lena.copy.tif

and then a copy of original image should be produced in lena.copy.tif. If that OK, then you could proceed with trying to add your CUDA kernel to change pixel value.

(It would be good if you could learn from this example at least that you always have to check for error of any kind of API function you use that could possibly return an error designation.)

Thanks :"> but I can’t run your program. Error on run: Assertion `photometric == 2’.