As avidday clearly pointed out above, your image reading/writing code is wrong. I happen to have a skeleton of similar code around, so I'm attaching it here (changed so that it demonstrates what avidday suggested you get right first: the sequence of reading the image file from disk into host memory, copying it over to device memory, copying it back to host memory, and writing it out to disk again):
#include <assert.h>
#include <stdlib.h>
#include <tiffio.h>
#include <cuda_runtime.h>
int
main(int argc, char **argv)
{
assert(argc == 3);
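/* Open the input image and read its parameters; only 8-bit RGB images are handled. */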
TIFF *iimage = TIFFOpen(argv[1], "r");
assert(iimage);
uint32 width;
assert(TIFFGetField(iimage, TIFFTAG_IMAGEWIDTH, &width));
assert(width > 0);
uint32 length;
assert(TIFFGetField(iimage, TIFFTAG_IMAGELENGTH, &length));
assert(length > 0);
uint16 bits_per_sample;
assert(TIFFGetField(iimage, TIFFTAG_BITSPERSAMPLE, &bits_per_sample) != 0);
assert(bits_per_sample == 8);
uint16 photometric;
assert(TIFFGetField(iimage, TIFFTAG_PHOTOMETRIC, &photometric));
assert(photometric == PHOTOMETRIC_RGB);
uint16 planar_config;
assert(TIFFGetField(iimage, TIFFTAG_PLANARCONFIG, &planar_config) != 0);
uint16 samples_per_pixel;
assert(TIFFGetField(iimage, TIFFTAG_SAMPLESPERPIXEL, &samples_per_pixel));
assert(samples_per_pixel == 3);
/* Number of bytes per sample, rounded up in case bits_per_sample is not a multiple of 8. */
div_t pair = div(bits_per_sample, 8);
int mul = (pair.rem == 0) ? pair.quot : pair.quot + 1;
int size = width * length * samples_per_pixel * mul;
char *idata = (char *) malloc(size);
assert(idata != NULL);
char *curr = idata;
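/* Read the image data, strip by strip, into the host buffer. */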
int count = TIFFNumberOfStrips(iimage);
for (int i = 0; i < count; ++i) {
tsize_t in = TIFFReadEncodedStrip(iimage, i, curr, -1);
assert(in != -1);
curr += in;
}
TIFFClose(iimage);
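/* Copy the image data to device memory, and allocate a device buffer for the result. */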
void *idata_d;
assert(cudaMalloc(&idata_d, size) == cudaSuccess);
assert(cudaMemcpy(idata_d, idata, size, cudaMemcpyHostToDevice) == cudaSuccess);
void *odata_d;
assert(cudaMalloc(&odata_d, size) == cudaSuccess);
/* Replace the following statement with a call to your CUDA kernel. */
assert(cudaMemcpy(odata_d, idata_d, size, cudaMemcpyDeviceToDevice) == cudaSuccess);
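/* Copy the result from device memory back to host memory. */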
char *odata = (char *) malloc(size);
assert(odata != NULL);
assert(cudaMemcpy(odata, odata_d, size, cudaMemcpyDeviceToHost) == cudaSuccess);
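/* Write the result out to disk as a single-strip TIFF file. */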
TIFF *oimage = TIFFOpen(argv[2], "w");
assert(oimage);
assert(TIFFSetField(oimage, TIFFTAG_IMAGEWIDTH, width));
assert(TIFFSetField(oimage, TIFFTAG_IMAGELENGTH, length));
assert(TIFFSetField(oimage, TIFFTAG_BITSPERSAMPLE, bits_per_sample));
assert(TIFFSetField(oimage, TIFFTAG_COMPRESSION, COMPRESSION_DEFLATE));
assert(TIFFSetField(oimage, TIFFTAG_PHOTOMETRIC, photometric));
assert(TIFFSetField(oimage, TIFFTAG_SAMPLESPERPIXEL, samples_per_pixel));
assert(TIFFSetField(oimage, TIFFTAG_PLANARCONFIG, planar_config));
assert(TIFFSetField(oimage, TIFFTAG_ROWSPERSTRIP, length));
tsize_t on = size;
assert(TIFFWriteEncodedStrip(oimage, 0, odata, on) == on);
TIFFClose(oimage);
free(idata);
free(odata);
assert(cudaFree(idata_d) == cudaSuccess);
assert(cudaFree(odata_d) == cudaSuccess);
}
Now, save this code into, say, foo.cu, and compile it with (this is for Linux):
nvcc -o foo foo.cu -ltiff
and then run with:
./foo lena.tif lena.copy.tif
and a copy of the original image should be produced in lena.copy.tif. If that works, you can then proceed with adding your CUDA kernel to change the pixel values.
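For example (this is just a sketch, unrelated to whatever processing you actually want to do, and the kernel name and launch configuration here are arbitrary), a kernel that inverts every byte of the interleaved 8-bit RGB data could look like this:
__global__ void
invert(const unsigned char *in, unsigned char *out, int n)
{
    /* One thread per byte of image data. */
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = 255 - in[i];
}
and then, in place of the device-to-device cudaMemcpy() above, you would launch it with something like:
int threads = 256;
int blocks = (size + threads - 1) / threads;
invert<<<blocks, threads>>>((const unsigned char *) idata_d, (unsigned char *) odata_d, size);
assert(cudaGetLastError() == cudaSuccess);
assert(cudaDeviceSynchronize() == cudaSuccess);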
(It would be good if you could learn from this example at least that you always have to check for errors from any API function you use that could possibly return an error indication.)
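Note that assert() is used above only to keep the example short; since asserts (and the calls wrapped inside them) disappear when NDEBUG is defined, in real code you would rather use something like the following macro (CUDA_CHECK is just a name I picked here):
#include <stdio.h>
#define CUDA_CHECK(call) \
    do { \
        cudaError_t err_ = (call); \
        if (err_ != cudaSuccess) { \
            fprintf(stderr, "%s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err_)); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)
and then write, for example, CUDA_CHECK(cudaMalloc(&idata_d, size)); instead of wrapping the call in assert().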