I am working with two 1920x1080 images which I need to process to produce two same size outputs.
My final step interleaves results of half width images to get the full width. It is supposed to be a fine grain pixel by pixel interleave but I am getting a block based results.
I’ve tried various approaches including the code below for the interleaver which uses texture references. I’m using linear 1D arrays.
Sequence summary:
Map Direct3d textures
CudaMemcpy to gpu
process
interleave
CudaMemcpy to host
Display
[codebox]#include “stdafx.h”
// includes, cuda
#include <cuda.h>
#include <builtin_types.h>
#include <cuda_runtime_api.h>
#include <cuda_d3d9_interop.h>
// includes, project
#include <cutil_inline.h>
#include <rendercheck_d3d9.h>
#define MAX_EPSILON 10
#define BLOCKDIMX 32
#define BLOCKDIMY 6
#define RGB_LEN 3
#define RGBA_LEN 4
extern int g_width; // =1920
extern int g_height; // =1080
/*
-
Expands frame into 1080x1920 from two textures that are 1080x960 (base+enhanced frames),
-
also convert from float to unsigned char
*/
texture<unsigned char, 1, cudaReadModeElementType> texHalf1;
texture<unsigned char, 1, cudaReadModeElementType> texHalf2;
global void cuda_kernel_arrange(
texture<unsigned char, 1, cudaReadModeElementType> texHalf1,
texture<unsigned char, 1, cudaReadModeElementType> texHalf2,
unsigned char *texout,
int pitch,
int width,
int height)
{
int x = blockIdx.x*blockDim.x + threadIdx.x;
int y = blockIdx.y*blockDim.y + threadIdx.y;
int rowbytes = (RGB_LEN) * (width); //bytes in 1 row of half base/enh frame (960 * 4)
int offset = (y*rowbytes/2) + ((RGB_LEN)*(x/2));
unsigned char *pixel;
// in the case where, due to quantization into grids, we have
// more threads than pixels, skip the threads which don't
// correspond to valid pixels
//if (x >= width || y >= height) return;
// get a pointer to the pixel at (x,y)
pixel = &texout[y*pitch + RGBA_LEN*x];
//pixelin1 = &tex1[offset];
//pixelin2 = &tex2[offset];
//pixel[0] = (pixelin1[0]+pixelin2[0])/2;
//pixel[1] = (pixelin1[1]+pixelin2[1])/2;
//pixel[2] = (pixelin1[2]+pixelin2[2])/2;
//pixel[3] = 1; //alpha
//pixelin1[0] = 0x90;
//pixelin1[1] = 0x90;
//pixelin1[2] = 0x90;
// even rows
if ((y % 2) == 0) {
// even columns
if ((x % 2) == 0) {
pixel[0] = tex1Dfetch(texHalf2, offset);
pixel[1] = tex1Dfetch(texHalf2, offset + 1);
pixel[2] = tex1Dfetch(texHalf2, offset + 2);
pixel[3] = 1; //alpha
}
else { // odd columns
pixel[0] = tex1Dfetch(texHalf1, offset);
pixel[1] = tex1Dfetch(texHalf1, offset + 1);
pixel[2] = tex1Dfetch(texHalf1, offset + 2);
pixel[3] = 1; //alpha
}
}
else { // odd rows
if ((x % 2) == 0 ) { //even columns
pixel[0] = tex1Dfetch(texHalf1, offset);
pixel[1] = tex1Dfetch(texHalf1, offset + 1);
pixel[2] = tex1Dfetch(texHalf1, offset + 2);
pixel[3] = 1; //alpha
}
else { //odd columns
pixel[0] = tex1Dfetch(texHalf2, offset);
pixel[1] = tex1Dfetch(texHalf2, offset + 1);
pixel[2] = tex1Dfetch(texHalf2, offset + 2);
pixel[3] = 1; //alpha
}
}
}
extern “C”
void cuda_arrange(void* tex1, void* tex2, void* texout, int pitch)
{
cudaError_t error = cudaSuccess;
dim3 Db = dim3( BLOCKDIMX, BLOCKDIMY ); // block dimensions are fixed to be 256 threads
dim3 Dg = dim3( g_width/Db.x, g_height/Db.y );
cudaChannelFormatDesc channelDescHalf1 =cudaCreateChannelDesc<unsigned char>();
cudaChannelFormatDesc channelDescHalf2 =cudaCreateChannelDesc<unsigned char>();
cudaBindTexture(0, &texHalf1, tex1, &channelDescHalf1, g_height * g_width/2 * RGB_LEN );
cudaBindTexture(0, &texHalf2, tex2, &channelDescHalf1, g_height * g_width/2 * RGB_LEN );
cuda_kernel_arrange<<<Dg,Db>>>( texHalf1, texHalf2, (unsigned char *)texout, pitch, g_width, g_height );
error = cudaGetLastError();
if (error != cudaSuccess) {
printf("cuda_kernel_arrange() failed to launch error = %d\n", error);
}
}
[/codebox]
Any ideas what I might be doing wrong? Compiling with sm_12 for gtx 275.
Thanks,
Frank