different when bind texture to array or vector?

I meet a very strange issue when I use 1D texture to access data, similar code and data may have different behavior when I bind texture to array or vector.

First, I use some common code binding texture to cuda array, and access texture with tex1D function

template <typename T>

class array

{

  size_t m_width, m_height;

 cudaArray *m_array;    

public:

  __host__ __inline__ 

  array(size_t width = 0, size_t height = 1) 

    : m_width(width), m_height(height), m_array(NULL)

  {

    cudaChannelFormatDesc desc(cudaCreateChannelDesc<T>());

   CUDA_SAFE_CALL( cudaMallocArray( &m_array, &desc, m_width, m_height));

  }

  

  //...

  

  __host__ __inline__ 

  cudaArray *release(void) 

  {

    cudaArray *ptr = m_array;

    m_array = NULL;

    return ptr;

  }  

  __host__ __inline__ 

  void upload_from(const T *data) const

  {

    CUDA_SAFE_CALL( cudaMemcpyToArray( m_array, 0, 0, data, size(), cudaMemcpyHostToDevice));

  }

  __host__ __inline__ 

  void bind_to(const texture<T, 1, cudaReadModeElementType>& tex) const

  {

    CUDA_SAFE_CALL( cudaBindTextureToArray( tex, m_array));

  }

  

  //...

};

template <typename T>

__host__ __inline__ 

cudaArray * load_texture_array(const texture<T, 1, cudaReadModeElementType>& tex,

                               const T* data, size_t count)

{

  cuda::array<T> buf(count);  

  buf.upload_from(data);

  buf.bind_to(tex);

  return buf.release();

}    

That code works very well in my first two kernel function, but it have different behavior in my third kernel function.

When I use tex1D access shift_table texture, it works well; but if I what to access suffix_hash texture, it always return wrong data. The return data have similar value, such as 4 etc.

#define TEXTURE(name) g_tex_##name

#define TEXTURE_ARRAY(name) g_array_##name

#define DEFINE_TEXTURE_ARRAY(name, type) static texture<type, 1> TEXTURE(name); \

                                         static cudaArray * TEXTURE_ARRAY(name) = NULL

#define DEFINE_TEXTURE(name, type) DEFINE_TEXTURE_ARRAY(name, type)

#define TEXTURE_FETCH(tex, x) tex1D(tex, x)

DEFINE_TEXTURE(shift_table, unsigned char);

DEFINE_TEXTURE(prefix_hash, unsigned int);

DEFINE_TEXTURE(suffix_hash, unsigned int);

DEFINE_TEXTURE(pattern_table, int4);

DEFINE_TEXTURE(patterns, unsigned char);

typedef unsigned char shift_t;

typedef unsigned short hash_t;

__device__ __inline__ 

shift_t wm_shift(hash_t hash)

{

  return TEXTURE_FETCH(TEXTURE(shift_table), hash);

}

__device__ __inline__ 

hash_t wm_suffix_hash(hash_t hash)

{

  return TEXTURE_FETCH(TEXTURE(suffix_hash), hash);

}

After I switch 3rd kernel to use vector instead of array, it’s ok.

But my 1st and 2nd kernel will fail even they could works very well with texture binding array.

Both code would work very well in simulator mode.

It sounds crazy, but bother me several days.

Are there anybody could give me some hints? How about the difference between binding texture to array and vector?

Do we need prepare data for CUDA array in different way? Are there any auto alignment by CUDA driver?

btw: I use CUDA 1.1 on Windows 2003R2 with 8800GT

Thanks