I wonder how the carry is handled in a simple vector addition (e.g. the one in the SDK example):
// Element-wise float vector addition: C[i] = A[i] + B[i] for every i in [0, N).
//
// Each thread handles at most one element. Blocks are numbered row-major over
// a 2D grid (blockIdx.x varies fastest) and only threadIdx.x is used within a
// block, so the kernel presumably expects 1D thread blocks -- confirm at the
// launch site. Threads whose flat index is >= N do nothing.
__global__ void add_vectors_kernel(float *A, float *B, float *C, int N) {
    // Do the index math in 64 bits. The built-in index variables are unsigned
    // 32-bit, so on a large 2D grid the flat index could wrap and, once stored
    // in an int, turn negative -- which would pass the `< N` guard and index
    // out of bounds.
    const long long block_id = blockIdx.x + (long long)gridDim.x * blockIdx.y;
    const long long thread_id = (long long)blockDim.x * block_id + threadIdx.x;

    // Compute a single element of the result vector (if the element is valid).
    if (thread_id < N) {
        C[thread_id] = A[thread_id] + B[thread_id];
    }
}
// Element-wise byte vector addition: C[i] = (A[i] + B[i]) truncated to 8 bits,
// for every i in [0, N).
//
// Each thread handles at most one element. Blocks are numbered row-major over
// a 2D grid (blockIdx.x varies fastest) and only threadIdx.x is used within a
// block, so the kernel presumably expects 1D thread blocks -- confirm at the
// launch site. Threads whose flat index is >= N do nothing.
__global__ void add_vectors_kernel(unsigned char *A, unsigned char *B, unsigned char *C, int N)
{
    // Do the index math in 64 bits. The built-in index variables are unsigned
    // 32-bit, so on a large 2D grid the flat index could wrap and, once stored
    // in an int, turn negative -- which would pass the `< N` guard and index
    // out of bounds.
    const long long block_id = blockIdx.x + (long long)gridDim.x * blockIdx.y;
    const long long thread_id = (long long)blockDim.x * block_id + threadIdx.x;

    // Compute a single element of the result vector (if the element is valid).
    if (thread_id < N) {
        // Both operands are promoted to int before the addition; storing the
        // sum back into an unsigned char keeps only its low 8 bits, so any
        // carry out of bit 7 is dropped here.
        C[thread_id] = A[thread_id] + B[thread_id];
    }
}
Then the compiler would load A[thread_id] and B[thread_id] into 32-bit registers,
perform a 32-bit addition, and finally write the least-significant 8 bits of the 32-bit result to C[thread_id], so the carry is discarded.
You can try the following code to compute the carry bit:
// Element-wise byte vector addition that also derives the per-element carry:
// C[i] = (A[i] + B[i]) & 255 for every i in [0, N).
//
// The carry bit is computed but this kernel has no output parameter for it,
// so it is not returned to the caller; a kernel that needs the carry has to
// store it somewhere itself.
//
// Each thread handles at most one element. Blocks are numbered row-major over
// a 2D grid (blockIdx.x varies fastest) and only threadIdx.x is used within a
// block, so the kernel presumably expects 1D thread blocks -- confirm at the
// launch site. Threads whose flat index is >= N do nothing.
__global__ void add_vectors_kernel(unsigned char *A, unsigned char *B, unsigned char *C, int N)
{
    // Do the index math in 64 bits. The built-in index variables are unsigned
    // 32-bit, so on a large 2D grid the flat index could wrap and, once stored
    // in an int, turn negative -- which would pass the `< N` guard and index
    // out of bounds.
    const long long block_id = blockIdx.x + (long long)gridDim.x * blockIdx.y;
    const long long thread_id = (long long)blockDim.x * block_id + threadIdx.x;

    // Compute a single element of the result vector (if the element is valid).
    if (thread_id < N) {
        // Widen both bytes to int so the full sum (at most 255 + 255 = 510)
        // is available before truncation.
        const int Areg = A[thread_id];
        const int Breg = B[thread_id];
        const int Creg = Areg + Breg;

        // 1 when the sum does not fit in 8 bits, 0 otherwise.
        const int carry = (Creg > 255);
        (void)carry;  // illustrative only: nothing consumes the carry here

        // Store the low 8 bits of the sum.
        C[thread_id] = static_cast<unsigned char>(Creg & 255);
    }
}
The total minuend ‘c’ is always larger than the total subtrahend ‘b’, but it is of course possible for an individual limb of ‘c’ to be smaller than the corresponding limb of ‘b’.