Another attempt (solved the initial segmentation fault issue) below.
Code:
#include <random>
#include <assert.h>
#include <iostream>
#include "nvcomp/lz4.h"
void execute_example(uint8_t* input_data, const size_t in_bytes)
{
printf("[+] Creating CUDA stream.\n");
cudaStream_t stream;
cudaStreamCreate(&stream);
// First, initialize the data on the host.
printf("[+] Initialialising data on the device.\n");
// compute chunk sizes
printf(" [+] Compute chunk sizes.\n");
size_t* host_uncompressed_bytes;
const size_t chunk_size = 65536;
const size_t batch_size = (in_bytes + chunk_size - 1) / chunk_size;
printf(" [+] chunk_size = %zu\n", chunk_size);
printf(" [+] batch_size = %zu\n", batch_size);
printf(" [+] Allocating memory on the device.\n");
uint8_t* device_input_data;
cudaMalloc(&device_input_data, in_bytes);
cudaMemcpy(device_input_data, input_data, in_bytes, cudaMemcpyHostToDevice);
printf(" [+] Memory address of device_input_data: %p\n", device_input_data);
printf(" [+] Number of bytes: %zu\n", in_bytes);
printf(" [+] Bytes:\n");
// size_t num_bytes_to_print = in_bytes;
uint8_t* host_input_data;
cudaMallocHost(&host_input_data, in_bytes);
cudaMemcpy(host_input_data, device_input_data, in_bytes, cudaMemcpyDeviceToHost);
size_t num_bytes_to_print = 20;
for ( size_t i = 0; i < num_bytes_to_print; i++ )
if ( i == num_bytes_to_print - 1 ) printf("%X\n", host_input_data[i]);
else printf("%02hhX,", host_input_data[i]);
printf("...\n");
printf("[+] Initialialising data on the host.\n");
cudaMallocHost(&host_uncompressed_bytes, sizeof(size_t) * batch_size);
for (size_t i = 0; i < batch_size; ++i) {
if (i + 1 < batch_size) {
host_uncompressed_bytes[i] = chunk_size;
} else {
// last chunk may be smaller
host_uncompressed_bytes[i] = in_bytes - (chunk_size*i);
}
}
printf(" [+] Max bytes in host_uncompressed_bytes array slot: %zu\n", chunk_size);
printf(" [+] Slots in host_uncompressed_ptrs array: %zu\n", batch_size);
printf(" [+] Elements:\n");
for ( size_t i = 0; i < batch_size; i++ )
printf(" host_uncompressed_bytes[%zu]=%zu\n", i, host_uncompressed_bytes[i]);
// Setup an array of pointers to the start of each chunk
uint8_t** host_uncompressed_ptrs;
cudaMallocHost(&host_uncompressed_ptrs, sizeof(size_t) * batch_size);
for (size_t ix_chunk = 0; ix_chunk < batch_size; ix_chunk++) {
host_uncompressed_ptrs[ix_chunk] = device_input_data + chunk_size * ix_chunk;
}
// Setup an array of pointers to the start of each chunk
uint8_t** host_uncompressed_ptrs_read;
cudaMallocHost(&host_uncompressed_ptrs_read, sizeof(size_t) * batch_size);
for (size_t ix_chunk = 0; ix_chunk < batch_size; ix_chunk++) {
host_uncompressed_ptrs_read[ix_chunk] = host_input_data + chunk_size * ix_chunk;
}
printf(" [+] Content #1 (leading 5 and trailing 5 bytes):\n");
for ( size_t i = 0; i < batch_size; i++ )
{
uint8_t leading_values[5] = { 0x00 };
uint8_t trailing_values[5] = { 0x00 };
size_t start_ofs = 0;
size_t end_ofs = 5;
size_t idx = 0;
for ( size_t j = start_ofs; j < end_ofs; j++ )
leading_values[idx++] = host_uncompressed_ptrs_read[i][j];
start_ofs = host_uncompressed_bytes[i] - 5;
end_ofs = host_uncompressed_bytes[i];
idx = 0;
for ( size_t j = start_ofs; j < end_ofs; j++ )
trailing_values[idx++] = host_uncompressed_ptrs_read[i][j];
printf(" host_uncompressed_ptrs_read[%zu] = ", i);
for ( size_t j = 0; j < 5; j++ )
printf("%02hhX ", leading_values[j]);
printf(" . . . ");
for ( size_t j = 0; j < 5; j++ )
printf("%02hhX ", trailing_values[j]);
printf("\n");
}
// Moving the uncompressed data to the device
printf("[+] Allocate memory on the device.\n");
size_t* device_uncompressed_bytes;
void** device_uncompressed_ptrs;
cudaMalloc(&device_uncompressed_bytes, sizeof(size_t) * batch_size);
cudaMalloc(&device_uncompressed_ptrs, sizeof(size_t) * batch_size);
printf("[+] Move the uncompressed data from host to device.\n");
cudaMemcpy(device_uncompressed_bytes, host_uncompressed_bytes, sizeof(size_t) * batch_size, cudaMemcpyHostToDevice);
cudaMemcpy(device_uncompressed_ptrs, host_uncompressed_ptrs, sizeof(size_t) * batch_size, cudaMemcpyHostToDevice);
// Then we need to allocate the temporary workspace and output space needed by the compressor.
printf("[+] Allocate the temporary workspace on the device and output space needed by the compressor.\n");
size_t temp_bytes;
nvcompBatchedLZ4CompressGetTempSize(batch_size, chunk_size, nvcompBatchedLZ4DefaultOpts, &temp_bytes);
void* device_temp_ptr;
cudaMalloc(&device_temp_ptr, temp_bytes);
printf(" [+] Allocated temp_bytes = %zu - should be ( (chunk_size * batch_size) / 2 )\n", temp_bytes);
// get the maxmimum output size for each chunk
printf("[+] Get the maxmimum output size for each chunk.\n");
size_t max_out_bytes;
nvcompBatchedLZ4CompressGetMaxOutputChunkSize(chunk_size, nvcompBatchedLZ4DefaultOpts, &max_out_bytes);
printf(" [+] max_out_bytes = %zu\n", max_out_bytes);
// Next, allocate output space on the device
uint8_t** host_compressed_ptrs;
cudaMallocHost(&host_compressed_ptrs, sizeof(size_t) * batch_size);
for(size_t ix_chunk = 0; ix_chunk < batch_size; ++ix_chunk) {
cudaMalloc(&host_compressed_ptrs[ix_chunk], max_out_bytes);
}
size_t array_address_step = (size_t)(host_compressed_ptrs[1] - host_compressed_ptrs[0]);
printf(" [+] Address step within array (hex): %lX, (dec): %zu\n", array_address_step, array_address_step);
assert(array_address_step >= max_out_bytes);
printf("[+] Allocate device compressed pointers array.\n");
void** device_compressed_ptrs;
cudaMalloc(&device_compressed_ptrs, sizeof(size_t) * batch_size);
cudaMemcpy(device_compressed_ptrs, host_compressed_ptrs, sizeof(size_t) * batch_size, cudaMemcpyHostToDevice);
// allocate space for compressed chunk sizes to be written to
printf("[+] Allocate buffer space for compressed chunk sizes to be written to.\n");
size_t* device_compressed_bytes;
cudaMalloc(&device_compressed_bytes, sizeof(size_t) * batch_size);
// And finally, call the API to compress the data
printf("[+] Call the nvCOMP API to compress the data.\n");
nvcompStatus_t comp_res = nvcompBatchedLZ4CompressAsync(
device_uncompressed_ptrs,
device_uncompressed_bytes,
chunk_size, // The maximum chunk size
batch_size,
device_temp_ptr,
temp_bytes,
device_compressed_ptrs,
device_compressed_bytes,
nvcompBatchedLZ4DefaultOpts,
stream);
if (comp_res != nvcompSuccess)
{
std::cerr << "Failed compression!" << std::endl;
assert(comp_res == nvcompSuccess);
}
cudaStreamSynchronize(stream);
printf("[+] Copy compressed from device to host.\n");
size_t* read_back_compressed_bytes;
cudaMallocHost(&read_back_compressed_bytes, sizeof(size_t) * batch_size);
cudaMemcpy(read_back_compressed_bytes, device_compressed_bytes, sizeof(size_t) * batch_size, cudaMemcpyDeviceToHost);
for (size_t i = 0; i < batch_size; i++)
printf(" read_back_compressed_bytes[%zu]=%zu\n", i, read_back_compressed_bytes[i]);
uint8_t* read_back_compressed_data;
size_t total_compressed_bytes_len = array_address_step * batch_size * sizeof(uint8_t);
printf(" total_compressed_bytes_len = %zu (incl. additional \"padding\" dummy bytes at the end of each slot)\n", total_compressed_bytes_len);
cudaMallocHost(&read_back_compressed_data, total_compressed_bytes_len);
cudaMemcpy(read_back_compressed_data, device_compressed_ptrs, total_compressed_bytes_len, cudaMemcpyDeviceToHost); // Step #1 (copy all bytes starting from from device_compressed_ptrs over read_back_compressed_data)
uint8_t** read_back_compressed_ptrs;
cudaMallocHost(&read_back_compressed_ptrs, sizeof(size_t) * batch_size);
for (size_t ix_chunk = 0; ix_chunk < batch_size; ix_chunk++)
{
read_back_compressed_ptrs[ix_chunk] = read_back_compressed_data + array_address_step * ix_chunk; // Step #2 (map the host_compressed_ptrs over read_back_compressed_data)
}
printf(" [+] Content #2 (leading 5 and trailing 5 bytes):\n");
for (size_t i = 0; i < batch_size; i++)
{
uint8_t leading_values[5] = { 0x00 };
uint8_t trailing_values[5] = { 0x00 };
size_t start_ofs = 0;
size_t end_ofs = 5;
size_t idx = 0;
for (size_t j = start_ofs; j < end_ofs; j++)
leading_values[idx++] = read_back_compressed_ptrs[i][j];
start_ofs = read_back_compressed_bytes[i] - 5;
end_ofs = max_out_bytes;
idx = 0;
for (size_t j = start_ofs; j < read_back_compressed_bytes[i]; j++)
trailing_values[idx++] = read_back_compressed_ptrs[i][j];
printf(" read_back_compressed_ptrs[%zu] = ", i);
for (size_t j = 0; j < 5; j++)
printf("%02hhX ", leading_values[j]);
printf(" . . . ");
for (size_t j = 0; j < 5; j++)
printf("%02hhX ", trailing_values[j]);
printf("\n");
}
/*
// // //
size_t* copied_compressed_bytes;
cudaMalloc(&copied_compressed_bytes, sizeof(size_t) * batch_size);
cudaMemcpy(copied_compressed_bytes, read_back_compressed_bytes, sizeof(size_t) * batch_size, cudaMemcpyHostToDevice);
uint8_t* copied_compressed_data;
cudaMalloc(&copied_compressed_data, total_compressed_bytes_len);
cudaMemcpy(copied_compressed_data, read_back_compressed_data, total_compressed_bytes_len, cudaMemcpyHostToDevice);
// Redirect read_back_compressed_ptrs to copied_compressed_data
for (size_t ix_chunk = 0; ix_chunk < batch_size; ix_chunk++)
{
read_back_compressed_ptrs[ix_chunk] = copied_compressed_data + array_address_step * ix_chunk;
}
void** copied_compressed_ptrs;
cudaMalloc(&copied_compressed_ptrs, sizeof(size_t) * batch_size);
cudaMemcpy(copied_compressed_ptrs, read_back_compressed_ptrs, sizeof(size_t) * batch_size, cudaMemcpyHostToDevice);
// // //
// Decompression can be similarly performed on a batch of multiple compressed input chunks.
// As no metadata is stored with the compressed data, chunks can be re-arranged as well as decompressed
// with other chunks that originally were not compressed in the same batch.
// If we didn't have the uncompressed sizes, we'd need to compute this information here.
// We demonstrate how to do this.
nvcompBatchedLZ4GetDecompressSizeAsync(
copied_compressed_ptrs,
copied_compressed_bytes,
device_uncompressed_bytes,
batch_size,
stream);
// Next, allocate the temporary buffer
size_t decomp_temp_bytes;
nvcompBatchedLZ4DecompressGetTempSize(batch_size, chunk_size, &decomp_temp_bytes);
void * device_decomp_temp;
cudaMalloc(&device_decomp_temp, decomp_temp_bytes);
// allocate statuses
nvcompStatus_t* device_statuses;
cudaMalloc(&device_statuses, sizeof(nvcompStatus_t)*batch_size);
// Also allocate an array to store the actual_uncompressed_bytes.
// Note that we could use nullptr for this. We already have the
// actual sizes computed during the call to nvcompBatchedLZ4GetDecompressSizeAsync.
size_t* device_actual_uncompressed_bytes;
cudaMalloc(&device_actual_uncompressed_bytes, sizeof(size_t)*batch_size);
// And finally, call the decompression routine.
// This decompresses each input, device_compressed_ptrs[i], and places the decompressed
// result in the corresponding output list, device_uncompressed_ptrs[i]. It also writes
// the size of the uncompressed data to device_uncompressed_bytes[i].
nvcompStatus_t decomp_res = nvcompBatchedLZ4DecompressAsync(
copied_compressed_ptrs,
copied_compressed_bytes,
device_uncompressed_bytes,
device_actual_uncompressed_bytes,
batch_size,
device_decomp_temp,
decomp_temp_bytes,
device_uncompressed_ptrs,
device_statuses,
stream);
if (decomp_res != nvcompSuccess)
{
std::cerr << "Failed compression!" << std::endl;
assert(decomp_res == nvcompSuccess);
}
cudaStreamSynchronize(stream);
printf(" [+] Content #3 (leading 5 and trailing 5 bytes):\n");
size_t* feedback_host_uncompressed_bytes;
cudaMallocHost(&feedback_host_uncompressed_bytes, sizeof(size_t) * batch_size);
cudaMemcpy(feedback_host_uncompressed_bytes, device_actual_uncompressed_bytes, sizeof(size_t) * batch_size, cudaMemcpyDeviceToHost);
for (size_t i = 0; i < batch_size; i++)
printf(" feedback_host_uncompressed_bytes[%zu]=%zu\n", i, feedback_host_uncompressed_bytes[i]);
*/
}
int main()
{
// Initialize a random array of chars
const size_t in_bytes = 1000000;
uint8_t* uncompressed_data;
cudaMallocHost(&uncompressed_data, in_bytes);
std::mt19937 random_gen(42);
// char specialization of std::uniform_int_distribution is
// non-standard, and isn't available on MSVC, so use short instead,
// but with the range limited, and then cast below.
std::uniform_int_distribution<short> uniform_dist(0, 255);
for (size_t ix = 0; ix < in_bytes; ++ix) {
uncompressed_data[ix] = static_cast<char>(uniform_dist(random_gen));
}
execute_example(uncompressed_data, in_bytes);
return 0;
}
Output:
[+] Creating CUDA stream.
[+] Initialialising data on the device.
[+] Compute chunk sizes.
[+] chunk_size = 65536
[+] batch_size = 16
[+] Allocating memory on the device.
[+] Memory address of device_input_data: 0x7f440dc00000
[+] Number of bytes: 1000000
[+] Bytes:
5F,CB,F3,2E,BB,C7,99,98,27,72,27,19,0E,75,DD,55,99,24,B5,A6
...
[+] Initialialising data on the host.
[+] Max bytes in host_uncompressed_bytes array slot: 65536
[+] Slots in host_uncompressed_ptrs array: 16
[+] Elements:
host_uncompressed_bytes[0]=65536
host_uncompressed_bytes[1]=65536
host_uncompressed_bytes[2]=65536
host_uncompressed_bytes[3]=65536
host_uncompressed_bytes[4]=65536
host_uncompressed_bytes[5]=65536
host_uncompressed_bytes[6]=65536
host_uncompressed_bytes[7]=65536
host_uncompressed_bytes[8]=65536
host_uncompressed_bytes[9]=65536
host_uncompressed_bytes[10]=65536
host_uncompressed_bytes[11]=65536
host_uncompressed_bytes[12]=65536
host_uncompressed_bytes[13]=65536
host_uncompressed_bytes[14]=65536
host_uncompressed_bytes[15]=16960
[+] Content #1 (leading 5 and trailing 5 bytes):
host_uncompressed_ptrs_read[0] = 5F CB F3 2E BB . . . C1 FA 13 E6 27
host_uncompressed_ptrs_read[1] = E4 14 C9 51 B1 . . . FD E0 0A 75 0D
host_uncompressed_ptrs_read[2] = 57 30 A1 A8 9E . . . CC 30 04 78 54
host_uncompressed_ptrs_read[3] = 70 73 31 8D DE . . . 5D 09 B4 54 FC
host_uncompressed_ptrs_read[4] = 69 DE 3B F2 F8 . . . 97 40 45 90 D4
host_uncompressed_ptrs_read[5] = CF E6 F5 6B 02 . . . 7A 52 D1 F1 7F
host_uncompressed_ptrs_read[6] = 92 EB 2F D5 6F . . . E8 EB B2 86 49
host_uncompressed_ptrs_read[7] = 67 4D 3A F7 33 . . . 53 01 5A 41 7C
host_uncompressed_ptrs_read[8] = 2F B9 87 FB E5 . . . B6 B3 C9 4A 9B
host_uncompressed_ptrs_read[9] = 83 E4 5B 85 BD . . . 5A 58 54 0F 6E
host_uncompressed_ptrs_read[10] = FB A9 EA 41 F0 . . . 49 EB 0A E5 E8
host_uncompressed_ptrs_read[11] = D1 89 AE 15 ED . . . 61 2F C2 8A F5
host_uncompressed_ptrs_read[12] = 31 1B 5F 38 F2 . . . 55 C9 5A 5C 0B
host_uncompressed_ptrs_read[13] = E4 E3 75 A8 12 . . . 65 78 03 88 92
host_uncompressed_ptrs_read[14] = 13 60 03 1F 37 . . . 33 FC 44 18 D6
host_uncompressed_ptrs_read[15] = 6C 83 B6 0B 13 . . . 89 AA A6 A0 37
[+] Allocate memory on the device.
[+] Move the uncompressed data from host to device.
[+] Allocate the temporary workspace on the device and output space needed by the compressor.
[+] Allocated temp_bytes = 524288 - should be ( (chunk_size * batch_size) / 2 )
[+] Get the maxmimum output size for each chunk.
[+] max_out_bytes = 65800
[+] Address step within array (hex): 10200, (dec): 66048
[+] Allocate device compressed pointers array.
[+] Allocate buffer space for compressed chunk sizes to be written to.
[+] Call the nvCOMP API to compress the data.
[+] Copy compressed from device to host.
read_back_compressed_bytes[0]=65794
read_back_compressed_bytes[1]=65794
read_back_compressed_bytes[2]=65794
read_back_compressed_bytes[3]=65794
read_back_compressed_bytes[4]=65794
read_back_compressed_bytes[5]=65794
read_back_compressed_bytes[6]=65794
read_back_compressed_bytes[7]=65794
read_back_compressed_bytes[8]=65794
read_back_compressed_bytes[9]=65794
read_back_compressed_bytes[10]=65794
read_back_compressed_bytes[11]=65793
read_back_compressed_bytes[12]=65794
read_back_compressed_bytes[13]=65794
read_back_compressed_bytes[14]=65794
read_back_compressed_bytes[15]=17028
total_compressed_bytes_len = 1056768 (incl. additional "padding" dummy bytes at the end of each slot)
[+] Content #2 (leading 5 and trailing 5 bytes):
read_back_compressed_ptrs[0] = 00 00 00 00 00 . . . 00 00 00 00 00
read_back_compressed_ptrs[1] = 00 00 00 00 00 . . . 00 00 00 00 00
read_back_compressed_ptrs[2] = 00 00 00 00 00 . . . 00 00 00 00 00
read_back_compressed_ptrs[3] = 00 00 00 00 00 . . . 00 00 00 00 00
read_back_compressed_ptrs[4] = 00 00 00 00 00 . . . 00 00 00 00 00
read_back_compressed_ptrs[5] = 00 00 00 00 00 . . . 00 00 00 00 00
read_back_compressed_ptrs[6] = 00 00 00 00 00 . . . 00 00 00 00 00
read_back_compressed_ptrs[7] = 00 00 00 00 00 . . . 00 00 00 00 00
read_back_compressed_ptrs[8] = 00 00 00 00 00 . . . 00 00 00 00 00
read_back_compressed_ptrs[9] = 00 00 00 00 00 . . . 00 00 00 00 00
read_back_compressed_ptrs[10] = 00 00 00 00 00 . . . 00 00 00 00 00
read_back_compressed_ptrs[11] = 00 00 00 00 00 . . . 00 00 00 00 00
read_back_compressed_ptrs[12] = 00 00 00 00 00 . . . 00 00 00 00 00
read_back_compressed_ptrs[13] = 00 00 00 00 00 . . . 00 00 00 00 00
read_back_compressed_ptrs[14] = 00 00 00 00 00 . . . 00 00 00 00 00
read_back_compressed_ptrs[15] = 00 00 00 00 00 . . . 00 00 00 00 00
And the snippet with my attempt where I try to obtain the compressed bytes:
size_t array_address_step = (size_t)(host_compressed_ptrs[1] - host_compressed_ptrs[0]);
printf(" [+] Address step within array (hex): %lX, (dec): %zu\n", array_address_step, array_address_step);
assert(array_address_step >= max_out_bytes);
printf("[+] Copy compressed from device to host.\n");
size_t* read_back_compressed_bytes;
cudaMallocHost(&read_back_compressed_bytes, sizeof(size_t) * batch_size);
cudaMemcpy(read_back_compressed_bytes, device_compressed_bytes, sizeof(size_t) * batch_size, cudaMemcpyDeviceToHost);
for (size_t i = 0; i < batch_size; i++)
printf(" read_back_compressed_bytes[%zu]=%zu\n", i, read_back_compressed_bytes[i]);
uint8_t* read_back_compressed_data;
size_t total_compressed_bytes_len = array_address_step * batch_size * sizeof(uint8_t);
printf(" total_compressed_bytes_len = %zu (incl. additional \"padding\" dummy bytes at the end of each slot)\n", total_compressed_bytes_len);
cudaMallocHost(&read_back_compressed_data, total_compressed_bytes_len);
cudaMemcpy(read_back_compressed_data, device_compressed_ptrs, total_compressed_bytes_len, cudaMemcpyDeviceToHost); // Step #1 (copy all bytes starting from from device_compressed_ptrs over read_back_compressed_data)
uint8_t** read_back_compressed_ptrs;
cudaMallocHost(&read_back_compressed_ptrs, sizeof(size_t) * batch_size);
for (size_t ix_chunk = 0; ix_chunk < batch_size; ix_chunk++)
{
read_back_compressed_ptrs[ix_chunk] = read_back_compressed_data + array_address_step * ix_chunk; // Step #2 (map the host_compressed_ptrs over read_back_compressed_data)
}