mirror of
https://github.com/AlexandreRouma/SDRPlusPlus.git
synced 2026-04-19 14:52:43 +00:00
Bugfix + added M17 decoder to the linux CI
This commit is contained in:
2
core/libcorrect/src/convolutional/sse/CMakeLists.txt
Normal file
2
core/libcorrect/src/convolutional/sse/CMakeLists.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
set(SRCFILES lookup.c convolutional.c encode.c decode.c)
|
||||
add_library(correct-convolutional-sse OBJECT ${SRCFILES})
|
||||
21
core/libcorrect/src/convolutional/sse/convolutional.c
Normal file
21
core/libcorrect/src/convolutional/sse/convolutional.c
Normal file
@@ -0,0 +1,21 @@
|
||||
#include "correct/convolutional/sse/convolutional.h"
|
||||
|
||||
correct_convolutional_sse *correct_convolutional_sse_create(size_t rate,
|
||||
size_t order,
|
||||
const polynomial_t *poly) {
|
||||
correct_convolutional_sse *conv = malloc(sizeof(correct_convolutional_sse));
|
||||
correct_convolutional *init_conv = _correct_convolutional_init(&conv->base_conv, rate, order, poly);
|
||||
if (!init_conv) {
|
||||
free(conv);
|
||||
conv = NULL;
|
||||
}
|
||||
return conv;
|
||||
}
|
||||
|
||||
void correct_convolutional_sse_destroy(correct_convolutional_sse *conv) {
|
||||
if (conv->base_conv.has_init_decode) {
|
||||
oct_lookup_destroy(conv->oct_lookup);
|
||||
}
|
||||
_correct_convolutional_teardown(&conv->base_conv);
|
||||
free(conv);
|
||||
}
|
||||
319
core/libcorrect/src/convolutional/sse/decode.c
Normal file
319
core/libcorrect/src/convolutional/sse/decode.c
Normal file
@@ -0,0 +1,319 @@
|
||||
#include "correct/convolutional/sse/convolutional.h"
|
||||
|
||||
static void convolutional_sse_decode_inner(correct_convolutional_sse *sse_conv, unsigned int sets,
|
||||
const uint8_t *soft) {
|
||||
correct_convolutional *conv = &sse_conv->base_conv;
|
||||
shift_register_t highbit = 1 << (conv->order - 1);
|
||||
unsigned int hist_buf_index = conv->history_buffer->index;
|
||||
unsigned int hist_buf_cap = conv->history_buffer->cap;
|
||||
unsigned int hist_buf_len = conv->history_buffer->len;
|
||||
unsigned int hist_buf_rn_int = conv->history_buffer->renormalize_interval;
|
||||
unsigned int hist_buf_rn_cnt = conv->history_buffer->renormalize_counter;
|
||||
for (unsigned int i = conv->order - 1; i < (sets - conv->order + 1); i++) {
|
||||
distance_t *distances = conv->distances;
|
||||
// lasterrors are the aggregate bit errors for the states of
|
||||
// shiftregister for the previous time slice
|
||||
if (soft) {
|
||||
if (conv->soft_measurement == CORRECT_SOFT_LINEAR) {
|
||||
for (unsigned int j = 0; j < 1 << (conv->rate); j++) {
|
||||
distances[j] =
|
||||
metric_soft_distance_linear(j, soft + i * conv->rate, conv->rate);
|
||||
}
|
||||
} else {
|
||||
for (unsigned int j = 0; j < 1 << (conv->rate); j++) {
|
||||
distances[j] =
|
||||
metric_soft_distance_quadratic(j, soft + i * conv->rate, conv->rate);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
unsigned int out = bit_reader_read(conv->bit_reader, conv->rate);
|
||||
for (unsigned int i = 0; i < 1 << (conv->rate); i++) {
|
||||
distances[i] = metric_distance(i, out);
|
||||
}
|
||||
}
|
||||
oct_lookup_t oct_lookup = sse_conv->oct_lookup;
|
||||
oct_lookup_fill_distance(oct_lookup, distances);
|
||||
|
||||
// a mask to get the high order bit from the shift register
|
||||
unsigned int num_iter = highbit << 1;
|
||||
const distance_t *read_errors = conv->errors->read_errors;
|
||||
// aggregate bit errors for this time slice
|
||||
distance_t *write_errors = conv->errors->write_errors;
|
||||
|
||||
uint8_t *history = conv->history_buffer->history[hist_buf_index];
|
||||
;
|
||||
// walk through all states, ignoring oldest bit
|
||||
// we will track a best register state (path) and the number of bit
|
||||
// errors at that path at this time slice
|
||||
// this loop considers two paths per iteration (high order bit set,
|
||||
// clear)
|
||||
// so, it only runs numstates/2 iterations
|
||||
// we'll update the history for every state and find the path with the
|
||||
// least aggregated bit errors
|
||||
|
||||
// now run the main loop
|
||||
// we calculate 2 sets of 2 register states here (4 states per iter)
|
||||
// this creates 2 sets which share a predecessor, and 2 sets which share
|
||||
// a successor
|
||||
//
|
||||
// the first set definition is the two states that are the same except
|
||||
// for the least order bit
|
||||
// these two share a predecessor because their high n - 1 bits are the
|
||||
// same (differ only by newest bit)
|
||||
//
|
||||
// the second set definition is the two states that are the same except
|
||||
// for the high order bit
|
||||
// these two share a successor because the oldest high order bit will be
|
||||
// shifted out, and the other bits will be present in the successor
|
||||
//
|
||||
shift_register_t highbase = highbit >> 1;
|
||||
shift_register_t oct_highbase = highbase >> 2;
|
||||
for (shift_register_t low = 0, high = highbit, base = 0, oct = 0; high < num_iter;
|
||||
low += 32, high += 32, base += 16, oct += 4) {
|
||||
// shifted-right ancestors
|
||||
// low and low_plus_one share low_past_error
|
||||
// note that they are the same when shifted right by 1
|
||||
// same goes for high and high_plus_one
|
||||
__m128i past_shuffle_mask =
|
||||
_mm_set_epi32(0x07060706, 0x05040504, 0x03020302, 0x01000100);
|
||||
__m128i hist_mask =
|
||||
_mm_set_epi32(0x80808080, 0x80808080, 0x0e0c0a09, 0x07050301);
|
||||
|
||||
// the loop below calculates 64 register states per loop iteration
|
||||
// it does this by packing the 128-bit xmm registers with 8, 16-bit
|
||||
// distances
|
||||
// 4 of these registers hold distances for convolutional shift
|
||||
// register states with the high bit cleared
|
||||
// and 4 hold distances for the corresponding shift register
|
||||
// states with the high bit set
|
||||
// since each xmm register holds 8 distances, this adds up to a
|
||||
// total of 8 * 8 = 64 shift register states
|
||||
for (shift_register_t offset = 0, base_offset = 0; base_offset < 16;
|
||||
offset += 32, base_offset += 16) {
|
||||
// load the past error for the register states with the high
|
||||
// order bit cleared
|
||||
__m128i low_past_error =
|
||||
_mm_loadl_epi64((const __m128i *)(read_errors + base + base_offset));
|
||||
__m128i low_past_error0 =
|
||||
_mm_loadl_epi64((const __m128i *)(read_errors + base + base_offset + 4));
|
||||
__m128i low_past_error1 =
|
||||
_mm_loadl_epi64((const __m128i *)(read_errors + base + base_offset + 8));
|
||||
__m128i low_past_error2 =
|
||||
_mm_loadl_epi64((const __m128i *)(read_errors + base + base_offset + 12));
|
||||
|
||||
// shuffle the low past error
|
||||
// register states that differ only by their low order bit share
|
||||
// a past error
|
||||
low_past_error = _mm_shuffle_epi8(low_past_error, past_shuffle_mask);
|
||||
low_past_error0 = _mm_shuffle_epi8(low_past_error0, past_shuffle_mask);
|
||||
low_past_error1 = _mm_shuffle_epi8(low_past_error1, past_shuffle_mask);
|
||||
low_past_error2 = _mm_shuffle_epi8(low_past_error2, past_shuffle_mask);
|
||||
|
||||
// repeat past error lookup for register states with high order
|
||||
// bit set
|
||||
__m128i high_past_error =
|
||||
_mm_loadl_epi64((const __m128i *)(read_errors + highbase + base + base_offset));
|
||||
__m128i high_past_error0 = _mm_loadl_epi64(
|
||||
(const __m128i *)(read_errors + highbase + base + base_offset + 4));
|
||||
__m128i high_past_error1 = _mm_loadl_epi64(
|
||||
(const __m128i *)(read_errors + highbase + base + base_offset + 8));
|
||||
__m128i high_past_error2 = _mm_loadl_epi64(
|
||||
(const __m128i *)(read_errors + highbase + base + base_offset + 12));
|
||||
|
||||
high_past_error = _mm_shuffle_epi8(high_past_error, past_shuffle_mask);
|
||||
high_past_error0 = _mm_shuffle_epi8(high_past_error0, past_shuffle_mask);
|
||||
high_past_error1 = _mm_shuffle_epi8(high_past_error1, past_shuffle_mask);
|
||||
high_past_error2 = _mm_shuffle_epi8(high_past_error2, past_shuffle_mask);
|
||||
|
||||
// __m128i this_shuffle_mask = (__m128i){0x80800100, 0x80800302,
|
||||
// 0x80800504, 0x80800706};
|
||||
|
||||
// load the opaque oct distance table keys from out loop index
|
||||
distance_oct_key_t low_key = oct_lookup.keys[oct + (base_offset / 4)];
|
||||
distance_oct_key_t low_key0 = oct_lookup.keys[oct + (base_offset / 4) + 1];
|
||||
distance_oct_key_t low_key1 = oct_lookup.keys[oct + (base_offset / 4) + 2];
|
||||
distance_oct_key_t low_key2 = oct_lookup.keys[oct + (base_offset / 4) + 3];
|
||||
|
||||
// load the distances for the register states with high order
|
||||
// bit cleared
|
||||
__m128i low_this_error =
|
||||
_mm_load_si128((const __m128i *)(oct_lookup.distances + low_key));
|
||||
__m128i low_this_error0 =
|
||||
_mm_load_si128((const __m128i *)(oct_lookup.distances + low_key0));
|
||||
__m128i low_this_error1 =
|
||||
_mm_load_si128((const __m128i *)(oct_lookup.distances + low_key1));
|
||||
__m128i low_this_error2 =
|
||||
_mm_load_si128((const __m128i *)(oct_lookup.distances + low_key2));
|
||||
|
||||
// add the distance for this time slice to the past distances
|
||||
__m128i low_error = _mm_add_epi16(low_past_error, low_this_error);
|
||||
__m128i low_error0 = _mm_add_epi16(low_past_error0, low_this_error0);
|
||||
__m128i low_error1 = _mm_add_epi16(low_past_error1, low_this_error1);
|
||||
__m128i low_error2 = _mm_add_epi16(low_past_error2, low_this_error2);
|
||||
|
||||
// repeat oct distance table lookup for registers with high
|
||||
// order bit set
|
||||
distance_oct_key_t high_key =
|
||||
oct_lookup.keys[oct_highbase + oct + (base_offset / 4)];
|
||||
distance_oct_key_t high_key0 =
|
||||
oct_lookup.keys[oct_highbase + oct + (base_offset / 4) + 1];
|
||||
distance_oct_key_t high_key1 =
|
||||
oct_lookup.keys[oct_highbase + oct + (base_offset / 4) + 2];
|
||||
distance_oct_key_t high_key2 =
|
||||
oct_lookup.keys[oct_highbase + oct + (base_offset / 4) + 3];
|
||||
|
||||
__m128i high_this_error =
|
||||
_mm_load_si128((const __m128i *)(oct_lookup.distances + high_key));
|
||||
__m128i high_this_error0 =
|
||||
_mm_load_si128((const __m128i *)(oct_lookup.distances + high_key0));
|
||||
__m128i high_this_error1 =
|
||||
_mm_load_si128((const __m128i *)(oct_lookup.distances + high_key1));
|
||||
__m128i high_this_error2 =
|
||||
_mm_load_si128((const __m128i *)(oct_lookup.distances + high_key2));
|
||||
|
||||
__m128i high_error = _mm_add_epi16(high_past_error, high_this_error);
|
||||
__m128i high_error0 = _mm_add_epi16(high_past_error0, high_this_error0);
|
||||
__m128i high_error1 = _mm_add_epi16(high_past_error1, high_this_error1);
|
||||
__m128i high_error2 = _mm_add_epi16(high_past_error2, high_this_error2);
|
||||
|
||||
// distances for this time slice calculated
|
||||
|
||||
// find the least error between registers who differ only in
|
||||
// their high order bit
|
||||
__m128i min_error = _mm_min_epu16(low_error, high_error);
|
||||
__m128i min_error0 = _mm_min_epu16(low_error0, high_error0);
|
||||
__m128i min_error1 = _mm_min_epu16(low_error1, high_error1);
|
||||
__m128i min_error2 = _mm_min_epu16(low_error2, high_error2);
|
||||
|
||||
_mm_store_si128((__m128i *)(write_errors + low + offset), min_error);
|
||||
_mm_store_si128((__m128i *)(write_errors + low + offset + 8), min_error0);
|
||||
_mm_store_si128((__m128i *)(write_errors + low + offset + 16), min_error1);
|
||||
_mm_store_si128((__m128i *)(write_errors + low + offset + 24), min_error2);
|
||||
|
||||
// generate history bits as (low_error > least_error)
|
||||
// this operation fills each element with all 1s if true and 0s
|
||||
// if false
|
||||
// in other words, we set the history bit to 1 if
|
||||
// the register state with high order bit set was the least
|
||||
// error
|
||||
__m128i hist = _mm_cmpgt_epi16(low_error, min_error);
|
||||
// pack the bits down from 16-bit wide to 8-bit wide to
|
||||
// accomodate history table
|
||||
hist = _mm_shuffle_epi8(hist, hist_mask);
|
||||
|
||||
__m128i hist0 = _mm_cmpgt_epi16(low_error0, min_error0);
|
||||
hist0 = _mm_shuffle_epi8(hist0, hist_mask);
|
||||
|
||||
__m128i hist1 = _mm_cmpgt_epi16(low_error1, min_error1);
|
||||
hist1 = _mm_shuffle_epi8(hist1, hist_mask);
|
||||
|
||||
__m128i hist2 = _mm_cmpgt_epi16(low_error2, min_error2);
|
||||
hist2 = _mm_shuffle_epi8(hist2, hist_mask);
|
||||
|
||||
// write the least error so that the next time slice sees it as
|
||||
// the past error
|
||||
// store the history bits set by cmp and shuffle operations
|
||||
_mm_storel_epi64((__m128i *)(history + low + offset), hist);
|
||||
_mm_storel_epi64((__m128i *)(history + low + offset + 8), hist0);
|
||||
_mm_storel_epi64((__m128i *)(history + low + offset + 16), hist1);
|
||||
_mm_storel_epi64((__m128i *)(history + low + offset + 24), hist2);
|
||||
}
|
||||
}
|
||||
|
||||
// bypass the call to history buffer
|
||||
// we should really make that function inline and remove this below
|
||||
if (hist_buf_len == hist_buf_cap - 1 || hist_buf_rn_cnt == hist_buf_rn_int - 1) {
|
||||
// restore hist buffer state and invoke it
|
||||
conv->history_buffer->len = hist_buf_len;
|
||||
conv->history_buffer->index = hist_buf_index;
|
||||
conv->history_buffer->renormalize_counter = hist_buf_rn_cnt;
|
||||
history_buffer_process(conv->history_buffer, write_errors, conv->bit_writer);
|
||||
// restore our local values
|
||||
hist_buf_len = conv->history_buffer->len;
|
||||
hist_buf_index = conv->history_buffer->index;
|
||||
hist_buf_cap = conv->history_buffer->cap;
|
||||
hist_buf_rn_cnt = conv->history_buffer->renormalize_counter;
|
||||
} else {
|
||||
hist_buf_len++;
|
||||
hist_buf_index++;
|
||||
if (hist_buf_index == hist_buf_cap) {
|
||||
hist_buf_index = 0;
|
||||
}
|
||||
hist_buf_rn_cnt++;
|
||||
}
|
||||
error_buffer_swap(conv->errors);
|
||||
}
|
||||
conv->history_buffer->len = hist_buf_len;
|
||||
conv->history_buffer->index = hist_buf_index;
|
||||
conv->history_buffer->renormalize_counter = hist_buf_rn_cnt;
|
||||
}
|
||||
|
||||
static void _convolutional_sse_decode_init(correct_convolutional_sse *conv,
|
||||
unsigned int min_traceback,
|
||||
unsigned int traceback_length,
|
||||
unsigned int renormalize_interval) {
|
||||
_convolutional_decode_init(&conv->base_conv, min_traceback, traceback_length,
|
||||
renormalize_interval);
|
||||
conv->oct_lookup =
|
||||
oct_lookup_create(conv->base_conv.rate, conv->base_conv.order, conv->base_conv.table);
|
||||
}
|
||||
|
||||
static ssize_t _convolutional_sse_decode(correct_convolutional_sse *sse_conv,
|
||||
size_t num_encoded_bits, size_t num_encoded_bytes,
|
||||
uint8_t *msg, const soft_t *soft_encoded) {
|
||||
correct_convolutional *conv = &sse_conv->base_conv;
|
||||
if (!conv->has_init_decode) {
|
||||
uint64_t max_error_per_input = conv->rate * soft_max;
|
||||
// sse implementation unfortunately uses signed math on our unsigned values
|
||||
// reduces usable distance by /2
|
||||
unsigned int renormalize_interval = (distance_max / 2) / max_error_per_input;
|
||||
_convolutional_sse_decode_init(sse_conv, 5 * conv->order, 100 * conv->order,
|
||||
renormalize_interval);
|
||||
}
|
||||
|
||||
size_t sets = num_encoded_bits / conv->rate;
|
||||
// XXX fix this vvvvvv
|
||||
size_t decoded_len_bytes = num_encoded_bytes;
|
||||
bit_writer_reconfigure(conv->bit_writer, msg, decoded_len_bytes);
|
||||
|
||||
error_buffer_reset(conv->errors);
|
||||
history_buffer_reset(conv->history_buffer);
|
||||
|
||||
// no outputs are generated during warmup
|
||||
convolutional_decode_warmup(conv, sets, soft_encoded);
|
||||
convolutional_sse_decode_inner(sse_conv, sets, soft_encoded);
|
||||
convolutional_decode_tail(conv, sets, soft_encoded);
|
||||
|
||||
history_buffer_flush(conv->history_buffer, conv->bit_writer);
|
||||
|
||||
return bit_writer_length(conv->bit_writer);
|
||||
}
|
||||
|
||||
ssize_t correct_convolutional_sse_decode(correct_convolutional_sse *conv, const uint8_t *encoded,
|
||||
size_t num_encoded_bits, uint8_t *msg) {
|
||||
if (num_encoded_bits % conv->base_conv.rate) {
|
||||
// XXX turn this into an error code
|
||||
// printf("encoded length of message must be a multiple of rate\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
size_t num_encoded_bytes =
|
||||
(num_encoded_bits % 8) ? (num_encoded_bits / 8 + 1) : (num_encoded_bits / 8);
|
||||
bit_reader_reconfigure(conv->base_conv.bit_reader, encoded, num_encoded_bytes);
|
||||
|
||||
return _convolutional_sse_decode(conv, num_encoded_bits, num_encoded_bytes, msg, NULL);
|
||||
}
|
||||
|
||||
ssize_t correct_convolutional_sse_decode_soft(correct_convolutional_sse *conv, const soft_t *encoded,
|
||||
size_t num_encoded_bits, uint8_t *msg) {
|
||||
if (num_encoded_bits % conv->base_conv.rate) {
|
||||
// XXX turn this into an error code
|
||||
// printf("encoded length of message must be a multiple of rate\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
size_t num_encoded_bytes =
|
||||
(num_encoded_bits % 8) ? (num_encoded_bits / 8 + 1) : (num_encoded_bits / 8);
|
||||
|
||||
return _convolutional_sse_decode(conv, num_encoded_bits, num_encoded_bytes, msg, encoded);
|
||||
}
|
||||
9
core/libcorrect/src/convolutional/sse/encode.c
Normal file
9
core/libcorrect/src/convolutional/sse/encode.c
Normal file
@@ -0,0 +1,9 @@
|
||||
#include "correct/convolutional/sse/convolutional.h"
|
||||
|
||||
size_t correct_convolutional_sse_encode_len(correct_convolutional_sse *conv, size_t msg_len) {
|
||||
return correct_convolutional_encode_len(&conv->base_conv, msg_len);
|
||||
}
|
||||
|
||||
size_t correct_convolutional_sse_encode(correct_convolutional_sse *conv, const uint8_t *msg, size_t msg_len, uint8_t *encoded) {
|
||||
return correct_convolutional_encode(&conv->base_conv, msg, msg_len, encoded);
|
||||
}
|
||||
183
core/libcorrect/src/convolutional/sse/lookup.c
Normal file
183
core/libcorrect/src/convolutional/sse/lookup.c
Normal file
@@ -0,0 +1,183 @@
|
||||
#include "correct/convolutional/sse/lookup.h"
|
||||
|
||||
quad_lookup_t quad_lookup_create(unsigned int rate,
|
||||
unsigned int order,
|
||||
const unsigned int *table) {
|
||||
quad_lookup_t quads;
|
||||
|
||||
quads.keys = malloc(sizeof(unsigned int) * (1 << (order - 2)));
|
||||
quads.outputs = calloc((1 << (rate * 4)), sizeof(unsigned int));
|
||||
unsigned int *inv_outputs = calloc((1 << (rate * 4)), sizeof(unsigned int));
|
||||
unsigned int output_counter = 1;
|
||||
// for every (even-numbered) shift register state, find the concatenated output of the state
|
||||
// and the subsequent state that follows it (low bit set). then, check to see if this
|
||||
// concatenated output has a unique key assigned to it already. if not, give it a key.
|
||||
// if it does, retrieve the key. assign this key to the shift register state.
|
||||
for (unsigned int i = 0; i < (1 << (order - 2)); i++) {
|
||||
// first get the concatenated quad of outputs
|
||||
unsigned int out = table[i * 4 + 3];
|
||||
out <<= rate;
|
||||
out |= table[i * 4 + 2];
|
||||
out <<= rate;
|
||||
out |= table[i * 4 + 1];
|
||||
out <<= rate;
|
||||
out |= table[i * 4];
|
||||
|
||||
// does this concatenated output exist in the outputs table yet?
|
||||
if (!inv_outputs[out]) {
|
||||
// doesn't exist, allocate a new key
|
||||
inv_outputs[out] = output_counter;
|
||||
quads.outputs[output_counter] = out;
|
||||
output_counter++;
|
||||
}
|
||||
// set the opaque key for the ith shift register state to the concatenated output entry
|
||||
quads.keys[i] = inv_outputs[out];
|
||||
}
|
||||
quads.outputs_len = output_counter;
|
||||
quads.output_mask = (1 << (rate)) - 1;
|
||||
quads.output_width = rate;
|
||||
quads.distances = calloc(quads.outputs_len, sizeof(distance_quad_t));
|
||||
free(inv_outputs);
|
||||
return quads;
|
||||
}
|
||||
|
||||
void quad_lookup_destroy(quad_lookup_t quads) {
|
||||
free(quads.keys);
|
||||
free(quads.outputs);
|
||||
free(quads.distances);
|
||||
}
|
||||
|
||||
void quad_lookup_fill_distance(quad_lookup_t quads, distance_t *distances) {
|
||||
for (unsigned int i = 1; i < quads.outputs_len; i += 1) {
|
||||
output_quad_t concat_out = quads.outputs[i];
|
||||
unsigned int i_0 = concat_out & quads.output_mask;
|
||||
concat_out >>= quads.output_width;
|
||||
unsigned int i_1 = concat_out & quads.output_mask;
|
||||
concat_out >>= quads.output_width;
|
||||
unsigned int i_2 = concat_out & quads.output_mask;
|
||||
concat_out >>= quads.output_width;
|
||||
unsigned int i_3 = concat_out;
|
||||
|
||||
quads.distances[i] = ((uint64_t)distances[i_3] << 48) | ((uint64_t)distances[i_2] << 32) | (distances[i_1] << 16) | distances[i_0];
|
||||
}
|
||||
}
|
||||
|
||||
distance_oct_key_t oct_lookup_find_key(output_oct_t *outputs, output_oct_t out, size_t num_keys) {
|
||||
for (size_t i = 1; i < num_keys; i++) {
|
||||
if (outputs[i] == out) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
oct_lookup_t oct_lookup_create(unsigned int rate,
|
||||
unsigned int order,
|
||||
const unsigned int *table) {
|
||||
oct_lookup_t octs;
|
||||
|
||||
octs.keys = malloc((1 << (order - 3)) * sizeof(distance_oct_key_t));
|
||||
octs.outputs = malloc(((output_oct_t)2 << rate) * sizeof(uint64_t));
|
||||
output_oct_t *short_outs = calloc(((output_oct_t)2 << rate), sizeof(output_oct_t));
|
||||
size_t outputs_len = 2 << rate;
|
||||
unsigned int output_counter = 1;
|
||||
// for every (even-numbered) shift register state, find the concatenated output of the state
|
||||
// and the subsequent state that follows it (low bit set). then, check to see if this
|
||||
// concatenated output has a unique key assigned to it already. if not, give it a key.
|
||||
// if it does, retrieve the key. assign this key to the shift register state.
|
||||
for (shift_register_t i = 0; i < (1 << (order - 3)); i++) {
|
||||
// first get the concatenated oct of outputs
|
||||
output_oct_t out = table[i * 8 + 7];
|
||||
out <<= rate;
|
||||
out |= table[i * 8 + 6];
|
||||
out <<= rate;
|
||||
out |= table[i * 8 + 5];
|
||||
out <<= rate;
|
||||
out |= table[i * 8 + 4];
|
||||
out <<= rate;
|
||||
out |= table[i * 8 + 3];
|
||||
out <<= rate;
|
||||
out |= table[i * 8 + 2];
|
||||
out <<= rate;
|
||||
out |= table[i * 8 + 1];
|
||||
out <<= rate;
|
||||
out |= table[i * 8];
|
||||
|
||||
distance_oct_key_t key = oct_lookup_find_key(short_outs, out, output_counter);
|
||||
// does this concatenated output exist in the outputs table yet?
|
||||
if (!key) {
|
||||
// doesn't exist, allocate a new key
|
||||
// now build it in expanded form
|
||||
output_oct_t expanded_out = table[i * 8 + 7];
|
||||
expanded_out <<= 8;
|
||||
expanded_out |= table[i * 8 + 6];
|
||||
expanded_out <<= 8;
|
||||
expanded_out |= table[i * 8 + 5];
|
||||
expanded_out <<= 8;
|
||||
expanded_out |= table[i * 8 + 4];
|
||||
expanded_out <<= 8;
|
||||
expanded_out |= table[i * 8 + 3];
|
||||
expanded_out <<= 8;
|
||||
expanded_out |= table[i * 8 + 2];
|
||||
expanded_out <<= 8;
|
||||
expanded_out |= table[i * 8 + 1];
|
||||
expanded_out <<= 8;
|
||||
expanded_out |= table[i * 8];
|
||||
|
||||
if (output_counter == outputs_len) {
|
||||
octs.outputs = realloc(octs.outputs, outputs_len * 2 * sizeof(output_oct_t));
|
||||
short_outs = realloc(short_outs, outputs_len * 2 * sizeof(output_oct_t));
|
||||
outputs_len *= 2;
|
||||
}
|
||||
short_outs[output_counter] = out;
|
||||
octs.outputs[output_counter] = expanded_out;
|
||||
key = output_counter;
|
||||
output_counter++;
|
||||
}
|
||||
// set the opaque key for the ith shift register state to the concatenated output entry
|
||||
// we multiply the key by 2 since the distances are strided by 2
|
||||
octs.keys[i] = key * 2;
|
||||
}
|
||||
free(short_outs);
|
||||
octs.outputs_len = output_counter;
|
||||
octs.output_mask = (1 << (rate)) - 1;
|
||||
octs.output_width = rate;
|
||||
octs.distances = malloc(octs.outputs_len * 2 * sizeof(uint64_t));
|
||||
return octs;
|
||||
}
|
||||
|
||||
void oct_lookup_destroy(oct_lookup_t octs) {
|
||||
free(octs.keys);
|
||||
free(octs.outputs);
|
||||
free(octs.distances);
|
||||
}
|
||||
|
||||
// WIP: sse approach to filling the distance table
|
||||
/*
|
||||
void oct_lookup_fill_distance_sse(oct_lookup_t octs, distance_t *distances) {
|
||||
distance_pair_t *distance_pair = (distance_pair_t*)octs.distances;
|
||||
__v4si index_shuffle_mask = (__v4si){0xffffff00, 0xffffff01, 0xffffff02, 0xffffff03};
|
||||
__m256i dist_shuffle_mask = (__m256i){0x01000504, 0x09080d0c, 0xffffffff, 0xffffffff,
|
||||
0x01000504, 0x09080d0c, 0xffffffff, 0xffffffff};
|
||||
const int dist_permute_mask = 0x0c;
|
||||
for (unsigned int i = 1; i < octs.outputs_len; i += 2) {
|
||||
// big heaping todo vvv
|
||||
// a) we want 16 bit distances GATHERed, not 32 bit
|
||||
// b) we need to load 8 of those distances, not 4
|
||||
__v4si short_concat_index = _mm_loadl_epi64(octs.outputs + 2*i);
|
||||
__v4si short_concat_index0 = _mm_loadl_epi64(octs.outputs + 2*i + 1);
|
||||
__m256i concat_index = _mm256_cvtepu8_epi32(short_concat_index);
|
||||
__m256i concat_index0 = _mm256_cvtepu8_epi32(short_concat_index0);
|
||||
__m256i dist = _mm256_i32gather_epi32(distances, concat_index, sizeof(distance_t));
|
||||
__m256i dist0 = _mm256_i32gather_epi32(distances, concat_index0, sizeof(distance_t));
|
||||
dist = _mm256_shuffle_epi8(dist, dist_shuffle_mask);
|
||||
dist0 = _mm256_shuffle_epi8(dist0, dist_shuffle_mask);
|
||||
dist = __builtin_shufflevector(dist, dist, 0, 5, 0, 0);
|
||||
dist0 = __builtin_shufflevector(dist0, dist0, 0, 5, 0, 0);
|
||||
__v4si packed_dist = _mm256_castsi256_si128(dist);
|
||||
_mm_store_si128(distance_pair + 8 * i, packed_dist);
|
||||
__v4si packed_dist0 = _mm256_castsi256_si128(dist0);
|
||||
_mm_store_si128(distance_pair + 8 * i + 4, packed_dist0);
|
||||
}
|
||||
}
|
||||
*/
|
||||
Reference in New Issue
Block a user