37#pragma warning( push )
38#pragma warning( disable : 4996)
65 <<
"BitMagic Inverted List Compression Test (c) 2019" << std::endl
66 <<
"-u32in u32-input-file -- raw 32-bit unsigned int file" << std::endl
67 <<
"-bvout bvect-output-file -- bit-vector compressed out file" << std::endl
68 <<
"-svout svect-output-file -- bit-transposed sparse vectors out file" << std::endl
69 <<
"-bvin bvect-input-file -- bit-vector compressed in file" << std::endl
71 <<
"-level N -- compression level to use (up to 5)" << std::endl
72 <<
"-silent (-s) -- no progress print or messages" << std::endl
73 <<
"-verify -- verify compressed version " << std::endl
74 <<
"-decode -- run decode test (in-memory)" << std::endl
75 <<
"-diag (-d) -- print statistics/diagnostics info" << std::endl
76 <<
"-timing (-t) -- evaluate timing/duration of operations" << std::endl
105 for (
int i = 1; i < argc; ++i)
107 std::string arg = argv[i];
108 if ((arg ==
"-h") || (arg ==
"--help"))
114 if (arg ==
"-v" || arg ==
"-verify")
122 if (arg ==
"-decode")
130 if (arg ==
"-l" || arg ==
"-level")
134 const char* lvl = argv[++i];
136 c_level = (unsigned) std::strtoul(lvl, &end, 10);
139 std::cerr <<
"Error parsing -level: range error for "
146 std::cerr <<
"Error: -level requires compression level number" << std::endl;
152 if (arg ==
"-bvout" || arg ==
"--bvout")
160 std::cerr <<
"Error: -bvout requires file name" << std::endl;
165 if (arg ==
"-bvin" || arg ==
"--bvin")
173 std::cerr <<
"Error: -bvin requires file name" << std::endl;
179 if (arg ==
"-svin" || arg ==
"--svin")
187 std::cerr <<
"Error: -svin requires file name" << std::endl;
193 if (arg ==
"-u32in" || arg ==
"--u32in")
201 std::cerr <<
"Error: -u32in requires file name" << std::endl;
207 if (arg ==
"-svout" || arg ==
"--svout")
215 std::cerr <<
"Error: -svout requires file name" << std::endl;
222 if (arg ==
"-silent" || arg ==
"--silent" || arg ==
"-s" || arg ==
"--s")
224 if (arg ==
"-diag" || arg ==
"--diag" || arg ==
"-d" || arg ==
"--d")
226 if (arg ==
"-timing" || arg ==
"--timing" || arg ==
"-t" || arg ==
"--t")
250 typedef typename VT::value_type value_type;
255 fin.read((
char*) &len, std::streamsize(
sizeof(len)));
262 fin.read((
char*) &vec[0], std::streamsize(len*
sizeof(value_type)));
274 typename VT::value_type& min_delta,
275 typename VT::value_type& min_delta_cnt
278 typename VT::value_type md_cnt = min_delta_cnt = 0;
279 auto sz = vec.size();
282 auto i_prev = vec[0];
283 typename VT::value_type md = ~(
typename VT::value_type(0));
284 for (
typename VT::size_type i = 1; i < sz; ++i)
289 typename VT::value_type d1 = val - i_prev;
294 md_cnt += (d1 == md);
298 min_delta_cnt = md_cnt;
304template<
typename VT,
typename BV>
307 if (vec.size() != bv.count())
311 typename BV::enumerator en = bv.first();
312 typename VT::const_iterator it = vec.begin();
313 for (; en.valid(); ++en, ++it)
327 typename BV::statistics st;
329 auto bc = bv.count();
330 auto blocks_count = st.gap_blocks + st.bit_blocks;
331 if (bc <= blocks_count)
333 auto bc_parity = blocks_count * 6;
334 return (bc <= bc_parity);
353 bvs.serialize(bv, sbuf);
355 unsigned bv_size = (unsigned)sbuf.size();
356 bv_file.write((
char*)&bv_size,
sizeof(bv_size));
357 bv_file.write((
char*)sbuf.data(), (std::streamsize)sbuf.size());
359 throw std::runtime_error(
"Error write to bvect out file");
377 sv_bi.add(min_delta);
380 for (
unsigned k = 1; k < vec.size(); ++k)
384 if (delta < min_delta)
385 throw std::runtime_error(
"Input vector validation delta error");
397 unsigned sv_size = (unsigned)sv_lay.
size();
398 sv_file.write((
char*)&sv_size,
sizeof(sv_size));
399 sv_file.write((
char*)sv_lay.
data(), (std::streamsize)sv_lay.
size());
401 throw std::runtime_error(
"Error write to bvect out file");
420 sv_bi.add(min_delta);
423 for (
unsigned k = 1; k < vec.size(); ++k)
427 if (delta < min_delta)
428 throw std::runtime_error(
"Input vector validation delta error");
447 unsigned sv_size = (unsigned)sv_lay.
size();
448 sv_file.write((
char*)&sv_size,
sizeof(sv_size));
449 sv_file.write((
char*)sv_lay.
data(), (std::streamsize)sv_lay.
size());
451 throw std::runtime_error(
"Error write to bvect out file");
461 const std::string& bv_out_fname,
462 const std::string& sv_out_fname)
472 cout <<
"Reading input collection: " << fname << endl;
473 if (!bv_out_fname.empty())
474 cout <<
"Writing to BV collection: " << bv_out_fname << endl;
476 cout <<
"NO BV collection specified" << endl;
477 if (!sv_out_fname.empty())
478 cout <<
"Writing to SV collection: " << sv_out_fname << endl;
480 cout <<
"NO SV collection specified" << endl;
483 cout <<
"Compression level: " <<
c_level << endl;
487 vector<unsigned> vec;
488 std::ifstream fin(fname.c_str(), std::ios::in | std::ios::binary);
491 throw std::runtime_error(
"Cannot open input file");
494 std::ofstream bv_file;
495 if (!bv_out_fname.empty())
497 bv_file.open(bv_out_fname, std::ios::out | std::ios::binary);
499 throw std::runtime_error(
"Cannot open bvect out file");
501 std::ofstream sv_file;
502 if (!sv_out_fname.empty())
504 sv_file.open(sv_out_fname, std::ios::out | std::ios::binary);
506 throw std::runtime_error(
"Cannot open svect out file");
510 fin.seekg(0, std::ios::end);
511 std::streamsize fsize = fin.tellg();
513 fin.seekg(0, std::ios::beg);
531 for (i = 0;
true; ++i)
535 throw std::runtime_error(
"Error reading input file");
536 unsigned min_delta, min_delta_cnt;
541 throw std::runtime_error(
"Input vector validation failed");
542 if (!min_delta || !min_delta_cnt)
543 throw std::runtime_error(
"Input vector validation delta error");
545 min_delta_ints += min_delta_cnt;
547 total_ints += vec.size();
551 bool is_low_card =
false;
552 if (!bv_out_fname.empty())
557 total_low_card_size += sbuf.size();
564 if (!sv_out_fname.empty())
568 sv_size += sv_lay.
size();
590 std::streamsize fpos_curr = fin.tellg();
591 if (fpos_curr == fsize)
596 cout <<
"\r" << i <<
" " << fpos_curr <<
" / " << fsize
597 <<
" ( size=" << vec.size() <<
" ) " << (is_low_card ?
" * " :
" ")
598 <<
" sv=" << sv_cnt <<
" rsc_diff=" << rsc_diff_size
607 cout <<
"Total vectors=" << i << endl;
608 cout <<
" lo-card=" << total_low_card << endl;
609 cout <<
" lo-card size = " << total_low_card_size << endl;
610 cout <<
" SV cnt = " << sv_cnt << endl;
611 cout <<
" SV size = " << sv_size << endl;
612 cout <<
" RSC diff = " << rsc_diff_size << endl;
613 cout <<
"Total ints=" << total_ints << endl;
614 cout <<
" min-deltas = " << min_delta_ints << endl;
616 double min_delta_ratio = double(min_delta_ints) / double(total_ints);
617 cout <<
" min delta ratio = " << std::setprecision(3) << min_delta_ratio << endl;
619 if (!bv_out_fname.empty())
622 cout <<
"BV size = " << bv_size << endl;
625 double bv_bits_per_int = double(bv_size * 8ull - (i*
sizeof(
unsigned))) / double(total_ints);
626 cout <<
"BV Bits per/int = " << std::setprecision(3) << bv_bits_per_int << endl;
631 if (!sv_out_fname.empty())
634 cout <<
"SV size = " << sv_size << endl;
637 double sv_bits_per_int = double(sv_size * 8ull - (i*
sizeof(
unsigned))) / double(total_ints);
638 cout <<
"SV Bits per/int = " << std::setprecision(3) << sv_bits_per_int << endl;
657 bv_file.read((
char*) &len, std::streamsize(
sizeof(len)));
663 sbuf.resize(len,
false);
664 bv_file.read((
char*) sbuf.data(), std::streamsize(len));
678 const std::string& bv_in_fname)
686 cout <<
"Reading input collection: " << fname << endl;
687 if (!bv_in_fname.empty())
688 cout <<
"Reading BV collection: " << bv_in_fname << endl;
690 cout <<
"NO BV collection specified" << endl;
693 vector<unsigned> vec;
694 std::ifstream fin(fname.c_str(), std::ios::in | std::ios::binary);
697 throw std::runtime_error(
"Cannot open input file");
700 std::ifstream bv_file;
701 std::streamsize fsize = 0;
702 if (!bv_in_fname.empty())
704 bv_file.open(bv_in_fname, std::ios::in | std::ios::binary);
706 throw std::runtime_error(
"Cannot open bvect dump file");
707 fin.seekg(0, std::ios::end);
709 fin.seekg(0, std::ios::beg);
721 for (i = 0;
true; ++i)
725 throw std::runtime_error(
"Error reading input file");
727 total_ints += vec.size();
731 if (!bv_in_fname.empty())
738 throw std::runtime_error(
"Vector comparison failed");
742 std::streamsize fpos_curr = fin.tellg();
743 if (fpos_curr == fsize)
748 cout <<
"\r" << fpos_curr <<
"/" << fsize
749 <<
" ( size=" << vec.size() <<
" ) "
755 cout <<
"Verification complete." << endl;
756 cout <<
"Total vectors=" << i << endl;
757 cout <<
"Total ints=" << total_ints << endl;
767 std::ifstream bv_file;
768 std::streamsize fsize;
769 if (!bv_in_fname.empty())
771 bv_file.open(bv_in_fname, std::ios::in | std::ios::binary);
773 throw std::runtime_error(
"Cannot open bvect dump file");
774 bv_file.seekg(0, std::ios::end);
775 fsize = bv_file.tellg();
776 bv_file.seekg(0, std::ios::beg);
780 throw std::runtime_error(
"Cannot open bvect dump file");
789 for (i = 0;
true; ++i)
793 if (!bv_in_fname.empty())
799 std::streamsize fpos_curr = bv_file.tellg();
800 if (fpos_curr == fsize)
805 cout <<
"\r" << fpos_curr <<
"/" << fsize
811 cout <<
"Decode complete." << endl;
812 cout <<
"Total vectors=" << i << endl;
817int main(
int argc,
char *argv[])
835 cout <<
"Compression" << endl;
842 cout <<
"Verification." << endl;
849 cout <<
"Decode test." << endl;
856 std::cout << std::endl <<
"Timings (ms):" << std::endl;
860 catch (std::exception& ex)
862 std::cerr <<
"Error:" << ex.what() << std::endl;
871#pragma warning( pop )
Compressed bit-vector bvector<> container, set algebraic methods, traversal iterators.
#define BM_DECLARE_TEMP_BLOCK(x)
Algorithms for bvector<> (main include)
Serialization / compression of bvector<>. Set theoretical operations on compressed BLOBs.
Sparse container sparse_vector<> for integer types using bit-transposition transform.
Algorithms for bm::sparse_vector.
Compressed sparse container rsc_sparse_vector<> for integer types.
Serialization for sparse_vector<>
Timing utilities for benchmarking (internal)
pre-processor un-defines to avoid global space pollution (internal)
Bitvector: bit-vector container with runtime compression of bits.
bvector< Alloc > & set(size_type n, bool val=true)
Sets bit n if val is true, clears bit n if val is false.
void optimize(bm::word_t *temp_block=0, optmode opt_mode=opt_compress, statistics *stat=0)
Optimize the bit-vector's memory allocation.
bvector_size_type size_type
Utility class to collect performance measurements and statistics.
std::map< std::string, statistics > duration_map_type
test name to duration map
static void print_duration_map(TOut &tout, const duration_map_type &dmap, format fmt=ct_time)
void load_from(const sparse_vector_type &sv_src)
Load compressed vector from a sparse vector (with NULLs)
void optimize(bm::word_t *temp_block=0, typename bvector_type::optmode opt_mode=bvector_type::opt_compress, statistics *stat=0)
run memory optimization for all vector slices
Bit-vector serialization class.
void gap_length_serialization(bool value) BMNOEXCEPT
Set GAP length serialization (serializes GAP levels of the original vector)
void set_compression_level(unsigned clevel) BMNOEXCEPT
Set compression level.
void byte_order_serialization(bool value) BMNOEXCEPT
Set byte-order serialization (for cross platform compatibility)
succinct sparse vector with runtime compression using bit-slicing / transposition method
back_insert_iterator get_back_inserter()
Provide back insert iterator. Back insert iterator implements buffered insertion, which is faster,...
friend back_insert_iterator
void optimize(bm::word_t *temp_block=0, typename bvector_type::optmode opt_mode=bvector_type::opt_compress, typename sparse_vector< Val, BV >::statistics *stat=0)
run memory optimization for all vector planes
@ BM_SORTED
input set is sorted (ascending order)
@ use_null
support "non-assigned" or "NULL" logic
size_t deserialize(BV &bv, const unsigned char *buf, bm::word_t *temp_block=0, const bm::bv_ref_vector< BV > *ref_vect=0)
Bitvector deserialization from a memory BLOB.
void sparse_vector_serialize(const SV &sv, sparse_vector_serial_layout< SV > &sv_layout, bm::word_t *temp_block=0)
Serialize sparse vector into a memory buffer(s) structure.
int main(int argc, char *argv[])
int validate_inp_vec(const VT &vec, typename VT::value_type &min_delta, typename VT::value_type &min_delta_cnt)
Check if input vector is monotonically sorted (true inverted list); along the way it computes a minimal...
static int parse_args(int argc, char *argv[])
bm::sparse_vector< unsigned, bm::bvector<> > sparse_vector_u32
static int read_bvector(std::ifstream &bv_file, bm::bvector<> &bv, bm::serializer< bm::bvector<> >::buffer &sbuf)
read and deserialize bit-vector from the dump file
static void verify_inv_dump_file(const std::string &fname, const std::string &bv_in_fname)
read the input collection sequence and dump file, verify correctness
bool is_super_sparse(const BV &bv)
Debug utility to detect super sparse bit-vectors which probably get bad compression rate.
bm::chrono_taker ::duration_map_type timing_map
static void compress_inv_dump_file(const std::string &fname, const std::string &bv_out_fname, const std::string &sv_out_fname)
read the input collection sequence, write using various compression schemes
int compare_vect(const VT &vec, const BV &bv)
Verification check if integer vector is equivalent to a bit-vector.
void write_as_svector(std::ofstream &sv_file, const VT &vec, unsigned min_delta, bm::sparse_vector_serial_layout< sparse_vector_u32 > &sv_lay)
convert vector into delta coded bit-transposed vector and append to the file
void write_as_rsc_svector(std::ofstream &sv_file, const VT &vec, unsigned min_delta, bm::sparse_vector_serial_layout< rsc_sparse_vector_u32 > &sv_lay)
convert vector into delta coded bit-transposed vector and append to the file
int io_read_u32_coll(std::ifstream &fin, VT &vec)
Read 32-bit vector size-prefix format (length:0, 1, 2, 3, ....)
bool write_as_bvector(std::ofstream &bv_file, const VT &vec, bm::serializer< bm::bvector<> > &bvs, bm::serializer< bm::bvector<> >::buffer &sbuf)
convert vector into bit-vector and append to the file
static void decode_test_dump_file(const std::string &bv_in_fname)
read and decode the compressed dump file
bm::rsc_sparse_vector< unsigned, sparse_vector_u32 > rsc_sparse_vector_u32
const unsigned set_compression_default
Default compression level.
unsigned long long int id64_t
layout class for serialization buffer structure
size_t size() const BMNOEXCEPT
return current serialized size
const unsigned char * data() const BMNOEXCEPT
Return serialization buffer pointer.