Utility to compress test sets of inverted lists (Gov2 corpus) and study compression characteristics and different comression levels
#include <iostream>
#include <chrono>
#include <thread>
#include <time.h>
#include <stdio.h>
#include <cstdlib>
#ifdef _MSC_VER
#pragma warning( push )
#pragma warning( disable : 4996)
#endif
#include <vector>
#include <chrono>
#include <map>
#include "bmdbg.h"
using namespace std;
static
{
std::cout
<< "BitMagic Inverted List Compression Test (c) 2019" << std::endl
<< "-u32in u32-input-file -- raw 32-bit unsigned int file" << std::endl
<< "-bvout bvect-output-file -- bit-vector compressed out file" << std::endl
<< "-svout svect-output-file -- bit-transposed sparse vectors out file" << std::endl
<< "-bvin bvect-input-file -- bit-vector compressed in file" << std::endl
<< std::endl
<< "-level N -- compression level to use (up to 5)" << std::endl
<< "-silent (-s) -- no progress print or messages" << std::endl
<< "-verify -- verify compressed version " << std::endl
<< "-decode -- run decode test (in-memory)" << std::endl
<< "-diag (-d) -- print statistics/diagnostics info" << std::endl
<< "-timing (-t) -- evaluate timing/duration of operations" << std::endl
;
}
static
{
for (int i = 1; i < argc; ++i)
{
std::string arg = argv[i];
if ((arg == "-h") || (arg == "--help"))
{
return 0;
}
if (arg == "-v" || arg == "-verify")
{
if (i + 1 < argc)
{
}
continue;
}
if (arg == "-decode")
{
if (i + 1 < argc)
{
}
continue;
}
if (arg == "-l" || arg == "-level")
{
if (i + 1 < argc)
{
const char* lvl = argv[++i];
char *end;
c_level = (unsigned) std::strtoul(lvl, &end, 10);
if (errno == ERANGE)
{
std::cerr << "Error parsing -level: range error for "
<< lvl<< std::endl;
return 1;
}
}
else
{
std::cerr << "Error: -level requires compression level number" << std::endl;
return 1;
}
continue;
}
if (arg == "-bvout" || arg == "--bvout")
{
if (i + 1 < argc)
{
}
else
{
std::cerr << "Error: -bvout requires file name" << std::endl;
return 1;
}
continue;
}
if (arg == "-bvin" || arg == "--bvin")
{
if (i + 1 < argc)
{
}
else
{
std::cerr << "Error: -bvin requires file name" << std::endl;
return 1;
}
continue;
}
if (arg == "-svin" || arg == "--svin")
{
if (i + 1 < argc)
{
}
else
{
std::cerr << "Error: -svin requires file name" << std::endl;
return 1;
}
continue;
}
if (arg == "-u32in" || arg == "--u32in")
{
if (i + 1 < argc)
{
}
else
{
std::cerr << "Error: -u32in requires file name" << std::endl;
return 1;
}
continue;
}
if (arg == "-svout" || arg == "--svout")
{
if (i + 1 < argc)
{
}
else
{
std::cerr << "Error: -svout requires file name" << std::endl;
return 1;
}
continue;
}
if (arg == "-silent" || arg == "--silent" || arg == "-s" || arg == "--s")
if (arg == "-diag" || arg == "--diag" || arg == "-d" || arg == "--d")
if (arg == "-timing" || arg == "--timing" || arg == "-t" || arg == "--t")
}
return 0;
}
template<class VT>
{
typedef typename VT::value_type value_type;
vec.resize(0);
if (!fin.good())
return -1;
value_type len;
fin.read((char*) &len, std::streamsize(sizeof(len)));
if (!fin.good())
return -1;
if (!len)
return -2;
vec.resize(len);
fin.read((char*) &vec[0], std::streamsize(len*sizeof(value_type)));
if (!fin.good())
return -1;
return 0;
}
template<typename VT>
typename VT::value_type& min_delta,
typename VT::value_type& min_delta_cnt
)
{
typename VT::value_type md_cnt = min_delta_cnt = 0;
auto sz = vec.size();
if (!sz)
return -1;
auto i_prev = vec[0];
typename VT::value_type md = ~(typename VT::value_type(0));
for (typename VT::size_type i = 1; i < sz; ++i)
{
auto val = vec[i];
if (val <= i_prev)
return -2;
typename VT::value_type d1 = val - i_prev;
if (d1 < md)
{
md_cnt = 0; md = d1;
}
md_cnt += (d1 == md);
i_prev = val;
}
min_delta = md;
min_delta_cnt = md_cnt;
return 0;
}
template<typename VT, typename BV>
{
if (vec.size() != bv.count())
{
return -1;
}
typename BV::enumerator en = bv.first();
typename VT::const_iterator it = vec.begin();
for (; en.valid(); ++en, ++it)
{
if (*en != *it)
return -1;
}
return 0;
}
template<typename BV>
{
typename BV::statistics st;
bv.calc_stat(&st);
auto bc = bv.count();
auto blocks_count = st.gap_blocks + st.bit_blocks;
if (bc <= blocks_count)
return true;
auto bc_parity = blocks_count * 6;
return (bc <= bc_parity);
}
template<typename VT>
const VT& vec,
{
bvs.serialize(bv, sbuf);
unsigned bv_size = (unsigned)sbuf.size();
bv_file.write((char*)&bv_size, sizeof(bv_size));
bv_file.write((char*)sbuf.data(), (std::streamsize)sbuf.size());
if (!bv_file.good())
throw std::runtime_error("Error write to bvect out file");
return false;
}
template<typename VT>
const VT& vec,
unsigned min_delta,
{
{
sv_bi.add(min_delta);
sv_bi.add(prev);
for (unsigned k = 1; k < vec.size(); ++k)
{
if (delta < min_delta)
throw std::runtime_error("Input vector validation delta error");
delta -= min_delta;
sv_bi.add(delta);
prev = curr;
}
sv_bi.flush();
}
unsigned sv_size = (unsigned)sv_lay.
size();
sv_file.write((char*)&sv_size, sizeof(sv_size));
sv_file.write((
char*)sv_lay.
data(), (std::streamsize)sv_lay.
size());
if (!sv_file.good())
throw std::runtime_error("Error write to bvect out file");
}
template<typename VT>
const VT& vec,
unsigned min_delta,
{
{
sv_bi.add(min_delta);
sv_bi.add(prev);
for (unsigned k = 1; k < vec.size(); ++k)
{
if (delta < min_delta)
throw std::runtime_error("Input vector validation delta error");
delta -= min_delta;
if (delta)
sv_bi.add(delta);
else
sv_bi.add_null();
prev = curr;
}
sv_bi.flush();
}
unsigned sv_size = (unsigned)sv_lay.
size();
sv_file.write((char*)&sv_size, sizeof(sv_size));
sv_file.write((
char*)sv_lay.
data(), (std::streamsize)sv_lay.
size());
if (!sv_file.good())
throw std::runtime_error("Error write to bvect out file");
}
static
const std::string& bv_out_fname,
const std::string& sv_out_fname)
{
cout << "Reading input collection: " << fname << endl;
if (!bv_out_fname.empty())
cout << "Writing to BV collection: " << bv_out_fname << endl;
else
cout << "NO BV collection specified" << endl;
if (!sv_out_fname.empty())
cout << "Writing to SV collection: " << sv_out_fname << endl;
else
cout << "NO SV collection specified" << endl;
cout <<
"Compression level: " <<
c_level << endl;
vector<unsigned> vec;
std::ifstream fin(fname.c_str(), std::ios::in | std::ios::binary);
if (!fin.good())
{
throw std::runtime_error("Cannot open input file");
}
std::ofstream bv_file;
if (!bv_out_fname.empty())
{
bv_file.open(bv_out_fname, std::ios::out | std::ios::binary);
if (!bv_file.good())
throw std::runtime_error("Cannot open bvect out file");
}
std::ofstream sv_file;
if (!sv_out_fname.empty())
{
sv_file.open(sv_out_fname, std::ios::out | std::ios::binary);
if (!sv_file.good())
throw std::runtime_error("Cannot open svect out file");
}
fin.seekg(0, std::ios::end);
std::streamsize fsize = fin.tellg();
fin.seekg(0, std::ios::beg);
for (i = 0; true; ++i)
{
if (ret != 0)
throw std::runtime_error("Error reading input file");
unsigned min_delta, min_delta_cnt;
{
if (ret != 0)
throw std::runtime_error("Input vector validation failed");
if (!min_delta || !min_delta_cnt)
throw std::runtime_error("Input vector validation delta error");
min_delta_ints += min_delta_cnt;
}
total_ints += vec.size();
bool is_low_card = false;
if (!bv_out_fname.empty())
{
if (is_low_card)
{
total_low_card_size += sbuf.size();
++total_low_card;
}
}
#if 0
if (!sv_out_fname.empty())
{
sv_size += sv_lay.
size();
}
#endif
std::streamsize fpos_curr = fin.tellg();
if (fpos_curr == fsize)
break;
{
cout << "\r" << i << " " << fpos_curr << " / " << fsize
<< " ( size=" << vec.size() << " ) " << (is_low_card ? " * " : " ")
<< " sv=" << sv_cnt << " rsc_diff=" << rsc_diff_size
<< flush;
}
}
cout << endl;
cout << "Total vectors=" << i << endl;
cout << " lo-card=" << total_low_card << endl;
cout << " lo-card size = " << total_low_card_size << endl;
cout << " SV cnt = " << sv_cnt << endl;
cout << " SV size = " << sv_size << endl;
cout << " RSC diff = " << rsc_diff_size << endl;
cout << "Total ints=" << total_ints << endl;
cout << " min-deltas = " << min_delta_ints << endl;
{
double min_delta_ratio = double(min_delta_ints) / double(total_ints);
cout << " min delta ratio = " << std::setprecision(3) << min_delta_ratio << endl;
}
if (!bv_out_fname.empty())
{
cout << "BV size = " << bv_size << endl;
double bv_bits_per_int = double(bv_size * 8ull - (i*sizeof(unsigned))) / double(total_ints);
cout << "BV Bits per/int = " << std::setprecision(3) << bv_bits_per_int << endl;
bv_file.close();
}
if (!sv_out_fname.empty())
{
cout << "SV size = " << sv_size << endl;
double sv_bits_per_int = double(sv_size * 8ull - (i*sizeof(unsigned))) / double(total_ints);
cout << "SV Bits per/int = " << std::setprecision(3) << sv_bits_per_int << endl;
sv_file.close();
}
}
static
{
if (!bv_file.good())
return -1;
unsigned len;
bv_file.read((char*) &len, std::streamsize(sizeof(len)));
if (!bv_file.good())
return -1;
if (!len)
return -2;
sbuf.resize(len, false);
bv_file.read((char*) sbuf.data(), std::streamsize(len));
if (!bv_file.good())
return -1;
return 0;
}
static
const std::string& bv_in_fname)
{
cout << "Reading input collection: " << fname << endl;
if (!bv_in_fname.empty())
cout << "Reading BV collection: " << bv_in_fname << endl;
else
cout << "NO BV collection specified" << endl;
vector<unsigned> vec;
std::ifstream fin(fname.c_str(), std::ios::in | std::ios::binary);
if (!fin.good())
{
throw std::runtime_error("Cannot open input file");
}
std::ifstream bv_file;
std::streamsize fsize = 0;
if (!bv_in_fname.empty())
{
bv_file.open(bv_in_fname, std::ios::in | std::ios::binary);
if (!bv_file.good())
throw std::runtime_error("Cannot open bvect dump file");
fin.seekg(0, std::ios::end);
fsize = fin.tellg();
fin.seekg(0, std::ios::beg);
}
for (i = 0; true; ++i)
{
if (ret != 0)
throw std::runtime_error("Error reading input file");
total_ints += vec.size();
if (!bv_in_fname.empty())
{
if (cmp != 0)
{
throw std::runtime_error("Vector comparison failed");
}
}
std::streamsize fpos_curr = fin.tellg();
if (fpos_curr == fsize)
break;
{
cout << "\r" << fpos_curr << "/" << fsize
<< " ( size=" << vec.size() << " ) "
<< flush;
}
}
cout << endl;
cout << "Verification complete." << endl;
cout << "Total vectors=" << i << endl;
cout << "Total ints=" << total_ints << endl;
}
static
{
std::ifstream bv_file;
std::streamsize fsize;
if (!bv_in_fname.empty())
{
bv_file.open(bv_in_fname, std::ios::in | std::ios::binary);
if (!bv_file.good())
throw std::runtime_error("Cannot open bvect dump file");
bv_file.seekg(0, std::ios::end);
fsize = bv_file.tellg();
bv_file.seekg(0, std::ios::beg);
}
else
{
throw std::runtime_error("Cannot open bvect dump file");
}
for (i = 0; true; ++i)
{
if (!bv_in_fname.empty())
{
}
std::streamsize fpos_curr = bv_file.tellg();
if (fpos_curr == fsize)
break;
{
cout << "\r" << fpos_curr << "/" << fsize
<< flush;
}
}
cout << endl;
cout << "Decode complete." << endl;
cout << "Total vectors=" << i << endl;
}
int main(
int argc,
char *argv[])
{
if (argc < 3)
{
return 1;
}
try
{
if (ret != 0)
return ret;
{
{
cout << "Compression" << endl;
}
else
{
{
cout << "Verification." << endl;
}
}
}
{
cout << "Decode test." << endl;
}
{
std::cout << std::endl << "Timings (ms):" << std::endl;
}
}
catch (std::exception& ex)
{
std::cerr << "Error:" << ex.what() << std::endl;
return 1;
}
return 0;
}
#ifdef _MSC_VER
#pragma warning( pop )
#endif
Compressed bit-vector bvector<> container, set algebraic methods, traversal iterators.
#define BM_DECLARE_TEMP_BLOCK(x)
Algorithms for bvector<> (main include)
Serialization / compression of bvector<>. Set theoretical operations on compressed BLOBs.
Sparse constainer sparse_vector<> for integer types using bit-transposition transform.
Algorithms for bm::sparse_vector.
Compressed sparse container rsc_sparse_vector<> for integer types.
Serialization for sparse_vector<>
Timing utilities for benchmarking (internal)
pre-processor un-defines to avoid global space pollution (internal)
Bitvector Bit-vector container with runtime compression of bits.
bvector< Alloc > & set(size_type n, bool val=true)
Sets bit n if val is true, clears bit n if val is false.
void optimize(bm::word_t *temp_block=0, optmode opt_mode=opt_compress, statistics *stat=0)
Optimize memory bitvector's memory allocation.
bvector_size_type size_type
Utility class to collect performance measurements and statistics.
std::map< std::string, statistics > duration_map_type
test name to duration map
static void print_duration_map(TOut &tout, const duration_map_type &dmap, format fmt=ct_time)
void load_from(const sparse_vector_type &sv_src)
Load compressed vector from a sparse vector (with NULLs)
void optimize(bm::word_t *temp_block=0, typename bvector_type::optmode opt_mode=bvector_type::opt_compress, statistics *stat=0)
run memory optimization for all vector slices
Bit-vector serialization class.
void gap_length_serialization(bool value) BMNOEXCEPT
Set GAP length serialization (serializes GAP levels of the original vector)
void set_compression_level(unsigned clevel) BMNOEXCEPT
Set compression level.
void byte_order_serialization(bool value) BMNOEXCEPT
Set byte-order serialization (for cross platform compatibility)
succinct sparse vector with runtime compression using bit-slicing / transposition method
back_insert_iterator get_back_inserter()
Provide back insert iterator Back insert iterator implements buffered insertion, which is faster,...
friend back_insert_iterator
void optimize(bm::word_t *temp_block=0, typename bvector_type::optmode opt_mode=bvector_type::opt_compress, typename sparse_vector< Val, BV >::statistics *stat=0)
run memory optimization for all vector planes
@ BM_SORTED
input set is sorted (ascending order)
@ use_null
support "non-assigned" or "NULL" logic
size_t deserialize(BV &bv, const unsigned char *buf, bm::word_t *temp_block=0, const bm::bv_ref_vector< BV > *ref_vect=0)
Bitvector deserialization from a memory BLOB.
void sparse_vector_serialize(const SV &sv, sparse_vector_serial_layout< SV > &sv_layout, bm::word_t *temp_block=0)
Serialize sparse vector into a memory buffer(s) structure.
int main(int argc, char *argv[])
int validate_inp_vec(const VT &vec, typename VT::value_type &min_delta, typename VT::value_type &min_delta_cnt)
Check if input vector is monotonously sorted (true inverted list) along the way in computes a minimal...
static int parse_args(int argc, char *argv[])
bm::sparse_vector< unsigned, bm::bvector<> > sparse_vector_u32
static int read_bvector(std::ifstream &bv_file, bm::bvector<> &bv, bm::serializer< bm::bvector<> >::buffer &sbuf)
read and desrialize bit-bector from the dump file
static void verify_inv_dump_file(const std::string &fname, const std::string &bv_in_fname)
read the input collection sequence and dump file, verify correctness
bool is_super_sparse(const BV &bv)
Debug utility to detect super sparse bit-vectors which probably get bad compression rate.
bm::chrono_taker ::duration_map_type timing_map
static void compress_inv_dump_file(const std::string &fname, const std::string &bv_out_fname, const std::string &sv_out_fname)
read the input collection sequence, write using various compression schemes
int compare_vect(const VT &vec, const BV &bv)
Verification check if integer vector is equivalent to a bit-vector.
void write_as_svector(std::ofstream &sv_file, const VT &vec, unsigned min_delta, bm::sparse_vector_serial_layout< sparse_vector_u32 > &sv_lay)
convert vector into delta coded bit-transposed vector and append to the file
void write_as_rsc_svector(std::ofstream &sv_file, const VT &vec, unsigned min_delta, bm::sparse_vector_serial_layout< rsc_sparse_vector_u32 > &sv_lay)
convert vector into delta coded bit-transposed vector and append to the file
int io_read_u32_coll(std::ifstream &fin, VT &vec)
Read 32-bit vector size-prefix format (length:0, 1, 2, 3, ....)
bool write_as_bvector(std::ofstream &bv_file, const VT &vec, bm::serializer< bm::bvector<> > &bvs, bm::serializer< bm::bvector<> >::buffer &sbuf)
convert vector into bit-vector and append to the file
static void decode_test_dump_file(const std::string &bv_in_fname)
read and decode the compressed dump file
bm::rsc_sparse_vector< unsigned, sparse_vector_u32 > rsc_sparse_vector_u32
const unsigned set_compression_default
Default compression level.
unsigned long long int id64_t
layout class for serialization buffer structure
size_t size() const BMNOEXCEPT
return current serialized size
const unsigned char * data() const BMNOEXCEPT
Return serialization buffer pointer.