#include <assert.h>
#include <stdlib.h>
#include <math.h>
#include <iostream>
#include <vector>
#include <list>
#include <map>
#include <algorithm>
#include <utility>
#include <memory>
#include <future>
#include <thread>
#include <mutex>
#include <atomic>
#include "bm64.h"
#include "bmdbg.h"
#include "dna_finger.h"
using namespace std;
#include "cmd_args.h"
/// Wait for an opening in a container of futures that is used to schedule
/// parallel tasks with a cap on the number of tasks in flight.
///
/// Each sweep polls the futures in container order, giving every one up to
/// 100 ms to become ready. The first ready future is erased and the counter
/// behind @p parallel_cnt is decremented; at most one future is retired per
/// sweep. Sweeps repeat while the counter is still >= @p concurrency.
///
/// @param futures       container of futures (must support erase by iterator)
/// @param parallel_cnt  in/out count of tasks currently in flight
/// @param concurrency   number of tasks allowed in flight
///
/// NOTE: at least one sweep is always performed. If @p futures is empty while
/// the counter is >= @p concurrency, the function never returns.
template<typename FV>
void wait_for_slot(FV& futures,
                   unsigned* parallel_cnt,
                   unsigned concurrency)
{
    typedef typename FV::value_type future_type;
    const std::chrono::milliseconds poll_interval(100);

    for (;;)
    {
        // find_if stops at the first future reporting "ready", so futures
        // behind it are not polled during this sweep
        auto ready_it = std::find_if(futures.begin(), futures.end(),
            [&poll_interval](const future_type& fut)
            {
                return fut.wait_for(poll_interval) == std::future_status::ready;
            });

        if (ready_it != futures.end())
        {
            futures.erase(ready_it);
            --(*parallel_cnt);
        }

        if (*parallel_cnt < concurrency)
            return;
    }
}
/// Collection of sequences and k-mer fingerprint vectors.
///
/// Holds three containers: m_seqs (owned sequences), m_acc (accession
/// strings) and m_kmer_bufs (owned k-mer buffers).
///
/// NOTE(review): the class head, the typedefs and most member declaration
/// lines are absent from this listing. Member names mentioned below come from
/// the symbol index that follows the code and should be confirmed.
{
public:
public:
{}
// Presumably add_sequence(acc, seq_ptr): records the accession and hands
// seq_ptr to a unique_ptr element of m_seqs, which owns it from then on.
{
m_acc.push_back(acc);
m_seqs.emplace_back(seq_ptr);
}
// Presumably set_buffer(i, buf): stores a buffer in slot i, growing
// m_kmer_bufs first when i is past the end. buf_ptr is declared on a line
// not shown here.
{
{
// function-local static: one mutex shared by every caller of this member
static std::mutex mtx_counts_lock;
std::lock_guard<std::mutex> guard(mtx_counts_lock);
if (m_kmer_bufs.size() <= i)
m_kmer_bufs.resize(i+1);
// ownership moves from buf_ptr into the slot
m_kmer_bufs[i].reset(buf_ptr.release());
}
}
// Resizes m_kmer_bufs to the number of sequences (member name not visible).
{
m_kmer_bufs.resize(this->
size());
}
// Presumably size(): number of sequences. The assert checks that m_seqs and
// m_acc have the same length.
{ assert(m_seqs.size() == m_acc.size()); return m_seqs.size(); }
/// Accession string of sequence i (no bounds check).
const string&
get_acc(
size_t i)
const {
return m_acc[i]; }
/// Length of sequence i (no bounds or null check).
size_t seq_size(
size_t i)
const {
return m_seqs[i]->size(); }
// Presumably total_seq_size().
// NOTE(review): the accumulation statement is missing from this listing; as
// written, `return sum;` is the body of the for loop.
{
size_t sum = 0;
for (size_t i = 0; i < m_seqs.size(); ++i)
return sum;
}
/// Number of slots in m_kmer_bufs.
size_t buf_size()
const {
return m_kmer_bufs.size(); }
/// Get k-mer vector BLOB size (dereferences slot i without a null check).
size_t get_buf_size(
size_t i)
const {
return m_kmer_bufs[i]->size(); }
/// Get k-mer BLOB pointer; returns 0 when `p` is null.
/// `p` is declared on a line not shown here, presumably the buffer in slot i.
const unsigned char*
get_buf(
size_t i)
const
{
if (!p)
return 0;
return p->data();
}
private:
vector<unique_ptr<vector_char_type> > m_seqs;     // owned sequences
vector<string> m_acc;                             // accession per sequence
vector<unique_ptr<buffer_type> > m_kmer_bufs;     // owned k-mer buffers
};
/// Deserialize group of k-mer fingerprint vectors.
///
/// NOTE(review): the signature, the declarations of `en` and `bv`, and the
/// body of the async lambda are missing from this listing.
{
std::list<std::future<void> > futures;
// start from an empty output vector with room for bv_req_count entries
k_mers_vect.resize(0);
k_mers_vect.reserve(bv_req_count);
for (; en.valid(); ++en)
{
auto i_idx = *en;
// the output entry is appended before the buffer check, so an entry is
// added even for ids that have no buffer
k_mers_vect.emplace_back(bv);
const unsigned char* buf = this->
get_buf(i_idx);
if (!buf)
continue;
// one asynchronous task per id that has a buffer; bv and buf are captured
// by value
futures.emplace_back(
std::async(std::launch::async,
[bv, buf]()
));
}
// block until every launched task has finished
for (auto& e : futures)
e.wait();
}
/// Load multi-sequence FASTA.
///
/// Returns -1 when the file cannot be opened and 0 otherwise.
///
/// NOTE(review): the signature line and the statements that create `seq_vect`
/// and hand it to the collection are missing from this listing.
static
{
std::string line, acc;
std::ifstream fin(fname.c_str(), std::ios::in);
if (!fin.good())
return -1;
for (size_t i = 0; std::getline(fin, line); ++i)
{
if (line.empty())
continue;
// a line starting with '>' is a record header
if (line.front() == '>')
{
// a previous record is pending: trim its storage and clear the accession
if (!acc.empty())
{
seq_vect->shrink_to_fit();
acc.resize(0);
}
std::size_t pos = line.find_first_of(":");
if (pos == std::string::npos)
{
// NOTE(review): this branch keeps the whole header line, including the
// leading '>', while the branch below strips it. Confirm this is intended.
acc = line;
}
else
{
// text between the leading '>' and the first ':'
acc = line.substr(1, pos-1);
}
continue;
}
// any other line is sequence data, appended character by character
for (std::string::iterator it = line.begin(); it != line.end(); ++it)
seq_vect->push_back(*it);
}
// trim storage of the last record
if (!acc.empty())
{
seq_vect->shrink_to_fit();
}
cout << "\r \r" << endl;
return 0;
}
/// save k-mer vectors to a file
///
/// Layout written: the sequence count, then per sequence a size field
/// followed, when the size is non-zero, by the buffer bytes and one '\t'
/// marker byte. Counts and sizes are written as raw size_t bytes.
/// Exits the process with status 1 when the file cannot be opened.
///
/// NOTE(review): the signature line is missing. No visible statement assigns
/// buf_size after its initialisation to 0; presumably a line setting it from
/// the collection was dropped.
static
{
char magic_ch = '\t';
std::ofstream bfile (fname, std::ios::out | std::ios::binary);
if (!bfile.good())
{
std::cerr << "Cannot open file for write: " << fname << std::endl;
exit(1);
}
size_t sz = seq_coll.
size();
bfile.write((char*)&sz, std::streamsize(sizeof(sz)));
for (size_t i = 0; i < sz; ++i)
{
size_t buf_size = 0;
const unsigned char* buf = seq_coll.
get_buf(i);
// no buffer for this sequence: write a zero size so the reader stays aligned
if (!buf)
{
bfile.write((char*)&buf_size, std::streamsize(sizeof(buf_size)));
continue;
}
bfile.write((char*)&buf_size, std::streamsize(sizeof(buf_size)));
if (buf_size)
{
bfile.write((char*)buf, std::streamsize(buf_size));
// trailing marker byte, checked on load
bfile.write((char*)&magic_ch, 1);
}
}
}
/// Load k-mer vectors.
///
/// Reads a record count, then per record a size field and, when non-zero,
/// that many bytes followed by one marker byte that must equal '\t'.
/// Exits the process with status 1 on open failure or marker mismatch.
///
/// NOTE(review): the signature line, the declaration of `buf` and whatever
/// stores each buffer into the collection are missing from this listing.
/// The stream state is not checked after the reads shown here.
static
{
char magic_ch = '\t';
std::ifstream bfile (fname, std::ios::in | std::ios::binary);
if (!bfile.good())
{
std::cerr << "Cannot open file for read: " << fname << std::endl;
exit(1);
}
size_t sz;
bfile.read((char*)&sz, std::streamsize(sizeof(sz)));
for (size_t i = 0; i < sz; ++i)
{
size_t buf_size = 0;
bfile.read((char*)&buf_size, std::streamsize(sizeof(buf_size)));
if (buf_size)
{
buf.resize(buf_size);
bfile.read((char*) buf.data(), std::streamsize(buf_size));
// every non-empty record must end with the marker byte
char control_ch = 0;
bfile.read((char*)&control_ch, 1);
if (control_ch != magic_ch)
{
cerr << "Error: read failure!" << endl;
exit(1);
}
}
}
}
/// Map a DNA letter to its integer code: 'A'=0, 'T'=1, 'G'=2, 'C'=3.
///
/// Returns true when bp is one of those four upper-case letters. Any other
/// character returns false and dna_code is left unassigned.
///
/// NOTE(review): the declaration line is missing from this listing; the
/// symbol index gives it as
/// `bool get_DNA_code(char bp, bm::id64_t &dna_code)`.
inline
{
switch (bp)
{
case 'A':
dna_code = 0;
break;
case 'T':
dna_code = 1;
break;
case 'G':
dna_code = 2;
break;
case 'C':
dna_code = 3;
break;
default:
return false;
}
return true;
}
/// Calculate k-mer as an unsigned long integer.
///
/// Starting at dna[pos], k_size letters are packed 2 bits each; the first
/// letter ends up in the lowest two bits. Returns false as soon as `valid`
/// is false, leaving k_mer untouched; on success k_mer gets the packed code.
///
/// NOTE(review): part of the signature and the lines declaring k_acc,
/// dna_code and valid are missing. `valid` is presumably the result of
/// get_DNA_code(bp, dna_code).
inline
size_t pos, unsigned k_size,
{
unsigned shift = 0;
dna += pos;
for (size_t i = 0; i < k_size; ++i)
{
char bp = dna[i];
if (!valid)
return false;
// place this letter's code above the ones already accumulated
k_acc |= (dna_code << shift);
shift += 2;
}
k_mer = k_acc;
return true;
}
/// Translate integer code to DNA letter.
///
/// Codes 0..3 map to 'A', 'T', 'G', 'C'; code 4 maps to 'N'.
///
/// @param code  integer letter code, expected to be below 5
/// @return the DNA letter for the code; for codes of 5 and above the function
///         asserts in debug builds and returns 'N' otherwise
inline
char int2DNA(unsigned code)
{
    // entries after 'N' are never returned because of the range check below
    static const char lut[] = { 'A', 'T', 'G', 'C', 'N', 'M', '$' };
    if (code < 5)
        return lut[code];
    assert(0);
    return 'N';
}
/// Translate k-mer code into ATGC DNA string.
///
/// dna is resized to k_size and filled from index 0 upwards, consuming the
/// code two bits at a time starting with the lowest bits. The final assert
/// checks that no bits remain after k_size letters.
///
/// NOTE(review): the signature line and the line defining `bp` are missing;
/// bp is presumably int2DNA(dna_code).
inline
{
dna.resize(k_size);
for (size_t i = 0; i < k_size; ++i)
{
unsigned dna_code = unsigned(kmer_code & 3);
dna[i] = bp;
kmer_code >>= 2;
}
assert(!kmer_code);
}
/// Turns each k-mer of a sequence into an integer and collects the codes
/// in k_buf on the way to a bit-vector (per the symbol index description).
///
/// bv is cleared and initialised first; an empty sequence leaves it empty.
/// Codes are accumulated in k_buf and sorted once the buffer reaches
/// chunk_size and again at the end.
///
/// NOTE(review): several lines are missing from this listing: the function
/// name and part of the parameter list, the declarations of k_mer_code and
/// bp_code, the statements that set `valid`, and whatever transfers the
/// sorted codes into bv.
template<typename BV>
unsigned k_size,
std::vector<bm::id64_t>& k_buf,
)
{
bv.clear();
bv.init();
if (seq_vect.empty())
return;
const char* dna_str = &seq_vect[0];
k_buf.reserve(size_t(chunk_size));
k_buf.resize(0);
{
// number of window start positions
// NOTE(review): unsigned arithmetic, so this wraps when the sequence is
// shorter than k_size-1. Confirm callers guarantee the length.
vector_char_type::size_type dna_sz = seq_vect.size()-(k_size-1);
vector_char_type::size_type pos = 0;
bool valid = false;
// find the first position that yields a valid k-mer
for (; pos < dna_sz; ++pos)
{
if (valid)
{
k_buf.push_back(k_mer_code);
break;
}
}
// bit offset of the last letter inside a packed k-mer
const unsigned k_shift = (k_size-1) * 2;
if (valid)
{
for (++pos; pos < dna_sz; ++pos)
{
if (!valid)
{
// skip ahead by k_size positions, then rescan for the next valid k-mer
pos += k_size;
for (; pos < dna_sz; ++pos)
{
if (valid)
{
k_buf.push_back(k_mer_code);
break;
}
}
continue;
}
// slide the window: drop the lowest 2 bits, insert the new code on top
k_mer_code = ((k_mer_code >> 2) | (bp_code << k_shift));
k_buf.push_back(k_mer_code);
// buffer full: sort, flush, and report progress
if (k_buf.size() == chunk_size)
{
std::sort(k_buf.begin(), k_buf.end());
if (k_buf.size())
{
k_buf.resize(0);
bv.optimize();
}
float pcnt = float(pos) / float(dna_sz);
pcnt *= 100;
cout << "\r" << unsigned(pcnt) << "% of " << dna_sz
<< " (" << (pos+1) <<") "
<< flush;
}
}
}
// handle whatever is left in the buffer
if (k_buf.size())
{
std::sort(k_buf.begin(), k_buf.end());
}
}
}
/// Builds and serializes k-mer vectors for sequences from..to (inclusive
/// loop bounds) of the collection.
///
/// Returns immediately when the collection is empty or `from` is past its end.
///
/// NOTE(review): many lines are missing: the first part of the signature, the
/// declarations of bv, buf, bvs and mp_guard_bv, the k-mer generation call
/// and the statement storing the BLOB. In particular the statement guarded by
/// the `if (!to || to >= size())` test is missing, so as listed the
/// declaration of `cnt` becomes the body of that `if`.
static
size_t from, size_t to)
{
assert(from <= to);
if (!seq_coll.
size() || (from >= seq_coll.
size()))
return;
std::vector<bm::id64_t> k_buf;
typedef allocator_type::allocator_pool_type allocator_pool_type;
allocator_pool_type pool;
mp_guard_bv.assign_if_not_set(pool, bv);
if (!to || to >= seq_coll.
size())
unsigned cnt = 0;
for (size_t i = from; i <= to; ++i)
{
// serialize into buf, then shrink buf to the bytes actually produced
size_t blob_size = bvs.
serialize(bv, &buf[0], buf.size());
buf.resize(blob_size);
// counter wraps every 100 sequences; the action taken at that point is
// not visible in this listing
++cnt;
if (cnt >= 100)
{
cnt = 0;
}
}
}
/// Splits the collection into index ranges, runs generate_k_mers() on each
/// range as an asynchronous task, and prints progress while waiting.
///
/// A concurrency of 0 is treated as 1.
///
/// NOTE(review): the first part of the signature and the lines defining
/// total_seq_size and `c` are missing. No visible statement advances to_pick
/// either, so the batch-size test cannot be followed from this listing.
static
unsigned concurrency)
{
if (!concurrency)
concurrency = 1;
size_t batch_size = total_seq_size / concurrency;
if (!batch_size)
batch_size = total_seq_size;
std::list<std::future<void> > futures;
for (
size_t from = 0; from <= seq_coll.
size(); )
{
// extend `to` until the batch is full or the collection ends
size_t to = from;
for (
size_t to_pick = 0; to < seq_coll.
size(); ++to)
{
if (to_pick >= batch_size)
break;
}
// seq_coll is captured by reference; every future is waited on below,
// before this function returns
futures.emplace_back(
std::async(std::launch::async,
[&seq_coll, k_size, from, to]() {
generate_k_mers(seq_coll, k_size, from, to); }
));
from = to+1;
}
unsigned long long cnt = seq_coll.
size();
for (auto& e : futures)
{
unsigned long long c_prev = 0;
while(1)
{
// poll once a minute; leave the loop as soon as the task is done
std::future_status status = e.wait_for(std::chrono::seconds(60));
if (status == std::future_status::ready)
break;
auto delta = c - c_prev;
c_prev = c;
auto remain_cnt = cnt - c;
// NOTE(review): delta is 0 when `c` did not change since the last poll,
// which makes this a division by zero. Confirm and guard.
auto remain_min = remain_cnt / delta;
cout << "\r" << c << ": progress per minute=" << delta;
if (remain_min < 120)
{
cout << " wait for " << remain_min << "m " << flush;
}
else
{
auto remain_h = remain_min / 60;
cout << " wait for " << remain_h << "h " << flush;
}
}
}
cout << endl;
}
/// Group (cluster) of sequences.
///
/// Visible state: m_lead_id, m_bv_members, m_bv_kmer_union and a mutex used
/// by the member functions that take a lock.
///
/// NOTE(review): the class head, all member function declaration lines and
/// the data member declarations other than the mutex are missing from this
/// listing. Function names below are guesses from the symbol index.
{
public:
// constructor: stores the lead id and sets that id in m_bv_members
: m_lead_id(lead_id), m_bv_members(
bm::
BM_GAP)
{
m_bv_members.
set(lead_id);
}
// ORs bv_kmer into the k-mer union without taking the mutex
{
m_bv_kmer_union |= bv_kmer;
}
// same OR, performed under the mutex (presumably add_member_sync)
{
std::lock_guard<std::mutex> guard(mtx_add_member_lock);
m_bv_kmer_union |= bv_kmer;
}
// presumably merge_member_sync: merges both argument vectors into the
// group's member and k-mer vectors under the mutex
{
std::lock_guard<std::mutex> guard(mtx_add_member_lock);
m_bv_members.
merge(bv_seq);
m_bv_kmer_union.
merge(bv_kmer);
}
// takes the mutex; the guarded statements are not visible
{
std::lock_guard<std::mutex> guard(mtx_add_member_lock);
}
private:
std::mutex mtx_add_member_lock; // taken by the locking member functions above
};
/// Collection of cluster groups (m_seq_groups).
///
/// NOTE(review): the class head, typedefs, most member declarations and the
/// data members are missing from this listing.
{
public:
public:
{}
unsigned concurrency);
/// Number of groups currently held.
size_t groups_size()
const {
return m_seq_groups.size(); }
private:
};
/// Remove groups which turned empty after clusterization.
///
/// As written, a group is erased when bv_mem.count() is below 2; iteration
/// continues from the iterator returned by erase().
///
/// NOTE(review): the signature line and the line defining bv_mem are missing.
{
for (groups_vector_type::iterator it = m_seq_groups.begin();
it != m_seq_groups.end(); )
{
auto cnt = bv_mem.
count();
if (cnt < 2)
it = m_seq_groups.erase(it);
else
++it;
}
}
// NOTE(review): signature and body are missing from this listing. This is
// presumably one of the CSeqClusters members named in the symbol index
// (add_group or take_group) - TODO confirm.
{
}
/// Compute union of all cluster group members.
///
/// Iterates over every group and returns m_all_members.
///
/// NOTE(review): the signature line and the loop body are missing.
{
for (groups_vector_type::const_iterator it = m_seq_groups.begin();
it != m_seq_groups.end(); ++it)
{
}
return m_all_members;
}
/// calculate avg cluster population count
///
/// NOTE(review): the signature line, the declaration of cnt and the loop body
/// are missing. The division below has no guard for an empty m_seq_groups,
/// which would divide by zero - confirm callers never call this with no groups.
{
for (groups_vector_type::const_iterator it = m_seq_groups.begin();
it != m_seq_groups.end(); ++it)
{
}
cnt = cnt / m_seq_groups.size();
return cnt;
}
/// Acquire all groups from another cluster collection.
///
/// Each group pointer is released from sc and appended to this collection,
/// then sc's group list is cleared.
///
/// NOTE(review): the signature line is missing from this listing.
{
for (
auto it = sc.
m_seq_groups.begin(); it != sc.m_seq_groups.end(); ++it)
m_seq_groups.emplace_back(it->release());
sc.m_seq_groups.clear();
}
/// Resolve duplicate membership between groups.
///
/// Visits every unordered pair of groups (j < i).
///
/// NOTE(review): the signature line and the per-pair statements are missing.
{
for (size_t i = 0; i < m_seq_groups.size(); ++i)
{
for (size_t j = 0; j < i; ++j)
{
}
}
}
/// Compute similarity distances for one row/vector (1:N) of distance matrix.
///
/// Writes row[0..i-1] from and_cnt and then row[i] from bv_i->count(); the
/// loop leaves j equal to i, so the last store lands on the diagonal.
///
/// NOTE(review): the first signature line and the line computing and_cnt are
/// missing from this listing.
static
unsigned* row,
size_t i,
const std::vector<std::unique_ptr<bvector_type> >& k_mers_vect)
{
size_t j;
for (j = 0; j < i; ++j)
{
row[j] = unsigned(and_cnt);
}
auto cnt = bv_i->
count();
row[j] = unsigned(cnt);
}
/// Compute similarity distances matrix (COUNT(AND(a, b))
///
/// Rows are computed as asynchronous tasks, at most `concurrency` at a time
/// by the parallel_cnt counter; a concurrency below 1 is treated as 1.
/// After all tasks finish, dm.replicate_triange() is called.
///
/// NOTE(review): most of the signature, the declarations of rsub, bv_sub,
/// k_mers_vect and bv_i, the lambda body and the statement in the `else`
/// branch (presumably a wait_for_slot call) are missing from this listing.
static
unsigned concurrency)
{
if (concurrency < 1)
concurrency = 1;
auto N = bv_mem_cnt;
// above this member count a random sample is drawn instead
const unsigned k_max_electors = 500;
if (N > k_max_electors)
{
rsub.
sample(bv_sub, bv_mem, k_max_electors);
}
std::list<std::future<void> > futures;
unsigned parallel_cnt = 0;
size_t i;
for (i = 0; i < N; ++i)
{
unsigned* row = dm.row(i);
// retry until a slot is free, then launch the task for this row
do
{
if (parallel_cnt < concurrency)
{
futures.emplace_back(
std::async(std::launch::async,
[row, bv_i, i, &k_mers_vect]()
));
++parallel_cnt;
break;
}
else
{
}
} while(1);
}
// block until every outstanding task has finished
for (auto& e : futures)
e.wait();
dm.replicate_triange();
}
/// Compute union (Universe) of all k-mers in the cluster group.
///
/// For each enumerated index the k-mer buffer is fetched; indexes without a
/// buffer are skipped.
///
/// NOTE(review): the signature line, the loop header that declares `en` and
/// the statements that use `buf` are missing from this listing.
static
{
{
auto idx = *en;
const unsigned char* buf = seq_coll.
get_buf(idx);
if (!buf)
continue;
}
}
/// Find the best representatives in all cluster groups.
///
/// For each group a distance matrix is prepared and scanned; the candidate
/// with the highest dm.sum() score becomes the leader. When the leader
/// changes, two asynchronous tasks are queued; all tasks are awaited at the
/// end.
///
/// NOTE(review): the first signature line, the declarations of
/// bv_all_members, rsub, bv_sub, bv_mem, dm, en, rank, leader_idx,
/// old_leader_idx, best_score and cand_score, and the arguments of both
/// std::async calls are missing from this listing.
unsigned concurrency)
{
std::list<std::future<void> > futures;
for (size_t k = 0; k < m_seq_groups.size(); ++k)
{
auto N = bv_all_members.
count();
auto all_members_count = N; (void) all_members_count;
// elector cap scales with log2(concurrency), never below 500
// NOTE(review): log2(0) is not finite, so concurrency == 0 makes the
// unsigned cast ill-defined. Confirm callers pass at least 1.
unsigned k_max_electors = 200 * unsigned(log2(concurrency));
if (k_max_electors < 500)
k_max_electors = 500;
// too many members: work on a random sample of k_max_electors instead
if (N > k_max_electors)
{
rsub.
sample(bv_sub, bv_all_members, k_max_electors);
bv_mem = &bv_sub;
}
dm.init();
dm.set_zero();
assert(bv_mem->
test(leader_idx));
assert(rank);
// score of the current leader, addressed by rank-1
dm.sum(best_score, rank-1);
// keep the candidate with the strictly highest score
for (size_t i = 0; en.valid(); ++en, ++i)
{
dm.sum(cand_score, i);
if (cand_score > best_score)
{
best_score = cand_score;
leader_idx = *en;
}
}
if (leader_idx != old_leader_idx)
{
const unsigned char* buf = seq_coll.
get_buf(leader_idx);
assert(buf);
futures.emplace_back(
std::async(std::launch::async,
));
futures.emplace_back(
std::async(std::launch::async,
));
}
}
// block until every queued task has finished
for (auto& e : futures)
e.wait();
}
/// print clusterization report
///
/// Prints the title, one line per group ending with bv_mem.count(), and a
/// footer with the total number of groups.
///
/// NOTE(review): the signature line and the start of the per-group output
/// statement (including the definition of bv_mem) are missing.
{
cout << title << endl;
for (size_t i = 0; i < m_seq_groups.size(); ++i)
{
<< bv_mem.
count() << endl;
}
cout << "-----------\nTotal: " << m_seq_groups.size() << endl << endl;
}
/// Scans the collection for sequences similar to the group's lead sequence.
///
/// similarity_cut_off must be below 1 (asserted). The function returns early
/// when lead_id is out of range or the lead has no k-mer buffer. A sequence
/// qualifies when its and_cnt is non-zero and exceeds
/// i_cnt * similarity_cut_off; ids set in bv_exceptions are skipped.
///
/// NOTE(review): most of the signature, the definitions of lead_id, sz, i_cnt
/// and and_cnt, the per-candidate buffer fetch, the action taken on a match
/// and the statement guarded by the final `if (!found)` are missing.
static
float similarity_cut_off)
{
assert(similarity_cut_off < 1);
if (lead_id >= sz)
return;
const unsigned char* buf = seq_coll.
get_buf(lead_id);
if (!buf)
return;
float similarity_target = float(i_cnt * float(similarity_cut_off));
bool found = false;
for (size_t i = 0; i < sz; ++i)
{
bool is_except = bv_exceptions.
test(i);
if (is_except)
continue;
if (!buf)
continue;
if (and_cnt && (float(and_cnt) > similarity_target))
{
found = true;
}
}
if (!found)
}
/// Resolve duplicate members between two groups.
///
/// Does nothing when both arguments are the same object. The two lead ids
/// are skipped; every other enumerated id must have a k-mer buffer
/// (asserted), and and_cnt1 is compared with and_cnt2 to pick a side.
///
/// NOTE(review): the signature line, the enumerator setup, the computation of
/// and_cnt1/and_cnt2 and the statements in both branches of the final
/// if/else are missing from this listing.
{
if (&seq_group1 == & seq_group2)
return;
{
{
auto idx = *en;
assert(lead_idx1 != lead_idx2);
if (idx == lead_idx1)
{
continue;
}
if (idx == lead_idx2)
{
continue;
}
const unsigned char* buf = seq_coll.
get_buf(idx);
assert(buf);
if (and_cnt1 >= and_cnt2)
else
}
}
}
/// Utility class to accumulate changes to a cluster before committing them.
///
/// NOTE(review): the struct head, the constructor signature, the data
/// members, part of add()'s parameter list and most of its body are missing
/// from this listing.
{
{}
// add(): the `!bv_m` test handles the case where bv_m is not set yet; the
// statements for that case are not visible
void add(
size_t cluster_id,
{
if (!bv_m)
{
}
}
};
/// Compute AND similarity to all available clusters and assign each sequence
/// to the most similar one.
///
/// Sequences are taken from the enumerator starting at seq_id_from and up to
/// seq_id_to inclusive; ids without a k-mer buffer are skipped. The group
/// with the highest rep_and_cnt wins. A sequence is added to the accumulator
/// only when some group scored above best_score.
///
/// NOTE(review): the signature line and the definitions of en, acc, bv_k_mer,
/// best_score, rep_and_cnt and bv_m are missing, as is the commit statement
/// in the final loop.
static
{
en.go_to(seq_id_from);
for ( ;en.valid(); ++en)
{
auto seq_id = *en;
if (seq_id > seq_id_to)
break;
const unsigned char* buf = seq_coll.
get_buf(seq_id);
if (!buf)
continue;
// ~0ull marks "no cluster selected yet"
size_t cluster_idx(~0ull);
for (
size_t i = 0; i < cluster_groups.
groups_size(); ++i)
{
if (rep_and_cnt > best_score)
{
cluster_idx = i; best_score = rep_and_cnt;
}
}
if (cluster_idx != ~0ull)
{
acc.add(cluster_idx, seq_id, bv_k_mer);
}
}
// second pass over the groups; groups with no bv_m are skipped
for (
size_t i = 0; i < cluster_groups.
groups_size(); ++i)
{
if (!bv_m)
continue;
}
}
/// Compute AND similarity to all available clusters and assign each sequence
/// to the most similar one, using the UNION of k-mers (per the symbol index).
///
/// Same id range handling as assign_to_best_cluster(): starts at seq_id_from,
/// stops after seq_id_to, skips ids without a buffer. best_score is reset to
/// 0 for every sequence and the group with the highest uni_and_cnt wins.
///
/// NOTE(review): the signature line, the definitions of en, best_score and
/// uni_and_cnt, and the statement executed once a cluster is chosen are
/// missing from this listing.
static
{
en.go_to(seq_id_from);
for ( ;en.valid(); ++en)
{
auto seq_id = *en;
if (seq_id > seq_id_to)
break;
const unsigned char* buf = seq_coll.
get_buf(seq_id);
if (!buf)
continue;
// ~0ull marks "no cluster selected yet"
size_t cluster_idx(~0ull);
{
best_score = 0;
for (
size_t i = 0; i < cluster_groups.
groups_size(); ++i)
{
if (uni_and_cnt > best_score)
{
cluster_idx = i; best_score = uni_and_cnt;
}
}
if (cluster_idx != ~0ull)
{
}
}
}
}
/// Pick random sequences as cluster seed elements and launch one asynchronous
/// task per seed, at most `concurrency` at a time by the parallel_cnt counter.
///
/// rsub.sample() draws num_clust ids from bv_total into bv_rsub. All tasks
/// are awaited before returning.
///
/// NOTE(review): the first signature lines, the declaration of bv_rsub, the
/// loop header declaring `en`, the creation of `sg`, the lambda body and the
/// statement in the `else` branch (presumably a wait_for_slot call) are
/// missing from this listing.
static
unsigned num_clust,
float similarity_cut_off,
unsigned concurrency)
{
rsub.
sample(bv_rsub, bv_total, num_clust);
std::list<std::future<void> > futures;
unsigned parallel_cnt = 0;
{
auto idx = *en;
// retry until a slot is free, then launch the task for this seed
do
{
if (parallel_cnt < concurrency)
{
futures.emplace_back(
std::async(std::launch::async,
[&seq_coll, sg, &bv_rsub, similarity_cut_off]()
));
++parallel_cnt;
break;
}
else
{
}
} while(1);
}
// block until every outstanding task has finished
for (auto& e : futures)
e.wait();
}
/// Top-level clustering driver: up to max_pass rounds of seeding and
/// recruitment, followed by a final assignment of whatever is left.
///
/// similarity_cut_off must be below 1 (asserted).
///
/// NOTE(review): many lines are missing from this listing: the first
/// signature line, the declarations of bv_total, bv_clust, total_count,
/// rcount, avg_group_count, pair_vect and cluster_groups, the callee names of
/// several calls and the bodies of the async lambdas. The `return;` right
/// after the assert is presumably guarded by a missing `if`; as listed it is
/// unconditional.
static
unsigned num_clust,
float similarity_cut_off,
unsigned concurrency)
{
assert(similarity_cut_off < 1);
return;
const unsigned max_pass = 3;
for (unsigned pass = 0; pass < max_pass; ++pass)
{
num_clust, similarity_cut_off, concurrency);
{
// report totals and remove the ids in bv_clust from the pool
cout << " total = " << total_count << endl;
cout <<
" clustered = " << bv_clust.
count() << endl;
bv_total -= bv_clust;
total_count = bv_total.
count();
cout << " remain = " << total_count << endl;
}
// everything is clustered: stop
if (!total_count)
break;
// one asynchronous task per id range in pair_vect
std::list<std::future<void> > futures;
assert(pair_vect.size());
for (size_t k = 0; k < pair_vect.size(); ++k)
{
auto seq_id_from = pair_vect[k].
first;
auto seq_id_to = pair_vect[k].second;
futures.emplace_back(
std::async(std::launch::async,
[&cluster_groups, &seq_coll, &bv_total, seq_id_from, seq_id_to]()
));
}
for (auto& e : futures)
e.wait();
cluster_groups.
print_summary(
"Clusters after phase 2 recruitment");
{
bv_total -= bv_clust;
rcount = bv_total.
count();
if (rcount)
{
cout << "Undistributed sequences = " << rcount << endl;
}
else
{
break;
}
}
// stop iterating once fewer sequences remain than avg_group_count
if (rcount < avg_group_count)
{
break;
}
cout << "PASS=" << (pass+1) << endl << endl;
}
// leftovers after the passes: one more range-parallel assignment round
if (rcount)
{
{
cout << endl <<
" clustered = " << bv_clust.
count() << endl;
bv_total -= bv_clust;
rcount = bv_total.
count();
cout << " remain = " << rcount << endl;
}
if (rcount)
{
std::list<std::future<void> > futures;
for (size_t k = 0; k < pair_vect.size(); ++k)
{
auto seq_id_from = pair_vect[k].
first;
auto seq_id_to = pair_vect[k].second;
futures.emplace_back(
std::async(std::launch::async,
[&seq_clusters, &seq_coll, &bv_total, seq_id_from, seq_id_to]()
));
}
for (auto& e : futures)
e.wait();
{
cout << endl <<
" clustered = " << bv_clust.
count() << endl;
bv_total -= bv_clust;
rcount = bv_total.
count();
cout << " remain = " << rcount << endl;
}
}
}
if (rcount)
{
}
}
/// Program entry point.
///
/// Returns `ret` when it is non-zero, `res` when it is non-zero, 1 when a
/// std::exception is caught (its message goes to stderr), and 0 otherwise.
///
/// NOTE(review): nearly all statements are missing from this listing,
/// including the definitions of ret, res and seq_coll and the contents of the
/// nested blocks. `ret` presumably comes from parse_args(), given the error
/// message below.
int main(
int argc,
char *argv[])
{
try
{
if (ret != 0)
{
cerr << "cmd-line parse error. " << endl;
return ret;
}
{
if (res != 0)
return res;
}
cout <<
"Sequences size = " << seq_coll.
size() << endl;
{
{
}
{
}
}
{
{
}
{
}
}
{
std::cout << std::endl << "Performance:" << std::endl;
}
}
catch(std::exception& ex)
{
std::cerr << ex.what() << std::endl;
return 1;
}
return 0;
}
#define BM_DECLARE_TEMP_BLOCK(x)
Algorithms for fast aggregation of N bvectors.
Algorithms for bvector<> (main include)
Generation of random subset.
Serialization / compression of bvector<>. Set theoretical operations on compressed BLOBs.
Algorithms for bm::sparse_vector.
Compressed sparse container rsc_sparse_vector<> for integer types.
Timing utilities for benchmarking (internal)
pre-processor un-defines to avoid global space pollution (internal)
void merge_from(CSeqClusters &sc)
Acquire all groups from another cluster collection.
size_t groups_size() const
void elect_leaders(const CSequenceColl &seq_coll, unsigned concurrency)
Find the best representatives in all cluster groups the criteria is maximum absolute similarity to al...
void resolve_duplicates(const CSequenceColl &seq_coll)
Resolve duplicate membership between groups.
void clear_empty_groups()
Remove groups which turned empty after clusterization.
CSeqGroup * get_group(size_t idx)
void print_summary(const char *title) const
print clusterization report
void add_group(CSeqGroup *sg)
std::vector< std::unique_ptr< CSeqGroup > > groups_vector_type
const bm::bvector & union_all_groups()
Compute union of all cluster group members.
bm::id64_t compute_avg_count() const
calculate avg cluster population count
void take_group(bm::bvector<> &bv_members)
members moved into their own group
Group (cluster) of sequences.
const bm::bvector & get_members() const
bm::id64_t count_and_union_sync(const bm::bvector<> &bv)
bm::bvector & get_kmer_union()
CSeqGroup(bm::id64_t lead_id=~0ull)
void add_member_sync(bm::id64_t id, const bm::bvector<> &bv_kmer)
void set_lead(bm::id64_t lead_id)
set id for the group representative
void add_member(bm::id64_t id)
add a member to the group
void merge_member_sync(bm::bvector<> &bv_seq, bm::bvector<> &bv_kmer)
bool is_assigned()
check if cluster is non-empty
bm::id64_t get_lead() const
Get lead id.
void clear_member(bm::id64_t id)
Collection of sequences and k-mer fingerprint vectors.
void add_sequence(const string &acc, vector_char_type *seq_ptr)
void deserialize_k_mers(bvector_ptr_vector_type &k_mers_vect, const bm::bvector<> &bv_req, bm::bvector<>::size_type bv_req_count) const
Deserialize group of k-mer fingerprint vectors.
size_t total_seq_size() const
std::vector< unsigned char > buffer_type
const vector_char_type & get_sequence(size_t i) const
size_t get_buf_size(size_t i) const
Get k-mer vector BLOB size.
size_t seq_size(size_t i) const
const string & get_acc(size_t i) const
void set_buffer(size_t i, const buffer_type &buf)
const unsigned char * get_buf(size_t i) const
Get k-mer BLOB pointer.
void reset()
Reset aggregate groups, forget all attached vectors.
void combine_or(bvector_type &bv_target)
Aggregate added group of vectors using logical OR Operation does NOT perform an explicit reset of arg...
void set_optimization(typename bvector_type::optmode opt=bvector_type::opt_compress) BMNOEXCEPT
set on-the-fly bit-block compression By default aggregator does not try to optimize result,...
size_t add(const bvector_type *bv, unsigned agr_group=0)
Attach source bit-vector to a argument group (0 or 1).
Constant iterator designed to enumerate "ON" bits.
bool valid() const BMNOEXCEPT
Checks if iterator is still valid.
Bitvector Bit-vector container with runtime compression of bits.
bool test(size_type n) const BMNOEXCEPT
returns true if bit n is set and false if bit n is 0.
void merge(bm::bvector< Alloc > &bvect)
Merge/move content from another vector.
allocator_type::allocator_pool_type allocator_pool_type
size_type count() const BMNOEXCEPT
population count (count of ON bits)
bm::bvector< Alloc > & bit_and(const bm::bvector< Alloc > &bv1, const bm::bvector< Alloc > &bv2, typename bm::bvector< Alloc >::optmode opt_mode=opt_none)
3-operand AND : this := bv1 AND bv2
bvector< Alloc > & set(size_type n, bool val=true)
Sets bit n if val is true, clears bit n if val is false.
void set_allocator_pool(allocator_pool_type *pool_ptr) BMNOEXCEPT
Set allocator pool for local (non-threaded) memory cyclic (lots of alloc-free ops) operations.
void optimize(bm::word_t *temp_block=0, optmode opt_mode=opt_compress, statistics *stat=0)
Optimize memory bitvector's memory allocation.
bool any() const BMNOEXCEPT
Returns true if any bits in this bitset are set, and otherwise returns false.
bvector_size_type size_type
enumerator first() const
Returns enumerator pointing on the first non-zero bit.
void swap(bvector< Alloc > &bvect) BMNOEXCEPT
Exchanges content of bv and this bvector.
size_type count_range(size_type left, size_type right, const rs_index_type &rs_idx) const BMNOEXCEPT
Returns count of 1 bits in the given range [left..right] Uses rank-select index to accelerate the sea...
size_type get_first() const BMNOEXCEPT
find first 1 bit in vector. Function may return 0 and this requires an extra check if bit 0 is actual...
bvector< Alloc > & set_range(size_type left, size_type right, bool value=true)
Sets all bits in the specified closed interval [left,right] Interval must be inside the bvector's siz...
void clear(const size_type *ids, size_type ids_size, bm::sort_order so=bm::BM_UNKNOWN)
clear list of bits in this bitset
void set_bit_no_check(size_type n)
Set bit without checking preconditions (size, etc)
Utility class to collect performance measurements and statistics.
std::map< std::string, statistics > duration_map_type
test name to duration map
static void print_duration_map(TOut &tout, const duration_map_type &dmap, format fmt=ct_time)
Deserializer, performs logical operations between bit-vector and serialized bit-vector.
size_type deserialize(bvector_type &bv, const unsigned char *buf, set_operation op, bool exit_on_one=false)
Deserialize bvector using buffer as set operation argument.
void sample(BV &bv_out, const BV &bv_in, size_type sample_count)
Get random subset of input vector.
Bit-vector serialization class.
void set_bookmarks(bool enable, unsigned bm_interval=256) BMNOEXCEPT
Add skip-markers to serialization BLOB for faster range decode at the expense of some BLOB size incre...
size_type serialize(const BV &bv, unsigned char *buf, size_t buf_size)
Bitvector serialization into memory block.
bm::alloc_pool_guard< allocator_pool_type, bvector< Alloc > > mem_pool_guard
@ BM_SORTED
input set is sorted (ascending order)
@ BM_GAP
GAP compression is ON.
size_t deserialize(BV &bv, const unsigned char *buf, bm::word_t *temp_block=0, const bm::bv_ref_vector< BV > *ref_vect=0)
Bitvector deserialization from a memory BLOB.
void rank_range_split(const BV &bv, typename BV::size_type rank, PairVect &target_v)
Algorithm to identify bit-vector ranges (splits) for the rank.
BV::size_type count_and(const BV &bv1, const BV &bv2) BMNOEXCEPT
Computes bitcount of AND operation of two bitsets.
unsigned long long int id64_t
std::vector< std::pair< bv_size_type, bv_size_type > > bv_ranges_vector
Utility class to accumulate changes to cluster before committing it (mutex synchronous operation)
void add(size_t cluster_id, bm::bvector<>::size_type m_id, bm::bvector<> &bv_kmer)
bvector_ptr_vector_type bv_kmers
bvector_ptr_vector_type bv_members
size_t max_serialize_mem
estimated maximum memory for serialization
Statistical information about bitset's memory allocation details.
static int parse_args(int argc, char *argv[])
std::vector< char > vector_char_type
static void compute_jaccard_clusters(CSeqClusters &seq_clusters, const CSequenceColl &seq_coll, unsigned num_clust, float similarity_cut_off, unsigned concurrency)
static void assign_to_best_cluster(CSeqClusters &cluster_groups, const CSequenceColl &seq_coll, const bm::bvector<> &bv_seq_ids, bm::bvector<>::size_type seq_id_from, bm::bvector<>::size_type seq_id_to)
Compute AND similarity to all available clusters assign to the most similar using cluster representat...
int main(int argc, char *argv[])
bvector_type::size_type bv_size_type
static void generate_k_mers(CSequenceColl &seq_coll, unsigned k_size, size_t from, size_t to)
static void compute_random_clusters(CSeqClusters &cluster_groups, const CSequenceColl &seq_coll, const bm::bvector<> &bv_total, bm::random_subset< bvector_type > &rsub, unsigned num_clust, float similarity_cut_off, unsigned concurrency)
Pick random sequences as cluster seed elements, try attach initial sequences based on weighted simila...
void wait_for_slot(FV &futures, unsigned *parallel_cnt, unsigned concurrency)
wait for any opening in a list of futures used to schedule parallel tasks with CPU overbooking contro...
void translate_kmer(std::string &dna, bm::id64_t kmer_code, unsigned k_size)
Translate k-mer code into ATGC DNA string.
static void save_kmer_buffers(const std::string &fname, const CSequenceColl &seq_coll)
save k-mer vectors to a file
static int load_FASTA(const std::string &fname, CSequenceColl &seq_coll)
Load multi-sequence FASTA.
static void compute_and_sim_row(unsigned *row, const bm::bvector<> *bv_i, size_t i, const std::vector< std::unique_ptr< bvector_type > > &k_mers_vect)
Compute similarity distances for one row/vector (1:N) of distance matrix.
std::vector< std::pair< bv_size_type, bv_size_type > > bv_ranges_vector
static void load_kmer_buffers(const std::string &fname, CSequenceColl &seq_coll)
Load k-mer vectors.
bm::dynamic_heap_matrix< unsigned, bm::bvector<>::allocator_type > distance_matrix_type
std::string ikd_freq_name
std::string ikd_counts_name
static void compute_group(CSeqGroup &seq_group, const CSequenceColl &seq_coll, const bm::bvector<> &bv_exceptions, float similarity_cut_off)
bm::chrono_taker ::duration_map_type timing_map
bool get_kmer_code(const char *dna, size_t pos, unsigned k_size, bm::id64_t &k_mer)
Calculate k-mer as an unsigned long integer.
std::vector< char > vector_char_type
char int2DNA(unsigned code)
Translate integer code to DNA letter.
void generate_k_mer_bvector(BV &bv, const vector_char_type &seq_vect, unsigned k_size, std::vector< bm::id64_t > &k_buf, const bm::id64_t chunk_size=400000000)
This function turns each k-mer into an integer number and encodes it in a bit-vector (presence vector...
static void assign_to_best_cluster_union(CSeqClusters &cluster_groups, const CSequenceColl &seq_coll, const bm::bvector<> &bv_seq_ids, bm::bvector<>::size_type seq_id_from, bm::bvector<>::size_type seq_id_to)
Compute AND similarity to all available clusters assign to the most similar using UNION of k-mers in ...
static void generate_k_mers_parallel(CSequenceColl &seq_coll, unsigned k_size, unsigned concurrency)
bool get_DNA_code(char bp, bm::id64_t &dna_code)
static void compute_seq_group_union(CSeqGroup &seq_group, const CSequenceColl &seq_coll)
Compute union (Universe) of all k-mers in the cluster group Implemented as a OR of all k-mer fingerpr...
void resolve_duplicates(CSeqGroup &seq_group1, CSeqGroup &seq_group2, const CSequenceColl &seq_coll)
Resolve duplicate members between two groups.
std::atomic_ullong k_mer_progress_count(0)
static void compute_and_sim(distance_matrix_type &dm, const CSequenceColl &seq_coll, const bm::bvector<> &bv_mem, bm::bvector<>::size_type bv_mem_cnt, unsigned concurrency)
Compute similarity distances matrix (COUNT(AND(a, b))
std::vector< std::unique_ptr< bvector_type > > bvector_ptr_vector_type