67#include "dna_finger.h"
111void wait_for_slot(FV& futures,
unsigned* parallel_cnt,
unsigned concurrency)
115 for (
auto e = futures.begin(); e != futures.end(); ++e)
117 std::future_status status = e->wait_for(std::chrono::milliseconds(100));
118 if (status == std::future_status::ready)
120 (*parallel_cnt) -= 1;
125 }
while (*parallel_cnt >= concurrency);
142 m_acc.push_back(acc);
143 m_seqs.emplace_back(seq_ptr);
148 unique_ptr<buffer_type> buf_ptr(
new buffer_type(buf));
150 static std::mutex mtx_counts_lock;
151 std::lock_guard<std::mutex> guard(mtx_counts_lock);
153 if (m_kmer_bufs.size() <= i)
154 m_kmer_bufs.resize(i+1);
155 m_kmer_bufs[i].reset(buf_ptr.release());
160 m_kmer_bufs.resize(this->
size());
164 { assert(m_seqs.size() == m_acc.size());
return m_seqs.size(); }
166 const string&
get_acc(
size_t i)
const {
return m_acc[i]; }
169 size_t seq_size(
size_t i)
const {
return m_seqs[i]->size(); }
174 for (
size_t i = 0; i < m_seqs.size(); ++i)
180 size_t buf_size()
const {
return m_kmer_bufs.size(); }
203 vector<unique_ptr<vector_char_type> > m_seqs;
204 vector<string> m_acc;
205 vector<unique_ptr<buffer_type> > m_kmer_bufs;
212 std::list<std::future<void> > futures;
214 k_mers_vect.resize(0);
215 k_mers_vect.reserve(bv_req_count);
218 for (; en.
valid(); ++en)
222 k_mers_vect.emplace_back(bv);
223 const unsigned char* buf = this->
get_buf(i_idx);
226 futures.emplace_back(
227 std::async(std::launch::async,
233 for (
auto& e : futures)
247 std::string line, acc;
249 std::ifstream fin(fname.c_str(), std::ios::in);
252 for (
size_t i = 0; std::getline(fin, line); ++i)
257 if (line.front() ==
'>')
261 seq_vect->shrink_to_fit();
267 std::size_t pos = line.find_first_of(
":");
268 if (pos == std::string::npos)
274 acc = line.substr(1, pos-1);
278 for (std::string::iterator it = line.begin(); it != line.end(); ++it)
279 seq_vect->push_back(*it);
284 seq_vect->shrink_to_fit();
288 cout <<
"\r \r" << endl;
296 char magic_ch =
'\t';
297 std::ofstream bfile (fname, std::ios::out | std::ios::binary);
300 std::cerr <<
"Cannot open file for write: " << fname << std::endl;
305 size_t sz = seq_coll.
size();
306 bfile.write((
char*)&sz, std::streamsize(
sizeof(sz)));
310 for (
size_t i = 0; i < sz; ++i)
313 const unsigned char* buf = seq_coll.
get_buf(i);
316 bfile.write((
char*)&buf_size, std::streamsize(
sizeof(buf_size)));
320 bfile.write((
char*)&buf_size, std::streamsize(
sizeof(buf_size)));
323 bfile.write((
char*)buf, std::streamsize(buf_size));
324 bfile.write((
char*)&magic_ch, 1);
334 char magic_ch =
'\t';
335 std::ifstream bfile (fname, std::ios::in | std::ios::binary);
338 std::cerr <<
"Cannot open file for read: " << fname << std::endl;
344 bfile.read((
char*)&sz, std::streamsize(
sizeof(sz)));
350 for (
size_t i = 0; i < sz; ++i)
353 bfile.read((
char*)&buf_size, std::streamsize(
sizeof(buf_size)));
356 buf.resize(buf_size);
357 bfile.read((
char*) buf.data(), std::streamsize(buf_size));
359 bfile.read((
char*)&control_ch, 1);
360 if (control_ch != magic_ch)
362 cerr <<
"Error: read failure!" << endl;
403 size_t pos,
unsigned k_size,
411 for (
size_t i = 0; i < k_size; ++i)
418 k_acc |= (dna_code << shift);
431 static char lut[] = {
'A',
'T',
'G',
'C',
'N',
'M',
'$' };
447 for (
size_t i = 0; i < k_size; ++i)
449 unsigned dna_code = unsigned(kmer_code & 3);
477 std::vector<bm::id64_t>& k_buf,
483 if (seq_vect.empty())
485 const char* dna_str = &seq_vect[0];
487 k_buf.reserve(
size_t(chunk_size));
492 vector_char_type::size_type dna_sz = seq_vect.size()-(k_size-1);
493 vector_char_type::size_type pos = 0;
495 for (; pos < dna_sz; ++pos)
500 k_buf.push_back(k_mer_code);
505 const unsigned k_shift = (k_size-1) * 2;
508 for (++pos; pos < dna_sz; ++pos)
511 valid =
get_DNA_code(dna_str[pos + (k_size - 1)], bp_code);
515 for (; pos < dna_sz; ++pos)
520 k_buf.push_back(k_mer_code);
527 k_mer_code = ((k_mer_code >> 2) | (bp_code << k_shift));
529 k_buf.push_back(k_mer_code);
531 if (k_buf.size() == chunk_size)
533 std::sort(k_buf.begin(), k_buf.end());
541 float pcnt = float(pos) / float(dna_sz);
543 cout <<
"\r" << unsigned(pcnt) <<
"% of " << dna_sz
544 <<
" (" << (pos+1) <<
") "
552 std::sort(k_buf.begin(), k_buf.end());
562 size_t from,
size_t to)
565 if (!seq_coll.
size() || (from >= seq_coll.
size()))
568 std::vector<bm::id64_t> k_buf;
573 typedef allocator_type::allocator_pool_type allocator_pool_type;
574 allocator_pool_type pool;
578 mp_guard_bv.assign_if_not_set(pool, bv);
580 if (!to || to >= seq_coll.
size())
581 to = seq_coll.
size()-1;
587 for (
size_t i = from; i <= to; ++i)
600 buf.resize(blob_size);
617 unsigned concurrency)
624 size_t batch_size = total_seq_size / concurrency;
626 batch_size = total_seq_size;
627 std::list<std::future<void> > futures;
629 for (
size_t from = 0; from <= seq_coll.
size(); )
632 for (
size_t to_pick = 0; to < seq_coll.
size(); ++to)
635 if (to_pick >= batch_size)
639 futures.emplace_back(
640 std::async(std::launch::async,
641 [&seq_coll, k_size, from, to]() {
generate_k_mers(seq_coll, k_size, from, to); }
649 unsigned long long cnt = seq_coll.
size();
650 for (
auto& e : futures)
652 unsigned long long c_prev = 0;
655 std::future_status status = e.wait_for(std::chrono::seconds(60));
656 if (status == std::future_status::ready)
662 auto delta = c - c_prev;
665 auto remain_cnt = cnt - c;
666 auto remain_min = remain_cnt / delta;
667 cout <<
"\r" << c <<
": progress per minute=" << delta;
668 if (remain_min < 120)
670 cout <<
" wait for " << remain_min <<
"m " << flush;
674 auto remain_h = remain_min / 60;
675 cout <<
" wait for " << remain_h <<
"h " << flush;
690 : m_lead_id(lead_id), m_bv_members(
bm::
BM_GAP)
692 m_bv_members.
set(lead_id);
709 m_bv_kmer_union |= bv_kmer;
713 std::lock_guard<std::mutex> guard(mtx_add_member_lock);
715 m_bv_kmer_union |= bv_kmer;
720 std::lock_guard<std::mutex> guard(mtx_add_member_lock);
721 m_bv_members.
merge(bv_seq);
722 m_bv_kmer_union.
merge(bv_kmer);
727 std::lock_guard<std::mutex> guard(mtx_add_member_lock);
747 std::mutex mtx_add_member_lock;
789 unsigned concurrency);
809 for (groups_vector_type::iterator it = m_seq_groups.begin();
810 it != m_seq_groups.end(); )
814 auto cnt = bv_mem.
count();
816 it = m_seq_groups.erase(it);
833 m_all_members.
clear();
834 for (groups_vector_type::const_iterator it = m_seq_groups.begin();
835 it != m_seq_groups.end(); ++it)
844 return m_all_members;
850 for (groups_vector_type::const_iterator it = m_seq_groups.begin();
851 it != m_seq_groups.end(); ++it)
856 cnt = cnt / m_seq_groups.size();
863 for (
auto it = sc.m_seq_groups.begin(); it != sc.m_seq_groups.end(); ++it)
864 m_seq_groups.emplace_back(it->release());
865 sc.m_seq_groups.clear();
875 for (
size_t i = 0; i < m_seq_groups.size(); ++i)
878 for (
size_t j = 0; j < i; ++j)
894 const std::vector<std::unique_ptr<bvector_type> >& k_mers_vect)
897 for (j = 0; j < i; ++j)
901 row[j] = unsigned(and_cnt);
903 auto cnt = bv_i->
count();
904 row[j] = unsigned(cnt);
914 unsigned concurrency)
920 const unsigned k_max_electors = 500;
922 if (N > k_max_electors)
925 rsub.
sample(bv_sub, bv_mem, k_max_electors);
935 std::list<std::future<void> > futures;
936 unsigned parallel_cnt = 0;
939 for (i = 0; i < N; ++i)
942 unsigned* row = dm.row(i);
945 if (parallel_cnt < concurrency)
947 futures.emplace_back(
948 std::async(std::launch::async,
949 [row, bv_i, i, &k_mers_vect]()
964 for (
auto& e : futures)
967 dm.replicate_triange();
979 bv_kmer_union.
clear();
985 const unsigned char* buf = seq_coll.
get_buf(idx);
995 unsigned concurrency)
1000 std::list<std::future<void> > futures;
1002 for (
size_t k = 0; k < m_seq_groups.size(); ++k)
1007 auto N = bv_all_members.
count();
1008 auto all_members_count = N; (void) all_members_count;
1011 unsigned k_max_electors = 200 * unsigned(log2(concurrency));
1012 if (k_max_electors < 500)
1013 k_max_electors = 500;
1017 if (N > k_max_electors)
1020 rsub.
sample(bv_sub, bv_all_members, k_max_electors);
1041 assert(bv_mem->
test(leader_idx));
1044 dm.sum(best_score, rank-1);
1048 for (
size_t i = 0; en.
valid(); ++en, ++i)
1051 dm.sum(cand_score, i);
1052 if (cand_score > best_score)
1054 best_score = cand_score;
1059 if (leader_idx != old_leader_idx)
1062 const unsigned char* buf = seq_coll.
get_buf(leader_idx);
1068 futures.emplace_back(
1069 std::async(std::launch::async,
1073 futures.emplace_back(
1074 std::async(std::launch::async,
1081 for (
auto& e : futures)
1089 cout << title << endl;
1090 for (
size_t i = 0; i < m_seq_groups.size(); ++i)
1092 const CSeqGroup* sg = m_seq_groups[i].get();
1095 << bv_mem.
count() << endl;
1097 cout <<
"-----------\nTotal: " << m_seq_groups.size() << endl << endl;
1105 float similarity_cut_off)
1107 assert(similarity_cut_off < 1);
1111 size_t lead_id = seq_group.
get_lead();
1115 const unsigned char* buf = seq_coll.
get_buf(lead_id);
1122 auto i_cnt = bv.
count();
1124 float similarity_target = float(i_cnt *
float(similarity_cut_off));
1130 for (
size_t i = 0; i < sz; ++i)
1132 bool is_except = bv_exceptions.
test(i);
1143 if (and_cnt && (
float(and_cnt) > similarity_target))
1159 if (&seq_group1 == & seq_group2)
1181 auto lead_idx1 = seq_group1.
get_lead();
1182 auto lead_idx2 = seq_group2.
get_lead();
1183 assert(lead_idx1 != lead_idx2);
1185 if (idx == lead_idx1)
1190 if (idx == lead_idx2)
1196 const unsigned char* buf = seq_coll.
get_buf(idx);
1202 if (and_cnt1 >= and_cnt2)
1227 assert(!
bv_kmers[cluster_id].get());
1234 bv_k->
merge(bv_kmer);
1257 en.
go_to(seq_id_from);
1259 for ( ;en.
valid(); ++en)
1262 if (seq_id > seq_id_to)
1264 const unsigned char* buf = seq_coll.
get_buf(seq_id);
1274 size_t cluster_idx(~0ull);
1278 for (
size_t i = 0; i < cluster_groups.
groups_size(); ++i)
1285 if (rep_and_cnt > best_score)
1287 cluster_idx = i; best_score = rep_and_cnt;
1291 if (cluster_idx != ~0ull)
1293 acc.
add(cluster_idx, seq_id, bv_k_mer);
1299 for (
size_t i = 0; i < cluster_groups.
groups_size(); ++i)
1325 en.
go_to(seq_id_from);
1327 for ( ;en.
valid(); ++en)
1330 if (seq_id > seq_id_to)
1332 const unsigned char* buf = seq_coll.
get_buf(seq_id);
1342 size_t cluster_idx(~0ull);
1348 for (
size_t i = 0; i < cluster_groups.
groups_size(); ++i)
1354 if (uni_and_cnt > best_score)
1356 cluster_idx = i; best_score = uni_and_cnt;
1359 if (cluster_idx != ~0ull)
1379 float similarity_cut_off,
1380 unsigned concurrency)
1383 rsub.
sample(bv_rsub, bv_total, num_clust);
1385 std::list<std::future<void> > futures;
1386 unsigned parallel_cnt = 0;
1396 if (parallel_cnt < concurrency)
1398 futures.emplace_back(
1399 std::async(std::launch::async,
1400 [&seq_coll, sg, &bv_rsub, similarity_cut_off]()
1401 {
compute_group(*sg, seq_coll, bv_rsub, similarity_cut_off); }
1415 for (
auto& e : futures)
1423 float similarity_cut_off,
1424 unsigned concurrency)
1426 assert(similarity_cut_off < 1);
1437 const unsigned max_pass = 3;
1438 for (
unsigned pass = 0; pass < max_pass; ++pass)
1443 num_clust, similarity_cut_off, concurrency);
1465 cout <<
" total = " << total_count << endl;
1467 cout <<
" clustered = " << bv_clust.
count() << endl;
1469 bv_total -= bv_clust;
1470 total_count = bv_total.
count();
1471 cout <<
" remain = " << total_count << endl;
1477 std::list<std::future<void> > futures;
1484 assert(pair_vect.size());
1485 for (
size_t k = 0; k < pair_vect.size(); ++k)
1487 auto seq_id_from = pair_vect[k].first;
1488 auto seq_id_to = pair_vect[k].second;
1489 futures.emplace_back(
1490 std::async(std::launch::async,
1491 [&cluster_groups, &seq_coll, &bv_total, seq_id_from, seq_id_to]()
1495 for (
auto& e : futures)
1498 cluster_groups.
print_summary(
"Clusters after phase 2 recruitment");
1503 bv_total -= bv_clust;
1504 rcount = bv_total.
count();
1507 cout <<
"Undistributed sequences = " << rcount << endl;
1517 if (rcount < avg_group_count)
1523 cout <<
"PASS=" << (pass+1) << endl << endl;
1533 cout << endl <<
" clustered = " << bv_clust.
count() << endl;
1535 bv_total -= bv_clust;
1536 rcount = bv_total.
count();
1537 cout <<
" remain = " << rcount << endl;
1546 std::list<std::future<void> > futures;
1548 for (
size_t k = 0; k < pair_vect.size(); ++k)
1550 auto seq_id_from = pair_vect[k].first;
1551 auto seq_id_to = pair_vect[k].second;
1552 futures.emplace_back(
1553 std::async(std::launch::async,
1554 [&seq_clusters, &seq_coll, &bv_total, seq_id_from, seq_id_to]()
1558 for (
auto& e : futures)
1562 cout << endl <<
" clustered = " << bv_clust.
count() << endl;
1564 bv_total -= bv_clust;
1565 rcount = bv_total.
count();
1566 cout <<
" remain = " << rcount << endl;
1593 cerr <<
"cmd-line parse error. " << endl;
1608 cout <<
"Sequences size = " << seq_coll.
size() << endl;
1643 std::cout << std::endl <<
"Performance:" << std::endl;
1648 catch(std::exception& ex)
1650 std::cerr << ex.what() << std::endl;
#define BM_DECLARE_TEMP_BLOCK(x)
Algorithms for fast aggregation of N bvectors.
Algorithms for bvector<> (main include)
Generation of random subset.
Serialization / compression of bvector<>. Set theoretical operations on compressed BLOBs.
Algorithms for bm::sparse_vector.
Compressed sparse container rsc_sparse_vector<> for integer types.
Timing utilities for benchmarking (internal)
pre-processor un-defines to avoid global space pollution (internal)
void merge_from(CSeqClusters &sc)
Acquire all groups from another cluster collection.
size_t groups_size() const
void elect_leaders(const CSequenceColl &seq_coll, unsigned concurrency)
Find the best representatives in all cluster groups; the criterion is maximum absolute similarity to al...
void resolve_duplicates(const CSequenceColl &seq_coll)
Resolve duplicate membership between groups.
CSeqClusters(const CSeqClusters &)=delete
void clear_empty_groups()
Remove groups which turned empty after clusterization.
CSeqGroup * get_group(size_t idx)
void print_summary(const char *title) const
print clusterization report
void add_group(CSeqGroup *sg)
std::vector< std::unique_ptr< CSeqGroup > > groups_vector_type
const bm::bvector & union_all_groups()
Compute union of all cluster group members.
bm::id64_t compute_avg_count() const
calculate avg cluster population count
void take_group(bm::bvector<> &bv_members)
members moved into their own group
Group (cluster) of sequences.
const bm::bvector & get_members() const
void add_member(bm::id64_t id, const bm::bvector<> &bv_kmer)
bm::id64_t count_and_union_sync(const bm::bvector<> &bv)
const bm::bvector & get_rep() const
bm::bvector & get_kmer_union()
CSeqGroup(bm::id64_t lead_id=~0ull)
const bm::bvector & get_kmer_union() const
void add_member_sync(bm::id64_t id, const bm::bvector<> &bv_kmer)
void set_lead(bm::id64_t lead_id)
set id for the group representative
bm::bvector & get_members()
void add_member(bm::id64_t id)
add a member to the group
void merge_member_sync(bm::bvector<> &bv_seq, bm::bvector<> &bv_kmer)
bool is_assigned()
check if cluster is non-empty
bm::id64_t get_lead() const
Get lead id.
void clear_member(bm::id64_t id)
Collection of sequences and k-mer fingerprint vectors.
void add_sequence(const string &acc, vector_char_type *seq_ptr)
CSequenceColl(const CSequenceColl &)=delete
void deserialize_k_mers(bvector_ptr_vector_type &k_mers_vect, const bm::bvector<> &bv_req, bm::bvector<>::size_type bv_req_count) const
Deserialize group of k-mer fingerprint vectors.
size_t total_seq_size() const
std::vector< unsigned char > buffer_type
const vector_char_type & get_sequence(size_t i) const
size_t get_buf_size(size_t i) const
Get k-mer vector BLOB size.
size_t seq_size(size_t i) const
const string & get_acc(size_t i) const
void set_buffer(size_t i, const buffer_type &buf)
const unsigned char * get_buf(size_t i) const
Get k-mer BLOB pointer.
void reset()
Reset aggregate groups, forget all attached vectors.
void combine_or(bvector_type &bv_target)
Aggregate added group of vectors using logical OR. Operation does NOT perform an explicit reset of arg...
void set_optimization(typename bvector_type::optmode opt=bvector_type::opt_compress) BMNOEXCEPT
set on-the-fly bit-block compression By default aggregator does not try to optimize result,...
size_t add(const bvector_type *bv, unsigned agr_group=0)
Attach source bit-vector to an argument group (0 or 1).
Constant iterator designed to enumerate "ON" bits.
bool go_to(size_type pos) BMNOEXCEPT
go to a specific position in the bit-vector (or next)
bool valid() const BMNOEXCEPT
Checks if iterator is still valid.
Bitvector Bit-vector container with runtime compression of bits.
bool test(size_type n) const BMNOEXCEPT
returns true if bit n is set and false if bit n is 0.
void merge(bm::bvector< Alloc > &bvect)
Merge/move content from another vector.
allocator_type::allocator_pool_type allocator_pool_type
size_type size() const BMNOEXCEPT
Returns bvector's capacity (number of bits it can store)
size_type count() const BMNOEXCEPT
population count (count of ON bits)
bm::bvector< Alloc > & bit_and(const bm::bvector< Alloc > &bv1, const bm::bvector< Alloc > &bv2, typename bm::bvector< Alloc >::optmode opt_mode=opt_none)
3-operand AND : this := bv1 AND bv2
bvector< Alloc > & set(size_type n, bool val=true)
Sets bit n if val is true, clears bit n if val is false.
void set_allocator_pool(allocator_pool_type *pool_ptr) BMNOEXCEPT
Set allocator pool for local (non-threaded) memory cyclic (lots of alloc-free ops) operations.
void optimize(bm::word_t *temp_block=0, optmode opt_mode=opt_compress, statistics *stat=0)
Optimize bitvector's memory allocation.
bool any() const BMNOEXCEPT
Returns true if any bits in this bitset are set, and otherwise returns false.
bvector_size_type size_type
enumerator first() const
Returns enumerator pointing on the first non-zero bit.
void swap(bvector< Alloc > &bvect) BMNOEXCEPT
Exchanges content of bv and this bvector.
size_type count_range(size_type left, size_type right, const rs_index_type &rs_idx) const BMNOEXCEPT
Returns count of 1 bits in the given range [left..right] Uses rank-select index to accelerate the sea...
size_type get_first() const BMNOEXCEPT
find first 1 bit in vector. Function may return 0 and this requires an extra check if bit 0 is actual...
bvector< Alloc > & set_range(size_type left, size_type right, bool value=true)
Sets all bits in the specified closed interval [left,right] Interval must be inside the bvector's siz...
void clear(const size_type *ids, size_type ids_size, bm::sort_order so=bm::BM_UNKNOWN)
clear list of bits in this bitset
void set_bit_no_check(size_type n)
Set bit without checking preconditions (size, etc)
Utility class to collect performance measurements and statistics.
std::map< std::string, statistics > duration_map_type
test name to duration map
static void print_duration_map(TOut &tout, const duration_map_type &dmap, format fmt=ct_time)
Deserializer, performs logical operations between bit-vector and serialized bit-vector.
size_type deserialize(bvector_type &bv, const unsigned char *buf, set_operation op, bool exit_on_one=false)
Deserialize bvector using buffer as set operation argument.
void sample(BV &bv_out, const BV &bv_in, size_type sample_count)
Get random subset of input vector.
Bit-vector serialization class.
void set_bookmarks(bool enable, unsigned bm_interval=256) BMNOEXCEPT
Add skip-markers to serialization BLOB for faster range decode at the expense of some BLOB size incre...
size_type serialize(const BV &bv, unsigned char *buf, size_t buf_size)
Bitvector serialization into memory block.
bm::alloc_pool_guard< allocator_pool_type, bvector< Alloc > > mem_pool_guard
@ BM_SORTED
input set is sorted (ascending order)
@ BM_GAP
GAP compression is ON.
size_t deserialize(BV &bv, const unsigned char *buf, bm::word_t *temp_block=0, const bm::bv_ref_vector< BV > *ref_vect=0)
Bitvector deserialization from a memory BLOB.
void rank_range_split(const BV &bv, typename BV::size_type rank, PairVect &target_v)
Algorithm to identify bit-vector ranges (splits) for the rank.
BV::size_type count_and(const BV &bv1, const BV &bv2) BMNOEXCEPT
Computes bitcount of AND operation of two bitsets.
unsigned long long int id64_t
std::vector< std::pair< bv_size_type, bv_size_type > > bv_ranges_vector
Utility class to accumulate changes to cluster before committing them (mutex synchronous operation)
void add(size_t cluster_id, bm::bvector<>::size_type m_id, bm::bvector<> &bv_kmer)
bvector_ptr_vector_type bv_kmers
bvector_ptr_vector_type bv_members
size_t max_serialize_mem
estimated maximum memory for serialization
Statistical information about bitset's memory allocation details.
static int parse_args(int argc, char *argv[])
std::vector< char > vector_char_type
static void compute_jaccard_clusters(CSeqClusters &seq_clusters, const CSequenceColl &seq_coll, unsigned num_clust, float similarity_cut_off, unsigned concurrency)
static void assign_to_best_cluster(CSeqClusters &cluster_groups, const CSequenceColl &seq_coll, const bm::bvector<> &bv_seq_ids, bm::bvector<>::size_type seq_id_from, bm::bvector<>::size_type seq_id_to)
Compute AND similarity to all available clusters, assign to the most similar using cluster representat...
int main(int argc, char *argv[])
bvector_type::size_type bv_size_type
static void generate_k_mers(CSequenceColl &seq_coll, unsigned k_size, size_t from, size_t to)
static void compute_random_clusters(CSeqClusters &cluster_groups, const CSequenceColl &seq_coll, const bm::bvector<> &bv_total, bm::random_subset< bvector_type > &rsub, unsigned num_clust, float similarity_cut_off, unsigned concurrency)
Pick random sequences as cluster seed elements, try attach initial sequences based on weighted simila...
void wait_for_slot(FV &futures, unsigned *parallel_cnt, unsigned concurrency)
wait for any opening in a list of futures used to schedule parallel tasks with CPU overbooking contro...
void translate_kmer(std::string &dna, bm::id64_t kmer_code, unsigned k_size)
Translate k-mer code into ATGC DNA string.
static void save_kmer_buffers(const std::string &fname, const CSequenceColl &seq_coll)
save k-mer vectors to a file
static int load_FASTA(const std::string &fname, CSequenceColl &seq_coll)
Load multi-sequence FASTA.
static void compute_and_sim_row(unsigned *row, const bm::bvector<> *bv_i, size_t i, const std::vector< std::unique_ptr< bvector_type > > &k_mers_vect)
Compute similarity distances for one row/vector (1:N) of distance matrix.
std::vector< std::pair< bv_size_type, bv_size_type > > bv_ranges_vector
static void load_kmer_buffers(const std::string &fname, CSequenceColl &seq_coll)
Load k-mer vectors.
bm::dynamic_heap_matrix< unsigned, bm::bvector<>::allocator_type > distance_matrix_type
std::string ikd_freq_name
std::string ikd_counts_name
static void compute_group(CSeqGroup &seq_group, const CSequenceColl &seq_coll, const bm::bvector<> &bv_exceptions, float similarity_cut_off)
bm::chrono_taker ::duration_map_type timing_map
bool get_kmer_code(const char *dna, size_t pos, unsigned k_size, bm::id64_t &k_mer)
Calculate k-mer as an unsigned long integer.
std::vector< char > vector_char_type
char int2DNA(unsigned code)
Translate integer code to DNA letter.
void generate_k_mer_bvector(BV &bv, const vector_char_type &seq_vect, unsigned k_size, std::vector< bm::id64_t > &k_buf, const bm::id64_t chunk_size=400000000)
This function turns each k-mer into an integer number and encodes it in a bit-vector (presence vector...
static void assign_to_best_cluster_union(CSeqClusters &cluster_groups, const CSequenceColl &seq_coll, const bm::bvector<> &bv_seq_ids, bm::bvector<>::size_type seq_id_from, bm::bvector<>::size_type seq_id_to)
Compute AND similarity to all available clusters, assign to the most similar using UNION of k-mers in ...
static void generate_k_mers_parallel(CSequenceColl &seq_coll, unsigned k_size, unsigned concurrency)
bool get_DNA_code(char bp, bm::id64_t &dna_code)
static void compute_seq_group_union(CSeqGroup &seq_group, const CSequenceColl &seq_coll)
Compute union (Universe) of all k-mers in the cluster group. Implemented as an OR of all k-mer fingerpr...
void resolve_duplicates(CSeqGroup &seq_group1, CSeqGroup &seq_group2, const CSequenceColl &seq_coll)
Resolve duplicate members between two groups.
std::atomic_ullong k_mer_progress_count(0)
static void compute_and_sim(distance_matrix_type &dm, const CSequenceColl &seq_coll, const bm::bvector<> &bv_mem, bm::bvector<>::size_type bv_mem_cnt, unsigned concurrency)
Compute similarity distances matrix (COUNT(AND(a, b)))
std::vector< std::unique_ptr< bvector_type > > bvector_ptr_vector_type