72#include "dna_finger.h"
121 std::ifstream fin(fname.c_str(), std::ios::in);
126 for (
unsigned i = 0; std::getline(fin, line); ++i)
131 for (std::string::iterator it = line.begin(); it != line.end(); ++it)
132 seq_vect.push_back(*it);
166 size_t pos,
unsigned k_size,
174 for (
size_t i = 0; i < k_size; ++i)
181 k_acc |= (dna_code << shift);
194 static char lut[] = {
'A',
'T',
'G',
'C',
'N',
'$' };
210 for (
size_t i = 0; i < k_size; ++i)
212 unsigned dna_code = unsigned(kmer_code & 3);
225 size_t pos,
unsigned k_size,
228 for (
size_t i = 0; i < k_size; ++i)
230 char bp = dna[pos+i];
231 unsigned dna_code = unsigned(k_mer & 3ull);
235 if (bp ==
'N' && bp_c !=
'A')
237 cerr << bp <<
" " << bp_c << endl;
238 cerr <<
"Error! N code mismatch at pos = " << pos+i
247 cerr <<
"Error! non-zero k-mer remainder at " << pos << endl;
255template<
typename VECT>
258 std::sort(vect.begin(), vect.end());
259 auto last = std::unique(vect.begin(), vect.end());
260 vect.erase(last, vect.end());
267template<
typename VECT,
typename COUNT_VECT>
272 std::sort(vect.begin(), vect.end());
273 typename VECT::value_type prev = vect[0];
274 typename COUNT_VECT::value_type cnt = 1;
275 auto vsize = vect.size();
277 for (; i < vsize; ++i)
285 cvect.inc_not_null(prev, cnt);
289 cvect.inc_not_null(prev, cnt);
290 assert(cvect.in_sync());
316 if (seq_vect.empty())
318 const char* dna_str = &seq_vect[0];
320 std::vector<bm::id64_t> k_buf;
321 k_buf.reserve(chunk_size);
325 vector_char_type::size_type dna_sz = seq_vect.size()-(k_size-1);
326 vector_char_type::size_type pos = 0;
328 for (; pos < dna_sz; ++pos)
333 k_buf.push_back(k_mer_code);
338 const unsigned k_shift = (k_size-1) * 2;
341 for (++pos; pos < dna_sz; ++pos)
344 valid =
get_DNA_code(dna_str[pos + (k_size - 1)], bp_code);
348 for (; pos < dna_sz; ++pos)
353 k_buf.push_back(k_mer_code);
360 k_mer_code = ((k_mer_code >> 2) | (bp_code << k_shift));
363 k_buf.push_back(k_mer_code);
371 assert(k_check == k_mer_code);
374 if (k_buf.size() >= chunk_size)
384 float pcnt = float(pos) / float(dna_sz);
386 cout <<
"\r" << unsigned(pcnt) <<
"% of " << dna_sz
387 <<
" (" << (pos+1) <<
") "
398 cout <<
"Unique k-mers: " << k_buf.size() << endl;
413 if (seq_vect.empty())
415 const char* dna_str = &seq_vect[0];
416 std::vector<bm::id64_t> k_buf;
417 k_buf.reserve(chunk_size);
420 vector_char_type::size_type dna_sz = seq_vect.size()-(k_size-1);
421 vector_char_type::size_type pos = 0;
423 for (; pos < dna_sz; ++pos)
428 k_buf.push_back(k_mer_code);
432 const unsigned k_shift = (k_size-1) * 2;
435 for (++pos; pos < dna_sz; ++pos)
438 valid =
get_DNA_code(dna_str[pos + (k_size - 1)], bp_code);
442 for (; pos < dna_sz; ++pos)
447 k_buf.push_back(k_mer_code);
454 k_mer_code = ((k_mer_code >> 2) | (bp_code << k_shift));
456 k_buf.push_back(k_mer_code);
458 if (k_buf.size() == chunk_size)
463 float pcnt = float(pos) / float(dna_sz);
465 cout <<
"\r" << unsigned(pcnt) <<
"% of " << dna_sz
466 <<
" (" << (pos+1) <<
") "
491 : m_seq_vect(seq_vect), m_k_size(k_size), m_from(from), m_to(to),
492 m_kmer_counts(kmer_counts)
496 : m_seq_vect(func.m_seq_vect), m_k_size(func.m_k_size),
497 m_from(func.m_from), m_to(func.m_to),
498 m_kmer_counts(func.m_kmer_counts)
506 kmer_counts_part.
sync();
509 if (m_seq_vect.empty())
512 const char* dna_str = &m_seq_vect[0];
513 std::vector<bm::id64_t> k_buf;
514 k_buf.reserve(chunk_size);
516 const auto k_size = m_k_size;
519 vector_char_type::size_type dna_sz = m_seq_vect.size()-(m_k_size-1);
520 vector_char_type::size_type pos = 0;
522 for (; pos < dna_sz; ++pos)
527 if (k_mer_code >= m_from && k_mer_code <= m_to)
528 k_buf.push_back(k_mer_code);
533 const unsigned k_shift = (k_size-1) * 2;
536 for (++pos; pos < dna_sz; ++pos)
539 valid =
get_DNA_code(dna_str[pos + (k_size - 1)], bp_code);
543 for (; pos < dna_sz; ++pos)
548 if (k_mer_code >= m_from && k_mer_code <= m_to)
549 k_buf.push_back(k_mer_code);
556 k_mer_code = ((k_mer_code >> 2) | (bp_code << k_shift));
558 if (k_mer_code >= m_from && k_mer_code <= m_to)
560 k_buf.push_back(k_mer_code);
561 if (k_buf.size() == chunk_size)
574 static std::mutex mtx_counts_lock;
575 std::lock_guard<std::mutex> guard(mtx_counts_lock);
598 unsigned concurrency)
601 typedef std::vector<std::pair<bv_size_type, bv_size_type> >
bv_ranges_vector;
608 if (split_rank < concurrency || concurrency < 2)
619 std::vector<std::future<void> > futures;
620 futures.reserve(concurrency);
622 for (
size_t k = 0; k < pair_vect.size(); ++k)
624 futures.emplace_back(std::async(std::launch::async,
633 for (
auto& e : futures)
638 std::future_status status = e.wait_for(std::chrono::seconds(60));
639 if (status == std::future_status::ready)
641 cout <<
"\r" << ++m <<
" min" << flush;
655 std::string kmer_str;
656 typename BV::size_type cnt = 0;
657 typename BV::enumerator en = bv_kmers.first();
658 for ( ;en.valid(); ++en, ++cnt)
660 auto k_mer_code = *en;
674 typename BV::size_type count =
dna_scanner.FindCount(kmer_str);
676 kmer_counts.
set(k_mer_code, (
unsigned)count);
678 if ((cnt % 1000) == 0)
679 cout <<
"\r" << cnt << flush;
694template<
typename DNA_Scan>
708 : m_parent_scanner(parent_scanner), m_bv_kmers(bv_kmers),
709 m_from(from), m_to(to), m_kmer_counts(kmer_counts)
714 : m_parent_scanner(func.m_parent_scanner), m_bv_kmers(func.m_bv_kmers),
715 m_from(func.m_from), m_to(func.m_to), m_kmer_counts(func.m_kmer_counts)
721 std::string kmer_str;
724 static std::mutex mtx_counts_lock;
728 for ( ;en.
valid(); ++en, ++cnt)
730 auto k_mer_code = *en;
731 if (k_mer_code > m_to)
738 m_Agg.set_compute_count(
true);
739 for (
size_t i = 0; i < kmer_str.size(); ++i)
741 const bvector_type& bv_mask = m_parent_scanner.GetVector(kmer_str[i]);
745 m_Agg.combine_shift_right_and(bv_search_res);
757 std::lock_guard<std::mutex> guard(mtx_counts_lock);
758 m_kmer_counts.
set(k_mer_code,
unsigned(search_count));
759 assert(m_kmer_counts.
in_sync());
768 const DNA_Scan& m_parent_scanner;
783 unsigned concurrency)
786 typedef std::vector<std::pair<bv_size_type, bv_size_type> >
bv_ranges_vector;
793 if (split_rank < concurrency || concurrency < 2)
805 std::vector<std::future<void> > futures;
806 futures.reserve(concurrency);
808 for (
size_t k = 0; k < pair_vect.size(); ++k)
810 futures.emplace_back(std::async(std::launch::async,
819 for (
auto& e : futures)
821 unsigned long long c_prev = 0;
824 std::future_status status = e.wait_for(std::chrono::seconds(60));
825 if (status == std::future_status::ready)
831 auto delta = c - c_prev;
834 auto remain_cnt = cnt - c;
835 auto remain_min = remain_cnt / delta;
836 cout <<
"\r" << c <<
": progress per minute=" << delta;
837 if (remain_min < 120)
839 cout <<
" wait for " << remain_min <<
"m " << flush;
843 auto remain_h = remain_min / 60;
844 cout <<
" wait for " << remain_h <<
"h " << flush;
864 auto en = bv_null->first();
865 for (; en.valid(); ++en)
867 auto kmer_code = *en;
868 auto kmer_count = kmer_counts.
get(kmer_code);
869 auto mit = hmap.find(kmer_count);
870 if (mit == hmap.end())
871 hmap[kmer_count] = 1;
884 outf.open(fname, ios::out | ios::trunc );
886 outf <<
"kmer count \t number of kmers\n";
888 auto it = hmap.rbegin();
auto it_end = hmap.rend();
889 for (; it != it_end; ++it)
891 outf << it->first <<
"\t" << it->second << endl;
924 auto total_kmers = bv_null->count();
925 bm::id64_t target_f_count = (total_kmers * percent) / 100;
927 auto it = hmap.rbegin();
928 auto it_end = hmap.rend();
929 for (; it != it_end; ++it)
931 auto kmer_count = it->first;
933 scanner.
find_eq(kmer_counts, kmer_count, bv_found);
934 auto found_cnt = bv_found.count();
939 for (; en.
valid(); ++en)
941 auto kmer_code = *en;
942 unsigned k_count = kmer_counts.
get(kmer_code);
949 assert(k_count == kmer_count);
951 frequent_bv.set(kmer_code);
952 if (kmer_count >= target_f_count)
954 frequent_bv.optimize();
962 frequent_bv.optimize();
966int main(
int argc,
char *argv[])
976 cerr <<
"cmd-line parse error. " << endl;
989 std::cout <<
"FASTA sequence size=" << seq_vect.size() << std::endl;
994 cout <<
"k-mer generation for k=" <<
ik_size << endl;
998 cout <<
"Found " << bv_kmers.
count() <<
" k-mers." << endl;
1002 bm::print_bvector_stat(cout,bv_kmers);
1003 size_t blob_size = bm::compute_serialization_size(bv_kmers);
1004 cout <<
"DNA 2-bit coded FASTA size=" << seq_vect.size()/4 << endl;
1005 cout <<
" Compressed size=" << blob_size << endl;
1013 bm::SaveBVector(
ikd_name.c_str(), bv_kmers);
1016 if (seq_vect.size())
1022 if (seq_vect.size() &&
1026 rsc_kmer_counts.
sync();
1028 rsc_kmer_counts2.
sync();
1031 cout <<
" Searching for k-mer counts..." << endl;
1035 cout <<
" ... using bm::aggregator<>" << endl;
1045 cout <<
" ... using std::sort() and count" << endl;
1058 cerr <<
"Error: Count vector save failed!" << endl;
1065 bool eq = rsc_kmer_counts.
equal(rsc_kmer_counts2);
1072 auto v1 = rsc_kmer_counts.
get(idx);
1073 auto v2 = rsc_kmer_counts2.
get(idx);
1074 cerr <<
"Mismatch at idx=" << idx <<
" v1=" << v1 <<
" v2=" << v2 << endl;
1075 assert(found); (void)found;
1077 cerr <<
"Integrity check failed!" << endl;
1111 cout <<
"Found frequent k-mers: " << bv_freq.
count() << endl;
1126 std::cout << std::endl <<
"Performance:" << std::endl;
1131 catch(std::exception& ex)
1133 std::cerr << ex.what() << std::endl;
Algorithms for fast aggregation of N bvectors.
Algorithms for bvector<> (main include)
Serialization / compression of bvector<>. Set theoretical operations on compressed BLOBs.
Algorithms for bm::sparse_vector.
Compressed sparse container rsc_sparse_vector<> for integer types.
Timing utilities for benchmarking (internal)
pre-processor un-defines to avoid global space pollution (internal)
k-mer counting job functor class using bm::aggregator<>
DNA_Scan::bvector_type bvector_type
Counting_JobFunctor(const DNA_Scan &parent_scanner, const bvector_type &bv_kmers, size_type from, size_type to, rsc_sparse_vector_u32 &kmer_counts)
constructor
bvector_type::size_type size_type
Counting_JobFunctor(const Counting_JobFunctor &func)
copy-ctor
void operator()()
Main logic (functor)
Utility for keeping all DNA finger print vectors and search using various techniques.
void BuildParallel(const vector< char > &sequence, unsigned threads)
Build fingerprint bit-vectors using bulk insert iterator and parallel processing.
Functor to process job batch (task)
void operator()()
Main logic (functor)
SortCounting_JobFunctor(const vector_char_type &seq_vect, unsigned k_size, size_type from, size_type to, rsc_sparse_vector_u32 &kmer_counts)
constructor
SortCounting_JobFunctor(const SortCounting_JobFunctor &func)
bvector_type::size_type size_type
Constant iterator designed to enumerate "ON" bits.
bool valid() const BMNOEXCEPT
Checks if iterator is still valid.
Bitvector Bit-vector container with runtime compression of bits.
size_type count() const BMNOEXCEPT
population count (count of ON bits)
bm::bvector< Alloc > & bit_sub(const bm::bvector< Alloc > &bv1, const bm::bvector< Alloc > &bv2, typename bm::bvector< Alloc >::optmode opt_mode=opt_none)
3-operand SUB : this := bv1 MINUS bv2 SUBtraction is also known as AND NOT
void optimize(bm::word_t *temp_block=0, optmode opt_mode=opt_compress, statistics *stat=0)
Optimize memory bitvector's memory allocation.
bvector_size_type size_type
Utility class to collect performance measurements and statistics.
std::map< std::string, statistics > duration_map_type
test name to duration map
static void print_duration_map(TOut &tout, const duration_map_type &dmap, format fmt=ct_time)
void merge_not_null(rsc_sparse_vector< Val, SV > &csv)
merge two vectors (argument gets destroyed) It is important that both vectors have the same NULL vect...
void set(size_type idx, value_type v)
set specified element with bounds checking and automatic resize
void optimize(bm::word_t *temp_block=0, typename bvector_type::optmode opt_mode=bvector_type::opt_compress, statistics *stat=0)
run memory optimization for all vector slices
void sync(bool force)
Re-calculate rank-select index for faster access to vector.
bool equal(const rsc_sparse_vector< Val, SV > &csv) const BMNOEXCEPT
check if another vector has the same content
value_type get(size_type idx) const BMNOEXCEPT
get specified element without bounds checking
const bvector_type * get_null_bvector() const BMNOEXCEPT
Get bit-vector of assigned values (or NULL)
bool in_sync() const BMNOEXCEPT
returns true if prefix sum table is in sync with the vector
SV::bvector_type bvector_type
algorithms for sparse_vector scan/search
void find_eq(const SV &sv, value_type value, bvector_type &bv_out)
find all sparse vector elements EQ to search value
succinct sparse vector with runtime compression using bit-slicing / transposition method
@ BM_SORTED
input set is sorted (ascending order)
@ BM_GAP
GAP compression is ON.
void rank_range_split(const BV &bv, typename BV::size_type rank, PairVect &target_v)
Algorithm to identify bit-vector ranges (splits) for the rank.
bool sparse_vector_find_first_mismatch(const SV &sv1, const SV &sv2, typename SV::size_type &midx, bm::null_support null_proc=bm::use_null)
Find first mismatch (element which is different) between two sparse vectors (uses linear scan in bit-...
unsigned long long int id64_t
bm::bvector ::size_type bv_size_type
std::vector< std::pair< bv_size_type, bv_size_type > > bv_ranges_vector
static int parse_args(int argc, char *argv[])
bm::aggregator< bm::bvector<> > aggregator_type
std::vector< char > vector_char_type
DNA_FingerprintScanner< bm::bvector<> > dna_scanner_type
int main(int argc, char *argv[])
void validate_k_mer(const char *dna, size_t pos, unsigned k_size, bm::id64_t k_mer)
QA function to validate if reverse k-mer decode gives the same string.
void compute_frequent_kmers(BV &frequent_bv, const histogram_map_u32 &hmap, const rsc_sparse_vector_u32 &kmer_counts, unsigned percent, unsigned k_size)
Create vector, representing subset of k-mers of high frequency.
void generate_k_mer_bvector(BV &bv, const vector_char_type &seq_vect, unsigned k_size, bool check)
This function turns each k-mer into an integer number and encodes it in a bit-vector (presense vector...
void translate_kmer(std::string &dna, bm::id64_t kmer_code, unsigned k_size)
Translate k-mer code into ATGC DNA string.
static void report_hmap(const string &fname, const histogram_map_u32 &hmap)
Save TSV report of k-mer frequences (reverse sorted, most frequent k-mers first)
std::map< unsigned, unsigned > histogram_map_u32
static void compute_kmer_histogram(histogram_map_u32 &hmap, const rsc_sparse_vector_u32 &kmer_counts)
Compute a map of how often each k-mer frequency is observed in the k-mer counts vector.
bm::sparse_vector< unsigned, bm::bvector<> > sparse_vector_u32
void count_kmers(const vector_char_type &seq_vect, unsigned k_size, rsc_sparse_vector_u32 &kmer_counts)
k-mer counting algorithm using reference sequence, regenerates k-mer codes, sorts them and counts
dna_scanner_type dna_scanner
std::string ikd_freq_name
std::string ikd_counts_name
bm::chrono_taker ::duration_map_type timing_map
bool get_kmer_code(const char *dna, size_t pos, unsigned k_size, bm::id64_t &k_mer)
Calculate k-mer as an unsigned long integer.
std::vector< char > vector_char_type
bm::rsc_sparse_vector< unsigned, sparse_vector_u32 > rsc_sparse_vector_u32
void sort_unique(VECT &vect)
Auxiliary function to do sort+unique on a vactor of ints removes duplicate elements.
char int2DNA(unsigned code)
Translate integer code to DNA letter.
void count_kmers_parallel(const BV &bv_kmers, const vector_char_type &seq_vect, rsc_sparse_vector_u32 &kmer_counts, unsigned k_size, unsigned concurrency)
MT k-mer counting.
static int load_FASTA(const std::string &fname, vector_char_type &seq_vect)
really simple FASTA parser (one entry per file)
void sort_count(VECT &vect, COUNT_VECT &cvect)
Auxiliary function to do sort+unique on a vactor of ints and save results in a counts vector.
bool get_DNA_code(char bp, bm::id64_t &dna_code)
std::atomic_ullong k_mer_progress_count(0)