#include <iostream>
#include <sstream>
#include <chrono>
#include <regex>
#include <time.h>
#include <stdio.h>
#include <stdexcept>
#include <memory>
#include <vector>
#include <chrono>
#include <map>
#include <utility>
#include <algorithm>
#include <unordered_map>
#include "bmdbg.h"
using namespace std;
static
{
std::cerr
<< "BitMagic DNA Search Sample (c) 2018" << std::endl
<< "-fa file-name -- input FASTA file" << std::endl
<< "-s hi|lo -- run substring search benchmark" << std::endl
<< "-diag -- run diagnostics" << std::endl
<< "-timing -- collect timings" << std::endl
;
}
static
{
for (int i = 1; i < argc; ++i)
{
std::string arg = argv[i];
if ((arg == "-h") || (arg == "--help"))
{
return 0;
}
if (arg == "-fa" || arg == "--fa")
{
if (i + 1 < argc)
{
}
else
{
std::cerr << "Error: -fa requires file name" << std::endl;
return 1;
}
continue;
}
if (arg == "-diag" || arg == "--diag" || arg == "-d" || arg == "--d")
if (arg == "-timing" || arg == "--timing" || arg == "-t" || arg == "--t")
if (arg == "-bench" || arg == "--bench" || arg == "-b" || arg == "--b")
if (arg == "-search" || arg == "--search" || arg == "-s" || arg == "--s")
{
if (i + 1 < argc)
{
std::string a = argv[i+1];
if (a != "-")
{
if (a == "l" || a == "lo")
{
++i;
}
else
if (a == "h" || a == "hi")
{
++i;
}
}
}
}
}
return 0;
}
typedef std::map<std::string, unsigned>
freq_map;
typedef std::vector<std::pair<unsigned, std::string> >
dict_vect;
static
int load_FASTA(
const std::string& fname, std::vector<char>& seq_vect)
{
seq_vect.resize(0);
std::ifstream fin(fname.c_str(), std::ios::in);
if (!fin.good())
return -1;
std::string line;
for (unsigned i = 0; std::getline(fin, line); ++i)
{
if (line.empty() ||
line.front() == '>')
continue;
for (std::string::iterator it = line.begin(); it != line.end(); ++it)
seq_vect.push_back(*it);
}
return 0;
}
{
public:
void Build(
const vector<char>& sequence)
{
for (size_t i = 0; i < sequence.size(); ++i)
{
unsigned pos = unsigned(i);
switch (sequence[i])
{
case 'A':
iA = pos;
break;
case 'C':
iC = pos;
break;
case 'G':
iG = pos;
break;
case 'T':
iT = pos;
break;
case 'N':
iN = pos;
break;
default:
break;
}
}
}
{
switch (letter)
{
case 'A':
case 'C':
case 'G':
case 'T':
case 'N':
default:
break;
}
throw runtime_error("Error. Invalid letter!");
}
void Find(
const string& word, vector<unsigned>& res)
{
if (word.empty())
return;
for (size_t i = 1; i < word.size(); ++i)
{
bv.shift_right();
bv &= bv_mask;
if (!any)
break;
}
unsigned ws = unsigned(word.size()) - 1;
};
void FindAggFused(
const string& word, vector<unsigned>& res)
{
if (word.empty())
return;
for (size_t i = 0; i < word.size(); ++i)
{
}
unsigned ws = unsigned(word.size()) - 1;
};
vector<vector<unsigned>>& hits)
{
vector<unique_ptr<aggregator_type> > agg_pipeline;
unsigned ws = 0;
for (const auto& w : words)
{
const string& word = get<0>(w);
for (size_t i = 0; i < word.size(); ++i)
{
agg_ptr->add(&bv_mask);
}
agg_pipeline.emplace_back(agg_ptr.release());
ws = unsigned(word.size()) - 1;
}
vector<unique_ptr<aggregator_type> >::iterator>(agg_pipeline.begin(), agg_pipeline.end());
for (size_t i = 0; i < agg_pipeline.size(); ++i)
{
vector<unsigned> res;
res.reserve(12000);
hits.emplace_back(res);
}
}
protected:
unsigned left_shift,
vector<unsigned>& res)
{
{
auto pos = *en;
res.push_back(pos - left_shift);
}
}
private:
};
static
vector<tuple<string,int>>& lo_words,
const vector<char>& data,
size_t N,
unsigned word_size)
{
cout << "k-mer generation... " << endl;
top_words.clear();
lo_words.clear();
if (data.size() < word_size)
return;
size_t end_pos = data.size() - word_size;
size_t i = 0;
map<string, int> words;
while (i < end_pos)
{
string s(&data[i], word_size);
if (s.find('N') == string::npos)
words[s] += 1;
i += word_size;
if (i % 10000 == 0)
{
cout << "\r" << i << "/" << end_pos << flush;
}
}
cout << endl << "Picking k-mer samples..." << flush;
multimap<int,string, greater<int>> dst;
for_each(words.begin(), words.end(), [&](const std::pair<string,int>& p)
{
dst.emplace(p.second, p.first);
});
{
auto it = dst.begin();
for(size_t count = 0; count < N && it !=dst.end(); ++it,++count)
top_words.emplace_back(it->second, it->first);
}
{
auto it = dst.rbegin();
for(size_t count = 0; count < N && it !=dst.rend(); ++it, ++count)
lo_words.emplace_back(it->second, it->first);
}
cout << "OK" << endl;
}
static
const char* word, unsigned word_size,
{
if (data.size() < word_size)
return;
size_t i = 0;
size_t end_pos = data.size() - word_size;
while (i < end_pos)
{
bool found = true;
for (size_t j = i, k = 0, l = word_size - 1; l > k; ++j, ++k, --l)
{
if (data[j] != word[k] || data[i + l] != word[l])
{
found = false;
break;
}
}
if (found)
r.push_back(unsigned(i));
++i;
}
}
static
vector<const char*> words,
unsigned word_size,
vector<vector<unsigned>>& hits)
{
if (data.size() < word_size)
return;
size_t i = 0;
size_t end_pos = data.size() - word_size;
size_t words_size = words.size();
while (i < end_pos)
{
for (size_t idx = 0; idx < words_size; ++idx)
{
auto& word = words[idx];
bool found = true;
for (size_t j = i, k = 0, l = word_size - 1; l > k; ++j, ++k, --l)
{
if (data[j] != word[k] || data[i + l] != word[l])
{
found = false;
break;
}
}
if (found)
{
hits[idx].push_back(unsigned(i));
break;
}
}
++i;
}
}
static
{
if (h1.size() != h2.size())
{
cerr << "size1 = " << h1.size() << " size2 = " << h2.size() << endl;
return false;
}
for (size_t i = 0; i < h1.size(); ++i)
{
if (h1[i] != h2[i])
return false;
}
return true;
}
int main(
int argc,
char *argv[])
{
if (argc < 3)
{
return 1;
}
std::vector<char> seq_vect;
try
{
if (ret != 0)
return ret;
{
if (res != 0)
return res;
std::cout << "FASTA sequence size=" << seq_vect.size() << std::endl;
{
}
}
{
vector<tuple<string,int> > h_words;
vector<tuple<string,int> > l_words;
vector<tuple<string,int>>& words =
h_word_set ? h_words : l_words;
vector<THitList> word_hits;
vector<THitList> word_hits_agg;
{
vector<const char*> word_list;
for (const auto& w : words)
{
word_list.push_back(get<0>(w).c_str());
}
word_hits.resize(words.size());
for_each(word_hits.begin(), word_hits.end(), [](
THitList& ht) {
ht.reserve(12000);
});
}
{
}
for (size_t word_idx = 0; word_idx < words.size(); ++ word_idx)
{
auto& word = get<0>(words[word_idx]);
{
word.c_str(), unsigned(word.size()),
hits1);
}
{
}
{
}
{
cout << "Mismatch ERROR for: " << word << endl;
}
else
{
cout << "Sigle pass mismatch ERROR for: " << word << endl;
}
else
{
cout << word_idx << ": " << word << ": " << hits1.size() << " hits " << endl;
}
}
}
{
std::cout << std::endl << "Performance:" << std::endl;
}
}
catch (std::exception& ex)
{
std::cerr << "Error:" << ex.what() << std::endl;
return 1;
}
return 0;
}
Compressed bit-vector bvector<> container, set algebraic methods, traversal iterators.
Algorithms for fast aggregation of N bvectors.
Algorithms for bvector<> (main include)
Serialization / compression of bvector<>. Set theoretical operations on compressed BLOBs.
Timing utilities for benchmarking (internal)
pre-processor un-defines to avoid global space pollution (internal)
Utility for keeping all DNA finger print vectors and search using various techniques.
void FindAggFused(const string &word, vector< unsigned > &res)
This method uses cache blocked aggregator with fused SHIFT+AND kernel.
void Find(const string &word, vector< unsigned > &res)
Find word strings using shift + and on fingerprint vectors (horizontal, non-fused basic method)
void Build(const vector< char > &sequence)
Build fingerprint bit-vectors from the original sequence.
const bm::bvector & GetVector(char letter) const
Return fingerprint bit-vector.
void TranslateResults(const bm::bvector<> &bv, unsigned left_shift, vector< unsigned > &res)
Translate search results vector using (word size) left shift.
void FindCollection(const vector< tuple< string, int > > &words, vector< vector< unsigned >> &hits)
Find a set of words in one pass using pipeline of aggregators (this is very experimental)
Algorithms for fast aggregation of a group of bit-vectors.
void combine_shift_right_and(bvector_type &bv_target)
Aggregate added group of vectors using SHIFT-RIGHT and logical AND Operation does NOT perform an expl...
void reset()
Reset aggregate groups, forget all attached vectors.
const bvector_type * get_target() const BMNOEXCEPT
size_t add(const bvector_type *bv, unsigned agr_group=0)
Attach source bit-vector to a argument group (0 or 1).
Output iterator iterator designed to set "ON" bits based on input sequence of integers.
Constant iterator designed to enumerate "ON" bits.
bool valid() const BMNOEXCEPT
Checks if iterator is still valid.
Bitvector Bit-vector container with runtime compression of bits.
bool any() const BMNOEXCEPT
Returns true if any bits in this bitset are set, and otherwise returns false.
enumerator first() const
Returns enumerator pointing on the first non-zero bit.
Utility class to collect performance measurements and statistics.
std::map< std::string, statistics > duration_map_type
test name to duration map
static void print_duration_map(TOut &tout, const duration_map_type &dmap, format fmt=ct_time)
@ BM_SORTED
input set is sorted (ascending order)
void aggregator_pipeline_execute(It first, It last)
Experimental method ro run multiple aggregators in sync.
static void find_words(const vector< char > &data, vector< const char * > words, unsigned word_size, vector< vector< unsigned >> &hits)
Find all words in one pass (cache coherent algorithm) (variation of 2-way string matching for collect...
int main(int argc, char *argv[])
static int parse_args(int argc, char *argv[])
vector< unsigned > THitList
std::map< std::string, unsigned > freq_map
static const size_t WORD_SIZE
bm::aggregator< bm::bvector<> > aggregator_type
static bool hitlist_compare(const THitList &h1, const THitList &h2)
Check search result match.
bm::chrono_taker ::duration_map_type timing_map
std::vector< std::pair< unsigned, std::string > > dict_vect
static void find_word_2way(vector< char > &data, const char *word, unsigned word_size, THitList &r)
2-way string matching
static int load_FASTA(const std::string &fname, std::vector< char > &seq_vect)
static void generate_kmers(vector< tuple< string, int >> &top_words, vector< tuple< string, int >> &lo_words, const vector< char > &data, size_t N, unsigned word_size)
generate the most frequent words of specified length from the input sequence