BitMagic-C++
xsample04a.cpp
Go to the documentation of this file.
1/*
2Copyright(c) 2018 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
3
4Licensed under the Apache License, Version 2.0 (the "License");
5you may not use this file except in compliance with the License.
6You may obtain a copy of the License at
7
8 http://www.apache.org/licenses/LICENSE-2.0
9
10Unless required by applicable law or agreed to in writing, software
11distributed under the License is distributed on an "AS IS" BASIS,
12WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13See the License for the specific language governing permissions and
14limitations under the License.
15
16For more information please visit: http://bitmagic.io
17*/
18
19/** \example xsample04a.cpp
20
21*/
22
23/*! \file xsample04a.cpp
24 \brief Example: DNA index construction
25
26*/
27
28#include <iostream>
29#include <sstream>
30#include <regex>
31#include <time.h>
32#include <stdio.h>
33
34#include <stdexcept>
35#include <memory>
36#include <vector>
37
38#include <future>
39#include <thread>
40#include <mutex>
41
42#include "bm.h"
43
44#include "bmdbg.h"
45#include "bmtimer.h"
46#include "bmundef.h" /* clear the pre-proc defines from BM */
47
48using namespace std;
49
50static
52{
53 std::cerr
54 << "BitMagic DNA Index Build Sample (c) 2018" << std::endl
55 << "-fa file-name -- input FASTA file" << std::endl
56 << "-j number -- number of parallel jobs to run" << std::endl
57 << "-timing -- collect timings" << std::endl
58 ;
59}
60
61
62
63
64// Arguments
65//
66std::string ifa_name;
67bool is_timing = false;
68unsigned parallel_jobs = 4;
69
70static
71int parse_args(int argc, char *argv[])
72{
73 for (int i = 1; i < argc; ++i)
74 {
75 std::string arg = argv[i];
76 if ((arg == "-h") || (arg == "--help"))
77 {
78 show_help();
79 return 0;
80 }
81 if (arg == "-fa" || arg == "--fa")
82 {
83 if (i + 1 < argc)
84 {
85 ifa_name = argv[++i];
86 }
87 else
88 {
89 std::cerr << "Error: -fa requires file name" << std::endl;
90 return 1;
91 }
92 continue;
93 }
94 if (arg == "-j" || arg == "--j")
95 {
96 if (i + 1 < argc)
97 {
98 parallel_jobs = unsigned(::atoi(argv[++i]));
99 }
100 else
101 {
102 std::cerr << "Error: -j requires number of jobs" << std::endl;
103 return 1;
104 }
105 continue;
106 }
107
108 if (arg == "-timing" || arg == "--timing" || arg == "-t" || arg == "--t")
109 is_timing = true;
110
111 } // for i
112 return 0;
113}
114
115
116
117// ----------------------------------------------------------------------------
118
120
121// FASTA format parser
122static
123int load_FASTA(const std::string& fname, std::vector<char>& seq_vect)
124{
125 bm::chrono_taker tt1(cout, "1. Parse FASTA", 1, &timing_map);
126
127 seq_vect.resize(0);
128 std::ifstream fin(fname.c_str(), std::ios::in);
129 if (!fin.good())
130 return -1;
131
132 std::string line;
133 for (unsigned i = 0; std::getline(fin, line); ++i)
134 {
135 if (line.empty() ||
136 line.front() == '>')
137 continue;
138
139 for (std::string::iterator it = line.begin(); it != line.end(); ++it)
140 seq_vect.push_back(*it);
141 } // for
142 return 0;
143}
144
145
146
147/**
148 Utility for keeping all DNA finger print vectors and search
149 using various techniques
150*/
152{
153public:
154 enum { eA = 0, eC, eG, eT, eN, eEnd };
155
157
158 /// Build fingerprint bit-vectors from the original sequence
159 ///
160 void Build(const vector<char>& sequence)
161 {
162 bm::bvector<>::insert_iterator iA = m_FPrintBV[eA].inserter();
163 bm::bvector<>::insert_iterator iC = m_FPrintBV[eC].inserter();
164 bm::bvector<>::insert_iterator iG = m_FPrintBV[eG].inserter();
165 bm::bvector<>::insert_iterator iT = m_FPrintBV[eT].inserter();
166 bm::bvector<>::insert_iterator iN = m_FPrintBV[eN].inserter();
167
168 for (size_t i = 0; i < sequence.size(); ++i)
169 {
170 unsigned pos = unsigned(i);
171 switch (sequence[i])
172 {
173 case 'A':
174 iA = pos;
175 break;
176 case 'C':
177 iC = pos;
178 break;
179 case 'G':
180 iG = pos;
181 break;
182 case 'T':
183 iT = pos;
184 break;
185 case 'N':
186 iN = pos;
187 break;
188 default:
189 break;
190 }
191 }
192 }
193
194 /// Build index using bulk insert iterator
195 ///
196 void BuildBulk(const vector<char>& sequence)
197 {
203
204 for (size_t i = 0; i < sequence.size(); ++i)
205 {
206 unsigned pos = unsigned(i);
207 switch (sequence[i])
208 {
209 case 'A':
210 iA = pos;
211 break;
212 case 'C':
213 iC = pos;
214 break;
215 case 'G':
216 iG = pos;
217 break;
218 case 'T':
219 iT = pos;
220 break;
221 case 'N':
222 iN = pos;
223 break;
224 default:
225 break;
226 }
227 }
228 }
229
230
231 /// Build fingerprint bit-vectors using bulk insert iterator and parallel
232 /// processing
233 ///
234 void BuildParallel(const vector<char>& sequence, unsigned threads)
235 {
236 struct Func
237 {
238 DNA_FingerprintScanner* target_idx;
239 const std::vector<char>* src_sequence;
240
241 Func(DNA_FingerprintScanner* idx, const vector<char>& src)
242 : target_idx(idx), src_sequence(&src) {}
243
244 void operator() (size_t from, size_t to)
245 {
246 const vector<char>& sequence = *src_sequence;
247 bm::bvector<> bvA, bvT, bvG, bvC, bvN;
248
249 {
255 for (size_t i = from; i < sequence.size() && (i < to); ++i)
256 {
257 unsigned pos = unsigned(i);
258 switch (sequence[i])
259 {
260 case 'A':
261 iA = pos;
262 break;
263 case 'C':
264 iC = pos;
265 break;
266 case 'G':
267 iG = pos;
268 break;
269 case 'T':
270 iT = pos;
271 break;
272 case 'N':
273 iN = pos;
274 break;
275 default:
276 break;
277 }
278 } // for
279 // Bulk insert iterator keeps an buffer, which has to be
280 // flushed, before all bits appear in the target vector
281 //
282 iA.flush();
283 iC.flush();
284 iT.flush();
285 iG.flush();
286 iN.flush();
287 }
288 // merge results of parallel processing back to index
289 target_idx->MergeVector('A', bvA);
290 target_idx->MergeVector('T', bvT);
291 target_idx->MergeVector('G', bvG);
292 target_idx->MergeVector('C', bvC);
293 target_idx->MergeVector('N', bvN);
294 }
295 };
296
297 if (threads <= 1)
298 {
299 BuildBulk(sequence);
300 return;
301 }
302
303 // Create parallel async tasks running on a range of source sequence
304 //
305 std::vector<std::future<void> > futures;
306 futures.reserve(8);
307 unsigned range = unsigned(sequence.size() / threads);
308
309 for (unsigned k = 0; k < sequence.size(); k += range)
310 {
311 futures.emplace_back(std::async(std::launch::async,
312 Func(this, sequence), k, k + range));
313 }
314
315 // wait for all tasks
316 for (auto& e : futures)
317 {
318 e.wait();
319 }
320 }
321
322 /// Thread sync bit-vector merge
323 ///
324 void MergeVector(char letter, bm::bvector<>& bv)
325 {
326 static std::mutex mtx_A;
327 static std::mutex mtx_T;
328 static std::mutex mtx_G;
329 static std::mutex mtx_C;
330 static std::mutex mtx_N;
331
332 switch (letter)
333 {
334 case 'A':
335 {
336 std::lock_guard<std::mutex> guard(mtx_A);
337 m_FPrintBV[eA].merge(bv);
338 }
339 break;
340 case 'C':
341 {
342 std::lock_guard<std::mutex> guard(mtx_C);
343 m_FPrintBV[eC].merge(bv);
344 }
345 break;
346 case 'G':
347 {
348 std::lock_guard<std::mutex> guard(mtx_G);
349 m_FPrintBV[eG].merge(bv);
350 }
351 break;
352 case 'T':
353 {
354 std::lock_guard<std::mutex> guard(mtx_T);
355 m_FPrintBV[eT].merge(bv);
356 }
357 break;
358 case 'N':
359 {
360 std::lock_guard<std::mutex> guard(mtx_N);
361 m_FPrintBV[eN].merge(bv);
362 }
363 break;
364 default:
365 break;
366 }
367
368 }
369
370 /// Return fingerprint bit-vector
371 const bm::bvector<>& GetVector(char letter) const
372 {
373 switch (letter)
374 {
375 case 'A':
376 return m_FPrintBV[eA];
377 case 'C':
378 return m_FPrintBV[eC];
379 case 'G':
380 return m_FPrintBV[eG];
381 case 'T':
382 return m_FPrintBV[eT];
383 case 'N':
384 return m_FPrintBV[eN];
385 default:
386 break;
387 }
388 throw runtime_error("Error. Invalid letter!");
389 }
390
391private:
392 bm::bvector<> m_FPrintBV[eEnd];
393};
394
395/// Check correctness of indexes constructed using different methods
396///
397static
399 const DNA_FingerprintScanner& idx2)
400{
401 std::vector<char> letters {'A', 'T', 'G', 'C'};
402 for (char base : letters)
403 {
404 const bm::bvector<>& bv1 = idx1.GetVector(base);
405 const bm::bvector<>& bv2 = idx2.GetVector(base);
406
407 int cmp = bv1.compare(bv2);
408 if (cmp != 0)
409 {
410 throw runtime_error(string("Fingerprint mismatch for:") + string(1, base));
411 }
412 } // for
413}
414
415
416
417int main(int argc, char *argv[])
418{
419 if (argc < 3)
420 {
421 show_help();
422 return 1;
423 }
424
425 std::vector<char> seq_vect;
426
427 try
428 {
429 auto ret = parse_args(argc, argv);
430 if (ret != 0)
431 return ret;
432
435
436 if (!ifa_name.empty())
437 {
438 auto res = load_FASTA(ifa_name, seq_vect);
439 if (res != 0)
440 return res;
441 std::cout << "FASTA sequence size=" << seq_vect.size() << std::endl;
442
443 {
444 bm::chrono_taker tt1(cout, "2. Build DNA index", 1, &timing_map);
445 idx1.Build(seq_vect);
446 }
447
448 if (parallel_jobs > 0)
449 {
450 std::cout << "jobs = " << parallel_jobs << std::endl;
451 bm::chrono_taker tt1(cout, "3. Build DNA index (bulk, parallel)", 1, &timing_map);
452 idx2.BuildParallel(seq_vect, parallel_jobs);
453 }
454
455 // compare results (correctness check)
456 //
457 fingerprint_compare(idx1, idx2);
458 }
459
460 if (is_timing) // print all collected timings
461 {
462 std::cout << std::endl << "Performance:" << std::endl;
464 }
465 }
466 catch (std::exception& ex)
467 {
468 std::cerr << "Error:" << ex.what() << std::endl;
469 return 1;
470 }
471
472 return 0;
473}
Compressed bit-vector bvector<> container, set algebraic methods, traversal iterators.
Timing utilities for benchmarking (internal)
pre-processor un-defines to avoid global space pollution (internal)
Utility for keeping all DNA finger print vectors and search using various techniques.
Definition: xsample04.cpp:181
void BuildParallel(const vector< char > &sequence, unsigned threads)
Build fingerprint bit-vectors using bulk insert iterator and parallel processing.
Definition: xsample04a.cpp:234
void Build(const vector< char > &sequence)
Build fingerprint bit-vectors from the original sequence.
Definition: xsample04a.cpp:160
void BuildBulk(const vector< char > &sequence)
Build index using bulk insert iterator.
Definition: xsample04a.cpp:196
void MergeVector(char letter, bm::bvector<> &bv)
Thread sync bit-vector merge.
Definition: xsample04a.cpp:324
const bm::bvector & GetVector(char letter) const
Return fingerprint bit-vector.
Definition: xsample04a.cpp:371
Output iterator iterator designed to set "ON" bits based on input sequence of integers.
Definition: bm.h:465
Output iterator iterator designed to set "ON" bits based on input sequence of integers (bit indeces).
Definition: bm.h:381
Bitvector Bit-vector container with runtime compression of bits.
Definition: bm.h:115
void merge(bm::bvector< Alloc > &bvect)
Merge/move content from another vector.
Definition: bm.h:5578
insert_iterator inserter()
Definition: bm.h:1265
int compare(const bvector< Alloc > &bvect) const BMNOEXCEPT
Lexicographical comparison with a bitvector.
Definition: bm.h:3709
Utility class to collect performance measurements and statistics.
Definition: bmtimer.h:41
std::map< std::string, statistics > duration_map_type
test name to duration map
Definition: bmtimer.h:66
static void print_duration_map(TOut &tout, const duration_map_type &dmap, format fmt=ct_time)
Definition: bmtimer.h:150
@ BM_SORTED
input set is sorted (ascending order)
Definition: bmconst.h:205
int main(int argc, char *argv[])
Definition: xsample04a.cpp:417
static int parse_args(int argc, char *argv[])
Definition: xsample04a.cpp:71
static void show_help()
Definition: xsample04a.cpp:51
static void fingerprint_compare(const DNA_FingerprintScanner &idx1, const DNA_FingerprintScanner &idx2)
Check correctness of indexes constructed using different methods.
Definition: xsample04a.cpp:398
std::string ifa_name
Definition: xsample04a.cpp:66
bm::chrono_taker ::duration_map_type timing_map
Definition: xsample04a.cpp:119
static int load_FASTA(const std::string &fname, std::vector< char > &seq_vect)
Definition: xsample04a.cpp:123
unsigned parallel_jobs
Definition: xsample04a.cpp:68
bool is_timing
Definition: xsample04a.cpp:67