BitMagic-C++
bmrandom.h
Go to the documentation of this file.
1#ifndef BMRANDOM__H__INCLUDED__
2#define BMRANDOM__H__INCLUDED__
3/*
4Copyright(c) 2002-2019 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
5
6Licensed under the Apache License, Version 2.0 (the "License");
7you may not use this file except in compliance with the License.
8You may obtain a copy of the License at
9
10 http://www.apache.org/licenses/LICENSE-2.0
11
12Unless required by applicable law or agreed to in writing, software
13distributed under the License is distributed on an "AS IS" BASIS,
14WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15See the License for the specific language governing permissions and
16limitations under the License.
17
18For more information please visit: http://bitmagic.io
19*/
20
21/*! \file bmrandom.h
22 \brief Generation of random subset
23*/
24
25#ifndef BM__H__INCLUDED__
26// BitMagic utility headers do not include main "bm.h" declaration
27// #include "bm.h" or "bm64.h" explicitly
28# error missing include (bm.h or bm64.h)
29#endif
30
31
32#include "bmfunc.h"
33#include "bmdef.h"
34
35#include <stdlib.h>
36#include <random>
37#include <algorithm>
38
39namespace bm
40{
41
42
43/*!
44 Class implements algorithm for random subset generation.
45
46 Implemented method tries to be fair, but doesn't guarantee
47 true randomeness or absense of bias.
48
49 Performace note:
50 Class holds temporary buffers and variables, so it is recommended to
51 re-use instances over multiple calls.
52
53 \ingroup setalgo
54*/
55template<class BV>
57{
58public:
59 typedef BV bvector_type;
60 typedef typename BV::size_type size_type;
61public:
64
65 /// Get random subset of input vector
66 ///
67 /// @param bv_out - destination vector
68 /// @param bv_in - input vector
69 /// @param sample_count - number of bits to pick
70 ///
71 void sample(BV& bv_out, const BV& bv_in, size_type sample_count);
72
73
74private:
75 typedef
76 typename BV::blocks_manager_type blocks_manager_type;
77
78private:
79 /// simple picking algorithm for small number of items
80 /// in [first,last] range
81 ///
82 void simple_pick(BV& bv_out,
83 const BV& bv_in,
84 size_type sample_count,
85 size_type first,
86 size_type last);
87
88 void get_subset(BV& bv_out,
89 const BV& bv_in,
90 size_type in_count,
91 size_type sample_count);
92
93 void get_block_subset(bm::word_t* blk_out,
94 const bm::word_t* blk_src,
95 unsigned count);
96 static
97 unsigned process_word(bm::word_t* blk_out,
98 const bm::word_t* blk_src,
99 unsigned nword,
100 unsigned take_count) BMNOEXCEPT;
101
102 static
103 void get_random_array(bm::word_t* blk_out,
105 unsigned bit_list_size,
106 unsigned count);
107 static
108 unsigned compute_take_count(unsigned bc,
109 size_type in_count, size_type sample_count) BMNOEXCEPT;
110
111
112private:
114 random_subset& operator=(const random_subset&);
115private:
116 typename bvector_type::rs_index_type rsi_; ///< RS index (block counts)
117 bvector_type bv_nb_; ///< blocks vector
119 bm::word_t* sub_block_;
120};
121
122
123///////////////////////////////////////////////////////////////////
124
125
126
127template<class BV>
129{
130 sub_block_ = new bm::word_t[bm::set_block_size];
131}
132
133template<class BV>
135{
136 delete [] sub_block_;
137}
138
139template<class BV>
141 const BV& bv_in,
142 size_type sample_count)
143{
144 if (sample_count == 0)
145 {
146 bv_out.clear(true);
147 return;
148 }
149
150 rsi_.init();
151 bv_in.build_rs_index(&rsi_, &bv_nb_);
152
153 size_type in_count = rsi_.count();
154 if (in_count <= sample_count)
155 {
156 bv_out = bv_in;
157 return;
158 }
159
160 float pick_ratio = float(sample_count) / float(in_count);
161 if (pick_ratio < 0.054f)
162 {
163 size_type first, last;
164 bool b = bv_in.find_range(first, last);
165 if (!b)
166 return;
167 simple_pick(bv_out, bv_in, sample_count, first, last);
168 return;
169 }
170
171 if (sample_count > in_count/2)
172 {
173 // build the complement vector and subtract it
174 BV tmp_bv;
175 size_type delta_count = in_count - sample_count;
176
177 get_subset(tmp_bv, bv_in, in_count, delta_count);
178 bv_out.bit_sub(bv_in, tmp_bv);
179 return;
180 }
181 get_subset(bv_out, bv_in, in_count, sample_count);
182}
183
184template<class BV>
185void random_subset<BV>::simple_pick(BV& bv_out,
186 const BV& bv_in,
187 size_type sample_count,
188 size_type first,
189 size_type last)
190{
191 bv_out.clear(true);
192
193 std::random_device rd;
194 #ifdef BM64ADDR
195 std::mt19937_64 mt_rand(rd());
196 #else
197 std::mt19937 mt_rand(rd());
198 #endif
199 std::uniform_int_distribution<size_type> dist(first, last);
200
201 while (sample_count)
202 {
203 size_type fidx;
204 size_type ridx = dist(mt_rand); // generate random position
205
206 BM_ASSERT(ridx >= first && ridx <= last);
207
208 bool b = bv_in.find(ridx, fidx); // find next valid bit after random
209 BM_ASSERT(b);
210 if (b)
211 {
212 // set true if was false
213 bool is_set = bv_out.set_bit_conditional(fidx, true, false);
214 sample_count -= is_set;
215 while (!is_set) // find next valid (and not set) bit
216 {
217 ++fidx;
218 // searching always left to right may create a bias...
219 b = bv_in.find(fidx, fidx);
220 if (!b)
221 break;
222 if (fidx > last)
223 break;
224 is_set = bv_out.set_bit_conditional(fidx, true, false);
225 sample_count -= is_set;
226 } // while
227 }
228 } // while
229}
230
231
232template<class BV>
233void random_subset<BV>::get_subset(BV& bv_out,
234 const BV& bv_in,
235 size_type in_count,
236 size_type sample_count)
237{
238 bv_out.clear(true);
239 bv_out.resize(bv_in.size());
240
241 const blocks_manager_type& bman_in = bv_in.get_blocks_manager();
242 blocks_manager_type& bman_out = bv_out.get_blocks_manager();
243
244 bm::word_t* tmp_block = bman_out.check_allocate_tempblock();
245
246 size_type first_nb, last_nb;
247 bool b = bv_nb_.find_range(first_nb, last_nb);
248 BM_ASSERT(b);
249 if (!b)
250 return;
251
252 std::random_device rd;
253 #ifdef BM64ADDR
254 std::mt19937_64 mt_rand(rd());
255 #else
256 std::mt19937 mt_rand(rd());
257 #endif
258 std::uniform_int_distribution<size_type> dist_nb(first_nb, last_nb);
259
260 size_type curr_sample_count = sample_count;
261 for (unsigned take_count = 0; curr_sample_count; curr_sample_count -= take_count)
262 {
263 // pick block at random
264 //
265 size_type nb;
266 size_type ridx = dist_nb(mt_rand); // generate random block idx
267 BM_ASSERT(ridx >= first_nb && ridx <= last_nb);
268
269 b = bv_nb_.find(ridx, nb); // find next valid nb
270 if (!b)
271 {
272 b = bv_nb_.find(first_nb, nb);
273 if (!b)
274 {
275 //b = bv_nb_.find(first_nb, nb);
276 BM_ASSERT(!bv_nb_.any());
277 //BM_ASSERT(b);
278 // TODO: seems to be some rounding error, needs to be addressed
279 return; // cannot find block
280 }
281 }
282 bv_nb_.clear_bit_no_check(nb); // remove from blocks list
283
284 // calculate proportinal sample count
285 //
286 unsigned bc = rsi_.count(nb);
287 BM_ASSERT(bc && (bc <= 65536));
288 take_count = compute_take_count(bc, in_count, sample_count);
289 if (take_count > curr_sample_count)
290 take_count = unsigned(curr_sample_count);
291 BM_ASSERT(take_count);
292 if (!take_count)
293 continue;
294 {
295 unsigned i0, j0;
296 bm::get_block_coord(nb, i0, j0);
297 const bm::word_t* blk_src = bman_in.get_block(i0, j0);
298 BM_ASSERT(blk_src);
299
300 // allocate target block
301 bm::word_t* blk_out = bman_out.get_block_ptr(i0, j0);
302 BM_ASSERT(!blk_out);
303 if (blk_out)
304 {
305 blk_out = bman_out.deoptimize_block(nb);
306 }
307 else
308 {
309 blk_out = bman_out.get_allocator().alloc_bit_block();
310 bman_out.set_block(nb, blk_out);
311 }
312 if (take_count == bc) // whole block take (strange)
313 {
314 // copy the whole src block
315 if (BM_IS_GAP(blk_src))
316 bm::gap_convert_to_bitset(blk_out, BMGAP_PTR(blk_src));
317 else
318 bm::bit_block_copy(blk_out, blk_src);
319 continue;
320 }
321 bm::bit_block_set(blk_out, 0);
322 if (bc < 4096) // use array shuffle
323 {
324 unsigned arr_len;
325 // convert source block to bit-block
326 if (BM_IS_GAP(blk_src))
327 {
328 arr_len = bm::gap_convert_to_arr(bit_list_,
329 BMGAP_PTR(blk_src),
331 }
332 else // bit-block
333 {
334 arr_len = bm::bit_block_convert_to_arr(bit_list_, blk_src, 0);
335 }
336 BM_ASSERT(arr_len);
337 get_random_array(blk_out, bit_list_, arr_len, take_count);
338 }
339 else // dense block
340 {
341 // convert source block to bit-block
342 if (BM_IS_GAP(blk_src))
343 {
344 bm::gap_convert_to_bitset(tmp_block, BMGAP_PTR(blk_src));
345 blk_src = tmp_block;
346 }
347 // pick random bits source block to target
348 get_block_subset(blk_out, blk_src, take_count);
349 }
350 }
351 } // for
352}
353
354template<class BV>
355unsigned random_subset<BV>::compute_take_count(
356 unsigned bc,
357 size_type in_count,
358 size_type sample_count) BMNOEXCEPT
359{
360 BM_ASSERT(sample_count);
361 float block_percent = float(bc) / float(in_count);
362 float bits_to_take = float(sample_count) * block_percent;
363 bits_to_take += 0.99f;
364 unsigned to_take = unsigned(bits_to_take);
365 if (to_take > bc)
366 to_take = bc;
367 if (!to_take)
368 to_take = unsigned(sample_count);
369
370 return to_take;
371}
372
373
374template<class BV>
375void random_subset<BV>::get_block_subset(bm::word_t* blk_out,
376 const bm::word_t* blk_src,
377 unsigned take_count)
378{
379 for (unsigned rounds = 0; take_count && rounds < 10; ++rounds)
380 {
381 // pick random scan start and scan direction
382 //
383 unsigned i = unsigned(rand()) % bm::set_block_size;
384 unsigned new_count;
385
386 for (; i < bm::set_block_size && take_count; ++i)
387 {
388 if (blk_src[i] && (blk_out[i] != blk_src[i]))
389 {
390 new_count = process_word(blk_out, blk_src, i, take_count);
391 take_count -= new_count;
392 }
393 } // for i
394
395 } // for
396 // if masked scan did not produce enough results..
397 //
398 if (take_count)
399 {
400 // Find all vacant bits: do logical (src SUB out)
401 for (unsigned i = 0; i < bm::set_block_size; ++i)
402 {
403 sub_block_[i] = blk_src[i] & ~~blk_out[i];
404 }
405 // now transform vacant bits to array, then pick random elements
406 //
407 unsigned arr_len =
408 bm::bit_block_convert_to_arr(bit_list_, sub_block_, 0);
409 // bm::gap_max_bits,
410 // bm::gap_max_bits,
411 // 0);
412 BM_ASSERT(arr_len);
413 get_random_array(blk_out, bit_list_, arr_len, take_count);
414 }
415}
416
417template<class BV>
418unsigned random_subset<BV>::process_word(bm::word_t* blk_out,
419 const bm::word_t* blk_src,
420 unsigned nword,
421 unsigned take_count) BMNOEXCEPT
422{
423 unsigned new_bits, mask;
424 do
425 {
426 mask = unsigned(rand());
427 mask ^= mask << 16;
428 } while (mask == 0);
429
430 std::random_device rd;
431 #ifdef BM64ADDR
432 std::mt19937_64 mt_rand(rd());
433 #else
434 std::mt19937 mt_rand(rd());
435 #endif
436 unsigned src_rand = blk_src[nword] & mask;
437 new_bits = src_rand & ~~blk_out[nword];
438 if (new_bits)
439 {
440 unsigned new_count = bm::word_bitcount(new_bits);
441
442 // check if we accidentally picked more bits than needed
443 if (new_count > take_count)
444 {
445 BM_ASSERT(take_count);
446
447 unsigned char blist[64];
448 unsigned arr_size = bm::bitscan(new_bits, blist);
449 BM_ASSERT(arr_size == new_count);
450 std::shuffle(blist, blist + arr_size, mt_rand);
451 unsigned value = 0;
452 for (unsigned j = 0; j < take_count; ++j)
453 {
454 value |= (1u << blist[j]);
455 }
456 new_bits = value;
457 new_count = take_count;
458
459 BM_ASSERT(bm::word_bitcount(new_bits) == take_count);
460 BM_ASSERT((new_bits & ~blk_src[nword]) == 0);
461 }
462
463 blk_out[nword] |= new_bits;
464 return new_count;
465 }
466 return 0;
467}
468
469
470template<class BV>
471void random_subset<BV>::get_random_array(bm::word_t* blk_out,
473 unsigned bit_list_size,
474 unsigned count)
475{
476 std::random_device rd;
477 #ifdef BM64ADDR
478 std::mt19937_64 mt_rand(rd());
479 #else
480 std::mt19937 mt_rand(rd());
481 #endif
482 std::shuffle(bit_list, bit_list + bit_list_size, mt_rand);
483 for (unsigned i = 0; i < count; ++i)
484 {
485 bm::set_bit(blk_out, bit_list[i]);
486 }
487}
488
489} // namespace
490
491
492#endif
Definitions(internal)
#define BMNOEXCEPT
Definition: bmdef.h:82
#define BMGAP_PTR(ptr)
Definition: bmdef.h:189
#define BM_IS_GAP(ptr)
Definition: bmdef.h:191
#define BM_ASSERT
Definition: bmdef.h:139
Bit manipulation primitives (internal)
rs_index< allocator_type > rs_index_type
Definition: bm.h:816
void sample(BV &bv_out, const BV &bv_in, size_type sample_count)
Get random subset of input vector.
Definition: bmrandom.h:140
BV::size_type size_type
Definition: bmrandom.h:60
BMFORCEINLINE bm::id_t word_bitcount(bm::id_t w) BMNOEXCEPT
Definition: bmutil.h:573
void bit_block_copy(bm::word_t *BMRESTRICT dst, const bm::word_t *BMRESTRICT src) BMNOEXCEPT
Bitblock copy operation.
Definition: bmfunc.h:6743
unsigned short bitscan(V w, B *bits) BMNOEXCEPT
Templated Bitscan with dynamic dispatch for best type.
Definition: bmfunc.h:761
BMFORCEINLINE void set_bit(unsigned *dest, unsigned bitpos) BMNOEXCEPT
Set 1 bit in a block.
Definition: bmfunc.h:3703
unsigned bit_block_convert_to_arr(T *BMRESTRICT dest, const unsigned *BMRESTRICT src, bool inverted) BMNOEXCEPT
Convert bit block into an array of ints corresponding to 1 bits.
Definition: bmfunc.h:8749
void bit_block_set(bm::word_t *BMRESTRICT dst, bm::word_t value) BMNOEXCEPT
Bitblock memset operation.
Definition: bmfunc.h:4422
unsigned bit_list(T w, B *bits) BMNOEXCEPT
Unpacks word into list of ON bit indexes.
Definition: bmfunc.h:587
D gap_convert_to_arr(D *BMRESTRICT dest, const T *BMRESTRICT buf, unsigned dest_len, bool invert=false) BMNOEXCEPT
Convert gap block into array of ints corresponding to 1 bits.
Definition: bmfunc.h:4908
void gap_convert_to_bitset(unsigned *BMRESTRICT dest, const T *BMRESTRICT buf, unsigned len=0) BMNOEXCEPT
GAP block to bitblock conversion.
Definition: bmfunc.h:4441
Definition: bm.h:78
unsigned int word_t
Definition: bmconst.h:39
BMFORCEINLINE void get_block_coord(BI_TYPE nb, unsigned &i, unsigned &j) BMNOEXCEPT
Recalc linear bvector block index into 2D matrix coordinates.
Definition: bmfunc.h:172
const unsigned set_block_size
Definition: bmconst.h:55
unsigned short gap_word_t
Definition: bmconst.h:78
const unsigned gap_max_bits
Definition: bmconst.h:81