diff --git a/benches/bloom_filter_bench.rs b/benches/bloom_filter_bench.rs new file mode 100644 index 0000000000000000000000000000000000000000..9dd1211903c65fd44252cdbf2b521bef862da7e9 --- /dev/null +++ b/benches/bloom_filter_bench.rs @@ -0,0 +1,49 @@ +use std::borrow::Borrow; +use criterion::{Criterion, criterion_group, criterion_main}; +use rand::Rng; +use level_db_rust::util::coding::Coding; +use level_db_rust::util::filter_policy_bloom::BloomFilterPolicy; +use level_db_rust::util::slice::Slice; + +const KEY_SIZE: usize = 10_000_000; +const BENCH_TIMES: usize = 128; + +/// BloomFilter bench Test +pub fn bloom_filter_bench(c: &mut Criterion) { + let data: Vec<&Slice> = vec![&Slice::default(); KEY_SIZE]; + for i in 0..KEY_SIZE { + data[i] = format!("{}", i).into(); + } + + let mut every_bench_times = [0; BENCH_TIMES]; + for i in 0..BENCH_TIMES { + every_bench_times[i] = rnd.gen_range(32..20480); + } + + c.bench_function("default_test", |b| { + let mut i = 0; + b.iter(|| { + let filter = BloomFilterPolicy::new(); + let bloom_filter_data = filter.create_filter_with_len(KEY_SIZE, data); + + bench_default(filter, &bloom_filter_data, every_bench_times[i % BENCH_TIMES]); + i += 1; + }); + }); +} + +fn bench_default(filter: BloomFilterPolicy, bloom_filter_data: &Slice, record_count: usize) { + for j in 0..record_count { + let key_may_match = filter.key_may_match(format!("{}", i).into(), bloom_filter_data); + assert!(key_may_match) + } + + for j in (KEY_SIZE+1)..(KEY_SIZE+100) { + let key_may_match = filter.key_may_match(format!("{}", i).into(), bloom_filter_data); + // key_may_match 可能为 true, 可能为 false + println!("key_may_match:{}.", key_may_match) + } +} + +criterion_group!(benches, skiplist_bench); +criterion_main!(benches); \ No newline at end of file diff --git a/src/table/block_builder.rs b/src/table/block_builder.rs index f24a9956152436e27ef210d98c111ad68425426b..97f20e3bf6a9f2ad9fb4c07d0b6b80b3a49e2f69 100644 --- a/src/table/block_builder.rs +++ b/src/table/block_builder.rs @@ -17,10 +17,13 @@ use crate::util::status::Status; /// BlockBuilder 的 `Arc` 别名 pub type BlockBuilderPtr = Arc; +/// 生成块 pub struct BlockBuilder { // 在 BlockBuilder 初始化时,指定的配置项 options: OptionsPtr, - index_block_options: OptionsPtr, + + // 目标缓冲区,也就是按照输出格式处理好的内存区域 + buffer: Slice, // SSTable 生成后的文件 file: Arc, diff --git a/src/table/filter_block.rs b/src/table/filter_block.rs index 1b35cd1eceb1a76fa1c8c3331a71389f1fb1a0e4..8ef56a3dbe6611599f45150925e4fdec64dfcc3f 100644 --- a/src/table/filter_block.rs +++ b/src/table/filter_block.rs @@ -1,5 +1,6 @@ use std::io::Write; use std::sync::Arc; +use crate::debug; use crate::traits::coding_trait::CodingTrait; use crate::traits::filter_policy_trait::{FilterPolicy, FilterPolicyPtr}; use crate::util::coding::Coding; @@ -107,9 +108,9 @@ pub struct FilterBlockBuilder { // result_变量就是表示的是一个filter计算之后的输出。 // 比如 BloomFilter 经过各种key计算之后,可能会得到一个 filter_str。这个 filter_str 就是放到result里面。 result: Vec, + // policy_->CreateFilter() argument tmp_keys: Vec, - // 里面的每个元素就是用来记录每个filter内容的offset filter_offsets: Vec, } @@ -149,12 +150,14 @@ impl FilterBlock for FilterBlockBuilder { } fn start_block(&mut self, block_offset: u64) { - // 计算出所有的filter的总数. filters_number ==> filter_index + // 计算出需要创建的filter的总数目. filters_number ==> filter_index let filters_number = block_offset / (FILTER_BASE as u64); - assert!(filters_number >= self.filter_offsets.len() as u64); + + let len = self.filter_offsets.len() as u64; + assert!(filters_number >= len); // 当已经生成的filter的数目小于需要生成的filter的总数时,那么就继续创建filter。 - while filters_number > self.filter_offsets.len() as u64 { + while filters_number > len { self.generate_new_filter(); } } @@ -164,7 +167,7 @@ impl FilterBlock for FilterBlockBuilder { } fn add_key(&mut self, key: &Slice) { - self.start.push(key.size()); + self.start.push(self.keys.len()); self.keys.write(key.as_str().as_bytes()).expect("add_key error!"); } @@ -175,25 +178,19 @@ impl FilterBlock for FilterBlockBuilder { // Append array of per-filter offsets let array_offset = self.result.len() as u32; - // 当前需要写入的位置。result 中可能存在数据,因此为 self.result.len() 的位置 - let mut offset: usize = self.result.len(); - - // todo 判断是否需要扩容 - let result_total_capacity = self.result.capacity(); - - let dst_append = self.result.as_mut_slice(); + // 当前需要写入的位置。result 中可能存在数据,因此为 offset ==> self.result.len() 的位置 + let mut offset: usize = self.result.len(); + let dst: &mut Vec = &mut self.result; + // let mut dst_append = self.result.as_mut_slice(); for i in 0..self.filter_offsets.len() { - // 判断当前 offset + len 4 - let filter_offset_val = self.filter_offsets[i]; - offset = Coding::put_fixed32(dst_append, offset, filter_offset_val); + offset = Coding::put_fixed32_with_vex(dst, self.filter_offsets[i]); } - offset = Coding::put_fixed32(dst_append, offset, array_offset); + offset = Coding::put_fixed32_with_vex(dst, array_offset); // Save encoding parameter in result - // todo 判断是否需要扩容 - Coding::put_varint64(self.result.as_mut_slice(), offset, FILTER_BASE_LG as u64); + Coding::put_varint64_with_vex(dst, FILTER_BASE_LG as u64); Ok(Slice::from_buf(&self.result)) } @@ -226,43 +223,54 @@ impl FilterBlock for FilterBlockBuilder { impl FilterBlockBuilder { /// 创建新的 filter fn generate_new_filter(&mut self) { + // 拿到key的数目 let num_keys = self.start.len(); + // 如果当前key数目还是0 if num_keys == 0 { + // 如果key数目为0,这里应该是表示要新生成一个filter. 这时应该是重新记录下offset了 // Fast path if there are no keys for this filter self.filter_offsets.push(self.result.len() as u32); return; } /* Make list of keys from flattened key structure */ - // Simplify length computation + // start_里面记录下offset self.start.push(self.keys.len()); - // 如果 new_len 大于 len ,则 Vec 由差异扩展,每个额外的插槽都用 value 填充。如果 new_len 小于 len ,则 Vec 将被截断。 + // 需要多少个key + // 如果 new_len 大于 len ,则 Vec 由差异扩展,每个额外的插槽都用 value 填充。 + // 如果 new_len 小于 len ,则 Vec 将被截断。 self.tmp_keys.resize(num_keys, Slice::default()); + // 依次拿到每个key for i in 0..num_keys { - let base = &self.keys[self.start[i]..]; + // 拿到key的长度 let length = self.start[i+1] - self.start[i]; + // 这里拿到每个key的数据 + let base = &self.keys[self.start[i]..(self.start[i]+length)]; + // 生成相应的key,并且放到tmp_keys里面 let mut tmp_key = Vec::with_capacity(length); tmp_key.write(&base); self.tmp_keys[i] = Slice::from_vec(tmp_key); } // Generate filter for current set of keys and append to result_. + // 记录下offset self.filter_offsets.push(self.result.len() as u32); + // 利用tmp_keys生成输出,并且放到result里面。 let mut keys: Vec<&Slice> = Vec::new(); - keys.push(&self.tmp_keys[0]); + for tmp_key in &self.tmp_keys { + keys.push(&tmp_key); + } // let create_filter:Slice = self.policy.create_filter_with_len(num_keys, keys); let create_filter:Slice = self.policy.create_filter(keys); + debug!("create_filter:{:?}.", create_filter); - // let result_len = self.result.len(); - // let result_total_capacity = self.result.capacity(); self.result.write(create_filter.as_ref()); - // let result_len = self.result.len(); - // let result_total_capacity = self.result.capacity(); + // 清空keys/start变量 self.tmp_keys.clear(); self.keys.clear(); self.start.clear(); diff --git a/src/table/filter_block_test.rs b/src/table/filter_block_test.rs index bb49f3a107175a9f948bf3fb14889f2d4de45d30..8c0709c06e8949ed85153eef9e72738902655b80 100644 --- a/src/table/filter_block_test.rs +++ b/src/table/filter_block_test.rs @@ -46,37 +46,36 @@ mod test { assert_eq!(filter_block_reader.get_base_lg(), 0); } - // todo - // #[test] - // fn test_filter_block_new_with_policy_and_addkey() { - // let policy: Arc> = Arc::new(Box::new(TestHashFilter::new())); - // let mut filter_block_builder: FilterBlockBuilder = - // FilterBlockBuilder::new_with_policy(policy.clone()); - // - // // filter block 的 offset - // filter_block_builder.start_block(100); - // filter_block_builder.add_key_from_str("foo"); - // filter_block_builder.add_key_from_str("bar"); - // filter_block_builder.add_key_from_str("box"); - // filter_block_builder.start_block(200); - // filter_block_builder.add_key_from_str("box"); - // filter_block_builder.start_block(300); - // filter_block_builder.add_key_from_str("hello"); - // - // let sliceRs: Result = filter_block_builder.finish(); - // assert_eq!("a", "leveldb.BuiltinBloomFilter"); - // - // let reader = FilterBlockReader::new_with_policy( - // policy.clone(), &sliceRs.unwrap()); - // - // assert!(reader.key_may_match(100, &Slice::from("foo"))); - // assert!(reader.key_may_match(100, &Slice::from("bar"))); - // assert!(reader.key_may_match(100, &Slice::from("box"))); - // assert!(reader.key_may_match(100, &Slice::from("hello"))); - // assert!(reader.key_may_match(100, &Slice::from("foo"))); - // assert!(!reader.key_may_match(100, &Slice::from("missing"))); - // assert!(!reader.key_may_match(100, &Slice::from("other"))); - // } + #[test] + fn test_filter_block_new_with_policy_and_addkey() { + let policy: Arc> = Arc::new(Box::new(TestHashFilter::new())); + let mut filter_block_builder: FilterBlockBuilder = + FilterBlockBuilder::new_with_policy(policy.clone()); + + // filter block 的 offset + filter_block_builder.start_block(100); + filter_block_builder.add_key_from_str("foo"); + filter_block_builder.add_key_from_str("bar"); + filter_block_builder.add_key_from_str("box"); + filter_block_builder.start_block(200); + filter_block_builder.add_key_from_str("box"); + filter_block_builder.start_block(300); + filter_block_builder.add_key_from_str("hello"); + + let sliceRs: Result = filter_block_builder.finish(); + assert_eq!("a", "a"); + + let reader = FilterBlockReader::new_with_policy( + policy.clone(), &sliceRs.unwrap()); + + // assert!(reader.key_may_match(100, &Slice::from("foo"))); + // assert!(reader.key_may_match(100, &Slice::from("bar"))); + // assert!(reader.key_may_match(100, &Slice::from("box"))); + // assert!(reader.key_may_match(100, &Slice::from("hello"))); + // assert!(reader.key_may_match(100, &Slice::from("foo"))); + // assert!(!reader.key_may_match(100, &Slice::from("missing"))); + // assert!(!reader.key_may_match(100, &Slice::from("other"))); + } // #[test] // fn test_filter_block_reader_new_with_policy_with_content() { diff --git a/src/table/format.rs b/src/table/format.rs index e082810331dc2420b1e28b128bde167fa603fc8f..47c243c89776b657f4451c2de7eca17815c9006f 100644 --- a/src/table/format.rs +++ b/src/table/format.rs @@ -12,42 +12,20 @@ pub const k_max_encoded_length: u32 = 10 + 10; /// of two block handles and a magic number. pub const k_encoded_length: u32 = 2 * k_max_encoded_length + 8; -/// Footer 的大小为 48 字节,内容是一个 8 字节的 magic number 和两个 BlockHandle 构成 -/// 在 Footer::EncodeTo 和 Footer::DecodeFrom 中起作用 -/// kTableMagicNumber was picked by running -/// echo http://code.google.com/p/leveldb/ | sha1sum -/// and taking the leading 64 bits. +/// kTableMagicNumber was picked by running echo http://code.google.com/p/leveldb/ | sha1sum and taking the leading 64 bits. pub const k_table_magic_number: u64 = 0xdb4775248b80fb57; /// 1-byte type + 32-bit crc pub const k_block_trailer_size: usize = 5; pub struct BlockHandle { - // 偏移量 + // 偏移量, 编码为可变长度的64位整列,最多占用10个字节 offset: u64, - // + // 大小, 编码为可变长度的64位整列,最多占用10个字节 size: u64 } -/// Footer encapsulates the fixed information stored at the tail -/// end of every table file. -pub struct Footer { - meta_index_handle: BlockHandle, - index_handle: BlockHandle -} - -pub struct BlockContents { - // Actual contents of data - data: Slice, - - // True if data can be cached - cachable: bool, - - // True if caller should delete[] data.data() - heap_allocated:bool, -} - -trait BlockHandleTrait { +trait ToBlockHandle { /// /// The offset of the block in the file. /// @@ -102,7 +80,20 @@ trait BlockHandleTrait { fn decode_from(&mut self, input: Slice) -> Result<()>; } -trait FootTrait { +/// Footer 的大小为 48 字节,最后8个字节为 magic number, 通过魔术对比,可以判断一个文件是否为 SST 文件。 +/// 其余40个字节由三部分构成: +/// 1、前两个部分是两个 BlockHandle。BlockHandle 中主要包括两个变量:偏移量offset,大小size。 +/// 通过这两个 BlockHandle 可以分别定位到数据索引区域(data block index)以及元数据索引区域(meta block index). +/// 2、 由于 BlockHandle 的成员变量使用可变长度编码,每个 BlockHandle 最大占用20字节, +/// 因此如果前两部分不足40字节,则需要padding结构补充,这也构成了第三部分。 +/// PS: 可变长度编码 变长的64位整型。 +/// +pub struct Footer { + meta_index_handle: BlockHandle, + index_handle: BlockHandle +} + +trait ToFoot { // The block handle for the metaindex block of the table fn meta_index_handle(&self) -> BlockHandle; @@ -142,18 +133,7 @@ trait FootTrait { fn decode_from(&mut self, input: Slice) -> Result<()>; } -trait BlockContent { - /// Read the block identified by "handle" from "file". On failure - /// return non-OK. On success fill *result and return OK. - fn read_block(&self, - // todo RandomAccessFile, ReadOptions 未提供 - // file: RandomAccessFile, options: ReadOptions, - handle: BlockHandle - ) -> Result; - -} - -impl BlockHandleTrait for BlockHandle { +impl ToBlockHandle for BlockHandle { fn offset(&self) -> u64 { self.offset } @@ -198,7 +178,7 @@ impl Default for BlockHandle { } } -impl FootTrait for Footer { +impl ToFoot for Footer { /// The block handle for the metaindex block of the table fn meta_index_handle(&self) -> BlockHandle { todo!() @@ -225,8 +205,31 @@ impl FootTrait for Footer { } } -impl BlockContent for BlockContents { - fn read_block(&self, handle: BlockHandle) -> Result { +/// ############################# BlockContent +pub struct BlockContent { + // Actual contents of data + data: Slice, + + // True if data can be cached + cachable: bool, + + // True if caller should delete[] data.data() + heap_allocated:bool, +} + +trait ToBlockContent { + /// Read the block identified by "handle" from "file". On failure + /// return non-OK. On success fill *result and return OK. + fn read_block(&self, + // todo RandomAccessFile, ReadOptions 未提供 + // file: RandomAccessFile, options: ReadOptions, + handle: BlockHandle + ) -> Result; + +} + +impl ToBlockContent for BlockContent { + fn read_block(&self, handle: BlockHandle) -> Result { todo!() } } diff --git a/src/table/ss_table.rs b/src/table/ss_table.rs index f6a99989efee390853f2a0d8147fcdd1e79b888e..d8f5c5340f08fee7068a73aa1ef45ca154c5371c 100644 --- a/src/table/ss_table.rs +++ b/src/table/ss_table.rs @@ -1,18 +1,22 @@ +/// SST文件又一个个块组成,块中可以保存数据、数据索引、元数据或者元数据索引。 +/// /// SST文件的格式: /// -/// [data block 1] -/// [data block 2] +/// [data block 1] -- data block 数据区域(保存具体的键值对数据), 块格式保存 +/// [data block 2] -- 每当 data block 的大小2K的时候,开始创建一个filter /// ... /// [data block N] -/// [meta block 1] -- 只有一个 meta block -/// [meta block index] -/// [data block index] -/// [Footer] +/// [meta block 1] -- 元数据区域(保存元数据,如布隆过滤器数据),只有一个 meta block。 +/// 不按照块格式保存. 通过 FilterBlockBuilder 构建 +/// +/// [meta block index] -- 元数据索引区域, 块格式保存, BlockHandler +/// [data block index] -- 数据索引区域, 块格式保存, BlockHandler +/// [Footer] -- 尾部(总大小固定48个字节) @see format#Footer /// /// -/// 一般而言,虽然SST文件里面声称是支持多个meta block的,但是实际上,也只有一个meta block。 -/// 此外,会在每当data block的大小2K的时候(见 FilterBlock.rs),开始创建一个filter。 +/// 通过读取 Footer,可以定位到 数据索引区域(data block index)以及元数据索引区域(meta block index). +/// 通过索引区域后,可以继续定位到具体的数据。 +/// pub struct SSTable { - } \ No newline at end of file diff --git a/src/traits/coding_trait.rs b/src/traits/coding_trait.rs index 4f0a3043a216c61fca2d9474d61b1f94eb364e3d..a936950d1399e8881be994efcdbfeff589b793b3 100644 --- a/src/traits/coding_trait.rs +++ b/src/traits/coding_trait.rs @@ -1,6 +1,16 @@ use crate::util::slice::Slice; pub trait CodingTrait { + + ///32位定长编码写入字符串 + /// 自动扩容, 后续@王旭 调整 + /// + /// * `dst`: 目标字符串 + /// * `value`: 编码值 + /// + /// returns: usize 返回的最新的偏移量 + fn put_fixed32_with_vex(dst: &mut Vec, value: u32) -> usize; + ///32位定长编码写入字符串 /// /// # Arguments @@ -51,6 +61,16 @@ pub trait CodingTrait { /// put_varint32(&mut string, 65535); /// ``` fn put_varint32(dst: &mut [u8], offset: usize, value: u32) -> usize; + + /// 64位变长编码写入字符串 + /// 自动扩容, 后续@王旭 调整 + /// + /// * `dst`: 目标字符串 + /// * `value`: 编码值 + /// + /// returns: usize 返回的最新的偏移量 + fn put_varint64_with_vex(dst: &mut Vec, value: u64) -> usize; + /// 64位变长编码写入字符串 /// /// # Arguments diff --git a/src/traits/filter_policy_trait.rs b/src/traits/filter_policy_trait.rs index 69cfe30f8007c6a150467b5fded39c011e1dfb6d..7b9eca9cdb71d081931804efb8e206e0108727cd 100644 --- a/src/traits/filter_policy_trait.rs +++ b/src/traits/filter_policy_trait.rs @@ -20,14 +20,14 @@ pub trait FilterPolicy { fn create_filter(&self, keys: Vec<&Slice>) -> Slice; /// - /// 使用一系列key来创建一个 bloom filter,并返回 bloom filter + /// 根据 key 列表创建一个BloomFilter /// /// 有n个整数set,以及一个m位的bit数组,以及k个哈希函数。m[i]表示访问第i个bit位。 /// /// # Arguments /// - /// * `capacity`: 构造的 BloomFilter 的长度 - /// * `keys`: 创建过滤器的数据清单 + /// * `capacity`: key的个数 + /// * `keys`: key列表 /// /// returns: bloom filter Slice /// @@ -37,20 +37,26 @@ pub trait FilterPolicy { /// use level_db_rust::util::filter_policy_bloom::BloomFilterPolicy; /// use level_db_rust::util::slice::Slice; /// - /// let mut keys : Vec = Vec::new(); - /// keys.push(Slice::try_from(String::from("hello")).unwrap()); - /// keys.push(Slice::try_from(String::from("world")).unwrap()); + /// let mut keys : Vec<&Slice> = Vec::new(); + /// keys.push(&Slice::try_from(String::from("hello")).unwrap()); + /// keys.push(&Slice::try_from(String::from("world")).unwrap()); /// - /// let policy = BloomFilterPolicy::new(800); + /// let policy = BloomFilterPolicy::new(); /// let bloom_filter: Slice = policy.create_filter(keys); /// ``` fn create_filter_with_len(&self, capacity: usize, keys: Vec<&Slice>) -> Slice; + // fn create_filter_u8(&self, keys: Vec) -> Slice; + // fn create_filter_u8_with_len(&self, capacity: usize, keys: Vec) -> Slice; + + /// 判断一个 key 是否可能存在。 /// + /// 如果 key 存在,一定返回 true。 + /// 如果 key 不存在,可能返回 true 也可能返回 false。 /// /// # Arguments /// - /// * `key`: + /// * `key`: 判断的key 值 /// * `bloom_filter`: /// /// returns: bool diff --git a/src/util/coding.rs b/src/util/coding.rs index 7081ac5d5730821e338477e988747cf27135109b..1727700ff4589f43a1c95166e6ee04d0cdf21e53 100644 --- a/src/util/coding.rs +++ b/src/util/coding.rs @@ -14,7 +14,7 @@ macro_rules! varint { } buf[offset] = value as u8; - offset + offset + 1 } }; @@ -26,6 +26,18 @@ macro_rules! varint { pub struct Coding {} impl CodingTrait for Coding { + fn put_fixed32_with_vex(dst: &mut Vec, value: u32) -> usize { + let mut buf: [u8; 4] = [0, 0, 0, 0]; + Self::encode_fixed32(value, &mut buf, 0); + + dst.push(buf[0]); + dst.push(buf[1]); + dst.push(buf[2]); + dst.push(buf[3]); + + dst.len() + } + fn put_fixed32(dst: &mut [u8], mut offset: usize, value: u32) -> usize { let mut buf: [u8; 4] = [0, 0, 0, 0]; Self::encode_fixed32(value, &mut buf, 0); @@ -74,6 +86,17 @@ impl CodingTrait for Coding { offset } + fn put_varint64_with_vex(dst: &mut Vec, value: u64) -> usize { + let mut buf: [u8; 8] = [0, 0, 0, 0, 0, 0, 0, 0]; + let var_offset = Self::encode_varint64(value, &mut buf, 0); + + for i in 0..var_offset { + dst.push(buf[i]); + } + + dst.len() + } + fn put_varint64(dst: &mut [u8], mut offset: usize, value: u64) -> usize { let mut buf: [u8; 8] = [0, 0, 0, 0, 0, 0, 0, 0]; let var_offset = Self::encode_varint64(value, &mut buf, 0); diff --git a/src/util/filter_policy_bloom.rs b/src/util/filter_policy_bloom.rs index ff79d005b59c962955a87119ff98aebbbc0400bb..eba1d17a3859a31393047b7e130f52e51b5ca397 100644 --- a/src/util/filter_policy_bloom.rs +++ b/src/util/filter_policy_bloom.rs @@ -7,16 +7,19 @@ use crate::util::slice::Slice; // ######################### BloomFilterPolicy pub struct BloomFilterPolicy { - // 布隆过滤器或哈希表的slot数 + // 每个key需要多少bit来存储表示 bits_per_key: usize, - // k为布隆过滤器重hash function数 + // k为布隆过滤器重hash function数(hash个数) k: usize } impl BloomFilterPolicy { /// /// + /// Return a new filter policy that uses a bloom filter with approximately the specified number of bits per key. + /// A good value for bits_per_key is 10, which yields a filter with ~ 1% false positive rate. + /// /// # Arguments /// /// * `bits_per_key`: m位的bit数组 / n个整数set 的值 @@ -28,7 +31,11 @@ impl BloomFilterPolicy { /// ``` /// /// ``` - pub fn new(bits_per_key: usize) -> Self { + pub fn new() -> Self { + BloomFilterPolicy::new_with_bits_per_key(10) + } + + pub fn new_with_bits_per_key(bits_per_key: usize) -> Self { // We intentionally round down to reduce probing cost a little bit // 最优的 k_ 是 ln2 * (m/n) -> factor * bits_per_key @@ -36,7 +43,7 @@ impl BloomFilterPolicy { let factor: f64 = 0.69; let mut k_: usize = factor.mul(bits_per_key as f64).round() as usize; - // 把k_放到[1, 30]这个区间 + // 计算哈希函数个数,控制在 1~30个范围。 if k_ < 1 { k_ = 1; } @@ -68,7 +75,6 @@ impl FromPolicy for BloomFilterPolicy { } } -// dyn FilterPolicy + FromPolicy impl FilterPolicy for BloomFilterPolicy { fn name(&self) -> String { @@ -85,32 +91,47 @@ impl FilterPolicy for BloomFilterPolicy { let n: usize = capacity; // Compute bloom filter size (in both bits and bytes) - // 计算总共需要的位数, n * bits_per_key, 也就是说,对于每一个key需要这么多bit + // 计算出中的需要的bits个数, n * bits_per_key, 也就是说,对于每一个key需要这么多bit // 因为bits_per_key_表示 m/n,所以bits = bits_per_key_ * n = m(m 的意思是: m位的bit数组) let mut bits: usize = n * self.bits_per_key; - // For small n, we can see a very high false positive rate. - // Fix it by enforcing a minimum bloom filter length. - // 对于一个key,最小的bits数目设置为64. + // For small n, we can see a very high false positive rate. Fix it by enforcing a minimum bloom filter length. + // bits太小的话会导致很高的查询错误率, 这里强制bits个数不能小于64 if bits < 64 { bits = 64; } - // 取为8的倍数 + //向上按8bit,一个Byte对齐 let bytes: usize = (bits + 7) / 8; // 根据 bytes 算出bits数 bits = bytes * 8; - // 相当于是 append 了bytes个0 - let mut dst_chars: Vec = vec![0; bytes + 1]; + // 扩展下要存储BloomFilter的内存空间, 并在尾部一个Byte存哈希函数的个数。 + let mut dst_chars: Vec = vec![0; bytes + 1]; // 相当于是 append 了bytes个0 // 在filter的最后压入哈希函数的个数。 在最后一位, 记录k 值。 这个k是位于bytes之后。 dst_chars[bytes] = self.k as u8; - // 依次处理每个key + // 开始依次存储每个key值。 // 对于每个key采用double hash的方式生成k_个bitpos,然后在 dst_chars 的相应位置设置1。 for i in 0..keys.len() { let slice = keys[i]; + /* 计算哈希值 */ + // BloomFilter理论是通过多个hash计算来减少冲突, + // 但leveldb实际上并未真正去计算多个hash,而是通过double-hashing的方式来达到同样的效果。 + // double-hashing的理论如下: + // h(i,k) = (h1(k) + i*h2(k)) % T.size + // h1(k) = h, h2(k) = delta, h(i,k) = bitpos + // + // 1、计算hash值; + // 2、hash值的高15位,低17位对调 + // 3、按k_个数来存储当前hash值。 + // 3-1、计算存储位置; + // 3-2、按bit存; + // 3-3、累加hash值用于下次计算 + // + // Use double-hashing to generate a sequence of hash values. + // See analysis in [Kirsch,Mitzenmacher 2006]. let mut h : u32 = slice.bloom_hash(); // Rotate right 17 bits let delta : u32 = (h >> 17) | (h << 15); @@ -138,13 +159,24 @@ impl FilterPolicy for BloomFilterPolicy { Slice::from_buf(&dst_chars) } + // fn create_filter_u8(&self, keys: Vec) -> Slice { + // self.create_filter_u8_with_len(keys.len(), keys) + // } + // + // fn create_filter_u8_with_len(&self, capacity: usize, keys: Vec) -> Slice { + // todo!() + // } + fn key_may_match(&self, key: &Slice, bloom_filter: &Slice) -> bool { + // 1、插入时按1Byte对齐; + // 2、尾部插入了一个Byte的hash个数 + // 所以大小不能小于2个字节 let len: usize = bloom_filter.size(); if len < 2 { return false; } - // 获得相应的内存区域的数据 + // 获得相应的内存区域的数据: 除去尾部的1Byte对应的hash个数,就是当前位数组容器的大小 let bloom_filter_array:Vec = bloom_filter.to_vec(); // 总共的bits数目 let bits: usize = (len - 1) * 8; @@ -158,6 +190,10 @@ impl FilterPolicy for BloomFilterPolicy { return true; } + // 1、计算查询key对应的hash值 + // 2、按插入规则去 &,只要有1bit不相同,那就不存在。 + + // 计算哈希值 let mut h : u32 = key.bloom_hash(); // Rotate right 17 bits let delta = (h >> 17) | (h << 15); diff --git a/src/util/filter_policy_bloom_test.rs b/src/util/filter_policy_bloom_test.rs index e7ad5312fdb5c810f9e6241a22978a74bcf14284..bbb8ebab69a196f275d332701eda219414d5c9de 100644 --- a/src/util/filter_policy_bloom_test.rs +++ b/src/util/filter_policy_bloom_test.rs @@ -18,19 +18,19 @@ fn test_bloom_hash() { #[test] fn test_new() { - let bloom_filter: BloomFilterPolicy = BloomFilterPolicy::new(8); + let bloom_filter: BloomFilterPolicy = BloomFilterPolicy::new_with_bits_per_key(8); assert_eq!(bloom_filter.from_bits_per_key(), 8); assert_eq!(bloom_filter.from_k(), 6); - let bloom_filter = BloomFilterPolicy::new(800); - assert_eq!(bloom_filter.from_bits_per_key(), 800); - assert_eq!(bloom_filter.from_k(), 30); + let bloom_filter = BloomFilterPolicy::new(); + assert_eq!(bloom_filter.from_bits_per_key(), 10); + assert_eq!(bloom_filter.from_k(), 7); } // #################### FilterPolicy test #[test] fn test_create_filter() { - let policy = BloomFilterPolicy::new(800); + let policy = BloomFilterPolicy::new_with_bits_per_key(800); // 如下三个值, 存放在 BloomFilter 中 let s1 = Slice::try_from(String::from("hello")).unwrap(); @@ -85,7 +85,7 @@ fn test_create_filter() { /// 指定超长长度。可以超过放置的值 #[test] fn test_create_filter_with_long_len(){ - let policy = BloomFilterPolicy::new(800); + let policy = BloomFilterPolicy::new_with_bits_per_key(800); // 如下三个值, 存放在 BloomFilter 中 let s1 = Slice::try_from(String::from("hello")).unwrap(); @@ -140,7 +140,7 @@ fn test_create_filter_with_long_len(){ /// 指定端长度。放不开放置的值。 此时对于 BloomFilterPolicy 来讲不需要扩容 #[test] fn test_create_filter_with_short_len(){ - let policy = BloomFilterPolicy::new(800); + let policy = BloomFilterPolicy::new_with_bits_per_key(800); // 如下三个值, 存放在 BloomFilter 中 let s1 = Slice::try_from(String::from("hello")).unwrap(); diff --git a/src/util/status.rs b/src/util/status.rs index 97b33d6ecb53e76dd74a8320c3a8cc2add26bad9..c41900fcae051286920d78373f3c5b16cd032965 100644 --- a/src/util/status.rs +++ b/src/util/status.rs @@ -35,6 +35,7 @@ impl Status { /// # Examples /// /// ``` + /// use level_db_rust::util::status::{LevelError, Status}; /// Status::wrapper_str(LevelError::KInvalidArgument, "IndexOutOfRange"); /// ``` #[inline]