From 9f2ee41424fd89ae36386c4c34c2af62290a33ac Mon Sep 17 00:00:00 2001 From: fengyang Date: Fri, 16 Dec 2022 01:03:38 +0800 Subject: [PATCH 1/5] =?UTF-8?q?=E5=88=9D=E6=AD=A5hash=E5=AE=9E=E7=8E=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Cargo.toml | 5 +++ src/util/hash.rs | 71 ++++++++++++++++++++++++++++++++++++++++--- src/util/hash_test.rs | 11 +++++++ 3 files changed, 83 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 7b77544..6f688ef 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,3 +10,8 @@ path = "src/lib.rs" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] + + +[profile.dev] + +[profile.release] diff --git a/src/util/hash.rs b/src/util/hash.rs index d383f2b..5a311c0 100644 --- a/src/util/hash.rs +++ b/src/util/hash.rs @@ -1,5 +1,11 @@ +use std::ops::{BitXor, Mul}; +use crate::traits::coding_trait::CodingTrait; +use crate::util::coding::Coding; -pub trait AsHash { +/// 本方案中,采用的是MurMurHash的一种变体,是一种高效低碰撞的非加密型哈希函数。具有较高的平衡性与低碰撞率 +pub struct Hash {} + +impl<'a> Hash { /// /// /// # Arguments @@ -15,7 +21,64 @@ pub trait AsHash { /// ``` /// /// ``` - fn hash(data: String, n: usize, seed: u32) -> u32; -} + pub fn hash(data: String, data_size: usize, seed: u32) -> u32 { + let murmur_hash : u32 = 0xc6a4a793; + let r : u32 = 24; + + let limit: usize = data_size; + let mul_first = data_size.mul(murmur_hash as usize); // x = data_size * murmur_hash + let mut h: usize = seed.bitxor(mul_first as u32) as usize; // h = seed ^ x + + // 每次按照四字节长度读取字节流中的数据 w,并使用普通的哈希函数计算哈希值。 + let mut position: usize = 0; + while position + 4 <= limit { + //每次解码前4个字节,直到最后剩下小于4个字节 + // rust的 &[u8] 是胖指针,带长度信息的,会做range check,所以是安全的。 + let slice_str: &[u8] = data[position..(position + 4)].as_ref(); + let w: u32 = Coding::decode_fixed32(slice_str); + + // 向后移动4个字节 + position += 4; + + // /计算过程中使用了自然溢出特性 + // h += w + h = h.wrapping_add(w as usize); + // h *= m + h = h.wrapping_mul(murmur_hash as usize); + // ^ 按位异或 bitxor , >> 右移位 shr, << 左移位 shl + // h ^= (h >> 16) == h ^= h.shr(16); + h = h.bitxor(h.wrapping_shr(16)); + } + + // 四字节读取则为了加速,最终可能剩下 3/2/1 个多余的字节, + // 将剩下的字节转化到 h 里面 + let cu = limit - position; + match cu { + 3 => { + let us: &[u8] = data[position..].as_ref(); + h = h.wrapping_add((us[2] as u32).wrapping_shl(16) as usize); + h = h.wrapping_add((us[1] as u32).wrapping_shl(8) as usize); + h = h.wrapping_add(us[0].into()); + }, + 2 => { + let us: &[u8] = data[position..].as_ref(); + h = h.wrapping_add((us[1] as u32).wrapping_shl(8) as usize); + h = h.wrapping_add(us[0].into()); + }, + 1 => { + let us: &[u8] = data[position..].as_ref(); + h = h.wrapping_add(us[0].into()); + // h *= m + h = h.wrapping_mul(murmur_hash as usize); + // h ^= (h >> r) == h ^= h.shr(r); + h = h.bitxor(h.wrapping_shr(r)); + }, + _ => {} + }; + + println!("hash usize: {}", h); + println!("hash u32: {}", h as u32); -pub struct Hash {} \ No newline at end of file + h as u32 + } +} \ No newline at end of file diff --git a/src/util/hash_test.rs b/src/util/hash_test.rs index 0384579..e2564a0 100644 --- a/src/util/hash_test.rs +++ b/src/util/hash_test.rs @@ -1,6 +1,17 @@ +use crate::util::hash::{Hash}; #[test] fn test_hash() { + let val = "aabbccd"; + let hash_val = Hash::hash(String::from(val), val.len(), 3); + println!("hash:{}", hash_val); + let val = "aabbcc"; + let hash_val = Hash::hash(String::from(val), val.len(), 3); + println!("hash:{}", hash_val); + + let val = "aabbc"; + let hash_val = Hash::hash(String::from(val), val.len(), 3); + println!("hash:{}", hash_val); } \ No newline at end of file -- Gitee From bff8833b7dba6058bcbc11e2bb215a133fba91b4 Mon Sep 17 00:00:00 2001 From: fengyang Date: Fri, 16 Dec 2022 14:03:42 +0800 Subject: [PATCH 2/5] =?UTF-8?q?=E5=88=9D=E6=AD=A5hash=E5=AE=9E=E7=8E=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/util/hash.rs | 13 +++++++++++-- src/util/hash_test.rs | 9 +++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/src/util/hash.rs b/src/util/hash.rs index 5a311c0..17aea26 100644 --- a/src/util/hash.rs +++ b/src/util/hash.rs @@ -21,14 +21,23 @@ impl<'a> Hash { /// ``` /// /// ``` - pub fn hash(data: String, data_size: usize, seed: u32) -> u32 { + pub fn hash(mut data: String, data_size: usize, seed: u32) -> u32 { + let data_u8_vec; + unsafe { + data_u8_vec = data.as_mut_vec(); + } + + Hash::hash_char(data_u8_vec, data_size, seed) + } + + pub fn hash_char(data: &Vec, data_size: usize, seed: u32) -> u32 { let murmur_hash : u32 = 0xc6a4a793; let r : u32 = 24; let limit: usize = data_size; let mul_first = data_size.mul(murmur_hash as usize); // x = data_size * murmur_hash let mut h: usize = seed.bitxor(mul_first as u32) as usize; // h = seed ^ x - + // 每次按照四字节长度读取字节流中的数据 w,并使用普通的哈希函数计算哈希值。 let mut position: usize = 0; while position + 4 <= limit { diff --git a/src/util/hash_test.rs b/src/util/hash_test.rs index e2564a0..29ece53 100644 --- a/src/util/hash_test.rs +++ b/src/util/hash_test.rs @@ -13,5 +13,14 @@ fn test_hash() { let val = "aabbc"; let hash_val = Hash::hash(String::from(val), val.len(), 3); println!("hash:{}", hash_val); +} +#[test] +fn test_hash_code() { + let data4: Vec = vec![0xe1, 0x80, 0xb9, 0x32]; + + let hash_val = Hash::hash_char(&data4, data4.len(), 3); + println!("hash:{}", hash_val); + // 3978388282 + // assert_eq!(0xed21633a, hash_val); } \ No newline at end of file -- Gitee From 7f5cef6fa5f234da367525f381f7c78f0dfa03b6 Mon Sep 17 00:00:00 2001 From: fengyang Date: Fri, 16 Dec 2022 22:13:34 +0800 Subject: [PATCH 3/5] =?UTF-8?q?hash=E5=AE=9E=E7=8E=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/util/hash.rs | 69 ++++++++++++++++++++++++------------------- src/util/hash_test.rs | 37 +++++++++++++++++++---- 2 files changed, 71 insertions(+), 35 deletions(-) diff --git a/src/util/hash.rs b/src/util/hash.rs index 17aea26..a134fc7 100644 --- a/src/util/hash.rs +++ b/src/util/hash.rs @@ -19,7 +19,9 @@ impl<'a> Hash { /// # Examples /// /// ``` - /// + /// let data3: Vec = vec![0xe2, 0x99, 0xa5]; + /// let hash_val = Hash::hash_char(&data3, data3.len(), 0xbc9f1d34); /// + /// assert_eq!(0x323c078f, hash_val); /// ``` pub fn hash(mut data: String, data_size: usize, seed: u32) -> u32 { let data_u8_vec; @@ -36,7 +38,7 @@ impl<'a> Hash { let limit: usize = data_size; let mul_first = data_size.mul(murmur_hash as usize); // x = data_size * murmur_hash - let mut h: usize = seed.bitxor(mul_first as u32) as usize; // h = seed ^ x + let mut h: u32 = seed.bitxor(mul_first as u32); // h = seed ^ x // 每次按照四字节长度读取字节流中的数据 w,并使用普通的哈希函数计算哈希值。 let mut position: usize = 0; @@ -51,9 +53,9 @@ impl<'a> Hash { // /计算过程中使用了自然溢出特性 // h += w - h = h.wrapping_add(w as usize); + h = h.wrapping_add(w); // h *= m - h = h.wrapping_mul(murmur_hash as usize); + h = h.wrapping_mul(murmur_hash); // ^ 按位异或 bitxor , >> 右移位 shr, << 左移位 shl // h ^= (h >> 16) == h ^= h.shr(16); h = h.bitxor(h.wrapping_shr(16)); @@ -61,33 +63,40 @@ impl<'a> Hash { // 四字节读取则为了加速,最终可能剩下 3/2/1 个多余的字节, // 将剩下的字节转化到 h 里面 - let cu = limit - position; - match cu { - 3 => { - let us: &[u8] = data[position..].as_ref(); - h = h.wrapping_add((us[2] as u32).wrapping_shl(16) as usize); - h = h.wrapping_add((us[1] as u32).wrapping_shl(8) as usize); - h = h.wrapping_add(us[0].into()); - }, - 2 => { - let us: &[u8] = data[position..].as_ref(); - h = h.wrapping_add((us[1] as u32).wrapping_shl(8) as usize); - h = h.wrapping_add(us[0].into()); - }, - 1 => { - let us: &[u8] = data[position..].as_ref(); - h = h.wrapping_add(us[0].into()); - // h *= m - h = h.wrapping_mul(murmur_hash as usize); - // h ^= (h >> r) == h ^= h.shr(r); - h = h.bitxor(h.wrapping_shr(r)); - }, - _ => {} - }; + let mut mark: usize = 0; + while limit - position - mark != 0 { + match limit - position - mark { + 3 => { + let us: &[u8] = data[position..].as_ref(); + let as_us: u32 = us[2] as u32; + h = h.wrapping_add(as_us.wrapping_shl(16)); + + mark += 1; + }, + 2 => { + let us: &[u8] = data[position..].as_ref(); + let as_us: u32 = us[1] as u32; + h = h.wrapping_add( as_us.wrapping_shl(8)); - println!("hash usize: {}", h); - println!("hash u32: {}", h as u32); + mark += 1; + }, + 1 => { + let us: &[u8] = data[position..].as_ref(); + let as_us: u32 = us[0] as u32; + h = h.wrapping_add(as_us); + // h *= m + h = h.wrapping_mul(murmur_hash); + // h ^= (h >> r) ==> h ^= h.shr(r); + h = h.bitxor(h.wrapping_shr(r)); + + mark += 1; + }, + _ => { + println!("0") + } + }; + } - h as u32 + h } } \ No newline at end of file diff --git a/src/util/hash_test.rs b/src/util/hash_test.rs index 29ece53..770192b 100644 --- a/src/util/hash_test.rs +++ b/src/util/hash_test.rs @@ -17,10 +17,37 @@ fn test_hash() { #[test] fn test_hash_code() { + let data1: Vec = vec![0x62]; + let data2: Vec = vec![0xc3, 0x97]; + let data3: Vec = vec![0xe2, 0x99, 0xa5]; let data4: Vec = vec![0xe1, 0x80, 0xb9, 0x32]; + let data5: Vec = vec![0x01, 0xc0, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x00, 0x14, + 0x00, 0x00, 0x00, 0x18, + 0x28, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00]; - let hash_val = Hash::hash_char(&data4, data4.len(), 3); - println!("hash:{}", hash_val); - // 3978388282 - // assert_eq!(0xed21633a, hash_val); -} \ No newline at end of file + let hash_val = Hash::hash_char(&vec![0], 0, 0xbc9f1d34); + assert_eq!(0xbc9f1d34, hash_val); + + let hash_val = Hash::hash_char(&data1, data1.len(), 0xbc9f1d34); + assert_eq!(0xef1345c4, hash_val); + + let hash_val = Hash::hash_char(&data2, data2.len(), 0xbc9f1d34); + assert_eq!(0x5b663814, hash_val); + let hash_val = Hash::hash_char(&data3, data3.len(), 0xbc9f1d34); + assert_eq!(0x323c078f, hash_val); + + let hash_val = Hash::hash_char(&data4, data4.len(), 0xbc9f1d34); + assert_eq!(0xed21633a, hash_val); + + let hash_val = Hash::hash_char(&data5, data5.len(), 0x12345678); + assert_eq!(0xf333dabb, hash_val); +} -- Gitee From 8a71d815a72a28a1c0af2913851af6980b24f499 Mon Sep 17 00:00:00 2001 From: fengyang Date: Fri, 16 Dec 2022 22:24:03 +0800 Subject: [PATCH 4/5] =?UTF-8?q?FilterPolicy=20=E5=AE=9A=E4=B9=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 2 +- src/traits/filter_policy_trait.rs | 7 +++++++ src/util/filter_policy.rs | 4 ++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index b7d8eb0..dc51f61 100644 --- a/README.md +++ b/README.md @@ -49,6 +49,6 @@ LevelDB for rust | BloomFilter | fengyang | 0% | | CRC | wangboo、lxd5866 | | | Env | lxd5866 | | -| Hash | fengyang | 30% | +| Hash | fengyang | 100% | | MutexLock | kazeseiriou | | | Histgram | kazeseiriou | | \ No newline at end of file diff --git a/src/traits/filter_policy_trait.rs b/src/traits/filter_policy_trait.rs index fcfd19d..c751282 100644 --- a/src/traits/filter_policy_trait.rs +++ b/src/traits/filter_policy_trait.rs @@ -1,9 +1,16 @@ use crate::util::slice::Slice; +/// 用于key过滤,可以快速的排除不存在的key pub trait FilterPolicy { + /// filter的名字 + /// Return the name of this policy. Note that if the filter encoding + /// changes in an incompatible way, the name returned by this method + /// must be changed. Otherwise, old incompatible filters may be + /// passed to methods of this type. fn name() -> String; fn create_filter(&self, keys: Slice, n: u32) -> String; + fn key_may_match(key: &Slice, filter: &Slice) -> bool; } \ No newline at end of file diff --git a/src/util/filter_policy.rs b/src/util/filter_policy.rs index 9e9811e..ca88fa3 100644 --- a/src/util/filter_policy.rs +++ b/src/util/filter_policy.rs @@ -15,4 +15,8 @@ impl FilterPolicy for BloomFilterPolicy { fn create_filter(&self, keys: Slice, n: u32) -> String { todo!() } + + fn key_may_match(key: &Slice, filter: &Slice) -> bool { + todo!() + } } \ No newline at end of file -- Gitee From a4f1e02eb60070ed401cbd03da70f87414d895de Mon Sep 17 00:00:00 2001 From: fengyang Date: Fri, 16 Dec 2022 22:33:01 +0800 Subject: [PATCH 5/5] =?UTF-8?q?FilterPolicy=20=E5=AE=9A=E4=B9=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/traits/filter_policy_trait.rs | 26 +++++++++++++++++++++++++- src/util/filter_policy.rs | 2 +- src/util/hash.rs | 2 +- 3 files changed, 27 insertions(+), 3 deletions(-) diff --git a/src/traits/filter_policy_trait.rs b/src/traits/filter_policy_trait.rs index c751282..2ec339d 100644 --- a/src/traits/filter_policy_trait.rs +++ b/src/traits/filter_policy_trait.rs @@ -10,7 +10,31 @@ pub trait FilterPolicy { /// passed to methods of this type. fn name() -> String; - fn create_filter(&self, keys: Slice, n: u32) -> String; + /// 根据指定的参数创建过滤器,并返回结果, 结果为dst的原始内容 + append结果。 + /// 参数keys[0,n-1]包含依据用户提供的comparator排序的key列表--可重复, + /// 并把根据这些key创建的filter追加到 dst中。 + /// + /// keys[0,n-1] contains a list of keys (potentially with duplicates) + /// that are ordered according to the user supplied comparator. + /// Append a filter that summarizes keys[0,n-1] to *dst. + /// + /// Warning: do not change the initial contents of dst. Instead, + /// append the newly constructed filter to dst. + /// + /// # Arguments + /// + /// * `keys`: + /// * `n`: + /// * `dst`: + /// + /// returns: String + /// + /// # Examples + /// + /// ``` + /// + /// ``` + fn create_filter(&self, keys: Slice, n: u32, dst: String) -> String; fn key_may_match(key: &Slice, filter: &Slice) -> bool; } \ No newline at end of file diff --git a/src/util/filter_policy.rs b/src/util/filter_policy.rs index ca88fa3..b40e0dd 100644 --- a/src/util/filter_policy.rs +++ b/src/util/filter_policy.rs @@ -12,7 +12,7 @@ impl FilterPolicy for BloomFilterPolicy { String::from("leveldb.BuiltinBloomFilter2") } - fn create_filter(&self, keys: Slice, n: u32) -> String { + fn create_filter(&self, keys: Slice, n: u32, dst: String) -> String { todo!() } diff --git a/src/util/hash.rs b/src/util/hash.rs index a134fc7..ea82575 100644 --- a/src/util/hash.rs +++ b/src/util/hash.rs @@ -6,7 +6,7 @@ use crate::util::coding::Coding; pub struct Hash {} impl<'a> Hash { - /// + /// 计算 data 的 hash /// /// # Arguments /// -- Gitee