diff --git a/pack-rs/encode.rs b/pack-rs/encode.rs new file mode 100644 index 0000000000000000000000000000000000000000..08d6b9dd1f1fe36e7bdfdba613929f649b7215d2 --- /dev/null +++ b/pack-rs/encode.rs @@ -0,0 +1,254 @@ +//! +//! +//! + +use flate2::write::ZlibEncoder; +use sha1::{Digest, Sha1}; +use std::collections::VecDeque; +use std::{io::Write, sync::mpsc}; +use venus::internal::object::types::ObjectType; +use venus::{errors::GitError, hash::SHA1, internal::pack::entry::Entry}; + +const MIN_DELTA_RATE: f64 = 0.5; // minimum delta rate can accept + +pub struct PackEncoder { + object_number: usize, + process_index: usize, + window_size: usize, + window: VecDeque<(Entry, usize)>, // entry and offset + writer: W, + inner_offset: usize, // offset of current entry + inner_hash: Sha1, // Not SHA1 because need update trait + final_hash: Option, +} + +/// encode header of pack file (12 byte)
+/// include: 'PACK', Version(2), number of objects +fn encode_header(object_number: usize) -> Vec { + let mut result: Vec = vec![ + b'P', b'A', b'C', b'K', // The logotype of the Pack File + 0, 0, 0, 2, // generates version 2 only. + ]; + assert_ne!(object_number, 0); // guarantee self.number_of_objects!=0 + assert!(object_number < (1 << 32)); + //TODO: GitError:numbers of objects should < 4G , + result.append((object_number as u32).to_be_bytes().to_vec().as_mut()); // to 4 bytes (network byte order aka. big-endian) + result +} + +/// encode offset of delta object +fn encode_offset(mut value: usize) -> Vec { + assert_ne!(value, 0, "offset can't be zero"); + let mut bytes = Vec::new(); + let mut first_byte = true; + while value != 0 || first_byte { + let mut byte = (value & 0x7F) as u8; // 获取当前值的最低7位 + value >>= 7; // 右移7位准备处理下一个字节 + if first_byte { + first_byte = false; + } else { + byte -= 1; // sub 1 + byte |= 0x80; // set first bit one + } + bytes.push(byte); + } + bytes.reverse(); + bytes +} + +impl PackEncoder { + pub fn new(object_number: usize, window_size: usize, mut writer: W) -> Self { + let head = encode_header(object_number); + writer.write_all(&head).unwrap(); + let mut hash = Sha1::new(); + hash.update(&head); + PackEncoder { + object_number, + window_size, + process_index: 0, + window: VecDeque::with_capacity(window_size), + writer, + inner_offset: 12, // 12 bytes header + inner_hash: hash, + final_hash: None, + } + } + + /// get the hash of the pack file. if the pack file is not finished, return None + pub fn get_hash(&self) -> Option { + self.final_hash + } + + /// encode entries to a pack file with delta objects, write to writer + pub fn encode(&mut self, rx: mpsc::Receiver) -> Result<(), GitError> { + loop { + match rx.recv() { + Ok(entry) => { + self.process_index += 1; + // push window after encode to void diff by self + let offset = self.inner_offset; + self.encode_one_object(&entry)?; + self.window.push_back((entry, offset)); + if self.window.len() > self.window_size { + self.window.pop_front(); + } + } + Err(_) => { + if self.process_index != self.object_number { + panic!("not all objects are encoded"); + } + break; + } + } + } + + // hash signature + let hash_result = self.inner_hash.clone().finalize(); + self.final_hash = Some(SHA1::new(&hash_result.to_vec())); + self.writer.write_all(&hash_result).unwrap(); + Ok(()) + } + + /// try to encode as delta using objects in window + /// # Returns + /// return (delta entry, offset) if success make delta + /// return (origin Entry,None) if didn't delta, + fn try_as_offset_delta(&mut self, entry: &Entry) -> (Entry, Option) { + let mut best_base: Option<&(Entry, usize)> = None; + let mut best_rate: f64 = 0.0; + for try_base in self.window.iter() { + if try_base.0.obj_type != entry.obj_type { + continue; + } + let rate = delta::encode_rate(&try_base.0.data, &entry.data); + if rate > MIN_DELTA_RATE && rate > best_rate { + best_rate = rate; + best_base = Some(try_base); + } + } + if best_rate > 0.0 { + let best_base = best_base.unwrap(); // must some if best rate > 0 + let delta = delta::encode(&best_base.0.data, &entry.data); + let offset = self.inner_offset - best_base.1; + ( + Entry { + data: delta, + obj_type: ObjectType::OffsetDelta, + ..entry.clone() + }, + Some(offset), + ) + } else { + (entry.clone(), None) + } + } + + fn write_all_and_update(&mut self, data: &[u8]) { + self.inner_hash.update(data); + self.inner_offset += data.len(); + self.writer.write_all(data).unwrap(); + } + + /// encode one object, and update the hash + fn encode_one_object(&mut self, entry: &Entry) -> Result<(), GitError> { + // try encode as delta + let (entry, offset) = self.try_as_offset_delta(entry); + let obj_data = entry.data; + let obj_data_len = obj_data.len(); + let obj_type_number = entry.obj_type.to_u8(); + + // **header** encoding + let mut header_data = vec![(0x80 | (obj_type_number << 4)) + (obj_data_len & 0x0f) as u8]; + let mut size = obj_data_len >> 4; // 4 bit has been used in first byte + if size > 0 { + while size > 0 { + if size >> 7 > 0 { + header_data.push((0x80 | size) as u8); + size >>= 7; + } else { + header_data.push(size as u8); + break; + } + } + } else { + header_data.push(0); + } + self.write_all_and_update(&header_data); + + // **offset** encoding + if entry.obj_type == ObjectType::OffsetDelta { + let offset_data = encode_offset(offset.unwrap()); + self.write_all_and_update(&offset_data); + } else if entry.obj_type == ObjectType::HashDelta { + unreachable!("unsupported type") + } + + // **data** encoding, need zlib compress + let mut inflate = ZlibEncoder::new(Vec::new(), flate2::Compression::default()); + inflate.write_all(&obj_data) + .expect("zlib compress should never failed"); + inflate.flush().expect("zlib flush should never failed"); + let compressed_data = inflate.finish().expect("zlib compress should never failed"); + self.write_all_and_update(&compressed_data); + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use std::{io::Cursor, path::PathBuf, usize}; + + use venus::internal::object::blob::Blob; + + use crate::internal::pack::Pack; + + use super::*; + #[test] + fn test_pack_encoder() { + + fn encode_once(window_size: usize) -> Vec { + let mut writer: Vec = Vec::new(); + // make some different objects, or decode will fail + let str_vec = vec!["hello, code,", "hello, world.", "!", "123141251251"]; + let mut encoder = PackEncoder::new(str_vec.len(), window_size, &mut writer); + let (tx, rx) = mpsc::channel::(); + for str in str_vec { + let blob = Blob::from_content(str); + let entry: Entry = blob.into(); + tx.send(entry).unwrap(); + } + drop(tx); + encoder.encode(rx).unwrap(); + assert!(encoder.get_hash().is_some()); + writer + } + fn check_format(data: Vec) { + let mut p = Pack::new( + None, + Some(1024 * 20), + Some(PathBuf::from("/tmp/.cache_temp")), + ); + let mut reader = Cursor::new(data); + p.decode(&mut reader, |_|{}).expect("pack file format error"); + } + // without delta + let pack_without_delta = encode_once(0); + let pack_without_delta_size = pack_without_delta.len(); + check_format(pack_without_delta); + + // with delta + let pack_with_delta = encode_once(3); + assert_ne!(pack_with_delta.len(), pack_without_delta_size); + check_format(pack_with_delta); + } + + #[test] + fn test_encode_offset() { + let value = 11013; + let data = encode_offset(value); + println!("{:?}", data); + assert_eq!(data.len(), 2); + assert_eq!(data[0], 0b_1101_0101); + assert_eq!(data[1], 0b_0000_0101); + } +} diff --git a/pack-rs/utils.rs b/pack-rs/utils.rs new file mode 100644 index 0000000000000000000000000000000000000000..46d14731909e4935728ef4a379144ebff9a8ef50 --- /dev/null +++ b/pack-rs/utils.rs @@ -0,0 +1,498 @@ +//! +//! +//! +//! +use std::fs; +use std::io::{self, Read}; +use std::path::Path; +use sha1::{Digest, Sha1}; +use venus::hash::SHA1; +use venus::internal::object::types::ObjectType; + +/// Checks if the reader has reached EOF (end of file). +/// +/// It attempts to read a single byte from the reader into a buffer. +/// If `Ok(0)` is returned, it means no byte was read, indicating +/// that the end of the stream has been reached and there is no more +/// data left to read. +/// +/// Any other return value means that data was successfully read, so +/// the reader has not reached the end yet. +/// +/// # Arguments +/// +/// * `reader` - The reader to check for EOF state +/// It must implement the `std::io::Read` trait +/// +/// # Returns +/// +/// true if the reader reached EOF, false otherwise +#[allow(unused)] +pub fn is_eof(reader: &mut dyn Read) -> bool { + let mut buf = [0; 1]; + matches!(reader.read(&mut buf), Ok(0)) +} + +/// Reads a byte from the given stream and checks if there are more bytes to continue reading. +/// +/// The return value includes two parts: an unsigned integer formed by the first 7 bits of the byte, +/// and a boolean value indicating whether more bytes need to be read. +/// +/// # Parameters +/// * `stream`: The stream from which the byte is read. +/// +/// # Returns +/// Returns an `io::Result` containing a tuple. The first element is the value of the first 7 bits, +/// and the second element is a boolean indicating whether more bytes need to be read. +/// +#[allow(unused)] +pub fn read_byte_and_check_continuation(stream: &mut R) -> io::Result<(u8, bool)> { + // Create a buffer for a single byte + let mut bytes = [0; 1]; + + // Read exactly one byte from the stream into the buffer + stream.read_exact(&mut bytes)?; + + // Extract the byte from the buffer + let byte = bytes[0]; + + // Extract the first 7 bits of the byte + let value = byte & 0b0111_1111; + + // Check if the most significant bit (8th bit) is set, indicating more bytes to follow + let msb = byte >= 128; + + // Return the extracted value and the continuation flag + Ok((value, msb)) +} + +/// Reads bytes from the stream and parses the first byte for type and size. +/// Subsequent bytes are read as size bytes and are processed as variable-length +/// integer in little-endian order. The function returns the type and the computed size. +/// +/// # Parameters +/// * `stream`: The stream from which the bytes are read. +/// * `offset`: The offset of the stream. +/// +/// # Returns +/// Returns an `io::Result` containing a tuple of the type and the computed size. +/// +#[allow(unused)] +pub fn read_type_and_varint_size(stream: &mut R, offset: &mut usize) -> io::Result<(u8, usize)> { + let (first_byte, continuation) = read_byte_and_check_continuation(stream)?; + + // Increment the offset by one byte + *offset += 1; + + // Extract the type (bits 2, 3, 4 of the first byte) + let type_bits = (first_byte & 0b0111_0000) >> 4; + + // Initialize size with the last 4 bits of the first byte + let mut size: u64 = (first_byte & 0b0000_1111) as u64; + let mut shift = 4; // Next byte will shift by 4 bits + + let mut more_bytes = continuation; + while more_bytes { + let (next_byte, continuation) = read_byte_and_check_continuation(stream)?; + // Increment the offset by one byte + *offset += 1; + + size |= (next_byte as u64) << shift; + shift += 7; // Each subsequent byte contributes 7 more bits + more_bytes = continuation; + } + + Ok((type_bits, size as usize)) +} + +/// Reads a variable-length integer (VarInt) encoded in little-endian format from a source implementing the Read trait. +/// +/// The VarInt encoding uses the most significant bit (MSB) of each byte as a continuation bit. +/// The continuation bit being 1 indicates that there are following bytes. +/// The actual integer value is encoded in the remaining 7 bits of each byte. +/// +/// # Parameters +/// * `reader`: A source implementing the Read trait (e.g., file, network stream). +/// +/// # Returns +/// Returns a `Result` containing either: +/// * A tuple of the decoded `u64` value and the number of bytes read (`offset`). +/// * An `io::Error` in case of any reading error or if the VarInt is too long. +/// +#[allow(unused)] +pub fn read_varint_le(reader: &mut R) -> io::Result<(u64, usize)> { + // The decoded value + let mut value: u64 = 0; + // Bit shift for the next byte + let mut shift = 0; + // Number of bytes read + let mut offset = 0; + + loop { + // A buffer to read a single byte + let mut buf = [0; 1]; + // Read one byte from the reader + reader.read_exact(&mut buf)?; + + // The byte just read + let byte = buf[0]; + if shift > 63 { + // VarInt too long for u64 + return Err(io::Error::new(io::ErrorKind::InvalidData, "VarInt too long")); + } + + // Take the lower 7 bits of the byte + let byte_value = (byte & 0x7F) as u64; + // Add the byte value to the result, considering the shift + value |= byte_value << shift; + + // Increment the byte count + offset += 1; + // Check if the MSB is 0 (last byte) + if byte & 0x80 == 0 { + break; + } + + // Increment the shift for the next byte + shift += 7; + } + + Ok((value, offset)) +} + + + +/// The offset for an OffsetDelta object(big-endian order) +/// # Arguments +/// +/// * `stream`: Input Data Stream to read +/// # Returns +/// * (`delta_offset`(unsigned), `consume`) +pub fn read_offset_encoding(stream: &mut R) -> io::Result<(u64, usize)> { + // Like the object length, the offset for an OffsetDelta object + // is stored in a variable number of bytes, + // with the most significant bit of each byte indicating whether more bytes follow. + // However, the object length encoding allows redundant values, + // e.g. the 7-bit value [n] is the same as the 14- or 21-bit values [n, 0] or [n, 0, 0]. + // Instead, the offset encoding adds 1 to the value of each byte except the least significant one. + // And just for kicks, the bytes are ordered from *most* to *least* significant. + let mut value = 0; + let mut offset = 0; + loop { + let (byte_value, more_bytes) = read_byte_and_check_continuation(stream)?; + offset += 1; + value = (value << 7) | byte_value as u64; + if !more_bytes { + return Ok((value, offset)); + } + + value += 1; //important!: for n >= 2 adding 2^7 + 2^14 + ... + 2^(7*(n-1)) to the result + } +} + +/// Read the next N bytes from the reader +/// +#[inline] +pub fn read_bytes(stream: &mut R) -> io::Result<[u8; N]> { + let mut bytes = [0; N]; + stream.read_exact(&mut bytes)?; + + Ok(bytes) +} + + +/// Reads a partial integer from a stream. (little-endian order) +/// +/// # Arguments +/// +/// * `stream` - A mutable reference to a readable stream. +/// * `bytes` - The number of bytes to read from the stream. +/// * `present_bytes` - A mutable reference to a byte indicating which bits are present in the integer value. +/// +/// # Returns +/// +/// This function returns a result of type `io::Result`. If the operation is successful, the integer value +/// read from the stream is returned as `Ok(value)`. Otherwise, an `Err` variant is returned, wrapping an `io::Error` +/// that describes the specific error that occurred. +pub fn read_partial_int( + stream: &mut R, + bytes: u8, + present_bytes: &mut u8, +) -> io::Result { + let mut value: usize = 0; + + // Iterate over the byte indices + for byte_index in 0..bytes { + // Check if the current bit is present + if *present_bytes & 1 != 0 { + // Read a byte from the stream + let [byte] = read_bytes(stream)?; + + // Add the byte value to the integer value + value |= (byte as usize) << (byte_index * 8); + } + + // Shift the present bytes to the right + *present_bytes >>= 1; + } + + Ok(value) +} + +/// Calculate the SHA1 hash of the given object. +///
"` \0`" +///
data: The decompressed content of the object +pub fn calculate_object_hash(obj_type: ObjectType, data: &Vec) -> SHA1 { + let mut hash = Sha1::new(); + // Header: " \0" + hash.update(obj_type.to_bytes()); + hash.update(b" "); + hash.update(data.len().to_string()); + hash.update(b"\0"); + + // Decompressed data(raw content) + hash.update(data); + + let re: [u8; 20] = hash.finalize().into(); + SHA1(re) +} +/// Create an empty directory or clear the existing directory. +pub fn create_empty_dir>(path: P) -> io::Result<()> { + let dir = path.as_ref(); + // 删除整个文件夹 + if dir.exists() { + fs::remove_dir_all(dir)?; + } + // 重新创建文件夹 + fs::create_dir_all(dir)?; + Ok(()) +} + +/// Count the number of files in a directory and its subdirectories. +pub fn count_dir_files(path: &Path) -> io::Result { + let mut count = 0; + for entry in fs::read_dir(path)? { + let entry = entry?; + let path = entry.path(); + if path.is_dir() { + count += count_dir_files(&path)?; + } else { + count += 1; + } + } + Ok(count) +} + +/// Count the time taken to execute a block of code. +#[macro_export] +macro_rules! time_it { + ($msg:expr, $block:block) => { + { + let start = std::time::Instant::now(); + let result = $block; + let elapsed = start.elapsed(); + println!("{}: {:?}", $msg, elapsed); + result + } + }; +} + +#[cfg(test)] +mod tests { + use std::io; + use std::io::Cursor; + use std::io::Read; + use venus::internal::object::types::ObjectType; + + use crate::internal::pack::utils::*; + + #[test] + fn test_calc_obj_hash() { + let hash = calculate_object_hash(ObjectType::Blob, &b"a".to_vec()); + assert_eq!(hash.to_plain_str(), "2e65efe2a145dda7ee51d1741299f848e5bf752e"); + } + + #[test] + fn eof() { + let mut reader = Cursor::new(&b""[..]); + assert!(is_eof(&mut reader)); + } + + #[test] + fn not_eof() { + let mut reader = Cursor::new(&b"abc"[..]); + assert!(!is_eof(&mut reader)); + } + + #[test] + fn eof_midway() { + let mut reader = Cursor::new(&b"abc"[..]); + reader.read_exact(&mut [0; 2]).unwrap(); + assert!(!is_eof(&mut reader)); + } + + #[test] + fn reader_error() { + struct BrokenReader; + impl Read for BrokenReader { + fn read(&mut self, _: &mut [u8]) -> io::Result { + Err(io::Error::new(io::ErrorKind::Other, "error")) + } + } + + let mut reader = BrokenReader; + assert!(!is_eof(&mut reader)); + } + + // Test case for a byte without a continuation bit (most significant bit is 0) + #[test] + fn test_read_byte_and_check_continuation_no_continuation() { + let data = [0b0101_0101]; // 85 in binary, highest bit is 0 + let mut cursor = Cursor::new(data); + let (value, more_bytes) = read_byte_and_check_continuation(&mut cursor).unwrap(); + + assert_eq!(value, 85); // Expected value is 85 + assert!(!more_bytes); // No more bytes are expected + } + + // Test case for a byte with a continuation bit (most significant bit is 1) + #[test] + fn test_read_byte_and_check_continuation_with_continuation() { + let data = [0b1010_1010]; // 170 in binary, highest bit is 1 + let mut cursor = Cursor::new(data); + let (value, more_bytes) = read_byte_and_check_continuation(&mut cursor).unwrap(); + + assert_eq!(value, 42); // Expected value is 42 (170 - 128) + assert!(more_bytes); // More bytes are expected + } + + // Test cases for edge values, like the minimum and maximum byte values + #[test] + fn test_read_byte_and_check_continuation_edge_cases() { + // Test the minimum value (0) + let data = [0b0000_0000]; + let mut cursor = Cursor::new(data); + let (value, more_bytes) = read_byte_and_check_continuation(&mut cursor).unwrap(); + + assert_eq!(value, 0); // Expected value is 0 + assert!(!more_bytes); // No more bytes are expected + + // Test the maximum value (255) + let data = [0b1111_1111]; + let mut cursor = Cursor::new(data); + let (value, more_bytes) = read_byte_and_check_continuation(&mut cursor).unwrap(); + + assert_eq!(value, 127); // Expected value is 127 (255 - 128) + assert!(more_bytes); // More bytes are expected + } + + // Test with a single byte where msb is 0 (no continuation) + #[test] + fn test_single_byte_no_continuation() { + let data = [0b0101_0101]; // Type: 5 (101), Size: 5 (0101) + let mut offset: usize = 0; + let mut cursor = Cursor::new(data); + let (type_bits, size) = read_type_and_varint_size(&mut cursor, &mut offset).unwrap(); + + assert_eq!(offset, 1); // Offset is 1 + assert_eq!(type_bits, 5); // Expected type is 2 + assert_eq!(size, 5); // Expected size is 5 + } + + // Test with multiple bytes, where continuation occurs + #[test] + fn test_multiple_bytes_with_continuation() { + // Type: 5 (101), Sizes: 5 (0101), 3 (0000011) in little-endian order + let data = [0b1101_0101, 0b0000_0011]; // Second byte's msb is 0 + let mut offset: usize = 0; + let mut cursor = Cursor::new(data); + let (type_bits, size) = read_type_and_varint_size(&mut cursor, &mut offset).unwrap(); + + assert_eq!(offset, 2); // Offset is 2 + assert_eq!(type_bits, 5); // Expected type is 5 + // Expected size 000000110101 + // 110101 = 1 * 2^5 + 1 * 2^4 + 0 * 2^3 + 1 * 2^2 + 0 * 2^1 + 1 * 2^0= 53 + assert_eq!(size, 53); + } + + // Test with edge case where size is spread across multiple bytes + #[test] + fn test_edge_case_size_spread_across_bytes() { + // Type: 1 (001), Sizes: 15 (1111) in little-endian order + let data = [0b0001_1111, 0b0000_0010]; // Second byte's msb is 1 (continuation) + let mut offset: usize = 0; + let mut cursor = Cursor::new(data); + let (type_bits, size) = read_type_and_varint_size(&mut cursor, &mut offset).unwrap(); + + assert_eq!(offset, 1); // Offset is 1 + assert_eq!(type_bits, 1); // Expected type is 1 + // Expected size is 15 + assert_eq!(size, 15); + } + + #[test] + fn test_read_varint_le_single_byte() { + // Single byte: 0x05 (binary: 0000 0101) + // Represents the value 5 with no continuation bit set. + let data = vec![0x05]; + let mut cursor = Cursor::new(data); + let (value, offset) = read_varint_le(&mut cursor).unwrap(); + + assert_eq!(value, 5); + assert_eq!(offset, 1); + } + + #[test] + fn test_read_varint_le_multiple_bytes() { + // Two bytes: 0x85, 0x01 (binary: 1000 0101, 0000 0001) + // Represents the value 133. First byte has the continuation bit set. + let data = vec![0x85, 0x01]; + let mut cursor = Cursor::new(data); + let (value, offset) = read_varint_le(&mut cursor).unwrap(); + + assert_eq!(value, 133); + assert_eq!(offset, 2); + } + + #[test] + fn test_read_varint_le_large_number() { + // Five bytes: 0xFF, 0xFF, 0xFF, 0xFF, 0xF (binary: 1111 1111, 1111 1111, 1111 1111, 1111 1111, 0000 1111) + // Represents the value 134,217,727. All continuation bits are set except in the last byte. + let data = vec![0xFF, 0xFF, 0xFF, 0xFF, 0xF]; + let mut cursor = Cursor::new(data); + let (value, offset) = read_varint_le(&mut cursor).unwrap(); + + assert_eq!(value, 0xFFFFFFFF); + assert_eq!(offset, 5); + } + + #[test] + fn test_read_varint_le_zero() { + // Single byte: 0x00 (binary: 0000 0000) + // Represents the value 0 with no continuation bit set. + let data = vec![0x00]; + let mut cursor = Cursor::new(data); + let (value, offset) = read_varint_le(&mut cursor).unwrap(); + + assert_eq!(value, 0); + assert_eq!(offset, 1); + } + + #[test] + fn test_read_varint_le_too_long() { + let data = vec![0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x01]; + let mut cursor = Cursor::new(data); + let result = read_varint_le(&mut cursor); + + assert!(result.is_err()); + } + + #[test] + fn test_read_offset_encoding(){ + let data:Vec = vec![0b_1101_0101,0b_0000_0101]; + let mut cursor = Cursor::new(data); + let result = read_offset_encoding(&mut cursor); + assert!(result.is_ok()); + assert_eq!(result.unwrap(), (11013, 2)); + } +} \ No newline at end of file