From 6b3a9b67def7cce832f0cf7bb55c9523c625f37d Mon Sep 17 00:00:00 2001 From: sonichen <1606673007@qq.com> Date: Thu, 14 Sep 2023 18:21:53 +0800 Subject: [PATCH] mda project --- mda/BUILD | 16 + mda/Cargo.toml | 34 ++ mda/example_anno_config.toml | 31 ++ mda/src/anno_version_control.rs | 466 ++++++++++++++++++++ mda/src/entity.rs | 86 ++++ mda/src/lib.rs | 29 ++ mda/src/map/mod.rs | 2 + mda/src/map/read_from_file.rs | 152 +++++++ mda/src/map/read_from_folders.rs | 35 ++ mda/src/map_data.rs | 35 ++ mda/src/mda_operations/extract.rs | 620 ++++++++++++++++++++++++++ mda/src/mda_operations/generate.rs | 685 +++++++++++++++++++++++++++++ mda/src/mda_operations/mod.rs | 3 + mda/src/mda_operations/update.rs | 161 +++++++ mda/src/rev_anno.rs | 371 ++++++++++++++++ mda/src/run_mda.rs | 455 +++++++++++++++++++ mda/src/utils.rs | 387 ++++++++++++++++ 17 files changed, 3568 insertions(+) create mode 100644 mda/BUILD create mode 100644 mda/Cargo.toml create mode 100644 mda/example_anno_config.toml create mode 100644 mda/src/anno_version_control.rs create mode 100644 mda/src/entity.rs create mode 100644 mda/src/lib.rs create mode 100644 mda/src/map/mod.rs create mode 100644 mda/src/map/read_from_file.rs create mode 100644 mda/src/map/read_from_folders.rs create mode 100644 mda/src/map_data.rs create mode 100644 mda/src/mda_operations/extract.rs create mode 100644 mda/src/mda_operations/generate.rs create mode 100644 mda/src/mda_operations/mod.rs create mode 100644 mda/src/mda_operations/update.rs create mode 100644 mda/src/rev_anno.rs create mode 100644 mda/src/run_mda.rs create mode 100644 mda/src/utils.rs diff --git a/mda/BUILD b/mda/BUILD new file mode 100644 index 00000000..867db6ff --- /dev/null +++ b/mda/BUILD @@ -0,0 +1,16 @@ + +load("@crate_index//:defs.bzl", "aliases", "all_crate_deps") +load("@rules_rust//rust:defs.bzl", "rust_library", "rust_test", "rust_doc_test") + +rust_library( + name = "mda", + srcs = glob([ + "src/**/*.rs", + ]), + aliases = aliases(), + deps = 
all_crate_deps(), + proc_macro_deps = all_crate_deps( + proc_macro = True, + ), + visibility = ["//visibility:public"], +) diff --git a/mda/Cargo.toml b/mda/Cargo.toml new file mode 100644 index 00000000..1f168b4c --- /dev/null +++ b/mda/Cargo.toml @@ -0,0 +1,34 @@ +[package] +name = "mda" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[lib] +name = "mda" +path = "src/lib.rs" + + +[dependencies] +clap = { version = "4.3.0", features = ["derive"] } +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" +bincode = "1.3.3" +image = "0.24.6" +encoding = "0.2" +hound = "3.5.0" +anyhow = "1.0.72" +chrono="0.4.26" +rayon="1" +tokio="1.29.1" +walkdir = "2.3.1" +sha1="0.10.5" +hex="0.4.3" +env_logger = "0.10.0" +csv="1.2.2" +indicatif = "0.17.0" +mp4parse = "0.17.0" +prettytable = "0.10.0" +serde_derive = "1.0" +toml="0.8.0" \ No newline at end of file diff --git a/mda/example_anno_config.toml b/mda/example_anno_config.toml new file mode 100644 index 00000000..21405f6e --- /dev/null +++ b/mda/example_anno_config.toml @@ -0,0 +1,31 @@ +title = "anno config" + +# This is for processing the "combine" case (where all annotation data is in one file) when generating an MDA file. You can configure one or more annotation data with the following parameters: + +# id: the name of the annotation data. If not specified, the file name of the annotation data will be used as the ID. +# path (required): the file path of the annotation data. +# start: the line number to start reading from in the annotation data file. If not specified, it will start from the first line. +# end: the line number to stop reading from in the annotation data file. If not specified, it will read until the end of the file. 
+ + +[[annotation]] +id = "identify" +path = "test/anno/identity_CelebA.txt" +start = 1 +end=1001 + +[[annotation]] +path = "test/anno/list_attr_celeba.txt" +start = 3 +end=1002 + +[[annotation]] +path = "test/anno/list_bbox_celeba.txt" +start = 3 +end=1002 + +[[annotation]] +id = "landmarks" +path = "test/anno/list_landmarks_celeba.txt" +start=3 +end=1002 diff --git a/mda/src/anno_version_control.rs b/mda/src/anno_version_control.rs new file mode 100644 index 00000000..18c86308 --- /dev/null +++ b/mda/src/anno_version_control.rs @@ -0,0 +1,466 @@ +//! Build a data structure similar to the revlog format to implement version control and incremental storage. + +use serde::{Deserialize, Serialize}; +use sha1::{Digest, Sha1}; +use std::cmp::{max, min}; +use std::process; + +mod constants { + pub const BLOCK_SIZE: usize = 10; // TODO + pub const NULLID: [u8; 20] = [0; 20]; + pub const SNAPSHOT_BASE: u32 = 3; +} + +/// Splitting large-scale data into fixed-size data blocks and recording the block numbers. +fn split_data_into_blocks(data: Vec, block_size: usize) -> (Vec, Vec) { + let mut blocks = Vec::new(); + let mut index = 0; + let mut block_number = 0; + let mut numbers: Vec = Vec::new(); + while index < data.len() { + numbers.push(block_number); + + let end = std::cmp::min(index + block_size, data.len()); + blocks.push(DataBlock::new(block_number, data[index..end].to_vec())); + index = end; + block_number += 1; + } + + (blocks, numbers) +} + +/// Comparing data block lists to find newly added data blocks. 
+fn find_different_blocks( + last_id: u8, + entries: &Vec, + current_data: &[u8], + _block_size: usize, +) -> Vec { + let blocks_list = get_data_blocks_up_to_id(last_id, entries); + let (current_data_blocks, _data_indices) = + split_data_into_blocks(current_data.clone().to_vec(), constants::BLOCK_SIZE); + + // Find elements in block1 that are not in block2 + let elements_not_in_block1: Vec = current_data_blocks + .iter() + .filter(|current_data_blocks_item| { + !blocks_list + .iter() + .any(|blocks_list_item| blocks_list_item.data == current_data_blocks_item.data) + }) + .cloned() + .collect(); + + elements_not_in_block1 +} + +/// add new blocks to blocklist +fn add_to_block_list( + mut block_list: Vec, + different_blocks: Vec, +) -> (Vec, Vec) { + let mut diff_number = Vec::::new(); + for mut block in different_blocks { + let last_block_number = block_list.last().map_or(0, |block| block.block_number); + + block.block_number = 1 + last_block_number; + diff_number.push(block.block_number); + block_list.push(block); + } + + // block_list + (block_list, diff_number) +} + +/// extract index from data blocks +fn extract_index(vec_data1: &[DataBlock], vec_data2: &[DataBlock]) -> Vec { + let mut index: Vec = Vec::new(); + for data_block1 in vec_data1.iter() { + if let Some(index_in_vec_data2) = vec_data2 + .iter() + .position(|data_block2| data_block1.data == data_block2.data) + { + index.push(vec_data2[index_in_vec_data2].block_number); + } + } + + index +} + +impl RevAnnoEntry { + /// new RevAnnoEntry + fn new(id: u8, index: Vec, blocks: Vec) -> Self { + RevAnnoEntry { id, index, blocks } + } + + /// add first RevAnnoEntry + pub fn init(content: &str) -> (Vec, Vec) { + // Config current content + let data: Vec = content.as_bytes().to_vec(); + let (blocks, data_indices) = split_data_into_blocks(data.clone(), constants::BLOCK_SIZE); + + // Config enrty + let entry = RevAnnoEntry::new(0, data_indices, blocks); + + let entries: Vec = vec![entry]; + + // Config Header + let 
nodeid = compute_nodeid(&constants::NULLID, &constants::NULLID, &data); + + let rev_anno_header = RevAnnoHeader::new( + 0, + 0, + data.len() as u32, + 0, + 0, + constants::NULLID, + constants::NULLID, + nodeid, + true, + ); + let headers: Vec = vec![rev_anno_header]; + (headers, entries) + } + + /// add entries to list + pub fn add( + content: &str, + mut entries: Vec, + mut headers: Vec, + ) -> (Vec, Vec) { + //Config data from last entry + let last_entry = entries.last().unwrap_or_else(|| { + println!("The last data is empty!"); + process::exit(1); + }); + let last_id = last_entry.id; + let last_header = headers.last().unwrap_or_else(|| { + println!("The last data is empty!"); + process::exit(1); + }); + let last_node_id = last_header.nodeid; + let mut last_p1 = last_header.p1rev; + if last_id == 0 { + last_p1 = last_header.nodeid; + } + + // Config current data info + let current_id = last_id + 1; + + // change to Vec + let current_data: Vec = content.as_bytes().to_vec(); + let (current_data_blocks, _data_indices) = + split_data_into_blocks(current_data.clone(), constants::BLOCK_SIZE); + + // Build a block list and record the construction number of the original data + let different_blocks = + find_different_blocks(last_id, &entries, ¤t_data, constants::BLOCK_SIZE); + + let block_list = get_data_blocks_up_to_id(last_id, &entries); + let (records, diff) = add_to_block_list(block_list, different_blocks); + + // assign id to diff blocks + let diff_blocks: Vec = records + .iter() + .filter_map(|record| { + if diff.contains(&record.block_number) { + Some(DataBlock { + block_number: record.block_number, + data: record.data.clone(), + }) + } else { + None + } + }) + .collect(); + + // get current index + + let matching_block_numbers = extract_index(¤t_data_blocks, &records); + let store_matching_block_numbers = matching_block_numbers.clone(); + + // Configure the entry: + // 1) If it already exists, do not store it. + // 2) If it is a snapshot, store the entire entry. 
+ // 3) Otherwise, store only the differential data. + // Config entry + let nodeid = compute_nodeid(&constants::NULLID, &constants::NULLID, ¤t_data); + + let mut entry = RevAnnoEntry { + id: current_id, + index: matching_block_numbers, + blocks: diff_blocks, + }; + + // Check if it existed + for item in &entries { + if item.index == entry.index { + return (headers, entries.clone()); + } + } + + entries.push(entry); + + // check if it is a snapshot + + + if (current_id as u32) % constants::SNAPSHOT_BASE == 0 { + let mut all_blocks: Vec = Vec::new(); + for entry in &mut entries { + for block in &entry.blocks { + all_blocks.push(block.clone()); + } + } + let new_entry = RevAnnoEntry { + id: current_id, // Copy the id field + index: store_matching_block_numbers, // Copy the index field + blocks: all_blocks, // Use the cloned all_blocks here + }; + entry = new_entry; // Assign the modified new_entry back to entry + + if let Some(last_element) = entries.last_mut() { + *last_element = entry; + } + //Config header + let rev_anno_header = RevAnnoHeader::new( + current_id, + 0, + 0, + current_id as i32, + last_id as i32, + last_node_id, + constants::NULLID, + nodeid, + true, + ); + + headers.push(rev_anno_header); + (headers, entries) + } else { + + //Config header + let mut rev_anno_header = RevAnnoHeader::new( + current_id, + 0, + 0, + 0, + last_id as i32, + last_p1, + constants::NULLID, + nodeid, + false, + ); + let nearest_id = find_nearest_multiple_of_snapshot_base(last_id as u32); + match nearest_id { + Some(nearest_id) => { + rev_anno_header.baserev = nearest_id as i32; + + if let Some(nearest_item) = headers.get(nearest_id as usize) { + rev_anno_header.p2rev = nearest_item.nodeid; + } + } + None => { + rev_anno_header.baserev = current_id as i32; + } + } + + headers.push(rev_anno_header); + (headers, entries) + } + } +} + +/// Compute nodeid hash using sha1 +fn compute_nodeid(parent1: &[u8; 20], parent2: &[u8; 20], contents: &[u8]) -> [u8; 20] { + let mut hasher = 
Sha1::new(); + hasher.update(min(parent1, parent2)); + hasher.update(max(parent1, parent2)); + hasher.update(contents); + let result = hasher.finalize(); + let mut nodeid = [0u8; 20]; + nodeid.copy_from_slice(&result); + nodeid +} + +/// shorten nodeid +fn nodeid_to_short_hex(nodeid: &[u8; 20]) -> String { + let nodeid_hex_string: String = nodeid + .iter() + .take(6) + .map(|b| format!("{:02x}", b)) + .collect(); + nodeid_hex_string +} + +/// Function to combine Vec into text +fn combine_data_blocks_to_text(data_blocks: &Vec) -> String { + let mut combined_text = String::new(); + for data_block in data_blocks { + combined_text.push_str(std::str::from_utf8(&data_block.data).unwrap()); + } + combined_text +} + +/// Find the corresponding indexes by ID. +fn find_index_by_id(id: u8, delta_list: &[RevAnnoEntry]) -> Option> { + let delta_to_find = delta_list.iter().find(|entry| entry.id == id); + + delta_to_find.map(|entry| entry.index.clone()) +} + +/// Get all data blocks from ID 0 to the input ID. +fn get_data_blocks_up_to_id(last_id: u8, delta_list: &Vec) -> Vec { + let mut data_blocks = Vec::new(); + let nearest_id = find_nearest_multiple_of_snapshot_base(last_id as u32); + match nearest_id { + Some(nearest_id) => { + let mut delta_list_iter = delta_list + .iter() + .skip_while(|entry| entry.id < nearest_id as u8); + for entry in &mut delta_list_iter { + data_blocks.extend(entry.blocks.iter().cloned()); + } + } + None => { + for entry in delta_list { + if entry.id <= last_id { + data_blocks.extend(entry.blocks.iter().cloned()); + } + } + } + } + data_blocks +} +fn find_nearest_multiple_of_snapshot_base(target: u32) -> Option { + if target < constants::SNAPSHOT_BASE { + return None; + } + + let nearest_multiple = target - (target % constants::SNAPSHOT_BASE); + if nearest_multiple < constants::SNAPSHOT_BASE { + return None; + } + + Some(nearest_multiple) +} + +/// Get the Vec corresponding to the indexes. 
+fn get_data_blocks_by_index(index: &Vec, data_blocks: &[DataBlock]) -> Vec { + let mut result_blocks = Vec::new(); + for &idx in index { + if let Some(data_block) = data_blocks.iter().find(|block| block.block_number == idx) { + result_blocks.push(data_block.clone()); + } + } + result_blocks +} +/// Get full data(string) +pub fn get_full_data(id: u8, entries: Vec) -> String { + if let Some(index) = find_index_by_id(id, &entries) { + let data_blocks = get_data_blocks_up_to_id(id, &entries); + let selected_blocks = get_data_blocks_by_index(&index, &data_blocks); + combine_data_blocks_to_text(&selected_blocks) + } else { + println!("No data blocks found for ID {}", id); + process::exit(1); + } +} + +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct RevAnnoEntry { + pub id: u8, + pub index: Vec, + pub blocks: Vec, +} +/// Structure for a data block +#[derive(Debug, Clone, PartialEq, Deserialize, Serialize)] +pub struct DataBlock { + /// Block number of the data block + pub block_number: usize, + /// Content of the data block + pub data: Vec, +} + +impl DataBlock { + fn new(block_number: usize, data: Vec) -> Self { + DataBlock { block_number, data } + } +} +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct RevAnnoHeader { + pub rev: u8, + pub offset: u64, + pub length: u32, + pub baserev: i32, + pub linkrev: i32, + pub p1rev: [u8; 20], + pub p2rev: [u8; 20], + pub nodeid: [u8; 20], + pub snapshot: bool, +} +impl RevAnnoHeader { + #![allow(clippy::too_many_arguments)] + fn new( + rev: u8, + offset: u64, + length: u32, + baserev: i32, + linkrev: i32, + p1rev: [u8; 20], + p2rev: [u8; 20], + nodeid: [u8; 20], + snapshot: bool, + ) -> RevAnnoHeader { + RevAnnoHeader { + rev: (rev), + offset: (offset), + length, + baserev, + linkrev, + p1rev, + p2rev, + nodeid, + snapshot, + } + } +} + +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct RevAnno { + pub headers: Vec, + pub entries: Vec, +} + +impl RevAnno { + pub fn new(rev_anno_header: Vec, 
entries: Vec) -> Self { + RevAnno { + headers: (rev_anno_header), + entries: (entries), + } + } +} + +pub fn print_rev_anno_headers(headers: &Vec) { + println!( + "{:<6} {:<8} {:<7} {:<6} {:<7} {:<12} {:<12} {:<40} {:<6} ", + "rev", "offset", "length", "delta", "linkrev", "nodeid", "p1", "p2", "snap" + ); + for (count, header) in headers.iter().enumerate() { + let mut rev = header.rev.to_string(); + if count == headers.len() - 1 { + rev = header.rev.to_string() + "*"; + } + println!( + "{:<6} {:<8} {:<7} {:<6} {:<7} {:<12} {:<12} {:<40} {:<6} ", + rev, + header.offset, + header.length, + header.baserev, + header.linkrev, + nodeid_to_short_hex(&header.nodeid), + nodeid_to_short_hex(&header.p1rev), + nodeid_to_short_hex(&header.p2rev), + header.snapshot + ); + } +} diff --git a/mda/src/entity.rs b/mda/src/entity.rs new file mode 100644 index 00000000..ac8b35f7 --- /dev/null +++ b/mda/src/entity.rs @@ -0,0 +1,86 @@ +//! Store some entity. + +use serde::{Deserialize, Serialize}; + + + +#[derive(Serialize, Deserialize, Debug)] +pub struct MDAIndex { + pub header_offset: u64, + pub train_data_offset: u64, + pub annotations_offset:Vec +} +#[derive(Serialize, Deserialize, Debug,Clone)] +pub struct AnnoOffset{ + pub id:String, + pub header_offset:u64, + pub entries_offset:u64, +} +impl AnnoOffset { + pub fn new (id:&str)->AnnoOffset{ + AnnoOffset { id: id.to_string(), header_offset: 0, entries_offset: 0 } + } +} +/// Define the MDAHeader structure +#[derive(Serialize, Deserialize, Debug)] +pub struct MDAHeader { + pub tags: Vec, + pub train_data: TrainData, +} + +/// Define the train_data_index in header +#[derive(Serialize, Deserialize, Debug)] +pub struct TrainData { + pub data_type: String, + pub metadata: String, +} + +/// Type of training data +#[derive(Serialize, Deserialize, Debug)] +pub enum TrainingData { + Text(String), + Image(Vec), + Video(Vec), + Audio(Vec), +} + +/// Type of training data +#[derive(Serialize, Deserialize, Debug)] +pub enum DataType { + Text, + 
Image, + Video, + Audio, +} + +/// Used to store the image metadata +#[derive(Serialize, Deserialize, Debug)] +pub struct ImageMetaData { + pub size: (u32, u32), + pub channel_count: u8, + pub color_space: String, +} + +/// Used to store the text metadata +#[derive(Serialize, Deserialize, Debug)] +pub struct TextMetaData { + pub length: usize, + pub encoding: String, + pub vocabulary_size: usize, +} + +/// Used to store the aduio metadata +#[derive(Serialize, Deserialize, Debug)] +pub struct AudioMetaData { + pub duration: f64, + pub sample_rate: u32, + pub channels: u16, + pub bit_depth: u16, +} + +// VideoMetaData +#[derive(Debug, Clone)] +pub struct VideoMetaData { + pub duration: f64, + pub resolution: (u16, u16), +} diff --git a/mda/src/lib.rs b/mda/src/lib.rs new file mode 100644 index 00000000..a8d6dee6 --- /dev/null +++ b/mda/src/lib.rs @@ -0,0 +1,29 @@ +pub mod entity; +pub use entity::*; + +pub mod utils; +pub use utils::*; + + + +pub mod rev_anno; +pub use rev_anno::*; + + +pub mod mda_operations{ + pub mod generate; + pub mod extract; + pub mod update; +} +pub use mda_operations::generate; +pub use mda_operations::extract; +pub use mda_operations::update; + +pub mod map{ + pub mod read_from_file; + pub mod read_from_folders; +} +pub use map::*; + + +pub mod run_mda; \ No newline at end of file diff --git a/mda/src/map/mod.rs b/mda/src/map/mod.rs new file mode 100644 index 00000000..c7b72713 --- /dev/null +++ b/mda/src/map/mod.rs @@ -0,0 +1,2 @@ +pub mod read_from_file; +pub mod read_from_folders; \ No newline at end of file diff --git a/mda/src/map/read_from_file.rs b/mda/src/map/read_from_file.rs new file mode 100644 index 00000000..a14e9977 --- /dev/null +++ b/mda/src/map/read_from_file.rs @@ -0,0 +1,152 @@ +//! Used to map the traning data and its annotation data +//! Case1: All the annotation data is stored in one CSV or JSON pr TXT file and it needs to be parsed +//! 
+use csv::ReaderBuilder; +use serde::{Deserialize, Serialize}; +use serde_json::Value; +use std::error::Error; +use std::fs::File; +use std::io::{BufRead, BufReader}; + +/// Matching training data and annotation data +pub fn get_train_path_and_anno_content(file_path: &str, start_line: usize,end_line: usize) -> Vec { + + if file_path.ends_with("txt") { + read_txt_file_info(file_path, start_line,end_line) + } else if file_path.ends_with("csv") { + read_csv_file_info(file_path, start_line,end_line).unwrap() + } else if file_path.ends_with("json") { + read_json_file_info(file_path, start_line,end_line).unwrap() + } else { + std::process::exit(0); + } +} +/// Record the training data name and annotation content +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct AnnoInfo { + /// training data name + pub file_name: String, + /// annotation content + pub content: String, +} +impl AnnoInfo { + fn from_json_object(json_object: &Value) -> Option { + let file_name = json_object["filename"].as_str()?.to_string(); + let content = serde_json::to_string_pretty(json_object).ok()?; + Some(Self { file_name, content }) + } + pub fn new()->AnnoInfo{ + AnnoInfo { file_name: "".to_string(), content: "".to_string() } + } +} +fn read_txt_file_info(file_path: &str, start_line: usize, end_line: usize) -> Vec { + let file = File::open(file_path).expect("Failed to open the file"); + let reader = BufReader::new(file); + let mut txt_info_vec: Vec = Vec::new(); + let mut current_line = 1; + + for (_line_number, line) in reader.lines().enumerate() { + let line = line.expect("Failed to read line"); + + if current_line < start_line { + current_line += 1; + continue; + } + + let parts: Vec<&str> = line.splitn(2, ' ').collect(); + if parts.len() == 2 { + let txt_info = AnnoInfo { + file_name: parts[0].to_string(), + content: parts[1].to_string(), + }; + txt_info_vec.push(txt_info); + } + + current_line += 1; + + if end_line != 0 && current_line > end_line { + break; + } + } + + txt_info_vec 
+} + +fn read_csv_file_info(file_path: &str, start_line: usize, end_line: usize) -> Result, Box> { + // 省略文件打开和错误处理 + + let file = File::open(file_path)?; + let mut rdr = ReaderBuilder::new().from_reader(file); + let mut csv_info_vec: Vec = Vec::new(); + let mut current_line = 1; + + for (_line_number, result) in rdr.records().enumerate() { + let record = result.expect("Failed to read record"); + + if current_line < start_line { + current_line += 1; + continue; + } + + if record.len() >= 2 { + let file_name = record[0].to_string(); + let content = record.iter().skip(1).collect::>().join(" "); + let csv_info = AnnoInfo { file_name, content }; + csv_info_vec.push(csv_info); + } + + current_line += 1; + + if end_line != 0 && current_line > end_line { + break; + } + } + + Ok(csv_info_vec) +} + +fn read_json_file_info(file_path: &str, start_line: usize,end_line: usize) -> Result, Box> { + // 省略文件打开和错误处理 + + let file = File::open(file_path)?; + let json_data: Value = serde_json::from_reader(file)?; + let mut json_info_vec: Vec = Vec::new(); + let mut current_line = 1; + + for (_key, value) in json_data.as_object().unwrap() { + if current_line < start_line { + current_line += 1; + continue; + } + + if let Some(json_info) = AnnoInfo::from_json_object(value) { + json_info_vec.push(json_info); + } + + current_line += 1; + + if end_line != 0 && current_line > end_line { + break; + } + } + + Ok(json_info_vec) +} + + +// #[cfg(test)] +// mod tests { +// use super::*; + +// #[test] +// fn test_read_txt_file_info() { +// // Create a temporary file for testing +// use std::fs::File; +// use std::io::Write; + +// // Test the function with start_line=2 and end_line=4 +// let result = read_txt_file_info("D:/Workplace/internship/project/test_mda/anno/list_landmarks_celeba.txt", 3, 0); + +// println!("{:?}",result); +// } +// } diff --git a/mda/src/map/read_from_folders.rs b/mda/src/map/read_from_folders.rs new file mode 100644 index 00000000..22e41fc0 --- /dev/null +++ 
// Used to map the training data and its annotation data.
// Case2: The training data and annotation data are stored in separate folders,
// with the training data files having the same filenames (stems) as the
// annotation data files.

use std::collections::HashMap;
use std::fs;
use std::path::PathBuf;

/// Get all regular files directly inside `folder_path` (non-recursive);
/// sub-directories and unreadable entries are skipped.
pub fn get_files_in_folder(folder_path: &str) -> Vec<PathBuf> {
    fs::read_dir(folder_path)
        .expect("Failed to read folder contents")
        .filter_map(Result::ok)
        .filter(|entry| entry.path().is_file())
        .map(|entry| entry.path())
        .collect()
}

/// Map each file of `a_files` to the file of `b_files` with the same file
/// stem. Files without a match are omitted.
///
/// PERF FIX: the original scanned `b_files` once per `a_file` (O(n*m)); this
/// indexes `b_files` by stem once, making the pairing O(n + m). When several
/// `b_files` share a stem the first one wins, matching the original's
/// first-match scan order. Paths without a stem are skipped instead of
/// panicking.
pub fn combine_files(a_files: Vec<PathBuf>, b_files: Vec<PathBuf>) -> HashMap<PathBuf, PathBuf> {
    // Index annotation files by stem; first occurrence per stem wins.
    let mut by_stem = HashMap::new();
    for b_file in &b_files {
        if let Some(stem) = b_file.file_stem() {
            by_stem.entry(stem).or_insert(b_file);
        }
    }

    let mut file_combinations = HashMap::new();
    for a_file in a_files {
        let matched = a_file
            .file_stem()
            .and_then(|stem| by_stem.get(stem))
            .map(|b_file| (*b_file).clone());
        if let Some(b_file) = matched {
            file_combinations.insert(a_file, b_file);
        }
    }

    file_combinations
}

// ===== file: mda/src/map_data.rs =====
// NOTE(review): map_data.rs below duplicates map/read_from_folders.rs verbatim
// and is not declared in lib.rs — consider deleting one of the two copies.
// Used to map the training data and its annotation data.
// Case2: The training data and annotation data are stored in separate folders,
// with the training data files having the same filenames as the annotation
// data files.
+ +use std::collections::HashMap; +use std::fs; +use std::path::PathBuf; + +/// Get files in the folder +pub fn get_files_in_folder(folder_path: &str) -> Vec { + fs::read_dir(folder_path) + .expect("Failed to read folder contents") + .filter_map(Result::ok) + .filter(|entry| entry.path().is_file()) + .map(|entry| entry.path()) + .collect() +} + +/// Map files in different folder +pub fn combine_files(a_files: Vec, b_files: Vec) -> HashMap { + let mut file_combinations = HashMap::new(); + + for a_file in a_files { + let a_file_stem = a_file.file_stem().unwrap(); + for b_file in &b_files { + let b_file_stem = b_file.file_stem().unwrap(); + if a_file_stem == b_file_stem { + file_combinations.insert(a_file.clone(), b_file.clone()); + break; + } + } + } + + file_combinations +} diff --git a/mda/src/mda_operations/extract.rs b/mda/src/mda_operations/extract.rs new file mode 100644 index 00000000..a248e861 --- /dev/null +++ b/mda/src/mda_operations/extract.rs @@ -0,0 +1,620 @@ +use crate::generate::RevAnnoWithID; +use crate::{ + extract_file_name, find_nearest_multiple_of_snapshot_base, get_full_data, message, + print_rev_anno_headers, save_audio_to_file, save_image_to_file, save_text_to_file, + save_video_to_file, write_strings_to_file, DataType, MDAHeader , MDAIndex, + RevAnno, RevAnnoEntry, RevAnnoHeader, +}; +use anyhow::Result; +use rayon::iter::IntoParallelRefIterator; +use rayon::iter::ParallelIterator; +use std::error::Error; +use std::fs; +use std::fs::File; +use std::io::{BufReader, Read, Seek, SeekFrom}; +use std::process; +use indicatif::ProgressBar; + +/// Read data from an MDA file. 
+pub fn read_info_from_mda(file_path: &str) -> Result<(MDAIndex, MDAHeader), Box> { + let file = File::open(file_path)?; + let mut reader = BufReader::new(file); + let index: MDAIndex = bincode::deserialize_from(&mut reader)?; + reader.seek(SeekFrom::Start(index.header_offset))?; + let header: MDAHeader = bincode::deserialize_from(&mut reader)?; + Ok((index, header)) +} +/// Get anno groups +pub fn read_anno_groups_from_mda(file_path: &str) -> Result, Box> { + let file = File::open(file_path)?; + let mut reader = BufReader::new(file); + let index: MDAIndex = bincode::deserialize_from(&mut reader)?; + let mut anno_groups = Vec::new(); + for item in index.annotations_offset { + anno_groups.push(item.clone().id); + } + Ok(anno_groups) +} + +// Read annotations versions from an MDA file +pub fn read_anno_from_mda(file_path: &str, group: &str, rev: i32) -> Result<(), Box> { + let rev_anno = match config_rev_anno_from_mda(file_path, group, rev) { + Ok(rev_anno) => rev_anno, + Err(err) => { + println!("Read Version Fail = {:?}", err); + process::exit(1); + } + }; + println!("Data Version for {:?}, anno group: {:?}", file_path, group); + print_rev_anno_headers(&rev_anno.headers); + + Ok(()) +} + +/// Extract training data from mda +fn extract_train_from_mda( + mda_path: &str, + training_data_path: &str, +) -> Result> { + let file = File::open(mda_path)?; + let mut reader = BufReader::new(file); + let index: MDAIndex = bincode::deserialize_from(&mut reader)?; + + reader.seek(SeekFrom::Start(index.train_data_offset))?; + let data_type: DataType = bincode::deserialize_from(&mut reader)?; + match data_type { + DataType::Text => { + let text: String = bincode::deserialize_from(&mut reader)?; + + save_text_to_file(&text, training_data_path)?; + } + DataType::Image => { + let image_data: Vec = bincode::deserialize_from(&mut reader)?; + + save_image_to_file(&image_data, training_data_path)?; + } + DataType::Video => { + let video_data: Vec = bincode::deserialize_from(&mut 
reader)?;

            save_video_to_file(&video_data, training_data_path)?;
        }
        DataType::Audio => {
            // NOTE(review): generic parameter restored; the mangled patch showed
            // `Vec` — `Vec<u8>` matches the byte-oriented save helpers.
            let audio_data: Vec<u8> = bincode::deserialize_from(&mut reader)?;

            save_audio_to_file(&audio_data, training_data_path)?;
        }
    };

    Ok(data_type)
}

/// Read the revision-annotation history (`RevAnno`) for one annotation `group`
/// out of an MDA file.
///
/// * `rev == -1` selects the latest revision.
/// * `rev >= 0` loads the chain of headers/entries needed to reconstruct that
///   revision, starting from the nearest snapshot when one exists.
///
/// # Errors
/// Returns any I/O or `bincode` (de)serialization error.
#[allow(unused_assignments)]
pub fn config_rev_anno_from_mda(
    file_path: &str,
    group: &str,
    rev: i32,
) -> Result<RevAnno, Box<dyn Error>> {
    let mut rev = rev;

    let file = File::open(file_path)?;
    let mut reader = BufReader::new(file);

    // The MDAIndex at the front of the file records header/entry offsets for
    // every annotation group.
    let index: MDAIndex = bincode::deserialize_from(&mut reader)?;

    let mut anno_headers_offset = 0;
    let mut anno_entries_offset = 0;
    // 0 means "this group's headers run to end-of-file" (no following group).
    let mut next_anno_entries_offset = 0;
    if index.annotations_offset.len() == 1 {
        anno_headers_offset = index.annotations_offset[0].header_offset;
        anno_entries_offset = index.annotations_offset[0].entries_offset;
        next_anno_entries_offset = 0;
    } else {
        for (counter, item) in index.annotations_offset.iter().enumerate() {
            if item.id == group {
                anno_headers_offset = item.header_offset;
                anno_entries_offset = item.entries_offset;
                if counter == index.annotations_offset.len() - 1 {
                    next_anno_entries_offset = 0;
                } else {
                    // Headers of this group end where the next group's entries begin.
                    next_anno_entries_offset =
                        index.annotations_offset[counter + 1].entries_offset;
                }
                break;
            }
        }
    }

    reader.seek(SeekFrom::Start(anno_headers_offset))?;

    // Read the raw header table once; individual headers are deserialized out
    // of this buffer below.
    let mut header_bytes = Vec::new();
    reader.read_to_end(&mut header_bytes)?;

    let mut headers: Vec<RevAnnoHeader> = Vec::new();
    let entries_bytes = Vec::new();
    let mut rev_anno: RevAnno = RevAnno::new(headers.clone(), entries_bytes);

    // Byte position of the header currently being decoded, used to stop at the
    // next group's region.
    let mut current_position = anno_headers_offset;

    if rev == -1 {
        // Latest revision: collect every header belonging to this group.
        let mut offset = 0;
        if next_anno_entries_offset == 0 {
            while offset < header_bytes.len() {
                let header: RevAnnoHeader = bincode::deserialize(&header_bytes[offset..])?;
                headers.push(header.clone());
                offset += bincode::serialized_size(&header)? as usize;
            }
        } else {
            while offset < header_bytes.len() && current_position < next_anno_entries_offset {
                let header: RevAnnoHeader = bincode::deserialize(&header_bytes[offset..])?;
                headers.push(header.clone());
                offset += bincode::serialized_size(&header)? as usize;
                current_position += bincode::serialized_size(&header)?;
            }
        }

        // -1 means "last revision"; resolve it now. (The original re-checked
        // `rev == -1` here and truncated the vector twice — both redundant.)
        rev = (headers.len() - 1) as i32;
        let header_number = rev + 1;
        let headers: Vec<RevAnnoHeader> =
            headers.into_iter().take(header_number as usize).collect();

        reader.seek(SeekFrom::Start(anno_entries_offset))?;
        let mut entries_bytes = Vec::new();
        reader.read_to_end(&mut entries_bytes)?;

        // Each header records the serialized length of its entry, so entries
        // are sliced sequentially out of the buffer.
        let mut entries: Vec<RevAnnoEntry> = Vec::new();
        let mut offset = 0;
        for rev_anno_header in &headers {
            let entry_bytes = &entries_bytes[offset..(offset + rev_anno_header.length as usize)];
            let entry: RevAnnoEntry = bincode::deserialize(entry_bytes)?;
            entries.push(entry);
            offset += rev_anno_header.length as usize;
        }

        rev_anno = RevAnno::new(headers, entries);
    } else {
        // Explicit revision: decide whether a snapshot precedes it.
        match find_nearest_multiple_of_snapshot_base(rev) {
            Some(nearest_rev) => {
                // Keep only headers in [nearest snapshot, requested rev].
                if next_anno_entries_offset == 0 {
                    let mut offset = 0;
                    while offset < header_bytes.len() {
                        let header: RevAnnoHeader =
                            bincode::deserialize(&header_bytes[offset..])?;
                        if (header.rev >= nearest_rev) && (header.rev <= rev) {
                            headers.push(header.clone());
                        }
                        offset += bincode::serialized_size(&header)? as usize;
                    }
                } else {
                    let mut offset = 0;
                    while offset < header_bytes.len()
                        && current_position < next_anno_entries_offset
                    {
                        let header: RevAnnoHeader =
                            bincode::deserialize(&header_bytes[offset..])?;
                        if (header.rev >= nearest_rev) && (header.rev <= rev) {
                            headers.push(header.clone());
                        }
                        offset += bincode::serialized_size(&header)? as usize;
                        current_position += bincode::serialized_size(&header)?;
                    }
                }

                // Entries are read starting from the snapshot's own offset.
                reader.seek(SeekFrom::Start(headers[0].offset))?;

                let mut entries_bytes = Vec::new();
                reader.read_to_end(&mut entries_bytes)?;

                let mut entries: Vec<RevAnnoEntry> = Vec::new();
                let mut offset = 0;
                for rev_anno_header in &headers {
                    let entry_bytes =
                        &entries_bytes[offset..(offset + rev_anno_header.length as usize)];
                    let entry: RevAnnoEntry = bincode::deserialize(entry_bytes)?;
                    entries.push(entry);
                    offset += rev_anno_header.length as usize;
                }

                rev_anno = RevAnno::new(headers, entries);
            }
            None => {
                if rev == 0 {
                    // Revision 0 is always the first (snapshot) header/entry pair.
                    let offset = 0;
                    let header: RevAnnoHeader = bincode::deserialize(&header_bytes[offset..])?;
                    headers.push(header.clone());

                    reader.seek(SeekFrom::Start(header.offset))?;
                    let mut entries_bytes = Vec::new();
                    reader.read_to_end(&mut entries_bytes)?;
                    let mut entries: Vec<RevAnnoEntry> = Vec::new();
                    let entry_bytes = &entries_bytes[0..(header.length as usize)];
                    let entry: RevAnnoEntry = bincode::deserialize(entry_bytes)?;
                    entries.push(entry);

                    rev_anno = RevAnno::new(headers, entries);
                } else {
                    // No snapshot before `rev`: take every header up to it.
                    if next_anno_entries_offset == 0 {
                        let mut offset = 0;
                        while offset < header_bytes.len() {
                            let header: RevAnnoHeader =
                                bincode::deserialize(&header_bytes[offset..])?;
                            if header.rev <= rev {
                                headers.push(header.clone());
                            }
                            offset += bincode::serialized_size(&header)? as usize;
                        }
                    } else {
                        let mut offset = 0;
                        while offset < header_bytes.len()
                            && current_position < next_anno_entries_offset
                        {
                            let header: RevAnnoHeader =
                                bincode::deserialize(&header_bytes[offset..])?;
                            if header.rev <= rev {
                                headers.push(header.clone());
                            }
                            offset += bincode::serialized_size(&header)? as usize;
                            current_position += bincode::serialized_size(&header)?;
                        }
                    }

                    reader.seek(SeekFrom::Start(headers[0].offset))?;

                    let mut entries_bytes = Vec::new();
                    reader.read_to_end(&mut entries_bytes)?;

                    let mut entries: Vec<RevAnnoEntry> = Vec::new();
                    let mut offset = 0;
                    for rev_anno_header in &headers {
                        let entry_bytes =
                            &entries_bytes[offset..(offset + rev_anno_header.length as usize)];
                        let entry: RevAnnoEntry = bincode::deserialize(entry_bytes)?;
                        entries.push(entry);
                        offset += rev_anno_header.length as usize;
                    }

                    rev_anno = RevAnno::new(headers, entries);
                }
            }
        }
    }
    Ok(rev_anno)
}

/// Extract the training data and the annotation data of one MDA file into
/// `training_data_path` / `anno_data_path`.
pub fn extract_mda(
    mda_path: &str,
    training_data_path: &str,
    anno_data_path: &str,
    rev: i32,
    format: &str,
    group: &str,
) -> Result<(), Box<dyn Error>> {
    let pb = ProgressBar::new(1);

    let train_data = training_data_path.to_string() + &extract_file_name(mda_path);
    let anno_data: String = anno_data_path.to_string() + &extract_file_name(mda_path);
    match extract_data_from_mda(mda_path, &train_data, &anno_data, rev, format, group) {
        Ok(_) => {
            pb.inc(1);
            pb.finish_with_message("done");
        }
        Err(e) => {
            eprintln!("Extract Error:{:?}", e);
        }
    }

    Ok(())
}

/// Extract training and annotation data from every MDA file inside the
/// `mda_path` directory, using a rayon thread pool of `threads` workers.
pub fn extract_mda_more(
    mda_path: &str,
    training_data_path: &str,
    anno_data_path: &str,
    rev: i32,
    format: &str,
    group: &str,
    threads: usize,
) -> Result<(), Box<dyn Error>> {
    // Collect the regular files in the directory (subdirectories are skipped).
    let entries = fs::read_dir(mda_path)?;
    let entries_vec: Vec<_> = entries.collect();
    let length = entries_vec.len();

    let mut paths = Vec::new();
    for entry in entries_vec {
        let entry = entry?;
        let file_path = entry.path();

        if file_path.is_file() {
            paths.push(file_path.to_string_lossy().to_string());
        }
    }
    // NOTE(review): debug print kept to preserve observable behavior.
    println!("{:?}", paths);
    let pb = ProgressBar::new(length.try_into().unwrap());

    // Use a bounded thread pool to extract the files in parallel.
    let pool = rayon::ThreadPoolBuilder::new()
        .num_threads(threads)
        .build()
        .unwrap();

    pool.install(|| {
        paths.par_iter().for_each(|path| {
            let train_data = training_data_path.to_string() + &extract_file_name(path);
            let anno_data: String = anno_data_path.to_string() + &extract_file_name(path);
            match extract_data_from_mda(path, &train_data, &anno_data, rev, format, group) {
                Ok(_) => {
                    pb.inc(1);
                }
                Err(err) => {
                    eprintln!(
                        "\x1b[31m[ERROR]{}: {} {}\x1b[0m",
                        path,
                        message::GENERATE_MSG,
                        err
                    );
                }
            }
        });
    });

    pb.finish_with_message("done");

    Ok(())
}
/// Extract data from an MDA file.
+pub fn extract_data_from_mda( + mda_path: &str, + training_data_path: &str, + anno_data_path: &str, + rev: i32, + format: &str, + group: &str, +) -> Result> { + + + + let _ = extract_anno_from_mda(mda_path, anno_data_path, rev, format, group); + extract_train_from_mda(mda_path, training_data_path) +} + +/// Extract anno data from mda +#[allow(unused_assignments)] +fn extract_anno_from_mda( + file_path: &str, + anno_data_path: &str, + rev: i32, + format: &str, + group: &str, +) -> Result<(), Box> { + let rev_anno = match config_rev_anno_from_mda(file_path, group, rev) { + Ok(rev_anno) => rev_anno, + Err(err) => { + println!("error={:?}", err); + process::exit(1); + } + }; + + let mut full_data: String = String::new(); + if rev == -1 { + let rev1 = rev_anno.entries.len() - 1; + full_data = get_full_data(rev1 as i32, rev_anno.entries); + } else { + full_data = get_full_data(rev, rev_anno.entries); + } + + let strings: Vec = vec![full_data.to_string()]; + + write_strings_to_file(&strings, anno_data_path, format)?; + Ok(()) +} +/// extract train and anno from mda, with the targeted group +#[allow(unused_assignments)] +pub fn get_all_rev_anno_with_id( + file_path: &str, + _rev: i32, +) -> Result, Box> { + + let file = File::open(file_path)?; + let mut reader = BufReader::new(file); + + // Deserialize the MDAIndex structure from the file, which contains offsets for headers and entries + let index: MDAIndex = bincode::deserialize_from(&mut reader)?; + + + let mut anno_data: Vec = Vec::new(); + + if index.annotations_offset.len() == 1 { + + let mut anno_headers_offset = 0; + let id = &index.annotations_offset[0].id; + anno_headers_offset = index.annotations_offset[0].header_offset; + let anno_entries_offset = index.annotations_offset[0].entries_offset; + + reader.seek(SeekFrom::Start(anno_headers_offset))?; + + // read data + // Read the bytes data of the header information + let mut header_bytes = Vec::new(); + reader.read_to_end(&mut header_bytes)?; + + // + + let mut 
headers: Vec = Vec::new(); + let entries_bytes = Vec::new(); + let mut rev_anno: RevAnno = RevAnno::new(headers.clone(), entries_bytes); + + let mut offset = 0; + while offset < header_bytes.len() { + let header: RevAnnoHeader = bincode::deserialize(&header_bytes[offset..])?; + headers.push(header.clone()); + offset += bincode::serialized_size(&header)? as usize; + } + + reader.seek(SeekFrom::Start(anno_entries_offset))?; + + let mut entries_bytes = Vec::new(); + reader.read_to_end(&mut entries_bytes)?; + let mut entries: Vec = Vec::new(); + let mut offset = 0; + for rev_anno_header in &headers { + let entry_bytes = &&entries_bytes[offset..(offset + rev_anno_header.length as usize)]; + let entry: RevAnnoEntry = bincode::deserialize(entry_bytes)?; + entries.push(entry); + + offset += rev_anno_header.length as usize; + } + + rev_anno = RevAnno::new(headers, entries); + + let rev_anno_with_id = RevAnnoWithID { + id: id.to_string(), + rev_anno + }; + anno_data.push(rev_anno_with_id); + } else { + for (counter, item) in index.annotations_offset.iter().enumerate() { + if counter == index.annotations_offset.len() - 1 { + let id = item.clone().id; + let anno_headers_offset = item.clone().header_offset; + let tmp_anno_entries_offset = item.clone().entries_offset; + + // get headers + reader.seek(SeekFrom::Start(anno_headers_offset))?; + let mut header_bytes = Vec::new(); + reader.read_to_end(&mut header_bytes)?; + + let mut headers: Vec = Vec::new(); + let entries_bytes = Vec::new(); + let mut rev_anno: RevAnno = RevAnno::new(headers.clone(), entries_bytes); + + let mut offset = 0; + while offset < header_bytes.len() { + let header: RevAnnoHeader = bincode::deserialize(&header_bytes[offset..])?; + headers.push(header.clone()); + offset += bincode::serialized_size(&header)? 
as usize; + } + + reader.seek(SeekFrom::Start(tmp_anno_entries_offset))?; + + let mut entries_bytes = Vec::new(); + reader.read_to_end(&mut entries_bytes)?; + let mut entries: Vec = Vec::new(); + let mut offset = 0; + for rev_anno_header in &headers { + let entry_bytes = + &&entries_bytes[offset..(offset + rev_anno_header.length as usize)]; + let entry: RevAnnoEntry = bincode::deserialize(entry_bytes)?; + entries.push(entry); + + offset += rev_anno_header.length as usize; + } + + rev_anno = RevAnno::new(headers, entries); + + let rev_anno_with_id = RevAnnoWithID { + id: id.to_string(), + rev_anno + }; + anno_data.push(rev_anno_with_id); + } else { + let id = item.clone().id; + let anno_headers_offset = item.clone().header_offset; + let anno_entries_offset = item.clone().entries_offset; + let next_anno_entries_offset = + index.annotations_offset[counter + 1].clone().entries_offset; + // get headers + reader.seek(SeekFrom::Start(anno_headers_offset))?; + let mut header_bytes = Vec::new(); + reader.read_to_end(&mut header_bytes)?; + + // read headers + let mut headers: Vec = Vec::new(); + let mut current_position = anno_headers_offset; + let mut offset = 0; + while offset < header_bytes.len() && current_position < next_anno_entries_offset { + let header: RevAnnoHeader = bincode::deserialize(&header_bytes[offset..])?; + headers.push(header.clone()); + + offset += bincode::serialized_size(&header)? 
as usize; + current_position += bincode::serialized_size(&header)?; + } + + // get entries + reader.seek(SeekFrom::Start(anno_entries_offset))?; + + let mut entries_bytes = Vec::new(); + reader.read_to_end(&mut entries_bytes)?; + let mut entries: Vec = Vec::new(); + let mut offset = 0; + + for rev_anno_header in &headers { + let entry_bytes = + &&entries_bytes[offset..(offset + rev_anno_header.length as usize)]; + let entry: RevAnnoEntry = bincode::deserialize(entry_bytes)?; + entries.push(entry); + + offset += rev_anno_header.length as usize; + } + + let rev_anno = RevAnno::new(headers, entries); + let rev_anno_with_id = RevAnnoWithID { + id: id.to_string(), + rev_anno + }; + anno_data.push(rev_anno_with_id); + } + } + } + + Ok(anno_data) +} diff --git a/mda/src/mda_operations/generate.rs b/mda/src/mda_operations/generate.rs new file mode 100644 index 00000000..4365aa51 --- /dev/null +++ b/mda/src/mda_operations/generate.rs @@ -0,0 +1,685 @@ +use crate::read_from_file::{get_train_path_and_anno_content, AnnoInfo}; +use crate::read_from_folders::{combine_files, get_files_in_folder}; +use crate::run_mda::MDAOptions; +use crate::{ + extract_audio_metadata, extract_filename_change_extension, extract_image_metadata, + extract_text_metadata, extract_video_info, get_anno_config, get_file_type, message, AnnoOffset, + AudioMetaData, DataType, ImageMetaData, MDAHeader, MDAIndex, RevAnno, + TextMetaData, TrainData, TrainingData, VideoMetaData, +}; +use anyhow::Result; +use bincode::serialize_into; +use indicatif::ProgressBar; +use rayon::iter::IntoParallelRefIterator; +use rayon::iter::ParallelIterator; + +use std::collections::HashMap; +use std::error::Error; +use std::fs; +use std::fs::File; +use std::io::{Read, Seek, SeekFrom, Write}; +use std::path::Path; +// Return the files in the targeted directory +pub fn list_files_in_directory(directory: &str) -> Result, std::io::Error> { + let dir_path = Path::new(directory); + let mut file_paths = Vec::new(); + + if 
dir_path.is_dir() { + for entry in fs::read_dir(dir_path)? { + let entry = entry?; + let file_path = entry.path(); + + if file_path.is_file() { + if let Some(file_name) = file_path.file_name() { + if let Some(file_name_str) = file_name.to_str() { + file_paths.push(file_name_str.to_string()); + } + } + } + } + } + + Ok(file_paths) +} +/// extract last part from a path +fn extract_last_folder_name(path: &str) -> Option<&str> { + let path = std::path::Path::new(path); + if let Some(last_component) = path.components().next_back() { + if let Some(folder_name) = last_component.as_os_str().to_str() { + return Some(folder_name); + } + } + None +} +/// get the file extension of the file +fn get_first_file_extension(folder_path: &str) -> Result, std::io::Error> { + let dir_path = Path::new(folder_path); + if dir_path.is_dir() { + for entry in fs::read_dir(dir_path)? { + let entry = entry?; + let file_path = entry.path(); + + if file_path.is_file() { + if let Some(extension) = file_path.extension() { + return Ok(Some(extension.to_str().unwrap().to_string())); + } + } + } + } + + Ok(None) +} +/// config the completed path +fn generate_final_path(train: &str, annos: &str, extension: &str) -> String { + let train_path = Path::new(train); + let annos_path = Path::new(annos); + + if let Some(file_name) = annos_path.file_stem() { + let new_file_name = format!("{}.{}", file_name.to_string_lossy(), extension); + let final_path = train_path.join(new_file_name); + final_path.to_string_lossy().into_owned() + } else { + panic!("Unable to generate final path."); + } +} +// Generate mda files, case: 1 to many in directory +pub fn generate_mda_separate_annotations_one_to_many_in_folder( + training_data: &str, + annotation_group: &str, + output: &str, + config: &MDAOptions, +) -> Result> { + // get folders + let folders: Vec<&str> = annotation_group.split(',').collect(); + + // get extension + let extension = match get_first_file_extension(training_data) { + Ok(extension) => match extension 
{ + Some(data) => data, + None => "NONE".to_string(), + }, + Err(err) => { + eprintln!("Error: {}", err); + "NONE".to_string() + } + }; + + // config train and anno group + let mut train_map_anno: Vec = Vec::new(); + for folder in folders { + + let id=extract_last_folder_name(folder).unwrap_or("NONE"); + let mut anno_groups: Vec = Vec::new(); + match list_files_in_directory(folder) { + Ok(file_paths) => { + if file_paths.is_empty() { + println!("No files found in the directory."); + } else { + for file_path in file_paths { + let item = folder.to_owned() + &file_path; + // get content + let mut file: File = File::open(item.clone())?; + let mut anno_data = String::new(); + file.read_to_string(&mut anno_data)?; + + let file_name = generate_final_path(training_data, &file_path, &extension); + let anno_info = AnnoInfo { + file_name , + content: anno_data, + }; + + anno_groups.push(anno_info); + } + } + } + Err(err) => { + eprintln!("Error: {}", err); + } + } + + let anno_item = TrainMapAnno { + id: id.to_string(), + data: anno_groups, + }; + + train_map_anno.push(anno_item); + } + let mut anno_groups: Vec = Vec::new(); + for (_index, item) in train_map_anno.iter().enumerate() { + let id = item.id.clone(); + let data = &item.data; + + for tmp in data { + let anno_for_single = AnnoItem { + id: id.clone(), + content: tmp.clone().content, + }; + let anno = Annotation { + file_name: tmp.clone().file_name, + groups: vec![anno_for_single], + }; + anno_groups.push(anno); + } + } + let anno_groups = merge_annos(anno_groups); + + let pb = ProgressBar::new(anno_groups.len() as u64); + // generate + let pool = rayon::ThreadPoolBuilder::new() + .num_threads(config.threads.unwrap_or(10)) + .build() + .unwrap(); + + pool.install(|| { + anno_groups.par_iter().for_each(|item| { + match config_mda_content(&item.file_name, item.groups.clone(), output, config) { + Ok(_) => { + pb.inc(1); + } + Err(err) => { + println!("Fail to generate {:?} {:?}", item.file_name, err); + } + } + }); + }); 
+ + pb.finish_with_message("done"); + Ok(anno_groups.len()) +} + +/// Generate mda files, case: 1 to many file +pub fn generate_mda_separate_annotations_one_to_many( + training_data: &str, + annotation_group: &str, + output: &str, + config: &MDAOptions, +) -> Result<(), Box> { + let pb = ProgressBar::new(1); + + // get paths + let paths: Vec<&str> = annotation_group.split(',').collect(); + let mut anno_groups: Vec = Vec::new(); + for path in paths { + // 1. read file content + let mut file: File = File::open(path)?; + let mut anno_data = String::new(); + file.read_to_string(&mut anno_data)?; + // 2. config + let anno_item = AnnoItem { + id: path.to_string(), + content: anno_data, + }; + anno_groups.push(anno_item); + } + config_mda_content(training_data, anno_groups, output, config)?; + pb.inc(1); + pb.finish_with_message("done"); + Ok(()) +} + +/// Get training data +pub fn config_training_data(file_path: &str) -> Result { + let path = Path::new(file_path); + let mut file = File::open(path).map_err(|e| format!("Error opening file: {}", e))?; + + let mut buffer = Vec::new(); + file.read_to_end(&mut buffer) + .map_err(|e| format!("Error reading file: {}", e))?; + + let file_extension = path.extension().and_then(|ext| ext.to_str()); + match file_extension { + Some("txt") => Ok(TrainingData::Text( + String::from_utf8_lossy(&buffer).to_string(), + )), + Some("jpg") | Some("jpeg") | Some("png") => Ok(TrainingData::Image(buffer)), + Some("mp4") | Some("avi") => Ok(TrainingData::Video(buffer)), + Some("wav") | Some("mp3") => Ok(TrainingData::Audio(buffer)), + _ => Err(String::from("Unsupported file type")), + } +} + +/// Get annotation data +pub fn config_annotation_data_by_content(content: &str) -> Result> { + Ok(RevAnno::set_initial_element(content)) +} + + +/// Extract metadata from training data +pub fn process_file(file_path: &str) -> Option> { + if file_path.ends_with(".jpg") || file_path.ends_with(".png") { + let image_metadata = extract_image_metadata(file_path); 
+ Some(Box::new(image_metadata) as Box) + } else if file_path.ends_with(".mp4") || file_path.ends_with(".avi") { + match extract_video_info(file_path) { + Some(info) => Some(Box::new(info) as Box), + None => None, + } + } else if file_path.ends_with(".mp3") || file_path.ends_with(".wav") { + match extract_audio_metadata(file_path) { + Ok(audio_metadata) => return Some(Box::new(audio_metadata) as Box), + Err(err) => { + eprintln!("Error: {}", err); + None + } + } + } else if file_path.ends_with(".txt") || file_path.ends_with(".docx") { + let text_metadata = extract_text_metadata(file_path); + Some(Box::new(text_metadata) as Box) + } else { + None + } +} +/// Map training data and anno data +#[derive(Debug, Clone)] +pub struct TrainMapAnno { + /// anno group + pub id: String, + /// anno content + pub data: Vec, +} +#[derive(Debug, Clone)] +/// Record anno group and content +pub struct AnnoItem { + pub id: String, + pub content: String, +} + +/// Record all the anno group +#[derive(Debug, Clone)] +pub struct Annotation { + pub file_name: String, + pub groups: Vec, +} + +/// Generate mda files for combine case +pub fn generate_mda_combined_annotations( + training_data: &str, + anno_config: &str, + output: &str, + config: &MDAOptions, +) -> Result> { + // config the annotation info + let anno_config = get_anno_config(anno_config); + + // map training data, anno data + let mut train_map_anno: Vec = Vec::new(); + for item in &anno_config.annotation { + let map = get_train_path_and_anno_content(&item.path, item.start, item.end); + let temp = TrainMapAnno { + id: item.id.clone(), + data: map, + }; + train_map_anno.push(temp); + } + + // group the train and anno + let mut anno_groups: Vec = Vec::new(); + for (_index, item) in train_map_anno.iter().enumerate() { + let id = item.id.clone(); + let data = &item.data; + + for tmp in data { + let anno_for_single = AnnoItem { + id: id.clone(), + content: tmp.clone().content, + }; + let anno = Annotation { + file_name: 
tmp.clone().file_name, + groups: vec![anno_for_single], + }; + anno_groups.push(anno); + } + } + let mut anno_groups = merge_annos(anno_groups); + + // assign training data file name + for item in &mut anno_groups { + item.file_name = training_data.to_owned() + &item.file_name.to_string(); + } + + + // use pool to generate mda files + let pb = ProgressBar::new(anno_groups.len() as u64); + let pool = rayon::ThreadPoolBuilder::new() + .num_threads(config.threads.unwrap_or(10)) + .build() + .unwrap(); + + pool.install(|| { + anno_groups.par_iter().for_each(|item| { + match config_mda_content(&item.file_name, item.groups.clone(), output, config) { + Ok(_) => { + pb.inc(1); + } + Err(err) => { + eprintln!("Fail to generate {:?}! Error: {:?}", item.file_name, err); + } + } + }); + }); + + pb.finish_with_message("done"); + Ok(anno_groups.len()) +} + +/// Generate MDA file by content +pub fn config_mda_content( + training_data: &str, + anno_groups: Vec, + output: &str, + config: &MDAOptions, +) -> Result<(), Box> { + // config outpath + let filename = extract_filename_change_extension(training_data); + let output_path = output.to_owned() + filename; + + // MDAOptions MDAHeader Begin + // MDAOptions MDAHeader -- config metadata + let metadata = process_file(training_data) + .ok_or(training_data.clone().to_owned() + "Failed to extract metadata!")?; + + let meta: String; + if let Some(image_metadata) = metadata.downcast_ref::() { + meta = format!("{:?}", image_metadata); + } else if let Some(text_metadata) = metadata.downcast_ref::() { + meta = format!("{:?}", text_metadata); + } else if let Some(audio_metadata) = metadata.downcast_ref::() { + meta = format!("{:?}", audio_metadata); + } else if let Some(video_metadata) = metadata.downcast_ref::() { + meta = format!("{:?}", video_metadata); + } else { + return Err("Unknown metadata type".into()); + } + // MDAOptions MDAHeader -- config tags + let tags = match &config.tags { + Some(tags) => tags.split(',').map(|s| 
s.trim().to_string()).collect(), + None => vec![], + }; + + let file_type = match get_file_type(training_data) { + Some(file_type) => file_type, + None => { + println!("Unknown file type"); + std::process::exit(0); + } + }; + + let header = MDAHeader { + tags, + train_data: TrainData { + data_type: file_type.to_string(), + metadata: meta.to_string(), + }, + }; + // MDAOptions MDAHeader finish + + // MDAOptions Training Data + let train_data = match config_training_data(training_data) { + Ok(data) => data, + Err(error) => { + eprintln!("Fail to load training data {}", error); + std::process::exit(0); + } + }; + + // MDAOptions Annotation data + let mut rev_anno_ids: Vec = Vec::new(); + + for item in anno_groups { + let rev_anno = match config_annotation_data_by_content(&item.content) { + Ok(rev_anno) => rev_anno, + Err(err) => { + eprintln!("Fail to load annotation data {}", err); + std::process::exit(0); + } + }; + let temp = RevAnnoWithID { + id: item.id, + rev_anno, + }; + rev_anno_ids.push(temp); + } + + //Write data into mda file + write_mda_data(&output_path, header, train_data, &mut rev_anno_ids)?; + Ok(()) +} + + +/// Write data into mda files +pub fn write_mda_data( + file_path: &str, + header: MDAHeader, + train_data: TrainingData, + rev_anno_ids: &mut [RevAnnoWithID], +) -> Result<(), Box> { + let rev_anno_ids_clone = rev_anno_ids.to_owned(); + // create file + let mut file = File::create(file_path)?; + + // record the position of the MDAIndex + let index_placeholder_offset = file.stream_position()?; + let mut tmp_anno_offsets: Vec = Vec::new(); + for item in rev_anno_ids_clone { + let tmp = AnnoOffset::new(&item.id); + tmp_anno_offsets.push(tmp); + } + // Write MDAIndex into mda + serialize_into( + &file, + &MDAIndex { + header_offset: 0, + train_data_offset: 0, + annotations_offset: tmp_anno_offsets, + }, + )?; + + // Get the MDAHeader Info and write into mda + let header_offset = file.stream_position()?; + serialize_into(&file, &header)?; + + // Write 
training data + let train_data_offset = file.stream_position()?; + match train_data { + TrainingData::Text(t) => { + serialize_into(&file, &DataType::Text)?; + serialize_into(&file, &t)?; + } + TrainingData::Image(i) => { + serialize_into(&file, &DataType::Image)?; + serialize_into(&file, &i)?; + } + TrainingData::Video(v) => { + serialize_into(&file, &DataType::Video)?; + serialize_into(&file, &v)?; + } + TrainingData::Audio(a) => { + serialize_into(&file, &DataType::Audio)?; + serialize_into(&file, &a)?; + } + }; + + //Config Anno data + let mut tmp_anno_offsets_for_annotations: Vec = Vec::new(); + + for rev_anno_id in rev_anno_ids.iter_mut() { + let mut rev_anno = rev_anno_id.clone().rev_anno; + + let mut anno_entries_offset = file.stream_position()?; + let store_anno_entries_offset = anno_entries_offset; + let mut lengths: Vec = Vec::new(); + for entry in &rev_anno.entries { + let entry_bytes = bincode::serialize(entry)?; + file.write_all(&entry_bytes)?; + lengths.push(entry_bytes.len() as u64); + } + // Record the current offset as the starting position of the headers and update the RevlogIndex + let anno_headers_offset = file.stream_position()?; + + // Write the headers and update their offsets in the vector + for (rev_anno_header, &length) in rev_anno.headers.iter_mut().zip(lengths.iter()) { + rev_anno_header.offset = anno_entries_offset; + rev_anno_header.length = length; + let header_bytes = bincode::serialize(rev_anno_header)?; + file.write_all(&header_bytes)?; + anno_entries_offset += length; + } + + let tmp = AnnoOffset { + id: rev_anno_id.id.clone(), + header_offset: anno_headers_offset, + entries_offset: store_anno_entries_offset, + }; + tmp_anno_offsets_for_annotations.push(tmp); + } + + // Return to MDAIndex and update offset + file.seek(SeekFrom::Start(index_placeholder_offset))?; + + serialize_into( + &file, + &MDAIndex { + header_offset, + train_data_offset, + annotations_offset: tmp_anno_offsets_for_annotations, + }, + )?; + + Ok(()) +} + +/// 
Merge the same group(used to map train and anno) +pub fn merge_annos(annos: Vec) -> Vec { + let mut merged_annos_map: HashMap> = HashMap::new(); + + for anno in &annos { + let file_name = &anno.file_name; + let anno_value = anno.groups.clone(); + + if let Some(existing_annos) = merged_annos_map.get_mut(file_name) { + existing_annos.extend(anno_value); + } else { + merged_annos_map.insert(file_name.clone(), anno_value); + } + } + + let mut merged_annos: Vec = Vec::new(); + + for (file_name, annos) in merged_annos_map.iter() { + merged_annos.push(Annotation { + file_name: file_name.clone(), + groups: annos.clone(), + }); + } + + merged_annos +} + +/// Used to group anno data +#[derive(Debug, Clone)] +pub struct RevAnnoWithID { + pub id: String, + pub rev_anno: RevAnno, +} + +/// Generate mda files, case: 1 to 1 +pub fn generate_mda_separate_annotation_1_1( + training_data: &str, + annotation_data: &str, + output: &str, + config: &MDAOptions, +) -> Result<(), Box> { + + // get anno content + let mut file = File::open(annotation_data)?; + let mut anno_data = String::new(); + file.read_to_string(&mut anno_data)?; + + // config anno id + let id = extract_second_last_part(annotation_data,'/'); + let id = id.unwrap_or("NONE".to_string()); + let anno_item = AnnoItem { + id: id.to_string(), + content: anno_data, + }; + + // config mda + config_mda_content(training_data, vec![anno_item], output, config)?; + + Ok(()) +} + +/// Generate mda files, case: 1 to 1 file +pub fn generate_mda_separate_annotation_one_to_one( + training_data: &str, + annotation_data: &str, + output: &str, + config: &MDAOptions, +) -> Result<(), Box> { + let pb = ProgressBar::new(1); + generate_mda_separate_annotation_1_1(training_data, annotation_data, output, config)?; + pb.inc(1); + pb.finish_with_message("done"); + Ok(()) +} + +/// Generate mda files, case: 1 to 1 in directory +pub fn generate_mda_separate_annotation_one_to_one_in_folder( + train_path: &str, + anno_path: &str, + output: &str, + 
config: &MDAOptions, +) -> Result> { + // map train and anno + let train_files = get_files_in_folder(train_path); + let anno_files = get_files_in_folder(anno_path); + let file_combinations = combine_files(train_files, anno_files); + // set progress bar + let pb = ProgressBar::new(file_combinations.len() as u64); + + // use thread pool to generate files + let pool = rayon::ThreadPoolBuilder::new() + .num_threads(config.threads.unwrap_or(10)) + .build() + .unwrap(); + + pool.install(|| { + file_combinations + .par_iter() + .for_each( + |(train_file, anno_file)| match generate_mda_separate_annotation_1_1( + train_file.to_str().unwrap(), + anno_file.to_str().unwrap(), + output, + config, + ) { + Ok(_) => { + pb.inc(1); + } + Err(err) => { + eprintln!( + "\x1b[31m[ERROR]{}: {} {}\x1b[0m", + train_file.to_str().unwrap(), + message::GENERATE_MSG, + err + ); + } + }, + ); + }); + pb.finish_with_message("done"); + Ok(file_combinations.len()) +} + +/// extract second last part from a path +fn extract_second_last_part(input: &str, delimiter: char) -> Option { + let parts: Vec<&str> = input.split(delimiter).collect(); + + if parts.len() >= 2 { + Some(parts[parts.len() - 2].to_string()) + } else { + None + } +} + + diff --git a/mda/src/mda_operations/mod.rs b/mda/src/mda_operations/mod.rs new file mode 100644 index 00000000..75044a98 --- /dev/null +++ b/mda/src/mda_operations/mod.rs @@ -0,0 +1,3 @@ +pub mod extract; +pub mod generate; +pub mod update; \ No newline at end of file diff --git a/mda/src/mda_operations/update.rs b/mda/src/mda_operations/update.rs new file mode 100644 index 00000000..1e232f78 --- /dev/null +++ b/mda/src/mda_operations/update.rs @@ -0,0 +1,161 @@ +use crate::extract:: get_all_rev_anno_with_id ; +use crate::{extract_file_name, MDAIndex}; +use crate::generate::{ + write_mda_data, AnnoItem, Annotation, RevAnnoWithID, TrainMapAnno, +}; +use crate::read_from_file::get_train_path_and_anno_content; + +use crate::{ DataType, MDAHeader, RevAnno, TrainingData}; 
+use anyhow::Result; +use indicatif::ProgressBar; +use std::error::Error; +use std::fs::File; +use std::io::{BufReader, Read, Seek, SeekFrom}; +use std::process; +/// update anno in combined anno file +pub fn update_anno_in_combined_file( + mda: &str, + anno_data: &str, + // config: MDAOptions + start: usize, + end: usize, + group: &str, +) -> Result> { + let mut mda_anno_map = get_train_path_and_anno_content(anno_data, start, end); + + for item in &mut mda_anno_map { + let extract_name = extract_file_name(&item.file_name) + ".mda"; + let mda_name = mda.to_owned() + &extract_name; + item.file_name = mda_name; + } + let train_map_anno = TrainMapAnno { + id: group.to_string(), + data: mda_anno_map, + }; + // group the train and anno + let mut anno_groups: Vec = Vec::new(); + + let id = train_map_anno.id.clone(); + let data = &train_map_anno.data; + + for tmp in data { + let anno_for_single = AnnoItem { + id: id.clone(), + content: tmp.clone().content, + }; + let anno = Annotation { + file_name: tmp.clone().file_name, + groups: vec![anno_for_single], + }; + anno_groups.push(anno); + } + let pb: ProgressBar = ProgressBar::new(anno_groups.len() as u64); + let mut count=0; + for item in anno_groups { + // get previous data + // Get rev_anno + let mut rev_anno_ids: Vec = + match get_all_rev_anno_with_id(&item.file_name, -1) { + Ok(rev_anno) => rev_anno, + Err(err) => { + println!("Update annotation data fail! 
={:?}", err); + process::exit(1); + } + }; + //update data + for data in &mut rev_anno_ids { + if group == data.id { + let rev_anno = RevAnno::add_element( + &item.groups[0].content, + data.clone().rev_anno.entries, + data.clone().rev_anno.headers, + ); + data.rev_anno = rev_anno; + } + } + update_rev_anno(&item.file_name, &mut rev_anno_ids)?; + pb.inc(1); + count+=1; + + } + pb.finish_with_message("done"); + + Ok(count ) + } +/// Update anno rev_anno and offset in mda +pub fn update_anno_in_mda( + file_path: &str, + anno_data_path: &str, + group: &str, +) -> Result<(), Box> { + // Get rev_anno + let mut rev_anno_ids: Vec = match get_all_rev_anno_with_id(file_path, -1) { + Ok(rev_anno) => rev_anno, + Err(err) => { + println!("Update annotation data fail! ={:?}", err); + process::exit(1); + } + }; + + // // Rewrite new data + let mut anno = File::open(anno_data_path)?; + let mut content = String::new(); + anno.read_to_string(&mut content)?; + + // find the changed data + for item in &mut rev_anno_ids { + if item.id == group { + let rev_anno = RevAnno::add_element( + &content, + item.clone().rev_anno.entries, + item.clone().rev_anno.headers, + ); + item.rev_anno = rev_anno; + break; + } + } + + update_rev_anno(file_path, &mut rev_anno_ids)?; + + Ok(()) +} + +/// Update rev_anno +fn update_rev_anno( + mda_path: &str, + rev_anno: &mut [RevAnnoWithID], +) -> Result<(), Box> { + let file = File::open(mda_path)?; + let mut reader = BufReader::new(file); + let index: MDAIndex = bincode::deserialize_from(&mut reader)?; + reader.seek(SeekFrom::Start(index.header_offset))?; + + let header: MDAHeader = bincode::deserialize_from(&mut reader)?; + + reader.seek(SeekFrom::Start(index.train_data_offset))?; + + let data_type: DataType = bincode::deserialize_from(&mut reader)?; + + let train_data: TrainingData = match data_type { + DataType::Text => { + let text: String = bincode::deserialize_from(&mut reader)?; + TrainingData::Text(text.clone()) + } + DataType::Image => { + let 
image_data: Vec = bincode::deserialize_from(&mut reader)?; + TrainingData::Image(image_data.clone()) + } + DataType::Video => { + let video_data: Vec = bincode::deserialize_from(&mut reader)?; + TrainingData::Video(video_data.clone()) + } + DataType::Audio => { + let audio_data: Vec = bincode::deserialize_from(&mut reader)?; + TrainingData::Audio(audio_data.clone()) + } + }; + + let _ = write_mda_data(mda_path, header, train_data, rev_anno); + + Ok(()) +} diff --git a/mda/src/rev_anno.rs b/mda/src/rev_anno.rs new file mode 100644 index 00000000..31a6d5d5 --- /dev/null +++ b/mda/src/rev_anno.rs @@ -0,0 +1,371 @@ +//! Build a block_data structure similar to the revlog format to implement version control and incremental storage. +use prettytable::{Cell, Row, Table}; +use serde::{Deserialize, Serialize}; +use std::process; + +mod constants { + pub const BLOCK_SIZE: usize = 10; + /// Snapshot Baseline Configuration + pub const SNAPSHOT_BASE: i32 = 10; +} + +/// Structure for a block +#[derive(Debug, Clone, PartialEq, Deserialize, Serialize)] +pub struct Block { + /// Block number of the block + pub block_number: u64, + /// Content of the block + pub block_data: Vec, +} + +impl Block { + /// Create Block + fn new(block_number: u64, block_data: Vec) -> Self { + Block { + block_number, + block_data, + } + } +} + +/// The Header of RevAnno +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct RevAnnoHeader { + /// The number of the record + pub rev: i32, + /// The offset of the corresponding RevAnnoEntry + pub offset: u64, + /// The length of the corresponding RevAnnoEntry + pub length: u64, + /// Is a snapshot or not + pub snapshot: bool, +} + +impl RevAnnoHeader { + #![allow(clippy::too_many_arguments)] + fn new(rev: i32, offset: u64, length: u64, snapshot: bool) -> RevAnnoHeader { + RevAnnoHeader { + rev, + offset: (offset), + length, + snapshot, + } + } +} + +/// The entry of RevAnno +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct RevAnnoEntry { + 
/// The number of the record + pub rev: i32, + /// The index of blocks + pub index: Vec, + /// The data block + pub blocks: Vec, +} + +impl RevAnnoEntry { + /// new RevAnnoEntry + fn new(rev: i32, index: Vec, blocks: Vec) -> Self { + RevAnnoEntry { rev, index, blocks } + } +} + +/// The RevAnno object +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct RevAnno { + /// The list of rev_anno_headers + pub headers: Vec, + /// The list of rev_anno_entries + pub entries: Vec, +} + +impl RevAnno { + /// Create Object + pub fn new(rev_anno_header: Vec, entries: Vec) -> Self { + RevAnno { + headers: (rev_anno_header), + entries: (entries), + } + } + /// add first RevAnnoEntry + pub fn set_initial_element(content: &str) -> RevAnno { + // Config current content + let block_data: Vec = content.as_bytes().to_vec(); + let (blocks, data_indices) = + split_data_into_blocks(block_data.clone(), constants::BLOCK_SIZE); + + // Config entries + let entries: Vec = vec![RevAnnoEntry::new(0, data_indices, blocks)]; + + // Config headers + let headers: Vec = vec![RevAnnoHeader::new(0, 0, 0, true)]; + RevAnno { headers, entries } + } + + /// add RevAnnoEntry + pub fn add_element( + content: &str, + mut entries: Vec, + mut headers: Vec, + ) -> RevAnno { + // Config block_data from last entry + let last_entry = entries.last().unwrap_or_else(|| { + eprintln!("AnnoDataError: Fail to add a RevAnnoEntry! 
The last block is empty!"); + process::exit(1); + }); + let last_rev = last_entry.rev; + + // Config current block_data info + let current_id = last_rev + 1; + + // change to Vec + let current_data: Vec = content.as_bytes().to_vec(); + let (current_data_blocks, _data_indices) = + split_data_into_blocks(current_data.clone(), constants::BLOCK_SIZE); + + // Build a block list and record the construction number of the original block_data + let different_blocks = + find_different_blocks(last_rev, &entries, ¤t_data, constants::BLOCK_SIZE); + + let block_list = get_data_blocks_up_to_id(last_rev, &entries); + let (records, diff) = add_to_block_list(block_list, different_blocks); + + // assign rev to diff blocks + let diff_blocks: Vec = records + .iter() + .filter_map(|record| { + if diff.contains(&record.block_number) { + Some(Block { + block_number: record.block_number, + block_data: record.block_data.clone(), + }) + } else { + None + } + }) + .collect(); + + // get current index + + let matching_block_numbers = extract_index(¤t_data_blocks, &records); + let store_matching_block_numbers = matching_block_numbers.clone(); + + // Configure the entry: + // 1) If it already exists, do not store it. + // 2) If it is a snapshot, store the entire entry. + // 3) Otherwise, store only the differential block_data. 
+ // Config entry + + let mut entry = RevAnnoEntry { + rev: current_id, + index: matching_block_numbers, + blocks: diff_blocks, + }; + + // 1) Check if it existed + for item in &entries { + if item.index == entry.index { + return RevAnno { headers, entries }; + } + } + + entries.push(entry); + + // 2) Check if it is a snapshot + if (current_id) % constants::SNAPSHOT_BASE == 0 { + let mut all_blocks: Vec = Vec::new(); + for entry in &mut entries { + for block in &entry.blocks { + all_blocks.push(block.clone()); + } + } + let new_entry = RevAnnoEntry { + rev: current_id, // Copy the rev field + index: store_matching_block_numbers, // Copy the index field + blocks: all_blocks, // Use the cloned all_blocks here + }; + entry = new_entry; // Assign the modified new_entry back to entry + + if let Some(last_element) = entries.last_mut() { + *last_element = entry; + } + //Config header + let rev_anno_header = RevAnnoHeader::new(current_id, 0, 0, true); + headers.push(rev_anno_header); + } else { + // 3) + //Config header + let rev_anno_header = RevAnnoHeader::new(current_id, 0, 0, false); + headers.push(rev_anno_header); + } + RevAnno { headers, entries } + } +} + +/// Splitting large-scale data into fixed-size data blocks and recording the block numbers. +fn split_data_into_blocks(block_data: Vec, block_size: usize) -> (Vec, Vec) { + let mut blocks = Vec::new(); + let mut index = 0; + let mut block_number = 0; + let mut numbers: Vec = Vec::new(); + while index < block_data.len() { + numbers.push(block_number); + + let end = std::cmp::min(index + block_size, block_data.len()); + blocks.push(Block::new(block_number, block_data[index..end].to_vec())); + index = end; + block_number += 1; + } + + (blocks, numbers) +} + +/// Comparing data block lists to find newly added data blocks. 
+fn find_different_blocks( + last_rev: i32, + entries: &Vec, + current_data: &[u8], + _block_size: usize, +) -> Vec { + let blocks_list = get_data_blocks_up_to_id(last_rev, entries); + let (current_data_blocks, _data_indices) = + split_data_into_blocks(current_data.clone().to_vec(), constants::BLOCK_SIZE); + + // Find elements in block1 that are not in block2 + let elements_not_in_block1: Vec = current_data_blocks + .iter() + .filter(|current_data_blocks_item| { + !blocks_list.iter().any(|blocks_list_item| { + blocks_list_item.block_data == current_data_blocks_item.block_data + }) + }) + .cloned() + .collect(); + + elements_not_in_block1 +} + +/// add new blocks to blocklist +fn add_to_block_list( + mut block_list: Vec, + different_blocks: Vec, +) -> (Vec, Vec) { + let mut diff_number = Vec::::new(); + for mut block in different_blocks { + let last_block_number = block_list.last().map_or(0, |block| block.block_number); + + block.block_number = 1 + last_block_number; + diff_number.push(block.block_number); + block_list.push(block); + } + + // block_list + (block_list, diff_number) +} + +/// extract index from block_data blocks +fn extract_index(vec_data1: &[Block], vec_data2: &[Block]) -> Vec { + let mut index: Vec = Vec::new(); + for data_block1 in vec_data1.iter() { + if let Some(index_in_vec_data2) = vec_data2 + .iter() + .position(|data_block2| data_block1.block_data == data_block2.block_data) + { + index.push(vec_data2[index_in_vec_data2].block_number); + } + } + + index +} + +/// Function to combine Vec into text +fn combine_data_blocks_to_text(data_blocks: &Vec) -> String { + let mut combined_text = String::new(); + for data_block in data_blocks { + combined_text.push_str(std::str::from_utf8(&data_block.block_data).unwrap()); + } + combined_text +} + +/// Find the corresponding indexes by ID. 
+fn find_index_by_id(rev: i32, delta_list: &[RevAnnoEntry]) -> Option> { + let delta_to_find = delta_list.iter().find(|entry| entry.rev == rev); + + delta_to_find.map(|entry| entry.index.clone()) +} + +/// Get all block_data blocks from ID 0 to the input ID. +fn get_data_blocks_up_to_id(last_rev: i32, delta_list: &Vec) -> Vec { + let mut data_blocks = Vec::new(); + let nearest_id = find_nearest_multiple_of_snapshot_base(last_rev); + match nearest_id { + Some(nearest_id) => { + let mut delta_list_iter = delta_list.iter().skip_while(|entry| entry.rev < nearest_id); + for entry in &mut delta_list_iter { + data_blocks.extend(entry.blocks.iter().cloned()); + } + } + None => { + for entry in delta_list { + if entry.rev <= last_rev { + data_blocks.extend(entry.blocks.iter().cloned()); + } + } + } + } + data_blocks +} +pub fn find_nearest_multiple_of_snapshot_base(target: i32) -> Option { + if target < constants::SNAPSHOT_BASE { + return None; + } + + let nearest_multiple = target - (target % constants::SNAPSHOT_BASE); + if nearest_multiple < constants::SNAPSHOT_BASE { + return None; + } + + Some(nearest_multiple) +} + +/// Get the Vec corresponding to the indexes. 
+fn get_data_blocks_by_index(index: &Vec, data_blocks: &[Block]) -> Vec { + let mut result_blocks = Vec::new(); + for &idx in index { + if let Some(data_block) = data_blocks.iter().find(|block| block.block_number == idx) { + result_blocks.push(data_block.clone()); + } + } + result_blocks +} +/// Get full block_data(string) +pub fn get_full_data(rev: i32, entries: Vec) -> String { + if let Some(index) = find_index_by_id(rev, &entries) { + let data_blocks = get_data_blocks_up_to_id(rev, &entries); + let selected_blocks = get_data_blocks_by_index(&index, &data_blocks); + combine_data_blocks_to_text(&selected_blocks) + } else { + println!("No block_data blocks found for ID {}", rev); + process::exit(1); + } +} +/// Print header info to console +pub fn print_rev_anno_headers(headers: &Vec) { + let mut table = Table::new(); + + table.add_row(Row::new(vec![ + Cell::new("rev"), + Cell::new("offset"), + Cell::new("length"), + ])); + for header in headers { + table.add_row(Row::new(vec![ + Cell::new(&header.rev.to_string()), + Cell::new(&header.offset.to_string()), + Cell::new(&header.length.to_string()), + ])); + } + table.printstd(); + +} diff --git a/mda/src/run_mda.rs b/mda/src/run_mda.rs new file mode 100644 index 00000000..3d9470cc --- /dev/null +++ b/mda/src/run_mda.rs @@ -0,0 +1,455 @@ +use anyhow::Result; +use prettytable::{Cell, Row, Table}; + +use crate::extract::{ + extract_mda, extract_mda_more, read_anno_from_mda, read_anno_groups_from_mda, +}; +use crate::generate::{ + generate_mda_combined_annotations, generate_mda_separate_annotation_one_to_one, + generate_mda_separate_annotation_one_to_one_in_folder, + generate_mda_separate_annotations_one_to_many, + generate_mda_separate_annotations_one_to_many_in_folder, +}; +use crate::read_from_folders::*; +use crate::update::update_anno_in_combined_file; +use crate::*; +use crate::{extract::read_info_from_mda, update::update_anno_in_mda}; +use clap::Parser; +use indicatif::ProgressBar; +use rayon::prelude::*; +use 
serde::{Deserialize, Serialize}; +use std::error::Error; + +use std::path::Path; +/// Command Line Tool +#[derive(Parser, Debug)] +#[command(version = "0.1.0", about = "", long_about = "Design and Implementation of File for AI Training Data", after_help = "")] +#[derive(Deserialize, Serialize)] +pub struct MDAOptions { + /// 5 actions: generate, extract, list, group, version, update + #[arg(long)] + pub action: String, + + /// Training data file/folder path. + #[arg(long)] + pub train: Option, + + /// Annotation data file/folder path. + #[arg(long)] + pub anno: Option, + + /// Annotation data file/folder path, separated by commas + #[arg(long)] + pub annos: Option, + + /// Output data file/folder path. + #[arg(long)] + pub output: Option, + + /// MDA data file/folder path. + #[arg(long)] + pub mda: Option, + + /// Tags for MDA files + #[arg(long)] + pub tags: Option, + + /// Maximum number of threads + #[arg(long, default_value = "10")] + pub threads: Option, + + /// The version of MDA file + #[arg(long, default_value = "-1")] + pub rev: Option, + + /// Read from which line of the annotation file. + #[arg(long, default_value = "1")] + pub start: Option, + + /// Read from which line of the annotation file. + #[arg(long, default_value = "0")] + pub end: Option, + + /// The type of the annotation data: txt,json + #[arg(long, default_value = "txt")] + pub format: Option, + + /// Combined Annotation data config. 
+ #[arg(long)] + pub anno_config: Option, + + /// The group of the annotation data + #[arg(long, default_value = "NONE")] + pub group: Option, + + /// The generation mode: one, multiple, combine + #[arg(long)] + pub mode: Option, + +} +#[allow(unused_assignments)] +pub fn run(config: MDAOptions) -> Result<(), Box> { + // Generate .mda file + if config.action == "list" { + match &config.mda { + Some(mda) => { + if is_directory(mda) { + let mut mda_files: Vec = Vec::new(); + find_mda_files_in_dir(Path::new(mda), &mut mda_files); + + let mut table = print_table_header(); + + for file in mda_files { + match read_info_from_mda(&file) { + Ok((index, header)) => { + table = print_table_cell(&file,table.clone(), index, header); + } + Err(err) => { + eprintln!( + "\x1b[31m[ERROR]{}: {} {}\x1b[0m", + mda, + message::FAIL_TO_READ, + err + ); + } + } + } + table.printstd(); + } else if is_file(mda) { + match read_info_from_mda(mda) { + Ok((index, header)) => { + let table = print_table_header(); + let table = print_table_cell(mda,table, index, header); + table.printstd(); + } + Err(err) => { + eprintln!( + "\x1b[31m[ERROR]{}: {} {}\x1b[0m", + mda, + message::FAIL_TO_READ, + err + ); + } + } + } + } + _ => { + println!("Please input mda file"); + std::process::exit(0); + } + } + + // Record end time + // record_end_time(start_time, number_of_mda_files, "generated"); + } else if config.action == "update" { + let group = match config.group { + Some(group) => { + if group == "NONE" { + eprintln!("Please input group"); + std::process::exit(0); + } + group + } + None => { + eprintln!("Please input group"); + std::process::exit(0); + } + }; + // Record start time + let start_time = record_start_time(&config.action); + + let number_of_mda_files = match (&config.mda, &config.anno) { + (Some(mda), Some(anno_data)) => { + if is_directory(mda) && is_directory(anno_data) { + let mda_files = get_files_in_folder(mda); + let anno_files = get_files_in_folder(anno_data); + let file_combinations 
= combine_files(mda_files, anno_files); + let pb: ProgressBar = ProgressBar::new(file_combinations.len() as u64); + + let pool = rayon::ThreadPoolBuilder::new() + .num_threads(config.threads.unwrap_or(10)) + .build() + .unwrap(); + + pool.install(|| { + file_combinations + .par_iter() + .for_each(|(mda_file, anno_file)| { + update_anno_in_mda( + mda_file.to_str().unwrap(), + anno_file.to_str().unwrap(), + &group, + ) + .unwrap_or_else(|err| { + eprintln!("Failed to process file combination: {}", err); + }); + pb.inc(1); + }); + }); + pb.finish_with_message("done"); + + file_combinations.len() + } else if is_file(mda) && is_file(anno_data) { + match update_anno_in_mda(mda, anno_data, &group) { + Ok(_) => { + let pb: ProgressBar = ProgressBar::new(1); + + pb.inc(1); + + pb.finish_with_message("done"); + } + Err(err) => { + eprintln!("Failed to read data from MDA file: {}", err); + } + } + 1 + } else if is_file(anno_data) && is_directory(mda) { + + + match update_anno_in_combined_file( + mda, + anno_data, + config.start.unwrap_or(1), + config.end.unwrap_or(0), + &group, + ) { + Ok(data) => data, + Err(err) => { + eprintln!("{} {}", message::INVALID_PATH_MSG, err); + std::process::exit(0); + } + } + + } else { + eprintln!("{}", message::INVALID_PATH_MSG); + 0 + } + } + + _ => { + eprintln!("{}", message::INVALID_PATH_MSG); + std::process::exit(0); + } + }; + // Record end time + record_end_time(start_time, number_of_mda_files, "updated"); + } else if config.action == "version" { + let group = match config.group { + Some(group) => { + if group == "NONE" { + eprintln!("Please input annotation group!"); + std::process::exit(0); + } + group + } + None => { + eprintln!("Please input annotation group!"); + std::process::exit(0); + } + }; + + match &config.mda { + Some(mda) => { + if is_file(mda) { + match read_anno_from_mda(mda, &group, -1) { + Ok(_) => {} + Err(err) => { + eprintln!("Failed to read data from MDA file: {}", err); + } + } + } + } + _ => { + eprintln!("{}", 
message::INVALID_PATH_MSG); + std::process::exit(0); + } + } + + } else if config.action == "group" { + match &config.mda { + Some(mda) => match read_anno_groups_from_mda(mda) { + Ok(groups) => { + let mut table = Table::new(); + table.add_row(Row::new(vec![ + Cell::new("ID"), + Cell::new("Annotation Group"), + ])); + let mut count = 1; + for item in groups { + table.add_row(Row::new(vec![ + Cell::new(&count.to_string()), + Cell::new(&item.to_string()), + ])); + count += 1; + } + table.printstd(); + } + Err(err) => { + eprintln!( + "\x1b[31m[ERROR]{}: {} {}\x1b[0m", + mda, + message::FAIL_TO_READ, + err + ); + } + }, + _ => { + println!("Please input mda file"); + std::process::exit(0); + } + } + } else if config.action == "extract" { + // Record start time + let start_time = record_start_time(&config.action); + let format: String = config.format.unwrap_or("txt".to_string()); + let group = match config.group { + Some(group) => { + if group == "NONE" { + eprintln!("Please input group"); + std::process::exit(0); + } + group + } + None => { + eprintln!("Please input group"); + std::process::exit(0); + } + }; + let threads = config.threads.unwrap_or(10); + let number_of_mda_files = match (&config.train, &config.anno, &config.mda) { + // Extract anno data into different files + (Some(train_data), Some(anno_data), Some(mda)) => { + + + let anno_version: i32 = config.rev.unwrap_or_default(); + if is_file(mda) && is_directory(train_data) && is_directory(anno_data) { + + + extract_mda(mda, train_data, anno_data, anno_version, &format, &group)?; + + 1 + } else if is_directory(mda) && is_directory(anno_data) { + + + extract_mda_more( + mda, + train_data, + anno_data, + anno_version, + &format, + &group, + threads, + )?; + 1 + }else { + eprintln!("{}", message::INVALID_PATH_MSG); + 0 + } + } + _ => { + eprintln!("{}", message::INVALID_PATH_MSG); + std::process::exit(0); + } + }; + // Record end time + record_end_time(start_time, number_of_mda_files, "extracted"); + } else if 
config.action == "generate" { + // Record start time + let start_time = record_start_time(&config.action); + + // Generate mda files + let mut number_of_mda_files = 0; + let mode = config.mode.as_ref().unwrap_or(&"NONE".to_string()).clone(); + if mode == "combine" { + number_of_mda_files = match (&config.train, &config.anno_config, &config.output) { + (Some(train_data), Some(anno_config), Some(output)) => { + match generate_mda_combined_annotations( + train_data, + anno_config, + output, + &config, + ) { + Ok(data) => data, + Err(err) => { + eprintln!("{} ERROR= {}", message::GENERATE_MSG, err); + std::process::exit(0); + } + } + } + _ => { + eprintln!("{}", message::INVALID_PATH_MSG); + std::process::exit(0); + } + }; + } else if mode == "one" { + number_of_mda_files = match (&config.train, &config.anno, &config.output) { + (Some(train_path), Some(anno_path), Some(output)) => { + if is_file(train_path) && is_file(anno_path) { + generate_mda_separate_annotation_one_to_one( + train_path, anno_path, output, &config, + )?; + 1 + } else if is_directory(train_path) && is_directory(anno_path) { + match generate_mda_separate_annotation_one_to_one_in_folder( + train_path, anno_path, output, &config, + ) { + Ok(data) => data, + Err(err) => { + eprintln!("{} ERROR= {}", message::GENERATE_MSG, err); + std::process::exit(0); + } + } + } else { + 0 + } + } + _ => { + eprintln!("{} {}", message::GENERATE_MSG, message::INVALID_PATH_MSG); + std::process::exit(0); + } + }; + } else if mode == "multiple" { + number_of_mda_files = match (&config.train, &config.annos, &config.output) { + (Some(train_path), Some(anno_group), Some(output)) => { + if is_file(train_path) { + generate_mda_separate_annotations_one_to_many( + train_path, anno_group, output, &config, + )?; + 1 + } else if is_directory(train_path) { + match generate_mda_separate_annotations_one_to_many_in_folder( + train_path, anno_group, output, &config, + ) { + Ok(data) => data, + Err(err) => { + eprintln!("{} ERROR= {}", 
message::GENERATE_MSG, err); + std::process::exit(0); + } + } + } else { + 0 + } + } + _ => { + eprintln!("{} {}", message::GENERATE_MSG, message::INVALID_PATH_MSG); + std::process::exit(0); + } + }; + } else { + eprintln!( + "{} Please input the correct generate mode!", + message::GENERATE_MSG, + + ); + std::process::exit(0); + } + + // Record end time + record_end_time(start_time, number_of_mda_files, "generated"); + } else { + println!( + "\x1b[38;5;208m[WARN]\x1b[0m Wrong action! Support 5 actions for MDA: generate, list, update, version, extract!\n- generate: generate mda files for data.\n- list: list basic info of mda files\n- update: update the annotation data in mda files\n- version: list all versions of mda files\n- extract: extract training data and annotation data from mda files" + ); + } + Ok(()) +} diff --git a/mda/src/utils.rs b/mda/src/utils.rs new file mode 100644 index 00000000..290c88b1 --- /dev/null +++ b/mda/src/utils.rs @@ -0,0 +1,387 @@ +//! It includes some common functionalities, helper functions, +//! that help simplify the development process and provide shared functionalities. 
+ +extern crate image; +use crate::{ + AnnoOffset, AudioMetaData, ImageMetaData, MDAHeader , MDAIndex, TextMetaData, + VideoMetaData, +}; +use anyhow::Context; +use chrono::Local; +use encoding::{DecoderTrap, Encoding}; +use hound::{Error as boundError, WavReader}; +use image::{ColorType, GenericImageView}; +use mp4parse::read_mp4; +use prettytable::{Cell, Row, Table}; +use std::error::Error; +use std::fs; +use std::fs::File; +use std::io::{BufWriter, Write}; +use std::path::Path; +use std::time::Instant; +use walkdir::WalkDir; + +/// Information prompts +pub mod message { + pub const GENERATE_MSG: &str = "Fail to generate mda files!"; + pub const INVALID_PATH_MSG: &str = + "Please input the correct path for training data, annotation data and output data!"; + pub const FAIL_TO_READ: &str = "Failed to read data from MDA file!"; +} + +/// Get the file name of the input path +pub fn extract_file_name(file_path: &str) -> String { + let path = Path::new(file_path); + let file_name = path + .file_stem() + .and_then(|stem| stem.to_str()) + .unwrap_or(""); + file_name.to_string() +} + +/// Get the file name and assign .mda extension +pub fn extract_filename_change_extension(path: &str) -> &str { + let filename = path.rsplit('/').next().unwrap_or(""); + let new_filename = format!( + "{}.mda", + &filename[..filename.rfind('.').unwrap_or(filename.len())] + ); + Box::leak(new_filename.into_boxed_str()) +} + +/// Save text file +pub fn save_text_to_file(text: &str, file_path: &str) -> Result<(), Box> { + let file_path = file_path.to_string() + ".txt"; + let mut file = BufWriter::new(File::create(file_path)?); + file.write_all(text.as_bytes())?; + Ok(()) +} +/// Save image file +pub fn save_image_to_file(image_data: &[u8], file_path: &str) -> Result<(), Box> { + let file_path = file_path.to_string() + ".png"; + let mut file = BufWriter::new(File::create(file_path)?); + file.write_all(image_data)?; + Ok(()) +} +/// Save video file +pub fn save_video_to_file(video_data: &[u8], 
file_path: &str) -> Result<(), Box> { + let file_path = file_path.to_string() + ".mp4"; + let mut file = BufWriter::new(File::create(file_path)?); + file.write_all(video_data)?; + Ok(()) +} +/// Save aduio file +pub fn save_audio_to_file(audio_data: &[u8], file_path: &str) -> Result<(), Box> { + let file_path = file_path.to_string() + ".wav"; + let mut file = BufWriter::new(File::create(file_path)?); + file.write_all(audio_data)?; + Ok(()) +} + +/// Extract metadata from training data(image) +pub fn extract_image_metadata(image_path: &str) -> ImageMetaData { + let msg = "Failed to open file ".to_owned() + image_path.clone(); + let image = image::open(image_path).expect(&msg); + + let (width, height) = image.dimensions(); + let channel_count = match image.color() { + ColorType::L8 => 1, + ColorType::La8 => 2, + ColorType::Rgb8 => 3, + ColorType::Rgba8 => 4, + _ => panic!("Unsupported color type"), + }; + let color_space = match image { + image::DynamicImage::ImageRgb8(_) => "RGB".to_string(), + image::DynamicImage::ImageRgba8(_) => "RGBA".to_string(), + _ => "Unknown".to_string(), + }; + + ImageMetaData { + size: (width, height), + channel_count, + color_space, + } +} + +/// Extract metadata from training data(text) +pub fn extract_text_metadata(text_path: &str) -> TextMetaData { + let text = fs::read_to_string(text_path).expect("Failed to read text file"); + + let length = text.chars().count(); + + let (decoded_text, encoding) = match text.starts_with('\u{FEFF}') { + true => { + let decoded = encoding::all::UTF_8 + .decode(text[3..].as_bytes(), DecoderTrap::Replace) + .unwrap(); + (decoded, "UTF-8") + } + false => { + let decoded = encoding::all::ISO_8859_1 + .decode(text.as_bytes(), DecoderTrap::Replace) + .unwrap(); + (decoded, "ISO-8859-1") + } + }; + + let vocabulary_size = decoded_text + .split_whitespace() + .collect::>() + .len(); + + TextMetaData { + length, + encoding: encoding.to_string(), + vocabulary_size, + } +} + +/// Extract metadata from training 
data(video) +pub fn extract_video_info(file_path: &str) -> Option { + let mut file = File::open(file_path).ok()?; + let context = read_mp4(&mut file).ok()?; + + let video_track = context + .tracks + .iter() + .find(|track| track.track_type == mp4parse::TrackType::Video)?; + let duration = video_track.duration?; + + let media_timescale = context.timescale?.0; + let total_time = duration.0 / 10 + duration.1 as u64; + let track_duration_seconds = total_time as f64 / media_timescale as f64; + + if let Some(mp4parse::SampleEntry::Video(video_sample_entry)) = video_track + .stsd + .as_ref() + .and_then(|stsd| stsd.descriptions.get(0)) + { + let resolution = (video_sample_entry.width, video_sample_entry.height); + return Some(VideoMetaData { + duration: track_duration_seconds, + resolution, + }); + } + + None +} + +/// Extract metadata from training data(audio) +pub fn extract_audio_metadata(file_path: &str) -> Result { + let reader = WavReader::open(file_path)?; + let duration = reader.duration() as f64 / reader.spec().sample_rate as f64; + + let sample_rate = reader.spec().sample_rate; + let channels = reader.spec().channels; + let bit_depth = reader.spec().bits_per_sample; + + let audio_properties = AudioMetaData { + duration, + sample_rate, + channels, + bit_depth, + }; + + Ok(audio_properties) +} + +/// Get the type of the file. +pub fn get_file_type(file_path: &str) -> Option { + if file_path.ends_with(".jpg") || file_path.ends_with(".png") || file_path.ends_with(".jpeg") { + Some("Image".to_string()) + } else if file_path.ends_with(".mp4") || file_path.ends_with(".avi") { + Some("Video".to_string()) + } else if file_path.ends_with(".mp3") || file_path.ends_with(".wav") { + Some("Audio".to_string()) + } else if file_path.ends_with(".txt") || file_path.ends_with(".docx") { + Some("Text".to_string()) + } else { + None + } +} + +/// Find the .mda files in the folder. 
+pub fn find_mda_files_in_dir(dir: &Path, mda_files: &mut Vec) { + for entry in WalkDir::new(dir).into_iter().filter_map(|e| e.ok()) { + let path = entry.path(); + + if path.is_file() { + if let Some(file_name) = path.file_name() { + if let Some(file_name_str) = file_name.to_str() { + // Check if the file ends with ".mda" + if file_name_str.ends_with(".mda") { + if let Some(file_path_str) = path.to_str() { + mda_files.push(file_path_str.to_string()); + } + } + } + } + } + } +} + +/// Check if it is a folder. +pub fn is_directory(path: &str) -> bool { + let path = Path::new(path); + path.is_dir() +} + +/// Check if it is a file. +pub fn is_file(path: &str) -> bool { + let path = Path::new(path); + path.is_file() +} + +/// Write content to files +pub fn write_strings_to_file( + strings: &[String], + output_path: &str, + format: &str, +) -> anyhow::Result<()> { + let output_path = output_path.to_string() + "." + format; + let mut file = File::create(output_path).context("Failed to create output file")?; + + for string in strings { + file.write_all(string.as_bytes()) + .context("Failed to write to output file")?; + file.write_all(b"\n") + .context("Failed to write newline to output file")?; + } + + Ok(()) +} + +/// Record the start time +pub fn record_start_time(action: &str) -> Instant { + let start_time = Instant::now(); + println!( + "\x1b[38;5;208m[WARN]\x1b[0m[{}] Start to {} mda files...", + Local::now().format("%Y-%m-%d %H:%M:%S"), + action + ); + start_time +} + +/// Record the end time +pub fn record_end_time(start_time: Instant, number_of_mda_files: usize, action: &str) { + let end_time = Instant::now(); + let duration = end_time - start_time; + println!( + "\n\x1b[38;5;208m[WARN]\x1b[0m[{}] {} mda files have been {} in {:?}", + Local::now().format("%Y-%m-%d %H:%M:%S"), + number_of_mda_files, + action, + duration + ); +} + +pub fn print_table_header() -> Table{ + let mut table1 = Table::new(); + + table1.add_row(Row::new(vec![ + Cell::new("MDA File"), + 
Cell::new("MDA Header Offset"), + Cell::new("Training Data Offset"), + Cell::new("Tags"), + Cell::new("Training MetaData"), + ])); + + + table1 +} + +pub fn print_table_cell(file:&str,mut table: Table, index: MDAIndex, header: MDAHeader) -> Table { + table.add_row(Row::new(vec![ + Cell::new(file), + Cell::new(&index.header_offset.to_string()), + Cell::new(&index.train_data_offset.to_string()), + Cell::new(header.tags.join(", ").as_str()), + Cell::new(&header.train_data.metadata), + ])); + table +} + +use serde::Deserialize; +use std::process; + +#[derive(Debug, Deserialize)] +pub struct AnnoConfigItem { + #[serde(default = "default_id")] + pub id: String, + pub path: String, + #[serde(default = "default_start")] + pub start: usize, + #[serde(default = "default_end")] + pub end: usize, +} + +fn default_id() -> String { + "NONE".to_string() +} + +fn default_start() -> usize { + 1 +} +fn default_end() -> usize { + 0 +} +#[derive(Debug, Deserialize)] +pub struct AnnoConfig { + pub annotation: Vec, +} +fn extract_id_from_path(path: &str) -> String { + let path = Path::new(path); + path.file_stem().unwrap().to_string_lossy().into_owned() +} +fn parse_and_process_toml(toml_content: &str) -> Result, toml::de::Error> { + let parsed_toml: Result = toml::from_str(toml_content); + + match parsed_toml { + Ok(anno_config) => { + let mut annos = anno_config.annotation; + + for item in &mut annos { + if item.id == "NONE" { + item.id = extract_id_from_path(&item.path); // Call the default_id function to extract ID + } + } + + Ok(annos) + } + Err(err) => Err(err), + } +} + +fn read_toml_file(filename: &str) -> Result { + fs::read_to_string(filename) +} +pub fn get_anno_config(path: &str) -> AnnoConfig { + match read_toml_file(path) { + Ok(toml_content) => match parse_and_process_toml(&toml_content) { + Ok(annos) => AnnoConfig { annotation: annos }, + Err(err) => { + eprintln!("Error parsing and processing TOML: {}", err); + process::exit(1); + } + }, + Err(err) => { + 
eprintln!("Error reading the file: {}", err); + process::exit(1); + } + } +} +pub fn create_anno_offsets(anno_config: &AnnoConfig) -> Vec { + let mut anno_offsets = Vec::new(); + + for item in &anno_config.annotation { + let anno_offset = AnnoOffset { + id: item.id.clone(), + header_offset: 0, + entries_offset: 0, + }; + anno_offsets.push(anno_offset); + } + + anno_offsets +} -- Gitee