From c4022b2866c6ea4d0fb3f53cabf5fde76aa145e9 Mon Sep 17 00:00:00 2001 From: Fan Xiaoping Date: Thu, 2 Mar 2023 11:01:24 +0800 Subject: [PATCH 1/5] add ebpf_vm_executor header files Signed-off-by: Fan Xiaoping --- CMakeLists.txt | 8 + ebpf_vm_executor/CMakeLists.txt | 10 + ebpf_vm_executor/ebpf_vm_functions.h | 68 +++++++ ebpf_vm_executor/ebpf_vm_simulator.h | 223 ++++++++++++++++++++++ ebpf_vm_executor/ebpf_vm_transport.h | 56 ++++++ ebpf_vm_executor/ebpf_vm_transport_rdma.h | 56 ++++++ ebpf_vm_executor/list.h | 19 ++ ebpf_vm_executor/ub_list.h | 188 ++++++++++++++++++ how-to.txt | 14 ++ 9 files changed, 642 insertions(+) create mode 100644 CMakeLists.txt create mode 100644 ebpf_vm_executor/CMakeLists.txt create mode 100644 ebpf_vm_executor/ebpf_vm_functions.h create mode 100644 ebpf_vm_executor/ebpf_vm_simulator.h create mode 100644 ebpf_vm_executor/ebpf_vm_transport.h create mode 100644 ebpf_vm_executor/ebpf_vm_transport_rdma.h create mode 100644 ebpf_vm_executor/list.h create mode 100644 ebpf_vm_executor/ub_list.h create mode 100644 how-to.txt diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..443d3ee --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,8 @@ +#packet vm makefile + +cmake_minimum_required(VERSION 3.12.1) +project(ebpf_vm) + +add_compile_options(-g) +add_subdirectory (ebpf_vm_executor) +add_subdirectory (ebpf_vm_test) diff --git a/ebpf_vm_executor/CMakeLists.txt b/ebpf_vm_executor/CMakeLists.txt new file mode 100644 index 0000000..40aa1b1 --- /dev/null +++ b/ebpf_vm_executor/CMakeLists.txt @@ -0,0 +1,10 @@ +#packet vm makefile + +add_library(ebpf_vm_executor SHARED + ebpf_vm_elf.c + ebpf_vm_functions.c + ebpf_vm_simulator.c + ebpf_vm_transport_rdma.c +) + +target_link_libraries(ebpf_vm_executor -lpthread -lelf -libverbs) diff --git a/ebpf_vm_executor/ebpf_vm_functions.h b/ebpf_vm_executor/ebpf_vm_functions.h new file mode 100644 index 0000000..9d6b7e7 --- /dev/null +++ b/ebpf_vm_executor/ebpf_vm_functions.h @@ -0,0 +1,68 @@ 
+#ifndef _EBPF_VM_FUNCTIONS_H_ +#define _EBPF_VM_FUNCTIONS_H_ + +#define ARG_NOT_USED_6 uint64_t not_used1, uint64_t not_used2, uint64_t not_used3, uint64_t not_used4, uint64_t not_used5, uint64_t not_used6 +#define ARG_NOT_USED_5 uint64_t not_used1, uint64_t not_used2, uint64_t not_used3, uint64_t not_used4, uint64_t not_used5 +#define ARG_NOT_USED_4 uint64_t not_used1, uint64_t not_used2, uint64_t not_used3, uint64_t not_used4 +#define ARG_NOT_USED_3 uint64_t not_used1, uint64_t not_used2, uint64_t not_used3 +#define ARG_NOT_USED_2 uint64_t not_used1, uint64_t not_used2 +#define ARG_NOT_USED_1 uint64_t not_used1 + +#define VM_URL_SIZE 24 +#define INVALID_MMAP_ADDR ((void *)-1) + +#define join_thread fork_join + +enum { + MONITOR_T_BIGGER_THAN_VALUE, + MONITOR_T_LESS_THAN_VALUE, + MONITOR_T_EQUAL_VALUE, + MONITOR_T_NOT_EQUAL_VALUE, + MONITOR_T_CLEAR +}; + +enum { + EBPF_FUNC_debug_print = 1, + EBPF_FUNC_mmap, + EBPF_FUNC_monitor_address, + EBPF_FUNC_wait_for_address_event, + EBPF_FUNC_migrate_to, + EBPF_FUNC_clone_to, + EBPF_FUNC_switch_to_address_space, + EBPF_FUNC_memcpy, + EBPF_FUNC_fork_to, + EBPF_FUNC_fork_return, + EBPF_FUNC_fork_join +}; + +struct ub_address { + uint64_t access_key; + uint8_t url[VM_URL_SIZE]; +}; + +struct remote_thread { + struct ub_address target_node; + uint64_t id; + uint64_t result; +}; + +#ifndef PKT_VM_EXECUTOR + +static uint64_t (*debug_print)(uint64_t s) = (void *)EBPF_FUNC_debug_print; +static uint64_t (*mmap)(uint64_t va, uint64_t size) = (void *)EBPF_FUNC_mmap; +static uint64_t (*monitor_address)(uint64_t type, uint64_t target_address, uint64_t value, uint64_t tag) = (void *)EBPF_FUNC_monitor_address; +static uint64_t (*wait_for_address_event)(void) = (void *)EBPF_FUNC_wait_for_address_event; +static uint64_t (*migrate_to)(struct ub_address *dst) = (void *)EBPF_FUNC_migrate_to; +static uint64_t (*clone_to)(struct ub_address *target_list, int len) = (void *)EBPF_FUNC_clone_to; +static uint64_t (*fork_to)(struct remote_thread 
*thread_list, int len) = (void *)EBPF_FUNC_fork_to; +static uint64_t (*fork_return)(uint64_t result) = (void *)EBPF_FUNC_fork_return; +static uint64_t (*fork_join)(struct remote_thread *thread_list, int len) = (void *)EBPF_FUNC_fork_join; +static uint64_t (*switch_to_address_space)(int asid) = (void *)EBPF_FUNC_switch_to_address_space; +static uint64_t (*memcpy)(struct ub_address *dst, struct ub_address *src, int len, void *completion_addr, int result) = (void *)EBPF_FUNC_memcpy; + +#define start_remote_thread(THREAD_LIST, LEN) for(uint64_t result = fork_to(THREAD_LIST, LEN); \ + result < (LEN); \ + fork_return(result)) + +#endif /*PKT_VM_EXECUTOR*/ +#endif /*_EBPF_VM_FUNCTIONS_H_*/ \ No newline at end of file diff --git a/ebpf_vm_executor/ebpf_vm_simulator.h b/ebpf_vm_executor/ebpf_vm_simulator.h new file mode 100644 index 0000000..45f4698 --- /dev/null +++ b/ebpf_vm_executor/ebpf_vm_simulator.h @@ -0,0 +1,223 @@ +#ifndef _EBPF_VM_SIMULATOR_H_ +#define _EBPF_VM_SIMULATOR_H_ + +#include "ub_list.h" +#include "ebpf_vm_transport.h" + +#define EBPF_VM_STACK_DEPTH_MAX 3 +#define EBPF_VM_STACK_FRAME_SIZE 64 +#define EBPF_VM_DEFAULT_STACK_SIZE 128 +#define EBPF_VM_DEFAULT_DATA_SIZE 64 +#define PKT_VM_USER_REG_NUM 11 +#define PKT_VM_SYS_REG_NUM 4 +#define PKT_VM_INVALID_FUNC_IDX 0xffffffff +#define PKT_VM_MAX_SYMBS 256 + +enum { + /*00*/ EBPF_REG_RETURN_RESULT, + /*01*/ EBPF_REG_ARG1, + /*02*/ EBPF_REG_ARG2, + /*03*/ EBPF_REG_ARG3, + /*04*/ EBPF_REG_ARG4, + /*05*/ EBPF_REG_ARG5, + /*06*/ EBPF_REG_6, + /*07*/ EBPF_REG_7, + /*08*/ EBPF_REG_8, + /*09*/ EBPF_REG_9, + /*10*/ EBPF_REG_FP, +}; + +enum { + /*00*/ EBPF_SYS_REG_LR, + /*01*/ EBPF_SYS_REG_PC, + /*02*/ EBPF_SYS_REG_PAGE_TABLE_IDX, +}; + +enum { + EBPF_CLS_LD, /*0*/ + EBPF_CLS_LDX, /*1*/ + EBPF_CLS_ST, /*2*/ + EBPF_CLS_STX, /*3*/ + EBPF_CLS_ALU, /*4*/ + EBPF_CLS_JMP, /*5*/ + EBPF_CLS_RET, /*6*/ + EBPF_CLS_ALU64 /*7*/ +}; +#define EBPF_OPCODE_CLASS(code) ((code) & 0x7) + +enum { + EBPF_ALU_OP_ADD = 0 << 4, + 
EBPF_ALU_OP_SUB = 1 << 4, + EBPF_ALU_OP_MUL = 2 << 4, + EBPF_ALU_OP_DIV = 3 << 4, + EBPF_ALU_OP_OR = 4 << 4, + EBPF_ALU_OP_AND = 5 << 4, + EBPF_ALU_OP_LSH = 6 << 4, + EBPF_ALU_OP_RSH = 7 << 4, + EBPF_ALU_OP_NEG = 8 << 4, + EBPF_ALU_OP_MOD = 9 << 4, + EBPF_ALU_OP_XOR = 10 << 4, + EBPF_ALU_OP_MOV = 11 << 4, + EBPF_ALU_OP_ARSH = 12 << 4, + EBPF_ALU_OP_END = 13 << 4 +}; +#define EBPF_ALU_OP(code) ((code) & 0xf0) + +enum { + EBPF_JMP_OP_JA = 0 << 4, + EBPF_JMP_OP_JEQ = 1 << 4, + EBPF_JMP_OP_JGT = 2 << 4, + EBPF_JMP_OP_JGE = 3 << 4, + EBPF_JMP_OP_JSET = 4 << 4, + EBPF_JMP_OP_JNE = 5 << 4, + EBPF_JMP_OP_JSGT = 6 << 4, + EBPF_JMP_OP_JSGE = 7 << 4, + EBPF_JMP_OP_CALL = 8 << 4, + EBPF_JMP_OP_EXIT = 9 << 4, + EBPF_JMP_OP_JLT = 10 << 4, + EBPF_JMP_OP_JLE = 11 << 4, + EBPF_JMP_OP_JSLT = 12 << 4, + EBPF_JMP_OP_JSLE = 13 << 4 +}; +#define EBPF_JMP_OP(code) ((code) & 0xf0) + +#define EBPF_SRC_IS_IMM 0x00 +#define EBPF_SRC_IS_REG 0x08 +#define EBPF_PSEUDO_CALL 1 + +enum { + EBPF_W = 0 << 3, + EBPF_H = 1 << 3, + EBPF_B = 2 << 3, + EBPF_DW = 3 << 3, +}; +#define EBPF_MEM_SIZE(code) ((code) & 0x18) + +enum { + EBPF_IMM = 0 << 5, + EBPF_ABS = 1 << 5, + EBPF_IND = 2 << 5, + EBPF_MEM = 3 << 5, + EBPF_LEN = 4 << 5, + EBPF_MSH = 5 << 5, + EBPF_XADD = 6 << 5, +}; +#define EBPF_MODE(code) ((code) & 0xe0) + +#define EBPF_TO_LE 0x00 +#define EBPF_TO_BE 0x08 + +struct ebpf_vm; + +struct address_monitor_entry { + struct ub_list list; + int type; + uint64_t address; + uint64_t value; + uint64_t tag; +}; + +typedef uint64_t (*ebpf_external_func)(uint64_t, uint64_t, uint64_t, uint64_t, uint64_t, struct ebpf_vm *vm); + +struct ebpf_symbol { + const char *name; + ebpf_external_func func; +}; + +extern struct ebpf_symbol ebpf_global_symbs[]; + +struct ebpf_instruction { + uint8_t opcode; + uint8_t dst_reg:4; + uint8_t src_reg:4; + int16_t offset; + int32_t immediate; +}; + +#define EBPF_RAW_INSN(CODE, DST, SRC, OFF, IMM) {CODE, DST, SRC, OFF, IMM} + +struct ebpf_vm_executor_config { + struct 
transport_config transport; +}; + +struct executor_state { + uint32_t should_stop:1; + uint32_t unused:31; +}; + +struct ebpf_vm_executor { + struct ub_list vm_list; + struct transport_ops *transport; + void *transport_ctx; + struct executor_state state; + uint64_t next_vm_id; +}; + +enum { + VM_STATE_RUNNING, + VM_STATE_EXIT, + VM_STATE_WAIT_FOR_ADDRESS, + VM_STATE_MIGRATE_TO, + VM_STATE_CLONE_TO +}; + +struct ebpf_vm_state { + uint8_t stack_depth; + uint8_t unused; + uint16_t next_data_to_use; + uint32_t vm_state; +}; + +#define ENTRY_MASK 0x00000000ffffffff +#define PACKET_VA_SHIFT 36 +#define INDEX_SHIFT 32 +#define PAGE_TABLE_ERROR 0xffffffffffffffff +#define PAGE_TABLE_NUM 1 +#define BUCKET_ENTRIES 1 + +struct vm_pte { + uint64_t va; + uint64_t size; +}; + +struct vm_ptb { + struct vm_pte entries[BUCKET_ENTRIES]; +}; + +struct vm_runtime_data { + struct ub_list list; + struct ebpf_vm_executor *executor; + struct ebpf_symbol *symbols; + uint64_t id; +}; + +struct ebpf_vm { + struct vm_runtime_data rd; + uint64_t reg[PKT_VM_USER_REG_NUM]; + uint64_t sys_reg[PKT_VM_SYS_REG_NUM]; + uint16_t code; + uint16_t stack; + uint16_t data; + uint16_t code_size; + uint16_t stack_size; + uint16_t data_size; + struct vm_ptb page_table[PAGE_TABLE_NUM]; + struct ebpf_vm_state state; + struct ub_list address_monitor_list; +}; + +/* + * ebpf_vm_code - pointer to the first instruction of a VM. + * The code area starts (VM)->code bytes after the beginning of the VM object. + * The expansion is fully parenthesized so the result can be indexed or + * dereferenced directly without extra parentheses at the call site. + */ +#define ebpf_vm_code(VM) ((struct ebpf_instruction *)((uint8_t *)(VM) + (VM)->code)) + +struct ebpf_vm *create_vm(uint8_t *code, uint32_t code_size); +struct ebpf_vm *create_vm_from_elf(const char *elf_file_name); +int add_vm(struct ebpf_vm_executor *executor, struct ebpf_vm *vm); +int load_data(struct ebpf_vm *vm, uint8_t *data, uint32_t len); +void destroy_vm(struct ebpf_vm *vm); +void vm_executor_run(struct ebpf_vm_executor *executor); +uint64_t run_ebpf_vm(struct ebpf_vm *vm); +void update_vm_state(struct ebpf_vm *vm, int state); +uint64_t vm_mmu(uint64_t va, struct ebpf_vm *vm); +void *vm_executor_init(struct ebpf_vm_executor_config *cfg); +void 
vm_executor_destroy(struct ebpf_vm_executor *executor); + +#endif /*_EBPF_VM_SIMULATOR_H_*/ \ No newline at end of file diff --git a/ebpf_vm_executor/ebpf_vm_transport.h b/ebpf_vm_executor/ebpf_vm_transport.h new file mode 100644 index 0000000..f99ef0d --- /dev/null +++ b/ebpf_vm_executor/ebpf_vm_transport.h @@ -0,0 +1,56 @@ +#ifndef _EBPF_VM_TRANSPORT_H_ +#define _EBPF_VM_TRANSPORT_H_ + +#include + +enum { + PKT_VM_TRANSPORT_TYPE_UDP, + PKT_VM_TRANSPORT_TYPE_RDMA, + PKT_VM_TRANSPORT_TYPE_MAX +}; + +struct node_url { + uint32_t ip; + uint16_t port; + uint16_t reserved; +}; + +struct rdma_transport_config { + struct node_url self_url; + char *ib_devname; + int ib_port; + unsigned int max_msg_size; + unsigned int rx_depth; + int use_event; + int gid_index; +}; + +struct udp_transport_config { + struct node_url self_url; +}; + +struct transport_config { + uint32_t transport_type; + union { + struct rdma_transport_config rdma_cfg; + struct udp_transport_config udp_cfg; + }; +}; + +struct transport_message { + void *buf; + int buf_size; +}; + +struct transport_ops { + int type; + void *(*init)(struct transport_config *cfg); + void (*exit)(void *ctx); + int (*send)(void *ctx, struct node_url *dst, struct transport_message *msg); + int (*recv)(void *ctx, struct transport_message *msg); + void (*return_buf)(void *ctx, struct transport_message *msg); +}; + +int register_transport(struct transport_ops *ops); + +#endif \ No newline at end of file diff --git a/ebpf_vm_executor/ebpf_vm_transport_rdma.h b/ebpf_vm_executor/ebpf_vm_transport_rdma.h new file mode 100644 index 0000000..371ebef --- /dev/null +++ b/ebpf_vm_executor/ebpf_vm_transport_rdma.h @@ -0,0 +1,56 @@ +#ifndef _EBPF_VM_TRANSPORT_RDMA_H_ +#define _EBPF_VM_TRANSPORT_RDMA_H_ + +#include "ebpf_vm_transport.h" + +#define EXCH_MSG_PATTERN "0000:000000:000000:00000000000000000000000000000000" +#define GID_STR_SIZE 33 +#define UD_GRH_SIZE 40 + +enum { + PKT_VM_RDMA_RECV_WRID = 1, + PKT_VM_RDMA_SEND_WRID = 2 +}; + +struct 
rdma_addr_message { + int lid; + int qpn; + int psn; + union ibv_gid gid; +}; + +struct rdma_addr_info { + struct ub_list node; + struct node_url key; + struct rdma_addr_message info; + struct ibv_ah *ah; +}; + +struct pkt_vm_rdma_state { + uint32_t pending:1; + uint32_t should_stop:1; + uint32_t unused:30; +}; + +struct pkt_vm_rdma_context { + struct rdma_transport_config cfg; + struct ibv_context *context; + struct ibv_comp_channel *channel; + struct ibv_pd *pd; + struct ibv_mr *mr; + struct ibv_cq *cq; + struct ibv_qp *qp; + char *buf; + int buf_size; + char *send_buf; + int send_offset; + int send_flags; + int rx_depth; + pthread_t server_thread; + struct pkt_vm_rdma_state state; + struct ibv_port_attr portinfo; + struct rdma_addr_message local_addr; + struct ub_list dst_addr_list; +}; + +#endif \ No newline at end of file diff --git a/ebpf_vm_executor/list.h b/ebpf_vm_executor/list.h new file mode 100644 index 0000000..23aeffa --- /dev/null +++ b/ebpf_vm_executor/list.h @@ -0,0 +1,19 @@ +#ifndef _UB_LIST_H_ +#define _UB_LIST_H_ + +#include +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + +#define OBJ_OFFSETOF(obj_ptr, field) offsetof(typeof(*(obj_ptr)), field) +#define SIZEOF_FIELD(struct_type, field) (sizeof(((struct_type *)NULL)->field)) + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/ebpf_vm_executor/ub_list.h b/ebpf_vm_executor/ub_list.h new file mode 100644 index 0000000..37945c3 --- /dev/null +++ b/ebpf_vm_executor/ub_list.h @@ -0,0 +1,188 @@ +#ifndef TRANS_LIST_H +#define TRANS_LIST_H + +#include + +struct ub_list { + struct ub_list *next, *prev; +}; + +#define LIST_HEAD_INIT(name) { &(name), &(name) } + +#define LIST_HEAD(name) \ + struct ub_list name = LIST_HEAD_INIT(name) + +/** + * ub_list_init - Initialize a ub_list structure + * @list: ub_list structure to be initialized. + * + * Initializes the ub_list to point to itself. If it is a list header, + * the result is an empty list. 
+ */ +static inline void ub_list_init(struct ub_list *list) +{ + list->next = list; + list->prev = list; +} + +/** + * container_of - cast a member of a structure out to the containing structure + * @ptr: the pointer to the member. + * @type: the type of the container struct this is embedded in. + * @member: the name of the member within the struct. + * + */ +#define container_of(ptr, type, member) ({ \ + const typeof( ((type *)0)->member ) *__mptr = (ptr); \ + (type *)( (char *)__mptr - offsetof(type,member) );}) + +/** + * list_entry - get the struct for this entry + * @ptr: the &struct ub_list pointer. + * @type: the type of the struct this is embedded in. + * @member: the name of the ub_list within the struct. + */ +#define list_entry(ptr, type, member) \ + container_of(ptr, type, member) + +/** + * list_first_entry - get the first element from a list + * @ptr: the list head to take the element from. + * @type: the type of the struct this is embedded in. + * @member: the name of the ub_list within the struct. + * + * Note, that list is expected to be not empty. + */ +#define list_first_entry(ptr, type, member) \ + list_entry((ptr)->next, type, member) + +/** + * list_next_entry - get the next element in list + * @pos: the type * to cursor + * @member: the name of the ub_list within the struct. + */ +#define list_next_entry(pos, member) \ + list_entry((pos)->member.next, typeof(*(pos)), member) + +/** + * list_entry_is_head - test if the entry points to the head of the list + * @pos: the type * to cursor + * @head: the head for your list. + * @member: the name of the ub_list within the struct. + * + * @pos is parenthesized in the expansion so any pointer expression may be passed. + */ +#define list_entry_is_head(pos, head, member) \ + (&(pos)->member == (head)) + +/** + * ub_list_is_empty - test if the list is empty + * @list: the head for your list. 
+ */ +static inline int ub_list_is_empty(const struct ub_list *list) +{ + return list->next == list; +} + +/** + * UB_LIST_FOR_EACH_SAFE - iterate over list of given type safe against removal of list entry + * @pos: the type * to use as a loop cursor. + * @n: another type * to use as temporary storage + * @member: the name of the ub_list within the struct. + * @head: the head for your list. + */ +#define UB_LIST_FOR_EACH_SAFE(pos, n, member, head) \ + for (pos = list_first_entry(head, typeof(*pos), member), \ + n = list_next_entry(pos, member); \ + !list_entry_is_head(pos, head, member); \ + pos = n, n = list_next_entry(n, member)) + +/** + * UB_LIST_FOR_EACH - iterate over a list + * @pos: the type * to use as a loop cursor. + * @member: the name of the ub_list within the struct. + * @head: the head for your list. + */ +#define UB_LIST_FOR_EACH(pos, member, head) \ + for (pos = list_first_entry(head, typeof(*pos), member); \ + !list_entry_is_head(pos, head, member); \ + pos = list_next_entry(pos, member)) + +/* + * Insert a new entry between two known consecutive entries. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static inline void __list_add(struct ub_list *new, + struct ub_list *prev, + struct ub_list *next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + +/** + * ub_list_push_back - add a new entry at the tail of list + * @head: list head to add it before + * @new: new entry to be added + * + * Insert a new entry before the specified head. + * This is useful for implementing queues. + * Note: the argument order is (head, new), the reverse of ub_list_push_head(). + */ +static inline void ub_list_push_back(struct ub_list *head, struct ub_list *new) +{ + __list_add(new, head->prev, head); +} + +/** + * ub_list_push_head - add a new entry at the head of list + * @new: new entry to be added + * @head: list head to add it after + * + * Insert a new entry after the specified head. + * This is useful for implementing queues. 
+ */ +static inline void ub_list_push_head(struct ub_list *new, struct ub_list *head) +{ + __list_add(new, head, head->next); +} + +/* + * Delete a list entry by making the prev/next entries + * point to each other. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static inline void __list_del(struct ub_list * prev, struct ub_list * next) +{ + next->prev = prev; + prev->next = next; +} + +/** + * list_del - deletes entry from list. + * @entry: the element to delete from the list. + * Note: list_empty() on entry does not return true after this, the entry is + * in an undefined state. + */ +static inline void __list_del_entry(struct ub_list *entry) +{ + __list_del(entry->prev, entry->next); +} + +/** + * ub_list_remove - deletes entry from list. + * @entry: the element to delete from the list. + * Note: list_empty() on entry does not return true after this, the entry is + * in an undefined state. + */ +static inline void ub_list_remove(struct ub_list *entry) +{ + __list_del_entry(entry); + entry->next = NULL; + entry->prev = NULL; +} + +#endif /*TRANS_LIST_H*/ \ No newline at end of file diff --git a/how-to.txt b/how-to.txt new file mode 100644 index 0000000..e3dc8f2 --- /dev/null +++ b/how-to.txt @@ -0,0 +1,14 @@ +1, build +1.1 prerequisite: sudo apt-get install llvm clang +1.2 compile executor: cd /path/to/ebpf_vm; mkdir build; cd build; cmake ..; make +1.3 compile ebpf program: cd /path/to/ebpf_vm/ebpf_example; make + +2, setup rdma environment +2.1 change mtu of ethernet adapter: sudo ifconfig enp0s8 mtu 4200 +2.2 create roce adapter: sudo rdma link add rxe_0 type rxe netdev enp0s8 + +3, run ebpf vm +3.1 suppose server is 192.168.100.10, and client is 192.168.100.20 +3.2 run server: /path/to/ebpf_vm/build/ebpf_vm_test/vm_test -a 192.168.100.10 -p 1881 -d rxe_0 -i 1 -s 4096 -r 128 -g 1 -t 0 +3.3 run client: /path/to/ebpf_vm/build/ebpf_vm_test/vm_test -a 192.168.100.10 -p 1881 -d rxe_0 -i 1 -s 4096 -r 128 -g 
1 -t 0 -f /path/to/ebpf_vm/ebpf_example/vm_migrate.o + -- Gitee From fa5a2106a64350e3795368840d7df77a4aa667e0 Mon Sep 17 00:00:00 2001 From: Fan Xiaoping Date: Thu, 2 Mar 2023 11:08:55 +0800 Subject: [PATCH 2/5] add elf parser to get ebpf code section from elf file Signed-off-by: Fan Xiaoping --- ebpf_vm_executor/ebpf_vm_elf.c | 177 +++++++++++++++++++++++++++++++++ 1 file changed, 177 insertions(+) create mode 100644 ebpf_vm_executor/ebpf_vm_elf.c diff --git a/ebpf_vm_executor/ebpf_vm_elf.c b/ebpf_vm_executor/ebpf_vm_elf.c new file mode 100644 index 0000000..3076d2c --- /dev/null +++ b/ebpf_vm_executor/ebpf_vm_elf.c @@ -0,0 +1,177 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ebpf_vm_simulator.h" + +enum { + MP_ELF_SCN_SYMB, + MP_ELF_SCN_CODE, + MP_ELF_SCN_CODE_REL, + MP_ELF_SCN_MAX +}; + +struct mp_elf_context { + Elf *elf; + Elf64_Ehdr *elf_hdr; + struct { + Elf_Scn *scn; + Elf64_Shdr *hdr; + const Elf_Data *data; + } scn[MP_ELF_SCN_MAX]; +}; + +/* + * Look up a symbol in the symbol-table section by name. + * Names are resolved through the string table linked from the symbol-table + * header (sh_link); entries whose name cannot be resolved are skipped. + * Returns NULL when no symbol matches or the table reports a zero entry size. + */ +static Elf64_Sym *get_symbol_by_name(struct mp_elf_context *ctx, const char *name) +{ + Elf64_Shdr *symtab_hdr = ctx->scn[MP_ELF_SCN_SYMB].hdr; + Elf64_Sym *symbs = ctx->scn[MP_ELF_SCN_SYMB].data->d_buf; + int symbs_num; + + /* A zero entry size would make the entry count below divide by zero. */ + if (symtab_hdr->sh_entsize == 0) { + return NULL; + } + symbs_num = symtab_hdr->sh_size / symtab_hdr->sh_entsize; + + for (int idx = 0; idx < symbs_num; idx++) { + const char *symb_name = elf_strptr(ctx->elf, symtab_hdr->sh_link, symbs[idx].st_name); + /* elf_strptr() returns NULL for an invalid offset; never hand that to strcmp(). */ + if ((symb_name != NULL) && (strcmp(symb_name, name) == 0)) { + return &symbs[idx]; + } + } + + return NULL; +} + +/* + * Find the index of an external function in the VM's symbol table. + * The table is scanned until a NULL name or PKT_VM_MAX_SYMBS entries, whichever comes first. + * Returns PKT_VM_INVALID_FUNC_IDX when the VM has no table or the name is not found. + */ +static uint32_t get_func_idx_by_name(struct ebpf_vm *vm, const char *symb_name) +{ + struct ebpf_symbol *symb = NULL; + int idx; + + if (vm->rd.symbols == NULL) { + return PKT_VM_INVALID_FUNC_IDX; + } + + /* Bounds check comes first so symb is never read past the last table slot. */ + for (idx = 0, symb = &vm->rd.symbols[idx]; + (idx < PKT_VM_MAX_SYMBS) && (symb->name != NULL); + symb = &vm->rd.symbols[++idx]) { + if (strcmp(symb->name, symb_name) == 0) { + return idx; + } + } + + return PKT_VM_INVALID_FUNC_IDX; +} + +static int32_t get_function_offset(struct 
mp_elf_context *ctx, const char *func_name) +{ + Elf64_Sym *symb = get_symbol_by_name(ctx, func_name); + if ((symb == NULL) || ((ELF64_ST_TYPE(symb->st_info)) != STT_FUNC) || + (symb->st_shndx != elf_ndxscn(ctx->scn[MP_ELF_SCN_CODE].scn))) { + return -1; + } + + return symb->st_value / sizeof(struct ebpf_instruction); +} + +static void do_relocation(struct ebpf_vm *vm, struct mp_elf_context *ctx) +{ + Elf64_Sym *symbs = ctx->scn[MP_ELF_SCN_SYMB].data->d_buf; + Elf64_Rel *reloc_entry = ctx->scn[MP_ELF_SCN_CODE_REL].data->d_buf; + int num_reloc_entry = ctx->scn[MP_ELF_SCN_CODE_REL].data->d_size / sizeof(Elf64_Rel); + + for (int idx = 0; idx < num_reloc_entry; idx++) { + int32_t ins_offset = reloc_entry[idx].r_offset / sizeof(struct ebpf_instruction); + int sym_idx = ELF64_R_SYM(reloc_entry[idx].r_info); + const char *symb_name = elf_strptr(ctx->elf, ctx->elf_hdr->e_shstrndx, symbs[sym_idx].st_name); + int32_t func_offset = get_function_offset(ctx, symb_name); + if (func_offset >= 0) { + /*Local function call has higher priority*/ + (ebpf_vm_code(vm))[ins_offset].immediate = (func_offset - ins_offset - 1); + } else { + uint32_t func_idx = get_func_idx_by_name(vm, symb_name); + if (func_idx != PKT_VM_INVALID_FUNC_IDX) { + (ebpf_vm_code(vm))[ins_offset].immediate = func_idx; + } + } + } +} + +static int setup_elf_context(struct mp_elf_context *ctx, int32_t fd) +{ + Elf_Scn *scn = NULL; + const char *sections[MP_ELF_SCN_MAX] = {".symtab", ".text", ".rel.text"}; + + elf_version(EV_CURRENT); + ctx->elf = elf_begin(fd, ELF_C_READ, NULL); + ctx->elf_hdr = elf64_getehdr(ctx->elf); + if(ctx->elf_hdr == NULL) { + printf("Failed to get elf header\n"); + return -1; + } + + while ((scn = elf_nextscn(ctx->elf, scn)) != NULL) { + Elf64_Shdr *section_hdr = elf64_getshdr(scn); + const char *section_name = elf_strptr(ctx->elf, ctx->elf_hdr->e_shstrndx, section_hdr->sh_name); + for (int idx = 0; idx < MP_ELF_SCN_MAX; idx++) { + if (strcmp(section_name, sections[idx]) == 0) { + 
ctx->scn[idx].scn = scn; + ctx->scn[idx].hdr = section_hdr; + ctx->scn[idx].data = elf_getdata(scn, NULL); + } + } + } + + if (ctx->scn[MP_ELF_SCN_SYMB].scn == NULL || ctx->scn[MP_ELF_SCN_CODE].scn == NULL || + ctx->scn[MP_ELF_SCN_CODE].hdr->sh_type != SHT_PROGBITS || + ctx->scn[MP_ELF_SCN_CODE].hdr->sh_flags != (SHF_ALLOC | SHF_EXECINSTR) || + (ctx->scn[MP_ELF_SCN_CODE_REL].hdr != NULL && ctx->scn[MP_ELF_SCN_CODE_REL].hdr->sh_type != SHT_REL)) { + return -1; + } + + return 0; +} + +struct ebpf_vm *create_vm_from_elf(const char *elf_file_name) +{ + struct ebpf_vm *vm = NULL; + struct mp_elf_context ctx = {0}; + int32_t fd, main_offset; + + fd = open(elf_file_name, O_RDONLY); + if (fd < 0) { + printf("Failed to open file %s\n", elf_file_name); + return NULL; + } + + if (0 != setup_elf_context(&ctx, fd)) { + printf("Failed to setup elf context\n"); + goto exit_clean; + } + + main_offset = get_function_offset(&ctx, "vm_main"); + if (main_offset < 0) { + printf("not able to find main function\n"); + goto exit_clean; + } + + vm = create_vm((uint8_t *)ctx.scn[MP_ELF_SCN_CODE].data->d_buf, (uint32_t)ctx.scn[MP_ELF_SCN_CODE].data->d_size); + if (vm == NULL) { + printf("Failed to create vm\n"); + goto exit_clean; + } + + vm->sys_reg[EBPF_SYS_REG_PC] = main_offset; + + if (ctx.scn[MP_ELF_SCN_CODE_REL].scn != NULL) { + do_relocation(vm, &ctx); + } + +exit_clean: + if (ctx.elf != NULL) { + elf_end(ctx.elf); + } + + close(fd); + return vm; +} \ No newline at end of file -- Gitee From 5096becab2fa63fd14eca551c9c71dcf8b10de13 Mon Sep 17 00:00:00 2001 From: Fan Xiaoping Date: Thu, 2 Mar 2023 11:11:13 +0800 Subject: [PATCH 3/5] add ebpf vm basic kernel functions Signed-off-by: Fan Xiaoping --- ebpf_vm_executor/ebpf_vm_functions.c | 195 +++++++++++++++++++++++++++ 1 file changed, 195 insertions(+) create mode 100644 ebpf_vm_executor/ebpf_vm_functions.c diff --git a/ebpf_vm_executor/ebpf_vm_functions.c b/ebpf_vm_executor/ebpf_vm_functions.c new file mode 100644 index 0000000..bc5bb88 
--- /dev/null +++ b/ebpf_vm_executor/ebpf_vm_functions.c @@ -0,0 +1,195 @@ +#include +#include +#include +#include +#include +#include + +#define PKT_VM_EXECUTOR 1 + +#include "ebpf_vm_simulator.h" +#include "ebpf_vm_functions.h" + +static void address_monitor_list_add(uint64_t type, uint64_t monitor_address, uint64_t value, uint64_t tag, struct ebpf_vm *vm) +{ + struct address_monitor_entry *new_entry = NULL; + new_entry = calloc(1, sizeof(*new_entry)); + if (new_entry == NULL) { + return; + } + + new_entry->type = type; + new_entry->address = monitor_address; + new_entry->value = value; + new_entry->tag = tag; + ub_list_push_head(&new_entry->list, &vm->address_monitor_list); +} + +static struct address_monitor_entry *address_monitor_list_find(struct ebpf_vm *vm, uint64_t monitor_address) +{ + struct address_monitor_entry *entry, *tmp = NULL; + if (ub_list_is_empty(&vm->address_monitor_list)) { + return NULL; + } + UB_LIST_FOR_EACH_SAFE(entry, tmp, list, &vm->address_monitor_list) { + if (entry->address == monitor_address) { + return entry; + } + } + return NULL; +} + +static uint64_t ebpf_func_empty(ARG_NOT_USED_5, struct ebpf_vm *vm) +{ + printf("Warning: function is not resolved\n"); + return 0; +} + +static uint64_t ebpf_func_debug_print(uint64_t s, ARG_NOT_USED_4, struct ebpf_vm *vm) +{ + printf("vm debug: %ld\n", s); + return 0; +} + +static uint64_t ebpf_func_mmap(uint64_t va, uint64_t size, ARG_NOT_USED_3, struct ebpf_vm *vm) +{ + int idx = vm->sys_reg[EBPF_SYS_REG_PAGE_TABLE_IDX]; + + for (uint64_t index = 1; index < BUCKET_ENTRIES; index++) { + if (vm->page_table[idx].entries[index].size == 0) { + vm->page_table[idx].entries[index].va = va; + vm->page_table[idx].entries[index].size = size; + return (index << INDEX_SHIFT); + } + } + + return PAGE_TABLE_ERROR; +} + +static uint64_t ebpf_func_wait_for_address_event(ARG_NOT_USED_5, struct ebpf_vm *vm) +{ + struct address_monitor_entry *e = NULL; + uint64_t *host_va = NULL; + + UB_LIST_FOR_EACH(e, list, 
&vm->address_monitor_list) { + host_va = (uint64_t *)vm_mmu(e->address, vm); + if (((e->type == MONITOR_T_BIGGER_THAN_VALUE) && (*host_va <= e->value)) || + ((e->type == MONITOR_T_LESS_THAN_VALUE) && (*host_va >= e->value)) || + ((e->type == MONITOR_T_EQUAL_VALUE) && (*host_va != e->value)) || + ((e->type == MONITOR_T_NOT_EQUAL_VALUE) && (*host_va == e->value))) { + continue; + } + + update_vm_state(vm, VM_STATE_RUNNING); + return e->tag; + } + + update_vm_state(vm, VM_STATE_WAIT_FOR_ADDRESS); + return 0; +} + +static uint64_t ebpf_func_monitor_address(uint64_t type, uint64_t target_address, uint64_t value, uint64_t tag, ARG_NOT_USED_1, struct ebpf_vm *vm) +{ + struct address_monitor_entry *entry, *tmp = NULL; + entry = address_monitor_list_find(vm, target_address); + if (type == MONITOR_T_CLEAR) { + if (target_address == 0x0) { + UB_LIST_FOR_EACH_SAFE(entry, tmp, list, &vm->address_monitor_list) { + ub_list_remove(&entry->list); + free(entry); + } + return 0; + } + if (entry != NULL) { + ub_list_remove(&entry->list); + free(entry); + } + } else { + if (target_address == 0x0) { + return -1; + } + if (entry == NULL) { + (void)address_monitor_list_add(type, target_address, value, tag, vm); + } else { + entry->type = type; + entry->value = value; + entry->tag = tag; + } + } + + return 0; +} + +static uint64_t ebpf_func_migrate_to(uint64_t dst, ARG_NOT_USED_4, struct ebpf_vm *vm) +{ + struct ebpf_vm_executor *executor = vm->rd.executor; + struct transport_message send_msg; + struct ub_address *addr = NULL; + int ret; + + send_msg.buf = vm; + send_msg.buf_size = sizeof(struct ebpf_vm) + vm->code_size + vm->stack_size + vm->data_size; + addr = (struct ub_address *)vm_mmu(dst, vm); + + ret = executor->transport->send(executor->transport_ctx, (struct node_url *)addr->url, &send_msg); + if (ret != send_msg.buf_size) { + printf("Failed to migrate vm."); + } + + update_vm_state(vm, VM_STATE_EXIT); +} + +static uint64_t ebpf_func_clone_to(uint64_t dst_list, uint64_t len, 
ARG_NOT_USED_3, struct ebpf_vm *vm) +{ + struct ebpf_vm_executor *executor = vm->rd.executor; + struct transport_message send_msg; + struct ub_address *target_list = NULL; + + send_msg.buf = vm; + send_msg.buf_size = sizeof(struct ebpf_vm) + vm->code_size + vm->stack_size + vm->data_size; + target_list = (struct ub_address *)vm_mmu(dst_list, vm); + + /* idx has the same type as len so the comparison is not mixed signed/unsigned. */ + for (uint64_t idx = 0; idx < len; idx++) { + struct node_url *dst; + int ret; + + dst = (struct node_url *)target_list[idx].url; + vm->reg[0] = idx; + ret = executor->transport->send(executor->transport_ctx, dst, &send_msg); + if (ret != send_msg.buf_size) { + printf("Failed to migrate vm."); + } + } + + return len; +} + +/* + * switch_to_address_space helper: select which page table later lookups use. + * asid is stored in the page-table-index system register, which indexes + * vm->page_table[]; that array has PAGE_TABLE_NUM elements, so larger values + * are rejected. Always returns 0. + */ +static uint64_t ebpf_func_switch_to_address_space(uint64_t asid, ARG_NOT_USED_4, struct ebpf_vm *vm) +{ + if (asid >= PAGE_TABLE_NUM) { + printf("Only 2 address space are supported."); + return 0; + } + + vm->sys_reg[EBPF_SYS_REG_PAGE_TABLE_IDX] = asid; + return 0; +} + +static uint64_t ebpf_func_memcpy(uint64_t dst, uint64_t src, uint64_t len, uint64_t completion_addr, uint64_t result, struct ebpf_vm *vm) +{ + /*TBD*/ + return 0; +} + +struct ebpf_symbol ebpf_global_symbs[PKT_VM_MAX_SYMBS] = { + {"bug", ebpf_func_empty}, + {"debug_print", ebpf_func_debug_print}, + {"mmap", ebpf_func_mmap}, + {"monitor_address", ebpf_func_monitor_address}, + {"wait_for_address_event", ebpf_func_wait_for_address_event}, + {"migrate_to", ebpf_func_migrate_to}, + {"clone_to", ebpf_func_clone_to}, + {"switch_to_address_space", ebpf_func_switch_to_address_space}, + {"memcpy", ebpf_func_memcpy}, + {NULL, NULL} +}; \ No newline at end of file -- Gitee From 113538b2fd5fcb06ee68e11bca0b126ec8697807 Mon Sep 17 00:00:00 2001 From: Fan Xiaoping Date: Thu, 2 Mar 2023 11:13:11 +0800 Subject: [PATCH 4/5] add rdma transport for ebpf vm Signed-off-by: Fan Xiaoping --- ebpf_vm_executor/ebpf_vm_transport_rdma.c | 626 ++++++++++++++++++++++ 1 file changed, 626 insertions(+) create mode 100644 ebpf_vm_executor/ebpf_vm_transport_rdma.c diff 
--git a/ebpf_vm_executor/ebpf_vm_transport_rdma.c b/ebpf_vm_executor/ebpf_vm_transport_rdma.c new file mode 100644 index 0000000..08c4be0 --- /dev/null +++ b/ebpf_vm_executor/ebpf_vm_transport_rdma.c @@ -0,0 +1,626 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ub_list.h" +#include "ebpf_vm_transport_rdma.h" + +void wire_gid_to_gid(const uint8_t *wgid, union ibv_gid *gid) +{ + uint8_t tmp[9]; + __be32 v32; + int i; + uint32_t tmp_gid[4]; + + for (tmp[8] = 0, i = 0; i < 4; i++) { + memcpy(tmp, wgid + i * 8, 8); + sscanf(tmp, "%x", &v32); + tmp_gid[i] = be32toh(v32); + } + memcpy(gid, tmp_gid, sizeof(*gid)); +} + +void gid_to_wire_gid(const union ibv_gid *gid, uint8_t wgid[]) +{ + uint32_t tmp_gid[4]; + int i; + + memcpy(tmp_gid, gid, sizeof(tmp_gid)); + for (i = 0; i < 4; ++i) { + sprintf(&wgid[i * 8], "%08x", htobe32(tmp_gid[i])); + } +} + +static void printf_rdma_addr_message(struct rdma_addr_message *msg) +{ + char gid[33]; + + inet_ntop(AF_INET6, &msg->gid, gid, sizeof(gid)); + printf("address: LID 0x%04x, QPN 0x%06x, PSN 0x%06x: GID %s\n", + msg->lid, msg->qpn, msg->psn, gid); +} + +static struct rdma_addr_info *pkt_vm_rdma_find_dest(struct pkt_vm_rdma_context *ctx, struct node_url *n) +{ + struct rdma_addr_info *e; + + UB_LIST_FOR_EACH(e, node, &ctx->dst_addr_list) { + if (memcmp(&e->key, n, sizeof(struct node_url)) == 0) { + return e; + } + } + + return NULL; +} + +static struct rdma_addr_info *pkt_vm_rdma_add_dest(struct pkt_vm_rdma_context *ctx, struct node_url *n, uint8_t *msg) +{ + struct rdma_addr_info *dst; + uint8_t gid_str[GID_STR_SIZE]; + struct ibv_ah_attr ah_attr = {0}; + + dst = malloc(sizeof(*dst)); + if (dst == NULL) { + perror("Failed to allocate memory"); + return NULL; + } + + dst->key.ip = n->ip; + dst->key.port = n->port; + dst->key.reserved = 0; + + sscanf(msg, "%x:%x:%x:%s", &dst->info.lid, &dst->info.qpn, &dst->info.psn, gid_str); + wire_gid_to_gid(gid_str, 
&dst->info.gid); + + if (dst->info.gid.global.interface_id) { + ah_attr.is_global = 1; + ah_attr.grh.hop_limit = 1; + ah_attr.grh.dgid = dst->info.gid; + ah_attr.grh.sgid_index = ctx->cfg.gid_index; + } + + ah_attr.dlid = dst->info.lid; + ah_attr.port_num = ctx->cfg.ib_port; + dst->ah = ibv_create_ah(ctx->pd, &ah_attr); + if (!dst->ah) { + perror("Failed to create AH"); + return NULL; + } + + ub_list_push_back(&ctx->dst_addr_list, &dst->node); + printf_rdma_addr_message(&dst->info); + return dst; +} + +static void *pkt_vm_rdma_server_main(void *arg) +{ + struct pkt_vm_rdma_context *ctx = arg; + struct sockaddr_in name; + int sockfd, reuse_addr; + + sockfd = socket(AF_INET, SOCK_STREAM, 0); + if (sockfd < 0) { + perror("Failed to create socket"); + return NULL; + } + + reuse_addr = 1; + setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &reuse_addr, sizeof(reuse_addr)); + + name.sin_family = AF_INET; + name.sin_port = ctx->cfg.self_url.port; + name.sin_addr.s_addr = ctx->cfg.self_url.ip; + + printf("server listen on port: %d\n", ntohs(name.sin_port)); + + if (bind(sockfd, (struct sockaddr *)&name, sizeof(name)) < 0) { + perror("Failed to bind socket"); + close(sockfd); + return NULL; + } + + if (listen(sockfd, 10) < 0) { + perror("Failed to listen on socket"); + close(sockfd); + return NULL; + } + + while (ctx->state.should_stop == 0) { + uint8_t msg[sizeof(EXCH_MSG_PATTERN)]; + int connfd, n; + + connfd = accept(sockfd, NULL, NULL); + if (connfd < 0) { + perror("Failed to accept new connection"); + continue; + } + + n = read(connfd, msg, sizeof(msg)); + if (n != sizeof(msg)) { + perror("Couldn't read remote address"); + close(connfd); + continue; + } + + n = sprintf(msg, "%04x:%06x:%06x:", ctx->local_addr.lid, + ctx->local_addr.qpn, ctx->local_addr.psn); + gid_to_wire_gid(&ctx->local_addr.gid, (msg + n)); + + if (write(connfd, msg, sizeof(msg)) != sizeof(msg) || + read(connfd, msg, sizeof(msg)) != sizeof("done")) { + perror("Couldn't rea/write remote address"); + } + + 
close(connfd); + } + + close(sockfd); +} + +static struct rdma_addr_info *pkt_vm_rdma_get_node_info(struct pkt_vm_rdma_context *ctx, struct node_url *server_url) +{ + uint8_t msg[sizeof(EXCH_MSG_PATTERN)]; + struct sockaddr_in name = {0}; + int sockfd, n; + + sockfd = socket(AF_INET, SOCK_STREAM, 0); + if (sockfd < 0) { + perror("Failed to create socket"); + return NULL; + } + + name.sin_family = AF_INET; + name.sin_port = server_url->port; + name.sin_addr.s_addr = server_url->ip; + + if (connect(sockfd, (struct sockaddr *)&name, sizeof(name)) < 0) { + char svr[32]; + inet_ntop(AF_INET, &name.sin_addr.s_addr, svr, sizeof(svr)); + printf("server = %s, port = %d\n", svr, ntohs(name.sin_port)); + + perror("Failed to connect to server"); + close(sockfd); + return NULL; + } + + n = sprintf(msg, "%04x:%06x:%06x:", ctx->local_addr.lid, + ctx->local_addr.qpn, ctx->local_addr.psn); + gid_to_wire_gid(&ctx->local_addr.gid, (msg + n)); + + if (write(sockfd, msg, sizeof(msg)) != sizeof(msg)) { + perror("Couldn't send local address"); + close(sockfd); + return NULL; + } + + if (read(sockfd, msg, sizeof(msg)) != sizeof(msg) || + write(sockfd, "done", sizeof("done")) != sizeof("done")) { + perror("Couldn't rea/write remote address"); + close(sockfd); + return NULL; + } + + close(sockfd); + return pkt_vm_rdma_add_dest(ctx, server_url, msg); +} + +static int pkt_vm_rdma_enable_qp(struct pkt_vm_rdma_context *ctx) +{ + struct ibv_qp_attr attr = { + .qp_state = IBV_QPS_RTR + }; + + if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE)) { + fprintf(stderr, "Failed to modify QP to RTR\n"); + return 1; + } + + attr.qp_state = IBV_QPS_RTS; + attr.sq_psn = ctx->local_addr.psn; + + if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE | IBV_QP_SQ_PSN)) { + fprintf(stderr, "Failed to modify QP to RTS\n"); + return 1; + } + + return 0; +} + +static int pkt_vm_rdma_post_recv(struct pkt_vm_rdma_context *ctx, uint8_t *buf) +{ + struct ibv_sge list = { + .addr = (uintptr_t)buf, + .length = 
ctx->cfg.max_msg_size, + .lkey = ctx->mr->lkey + }; + struct ibv_recv_wr wr = { + .wr_id = (uint64_t)buf, + .sg_list = &list, + .num_sge = 1, + }; + struct ibv_recv_wr *bad_wr; + + return ibv_post_recv(ctx->qp, &wr, &bad_wr); +} + +static struct pkt_vm_rdma_context *pkt_vm_rdma_init_ctx(struct rdma_transport_config *cfg) +{ + struct ibv_device **dev_list; + struct ibv_device *ib_dev = NULL; + struct pkt_vm_rdma_context *ctx; + int idx; + + dev_list = ibv_get_device_list(NULL); + if (!dev_list) { + perror("Failed to get IB device list"); + return NULL; + } + + for (idx = 0; dev_list[idx]; idx++) { + if (!strcmp(ibv_get_device_name(dev_list[idx]), cfg->ib_devname)) { + ib_dev = dev_list[idx]; + break; + } + } + + if (!ib_dev) { + fprintf(stderr, "IB device %s not found.\n", cfg->ib_devname); + return NULL; + } + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + return NULL; + } + + memcpy(&ctx->cfg, cfg, sizeof(ctx->cfg)); + ctx->send_flags = IBV_SEND_SIGNALED; + ctx->rx_depth = cfg->rx_depth; + ctx->buf_size = 2 * cfg->rx_depth * cfg->max_msg_size; + ub_list_init(&ctx->dst_addr_list); + + ctx->buf = calloc(1, ctx->buf_size); + if (!ctx->buf) { + fprintf(stderr, "Failed to allocate recv buf.\n"); + goto clean_ctx; + } + ctx->send_buf = ctx->buf + cfg->rx_depth * cfg->max_msg_size; + ctx->send_offset = 0; + + ctx->context = ibv_open_device(ib_dev); + if (!ctx->context) { + fprintf(stderr, "Couldn't get context for %s\n", ibv_get_device_name(ib_dev)); + goto clean_buffer; + } + + { + struct ibv_port_attr port_info = {}; + int mtu; + + if (ibv_query_port(ctx->context, cfg->ib_port, &port_info)) { + fprintf(stderr, "Unable to query port info for port %d\n", cfg->ib_port); + goto clean_device; + } + mtu = 1 << (port_info.active_mtu + 7); + if (cfg->max_msg_size > mtu) { + fprintf(stderr, "Requested size larger than port MTU (%d)\n", mtu); + goto clean_device; + } + } + + if (cfg->use_event) { + ctx->channel = ibv_create_comp_channel(ctx->context); + if (!ctx->channel) { + 
fprintf(stderr, "Couldn't create completion channel\n"); + goto clean_device; + } + } else { + ctx->channel = NULL; + } + + ctx->pd = ibv_alloc_pd(ctx->context); + if (!ctx->pd) { + fprintf(stderr, "Couldn't allocate PD\n"); + goto clean_comp_channel; + } + + ctx->mr = ibv_reg_mr(ctx->pd, ctx->buf, ctx->buf_size, IBV_ACCESS_LOCAL_WRITE); + if (!ctx->mr) { + fprintf(stderr, "Couldn't register MR\n"); + goto clean_pd; + } + + ctx->cq = ibv_create_cq(ctx->context, cfg->rx_depth + 1, NULL, ctx->channel, 0); + if (!ctx->cq) { + fprintf(stderr, "Couldn't create CQ\n"); + goto clean_mr; + } + + { + struct ibv_qp_attr attr; + struct ibv_qp_init_attr init_attr = { + .send_cq = ctx->cq, + .recv_cq = ctx->cq, + .cap = { + .max_send_wr = 1, + .max_recv_wr = cfg->rx_depth, + .max_send_sge = 1, + .max_recv_sge = 1 + }, + .qp_type = IBV_QPT_UD, + }; + + ctx->qp = ibv_create_qp(ctx->pd, &init_attr); + if (!ctx->qp) { + fprintf(stderr, "Couldn't create QP\n"); + goto clean_cq; + } + + ibv_query_qp(ctx->qp, &attr, IBV_QP_CAP, &init_attr); + if (init_attr.cap.max_inline_data >= cfg->max_msg_size) { + ctx->send_flags |= IBV_SEND_INLINE; + } + } + + { + struct ibv_qp_attr attr = { + .qp_state = IBV_QPS_INIT, + .pkey_index = 0, + .port_num = cfg->ib_port, + .qkey = 0x11111111 + }; + + if (ibv_modify_qp(ctx->qp, &attr, + IBV_QP_STATE | + IBV_QP_PKEY_INDEX | + IBV_QP_PORT | + IBV_QP_QKEY)) { + fprintf(stderr, "Failed to modify QP to INIT\n"); + goto clean_qp; + } + } + + return ctx; + +clean_qp: + ibv_destroy_qp(ctx->qp); + +clean_cq: + ibv_destroy_cq(ctx->cq); + +clean_mr: + ibv_dereg_mr(ctx->mr); + +clean_pd: + ibv_dealloc_pd(ctx->pd); + +clean_comp_channel: + if (ctx->channel) + ibv_destroy_comp_channel(ctx->channel); + +clean_device: + ibv_close_device(ctx->context); + +clean_buffer: + free(ctx->buf); + +clean_ctx: + free(ctx); + + return NULL; +} + +static int pkt_vm_rdma_get_local_addr(struct pkt_vm_rdma_context *ctx) +{ + struct rdma_transport_config *cfg = &ctx->cfg; + + if 
(ibv_query_port(ctx->context, cfg->ib_port, &ctx->portinfo)) { + fprintf(stderr, "Couldn't get port info\n"); + return 1; + } + + ctx->local_addr.lid = ctx->portinfo.lid; + ctx->local_addr.qpn = ctx->qp->qp_num; + ctx->local_addr.psn = lrand48() & 0xffffff; + + if (cfg->gid_index >= 0) { + if (ibv_query_gid(ctx->context, cfg->ib_port, cfg->gid_index, &ctx->local_addr.gid)) { + fprintf(stderr, "Couldn't get local gid for gid index %d\n", cfg->gid_index); + return 1; + } + } else { + memset(&ctx->local_addr.gid, 0, sizeof(ctx->local_addr.gid)); + } + + printf_rdma_addr_message(&ctx->local_addr); + return 0; + +} + +int pkt_vm_rdma_send(void *info, struct node_url *n, struct transport_message *msg) +{ + struct pkt_vm_rdma_context *ctx = info; + struct rdma_addr_info *dst = pkt_vm_rdma_find_dest(ctx, n); + struct ibv_sge list = {0}; + struct ibv_send_wr wr = {0}; + struct ibv_send_wr *bad_wr; + + if (msg->buf_size > ctx->cfg.max_msg_size) { + printf("Message is too big to send.\n"); + return 0; + } + + if (dst == NULL) { + dst = pkt_vm_rdma_get_node_info(ctx, n); + if (dst == NULL) { + perror("Failed to get destination information"); + return 0; + } + } + + memcpy(ctx->send_buf + ctx->send_offset, msg->buf, msg->buf_size); + + list.addr = (uintptr_t)(ctx->send_buf + ctx->send_offset); + list.length = msg->buf_size; + list.lkey = ctx->mr->lkey; + + wr.wr_id = (uint64_t)list.addr; + wr.sg_list = &list; + wr.num_sge = 1; + wr.opcode = IBV_WR_SEND; + wr.send_flags = ctx->send_flags; + wr.wr.ud.ah = dst->ah; + wr.wr.ud.remote_qpn = dst->info.qpn; + wr.wr.ud.remote_qkey = 0x11111111; + + if (ibv_post_send(ctx->qp, &wr, &bad_wr) == 0) { + ctx->send_offset = (ctx->send_offset + ctx->cfg.max_msg_size) % (ctx->cfg.max_msg_size * ctx->cfg.rx_depth); + return msg->buf_size; + } else { + return 0; + } +} + +int pkt_vm_rdma_recv(void *info, struct transport_message *msg) +{ + struct pkt_vm_rdma_context *ctx = info; + struct ibv_wc wc; + + if (ibv_poll_cq(ctx->cq, 1, &wc) <= 0) { + 
return 0; + } + + if (wc.status != IBV_WC_SUCCESS) { + printf("wc failure status = %d.\n", wc.status); + return 0; + } + + if (wc.opcode != IBV_WC_RECV) { + if (wc.opcode != IBV_WC_SEND) { + printf("wc failure opcode = %d.\n", wc.opcode); + } + + return 0; + } + + msg->buf = (void *)((char *)wc.wr_id + UD_GRH_SIZE); + msg->buf_size = wc.byte_len - UD_GRH_SIZE; + + return msg->buf_size; +} + +static void pkt_vm_rdma_return_buf(void *info, struct transport_message *msg) +{ + pkt_vm_rdma_post_recv(info, msg->buf); +} + +static void pkt_vm_rdma_exit(void *info) +{ + struct pkt_vm_rdma_context *ctx = info; + struct rdma_addr_info *dst, *tmp; + + if (ctx->server_thread != (pthread_t)0) { + ctx->state.should_stop = 1; + pthread_join(ctx->server_thread, NULL); + } + + if (ibv_destroy_qp(ctx->qp)) { + fprintf(stderr, "Couldn't destroy OP\n"); + return; + } + + if (ibv_destroy_cq(ctx->cq)) { + fprintf(stderr, "Couldn't destroy CQ\n"); + return; + } + + if (ibv_dereg_mr(ctx->mr)) { + fprintf(stderr, "Couldn't deregister MR\n"); + return; + } + + UB_LIST_FOR_EACH_SAFE(dst, tmp, node, &ctx->dst_addr_list) { + ub_list_remove(&dst->node); + + if (ibv_destroy_ah(dst->ah)) { + perror("Couldn't destroy AH"); + } + + free(dst); + } + + if (ibv_dealloc_pd(ctx->pd)) { + fprintf(stderr, "Couldn't deallocate PD"); + return; + } + + if (ctx->channel) { + if (ibv_destroy_comp_channel(ctx->channel)) { + fprintf(stderr, "Couldn't destroy completion channel\n"); + return; + } + } + + if (ibv_close_device(ctx->context)) { + fprintf(stderr, "Couldn't release context\n"); + return; + } + + free(ctx->buf); + free(ctx); +} + +static void *pkt_vm_rdma_init(struct transport_config *cfg) +{ + struct pkt_vm_rdma_context *ctx = NULL; + int idx, ret; + + srand48(getpid() * time(NULL)); + + ctx = pkt_vm_rdma_init_ctx(&cfg->rdma_cfg); + if (ctx == NULL) { + printf("Failed to create rdma context.\n"); + return NULL; + } + + for (idx = 0; idx < cfg->rdma_cfg.rx_depth; idx++) { + ret = 
pkt_vm_rdma_post_recv(ctx, (ctx->buf + (idx * cfg->rdma_cfg.max_msg_size))); + if (ret != 0) { + perror("Failed to post recv buffer"); + } + } + + pkt_vm_rdma_get_local_addr(ctx); + + ret = pthread_create(&ctx->server_thread, NULL, pkt_vm_rdma_server_main, ctx); + if (ret != 0) { + perror("Failed to create server thread"); + pkt_vm_rdma_exit(ctx); + return NULL; + } + + pkt_vm_rdma_enable_qp(ctx); + return ctx; +} + +static struct transport_ops rdma_ops = { + .type = PKT_VM_TRANSPORT_TYPE_RDMA, + .init = pkt_vm_rdma_init, + .exit = pkt_vm_rdma_exit, + .send = pkt_vm_rdma_send, + .recv = pkt_vm_rdma_recv, + .return_buf = pkt_vm_rdma_return_buf, +}; + +static __attribute__((constructor)) void pkt_vm_rdma_register_transport(void) +{ + register_transport(&rdma_ops); +} \ No newline at end of file -- Gitee From c8e068fb23fc95dd678cfea3ed0e75f5f977d081 Mon Sep 17 00:00:00 2001 From: Fan Xiaoping Date: Thu, 2 Mar 2023 11:15:25 +0800 Subject: [PATCH 5/5] add ebpf vm runner: ebpf instruction interpreter Signed-off-by: Fan Xiaoping --- ebpf_vm_executor/ebpf_vm_simulator.c | 580 +++++++++++++++++++++++++++ 1 file changed, 580 insertions(+) create mode 100644 ebpf_vm_executor/ebpf_vm_simulator.c diff --git a/ebpf_vm_executor/ebpf_vm_simulator.c b/ebpf_vm_executor/ebpf_vm_simulator.c new file mode 100644 index 0000000..8f0c3d0 --- /dev/null +++ b/ebpf_vm_executor/ebpf_vm_simulator.c @@ -0,0 +1,580 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include "ub_list.h" + +#define PKT_VM_EXECUTOR 1 + +#include "ebpf_vm_simulator.h" +#include "ebpf_vm_transport.h" +#include "ebpf_vm_functions.h" + +struct transport_ops *registered_transport[PKT_VM_TRANSPORT_TYPE_MAX]; + +static uint64_t to_little_endian(uint64_t *v, uint32_t width) +{ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + return *v; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + uint8_t *p = (uint8_t *)(v + 1); + uint64_t result = 0; + for (int idx = 0; idx < (width / 8); idx++) { + 
result |= *(--p); + result = result << 8; + } + return result; +#else +#error unsupported endianess +#endif +} + +static uint64_t to_big_endian(uint64_t *v, uint32_t width) +{ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + uint8_t *p = (uint8_t *)v; + uint64_t result = 0; + for (int idx = 0; idx < (width / 8); idx++) { + result |= *(p++); + result = result << 8; + } + return result; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + return *v; +#else +#error unsupported endianess +#endif +} + +uint64_t vm_mmu(uint64_t va, struct ebpf_vm *vm) +{ + struct vm_pte *e = NULL; + uint64_t offset = va & ENTRY_MASK; + int idx = vm->sys_reg[EBPF_SYS_REG_PAGE_TABLE_IDX]; + + if ((va >> PACKET_VA_SHIFT) != 0) { + return PAGE_TABLE_ERROR; + } + + e = &vm->page_table[idx].entries[(va >> INDEX_SHIFT)]; + if ((e->va != 0x00) && (offset < e->size)) { + return e->va + offset; + } + + return PAGE_TABLE_ERROR; +} + +void update_vm_state(struct ebpf_vm *vm, int state) +{ + vm->state.vm_state = state; +} + +static void save_caller_register(struct ebpf_vm *vm) +{ + uint64_t *fp = (uint64_t *)vm_mmu(vm->reg[EBPF_REG_FP], vm); + *fp++ = vm->reg[EBPF_REG_6]; + *fp++ = vm->reg[EBPF_REG_7]; + *fp++ = vm->reg[EBPF_REG_8]; + *fp++ = vm->reg[EBPF_REG_9]; + *fp++ = vm->sys_reg[EBPF_SYS_REG_LR]; +} + +static void restore_caller_register(struct ebpf_vm *vm) +{ + uint64_t *fp = (uint64_t *)vm_mmu(vm->reg[EBPF_REG_FP], vm); + vm->reg[EBPF_REG_6] = *fp++; + vm->reg[EBPF_REG_7] = *fp++; + vm->reg[EBPF_REG_8] = *fp++; + vm->reg[EBPF_REG_9] = *fp++; + vm->sys_reg[EBPF_SYS_REG_LR] = *fp++; +} + +uint64_t run_ebpf_vm(struct ebpf_vm *vm) +{ + struct ebpf_instruction *ins = ebpf_vm_code(vm) + vm->sys_reg[EBPF_SYS_REG_PC]; + + while (1) { + switch (ins->opcode) { + case (EBPF_CLS_ALU64 | EBPF_ALU_OP_ADD | EBPF_SRC_IS_IMM): { + vm->reg[ins->dst_reg] += (uint64_t)ins->immediate; + break; + } + case (EBPF_CLS_ALU64 | EBPF_ALU_OP_ADD | EBPF_SRC_IS_REG): { + vm->reg[ins->dst_reg] += vm->reg[ins->src_reg]; + break; 
+ } + case (EBPF_CLS_ALU64 | EBPF_ALU_OP_SUB | EBPF_SRC_IS_IMM): { + vm->reg[ins->dst_reg] -= (uint64_t)ins->immediate; + break; + } + case (EBPF_CLS_ALU64 | EBPF_ALU_OP_SUB | EBPF_SRC_IS_REG): { + vm->reg[ins->dst_reg] -= vm->reg[ins->src_reg]; + break; + } + case (EBPF_CLS_ALU64 | EBPF_ALU_OP_MUL | EBPF_SRC_IS_IMM): { + vm->reg[ins->dst_reg] *= (uint64_t)ins->immediate; + break; + } + case (EBPF_CLS_ALU64 | EBPF_ALU_OP_MUL | EBPF_SRC_IS_REG): { + vm->reg[ins->dst_reg] *= vm->reg[ins->src_reg]; + break; + } + case (EBPF_CLS_ALU64 | EBPF_ALU_OP_DIV | EBPF_SRC_IS_IMM): { + vm->reg[ins->dst_reg] /= (uint64_t)ins->immediate; + break; + } + case (EBPF_CLS_ALU64 | EBPF_ALU_OP_DIV | EBPF_SRC_IS_REG): { + vm->reg[ins->dst_reg] /= vm->reg[ins->src_reg]; + break; + } + case (EBPF_CLS_ALU64 | EBPF_ALU_OP_OR | EBPF_SRC_IS_IMM): { + vm->reg[ins->dst_reg] |= (uint64_t)ins->immediate; + break; + } + case (EBPF_CLS_ALU64 | EBPF_ALU_OP_OR | EBPF_SRC_IS_REG): { + vm->reg[ins->dst_reg] |= vm->reg[ins->src_reg]; + break; + } + case (EBPF_CLS_ALU64 | EBPF_ALU_OP_AND | EBPF_SRC_IS_IMM): { + vm->reg[ins->dst_reg] &= (uint64_t)ins->immediate; + break; + } + case (EBPF_CLS_ALU64 | EBPF_ALU_OP_AND | EBPF_SRC_IS_REG): { + vm->reg[ins->dst_reg] &= vm->reg[ins->src_reg]; + break; + } + case (EBPF_CLS_ALU64 | EBPF_ALU_OP_LSH | EBPF_SRC_IS_IMM): { + vm->reg[ins->dst_reg] <<= (uint64_t)ins->immediate; + break; + } + case (EBPF_CLS_ALU64 | EBPF_ALU_OP_LSH | EBPF_SRC_IS_REG): { + vm->reg[ins->dst_reg] <<= vm->reg[ins->src_reg]; + break; + } + case (EBPF_CLS_ALU64 | EBPF_ALU_OP_RSH | EBPF_SRC_IS_IMM): { + vm->reg[ins->dst_reg] >>= (uint64_t)ins->immediate; + break; + } + case (EBPF_CLS_ALU64 | EBPF_ALU_OP_RSH | EBPF_SRC_IS_REG): { + vm->reg[ins->dst_reg] >>= vm->reg[ins->src_reg]; + break; + } + case (EBPF_CLS_ALU64 | EBPF_ALU_OP_NEG): { + vm->reg[ins->dst_reg] = (uint64_t)(-vm->reg[ins->dst_reg]); + break; + } + case (EBPF_CLS_ALU64 | EBPF_ALU_OP_MOD | EBPF_SRC_IS_IMM): { + vm->reg[ins->dst_reg] 
%= (uint64_t)ins->immediate; + break; + } + case (EBPF_CLS_ALU64 | EBPF_ALU_OP_MOD | EBPF_SRC_IS_REG): { + vm->reg[ins->dst_reg] %= vm->reg[ins->src_reg]; + break; + } + case (EBPF_CLS_ALU64 | EBPF_ALU_OP_XOR | EBPF_SRC_IS_IMM): { + vm->reg[ins->dst_reg] ^= (uint64_t)ins->immediate; + break; + } + case (EBPF_CLS_ALU64 | EBPF_ALU_OP_XOR | EBPF_SRC_IS_REG): { + vm->reg[ins->dst_reg] ^= vm->reg[ins->src_reg]; + break; + } + case (EBPF_CLS_ALU64 | EBPF_ALU_OP_MOV | EBPF_SRC_IS_IMM): { + vm->reg[ins->dst_reg] = (uint64_t)ins->immediate; + break; + } + case (EBPF_CLS_ALU64 | EBPF_ALU_OP_MOV | EBPF_SRC_IS_REG): { + vm->reg[ins->dst_reg] = vm->reg[ins->src_reg]; + break; + } + case (EBPF_CLS_ALU64 | EBPF_ALU_OP_ARSH | EBPF_SRC_IS_IMM): { + vm->reg[ins->dst_reg] = (int64_t)vm->reg[ins->dst_reg] >> (int64_t)ins->immediate; + break; + } + case (EBPF_CLS_ALU64 | EBPF_ALU_OP_ARSH | EBPF_SRC_IS_REG): { + vm->reg[ins->dst_reg] = (int64_t)vm->reg[ins->dst_reg] >> (int64_t)vm->reg[ins->src_reg]; + break; + } + case (EBPF_CLS_JMP | EBPF_JMP_OP_JA): { + ins += ins->offset; + break; + } + case (EBPF_CLS_JMP | EBPF_JMP_OP_JEQ | EBPF_SRC_IS_IMM): { + ins += ((uint64_t)vm->reg[ins->dst_reg] == (uint64_t)ins->immediate) ? ins->offset : 0; + break; + } + case (EBPF_CLS_JMP | EBPF_JMP_OP_JEQ | EBPF_SRC_IS_REG): { + ins += ((uint64_t)vm->reg[ins->dst_reg] == (uint64_t)vm->reg[ins->src_reg]) ? ins->offset : 0; + break; + } + case (EBPF_CLS_JMP | EBPF_JMP_OP_JGT | EBPF_SRC_IS_IMM): { + ins += ((uint64_t)vm->reg[ins->dst_reg] > (uint64_t)ins->immediate) ? ins->offset : 0; + break; + } + case (EBPF_CLS_JMP | EBPF_JMP_OP_JGT | EBPF_SRC_IS_REG): { + ins += ((uint64_t)vm->reg[ins->dst_reg] > (uint64_t)vm->reg[ins->src_reg]) ? ins->offset : 0; + break; + } + case (EBPF_CLS_JMP | EBPF_JMP_OP_JGE | EBPF_SRC_IS_IMM): { + ins += ((uint64_t)vm->reg[ins->dst_reg] >= (uint64_t)ins->immediate) ? 
ins->offset : 0; + break; + } + case (EBPF_CLS_JMP | EBPF_JMP_OP_JGE | EBPF_SRC_IS_REG): { + ins += ((uint64_t)vm->reg[ins->dst_reg] >= (uint64_t)vm->reg[ins->src_reg]) ? ins->offset : 0; + break; + } + case (EBPF_CLS_JMP | EBPF_JMP_OP_JSET | EBPF_SRC_IS_IMM): { + ins += ((uint64_t)vm->reg[ins->dst_reg] & (uint64_t)ins->immediate) ? ins->offset : 0; + break; + } + case (EBPF_CLS_JMP | EBPF_JMP_OP_JSET | EBPF_SRC_IS_REG): { + ins += ((uint64_t)vm->reg[ins->dst_reg] & (uint64_t)vm->reg[ins->src_reg]) ? ins->offset : 0; + break; + } + case (EBPF_CLS_JMP | EBPF_JMP_OP_JNE | EBPF_SRC_IS_IMM): { + ins += ((uint64_t)vm->reg[ins->dst_reg] != (uint64_t)ins->immediate) ? ins->offset : 0; + break; + } + case (EBPF_CLS_JMP | EBPF_JMP_OP_JNE | EBPF_SRC_IS_REG): { + ins += ((uint64_t)vm->reg[ins->dst_reg] != (uint64_t)vm->reg[ins->src_reg]) ? ins->offset : 0; + break; + } + case (EBPF_CLS_JMP | EBPF_JMP_OP_JSGT | EBPF_SRC_IS_IMM): { + ins += ((int64_t)vm->reg[ins->dst_reg] > (int64_t)ins->immediate) ? ins->offset : 0; + break; + } + case (EBPF_CLS_JMP | EBPF_JMP_OP_JSGT | EBPF_SRC_IS_REG): { + ins += ((int64_t)vm->reg[ins->dst_reg] > (int64_t)vm->reg[ins->src_reg]) ? ins->offset : 0; + break; + } + case (EBPF_CLS_JMP | EBPF_JMP_OP_JSGE | EBPF_SRC_IS_IMM): { + ins += ((int64_t)vm->reg[ins->dst_reg] >= (int64_t)ins->immediate) ? ins->offset : 0; + break; + } + case (EBPF_CLS_JMP | EBPF_JMP_OP_JSGE | EBPF_SRC_IS_REG): { + ins += ((int64_t)vm->reg[ins->dst_reg] >= (int64_t)vm->reg[ins->src_reg]) ? 
ins->offset : 0; + break; + } + case (EBPF_CLS_JMP | EBPF_JMP_OP_CALL): { + if (ins->src_reg == EBPF_PSEUDO_CALL) { + save_caller_register(vm); + vm->state.stack_depth++; + vm->reg[EBPF_REG_FP] -= EBPF_VM_STACK_FRAME_SIZE; + vm->sys_reg[EBPF_SYS_REG_LR] = ins - ebpf_vm_code(vm) + 1; + vm->sys_reg[EBPF_SYS_REG_PC] = vm->sys_reg[EBPF_SYS_REG_LR] + ins->immediate; + vm->reg[0] = run_ebpf_vm(vm); + if (vm->state.vm_state != VM_STATE_RUNNING) { + return 0; + } + } else if ((ins->immediate < PKT_VM_MAX_SYMBS) && (vm->rd.symbols[ins->immediate].func != NULL)) { + vm->sys_reg[EBPF_SYS_REG_PC] = ins - ebpf_vm_code(vm); + vm->reg[0] = vm->rd.symbols[ins->immediate].func(vm->reg[1], vm->reg[2], vm->reg[3], vm->reg[4], vm->reg[5], vm); + if (vm->state.vm_state != VM_STATE_RUNNING) { + return 0; + } + } + break; + } + case (EBPF_CLS_JMP | EBPF_JMP_OP_EXIT): { + if (vm->state.stack_depth != 0) { + vm->sys_reg[EBPF_SYS_REG_PC] = vm->sys_reg[EBPF_SYS_REG_LR]; + vm->reg[EBPF_REG_FP] += EBPF_VM_STACK_FRAME_SIZE; + vm->state.stack_depth--; + restore_caller_register(vm); + } else { + update_vm_state(vm, VM_STATE_EXIT); + } + return vm->reg[0]; + } + case (EBPF_CLS_JMP | EBPF_JMP_OP_JLT | EBPF_SRC_IS_IMM): { + ins += ((uint64_t)vm->reg[ins->dst_reg] < (uint64_t)ins->immediate) ? ins->offset : 0; + break; + } + case (EBPF_CLS_JMP | EBPF_JMP_OP_JLT | EBPF_SRC_IS_REG): { + ins += ((uint64_t)vm->reg[ins->dst_reg] < (uint64_t)vm->reg[ins->src_reg]) ? ins->offset : 0; + break; + } + case (EBPF_CLS_JMP | EBPF_JMP_OP_JLE | EBPF_SRC_IS_IMM): { + ins += ((uint64_t)vm->reg[ins->dst_reg] <= (uint64_t)ins->immediate) ? ins->offset : 0; + break; + } + case (EBPF_CLS_JMP | EBPF_JMP_OP_JLE | EBPF_SRC_IS_REG): { + ins += ((uint64_t)vm->reg[ins->dst_reg] <= (uint64_t)vm->reg[ins->src_reg]) ? ins->offset : 0; + break; + } + case (EBPF_CLS_JMP | EBPF_JMP_OP_JSLT | EBPF_SRC_IS_IMM): { + ins += ((int64_t)vm->reg[ins->dst_reg] < (int64_t)ins->immediate) ? 
ins->offset : 0; + break; + } + case (EBPF_CLS_JMP | EBPF_JMP_OP_JSLT | EBPF_SRC_IS_REG): { + ins += ((int64_t)vm->reg[ins->dst_reg] < (int64_t)vm->reg[ins->src_reg]) ? ins->offset : 0; + break; + } + case (EBPF_CLS_JMP | EBPF_JMP_OP_JSLE | EBPF_SRC_IS_IMM): { + ins += ((int64_t)vm->reg[ins->dst_reg] <= (int64_t)ins->immediate) ? ins->offset : 0; + break; + } + case (EBPF_CLS_JMP | EBPF_JMP_OP_JSLE | EBPF_SRC_IS_REG): { + ins += ((int64_t)vm->reg[ins->dst_reg] <= (int64_t)vm->reg[ins->src_reg]) ? ins->offset : 0; + break; + } + case (EBPF_CLS_LDX | EBPF_MEM | EBPF_B): { + uint64_t host_va = vm_mmu(vm->reg[ins->src_reg] + ins->offset, vm); + vm->reg[ins->dst_reg] = *((uint8_t *)host_va); + break; + } + case (EBPF_CLS_LDX | EBPF_MEM | EBPF_H): { + uint64_t host_va = vm_mmu(vm->reg[ins->src_reg] + ins->offset, vm); + vm->reg[ins->dst_reg] = *((uint16_t *)host_va); + break; + } + case (EBPF_CLS_LDX | EBPF_MEM | EBPF_W): { + uint64_t host_va = vm_mmu(vm->reg[ins->src_reg] + ins->offset, vm); + vm->reg[ins->dst_reg] = *((uint32_t *)host_va); + break; + } + case (EBPF_CLS_LDX | EBPF_MEM | EBPF_DW): { + uint64_t host_va = vm_mmu(vm->reg[ins->src_reg] + ins->offset, vm); + vm->reg[ins->dst_reg] = *((uint64_t *)host_va); + break; + } + case (EBPF_CLS_LD | EBPF_IMM | EBPF_DW): { + vm->reg[ins->dst_reg] = (uint32_t)ins[0].immediate | ((uint64_t)ins[1].immediate << 32); + ins++; + break; + } + case (EBPF_CLS_STX | EBPF_MEM | EBPF_B): { + uint64_t store_addr = vm_mmu(vm->reg[ins->dst_reg] + ins->offset, vm); + *(uint8_t *)store_addr = (uint8_t)vm->reg[ins->src_reg]; + break; + } + case (EBPF_CLS_STX | EBPF_MEM | EBPF_H): { + uint64_t store_addr = vm_mmu(vm->reg[ins->dst_reg] + ins->offset, vm); + *(uint16_t *)store_addr = (uint16_t)vm->reg[ins->src_reg]; + break; + } + case (EBPF_CLS_STX | EBPF_MEM | EBPF_W): { + uint64_t store_addr = vm_mmu(vm->reg[ins->dst_reg] + ins->offset, vm); + *(uint32_t *)store_addr = (uint32_t)vm->reg[ins->src_reg]; + break; + } + case (EBPF_CLS_STX | 
EBPF_MEM | EBPF_DW): { + uint64_t store_addr = vm_mmu(vm->reg[ins->dst_reg] + ins->offset, vm); + *(uint64_t *)store_addr = (uint64_t)vm->reg[ins->src_reg]; + break; + } + case (EBPF_CLS_STX | EBPF_XADD | EBPF_W): { + uint64_t store_addr = vm_mmu(vm->reg[ins->dst_reg] + ins->offset, vm); + __atomic_fetch_add((uint32_t *)store_addr, (uint32_t)vm->reg[ins->src_reg], __ATOMIC_RELAXED); + break; + } + case (EBPF_CLS_STX | EBPF_XADD | EBPF_DW): { + uint64_t store_addr = vm_mmu(vm->reg[ins->dst_reg] + ins->offset, vm); + __atomic_fetch_add((uint64_t *)store_addr, (uint64_t)vm->reg[ins->src_reg], __ATOMIC_RELAXED); + break; + } + case (EBPF_CLS_ST | EBPF_MEM | EBPF_B): { + uint64_t store_addr = vm_mmu(vm->reg[ins->dst_reg] + ins->offset, vm); + *(uint8_t *)store_addr = (uint8_t)ins->immediate; + break; + } + case (EBPF_CLS_ST | EBPF_MEM | EBPF_H): { + uint64_t store_addr = vm_mmu(vm->reg[ins->dst_reg] + ins->offset, vm); + *(uint16_t *)store_addr = (uint16_t)ins->immediate; + break; + } + case (EBPF_CLS_ST | EBPF_MEM | EBPF_W): { + uint64_t store_addr = vm_mmu(vm->reg[ins->dst_reg] + ins->offset, vm); + *(uint32_t *)store_addr = (uint32_t)ins->immediate; + break; + } + case (EBPF_CLS_ST | EBPF_MEM | EBPF_DW): { + uint64_t store_addr = vm_mmu(vm->reg[ins->dst_reg] + ins->offset, vm); + *(uint64_t *)store_addr = (uint64_t)ins->immediate; + break; + } + case (EBPF_CLS_ALU | EBPF_ALU_OP_END | EBPF_TO_LE): { + vm->reg[ins->dst_reg] = to_little_endian(&vm->reg[ins->dst_reg], ins->immediate); + break; + } + case (EBPF_CLS_ALU | EBPF_ALU_OP_END | EBPF_TO_BE): { + vm->reg[ins->dst_reg] = to_big_endian(&vm->reg[ins->dst_reg], ins->immediate); + break; + } + default: { + printf("invalid ebpf opcode %x\n", ins->opcode); + update_vm_state(vm, VM_STATE_EXIT); + return 0; + } + } /*end of switch*/ + /* increase PC */ + ins++; + } /*end of while*/ + + /*should never be here*/ + return 0; +} + +static void receive_vm(struct ebpf_vm_executor *executor, void *buf, int buf_size) +{ + struct 
ebpf_vm *vm = NULL; + + if (buf_size < sizeof(struct ebpf_vm)) { + printf("vm size is too small, buf_size = %d.\n", buf_size); + return; + } + + vm = calloc(1, buf_size); + if (vm == NULL) { + printf("Failed to allocate vm for input vm.\n"); + return; + } + + memcpy(vm, buf, buf_size); + + vm->page_table[0].entries[0].va = (uint64_t)vm + vm->data; + ub_list_init(&vm->address_monitor_list); + vm->sys_reg[EBPF_SYS_REG_PC]++; + update_vm_state(vm, VM_STATE_RUNNING); + + add_vm(executor, vm); +} + +void vm_executor_run(struct ebpf_vm_executor *executor) +{ + struct ebpf_vm *vm = NULL, *tmp = NULL; + struct transport_message recv_msg; + int msg_len; + + while (executor->state.should_stop == 0) { + UB_LIST_FOR_EACH_SAFE(vm, tmp, rd.list, &executor->vm_list) { + if (vm->state.vm_state == VM_STATE_RUNNING || + vm->state.vm_state == VM_STATE_WAIT_FOR_ADDRESS) { + run_ebpf_vm(vm); + } + + if (vm->state.vm_state == VM_STATE_EXIT) { + ub_list_remove(&vm->rd.list); + destroy_vm(vm); + } + } + + msg_len = executor->transport->recv(executor->transport_ctx, &recv_msg); + if (msg_len != 0) { + receive_vm(executor, recv_msg.buf, recv_msg.buf_size); + executor->transport->return_buf(executor->transport_ctx, &recv_msg); + } + } +} + +int add_vm(struct ebpf_vm_executor *executor, struct ebpf_vm *vm) +{ + vm->rd.id = executor->next_vm_id++; + vm->rd.symbols = ebpf_global_symbs; + vm->rd.executor = executor; + ub_list_push_back(&executor->vm_list, &vm->rd.list); + return 0; +} + +struct ebpf_vm *create_vm(uint8_t *code, uint32_t code_size) +{ + struct ebpf_vm *vm = NULL; + int total_size = sizeof(struct ebpf_vm); + + total_size += code_size; + total_size += EBPF_VM_DEFAULT_STACK_SIZE; + total_size += EBPF_VM_DEFAULT_DATA_SIZE; + + vm = calloc(1, total_size); + if (vm == NULL) { + return NULL; + } + + vm->code_size = code_size; + vm->stack_size = EBPF_VM_DEFAULT_STACK_SIZE; + vm->data_size = EBPF_VM_DEFAULT_DATA_SIZE; + vm->code = sizeof(struct ebpf_vm); + vm->data = vm->code + 
vm->code_size; + vm->stack = vm->data + vm->data_size; + vm->reg[EBPF_REG_FP] = vm->data_size + vm->stack_size - EBPF_VM_STACK_FRAME_SIZE; + vm->state.next_data_to_use = 0; + + vm->page_table[0].entries[0].va = (uint64_t)vm + vm->data; + vm->page_table[0].entries[0].size = vm->data_size + vm->stack_size; + + memcpy(((uint8_t *)vm + vm->code), code, code_size); + ub_list_init(&vm->address_monitor_list); + return vm; +} + +void destroy_vm(struct ebpf_vm *vm) +{ + struct address_monitor_entry *entry, *tmp = NULL; + UB_LIST_FOR_EACH_SAFE(entry, tmp, list, &vm->address_monitor_list){ + ub_list_remove(&entry->list); + free(entry); + } + free(vm); +} + +int load_data(struct ebpf_vm *vm, uint8_t *data, uint32_t len) +{ + int remain = vm->data_size - vm->state.next_data_to_use; + int copy_len = (len < remain) ? len : remain; + + if (copy_len != 0) { + memcpy(((uint8_t *)vm + vm->data + vm->state.next_data_to_use), data, copy_len); + vm->state.next_data_to_use += copy_len; + } + + return copy_len; +} + +int register_transport(struct transport_ops *ops) +{ + registered_transport[ops->type] = ops; + return 0; +} + +void *vm_executor_init(struct ebpf_vm_executor_config *cfg) +{ + struct ebpf_vm_executor *executor = NULL; + + executor = malloc(sizeof(*executor)); + if (executor == NULL) { + perror("Failed to allocate memory"); + return NULL; + } + + ub_list_init(&executor->vm_list); + executor->state.should_stop = 0; + executor->transport = registered_transport[PKT_VM_TRANSPORT_TYPE_RDMA]; + executor->transport_ctx = executor->transport->init(&cfg->transport); + if (executor->transport_ctx == NULL) { + perror("Failed to initialize transport"); + free(executor); + return NULL; + } + + return executor; +} + +void vm_executor_destroy(struct ebpf_vm_executor *executor) +{ + struct ebpf_vm *vm, *tmp; + + if (executor->transport_ctx) { + executor->transport->exit(executor->transport_ctx); + } + + UB_LIST_FOR_EACH_SAFE(vm, tmp, rd.list, &executor->vm_list){ + 
ub_list_remove(&vm->rd.list); + free(vm); + } + + free(executor); +} \ No newline at end of file -- Gitee