diff --git a/acpo/model-ai4cfh.acpo b/acpo/model-ai4cfh.acpo
new file mode 100644
index 0000000000000000000000000000000000000000..7639ecd1b39323dc9f0400445c11eb4e3c8c1e01
--- /dev/null
+++ b/acpo/model-ai4cfh.acpo
@@ -0,0 +1,8 @@
+ModelName=AI4CFH
+Features={BlockWithMultipleSuccecorsPerLoop, float32}, {PtrArgs, float32}, {MaxDomTreeLevel, float32}, {IsLinkOnceODR, float32}, {IsLocal, float32}, {Calls, float32}, {Blocks, float32}, {InitialSize, float32}, {MaxLoopDepth, float32}, {users, float32}, {InstructionPerBlock, float32}, {Loops, float32}, {conditionally_executed_blocks, float32}, {IsLinkOnce, float32}, {basic_block_count, float32}, {PtrCallee, float32}, {CallReturnPtr, float32}, {ConditionalBranch, float32}, {CBwithArg, float32}, {CallerHeight, float32}, {CallUsage, float32}, {IsRecursive, float32}, {NumCallsiteInLoop, float32}, {NumOfCallUsesInLoop, float32}, {EntryBlockFreq, float32}, {MaxCallsiteBlockFreq, float32}, {SuccessorPerBlock, float32}, {AvgVecInstr, float32}, {AvgNestedLoopLevel, float32}, {InstrPerLoop, float32}
+Outputs={FH, int64}
+Signature=serving_default
+ModelDirectory=./models/ai4c-fh
+OutputKey=output_0
+ModelInference=
+# The ModelInference value above needs to be updated on the Python side
diff --git a/acpo/model-ai4cmemop.acpo b/acpo/model-ai4cmemop.acpo
new file mode 100644
index 0000000000000000000000000000000000000000..b60ba9a2a9bf10ff5b35ce82ceb58c3a70b84d17
--- /dev/null
+++ b/acpo/model-ai4cmemop.acpo
@@ -0,0 +1,8 @@
+ModelName=AI4CMEMOP
+Features={BlockWithMultipleSuccecorsPerLoop, float32}, {PtrArgs, float32}, {MaxDomTreeLevel, float32}, {IsLinkOnceODR, float32}, {IsLocal, float32}, {Calls, float32}, {Blocks, float32}, {InitialSize, float32}, {MaxLoopDepth, float32}, {users, float32}, {InstructionPerBlock, float32}, {Loops, float32}, {conditionally_executed_blocks, float32}, {IsLinkOnce, float32}, {basic_block_count, float32}, {PtrCallee, float32}, {CallReturnPtr, float32}, {ConditionalBranch, float32}, {CBwithArg, float32}, {CallerHeight, float32}, {CallUsage, float32}, {IsRecursive, float32}, {NumCallsiteInLoop, float32}, {NumOfCallUsesInLoop, float32}, {EntryBlockFreq, float32}, {MaxCallsiteBlockFreq, float32}, {SuccessorPerBlock, float32}, {AvgVecInstr, float32}, {AvgNestedLoopLevel, float32}, {InstrPerLoop, float32}, {ends_with_branch, float32}, {ends_with_cond_branch, float32}, {ends_with_return, float32}, {ends_with_unreachable, float32}, {num_succs, float32}, {num_preds, float32}, {num_stores, float32}, {num_loads, float32}, {num_calls, float32}, {num_phis, float32}, {num_inst, float32}, {memop_type, float32}, {dst_align, float32}, {dst_from, float32}, {src_align, float32}, {src_from, float32}, {opt_size, float32}
+Outputs={OPT, int64}
+Signature=serving_default
+ModelDirectory=./models/ai4c-memop
+OutputKey=output_0
+ModelInference=
+# The ModelInference value above needs to be updated on the Python side
diff --git a/acpo/model-bw.acpo b/acpo/model-bw.acpo
new file mode 100644
index 0000000000000000000000000000000000000000..20a35f94a81fa8c8780fc25eb6387ef2894f8e5e
--- /dev/null
+++ b/acpo/model-bw.acpo
@@ -0,0 +1,7 @@
+ModelName=BW
+Features={BlockWithMultipleSuccecorsPerLoop, float32},{PtrArgs, float32},{MaxDomTreeLevel, float32},{IsLinkOnceODR, float32},{IsLocal, float32},{Calls, float32},{Blocks, float32},{InitialSize, float32},{MaxLoopDepth, float32},{users, float32},{InstructionPerBlock, float32},{Loops, float32},{conditionally_executed_blocks, float32},{IsLinkOnce, float32},{basic_block_count, float32},{PtrCallee, float32},{CallReturnPtr, float32},{ConditionalBranch, float32},{CBwithArg, float32},{CallerHeight, float32},{CallUsage, float32},{IsRecursive, float32},{NumCallsiteInLoop, float32},{NumOfCallUsesInLoop, float32},{EntryBlockFreq, float32},{MaxCallsiteBlockFreq, float32},{SuccessorPerBlock, float32},{AvgVecInstr, float32},{AvgNestedLoopLevel, float32},{InstrPerLoop, float32},{num_successors, float32},{num_instrs, float32},{num_critical_edges, float32},{highest_num_instrs_in_succ, float32},{succ_num_with_highest_num_instrs, float32},{is_branch_inst, float32},{is_switch_inst, float32},{is_indirect_br_inst, float32},{is_invoke_inst, float32},{is_call_br_inst, float32},{is_second_succ_in_loop, float32},{is_first_succ_in_loop, float32},{is_bb_in_loop, float32},{is_iv_cmp, float32},{is_le_cmp, float32},{is_ge_cmp, float32},{is_lt_cmp, float32},{is_gt_cmp, float32},{is_ne_cmp, float32},{is_eq_cmp, float32},{is_second_op_constant, float32},{is_second_op_null, float32},{is_first_op_ptr, float32},{dest_num_successors, float32},{dest_num_instrs, float32},{dest_num_critical_edges, float32},{dest_is_branch_inst, float32},{dest_is_switch_inst, float32},{dest_is_indirect_br_inst, float32},{dest_is_invoke_inst, float32},{dest_is_call_br_inst, float32},{dest_succ_number, float32}
+Outputs={BW-BranchWeight, int64}
+Signature=serving_default
+ModelDirectory=./models/ai4c-bw
+OutputKey=output_0
+ModelInference=BWInference
\ No newline at end of file
diff --git a/acpo/models/ai4c-bw/BWCompiledModel-AARCH64.h b/acpo/models/ai4c-bw/BWCompiledModel-AARCH64.h
new file mode 100644
index 0000000000000000000000000000000000000000..7d2f1198000a0dfd82f5ae39e93354aba5e78872
--- /dev/null
+++ b/acpo/models/ai4c-bw/BWCompiledModel-AARCH64.h
@@ -0,0 +1,413 @@
+// Generated by tfcompile, the TensorFlow graph compiler.  DO NOT EDIT!
+//
+// This header was generated via ahead-of-time compilation of a TensorFlow
+// graph.  An object file corresponding to this header was also generated.
+// This header gives access to the functionality in that object file.
+//
+// clang-format off
+
+#ifndef TFCOMPILE_GENERATED__xla_BWCompiledModel_AARCH64_llvm_BWCompiledModel_H_  // NOLINT(build/header_guard)
+#define TFCOMPILE_GENERATED__xla_BWCompiledModel_AARCH64_llvm_BWCompiledModel_H_  // NOLINT(build/header_guard)
+
+
+
+#include "tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace Eigen { struct ThreadPoolDevice; }
+namespace xla { class ExecutableRunOptions; }
+
+// (Implementation detail) Entry point to the function in the object file.
+extern "C" void _xla_BWCompiledModel_AARCH64_llvm_BWCompiledModel(
+    void* result, const ::xla::ExecutableRunOptions* run_options,
+    const void** args, void** temps, XlaCustomCallStatus* status,
+    int64_t* profile_counters);
+
+
+
+
+namespace llvm {
+
+// BWCompiledModel represents a computation previously specified in a
+// TensorFlow graph, now compiled into executable code. This extends the generic
+// XlaCompiledCpuFunction class with statically type-safe arg and result
+// methods. Usage example:
+//
+//   BWCompiledModel computation;
+//   // ...set args using computation.argN methods
+//   CHECK(computation.Run());
+//   // ...inspect results using computation.resultN methods
+//
+// The Run method invokes the actual computation, with inputs read from arg
+// buffers, and outputs written to result buffers. Each Run call may also use
+// a set of temporary buffers for the computation.
+//
+// By default each instance of this class manages its own arg, result and temp
+// buffers. The AllocMode constructor parameter may be used to modify the
+// buffer allocation strategy.
+//
+// Under the default allocation strategy, this class is thread-compatible:
+// o Calls to non-const methods require exclusive access to the object.
+// o Concurrent calls to const methods are OK, if those calls are made while it
+//   is guaranteed that no thread may call a non-const method.
+//
+// The logical function signature is:
+//   (arg0: f32[1,62]) -> (f32[1,1], f32[1,10,1])
+//
+// Memory stats:
+//   arg bytes total:    248
+//   arg bytes aligned:  256
+//   temp bytes total:   1084
+//   temp bytes aligned: 1216
+class BWCompiledModel final : public tensorflow::XlaCompiledCpuFunction {
+ public:
+  // Number of input arguments for the compiled computation.
+  static constexpr size_t kNumArgs = 1;
+
+  static constexpr size_t kNumResults = 2;
+
+  // Number of variables for the compiled computation.
+  static constexpr size_t kNumVariables = 0;
+
+  // Byte size of each argument buffer. There are kNumArgs entries.
+  static const ::int64_t ArgSize(::tensorflow::int32 index) {
+    return BufferInfos()[ArgIndexToBufferIndex()[index]].size();
+  }
+
+  // Returns static data used to create an XlaCompiledCpuFunction.
+  static const tensorflow::XlaCompiledCpuFunction::StaticData& StaticData() {
+    static XlaCompiledCpuFunction::StaticData* kStaticData = [](){
+      XlaCompiledCpuFunction::StaticData* data =
+        new XlaCompiledCpuFunction::StaticData;
+      set_static_data_raw_function(data, _xla_BWCompiledModel_AARCH64_llvm_BWCompiledModel);
+      set_static_data_buffer_infos(data, BufferInfos());
+      set_static_data_num_buffers(data, kNumBuffers);
+      set_static_data_result_index_table(data, ResultIndexToBufferIndex());
+      set_static_data_num_results(data, kNumResults);
+      set_static_data_arg_index_table(data, ArgIndexToBufferIndex());
+      set_static_data_num_args(data, kNumArgs);
+      set_static_data_num_variables(data, kNumVariables);
+      set_static_data_result_index(data, kResultIndex);
+      set_static_data_arg_shape_infos(data, ArgShapeInfos());
+      set_static_data_result_shape_infos(data, ResultShapeInfos());
+      set_static_data_arg_names(data, StaticArgNames());
+      set_static_data_variable_names(data, StaticVariableNames());
+      set_static_data_result_names(data, StaticResultNames());
+      set_static_data_program_shape(data, StaticProgramShape());
+      set_static_data_hlo_profile_printer_data(
+          data, StaticHloProfilePrinterData());
+      set_static_data_use_xla_runtime(data, false);
+
+      return data;
+    }();
+    return *kStaticData;
+  }
+
+  BWCompiledModel(AllocMode alloc_mode =
+            AllocMode::ARGS_VARIABLES_RESULTS_PROFILES_AND_TEMPS)
+      : XlaCompiledCpuFunction(StaticData(), alloc_mode) {}
+
+  BWCompiledModel(const BWCompiledModel&) = delete;
+  BWCompiledModel& operator=(const BWCompiledModel&) = delete;
+
+  // Arg methods for managing input buffers. Buffers are in row-major order.
+  // There is a set of methods for each positional argument, with the following
+  // general form:
+  //
+  // void set_argN_data(void* data)
+  //   Sets the buffer of type T for positional argument N. May be called in
+  //   any AllocMode. Must be called before Run to have an effect. Must be
+  //   called in AllocMode::RESULTS_PROFILES_AND_TEMPS_ONLY for each positional
+  //   argument, to set the argument buffers.
+  //
+  // T* argN_data()
+  //   Returns the buffer of type T for positional argument N.
+  //
+  // T& argN(...dim indices...)
+  //   Returns a reference to the value of type T for positional argument N,
+  //   with dim indices specifying which value. No bounds checking is performed
+  //   on dim indices.
+
+  void set_arg0_data(const void* data) {
+    set_arg_data(0, data);
+  }
+  float* arg0_data() {
+    return static_cast<float*>(arg_data(0));
+  }
+  float& arg0(size_t dim0, size_t dim1) {
+    return (*static_cast<float(*)[1][62]>(
+        arg_data(0)))[dim0][dim1];
+  }
+  const float* arg0_data() const {
+    return static_cast<const float*>(arg_data(0));
+  }
+  const float& arg0(size_t dim0, size_t dim1) const {
+    return (*static_cast<const float(*)[1][62]>(
+        arg_data(0)))[dim0][dim1];
+  }
+  int arg0_size() const {
+    return 62 * sizeof(float);
+  }
+  int arg0_count() const {
+    return 62;
+  }
+
+  void set_arg_feed_input_1_data(const void* data) {
+    set_arg_data(0, data);
+  }
+  float* arg_feed_input_1_data() {
+    return static_cast<float*>(arg_data(0));
+  }
+  float& arg_feed_input_1(size_t dim0, size_t dim1) {
+    return (*static_cast<float(*)[1][62]>(
+        arg_data(0)))[dim0][dim1];
+  }
+  const float* arg_feed_input_1_data() const {
+    return static_cast<const float*>(arg_data(0));
+  }
+  const float& arg_feed_input_1(size_t dim0, size_t dim1) const {
+    return (*static_cast<const float(*)[1][62]>(
+        arg_data(0)))[dim0][dim1];
+  }
+  int arg_feed_input_1_size() const {
+    return 62 * sizeof(float);
+  }
+  int arg_feed_input_1_count() const {
+    return 62;
+  }
+
+  // Result methods for managing output buffers. Buffers are in row-major order.
+  // Must only be called after a successful Run call. There is a set of methods
+  // for each positional result, with the following general form:
+  //
+  // T* resultN_data()
+  //   Returns the buffer of type T for positional result N.
+  //
+  // T& resultN(...dim indices...)
+  //   Returns a reference to the value of type T for positional result N,
+  //   with dim indices specifying which value. No bounds checking is performed
+  //   on dim indices.
+  //
+  // Unlike the arg methods, there is no set_resultN_data method. The result
+  // buffers are managed internally, and may change after each call to Run.
+
+  float* result0_data() {
+    return static_cast<float*>(result_data(0));
+  }
+  float& result0(size_t dim0, size_t dim1) {
+    return (*static_cast<float(*)[1][1]>(
+        result_data(0)))[dim0][dim1];
+  }
+  const float* result0_data() const {
+    return static_cast<const float*>(result_data(0));
+  }
+  const float& result0(size_t dim0, size_t dim1) const {
+    return (*static_cast<const float(*)[1][1]>(
+        result_data(0)))[dim0][dim1];
+  }
+  int result0_size() const {
+    return 1 * sizeof(float);
+  }
+  int result0_count() const {
+    return 1;
+  }
+
+  float* result_fetch_output_1_data() {
+    return static_cast<float*>(result_data(0));
+  }
+  float& result_fetch_output_1(size_t dim0, size_t dim1) {
+    return (*static_cast<float(*)[1][1]>(
+        result_data(0)))[dim0][dim1];
+  }
+  const float* result_fetch_output_1_data() const {
+    return static_cast<const float*>(result_data(0));
+  }
+  const float& result_fetch_output_1(size_t dim0, size_t dim1) const {
+    return (*static_cast<const float(*)[1][1]>(
+        result_data(0)))[dim0][dim1];
+  }
+  int result_fetch_output_1_size() const {
+    return 1 * sizeof(float);
+  }
+  int result_fetch_output_1_count() const {
+    return 1;
+  }
+
+  float* result1_data() {
+    return static_cast<float*>(result_data(1));
+  }
+  float& result1(size_t dim0, size_t dim1, size_t dim2) {
+    return (*static_cast<float(*)[1][10][1]>(
+        result_data(1)))[dim0][dim1][dim2];
+  }
+  const float* result1_data() const {
+    return static_cast<const float*>(result_data(1));
+  }
+  const float& result1(size_t dim0, size_t dim1, size_t dim2) const {
+    return (*static_cast<const float(*)[1][10][1]>(
+        result_data(1)))[dim0][dim1][dim2];
+  }
+  int result1_size() const {
+    return 10 * sizeof(float);
+  }
+  int result1_count() const {
+    return 10;
+  }
+
+  float* result_fetch_output_0_data() {
+    return static_cast<float*>(result_data(1));
+  }
+  float& result_fetch_output_0(size_t dim0, size_t dim1, size_t dim2) {
+    return (*static_cast<float(*)[1][10][1]>(
+        result_data(1)))[dim0][dim1][dim2];
+  }
+  const float* result_fetch_output_0_data() const {
+    return static_cast<const float*>(result_data(1));
+  }
+  const float& result_fetch_output_0(size_t dim0, size_t dim1, size_t dim2) const {
+    return (*static_cast<const float(*)[1][10][1]>(
+        result_data(1)))[dim0][dim1][dim2];
+  }
+  int result_fetch_output_0_size() const {
+    return 10 * sizeof(float);
+  }
+  int result_fetch_output_0_count() const {
+    return 10;
+  }
+
+  // Methods for managing variable buffers. Buffers are in row-major order.
+  //
+  // For read-write variables we generate the following methods:
+  //
+  // void set_var_X_data(T* data)
+  //   Sets the buffer for variable X.  Must be called before Run if the
+  //   allocation mode is RESULTS_PROFILES_AND_TEMPS_ONLY.
+  //
+  // T* var_X_data()
+  //   Returns the buffer of type T for variable X.  If the allocation mode is
+  //   RESULTS_PROFILES_AND_TEMPS_ONLY then this buffer is the same as the
+  //   buffer passed to set_var_X_data.
+  //
+  // T& var_X(...dim indices...)
+  //   Returns a reference to the value of type T for variable X,
+  //   with dim indices specifying which value. No bounds checking is performed
+  //   on dim indices.
+  //
+  // For readonly variables we generate the same set of methods, except that we
+  // use `const T` instead of `T`.  We use `const T` to avoid erasing the
+  // constness of the buffer passed to `set_var_X_data` but the underlying
+  // buffer is not const (and thus the const can be safely const-cast'ed away)
+  // unless `set_var_X_data` is called with a pointer to constant storage.
+
+ private:
+  // Number of buffers for the compiled computation.
+  static constexpr size_t kNumBuffers = 21;
+
+  static const ::xla::cpu_function_runtime::BufferInfo* BufferInfos() {
+    static const ::xla::cpu_function_runtime::BufferInfo
+      kBufferInfos[kNumBuffers] = {
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{131072ULL, ~0U, ~0U}),
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{126976ULL, ~0U, ~0U}),
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{32768ULL, ~0U, ~0U}),
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{8192ULL, ~0U, ~0U}),
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{4096ULL, ~0U, ~0U}),
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{4096ULL, ~0U, ~0U}),
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{2560ULL, ~0U, ~0U}),
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{2048ULL, ~0U, ~0U}),
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{1024ULL, ~0U, ~0U}),
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{994ULL, 0U, ~0U}),
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{512ULL, ~0U, ~0U}),
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{256ULL, ~0U, ~0U}),
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{256ULL, ~0U, ~0U}),
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{256ULL, ~0U, ~0U}),
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{256ULL, ~0U, ~0U}),
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{161ULL, ~0U, 1U}),
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{160ULL, ~0U, ~0U}),
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{65ULL, ~0U, ~0U}),
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{17ULL, ~0U, 0U}),
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{16ULL, ~0U, ~0U}),
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{4097ULL, ~0U, ~0U})
+      };
+    return kBufferInfos;
+  }
+
+  static const ::tensorflow::int32* ResultIndexToBufferIndex() {
+    static constexpr ::tensorflow::int32 kResultIndexToBufferIndex[kNumResults] = {
+18, 15
+    };
+    return kResultIndexToBufferIndex;
+  }
+
+  static const ::tensorflow::int32* ArgIndexToBufferIndex() {
+    static constexpr ::tensorflow::int32 kArgIndexToBufferIndex[kNumArgs] = {
+9
+    };
+    return kArgIndexToBufferIndex;
+  }
+
+  // The 0-based index of the result tuple in the temporary buffers.
+  static constexpr size_t kResultIndex = 17;
+
+  // Shapes of the input arguments.
+  static constexpr int32_t kArg0Shapes[] = {
+1, 62
+  };
+  static const ShapeInfo* ArgShapeInfos() {
+    static constexpr ShapeInfo kArgShapeInfoTable[kNumArgs] = {
+{ kArg0Shapes, 2 },
+    };
+    return kArgShapeInfoTable;
+  };
+
+  // Shapes of the results.
+  static constexpr int32_t kResult0Shapes[] = {
+1, 1
+  };
+  static constexpr int32_t kResult1Shapes[] = {
+1, 10, 1
+  };
+  static const ShapeInfo* ResultShapeInfos() {
+    static constexpr ShapeInfo kResultShapeInfoTable[kNumResults] = {
+{ kResult0Shapes, 2 },
+{ kResult1Shapes, 3 },
+    };
+    return kResultShapeInfoTable;
+  };
+
+  // Array of names of each positional argument, terminated by nullptr.
+  static const char** StaticArgNames() {
+    static const char* kNames[] = {"feed_input_1", nullptr};
+    return kNames;
+  }
+
+  // Array of names of each positional variable, terminated by nullptr.
+  static const char** StaticVariableNames() {
+    static const char* kNames[] = {nullptr};
+    return kNames;
+  }
+
+  // Array of names of each positional result, terminated by nullptr.
+  static const char** StaticResultNames() {
+    static const char* kNames[] = {"fetch_output_1", "fetch_output_0", nullptr};
+    return kNames;
+  }
+
+  // Shape of the args and results.
+  static const ::xla::ProgramShapeProto* StaticProgramShape() {
+    static const ::xla::ProgramShapeProto* kShape = nullptr;
+    return kShape;
+  }
+
+  // Metadata that can be used to pretty-print profile counters.
+  static const ::xla::HloProfilePrinterData* StaticHloProfilePrinterData() {
+    static const ::xla::HloProfilePrinterData* kHloProfilePrinterData =
+      nullptr;
+    return kHloProfilePrinterData;
+  }
+};
+
+}  // end namespace llvm
+
+#endif  // TFCOMPILE_GENERATED__xla_BWCompiledModel_AARCH64_llvm_BWCompiledModel_H_
+
+// clang-format on
diff --git a/acpo/models/ai4c-bw/BWCompiledModel-AARCH64.o b/acpo/models/ai4c-bw/BWCompiledModel-AARCH64.o
new file mode 100644
index 0000000000000000000000000000000000000000..1159accd7e26bf8e81567e38f8ff15a7da7306c0
Binary files /dev/null and b/acpo/models/ai4c-bw/BWCompiledModel-AARCH64.o differ
diff --git a/acpo/models/ai4c-bw/BWCompiledModel-AARCH64_metadata.o b/acpo/models/ai4c-bw/BWCompiledModel-AARCH64_metadata.o
new file mode 100644
index 0000000000000000000000000000000000000000..cc02d5395c9cbe40bd5f9160f354848878b08721
Binary files /dev/null and b/acpo/models/ai4c-bw/BWCompiledModel-AARCH64_metadata.o differ
diff --git a/acpo/models/ai4c-bw/saved_model.pb b/acpo/models/ai4c-bw/saved_model.pb
new file mode 100644
index 0000000000000000000000000000000000000000..1b357fa46465cbe9226ebc5c83f51c93e5863db3
Binary files /dev/null and b/acpo/models/ai4c-bw/saved_model.pb differ
diff --git a/acpo/models/ai4c-bw/sc.pkl b/acpo/models/ai4c-bw/sc.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..71f2f78ef9e44c61f8f8fc4a3b179b970ef853af
Binary files /dev/null and b/acpo/models/ai4c-bw/sc.pkl differ
diff --git a/acpo/models/ai4c-bw/variables/variables.data-00000-of-00001 b/acpo/models/ai4c-bw/variables/variables.data-00000-of-00001
new file mode 100644
index 0000000000000000000000000000000000000000..75c0b74a789b88b033a473fb3470c987075e1065
Binary files /dev/null and b/acpo/models/ai4c-bw/variables/variables.data-00000-of-00001 differ
diff --git a/acpo/models/ai4c-bw/variables/variables.index b/acpo/models/ai4c-bw/variables/variables.index
new file mode 100644
index 0000000000000000000000000000000000000000..6d9e73c1d27edda44a04315e61a820329a5374a8
Binary files /dev/null and b/acpo/models/ai4c-bw/variables/variables.index differ
diff --git a/acpo/models/ai4c-fh/AI4CFHCompiledModel-AARCH64.h b/acpo/models/ai4c-fh/AI4CFHCompiledModel-AARCH64.h
new file mode 100644
index 0000000000000000000000000000000000000000..f854200380637541cadc8d0b359ad97aa0c6b859
--- /dev/null
+++ b/acpo/models/ai4c-fh/AI4CFHCompiledModel-AARCH64.h
@@ -0,0 +1,358 @@
+// Generated by tfcompile, the TensorFlow graph compiler.  DO NOT EDIT!
+//
+// This header was generated via ahead-of-time compilation of a TensorFlow
+// graph.  An object file corresponding to this header was also generated.
+// This header gives access to the functionality in that object file.
+//
+// clang-format off
+
+#ifndef TFCOMPILE_GENERATED__xla_AI4CFHCompiledModel_AARCH64_llvm_AI4CFHCompiledModel_H_  // NOLINT(build/header_guard)
+#define TFCOMPILE_GENERATED__xla_AI4CFHCompiledModel_AARCH64_llvm_AI4CFHCompiledModel_H_  // NOLINT(build/header_guard)
+
+
+
+#include "tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace Eigen { struct ThreadPoolDevice; }
+namespace xla { class ExecutableRunOptions; }
+
+// (Implementation detail) Entry point to the function in the object file.
+extern "C" void _xla_AI4CFHCompiledModel_AARCH64_llvm_AI4CFHCompiledModel(
+    void* result, const ::xla::ExecutableRunOptions* run_options,
+    const void** args, void** temps, XlaCustomCallStatus* status,
+    int64_t* profile_counters);
+
+
+
+
+namespace llvm {
+
+// AI4CFHCompiledModel represents a computation previously specified in a
+// TensorFlow graph, now compiled into executable code. This extends the generic
+// XlaCompiledCpuFunction class with statically type-safe arg and result
+// methods. Usage example:
+//
+//   AI4CFHCompiledModel computation;
+//   // ...set args using computation.argN methods
+//   CHECK(computation.Run());
+//   // ...inspect results using computation.resultN methods
+//
+// The Run method invokes the actual computation, with inputs read from arg
+// buffers, and outputs written to result buffers. Each Run call may also use
+// a set of temporary buffers for the computation.
+//
+// By default each instance of this class manages its own arg, result and temp
+// buffers. The AllocMode constructor parameter may be used to modify the
+// buffer allocation strategy.
+//
+// Under the default allocation strategy, this class is thread-compatible:
+// o Calls to non-const methods require exclusive access to the object.
+// o Concurrent calls to const methods are OK, if those calls are made while it
+//   is guaranteed that no thread may call a non-const method.
+//
+// The logical function signature is:
+//   (arg0: f32[1,30]) -> (f32[1,3])
+//
+// Memory stats:
+//   arg bytes total:    120
+//   arg bytes aligned:  128
+//   temp bytes total:   8212
+//   temp bytes aligned: 8320
+class AI4CFHCompiledModel final : public tensorflow::XlaCompiledCpuFunction {
+ public:
+  // Number of input arguments for the compiled computation.
+  static constexpr size_t kNumArgs = 1;
+
+  static constexpr size_t kNumResults = 1;
+
+  // Number of variables for the compiled computation.
+  static constexpr size_t kNumVariables = 0;
+
+  // Byte size of each argument buffer. There are kNumArgs entries.
+  static const ::int64_t ArgSize(::tensorflow::int32 index) {
+    return BufferInfos()[ArgIndexToBufferIndex()[index]].size();
+  }
+
+  // Returns static data used to create an XlaCompiledCpuFunction.
+  static const tensorflow::XlaCompiledCpuFunction::StaticData& StaticData() {
+    static XlaCompiledCpuFunction::StaticData* kStaticData = [](){
+      XlaCompiledCpuFunction::StaticData* data =
+        new XlaCompiledCpuFunction::StaticData;
+      set_static_data_raw_function(data, _xla_AI4CFHCompiledModel_AARCH64_llvm_AI4CFHCompiledModel);
+      set_static_data_buffer_infos(data, BufferInfos());
+      set_static_data_num_buffers(data, kNumBuffers);
+      set_static_data_result_index_table(data, ResultIndexToBufferIndex());
+      set_static_data_num_results(data, kNumResults);
+      set_static_data_arg_index_table(data, ArgIndexToBufferIndex());
+      set_static_data_num_args(data, kNumArgs);
+      set_static_data_num_variables(data, kNumVariables);
+      set_static_data_result_index(data, kResultIndex);
+      set_static_data_arg_shape_infos(data, ArgShapeInfos());
+      set_static_data_result_shape_infos(data, ResultShapeInfos());
+      set_static_data_arg_names(data, StaticArgNames());
+      set_static_data_variable_names(data, StaticVariableNames());
+      set_static_data_result_names(data, StaticResultNames());
+      set_static_data_program_shape(data, StaticProgramShape());
+      set_static_data_hlo_profile_printer_data(
+          data, StaticHloProfilePrinterData());
+      set_static_data_use_xla_runtime(data, false);
+
+      return data;
+    }();
+    return *kStaticData;
+  }
+
+  AI4CFHCompiledModel(AllocMode alloc_mode =
+            AllocMode::ARGS_VARIABLES_RESULTS_PROFILES_AND_TEMPS)
+      : XlaCompiledCpuFunction(StaticData(), alloc_mode) {}
+
+  AI4CFHCompiledModel(const AI4CFHCompiledModel&) = delete;
+  AI4CFHCompiledModel& operator=(const AI4CFHCompiledModel&) = delete;
+
+  // Arg methods for managing input buffers. Buffers are in row-major order.
+  // There is a set of methods for each positional argument, with the following
+  // general form:
+  //
+  // void set_argN_data(void* data)
+  //   Sets the buffer of type T for positional argument N. May be called in
+  //   any AllocMode. Must be called before Run to have an effect. Must be
+  //   called in AllocMode::RESULTS_PROFILES_AND_TEMPS_ONLY for each positional
+  //   argument, to set the argument buffers.
+  //
+  // T* argN_data()
+  //   Returns the buffer of type T for positional argument N.
+  //
+  // T& argN(...dim indices...)
+  //   Returns a reference to the value of type T for positional argument N,
+  //   with dim indices specifying which value. No bounds checking is performed
+  //   on dim indices.
+
+  void set_arg0_data(const void* data) {
+    set_arg_data(0, data);
+  }
+  float* arg0_data() {
+    return static_cast<float*>(arg_data(0));
+  }
+  float& arg0(size_t dim0, size_t dim1) {
+    return (*static_cast<float(*)[1][30]>(
+        arg_data(0)))[dim0][dim1];
+  }
+  const float* arg0_data() const {
+    return static_cast<const float*>(arg_data(0));
+  }
+  const float& arg0(size_t dim0, size_t dim1) const {
+    return (*static_cast<const float(*)[1][30]>(
+        arg_data(0)))[dim0][dim1];
+  }
+  int arg0_size() const {
+    return 30 * sizeof(float);
+  }
+  int arg0_count() const {
+    return 30;
+  }
+
+  void set_arg_feed_input_1_data(const void* data) {
+    set_arg_data(0, data);
+  }
+  float* arg_feed_input_1_data() {
+    return static_cast<float*>(arg_data(0));
+  }
+  float& arg_feed_input_1(size_t dim0, size_t dim1) {
+    return (*static_cast<float(*)[1][30]>(
+        arg_data(0)))[dim0][dim1];
+  }
+  const float* arg_feed_input_1_data() const {
+    return static_cast<const float*>(arg_data(0));
+  }
+  const float& arg_feed_input_1(size_t dim0, size_t dim1) const {
+    return (*static_cast<const float(*)[1][30]>(
+        arg_data(0)))[dim0][dim1];
+  }
+  int arg_feed_input_1_size() const {
+    return 30 * sizeof(float);
+  }
+  int arg_feed_input_1_count() const {
+    return 30;
+  }
+
+  // Result methods for managing output buffers. Buffers are in row-major order.
+  // Must only be called after a successful Run call. There is a set of methods
+  // for each positional result, with the following general form:
+  //
+  // T* resultN_data()
+  //   Returns the buffer of type T for positional result N.
+  //
+  // T& resultN(...dim indices...)
+  //   Returns a reference to the value of type T for positional result N,
+  //   with dim indices specifying which value. No bounds checking is performed
+  //   on dim indices.
+  //
+  // Unlike the arg methods, there is no set_resultN_data method. The result
+  // buffers are managed internally, and may change after each call to Run.
+
+  float* result0_data() {  // Buffer of positional result 0, viewed as a flat float pointer.
+    return static_cast<float*>(result_data(0));
+  }
+  float& result0(size_t dim0, size_t dim1) {  // Element access through a float[1][3] view; indices are not checked.
+    return (*static_cast<float(*)[1][3]>(
+        result_data(0)))[dim0][dim1];
+  }
+  const float* result0_data() const {  // Const overload of result0_data().
+    return static_cast<const float*>(result_data(0));
+  }
+  const float& result0(size_t dim0, size_t dim1) const {  // Const overload of result0(dim0, dim1).
+    return (*static_cast<const float(*)[1][3]>(
+        result_data(0)))[dim0][dim1];
+  }
+  int result0_size() const {  // Size in bytes of 3 floats.
+    return 3 * sizeof(float);
+  }
+  int result0_count() const {  // Element count, matching the [1][3] view used by result0().
+    return 3;
+  }
+
+  float* result_fetch_output_0_data() {  // Same buffer as result0_data(): result_data(0) viewed as float*.
+    return static_cast<float*>(result_data(0));
+  }
+  float& result_fetch_output_0(size_t dim0, size_t dim1) {  // Element access through a float[1][3] view; indices are not checked.
+    return (*static_cast<float(*)[1][3]>(
+        result_data(0)))[dim0][dim1];
+  }
+  const float* result_fetch_output_0_data() const {  // Const overload of result_fetch_output_0_data().
+    return static_cast<const float*>(result_data(0));
+  }
+  const float& result_fetch_output_0(size_t dim0, size_t dim1) const {  // Const overload of result_fetch_output_0(dim0, dim1).
+    return (*static_cast<const float(*)[1][3]>(
+        result_data(0)))[dim0][dim1];
+  }
+  int result_fetch_output_0_size() const {  // Size in bytes of 3 floats.
+    return 3 * sizeof(float);
+  }
+  int result_fetch_output_0_count() const {  // Element count, matching the [1][3] view used above.
+    return 3;
+  }
+
+  // Methods for managing variable buffers. Buffers are in row-major order.
+  //
+  // For read-write variables we generate the following methods:
+  //
+  // void set_var_X_data(T* data)
+  //   Sets the buffer for variable X.  Must be called before Run if the
+  //   allocation mode is RESULTS_PROFILES_AND_TEMPS_ONLY.
+  //
+  // T* var_X_data()
+  //   Returns the buffer of type T for variable X.  If the allocation mode is
+  //   RESULTS_PROFILES_AND_TEMPS_ONLY then this buffer is the same as the
+  //   buffer passed to set_var_X_data.
+  //
+  // T& var_X(...dim indices...)
+  //   Returns a reference to the value of type T for variable X,
+  //   with dim indices specifying which value. No bounds checking is performed
+  //   on dim indices.
+  //
+  // For readonly variables we generate the same set of methods, except that we
+  // use `const T` instead of `T`.  We use `const T` to avoid erasing the
+  // constness of the buffer passed to `set_var_X_data` but the underlying
+  // buffer is not const (and thus the const can be safely const-cast'ed away)
+  // unless `set_var_X_data` is called with a pointer to constant storage.
+
+ private:
+  // Number of buffers for the compiled computation (length of the kBufferInfos table below).
+  static constexpr size_t kNumBuffers = 12;
+
+  static const ::xla::cpu_function_runtime::BufferInfo* BufferInfos() {  // Returns the static table of kNumBuffers entries.
+    static const ::xla::cpu_function_runtime::BufferInfo
+      kBufferInfos[kNumBuffers] = {  // NOTE(review): field encoding is defined by EncodedBufferInfo, which is not shown in this header.
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{16777216ULL, ~0U, ~0U}),
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{491520ULL, ~0U, ~0U}),
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{212992ULL, ~0U, ~0U}),
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{16384ULL, ~0U, ~0U}),
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{16384ULL, ~0U, ~0U}),
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{624ULL, ~0U, ~0U}),
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{482ULL, 0U, ~0U}),  // [6]: only entry whose second field is 0U; ArgIndexToBufferIndex() lists index 6.
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{208ULL, ~0U, ~0U}),
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{49ULL, ~0U, 0U}),  // [8]: only entry whose third field is 0U; ResultIndexToBufferIndex() lists index 8.
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{48ULL, ~0U, ~0U}),
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{33ULL, ~0U, ~0U}),  // [10]: position named by kResultIndex.
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{32769ULL, ~0U, ~0U})
+      };
+    return kBufferInfos;
+  }
+
+  static const ::tensorflow::int32* ResultIndexToBufferIndex() {  // Table of kNumResults entries: result 0 -> index 8.
+    static constexpr ::tensorflow::int32 kResultIndexToBufferIndex[kNumResults] = {
+8
+    };
+    return kResultIndexToBufferIndex;
+  }
+
+  static const ::tensorflow::int32* ArgIndexToBufferIndex() {  // Table of kNumArgs entries: argument 0 -> index 6.
+    static constexpr ::tensorflow::int32 kArgIndexToBufferIndex[kNumArgs] = {
+6
+    };
+    return kArgIndexToBufferIndex;
+  }
+
+  // The 0-based index of the result tuple in the temporary buffers.
+  static constexpr size_t kResultIndex = 10;
+
+  // Shapes of the input arguments: one dimension list per argument.
+  static constexpr int32_t kArg0Shapes[] = {
+1, 30
+  };
+  static const ShapeInfo* ArgShapeInfos() {  // Returns a table of kNumArgs ShapeInfo entries.
+    static constexpr ShapeInfo kArgShapeInfoTable[kNumArgs] = {
+{ kArg0Shapes, 2 },  // 2 equals the number of entries in kArg0Shapes (presumably the rank -- confirm against ShapeInfo).
+    };
+    return kArgShapeInfoTable;
+  };
+
+  // Shapes of the results: one dimension list per result.
+  static constexpr int32_t kResult0Shapes[] = {
+1, 3
+  };
+  static const ShapeInfo* ResultShapeInfos() {  // Returns a table of kNumResults ShapeInfo entries.
+    static constexpr ShapeInfo kResultShapeInfoTable[kNumResults] = {
+{ kResult0Shapes, 2 },  // 2 equals the number of entries in kResult0Shapes (presumably the rank -- confirm against ShapeInfo).
+    };
+    return kResultShapeInfoTable;
+  };
+
+  // Array of names of each positional argument, terminated by nullptr.
+  static const char** StaticArgNames() {
+    static const char* kNames[] = {"feed_input_1", nullptr};  // One argument name, then the terminator.
+    return kNames;
+  }
+
+  // Array of names of each positional variable, terminated by nullptr.
+  static const char** StaticVariableNames() {
+    static const char* kNames[] = {nullptr};  // No variable names: the array holds only the terminator.
+    return kNames;
+  }
+
+  // Array of names of each positional result, terminated by nullptr.
+  static const char** StaticResultNames() {
+    static const char* kNames[] = {"fetch_output_0", nullptr};  // One result name, then the terminator.
+    return kNames;
+  }
+
+  // Shape of the args and results. Always nullptr here: no ProgramShapeProto is embedded in this header.
+  static const ::xla::ProgramShapeProto* StaticProgramShape() {
+    static const ::xla::ProgramShapeProto* kShape = nullptr;
+    return kShape;
+  }
+
+  // Metadata that can be used to pretty-print profile counters. Always nullptr here.
+  static const ::xla::HloProfilePrinterData* StaticHloProfilePrinterData() {
+    static const ::xla::HloProfilePrinterData* kHloProfilePrinterData =
+      nullptr;
+    return kHloProfilePrinterData;
+  }
+};
+
+}  // end namespace llvm
+
+#endif  // TFCOMPILE_GENERATED__xla_AI4CFHCompiledModel_AARCH64_llvm_AI4CFHCompiledModel_H_
+
+// clang-format on
diff --git a/acpo/models/ai4c-fh/AI4CFHCompiledModel-AARCH64.o b/acpo/models/ai4c-fh/AI4CFHCompiledModel-AARCH64.o
new file mode 100644
index 0000000000000000000000000000000000000000..1efb5ffcc028c9033aa9e748804091da5a8ff739
Binary files /dev/null and b/acpo/models/ai4c-fh/AI4CFHCompiledModel-AARCH64.o differ
diff --git a/acpo/models/ai4c-fh/AI4CFHCompiledModel-AARCH64_metadata.o b/acpo/models/ai4c-fh/AI4CFHCompiledModel-AARCH64_metadata.o
new file mode 100644
index 0000000000000000000000000000000000000000..cc02d5395c9cbe40bd5f9160f354848878b08721
Binary files /dev/null and b/acpo/models/ai4c-fh/AI4CFHCompiledModel-AARCH64_metadata.o differ
diff --git a/acpo/models/ai4c-fh/saved_model.pb b/acpo/models/ai4c-fh/saved_model.pb
new file mode 100644
index 0000000000000000000000000000000000000000..8702645175054beff35ef7b6eab77deec6113c21
Binary files /dev/null and b/acpo/models/ai4c-fh/saved_model.pb differ
diff --git a/acpo/models/ai4c-fh/sc.pkl b/acpo/models/ai4c-fh/sc.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..72e7283b912227d186d99dd9ef8519c78b4552bb
Binary files /dev/null and b/acpo/models/ai4c-fh/sc.pkl differ
diff --git a/acpo/models/ai4c-fh/variables/variables.data-00000-of-00001 b/acpo/models/ai4c-fh/variables/variables.data-00000-of-00001
new file mode 100644
index 0000000000000000000000000000000000000000..84f6bd73d7523bd92d84365ab9b26974b423c9c3
Binary files /dev/null and b/acpo/models/ai4c-fh/variables/variables.data-00000-of-00001 differ
diff --git a/acpo/models/ai4c-fh/variables/variables.index b/acpo/models/ai4c-fh/variables/variables.index
new file mode 100644
index 0000000000000000000000000000000000000000..dcdb372b626ab6efee8c1042f66bb1756773e96b
Binary files /dev/null and b/acpo/models/ai4c-fh/variables/variables.index differ
diff --git a/acpo/models/ai4c-memop/AI4CMEMOPCompiledModel-AARCH64.h b/acpo/models/ai4c-memop/AI4CMEMOPCompiledModel-AARCH64.h
new file mode 100644
index 0000000000000000000000000000000000000000..fb8439a07f79777efdb75e7ce7b8976a9bfe3de0
--- /dev/null
+++ b/acpo/models/ai4c-memop/AI4CMEMOPCompiledModel-AARCH64.h
@@ -0,0 +1,358 @@
+// Generated by tfcompile, the TensorFlow graph compiler.  DO NOT EDIT!
+//
+// This header was generated via ahead-of-time compilation of a TensorFlow
+// graph.  An object file corresponding to this header was also generated.
+// This header gives access to the functionality in that object file.
+//
+// clang-format off
+
+#ifndef TFCOMPILE_GENERATED__xla_AI4CMEMOPCompiledModel_AARCH64_llvm_AI4CMEMOPCompiledModel_H_  // NOLINT(build/header_guard)
+#define TFCOMPILE_GENERATED__xla_AI4CMEMOPCompiledModel_AARCH64_llvm_AI4CMEMOPCompiledModel_H_  // NOLINT(build/header_guard)
+
+
+
+#include "tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace Eigen { struct ThreadPoolDevice; }  // Forward declaration only.
+namespace xla { class ExecutableRunOptions; }  // Forward declaration; used below as a pointer parameter type.
+
+// (Implementation detail) Entry point to the function in the object file; StaticData() registers it as the raw function.
+extern "C" void _xla_AI4CMEMOPCompiledModel_AARCH64_llvm_AI4CMEMOPCompiledModel(
+    void* result, const ::xla::ExecutableRunOptions* run_options,
+    const void** args, void** temps, XlaCustomCallStatus* status,
+    int64_t* profile_counters);
+
+
+
+
+namespace llvm {
+
+// AI4CMEMOPCompiledModel represents a computation previously specified in a
+// TensorFlow graph, now compiled into executable code. This extends the generic
+// XlaCompiledCpuFunction class with statically type-safe arg and result
+// methods. Usage example:
+//
+//   AI4CMEMOPCompiledModel computation;
+//   // ...set args using computation.argN methods
+//   CHECK(computation.Run());
+//   // ...inspect results using computation.resultN methods
+//
+// The Run method invokes the actual computation, with inputs read from arg
+// buffers, and outputs written to result buffers. Each Run call may also use
+// a set of temporary buffers for the computation.
+//
+// By default each instance of this class manages its own arg, result and temp
+// buffers. The AllocMode constructor parameter may be used to modify the
+// buffer allocation strategy.
+//
+// Under the default allocation strategy, this class is thread-compatible:
+// o Calls to non-const methods require exclusive access to the object.
+// o Concurrent calls to const methods are OK, if those calls are made while it
+//   is guaranteed that no thread may call a non-const method.
+//
+// The logical function signature is:
+//   (arg0: f32[1,47]) -> (f32[1,2])
+//
+// Memory stats:
+//   arg bytes total:    188
+//   arg bytes aligned:  192
+//   temp bytes total:   8208
+//   temp bytes aligned: 8320
+class AI4CMEMOPCompiledModel final : public tensorflow::XlaCompiledCpuFunction {
+ public:
+  // Number of input arguments for the compiled computation.
+  static constexpr size_t kNumArgs = 1;
+
+  static constexpr size_t kNumResults = 1;  // Number of results for the compiled computation.
+
+  // Number of variables for the compiled computation.
+  static constexpr size_t kNumVariables = 0;
+
+  // Byte size of each argument buffer. There are kNumArgs entries; `index` is not range-checked.
+  static const ::int64_t ArgSize(::tensorflow::int32 index) {
+    return BufferInfos()[ArgIndexToBufferIndex()[index]].size();
+  }
+
+  // Returns static data used to create an XlaCompiledCpuFunction.
+  static const tensorflow::XlaCompiledCpuFunction::StaticData& StaticData() {
+    static XlaCompiledCpuFunction::StaticData* kStaticData = [](){  // Function-local static: the lambda runs once, on first call.
+      XlaCompiledCpuFunction::StaticData* data =
+        new XlaCompiledCpuFunction::StaticData;  // Never deleted anywhere in this header.
+      set_static_data_raw_function(data, _xla_AI4CMEMOPCompiledModel_AARCH64_llvm_AI4CMEMOPCompiledModel);
+      set_static_data_buffer_infos(data, BufferInfos());
+      set_static_data_num_buffers(data, kNumBuffers);
+      set_static_data_result_index_table(data, ResultIndexToBufferIndex());
+      set_static_data_num_results(data, kNumResults);
+      set_static_data_arg_index_table(data, ArgIndexToBufferIndex());
+      set_static_data_num_args(data, kNumArgs);
+      set_static_data_num_variables(data, kNumVariables);
+      set_static_data_result_index(data, kResultIndex);
+      set_static_data_arg_shape_infos(data, ArgShapeInfos());
+      set_static_data_result_shape_infos(data, ResultShapeInfos());
+      set_static_data_arg_names(data, StaticArgNames());
+      set_static_data_variable_names(data, StaticVariableNames());
+      set_static_data_result_names(data, StaticResultNames());
+      set_static_data_program_shape(data, StaticProgramShape());
+      set_static_data_hlo_profile_printer_data(
+          data, StaticHloProfilePrinterData());
+      set_static_data_use_xla_runtime(data, false);
+
+      return data;
+    }();
+    return *kStaticData;
+  }
+
+  AI4CMEMOPCompiledModel(AllocMode alloc_mode =
+            AllocMode::ARGS_VARIABLES_RESULTS_PROFILES_AND_TEMPS)
+      : XlaCompiledCpuFunction(StaticData(), alloc_mode) {}
+
+  AI4CMEMOPCompiledModel(const AI4CMEMOPCompiledModel&) = delete;  // Not copy-constructible.
+  AI4CMEMOPCompiledModel& operator=(const AI4CMEMOPCompiledModel&) = delete;  // Not copy-assignable.
+
+  // Arg methods for managing input buffers. Buffers are in row-major order.
+  // There is a set of methods for each positional argument, with the following
+  // general form:
+  //
+  // void set_argN_data(const void* data)
+  //   Sets the buffer of type T for positional argument N. May be called in
+  //   any AllocMode. Must be called before Run to have an effect. Must be
+  //   called in AllocMode::RESULTS_PROFILES_AND_TEMPS_ONLY for each positional
+  //   argument, to set the argument buffers.
+  //
+  // T* argN_data()
+  //   Returns the buffer of type T for positional argument N.
+  //
+  // T& argN(...dim indices...)
+  //   Returns a reference to the value of type T for positional argument N,
+  //   with dim indices specifying which value. No bounds checking is performed
+  //   on dim indices.
+
+  void set_arg0_data(const void* data) {  // Forwards `data` to set_arg_data for positional argument 0.
+    set_arg_data(0, data);
+  }
+  float* arg0_data() {  // Buffer of positional argument 0, viewed as a flat float pointer.
+    return static_cast<float*>(arg_data(0));
+  }
+  float& arg0(size_t dim0, size_t dim1) {  // Element access through a float[1][47] view; indices are not checked.
+    return (*static_cast<float(*)[1][47]>(
+        arg_data(0)))[dim0][dim1];
+  }
+  const float* arg0_data() const {  // Const overload of arg0_data().
+    return static_cast<const float*>(arg_data(0));
+  }
+  const float& arg0(size_t dim0, size_t dim1) const {  // Const overload of arg0(dim0, dim1).
+    return (*static_cast<const float(*)[1][47]>(
+        arg_data(0)))[dim0][dim1];
+  }
+  int arg0_size() const {  // Size in bytes of 47 floats.
+    return 47 * sizeof(float);
+  }
+  int arg0_count() const {  // Element count, matching the [1][47] view used by arg0().
+    return 47;
+  }
+
+  void set_arg_feed_input_1_data(const void* data) {  // Same call as set_arg0_data: forwards to set_arg_data with index 0.
+    set_arg_data(0, data);
+  }
+  float* arg_feed_input_1_data() {  // Same buffer as arg0_data(): arg_data(0) viewed as float*.
+    return static_cast<float*>(arg_data(0));
+  }
+  float& arg_feed_input_1(size_t dim0, size_t dim1) {  // Element access through a float[1][47] view; indices are not checked.
+    return (*static_cast<float(*)[1][47]>(
+        arg_data(0)))[dim0][dim1];
+  }
+  const float* arg_feed_input_1_data() const {  // Const overload of arg_feed_input_1_data().
+    return static_cast<const float*>(arg_data(0));
+  }
+  const float& arg_feed_input_1(size_t dim0, size_t dim1) const {  // Const overload of arg_feed_input_1(dim0, dim1).
+    return (*static_cast<const float(*)[1][47]>(
+        arg_data(0)))[dim0][dim1];
+  }
+  int arg_feed_input_1_size() const {  // Size in bytes of 47 floats.
+    return 47 * sizeof(float);
+  }
+  int arg_feed_input_1_count() const {  // Element count, matching the [1][47] view used above.
+    return 47;
+  }
+
+  // Result methods for managing output buffers. Buffers are in row-major order.
+  // Must only be called after a successful Run call. There is a set of methods
+  // for each positional result, with the following general form:
+  //
+  // T* resultN_data()
+  //   Returns the buffer of type T for positional result N.
+  //
+  // T& resultN(...dim indices...)
+  //   Returns a reference to the value of type T for positional result N,
+  //   with dim indices specifying which value. No bounds checking is performed
+  //   on dim indices.
+  //
+  // Unlike the arg methods, there is no set_resultN_data method. The result
+  // buffers are managed internally, and may change after each call to Run.
+
+  float* result0_data() {  // Buffer of positional result 0, viewed as a flat float pointer.
+    return static_cast<float*>(result_data(0));
+  }
+  float& result0(size_t dim0, size_t dim1) {  // Element access through a float[1][2] view; indices are not checked.
+    return (*static_cast<float(*)[1][2]>(
+        result_data(0)))[dim0][dim1];
+  }
+  const float* result0_data() const {  // Const overload of result0_data().
+    return static_cast<const float*>(result_data(0));
+  }
+  const float& result0(size_t dim0, size_t dim1) const {  // Const overload of result0(dim0, dim1).
+    return (*static_cast<const float(*)[1][2]>(
+        result_data(0)))[dim0][dim1];
+  }
+  int result0_size() const {  // Size in bytes of 2 floats.
+    return 2 * sizeof(float);
+  }
+  int result0_count() const {  // Element count, matching the [1][2] view used by result0().
+    return 2;
+  }
+
+  float* result_fetch_output_0_data() {  // Same buffer as result0_data(): result_data(0) viewed as float*.
+    return static_cast<float*>(result_data(0));
+  }
+  float& result_fetch_output_0(size_t dim0, size_t dim1) {  // Element access through a float[1][2] view; indices are not checked.
+    return (*static_cast<float(*)[1][2]>(
+        result_data(0)))[dim0][dim1];
+  }
+  const float* result_fetch_output_0_data() const {  // Const overload of result_fetch_output_0_data().
+    return static_cast<const float*>(result_data(0));
+  }
+  const float& result_fetch_output_0(size_t dim0, size_t dim1) const {  // Const overload of result_fetch_output_0(dim0, dim1).
+    return (*static_cast<const float(*)[1][2]>(
+        result_data(0)))[dim0][dim1];
+  }
+  int result_fetch_output_0_size() const {  // Size in bytes of 2 floats.
+    return 2 * sizeof(float);
+  }
+  int result_fetch_output_0_count() const {  // Element count, matching the [1][2] view used above.
+    return 2;
+  }
+
+  // Methods for managing variable buffers. Buffers are in row-major order.
+  //
+  // For read-write variables we generate the following methods:
+  //
+  // void set_var_X_data(T* data)
+  //   Sets the buffer for variable X.  Must be called before Run if the
+  //   allocation mode is RESULTS_PROFILES_AND_TEMPS_ONLY.
+  //
+  // T* var_X_data()
+  //   Returns the buffer of type T for variable X.  If the allocation mode is
+  //   RESULTS_PROFILES_AND_TEMPS_ONLY then this buffer is the same as the
+  //   buffer passed to set_var_X_data.
+  //
+  // T& var_X(...dim indices...)
+  //   Returns a reference to the value of type T for variable X,
+  //   with dim indices specifying which value. No bounds checking is performed
+  //   on dim indices.
+  //
+  // For readonly variables we generate the same set of methods, except that we
+  // use `const T` instead of `T`.  We use `const T` to avoid erasing the
+  // constness of the buffer passed to `set_var_X_data` but the underlying
+  // buffer is not const (and thus the const can be safely const-cast'ed away)
+  // unless `set_var_X_data` is called with a pointer to constant storage.
+
+ private:
+  // Number of buffers for the compiled computation (length of the kBufferInfos table below).
+  static constexpr size_t kNumBuffers = 12;
+
+  static const ::xla::cpu_function_runtime::BufferInfo* BufferInfos() {  // Returns the static table of kNumBuffers entries.
+    static const ::xla::cpu_function_runtime::BufferInfo
+      kBufferInfos[kNumBuffers] = {  // NOTE(review): field encoding is defined by EncodedBufferInfo, which is not shown in this header.
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{16777216ULL, ~0U, ~0U}),
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{770048ULL, ~0U, ~0U}),
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{212992ULL, ~0U, ~0U}),
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{16384ULL, ~0U, ~0U}),
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{16384ULL, ~0U, ~0U}),
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{754ULL, 0U, ~0U}),  // [5]: only entry whose second field is 0U; ArgIndexToBufferIndex() lists index 5.
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{416ULL, ~0U, ~0U}),
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{208ULL, ~0U, ~0U}),
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{33ULL, ~0U, 0U}),  // [8]: only entry whose third field is 0U; ResultIndexToBufferIndex() lists index 8.
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{33ULL, ~0U, ~0U}),  // [9]: position named by kResultIndex.
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{32ULL, ~0U, ~0U}),
+::xla::cpu_function_runtime::BufferInfo(::xla::cpu_function_runtime::EncodedBufferInfo{32769ULL, ~0U, ~0U})
+      };
+    return kBufferInfos;
+  }
+
+  static const ::tensorflow::int32* ResultIndexToBufferIndex() {  // Table of kNumResults entries: result 0 -> index 8.
+    static constexpr ::tensorflow::int32 kResultIndexToBufferIndex[kNumResults] = {
+8
+    };
+    return kResultIndexToBufferIndex;
+  }
+
+  static const ::tensorflow::int32* ArgIndexToBufferIndex() {  // Table of kNumArgs entries: argument 0 -> index 5.
+    static constexpr ::tensorflow::int32 kArgIndexToBufferIndex[kNumArgs] = {
+5
+    };
+    return kArgIndexToBufferIndex;
+  }
+
+  // The 0-based index of the result tuple in the temporary buffers.
+  static constexpr size_t kResultIndex = 9;
+
+  // Shapes of the input arguments: one dimension list per argument.
+  static constexpr int32_t kArg0Shapes[] = {
+1, 47
+  };
+  static const ShapeInfo* ArgShapeInfos() {  // Returns a table of kNumArgs ShapeInfo entries.
+    static constexpr ShapeInfo kArgShapeInfoTable[kNumArgs] = {
+{ kArg0Shapes, 2 },  // 2 equals the number of entries in kArg0Shapes (presumably the rank -- confirm against ShapeInfo).
+    };
+    return kArgShapeInfoTable;
+  };
+
+  // Shapes of the results: one dimension list per result.
+  static constexpr int32_t kResult0Shapes[] = {
+1, 2
+  };
+  static const ShapeInfo* ResultShapeInfos() {  // Returns a table of kNumResults ShapeInfo entries.
+    static constexpr ShapeInfo kResultShapeInfoTable[kNumResults] = {
+{ kResult0Shapes, 2 },  // 2 equals the number of entries in kResult0Shapes (presumably the rank -- confirm against ShapeInfo).
+    };
+    return kResultShapeInfoTable;
+  };
+
+  // Array of names of each positional argument, terminated by nullptr.
+  static const char** StaticArgNames() {
+    static const char* kNames[] = {"feed_input_1", nullptr};  // One argument name, then the terminator.
+    return kNames;
+  }
+
+  // Array of names of each positional variable, terminated by nullptr.
+  static const char** StaticVariableNames() {
+    static const char* kNames[] = {nullptr};  // No variable names: the array holds only the terminator.
+    return kNames;
+  }
+
+  // Array of names of each positional result, terminated by nullptr.
+  static const char** StaticResultNames() {
+    static const char* kNames[] = {"fetch_output_0", nullptr};  // One result name, then the terminator.
+    return kNames;
+  }
+
+  // Shape of the args and results. Always nullptr here: no ProgramShapeProto is embedded in this header.
+  static const ::xla::ProgramShapeProto* StaticProgramShape() {
+    static const ::xla::ProgramShapeProto* kShape = nullptr;
+    return kShape;
+  }
+
+  // Metadata that can be used to pretty-print profile counters. Always nullptr here.
+  static const ::xla::HloProfilePrinterData* StaticHloProfilePrinterData() {
+    static const ::xla::HloProfilePrinterData* kHloProfilePrinterData =
+      nullptr;
+    return kHloProfilePrinterData;
+  }
+};
+
+}  // end namespace llvm
+
+#endif  // TFCOMPILE_GENERATED__xla_AI4CMEMOPCompiledModel_AARCH64_llvm_AI4CMEMOPCompiledModel_H_
+
+// clang-format on
diff --git a/acpo/models/ai4c-memop/AI4CMEMOPCompiledModel-AARCH64.o b/acpo/models/ai4c-memop/AI4CMEMOPCompiledModel-AARCH64.o
new file mode 100644
index 0000000000000000000000000000000000000000..eeb6fd1e5d45f7e8a491eaf91025e9904e20e99a
Binary files /dev/null and b/acpo/models/ai4c-memop/AI4CMEMOPCompiledModel-AARCH64.o differ
diff --git a/acpo/models/ai4c-memop/AI4CMEMOPCompiledModel-AARCH64_metadata.o b/acpo/models/ai4c-memop/AI4CMEMOPCompiledModel-AARCH64_metadata.o
new file mode 100644
index 0000000000000000000000000000000000000000..cc02d5395c9cbe40bd5f9160f354848878b08721
Binary files /dev/null and b/acpo/models/ai4c-memop/AI4CMEMOPCompiledModel-AARCH64_metadata.o differ
diff --git a/acpo/models/ai4c-memop/saved_model.pb b/acpo/models/ai4c-memop/saved_model.pb
new file mode 100644
index 0000000000000000000000000000000000000000..44df523eeeca293ce283457f097b9692732d9c9a
Binary files /dev/null and b/acpo/models/ai4c-memop/saved_model.pb differ
diff --git a/acpo/models/ai4c-memop/sc.pkl b/acpo/models/ai4c-memop/sc.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..602ca9a9cab4375654ed6941e67c7a268c129b5c
Binary files /dev/null and b/acpo/models/ai4c-memop/sc.pkl differ
diff --git a/acpo/models/ai4c-memop/variables/variables.data-00000-of-00001 b/acpo/models/ai4c-memop/variables/variables.data-00000-of-00001
new file mode 100644
index 0000000000000000000000000000000000000000..84f6bd73d7523bd92d84365ab9b26974b423c9c3
Binary files /dev/null and b/acpo/models/ai4c-memop/variables/variables.data-00000-of-00001 differ
diff --git a/acpo/models/ai4c-memop/variables/variables.index b/acpo/models/ai4c-memop/variables/variables.index
new file mode 100644
index 0000000000000000000000000000000000000000..dcdb372b626ab6efee8c1042f66bb1756773e96b
Binary files /dev/null and b/acpo/models/ai4c-memop/variables/variables.index differ
diff --git a/acpo/models/fi-dummy/output_spec.json b/acpo/models/fi-dummy/output_spec.json
new file mode 100644
index 0000000000000000000000000000000000000000..1636185d6f73a63a5395b98f692df7a293120a8a
--- /dev/null
+++ b/acpo/models/fi-dummy/output_spec.json
@@ -0,0 +1,14 @@
+
+[
+    {
+        "logging_name": "inlining_decision",
+        "tensor_spec": {
+            "name": "PartitionedCall",
+            "port": 0,
+            "type": "float64_t",
+            "shape": [
+                1
+            ]
+        }
+    }
+]
diff --git a/acpo/models/fi-dummy/saved_model.pb b/acpo/models/fi-dummy/saved_model.pb
new file mode 100644
index 0000000000000000000000000000000000000000..fc47e10babe8e4eb25fe78752e716281e1f7f82f
Binary files /dev/null and b/acpo/models/fi-dummy/saved_model.pb differ
diff --git a/acpo/models/fi-dummy/variables/variables.data-00000-of-00001 b/acpo/models/fi-dummy/variables/variables.data-00000-of-00001
new file mode 100644
index 0000000000000000000000000000000000000000..4b27e1a41bd8cd41c633b7f50d572ad946e92d79
Binary files /dev/null and b/acpo/models/fi-dummy/variables/variables.data-00000-of-00001 differ
diff --git a/acpo/models/fi-dummy/variables/variables.index b/acpo/models/fi-dummy/variables/variables.index
new file mode 100644
index 0000000000000000000000000000000000000000..333057c66e88065a76dd0f39edd3fcfe208fc138
Binary files /dev/null and b/acpo/models/fi-dummy/variables/variables.index differ