From 4e1d0c47e7afb87894c8fc1674f5808fe44f9b9d Mon Sep 17 00:00:00 2001 From: swcompiler Date: Tue, 25 Mar 2025 09:44:08 +0800 Subject: [PATCH] Add sw64 support --- add-sw64-support.patch | 10241 +++++++++++++++++++++++++++++++++++++++ luajit.spec | 14 +- 2 files changed, 10253 insertions(+), 2 deletions(-) create mode 100644 add-sw64-support.patch diff --git a/add-sw64-support.patch b/add-sw64-support.patch new file mode 100644 index 0000000..98c750d --- /dev/null +++ b/add-sw64-support.patch @@ -0,0 +1,10241 @@ +diff --git a/Makefile b/Makefile +index 0f93308..86714ae 100644 +--- a/Makefile ++++ b/Makefile +@@ -88,7 +88,7 @@ FILES_INC= lua.h lualib.h lauxlib.h luaconf.h lua.hpp luajit.h + FILES_JITLIB= bc.lua bcsave.lua dump.lua p.lua v.lua zone.lua \ + dis_x86.lua dis_x64.lua dis_arm.lua dis_arm64.lua \ + dis_arm64be.lua dis_ppc.lua dis_mips.lua dis_mipsel.lua \ +- dis_mips64.lua dis_mips64el.lua vmdef.lua ++ dis_mips64.lua dis_mips64el.lua vmdef.lua dis_sw64.lua + + ifeq (,$(findstring Windows,$(OS))) + HOST_SYS:= $(shell uname -s) +diff --git a/dynasm/dasm_sw64.h b/dynasm/dasm_sw64.h +new file mode 100644 +index 0000000..319f79b +--- /dev/null ++++ b/dynasm/dasm_sw64.h +@@ -0,0 +1,419 @@ ++/* ++** DynASM SW64 encoding engine. ++** Copyright (C) 2023 Mike Pall. All rights reserved. ++** Released under the MIT license. See dynasm.lua for full copyright notice. ++*/ ++ ++#include ++#include ++#include ++#include ++ ++#define DASM_ARCH "sw64" ++ ++#ifndef DASM_EXTERN ++#define DASM_EXTERN(a, b, c, d) 0 ++#endif ++ ++/* Action definitions. */ ++enum { ++ DASM_STOP, DASM_SECTION, DASM_ESC, DASM_REL_EXT, ++ /* The following actions need a buffer position. */ ++ DASM_ALIGN, DASM_REL_LG, DASM_LABEL_LG, ++ /* The following actions also have an argument. */ ++ DASM_REL_PC, DASM_LABEL_PC, DASM_IMM, DASM_IMMS, ++ DASM__MAX ++}; ++ ++/* Maximum number of section buffer positions for a single dasm_put() call. */ ++#define DASM_MAXSECPOS 25 ++ ++/* DynASM encoder status codes. 
Action list offset or number are or'ed in. */ ++#define DASM_S_OK 0x00000000 ++#define DASM_S_NOMEM 0x01000000 ++#define DASM_S_PHASE 0x02000000 ++#define DASM_S_MATCH_SEC 0x03000000 ++#define DASM_S_RANGE_I 0x11000000 ++#define DASM_S_RANGE_SEC 0x12000000 ++#define DASM_S_RANGE_LG 0x13000000 ++#define DASM_S_RANGE_PC 0x14000000 ++#define DASM_S_RANGE_REL 0x15000000 ++#define DASM_S_UNDEF_LG 0x21000000 ++#define DASM_S_UNDEF_PC 0x22000000 ++ ++/* Macros to convert positions (8 bit section + 24 bit index). */ ++#define DASM_POS2IDX(pos) ((pos)&0x00ffffff) ++#define DASM_POS2BIAS(pos) ((pos)&0xff000000) ++#define DASM_SEC2POS(sec) ((sec) << 24) ++#define DASM_POS2SEC(pos) ((pos) >> 24) ++#define DASM_POS2PTR(D, pos) (D->sections[DASM_POS2SEC(pos)].rbuf + (pos)) ++ ++/* Action list type. */ ++typedef const unsigned int *dasm_ActList; ++ ++/* Per-section structure. */ ++typedef struct dasm_Section { ++ int *rbuf; /* Biased buffer pointer (negative section bias). */ ++ int *buf; /* True buffer pointer. */ ++ size_t bsize; /* Buffer size in bytes. */ ++ int pos; /* Biased buffer position. */ ++ int epos; /* End of biased buffer position - max single put. */ ++ int ofs; /* Byte offset into section. */ ++} dasm_Section; ++ ++/* Core structure holding the DynASM encoding state. */ ++struct dasm_State { ++ size_t psize; /* Allocated size of this structure. */ ++ dasm_ActList actionlist; /* Current actionlist pointer. */ ++ int *lglabels; /* Local/global chain/pos ptrs. */ ++ size_t lgsize; ++ int *pclabels; /* PC label chains/pos ptrs. */ ++ size_t pcsize; ++ void **globals; /* Array of globals (bias -10). */ ++ dasm_Section *section; /* Pointer to active section. */ ++ size_t codesize; /* Total size of all code sections. */ ++ int maxsection; /* 0 <= sectionidx < maxsection. */ ++ int status; /* Status code. */ ++ dasm_Section sections[1]; /* All sections. Alloc-extended. */ ++}; ++ ++/* The size of the core structure depends on the max. number of sections. 
*/ ++#define DASM_PSZ(ms) (sizeof(dasm_State) + (ms - 1) * sizeof(dasm_Section)) ++ ++ ++/* Initialize DynASM state. */ ++void dasm_init(Dst_DECL, int maxsection) ++{ ++ dasm_State *D; ++ size_t psz = 0; ++ int i; ++ Dst_REF = NULL; ++ DASM_M_GROW(Dst, struct dasm_State, Dst_REF, psz, DASM_PSZ(maxsection)); ++ D = Dst_REF; ++ D->psize = psz; ++ D->lglabels = NULL; ++ D->lgsize = 0; ++ D->pclabels = NULL; ++ D->pcsize = 0; ++ D->globals = NULL; ++ D->maxsection = maxsection; ++ for (i = 0; i < maxsection; i++) { ++ D->sections[i].buf = NULL; /* Need this for pass3. */ ++ D->sections[i].rbuf = D->sections[i].buf - DASM_SEC2POS(i); ++ D->sections[i].bsize = 0; ++ D->sections[i].epos = 0; /* Wrong, but is recalculated after resize. */ ++ } ++} ++ ++/* Free DynASM state. */ ++void dasm_free(Dst_DECL) ++{ ++ dasm_State *D = Dst_REF; ++ int i; ++ for (i = 0; i < D->maxsection; i++) ++ if (D->sections[i].buf) ++ DASM_M_FREE(Dst, D->sections[i].buf, D->sections[i].bsize); ++ if (D->pclabels) DASM_M_FREE(Dst, D->pclabels, D->pcsize); ++ if (D->lglabels) DASM_M_FREE(Dst, D->lglabels, D->lgsize); ++ DASM_M_FREE(Dst, D, D->psize); ++} ++ ++/* Setup global label array. Must be called before dasm_setup(). */ ++void dasm_setupglobal(Dst_DECL, void **gl, unsigned int maxgl) ++{ ++ dasm_State *D = Dst_REF; ++ D->globals = gl - 10; /* Negative bias to compensate for locals. */ ++ DASM_M_GROW(Dst, int, D->lglabels, D->lgsize, (10 + maxgl) * sizeof(int)); ++} ++ ++/* Grow PC label array. Can be called after dasm_setup(), too. */ ++void dasm_growpc(Dst_DECL, unsigned int maxpc) ++{ ++ dasm_State *D = Dst_REF; ++ size_t osz = D->pcsize; ++ DASM_M_GROW(Dst, int, D->pclabels, D->pcsize, maxpc * sizeof(int)); ++ memset((void *)(((unsigned char *)D->pclabels) + osz), 0, D->pcsize - osz); ++} ++ ++/* Setup encoder. 
*/ ++void dasm_setup(Dst_DECL, const void *actionlist) ++{ ++ dasm_State *D = Dst_REF; ++ int i; ++ D->actionlist = (dasm_ActList)actionlist; ++ D->status = DASM_S_OK; ++ D->section = &D->sections[0]; ++ memset((void *)D->lglabels, 0, D->lgsize); ++ if (D->pclabels) memset((void *)D->pclabels, 0, D->pcsize); ++ for (i = 0; i < D->maxsection; i++) { ++ D->sections[i].pos = DASM_SEC2POS(i); ++ D->sections[i].ofs = 0; ++ } ++} ++ ++ ++#ifdef DASM_CHECKS ++#define CK(x, st) \ ++ do { if (!(x)) { \ ++ D->status = DASM_S_##st|(p-D->actionlist-1); return; } } while (0) ++#define CKPL(kind, st) \ ++ do { if ((size_t)((char *)pl-(char *)D->kind##labels) >= D->kind##size) { \ ++ D->status = DASM_S_RANGE_##st|(p-D->actionlist-1); return; } } while (0) ++#else ++#define CK(x, st) ((void)0) ++#define CKPL(kind, st) ((void)0) ++#endif ++ ++/* Pass 1: Store actions and args, link branches/labels, estimate offsets. */ ++void dasm_put(Dst_DECL, int start, ...) ++{ ++ va_list ap; ++ dasm_State *D = Dst_REF; ++ dasm_ActList p = D->actionlist + start; ++ dasm_Section *sec = D->section; ++ int pos = sec->pos, ofs = sec->ofs; ++ int *b; ++ ++ if (pos >= sec->epos) { ++ DASM_M_GROW(Dst, int, sec->buf, sec->bsize, ++ sec->bsize + 2 * DASM_MAXSECPOS * sizeof(int)); ++ sec->rbuf = sec->buf - DASM_POS2BIAS(pos); ++ sec->epos = (int)sec->bsize/sizeof(int) - DASM_MAXSECPOS+DASM_POS2BIAS(pos); ++ } ++ ++ b = sec->rbuf; ++ b[pos++] = start; ++ ++ va_start(ap, start); ++ while (1) { ++ unsigned int ins = *p++; ++ unsigned int action = (ins >> 16) - 0xff00; ++ if (action >= DASM__MAX) { ++ ofs += 4; ++ } else { ++ int *pl, n = action >= DASM_REL_PC ? 
va_arg(ap, int) : 0; ++ switch (action) { ++ case DASM_STOP: goto stop; ++ case DASM_SECTION: ++ n = (ins & 255); CK(n < D->maxsection, RANGE_SEC); ++ D->section = &D->sections[n]; goto stop; ++ case DASM_ESC: p++; ofs += 4; break; ++ case DASM_REL_EXT: break; ++ case DASM_ALIGN: ofs += (ins & 255); b[pos++] = ofs; break; ++ case DASM_REL_LG: ++ n = (ins & 2047) - 10; pl = D->lglabels + n; ++ /* Bkwd rel or global. */ ++ if (n >= 0) { CK(n>=10||*pl<0, RANGE_LG); CKPL(lg, LG); goto putrel; } ++ pl += 10; n = *pl; ++ if (n < 0) n = 0; /* Start new chain for fwd rel if label exists. */ ++ goto linkrel; ++ case DASM_REL_PC: ++ pl = D->pclabels + n; CKPL(pc, PC); ++ putrel: ++ n = *pl; ++ if (n < 0) { /* Label exists. Get label pos and store it. */ ++ b[pos] = -n; ++ } else { ++ linkrel: ++ b[pos] = n; /* Else link to rel chain, anchored at label. */ ++ *pl = pos; ++ } ++ pos++; ++ break; ++ case DASM_LABEL_LG: ++ pl = D->lglabels + (ins & 2047) - 10; CKPL(lg, LG); goto putlabel; ++ case DASM_LABEL_PC: ++ pl = D->pclabels + n; CKPL(pc, PC); ++ putlabel: ++ n = *pl; /* n > 0: Collapse rel chain and replace with label pos. */ ++ while (n > 0) { int *pb = DASM_POS2PTR(D, n); n = *pb; *pb = pos; ++ } ++ *pl = -pos; /* Label exists now. */ ++ b[pos++] = ofs; /* Store pass1 offset estimate. */ ++ break; ++ case DASM_IMM: case DASM_IMMS: ++#ifdef DASM_CHECKS ++ CK((n & ((1 << ((ins >> 10) & 31)) - 1)) == 0, RANGE_I); ++#endif ++ n >>= ((ins >> 10) & 31); ++#ifdef DASM_CHECKS ++ if (ins & 0x8000) ++ CK(((n + (1<<(((ins>>5)&31)-1)))>>((ins>>5)&31)) == 0, RANGE_I); ++ else ++ CK((n >> ((ins >> 5) & 31)) == 0, RANGE_I); ++#endif ++ b[pos++] = n; ++ break; ++ } ++ } ++ } ++stop: ++ va_end(ap); ++ sec->pos = pos; ++ sec->ofs = ofs; ++} ++#undef CK ++ ++/* Pass 2: Link sections, shrink aligns, fix label offsets. 
*/ ++int dasm_link(Dst_DECL, size_t *szp) ++{ ++ dasm_State *D = Dst_REF; ++ int secnum; ++ int ofs = 0; ++ ++#ifdef DASM_CHECKS ++ *szp = 0; ++ if (D->status != DASM_S_OK) return D->status; ++ { ++ int pc; ++ for (pc = 0; pc * sizeof(int) < D->pcsize; pc++) ++ if (D->pclabels[pc] > 0) return DASM_S_UNDEF_PC|pc; ++ } ++#endif ++ ++ { /* Handle globals not defined in this translation unit. */ ++ int idx; ++ for (idx = 20; idx * sizeof(int) < D->lgsize; idx++) { ++ int n = D->lglabels[idx]; ++ /* Undefined label: Collapse rel chain and replace with marker (< 0). */ ++ while (n > 0) { int *pb = DASM_POS2PTR(D, n); n = *pb; *pb = -idx; } ++ } ++ } ++ ++ /* Combine all code sections. No support for data sections (yet). */ ++ for (secnum = 0; secnum < D->maxsection; secnum++) { ++ dasm_Section *sec = D->sections + secnum; ++ int *b = sec->rbuf; ++ int pos = DASM_SEC2POS(secnum); ++ int lastpos = sec->pos; ++ ++ while (pos != lastpos) { ++ dasm_ActList p = D->actionlist + b[pos++]; ++ while (1) { ++ unsigned int ins = *p++; ++ unsigned int action = (ins >> 16) - 0xff00; ++ switch (action) { ++ case DASM_STOP: case DASM_SECTION: goto stop; ++ case DASM_ESC: p++; break; ++ case DASM_REL_EXT: break; ++ case DASM_ALIGN: ofs -= (b[pos++] + ofs) & (ins & 255); break; ++ case DASM_REL_LG: case DASM_REL_PC: pos++; break; ++ case DASM_LABEL_LG: case DASM_LABEL_PC: b[pos++] += ofs; break; ++ case DASM_IMM: case DASM_IMMS: pos++; break; ++ } ++ } ++ stop: (void)0; ++ } ++ ofs += sec->ofs; /* Next section starts right after current section. */ ++ } ++ ++ D->codesize = ofs; /* Total size of all code sections */ ++ *szp = ofs; ++ return DASM_S_OK; ++} ++ ++#ifdef DASM_CHECKS ++#define CK(x, st) \ ++ do { if (!(x)) return DASM_S_##st|(p-D->actionlist-1); } while (0) ++#else ++#define CK(x, st) ((void)0) ++#endif ++ ++/* Pass 3: Encode sections. 
*/ ++int dasm_encode(Dst_DECL, void *buffer) ++{ ++ dasm_State *D = Dst_REF; ++ char *base = (char *)buffer; ++ unsigned int *cp = (unsigned int *)buffer; ++ int secnum; ++ ++ /* Encode all code sections. No support for data sections (yet). */ ++ for (secnum = 0; secnum < D->maxsection; secnum++) { ++ dasm_Section *sec = D->sections + secnum; ++ int *b = sec->buf; ++ int *endb = sec->rbuf + sec->pos; ++ ++ while (b != endb) { ++ dasm_ActList p = D->actionlist + *b++; ++ while (1) { ++ unsigned int ins = *p++; ++ unsigned int action = (ins >> 16) - 0xff00; ++ int n = (action >= DASM_ALIGN && action < DASM__MAX) ? *b++ : 0; ++ switch (action) { ++ case DASM_STOP: case DASM_SECTION: goto stop; ++ case DASM_ESC: *cp++ = *p++; break; ++ case DASM_REL_EXT: ++ n = DASM_EXTERN(Dst, (unsigned char *)cp, (ins & 2047), 1); ++ goto patchrel; ++ case DASM_ALIGN: ++ ins &= 255; while ((((char *)cp - base) & ins)) *cp++ = 0x60000000; ++ break; ++ case DASM_REL_LG: ++ CK(n >= 0, UNDEF_LG); ++ case DASM_REL_PC: ++ CK(n >= 0, UNDEF_PC); ++ n = *DASM_POS2PTR(D, n); ++ if (ins & 2048) ++ n = n - (int)((char *)cp - base); ++ else ++ n = (n + (int)(size_t)base) & 0x0fffffff; ++ patchrel: ++ CK((n & 3) == 0 && ((n + ((ins & 2048) ? 0x00020000 : 0)) >> ++ ((ins & 2048) ? 18 : 28)) == 0, ++ RANGE_REL); ++ cp[-1] |= ((n >> 2) & ((ins & 2048) ? 0x001fffff : 0x03ffffff)); ++ break; ++ case DASM_LABEL_LG: ++ ins &= 2047; if (ins >= 20) D->globals[ins-10] = (void *)(base + n); ++ break; ++ case DASM_LABEL_PC: break; ++ case DASM_IMMS: ++ cp[-1] |= ((n>>3) & 4); n &= 0x1f; ++ /* fallthrough */ ++ case DASM_IMM: ++ cp[-1] |= (n & ((1 << ((ins >> 5) & 31)) - 1)) << (ins & 31); ++ break; ++ default: *cp++ = ins; break; ++ } ++ } ++ stop: (void)0; ++ } ++ } ++ ++ if (base + D->codesize != (char *)cp) /* Check for phase errors. */ ++ return DASM_S_PHASE; ++ return DASM_S_OK; ++} ++#undef CK ++ ++/* Get PC label offset. 
*/ ++int dasm_getpclabel(Dst_DECL, unsigned int pc) ++{ ++ dasm_State *D = Dst_REF; ++ if (pc * sizeof(int) < D->pcsize) { ++ int pos = D->pclabels[pc]; ++ if (pos < 0) return *DASM_POS2PTR(D, -pos); ++ if (pos > 0) return -1; /* Undefined. */ ++ } ++ return -2; /* Unused or out of range. */ ++} ++ ++#ifdef DASM_CHECKS ++/* Optional sanity checker to call between isolated encoding steps. */ ++int dasm_checkstep(Dst_DECL, int secmatch) ++{ ++ dasm_State *D = Dst_REF; ++ if (D->status == DASM_S_OK) { ++ int i; ++ for (i = 1; i <= 9; i++) { ++ if (D->lglabels[i] > 0) { D->status = DASM_S_UNDEF_LG|i; break; } ++ D->lglabels[i] = 0; ++ } ++ } ++ if (D->status == DASM_S_OK && secmatch >= 0 && ++ D->section != &D->sections[secmatch]) ++ D->status = DASM_S_MATCH_SEC | (D->section - D->sections); ++ return D->status; ++} ++#endif ++ +diff --git a/dynasm/dasm_sw64.lua b/dynasm/dasm_sw64.lua +new file mode 100644 +index 0000000..3787d6c +--- /dev/null ++++ b/dynasm/dasm_sw64.lua +@@ -0,0 +1,767 @@ ++------------------------------------------------------------------------------ ++-- DynASM SW64 module. ++-- ++-- Copyright (C) 2023 Mike Pall. All rights reserved. ++-- See dynasm.lua for full copyright notice. ++------------------------------------------------------------------------------ ++ ++-- Module information: ++local _info = { ++ arch = "sw64", ++ description = "DynASM SW64 module", ++ version = "1.4.0", ++ vernum = 10400, ++ release = "2023-02-03", ++ author = "Mike Pall", ++ license = "MIT", ++} ++ ++-- Exported glue functions for the arch-specific module. ++local _M = { _info = _info } ++ ++-- Cache library functions. 
++local type, tonumber, pairs, ipairs = type, tonumber, pairs, ipairs ++local assert, setmetatable = assert, setmetatable ++local _s = string ++local sub, format, byte, char = _s.sub, _s.format, _s.byte, _s.char ++local match, gmatch = _s.match, _s.gmatch ++local concat, sort = table.concat, table.sort ++local bit = bit or require("bit") ++local band, shl, shr, sar = bit.band, bit.lshift, bit.rshift, bit.arshift ++local tohex = bit.tohex ++ ++-- Inherited tables and callbacks. ++local g_opt, g_arch ++local wline, werror, wfatal, wwarn ++ ++-- Action name list. ++-- CHECK: Keep this in sync with the C code! ++local action_names = { ++ "STOP", "SECTION", "ESC", "REL_EXT", ++ "ALIGN", "REL_LG", "LABEL_LG", ++ "REL_PC", "LABEL_PC", "IMM", "IMMS", ++} ++ ++-- Maximum number of section buffer positions for dasm_put(). ++-- CHECK: Keep this in sync with the C code! ++local maxsecpos = 25 -- Keep this low, to avoid excessively long C lines. ++ ++-- Action name -> action number. ++local map_action = {} ++for n,name in ipairs(action_names) do ++ map_action[name] = n-1 ++end ++ ++-- Action list buffer. ++local actlist = {} ++ ++-- Argument list for next dasm_put(). Start with offset 0 into action list. ++local actargs = { 0 } ++ ++-- Current number of section buffer positions for dasm_put(). ++local secpos = 1 ++ ++------------------------------------------------------------------------------ ++ ++-- Dump action names and numbers. ++local function dumpactions(out) ++ out:write("DynASM encoding engine action codes:\n") ++ for n,name in ipairs(action_names) do ++ local num = map_action[name] ++ out:write(format(" %-10s %02X %d\n", name, num, num)) ++ end ++ out:write("\n") ++end ++ ++-- Write action list buffer as a huge static C array. 
++local function writeactions(out, name) ++ local nn = #actlist ++ if nn == 0 then nn = 1; actlist[0] = map_action.STOP end ++ out:write("static const unsigned int ", name, "[", nn, "] = {\n") ++ for i = 1,nn-1 do ++ assert(out:write("0x", tohex(actlist[i]), ",\n")) ++ end ++ assert(out:write("0x", tohex(actlist[nn]), "\n};\n\n")) ++end ++ ++------------------------------------------------------------------------------ ++ ++-- Add word to action list. ++local function wputxw(n) ++ assert(n >= 0 and n <= 0xffffffff and n % 1 == 0, "word out of range") ++ actlist[#actlist+1] = n ++end ++ ++-- Add action to list with optional arg. Advance buffer pos, too. ++local function waction(action, val, a, num) ++ local w = assert(map_action[action], "bad action name `"..action.."'") ++ wputxw(0xff000000 + w * 0x10000 + (val or 0)) ++ if a then actargs[#actargs+1] = a end ++ if a or num then secpos = secpos + (num or 1) end ++end ++ ++-- Flush action list (intervening C code or buffer pos overflow). ++local function wflush(term) ++ if #actlist == actargs[1] then return end -- Nothing to flush. ++ if not term then waction("STOP") end -- Terminate action list. ++ wline(format("dasm_put(Dst, %s);", concat(actargs, ", ")), true) ++ actargs = { #actlist } -- Actionlist offset is 1st arg to next dasm_put(). ++ secpos = 1 -- The actionlist offset occupies a buffer position, too. ++end ++ ++-- Put escaped word. ++local function wputw(n) ++ if n >= 0xff000000 then waction("ESC") end ++ wputxw(n) ++end ++ ++-- Reserve position for word. ++local function wpos() ++ local pos = #actlist+1 ++ actlist[pos] = "" ++ return pos ++end ++ ++-- Store word to reserved position. ++local function wputpos(pos, n) ++ assert(n >= 0 and n <= 0xffffffff and n % 1 == 0, "word out of range") ++ actlist[pos] = n ++end ++ ++------------------------------------------------------------------------------ ++ ++-- Global label name -> global label number. With auto assignment on 1st use. 
++local next_global = 20 ++local map_global = setmetatable({}, { __index = function(t, name) ++ if not match(name, "^[%a_][%w_]*$") then werror("bad global label") end ++ local n = next_global ++ if n > 2047 then werror("too many global labels") end ++ next_global = n + 1 ++ t[name] = n ++ return n ++end}) ++ ++-- Dump global labels. ++local function dumpglobals(out, lvl) ++ local t = {} ++ for name, n in pairs(map_global) do t[n] = name end ++ out:write("Global labels:\n") ++ for i=20,next_global-1 do ++ out:write(format(" %s\n", t[i])) ++ end ++ out:write("\n") ++end ++ ++-- Write global label enum. ++local function writeglobals(out, prefix) ++ local t = {} ++ for name, n in pairs(map_global) do t[n] = name end ++ out:write("enum {\n") ++ for i=20,next_global-1 do ++ out:write(" ", prefix, t[i], ",\n") ++ end ++ out:write(" ", prefix, "_MAX\n};\n") ++end ++ ++-- Write global label names. ++local function writeglobalnames(out, name) ++ local t = {} ++ for name, n in pairs(map_global) do t[n] = name end ++ out:write("static const char *const ", name, "[] = {\n") ++ for i=20,next_global-1 do ++ out:write(" \"", t[i], "\",\n") ++ end ++ out:write(" (const char *)0\n};\n") ++end ++ ++------------------------------------------------------------------------------ ++ ++-- Extern label name -> extern label number. With auto assignment on 1st use. ++local next_extern = 0 ++local map_extern_ = {} ++local map_extern = setmetatable({}, { __index = function(t, name) ++ -- No restrictions on the name for now. ++ local n = next_extern ++ if n > 2047 then werror("too many extern labels") end ++ next_extern = n + 1 ++ t[name] = n ++ map_extern_[n] = name ++ return n ++end}) ++ ++-- Dump extern labels. ++local function dumpexterns(out, lvl) ++ out:write("Extern labels:\n") ++ for i=0,next_extern-1 do ++ out:write(format(" %s\n", map_extern_[i])) ++ end ++ out:write("\n") ++end ++ ++-- Write extern label names. 
++local function writeexternnames(out, name) ++ out:write("static const char *const ", name, "[] = {\n") ++ for i=0,next_extern-1 do ++ out:write(" \"", map_extern_[i], "\",\n") ++ end ++ out:write(" (const char *)0\n};\n") ++end ++ ++------------------------------------------------------------------------------ ++ ++-- Arch-specific maps. ++local map_archdef = { zero="r31", sp="r30", ra="r26", pv="r27", fzero="f31" } -- Ext. register name -> int. name. ++ ++local map_type = {} -- Type name -> { ctype, reg } ++local ctypenum = 0 -- Type number (for Dt... macros). ++ ++-- Reverse defines for registers. ++function _M.revdef(s) ++ if s == "r30" then return "sp" ++ elseif s == "r26" then return "ra" ++ elseif s == "r31" then return "zero" ++ elseif s == "f31" then return "fzero" ++ elseif s == "r27" then return "pv" end ++ return s ++end ++ ++------------------------------------------------------------------------------ ++ ++-- Template strings for SW64 instructions. ++local map_op = { ++ ldi_2 = "f8000000Ao", --0x3e ++ ldih_2 = "fc000000Ao", --0x3f ++ ++ ldl_2 = "8c000000Ao", --0x23 ++ ldw_2 = "88000000Ao", --0x22 ++ ldhu_2 = "84000000Ao", --0x21 ++ ldbu_2 = "80000000Ao", --0x20 ++ ++ fstd_2 = "bc000000Fo", --0x2f ++ fldd_2 = "9c000000Fo", --0x27 ++ flds_2 = "98000000Fo", --0x26 ++ fsts_2 = "b8000000Fo", --0x2e ++ ifmovd_2= "601f0820AI", ++ ifmovs_2= "601f0800AI", ++ fimovd_2= "401f0f00FD", ++ fcvtds_2= "63e00420GI", ++ fcvtsd_2= "63e00400GI", ++ fcvtld_2= "63e005e0GI", ++ fcvtls_2= "63e005a0GI", ++ fcvtdl_2= "63e004e0GI", ++ fcvtdln_2 = "63e004a0GI", ++ fcvtdlp_2 = "63e00460GI", ++ fcvtdlz_2 = "63e00480GI", ++ fcvtwl_2 = "63e00500GI", ++ fcvtlw_2 = "63e00520GI", ++ fcpys_3 = "60000600FGI", ++ fcpysn_3 = "60000640FGI", ++ ++ faddd_3 = "60000020FGI", ++ fsubd_3 = "60000060FGI", ++ fmuld_3 = "600000a0FGI", ++ fdivd_3 = "600000e0FGI", ++ ++ fcmpeq_3 = "60000200FGI", ++ fcmple_3 = "60000220FGI", ++ fcmplt_3 = "60000240FGI", ++ fcmpun_3 = "60000260FGI", ++ ++ stl_2 = 
"ac000000Ao", --0x2b ++ stw_2 = "a8000000Ao", --0x2a ++ sth_2 = "a4000000Ao", --0x29 ++ stb_2 = "a0000000Ao", --0x28 ++ ++ addli_3 = "48000100AjD", --0x12.08 ++ subli_3 = "48000120AjD", --0x12.09 ++ mulli_3 = "48000300AjD", --0x12.18 ++ mull_3 = "40000300ABD", --0x10.18 ++ mulw_3 = "40000200ABD", --0x10.10 ++ addl_3 = "40000100ABD", --0x10.08 ++ subl_3 = "40000120ABD", --0x10.09 ++ subw_3 = "40000020ABD", --0x10.01 ++ subwi_3 = "48000020AjD", --0x12.01 ++ s8addl_3 = "40000180ABD", --0x10.0c ++ s8addli_3 = "48000180AjD", --0x12.0c ++ s8addw_3 = "40000080ABD", --0x10.04 ++ s8addwi_3 = "48000080AjD", --0x12.04 ++ s4addl_3 = "40000140ABD", --0x10.0a ++ s4addli_3 = "48000140AjD", --0x12.0a ++ s4addw_3 = "40000040ABD", --0x10.02 ++ s4addwi_3 = "48000040AjD", --0x12.02 ++ addw_3 = "40000000ABD", --0x10.00 ++ addwi_3 = "48000000AjD", --0x12.00 ++ divw_3 = "40000220ABD", --0x10.11 ++ udivw_3 = "40000240ABD", --0x10.12 ++ remw_3 = "40000260ABD", --0x10.13 ++ uremw_3 = "40000280ABD", --0x10.14 ++ divl_3 = "40000340ABD", --0x10.1a ++ udivl_3 = "40000360ABD", --0x10.1b ++ reml_3 = "40000380ABD", --0x10.1c ++ ureml_3 = "400003a0ABD", --0x10.1d ++ ++ andi_3 = "48000700AjD", ++ and_3 = "40000700ABD", ++ ornoti_3 ="48000760AjD", ++ ornot_3 = "40000760ABD", ++ bis_3 = "40000740ABD", ++ bisi_3 = "48000740AjD", ++ bic_3 = "40000720ABD", ++ bici_3 = "48000720AjD", ++ xori_3 = "48000780AjD", ++ xor_3 = "40000780ABD", ++ slli_3 = "48000900AjD", ++ sll_3 = "40000900ABD", ++ srli_3 = "48000920AjD", ++ srl_3 = "40000920ABD", ++ srai_3 = "48000940AjD", ++ sra_3 = "40000940ABD", ++ roll_3 = "40000960ABD", ++ rolli_3 = "48000960AjD", ++ sllw_3 = "40000980ABD", ++ sllwi_3 = "48000980AjD", ++ srlw_3 = "400009a0ABD", ++ srlwi_3 = "480009a0AjD", ++ sraw_3 = "400009c0ABD", ++ srawi_3 = "480009c0AjD", ++ rolw_3 = "400009e0ABD", ++ rolwi_3 = "480009e0AjD", ++ ++ beq_2 = "c0000000Ab", --0x30 ++ bne_2 = "c4000000Ab", --0x31 ++ blt_2 = "c8000000Ab", --0x32 ++ ble_2 = "cc000000Ab", --0x33 ++ bgt_2 = 
"d0000000Ab", --0x34 ++ bge_2 = "d4000000Ab", --0x35 ++ ++ fbeq_2 = "e0000000Fb", ++ fbge_2 = "f4000000Fb", ++ fbgt_2 = "f0000000Fb", ++ fble_2 = "ec000000Fb", ++ fblt_2 = "e8000000Fb", ++ fbne_2 = "e4000000Fb", ++ ++ call_2 = "04000000Ao", --0x1 ++ ret_2 = "08000000Ao", --0x2 ++ jmp_2 = "0C000000Ao", --0x3 ++ br_2 = "10000000Ab", --0x4 ++ getpc_1 = "10000000A", --br Rn, 0 ++ ++ ++ cmpeq_3 = "40000500ABD", ++ cmplt_3 = "40000520ABD", ++ cmplti_3 = "48000520AjD", ++ cmple_3 = "40000540ABD", ++ cmpult_3 = "40000560ABD", ++ cmpulti_3 = "48000560AjD", ++ cmpule_3 = "40000580ABD", ++ sbt_3 = "400005a0ABD", ++ sbti_3 = "480005a0AjD", ++ cbt_3 = "400005c0ABD", ++ cbti_3 = "480005c0AjD", ++ ++ ++ maskhw_3 = "40000cc0ABD", ++ maskhwi_3 = "48000cc0AjD", ++ maskhl_3 = "40000ce0ABD", ++ maskhli_3 = "48000ce0AjD", ++ maskll_3 = "40000c60ABD", ++ masklli_3 = "48000c60AjD", ++ ++ zap_3 = "40000d00ABD", ++ zapi_3 = "48000d00AjD", ++ ++ extlb_3 = "48000a00AjD", ++ extlh_3 = "48000a20AjD", ++ extlw_3 = "48000a40AjD", ++ extll_3 = "48000a60AjD", ++ exthb_3 = "48000a80AjD", ++ exthh_3 = "48000aa0AjD", ++ exthw_3 = "48000ac0AjD", ++ exthl_3 = "48000ae0AjD", ++ ++ inslb_3 = "48000800AjD", ++ ++ maskhw_3 = "48000cc0AjD", ++ ++ sexth_2 = "43e00d60BD", ++ sexthi_2 = "4be00d60iD", ++ sextb_2 = "43e00d40BD", ++ sextbi_2 = "4be00d40iD", ++ ++ selle_4 = "44000c00ABCD", ++ sellei_4 = "4c000c00AiCD", ++ sellt_4 = "44001000ABCD", ++ sellti_4 = "4c001000AiCD", ++ selgt_4 = "44000800ABCD", ++ selgti_4 = "4c000800AiCD", ++ selge_4 = "44000400ABCD", ++ selgei_4 = "4c000400AiCD", ++ selne_4 = "44001400ABCD", ++ selnei_4 = "4c001400AiCD", ++ seleq_4 = "44000000ABCD", ++ seleqi_4 = "4c000000AiCD", ++ ++ fselne_4 = "64004400FGHI", ++ fseleq_4 = "64004000FGHI", ++ ++ fsqrtd_2 = "63e00120GI", ++ ++ setfpec1_0 = "60000aa0", ++ setfpec3_0 = "60000ae0", ++ ++ syscall_0 = "00000083", --0x0.83 ++ bpt_0 = "00000080", --0x0.80 ++ ++ ldw_dec_2 = "20004000Ap", ++ ldl_dec_2 = "20005000Ap", ++} ++ 
++------------------------------------------------------------------------------ ++ ++local function parse_gpr(expr) ++ local tname, ovreg = match(expr, "^([%w_]+):(r[1-3]?[0-9])$") ++ local tp = map_type[tname or expr] ++ if tp then ++ local reg = ovreg or tp.reg ++ if not reg then ++ werror("type `"..(tname or expr).."' needs a register override") ++ end ++ expr = reg ++ end ++ local r = match(expr, "^r([1-3]?[0-9])$") ++ if r then ++ r = tonumber(r) ++ if r <= 31 then return r, tp end ++ end ++ werror("bad register name `"..expr.."'") ++end ++ ++local function parse_fpr(expr) ++ local r = match(expr, "^f([1-3]?[0-9])$") ++ if r then ++ r = tonumber(r) ++ if r <= 31 then return r end ++ end ++ werror("bad register name `"..expr.."'") ++end ++ ++ ++local function parse_imm(imm, bits, shift, scale, signed) ++ local n = tonumber(imm) ++ if n then ++ local m = sar(n, scale) ++ if shl(m, scale) == n then ++ if signed then ++ local s = sar(m, bits-1) ++ if s == 0 then return shl(m, shift) ++ elseif s == -1 then return shl(m + shl(1, bits), shift) end ++ else ++ if sar(m, bits) == 0 then return shl(m, shift) end ++ end ++ end ++ werror("out of range immediate `"..imm.."'") ++ elseif match(imm, "^[rf]([1-3]?[0-9])$") or ++ match(imm, "^([%w_]+):([rf][1-3]?[0-9])$") then ++ werror("expected immediate operand, got register") ++ else ++ waction("IMM", (signed and 32768 or 0)+scale*1024+bits*32+shift, imm) ++ return 0 ++ end ++end ++ ++local function parse_disp(disp, width) ++ local imm, reg = match(disp, "^(.*)%(([%w_:]+)%)$") ++ if imm then ++ local r = shl(parse_gpr(reg), 16) ++ local extname = match(imm, "^extern%s+(%S+)$") ++ if extname then ++ waction("REL_EXT", map_extern[extname], nil, 1) ++ return r ++ else ++ return r + parse_imm(imm, width, 0, 0, true) ++ end ++ end ++ local reg, tailr = match(disp, "^([%w_:]+)%s*(.*)$") ++ if reg and tailr ~= "" then ++ local r, tp = parse_gpr(reg) ++ if tp then ++ waction("IMM", 32768+16*32, format(tp.ctypefmt, tailr)) ++ return 
shl(r, 16) ++ end ++ end ++ werror("bad displacement `"..disp.."'") ++end ++ ++local function parse_label(label, def) ++ local prefix = sub(label, 1, 2) ++ -- =>label (pc label reference) ++ if prefix == "=>" then ++ return "PC", 0, sub(label, 3) ++ end ++ -- ->name (global label reference) ++ if prefix == "->" then ++ return "LG", map_global[sub(label, 3)] ++ end ++ if def then ++ -- [1-9] (local label definition) ++ if match(label, "^[1-9]$") then ++ return "LG", 10+tonumber(label) ++ end ++ else ++ -- [<>][1-9] (local label reference) ++ local dir, lnum = match(label, "^([<>])([1-9])$") ++ if dir then -- Fwd: 1-9, Bkwd: 11-19. ++ return "LG", lnum + (dir == ">" and 0 or 10) ++ end ++ -- extern label (extern label reference) ++ local extname = match(label, "^extern%s+(%S+)$") ++ if extname then ++ return "EXT", map_extern[extname] ++ end ++ end ++ werror("bad label `"..label.."'") ++end ++ ++------------------------------------------------------------------------------ ++ ++-- Handle opcodes defined with template strings. ++map_op[".template__"] = function(params, template, nparams) ++ if not params then return sub(template, 9) end ++ local op = tonumber(sub(template, 1, 8), 16) ++ local n = 1 ++ ++ -- Limit number of section buffer positions used by a single dasm_put(). ++ -- A single opcode needs a maximum of 2 positions (ins/ext). ++ if secpos+2 > maxsecpos then wflush() end ++ local pos = wpos() ++ ++ -- Process each character. 
++ for p in gmatch(sub(template, 9), ".") do ++ if p == "A" then ++ op = op + shl(parse_gpr(params[n]), 21); n = n + 1 ++ elseif p == "B" then ++ op = op + shl(parse_gpr(params[n]), 16); n = n + 1 ++ elseif p == "C" then ++ op = op + shl(parse_gpr(params[n]), 5); n = n + 1 ++ elseif p == "D" then ++ op = op + shl(parse_gpr(params[n]), 0); n = n + 1 ++ elseif p == "F" then -- float version A ++ op = op + shl(parse_fpr(params[n]), 21); n = n + 1 ++ elseif p == "G" then -- float version B ++ op = op + shl(parse_fpr(params[n]), 16); n = n + 1 ++ elseif p == "H" then -- float version C ++ op = op + shl(parse_fpr(params[n]), 5); n = n + 1 ++ elseif p == "I" then -- float version D ++ op = op + shl(parse_fpr(params[n]), 0); n = n + 1 ++ ++ elseif p == "i" then ++ op = op + parse_imm(params[n], 8, 13, 0, true); n = n + 1 ++ elseif p == "j" then ++ op = op + parse_imm(params[n], 8, 13, 0, false); n = n + 1 ++ elseif p == "o" then ++ op = op + parse_disp(params[n], 16); n = n + 1 ++ elseif p == "p" then ++ op = op + parse_disp(params[n], 12); n = n + 1 ++ elseif p == "b" then ++ local mode, m, s = parse_label(params[n], false) ++ if p == "b" then m = m + 2048 end ++ waction("REL_"..mode, m, s, 1) ++ n = n + 1 ++ else ++ assert(false) ++ end ++ end ++ wputpos(pos, op) ++end ++ ++------------------------------------------------------------------------------ ++ ++-- Pseudo-opcode to mark the position where the action list is to be emitted. ++map_op[".actionlist_1"] = function(params) ++ if not params then return "cvar" end ++ local name = params[1] -- No syntax check. You get to keep the pieces. ++ wline(function(out) writeactions(out, name) end) ++end ++ ++-- Pseudo-opcode to mark the position where the global enum is to be emitted. ++map_op[".globals_1"] = function(params) ++ if not params then return "prefix" end ++ local prefix = params[1] -- No syntax check. You get to keep the pieces. 
++ wline(function(out) writeglobals(out, prefix) end) ++end ++ ++-- Pseudo-opcode to mark the position where the global names are to be emitted. ++map_op[".globalnames_1"] = function(params) ++ if not params then return "cvar" end ++ local name = params[1] -- No syntax check. You get to keep the pieces. ++ wline(function(out) writeglobalnames(out, name) end) ++end ++ ++-- Pseudo-opcode to mark the position where the extern names are to be emitted. ++map_op[".externnames_1"] = function(params) ++ if not params then return "cvar" end ++ local name = params[1] -- No syntax check. You get to keep the pieces. ++ wline(function(out) writeexternnames(out, name) end) ++end ++ ++------------------------------------------------------------------------------ ++ ++-- Label pseudo-opcode (converted from trailing colon form). ++map_op[".label_1"] = function(params) ++ if not params then return "[1-9] | ->global | =>pcexpr" end ++ if secpos+1 > maxsecpos then wflush() end ++ local mode, n, s = parse_label(params[1], true) ++ if mode == "EXT" then werror("bad label definition") end ++ waction("LABEL_"..mode, n, s, 1) ++end ++ ++------------------------------------------------------------------------------ ++ ++-- Pseudo-opcodes for data storage. ++map_op[".long_*"] = function(params) ++ if not params then return "imm..." end ++ for _,p in ipairs(params) do ++ local n = tonumber(p) ++ if not n then werror("bad immediate `"..p.."'") end ++ if n < 0 then n = n + 2^32 end ++ wputw(n) ++ if secpos+2 > maxsecpos then wflush() end ++ end ++end ++ ++map_op[".str100_1"] = function(params) ++ function empty(s) ++ str = "" ++ i=0 ++ repeat ++ str = str .. 
"\0" ++ i = i+1 ++ until i >= s ++ return str ++ end ++ str = string.format("%s\n", params[1]) ++ if #str > 100 then ++ wfatal(".str100 only support string size below 100") ++ end ++ str = str..empty(100-#str) ++ i=0 ++ while i ~= #str do ++ wputxw(shl(string.byte(str, i+4), 24) + ++ shl(string.byte(str, i+3), 16) + ++ shl(string.byte(str, i+2), 8) + ++ shl(string.byte(str, i+1), 0)) ++ i = i + 4 ++ end ++end ++ ++ ++-- Alignment pseudo-opcode. ++map_op[".align_1"] = function(params) ++ if not params then return "numpow2" end ++ if secpos+1 > maxsecpos then wflush() end ++ local align = tonumber(params[1]) ++ if align then ++ local x = align ++ -- Must be a power of 2 in the range (2 ... 256). ++ for i=1,8 do ++ x = x / 2 ++ if x == 1 then ++ waction("ALIGN", align-1, nil, 1) -- Action byte is 2**n-1. ++ return ++ end ++ end ++ end ++ werror("bad alignment") ++end ++ ++------------------------------------------------------------------------------ ++ ++-- Pseudo-opcode for (primitive) type definitions (map to C types). ++map_op[".type_3"] = function(params, nparams) ++ if not params then ++ return nparams == 2 and "name, ctype" or "name, ctype, reg" ++ end ++ local name, ctype, reg = params[1], params[2], params[3] ++ if not match(name, "^[%a_][%w_]*$") then ++ werror("bad type name `"..name.."'") ++ end ++ local tp = map_type[name] ++ if tp then ++ werror("duplicate type `"..name.."'") ++ end ++ -- Add #type to defines. A bit unclean to put it in map_archdef. ++ map_archdef["#"..name] = "sizeof("..ctype..")" ++ -- Add new type and emit shortcut define. ++ local num = ctypenum + 1 ++ map_type[name] = { ++ ctype = ctype, ++ ctypefmt = format("Dt%X(%%s)", num), ++ reg = reg, ++ } ++ wline(format("#define Dt%X(_V) (int)(ptrdiff_t)&(((%s *)0)_V)", num, ctype)) ++ ctypenum = num ++end ++map_op[".type_2"] = map_op[".type_3"] ++ ++-- Dump type definitions. 
++local function dumptypes(out, lvl) ++ local t = {} ++ for name in pairs(map_type) do t[#t+1] = name end ++ sort(t) ++ out:write("Type definitions:\n") ++ for _,name in ipairs(t) do ++ local tp = map_type[name] ++ local reg = tp.reg or "" ++ out:write(format(" %-20s %-20s %s\n", name, tp.ctype, reg)) ++ end ++ out:write("\n") ++end ++ ++------------------------------------------------------------------------------ ++ ++-- Set the current section. ++function _M.section(num) ++ waction("SECTION", num) ++ wflush(true) -- SECTION is a terminal action. ++end ++ ++------------------------------------------------------------------------------ ++ ++-- Dump architecture description. ++function _M.dumparch(out) ++ out:write(format("DynASM %s version %s, released %s\n\n", ++ _info.arch, _info.version, _info.release)) ++ dumpactions(out) ++end ++ ++-- Dump all user defined elements. ++function _M.dumpdef(out, lvl) ++ dumptypes(out, lvl) ++ dumpglobals(out, lvl) ++ dumpexterns(out, lvl) ++end ++ ++------------------------------------------------------------------------------ ++ ++-- Pass callbacks from/to the DynASM core. ++function _M.passcb(wl, we, wf, ww) ++ wline, werror, wfatal, wwarn = wl, we, wf, ww ++ return wflush ++end ++ ++-- Setup the arch-specific module. ++function _M.setup(arch, opt) ++ g_arch, g_opt = arch, opt ++end ++ ++-- Merge the core maps and the arch-specific maps. 
++function _M.mergemaps(map_coreop, map_def) ++ setmetatable(map_op, { __index = map_coreop }) ++ setmetatable(map_def, { __index = map_archdef }) ++ return map_op, map_def ++end ++ ++return _M ++ ++------------------------------------------------------------------------------ +diff --git a/src/Makefile b/src/Makefile +index f56465d..0226e27 100644 +--- a/src/Makefile ++++ b/src/Makefile +@@ -53,6 +53,7 @@ CCOPT_arm= + CCOPT_arm64= + CCOPT_ppc= + CCOPT_mips= ++CCOPT_sw64= -mieee + # + CCDEBUG= + # Uncomment the next line to generate debug information: +@@ -232,6 +233,9 @@ TARGET_ASHLDFLAGS= $(LDOPTIONS) $(TARGET_XSHLDFLAGS) $(TARGET_FLAGS) $(TARGET_SH + TARGET_ALIBS= $(TARGET_XLIBS) $(LIBS) $(TARGET_LIBS) + + TARGET_TESTARCH=$(shell $(TARGET_CC) $(TARGET_TCFLAGS) -E lj_arch.h -dM) ++ifneq (,$(findstring LJ_TARGET_SW64 ,$(TARGET_TESTARCH))) ++ TARGET_LJARCH= sw64 ++else + ifneq (,$(findstring LJ_TARGET_X64 ,$(TARGET_TESTARCH))) + TARGET_LJARCH= x64 + else +@@ -272,6 +276,7 @@ endif + endif + endif + endif ++endif + + ifneq (,$(findstring LJ_TARGET_PS3 1,$(TARGET_TESTARCH))) + TARGET_SYS= PS3 +@@ -425,6 +430,9 @@ ifneq (,$(findstring LJ_NO_UNWIND 1,$(TARGET_TESTARCH))) + DASM_AFLAGS+= -D NO_UNWIND + TARGET_ARCH+= -DLUAJIT_NO_UNWIND + endif ++ifneq (,$(findstring SW64_DEBUG_WI 1,$(TARGET_TESTARCH))) ++ DASM_AFLAGS+= -D SW64_DEBUG_WI ++endif + DASM_AFLAGS+= -D VER=$(subst LJ_ARCH_VERSION_,,$(filter LJ_ARCH_VERSION_%,$(subst LJ_ARCH_VERSION ,LJ_ARCH_VERSION_,$(TARGET_TESTARCH)))) + ifeq (Windows,$(TARGET_SYS)) + DASM_AFLAGS+= -D WIN +@@ -439,6 +447,9 @@ ifeq (arm,$(TARGET_LJARCH)) + DASM_AFLAGS+= -D IOS + endif + else ++ifeq (,$(findstring LJ_SW64_CORE4 ,$(TARGET_TESTARCH))) ++ DASM_AFLAGS+= -D SW64_CORE4 ++endif + ifeq (ppc,$(TARGET_LJARCH)) + ifneq (,$(findstring LJ_ARCH_SQRT 1,$(TARGET_TESTARCH))) + DASM_AFLAGS+= -D SQRT +diff --git a/src/host/buildvm.c b/src/host/buildvm.c +index de23fab..90a6556 100644 +--- a/src/host/buildvm.c ++++ b/src/host/buildvm.c +@@ -65,6 
+65,8 @@ static int collect_reloc(BuildCtx *ctx, uint8_t *addr, int idx, int type); + #include "../dynasm/dasm_ppc.h" + #elif LJ_TARGET_MIPS + #include "../dynasm/dasm_mips.h" ++#elif LJ_TARGET_SW64 ++#include "../dynasm/dasm_sw64.h" + #else + #error "No support for this architecture (yet)" + #endif +diff --git a/src/host/buildvm_asm.c b/src/host/buildvm_asm.c +index ffd1490..2c01c02 100644 +--- a/src/host/buildvm_asm.c ++++ b/src/host/buildvm_asm.c +@@ -159,7 +159,7 @@ static void emit_asm_wordreloc(BuildCtx *ctx, uint8_t *p, int n, + ins, sym); + exit(1); + } +-#elif LJ_TARGET_MIPS ++#elif LJ_TARGET_MIPS || LJ_TARGET_SW64 + fprintf(stderr, + "Error: unsupported opcode %08x for %s symbol relocation.\n", + ins, sym); +diff --git a/src/jit/bcsave.lua b/src/jit/bcsave.lua +index c17c88e..3c47e25 100644 +--- a/src/jit/bcsave.lua ++++ b/src/jit/bcsave.lua +@@ -64,7 +64,7 @@ local map_type = { + + local map_arch = { + x86 = true, x64 = true, arm = true, arm64 = true, arm64be = true, +- ppc = true, mips = true, mipsel = true, ++ ppc = true, mips = true, mipsel = true, sw64 = true + } + + local map_os = { +@@ -200,7 +200,7 @@ typedef struct { + ]] + local symname = LJBC_PREFIX..ctx.modname + local is64, isbe = false, false +- if ctx.arch == "x64" or ctx.arch == "arm64" or ctx.arch == "arm64be" then ++ if ctx.arch == "x64" or ctx.arch == "arm64" or ctx.arch == "arm64be" or ctx.arch == "sw64" then + is64 = true + elseif ctx.arch == "ppc" or ctx.arch == "mips" then + isbe = true +@@ -237,7 +237,7 @@ typedef struct { + hdr.eendian = isbe and 2 or 1 + hdr.eversion = 1 + hdr.type = f16(1) +- hdr.machine = f16(({ x86=3, x64=62, arm=40, arm64=183, arm64be=183, ppc=20, mips=8, mipsel=8 })[ctx.arch]) ++ hdr.machine = f16(({ x86=3, x64=62, arm=40, arm64=183, arm64be=183, ppc=20, mips=8, mipsel=8,sw64=39190 })[ctx.arch]) + if ctx.arch == "mips" or ctx.arch == "mipsel" then + hdr.flags = f32(0x50001006) + end +@@ -355,7 +355,7 @@ typedef struct { + -- Create PE object and fill in 
header. + local o = ffi.new("PEobj") + local hdr = o.hdr +- hdr.arch = f16(({ x86=0x14c, x64=0x8664, arm=0x1c0, ppc=0x1f2, mips=0x366, mipsel=0x366 })[ctx.arch]) ++ hdr.arch = f16(({ x86=0x14c, x64=0x8664, arm=0x1c0, ppc=0x1f2, mips=0x366, mipsel=0x366,sw64=0x9916 })[ctx.arch]) + hdr.nsects = f16(2) + hdr.symtabofs = f32(ffi.offsetof(o, "sym0")) + hdr.nsyms = f32(6) +diff --git a/src/jit/dis_sw64.lua b/src/jit/dis_sw64.lua +new file mode 100644 +index 0000000..16b60bf +--- /dev/null ++++ b/src/jit/dis_sw64.lua +@@ -0,0 +1,649 @@ ++---------------------------------------------------------------------------- ++-- LuaJIT SW64 disassembler module. ++-- ++-- Copyright (C) 2019 deepin inc. All rights reserved. ++-- Released under the MIT/X license. See Copyright Notice in luajit.h ++---------------------------------------------------------------------------- ++-- This is a helper module used by the LuaJIT machine code dumper module. ++-- ++-- It disassembles all standard SW64 instructions. ++------------------------------------------------------------------------------ ++local type = type ++local byte, format = string.byte, string.format ++local match, gmatch = string.match, string.gmatch ++local concat = table.concat ++local bit = require("bit") ++local band, bor, tohex = bit.band, bit.bor, bit.tohex ++local lshift, rshift, arshift = bit.lshift, bit.rshift, bit.arshift ++ ++------------------------------------------------------------------------------ ++-- Primary and extended opcode maps ++------------------------------------------------------------------------------ ++ ++local OPC_SYSCALL = "" ++local OPC_MISI_MEMORY, OPC_FUNC_MEMORY = "", "" ++local OPC_MEMORY_F = "FBo" ++local OPC_MEMORY = "ABo" ++local OPC_CONTROL = "Ab" ++local OPC_CONTROL_F = "Fb" ++local OPC_ARITHMETIC = { ++ shift = 5, ++ mask = 0xff, ++ pat = "ABD", ++} ++local OPC_ARITHMETIC_F = { ++ shift = 5, ++ mask = 0xff, ++ pat = "FGI", ++} ++local OPC_ARITHMETIC_I = { ++ shift = 5, ++ mask = 0xff, ++ 
pat = "AjD", ++} ++local OPC_COMPLEX_ARITHMETIC = { ++ shift = 10, ++ mask = 0x7, ++ pat = "ABCD", ++} ++local OPC_COMPLEX_ARITHMETIC_F = { ++ shift = 10, ++ mask = 0x1f, ++ pat = "FGHI", ++} ++ ++local ignores_tabs = { ++ F = { "FCVT", "IFMOV", "FCPY" }, ++} ++ ++function should_ignore(name, field) ++ pat = ignores_tabs[field] or {} ++ for _, p in ipairs(pat) do ++ if match(name, p) then ++ return true ++ end ++ end ++ return false ++end ++ ++local class_tabs = { ++ [0x00] = OPC_SYSCALL, ++ [0x01] = OPC_MEMORY, ++ [0x02] = OPC_MEMORY, ++ [0x03] = OPC_MEMORY, ++ [0x04] = OPC_CONTROL, ++ [0x05] = OPC_CONTROL, ++ [0x06] = OPC_MISI_MEMORY, ++ [0x08] = OPC_FUNC_MEMORY, ++ [0x10] = OPC_ARITHMETIC, ++ [0x11] = OPC_COMPLEX_ARITHMETIC, ++ [0x12] = OPC_ARITHMETIC_I, ++ [0x13] = OPC_ARITHMETIC_I, ++ [0x18] = OPC_ARITHMETIC_F, ++ [0x19] = OPC_COMPLEX_ARITHMETIC_F, ++ [0x20] = OPC_MEMORY, ++ [0x21] = OPC_MEMORY, ++ [0x22] = OPC_MEMORY, ++ [0x23] = OPC_MEMORY, ++ [0x24] = OPC_MEMORY, ++ [0x25] = OPC_MEMORY, ++ [0x26] = OPC_MEMORY_F, ++ [0x27] = OPC_MEMORY_F, ++ [0x28] = OPC_MEMORY, ++ [0x29] = OPC_MEMORY, ++ [0x2A] = OPC_MEMORY, ++ [0x2B] = OPC_MEMORY, ++ [0x2C] = OPC_MEMORY, ++ [0x2D] = OPC_MEMORY, ++ [0x2E] = OPC_MEMORY_F, ++ [0x2F] = OPC_MEMORY_F, ++ [0x30] = OPC_CONTROL, ++ [0x31] = OPC_CONTROL, ++ [0x32] = OPC_CONTROL, ++ [0x33] = OPC_CONTROL, ++ [0x34] = OPC_CONTROL, ++ [0x35] = OPC_CONTROL, ++ [0x36] = OPC_CONTROL, ++ [0x37] = OPC_CONTROL, ++ [0x38] = OPC_CONTROL_F, ++ [0x39] = OPC_CONTROL_F, ++ [0x3A] = OPC_CONTROL_F, ++ [0x3B] = OPC_CONTROL_F, ++ [0x3C] = OPC_CONTROL_F, ++ [0x3D] = OPC_CONTROL_F, ++ [0x3e] = OPC_MEMORY, ++ [0x3f] = OPC_MEMORY, ++} ++ ++local map_pri = { ++ [0x00] = { ++ [0x0] = "SYSCALL/B", ++ [0x1] = "SYSCALL" ++ }, ++ [0x01]= {[0x0]= "CALL"}, ++ [0x02]= {[0x0]= "RET"}, ++ [0x03]= {[0x0]= "JMP"}, ++ [0x04]= {[0x0]= "BR"}, ++ [0x05]= {[0x0]= "BSR"}, ++ [0x06]= { ++ [0x0000] = "MEMB", ++ [0x0001] = "IMEMB", ++ [0x1000] = "RD_F", ++ [0x1020] = "WR_F", ++ 
}, ++ [0x08]= { ++ [0x0]= "LLDW", ++ [0x1]= "LLDL", ++ [0x8]= "LSTW", ++ [0x9]= "LSTL", ++ }, ++ [0x10]= { ++ [0x00]= "ADDW", ++ [0x01]= "SUBW", ++ [0x02]= "S4ADDW", ++ [0x03]= "S4SUBW", ++ [0x04]= "S8ADDW", ++ [0x05]= "S8SUBW", ++ [0x08]= "ADDL", ++ [0x09]= "SUBL", ++ [0x0a]= "S4ADDL", ++ [0x0b]= "S4SUBL", ++ [0x0c]= "S8ADDL", ++ [0x0d]= "S8SUBL", ++ [0x10]= "MULW", ++ [0x11]= "DIVW", ++ [0x12]= "UDIVW", ++ [0x13]= "REMW", ++ [0x14]= "UREMW", ++ [0x18]= "MULL", ++ [0x19]= "UMULH", ++ [0x1a]= "DIVL", ++ [0x1b]= "UDIVL", ++ [0x1c]= "REML", ++ [0x1d]= "UREML", ++ [0x1e]= "ADDPI", ++ [0x1f]= "ADDPIS", ++ [0x28]= "CMPEQ", ++ [0x29]= "CMPLT", ++ [0x2a]= "CMPLE", ++ [0x2b]= "CMPULT", ++ [0x2c]= "CMPULE", ++ [0x2d]= "SBT", ++ [0x2e]= "CBT", ++ [0x38]= "AND", ++ [0x39]= "BIC", ++ [0x3a]= "BIS", ++ [0x3b]= "ORNOT", ++ [0x3c]= "XOR", ++ [0x3d]= "EQV", ++ [0x40]= "INSLB", ++ [0x41]= "INSLH", ++ [0x42]= "INSLW", ++ [0x43]= "INSLL", ++ [0x44]= "INSHB", ++ [0x45]= "INSHH", ++ [0x46]= "INSHW", ++ [0x47]= "INSHL", ++ [0x48]= "SLL", ++ [0x49]= "SRL", ++ [0x4a]= "SRA", ++ [0x4b]= "ROLL", ++ [0x4c]= "SLLW", ++ [0x4d]= "SRLW", ++ [0x4e]= "SRAW", ++ [0x4f]= "ROLW", ++ [0x50]= "EXTLB", ++ [0x51]= "EXTLH", ++ [0x52]= "EXTLW", ++ [0x53]= "EXTLL", ++ [0x54]= "EXTHB", ++ [0x55]= "EXTHH", ++ [0x56]= "EXTHW", ++ [0x57]= "EXTHL", ++ [0x58]= "CTPOP", ++ [0x59]= "CTLZ", ++ [0x5a]= "CTTZ", ++ [0x60]= "MASKLB", ++ [0x61]= "MASKLH", ++ [0x62]= "MASKLW", ++ [0x63]= "MASKLL", ++ [0x64]= "MASKHB", ++ [0x65]= "MASKHH", ++ [0x66]= "MASKHW", ++ [0x67]= "MASKHL", ++ [0x68]= "ZAP", ++ [0x69]= "ZAPNOT", ++ [0x6a]= "SEXTB", ++ [0x6b]= "SEXTH", ++ [0x6c]= "CMPGEB", ++ [0x70]= "FIMOVS", ++ [0x78]= "FIMOVD", ++ }, ++ [0x11]= { ++ [0x0]= "SELEQ", ++ [0x1]= "SELGE", ++ [0x2]= "SELGT", ++ [0x3]= "SELLE", ++ [0x4]= "SELLT", ++ [0x5]= "SELNE", ++ [0x6]= "SELLBC", ++ [0x7]= "SELLBS", ++ }, ++ [0x12]= { ++ [0x00]= "ADDW", ++ [0x01]= "SUBW", ++ [0x02]= "S4ADDW", ++ [0x03]= "S4SUBW", ++ [0x04]= "S8ADDW", ++ [0x05]= 
"S8SUBW", ++ [0x08]= "ADDL", ++ [0x09]= "SUBL", ++ [0x0a]= "S4ADDL", ++ [0x0b]= "S4SUBL", ++ [0x0c]= "S8ADDL", ++ [0x0d]= "S8SUBL", ++ [0x10]= "MULW", ++ [0x18]= "MULL", ++ [0x19]= "UMULH", ++ [0x28]= "CMPEQ", ++ [0x29]= "CMPLT", ++ [0x2a]= "CMPLE", ++ [0x2b]= "CMPULT", ++ [0x2c]= "CMPULE", ++ [0x2d]= "SBT", ++ [0x2e]= "CBT", ++ [0x38]= "AND", ++ [0x39]= "BIC", ++ [0x3a]= "BIS", ++ [0x3b]= "ORNOT", ++ [0x3c]= "XOR", ++ [0x3d]= "EQV", ++ [0x40]= "INSLB", ++ [0x41]= "INSLH", ++ [0x42]= "INSLW", ++ [0x43]= "INSLL", ++ [0x44]= "INSHB", ++ [0x45]= "INSHH", ++ [0x46]= "INSHW", ++ [0x47]= "INSHL", ++ [0x48]= "SLL", ++ [0x49]= "SRL", ++ [0x4a]= "SRA", ++ [0x4b]= "ROLL", ++ [0x4c]= "SLLW", ++ [0x4d]= "SRLW", ++ [0x4e]= "SRAW", ++ [0x4f]= "ROLW", ++ [0x50]= "EXTLB", ++ [0x51]= "EXTLH", ++ [0x52]= "EXTLW", ++ [0x53]= "EXTLL", ++ [0x54]= "EXTHB", ++ [0x55]= "EXTHH", ++ [0x56]= "EXTHW", ++ [0x57]= "EXTHL", ++ [0x60]= "MASKLB", ++ [0x61]= "MASKLH", ++ [0x62]= "MASKLW", ++ [0x63]= "MASKLL", ++ [0x64]= "MASKHB", ++ [0x65]= "MASKHH", ++ [0x66]= "MASKHW", ++ [0x67]= "MASKHL", ++ [0x68]= "ZAP", ++ [0x69]= "ZAPNOT", ++ [0x6a]= "SEXTB", ++ [0x6b]= "SEXTH", ++ [0x6c]= "CMPGEB", ++ }, ++ [0x13]= { ++ [0x0]= "SELEQ", ++ [0x1]= "SELGE", ++ [0x2]= "SELGT", ++ [0x3]= "SELLE", ++ [0x4]= "SELLT", ++ [0x5]= "SELNE", ++ [0x6]= "SELLBC", ++ [0x7]= "SELLBS", ++ }, ++ [0x18]= { ++ [0x00]= "FADDS", ++ [0x01]= "FADDD", ++ [0x02]= "FSUBS", ++ [0x03]= "FSUBD", ++ [0x04]= "FMULS", ++ [0x05]= "FMULD", ++ [0x06]= "FDIVS", ++ [0x07]= "FDIVD", ++ [0x08]= "FSQRTS", ++ [0x09]= "FSQRTD", ++ [0x10]= "FCMPEQ", ++ [0x11]= "FCMPLE", ++ [0x12]= "FCMPLT", ++ [0x13]= "FCMPUN", ++ [0x20]= "FCVTSD", ++ [0x21]= "FCVTDS", ++ [0x22]= "FCVTDL_G", ++ [0x23]= "FCVTDL_P", ++ [0x24]= "FCVTDL_Z", ++ [0x25]= "FCVTDL_N", ++ [0x27]= "FCVTDL", ++ [0x28]= "FCVTWL", ++ [0x29]= "FCVTLW", ++ [0x2D]= "FCVTLS", ++ [0x2F]= "FCVTLD", ++ [0x30]= "FCPYS", ++ [0x31]= "FCPYSE", ++ [0x32]= "FCPYSN", ++ [0x40]= "IFMOVS", ++ [0x41]= "IFMOVD", ++ 
[0x50]= "RFPCR", ++ [0x51]= "WFPCR", ++ [0x54]= "SETFPEC0", ++ [0x55]= "SETFPEC1", ++ [0x56]= "SETFPEC2", ++ [0x57]= "SETFPEC3", ++ }, ++ [0x19]= { ++ [0x00]= "FMAS", ++ [0x01]= "FMAD", ++ [0x02]= "FMSS", ++ [0x03]= "FMSD", ++ [0x04]= "FNMAS", ++ [0x05]= "FNMAD", ++ [0x06]= "FNMSS", ++ [0x07]= "FNMSD", ++ ++ [0x10]= "FSELEQ", ++ [0x11]= "FSELNE", ++ [0x12]= "FSELLT", ++ [0x13]= "FSELLE", ++ [0x14]= "FSELGT", ++ [0x15]= "FSELGE", ++ }, ++ [0x1D]= {[0]= "LBR"}, ++ [0x20]= {[0]= "LDBU"}, ++ [0x21]= {[0]= "LDHU"}, ++ [0x22]= {[0]= "LDW"}, ++ [0x23]= {[0]= "LDL"}, ++ [0x24]= {[0]= "LDL_U"}, ++ [0x25]= {[0]= "PRI_LD"}, ++ [0x26]= {[0]= "FLDS"}, ++ [0x27]= {[0]= "FLDD"}, ++ [0x28]= {[0]= "STB"}, ++ [0x29]= {[0]= "STH"}, ++ [0x2A]= {[0]= "STW"}, ++ [0x2B]= {[0]= "STL"}, ++ [0x2C]= {[0]= "STL_U"}, ++ [0x2D]= {[0]= "PRI_ST"}, ++ [0x2E]= {[0]= "FSTS"}, ++ [0x2F]= {[0]= "FSTD"}, ++ [0x30]= {[0]= "BEQ"}, ++ [0x31]= {[0]= "BNE"}, ++ [0x32]= {[0]= "BLT"}, ++ [0x33]= {[0]= "BLE"}, ++ [0x34]= {[0]= "BGT"}, ++ [0x35]= {[0]= "BGE"}, ++ [0x36]= {[0]= "BLBC"}, ++ [0x37]= {[0]= "BLBS"}, ++ [0x38]= {[0]= "FBEQ"}, ++ [0x39]= {[0]= "FBNE"}, ++ [0x3A]= {[0]= "FBLT"}, ++ [0x3B]= {[0]= "FBLE"}, ++ [0x3C]= {[0]= "FBGT"}, ++ [0x3D]= {[0]= "FBGE"}, ++ [0x3e]= {[0]= "LDI"}, ++ [0x3f]= {[0]= "LDIH"}, ++} ++ ++ ++------------------------------------------------------------------------------ ++ ++local map_gpr = { ++ [0] = "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", ++ "r8", "BASE", "r10", "r11", "r12", "r13", "r14", "JGL", ++ "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", ++ "r24", "r25", "JTMP", "r27", "at", "r29", "sp", "zero", ++} ++ ++------------------------------------------------------------------------------ ++ ++-- Output a nicely formatted line with an opcode and operands. 
++local function putop(ctx, text, operands) ++ local pos = ctx.pos ++ local extra = "" ++ if ctx.rel then ++ local sym = ctx.symtab[ctx.rel] ++ if sym then extra = "\t->"..sym end ++ end ++ if ctx.hexdump > 0 then ++ ctx.out(format("%08x %s %-7s %s%s\n", ++ ctx.addr+pos, tohex(ctx.op), text, concat(operands, ", "), extra)) ++ else ++ ctx.out(format("%08x %-7s %s%s\n", ++ ctx.addr+pos, text, concat(operands, ", "), extra)) ++ end ++ ctx.pos = pos + 4 ++end ++ ++-- Fallback for unknown opcodes. ++local function unknown(ctx) ++ return putop(ctx, ".long", { "0x"..tohex(ctx.op) }) ++end ++ ++local function get_le(ctx) ++ local pos = ctx.pos ++ local b0, b1, b2, b3 = byte(ctx.code, pos+1, pos+4) ++ return bor(lshift(b3, 24), lshift(b2, 16), lshift(b1, 8), b0) ++end ++ ++-- Disassemble a single instruction. ++local function disass_ins(ctx) ++ local op = ctx:get() ++ local operands = {} ++ local last = nil ++ ctx.op = op ++ ctx.rel = nil ++ ++ local opcode = band(rshift(op, 26), 0x3f) ++ local opat = map_pri[opcode] ++ ++ local fn = 0 ++ local pat = class_tabs[opcode] ++ local name = opat[0] ++ if pat.shift then ++ name = opat[band(rshift(op, pat.shift), pat.mask)] ++ pat = pat.pat ++ end ++ local isf = false ++ ++ if name == "FIMOVD" then ++ pat = "FD" ++ elseif name == "IFMOVD" then ++ pat = "AI" ++ end ++ ++ ++ for p in gmatch(pat, ".") do ++ local x = nil ++ if should_ignore(name, p) then ++ -- do nothing ++ elseif p == "A" then ++ x = map_gpr[band(rshift(op, 21), 31)] ++ elseif p == "B" then ++ x = map_gpr[band(rshift(op, 16), 31)] ++ elseif p == "C" then ++ x = map_gpr[band(rshift(op, 5), 31)] ++ elseif p == "D" then ++ x = map_gpr[band(rshift(op, 0), 31)] ++ elseif p == "F" then ++ isf = true ++ x = "f"..band(rshift(op, 21), 31) ++ elseif p == "G" then ++ isf = true ++ x = "f"..band(rshift(op, 16), 31) ++ elseif p == "H" then ++ isf = true ++ x = "f"..band(rshift(op, 5), 31) ++ elseif p == "I" then ++ isf = true ++ x = "f"..band(rshift(op, 0), 31) ++ elseif p == "o" 
then ++ local disp = arshift(lshift(op, 16), 16) ++ if name == "LDI" and disp == 0 then ++ name = "MOVE" ++ operands[#operands] = last ++ else ++ operands[#operands] = format("%d(%s)", disp, last) ++ end ++ elseif p == "p" then ++ local index = map_gpr[band(rshift(op, 16), 31)] ++ operands[#operands] = format("%s(%s)", index, last) ++ elseif p == "b" then ++ x = ctx.addr + ctx.pos + arshift(lshift(op, 21), 21)*4 + 4 ++ ctx.rel = x ++ x = format("0x%08x", x) ++ elseif p == "i" then ++ x = band(rshift(op, 13), 0x000ff) ++ elseif p == "j" then ++ x = band(rshift(op, 13), 0x000ff) ++ elseif p == "j" then ++ x = band(rshift(op, 13), 0x000ff) ++ elseif p == "1" then ++ if last == "ra" then ++ operands[#operands] = nil ++ end ++ else ++ assert(false) ++ end ++ if x then operands[#operands+1] = x; last = x end ++ end ++ return putop(ctx, name, operands) ++end ++ ++------------------------------------------------------------------------------ ++ ++-- Disassemble a block of code. ++local function disass_block(ctx, ofs, len) ++ if not ofs then ofs = 0 end ++ local stop = len and ofs+len or #ctx.code ++ stop = stop - stop % 4 ++ ctx.pos = ofs - ofs % 4 ++ ctx.rel = nil ++ while ctx.pos < stop do disass_ins(ctx) end ++end ++ ++-- Extended API: create a disassembler context. Then call ctx:disass(ofs, len). ++local function create(code, addr, out) ++ local ctx = {} ++ ctx.code = code ++ ctx.addr = addr or 0 ++ ctx.out = out or io.write ++ ctx.symtab = {} ++ ctx.disass = disass_block ++ ctx.hexdump = 8 ++ ctx.get = get_le ++ return ctx ++end ++ ++-- Simple API: disassemble code (a string) at address and output via out. ++local function disass(code, addr, out) ++ create(code, addr, out):disass() ++end ++ ++-- Return register name for RID. 
++local function regname(r) ++ if r < 32 then return map_gpr[r] end ++ return "f"..(r-32) ++end ++ ++ ++ ++function wi_debug(__obj, op, addr) ++ if not addr then ++ addr = 0 ++ end ++ local operands = {} ++ local last = nil ++ ++ local opcode = band(rshift(op, 26), 0x3f) ++ local opat = map_pri[opcode] ++ ++ local fn = 0 ++ local pat = class_tabs[opcode] ++ local name = opat[0] ++ if pat.shift then ++ name = opat[band(rshift(op, pat.shift), pat.mask)] ++ pat = pat.pat ++ end ++ local isf = false ++ ++ if name == "FIMOVD" then ++ pat = "FD" ++ elseif name == "IFMOVD" then ++ pat = "AI" ++ end ++ ++ for p in gmatch(pat, ".") do ++ local x = nil ++ if should_ignore(name, p) then ++ -- do nothing ++ elseif p == "A" then ++ x = map_gpr[band(rshift(op, 21), 31)] ++ elseif p == "B" then ++ x = map_gpr[band(rshift(op, 16), 31)] ++ elseif p == "C" then ++ x = map_gpr[band(rshift(op, 5), 31)] ++ elseif p == "D" then ++ x = map_gpr[band(rshift(op, 0), 31)] ++ elseif p == "F" then ++ isf = true ++ x = "f"..band(rshift(op, 21), 31) ++ elseif p == "G" then ++ isf = true ++ x = "f"..band(rshift(op, 16), 31) ++ elseif p == "H" then ++ isf = true ++ x = "f"..band(rshift(op, 5), 31) ++ elseif p == "I" then ++ isf = true ++ x = "f"..band(rshift(op, 0), 31) ++ elseif p == "o" then ++ local disp = arshift(lshift(op, 16), 16) ++ if name == "LDI" and disp == 0 then ++ name = "MOVE" ++ operands[#operands] = last ++ else ++ operands[#operands] = format("%d(%s)", disp, last) ++ end ++ elseif p == "p" then ++ local index = map_gpr[band(rshift(op, 16), 31)] ++ operands[#operands] = format("%s(%s)", index, last) ++ elseif p == "b" then ++ x = addr + arshift(lshift(op, 21), 21)*4 + 4 ++ x = format("0x%08x", x) ++ elseif p == "i" then ++ x = band(rshift(op, 13), 0x000ff) ++ elseif p == "j" then ++ x = band(rshift(op, 13), 0x000ff) ++ elseif p == "j" then ++ x = band(rshift(op, 13), 0x000ff) ++ elseif p == "1" then ++ if last == "ra" then ++ operands[#operands] = nil ++ end ++ else ++ 
assert(false) ++ end ++ if x then operands[#operands+1] = x; last = x end ++ end ++ print(name, concat(operands, ", ")) ++end ++ ++-- Public module functions. ++return { ++ create = create, ++ disass = disass, ++ regname = regname, ++ wi_debug = wi_debug, ++} +diff --git a/src/jit/dump.lua b/src/jit/dump.lua +index 2bea652..3f90fe9 100644 +--- a/src/jit/dump.lua ++++ b/src/jit/dump.lua +@@ -623,7 +623,7 @@ local function dump_texit(tr, ex, ngpr, nfpr, ...) + if i % 8 == 0 then out:write("\n") end + end + end +- if jit.arch == "mips" or jit.arch == "mipsel" then ++ if jit.arch == "mips" or jit.arch == "mipsel" or jit.arch == "sw64" then + for i=1,nfpr,2 do + out:write(format(" %+17.14g", regs[ngpr+i])) + if i % 8 == 7 then out:write("\n") end +diff --git a/src/lib_jit.c b/src/lib_jit.c +index 22ca0a1..4be1761 100644 +--- a/src/lib_jit.c ++++ b/src/lib_jit.c +@@ -732,6 +732,8 @@ static uint32_t jit_cpudetect(lua_State *L) + } + #endif + #endif ++#elif LJ_TARGET_SW64 ++ /* Nothing to do. */ + #else + #error "Missing CPU detection for this architecture" + #endif +diff --git a/src/lj_arch.h b/src/lj_arch.h +index c8d7138..070bd89 100644 +--- a/src/lj_arch.h ++++ b/src/lj_arch.h +@@ -29,6 +29,9 @@ + #define LUAJIT_ARCH_mips32 6 + #define LUAJIT_ARCH_MIPS64 7 + #define LUAJIT_ARCH_mips64 7 ++#define LUAJIT_ARCH_SW64 77 ++#define LUAJIT_ARCH_sw64 77 ++ + + /* Target OS. 
*/ + #define LUAJIT_OS_OTHER 0 +@@ -55,6 +58,8 @@ + #define LUAJIT_TARGET LUAJIT_ARCH_MIPS64 + #elif defined(__mips__) || defined(__mips) || defined(__MIPS__) || defined(__MIPS) + #define LUAJIT_TARGET LUAJIT_ARCH_MIPS32 ++#elif defined(__sw_64__) ++#define LUAJIT_TARGET LUAJIT_ARCH_SW64 + #else + #error "No support for this architecture (yet)" + #endif +@@ -358,6 +363,24 @@ + #define LJ_ARCH_VERSION 10 + #endif + ++#elif LUAJIT_TARGET == LUAJIT_ARCH_SW64 ++ ++#define LJ_ARCH_NAME "sw64" ++#define LJ_ARCH_ENDIAN LUAJIT_LE ++#define LJ_ARCH_BITS 64 ++#define LJ_TARGET_SW64 1 ++#define LJ_TARGET_EHRETREG 4 //TODO ++#define LJ_TARGET_EHRAREG 8 //??TODO ++#define LJ_TARGET_GC64 1 ++#define LJ_TARGET_JUMPRANGE 21 /* 2*2^21 = 4MB-aligned region */ ++#define LJ_TARGET_MASKSHIFT 1 ++#define LJ_TARGET_MASKROT 1 ++#define LJ_ARCH_NUMMODE LJ_NUMMODE_DUAL ++#define LJ_ARCH_VERSION 10 // ?? ++#define LJ_PAGESIZE 8192 ++#define SW64_DEBUG_WI 0 ++#define LJ_SW64_CORE4 0 ++ + #else + #error "No target architecture defined" + #endif +@@ -553,7 +576,7 @@ + #define LUAJIT_NO_UNWIND 1 + #endif + +-#if defined(LUAJIT_NO_UNWIND) || defined(__symbian__) || LJ_TARGET_IOS || LJ_TARGET_PS3 || LJ_TARGET_PS4 ++#if defined(LUAJIT_NO_UNWIND) || defined(__symbian__) || LJ_TARGET_IOS || LJ_TARGET_PS3 || LJ_TARGET_PS4 || LJ_TARGET_SW64 + #define LJ_NO_UNWIND 1 + #endif + +diff --git a/src/lj_asm.c b/src/lj_asm.c +index c2cf5a9..febea55 100644 +--- a/src/lj_asm.c ++++ b/src/lj_asm.c +@@ -177,6 +177,8 @@ IRFLDEF(FLOFS) + #include "lj_emit_ppc.h" + #elif LJ_TARGET_MIPS + #include "lj_emit_mips.h" ++#elif LJ_TARGET_SW64 ++#include "lj_emit_sw64.h" + #else + #error "Missing instruction emitter for target CPU" + #endif +@@ -1597,6 +1599,8 @@ static void asm_loop(ASMState *as) + #include "lj_asm_ppc.h" + #elif LJ_TARGET_MIPS + #include "lj_asm_mips.h" ++#elif LJ_TARGET_SW64 ++#include "lj_asm_sw64.h" + #else + #error "Missing assembler for target CPU" + #endif +@@ -2374,7 +2378,9 @@ void 
lj_asm_trace(jit_State *J, GCtrace *T) + T->nins = J->curfinal->nins; + break; /* Done. */ + } +- ++#if SW64_DEBUG_WI ++ memset(as->mcbot, 0, sizeof(MCode)*(as->mctop - as->mcbot)); ++#endif + /* Otherwise try again with a bigger IR. */ + lj_trace_free(J2G(J), J->curfinal); + J->curfinal = NULL; /* In case lj_trace_alloc() OOMs. */ +diff --git a/src/lj_asm_sw64.h b/src/lj_asm_sw64.h +new file mode 100644 +index 0000000..6564c77 +--- /dev/null ++++ b/src/lj_asm_sw64.h +@@ -0,0 +1,2072 @@ ++/* ++** SW64 IR assembler (SSA IR -> machine code). ++** Copyright (C) 2019 deepin inc. See Copyright Notice in luajit.h ++*/ ++ ++#include ++#define TODO do {printf("\e[1;34mTODO IMPLEMENT %s\e[m\n", __FUNCTION__); asm("bpt;bpt");} while(0); ++ ++#define EXIT_ROOM 6 ++ ++/* -- Register allocator extensions --------------------------------------- */ ++ ++/* Allocate a register with a hint. */ ++static Reg ra_hintalloc(ASMState *as, IRRef ref, Reg hint, RegSet allow) ++{ ++ Reg r = IR(ref)->r; ++ if (ra_noreg(r)) { ++ if (!ra_hashint(r) && !iscrossref(as, ref)) ++ ra_sethint(IR(ref)->r, hint); /* Propagate register hint. */ ++ r = ra_allocref(as, ref, allow); ++ } ++ ra_noweak(as, r); ++ return r; ++} ++ ++/* Allocate a register or RID_ZERO. */ ++static Reg ra_alloc1z(ASMState *as, IRRef ref, RegSet allow) ++{ ++ Reg r = IR(ref)->r; ++ if (ra_noreg(r)) { ++ if (!(allow & RSET_FPR) && irref_isk(ref) && get_kval(IR(ref)) == 0) ++ return RID_ZERO; ++ r = ra_allocref(as, ref, allow); ++ } else { ++ ra_noweak(as, r); ++ } ++ return r; ++} ++ ++/* Allocate two source registers for three-operand instructions. 
*/ ++static Reg ra_alloc2(ASMState *as, IRIns *ir, RegSet allow) ++{ ++ IRIns *irl = IR(ir->op1), *irr = IR(ir->op2); ++ Reg left = irl->r, right = irr->r; ++ if (ra_hasreg(left)) { ++ ra_noweak(as, left); ++ if (ra_noreg(right)) ++ right = ra_alloc1z(as, ir->op2, rset_exclude(allow, left)); ++ else ++ ra_noweak(as, right); ++ } else if (ra_hasreg(right)) { ++ ra_noweak(as, right); ++ left = ra_alloc1z(as, ir->op1, rset_exclude(allow, right)); ++ } else if (ra_hashint(right)) { ++ right = ra_alloc1z(as, ir->op2, allow); ++ left = ra_alloc1z(as, ir->op1, rset_exclude(allow, right)); ++ } else { ++ left = ra_alloc1z(as, ir->op1, allow); ++ right = ra_alloc1z(as, ir->op2, rset_exclude(allow, left)); ++ } ++ return left | (right << 8); ++} ++ ++ ++/* -- Operand fusion ------------------------------------------------------ */ ++ ++/* Limit linear search to this distance. Avoids O(n^2) behavior. */ ++#define CONFLICT_SEARCH_LIM 31 ++ ++/* Check if there's no conflicting instruction between curins and ref. */ ++static int noconflict(ASMState *as, IRRef ref, IROp conflict) ++{ ++ IRIns *ir = as->ir; ++ IRRef i = as->curins; ++ if (i > ref + CONFLICT_SEARCH_LIM) ++ return 0; /* Give up, ref is too far away. */ ++ while (--i > ref) ++ if (ir[i].o == conflict) ++ return 0; /* Conflict found. */ ++ return 1; /* Ok, no conflict. */ ++} ++ ++/* Fuse the array base of colocated arrays. */ ++static int32_t asm_fuseabase(ASMState *as, IRRef ref) ++{ ++ IRIns *ir = IR(ref); ++ if (ir->o == IR_TNEW && ir->op1 <= LJ_MAX_COLOSIZE && ++ !neverfuse(as) && noconflict(as, ref, IR_NEWREF)) ++ return (int32_t)sizeof(GCtab); ++ return 0; ++} ++ ++/* Fuse array/hash/upvalue reference into register+offset operand. 
*/ ++static Reg asm_fuseahuref(ASMState *as, IRRef ref, int32_t *ofsp, RegSet allow) ++{ ++ IRIns *ir = IR(ref); ++ if (ra_noreg(ir->r)) { ++ if (ir->o == IR_AREF) { ++ if (mayfuse(as, ref)) { ++ if (irref_isk(ir->op2)) { ++ IRRef tab = IR(ir->op1)->op1; ++ int32_t ofs = asm_fuseabase(as, tab); ++ IRRef refa = ofs ? tab : ir->op1; ++ ofs += 8*IR(ir->op2)->i; ++ if (checki16(ofs)) { ++ *ofsp = ofs; ++ return ra_alloc1(as, refa, allow); ++ } ++ } ++ } ++ } else if (ir->o == IR_HREFK) { ++ if (mayfuse(as, ref)) { ++ int32_t ofs = (int32_t)(IR(ir->op2)->op2 * sizeof(Node)); ++ if (checki16(ofs)) { ++ *ofsp = ofs; ++ return ra_alloc1(as, ir->op1, allow); ++ } ++ } ++ } else if (ir->o == IR_UREFC) { ++ if (irref_isk(ir->op1)) { ++ GCfunc *fn = ir_kfunc(IR(ir->op1)); ++ intptr_t ofs = (intptr_t)&gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv.tv; ++ intptr_t jgl = (intptr_t)J2G(as->J); ++ if ((uintptr_t)(ofs-jgl) < 65536) { ++ *ofsp = ofs-jgl-32768; ++ return RID_JGL; ++ } else { ++ *ofsp = (int16_t)ofs; ++ return ra_allock(as, ofs-(int16_t)ofs, allow); ++ } ++ } ++ } ++ } ++ *ofsp = 0; ++ return ra_alloc1(as, ref, allow); ++} ++ ++/* Fuse XLOAD/XSTORE reference into load/store operand. */ ++ ++/* --- LOAD ADDRESS MACRO ------------------------------------------------ */ ++ ++static int asm_lda(ASMState* as, MCode* mcp, Reg dest, uintptr_t addr) ++{ ++ int count = 0; ++ int16_t hi, lo; ++ MCode mtmp[5] = {0}; ++ split64AddrHI32(addr, &hi, &lo); ++ if (hi != 0) { ++ // ldih dest, hi(zero) ++ mtmp[count++] = SW64I_LDIH | SW64F_A(dest) | SW64F_DISP(hi, RID_ZERO); ++ } ++ if (lo != 0) { ++ // ldi dest, lo(dest or zero) ++ mtmp[count++] = SW64I_LDI | SW64F_A(dest) | SW64F_DISP(lo, hi ? dest : RID_ZERO); ++ } ++ if (hi || lo) { ++ // slli dest, 32, dest ++ mtmp[count++] = SW64I_SLLI | SW64F_A(dest) | SW64F_j(32) | SW64F_D(dest); ++ } ++ ++ split64AddrLO32(addr, &hi, &lo); ++ mtmp[count] = SW64I_LDIH | SW64F_A(dest) | SW64F_DISP(hi, count > 1 ? 
dest : RID_ZERO); ++ count++; ++ mtmp[count++] = SW64I_LDI | SW64F_A(dest) | SW64F_DISP(lo, dest); ++ ++ for (int i=count-1; i>=0; i--) { ++ __WI(&mcp[i-count], mtmp[i]); ++ } ++ return count; ++} ++ ++ ++/* -- Guard handling ------------------------------------------------------ */ ++ ++/* Need some spare long-range jump slots, for out-of-range branches. */ ++#define SW64_SPAREJUMP 4 ++ ++/* Setup spare long-range jump slots per mcarea. */ ++static void asm_sparejump_setup(ASMState *as) ++{ ++ MCode *mxp = as->mcbot; ++ if (((uintptr_t)mxp & (LJ_PAGESIZE-1)) == 8) { ++ lua_assert(SW64I_NOP == 0x43ff075f); ++ memset(mxp, SW64I_NOP, SW64_SPAREJUMP*2*sizeof(MCode)); ++ mxp += SW64_SPAREJUMP*2; ++ lua_assert(mxp < as->mctop); ++ lj_mcode_sync(as->mcbot, mxp); ++ lj_mcode_commitbot(as->J, mxp); ++ as->mcbot = mxp; ++ as->mclim = as->mcbot + MCLIM_REDZONE; ++ } ++} ++ ++/* Setup exit stub after the end of each trace. */ ++static void asm_exitstub_setup(ASMState *as) ++{ ++ MCode *mxp = as->mctop; ++ /* ++ stw TMP, 0(sp); //store exit number ++ ++ ldi TMP, traceno(zero); ++ lda at, lj_vm_exit_handler ++ call zero, (at); ++ */ ++ ++ __WI(--mxp, SW64I_CALL | SW64F_A(RID_ZERO) | SW64F_DISP(0, RID_R28)); ++ ++ mxp -= asm_lda(as, mxp, RID_R28, (uintptr_t)(void*)lj_vm_exit_handler); ++ ++ __WI(--mxp, SW64I_LDI | SW64F_A(RID_TMP) | SW64F_DISPI(as->T->traceno)); ++ ++#if SW64_DEBUG_WI ++ __WI(--mxp, SW64I_STL | SW64F_A(RID_TMP) | SW64F_DISP(0, RID_SP)); ++#else ++ __WI(--mxp, SW64I_STW | SW64F_A(RID_TMP) | SW64F_DISP(0, RID_SP)); ++#endif ++ ++ as->mctop = mxp; ++} ++ ++/* Keep this in-sync with exitstub_trace_addr(). */ ++#define asm_exitstub_addr(as) ((as)->mctop) ++ ++/* Emit conditional branch to exit for guard. 
*/ ++static void asm_guard(ASMState *as, SW64Ins mi, Reg a) ++{ ++ lua_assert(a != RID_TMP); ++ MCode *target = asm_exitstub_addr(as); ++ MCode *p = as->mcp; ++ if (LJ_UNLIKELY(p == as->invmcp)) { ++ as->invmcp = NULL; ++ as->loopinv = 1; ++ as->mcp = p+1; ++ mi = invert_cond(mi); ++ target = p; /* Patch target later in asm_loop_fixup. */ ++ } ++ lua_assert(as->snapno >= 0); ++ ++ // bxx a, target ++ emit_branch(as, mi, a, target); ++ emit_Ao(as, SW64I_LDI, RID_TMP, RID_ZERO, as->snapno); ++} ++static void asm_compare_guard(ASMState* as, SW64Ins cmp, ++ Reg a, Reg b, MCode *target) ++{ ++ switch(SW64_OP(cmp)) { ++ case SW64_OP(0x60000000): ++ emit_branch(as, SW64I_FBNE, RID_F28, target); ++#if SW64_DEBUG_WI ++ emit_Ao(as, SW64I_LDI, RID_TMP, RID_TMP, as->snapno); ++ emit_loadu64(as, RID_TMP, (((unsigned long)(void*)as->mcp) << 32)); ++#else ++ // ldi RID_TMP, as->snapno(zero) ++ emit_Ao(as, SW64I_LDI, RID_TMP, RID_ZERO, as->snapno); ++#endif ++ emit_FGI(as, cmp, a, b, RID_F28); ++ break; ++ case SW64_OP(0x40000000): ++ emit_branch(as, SW64I_BNE, RID_R28, target); ++#if SW64_DEBUG_WI ++ emit_Ao(as, SW64I_LDI, RID_TMP, RID_TMP, as->snapno); ++ emit_loadu64(as, RID_TMP, (((unsigned long)(void*)as->mcp) << 32)); ++#else ++ // ldi RID_TMP, as->snapno(zero) ++ emit_Ao(as, SW64I_LDI, RID_TMP, RID_ZERO, as->snapno); ++#endif ++ emit_ABD(as, cmp, a, b, RID_R28); ++ break; ++ default: ++ lua_assert(!"NOT HRERE"); ++ } ++} ++ ++/* -- Operand fusion ------------------------------------------------------ */ ++ ++/* Limit linear search to this distance. Avoids O(n^2) behavior. */ ++#define CONFLICT_SEARCH_LIM 31 ++ ++/* Fuse XLOAD/XSTORE reference into load/store operand. 
*/ ++static void asm_fusexref(ASMState *as, SW64Ins mi, Reg rt, IRRef ref, ++ RegSet allow, int32_t ofs) ++{ ++ IRIns *ir = IR(ref); ++ Reg base; ++ if (ra_noreg(ir->r) && canfuse(as, ir)) { ++ if (ir->o == IR_ADD) { ++ intptr_t ofs2; ++ if (irref_isk(ir->op2) && (ofs2 = ofs + get_kval(IR(ir->op2)), ++ checki16(ofs2))) { ++ ref = ir->op1; ++ ofs = (int32_t)ofs2; ++ } ++ } else if (ir->o == IR_STRREF) { ++ intptr_t ofs2 = 65536; ++ lua_assert(ofs == 0); ++ ofs = (int32_t)sizeof(GCstr); ++ if (irref_isk(ir->op2)) { ++ ofs2 = ofs + get_kval(IR(ir->op2)); ++ ref = ir->op1; ++ } else if (irref_isk(ir->op1)) { ++ ofs2 = ofs + get_kval(IR(ir->op1)); ++ ref = ir->op2; ++ } ++ if (!checki16(ofs2)) { ++ /* NYI: Fuse ADD with constant. */ ++ Reg right, left = ra_alloc2(as, ir, allow); ++ right = (left >> 8); left &= 255; ++ emit_Ao(as, mi, rt, RID_TMP, ofs); ++ emit_ABD(as, SW64I_ADDL, left, right, RID_TMP); ++ return; ++ } ++ ofs = ofs2; ++ } ++ } ++ base = ra_alloc1(as, ref, allow); ++ emit_Ao(as, mi, rt, base, ofs); ++} ++ ++/* -- Calls --------------------------------------------------------------- */ ++ ++/* Generate a call to a C function. */ ++static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args) ++{ ++ uint32_t n, nargs = CCI_XNARGS(ci); ++ int32_t ofs = 0; ++ Reg gpr, fpr = REGARG_FIRSTFPR; ++ if ((void *)ci->func) ++ emit_call(as, (void *)ci->func, 1); ++ for (gpr = REGARG_FIRSTGPR; gpr <= REGARG_LASTGPR; gpr++) ++ as->cost[gpr] = REGCOST(~0u, ASMREF_L); ++ gpr = REGARG_FIRSTGPR; ++ for (n = 0; n < nargs; n++) { /* Setup args. */ ++ IRRef ref = args[n]; ++ if (ref) { ++ IRIns *ir = IR(ref); ++ if (irt_isfp(ir->t) && fpr <= REGARG_LASTFPR && ++ !(ci->flags & CCI_VARARG)) { ++ lua_assert(rset_test(as->freeset, fpr)); /* Already evicted. */ ++ ra_leftov(as, fpr, ref); ++ fpr += 1; ++ gpr += 1; ++ } else { ++ if (gpr <= REGARG_LASTGPR) { ++ lua_assert(rset_test(as->freeset, gpr)); /* Already evicted. 
*/ ++ if (irt_isfp(ir->t)) { ++ RegSet of = as->freeset; ++ Reg r; ++ /* Workaround to protect argument GPRs from being used for remat. */ ++ as->freeset &= ~RSET_RANGE(REGARG_FIRSTGPR, REGARG_LASTGPR+1); ++ r = ra_alloc1(as, ref, RSET_FPR); ++ as->freeset |= (of & RSET_RANGE(REGARG_FIRSTGPR, REGARG_LASTGPR+1)); ++ if (irt_isnum(ir->t)) { ++ emit_GI(as, SW64I_FCVTLD, r, r); ++ emit_AI(as, SW64I_IFMOVD, gpr, r); ++ gpr++; fpr++; ++ } else if (irt_isfloat(ir->t)) { ++ emit_GI(as, SW64I_FCVTLS, r, r); ++ emit_AI(as, SW64I_IFMOVS, gpr, r); ++ gpr++; fpr++; ++ } ++ } else { ++ ra_leftov(as, gpr, ref); ++ gpr++; fpr++; ++ } ++ } else { ++ Reg r = ra_alloc1z(as, ref, irt_isfp(ir->t) ? RSET_FPR : RSET_GPR); ++ if (irt_isnum(ir->t)) { ++ emit_Ao(as, SW64I_FSTD, r, RID_SP, ofs); ++ } else if(irt_isfloat(ir->t)) { ++ emit_Ao(as, SW64I_FSTS, r, RID_SP, ofs); ++ } else { ++ emit_Ao(as, SW64I_STL, r, RID_SP, ofs); ++ } ++ ofs += 8; ++ } ++ } ++ } else { ++ fpr = REGARG_LASTFPR+1; ++ if (gpr <= REGARG_LASTGPR) { ++ gpr++; fpr++; ++ } else { ++ ofs += 8; ++ } ++ } ++ checkmclim(as); ++ } ++} ++ ++/* Setup result reg/sp for call. Evict scratch regs. */ ++static void asm_setupresult(ASMState *as, IRIns *ir, const CCallInfo *ci) ++{ ++ WI_DEBUG_BEFORE(); ++ RegSet drop = RSET_SCRATCH; ++ if ((ci->flags & CCI_NOFPRCLOBBER)) ++ drop &= ~RSET_FPR; ++ if (ra_hasreg(ir->r)) ++ rset_clear(drop, ir->r); /* Dest reg handled below. */ ++ ra_evictset(as, drop); /* Evictions must be performed first. 
*/ ++ if (ra_used(ir)) { ++ lua_assert(!irt_ispri(ir->t)); ++ if (irt_isfp(ir->t)) { ++ if ((ci->flags & CCI_CASTU64)) { ++ int32_t ofs = sps_scale(ir->s); ++ Reg dest = ir->r; ++ if (ra_hasreg(dest)) { ++ ra_free(as, dest); ++ ra_modified(as, dest); ++ // This doesn't require FCVTLD, refer to the `lj_math_random_step` ++ emit_AI(as, SW64I_IFMOVD, RID_RET, dest); ++ } ++ if (ofs) { ++ emit_Ao(as, SW64I_STL, RID_RET, RID_SP, ofs); ++ } ++ } else { ++ ra_destreg(as, ir, RID_FPRET); ++ } ++ } else { ++ ra_destreg(as, ir, RID_RET); ++ } ++ } ++ WI_DEBUG_END(); ++} ++ ++static void asm_callx(ASMState *as, IRIns *ir) ++{ ++ IRRef args[CCI_NARGS_MAX*2]; ++ CCallInfo ci; ++ IRRef func; ++ IRIns *irf; ++ ci.flags = asm_callx_flags(as, ir); ++ asm_collectargs(as, ir, &ci, args); ++ asm_setupresult(as, ir, &ci); ++ func = ir->op2; irf = IR(func); ++ if (irf->o == IR_CARG) { func = irf->op1; irf = IR(func); } ++ if (irref_isk(func)) { /* Call to constant address. */ ++ ci.func = (ASMFunction)(void *)get_kval(irf); ++ } else { /* Need specific register for indirect calls. */ ++ Reg r = ra_alloc1(as, func, RID2RSET(RID_CFUNCADDR)); ++ MCode *p = as->mcp; ++ ++ __WI(--p, SW64I_CALL | SW64F_A(RID_RA) | SW64F_B(r)); ++ if (r != RID_CFUNCADDR) ++ __WI(--p, SW64I_LDI | SW64F_A(RID_CFUNCADDR) | SW64F_DISP(0, r)); ++ ++ as->mcp = p; ++ ci.func = (ASMFunction)(void *)0; ++ } ++ asm_gencall(as, &ci, args); ++} ++ ++/* -- Returns ------------------------------------------------------------- */ ++ ++/* Return to lower frame. Guard that it goes to the right spot. */ ++static void asm_retf(ASMState *as, IRIns *ir) ++{ ++ Reg base = ra_alloc1(as, REF_BASE, RSET_GPR); ++ void *pc = ir_kptr(IR(ir->op2)); ++ int32_t delta = 1+LJ_FR2+bc_a(*((const BCIns *)pc - 1)); ++ as->topslot -= (BCReg)delta; ++ if ((int32_t)as->topslot < 0) as->topslot = 0; ++ irt_setmark(IR(REF_BASE)->t); /* Children must not coalesce with BASE reg. 
*/ ++ emit_setgl(as, base, jit_base); ++ emit_addptr(as, base, -8*delta); ++ ++ asm_guard(as, SW64I_BEQ, RID_R28); ++ emit_ABD(as, SW64I_CMPEQ, RID_TMP, ra_allock(as, igcptr(pc), rset_exclude(RSET_GPR, base)), RID_R28); ++ ++ emit_Ao(as, SW64I_AL, RID_TMP, base, -8); ++} ++ ++/* -- Type conversions ---------------------------------------------------- */ ++ ++static void asm_tointg(ASMState *as, IRIns *ir, Reg left) ++{ ++ Reg tmp = ra_scratch(as, rset_exclude(RSET_FPR, left)); ++ Reg dest = ra_dest(as, ir, RSET_GPR); ++ asm_guard(as, SW64I_FBEQ, tmp); ++ emit_FGI(as, SW64I_FCMPEQ, tmp, left, tmp); ++ emit_GI(as, SW64I_FCVTLD, tmp, tmp); ++ emit_FD(as, SW64I_FIMOVD, tmp, dest); ++ emit_GI(as, SW64I_FCVTLW, tmp, tmp); ++ lua_assert(irt_isint(ir->t)); ++ emit_GI(as, SW64I_FCVTDL, left, tmp); ++} ++ ++static void asm_tobit(ASMState *as, IRIns *ir) ++{ ++ RegSet allow = RSET_FPR; ++ Reg dest = ra_dest(as, ir, RSET_GPR); ++ Reg left = ra_alloc1(as, ir->op1, allow); ++ Reg right = ra_alloc1(as, ir->op2, rset_clear(allow, left)); ++ Reg tmp = ra_scratch(as, rset_clear(allow, right)); ++ ++ emit_ABD(as, SW64I_ADDW, RID_ZERO, dest, dest); ++ emit_FD(as, SW64I_FIMOVD, tmp, dest); ++ emit_FGI(as, SW64I_FADDD, left, right, tmp); ++} ++ ++static void asm_conv(ASMState *as, IRIns *ir) ++{ ++ IRType st = (IRType)(ir->op2 & IRCONV_SRCMASK); ++ int st64 = (st == IRT_I64 || st == IRT_U64 || (LJ_64 && st == IRT_P64)); ++ int sti8 = st == IRT_I8; ++ int stu8 = st == IRT_U8; ++ int sti16 = st == IRT_I16; ++ int stu16 = st == IRT_U16; ++ int stu32 = st == IRT_U32; ++ int stu64 = st == IRT_U64; ++ int stfp = (st == IRT_NUM || st == IRT_FLOAT); ++ ++ IRRef lref = ir->op1; ++ ++ lua_assert(irt_type(ir->t) != st); ++ ++ if (irt_isfp(ir->t)) { ++ Reg dest = ra_dest(as, ir, RSET_FPR); ++ if (stfp) { /* FP to FP conversion. */ ++ emit_GI(as, st == IRT_NUM ? 
SW64I_FCVTDS : SW64I_FCVTSD, ++ ra_alloc1(as, lref, RSET_FPR), dest); ++ } else if (stu64) { ++ Reg left = ra_alloc1(as, lref, RSET_GPR); ++ MCLabel l_end = emit_label(as); ++ if (irt_isfloat(ir->t)) { ++ TODO; ++ } else { ++ emit_FGI(as, SW64I_FADDD, dest, dest, dest); ++ emit_GI(as, SW64I_FCVTLD, dest, dest); ++ emit_AI(as, SW64I_IFMOVD, RID_R28, dest); ++ emit_ABD(as,SW64I_BIS, RID_R28, left, RID_R28); ++ emit_AjD(as,SW64I_ANDI, left, 1, left); ++ emit_AjD(as,SW64I_SRLI, left, 1, RID_R28); ++ } ++ emit_branch(as, SW64I_BGE, left, l_end); ++ emit_GI(as, SW64I_FCVTLD, dest, dest); ++ emit_AI(as, SW64I_IFMOVD, left, dest); ++ } else { /* Integer to FP conversion. */ ++ Reg left = ra_alloc1(as, lref, RSET_GPR); ++ emit_GI(as, irt_isfloat(ir->t) ? SW64I_FCVTLS : SW64I_FCVTLD, dest, dest); ++ if (stu32) { ++ emit_AI(as, SW64I_IFMOVD, RID_R28, dest); ++ emit_AjD(as, SW64I_EXTLWI, left, 0, RID_R28); ++ } else { ++ emit_AI(as, SW64I_IFMOVD, left, dest); ++ } ++ } ++ } else if (stfp) { /* FP to integer conversion. */ ++ if (irt_isguard(ir->t)) { ++ /* Checked conversions are only supported from number to int. */ ++ lua_assert(irt_isint(ir->t) && st == IRT_NUM); ++ asm_tointg(as, ir, ra_alloc1(as, lref, RSET_FPR)); ++ } else { ++ Reg dest = ra_dest(as, ir, RSET_GPR); ++ Reg left = ra_alloc1(as, lref, RSET_FPR); ++ Reg tmp = ra_scratch(as, rset_exclude(RSET_FPR, left)); ++ if (irt_isu32(ir->t)) { /* FP to U32 conversion. 
*/ ++ emit_AjD(as, SW64I_EXTLWI, dest, 0, dest); ++ } ++ emit_FD(as, SW64I_FIMOVD, tmp, dest); ++ emit_GI(as, SW64I_FCVTDL, left, tmp); ++ } ++ } else { ++ Reg dest = ra_dest(as, ir, RSET_GPR); ++ if (irt_isu32(ir->t)) { ++ emit_ABD(as, SW64I_EXTLWI, dest, 0, dest); ++ } ++ ++ if (st64 && irt_isint(ir->t)) { ++ Reg left = ra_alloc1(as, lref, RSET_GPR); ++ emit_AjD(as, SW64I_EXTLWI, left, 0, dest); ++ } else if (irt_isu64(ir->t) && st == IRT_INT) { ++ Reg left = ra_alloc1(as, lref, RSET_GPR); ++ emit_AjD(as, SW64I_EXTLWI, left, 0, dest); ++ } else if (sti8) { ++ Reg left = ra_alloc1(as, lref, RSET_GPR); ++ if (!irt_is64(ir->t)) { ++ emit_ABD(as, SW64I_EXTLWI, dest, 0, dest); ++ emit_ABD(as, SW64I_ADDW, RID_ZERO, dest, dest); ++ } ++ emit_ABD(as, SW64I_SEXTB, 0, left, dest); ++ } else if (stu8) { ++ Reg left = ra_alloc1(as, lref, RSET_GPR); ++ emit_ABD(as, SW64I_EXTLBI, left, 0, dest); ++ } else if (sti16) { ++ Reg left = ra_alloc1(as, lref, RSET_GPR); ++ if (!irt_is64(ir->t)) { ++ emit_ABD(as, SW64I_EXTLWI, dest, 0, dest); ++ emit_ABD(as, SW64I_ADDW, RID_ZERO, dest, dest); ++ } ++ emit_ABD(as, SW64I_SEXTH, 0, left, dest); ++ } else if (stu16) { ++ Reg left = ra_alloc1(as, lref, RSET_GPR); ++ emit_ABD(as, SW64I_EXTLHI, left, 0, dest); ++ } else if (stu32) { ++ Reg left = ra_alloc1(as, lref, RSET_GPR); ++ if (irt_isint(ir->t)) ++ emit_ABD(as, SW64I_ADDW, RID_ZERO, dest, dest); ++ emit_AjD(as, SW64I_EXTLWI, left, 0, dest); ++ } else { ++ ra_leftov(as, dest, lref); /* Do nothing, but may need to move regs. */ ++ } ++ } ++} ++ ++static void asm_strto(ASMState *as, IRIns *ir) ++{ ++ const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_strscan_num]; ++ IRRef args[2]; ++ int32_t ofs = 0; ++ RegSet drop = RSET_SCRATCH; ++ if (ra_hasreg(ir->r)) rset_set(drop, ir->r); /* Spill dest reg (if any). */ ++ ra_evictset(as, drop); ++ ofs = sps_scale(ir->s); ++ asm_guard(as, SW64I_BEQ, RID_RET); /* Test return status. 
*/ ++ args[0] = ir->op1; /* GCstr *str */ ++ args[1] = ASMREF_TMP1; /* TValue *n */ ++ asm_gencall(as, ci, args); ++ /* Store the result to the spill slot or temp slots. */ ++ emit_Ao(as, SW64I_LDI, ra_releasetmp(as, ASMREF_TMP1), ++ RID_SP, ofs); ++} ++ ++/* -- Memory references --------------------------------------------------- */ ++ ++/* Store tagged value for ref at base+ofs. */ ++static void asm_tvstore64(ASMState *as, Reg base, int32_t ofs, IRRef ref) ++{ ++ RegSet allow = rset_exclude(RSET_GPR, base); ++ IRIns *ir = IR(ref); ++ lua_assert(irt_ispri(ir->t) || irt_isaddr(ir->t) || irt_isinteger(ir->t)); ++ if (irref_isk(ref)) { ++ TValue k; ++ lj_ir_kvalue(as->J->L, &k, ir); ++ emit_Ao(as, SW64I_STL, ra_allock(as, (int64_t)k.u64, allow), base, ofs); ++ } else { ++ Reg src = ra_alloc1(as, ref, allow); ++ Reg type = ra_allock(as, (int64_t)irt_toitype(ir->t) << 47, ++ rset_exclude(allow, src)); ++ emit_Ao(as, SW64I_STL, RID_TMP, base, ofs); ++ if (irt_isinteger(ir->t)) { ++ emit_ABD(as, SW64I_ADDL, RID_TMP, type, RID_TMP); ++ emit_AjD(as, SW64I_EXTLWI, src, 0, RID_TMP); ++ } else { ++ emit_ABD(as, SW64I_ADDL, src, type, RID_TMP); ++ } ++ } ++} ++/* Get pointer to TValue. */ ++static void asm_tvptr(ASMState *as, Reg dest, IRRef ref) ++{ ++ IRIns *ir = IR(ref); ++ if (irt_isnum(ir->t)) { ++ if (irref_isk(ref)) /* Use the number constant itself as a TValue. */ ++ ra_allockreg(as, igcptr(ir_knum(ir)), dest); ++ else /* Otherwise force a spill and use the spill slot. */ ++ emit_Ao(as, SW64I_LDI, dest, RID_SP, ra_spill(as, ir)); ++ } else { ++ /* Otherwise use g->tmptv to hold the TValue. */ ++ asm_tvstore64(as, dest, 0, ref); ++ emit_Ao(as, SW64I_LDI, dest, RID_JGL, ++ (int32_t)(offsetof(global_State, tmptv)-32768)); ++ } ++} ++ ++static void asm_aref(ASMState *as, IRIns *ir) ++{ ++ Reg dest = ra_dest(as, ir, RSET_GPR); ++ Reg idx, base; ++ if (irref_isk(ir->op2)) { ++ IRRef tab = IR(ir->op1)->op1; ++ int32_t ofs = asm_fuseabase(as, tab); ++ IRRef refa = ofs ? 
tab : ir->op1; ++ ofs += 8*IR(ir->op2)->i; ++ if (checki16(ofs)) { ++ base = ra_alloc1(as, refa, RSET_GPR); ++ emit_Ao(as, SW64I_LDI, dest, base, ofs); ++ return; ++ } ++ } ++ base = ra_alloc1(as, ir->op1, RSET_GPR); ++ idx = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, base)); ++ emit_ABD(as, SW64I_S8ADDL, idx, base, dest); ++} ++ ++/* Inlined hash lookup. Specialized for key type and for const keys. ++** The equivalent C code is: ++** Node *n = hashkey(t, key); ++** do { ++** if (lj_obj_equal(&n->key, key)) return &n->val; ++** } while ((n = nextnode(n))); ++** return niltv(L); ++*/ ++static void asm_href(ASMState *as, IRIns *ir, IROp merge) ++{ ++ WI_DEBUG_BEFORE(); ++ RegSet allow = RSET_GPR; ++ int destused = ra_used(ir); ++ Reg dest = ra_dest(as, ir, allow); ++ Reg tab = ra_alloc1(as, ir->op1, rset_clear(allow, dest)); ++ Reg key = RID_NONE, type = RID_NONE, tmpnum = RID_NONE, tmp1, tmp2; ++ ++ Reg cmp64 = RID_NONE; ++ ++ IRRef refkey = ir->op2; ++ IRIns *irkey = IR(refkey); ++ int isk = irref_isk(refkey); ++ IRType1 kt = irkey->t; ++ uint32_t khash; ++ MCLabel l_end, l_loop, l_next; ++ ++ rset_clear(allow, tab); ++ tmp1 = ra_scratch(as, allow); ++ rset_clear(allow, tmp1); ++ tmp2 = ra_scratch(as, allow); ++ rset_clear(allow, tmp2); ++ ++ if ( irt_isnum(kt)) { ++ key = ra_alloc1(as, refkey, RSET_FPR); ++ tmpnum = ra_scratch(as, rset_exclude(RSET_FPR, key)); ++ } else if (!irt_ispri(kt)) { ++ key = ra_alloc1(as, refkey, allow); ++ rset_clear(allow, key); ++ } ++ ++ if (!irt_isnum(kt)) { ++ /* Allocate cmp64 register used for 64-bit comparisons */ ++ if ( irt_isnum(kt)) { ++ cmp64 = key; ++ } else if (!isk && irt_isaddr(kt)) { ++ cmp64 = tmp2; ++ } else { ++ int64_t k; ++ if (isk && irt_isaddr(kt)) { ++ k = ((int64_t)irt_toitype(irkey->t) << 47) | irkey[1].tv.u64; ++ } else { ++ lua_assert(irt_ispri(kt) && !irt_isnil(kt)); ++ k = ~((int64_t)~irt_toitype(ir->t) << 47); ++ } ++ cmp64 = ra_allock(as, k, allow); ++ rset_clear(allow, cmp64); ++ } ++ } ++ ++ /* Key 
not found in chain: jump to exit (if merged) or load niltv. */ ++ l_end = emit_label(as); ++ as->invmcp = NULL; ++ if (merge == IR_NE) { ++ asm_guard(as, SW64I_BEQ, RID_ZERO); ++ } else if (destused) { ++ emit_loada(as, dest, niltvg(J2G(as->J))); ++ } ++ /* Follow hash chain until the end. */ ++ l_loop = --as->mcp; ++ emit_move(as, dest, tmp1); ++ emit_Ao(as, SW64I_AL, tmp1, dest, (int32_t)offsetof(Node, next)); ++ l_next = emit_label(as); ++ ++ /* Type and value comparison. */ ++ if (merge == IR_EQ) { /* Must match asm_guard(). */ ++ l_end = asm_exitstub_addr(as); ++ } ++ if ( irt_isnum(kt)) { ++ emit_branch(as, SW64I_BEQ, RID_R28, l_end); ++ emit_Ao(as, SW64I_LDI, RID_TMP, RID_ZERO, as->snapno); ++ emit_FGI(as, SW64I_FCMPEQ, tmpnum, key, RID_R28); ++ Reg isnum = ra_allock(as, (int32_t)LJ_TISNUM, allow); ++ emit_branch(as, SW64I_BEQ, tmp1, l_next); ++ emit_ABD(as, SW64I_CMPULT, tmp1, isnum, tmp1); ++ emit_AjD(as, SW64I_SRAI, tmp1, 47, tmp1); ++ emit_AI(as, SW64I_IFMOVD, tmp1, tmpnum); ++ } else { ++ emit_branch(as, SW64I_BNE, RID_R28, l_end); ++ emit_ABD(as, SW64I_CMPEQ, tmp1, cmp64, RID_R28); ++ emit_Ao(as, SW64I_LDI, RID_TMP, RID_ZERO, as->snapno); ++ } ++ emit_Ao(as, SW64I_LDL, tmp1, dest, (int32_t)offsetof(Node, key.u64)); ++ *l_loop = SW64I_BNE | SW64F_A(tmp1) | ((as->mcp-l_loop-1) & 0x1fffff); ++ if (!isk && irt_isaddr(kt)) { ++ type = ra_allock(as, (int64_t)irt_toitype(kt) << 47, allow); ++ emit_ABD(as, SW64I_ADDL, key, type, tmp2); ++ rset_clear(allow, type); ++ } ++ ++ /* Load main position relative to tab->node into dest. */ ++ khash = isk ? 
ir_khash(irkey) : 1; ++ if (khash == 0) { ++ emit_Ao(as, SW64I_AL, dest, tab, (int32_t)offsetof(GCtab, node)); ++ } else { ++ Reg tmphash = tmp1; ++ if (isk) ++ tmphash = ra_allock(as, khash, allow); ++ ++ emit_ABD(as, SW64I_ADDL, dest, tmp1, dest); ++ lua_assert(sizeof(Node) == 24); ++ emit_ABD(as, SW64I_SUBW, tmp2, tmp1, tmp1); ++ emit_AjD(as, SW64I_SLLI, tmp1, 3, tmp1); ++ emit_AjD(as, SW64I_SLLI, tmp1, 5, tmp2); ++ ++ emit_ABD(as, SW64I_AND, tmp2, tmphash, tmp1); //tmp1 <- hmask & tmphash ++ emit_Ao(as, SW64I_AL, dest, tab, (int32_t)offsetof(GCtab, node)); ++ emit_Ao(as, SW64I_LDW, tmp2, tab, (int32_t)offsetof(GCtab, hmask)); ++ ++ if (isk) {//TODO ++ /* Nothing to do. */ ++ } else if (irt_isstr(kt)) { ++ emit_Ao(as, SW64I_LDW, tmp1, key, (int32_t)offsetof(GCstr, hash)); ++ } else { /* Must match with hash*() in lj_tab.c. */ ++ //hi = tmp1, lo = tmp2 ++ Reg hi = tmp1; ++ Reg lo = tmp2; ++ ++ /* hi = hi - lj_rol(lo, HASH_ROT3); */ ++ emit_ABD(as, SW64I_SUBL, hi, dest, hi); ++ emit_rotl32(as, lo, (HASH_ROT3)&31, dest, RID_R28); ++ ++ /* hi = lo ^ lj_rol(hi, HASH_ROT1 + HASH_ROT2); */ ++ emit_ABD(as, SW64I_XOR, lo, dest, hi); ++ emit_rotl32(as, hi, (HASH_ROT2+HASH_ROT1)&31, dest, RID_R28); ++ ++ /* lo = lo - lj_rol(hi, HASH_ROT1); */ ++ emit_ABD(as, SW64I_SUBL, lo, dest, lo); ++ emit_rotl32(as, hi, HASH_ROT1&31, dest, RID_R28); ++ ++ /* lo = lo ^ hi; */ ++ emit_ABD(as, SW64I_XOR, lo, hi, lo); ++ ++ ++ if (irt_isnum(kt)) { ++ emit_ABD(as, SW64I_ADDL, hi, hi, hi); // hi << 1 ++ ++ emit_AjD(as, SW64I_MASKLLI, tmp2, 4, lo); //lo ++ emit_AjD(as, SW64I_SRAI, tmp2, 32, hi); //hi ++ ++ emit_FD(as, SW64I_FIMOVD, key, tmp2); ++ } else { ++ emit_ABD(as, SW64I_XOR, key, tmp1, tmp2); ++ emit_rotl32(as, tmp1, HASH_ROT1&31, dest, tmp2); ++ emit_ABD(as, SW64I_ADDL, key, ra_allock(as, HASH_BIAS, allow), tmp1); ++ } ++ } ++ } ++ WI_DEBUG_END(); ++} ++ ++static void asm_hrefk(ASMState *as, IRIns *ir) ++{ ++ IRIns *kslot = IR(ir->op2); ++ IRIns *irkey = IR(kslot->op1); ++ int32_t ofs 
= (int32_t)(kslot->op2 * sizeof(Node)); ++ int32_t kofs = ofs + (int32_t)offsetof(Node, key); ++ Reg dest = (ra_used(ir)||ofs > 32736) ? ra_dest(as, ir, RSET_GPR) : RID_NONE; ++ Reg node = ra_alloc1(as, ir->op1, RSET_GPR); ++ RegSet allow = rset_exclude(RSET_GPR, node); ++ Reg idx = node; ++ Reg key = ra_scratch(as, allow); ++ int64_t k; ++ lua_assert(ofs % sizeof(Node) == 0); ++ if (ofs > 32736) { ++ idx = dest; ++ rset_clear(allow, dest); ++ kofs = (int32_t)offsetof(Node, key); ++ } else if (ra_hasreg(dest)) { ++ emit_Ao(as, SW64I_LDI, dest, node, ofs); ++ } ++ if (irt_ispri(irkey->t)) { ++ lua_assert(!irt_isnil(irkey->t)); ++ k = ~((int64_t)~irt_toitype(irkey->t) << 47); ++ } else if (irt_isnum(irkey->t)) { ++ k = (int64_t)ir_knum(irkey)->u64; ++ } else { ++ k = ((int64_t)irt_toitype(irkey->t) << 47) | (int64_t)ir_kgc(irkey); ++ } ++ asm_guard(as, SW64I_BEQ, RID_R28); ++ emit_ABD(as, SW64I_CMPEQ, key, ra_allock(as, k, allow), RID_R28); ++ emit_Ao(as, SW64I_LDL, key, idx, kofs); ++ if (ofs > 32736) ++ emit_ABD(as, SW64I_ADDL, node, ra_allock(as, ofs, allow), dest); ++} ++ ++static void asm_uref(ASMState *as, IRIns *ir) ++{ ++ Reg dest = ra_dest(as, ir, RSET_GPR); ++ if (irref_isk(ir->op1)) { ++ GCfunc *fn = ir_kfunc(IR(ir->op1)); ++ MRef *v = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv.v; ++ emit_lsptr(as, SW64I_AL, dest, v, RSET_GPR); ++ } else { ++ Reg uv = ra_scratch(as, RSET_GPR); ++ Reg func = ra_alloc1(as, ir->op1, RSET_GPR); ++ if (ir->o == IR_UREFC) { ++ asm_guard(as, SW64I_BEQ, RID_R28); ++ emit_Ao(as, SW64I_LDI, dest, uv, (int32_t)offsetof(GCupval, tv)); ++ emit_Ao(as, SW64I_LDBU, RID_R28, uv, (int32_t)offsetof(GCupval, closed)); ++ } else { ++ emit_Ao(as, SW64I_AL, dest, uv, (int32_t)offsetof(GCupval, v)); ++ } ++ emit_Ao(as, SW64I_AL, uv, func, (int32_t)offsetof(GCfuncL, uvptr) + ++ (int32_t)sizeof(MRef) * (int32_t)(ir->op2 >> 8)); ++ } ++} ++ ++static void asm_fref(ASMState *as, IRIns *ir) ++{ ++ UNUSED(as); UNUSED(ir); ++ lua_assert(!ra_used(ir)); ++} 
++ ++static void asm_strref(ASMState *as, IRIns *ir) ++{ ++ RegSet allow = RSET_GPR; ++ Reg dest = ra_dest(as, ir, allow); ++ Reg base = ra_alloc1(as, ir->op1, allow); ++ IRIns *irr = IR(ir->op2); ++ int32_t ofs = sizeof(GCstr); ++ rset_clear(allow, base); ++ if (irref_isk(ir->op2) && checki16(ofs + irr->i)) { ++ emit_Ao(as, SW64I_LDI, dest, base, ofs+irr->i); ++ } else { ++ emit_Ao(as, SW64I_LDI, dest, dest, ofs); ++ emit_ABD(as, SW64I_ADDL, base, ra_alloc1(as, ir->op2, allow), dest); ++ } ++} ++ ++/* -- Loads and stores ---------------------------------------------------- */ ++static void fxloadins_end(ASMState*as, SW64Ins mi, Reg r) ++{ ++ if (mi == SW64I_EXTLWI) { ++ emit_AjD(as, SW64I_EXTLWI, r, 0, r); ++ } else if (mi) ++ emit_ABD(as, mi, RID_ZERO, r, r); ++} ++static SW64Ins asm_fxloadins(IRIns *ir, SW64Ins* mi2) ++{ ++ *mi2 = 0; ++ switch (irt_type(ir->t)) { ++ case IRT_I8: ++ *mi2 = SW64I_SEXTB; //fallthrough ++ case IRT_U8: ++ return SW64I_LDBU; ++ ++ case IRT_I16: ++ *mi2 = SW64I_SEXTH; //fallthrough ++ case IRT_U16: ++ return SW64I_LDHU; ++ ++ case IRT_U32: ++ *mi2 = SW64I_EXTLWI; //fallthrough ++ case IRT_INT: ++ return SW64I_LDW; ++ ++ case IRT_NUM: return SW64I_FLDD; ++ case IRT_FLOAT: return SW64I_FLDS; ++ default: return irt_is64(ir->t) ? SW64I_LDL : SW64I_LDW; ++ } ++} ++ ++static SW64Ins asm_fxstoreins(IRIns *ir) ++{ ++ switch (irt_type(ir->t)) { ++ case IRT_I8: case IRT_U8: return SW64I_STB; ++ case IRT_I16: case IRT_U16: return SW64I_STH; ++ case IRT_NUM: return SW64I_FSTD; ++ case IRT_FLOAT: return SW64I_FSTS; ++#if LJ_64 && !LJ_GC64 ++ case IRT_LIGHTUD: lua_assert(0); /* NYI: mask 64 bit lightuserdata. */ ++#endif ++ default: return (irt_is64(ir->t)) ? 
SW64I_STL : SW64I_STW; ++ } ++} ++ ++static void asm_fload(ASMState *as, IRIns *ir) ++{ ++ Reg dest = ra_dest(as, ir, RSET_GPR); ++ SW64Ins mi2 = 0; ++ SW64Ins mi = asm_fxloadins(ir, &mi2); ++ Reg idx; ++ int32_t ofs; ++ if (ir->op1 == REF_NIL) { ++ idx = RID_JGL; ++ ofs = (ir->op2 << 2) - 32768 - GG_OFS(g); ++ } else { ++ idx = ra_alloc1(as, ir->op1, RSET_GPR); ++ if (ir->op2 == IRFL_TAB_ARRAY) { ++ ofs = asm_fuseabase(as, ir->op1); ++ if (ofs) { /* Turn the t->array load into an add for colocated arrays. */ ++ emit_Ao(as, SW64I_LDI, dest, idx, ofs); ++ return; ++ } ++ } ++ ofs = field_ofs[ir->op2]; ++ } ++ fxloadins_end(as, mi2, dest); ++ emit_Ao(as, mi, dest, idx, ofs); ++} ++ ++static void asm_fstore(ASMState *as, IRIns *ir) ++{ ++ if (ir->r != RID_SINK) { ++ Reg src = ra_alloc1z(as, ir->op2, RSET_GPR); ++ IRIns *irf = IR(ir->op1); ++ Reg idx = ra_alloc1(as, irf->op1, rset_exclude(RSET_GPR, src)); ++ int32_t ofs = field_ofs[irf->op2]; ++ SW64Ins mi = asm_fxstoreins(ir); ++ lua_assert(!irt_isfp(ir->t)); ++ emit_Ao(as, mi, src, idx, ofs); ++ } ++} ++ ++static void asm_xload(ASMState *as, IRIns *ir) ++{ ++ SW64Ins mi2 = 0; ++ SW64Ins mi = asm_fxloadins(ir, &mi2); ++ Reg dest = ra_dest(as, ir, ++ irt_isfp(ir->t) ? RSET_FPR : RSET_GPR); ++ lua_assert(!(ir->op2 & IRXLOAD_UNALIGNED)); ++ fxloadins_end(as, mi2, dest); ++ asm_fusexref(as, mi, dest, ir->op1, RSET_GPR, 0); ++} ++ ++static void asm_xstore_(ASMState *as, IRIns *ir, int32_t ofs) ++{ ++ if (ir->r != RID_SINK) { ++ Reg src = ra_alloc1z(as, ir->op2, ++ irt_isfp(ir->t) ? 
RSET_FPR : RSET_GPR); ++ asm_fusexref(as, asm_fxstoreins(ir), src, ir->op1, ++ rset_exclude(RSET_GPR, src), ofs); ++ } ++} ++ ++#define asm_xstore(as, ir) asm_xstore_(as, ir, 0) ++ ++#if LJ_64 && !LJ_GC64 ++static Reg asm_load_lightud64(ASMState *as, IRIns *ir, int typecheck) ++{ ++ ++ if (ra_used(ir) || typecheck) { ++ Reg dest = ra_dest(as, ir, RSET_GPR); ++ if (typecheck) { ++ Reg tmp = ra_scratch(as, rset_exclude(RSET_GPR, dest)); ++ asm_guard(as, SW64I_BEQ, RID_R28); ++ emit_ABD(as, SW64I_CMPEQ, ++ tmp, ra_allock(as, (int32_t)0x1fffe, rset_exclude(RSET_GPR, dest)), ++ RID_R28); ++ emit_AjD(as, SW64I_SRLI, dest, 47, tmp); ++ } ++ return dest; ++ } else { ++ return RID_NONE; ++ } ++} ++#endif ++ ++static void asm_ahuvload(ASMState *as, IRIns *ir) ++{ ++ Reg dest = RID_NONE, type, idx; ++ RegSet allow = RSET_GPR; ++ int32_t ofs = 0; ++ IRType1 t = ir->t; ++ ++ type = ra_scratch(as, allow); ++ rset_clear(allow, type); ++ ++ if (ra_used(ir)) { ++ lua_assert(irt_isnum(ir->t) || irt_isint(ir->t) || irt_isaddr(ir->t)); ++ dest = ra_dest(as, ir, irt_isnum(t) ? 
RSET_FPR : allow); ++ rset_clear(allow, dest); ++ if (irt_isaddr(t)) ++ emit_DEXTM(as, dest, dest, 0, 47); ++ else if (irt_isint(t)) ++ emit_AjD(as, SW64I_ADDWI, dest, 0, dest); ++ } ++ idx = asm_fuseahuref(as, ir->op1, &ofs, allow); ++ rset_clear(allow, idx); ++ if (irt_isnum(t)) { ++ asm_guard(as, SW64I_BEQ, RID_R28); ++ emit_ABD(as, SW64I_CMPULT, type, ra_allock(as, (int32_t)LJ_TISNUM, allow), RID_R28); ++ } else { ++ asm_guard(as, SW64I_BEQ, RID_R28); ++ emit_ABD(as, SW64I_CMPEQ, type, ra_allock(as, (int32_t)irt_toitype(t), allow), RID_R28); ++ } ++ if (ra_hasreg(dest)) { ++ if (irt_isnum(t)){ ++ emit_Fo(as, SW64I_FLDD, dest, idx, ofs); ++ dest = type; ++ } ++ } else { ++ dest = type; ++ } ++ emit_AjD(as, SW64I_SRAI, dest, 47, type); ++ emit_Ao(as, SW64I_LDL, dest, idx, ofs); ++} ++ ++static void asm_ahustore(ASMState *as, IRIns *ir) ++{ ++ RegSet allow = RSET_GPR; ++ Reg idx, src = RID_NONE, type = RID_NONE; ++ int32_t ofs = 0; ++ if (ir->r == RID_SINK) ++ return; ++ if (irt_isnum(ir->t)) { ++ src = ra_alloc1(as, ir->op2, RSET_FPR); ++ idx = asm_fuseahuref(as, ir->op1, &ofs, allow); ++ emit_Fo(as, SW64I_FSTD, src, idx, ofs); ++ } else { ++ Reg tmp = RID_TMP; ++ if (irt_ispri(ir->t)) { ++ tmp = ra_allock(as, ~((int64_t)~irt_toitype(ir->t) << 47), allow); ++ rset_clear(allow, tmp); ++ } else { ++ src = ra_alloc1(as, ir->op2, allow); ++ rset_clear(allow, src); ++ type = ra_allock(as, (int64_t)irt_toitype(ir->t) << 47, allow); ++ rset_clear(allow, type); ++ } ++ idx = asm_fuseahuref(as, ir->op1, &ofs, allow); ++ emit_Ao(as, SW64I_STL, tmp, idx, ofs); ++ if (ra_hasreg(src)) { ++ if (irt_isinteger(ir->t)) { ++ emit_ABD(as, SW64I_ADDL, tmp, type, tmp); ++ emit_AjD(as, SW64I_EXTLWI, src, 0, RID_TMP); ++ } else { ++ emit_ABD(as, SW64I_ADDL, src, type, tmp); ++ } ++ } ++ } ++} ++ ++static void asm_sload(ASMState *as, IRIns *ir) ++{ ++ Reg dest = RID_NONE, type = RID_NONE, base; ++ RegSet allow = RSET_GPR; ++ IRType1 t = ir->t; ++ int32_t ofs = 8*((int32_t)ir->op1-2); ++ 
++ lua_assert(!(ir->op2 & IRSLOAD_PARENT)); /* Handled by asm_head_side(). */ ++ lua_assert(irt_isguard(ir->t) || !(ir->op2 & IRSLOAD_TYPECHECK)); ++ ++ if ((ir->op2 & IRSLOAD_CONVERT) && irt_isguard(t) && irt_isint(t)) { ++ dest = ra_scratch(as, RSET_FPR); ++ asm_tointg(as, ir, dest); ++ t.irt = IRT_NUM; /* Continue with a regular number type check. */ ++ } else if (ra_used(ir)) { ++ lua_assert(irt_isnum(ir->t) || ++ irt_isint(ir->t) || irt_isaddr(ir->t)); ++ dest = ra_dest(as, ir, irt_isnum(t) ? RSET_FPR : allow); ++ rset_clear(allow, dest); ++ base = ra_alloc1(as, REF_BASE, allow); ++ rset_clear(allow, base); ++ if (ir->op2 & IRSLOAD_CONVERT) { ++ if (irt_isint(t)) { ++ Reg tmp = ra_scratch(as, RSET_FPR); ++ emit_FD(as, SW64I_FIMOVD, dest, tmp); ++ emit_GI(as, SW64I_FCVTDL_Z, tmp, tmp); ++ emit_GI(as, SW64I_FCVTLW, tmp, tmp); ++ dest = tmp; ++ t.irt = IRT_NUM; /* Check for original type. */ ++ } else { ++ Reg tmp = ra_scratch(as, RSET_GPR); ++ emit_GI(as, SW64I_FCVTLD, dest, dest); ++ emit_AI(as, SW64I_IFMOVD, tmp, dest); ++ dest = tmp; ++ t.irt = IRT_INT; /* Check for original type. */ ++ } ++ } ++ else if (irt_isaddr(t)) { ++ /* Clear type from pointers. */ ++ emit_DEXTM(as, dest, dest, 0, 47); ++ } else if (irt_isint(t) && (ir->op2 & IRSLOAD_TYPECHECK)) { ++ /* Sign-extend integers. */ ++ emit_AjD(as, SW64I_ADDWI, dest, 0, dest); ++ } ++ goto dotypecheck; ++ } ++ base = ra_alloc1(as, REF_BASE, allow); ++ rset_clear(allow, base); ++dotypecheck: ++ if ((ir->op2 & IRSLOAD_TYPECHECK)) { ++ type = dest < RID_MAX_GPR ? 
dest : RID_TMP; ++ if (irt_ispri(t)) { ++ Reg ktype = ra_allock(as, ~((int64_t)~irt_toitype(t) << 47), allow); ++ asm_guard(as, SW64I_BEQ, RID_R28); ++ emit_ABD(as, SW64I_CMPEQ, type, ktype, RID_R28); ++ } else { ++ if (irt_isnum(t)) { ++ Reg isnum = ra_allock(as, (int32_t)LJ_TISNUM, allow); ++ asm_guard(as, SW64I_BEQ, RID_R28); ++ emit_ABD(as, SW64I_CMPULT, RID_TMP, isnum, RID_R28); ++ if (ra_hasreg(dest)) ++ emit_Fo(as, SW64I_FLDD, dest, base, ofs); ++ } else { ++ Reg ktype2 = ra_allock(as, (int32_t)irt_toitype(t), allow); ++ asm_guard(as, SW64I_BEQ, RID_R28); ++ emit_ABD(as, SW64I_CMPEQ, RID_TMP, ktype2, RID_R28); ++ } ++ emit_AjD(as, SW64I_SRAI, type, 47, RID_TMP); ++ } ++ emit_Ao(as, SW64I_LDL, type, base, ofs); ++ } else if (ra_hasreg(dest)) { ++ if (irt_isnum(t)) ++ emit_Fo(as, SW64I_FLDD, dest, base, ofs); ++ else ++ emit_Ao(as, irt_isint(t) ? SW64I_LDW : SW64I_LDL, dest, base, ++ ofs ); ++ } ++} ++ ++/* -- Allocations --------------------------------------------------------- */ ++ ++#if LJ_HASFFI ++static void asm_cnew(ASMState *as, IRIns *ir) ++{ ++ CTState *cts = ctype_ctsG(J2G(as->J)); ++ CTypeID id = (CTypeID)IR(ir->op1)->i; ++ CTSize sz; ++ CTInfo info = lj_ctype_info(cts, id, &sz); ++ const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_mem_newgco]; ++ IRRef args[4]; ++ RegSet drop = RSET_SCRATCH; ++ lua_assert(sz != CTSIZE_INVALID || (ir->o == IR_CNEW && ir->op2 != REF_NIL)); ++ ++ as->gcsteps++; ++ if (ra_hasreg(ir->r)) ++ rset_clear(drop, ir->r); /* Dest reg handled below. */ ++ ra_evictset(as, drop); ++ if (ra_used(ir)) ++ ra_destreg(as, ir, RID_RET); /* GCcdata * */ ++ ++ /* Initialize immutable cdata object. */ ++ if (ir->o == IR_CNEWI) { ++ RegSet allow = (RSET_GPR & ~RSET_SCRATCH); ++ emit_Ao(as, sz == 8 ? SW64I_STL : SW64I_STW, ra_alloc1(as, ir->op2, allow), ++ RID_RET, sizeof(GCcdata)); ++ lua_assert(sz == 4 || sz == 8); ++ } else if (ir->op2 != REF_NIL) { /* Create VLA/VLS/aligned cdata. 
*/ ++ ci = &lj_ir_callinfo[IRCALL_lj_cdata_newv]; ++ args[0] = ASMREF_L; /* lua_State *L */ ++ args[1] = ir->op1; /* CTypeID id */ ++ args[2] = ir->op2; /* CTSize sz */ ++ args[3] = ASMREF_TMP1; /* CTSize align */ ++ asm_gencall(as, ci, args); ++ emit_loadi(as, ra_releasetmp(as, ASMREF_TMP1), (int32_t)ctype_align(info)); ++ return; ++ } ++ ++ /* Initialize gct and ctypeid. lj_mem_newgco() already sets marked. */ ++ emit_Ao(as, SW64I_STB, RID_R28, RID_RET, offsetof(GCcdata, gct)); ++ emit_Ao(as, SW64I_LDI, RID_R28, RID_ZERO, ~LJ_TCDATA); ++ ++ emit_Ao(as, SW64I_STH, RID_R28, RID_RET, offsetof(GCcdata, ctypeid)); ++ emit_Ao(as, SW64I_LDI, RID_R28, RID_ZERO, id); /* Lower 16 bit used. Sign-ext ok. */ ++ args[0] = ASMREF_L; /* lua_State *L */ ++ args[1] = ASMREF_TMP1; /* MSize size */ ++ asm_gencall(as, ci, args); ++ ra_allockreg(as, (int32_t)(sz+sizeof(GCcdata)), ++ ra_releasetmp(as, ASMREF_TMP1)); ++} ++#else ++#define asm_cnew(as, ir) ((void)0) ++#endif ++ ++/* -- Write barriers ------------------------------------------------------ */ ++ ++static void asm_tbar(ASMState *as, IRIns *ir) ++{ ++ Reg tab = ra_alloc1(as, ir->op1, RSET_GPR); ++ Reg mark = ra_scratch(as, rset_exclude(RSET_GPR, tab)); ++ Reg link = RID_TMP; ++ MCLabel l_end = emit_label(as); ++ emit_Ao(as, SW64I_AS, link, tab, (int32_t)offsetof(GCtab, gclist)); ++ emit_Ao(as, SW64I_STB, mark, tab, (int32_t)offsetof(GCtab, marked)); ++ emit_setgl(as, tab, gc.grayagain); ++ emit_getgl(as, link, gc.grayagain); ++ emit_branch(as, SW64I_BEQ, RID_TMP, l_end); ++ emit_ABD(as, SW64I_XOR, mark, RID_TMP, mark); /* Clear black bit. */ ++ emit_AjD(as, SW64I_ANDI, mark, LJ_GC_BLACK, RID_TMP); ++ emit_Ao(as, SW64I_LDBU, mark, tab, (int32_t)offsetof(GCtab, marked)); ++} ++ ++static void asm_obar(ASMState *as, IRIns *ir) ++{ ++ const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_gc_barrieruv]; ++ IRRef args[2]; ++ MCLabel l_end; ++ Reg obj, val, tmp; ++ /* No need for other object barriers (yet). 
*/ ++ lua_assert(IR(ir->op1)->o == IR_UREFC); ++ ra_evictset(as, RSET_SCRATCH); ++ l_end = emit_label(as); ++ args[0] = ASMREF_TMP1; /* global_State *g */ ++ args[1] = ir->op1; /* TValue *tv */ ++ asm_gencall(as, ci, args); ++ emit_Ao(as, SW64I_LDI, ra_releasetmp(as, ASMREF_TMP1), RID_JGL, -32768); ++ obj = IR(ir->op1)->r; ++ tmp = ra_scratch(as, rset_exclude(RSET_GPR, obj)); ++ ++ emit_AjD(as, SW64I_ANDI, tmp, LJ_GC_BLACK, tmp); ++ ++ emit_branch(as, SW64I_BEQ, RID_TMP, l_end); ++ emit_AjD(as, SW64I_ANDI, RID_TMP, LJ_GC_WHITES, RID_TMP); ++ ++ emit_branch(as, SW64I_BEQ, RID_TMP, l_end); ++ val = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, obj)); ++ emit_Ao(as, SW64I_LDBU, tmp, obj, ++ (int32_t)offsetof(GCupval, marked)-(int32_t)offsetof(GCupval, tv)); ++ emit_Ao(as, SW64I_LDBU, RID_TMP, val, (int32_t)offsetof(GChead, marked)); ++} ++ ++/* -- Arithmetic and logic operations ------------------------------------- */ ++ ++static void asm_fparith(ASMState *as, IRIns *ir, SW64Ins mi) ++{ ++ Reg dest = ra_dest(as, ir, RSET_FPR); ++ Reg right, left = ra_alloc2(as, ir, RSET_FPR); ++ right = (left >> 8); left &= 255; ++ emit_FGI(as, mi, left, right, dest); ++} ++ ++static void asm_fpunary(ASMState *as, IRIns *ir, SW64Ins mi) ++{ ++ Reg dest = ra_dest(as, ir, RSET_FPR); ++ Reg left = ra_hintalloc(as, ir->op1, dest, RSET_FPR); ++ emit_FGI(as, mi, RID_FZERO, left, dest); ++} ++ ++static void asm_fpmath(ASMState *as, IRIns *ir) ++{ ++ if (ir->op2 == IRFPM_EXP2 && asm_fpjoin_pow(as, ir)) ++ return; ++ asm_callid(as, ir, IRCALL_lj_vm_floor + ir->op2); ++} ++ ++#define asm_fpadd(as, ir) asm_fparith(as, ir, SW64I_FADDD) ++#define asm_fpsub(as, ir) asm_fparith(as, ir, SW64I_FSUBD) ++#define asm_fpmul(as, ir) asm_fparith(as, ir, SW64I_FMULD) ++ ++//TODO ++ ++static void asm_add(ASMState *as, IRIns *ir) ++{ ++ IRType1 t = ir->t; ++ if (irt_isnum(t)) { ++ asm_fpadd(as, ir); ++ } else { ++ Reg dest = ra_dest(as, ir, RSET_GPR); ++ Reg right, left = ra_hintalloc(as, ir->op1, dest, 
RSET_GPR); ++ if (irref_isk(ir->op2)) { ++ intptr_t k = get_kval(IR(ir->op2)); ++ if (checku8(k)) { ++ emit_AjD(as, (LJ_64 && irt_is64(t)) ? SW64I_ADDLI : SW64I_ADDWI, ++ left, k, dest); ++ return; ++ } ++ } ++ right = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, left)); ++ emit_ABD(as, (LJ_64 && irt_is64(t)) ? SW64I_ADDL : SW64I_ADDW, ++ left, right, dest); ++ } ++} ++ ++static void asm_sub(ASMState *as, IRIns *ir) ++{ ++ if (irt_isnum(ir->t)) { ++ asm_fpsub(as, ir); ++ } else { ++ Reg dest = ra_dest(as, ir, RSET_GPR); ++ Reg right, left = ra_alloc2(as, ir, RSET_GPR); ++ right = (left >> 8); left &= 255; ++ emit_ABD(as, irt_is64(ir->t) ? SW64I_SUBL : SW64I_SUBW, ++ left, right, dest); ++ } ++} ++ ++static void asm_mul(ASMState *as, IRIns *ir) ++{ ++ if (irt_isnum(ir->t)) { ++ asm_fpmul(as, ir); ++ } else { ++ Reg dest = ra_dest(as, ir, RSET_GPR); ++ Reg right, left = ra_alloc2(as, ir, RSET_GPR); ++ right = (left >> 8); left &= 255; ++ emit_ABD(as, irt_is64(ir->t) ? SW64I_MULL : SW64I_MULW, ++ left, right, dest); ++ } ++} ++ ++static void asm_mod(ASMState *as, IRIns *ir) ++{ ++ if (!irt_isint(ir->t)) ++ asm_callid(as, ir, irt_isi64(ir->t) ? IRCALL_lj_carith_modi64 : ++ IRCALL_lj_carith_modu64); ++ else ++ asm_callid(as, ir, IRCALL_lj_vm_modi); ++} ++ ++static void asm_pow(ASMState *as, IRIns *ir) ++{ ++ if (!irt_isnum(ir->t)) ++ asm_callid(as, ir, irt_isi64(ir->t) ? IRCALL_lj_carith_powi64 : ++ IRCALL_lj_carith_powu64); ++ else ++ asm_callid(as, ir, IRCALL_lj_vm_powi); ++} ++ ++static void asm_div(ASMState *as, IRIns *ir) ++{ ++ if (!irt_isnum(ir->t)) ++ asm_callid(as, ir, irt_isi64(ir->t) ? 
IRCALL_lj_carith_divi64 : ++ IRCALL_lj_carith_divu64); ++ else ++ asm_fparith(as, ir, SW64I_FDIVD); ++} ++ ++static void asm_neg(ASMState *as, IRIns *ir) ++{ ++ if (irt_isnum(ir->t)) { ++ Reg dest = ra_dest(as, ir, RSET_FPR); ++ Reg left = ra_hintalloc(as, ir->op1, dest, RSET_FPR); ++ emit_FGI(as, SW64I_FCPYSN, left, left, dest); ++ } else { ++ Reg dest = ra_dest(as, ir, RSET_GPR); ++ Reg left = ra_hintalloc(as, ir->op1, dest, RSET_GPR); ++ emit_ABD(as, (LJ_64 && irt_is64(ir->t)) ? SW64I_SUBL : SW64I_SUBW, ++ RID_ZERO, left, dest); ++ } ++} ++ ++#define asm_abs(as, ir) asm_fpunary(as, ir, SW64I_FABS) ++ ++#define asm_atan2(as, ir) asm_callid(as, ir, IRCALL_atan2) ++#define asm_ldexp(as, ir) asm_callid(as, ir, IRCALL_ldexp) ++ ++static void asm_arithov(ASMState *as, IRIns *ir) ++{ ++ Reg right, left, dest = ra_dest(as, ir, RSET_GPR); ++ lua_assert(!irt_is64(ir->t)); ++ if (irref_isk(ir->op2)) { ++ int k = IR(ir->op2)->i; ++ if (ir->o == IR_SUBOV) k = -k; ++ if (checki16(k)) { /* (dest < left) == (k >= 0 ? 1 : 0) */ ++ left = ra_alloc1(as, ir->op1, RSET_GPR); ++ asm_guard(as, k >= 0 ? SW64I_BNE : SW64I_BEQ, RID_R28); ++ emit_ABD(as, SW64I_CMPLT, dest, dest == left ? RID_TMP : left, RID_R28); ++ emit_Ao(as, SW64I_LDI, dest, left, k); ++ if (dest == left) emit_move(as, RID_TMP, left); ++ return; ++ } ++ } ++ left = ra_alloc2(as, ir, RSET_GPR); ++ right = (left >> 8); left &= 255; ++ ++ asm_guard(as, SW64I_BLT, RID_R28); ++ ++ emit_ABD(as, SW64I_AND, RID_TMP, RID_R28, RID_R28); ++ if (ir->o == IR_ADDOV) { /* ((dest^left) & (dest^right)) < 0 */ ++ emit_ABD(as, SW64I_XOR, dest, dest == right ? RID_TMP : right, RID_TMP); ++ } else { /* ((dest^left) & (dest^~right)) < 0 */ ++ emit_ABD(as, SW64I_XOR, RID_TMP, dest, RID_TMP); ++ emit_ABD(as, SW64I_EQV, dest == right ? RID_TMP : right, RID_ZERO, RID_TMP); ++ } ++ ++ emit_ABD(as, SW64I_XOR, dest, dest == left ? RID_TMP : left, RID_R28); ++ emit_ABD(as, ir->o == IR_ADDOV ? 
SW64I_ADDW : SW64I_SUBW, left, right, dest); ++ ++ if (dest == left || dest == right) ++ emit_move(as, RID_TMP, dest == left ? left : right); ++} ++ ++#define asm_addov(as, ir) asm_arithov(as, ir) ++#define asm_subov(as, ir) asm_arithov(as, ir) ++ ++static void asm_mulov(ASMState *as, IRIns *ir) ++{ ++ Reg dest = ra_dest(as, ir, RSET_GPR); ++ Reg right, left = ra_alloc2(as, ir, RSET_GPR); ++ right = (left >> 8); left &= 255; ++ ++ asm_guard(as, SW64I_BEQ, RID_R28); ++ emit_ABD(as, SW64I_CMPEQ, dest, RID_R28, RID_R28); ++ ++ emit_ABD(as, SW64I_MULW, left, right, dest); ++ emit_ABD(as, SW64I_MULL, left, right, RID_R28); ++} ++ ++static void asm_bnot(ASMState *as, IRIns *ir) ++{ ++ Reg left, right, dest = ra_dest(as, ir, RSET_GPR); ++ IRIns *irl = IR(ir->op1); ++ if (mayfuse(as, ir->op1) && irl->o == IR_BOR) { ++ left = ra_alloc2(as, irl, RSET_GPR); ++ right = (left >> 8); left &= 255; ++ } else { ++ left = RID_ZERO; ++ right = ra_hintalloc(as, ir->op1, dest, RSET_GPR); ++ } ++ emit_ABD(as, SW64I_ORNOT, left, right, dest); ++} ++ ++static void asm_bswap(ASMState *as, IRIns *ir) ++{ ++ Reg dest = ra_dest(as, ir, RSET_GPR); ++ Reg left = ra_alloc1(as, ir->op1, rset_exclude(RSET_GPR, dest)); ++ int is64 = irt_is64(ir->t); ++ int bit = is64 ? 64 : 32; ++ ++ if (is64) { ++ /* 8. extlb left, 7 */ ++ emit_ABD(as, SW64I_ADDL, RID_R28, dest, dest); ++ emit_AjD(as, SW64I_EXTLBI, left, 7, RID_R28); ++ ++ /* 7. extlb left, 6 */ ++ emit_ABD(as, SW64I_ADDL, RID_R28, dest, dest); ++ emit_AjD(as, SW64I_SLLI, RID_R28, bit-8*7, RID_R28); ++ emit_AjD(as, SW64I_EXTLBI, left, 6, RID_R28); ++ ++ /* 6. extlb left, 5 */ ++ emit_ABD(as, SW64I_ADDL, RID_R28, dest, dest); ++ emit_AjD(as, SW64I_SLLI, RID_R28, bit-8*6, RID_R28); ++ emit_AjD(as, SW64I_EXTLBI, left, 5, RID_R28); ++ ++ /* 5. extlb left, 4 */ ++ emit_ABD(as, SW64I_ADDL, RID_R28, dest, dest); ++ emit_AjD(as, SW64I_SLLI, RID_R28, bit-8*5, RID_R28); ++ emit_AjD(as, SW64I_EXTLBI, left, 4, RID_R28); ++ } ++ ++ /* 4. 
extlb left, 3, AT; addl AT, dest, dest */ ++ emit_ABD(as, SW64I_ADDL, RID_R28, dest, dest); ++ if (is64) emit_AjD(as, SW64I_SLLI, RID_R28, bit-8*4, RID_R28); ++ emit_AjD(as, SW64I_EXTLBI, left, 3, RID_R28); ++ ++ /* 3. extlb left, 2, AT; slli AT, 8, AT; addl AT, dest, dest */ ++ emit_ABD(as, SW64I_ADDL, RID_R28, dest, dest); ++ emit_AjD(as, SW64I_SLLI, RID_R28, bit-8*3, RID_R28); ++ emit_AjD(as, SW64I_EXTLBI, left, 2, RID_R28); ++ ++ /* 2. extlb left, 1, AT; slli AT, 16, AT; addl AT, dest, dest */ ++ emit_ABD(as, SW64I_ADDL, RID_R28, dest, dest); ++ emit_AjD(as, SW64I_SLLI, RID_R28, bit-8*2, RID_R28); ++ emit_AjD(as, SW64I_EXTLBI, left, 1, RID_R28); ++ ++ /* 1. extlb left, 0, AT; slli AT, 24, dest */ ++ emit_AjD(as, SW64I_SLLI, RID_R28, bit-8*1, dest); ++ emit_AjD(as, SW64I_EXTLBI, left, 0, RID_R28); ++} ++ ++static void asm_bitop(ASMState *as, IRIns *ir, SW64Ins mi, SW64Ins mik) ++{ ++ Reg dest = ra_dest(as, ir, RSET_GPR); ++ Reg right, left = ra_hintalloc(as, ir->op1, dest, RSET_GPR); ++ if (!irt_is64(ir->t)) { ++ emit_ABD(as, SW64I_ADDW, RID_ZERO, dest, dest); ++ } ++ ++ if (irref_isk(ir->op2)) { ++ intptr_t k = get_kval(IR(ir->op2)); ++ if (checki8(k)) { ++ emit_AjD(as, mik, left, k, dest); ++ return; ++ } ++ } ++ right = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, left)); ++ emit_ABD(as, mi, left, right, dest); ++} ++ ++#define asm_band(as, ir) asm_bitop(as, ir, SW64I_AND, SW64I_ANDI) ++#define asm_bor(as, ir) asm_bitop(as, ir, SW64I_BIS, SW64I_BISI) ++#define asm_bxor(as, ir) asm_bitop(as, ir, SW64I_XOR, SW64I_XORI) ++ ++static void asm_bitshift(ASMState *as, IRIns *ir, SW64Ins mi, SW64Ins mik) ++{ ++ Reg dest = ra_dest(as, ir, RSET_GPR); ++ int is64 = irt_is64(ir->t); ++ if (!is64) ++ emit_ABD(as, SW64I_ADDW, RID_ZERO, dest, dest); // truncated it to 32 bit ++ ++ if (irref_isk(ir->op2)) { /* Constant shifts. */ ++ uint32_t shift = (uint32_t)IR(ir->op2)->i; ++ Reg left = ra_hintalloc(as, ir->op1, dest, RSET_GPR); ++ emit_AjD(as, mik, is64 ? 
left : RID_R28, (shift & 63), dest); ++ if (!is64) { ++ if (mi != SW64I_SRAI && mi != SW64I_SRA) ++ emit_AjD(as, SW64I_EXTLWI, RID_R28, 0, RID_R28); ++ emit_ABD(as, SW64I_ADDW, RID_ZERO, left, RID_R28); // truncated it to 32 bit ++ } ++ } else { ++ Reg right, left = ra_alloc2(as, ir, RSET_GPR); ++ right = (left >> 8); left &= 255; ++ emit_ABD(as, mi, is64 ? left : RID_R28, right, dest); ++ if (!is64) { ++ if (mi != SW64I_SRAI && mi != SW64I_SRA) ++ emit_AjD(as, SW64I_EXTLWI, RID_R28, 0, RID_R28); ++ emit_ABD(as, SW64I_ADDW, RID_ZERO, left, RID_R28); // truncated it to 32 bit ++ } ++ } ++} ++ ++#define asm_bshl(as, ir) asm_bitshift(as, ir, SW64I_SLL, SW64I_SLLI) ++#define asm_bshr(as, ir) asm_bitshift(as, ir, SW64I_SRL, SW64I_SRLI) ++#define asm_bsar(as, ir) asm_bitshift(as, ir, SW64I_SRA, SW64I_SRAI) ++ ++static void asm_brotx(ASMState *as, IRIns *ir, int mode) ++{ ++ int is64 = irt_is64(ir->t); ++ Reg dest = ra_dest(as, ir, RSET_GPR); ++ if (irref_isk(ir->op2)) { /* Constant shifts. */ ++ uint32_t shift = (uint32_t)(IR(ir->op2)->i & 63); ++ Reg left = ra_hintalloc(as, ir->op1, dest, RSET_GPR); ++ if (is64) { ++ emit_rotx(as, left, shift, dest, RID_R28, mode); ++ } else { ++ emit_rotx32(as, left, shift, dest, RID_R28, mode); ++ } ++ } else { ++ Reg right, left = ra_alloc2(as, ir, RSET_GPR); ++ right = (left >> 8); left &= 255; ++ if (!is64) { ++ emit_ABD(as, SW64I_ADDW, dest, RID_ZERO, dest); ++ } ++ emit_ABD(as, SW64I_BIS, dest, RID_TMP, dest); ++ ++ if (mode == 1) { ++ emit_ABD(as, SW64I_SLL, is64 ? left: RID_R28, right, dest); ++ emit_ABD(as, SW64I_SRL, is64 ? left: RID_R28, RID_TMP, RID_TMP); ++ } else if (mode == 2){ ++ emit_ABD(as, SW64I_SRL, is64 ? left: RID_R28, right, dest); ++ emit_ABD(as, SW64I_SLL, is64 ? left: RID_R28, RID_TMP, RID_TMP); ++ } else { ++ lua_assert(0); ++ } ++ if (!is64) { ++ emit_AjD(as, SW64I_EXTLWI, left, 0, RID_R28); ++ } ++ emit_ABD(as, SW64I_SUBL, ra_allock(as, is64 ? 
64 : 32, RSET_GPR), right, RID_TMP); ++ } ++} ++#define asm_brol(as, ir) asm_brotx(as, ir, 1) ++#define asm_bror(as, ir) asm_brotx(as, ir, 2) ++ ++static void asm_min_max(ASMState *as, IRIns *ir, int ismax) ++{ ++ if (irt_isnum(ir->t)) { ++ Reg dest = ra_dest(as, ir, RSET_FPR); ++ Reg right, left = ra_alloc2(as, ir, RSET_FPR); ++ right = (left >> 8); left &= 255; ++ emit_FGHI(as, SW64I_FSELEQ, RID_F28, left, right, dest); ++ emit_FGI(as, SW64I_FCMPLT, ismax ? left: right, ismax ? right : left, ++ RID_F28); ++ } else { ++ Reg dest = ra_dest(as, ir, RSET_GPR); ++ Reg right, left = ra_alloc2(as, ir, RSET_GPR); ++ right = (left >> 8); left &= 255; ++ emit_ABCD(as, SW64I_SELEQ, RID_R28, left, right, dest); ++ emit_ABD(as, SW64I_CMPLT, ismax ? left : right, ismax ? right : left, ++ RID_R28); ++ } ++} ++ ++#define asm_min(as, ir) asm_min_max(as, ir, 0) ++#define asm_max(as, ir) asm_min_max(as, ir, 1) ++ ++/* -- Comparisons --------------------------------------------------------- */ ++ ++static void asm_comp(ASMState *as, IRIns *ir) ++{ ++ /* ORDER IR: LT GE LE GT ULT UGE ULE UGT. 
*/ ++ IROp op = ir->o; ++ if (irt_isnum(ir->t)) { ++ MCLabel l_true; ++ Reg right, left = ra_alloc2(as, ir, RSET_FPR); ++ right = (left >> 8); left &= 255; ++ l_true = emit_label(as); ++ switch (op) { ++ case IR_LT: ++ case IR_ULT: ++ asm_guard(as, SW64I_FBEQ, RID_F28); ++ emit_FGI(as, SW64I_FCMPLT, left, right, RID_F28); ++ break; ++ case IR_GE: ++ case IR_UGE: ++ asm_guard(as, SW64I_FBEQ, RID_F28); ++ emit_FGI(as, SW64I_FCMPLE, right, left, RID_F28); ++ break; ++ case IR_LE: ++ case IR_ULE: ++ asm_guard(as, SW64I_FBEQ, RID_F28); ++ emit_FGI(as, SW64I_FCMPLE, left, right, RID_F28); ++ break; ++ case IR_GT: ++ case IR_UGT: ++ asm_guard(as, SW64I_FBEQ, RID_F28); ++ emit_FGI(as, SW64I_FCMPLT, right, left, RID_F28); ++ break; ++ default: ++ lua_assert(!"not here"); ++ } ++ ++ if (op & 4) { ++ emit_branch(as, SW64I_FBNE, RID_F28, l_true); ++ } else { ++ asm_guard(as, SW64I_FBNE, RID_F28); ++ } ++ emit_FGI(as, SW64I_FCMPUN, left, right, RID_F28); ++ ++ } else { ++ Reg right, left = ra_alloc1(as, ir->op1, RSET_GPR); ++ if (op == IR_ABC) op = IR_UGT; ++ ++ if ((op&4) == 0 && irref_isk(ir->op2) && get_kval(IR(ir->op2)) == 0) { ++ SW64Ins mi = (op&2) ? ((op&1) ? SW64I_BLE : SW64I_BGT) : ++ ((op&1) ? SW64I_BLT : SW64I_BGE); ++ asm_guard(as, mi, left); ++ } else { ++ if (irref_isk(ir->op2)) { ++ intptr_t k = get_kval(IR(ir->op2)); ++ if ((op&2)) k++; ++ if (checki8(k)) { ++ asm_guard(as, (op&1) ? SW64I_BNE : SW64I_BEQ, RID_R28); ++ emit_AjD(as, (op&4) ? SW64I_CMPULTI : SW64I_CMPLTI, ++ left, k, RID_R28); ++ return; ++ } ++ } ++ right = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, left)); ++ asm_guard(as, ((op^(op>>1))&1) ? SW64I_BNE : SW64I_BEQ, RID_R28); ++ emit_ABD(as, (op&4) ? SW64I_CMPULT : SW64I_CMPLT, ++ (op&2) ? right : left, (op&2) ? left : right, RID_R28); ++ } ++ } ++} ++ ++ ++static void asm_equal(ASMState *as, IRIns *ir) ++{ ++ Reg right, left = ra_alloc2(as, ir, irt_isnum(ir->t) ? 
RSET_FPR : RSET_GPR); ++ int is_ne = ir->o & 1; ++ right = (left >> 8); left &= 255; ++ if (irt_isnum(ir->t)) { ++ MCLabel l_true = emit_label(as); ++ ++ if (irref_isk(ir->op2) && get_kval(IR(ir->op2)) == 0) { ++ asm_guard(as, is_ne ? SW64I_FBEQ : SW64I_FBNE, left); ++ return; ++ } ++ asm_guard(as, is_ne ? SW64I_FBNE : SW64I_FBEQ, RID_F28); ++ emit_FGI(as, SW64I_FCMPEQ, left, right, RID_F28); ++ ++ if (is_ne) { ++ emit_branch(as, SW64I_FBNE, RID_F28, l_true); ++ } else { ++ asm_guard(as, SW64I_FBNE, RID_F28); ++ } ++ emit_FGI(as, SW64I_FCMPUN, left, right, RID_F28); ++ ++ } else { ++ if (irref_isk(ir->op2) && get_kval(IR(ir->op2)) == 0) { ++ asm_guard(as, is_ne ? SW64I_BEQ: SW64I_BNE, left); ++ return; ++ } ++ asm_guard(as, is_ne ? SW64I_BNE : SW64I_BEQ, RID_R28); ++ emit_ABD(as, SW64I_CMPEQ, left, right, RID_R28); ++ } ++} ++ ++/* -- Support for 64 bit ops in 32 bit mode ------------------------------- */ ++ ++/* Hiword op of a split 64 bit op. Previous op must be the loword op. */ ++static void asm_hiop(ASMState *as, IRIns *ir) ++{ ++ UNUSED(as); UNUSED(ir); lua_assert(0); ++} ++ ++/* -- Profiling ----------------------------------------------------------- */ ++ ++static void asm_prof(ASMState *as, IRIns *ir) ++{ ++ UNUSED(ir); ++ asm_guard(as, SW64I_BNE, RID_R28); ++ emit_AjD(as, SW64I_ANDI, RID_R28, HOOK_PROFILE, RID_R28); ++ emit_lsglptr(as, SW64I_LDBU, RID_R28, ++ (int32_t)offsetof(global_State, hookmask)); ++} ++ ++/* -- Stack handling ------------------------------------------------------ */ ++ ++/* Check Lua stack size for overflow. Use exit handler as fallback. */ ++static void asm_stack_check(ASMState *as, BCReg topslot, ++ IRIns *irp, RegSet allow, ExitNo exitno) ++{ ++ /* Try to get an unused temp. register, otherwise spill/restore RID_RET*. */ ++ Reg tmp, pbase = irp ? (ra_hasreg(irp->r) ? irp->r : RID_TMP) : RID_BASE; ++ ExitNo oldsnap = as->snapno; ++ rset_clear(allow, pbase); ++ tmp = allow ? 
rset_pickbot(allow) : RID_RET; ++ as->snapno = exitno; ++ asm_guard(as, SW64I_BLE, RID_R28); ++ as->snapno = oldsnap; ++ if (allow == RSET_EMPTY) /* Restore temp. register. */ ++ emit_Ao(as, SW64I_AL, tmp, RID_SP, 0); ++ else ++ ra_modified(as, tmp); ++ emit_Ao(as, SW64I_LDI, RID_R28, RID_TMP, -(8*topslot)); ++ emit_ABD(as, SW64I_SUBL, tmp, pbase, RID_TMP); ++ emit_Ao(as, SW64I_AL, tmp, tmp, offsetof(lua_State, maxstack)); ++ if (pbase == RID_TMP) ++ emit_getgl(as, RID_TMP, jit_base); ++ emit_getgl(as, tmp, cur_L); ++ if (allow == RSET_EMPTY) /* Spill temp. register. */ ++ emit_Ao(as, SW64I_AS, tmp, RID_SP, 0); ++} ++ ++/* Restore Lua stack from on-trace state. */ ++static void asm_stack_restore(ASMState *as, SnapShot *snap) ++{ ++ WI_DEBUG_BEFORE(); ++ SnapEntry *map = &as->T->snapmap[snap->mapofs]; ++ SnapEntry *flinks = &as->T->snapmap[snap_nextofs(as->T, snap)-1-LJ_FR2]; ++ MSize n, nent = snap->nent; ++ /* Store the value of all modified slots to the Lua stack. */ ++ for (n = 0; n < nent; n++) { ++ SnapEntry sn = map[n]; ++ BCReg s = snap_slot(sn); ++ int32_t ofs = 8*((int32_t)s-1-LJ_FR2); ++ IRRef ref = snap_ref(sn); ++ IRIns *ir = IR(ref); ++ if ((sn & SNAP_NORESTORE)) ++ continue; ++ if (irt_isnum(ir->t)) { ++ Reg src = ra_alloc1(as, ref, RSET_FPR); ++ emit_Fo(as, SW64I_FSTD, src, RID_BASE, ofs); ++ } else { ++ asm_tvstore64(as, RID_BASE, ofs, ref); ++ } ++ checkmclim(as); ++ } ++ lua_assert(map + nent == flinks); ++ WI_DEBUG_END(); ++} ++ ++/* -- GC handling --------------------------------------------------------- */ ++ ++/* Check GC threshold and do one or more GC steps. */ ++static void asm_gc_check(ASMState *as) ++{ ++ WI_DEBUG_BEFORE(); ++ const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_gc_step_jit]; ++ IRRef args[2]; ++ MCLabel l_end; ++ Reg tmp; ++ ra_evictset(as, RSET_SCRATCH); ++ l_end = emit_label(as); ++ /* Exit trace if in GCSatomic or GCSfinalize. Avoids syncing GC objects. */ ++ /* Assumes asm_snap_prep() already done. 
*/ ++ asm_guard(as, SW64I_BNE, RID_RET); ++ ++ args[0] = ASMREF_TMP1; /* global_State *g */ ++ args[1] = ASMREF_TMP2; /* MSize steps */ ++ asm_gencall(as, ci, args); ++ emit_Ao(as, SW64I_LDI, ra_releasetmp(as, ASMREF_TMP1), RID_JGL, -32768); ++ tmp = ra_releasetmp(as, ASMREF_TMP2); ++ emit_loadi(as, tmp, as->gcsteps); ++ /* Jump around GC step if GC total < GC threshold. */ ++ emit_branch(as, SW64I_BNE, RID_R28, l_end); ++ emit_ABD(as, SW64I_CMPULT, RID_TMP, tmp, RID_R28); ++ ++ emit_getgl(as, tmp, gc.threshold); ++ emit_getgl(as, RID_TMP, gc.total); ++ as->gcsteps = 0; ++ checkmclim(as); ++ WI_DEBUG_END(); ++} ++ ++/* -- Loop handling ------------------------------------------------------- */ ++ ++/* Fixup the loop branch. */ ++static void asm_loop_fixup(ASMState *as) ++{ ++ WI_DEBUG_BEFORE(); ++ MCode *p = as->mctop; ++ MCode *target = as->mcp; ++ for (int i=1; iloopinv) { /* Inverted loop branch? */ ++ /* asm_guard already inverted the cond branch. Only patch the target. */ ++ p[-EXIT_ROOM] &= 0xffe00000u; ++ p[-EXIT_ROOM] |= ((uint32_t)(target-(p-EXIT_ROOM)-1) & 0x001fffffu); ++ } else { ++ __WI(p-EXIT_ROOM, SW64I_BR | SW64F_A(RID_ZERO) | SW64F_BRANCH(target - (p-EXIT_ROOM) - 1)); ++ } ++ WI_DEBUG_END(); ++} ++ ++/* -- Head of trace ------------------------------------------------------- */ ++ ++/* Coalesce BASE register for a root trace. */ ++static void asm_head_root_base(ASMState *as) ++{ ++ WI_DEBUG_BEFORE(); ++ IRIns *ir = IR(REF_BASE); ++ Reg r = ir->r; ++ if (as->loopinv) as->mctop--; ++ if (ra_hasreg(r)) { ++ ra_free(as, r); ++ if (rset_test(as->modset, r) || irt_ismarked(ir->t)) ++ ir->r = RID_INIT; /* No inheritance for modified BASE register. */ ++ if (r != RID_BASE) ++ emit_move(as, r, RID_BASE); ++ } ++ WI_DEBUG_END(); ++} ++ ++/* Coalesce BASE register for a side trace. 
*/ ++static RegSet asm_head_side_base(ASMState *as, IRIns *irp, RegSet allow) ++{ ++ WI_DEBUG_BEFORE(); ++ IRIns *ir = IR(REF_BASE); ++ Reg r = ir->r; ++ if (as->loopinv) as->mctop--; ++ if (ra_hasreg(r)) { ++ ra_free(as, r); ++ if (rset_test(as->modset, r) || irt_ismarked(ir->t)) ++ ir->r = RID_INIT; /* No inheritance for modified BASE register. */ ++ if (irp->r == r) { ++ rset_clear(allow, r); /* Mark same BASE register as coalesced. */ ++ } else if (ra_hasreg(irp->r) && rset_test(as->freeset, irp->r)) { ++ rset_clear(allow, irp->r); ++ emit_move(as, r, irp->r); /* Move from coalesced parent reg. */ ++ } else { ++ emit_getgl(as, r, jit_base); /* Otherwise reload BASE. */ ++ } ++ } ++ WI_DEBUG_END(); ++ return allow; ++} ++ ++/* -- Tail of trace ------------------------------------------------------- */ ++ ++/* Fixup the tail code. */ ++static void asm_tail_fixup(ASMState *as, TraceNo lnk) ++{ ++ WI_DEBUG_BEFORE(); ++ MCode *p = as->mctop-1; ++ MCode *target; ++ int32_t spadj = as->T->spadjust; ++ ++ /* Patch exit branch. */ ++ target = lnk ? traceref(as->J, lnk)->mcode : (MCode *)lj_vm_exit_interp; ++ ++ for (int i=0; imctop - EXIT_ROOM; /* Leave room for exit branch. */ ++ if (as->loopref) { ++ as->invmcp = as->mcp = p; ++ } else { ++ as->mcp = p-1; /* Leave room for stack pointer adjustment. */ ++ as->invmcp = NULL; ++ } ++} ++ ++/* -- Trace setup --------------------------------------------------------- */ ++ ++/* Ensure there are enough stack slots for call arguments. */ ++static Reg asm_setup_call_slots(ASMState *as, IRIns *ir, const CCallInfo *ci) ++{ ++ IRRef args[CCI_NARGS_MAX*2]; ++ uint32_t i, nargs = CCI_XNARGS(ci); ++ int nslots = 0, ngpr = REGARG_NUMGPR; ++ asm_collectargs(as, ir, ci, args); ++ for (i = 0; i < nargs; i++) { ++ if (ngpr > 0) ngpr--; else nslots += 2; ++ } ++ if (nslots > as->evenspill) /* Leave room for args in stack slots. */ ++ as->evenspill = nslots; ++ return irt_isfp(ir->t) ? 
REGSP_HINT(RID_FPRET) : REGSP_HINT(RID_RET); ++} ++ ++static void asm_setup_target(ASMState *as) ++{ ++ asm_sparejump_setup(as); ++ asm_exitstub_setup(as); ++} ++ ++/* -- Trace patching ------------------------------------------------------ */ ++ ++int is_branch_op(MCode ins) ++{ ++ switch(ins & 0xfc000000) { ++ case SW64I_BEQ: case SW64I_BNE: case SW64I_BLT: ++ case SW64I_BLE: case SW64I_BGT: case SW64I_BGE: ++ return 1; ++ case SW64I_FBEQ: ++ case SW64I_FBGE: ++ case SW64I_FBGT: ++ case SW64I_FBLE: ++ case SW64I_FBLT: ++ case SW64I_FBNE: ++ return 1; ++ } ++ return 0; ++} ++ ++/* Patch exit jumps of existing machine code to a new target. */ ++void lj_asm_patchexit(jit_State *J, GCtrace *T, ExitNo exitno, MCode *target) ++{ ++ WI_DEBUG_BEFORE(); ++ MCode *p = T->mcode; ++ MCode *pe = (MCode *)((char *)p + T->szmcode); ++ MCode *px = exitstub_trace_addr(T, exitno); ++ MCode *cstart = NULL, *cstop = NULL; ++ MCode *mcarea = lj_mcode_patch(J, p, 0); ++#if SW64_DEBUG_WI ++ MCode exitload = SW64I_LDI | SW64F_A(RID_TMP) | SW64F_DISP(exitno, RID_TMP); ++ printf("try patching traceno:%d exitno:%d target:%p px:%p\n", ++ T->traceno, exitno, target, px); ++#else ++ MCode exitload = SW64I_LDI | SW64F_A(RID_TMP) | SW64F_DISP(exitno, RID_TMP); ++#endif ++ ++ for (p++; p < pe; p++) { ++ /* Look for load of exit number. */ ++ if (*p != exitload) { ++ continue; ++ } ++ ++ /* Look for exitstub branch. Yes, this covers all used branch variants. */ ++ if (is_branch_op(p[1]) ++ && (SW64F_BRANCH(p[1]) == SW64F_BRANCH((px - (p+1) - 1)))) { ++ ptrdiff_t delta = target - (p+1) - 1; ++ if (IS_SW64F_BRANCH_VALID(delta)) { /* Patch in-range branch. */ ++#if SW64_DEBUG_WI ++ printf("p1atch at %p\n", p+2); ++#endif ++ __WI_REPLACE(p+1, ++ (p[1] & (~0x1fffff)) | SW64F_BRANCH(delta), ++ p[1]); ++ patchbranch: ++ cstop = p+1; ++ if (!cstart) cstart = p; ++ } else { /* Branch out of range. Use spare jump slot in mcarea. 
*/ ++ TODO; ++ } ++ } else if (p[2] == SW64I_NOP) { ++#if SW64_DEBUG_WI ++ printf("p2atch at %p\n", p+2); ++#endif ++ ptrdiff_t delta = target - (p+2) - 1; ++ __WI_REPLACE(p+2, SW64I_BR | SW64F_A(RID_ZERO) | SW64F_BRANCH(delta), SW64I_NOP); ++ goto patchbranch; ++ } ++ } ++ if (cstart) lj_mcode_sync(cstart, cstop); ++ lj_mcode_patch(J, mcarea, 1); ++ WI_DEBUG_END(); ++} ++ ++#undef TODO +diff --git a/src/lj_ccall.c b/src/lj_ccall.c +index 5c252e5..19e5a06 100644 +--- a/src/lj_ccall.c ++++ b/src/lj_ccall.c +@@ -562,6 +562,66 @@ + goto done; \ + } + ++#elif LJ_TARGET_SW64 ++/* -- SW64 calling conventions -------------------------------------------- */ ++ ++#define CCALL_HANDLE_STRUCTRET \ ++ cc->retref = 1; /* Return all structs by reference. */ \ ++ cc->gpr[ngpr++] = (GPRArg)dp; ++ ++#define CCALL_HANDLE_COMPLEXRET \ ++ /* Complex values are returned in 2 FPRs. */ \ ++ cc->retref = 0; ++ ++#define CCALL_HANDLE_COMPLEXRET2 \ ++ if (ctr->size == 2*sizeof(float)) { /* Copy complex float from FPRs. */ \ ++ ((float *)dp)[0] = (float)cc->fpr[0].d; \ ++ ((float *)dp)[1] = (float)cc->fpr[1].d; \ ++ } else { /* Copy complex double from FPRs. */ \ ++ ((double *)dp)[0] = cc->fpr[0].d; \ ++ ((double *)dp)[1] = cc->fpr[1].d; \ ++ } ++ ++#define CCALL_HANDLE_STRUCTARG \ ++ if (!(sz <= 8*6)) { \ ++ rp = cdataptr(lj_cdata_new(cts, did, sz)); \ ++ sz = CTSIZE_PTR; /* Pass all other structs by reference. */ \ ++ } ++ ++#define CCALL_HANDLE_COMPLEXARG \ ++ if (sz == 2*sizeof(float)) { \ ++ isfp = 2; \ ++ if (ngpr < maxgpr) \ ++ sz *= 2; \ ++ } ++ ++#define CCALL_HANDLE_REGARG \ ++ { /* Try to pass argument in GPRs. */ \ ++ cc->reg_is_word[ngpr] = (d->size == 4); \ ++ if (ctype_iscomplex(d->info)) { \ ++ cc->reg_is_word[ngpr] = d->size == 8; \ ++ cc->reg_is_word[ngpr+1] = d->size==8; \ ++ } \ ++ if (n > 1) { \ ++ lua_assert(n == 2 || n == 4); /* int64_t or complex (float). 
*/ \ ++ } \ ++ if (ngpr < maxgpr) { \ ++ dp = &cc->gpr[ngpr]; \ ++ if (ngpr + n > maxgpr) { \ ++ nsp += ngpr + n - maxgpr; /* Assumes contiguous gpr/stack fields. */ \ ++ if (nsp > CCALL_MAXSTACK) goto err_nyi; /* Too many arguments. */ \ ++ ngpr = maxgpr; \ ++ } else { \ ++ ngpr += n; \ ++ } \ ++ goto done; \ ++ } \ ++ } ++ ++#define CCALL_HANDLE_RET \ ++ if (ctype_isfp(ctr->info) && ctr->size == sizeof(float)) \ ++ ctr = ctype_get(cts, CTID_DOUBLE); /* FPRs always hold doubles. */ ++ + #else + #error "Missing calling convention definitions for this architecture" + #endif +@@ -921,6 +981,9 @@ static int ccall_set_args(lua_State *L, CTState *cts, CType *ct, + + /* Clear unused regs to get some determinism in case of misdeclaration. */ + memset(cc->gpr, 0, sizeof(cc->gpr)); ++#if LJ_TARGET_SW64 ++ memset(cc->reg_is_word, 0, sizeof(cc->reg_is_word)); ++#endif + #if CCALL_NUM_FPR + memset(cc->fpr, 0, sizeof(cc->fpr)); + #endif +@@ -1044,7 +1107,7 @@ static int ccall_set_args(lua_State *L, CTState *cts, CType *ct, + if (isfp && d->size == sizeof(float)) + ((float *)dp)[1] = ((float *)dp)[0]; /* Floats occupy high slot. */ + #endif +-#if LJ_TARGET_MIPS64 || (LJ_TARGET_ARM64 && LJ_BE) ++#if LJ_TARGET_MIPS64 || (LJ_TARGET_ARM64 && LJ_BE) || LJ_TARGET_SW64 + if ((ctype_isinteger_or_bool(d->info) || ctype_isenum(d->info) + #if LJ_TARGET_MIPS64 + || (isfp && nsp == 0) +@@ -1068,7 +1131,7 @@ static int ccall_set_args(lua_State *L, CTState *cts, CType *ct, + cc->fpr[nfpr-1].d[0] = cc->fpr[nfpr-2].d[1]; /* Split complex double. */ + cc->fpr[nfpr-2].d[1] = 0; + } +-#elif LJ_TARGET_ARM64 || (LJ_TARGET_MIPS64 && !LJ_ABI_SOFTFP) ++#elif LJ_TARGET_ARM64 || (LJ_TARGET_MIPS64 && !LJ_ABI_SOFTFP) || LJ_TARGET_SW64 + if (isfp == 2 && (uint8_t *)dp < (uint8_t *)cc->stack) { + /* Split float HFA or complex float into separate registers. 
*/ + CTSize i = (sz >> 2) - 1; +diff --git a/src/lj_ccall.h b/src/lj_ccall.h +index 59f6648..a3cb613 100644 +--- a/src/lj_ccall.h ++++ b/src/lj_ccall.h +@@ -126,6 +126,22 @@ typedef union FPRArg { + struct { LJ_ENDIAN_LOHI(float f; , float g;) }; + } FPRArg; + ++#elif LJ_TARGET_SW64 ++ ++#define CCALL_NARG_GPR 6 ++#define CCALL_NARG_FPR 0 /* FP args are positional and overlay the GPR array. */ ++#define CCALL_NRET_GPR 1 ++#define CCALL_NRET_FPR 2 ++ ++#define CCALL_SPS_EXTRA 3 ++#define CCALL_SPS_FREE 1 ++ ++typedef intptr_t GPRArg; ++typedef union FPRArg { ++ double d; ++ struct { float f; float g; }; ++} FPRArg; ++ + #else + #error "Missing calling convention definitions for this architecture" + #endif +@@ -174,6 +190,9 @@ typedef LJ_ALIGN(CCALL_ALIGN_CALLSTATE) struct CCallState { + #if LJ_32 + int32_t align1; + #endif ++#if LJ_TARGET_SW64 ++ uint8_t reg_is_word[CCALL_NUM_GPR]; ++#endif + #if CCALL_NUM_FPR + FPRArg fpr[CCALL_NUM_FPR]; /* Arguments/results in FPRs. */ + #endif +diff --git a/src/lj_ccallback.c b/src/lj_ccallback.c +index 846827b..434c649 100644 +--- a/src/lj_ccallback.c ++++ b/src/lj_ccallback.c +@@ -71,6 +71,10 @@ static MSize CALLBACK_OFS2SLOT(MSize ofs) + + #define CALLBACK_MCODE_HEAD 52 + ++#elif LJ_TARGET_SW64 ++ ++#define CALLBACK_MCODE_HEAD 4*13 ++ + #else + + /* Missing support for this architecture. 
*/ +@@ -238,6 +242,87 @@ static void callback_mcode_init(global_State *g, uint32_t *page) + } + lua_assert(p - page <= CALLBACK_MCODE_SIZE); + } ++ ++#elif LJ_TARGET_SW64 ++ ++static void split32Addr(uint32_t addr, int16_t* hi, int16_t* lo) ++{ ++ *hi = (int16_t)(addr >> 16); ++ *lo = (int16_t)(addr & 0xffff); ++ if (*lo < 0) { ++ *hi = *hi + 1; ++ *lo = (int16_t)(addr - ((int32_t)(*hi) << 16)); ++ } ++} ++static void split64AddrHI32(uint64_t addr, int16_t*hi, int16_t*lo) ++{ ++ split32Addr((uint32_t)(addr >> 32), hi, lo); ++} ++static void split64AddrLO32(uint64_t addr, int16_t*hi, int16_t*lo) ++{ ++ split32Addr((uint32_t)(addr & 0xffffffff), hi, lo); ++} ++ ++static void callback_mcode_init(global_State *g, uint32_t *page) ++{ ++ uint32_t *p = page; ++ void *target = (void *)lj_vm_ffi_callback; ++ int16_t hi, lo; ++ ++ MSize slot; ++ { ++ split64AddrHI32((uint64_t)target, &hi, &lo); ++ // ldih PV, h32_hi(zero) ++ *p++ = SW64I_LDIH | SW64F_A(RID_CFUNCADDR)| SW64F_DISP(hi, RID_ZERO); ++ // ldi PV, h32_lo(PV) ++ *p++ = SW64I_LDI | SW64F_A(RID_CFUNCADDR)| SW64F_DISP(lo, RID_CFUNCADDR); ++ // slli PV, 32, PV ++ *p++ = SW64I_SLLI | SW64F_A(RID_CFUNCADDR)| SW64F_IMM(32) | SW64F_D(RID_CFUNCADDR); ++ ++ split64AddrLO32((uint64_t)target, &hi, &lo); ++ // ldih at, lo32_hi(zero) ++ *p++ = SW64I_LDIH | SW64F_A(RID_R28)| SW64F_DISP(hi, RID_ZERO); ++ // ldi at, lo32_lo(at) ++ *p++ = SW64I_LDI | SW64F_A(RID_R28)| SW64F_DISP(lo, RID_R28); ++ // addl PV, at, PV ++ *p++ = SW64I_ADDL | SW64F_A(RID_CFUNCADDR)| SW64F_B(RID_R28) | SW64F_D(RID_CFUNCADDR); ++ } ++ ++ { ++ split64AddrHI32((uint64_t)g, &hi, &lo); ++ // ldih r2, h32_hi(zero) ++ *p++ = SW64I_LDIH | SW64F_A(RID_R2) | SW64F_DISP(hi, RID_ZERO); ++ // ldi r2, h32_lo(r2) ++ *p++ = SW64I_LDI | SW64F_A(RID_R2) | SW64F_DISP(lo, RID_R2); ++ // slli r2, 32, r2 ++ *p++ = SW64I_SLLI | SW64F_A(RID_R2)| SW64F_IMM(32) | SW64F_D(RID_R2); ++ ++ split64AddrLO32((uint64_t)g, &hi, &lo); ++ // ldih at, lo32_hi(zero) ++ *p++ = SW64I_LDIH | 
SW64F_A(RID_R28)| SW64F_DISP(hi, RID_ZERO); ++ // ldi at, lo32_lo(at) ++ *p++ = SW64I_LDI | SW64F_A(RID_R28)| SW64F_DISP(lo, RID_R28); ++ // addl r2, at, r2 ++ *p++ = SW64I_ADDL | SW64F_A(RID_R2)| SW64F_B(RID_R28) | SW64F_D(RID_R2); ++ } ++ ++ // call zero, 0(PV) ++ *p++ = SW64I_CALL | SW64F_A(RID_ZERO) | SW64F_DISP(0, RID_CFUNCADDR); ++ ++ //lua_assert((p - page) * 4 <= CALLBACK_MCODE_HEAD); ++ ++ for (slot = 0; slot < CALLBACK_MAX_SLOT; slot++) { ++ // ldi r1, slot(zero) ++ *p++ = SW64I_LDI | SW64F_A(RID_R1) | SW64F_DISP(slot, RID_ZERO); ++ ++ // br zero, (page-p-2) ++ *p = SW64I_BR | SW64F_A(RID_ZERO) | SW64F_BRANCH((page-p-2)+1); ++ p++; ++ } ++ ++ lua_assert(p - page <= CALLBACK_MCODE_SIZE); ++} ++ + #else + /* Missing support for this architecture. */ + #define callback_mcode_init(g, p) UNUSED(p) +@@ -495,6 +580,33 @@ void lj_ccallback_mcode_free(CTState *cts) + if (ctype_isfp(ctr->info) && ctr->size == sizeof(float)) \ + ((float *)dp)[1] = *(float *)dp; + ++#elif LJ_TARGET_SW64 ++ ++#define CALLBACK_HANDLE_REGARG \ ++ if (ngpr + n <= maxgpr) { \ ++ if (isfp) { \ ++ FPRCBArg *reg = &(cts->cb.fpr[ngpr]); \ ++ if (cta->size == 4) { \ ++ reg->f[0] = (float)reg->d; \ ++ }\ ++ sp = reg; \ ++ } else {\ ++ intptr_t *reg = &(cts->cb.gpr[ngpr]); \ ++ if (cta->size == 4) { \ ++ *reg = *(int*)reg; \ ++ } \ ++ sp = reg; \ ++ } \ ++ ngpr += n; \ ++ goto done; \ ++ } ++ ++#define CALLBACK_HANDLE_RET \ ++ if (ctype_isfp(ctr->info) && ctr->size == sizeof(float)) \ ++ *(double *)dp = *(float *)dp; /* FPRs always hold doubles. */ \ ++ if (ctype_isinteger(ctr->info) && ctr->size == 4) \ ++ *((int64_t *)dp) = *(int *)dp; ++ + #else + #error "Missing calling convention definitions for this architecture" + #endif +diff --git a/src/lj_crecord.c b/src/lj_crecord.c +index e32ae23..00de245 100644 +--- a/src/lj_crecord.c ++++ b/src/lj_crecord.c +@@ -130,7 +130,7 @@ static IRType crec_ct2irt(CTState *cts, CType *ct) + /* Number of windowed registers used for optimized memory copy. 
*/ + #if LJ_TARGET_X86 + #define CREC_COPY_REGWIN 2 +-#elif LJ_TARGET_PPC || LJ_TARGET_MIPS ++#elif LJ_TARGET_PPC || LJ_TARGET_MIPS || LJ_TARGET_SW64 + #define CREC_COPY_REGWIN 8 + #else + #define CREC_COPY_REGWIN 4 +diff --git a/src/lj_dispatch.c b/src/lj_dispatch.c +index 5d6795f..d38b860 100644 +--- a/src/lj_dispatch.c ++++ b/src/lj_dispatch.c +@@ -38,7 +38,7 @@ LJ_STATIC_ASSERT(GG_NUM_ASMFF == FF_NUM_ASMFUNC); + + /* -- Dispatch table management ------------------------------------------- */ + +-#if LJ_TARGET_MIPS ++#if LJ_TARGET_MIPS || LJ_TARGET_SW64 + #include + LJ_FUNCA_NORET void LJ_FASTCALL lj_ffh_coroutine_wrap_err(lua_State *L, + lua_State *co); +@@ -74,7 +74,7 @@ void lj_dispatch_init(GG_State *GG) + GG->g.bc_cfunc_ext = GG->g.bc_cfunc_int = BCINS_AD(BC_FUNCC, LUA_MINSTACK, 0); + for (i = 0; i < GG_NUM_ASMFF; i++) + GG->bcff[i] = BCINS_AD(BC__MAX+i, 0, 0); +-#if LJ_TARGET_MIPS ++#if LJ_TARGET_MIPS || LJ_TARGET_SW64 + memcpy(GG->got, dispatch_got, LJ_GOT__MAX*sizeof(ASMFunction *)); + #endif + } +diff --git a/src/lj_dispatch.h b/src/lj_dispatch.h +index 5bda51a..9b8559e 100644 +--- a/src/lj_dispatch.h ++++ b/src/lj_dispatch.h +@@ -12,7 +12,7 @@ + #include "lj_jit.h" + #endif + +-#if LJ_TARGET_MIPS ++#if LJ_TARGET_MIPS || LJ_TARGET_SW64 + /* Need our own global offset table for the dreaded MIPS calling conventions. */ + + #ifndef _LJ_VM_H +@@ -89,7 +89,7 @@ typedef uint16_t HotCount; + typedef struct GG_State { + lua_State L; /* Main thread. */ + global_State g; /* Global state. */ +-#if LJ_TARGET_MIPS ++#if LJ_TARGET_MIPS || LJ_TARGET_SW64 + ASMFunction got[LJ_GOT__MAX]; /* Global offset table. */ + #endif + #if LJ_HASJIT +diff --git a/src/lj_emit_sw64.h b/src/lj_emit_sw64.h +new file mode 100644 +index 0000000..dfd485f +--- /dev/null ++++ b/src/lj_emit_sw64.h +@@ -0,0 +1,454 @@ ++/* ++** SW64 instruction emitter. ++** Copyright (C) 2019 deepin inc. 
See Copyright Notice in luajit.h ++*/ ++ ++#include ++#define TODO do {printf("\e[1;34mTODO IMPLEMENT %s\e[m\n", __FUNCTION__); asm("bpt;bpt");} while(0); ++ ++#if SW64_DEBUG_WI ++#include ++const char* disass_ins(int ins, void* addr) { ++ static char cmd[1024]; ++ sprintf(cmd, "./luajit -e 'require(\"jit.dis_sw64\"):wi_debug(%d, %p)'", ins, addr); ++ FILE* out = popen(cmd, "r"); ++ memset(cmd, 0, sizeof(cmd)); ++ fread(cmd, sizeof(cmd), 1, out); ++ pclose(out); ++ for (int i=0; icurins; \ ++ if (*x != origin) { \ ++ printf("BUG: 1 overwrite[%p](0x%x) to 0x%lx on %s !\n", \ ++ x, (int)origin, (int)ins, __FUNCTION__); \ ++ asm("bpt;bpt"); \ ++ } else { \ ++ printf("\tIR' %d --> %p #%s\n", ir-REF_BASE, x, disass_ins(ins, x)); \ ++ } \ ++ *x = ins; \ ++ } while(0) ++ ++#define __WI(addr, ins) \ ++ do { \ ++ MCode* x = addr; \ ++ IRRef ir = -1; \ ++ if (as!=0) ir = as->curins; \ ++ if (*x != 0 && *x != 0x43ff075f) { \ ++ printf("BUG: 2 overwrite[%p](0x%x)(%s) to 0x%lx(%s) on %s! IR:0x%x\n", \ ++ x, *x, disass_ins(*x, x), (unsigned long)ins, disass_ins(ins, x), __FUNCTION__, \ ++ ir); \ ++ asm("bpt;bpt"); \ ++ } else { \ ++ printf("\tIR %d --> %p #%s\n", ir-REF_BASE, x, disass_ins(ins, x)); \ ++ } \ ++ *x = ins; \ ++ } while(0) ++#define WI_DEBUG_BEFORE() printf("before %s\n", __FUNCTION__); ++#define WI_DEBUG_END() printf("end %s\n", __FUNCTION__); ++#else ++#define __WI(addr, ins) *(addr) = ins; ++#define __WI_REPLACE(addr, ins, origin) *(addr) = ins; ++#define WI_DEBUG_BEFORE() ++#define WI_DEBUG_END() ++#endif ++ ++#if LJ_64 ++static intptr_t get_k64val(IRIns *ir) ++{ ++ if (ir->o == IR_KINT64) { ++ return (intptr_t)ir_kint64(ir)->u64; ++ } else if (ir->o == IR_KGC) { ++ return (intptr_t)ir_kgc(ir); ++ } else if (ir->o == IR_KPTR || ir->o == IR_KKPTR) { ++ return (intptr_t)ir_kptr(ir); ++ } else if (ir->o == IR_KNUM) { ++ return (intptr_t)ir_knum(ir)->u64; ++ } else { ++ lua_assert(ir->o == IR_KINT || ir->o == IR_KNULL); ++ return ir->i; /* Sign-extended. 
*/ ++ } ++} ++#endif ++ ++#if LJ_64 ++#define get_kval(ir) get_k64val(ir) ++#else ++#define get_kval(ir) ((ir)->i) ++#endif ++ ++ ++inline static void split32Addr(uint32_t addr, int16_t* hi, int16_t* lo) ++{ ++ *hi = (int16_t)(addr >> 16); ++ *lo = (int16_t)(addr & 0xffff); ++ if (*lo < 0) { ++ *hi = *hi + 1; ++ *lo = (int16_t)(addr - ((int32_t)(*hi) << 16)); ++ } ++} ++inline static void split64AddrHI32(uint64_t addr, int16_t*hi, int16_t*lo) ++{ ++ int32_t addr_hi = (int32_t)(addr >> 32); ++ int32_t addr_lo = (addr & 0xffffffff); ++ if (addr_lo < 0) { ++ addr_hi++; ++ } ++ split32Addr((uint32_t)addr_hi, hi, lo); ++} ++inline static void split64AddrLO32(uint64_t addr, int16_t*hi, int16_t*lo) ++{ ++ int32_t addr_hi = addr >> 32; ++ int32_t addr_lo = addr & 0xffffffff; ++ if (addr_lo < 0) { ++ addr_hi++; ++ addr_lo = addr - ((int64_t)addr_hi << 32); ++ } ++ split32Addr((uint32_t)(addr_lo), hi, lo); ++} ++ ++/* -- Emit basic instructions --------------------------------------------- */ ++ ++static void emit_AjD(ASMState *as, SW64Ins mi, Reg a, uint8_t i, Reg d) ++{ ++ lua_assert(a <=31 && a >= 0); ++ lua_assert(d <= 31 && d >= 0); ++ __WI(--as->mcp, mi | SW64F_A(a) | SW64F_j(i) | SW64F_D(d)); ++} ++ ++static void emit_Ao(ASMState *as, SW64Ins mi, Reg a, Reg b, int32_t disp) ++{ ++ lua_assert(b <= 31 && b >= 0); ++ lua_assert(((int16_t)disp) == disp); ++ __WI(--as->mcp, mi | SW64F_A(a) | SW64F_DISP(disp, b)); ++} ++ ++static void emit_ABD(ASMState *as, SW64Ins mi, Reg a, Reg b, Reg d) ++{ ++ lua_assert(a <=31 && a >= 0); ++ lua_assert(b <=31 && b >= 0); ++ lua_assert(d <=31 && d >= 0); ++ __WI(--as->mcp, mi | SW64F_A(a) | SW64F_B(b) | SW64F_D(d)); ++} ++ ++static void emit_ABCD(ASMState *as, SW64Ins mi, Reg a, Reg b, Reg c, Reg d) ++{ ++ lua_assert(a <=31 && a >= 0); ++ lua_assert(b <=31 && b >= 0); ++ lua_assert(c <=31 && c >= 0); ++ lua_assert(d <=31 && d >= 0); ++ __WI(--as->mcp, mi | SW64F_A(a) | SW64F_B(b) | SW64F_C(c) | SW64F_D(d)); ++} ++ ++static void 
emit_FGI(ASMState *as, SW64Ins mi, Reg f, Reg g, Reg i) ++{ ++ lua_assert(f >= 32 && f <= 63); ++ lua_assert(g >= 32 && g <= 63); ++ lua_assert(i >= 32 && i <= 63); ++ __WI(--as->mcp, mi | SW64F_F(f) | SW64F_G(g) | SW64F_I(i)); ++} ++static void emit_FGHI(ASMState *as, SW64Ins mi, Reg f, Reg g, Reg h, Reg i) ++{ ++ lua_assert(f >= 32 && f <= 63); ++ lua_assert(g >= 32 && g <= 63); ++ lua_assert(h >= 32 && h <= 63); ++ lua_assert(i >= 32 && i <= 63); ++ __WI(--as->mcp, mi | SW64F_F(f) | SW64F_G(g) | SW64F_H(h) | SW64F_I(i)); ++} ++static void emit_Fo(ASMState *as, SW64Ins mi, Reg f, Reg b, int16_t disp) ++{ ++ lua_assert(f >= 32 && f <= 63); ++ lua_assert(b <= 31 && b >= 0); ++ __WI(--as->mcp, mi | SW64F_F(f) | SW64F_DISP(disp, b)); ++} ++static void emit_FD(ASMState *as, SW64Ins mi, Reg f, Reg d) ++{ ++ lua_assert(f >= 32 && f <= 63); ++ lua_assert(d >= 0 && d <= 31); ++ __WI(--as->mcp, mi | SW64F_F(f) | SW64F_D(d)); ++} ++static void emit_AI(ASMState *as, SW64Ins mi, Reg a, Reg i) ++{ ++ lua_assert(a >= 0 && a <= 31); ++ lua_assert(i >= 32 && i <= 63); ++ __WI(--as->mcp, mi | SW64F_A(a) | SW64F_I(i)); ++} ++ ++static void emit_GI(ASMState *as, SW64Ins mi, Reg g, Reg i) ++{ ++ lua_assert(g >= 32 && g <= 63); ++ lua_assert(i >= 32 && i <= 63); ++ __WI(--as->mcp, mi | SW64F_F(RID_F31) | SW64F_G(g) | SW64F_I(i)); ++} ++ ++static void emit_void(ASMState *as, SW64Ins mi) ++{ ++ __WI(--as->mcp, mi); ++} ++ ++static void emit_DEXTM(ASMState *as, Reg rt, Reg rs, uint32_t pos, uint32_t size) ++{ ++ emit_AjD(as, SW64I_SRLI, rt, 64-size, rt); ++ emit_AjD(as, SW64I_SLLI, rs, 64-pos-size, rt); ++} ++ ++/* -- Emit loads/stores --------------------------------------------------- */ ++ ++/* Prefer rematerialization of BASE/L from global_State over spills. */ ++#define emit_canremat(ref) ((ref) <= REF_BASE) ++ ++/* Load a 32 bit constant into a GPR. 
*/ ++static void emit_loadi(ASMState *as, Reg r, int32_t i) ++{ ++ int16_t hi, lo; ++ split32Addr(i, &hi, &lo); ++ emit_ABD(as, SW64I_ADDW, RID_ZERO, r, r); ++ if (i == 0 && r != RID_ZERO) { ++ emit_Ao(as, SW64I_LDI, r, RID_ZERO, 0); ++ return; ++ } ++ lua_assert(r != RID_ZERO); ++ if (lo != 0) { ++ emit_Ao(as, SW64I_LDI, r, hi ? r : RID_ZERO, lo); ++ } ++ if (hi != 0) { ++ // ldih dest, hi(zero) ++ emit_Ao(as, SW64I_LDIH, r, RID_ZERO, hi); ++ } ++} ++ ++/* Load a 64 bit constant into a GPR. */ ++static void emit_loadu64(ASMState *as, Reg r, uint64_t u64) ++{ ++ int16_t h_hi, h_lo; ++ int16_t l_hi, l_lo; ++ if (u64 == 0) { ++ emit_Ao(as, SW64I_LDI, r, RID_ZERO, 0); ++ return; ++ } ++ lua_assert(r != RID_ZERO); ++ ++ split64AddrLO32(u64, &l_hi, &l_lo); ++ split64AddrHI32(u64, &h_hi, &h_lo); ++ int has_high = h_hi || h_lo; ++ ++ if (l_lo) { ++ emit_Ao(as, SW64I_LDI, r, l_hi || has_high ? r : RID_ZERO, l_lo); ++ } ++ if (l_hi) { ++ emit_Ao(as, SW64I_LDIH, r, has_high ? r : RID_ZERO, l_hi); ++ } ++ ++ if (has_high) { ++ emit_AjD(as, SW64I_SLLI, r, 32, r); ++ } ++ if (h_lo) { ++ emit_Ao(as, SW64I_LDI, r, h_hi ? r : RID_ZERO, h_lo); ++ } ++ if (h_hi) { ++ emit_Ao(as, SW64I_LDIH, r, RID_ZERO, h_hi); ++ } ++} ++ ++#define emit_loada(as, r, addr) emit_loadu64(as, (r), u64ptr((addr))) ++ ++static Reg ra_allock(ASMState *as, intptr_t k, RegSet allow); ++static void ra_allockreg(ASMState *as, intptr_t k, Reg r); ++ ++/* Get/set from constant pointer. 
*/ ++static void emit_lsptr(ASMState *as, SW64Ins mi, Reg r, void *p, RegSet allow) ++{ ++ intptr_t jgl = (intptr_t)(J2G(as->J)); ++ intptr_t i = (intptr_t)(p); ++ Reg base; ++ if ((uint32_t)(i-jgl-32768) < 0x1000) { ++ i = i-jgl-32768; ++ base = RID_JGL; ++ } else { ++ base = ra_allock(as, i-(int16_t)i, allow); ++ i = (int16_t)i; ++ } ++ emit_Ao(as, mi, r, base, i); ++} ++ ++static void emit_loadk64(ASMState *as, Reg r, IRIns *ir) ++{ ++ const uint64_t *k = &ir_k64(ir)->u64; ++ Reg r64 = r; ++ if (rset_test(RSET_FPR, r)) { ++ emit_lsptr(as, SW64I_FLDD, ((r) & 31), (void *)&ir_knum((ir))->u64, RSET_GPR); ++ } else { ++ if ((uint32_t)((intptr_t)k-(intptr_t)J2G(as->J)) < 65536) ++ emit_lsptr(as, SW64I_LDL, r64, (void *)k, 0); ++ else ++ emit_loadu64(as, r64, *k); ++ } ++} ++ ++/* Get/set global_State fields. */ ++static void emit_lsglptr(ASMState *as, SW64Ins mi, Reg r, int32_t ofs) ++{ ++ emit_Ao(as, mi, r, RID_JGL, ofs-32768); ++} ++ ++#define emit_getgl(as, r, field) \ ++ emit_lsglptr(as, SW64I_AL, (r), (int32_t)offsetof(global_State, field)) ++#define emit_setgl(as, r, field) \ ++ emit_lsglptr(as, SW64I_AS, (r), (int32_t)offsetof(global_State, field)) ++ ++/* Trace number is determined from per-trace exit stubs. */ ++#define emit_setvmstate(as, i) UNUSED(i) ++ ++/* -- Emit control-flow instructions -------------------------------------- */ ++ ++/* Label for internal jumps. */ ++typedef MCode *MCLabel; ++ ++/* Return label pointing to current PC. 
*/ ++#define emit_label(as) ((as)->mcp) ++ ++static SW64Ins invert_cond(SW64Ins mi) ++{ ++ switch (mi) { ++ case SW64I_BEQ: return SW64I_BNE; ++ case SW64I_BNE: return SW64I_BEQ; ++ case SW64I_FBEQ: return SW64I_FBNE; ++ case SW64I_FBNE: return SW64I_FBEQ; ++ case SW64I_BLT: return SW64I_BGE; ++ case SW64I_BGE: return SW64I_BLT; ++ case SW64I_BLE: return SW64I_BGT; ++ case SW64I_BGT: return SW64I_BLE; ++ default: ++ printf("TODO HANDLING INVERT_COND:%x\n", mi); ++ asm("bpt;bpt"); ++ return SW64I_NOP; ++ } ++} ++ ++static void emit_branch(ASMState *as, SW64Ins mi, Reg a, MCode *target) ++{ ++ MCode *p = as->mcp; ++ ptrdiff_t delta = target - p; ++ lua_assert(IS_SW64F_BRANCH_VALID(delta)); ++ __WI(--p, mi | SW64F_A(a) | SW64F_BRANCH(delta)); ++ as->mcp = p; ++} ++ ++static void emit_jmp(ASMState *as, MCode *target) ++{ ++ TODO ++ /* *--as->mcp = SW64I_NOP; */ ++ /* emit_branch(as, SW64I_B, RID_ZERO, RID_ZERO, (target)); */ ++} ++ ++static void emit_call(ASMState *as, void *target, int needcfa) ++{ ++ MCode *p = as->mcp; ++ __WI(--p, SW64I_CALL | SW64F_A(RID_RA) | SW64F_B(RID_CFUNCADDR)); ++ needcfa = 1; ++ // TODO: use br if target in range ++ as->mcp = p; ++ if (needcfa) ra_allockreg(as, (intptr_t)target, RID_CFUNCADDR); ++} ++ ++/* -- Emit generic operations --------------------------------------------- */ ++ ++#define emit_move(as, dst, src) \ ++ emit_Ao(as, SW64I_LDI, (dst), (src), 0) ++ ++#define emit_fmove(as, dst, src) \ ++ emit_FGI(as, SW64I_FADDD, (src), (RID_FZERO), (dst)) ++ ++/* Generic move between two regs. */ ++static void emit_movrr(ASMState *as, IRIns *ir, Reg dst, Reg src) ++{ ++ if (dst < RID_MAX_GPR) ++ emit_move(as, dst, src); ++ else ++ emit_fmove(as, dst, src); ++} ++ ++/* Generic load of register with base and (small) offset address. */ ++static void emit_loadofs(ASMState *as, IRIns *ir, Reg r, Reg base, int32_t ofs) ++{ ++ if (r < RID_MAX_GPR) ++ emit_Ao(as, irt_is64(ir->t) ? 
SW64I_LDL : SW64I_LDW, r, base, ofs); ++ else ++ emit_Fo(as, irt_isnum(ir->t) ? SW64I_FLDD : SW64I_FLDS, ++ r, base, ofs); ++} ++ ++/* Generic store of register with base and (small) offset address. */ ++static void emit_storeofs(ASMState *as, IRIns *ir, Reg r, Reg base, int32_t ofs) ++{ ++ if (r < RID_MAX_GPR) ++ emit_Ao(as, irt_is64(ir->t) ? SW64I_STL : SW64I_STW, r, base, ofs); ++ else ++ emit_Fo(as, irt_isnum(ir->t) ? SW64I_FSTD : SW64I_FSTS, ++ r, base, ofs); ++} ++ ++/* Add offset to pointer. */ ++static void emit_addptr(ASMState *as, Reg r, int32_t ofs) ++{ ++ if (ofs) { ++ lua_assert(checki16(ofs)); ++ emit_Ao(as, SW64I_LDI, r, r, ofs); ++ } ++} ++ ++#define emit_spsub(as, ofs) emit_addptr(as, RID_SP, -(ofs)) ++ ++ ++ ++static void emit_rotx32(ASMState *as, Reg src, uint32_t shift, Reg dest, Reg tmp, int mode) ++{ ++ lua_assert(src != tmp); ++ lua_assert(dest != tmp); ++ ++ emit_ABD(as, SW64I_ADDW, dest, RID_ZERO, dest); ++ emit_ABD(as, SW64I_BIS, dest, tmp, dest); ++ if (mode == 1) { ++ emit_AjD(as, SW64I_SRLI, dest, (-shift) & 31, dest); ++ emit_AjD(as, SW64I_SLLI, dest, shift, tmp); ++ } else if (mode == 2) { ++ emit_AjD(as, SW64I_SLLI, dest, (-shift) & 31, dest); ++ emit_AjD(as, SW64I_SRLI, dest, shift, tmp); ++ } else { ++ lua_assert(0); ++ } ++ emit_AjD(as, SW64I_EXTLWI, src, 0, dest); ++} ++static void emit_rotx(ASMState *as, Reg src, uint32_t shift, Reg dest, Reg tmp, int mode) ++{ ++ lua_assert(src != tmp); ++ lua_assert(dest != tmp); ++ ++ emit_ABD(as, SW64I_BIS, dest, tmp, dest); ++ if (mode == 1) { ++ emit_AjD(as, SW64I_SRLI, src, (-shift) & 63, dest); ++ emit_AjD(as, SW64I_SLLI, src, shift, tmp); ++ } else if (mode == 2) { ++ emit_AjD(as, SW64I_SLLI, src, (-shift) & 63, dest); ++ emit_AjD(as, SW64I_SRLI, src, shift, tmp); ++ } else { ++ lua_assert(0); ++ } ++} ++#define emit_rotl32(as, src, shift, dest, tmp) emit_rotx32(as, src, shift, dest, tmp, 1) ++#define emit_rotr32(as, src, shift, dest, tmp) emit_rotx32(as, src, shift, dest, tmp, 2) 
++#define emit_rotl(as, src, shift, dest, tmp) emit_rotx(as, src, shift, dest, tmp, 1) ++#define emit_rotr(as, src, shift, dest, tmp) emit_rotx(as, src, shift, dest, tmp, 2) ++ ++#undef TODO +diff --git a/src/lj_frame.h b/src/lj_frame.h +index 19c49a4..a6e805e 100644 +--- a/src/lj_frame.h ++++ b/src/lj_frame.h +@@ -264,6 +264,18 @@ enum { LJ_CONT_TAILCALL, LJ_CONT_FFI_CALLBACK }; /* Special continuations. */ + #endif + #define CFRAME_OFS_MULTRES 0 + #define CFRAME_SHIFT_MULTRES 3 ++ ++#elif LJ_TARGET_SW64 ++#define CFRAME_OFS_ERRF 172 ++#define CFRAME_OFS_NRES 168 ++#define CFRAME_OFS_PREV 160 ++#define CFRAME_OFS_L 152 ++#define CFRAME_OFS_PC 144 ++#define CFRAME_SIZE 176 ++ ++#define CFRAME_OFS_MULTRES 0 ++#define CFRAME_SHIFT_MULTRES 3 ++ + #else + #error "Missing CFRAME_* definitions for this architecture" + #endif +diff --git a/src/lj_gdbjit.c b/src/lj_gdbjit.c +index c219ffa..c89b60b 100644 +--- a/src/lj_gdbjit.c ++++ b/src/lj_gdbjit.c +@@ -306,6 +306,9 @@ enum { + #elif LJ_TARGET_MIPS + DW_REG_SP = 29, + DW_REG_RA = 31, ++#elif LJ_TARGET_SW64 ++ DW_REG_SP = 30, ++ DW_REG_RA = 26, + #else + #error "Unsupported target architecture" + #endif +@@ -383,6 +386,8 @@ static const ELFheader elfhdr_template = { + .machine = 20, + #elif LJ_TARGET_MIPS + .machine = 8, ++#elif LJ_TARGET_SW64 ++ .machine = 0x9916, + #else + #error "Unsupported target architecture" + #endif +@@ -591,6 +596,10 @@ static void LJ_FASTCALL gdbjit_ehframe(GDBJITctx *ctx) + for (i = 23; i >= 16; i--) { DB(DW_CFA_offset|i); DUV(26-i); } + for (i = 30; i >= 20; i -= 2) { DB(DW_CFA_offset|32|i); DUV(42-i); } + } ++#elif LJ_TARGET_SW64 ++ { ++ /* TODO: setup saved register position */ ++ } + #else + #error "Unsupported target architecture" + #endif +diff --git a/src/lj_jit.h b/src/lj_jit.h +index 92054e3..c2a9f70 100644 +--- a/src/lj_jit.h ++++ b/src/lj_jit.h +@@ -55,6 +55,11 @@ + #else + #define JIT_F_CPUSTRING "\010MIPS64R2" + #endif ++ ++#elif LJ_TARGET_SW64 ++#define JIT_F_SW1621 0x00000010 
++#define JIT_F_CPU_FIRST JIT_F_SW1621 ++#define JIT_F_CPUSTRING "\4SW6A" + #else + #define JIT_F_CPU_FIRST 0 + #define JIT_F_CPUSTRING "" +diff --git a/src/lj_snap.c b/src/lj_snap.c +index bb063c2..33beb5c 100644 +--- a/src/lj_snap.c ++++ b/src/lj_snap.c +@@ -715,7 +715,7 @@ static void snap_restoredata(GCtrace *T, ExitState *ex, + #if !LJ_SOFTFP + if (r >= RID_MAX_GPR) { + src = (int32_t *)&ex->fpr[r-RID_MIN_FPR]; +-#if LJ_TARGET_PPC ++#if LJ_TARGET_PPC || LJ_TARGET_SW64 + if (sz == 4) { /* PPC FPRs are always doubles. */ + *(float *)dst = (float)*(double *)src; + return; +diff --git a/src/lj_target.h b/src/lj_target.h +index 8dcae95..c7bed12 100644 +--- a/src/lj_target.h ++++ b/src/lj_target.h +@@ -55,7 +55,7 @@ typedef uint32_t RegSP; + /* Bitset for registers. 32 registers suffice for most architectures. + ** Note that one set holds bits for both GPRs and FPRs. + */ +-#if LJ_TARGET_PPC || LJ_TARGET_MIPS || LJ_TARGET_ARM64 ++#if LJ_TARGET_PPC || LJ_TARGET_MIPS || LJ_TARGET_ARM64 || LJ_TARGET_SW64 + typedef uint64_t RegSet; + #else + typedef uint32_t RegSet; +@@ -69,7 +69,7 @@ typedef uint32_t RegSet; + #define rset_set(rs, r) (rs |= RID2RSET(r)) + #define rset_clear(rs, r) (rs &= ~RID2RSET(r)) + #define rset_exclude(rs, r) (rs & ~RID2RSET(r)) +-#if LJ_TARGET_PPC || LJ_TARGET_MIPS || LJ_TARGET_ARM64 ++#if LJ_TARGET_PPC || LJ_TARGET_MIPS || LJ_TARGET_ARM64 || LJ_TARGET_SW64 + #define rset_picktop(rs) ((Reg)(__builtin_clzll(rs)^63)) + #define rset_pickbot(rs) ((Reg)__builtin_ctzll(rs)) + #else +@@ -144,6 +144,8 @@ typedef uint32_t RegCost; + #include "lj_target_ppc.h" + #elif LJ_TARGET_MIPS + #include "lj_target_mips.h" ++#elif LJ_TARGET_SW64 ++#include "lj_target_sw64.h" + #else + #error "Missing include for target CPU" + #endif +diff --git a/src/lj_target_sw64.h b/src/lj_target_sw64.h +new file mode 100644 +index 0000000..cb2f611 +--- /dev/null ++++ b/src/lj_target_sw64.h +@@ -0,0 +1,283 @@ ++/* ++** Definitions for SW64 CPUs. 
++** Copyright (C) 2019-2019 deepin inc. See Copyright Notice in luajit.h ++*/ ++ ++#ifndef _LJ_TARGET_SW64_H ++#define _LJ_TARGET_SW64_H ++ ++/* -- Registers IDs ------------------------------------------------------- */ ++ ++#define GPRDEF(_) \ ++ _(R0) _(R1) _(R2) _(R3) _(R4) _(R5) _(R6) _(R7) \ ++ _(R8) _(R9) _(R10) _(R11) _(R12) _(R13) _(R14) _(R15) \ ++ _(R16) _(R17) _(R18) _(R19) _(R20) _(R21) _(R22) _(R23) \ ++ _(R24) _(R25) _(RA) _(R27) _(R28) _(R29) _(SP) _(R31) ++#define FPRDEF(_) \ ++ _(F0) _(F1) _(F2) _(F3) _(F4) _(F5) _(F6) _(F7) \ ++ _(F8) _(F9) _(F10) _(F11) _(F12) _(F13) _(F14) _(F15) \ ++ _(F16) _(F17) _(F18) _(F19) _(F20) _(F21) _(F22) _(F23) \ ++ _(F24) _(F25) _(F26) _(F27) _(F28) _(F29) _(F30) _(F31) ++#define VRIDDEF(_) ++ ++#define RIDENUM(name) RID_##name, ++ ++enum { ++ GPRDEF(RIDENUM) /* General-purpose registers (GPRs). */ ++ FPRDEF(RIDENUM) /* Floating-point registers (FPRs). */ ++ RID_MAX, ++ RID_ZERO = RID_R31, ++ RID_FZERO = RID_F31, ++ RID_TMP = RID_RA, ++ RID_GP = RID_R29, ++ ++ /* Calling conventions. */ ++ RID_RET = RID_R0, ++ RID_FPRET = RID_F0, ++ RID_CFUNCADDR = RID_R27, ++ ++ /* These definitions must match with the *.dasc file(s): */ ++ RID_BASE = RID_R9, /* Interpreter BASE. */ ++ RID_LPC = RID_R11, /* Interpreter PC. */ ++ RID_DISPATCH = RID_R12, /* Interpreter DISPATCH table. */ ++ RID_LREG = RID_R13, /* Interpreter L. */ ++ RID_JGL = RID_R15, /* On-trace: global_State + 32768. */ ++ ++ /* Register ranges [min, max) and number of registers. */ ++ RID_MIN_GPR = RID_R0, ++ RID_MAX_GPR = RID_R31+1, ++ RID_MIN_FPR = RID_F0, ++ RID_MAX_FPR = RID_F31+1, ++ RID_NUM_GPR = RID_MAX_GPR - RID_MIN_GPR, ++ RID_NUM_FPR = RID_MAX_FPR - RID_MIN_FPR /* Only even regs are used. */ ++}; ++ ++#define RID_NUM_KREF RID_NUM_GPR ++#define RID_MIN_KREF RID_R0 ++ ++/* -- Register sets ------------------------------------------------------- */ ++ ++/* Make use of all registers, except ZERO, TMP, SP, JGL and GP. 
*/ ++#define RSET_FIXED \ ++ (RID2RSET(RID_ZERO)|RID2RSET(RID_TMP)|RID2RSET(RID_SP)|\ ++ RID2RSET(RID_JGL)|RID2RSET(RID_GP)|RID2RSET(RID_R28)) ++ ++#define RSET_GPR (RSET_RANGE(RID_MIN_GPR, RID_MAX_GPR) - RSET_FIXED) ++#define RSET_FPR (RSET_RANGE(RID_MIN_FPR, RID_MAX_FPR) - RID2RSET(RID_FZERO)) ++#define RSET_ALL (RSET_GPR|RSET_FPR) ++#define RSET_INIT RSET_ALL ++ ++ ++#define RSET_SCRATCH_GPR \ ++ (RSET_RANGE(RID_R0, RID_R8+1)|RSET_RANGE(RID_R16, RID_R25+1)| \ ++ RID2RSET(RID_R27)) ++#define RSET_SCRATCH_FPR (RSET_RANGE(RID_F0, RID_F1+1)|RSET_RANGE(RID_F10,RID_F30+1)) ++#define RSET_SCRATCH (RSET_SCRATCH_GPR|RSET_SCRATCH_FPR) ++ ++#define REGARG_FIRSTGPR RID_R16 ++#define REGARG_LASTGPR RID_R21 ++#define REGARG_NUMGPR 6 ++#define REGARG_FIRSTFPR RID_F16 ++#define REGARG_LASTFPR RID_F21 ++#define REGARG_NUMFPR 6 ++ ++ ++ ++/* -- Spill slots --------------------------------------------------------- */ ++ ++/* Spill slots are 32 bit wide. An even/odd pair is used for FPRs. ++** ++** SPS_FIXED: Available fixed spill slots in interpreter frame. ++** This definition must match with the *.dasc file(s). ++** ++** SPS_FIRST: First spill slot for general use. ++*/ ++#if LJ_32 ++#define SPS_FIXED 5 ++#else ++#define SPS_FIXED 4 ++#endif ++ ++#define SPS_FIRST 4 ++ ++#define SPOFS_TMP 0 ++ ++#define sps_scale(slot) (4 * (int32_t)(slot)) ++#define sps_align(slot) (((slot) - SPS_FIXED + 1) & ~1) ++ ++/* -- Exit state ---------------------------------------------------------- */ ++ ++/* This definition must match with the *.dasc file(s). */ ++typedef struct { ++ lua_Number fpr[RID_NUM_FPR]; /* Floating-point registers. */ ++ intptr_t gpr[RID_NUM_GPR]; /* General-purpose registers. */ ++ int32_t spill[256]; /* Spill slots. */ ++} ExitState; ++ ++/* Highest exit + 1 indicates stack check. */ ++#define EXITSTATE_CHECKEXIT 1 ++ ++/* Return the address of a per-trace exit stub. 
*/ ++static LJ_AINLINE uint32_t *exitstub_trace_addr_(uint32_t *p) ++{ ++ while (*p == 0x43ff075f) p++; /* Skip SW64_NOP. */ ++ return p; ++} ++/* Avoid dependence on lj_jit.h if only including lj_target.h. */ ++#define exitstub_trace_addr(T, exitno) \ ++ exitstub_trace_addr_((MCode *)((char *)(T)->mcode + (T)->szmcode)) ++ ++/* -- Instructions -------------------------------------------------------- */ ++ ++/* Instruction fields. */ ++#define SW64F_A(r) (((r) & 0x1f) << 21) ++#define SW64F_B(r) (((r) & 0x1f) << 16) ++#define SW64F_C(r) (((r) & 0x1f) << 5) ++#define SW64F_D(r) (((r) & 0x1f) << 0) ++ ++#define SW64F_F(r) (((r) & 0x1f) << 21) ++#define SW64F_G(r) (((r) & 0x1f) << 16) ++#define SW64F_H(r) (((r) & 0x1f) << 5) ++#define SW64F_I(r) (((r) & 0x1f) << 0) ++ ++ ++#define SW64F_IMM(i) (((i) & 0xff) << 13) ++#define SW64F_DISP(d, r) (((d) & 0xffff) | SW64F_B(r)) ++#define SW64F_DISPI(d) (((d) & 0xffff) | SW64F_B(RID_ZERO)) ++#define SW64F_BRANCH(d) ((d) & 0x1fffff) ++#define SW64F_j(i) SW64F_IMM(i) ++#define IS_SW64F_BRANCH_VALID(d) (((int32_t)d) < 0x1fffff) ++ ++#define SW64_OP(ins) ((ins) & 0x3f000000) ++ ++typedef enum SW64Ins { ++ ++ /* Load/store instructions. 
*/ ++ SW64I_LDL = 0x8c000000, ++ SW64I_STL = 0xac000000, ++ SW64I_LDW = 0x88000000, ++ SW64I_STW = 0xa8000000, ++ SW64I_LDHU = 0x84000000, ++ SW64I_STH = 0xa4000000, ++ SW64I_LDBU = 0x80000000, ++ SW64I_STB = 0xa0000000, ++ SW64I_FLDD = 0x9c000000, ++ SW64I_FSTD = 0xbc000000, ++ SW64I_FLDS = 0x98000000, ++ SW64I_FSTS = 0xb8000000, ++ ++ SW64I_LDIH = 0xfc000000, ++ SW64I_LDI = 0xf8000000, ++ SW64I_CALL= 0x04000000, ++ SW64I_BR = 0x10000000, ++ ++ SW64I_S4ADDL = 0x40000140, ++ SW64I_S4ADDLI = 0x48000140, ++ SW64I_S8ADDL = 0x40000180, ++ SW64I_S8ADDLI =0x48000180, ++ SW64I_ADDL = 0x40000100, ++ SW64I_ADDLI =0x48000100, ++ SW64I_ADDW = 0x40000000, ++ SW64I_ADDWI =0x48000000, ++ SW64I_SUBL = 0x40000120, ++ SW64I_SUBLI =0x48000120, ++ SW64I_SUBW = 0x40000020, ++ SW64I_MULL = 0x40000300, ++ SW64I_MULLI =0x48000300, ++ SW64I_MULW = 0x40000200, ++ SW64I_MULWI =0x48000200, ++ SW64I_UMULH = 0x40000320, ++ SW64I_UMULHI =0x48000320, ++ ++ SW64I_FADDS = 0x60000000, ++ SW64I_FADDD = 0x60000020, ++ SW64I_FSUBD = 0x60000060, ++ SW64I_FMULD = 0x600000a0, ++ SW64I_FDIVD = 0x600000e0, ++ ++ SW64I_SLL = 0x40000900, ++ SW64I_SLLI = 0x48000900, ++ SW64I_SRLI = 0x48000920, ++ SW64I_SRL = 0x40000920, ++ SW64I_SRAI = 0x48000940, ++ SW64I_SRA = 0x40000940, ++ ++ SW64I_AND = 0x40000700, ++ SW64I_ANDI = 0x48000700, ++ SW64I_XOR = 0x40000780, ++ SW64I_XORI = 0x48000780, ++ SW64I_BIS = 0x40000740, ++ SW64I_BISI = 0x48000740, ++ SW64I_ORNOT = 0x40000760, ++ SW64I_ORNOTI = 0x48000760, ++ SW64I_EQV = 0x400007a0, ++ SW64I_EQVI =0x480007a0, ++ ++ SW64I_BEQ = 0xc0000000, ++ SW64I_BNE = 0xc4000000, ++ SW64I_BLT = 0xc8000000, ++ SW64I_BLE = 0xcc000000, ++ SW64I_BGT = 0xd0000000, ++ SW64I_BGE = 0xd4000000, ++ ++ SW64I_CMPEQ = 0x40000500, ++ SW64I_CMPULE = 0x40000580, ++ SW64I_CMPLE = 0x40000540, ++ SW64I_CMPULT = 0x40000560, ++ SW64I_CMPULTI = 0x48000560, ++ SW64I_CMPLT = 0x40000520, ++ SW64I_CMPLTI = 0x48000520, ++ ++ SW64I_FCMPEQ = 0x60000200, ++ SW64I_FCMPLE = 0x60000220, ++ SW64I_FCMPLT = 0x60000240, 
++ SW64I_FCMPUN = 0x60000260, ++ SW64I_FBEQ = 0xe0000000, ++ SW64I_FBGE = 0xf4000000, ++ SW64I_FBGT = 0xf0000000, ++ SW64I_FBLE = 0xec000000, ++ SW64I_FBLT = 0xe8000000, ++ SW64I_FBNE = 0xe4000000, ++ ++ SW64I_BPT = 0x00000080, ++ SW64I_NOP = 0x43ff075f, //excb, same as gcc's asm("nop") ++ ++ SW64I_FCVTLW = 0x63e00520, ++ SW64I_FCVTWL = 0x63e00500, ++ SW64I_FCVTLS = 0x63e005a0, ++ SW64I_FCVTLD = 0x63e005e0, ++ SW64I_FCVTDL = 0x63e004e0, ++ SW64I_FCVTDL_Z = 0x63e00480, ++ SW64I_FIMOVD = 0x401f0f00, ++ SW64I_IFMOVD = 0x601f0820, ++ SW64I_IFMOVS = 0x601f0800, ++ SW64I_FCVTDS = 0x63e00420, ++ SW64I_FCVTSD = 0x63e00400, ++ ++ SW64I_MASKLLI = 0x48000c60, ++ SW64I_MASKLL = 0x40000c60, ++ ++ SW64I_FABS = 0x63e00600, ++ SW64I_FCPYSN = 0x60000640, ++ ++ SW64I_SEXTB = 0x43e00d40, ++ SW64I_SEXTH = 0x43e00d60, ++ ++ SW64I_EXTLBI = 0x48000a00, ++ SW64I_EXTLHI = 0x48000a20, ++ SW64I_EXTLWI = 0x48000a40, ++ ++ SW64I_SETFPEC1 = 0x60000aa0, ++ SW64I_SETFPEC3 = 0x60000ae0, ++ ++ SW64I_SELEQ = 0x44000000, ++ SW64I_SELNE = 0x44001400, ++ SW64I_FSELEQ = 0x64004000, ++ SW64I_FSELNE = 0x64004400, ++ ++ SW64I_AL = SW64I_LDL, ++ SW64I_AS = SW64I_STL, ++ ++} SW64Ins; ++ ++#endif +diff --git a/src/lj_trace.c b/src/lj_trace.c +index d85b47f..954d388 100644 +--- a/src/lj_trace.c ++++ b/src/lj_trace.c +@@ -827,8 +827,57 @@ static TraceNo trace_exit_find(jit_State *J, MCode *pc) + } + #endif + ++#if SW64_DEBUG_WI ++LUALIB_API const char *o_tostr(lua_State *L, TValue *o, ++ const char *def, size_t *len) ++{ ++ GCstr *s; ++ if (LJ_LIKELY(tvisstr(o))) { ++ s = strV(o); ++ } else if (tvisnil(o)) { ++ if (len != NULL) *len = def ? 
strlen(def) : 0; ++ return def; ++ } else if (tvisnumber(o)) { ++ lj_gc_check(L); ++ s = lj_strfmt_number(L, o); ++ setstrV(L, o, s); ++ } else { ++ return "Other"; ++ //lj_err_argt(L, 0, LUA_TSTRING); ++ } ++ if (len != NULL) ++ *len = s->len; ++ return strdata(s); ++} ++void dump_base(char* msg, lua_State* L) ++{ ++ return; ++ int n = L->top - L->base; ++ printf("%s N:%d\n", msg, n); ++ ++ for (int i=0; ibase+i; ++ char* t = lj_typename(o); ++ ++ if (tvisnum(o)) { ++ double n = numberVnum(o); ++ printf("%d\t%s\t%f\n", i, t, n); ++ } else if (tvisint(o)) { ++ printf("%d\t%s\t%d\n", i, t, intV(o)); ++ } else { ++ printf("%d\t%s\n", i, t); ++ } ++ } ++} ++#endif ++ + /* A trace exited. Restore interpreter state. */ ++#if SW64_DEBUG_WI ++int LJ_FASTCALL lj_trace_exit(jit_State *J, void *exptr, unsigned long exit_addr) ++#else + int LJ_FASTCALL lj_trace_exit(jit_State *J, void *exptr) ++#endif + { + ERRNO_SAVE + lua_State *L = J->L; +@@ -851,6 +900,10 @@ int LJ_FASTCALL lj_trace_exit(jit_State *J, void *exptr) + } + #endif + lua_assert(T != NULL && J->exitno < T->nsnap); ++#if SW64_DEBUG_WI ++ printf("-----------%s exitno:%d nsnap:%d traceno:%d exit_addr:0x%lx\n", __FUNCTION__, ++ J->exitno, T->nsnap, T->traceno, exit_addr); ++#endif + exd.J = J; + exd.exptr = exptr; + errcode = lj_vm_cpcall(L, NULL, &exd, trace_exit_cp); +diff --git a/src/lj_trace.h b/src/lj_trace.h +index 22cae74..4c20367 100644 +--- a/src/lj_trace.h ++++ b/src/lj_trace.h +@@ -36,7 +36,11 @@ LJ_FUNC void lj_trace_freestate(global_State *g); + LJ_FUNC void lj_trace_ins(jit_State *J, const BCIns *pc); + LJ_FUNCA void LJ_FASTCALL lj_trace_hot(jit_State *J, const BCIns *pc); + LJ_FUNCA void LJ_FASTCALL lj_trace_stitch(jit_State *J, const BCIns *pc); ++#if SW64_DEBUG_WI ++LJ_FUNCA int LJ_FASTCALL lj_trace_exit(jit_State *J, void *exptr, unsigned long exit_addr); ++#else + LJ_FUNCA int LJ_FASTCALL lj_trace_exit(jit_State *J, void *exptr); ++#endif + + /* Signal asynchronous abort of trace or end of trace. 
*/ + #define lj_trace_abort(g) (G2J(g)->state &= ~LJ_TRACE_ACTIVE) +diff --git a/src/lj_vmmath.c b/src/lj_vmmath.c +index b231d3e..4d4d744 100644 +--- a/src/lj_vmmath.c ++++ b/src/lj_vmmath.c +@@ -57,7 +57,7 @@ double lj_vm_foldarith(double x, double y, int op) + } + } + +-#if (LJ_HASJIT && !(LJ_TARGET_ARM || LJ_TARGET_ARM64 || LJ_TARGET_PPC)) || LJ_TARGET_MIPS ++#if (LJ_HASJIT && !(LJ_TARGET_ARM || LJ_TARGET_ARM64 || LJ_TARGET_PPC)) || LJ_TARGET_MIPS || LJ_TARGET_SW64 + int32_t LJ_FASTCALL lj_vm_modi(int32_t a, int32_t b) + { + uint32_t y, ua, ub; +diff --git a/src/vm_sw64.dasc b/src/vm_sw64.dasc +new file mode 100644 +index 0000000..38a103f +--- /dev/null ++++ b/src/vm_sw64.dasc +@@ -0,0 +1,4761 @@ ++|// Low-level VM code for SW64 CPUs. ++|// Bytecode interpreter, fast functions and helper functions. ++|// Copyright (C) 2023 Mike Pall. See Copyright Notice in luajit.h ++| ++|.arch sw64 ++|.section code_op, code_sub ++| ++|.actionlist build_actionlist ++|.globals GLOB_ ++|.globalnames globnames ++|.externnames extnames ++|// Fixed register assignments for the interpreter. ++|// Don't use: r31 = 0, r29 = gp, r30 = sp, r26 = ra ++| ++|// The following must be C callee-save (but BASE is often refetched). ++|.define BASE, r9 //s0 // Base of current Lua stack frame. ++|.define KBASE, r10 //s1 // Constants of current Lua function. ++|.define PC, r11 //s2 // Next PC. ++|.define DISPATCH, r12 //s3 // Opcode dispatch table. ++|.define LREG, r13 //s4 // Register holding lua_State (also in SAVE_L). ++|.define MULTRES, r21 //a5 // Size of multi-result: (nresults+1). ++| ++|.define JGL, r15 //fp // On-trace: global_State + 32768. ++| ++|// Constants for type-comparisons, stores and conversions. C callee-save. ++|.define TISNIL, r15 //fp ++|.define TISNUM, r8 //t7 ++|.define TOBIT, f8 // 2^52 + 2^51. ++| ++|// The following temporaries are not saved across C calls, except for RA. ++|.define RA, r14 //mips:s7 sw64:s5 // Callee-save. 
++|.define RB, r22 //t8 ++|.define RC, r23 //t9 ++|.define RD, r24 //t10 ++|.define INS, r25 //t11 ++| ++|.define AT, r28 //at // Assembler temporary. ++|.define FAT, f28 //at // Assembler temporary. ++|.define TMP0, r5 //t4 ++|.define TMP1, r6 //t5 ++|.define TMP2, r7 //t6 ++|.define TMP3, r3 //t2 ++|.define TMP4, r4 //t3 ++| ++|// Calling conventions. ++|.define CFUNCADDR, r27 //t12/pv ++|.define CARG1, r16 //a0 ++|.define CARG2, r17 //a1 ++|.define CARG3, r18 //a2 ++|.define CARG4, r19 //a3 ++|.define CARG5, r20 //a4 ++|.define CARG6, r21 //a5 ++| ++|.define CRET1, r0 //v0 ++|.define CRET2, r2 //t1 ++| ++|.define FCARG1, f16 ++|.define FCARG2, f17 ++|.define FCARG3, f18 ++|.define FCARG4, f19 ++|.define FCARG5, f20 ++|.define FCARG6, f21 ++| ++|.define FCRET1, f0 ++|.define FCRET2, f1 ++| ++|.define FTMP0, f10 ++|.define FTMP1, f11 ++|.define FTMP2, f12 ++|.define FTMP3, f13 ++|.define FTMP4, f14 ++| ++|// Stack layout while in interpreter. Must match with lj_frame.h. ++| ++|.define CFRAME_SPACE, 176 // Delta for sp. ++| ++|//----- 16 byte aligned, <-- sp entering interpreter ++|.define SAVE_ERRF, 172 // 32 bit values. ++|.define SAVE_NRES, 168 ++|.define SAVE_CFRAME, 160 // 64 bit values. ++|.define SAVE_L, 152 ++|.define SAVE_PC, 144 ++|//----- 16 byte aligned ++|.define SAVE_GPR_, 80 // .. 80+8*8: 64 bit GPR saves. s0-s5 ++|.define SAVE_FPR_, 16 // .. 16+8*8: 64 bit FPR saves. 
f2-f9 ++| ++|.define TMPX, 8 ++|.define TMPD, 0 ++|//----- 16 byte aligned ++| ++|.define TMPD_OFS, 0 ++| ++|.define SAVE_MULTRES, TMPD ++| ++|//----------------------------------------------------------------------- ++| ++|.macro saveregs ++| ldi sp, -CFRAME_SPACE(sp) //TODO ++| stl ra, SAVE_GPR_+7*8(sp) ++| stl r15, SAVE_GPR_+6*8(sp) ++| stl r14, SAVE_GPR_+5*8(sp) ++| stl r13, SAVE_GPR_+4*8(sp) ++| stl r12, SAVE_GPR_+3*8(sp) ++| stl r11, SAVE_GPR_+2*8(sp) ++| stl r10, SAVE_GPR_+1*8(sp) ++| stl r9, SAVE_GPR_+0*8(sp) ++| fstd f9, SAVE_FPR_+7*8(sp) ++| fstd f8, SAVE_FPR_+6*8(sp) ++| fstd f7, SAVE_FPR_+5*8(sp) ++| fstd f6, SAVE_FPR_+4*8(sp) ++| fstd f5, SAVE_FPR_+3*8(sp) ++| fstd f4, SAVE_FPR_+2*8(sp) ++| fstd f3, SAVE_FPR_+1*8(sp) ++| fstd f2, SAVE_FPR_+0*8(sp) ++|.endmacro ++| ++|.macro restoreregs_ret ++| ldl ra, SAVE_GPR_+7*8(sp) ++| ldl r15, SAVE_GPR_+6*8(sp) ++| ldl r14, SAVE_GPR_+5*8(sp) ++| ldl r13, SAVE_GPR_+4*8(sp) ++| ldl r12, SAVE_GPR_+3*8(sp) ++| ldl r11, SAVE_GPR_+2*8(sp) ++| ldl r10, SAVE_GPR_+1*8(sp) ++| ldl r9, SAVE_GPR_+0*8(sp) ++| fldd f9, SAVE_FPR_+7*8(sp) ++| fldd f8, SAVE_FPR_+6*8(sp) ++| fldd f7, SAVE_FPR_+5*8(sp) ++| fldd f6, SAVE_FPR_+4*8(sp) ++| fldd f5, SAVE_FPR_+3*8(sp) ++| fldd f4, SAVE_FPR_+2*8(sp) ++| fldd f3, SAVE_FPR_+1*8(sp) ++| fldd f2, SAVE_FPR_+0*8(sp) ++| ldi sp, CFRAME_SPACE(sp) ++| ret zero, 0(ra) ++|.endmacro ++| ++|// Type definitions. Some of these are only used for documentation. ++|.type L, lua_State, LREG ++|.type GL, global_State ++|.type TVALUE, TValue ++|.type GCOBJ, GCobj ++|.type STR, GCstr ++|.type TAB, GCtab ++|.type LFUNC, GCfuncL ++|.type CFUNC, GCfuncC ++|.type PROTO, GCproto ++|.type UPVAL, GCupval ++|.type NODE, Node ++|.type NARGS8, int ++|.type TRACE, GCtrace ++|.type SBUF, SBuf ++| ++|//----------------------------------------------------------------------- ++| ++|// Trap for not-yet-implemented parts. 
++|.macro NYI; syscall ; .endmacro //TODO ++| ++|//----------------------------------------------------------------------- ++| ++|// Access to frame relative to BASE. ++|.define FRAME_PC, -8 ++|.define FRAME_FUNC, -16 ++| ++|//----------------------------------------------------------------------- ++| ++|// Endian-specific defines. SW64 is little endian. ++|.define OFS_RD, 2 ++|.define OFS_RA, 1 ++|.define OFS_OP, 0 ++| ++|// Instruction decode. ++|.macro decode_BC4b, dst; slli dst, 2, dst; addwi dst, 0, dst; .endmacro ++|.macro decode_BC8b, dst; slli dst, 3, dst; addwi dst, 0, dst; .endmacro ++|.macro decode_OP, dst, ins; andi ins, 0xff, dst; .endmacro ++|.macro decode_RA, dst, ins; extlb ins, 0x1, dst; decode_BC8b dst; .endmacro ++|.macro decode_RB, dst, ins; extlb ins, 0x3, dst; decode_BC8b dst; .endmacro ++|.macro decode_RC, dst, ins; extlb ins, 0x2, dst; decode_BC8b dst; .endmacro ++|.macro decode_RD, dst, ins; extlh ins, 0x2, dst; decode_BC8b dst; .endmacro ++|.macro decode_RDtoRC8, dst, src; ldi dst, 0x7f8(zero); and src, dst, dst; .endmacro ++| ++|// Instruction fetch. ++|.macro ins_NEXT1 ++| ldw INS, 0(PC) ++| ldi PC, 4(PC) ++|.endmacro ++|// Instruction decode+dispatch. ++|.macro ins_NEXT2 ++| decode_OP TMP1, INS ++| decode_BC8b TMP1 ++| addl TMP1, DISPATCH, TMP0 ++| ldl TMP4, 0(TMP0) ++| decode_RD RD, INS ++| decode_RA RA, INS ++| jmp zero, 0(TMP4) ++|.endmacro ++|.macro ins_NEXT ++| ins_NEXT1 ++| ins_NEXT2 ++|.endmacro ++| ++|// Instruction footer. ++|.if 1 ++| // Replicated dispatch. Less unpredictable branches, but higher I-Cache use. ++| .define ins_next, ins_NEXT ++| .define ins_next_, ins_NEXT ++| .define ins_next1, ins_NEXT1 ++| .define ins_next2, ins_NEXT2 ++|.else ++| // Common dispatch. Lower I-Cache use, only one (very) unpredictable branch. ++| // Affects only certain kinds of benchmarks (and only with -j off). 
++| .macro ins_next ++| br zero, ->ins_next ++| .endmacro ++| .macro ins_next1 ++| .endmacro ++| .macro ins_next2 ++| br zero, ->ins_next ++| .endmacro ++| .macro ins_next_ ++| ->ins_next: ++| ins_NEXT ++| .endmacro ++|.endif ++| ++|// Call decode and dispatch. ++|.macro ins_callt ++| // BASE = new base, RB = LFUNC/CFUNC, RC = nargs*8, FRAME_PC(BASE) = PC ++| ldl PC, LFUNC:RB->pc ++| ldw INS, 0(PC) ++| ldi PC, 4(PC) ++| decode_OP TMP1, INS ++| decode_RA RA, INS ++| decode_BC8b TMP1 ++| addl TMP1, DISPATCH, TMP0 ++| ldl TMP0, 0(TMP0) ++| addl RA, BASE, RA ++| jmp zero, 0(TMP0) ++|.endmacro ++| ++|.macro ins_call ++| // BASE = new base, RB = LFUNC/CFUNC, RC = nargs*8, PC = caller PC ++| stl PC, FRAME_PC(BASE) ++| ins_callt ++|.endmacro ++| ++|//----------------------------------------------------------------------- ++| ++|.macro branch_RD ++| zapi RD, 0xf0, RD; srli RD, 1, TMP0 ++| ldih TMP4, -0x2(zero) // -BCBIAS_J*4 ++| addw TMP0, TMP4, TMP0 // (jump - 0x8000)<<2 ++| addl PC, TMP0, PC ++|.endmacro ++| ++|// Assumes DISPATCH is relative to GL. 
++#define DISPATCH_GL(field) (GG_DISP2G + (int)offsetof(global_State, field)) ++#define DISPATCH_J(field) (GG_DISP2J + (int)offsetof(jit_State, field)) ++#define GG_DISP2GOT (GG_OFS(got) - GG_OFS(dispatch)) ++#define DISPATCH_GOT(name) (GG_DISP2GOT + sizeof(void*)*LJ_GOT_##name) ++| ++#define PC2PROTO(field) ((int)offsetof(GCproto, field)-(int)sizeof(GCproto)) ++| ++|.macro load_got, func ++| ldl CFUNCADDR, DISPATCH_GOT(func)(DISPATCH) ++|.endmacro ++| ++|.macro call_intern, func ++| stl MULTRES, TMPX(sp) ++| call ra, 0(CFUNCADDR) ++| ldl MULTRES, TMPX(sp) ++|.endmacro ++| ++|.macro call_extern ++| stl MULTRES, TMPX(sp) ++| call ra, 0(CFUNCADDR) ++| ldl MULTRES, TMPX(sp) ++| .endmacro ++| ++|.macro hotcheck, delta, target ++| srli PC, 1, TMP1 ++| andi TMP1, 126, TMP1 ++| addl TMP1, DISPATCH, TMP1 ++| ldhu TMP2, GG_DISP2HOT(TMP1) ++| subwi TMP2, delta, TMP2 ++| sth TMP2, GG_DISP2HOT(TMP1) ++| blt TMP2, target ++|.endmacro ++| ++|.macro hotloop ++| hotcheck HOTCOUNT_LOOP, ->vm_hotloop ++|.endmacro ++| ++|.macro hotcall ++| hotcheck HOTCOUNT_CALL, ->vm_hotcall ++|.endmacro ++| ++|// Set current VM state. Uses TMP0. ++|.macro li_vmstate, st; ldi TMP0, ~LJ_VMST_..st(zero); .endmacro ++|.macro st_vmstate; stw TMP0, DISPATCH_GL(vmstate)(DISPATCH); .endmacro ++| ++|// Move table write barrier back. Overwrites mark and tmp. 
++|.macro barrierback, tab, mark, tmp, target ++| ldl tmp, DISPATCH_GL(gc.grayagain)(DISPATCH) ++| andi mark, ~LJ_GC_BLACK & 255, mark // black2gray(tab) ++| stl tab, DISPATCH_GL(gc.grayagain)(DISPATCH) ++| stb mark, tab->marked ++| stl tmp, tab->gclist ++| br zero, target ++|.endmacro ++| ++|.macro .DEXTM, rt, rs, pos, size ++| slli rs, 64-pos-size, rt ++| srli rt, 64-size, rt ++|.endmacro ++| ++|.macro .DINS, rt, rs, pos, size ++| ldi CARG5, 1(zero); slli CARG5, size, CARG5; ldi CARG5, -1(CARG5); ++| and rs, CARG5, TMP4; ++| slli TMP4, pos, TMP4; ++| slli CARG5, pos, CARG5; ++| bic rt, CARG5, rt; ++| bis rt, TMP4, rt; ++|.endmacro ++| ++|// Clear type tag. Isolate lowest 47 bits of reg. ++|.macro cleartp, reg; .DEXTM reg, reg, 0, 47; .endmacro ++|.macro cleartp, dst, reg; .DEXTM dst, reg, 0, 47; .endmacro ++| ++|// Set type tag: Merge 17 type bits into bits [47, 63] of dst. ++|.macro settp, dst, tp; .DINS dst, tp, 47, 17; .endmacro ++| ++|// Extract (negative) type tag. ++|.macro gettp, dst, src; srai src, 47, dst; .endmacro ++| ++|// Macros to check the TValue type and extract the GCobj. Branch on failure. 
++|.macro checktp, reg, tp, target ++| gettp TMP4, reg ++| ldi TMP4, tp(TMP4) ++| cleartp reg ++| bne TMP4, target ++|.endmacro ++|.macro checktp, dst, reg, tp, target ++| gettp TMP4, reg ++| ldi TMP4, tp(TMP4) ++| cleartp dst, reg ++| bne TMP4, target ++|.endmacro ++|.macro checkstr, reg, target; checktp reg, -LJ_TSTR, target; .endmacro ++|.macro checktab, reg, target; checktp reg, -LJ_TTAB, target; .endmacro ++|.macro checkfunc, reg, target; checktp reg, -LJ_TFUNC, target; .endmacro ++|.macro checkint, reg, target ++| gettp TMP4, reg ++| ldi TISNUM, LJ_TISNUM(zero) ++| cmpeq TMP4, TISNUM, AT ++| beq AT, target ++|.endmacro ++|.macro checknum, reg, target ++| gettp TMP4, reg ++| ldi AT, LJ_TISNUM(zero) ++| cmpult TMP4, AT, TMP4 ++| beq TMP4, target ++|.endmacro ++| ++|.macro mov_false, reg ++| ldi reg, 0x0001(zero) ++| slli reg, 47, reg ++| ornot zero, reg, reg // ~reg ++|.endmacro ++|.macro mov_true, reg ++| ldi reg, 0x0001(zero) ++| slli reg, 48, reg ++| ornot zero, reg, reg // ~reg ++|.endmacro ++| ++|.macro fcmp, op, a, b, reg, tmp; ++| fcmp..op a, b, tmp ++| fcvtdl tmp, tmp ++| fimovd tmp, reg ++|.endmacro ++| ++|//----------------------------------------------------------------------- ++ ++/* Generate subroutines used by opcodes and other parts of the VM. */ ++/* The .code_sub section should be last to help static branch prediction. */ ++static void build_subroutines(BuildCtx *ctx) ++{ ++ |.code_sub ++ | ++ |//----------------------------------------------------------------------- ++ |//-- Return handling ---------------------------------------------------- ++ |//----------------------------------------------------------------------- ++ | ++ |->vm_returnp: ++ | // See vm_return. Also: TMP2 = previous base. ++ | andi PC, FRAME_P, TMP0 ++ | ++ | // Return from pcall or xpcall fast func. ++ | mov_true TMP1 ++ | beq TMP0, ->cont_dispatch ++ | ldl PC, FRAME_PC(TMP2) // Fetch PC of previous frame. ++ | bis TMP2, zero, BASE // Restore caller base. 
++ | // Prepending may overwrite the pcall frame, so do it at the end. ++ | stl TMP1, -8(RA) // Prepend true to results. ++ | ldi RA, -8(RA) ++ | ++ |->vm_returnc: ++ | addwi RD, 8, RD // RD = (nresults+1)*8. ++ | andi PC, FRAME_TYPE, TMP0 ++ | addwi zero, LUA_YIELD, CRET1 ++ | beq RD, ->vm_unwind_c_eh ++ | bis RD, zero, MULTRES ++ | beq TMP0, ->BC_RET_Z // Handle regular return to Lua. ++ | ++ |->vm_return: ++ | // BASE = base, RA = resultptr, RD/MULTRES = (nresults+1)*8, PC = return ++ | // TMP0 = PC & FRAME_TYPE ++ | subwi zero, 8, TMP2 // TMP2 = 0xfffffff8 ++ | xori TMP0, FRAME_C, TMP0 ++ | and TMP2, PC, TMP2 ++ | subl BASE, TMP2, TMP2 // TMP2 = previous base. ++ | bne TMP0, ->vm_returnp ++ | ++ | subwi RD, 8, TMP1 ++ | stl TMP2, L->base ++ | li_vmstate C ++ | ldw TMP2, SAVE_NRES(sp) ++ | ldi BASE, -16(BASE) ++ | st_vmstate ++ | s8addwi TMP2, 0, TMP2 ++ | beq TMP1, >2 ++ |1: ++ | subwi TMP1, 8, TMP1 ++ | ldl CRET1, 0(RA) ++ | addli RA, 8, RA ++ | stl CRET1, 0(BASE) ++ | addli BASE, 8, BASE ++ | bne TMP1, <1 ++ | ++ |2: ++ | cmpeq TMP2, RD, AT ++ | beq AT, >6 ++ |3: ++ | stl BASE, L->top // Store new top. ++ | ++ |->vm_leave_cp: ++ | ldl TMP0, SAVE_CFRAME(sp) // Restore previous C frame. ++ | bis zero, zero, CRET1 // Ok return status for vm_pcall. ++ | stl TMP0, L->cframe ++ | ++ |->vm_leave_unw: ++ | restoreregs_ret ++ | ++ |6: ++ | ldl TMP1, L->maxstack ++ | cmplt TMP2, RD, TMP0 ++ | // More results wanted. Check stack size and fill up results with nil. ++ | cmplt BASE, TMP1, TMP1 ++ | bne TMP0, >7 ++ | beq TMP1, >8 ++ | stl TISNIL, 0(BASE) ++ | addwi RD, 8, RD ++ | addli BASE, 8, BASE ++ | br zero, <2 ++ | ++ |7: // Less results wanted. ++ | subw RD, TMP2, TMP0 ++ | subl BASE, TMP0, TMP0 // Either keep top or shrink it. ++ | selne TMP2, TMP0, BASE, BASE // LUA_MULTRET+1 case? ++ | br zero, <3 ++ | ++ |8: // Corner case: need to grow stack for filling up results. ++ | // This can happen if: ++ | // - A C function grows the stack (a lot). 
++ | // - The GC shrinks the stack in between. ++ | // - A return back from a lua_call() with (high) nresults adjustment. ++ | load_got lj_state_growstack ++ | stl BASE, L->top ++ | bis RD, zero, MULTRES ++ | zapi TMP2, 0xf0, CARG2; srli CARG2, 3, CARG2 ++ | bis L, zero, CARG1 ++ | call_intern lj_state_growstack // (lua_State *L, int n) ++ | ldw TMP2, SAVE_NRES(sp) ++ | ldl BASE, L->top // Need the (realloced) L->top in BASE. ++ | bis MULTRES, zero, RD ++ | s8addwi TMP2, 0, TMP2 ++ | br zero, <2 ++ | ++ |->vm_unwind_c: // Unwind C stack, return from vm_pcall. ++ | // (void *cframe, int errcode) ++ | bis CARG1, zero, sp ++ | bis CARG2, zero, CRET1 ++ |->vm_unwind_c_eh: // Landing pad for external unwinder. ++ | ldl L, SAVE_L(sp) ++ | ldi TMP0, ~LJ_VMST_C(zero) ++ | addwi TMP0, 0, TMP0 ++ | ldl GL:TMP1, L->glref ++ | stw TMP0, GL:TMP1->vmstate ++ | br zero, ->vm_leave_unw ++ | ++ |->vm_unwind_ff: // Unwind C stack, return from ff pcall. ++ | // (void *cframe) ++ | ldi AT, CFRAME_RAWMASK(zero) ++ | and CARG1, AT, sp ++ |->vm_unwind_ff_eh: // Landing pad for external unwinder. ++ | ldl L, SAVE_L(sp) ++ | ldih TMP3, 0x59c0(zero) // TOBIT = 2^52 + 2^51 (float). ++ | ldi TISNIL, LJ_TNIL(zero) ++ | ldi TISNUM, LJ_TISNUM(zero) ++ | ldl BASE, L->base ++ | ldl DISPATCH, L->glref // Setup pointer to dispatch table. ++ | ifmovs TMP3, TOBIT ++ | mov_false TMP1 ++ | li_vmstate INTERP ++ | ldl PC, FRAME_PC(BASE) // Fetch PC of previous frame. ++ | fcvtsd TOBIT, TOBIT ++ | ldi RA, -8(BASE) // Results start at BASE-8. ++ | ldi DISPATCH, GG_G2DISP(DISPATCH) ++ | stl TMP1, 0(RA) // Prepend false to error message. ++ | st_vmstate ++ | ldi RD, 16(zero) // 2 results: false + error message. 
++ | br zero, ->vm_returnc ++ | ++ |//----------------------------------------------------------------------- ++ |//-- Grow stack for calls ----------------------------------------------- ++ |//----------------------------------------------------------------------- ++ | ++ |->vm_growstack_c: // Grow stack for C function. ++ | ldi CARG2, LUA_MINSTACK(zero) ++ | br zero, >2 ++ | ++ |->vm_growstack_l: // Grow stack for Lua function. ++ | // BASE = new base, RA = BASE+framesize*8, RC = nargs, PC = first PC ++ | addl BASE, RC, RC ++ | subl RA, BASE, RA ++ | stl BASE, L->base ++ | ldi PC, 4(PC) // Must point after first instruction. ++ | stl RC, L->top ++ | zapi RA, 0xf0, CARG2; srli CARG2, 3, CARG2 ++ |2: ++ | // L->base = new base, L->top = top ++ | load_got lj_state_growstack ++ | stl PC, SAVE_PC(sp) ++ | bis L, zero, CARG1 ++ | call_intern lj_state_growstack // (lua_State *L, int n) ++ | ldl BASE, L->base ++ | ldl RC, L->top ++ | ldl LFUNC:RB, FRAME_FUNC(BASE) ++ | subl RC, BASE, RC ++ | cleartp LFUNC:RB ++ | // BASE = new base, RB = LFUNC/CFUNC, RC = nargs, FRAME_PC(BASE) = PC ++ | ins_callt // Just retry the call. ++ | ++ |//----------------------------------------------------------------------- ++ |//-- Entry points into the assembler VM --------------------------------- ++ |//----------------------------------------------------------------------- ++ | ++ |->vm_resume: // Setup C frame and resume thread. ++ | // (lua_State *L, TValue *base, int nres1 = 0, ptrdiff_t ef = 0) ++ | saveregs ++ | bis CARG1, zero, L ++ | ldl DISPATCH, L->glref // Setup pointer to dispatch table. ++ | bis CARG2, zero, BASE ++ | ldbu TMP1, L->status ++ | stl L, SAVE_L(sp) ++ | ldi PC, FRAME_CP(zero) ++ | ldi TMP0, CFRAME_RESUME(sp) ++ | ldi DISPATCH, GG_G2DISP(DISPATCH) ++ | stw zero, SAVE_NRES(sp) ++ | stw zero, SAVE_ERRF(sp) ++ | stl CARG1, SAVE_PC(sp) // Any value outside of bytecode is ok. 
++ | stl zero, SAVE_CFRAME(sp) ++ | stl TMP0, L->cframe ++ | beq TMP1, >3 ++ | ++ | // Resume after yield (like a return). ++ | stl L, DISPATCH_GL(cur_L)(DISPATCH) ++ | bis BASE, zero, RA ++ | ldl BASE, L->base ++ | ldl TMP1, L->top ++ | ldl PC, FRAME_PC(BASE) ++ | ldih TMP3, 0x59c0(zero) // TOBIT = 2^52 + 2^51 (float). ++ | subl TMP1, BASE, RD ++ | ifmovs TMP3, TOBIT ++ | stb zero, L->status ++ | fcvtsd TOBIT, TOBIT ++ | li_vmstate INTERP ++ | ldi RD, 8(RD) ++ | st_vmstate ++ | bis RD, zero, MULTRES ++ | andi PC, FRAME_TYPE, TMP0 ++ | ldi TISNIL, LJ_TNIL(zero) ++ | ldi TISNUM, LJ_TISNUM(zero) ++ | beq TMP0, ->BC_RET_Z ++ | br zero, ->vm_return ++ | ++ |->vm_pcall: // Setup protected C frame and enter VM. ++ | // (lua_State *L, TValue *base, int nres1, ptrdiff_t ef) ++ | saveregs ++ | stw CARG4, SAVE_ERRF(sp) ++ | ldi PC, FRAME_CP(zero) ++ | br zero, >1 ++ | ++ |->vm_call: // Setup C frame and enter VM. ++ | // (lua_State *L, TValue *base, int nres1) ++ | saveregs ++ | ldi PC, FRAME_C(zero) ++ | ++ |1: // Entry point for vm_pcall above (PC = ftype). ++ | ldl TMP1, L:CARG1->cframe ++ | bis CARG1, zero, L ++ | stw CARG3, SAVE_NRES(sp) ++ | ldl DISPATCH, L->glref // Setup pointer to dispatch table. ++ | stl CARG1, SAVE_L(sp) ++ | bis CARG2, zero, BASE ++ | ldi DISPATCH, GG_G2DISP(DISPATCH) ++ | stl CARG1, SAVE_PC(sp) // Any value outside of bytecode is ok. ++ | stl TMP1, SAVE_CFRAME(sp) ++ | stl sp, L->cframe // Add our C frame to cframe chain. ++ | ++ |3: // Entry point for vm_cpcall/vm_resume (BASE = base, PC = ftype). ++ | stl L, DISPATCH_GL(cur_L)(DISPATCH) ++ | ldl TMP2, L->base // TMP2 = old base (used in vmeta_call). ++ | ldih TMP3, 0x59c0(zero) // TOBIT = 2^52 + 2^51 (float). 
++ | ldl TMP1, L->top ++ | ifmovs TMP3, TOBIT ++ | addl PC, BASE, PC ++ | subl TMP1, BASE, NARGS8:RC ++ | ldi TISNUM, LJ_TISNUM(zero) ++ | subl PC, TMP2, PC // PC = frame delta + frame type ++ | fcvtsd TOBIT, TOBIT ++ | li_vmstate INTERP ++ | ldi TISNIL, LJ_TNIL(zero) ++ | st_vmstate ++ | ++ |->vm_call_dispatch: ++ | // TMP2 = old base, BASE = new base, RC = nargs*8, PC = caller PC ++ | ldl LFUNC:RB, FRAME_FUNC(BASE) ++ | checkfunc LFUNC:RB, ->vmeta_call ++ | ++ |->vm_call_dispatch_f: ++ | ins_call ++ | // BASE = new base, RB = func, RC = nargs*8, PC = caller PC ++ | ++ |->vm_cpcall: // Setup protected C frame, call C. ++ | // (lua_State *L, lua_CFunction func, void *ud, lua_CPFunction cp) ++ | saveregs ++ | bis CARG1, zero, L ++ | ldl TMP0, L:CARG1->stack ++ | stl CARG1, SAVE_L(sp) ++ | ldl TMP1, L->top ++ | ldl DISPATCH, L->glref // Setup pointer to dispatch table. ++ | stl CARG1, SAVE_PC(sp) // Any value outside of bytecode is ok. ++ | subl TMP0, TMP1, TMP0 // Compute -savestack(L, L->top). ++ | ldl TMP1, L->cframe ++ | ldi DISPATCH, GG_G2DISP(DISPATCH) ++ | stw TMP0, SAVE_NRES(sp) // Neg. delta means cframe w/o frame. ++ | stw zero, SAVE_ERRF(sp) // No error function. ++ | stl TMP1, SAVE_CFRAME(sp) ++ | stl sp, L->cframe // Add our C frame to cframe chain. ++ | stl L, DISPATCH_GL(cur_L)(DISPATCH) ++ | ldi CFUNCADDR, 0(CARG4) ++ | call r26, 0(CFUNCADDR) // (lua_State *L, lua_CFunction func, void *ud) ++ | bis CRET1, zero, BASE ++ | ldi PC, FRAME_CP(zero) ++ | bne CRET1, <3 // Else continue with the call. ++ | br zero, ->vm_leave_cp // No base? Just remove C frame. ++ | ++ |//----------------------------------------------------------------------- ++ |//-- Metamethod handling ------------------------------------------------ ++ |//----------------------------------------------------------------------- ++ | ++ |// The lj_meta_* functions (except for lj_meta_cat) don't reallocate the ++ |// stack, so BASE doesn't need to be reloaded across these calls. 
++ | ++ |//-- Continuation dispatch ---------------------------------------------- ++ | ++ |->cont_dispatch: ++ | // BASE = meta base, RA = resultptr, RD = (nresults+1) ++ | ldl TMP0, -32(BASE) // Continuation. ++ | bis BASE, zero, RB ++ | bis TMP2, zero, BASE // Restore caller BASE. ++ | ldl LFUNC:TMP1, FRAME_FUNC(TMP2) ++ |.if FFI ++ | cmpulti TMP0, 2, AT ++ |.endif ++ | ldl PC, -24(RB) // Restore PC from [cont|PC]. ++ | cleartp LFUNC:TMP1 ++ | addl RA, RD, TMP2 ++ | ldl TMP1, LFUNC:TMP1->pc ++ | stl TISNIL, -8(TMP2) // Ensure one valid arg. ++ |.if FFI ++ | bne AT, >1 ++ |.endif ++ | // BASE = base, RA = resultptr, RB = meta base ++ | ldl KBASE, PC2PROTO(k)(TMP1) ++ | jmp zero, 0(TMP0) // Jump to continuation. ++ | ++ |.if FFI ++ |1: ++ | ldi TMP1, -32(RB) ++ | bne TMP0, ->cont_ffi_callback // cont = 1: return from FFI callback. ++ | // cont = 0: tailcall from C function. ++ | subl TMP1, BASE, RC ++ | br zero, ->vm_call_tail ++ |.endif ++ | ++ |->cont_cat: // RA = resultptr, RB = meta base ++ | ldw INS, -4(PC) ++ | ldi CARG2, -32(RB) ++ | ldl CRET1, 0(RA) ++ | decode_RB MULTRES, INS ++ | decode_RA RA, INS ++ | addl BASE, MULTRES, TMP1 ++ | stl BASE, L->base ++ | subl CARG2, TMP1, CARG3 ++ | stl CRET1, 0(CARG2) ++ | cmpeq TMP1, CARG2, AT ++ | beq AT, ->BC_CAT_Z ++ | addl RA, BASE, RA ++ | stl CRET1, 0(RA) ++ | br zero, ->cont_nop ++ | ++ |//-- Table indexing metamethods ----------------------------------------- ++ | ++ |->vmeta_tgets1: ++ | ldi CARG3, DISPATCH_GL(tmptv)(DISPATCH) ++ | ldi TMP0, LJ_TSTR(zero) ++ | settp STR:RC, TMP0 ++ | stl STR:RC, 0(CARG3) ++ | br zero, >1 ++ | ++ |->vmeta_tgets: ++ | ldi CARG2, DISPATCH_GL(tmptv)(DISPATCH) ++ | ldi TMP0, LJ_TTAB(zero) ++ | ldi TMP1, LJ_TSTR(zero) ++ | settp TAB:RB, TMP0 ++ | ldi CARG3, DISPATCH_GL(tmptv2)(DISPATCH) ++ | stl TAB:RB, 0(CARG2) ++ | settp STR:RC, TMP1 ++ | stl STR:RC, 0(CARG3) ++ | br zero, >1 ++ | ++ |->vmeta_tgetb: ++ | ldi CARG3, DISPATCH_GL(tmptv)(DISPATCH) ++ | ldi TISNUM, LJ_TISNUM(zero) ++ | 
settp TMP0, TISNUM ++ | stl TMP0, 0(CARG3) ++ | ++ |->vmeta_tgetv: ++ |1: ++ | load_got lj_meta_tget ++ | stl BASE, L->base ++ | bis L, zero, CARG1 ++ | stl PC, SAVE_PC(sp) ++ | call_intern lj_meta_tget // (lua_State *L, TValue *o, TValue *k) ++ | // Returns TValue * (finished) or NULL (metamethod). ++ | ldi TMP1, -FRAME_CONT(BASE) ++ | beq CRET1, >3 ++ | ldl TMP0, 0(CRET1) ++ | stl TMP0, 0(RA) ++ | ins_next ++ | ++ |3: // Call __index metamethod. ++ | // BASE = base, L->top = new base, stack = cont/func/t/k ++ | ldl BASE, L->top ++ | stl PC, -24(BASE) // [cont|PC] ++ | subl BASE, TMP1, PC ++ | ldl LFUNC:RB, FRAME_FUNC(BASE) // Guaranteed to be a function here. ++ | cleartp LFUNC:RB ++ | ldi NARGS8:RC, 16(zero) ++ | br zero, ->vm_call_dispatch_f ++ | ++ |->vmeta_tgetr: ++ | load_got lj_tab_getinth ++ | call_intern lj_tab_getinth // (GCtab *t, int32_t key) ++ | // Returns cTValue * or NULL. ++ | bis TISNIL, zero, CARG2 ++ | beq CRET1, ->BC_TGETR_Z ++ | ldl CARG2, 0(CRET1) ++ | br zero, ->BC_TGETR_Z ++ | ++ |//----------------------------------------------------------------------- ++ | ++ |->vmeta_tsets1: ++ | ldi CARG3, DISPATCH_GL(tmptv)(DISPATCH) ++ | ldi TMP0, LJ_TSTR(zero) ++ | settp STR:RC, TMP0 ++ | stl STR:RC, 0(CARG3) ++ | br zero, >1 ++ | ++ |->vmeta_tsets: ++ | ldi CARG2, DISPATCH_GL(tmptv)(DISPATCH) ++ | ldi TMP0, LJ_TTAB(zero) ++ | ldi TMP1, LJ_TSTR(zero) ++ | settp TAB:RB, TMP0 ++ | ldi CARG3, DISPATCH_GL(tmptv2)(DISPATCH) ++ | stl TAB:RB, 0(CARG2) ++ | settp STR:RC, TMP1 ++ | stl STR:RC, 0(CARG3) ++ | br zero, >1 ++ | ++ |->vmeta_tsetb: // TMP0 = index ++ | ldi CARG3, DISPATCH_GL(tmptv)(DISPATCH) ++ | ldi TISNUM, LJ_TISNUM(zero) ++ | settp TMP0, TISNUM ++ | stl TMP0, 0(CARG3) ++ | ++ |->vmeta_tsetv: ++ |1: ++ | load_got lj_meta_tset ++ | stl BASE, L->base ++ | bis L, zero, CARG1 ++ | stl PC, SAVE_PC(sp) ++ | call_intern lj_meta_tset // (lua_State *L, TValue *o, TValue *k) ++ | // Returns TValue * (finished) or NULL (metamethod). 
++ | ldl TMP2, 0(RA) ++ | beq CRET1, >3 ++ | // NOBARRIER: lj_meta_tset ensures the table is not black. ++ | stl TMP2, 0(CRET1) ++ | ins_next ++ | ++ |3: // Call __newindex metamethod. ++ | // BASE = base, L->top = new base, stack = cont/func/t/k/(v) ++ | ldi TMP1, -FRAME_CONT(BASE) ++ | ldl BASE, L->top ++ | stl PC, -24(BASE) // [cont|PC] ++ | subl BASE, TMP1, PC ++ | ldl LFUNC:RB, FRAME_FUNC(BASE) // Guaranteed to be a function here. ++ | ldi NARGS8:RC, 24(zero) // 3 args for func(t, k, v) ++ | cleartp LFUNC:RB ++ | stl TMP2, 16(BASE) // Copy value to third argument. ++ | br zero, ->vm_call_dispatch_f ++ | ++ |->vmeta_tsetr: ++ | load_got lj_tab_setinth ++ | stl BASE, L->base ++ | bis L, zero, CARG1 ++ | stl PC, SAVE_PC(sp) ++ | call_intern lj_tab_setinth // (lua_State *L, GCtab *t, int32_t key) ++ | // Returns TValue *. ++ | br zero, ->BC_TSETR_Z ++ | ++ |//-- Comparison metamethods --------------------------------------------- ++ | ++ |->vmeta_comp: ++ | // RA/RD point to o1/o2. ++ | bis RA, zero, CARG2 ++ | bis RD, zero, CARG3 ++ | load_got lj_meta_comp ++ | ldi PC, -4(PC) ++ | stl BASE, L->base ++ | bis L, zero, CARG1 ++ | decode_OP CARG4, INS ++ | stl PC, SAVE_PC(sp) ++ | call_intern lj_meta_comp // (lua_State *L, TValue *o1, *o2, int op) ++ | // Returns 0/1 or TValue * (metamethod). 
++ |3: ++ | cmpulti CRET1, 2, TMP1 ++ | beq TMP1, ->vmeta_binop ++ | subw zero, CRET1, TMP2 ++ |4: ++ | ldhu RD, OFS_RD(PC) ++ | ldi PC, 4(PC) ++ | ldih TMP1, -0x2(zero) // -BCBIAS_J*4 ++ | s4addwi RD, 0, RD ++ | addw RD, TMP1, RD ++ | and RD, TMP2, RD ++ | addl PC, RD, PC ++ |->cont_nop: ++ | ins_next ++ | ++ |->cont_ra: // RA = resultptr ++ | ldbu TMP1, -4+OFS_RA(PC) ++ | ldl TMP2, 0(RA) ++ | s8addwi TMP1, 0, TMP1 ++ | addl TMP1, BASE, TMP1 ++ | stl TMP2, 0(TMP1) ++ | br zero, ->cont_nop ++ | ++ |->cont_condt: // RA = resultptr ++ | ldl TMP0, 0(RA) ++ | gettp TMP0, TMP0 ++ | // cmpulti TMP0, LJ_TISTRUECOND, TMP1 ++ | ldi TMP1, LJ_TISTRUECOND(zero) ++ | cmpult TMP0, TMP1, TMP1 ++ | subw zero, TMP1, TMP2 // Branch if result is true. ++ | br zero, <4 ++ | ++ |->cont_condf: // RA = resultptr ++ | ldl TMP0, 0(RA) ++ | gettp TMP0, TMP0 ++ | // cmpulti TMP0, LJ_TISTRUECOND, TMP1 ++ | ldi TMP1, LJ_TISTRUECOND(zero) ++ | cmpult TMP0, TMP1, TMP1 ++ | subwi TMP1, 1, TMP2 // Branch if result is false. ++ | br zero, <4 ++ | ++ |->vmeta_equal: ++ | // CARG1/CARG2 point to o1/o2. TMP0 is set to 0/1. ++ | load_got lj_meta_equal ++ | cleartp LFUNC:CARG3, CARG2 ++ | cleartp LFUNC:CARG2, CARG1 ++ | bis TMP0, zero, CARG4 ++ | ldi PC, -4(PC) ++ | stl BASE, L->base ++ | bis L, zero, CARG1 ++ | stl PC, SAVE_PC(sp) ++ | call_intern lj_meta_equal // (lua_State *L, GCobj *o1, *o2, int ne) ++ | // Returns 0/1 or TValue * (metamethod). ++ | br zero, <3 ++ | ++ |->vmeta_equal_cd: ++ |.if FFI ++ | load_got lj_meta_equal_cd ++ | bis INS, zero, CARG2 ++ | ldi PC, -4(PC) ++ | stl BASE, L->base ++ | bis L, zero, CARG1 ++ | stl PC, SAVE_PC(sp) ++ | call_intern lj_meta_equal_cd // (lua_State *L, BCIns op) ++ | // Returns 0/1 or TValue * (metamethod). 
++ | br zero, <3 ++ |.endif ++ | ++ |->vmeta_istype: ++ | load_got lj_meta_istype ++ | ldi PC, -4(PC) ++ | stl BASE, L->base ++ | bis L, zero, CARG1 ++ | zapi RA, 0xf0, CARG2 ++ | srli CARG2, 3, CARG2 ++ | zapi RD, 0xf0, CARG3 ++ | srli CARG3, 3, CARG3 ++ | stl PC, SAVE_PC(sp) ++ | call_intern lj_meta_istype // (lua_State *L, BCReg ra, BCReg tp) ++ | br zero, ->cont_nop ++ | ++ |//-- Arithmetic metamethods --------------------------------------------- ++ | ++ |->vmeta_unm: ++ | bis RB, zero, RC ++ | ++ |->vmeta_arith: ++ | load_got lj_meta_arith ++ | stl BASE, L->base ++ | bis RA, zero, CARG2 ++ | stl PC, SAVE_PC(sp) ++ | bis RB, zero, CARG3 ++ | bis RC, zero, CARG4 ++ | decode_OP CARG5, INS ++ | bis L, zero, CARG1 ++ | call_intern lj_meta_arith // (lua_State *L, TValue *ra,*rb,*rc, BCReg op) ++ | // Returns NULL (finished) or TValue * (metamethod). ++ | beq CRET1, ->cont_nop ++ | ++ | // Call metamethod for binary op. ++ |->vmeta_binop: ++ | // BASE = old base, CRET1 = new base, stack = cont/func/o1/o2 ++ | subl CRET1, BASE, TMP1 ++ | stl PC, -24(CRET1) // [cont|PC] ++ | bis BASE, zero, TMP2 ++ | ldi PC, FRAME_CONT(TMP1) ++ | bis CRET1, zero, BASE ++ | ldi NARGS8:RC, 16(zero) // 2 args for func(o1, o2). ++ | br zero, ->vm_call_dispatch ++ | ++ |->vmeta_len: ++ | // CARG2 already set by BC_LEN. ++#if LJ_52 ++ | bis CARG1, zero, MULTRES ++#endif ++ | load_got lj_meta_len ++ | stl BASE, L->base ++ | bis L, zero, CARG1 ++ | stl PC, SAVE_PC(sp) ++ | call_intern lj_meta_len // (lua_State *L, TValue *o) ++ | // Returns NULL (retry) or TValue * (metamethod base). ++#if LJ_52 ++ | bne CRET1, ->vmeta_binop // Binop call for compatibility. ++ | bis MULTRES, zero, CARG1 ++ | br zero, ->BC_LEN_Z ++#else ++ | br zero, ->vmeta_binop // Binop call for compatibility. ++#endif ++ | ++ |//-- Call metamethod ---------------------------------------------------- ++ | ++ |->vmeta_call: // Resolve and call __call metamethod. 
++ | // TMP2 = old base, BASE = new base, RC = nargs*8 ++ | load_got lj_meta_call ++ | stl TMP2, L->base // This is the callers base! ++ | ldi CARG2, -16(BASE) ++ | stl PC, SAVE_PC(sp) ++ | addl BASE, RC, CARG3 ++ | bis L, zero, CARG1 ++ | bis NARGS8:RC, zero, MULTRES ++ | call_intern lj_meta_call // (lua_State *L, TValue *func, TValue *top) ++ | ldl LFUNC:RB, FRAME_FUNC(BASE) // Guaranteed to be a function here. ++ | ldi NARGS8:RC, 8(MULTRES) // Got one more argument now. ++ | cleartp LFUNC:RB ++ | ins_call ++ | ++ |->vmeta_callt: // Resolve __call for BC_CALLT. ++ | // BASE = old base, RA = new base, RC = nargs*8 ++ | load_got lj_meta_call ++ | stl BASE, L->base ++ | subli RA, 16, CARG2 ++ | stl PC, SAVE_PC(sp) ++ | addl RA, RC, CARG3 ++ | bis L, zero, CARG1 ++ | bis NARGS8:RC, zero, MULTRES ++ | call_intern lj_meta_call // (lua_State *L, TValue *func, TValue *top) ++ | ldl RB, FRAME_FUNC(RA) // Guaranteed to be a function here. ++ | ldl TMP1, FRAME_PC(BASE) ++ | addli MULTRES, 8, NARGS8:RC // Got one more argument now. ++ | cleartp LFUNC:CARG3, RB ++ | br zero, ->BC_CALLT_Z ++ | ++ |//-- Argument coercion for 'for' statement ------------------------------ ++ | ++ |->vmeta_for: ++ | load_got lj_meta_for ++ | stl BASE, L->base ++ | bis RA, zero, CARG2 ++ | stl PC, SAVE_PC(sp) ++ | bis INS, zero, MULTRES ++ | bis L, zero, CARG1 ++ | call_intern lj_meta_for // (lua_State *L, TValue *base) ++ |.if JIT ++ | decode_OP TMP0, MULTRES ++ | ldi TMP1, BC_JFORI(zero) ++ |.endif ++ | decode_RA RA, MULTRES ++ | decode_RD RD, MULTRES ++ |.if JIT ++ | cmpeq TMP0, TMP1, AT ++ | bne AT, =>BC_JFORI ++ |.endif ++ | br zero, =>BC_FORI ++ | ++ |//----------------------------------------------------------------------- ++ |//-- Fast functions ----------------------------------------------------- ++ |//----------------------------------------------------------------------- ++ | ++ |.macro .ffunc, name ++ |->ff_ .. name: ++ |.endmacro ++ | ++ |.macro .ffunc_1, name ++ |->ff_ .. 
name: ++ | ldl CARG1, 0(BASE) ++ | beq NARGS8:RC, ->fff_fallback ++ |.endmacro ++ | ++ |.macro .ffunc_2, name ++ |->ff_ .. name: ++ | cmpulti NARGS8:RC, 16, TMP0 ++ | ldl CARG1, 0(BASE) ++ | ldl CARG2, 8(BASE) ++ | bne TMP0, ->fff_fallback ++ |.endmacro ++ | ++ |.macro .ffunc_n, name ++ |->ff_ .. name: ++ | ldl CARG1, 0(BASE) ++ | fldd FCARG1, 0(BASE) ++ | beq NARGS8:RC, ->fff_fallback ++ | checknum CARG1, ->fff_fallback ++ |.endmacro ++ | ++ |.macro .ffunc_nn, name ++ |->ff_ .. name: ++ | ldl CARG1, 0(BASE) ++ | ldl CARG2, 8(BASE) ++ | cmpulti NARGS8:RC, 16, TMP0 ++ | gettp TMP1, CARG1 ++ | bne TMP0, ->fff_fallback ++ | gettp TMP2, CARG2 ++ | // cmpulti TMP1, LJ_TISNUM, TMP1 ++ | // cmpulti TMP2, LJ_TISNUM, TMP2 ++ | ldi AT, LJ_TISNUM(zero) ++ | cmpult TMP1, AT, TMP1 ++ | cmpult TMP2, AT, TMP2 ++ | fldd FCARG1, 0(BASE) ++ | and TMP1, TMP2, TMP1 ++ | fldd FCARG2, 8(BASE) ++ | beq TMP1, ->fff_fallback ++ |.endmacro ++ | ++ |// Inlined GC threshold check. ++ |.macro ffgccheck ++ | ldl TMP0, DISPATCH_GL(gc.total)(DISPATCH) ++ | ldl TMP1, DISPATCH_GL(gc.threshold)(DISPATCH) ++ | cmpult TMP0, TMP1, AT ++ | bne AT, >1 ++ | br ra, ->fff_gcstep ++ |1: ++ |.endmacro ++ | ++ |//-- Base library: checks ----------------------------------------------- ++ |.ffunc_1 assert ++ | gettp TMP1, CARG1 ++ | // cmpulti TMP1, LJ_TISTRUECOND, TMP1 ++ | ldi AT, LJ_TISTRUECOND(zero) ++ | cmpult TMP1, AT, TMP1 ++ | ldi RA, -16(BASE) ++ | beq TMP1, ->fff_fallback ++ | ldl PC, FRAME_PC(BASE) ++ | addwi NARGS8:RC, 8, RD // Compute (nresults+1)*8. ++ | ldi TMP1, 8(BASE) ++ | addl RA, RD, TMP2 ++ | stl CARG1, 0(RA) ++ | cmpeq BASE, TMP2, AT ++ | bne AT, ->fff_res // Done if exactly 1 argument. 
++ |1: ++ | ldl TMP0, 0(TMP1) ++ | stl TMP0, -16(TMP1) ++ | bis TMP1, zero, AT ++ | ldi TMP1, 8(TMP1) ++ | cmpeq AT, TMP2, AT ++ | beq AT, <1 ++ | br zero, ->fff_res ++ | ++ |.ffunc_1 type ++ | gettp TMP0, CARG1 ++ | ldi TMP1, ~LJ_TISNUM(zero) ++ | addwi TMP1, 0, TMP1 ++ | ldi TISNUM, LJ_TISNUM(zero) ++ | cmpult TISNUM, TMP0, TMP2 ++ | ornot zero, TMP0, AT // ~TMP0 ++ | seleq TMP2, TMP1, AT, AT ++ | s8addl AT, CFUNC:RB, AT ++ | ldl CARG1, CFUNC:AT->upvalue ++ | br zero, ->fff_restv ++ | ++ |//-- Base library: getters and setters --------------------------------- ++ | ++ |.ffunc_1 getmetatable ++ | gettp TMP2, CARG1 ++ | ldi TMP0, -LJ_TTAB(TMP2) ++ | ldi TMP1, -LJ_TUDATA(TMP2) ++ | seleq TMP0, zero, TMP1, TMP0 ++ | cleartp TAB:CARG1 ++ | bne TMP0, >6 ++ |1: // Field metatable must be at same offset for GCtab and GCudata! ++ | ldl TAB:RB, TAB:CARG1->metatable ++ |2: ++ | ldl STR:RC, DISPATCH_GL(gcroot[GCROOT_MMNAME+MM_metatable])(DISPATCH) ++ | ldi CARG1, LJ_TNIL(zero) ++ | beq TAB:RB, ->fff_restv ++ | ldw TMP0, TAB:RB->hmask ++ | ldw TMP1, STR:RC->hash ++ | ldl NODE:TMP2, TAB:RB->node ++ | and TMP1, TMP0, TMP1 // idx = str->hash & tab->hmask ++ | slli TMP1, 5, TMP0 ++ | slli TMP1, 3, TMP1 ++ | subl TMP0, TMP1, TMP1 ++ | addl NODE:TMP2, TMP1, NODE:TMP2 // node = tab->node + (idx*32-idx*8) ++ | ldi CARG4, LJ_TSTR(zero) ++ | addwi CARG4, 0, CARG4 ++ | settp STR:RC, CARG4 // Tagged key to look for. ++ |3: // Rearranged logic, because we expect _not_ to find the key. ++ | ldl TMP0, NODE:TMP2->key ++ | ldl CARG1, NODE:TMP2->val ++ | ldl NODE:TMP2, NODE:TMP2->next ++ | ldi TMP3, LJ_TTAB(zero) ++ | cmpeq RC, TMP0, AT ++ | bne AT, >5 ++ | bne NODE:TMP2, <3 ++ |4: ++ | bis RB, zero, CARG1 ++ | settp CARG1, TMP3 ++ | br zero, ->fff_restv // Not found, keep default result. ++ |5: ++ | cmpeq CARG1, TISNIL, AT ++ | beq AT, ->fff_restv ++ | br zero, <4 // Ditto for nil value. 
++ | ++ |6: ++ | // cmpulti TMP2, LJ_TISNUM, AT ++ | ldi TISNUM, LJ_TISNUM(zero) ++ | cmpult TMP2, TISNUM, AT ++ | selne AT, TISNUM, TMP2, TMP2 ++ | slli TMP2, 3, TMP2 ++ | subl DISPATCH, TMP2, TMP0 ++ | ldl TAB:RB, DISPATCH_GL(gcroot[GCROOT_BASEMT])-8(TMP0) ++ | br zero, <2 ++ | ++ |.ffunc_2 setmetatable ++ | // Fast path: no mt for table yet and not clearing the mt. ++ | checktp TMP1, CARG1, -LJ_TTAB, ->fff_fallback ++ | gettp TMP3, CARG2 ++ | ldl TAB:TMP0, TAB:TMP1->metatable ++ | ldbu TMP2, TAB:TMP1->marked ++ | ldi AT, -LJ_TTAB(TMP3) ++ | cleartp TAB:CARG2 ++ | bis AT, TAB:TMP0, AT ++ | bne AT, ->fff_fallback ++ | andi TMP2, LJ_GC_BLACK, AT // isblack(table) ++ | stl TAB:CARG2, TAB:TMP1->metatable ++ | beq AT, ->fff_restv ++ | barrierback TAB:TMP1, TMP2, TMP0, ->fff_restv ++ | ++ |.ffunc rawget ++ | ldl CARG2, 0(BASE) ++ | cmpulti NARGS8:RC, 16, TMP0 ++ | load_got lj_tab_get ++ | gettp TMP1, CARG2 ++ | cleartp CARG2 ++ | ldi TMP1, -LJ_TTAB(TMP1) ++ | bis TMP0, TMP1, TMP0 ++ | ldi CARG3, 8(BASE) ++ | bne TMP0, ->fff_fallback ++ | bis L, zero, CARG1 ++ | call_intern lj_tab_get // (lua_State *L, GCtab *t, cTValue *key) ++ | // Returns cTValue *. ++ | ldl CARG1, 0(CRET1) ++ | br zero, ->fff_restv ++ | ++ |//-- Base library: conversions ------------------------------------------ ++ | ++ |.ffunc tonumber ++ | // Only handles the number case inline (without a base argument). ++ | ldl CARG1, 0(BASE) ++ | xori NARGS8:RC, 8, TMP0 // Exactly one number argument. ++ | gettp TMP1, CARG1 ++ | ldi TISNUM, LJ_TISNUM(zero) ++ | cmpult TISNUM, TMP1, TMP1 ++ | bis TMP0, TMP1, TMP0 ++ | bne TMP0, ->fff_fallback // No args or CARG1 is not number ++ | br zero, ->fff_restv ++ | ++ |.ffunc_1 tostring ++ | // Only handles the string or number case inline. ++ | gettp TMP0, CARG1 ++ | ldi AT, -LJ_TSTR(TMP0) ++ | // A __tostring method in the string base metatable is ignored. ++ | ldl TMP1, DISPATCH_GL(gcroot[GCROOT_BASEMT_NUM])(DISPATCH) ++ | beq AT, ->fff_restv // String key? 
++ | // Handle numbers inline, unless a number base metatable is present. ++ | ldi TISNUM, LJ_TISNUM(zero) ++ | cmpult TISNUM, TMP0, TMP0 ++ | stl BASE, L->base // Add frame since C call can throw. ++ | bis TMP0, TMP1, TMP0 ++ | bne TMP0, ->fff_fallback ++ | stl PC, SAVE_PC(sp) // Redundant (but a defined value). ++ | ffgccheck ++ | load_got lj_strfmt_number ++ | bis L, zero, CARG1 ++ | bis BASE, zero, CARG2 ++ | call_intern lj_strfmt_number // (lua_State *L, cTValue *o) ++ | // Returns GCstr *. ++ | ldi AT, LJ_TSTR(zero) ++ | settp CRET1, AT ++ | bis CRET1, zero, CARG1 ++ | br zero, ->fff_restv ++ | ++ |//-- Base library: iterators ------------------------------------------- ++ | ++ |.ffunc_1 next ++ | checktp CARG2, CARG1, -LJ_TTAB, ->fff_fallback ++ | addl BASE, NARGS8:RC, TMP2 ++ | ldl PC, FRAME_PC(BASE) ++ | stl TISNIL, 0(TMP2) // Set missing 2nd arg to nil. ++ | load_got lj_tab_next ++ | stl BASE, L->base // Add frame since C call can throw. ++ | stl BASE, L->top // Dummy frame length is ok. ++ | ldi CARG3, 8(BASE) ++ | stl PC, SAVE_PC(sp) ++ | bis L, zero, CARG1 ++ | call_intern lj_tab_next // (GCtab *t, cTValue *key, TValue *o) ++ | // Returns 1=found, 0=end, -1=error. ++ | bis TISNIL, zero, CARG1 ++ | beq CRET1, ->fff_restv // End of traversal: return nil. 
++ | ldl TMP0, 8(BASE) ++ | ldi RA, -16(BASE) ++ | ldl TMP2, 16(BASE) ++ | stl TMP0, 0(RA) ++ | stl TMP2, 8(RA) ++ | ldi RD, (2+1)*8(zero) ++ | br zero, ->fff_res ++ | ++ |.ffunc_1 pairs ++ | checktp TAB:TMP1, CARG1, -LJ_TTAB, ->fff_fallback ++ | ldl PC, FRAME_PC(BASE) ++#if LJ_52 ++ | ldl TAB:TMP2, TAB:TMP1->metatable ++ | ldl TMP0, CFUNC:RB->upvalue[0] ++ | ldi RA, -16(BASE) ++ | bne TAB:TMP2, ->fff_fallback ++#else ++ | ldl TMP0, CFUNC:RB->upvalue[0] ++ | ldi RA, -16(BASE) ++#endif ++ | stl TISNIL, 0(BASE) ++ | stl CARG1, -8(BASE) ++ | stl TMP0, 0(RA) ++ | ldi RD, (3+1)*8(zero) ++ | br zero, ->fff_res ++ | ++ |.ffunc_2 ipairs_aux ++ | checktab CARG1, ->fff_fallback ++ | checkint CARG2, ->fff_fallback ++ | ldw TMP0, TAB:CARG1->asize ++ | ldl TMP1, TAB:CARG1->array ++ | ldl PC, FRAME_PC(BASE) ++ | addwi CARG2, 0, TMP2 ++ | addwi TMP2, 1, TMP2 ++ | cmpult TMP2, TMP0, AT ++ | ldi RA, -16(BASE) ++ | zapi TMP2, 0xf0, TMP0 ++ | ldi TISNUM, LJ_TISNUM(zero) ++ | settp TMP0, TISNUM ++ | stl TMP0, 0(RA) ++ | beq AT, >2 // Not in array part? ++ | s8addl TMP2, TMP1, TMP3 ++ | ldl TMP1, 0(TMP3) ++ |1: ++ | ldi RD, (0+1)*8(zero) ++ | cmpeq TMP1, TISNIL, AT ++ | bne AT, ->fff_res // End of iteration, return 0 results. ++ | stl TMP1, -8(BASE) ++ | ldi RD, (2+1)*8(zero) ++ | br zero, ->fff_res ++ |2: // Check for empty hash part first. Otherwise call C function. ++ | ldw TMP0, TAB:CARG1->hmask ++ | ldi RD, (0+1)*8(zero) ++ | load_got lj_tab_getinth ++ | beq TMP0, ->fff_res ++ | bis TMP2, zero, CARG2 ++ | call_intern lj_tab_getinth // (GCtab *t, int32_t key) ++ | // Returns cTValue * or NULL. 
++ | ldi RD, (0+1)*8(zero)
++ | beq CRET1, ->fff_res
++ | ldl TMP1, 0(CRET1)
++ | br zero, <1
++ |
++ |.ffunc_1 ipairs
++ | checktp TAB:TMP1, CARG1, -LJ_TTAB, ->fff_fallback
++ | ldl PC, FRAME_PC(BASE)
++#if LJ_52
++ | ldl TAB:TMP2, TAB:TMP1->metatable
++#endif
++ | ldl CFUNC:TMP0, CFUNC:RB->upvalue[0]
++ | ldi RA, -16(BASE)
++#if LJ_52
++ | bne TAB:TMP2, ->fff_fallback
++#endif
++ | ldi TISNUM, LJ_TISNUM(zero)
++ | slli TISNUM, 47, TMP1
++ | stl CARG1, -8(BASE)
++ | stl TMP1, 0(BASE)
++ | stl CFUNC:TMP0, 0(RA)
++ | ldi RD, (3+1)*8(zero)
++ | br zero, ->fff_res
++ |
++ |//-- Base library: catch errors ----------------------------------------
++ |
++ |.ffunc pcall
++ | ldi NARGS8:RC, -8(NARGS8:RC)
++ | ldbu TMP3, DISPATCH_GL(hookmask)(DISPATCH)
++ | bis BASE, zero, TMP2
++ | cmplt NARGS8:RC, zero, AT
++ | bne AT, ->fff_fallback
++ | ldi BASE, 16(BASE)
++ | // Remember active hook before pcall.
++ | zapi TMP3, 0xf0, TMP3
++ | srli TMP3, HOOK_ACTIVE_SHIFT, TMP3
++ | andi TMP3, 1, TMP3
++ | ldi PC, 16+FRAME_PCALL(TMP3)
++ | beq NARGS8:RC, ->vm_call_dispatch
++ |1:
++ | addl BASE, NARGS8:RC, TMP0
++ |2:
++ | ldl TMP1, -16(TMP0)
++ | stl TMP1, -8(TMP0)
++ | ldi TMP0, -8(TMP0)
++ | cmpeq TMP0, BASE, AT
++ | beq AT, <2
++ | br zero, ->vm_call_dispatch
++ |
++ |.ffunc xpcall
++ | ldi NARGS8:TMP0, -16(NARGS8:RC)
++ | ldl CARG1, 0(BASE)
++ | ldl CARG2, 8(BASE)
++ | ldbu TMP1, DISPATCH_GL(hookmask)(DISPATCH)
++ | cmplt NARGS8:TMP0, zero, AT
++ | bne AT, ->fff_fallback
++ | gettp TMP2, CARG2
++ | ldi TMP2, -LJ_TFUNC(TMP2)
++ | bne TMP2, ->fff_fallback // Traceback must be a function.
++ | bis BASE, zero, TMP2
++ | bis NARGS8:TMP0, zero, NARGS8:RC
++ | ldi BASE, 24(BASE)
++ | // Remember active hook before pcall.
++ | zapi TMP1, 0xf0, TMP3
++ | srli TMP3, HOOK_ACTIVE_SHIFT, TMP3
++ | stl CARG2, 0(TMP2) // Swap function and traceback.
++ | andi TMP3, 1, TMP3 ++ | stl CARG1, 8(TMP2) ++ | ldi PC, 24+FRAME_PCALL(TMP3) ++ | beq NARGS8:RC, ->vm_call_dispatch ++ | br zero, <1 ++ | ++ |//-- Coroutine library -------------------------------------------------- ++ | ++ |.macro coroutine_resume_wrap, resume ++ |.if resume ++ |.ffunc_1 coroutine_resume ++ | checktp CARG1, CARG1, -LJ_TTHREAD, ->fff_fallback ++ |.else ++ |.ffunc coroutine_wrap_aux ++ | ldl L:CARG1, CFUNC:RB->upvalue[0].gcr ++ | cleartp L:CARG1 ++ |.endif ++ | ldbu TMP0, L:CARG1->status ++ | ldl TMP1, L:CARG1->cframe ++ | ldl CARG2, L:CARG1->top ++ | ldl TMP2, L:CARG1->base ++ | subwi TMP0, LUA_YIELD, CARG4 ++ | addl CARG2, TMP0, CARG3 ++ | ldi TMP3, 8(CARG2) ++ | seleq CARG4, CARG2, TMP3, CARG2 ++ | cmplt zero, CARG4, AT ++ | bne AT, ->fff_fallback // st > LUA_YIELD? ++ | xor TMP2, CARG3, TMP2 ++ | bis TMP2, TMP0, CARG4 ++ | bne TMP1, ->fff_fallback // cframe != 0? ++ | ldl TMP0, L:CARG1->maxstack ++ | ldl PC, FRAME_PC(BASE) ++ | beq CARG4, ->fff_fallback // base == top && st == 0? ++ | addl CARG2, NARGS8:RC, TMP2 ++ | cmpult TMP0, TMP2, CARG4 ++ | stl BASE, L->base ++ | stl PC, SAVE_PC(sp) ++ | bne CARG4, ->fff_fallback // Stack overflow? ++ |1: ++ |.if resume ++ | ldi BASE, 8(BASE) // Keep resumed thread in stack for GC. ++ | ldi NARGS8:RC, -8(NARGS8:RC) ++ | ldi TMP2, -8(TMP2) ++ |.endif ++ | stl TMP2, L:CARG1->top ++ | stl BASE, L->top ++ | addl BASE, NARGS8:RC, TMP1 ++ | bis CARG2, zero, CARG3 ++ |2: // Move args to coroutine. ++ | ldl TMP0, 0(BASE) ++ | cmpult BASE, TMP1, TMP3 ++ | ldi BASE, 8(BASE) ++ | beq TMP3, >3 ++ | stl TMP0, 0(CARG3) ++ | ldi CARG3, 8(CARG3) ++ | br zero, <2 ++ |3: ++ | bis L:CARG1, zero, L:RA ++ | br ra, ->vm_resume // (lua_State *L, TValue *base, 0, 0) ++ | // Returns thread status. 
++ |4: ++ | ldl TMP2, L:RA->base ++ | // cmpulti CRET1, LUA_YIELD+1, TMP1 ++ | ldi TMP1, LUA_YIELD+1(zero) ++ | cmpult CRET1, TMP1, TMP1 ++ | ldl TMP3, L:RA->top ++ | li_vmstate INTERP ++ | ldl BASE, L->base ++ | stl L, DISPATCH_GL(cur_L)(DISPATCH) ++ | st_vmstate ++ | subl TMP3, TMP2, RD ++ | beq TMP1, >8 ++ | ldl TMP0, L->maxstack ++ | addl BASE, RD, TMP1 ++ | beq RD, >6 // No results? ++ | addl TMP2, RD, TMP3 ++ | cmpult TMP0, TMP1, AT ++ | bne AT, >9 // Need to grow stack? ++ | stl TMP2, L:RA->top // Clear coroutine stack. ++ | bis BASE, zero, TMP1 ++ |5: // Move results from coroutine. ++ | ldl TMP0, 0(TMP2) ++ | ldi TMP2, 8(TMP2) ++ | stl TMP0, 0(TMP1) ++ | ldi TMP1, 8(TMP1) ++ | cmpult TMP2, TMP3, AT ++ | bne AT, <5 ++ |6: ++ |.if resume ++ | mov_true TMP1 ++ | ldi RD, 16(RD) ++ |7: ++ | stl TMP1, -8(BASE) // Prepend true/false to results. ++ | ldi RA, -8(BASE) ++ |.else ++ | bis BASE, zero, RA ++ | ldi RD, 8(RD) ++ |.endif ++ | andi PC, FRAME_TYPE, TMP0 ++ | stl PC, SAVE_PC(sp) ++ | bis RD, zero, MULTRES ++ | beq TMP0, ->BC_RET_Z ++ | br zero, ->vm_return ++ | ++ |8: // Coroutine returned with error (at co->top-1). ++ |.if resume ++ | ldi TMP3, -8(TMP3) ++ | mov_false TMP1 ++ | addwi zero, (2+1)*8, RD ++ | ldl TMP0, 0(TMP3) ++ | stl TMP3, L:RA->top // Remove error from coroutine stack. ++ | stl TMP0, 0(BASE) // Copy error message. ++ | br zero, <7 ++ |.else ++ | load_got lj_ffh_coroutine_wrap_err ++ | bis L, zero, CARG1 ++ | bis L:RA, zero, CARG2 ++ | call_intern lj_ffh_coroutine_wrap_err // (lua_State *L, lua_State *co) ++ |.endif ++ | ++ |9: // Handle stack expansion on return from yield. 
++ | load_got lj_state_growstack ++ | bis L, zero, CARG1 ++ | zapi RD, 0xf0, CARG2 ++ | srli CARG2, 3, CARG2 ++ | call_intern lj_state_growstack // (lua_State *L, int n) ++ | ldi CRET1, 0(zero) ++ | br zero, <4 ++ |.endmacro ++ | ++ | coroutine_resume_wrap 1 // coroutine.resume ++ | coroutine_resume_wrap 0 // coroutine.wrap ++ | ++ |.ffunc coroutine_yield ++ | ldl TMP0, L->cframe ++ | addl BASE, NARGS8:RC, TMP1 ++ | addwi zero, LUA_YIELD, CRET1 ++ | stl BASE, L->base ++ | andi TMP0, CFRAME_RESUME, TMP0 ++ | stl TMP1, L->top ++ | beq TMP0, ->fff_fallback ++ | stl zero, L->cframe ++ | stb CRET1, L->status ++ | br zero, ->vm_leave_unw ++ | ++ |//-- Math library ------------------------------------------------------- ++ | ++ |.macro math_round, func ++ |->ff_math_ .. func: ++ | ldl CARG1, 0(BASE) ++ | gettp TMP0, CARG1 ++ | beq NARGS8:RC, ->fff_fallback ++ | ldi TISNUM, LJ_TISNUM(zero) ++ | cmpeq TMP0, TISNUM, AT ++ | bne AT, ->fff_restv ++ | fldd FCARG1, 0(BASE) ++ | ldi TISNUM, LJ_TISNUM(zero) ++ | cmpult TMP0, TISNUM, AT ++ | beq AT, ->fff_fallback ++ | br ra, ->vm_ .. func ++ | br zero, ->fff_resn ++ |.endmacro ++ | ++ | math_round floor ++ | math_round ceil ++ | ++ |.ffunc_1 math_abs ++ | gettp CARG2, CARG1 ++ | ldi TMP2, -LJ_TISNUM(CARG2) ++ | addwi CARG1, 0, TMP1 ++ | bne TMP2, >1 ++ | addwi TMP1, 0, TMP0 ++ | srai TMP0, 31, TMP0 // Extract sign. int ++ | xor TMP1, TMP0, TMP1 ++ | subl TMP1, TMP0, CARG1 ++ | slli CARG1, 32, TMP3 ++ | ldi TISNUM, LJ_TISNUM(zero) ++ | settp CARG1, TISNUM ++ | cmplt TMP3, zero, AT ++ | beq AT, ->fff_restv ++ | ldi CARG1, 0x41e(zero) // 2^31 as a double. ++ | slli CARG1, 4, CARG1 // 0x41e0 ++ | addwi CARG1, 0, CARG1 ++ | slli CARG1, 48, CARG1 ++ | br zero, ->fff_restv ++ |1: ++ | // cmpulti CARG2, LJ_TISNUM, TMP2 ++ | ldi TMP2, LJ_TISNUM(zero) ++ | cmpult CARG2, TMP2, TMP2 ++ | .DEXTM CARG1, CARG1, 0, 63 ++ | beq TMP2, ->fff_fallback // int ++ |// fallthrough ++ | ++ |->fff_restv: ++ | // CARG1 = TValue result. 
++ | ldl PC, FRAME_PC(BASE) ++ | ldi RA, -16(BASE) ++ | stl CARG1, -16(BASE) ++ |->fff_res1: ++ | // RA = results, PC = return. ++ | ldi RD, (1+1)*8(zero) ++ |->fff_res: ++ | // RA = results, RD = (nresults+1)*8, PC = return. ++ | andi PC, FRAME_TYPE, TMP0 ++ | bis RD, zero, MULTRES ++ | ldi RA, -16(BASE) ++ | bne TMP0, ->vm_return ++ | ldw INS, -4(PC) ++ | decode_RB RB, INS ++ |5: ++ | cmpult RD, RB, TMP2 ++ | decode_RA TMP0, INS ++ | bne TMP2, >6 // More results expected? ++ | // Adjust BASE. KBASE is assumed to be set for the calling frame. ++ | subl RA, TMP0, BASE ++ | ins_next ++ | ++ |6: // Fill up results with nil. ++ | addl RA, RD, TMP1 ++ | ldi RD, 8(RD) ++ | stl TISNIL, -8(TMP1) ++ | br zero, <5 ++ | ++ |.macro math_extern, func ++ | .ffunc_n math_ .. func ++ | load_got func ++ | call_extern ++ | br zero, ->fff_resn ++ |.endmacro ++ | ++ |.macro math_extern2, func ++ | .ffunc_nn math_ .. func ++ | load_got func ++ | call_extern ++ | br zero, ->fff_resn ++ |.endmacro ++ | ++ |.ffunc_n math_sqrt ++ | fsqrtd FCARG1, FCRET1 ++ |->fff_resn: ++ | ldl PC, FRAME_PC(BASE) ++ | fstd FCRET1, -16(BASE) ++ | br zero, ->fff_res1 ++ | ++ |.ffunc math_log ++ | ldi TMP1, 8(zero) ++ | ldl CARG1, 0(BASE) ++ | fldd FCARG1, 0(BASE) ++ | cmpeq NARGS8:RC, TMP1, AT ++ | beq AT, ->fff_fallback // Need exactly 1 argument. 
++ | checknum CARG1, ->fff_fallback ++ | load_got log ++ | call_extern ++ | br zero, ->fff_resn ++ | ++ | math_extern log10 ++ | math_extern exp ++ | math_extern sin ++ | math_extern cos ++ | math_extern tan ++ | math_extern asin ++ | math_extern acos ++ | math_extern atan ++ | math_extern sinh ++ | math_extern cosh ++ | math_extern tanh ++ | math_extern2 pow ++ | math_extern2 atan2 ++ | math_extern2 fmod ++ | ++ |.ffunc_2 math_ldexp ++ | checknum CARG1, ->fff_fallback ++ | checkint CARG2, ->fff_fallback ++ | load_got ldexp ++ | fldd FCARG1, 0(BASE) ++ | ldw CARG2, 8(BASE) // (double x, int exp) ++ | call_extern ++ | br zero, ->fff_resn ++ | ++ |.ffunc_n math_frexp ++ | load_got frexp ++ | ldl PC, FRAME_PC(BASE) ++ | ldi CARG2, DISPATCH_GL(tmptv)(DISPATCH) ++ | call_extern ++ | ldw TMP1, DISPATCH_GL(tmptv)(DISPATCH) ++ | ldi RA, -16(BASE) ++ | ifmovs TMP1, FCARG2 ++ | fstd FCRET1, 0(RA) ++ | fcvtwl FCARG2, FCARG2 ++ | fcvtld FCARG2, FCARG2 ++ | fstd FCARG2, 8(RA) ++ | ldi RD, (2+1)*8(zero) ++ | br zero, ->fff_res ++ | ++ |.ffunc_n math_modf ++ | load_got modf ++ | ldl PC, FRAME_PC(BASE) ++ | ldi CARG2, -16(BASE) ++ | ldi RA, -16(BASE) ++ | call_extern ++ | fstd FCRET1, -8(BASE) ++ | ldi RD, (2+1)*8(zero) ++ | br zero, ->fff_res ++ | ++ | ++ |.macro math_minmax, name, intins, fpins ++ | .ffunc_1 name ++ | addl BASE, NARGS8:RC, TMP3 ++ | addli BASE, 8, TMP2 ++ | checkint CARG1, >5 ++ |1: // Handle integers. ++ | ldl CARG2, 0(TMP2) ++ | cmpeq TMP2, TMP3, AT ++ | bne AT, ->fff_restv ++ | addwi CARG1, 0, CARG1 ++ | checkint CARG2, >3 ++ | addwi CARG2, 0, CARG2 ++ | cmplt CARG1, CARG2, AT ++ | intins AT, CARG2, CARG1, CARG1 ++ | ldi TMP2, 8(TMP2) ++ | zapi CARG1, 0xf0, CARG1 ++ | ldi TISNUM, LJ_TISNUM(zero) ++ | settp CARG1, TISNUM ++ | br zero, <1 ++ | ++ |3: // Convert intermediate result to number and continue with number loop. 
++ | ifmovs CARG1, FCRET1 ++ | checknum CARG2, ->fff_fallback ++ | fcvtwl FCRET1, FCRET1 ++ | fcvtld FCRET1, FCRET1 ++ | fldd FCARG1, 0(TMP2) ++ | br zero, >7 ++ | ++ |5: ++ | fldd FCRET1, 0(BASE) ++ | ldl CARG2, 0(TMP2) ++ | checknum CARG1, ->fff_fallback ++ |6: // Handle numbers. ++ | cmpeq TMP2, TMP3, AT ++ | bne AT, ->fff_resn ++ | fldd FCARG1, 0(TMP2) ++ | checknum CARG2, >8 ++ |7: ++ | fcmplt FCRET1, FCARG1, FAT ++ | fpins FAT, FCARG1, FCRET1, FCRET1 ++ | ldi TMP2, 8(TMP2) ++ | br zero, <6 ++ | ++ |8: // Convert integer to number and continue with number loop. ++ | flds FCARG1, 0(TMP2) ++ | checkint CARG2, ->fff_fallback ++ | fcvtwl FCARG1, FCARG1 ++ | fcvtld FCARG1, FCARG1 ++ | br zero, <7 ++ | ++ |.endmacro ++ | ++ | math_minmax math_min, seleq, fseleq ++ | math_minmax math_max, selne, fselne ++ | ++ |//-- String library ----------------------------------------------------- ++ | ++ |.ffunc string_byte // Only handle the 1-arg case here. ++ | ldl CARG1, 0(BASE) ++ | gettp TMP0, CARG1 ++ | xori NARGS8:RC, 8, TMP1 ++ | ldi TMP0, -LJ_TSTR(TMP0) ++ | bis TMP1, TMP0, TMP1 ++ | cleartp STR:CARG1 ++ | bne TMP1, ->fff_fallback // Need exactly 1 string argument. ++ | ldw TMP0, STR:CARG1->len ++ | ldl PC, FRAME_PC(BASE) ++ | cmpult zero, TMP0, RD ++ | ldbu TMP2, STR:CARG1[1] // Access is always ok (NUL at end). ++ | addwi RD, 1, RD ++ | s8addwi RD, 0, RD // RD = ((str->len != 0)+1)*8 ++ | ldi TISNUM, LJ_TISNUM(zero) ++ | settp TMP2, TISNUM ++ | stl TMP2, -16(BASE) ++ | br zero, ->fff_res ++ | ++ |.ffunc string_char // Only handle the 1-arg case here. ++ | ffgccheck ++ | ldl CARG1, 0(BASE) ++ | gettp TMP0, CARG1 ++ | xori NARGS8:RC, 8, TMP1 // Need exactly 1 argument. ++ | ldi TMP0, -LJ_TISNUM(TMP0) // Integer. ++ | ldi TMP2, 255(zero) ++ | addwi CARG1, 0, CARG1 ++ | bis TMP1, TMP0, TMP1 ++ | cmpult TMP2, CARG1, TMP2 // !(255 < n). 
++ | bis TMP1, TMP2, TMP1 ++ | ldi CARG3, 1(zero) ++ | bne TMP1, ->fff_fallback ++ | ldi CARG2, TMPD_OFS(sp) ++ | stb CARG1, TMPD(sp) ++ |->fff_newstr: ++ | load_got lj_str_new ++ | stl BASE, L->base ++ | stl PC, SAVE_PC(sp) ++ | bis L, zero, CARG1 ++ | call_intern lj_str_new // (lua_State *L, char *str, size_t l) ++ | // Returns GCstr *. ++ | ldl BASE, L->base ++ |->fff_resstr: ++ | ldi AT, LJ_TSTR(zero) ++ | settp CRET1, AT ++ | bis CRET1, zero, CARG1 ++ | br zero, ->fff_restv ++ | ++ |.ffunc string_sub ++ | ffgccheck ++ | ldl CARG1, 0(BASE) ++ | ldl CARG2, 8(BASE) ++ | ldl CARG3, 16(BASE) ++ | subwi NARGS8:RC, 16, TMP0 ++ | gettp TMP1, CARG1 ++ | cmplt TMP0, zero, AT ++ | bne AT, ->fff_fallback ++ | cleartp STR:CARG1, CARG1 ++ | subwi zero, 1, CARG4 ++ | beq TMP0, >1 ++ | addwi CARG3, 0, CARG4 ++ | checkint CARG3, ->fff_fallback ++ |1: ++ | checkint CARG2, ->fff_fallback ++ | ldi TMP0, -LJ_TSTR(TMP1) ++ | addwi CARG2, 0, CARG3 ++ | bne TMP0, ->fff_fallback ++ | ldw CARG2, STR:CARG1->len ++ | // STR:CARG1 = str, CARG2 = str->len, CARG3 = start, CARG4 = end ++ | addwi CARG2, 1, TMP0 ++ | cmplt CARG4, zero, TMP3 ++ | addw CARG4, TMP0, TMP2 ++ | cmplt CARG3, zero, TMP1 ++ | selne TMP3, TMP2, CARG4, CARG4 // if (end < 0) end += len+1 ++ | addw CARG3, TMP0, TMP2 ++ | selne TMP1, TMP2, CARG3, CARG3 // if (start < 0) start += len+1 ++ | ldi TMP3, 1(zero) ++ | cmplt CARG4, zero, TMP2 ++ | cmplt zero, CARG3, TMP1 ++ | selne TMP2, zero, CARG4, CARG4 // if (end < 0) end = 0 ++ | selne TMP1, CARG3, TMP3, CARG3 // if (start < 1) start = 1 ++ | cmplt CARG2, CARG4, TMP2 ++ | seleq TMP2, CARG4, CARG2, CARG4 // if (end > len) end = len ++ | addl STR:CARG1, CARG3, CARG2 ++ | subl CARG4, CARG3, CARG3 // len = end - start ++ | ldi CARG2, sizeof(GCstr)-1(CARG2) ++ | addwi CARG3, 1, CARG3 // len += 1 ++ | cmplt CARG3, zero, AT ++ | beq AT, ->fff_newstr ++ |->fff_emptystr: // Return empty string. 
++ | ldi TMP1, LJ_TSTR(zero) ++ | ldi STR:CARG1, DISPATCH_GL(strempty)(DISPATCH) ++ | settp CARG1, TMP1 ++ | br zero, ->fff_restv ++ | ++ |.macro ffstring_op, name ++ | .ffunc string_ .. name ++ | ffgccheck ++ | ldl CARG2, 0(BASE) ++ | beq NARGS8:RC, ->fff_fallback ++ | checkstr STR:CARG2, ->fff_fallback ++ | ldi SBUF:CARG1, DISPATCH_GL(tmpbuf)(DISPATCH) ++ | load_got lj_buf_putstr_ .. name ++ | ldl TMP0, SBUF:CARG1->b ++ | stl L, SBUF:CARG1->L ++ | stl BASE, L->base ++ | stl TMP0, SBUF:CARG1->p ++ | stl PC, SAVE_PC(sp) ++ | call_intern extern lj_buf_putstr_ .. name ++ |// or SBUF:CARG1, SBUF:CRET1, zero ++ | load_got lj_buf_tostr ++ | bis SBUF:CRET1, zero, SBUF:CARG1 ++ | call_intern lj_buf_tostr ++ | ldl BASE, L->base ++ | br zero, ->fff_resstr ++ |.endmacro ++ | ++ |ffstring_op reverse ++ |ffstring_op lower ++ |ffstring_op upper ++ | ++ |//-- Bit library -------------------------------------------------------- ++ | ++ |->vm_tobit_fb: ++ | fldd FCARG1, 0(BASE) ++ | beq TMP1, ->fff_fallback ++ | faddd FCARG1, TOBIT, FCARG1 ++ | fimovd FCARG1, CRET1 ++ | zapi CRET1, 0xf0, CRET1 ++ | ret zero, 0(ra) ++ | ++ |.macro .ffunc_bit, name ++ | .ffunc_1 bit_..name ++ | gettp TMP0, CARG1 ++ | zapi CARG1, 0xf0, CRET1 ++ | ldi TISNUM, LJ_TISNUM(zero) ++ | cmpeq TMP0, TISNUM, AT ++ | bne AT, >1 ++ | ldi TMP1, LJ_TISNUM(zero) ++ | cmpult TMP0, TMP1, TMP1 ++ | br ra, ->vm_tobit_fb ++ |1: ++ |.endmacro ++ | ++ |.macro .ffunc_bit_op, name, bins ++ | .ffunc_bit name ++ | ldi TMP2, 8(BASE) ++ | addl BASE, NARGS8:RC, TMP3 ++ |1: ++ | ldl TMP1, 0(TMP2) ++ | cmpeq TMP2, TMP3, AT ++ | bne AT, ->fff_resi ++ | gettp TMP0, TMP1 ++ | ldi TMP2, 8(TMP2) ++ | ldi TISNUM, LJ_TISNUM(zero) ++ | cmpeq TMP0, TISNUM, AT ++ | beq AT, >2 ++ | zapi TMP1, 0xf0, TMP1 ++ | bins CRET1, TMP1, CRET1 ++ | br zero, <1 ++ |2: ++ | fldd FCARG1, -8(TMP2) ++ | // cmpulti TMP0, LJ_TISNUM, TMP0 ++ | ldi AT, LJ_TISNUM(zero) ++ | cmpult TMP0, AT, TMP0 ++ | faddd FCARG1, TOBIT, FCARG1 ++ | beq TMP0, ->fff_fallback ++ | 
fimovd FCARG1, TMP1 ++ | zapi TMP1, 0xf0, TMP1 ++ | bins CRET1, TMP1, CRET1 ++ | br zero, <1 ++ |.endmacro ++ | ++ |.ffunc_bit_op band, and ++ |.ffunc_bit_op bor, bis ++ |.ffunc_bit_op bxor, xor ++ | ++ |.ffunc_bit bswap ++ | srli CRET1, 8, TMP0 ++ | srli CRET1, 24, TMP1 ++ | srli TMP0, 8, TMP2 ++ | andi TMP2, 0xff, TMP3 ++ | slli TMP3, 8, TMP3 ++ | .DINS TMP1, CRET1, 24, 8 ++ | .DINS TMP3, TMP0, 16, 8 ++ | bis TMP1, TMP3, CRET1 ++ | br zero, ->fff_resi ++ | ++ |.ffunc_bit tobit ++ |->fff_resi: ++ | ldl PC, FRAME_PC(BASE) ++ | ldi RA, -16(BASE) ++ | ldi TISNUM, LJ_TISNUM(zero) ++ | settp CRET1, TISNUM ++ | stl CRET1, -16(BASE) ++ | br zero, ->fff_res1 ++ | ++ |.ffunc_bit bnot ++ | ornot zero, CRET1, CRET1 // ~CRET1 ++ | zapi CRET1, 0xf0, CRET1 ++ | br zero, ->fff_resi ++ | ++ |.macro .ffunc_bit_sh, name, shins, shmod ++ | .ffunc_2 bit_..name ++ | gettp TMP0, CARG1 ++ | ldi TISNUM, LJ_TISNUM(zero) ++ | cmpeq TMP0, TISNUM, AT ++ | bne AT, >1 ++ | // cmpulti TMP0, LJ_TISNUM, TMP1 ++ | ldi AT, LJ_TISNUM(zero) ++ | cmpult TMP0, AT, TMP1 ++ | br ra, ->vm_tobit_fb ++ | bis CRET1, zero, CARG1 ++ |1: ++ | gettp TMP0, CARG2 ++ | zapi CARG2, 0xf0, CARG2 ++ | ldi TISNUM, LJ_TISNUM(zero) ++ | cmpeq TMP0, TISNUM, AT ++ | beq AT, ->fff_fallback ++ | addwi CARG1, 0, CARG1 ++ |.if shmod == 1 ++ | subw zero, CARG2, CARG2 ++ |.endif ++ | shins CRET1, CARG1, CARG2 ++ | zapi CRET1, 0xf0, CRET1 ++ | br zero, ->fff_resi ++ |.endmacro ++ | ++ |.macro .SLLW, rd, rs, rt ++ | andi rt, 0x1f, AT ++ | sll rs, AT, rd ++ | addwi rd, 0x0, rd ++ |.endmacro ++ | ++ |.macro .SRLW, rd, rs, rt ++ | andi rt, 0x1f, AT ++ | zapi rs, 0xf0, rd ++ | srl rd, AT, rd ++ | addwi rd, 0, rd ++ |.endmacro ++ | ++ |.macro .SRAW, rd, rs, rt ++ | andi rt, 0x1f, AT ++ | addwi rs, 0x0, rd ++ | sra rd, AT, rd ++ |.endmacro ++ | ++ |.macro .RORW, rd, rs, rt //TODO CHECK ++ | andi rt, 0x1f, TMP0 ++ | ldi TMP1, 32(zero) ++ | subw TMP1, TMP0, TMP1 ++ | andi TMP0, 0x1f, TMP2 ++ | zapi rs, 0xf0, TMP0 ++ | srl TMP0, TMP2, TMP0 
++ | addwi TMP0, 0, TMP0 ++ | andi TMP1, 0x1f, TMP2 ++ | sll rs, TMP2, rd ++ | addwi rd, 0x0, rd ++ | bis TMP0, rd, rd ++ | addwi rd, 0x0, rd ++ |.endmacro ++ | ++ |.ffunc_bit_sh lshift, .SLLW, 0 ++ |.ffunc_bit_sh rshift, .SRLW, 0 ++ |.ffunc_bit_sh arshift, .SRAW, 0 ++ |.ffunc_bit_sh rol, .RORW, 1 ++ |.ffunc_bit_sh ror, .RORW, 0 ++ | ++ |//----------------------------------------------------------------------- ++ | ++ |->fff_fallback: // Call fast function fallback handler. ++ | // BASE = new base, RB = CFUNC, RC = nargs*8 ++ | ldl PC, FRAME_PC(BASE) // Fallback may overwrite PC. ++ | ldl CARG3, CFUNC:RB->f ++ | addl BASE, NARGS8:RC, TMP1 ++ | stl BASE, L->base ++ | ldi TMP0, 8*LUA_MINSTACK(TMP1) ++ | ldl TMP2, L->maxstack ++ | stl PC, SAVE_PC(sp) // Redundant (but a defined value). ++ | stl TMP1, L->top ++ | bis L, zero, CARG1 ++ | cmpult TMP2, TMP0, AT ++ | bne AT, >5 // Need to grow stack. ++ | ldi CFUNCADDR, 0(CARG3) ++ | call r26, 0(CFUNCADDR) // (lua_State *L) ++ | // Either throws an error, or recovers and returns -1, 0 or nresults+1. ++ | ldl BASE, L->base ++ | s8addwi CRET1, 0, RD ++ | ldi RA, -16(BASE) ++ | cmplt zero, CRET1, AT ++ | bne AT, ->fff_res // Returned nresults+1? ++ |1: // Returned 0 or -1: retry fast path. ++ | ldl LFUNC:RB, FRAME_FUNC(BASE) ++ | ldl TMP0, L->top ++ | subl TMP0, BASE, NARGS8:RC ++ | cleartp LFUNC:RB ++ | bne CRET1, ->vm_call_tail // Returned -1? ++ | ins_callt // Returned 0: retry fast path. ++ | ++ |// Reconstruct previous base for vmeta_call during tailcall. ++ |->vm_call_tail: ++ | andi PC, FRAME_TYPE, TMP0 ++ | ldi TMP2, ~FRAME_TYPEP(zero) ++ | and TMP2, PC, TMP1 ++ | bne TMP0, >3 ++ | ldbu TMP1, OFS_RA(PC) ++ | s8addwi TMP1, 16, TMP1 ++ |3: ++ | subl BASE, TMP1, TMP2 ++ | br zero, ->vm_call_dispatch // Resolve again for tailcall. ++ | ++ |5: // Grow stack for fallback handler. 
++ | load_got lj_state_growstack ++ | ldi CARG2, LUA_MINSTACK(zero) ++ | bis L, zero, CARG1 ++ | call_intern lj_state_growstack // (lua_State *L, int n) ++ | ldl BASE, L->base ++ | ldi CRET1, 0(zero) // Set zero-flag to force retry. ++ | br zero, <1 ++ | ++ |->fff_gcstep: // Call GC step function. ++ | // BASE = new base, RC = nargs*8 ++ | bis ra, zero, MULTRES ++ | load_got lj_gc_step ++ | addl BASE, NARGS8:RC, TMP0 // Calculate L->top. ++ | stl BASE, L->base ++ | stl PC, SAVE_PC(sp) // Redundant (but a defined value). ++ | bis L, zero, CARG1 ++ | stl TMP0, L->top ++ | call_intern lj_gc_step // (lua_State *L) ++ | ldl BASE, L->base ++ | ldl TMP0, L->top ++ | ldl CFUNC:RB, FRAME_FUNC(BASE) ++ | cleartp CFUNC:RB ++ | subl TMP0, BASE, NARGS8:RC ++ | jmp zero, 0(MULTRES) ++ | ++ |//----------------------------------------------------------------------- ++ |//-- Special dispatch targets ------------------------------------------- ++ |//----------------------------------------------------------------------- ++ | ++ |->vm_record: // Dispatch target for recording phase. ++ |.if JIT ++ | ldbu TMP3, DISPATCH_GL(hookmask)(DISPATCH) ++ | andi TMP3, HOOK_VMEVENT, TMP1 // No recording while in vmevent. ++ | ldw TMP2, DISPATCH_GL(hookcount)(DISPATCH) ++ | bne TMP1, >5 ++ | // Decrement the hookcount for consistency, but always do the call. ++ | andi TMP3, HOOK_ACTIVE, TMP1 ++ | subwi TMP2, 1, TMP2 ++ | bne TMP1, >1 ++ | andi TMP3, LUA_MASKLINE|LUA_MASKCOUNT, TMP1 ++ | beq TMP1, >1 ++ | stw TMP2, DISPATCH_GL(hookcount)(DISPATCH) ++ | br zero, >1 ++ |.endif ++ | ++ |->vm_rethook: // Dispatch target for return hooks. ++ | ldbu TMP3, DISPATCH_GL(hookmask)(DISPATCH) ++ | andi TMP3, HOOK_ACTIVE, TMP1 // Hook already active? ++ | beq TMP1, >1 ++ |5: // Re-dispatch to static ins. ++ | ldl TMP1, GG_DISP2STATIC(TMP0) // Assumes TMP0 holds DISPATCH+OP*4. ++ | jmp zero, 0(TMP1) ++ | ++ |->vm_inshook: // Dispatch target for instr/line hooks. 
++ | ldbu TMP3, DISPATCH_GL(hookmask)(DISPATCH) ++ | ldw TMP2, DISPATCH_GL(hookcount)(DISPATCH) ++ | andi TMP3, HOOK_ACTIVE, TMP1 // Hook already active? ++ | bne TMP1, <5 ++ | andi TMP3, LUA_MASKLINE|LUA_MASKCOUNT, TMP1 ++ | subwi TMP2, 1, TMP2 ++ | beq TMP1, <5 ++ | stw TMP2, DISPATCH_GL(hookcount)(DISPATCH) ++ | beq TMP2, >1 ++ | andi TMP3, LUA_MASKLINE, TMP1 ++ | load_got lj_dispatch_ins ++ | beq TMP1, <5 ++ |1: ++ | load_got lj_dispatch_ins ++ | stw MULTRES, TMPD(sp) ++ | bis PC, zero, CARG2 ++ | stl BASE, L->base ++ | bis L, zero, CARG1 ++ | // SAVE_PC must hold the _previous_ PC. The callee updates it with PC. ++ | call_intern lj_dispatch_ins // (lua_State *L, const BCIns *pc) ++ |3: ++ | ldl BASE, L->base ++ |4: // Re-dispatch to static ins. ++ | ldw INS, -4(PC) ++ | decode_OP TMP1, INS ++ | decode_BC8b TMP1 ++ | addl TMP1, DISPATCH, TMP0 ++ | decode_RD RD, INS ++ | ldl TMP1, GG_DISP2STATIC(TMP0) ++ | decode_RA RA, INS ++ | jmp zero, 0(TMP1) ++ | ++ |->cont_hook: // Continue from hook yield. ++ | ldi PC, 4(PC) ++ | ldw MULTRES, -24(RB) // Restore MULTRES for *M ins. ++ | br zero, <4 ++ | ++ |->vm_hotloop: // Hot loop counter underflow. ++ |.if JIT ++ | ldl LFUNC:TMP1, FRAME_FUNC(BASE) ++ | ldi CARG1, GG_DISP2J(DISPATCH) ++ | cleartp LFUNC:TMP1 ++ | stl PC, SAVE_PC(sp) ++ | ldl TMP1, LFUNC:TMP1->pc ++ | bis PC, zero, CARG2 ++ | stl L, DISPATCH_J(L)(DISPATCH) ++ | ldbu TMP1, PC2PROTO(framesize)(TMP1) ++ | load_got lj_trace_hot ++ | stl BASE, L->base ++ | s8addl TMP1, BASE, TMP1 ++ | stl TMP1, L->top ++ | call_intern lj_trace_hot // (jit_State *J, const BCIns *pc) ++ | br zero, <3 ++ |.endif ++ | ++ | ++ |->vm_callhook: // Dispatch target for call hooks. ++ | bis PC, zero, CARG2 ++ |.if JIT ++ | br zero, >1 ++ |.endif ++ | ++ |->vm_hotcall: // Hot call counter underflow. 
++ |.if JIT ++ | bisi PC, 1, CARG2 ++ |1: ++ |.endif ++ | load_got lj_dispatch_call ++ | addl BASE, RC, TMP0 ++ | stl PC, SAVE_PC(sp) ++ | stl BASE, L->base ++ | subl RA, BASE, RA ++ | stl TMP0, L->top ++ | bis L, zero, CARG1 ++ | call_intern lj_dispatch_call // (lua_State *L, const BCIns *pc) ++ | // Returns ASMFunction. ++ | ldl BASE, L->base ++ | ldl TMP0, L->top ++ | stl zero, SAVE_PC(sp) // Invalidate for subsequent line hook. ++ | addl RA, BASE, RA ++ | subl TMP0, BASE, NARGS8:RC ++ | ldl LFUNC:RB, FRAME_FUNC(BASE) ++ | cleartp LFUNC:RB ++ | ldw INS, -4(PC) ++ | jmp zero, 0(CRET1) ++ | ++ |->cont_stitch: // Trace stitching. ++ |.if JIT ++ | // RA = resultptr, RB = meta base ++ | ldw INS, -4(PC) ++ | ldl TRACE:TMP2, -40(RB) // Save previous trace. ++ | decode_RA RC, INS ++ | ldi TMP1, -8(MULTRES) ++ | cleartp TRACE:TMP2 ++ | addl RC, BASE, RC // Call base. ++ | beq TMP1, >2 ++ |1: // Move results down. ++ | ldl CARG1, 0(RA) ++ | ldi TMP1, -8(TMP1) ++ | ldi RA, 8(RA) ++ | stl CARG1, 0(RC) ++ | ldi RC, 8(RC) ++ | bne TMP1, <1 ++ |2: ++ | decode_RA RA, INS ++ | decode_RB RB, INS ++ | addl RA, RB, RA ++ | addl RA, BASE, RA ++ |3: ++ | cmpult RC, RA, TMP1 ++ | bne TMP1, >9 // More results wanted? ++ | ++ | ldhu TMP3, TRACE:TMP2->traceno ++ | ldhu RD, TRACE:TMP2->link ++ | load_got lj_dispatch_stitch ++ | cmpeq RD, TMP3, AT ++ | bne AT, ->cont_nop // Blacklisted. ++ | s8addwi RD, 0, RD ++ | bne RD, =>BC_JLOOP // Jump to stitched trace. ++ | ++ | // Stitch a new trace to the previous trace. ++ | stw TMP3, DISPATCH_J(exitno)(DISPATCH) ++ | stl L, DISPATCH_J(L)(DISPATCH) ++ | stl BASE, L->base ++ | ldi CARG1, GG_DISP2J(DISPATCH) ++ | bis PC, zero, CARG2 ++ | call_intern lj_dispatch_stitch // (jit_State *J, const BCIns *pc) ++ | ldl BASE, L->base ++ | br zero, ->cont_nop ++ | ++ |9: ++ | stl TISNIL, 0(RC) ++ | ldi RC, 8(RC) ++ | br zero, <3 ++ |.endif ++ | ++ |->vm_profhook: // Dispatch target for profiler hook. 
++#if LJ_HASPROFILE ++ | load_got lj_dispatch_profile ++ | bis L, zero, CARG1 ++ | bis PC, zero, CARG2 ++ | stl BASE, L->base ++ | stw MULTRES, TMPD(sp) ++ | call_intern lj_dispatch_profile // (lua_State *L, const BCIns *pc) ++ | // HOOK_PROFILE is off again, so re-dispatch to dynamic instruction. ++ | ldi PC, -4(PC) ++ | ldl BASE, L->base ++ | br zero, ->cont_nop ++#endif ++ | ++ |//----------------------------------------------------------------------- ++ |//-- Trace exit handler ------------------------------------------------- ++ |//----------------------------------------------------------------------- ++ | ++ |.macro savex_, a, b ++ | fstd f..a, a*8(sp) ++ | fstd f..b, b*8(sp) ++ | stl r..a, 32*8+a*8(sp) ++ | stl r..b, 32*8+b*8(sp) ++ |.endmacro ++ | ++ |->vm_exit_handler: ++ |.if JIT ++ |//Save all registers except RA and SP.On SW64 is r30 and r26. ++ | ldi sp, -(32*8+32*8)(sp) ++ | savex_ 0, 1 ++ | savex_ 2, 3 ++ | savex_ 4, 5 ++ | savex_ 6, 7 ++ | savex_ 8, 9 ++ | savex_ 10, 11 ++ | savex_ 12, 13 ++ | savex_ 14, 15 ++ | savex_ 16, 17 ++ | savex_ 18, 19 ++ | savex_ 20, 21 ++ | savex_ 22, 23 ++ | savex_ 24, 25 ++ | savex_ 27, 28 ++ | savex_ 29, 31 ++ | fstd f26, 26*8(sp) ++ | fstd f30, 30*8(sp) ++ | stl zero, 32*8+26*8(sp) // Clear RID_TMP. ++ | ldi TMP2, 32*8+32*8(sp) // Recompute original value of sp. ++ | stl TMP2, 32*8+30*8(sp) // Store sp in RID_SP ++ | li_vmstate EXIT ++ | ldi DISPATCH, -GG_DISP2G-32768(JGL) ++ | ldw TMP1, 0(TMP2) // Load exit number. ++ | st_vmstate ++ | ldl L, DISPATCH_GL(cur_L)(DISPATCH) ++ | ldl BASE, DISPATCH_GL(jit_base)(DISPATCH) ++ | load_got lj_trace_exit ++ | stl L, DISPATCH_J(L)(DISPATCH) ++ | stw ra, DISPATCH_J(parent)(DISPATCH) // Store trace number. ++ | stl BASE, L->base ++ | stw TMP1, DISPATCH_J(exitno)(DISPATCH) // Store exit number. 
++ | ldi CARG1, GG_DISP2J(DISPATCH) ++ | stl zero, DISPATCH_GL(jit_base)(DISPATCH) ++ | bis sp, zero, CARG2 ++ | call_intern lj_trace_exit // (jit_State *J, ExitState *ex) ++ | // Returns MULTRES (unscaled) or negated error code. ++ | ldl TMP1, L->cframe ++ | ldi TMP2, -4(zero) ++ | ldl BASE, L->base ++ | and TMP1, TMP2, sp ++ | ldl PC, SAVE_PC(sp) // Get SAVE_PC. ++ | stl L, SAVE_L(sp) // Set SAVE_L (on-trace resume/yield). ++ | br zero, >1 ++ |.endif ++ | ++ |->vm_exit_interp: ++ |.if JIT ++ | // CRET1 = MULTRES or negated error code, BASE, PC and JGL set. ++ | ldl L, SAVE_L(sp) ++ | ldi DISPATCH, -GG_DISP2G-32768(JGL) ++ | stl BASE, L->base ++ |1: ++ | ldl LFUNC:RB, FRAME_FUNC(BASE) ++ | cmplt CRET1, zero, AT ++ | bne AT, >9 // Check for error from exit. ++ | ldih TMP3, 0x59c0(zero) // TOBIT = 2^52 + 2^51 (float). ++ | slli CRET1, 3, MULTRES ++ | cleartp LFUNC:RB ++ | stw MULTRES, TMPD(sp) ++ | ldi TISNIL, LJ_TNIL(zero) ++ | ldi TISNUM, LJ_TISNUM(zero) // Setup type comparison constants. ++ | ifmovs TMP3, TOBIT ++ | ldl TMP1, LFUNC:RB->pc ++ | stl zero, DISPATCH_GL(jit_base)(DISPATCH) ++ | ldl KBASE, PC2PROTO(k)(TMP1) ++ | fcvtsd TOBIT, TOBIT ++ | // Modified copy of ins_next which handles function header dispatch, too. ++ | ldw INS, 0(PC) ++ | ldi PC, 4(PC) ++ | // Assumes TISNIL == ~LJ_VMST_INTERP == -1 ++ | stw TISNIL, DISPATCH_GL(vmstate)(DISPATCH) ++ | decode_OP TMP1, INS ++ | decode_BC8b TMP1 ++ | // cmpulti TMP1, BC_FUNCF*8, TMP2 ++ | ldi TMP2, BC_FUNCF*8(zero) ++ | cmpult TMP1, TMP2, TMP2 ++ | addl DISPATCH, TMP1, TMP0 ++ | decode_RD RD, INS ++ | ldl TMP3, 0(TMP0) ++ | decode_RA RA, INS ++ | beq TMP2, >2 ++ | jmp zero, 0(TMP3) ++ |2: ++ | // cmpulti TMP1, (BC_FUNCC+2)*8, TMP2 // Fast function? ++ | ldi TMP2, (BC_FUNCF+2)*8(zero) ++ | cmpult TMP1, TMP2, TMP2 ++ | ldl TMP1, FRAME_PC(BASE) ++ | bne TMP2, >3 ++ | // Check frame below fast function. ++ | andi TMP1, FRAME_TYPE, TMP0 ++ | bne TMP0, >3 // Trace stitching continuation? 
++ | // Otherwise set KBASE for Lua function below fast function. ++ | ldw TMP2, -4(TMP1) ++ | decode_RA TMP0, TMP2 ++ | subl BASE, TMP0, TMP1 ++ | ldl LFUNC:TMP2, -32(TMP1) ++ | cleartp LFUNC:TMP2 ++ | ldl TMP1, LFUNC:TMP2->pc ++ | ldl KBASE, PC2PROTO(k)(TMP1) ++ |3: ++ | ldi RC, -8(MULTRES) ++ | addl RA, BASE, RA ++ | jmp zero, 0(TMP3) ++ | ++ |9: // Rethrow error from the right C frame. ++ | load_got lj_err_throw ++ | subw zero, CRET1, CARG2 //TODO LA: sub.w no trap ++ | bis L, zero, CARG1 ++ | call_intern lj_err_throw // (lua_State *L, int errcode) ++ |.endif ++ | ++ |//----------------------------------------------------------------------- ++ |//-- Math helper functions ---------------------------------------------- ++ |//----------------------------------------------------------------------- ++ | ++ |// Modifies AT, TMP0, FCRET1, FCRET2, FCARG1. Keeps all others incl. f2. ++ |.macro vm_round, func ++ | // skip NaN && Inf ++ | // 0 * NaN == NaN ++ | // 0 * Inf == NaN ++ | // 0 * Other == 0 ++ | fmuld fzero, FCARG1, FAT ++ | fcmpun fzero, FAT, FAT ++ | fbeq FAT, >1 ++ | faddd fzero, FCARG1, FCRET1 ++ | ret zero, 0(ra) ++ |1: ++ |.if "func"=="floor" ++ | fcvtdln FCARG1, FAT; ++ |.endif ++ |.if "func"=="ceil" ++ | fcvtdlp FCARG1, FAT; ++ |.endif ++ |.if "func"=="trunc" ++ | fcvtdlz FCARG1, FAT; ++ |.endif ++ | fcvtld FAT, FCRET1 ++ | ret zero, 0(ra) ++ |.endmacro ++ | ++ | ++ |->vm_floor: ++ | vm_round floor ++ |->vm_ceil: ++ | vm_round ceil ++ |->vm_trunc: ++ |.if JIT ++ | vm_round trunc ++ |.endif ++ | ++ | ++ |//----------------------------------------------------------------------- ++ |//-- Miscellaneous functions -------------------------------------------- ++ |//----------------------------------------------------------------------- ++ | ++ |.define NEXT_TAB, TAB:CARG1 ++ |.define NEXT_IDX, CARG2 ++ |.define NEXT_ASIZE, CARG3 ++ |.define NEXT_NIL, CARG4 ++ |.define NEXT_TMP0, TMP0 ++ |.define NEXT_TMP1, TMP1 ++ |.define NEXT_TMP2, TMP2 ++ |.define 
NEXT_RES_VK, CRET1 ++ |.define NEXT_RES_IDX, CRET2 ++ |.define NEXT_RES_PTR, sp ++ |.define NEXT_RES_VAL, 0(sp) ++ |.define NEXT_RES_KEY, 8(sp) ++ | ++ |// TValue *lj_vm_next(GCtab *t, uint32_t idx) ++ |// Next idx returned in CRET2. ++ |//->vm_next: ++ |//.if JIT ++ |// ldw NEXT_ASIZE, NEXT_TAB->asize ++ |// ldl NEXT_TMP0, NEXT_TAB->array ++ |// ldi NEXT_NIL, LJ_TNIL(zero) ++ |//1: // Traverse array part. ++ |// cmpult NEXT_IDX, NEXT_ASIZE, TMP3 ++ |// slli NEXT_IDX, 3, NEXT_TMP1 ++ |// addwi NEXT_TMP1, 0, NEXT_TMP1 ++ |// addl NEXT_TMP1, NEXT_TMP0, NEXT_TMP1 ++ |// beq TMP3, >5 ++ |// ldi TMP3, LJ_TISNUM(zero) ++ |// ldl NEXT_TMP2, 0(NEXT_TMP1) ++ |// slli TMP3, 47, TMP3 ++ |// bis NEXT_IDX, TMP3, NEXT_TMP1 ++ |// addwi NEXT_IDX, 1, NEXT_IDX ++ |// cmpeq NEXT_TMP2, NEXT_NIL, NEXT_TMP2 ++ |// bne NEXT_TMP2, <1 ++ |// stl NEXT_TMP2, NEXT_RES_VAL ++ |// stl NEXT_TMP1, NEXT_RES_KEY ++ |// bis NEXT_RES_PTR, zero, NEXT_RES_VK ++ |// bis NEXT_IDX, zero, NEXT_RES_IDX ++ |// ret zero, 0(ra) ++ | ++ |//5: // Traverse hash part. ++ |// subw NEXT_IDX, NEXT_ASIZE, NEXT_RES_IDX ++ |// ldw NEXT_TMP0, NEXT_TAB->hmask ++ |// ldl NODE:NEXT_RES_VK, NEXT_TAB->node ++ |// slli NEXT_RES_IDX, 5, NEXT_TMP2 ++ |// addwi NEXT_TMP2, 0, NEXT_TMP2 ++ |// slli NEXT_RES_IDX, 3, TMP3 ++ |// addwi TMP3, 0, TMP3 ++ |// subw NEXT_TMP2, TMP3, TMP3 ++ |// addl NODE:NEXT_RES_VK, TMP3, NODE:NEXT_RES_VK ++ |//6: ++ |// cmpult NEXT_TMP0, NEXT_RES_IDX, TMP3 ++ |// bne TMP3, >8 ++ |// ldl NEXT_TMP2, NODE:NEXT_RES_VK->val ++ |// addwi NEXT_RES_IDX, 1, NEXT_RES_IDX ++ |// cmpeq NEXT_TMP2, NEXT_NIL, NEXT_TMP2 ++ |// beq NEXT_TMP2, >9 ++ | // Skip holes in hash part. ++ |// ldi NODE:NEXT_RES_VK, sizeof(Node)(NODE:NEXT_RES_VK) ++ |// br zero, <6 ++ | ++ |//8: // End of iteration. Set the key to nil (not the value). 
++ |// stl NEXT_NIL, NEXT_RES_KEY ++ |// bis NEXT_RES_PTR, zero, NEXT_RES_VK ++ |//9: ++ |// addw NEXT_RES_IDX, NEXT_ASIZE, NEXT_RES_IDX ++ |// ret zero, 0(ra) ++ |//.endif ++ | ++ |//----------------------------------------------------------------------- ++ |//-- FFI helper functions ----------------------------------------------- ++ |//----------------------------------------------------------------------- ++ | ++ |// Handler for callback functions. Callback slot number in r1, g in r2. ++ |->vm_ffi_callback: ++ |.if FFI ++ |.type CTSTATE, CTState, PC ++ | saveregs ++ | ldl CTSTATE, GL:r2->ctype_state ++ | ldi DISPATCH, GG_G2DISP(r2) ++ | load_got lj_ccallback_enter ++ | stw r1, CTSTATE->cb.slot ++ | stl CARG1, CTSTATE->cb.gpr[0] ++ | fstd FCARG1, CTSTATE->cb.fpr[0] ++ | stl CARG2, CTSTATE->cb.gpr[1] ++ | fstd FCARG2, CTSTATE->cb.fpr[1] ++ | stl CARG3, CTSTATE->cb.gpr[2] ++ | fstd FCARG3, CTSTATE->cb.fpr[2] ++ | stl CARG4, CTSTATE->cb.gpr[3] ++ | fstd FCARG4, CTSTATE->cb.fpr[3] ++ | stl CARG5, CTSTATE->cb.gpr[4] ++ | fstd FCARG5, CTSTATE->cb.fpr[4] ++ | stl CARG6, CTSTATE->cb.gpr[5] ++ | fstd FCARG6, CTSTATE->cb.fpr[5] ++ | ldi TMP0, CFRAME_SPACE(sp) ++ | stl TMP0, CTSTATE->cb.stack ++ | stl zero, SAVE_PC(sp) // Any value outside of bytecode is ok. ++ | bis CTSTATE, zero, CARG1 ++ | bis sp, zero, CARG2 ++ | call_intern lj_ccallback_enter // (CTState *cts, void *cf) ++ | // Returns lua_State *. ++ | ldl BASE, L:CRET1->base ++ | ldl RC, L:CRET1->top ++ | bis CRET1, zero, L ++ | ldih TMP3, 0x59c0(zero) // TOBIT = 2^52 + 2^51 (float). ++ | ldl LFUNC:RB, FRAME_FUNC(BASE) ++ | ifmovs TMP3, TOBIT ++ | ldi TISNIL, LJ_TNIL(zero) ++ | ldi TISNUM, LJ_TISNUM(zero) ++ | li_vmstate INTERP ++ | subw RC, BASE, RC ++ | cleartp LFUNC:RB ++ | st_vmstate ++ | fcvtsd TOBIT, TOBIT ++ | ins_callt ++ |.endif ++ | ++ |->cont_ffi_callback: // Return from FFI callback. 
++ |.if FFI ++ | load_got lj_ccallback_leave ++ | ldl CTSTATE, DISPATCH_GL(ctype_state)(DISPATCH) ++ | stl BASE, L->base ++ | stl RB, L->top ++ | stl L, CTSTATE->L ++ | bis CTSTATE, zero, CARG1 ++ | bis RA, zero, CARG2 ++ | call_intern lj_ccallback_leave // (CTState *cts, TValue *o) ++ | fldd FCRET1, CTSTATE->cb.fpr[0] ++ | ldl CRET1, CTSTATE->cb.gpr[0] ++ | fldd FCRET2, CTSTATE->cb.fpr[1] ++ | ldl CRET2, CTSTATE->cb.gpr[1] ++ | br zero, ->vm_leave_unw ++ |.endif ++ | ++ |->vm_ffi_call: // Call C function via FFI. ++ | // Caveat: needs special frame unwinding, see below. ++ |.if FFI ++ | .type CCSTATE, CCallState, CARG1 ++ | ldw TMP1, CCSTATE->spadj ++ | ldbu CARG2, CCSTATE->nsp ++ | bis sp, zero, TMP2 ++ | subl sp, TMP1, sp ++ | stl ra, -8(TMP2) ++ | s8addwi CARG2, 0, CARG2 ++ | stl r9, -16(TMP2) ++ | stl CCSTATE, -24(TMP2) ++ | bis TMP2, zero, r9 ++ | ldi TMP1, offsetof(CCallState, stack)(CCSTATE) ++ | bis sp, zero, TMP2 ++ | addl TMP1, CARG2, TMP3 ++ | beq CARG2, >2 ++ |1: ++ | ldl TMP0, 0(TMP1) ++ | ldi TMP1, 8(TMP1) ++ | cmpult TMP1, TMP3, TMP4 ++ | stl TMP0, 0(TMP2) ++ | ldi TMP2, 8(TMP2) ++ | bne TMP4, <1 ++ |2: ++ | ldl CFUNCADDR, CCSTATE->func ++ | fldd FCARG1, CCSTATE->gpr[0] ++ | fldd FCARG2, CCSTATE->gpr[1] ++ | fldd FCARG3, CCSTATE->gpr[2] ++ | fldd FCARG4, CCSTATE->gpr[3] ++ | fldd FCARG5, CCSTATE->gpr[4] ++ | fldd FCARG6, CCSTATE->gpr[5] ++ | ldl CARG2, CCSTATE->gpr[1] ++ | ldl CARG3, CCSTATE->gpr[2] ++ | ldl CARG4, CCSTATE->gpr[3] ++ | ldl CARG5, CCSTATE->gpr[4] ++ | ldl CARG6, CCSTATE->gpr[5] ++ | ldl CARG1, CCSTATE->gpr[0] // Do this last, since CCSTATE is CARG1. 
++ | call r26, 0(CFUNCADDR) ++ | ldl CCSTATE:TMP1, -24(r9) ++ | ldl TMP2, -16(r9) ++ | ldl ra, -8(r9) ++ | stl CRET1, CCSTATE:TMP1->gpr[0] ++ | stl CRET2, CCSTATE:TMP1->gpr[1] ++ | fstd FCRET1, CCSTATE:TMP1->fpr[0] ++ | fstd FCRET2, CCSTATE:TMP1->fpr[1] ++ | bis r9, zero, sp ++ | bis TMP2, zero, r9 ++ | ret zero, 0(ra) ++ |.endif ++ |// Note: vm_ffi_call must be the last function in this object file! ++ | ++ |//----------------------------------------------------------------------- ++} ++ ++//TODO cmx ++/* Generate the code for a single instruction. */ ++static void build_ins(BuildCtx *ctx, BCOp op, int defop) ++{ ++ int vk = 0; ++ |=>defop: ++ ++ switch (op) { ++ ++ /* -- Comparison ops ---------------------------------------------------- */ ++ ++ /* Remember: all ops branch for a true comparison, fall through otherwise. */ ++ ++ case BC_ISLT: case BC_ISGE: case BC_ISLE: case BC_ISGT: ++ | // RA = src1*8, RD = src2*8, JMP with RD = target ++ | addl RA, BASE, RA ++ | addl RD, BASE, RD ++ if (op == BC_ISLT || op == BC_ISGE) { ++ | ldl CARG1, 0(RA) ++ | ldl CARG2, 0(RD) ++ | gettp CARG3, CARG1 ++ | gettp CARG4, CARG2 ++ } else { ++ | ldl CARG2, 0(RA) ++ | ldl CARG1, 0(RD) ++ | gettp CARG3, CARG2 ++ | gettp CARG4, CARG1 ++ } ++ | ldhu TMP2, OFS_RD(PC) // TMP2=jump ++ | ldi PC, 4(PC) ++ | ldi TISNUM, LJ_TISNUM(zero) ++ | cmpeq CARG3, TISNUM, AT ++ | beq AT, >2 ++ | decode_BC4b TMP2 ++ | ldi TISNUM, LJ_TISNUM(zero) ++ | cmpeq CARG4, TISNUM, AT ++ | beq AT, >5 ++ | addwi CARG1, 0, CARG1 ++ | addwi CARG2, 0, CARG2 ++ | ldih TMP3, -0x2(zero) // -BCBIAS_J*4 ++ | cmplt CARG1, CARG2, TMP1 ++ | addw TMP2, TMP3, TMP2 // TMP2=(jump-0x8000)<<2 ++ if (op == BC_ISLT || op == BC_ISGT) { ++ | seleq TMP1, zero, TMP2, TMP2 ++ } else { ++ | selne TMP1, zero, TMP2,TMP2 ++ } ++ |1: ++ | addl PC, TMP2, PC ++ | ins_next ++ | ++ |2: // RA is not an integer. 
++ | // cmpulti CARG3, LJ_TISNUM, TMP1 ++ | ldi TMP1, LJ_TISNUM(zero) ++ | cmpult CARG3, TMP1, TMP1 ++ | ldih TMP3, -0x2(zero) // -BCBIAS_J*4 ++ | beq TMP1, ->vmeta_comp ++ | // cmpulti CARG4, LJ_TISNUM, TMP1 ++ | ldi TMP1, LJ_TISNUM(zero) ++ | cmpult CARG4, TMP1, TMP1 ++ | decode_BC4b TMP2 ++ | beq TMP1, >4 ++ | ifmovd CARG1, FTMP0 ++ | ifmovd CARG2, FTMP2 ++ |3: // RA and RD are both numbers. ++ if (op == BC_ISLT || op == BC_ISGE) { ++ | fcmplt FTMP0, FTMP2, FTMP3 ++ } else { ++ | fcmplt FTMP0, FTMP2, FTMP3 ++ | fcmpun FTMP0, FTMP2, FTMP4 //TODO FCC CHECK ++ | faddd FTMP3, FTMP4, FTMP3 ++ } ++ | addw TMP2, TMP3, TMP2 ++ | fimovd FTMP3, TMP3 //TODO CHECK ++ if (op == BC_ISLT || op == BC_ISGT) { ++ | seleq TMP3, zero, TMP2, TMP2 ++ } else { ++ | selne TMP3, zero, TMP2, TMP2 ++ } ++ | br zero, <1 ++ | ++ |4: // RA is a number, RD is not a number. ++ | // RA is a number, RD is an integer. Convert RD to a number. ++ | ldi TISNUM, LJ_TISNUM(zero) ++ | cmpeq CARG4, TISNUM, AT ++ | beq AT, ->vmeta_comp ++ if (op == BC_ISLT || op == BC_ISGE) { ++ | ifmovs CARG2, FTMP2 ++ | ifmovd CARG1, FTMP0 ++ | fcvtwl FTMP2, FTMP2 ++ | fcvtld FTMP2, FTMP2 ++ } else { ++ | ifmovs CARG1, FTMP0 ++ | ifmovd CARG2, FTMP2 ++ | fcvtwl FTMP0, FTMP0 ++ | fcvtld FTMP0, FTMP0 ++ } ++ | br zero, <3 ++ | ++ |5: // RA is an integer, RD is not an integer ++ | // cmpulti CARG4, LJ_TISNUM, TMP1 ++ | ldi TMP1, LJ_TISNUM(zero) ++ | cmpult CARG4, TMP1, TMP1 ++ | ldih TMP3, -0x2(zero) // -BCBIAS_J*4 ++ | beq TMP1, ->vmeta_comp ++ | // RA is an integer, RD is a number. Convert RA to a number. 
++ if (op == BC_ISLT || op == BC_ISGE) { ++ | ifmovs CARG1, FTMP0 ++ | ifmovd CARG2, FTMP2 ++ | fcvtwl FTMP0, FTMP0 ++ | fcvtld FTMP0, FTMP0 ++ } else { ++ | ifmovs CARG2, FTMP2 ++ | ifmovd CARG1, FTMP0 ++ | fcvtwl FTMP2, FTMP2 ++ | fcvtld FTMP2, FTMP2 ++ } ++ | br zero, <3 ++ break; ++ ++ case BC_ISEQV: case BC_ISNEV: ++ vk = op == BC_ISEQV; ++ | // RA = src1*8, RD = src2*8, JMP with RD = target ++ | addl RA, BASE, RA ++ | addl RD, BASE, RD ++ | ldi PC, 4(PC) ++ | ldl CARG1, 0(RA) ++ | ldl CARG2, 0(RD) ++ | ldhu TMP2, -4+OFS_RD(PC) ++ | gettp CARG3, CARG1 ++ | gettp CARG4, CARG2 ++ | ldi TISNUM, LJ_TISNUM(zero) ++ | cmpult TISNUM, CARG3, TMP0 ++ | cmpult TISNUM, CARG4, TMP1 ++ | bis TMP0, TMP1, TMP0 ++ | ldih TMP3, -0x2(zero) // -BCBIAS_J*4 ++ if (vk) { ++ | beq TMP0, ->BC_ISEQN_Z ++ } else { ++ | beq TMP0, ->BC_ISNEN_Z ++ } ++ |// Either or both types are not numbers. ++ |.if FFI ++ | // Check if RA or RD is a cdata. ++ | ldi TMP0, LJ_TCDATA(zero) ++ | addwi TMP0, 0, TMP0 ++ | cmpeq CARG3, TMP0, AT ++ | bne AT, ->vmeta_equal_cd ++ | cmpeq CARG4, TMP0, AT ++ | bne AT, ->vmeta_equal_cd ++ |.endif ++ | ldih TMP3, -0x2(zero) // -BCBIAS_J*4 ++ | decode_BC4b TMP2 ++ | addw TMP2, TMP3, TMP2 // (jump-0x8000)<<2 ++ | cmpeq CARG1, CARG2, AT ++ | beq AT, >2 ++ | // Tag and value are equal. ++ if (vk) { ++ |->BC_ISEQV_Z: ++ | addl PC, TMP2, PC ++ } ++ |1: ++ | ins_next ++ | ++ |2: // Check if the tags are the same and it's a table or userdata. ++ | xor CARG3, CARG4, TMP3 // Same type? ++ | // cmpulti CARG3, LJ_TISTABUD+1, TMP0 // Table or userdata? TMP0=1 ++ | ldi TMP0, LJ_TISTABUD+1(zero) ++ | cmpult CARG3, TMP0, TMP0 ++ | selne TMP3, zero, TMP0, TMP0 // TMP0=0: not same type, or same type table/userdata ++ | cleartp TAB:TMP1, CARG1 ++ if (vk) { ++ | beq TMP0, <1 ++ } else { ++ | beq TMP0, ->BC_ISEQV_Z // Reuse code from opposite instruction. ++ } ++ | // Different tables or userdatas. Need to check __eq metamethod. 
++ | // Field metatable must be at same offset for GCtab and GCudata! ++ | ldl TAB:TMP3, TAB:TMP1->metatable ++ if (vk) { ++ | beq TAB:TMP3, <1 // No metatable? ++ | ldbu TMP3, TAB:TMP3->nomm ++ | andi TMP3, 1<BC_ISEQV_Z // No metatable? ++ | ldbu TMP3, TAB:TMP3->nomm ++ | andi TMP3, 1<BC_ISEQV_Z // Or 'no __eq' flag set? ++ } ++ | br zero, ->vmeta_equal // Handle __eq metamethod. ++ break; ++ ++ case BC_ISEQS: case BC_ISNES: ++ vk = op == BC_ISEQS; ++ | // RA = src*8, RD = str_const*8 (~), JMP with RD = target ++ | addl RA, BASE, RA ++ | ldi PC, 4(PC) ++ | ldl CARG1, 0(RA) ++ | subl KBASE, RD, RD ++ | ldhu TMP2, -4+OFS_RD(PC) ++ | ldl CARG2, -8(RD) // KBASE-8-str_const*8 ++ |.if FFI ++ | gettp CARG3, CARG1 ++ | ldi TMP1, LJ_TCDATA(zero) ++ | addwi TMP1, 0, TMP1 ++ |.endif ++ | ldi TMP0, LJ_TSTR(zero) ++ | addwi TMP0, 0, TMP0 ++ | decode_BC4b TMP2 ++ | settp CARG2, TMP0 ++ | ldih TMP3, -0x2(zero) // -BCBIAS_J*4 ++ |.if FFI ++ | cmpeq CARG3, TMP1, AT ++ | bne AT, ->vmeta_equal_cd ++ |.endif ++ | xor CARG1, CARG2, TMP0 // TMP2=0: A==D; TMP2!=0: A!=D ++ | addw TMP2, TMP3, TMP2 ++ if (vk) { ++ | selne TMP0, zero, TMP2, TMP2 ++ } else { ++ | seleq TMP0, zero, TMP2, TMP2 ++ } ++ | addl PC, TMP2, PC ++ | ins_next ++ break; ++ ++ case BC_ISEQN: case BC_ISNEN: ++ vk = op == BC_ISEQN; ++ | // RA = src*8, RD = num_const*8, JMP with RD = target ++ | addl RA, BASE, RA ++ | addl RD, KBASE, RD ++ | ldl CARG1, 0(RA) ++ | ldl CARG2, 0(RD) ++ | ldhu TMP2, OFS_RD(PC) ++ | ldi PC, 4(PC) ++ | gettp CARG3, CARG1 ++ | gettp CARG4, CARG2 ++ | ldih TMP3, -0x2(zero) // -BCBIAS_J*4 ++ if (vk) { ++ |->BC_ISEQN_Z: ++ } else { ++ |->BC_ISNEN_Z: ++ } ++ | decode_BC4b TMP2 ++ | ldi TISNUM, LJ_TISNUM(zero) ++ | cmpeq CARG3, TISNUM, AT ++ | beq AT, >4 ++ | addw TMP2, TMP3, TMP2 ++ | ldi TISNUM, LJ_TISNUM(zero) ++ | cmpeq CARG4, TISNUM, AT ++ | beq AT, >6 ++ | xor CARG1, CARG2, TMP0 // TMP0=0: A==D; TMP0!=0: A!=D ++ if (vk) { ++ | selne TMP0, zero, TMP2, TMP2 ++ |1: ++ | addl PC, TMP2, PC ++ |2: ++ 
} else { ++ | seleq TMP0, zero, TMP2, TMP2 ++ |1: ++ |2: ++ | addl PC, TMP2, PC ++ } ++ |3: ++ | ins_next ++ | ++ |4: // RA is not an integer. ++ | ldi TISNUM, LJ_TISNUM(zero) ++ | cmpult CARG3, TISNUM, TMP0 ++ | addw TMP2, TMP3, TMP2 ++ |.if FFI ++ | beq TMP0, >7 ++ |.else ++ | beq TMP0, <2 ++ |.endif ++ | ifmovd CARG1, FTMP0 ++ | ifmovd CARG2, FTMP2 ++ | ldi TISNUM, LJ_TISNUM(zero) ++ | cmpeq CARG4, TISNUM, AT ++ | beq AT, >5 ++ |// RA is a number, RD is an integer. ++ | ldl TMP3, 0(RD) ++ | addw TMP3, zero, TMP3 //get [0:31] of RD ++ | ifmovd TMP3, FTMP2 ++ | fcvtld FTMP2, FTMP2 ++ | ++ |5: // RA and RD are both numbers. ++ | fcmpun FTMP0, FTMP2, FTMP3 ++ | fimovd FTMP3, TMP4 //tmp4=2:is NaN; tmp0=0:isnot NaN ++ | bne TMP4, >9 ++ | fcmpeq FTMP0, FTMP2, FTMP4 ++ | fimovd FTMP4, TMP1 //tmp1=0:is eq ++ if (vk) { ++ | seleq TMP1, zero, TMP2, TMP2 ++ } else { ++ | selne TMP1, zero, TMP2, TMP2 ++ } ++ | br zero, <1 ++ | ++ |6: // RA is an integer, RD is a number. ++ | ldi TISNUM, LJ_TISNUM(zero) ++ | cmpult CARG4, TISNUM, TMP0 ++ |.if FFI ++ | beq TMP0, >8 ++ |.else ++ | beq TMP0, <2 ++ |.endif ++ | ifmovs CARG1, FTMP0 ++ | ifmovd CARG2, FTMP2 ++ | fcvtwl FTMP0, FTMP0 ++ | fcvtld FTMP0, FTMP0 ++ | br zero, <5 ++ | ++ |.if FFI ++ |7: // RA not int, not number ++ | ldi TMP0, LJ_TCDATA(zero) ++ | addwi TMP0, 0, TMP0 ++ | cmpeq CARG3, TMP0, AT ++ | beq AT, <2 ++ | br zero, ->vmeta_equal_cd ++ | ++ |8: // RD not int, not number ++ | ldi TMP0, LJ_TCDATA(zero) ++ | addwi TMP0, 0, TMP0 ++ | cmpeq CARG4, TMP0, AT ++ | beq AT, <2 ++ | br zero, ->vmeta_equal_cd ++ |.endif ++ | ++ |9: //is NaN ++ if (vk) { ++ | selne TMP4, zero, TMP2, TMP2 ++ } else { ++ | seleq TMP4, zero, TMP2, TMP2 ++ } ++ | br zero, <1 ++ break; ++ ++ case BC_ISEQP: case BC_ISNEP: ++ vk = op == BC_ISEQP; ++ | // RA = src*8, RD = primitive_type*8 (~), JMP with RD = target ++ | addl RA, BASE, RA ++ | zapi RD, 0xf0, TMP0 ++ | srli TMP0, 3, TMP0 ++ | ldl TMP1, 0(RA) ++ | ornot zero, TMP0, TMP0 // ~TMP0: ~0 ~1 ~2 
++ | ldhu TMP2, OFS_RD(PC) // TMP2: RD in next INS, branch target ++ | gettp TMP1, TMP1 ++ | ldi PC, 4(PC) ++ | xor TMP0, TMP1, TMP0 // TMP0=0 A=D; TMP0!=0 A!=D ++ |.if FFI ++ | ldi TMP3, LJ_TCDATA(zero) ++ | addwi TMP3, 0, TMP3 ++ | cmpeq TMP1, TMP3, AT ++ | bne AT, ->vmeta_equal_cd ++ |.endif ++ | decode_BC4b TMP2 ++ | ldih TMP3, -0x2(zero) // -BCBIAS_J*4 ++ | addw TMP2, TMP3, TMP2 // TMP2=(jump-0x8000)<<2 ++ if (vk) { ++ | selne TMP0, zero, TMP2, TMP2 ++ } else { ++ | seleq TMP0, zero, TMP2, TMP2 ++ } ++ | addl PC, TMP2, PC ++ | ins_next ++ break; ++ ++ /* -- Unary test and copy ops ------------------------------------------- */ ++ ++ case BC_ISTC: case BC_ISFC: case BC_IST: case BC_ISF: ++ | // RA = dst*8 or unused, RD = src*8, JMP with RD = target ++ | addl RD, BASE, RD ++ | ldhu TMP2, OFS_RD(PC) ++ | ldl TMP0, 0(RD) ++ | ldi PC, 4(PC) ++ | gettp TMP0, TMP0 ++ | addl RA, BASE, RA ++ | // cmpulti TMP0, LJ_TISTRUECOND, TMP0 // TMP0=1 true; TMP0=0 false ++ | ldi AT, LJ_TISTRUECOND(zero) ++ | cmpult TMP0, AT, TMP0 ++ | decode_BC4b TMP2 ++ | ldih TMP3, -0x2(zero) // -BCBIAS_J*4 ++ | ldl CRET1, 0(RD) ++ | addw TMP2, TMP3, TMP2 // (jump-0x8000)<<2 ++ if (op == BC_IST || op == BC_ISTC) { ++ | beq TMP0, >1 ++ if (op == BC_ISTC) { ++ | stl CRET1, 0(RA) ++ } ++ } else { ++ | bne TMP0, >1 ++ if (op == BC_ISFC) { ++ | stl CRET1, 0(RA) ++ } ++ } ++ | addl PC, TMP2, PC ++ |1: ++ | ins_next ++ break; ++ ++ case BC_ISTYPE: ++ | // RA = src*8, RD = -type*8 ++ | addl BASE, RA, TMP0 ++ | zapi RD, 0xf0, TMP1 ++ | srli TMP1, 3, TMP1 ++ | ldl TMP0, 0(TMP0) ++ | gettp TMP0, TMP0 ++ | addl TMP0, TMP1, TMP0 // if itype of RA == type, then TMP0=0 ++ | bne TMP0, ->vmeta_istype ++ | ins_next ++ break; ++ case BC_ISNUM: ++ | // RA = src*8, RD = -(TISNUM-1)*8 ++ | addl BASE, RA, TMP0 ++ | ldl TMP0, 0(TMP0) ++ | checknum TMP0, ->vmeta_istype ++ | ins_next ++ break; ++ ++ /* -- Unary ops --------------------------------------------------------- */ ++ ++ case BC_MOV: ++ | // RA = dst*8, RD = 
src*8 ++ | addl RD, BASE, RD ++ | addl RA, BASE, RA ++ | ldl TMP0, 0(RD) ++ | ins_next1 ++ | stl TMP0, 0(RA) ++ | ins_next2 ++ break; ++ case BC_NOT: ++ | // RA = dst*8, RD = src*8 ++ | addl RD, BASE, RD ++ | addl RA, BASE, RA ++ | ldl TMP0, 0(RD) ++ | ldi TMP1, LJ_TTRUE(zero) ++ | ins_next1 ++ | gettp TMP0, TMP0 ++ | cmpult TMP1, TMP0, TMP0 ++ | addwi TMP0, 1, TMP0 ++ | slli TMP0, 47, TMP0 ++ | ornot zero, TMP0, TMP0 // ~TMP0 ++ | stl TMP0, 0(RA) ++ | ins_next2 ++ break; ++ case BC_UNM: ++ | // RA = dst*8, RD = src*8 ++ | addl BASE, RD, RB ++ | addl BASE, RA, RA ++ | ldl TMP0, 0(RB) ++ | ldih TMP1, -32768(zero) ++ | gettp CARG3, TMP0 ++ | ldi TISNUM, LJ_TISNUM(zero); cmpeq CARG3, TISNUM, AT; beq AT, >1 ++ | subw zero, TMP0, TMP0 ++ | cmpeq TMP0, TMP1, AT; bne AT, ->vmeta_unm // Meta handler deals with -2^31. ++ | zapi TMP0, 0xf0, TMP0 ++ | ldi TISNUM, LJ_TISNUM(zero) ++ | settp TMP0, TISNUM ++ | br zero, >2 ++ |1: ++ | ldi AT, LJ_TISNUM(zero); cmpult CARG3, AT, TMP3 ++ | slli TMP1, 32, TMP1 ++ | beq TMP3, ->vmeta_unm ++ | xor TMP0, TMP1, TMP0 // sign => ~sign ++ |2: ++ | stl TMP0, 0(RA) ++ | ins_next ++ break; ++ case BC_LEN: ++ | // RA = dst*8, RD = src*8 ++ | addl BASE, RD, CARG2 ++ | ldl TMP0, 0(CARG2) ++ | addl BASE, RA, RA ++ | gettp TMP1, TMP0 ++ | ldi TMP2, -LJ_TSTR(TMP1) ++ | cleartp STR:CARG1, TMP0 ++ | bne TMP2, >2 ++ | ldw CRET1, STR:CARG1->len ++ |1: ++ | ldi TISNUM, LJ_TISNUM(zero) ++ | settp CRET1, TISNUM ++ | stl CRET1, 0(RA) ++ | ins_next ++ |2: ++ | ldi TMP2, -LJ_TTAB(TMP1) ++ | bne TMP2, ->vmeta_len ++#if LJ_52 ++ | ldl TAB:TMP2, TAB:CARG1->metatable ++ | bne TAB:TMP2, >9 ++ |3: ++#endif ++ |->BC_LEN_Z: ++ | load_got lj_tab_len ++ | call_intern lj_tab_len // (GCtab *t) ++ | // Returns uint32_t (but less than 2^31). 
++ | br zero, <1 ++#if LJ_52 ++ |9: ++ | ldbu TMP0, TAB:TMP2->nomm ++ | andi TMP0, 1<vmeta_len ++#endif ++ break; ++ ++ /* -- Binary ops -------------------------------------------------------- */ ++ ++ |.macro fpmod, b, c, a ++ | fdivd b, c, FCARG1 ++ | br ra, ->vm_floor // floor(b/c) ++ | fmuld FCRET1, c, a ++ | fsubd b, a, a // b - floor(b/c)*c ++ |.endmacro ++ | ++ |.macro ins_arithpre ++ ||vk = ((int)op - BC_ADDVN) / (BC_ADDNV-BC_ADDVN); ++ | // RA = dst*8, RB = src1*8, RC = src2*8 | num_const*8 ++ ||if (vk == 1) { ++ | // RA = dst*8, RB = num_const*8, RC = src1*8 ++ | decode_RB RC, INS ++ | decode_RDtoRC8 RB, RD ++ ||} else { ++ | // RA = dst*8, RB = src1*8, RC = num_const*8 ++ | decode_RB RB, INS ++ | decode_RDtoRC8 RC, RD ++ ||} ++ ||switch (vk) { ++ ||case 0: // suffix is VN ++ | addl RB, BASE, RB ++ | addl RC, KBASE, RC ++ || break; ++ ||case 1: // suffix is NV ++ | addl RC, BASE, RC ++ | addl RB, KBASE, RB ++ || break; ++ ||default: // CAT or suffix is VV ++ | addl RB, BASE, RB ++ | addl RC, BASE, RC ++ || break; ++ ||} ++ |.endmacro ++ | ++ |.macro ins_arithfp, fpins, itype1, itype2 ++ | fldd FTMP0, 0(RB) ++ | ldi TISNUM, LJ_TISNUM(zero) ++ | cmpult itype1, TISNUM, itype1 ++ | cmpult itype2, TISNUM, itype2 ++ | fldd FTMP2, 0(RC) ++ | and itype1, itype2, itype1 ++ | addl RA, BASE, RA ++ | beq itype1, ->vmeta_arith ++ | fpins FTMP0, FTMP2, FCRET1 ++ | ins_next1 ++ | fstd FCRET1, 0(RA) ++ | ins_next2 ++ |.endmacro ++ | ++ |.macro ins_arithead, itype1, itype2, tval1, tval2 ++ | ldl tval1, 0(RB) ++ | ldl tval2, 0(RC) ++ | // Check for two integers. 
++ | gettp itype1, tval1 ++ | gettp itype2, tval2 ++ |.endmacro ++ | ++ |.macro ins_arithdn, intins, fpins ++ | ins_arithpre ++ | ins_arithead TMP0, TMP1, CARG1, CARG2 ++ | ldi TISNUM, LJ_TISNUM(zero) ++ | cmpeq TMP0, TISNUM, AT ++ | beq AT, >1 ++ | cmpeq TMP1, TISNUM, AT ++ | beq AT, >1 ++ | addwi CARG1, 0, CARG3 ++ | addwi CARG2, 0, CARG4 ++ |.if "intins" == "addw" ++ | intins CARG3, CARG4, CRET1 ++ | xor CRET1, CARG3, TMP1 // ((y^a) & (y^b)) < 0: overflow. ++ | xor CRET1, CARG4, TMP2 ++ | and TMP1, TMP2, TMP1 ++ | addl RA, BASE, RA ++ | cmplt TMP1, zero, AT ++ | bne AT, ->vmeta_arith ++ |.elif "intins" == "subw" ++ | intins CARG3, CARG4, CRET1 ++ | xor CRET1, CARG3, TMP1 // ((y^a) & (a^b)) < 0: overflow. ++ | xor CARG3, CARG4, TMP2 ++ | and TMP1, TMP2, TMP1 ++ | addl RA, BASE, RA ++ | cmplt TMP1, zero, AT ++ | bne AT, ->vmeta_arith ++ |.elif "intins" == "mulw" //TODO CHECK ++ | mulw CARG3, CARG4, CRET1 ++ | mull CARG3, CARG4, TMP2 ++ | zapi TMP2, 0xf, TMP2 ++ | addwi CRET1, 0, CRET1 ++ | srai CRET1, 31, TMP1 // 63-32bit not all 0 or 1: overflow. ++ | addl RA, BASE, RA ++ | cmpeq TMP1, TMP2, AT ++ | beq AT, ->vmeta_arith ++ |.endif ++ | zapi CRET1, 0xf0, CRET1 ++ | ldi TISNUM, LJ_TISNUM(zero) ++ | settp CRET1, TISNUM ++ | stl CRET1, 0(RA) ++ | ins_next ++ |1: // Check for two numbers. 
++ | ins_arithfp, fpins, TMP0, TMP1 ++ |.endmacro ++ | ++ |.macro ins_arithdiv, fpins ++ | ins_arithpre ++ | ins_arithead TMP0, TMP1, CARG1, CARG2 ++ | ins_arithfp, fpins, TMP0, TMP1 ++ |.endmacro ++ | ++ |.macro ins_arithmod, fpins ++ | ins_arithpre ++ | ins_arithead TMP0, TMP1, CARG1, CARG2 ++ | load_got lj_vm_modi ++ | ldi TISNUM, LJ_TISNUM(zero) ++ | cmpeq TMP0, TISNUM, AT ++ | beq AT, >1 ++ | cmpeq TMP1, TISNUM, AT ++ | beq AT, >1 ++ | addwi CARG1, 0, CARG1 ++ | addwi CARG2, 0, CARG2 ++ | addl RA, BASE, RA ++ | beq CARG2, ->vmeta_arith ++ | call_intern lj_vm_modi ++ | zapi CRET1, 0xf0, CRET1 ++ | ldi TISNUM, LJ_TISNUM(zero) ++ | settp CRET1, TISNUM ++ | stl CRET1, 0(RA) ++ | ins_next ++ |1: // Check for two numbers. ++ | ins_arithfp, fpins, TMP0, TMP1 ++ |.endmacro ++ ++ case BC_ADDVN: case BC_ADDNV: case BC_ADDVV: ++ | ins_arithdn addw, faddd ++ break; ++ case BC_SUBVN: case BC_SUBNV: case BC_SUBVV: ++ | ins_arithdn subw, fsubd ++ break; ++ case BC_MULVN: case BC_MULNV: case BC_MULVV: ++ | ins_arithdn mulw, fmuld ++ break; ++ case BC_DIVVN: case BC_DIVNV: case BC_DIVVV: ++ | ins_arithdiv fdivd ++ break; ++ case BC_MODVN: case BC_MODNV: case BC_MODVV: ++ | ins_arithmod fpmod ++ break; ++ case BC_POW: ++ | ins_arithpre ++ | ldl CARG1, 0(RB) ++ | ldl CARG2, 0(RC) ++ | gettp TMP0, CARG1 ++ | gettp TMP1, CARG2 ++ | // cmpulti TMP0, LJ_TISNUM, TMP0 ++ | // cmpulti TMP1, LJ_TISNUM, TMP1 ++ | ldi AT, LJ_TISNUM(zero) ++ | cmpult TMP0, AT, TMP0 ++ | cmpult TMP1, AT, TMP1 ++ | and TMP0, TMP1, TMP0 ++ | addl RA, BASE, RA ++ | load_got pow ++ | beq TMP0, ->vmeta_arith ++ | fldd FCARG1, 0(RB) ++ | fldd FCARG2, 0(RC) ++ | call_extern ++ | ins_next1 ++ | fstd FCRET1, 0(RA) ++ | ins_next2 ++ break; ++ ++ case BC_CAT: ++ | // RA = dst*8, RB = src_start*8, RC = src_end*8 ++ | decode_RB RB, INS ++ | decode_RDtoRC8 RC, RD ++ | subl RC, RB, CARG3 ++ | stl BASE, L->base ++ | addl BASE, RC, CARG2 ++ | bis RB, zero, MULTRES ++ |->BC_CAT_Z: ++ | load_got lj_meta_cat ++ | zapi CARG3, 
0xf0, CARG3 ++ | srli CARG3, 3, CARG3 ++ | stl PC, SAVE_PC(sp) ++ | bis L, zero, CARG1 ++ | call_intern lj_meta_cat // (lua_State *L, TValue *top, int left) ++ | // Returns NULL (finished) or TValue * (metamethod). ++ | ldl BASE, L->base ++ | bne CRET1, ->vmeta_binop ++ | addl BASE, MULTRES, RB ++ | ldl TMP0, 0(RB) ++ | addl RA, BASE, RA ++ | stl TMP0, 0(RA) ++ | ins_next ++ break; ++ ++ /* -- Constant ops ------------------------------------------------------ */ ++ ++ case BC_KSTR: ++ | // RA = dst*8, RD = str_const*8 (~) ++ | subl KBASE, RD, TMP1 ++ | ldi TMP2, LJ_TSTR(zero) ++ | addwi TMP2, 0, TMP2 ++ | ldl TMP0, -8(TMP1) // KBASE-8-str_const*8 ++ | addl RA, BASE, RA ++ | settp TMP0, TMP2 ++ | stl TMP0, 0(RA) ++ | ins_next ++ break; ++ case BC_KCDATA: ++ |.if FFI ++ | // RA = dst*8, RD = cdata_const*8 (~) ++ | subl KBASE, RD, TMP1 ++ | ldl TMP0, -8(TMP1) // KBASE-8-cdata_const*8 ++ | ldi TMP2, LJ_TCDATA(zero) ++ | addwi TMP2, 0, TMP2 ++ | addl RA, BASE, RA ++ | settp TMP0, TMP2 ++ | stl TMP0, 0(RA) ++ | ins_next ++ |.endif ++ break; ++ case BC_KSHORT: ++ | // RA = dst*8, RD = int16_literal*8 ++ | addwi INS, 0, RD ++ | srai RD, 16, RD ++ | addl RA, BASE, RA ++ | zapi RD, 0xf0, RD ++ | ldi TISNUM, LJ_TISNUM(zero) ++ | settp RD, TISNUM ++ | stl RD, 0(RA) ++ | ins_next ++ break; ++ case BC_KNUM: ++ | // RA = dst*8, RD = num_const*8 ++ | addl RD, KBASE, RD ++ | addl RA, BASE, RA ++ | ldl TMP0, 0(RD) ++ | stl TMP0, 0(RA) ++ | ins_next ++ break; ++ case BC_KPRI: ++ | // RA = dst*8, RD = primitive_type*8 (~) ++ | addl RA, BASE, RA ++ | slli RD, 44, TMP0 // 44+3 ++ | ornot zero, TMP0, TMP0 // ~TMP0 ++ | stl TMP0, 0(RA) ++ | ins_next ++ break; ++ case BC_KNIL: ++ | // RA = base*8, RD = end*8 ++ | addl RA, BASE, RA ++ | stl TISNIL, 0(RA) ++ | ldi RA, 8(RA) ++ | addl RD, BASE, RD ++ |1: ++ | stl TISNIL, 0(RA) ++ | cmplt RA, RD, TMP0 ++ | ldi RA, 8(RA) ++ | bne TMP0, <1 ++ | ins_next ++ break; ++ ++ /* -- Upvalue and function ops ------------------------------------------ */ 
++ ++ case BC_UGET: ++ | // RA = dst*8, RD = uvnum*8 ++ | ldl LFUNC:TMP0, FRAME_FUNC(BASE) ++ | addl RA, BASE, RA ++ | cleartp LFUNC:TMP0 ++ | addl RD, LFUNC:TMP0, RD ++ | ldl UPVAL:TMP0, LFUNC:RD->uvptr ++ | ldl TMP1, UPVAL:TMP0->v ++ | ldl TMP2, 0(TMP1) ++ | ins_next1 ++ | stl TMP2, 0(RA) ++ | ins_next2 ++ break; ++ case BC_USETV: ++ | // RA = uvnum*8, RD = src*8 ++ | ldl LFUNC:TMP0, FRAME_FUNC(BASE) ++ | addl RD, BASE, RD ++ | cleartp LFUNC:TMP0 ++ | addl RA, LFUNC:TMP0, RA ++ | ldl UPVAL:TMP0, LFUNC:RA->uvptr ++ | ldl CRET1, 0(RD) ++ | ldbu TMP3, UPVAL:TMP0->marked ++ | ldl CARG2, UPVAL:TMP0->v ++ | andi TMP3, LJ_GC_BLACK, TMP3 // isblack(uv) ++ | ldbu TMP0, UPVAL:TMP0->closed ++ | gettp TMP2, CRET1 ++ | stl CRET1, 0(CARG2) ++ | bis TMP3, TMP0, TMP3 ++ | ldi TMP0, LJ_GC_BLACK|1(zero) ++ | ldi TMP2, -(LJ_TNUMX+1)(TMP2) ++ | cmpeq TMP3, TMP0, AT ++ | bne AT, >2 // Upvalue is closed and black? ++ |1: ++ | ins_next ++ | ++ |2: // Check if new value is collectable. ++ | // cmpulti TMP2, LJ_TISGCV - (LJ_TNUMX+1), TMP0 ++ | ldi TMP0, (LJ_TISGCV-(LJ_TNUMX+1))(zero) ++ | cmpult TMP2, TMP0, TMP0 ++ | cleartp GCOBJ:CRET1, CRET1 ++ | beq TMP0, <1 // tvisgcv(v) ++ | ldbu TMP3, GCOBJ:CRET1->gch.marked ++ | andi TMP3, LJ_GC_WHITES, TMP3 // iswhite(v) ++ | load_got lj_gc_barrieruv ++ | beq TMP3, <1 ++ | // Crossed a write barrier. Move the barrier forward. 
++ | ldi CARG1, GG_DISP2G(DISPATCH) ++ | call_intern lj_gc_barrieruv // (global_State *g, TValue *tv) ++ | br zero, <1 ++ break; ++ case BC_USETS: ++ | // RA = uvnum*8, RD = str_const*8 (~) ++ | ldl LFUNC:TMP0, FRAME_FUNC(BASE) ++ | subl KBASE, RD, TMP1 ++ | cleartp LFUNC:TMP0 ++ | addl RA, LFUNC:TMP0, RA ++ | ldl UPVAL:TMP0, LFUNC:RA->uvptr ++ | ldl STR:TMP1, -8(TMP1) // KBASE-8-str_const*8 ++ | ldbu TMP2, UPVAL:TMP0->marked ++ | ldl CARG2, UPVAL:TMP0->v ++ | ldbu TMP3, STR:TMP1->marked ++ | andi TMP2, LJ_GC_BLACK, TMP4 // isblack(uv) ++ | ldbu TMP2, UPVAL:TMP0->closed ++ | ldi TMP0, LJ_TSTR(zero) ++ | settp TMP1, TMP0 ++ | stl TMP1, 0(CARG2) ++ | bne TMP4, >2 ++ |1: ++ | ins_next ++ | ++ |2: // Check if string is white and ensure upvalue is closed. ++ | andi TMP3, LJ_GC_WHITES, TMP0 // iswhite(str) ++ | beq TMP2, <1 ++ | load_got lj_gc_barrieruv ++ | beq TMP0, <1 ++ | // Crossed a write barrier. Move the barrier forward. ++ | ldi CARG1, GG_DISP2G(DISPATCH) ++ | call_intern lj_gc_barrieruv // (global_State *g, TValue *tv) ++ | br zero, <1 ++ break; ++ case BC_USETN: ++ | // RA = uvnum*8, RD = num_const*8 ++ | ldl LFUNC:TMP0, FRAME_FUNC(BASE) ++ | addl RD, KBASE, RD ++ | cleartp LFUNC:TMP0 ++ | addl RA, LFUNC:TMP0, TMP0 ++ | ldl UPVAL:TMP0, LFUNC:TMP0->uvptr ++ | ldl TMP1, 0(RD) ++ | ldl TMP0, UPVAL:TMP0->v ++ | stl TMP1, 0(TMP0) ++ | ins_next ++ break; ++ case BC_USETP: ++ | // RA = uvnum*8, RD = primitive_type*8 (~) ++ | ldl LFUNC:TMP0, FRAME_FUNC(BASE) ++ | slli RD, 44, TMP2 ++ | cleartp LFUNC:TMP0 ++ | addl RA, LFUNC:TMP0, TMP0 ++ | ornot zero, TMP2, TMP2 // ~TMP2 ++ | ldl UPVAL:TMP0, LFUNC:TMP0->uvptr ++ | ldl TMP1, UPVAL:TMP0->v ++ | stl TMP2, 0(TMP1) ++ | ins_next ++ break; ++ ++ case BC_UCLO: ++ | // RA = level*8, RD = target ++ | ldl TMP2, L->openupval ++ | branch_RD // Do this first since RD is not saved. 
++ | load_got lj_func_closeuv ++ | stl BASE, L->base ++ | bis L, zero, CARG1 ++ | beq TMP2, >1 ++ | addl BASE, RA, CARG2 ++ | call_intern lj_func_closeuv // (lua_State *L, TValue *level) ++ | ldl BASE, L->base ++ |1: ++ | ins_next ++ break; ++ ++ case BC_FNEW: ++ | // RA = dst*8, RD = proto_const*8 (~) (holding function prototype) ++ | load_got lj_func_newL_gc ++ | subl KBASE, RD, TMP1 ++ | ldl CARG3, FRAME_FUNC(BASE) ++ | ldl CARG2, -8(TMP1) // KBASE-8-tab_const*8 ++ | stl BASE, L->base ++ | stl PC, SAVE_PC(sp) ++ | cleartp CARG3 ++ | bis L, zero, CARG1 ++ | // (lua_State *L, GCproto *pt, GCfuncL *parent) ++ | call_intern lj_func_newL_gc ++ | // Returns GCfuncL *. ++ | ldi TMP0, LJ_TFUNC(zero) ++ | ldl BASE, L->base ++ | settp CRET1, TMP0 ++ | addl RA, BASE, RA ++ | stl CRET1, 0(RA) ++ | ins_next ++ break; ++ ++ /* -- Table ops --------------------------------------------------------- */ ++ ++ case BC_TNEW: ++ case BC_TDUP: ++ | // RA = dst*8, RD = (hbits|asize)*8 | tab_const*8 (~) ++ | ldl TMP0, DISPATCH_GL(gc.total)(DISPATCH) ++ | ldl TMP1, DISPATCH_GL(gc.threshold)(DISPATCH) ++ | stl BASE, L->base ++ | cmpult TMP0, TMP1, TMP2 ++ | stl PC, SAVE_PC(sp) ++ | beq TMP2, >5 ++ |1: ++ if (op == BC_TNEW) { ++ | load_got lj_tab_new ++ | zapi RD, 0xf0, CARG2 ++ | srli CARG2, 3, CARG2 ++ | ldi AT, 0x7ff(zero) ++ | and CARG2, AT, CARG2 ++ | ldi TMP0, 0x801(zero) ++ | subw CARG2, AT, TMP2 ++ | zapi RD, 0xf0, CARG3 ++ | srli CARG3, 14, CARG3 ++ | seleq TMP2, TMP0, CARG2, CARG2 ++ | // (lua_State *L, int32_t asize, uint32_t hbits) ++ | bis L, zero, CARG1 ++ | call_intern lj_tab_new ++ | // Returns Table *. ++ } else { ++ | load_got lj_tab_dup ++ | subl KBASE, RD, TMP1 ++ | bis L, zero, CARG1 ++ | ldl CARG2, -8(TMP1) // KBASE-8-str_const*8 ++ | call_intern lj_tab_dup // (lua_State *L, Table *kt) ++ | // Returns Table *. 
++ } ++ | ldi TMP0, LJ_TTAB(zero) ++ | ldl BASE, L->base ++ | ins_next1 ++ | settp CRET1, TMP0 ++ | addl RA, BASE, RA ++ | stl CRET1, 0(RA) ++ | ins_next2 ++ |5: ++ | load_got lj_gc_step_fixtop ++ | bis RD, zero, MULTRES ++ | bis L, zero, CARG1 ++ | call_intern lj_gc_step_fixtop // (lua_State *L) ++ | bis MULTRES, zero, RD ++ | br zero, <1 ++ break; ++ ++ case BC_GGET: ++ | // RA = dst*8, RD = str_const*8 (~) ++ case BC_GSET: ++ | // RA = src*8, RD = str_const*8 (~) ++ | ldl LFUNC:TMP0, FRAME_FUNC(BASE) ++ | subl KBASE, RD, TMP1 ++ | ldl STR:RC, -8(TMP1) // KBASE-8-str_const*8 ++ | cleartp LFUNC:TMP0 ++ | ldl TAB:RB, LFUNC:TMP0->env ++ | addl RA, BASE, RA ++ if (op == BC_GGET) { ++ | br zero, ->BC_TGETS_Z ++ } else { ++ | br zero, ->BC_TSETS_Z ++ } ++ break; ++ ++ case BC_TGETV: ++ | // RA = dst*8, RB = table*8, RC = key*8 ++ | decode_RB RB, INS ++ | decode_RDtoRC8 RC, RD ++ | addl BASE, RB, CARG2 ++ | addl BASE, RC, CARG3 ++ | ldl TAB:RB, 0(CARG2) ++ | ldl TMP2, 0(CARG3) ++ | addl RA, BASE, RA ++ | checktab TAB:RB, ->vmeta_tgetv ++ | gettp TMP3, TMP2 ++ | ldw TMP0, TAB:RB->asize ++ | ldi TISNUM, LJ_TISNUM(zero) ++ | cmpeq TMP3, TISNUM, AT ++ | beq AT, >5 // Integer key? ++ | addwi TMP2, 0, TMP2 ++ | ldl TMP1, TAB:RB->array ++ | cmpult TMP2, TMP0, TMP3 //array part (keys = [0, asize-1]) ++ | s8addwi TMP2, 0, TMP2 ++ | beq TMP3, ->vmeta_tgetv // Integer key and in array part? ++ | addl TMP2, TMP1, TMP2 ++ | ldl AT, 0(TMP2) ++ | ldl CRET1, 0(TMP2) ++ | cmpeq AT, TISNIL, AT ++ | bne AT, >2 ++ |1: ++ | ins_next1 ++ | stl CRET1, 0(RA) ++ | ins_next2 ++ | ++ |2: // Check for __index if table value is nil. ++ | ldl TAB:TMP2, TAB:RB->metatable ++ | beq TAB:TMP2, <1 // No metatable: done. ++ | ldbu TMP0, TAB:TMP2->nomm ++ | andi TMP0, 1<vmeta_tgetv ++ | ++ |5: ++ | ldi TMP0, LJ_TSTR(zero) ++ | cleartp RC, TMP2 ++ | cmpeq TMP3, TMP0, AT ++ | beq AT, ->vmeta_tgetv // String key? 
++ | br zero, ->BC_TGETS_Z ++ break; ++ case BC_TGETS: ++ | // RA = dst*8, RB = table*8, RC = str_const*8 (~) ++ | decode_RB RB, INS ++ | decode_RDtoRC8 RC, RD //TODO CHECK ++ | addl BASE, RB, CARG2 ++ | subl KBASE, RC, CARG3 ++ | ldl TAB:RB, 0(CARG2) ++ | addl RA, BASE, RA ++ | ldl STR:RC, -8(CARG3) // KBASE-8-str_const*8 ++ | checktab TAB:RB, ->vmeta_tgets1 ++ |->BC_TGETS_Z: ++ | // TAB:RB = GCtab *, STR:RC = GCstr *, RA = dst*8 ++ | ldw TMP0, TAB:RB->hmask ++ | ldw TMP1, STR:RC->hash ++ | ldl NODE:TMP2, TAB:RB->node ++ | and TMP1, TMP0, TMP1 // idx = str->hash & tab->hmask ++ | slli TMP1, 5, TMP0 ++ | addwi TMP0, 0, TMP0 ++ | s8addwi TMP1, 0, TMP1 ++ | subw TMP0, TMP1, TMP1 ++ | ldi TMP3, LJ_TSTR(zero) ++ | addl NODE:TMP2, TMP1, NODE:TMP2 // node = tab->node + (idx*32-idx*8) ++ | settp STR:RC, TMP3 // Tagged key to look for. ++ |1: ++ | ldl CARG1, NODE:TMP2->key ++ | ldl CRET1, NODE:TMP2->val ++ | ldl NODE:TMP1, NODE:TMP2->next ++ | ldl TAB:TMP3, TAB:RB->metatable ++ | cmpeq CARG1, RC, TMP4 ++ | beq TMP4, >4 ++ | cmpeq CRET1, TISNIL, TMP4 ++ | bne TMP4, >5 // Key found, but nil value? ++ |3: ++ | ins_next1 ++ | stl CRET1, 0(RA) ++ | ins_next2 ++ | ++ |4: // Follow hash chain. ++ | bis NODE:TMP1, zero, NODE:TMP2 ++ | bne NODE:TMP1, <1 ++ | // End of hash chain: key not found, nil result. ++ | ++ |5: // Check for __index if table value is nil. ++ | bis TISNIL, zero, CRET1 ++ | beq TAB:TMP3, <3 // No metatable: done. 
++ | ldbu TMP0, TAB:TMP3->nomm ++ | andi TMP0, 1<vmeta_tgets ++ break; ++ case BC_TGETB: ++ | // RA = dst*8, RB = table*8, RC = index*8 ++ | decode_RB RB, INS ++ | addl BASE, RB, CARG2 ++ | decode_RDtoRC8 RC, RD ++ | ldl TAB:RB, 0(CARG2) ++ | addl RA, BASE, RA ++ | zapi RC, 0xf0, TMP0 ++ | srli TMP0, 3, TMP0 ++ | checktab TAB:RB, ->vmeta_tgetb ++ | ldw TMP1, TAB:RB->asize ++ | ldl TMP2, TAB:RB->array ++ | cmpult TMP0, TMP1, TMP1 ++ | addl RC, TMP2, RC ++ | beq TMP1, ->vmeta_tgetb ++ | ldl CRET1, 0(RC) ++ | cmpeq CRET1, TISNIL, AT ++ | bne AT, >5 ++ |1: ++ | ins_next1 ++ | stl CRET1, 0(RA) ++ | ins_next2 ++ | ++ |5: // Check for __index if table value is nil. ++ | ldl TAB:TMP2, TAB:RB->metatable ++ | beq TAB:TMP2, <1 // No metatable: done. ++ | ldbu TMP1, TAB:TMP2->nomm ++ | andi TMP1, 1<vmeta_tgetb // Caveat: preserve TMP0 and CARG2! ++ break; ++ case BC_TGETR: ++ | // RA = dst*8, RB = table*8, RC = key*8 ++ | decode_RB RB, INS ++ | decode_RDtoRC8 RC, RD ++ | addl RB, BASE, RB ++ | addl RC, BASE, RC ++ | ldl TAB:CARG1, 0(RB) ++ | ldw CARG2, 0(RC) ++ | addl RA, BASE, RA ++ | cleartp TAB:CARG1 ++ | ldw TMP0, TAB:CARG1->asize ++ | ldl TMP1, TAB:CARG1->array ++ | cmpult CARG2, TMP0, TMP0 ++ | s8addwi CARG2, 0, TMP2 ++ | addl TMP1, TMP2, CRET1 ++ | beq TMP0, ->vmeta_tgetr // In array part? ++ | ldl CARG2, 0(CRET1) ++ |->BC_TGETR_Z: ++ | ins_next1 ++ | stl CARG2, 0(RA) ++ | ins_next2 ++ break; ++ ++ case BC_TSETV: ++ | // RA = src*8, RB = table*8, RC = key*8 ++ | decode_RB RB, INS ++ | decode_RDtoRC8 RC, RD ++ | addl BASE, RB, CARG2 ++ | addl BASE, RC, CARG3 ++ | ldl RB, 0(CARG2) ++ | ldl TMP2, 0(CARG3) ++ | addl RA, BASE, RA ++ | checktab RB, ->vmeta_tsetv ++ | addwi TMP2, 0, RC ++ | checkint TMP2, >5 ++ | ldw TMP0, TAB:RB->asize ++ | ldl TMP1, TAB:RB->array ++ | cmpult RC, TMP0, TMP0 ++ | s8addwi RC, 0, TMP2 ++ | beq TMP0, ->vmeta_tsetv // Integer key and in array part? 
++ | addl TMP1, TMP2, TMP1 ++ | ldbu TMP3, TAB:RB->marked ++ | ldl TMP0, 0(TMP1) ++ | ldl CRET1, 0(RA) ++ | cmpeq TMP0, TISNIL, AT ++ | bne AT, >3 ++ |1: ++ | andi TMP3, LJ_GC_BLACK, TMP2 // isblack(table) ++ | stl CRET1, 0(TMP1) ++ | bne TMP2, >7 ++ |2: ++ | ins_next ++ | ++ |3: // Check for __newindex if previous value is nil. ++ | ldl TAB:TMP2, TAB:RB->metatable ++ | beq TAB:TMP2, <1 // No metatable: done. ++ | ldbu TMP2, TAB:TMP2->nomm ++ | andi TMP2, 1<vmeta_tsetv ++ |5: ++ | gettp TMP0, TMP2 ++ | ldi TMP0, -LJ_TSTR(TMP0) ++ | bne TMP0, ->vmeta_tsetv ++ | cleartp STR:RC, TMP2 ++ | br zero, ->BC_TSETS_Z // String key? ++ | ++ |7: // Possible table write barrier for the value. Skip valiswhite check. ++ | barrierback TAB:RB, TMP3, TMP0, <2 ++ break; ++ case BC_TSETS: ++ | // RA = src*8, RB = table*8, RC = str_const*8 (~) ++ | decode_RB RB, INS ++ | decode_RDtoRC8 RC, RD ++ | addl BASE, RB, CARG2 ++ | subl KBASE, RC, CARG3 ++ | ldl TAB:RB, 0(CARG2) ++ | ldl RC, -8(CARG3) // KBASE-8-str_const*8 ++ | addl RA, BASE, RA ++ | cleartp STR:RC ++ | checktab TAB:RB, ->vmeta_tsets1 ++ |->BC_TSETS_Z: ++ | // TAB:RB = GCtab *, STR:RC = GCstr *, RA = BASE+src*8 ++ | ldw TMP0, TAB:RB->hmask ++ | ldw TMP1, STR:RC->hash ++ | ldl NODE:TMP2, TAB:RB->node ++ | stb zero, TAB:RB->nomm // Clear metamethod cache. ++ | and TMP1, TMP0, TMP1 // idx = str->hash & tab->hmask ++ | slli TMP1, 5, TMP0 ++ | addwi TMP0, 0, TMP0 ++ | s8addwi TMP1, 0, TMP1 ++ | subw TMP0, TMP1, TMP1 ++ | ldi TMP3, LJ_TSTR(zero) ++ | addl NODE:TMP2, TMP1, NODE:TMP2 // node = tab->node + (idx*32-idx*8) ++ | settp STR:RC, TMP3 // Tagged key to look for. ++ | fldd f9, 0(RA) ++ |1: ++ | ldl TMP0, NODE:TMP2->key ++ | ldl CARG2, NODE:TMP2->val ++ | ldl NODE:TMP1, NODE:TMP2->next ++ | ldbu TMP3, TAB:RB->marked ++ | cmpeq TMP0, RC, AT ++ | beq AT, >5 ++ | ldl TAB:TMP0, TAB:RB->metatable ++ | cmpeq CARG2, TISNIL, AT ++ | bne AT, >4 // Key found, but nil value? 
++ |2: ++ | andi TMP3, LJ_GC_BLACK, TMP3 // isblack(table) ++ | fstd f9, NODE:TMP2->val ++ | bne TMP3, >7 ++ |3: ++ | ins_next ++ | ++ |4: // Check for __newindex if previous value is nil. ++ | beq TAB:TMP0, <2 // No metatable: done. ++ | ldbu TMP0, TAB:TMP0->nomm ++ | andi TMP0, 1<vmeta_tsets ++ | ++ |5: // Follow hash chain. ++ | bis NODE:TMP1, zero, NODE:TMP2 ++ | bne NODE:TMP1, <1 ++ | // End of hash chain: key not found, add a new one ++ | ++ | // But check for __newindex first. ++ | ldl TAB:TMP2, TAB:RB->metatable ++ | ldi CARG3, DISPATCH_GL(tmptv)(DISPATCH) ++ | beq TAB:TMP2, >6 // No metatable: continue. ++ | ldbu TMP0, TAB:TMP2->nomm ++ | andi TMP0, 1<vmeta_tsets // 'no __newindex' flag NOT set: check. ++ |6: ++ | load_got lj_tab_newkey ++ | stl RC, 0(CARG3) ++ | stl BASE, L->base ++ | bis TAB:RB, zero, CARG2 ++ | stl PC, SAVE_PC(sp) ++ | bis L, zero, CARG1 ++ | call_intern lj_tab_newkey // (lua_State *L, GCtab *t, TValue *k ++ | // Returns TValue *. ++ | ldl BASE, L->base ++ | fstd f9, 0(CRET1) ++ | br zero, <3 // No 2nd write barrier needed. ++ | ++ |7: // Possible table write barrier for the value. Skip valiswhite check. ++ | barrierback TAB:RB, TMP3, TMP0, <3 ++ break; ++ case BC_TSETB: ++ | // RA = src*8, RB = table*8, RC = index*8 ++ | decode_RB RB, INS ++ | decode_RDtoRC8 RC, RD ++ | addl BASE, RB, CARG2 ++ | addl RA, BASE, RA ++ | ldl TAB:RB, 0(CARG2) ++ | zapi RC, 0xf0, TMP0 ++ | srli TMP0, 3, TMP0 ++ | checktab RB, ->vmeta_tsetb ++ | ldw TMP1, TAB:RB->asize ++ | ldl TMP2, TAB:RB->array ++ | cmpult TMP0, TMP1, TMP1 ++ | addl RC, TMP2, RC ++ | beq TMP1, ->vmeta_tsetb ++ | ldl TMP1, 0(RC) ++ | ldbu TMP3, TAB:RB->marked ++ | cmpeq TMP1, TISNIL, AT ++ | bne AT, >5 ++ |1: ++ | ldl CRET1, 0(RA) ++ | andi TMP3, LJ_GC_BLACK, TMP1 // isblack(table) ++ | stl CRET1, 0(RC) ++ | bne TMP1, >7 ++ |2: ++ | ins_next ++ | ++ |5: // Check for __newindex if previous value is nil. ++ | ldl TAB:TMP2, TAB:RB->metatable ++ | beq TAB:TMP2, <1 // No metatable: done. 
++ | ldbu TMP1, TAB:TMP2->nomm ++ | andi TMP1, 1<vmeta_tsetb // Caveat: preserve TMP0 and CARG2! ++ | ++ |7: // Possible table write barrier for the value. Skip valiswhite check. ++ | barrierback TAB:RB, TMP3, TMP0, <2 ++ break; ++ case BC_TSETR: ++ | // RA = dst*8, RB = table*8, RC = key*8 ++ | decode_RB RB, INS ++ | decode_RDtoRC8 RC, RD ++ | addl BASE, RB, CARG1 ++ | addl BASE, RC, CARG3 ++ | ldl TAB:CARG2, 0(CARG1) ++ | ldw CARG3, 0(CARG3) ++ | cleartp TAB:CARG2 ++ | ldbu TMP3, TAB:CARG2->marked ++ | ldw TMP0, TAB:CARG2->asize ++ | ldl TMP1, TAB:CARG2->array ++ | andi TMP3, LJ_GC_BLACK, TMP2 // isblack(table) ++ | addl RA, BASE, RA ++ | bne TMP2, >7 ++ |2: ++ | cmpult CARG3, TMP0, TMP0 ++ | s8addwi CARG3, 0, TMP2 ++ | addl TMP1, TMP2, CRET1 ++ | beq TMP0, ->vmeta_tsetr // In array part? ++ |->BC_TSETR_Z: ++ | ldl TMP1, 0(RA) ++ | ins_next1 ++ | stl TMP1, 0(CRET1) ++ | ins_next2 ++ | ++ |7: // Possible table write barrier for the value. Skip valiswhite check. ++ | barrierback TAB:CARG2, TMP3, CRET1, <2 ++ break; ++ ++ case BC_TSETM: ++ | // RA = base*8 (table at base-1), RD = num_const*8 (start index) ++ | addl RA, BASE, RA ++ |1: ++ | addl KBASE, RD, TMP3 ++ | ldl TAB:CARG2, -8(RA) // Guaranteed to be a table. ++ | subwi MULTRES, 8, TMP0 ++ | ldw TMP3, 0(TMP3) // Integer constant is in lo-word. ++ | zapi TMP0, 0xf0, CARG3 ++ | srli CARG3, 3, CARG3 ++ | beq TMP0, >4 // Nothing to copy? ++ | cleartp TAB:CARG2 ++ | addw CARG3, TMP3, CARG3 ++ | ldw TMP2, TAB:CARG2->asize ++ | s8addwi TMP3, 0, TMP1 ++ | ldbu TMP3, TAB:CARG2->marked ++ | ldl CARG1, TAB:CARG2->array ++ | cmpult TMP2, CARG3, TMP4 ++ | addl TMP0, RA, TMP2 ++ | bne TMP4, >5 ++ | addl TMP1, CARG1, TMP1 ++ | andi TMP3, LJ_GC_BLACK, TMP0 // isblack(table) ++ |3: // Copy result slots to table. ++ | ldl CRET1, 0(RA) ++ | ldi RA, 8(RA) ++ | cmpult RA, TMP2, TMP4 ++ | stl CRET1, 0(TMP1) ++ | ldi TMP1, 8(TMP1) ++ | bne TMP4, <3 ++ | bne TMP0, >7 ++ |4: ++ | ins_next ++ | ++ |5: // Need to resize array part. 
++ | load_got lj_tab_reasize ++ | stl BASE, L->base ++ | stl PC, SAVE_PC(sp) ++ | bis RD, zero, BASE ++ | bis L, zero, CARG1 ++ | call_intern lj_tab_reasize // (lua_State *L, GCtab *t, int nasize) ++ | // Must not reallocate the stack. ++ | bis BASE, zero, RD ++ | ldl BASE, L->base // Reload BASE for lack of a saved register. ++ | br zero, <1 ++ | ++ |7: // Possible table write barrier for any value. Skip valiswhite check. ++ | barrierback TAB:CARG2, TMP3, TMP0, <4 ++ break; ++ ++ /* -- Calls and vararg handling ----------------------------------------- */ ++ ++ case BC_CALLM: ++ | // RA = base*8, (RB = (nresults+1)*8,) RC = extra_nargs*8 ++ | decode_RDtoRC8 NARGS8:RC, RD ++ | addw NARGS8:RC, MULTRES, NARGS8:RC ++ | br zero, ->BC_CALL_Z ++ break; ++ case BC_CALL: ++ | // RA = base*8, (RB = (nresults+1)*8,) RC = (nargs+1)*8 ++ | decode_RDtoRC8 NARGS8:RC, RD ++ |->BC_CALL_Z: ++ | bis BASE, zero, TMP2 ++ | addl BASE, RA, BASE ++ | ldl LFUNC:RB, 0(BASE) ++ | ldi BASE, 16(BASE) ++ | subwi NARGS8:RC, 8, NARGS8:RC ++ | checkfunc RB, ->vmeta_call ++ | ins_call ++ break; ++ ++ case BC_CALLMT: ++ | // RA = base*8, (RB = 0,) RC = extra_nargs*8 ++ | addw NARGS8:RD, MULTRES, NARGS8:RD ++ | br zero, ->BC_CALLT_Z1 ++ break; ++ case BC_CALLT: ++ | // RA = base*8, (RB = 0,) RC = (nargs+1)*8 ++ |->BC_CALLT_Z1: ++ | addl RA, BASE, RA ++ | ldl LFUNC:RB, 0(RA) ++ | bis RD, zero, NARGS8:RC ++ | ldl TMP1, FRAME_PC(BASE) ++ | ldi RA, 16(RA) ++ | subwi NARGS8:RC, 8, NARGS8:RC ++ | checktp CARG3, LFUNC:RB, -LJ_TFUNC, ->vmeta_callt ++ |->BC_CALLT_Z: ++ | andi TMP1, FRAME_TYPE, TMP0 // Caveat: preserve TMP0 until the 'or'. ++ | ldbu TMP3, LFUNC:CARG3->ffid ++ | xori TMP1, FRAME_VARG, TMP2 ++ | bne TMP0, >7 ++ |1: ++ | stl LFUNC:RB, FRAME_FUNC(BASE) // Copy function down, but keep PC. ++ | cmpulti TMP3, 2, CARG4 // (> FF_C) Calling a fast function? 
++ | bis BASE, zero, TMP2 ++ | bis CARG3, zero, RB ++ | bis NARGS8:RC, zero, TMP3 ++ | beq NARGS8:RC, >3 ++ |2: ++ | ldl CRET1, 0(RA) ++ | ldi RA, 8(RA) ++ | subwi TMP3, 8, TMP3 ++ | stl CRET1, 0(TMP2) ++ | ldi TMP2, 8(TMP2) ++ | bne TMP3, <2 ++ |3: ++ | bis TMP0, CARG4, TMP0 ++ | beq TMP0, >5 ++ |4: ++ | ins_callt ++ | ++ |5: // Tailcall to a fast function with a Lua frame below. ++ | ldw INS, -4(TMP1) ++ | decode_RA RA, INS ++ | subl BASE, RA, TMP1 ++ | ldl TMP1, -32(TMP1) ++ | cleartp LFUNC:TMP1 ++ | ldl TMP1, LFUNC:TMP1->pc ++ | ldl KBASE, PC2PROTO(k)(TMP1) // Need to prepare KBASE. ++ | br zero, <4 ++ | ++ |7: // Tailcall from a vararg function. ++ | andi TMP2, FRAME_TYPEP, CARG4 ++ | subl BASE, TMP2, TMP2 // Relocate BASE down. ++ | bne CARG4, <1 // Vararg frame below? ++ | bis TMP2, zero, BASE ++ | ldl TMP1, FRAME_PC(TMP2) ++ | andi TMP1, FRAME_TYPE, TMP0 ++ | br zero, <1 ++ break; ++ ++ case BC_ITERC: ++ | // RA = base*8, (RB = (nresults+1)*8, RC = (nargs+1)*8 ((2+1)*8)) ++ | bis BASE, zero, TMP2 // Save old BASE for vmeta_call. ++ | addl BASE, RA, BASE ++ | ldl RB, -24(BASE) //A, A+1, A+2 = A-3, A-2, A-1. ++ | ldl CARG1, -16(BASE) ++ | ldl CARG2, -8(BASE) ++ | ldi NARGS8:RC, 16(zero) // Iterators get 2 arguments. ++ | stl RB, 0(BASE) // Copy callable. ++ | stl CARG1, 16(BASE) // Copy state. ++ | stl CARG2, 24(BASE) // Copy control var. ++ | ldi BASE, 16(BASE) ++ | checkfunc RB, ->vmeta_call ++ | ins_call ++ break; ++ ++ case BC_ITERN: ++ | // RA = base*8, (RB = (nresults+1)*8, RC = (nargs+1)*8 (2+1)*8) ++ | addl RA, BASE, RA ++ | ldl TAB:RB, -16(RA) ++ | ldw RC, -8(RA) // Get index from control var. ++ | cleartp TAB:RB ++ | ldi PC, 4(PC) ++ | ldw TMP0, TAB:RB->asize ++ | ldl TMP1, TAB:RB->array ++ | ldi TISNUM, LJ_TISNUM(zero) ++ | slli TISNUM, 47, CARG3 ++ |1: // Traverse array part. ++ | cmpult RC, TMP0, TMP2 ++ | s8addwi RC, 0, TMP3 ++ | beq TMP2, >5 // Index points after array part? 
++ | addl TMP3, TMP1, TMP3 ++ | ldl CARG1, 0(TMP3) ++ | ldhu RD, -4+OFS_RD(PC) // ITERL RD ++ | bis RC, CARG3, TMP2 ++ | addwi RC, 1, RC ++ | cmpeq CARG1, TISNIL, AT ++ | bne AT, <1 // Skip holes in array part. ++ | stl TMP2, 0(RA) ++ | stl CARG1, 8(RA) ++ | ldih TMP3, -0x2(zero) // -BCBIAS_J*4 ++ | decode_BC4b RD ++ | addl RD, TMP3, RD ++ | stw RC, -8(RA) // Update control var. ++ | addl PC, RD, PC ++ |3: ++ | ins_next ++ | ++ |5: // Traverse hash part. ++ | ldw TMP1, TAB:RB->hmask ++ | subw RC, TMP0, RC ++ | ldl TMP2, TAB:RB->node ++ |6: ++ | cmpult TMP1, RC, CARG1 // End of iteration? Branch to ITERL+1. ++ | slli RC, 5, TMP3 ++ | addwi TMP3, 0, TMP3 ++ | bne CARG1, <3 ++ | s8addwi RC, 0, RB ++ | subw TMP3, RB, TMP3 ++ | addl TMP3, TMP2, NODE:TMP3 // node = tab->node + (idx*32-idx*8) ++ | ldl CARG1, 0(NODE:TMP3) ++ | ldhu RD, -4+OFS_RD(PC) // ITERL RD ++ | addwi RC, 1, RC ++ | cmpeq CARG1, TISNIL, AT ++ | bne AT, <6 // Skip holes in hash part. ++ | ldl CARG2, NODE:TMP3->key ++ | ldih TMP3, -0x2(zero) // -BCBIAS_J*4 ++ | stl CARG1, 8(RA) ++ | addw RC, TMP0, RC ++ | decode_BC4b RD ++ | addw RD, TMP3, RD ++ | stl CARG2, 0(RA) ++ | addl PC, RD, PC ++ | stw RC, -8(RA) // Update control var. 
++ | br zero, <3 ++ break; ++ ++ case BC_ISNEXT: ++ | // RA = base*8, RD = target (points to ITERN) ++ | addl RA, BASE, RA ++ | zapi RD, 0xf0, TMP0 ++ | srli TMP0, 1, TMP0 ++ | ldl CFUNC:CARG1, -24(RA) ++ | addl TMP0, PC, TMP0 ++ | ldl CARG2, -16(RA) ++ | ldl CARG3, -8(RA) ++ | ldih TMP2, -0x2(zero) // -BCBIAS_J*4 ++ | checkfunc CFUNC:CARG1, >5 ++ | gettp CARG2, CARG2 ++ | ldi CARG2, -LJ_TTAB(CARG2) ++ | ldbu TMP1, CFUNC:CARG1->ffid ++ | ldi CARG3, -LJ_TNIL(CARG3) ++ | bis CARG2, CARG3, TMP3 ++ | ldi TMP1, -FF_next_N(TMP1) ++ | bis TMP3, TMP1, TMP3 ++ | ldih TMP1, 0x1(zero) ++ | ldi TMP1, -0x2(TMP1) //LJ_KEYINDEX ++ | bne TMP3, >5 ++ | addl TMP0, TMP2, PC ++ | slli TMP1, 16, TMP1 ++ | ldi TMP1, 0x7fff(TMP1) ++ | slli TMP1, 32, TMP1 //make TMP1=0xfffe7fff00000000 ++ | stl TMP1, -8(RA) ++ |1: ++ | ins_next ++ |5: // Despecialize bytecode if any of the checks fail. ++ | ldi TMP3, BC_JMP(zero) ++ | ldi TMP1, BC_ITERC(zero) ++ | stb TMP3, -4+OFS_OP(PC) ++ | addl TMP0, TMP2, PC ++ | stb TMP1, OFS_OP(PC) ++ | br zero, <1 ++ break; ++ ++ case BC_VARG: ++ | // RA = base*8, RB = (nresults+1)*8, RC = numparams*8 ++ | ldl TMP0, FRAME_PC(BASE) ++ | decode_RDtoRC8 RC, RD ++ | decode_RB RB, INS ++ | addl RC, BASE, RC ++ | addl RA, BASE, RA ++ | ldi RC, FRAME_VARG(RC) ++ | addl RA, RB, TMP2 ++ | ldi TMP3, -16(BASE) // TMP3 = vtop ++ | subl RC, TMP0, RC // RC = vbase ++ | // Note: RC may now be even _above_ BASE if nargs was < numparams. ++ | subl TMP3, RC, TMP1 ++ | beq RB, >5 // Copy all varargs? ++ | ldi TMP2, -16(TMP2) ++ |1: // Copy vararg slots to destination slots. ++ | ldl CARG1, 0(RC) ++ | cmpult RC, TMP3, TMP0 ++ | ldi RC, 8(RC) ++ | selne TMP0, CARG1, TISNIL, CARG1 ++ | stl CARG1, 0(RA) ++ | cmpult RA, TMP2, TMP0 ++ | ldi RA, 8(RA) ++ | bne TMP0, <1 ++ |3: ++ | ins_next ++ | ++ |5: // Copy all varargs. ++ | ldl TMP0, L->maxstack ++ | ldi MULTRES, 8(zero) // MULTRES = (0+1)*8 ++ | cmplt zero, TMP1, AT ++ | beq AT, <3 // No vararg slots? 
++ | addl RA, TMP1, TMP2 ++ | cmpult TMP0, TMP2, TMP2 ++ | ldi MULTRES, 8(TMP1) ++ | bne TMP2, >7 ++ |6: ++ | ldl CRET1, 0(RC) ++ | ldi RC, 8(RC) ++ | stl CRET1, 0(RA) ++ | cmpult RC, TMP3, TMP0 ++ | ldi RA, 8(RA) ++ | bne TMP0, <6 // More vararg slots? ++ | br zero, <3 ++ | ++ |7: // Grow stack for varargs. ++ | load_got lj_state_growstack ++ | stl RA, L->top ++ | subl RA, BASE, RA ++ | stl BASE, L->base ++ | subl RC, BASE, BASE // Need delta, because BASE may change. ++ | stl PC, SAVE_PC(sp) ++ | zapi TMP1, 0xf0, CARG2 ++ | srli CARG2, 3, CARG2 ++ | bis L, zero, CARG1 ++ | call_intern lj_state_growstack // (lua_State *L, int n) ++ | bis BASE, zero, RC ++ | ldl BASE, L->base ++ | addl RA, BASE, RA ++ | addl RC, BASE, RC ++ | ldi TMP3, -16(BASE) ++ | br zero, <6 ++ break; ++ ++ /* -- Returns ----------------------------------------------------------- */ ++ ++ case BC_RETM: ++ | // RA = results*8, RD = extra_nresults*8 ++ | addw RD, MULTRES, RD ++ | br zero, ->BC_RET_Z1 ++ break; ++ ++ case BC_RET: ++ | // RA = results*8, RD = (nresults+1)*8 ++ |->BC_RET_Z1: ++ | ldl PC, FRAME_PC(BASE) ++ | addl RA, BASE, RA ++ | bis RD, zero, MULTRES ++ |1: ++ | andi PC, FRAME_TYPE, TMP0 ++ | xori PC, FRAME_VARG, TMP1 ++ | bne TMP0, ->BC_RETV_Z ++ | ++ |->BC_RET_Z: ++ | // BASE = base, RA = resultptr, RD = (nresults+1)*8, PC = return ++ | ldw INS, -4(PC) ++ | ldi TMP2, -16(BASE) ++ | ldi RC, -8(RD) ++ | decode_RA TMP0, INS ++ | decode_RB RB, INS ++ | addl TMP2, RB, TMP3 ++ | subl TMP2, TMP0, BASE ++ | beq RC, >3 ++ |2: ++ | ldl CRET1, 0(RA) ++ | ldi RA, 8(RA) ++ | ldi RC, -8(RC) ++ | stl CRET1, 0(TMP2) ++ | ldi TMP2, 8(TMP2) ++ | bne RC, <2 ++ |3: ++ | ldi TMP3, -8(TMP3) ++ |5: ++ | cmpult TMP2, TMP3, TMP0 ++ | ldl LFUNC:TMP1, FRAME_FUNC(BASE) ++ | bne TMP0, >6 ++ | cleartp LFUNC:TMP1 ++ | ldl TMP1, LFUNC:TMP1->pc ++ | ldl KBASE, PC2PROTO(k)(TMP1) ++ | ins_next ++ | ++ |6: // Fill up results with nil. 
++ | stl TISNIL, 0(TMP2) ++ | ldi TMP2, 8(TMP2) ++ | br zero, <5 ++ | ++ |->BC_RETV_Z: // Non-standard return case. ++ | andi TMP1, FRAME_TYPEP, TMP2 ++ | bne TMP2, ->vm_return ++ | // Return from vararg function: relocate BASE down. ++ | subl BASE, TMP1, BASE ++ | ldl PC, FRAME_PC(BASE) ++ | br zero, <1 ++ break; ++ ++ case BC_RET0: case BC_RET1: ++ | // RA = results*8, RD = (nresults+1)*8 ++ | ldl PC, FRAME_PC(BASE) ++ | addl RA, BASE, RA ++ | bis RD, zero, MULTRES ++ | andi PC, FRAME_TYPE, TMP0 ++ | xori PC, FRAME_VARG, TMP1 ++ | bne TMP0, ->BC_RETV_Z ++ | ldw INS, -4(PC) ++ | ldi TMP2, -16(BASE) ++ if (op == BC_RET1) { ++ | ldl CRET1, 0(RA) ++ } ++ | decode_RB RB, INS ++ | decode_RA RA, INS ++ | subl TMP2, RA, BASE ++ if (op == BC_RET1) { ++ | stl CRET1, 0(TMP2) ++ } ++ |5: ++ | cmpult RD, RB, TMP0 ++ | ldl TMP1, FRAME_FUNC(BASE) ++ | bne TMP0, >6 ++ | cleartp LFUNC:TMP1 ++ | ldl TMP1, LFUNC:TMP1->pc ++ | ins_next1 ++ | ldl KBASE, PC2PROTO(k)(TMP1) ++ | ins_next2 ++ | ++ |6: // Fill up results with nil. ++ | ldi TMP2, 8(TMP2) ++ | ldi RD, 8(RD) ++ if (op == BC_RET1) { ++ | stl TISNIL, 0(TMP2) ++ } else { ++ | stl TISNIL, -8(TMP2) ++ } ++ | br zero, <5 ++ break; ++ ++ /* -- Loops and branches ------------------------------------------------ */ ++ ++ case BC_FORL: ++ |.if JIT ++ | hotloop ++ |.endif ++ | // Fall through. Assumes BC_IFORL follows. 
++ break; ++ ++ case BC_JFORI: ++ case BC_JFORL: ++#if !LJ_HASJIT ++ break; ++#endif ++ case BC_FORI: ++ case BC_IFORL: ++ | // RA = base*8, RD = target (after end of loop or start of loop) ++ vk = (op == BC_IFORL || op == BC_JFORL); ++ | addl RA, BASE, RA ++ | ldl CARG1, FORL_IDX*8(RA) // CARG1 = IDX ++ | ldl CARG2, FORL_STEP*8(RA) // CARG2 = STEP ++ | ldl CARG3, FORL_STOP*8(RA) // CARG3 = STOP ++ | gettp CARG4, CARG1 ++ | gettp CARG5, CARG2 ++ | gettp CRET2, CARG3 ++ if (op != BC_JFORL) { ++ | zapi RD, 0xf0, RD ++ | srli RD, 1, RD ++ | ldih TMP2, -0x2(zero) // -BCBIAS_J<<2 ++ | addl TMP2, RD, TMP2 ++ } ++ | ldi TISNUM, LJ_TISNUM(zero) ++ | cmpeq CARG4, TISNUM, AT ++ | beq AT, >3 ++ | addwi CARG1, 0, CARG4 // start ++ | addwi CARG3, 0, CARG3 // stop ++ if (!vk) { // init ++ | ldi TISNUM, LJ_TISNUM(zero) ++ | cmpeq CRET2, TISNUM, AT ++ | beq AT,->vmeta_for ++ | cmpeq CARG5, TISNUM, AT ++ | beq AT, ->vmeta_for ++ | .DEXTM TMP0, CARG2, 31, 1 // sign ++ | cmplt CARG3, CARG4, CARG2 ++ | cmplt CARG4, CARG3, TMP1 ++ | selne TMP0, TMP1, CARG2, CARG2 // CARG2=0: +,start <= stop or -,start >= stop ++ } else { ++ | addwi CARG2, 0, CARG5 // step ++ | addw CARG4, CARG5, CARG1 // start + step ++ | xor CARG1, CARG4, TMP3 // y^a ++ | xor CARG1, CARG5, TMP1 // y^b ++ | and TMP3, TMP1, TMP3 ++ | cmplt CARG1, CARG3, TMP1 // start+step < stop ? ++ | cmplt CARG3, CARG1, CARG3 // stop < start+step ? ++ | cmplt CARG5, zero, TMP0 // step < 0 ? ++ | cmplt TMP3, zero, TMP3 // ((y^a) & (y^b)) < 0: overflow. 
++ | selne TMP0, TMP1, CARG3, CARG3 ++ | bis CARG3, TMP3, CARG2 // CARG2=1: overflow; CARG2=0: continue ++ | zapi CARG1, 0xf0, CARG1 ++ | ldi TISNUM, LJ_TISNUM(zero) ++ | settp CARG1, TISNUM ++ | stl CARG1, FORL_IDX*8(RA) ++ } ++ |1: ++ if (op == BC_FORI) { ++ | seleq CARG2, zero, TMP2, TMP2 // CARG2!=0: jump out the loop; CARG2==0: next INS ++ | addl PC, TMP2, PC ++ } else if (op == BC_JFORI) { ++ | addl PC, TMP2, PC ++ | ldhu RD, -4+OFS_RD(PC) ++ } else if (op == BC_IFORL) { ++ | selne CARG2, zero, TMP2, TMP2 // CARG2!=0: next INS; CARG2==0: jump back ++ | addl PC, TMP2, PC ++ } ++ | ins_next1 ++ | stl CARG1, FORL_EXT*8(RA) ++ |2: ++ if (op == BC_JFORI) { ++ | decode_BC8b RD ++ | beq CARG2, =>BC_JLOOP // CARG2 == 0: excute the loop ++ } else if (op == BC_JFORL) { ++ | beq CARG2, =>BC_JLOOP ++ } ++ | ins_next2 ++ | //TODO FCC ++ |3: // FP loop. ++ | fldd FTMP0, FORL_IDX*8(RA) // start ++ | fldd FTMP1, FORL_STOP*8(RA) // stop ++ | ldl TMP0, FORL_STEP*8(RA) // step ++ | cmplt TMP0, zero, TMP0 // step < 0 ? ++ | ifmovd TMP0, FTMP2 ++ if (!vk) { ++ | // cmpulti CARG4, LJ_TISNUM, TMP3 // start is number ? ++ | // cmpulti CARG5, LJ_TISNUM, TMP0 // step is number ? ++ | // cmpulti CARG6, LJ_TISNUM, TMP1 // stop is number ? ++ | ldi TMP1, LJ_TISNUM(zero) ++ | cmpult CARG4, TMP1, TMP3 ++ | cmpult CARG5, TMP1, TMP0 ++ | cmpult CRET2, TMP1, TMP1 ++ | and TMP3, TMP1, TMP3 ++ | and TMP0, TMP3, TMP0 ++ | beq TMP0, ->vmeta_for // if start or step or stop isn't number ++ | fcmplt FTMP0, FTMP1, FTMP3 // start < stop ? ++ | fcmplt FTMP1, FTMP0, FTMP4 // stop < start ? ++ | fseleq FTMP2, FTMP4, FTMP3, FTMP2 //TODO CHECK ++ | fimovd FTMP2, CARG2 // CARG2=0:+,startstop ++ | br zero, <1 ++ } else { ++ | fldd FTMP3, FORL_STEP*8(RA) ++ | faddd FTMP0, FTMP3, FTMP0 // start + step ++ | fcmplt FTMP0, FTMP1, FTMP3 // start + step < stop ? 
++ | fcmplt FTMP1, FTMP0, FTMP4 ++ | fseleq FTMP2, FTMP4, FTMP3, FTMP2 ++ | fimovd FTMP2, CARG2 ++ if (op == BC_IFORL) { ++ | selne CARG2, zero, TMP2, TMP2 ++ | addl PC, TMP2, PC ++ } ++ | fstd FTMP0, FORL_IDX*8(RA) ++ | ins_next1 ++ | fstd FTMP0, FORL_EXT*8(RA) ++ | br zero, <2 ++ } ++ break; ++ ++ case BC_ITERL: ++ |.if JIT ++ | hotloop ++ |.endif ++ | // Fall through. Assumes BC_IITERL follows. ++ break; ++ ++ case BC_JITERL: ++#if !LJ_HASJIT ++ break; ++#endif ++ case BC_IITERL: ++ | // RA = base*8, RD = target ++ | addl RA, BASE, RA ++ | ldl TMP1, 0(RA) ++ | cmpeq TMP1, TISNIL, AT ++ | bne AT, >1 // Stop if iterator returned nil. ++ if (op == BC_JITERL) { ++ | stl TMP1,-8(RA) ++ | br zero, =>BC_JLOOP ++ } else { ++ | branch_RD // Otherwise save control var + branch. ++ | stl TMP1, -8(RA) ++ } ++ |1: ++ | ins_next ++ break; ++ ++ case BC_LOOP: ++ | // RA = base*8, RD = target (loop extent) ++ | // Note: RA/RD is only used by trace recorder to determine scope/extent ++ | // This opcode does NOT jump, it's only purpose is to detect a hot loop. ++ |.if JIT ++ | hotloop ++ |.endif ++ | // Fall through. Assumes BC_ILOOP follows. ++ break; ++ ++ case BC_ILOOP: ++ | // RA = base*8, RD = target (loop extent) ++ | ins_next ++ break; ++ ++ case BC_JLOOP: ++ |.if JIT ++ | // RA = base*8 (ignored), RD = traceno*8 ++ | ldl TMP0, DISPATCH_J(trace)(DISPATCH) ++ | addl TMP0, RD, TMP0 ++ | // Traces on SW64 don't store the trace number, so use 0. 
++ | stl zero, DISPATCH_GL(vmstate)(DISPATCH) ++ | ldl TRACE:TMP1, 0(TMP0) ++ | stl BASE, DISPATCH_GL(jit_base)(DISPATCH) // store Current JIT code L->base ++ | ldl TMP1, TRACE:TMP1->mcode ++ | ldi JGL, GG_DISP2G+32768(DISPATCH) ++ | stl L, DISPATCH_GL(tmpbuf.L)(DISPATCH) ++ | jmp zero, 0(TMP1) ++ |.endif ++ break; ++ ++ case BC_JMP: ++ | // RA = base*8 (only used by trace recorder), RD = target ++ | branch_RD // PC + (jump - 0x8000)<<2 ++ | ins_next ++ break; ++ ++ /* -- Function headers -------------------------------------------------- */ ++ ++ case BC_FUNCF: ++ |.if JIT ++ | hotcall ++ |.endif ++ case BC_FUNCV: /* NYI: compiled vararg functions. */ ++ | // Fall through. Assumes BC_IFUNCF/BC_IFUNCV follow. ++ break; ++ ++ case BC_JFUNCF: ++#if !LJ_HASJIT ++ break; ++#endif ++ case BC_IFUNCF: ++ | // BASE = new base, RA = BASE+framesize*8, RB = LFUNC, RC = nargs*8 ++ | ldl TMP2, L->maxstack ++ | ldbu TMP1, -4+PC2PROTO(numparams)(PC) ++ | ldl KBASE, -4+PC2PROTO(k)(PC) ++ | cmpult TMP2, RA, TMP0 ++ | s8addwi TMP1, 0, TMP1 // numparams*8 ++ | bne TMP0, ->vm_growstack_l ++ |2: ++ | cmpult NARGS8:RC, TMP1, TMP0 // Check for missing parameters. ++ | bne TMP0, >3 ++ if (op == BC_JFUNCF) { ++ | decode_RD RD, INS ++ | br zero, =>BC_JLOOP ++ } else { ++ | ins_next ++ } ++ | ++ |3: // Clear missing parameters. ++ | addl BASE, NARGS8:RC, TMP0 ++ | stl TISNIL, 0(TMP0) ++ | addwi NARGS8:RC, 8, NARGS8:RC ++ | br zero, <2 ++ break; ++ ++ case BC_JFUNCV: ++#if !LJ_HASJIT ++ break; ++#endif ++ | NYI // NYI: compiled vararg functions ++ break; /* NYI: compiled vararg functions. */ ++ ++ case BC_IFUNCV: ++ | // BASE = new base, RA = BASE+framesize*8, RB = LFUNC, RC = nargs*8 ++ | ldi TMP0, LJ_TFUNC(zero) ++ | addwi TMP0, 0, TMP0 ++ | addl BASE, RC, TMP1 ++ | ldl TMP2, L->maxstack ++ | settp LFUNC:RB, TMP0 ++ | addl RA, RC, TMP0 ++ | stl LFUNC:RB, 0(TMP1) // Store (tagged) copy of LFUNC. 
++ | ldi TMP3, 16+FRAME_VARG(RC) ++ | cmpult TMP0, TMP2, TMP0 ++ | ldl KBASE, -4+PC2PROTO(k)(PC) ++ | stl TMP3, 8(TMP1) // Store delta + FRAME_VARG. ++ | beq TMP0, ->vm_growstack_l ++ | ldbu TMP2, -4+PC2PROTO(numparams)(PC) ++ | bis BASE, zero, RA ++ | bis TMP1, zero, RC ++ | ins_next1 ++ | ldi BASE, 16(TMP1) ++ | beq TMP2, >3 ++ |1: ++ | ldl TMP0, 0(RA) ++ | cmpult RA, RC, AT // Less args than parameters? ++ | bis TMP0, zero, CARG1 ++ | selne AT, TMP0, TISNIL, TMP0 // Clear missing parameters. ++ | seleq AT, CARG1, TISNIL, CARG1 // Clear old fixarg slot (help the GC). ++ | subwi TMP2, 1, TMP2 ++ | stl TMP0, 16(TMP1) ++ | ldi TMP1, 8(TMP1) ++ | stl CARG1, 0(RA) ++ | ldi RA, 8(RA) ++ | bne TMP2, <1 ++ |3: ++ | ins_next2 ++ break; ++ ++ case BC_FUNCC: ++ case BC_FUNCCW: ++ | // BASE = new base, RA = BASE+framesize*8, RB = CFUNC, RC = nargs*8 ++ if (op == BC_FUNCC) { ++ | ldl CFUNCADDR, CFUNC:RB->f ++ } else { ++ | ldl CFUNCADDR, DISPATCH_GL(wrapf)(DISPATCH) ++ } ++ | addl RA, NARGS8:RC, TMP1 ++ | ldl TMP2, L->maxstack ++ | addl BASE, NARGS8:RC, RC ++ | stl BASE, L->base // base of currently excuting function ++ | stl RC, L->top ++ | cmpult TMP2, TMP1, AT ++ | li_vmstate C // ldi TMP0, ~LJ_VMST_C(zero) ++ if (op == BC_FUNCCW) { ++ | ldl CARG2, CFUNC:RB->f ++ } ++ | bis L, zero, CARG1 ++ | bne AT, ->vm_growstack_c // Need to grow stack. ++ | st_vmstate // .STXW TMP0, DISPATCH, DISPATCH_GL(vmstate) ++ | call r26, 0(CFUNCADDR) // (lua_State *L [, lua_CFunction f]) ++ | // Returns nresults. ++ | ldl BASE, L->base ++ | ldl TMP1, L->top ++ | stl L, DISPATCH_GL(cur_L)(DISPATCH) ++ | s8addwi CRET1, 0, RD ++ | li_vmstate INTERP ++ | ldl PC, FRAME_PC(BASE) // Fetch PC of caller. 
++ | subl TMP1, RD, RA // RA = L->top - nresults*8 ++ | st_vmstate ++ | br zero, ->vm_returnc ++ break; ++ ++ /* ---------------------------------------------------------------------- */ ++ ++ default: ++ fprintf(stderr, "Error: undefined opcode BC_%s\n", bc_names[op]); ++ exit(2); ++ break; ++ } ++} ++ ++static int build_backend(BuildCtx *ctx) ++{ ++ int op; ++ ++ dasm_growpc(Dst, BC__MAX); ++ ++ build_subroutines(ctx); ++ ++ |.code_op ++ for (op = 0; op < BC__MAX; op++) ++ build_ins(ctx, (BCOp)op, op); ++ ++ return BC__MAX; ++} ++ ++/* Emit pseudo frame-info for all assembler functions. */ ++static void emit_asm_debug(BuildCtx *ctx) ++{ ++ int fcofs = (int)((uint8_t *)ctx->glob[GLOB_vm_ffi_call] - ctx->code); ++ int i; ++ switch (ctx->mode) { ++ case BUILD_elfasm: ++ fprintf(ctx->fp, "\t.section .debug_frame,\"\",@progbits\n"); ++ fprintf(ctx->fp, ++ ".Lframe0:\n" ++ "\t.4byte .LECIE0-.LSCIE0\n" ++ ".LSCIE0:\n" ++ "\t.4byte 0xffffffff\n" ++ "\t.byte 0x1\n" ++ "\t.string \"\"\n" ++ "\t.uleb128 0x1\n" ++ "\t.sleb128 -4\n" ++ "\t.byte 31\n" ++ "\t.byte 0xc\n\t.uleb128 29\n\t.uleb128 0\n" ++ "\t.align 2\n" ++ ".LECIE0:\n\n"); ++ fprintf(ctx->fp, ++ ".LSFDE0:\n" ++ "\t.4byte .LEFDE0-.LASFDE0\n" ++ ".LASFDE0:\n" ++ "\t.4byte .Lframe0\n" ++ "\t.8byte .Lbegin\n" ++ "\t.8byte %d\n" ++ "\t.byte 0xe\n\t.uleb128 %d\n" ++ "\t.byte 0x9f\n\t.sleb128 2*5\n" ++ "\t.byte 0x9e\n\t.sleb128 2*6\n", ++ fcofs, CFRAME_SIZE); ++ for (i = 23; i >= 16; i--) ++ fprintf(ctx->fp, "\t.byte %d\n\t.uleb128 %d\n", 0x80+i, 2*(30-i)); ++ for (i = 31; i >= 24; i--) ++ fprintf(ctx->fp, "\t.byte %d\n\t.uleb128 %d\n", 0x80+32+i, 2*(46-i)); ++ fprintf(ctx->fp, ++ "\t.align 2\n" ++ ".LEFDE0:\n\n"); ++#if LJ_HASFFI ++ fprintf(ctx->fp, ++ ".LSFDE1:\n" ++ "\t.4byte .LEFDE1-.LASFDE1\n" ++ ".LASFDE1:\n" ++ "\t.4byte .Lframe0\n" ++ "\t.8byte lj_vm_ffi_call\n" ++ "\t.4byte %d\n" ++ "\t.byte 0x9f\n\t.uleb128 2*1\n" ++ "\t.byte 0x90\n\t.uleb128 2*2\n" ++ "\t.byte 0xd\n\t.uleb128 0x10\n" ++ "\t.align 2\n" ++ 
".LEFDE1:\n\n", (int)ctx->codesz - fcofs); ++#endif ++#if !LJ_NO_UNWIND ++ /* NYI */ ++#endif ++ break; ++ default: ++ break; ++ } ++} diff --git a/luajit.spec b/luajit.spec index 1bea606..7f952f3 100644 --- a/luajit.spec +++ b/luajit.spec @@ -2,7 +2,7 @@ Name: luajit Version: 2.1.0 -Release: 9 +Release: 10 Summary: Just-In-Time Compiler for Lua License: MIT URL: http://luajit.org/ @@ -17,7 +17,8 @@ Source3: apply-patches Patch0001: luajit-2.1-d06beb0-update.patch Patch0002: 0002-luajit-add-secure-compile-option-fstack.patch Patch0003: add-riscv-support.patch -ExclusiveArch: %{arm} %{ix86} x86_64 %{mips} aarch64 riscv64 loongarch64 +Patch0004: add-sw64-support.patch +ExclusiveArch: %{arm} %{ix86} x86_64 %{mips} aarch64 riscv64 loongarch64 sw_64 BuildRequires: gcc BuildRequires: make @@ -56,6 +57,11 @@ cp %{SOURCE3} . sh ./apply-patches %endif +# sw64 arch patch +%ifarch sw_64 +%patch -P0004 -p1 +%endif + sed -i -e '/install -m/s/-m/-p -m/' Makefile %build @@ -94,6 +100,10 @@ cp -a doc _tmp_html/html %{_mandir}/man1/%{name}.1* %changelog + +* Tue Mar 25 2025 swcompiler - 2.1.0-10 +- add sw64 support + * Tue Apr 23 2024 zhaoxiaolin - 2.1.0-9 - Add loongarch64 base support -- Gitee