From 34c95d8f0f694f7990cb7b7a2ab04a235111a8a1 Mon Sep 17 00:00:00 2001
From: Xiaolin Zhao <zhaoxiaolin@loongson.cn>
Date: Mon, 15 May 2023 11:28:11 +0800
Subject: [PATCH] Add loongarch64 base support

---
 apply-patches                                 |   13 +
 loongarch64.conf                              |   20 +
 ...64-Add-target-architecture-selection.patch |   83 +
 .../0002-LoongArch64-Add-DynASM-support.patch | 1454 +++++++
 ...register-assignments-for-the-interpr.patch |   94 +
 .../0004-LoongArch64-Add-stack-layout.patch   |  111 +
 ...some-general-macro-type-definitions-.patch |  246 ++
 ...gArch64-Add-pure-interpreter-backend.patch | 3578 +++++++++++++++++
 ...rch64-Add-definitions-for-target-CPU.patch |  364 ++
 ...Arch64-Add-some-constant-definitions.patch |   92 +
 ...64-Add-LoongArch-instruction-emitter.patch |  325 ++
 ...LoongArch64-Add-IR-assembler-support.patch | 2161 ++++++++++
 ...4-Add-JIT-support-in-the-interpreter.patch |  384 ++
 ...CPU-feature-detection-when-init-JIT-.patch |   28 +
 ...LoongArch-lp64-calling-conventions-a.patch |  322 ++
 ...ngArch64-Add-FFI-C-callback-handling.patch |  177 +
 ...4-Add-FFI-support-in-the-interpreter.patch |  163 +
 ...Add-DWARF-and-ELF-header-definitions.patch |  187 +
 ...64-Add-support-for-LuaJIT-VM-builder.patch |   46 +
 ...loongarch64-support-when-save-list-b.patch |   25 +
 ...-Add-LoongArch64-disassembler-module.patch |  716 ++++
 ...-LoongArch64-Add-support-in-Makefile.patch |   68 +
 luajit.spec                                   |   19 +-
 23 files changed, 10674 insertions(+), 2 deletions(-)
 create mode 100644 apply-patches
 create mode 100644 loongarch64.conf
 create mode 100644 loongarch64/0001-LoongArch64-Add-target-architecture-selection.patch
 create mode 100644 loongarch64/0002-LoongArch64-Add-DynASM-support.patch
 create mode 100644 loongarch64/0003-LoongArch64-Add-register-assignments-for-the-interpr.patch
 create mode 100644 loongarch64/0004-LoongArch64-Add-stack-layout.patch
 create mode 100644 loongarch64/0005-LoongArch64-Add-some-general-macro-type-definitions-.patch
 create mode 100644 loongarch64/0006-LoongArch64-Add-pure-interpreter-backend.patch
 create mode 100644 loongarch64/0007-LoongArch64-Add-definitions-for-target-CPU.patch
 create mode 100644 loongarch64/0008-LoongArch64-Add-some-constant-definitions.patch
 create mode 100644 loongarch64/0009-LoongArch64-Add-LoongArch-instruction-emitter.patch
 create mode 100644 loongarch64/0010-LoongArch64-Add-IR-assembler-support.patch
 create mode 100644 loongarch64/0011-LoongArch64-Add-JIT-support-in-the-interpreter.patch
 create mode 100644 loongarch64/0012-LoongArch64-Add-CPU-feature-detection-when-init-JIT-.patch
 create mode 100644 loongarch64/0013-LoongArch64-Add-LoongArch-lp64-calling-conventions-a.patch
 create mode 100644 loongarch64/0014-LoongArch64-Add-FFI-C-callback-handling.patch
 create mode 100644 loongarch64/0015-LoongArch64-Add-FFI-support-in-the-interpreter.patch
 create mode 100644 loongarch64/0016-LoongArch64-Add-DWARF-and-ELF-header-definitions.patch
 create mode 100644 loongarch64/0017-LoongArch64-Add-support-for-LuaJIT-VM-builder.patch
 create mode 100644 loongarch64/0018-LoongArch64-Add-loongarch64-support-when-save-list-b.patch
 create mode 100644 loongarch64/0019-LoongArch64-Add-LoongArch64-disassembler-module.patch
 create mode 100644 loongarch64/0020-LoongArch64-Add-support-in-Makefile.patch

diff --git a/apply-patches b/apply-patches
new file mode 100644
index 0000000..21931cc
--- /dev/null
+++ b/apply-patches
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+set -ex
+
+if [ ! -d loongarch64 ]; then
+	tar -xf loongarch64.tar.gz
+fi
+
+for p in $(cat loongarch64.conf); do
+	patch -p1 -s -i loongarch64/$p
+done
+
+rm -rf $0 loongarch64 loongarch64.tar.gz loongarch64.conf
diff --git a/loongarch64.conf b/loongarch64.conf
new file mode 100644
index 0000000..2034484
--- /dev/null
+++ b/loongarch64.conf
@@ -0,0 +1,20 @@
+0001-LoongArch64-Add-target-architecture-selection.patch
+0002-LoongArch64-Add-DynASM-support.patch
+0003-LoongArch64-Add-register-assignments-for-the-interpr.patch
+0004-LoongArch64-Add-stack-layout.patch
+0005-LoongArch64-Add-some-general-macro-type-definitions-.patch
+0006-LoongArch64-Add-pure-interpreter-backend.patch
+0007-LoongArch64-Add-definitions-for-target-CPU.patch
+0008-LoongArch64-Add-some-constant-definitions.patch
+0009-LoongArch64-Add-LoongArch-instruction-emitter.patch
+0010-LoongArch64-Add-IR-assembler-support.patch
+0011-LoongArch64-Add-JIT-support-in-the-interpreter.patch
+0012-LoongArch64-Add-CPU-feature-detection-when-init-JIT-.patch
+0013-LoongArch64-Add-LoongArch-lp64-calling-conventions-a.patch
+0014-LoongArch64-Add-FFI-C-callback-handling.patch
+0015-LoongArch64-Add-FFI-support-in-the-interpreter.patch
+0016-LoongArch64-Add-DWARF-and-ELF-header-definitions.patch
+0017-LoongArch64-Add-support-for-LuaJIT-VM-builder.patch
+0018-LoongArch64-Add-loongarch64-support-when-save-list-b.patch
+0019-LoongArch64-Add-LoongArch64-disassembler-module.patch
+0020-LoongArch64-Add-support-in-Makefile.patch
diff --git a/loongarch64/0001-LoongArch64-Add-target-architecture-selection.patch b/loongarch64/0001-LoongArch64-Add-target-architecture-selection.patch
new file mode 100644
index 0000000..cc09cf5
--- /dev/null
+++ b/loongarch64/0001-LoongArch64-Add-target-architecture-selection.patch
@@ -0,0 +1,83 @@
+From 8c89de037ec1e05cdd203eb0962d98c7d7a22d0c Mon Sep 17 00:00:00 2001
+From: Xiaolin Zhao <zhaoxiaolin@loongson.cn>
+Date: Wed, 27 Jul 2022 15:47:31 +0800
+Subject: [PATCH 01/20] LoongArch64: Add target architecture selection
+
+---
+ src/lj_arch.h | 32 ++++++++++++++++++++++++++++++++
+ 1 file changed, 32 insertions(+)
+
+diff --git a/src/lj_arch.h b/src/lj_arch.h
+index 5fb798d9..d8e78762 100644
+--- a/src/lj_arch.h
++++ b/src/lj_arch.h
+@@ -31,6 +31,8 @@
+ #define LUAJIT_ARCH_mips32	6
+ #define LUAJIT_ARCH_MIPS64	7
+ #define LUAJIT_ARCH_mips64	7
++#define LUAJIT_ARCH_LOONGARCH64	8
++#define LUAJIT_ARCH_loongarch64	8
+ 
+ /* Target OS. */
+ #define LUAJIT_OS_OTHER		0
+@@ -65,6 +67,8 @@
+ #define LUAJIT_TARGET	LUAJIT_ARCH_MIPS64
+ #elif defined(__mips__) || defined(__mips) || defined(__MIPS__) || defined(__MIPS)
+ #define LUAJIT_TARGET	LUAJIT_ARCH_MIPS32
++#elif defined(__loongarch64)
++#define LUAJIT_TARGET	LUAJIT_ARCH_LOONGARCH64
+ #else
+ #error "No support for this architecture (yet)"
+ #endif
+@@ -435,6 +439,20 @@
+ #define LJ_ARCH_VERSION		10
+ #endif
+ 
++#elif LUAJIT_TARGET == LUAJIT_ARCH_LOONGARCH64
++#define LJ_ARCH_NAME		"loongarch64"
++#define LJ_ARCH_BITS		64
++#define LJ_ARCH_ENDIAN		LUAJIT_LE
++#define LJ_TARGET_LOONGARCH64	1
++#define LJ_TARGET_GC64		1
++#define LJ_TARGET_EHRETREG	4
++#define LJ_TARGET_EHRAREG	1
++#define LJ_TARGET_JUMPRANGE	27	/* +-2^27 = +-128MB */
++#define LJ_TARGET_MASKSHIFT	1
++#define LJ_TARGET_MASKROT	1
++#define LJ_TARGET_UNIFYROT	2	/* Want only IR_BROR. */
++#define LJ_ARCH_NUMMODE		LJ_NUMMODE_DUAL
++
+ #else
+ #error "No target architecture defined"
+ #endif
+@@ -465,6 +483,16 @@
+ #error "Need at least GCC 4.8 or newer"
+ #endif
+ #endif
++#elif LJ_TARGET_LOONGARCH64
++#if __clang__
++#if ((__clang_major__ < 8) || ((__clang_major__ == 8) && __clang_minor__ < 0)) && !defined(__NX_TOOLCHAIN_MAJOR__)
++#error "Need at least Clang 8.0 or newer"
++#endif
++#else
++#if (__GNUC__ < 8) || ((__GNUC__ == 8) && __GNUC_MINOR__ < 3)
++#error "Need at least GCC 8.3 or newer"
++#endif
++#endif
+ #elif !LJ_TARGET_PS3
+ #if (__GNUC__ < 4) || ((__GNUC__ == 4) && __GNUC_MINOR__ < 3)
+ #error "Need at least GCC 4.3 or newer"
+@@ -512,6 +540,10 @@
+ /* MIPS32ON64 aka n32 ABI support might be desirable, but difficult. */
+ #error "Only n64 ABI supported for MIPS64"
+ #endif
++#elif LJ_TARGET_LOONGARCH64
++#if !(defined(_ABILP64) && _LOONGARCH_SIM == _ABILP64)
++#error "Only LOONGARCH lp64d ABI is supported"
++#endif
+ #endif
+ #endif
+ 
+-- 
+2.20.1
+
diff --git a/loongarch64/0002-LoongArch64-Add-DynASM-support.patch b/loongarch64/0002-LoongArch64-Add-DynASM-support.patch
new file mode 100644
index 0000000..21462cc
--- /dev/null
+++ b/loongarch64/0002-LoongArch64-Add-DynASM-support.patch
@@ -0,0 +1,1454 @@
+From 3c34c69cd7b9ea332cc4da9162854e2f3f352d18 Mon Sep 17 00:00:00 2001
+From: Xiaolin Zhao <zhaoxiaolin@loongson.cn>
+Date: Wed, 27 Jul 2022 15:48:44 +0800
+Subject: [PATCH 02/20] LoongArch64: Add DynASM support
+
+---
+ dynasm/dasm_loongarch64.h   | 448 +++++++++++++++++
+ dynasm/dasm_loongarch64.lua | 979 ++++++++++++++++++++++++++++++++++++
+ 2 files changed, 1427 insertions(+)
+ create mode 100644 dynasm/dasm_loongarch64.h
+ create mode 100644 dynasm/dasm_loongarch64.lua
+
+diff --git a/dynasm/dasm_loongarch64.h b/dynasm/dasm_loongarch64.h
+new file mode 100644
+index 00000000..7503e04e
+--- /dev/null
++++ b/dynasm/dasm_loongarch64.h
+@@ -0,0 +1,448 @@
++/*
++** DynASM LoongArch encoding engine.
++** Copyright (C) 2005-2022 Mike Pall. All rights reserved.
++** Released under the MIT license. See dynasm.lua for full copyright notice.
++*/
++
++#include <stddef.h>
++#include <stdarg.h>
++#include <string.h>
++#include <stdlib.h>
++
++#define DASM_ARCH		"loongarch64"
++
++#ifndef DASM_EXTERN
++#define DASM_EXTERN(a,b,c,d)	0
++#endif
++
++/* Action definitions. */
++enum {
++  DASM_STOP, DASM_SECTION, DASM_ESC, DASM_REL_EXT,
++  /* The following actions need a buffer position. */
++  DASM_ALIGN, DASM_REL_LG, DASM_LABEL_LG,
++  /* The following actions also have an argument. */
++  DASM_REL_PC, DASM_LABEL_PC, DASM_IMM, DASM_IMM2,
++  DASM__MAX
++};
++
++/* Maximum number of section buffer positions for a single dasm_put() call. */
++#define DASM_MAXSECPOS		25
++
++/* DynASM encoder status codes. Action list offset or number are or'ed in. */
++#define DASM_S_OK		0x00000000
++#define DASM_S_NOMEM		0x01000000
++#define DASM_S_PHASE		0x02000000
++#define DASM_S_MATCH_SEC	0x03000000
++#define DASM_S_RANGE_I		0x11000000
++#define DASM_S_RANGE_SEC	0x12000000
++#define DASM_S_RANGE_LG		0x13000000
++#define DASM_S_RANGE_PC		0x14000000
++#define DASM_S_RANGE_REL	0x15000000
++#define DASM_S_UNDEF_LG		0x21000000
++#define DASM_S_UNDEF_PC		0x22000000
++
++/* Macros to convert positions (8 bit section + 24 bit index). */
++#define DASM_POS2IDX(pos)	((pos)&0x00ffffff)
++#define DASM_POS2BIAS(pos)	((pos)&0xff000000)
++#define DASM_SEC2POS(sec)	((sec)<<24)
++#define DASM_POS2SEC(pos)	((pos)>>24)
++#define DASM_POS2PTR(D, pos)	(D->sections[DASM_POS2SEC(pos)].rbuf + (pos))
++
++/* Action list type. */
++typedef const unsigned int *dasm_ActList;
++
++/* Per-section structure. */
++typedef struct dasm_Section {
++  int *rbuf;		/* Biased buffer pointer (negative section bias). */
++  int *buf;		/* True buffer pointer. */
++  size_t bsize;		/* Buffer size in bytes. */
++  int pos;		/* Biased buffer position. */
++  int epos;		/* End of biased buffer position - max single put. */
++  int ofs;		/* Byte offset into section. */
++} dasm_Section;
++
++/* Core structure holding the DynASM encoding state. */
++struct dasm_State {
++  size_t psize;			/* Allocated size of this structure. */
++  dasm_ActList actionlist;	/* Current actionlist pointer. */
++  int *lglabels;		/* Local/global chain/pos ptrs. */
++  size_t lgsize;
++  int *pclabels;		/* PC label chains/pos ptrs. */
++  size_t pcsize;
++  void **globals;		/* Array of globals (bias -10). */
++  dasm_Section *section;	/* Pointer to active section. */
++  size_t codesize;		/* Total size of all code sections. */
++  int maxsection;		/* 0 <= sectionidx < maxsection. */
++  int status;			/* Status code. */
++  dasm_Section sections[1];	/* All sections. Alloc-extended. */
++};
++
++/* The size of the core structure depends on the max. number of sections. */
++#define DASM_PSZ(ms)	(sizeof(dasm_State)+(ms-1)*sizeof(dasm_Section))
++
++
++/* Initialize DynASM state. */
++void dasm_init(Dst_DECL, int maxsection)
++{
++  dasm_State *D;
++  size_t psz = 0;
++  int i;
++  Dst_REF = NULL;
++  DASM_M_GROW(Dst, struct dasm_State, Dst_REF, psz, DASM_PSZ(maxsection));
++  D = Dst_REF;
++  D->psize = psz;
++  D->lglabels = NULL;
++  D->lgsize = 0;
++  D->pclabels = NULL;
++  D->pcsize = 0;
++  D->globals = NULL;
++  D->maxsection = maxsection;
++  for (i = 0; i < maxsection; i++) {
++    D->sections[i].buf = NULL;  /* Need this for pass3. */
++    D->sections[i].rbuf = D->sections[i].buf - DASM_SEC2POS(i);
++    D->sections[i].bsize = 0;
++    D->sections[i].epos = 0;  /* Wrong, but is recalculated after resize. */
++  }
++}
++
++/* Free DynASM state. */
++void dasm_free(Dst_DECL)
++{
++  dasm_State *D = Dst_REF;
++  int i;
++  for (i = 0; i < D->maxsection; i++)
++    if (D->sections[i].buf)
++      DASM_M_FREE(Dst, D->sections[i].buf, D->sections[i].bsize);
++  if (D->pclabels) DASM_M_FREE(Dst, D->pclabels, D->pcsize);
++  if (D->lglabels) DASM_M_FREE(Dst, D->lglabels, D->lgsize);
++  DASM_M_FREE(Dst, D, D->psize);
++}
++
++/* Setup global label array. Must be called before dasm_setup(). */
++void dasm_setupglobal(Dst_DECL, void **gl, unsigned int maxgl)
++{
++  dasm_State *D = Dst_REF;
++  D->globals = gl - 10;  /* Negative bias to compensate for locals. */
++  DASM_M_GROW(Dst, int, D->lglabels, D->lgsize, (10+maxgl)*sizeof(int));
++}
++
++/* Grow PC label array. Can be called after dasm_setup(), too. */
++void dasm_growpc(Dst_DECL, unsigned int maxpc)
++{
++  dasm_State *D = Dst_REF;
++  size_t osz = D->pcsize;
++  DASM_M_GROW(Dst, int, D->pclabels, D->pcsize, maxpc*sizeof(int));
++  memset((void *)(((unsigned char *)D->pclabels)+osz), 0, D->pcsize-osz);
++}
++
++/* Setup encoder. */
++void dasm_setup(Dst_DECL, const void *actionlist)
++{
++  dasm_State *D = Dst_REF;
++  int i;
++  D->actionlist = (dasm_ActList)actionlist;
++  D->status = DASM_S_OK;
++  D->section = &D->sections[0];
++  memset((void *)D->lglabels, 0, D->lgsize);
++  if (D->pclabels) memset((void *)D->pclabels, 0, D->pcsize);
++  for (i = 0; i < D->maxsection; i++) {
++    D->sections[i].pos = DASM_SEC2POS(i);
++    D->sections[i].ofs = 0;
++  }
++}
++
++
++#ifdef DASM_CHECKS
++#define CK(x, st) \
++  do { if (!(x)) { \
++    D->status = DASM_S_##st|(int)(p-D->actionlist-1); return; } } while (0)
++#define CKPL(kind, st) \
++  do { if ((size_t)((char *)pl-(char *)D->kind##labels) >= D->kind##size) { \
++    D->status = DASM_S_RANGE_##st|(int)(p-D->actionlist-1); return; } } while (0)
++#else
++#define CK(x, st)	((void)0)
++#define CKPL(kind, st)	((void)0)
++#endif
++
++/* Range check for DASM_IMM2 operands: n if it fits in 26 bits, else -1. */
++static int dasm_imm2(unsigned int n)
++{
++  /* A separate 21 bit test is redundant: it is implied by the 26 bit
++  ** bound and yields the same result. */
++  if ((n >> 26) != 0)
++    return -1;
++  return (int)n;
++}
++
++/* Pass 1: Store actions and args, link branches/labels, estimate offsets. */
++void dasm_put(Dst_DECL, int start, ...)
++{
++  va_list ap;
++  dasm_State *D = Dst_REF;
++  dasm_ActList p = D->actionlist + start;
++  dasm_Section *sec = D->section;
++  int pos = sec->pos, ofs = sec->ofs;
++  int *b;
++
++  if (pos >= sec->epos) {
++    DASM_M_GROW(Dst, int, sec->buf, sec->bsize,
++      sec->bsize + 2*DASM_MAXSECPOS*sizeof(int));
++    sec->rbuf = sec->buf - DASM_POS2BIAS(pos);
++    sec->epos = (int)sec->bsize/sizeof(int) - DASM_MAXSECPOS+DASM_POS2BIAS(pos);
++  }
++
++  b = sec->rbuf;
++  b[pos++] = start;
++
++  va_start(ap, start);
++  while (1) {
++    unsigned int ins = *p++;
++    unsigned int action = (ins >> 16) - 0xff00;
++    if (action >= DASM__MAX) {
++      ofs += 4;
++    } else {
++      int *pl, n = action >= DASM_REL_PC ? va_arg(ap, int) : 0;
++      switch (action) {
++      case DASM_STOP: goto stop;
++      case DASM_SECTION:
++	n = (ins & 255); CK(n < D->maxsection, RANGE_SEC);
++	D->section = &D->sections[n]; goto stop;
++      case DASM_ESC: p++; ofs += 4; break;
++      case DASM_REL_EXT: break;
++      case DASM_ALIGN: ofs += (ins & 255); b[pos++] = ofs; break;
++      case DASM_REL_LG:
++	n = (ins & 2047) - 10; pl = D->lglabels + n;
++	/* Bkwd rel or global. */
++	if (n >= 0) { CK(n>=10||*pl<0, RANGE_LG); CKPL(lg, LG); goto putrel; }
++	pl += 10; n = *pl;
++	if (n < 0) n = 0;  /* Start new chain for fwd rel if label exists. */
++	goto linkrel;
++      case DASM_REL_PC:
++	pl = D->pclabels + n; CKPL(pc, PC);
++      putrel:
++	n = *pl;
++	if (n < 0) {  /* Label exists. Get label pos and store it. */
++	  b[pos] = -n;
++	} else {
++      linkrel:
++	  b[pos] = n;  /* Else link to rel chain, anchored at label. */
++	  *pl = pos;
++	}
++	pos++;
++	break;
++      case DASM_LABEL_LG:
++	pl = D->lglabels + (ins & 2047) - 10; CKPL(lg, LG); goto putlabel;
++      case DASM_LABEL_PC:
++	pl = D->pclabels + n; CKPL(pc, PC);
++      putlabel:
++	n = *pl;  /* n > 0: Collapse rel chain and replace with label pos. */
++	while (n > 0) { int *pb = DASM_POS2PTR(D, n); n = *pb; *pb = pos;
++	}
++	*pl = -pos;  /* Label exists now. */
++	b[pos++] = ofs;  /* Store pass1 offset estimate. */
++	break;
++      case DASM_IMM:
++#ifdef DASM_CHECKS
++	CK((n & ((1<<((ins>>10)&31))-1)) == 0, RANGE_I);
++#endif
++	n >>= ((ins>>10)&31);
++#ifdef DASM_CHECKS
++	if (ins & 0x8000)
++	  CK(((n + (1<<(((ins>>5)&31)-1)))>>((ins>>5)&31)) == 0, RANGE_I);
++	else
++	  CK((n>>((ins>>5)&31)) == 0, RANGE_I);
++#endif
++	b[pos++] = n;
++	break;
++      case DASM_IMM2:
++        CK(dasm_imm2((unsigned int)n) != -1, RANGE_I);
++        b[pos++] = n;
++        break;
++      }
++    }
++  }
++stop:
++  va_end(ap);
++  sec->pos = pos;
++  sec->ofs = ofs;
++}
++#undef CK
++
++/* Pass 2: Link sections, shrink aligns, fix label offsets. */
++int dasm_link(Dst_DECL, size_t *szp)
++{
++  dasm_State *D = Dst_REF;
++  int secnum;
++  int ofs = 0;
++
++#ifdef DASM_CHECKS
++  *szp = 0;
++  if (D->status != DASM_S_OK) return D->status;
++  {
++    int pc;
++    for (pc = 0; pc*sizeof(int) < D->pcsize; pc++)
++      if (D->pclabels[pc] > 0) return DASM_S_UNDEF_PC|pc;
++  }
++#endif
++
++  { /* Handle globals not defined in this translation unit. */
++    int idx;
++    for (idx = 10; idx*sizeof(int) < D->lgsize; idx++) {
++      int n = D->lglabels[idx];
++      /* Undefined label: Collapse rel chain and replace with marker (< 0). */
++      while (n > 0) { int *pb = DASM_POS2PTR(D, n); n = *pb; *pb = -idx; }
++    }
++  }
++
++  /* Combine all code sections. No support for data sections (yet). */
++  for (secnum = 0; secnum < D->maxsection; secnum++) {
++    dasm_Section *sec = D->sections + secnum;
++    int *b = sec->rbuf;
++    int pos = DASM_SEC2POS(secnum);
++    int lastpos = sec->pos;
++
++    while (pos != lastpos) {
++      dasm_ActList p = D->actionlist + b[pos++];
++      while (1) {
++	unsigned int ins = *p++;
++	unsigned int action = (ins >> 16) - 0xff00;
++	switch (action) {
++	case DASM_STOP: case DASM_SECTION: goto stop;
++	case DASM_ESC: p++; break;
++	case DASM_REL_EXT: break;
++	case DASM_ALIGN: ofs -= (b[pos++] + ofs) & (ins & 255); break;
++	case DASM_REL_LG: case DASM_REL_PC: pos++; break;
++	case DASM_LABEL_LG: case DASM_LABEL_PC: b[pos++] += ofs; break;
++	case DASM_IMM: case DASM_IMM2: pos++; break;
++	}
++      }
++      stop: (void)0;
++    }
++    ofs += sec->ofs;  /* Next section starts right after current section. */
++  }
++
++  D->codesize = ofs;  /* Total size of all code sections */
++  *szp = ofs;
++  return DASM_S_OK;
++}
++
++#ifdef DASM_CHECKS
++#define CK(x, st) \
++  do { if (!(x)) return DASM_S_##st|(int)(p-D->actionlist-1); } while (0)
++#else
++#define CK(x, st)	((void)0)
++#endif
++
++/* Pass 3: Encode sections. */
++int dasm_encode(Dst_DECL, void *buffer)
++{
++  dasm_State *D = Dst_REF;
++  char *base = (char *)buffer;
++  unsigned int *cp = (unsigned int *)buffer;
++  int secnum;
++
++  /* Encode all code sections. No support for data sections (yet). */
++  for (secnum = 0; secnum < D->maxsection; secnum++) {
++    dasm_Section *sec = D->sections + secnum;
++    int *b = sec->buf;
++    int *endb = sec->rbuf + sec->pos;
++
++    while (b != endb) {
++      dasm_ActList p = D->actionlist + *b++;
++      while (1) {
++	unsigned int ins = *p++;
++	unsigned int action = (ins >> 16) - 0xff00;
++	int n = (action >= DASM_ALIGN && action < DASM__MAX) ? *b++ : 0;
++	switch (action) {
++	case DASM_STOP: case DASM_SECTION: goto stop;
++	case DASM_ESC: *cp++ = *p++; break;
++	case DASM_REL_EXT:
++	  n = DASM_EXTERN(Dst, (unsigned char *)cp, (ins & 2047), 1);
++	  goto patchrel;
++	case DASM_ALIGN:
++	  ins &= 255; while ((((char *)cp - base) & ins)) *cp++ = 0x60000000;
++	  break;
++	case DASM_REL_LG:
++	  if (n < 0) {
++	    n = (int)((ptrdiff_t)D->globals[-n] - (ptrdiff_t)cp + 4);
++	    goto patchrel;
++	  }
++	  /* fallthrough */
++	case DASM_REL_PC:
++	  CK(n >= 0, UNDEF_PC);
++	  n = *DASM_POS2PTR(D, n);
++	  if (ins & 2048)
++	    n = (n + (int)(size_t)base) & 0x0fffffff;
++	  else
++	    n = n - (int)((char *)cp - base) + 4;
++	patchrel: {
++          unsigned int e = 16 + ((ins >> 12) & 15);
++          CK((n & 3) == 0 &&
++             ((n + ((ins & 2048) ? 0 : (1<<(e+1)))) >> (e+2)) == 0, RANGE_REL);
++          if (!(ins & 0xf800)) { /* BEQ, BNE, BLT, BGE, BLTU, BGEU */
++            cp[-1] |= (((n >> 2) & 0xffff) << 10);
++          } else if ((ins & 0x5000)) { /* BEQZ, BNEZ, BCEQZ, BCNEZ */
++            cp[-1] |= (((n >> 2) & 0xffff) << 10) | (((n >> 2) & 0x1f0000) >> 16);
++          } else if ((ins & 0xa000)) { /* B, BL */
++            cp[-1] |= (((n >> 2) & 0xffff) << 10) | (((n >> 2) & 0x3ff0000) >> 16);
++          }
++        }
++	  break;
++	case DASM_LABEL_LG:
++	  ins &= 2047; if (ins >= 20) D->globals[ins-10] = (void *)(base + n);
++	  break;
++	case DASM_LABEL_PC: break;
++	case DASM_IMM2: {
++	  /* OR the dasm_imm2() result into the most recently emitted word. */
++          unsigned int imm2n = dasm_imm2((unsigned int)n);
++          cp[-1] |= ((imm2n&0x3ff0000) | ((imm2n&0xffff))>>10);
++          }
++          break;
++	  /* Not reached: the DASM_IMM2 case above ends with break. */
++	case DASM_IMM:
++	  cp[-1] |= (n & ((1<<((ins>>5)&31))-1)) << (ins&31);
++	  break;
++	default: *cp++ = ins; break;
++	}
++      }
++      stop: (void)0;
++    }
++  }
++
++  if (base + D->codesize != (char *)cp)  /* Check for phase errors. */
++    return DASM_S_PHASE;
++  return DASM_S_OK;
++}
++#undef CK
++
++/* Get PC label offset. */
++int dasm_getpclabel(Dst_DECL, unsigned int pc)
++{
++  dasm_State *D = Dst_REF;
++  if (pc*sizeof(int) < D->pcsize) {
++    int pos = D->pclabels[pc];
++    if (pos < 0) return *DASM_POS2PTR(D, -pos);
++    if (pos > 0) return -1;  /* Undefined. */
++  }
++  return -2;  /* Unused or out of range. */
++}
++
++#ifdef DASM_CHECKS
++/* Optional sanity checker to call between isolated encoding steps. */
++int dasm_checkstep(Dst_DECL, int secmatch)
++{
++  dasm_State *D = Dst_REF;
++  if (D->status == DASM_S_OK) {
++    int i;
++    for (i = 1; i <= 9; i++) {
++      if (D->lglabels[i] > 0) { D->status = DASM_S_UNDEF_LG|i; break; }
++      D->lglabels[i] = 0;
++    }
++  }
++  if (D->status == DASM_S_OK && secmatch >= 0 &&
++      D->section != &D->sections[secmatch])
++    D->status = DASM_S_MATCH_SEC|(int)(D->section-D->sections);
++  return D->status;
++}
++#endif
++
+diff --git a/dynasm/dasm_loongarch64.lua b/dynasm/dasm_loongarch64.lua
+new file mode 100644
+index 00000000..ba6bf67e
+--- /dev/null
++++ b/dynasm/dasm_loongarch64.lua
+@@ -0,0 +1,979 @@
++------------------------------------------------------------------------------
++-- DynASM LoongArch module.
++--
++-- Copyright (C) 2005-2022 Mike Pall. All rights reserved.
++-- See dynasm.lua for full copyright notice.
++------------------------------------------------------------------------------
++
++-- Module information:
++local _info = {
++  arch =	"loongarch64",
++  description =	"DynASM LoongArch64 module",
++  version =	"1.5.0",
++  vernum =	 10500,
++  release =	"2021-05-02",
++  author =	"Mike Pall",
++  license =	"MIT",
++}
++
++-- Exported glue functions for the arch-specific module.
++local _M = { _info = _info }
++
++-- Cache library functions.
++local type, tonumber, pairs, ipairs = type, tonumber, pairs, ipairs
++local assert, setmetatable = assert, setmetatable
++local _s = string
++local sub, format, byte, char = _s.sub, _s.format, _s.byte, _s.char
++local match, gmatch = _s.match, _s.gmatch
++local concat, sort = table.concat, table.sort
++local bit = bit or require("bit")
++local band, shl, shr, sar = bit.band, bit.lshift, bit.rshift, bit.arshift
++local tohex = bit.tohex
++
++-- Inherited tables and callbacks.
++local g_opt, g_arch
++local wline, werror, wfatal, wwarn
++
++-- Action name list.
++-- CHECK: Keep this in sync with the C code!
++local action_names = {
++  "STOP", "SECTION", "ESC", "REL_EXT",
++  "ALIGN", "REL_LG", "LABEL_LG",
++  "REL_PC", "LABEL_PC", "IMM", "IMM2",
++}
++
++-- Maximum number of section buffer positions for dasm_put().
++-- CHECK: Keep this in sync with the C code!
++local maxsecpos = 25 -- Keep this low, to avoid excessively long C lines.
++
++-- Action name -> action number.
++local map_action = {}
++for n,name in ipairs(action_names) do
++  map_action[name] = n-1
++end
++
++-- Action list buffer.
++local actlist = {}
++
++-- Argument list for next dasm_put(). Start with offset 0 into action list.
++local actargs = { 0 }
++
++-- Current number of section buffer positions for dasm_put().
++local secpos = 1
++
++------------------------------------------------------------------------------
++
++-- Dump action names and numbers.
++local function dumpactions(out)
++  out:write("DynASM encoding engine action codes:\n")
++  for n,name in ipairs(action_names) do
++    local num = map_action[name]
++    out:write(format("  %-10s %02X  %d\n", name, num, num))
++  end
++  out:write("\n")
++end
++
++-- Write action list buffer as a huge static C array.
++local function writeactions(out, name)
++  local nn = #actlist
++  if nn == 0 then nn = 1; actlist[0] = map_action.STOP end
++  out:write("static const unsigned int ", name, "[", nn, "] = {\n")
++  for i = 1,nn-1 do
++    assert(out:write("0x", tohex(actlist[i]), ",\n"))
++  end
++  assert(out:write("0x", tohex(actlist[nn]), "\n};\n\n"))
++end
++
++------------------------------------------------------------------------------
++
++-- Add word to action list.
++local function wputxw(n)
++  assert(n >= 0 and n <= 0xffffffff and n % 1 == 0, "word out of range")
++  actlist[#actlist+1] = n
++end
++
++-- Add action to list with optional arg. Advance buffer pos, too.
++local function waction(action, val, a, num)
++  local w = assert(map_action[action], "bad action name `"..action.."'")
++  wputxw(0xff000000 + w * 0x10000 + (val or 0))
++  if a then actargs[#actargs+1] = a end
++  if a or num then secpos = secpos + (num or 1) end
++end
++
++-- Flush action list (intervening C code or buffer pos overflow).
++local function wflush(term)
++  if #actlist == actargs[1] then return end -- Nothing to flush.
++  if not term then waction("STOP") end -- Terminate action list.
++  wline(format("dasm_put(Dst, %s);", concat(actargs, ", ")), true)
++  actargs = { #actlist } -- Actionlist offset is 1st arg to next dasm_put().
++  secpos = 1 -- The actionlist offset occupies a buffer position, too.
++end
++
++-- Put escaped word.
++local function wputw(n)
++  if n >= 0xff000000 then waction("ESC") end
++  wputxw(n)
++end
++
++-- Reserve position for word.
++local function wpos()
++  local pos = #actlist+1
++  actlist[pos] = ""
++  return pos
++end
++
++-- Store word to reserved position.
++local function wputpos(pos, n)
++  assert(n >= 0 and n <= 0xffffffff and n % 1 == 0, "word out of range")
++  actlist[pos] = n
++end
++
++------------------------------------------------------------------------------
++
++-- Global label name -> global label number. With auto assignment on 1st use.
++local next_global = 20
++local map_global = setmetatable({}, { __index = function(t, name)
++  if not match(name, "^[%a_][%w_]*$") then werror("bad global label") end
++  local n = next_global
++  if n > 2047 then werror("too many global labels") end
++  next_global = n + 1
++  t[name] = n
++  return n
++end})
++
++-- Dump global labels.
++local function dumpglobals(out, lvl)
++  local t = {}
++  for name, n in pairs(map_global) do t[n] = name end
++  out:write("Global labels:\n")
++  for i=20,next_global-1 do
++    out:write(format("  %s\n", t[i]))
++  end
++  out:write("\n")
++end
++
++-- Write global label enum.
++local function writeglobals(out, prefix)
++  local t = {}
++  for name, n in pairs(map_global) do t[n] = name end
++  out:write("enum {\n")
++  for i=20,next_global-1 do
++    out:write("  ", prefix, t[i], ",\n")
++  end
++  out:write("  ", prefix, "_MAX\n};\n")
++end
++
++-- Write global label names.
++local function writeglobalnames(out, name)
++  local t = {}
++  for name, n in pairs(map_global) do t[n] = name end
++  out:write("static const char *const ", name, "[] = {\n")
++  for i=20,next_global-1 do
++    out:write("  \"", t[i], "\",\n")
++  end
++  out:write("  (const char *)0\n};\n")
++end
++
++------------------------------------------------------------------------------
++
++-- Extern label name -> extern label number. With auto assignment on 1st use.
++local next_extern = 0
++local map_extern_ = {}
++local map_extern = setmetatable({}, { __index = function(t, name)
++  -- No restrictions on the name for now.
++  local n = next_extern
++  if n > 2047 then werror("too many extern labels") end
++  next_extern = n + 1
++  t[name] = n
++  map_extern_[n] = name
++  return n
++end})
++
++-- Dump extern labels.
++local function dumpexterns(out, lvl)
++  out:write("Extern labels:\n")
++  for i=0,next_extern-1 do
++    out:write(format("  %s\n", map_extern_[i]))
++  end
++  out:write("\n")
++end
++
++-- Write extern label names.
++local function writeexternnames(out, name)
++  out:write("static const char *const ", name, "[] = {\n")
++  for i=0,next_extern-1 do
++    out:write("  \"", map_extern_[i], "\",\n")
++  end
++  out:write("  (const char *)0\n};\n")
++end
++
++------------------------------------------------------------------------------
++
++-- Arch-specific maps.
++local map_archdef = { sp="r3", ra="r1" } -- Ext. register name -> int. name.
++
++local map_type = {}		-- Type name -> { ctype, reg }
++local ctypenum = 0		-- Type number (for Dt... macros).
++
++-- Map an internal register name back to its alias (r3 -> sp, r1 -> ra).
++function _M.revdef(s)
++  if s == "r1" then return "ra" end
++  if s == "r3" then return "sp" end
++  return s
++end
++
++------------------------------------------------------------------------------
++
++-- Template strings for LoongArch instructions: 8 hex opcode digits, then one operand letter per parameter (decoded by .template__).
++local map_op = {
++  ["clo.w_2"] =		"00001000DJ",
++  ["clz.w_2"] =		"00001400DJ",
++  ["cto.w_2"] =		"00001800DJ",
++  ["ctz.w_2"] =		"00001c00DJ",
++  ["clo.d_2"] =		"00002000DJ",
++  ["clz.d_2"] =		"00002400DJ",
++  ["cto.d_2"] =		"00002800DJ",
++  ["ctz.d_2"] =		"00002c00DJ",
++  ["revb.2h_2"] =	"00003000DJ",
++  ["revb.4h_2"] =	"00003400DJ",
++  ["revb.2w_2"] =	"00003800DJ",
++  ["revb.d_2"] = 	"00003c00DJ",
++  ["revh.2w_2"] =	"00004000DJ",
++  ["revh.d_2"] =	"00004400DJ",
++  ["bitrev.4b_2"] =	"00004800DJ",
++  ["bitrev.8b_2"] =	"00004c00DJ",
++  ["bitrev.w_2"] =	"00005000DJ",
++  ["bitrev.d_2"] =	"00005400DJ",
++  ["ext.w.h_2"] =	"00005800DJ",
++  ["ext.w.b_2"] =	"00005c00DJ",
++
++  ["add.w_3"] =		"00100000DJK",
++  ["add.d_3"] =		"00108000DJK",
++  ["sub.w_3"] =		"00110000DJK",
++  ["sub.d_3"] =		"00118000DJK",
++  slt_3 = 		"00120000DJK",
++  sltu_3 =		"00128000DJK",
++  maskeqz_3 = 		"00130000DJK",
++  masknez_3 =		"00138000DJK",
++
++  nor_3 =		"00140000DJK",
++  and_3 = 		"00148000DJK",
++  or_3 = 		"00150000DJK",
++  xor_3 = 		"00158000DJK",
++  orn_3 =		"00160000DJK",
++  andn_3 = 		"00168000DJK",
++  ["sll.w_3"] =		"00170000DJK",
++  ["srl.w_3"] =		"00178000DJK",
++  ["sra.w_3"] = 	"00180000DJK",
++  ["sll.d_3"] =		"00188000DJK",
++  ["srl.d_3"] =		"00190000DJK",
++  ["sra.d_3"] =		"00198000DJK",
++  ["rotr.w_3"] =	"001b0000DJK",
++  ["rotr.d_3"] =	"001b8000DJK",
++  ["mul.w_3"] =		"001c0000DJK",
++  ["mulh.w_3"] = 	"001c8000DJK",
++  ["mulh.wu_3"] =	"001d0000DJK",
++  ["mul.d_3"] =		"001d8000DJK",
++  ["mulh.d_3"] =	"001e0000DJK",
++  ["mulh.du_3"] =	"001e8000DJK",
++  ["mulw.d.w_3"] =	"001f0000DJK",
++  ["mulw.d.wu_3"] =	"001f8000DJK",
++
++  ["fabs.h_2"] =	"01140000FG",
++  ["fabs.s_2"] = 	"01140400FG",
++  ["fabs.d_2"] =	"01140800FG",
++  ["fneg.h_2"] =	"01141000FG",
++  ["fneg.s_2"] =	"01141400FG",
++  ["fneg.d_2"] =	"01141800FG",
++  ["flogb.h_2"] =	"01142000FG",
++  ["flogb.s_2"] =	"01142400FG",
++  ["flogb.d_2"] =	"01142800FG",
++  ["fclass.h_2"] =	"01143000FG",
++  ["fclass.s_2"] =	"01143400FG",
++  ["fclass.d_2"] =	"01143800FG",
++  ["fsqrt.h_2"] =	"01144000FG",
++  ["fsqrt.s_2"] =	"01144400FG",
++  ["fsqrt.d_2"] =	"01144800FG",
++  ["frecip.h_2"] = 	"01145000FG",
++  ["frecip.s_2"] =	"01145400FG",
++  ["frecip.d_2"] =	"01145800FG",
++  ["frsqrt.h_2"] =	"01146000FG",
++  ["frsqrt.s_2"] =	"01146400FG",
++  ["frsqrt.d_2"] =	"01146800FG",
++  ["frecipe.h_2"] =	"01147000FG",
++  ["frecipe.s_2"] =	"01147400FG",
++  ["frecipe.d_2"] =	"01147800FG",
++  ["frsqrte.h_2"] =	"01148000FG",
++  ["frsqrte.s_2"] =	"01148400FG",
++  ["frsqrte.d_2"] =	"01148800FG",
++
++  ["fmov.h_2"] =	"01149000FG",
++  ["fmov.s_2"] =	"01149400FG",
++  ["fmov.d_2"] =	"01149800FG",
++  ["movgr2fr.h_2"] =	"0114a000FJ",
++  ["movgr2fr.w_2"] =	"0114a400FJ",
++  ["movgr2fr.d_2"] =	"0114a800FJ",
++  ["movgr2frh.w_2"] =	"0114ac00FJ",
++  ["movfr2gr.h_2"] =	"0114b000DG",
++  ["movfr2gr.s_2"] =	"0114b400DG",
++  ["movfr2gr.d_2"] =	"0114b800DG",
++  ["movfrh2gr.s_2"] =	"0114bc00DG",
++  movgr2fcsr_2 =	"0114c000SJ",	-- fcsr, rj: the source is a GPR, not an FPR.
++  movfcsr2gr_2 =	"0114c800DR",	-- rd, fcsr: the destination is a GPR, not an FPR.
++  movfr2cf_2 =		"0114d000EG",
++  movcf2fr_2 =		"0114d400FA",
++  movgr2cf_2 =		"0114d800EJ",	-- cd, rj: the source is a GPR, not an FPR.
++  movcf2gr_2 =		"0114dc00DA",
++  ["fcvt.ld.d_2"] =	"0114e000FG",
++  ["fcvt.ud.d_2"] =	"0114e400FG",
++  ["fcvt.s.d_2"] = 	"01191800FG",
++  ["fcvt.d.s_2"] =	"01192400FG",
++  ["ftintrm.w.s_2"] =	"011a0400FG",
++  ["ftintrm.w.d_2"] =	"011a0800FG",
++  ["ftintrm.l.s_2"] =	"011a2400FG",
++  ["ftintrm.l.d_2"] =	"011a2800FG",
++  ["ftintrp.w.s_2"] =	"011a4400FG",
++  ["ftintrp.w.d_2"] =	"011a4800FG",
++  ["ftintrp.l.s_2"] =	"011a6400FG",
++  ["ftintrp.l.d_2"] =	"011a6800FG",
++  ["ftintrz.w.s_2"] =	"011a8400FG",
++  ["ftintrz.w.d_2"] =	"011a8800FG",
++  ["ftintrz.l.s_2"] =	"011aa400FG",
++  ["ftintrz.l.d_2"] =	"011aa800FG",
++  ["ftintrne.w.s_2"] =	"011ac400FG",
++  ["ftintrne.w.d_2"] =	"011ac800FG",
++  ["ftintrne.l.s_2"] =	"011ae400FG",
++  ["ftintrne.l.d_2"] =	"011ae800FG",
++  ["ftint.w.s_2"] =	"011b0400FG",
++  ["ftint.w.d_2"] =	"011b0800FG",
++  ["ftint.l.s_2"] =	"011b2400FG",
++  ["ftint.l.d_2"] =	"011b2800FG",
++  ["ffint.s.w_2"] =	"011d1000FG",
++  ["ffint.s.l_2"] =	"011d1800FG",
++  ["ffint.d.w_2"] =	"011d2000FG",
++  ["ffint.d.l_2"] =	"011d2800FG",
++  ["frint.s_2"] =	"011e4400FG",
++  ["frint.d_2"] =	"011e4800FG",
++
++  ["fadd.h_3"] =	"01000000FGH",
++  ["fadd.s_3"] =	"01008000FGH",
++  ["fadd.d_3"] =	"01010000FGH",
++  ["fsub.h_3"] =	"01020000FGH",
++  ["fsub.s_3"] =	"01028000FGH",
++  ["fsub.d_3"] =	"01030000FGH",
++  ["fmul.h_3"] =	"01040000FGH",
++  ["fmul.s_3"] =	"01048000FGH",
++  ["fmul.d_3"] =	"01050000FGH",
++  ["fdiv.h_3"] =	"01060000FGH",
++  ["fdiv.s_3"] =	"01068000FGH",
++  ["fdiv.d_3"] =	"01070000FGH",
++  ["fmax.h_3"] =	"01080000FGH",
++  ["fmax.s_3"] =	"01088000FGH",
++  ["fmax.d_3"] =	"01090000FGH",
++  ["fmin.h_3"] = 	"010a0000FGH",
++  ["fmin.s_3"] =	"010a8000FGH",
++  ["fmin.d_3"] =	"010b0000FGH",
++  ["fmaxa.h_3"] =	"010c0000FGH",
++  ["fmaxa.s_3"] =	"010c8000FGH",
++  ["fmaxa.d_3"] =	"010d0000FGH",
++  ["fmina.h_3"] =	"010e0000FGH",
++  ["fmina.s_3"] =	"010e8000FGH",
++  ["fmina.d_3"] =	"010f0000FGH",
++  ["fscaleb.h_3"] =	"01100000FGH",
++  ["fscaleb.s_3"] =	"01108000FGH",
++  ["fscaleb.d_3"] =	"01110000FGH",
++  ["fcopysign.h_3"] =	"01120000FGH",
++  ["fcopysign.s_3"] =	"01128000FGH",
++  ["fcopysign.d_3"] =	"01130000FGH",
++
++  ["fmadd.s_4"] =	"08100000FGHi",
++  ["fmadd.d_4"] =	"08200000FGHi",
++  ["fnmadd.d_4"] =	"08a00000FGHi",
++  ["fmsub.s_4"] =	"08500000FGHi",
++  ["fmsub.d_4"] =	"08600000FGHi",
++  ["fnmsub.d_4"] =	"08e00000FGHi",
++
++  ["alsl.w_4"] =	"00040000DJKQ",
++  ["alsl.wu_4"] =	"00060000DJKQ",
++  ["alsl.d_4"] =	"002c0000DJKQ",
++  ["bytepick.w_4"] =	"00080000DJKQ",
++  ["bytepick.d_4"] =	"000c0000DJKB",
++
++  ["div.w_3"] = 	"00200000DJK",
++  ["mod.w_3"] =		"00208000DJK",
++  ["div.wu_3"] =	"00210000DJK",
++  ["mod.wu_3"] =	"00218000DJK",
++  ["div.d_3"] =		"00220000DJK",
++  ["mod.d_3"] =		"00228000DJK",
++  ["div.du_3"] =	"00230000DJK",
++  ["mod.du_3"] =	"00238000DJK",
++  ["crc.w.b.w_3"] =	"00240000DJK",
++  ["crc.w.h.w_3"] =	"00248000DJK",
++  ["crc.w.w.w_3"] =	"00250000DJK",
++  ["crc.w.d.w_3"] =	"00258000DJK",
++  ["crcc.w.b.w_3"] =	"00260000DJK",
++  ["crcc.w.h.w_3"] =	"00268000DJK",
++  ["crcc.w.w.w_3"] =	"00270000DJK",
++  ["crcc.w.d.w_3"] =	"00278000DJK",
++
++  break_1 =		"002a0000C",
++  syscall_1 =		"002b0000C",
++
++  ["slli.w_3"] =	"00408000DJU",
++  ["slli.d_3"] =	"00410000DJV",
++  ["srli.w_3"] =	"00448000DJU",
++  ["srli.d_3"] =	"00450000DJV",
++  ["srai.w_3"] =	"00488000DJU",
++  ["srai.d_3"] =	"00490000DJV",
++  ["rotri.w_3"] =	"004c8000DJU",
++  ["rotri.d_3"] =	"004d0000DJV",
++
++  ["bstrins.w_4"] =	"00600000DJMU",
++  ["bstrpick.w_4"] =	"00608000DJMU",
++  ["bstrins.d_4"] = 	"00800000DJNV",
++  ["bstrpick.d_4"] =	"00c00000DJNV",
++  slti_3 =		"02000000DJX",
++  sltui_3 =		"02400000DJX",
++  ["addi.w_3"] =	"02800000DJX",
++  ["addi.d_3"] =	"02c00000DJX",
++  ["lu52i.d_3"] =	"03000000DJX",
++  andi_3 =		"03400000DJT",
++  ori_3 =		"03800000DJT",
++  xori_3 = 		"03c00000DJT",
++  ["lu12i.w_2"] =	"14000000DZ",
++  ["lu32i.d_2"] =	"16000000DZ",
++  pcaddi_2 =		"18000000DZ",
++  pcalau12i_2 = 	"1a000000DZ",
++  pcaddu12i_2 =		"1c000000DZ",
++  pcaddu18i_2 = 	"1e000000DZ",
++
++  ["ldx.b_3"] =		"38000000DJK",
++  ["ldx.h_3"] =		"38040000DJK",
++  ["ldx.w_3"] =		"38080000DJK",
++  ["ldx.d_3"] =		"380c0000DJK",
++  ["stx.b_3"] =		"38100000DJK",
++  ["stx.h_3"] =		"38140000DJK",
++  ["stx.w_3"] =		"38180000DJK",
++  ["stx.d_3"] =		"381c0000DJK",
++  ["ldx.bu_3"] =	"38200000DJK",
++  ["ldx.hu_3"] =	"38240000DJK",
++  ["ldx.wu_3"] =	"38280000DJK",
++  ["fldx.s_3"] =	"38300000FJK",
++  ["fldx.d_3"] =	"38340000FJK",
++  ["fstx.s_3"] =	"38380000FJK",
++  ["fstx.d_3"] =	"383c0000FJK",
++  ["fldgt.s_3"] =	"38740000FJK",
++  ["fldgt.d_3"] =	"38748000FJK",
++  ["fldle.s_3"] =	"38750000FJK",
++  ["fldle.d_3"] =	"38758000FJK",
++  ["fstgt.s_3"] =	"38760000FJK",
++  ["fstgt.d_3"] =	"38768000FJK",
++  ["fstle.s_3"] =	"38770000FJK",
++  ["fstle.d_3"] =	"38778000FJK",
++  ["ldgt.b_3"] =	"38780000DJK",
++  ["ldgt.h_3"] =	"38788000DJK",
++  ["ldgt.w_3"] =	"38790000DJK",
++  ["ldgt.d_3"] =	"38798000DJK",
++  ["ldle.b_3"] =	"387a0000DJK",
++  ["ldle.h_3"] =	"387a8000DJK",
++  ["ldle.w_3"] =	"387b0000DJK",
++  ["ldle.d_3"] =	"387b8000DJK",
++  ["stgt.b_3"] =	"387c0000DJK",
++  ["stgt.h_3"] =	"387c8000DJK",
++  ["stgt.w_3"] =	"387d0000DJK",
++  ["stgt.d_3"] =	"387d8000DJK",
++  ["stle.b_3"] =	"387e0000DJK",
++  ["stle.h_3"] =	"387e8000DJK",
++  ["stle.w_3"] =	"387f0000DJK",
++  ["stle.d_3"] =	"387f8000DJK",
++
++  ["ll.w_3"] =		"20000000DJW",
++  ["sc.w_3"] =		"21000000DJW",
++  ["ll.d_3"] =		"22000000DJW",
++  ["sc.d_3"] =		"23000000DJW",
++  ["ldptr.w_3"] =	"24000000DJW",
++  ["stptr.w_3"] =	"25000000DJW",
++  ["ldptr.d_3"] =	"26000000DJW",
++  ["stptr.d_3"] =	"27000000DJW",
++
++  ["ld.b_2"] =		"28000000Do",
++  ["ld.h_2"] =		"28400000Do",
++  ["ld.w_2"] =		"28800000Do",
++  ["ld.d_2"] =		"28c00000Do",
++  ["st.b_2"] =		"29000000Do",
++  ["st.h_2"] =		"29400000Do",
++  ["st.w_2"] =		"29800000Do",
++  ["st.d_2"] =		"29c00000Do",
++  ["ld.bu_2"] =		"2a000000Do",
++  ["ld.hu_2"] =		"2a400000Do",
++  ["ld.wu_2"] =		"2a800000Do",
++  -- ldx.d_3 / stx.d_3 are already defined with the indexed loads/stores
++  -- above; the identical duplicate entries that used to sit here were dropped.
++  ["fld.s_2"] =		"2b000000Fo",
++  ["fst.s_2"] =		"2b400000Fo",
++  ["fld.d_2"] =		"2b800000Fo",
++  ["fst.d_2"] =		"2bc00000Fo",
++
++  ["fcmp.caf.s_3"] =	"0c100000EGH",
++  ["fcmp.saf.s_3"] =	"0c108000EGH",
++  ["fcmp.clt.s_3"] =	"0c110000EGH",
++  ["fcmp.slt.s_3"] =	"0c118000EGH",
++  ["fcmp.ceq.s_3"] =	"0c120000EGH",
++  ["fcmp.seq.s_3"] =	"0c128000EGH",
++  ["fcmp.cle.s_3"] =	"0c130000EGH",
++  ["fcmp.sle.s_3"] =	"0c138000EGH",
++  ["fcmp.cun.s_3"] =	"0c140000EGH",
++  ["fcmp.sun.s_3"] =	"0c148000EGH",
++  ["fcmp.cult.s_3"] =	"0c150000EGH",
++  ["fcmp.sult.s_3"] =	"0c158000EGH",
++  ["fcmp.cueq.s_3"] =	"0c160000EGH",
++  ["fcmp.sueq.s_3"] =	"0c168000EGH",
++  ["fcmp.cule.s_3"] =	"0c170000EGH",
++  ["fcmp.sule.s_3"] =	"0c178000EGH",
++  ["fcmp.cne.s_3"] =	"0c180000EGH",
++  ["fcmp.sne.s_3"] =	"0c188000EGH",
++  ["fcmp.cor.s_3"] =	"0c1a0000EGH",
++  ["fcmp.sor.s_3"] =	"0c1a8000EGH",
++  ["fcmp.cune.s_3"] =	"0c1c0000EGH",
++  ["fcmp.sune.s_3"] =	"0c1c8000EGH",
++  ["fcmp.caf.d_3"] =	"0c200000EGH",
++  ["fcmp.saf.d_3"] =	"0c208000EGH",
++  ["fcmp.clt.d_3"] =	"0c210000EGH",
++  ["fcmp.slt.d_3"] =	"0c218000EGH",
++  ["fcmp.ceq.d_3"] =	"0c220000EGH",
++  ["fcmp.seq.d_3"] =	"0c228000EGH",
++  ["fcmp.cle.d_3"] =	"0c230000EGH",
++  ["fcmp.sle.d_3"] =	"0c238000EGH",
++  ["fcmp.cun.d_3"] =	"0c240000EGH",
++  ["fcmp.sun.d_3"] =	"0c248000EGH",
++  ["fcmp.cult.d_3"] =	"0c250000EGH",
++  ["fcmp.sult.d_3"] =	"0c258000EGH",
++  ["fcmp.cueq.d_3"] =	"0c260000EGH",
++  ["fcmp.sueq.d_3"] =	"0c268000EGH",
++  ["fcmp.cule.d_3"] =	"0c270000EGH",
++  ["fcmp.sule.d_3"] =	"0c278000EGH",
++  ["fcmp.cne.d_3"] =	"0c280000EGH",
++  ["fcmp.sne.d_3"] =	"0c288000EGH",
++  ["fcmp.cor.d_3"] =	"0c2a0000EGH",
++  ["fcmp.sor.d_3"] =	"0c2a8000EGH",
++  ["fcmp.cune.d_3"] =	"0c2c0000EGH",
++  ["fcmp.sune.d_3"] =	"0c2c8000EGH",
++
++  fsel_4 =		"0d000000FGHI",
++
++  ["addu16i.d_3"] = 	"10000000DJY",
++  beqz_2 =		"40000000JL",
++  bnez_2 = 		"44000000JL",
++  bceqz_2 = 		"48000000AL",
++  bcnez_2 = 		"48000100AL",
++  jirl_3 =		"4c000000DJa",
++  b_1 =			"50000000P",
++  bl_1 =		"54000000P",
++  beq_3 =		"58000000JDO",
++  bne_3 = 		"5c000000JDO",
++  blt_3 = 		"60000000JDO",
++  bge_3 = 		"64000000JDO",
++  bltu_3 = 		"68000000JDO",
++  bgeu_3 = 		"6c000000JDO",
++}
++
++------------------------------------------------------------------------------
++
++local function parse_gpr(expr)  -- Parse "rN", a .type name, or "type:rN" into a GPR number.
++  local tname, ovreg = match(expr, "^([%w_]+):(r[1-3]?[0-9])$")
++  local tp = map_type[tname or expr]  -- Type entry when expr names a declared type, else nil.
++  if tp then
++    local reg = ovreg or tp.reg  -- An explicit ":rN" override wins over the type's own register.
++    if not reg then
++      werror("type `"..(tname or expr).."' needs a register override")
++    end
++    expr = reg
++  end
++  local r = match(expr, "^r([1-3]?[0-9])$")
++  if r then
++    r = tonumber(r)
++    if r <= 31 then return r, tp end  -- Returns the register number and the type entry (nil for plain registers).
++  end
++  werror("bad register name `"..expr.."'")
++end
++
++local function parse_fpr(expr)
++  -- Floating-point registers are written f0..f31; returns the number.
++  local digits = match(expr, "^f([1-3]?[0-9])$")
++  local num = digits and tonumber(digits)
++  if num and num <= 31 then
++    return num
++  end
++  werror("bad register name `"..expr.."'")
++end
++
++local function parse_fcsr(expr)
++  -- Accepts fcsr0..fcsr3 and returns the register index.
++  local digit = match(expr, "^fcsr([0-3])$")
++  if digit then
++    return tonumber(digit)
++  end
++  werror("bad register name `"..expr.."'")
++end
++
++local function parse_fcc(expr)
++  -- Accepts fcc0..fcc7 and returns the condition-flag index.
++  local digit = match(expr, "^fcc([0-7])$")
++  if digit then
++    return tonumber(digit)
++  end
++  werror("bad register name `"..expr.."'")
++end
++
++local function parse_imm(imm, bits, shift, scale, signed, action)  -- Encode an immediate field `bits` wide at bit `shift`.
++  local n = tonumber(imm)
++  if n then  -- Literal constant: range-check and encode it right away.
++    local m = sar(n, scale)
++    if shl(m, scale) == n then  -- Must be an exact multiple of 2^scale.
++      if signed then
++	local s = sar(m, bits-1)  -- Everything above the field's low bits-1 bits.
++	if s == 0 or s == 1 then return shl(m, shift)  -- Fits as-is (s == 1: field given in its unsigned form).
++	elseif s == -1 then return shl(m + shl(1, bits), shift) end  -- Negative: wrap into the field as two's complement.
++      else
++	if sar(m, bits) == 0 then return shl(m, shift) end
++      end
++    end
++    werror("out of range immediate1 `"..imm.."'")
++  elseif match(imm, "^[rf]([1-3]?[0-9])$") or
++	 match(imm, "^([%w_]+):([rf][1-3]?[0-9])$") then
++    werror("expected immediate operand, got register")
++  else  -- Any other expression: emit an action (default "IMM") carrying the field description.
++    waction(action or "IMM",
++	    (signed and 32768 or 0)+shl(scale, 10)+shl(bits, 5)+shift, imm)
++    return 0  -- Nothing is added to the opcode template in this case.
++  end
++end
++
++local function parse_imm21or26(imm, i)  -- i: immediate width in bits (21 or 26, going by the name).
++  local n = tonumber(imm)
++  if n then
++    -- Signed i-bit constant split in two: the low 16 bits are placed at
++    -- instruction bit 10, the remaining i-16 high bits at bit 0.
++    local m = sar(n, 0)
++    if shl(m, 0) == n then
++      local s = sar(m, i-1)
++      if s == 0 or s == -1 then
++        -- Use bit arithmetic; sub() is string.sub and cannot slice bits.
++        return shl(band(m, 0xffff), 10) + band(sar(m, 16), shl(1, i-16)-1)
++      end
++    end
++    werror("out of range immediate2 `"..imm.."'")
++  else
++    waction("IMM2", 0, imm)
++    return 0
++  end
++end
++
++local function parse_disp(disp)  -- Parse "imm(reg)" or "type-reg followed by field"; returns the base-register and offset bits.
++  local imm, reg = match(disp, "^(.*)%(([%w_:]+)%)$")
++  if imm then  -- Form 1: displacement followed by a parenthesised base register.
++    local r = shl(parse_gpr(reg), 5)  -- Base register goes at bit 5.
++    local extname = match(imm, "^extern%s+(%S+)$")
++    if extname then
++      waction("REL_EXT", map_extern[extname], nil, 1)
++      return r
++    else
++      return r + parse_imm(imm, 12, 10, 0, true)  -- Signed 12-bit offset at bit 10.
++    end
++  end
++  local reg, tailr = match(disp, "^([%w_:]+)%s*(.*)$")
++  if reg and tailr ~= "" then  -- Form 2: typed register followed by a field expression.
++    local r, tp = parse_gpr(reg)
++    if tp then
++      waction("IMM", 32768+12*32+10, format(tp.ctypefmt, tailr))  -- Same field as above: signed, 12 bits, shift 10.
++      return shl(r, 5)
++    end
++  end
++  werror("bad displacement `"..disp.."'")
++end
++
++local function parse_label(label, def)  -- def: true when parsing a label definition, false for a reference.
++  local prefix = sub(label, 1, 2)
++  -- =>label (pc label reference)
++  if prefix == "=>" then
++    return "PC", 0, sub(label, 3)  -- Mode, number, and the pc expression text.
++  end
++  -- ->name (global label reference)
++  if prefix == "->" then
++    return "LG", map_global[sub(label, 3)]
++  end
++  if def then
++    -- [1-9] (local label definition)
++    if match(label, "^[1-9]$") then
++      return "LG", 10+tonumber(label)
++    end
++  else
++    -- [<>][1-9] (local label reference)
++    local dir, lnum = match(label, "^([<>])([1-9])$")
++    if dir then -- Fwd: 1-9, Bkwd: 11-19.
++      return "LG", lnum + (dir == ">" and 0 or 10)  -- lnum is a digit string; the addition converts it to a number.
++    end
++    -- extern label (extern label reference)
++    local extname = match(label, "^extern%s+(%S+)$")
++    if extname then
++      return "EXT", map_extern[extname]
++    end
++  end
++  werror("bad label `"..label.."'")
++end
++
++local function branch_type(op)
++  local major = shr(op, 26) -- Top six opcode bits pick the branch format.
++  if major >= 0x16 and major <= 0x1b then
++    return 0 -- BEQ, BNE, BLT, BGE, BLTU, BGEU
++  elseif major >= 0x10 and major <= 0x12 then
++    return 0x5000 -- BEQZ, BNEZ, BCEQZ, BCNEZ
++  elseif band(op, 0xf8000000) == 0x50000000 then
++    return 0xa000 --B, BL
++  end
++  assert(false, "unknown branch type")
++end
++
++------------------------------------------------------------------------------
++
++-- Handle opcodes defined with template strings: 8 hex opcode digits, then one operand letter per parameter.
++map_op[".template__"] = function(params, template, nparams)
++  if not params then return sub(template, 9) end -- Query mode: return only the operand letters.
++  local op = tonumber(sub(template, 1, 8), 16)
++  local n = 1 -- Index of the next parameter to consume.
++
++  -- Limit number of section buffer positions used by a single dasm_put().
++  -- A single opcode needs a maximum of 2 positions (ins/ext).
++  if secpos+2 > maxsecpos then wflush() end
++  local pos = wpos()
++
++  -- Process each character.
++  for p in gmatch(sub(template, 9), ".") do
++    if p == "D" then -- GPR at bit 0.
++      op = op + shl(parse_gpr(params[n]), 0); n = n + 1
++    elseif p == "J" then -- GPR at bit 5.
++      op = op + shl(parse_gpr(params[n]), 5); n = n + 1
++    elseif p == "K" then -- GPR at bit 10.
++      op = op + shl(parse_gpr(params[n]), 10); n = n + 1
++    elseif p == "F" then -- FPR at bit 0.
++      op = op + shl(parse_fpr(params[n]), 0); n = n + 1
++    elseif p == "G" then -- FPR at bit 5.
++      op = op + shl(parse_fpr(params[n]), 5); n = n + 1
++    elseif p == "H" then -- FPR at bit 10.
++      op = op + shl(parse_fpr(params[n]), 10); n = n + 1
++    elseif p == "i" then -- FPR at bit 15.
++      op = op + shl(parse_fpr(params[n]), 15); n = n + 1
++    elseif p == "I" then -- FCC at bit 15.
++      op = op + shl(parse_fcc(params[n]), 15); n = n + 1
++    elseif p == "A" then -- FCC at bit 5.
++      op = op + shl(parse_fcc(params[n]), 5); n = n + 1
++    elseif p == "E" then -- FCC at bit 0.
++      op = op + shl(parse_fcc(params[n]), 0); n = n + 1
++    elseif p == "S" then -- FCSR at bit 0. Must test p: op is the numeric opcode and never equals "S".
++      op = op + shl(parse_fcsr(params[n]), 0); n = n + 1
++    elseif p == "R" then -- FCSR at bit 5. Same fix as for "S".
++      op = op + shl(parse_fcsr(params[n]), 5); n = n + 1
++    elseif p == "U" then -- Unsigned 5-bit immediate at bit 10.
++      op = op + parse_imm(params[n], 5, 10, 0, false); n = n + 1
++    elseif p == "V" then -- Unsigned 6-bit immediate at bit 10.
++      op = op + parse_imm(params[n], 6, 10, 0, false); n = n + 1
++    elseif p == "W" then -- Signed 14-bit immediate at bit 10.
++      op = op + parse_imm(params[n], 14, 10, 0, true); n = n + 1
++    elseif p == "X" then -- Signed 12-bit immediate at bit 10.
++      op = op + parse_imm(params[n], 12, 10, 0, true); n = n + 1
++    elseif p == "o" then -- Displacement operand: base GPR plus offset.
++      op = op + parse_disp(params[n]); n = n + 1
++    elseif p == "Y" then -- Signed 16-bit immediate at bit 10.
++      op = op + parse_imm(params[n], 16, 10, 0, true); n = n + 1
++    elseif p == "Z" then -- Signed 20-bit immediate at bit 5.
++      op = op + parse_imm(params[n], 20, 5, 0, true); n = n + 1
++    elseif p == "T" then -- Unsigned 12-bit immediate at bit 10.
++      op = op + parse_imm(params[n], 12, 10, 0, false); n = n + 1
++    elseif p == "C" then -- Unsigned 15-bit code at bit 0.
++      op = op + parse_imm(params[n], 15, 0, 0, false); n = n + 1
++    elseif p == "Q" then -- Unsigned 2-bit immediate at bit 15.
++      op = op + parse_imm(params[n], 2, 15, 0, false); n = n + 1
++    elseif p == "B" then -- Unsigned 3-bit immediate at bit 15.
++      op = op + parse_imm(params[n], 3, 15, 0, false); n = n + 1
++    elseif p == "M" then -- Unsigned 5-bit immediate at bit 16.
++      op = op + parse_imm(params[n], 5, 16, 0, false); n = n + 1
++    elseif p == "N" then -- Unsigned 6-bit immediate at bit 16.
++      op = op + parse_imm(params[n], 6, 16, 0, false); n = n + 1
++    elseif p == "O" or p == "L" or p == "P" then -- Branch target label.
++      local mode, m, s = parse_label(params[n], false)
++      local v = branch_type(op) -- Format tag derived from the opcode bits.
++      waction("REL_"..mode, m+v, s, 1)
++      n = n + 1
++    elseif p == "a" then -- Signed 16-bit immediate at bit 10 (used by jirl).
++      op = op + parse_imm(params[n], 16, 10, 0, true); n = n + 1
++    else
++      assert(false) -- Unknown operand letter in a template string.
++    end
++  end
++  wputpos(pos, op)
++end
++
++------------------------------------------------------------------------------
++
++-- Pseudo-opcode to mark the position where the action list is to be emitted.
++map_op[".actionlist_1"] = function(params)
++  if not params then return "cvar" end  -- Query mode: describe the expected parameter.
++  local name = params[1] -- No syntax check. You get to keep the pieces.
++  wline(function(out) writeactions(out, name) end)  -- Queue a callback that writes the actions under `name`.
++end
++
++-- Pseudo-opcode to mark the position where the global enum is to be emitted.
++map_op[".globals_1"] = function(params)
++  if not params then return "prefix" end  -- Query mode: describe the expected parameter.
++  local prefix = params[1] -- No syntax check. You get to keep the pieces.
++  wline(function(out) writeglobals(out, prefix) end)  -- Queue a callback that writes the globals using `prefix`.
++end
++
++-- Pseudo-opcode to mark the position where the global names are to be emitted.
++map_op[".globalnames_1"] = function(params)
++  if not params then return "cvar" end  -- Query mode: describe the expected parameter.
++  local name = params[1] -- No syntax check. You get to keep the pieces.
++  wline(function(out) writeglobalnames(out, name) end)  -- Queue a callback that writes the global names under `name`.
++end
++
++-- Pseudo-opcode to mark the position where the extern names are to be emitted.
++map_op[".externnames_1"] = function(params)
++  if not params then return "cvar" end  -- Query mode: describe the expected parameter.
++  local name = params[1] -- No syntax check. You get to keep the pieces.
++  wline(function(out) writeexternnames(out, name) end)  -- Queue a callback that writes the extern names under `name`.
++end
++
++------------------------------------------------------------------------------
++
++-- Label pseudo-opcode (converted from trailing colon form).
++map_op[".label_1"] = function(params)
++  if not params then return "[1-9] | ->global | =>pcexpr" end  -- Query mode: describe the accepted label forms.
++  if secpos+1 > maxsecpos then wflush() end  -- Flush first if the section buffer has no free position left.
++  local mode, n, s = parse_label(params[1], true)  -- true: parse as a definition, not a reference.
++  if mode == "EXT" then werror("bad label definition") end
++  waction("LABEL_"..mode, n, s, 1)
++end
++
++------------------------------------------------------------------------------
++
++-- Pseudo-opcodes for data storage: write each numeric parameter as one word.
++map_op[".long_*"] = function(params)
++  if not params then return "imm..." end  -- Query mode: describe the expected parameters.
++  for _,p in ipairs(params) do
++    local n = tonumber(p)
++    if not n then werror("bad immediate `"..p.."'") end  -- Only literal numbers are accepted.
++    if n < 0 then n = n + 2^32 end  -- Map negative values to their unsigned 32-bit form.
++    wputw(n)
++    if secpos+2 > maxsecpos then wflush() end
++  end
++end
++
++-- Alignment pseudo-opcode.
++map_op[".align_1"] = function(params)
++  if not params then return "numpow2" end  -- Query mode: describe the expected parameter.
++  if secpos+1 > maxsecpos then wflush() end
++  local align = tonumber(params[1])
++  if align then
++    local x = align
++    -- Must be a power of 2 in the range (2 ... 256).
++    for i=1,8 do  -- Halve up to eight times; reaching exactly 1 proves a power of two.
++      x = x / 2
++      if x == 1 then
++	waction("ALIGN", align-1, nil, 1) -- Action byte is 2**n-1.
++	return
++      end
++    end
++  end
++  werror("bad alignment")  -- Non-numeric, not a power of two, or outside the range.
++end
++
++------------------------------------------------------------------------------
++
++-- Pseudo-opcode for (primitive) type definitions (map to C types).
++map_op[".type_3"] = function(params, nparams)
++  if not params then  -- Query mode: describe the parameters for the 2- or 3-argument form.
++    return nparams == 2 and "name, ctype" or "name, ctype, reg"
++  end
++  local name, ctype, reg = params[1], params[2], params[3]  -- reg is nil in the 2-argument form.
++  if not match(name, "^[%a_][%w_]*$") then
++    werror("bad type name `"..name.."'")
++  end
++  local tp = map_type[name]
++  if tp then
++    werror("duplicate type `"..name.."'")
++  end
++  -- Add #type to defines. A bit unclean to put it in map_archdef.
++  map_archdef["#"..name] = "sizeof("..ctype..")"
++  -- Add new type and emit shortcut define.
++  local num = ctypenum + 1
++  map_type[name] = {
++    ctype = ctype,
++    ctypefmt = format("Dt%X(%%s)", num),  -- Format string that wraps a field expression in the DtN() macro.
++    reg = reg,
++  }
++  wline(format("#define Dt%X(_V) (int)(ptrdiff_t)&(((%s *)0)_V)", num, ctype))
++  ctypenum = num
++end
++map_op[".type_2"] = map_op[".type_3"]  -- The 2-argument form shares the same handler.
++
++-- Dump type definitions, sorted by name, to `out`.
++local function dumptypes(out, lvl)  -- lvl is accepted but not used here.
++  local t = {}
++  for name in pairs(map_type) do t[#t+1] = name end
++  sort(t)  -- pairs() order is unspecified, so sort the names first.
++  out:write("Type definitions:\n")
++  for _,name in ipairs(t) do
++    local tp = map_type[name]
++    local reg = tp.reg or ""  -- Types declared without a register print an empty column.
++    out:write(format("  %-20s %-20s %s\n", name, tp.ctype, reg))
++  end
++  out:write("\n")
++end
++
++------------------------------------------------------------------------------
++
++-- Set the current section: emit a SECTION action for `num`, then flush.
++function _M.section(num)
++  waction("SECTION", num)
++  wflush(true) -- SECTION is a terminal action.
++end
++
++------------------------------------------------------------------------------
++
++-- Dump architecture description: a version banner followed by the action list.
++function _M.dumparch(out)
++  out:write(format("DynASM %s version %s, released %s\n\n",
++    _info.arch, _info.version, _info.release))
++  dumpactions(out)
++end
++
++-- Dump all user defined elements: types, then globals, then externs.
++function _M.dumpdef(out, lvl)
++  dumptypes(out, lvl)
++  dumpglobals(out, lvl)
++  dumpexterns(out, lvl)
++end
++
++------------------------------------------------------------------------------
++
++-- Pass callbacks from/to the DynASM core: store the four writers, hand back wflush.
++function _M.passcb(wl, we, wf, ww)
++  wline, werror, wfatal, wwarn = wl, we, wf, ww
++  return wflush
++end
++
++-- Setup the arch-specific module: remember the architecture name and the options.
++function _M.setup(arch, opt)
++  g_arch, g_opt = arch, opt
++end
++
++-- Merge the core maps and the arch-specific maps.
++function _M.mergemaps(map_coreop, map_def)
++  setmetatable(map_op, { __index = map_coreop })  -- Opcodes missing from map_op fall back to the core opcodes.
++  setmetatable(map_def, { __index = map_archdef })  -- Defines missing from map_def fall back to map_archdef.
++  return map_op, map_def
++end
++
++return _M
++
++------------------------------------------------------------------------------
++
+-- 
+2.20.1
+
diff --git a/loongarch64/0003-LoongArch64-Add-register-assignments-for-the-interpr.patch b/loongarch64/0003-LoongArch64-Add-register-assignments-for-the-interpr.patch
new file mode 100644
index 0000000..bb38583
--- /dev/null
+++ b/loongarch64/0003-LoongArch64-Add-register-assignments-for-the-interpr.patch
@@ -0,0 +1,94 @@
+From 4a6f77c5dd016a10f59ec8c1c32c57b1187eee5f Mon Sep 17 00:00:00 2001
+From: Xiaolin Zhao <zhaoxiaolin@loongson.cn>
+Date: Wed, 27 Jul 2022 15:52:31 +0800
+Subject: [PATCH 03/20] LoongArch64: Add register assignments for the
+ interpreter
+
+---
+ src/vm_loongarch64.dasc | 74 +++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 74 insertions(+)
+ create mode 100644 src/vm_loongarch64.dasc
+
+diff --git a/src/vm_loongarch64.dasc b/src/vm_loongarch64.dasc
+new file mode 100644
+index 00000000..c959c599
+--- /dev/null
++++ b/src/vm_loongarch64.dasc
+@@ -0,0 +1,74 @@
++|// Low-level VM code for LoongArch CPUs.
++|// Bytecode interpreter, fast functions and helper functions.
++|// Copyright (C) 2005-2022 Mike Pall. See Copyright Notice in luajit.h
++|
++|.arch loongarch64
++|
++|//-----------------------------------------------------------------------
++|
++|// Fixed register assignments for the interpreter.
++|// Don't use: r0 = 0, r1 = ra, r2 = tp, r3 = sp, r21 = reserved
++|
++|
++|// The following must be C callee-save (but BASE is often refetched).
++|.define BASE,		r23	// Base of current Lua stack frame.
++|.define KBASE,		r24	// Constants of current Lua function.
++|.define PC,		r25	// Next PC.
++|.define DISPATCH,	r26	// Opcode dispatch table.
++|.define LREG,		r27	// Register holding lua_State (also in SAVE_L).
++|.define MULTRES,	r28	// Size of multi-result: (nresults+1)*8.
++|
++|.define JGL,		r22	// On-trace: global_State + 32768.
++|
++|// Constants for type-comparisons, stores and conversions. C callee-save.
++|.define TISNIL,	r22
++|.define TISNUM,	r29
++|.define TOBIT,		f30	// 2^52 + 2^51.
++|
++|// The following temporaries are not saved across C calls, except for RA.
++|.define RA,		r30	// Callee-save.
++|.define RB,		r8
++|.define RC,		r9
++|.define RD,		r10
++|.define INS,		r11
++|
++|.define TMP0,		r12
++|.define TMP1,		r13
++|.define TMP2,		r14
++|.define TMP3,		r15
++|.define TMP4,		r17
++|
++|// Loongarch lp64 calling convention.
++|.define CARG1,		r4
++|.define CARG2,		r5
++|.define CARG3,		r6
++|.define CARG4,		r7
++|.define CARG5,		r8
++|.define CARG6,		r9
++|.define CARG7,		r10
++|.define CARG8,		r11
++|
++|.define CRET1,		r4
++|.define CRET2,		r5
++|
++|.define FARG1,		f0
++|.define FARG2,		f1
++|.define FARG3,		f2
++|.define FARG4,		f3
++|.define FARG5,		f4
++|.define FARG6,		f5
++|.define FARG7,		f6
++|.define FARG8,		f7
++|
++|.define FRET1,		f0
++|.define FRET2,		f1
++|
++|.define FTMP0,		f8
++|.define FTMP1,		f9
++|.define FTMP2,		f10
++|.define FTMP3,		f22
++|.define FTMP4,		f23
++|
++|.define FCC0,		fcc0
++|.define FCC1,		fcc1
++|
+-- 
+2.20.1
+
diff --git a/loongarch64/0004-LoongArch64-Add-stack-layout.patch b/loongarch64/0004-LoongArch64-Add-stack-layout.patch
new file mode 100644
index 0000000..707485f
--- /dev/null
+++ b/loongarch64/0004-LoongArch64-Add-stack-layout.patch
@@ -0,0 +1,111 @@
+From 383251857b9c1f5f06abe9ab0da60812c4e03123 Mon Sep 17 00:00:00 2001
+From: Xiaolin Zhao <zhaoxiaolin@loongson.cn>
+Date: Wed, 27 Jul 2022 15:56:23 +0800
+Subject: [PATCH 04/20] LoongArch64: Add stack layout
+
+---
+ src/lj_frame.h          |  9 ++++++
+ src/vm_loongarch64.dasc | 70 +++++++++++++++++++++++++++++++++++++++++
+ 2 files changed, 79 insertions(+)
+
+diff --git a/src/lj_frame.h b/src/lj_frame.h
+index aa1dc11a..65854689 100644
+--- a/src/lj_frame.h
++++ b/src/lj_frame.h
+@@ -264,6 +264,15 @@ enum { LJ_CONT_TAILCALL, LJ_CONT_FFI_CALLBACK };  /* Special continuations. */
+ #endif
+ #define CFRAME_OFS_MULTRES	0
+ #define CFRAME_SHIFT_MULTRES	3
++#elif LJ_TARGET_LOONGARCH64
++#define CFRAME_OFS_ERRF		196
++#define CFRAME_OFS_NRES		192
++#define CFRAME_OFS_PREV		184
++#define CFRAME_OFS_L		176
++#define CFRAME_OFS_PC		168
++#define CFRAME_SIZE		200
++#define CFRAME_OFS_MULTRES	0
++#define CFRAME_SHIFT_MULTRES	3
+ #else
+ #error "Missing CFRAME_* definitions for this architecture"
+ #endif
+diff --git a/src/vm_loongarch64.dasc b/src/vm_loongarch64.dasc
+index c959c599..1388c684 100644
+--- a/src/vm_loongarch64.dasc
++++ b/src/vm_loongarch64.dasc
+@@ -72,3 +72,73 @@
+ |.define FCC0,		fcc0
+ |.define FCC1,		fcc1
+ |
++|// Stack layout while in interpreter. Must match with lj_frame.h.
++|// LoongArch64 hard-float.
++|
++|.define CFRAME_SPACE,	200	// Delta for sp.
++|
++|//----- 16 byte aligned, <-- sp entering interpreter
++|.define SAVE_ERRF,	196	// 32 bit values.
++|.define SAVE_NRES,	192
++|.define SAVE_CFRAME,	184	// 64 bit values.
++|.define SAVE_L,	176
++|.define SAVE_PC,	168
++|//----- 16 byte aligned
++|.define SAVE_GPR_,	80	// .. 80+11*8: 64 bit GPR saves.
++|.define SAVE_FPR_,	16	// .. 16+8*8: 64 bit FPR saves.
++|
++|
++|.define TMPD,		0
++|//----- 16 byte aligned
++|
++|.define TMPD_OFS,	0
++|
++|//-----------------------------------------------------------------------
++|
++|.macro saveregs
++|  addi.d sp, sp, -CFRAME_SPACE
++|  st.d ra, SAVE_GPR_+10*8(sp)
++|  st.d r22, SAVE_GPR_+9*8(sp)
++|  st.d r31, SAVE_GPR_+8*8(sp)
++|  fst.d f31, SAVE_FPR_+7*8(sp)
++|  st.d r30, SAVE_GPR_+7*8(sp)
++|  fst.d f30, SAVE_FPR_+6*8(sp)
++|  st.d r29, SAVE_GPR_+6*8(sp)
++|  fst.d f29, SAVE_FPR_+5*8(sp)
++|  st.d r28, SAVE_GPR_+5*8(sp)
++|  fst.d f28, SAVE_FPR_+4*8(sp)
++|  st.d r27, SAVE_GPR_+4*8(sp)
++|  fst.d f27, SAVE_FPR_+3*8(sp)
++|  st.d r26, SAVE_GPR_+3*8(sp)
++|  fst.d f26, SAVE_FPR_+2*8(sp)
++|  st.d r25, SAVE_GPR_+2*8(sp)
++|  fst.d f25, SAVE_FPR_+1*8(sp)
++|  st.d r24, SAVE_GPR_+1*8(sp)
++|  fst.d f24, SAVE_FPR_+0*8(sp)
++|  st.d r23, SAVE_GPR_+0*8(sp)
++|.endmacro
++|
++|.macro restoreregs_ret
++|  ld.d ra, SAVE_GPR_+10*8(sp)
++|  ld.d r22, SAVE_GPR_+9*8(sp)
++|  ld.d r31, SAVE_GPR_+8*8(sp)
++|  ld.d r30, SAVE_GPR_+7*8(sp)
++|  fld.d f31, SAVE_FPR_+7*8(sp)
++|  ld.d r29, SAVE_GPR_+6*8(sp)
++|  fld.d f30, SAVE_FPR_+6*8(sp)
++|  ld.d r28, SAVE_GPR_+5*8(sp)
++|  fld.d f29, SAVE_FPR_+5*8(sp)
++|  ld.d r27, SAVE_GPR_+4*8(sp)
++|  fld.d f28, SAVE_FPR_+4*8(sp)
++|  ld.d r26, SAVE_GPR_+3*8(sp)
++|  fld.d f27, SAVE_FPR_+3*8(sp)
++|  ld.d r25, SAVE_GPR_+2*8(sp)
++|  fld.d f26, SAVE_FPR_+2*8(sp)
++|  ld.d r24, SAVE_GPR_+1*8(sp)
++|  fld.d f25, SAVE_FPR_+1*8(sp)
++|  ld.d r23, SAVE_GPR_+0*8(sp)
++|  fld.d f24, SAVE_FPR_+0*8(sp)
++|  addi.d sp, sp, CFRAME_SPACE
++|  jirl r0, ra, 0
++|.endmacro
++|
+-- 
+2.20.1
+
diff --git a/loongarch64/0005-LoongArch64-Add-some-general-macro-type-definitions-.patch b/loongarch64/0005-LoongArch64-Add-some-general-macro-type-definitions-.patch
new file mode 100644
index 0000000..e8aeddb
--- /dev/null
+++ b/loongarch64/0005-LoongArch64-Add-some-general-macro-type-definitions-.patch
@@ -0,0 +1,246 @@
+From 26b5e066b1fec1d0a6a1e7baa33f63d065a7507d Mon Sep 17 00:00:00 2001
+From: Xiaolin Zhao <zhaoxiaolin@loongson.cn>
+Date: Wed, 27 Jul 2022 16:00:02 +0800
+Subject: [PATCH 05/20] LoongArch64: Add some general macro/type definitions in
+ the interpreter
+
+---
+ src/vm_loongarch64.dasc | 225 ++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 225 insertions(+)
+
+diff --git a/src/vm_loongarch64.dasc b/src/vm_loongarch64.dasc
+index 1388c684..a12b93d4 100644
+--- a/src/vm_loongarch64.dasc
++++ b/src/vm_loongarch64.dasc
+@@ -142,3 +142,228 @@
+ |  jirl r0, ra, 0
+ |.endmacro
+ |
++|//-----------------------------------------------------------------------
++|
++|.macro .STXW, a, b, c	// 32 bit store of a to address b+c. Clobbers r20.
++|  addu16i.d r20, r0, c	// r20 = c << 16; c must fit a signed 16 bit immediate.
++|  srai.d r20, r20, 16	// r20 = sign-extended c.
++|  stx.w a, b, r20
++|.endmacro
++|
++|.macro .STXD, a, b, c	// 64 bit store of a to address b+c. Clobbers r20.
++|  addu16i.d r20, r0, c	// r20 = c << 16; c must fit a signed 16 bit immediate.
++|  srai.d r20, r20, 16	// r20 = sign-extended c.
++|  stx.d a, b, r20
++|.endmacro
++|
++|.macro .LDXW, a, b, c	// 32 bit (sign-extending) load of a from address b+c. Clobbers r20.
++|  addu16i.d r20, r0, c	// r20 = c << 16; c must fit a signed 16 bit immediate.
++|  srai.d r20, r20, 16	// r20 = sign-extended c.
++|  ldx.w a, b, r20
++|.endmacro
++|
++|.macro .LDXD, a, b, c	// 64 bit load of a from address b+c. Clobbers r20.
++|  addu16i.d r20, r0, c	// r20 = c << 16; c must fit a signed 16 bit immediate.
++|  srai.d r20, r20, 16	// r20 = sign-extended c.
++|  ldx.d a, b, r20
++|.endmacro
++|
++|.macro .LDXBU, a, b, c	// Zero-extending byte load of a from address b+c. Clobbers r20.
++|  addu16i.d r20, r0, c	// r20 = c << 16; c must fit a signed 16 bit immediate.
++|  srai.d r20, r20, 16	// r20 = sign-extended c.
++|  ldx.bu a, b, r20
++|.endmacro
++|
++|.macro .ADD16I, a, b, c	// a = b + c for a signed 16 bit constant c. Clobbers r20.
++|  addu16i.d r20, r0, c	// r20 = c << 16; c must fit a signed 16 bit immediate.
++|  srai.d r20, r20, 16	// r20 = sign-extended c.
++|  add.d a, b, r20
++|.endmacro
++|
++|// Type definitions. Some of these are only used for documentation.
++|.type L,		lua_State,	LREG
++|.type GL,		global_State
++|.type TVALUE,		TValue
++|.type GCOBJ,		GCobj
++|.type STR,		GCstr
++|.type TAB,		GCtab
++|.type LFUNC,		GCfuncL
++|.type CFUNC,		GCfuncC
++|.type PROTO,		GCproto
++|.type UPVAL,		GCupval
++|.type NODE,		Node
++|.type NARGS8,		int
++|.type TRACE,		GCtrace
++|.type SBUF,		SBuf
++|
++|//-----------------------------------------------------------------------
++|
++|// Trap for not-yet-implemented parts.
++|.macro NYI; break 0; .endmacro
++|
++|//-----------------------------------------------------------------------
++|
++|// Access to frame relative to BASE.
++|.define FRAME_PC,	-8
++|.define FRAME_FUNC,	-16
++|
++|//-----------------------------------------------------------------------
++|
++|// Endian-specific defines. LoongArch is little endian.
++|.define OFS_RD,	2
++|.define OFS_RA,	1
++|.define OFS_OP,	0
++|
++|// Instruction decode. Operand fields are scaled by 8 to index 8 byte slots.
++|.macro decode_BC4b, dst; slli.w dst, dst, 2; .endmacro	// dst *= 4 (32 bit shift).
++|.macro decode_BC8b, dst; slli.w dst, dst, 3; .endmacro	// dst *= 8 (32 bit shift).
++|.macro decode_OP, dst, ins; andi dst, ins, 0xff; .endmacro	// dst = ins bits 7..0.
++|.macro decode_RA, dst, ins; bstrpick.d dst, ins, 15, 8; decode_BC8b dst; .endmacro	// dst = ins bits 15..8, times 8.
++|.macro decode_RB, dst, ins; bstrpick.d dst, ins, 31, 24; decode_BC8b dst; .endmacro	// dst = ins bits 31..24, times 8.
++|.macro decode_RC, dst, ins; bstrpick.d dst, ins, 23, 16; decode_BC8b dst; .endmacro	// dst = ins bits 23..16, times 8.
++|.macro decode_RD, dst, ins; bstrpick.d dst, ins, 31, 16; decode_BC8b dst; .endmacro	// dst = ins bits 31..16, times 8.
++|.macro decode_RDtoRC8, dst, src; andi dst, src, 0x7f8; .endmacro	// Keep bits 10..3 of a decoded RD: its low byte, times 8.
++|
++|// Instruction fetch: load the next 32 bit instruction into INS, advance PC.
++|.macro ins_NEXT1
++|  ld.w INS, 0(PC)
++|  addi.d PC, PC, 4	// PC now points past the fetched instruction.
++|.endmacro
++|// Instruction decode+dispatch: jump via DISPATCH[opcode] with RA/RD decoded.
++|.macro ins_NEXT2
++|  decode_OP TMP1, INS	// TMP1 = opcode.
++|  decode_BC8b TMP1	// TMP1 = opcode*8 (byte offset of an 8 byte table entry).
++|  add.d TMP0, DISPATCH, TMP1
++|  ld.d TMP4, 0(TMP0)	// TMP4 = handler address loaded from DISPATCH.
++|   decode_RD RD, INS
++|   decode_RA RA, INS
++|  jirl r0, TMP4, 0	// Jump to the handler (no link).
++|.endmacro
++|.macro ins_NEXT	// Fetch, decode and dispatch in one go.
++|  ins_NEXT1
++|  ins_NEXT2
++|.endmacro
++|
++|// Instruction footer.
++|.if 1
++|  // Replicated dispatch. Less unpredictable branches, but higher I-Cache use.
++|  .define ins_next, ins_NEXT
++|  .define ins_next_, ins_NEXT
++|  .define ins_next1, ins_NEXT1
++|  .define ins_next2, ins_NEXT2
++|.else
++|  // Common dispatch. Lower I-Cache use, only one (very) unpredictable branch.
++|  // Affects only certain kinds of benchmarks (and only with -j off).
++|  .macro ins_next
++|    b ->ins_next
++|  .endmacro
++|  .macro ins_next1
++|  .endmacro
++|  .macro ins_next2
++|    b ->ins_next
++|  .endmacro
++|  .macro ins_next_
++|  ->ins_next:
++|    ins_NEXT
++|  .endmacro
++|.endif
++|
++|// Call decode and dispatch.
++|.macro ins_callt
++|  // BASE = new base, RB = LFUNC/CFUNC, RC = nargs*8, FRAME_PC(BASE) = PC
++|  ld.d PC, LFUNC:RB->pc
++|  ld.w INS, 0(PC)
++|  addi.d PC, PC, 4
++|  decode_OP TMP1, INS
++|  decode_RA RA, INS
++|  decode_BC8b TMP1
++|  add.d TMP0, DISPATCH, TMP1
++|  ld.d TMP0, 0(TMP0)
++|  add.d RA, RA, BASE
++|  jirl r0, TMP0, 0
++|.endmacro
++|
++|.macro ins_call
++|  // BASE = new base, RB = LFUNC/CFUNC, RC = nargs*8, PC = caller PC
++|  st.d PC, FRAME_PC(BASE)
++|  ins_callt
++|.endmacro
++|
++|//-----------------------------------------------------------------------
++|
++|.macro branch_RD
++|  srli.w TMP0, RD, 1
++|  addu16i.d TMP4, r0, -0x2	// -BCBIAS_J*4
++|  add.w TMP0, TMP0, TMP4	// (jump - 0x8000)<<2
++|  add.d PC, PC, TMP0
++|.endmacro
++|
++|// Assumes DISPATCH is relative to GL.
++#define DISPATCH_GL(field)	(GG_DISP2G + (int)offsetof(global_State, field))
++#define DISPATCH_J(field)	(GG_DISP2J + (int)offsetof(jit_State, field))
++|
++#define PC2PROTO(field)  ((int)offsetof(GCproto, field)-(int)sizeof(GCproto))
++|
++|
++|// Set current VM state. Uses TMP0.
++|.macro li_vmstate, st; addi.w TMP0, r0, ~LJ_VMST_..st; .endmacro
++|.macro st_vmstate; .STXW TMP0, DISPATCH, DISPATCH_GL(vmstate); .endmacro
++|
++|// Move table write barrier back. Overwrites mark and tmp.
++|.macro barrierback, tab, mark, tmp, target
++|  .LDXD tmp, DISPATCH, DISPATCH_GL(gc.grayagain)
++|  andi mark, mark, ~LJ_GC_BLACK & 255		// black2gray(tab)
++|  .STXD tab, DISPATCH, DISPATCH_GL(gc.grayagain)
++|  st.b mark, tab->marked
++|  st.d tmp, tab->gclist
++|  b target
++|.endmacro
++|
++|// Clear type tag. Isolate lowest 47 bits of reg.
++|.macro cleartp, reg; bstrpick.d reg, reg, 46, 0; .endmacro
++|.macro cleartp, dst, reg; bstrpick.d dst, reg, 46, 0; .endmacro
++|
++|// Set type tag: Merge 17 type bits into bits [47, 63] of dst.
++|.macro settp, dst, tp; bstrins.d dst, tp, 63, 47; .endmacro
++|
++|// Extract (negative) type tag.
++|.macro gettp, dst, src; srai.d dst, src, 47; .endmacro
++|
++|// Macros to check the TValue type and extract the GCobj. Branch on failure.
++|.macro checktp, reg, tp, target
++|  gettp TMP4, reg
++|  addi.d TMP4, TMP4, tp
++|  cleartp reg
++|  bnez TMP4, target
++|.endmacro
++|.macro checktp, dst, reg, tp, target
++|  gettp TMP4, reg
++|  addi.d TMP4, TMP4, tp
++|  cleartp dst, reg
++|  bnez TMP4, target
++|.endmacro
++|.macro checkstr, reg, target; checktp reg, -LJ_TSTR, target; .endmacro
++|.macro checktab, reg, target; checktp reg, -LJ_TTAB, target; .endmacro
++|.macro checkfunc, reg, target; checktp reg, -LJ_TFUNC, target; .endmacro
++|.macro checkint, reg, target
++|  gettp TMP4, reg
++|  bne TMP4, TISNUM, target
++|.endmacro
++|.macro checknum, reg, target
++|  gettp TMP4, reg
++|  sltui TMP4, TMP4, LJ_TISNUM
++|  beqz TMP4, target
++|.endmacro
++|
++|.macro mov_false, reg	// reg = ~(1 << 47); used as the 'false' value (see vm_unwind_ff_eh).
++|  addi.d reg, r0, 0x0001
++|  slli.d reg, reg, 47	// reg = 1 << 47.
++|  nor reg, reg, r0	// nor with r0 is a bitwise NOT.
++|.endmacro
++|.macro mov_true, reg	// reg = ~(1 << 48); used as the 'true' value (see vm_returnp).
++|  addi.d reg, r0, 0x0001
++|  slli.d reg, reg, 48	// reg = 1 << 48.
++|  nor reg, reg, r0	// nor with r0 is a bitwise NOT.
++|.endmacro
++|
++|//-----------------------------------------------------------------------
+-- 
+2.20.1
+
diff --git a/loongarch64/0006-LoongArch64-Add-pure-interpreter-backend.patch b/loongarch64/0006-LoongArch64-Add-pure-interpreter-backend.patch
new file mode 100644
index 0000000..6ed0860
--- /dev/null
+++ b/loongarch64/0006-LoongArch64-Add-pure-interpreter-backend.patch
@@ -0,0 +1,3578 @@
+From 806fd208b21befb6bd4e5fefb368052f558f30ec Mon Sep 17 00:00:00 2001
+From: Xiaolin Zhao <zhaoxiaolin@loongson.cn>
+Date: Wed, 27 Jul 2022 16:02:03 +0800
+Subject: [PATCH 06/20] LoongArch64: Add pure interpreter backend
+
+---
+ src/lj_vmmath.c         |    3 +-
+ src/vm_loongarch64.dasc | 3536 +++++++++++++++++++++++++++++++++++++++
+ 2 files changed, 3538 insertions(+), 1 deletion(-)
+
+diff --git a/src/lj_vmmath.c b/src/lj_vmmath.c
+index b6cc60ba..e8cfc699 100644
+--- a/src/lj_vmmath.c
++++ b/src/lj_vmmath.c
+@@ -58,7 +58,8 @@ double lj_vm_foldarith(double x, double y, int op)
+ 
+ /* -- Helper functions for generated machine code ------------------------- */
+ 
+-#if (LJ_HASJIT && !(LJ_TARGET_ARM || LJ_TARGET_ARM64 || LJ_TARGET_PPC)) || LJ_TARGET_MIPS
++#if (LJ_HASJIT && !(LJ_TARGET_ARM || LJ_TARGET_ARM64 || LJ_TARGET_PPC)) || LJ_TARGET_MIPS \
++  || LJ_TARGET_LOONGARCH64
+ int32_t LJ_FASTCALL lj_vm_modi(int32_t a, int32_t b)
+ {
+   uint32_t y, ua, ub;
+diff --git a/src/vm_loongarch64.dasc b/src/vm_loongarch64.dasc
+index a12b93d4..9f98c382 100644
+--- a/src/vm_loongarch64.dasc
++++ b/src/vm_loongarch64.dasc
+@@ -3,6 +3,12 @@
+ |// Copyright (C) 2005-2022 Mike Pall. See Copyright Notice in luajit.h
+ |
+ |.arch loongarch64
++|.section code_op, code_sub
++|
++|.actionlist build_actionlist
++|.globals GLOB_
++|.globalnames globnames
++|.externnames extnames
+ |
+ |//-----------------------------------------------------------------------
+ |
+@@ -367,3 +373,3533 @@
+ |.endmacro
+ |
+ |//-----------------------------------------------------------------------
++
++/* Generate subroutines used by opcodes and other parts of the VM. */
++/* The .code_sub section should be last to help static branch prediction. */
++static void build_subroutines(BuildCtx *ctx)
++{
++  |.code_sub
++  |
++  |//-----------------------------------------------------------------------
++  |//-- Return handling ----------------------------------------------------
++  |//-----------------------------------------------------------------------
++  |
++  |->vm_returnp:
++  |  // See vm_return. Also: TMP2 = previous base.
++  |  andi TMP0, PC, FRAME_P
++  |
++  |  // Return from pcall or xpcall fast func.
++  |  mov_true TMP1
++  |  beqz TMP0, ->cont_dispatch
++  |  ld.d PC, FRAME_PC(TMP2)		// Fetch PC of previous frame.
++  |  or BASE, TMP2, r0			// Restore caller base.
++  |  // Prepending may overwrite the pcall frame, so do it at the end.
++  |  st.d TMP1, -8(RA)			// Prepend true to results.
++  |  addi.d RA, RA, -8
++  |
++  |->vm_returnc:
++  |  addi.w RD, RD, 8			// RD = (nresults+1)*8.
++  |  andi TMP0, PC, FRAME_TYPE
++  |  addi.w CRET1, r0, LUA_YIELD
++  |  beqz RD, ->vm_unwind_c_eh
++  |  or MULTRES, RD, r0
++  |  beqz TMP0, ->BC_RET_Z		// Handle regular return to Lua.
++  |
++  |->vm_return:
++  |  // BASE = base, RA = resultptr, RD/MULTRES = (nresults+1)*8, PC = return
++  |  // TMP0 = PC & FRAME_TYPE
++  |  addi.w TMP2, r0, -8		// TMP2 = -8 (sign-extended): mask clearing the low 3 bits.
++  |  xori TMP0, TMP0, FRAME_C
++  |  and TMP2, PC, TMP2
++  |  sub.d TMP2, BASE, TMP2		// TMP2 = previous base.
++  |  bnez TMP0, ->vm_returnp
++  |
++  |  addi.w TMP1, RD, -8
++  |  st.d TMP2, L->base
++  |  li_vmstate C
++  |  ld.w TMP2, SAVE_NRES(sp)
++  |  addi.d BASE, BASE, -16
++  |  st_vmstate
++  |  slli.w TMP2, TMP2, 3
++  |  beqz TMP1, >2
++  |1:
++  |  addi.w TMP1, TMP1, -8
++  |  ld.d CRET1, 0(RA)
++  |  addi.d RA, RA, 8
++  |  st.d CRET1, 0(BASE)
++  |  addi.d BASE, BASE, 8
++  |  bnez TMP1, <1
++  |
++  |2:
++  |  bne TMP2, RD, >6
++  |3:
++  |  st.d BASE, L->top			// Store new top.
++  |
++  |->vm_leave_cp:
++  |  ld.d TMP0, SAVE_CFRAME(sp)		// Restore previous C frame.
++  |  or CRET1, r0, r0			// Ok return status for vm_pcall.
++  |  st.d TMP0, L->cframe
++  |
++  |->vm_leave_unw:
++  |  restoreregs_ret
++  |
++  |6:
++  |  ld.d TMP1, L->maxstack
++  |  slt TMP0, TMP2, RD
++  |  // More results wanted. Check stack size and fill up results with nil.
++  |  slt TMP1, BASE, TMP1
++  |  bnez TMP0, >7
++  |  beqz TMP1, >8
++  |  st.d TISNIL, 0(BASE)
++  |  addi.w RD, RD, 8
++  |  addi.d BASE, BASE, 8
++  |  b <2
++  |
++  |7:  // Fewer results wanted.
++  |  sub.w TMP0, RD, TMP2
++  |  sub.d TMP0, BASE, TMP0		// Either keep top or shrink it.
++  |  maskeqz TMP0, TMP0, TMP2		// LUA_MULTRET+1 case?
++  |  masknez BASE, BASE, TMP2
++  |  or BASE, BASE, TMP0
++  |  b <3
++  |
++  |8:  // Corner case: need to grow stack for filling up results.
++  |  // This can happen if:
++  |  // - A C function grows the stack (a lot).
++  |  // - The GC shrinks the stack in between.
++  |  // - A return back from a lua_call() with (high) nresults adjustment.
++  |
++  |  st.d BASE, L->top                   // Save current top held in BASE (yes).
++  |  or MULTRES, RD, r0
++  |  srli.w CARG2, TMP2, 3
++  |  or CARG1, L, r0
++  |  bl extern lj_state_growstack       // (lua_State *L, int n)
++  |  ld.w TMP2, SAVE_NRES(sp)
++  |  ld.d BASE, L->top			// Need the (realloced) L->top in BASE.
++  |  or RD, MULTRES, r0
++  |  slli.w TMP2, TMP2, 3
++  |  b <2
++  |
++  |->vm_unwind_c:			// Unwind C stack, return from vm_pcall.
++  |  // (void *cframe, int errcode)
++  |  or sp, CARG1, r0
++  |  or CRET1, CARG2, r0
++  |->vm_unwind_c_eh:			// Landing pad for external unwinder.
++  |  ld.d L, SAVE_L(sp)
++  |  addi.w TMP0, r0, ~LJ_VMST_C
++  |  ld.d GL:TMP1, L->glref
++  |  st.w TMP0, GL:TMP1->vmstate
++  |  b ->vm_leave_unw
++  |
++  |->vm_unwind_ff:			// Unwind C stack, return from ff pcall.
++  |  // (void *cframe)
++  |  addi.d TMP3, r0, CFRAME_RAWMASK
++  |  and sp, CARG1, TMP3
++  |->vm_unwind_ff_eh:			// Landing pad for external unwinder.
++  |  ld.d L, SAVE_L(sp)
++  |  addu16i.d TMP3, r0, 0x59c0		// TOBIT = 2^52 + 2^51 (float).
++  |  addi.d TISNIL, r0, LJ_TNIL
++  |  addi.d TISNUM, r0, LJ_TISNUM
++  |  ld.d BASE, L->base
++  |  ld.d DISPATCH, L->glref		// Setup pointer to dispatch table.
++  |  movgr2fr.w TOBIT, TMP3
++  |  mov_false TMP1
++  |  li_vmstate INTERP
++  |  ld.d PC, FRAME_PC(BASE)		// Fetch PC of previous frame.
++  |  fcvt.d.s TOBIT, TOBIT
++  |  addi.d RA, BASE, -8		// Results start at BASE-8.
++  |  .ADD16I DISPATCH, DISPATCH, GG_G2DISP
++  |  st.d TMP1, 0(RA)			// Prepend false to error message.
++  |  st_vmstate
++  |  addi.d RD, r0, 16			// 2 results: false + error message.
++  |  b ->vm_returnc
++  |
++  |
++  |//-----------------------------------------------------------------------
++  |//-- Grow stack for calls -----------------------------------------------
++  |//-----------------------------------------------------------------------
++  |
++  |->vm_growstack_c:			// Grow stack for C function.
++  |  addi.d CARG2, r0, LUA_MINSTACK
++  |  b >2
++  |
++  |->vm_growstack_l:			// Grow stack for Lua function.
++  |  // BASE = new base, RA = BASE+framesize*8, RC = nargs*8, PC = first PC
++  |  add.d RC, BASE, RC
++  |  sub.d RA, RA, BASE
++  |  st.d BASE, L->base
++  |  addi.d PC, PC, 4			// Must point after first instruction.
++  |  st.d RC, L->top
++  |  srli.w CARG2, RA, 3
++  |2:
++  |  // L->base = new base, L->top = top
++  |  st.d PC, SAVE_PC(sp)
++  |  or CARG1, L, r0
++  |  bl extern lj_state_growstack	// (lua_State *L, int n)
++  |  ld.d BASE, L->base
++  |  ld.d RC, L->top
++  |  ld.d LFUNC:RB, FRAME_FUNC(BASE)
++  |  sub.d RC, RC, BASE
++  |  cleartp LFUNC:RB
++  |  // BASE = new base, RB = LFUNC/CFUNC, RC = nargs*8, FRAME_PC(BASE) = PC
++  |  ins_callt				// Just retry the call.
++  |
++  |//-----------------------------------------------------------------------
++  |//-- Entry points into the assembler VM ---------------------------------
++  |//-----------------------------------------------------------------------
++  |
++  |->vm_resume:				// Setup C frame and resume thread.
++  |  // (lua_State *L, TValue *base, int nres1 = 0, ptrdiff_t ef = 0)
++  |  saveregs
++  |  or L, CARG1, r0
++  |  ld.d DISPATCH, L->glref		// Setup pointer to dispatch table.
++  |  or BASE, CARG2, r0
++  |  ld.bu TMP1, L->status
++  |  st.d L, SAVE_L(sp)
++  |  addi.d PC, r0, FRAME_CP
++  |  addi.d TMP0, sp, CFRAME_RESUME
++  |  .ADD16I DISPATCH, DISPATCH, GG_G2DISP
++  |  st.w r0, SAVE_NRES(sp)
++  |  st.w r0, SAVE_ERRF(sp)
++  |  st.d CARG1, SAVE_PC(sp)			// Any value outside of bytecode is ok.
++  |  st.d r0, SAVE_CFRAME(sp)
++  |  st.d TMP0, L->cframe
++  |  beqz TMP1, >3
++  |
++  |  // Resume after yield (like a return).
++  |  .STXD L, DISPATCH, DISPATCH_GL(cur_L)
++  |  or RA, BASE, r0
++  |  ld.d BASE, L->base
++  |  ld.d TMP1, L->top
++  |  ld.d PC, FRAME_PC(BASE)
++  |  addu16i.d TMP3, r0, 0x59c0		// TOBIT = 2^52 + 2^51 (float).
++  |  sub.d RD, TMP1, BASE
++  |  movgr2fr.w TOBIT, TMP3
++  |  st.b r0, L->status
++  |  fcvt.d.s TOBIT, TOBIT
++  |  li_vmstate INTERP
++  |  addi.d RD, RD, 8
++  |  st_vmstate
++  |  or MULTRES, RD, r0
++  |  andi TMP0, PC, FRAME_TYPE
++  |  addi.d TISNIL, r0, LJ_TNIL
++  |  addi.d TISNUM, r0, LJ_TISNUM
++  |  beqz TMP0, ->BC_RET_Z
++  |  b ->vm_return
++  |
++  |->vm_pcall:				// Setup protected C frame and enter VM.
++  |  // (lua_State *L, TValue *base, int nres1, ptrdiff_t ef)
++  |  saveregs
++  |  st.w CARG4, SAVE_ERRF(sp)
++  |  addi.d PC, r0, FRAME_CP
++  |  b >1
++  |
++  |->vm_call:				// Setup C frame and enter VM.
++  |  // (lua_State *L, TValue *base, int nres1)
++  |  saveregs
++  |  addi.d PC, r0, FRAME_C
++  |
++  |1:  // Entry point for vm_pcall above (PC = ftype).
++  |  ld.d TMP1, L:CARG1->cframe
++  |  or L, CARG1, r0
++  |  st.w CARG3, SAVE_NRES(sp)
++  |  ld.d DISPATCH, L->glref		// Setup pointer to dispatch table.
++  |  st.d CARG1, SAVE_L(sp)
++  |  or BASE, CARG2, r0
++  |  .ADD16I DISPATCH, DISPATCH, GG_G2DISP
++  |  st.d CARG1, SAVE_PC(sp)		// Any value outside of bytecode is ok.
++  |  st.d TMP1, SAVE_CFRAME(sp)
++  |  st.d sp, L->cframe			// Add our C frame to cframe chain.
++  |
++  |3:  // Entry point for vm_cpcall/vm_resume (BASE = base, PC = ftype).
++  |  .STXD L, DISPATCH, DISPATCH_GL(cur_L)
++  |  ld.d TMP2, L->base			// TMP2 = old base (used in vmeta_call).
++  |  addu16i.d TMP3, r0, 0x59c0		// TOBIT = 2^52 + 2^51 (float).
++  |  ld.d TMP1, L->top
++  |  movgr2fr.w TOBIT, TMP3
++  |  add.d PC, PC, BASE
++  |  sub.d NARGS8:RC, TMP1, BASE
++  |  addi.d TISNUM, r0, LJ_TISNUM
++  |  sub.d PC, PC, TMP2			// PC = frame delta + frame type
++  |  fcvt.d.s TOBIT, TOBIT
++  |  li_vmstate INTERP
++  |  addi.d TISNIL, r0, LJ_TNIL
++  |  st_vmstate
++  |
++  |->vm_call_dispatch:
++  |  // TMP2 = old base, BASE = new base, RC = nargs*8, PC = caller PC
++  |  ld.d LFUNC:RB, FRAME_FUNC(BASE)
++  |  checkfunc LFUNC:RB, ->vmeta_call
++  |
++  |->vm_call_dispatch_f:
++  |  ins_call
++  |  // BASE = new base, RB = func, RC = nargs*8, PC = caller PC
++  |
++  |->vm_cpcall:				// Setup protected C frame, call C.
++  |  // (lua_State *L, lua_CFunction func, void *ud, lua_CPFunction cp)
++  |  saveregs
++  |  or L, CARG1, r0
++  |  ld.d TMP0, L:CARG1->stack
++  |  st.d CARG1, SAVE_L(sp)
++  |  ld.d TMP1, L->top
++  |  ld.d DISPATCH, L->glref		// Setup pointer to dispatch table.
++  |  st.d CARG1, SAVE_PC(sp)		// Any value outside of bytecode is ok.
++  |  sub.d TMP0, TMP0, TMP1		// Compute -savestack(L, L->top).
++  |  ld.d TMP1, L->cframe
++  |  .ADD16I DISPATCH, DISPATCH, GG_G2DISP
++  |  st.w TMP0, SAVE_NRES(sp)		// Neg. delta means cframe w/o frame.
++  |  st.w r0, SAVE_ERRF(sp)		// No error function.
++  |  st.d TMP1, SAVE_CFRAME(sp)
++  |  st.d sp, L->cframe			// Add our C frame to cframe chain.
++  |  .STXD L, DISPATCH, DISPATCH_GL(cur_L)
++  |  jirl r1, CARG4, 0			// (lua_State *L, lua_CFunction func, void *ud)
++  |  or BASE, CRET1, r0
++  |  addi.d PC, r0, FRAME_CP
++  |  bnez CRET1, <3			// Else continue with the call.
++  |  b ->vm_leave_cp			// No base? Just remove C frame.
++  |
++  |//-----------------------------------------------------------------------
++  |//-- Metamethod handling ------------------------------------------------
++  |//-----------------------------------------------------------------------
++  |
++  |//-- Continuation dispatch ----------------------------------------------
++  |
++  |->cont_dispatch:
++  |  // BASE = meta base, RA = resultptr, RD = (nresults+1)*8
++  |  ld.d TMP0, -32(BASE)		// Continuation.
++  |  or RB, BASE, r0
++  |  or BASE, TMP2, r0			// Restore caller BASE.
++  |  ld.d LFUNC:TMP1, FRAME_FUNC(TMP2)
++  |  ld.d PC, -24(RB)			// Restore PC from [cont|PC].
++  |  cleartp LFUNC:TMP1
++  |  add.d TMP2, RA, RD
++  |  ld.d TMP1, LFUNC:TMP1->pc
++  |  st.d TISNIL, -8(TMP2)               // Ensure one valid arg.
++  |  // BASE = base, RA = resultptr, RB = meta base
++  |  ld.d KBASE, PC2PROTO(k)(TMP1)
++  |  jirl r0, TMP0, 0				// Jump to continuation.
++  |
++  |
++  |->cont_cat:				// RA = resultptr, RB = meta base
++  |  ld.w INS, -4(PC)
++  |  addi.d CARG2, RB, -32
++  |  ld.d TMP0, 0(RA)
++  |  decode_RB MULTRES, INS
++  |  decode_RA RA, INS
++  |  add.d TMP1, BASE, MULTRES
++  |  st.d BASE, L->base
++  |  sub.d CARG3, CARG2, TMP1
++  |  st.d TMP0, 0(CARG2)
++  |  bne TMP1, CARG2, ->BC_CAT_Z
++  |  add.d RA, BASE, RA
++  |  st.d TMP0, 0(RA)
++  |  b ->cont_nop
++  |
++  |//-- Table indexing metamethods -----------------------------------------
++  |
++  |->vmeta_tgets1:
++  |  .ADD16I CARG3, DISPATCH, DISPATCH_GL(tmptv)
++  |  addi.d TMP0, r0, LJ_TSTR
++  |  settp STR:RC, TMP0
++  |  st.d STR:RC, 0(CARG3)
++  |  b >1
++  |
++  |->vmeta_tgets:
++  |  .ADD16I CARG2, DISPATCH, DISPATCH_GL(tmptv)
++  |  addi.d TMP0, r0, LJ_TTAB
++  |  addi.d TMP1, r0, LJ_TSTR
++  |  settp TAB:RB, TMP0
++  |  .ADD16I CARG3, DISPATCH, DISPATCH_GL(tmptv2)
++  |  st.d TAB:RB, 0(CARG2)
++  |  settp STR:RC, TMP1
++  |  st.d STR:RC, 0(CARG3)
++  |  b >1
++  |
++  |->vmeta_tgetb:			// TMP0 = index
++  |  .ADD16I CARG3, DISPATCH, DISPATCH_GL(tmptv)
++  |  settp TMP0, TISNUM
++  |  st.d TMP0, 0(CARG3)
++  |
++  |->vmeta_tgetv:
++  |1:
++  |  st.d BASE, L->base
++  |  or CARG1, L, r0
++  |  st.d PC, SAVE_PC(sp)
++  |  bl extern lj_meta_tget		// (lua_State *L, TValue *o, TValue *k)
++  |  // Returns TValue * (finished) or NULL (metamethod).
++  |  beqz CRET1, >3
++  |  ld.d TMP0, 0(CRET1)
++  |  st.d TMP0, 0(RA)
++  |  ins_next
++  |
++  |3:  // Call __index metamethod.
++  |  // BASE = base, L->top = new base, stack = cont/func/t/k
++  |  addi.d TMP1, BASE, -FRAME_CONT
++  |  addi.d NARGS8:RC, r0, 16		// 2 args for func(t, k).
++  |  ld.d BASE, L->top
++  |  st.d PC, -24(BASE)			// [cont|PC]
++  |  sub.d PC, BASE, TMP1
++  |  ld.d LFUNC:RB, FRAME_FUNC(BASE)	// Guaranteed to be a function here.
++  |  cleartp LFUNC:RB
++  |  b ->vm_call_dispatch_f
++  |
++  |->vmeta_tgetr:
++  |  bl extern lj_tab_getinth		// (GCtab *t, int32_t key)
++  |  // Returns cTValue * or NULL.
++  |  or TMP1, TISNIL, r0
++  |  beqz CRET1, ->BC_TGETR_Z
++  |  ld.d TMP1, 0(CRET1)
++  |  b ->BC_TGETR_Z
++  |
++  |//-----------------------------------------------------------------------
++  |
++  |->vmeta_tsets1:
++  |  .ADD16I CARG3, DISPATCH, DISPATCH_GL(tmptv)
++  |  addi.d TMP0, r0, LJ_TSTR
++  |  settp STR:RC, TMP0
++  |  st.d STR:RC, 0(CARG3)
++  |  b >1
++  |
++  |->vmeta_tsets:
++  |  .ADD16I CARG2, DISPATCH, DISPATCH_GL(tmptv)
++  |  addi.d TMP0, r0, LJ_TTAB
++  |  addi.d TMP1, r0, LJ_TSTR
++  |  settp TAB:RB, TMP0
++  |  .ADD16I CARG3, DISPATCH, DISPATCH_GL(tmptv2)
++  |  st.d TAB:RB, 0(CARG2)
++  |  settp STR:RC, TMP1
++  |  st.d STR:RC, 0(CARG3)
++  |  b  >1
++  |
++  |->vmeta_tsetb:			// TMP0 = index
++  |  .ADD16I CARG3, DISPATCH, DISPATCH_GL(tmptv)
++  |  settp TMP0, TISNUM
++  |  st.d TMP0, 0(CARG3)
++  |
++  |->vmeta_tsetv:
++  |1:
++  |  st.d BASE, L->base
++  |  or CARG1, L, r0
++  |  st.d PC, SAVE_PC(sp)
++  |  bl extern lj_meta_tset		// (lua_State *L, TValue *o, TValue *k)
++  |  // Returns TValue * (finished) or NULL (metamethod).
++  |  ld.d TMP2, 0(RA)
++  |  beqz CRET1, >3
++  |  // NOBARRIER: lj_meta_tset ensures the table is not black.
++  |  st.d TMP2, 0(CRET1)
++  |  ins_next
++  |
++  |3:  // Call __newindex metamethod.
++  |  // BASE = base, L->top = new base, stack = cont/func/t/k/(v)
++  |  addi.d TMP1, BASE, -FRAME_CONT
++  |  ld.d BASE, L->top
++  |  st.d PC, -24(BASE)			// [cont|PC]
++  |   sub.d PC, BASE, TMP1
++  |  ld.d LFUNC:RB, FRAME_FUNC(BASE)	// Guaranteed to be a function here.
++  |  addi.d NARGS8:RC, r0, 24		// 3 args for func(t, k, v)
++  |  cleartp LFUNC:RB
++  |  st.d TMP2, 16(BASE)		// Copy value to third argument.
++  |  b ->vm_call_dispatch_f
++  |
++  |->vmeta_tsetr:
++  |  st.d BASE, L->base
++  |  or CARG1, L, r0
++  |  st.d PC, SAVE_PC(sp)
++  |  bl extern lj_tab_setinth	// (lua_State *L, GCtab *t, int32_t key)
++  |  // Returns TValue *.
++  |  b ->BC_TSETR_Z
++  |
++  |//-- Comparison metamethods ---------------------------------------------
++  |
++  |->vmeta_comp:
++  |  // RA/RD point to o1/o2.
++  |  or CARG2, RA, r0
++  |  or CARG3, RD, r0
++  |  addi.d PC, PC, -4
++  |  st.d BASE, L->base
++  |  or CARG1, L, r0
++  |  decode_OP CARG4, INS
++  |  st.d PC, SAVE_PC(sp)
++  |  bl extern lj_meta_comp	// (lua_State *L, TValue *o1, *o2, int op)
++  |  // Returns 0/1 or TValue * (metamethod).
++  |3:
++  |  sltui TMP1, CRET1, 2
++  |  beqz TMP1, ->vmeta_binop
++  |  sub.w TMP2, r0, CRET1
++  |4:
++  |  ld.hu RD, OFS_RD(PC)
++  |  addi.d PC, PC, 4
++  |  addu16i.d TMP1, r0, -0x2		// -BCBIAS_J*4
++  |  slli.w RD, RD, 2
++  |  add.w RD, RD, TMP1
++  |  and RD, RD, TMP2
++  |  add.d PC, PC, RD
++  |->cont_nop:
++  |  ins_next
++  |
++  |->cont_ra:				// RA = resultptr
++  |  ld.bu TMP1, -4+OFS_RA(PC)
++  |  ld.d TMP2, 0(RA)
++  |  slli.w TMP1, TMP1, 3
++  |  add.d TMP1, BASE, TMP1
++  |  st.d TMP2, 0(TMP1)
++  |  b ->cont_nop
++  |
++  |->cont_condt:			// RA = resultptr
++  |  ld.d TMP0, 0(RA)
++  |  gettp TMP0, TMP0
++  |  sltui TMP1, TMP0, LJ_TISTRUECOND
++  |  sub.w TMP2, r0, TMP1		// Branch if result is true.
++  |  b <4
++  |
++  |->cont_condf:			// RA = resultptr
++  |  ld.d TMP0, 0(RA)
++  |  gettp TMP0, TMP0
++  |  sltui TMP1, TMP0, LJ_TISTRUECOND
++  |  addi.w TMP2, TMP1, -1		// Branch if result is false.
++  |  b <4
++  |
++  |->vmeta_equal:
++  |  // CARG1/CARG2 point to o1/o2. TMP0 is set to 0/1.
++  |  cleartp LFUNC:CARG3, CARG2
++  |  cleartp LFUNC:CARG2, CARG1
++  |  or CARG4, TMP0, r0
++  |  addi.d PC, PC, -4
++  |  st.d BASE, L->base
++  |  or CARG1, L, r0
++  |  st.d PC, SAVE_PC(sp)
++  |  bl extern lj_meta_equal		// (lua_State *L, GCobj *o1, *o2, int ne)
++  |  // Returns 0/1 or TValue * (metamethod).
++  |  b <3
++  |
++  |
++  |->vmeta_istype:
++  |  addi.d PC, PC, -4
++  |  st.d BASE, L->base
++  |  or CARG1, L, r0 
++  |  srli.w CARG2, RA, 3
++  |  srli.w CARG3, RD, 3
++  |  st.d PC, SAVE_PC(sp)
++  |  bl extern lj_meta_istype		// (lua_State *L, BCReg ra, BCReg tp)
++  |  b ->cont_nop
++  |
++  |//-- Arithmetic metamethods ---------------------------------------------
++  |
++  |->vmeta_unm:
++  |  or RC, RB, r0
++  |
++  |->vmeta_arith:
++  |  st.d BASE, L->base
++  |  or CARG2, RA, r0
++  |  st.d PC, SAVE_PC(sp)
++  |  or CARG3, RB, r0
++  |  or CARG4, RC, r0
++  |  decode_OP CARG5, INS
++  |  or CARG1, L, r0
++  |  bl extern lj_meta_arith		// (lua_State *L, TValue *ra,*rb,*rc, BCReg op)
++  |  // Returns NULL (finished) or TValue * (metamethod).
++  |  beqz CRET1, ->cont_nop
++  |
++  |  // Call metamethod for binary op.
++  |->vmeta_binop:
++  |  // BASE = old base, CRET1 = new base, stack = cont/func/o1/o2
++  |  sub.d TMP1, CRET1, BASE
++  |  st.d PC, -24(CRET1)			// [cont|PC]
++  |  or TMP2, BASE, r0
++  |  addi.d PC, TMP1, FRAME_CONT
++  |  or BASE, CRET1, r0
++  |  addi.d NARGS8:RC, r0, 16                  // 2 args for func(o1, o2).
++  |  b ->vm_call_dispatch
++  |
++  |->vmeta_len:
++  |  // CARG2 already set by BC_LEN.
++#if LJ_52
++  |  or MULTRES, CARG1, r0
++#endif
++  |  st.d BASE, L->base
++  |  or CARG1, L, r0
++  |  st.d PC, SAVE_PC(sp)
++  |  bl extern lj_meta_len		// (lua_State *L, TValue *o)
++  |  // Returns NULL (retry) or TValue * (metamethod base).
++#if LJ_52
++  |  bnez CRET1, ->vmeta_binop		// Binop call for compatibility.
++  |  or CARG1, MULTRES, r0
++  |  b ->BC_LEN_Z
++#else
++  |  b ->vmeta_binop			// Binop call for compatibility.
++#endif
++  |
++  |//-- Call metamethod ----------------------------------------------------
++  |
++  |->vmeta_call:			// Resolve and call __call metamethod.
++  |  // TMP2 = old base, BASE = new base, RC = nargs*8
++  |  st.d TMP2, L->base			// This is the caller's base!
++  |  addi.d CARG2, BASE, -16
++  |  st.d PC, SAVE_PC(sp)
++  |  add.d CARG3, BASE, RC
++  |  or CARG1, L, r0
++  |  or MULTRES, NARGS8:RC, r0
++  |  bl extern lj_meta_call		// (lua_State *L, TValue *func, TValue *top)
++  |  ld.d LFUNC:RB, FRAME_FUNC(BASE)	// Guaranteed to be a function here.
++  |  addi.d NARGS8:RC, MULTRES, 8	// Got one more argument now.
++  |  cleartp LFUNC:RB
++  |  ins_call
++  |
++  |->vmeta_callt:			// Resolve __call for BC_CALLT.
++  |  // BASE = old base, RA = new base, RC = nargs*8
++  |  st.d BASE, L->base
++  |  addi.d CARG2, RA, -16
++  |  st.d PC, SAVE_PC(sp)
++  |  add.d CARG3, RA, RC
++  |  or CARG1, L, r0
++  |  or MULTRES, NARGS8:RC, r0
++  |  bl extern lj_meta_call		// (lua_State *L, TValue *func, TValue *top)
++  |  ld.d RB, FRAME_FUNC(RA)		// Guaranteed to be a function here.
++  |  ld.d TMP1, FRAME_PC(BASE)
++  |  addi.d NARGS8:RC, MULTRES, 8	// Got one more argument now.
++  |  cleartp LFUNC:CARG3, RB
++  |  b ->BC_CALLT_Z
++  |
++  |//-- Argument coercion for 'for' statement ------------------------------
++  |
++  |->vmeta_for:
++  |  st.d BASE, L->base
++  |  or CARG2, RA, r0
++  |  st.d PC, SAVE_PC(sp)
++  |  or MULTRES, INS, r0
++  |  or CARG1, L, r0
++  |  bl extern lj_meta_for	// (lua_State *L, TValue *base)
++  |  decode_RA RA, MULTRES
++  |  decode_RD RD, MULTRES
++  |  b =>BC_FORI
++  |
++  |//-----------------------------------------------------------------------
++  |//-- Fast functions -----------------------------------------------------
++  |//-----------------------------------------------------------------------
++  |
++  |.macro .ffunc, name
++  |->ff_ .. name:
++  |.endmacro
++  |
++  |.macro .ffunc_1, name
++  |->ff_ .. name:
++  |  ld.d CARG1, 0(BASE)
++  |  beqz NARGS8:RC, ->fff_fallback
++  |.endmacro
++  |
++  |.macro .ffunc_2, name
++  |->ff_ .. name:
++  |  sltui TMP0, NARGS8:RC, 16
++  |  ld.d CARG1, 0(BASE)
++  |  ld.d CARG2, 8(BASE)
++  |  bnez TMP0, ->fff_fallback
++  |.endmacro
++  |
++  |.macro .ffunc_n, name
++  |->ff_ .. name:
++  |  ld.d CARG1, 0(BASE)
++  |  fld.d FARG1, 0(BASE)
++  |  beqz NARGS8:RC, ->fff_fallback
++  |  checknum CARG1, ->fff_fallback
++  |.endmacro
++  |
++  |.macro .ffunc_nn, name
++  |->ff_ .. name:
++  |  ld.d CARG1, 0(BASE)
++  |  ld.d CARG2, 8(BASE)
++  |  sltui TMP0, NARGS8:RC, 16
++  |  gettp TMP1, CARG1
++  |  bnez TMP0, ->fff_fallback
++  |  gettp TMP2, CARG2
++  |  sltui TMP1, TMP1, LJ_TISNUM
++  |  sltui TMP2, TMP2, LJ_TISNUM
++  |  fld.d FARG1, 0(BASE)
++  |  and TMP1, TMP1, TMP2
++  |  fld.d FARG2, 8(BASE)
++  |  beqz TMP1, ->fff_fallback
++  |.endmacro
++  |
++  |// Inlined GC threshold check.
++  |.macro ffgccheck
++  |  .LDXD TMP0, DISPATCH, DISPATCH_GL(gc.total)
++  |  .LDXD TMP1, DISPATCH, DISPATCH_GL(gc.threshold)
++  |  bltu TMP0, TMP1, >1
++  |  bl ->fff_gcstep
++  |1:
++  |.endmacro
++  |
++  |//-- Base library: checks -----------------------------------------------
++  |.ffunc_1 assert
++  |  gettp TMP1, CARG1
++  |  // PC is loaded further down, after the fallback check.
++  |  sltui TMP1, TMP1, LJ_TISTRUECOND
++  |  addi.d RA, BASE, -16
++  |  beqz TMP1, ->fff_fallback
++  |  ld.d PC, FRAME_PC(BASE)
++  |  addi.w RD, NARGS8:RC, 8		// Compute (nresults+1)*8.
++  |  addi.d TMP1, BASE, 8
++  |  add.d TMP2, RA, RD
++  |  st.d CARG1, 0(RA)
++  |  beq BASE, TMP2, ->fff_res		// Done if exactly 1 argument.
++  |1:
++  |  ld.d TMP0, 0(TMP1)
++  |  st.d TMP0, -16(TMP1)
++  |  or TMP3, TMP1, r0
++  |  addi.d TMP1, TMP1, 8
++  |  bne TMP3, TMP2, <1
++  |  b ->fff_res
++  |
++  |.ffunc_1 type
++  |  gettp TMP0, CARG1
++  |  addi.w TMP1, r0, ~LJ_TISNUM
++  |  sltu TMP2, TISNUM, TMP0
++  |  nor TMP3, TMP0, r0
++  |  masknez TMP1, TMP1, TMP2
++  |  maskeqz TMP3, TMP3, TMP2
++  |  or TMP3, TMP3, TMP1
++  |  slli.d TMP3, TMP3, 3
++  |  add.d TMP3, CFUNC:RB, TMP3
++  |  ld.d CARG1, CFUNC:TMP3->upvalue
++  |  b ->fff_restv
++  |
++  |//-- Base library: getters and setters ---------------------------------
++  |
++  |.ffunc_1 getmetatable
++  |  gettp TMP2, CARG1
++  |  addi.d TMP0, TMP2, -LJ_TTAB
++  |  addi.d TMP1, TMP2, -LJ_TUDATA
++  |  maskeqz TMP0, TMP1, TMP0		// TMP0 == 0 iff tag is LJ_TTAB or LJ_TUDATA.
++  |  cleartp TAB:CARG1
++  |  bnez TMP0, >6			// Other types: use the base metatable.
++  |1:  // Field metatable must be at same offset for GCtab and GCudata!
++  |  ld.d TAB:RB, TAB:CARG1->metatable
++  |2:
++  |  .LDXD STR:RC, DISPATCH, DISPATCH_GL(gcroot[GCROOT_MMNAME+MM_metatable])
++  |  addi.d CARG1, r0, LJ_TNIL
++  |  beqz TAB:RB, ->fff_restv		// No metatable: result is the LJ_TNIL value.
++  |  ld.w TMP0, TAB:RB->hmask
++  |  ld.w TMP1, STR:RC->sid
++  |  ld.d NODE:TMP2, TAB:RB->node
++  |  and TMP1, TMP1, TMP0		// idx = str->sid & tab->hmask
++  |  slli.d TMP0, TMP1, 5
++  |  slli.d TMP1, TMP1, 3
++  |  sub.d TMP1, TMP0, TMP1
++  |  add.d NODE:TMP2, NODE:TMP2, TMP1	// node = tab->node + (idx*32-idx*8)
++  |  addi.w CARG4, r0, LJ_TSTR
++  |  settp STR:RC, CARG4		// Tagged key to look for.
++  |3:  // Rearranged logic, because we expect _not_ to find the key.
++  |  ld.d TMP0, NODE:TMP2->key
++  |  ld.d CARG1, NODE:TMP2->val
++  |  ld.d NODE:TMP2, NODE:TMP2->next
++  |  addi.d TMP3, r0, LJ_TTAB
++  |  beq RC, TMP0, >5
++  |  bnez NODE:TMP2, <3		// Follow the node chain.
++  |4:
++  |  or CARG1, RB, r0		// Result is the metatable itself, tagged LJ_TTAB.
++  |  settp CARG1, TMP3
++  |  b ->fff_restv			// Not found, keep default result.
++  |5:
++  |  bne CARG1, TISNIL, ->fff_restv	// Key found with a non-nil value: return it.
++  |  b <4				// Ditto for nil value.
++  |
++  |6:  // Not a table or userdata: index the base metatables by type tag.
++  |  sltui TMP3, TMP2, LJ_TISNUM
++  |  maskeqz TMP0, TISNUM, TMP3		// Tags below LJ_TISNUM are treated as TISNUM.
++  |  masknez TMP3, TMP2, TMP3
++  |  or TMP2, TMP0, TMP3
++  |  slli.d TMP2, TMP2, 3
++  |  sub.d TMP0, DISPATCH, TMP2
++  |  .LDXD TAB:RB, TMP0, DISPATCH_GL(gcroot[GCROOT_BASEMT])-8
++  |  b <2
++  |
++  |.ffunc_2 setmetatable
++  |  // Fast path: no mt for table yet and not clearing the mt.
++  |  checktp TMP1, CARG1, -LJ_TTAB, ->fff_fallback
++  |  gettp TMP3, CARG2
++  |  ld.d TAB:TMP0, TAB:TMP1->metatable
++  |  ld.bu TMP2, TAB:TMP1->marked
++  |  addi.d TMP3, TMP3, -LJ_TTAB		// Zero iff the 2nd argument is a table.
++  |  cleartp TAB:CARG2
++  |  or TMP3, TMP3, TAB:TMP0		// Non-zero if not a table or a mt already exists.
++  |  bnez TMP3, ->fff_fallback
++  |  andi TMP3, TMP2, LJ_GC_BLACK		// isblack(table)
++  |  st.d TAB:CARG2, TAB:TMP1->metatable
++  |  beqz TMP3, ->fff_restv		// Not black: no barrier, return CARG1.
++  |  barrierback TAB:TMP1, TMP2, TMP0, ->fff_restv
++  |
++  |.ffunc rawget		// Needs at least 2 arguments, the first being a table.
++  |  ld.d CARG2, 0(BASE)
++  |  sltui TMP0, NARGS8:RC, 16		// TMP0 = 1 if fewer than two arguments.
++  |  gettp TMP1, CARG2
++  |  cleartp CARG2
++  |  addi.d TMP1, TMP1, -LJ_TTAB
++  |  or TMP0, TMP0, TMP1		// Non-zero on either failed check.
++  |  addi.d CARG3, BASE, 8		// CARG3 = address of the key argument.
++  |  bnez TMP0, ->fff_fallback
++  |  or CARG1, L, r0
++  |  bl extern lj_tab_get	// (lua_State *L, GCtab *t, cTValue *key)
++  |  // Returns cTValue *.
++  |  ld.d CARG1, 0(CRET1)
++  |  b ->fff_restv
++  |
++  |//-- Base library: conversions ------------------------------------------
++  |
++  |.ffunc tonumber
++  |  // Only handles the number case inline (without a base argument).
++  |  ld.d CARG1, 0(BASE)
++  |  xori TMP0, NARGS8:RC, 8		// Exactly one number argument.
++  |  gettp TMP1, CARG1
++  |  sltu TMP1, TISNUM, TMP1		// TMP1 = (tag > TISNUM), unsigned.
++  |  or TMP0, TMP0, TMP1
++  |  bnez TMP0, ->fff_fallback		// Not exactly 1 arg or CARG1 is not a number.
++  |  b ->fff_restv			// Return the argument unchanged.
++  |
++  |.ffunc_1 tostring
++  |  // Only handles the string or number case inline.
++  |  gettp TMP0, CARG1
++  |  addi.d TMP1, TMP0, -LJ_TSTR
++  |  // A __tostring method in the string base metatable is ignored.
++  |  beqz TMP1, ->fff_restv	// Already a string: return it as is.
++  |  // Handle numbers inline, unless a number base metatable is present.
++  |  .LDXD TMP1, DISPATCH, DISPATCH_GL(gcroot[GCROOT_BASEMT_NUM])
++  |  sltu TMP0, TISNUM, TMP0		// TMP0 = (tag > TISNUM), unsigned.
++  |  st.d BASE, L->base			// Add frame since C call can throw.
++  |  or TMP0, TMP0, TMP1		// Non-zero if not a number or a base mt exists.
++  |  bnez TMP0, ->fff_fallback
++  |  st.d PC, SAVE_PC(sp)		// Redundant (but a defined value).
++  |  ffgccheck
++  |  or CARG1, L, r0
++  |  or CARG2, BASE, r0		// CARG2 = address of the first argument.
++  |  bl extern lj_strfmt_number	// (lua_State *L, cTValue *o)
++  |  // Returns GCstr *.
++  |  addi.d TMP1, r0, LJ_TSTR
++  |//  ld.d BASE, L->base
++  |  settp CARG1, TMP1		// Tag the returned GCstr * as LJ_TSTR.
++  |  b ->fff_restv
++  |
++  |//-- Base library: iterators -------------------------------------------
++  |
++  |.ffunc_1 next
++  |  checktp CARG1, -LJ_TTAB, ->fff_fallback
++  |  add.d TMP0, BASE, NARGS8:RC		// TMP0 = slot just past the last argument.
++  |  ld.d PC, FRAME_PC(BASE)
++  |  st.d TISNIL, 0(TMP0)		// Set missing 2nd arg to nil.
++  |  addi.d CARG2, BASE, 8		// CARG2 = address of the key argument.
++  |  addi.d CARG3, BASE, -16		// CARG3 = output slots (result area).
++  |  bl extern lj_tab_next		// (GCtab *t, cTValue *key, TValue *o)
++  |  // Returns 1=found, 0=end, -1=error.
++  |//  addi.d RA, BASE, -16
++  |  addi.d RD, r0, (2+1)*8
++  |  blt r0, CRET1, ->fff_res		// Found key/value.
++  |  or TMP1, CRET1, r0
++  |  or CARG1, TISNIL, r0
++  |  beqz TMP1, ->fff_restv		// End of traversal: return nil.
++  |  ld.d CFUNC:RB, FRAME_FUNC(BASE)	// Reload RB/RC as ->fff_fallback expects.
++  |  addi.w RC, r0, 2*8
++  |  cleartp CFUNC:RB
++  |  b ->fff_fallback			// Invalid key.
++  |
++  |.ffunc_1 pairs		// Returns upvalue[0], the table and nil.
++  |  checktp TAB:TMP1, CARG1, -LJ_TTAB, ->fff_fallback
++  |  ld.d PC, FRAME_PC(BASE)
++#if LJ_52
++  |  ld.d TAB:TMP2, TAB:TMP1->metatable
++  |  ld.d TMP0, CFUNC:RB->upvalue[0]
++  |  addi.d RA, BASE, -16
++  |  bnez TAB:TMP2, ->fff_fallback	// Table has a metatable: take the fallback.
++#else
++  |  ld.d TMP0, CFUNC:RB->upvalue[0]
++  |  addi.d RA, BASE, -16
++#endif
++  |  st.d TISNIL, 0(BASE)		// 3rd result: nil.
++  |  st.d CARG1, -8(BASE)		// 2nd result: the (still tagged) table.
++  |  st.d TMP0, 0(RA)			// 1st result: upvalue[0].
++  |  addi.d RD, r0, (3+1)*8
++  |  b ->fff_res
++  |
++  |.ffunc_2 ipairs_aux
++  |  checktab CARG1, ->fff_fallback
++  |  checkint CARG2, ->fff_fallback
++  |  ld.w TMP0, TAB:CARG1->asize
++  |  ld.d TMP1, TAB:CARG1->array
++  |  ld.d PC, FRAME_PC(BASE)
++  |  slli.w TMP2, CARG2, 0		// Sign-extend the 32 bit index.
++  |  addi.w TMP2, TMP2, 1		// TMP2 = index + 1.
++  |  sltu TMP3, TMP2, TMP0		// TMP3 = (index+1 < asize), unsigned.
++  |  addi.d RA, BASE, -16
++  |  bstrpick.d TMP0, TMP2, 31, 0
++  |  settp TMP0, TISNUM
++  |  st.d TMP0, 0(RA)			// 1st result: the new index, tagged TISNUM.
++  |  beqz TMP3, >2			// Not in array part?
++  |  slli.d TMP3, TMP2, 3
++  |  add.d TMP3, TMP1, TMP3
++  |  ld.d TMP1, 0(TMP3)		// TMP1 = array[index+1].
++  |1:
++  |  addi.d RD, r0, (0+1)*8
++  |  beq TMP1, TISNIL, ->fff_res	// End of iteration, return 0 results.
++  |  st.d TMP1, -8(BASE)		// 2nd result: the value.
++  |  addi.d RD, r0, (2+1)*8
++  |  b ->fff_res
++  |2:  // Check for empty hash part first. Otherwise call C function.
++  |  ld.w TMP0, TAB:CARG1->hmask
++  |  addi.d RD, r0, (0+1)*8
++  |  beqz TMP0, ->fff_res
++  |  or CARG2, TMP2, r0
++  |  bl extern lj_tab_getinth		// (GCtab *t, int32_t key)
++  |  // Returns cTValue * or NULL.
++  |  addi.d RD, r0, (0+1)*8
++  |  beqz CRET1, ->fff_res
++  |  ld.d TMP1, 0(CRET1)
++  |  b <1
++  |
++  |.ffunc_1 ipairs		// Returns upvalue[0], the table and a TISNUM-tagged 0.
++  |  checktp TAB:TMP1, CARG1, -LJ_TTAB, ->fff_fallback
++  |  ld.d PC, FRAME_PC(BASE)
++#if LJ_52
++  |  ld.d TAB:TMP2, TAB:TMP1->metatable
++#endif
++  |  ld.d CFUNC:TMP0, CFUNC:RB->upvalue[0]
++  |  addi.d RA, BASE, -16
++#if LJ_52
++  |  bnez TAB:TMP2, ->fff_fallback	// Table has a metatable: take the fallback.
++#endif
++  |  slli.d TMP1, TISNUM, 47		// Tag bits only, i.e. payload 0.
++  |  st.d CARG1, -8(BASE)		// 2nd result: the (still tagged) table.
++  |  st.d TMP1, 0(BASE)		// 3rd result: initial control value.
++  |  st.d CFUNC:TMP0, 0(RA)		// 1st result: upvalue[0].
++  |  addi.d RD, r0, (3+1)*8
++  |  b ->fff_res
++  |
++  |//-- Base library: catch errors ----------------------------------------
++  |
++  |.ffunc pcall
++  |  addi.d NARGS8:RC, NARGS8:RC, -8	// Do not count the called function.
++  |  .LDXBU TMP3, DISPATCH, DISPATCH_GL(hookmask)
++  |  or TMP2, BASE, r0		// TMP2 = old BASE.
++  |  blt NARGS8:RC, r0, ->fff_fallback	// No arguments at all.
++  |  addi.d BASE, BASE, 16
++  |  // Remember active hook before pcall.
++  |  srli.w TMP3, TMP3, HOOK_ACTIVE_SHIFT
++  |  andi TMP3, TMP3, 1
++  |  addi.d PC, TMP3, 16+FRAME_PCALL	// PC = 16 + FRAME_PCALL + hook-active bit.
++  |  beqz NARGS8:RC, ->vm_call_dispatch
++  |1:  // Also entered from xpcall.
++  |  add.d TMP0, BASE, NARGS8:RC
++  |2:
++  |  ld.d TMP1, -16(TMP0)		// Move each argument up by 8 bytes, last first.
++  |  st.d TMP1, -8(TMP0)
++  |  addi.d TMP0, TMP0, -8
++  |  bne TMP0, BASE, <2
++  |  b ->vm_call_dispatch
++  |
++  |.ffunc xpcall
++  |  addi.d NARGS8:TMP0, NARGS8:RC, -16	// Do not count function and traceback.
++  |  ld.d CARG1, 0(BASE)
++  |  ld.d CARG2, 8(BASE)
++  |  .LDXBU TMP1, DISPATCH, DISPATCH_GL(hookmask)
++  |  blt NARGS8:TMP0, r0, ->fff_fallback	// Fewer than two arguments.
++  |  gettp TMP2, CARG2
++  |  addi.d TMP2, TMP2, -LJ_TFUNC
++  |  bnez TMP2, ->fff_fallback		// Traceback must be a function.
++  |  or TMP2, BASE, r0		// TMP2 = old BASE.
++  |  or NARGS8:RC, NARGS8:TMP0, r0
++  |  addi.d BASE, BASE, 24
++  |  // Remember active hook before pcall. hookmask was loaded into TMP1 above.
++  |  srli.w TMP3, TMP1, HOOK_ACTIVE_SHIFT
++  |  st.d CARG2, 0(TMP2)			// Swap function and traceback.
++  |  andi TMP3, TMP3, 1
++  |  st.d CARG1, 8(TMP2)
++  |  addi.d PC, TMP3, 24+FRAME_PCALL	// PC = 24 + FRAME_PCALL + hook-active bit.
++  |  beqz NARGS8:RC, ->vm_call_dispatch
++  |  b <1				// Share the argument move loop of pcall.
++  |
++  |//-- Coroutine library --------------------------------------------------
++  |
++  |.macro coroutine_resume_wrap, resume
++  |.if resume
++  |.ffunc_1 coroutine_resume
++  |  checktp CARG1, CARG1, -LJ_TTHREAD, ->fff_fallback
++  |.else
++  |.ffunc coroutine_wrap_aux
++  |  ld.d L:CARG1, CFUNC:RB->upvalue[0].gcr	// The coroutine is upvalue[0].
++  |  cleartp L:CARG1
++  |.endif
++  |  ld.bu TMP0, L:CARG1->status
++  |  ld.d TMP1, L:CARG1->cframe
++  |  ld.d CARG2, L:CARG1->top
++  |  ld.d TMP2, L:CARG1->base
++  |  addi.w CARG4, TMP0, -LUA_YIELD	// CARG4 = status - LUA_YIELD.
++  |  add.d CARG3, CARG2, TMP0		// CARG3 = top + status.
++  |  addi.d TMP3, CARG2, 8
++  |  masknez CARG2, CARG2, CARG4	// CARG2 = top if status == LUA_YIELD,
++  |  maskeqz TMP3, TMP3, CARG4		// otherwise top + 8.
++  |  or CARG2, TMP3, CARG2
++  |  blt r0, CARG4, ->fff_fallback		// st > LUA_YIELD?
++  |  xor TMP2, TMP2, CARG3
++  |  or CARG4, TMP2, TMP0
++  |  bnez TMP1, ->fff_fallback		// cframe != 0?
++  |  ld.d TMP0, L:CARG1->maxstack
++  |  ld.d PC, FRAME_PC(BASE)
++  |  beqz CARG4, ->fff_fallback		// base == top && st == 0?
++  |  add.d TMP2, CARG2, NARGS8:RC	// TMP2 = new coroutine top.
++  |  sltu CARG4, TMP0, TMP2
++  |  st.d BASE, L->base
++  |  st.d PC, SAVE_PC(sp)
++  |  bnez CARG4, ->fff_fallback		// Stack overflow?
++  |1:
++  |.if resume
++  |  addi.d BASE, BASE, 8		// Keep resumed thread in stack for GC.
++  |  addi.d NARGS8:RC, NARGS8:RC, -8
++  |  addi.d TMP2, TMP2, -8
++  |.endif
++  |  st.d TMP2, L:CARG1->top
++  |  st.d BASE, L->top
++  |  add.d TMP1, BASE, NARGS8:RC	// TMP1 = end of the arguments to move.
++  |  or CARG3, CARG2, r0
++  |2:  // Move args to coroutine.
++  |  ld.d TMP0, 0(BASE)
++  |  sltu TMP3, BASE, TMP1
++  |  addi.d BASE, BASE, 8
++  |  beqz TMP3, >3
++  |  st.d TMP0, 0(CARG3)
++  |  addi.d CARG3, CARG3, 8
++  |  b <2
++  |3:
++  |  or L:RA, L:CARG1, r0		// Keep the coroutine in RA across the call.
++  |  bl ->vm_resume			// (lua_State *L, TValue *base, 0, 0)
++  |  // Returns thread status.
++  |4:
++  |  ld.d TMP2, L:RA->base
++  |  sltui TMP1, CRET1, LUA_YIELD+1	// TMP1 = (status <= LUA_YIELD).
++  |  ld.d TMP3, L:RA->top
++  |  li_vmstate INTERP
++  |  ld.d BASE, L->base
++  |  .STXD L, DISPATCH, DISPATCH_GL(cur_L)
++  |  st_vmstate
++  |  sub.d RD, TMP3, TMP2		// RD = size of the results in bytes.
++  |  beqz TMP1, >8			// Status above LUA_YIELD: error.
++  |  ld.d TMP0, L->maxstack
++  |  add.d TMP1, BASE, RD
++  |  beqz RD, >6			// No results?
++  |  add.d TMP3, TMP2, RD
++  |  bltu TMP0, TMP1, >9		// Need to grow stack?
++  |  st.d TMP2, L:RA->top		// Clear coroutine stack.
++  |  or TMP1, BASE, r0
++  |5:  // Move results from coroutine.
++  |  ld.d TMP0, 0(TMP2)
++  |  addi.d TMP2, TMP2, 8
++  |  st.d TMP0, 0(TMP1)
++  |  addi.d TMP1, TMP1, 8
++  |  bltu TMP2, TMP3, <5
++  |6:
++  |.if resume
++  |  mov_true TMP1
++  |  addi.d RD, RD, 16
++  |7:
++  |  st.d TMP1, -8(BASE)	// Prepend true/false to results.
++  |  addi.d RA, BASE, -8
++  |.else
++  |  or RA, BASE, r0
++  |  addi.d RD, RD, 8
++  |.endif
++  |  andi TMP0, PC, FRAME_TYPE
++  |  st.d PC, SAVE_PC(sp)
++  |  or MULTRES, RD, r0
++  |  beqz TMP0, ->BC_RET_Z
++  |  b ->vm_return
++  |
++  |8:  // Coroutine returned with error (at co->top-1).
++  |.if resume
++  |  addi.d TMP3, TMP3, -8
++  |  mov_false TMP1
++  |  addi.w RD, r0, (2+1)*8
++  |  ld.d TMP0, 0(TMP3)
++  |  st.d TMP3, L:RA->top		// Remove error from coroutine stack.
++  |  st.d TMP0, 0(BASE)			// Copy error message.
++  |  b <7
++  |.else
++  |  or CARG1, L, r0
++  |  or CARG2, L:RA, r0
++  |  bl extern lj_ffh_coroutine_wrap_err  // (lua_State *L, lua_State *co)
++  |.endif
++  |
++  |9:  // Handle stack expansion on return from yield.
++  |  or CARG1, L, r0
++  |  srli.w CARG2, RD, 3		// Number of result slots.
++  |  bl extern lj_state_growstack	// (lua_State *L, int n)
++  |  addi.d CRET1, r0, 0		// Status 0, then redo the result handling.
++  |  b <4
++  |.endmacro
++  |
++  |  coroutine_resume_wrap 1		// coroutine.resume
++  |  coroutine_resume_wrap 0		// coroutine.wrap
++  |
++  |.ffunc coroutine_yield
++  |  ld.d TMP0, L->cframe
++  |  add.d TMP1, BASE, NARGS8:RC	// TMP1 = end of the arguments.
++  |  addi.w CRET1, r0, LUA_YIELD
++  |  st.d BASE, L->base
++  |  andi TMP0, TMP0, CFRAME_RESUME
++  |  st.d TMP1, L->top
++  |  beqz TMP0, ->fff_fallback		// CFRAME_RESUME not set in cframe.
++  |  st.d r0, L->cframe
++  |  st.b CRET1, L->status		// status = LUA_YIELD.
++  |  b ->vm_leave_unw
++  |
++  |//-- Math library -------------------------------------------------------
++  |
++  |.macro math_round, func
++  |->ff_math_ .. func:		// Fast function wrapper around ->vm_<func>.
++  |  ld.d CARG1, 0(BASE)
++  |  gettp TMP0, CARG1
++  |  beqz NARGS8:RC, ->fff_fallback
++  |  beq TMP0, TISNUM, ->fff_restv	// TISNUM-tagged argument: return it unchanged.
++  |  fld.d FARG1, 0(BASE)
++  |  bgeu TMP0, TISNUM, ->fff_fallback	// Tag not below TISNUM: not a number.
++  |  bl ->vm_ .. func
++  |  b ->fff_resn
++  |.endmacro
++  |
++  |  math_round floor
++  |  math_round ceil
++  |
++  |.ffunc_1 math_abs
++  |  gettp CARG2, CARG1
++  |  addi.d TMP2, CARG2, -LJ_TISNUM
++  |  slli.w TMP1, CARG1, 0		// Sign-extend the low 32 bits.
++  |  bnez TMP2, >1			// Tag is not LJ_TISNUM: try the number path.
++  |  srai.w TMP0, TMP1, 31			// Integer case: TMP0 = sign mask.
++  |  xor TMP1, TMP1, TMP0
++  |  sub.d CARG1, TMP1, TMP0		// CARG1 = (x ^ mask) - mask.
++  |  slli.d TMP3, CARG1, 32		// Move bit 31 of the result to bit 63.
++  |  settp CARG1, TISNUM
++  |  bge TMP3, r0, ->fff_restv		// Bit 31 clear: tagged result is valid.
++  |  ori CARG1, r0, 0x41e		// 2^31 as a double.
++  |  slli.w CARG1, CARG1, 4		// 0x41e0
++  |  slli.d CARG1, CARG1, 48
++  |  b ->fff_restv
++  |1:
++  |  sltui TMP2, CARG2, LJ_TISNUM
++  |  bstrpick.d CARG1, CARG1, 62, 0	// Clear bit 63.
++  |  beqz TMP2, ->fff_fallback		// Tag is not below LJ_TISNUM.
++  |// fallthrough
++  |
++  |->fff_restv:
++  |  // CARG1 = TValue result.
++  |  ld.d PC, FRAME_PC(BASE)
++  |  st.d CARG1, -16(BASE)
++  |->fff_res1:
++  |  // RA = results, PC = return.
++  |  addi.d RD, r0, (1+1)*8
++  |->fff_res:
++  |  // RA = results, RD = (nresults+1)*8, PC = return.
++  |  andi TMP0, PC, FRAME_TYPE
++  |  or MULTRES, RD, r0
++  |  addi.d RA, BASE, -16
++  |  bnez TMP0, ->vm_return		// FRAME_TYPE bits set: leave via vm_return.
++  |  ld.w INS, -4(PC)
++  |  decode_RB RB, INS
++  |5:
++  |  sltu TMP2, RD, RB
++  |  decode_RA TMP0, INS
++  |  bnez TMP2, >6			// More results expected?
++  |  // Adjust BASE. KBASE is assumed to be set for the calling frame.
++  |  sub.d BASE, RA, TMP0
++  |  ins_next
++  |
++  |6:  // Fill up results with nil.
++  |  add.d TMP1, RA, RD
++  |  addi.d RD, RD, 8
++  |  st.d TISNIL, -8(TMP1)
++  |  b <5
++  |
++  |.macro math_extern, func
++  |  .ffunc_n math_ .. func		// One number argument, loaded into FARG1.
++  |  bl extern func
++  |  b ->fff_resn			// Store FRET1 as the single result.
++  |.endmacro
++  |
++  |.macro math_extern2, func
++  |  .ffunc_nn math_ .. func		// Two number arguments in FARG1/FARG2.
++  |  bl extern func
++  |  b ->fff_resn			// Store FRET1 as the single result.
++  |.endmacro
++  |
++  |.ffunc_n math_sqrt
++  |  fsqrt.d FRET1, FARG1
++  |->fff_resn:			// Shared exit: FRET1 = number result.
++  |  ld.d PC, FRAME_PC(BASE)
++  |  fst.d FRET1, -16(BASE)
++  |  b ->fff_res1
++  |
++  |.ffunc math_log		// Only the 1-argument form is handled inline.
++  |  addi.d TMP1, r0, 8
++  |  ld.d CARG1, 0(BASE)
++  |  fld.d FARG1, 0(BASE)
++  |  bne NARGS8:RC, TMP1, ->fff_fallback		// Need exactly 1 argument.
++  |  checknum CARG1, ->fff_fallback
++  |  bl extern log
++  |  b ->fff_resn
++  |
++  |  math_extern log10
++  |  math_extern exp
++  |  math_extern sin
++  |  math_extern cos
++  |  math_extern tan
++  |  math_extern asin
++  |  math_extern acos
++  |  math_extern atan
++  |  math_extern sinh
++  |  math_extern cosh
++  |  math_extern tanh
++  |  math_extern2 pow
++  |  math_extern2 atan2
++  |  math_extern2 fmod
++  |
++  |.ffunc_2 math_ldexp
++  |  checknum CARG1, ->fff_fallback
++  |  checkint CARG2, ->fff_fallback
++  |  fld.d FARG1, 0(BASE)
++  |  ld.w CARG1, 8(BASE)		// Low 32 bits of the 2nd argument = exponent.
++  |  bl extern ldexp			// (double x, int exp)
++  |  b ->fff_resn
++  |
++  |.ffunc_n math_frexp
++  |  ld.d PC, FRAME_PC(BASE)
++  |  .ADD16I CARG1, DISPATCH, DISPATCH_GL(tmptv)	// Exponent is written to g->tmptv.
++  |  bl extern frexp
++  |  .LDXW TMP1, DISPATCH, DISPATCH_GL(tmptv)
++  |  movgr2fr.w FARG2, TMP1
++  |  fst.d FRET1, -16(BASE)		// 1st result: value returned by frexp.
++  |  ffint.d.w FARG2, FARG2		// Convert the int exponent to a double.
++  |  fst.d FARG2, -8(BASE)		// 2nd result: exponent.
++  |  addi.d RD, r0, (2+1)*8
++  |  b ->fff_res
++  |
++  |.ffunc_n math_modf
++  |  addi.d CARG1, BASE, -16		// modf writes through this pointer (1st result).
++  |  ld.d PC, FRAME_PC(BASE)
++  |  bl extern modf
++  |  fst.d FRET1, -8(BASE)		// 2nd result: value returned by modf.
++  |  addi.d RD, r0, (2+1)*8
++  |  b ->fff_res
++  |
++  |.macro math_minmax, name, intins, intinsc, fpins
++  |  .ffunc_1 name
++  |  add.d TMP3, BASE, NARGS8:RC		// TMP3 = end of the arguments.
++  |  addi.d TMP2, BASE, 8		// TMP2 = next argument to fold in.
++  |  checkint CARG1, >4
++  |1:  // Handle integers.
++  |  ld.d CARG2, 0(TMP2)
++  |  beq TMP2, TMP3, ->fff_restv
++  |  slli.w CARG1, CARG1, 0
++  |  checkint CARG2, >3
++  |  slli.w CARG2, CARG2, 0
++  |  slt TMP0, CARG1, CARG2
++  |  intins TMP1, CARG2, TMP0		// Select CARG2 or 0 ...
++  |  intinsc CARG1, CARG1, TMP0		// ... and 0 or CARG1, then merge.
++  |  or CARG1, CARG1, TMP1
++  |  addi.d TMP2, TMP2, 8
++  |  bstrpick.d CARG1, CARG1, 31, 0
++  |  settp CARG1, TISNUM
++  |  b <1
++  |
++  |3:  // Convert intermediate result to number and continue with number loop.
++  |  movgr2fr.w FRET1, CARG1
++  |  checknum CARG2, ->fff_fallback
++  |  ffint.d.w FRET1, FRET1
++  |  fld.d FTMP4, 0(TMP2)		// Operand goes to FTMP4, as read at label 6.
++  |  b >6
++  |
++  |4:
++  |  fld.d FRET1, 0(BASE)		// Accumulate in FRET1: ->fff_resn stores FRET1.
++  |5:  // Handle numbers.
++  |  ld.d CARG2, 0(TMP2)
++  |  checknum CARG1, ->fff_fallback
++  |  fld.d FTMP4, 0(TMP2)
++  |  beq TMP2, TMP3, ->fff_resn
++  |  checknum CARG2, >7
++  |6:
++  |  fpins FRET1, FRET1, FTMP4
++  |  fmov.d FTMP3, FRET1		// FTMP3 is no longer read; kept to preserve line count.
++  |  addi.d TMP2, TMP2, 8
++  |  b <5
++  |
++  |7:  // Convert integer to number and continue with number loop.
++  |  fld.s FTMP4, 0(TMP2)
++  |  checkint CARG2, ->fff_fallback
++  |  ffint.d.w FTMP4, FTMP4
++  |  b <6
++  |.endmacro
++  |
++  |  math_minmax math_min, masknez, maskeqz, fmin.d
++  |  math_minmax math_max, maskeqz, masknez, fmax.d
++  |
++  |//-- String library -----------------------------------------------------
++  |
++  |.ffunc string_byte			// Only handle the 1-arg case here.
++  |  ld.d CARG1, 0(BASE)
++  |  gettp TMP0, CARG1
++  |  xori TMP1, NARGS8:RC, 8		// Zero iff exactly one argument.
++  |  addi.d TMP0, TMP0, -LJ_TSTR
++  |  or TMP1, TMP1, TMP0
++  |  cleartp STR:CARG1
++  |  bnez TMP1, ->fff_fallback		// Need exactly 1 string argument.
++  |  ld.w TMP0, STR:CARG1->len
++  |  ld.d PC, FRAME_PC(BASE)
++  |  sltu RD, r0, TMP0		// RD = (len != 0).
++  |  ld.bu TMP2, STR:CARG1[1]		// Access is always ok (NUL at end).
++  |  addi.w RD, RD, 1
++  |  slli.w RD, RD, 3			// RD = ((str->len != 0)+1)*8
++  |  settp TMP2, TISNUM
++  |  st.d TMP2, -16(BASE)		// Result: first byte, tagged TISNUM.
++  |  b ->fff_res
++  |
++  |.ffunc string_char			// Only handle the 1-arg case here.
++  |  ffgccheck
++  |  ld.d CARG1, 0(BASE)
++  |  gettp TMP0, CARG1
++  |  xori TMP1, NARGS8:RC, 8		// Need exactly 1 argument.
++  |  addi.d TMP0, TMP0, -LJ_TISNUM	// Integer.
++  |  addi.d TMP2, r0, 255
++  |  slli.w CARG1, CARG1, 0		// Sign-extend the low 32 bits.
++  |  or TMP1, TMP1, TMP0
++  |  sltu TMP2, TMP2, CARG1		// !(255 < n).
++  |  or TMP1, TMP1, TMP2		// Non-zero if any of the three checks failed.
++  |  addi.d CARG3, r0, 1		// Length 1.
++  |  bnez TMP1, ->fff_fallback
++  |  addi.d CARG2, sp, TMPD_OFS		// CARG2 = address of the temporary byte.
++  |  st.b CARG1, TMPD(sp)
++  |->fff_newstr:			// CARG2 = char *str, CARG3 = length.
++  |  st.d BASE, L->base
++  |  st.d PC, SAVE_PC(sp)
++  |  or CARG1, L, r0
++  |  bl extern lj_str_new		// (lua_State *L, char *str, size_t l)
++  |  // Returns GCstr *.
++  |  ld.d BASE, L->base
++  |->fff_resstr:			// CRET1 = GCstr * to return.
++  |  addi.d TMP1, r0, LJ_TSTR
++  |  settp CRET1, TMP1
++  |  b ->fff_restv
++  |
++  |.ffunc string_sub		// Inline for (str, int start [, int end]).
++  |  ffgccheck
++  |  ld.d CARG1, 0(BASE)
++  |  ld.d CARG2, 8(BASE)
++  |  ld.d CARG3, 16(BASE)
++  |  addi.d TMP0, NARGS8:RC, -16
++  |  gettp TMP1, CARG1
++  |  blt TMP0, r0, ->fff_fallback	// Fewer than two arguments.
++  |  cleartp STR:CARG1, CARG1
++  |  addi.w CARG4, r0, -1		// Default end = -1.
++  |  beqz TMP0, >1			// Exactly two arguments: keep the default.
++  |  slli.w CARG4, CARG3, 0
++  |  checkint CARG3, ->fff_fallback
++  |1:
++  |  checkint CARG2, ->fff_fallback
++  |  addi.d TMP0, TMP1, -LJ_TSTR
++  |  slli.w CARG3, CARG2, 0
++  |  bnez TMP0, ->fff_fallback		// 1st argument must be a string.
++  |  ld.w CARG2, STR:CARG1->len
++  |  // STR:CARG1 = str, CARG2 = str->len, CARG3 = start, CARG4 = end
++  |  addi.w TMP0, CARG2, 1
++  |  slt TMP3, CARG4, r0
++  |  add.w TMP2, CARG4, TMP0
++  |  slt TMP1, CARG3, r0
++  |  maskeqz TMP2, TMP2, TMP3
++  |  masknez CARG4, CARG4, TMP3
++  |  or CARG4, TMP2, CARG4		// if (end < 0) end += len+1
++  |  add.w TMP2, CARG3, TMP0
++  |  maskeqz TMP2, TMP2, TMP1
++  |  masknez CARG3, CARG3, TMP1
++  |  or CARG3, TMP2, CARG3		// if (start < 0) start += len+1
++  |  addi.d TMP3, r0, 1
++  |  slt TMP2, CARG4, r0
++  |  slt TMP1, r0, CARG3
++  |  masknez CARG4, CARG4, TMP2		// if (end < 0) end = 0
++  |  maskeqz CARG3, CARG3, TMP1
++  |  masknez TMP3, TMP3, TMP1
++  |  slt TMP2, CARG2, CARG4
++  |  or CARG3, TMP3, CARG3		// if (start < 1) start = 1
++  |  masknez CARG4, CARG4, TMP2
++  |  maskeqz CARG2, CARG2, TMP2
++  |  or CARG4, CARG2, CARG4		// if (end > len) end = len
++  |  add.d CARG2, STR:CARG1, CARG3
++  |  sub.d CARG3, CARG4, CARG3		// len = end - start
++  |  addi.d CARG2, CARG2, sizeof(GCstr)-1	// CARG2 = str + sizeof(GCstr) + start-1.
++  |  addi.w CARG3, CARG3, 1             // len += 1
++  |  bge CARG3, r0, ->fff_newstr
++  |->fff_emptystr:  // Return empty string.
++  |  addi.d TMP1, r0, LJ_TSTR
++  |  .ADD16I STR:CARG1, DISPATCH, DISPATCH_GL(strempty)
++  |  settp CARG1, TMP1
++  |  b ->fff_restv
++  |
++  |.macro ffstring_op, name
++  |  .ffunc string_ .. name		// Runs lj_buf_putstr_<name> on g->tmpbuf.
++  |  ffgccheck
++  |  ld.d CARG2, 0(BASE)
++  |  beqz NARGS8:RC, ->fff_fallback
++  |  checkstr STR:CARG2, ->fff_fallback
++  |  .ADD16I SBUF:CARG1, DISPATCH, DISPATCH_GL(tmpbuf)
++  |  ld.d TMP0, SBUF:CARG1->b
++  |  st.d L, SBUF:CARG1->L
++  |  st.d BASE, L->base
++  |  st.d TMP0, SBUF:CARG1->w		// Reset the write pointer: w = b.
++  |  st.d PC, SAVE_PC(sp)
++  |  bl extern lj_buf_putstr_ .. name
++  |//  or SBUF:CARG1, SBUF:CRET1, r0
++  |  bl extern lj_buf_tostr
++  |  ld.d BASE, L->base
++  |  b ->fff_resstr
++  |.endmacro
++  |
++  |ffstring_op reverse
++  |ffstring_op lower
++  |ffstring_op upper
++  |
++  |//-- Bit library --------------------------------------------------------
++  |
++  |->vm_tobit_fb:			// In: TMP1 = (tag < LJ_TISNUM). Out: CRET1.
++  |  fld.d FARG1, 0(BASE)
++  |  beqz TMP1, ->fff_fallback		// First argument is not a number.
++  |  fadd.d FARG1, FARG1, TOBIT
++  |  movfr2gr.s CRET1, FARG1		// Take the low 32 bits of the sum.
++  |  bstrpick.d CRET1, CRET1, 31, 0	// Zero-extend to 64 bits.
++  |  jirl r0, ra, 0
++  |
++  |.macro .ffunc_bit, name
++  |  .ffunc_1 bit_..name		// Leaves the zero-extended 32 bit operand in CRET1.
++  |  gettp TMP0, CARG1
++  |  bstrpick.d CRET1, CARG1, 31, 0
++  |  beq TMP0, TISNUM, >1		// TISNUM-tagged: payload is already in CRET1.
++  |  sltui TMP1, TMP0, LJ_TISNUM
++  |  bl ->vm_tobit_fb
++  |1:
++  |.endmacro
++  |
++  |.macro .ffunc_bit_op, name, bins
++  |  .ffunc_bit name
++  |  addi.d TMP2, BASE, 8		// TMP2 = next argument.
++  |  add.d TMP3, BASE, NARGS8:RC	// TMP3 = end of the arguments.
++  |1:
++  |  ld.d TMP1, 0(TMP2)
++  |  beq TMP2, TMP3, ->fff_resi		// All arguments folded in.
++  |  gettp TMP0, TMP1
++  |  addi.d TMP2, TMP2, 8
++  |  bne TMP0, TISNUM, >2
++  |  bstrpick.d TMP1, TMP1, 31, 0
++  |  bins CRET1, CRET1, TMP1
++  |  b <1
++  |2:  // Argument is not TISNUM-tagged.
++  |  fld.d FARG1, -8(TMP2)
++  |  sltui TMP0, TMP0, LJ_TISNUM
++  |  fadd.d FARG1, FARG1, TOBIT
++  |  beqz TMP0, ->fff_fallback		// Tag is not below LJ_TISNUM.
++  |  movfr2gr.s TMP1, FARG1
++  |  bstrpick.d TMP1, TMP1, 31, 0
++  |  bins CRET1, CRET1, TMP1
++  |  b <1
++  |.endmacro
++  |
++  |.ffunc_bit_op band, and
++  |.ffunc_bit_op bor, or
++  |.ffunc_bit_op bxor, xor
++  |
++  |.ffunc_bit bswap		// Reverses the four bytes of the 32 bit operand.
++  |  srli.d TMP0, CRET1, 8
++  |  srli.d TMP1, CRET1, 24		// TMP1 bits 7..0 = input bits 31..24.
++  |  srli.d TMP2,TMP0, 8
++  |  andi TMP3, TMP2, 0xff		// Input bits 23..16.
++  |  slli.d TMP3, TMP3, 8		// ... moved to bits 15..8.
++  |  bstrins.d TMP1, CRET1, 31, 24	// Input bits 7..0 to bits 31..24.
++  |  bstrins.d TMP3, TMP0, 23, 16	// Input bits 15..8 to bits 23..16.
++  |  or CRET1, TMP1, TMP3
++  |  b ->fff_resi
++  |
++  |.ffunc_bit tobit
++  |->fff_resi:			// Shared exit for the bit functions.
++  |  settp CARG1, TISNUM	// CARG1 = CRET1
++  |  b ->fff_restv
++  |
++  |.ffunc_bit bnot
++  |  nor CRET1, CRET1, r0		// CRET1 = ~CRET1.
++  |  bstrpick.d CRET1, CRET1, 31, 0	// Keep only the low 32 bits.
++  |  b ->fff_resi
++  |
++  |.macro .ffunc_bit_sh, name, shins, shmod
++  |  .ffunc_2 bit_..name
++  |  gettp TMP0, CARG1
++  |  beq TMP0, TISNUM, >1
++  |  sltui TMP1, TMP0, LJ_TISNUM
++  |  bl ->vm_tobit_fb
++  |//  or CARG1, CRET1, r0		// CARG1 = CRET1
++  |1:
++  |  gettp TMP0, CARG2
++  |  bstrpick.d CARG2, CARG2, 31, 0
++  |  bne TMP0, TISNUM, ->fff_fallback	// Shift count must be TISNUM-tagged.
++  |  slli.w CARG1, CARG1, 0		// Sign-extend the 32 bit operand.
++  |.if shmod == 1
++  |  sub.w CARG2, r0, CARG2		// Negate the count.
++  |.endif
++  |  shins CRET1, CARG1, CARG2
++  |  bstrpick.d CRET1, CRET1, 31, 0
++  |  b ->fff_resi
++  |.endmacro
++  |
++  |.ffunc_bit_sh lshift, sll.w, 0
++  |.ffunc_bit_sh rshift, srl.w, 0
++  |.ffunc_bit_sh arshift, sra.w, 0
++  |.ffunc_bit_sh rol, rotr.w, 1
++  |.ffunc_bit_sh ror, rotr.w, 0
++  |
++  |//-----------------------------------------------------------------------
++  |
++  |->fff_fallback:			// Call fast function fallback handler.
++  |  // BASE = new base, RB = CFUNC, RC = nargs*8
++  |  ld.d PC, FRAME_PC(BASE)		// Fallback may overwrite PC.
++  |  ld.d CARG3, CFUNC:RB->f
++  |  add.d TMP1, BASE, NARGS8:RC
++  |  st.d BASE, L->base
++  |  addi.d TMP0, TMP1, 8*LUA_MINSTACK
++  |  ld.d TMP2, L->maxstack
++  |  st.d PC, SAVE_PC(sp)			// Redundant (but a defined value).
++  |  st.d TMP1, L->top
++  |  or CARG1, L, r0
++  |  bltu TMP2, TMP0, >5			// Need to grow stack.
++  |  jirl r1, CARG3, 0				// (lua_State *L)
++  |  // Either throws an error, or recovers and returns -1, 0 or nresults+1.
++  |  ld.d BASE, L->base
++  |  slli.w RD, CRET1, 3
++  |  blt r0, CRET1, ->fff_res		// Returned nresults+1?
++  |1:  // Returned 0 or -1: retry fast path.
++  |  ld.d LFUNC:RB, FRAME_FUNC(BASE)
++  |  ld.d TMP0, L->top
++  |  sub.d NARGS8:RC, TMP0, BASE	// RC = L->top - BASE.
++  |  cleartp LFUNC:RB
++  |  bnez CRET1, ->vm_call_tail		// Returned -1?
++  |  ins_callt				// Returned 0: retry fast path.
++  |
++  |// Reconstruct previous base for vmeta_call during tailcall.
++  |->vm_call_tail:
++  |  andi TMP0, PC, FRAME_TYPE
++  |  addi.d TMP2, r0, ~FRAME_TYPEP	// TODO
++  |  and TMP1, PC, TMP2		// TMP1 = PC with the FRAME_TYPEP bits cleared.
++  |  bnez TMP0, >3
++  |  ld.bu TMP1, OFS_RA(PC)		// FRAME_TYPE bits clear: TMP1 = RA*8 + 16.
++  |  slli.w TMP1, TMP1, 3
++  |  addi.w TMP1, TMP1, 16
++  |3:
++  |  sub.d TMP2, BASE, TMP1
++  |  b ->vm_call_dispatch		// Resolve again for tailcall.
++  |
++  |5:  // Grow stack for fallback handler.
++  |  addi.d CARG2, r0, LUA_MINSTACK
++  |  or CARG1, L, r0
++  |  bl extern lj_state_growstack	// (lua_State *L, int n)
++  |  ld.d BASE, L->base
++  |  addi.d CRET1, r0, 0		// Set zero-flag to force retry.
++  |  b <1
++  |
++  |->fff_gcstep:			// Call GC step function.
++  |  // BASE = new base, RC = nargs*8
++  |  or MULTRES, ra, r0		// Save the return address across the C call.
++  |  add.d TMP0, BASE, NARGS8:RC	// Calculate L->top.
++  |  st.d BASE, L->base
++  |  st.d PC, SAVE_PC(sp)		// Redundant (but a defined value).
++  |  or CARG1, L, r0
++  |  st.d TMP0, L->top
++  |  bl extern lj_gc_step		// (lua_State *L)
++  |  ld.d BASE, L->base
++  |//  or ra, MULTRES, r0
++  |  ld.d TMP0, L->top
++  |  ld.d CFUNC:RB, FRAME_FUNC(BASE)
++  |  cleartp CFUNC:RB
++  |  sub.d NARGS8:RC, TMP0, BASE	// Recompute RC = L->top - BASE.
++  |  jirl r0, MULTRES, 0		// Return through the saved address.
++  |
++  |//-----------------------------------------------------------------------
++  |//-- Special dispatch targets -------------------------------------------
++  |//-----------------------------------------------------------------------
++  |
++  |->vm_record:				// Dispatch target for recording phase.
++  |
++  |->vm_rethook:			// Dispatch target for return hooks.
++  |  .LDXBU TMP3, DISPATCH, DISPATCH_GL(hookmask)
++  |  andi TMP1, TMP3, HOOK_ACTIVE		// Hook already active?
++  |  beqz TMP1, >1
++  |5:  // Re-dispatch to static ins.
++  |  ld.d TMP1, GG_DISP2STATIC(TMP0)	// Assumes TMP0 holds DISPATCH + scaled opcode (cf. label 4).
++  |  jirl r0, TMP1, 0
++  |
++  |->vm_inshook:			// Dispatch target for instr/line hooks.
++  |  .LDXBU TMP3, DISPATCH, DISPATCH_GL(hookmask)
++  |  .LDXW TMP2, DISPATCH, DISPATCH_GL(hookcount)
++  |  andi TMP1, TMP3, HOOK_ACTIVE		// Hook already active?
++  |  bnez TMP1, <5
++  |  andi TMP1, TMP3, LUA_MASKLINE|LUA_MASKCOUNT
++  |  addi.w TMP2, TMP2, -1
++  |  beqz TMP1, <5
++  |  .STXW TMP2, DISPATCH, DISPATCH_GL(hookcount)
++  |  beqz TMP2, >1			// Decremented hookcount reached zero.
++  |  andi TMP1, TMP3, LUA_MASKLINE
++  |  beqz TMP1, <5
++  |1:
++  |  st.w MULTRES, TMPD(sp)
++  |  or CARG2, PC, r0
++  |  st.d BASE, L->base
++  |  or CARG1, L, r0
++  |  // SAVE_PC must hold the _previous_ PC. The callee updates it with PC.
++  |  bl extern lj_dispatch_ins	// (lua_State *L, const BCIns *pc)
++  |3:
++  |  ld.d BASE, L->base
++  |4:  // Re-dispatch to static ins.
++  |  ld.w INS, -4(PC)
++  |  decode_OP TMP1, INS
++  |  decode_BC8b TMP1
++  |  add.d TMP0, DISPATCH, TMP1
++  |  decode_RD RD, INS
++  |  ld.d TMP1, GG_DISP2STATIC(TMP0)
++  |  decode_RA RA, INS
++  |  jirl r0, TMP1, 0
++  |
++  |->cont_hook:				// Continue from hook yield.
++  |  addi.d PC, PC, 4
++  |  ld.w MULTRES, -24(RB)		// Restore MULTRES for *M ins.
++  |  b <4
++  |
++  |
++  |->vm_callhook:			// Dispatch target for call hooks.
++  |  or CARG2, PC, r0
++  |
++  |
++  |->vm_profhook:			// Dispatch target for profiler hook.
++#if LJ_HASPROFILE
++  |  or CARG1, L, r0
++  |  or CARG2, PC, r0
++  |  st.d BASE, L->base
++  |  st.w MULTRES, TMPD(sp)
++  |  bl extern lj_dispatch_profile	// (lua_State *L, const BCIns *pc)
++  |  // HOOK_PROFILE is off again, so re-dispatch to dynamic instruction.
++  |  addi.d PC, PC, -4
++  |  ld.d BASE, L->base
++  |  b ->cont_nop
++#endif
++  |
++  |
++  |//-----------------------------------------------------------------------
++  |//-- Math helper functions ----------------------------------------------
++  |//-----------------------------------------------------------------------
++  |
++  |// Hard-float round to integer. In/out: FARG1. Returns through ra.
++  |// Modifies TMP0, TMP1, FARG1, FARG5, FTMP1, FTMP3, FTMP4
++  |.macro vm_round_hf, func
++  |  addu16i.d TMP0, r0, 0x4330		// Hiword of 2^52 (double).
++  |  slli.d TMP0, TMP0, 32
++  |  movgr2fr.d FARG5, TMP0
++  |  fabs.d FTMP4, FARG1		// |x|
++  |  movfr2gr.d TMP1, FARG1
++  |  fcmp.clt.d FCC0, FTMP4, FARG5
++  |  fadd.d FTMP3, FTMP4, FARG5		// (|x| + 2^52) - 2^52
++  |  fsub.d FTMP3, FTMP3, FARG5
++  |  bceqz FCC0, >1			// Truncate only if |x| < 2^52.
++  |  slt TMP1, TMP1, r0		// TMP1 = 1 if the sign bit of x is set.
++  |.if "func" == "ceil"
++  |  addu16i.d TMP0, r0, 0xbff0
++  |.else
++  |  addu16i.d TMP0, r0, 0x3ff0	// Hiword of +1 (double).
++  |.endif
++  |  fneg.d FTMP4, FTMP3
++  |  slli.d TMP0, TMP0, 32
++  |  movgr2fr.d FARG5, TMP0
++  |  movgr2fr.d FTMP1, TMP1
++  |  movfr2cf FCC0, FTMP1
++  |  fsel FTMP1, FTMP3, FTMP4, FCC0	// Select between the rounded |x| and its negation.
++  |.if "func" == "ceil"
++  |  fcmp.clt.d FCC0, FTMP1, FARG1	// x > result?
++  |.else
++  |  fcmp.clt.d FCC0, FARG1, FTMP1	// x < result?
++  |.endif
++  |  fsub.d FTMP4, FTMP1, FARG5		// If yes, subtract +-1.
++  |  fsel FTMP3, FTMP1, FTMP4, FCC0
++  |  fmov.d FARG1, FTMP3
++  |  jirl r0, ra, 0
++  |1:
++  |  fmov.d FTMP3, FARG1
++  |  jirl r0, ra, 0
++  |.endmacro
++  |
++  |
++  |->vm_floor:
++  |  vm_round_hf floor
++  |->vm_ceil:
++  |  vm_round_hf ceil
++  |
++  |
++  |//-----------------------------------------------------------------------
++}
++
++/* Generate the code for a single instruction. */
++static void build_ins(BuildCtx *ctx, BCOp op, int defop)
++{
++  int vk = 0;
++  |=>defop:
++
++  switch (op) {
++
++  /* -- Comparison ops ---------------------------------------------------- */
++
++  /* Remember: all ops branch for a true comparison, fall through otherwise. */
++
++  case BC_ISLT: case BC_ISGE: case BC_ISLE: case BC_ISGT:
++    |  // RA = src1*8, RD = src2*8, JMP with RD = target
++    |  add.d RA, BASE, RA
++    |  add.d RD, BASE, RD
++    if (op == BC_ISLT || op == BC_ISGE) {
++      |  ld.d CARG1, 0(RA)
++      |  ld.d CARG2, 0(RD)
++      |  gettp CARG3, CARG1
++      |  gettp CARG4, CARG2
++    } else {
++      |  ld.d CARG2, 0(RA)
++      |  ld.d CARG1, 0(RD)
++      |  gettp CARG3, CARG2
++      |  gettp CARG4, CARG1
++    }
++    |  ld.hu TMP2, OFS_RD(PC)		// TMP2=jump
++    |  addi.d PC, PC, 4
++    |  bne CARG3, TISNUM, >2
++    |  decode_BC4b TMP2
++    |  bne CARG4, TISNUM, >5
++    |  slli.w CARG1, CARG1, 0
++    |  slli.w CARG2, CARG2, 0
++    |  addu16i.d TMP3, r0, -0x2		// -BCBIAS_J*4
++    |  slt TMP1, CARG1, CARG2
++    |  add.w TMP2, TMP2, TMP3		// TMP2=(jump-0x8000)<<2
++    if (op == BC_ISLT || op == BC_ISGT) {
++      |  maskeqz TMP2, TMP2, TMP1
++    } else {
++      |  masknez TMP2, TMP2,TMP1
++    }
++    |1:
++    |  add.d PC, PC, TMP2
++    |  ins_next
++    |
++    |2:  // RA is not an integer.
++    |  sltui TMP1, CARG3, LJ_TISNUM
++    |  addu16i.d TMP3, r0, -0x2		// -BCBIAS_J*4
++    |  beqz TMP1, ->vmeta_comp
++    |  sltui TMP1, CARG4, LJ_TISNUM
++    |  decode_BC4b TMP2
++    |  beqz TMP1, >4
++    |  movgr2fr.d FTMP0, CARG1
++    |  movgr2fr.d FTMP2, CARG2
++    |3:  // RA and RD are both numbers.
++    if (op == BC_ISLT || op == BC_ISGE) {
++      |  fcmp.clt.d FCC0, FTMP0, FTMP2
++    } else {
++      |  fcmp.cult.d FCC0, FTMP0, FTMP2
++    }
++    |  add.w TMP2, TMP2, TMP3
++    |  movcf2gr TMP3, FCC0
++    if (op == BC_ISLT || op == BC_ISGT) {
++      |  maskeqz TMP2, TMP2, TMP3
++    } else {
++      |  masknez TMP2, TMP2, TMP3
++    }
++    |  b <1
++    |
++    |4:  // RA is a number, RD is not a number.
++    |  // RA is a number, RD is an integer. Convert RD to a number.
++    |  bne CARG4, TISNUM, ->vmeta_comp
++    if (op == BC_ISLT || op == BC_ISGE) {
++      |  movgr2fr.w FTMP2, CARG2
++      |  movgr2fr.d FTMP0, CARG1
++      |  ffint.d.w FTMP2, FTMP2
++    } else {
++      |  movgr2fr.w FTMP0, CARG1
++      |  movgr2fr.d FTMP2, CARG2
++      |  ffint.d.w FTMP0, FTMP0
++    }
++    |  b <3
++    |
++    |5:  // RA is an integer, RD is not an integer
++    |  sltui TMP1, CARG4, LJ_TISNUM
++    |  addu16i.d TMP3, r0, -0x2		// -BCBIAS_J*4
++    |  beqz TMP1, ->vmeta_comp
++    |  // RA is an integer, RD is a number. Convert RA to a number.
++    if (op == BC_ISLT || op == BC_ISGE) {
++      |  movgr2fr.w FTMP0, CARG1
++      |  movgr2fr.d FTMP2, CARG2
++      |  ffint.d.w FTMP0, FTMP0
++    } else {
++      |  movgr2fr.w FTMP2, CARG2
++      |  movgr2fr.d FTMP0, CARG1
++      |  ffint.d.w FTMP2, FTMP2
++    }
++    |  b <3
++    break;
++
++  case BC_ISEQV: case BC_ISNEV:
++    vk = op == BC_ISEQV;
++    |  // RA = src1*8, RD = src2*8, JMP with RD = target
++    |  add.d RA, BASE, RA
++    |  add.d RD, BASE, RD
++    |  addi.d PC, PC, 4
++    |  ld.d CARG1, 0(RA)
++    |  ld.d CARG2, 0(RD)
++    |  ld.hu TMP2, -4+OFS_RD(PC)
++    |  gettp CARG3, CARG1
++    |  gettp CARG4, CARG2
++    |  sltu TMP0, TISNUM, CARG3
++    |  sltu TMP1, TISNUM, CARG4
++    |  or TMP0, TMP0, TMP1
++    |  addu16i.d TMP3, r0, -0x2		// -BCBIAS_J*4
++    if (vk) {
++      |  beqz TMP0, ->BC_ISEQN_Z
++    } else {
++      |  beqz TMP0, ->BC_ISNEN_Z
++    }
++    |// Either or both types are not numbers.
++    |  addu16i.d TMP3, r0, -0x2		// -BCBIAS_J*4
++    |  decode_BC4b TMP2
++    |  add.w TMP2, TMP2, TMP3		// (jump-0x8000)<<2
++    |  bne CARG1, CARG2, >2
++    |  // Tag and value are equal.
++    if (vk) {
++      |->BC_ISEQV_Z:
++      |  add.d PC, PC, TMP2
++    }
++    |1:
++    |  ins_next
++    |
++    |2:  // Check if the tags are the same and it's a table or userdata.
++    |  xor TMP3, CARG3, CARG4			// Same type?
++    |  sltui TMP0, CARG3, LJ_TISTABUD+1		// Table or userdata? TMP0=1
++    |  masknez TMP0, TMP0, TMP3		// TMP0=0: not same type, or not table/userdata
++    |  cleartp TAB:TMP1, CARG1
++    if (vk) {
++      |  beqz TMP0, <1
++    } else {
++      |  beqz TMP0, ->BC_ISEQV_Z  // Reuse code from opposite instruction.
++    }
++    |  // Different tables or userdatas. Need to check __eq metamethod.
++    |  // Field metatable must be at same offset for GCtab and GCudata!
++    |  ld.d TAB:TMP3, TAB:TMP1->metatable
++    if (vk) {
++      |  beqz TAB:TMP3, <1		// No metatable?
++      |  ld.bu TMP3, TAB:TMP3->nomm
++      |  andi TMP3, TMP3, 1<<MM_eq
++      |  addi.w TMP0, r0, 0		// ne = 0
++      |  bnez TMP3, <1			// Or 'no __eq' flag set?
++    } else {
++      |  beqz TAB:TMP3,->BC_ISEQV_Z	// No metatable?
++      |  ld.bu TMP3, TAB:TMP3->nomm
++      |  andi TMP3, TMP3, 1<<MM_eq
++      |  addi.w TMP0, r0, 1		// ne = 1
++      |  bnez TMP3, ->BC_ISEQV_Z	// Or 'no __eq' flag set?
++    }
++    |  b ->vmeta_equal			// Handle __eq metamethod.
++    break;
++
++  case BC_ISEQS: case BC_ISNES:
++    vk = op == BC_ISEQS;
++    |  // RA = src*8, RD = str_const*8 (~), JMP with RD = target
++    |  add.d RA, BASE, RA
++    |  addi.d PC, PC, 4
++    |  ld.d CARG1, 0(RA)
++    |  sub.d RD, KBASE, RD
++    |  ld.hu TMP2, -4+OFS_RD(PC)
++    |  ld.d CARG2, -8(RD)		// KBASE-8-str_const*8
++    |  addi.w TMP0, r0, LJ_TSTR
++    |  decode_BC4b TMP2
++    |  settp CARG2, TMP0
++    |  addu16i.d TMP3, r0, -0x2		// -BCBIAS_J*4
++    |  xor TMP0, CARG1, CARG2		// TMP0=0: A==D; TMP0!=0: A!=D
++    |  add.w TMP2, TMP2, TMP3
++    if (vk) {
++      |  masknez TMP2, TMP2, TMP0
++    } else {
++      |  maskeqz TMP2, TMP2, TMP0
++    }
++    |  add.d PC, PC, TMP2
++    |  ins_next
++    break;
++
++  case BC_ISEQN: case BC_ISNEN:
++    vk = op == BC_ISEQN;
++    |  // RA = src*8, RD = num_const*8, JMP with RD = target
++    |  add.d RA, BASE, RA
++    |  add.d RD, KBASE, RD
++    |  ld.d CARG1, 0(RA)
++    |  ld.d CARG2, 0(RD)
++    |  ld.hu TMP2, OFS_RD(PC)
++    |  addi.d PC, PC, 4
++    |  gettp CARG3, CARG1
++    |  gettp CARG4, CARG2
++    |  addu16i.d TMP3, r0, -0x2		// -BCBIAS_J*4
++    if (vk) {
++      |->BC_ISEQN_Z:
++    } else {
++      |->BC_ISNEN_Z:
++    }
++    |  decode_BC4b TMP2
++    |  bne CARG3, TISNUM, >4
++    |  add.w TMP2, TMP2, TMP3
++    |  bne CARG4, TISNUM, >6
++    |  xor TMP0, CARG1, CARG2		// TMP0=0: A==D; TMP0!=0: A!=D
++    |1:
++    if (vk) {
++      |  masknez TMP2, TMP2, TMP0
++      |  add.d PC, PC, TMP2
++      |2:
++    } else {
++      |  maskeqz TMP2, TMP2, TMP0
++      |2:
++      |  add.d PC, PC, TMP2
++    }
++    |3:
++    |  ins_next
++    |
++    |4:  // RA is not an integer.
++    |  sltu TMP0, CARG3, TISNUM
++    |  add.w TMP2, TMP2, TMP3
++    |  beqz TMP0, <2
++    |  movgr2fr.d FTMP0, CARG1
++    |  movgr2fr.d FTMP2, CARG2
++    |  bne CARG4, TISNUM, >5
++    |// RA is a number, RD is an integer.
++    |  ffint.d.w FTMP2, FTMP2
++    |
++    |5:  // RA and RD are both numbers.
++    |  fcmp.cune.d FCC0, FTMP0, FTMP2
++    |  movcf2gr TMP0, FCC0
++    |  b <1
++    |
++    |6: // RA is an integer, RD is a number.
++    |  sltu TMP0, CARG4, TISNUM
++    |  beqz TMP0, <2
++    |  movgr2fr.w FTMP0, CARG1
++    |  movgr2fr.d FTMP2, CARG2
++    |  ffint.d.w FTMP0, FTMP0
++    |  b <5
++    |
++    break;
++
++  case BC_ISEQP: case BC_ISNEP:
++    vk = op == BC_ISEQP;
++    |  // RA = src*8, RD = primitive_type*8 (~), JMP with RD = target
++    |  add.d RA, BASE, RA
++    |  srli.w TMP0, RD, 3
++    |  ld.d TMP1, 0(RA)
++    |  nor TMP0, TMP0, r0		// ~TMP0: ~0 ~1 ~2
++    |  ld.hu TMP2, OFS_RD(PC)		// TMP2: RD in next INS, branch target
++    |  gettp TMP1, TMP1
++    |  addi.d PC, PC, 4
++    |  xor TMP0, TMP1, TMP0		// TMP0=0 A=D; TMP0!=0 A!=D
++    |  decode_BC4b TMP2
++    |  addu16i.d TMP3, r0, -0x2		// -BCBIAS_J*4
++    |  add.w TMP2, TMP2, TMP3		// TMP2=(jump-0x8000)<<2
++    if (vk) {
++      |  masknez TMP2, TMP2, TMP0
++    } else {
++      |  maskeqz TMP2, TMP2, TMP0
++    }
++    |  add.d PC, PC, TMP2
++    |  ins_next
++    break;
++
++  /* -- Unary test and copy ops ------------------------------------------- */
++
++  case BC_ISTC: case BC_ISFC: case BC_IST: case BC_ISF:
++    |  // RA = dst*8 or unused, RD = src*8, JMP with RD = target
++    |  add.d RD, BASE, RD
++    |  ld.hu TMP2, OFS_RD(PC)
++    |  ld.d TMP0, 0(RD)
++    |  addi.d PC, PC, 4
++    |  gettp TMP0, TMP0
++    |  add.d RA, BASE, RA
++    |  sltui TMP0, TMP0, LJ_TISTRUECOND		// TMP0=1 true; TMP0=0 false
++    |  decode_BC4b TMP2
++    |  addu16i.d TMP3, r0, -0x2		// -BCBIAS_J*4
++    |  ld.d CRET1, 0(RD)
++    |  add.w TMP2, TMP2, TMP3		// (jump-0x8000)<<2
++    if (op == BC_IST || op == BC_ISTC) {
++      |  beqz TMP0, >1
++      if (op == BC_ISTC) {
++        |  st.d CRET1, 0(RA)
++      }
++    } else {
++      |  bnez TMP0, >1
++      if (op == BC_ISFC) {
++	|  st.d CRET1, 0(RA)
++      }
++    }
++    |  add.d PC, PC, TMP2
++    |1:
++    |  ins_next
++    break;
++
++  case BC_ISTYPE:
++    |  // RA = src*8, RD = -type*8
++    |  add.d TMP0, BASE, RA
++    |  srli.w TMP1, RD, 3
++    |  ld.d TMP0, 0(TMP0)
++    |  gettp TMP0, TMP0
++    |  add.d TMP0, TMP0, TMP1		// if itype of RA == type, then TMP0=0
++    |  bnez TMP0, ->vmeta_istype
++    |  ins_next
++    break;
++  case BC_ISNUM:
++    |  // RA = src*8, RD = -(TISNUM-1)*8
++    |  add.d TMP0, BASE, RA
++    |  ld.d TMP0, 0(TMP0)
++    |  checknum TMP0, ->vmeta_istype
++    |  ins_next
++    break;
++
++  /* -- Unary ops --------------------------------------------------------- */
++
++  case BC_MOV:
++    |  // RA = dst*8, RD = src*8
++    |  add.d RD, BASE, RD
++    |  add.d RA, BASE, RA
++    |  ld.d TMP0, 0(RD)
++    |  ins_next1
++    |  st.d TMP0, 0(RA)
++    |  ins_next2
++    break;
++  case BC_NOT:
++    |  // RA = dst*8, RD = src*8
++    |  add.d RD, BASE, RD
++    |  add.d RA, BASE, RA
++    |  ld.d TMP0, 0(RD)
++    |  addi.d TMP1, r0, LJ_TTRUE
++    |  ins_next1
++    |  gettp TMP0, TMP0
++    |  sltu TMP0, TMP1, TMP0
++    |  addi.w TMP0, TMP0, 1
++    |  slli.d TMP0, TMP0, 47
++    |  nor TMP0, TMP0, r0
++    |  st.d TMP0, 0(RA)
++    |  ins_next2
++    break;
++  case BC_UNM:
++    |  // RA = dst*8, RD = src*8
++    |  add.d RB, BASE, RD
++    |  add.d RA, BASE, RA
++    |  ld.d TMP0, 0(RB)
++    |  addu16i.d TMP1, r0, 0x8000
++    |  gettp CARG3, TMP0
++    |  bne CARG3, TISNUM, >1
++    |  sub.w TMP0, r0, TMP0
++    |  beq TMP0, TMP1, ->vmeta_unm      // Meta handler deals with -2^31.
++    |  bstrpick.d TMP0, TMP0, 31, 0
++    |  settp TMP0, TISNUM
++    |  b >2
++    |1:
++    |  sltui TMP3, CARG3, LJ_TISNUM
++    |  slli.d TMP1, TMP1, 32
++    |  beqz TMP3, ->vmeta_unm
++    |  xor TMP0, TMP0, TMP1     // sign => ~sign
++    |2:
++    |  st.d TMP0, 0(RA)
++    |  ins_next
++    break;
++  case BC_LEN:
++    |  // RA = dst*8, RD = src*8
++    |  add.d CARG2, BASE, RD
++    |  ld.d TMP0, 0(CARG2)
++    |  add.d RA, BASE, RA
++    |  gettp TMP1, TMP0
++    |  addi.d TMP2, TMP1, -LJ_TSTR
++    |  cleartp STR:CARG1, TMP0
++    |  bnez TMP2, >2
++    |  ld.w CARG1, STR:CARG1->len
++    |1:
++    |  settp CARG1, TISNUM
++    |  st.d CARG1, 0(RA)
++    |  ins_next
++    |2:
++    |  addi.d TMP2, TMP1, -LJ_TTAB
++    |  bnez TMP2, ->vmeta_len
++#if LJ_52
++    |  ld.d TAB:TMP2, TAB:CARG1->metatable
++    |  bnez TAB:TMP2, >9
++    |3:
++#endif
++    |->BC_LEN_Z:
++    |  bl extern lj_tab_len		// (GCtab *t)
++    |  // Returns uint32_t (but less than 2^31).
++    |  b <1
++#if LJ_52
++    |9:
++    |  ld.bu TMP0, TAB:TMP2->nomm
++    |  andi TMP0, TMP0, 1<<MM_len
++    |  bnez TMP0, <3			// 'no __len' flag set: done.
++    |  b ->vmeta_len
++#endif
++    break;
++
++  /* -- Binary ops -------------------------------------------------------- */
++
++    |.macro fpmod, a, b, c
++    |  fdiv.d FARG1, b, c
++    |  bl ->vm_floor		// floor(b/c)
++    |  fmul.d a, FRET1, c
++    |  fsub.d a, b, a		// b - floor(b/c)*c
++    |.endmacro
++    |
++    |.macro ins_arithpre
++    ||vk = ((int)op - BC_ADDVN) / (BC_ADDNV-BC_ADDVN);
++    |  // RA = dst*8, RB = src1*8, RC = src2*8 | num_const*8
++    ||if (vk == 1) {
++    |   // RA = dst*8, RB = num_const*8, RC = src1*8
++    |   decode_RB RC, INS
++    |   decode_RDtoRC8 RB, RD
++    ||} else {
++    |   // RA = dst*8, RB = src1*8, RC = num_const*8
++    |   decode_RB RB, INS
++    |   decode_RDtoRC8 RC, RD
++    ||}
++    ||switch (vk) {
++    ||case 0:			// suffix is VN
++    |   add.d RB, BASE, RB
++    |   add.d RC, KBASE, RC
++    ||  break;
++    ||case 1:			// suffix is NV
++    |   add.d RC, BASE, RC
++    |   add.d RB, KBASE, RB
++    ||  break;
++    ||default:			// CAT or suffix is VV
++    |   add.d RB, BASE, RB
++    |   add.d RC, BASE, RC
++    ||  break;
++    ||}
++    |.endmacro
++    |
++    |.macro ins_arithfp, fpins, itype1, itype2
++    |  fld.d FTMP0, 0(RB)
++    |  sltu itype1, itype1, TISNUM
++    |  sltu itype2, itype2, TISNUM
++    |  fld.d FTMP2, 0(RC)
++    |  and itype1, itype1, itype2
++    |  add.d RA, BASE, RA
++    |  beqz itype1, ->vmeta_arith
++    |  fpins FRET1, FTMP0, FTMP2
++    |  ins_next1
++    |  fst.d FRET1, 0(RA)
++    |  ins_next2
++    |.endmacro
++    |
++    |.macro ins_arithead, itype1, itype2, tval1, tval2 
++    |  ld.d tval1, 0(RB)
++    |  ld.d tval2, 0(RC)
++    |  // Check for two integers.
++    |  gettp itype1, tval1
++    |  gettp itype2, tval2
++    |.endmacro
++    |
++    |.macro ins_arithdn, intins, fpins
++    |  ins_arithpre
++    |  ins_arithead TMP0, TMP1, CARG1, CARG2
++    |  bne TMP0, TISNUM, >1
++    |  bne TMP1, TISNUM, >1
++    |  slli.w CARG3, CARG1, 0
++    |  slli.w CARG4, CARG2, 0
++    |.if "intins" == "add.w"
++    |  intins CRET1, CARG3, CARG4
++    |  xor TMP1, CRET1, CARG3		// ((y^a) & (y^b)) < 0: overflow.
++    |  xor TMP2, CRET1, CARG4
++    |  and TMP1, TMP1, TMP2
++    |  add.d RA, BASE, RA
++    |  blt TMP1, r0, ->vmeta_arith
++    |.elif "intins" == "sub.w"
++    |  intins CRET1, CARG3, CARG4
++    |  xor TMP1, CRET1, CARG3		// ((y^a) & (a^b)) < 0: overflow.
++    |  xor TMP2, CARG3, CARG4
++    |  and TMP1, TMP1, TMP2
++    |  add.d RA, BASE, RA
++    |  blt TMP1, r0, ->vmeta_arith
++    |.elif "intins" == "mulw.d.w"
++    |  mul.w CRET1, CARG3, CARG4
++    |  mulh.w TMP2, CARG3, CARG4
++    |  srai.w TMP1, CRET1, 31		// 63-32bit not all 0 or 1: overflow.
++    |  add.d RA, BASE, RA
++    |  bne TMP1, TMP2, ->vmeta_arith
++    |.endif
++    |  bstrpick.d CRET1, CRET1, 31, 0
++    |  settp CRET1, TISNUM
++    |  st.d CRET1, 0(RA)
++    |  ins_next
++    |1:  // Check for two numbers.
++    |  ins_arithfp, fpins, TMP0, TMP1
++    |.endmacro
++    |
++    |.macro ins_arithdiv, fpins
++    |  ins_arithpre
++    |  ins_arithead TMP0, TMP1, CARG1, CARG2
++    |  ins_arithfp, fpins, TMP0, TMP1
++    |.endmacro
++    |
++    |.macro ins_arithmod, fpins
++    |  ins_arithpre
++    |  ins_arithead TMP0, TMP1, CARG1, CARG2
++    |  bne TMP0, TISNUM, >1
++    |  bne TMP1, TISNUM, >1
++    |  slli.w CARG1, CARG1, 0
++    |  slli.w CARG2, CARG2, 0
++    |  add.d RA, BASE, RA
++    |  beqz CARG2, ->vmeta_arith
++    |  bl extern lj_vm_modi
++    |  bstrpick.d CRET1, CRET1, 31, 0
++    |  settp CRET1, TISNUM
++    |  st.d CRET1, 0(RA)
++    |  ins_next
++    |1:  // Check for two numbers.
++    |  ins_arithfp, fpins, TMP0, TMP1
++    |.endmacro
++    
++  case BC_ADDVN: case BC_ADDNV: case BC_ADDVV:
++    |  ins_arithdn add.w, fadd.d
++    break;
++  case BC_SUBVN: case BC_SUBNV: case BC_SUBVV:
++    |  ins_arithdn sub.w, fsub.d
++    break;
++  case BC_MULVN: case BC_MULNV: case BC_MULVV:
++    |  ins_arithdn mulw.d.w, fmul.d
++    break;
++  case BC_DIVVN: case BC_DIVNV: case BC_DIVVV:
++    |  ins_arithdiv fdiv.d
++    break;
++  case BC_MODVN: case BC_MODNV: case BC_MODVV:
++    |  ins_arithmod fpmod
++    break;
++  case BC_POW:
++    |  ins_arithpre
++    |  ld.d CARG1, 0(RB)
++    |  ld.d CARG2, 0(RC)
++    |  gettp TMP0, CARG1
++    |  gettp TMP1, CARG2
++    |  sltui TMP0, TMP0, LJ_TISNUM
++    |  sltui TMP1, TMP1, LJ_TISNUM
++    |  and TMP0, TMP0, TMP1
++    |  add.d RA, BASE, RA
++    |  beqz TMP0, ->vmeta_arith
++    |  fld.d FARG1, 0(RB)
++    |  fld.d FARG2, 0(RC)
++    |  bl extern pow
++    |  ins_next1
++    |  fst.d FRET1, 0(RA)
++    |  ins_next2
++    break;
++
++  case BC_CAT:
++    |  // RA = dst*8, RB = src_start*8, RC = src_end*8
++    |  decode_RB RB, INS
++    |  decode_RDtoRC8 RC, RD
++    |  sub.d CARG3, RC, RB
++    |  st.d BASE, L->base
++    |  add.d CARG2, BASE, RC
++    |  or MULTRES, RB, r0
++    |->BC_CAT_Z:
++    |  srli.w CARG3, CARG3, 3
++    |  st.d PC, SAVE_PC(sp)
++    |  or CARG1, L, r0
++    |  bl extern lj_meta_cat		// (lua_State *L, TValue *top, int left)
++    |  // Returns NULL (finished) or TValue * (metamethod).
++    |  ld.d BASE, L->base
++    |  bnez CRET1, ->vmeta_binop
++    |  add.d RB, BASE, MULTRES
++    |  ld.d TMP0, 0(RB)
++    |  add.d RA, BASE, RA
++    |  st.d TMP0, 0(RA)
++    |  ins_next
++    break;
++
++  /* -- Constant ops ------------------------------------------------------ */
++
++  case BC_KSTR:
++    |  // RA = dst*8, RD = str_const*8 (~)
++    |  sub.d TMP1, KBASE, RD
++    |  addi.w TMP2, r0, LJ_TSTR
++    |  ld.d TMP0, -8(TMP1)		// KBASE-8-str_const*8
++    |  add.d RA, BASE, RA
++    |  settp TMP0, TMP2
++    |  st.d TMP0, 0(RA)
++    |  ins_next
++    break;
++  case BC_KCDATA:
++    break;
++  case BC_KSHORT:
++    |  // RA = dst*8, RD = int16_literal*8
++    |  srai.w RD, INS, 16
++    |  add.d RA, BASE, RA
++    |  bstrpick.d RD, RD, 31, 0
++    |  settp RD, TISNUM
++    |  st.d RD, 0(RA)
++    |  ins_next
++    break;
++  case BC_KNUM:
++    |  // RA = dst*8, RD = num_const*8
++    |  add.d RD, KBASE, RD
++    |  add.d RA, BASE, RA
++    |  ld.d TMP0, 0(RD)
++    |  st.d TMP0, 0(RA)
++    |  ins_next
++    break;
++  case BC_KPRI:
++    |  // RA = dst*8, RD = primitive_type*8 (~)
++    |  add.d RA, BASE, RA
++    |  slli.d TMP0, RD, 44	// 44+3
++    |  nor TMP0, TMP0, r0
++    |  st.d TMP0, 0(RA)
++    |  ins_next
++    break;
++  case BC_KNIL:
++    |  // RA = base*8, RD = end*8
++    |  add.d RA, BASE, RA
++    |  st.d TISNIL, 0(RA)
++    |  addi.d RA, RA, 8
++    |  add.d RD, BASE, RD
++    |1:
++    |  st.d TISNIL, 0(RA)
++    |  slt TMP0, RA, RD
++    |  addi.d RA, RA, 8
++    |  bnez TMP0, <1
++    |  ins_next
++    break;
++
++  /* -- Upvalue and function ops ------------------------------------------ */
++
++  case BC_UGET:
++    |  // RA = dst*8, RD = uvnum*8
++    |  ld.d LFUNC:TMP0, FRAME_FUNC(BASE)
++    |  add.d RA, BASE, RA
++    |  cleartp LFUNC:TMP0
++    |  add.d RD, RD, LFUNC:TMP0
++    |  ld.d UPVAL:TMP0, LFUNC:RD->uvptr
++    |  ld.d TMP1, UPVAL:TMP0->v
++    |  ld.d TMP2, 0(TMP1)
++    |  ins_next1
++    |  st.d TMP2, 0(RA)
++    |  ins_next2
++    break;
++  case BC_USETV:
++    |  // RA = uvnum*8, RD = src*8
++    |  ld.d LFUNC:TMP0, FRAME_FUNC(BASE)
++    |  add.d RD, BASE, RD
++    |  cleartp LFUNC:TMP0
++    |  add.d RA, RA, LFUNC:TMP0
++    |  ld.d UPVAL:TMP0, LFUNC:RA->uvptr
++    |  ld.d CRET1, 0(RD)
++    |  ld.bu TMP3, UPVAL:TMP0->marked
++    |  ld.d CARG2, UPVAL:TMP0->v
++    |  andi TMP3, TMP3, LJ_GC_BLACK	// isblack(uv)
++    |  ld.bu TMP0, UPVAL:TMP0->closed
++    |  gettp TMP2, CRET1
++    |  st.d CRET1, 0(CARG2)
++    |  or TMP3, TMP3, TMP0
++    |  addi.d TMP0, r0, LJ_GC_BLACK|1
++    |  addi.d TMP2, TMP2, -(LJ_TNUMX+1)
++    |  beq TMP3, TMP0, >2			// Upvalue is closed and black?
++    |1:
++    |  ins_next
++    |
++    |2:  // Check if new value is collectable.
++    |  sltui TMP0, TMP2, LJ_TISGCV - (LJ_TNUMX+1)
++    |  cleartp GCOBJ:CRET1, CRET1
++    |  beqz TMP0, <1			// tvisgcv(v)
++    |  ld.bu TMP3, GCOBJ:CRET1->gch.marked
++    |  andi TMP3, TMP3, LJ_GC_WHITES	// iswhite(v)
++    |  beqz TMP3, <1
++    |  // Crossed a write barrier. Move the barrier forward.
++    |  .ADD16I CARG1, DISPATCH, GG_DISP2G
++    |  bl extern lj_gc_barrieruv	// (global_State *g, TValue *tv)
++    |  b <1
++    break;
++  case BC_USETS:
++    |  // RA = uvnum*8, RD = str_const*8 (~)
++    |  ld.d LFUNC:TMP0, FRAME_FUNC(BASE)
++    |  sub.d TMP1, KBASE, RD
++    |  cleartp LFUNC:TMP0
++    |  add.d RA, RA, LFUNC:TMP0
++    |  ld.d UPVAL:TMP0, LFUNC:RA->uvptr
++    |  ld.d STR:TMP1, -8(TMP1)		// KBASE-8-str_const*8
++    |  ld.bu TMP2, UPVAL:TMP0->marked
++    |  ld.d CARG2, UPVAL:TMP0->v
++    |  ld.bu TMP3, STR:TMP1->marked
++    |  andi TMP4, TMP2, LJ_GC_BLACK	// isblack(uv)
++    |  ld.bu TMP2, UPVAL:TMP0->closed
++    |  addi.d TMP0, r0, LJ_TSTR
++    |  settp TMP1, TMP0
++    |  st.d TMP1, 0(CARG2)
++    |  bnez TMP4, >2
++    |1:
++    |  ins_next
++    |
++    |2:  // Check if string is white and ensure upvalue is closed.
++    |  beqz TMP2, <1
++    |  andi TMP0, TMP3, LJ_GC_WHITES     // iswhite(str)
++    |  beqz TMP0, <1
++    |  // Crossed a write barrier. Move the barrier forward.
++    |  .ADD16I CARG1, DISPATCH, GG_DISP2G
++    |  bl extern lj_gc_barrieruv	// (global_State *g, TValue *tv)
++    |  b <1
++    break;
++  case BC_USETN:
++    |  // RA = uvnum*8, RD = num_const*8
++    |  ld.d LFUNC:TMP0, FRAME_FUNC(BASE)
++    |  add.d RD, KBASE, RD
++    |  cleartp LFUNC:TMP0
++    |  add.d TMP0, RA, LFUNC:TMP0
++    |  ld.d UPVAL:TMP0, LFUNC:TMP0->uvptr
++    |  ld.d TMP1, 0(RD)
++    |  ld.d TMP0, UPVAL:TMP0->v
++    |  st.d TMP1, 0(TMP0)
++    |  ins_next
++    break;
++  case BC_USETP:
++    |  // RA = uvnum*8, RD = primitive_type*8 (~)
++    |  ld.d LFUNC:TMP0, FRAME_FUNC(BASE)
++    |  slli.d TMP2, RD, 44
++    |  cleartp LFUNC:TMP0
++    |  add.d TMP0, RA, LFUNC:TMP0
++    |  nor TMP2, TMP2, r0
++    |  ld.d UPVAL:TMP0, LFUNC:TMP0->uvptr
++    |  ld.d TMP1, UPVAL:TMP0->v
++    |  st.d TMP2, 0(TMP1)
++    |  ins_next
++    break;
++
++  case BC_UCLO:
++    |  // RA = level*8, RD = target
++    |  ld.d TMP2, L->openupval
++    |  branch_RD			// Do this first since RD is not saved.
++    |  st.d BASE, L->base
++    |  or CARG1, L, r0
++    |  beqz TMP2, >1
++    |  add.d CARG2, BASE, RA
++    |  bl extern lj_func_closeuv	// (lua_State *L, TValue *level)
++    |  ld.d BASE, L->base
++    |1:
++    |  ins_next
++    break;
++
++  case BC_FNEW:
++    |  // RA = dst*8, RD = proto_const*8 (~) (holding function prototype)
++    |  sub.d TMP1, KBASE, RD
++    |  ld.d CARG3, FRAME_FUNC(BASE)
++    |  ld.d CARG2, -8(TMP1)		// KBASE-8-proto_const*8
++    |  st.d BASE, L->base
++    |  st.d PC, SAVE_PC(sp)
++    |  cleartp CARG3
++    |  or CARG1, L, r0
++    |  // (lua_State *L, GCproto *pt, GCfuncL *parent)
++    |  bl extern lj_func_newL_gc
++    |  // Returns GCfuncL *.
++    |  addi.d TMP0, r0, LJ_TFUNC
++    |  ld.d BASE, L->base
++    |  settp CRET1, TMP0
++    |  add.d RA, BASE, RA
++    |  st.d CRET1, 0(RA)
++    |  ins_next
++    break;
++
++  /* -- Table ops --------------------------------------------------------- */
++
++  case BC_TNEW:
++  case BC_TDUP:
++    |  // RA = dst*8, RD = (hbits|asize)*8 | tab_const*8 (~)
++    |  .LDXD TMP0, DISPATCH, DISPATCH_GL(gc.total)
++    |  .LDXD TMP1, DISPATCH, DISPATCH_GL(gc.threshold)
++    |  st.d BASE, L->base
++    |  sltu TMP2, TMP0, TMP1
++    |  st.d PC, SAVE_PC(sp)
++    |  beqz TMP2, >5
++    |1:
++    if (op == BC_TNEW) {
++      |  srli.w CARG2, RD, 3
++      |  andi CARG2, CARG2, 0x7ff
++      |  ori TMP0, r0, 0x801
++      |  addi.w TMP2, CARG2, -0x7ff
++      |  srli.w CARG3, RD, 14
++      |  masknez TMP0, TMP0, TMP2
++      |  maskeqz CARG2, CARG2, TMP2
++      |  or CARG2, CARG2, TMP0
++      |  // (lua_State *L, int32_t asize, uint32_t hbits)
++      |  or CARG1, L, r0
++      |  bl extern lj_tab_new
++      |  // Returns Table *.
++    } else {
++      |  sub.d TMP1, KBASE, RD
++      |  or CARG1, L, r0
++      |  ld.d CARG2, -8(TMP1)            // KBASE-8-tab_const*8
++      |  bl extern lj_tab_dup		// (lua_State *L, Table *kt)
++      |  // Returns Table *.
++    }
++    |  addi.d TMP0, r0, LJ_TTAB
++    |  ld.d BASE, L->base
++    |  ins_next1
++    |  settp CRET1, TMP0
++    |  add.d RA, BASE, RA
++    |  st.d CRET1, 0(RA)
++    |  ins_next2
++    |5:
++    |  or MULTRES, RD, r0
++    |  or CARG1, L, r0
++    |  bl extern lj_gc_step_fixtop	// (lua_State *L)
++    |  or RD, MULTRES, r0
++    |  b <1
++    break;
++
++  case BC_GGET:
++    |  // RA = dst*8, RD = str_const*8 (~)
++  case BC_GSET:
++    |  // RA = src*8, RD = str_const*8 (~)
++    |  ld.d LFUNC:TMP0, FRAME_FUNC(BASE)
++    |  sub.d TMP1, KBASE, RD
++    |  ld.d STR:RC, -8(TMP1)	// KBASE-8-str_const*8
++    |  cleartp LFUNC:TMP0
++    |  ld.d TAB:RB, LFUNC:TMP0->env
++    |  add.d RA, BASE, RA
++    if (op == BC_GGET) {
++      |  b ->BC_TGETS_Z
++    } else {
++      |  b ->BC_TSETS_Z
++    }
++    break;
++
++  case BC_TGETV:
++    |  // RA = dst*8, RB = table*8, RC = key*8
++    |  decode_RB RB, INS
++    |  decode_RDtoRC8 RC, RD
++    |  add.d CARG2, BASE, RB
++    |  add.d CARG3, BASE, RC
++    |  ld.d TAB:RB, 0(CARG2)
++    |  ld.d TMP2, 0(CARG3)
++    |  add.d RA, BASE, RA
++    |  checktab TAB:RB, ->vmeta_tgetv
++    |  gettp TMP3, TMP2
++    |  ld.w TMP0, TAB:RB->asize
++    |  bne TMP3, TISNUM, >5		// Integer key?
++    |  slli.w TMP2, TMP2, 0
++    |  ld.d TMP1, TAB:RB->array
++    |  sltu TMP3, TMP2, TMP0		//array part (keys = [0, asize-1])
++    |  slli.w TMP2, TMP2, 3
++    |  beqz TMP3, ->vmeta_tgetv		// Integer key and in array part?
++    |  add.d TMP2, TMP1, TMP2
++    |  ld.d CRET1, 0(TMP2)
++    |  beq CRET1, TISNIL, >2
++    |1:
++    |  st.d CRET1, 0(RA)
++    |  ins_next
++    |
++    |2:  // Check for __index if table value is nil.
++    |  ld.d TAB:TMP2, TAB:RB->metatable
++    |  beqz TAB:TMP2, <1		// No metatable: done.
++    |  ld.bu TMP0, TAB:TMP2->nomm
++    |  andi TMP0, TMP0, 1<<MM_index
++    |  bnez TMP0, <1			// 'no __index' flag set: done.
++    |  b ->vmeta_tgetv
++    |
++    |5:
++    |  addi.d TMP0, r0, LJ_TSTR
++    |  cleartp RC, TMP2
++    |  bne TMP3, TMP0, ->vmeta_tgetv	// String key?
++    |  b ->BC_TGETS_Z
++    break;
++  case BC_TGETS:
++    |  // RA = dst*8, RB = table*8, RC = str_const*8 (~)
++    |  decode_RB RB, INS
++    |  decode_RDtoRC8 RC, RD
++    |  add.d CARG2, BASE, RB
++    |  sub.d CARG3, KBASE, RC
++    |  ld.d TAB:RB, 0(CARG2)
++    |  add.d RA, BASE, RA
++    |  ld.d STR:RC, -8(CARG3)		// KBASE-8-str_const*8
++    |  checktab TAB:RB, ->vmeta_tgets1
++    |->BC_TGETS_Z:
++    |  // TAB:RB = GCtab *, STR:RC = GCstr *, RA = dst*8
++    |  ld.w TMP0, TAB:RB->hmask
++    |  ld.w TMP1, STR:RC->sid
++    |  ld.d NODE:TMP2, TAB:RB->node
++    |  and TMP1, TMP1, TMP0		// idx = str->sid & tab->hmask
++    |  slli.w TMP0, TMP1, 5
++    |  slli.w TMP1, TMP1, 3
++    |  sub.w TMP1, TMP0, TMP1
++    |  addi.d TMP3, r0, LJ_TSTR
++    |  add.d NODE:TMP2, NODE:TMP2, TMP1	// node = tab->node + (idx*32-idx*8)
++    |  settp STR:RC, TMP3		// Tagged key to look for.
++    |1:
++    |  ld.d CARG1, NODE:TMP2->key
++    |  ld.d CARG2, NODE:TMP2->val
++    |  ld.d NODE:TMP1, NODE:TMP2->next
++    |  ld.d TAB:TMP3, TAB:RB->metatable
++    |  bne CARG1, RC, >4
++    |  beq CARG2, TISNIL, >5		// Key found, but nil value?
++    |3:
++    |  st.d CARG2, 0(RA)
++    |  ins_next
++    |
++    |4:  // Follow hash chain.
++    |  or NODE:TMP2, NODE:TMP1, r0
++    |  bnez NODE:TMP1, <1
++    |  // End of hash chain: key not found, nil result.
++    |
++    |5:  // Check for __index if table value is nil.
++    |  or CARG2, TISNIL, r0
++    |  beqz TAB:TMP3, <3		// No metatable: done.
++    |  ld.bu TMP0, TAB:TMP3->nomm
++    |  andi TMP0, TMP0, 1<<MM_index
++    |  bnez TMP0, <3			// 'no __index' flag set: done.
++    |  b ->vmeta_tgets
++    break;
++  case BC_TGETB:
++    |  // RA = dst*8, RB = table*8, RC = index*8
++    |  decode_RB RB, INS
++    |  add.d CARG2, BASE, RB
++    |  decode_RDtoRC8 RC, RD
++    |  ld.d TAB:RB, 0(CARG2)
++    |  add.d RA, BASE, RA
++    |  srli.w TMP0, RC, 3
++    |  checktab TAB:RB, ->vmeta_tgetb
++    |  ld.w TMP1, TAB:RB->asize
++    |  ld.d TMP2, TAB:RB->array
++    |  sltu TMP1, TMP0, TMP1
++    |  add.d RC, TMP2, RC
++    |  beqz TMP1, ->vmeta_tgetb
++    |  ld.d CRET1, 0(RC)
++    |  beq CRET1, TISNIL, >5
++    |1:
++    |  st.d CRET1, 0(RA)
++    |  ins_next
++    |
++    |5:  // Check for __index if table value is nil.
++    |  ld.d TAB:TMP2, TAB:RB->metatable
++    |  beqz TAB:TMP2, <1		// No metatable: done.
++    |  ld.bu TMP1, TAB:TMP2->nomm
++    |  andi TMP1, TMP1, 1<<MM_index
++    |  bnez TMP1, <1			// 'no __index' flag set: done.
++    |  b ->vmeta_tgetb			// Caveat: preserve TMP0 and CARG2!
++    break;
++  case BC_TGETR:
++    |  // RA = dst*8, RB = table*8, RC = key*8
++    |  decode_RB RB, INS
++    |  decode_RDtoRC8 RC, RD
++    |  add.d RB, BASE, RB
++    |  add.d RC, BASE, RC
++    |  ld.d TAB:CARG1, 0(RB)
++    |  ld.w CARG2, 0(RC)
++    |  add.d RA, BASE, RA
++    |  cleartp TAB:CARG1
++    |  ld.w TMP0, TAB:CARG1->asize
++    |  ld.d TMP1, TAB:CARG1->array
++    |  sltu TMP0, CARG2, TMP0
++    |  slli.w TMP2, CARG2, 3
++    |  add.d TMP3, TMP1, TMP2
++    |  beqz TMP0, ->vmeta_tgetr		// In array part?
++    |  ld.d TMP1, 0(TMP3)
++    |->BC_TGETR_Z:
++    |  ins_next1
++    |  st.d TMP1, 0(RA)
++    |  ins_next2
++    break;
++
++  case BC_TSETV:
++    |  // RA = src*8, RB = table*8, RC = key*8
++    |  decode_RB RB, INS
++    |  decode_RDtoRC8 RC, RD
++    |  add.d CARG2, BASE, RB
++    |  add.d CARG3, BASE, RC
++    |  ld.d TAB:RB, 0(CARG2)
++    |  ld.d TMP2, 0(CARG3)
++    |  add.d RA, BASE, RA
++    |  checktab TAB:RB, ->vmeta_tsetv
++    |  slli.w RC, TMP2, 0
++    |  checkint TMP2, >5
++    |  ld.w TMP0, TAB:RB->asize
++    |  ld.d TMP1, TAB:RB->array
++    |  sltu TMP0, RC, TMP0
++    |  slli.w TMP2, RC, 3
++    |  beqz TMP0, ->vmeta_tsetv		// Integer key and in array part?
++    |  add.d TMP1, TMP1, TMP2
++    |  ld.bu TMP3, TAB:RB->marked
++    |  ld.d TMP0, 0(TMP1)
++    |  ld.d CRET1, 0(RA)
++    |  beq TMP0, TISNIL, >3
++    |1:
++    |  andi TMP2, TMP3, LJ_GC_BLACK	// isblack(table)
++    |  st.d CRET1, 0(TMP1)
++    |  bnez TMP2, >7
++    |2:
++    |  ins_next
++    |
++    |3:  // Check for __newindex if previous value is nil.
++    |  ld.d TAB:TMP2, TAB:RB->metatable
++    |  beqz TAB:TMP2, <1		// No metatable: done.
++    |  ld.bu TMP2, TAB:TMP2->nomm
++    |  andi TMP2, TMP2, 1<<MM_newindex
++    |  bnez TMP2, <1			// 'no __newindex' flag set: done.
++    |  b ->vmeta_tsetv
++    |5:
++    |  gettp TMP0, TMP2
++    |  addi.d TMP0, TMP0, -LJ_TSTR
++    |  bnez TMP0, ->vmeta_tsetv
++    |  cleartp STR:RC, TMP2
++    |  b ->BC_TSETS_Z			// String key?
++    |
++    |7:  // Possible table write barrier for the value. Skip valiswhite check.
++    |  barrierback TAB:RB, TMP3, TMP0, <2
++    break;
++  case BC_TSETS:
++    |  // RA = src*8, RB = table*8, RC = str_const*8 (~)
++    |  decode_RB RB, INS
++    |  decode_RDtoRC8 RC, RD
++    |  add.d CARG2, BASE, RB
++    |  sub.d CARG3, KBASE, RC
++    |  ld.d TAB:RB, 0(CARG2)
++    |  ld.d RC, -8(CARG3)		// KBASE-8-str_const*8
++    |  add.d RA, BASE, RA
++    |  cleartp STR:RC
++    |  checktab TAB:RB, ->vmeta_tsets1
++    |->BC_TSETS_Z:
++    |  // TAB:RB = GCtab *, STR:RC = GCstr *, RA = BASE+src*8
++    |  ld.w TMP0, TAB:RB->hmask
++    |  ld.w TMP1, STR:RC->sid
++    |  ld.d NODE:TMP2, TAB:RB->node
++    |  st.b r0, TAB:RB->nomm		// Clear metamethod cache.
++    |  and TMP1, TMP1, TMP0		// idx = str->sid & tab->hmask
++    |  slli.w TMP0, TMP1, 5
++    |  slli.w TMP1, TMP1, 3
++    |  sub.w TMP1, TMP0, TMP1
++    |  addi.d TMP3, r0, LJ_TSTR
++    |  add.d NODE:TMP2, NODE:TMP2, TMP1	// node = tab->node + (idx*32-idx*8)
++    |  settp STR:RC, TMP3		// Tagged key to look for.
++    |  fld.d FTMP0, 0(RA)
++    |1:
++    |  ld.d TMP0, NODE:TMP2->key
++    |  ld.d CARG2, NODE:TMP2->val
++    |  ld.d NODE:TMP1, NODE:TMP2->next
++    |  ld.bu TMP3, TAB:RB->marked
++    |  bne TMP0, RC, >5
++    |  ld.d TAB:TMP0, TAB:RB->metatable
++    |  beq CARG2, TISNIL, >4		// Key found, but nil value?
++    |2:
++    |  andi TMP3, TMP3, LJ_GC_BLACK	// isblack(table)
++    |  fst.d FTMP0, NODE:TMP2->val
++    |  bnez TMP3, >7
++    |3:
++    |  ins_next
++    |
++    |4:  // Check for __newindex if previous value is nil.
++    |  beqz TAB:TMP0, <2		// No metatable: done.
++    |  ld.bu TMP0, TAB:TMP0->nomm
++    |  andi TMP0, TMP0, 1<<MM_newindex
++    |  bnez TMP0, <2			// 'no __newindex' flag set: done.
++    |  b ->vmeta_tsets
++    |
++    |5:  // Follow hash chain.
++    |  or NODE:TMP2, NODE:TMP1, r0
++    |  bnez NODE:TMP1, <1
++    |  // End of hash chain: key not found, add a new one
++    |
++    |  // But check for __newindex first.
++    |  ld.d TAB:TMP2, TAB:RB->metatable
++    |  .ADD16I CARG3, DISPATCH, DISPATCH_GL(tmptv)
++    |  beqz TAB:TMP2, >6		// No metatable: continue.
++    |  ld.bu TMP0, TAB:TMP2->nomm
++    |  andi TMP0, TMP0, 1<<MM_newindex
++    |  beqz TMP0, ->vmeta_tsets		// 'no __newindex' flag NOT set: check.
++    |6:
++    |  st.d RC, 0(CARG3)
++    |  st.d BASE, L->base
++    |  or CARG2, TAB:RB, r0
++    |  st.d PC, SAVE_PC(sp)
++    |  or CARG1, L, r0
++    |  bl extern lj_tab_newkey	// (lua_State *L, GCtab *t, TValue *k)
++    |  // Returns TValue *.
++    |  ld.d BASE, L->base
++    |  fst.d FTMP0, 0(CRET1)
++    |  b <3				// No 2nd write barrier needed.
++    |
++    |7:  // Possible table write barrier for the value. Skip valiswhite check.
++    |  barrierback TAB:RB, TMP3, TMP0, <3
++    break;
++  case BC_TSETB:
++    |  // RA = src*8, RB = table*8, RC = index*8
++    |  decode_RB RB, INS
++    |  decode_RDtoRC8 RC, RD
++    |  add.d CARG2, BASE, RB
++    |  add.d RA, BASE, RA
++    |  ld.d TAB:RB, 0(CARG2)
++    |  srli.w TMP0, RC, 3
++    |  checktab RB, ->vmeta_tsetb
++    |  ld.w TMP1, TAB:RB->asize
++    |  ld.d TMP2, TAB:RB->array
++    |  sltu TMP1, TMP0, TMP1
++    |  add.d RC, TMP2, RC
++    |  beqz TMP1, ->vmeta_tsetb
++    |  ld.d TMP1, 0(RC)
++    |  ld.bu TMP3, TAB:RB->marked
++    |  beq TMP1, TISNIL, >5
++    |1:
++    |  ld.d CRET1, 0(RA)
++    |  andi TMP1, TMP3, LJ_GC_BLACK	// isblack(table)
++    |  st.d CRET1, 0(RC)
++    |  bnez TMP1, >7
++    |2:
++    |  ins_next
++    |
++    |5:  // Check for __newindex if previous value is nil.
++    |  ld.d TAB:TMP2, TAB:RB->metatable
++    |  beqz TAB:TMP2, <1		// No metatable: done.
++    |  ld.bu TMP1, TAB:TMP2->nomm
++    |  andi TMP1, TMP1, 1<<MM_newindex
++    |  bnez TMP1, <1			// 'no __newindex' flag set: done.
++    |  b ->vmeta_tsetb	// Caveat: preserve TMP0 and CARG2!
++    |
++    |7:  // Possible table write barrier for the value. Skip valiswhite check.
++    |  barrierback TAB:RB, TMP3, TMP0, <2
++    break;
++  case BC_TSETR:
++    |  // RA = dst*8, RB = table*8, RC = key*8
++    |  decode_RB RB, INS
++    |  decode_RDtoRC8 RC, RD
++    |  add.d CARG1, BASE, RB
++    |  add.d CARG3, BASE, RC
++    |  ld.d TAB:CARG2, 0(CARG1)
++    |  ld.w CARG3, 0(CARG3)
++    |  cleartp TAB:CARG2
++    |  ld.bu TMP3, TAB:CARG2->marked
++    |  ld.w TMP0, TAB:CARG2->asize
++    |  ld.d TMP1, TAB:CARG2->array
++    |  andi TMP2, TMP3, LJ_GC_BLACK	// isblack(table)
++    |  add.d RA, BASE, RA
++    |  bnez TMP2, >7
++    |2:
++    |  sltu TMP0, CARG3, TMP0
++    |  slli.w TMP2, CARG3, 3
++    |  add.d CRET1, TMP1, TMP2
++    |  beqz TMP0, ->vmeta_tsetr		// In array part?
++    |->BC_TSETR_Z:
++    |  ld.d TMP1, 0(RA)
++    |  ins_next1
++    |  st.d TMP1, 0(CRET1)
++    |  ins_next2
++    |
++    |7:  // Possible table write barrier for the value. Skip valiswhite check.
++    |  barrierback TAB:CARG2, TMP3, CRET1, <2
++    break;
++
++  case BC_TSETM:
++    |  // RA = base*8 (table at base-1), RD = num_const*8 (start index)
++    |  add.d RA, BASE, RA
++    |1:
++    |  add.d TMP3, KBASE, RD
++    |  ld.d TAB:CARG2, -8(RA)		// Guaranteed to be a table.
++    |  addi.w TMP0, MULTRES, -8
++    |  ld.w TMP3, 0(TMP3)		// Integer constant is in lo-word.
++    |  srli.w CARG3, TMP0, 3
++    |  beqz TMP0, >4			// Nothing to copy?
++    |  cleartp TAB:CARG2
++    |  add.w CARG3, CARG3, TMP3
++    |  ld.w TMP2, TAB:CARG2->asize
++    |  slli.w TMP1, TMP3, 3
++    |  ld.bu TMP3, TAB:CARG2->marked
++    |  ld.d CARG1, TAB:CARG2->array
++    |  sltu TMP4, TMP2, CARG3
++    |  add.d TMP2, RA, TMP0
++    |  bnez TMP4, >5
++    |  add.d TMP1, TMP1, CARG1
++    |  andi TMP0, TMP3, LJ_GC_BLACK	// isblack(table)
++    |3:  // Copy result slots to table.
++    |  ld.d CRET1, 0(RA)
++    |  addi.d RA, RA, 8
++    |  sltu TMP4, RA, TMP2
++    |  st.d CRET1, 0(TMP1)
++    |  addi.d TMP1, TMP1, 8
++    |  bnez TMP4, <3
++    |  bnez TMP0, >7
++    |4:
++    |  ins_next
++    |
++    |5:  // Need to resize array part.
++    |  st.d BASE, L->base
++    |  st.d PC, SAVE_PC(sp)
++    |  or BASE, RD, r0
++    |  or CARG1, L, r0
++    |  bl extern lj_tab_reasize	// (lua_State *L, GCtab *t, int nasize)
++    |  // Must not reallocate the stack.
++    |  or RD, BASE, r0
++    |  ld.d BASE, L->base        // Reload BASE for lack of a saved register.
++    |  b <1
++    |
++    |7:  // Possible table write barrier for any value. Skip valiswhite check.
++    |  barrierback TAB:CARG2, TMP3, TMP0, <4
++    break;
++
++  /* -- Calls and vararg handling ----------------------------------------- */
++
++  case BC_CALLM:
++    |  // RA = base*8, (RB = (nresults+1)*8,) RC = extra_nargs*8
++    |  decode_RDtoRC8 NARGS8:RC, RD
++    |  add.w NARGS8:RC, NARGS8:RC, MULTRES
++    |  b ->BC_CALL_Z
++    break;
++  case BC_CALL:
++    |  // RA = base*8, (RB = (nresults+1)*8,) RC = (nargs+1)*8
++    |  decode_RDtoRC8 NARGS8:RC, RD
++    |->BC_CALL_Z:
++    |  or TMP2, BASE, r0
++    |  add.d BASE, BASE, RA
++    |  ld.d LFUNC:RB, 0(BASE)
++    |  addi.d BASE, BASE, 16
++    |  addi.w NARGS8:RC, NARGS8:RC, -8
++    |  checkfunc RB, ->vmeta_call
++    |  ins_call
++    break;
++
++  case BC_CALLMT:
++    |  // RA = base*8, (RB = 0,) RC = extra_nargs*8
++    |  add.w NARGS8:RD, NARGS8:RD, MULTRES
++    |  b ->BC_CALLT_Z1
++    break;
++  case BC_CALLT:
++    |  // RA = base*8, (RB = 0,) RC = (nargs+1)*8
++    |->BC_CALLT_Z1:
++    |  add.d RA, BASE, RA
++    |  ld.d LFUNC:RB, 0(RA)
++    |  or NARGS8:RC, RD, r0
++    |  ld.d TMP1, FRAME_PC(BASE)
++    |  addi.d RA, RA, 16
++    |  addi.w NARGS8:RC, NARGS8:RC, -8
++    |  checktp CARG3, LFUNC:RB, -LJ_TFUNC, ->vmeta_callt
++    |->BC_CALLT_Z:
++    |  andi TMP0, TMP1, FRAME_TYPE	// Caveat: preserve TMP0 until the 'or'.
++    |  ld.bu TMP3, LFUNC:CARG3->ffid
++    |  xori TMP2, TMP1, FRAME_VARG
++    |  bnez TMP0, >7
++    |1:
++    |  st.d LFUNC:RB, FRAME_FUNC(BASE)		// Copy function down, but keep PC.
++    |  sltui CARG4, TMP3, 2		// (> FF_C) Calling a fast function?
++    |  or TMP2, BASE, r0
++    |  or RB, CARG3, r0
++    |  or TMP3, NARGS8:RC, r0
++    |  beqz NARGS8:RC, >3
++    |2:
++    |  ld.d CRET1, 0(RA)
++    |  addi.d RA, RA, 8
++    |  addi.w TMP3, TMP3, -8
++    |  st.d CRET1, 0(TMP2)
++    |  addi.d TMP2, TMP2, 8
++    |  bnez TMP3, <2
++    |3:
++    |  or TMP0, TMP0, CARG4
++    |  beqz TMP0, >5
++    |4:
++    |  ins_callt
++    |
++    |5:  // Tailcall to a fast function with a Lua frame below.
++    |  ld.w INS, -4(TMP1)
++    |  decode_RA RA, INS
++    |  sub.d TMP1, BASE, RA
++    |  ld.d TMP1, -32(TMP1)
++    |  cleartp LFUNC:TMP1
++    |  ld.d TMP1, LFUNC:TMP1->pc
++    |  ld.d KBASE, PC2PROTO(k)(TMP1)     // Need to prepare KBASE.
++    |  b <4
++    |
++    |7:  // Tailcall from a vararg function.
++    |  andi CARG4, TMP2, FRAME_TYPEP
++    |  sub.d TMP2, BASE, TMP2          // Relocate BASE down.
++    |  bnez CARG4, <1			// Vararg frame below?
++    |  or BASE, TMP2, r0
++    |  ld.d TMP1, FRAME_PC(TMP2)
++    |  andi TMP0, TMP1, FRAME_TYPE
++    |  b <1
++    break;
++
++  case BC_ITERC:
++    |  // RA = base*8, (RB = (nresults+1)*8, RC = (nargs+1)*8 ((2+1)*8))
++    |  or TMP2, BASE, r0			// Save old BASE for vmeta_call.
++    |  add.d BASE, BASE, RA
++    |  ld.d RB, -24(BASE)		//A, A+1, A+2 = A-3, A-2, A-1.
++    |  ld.d CARG1, -16(BASE)
++    |  ld.d CARG2, -8(BASE)
++    |  addi.d NARGS8:RC, r0, 16		// Iterators get 2 arguments.
++    |  st.d RB, 0(BASE)			// Copy callable.
++    |  st.d CARG1, 16(BASE)		// Copy state.
++    |  st.d CARG2, 24(BASE)		// Copy control var.
++    |  addi.d BASE, BASE, 16
++    |  checkfunc RB, ->vmeta_call
++    |  ins_call
++    break;
++
++  case BC_ITERN:
++    |  // RA = base*8, (RB = (nresults+1)*8, RC = (nargs+1)*8 (2+1)*8)
++    |->vm_IITERN:
++    |  add.d RA, BASE, RA
++    |  ld.d TAB:RB, -16(RA)
++    |  ld.w RC, -8(RA)		// Get index from control var.
++    |  cleartp TAB:RB
++    |  addi.d PC, PC, 4
++    |  ld.w TMP0, TAB:RB->asize
++    |  ld.d TMP1, TAB:RB->array
++    |  slli.d CARG3, TISNUM, 47
++    |1:  // Traverse array part.
++    |  sltu TMP2, RC, TMP0
++    |  slli.w TMP3, RC, 3
++    |  beqz TMP2, >5			// Index points after array part?
++    |  add.d TMP3, TMP1, TMP3
++    |  ld.d CARG1, 0(TMP3)
++    |  ld.hu RD, -4+OFS_RD(PC)		// ITERL RD
++    |  or TMP2, RC, CARG3
++    |  addi.w RC, RC, 1
++    |  beq CARG1, TISNIL, <1		// Skip holes in array part.
++    |  st.d TMP2, 0(RA)
++    |  st.d CARG1, 8(RA)
++    |  addu16i.d TMP3, r0, -0x2		// -BCBIAS_J*4
++    |  decode_BC4b RD
++    |  add.d RD, RD, TMP3
++    |  st.w RC, -8(RA)		// Update control var.
++    |  add.d PC, PC, RD
++    |3:
++    |  ins_next
++    |
++    |5:  // Traverse hash part.
++    |  ld.w TMP1, TAB:RB->hmask
++    |  sub.w RC, RC, TMP0
++    |  ld.d TMP2, TAB:RB->node
++    |6:
++    |  sltu CARG1, TMP1, RC		// End of iteration? Branch to ITERL+1.
++    |  slli.w TMP3, RC, 5
++    |  bnez CARG1, <3
++    |  slli.w RB, RC, 3
++    |  sub.w TMP3, TMP3, RB
++    |  add.d NODE:TMP3, TMP3, TMP2	// node = tab->node + (idx*32-idx*8)
++    |  ld.d CARG1, 0(NODE:TMP3)
++    |  ld.hu RD, -4+OFS_RD(PC)		// ITERL RD
++    |  addi.w RC, RC, 1
++    |  beq CARG1, TISNIL, <6		// Skip holes in hash part.
++    |  ld.d CARG2, NODE:TMP3->key
++    |  addu16i.d TMP3, r0, -0x2		// -BCBIAS_J*4
++    |  st.d CARG1, 8(RA)
++    |  add.w RC, RC, TMP0
++    |  decode_BC4b RD
++    |  add.w RD, RD, TMP3
++    |  st.d CARG2, 0(RA)
++    |   add.d PC, PC, RD
++    |  st.w RC, -8(RA)                // Update control var.
++    |  b <3
++    break;
++
++  case BC_ISNEXT:
++    |  // RA = base*8, RD = target (points to ITERN)
++    |  add.d RA, BASE, RA
++    |  srli.w TMP0, RD, 1
++    |  ld.d CFUNC:CARG1, -24(RA)
++    |  add.d TMP0, PC, TMP0
++    |  ld.d CARG2, -16(RA)
++    |  ld.d CARG3, -8(RA)
++    |  addu16i.d TMP2, r0, -0x2		// -BCBIAS_J*4
++    |  checkfunc CFUNC:CARG1, >5
++    |  gettp CARG2, CARG2
++    |  addi.d CARG2, CARG2, -LJ_TTAB
++    |  ld.bu TMP1, CFUNC:CARG1->ffid
++    |  addi.d CARG3, CARG3, -LJ_TNIL
++    |  or TMP3, CARG2, CARG3
++    |  addi.d TMP1, TMP1, -FF_next_N
++    |  or TMP3, TMP3, TMP1
++    |  addu16i.d TMP1, r0, 0xfffe		// LJ_KEYINDEX >> 16
++    |  bnez TMP3, >5
++    |  add.d PC, TMP0, TMP2
++    |  slli.d TMP1, TMP1, 16
++    |  addu16i.d TMP1, TMP1, 0x7fff		// LJ_KEYINDEX & 0xffff
++    |  slli.d TMP1, TMP1, 16
++    |  st.d TMP1, -8(RA)
++    |1:
++    |  ins_next
++    |5:  // Despecialize bytecode if any of the checks fail.
++    |  addi.d TMP3, r0, BC_JMP
++    |  addi.d TMP1, r0, BC_ITERC
++    |  st.b TMP3, -4+OFS_OP(PC)
++    |  add.d PC, TMP0, TMP2
++    |  st.b TMP1, OFS_OP(PC)
++    |  b <1
++    break;
++
++  case BC_VARG:
++    |  // RA = base*8, RB = (nresults+1)*8, RC = numparams*8
++    |  ld.d TMP0, FRAME_PC(BASE)
++    |  decode_RDtoRC8 RC, RD
++    |  decode_RB RB, INS
++    |  add.d RC, BASE, RC
++    |  add.d RA, BASE, RA
++    |  addi.d RC, RC, FRAME_VARG
++    |  add.d TMP2, RA, RB
++    |  addi.d TMP3, BASE, -16		// TMP3 = vtop
++    |  sub.d RC, RC, TMP0		// RC = vbase
++    |  // Note: RC may now be even _above_ BASE if nargs was < numparams.
++    |  sub.d TMP1, TMP3, RC
++    |  beqz RB, >5			// Copy all varargs?
++    |  addi.d TMP2, TMP2, -16
++    |1:  // Copy vararg slots to destination slots.
++    |  ld.d CARG1, 0(RC)
++    |  sltu TMP0, RC, TMP3
++    |  addi.d RC, RC, 8
++    |  maskeqz CARG1, CARG1, TMP0
++    |  masknez TMP0, TISNIL, TMP0
++    |  or CARG1, CARG1, TMP0
++    |  st.d CARG1, 0(RA)
++    |  sltu TMP0, RA, TMP2
++    |  addi.d RA, RA, 8
++    |  bnez TMP0, <1
++    |3:
++    |  ins_next
++    |
++    |5:  // Copy all varargs.
++    |  ld.d TMP0, L->maxstack
++    |  addi.d MULTRES, r0, 8		// MULTRES = (0+1)*8
++    |  bge r0, TMP1, <3			// No vararg slots?
++    |  add.d TMP2, RA, TMP1
++    |  sltu TMP2, TMP0, TMP2
++    |  addi.d MULTRES, TMP1, 8
++    |  bnez TMP2, >7
++    |6:
++    |  ld.d CRET1, 0(RC)
++    |  addi.d RC, RC, 8
++    |  st.d CRET1, 0(RA)
++    |  sltu TMP0, RC, TMP3
++    |  addi.d RA, RA, 8
++    |  bnez TMP0, <6			// More vararg slots?
++    |  b <3
++    |
++    |7:  // Grow stack for varargs.
++    |  st.d RA, L->top
++    |  sub.d RA, RA, BASE
++    |  st.d BASE, L->base
++    |  sub.d BASE, RC, BASE		// Need delta, because BASE may change.
++    |  st.d PC, SAVE_PC(sp)
++    |  srli.w CARG2, TMP1, 3
++    |  or CARG1, L, r0
++    |  bl extern lj_state_growstack	// (lua_State *L, int n)
++    |  or RC, BASE, r0
++    |  ld.d BASE, L->base
++    |  add.d RA, BASE, RA
++    |  add.d RC, BASE, RC
++    |  addi.d TMP3, BASE, -16
++    |  b <6
++    break;
++
++  /* -- Returns ----------------------------------------------------------- */
++
++  case BC_RETM:
++    |  // RA = results*8, RD = extra_nresults*8
++    |  add.w RD, RD, MULTRES
++    |  b ->BC_RET_Z1
++    break;
++
++  case BC_RET:
++    |  // RA = results*8, RD = (nresults+1)*8
++    |->BC_RET_Z1:
++    |  ld.d PC, FRAME_PC(BASE)
++    |  add.d RA, BASE, RA
++    |  or MULTRES, RD, r0
++    |1:
++    |  andi TMP0, PC, FRAME_TYPE
++    |  xori TMP1, PC, FRAME_VARG
++    |  bnez TMP0, ->BC_RETV_Z
++    |
++    |->BC_RET_Z:
++    |  // BASE = base, RA = resultptr, RD = (nresults+1)*8, PC = return
++    |  ld.w INS, -4(PC)
++    |  addi.d TMP2, BASE, -16
++    |  addi.d RC, RD, -8
++    |  decode_RA TMP0, INS
++    |  decode_RB RB, INS
++    |  add.d TMP3, TMP2, RB
++    |  sub.d BASE, TMP2, TMP0
++    |  beqz RC, >3
++    |2:
++    |  ld.d CRET1, 0(RA)
++    |  addi.d RA, RA, 8
++    |  addi.d RC, RC, -8
++    |  st.d CRET1, 0(TMP2)
++    |  addi.d TMP2, TMP2, 8
++    |  bnez RC, <2
++    |3:
++    |  addi.d TMP3, TMP3, -8
++    |5:
++    |  sltu TMP0, TMP2, TMP3
++    |  ld.d LFUNC:TMP1, FRAME_FUNC(BASE)
++    |  bnez TMP0, >6
++    |  cleartp LFUNC:TMP1
++    |  ld.d TMP1, LFUNC:TMP1->pc
++    |  ld.d KBASE, PC2PROTO(k)(TMP1)
++    |  ins_next
++    |
++    |6:  // Fill up results with nil.
++    |  st.d TISNIL, 0(TMP2)
++    |  addi.d TMP2, TMP2, 8
++    |  b <5
++    |
++    |->BC_RETV_Z:  // Non-standard return case.
++    |  andi TMP2, TMP1, FRAME_TYPEP
++    |  bnez TMP2, ->vm_return
++    |  // Return from vararg function: relocate BASE down.
++    |  sub.d BASE, BASE, TMP1
++    |  ld.d PC, FRAME_PC(BASE)
++    |  b <1
++    break;
++
++  case BC_RET0: case BC_RET1:
++    |  // RA = results*8, RD = (nresults+1)*8
++    |  ld.d PC, FRAME_PC(BASE)
++    |  add.d RA, BASE, RA
++    |  or MULTRES, RD, r0
++    |  andi TMP0, PC, FRAME_TYPE
++    |  xori TMP1, PC, FRAME_VARG
++    |  bnez TMP0, ->BC_RETV_Z
++    |  ld.w INS, -4(PC)
++    |  addi.d TMP2, BASE, -16
++    if (op == BC_RET1) {
++      |  ld.d CRET1, 0(RA)
++    }
++    |  decode_RB RB, INS
++    |  decode_RA RA, INS
++    |  sub.d BASE, TMP2, RA
++    if (op == BC_RET1) {
++      |  st.d CRET1, 0(TMP2)
++    }
++    |5:
++    |  sltu TMP0, RD, RB
++    |  ld.d TMP1, FRAME_FUNC(BASE)
++    |  bnez TMP0, >6
++    |  cleartp LFUNC:TMP1
++    |  ld.d TMP1, LFUNC:TMP1->pc
++    |  ins_next1
++    |  ld.d KBASE, PC2PROTO(k)(TMP1)
++    |  ins_next2
++    |
++    |6:  // Fill up results with nil.
++    |  addi.d TMP2, TMP2, 8
++    |  addi.d RD, RD, 8
++    if (op == BC_RET1) {
++      |  st.d TISNIL, 0(TMP2)
++    } else {
++      |  st.d TISNIL, -8(TMP2)
++    }
++    |  b <5
++    break;
++
++  /* -- Loops and branches ------------------------------------------------ */
++
++  case BC_FORL:
++    |  // Fall through. Assumes BC_IFORL follows.
++    break;
++
++  case BC_JFORI:
++  case BC_JFORL:
++#if !LJ_HASJIT
++    break;
++#endif
++  case BC_FORI:
++  case BC_IFORL:
++    |  // RA = base*8, RD = target (after end of loop or start of loop)
++    vk = (op == BC_IFORL || op == BC_JFORL);
++    |  add.d RA, BASE, RA
++    |  ld.d CARG1, FORL_IDX*8(RA)		// CARG1 = IDX
++    |  ld.d CARG2, FORL_STEP*8(RA)		// CARG2 = STEP
++    |  ld.d CARG3, FORL_STOP*8(RA)		// CARG3 = STOP
++    |  gettp CARG4, CARG1
++    |  gettp CARG5, CARG2
++    |  gettp CARG6, CARG3
++    if (op != BC_JFORL) {
++      |  srli.w RD, RD, 1
++      |  addu16i.d TMP2, r0, -0x2	// -BCBIAS_J<<2
++      |  add.d TMP2, RD, TMP2
++    }
++    |  bne CARG4, TISNUM, >3
++    |  slli.w CARG4, CARG1, 0		// start
++    |  slli.w CARG3, CARG3, 0		// stop
++    if (!vk) {				// init
++      |  bne CARG6, TISNUM, ->vmeta_for
++      |  bne CARG5, TISNUM, ->vmeta_for
++      |  bstrpick.d TMP0, CARG2, 31, 31	// sign
++      |  slt CARG2, CARG3, CARG4
++      |  slt TMP1, CARG4, CARG3
++      |  maskeqz TMP1, TMP1, TMP0
++      |  masknez CARG2, CARG2, TMP0
++      |  or CARG2, CARG2, TMP1		// CARG2=0: +,start <= stop or -,start >= stop
++    } else {
++      |  slli.w CARG5, CARG2, 0		// step
++      |  add.w CARG1, CARG4, CARG5	// start + step
++      |  xor TMP3, CARG1, CARG4		// y^a
++      |  xor TMP1, CARG1, CARG5		// y^b
++      |  and TMP3, TMP3, TMP1
++      |  slt TMP1, CARG1, CARG3		// start+step < stop ?
++      |  slt CARG3, CARG3, CARG1	// stop < start+step ?
++      |  slt TMP0, CARG5, r0		// step < 0 ?
++      |  slt TMP3, TMP3, r0		// ((y^a) & (y^b)) < 0: overflow.
++      |  maskeqz TMP1, TMP1, TMP0
++      |  masknez CARG3, CARG3, TMP0
++      |  or CARG3, CARG3, TMP1
++      |  or CARG2, CARG3, TMP3		// CARG2=1: overflow; CARG2=0: continue
++      |  bstrpick.d CARG1, CARG1, 31, 0
++      |  settp CARG1, TISNUM
++      |  st.d CARG1, FORL_IDX*8(RA)
++    }
++    |1:
++    if (op == BC_FORI) {
++      |  maskeqz TMP2, TMP2, CARG2	// CARG2!=0: jump out the loop; CARG2==0: next INS
++      |  add.d PC, PC, TMP2
++    } else if (op == BC_JFORI) {
++      |  add.d PC, PC, TMP2
++      |  ld.hu RD, -4+OFS_RD(PC)
++    } else if (op == BC_IFORL) {
++      |  masknez TMP2, TMP2, CARG2	// CARG2!=0: next INS; CARG2==0: jump back
++      |  add.d PC, PC, TMP2
++    }
++    |  ins_next1
++    |  st.d CARG1, FORL_EXT*8(RA)
++    |2:
++    if (op == BC_JFORI) {
++      |  decode_BC8b RD
++      |  beqz CARG2, =>BC_JLOOP		// CARG2 == 0: execute the loop
++    } else if (op == BC_JFORL) {
++      |  beqz CARG2, =>BC_JLOOP
++    }
++    |  ins_next2
++    |
++    |3:  // FP loop.
++    |  fld.d FTMP0, FORL_IDX*8(RA)	// start
++    |  fld.d FTMP1, FORL_STOP*8(RA)	// stop
++    |  ld.d TMP0, FORL_STEP*8(RA)	// step
++    |  slt TMP0, TMP0, r0		// step < 0 ?
++    |  movgr2fr.d FTMP2, TMP0
++    if (!vk) {
++      |  sltui TMP3, CARG4, LJ_TISNUM	// start is number ?
++      |  sltui TMP0, CARG5, LJ_TISNUM	// step is number ?
++      |  sltui TMP1, CARG6, LJ_TISNUM	// stop is number ?
++      |  and TMP3, TMP3, TMP1
++      |  and TMP0, TMP0, TMP3
++      |  beqz TMP0, ->vmeta_for		// if start or step or stop isn't number
++      |  fcmp.clt.d FCC0, FTMP0, FTMP1		// start < stop ?
++      |  fcmp.clt.d FCC1, FTMP1, FTMP0		// stop < start ?
++      |  movcf2fr FTMP3, FCC0
++      |  movcf2fr FTMP4, FCC1
++      |  movfr2cf FCC0, FTMP2
++      |  fsel FTMP2, FTMP4, FTMP3, FCC0
++      |  movfr2gr.d CARG2, FTMP2	// CARG2=0:+,start<stop or -,start>stop
++      |  b <1
++    } else {
++      |  fld.d FTMP3, FORL_STEP*8(RA)
++      |  fadd.d FTMP0, FTMP0, FTMP3		// start + step
++      |  fcmp.clt.d FCC0, FTMP0, FTMP1		// start + step < stop ?
++      |  fcmp.clt.d FCC1, FTMP1, FTMP0
++      |  movcf2fr FTMP3, FCC0
++      |  movcf2fr FTMP4, FCC1
++      |  movfr2cf FCC0, FTMP2
++      |  fsel FTMP2, FTMP4, FTMP3, FCC0
++      |  movfr2gr.d CARG2, FTMP2
++      if (op == BC_IFORL) {
++	|  masknez TMP2, TMP2, CARG2
++	|  add.d PC, PC, TMP2
++      }
++      |  fst.d FTMP0, FORL_IDX*8(RA)
++      |  ins_next1
++      |  fst.d FTMP0, FORL_EXT*8(RA)
++      |  b <2
++    }
++    break;
++
++  case BC_ITERL:
++    |  // Fall through. Assumes BC_IITERL follows.
++    break;
++
++  case BC_JITERL:
++#if !LJ_HASJIT
++    break;
++#endif
++  case BC_IITERL:
++    |  // RA = base*8, RD = target
++    |  add.d RA, BASE, RA
++    |  ld.d TMP1, 0(RA)
++    |  beq TMP1, TISNIL, >1		// Stop if iterator returned nil.
++    if (op == BC_JITERL) {
++      |  st.d TMP1,-8(RA)
++      |  b =>BC_JLOOP
++    } else {
++      |  branch_RD			// Otherwise save control var + branch.
++      |  st.d TMP1, -8(RA)
++    }
++    |1:
++    |  ins_next
++    break;
++
++  case BC_LOOP:
++    |  // RA = base*8, RD = target (loop extent)
++    |  // Note: RA/RD is only used by trace recorder to determine scope/extent
++    |  // This opcode does NOT jump, its only purpose is to detect a hot loop.
++    |  // Fall through. Assumes BC_ILOOP follows.
++    break;
++
++  case BC_ILOOP:
++    |  // RA = base*8, RD = target (loop extent)
++    |  ins_next
++    break;
++
++  case BC_JLOOP:
++    break;
++
++  case BC_JMP:
++    |  // RA = base*8 (only used by trace recorder), RD = target
++    |  branch_RD		// PC + (jump - 0x8000)<<2
++    |  ins_next
++    break;
++
++  /* -- Function headers -------------------------------------------------- */
++
++  case BC_FUNCF:
++  case BC_FUNCV:  /* NYI: compiled vararg functions. */
++    |  // Fall through. Assumes BC_IFUNCF/BC_IFUNCV follow.
++    break;
++
++  case BC_JFUNCF:
++#if !LJ_HASJIT
++    break;
++#endif
++  case BC_IFUNCF:
++    |  // BASE = new base, RA = BASE+framesize*8, RB = LFUNC, RC = nargs*8
++    |  ld.d TMP2, L->maxstack
++    |  ld.bu TMP1, -4+PC2PROTO(numparams)(PC)
++    |  ld.d KBASE, -4+PC2PROTO(k)(PC)
++    |  sltu TMP0, TMP2, RA
++    |  slli.w TMP1, TMP1, 3			// numparams*8
++    |  bnez TMP0, ->vm_growstack_l
++    |2:
++    |  sltu TMP0, NARGS8:RC, TMP1		// Check for missing parameters.
++    |  bnez TMP0, >3
++    if (op == BC_JFUNCF) {
++      |  decode_RD RD, INS
++      |  b =>BC_JLOOP
++    } else {
++      |  ins_next
++    }
++    |
++    |3:  // Clear missing parameters.
++    |  add.d TMP0, BASE, NARGS8:RC
++    |  st.d TISNIL, 0(TMP0)
++    |  addi.w NARGS8:RC, NARGS8:RC, 8
++    |  b <2
++    break;
++
++  case BC_JFUNCV:
++#if !LJ_HASJIT
++    break;
++#endif
++    |  NYI  // NYI: compiled vararg functions
++    break;  /* NYI: compiled vararg functions. */
++
++  case BC_IFUNCV:
++    |  // BASE = new base, RA = BASE+framesize*8, RB = LFUNC, RC = nargs*8
++    |  addi.w TMP0, r0, LJ_TFUNC
++    |  add.d TMP1, BASE, RC
++    |  ld.d TMP2, L->maxstack
++    |  settp LFUNC:RB, TMP0
++    |  add.d TMP0, RA, RC
++    |  st.d LFUNC:RB, 0(TMP1)		// Store (tagged) copy of LFUNC.
++    |  addi.d TMP3, RC, 16+FRAME_VARG
++    |  sltu TMP0, TMP0, TMP2
++    |  ld.d KBASE, -4+PC2PROTO(k)(PC)
++    |  st.d TMP3, 8(TMP1)                // Store delta + FRAME_VARG.
++    |  beqz TMP0, ->vm_growstack_l
++    |  ld.bu TMP2, -4+PC2PROTO(numparams)(PC)
++    |  or RA, BASE, r0
++    |  or RC, TMP1, r0
++    |  ins_next1
++    |  addi.d BASE, TMP1, 16
++    |  beqz TMP2, >2
++    |1:
++    |  ld.d TMP0, 0(RA)
++    |  sltu CARG2, RA, RC			// Less args than parameters?
++    |  or CARG1, TMP0, r0
++    |  addi.d RA, RA, 8
++    |  addi.d TMP1, TMP1, 8
++    |  addi.w TMP2, TMP2, -1
++    |  beqz CARG2, >3
++    |  masknez TMP3, CARG1, CARG2		// Clear old fixarg slot (help the GC).
++    |  maskeqz CARG1, TISNIL, CARG2
++    |  or CARG1, CARG1, TMP3
++    |  st.d CARG1, -8(RA)
++    |  st.d TMP0, 8(TMP1)
++    |  bnez TMP2, <1
++    |2:
++    |  ins_next2
++    |3:
++    |  maskeqz TMP0, TMP0, CARG2		// Clear missing fixargs.
++    |  masknez TMP3, TISNIL, CARG2
++    |  or TMP0, TMP0, TMP3
++    |  st.d TMP0, 8(TMP1)
++    |  bnez TMP2, <1
++    |  b <2
++    break;
++
++  case BC_FUNCC:
++  case BC_FUNCCW:
++    |  // BASE = new base, RA = BASE+framesize*8, RB = CFUNC, RC = nargs*8
++    if (op == BC_FUNCC) {
++      |  ld.d CARG4, CFUNC:RB->f
++    } else {
++      |  .LDXD CARG4, DISPATCH, DISPATCH_GL(wrapf)
++    }
++    |  add.d TMP1, RA, NARGS8:RC
++    |  ld.d TMP2, L->maxstack
++    |  add.d RC, BASE, NARGS8:RC
++    |  st.d BASE, L->base		// base of currently executing function
++    |  st.d RC, L->top
++    |  sltu TMP3, TMP2, TMP1
++    |  li_vmstate C			// addi.w TMP0, r0, ~LJ_VMST_C
++    if (op == BC_FUNCCW) {
++      |  ld.d CARG2, CFUNC:RB->f
++    }
++    |  or CARG1, L, r0
++    |  bnez TMP3, ->vm_growstack_c	// Need to grow stack.
++    |  st_vmstate			// .STXW TMP0, DISPATCH, DISPATCH_GL(vmstate)
++    |  jirl r1, CARG4, 0		// (lua_State *L [, lua_CFunction f])
++    |  // Returns nresults.
++    |  ld.d BASE, L->base
++    |  ld.d TMP1, L->top
++    |  .STXD L, DISPATCH, DISPATCH_GL(cur_L)
++    |  slli.w RD, CRET1, 3
++    |  li_vmstate INTERP
++    |  ld.d PC, FRAME_PC(BASE)		// Fetch PC of caller.
++    |  sub.d RA, TMP1, RD		// RA = L->top - nresults*8
++    |  st_vmstate
++    |  b ->vm_returnc
++    break;
++
++  /* ---------------------------------------------------------------------- */
++
++  default:
++    fprintf(stderr, "Error: undefined opcode BC_%s\n", bc_names[op]);
++    exit(2);
++    break;
++  }
++}
++
++static int build_backend(BuildCtx *ctx)
++{
++  int op;
++  /* Size the DynASM pc-label table to BC__MAX entries (used by the =>BC_xxx refs). */
++  dasm_growpc(Dst, BC__MAX);
++  /* Runs before any opcode handler is emitted; presumably the shared ->vm_* / ->vmeta_* code. */
++  build_subroutines(ctx);
++  /* Select the code_op section, then call build_ins() once per opcode in ascending order. */
++  |.code_op
++  for (op = 0; op < BC__MAX; op++)
++    build_ins(ctx, (BCOp)op, op);
++  /* Returns BC__MAX, i.e. the number of build_ins() calls made above. */
++  return BC__MAX;
++}
+-- 
+2.20.1
+
diff --git a/loongarch64/0007-LoongArch64-Add-definitions-for-target-CPU.patch b/loongarch64/0007-LoongArch64-Add-definitions-for-target-CPU.patch
new file mode 100644
index 0000000..410cf27
--- /dev/null
+++ b/loongarch64/0007-LoongArch64-Add-definitions-for-target-CPU.patch
@@ -0,0 +1,364 @@
+From aa998fd2fc7efd7c5357041915baa92d936a1df5 Mon Sep 17 00:00:00 2001
+From: Xiaolin Zhao <zhaoxiaolin@loongson.cn>
+Date: Wed, 27 Jul 2022 16:06:56 +0800
+Subject: [PATCH 07/20] LoongArch64: Add definitions for target CPU
+
+---
+ src/lj_target.h             |   6 +-
+ src/lj_target_loongarch64.h | 313 ++++++++++++++++++++++++++++++++++++
+ 2 files changed, 317 insertions(+), 2 deletions(-)
+ create mode 100644 src/lj_target_loongarch64.h
+
+diff --git a/src/lj_target.h b/src/lj_target.h
+index 19716928..3a581db1 100644
+--- a/src/lj_target.h
++++ b/src/lj_target.h
+@@ -55,7 +55,7 @@ typedef uint32_t RegSP;
+ /* Bitset for registers. 32 registers suffice for most architectures.
+ ** Note that one set holds bits for both GPRs and FPRs.
+ */
+-#if LJ_TARGET_PPC || LJ_TARGET_MIPS || LJ_TARGET_ARM64
++#if LJ_TARGET_PPC || LJ_TARGET_MIPS || LJ_TARGET_ARM64 || LJ_TARGET_LOONGARCH64
+ typedef uint64_t RegSet;
+ #else
+ typedef uint32_t RegSet;
+@@ -69,7 +69,7 @@ typedef uint32_t RegSet;
+ #define rset_set(rs, r)		(rs |= RID2RSET(r))
+ #define rset_clear(rs, r)	(rs &= ~RID2RSET(r))
+ #define rset_exclude(rs, r)	(rs & ~RID2RSET(r))
+-#if LJ_TARGET_PPC || LJ_TARGET_MIPS || LJ_TARGET_ARM64
++#if LJ_TARGET_PPC || LJ_TARGET_MIPS || LJ_TARGET_ARM64 || LJ_TARGET_LOONGARCH64
+ #define rset_picktop(rs)	((Reg)(__builtin_clzll(rs)^63))
+ #define rset_pickbot(rs)	((Reg)__builtin_ctzll(rs))
+ #else
+@@ -144,6 +144,8 @@ typedef uint32_t RegCost;
+ #include "lj_target_ppc.h"
+ #elif LJ_TARGET_MIPS
+ #include "lj_target_mips.h"
++#elif LJ_TARGET_LOONGARCH64
++#include "lj_target_loongarch64.h"
+ #else
+ #error "Missing include for target CPU"
+ #endif
+diff --git a/src/lj_target_loongarch64.h b/src/lj_target_loongarch64.h
+new file mode 100644
+index 00000000..100f5e87
+--- /dev/null
++++ b/src/lj_target_loongarch64.h
+@@ -0,0 +1,313 @@
++/*
++** Definitions for LoongArch CPUs.
++** Copyright (C) 2005-2022 Mike Pall. See Copyright Notice in luajit.h
++*/
++
++#ifndef _LJ_TARGET_LOONGARCH_H
++#define _LJ_TARGET_LOONGARCH_H
++
++/* -- Register IDs -------------------------------------------------------- */
++/* X-macro lists; list position is the register ID (ID 1 is named RA, 3 SP, 21 X). */
++#define GPRDEF(_) \
++  _(R0) _(RA) _(R2) _(SP) _(R4) _(R5) _(R6) _(R7) \
++  _(R8) _(R9) _(R10) _(R11) _(R12) _(R13) _(R14) _(R15) \
++  _(R16) _(R17) _(R18) _(R19) _(R20) _(X) _(R22) _(R23) \
++  _(R24) _(R25) _(R26) _(R27) _(R28) _(R29) _(R30) _(R31)
++#define FPRDEF(_) \
++  _(F0) _(F1) _(F2) _(F3) _(F4) _(F5) _(F6) _(F7) \
++  _(F8) _(F9) _(F10) _(F11) _(F12) _(F13) _(F14) _(F15) \
++  _(F16) _(F17) _(F18) _(F19) _(F20) _(F21) _(F22) _(F23) \
++  _(F24) _(F25) _(F26) _(F27) _(F28) _(F29) _(F30) _(F31)
++#define VRIDDEF(_)	/* Expands to nothing: contributes no additional IDs. */
++
++#define RIDENUM(name)	RID_##name,
++
++enum {
++  GPRDEF(RIDENUM)		/* General-purpose registers (GPRs). */
++  FPRDEF(RIDENUM)		/* Floating-point registers (FPRs). */
++  RID_MAX,			/* 64: the 32 GPR IDs followed by the 32 FPR IDs. */
++  RID_ZERO = RID_R0,		/* Alias for ID 0. */
++  RID_TMP = RID_RA,		/* Alias for ID 1; kept out of RSET_GPR by RSET_FIXED. */
++
++  /* Calling conventions. */
++  RID_RET = RID_R4,		/* Same ID as REGARG_FIRSTGPR. */
++
++  RID_RETHI = RID_R5,
++  RID_RETLO = RID_R4,		/* Same ID as RID_RET. */
++
++  RID_FPRET = RID_F0,		/* Same ID as REGARG_FIRSTFPR. */
++
++  /* These definitions must match with the *.dasc file(s): */
++  RID_BASE = RID_R23,		/* Interpreter BASE. */
++  RID_LPC = RID_R25,		/* Interpreter PC. */
++  RID_DISPATCH = RID_R26,	/* Interpreter DISPATCH table. */
++  RID_LREG = RID_R27,		/* Interpreter L. */
++  RID_JGL = RID_R22,		/* On-trace: global_State + 32768. */
++
++  /* Register ranges [min, max) and number of registers. */
++  RID_MIN_GPR = RID_R0,
++  RID_MAX_GPR = RID_R31+1,
++  RID_MIN_FPR = RID_MAX_GPR,	/* FPR IDs start right after the GPR IDs. */
++  RID_MAX_FPR = RID_F31+1,
++  RID_NUM_GPR = RID_MAX_GPR - RID_MIN_GPR,
++  RID_NUM_FPR = RID_MAX_FPR - RID_MIN_FPR
++};
++
++#define RID_NUM_KREF		RID_NUM_GPR
++#define RID_MIN_KREF		RID_R0
++
++/* -- Register sets ------------------------------------------------------- */
++
++/* Make use of all registers, except ZERO, TMP, R2, SP, JGL, R20 and X. */
++#define RSET_FIXED \
++  (RID2RSET(RID_ZERO)|RID2RSET(RID_TMP)|RID2RSET(RID_R2)|\
++   RID2RSET(RID_SP)|RID2RSET(RID_JGL)|RID2RSET(RID_R20)|\
++   RID2RSET(RID_X))
++#define RSET_GPR	(RSET_RANGE(RID_MIN_GPR, RID_MAX_GPR) - RSET_FIXED)
++#define RSET_FPR	RSET_RANGE(RID_MIN_FPR, RID_MAX_FPR)
++#define RSET_ALL	(RSET_GPR|RSET_FPR)
++#define RSET_INIT	RSET_ALL
++
++/* Scratch register sets (R4..R19 and F0..F23), then the argument register ranges. */
++#define RSET_SCRATCH_GPR	RSET_RANGE(RID_R4, RID_R19+1)
++#define RSET_SCRATCH_FPR	RSET_RANGE(RID_F0, RID_F23+1)
++#define RSET_SCRATCH		(RSET_SCRATCH_GPR|RSET_SCRATCH_FPR)
++#define REGARG_FIRSTGPR		RID_R4
++#define REGARG_LASTGPR		RID_R11
++#define REGARG_NUMGPR		8	/* R4..R11 inclusive. */
++#define REGARG_FIRSTFPR		RID_F0
++#define REGARG_LASTFPR		RID_F7
++#define REGARG_NUMFPR		8	/* F0..F7 inclusive. */
++
++/* -- Spill slots --------------------------------------------------------- */
++
++/* Spill slots are 32 bit wide. An even/odd pair is used for FPRs.
++**
++** SPS_FIXED: Available fixed spill slots in interpreter frame.
++** This definition must match with the *.dasc file(s).
++**
++** SPS_FIRST: First spill slot for general use.
++*/
++#define SPS_FIXED	4
++#define SPS_FIRST	4
++
++#define SPOFS_TMP	0
++/* sps_scale: slot index -> byte offset (4 bytes per slot); sps_align: round (slot - SPS_FIXED) up to a multiple of 4. */
++#define sps_scale(slot)		(4 * (int32_t)(slot))
++#define sps_align(slot)		(((slot) - SPS_FIXED + 3) & ~3)
++
++/* -- Exit state ---------------------------------------------------------- */
++
++/* This definition must match with the *.dasc file(s). */
++typedef struct {
++  lua_Number fpr[RID_NUM_FPR];	/* Floating-point registers. */
++  intptr_t gpr[RID_NUM_GPR];	/* General-purpose registers. */
++  int32_t spill[256];		/* Spill slots. */
++} ExitState;
++
++/* Highest exit + 1 indicates stack check. */
++#define EXITSTATE_CHECKEXIT	1
++
++/* Return the address of a per-trace exit stub: the first non-NOP word at or after p. */
++static LJ_AINLINE uint32_t *exitstub_trace_addr_(uint32_t *p)
++{
++  while (*p == 0x03400000) p++;		/* Skip LOONGI_NOP (literal: the enum is declared further down). */
++  return p;
++}
++/* Avoid dependence on lj_jit.h if only including lj_target.h. 'exitno' is unused; the scan starts at mcode + szmcode. */
++#define exitstub_trace_addr(T, exitno) \
++  exitstub_trace_addr_((MCode *)((char *)(T)->mcode + (T)->szmcode))
++
++/* -- Instructions -------------------------------------------------------- */
++
++/* Instruction fields: shift an operand to its bit offset (D=0, J=5, K=10, A=15, I=10, I20=5, M=16). */
++#define LOONGF_D(r)	(r)
++#define LOONGF_J(r)	((r) << 5)
++#define LOONGF_K(r)	((r) << 10)
++#define LOONGF_A(r)	((r) << 15)
++#define LOONGF_I(n)	((n) << 10)
++#define LOONGF_I20(n)	((n) << 5)
++#define LOONGF_M(n)	((n) << 16)
++/* NOTE(review): 'b' is not parenthesized in (b-1) below; pass only simple expressions for b. */
++/* Check for valid field range: true when -(1<<(b-1)) <= x < (1<<(b-1)), assuming signed x and arithmetic >>. */
++#define LOONGF_S_OK(x, b) ((((x) + (1 << (b-1))) >> (b)) == 0)
++
++typedef enum LOONGIns {  /* Base opcode words; operand fields are ORed in via LOONGF_*. */
++  /* Integer instructions. */
++  LOONGI_MOVE = 0x00150000,  /* Same encoding as LOONGI_OR. */
++  LOONGI_NOP = 0x03400000,  /* Same encoding as LOONGI_ANDI with all fields zero. */
++
++  LOONGI_AND = 0x00148000,
++  LOONGI_ANDI = 0x03400000,
++  LOONGI_OR = 0x00150000,
++  LOONGI_ORI = 0x03800000,
++  LOONGI_XOR = 0x00158000,
++  LOONGI_XORI = 0x03c00000,
++  LOONGI_NOR = 0x00140000,
++
++  LOONGI_SLT = 0x00120000,
++  LOONGI_SLTU = 0x00128000,
++  LOONGI_SLTI = 0x02000000,
++  LOONGI_SLTUI = 0x02400000,
++
++  LOONGI_ADD_W = 0x00100000,
++  LOONGI_ADDI_W = 0x02800000,
++  LOONGI_SUB_W = 0x00110000,
++  LOONGI_MUL_W = 0x001c0000,
++  LOONGI_MULH_W = 0x001c8000,
++  LOONGI_DIV_W = 0x00200000,
++  LOONGI_DIV_WU = 0x00210000,
++
++  LOONGI_SLLI_W = 0x00408000,
++  LOONGI_SRLI_W = 0x00448000,
++  LOONGI_SRAI_W = 0x00488000,
++  LOONGI_ROTRI_W = 0x004c8000,
++  LOONGI_ROTRI_D = 0x004d0000,
++  LOONGI_SLL_W = 0x00170000,
++  LOONGI_SRL_W = 0x00178000,
++  LOONGI_SRA_W = 0x00180000,
++  LOONGI_ROTR_W = 0x001b0000,
++  LOONGI_ROTR_D = 0x001b8000,
++
++  LOONGI_EXT_W_B = 0x00005c00,
++  LOONGI_EXT_W_H = 0x00005800,
++  LOONGI_REVB_2H = 0x00003000,
++  LOONGI_REVB_4H = 0x00003400,
++
++  LOONGI_ALSL_W = 0x00040000,
++  LOONGI_ALSL_D = 0x002c0000,
++
++  LOONGI_B = 0x50000000,  /* Branches and jumps. */
++  LOONGI_BL = 0x54000000,
++  LOONGI_JIRL = 0x4c000000,
++
++  LOONGI_BEQ = 0x58000000,  /* Each pair below differs only in bit 26 (see asm_guard). */
++  LOONGI_BNE = 0x5c000000,
++  LOONGI_BLT = 0x60000000,
++  LOONGI_BGE = 0x64000000,
++  LOONGI_BGEU = 0x6c000000,
++  LOONGI_BLTU = 0x68000000,
++  LOONGI_BCEQZ = 0x48000000,  /* BCEQZ/BCNEZ differ only in bit 8. */
++  LOONGI_BCNEZ = 0x48000100,
++
++  /* Load/store instructions. */
++  LOONGI_LD_W = 0x28800000,
++  LOONGI_LD_D = 0x28c00000,
++  LOONGI_ST_W = 0x29800000,
++  LOONGI_ST_D = 0x29c00000,
++  LOONGI_LD_B = 0x28000000,
++  LOONGI_ST_B = 0x29000000,
++  LOONGI_LD_H = 0x28400000,
++  LOONGI_ST_H = 0x29400000,
++  LOONGI_LD_BU = 0x2a000000,
++  LOONGI_LD_HU = 0x2a400000,
++  LOONGI_LDX_B = 0x38000000,  /* *X_* forms take a register index instead of an immediate. */
++  LOONGI_LDX_BU = 0x38200000,
++  LOONGI_LDX_H = 0x38040000,
++  LOONGI_LDX_HU = 0x38240000,
++  LOONGI_LDX_D = 0x380c0000,
++  LOONGI_STX_D = 0x381c0000,
++  LOONGI_LDX_W = 0x38080000,
++  LOONGI_STX_W = 0x38180000,
++  LOONGI_STX_B = 0x38100000, 
++  LOONGI_STX_H = 0x38140000,
++  LOONGI_FLD_S = 0x2b000000,
++  LOONGI_FST_S = 0x2b400000,
++  LOONGI_FLD_D = 0x2b800000,
++  LOONGI_FST_D = 0x2bc00000,
++  LOONGI_FLDX_D = 0x38340000,
++  LOONGI_FLDX_S = 0x38300000,
++  LOONGI_FSTX_D = 0x383c0000,
++  LOONGI_FSTX_S = 0x38380000,
++
++  LOONGI_ADD_D = 0x00108000,  /* 64 bit integer arithmetic and constant building. */
++  LOONGI_ADDI_D = 0x02c00000,
++  LOONGI_ADDU16I_D = 0x10000000,
++  LOONGI_LU12I_W = 0x14000000,
++  LOONGI_LU32I_D = 0x16000000,
++  LOONGI_LU52I_D = 0x3000000,  /* I.e. 0x03000000. */
++  LOONGI_SUB_D = 0x00118000,
++  LOONGI_DIV_D = 0x00220000,
++  LOONGI_DIV_DU = 0x00230000,
++  LOONGI_MUL_D = 0x001d8000,
++
++  LOONGI_SLLI_D = 0x00410000,
++  LOONGI_SRLI_D = 0x00450000,
++  LOONGI_SLL_D = 0x00188000,
++  LOONGI_SRL_D = 0x00190000,
++  LOONGI_SRAI_D = 0x00490000,
++  LOONGI_SRA_D = 0x00198000,
++  LOONGI_REVH_D = 0x00004400,
++
++  /* Extract/insert instructions. */
++  LOONGI_BSTRPICK_D = 0x00c00000,
++  LOONGI_BSTRINS_D = 0x00800000,
++
++  LOONGI_MASKEQZ = 0x00130000,
++  LOONGI_MASKNEZ = 0x00138000,
++
++  /* FP instructions. */
++  LOONGI_FRINT_S = 0x011e4400,
++  LOONGI_FRINT_D = 0x011e4800,
++  LOONGI_FTINTRM_L_D = 0x011a2800,
++  LOONGI_FTINTRP_L_D = 0x011a6800,
++  LOONGI_FTINTRNE_L_D = 0x011ae800,
++
++  LOONGI_FMOV_S = 0x01149400,
++  LOONGI_FMOV_D = 0x01149800,
++
++  LOONGI_FABS_D = 0x01140800,
++  LOONGI_FNEG_D = 0x01141800,
++
++  LOONGI_FADD_D = 0x01010000,
++  LOONGI_FSUB_D = 0x01030000,
++  LOONGI_FMUL_D = 0x01050000,
++  LOONGI_FDIV_D = 0x01070000,
++  LOONGI_FSQRT_D = 0x01144800,
++
++  LOONGI_FMIN_D = 0x010b0000,
++  LOONGI_FMAX_D = 0x01090000,
++
++  LOONGI_FADD_S = 0x01008000,
++  LOONGI_FSUB_S = 0x01028000,
++
++  LOONGI_FMADD_S = 0x08100000,  /* Four-register forms (see emit_djka). */
++  LOONGI_FMADD_D = 0x08200000,
++  LOONGI_FNMADD_D = 0x08a00000,
++  LOONGI_FMSUB_S = 0x08500000,
++  LOONGI_FMSUB_D = 0x08600000,
++  LOONGI_FNMSUB_D = 0x08e00000,
++
++  LOONGI_FCVT_D_S = 0x01192400,  /* FP/integer conversions. */
++  LOONGI_FTINT_W_S = 0x011b0400,
++  LOONGI_FCVT_S_D = 0x01191800,
++  LOONGI_FTINT_W_D = 0x011b0800,
++  LOONGI_FFINT_S_W = 0x011d1000,
++  LOONGI_FFINT_D_W = 0x011d2000,
++  LOONGI_FFINT_S_L = 0x011d1800,
++  LOONGI_FFINT_D_L = 0x011d2800,
++
++  LOONGI_FTINTRZ_W_S = 0x011a8400,
++  LOONGI_FTINTRZ_W_D = 0x011a8800,
++  LOONGI_FTINTRZ_L_S = 0x011aa400,
++  LOONGI_FTINTRZ_L_D = 0x011aa800,
++  LOONGI_FTINTRM_W_S = 0x011a0400,
++  LOONGI_FTINTRM_W_D = 0x011a0800,
++
++  LOONGI_MOVFR2GR_S = 0x0114b400,  /* Moves between GPRs and FPRs. */
++  LOONGI_MOVGR2FR_W = 0x0114a400,
++  LOONGI_MOVGR2FR_D = 0x0114a800,
++  LOONGI_MOVFR2GR_D = 0x0114b800,
++
++  LOONGI_FCMP_CEQ_D = 0x0c220000,  /* FP compares and select. */
++  LOONGI_FCMP_CLT_S = 0x0c110000,
++  LOONGI_FCMP_CLT_D = 0x0c210000,
++  LOONGI_FCMP_CLE_D = 0x0c230000,
++  LOONGI_FCMP_CULE_D = 0x0c270000,
++  LOONGI_FCMP_CULT_D = 0x0c250000,
++  LOONGI_FCMP_CNE_D = 0x0c280000,
++  LOONGI_FSEL = 0x0d000000,
++} LOONGIns;
++
++#endif
++
+-- 
+2.20.1
+
diff --git a/loongarch64/0008-LoongArch64-Add-some-constant-definitions.patch b/loongarch64/0008-LoongArch64-Add-some-constant-definitions.patch
new file mode 100644
index 0000000..9b77c90
--- /dev/null
+++ b/loongarch64/0008-LoongArch64-Add-some-constant-definitions.patch
@@ -0,0 +1,92 @@
+From aa3fb5234af56598891181b9f2c49f8c92005475 Mon Sep 17 00:00:00 2001
+From: Xiaolin Zhao <zhaoxiaolin@loongson.cn>
+Date: Wed, 27 Jul 2022 16:21:18 +0800
+Subject: [PATCH 08/20] LoongArch64: Add some constant definitions
+
+---
+ src/lj_jit.h   | 15 ++++++++++-----
+ src/lj_trace.c |  6 +++---
+ 2 files changed, 13 insertions(+), 8 deletions(-)
+
+diff --git a/src/lj_jit.h b/src/lj_jit.h
+index 32b3861a..11af4080 100644
+--- a/src/lj_jit.h
++++ b/src/lj_jit.h
+@@ -67,6 +67,10 @@
+ #endif
+ #endif
+ 
++//#elif LJ_TARGET_LOONGARCH64
++//#define JIT_F_GS464V            (JIT_F_CPU << 0)
++//#define JIT_F_CPUSTRING         "\6GS464V"
++
+ #else
+ 
+ #define JIT_F_CPUSTRING		""
+@@ -363,7 +367,7 @@ enum {
+   LJ_K64_M2P64_31 = LJ_K64_M2P64,
+ #endif
+ #endif
+-#if LJ_TARGET_MIPS
++#if LJ_TARGET_MIPS || LJ_TARGET_LOONGARCH64
+   LJ_K64_2P31,		/* 2^31 */
+ #if LJ_64
+   LJ_K64_2P63,		/* 2^63 */
+@@ -372,7 +376,7 @@ enum {
+ #endif
+   LJ_K64__MAX,
+ };
+-#define LJ_K64__USED	(LJ_TARGET_X86ORX64 || LJ_TARGET_MIPS)
++#define LJ_K64__USED	(LJ_TARGET_X86ORX64 || LJ_TARGET_MIPS || LJ_TARGET_LOONGARCH64)
+ 
+ enum {
+ #if LJ_TARGET_X86ORX64
+@@ -382,16 +386,17 @@ enum {
+   LJ_K32_2P52_2P31,	/* 2^52 + 2^31 */
+   LJ_K32_2P52,		/* 2^52 */
+ #endif
+-#if LJ_TARGET_PPC || LJ_TARGET_MIPS
++#if LJ_TARGET_PPC || LJ_TARGET_MIPS || LJ_TARGET_LOONGARCH64
+   LJ_K32_2P31,		/* 2^31 */
+ #endif
+-#if LJ_TARGET_MIPS64
++#if LJ_TARGET_MIPS64 || LJ_TARGET_LOONGARCH64
+   LJ_K32_2P63,		/* 2^63 */
+   LJ_K32_M2P64,		/* -2^64 */
+ #endif
+   LJ_K32__MAX
+ };
+-#define LJ_K32__USED	(LJ_TARGET_X86ORX64 || LJ_TARGET_PPC || LJ_TARGET_MIPS)
++#define LJ_K32__USED \
++  (LJ_TARGET_X86ORX64 || LJ_TARGET_PPC || LJ_TARGET_MIPS || LJ_TARGET_LOONGARCH64)
+ 
+ /* Get 16 byte aligned pointer to SIMD constant. */
+ #define LJ_KSIMD(J, n) \
+diff --git a/src/lj_trace.c b/src/lj_trace.c
+index c2329394..be4deeb1 100644
+--- a/src/lj_trace.c
++++ b/src/lj_trace.c
+@@ -329,17 +329,17 @@ void lj_trace_initstate(global_State *g)
+   J->k64[LJ_K64_2P64].u64 = U64x(43f00000,00000000);
+   J->k32[LJ_K32_M2P64_31] = LJ_64 ? 0xdf800000 : 0xcf000000;
+ #endif
+-#if LJ_TARGET_X86ORX64 || LJ_TARGET_MIPS64
++#if LJ_TARGET_X86ORX64 || LJ_TARGET_MIPS64 || LJ_TARGET_LOONGARCH64
+   J->k64[LJ_K64_M2P64].u64 = U64x(c3f00000,00000000);
+ #endif
+ #if LJ_TARGET_PPC
+   J->k32[LJ_K32_2P52_2P31] = 0x59800004;
+   J->k32[LJ_K32_2P52] = 0x59800000;
+ #endif
+-#if LJ_TARGET_PPC || LJ_TARGET_MIPS
++#if LJ_TARGET_PPC || LJ_TARGET_MIPS || LJ_TARGET_LOONGARCH64
+   J->k32[LJ_K32_2P31] = 0x4f000000;
+ #endif
+-#if LJ_TARGET_MIPS
++#if LJ_TARGET_MIPS || LJ_TARGET_LOONGARCH64
+   J->k64[LJ_K64_2P31].u64 = U64x(41e00000,00000000);
+ #if LJ_64
+   J->k64[LJ_K64_2P63].u64 = U64x(43e00000,00000000);
+-- 
+2.20.1
+
diff --git a/loongarch64/0009-LoongArch64-Add-LoongArch-instruction-emitter.patch b/loongarch64/0009-LoongArch64-Add-LoongArch-instruction-emitter.patch
new file mode 100644
index 0000000..714a618
--- /dev/null
+++ b/loongarch64/0009-LoongArch64-Add-LoongArch-instruction-emitter.patch
@@ -0,0 +1,325 @@
+From 1ca6df894da0914d6364b2d275856a692a0dcdce Mon Sep 17 00:00:00 2001
+From: Xiaolin Zhao <zhaoxiaolin@loongson.cn>
+Date: Wed, 27 Jul 2022 16:28:39 +0800
+Subject: [PATCH 09/20] LoongArch64: Add LoongArch instruction emitter
+
+---
+ src/lj_emit_loongarch64.h | 306 ++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 306 insertions(+)
+ create mode 100644 src/lj_emit_loongarch64.h
+
+diff --git a/src/lj_emit_loongarch64.h b/src/lj_emit_loongarch64.h
+new file mode 100644
+index 00000000..74a293cc
+--- /dev/null
++++ b/src/lj_emit_loongarch64.h
+@@ -0,0 +1,306 @@
++/*
++** LoongArch instruction emitter.
++** Copyright (C) 2005-2022 Mike Pall. See Copyright Notice in luajit.h
++*/
++
++static intptr_t get_k64val(ASMState *as, IRRef ref)
++{
++  /* Fetch the value of a constant IR instruction as a pointer-sized integer. */
++  IRIns *ir = IR(ref);
++  switch (ir->o) {
++  case IR_KINT64: return (intptr_t)ir_kint64(ir)->u64;
++  case IR_KGC: return (intptr_t)ir_kgc(ir);
++  case IR_KPTR:
++  case IR_KKPTR: return (intptr_t)ir_kptr(ir);
++  default:  /* Only the 32 bit constants remain. */
++    lj_assertA(ir->o == IR_KINT || ir->o == IR_KNULL,
++               "bad 64 bit const IR op %d", ir->o);
++    return ir->i;  /* Sign-extended. */
++  }
++}
++
++#define get_kval(as, ref)       get_k64val(as, ref)
++
++/* -- Emit basic instructions --------------------------------------------- */
++
++static void emit_djk(ASMState *as, LOONGIns loongi, Reg rd, Reg rj, Reg rk)
++{  /* Three-register form: rd at bits 0-4, rj at bits 5-9, rk at bits 10-14. */
++  *--as->mcp = loongi | LOONGF_D(rd & 0x1f) | LOONGF_J(rj & 0x1f) | LOONGF_K(rk & 0x1f);
++}
++
++#define emit_dj(as, loongi, rd, rj)         emit_djk(as, loongi, rd, rj, 0)
++
++static void emit_di(ASMState *as, LOONGIns loongi, Reg rd, int32_t i)
++{  /* rd plus the low 20 bits of i, placed at bits 5-24. */
++  *--as->mcp = loongi | LOONGF_D(rd & 0x1f) | LOONGF_I20(i & 0xfffff);
++}
++
++static void emit_dji(ASMState *as, LOONGIns loongi, Reg rd, Reg rj, int32_t i)
++{  /* i is ORed in at bit 10 unmasked: pre-mask it, or it spills into the opcode bits. */
++  *--as->mcp = loongi | LOONGF_D(rd & 0x1f) | LOONGF_J(rj & 0x1f) | LOONGF_I(i);
++}
++
++static void emit_dju(ASMState *as, LOONGIns loongi, Reg rd, Reg rj, uint32_t u)
++{  /* Unsigned twin of emit_dji(): u is ORed in at bit 10 unmasked. */
++  *--as->mcp = loongi | LOONGF_D(rd & 0x1f) | LOONGF_J(rj & 0x1f) | LOONGF_I(u);
++}
++
++#define checki12(x)	LOONGF_S_OK(x, 12)
++#define checku12(x)	((x) == ((x) & 0xfff))
++
++static Reg ra_allock(ASMState *as, intptr_t k, RegSet allow);
++static void ra_allockreg(ASMState *as, intptr_t k, Reg r);
++static Reg ra_scratch(ASMState *as, RegSet allow);
++
++static void emit_dj32i(ASMState *as, Reg rd, Reg rj, int32_t i)
++{  /* rd = rj + i. Code is emitted backwards, so the long form runs bottom-up. */
++  if (checki12(i)) {  /* Fits the signed 12 bit immediate of addi.d. */
++    *--as->mcp = LOONGI_ADDI_D | LOONGF_D(rd) | LOONGF_J(rj) | LOONGF_I(i&0xfff);
++  } else {  /* Build i in RID_R20 (clobbered), then add it to rj. */
++    emit_djk(as, LOONGI_ADD_D, rd, RID_R20, rj);
++    emit_dju(as, LOONGI_ORI, RID_R20, RID_R20, i&0xfff);  /* Low 12 bits. */
++    emit_di(as, LOONGI_LU12I_W, RID_R20, (i>>12)&0xfffff);  /* Bits 12-31. */
++  }
++}
++
++static void emit_d16i(ASMState *as, Reg rd, int32_t i)
++{  /* Runs as addu16i.d then srai.d by 16: rd = low 16 bits of i, sign-extended. */
++  emit_dji(as, LOONGI_SRAI_D, rd, rd, 16);
++  emit_dji(as, LOONGI_ADDU16I_D, rd, RID_ZERO, (i&0xffff));  /* Upper bits of i are dropped. */
++}
++
++static void emit_djml(ASMState *as, LOONGIns loongi, Reg rd, Reg rj, uint32_t m, uint32_t l)
++{  /* Bit-string form: 6 bit l at bits 10-15, 6 bit m at bits 16-21. */
++  *--as->mcp = loongi | LOONGF_D(rd & 0x1f) | LOONGF_J(rj & 0x1f) | LOONGF_I(l & 0x3f) | LOONGF_M(m & 0x3f);
++}
++
++static void emit_djka(ASMState *as, LOONGIns loongi, Reg rd, Reg rj, Reg rk, Reg ra)
++{  /* Four-register form: like emit_djk() plus ra at bits 15-19. */
++  *--as->mcp = loongi | LOONGF_D(rd & 0x1f) | LOONGF_J(rj & 0x1f) | LOONGF_K(rk & 0x1f) | LOONGF_A(ra & 0x1f);
++}
++
++static void emit_b_bl(ASMState *as, LOONGIns loongi, uint32_t i)
++{  /* 26 bit offset is split: bits 0-15 of i go to bits 10-25, bits 16-25 go to bits 0-9. */
++  *--as->mcp = loongi | LOONGF_I(i & 0xffff) | ((i >> 16) & 0x3ff);
++}
++
++
++/* -- Emit loads/stores --------------------------------------------------- */
++
++/* Prefer rematerialization of BASE/L from global_State over spills. */
++#define emit_canremat(ref)	((ref) <= REF_BASE)
++
++
++/* Load a 32 bit constant into a GPR (values outside 12 bits clobber RID_R20). */
++static void emit_loadi(ASMState *as, Reg r, int32_t i)
++{
++  emit_dj32i(as, r, RID_ZERO, i);  /* r = zero + i. */
++}
++
++/* Load a 64 bit constant into a GPR. */
++static void emit_loadu64(ASMState *as, Reg r, uint64_t u64)
++{
++  if (checki32((int64_t)u64)) {  /* Short path via emit_dj32i(). */
++    emit_dj32i(as, r, RID_ZERO, (int32_t)u64);
++  } else {  /* Emitted backwards; runs as lu12i.w, ori, lu32i.d, lu52i.d. */
++      *--as->mcp = LOONGI_LU52I_D | LOONGF_D(r) | LOONGF_J(r) | LOONGF_I((u64>>52)&0xfff);
++      *--as->mcp = LOONGI_LU32I_D | LOONGF_D(r) | LOONGF_I20((u64>>32)&0xfffff);
++      *--as->mcp = LOONGI_ORI | LOONGF_D(r) | LOONGF_J(r) | LOONGF_I(u64&0xfff);
++      *--as->mcp = LOONGI_LU12I_W | LOONGF_D(r) | LOONGF_I20((u64>>12)&0xfffff);
++  }
++}
++
++#define emit_loada(as, r, addr)         emit_loadu64(as, (r), u64ptr((addr)))
++
++/* Get/set from constant pointer. Offsets beyond 12 bits go through RID_R20. */
++static void emit_lsptr(ASMState *as, LOONGIns loongi, Reg r, void *p, RegSet allow)
++{
++  intptr_t jgl = (intptr_t)(J2G(as->J));
++  intptr_t i = (intptr_t)(p);
++  Reg base;
++  if ((uintptr_t)(i-jgl) < 65536) {  /* Full-width compare: no aliasing 4GB apart. */
++    i = i-jgl-32768;
++    base = RID_JGL;
++  } else {
++    base = ra_allock(as, i-(int16_t)i, allow);
++  }
++  if (checki12(i)) {
++    emit_dji(as, loongi, r, base, i&0xfff);
++  }
++  else {
++    /* ld.d->ldx.d, fld.d->fldx.d, fld.s->fldx.s */
++    if (loongi == LOONGI_LD_D)
++      loongi = LOONGI_LDX_D;
++    else if (loongi == LOONGI_FLD_D)
++      loongi = LOONGI_FLDX_D;
++    else if (loongi == LOONGI_FLD_S)
++      loongi = LOONGI_FLDX_S;
++    else lj_assertA(0, "no indexed form for load/store op 0x%x", loongi);
++    emit_djk(as, loongi, r, base, RID_R20);
++    /* Emitted last, so it runs first: RID_R20 = low 16 bits of i, sign-extended. */
++    emit_d16i(as, RID_R20, i);
++  }
++}
++
++/* Load 64 bit IR constant into register. */
++static void emit_loadk64(ASMState *as, Reg r, IRIns *ir)
++{
++  const uint64_t *k = &ir_k64(ir)->u64;
++  Reg r64 = r;
++  if (rset_test(RSET_FPR, r)) {
++    r64 = RID_TMP;  /* FPR target: load into RID_TMP, then copy GPR -> FPR. */
++    emit_dj(as, LOONGI_MOVGR2FR_D, r, r64);
++  }
++  if ((uintptr_t)((intptr_t)k-(intptr_t)J2G(as->J)) < 65536)  /* Full-width compare. */
++    emit_lsptr(as, LOONGI_LD_D, r64, (void *)k, 0);  /* RID_JGL-relative load. */
++  else
++    emit_loadu64(as, r64, *k);
++}
++
++/* Get/set global_State fields. Always uses the indexed form; clobbers RID_R20. */
++static void emit_lsglptr2(ASMState *as, LOONGIns loongi, Reg r, int32_t ofs)
++{  /* loongi is used with a register index, so callers pass an ldx/stx opcode. */
++  emit_djk(as, loongi, r, RID_JGL, RID_R20);
++  emit_loadi(as, RID_R20, (ofs-32768));  /* Same -32768 bias as emit_lsptr() uses for RID_JGL. */
++}
++
++#define emit_getgl(as, r, field) \
++  emit_lsglptr2(as, LOONGI_LDX_D, (r), (int32_t)offsetof(global_State, field))
++#define emit_setgl(as, r, field) \
++  emit_lsglptr2(as, LOONGI_STX_D, (r), (int32_t)offsetof(global_State, field))
++
++/* Trace number is determined from per-trace exit stubs. */
++#define emit_setvmstate(as, i)		UNUSED(i)
++
++/* -- Emit control-flow instructions -------------------------------------- */
++
++/* Label for internal jumps. */
++typedef MCode *MCLabel;
++
++/* Return label pointing to current PC. */
++#define emit_label(as)		((as)->mcp)
++
++static void emit_branch(ASMState *as, LOONGIns loongi, Reg rj, Reg rd, MCode *target)
++{
++  MCode *p = as->mcp;
++  ptrdiff_t delta = target - (p - 1);  /* In MCode words, from the slot the branch is written to. */
++  lj_assertA(((delta + 0x8000) >> 16) == 0, "branch target out of range");
++  /* Two-register compare-and-branch (BEQ BNE BLT BGE ...): 16 bit offset at bits 10-25. */
++  *--p = loongi | LOONGF_D(rd) | LOONGF_J(rj) | LOONGF_I(((uint32_t)delta & 0xffffu));
++  as->mcp = p;
++}
++
++static void emit_branch21(ASMState *as, LOONGIns loongi, Reg rj, MCode *target)
++{  /* One-operand branch with a signed 21 bit offset in MCode words. */
++  MCode *p = as->mcp;
++  ptrdiff_t delta = target - (p - 1);  /* Relative to the slot the branch is written to. */
++  lj_assertA(((delta + 0x100000) >> 21) == 0, "branch target out of range");
++  *--p = loongi | LOONGF_J(rj) | LOONGF_I(((uint32_t)delta & 0xffffu))
++         | (((uint32_t)delta & 0x1f0000u)>>16);  /* Offset bits 16-20 go to bits 0-4. BCEQZ BCNEZ. */
++  as->mcp = p;
++}
++
++static void emit_jmp(ASMState *as, MCode *target)
++{  /* Unconditional B with a signed 26 bit offset in MCode words. */
++  ptrdiff_t delta = target - (as->mcp - 1);  /* From the slot the jump is written to. */
++  lj_assertA(LOONGF_S_OK(delta, 26), "branch target out of range");
++  emit_b_bl(as, LOONGI_B, (delta&0x3ffffff));	/*offs 26*/
++}
++
++#define emit_move(as, dst, src) \
++  emit_djk(as, LOONGI_OR, (dst), (src), RID_ZERO)
++
++static void emit_call(ASMState *as, void *target)
++{  /* Direct BL when target is within a signed 26 bit word offset, else JIRL via a register. */
++  MCode *p = --as->mcp;  /* Reserve the call slot before anything else is emitted. */
++  ptrdiff_t delta = (char *)target - (char *)p;  /* Byte distance; >>2 gives words. */
++  if (LOONGF_S_OK(delta>>2, 26)) {
++    *p = LOONGI_BL | LOONGF_I((delta>>2) & 0xffff) | (((delta>>2) >> 16) & 0x3ff);
++  } else {  /* Target out of range: need indirect call. */
++    Reg r = ra_allock(as, (intptr_t)target, RSET_RANGE(RID_R12, RID_R19+1));
++    *p = LOONGI_JIRL | LOONGF_D(RID_RA) | LOONGF_J(r) | LOONGF_I(0);
++  }
++}
++
++/* -- Emit generic operations --------------------------------------------- */
++
++/* Generic move between two regs. The IR type picks the 32 or 64 bit FP form. */
++static void emit_movrr(ASMState *as, IRIns *ir, Reg dst, Reg src)
++{
++  if (dst < RID_MAX_GPR && src >= RID_MIN_FPR)  /* FPR -> GPR. */
++    emit_dj(as, irt_isnum(ir->t) ? LOONGI_MOVFR2GR_D : LOONGI_MOVFR2GR_S, dst, src);
++  else if (dst < RID_MAX_GPR)  /* GPR -> GPR. */
++    emit_move(as, dst, src);
++  else  /* dst is not a GPR: FP move. NOTE(review): a GPR src is not special-cased here. */
++    emit_dj(as, irt_isnum(ir->t) ? LOONGI_FMOV_D : LOONGI_FMOV_S, dst, src);
++}
++
++/* Emit dest = src + i (64 bit add) with a constant operand. */
++static void emit_addk(ASMState *as, Reg dest, Reg src, int32_t i, RegSet allow)
++{
++  if (checki12(i)) {  /* Fits the signed 12 bit immediate. */
++    emit_dji(as, LOONGI_ADDI_D, dest, src, i&0xfff);
++  } else {  /* Otherwise the constant is held in a register picked from allow. */
++    Reg src2 = ra_allock(as, i, allow);
++    emit_djk(as, LOONGI_ADD_D, dest, src, src2);
++  }
++}
++
++static void emit_lso(ASMState *as, LOONGIns loongi, Reg dest, Reg src, int64_t i, RegSet allow)
++{  /* Load/store dest at src+i. Offsets beyond 12 bits use the indexed opcode form. */
++  if (checki12(i)) {
++    emit_dji(as, loongi, dest, src, i&0xfff);
++  } else {
++    LOONGIns loongk = LOONGI_NOP;
++    switch (loongi) {
++      case LOONGI_LD_D: loongk = LOONGI_LDX_D; break;
++      case LOONGI_LD_W: loongk = LOONGI_LDX_W; break;
++      case LOONGI_ST_D: loongk = LOONGI_STX_D; break;
++      case LOONGI_FLD_D: loongk = LOONGI_FLDX_D; break;
++      case LOONGI_FST_D: loongk = LOONGI_FSTX_D; break;
++      case LOONGI_LD_B: loongk = LOONGI_LDX_B; break;
++      case LOONGI_LD_BU: loongk = LOONGI_LDX_BU; break;
++      case LOONGI_LD_H: loongk = LOONGI_LDX_H; break;
++      case LOONGI_LD_HU: loongk = LOONGI_LDX_HU; break;
++      case LOONGI_FLD_S: loongk = LOONGI_FLDX_S; break;
++      default: lj_assertA(0, "no indexed form for load/store op 0x%x", loongi); break;
++    }
++    lj_assertA(checki16(i), "load/store offset %d out of range", (int)i);
++    Reg src2 = ra_scratch(as, allow);
++    emit_djk(as, loongk, dest, src, src2);
++    emit_d16i(as, src2, i);  /* Runs first: src2 = low 16 bits of i, sign-extended. */
++  }
++}
++
++/* Generic load of register with base and (small) offset address. Clobbers RID_R20. */
++static void emit_loadofs(ASMState *as, IRIns *ir, Reg r, Reg base, int32_t ofs)
++{
++  if (r < RID_MAX_GPR) {  /* GPR: width follows the IR type. */
++    emit_djk(as, irt_is64(ir->t) ? LOONGI_LDX_D : LOONGI_LDX_W, r, base, RID_R20);
++  } else {  /* FPR. */
++    emit_djk(as, irt_isnum(ir->t) ? LOONGI_FLDX_D : LOONGI_FLDX_S, r, base, RID_R20);
++  }
++  emit_d16i(as, RID_R20, ofs);  /* Runs first; only the low 16 bits of ofs are used. */
++}
++
++/* Generic store of register with base and (small) offset address. Clobbers RID_R20. */
++static void emit_storeofs(ASMState *as, IRIns *ir, Reg r, Reg base, int32_t ofs)
++{
++  if (r < RID_MAX_GPR) {  /* GPR: width follows the IR type. */
++    emit_djk(as, irt_is64(ir->t) ? LOONGI_STX_D : LOONGI_STX_W, r, base, RID_R20);
++  } else {  /* FPR. */
++    emit_djk(as, irt_isnum(ir->t) ? LOONGI_FSTX_D : LOONGI_FSTX_S, (r&31), base, RID_R20);
++  }
++  emit_d16i(as, RID_R20, ofs);  /* Runs first; only the low 16 bits of ofs are used. */
++}
++
++/* Bump pointer register r by ofs; a zero offset emits no code. */
++static void emit_addptr(ASMState *as, Reg r, int32_t ofs)
++{
++  if (ofs == 0)
++    return;  /* Nothing to emit. */
++  emit_addk(as, r, r, ofs, rset_exclude(RSET_GPR, r));
++}
++
++
++#define emit_spsub(as, ofs)	emit_addptr(as, RID_SP, -(ofs))
+-- 
+2.20.1
+
diff --git a/loongarch64/0010-LoongArch64-Add-IR-assembler-support.patch b/loongarch64/0010-LoongArch64-Add-IR-assembler-support.patch
new file mode 100644
index 0000000..b19cf49
--- /dev/null
+++ b/loongarch64/0010-LoongArch64-Add-IR-assembler-support.patch
@@ -0,0 +1,2161 @@
+From 1de0bd4187710c93ab6c44084ae24003e5086697 Mon Sep 17 00:00:00 2001
+From: Xiaolin Zhao <zhaoxiaolin@loongson.cn>
+Date: Wed, 27 Jul 2022 17:31:57 +0800
+Subject: [PATCH 10/20] LoongArch64: Add IR assembler support
+
+---
+ src/lj_asm.c             |    4 +
+ src/lj_asm_loongarch64.h | 1990 ++++++++++++++++++++++++++++++++++++++
+ src/vm_loongarch64.dasc  |  117 +++
+ 3 files changed, 2111 insertions(+)
+ create mode 100644 src/lj_asm_loongarch64.h
+
+diff --git a/src/lj_asm.c b/src/lj_asm.c
+index 6f5e0c45..be30e21c 100644
+--- a/src/lj_asm.c
++++ b/src/lj_asm.c
+@@ -185,6 +185,8 @@ IRFLDEF(FLOFS)
+ #include "lj_emit_ppc.h"
+ #elif LJ_TARGET_MIPS
+ #include "lj_emit_mips.h"
++#elif LJ_TARGET_LOONGARCH64
++#include "lj_emit_loongarch64.h"
+ #else
+ #error "Missing instruction emitter for target CPU"
+ #endif
+@@ -1662,6 +1664,8 @@ static void asm_loop(ASMState *as)
+ #include "lj_asm_ppc.h"
+ #elif LJ_TARGET_MIPS
+ #include "lj_asm_mips.h"
++#elif LJ_TARGET_LOONGARCH64
++#include "lj_asm_loongarch64.h"
+ #else
+ #error "Missing assembler for target CPU"
+ #endif
+diff --git a/src/lj_asm_loongarch64.h b/src/lj_asm_loongarch64.h
+new file mode 100644
+index 00000000..932f0f2c
+--- /dev/null
++++ b/src/lj_asm_loongarch64.h
+@@ -0,0 +1,1990 @@
++/*
++** LoongArch IR assembler (SSA IR -> machine code).
++** Copyright (C) 2005-2022 Mike Pall. See Copyright Notice in luajit.h
++*/
++
++/* -- Register allocator extensions --------------------------------------- */
++
++/* Allocate a register for ref, preferring hint if ref has no register yet. */
++static Reg ra_hintalloc(ASMState *as, IRRef ref, Reg hint, RegSet allow)
++{
++  Reg r = IR(ref)->r;
++  if (ra_noreg(r)) {  /* Not allocated yet. */
++    if (!ra_hashint(r) && !iscrossref(as, ref))
++      ra_sethint(IR(ref)->r, hint);  /* Propagate register hint. */
++    r = ra_allocref(as, ref, allow);
++  }
++  ra_noweak(as, r);
++  return r;
++}
++
++/* Allocate two source registers for three-operand instructions. */
++static Reg ra_alloc2(ASMState *as, IRIns *ir, RegSet allow)
++{  /* Returns both registers packed as left | (right << 8). */
++  IRIns *irl = IR(ir->op1), *irr = IR(ir->op2);
++  Reg left = irl->r, right = irr->r;
++  if (ra_hasreg(left)) {  /* Left is already in a register. */
++    ra_noweak(as, left);
++    if (ra_noreg(right))
++      right = ra_allocref(as, ir->op2, rset_exclude(allow, left));
++    else
++      ra_noweak(as, right);
++  } else if (ra_hasreg(right)) {  /* Only right is already in a register. */
++    ra_noweak(as, right);
++    left = ra_allocref(as, ir->op1, rset_exclude(allow, right));
++  } else if (ra_hashint(right)) {  /* Neither allocated: the hinted operand goes first. */
++    right = ra_allocref(as, ir->op2, allow);
++    left = ra_alloc1(as, ir->op1, rset_exclude(allow, right));
++  } else {
++    left = ra_allocref(as, ir->op1, allow);
++    right = ra_alloc1(as, ir->op2, rset_exclude(allow, left));
++  }
++  return left | (right << 8);
++}
++
++/* -- Guard handling ------------------------------------------------------ */
++
++/* Setup exit stub after the end of each trace. The stub is written downwards from mctop. */
++static void asm_exitstub_setup(ASMState *as)
++{
++  MCode *mxp = as->mctop;
++  if (as->mcp == mxp)
++    --as->mcp;
++  /* Stub runs as: st.w TMP,sp,0; r20 = &lj_vm_exit_handler (4 insns); li TMP,traceno; jirl r20. */
++  *--mxp = LOONGI_JIRL | RID_R0 | LOONGF_J(RID_R20) | 0<<10;
++  emit_dj32i(as, RID_TMP, RID_ZERO, as->T->traceno);  /* Emitted into the trace at as->mcp ... */
++  *--mxp = *as->mcp;  /* ... then one word is copied. NOTE(review): traceno >= 2048 emits 3 words using RID_R20 -- confirm. */
++  *--mxp = LOONGI_LU52I_D | RID_R20 | LOONGF_J(RID_R20)
++            | LOONGF_I((((uintptr_t)(void *)lj_vm_exit_handler)>>52)&0xfff);
++  *--mxp = LOONGI_LU32I_D | RID_R20
++            | LOONGF_I20((((uintptr_t)(void *)lj_vm_exit_handler)>>32)&0xfffff);
++  *--mxp = LOONGI_ORI | RID_R20 | LOONGF_J(RID_R20)
++            | LOONGF_I(((uintptr_t)(void *)lj_vm_exit_handler)&0xfff);
++  *--mxp = LOONGI_LU12I_W | RID_R20
++            | LOONGF_I20((((uintptr_t)(void *)lj_vm_exit_handler)&0xfffff000)>>12);
++  *--mxp = LOONGI_ST_W | LOONGF_D(RID_TMP) | LOONGF_J(RID_SP);
++  as->mctop = mxp;  /* The stub now starts at mctop. */
++}
++
++/* Keep this in-sync with exitstub_trace_addr(). */
++#define asm_exitstub_addr(as)	((as)->mctop)
++
++/* Emit conditional branch to exit for guard. Runs as: addi.d TMP,zero,snapno; branch. */
++static void asm_guard(ASMState *as, LOONGIns loongi, Reg rj, Reg rd)
++{
++  MCode *target = asm_exitstub_addr(as);
++  MCode *p = as->mcp;
++  if (LJ_UNLIKELY(p == as->invmcp)) {  /* This guard becomes the inverted loop branch. */
++    as->invmcp = NULL;
++    as->loopinv = 1;
++    as->mcp = p;
++    loongi = loongi ^ ((loongi>>28) == 4 ? 0x00000100u : 0x04000000u);  /* Invert cond: bit 8 for BCEQZ/BCNEZ, bit 26 for BEQ/BNE, BLT/BGE, BLTU/BGEU. */
++    target = p - 1;  /* Patch target later in asm_loop_fixup. */
++  }
++    emit_branch(as, loongi, rj, rd, target);
++    emit_dji(as, LOONGI_ADDI_D, RID_TMP, RID_ZERO, as->snapno);  /* snapno is ORed in unmasked. */
++}
++
++static void asm_guard21(ASMState *as, LOONGIns loongi, Reg rj)
++{  /* Like asm_guard(), for one-operand branches emitted via emit_branch21(). */
++  MCode *target = asm_exitstub_addr(as);
++  MCode *p = as->mcp;
++  if (LJ_UNLIKELY(p == as->invmcp)) {  /* This guard becomes the inverted loop branch. */
++    as->invmcp = NULL;
++    as->loopinv = 1;
++    as->mcp = p;
++    loongi = loongi ^ ((loongi>>28) == 4 ? 0x00000100u : 0x04000000u);  /* Invert cond. BCEQZ BCNEZ*/
++    target = p - 1;  /* Patch target later in asm_loop_fixup. */
++  }
++    emit_branch21(as, loongi, rj, target);
++    emit_dji(as, LOONGI_ADDI_D, RID_TMP, RID_ZERO, as->snapno);  /* Runs before the branch. */
++}
++
++/* -- Operand fusion ------------------------------------------------------ */
++
++/* Limit linear search to this distance. Avoids O(n^2) behavior. */
++#define CONFLICT_SEARCH_LIM	31
++
++/* Scan the IR strictly between ref and curins for a conflicting opcode. */
++static int noconflict(ASMState *as, IRRef ref, IROp conflict)
++{
++  IRRef cur = as->curins;
++  if (cur > ref + CONFLICT_SEARCH_LIM)
++    return 0;  /* Give up, ref is too far away. */
++  for (cur--; cur > ref; cur--) {
++    if (as->ir[cur].o == conflict)
++      return 0;  /* Conflict found. */
++  }
++  return 1;  /* Ok, no conflict. */
++}
++
++/* Fuse the array base of colocated arrays. Returns sizeof(GCtab) if fusable, else 0. */
++static int32_t asm_fuseabase(ASMState *as, IRRef ref)
++{
++  IRIns *ir = IR(ref);
++  if (ir->o == IR_TNEW && ir->op1 <= LJ_MAX_COLOSIZE &&
++      !neverfuse(as) && noconflict(as, ref, IR_NEWREF))  /* No NEWREF in between. */
++    return (int32_t)sizeof(GCtab);
++  return 0;
++}
++
++/* Fuse array/hash/upvalue reference into register+offset operand. */
++static Reg asm_fuseahuref(ASMState *as, IRRef ref, int32_t *ofsp, RegSet allow)
++{  /* Returns the base register and stores the offset in *ofsp (0 if nothing was fused). */
++  IRIns *ir = IR(ref);
++  if (ra_noreg(ir->r)) {
++    if (ir->o == IR_AREF) {
++      if (mayfuse(as, ref)) {
++	if (irref_isk(ir->op2)) {  /* Constant index. */
++	  IRRef tab = IR(ir->op1)->op1;
++	  int32_t ofs = asm_fuseabase(as, tab);
++	  IRRef refa = ofs ? tab : ir->op1;
++	  ofs += 8*IR(ir->op2)->i;  /* Index scaled by 8. */
++	  if (checki16(ofs)) {
++	    *ofsp = ofs;
++	    return ra_alloc1(as, refa, allow);
++	  }
++	}
++      }
++    } else if (ir->o == IR_HREFK) {
++      if (mayfuse(as, ref)) {
++	int32_t ofs = (int32_t)(IR(ir->op2)->op2 * sizeof(Node));
++	if (checki16(ofs)) {
++	  *ofsp = ofs;
++	  return ra_alloc1(as, ir->op1, allow);
++	}
++      }
++    } else if (ir->o == IR_UREFC) {
++      if (irref_isk(ir->op1)) {
++	GCfunc *fn = ir_kfunc(IR(ir->op1));
++	intptr_t ofs = (intptr_t)&gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv.tv;
++	intptr_t jgl = (intptr_t)J2G(as->J);
++	if ((uintptr_t)(ofs-jgl) < 65536) {  /* Within 64K of global_State. */
++	  *ofsp = ofs-jgl-32768;  /* Same -32768 bias as emit_lsptr(). */
++	  return RID_JGL;
++	} else {  /* Split into a constant base register and a signed 16 bit offset. */
++	  *ofsp = (int16_t)ofs;
++	  return ra_allock(as, ofs-(int16_t)ofs, allow);
++	}
++      }
++    } else if (ir->o == IR_TMPREF) {
++      *ofsp = (int32_t)(offsetof(global_State, tmptv)-32768);
++      return RID_JGL;
++    }
++  }
++  *ofsp = 0;  /* No fusion: use the reference itself as base. */
++  return ra_alloc1(as, ref, allow);
++}
++
++/* Fuse XLOAD/XSTORE reference into load/store operand and emit loongi for rd. */
++static void asm_fusexref(ASMState *as, LOONGIns loongi, Reg rd, IRRef ref,
++			 RegSet allow, int32_t ofs)
++{
++  IRIns *ir = IR(ref);
++  Reg base;
++  if (ra_noreg(ir->r) && canfuse(as, ir)) {
++    intptr_t ofs2;
++    if (ir->o == IR_ADD) {  /* Fold a constant addend if the sum fits 12 bits. */
++      if (irref_isk(ir->op2) && (ofs2 = ofs + get_kval(as, ir->op2),
++				 checki12(ofs2))) {
++	ref = ir->op1;
++	ofs = (int32_t)ofs2;
++      }
++    } else if (ir->o == IR_STRREF) {
++      ofs2 = 4096;  /* Sentinel that fails checki12() when neither operand is constant. */
++      lj_assertA(ofs == 0, "bad usage");
++      ofs = (int32_t)sizeof(GCstr);
++      if (irref_isk(ir->op2)) {
++	ofs2 = ofs + get_kval(as, ir->op2);
++	ref = ir->op1;
++      } else if (irref_isk(ir->op1)) {
++	ofs2 = ofs + get_kval(as, ir->op1);
++	ref = ir->op2;
++      }
++      if (!checki12(ofs2)) {
++        /* NYI: Fuse ADD with constant. Runs as: add.d TMP,left,right; then the load/store. */
++        Reg right, left = ra_alloc2(as, ir, allow);
++        right = (left >> 8); left &= 255;
++        emit_dji(as, loongi, rd, RID_TMP, ofs&0xfff);
++        emit_djk(as, LOONGI_ADD_D, RID_TMP, left, right);
++        return;
++      }
++      ofs = ofs2;
++    }
++  }
++  base = ra_alloc1(as, ref, allow);
++  emit_dji(as, loongi, rd, base, ofs&0xfff);  /* Only the low 12 bits of ofs are encoded. */
++}
++
++/* Fuse FP multiply-add/sub. */
++/* Returns 1 if a fused instruction was emitted, 0 if the caller must emit the op itself. */
++static int asm_fusemadd(ASMState *as, IRIns *ir, LOONGIns loongi, LOONGIns loongir)
++{
++  IRRef lref = ir->op1, rref = ir->op2;
++  IRIns *irm;
++  if (lref != rref &&
++      ((mayfuse(as, lref) && (irm = IR(lref), irm->o == IR_MUL) &&
++       ra_noreg(irm->r)) ||
++       (mayfuse(as, rref) && (irm = IR(rref), irm->o == IR_MUL) &&
++       (rref = lref, loongi = loongir, ra_noreg(irm->r))))) {  /* MUL on the right: use loongir. */
++    Reg dest = ra_dest(as, ir, RSET_FPR);
++    Reg add = ra_hintalloc(as, rref, dest, RSET_FPR);  /* The non-MUL operand. */
++    Reg left = ra_alloc2(as, irm, rset_exclude(rset_exclude(RSET_FPR, dest), add));
++    Reg right = (left >> 8); left &= 255;
++    emit_djka(as, loongi, (dest & 0x1f), (left & 0x1f), (right & 0x1f), (add & 0x1f));
++    return 1;
++  }
++  return 0;
++}
++/* -- Calls --------------------------------------------------------------- */
++
++/* Generate a call to a C function. Code is emitted backwards: the call first, then arg setup. */
++static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args)
++{
++  uint32_t n, nargs = CCI_XNARGS(ci);
++  int32_t ofs = 0;  /* Stack offset of the next spilled argument. */
++  Reg gpr, fpr = REGARG_FIRSTFPR;
++  if ((void *)ci->func)  /* A NULL func means the caller already emitted the call. */
++    emit_call(as, (void *)ci->func);
++  for (gpr = REGARG_FIRSTGPR; gpr <= REGARG_LASTGPR; gpr++)
++    as->cost[gpr] = REGCOST(~0u, ASMREF_L);
++  gpr = REGARG_FIRSTGPR;
++  for (n = 0; n < nargs; n++) { /* Setup args. */
++    IRRef ref = args[n];
++    if (ref) {
++      IRIns *ir = IR(ref);
++      if (irt_isfp(ir->t) && (n == 0 || !(ci->flags & CCI_VARARG))) {
++        if (fpr <= REGARG_LASTFPR) {  /* FP arg in the next FP arg register. */
++	  lj_assertA(rset_test(as->freeset, fpr),
++	             "reg %d not free", fpr);  /* Must have been evicted. */
++          ra_leftov(as, fpr, ref);
++	  fpr++;
++	} else if (gpr <= REGARG_LASTGPR) {  /* FP arg regs exhausted: use a GPR. */
++	  lj_assertA(rset_test(as->freeset, gpr),
++	             "reg %d not free", gpr);  /* Must have been evicted. */
++          ra_leftov(as, gpr, ref);
++	  gpr++;
++	} else {  /* No registers left: pass on the stack. */
++	  Reg r = ra_alloc1(as, ref, RSET_FPR);
++	  emit_spstore(as, ir, r, ofs);
++	  ofs += 8;
++	}
++      } else {  /* Non-FP arg, or FP arg of a vararg call after the first arg. */
++        if (gpr <= REGARG_LASTGPR) {
++	  lj_assertA(rset_test(as->freeset, gpr),
++	             "reg %d not free", gpr);  /* Must have been evicted. */
++          ra_leftov(as, gpr, ref);
++	  gpr++;
++	} else {  /* No GPRs left: pass on the stack. */
++	  Reg r = ra_alloc1(as, ref, RSET_GPR);
++	  emit_spstore(as, ir, r, ofs);
++	  ofs += 8;
++	}
++      }
++    }
++  }
++}
++
++/* Setup result reg/sp for call. Evict scratch regs. */
++static void asm_setupresult(ASMState *as, IRIns *ir, const CCallInfo *ci)
++{
++  RegSet drop = RSET_SCRATCH;
++  int hiop = ((ir+1)->o == IR_HIOP && !irt_isnil((ir+1)->t));  /* Result continues in a following HIOP. */
++  if (ra_hasreg(ir->r))
++    rset_clear(drop, ir->r);  /* Dest reg handled below. */
++  if (hiop && ra_hasreg((ir+1)->r))
++    rset_clear(drop, (ir+1)->r);  /* Dest reg handled below. */
++  ra_evictset(as, drop);  /* Evictions must be performed first. */
++  if (ra_used(ir)) {
++    lj_assertA(!irt_ispri(ir->t), "PRI dest");
++    if (irt_isfp(ir->t)) {
++      if ((ci->flags & CCI_CASTU64)) {  /* FP result arrives in RID_RET: move GPR -> FPR. */
++        Reg dest = ra_dest(as, ir, RSET_FPR);
++	emit_dj(as, irt_isnum(ir->t) ? LOONGI_MOVGR2FR_D : LOONGI_MOVGR2FR_W,
++	        dest, RID_RET);
++      } else {
++	ra_destreg(as, ir, RID_FPRET);
++      }
++    } else if (hiop) {
++      ra_destpair(as, ir);
++    } else {
++      ra_destreg(as, ir, RID_RET);
++    }
++  }
++}
++
++static void asm_callx(ASMState *as, IRIns *ir)
++{  /* Call whose target is given by ir->op2, either a constant address or a register. */
++  IRRef args[CCI_NARGS_MAX*2];
++  CCallInfo ci;
++  IRRef func;
++  IRIns *irf;
++  ci.flags = asm_callx_flags(as, ir);
++  asm_collectargs(as, ir, &ci, args);
++  asm_setupresult(as, ir, &ci);
++  func = ir->op2; irf = IR(func);
++  if (irf->o == IR_CARG) { func = irf->op1; irf = IR(func); }
++  if (irref_isk(func)) {  /* Call to constant address. */
++    ci.func = (ASMFunction)(void *)get_kval(as, func);
++  } else {  /* Need specific register for indirect calls. */
++    Reg freg = ra_alloc1(as, func, RSET_RANGE(RID_R12, RID_MAX_GPR)-RSET_FIXED);
++    *--as->mcp = LOONGI_JIRL | LOONGF_D(RID_RA) | LOONGF_J(freg);
++    ci.func = (ASMFunction)(void *)0;  /* Tells asm_gencall() not to emit another call. */
++  }
++  asm_gencall(as, &ci, args);
++}
++
++static void asm_callround(ASMState *as, IRIns *ir, IRCallID id)
++{  /* Call the rounding helper for id: arg in REGARG_FIRSTFPR, result in RID_FPRET. */
++  /* The modified regs must match with the *.dasc implementation. */
++  RegSet drop = RID2RSET(RID_R12)|RID2RSET(RID_R13)|RID2RSET(RID_F0)|
++                RID2RSET(RID_F4)|RID2RSET(RID_F9)|RID2RSET(RID_F22)
++                |RID2RSET(RID_F23);
++  if (ra_hasreg(ir->r)) rset_clear(drop, ir->r);  /* Keep the destination register. */
++  ra_evictset(as, drop);
++  ra_destreg(as, ir, RID_FPRET);
++  emit_call(as, (void *)lj_ir_callinfo[id].func);
++  ra_leftov(as, REGARG_FIRSTFPR, ir->op1);
++}
++
++/* -- Returns ------------------------------------------------------------- */
++
++/* Return to lower frame. Guard that it goes to the right spot. */
++static void asm_retf(ASMState *as, IRIns *ir)
++{
++  Reg base = ra_alloc1(as, REF_BASE, RSET_GPR);
++  void *pc = ir_kptr(IR(ir->op2));  /* Expected return PC, taken from the constant in op2. */
++  int32_t delta = 1+LJ_FR2+bc_a(*((const BCIns *)pc - 1));  /* Slot count derived from the A operand of the instruction before pc. */
++  as->topslot -= (BCReg)delta;
++  if ((int32_t)as->topslot < 0) as->topslot = 0;  /* Clamp: never let topslot go negative. */
++  irt_setmark(IR(REF_BASE)->t);  /* Children must not coalesce with BASE reg. */
++  emit_setgl(as, base, jit_base);
++  emit_addptr(as, base, -8*delta);  /* Move base down by delta 8-byte slots. */
++  Reg tmp = ra_scratch(as, rset_exclude(RSET_GPR, base));
++  asm_guard(as, LOONGI_BNE, tmp,
++	    ra_allock(as, igcptr(pc), rset_exclude(rset_exclude(RSET_GPR, base), tmp)));  /* Compare the loaded word against pc. */
++  emit_dji(as, LOONGI_LD_D, tmp, base, -8&0xfff);  /* tmp = 64-bit word at base-8. */
++}
++
++/* -- Buffer operations --------------------------------------------------- */
++
++#if LJ_HASBUFFER
++static void asm_bufhdr_write(ASMState *as, Reg sb)
++{  /* Rewrite the SBuf.L field at sb: cur_L combined with the low bits of the old field value. */
++  Reg tmp = ra_scratch(as, rset_exclude(RSET_GPR, sb));
++  IRIns irgc;
++  irgc.ot = IRT(0, IRT_PGC);  /* GC type. */
++  emit_storeofs(as, &irgc, RID_TMP, sb, offsetof(SBuf, L));  /* Store the merged value back to sb->L. */
++  emit_djml(as, LOONGI_BSTRINS_D, RID_TMP, tmp,
++            lj_fls(SBUF_MASK_FLAG), 0);  /* Bit-field insert from tmp over bits lj_fls(SBUF_MASK_FLAG)..0. */
++  emit_getgl(as, RID_TMP, cur_L);  /* RID_TMP = global cur_L. */
++  emit_loadofs(as, &irgc, tmp, sb, offsetof(SBuf, L));  /* tmp = old sb->L. */
++}
++#endif
++
++/* -- Type conversions ---------------------------------------------------- */
++
++static void asm_tointg(ASMState *as, IRIns *ir, Reg left)
++{  /* Guarded FP-to-int conversion of FPR `left` into a GPR dest. Calls are listed in reverse execution order. */
++  Reg tmp = ra_scratch(as, rset_exclude(RSET_FPR, left));
++  Reg dest = ra_dest(as, ir, RSET_GPR);
++  asm_guard21(as, LOONGI_BCEQZ, 0);  /* Guard on condition flag 0 set by the compare below. */
++  emit_djk(as, LOONGI_FCMP_CEQ_D, 0, tmp, left);  /* Compare the round-tripped value with the input. */
++  emit_dj(as, LOONGI_FFINT_D_W, tmp, tmp);  /* Convert the integer back to double. */
++  emit_dj(as, LOONGI_MOVFR2GR_S, dest, tmp);  /* Copy the 32-bit integer result to dest. */
++  emit_dj(as, LOONGI_FTINT_W_D, tmp, left);  /* Convert the double in left to a 32-bit integer in tmp. */
++}
++
++static void asm_tobit(ASMState *as, IRIns *ir)
++{  /* TOBIT: FP-add op1 and op2 into a scratch FPR, then move the low 32 bits to the GPR dest. */
++  RegSet allow = RSET_FPR;
++  Reg dest = ra_dest(as, ir, RSET_GPR);
++  Reg left = ra_alloc1(as, ir->op1, allow);
++  Reg right = ra_alloc1(as, ir->op2, rset_clear(allow, left));
++  Reg tmp = ra_scratch(as, rset_clear(allow, right));  /* allow already excludes left at this point. */
++  emit_dj(as, LOONGI_MOVFR2GR_S, dest, tmp);
++  emit_djk(as, LOONGI_FADD_D, tmp, left, right);
++}
++
++static void asm_conv(ASMState *as, IRIns *ir)
++{  /* Assemble CONV: convert op1 from the source type encoded in op2 to the IR result type ir->t. */
++  IRType st = (IRType)(ir->op2 & IRCONV_SRCMASK);	// source type
++  int stfp = (st == IRT_NUM || st == IRT_FLOAT);  /* Source is floating-point. */
++  int st64 = (st == IRT_I64 || st == IRT_U64 || st == IRT_P64);  /* Source is a 64-bit integer/pointer. */
++  IRRef lref = ir->op1;
++  lj_assertA(irt_type(ir->t) != st, "inconsistent types for CONV");
++  /* Use GPR to pass floating-point arguments */
++  if (irt_isfp(ir->t) && ir->r >= RID_R4 && ir->r <= RID_R11) {  /* FP result wanted in a GPR in R4..R11. */
++    Reg dest = ra_dest(as, ir, RSET_GPR);
++    Reg ftmp = ra_scratch(as, RSET_FPR);  /* Conversion happens in ftmp, then the bits move to dest. */
++    if (stfp) {  /* FP to FP conversion. */
++      emit_dj(as, st == IRT_NUM ? LOONGI_MOVFR2GR_S : LOONGI_MOVFR2GR_D, dest, ftmp);
++      emit_dj(as, st == IRT_NUM ? LOONGI_FCVT_S_D : LOONGI_FCVT_D_S,
++	      ftmp, ra_alloc1(as, lref, RSET_FPR));
++    } else if (st == IRT_U32) {  /* U32 to FP conversion. */
++      /* y = (x ^ 0x80000000) + 2147483648.0 */
++      Reg left = ra_alloc1(as, lref, RSET_GPR);
++      Reg tmp = ra_scratch(as, rset_exclude(RSET_FPR, ftmp));
++      if (irt_isfloat(ir->t)) {
++        emit_dj(as, LOONGI_MOVFR2GR_S, dest, ftmp);
++	emit_dj(as, LOONGI_FCVT_S_D, ftmp, ftmp);
++      } else {
++        emit_dj(as, LOONGI_MOVFR2GR_D, dest, ftmp);
++      }
++      /* Must perform arithmetic with doubles to keep the precision. */
++      emit_djk(as, LOONGI_FADD_D, ftmp, ftmp, tmp);
++      emit_dj(as, LOONGI_FFINT_D_W, ftmp, ftmp);
++      emit_lsptr(as, LOONGI_FLD_D, (tmp & 0x1f),
++		 (void *)&as->J->k64[LJ_K64_2P31], RSET_GPR);
++      emit_dj(as, LOONGI_MOVGR2FR_W, ftmp, RID_TMP);
++      emit_djk(as, LOONGI_XOR, RID_TMP, RID_TMP, left);
++      emit_dji(as, LOONGI_ADDU16I_D, RID_TMP, RID_R0, 0x8000);
++    } else if(st == IRT_U64) {  /* U64 to FP conversion. */
++      /* if (x >= 1u<<63) y = (double)(int64_t)(x&(1u<<63)-1) + pow(2.0, 63) */
++      Reg left = ra_alloc1(as, lref, RSET_GPR);
++      Reg tmp = ra_scratch(as, rset_exclude(RSET_FPR, ftmp));
++      MCLabel l_end = emit_label(as);
++      if (irt_isfloat(ir->t)) {
++        emit_dj(as, LOONGI_MOVFR2GR_S, dest, ftmp);
++	emit_djk(as, LOONGI_FADD_S, ftmp, ftmp, tmp);
++	emit_lsptr(as, LOONGI_FLD_S, (tmp & 0x1f), (void *)&as->J->k32[LJ_K32_2P63],
++		   rset_exclude(RSET_GPR, left));
++	emit_branch(as, LOONGI_BGE, left, RID_ZERO, l_end);
++	emit_dj(as, LOONGI_FFINT_S_L, ftmp, ftmp);
++      } else {
++        emit_dj(as, LOONGI_MOVFR2GR_D, dest, ftmp);
++	emit_djk(as, LOONGI_FADD_D, ftmp, ftmp, tmp);
++	emit_lsptr(as, LOONGI_FLD_D, (tmp & 0x1f), (void *)&as->J->k64[LJ_K64_2P63],
++		   rset_exclude(RSET_GPR, left));
++	emit_branch(as, LOONGI_BGE, left, RID_ZERO, l_end);
++	emit_dj(as, LOONGI_FFINT_D_L, ftmp, ftmp);
++      }
++      emit_dj(as, LOONGI_MOVGR2FR_D, ftmp, RID_TMP);
++      emit_djml(as, LOONGI_BSTRPICK_D, RID_TMP, left, 62, 0);
++    } else {  /* Integer to FP conversion. */
++      Reg left = ra_alloc1(as, lref, RSET_GPR);
++      LOONGIns loongi = irt_isfloat(ir->t) ?
++        (st64 ? LOONGI_FFINT_S_L : LOONGI_FFINT_S_W) :
++        (st64 ? LOONGI_FFINT_D_L : LOONGI_FFINT_D_W);
++      emit_dj(as, st64 ? LOONGI_MOVFR2GR_D : LOONGI_MOVFR2GR_S, dest, ftmp);
++      emit_dj(as, loongi, ftmp, ftmp);
++      emit_dj(as, st64 ? LOONGI_MOVGR2FR_D : LOONGI_MOVGR2FR_W, ftmp, left);
++    }
++  } else if (irt_isfp(ir->t)) {  /* FP result in an FPR. */
++    Reg dest = ra_dest(as, ir, RSET_FPR);
++    if (stfp) {  /* FP to FP conversion. */
++      emit_dj(as, st == IRT_NUM ? LOONGI_FCVT_S_D : LOONGI_FCVT_D_S,
++	      dest, ra_alloc1(as, lref, RSET_FPR));
++    } else if (st == IRT_U32) {  /* U32 to FP conversion. */
++      /* y = (x ^ 0x80000000) + 2147483648.0 */
++      Reg left = ra_alloc1(as, lref, RSET_GPR);
++      Reg tmp = ra_scratch(as, rset_exclude(RSET_FPR, dest));
++      if (irt_isfloat(ir->t))
++	emit_dj(as, LOONGI_FCVT_S_D, dest, dest);
++      /* Must perform arithmetic with doubles to keep the precision. */
++      emit_djk(as, LOONGI_FADD_D, dest, dest, tmp);
++      emit_dj(as, LOONGI_FFINT_D_W, dest, dest);
++      emit_lsptr(as, LOONGI_FLD_D, (tmp & 0x1f),
++		 (void *)&as->J->k64[LJ_K64_2P31], RSET_GPR);
++      emit_dj(as, LOONGI_MOVGR2FR_W, dest, RID_TMP);
++      emit_djk(as, LOONGI_XOR, RID_TMP, RID_TMP, left);
++      emit_dji(as, LOONGI_ADDU16I_D, RID_TMP, RID_R0, 0x8000);
++    } else if(st == IRT_U64) {  /* U64 to FP conversion. */
++      /* if (x >= 1u<<63) y = (double)(int64_t)(x&(1u<<63)-1) + pow(2.0, 63) */
++      Reg left = ra_alloc1(as, lref, RSET_GPR);
++      Reg tmp = ra_scratch(as, rset_exclude(RSET_FPR, dest));
++      MCLabel l_end = emit_label(as);
++      if (irt_isfloat(ir->t)) {
++	emit_djk(as, LOONGI_FADD_S, dest, dest, tmp);
++	emit_lsptr(as, LOONGI_FLD_S, (tmp & 0x1f), (void *)&as->J->k32[LJ_K32_2P63],
++		   rset_exclude(RSET_GPR, left));
++	emit_branch(as, LOONGI_BGE, left, RID_ZERO, l_end);
++	emit_dj(as, LOONGI_FFINT_S_L, dest, dest);
++      } else {
++	emit_djk(as, LOONGI_FADD_D, dest, dest, tmp);
++	emit_lsptr(as, LOONGI_FLD_D, (tmp & 0x1f), (void *)&as->J->k64[LJ_K64_2P63],
++		   rset_exclude(RSET_GPR, left));
++	emit_branch(as, LOONGI_BGE, left, RID_ZERO, l_end);
++	emit_dj(as, LOONGI_FFINT_D_L, dest, dest);
++      }
++      emit_dj(as, LOONGI_MOVGR2FR_D, dest, RID_TMP);
++      emit_djml(as, LOONGI_BSTRPICK_D, RID_TMP, left, 62, 0);
++    } else {  /* Integer to FP conversion. */
++      Reg left = ra_alloc1(as, lref, RSET_GPR);
++      LOONGIns loongi = irt_isfloat(ir->t) ?
++	(st64 ? LOONGI_FFINT_S_L : LOONGI_FFINT_S_W) :
++	(st64 ? LOONGI_FFINT_D_L : LOONGI_FFINT_D_W);
++      emit_dj(as, loongi, dest, dest);
++      emit_dj(as, st64 ? LOONGI_MOVGR2FR_D : LOONGI_MOVGR2FR_W, dest, left);
++    }
++  } else if (stfp) {  /* FP to integer conversion. */
++    if (irt_isguard(ir->t)) {
++      /* Checked conversions are only supported from number to int. */
++      lj_assertA(irt_isint(ir->t) && st == IRT_NUM,
++		 "bad type for checked CONV");
++      asm_tointg(as, ir, ra_alloc1(as, lref, RSET_FPR));
++    } else {
++      Reg dest = ra_dest(as, ir, RSET_GPR);
++      Reg left = ra_alloc1(as, lref, RSET_FPR);
++      Reg tmp = ra_scratch(as, rset_exclude(RSET_FPR, left));  /* Scratch FPR distinct from left, used by all three branches. */
++      if (irt_isu32(ir->t)) {  /* FP to U32 conversion. */
++	/* y = (int)floor(x - 2147483648.0) ^ 0x80000000 */
++	emit_djk(as, LOONGI_XOR, dest, dest, RID_TMP);
++	emit_dji(as, LOONGI_ADDU16I_D, RID_TMP, RID_R0, 0x8000);
++	emit_dj(as, LOONGI_MOVFR2GR_S, dest, tmp);
++	emit_dj(as, st == IRT_FLOAT ? LOONGI_FTINTRM_W_S : LOONGI_FTINTRM_W_D,
++		tmp, tmp);
++	emit_djk(as, st == IRT_FLOAT ? LOONGI_FSUB_S : LOONGI_FSUB_D,
++		 tmp, left, tmp);
++	if (st == IRT_FLOAT)
++	  emit_lsptr(as, LOONGI_FLD_S, (tmp & 0x1f),
++		     (void *)&as->J->k32[LJ_K32_2P31], RSET_GPR);
++	else
++	  emit_lsptr(as, LOONGI_FLD_D, (tmp & 0x1f),
++		     (void *)&as->J->k64[LJ_K64_2P31], RSET_GPR);
++      } else if (irt_isu64(ir->t)) {  /* FP to U64 conversion. */
++	MCLabel l_end;
++	emit_dj(as, LOONGI_MOVFR2GR_D, dest, tmp);
++	l_end = emit_label(as);
++	/* For inputs >= 2^63 add -2^64 and convert again. */
++	if (st == IRT_NUM) {
++	  emit_dj(as, LOONGI_FTINTRZ_L_D, tmp, tmp);
++	  emit_djk(as, LOONGI_FADD_D, tmp, left, tmp);
++	  emit_lsptr(as, LOONGI_FLD_D, (tmp & 0x1f),
++		     (void *)&as->J->k64[LJ_K64_M2P64],
++		     rset_exclude(RSET_GPR, dest));
++	  emit_branch21(as, LOONGI_BCNEZ, 0, l_end);
++	  emit_dj(as, LOONGI_FTINTRZ_L_D, tmp, left);
++	  emit_djk(as, LOONGI_FCMP_CLT_D, 0, left, tmp);
++	  emit_lsptr(as, LOONGI_FLD_D, (tmp & 0x1f),
++		     (void *)&as->J->k64[LJ_K64_2P63],
++		     rset_exclude(RSET_GPR, dest));
++	} else {
++	  emit_dj(as, LOONGI_FTINTRZ_L_S, tmp, tmp);
++	  emit_djk(as, LOONGI_FADD_S, tmp, left, tmp);
++	  emit_lsptr(as, LOONGI_FLD_S, (tmp & 0x1f),
++		     (void *)&as->J->k32[LJ_K32_M2P64],
++		     rset_exclude(RSET_GPR, dest));
++	  emit_branch21(as, LOONGI_BCNEZ, 0, l_end);
++	  emit_dj(as, LOONGI_FTINTRZ_L_S, tmp, left);
++	  emit_djk(as, LOONGI_FCMP_CLT_S, 0, left, tmp);
++	  emit_lsptr(as, LOONGI_FLD_S, (tmp & 0x1f),
++		     (void *)&as->J->k32[LJ_K32_2P63],
++		     rset_exclude(RSET_GPR, dest));
++	}
++      } else {  /* FP to signed 32/64 bit integer conversion. */
++	LOONGIns loongi = irt_is64(ir->t) ?
++	  (st == IRT_NUM ? LOONGI_FTINTRZ_L_D : LOONGI_FTINTRZ_L_S) :
++	  (st == IRT_NUM ? LOONGI_FTINTRZ_W_D : LOONGI_FTINTRZ_W_S);
++	emit_dj(as, irt_is64(ir->t) ? LOONGI_MOVFR2GR_D : LOONGI_MOVFR2GR_S, dest, tmp);
++	emit_dj(as, loongi, tmp, left);  /* Convert into tmp: left may still be needed by other code and must not be overwritten. */
++      }
++    }
++  } else {  /* Integer to integer conversion. */
++    Reg dest = ra_dest(as, ir, RSET_GPR);
++    if (st >= IRT_I8 && st <= IRT_U16) {  /* Extend to 32 bit integer. */
++      Reg left = ra_alloc1(as, ir->op1, RSET_GPR);
++      lj_assertA(irt_isint(ir->t) || irt_isu32(ir->t), "bad type for CONV EXT");
++      if ((ir->op2 & IRCONV_SEXT)) {	// sign-extend
++	emit_dj(as, st == IRT_I8 ? LOONGI_EXT_W_B : LOONGI_EXT_W_H, dest, left);
++      } else {	// zero-extend
++        int msbd = st == IRT_U8 ? 7 : 15;
++        emit_djml(as, LOONGI_BSTRPICK_D, dest, left, msbd, 0);
++      }
++    } else {  /* 32/64 bit integer conversions. */
++      if (irt_is64(ir->t)) {
++	if (st64) {
++	  /* 64/64 bit no-op (cast)*/
++	  ra_leftov(as, dest, lref);	/* Do nothing, but may need to move regs. */
++	} else {
++	  Reg left = ra_alloc1(as, lref, RSET_GPR);
++	  if ((ir->op2 & IRCONV_SEXT)) {  /* 32 to 64 bit sign extension. */
++	    emit_dju(as, LOONGI_SLLI_W, dest, left, 0);
++	  } else {  /* 32 to 64 bit zero extension. */
++	    emit_djml(as, LOONGI_BSTRPICK_D, dest, left, 31, 0);
++	  }
++	}
++      } else {
++	if (st64 && !(ir->op2 & IRCONV_NONE)) {
++	  /* This is either a 32 bit reg/reg mov which zeroes the hiword
++	  ** or a load of the loword from a 64 bit address.
++	  */
++	  Reg left = ra_alloc1(as, lref, RSET_GPR);
++	  emit_djml(as, LOONGI_BSTRPICK_D, dest, left, 31, 0);
++	} else {  /* 32/32 bit no-op (cast). */
++	  ra_leftov(as, dest, lref);	/* Do nothing, but may need to move regs. */
++	}
++      }
++    }
++  }
++}
++
++static void asm_strto(ASMState *as, IRIns *ir)
++{  /* Call lj_strscan_num(str, tv) with tv pointing into the stack frame, guarded on the return status. */
++  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_strscan_num];
++  IRRef args[2];
++  int32_t ofs = SPOFS_TMP;  /* Default: temp slots; replaced by the spill slot below when one exists. */
++  RegSet drop = RSET_SCRATCH;
++  if (ra_hasreg(ir->r)) rset_set(drop, ir->r);  /* Spill dest reg (if any). */
++  ra_evictset(as, drop);
++  if (ir->s) ofs = sps_scale(ir->s);
++  asm_guard(as, LOONGI_BEQ, RID_RET, RID_ZERO);  /* Test return status. */
++  args[0] = ir->op1;      /* GCstr *str */
++  args[1] = ASMREF_TMP1;  /* TValue *n  */
++  asm_gencall(as, ci, args);
++  /* Store the result to the spill slot or temp slots. */
++  Reg tmp = ra_releasetmp(as, ASMREF_TMP1);
++  emit_addk(as, tmp, RID_SP, ofs, RSET_GPR);  /* tmp = RID_SP + ofs, the TValue pointer passed as args[1]. */
++}
++
++/* -- Memory references --------------------------------------------------- */
++
++/* Store tagged value for ref at base+ofs. */
++static void asm_tvstore64(ASMState *as, Reg base, int32_t ofs, IRRef ref)
++{
++  RegSet allow = rset_exclude(RSET_GPR, base);
++  IRIns *ir = IR(ref);
++  lj_assertA(irt_ispri(ir->t) || irt_isaddr(ir->t) || irt_isinteger(ir->t),
++	     "store of IR type %d", irt_type(ir->t));
++  if (irref_isk(ref)) {  /* Constant: materialize the complete 64-bit TValue and store it. */
++    TValue k;
++    lj_ir_kvalue(as->J->L, &k, ir);
++    Reg ku64 = ra_allock(as, (int64_t)k.u64, allow);
++    rset_clear(allow, ku64);
++    if (checki12(ofs)) {
++      emit_dji(as, LOONGI_ST_D, ku64, base, ofs&0xfff);
++    } else {  /* Offset does not pass checki12(): use the register-indexed store. */
++      emit_djk(as, LOONGI_STX_D, ku64, base, ra_allock(as, ofs, allow));
++    }
++  } else {  /* Non-constant: combine the value with its type tag in RID_TMP, then store. */
++    Reg src = ra_alloc1(as, ref, allow);
++    rset_clear(allow, src);
++    Reg type = ra_allock(as, (int64_t)irt_toitype(ir->t) << 47, allow);  /* Internal type tag shifted to bit 47. */
++    emit_dji(as, LOONGI_ST_D, RID_TMP, base, ofs&0xfff);  /* NOTE(review): no checki12(ofs) here, unlike the constant path -- confirm callers only pass small offsets. */
++    if (irt_isinteger(ir->t)) {
++      emit_djk(as, LOONGI_ADD_D, RID_TMP, RID_TMP, type);
++      emit_djml(as, LOONGI_BSTRPICK_D, RID_TMP, src, 31, 0);  /* Take bits 31..0 of src before adding the tag. */
++    } else {
++      emit_djk(as, LOONGI_ADD_D, RID_TMP, src, type);
++    }
++  }
++}
++
++/* Get pointer to TValue. */
++static void asm_tvptr(ASMState *as, Reg dest, IRRef ref, MSize mode)	// todo-new
++{  /* Sets dest to the address of a TValue for ref; with IRTMPREF_IN1 the value is made available there. */
++  int32_t tmpofs = (int32_t)(offsetof(global_State, tmptv)-32768);  /* Offset of g->tmptv as used with RID_JGL; the -32768 presumably matches a bias in RID_JGL. */
++  RegSet allow = RSET_GPR;
++  if ((mode & IRTMPREF_IN1)) {
++    IRIns *ir = IR(ref);
++    if (irt_isnum(ir->t)) {
++      if ((mode & IRTMPREF_OUT1)) {  /* Also an output: copy the number into g->tmptv. */
++        Reg src = ra_alloc1(as, ref, RSET_FPR);
++	emit_addk(as, dest, RID_JGL, tmpofs, allow);
++        emit_lso(as, LOONGI_FST_D, src, RID_JGL, tmpofs, allow);  /* src is an FPR: use the FP store, as asm_ahustore() does. */
++      } else if (irref_isk(ref)) {
++        /* Use the number constant itself as a TValue. */
++        ra_allockreg(as, igcptr(ir_knum(ir)), dest);
++      } else {  /* Spill the number and point at its stack slot. */
++        emit_dji(as, LOONGI_ADDI_D, dest, RID_SP, ra_spill(as, ir)&0xfff);
++      }
++    } else {
++      /* Otherwise use g->tmptv to hold the TValue. */
++      asm_tvstore64(as, dest, 0, ref);
++      emit_addk(as, dest, RID_JGL, tmpofs, RSET_GPR);
++    }
++  } else {  /* No input value: only compute the address of g->tmptv. */
++    emit_addk(as, dest, RID_JGL, tmpofs, RSET_GPR);
++  }
++}
++
++static void asm_aref(ASMState *as, IRIns *ir)
++{  /* AREF: dest = op1 + 8*op2; a constant index is folded into one ADDI_D when it passes checki12(). */
++  Reg dest = ra_dest(as, ir, RSET_GPR);
++  Reg idx, base;
++  if (irref_isk(ir->op2)) {
++    IRRef tab = IR(ir->op1)->op1;
++    int32_t ofs = asm_fuseabase(as, tab);
++    IRRef refa = ofs ? tab : ir->op1;  /* Non-zero ofs: address relative to the table ref instead of op1. */
++    ofs += 8*IR(ir->op2)->i;
++    if (checki12(ofs)) {
++      base = ra_alloc1(as, refa, RSET_GPR);
++      emit_dji(as, LOONGI_ADDI_D, dest, base, ofs&0xfff);
++      return;
++    }
++  }
++  base = ra_alloc1(as, ir->op1, RSET_GPR);
++  idx = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, base));
++  emit_djk(as, LOONGI_ADD_D, dest, RID_TMP, base);
++  emit_dju(as, LOONGI_SLLI_D, RID_TMP, idx, 3);  /* RID_TMP = idx << 3, i.e. idx*8. */
++}
++
++/* Inlined hash lookup. Specialized for key type and for const keys.
++** The equivalent C code is:
++**   Node *n = hashkey(t, key);
++**   do {
++**     if (lj_obj_equal(&n->key, key)) return &n->val;
++**   } while ((n = nextnode(n)));
++**   return niltv(L);
++*/
++static void asm_href(ASMState *as, IRIns *ir, IROp merge)
++{  /* Emits the chain walk sketched in the comment above; merge (IR_EQ/IR_NE) changes how the exits are wired. */
++  RegSet allow = RSET_GPR;
++  int destused = ra_used(ir);
++  Reg dest = ra_dest(as, ir, allow);
++  Reg tab = ra_alloc1(as, ir->op1, rset_clear(allow, dest));
++  Reg key = RID_NONE, type = RID_NONE, tmpnum = RID_NONE, tmp1, tmp2;
++  Reg cmp64 = RID_NONE;
++  IRRef refkey = ir->op2;
++  IRIns *irkey = IR(refkey);
++  int isk = irref_isk(refkey);
++  IRType1 kt = irkey->t;
++  uint32_t khash;
++  MCLabel l_end, l_loop, l_next;
++  rset_clear(allow, tab);
++  tmp1 = ra_scratch(as, allow);
++  rset_clear(allow, tmp1);
++  tmp2 = ra_scratch(as, allow);
++  rset_clear(allow, tmp2);
++
++  if (irt_isnum(kt)) {  /* Number key: kept in an FPR and compared with an FP compare. */
++    key = ra_alloc1(as, refkey, RSET_FPR);
++    tmpnum = ra_scratch(as, rset_exclude(RSET_FPR, key));
++  } else {
++    /* Allocate cmp64 register used for 64-bit comparisons */
++    if (!isk && irt_isaddr(kt)) {
++      cmp64 = tmp2;  /* Filled at runtime with key+type (see the ADD_D into tmp2 below). */
++    } else {
++      int64_t k;
++      if (isk && irt_isaddr(kt)) {
++	k = ((int64_t)irt_toitype(kt) << 47) | irkey[1].tv.u64;
++      } else {
++	lj_assertA(irt_ispri(kt) && !irt_isnil(kt), "bad HREF key type");
++	k = ~((int64_t)~irt_toitype(kt) << 47);
++      }
++      cmp64 = ra_allock(as, k, allow);
++      rset_clear(allow, cmp64);
++    }
++    if (!irt_ispri(kt)) {
++      key = ra_alloc1(as, refkey, allow);
++      rset_clear(allow, key);
++    }
++  } 
++
++  /* Key not found in chain: jump to exit (if merged) or load niltv. */
++  l_end = emit_label(as);
++  as->invmcp = NULL;
++  if (merge == IR_NE)
++    asm_guard(as, LOONGI_BEQ, RID_ZERO, RID_ZERO);  /* Compares RID_ZERO with itself, i.e. always taken. */
++  else if (destused)
++    emit_loada(as, dest, niltvg(J2G(as->J)));
++
++  /* Follow hash chain until the end. */
++  l_loop = --as->mcp;  /* Reserve one instruction slot; it is filled in with the loop branch further down. */
++  emit_move(as, dest, tmp1);
++  emit_dji(as, LOONGI_LD_D, tmp1, dest, (int32_t)offsetof(Node, next)&0xfff);
++  l_next = emit_label(as);
++
++  /* Type and value comparison. */
++  if (merge == IR_EQ) {  /* Must match asm_guard(). */
++    l_end = asm_exitstub_addr(as);
++  }
++  if (irt_isnum(kt)) {
++    emit_branch21(as, LOONGI_BCNEZ, 0, l_end);
++    emit_dj32i(as, RID_TMP, RID_ZERO, as->snapno);
++    emit_djk(as, LOONGI_FCMP_CEQ_D, 0, tmpnum, key);
++    emit_branch(as, LOONGI_BEQ, tmp1, RID_ZERO, l_next);
++    emit_dju(as, LOONGI_SLTUI, tmp1, tmp1, ((int32_t)LJ_TISNUM)&0xfff);
++    emit_dju(as, LOONGI_SRAI_D, tmp1, tmp1, 47);
++    emit_dj(as, LOONGI_MOVGR2FR_D, tmpnum, tmp1);
++  } else {
++    emit_branch(as, LOONGI_BEQ, tmp1, cmp64, l_end);
++    emit_dj32i(as, RID_TMP, RID_ZERO, as->snapno);
++  }
++  emit_dji(as, LOONGI_LD_D, tmp1, dest, (int32_t)offsetof(Node, key.u64)&0xfff);
++  *l_loop = LOONGI_BNE | LOONGF_J(tmp1) | LOONGF_D(RID_ZERO) | LOONGF_I(((as->mcp-l_loop) & 0xffffu));  /* Patch the reserved slot: branch back while tmp1 != 0. */
++  if (!isk && irt_isaddr(kt)) {
++    type = ra_allock(as, (int64_t)irt_toitype(kt) << 47, allow);
++    emit_djk(as, LOONGI_ADD_D, tmp2, key, type);
++    rset_clear(allow, type);
++  }
++
++  /* Load main position relative to tab->node into dest. */
++  khash = isk ? ir_khash(as, irkey) : 1;  /* For non-constant keys any non-zero value selects the hashing path. */
++  if (khash == 0) {
++    emit_dji(as, LOONGI_LD_D, dest, tab, (int32_t)offsetof(GCtab, node)&0xfff);
++  } else {
++    Reg tmphash = tmp1;
++    if (isk)
++      tmphash = ra_allock(as, khash, allow);
++    /* node = tab->node + (idx*32-idx*8) */
++    emit_djk(as, LOONGI_ADD_D, dest, dest, tmp1);
++    lj_assertA(sizeof(Node) == 24, "bad Node size");
++    emit_djk(as, LOONGI_SUB_W, tmp1, tmp2, tmp1);
++    emit_dju(as, LOONGI_SLLI_W, tmp1, tmp1, 3);
++    emit_dju(as, LOONGI_SLLI_W, tmp2, tmp1, 5);
++    emit_djk(as, LOONGI_AND, tmp1, tmp2, tmphash);	// idx = hash & tab->hmask
++    emit_dji(as, LOONGI_LD_D, dest, tab, ((int32_t)offsetof(GCtab, node))&0xfff);
++    emit_dji(as, LOONGI_LD_W, tmp2, tab, ((int32_t)offsetof(GCtab, hmask))&0xfff);
++    if (isk) {
++      /* Nothing to do. */
++    } else if (irt_isstr(kt)) {
++      emit_dji(as, LOONGI_LD_W, tmp1, key, ((int32_t)offsetof(GCstr, sid))&0xfff);
++    } else {  /* Must match with hash*() in lj_tab.c. */
++      emit_djk(as, LOONGI_SUB_W, tmp1, tmp1, tmp2);
++      emit_dju(as, LOONGI_ROTRI_W, tmp2, tmp2, (-HASH_ROT3)&0x1f);
++      emit_djk(as, LOONGI_XOR, tmp1, tmp2, tmp1);
++      emit_dju(as, LOONGI_ROTRI_W, tmp1, tmp1, (-HASH_ROT2-HASH_ROT1)&0x1f);
++      emit_djk(as, LOONGI_SUB_W, tmp2, tmp2, dest);
++      emit_djk(as, LOONGI_XOR, tmp2, tmp2, tmp1);
++      emit_dju(as, LOONGI_ROTRI_W, dest, tmp1, (-HASH_ROT1)&0x1f);
++      if (irt_isnum(kt)) {
++	emit_dju(as, LOONGI_SLLI_W, tmp1, tmp1, 1);
++	emit_dju(as, LOONGI_SRAI_D, tmp1, tmp1, 32);	// hi
++	emit_dju(as, LOONGI_SLLI_W, tmp2, tmp1, 0);	// lo
++	emit_dj(as, LOONGI_MOVFR2GR_D, tmp1, key);
++      } else {
++	checkmclim(as);
++	emit_dju(as, LOONGI_SRAI_D, tmp1, tmp1, 32);	// hi
++	emit_dju(as, LOONGI_SLLI_W, tmp2, key, 0);	// lo
++	emit_djk(as, LOONGI_ADD_D, tmp1, key, type);
++      }
++    }
++  }
++}
++
++static void asm_hrefk(ASMState *as, IRIns *ir)
++{  /* HREFK: load the key stored at a fixed node slot and guard that it equals the expected constant. */
++  IRIns *kslot = IR(ir->op2);
++  IRIns *irkey = IR(kslot->op1);
++  int32_t ofs = (int32_t)(kslot->op2 * sizeof(Node));  /* Byte offset of the slot from the node base. */
++  int32_t kofs = ofs + (int32_t)offsetof(Node, key);
++  Reg dest = (ra_used(ir)||ofs > 32736) ? ra_dest(as, ir, RSET_GPR) : RID_NONE;
++  Reg node = ra_alloc1(as, ir->op1, RSET_GPR);
++  RegSet allow = rset_exclude(RSET_GPR, node);
++  Reg idx = node;
++  Reg key = ra_scratch(as, allow);
++  int64_t k;
++  lj_assertA(ofs % sizeof(Node) == 0, "unaligned HREFK slot");
++  if (ofs > 32736) {  /* Large offset: address the slot through dest (computed by the ADD_D at the end). */
++    idx = dest;
++    rset_clear(allow, dest);
++    kofs = (int32_t)offsetof(Node, key);
++  } else if (ra_hasreg(dest)) {
++    emit_addk(as, dest, node, ofs, allow);
++  }
++  if (irt_ispri(irkey->t)) {
++    lj_assertA(!irt_isnil(irkey->t), "bad HREFK key type");
++    k = ~((int64_t)~irt_toitype(irkey->t) << 47);
++  } else if (irt_isnum(irkey->t)) {
++    k = (int64_t)ir_knum(irkey)->u64;
++  } else {  /* GC object key: type tag at bit 47 combined with the object address. */
++    k = ((int64_t)irt_toitype(irkey->t) << 47) | (int64_t)ir_kgc(irkey);
++  }
++  asm_guard(as, LOONGI_BNE, key, ra_allock(as, k, allow));
++  emit_lso(as, LOONGI_LD_D, key, idx, kofs, allow);
++  if (ofs > 32736)
++    emit_djk(as, LOONGI_ADD_D, dest, node, ra_allock(as, ofs, allow));
++}
++
++static void asm_uref(ASMState *as, IRIns *ir)
++{  /* UREFO/UREFC: put the address of the value of upvalue number (op2 >> 8) of function op1 into dest. */
++  Reg dest = ra_dest(as, ir, RSET_GPR);
++  if (irref_isk(ir->op1)) {  /* Constant function: load through the known address of uv.v. */
++    GCfunc *fn = ir_kfunc(IR(ir->op1));
++    MRef *v = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv.v;
++    emit_lsptr(as, LOONGI_LD_D, dest, v, RSET_GPR);
++  } else {
++    Reg uv = ra_scratch(as, RSET_GPR);
++    Reg func = ra_alloc1(as, ir->op1, RSET_GPR);
++    if (ir->o == IR_UREFC) {  /* Guard on the closed field and point at the embedded tv. */
++      Reg tmp = ra_scratch(as, rset_exclude(rset_exclude(RSET_GPR, dest), uv));
++      asm_guard(as, LOONGI_BEQ, tmp, RID_ZERO);
++      emit_dji(as, LOONGI_ADDI_D, dest, uv, ((int32_t)offsetof(GCupval, tv))&0xfff);
++      emit_dji(as, LOONGI_LD_BU, tmp, uv, ((int32_t)offsetof(GCupval, closed))&0xfff);
++    } else {
++      emit_dji(as, LOONGI_LD_D, dest, uv, ((int32_t)offsetof(GCupval, v))&0xfff);
++    }
++    emit_lso(as, LOONGI_LD_D, uv, func, (int32_t)offsetof(GCfuncL, uvptr) +
++      (int32_t)sizeof(MRef) * (int32_t)(ir->op2 >> 8), RSET_GPR);  /* uv = func->uvptr[op2 >> 8]. */
++  }
++}
++
++static void asm_fref(ASMState *as, IRIns *ir)
++{  /* Emits no code; only asserts that the FREF result is not used on its own. */
++  UNUSED(as); UNUSED(ir);
++  lj_assertA(!ra_used(ir), "unfused FREF");
++}
++
++static void asm_strref(ASMState *as, IRIns *ir)
++{  /* STRREF: dest = op1 + op2 + sizeof(GCstr). */
++  RegSet allow = RSET_GPR;
++  Reg dest = ra_dest(as, ir, allow);
++  Reg base = ra_alloc1(as, ir->op1, allow);
++  IRIns *irr = IR(ir->op2);
++  int32_t ofs = sizeof(GCstr);
++  rset_clear(allow, base);
++  if (irref_isk(ir->op2) && checki12(ofs + irr->i)) {  /* Constant index that passes checki12(): one ADDI_D. */
++    emit_dji(as, LOONGI_ADDI_D, dest, base, (ofs + irr->i)&0xfff);
++  } else {  /* Otherwise add the index register first, then the header size. */
++    emit_dji(as, LOONGI_ADDI_D, dest, dest, ofs&0xfff);
++    emit_djk(as, LOONGI_ADD_D, dest, base, ra_alloc1(as, ir->op2, allow));
++  }
++}
++
++/* -- Loads and stores ---------------------------------------------------- */
++
++static LOONGIns asm_fxloadins(ASMState *as, IRIns *ir)
++{
++  /* Select the LoongArch load opcode matching the IR type of the result. */
++  LOONGIns li;
++  UNUSED(as);
++  switch (irt_type(ir->t)) {
++  case IRT_I8:  li = LOONGI_LD_B;  break;
++  case IRT_U8:  li = LOONGI_LD_BU; break;
++  case IRT_I16: li = LOONGI_LD_H;  break;
++  case IRT_U16: li = LOONGI_LD_HU; break;
++  case IRT_NUM:
++    lj_assertA(!LJ_SOFTFP32, "unsplit FP op");
++    li = LOONGI_FLD_D;
++    break;
++  case IRT_FLOAT:
++    li = LOONGI_FLD_S;
++    break;
++  default:  /* All remaining types: the width follows irt_is64(). */
++    li = irt_is64(ir->t) ? LOONGI_LD_D : LOONGI_LD_W;
++    break;
++  }
++  return li;
++}
++
++static LOONGIns asm_fxstoreins(ASMState *as, IRIns *ir)
++{
++  /* Select the LoongArch store opcode matching the IR type of the value. */
++  IRType ty = (IRType)irt_type(ir->t);
++  UNUSED(as);
++  if (ty == IRT_I8 || ty == IRT_U8) return LOONGI_ST_B;
++  if (ty == IRT_I16 || ty == IRT_U16) return LOONGI_ST_H;
++  if (ty == IRT_NUM) {
++    lj_assertA(!LJ_SOFTFP32, "unsplit FP op");
++    /* With LJ_SOFTFP set, a number takes the same opcode as IRT_FLOAT. */
++    return LJ_SOFTFP ? LOONGI_FST_S : LOONGI_FST_D;
++  }
++  if (ty == IRT_FLOAT) return LOONGI_FST_S;
++  return (LJ_64 && irt_is64(ir->t)) ? LOONGI_ST_D : LOONGI_ST_W;
++}
++
++static void asm_fload(ASMState *as, IRIns *ir)
++{  /* FLOAD: load field op2 of object op1 (or a GG_State-relative value when op1 is REF_NIL) into dest. */
++  RegSet allow = RSET_GPR;
++  Reg idx, dest = ra_dest(as, ir, allow);
++  rset_clear(allow, dest);
++  LOONGIns loongi = asm_fxloadins(as, ir);
++  int32_t ofs;
++  if (ir->op1 == REF_NIL) {  /* FLOAD from GG_State with offset. */
++    idx = ra_allock(as, (int64_t)J2GG(as->J), allow);
++    ofs = (int32_t)(ir->op2<<2);  /* op2 is scaled by 4 to get the byte offset. */
++  } else {
++    idx = ra_alloc1(as, ir->op1, allow);
++    if (ir->op2 == IRFL_TAB_ARRAY) {
++      ofs = asm_fuseabase(as, ir->op1);
++      if (ofs) {  /* Turn the t->array load into an add for colocated arrays. */
++	emit_dji(as, LOONGI_ADDI_D, dest, idx, ofs);
++	return;
++      }
++    }
++    ofs = field_ofs[ir->op2];
++  }
++  rset_clear(allow, idx);
++  lj_assertA(!irt_isfp(ir->t), "bad FP FLOAD");
++  emit_lso(as, loongi, dest, idx, ofs, allow);
++}
++
++static void asm_fstore(ASMState *as, IRIns *ir)
++{  /* FSTORE: store op2 into the field described by the FREF in op1; nothing is emitted for a sunk store. */
++  if (ir->r == RID_SINK)
++    return;
++  Reg src = ra_alloc1(as, ir->op2, RSET_GPR);
++  IRIns *irf = IR(ir->op1);  /* The FREF: op1 is the object, op2 the field id. */
++  Reg idx = ra_alloc1(as, irf->op1, rset_exclude(RSET_GPR, src));
++  int32_t ofs = field_ofs[irf->op2];
++  lj_assertA(!irt_isfp(ir->t), "bad FP FSTORE");
++  emit_dji(as, asm_fxstoreins(as, ir), src, idx, ofs&0xfff);  /* No range check on ofs; presumably all field offsets are small. */
++}
++
++static void asm_xload(ASMState *as, IRIns *ir)
++{  /* XLOAD: load from the address given by op1 into an FPR or GPR dest, depending on the result type. */
++  Reg dest = ra_dest(as, ir, (irt_isfp(ir->t)) ? RSET_FPR : RSET_GPR);
++  lj_assertA(LJ_TARGET_UNALIGNED || !(ir->op2 & IRXLOAD_UNALIGNED),
++	     "unaligned XLOAD");
++  asm_fusexref(as, asm_fxloadins(as, ir), dest, ir->op1, RSET_GPR, 0);
++}
++
++static void asm_xstore_(ASMState *as, IRIns *ir, int32_t ofs)
++{  /* XSTORE: store op2 to the address given by op1 plus ofs; nothing is emitted for a sunk store. */
++  if (ir->r == RID_SINK)
++    return;
++  Reg src = ra_alloc1(as, ir->op2, irt_isfp(ir->t) ? RSET_FPR : RSET_GPR);
++  asm_fusexref(as, asm_fxstoreins(as, ir), src, ir->op1,
++		 rset_exclude(RSET_GPR, src), ofs);
++}
++
++#define asm_xstore(as, ir)	asm_xstore_(as, ir, 0)
++
++static void asm_ahuvload(ASMState *as, IRIns *ir)
++{  /* Typed load of a TValue through the reference in op1, with a guard on its type tag. */
++  Reg dest = RID_NONE, type, idx;
++  RegSet allow = RSET_GPR;
++  int32_t ofs = 0;
++  IRType1 t = ir->t;
++
++  type = ra_scratch(as, allow);  /* Receives the type tag (loaded value shifted right by 47). */
++  rset_clear(allow, type);
++
++  if (ra_used(ir)) {
++    lj_assertA((irt_isnum(ir->t)) || irt_isint(ir->t) || irt_isaddr(ir->t),
++	       "bad load type %d", irt_type(ir->t));
++    dest = ra_dest(as, ir, irt_isnum(t) ? RSET_FPR : allow);
++    rset_clear(allow, dest);
++    if (irt_isaddr(t))
++      emit_djml(as, LOONGI_BSTRPICK_D, dest, dest, 46, 0);  /* Keep bits 46..0, dropping the tag. */
++    else if (irt_isint(t))
++      emit_dju(as, LOONGI_SLLI_W, dest, dest, 0);
++  }
++  idx = asm_fuseahuref(as, ir->op1, &ofs, allow);
++  if (ir->o == IR_VLOAD) ofs += 8 * ir->op2;  /* VLOAD: op2 is an additional index in 8-byte units. */
++  rset_clear(allow, idx);
++  if (irt_isnum(t)) {
++    Reg tmp2 = ra_scratch(as, allow);
++    asm_guard(as, LOONGI_BEQ, tmp2, RID_ZERO);
++    emit_dju(as, LOONGI_SLTUI, tmp2, type, ((int32_t)LJ_TISNUM)&0xfff);  /* tmp2 = (type < LJ_TISNUM), unsigned. */
++  } else {
++    asm_guard(as, LOONGI_BNE, type,
++	      ra_allock(as, (int32_t)irt_toitype(t), allow));
++  }
++  if (ra_hasreg(dest)) {
++    if (irt_isnum(t)) {  /* Number: separate FP load for the value; the tag word goes to type. */
++      emit_lso(as, LOONGI_FLD_D, dest, idx, ofs, allow);
++      dest = type;
++    }
++  } else {  /* Result unused: only the type check is needed. */
++    dest = type;
++  }
++  emit_dju(as, LOONGI_SRAI_D, type, dest, 47);
++  emit_lso(as, LOONGI_LD_D, dest, idx, ofs, allow);
++}
++
++static void asm_ahustore(ASMState *as, IRIns *ir)
++{  /* Store a tagged value through the reference in op1; nothing is emitted for a sunk store. */
++  RegSet allow = RSET_GPR;
++  Reg idx, src = RID_NONE, type = RID_NONE;
++  int32_t ofs = 0;
++  if (ir->r == RID_SINK)
++    return;
++  if (irt_isnum(ir->t)) {  /* Number: stored directly from an FPR. */
++    src = ra_alloc1(as, ir->op2, RSET_FPR);
++    idx = asm_fuseahuref(as, ir->op1, &ofs, allow);
++    emit_lso(as, LOONGI_FST_D, src, idx, ofs, allow);
++  } else {
++    Reg tmp = RID_TMP;
++    if (irt_ispri(ir->t)) {  /* Primitive: the whole TValue is a constant. */
++      tmp = ra_allock(as, ~((int64_t)~irt_toitype(ir->t) << 47), allow);
++      rset_clear(allow, tmp);
++    } else {
++      src = ra_alloc1(as, ir->op2, allow);
++      rset_clear(allow, src);
++      type = ra_allock(as, (int64_t)irt_toitype(ir->t) << 47, allow);  /* Internal type tag shifted to bit 47. */
++      rset_clear(allow, type);
++    }
++    idx = asm_fuseahuref(as, ir->op1, &ofs, allow);
++    emit_lso(as, LOONGI_ST_D, tmp, idx, ofs, allow);
++    if (ra_hasreg(src)) {  /* Combine value and tag in tmp (RID_TMP on this path). */
++      if (irt_isinteger(ir->t)) {
++	emit_djk(as, LOONGI_ADD_D, tmp, tmp, type);
++	emit_djml(as, LOONGI_BSTRPICK_D, tmp, src, 31, 0);  /* Take bits 31..0 of src before adding the tag. */
++      } else {
++	emit_djk(as, LOONGI_ADD_D, tmp, src, type);
++      }
++    }
++  }
++}
++
++static void asm_sload(ASMState *as, IRIns *ir)
++{  /* SLOAD: load stack slot op1 relative to BASE, with optional type check and number/int conversion. */
++  Reg dest = RID_NONE, type = RID_NONE, base;
++  RegSet allow = RSET_GPR;
++  IRType1 t = ir->t;
++  int32_t ofs = 8*((int32_t)ir->op1-2);  /* Byte offset of the slot from the BASE register. */
++  lj_assertA(!(ir->op2 & IRSLOAD_PARENT),
++	     "bad parent SLOAD");  /* Handled by asm_head_side(). */
++  lj_assertA(irt_isguard(ir->t) || !(ir->op2 & IRSLOAD_TYPECHECK),
++	     "inconsistent SLOAD variant");
++  if ((ir->op2 & IRSLOAD_CONVERT) && irt_isguard(t) && irt_isint(t)) {  /* Guarded number to int conversion. */
++    dest = ra_scratch(as, RSET_FPR);
++    asm_tointg(as, ir, dest);
++    t.irt = IRT_NUM;  /* Continue with a regular number type check. */
++  } else if (ra_used(ir)) {
++    lj_assertA((irt_isnum(ir->t)) ||
++	       irt_isint(ir->t) || irt_isaddr(ir->t),
++	       "bad SLOAD type %d", irt_type(ir->t));
++    dest = ra_dest(as, ir, irt_isnum(t) ? RSET_FPR : allow);
++    rset_clear(allow, dest);
++    base = ra_alloc1(as, REF_BASE, allow);
++    rset_clear(allow, base);
++    if (ir->op2 & IRSLOAD_CONVERT) {
++      if (irt_isint(t)) {  /* Unguarded number to int: load into an FPR, truncate, move to dest. */
++	Reg tmp = ra_scratch(as, RSET_FPR);
++	emit_dj(as, LOONGI_MOVFR2GR_S, dest, tmp);
++	emit_dj(as, LOONGI_FTINTRZ_W_D, tmp, tmp);
++	dest = tmp;
++	t.irt = IRT_NUM;  /* Check for original type. */
++      } else {  /* Int to number: load into a GPR, move to the FPR dest, convert. */
++	Reg tmp = ra_scratch(as, RSET_GPR);
++	emit_dj(as, LOONGI_FFINT_D_W, dest, dest);
++	emit_dj(as, LOONGI_MOVGR2FR_W, dest, tmp);
++	dest = tmp;
++	t.irt = IRT_INT;  /* Check for original type. */
++      }
++    } else if (irt_isaddr(t)) {
++      /* Clear type from pointers. */
++      emit_djml(as, LOONGI_BSTRPICK_D, dest, dest, 46, 0);
++    } else if (irt_isint(t) && (ir->op2 & IRSLOAD_TYPECHECK)) {
++      /* Sign-extend integers. */
++      emit_dju(as, LOONGI_SLLI_W, dest, dest, 0);
++    }
++    goto dotypecheck;  /* base is already allocated on this path. */
++  }
++  base = ra_alloc1(as, REF_BASE, allow);
++  rset_clear(allow, base);
++dotypecheck:
++  if ((ir->op2 & IRSLOAD_TYPECHECK)) {
++    if (dest < RID_MAX_GPR) {  /* dest is a GPR: reuse it for the loaded 64-bit word. */
++      type = dest;
++    } else {  /* dest is an FPR or RID_NONE: a separate GPR is needed. */
++      type = ra_scratch(as, allow);
++    }
++    rset_clear(allow, type);
++    Reg tmp1 = ra_scratch(as, allow);
++    if (irt_ispri(t)) {  /* Primitive: compare the whole word against the constant. */
++      asm_guard(as, LOONGI_BNE, type,
++		ra_allock(as, ~((int64_t)~irt_toitype(t) << 47) , allow));
++    } else if ((ir->op2 & IRSLOAD_KEYINDEX)) {  /* Upper 32 bits must equal LJ_KEYINDEX. */
++      asm_guard(as, LOONGI_BNE, tmp1,
++               ra_allock(as, (int32_t)LJ_KEYINDEX, allow));
++      emit_dju(as, LOONGI_SRAI_D, tmp1, type, 32);
++    } else {
++      if (irt_isnum(t)) {
++        asm_guard(as, LOONGI_BEQ, tmp1, RID_ZERO);
++        emit_dji(as, LOONGI_SLTUI, tmp1, tmp1, LJ_TISNUM&0xfff);  /* tmp1 = (tag < LJ_TISNUM), unsigned. */
++	if (ra_hasreg(dest)) {
++	  emit_lso(as, LOONGI_FLD_D, dest, base, ofs, allow);
++	}
++      } else {
++	asm_guard(as, LOONGI_BNE, tmp1,
++		  ra_allock(as, (int32_t)irt_toitype(t), allow));
++      }
++      emit_dju(as, LOONGI_SRAI_D, tmp1, type, 47);  /* tmp1 = tag: loaded word shifted right by 47. */
++    }
++    emit_lso(as, LOONGI_LD_D, type, base, ofs, allow);  /* Load the full 64-bit slot. */
++  } else if (ra_hasreg(dest)) {  /* No type check: plain load of the value. */
++    if (irt_isnum(t)) {
++      emit_lso(as, LOONGI_FLD_D, dest, base, ofs, allow);
++    } else {
++      emit_lso(as, irt_isint(t) ? LOONGI_LD_W : LOONGI_LD_D, dest, base, ofs, allow);
++    }
++  }
++}
++
++/* -- Allocations --------------------------------------------------------- */
++
++#if LJ_HASFFI
++static void asm_cnew(ASMState *as, IRIns *ir)
++{  /* Assemble IR_CNEW/IR_CNEWI: allocate a cdata object through a C call. */
++  CTState *cts = ctype_ctsG(J2G(as->J));
++  CTypeID id = (CTypeID)IR(ir->op1)->i;
++  CTSize sz;
++  CTInfo info = lj_ctype_info(cts, id, &sz);
++  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_mem_newgco];
++  IRRef args[4];
++  RegSet drop = RSET_SCRATCH;
++  lj_assertA(sz != CTSIZE_INVALID || (ir->o == IR_CNEW && ir->op2 != REF_NIL),
++	     "bad CNEW/CNEWI operands");
++
++  as->gcsteps++;  /* Read and reset again by asm_gc_check(). */
++  if (ra_hasreg(ir->r))
++    rset_clear(drop, ir->r);  /* Dest reg handled below. */
++  ra_evictset(as, drop);
++  if (ra_used(ir))
++    ra_destreg(as, ir, RID_RET);  /* GCcdata * */
++
++  /* Initialize immutable cdata object. */
++  if (ir->o == IR_CNEWI) {
++    RegSet allow = (RSET_GPR & ~RSET_SCRATCH);
++    emit_dji(as, sz == 8 ? LOONGI_ST_D : LOONGI_ST_W, ra_alloc1(as, ir->op2, allow),
++	     RID_RET, (sizeof(GCcdata))&0xfff);  /* Payload follows the header. */
++    lj_assertA(sz == 4 || sz == 8, "bad CNEWI size %d", sz);
++  } else if (ir->op2 != REF_NIL) {  /* Create VLA/VLS/aligned cdata. */
++    ci = &lj_ir_callinfo[IRCALL_lj_cdata_newv];
++    args[0] = ASMREF_L;     /* lua_State *L */
++    args[1] = ir->op1;      /* CTypeID id   */
++    args[2] = ir->op2;      /* CTSize sz    */
++    args[3] = ASMREF_TMP1;  /* CTSize align */
++    asm_gencall(as, ci, args);
++    emit_loadi(as, ra_releasetmp(as, ASMREF_TMP1), (int32_t)ctype_align(info));
++    return;  /* lj_cdata_newv() path needs no header stores from here. */
++  }
++
++  /* Initialize gct and ctypeid. lj_mem_newgco() already sets marked. */
++  emit_dji(as, LOONGI_ST_B, RID_RET+1, RID_RET, (offsetof(GCcdata, gct))&0xfff);
++  emit_dji(as, LOONGI_ST_H, RID_TMP, RID_RET, (offsetof(GCcdata, ctypeid))&0xfff);
++  emit_dji(as, LOONGI_ADDI_D, RID_RET+1, RID_ZERO, ~LJ_TCDATA&0xfff);
++  emit_dj32i(as, RID_TMP, RID_ZERO, id);  /* RID_TMP = ctype id. */
++  args[0] = ASMREF_L;     /* lua_State *L */
++  args[1] = ASMREF_TMP1;  /* MSize size   */
++  asm_gencall(as, ci, args);
++  ra_allockreg(as, (int32_t)(sz+sizeof(GCcdata)), ra_releasetmp(as, ASMREF_TMP1));
++}
++#endif
++
++/* -- Write barriers ------------------------------------------------------ */
++
++static void asm_tbar(ASMState *as, IRIns *ir)
++{  /* Table write barrier. Code is emitted bottom-up: read it last to first. */
++  Reg tab = ra_alloc1(as, ir->op1, RSET_GPR);
++  Reg mark = ra_scratch(as, rset_exclude(RSET_GPR, tab));
++  Reg link = RID_TMP;
++  MCLabel l_end = emit_label(as);
++  emit_dji(as, LOONGI_ST_D, link, tab, ((int32_t)offsetof(GCtab, gclist))&0xfff);
++  emit_dji(as, LOONGI_ST_B, mark, tab, ((int32_t)offsetof(GCtab, marked))&0xfff);
++  emit_setgl(as, tab, gc.grayagain);	/* gc.grayagain = tab. */
++  emit_getgl(as, link, gc.grayagain);	/* link = previous gc.grayagain. */
++  emit_branch(as, LOONGI_BEQ, RID_TMP, RID_ZERO, l_end);	/* Not black: skip. */
++  emit_djk(as, LOONGI_XOR, mark, mark, RID_TMP);	/* Clear the black bit. */
++  emit_dju(as, LOONGI_ANDI, RID_TMP, mark, LJ_GC_BLACK);
++  emit_dji(as, LOONGI_LD_BU, mark, tab, ((int32_t)offsetof(GCtab, marked))&0xfff);
++}
++
++static void asm_obar(ASMState *as, IRIns *ir)
++{  /* Upvalue write barrier. Code is emitted bottom-up: read it last to first. */
++  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_gc_barrieruv];
++  IRRef args[2];
++  MCLabel l_end;
++  Reg obj, val, tmp;
++  /* No need for other object barriers (yet). */
++  lj_assertA(IR(ir->op1)->o == IR_UREFC, "bad OBAR type");	/* Closed upvalue. */
++  ra_evictset(as, RSET_SCRATCH);
++  l_end = emit_label(as);
++  args[0] = ASMREF_TMP1;  /* global_State *g */
++  args[1] = ir->op1;      /* TValue *tv      */
++  asm_gencall(as, ci, args);
++  obj = IR(ir->op1)->r;
++  tmp = ra_scratch(as, rset_exclude(RSET_GPR, obj));
++  emit_branch(as, LOONGI_BEQ, tmp, RID_ZERO, l_end);  /* Upvalue not black: skip. */
++  emit_addk(as, ra_releasetmp(as, ASMREF_TMP1), RID_JGL, -32768, RSET_GPR);
++  emit_branch(as, LOONGI_BEQ, RID_TMP, RID_ZERO, l_end);	/* Value not white: skip. */
++  emit_dju(as, LOONGI_ANDI, tmp, tmp, LJ_GC_BLACK);
++  emit_dju(as, LOONGI_ANDI, RID_TMP, RID_TMP, LJ_GC_WHITES);
++  val = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, obj));
++  emit_dji(as, LOONGI_LD_BU, tmp, obj,  /* obj points at uv->tv, so rebase. */
++	   ((int32_t)offsetof(GCupval, marked)-(int32_t)offsetof(GCupval, tv))&0xfff);
++  emit_dji(as, LOONGI_LD_BU, RID_TMP, val, ((int32_t)offsetof(GChead, marked))&0xfff);
++}
++
++/* -- Arithmetic and logic operations ------------------------------------- */
++
++static void asm_fparith(ASMState *as, IRIns *ir, LOONGIns loongi)
++{  /* Emit the three-register FP instruction loongi on dest, left, right. */
++  Reg dest = ra_dest(as, ir, RSET_FPR);
++  Reg right, left = ra_alloc2(as, ir, RSET_FPR);
++  right = (left >> 8); left &= 255;  /* Unpack: right in bits 8+, left in 0-7. */
++  emit_djk(as, loongi, dest, left, right);
++}
++
++static void asm_fpunary(ASMState *as, IRIns *ir, LOONGIns loongi)
++{  /* Emit the two-register FP instruction loongi on dest, left. */
++  Reg dest = ra_dest(as, ir, RSET_FPR);
++  Reg left = ra_hintalloc(as, ir->op1, dest, RSET_FPR);  /* Hint: reuse dest. */
++  emit_dj(as, loongi, dest, left);
++}
++
++static void asm_fpmath(ASMState *as, IRIns *ir)
++{  /* Dispatch IR_FPMATH on the sub-operation stored in op2. */
++  IRFPMathOp fpm = (IRFPMathOp)ir->op2;
++  if (fpm <= IRFPM_TRUNC)  /* Ops up to IRFPM_TRUNC use asm_callround(). */
++    asm_callround(as, ir, IRCALL_lj_vm_floor + fpm);
++  else if (fpm == IRFPM_SQRT)  /* Square root is a single instruction. */
++    asm_fpunary(as, ir, LOONGI_FSQRT_D);
++  else  /* Everything else is a call, indexed from IRCALL_lj_vm_floor. */
++    asm_callid(as, ir, IRCALL_lj_vm_floor + fpm);
++}
++
++static void asm_add(ASMState *as, IRIns *ir)
++{  /* Assemble IR_ADD for FP numbers and 32/64 bit integers. */
++  IRType1 t = ir->t;
++  if (irt_isnum(t)) {
++    if (!asm_fusemadd(as, ir, LOONGI_FMADD_D, LOONGI_FMADD_D))
++      asm_fparith(as, ir, LOONGI_FADD_D);  /* No fusion: plain FP add. */
++    return;
++  } else {
++    Reg dest = ra_dest(as, ir, RSET_GPR);
++    Reg left = ra_hintalloc(as, ir->op1, dest, RSET_GPR);
++    if (irref_isk(ir->op2)) {
++      intptr_t k = get_kval(as, ir->op2);
++      if (LOONGF_S_OK(k, 12)) {		/* Constant fits the si12 immediate. */
++        if (irt_is64(t)) {
++          emit_dji(as, LOONGI_ADDI_D, dest, left, k&0xfff);
++        } else {
++	  emit_dji(as, LOONGI_ADDI_W, dest, left, k&0xfff);
++        }
++	return;
++      }
++    }
++    Reg right = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, left));
++    emit_djk(as, irt_is64(t) ? LOONGI_ADD_D : LOONGI_ADD_W, dest,
++	     left, right);  /* Register form, width chosen by the IR type. */
++  }
++}
++
++static void asm_sub(ASMState *as, IRIns *ir)
++{  /* Assemble IR_SUB for FP numbers and 32/64 bit integers. */
++  if (irt_isnum(ir->t)) {
++    if (!asm_fusemadd(as, ir, LOONGI_FMSUB_D, LOONGI_FNMSUB_D))
++      asm_fparith(as, ir, LOONGI_FSUB_D);  /* No fusion: plain FP subtract. */
++    return;
++  } else {
++    Reg dest = ra_dest(as, ir, RSET_GPR);
++    Reg right, left = ra_alloc2(as, ir, RSET_GPR);
++    right = (left >> 8); left &= 255;  /* Unpack the register pair. */
++    emit_djk(as, irt_is64(ir->t) ? LOONGI_SUB_D : LOONGI_SUB_W, dest,
++	     left, right);
++  }
++}
++
++static void asm_mul(ASMState *as, IRIns *ir)
++{  /* Assemble IR_MUL for FP numbers and 32/64 bit integers. */
++  if (irt_isnum(ir->t)) {
++    asm_fparith(as, ir, LOONGI_FMUL_D);
++  } else
++  {
++    Reg dest = ra_dest(as, ir, RSET_GPR);
++    Reg right, left = ra_alloc2(as, ir, RSET_GPR);
++    right = (left >> 8); left &= 255;  /* Unpack the register pair. */
++    if (irt_is64(ir->t)) {
++      emit_djk(as, LOONGI_MUL_D, dest, left, right);
++    } else {
++      emit_djk(as, LOONGI_MUL_W, dest, left, right);
++    }
++  }
++}
++
++static void asm_fpdiv(ASMState *as, IRIns *ir)
++{  /* FP division is a plain three-register FP instruction. */
++    asm_fparith(as, ir, LOONGI_FDIV_D);
++}
++
++static void asm_neg(ASMState *as, IRIns *ir)
++{  /* Assemble IR_NEG: FNEG for numbers, 0 - left for integers. */
++  if (irt_isnum(ir->t)) {
++    asm_fpunary(as, ir, LOONGI_FNEG_D);
++  } else {
++    Reg dest = ra_dest(as, ir, RSET_GPR);
++    Reg left = ra_hintalloc(as, ir->op1, dest, RSET_GPR);
++    emit_djk(as, irt_is64(ir->t) ? LOONGI_SUB_D : LOONGI_SUB_W, dest,
++	     RID_ZERO, left);  /* dest = 0 - left. */
++  }
++}
++
++#define asm_abs(as, ir)		asm_fpunary(as, ir, LOONGI_FABS_D)
++
++static void asm_arithov(ASMState *as, IRIns *ir)
++{  /* Assemble 32 bit ADDOV/SUBOV with an overflow guard (emitted bottom-up). */
++  RegSet allow = RSET_GPR;
++  Reg right, left, tmp, tmp2, dest = ra_dest(as, ir, allow);
++  rset_clear(allow, dest);
++  lj_assertA(!irt_is64(ir->t), "bad usage");
++  tmp2 = ra_scratch(as, allow);
++  rset_clear(allow, tmp2);
++  if (irref_isk(ir->op2)) {
++    int k = IR(ir->op2)->i;
++    if (ir->o == IR_SUBOV) k = -k;
++    if (LOONGF_S_OK(k, 12)) {	/* (dest < left) == (k >= 0 ? 1 : 0) */
++      left = ra_alloc1(as, ir->op1, allow);
++      asm_guard(as, k >= 0 ? LOONGI_BNE : LOONGI_BEQ, tmp2, RID_ZERO);
++      emit_djk(as, LOONGI_SLT, tmp2, dest, dest == left ? tmp2 : left);
++      emit_dji(as, LOONGI_ADDI_W, dest, left, k&0xfff);  /* Must wrap at 32 bits. */
++      if (dest == left) emit_move(as, tmp2, left);  /* Keep the old left. */
++      return;
++    }
++  }
++  left = ra_alloc2(as, ir, allow);
++  right = (left >> 8); left &= 255;  /* Unpack the register pair. */
++  rset_clear(allow, right);
++  rset_clear(allow, left);
++  tmp = ra_scratch(as, allow);
++  asm_guard(as, LOONGI_BLT, tmp2, RID_ZERO);  /* Exit if the sign bit is set. */
++  emit_djk(as, LOONGI_AND, tmp2, RID_TMP, tmp);
++  if (ir->o == IR_ADDOV) {  /* ((dest^left) & (dest^right)) < 0 */
++    emit_djk(as, LOONGI_XOR, RID_TMP, dest, dest == right ? RID_TMP : right);
++  } else {  /* ((dest^left) & (dest^~right)) < 0 */
++    emit_djk(as, LOONGI_XOR, RID_TMP, RID_TMP, dest);
++    emit_djk(as, LOONGI_NOR, RID_TMP, dest == right ? RID_TMP : right, RID_ZERO);
++  }
++  emit_djk(as, LOONGI_XOR, tmp, dest, dest == left ? RID_TMP : left);
++  emit_djk(as, ir->o == IR_ADDOV ? LOONGI_ADD_W : LOONGI_SUB_W, dest, left, right);
++  if (dest == left || dest == right)  /* Save the operand dest overwrites. */
++    emit_move(as, RID_TMP, dest == left ? left : right);
++}
++
++#define asm_addov(as, ir)	asm_arithov(as, ir)
++#define asm_subov(as, ir)	asm_arithov(as, ir)
++
++static void asm_mulov(ASMState *as, IRIns *ir)
++{  /* Assemble 32 bit MULOV with an overflow guard (emitted bottom-up). */
++  Reg dest = ra_dest(as, ir, RSET_GPR);
++  Reg tmp, tmp2, right, left = ra_alloc2(as, ir, RSET_GPR);
++  right = (left >> 8); left &= 255;  /* Unpack the register pair. */
++  tmp = ra_scratch(as, rset_exclude(rset_exclude(rset_exclude(RSET_GPR, left),
++						 right), dest));
++  tmp2 = ra_scratch(as, rset_exclude(rset_exclude(rset_exclude(rset_exclude(RSET_GPR, left),
++						right), dest), tmp));
++  asm_guard(as, LOONGI_BNE, tmp2, tmp);  /* Exit if high word != sign of low. */
++  emit_dju(as, LOONGI_SRAI_W, tmp2, dest, 31);  /* tmp2 = dest >> 31. */
++  emit_djk(as, LOONGI_MUL_W, dest, left, right);	/* dest: bits [31:0] of the product. */
++  emit_djk(as, LOONGI_MULH_W, tmp, left, right);	/* tmp: bits [63:32] of the product. */
++}
++
++static void asm_bnot(ASMState *as, IRIns *ir)
++{  /* Assemble IR_BNOT as NOR, fusing a preceding BOR when allowed. */
++  Reg left, right, dest = ra_dest(as, ir, RSET_GPR);
++  IRIns *irl = IR(ir->op1);
++  if (mayfuse(as, ir->op1) && irl->o == IR_BOR) {
++    left = ra_alloc2(as, irl, RSET_GPR);  /* Use the BOR operands directly. */
++    right = (left >> 8); left &= 255;
++  } else {
++    left = ra_hintalloc(as, ir->op1, dest, RSET_GPR);
++    right = RID_ZERO;  /* NOR with zero is a plain bitwise NOT. */
++  }
++  emit_djk(as, LOONGI_NOR, dest, left, right);
++}
++
++static void asm_bswap(ASMState *as, IRIns *ir)
++{  /* Assemble IR_BSWAP as two instructions via RID_TMP (emitted bottom-up). */
++  Reg dest = ra_dest(as, ir, RSET_GPR);
++  Reg left = ra_alloc1(as, ir->op1, RSET_GPR);
++  if (irt_is64(ir->t)) {
++    emit_dj(as, LOONGI_REVH_D, dest, RID_TMP);  /* Second: dest from RID_TMP. */
++    emit_dj(as, LOONGI_REVB_4H, RID_TMP, left);  /* First: RID_TMP from left. */
++  } else {
++    emit_dju(as, LOONGI_ROTRI_W, dest, RID_TMP, 16);  /* Second: rotate by 16. */
++    emit_dj(as, LOONGI_REVB_2H, RID_TMP, left);  /* First: RID_TMP from left. */
++  }
++}
++
++static void asm_bitop(ASMState *as, IRIns *ir, LOONGIns loongi, LOONGIns loongik)
++{  /* Bitwise op: loongik is the immediate form, loongi the register form. */
++  Reg dest = ra_dest(as, ir, RSET_GPR);
++  Reg right, left = ra_hintalloc(as, ir->op1, dest, RSET_GPR);
++  if (irref_isk(ir->op2)) {
++    intptr_t k = get_kval(as, ir->op2);
++    if (checku12(k)) {  /* Constant passes checku12(): use the immediate form. */
++      emit_dji(as, loongik, dest, left, k&0xfff);
++      return;
++    }
++  }
++  right = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, left));
++  emit_djk(as, loongi, dest, left, right);
++}
++
++#define asm_band(as, ir)	asm_bitop(as, ir, LOONGI_AND, LOONGI_ANDI)
++#define asm_bor(as, ir)		asm_bitop(as, ir, LOONGI_OR, LOONGI_ORI)
++#define asm_bxor(as, ir)	asm_bitop(as, ir, LOONGI_XOR, LOONGI_XORI)
++
++static void asm_bitshift(ASMState *as, IRIns *ir, LOONGIns loongi, LOONGIns loongik)
++{  /* Shift/rotate: loongik is the immediate form, loongi the register form. */
++  Reg dest = ra_dest(as, ir, RSET_GPR);
++  Reg left = ra_alloc1(as, ir->op1, RSET_GPR);
++  uint32_t shmask = irt_is64(ir->t) ? 63 : 31;  /* Mask for constant counts. */
++  if (irref_isk(ir->op2)) {  /* Constant shifts. */
++    uint32_t shift = (uint32_t)(IR(ir->op2)->i & shmask);
++    emit_dju(as, loongik, dest, left, shift);
++  } else {
++    Reg right = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, left));
++    emit_djk(as, loongi, dest, left, right);  /* Shift amount is in right. */
++  }
++}
++
++#define asm_bshl(as, ir)	(irt_is64(ir->t) ? \
++  asm_bitshift(as, ir, LOONGI_SLL_D, LOONGI_SLLI_D) : \
++  asm_bitshift(as, ir, LOONGI_SLL_W, LOONGI_SLLI_W))
++#define asm_bshr(as, ir)	(irt_is64(ir->t) ? \
++  asm_bitshift(as, ir, LOONGI_SRL_D, LOONGI_SRLI_D) : \
++  asm_bitshift(as, ir, LOONGI_SRL_W, LOONGI_SRLI_W))
++#define asm_bsar(as, ir)	(irt_is64(ir->t) ? \
++  asm_bitshift(as, ir, LOONGI_SRA_D, LOONGI_SRAI_D) : \
++  asm_bitshift(as, ir, LOONGI_SRA_W, LOONGI_SRAI_W))
++#define asm_brol(as, ir)	lj_assertA(0, "unexpected BROL")
++#define asm_bror(as, ir)	(irt_is64(ir->t) ? \
++  asm_bitshift(as, ir, LOONGI_ROTR_D, LOONGI_ROTRI_D) : \
++  asm_bitshift(as, ir, LOONGI_ROTR_W, LOONGI_ROTRI_W))
++
++static void asm_min_max(ASMState *as, IRIns *ir, int ismax)
++{  /* Assemble IR_MIN/IR_MAX; the integer path is branchless (bottom-up). */
++  if (irt_isnum(ir->t)) {
++    Reg dest = ra_dest(as, ir, RSET_FPR);
++    Reg right, left = ra_alloc2(as, ir, RSET_FPR);
++    right = (left >> 8); left &= 255;  /* Unpack the register pair. */
++    emit_djk(as, ismax ? LOONGI_FMAX_D : LOONGI_FMIN_D, dest, left, right);
++  } else {
++    Reg dest = ra_dest(as, ir, RSET_GPR);
++    Reg left = ra_hintalloc(as, ir->op1, dest, RSET_GPR);
++    Reg right = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, left));
++    emit_djk(as, LOONGI_OR, dest, dest, RID_TMP);  /* Last: merge the halves. */
++    if (dest != right) {  /* Writing dest first must not clobber right. */
++      emit_djk(as, LOONGI_MASKEQZ, RID_TMP, right, RID_TMP);
++      emit_djk(as, LOONGI_MASKNEZ, dest, left, RID_TMP);
++    } else {  /* dest == right: consume right before RID_TMP is reused. */
++      emit_djk(as, LOONGI_MASKNEZ, RID_TMP, left, RID_TMP);
++      emit_djk(as, LOONGI_MASKEQZ, dest, right, RID_TMP);
++    }
++    emit_djk(as, LOONGI_SLT, RID_TMP,  /* First: RID_TMP = 1 selects right. */
++	     ismax ? left : right, ismax ? right : left);
++  }
++}
++
++#define asm_min(as, ir)		asm_min_max(as, ir, 0)
++#define asm_max(as, ir)		asm_min_max(as, ir, 1)
++
++/* -- Comparisons --------------------------------------------------------- */
++
++/* FP comparisons. */
++static void asm_fpcomp(ASMState *as, IRIns *ir)
++{  /* FCMP sets condition flag 0, then the guard branches on it (bottom-up). */
++  IROp op = ir->o;
++  Reg right, left = ra_alloc2(as, ir, RSET_FPR);
++  right = (left >> 8); left &= 255;  /* Unpack the register pair. */
++  asm_guard21(as, (op&1) ? LOONGI_BCNEZ : LOONGI_BCEQZ, 0);
++  switch (op) {  /* Paired ops share one compare; op&1 picks the branch. */
++    case IR_LT: case IR_UGE:
++      emit_djk(as, LOONGI_FCMP_CLT_D, 0, left, right);
++      break;
++    case IR_GE: case IR_ULT:
++      emit_djk(as, LOONGI_FCMP_CULT_D, 0, left, right);
++      break;
++    case IR_LE: case IR_UGT: case IR_ABC:
++      emit_djk(as, LOONGI_FCMP_CLE_D, 0, left, right);
++      break;
++    case IR_ULE: case IR_GT:
++      emit_djk(as, LOONGI_FCMP_CULE_D, 0, left, right);
++      break;
++    case IR_EQ: case IR_NE:
++      emit_djk(as, LOONGI_FCMP_CEQ_D, 0, left, right);
++      break;
++    default:  /* No compare is emitted for any other op. */
++      break;
++  }
++}
++
++/* Integer comparisons. */
++static void asm_intcomp(ASMState *as, IRIns *ir)
++{
++  /* ORDER IR: LT GE LE GT  ULT UGE ULE UGT. */
++  /*           00 01 10 11  100 101 110 111  */
++  IROp op = ir->o;
++  RegSet allow = RSET_GPR;
++  Reg tmp, right, left = ra_alloc1(as, ir->op1, allow);
++  rset_clear(allow, left);
++  if (op == IR_ABC) op = IR_UGT;  /* ABC is handled as an unsigned compare. */
++  if ((op&4) == 0 && irref_isk(ir->op2) && get_kval(as, ir->op2) == 0) {
++    switch (op) {  /* Signed compare with 0: branch on the inverse condition. */
++      case IR_GT: asm_guard(as, LOONGI_BGE, RID_ZERO, left); break;
++      case IR_LE: asm_guard(as, LOONGI_BLT, RID_ZERO, left); break;
++      case IR_GE: asm_guard(as, LOONGI_BLT, left, RID_ZERO); break;
++      case IR_LT: asm_guard(as, LOONGI_BGE, left, RID_ZERO); break;
++      default: break;
++    }
++    return;
++  }
++  tmp = ra_scratch(as, allow);
++  rset_clear(allow, tmp);
++  if (irref_isk(ir->op2)) {
++    intptr_t k = get_kval(as, ir->op2);
++    if ((op&2)) k++;  /* LE/GT family: x <= k is tested as x < k+1. */
++    if (checki12(k)) {
++      asm_guard(as, (op&1) ? LOONGI_BNE : LOONGI_BEQ, tmp, RID_ZERO);
++      emit_dji(as, (op&4) ? LOONGI_SLTUI : LOONGI_SLTI, tmp, left, k&0xfff);
++      return;
++    }
++  }
++  right = ra_alloc1(as, ir->op2, allow);
++  asm_guard(as, ((op^(op>>1))&1) ? LOONGI_BNE : LOONGI_BEQ, tmp, RID_ZERO);
++  emit_djk(as, (op&4) ? LOONGI_SLTU : LOONGI_SLT,  /* op&2 swaps operands. */
++           tmp, (op&2) ? right : left, (op&2) ? left : right);
++}
++
++static void asm_comp(ASMState *as, IRIns *ir)
++{  /* Ordered comparison: pick the FP or integer variant by IR type. */
++  if (irt_isnum(ir->t))
++    asm_fpcomp(as, ir);
++  else
++    asm_intcomp(as, ir);
++}
++
++static void asm_equal(ASMState *as, IRIns *ir)
++{  /* Assemble IR_EQ/IR_NE as a guard. */
++  if (irt_isnum(ir->t)) {
++    asm_fpcomp(as, ir);
++  } else {
++    Reg right, left = ra_alloc2(as, ir, RSET_GPR);
++    right = (left >> 8); left &= 255;  /* Unpack the register pair. */
++    asm_guard(as, (ir->o & 1) ? LOONGI_BEQ : LOONGI_BNE, left, right);
++  }
++}
++
++/* -- Split register ops -------------------------------------------------- */
++
++/* Hiword op of a split 64 bit op. Previous op must be the loword op. */
++static void asm_hiop(ASMState *as, IRIns *ir)
++{
++  /* HIOP is marked as a store because it needs its own DCE logic. */
++  int uselo = ra_used(ir-1), usehi = ra_used(ir);  /* Loword/hiword used? */
++  if (LJ_UNLIKELY(!(as->flags & JIT_F_OPT_DCE))) uselo = usehi = 1;
++  if (!usehi) return;  /* Skip unused hiword op for all remaining ops. */
++  switch ((ir-1)->o) {  /* Only call results are accepted as the loword op. */
++  case IR_CALLN:
++  case IR_CALLL:
++  case IR_CALLS:
++  case IR_CALLXS:
++    if (!uselo)
++      ra_allocref(as, ir->op1, RID2RSET(RID_RETLO));  /* Mark lo op as used. */
++    break;
++  default: lj_assertA(0, "bad HIOP for op %d", (ir-1)->o); break;
++  }
++}
++
++/* -- Profiling ----------------------------------------------------------- */
++
++static void asm_prof(ASMState *as, IRIns *ir)
++{  /* Exit the trace when HOOK_PROFILE is set in g->hookmask (bottom-up). */
++  UNUSED(ir);
++  Reg tmp = ra_scratch(as, RSET_GPR);
++  asm_guard(as, LOONGI_BNE, tmp, RID_ZERO);  /* Exit if the bit is set. */
++  emit_dju(as, LOONGI_ANDI, tmp, tmp, HOOK_PROFILE);
++  emit_lsglptr2(as, LOONGI_LD_BU, tmp,
++	       (int32_t)offsetof(global_State, hookmask));
++}
++
++/* -- Stack handling ------------------------------------------------------ */
++
++/* Check Lua stack size for overflow. Use exit handler as fallback. */
++static void asm_stack_check(ASMState *as, BCReg topslot,
++			    IRIns *irp, RegSet allow, ExitNo exitno)
++{
++  /* Try to get an unused temp register, otherwise spill/restore RID_RET*. */
++  Reg tmp, pbase = irp ? (ra_hasreg(irp->r) ? irp->r : RID_TMP) : RID_BASE;
++  ExitNo oldsnap = as->snapno;
++  rset_clear(allow, pbase);
++  as->snapno = exitno;  /* The guard below must exit through exitno. */
++  asm_guard(as, LOONGI_BNE, RID_R20, RID_ZERO);
++  as->snapno = oldsnap;
++  if (allow) {
++    tmp = rset_pickbot(allow);
++    ra_modified(as, tmp);
++  } else {	/* allow == RSET_EMPTY: borrow RID_RET and restore it here. */
++    tmp = RID_RET;
++    emit_dji(as, LOONGI_LD_D, tmp, RID_SP, 0);	/* Restore tmp1 register. */
++  }
++  lj_assertA(checki12(8*topslot), "slot offset %d does not fit in si12", 8*topslot);
++  emit_dji(as, LOONGI_SLTUI, RID_R20, RID_R20, (int32_t)(8*topslot)&0xfff);
++  emit_djk(as, LOONGI_SUB_D, RID_R20, tmp, pbase);  /* maxstack - base. */
++  emit_dji(as, LOONGI_LD_D, tmp, tmp, offsetof(lua_State, maxstack));
++  if (pbase == RID_TMP)  /* Parent BASE has no register: reload it. */
++    emit_getgl(as, RID_TMP, jit_base);
++  emit_getgl(as, tmp, cur_L);
++  if (allow == RSET_EMPTY)  /* Spill temp register. */
++    emit_dji(as, LOONGI_ST_D, tmp, RID_SP, 0);
++}
++
++/* Restore Lua stack from on-trace state. */
++static void asm_stack_restore(ASMState *as, SnapShot *snap)
++{
++  SnapEntry *map = &as->T->snapmap[snap->mapofs];
++#ifdef LUA_USE_ASSERT
++  SnapEntry *flinks = &as->T->snapmap[snap_nextofs(as->T, snap)-1-LJ_FR2];
++#endif
++  MSize n, nent = snap->nent;
++  /* Store the value of all modified slots to the Lua stack. */
++  for (n = 0; n < nent; n++) {
++    SnapEntry sn = map[n];
++    BCReg s = snap_slot(sn);
++    int32_t ofs = 8*((int32_t)s-1-LJ_FR2);  /* Byte offset from RID_BASE. */
++    IRRef ref = snap_ref(sn);
++    IRIns *ir = IR(ref);
++    if ((sn & SNAP_NORESTORE))
++      continue;
++    if (irt_isnum(ir->t)) {
++      Reg src = ra_alloc1(as, ref, RSET_FPR);
++      emit_dji(as, LOONGI_FST_D, src, RID_BASE, ofs&0xfff);
++    } else {
++      if ((sn & SNAP_KEYINDEX)) {
++        RegSet allow = rset_exclude(RSET_GPR, RID_BASE);
++	int64_t kki = (int64_t)LJ_KEYINDEX << 32;  /* Tag goes in the high word. */
++	if (irref_isk(ref)) {
++	  emit_djk(as, LOONGI_STX_D,
++	           ra_allock(as, kki | (int64_t)(uint32_t)ir->i, allow),
++		   RID_BASE, RID_R20);
++	  emit_d16i(as, RID_R20, ofs);  /* RID_R20 = ofs, the store index. */
++	} else {
++	  Reg src = ra_alloc1(as, ref, allow);
++	  Reg rki = ra_allock(as, kki, rset_exclude(allow, src));
++	  emit_djk(as, LOONGI_STX_D, RID_TMP, RID_BASE, RID_R20);
++	  emit_d16i(as, RID_R20, ofs);  /* RID_R20 = ofs, the store index. */
++	  emit_djk(as, LOONGI_ADD_D, RID_TMP, src, rki);  /* Add tag to index. */
++	}
++      } else {
++        asm_tvstore64(as, RID_BASE, ofs, ref);
++      }
++    }
++    checkmclim(as);
++  }
++  lj_assertA(map + nent == flinks, "inconsistent frames in snapshot");
++}
++
++/* -- GC handling --------------------------------------------------------- */
++
++/* Marker to prevent patching the GC check exit. */
++#define LOONG_NOPATCH_GC_CHECK	LOONGI_OR
++
++/* Check GC threshold and do one or more GC steps. */
++static void asm_gc_check(ASMState *as)
++{  /* Code is emitted bottom-up: the threshold test at the end runs first. */
++  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_gc_step_jit];
++  IRRef args[2];
++  MCLabel l_end;
++  Reg tmp1, tmp2;
++  ra_evictset(as, RSET_SCRATCH);
++  l_end = emit_label(as);
++  /* Exit trace if in GCSatomic or GCSfinalize. Avoids syncing GC objects. */
++  asm_guard(as, LOONGI_BNE, RID_RET, RID_ZERO);	/* Assumes asm_snap_prep() already done. */
++  *--as->mcp = LOONG_NOPATCH_GC_CHECK;  /* Marker checked by lj_asm_patchexit(). */
++  args[0] = ASMREF_TMP1;  /* global_State *g */
++  args[1] = ASMREF_TMP2;  /* MSize steps     */
++  asm_gencall(as, ci, args);
++  tmp1 = ra_releasetmp(as, ASMREF_TMP1);
++  tmp2 = ra_releasetmp(as, ASMREF_TMP2);
++  ra_allockreg(as, (int64_t)(J2G(as->J)), tmp1);
++  emit_loadi(as, tmp2, as->gcsteps);
++  /* Jump around GC step if GC total < GC threshold. */
++  emit_branch(as, LOONGI_BLTU, RID_TMP, tmp2, l_end);
++  emit_getgl(as, tmp2, gc.threshold);
++  emit_getgl(as, RID_TMP, gc.total);
++  as->gcsteps = 0;  /* The accumulated steps are consumed by this check. */
++  checkmclim(as);
++}
++
++/* -- Loop handling ------------------------------------------------------- */
++
++/* Fixup the loop branch. */
++static void asm_loop_fixup(ASMState *as)
++{
++  MCode *p = as->mctop;
++  MCode *target = as->mcp;
++  if (as->loopinv) {  /* Inverted loop branch? */
++    /* asm_guard* already inverted the branch at p[-2] and patched the final b. */
++    uint32_t mask = (p[-2] & 0xfc000000) == 0x48000000 ? 0x1fffffu : 0xffffu;
++    ptrdiff_t delta = target - (p - 2);
++    if (mask == 0x1fffffu) {	/* BCEQZ BCNEZ: 21 bit offset, split in two. */
++      p[-2] = p[-2] | LOONGF_I((uint32_t)delta & 0xffffu) | (((uint32_t)delta & 0x1f0000u) >> 16);
++    } else {	/* BEQ BNE BLT BGE BLTU BGEU: 16 bit offset. */
++      p[-2] |= LOONGF_I(delta & 0xffffu);
++    }
++    if (p[-1] == 0)  /* Replace an all-zero trailing slot with a NOP. */
++      p[-1] = LOONGI_NOP;
++  } else {
++    /* b: 26 bit offset, low 16 bits via LOONGF_I, high 10 bits in ins[9:0]. */
++    ptrdiff_t delta = target - (p - 1);
++    p[-1] = LOONGI_B | LOONGF_I(delta & 0xffffu) | ((delta & 0x3ff0000) >> 16);
++  }
++}
++
++/* Fixup the tail of the loop. */
++static void asm_loop_tail_fixup(ASMState *as)
++{  /* Intentionally empty in this backend. */
++  UNUSED(as);	/* Nothing to do. */
++}
++
++/* -- Head of trace ------------------------------------------------------- */
++
++/* Coalesce BASE register for a root trace. */
++static void asm_head_root_base(ASMState *as)
++{
++  IRIns *ir = IR(REF_BASE);
++  Reg r = ir->r;
++  if (ra_hasreg(r)) {  /* Nothing to do if BASE got no register. */
++    ra_free(as, r);
++    if (rset_test(as->modset, r) || irt_ismarked(ir->t))
++      ir->r = RID_INIT;  /* No inheritance for modified BASE register. */
++    if (r != RID_BASE)  /* BASE was allocated elsewhere: copy it over. */
++      emit_move(as, r, RID_BASE);
++  }
++}
++
++/* Coalesce BASE register for a side trace. */
++static RegSet asm_head_side_base(ASMState *as, IRIns *irp, RegSet allow)
++{  /* Returns allow with any coalesced parent BASE register removed. */
++  IRIns *ir = IR(REF_BASE);
++  Reg r = ir->r;
++  if (ra_hasreg(r)) {
++    ra_free(as, r);
++    if (rset_test(as->modset, r) || irt_ismarked(ir->t))
++      ir->r = RID_INIT;  /* No inheritance for modified BASE register. */
++    if (irp->r == r) {
++      rset_clear(allow, r);  /* Mark same BASE register as coalesced. */
++    } else if (ra_hasreg(irp->r) && rset_test(as->freeset, irp->r)) {
++      rset_clear(allow, irp->r);
++      emit_move(as, r, irp->r);  /* Move from coalesced parent reg. */
++    } else {
++      emit_getgl(as, r, jit_base);  /* Otherwise reload BASE. */
++    }
++  }
++  return allow;
++}
++
++/* -- Tail of trace ------------------------------------------------------- */
++
++/* Fixup the tail code. */
++static void asm_tail_fixup(ASMState *as, TraceNo lnk)
++{  /* Fill the last two slots: stack adjustment (or NOP), then the final b. */
++  MCode *target = lnk ? traceref(as->J,lnk)->mcode : (MCode *)lj_vm_exit_interp;
++  int32_t spadj = as->T->spadjust;
++  MCode *p = as->mctop - 1;
++  if (spadj == 0) {
++    p[-1] = LOONGI_NOP;
++  } else {  /* sp += spadj; spadj is not masked -- presumably fits si12. */
++    p[-1] = LOONGI_ADDI_D|LOONGF_D(RID_SP)|LOONGF_J(RID_SP)|LOONGF_I(spadj);
++  }
++
++  MCode *tmp = p;  /* Offset is relative to the branch itself. */
++  *p = LOONGI_B | LOONGF_I((uintptr_t)(target-tmp)&0xffffu) | (((uintptr_t)(target-tmp)&0x3ff0000u) >> 16);
++}
++
++/* Prepare tail of code. */
++static void asm_tail_prep(ASMState *as)
++{  /* Reserve the slots that asm_tail_fixup() or asm_loop_fixup() fill in. */
++  MCode *p = as->mctop - 1;  /* Leave room for exit branch. */
++  if (as->loopref) {
++    as->invmcp = as->mcp = p;
++  } else {
++    as->mcp = p-1;  /* Leave room for stack pointer adjustment. */
++    as->invmcp = NULL;
++  }
++  *p = LOONGI_NOP;  /* Prevent load/store merging. */
++}
++
++/* -- Trace setup --------------------------------------------------------- */
++
++/* Ensure there are enough stack slots for call arguments. */
++static Reg asm_setup_call_slots(ASMState *as, IRIns *ir, const CCallInfo *ci)
++{  /* Counts arguments that do not fit in registers; two slots for each. */
++  IRRef args[CCI_NARGS_MAX*2];
++  uint32_t i, nargs = CCI_XNARGS(ci);
++  int nslots = 0, ngpr = REGARG_NUMGPR, nfpr = REGARG_NUMFPR;
++  asm_collectargs(as, ir, ci, args);
++  for (i = 0; i < nargs; i++) {
++    if (args[i] && irt_isfp(IR(args[i])->t)) {
++      if (nfpr > 0)  /* FP args take an FPR first, then fall back to a GPR. */
++        nfpr--;
++      else if (ngpr > 0)
++	ngpr--;
++      else
++	nslots += 2;
++    } else {
++      if (ngpr > 0)  /* Non-FP args (and empty refs) take a GPR. */
++	ngpr--;
++      else
++	nslots += 2;
++    }
++  }
++  if (nslots > as->evenspill)  /* Leave room for args in stack slots. */
++    as->evenspill = nslots;
++  return REGSP_HINT(RID_RET);
++}
++
++static void asm_sparejump_setup(ASMState *as)
++{  /* Only acts when mctop is still at the very end of the mcode area. */
++  MCode *mxp = as->mctop;
++  if ((char *)mxp == (char *)as->J->mcarea + as->J->szmcarea) {
++    mxp -= 4*1;  /* Lower mctop by four MCode words. */
++    as->mctop = mxp;
++  }
++}
++
++static void asm_setup_target(ASMState *as)
++{  /* Target-specific setup: spare jump area first, then the exit stubs. */
++  asm_sparejump_setup(as);
++  asm_exitstub_setup(as);
++}
++
++/* -- Trace patching ------------------------------------------------------ */
++
++/* Patch exit jumps of existing machine code to a new target. */
++void lj_asm_patchexit(jit_State *J, GCtrace *T, ExitNo exitno, MCode *target)
++{  /* Scan the mcode of T and retarget branches to exit stub exitno. */
++  MCode *p = T->mcode;
++  MCode *pe = (MCode *)((char *)p + T->szmcode);
++  MCode *px = exitstub_trace_addr(T, exitno);
++  MCode *cstart = NULL;  /* First patched instruction, for the cache sync. */
++  MCode *mcarea = lj_mcode_patch(J, p, 0);
++
++  MCode exitload = LOONGI_ADDI_D | LOONGF_D(RID_TMP) | LOONGF_J(RID_ZERO) | LOONGF_I(exitno&0xfff);
++
++  for (; p < pe; p++) {
++    if (*p == exitload) {
++    /* Look for exitstub branch, replace with branch to target. */
++    ptrdiff_t delta = target - p - 1;
++    MCode ins = p[1];
++      if (((ins ^ ((px-p-1)<<10)) & 0x3fffc00) == 0 &&
++          ((ins & 0xfc000000u) == LOONGI_BEQ ||
++           (ins & 0xfc000000u) == LOONGI_BNE ||
++           (ins & 0xfc000000u) == LOONGI_BLT ||
++           (ins & 0xfc000000u) == LOONGI_BGE ||
++	   (ins & 0xfc000000u) == LOONGI_BLTU)) {
++        /* Patch beq/bne/blt/bge, if within range. */
++        if (p[-1] == LOONG_NOPATCH_GC_CHECK) {
++	  /* nothing */
++        } else if (LOONGF_S_OK(delta, 16)) {
++          p[1] = (ins & 0xfc0003ffu) | LOONGF_I(delta & 0xffff);
++          *p = LOONGI_NOP;
++          if (!cstart) cstart = p + 1;
++        }
++      } else if (((ins ^ ((((px-p-1)&0xffff)<<10) + (((px-p-1)>>16)&0x1f))) & 0x3fffc1f) == 0 &&
++                 ((ins & 0xfc000000u) == LOONGI_BCEQZ ||
++                  (ins & 0xfc000100u) == LOONGI_BCNEZ)) {
++        /* Patch bceqz/bcnez, if within range. offs[20:16] is in ins[4:0]. */
++        if (p[-1] == LOONG_NOPATCH_GC_CHECK) {
++	  /* nothing */
++        } else if (LOONGF_S_OK(delta, 21)) {
++          p[1] = (ins & 0xfc0003e0u) | LOONGF_I(delta & 0xffff) | ((delta & 0x1f0000) >> 16);
++          *p = LOONGI_NOP;
++          if (!cstart) cstart = p + 1;
++        }
++      } else if (((ins ^ ((((px-p-1)&0xffff)<<10) + (((px-p-1)>>16)&0x3ff))) & 0x3ffffff) == 0 &&
++          ((ins & 0xfc000000u) == LOONGI_B)) {
++        /* Patch b. offs[25:16] is in ins[9:0]. */
++        lj_assertJ(LOONGF_S_OK(delta, 26), "branch target out of range");
++        p[1] = (ins & 0xfc000000u) | LOONGF_I(delta & 0xffff) | ((delta & 0x3ff0000) >> 16);
++        *p = LOONGI_NOP;
++        if (!cstart) cstart = p + 1;
++      } else if (p+2 == pe){  /* Exit load sits two slots before the end. */
++         if (p[2] == LOONGI_NOP) {
++            ptrdiff_t delta = target - &p[2];
++            lj_assertJ(LOONGF_S_OK(delta, 26), "branch target out of range");
++            p[2] = LOONGI_B | LOONGF_I(delta & 0xffff) | ((delta & 0x3ff0000) >> 16);
++            *p = LOONGI_NOP;
++            if (!cstart) cstart = p + 2;
++         }
++       }
++    }
++  }
++  if (cstart) lj_mcode_sync(cstart, px+1);  /* Only sync if anything changed. */
++  lj_mcode_patch(J, mcarea, 1);
++}
+diff --git a/src/vm_loongarch64.dasc b/src/vm_loongarch64.dasc
+index 9f98c382..c852a5c6 100644
+--- a/src/vm_loongarch64.dasc
++++ b/src/vm_loongarch64.dasc
+@@ -1986,6 +1986,123 @@ static void build_subroutines(BuildCtx *ctx)
+   |  b ->cont_nop
+ #endif
+   |
++  |//-----------------------------------------------------------------------
++  |//-- Trace exit handler -------------------------------------------------
++  |//-----------------------------------------------------------------------
++  |
++  |.macro savex_, a, b
++  |  fst.d f..a, a*8(sp)
++  |  fst.d f..b, b*8(sp)
++  |  st.d r..a, 32*8+a*8(sp)
++  |  st.d r..b, 32*8+b*8(sp)
++  |.endmacro
++  |
++  |->vm_exit_handler:
++  |.if JIT
++  |  addi.d sp, sp, -(32*8+32*8)		// Room for 32 FPRs followed by 32 GPRs.
++  |  savex_ 0, 2
++  |  savex_ 4, 5
++  |  savex_ 6, 7
++  |  savex_ 8, 9
++  |  savex_ 10, 11
++  |  savex_ 12, 13
++  |  savex_ 14, 15
++  |  savex_ 16, 17
++  |  savex_ 18, 19
++  |  savex_ 20, 21
++  |  savex_ 22, 23
++  |  savex_ 24, 25
++  |  savex_ 26, 27
++  |  savex_ 28, 29
++  |  savex_ 30, 31
++  |  fst.d f1, 1*8(sp)			// f1/f3 are not covered by the pairs above.
++  |  fst.d f3, 3*8(sp)
++  |  st.d r0, 32*8+1*8(sp)		// Clear RID_TMP.
++  |  addi.d TMP2, sp, 32*8+32*8		// Recompute original value of sp.
++  |  st.d TMP2, 32*8+3*8(sp)		// Store sp in RID_SP
++  |  li_vmstate EXIT
++  |  .ADD16I DISPATCH, JGL, -GG_DISP2G-32768
++  |  ld.w TMP1, 0(TMP2)			// Load exit number.
++  |  st_vmstate
++  |  .LDXD L, DISPATCH, DISPATCH_GL(cur_L)
++  |  .LDXD BASE, DISPATCH, DISPATCH_GL(jit_base)
++  |  .STXD L, DISPATCH, DISPATCH_J(L)
++  |  st.w ra, DISPATCH_J(parent)(DISPATCH)	// Store trace number.
++  |  st.d BASE, L->base
++  |  st.w TMP1, DISPATCH_J(exitno)(DISPATCH)	// Store exit number.
++  |  .ADD16I CARG1, DISPATCH, GG_DISP2J
++  |  .STXD r0, DISPATCH, DISPATCH_GL(jit_base)
++  |  or CARG2, sp, r0			// CARG2 = sp, the saved register area.
++  |  bl extern lj_trace_exit		// (jit_State *J, ExitState *ex)
++  |  // Returns MULTRES (unscaled) or negated error code.
++  |  ld.d TMP1, L->cframe
++  |  addi.d TMP2, r0, -4			// Mask clearing the low two bits.
++  |  ld.d BASE, L->base
++  |  and sp, TMP1, TMP2			// sp = L->cframe without its low two bits.
++  |  ld.d PC, SAVE_PC(sp)		// Get SAVE_PC.
++  |  st.d L, SAVE_L(sp)			// Set SAVE_L (on-trace resume/yield).
++  |  b >1				// Continue at label 1 in vm_exit_interp.
++  |.endif
++  |
++  |->vm_exit_interp:
++  |.if JIT
++  |  // CRET1 = MULTRES or negated error code, BASE, PC and JGL set.
++  |  ld.d L, SAVE_L(sp)
++  |  .ADD16I DISPATCH, JGL, -GG_DISP2G-32768
++  |  st.d BASE, L->base
++  |1:
++  |  ld.d LFUNC:RB, FRAME_FUNC(BASE)
++  |  blt CRET1, r0, >9			// Check for error from exit.
++  |  addu16i.d TMP3, r0, 0x59c0		// TOBIT = 2^52 + 2^51 (float).
++  |  slli.d MULTRES, CRET1, 3		// MULTRES = CRET1 * 8.
++  |  cleartp LFUNC:RB
++  |  st.w MULTRES, TMPD(sp)
++  |  addi.d TISNIL, r0, LJ_TNIL
++  |  addi.d TISNUM, r0, LJ_TISNUM		// Setup type comparison constants.
++  |  movgr2fr.w TOBIT, TMP3
++  |  ld.d TMP1, LFUNC:RB->pc
++  |  .STXD r0, DISPATCH, DISPATCH_GL(jit_base)
++  |  ld.d KBASE, PC2PROTO(k)(TMP1)
++  |  fcvt.d.s TOBIT, TOBIT		// Widen the TOBIT constant to double.
++  |  // Modified copy of ins_next which handles function header dispatch, too.
++  |  ld.w INS, 0(PC)
++  |  addi.d PC, PC, 4
++  |  // Assumes TISNIL == ~LJ_VMST_INTERP == -1
++  |  .STXW TISNIL, DISPATCH, DISPATCH_GL(vmstate)
++  |  decode_OP TMP1, INS
++  |  decode_BC8b TMP1
++  |  sltui TMP2, TMP1, BC_FUNCF*8	// TMP2 = 1 if opcode is below BC_FUNCF.
++  |  add.d TMP0, DISPATCH, TMP1
++  |  decode_RD RD, INS
++  |  ld.d TMP3, 0(TMP0)			// TMP3 = dispatch table entry.
++  |  decode_RA RA, INS
++  |  beqz TMP2, >2			// Function header opcode: handle at 2.
++  |  jirl r0, TMP3, 0
++  |2:
++  |  sltui TMP2, TMP1, (BC_FUNCC+2)*8	// Fast function?
++  |  ld.d TMP1, FRAME_PC(BASE)
++  |  bnez TMP2, >3
++  |  // Check frame below fast function.
++  |  andi TMP0, TMP1, FRAME_TYPE
++  |  bnez TMP0, >3			// Trace stitching continuation?
++  |  // Otherwise set KBASE for Lua function below fast function.
++  |  ld.w TMP2, -4(TMP1)
++  |  decode_RA TMP0, TMP2
++  |  sub.d TMP1, BASE, TMP0
++  |  ld.d LFUNC:TMP2, -32(TMP1)
++  |  cleartp LFUNC:TMP2
++  |  ld.d TMP1, LFUNC:TMP2->pc
++  |  ld.d KBASE, PC2PROTO(k)(TMP1)
++  |3:
++  |  addi.d RC, MULTRES, -8
++  |  add.d RA, RA, BASE
++  |  jirl r0, TMP3, 0
++  |
++  |9:  // Rethrow error from the right C frame.
++  |  sub.w CARG2, r0, CRET1		// CARG2 = -CRET1. TODO LA: sub.w no trap
++  |  or CARG1, L, r0
++  |  bl extern lj_err_trace		// (lua_State *L, int errcode)
++  |.endif
+   |
+   |//-----------------------------------------------------------------------
+   |//-- Math helper functions ----------------------------------------------
+-- 
+2.20.1
+
diff --git a/loongarch64/0011-LoongArch64-Add-JIT-support-in-the-interpreter.patch b/loongarch64/0011-LoongArch64-Add-JIT-support-in-the-interpreter.patch
new file mode 100644
index 0000000..fe736c3
--- /dev/null
+++ b/loongarch64/0011-LoongArch64-Add-JIT-support-in-the-interpreter.patch
@@ -0,0 +1,384 @@
+From 39188c3bcb441abf3aabe840188b2e0048abbf76 Mon Sep 17 00:00:00 2001
+From: Xiaolin Zhao <zhaoxiaolin@loongson.cn>
+Date: Wed, 27 Jul 2022 17:55:45 +0800
+Subject: [PATCH 11/20] LoongArch64: Add JIT support in the interpreter
+
+---
+ src/vm_loongarch64.dasc | 259 ++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 259 insertions(+)
+
+diff --git a/src/vm_loongarch64.dasc b/src/vm_loongarch64.dasc
+index c852a5c6..ab591cc2 100644
+--- a/src/vm_loongarch64.dasc
++++ b/src/vm_loongarch64.dasc
+@@ -310,6 +310,23 @@
+ |
+ #define PC2PROTO(field)  ((int)offsetof(GCproto, field)-(int)sizeof(GCproto))
+ |
++|.macro hotcheck, delta, target
++|  srli.d TMP1, PC, 1			// TMP1 = (PC >> 1) & 126: an even byte
++|  andi TMP1, TMP1, 126		// offset selected by PC bits 2..7.
++|  add.d TMP1, TMP1, DISPATCH
++|  ld.hu TMP2, GG_DISP2HOT(TMP1)	// Load the 16 bit counter for this PC.
++|  addi.w TMP2, TMP2, -delta		// Count down by delta.
++|  st.h TMP2, GG_DISP2HOT(TMP1)	// Write the low 16 bits back.
++|  blt TMP2, r0, target		// Counter was < delta: branch to target.
++|.endmacro
++|
++|.macro hotloop
++|  hotcheck HOTCOUNT_LOOP, ->vm_hotloop	// Loop variant of hotcheck.
++|.endmacro
++|
++|.macro hotcall
++|  hotcheck HOTCOUNT_CALL, ->vm_hotcall	// Call variant of hotcheck.
++|.endmacro
+ |
+ |// Set current VM state. Uses TMP0.
+ |.macro li_vmstate, st; addi.w TMP0, r0, ~LJ_VMST_..st; .endmacro
+@@ -969,8 +986,15 @@ static void build_subroutines(BuildCtx *ctx)
+   |  or MULTRES, INS, r0
+   |  or CARG1, L, r0
+   |  bl extern lj_meta_for	// (lua_State *L, TValue *base)
++  |.if JIT
++  |  decode_OP TMP0, MULTRES
++  |  addi.d TMP1, r0, BC_JFORI
++  |.endif
+   |  decode_RA RA, MULTRES
+   |  decode_RD RD, MULTRES
++  |.if JIT
++  |  beq TMP0, TMP1, =>BC_JFORI
++  |.endif
+   |  b =>BC_FORI
+   |
+   |//-----------------------------------------------------------------------
+@@ -1923,6 +1947,20 @@ static void build_subroutines(BuildCtx *ctx)
+   |//-----------------------------------------------------------------------
+   |
+   |->vm_record:				// Dispatch target for recording phase.
++  |.if JIT
++  |  .LDXBU TMP3, DISPATCH, DISPATCH_GL(hookmask)
++  |  andi TMP1, TMP3, HOOK_VMEVENT	// No recording while in vmevent.
++  |  bnez TMP1, >5			// Local label 5 is defined after this hunk.
++  |  // Decrement the hookcount for consistency, but always do the call.
++  |  .LDXW TMP2, DISPATCH, DISPATCH_GL(hookcount)
++  |  andi TMP1, TMP3, HOOK_ACTIVE
++  |  bnez TMP1, >1			// HOOK_ACTIVE set: leave hookcount untouched.
++  |  addi.w TMP2, TMP2, -1
++  |  andi TMP1, TMP3, LUA_MASKLINE|LUA_MASKCOUNT
++  |  beqz TMP1, >1			// No line/count mask: skip the store.
++  |  .STXW TMP2, DISPATCH, DISPATCH_GL(hookcount)
++  |  b >1				// Local label 1 is defined after this hunk.
++  |.endif
+   |
+   |->vm_rethook:			// Dispatch target for return hooks.
+   |  .LDXBU TMP3, DISPATCH, DISPATCH_GL(hookmask)
+@@ -1968,10 +2006,101 @@ static void build_subroutines(BuildCtx *ctx)
+   |  ld.w MULTRES, -24(RB)		// Restore MULTRES for *M ins.
+   |  b <4
+   |
++  |->vm_hotloop:			// Hot loop counter underflow.
++  |.if JIT
++  |  ld.d LFUNC:TMP1, FRAME_FUNC(BASE)
++  |  .ADD16I CARG1, DISPATCH, GG_DISP2J	// CARG1 = DISPATCH + GG_DISP2J (the J argument).
++  |  cleartp LFUNC:TMP1
++  |  st.d PC, SAVE_PC(sp)
++  |  ld.d TMP1, LFUNC:TMP1->pc
++  |  or CARG2, PC, r0			// CARG2 = PC (the pc argument).
++  |  .STXD L, DISPATCH, DISPATCH_J(L)
++  |  ld.bu TMP1, PC2PROTO(framesize)(TMP1)
++  |  st.d BASE, L->base
++  |  slli.d TMP1, TMP1, 3		// framesize*8 bytes.
++  |  add.d TMP1, BASE, TMP1
++  |  st.d TMP1, L->top			// L->top = BASE + framesize*8.
++  |  bl extern lj_trace_hot		// (jit_State *J, const BCIns *pc)
++  |  b <3				// Back to local label 3 defined before this hunk.
++  |.endif
++  |
+   |
+   |->vm_callhook:			// Dispatch target for call hooks.
+   |  or CARG2, PC, r0
++  |.if JIT
++  |  b >1
++  |.endif
++  |
++  |->vm_hotcall:			// Hot call counter underflow.
++  |.if JIT
++  |  ori CARG2, PC, 1
++  |1:
++  |.endif
++  |  add.d TMP0, BASE, RC
++  |  st.d PC, SAVE_PC(sp)
++  |  st.d BASE, L->base
++  |  sub.d RA, RA, BASE
++  |  st.d TMP0, L->top
++  |  or CARG1, L, r0
++  |  bl extern lj_dispatch_call		// (lua_State *L, const BCIns *pc)
++  |  // Returns ASMFunction.
++  |  ld.d BASE, L->base
++  |  ld.d TMP0, L->top
++  |  st.d r0, SAVE_PC(sp)		// Invalidate for subsequent line hook.
++  |  add.d RA, BASE, RA
++  |  sub.d NARGS8:RC, TMP0, BASE
++  |  ld.d LFUNC:RB, FRAME_FUNC(BASE)
++  |  cleartp LFUNC:RB
++  |  ld.w INS, -4(PC)
++  |  jirl r0, CRET1, 0
++  |
++  |->cont_stitch:			// Trace stitching.
++  |.if JIT
++  |  // RA = resultptr, RB = meta base
++  |  ld.w INS, -4(PC)
++  |  ld.d TRACE:TMP2, -40(RB)		// Save previous trace.
++  |  decode_RA RC, INS
++  |  addi.d TMP1, MULTRES, -8
++  |  cleartp TRACE:TMP2
++  |  add.d RC, BASE, RC			// Call base.
++  |  beqz TMP1, >2
++  |1:  // Move results down.
++  |  ld.d CARG1, 0(RA)
++  |  addi.d TMP1, TMP1, -8
++  |  addi.d RA, RA, 8
++  |  st.d CARG1, 0(RC)
++  |  addi.d RC, RC, 8
++  |  bnez TMP1, <1
++  |2:
++  |  decode_RA RA, INS
++  |  decode_RB RB, INS
++  |  add.d RA, RA, RB
++  |  add.d RA, BASE, RA
++  |3:
++  |  sltu TMP1, RC, RA
++  |  bnez TMP1, >9			// More results wanted?
++  |
++  |  ld.hu TMP3, TRACE:TMP2->traceno
++  |  ld.hu RD, TRACE:TMP2->link
++  |  beq RD, TMP3, ->cont_nop		// Blacklisted.
++  |  slli.w RD, RD, 3
++  |  bnez RD, =>BC_JLOOP		// Jump to stitched trace.
++  |
++  |  // Stitch a new trace to the previous trace.
++  |  st.w TMP3, DISPATCH_J(exitno)(DISPATCH)
++  |  .STXD L, DISPATCH, DISPATCH_J(L)
++  |  st.d BASE, L->base
++  |  .ADD16I CARG1, DISPATCH, GG_DISP2J
++  |  or CARG2, PC, r0
++  |  bl extern lj_dispatch_stitch	// (jit_State *J, const BCIns *pc)
++  |  ld.d BASE, L->base
++  |  b ->cont_nop
+   |
++  |9:
++  |  st.d TISNIL, 0(RC)
++  |  addi.d RC, RC, 8
++  |  b <3
++  |.endif
+   |
+   |->vm_profhook:			// Dispatch target for profiler hook.
+ #if LJ_HASPROFILE
+@@ -2126,6 +2255,18 @@ static void build_subroutines(BuildCtx *ctx)
+   |.else
+   |  addu16i.d TMP0, r0, 0x3ff0	// Hiword of +1 (double).
+   |.endif
++  |.if "func" == "trunc"
++  |  slli.d TMP0, TMP0, 32
++  |  movgr2fr.d FARG5, TMP0
++  |  fcmp.clt.d FCC0, FTMP4, FRET1	// |x| < result?
++  |  fsub.d FTMP4, FTMP3, FARG5
++  |  fsel FTMP1, FTMP3, FTMP4, FCC0
++  |  movgr2fr.d FTMP3, TMP1
++  |  fneg.d FTMP4, FTMP1
++  |  movfr2cf FCC0, FTMP3
++  |  fsel FTMP3, FTMP1, FTMP4, FCC0
++  |  jirl r0, ra, 0
++  |.else
+   |  fneg.d FTMP4, FTMP3
+   |  slli.d TMP0, TMP0, 32
+   |  movgr2fr.d FARG5, TMP0
+@@ -2141,6 +2282,7 @@ static void build_subroutines(BuildCtx *ctx)
+   |  fsel FTMP3, FTMP1, FTMP4, FCC0
+   |  fmov.d FARG1, FTMP3
+   |  jirl r0, ra, 0
++  |.endif
+   |1:
+   |  fmov.d FTMP3, FARG1
+   |  jirl r0, ra, 0
+@@ -2151,9 +2293,79 @@ static void build_subroutines(BuildCtx *ctx)
+   |  vm_round_hf floor
+   |->vm_ceil:
+   |  vm_round_hf ceil
++  |->vm_trunc:
++  |.if JIT
++  |  vm_round_hf trunc
++  |.endif
+   |
+   |
+   |//-----------------------------------------------------------------------
++  |//-- Miscellaneous functions --------------------------------------------
++  |//-----------------------------------------------------------------------
++  |
++  |.define NEXT_TAB,            TAB:CARG1
++  |.define NEXT_IDX,            CARG2
++  |.define NEXT_ASIZE,          CARG3
++  |.define NEXT_NIL,            CARG4
++  |.define NEXT_TMP0,           TMP0
++  |.define NEXT_TMP1,           TMP1
++  |.define NEXT_TMP2,           TMP2
++  |.define NEXT_RES_VK,         CRET1
++  |.define NEXT_RES_IDX,        CRET2
++  |.define NEXT_RES_PTR,        sp
++  |.define NEXT_RES_VAL,        0(sp)
++  |.define NEXT_RES_KEY,        8(sp)
++  |
++  |// TValue *lj_vm_next(GCtab *t, uint32_t idx)
++  |// Next idx returned in CRET2.
++  |->vm_next:
++  |.if JIT
++  |  ld.w NEXT_ASIZE, NEXT_TAB->asize
++  |  ld.d NEXT_TMP0, NEXT_TAB->array
++  |  addi.d NEXT_NIL, r0, LJ_TNIL	// Constant compared against loaded slots below.
++  |1:  // Traverse array part.
++  |  sltu TMP3, NEXT_IDX, NEXT_ASIZE
++  |  slli.w NEXT_TMP1, NEXT_IDX, 3	// idx*8: byte offset of the array slot.
++  |  add.d NEXT_TMP1, NEXT_TMP0, NEXT_TMP1
++  |  beqz TMP3, >5			// idx >= asize: continue with the hash part.
++  |  addi.d TMP3, r0, LJ_TISNUM
++  |  ld.d NEXT_TMP2, 0(NEXT_TMP1)	// Load array slot value.
++  |  slli.d TMP3, TMP3, 47
++  |  or NEXT_TMP1, NEXT_IDX, TMP3	// Key = (LJ_TISNUM << 47) | idx.
++  |  addi.w NEXT_IDX, NEXT_IDX, 1
++  |  beq NEXT_TMP2, NEXT_NIL, <1	// Slot equals NEXT_NIL: try the next index.
++  |  st.d NEXT_TMP2, NEXT_RES_VAL	// Value goes to 0(sp).
++  |  st.d NEXT_TMP1, NEXT_RES_KEY	// Key goes to 8(sp).
++  |  or NEXT_RES_VK, NEXT_RES_PTR, r0	// Return sp as the result pointer.
++  |  or NEXT_RES_IDX, NEXT_IDX, r0
++  |  jirl r0, ra, 0
++  |
++  |5:  // Traverse hash part.
++  |  sub.w NEXT_RES_IDX, NEXT_IDX, NEXT_ASIZE	// Hash index = idx - asize.
++  |  ld.w NEXT_TMP0, NEXT_TAB->hmask
++  |  ld.d NODE:NEXT_RES_VK, NEXT_TAB->node
++  |  slli.w NEXT_TMP2, NEXT_RES_IDX, 5
++  |  slli.w TMP3, NEXT_RES_IDX, 3
++  |  sub.w TMP3, NEXT_TMP2, TMP3	// hidx*32 - hidx*8 = hidx*24 bytes.
++  |  add.d NODE:NEXT_RES_VK, NODE:NEXT_RES_VK, TMP3
++  |6:
++  |  sltu TMP3, NEXT_TMP0, NEXT_RES_IDX
++  |  bnez TMP3, >8			// Hash index > hmask: end of iteration.
++  |  ld.d NEXT_TMP2, NODE:NEXT_RES_VK->val
++  |  addi.w NEXT_RES_IDX, NEXT_RES_IDX, 1
++  |  bne NEXT_TMP2, NEXT_NIL, >9	// Value differs from NEXT_NIL: return this node.
++  |  // Skip holes in hash part.
++  |  addi.d NODE:NEXT_RES_VK, NODE:NEXT_RES_VK, sizeof(Node)
++  |  b <6
++  |
++  |8:  // End of iteration. Set the key to nil (not the value).
++  |  st.d NEXT_NIL, NEXT_RES_KEY
++  |  or NEXT_RES_VK, NEXT_RES_PTR, r0
++  |9:
++  |  add.w NEXT_RES_IDX, NEXT_RES_IDX, NEXT_ASIZE
++  |  jirl r0, ra, 0
++  |.endif
++  |
+ }
+ 
+ /* Generate the code for a single instruction. */
+@@ -3455,6 +3667,9 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
+ 
+   case BC_ITERN:
+     |  // RA = base*8, (RB = (nresults+1)*8, RC = (nargs+1)*8 (2+1)*8)
++    |.if JIT
++    |  hotloop
++    |.endif
+     |->vm_IITERN:
+     |  add.d RA, BASE, RA
+     |  ld.d TAB:RB, -16(RA)
+@@ -3542,8 +3757,27 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
+     |  addi.d TMP1, r0, BC_ITERC
+     |  st.b TMP3, -4+OFS_OP(PC)
+     |  add.d PC, TMP0, TMP2
++    |.if JIT
++    |  ld.b TMP0, OFS_OP(PC)
++    |  addi.d TMP3, r0, BC_ITERN
++    |  ld.hu TMP2, OFS_RD(PC)
++    |  bne TMP0, TMP3, >6
++    |.endif
+     |  st.b TMP1, OFS_OP(PC)
+     |  b <1
++    |.if JIT
++    |6:  // Unpatch JLOOP.
++    |  .LDXD TMP0, DISPATCH, DISPATCH_J(trace)
++    |  slli.w TMP2, TMP2, 3
++    |  add.d TMP0, TMP0, TMP2
++    |  ld.d TRACE:TMP2, 0(TMP0)
++    |  ld.w TMP0, TRACE:TMP2->startins
++    |  addi.d TMP3, r0, -256
++    |  and TMP0, TMP0, TMP3
++    |  or TMP0, TMP0, TMP1
++    |  st.w TMP0, 0(PC)
++    |  b <1
++    |.endif
+     break;
+ 
+   case BC_VARG:
+@@ -3713,6 +3947,9 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
+   /* -- Loops and branches ------------------------------------------------ */
+ 
+   case BC_FORL:
++    |.if JIT
++    |  hotloop
++    |.endif
+     |  // Fall through. Assumes BC_IFORL follows.
+     break;
+ 
+@@ -3832,6 +4069,9 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
+     break;
+ 
+   case BC_ITERL:
++    |.if JIT
++    |  hotloop
++    |.endif
+     |  // Fall through. Assumes BC_IITERL follows.
+     break;
+ 
+@@ -3859,6 +4099,9 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
+     |  // RA = base*8, RD = target (loop extent)
+     |  // Note: RA/RD is only used by trace recorder to determine scope/extent
+     |  // This opcode does NOT jump, it's only purpose is to detect a hot loop.
++    |.if JIT
++    |  hotloop
++    |.endif
+     |  // Fall through. Assumes BC_ILOOP follows.
+     break;
+ 
+@@ -3868,6 +4111,19 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
+     break;
+ 
+   case BC_JLOOP:
++    |.if JIT
++    |  // RA = base*8 (ignored), RD = traceno*8
++    |  .LDXD TMP0, DISPATCH, DISPATCH_J(trace)
++    |  add.d TMP0, TMP0, RD
++    |  // Traces on LOONGARCH don't store the trace number, so use 0.
++    |  .STXW r0, DISPATCH, DISPATCH_GL(vmstate)	// 32 bit store, as for the other vmstate writes.
++    |  ld.d TRACE:TMP1, 0(TMP0)
++    |  .STXD BASE, DISPATCH, DISPATCH_GL(jit_base)	// Store the current BASE as jit_base.
++    |  ld.d TMP1, TRACE:TMP1->mcode
++    |  .ADD16I JGL, DISPATCH, GG_DISP2G+32768
++    |  .STXD L, DISPATCH, DISPATCH_GL(tmpbuf.L)
++    |  jirl r0, TMP1, 0			// Jump to the trace's machine code.
++    |.endif
+     break;
+ 
+   case BC_JMP:
+@@ -3879,6 +4135,9 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
+   /* -- Function headers -------------------------------------------------- */
+ 
+   case BC_FUNCF:
++    |.if JIT
++    |  hotcall
++    |.endif
+   case BC_FUNCV:  /* NYI: compiled vararg functions. */
+     |  // Fall through. Assumes BC_IFUNCF/BC_IFUNCV follow.
+     break;
+-- 
+2.20.1
+
diff --git a/loongarch64/0012-LoongArch64-Add-CPU-feature-detection-when-init-JIT-.patch b/loongarch64/0012-LoongArch64-Add-CPU-feature-detection-when-init-JIT-.patch
new file mode 100644
index 0000000..4c4b879
--- /dev/null
+++ b/loongarch64/0012-LoongArch64-Add-CPU-feature-detection-when-init-JIT-.patch
@@ -0,0 +1,28 @@
+From 056d478b029d88525fa1b6dfb91eec46b4b4b31a Mon Sep 17 00:00:00 2001
+From: Xiaolin Zhao <zhaoxiaolin@loongson.cn>
+Date: Wed, 27 Jul 2022 18:02:20 +0800
+Subject: [PATCH 12/20] LoongArch64: Add CPU feature detection when init JIT
+ compiler
+
+---
+ src/lib_jit.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+diff --git a/src/lib_jit.c b/src/lib_jit.c
+index 2867d420..9c0511b7 100644
+--- a/src/lib_jit.c
++++ b/src/lib_jit.c
+@@ -719,6 +719,10 @@ static uint32_t jit_cpudetect(void)
+   }
+ #endif
+ 
++#elif LJ_TARGET_LOONGARCH64
++
++  /* No optional CPU features to detect (for now). */
++
+ #else
+ #error "Missing CPU detection for this architecture"
+ #endif
+-- 
+2.20.1
+
diff --git a/loongarch64/0013-LoongArch64-Add-LoongArch-lp64-calling-conventions-a.patch b/loongarch64/0013-LoongArch64-Add-LoongArch-lp64-calling-conventions-a.patch
new file mode 100644
index 0000000..7b5854f
--- /dev/null
+++ b/loongarch64/0013-LoongArch64-Add-LoongArch-lp64-calling-conventions-a.patch
@@ -0,0 +1,322 @@
+From 43ae1b7415f10e56b0293396c62a0bceb83c7e44 Mon Sep 17 00:00:00 2001
+From: Xiaolin Zhao <zhaoxiaolin@loongson.cn>
+Date: Wed, 27 Jul 2022 18:20:58 +0800
+Subject: [PATCH 13/20] LoongArch64: Add LoongArch lp64 calling conventions and
+ FFI C call handling
+
+---
+ src/lj_ccall.c          | 152 +++++++++++++++++++++++++++++++++++++++-
+ src/lj_ccall.h          |  17 ++++-
+ src/vm_loongarch64.dasc |  64 +++++++++++++++++
+ 3 files changed, 229 insertions(+), 4 deletions(-)
+
+diff --git a/src/lj_ccall.c b/src/lj_ccall.c
+index 25f54dee..ff15005d 100644
+--- a/src/lj_ccall.c
++++ b/src/lj_ccall.c
+@@ -574,6 +574,95 @@
+     goto done; \
+   }
+ 
++#elif LJ_TARGET_LOONGARCH64
++/* -- LoongArch lp64 calling conventions ---------------------------------------- */
++
++#define CCALL_HANDLE_STRUCTRET \
++  /* Return structs of size > 16 by reference. */ \
++  cc->retref = !(sz <= 16); \
++  if (cc->retref) cc->gpr[ngpr++] = (GPRArg)dp;
++
++#define CCALL_HANDLE_STRUCTRET2 \
++  unsigned int cl = ccall_classify_struct(cts, ctr); \
++  if ((cl & 4) && (cl >> 8) <= 2) { \
++    CTSize i = (cl >> 8) - 1; \
++    do { ((float *)dp)[i] = cc->fpr[i].f; } while (i--); \
++  } else { \
++    if (cl > 1) { \
++      sp = (uint8_t *)&cc->fpr[0]; \
++      if ((cl >> 8) > 2) \
++        sp = (uint8_t *)&cc->gpr[0]; \
++    } \
++      memcpy(dp, sp, ctr->size); \
++  } \
++
++#define CCALL_HANDLE_COMPLEXRET \
++  /* Complex values are returned in 1 or 2 FPRs. */ \
++  cc->retref = 0;
++
++#define CCALL_HANDLE_COMPLEXRET2 \
++  if (ctr->size == 2*sizeof(float)) {  /* Copy complex float from FPRs. */ \
++    ((float *)dp)[0] = cc->fpr[0].f; \
++    ((float *)dp)[1] = cc->fpr[1].f; \
++  } else {  /* Copy complex double from FPRs. */ \
++    ((double *)dp)[0] = cc->fpr[0].d; \
++    ((double *)dp)[1] = cc->fpr[1].d; \
++  }
++
++#define CCALL_HANDLE_COMPLEXARG \
++  /* Pass 32 byte complex values by reference; 16 byte complex double is not. */ \
++  if (sz == 4*sizeof(double)) { \
++    rp = cdataptr(lj_cdata_new(cts, did, sz)); \
++    sz = CTSIZE_PTR; \
++  } else if (sz == 2*sizeof(float)) { /* Complex float. */ \
++    isfp = 2; \
++    sz = 2*CTSIZE_PTR; \
++  } else { /* Any other size, e.g. complex double. */ \
++    isfp = 1; \
++    sz = 2*CTSIZE_PTR; \
++  }
++
++#define CCALL_HANDLE_RET \
++  if (ctype_isfp(ctr->info) && ctr->size == sizeof(float)) \
++    sp = (uint8_t *)&cc->fpr[0].f;
++
++#define CCALL_HANDLE_STRUCTARG \
++  /* Pass structs of size >16 by reference. */ \
++  unsigned int cl = ccall_classify_struct(cts, d); \
++  nff = cl >> 8; \
++  if (sz > 16) { \
++    rp = cdataptr(lj_cdata_new(cts, did, sz)); \
++    sz = CTSIZE_PTR; \
++  } \
++  /* Pass struct in FPRs. */ \
++  if (cl > 1) { \
++    isfp = (cl & 4) ? 2 : 1; \
++  }
++
++
++#define CCALL_HANDLE_REGARG \
++  if (isfp && (!isva)) {  /* Try to pass argument in FPRs. */ \
++    int n2 = ctype_isvector(d->info) ? 1 : \
++	     isfp == 1 ? n : 2; \
++    if (nfpr + n2 <= CCALL_NARG_FPR && nff <= 2) { \
++      dp = &cc->fpr[nfpr]; \
++      nfpr += n2; \
++      goto done; \
++    } else { \
++      if (ngpr + n2 <= maxgpr) { \
++	dp = &cc->gpr[ngpr]; \
++	ngpr += n2; \
++	goto done; \
++      } \
++    } \
++  } else {  /* Try to pass argument in GPRs. */ \
++      if (ngpr + n <= maxgpr) { \
++        dp = &cc->gpr[ngpr]; \
++        ngpr += n; \
++        goto done; \
++    } \
++  }
++
+ #else
+ #error "Missing calling convention definitions for this architecture"
+ #endif
+@@ -889,6 +978,53 @@ static void ccall_copy_struct(CCallState *cc, CType *ctr, void *dp, void *sp,
+ 
+ #endif
+ 
++/* -- LoongArch64 ABI struct classification ---------------------------- */
++
++#if LJ_TARGET_LOONGARCH64
++
++static unsigned int ccall_classify_struct(CTState *cts, CType *ct)
++{  /* Returns (size of each FP member) + (number of FP members << 8), else 0/1. */
++  CTSize sz = ct->size;
++  unsigned int r = 0, n = 0, isu = (ct->info & CTF_UNION);
++  while (ct->sib) {  /* Walk the sibling chain of members. */
++    CType *sct;
++    ct = ctype_get(cts, ct->sib);
++    if (ctype_isfield(ct->info)) {
++      sct = ctype_rawchild(cts, ct);
++      if (ctype_isfp(sct->info)) {
++	r |= sct->size;  /* OR of sizes: stays 4 or 8 only if all sizes agree. */
++	if (!isu) n++; else if (n == 0) n = 1;  /* Union: keep max, not sum. */
++      } else if (ctype_iscomplex(sct->info)) {
++	r |= (sct->size >> 1);  /* Half the complex size, counted as two members. */
++	if (!isu) n += 2; else if (n < 2) n = 2;
++      } else if (ctype_isstruct(sct->info)) {
++	goto substruct;
++      } else {
++	goto noth;  /* Any other field type ends the FP classification. */
++      }
++    } else if (ctype_isbitfield(ct->info)) {
++      goto noth;
++    } else if (ctype_isxattrib(ct->info, CTA_SUBTYPE)) {
++      sct = ctype_rawchild(cts, ct);
++    substruct:
++      if (sct->size > 0) {
++	unsigned int s = ccall_classify_struct(cts, sct);
++	if (s <= 1) goto noth;  /* Nested aggregate was not classified as FP. */
++	r |= (s & 255);  /* Merge nested member size (low byte). */
++	if (!isu) n += (s >> 8); else if (n < (s >>8)) n = (s >> 8);
++      }
++    }
++  }
++  if ((r == 4 || r == 8) && n <= 4)  /* Uniform 4 or 8 byte FP members, at most 4. */
++    return r + (n << 8);
++noth:  /* Not a homogeneous float/double aggregate. */
++  return (sz <= 16);  /* Return structs of size <= 16 in GPRs. */
++}
++
++
++#endif
++
++
+ /* -- Common C call handling ---------------------------------------------- */
+ 
+ /* Infer the destination CTypeID for a vararg argument. */
+@@ -934,7 +1070,9 @@ static int ccall_set_args(lua_State *L, CTState *cts, CType *ct,
+   MSize fprodd = 0;
+ #endif
+ #endif
+-
++#if LJ_TARGET_LOONGARCH64
++  int nff = 0;
++#endif
+   /* Clear unused regs to get some determinism in case of misdeclaration. */
+   memset(cc->gpr, 0, sizeof(cc->gpr));
+ #if CCALL_NUM_FPR
+@@ -1060,7 +1198,7 @@ static int ccall_set_args(lua_State *L, CTState *cts, CType *ct,
+     if (isfp && d->size == sizeof(float))
+       ((float *)dp)[1] = ((float *)dp)[0];  /* Floats occupy high slot. */
+ #endif
+-#if LJ_TARGET_MIPS64 || (LJ_TARGET_ARM64 && LJ_BE)
++#if LJ_TARGET_MIPS64 || (LJ_TARGET_ARM64 && LJ_BE) || LJ_TARGET_LOONGARCH64
+     if ((ctype_isinteger_or_bool(d->info) || ctype_isenum(d->info)
+ #if LJ_TARGET_MIPS64
+ 	 || (isfp && nsp == 0)
+@@ -1090,13 +1228,21 @@ static int ccall_set_args(lua_State *L, CTState *cts, CType *ct,
+       CTSize i = (sz >> 2) - 1;
+       do { ((uint64_t *)dp)[i] = ((uint32_t *)dp)[i]; } while (i--);
+     }
++#elif LJ_TARGET_LOONGARCH64
++    if (isfp == 2 && nff <= 2) {
++      /* Split complex float into separate registers. */
++      CTSize i = (sz >> 2) - 1;
++      do {
++        ((uint64_t *)dp)[i] = ((uint32_t *)dp)[i];
++      } while (i--);
++    }
+ #else
+     UNUSED(isfp);
+ #endif
+   }
+   if (fid) lj_err_caller(L, LJ_ERR_FFI_NUMARG);  /* Too few arguments. */
+ 
+-#if LJ_TARGET_X64 || (LJ_TARGET_PPC && !LJ_ABI_SOFTFP)
++#if LJ_TARGET_X64 || (LJ_TARGET_PPC && !LJ_ABI_SOFTFP) || LJ_TARGET_LOONGARCH64
+   cc->nfpr = nfpr;  /* Required for vararg functions. */
+ #endif
+   cc->nsp = nsp;
+diff --git a/src/lj_ccall.h b/src/lj_ccall.h
+index 0b3c5244..4236184b 100644
+--- a/src/lj_ccall.h
++++ b/src/lj_ccall.h
+@@ -126,6 +126,21 @@ typedef union FPRArg {
+   struct { LJ_ENDIAN_LOHI(float f; , float g;) };
+ } FPRArg;
+ 
++#elif LJ_TARGET_LOONGARCH64
++
++#define CCALL_NARG_GPR          8
++#define CCALL_NARG_FPR          8
++#define CCALL_NRET_GPR          2
++#define CCALL_NRET_FPR          2
++#define CCALL_SPS_EXTRA         3
++#define CCALL_SPS_FREE          1
++
++typedef intptr_t GPRArg;
++typedef union FPRArg {
++  double d;
++  struct { LJ_ENDIAN_LOHI(float f; , float g;) };
++} FPRArg;
++
+ #else
+ #error "Missing calling convention definitions for this architecture"
+ #endif
+@@ -168,7 +183,7 @@ typedef LJ_ALIGN(CCALL_ALIGN_CALLSTATE) struct CCallState {
+   uint8_t resx87;		/* Result on x87 stack: 1:float, 2:double. */
+ #elif LJ_TARGET_ARM64
+   void *retp;			/* Aggregate return pointer in x8. */
+-#elif LJ_TARGET_PPC
++#elif LJ_TARGET_PPC || LJ_TARGET_LOONGARCH64
+   uint8_t nfpr;			/* Number of arguments in FPRs. */
+ #endif
+ #if LJ_32
+diff --git a/src/vm_loongarch64.dasc b/src/vm_loongarch64.dasc
+index ab591cc2..8c6cde99 100644
+--- a/src/vm_loongarch64.dasc
++++ b/src/vm_loongarch64.dasc
+@@ -2366,6 +2366,70 @@ static void build_subroutines(BuildCtx *ctx)
+   |  jirl r0, ra, 0
+   |.endif
+   |
++  |//-----------------------------------------------------------------------
++  |//-- FFI helper functions -----------------------------------------------
++  |//-----------------------------------------------------------------------
++  |
++  |->vm_ffi_call:			// Call C function via FFI.
++  |  // Caveat: needs special frame unwinding, see below.
++  |.if FFI
++  |  .type CCSTATE, CCallState, CARG1
++  |  ld.w TMP1, CCSTATE->spadj
++  |  ld.bu CARG2, CCSTATE->nsp
++  |  ld.bu CARG3, CCSTATE->nfpr
++  |  or TMP2, sp, r0			// TMP2 = sp on entry.
++  |  sub.d sp, sp, TMP1			// Lower sp by spadj.
++  |  st.d ra, -8(TMP2)			// Save ra below the entry sp.
++  |  slli.w CARG2, CARG2, 3		// nsp*8: byte count of stack slots.
++  |  st.d r23, -16(TMP2)		// Save r23 below the entry sp.
++  |  st.d CCSTATE, -24(TMP2)		// Save CCSTATE for use after the call.
++  |  or r23, TMP2, r0			// r23 = entry sp, base for the reloads below.
++  |  addi.d TMP1, CCSTATE, offsetof(CCallState, stack)
++  |  or TMP2, sp, r0
++  |  add.d TMP3, TMP1, CARG2		// TMP3 = end of the source stack slots.
++  |  beqz CARG2, >2			// No stack slots to copy.
++  |1:
++  |  ld.d TMP0, 0(TMP1)			// Copy one 8 byte slot from CCSTATE->stack to sp.
++  |  addi.d TMP1, TMP1, 8
++  |  sltu TMP4, TMP1, TMP3
++  |  st.d TMP0, 0(TMP2)
++  |  addi.d TMP2, TMP2, 8
++  |  bnez TMP4, <1
++  |2:
++  |  beqz CARG3, >3			// nfpr == 0: skip loading the FP argument registers.
++  |  fld.d FARG1, CCSTATE->fpr[0]
++  |  fld.d FARG2, CCSTATE->fpr[1]
++  |  fld.d FARG3, CCSTATE->fpr[2]
++  |  fld.d FARG4, CCSTATE->fpr[3]
++  |  fld.d FARG5, CCSTATE->fpr[4]
++  |  fld.d FARG6, CCSTATE->fpr[5]
++  |  fld.d FARG7, CCSTATE->fpr[6]
++  |  fld.d FARG8, CCSTATE->fpr[7]
++  |3:
++  |  ld.d TMP3, CCSTATE->func
++  |  ld.d CARG2, CCSTATE->gpr[1]
++  |  ld.d CARG3, CCSTATE->gpr[2]
++  |  ld.d CARG4, CCSTATE->gpr[3]
++  |  ld.d CARG5, CCSTATE->gpr[4]
++  |  ld.d CARG6, CCSTATE->gpr[5]
++  |  ld.d CARG7, CCSTATE->gpr[6]
++  |  ld.d CARG8, CCSTATE->gpr[7]
++  |  ld.d CARG1, CCSTATE->gpr[0]         // Do this last, since CCSTATE is CARG1.
++  |  jirl r1, TMP3, 0			// Call CCSTATE->func, return address in r1.
++  |  ld.d CCSTATE:TMP1, -24(r23)	// Reload the saved CCSTATE.
++  |  ld.d TMP2, -16(r23)		// Reload the saved r23 value.
++  |  ld.d ra, -8(r23)			// Reload the saved ra.
++  |  st.d CRET1, CCSTATE:TMP1->gpr[0]	// Store the GPR results.
++  |  st.d CRET2, CCSTATE:TMP1->gpr[1]
++  |  fst.d FRET1, CCSTATE:TMP1->fpr[0]	// Store the FPR results.
++  |  fst.d FRET2, CCSTATE:TMP1->fpr[1]
++  |  or sp, r23, r0			// Restore the entry sp.
++  |  or r23, TMP2, r0			// Restore r23.
++  |  jirl r0, ra, 0
++  |.endif
++  |// Note: vm_ffi_call must be the last function in this object file!
++  |
++  |//-----------------------------------------------------------------------
+ }
+ 
+ /* Generate the code for a single instruction. */
+-- 
+2.20.1
+
diff --git a/loongarch64/0014-LoongArch64-Add-FFI-C-callback-handling.patch b/loongarch64/0014-LoongArch64-Add-FFI-C-callback-handling.patch
new file mode 100644
index 0000000..19aafad
--- /dev/null
+++ b/loongarch64/0014-LoongArch64-Add-FFI-C-callback-handling.patch
@@ -0,0 +1,177 @@
+From 22b62adb4af7fcea2b76067e900bbcabd98d43cb Mon Sep 17 00:00:00 2001
+From: Xiaolin Zhao <zhaoxiaolin@loongson.cn>
+Date: Wed, 27 Jul 2022 18:25:34 +0800
+Subject: [PATCH 14/20] LoongArch64: Add FFI C callback handling
+
+---
+ src/lj_ccallback.c      | 58 ++++++++++++++++++++++++++++++++++++-
+ src/vm_loongarch64.dasc | 63 +++++++++++++++++++++++++++++++++++++++++
+ 2 files changed, 120 insertions(+), 1 deletion(-)
+
+diff --git a/src/lj_ccallback.c b/src/lj_ccallback.c
+index 43e44305..f7a5830f 100644
+--- a/src/lj_ccallback.c
++++ b/src/lj_ccallback.c
+@@ -71,6 +71,10 @@ static MSize CALLBACK_OFS2SLOT(MSize ofs)
+ 
+ #define CALLBACK_MCODE_HEAD		52
+ 
++#elif LJ_TARGET_LOONGARCH64
++
++#define CALLBACK_MCODE_HEAD		52
++
+ #else
+ 
+ /* Missing support for this architecture. */
+@@ -238,6 +242,33 @@ static void *callback_mcode_init(global_State *g, uint32_t *page)
+   }
+   return p;
+ }
++#elif LJ_TARGET_LOONGARCH64
++static void *callback_mcode_init(global_State *g, uint32_t *page)
++{  /* Emits the callback head and one stub per slot; returns the end pointer. */
++  uint32_t *p = page;
++  uintptr_t target = (uintptr_t)(void *)lj_vm_ffi_callback;
++  uintptr_t ug = (uintptr_t)(void *)g;
++  MSize slot;  /* R18 receives target, R17 receives g; built in four pieces each. */
++  *p++ = LOONGI_LU12I_W | LOONGF_D(RID_R18) | LOONGF_I20((target >> 12) & 0xfffff);  /* Bits 12..31. */
++  *p++ = LOONGI_LU12I_W | LOONGF_D(RID_R17) | LOONGF_I20((ug >> 12) & 0xfffff);
++  *p++ = LOONGI_ORI  | LOONGF_D(RID_R18) | LOONGF_J(RID_R18) | LOONGF_I(target & 0xfff);  /* Bits 0..11. */
++  *p++ = LOONGI_ORI  | LOONGF_D(RID_R17) | LOONGF_J(RID_R17) | LOONGF_I(ug & 0xfff);
++  *p++ = LOONGI_LU32I_D | LOONGF_D(RID_R18) | LOONGF_I20((target >> 32) & 0xfffff);  /* Bits 32..51. */
++  *p++ = LOONGI_LU32I_D | LOONGF_D(RID_R17) | LOONGF_I20((ug >> 32) & 0xfffff);
++  *p++ = LOONGI_LU52I_D | LOONGF_D(RID_R18) | LOONGF_J(RID_R18) | LOONGF_I((target >> 52) & 0xfff);  /* Bits 52..63. */
++  *p++ = LOONGI_LU52I_D | LOONGF_D(RID_R17) | LOONGF_J(RID_R17) | LOONGF_I((ug >> 52) & 0xfff);
++  *p++ = LOONGI_NOP;  /* Four NOPs pad the head to 13 instructions (52 bytes); */
++  *p++ = LOONGI_NOP;  /* presumably to match CALLBACK_MCODE_HEAD -- confirm. */
++  *p++ = LOONGI_NOP;
++  *p++ = LOONGI_NOP;
++  *p++ = LOONGI_JIRL | LOONGF_D(RID_R0) | LOONGF_J(RID_R18) | LOONGF_I(0);  /* Jump to target via R18. */
++  for (slot = 0; slot < CALLBACK_MAX_SLOT; slot++) {  /* Two instructions per slot. */
++    *p++ = LOONGI_ORI  | LOONGF_D(RID_R19) | LOONGF_J(RID_R0) | LOONGF_I(slot & 0xfff);  /* R19 = slot. */
++    *p = LOONGI_B | LOONGF_I((page-p) & 0xffff) | (((page-p) >> 16) & 0x3ff);  /* Branch back to page start; offset in 32 bit words. */
++    p++;
++  }
++  return p;
++}
+ #else
+ /* Missing support for this architecture. */
+ #define callback_mcode_init(g, p)	(p)
+@@ -512,6 +543,31 @@ void lj_ccallback_mcode_free(CTState *cts)
+   }
+ #endif
+ 
++#define CALLBACK_HANDLE_RET \
++  if (ctype_isfp(ctr->info) && ctr->size == sizeof(float)) \
++    ((float *)dp)[1] = *(float *)dp;
++
++#elif LJ_TARGET_LOONGARCH64
++
++#define CALLBACK_HANDLE_REGARG \
++  if (isfp) { \
++    if (nfpr + n <= CCALL_NARG_FPR) { \
++      sp = &cts->cb.fpr[nfpr]; \
++      nfpr += n; \
++      goto done; \
++    } else if (ngpr + n <= maxgpr) { \
++      sp = &cts->cb.gpr[ngpr]; \
++      ngpr += n; \
++      goto done; \
++    } \
++  } else { \
++    if (ngpr + n <= maxgpr) { \
++      sp = &cts->cb.gpr[ngpr]; \
++      ngpr += n; \
++      goto done; \
++    } \
++  }
++
+ #define CALLBACK_HANDLE_RET \
+   if (ctype_isfp(ctr->info) && ctr->size == sizeof(float)) \
+     ((float *)dp)[1] = *(float *)dp;
+@@ -662,7 +718,7 @@ static void callback_conv_result(CTState *cts, lua_State *L, TValue *o)
+ 	*(int32_t *)dp = ctr->size == 1 ? (int32_t)*(int8_t *)dp :
+ 					  (int32_t)*(int16_t *)dp;
+     }
+-#if LJ_TARGET_MIPS64 || (LJ_TARGET_ARM64 && LJ_BE)
++#if LJ_TARGET_MIPS64 || (LJ_TARGET_ARM64 && LJ_BE) || LJ_TARGET_LOONGARCH64
+     /* Always sign-extend results to 64 bits. Even a soft-fp 'float'. */
+     if (ctr->size <= 4 &&
+ 	(LJ_ABI_SOFTFP || ctype_isinteger_or_bool(ctr->info)))
+diff --git a/src/vm_loongarch64.dasc b/src/vm_loongarch64.dasc
+index 8c6cde99..e816de9b 100644
+--- a/src/vm_loongarch64.dasc
++++ b/src/vm_loongarch64.dasc
+@@ -2370,6 +2370,69 @@ static void build_subroutines(BuildCtx *ctx)
+   |//-- FFI helper functions -----------------------------------------------
+   |//-----------------------------------------------------------------------
+   |
++  |// Handler for callback functions. Callback slot number in r19, g in r17.
++  |->vm_ffi_callback:
++  |.if FFI
++  |.type CTSTATE, CTState, PC
++  |  saveregs
++  |  ld.d CTSTATE, GL:r17->ctype_state
++  |  .ADD16I DISPATCH, r17, GG_G2DISP
++  |  st.w r19, CTSTATE->cb.slot
++  |  st.d CARG1, CTSTATE->cb.gpr[0]
++  |  fst.d FARG1, CTSTATE->cb.fpr[0]
++  |  st.d CARG2, CTSTATE->cb.gpr[1]
++  |  fst.d FARG2, CTSTATE->cb.fpr[1]
++  |  st.d CARG3, CTSTATE->cb.gpr[2]
++  |  fst.d FARG3, CTSTATE->cb.fpr[2]
++  |  st.d CARG4, CTSTATE->cb.gpr[3]
++  |  fst.d FARG4, CTSTATE->cb.fpr[3]
++  |  st.d CARG5, CTSTATE->cb.gpr[4]
++  |  fst.d FARG5, CTSTATE->cb.fpr[4]
++  |  st.d CARG6, CTSTATE->cb.gpr[5]
++  |  fst.d FARG6, CTSTATE->cb.fpr[5]
++  |  st.d CARG7, CTSTATE->cb.gpr[6]
++  |  fst.d FARG7, CTSTATE->cb.fpr[6]
++  |  st.d CARG8, CTSTATE->cb.gpr[7]
++  |  fst.d FARG8, CTSTATE->cb.fpr[7]
++  |  addi.d TMP0, sp, CFRAME_SPACE
++  |  st.d TMP0, CTSTATE->cb.stack
++  |  st.d r0, SAVE_PC(sp)		// Any value outside of bytecode is ok.
++  |  or CARG1, CTSTATE, r0
++  |  or CARG2, sp, r0
++  |  bl extern lj_ccallback_enter	// (CTState *cts, void *cf)
++  |  // Returns lua_State *.
++  |  ld.d BASE, L:CRET1->base
++  |  ld.d RC, L:CRET1->top
++  |  or L, CRET1, r0
++  |  addu16i.d TMP3, r0, 0x59c0		// TOBIT = 2^52 + 2^51 (float).
++  |  ld.d LFUNC:RB, FRAME_FUNC(BASE)
++  |  movgr2fr.w TOBIT, TMP3
++  |  addi.d TISNIL, r0, LJ_TNIL
++  |  addi.d TISNUM, r0, LJ_TISNUM
++  |  li_vmstate INTERP
++  |  sub.w RC, RC, BASE
++  |  cleartp LFUNC:RB
++  |  st_vmstate
++  |  fcvt.d.s TOBIT, TOBIT
++  |  ins_callt
++  |.endif
++  |
++  |->cont_ffi_callback:			// Return from FFI callback.
++  |.if FFI
++  |  .LDXD CTSTATE, DISPATCH, DISPATCH_GL(ctype_state)
++  |  st.d BASE, L->base
++  |  st.d RB, L->top
++  |  st.d L, CTSTATE->L
++  |  or CARG1, CTSTATE, r0
++  |  or CARG2, RA, r0
++  |  bl extern lj_ccallback_leave	// (CTState *cts, TValue *o)
++  |  fld.d FRET1, CTSTATE->cb.fpr[0]
++  |  ld.d CRET1, CTSTATE->cb.gpr[0]
++  |  fld.d FRET2, CTSTATE->cb.fpr[1]
++  |  ld.d CRET2, CTSTATE->cb.gpr[1]
++  |  b ->vm_leave_unw
++  |.endif
++  |
+   |->vm_ffi_call:			// Call C function via FFI.
+   |  // Caveat: needs special frame unwinding, see below.
+   |.if FFI
+-- 
+2.20.1
+
diff --git a/loongarch64/0015-LoongArch64-Add-FFI-support-in-the-interpreter.patch b/loongarch64/0015-LoongArch64-Add-FFI-support-in-the-interpreter.patch
new file mode 100644
index 0000000..2689da4
--- /dev/null
+++ b/loongarch64/0015-LoongArch64-Add-FFI-support-in-the-interpreter.patch
@@ -0,0 +1,163 @@
+From 711b33dd70f8250cd6b336e7e50a29f8ccd70619 Mon Sep 17 00:00:00 2001
+From: Xiaolin Zhao <zhaoxiaolin@loongson.cn>
+Date: Wed, 27 Jul 2022 18:28:46 +0800
+Subject: [PATCH 15/20] LoongArch64: Add FFI support in the interpreter
+
+---
+ src/vm_loongarch64.dasc | 71 +++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 71 insertions(+)
+
+diff --git a/src/vm_loongarch64.dasc b/src/vm_loongarch64.dasc
+index e816de9b..e528e7d5 100644
+--- a/src/vm_loongarch64.dasc
++++ b/src/vm_loongarch64.dasc
+@@ -687,15 +687,29 @@ static void build_subroutines(BuildCtx *ctx)
+   |  or RB, BASE, r0
+   |  or BASE, TMP2, r0			// Restore caller BASE.
+   |  ld.d LFUNC:TMP1, FRAME_FUNC(TMP2)
++  |.if FFI
++  |  sltui TMP3, TMP0, 2
++  |.endif
+   |  ld.d PC, -24(RB)			// Restore PC from [cont|PC].
+   |  cleartp LFUNC:TMP1
+   |  add.d TMP2, RA, RD
+   |  ld.d TMP1, LFUNC:TMP1->pc
+   |  st.d TISNIL, -8(TMP2)               // Ensure one valid arg.
++  |.if FFI
++  |  bnez TMP3, >1
++  |.endif
+   |  // BASE = base, RA = resultptr, RB = meta base
+   |  ld.d KBASE, PC2PROTO(k)(TMP1)
+   |  jirl r0, TMP0, 0				// Jump to continuation.
+   |
++  |.if FFI
++  |1:
++  |  addi.d TMP1, RB, -32
++  |  bnez TMP0, ->cont_ffi_callback	// cont = 1: return from FFI callback.
++  |  // cont = 0: tailcall from C function.
++  |  sub.d RC, TMP1, BASE
++  |  b ->vm_call_tail
++  |.endif
+   |
+   |->cont_cat:				// RA = resultptr, RB = meta base
+   |  ld.w INS, -4(PC)
+@@ -889,6 +903,17 @@ static void build_subroutines(BuildCtx *ctx)
+   |  // Returns 0/1 or TValue * (metamethod).
+   |  b <3
+   |
++  |->vmeta_equal_cd:
++  |.if FFI
++  |  or CARG2, INS, r0
++  |  addi.d PC, PC, -4
++  |  st.d BASE, L->base
++  |  or CARG1, L, r0
++  |  st.d PC, SAVE_PC(sp)
++  |  bl extern lj_meta_equal_cd		// (lua_State *L, BCIns op)
++  |  // Returns 0/1 or TValue * (metamethod).
++  |  b <3
++  |.endif
+   |
+   |->vmeta_istype:
+   |  addi.d PC, PC, -4
+@@ -2617,6 +2642,12 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
+       |  beqz TMP0, ->BC_ISNEN_Z
+     }
+     |// Either or both types are not numbers.
++    |.if FFI
++    |  // Check if RA or RD is a cdata.
++    |  addi.w TMP0, r0, LJ_TCDATA
++    |  beq CARG3, TMP0, ->vmeta_equal_cd
++    |  beq CARG4, TMP0, ->vmeta_equal_cd
++    |.endif
+     |  addu16i.d TMP3, r0, -0x2		// -BCBIAS_J*4
+     |  decode_BC4b TMP2
+     |  add.w TMP2, TMP2, TMP3		// (jump-0x8000)<<2
+@@ -2667,10 +2698,17 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
+     |  sub.d RD, KBASE, RD
+     |  ld.hu TMP2, -4+OFS_RD(PC)
+     |  ld.d CARG2, -8(RD)		// KBASE-8-str_const*8
++    |.if FFI
++    |  gettp CARG3, CARG1
++    |  addi.w TMP1, r0, LJ_TCDATA
++    |.endif
+     |  addi.w TMP0, r0, LJ_TSTR
+     |  decode_BC4b TMP2
+     |  settp CARG2, TMP0
+     |  addu16i.d TMP3, r0, -0x2		// -BCBIAS_J*4
++    |.if FFI
++    |  beq CARG3, TMP1, ->vmeta_equal_cd
++    |.endif
+     |  xor TMP0, CARG1, CARG2		// TMP2=0: A==D; TMP2!=0: A!=D
+     |  add.w TMP2, TMP2, TMP3
+     if (vk) {
+@@ -2720,7 +2758,11 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
+     |4:  // RA is not an integer.
+     |  sltu TMP0, CARG3, TISNUM
+     |  add.w TMP2, TMP2, TMP3
++    |.if FFI
++    |  beqz TMP0, >7
++    |.else
+     |  beqz TMP0, <2
++    |.endif
+     |  movgr2fr.d FTMP0, CARG1
+     |  movgr2fr.d FTMP2, CARG2
+     |  bne CARG4, TISNUM, >5
+@@ -2734,12 +2776,27 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
+     |
+     |6: // RA is an integer, RD is a number.
+     |  sltu TMP0, CARG4, TISNUM
++    |.if FFI
++    |  beqz TMP0, >8
++    |.else
+     |  beqz TMP0, <2
++    |.endif
+     |  movgr2fr.w FTMP0, CARG1
+     |  movgr2fr.d FTMP2, CARG2
+     |  ffint.d.w FTMP0, FTMP0
+     |  b <5
+     |
++    |.if FFI
++    |7:	// RA not int, not number
++    |  addi.w TMP0, r0, LJ_TCDATA
++    |  bne CARG3, TMP0, <2
++    |  b ->vmeta_equal_cd
++    |
++    |8:	// RD not int, not number
++    |  addi.w TMP0, r0, LJ_TCDATA
++    |  bne CARG4, TMP0, <2
++    |  b ->vmeta_equal_cd
++    |.endif
+     break;
+ 
+   case BC_ISEQP: case BC_ISNEP:
+@@ -2753,6 +2810,10 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
+     |  gettp TMP1, TMP1
+     |  addi.d PC, PC, 4
+     |  xor TMP0, TMP1, TMP0		// TMP0=0 A=D; TMP0!=0 A!=D
++    |.if FFI
++    |  addi.w TMP3, r0, LJ_TCDATA
++    |  beq TMP1, TMP3, ->vmeta_equal_cd
++    |.endif
+     |  decode_BC4b TMP2
+     |  addu16i.d TMP3, r0, -0x2		// -BCBIAS_J*4
+     |  add.w TMP2, TMP2, TMP3		// TMP2=(jump-0x8000)<<2
+@@ -3086,6 +3147,16 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
+     |  ins_next
+     break;
+   case BC_KCDATA:
++    |.if FFI
++    |  // RA = dst*8, RD = cdata_const*8 (~)
++    |  sub.d TMP1, KBASE, RD
++    |  ld.d TMP0, -8(TMP1)		// KBASE-8-cdata_const*8
++    |  addi.w TMP2, r0, LJ_TCDATA
++    |  add.d RA, BASE, RA
++    |  settp TMP0, TMP2
++    |  st.d TMP0, 0(RA)
++    |  ins_next
++    |.endif
+     break;
+   case BC_KSHORT:
+     |  // RA = dst*8, RD = int16_literal*8
+-- 
+2.20.1
+
diff --git a/loongarch64/0016-LoongArch64-Add-DWARF-and-ELF-header-definitions.patch b/loongarch64/0016-LoongArch64-Add-DWARF-and-ELF-header-definitions.patch
new file mode 100644
index 0000000..7bd52fa
--- /dev/null
+++ b/loongarch64/0016-LoongArch64-Add-DWARF-and-ELF-header-definitions.patch
@@ -0,0 +1,187 @@
+From d57fc6987a26e1c39fde5709a19f34cd881ca318 Mon Sep 17 00:00:00 2001
+From: Xiaolin Zhao <zhaoxiaolin@loongson.cn>
+Date: Wed, 27 Jul 2022 18:31:45 +0800
+Subject: [PATCH 16/20] LoongArch64: Add DWARF and ELF header definitions
+
+---
+ src/lj_gdbjit.c         |  12 ++++
+ src/vm_loongarch64.dasc | 129 ++++++++++++++++++++++++++++++++++++++++
+ 2 files changed, 141 insertions(+)
+
+diff --git a/src/lj_gdbjit.c b/src/lj_gdbjit.c
+index c50d0d4c..772a678e 100644
+--- a/src/lj_gdbjit.c
++++ b/src/lj_gdbjit.c
+@@ -306,6 +306,9 @@ enum {
+ #elif LJ_TARGET_MIPS
+   DW_REG_SP = 29,
+   DW_REG_RA = 31,
++#elif LJ_TARGET_LOONGARCH64
++  DW_REG_SP = 3,
++  DW_REG_RA = 1,
+ #else
+ #error "Unsupported target architecture"
+ #endif
+@@ -383,6 +386,8 @@ static const ELFheader elfhdr_template = {
+   .machine = 20,
+ #elif LJ_TARGET_MIPS
+   .machine = 8,
++#elif LJ_TARGET_LOONGARCH64
++  .machine = 258,
+ #else
+ #error "Unsupported target architecture"
+ #endif
+@@ -591,6 +596,13 @@ static void LJ_FASTCALL gdbjit_ehframe(GDBJITctx *ctx)
+       for (i = 23; i >= 16; i--) { DB(DW_CFA_offset|i); DUV(26-i); }
+       for (i = 30; i >= 20; i -= 2) { DB(DW_CFA_offset|32|i); DUV(42-i); }
+     }
++#elif LJ_TARGET_LOONGARCH64
++    {
++      int i;
++      DB(DW_CFA_offset|30); DUV(2);
++      for (i = 31; i >= 23; i--) { DB(DW_CFA_offset|i); DUV(3+(31-i)); }
++      for (i = 31; i >= 24; i--) { DB(DW_CFA_offset|32|i); DUV(43-i); }
++    }
+ #else
+ #error "Unsupported target architecture"
+ #endif
+diff --git a/src/vm_loongarch64.dasc b/src/vm_loongarch64.dasc
+index e528e7d5..580f13ce 100644
+--- a/src/vm_loongarch64.dasc
++++ b/src/vm_loongarch64.dasc
+@@ -4477,3 +4477,132 @@ static int build_backend(BuildCtx *ctx)
+ 
+   return BC__MAX;
+ }
++
++/* Emit pseudo frame-info (.debug_frame and .eh_frame CIEs/FDEs) for all assembler functions. */
++static void emit_asm_debug(BuildCtx *ctx)
++{
++  int fcofs = (int)((uint8_t *)ctx->glob[GLOB_vm_ffi_call] - ctx->code);
++  int i;
++  switch (ctx->mode) {
++  case BUILD_elfasm:
++    fprintf(ctx->fp, "\t.section .debug_frame,\"\",@progbits\n");
++    fprintf(ctx->fp,
++	".Lframe0:\n"
++	"\t.4byte .LECIE0-.LSCIE0\n"
++	".LSCIE0:\n"
++	"\t.4byte 0xffffffff\n"
++	"\t.byte 0x1\n"
++	"\t.string \"\"\n"
++	"\t.uleb128 0x1\n"
++	"\t.sleb128 -4\n"
++	"\t.byte 1\n"				/* Return address is in ra. */
++	"\t.byte 0xc\n\t.uleb128 3\n\t.uleb128 0\n"	/* def_cfa sp 0 */
++	"\t.align 3\n"
++	".LECIE0:\n\n");
++    fprintf(ctx->fp,
++	".LSFDE0:\n"
++	"\t.4byte .LEFDE0-.LASFDE0\n"
++	".LASFDE0:\n"
++	"\t.4byte .Lframe0\n"
++	"\t.8byte .Lbegin\n"
++	"\t.8byte %d\n"
++	"\t.byte 0xe\n\t.uleb128 %d\n"
++	"\t.byte 0x81\n\t.uleb128 2*5\n"	/* offset ra*/
++	"\t.byte 0x96\n\t.uleb128 2*6\n",	/* offset fp */
++	fcofs, CFRAME_SIZE);
++    for (i = 31; i >= 23; i--)	/* offset r31-r23 */
++      fprintf(ctx->fp, "\t.byte %d\n\t.uleb128 %d\n", 0x80+i, 2*(31-i+7));
++    for (i = 31; i >= 24; i--)	/* offset f31-f24 */
++      fprintf(ctx->fp, "\t.byte %d\n\t.uleb128 %d\n", 0x80+32+i, 2*(31-i+16));
++    fprintf(ctx->fp,
++	"\t.align 3\n"
++	".LEFDE0:\n\n");
++#if LJ_HASFFI
++    fprintf(ctx->fp,
++	".LSFDE1:\n"
++	"\t.4byte .LEFDE1-.LASFDE1\n"
++	".LASFDE1:\n"
++	"\t.4byte .Lframe0\n"
++	"\t.8byte lj_vm_ffi_call\n"		/* 64 bit address, same width as in .LSFDE0. */
++	"\t.8byte %d\n"				/* 64 bit range, same width as in .LSFDE0. */
++	"\t.byte 0x81\n\t.uleb128 2*5\n"	/* offset ra*/
++	"\t.byte 0x96\n\t.uleb128 2*6\n"	/* offset fp */
++	"\t.align 3\n"
++	".LEFDE1:\n\n", (int)ctx->codesz - fcofs);
++#endif
++#if !LJ_NO_UNWIND
++    fprintf(ctx->fp, "\t.section .eh_frame,\"a\",@progbits\n");
++    fprintf(ctx->fp,
++	".Lframe1:\n"
++	"\t.4byte .LECIE1-.LSCIE1\n"
++	".LSCIE1:\n"
++	"\t.4byte 0\n"
++	"\t.byte 0x1\n"
++	"\t.string \"zPR\"\n"
++	"\t.uleb128 0x1\n"
++	"\t.sleb128 -4\n"
++	"\t.byte 1\n"				/* Return address is in ra. */
++	"\t.uleb128 6\n"			/* augmentation length */
++	"\t.byte 0x1b\n"
++	"\t.4byte lj_err_unwind_dwarf-.\n"
++	"\t.byte 0x1b\n"
++	"\t.byte 0xc\n\t.uleb128 3\n\t.uleb128 0\n"	/* def_cfa sp 0 */
++	"\t.align 2\n"
++	".LECIE1:\n\n");
++    fprintf(ctx->fp,
++	".LSFDE2:\n"
++	"\t.4byte .LEFDE2-.LASFDE2\n"
++	".LASFDE2:\n"
++	"\t.4byte .LASFDE2-.Lframe1\n"
++	"\t.4byte .Lbegin-.\n"
++	"\t.4byte %d\n"
++	"\t.uleb128 0\n"			/* augmentation length */
++	"\t.byte 0x81\n\t.uleb128 2*5\n"	/* offset ra*/
++	"\t.byte 0x96\n\t.uleb128 2*6\n",	/* offset fp */
++	fcofs);
++    for (i = 31; i >= 23; i--)	/* offset r31-r23 */
++      fprintf(ctx->fp, "\t.byte %d\n\t.uleb128 %d\n", 0x80+i, 2*(31-i+7));
++    for (i = 31; i >= 24; i--)	/* offset f31-f24 */
++      fprintf(ctx->fp, "\t.byte %d\n\t.uleb128 %d\n", 0x80+32+i, 2*(31-i+16));
++    fprintf(ctx->fp,
++	"\t.align 2\n"
++	".LEFDE2:\n\n");
++#if LJ_HASFFI
++    fprintf(ctx->fp,
++	".Lframe2:\n"
++	"\t.4byte .LECIE2-.LSCIE2\n"
++	".LSCIE2:\n"
++	"\t.4byte 0\n"
++	"\t.byte 0x1\n"
++	"\t.string \"zR\"\n"
++	"\t.uleb128 0x1\n"
++	"\t.sleb128 -4\n"
++	"\t.byte 1\n"				/* Return address is in ra. */
++	"\t.uleb128 1\n"			/* augmentation length */
++	"\t.byte 0x1b\n"
++	"\t.byte 0xc\n\t.uleb128 3\n\t.uleb128 0\n"	/* def_cfa sp 0 */
++	"\t.align 2\n"
++	".LECIE2:\n\n");
++    fprintf(ctx->fp,
++	".LSFDE3:\n"
++	"\t.4byte .LEFDE3-.LASFDE3\n"
++	".LASFDE3:\n"
++	"\t.4byte .LASFDE3- .Lframe2\n"
++	"\t.4byte lj_vm_ffi_call-.\n"
++	"\t.4byte %d\n"
++	"\t.uleb128 0\n"			/* augmentation length */
++	"\t.byte 0x81\n\t.uleb128 2*5\n"	/* offset ra*/
++	"\t.byte 0x96\n\t.uleb128 2*6\n"	/* offset fp */
++	"\t.align 2\n"
++	".LEFDE3:\n\n", (int)ctx->codesz - fcofs);
++#endif
++#endif
++#if !LJ_NO_UNWIND
++    /* NYI */
++#endif
++    break;
++  default:
++    break;
++  }
++}
++
+-- 
+2.20.1
+
diff --git a/loongarch64/0017-LoongArch64-Add-support-for-LuaJIT-VM-builder.patch b/loongarch64/0017-LoongArch64-Add-support-for-LuaJIT-VM-builder.patch
new file mode 100644
index 0000000..6f46e69
--- /dev/null
+++ b/loongarch64/0017-LoongArch64-Add-support-for-LuaJIT-VM-builder.patch
@@ -0,0 +1,46 @@
+From 42c13dded64fb5d6cdaf14ae1144acef4e30484a Mon Sep 17 00:00:00 2001
+From: Xiaolin Zhao <zhaoxiaolin@loongson.cn>
+Date: Wed, 27 Jul 2022 18:35:18 +0800
+Subject: [PATCH 17/20] LoongArch64: Add support for LuaJIT VM builder
+
+---
+ src/host/buildvm.c     | 2 ++
+ src/host/buildvm_asm.c | 9 +++++++++
+ 2 files changed, 11 insertions(+)
+
+diff --git a/src/host/buildvm.c b/src/host/buildvm.c
+index 9ee47ada..b983c22f 100644
+--- a/src/host/buildvm.c
++++ b/src/host/buildvm.c
+@@ -67,6 +67,8 @@ static int collect_reloc(BuildCtx *ctx, uint8_t *addr, int idx, int type);
+ #include "../dynasm/dasm_ppc.h"
+ #elif LJ_TARGET_MIPS
+ #include "../dynasm/dasm_mips.h"
++#elif LJ_TARGET_LOONGARCH64
++#include "../dynasm/dasm_loongarch64.h"
+ #else
+ #error "No support for this architecture (yet)"
+ #endif
+diff --git a/src/host/buildvm_asm.c b/src/host/buildvm_asm.c
+index 7baa011f..e947b3f0 100644
+--- a/src/host/buildvm_asm.c
++++ b/src/host/buildvm_asm.c
+@@ -156,6 +156,15 @@ static void emit_asm_wordreloc(BuildCtx *ctx, uint8_t *p, int n,
+ 	  "Error: unsupported opcode %08x for %s symbol relocation.\n",
+ 	  ins, sym);
+   exit(1);
++#elif LJ_TARGET_LOONGARCH64
++  if ((ins >> 26) == 21) {
++    fprintf(ctx->fp, "\tbl %s\n", sym);
++  } else {
++    fprintf(stderr,
++            "Error: unsupported opcode %08x for %s symbol relocation.\n",
++            ins, sym);
++    exit(1);
++  }
+ #else
+ #error "missing relocation support for this architecture"
+ #endif
+-- 
+2.20.1
+
diff --git a/loongarch64/0018-LoongArch64-Add-loongarch64-support-when-save-list-b.patch b/loongarch64/0018-LoongArch64-Add-loongarch64-support-when-save-list-b.patch
new file mode 100644
index 0000000..397a73d
--- /dev/null
+++ b/loongarch64/0018-LoongArch64-Add-loongarch64-support-when-save-list-b.patch
@@ -0,0 +1,25 @@
+From 291d4df029d2b9fcf534db7cda01b1239461339c Mon Sep 17 00:00:00 2001
+From: Xiaolin Zhao <zhaoxiaolin@loongson.cn>
+Date: Thu, 28 Jul 2022 10:24:50 +0800
+Subject: [PATCH 18/20] LoongArch64: Add loongarch64 support when save/list
+ bytecode
+
+---
+ src/jit/bcsave.lua | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/src/jit/bcsave.lua b/src/jit/bcsave.lua
+index 90fe9daf..9547be4b 100644
+--- a/src/jit/bcsave.lua
++++ b/src/jit/bcsave.lua
+@@ -97,6 +97,7 @@ local map_arch = {
+   mips64el =	{ e = "le", b = 64, m = 8, f = 0x80000007, },
+   mips64r6 =	{ e = "be", b = 64, m = 8, f = 0xa0000407, },
+   mips64r6el =	{ e = "le", b = 64, m = 8, f = 0xa0000407, },
++  loongarch64 =	{ e = "le", b = 64, m = 258, f = 0x3, },
+ }
+ 
+ local map_os = {
+-- 
+2.20.1
+
diff --git a/loongarch64/0019-LoongArch64-Add-LoongArch64-disassembler-module.patch b/loongarch64/0019-LoongArch64-Add-LoongArch64-disassembler-module.patch
new file mode 100644
index 0000000..ad5a5fc
--- /dev/null
+++ b/loongarch64/0019-LoongArch64-Add-LoongArch64-disassembler-module.patch
@@ -0,0 +1,716 @@
+From d8e04b04cb3a3b134eba63110550b97ff6ac0d74 Mon Sep 17 00:00:00 2001
+From: Xiaolin Zhao <zhaoxiaolin@loongson.cn>
+Date: Thu, 28 Jul 2022 10:29:06 +0800
+Subject: [PATCH 19/20] LoongArch64: Add LoongArch64 disassembler module
+
+---
+ src/jit/dis_loongarch64.lua | 697 ++++++++++++++++++++++++++++++++++++
+ 1 file changed, 697 insertions(+)
+ create mode 100644 src/jit/dis_loongarch64.lua
+
+diff --git a/src/jit/dis_loongarch64.lua b/src/jit/dis_loongarch64.lua
+new file mode 100644
+index 00000000..0fe0266b
+--- /dev/null
++++ b/src/jit/dis_loongarch64.lua
+@@ -0,0 +1,697 @@
++----------------------------------------------------------------------------
++-- LuaJIT LoongArch64 disassembler module.
++--
++-- Copyright (C) 2005-2022 Mike Pall. All rights reserved.
++-- Released under the MIT/X license. See Copyright Notice in luajit.h
++----------------------------------------------------------------------------
++-- This is a helper module used by the LuaJIT machine code dumper module.
++--
++-- It disassembles most LoongArch instructions.
++-- NYI: SIMD instructions.
++------------------------------------------------------------------------------
++
++local type = type
++local byte, format = string.byte, string.format
++local match, gmatch = string.match, string.gmatch
++local concat = table.concat
++local bit = require("bit")
++local band, bor, bnot, tohex = bit.band, bit.bor, bit.bnot, bit.tohex
++local lshift, rshift, arshift = bit.lshift, bit.rshift, bit.arshift
++
++------------------------------------------------------------------------------
++-- Opcode maps
++------------------------------------------------------------------------------
++
++local map_18_0 = {      -- 18-20:0, 10-17
++  shift = 10, mask = 255,
++  [4] = "clo.wDJ",
++  [5] = "clz.wDJ",
++  [6] = "cto.wDJ",
++  [7] = "ctz.wDJ",
++  [8] = "clo.dDJ",
++  [9] = "clz.dDJ",
++  [10] = "cto.dDJ",
++  [11] = "ctz.dDJ",
++  [12] = "revb.2hDJ",
++  [13] = "revb.4hDJ",
++  [14] = "revb.2wDJ",
++  [15] = "revb.dDJ",
++  [16] = "revh.2wDJ",
++  [17] = "revh.dDJ",
++  [18] = "bitrev.4bDJ",
++  [19] = "bitrev.8bDJ",
++  [20] = "bitrev.wDJ",
++  [21] = "bitrev.dDJ",
++  [22] = "ext.w.hDJ",
++  [23] = "ext.w.bDJ",
++}
++
++local map_18_4 = {	-- 18-20:4, 15-17
++  shift = 15, mask = 7,
++  [0] = "add.wDJK",
++  [1] = "add.dDJK",
++  [2] = "sub.wDJK",
++  [3] = "sub.dDJK",
++  [4] = "sltDJK",
++  [5] = "sltuDJK",
++  [6] = "maskeqzDJK",
++  [7] = "masknezDJK",
++}
++
++local map_18_5 = {	-- 18-20:5, 15-17
++  shift = 15, mask = 7,
++  [0] = "norDJK",
++  [1] = "andDJK",
++  [2] = "orDJK",
++  [3] = "xorDJK",
++  [4] = "ornDJK",
++  [5] = "andnDJK",
++  [6] = "sll.wDJK",
++  [7] = "srl.wDJK",
++}
++
++local map_18_6 = {	-- 18-20:6, 15-17
++  shift = 15, mask = 7,
++  [0] = "sra.wDJK",
++  [1] = "sll.dDJK",
++  [2] = "srl.dDJK",
++  [3] = "sra.dDJK",
++  [6] = "rotr.wDJK",
++  [7] = "rotr.dDJK",
++}
++
++local map_18_7 = {	-- 18-20:7, 15-17
++  shift = 15, mask = 7,
++  [0] = "mul.wDJK",
++  [1] = "mulh.wDJK",
++  [2] = "mulh.wuDJK",
++  [3] = "mul.dDJK",
++  [4] = "mulh.dDJK",
++  [5] = "mulh.duDJK",
++  [6] = "mulw.d.wDJK",
++  [7] = "mulw.d.wuDJK",
++}
++
++local map_farith2 = {	-- map_farith[40], 10-14
++  shift = 10, mask = 31,
++  [1] = "fabs.sFG",
++  [2] = "fabs.dFG",
++  [5] = "fneg.sFG",
++  [6] = "fneg.dFG",
++  [9] = "flogb.sFG",
++  [10] = "flogb.dFG",
++  [13] = "fclass.sFG",
++  [14] = "fclass.dFG",
++  [17] = "fsqrt.sFG",
++  [18] = "fsqrt.dFG",
++  [21] = "frecip.sFG",
++  [22] = "frecip.dFG",
++  [25] = "frsqrt.sFG",
++  [26] = "frsqrt.dFG",
++  [29] = "frecipe.sFG",
++  [30] = "frecipe.dFG",
++}
++
++local map_fmov = {	-- map_farith[41], 10-14
++  shift = 10, mask = 31,
++  [1] = "frsqrte.sFG",	-- Carries over from map_farith2: sub-opcode 33 = 32 (bit 15) + 1.
++  [2] = "frsqrte.dFG",	-- Likewise: sub-opcode 34 = 32 (bit 15) + 2.
++  [5] = "fmov.sFG",
++  [6] = "fmov.dFG",
++  [9] = "movgr2fr.wFJ",
++  [10] = "movgr2fr.dFJ",
++  [11] = "movgr2frh.wFJ",
++  [13] = "movfr2gr.sDG",
++  [14] = "movfr2gr.dDG",
++  [15] = "movfrh2gr.sDG",
++  [16] = "movgr2fcsrSJ",
++  [18] = "movfcsr2grDR",
++  [20] = { shift = 3, mask = 3, [0] = "movfr2cfEG", },
++  [21] = { shift = 8, mask = 3, [0] = "movcf2frFA", },
++  [22] = { shift = 3, mask = 3, [0] = "movgr2cfEJ", },
++  [23] = { shift = 8, mask = 3, [0] = "movcf2grDA", },
++}
++
++local map_fconvert = { -- 15-20: 110010
++  shift = 10, mask = 31,
++  [6] = "fcvt.s.dFG",	[9] = "fcvt.d.sFG",
++}
++
++local map_fconvert1 = { -- 15-20: 110100
++  shift = 10, mask = 31,
++  [1] = "ftintrm.w.sFG",
++  [2] = "ftintrm.w.dFG",
++  [9] = "ftintrm.l.sFG",
++  [10] = "ftintrm.l.dFG",
++  [17] = "ftintrp.w.sFG",
++  [18] = "ftintrp.w.dFG",
++  [25] = "ftintrp.l.sFG",
++  [26] = "ftintrp.l.dFG",
++}
++
++local map_fconvert2 = { -- 15-20: 110101
++  shift = 10, mask = 31,
++  [1] = "ftintrz.w.sFG",
++  [2] = "ftintrz.w.dFG",
++  [9] = "ftintrz.l.sFG",
++  [10] = "ftintrz.l.dFG",
++  [17] = "ftintrne.w.sFG",
++  [18] = "ftintrne.w.dFG",
++  [25] = "ftintrne.l.sFG",
++  [26] = "ftintrne.l.dFG",
++}
++
++local map_fconvert3 = { -- 15-20: 110110
++  shift = 10, mask = 31,
++  [1] = "ftint.w.sFG",
++  [2] = "ftint.w.dFG",
++  [9] = "ftint.l.sFG",
++  [10] = "ftint.l.dFG",
++}
++
++local map_fconvert4 = { -- 15-20: 111010
++  shift = 10, mask = 31,
++  [4] = "ffint.s.wFG",
++  [6] =  "ffint.s.lFG",
++  [8] = "ffint.d.wFG",
++  [10] = "ffint.d.lFG",
++}
++
++local map_fconvert5 = { -- 15-20: 111100
++  shift = 10, mask = 31,
++  [17] = "frint.sFG",
++  [18] = "frint.dFG",
++}
++
++local map_farith = {	-- 22-25:4, 15-21
++  shift = 15, mask = 127,
++  [1] = "fadd.sFGH",
++  [2] = "fadd.dFGH",
++  [5] = "fsub.sFGH",
++  [6] = "fsub.dFGH",
++  [9] = "fmul.sFGH",
++  [10] = "fmul.dFGH",
++  [13] = "fdiv.sFGH",
++  [14] = "fdiv.dFGH",
++  [17] = "fmax.sFGH",
++  [18] = "fmax.dFGH",
++  [21] = "fmin.sFGH",
++  [22] = "fmin.dFGH",
++  [25] = "fmaxa.sFGH",
++  [26] = "fmaxa.dFGH",
++  [29] = "fmina.sFGH",
++  [30] = "fmina.dFGH",
++  [33] = "fscaleb.sFGH",
++  [34] = "fscaleb.dFGH",
++  [37] = "fcopysign.sFGH",
++  [38] = "fcopysign.dFGH",
++  [40] = map_farith2, [41] = map_fmov,
++  [50] = map_fconvert, [52] = map_fconvert1,
++  [53] = map_fconvert2, [54] = map_fconvert3,
++  [58] = map_fconvert4, [60] = map_fconvert5,
++}
++
++local map_21_0 = {	--21st:0, 18-20
++  shift = 18, mask = 7,
++  [0] = map_18_0,
++  [1] = { shift = 17, mask = 1, [0] = "alsl.wDJKQ", "alsl.wuDJKQ", },
++  [2] = {shift = 17, mask = 1, [0] = "bytepick.wDJKQ", },
++  [3] = "bytepick.dDJKB",
++  [4] = map_18_4,
++  [5] = map_18_5,
++  [6] = map_18_6,
++  [7] = map_18_7,
++}
++
++local map_21_1 = {      --21st:1, 22nd:0, 15-20
++  shift = 21, mask = 1,
++  [1] = {
++    shift = 18, mask = 7,
++    [0] = {	-- 18-20:0, 15-17
++      shift = 15, mask = 7,
++      [0] = "div.wDJK",
++      [1] = "mod.wDJK",
++      [2] = "div.wuDJK",
++      [3] = "mod.wuDJK",
++      [4] = "div.dDJK",
++      [5] = "mod.dDJK",
++      [6] = "div.duDJK",
++      [7] = "mod.duDJK",
++    },
++    [1] = {	-- 18-20:1, 15-17
++      shift = 15, mask = 7,	-- Bits 18-20 are already decoded by the parent map.
++      [0] = "crc.w.b.wDJK",
++      [1] = "crc.w.h.wDJK",
++      [2] = "crc.w.w.wDJK",
++      [3] = "crc.w.d.wDJK",
++      [4] = "crcc.w.b.wDJK",
++      [5] = "crcc.w.h.wDJK",
++      [6] = "crcc.w.w.wDJK",
++      [7] = "crcc.w.d.wDJK",
++    },
++    [2] = {	-- 18-20:2, 15-17
++      shift = 15, mask = 7,
++      [4] = "breakC", [6] = "syscallC",	-- Patterns must be strings, not (nil) globals.
++    },
++    [3] = { shift = 17, mask = 1, [0] = "alsl.dDJKQ", },
++  },
++}
++
++local map_22_0 = {
++  shift = 21, mask = 1,
++  [0] = map_21_0,
++  [1] = map_21_1,
++}
++
++local map_shift = {	-- 22nd:1, 21st:0
++  shift = 16, mask = 31,
++  [0] = { shift = 15, mask = 1, [1] = "slli.wDJU", },
++  [1] = "slli.dDJV",
++  [4] = { shift = 15, mask = 1, [1] = "srli.wDJU", },
++  [5] = "srli.dDJV",
++  [8] = { shift = 15, mask = 1, [1] = "srai.wDJU", },
++  [9] = "srai.dDJV",
++  [12] = { shift = 15, mask = 1, [1] = "rotri.wDJU", },
++  [13] = "rotri.dDJV",
++}
++
++local map_22_1 = {        -- 22nd:1
++  shift = 21, mask = 1,
++  [0] = map_shift,
++  [1] = { shift = 15, mask = 1, [0] = "bstrins.wDJMU", [1] = "bstrpick.wDJMU", },
++}
++
++local map_26_0 = {
++  shift = 22, mask = 15,
++  [0] = map_22_0,
++  [1] = map_22_1,
++  [2] = "bstrins.dDJNV",
++  [3] = "bstrpick.dDJNV",
++  [4] = map_farith,
++  [8] = "sltiDJX",
++  [9] = "sltuiDJX",
++  [10] = "addi.wDJX",
++  [11] = "addi.dDJX",
++  [12] = "lu52i.dDJX",
++  [13] = "andiDJT",
++  [14] = "oriDJT",
++  [15] = "xoriDJT",
++}
++
++local map_long_i_5 = { -- Long immediate fixed-point arithmetic.
++  shift = 25, mask = 1,
++  [0] = "lu12i.wDZ",
++  [1] = "lu32i.dDZ",
++}
++
++local map_long_i_6 = {
++  shift = 25, mask = 1,
++  [0] = "pcaddiDZ",
++  [1] = "pcalau12iDZ",
++}
++
++local map_long_i_7 = {
++  shift = 25, mask = 1,
++  [0] = "pcaddu12iDZ",
++  [1] = "pcaddu18iDZ",
++}
++
++local map_ldst0_14 = {
++  shift = 15, mask = 2047,
++  [0] = "ldx.bDJK", [8] = "ldx.hDJK", [16] = "ldx.wDJK",
++  [24] = "ldx.dDJK", [32] = "stx.bDJK", [40] = "stx.hDJK",
++  [48] = "stx.wDJK", [56] = "stx.dDJK", [64] = "ldx.buDJK",
++  [72] = "ldx.huDJK", [80] = "ldx.wuDJK", [96] = "fldx.sFJK",
++  [104] = "fldx.dFJK", [112] = "fstx.sFJK", [120] = "fstx.dFJK",
++  [232] = "fldgt.sFJK", [233] = "fldgt.dFJK", [234] = "fldle.sFJK",
++  [235] = "fldle.dFJK", [236] = "fstgt.sFJK", [237] = "fstgt.dFJK",
++  [238] = "fstle.sFJK", [239] = "fstle.dFJK", [240] = "ldgt.bDJK",
++  [241] = "ldgt.hDJK", [242] = "ldgt.wDJK", [243] = "ldgt.dDJK",
++  [244] = "ldle.bDJK", [245] = "ldle.hDJK", [246] = "ldle.wDJK",
++  [247] = "ldle.dDJK", [248] = "stgt.bDJK", [249] = "stgt.hDJK",
++  [250] = "stgt.wDJK", [251] = "stgt.dDJK", [252] = "stle.bDJK",
++  [253] = "stle.hDJK", [254] = "stle.wDJK", [255] = "stle.dDJK",
++}
++
++local map_ldst1_8 = {
++  shift = 24, mask = 3,
++  [0] = "ll.wDJW",
++  [1] = "sc.wDJW",
++  [2] = "ll.dDJW",
++  [3] = "sc.dDJW",
++}
++
++local map_ldst1_9 = {
++  shift = 24, mask = 3,
++  [0] = "ldptr.wDJW",
++  [1] = "stptr.wDJW",
++  [2] = "ldptr.dDJW",
++  [3] = "stptr.dDJW",
++}
++
++local map_ldst1_10 = {	-- 26-31: 10, 22-25; operands: rd/fd, rj (base), si12
++  shift = 22, mask = 15,
++  [0] = "ld.bDJX",
++  [1] = "ld.hDJX",
++  [2] = "ld.wDJX",
++  [3] = "ld.dDJX",
++  [4] = "st.bDJX",
++  [5] = "st.hDJX",
++  [6] = "st.wDJX",
++  [7] = "st.dDJX",
++  [8] = "ld.buDJX",
++  [9] = "ld.huDJX",
++  [10] = "ld.wuDJX",
++  [12] = "fld.sFJX",
++  [13] = "fst.sFJX",
++  [14] = "fld.dFJX",
++  [15] = "fst.dFJX",
++}
++
++local map_fcmp0 = {
++  shift = 15, mask = 31,
++  [0] = "fcmp.caf.sEGH",
++  [1] = "fcmp.saf.sEGH",
++  [2] = "fcmp.clt.sEGH",
++  [3] = "fcmp.slt.sEGH",
++  [4] = "fcmp.ceq.sEGH",
++  [5] = "fcmp.seq.sEGH",
++  [6] = "fcmp.cle.sEGH",
++  [7] = "fcmp.sle.sEGH",
++  [8] = "fcmp.cun.sEGH",
++  [9] = "fcmp.sun.sEGH",
++  [10] = "fcmp.cult.sEGH",
++  [11] ="fcmp.sult.sEGH",
++  [12] = "fcmp.cueq.sEGH",
++  [13] = "fcmp.sueq.sEGH",
++  [14] = "fcmp.cule.sEGH",
++  [15] = "fcmp.sule.sEGH",
++  [16] = "fcmp.cne.sEGH",
++  [17] = "fcmp.sne.sEGH",
++  [20] = "fcmp.cor.sEGH",
++  [21] = "fcmp.sor.sEGH",
++  [24] = "fcmp.cune.sEGH",
++  [25] = "fcmp.sune.sEGH",
++}
++
++local map_fcmp1 = {
++  shift = 15, mask = 31,
++  [0] = "fcmp.caf.dEGH",
++  [1] = "fcmp.saf.dEGH",
++  [2] = "fcmp.clt.dEGH",
++  [3] = "fcmp.slt.dEGH",
++  [4] = "fcmp.ceq.dEGH",
++  [5] = "fcmp.seq.dEGH",
++  [6] = "fcmp.cle.dEGH",
++  [7] = "fcmp.sle.dEGH",
++  [8] = "fcmp.cun.dEGH",
++  [9] = "fcmp.sun.dEGH",
++  [10] = "fcmp.cult.dEGH",
++  [11] = "fcmp.sult.dEGH",
++  [12] = "fcmp.cueq.dEGH",
++  [13] = "fcmp.sueq.dEGH",
++  [14] = "fcmp.cule.dEGH",
++  [15] = "fcmp.sule.dEGH",
++  [16] = "fcmp.cne.dEGH",
++  [17] = "fcmp.sne.dEGH",
++  [20] = "fcmp.cor.dEGH",
++  [21] = "fcmp.sor.dEGH",
++  [24] = "fcmp.cune.dEGH",
++  [25] = "fcmp.sune.dEGH",
++}
++
++local map_fcmp = {
++  shift = 20, mask = 63,
++  [1] = { shift = 3, mask = 3, [0] = map_fcmp0, },
++  [2] = { shift = 3, mask = 3, [0] = map_fcmp1, },
++  [16] = { shift = 18, mask = 3, [0] = "fselFGHI", },
++}
++
++local map_fp = {	-- 26-31: 2, 20-23
++  shift = 20, mask = 15,
++  [1] = "fmadd.sFGHi",
++  [2] = "fmadd.dFGHi",
++  [5] = "fmsub.sFGHi",
++  [6] = "fmsub.dFGHi",
++  [9] = "fnmadd.sFGHi", [10] = "fnmadd.dFGHi",
++  [13] = "fnmsub.sFGHi", [14] = "fnmsub.dFGHi",
++}
++
++local map_init = {
++  shift = 26, mask = 63,
++  [0] = map_26_0,
++  [2] = map_fp,
++  [3] = map_fcmp,
++  [4] = "addu16i.dDJY",
++  [5] = map_long_i_5,
++  [6] = map_long_i_6,
++  [7] = map_long_i_7,
++  [8] = map_ldst1_8,
++  [9] = map_ldst1_9,
++  [10] = map_ldst1_10,
++  [14] = map_ldst0_14,
++  [16] = "beqzJL",
++  [17] = "bnezJL",
++  [18] = { shift = 8, mask = 3, [0] = "bceqzAL", "bcnezAL", },
++  [19] = "jirlDJa",
++  [20] = "bP",
++  [21] = "blP",
++  [22] = "beqJDO",
++  [23] = "bneJDO",
++  [24] = "bltJDO",
++  [25] = "bgeJDO",
++  [26] = "bltuJDO",
++  [27] = "bgeuJDO",
++}
++
++------------------------------------------------------------------------------
++
++local map_gpr = {
++  [0] = "r0", "ra", "r2", "sp", "r4", "r5", "r6", "r7",
++  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
++  "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
++  "r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31",
++}
++
++------------------------------------------------------------------------------
++
++-- Output a nicely formatted line with an opcode and operands.
++local function putop(ctx, text, operands)
++  local pos = ctx.pos
++  local extra = ""
++  if ctx.rel then
++    local sym = ctx.symtab[ctx.rel]
++    if sym then extra = "\t->"..sym end
++  end
++  if ctx.hexdump > 0 then
++    ctx.out(format("%08x  %s  %-7s %s%s\n",
++	    ctx.addr+pos, tohex(ctx.op), text, concat(operands, ", "), extra))
++  else
++    ctx.out(format("%08x  %-7s %s%s\n",
++	    ctx.addr+pos, text, concat(operands, ", "), extra))
++  end
++  ctx.pos = pos + 4
++end
++
++-- Fallback for unknown opcodes.
++local function unknown(ctx)
++  return putop(ctx, ".long", { "0x"..tohex(ctx.op) })
++end
++
++local function get_le(ctx)
++  local pos = ctx.pos
++  local b0, b1, b2, b3 = byte(ctx.code, pos+1, pos+4)
++  return bor(lshift(b3, 24), lshift(b2, 16), lshift(b1, 8), b0)
++end
++
++-- Decode imm.
++local function decode_si_imm(imm, bits, scale, signed, mask)
++  local n = tonumber(imm)
++  if n then
++    local m = arshift(n, scale)
++    if lshift(m, scale) == n then
++      if signed then
++        local s = arshift(band(m, mask), bits-1)
++        if s == 0 then
++          return m
++        elseif s == 1 then
++          return -(band(bnot(m), mask)+1)
++        end
++      else
++        if arshift(m, bits) == 0 then
++          return m
++        end
++      end
++    end
++  end
++end
++
++-- Disassemble a single instruction.
++local function disass_ins(ctx)
++  local op = ctx:get()
++  local operands = {}
++  local last = nil
++  ctx.op = op
++  ctx.rel = nil
++
++  local opat = ctx.map_pri[rshift(op, 26)]
++  while type(opat) ~= "string" do
++    if not opat then return unknown(ctx) end
++    opat = opat[band(rshift(op, opat.shift), opat.mask)]
++  end
++  local name, pat = match(opat, "^([a-z0-9_.]*)(.*)")
++  local altname, pat2 = match(pat, "|([a-z0-9_.|]*)(.*)")
++  if altname then pat = pat2 end
++
++  for p in gmatch(pat, ".") do
++    local x = nil
++    if p == "D" then
++      x = map_gpr[band(rshift(op, 0), 31)]
++    elseif p == "J" then
++      x = map_gpr[band(rshift(op, 5), 31)]
++    elseif p == "K" then
++      x = map_gpr[band(rshift(op, 10), 31)]
++    elseif p == "F" then
++      x = "f"..band(rshift(op, 0), 31)
++    elseif p == "G" then
++      x = "f"..band(rshift(op, 5), 31)
++    elseif p == "H" then
++      x = "f"..band(rshift(op, 10), 31)
++    elseif p == "i" then
++      x = "f"..band(rshift(op, 15), 31)
++    elseif p == "S" then
++      x = "fcsr"..band(rshift(op, 0), 31)
++    elseif p == "R" then
++      x = "fcsr"..band(rshift(op, 5), 31)
++    elseif p == "E" then
++      x = "fcc"..band(rshift(op, 0), 7)
++    elseif p == "A" then
++      x = "fcc"..band(rshift(op, 5), 7)
++    elseif p == "I" then
++      x = "fcc"..band(rshift(op, 15), 7)
++    elseif p == "Q" then	-- sa2
++      x = band(rshift(op, 15), 3)
++      ctx.rel = x
++      x = format("%d", x)
++    elseif p == "B" then	-- sa3
++      x = band(rshift(op, 15), 7)
++      ctx.rel = x
++      x = format("%d", x)
++    elseif p == "M" then	-- msbw
++      x = band(rshift(op, 16), 31)
++      ctx.rel = x
++      x = format("%d(0x%x)", x, x)
++    elseif p == "N" then	-- msbd
++      x = band(rshift(op, 16), 63)
++      ctx.rel = x
++      x = format("%d(0x%x)", x, x)
++    elseif p == "U" then	-- ui5
++      x = band(rshift(op, 10), 31)
++      ctx.rel = x
++      x = format("%d(0x%x)", x, x)
++    elseif p == "V" then	-- ui6
++      x = band(rshift(op, 10), 63)
++      ctx.rel = x
++      x = format("%d(0x%x)", x, x)
++    elseif p == "T" then	-- ui12
++      x = band(rshift(op, 10), 4095)
++      ctx.rel = x
++      x = format("%d(0x%x)", x, x)
++    elseif p == "W" then	-- si14
++      x = band(rshift(op, 10), 16383)
++      x = decode_si_imm(x, 14, 0, true, 0x3fff)
++      ctx.rel = x
++      x = format("%d(0x%04x)", x, band(x, 0x3fff))
++    elseif p == "X" then	-- si12
++      x = band(rshift(op, 10), 4095)
++      x = decode_si_imm(x, 12, 0, true, 0xfff)
++      ctx.rel = x
++      x = format("%d(0x%03x)", x, band(x, 0xfff))
++    elseif p == "o" then
++      local disp = band((rshift(op, 10)), 0xfff)
++      operands[#operands] = format("%s, %d", last, disp)
++    elseif p == "Y" then	-- si16
++      x = band(rshift(op, 10), 65535)
++      x = decode_si_imm(x, 16, 0, true, 0xffff)
++      ctx.rel = x
++      x = format("%d(0x%04x)", x, band(x, 0xffff))
++    elseif p == "Z" then	-- si20
++      x = band(rshift(op, 10), 1048575)
++      x = decode_si_imm(x, 20, 0, true, 0xfffff)
++      ctx.rel = x
++      x = format("%d(0x%05x)", x, band(x, 0xfffff))
++    elseif p == "C" then	-- code
++      x = band(rshift(op, 0), 32767)
++    elseif p == "O" then	-- offs[15:0]
++      x = band(rshift(op, 10), 65535)
++      x = decode_si_imm(x, 16, 0, true, 0xffff)
++      ctx.rel = x
++      x = format("%d(0x%04x)", x, band(x, 0xffff))
++    elseif p == "L" then	-- offs[15:0] + offs[20:16]
++      x = lshift(band(op, 31), 16) + band(rshift(op, 10), 65535)
++      x = decode_si_imm(x, 21, 0, true, 0x1fffff)
++      ctx.rel = x
++      x = format("%d(0x%06x)", x, band(x, 0x1fffff))
++    elseif p == "P" then	-- offs[15:0] + offs[25:16]
++      x = lshift(band(op, 1023), 16) + band(rshift(op, 10), 65535)
++      x = decode_si_imm(x, 26, 0, true, 0x3ffffff)
++      ctx.rel = x
++      x = format("%d(0x%07x)", x, band(x, 0x3ffffff))
++    elseif p == "a" then
++      x = band(rshift(op, 10), 65535)
++      x = decode_si_imm(x, 16, 0, true, 0xffff)
++      ctx.rel = x
++      x = format("%d(0x%04x)", x, band(x, 0xffff))
++    else
++      assert(false)
++    end
++    if x then operands[#operands+1] = x; last = x end
++  end
++
++  return putop(ctx, name, operands)
++end
++
++------------------------------------------------------------------------------
++
++-- Disassemble a block of code.
++local function disass_block(ctx, ofs, len)
++  if not ofs then ofs = 0 end
++  local stop = len and ofs+len or #ctx.code
++  stop = stop - stop % 4
++  ctx.pos = ofs - ofs % 4
++  ctx.rel = nil
++  while ctx.pos < stop do disass_ins(ctx) end
++end
++
++-- Extended API: create a disassembler context. Then call ctx:disass(ofs, len).
++local function create(code, addr, out)
++  local ctx = {}
++  ctx.code = code
++  ctx.addr = addr or 0
++  ctx.out = out or io.write
++  ctx.symtab = {}
++  ctx.disass = disass_block
++  ctx.hexdump = 8
++  ctx.get = get_le
++  ctx.map_pri = map_init
++  return ctx
++end
++
++-- Simple API: disassemble code (a string) at address and output via out.
++local function disass(code, addr, out)
++  create(code, addr, out):disass()
++end
++
++-- Return register name for RID.
++local function regname(r)
++  if r < 32 then return map_gpr[r] end
++  return "f"..(r-32)
++end
++
++-- Public module functions.
++return {
++  create = create,
++  disass = disass,
++  regname = regname
++}
++
+-- 
+2.20.1
+
diff --git a/loongarch64/0020-LoongArch64-Add-support-in-Makefile.patch b/loongarch64/0020-LoongArch64-Add-support-in-Makefile.patch
new file mode 100644
index 0000000..8a5dc88
--- /dev/null
+++ b/loongarch64/0020-LoongArch64-Add-support-in-Makefile.patch
@@ -0,0 +1,68 @@
+From d2eb1408daf4be266cf6fbc8be6024b6b5c717c5 Mon Sep 17 00:00:00 2001
+From: Xiaolin Zhao <zhaoxiaolin@loongson.cn>
+Date: Thu, 28 Jul 2022 11:02:28 +0800
+Subject: [PATCH] LoongArch64: Add support in Makefile
+
+---
+ Makefile     |  2 +-
+ src/Makefile | 10 +++++++++-
+ 2 files changed, 10 insertions(+), 2 deletions(-)
+
+diff --git a/Makefile b/Makefile
+index b0288b4d..6cd25b5c 100644
+--- a/Makefile
++++ b/Makefile
+@@ -92,7 +92,7 @@ FILES_INC= lua.h lualib.h lauxlib.h luaconf.h lua.hpp luajit.h
+ FILES_JITLIB= bc.lua bcsave.lua dump.lua p.lua v.lua zone.lua \
+ 	      dis_x86.lua dis_x64.lua dis_arm.lua dis_arm64.lua \
+ 	      dis_arm64be.lua dis_ppc.lua dis_mips.lua dis_mipsel.lua \
+-	      dis_mips64.lua dis_mips64el.lua vmdef.lua
++	      dis_mips64.lua dis_mips64el.lua dis_loongarch64.lua vmdef.lua
+ 
+ ifeq (,$(findstring Windows,$(OS)))
+   HOST_SYS:= $(shell uname -s)
+diff --git a/src/Makefile b/src/Makefile
+index 30d64be2..5035a8a2 100644
+--- a/src/Makefile
++++ b/src/Makefile
+@@ -53,6 +53,7 @@ CCOPT_arm=
+ CCOPT_arm64=
+ CCOPT_ppc=
+ CCOPT_mips=
++CCOPT_loongarch64= -fwrapv
+ #
+ CCDEBUG=
+ # Uncomment the next line to generate debug information:
+@@ -244,6 +245,10 @@ else
+ ifneq (,$(findstring LJ_TARGET_ARM ,$(TARGET_TESTARCH)))
+   TARGET_LJARCH= arm
+ else
++ifneq (,$(findstring LJ_TARGET_LOONGARCH64 ,$(TARGET_TESTARCH)))
++  TARGET_ARCH= -DLJ_ARCH_ENDIAN=LUAJIT_LE
++  TARGET_LJARCH= loongarch64
++else
+ ifneq (,$(findstring LJ_TARGET_ARM64 ,$(TARGET_TESTARCH)))
+   ifneq (,$(findstring __AARCH64EB__ ,$(TARGET_TESTARCH)))
+     TARGET_ARCH= -D__AARCH64EB__=1
+@@ -275,6 +280,7 @@ endif
+ endif
+ endif
+ endif
++endif
+ 
+ ifneq (,$(findstring LJ_TARGET_PS3 1,$(TARGET_TESTARCH)))
+   TARGET_SYS= PS3
+@@ -338,7 +344,9 @@ else
+     # Find out whether the target toolchain always generates unwind tables.
+     TARGET_TESTUNWIND=$(shell exec 2>/dev/null; echo 'extern void b(void);int a(void){b();return 0;}' | $(TARGET_CC) -c -x c - -o tmpunwind.o && { grep -qa -e eh_frame -e __unwind_info tmpunwind.o || grep -qU -e eh_frame -e __unwind_info tmpunwind.o; } && echo E; rm -f tmpunwind.o)
+     ifneq (,$(findstring E,$(TARGET_TESTUNWIND)))
+-      TARGET_XCFLAGS+= -DLUAJIT_UNWIND_EXTERNAL
++      ifeq (,$(findstring LJ_TARGET_LOONGARCH64 ,$(TARGET_TESTARCH)))
++        TARGET_XCFLAGS+= -DLUAJIT_UNWIND_EXTERNAL
++      endif
+     endif
+   endif
+   ifneq (SunOS,$(TARGET_SYS))
+-- 
+2.20.1
+
diff --git a/luajit.spec b/luajit.spec
index 8a5e1a1..dcc99d5 100644
--- a/luajit.spec
+++ b/luajit.spec
@@ -2,18 +2,23 @@
 
 Name:           luajit
 Version:        2.1.0
-Release:        4
+Release:        5
 Summary:        Just-In-Time Compiler for Lua
 License:        MIT
 URL:            http://luajit.org/
 Source0:        http://luajit.org/download/LuaJIT-2.1.0-beta3.tar.gz
+# Used by the loongarch64 branch of the prep section; declared without an
+# ifarch guard so that source RPMs built on any host still ship these files.
+Source1:        loongarch64.tar.gz
+Source2:        loongarch64.conf
+Source3:        apply-patches
 
 # Patches from https://github.com/LuaJit/LuaJIT.git
 # Generated from v2.1 branch against the 2.1.0-beta3 tag using
 # git diff v2.1.0-beta3..v2.1 > luajit-2.1-update.patch commit 224129a
 Patch0:         luajit-2.1-224129a-update.patch
 
-ExclusiveArch:  %{arm} %{ix86} x86_64 %{mips} aarch64
+ExclusiveArch:  %{arm} %{ix86} x86_64 %{mips} aarch64 loongarch64
 
 BuildRequires:  gcc
 BuildRequires:  make
@@ -38,6 +43,13 @@ Man pages and other related documents for luajit.
 %prep
 %autosetup -n LuaJIT-2.1.0-beta3 -p1
 
+%ifarch loongarch64
+cp %{SOURCE1} .
+cp %{SOURCE2} .
+cp %{SOURCE3} .
+sh ./apply-patches
+%endif
+
 sed -i -e '/install -m/s/-m/-p -m/' Makefile
 
 %build
@@ -77,6 +89,9 @@ ln -s luajit-2.1.0-beta3 %{buildroot}%{_bindir}/luajit
 %{_mandir}/man1/%{name}.1*
 
 %changelog
+* Mon May 15 2023 zhaoxiaolin <zhaoxiaolin@loongson.cn> - 2.1.0-5
+- Add loongarch64 base support
+
 * Fri May 12 2023 xu_ping <707078654@qq.com> - 2.1.0-4
 - Round upstream commit
 
-- 
Gitee