diff --git a/Makefile b/Makefile
index 540a4d05d..6a5a297d1 100644
--- a/Makefile
+++ b/Makefile
@@ -38,6 +38,7 @@ SLJIT_LIR_FILES = $(SRCDIR)/sljitLir.c $(SRCDIR)/sljitUtils.c \
 	$(SRCDIR)/sljitNativePPC_common.c $(SRCDIR)/sljitNativePPC_32.c $(SRCDIR)/sljitNativePPC_64.c \
 	$(SRCDIR)/sljitNativeSPARC_common.c $(SRCDIR)/sljitNativeSPARC_32.c \
 	$(SRCDIR)/sljitNativeTILEGX_64.c \
+	$(SRCDIR)/sljitNativeS390X.c \
 	$(SRCDIR)/sljitNativeX86_common.c $(SRCDIR)/sljitNativeX86_32.c $(SRCDIR)/sljitNativeX86_64.c
 
 .PHONY: all clean examples
diff --git a/sljit_src/sljitConfigInternal.h b/sljit_src/sljitConfigInternal.h
index 5a21d28c3..f3ba444a5 100644
--- a/sljit_src/sljitConfigInternal.h
+++ b/sljit_src/sljitConfigInternal.h
@@ -162,8 +162,6 @@ extern "C" {
 #define SLJIT_CONFIG_SPARC_32 1
 #elif defined(__tilegx__)
 #define SLJIT_CONFIG_TILEGX 1
-#elif defined(__s390x__)
-#define SLJIT_CONFIG_S390X 1
 #else
 /* Unsupported architecture */
 #define SLJIT_CONFIG_UNSUPPORTED 1
@@ -724,9 +722,30 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_sw sljit_exec_offset(void* ptr);
 
 #elif (defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X)
 
+/*
+ * https://refspecs.linuxbase.org/ELF/zSeries/lzsabi0_zSeries.html#STACKFRAME
+ *
+ * 160
+ *  .. FR6
+ *  .. FR4
+ *  .. FR2
+ * 128 FR0
+ * 120 R15 (used for SP)
+ * 112 R14
+ * 104 R13
+ *  96 R12
+ *  ..
+ *  48 R6
+ *  ..
+ *  16 R2
+ *   8 RESERVED
+ *   0 SP
+ */
+#define SLJIT_S390X_DEFAULT_STACK_FRAME_SIZE 160
+
 #define SLJIT_NUMBER_OF_REGISTERS 12
 #define SLJIT_NUMBER_OF_SAVED_REGISTERS 8
-#define SLJIT_LOCALS_OFFSET_BASE 160
+#define SLJIT_LOCALS_OFFSET_BASE SLJIT_S390X_DEFAULT_STACK_FRAME_SIZE
 
 #elif (defined SLJIT_CONFIG_UNSUPPORTED && SLJIT_CONFIG_UNSUPPORTED)
 
diff --git a/sljit_src/sljitLir.c b/sljit_src/sljitLir.c
index 465098726..16149ae8b 100644
--- a/sljit_src/sljitLir.c
+++ b/sljit_src/sljitLir.c
@@ -633,7 +633,10 @@ static SLJIT_INLINE sljit_s32 get_arg_count(sljit_s32 arg_types)
 	return arg_count;
 }
 
-#if !(defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86)
+
+/* Only used in RISC architectures where the instruction size is constant */
+#if !(defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86) \
+	&& !(defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X)
 
 static SLJIT_INLINE sljit_uw compute_next_addr(struct sljit_label *label, struct sljit_jump *jump,
 	struct sljit_const *const_, struct sljit_put_label *put_label)
@@ -655,7 +658,7 @@ static SLJIT_INLINE sljit_uw compute_next_addr(struct sljit_label *label, struct
 	return result;
 }
 
-#endif /* !SLJIT_CONFIG_X86 */
+#endif /* !SLJIT_CONFIG_X86 && !SLJIT_CONFIG_S390X */
 
 static SLJIT_INLINE void set_emit_enter(struct sljit_compiler *compiler,
 	sljit_s32 options, sljit_s32 args, sljit_s32 scratches, sljit_s32 saveds,
diff --git a/sljit_src/sljitNativeS390X.c b/sljit_src/sljitNativeS390X.c
index 8b3c8c521..e61783228 100644
--- a/sljit_src/sljitNativeS390X.c
+++ b/sljit_src/sljitNativeS390X.c
@@ -42,30 +42,52 @@ SLJIT_API_FUNC_ATTRIBUTE const char* sljit_get_platform_name(void)
 typedef sljit_uw sljit_ins;
 
 /* Instruction tags (most significant halfword). */
-const sljit_ins sljit_ins_const = (sljit_ins)(1) << 48;
+const sljit_ins sljit_ins_const = (sljit_ins)1 << 48;
+
+static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 4] = {
+	14, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 0, 1
+};
 
 /* General Purpose Registers [0-15]. */
 typedef sljit_uw sljit_gpr;
-const sljit_gpr r0 = 0; /* 0 in address calculations; reserved */
-const sljit_gpr r1 = 1; /* reserved */
-const sljit_gpr r2 = 2; /* 1st argument */
-const sljit_gpr r3 = 3; /* 2nd argument */
-const sljit_gpr r4 = 4; /* 3rd argument */
-const sljit_gpr r5 = 5; /* 4th argument */
-const sljit_gpr r6 = 6; /* 5th argument; 1st saved register */
-const sljit_gpr r7 = 7;
-const sljit_gpr r8 = 8;
-const sljit_gpr r9 = 9;
-const sljit_gpr r10 = 10;
-const sljit_gpr r11 = 11;
-const sljit_gpr r12 = 12;
-const sljit_gpr r13 = 13;
-const sljit_gpr r14 = 14; /* return address and flag register */
-const sljit_gpr r15 = 15; /* stack pointer */
-
-const sljit_gpr tmp0 = 0; /* r0 */
-const sljit_gpr tmp1 = 1; /* r1 */
 
+/*
+ * WARNING
+ * the following code is non standard and should be improved for
+ * consistency, but doesn't use SLJIT_NUMBER_OF_REGISTERS based
+ * registers because r0 and r1 are the ABI recommended volatiles.
+ * there is a gpr() function that maps sljit to physical register numbers
+ * that should be used instead of the usual index into reg_map[] and
+ * will be retired ASAP (TODO: carenas)
+ */
+const sljit_gpr r0 = 0;	/* reg_map[SLJIT_NUMBER_OF_REGISTERS + 2]: 0 in address calculations; reserved */
+const sljit_gpr r1 = 1;	/* reg_map[SLJIT_NUMBER_OF_REGISTERS + 3]: reserved */
+const sljit_gpr r2 = 2;	/* reg_map[1]: 1st argument */
+const sljit_gpr r3 = 3;	/* reg_map[2]: 2nd argument */
+const sljit_gpr r4 = 4;	/* reg_map[3]: 3rd argument */
+const sljit_gpr r5 = 5;	/* reg_map[4]: 4th argument */
+const sljit_gpr r6 = 6;	/* reg_map[5]: 5th argument; 1st saved register */
+const sljit_gpr r7 = 7;	/* reg_map[6] */
+const sljit_gpr r8 = 8;	/* reg_map[7] */
+const sljit_gpr r9 = 9;	/* reg_map[8] */
+const sljit_gpr r10 = 10;	/* reg_map[9] */
+const sljit_gpr r11 = 11;	/* reg_map[10] */
+const sljit_gpr r12 = 12;	/* reg_map[11]: GOT */
+const sljit_gpr r13 = 13;	/* reg_map[12]: Literal Pool pointer */
+const sljit_gpr r14 = 14;	/* reg_map[0]: return address and flag register */
+const sljit_gpr r15 = 15;	/* reg_map[SLJIT_NUMBER_OF_REGISTERS + 1]: stack pointer */
+
+/* WARNING: r12 and r13 shouldn't be used as per ABI recommendation */
+/* TODO(carenas): r12 might conflict in PIC code, reserve? */
+/* TODO(carenas): r13 is usually pointed to "pool" per ABI, using a tmp
+ *                like we do now might be faster though, reserve?
+ */
+
+/* TODO(carenas): should be named TMP_REG[1-2] for consistency */
+#define tmp0	r0
+#define tmp1	r1
+
+/* TODO(carenas): flags should move, there is also a[2-15] available */
 /* Link registers. The normal link register is r14, but since
    we use that for flags we need to use r0 instead to do fast
    calls so that flags are preserved. */
@@ -81,32 +103,35 @@ const sljit_gpr fast_link_r = 0; /* r0 */
 */
 const sljit_gpr flag_r = 14; /* r14 */
 
-static const sljit_gpr reg_map[SLJIT_NUMBER_OF_REGISTERS + 1] = {
-	2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15
-};
-
 struct sljit_s390x_const {
 	struct sljit_const const_; /* must be first */
 	sljit_sw init_value;       /* required to build literal pool */
 };
 
 /* Convert SLJIT register to hardware register. */
-static sljit_gpr gpr(sljit_s32 r)
+static SLJIT_INLINE sljit_gpr gpr(sljit_s32 r)
 {
 	SLJIT_ASSERT(r != SLJIT_UNUSED);
-	r -= SLJIT_R0; /* normalize */
-	SLJIT_ASSERT(r < (sizeof(reg_map) / sizeof(reg_map[0])));
+	SLJIT_ASSERT(r < (sljit_s32)(sizeof(reg_map) / sizeof(reg_map[0])));
 	return reg_map[r];
 }
 
 /* Size of instruction in bytes. Tags must already be cleared. */
-static sljit_uw sizeof_ins(sljit_ins ins)
+static SLJIT_INLINE sljit_uw sizeof_ins(sljit_ins ins)
 {
-	if (ins == 0) return 2; /* keep faulting instructions */
-	if ((ins&0x00000000ffff) == ins) return 2;
-	if ((ins&0x0000ffffffff) == ins) return 4;
-	if ((ins&0xffffffffffff) == ins) return 6;
-	abort();
+	/* keep faulting instructions */
+	if (ins == 0)
+		 return 2;
+
+	if ((ins & 0x00000000ffffL) == ins)
+		 return 2;
+	if ((ins & 0x0000ffffffffL) == ins)
+		 return 4;
+	if ((ins & 0xffffffffffffL) == ins)
+		 return 6;
+
+	SLJIT_UNREACHABLE();
+	return (sljit_uw)-1;
 }
 
 static sljit_s32 push_inst(struct sljit_compiler *compiler, sljit_ins ins)
@@ -122,64 +147,71 @@ static sljit_s32 encode_inst(void **ptr, sljit_ins ins)
 {
 	sljit_u16 *ibuf = (sljit_u16 *)*ptr;
 	sljit_uw size = sizeof_ins(ins);
-	SLJIT_ASSERT((size&6) == size);
+
+	SLJIT_ASSERT((size & 6) == size);
 	switch (size) {
-	case 6: *ibuf++ = (sljit_u16)(ins >> 32);
-	case 4: *ibuf++ = (sljit_u16)(ins >> 16);
-	case 2: *ibuf++ = (sljit_u16)(ins);
+	case 6:
+		*ibuf++ = (sljit_u16)(ins >> 32);
+		/* fallthrough */
+	case 4:
+		*ibuf++ = (sljit_u16)(ins >> 16);
+		/* fallthrough */
+	case 2:
+		*ibuf++ = (sljit_u16)(ins);
 	}
 	*ptr = (void*)ibuf;
 	return SLJIT_SUCCESS;
 }
 
 /* Map the given type to a 4-bit condition code mask. */
-static sljit_uw get_cc(sljit_s32 type) {
-	const sljit_uw eq = 1 << 3; /* equal {,to zero} */
-	const sljit_uw lt = 1 << 2; /* less than {,zero} */
-	const sljit_uw gt = 1 << 1; /* greater than {,zero} */
-	const sljit_uw ov = 1 << 0; /* {overflow,NaN} */
-	const sljit_uw mask = 0xf;
+static SLJIT_INLINE sljit_u8 get_cc(sljit_s32 type) {
+	const sljit_u8 eq = 1 << 3; /* equal {,to zero} */
+	const sljit_u8 lt = 1 << 2; /* less than {,zero} */
+	const sljit_u8 gt = 1 << 1; /* greater than {,zero} */
+	const sljit_u8 ov = 1 << 0; /* {overflow,NaN} */
 
 	switch (type) {
 	case SLJIT_EQUAL:
 	case SLJIT_EQUAL_F64:
-		return mask & eq;
+		return eq;
 
 	case SLJIT_NOT_EQUAL:
 	case SLJIT_NOT_EQUAL_F64:
-		return mask & ~eq;
+		return ~eq;
 
 	case SLJIT_LESS:
 	case SLJIT_SIG_LESS:
 	case SLJIT_LESS_F64:
-		return mask & lt;
+		return lt;
 
 	case SLJIT_LESS_EQUAL:
 	case SLJIT_SIG_LESS_EQUAL:
 	case SLJIT_LESS_EQUAL_F64:
-		return mask & (lt | eq);
+		return (lt | eq);
 
 	case SLJIT_GREATER:
 	case SLJIT_SIG_GREATER:
 	case SLJIT_GREATER_F64:
-		return mask & gt;
+		return gt;
 
 	case SLJIT_GREATER_EQUAL:
 	case SLJIT_SIG_GREATER_EQUAL:
 	case SLJIT_GREATER_EQUAL_F64:
-		return mask & (gt | eq);
+		return (gt | eq);
 
 	case SLJIT_OVERFLOW:
 	case SLJIT_MUL_OVERFLOW:
 	case SLJIT_UNORDERED_F64:
-		return mask & ov;
+		return ov;
 
 	case SLJIT_NOT_OVERFLOW:
 	case SLJIT_MUL_NOT_OVERFLOW:
 	case SLJIT_ORDERED_F64:
-		return mask & ~ov;
+		return ~ov;
 	}
+
 	SLJIT_UNREACHABLE();
+	return (sljit_u8)-1;
 }
 
 /* Facility to bit index mappings.
@@ -204,7 +236,7 @@ typedef sljit_uw facility_bit;
    value given a constant argument. */
 static SLJIT_INLINE int have_facility_static(facility_bit x)
 {
-#if ENABLE_STATIC_FACILITY_DETECTION != 0
+#if ENABLE_STATIC_FACILITY_DETECTION
 	switch (x) {
 	case FAST_LONG_DISPLACEMENT_FACILITY:
 		return (__ARCH__ >=  6 /* z990 */);
@@ -223,15 +255,17 @@ static SLJIT_INLINE int have_facility_static(facility_bit x)
 	case MISCELLANEOUS_INSTRUCTION_EXTENSIONS_2_FACILITY:
 	case VECTOR_ENHANCEMENTS_1_FACILITY:
 		return (__ARCH__ >= 12 /* z14 */);
+	default:
+		SLJIT_UNREACHABLE();
 	}
 #endif
 	return 0;
 }
 
-static SLJIT_INLINE long get_hwcap()
+static SLJIT_INLINE unsigned long get_hwcap()
 {
-	static long hwcap = 0;
-	if (SLJIT_UNLIKELY(hwcap == 0)) {
+	static unsigned long hwcap = 0;
+	if (SLJIT_UNLIKELY(!hwcap)) {
 		hwcap = getauxval(AT_HWCAP);
 		SLJIT_ASSERT(hwcap != 0);
 	}
@@ -240,10 +274,10 @@ static SLJIT_INLINE long get_hwcap()
 
 static SLJIT_INLINE int have_stfle()
 {
-	if (have_facility_static(STORE_FACILITY_LIST_EXTENDED_FACILITY)) {
+	if (have_facility_static(STORE_FACILITY_LIST_EXTENDED_FACILITY))
 		return 1;
-	}
-	return SLJIT_LIKELY((HWCAP_S390_STFLE & get_hwcap()) != 0);
+
+	return (get_hwcap() & HWCAP_S390_STFLE);
 }
 
 /* Report whether the given facility is available. This function always
@@ -255,10 +289,13 @@ static int have_facility_dynamic(facility_bit x)
 		sljit_uw bits[4];
 	} cpu_features;
 	size_t size = sizeof(cpu_features);
-	SLJIT_ASSERT(sizeof(cpu_features) >= ((x+7)/8));
-	if (SLJIT_UNLIKELY(!have_stfle())) {
+	const sljit_uw word_index = x >> 6;
+	const sljit_uw bit_index = ((1UL << 63) >> (x & 63));
+
+	SLJIT_ASSERT(x < size * 8);
+	if (SLJIT_UNLIKELY(!have_stfle()))
 		return 0;
-	}
+
 	if (SLJIT_UNLIKELY(cpu_features.bits[0] == 0)) {
 		__asm__ __volatile__ (
 			"lgr   %%r0, %0;"
@@ -269,8 +306,6 @@ static int have_facility_dynamic(facility_bit x)
 		);
 		SLJIT_ASSERT(cpu_features.bits[0] != 0);
 	}
-	const sljit_uw word_index = x / 64;
-	const sljit_uw bit_index = ((1UL << 63) >> (x & 63));
 	return (cpu_features.bits[word_index] & bit_index) != 0;
 #else
 	return 0;
@@ -282,14 +317,14 @@ static SLJIT_INLINE int name() \
 { \
 	static int have = -1; \
 	/* Static check first. May allow the function to be optimized away. */ \
-	if (have_facility_static(bit)) { \
-		return 1; \
-	} \
-	if (SLJIT_UNLIKELY(have < 0)) { \
+	if (have_facility_static(bit)) \
+		have = 1; \
+	else if (SLJIT_UNLIKELY(have < 0)) \
 		have = have_facility_dynamic(bit) ? 1 : 0; \
-	} \
-	return SLJIT_LIKELY(have); \
+\
+	return have; \
 }
+
 HAVE_FACILITY(have_eimm,    EXTENDED_IMMEDIATE_FACILITY)
 HAVE_FACILITY(have_ldisp,   FAST_LONG_DISPLACEMENT_FACILITY)
 HAVE_FACILITY(have_genext,  GENERAL_INSTRUCTION_EXTENSION_FACILITY)
@@ -298,32 +333,34 @@ HAVE_FACILITY(have_lscond2, LOAD_STORE_ON_CONDITION_2_FACILITY)
 HAVE_FACILITY(have_misc2,   MISCELLANEOUS_INSTRUCTION_EXTENSIONS_2_FACILITY)
 #undef HAVE_FACILITY
 
+#define is_u12(d)	(0 <= (d) && (d) <= 0x00000fffL)
+#define is_u32(d)	(0 <= (d) && (d) <= 0xffffffffL)
+
 #define CHECK_SIGNED(v, bitlen) \
-	((v) == (((v) << (sizeof(v)*8 - bitlen)) >> (sizeof(v)*8 - bitlen)))
-static int is_s16(sljit_sw d) { return CHECK_SIGNED(d, 16); }
-static int is_s20(sljit_sw d) { return CHECK_SIGNED(d, 20); }
-static int is_s32(sljit_sw d) { return CHECK_SIGNED(d, 32); }
-#undef CHECK_SIGNED
+	((v) == (((v) << (sizeof(v) * 8 - bitlen)) >> (sizeof(v) * 8 - bitlen)))
 
-static int is_u12(sljit_sw d) { return 0 <= d && d <= 0x00000fffL; }
-static int is_u32(sljit_sw d) { return 0 <= d && d <= 0xffffffffL; }
+#define is_s16(d)	CHECK_SIGNED((d), 16)
+#define is_s20(d)	CHECK_SIGNED((d), 20)
+#define is_s32(d)	CHECK_SIGNED((d), 32)
 
-static sljit_uw disp_s20(sljit_s32 d)
+static SLJIT_INLINE sljit_uw disp_s20(sljit_s32 d)
 {
-	SLJIT_ASSERT(is_s20(d));
 	sljit_uw dh = (d >> 12) & 0xff;
 	sljit_uw dl = (d << 8) & 0xfff00;
+
+	SLJIT_ASSERT(is_s20(d));
 	return dh | dl;
 }
 
+/* TODO(carenas): variadic macro is not strictly needed */
 #define SLJIT_S390X_INSTRUCTION(op, ...) \
-static sljit_ins op(__VA_ARGS__)
+static SLJIT_INLINE sljit_ins op(__VA_ARGS__)
 
 /* RR form instructions. */
 #define SLJIT_S390X_RR(name, pattern) \
 SLJIT_S390X_INSTRUCTION(name, sljit_gpr dst, sljit_gpr src) \
 { \
-	return pattern | ((dst&0xf)<<4) | (src&0xf); \
+	return (pattern) | ((dst & 0xf) << 4) | (src & 0xf); \
 }
 
 /* ADD */
@@ -374,7 +411,7 @@ SLJIT_S390X_RR(slr,  0x1f00)
 #define SLJIT_S390X_RRE(name, pattern) \
 SLJIT_S390X_INSTRUCTION(name, sljit_gpr dst, sljit_gpr src) \
 { \
-	return pattern | ((dst&0xf)<<4) | (src&0xf); \
+	return (pattern) | ((dst & 0xf) << 4) | (src & 0xf); \
 }
 
 /* ADD */
@@ -459,7 +496,7 @@ SLJIT_S390X_RRE(slbgr, 0xb9890000)
 #define SLJIT_S390X_RIA(name, pattern, imm_type) \
 SLJIT_S390X_INSTRUCTION(name, sljit_gpr reg, imm_type imm) \
 { \
-	return pattern | ((reg&0xf) << 20) | (imm&0xffff); \
+	return (pattern) | ((reg & 0xf) << 20) | (imm & 0xffff); \
 }
 
 /* ADD HALFWORD IMMEDIATE */
@@ -497,7 +534,7 @@ SLJIT_S390X_RIA(tmlh,  0xa7000000, sljit_u16)
 SLJIT_S390X_INSTRUCTION(name, sljit_gpr reg, imm_type imm) \
 { \
 	SLJIT_ASSERT(have_eimm()); \
-	return pattern | ((sljit_ins)(reg&0xf) << 36) | (imm&0xffffffff); \
+	return (pattern) | ((sljit_ins)(reg & 0xf) << 36) | (imm & 0xffffffff); \
 }
 
 /* ADD IMMEDIATE */
@@ -549,12 +586,15 @@ SLJIT_S390X_RILA(oilf,  0xc00d00000000, sljit_u32)
 #define SLJIT_S390X_RXA(name, pattern) \
 SLJIT_S390X_INSTRUCTION(name, sljit_gpr r, sljit_u16 d, sljit_gpr x, sljit_gpr b) \
 { \
-	SLJIT_ASSERT((d&0xfff) == d); \
-	sljit_ins ri = (sljit_ins)(r&0xf) << 20; \
-	sljit_ins xi = (sljit_ins)(x&0xf) << 16; \
-	sljit_ins bi = (sljit_ins)(b&0xf) << 12; \
-	sljit_ins di = (sljit_ins)(d&0xfff); \
-	return pattern | ri | xi | bi | di; \
+	sljit_ins ri, xi, bi, di; \
+\
+	SLJIT_ASSERT((d & 0xfff) == d); \
+	ri = (sljit_ins)(r & 0xf) << 20; \
+	xi = (sljit_ins)(x & 0xf) << 16; \
+	bi = (sljit_ins)(b & 0xf) << 12; \
+	di = (sljit_ins)(d & 0xfff); \
+\
+	return (pattern) | ri | xi | bi | di; \
 }
 
 /* ADD */
@@ -605,12 +645,15 @@ SLJIT_S390X_RXA(sl,  0x5f000000)
 #define SLJIT_S390X_RXYA(name, pattern, cond) \
 SLJIT_S390X_INSTRUCTION(name, sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b) \
 { \
+	sljit_ins ri, xi, bi, di; \
+\
 	SLJIT_ASSERT(cond); \
-	sljit_ins ri = (sljit_ins)(r&0xf) << 36; \
-	sljit_ins xi = (sljit_ins)(x&0xf) << 32; \
-	sljit_ins bi = (sljit_ins)(b&0xf) << 28; \
-	sljit_ins di = (sljit_ins)disp_s20(d) << 8; \
-	return pattern | ri | xi | bi | di; \
+	ri = (sljit_ins)(r & 0xf) << 36; \
+	xi = (sljit_ins)(x & 0xf) << 32; \
+	bi = (sljit_ins)(b & 0xf) << 28; \
+	di = (sljit_ins)disp_s20(d) << 8; \
+\
+	return (pattern) | ri | xi | bi | di; \
 }
 
 /* ADD */
@@ -693,10 +736,10 @@ SLJIT_S390X_RXYA(slbg,  0xe30000000089, 1)
 #define SLJIT_S390X_RSA(name, pattern) \
 SLJIT_S390X_INSTRUCTION(name, sljit_gpr reg, sljit_sw d, sljit_gpr b) \
 { \
-	sljit_ins r1 = (sljit_ins)(reg&0xf) << 20; \
-	sljit_ins b2 = (sljit_ins)(b&0xf) << 12; \
-	sljit_ins d2 = (sljit_ins)(d&0xfff); \
-	return pattern | r1 | b2 | d2; \
+	sljit_ins r1 = (sljit_ins)(reg & 0xf) << 20; \
+	sljit_ins b2 = (sljit_ins)(b & 0xf) << 12; \
+	sljit_ins d2 = (sljit_ins)(d & 0xfff); \
+	return (pattern) | r1 | b2 | d2; \
 }
 
 /* SHIFT LEFT SINGLE LOGICAL */
@@ -714,12 +757,15 @@ SLJIT_S390X_RSA(srl, 0x88000000)
 #define SLJIT_S390X_RSYA(name, pattern, cond) \
 SLJIT_S390X_INSTRUCTION(name, sljit_gpr dst, sljit_gpr src, sljit_sw d, sljit_gpr b) \
 { \
+	sljit_ins r1, r3, b2, d2; \
+\
 	SLJIT_ASSERT(cond); \
-	sljit_ins r1 = (sljit_ins)(dst&0xf) << 36; \
-	sljit_ins r3 = (sljit_ins)(src&0xf) << 32; \
-	sljit_ins b2 = (sljit_ins)(b&0xf) << 28; \
-	sljit_ins d2 = (sljit_ins)disp_s20(d) << 8; \
-	return pattern | r1 | r3 | b2 | d2; \
+	r1 = (sljit_ins)(dst & 0xf) << 36; \
+	r3 = (sljit_ins)(src & 0xf) << 32; \
+	b2 = (sljit_ins)(b & 0xf) << 28; \
+	d2 = (sljit_ins)disp_s20(d) << 8; \
+\
+	return (pattern) | r1 | r3 | b2 | d2; \
 }
 
 /* LOAD MULTIPLE */
@@ -743,13 +789,16 @@ SLJIT_S390X_RSYA(stmg,  0xeb0000000024, 1)
 #define SLJIT_S390X_RIEF(name, pattern) \
 SLJIT_S390X_INSTRUCTION(name, sljit_gpr dst, sljit_gpr src, sljit_u8 start, sljit_u8 end, sljit_u8 rot) \
 { \
+	sljit_ins r1, r2, i3, i4, i5; \
+\
 	SLJIT_ASSERT(have_genext()); \
-	sljit_ins r1 = (sljit_ins)(dst&0xf) << 36; \
-	sljit_ins r2 = (sljit_ins)(src&0xf) << 32; \
-	sljit_ins i3 = (sljit_ins)(start) << 24; \
-	sljit_ins i4 = (sljit_ins)(end) << 16; \
-	sljit_ins i5 = (sljit_ins)(rot) << 8; \
-	return pattern | r1 | r2 | i3 | i4 | i5; \
+	r1 = (sljit_ins)(dst & 0xf) << 36; \
+	r2 = (sljit_ins)(src & 0xf) << 32; \
+	i3 = (sljit_ins)start << 24; \
+	i4 = (sljit_ins)end << 16; \
+	i5 = (sljit_ins)rot << 8; \
+\
+	return (pattern) | r1 | r2 | i3 | i4 | i5; \
 }
 
 /* ROTATE THEN AND SELECTED BITS */
@@ -777,11 +826,14 @@ SLJIT_S390X_RIEF(risbhg, 0xec000000005d)
 #define SLJIT_S390X_RRFA(name, pattern, cond) \
 SLJIT_S390X_INSTRUCTION(name, sljit_gpr dst, sljit_gpr src1, sljit_gpr src2) \
 { \
+	sljit_ins r1, r2, r3; \
+\
 	SLJIT_ASSERT(cond); \
-	sljit_ins r1 = (sljit_ins)(dst&0xf) << 4; \
-	sljit_ins r2 = (sljit_ins)(src1&0xf); \
-	sljit_ins r3 = (sljit_ins)(src2&0xf) << 12; \
-	return pattern | r3 | r1 | r2; \
+	r1 = (sljit_ins)(dst & 0xf) << 4; \
+	r2 = (sljit_ins)(src1 & 0xf); \
+	r3 = (sljit_ins)(src2 & 0xf) << 12; \
+\
+	return (pattern) | r3 | r1 | r2; \
 }
 
 /* MULTIPLY */
@@ -794,11 +846,14 @@ SLJIT_S390X_RRFA(msgrkc, 0xb9ed0000, have_misc2())
 #define SLJIT_S390X_RRFC(name, pattern) \
 SLJIT_S390X_INSTRUCTION(name, sljit_gpr dst, sljit_gpr src, sljit_uw mask) \
 { \
+	sljit_ins r1, r2, m3; \
+\
 	SLJIT_ASSERT(have_lscond1()); \
-	sljit_ins r1 = (sljit_ins)(dst&0xf) << 4; \
-	sljit_ins r2 = (sljit_ins)(src&0xf); \
-	sljit_ins m3 = (sljit_ins)(mask&0xf) << 12; \
-	return pattern | m3 | r1 | r2; \
+	r1 = (sljit_ins)(dst & 0xf) << 4; \
+	r2 = (sljit_ins)(src & 0xf); \
+	m3 = (sljit_ins)(mask & 0xf) << 12; \
+\
+	return (pattern) | m3 | r1 | r2; \
 }
 
 /* LOAD HALFWORD IMMEDIATE ON CONDITION */
@@ -811,11 +866,14 @@ SLJIT_S390X_RRFC(locgr, 0xb9e20000)
 #define SLJIT_S390X_RIEG(name, pattern) \
 SLJIT_S390X_INSTRUCTION(name, sljit_gpr reg, sljit_sw imm, sljit_uw mask) \
 { \
+	sljit_ins r1, m3, i2; \
+\
 	SLJIT_ASSERT(have_lscond2()); \
-	sljit_ins r1 = (sljit_ins)(reg&0xf) << 36; \
-	sljit_ins m3 = (sljit_ins)(mask&0xf) << 32; \
-	sljit_ins i2 = (sljit_ins)(imm&0xffffL) << 16; \
-	return pattern | r1 | m3 | i2; \
+	r1 = (sljit_ins)(reg & 0xf) << 36; \
+	m3 = (sljit_ins)(mask & 0xf) << 32; \
+	i2 = (sljit_ins)(imm & 0xffffL) << 16; \
+\
+	return (pattern) | r1 | m3 | i2; \
 }
 
 /* LOAD HALFWORD IMMEDIATE ON CONDITION */
@@ -827,20 +885,23 @@ SLJIT_S390X_RIEG(locghi, 0xec0000000046)
 #define SLJIT_S390X_RILB(name, pattern, cond) \
 SLJIT_S390X_INSTRUCTION(name, sljit_gpr reg, sljit_sw ri) \
 { \
+	sljit_ins r1, ri2; \
+\
 	SLJIT_ASSERT(cond); \
-	sljit_ins r1 = (sljit_ins)(reg&0xf) << 36; \
-	sljit_ins ri2 = (sljit_ins)(ri&0xffffffff); \
-	return pattern | r1 | ri2; \
+	r1 = (sljit_ins)(reg & 0xf) << 36; \
+	ri2 = (sljit_ins)(ri & 0xffffffff); \
+\
+	return (pattern) | r1 | ri2; \
 }
 
 /* BRANCH RELATIVE AND SAVE LONG */
-SLJIT_S390X_RILB(brasl, 0xc00500000000, 1);
+SLJIT_S390X_RILB(brasl, 0xc00500000000, 1)
 
 /* LOAD ADDRESS RELATIVE LONG */
-SLJIT_S390X_RILB(larl,  0xc00000000000, 1);
+SLJIT_S390X_RILB(larl,  0xc00000000000, 1)
 
 /* LOAD RELATIVE LONG */
-SLJIT_S390X_RILB(lgrl,  0xc40800000000, have_genext());
+SLJIT_S390X_RILB(lgrl,  0xc40800000000, have_genext())
 
 #undef SLJIT_S390X_RILB
 
@@ -851,23 +912,23 @@ SLJIT_S390X_INSTRUCTION(br, sljit_gpr target)
 
 SLJIT_S390X_INSTRUCTION(brcl, sljit_uw mask, sljit_sw target)
 {
-	sljit_ins m1 = (sljit_ins)(mask&0xf) << 36;
-	sljit_ins ri2 = (sljit_ins)target&0xffffffff;
-	return 0xc00400000000 | m1 | ri2;
+	sljit_ins m1 = (sljit_ins)(mask & 0xf) << 36;
+	sljit_ins ri2 = (sljit_ins)target & 0xffffffff;
+	return 0xc00400000000L | m1 | ri2;
 }
 
 SLJIT_S390X_INSTRUCTION(flogr, sljit_gpr dst, sljit_gpr src)
 {
+	sljit_ins r1 = ((sljit_ins)dst & 0xf) << 8;
+	sljit_ins r2 = ((sljit_ins)src & 0xf);
 	SLJIT_ASSERT(have_eimm());
-	sljit_ins r1 = ((sljit_ins)(dst)&0xf) << 8;
-	sljit_ins r2 = ((sljit_ins)(src)&0xf);
 	return 0xb9830000 | r1 | r2;
 }
 
 /* INSERT PROGRAM MASK */
 SLJIT_S390X_INSTRUCTION(ipm, sljit_gpr dst)
 {
-	return 0xb2220000 | ((sljit_ins)(dst&0xf) << 4);
+	return 0xb2220000 | ((sljit_ins)(dst & 0xf) << 4);
 }
 
 /* ROTATE THEN INSERT SELECTED BITS HIGH (ZERO) */
@@ -885,10 +946,10 @@ static sljit_s32 push_load_cc(struct sljit_compiler *compiler, sljit_s32 type)
 	switch (type) {
 	case SLJIT_ZERO:
 	case SLJIT_NOT_ZERO:
-		FAIL_IF(push_inst(compiler, cih(flag_r, 0)));
+		return push_inst(compiler, cih(flag_r, 0));
 		break;
 	default:
-		FAIL_IF(push_inst(compiler, tmlh(flag_r, 0x3000)));
+		return push_inst(compiler, tmlh(flag_r, 0x3000));
 		break;
 	}
 	return SLJIT_SUCCESS;
@@ -900,7 +961,7 @@ static sljit_s32 push_store_zero_flag(struct sljit_compiler *compiler, sljit_s32
 	FAIL_IF(push_inst(compiler, risbhgz(flag_r, source, 0, 31, 32)));
 	if (!(op & SLJIT_I32_OP)) {
 		/* OR high 32-bits with high 32-bits of flag register */
-		FAIL_IF(push_inst(compiler, rosbg(flag_r, source, 0, 31, 0)));
+		return push_inst(compiler, rosbg(flag_r, source, 0, 31, 0));
 	}
 	return SLJIT_SUCCESS;
 }
@@ -909,35 +970,34 @@ static sljit_s32 push_store_zero_flag(struct sljit_compiler *compiler, sljit_s32
 static sljit_s32 push_load_imm_inst(struct sljit_compiler *compiler, sljit_gpr target, sljit_sw v)
 {
 	/* 4 byte instructions */
-	if (v == ((v << 48)>>48)) {
+	if (is_s16(v))
 		return push_inst(compiler, lghi(target, (sljit_s16)v));
-	}
-	if (v == (v & 0x000000000000ffff)) {
-		return push_inst(compiler, llill(target, (sljit_u16)(v)));
-	}
-	if (v == (v & 0x00000000ffff0000)) {
-		return push_inst(compiler, llilh(target, (sljit_u16)(v>>16)));
-	}
-	if (v == (v & 0x0000ffff00000000)) {
-		return push_inst(compiler, llihl(target, (sljit_u16)(v>>32)));
-	}
-	if (v == (v & 0xffff000000000000)) {
-		return push_inst(compiler, llihh(target, (sljit_u16)(v>>48)));
-	}
+
+	if ((sljit_uw)v == (v & 0x000000000000ffffU))
+		return push_inst(compiler, llill(target, (sljit_u16)v));
+
+	if ((sljit_uw)v == (v & 0x00000000ffff0000U))
+		return push_inst(compiler, llilh(target, (sljit_u16)(v >> 16)));
+
+	if ((sljit_uw)v == (v & 0x0000ffff00000000U))
+		return push_inst(compiler, llihl(target, (sljit_u16)(v >> 32)));
+
+	if ((sljit_uw)v == (v & 0xffff000000000000U))
+		return push_inst(compiler, llihh(target, (sljit_u16)(v >> 48)));
 
 	/* 6 byte instructions (requires extended immediate facility) */
 	if (have_eimm()) {
-		if (v == ((v << 32)>>32)) {
+		if (is_s32(v))
 			return push_inst(compiler, lgfi(target, (sljit_s32)v));
-		}
-		if (v == (v & 0x00000000ffffffff)) {
-			return push_inst(compiler, llilf(target, (sljit_u32)(v)));
-		}
-		if (v == (v & 0xffffffff00000000)) {
-			return push_inst(compiler, llihf(target, (sljit_u32)(v>>32)));
-		}
-		FAIL_IF(push_inst(compiler, llilf(target, (sljit_u32)(v))));
-		return push_inst(compiler, iihf(target, (sljit_u32)(v>>32)));
+
+		if ((sljit_uw)v == (v & 0x00000000ffffffffU))
+			return push_inst(compiler, llilf(target, (sljit_u32)v));
+
+		if ((sljit_uw)v == (v & 0xffffffff00000000U))
+			return push_inst(compiler, llihf(target, (sljit_u32)(v >> 32)));
+
+		FAIL_IF(push_inst(compiler, llilf(target, (sljit_u32)v)));
+		return push_inst(compiler, iihf(target, (sljit_u32)(v >> 32)));
 	}
 	/* TODO(mundaym): instruction sequences that don't use extended immediates */
 	abort();
@@ -950,18 +1010,17 @@ struct addr {
 };
 
 /* transform memory operand into D(X,B) form with a signed 20-bit offset */
-static sljit_s32 make_addr_bxy(
-	struct sljit_compiler *compiler,
-	struct addr *addr,
-	sljit_s32 mem, sljit_sw off,
-	sljit_gpr tmp /* clobbered, must not be R0 */)
+static sljit_s32 make_addr_bxy(struct sljit_compiler *compiler,
+	struct addr *addr, sljit_s32 mem, sljit_sw off,
+	sljit_gpr tmp /* clobbered, must not be r0 */)
 {
-	SLJIT_ASSERT(tmp != r0);
 	sljit_gpr base = r0;
-	if (mem & REG_MASK) {
-		base = gpr(mem & REG_MASK);
-	}
 	sljit_gpr index = r0;
+
+	SLJIT_ASSERT(tmp != r0);
+	if (mem & REG_MASK)
+		base = gpr(mem & REG_MASK);
+
 	if (mem & OFFS_REG_MASK) {
 		index = gpr(OFFS_REG(mem));
 		if (off != 0) {
@@ -971,32 +1030,30 @@ static sljit_s32 make_addr_bxy(
 			index = tmp;
 			off = 0; /* clear offset */
 		}
-	} else if (!is_s20(off)) {
+	}
+	else if (!is_s20(off)) {
 		FAIL_IF(push_load_imm_inst(compiler, tmp, off));
 		index = tmp;
 		off = 0; /* clear offset */
 	}
-	*addr = (struct addr) {
-		.base = base,
-		.index = index,
-		.offset = off
-	};
+	addr->base = base;
+	addr->index = index;
+	addr->offset = off;
 	return SLJIT_SUCCESS;
 }
 
 /* transform memory operand into D(X,B) form with an unsigned 12-bit offset */
-static sljit_s32 make_addr_bx(
-	struct sljit_compiler *compiler,
-	struct addr *addr,
-	sljit_s32 mem, sljit_sw off,
-	sljit_gpr tmp /* clobbered, must not be R0 */)
+static sljit_s32 make_addr_bx(struct sljit_compiler *compiler,
+	struct addr *addr, sljit_s32 mem, sljit_sw off,
+	sljit_gpr tmp /* clobbered, must not be r0 */)
 {
-	SLJIT_ASSERT(tmp != r0);
 	sljit_gpr base = r0;
-	if (mem & REG_MASK) {
-		base = gpr(mem & REG_MASK);
-	}
 	sljit_gpr index = r0;
+
+	SLJIT_ASSERT(tmp != r0);
+	if (mem & REG_MASK)
+		base = gpr(mem & REG_MASK);
+
 	if (mem & OFFS_REG_MASK) {
 		index = gpr(OFFS_REG(mem));
 		if (off != 0) {
@@ -1006,38 +1063,40 @@ static sljit_s32 make_addr_bx(
 			index = tmp;
 			off = 0; /* clear offset */
 		}
-	} else if (!is_u12(off)) {
+	}
+	else if (!is_u12(off)) {
 		FAIL_IF(push_load_imm_inst(compiler, tmp, off));
 		index = tmp;
 		off = 0; /* clear offset */
 	}
-	*addr = (struct addr) {
-		.base = base,
-		.index = index,
-		.offset = off
-	};
+	addr->base = base;
+	addr->index = index;
+	addr->offset = off;
 	return SLJIT_SUCCESS;
 }
 
+#define WHEN(cond, r, i1, i2, addr) \
+	(cond) ? i1(r, addr.offset, addr.index, addr.base) \
+		: i2(r, addr.offset, addr.index, addr.base)
+
 static sljit_s32 load_word(struct sljit_compiler *compiler, sljit_gpr dst,
 		sljit_s32 src, sljit_sw srcw,
 		sljit_gpr tmp /* clobbered */, sljit_s32 is_32bit)
 {
-	SLJIT_ASSERT(src & SLJIT_MEM);
 	struct addr addr;
-	if (have_ldisp() || !is_32bit) {
+	sljit_ins ins;
+
+	SLJIT_ASSERT(src & SLJIT_MEM);
+	if (have_ldisp() || !is_32bit)
 		FAIL_IF(make_addr_bxy(compiler, &addr, src, srcw, tmp));
-	} else {
+	else
 		FAIL_IF(make_addr_bx(compiler, &addr, src, srcw, tmp));
-	}
-	sljit_ins ins = 0;
-	if (is_32bit) {
-		ins = is_u12(addr.offset) ?
-			l(dst, addr.offset, addr.index, addr.base) :
-			ly(dst, addr.offset, addr.index, addr.base);
-	} else {
+
+	if (is_32bit)
+		ins = WHEN(is_u12(addr.offset), dst, l, ly, addr);
+	else
 		ins = lg(dst, addr.offset, addr.index, addr.base);
-	}
+
 	return push_inst(compiler, ins);
 }
 
@@ -1045,47 +1104,71 @@ static sljit_s32 store_word(struct sljit_compiler *compiler, sljit_gpr src,
 		sljit_s32 dst, sljit_sw dstw,
 		sljit_gpr tmp /* clobbered */, sljit_s32 is_32bit)
 {
-	SLJIT_ASSERT(dst & SLJIT_MEM);
 	struct addr addr;
-	if (have_ldisp() || !is_32bit) {
+	sljit_ins ins;
+
+	SLJIT_ASSERT(dst & SLJIT_MEM);
+	if (have_ldisp() || !is_32bit)
 		FAIL_IF(make_addr_bxy(compiler, &addr, dst, dstw, tmp));
-	} else {
+	else
 		FAIL_IF(make_addr_bx(compiler, &addr, dst, dstw, tmp));
-	}
-	sljit_ins ins = 0;
-	if (is_32bit) {
-		ins = is_u12(addr.offset) ?
-			st(src, addr.offset, addr.index, addr.base) :
-			sty(src, addr.offset, addr.index, addr.base);
-	} else {
+
+	if (is_32bit)
+		ins = WHEN(is_u12(addr.offset), src, st, sty, addr);
+	else
 		ins = stg(src, addr.offset, addr.index, addr.base);
-	}
+
 	return push_inst(compiler, ins);
 }
 
+#undef WHEN
+
 SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compiler)
 {
+	struct sljit_label *label;
+	struct sljit_jump *jump;
+	struct sljit_s390x_const *const_;
+	struct sljit_put_label *put_label;
+	sljit_sw executable_offset;
+	sljit_uw ins_size = 0; /* instructions */
+	sljit_uw pool_size = 0; /* literal pool */
+	sljit_uw pad_size;
+	sljit_uw i, j = 0;
+	struct sljit_memory_fragment *buf;
+	void *code, *code_ptr;
+	sljit_uw *pool, *pool_ptr;
+
+	sljit_uw source;
+	sljit_sw offset; /* TODO(carenas): only need 32 bit */
+
 	CHECK_ERROR_PTR();
 	CHECK_PTR(check_sljit_generate_code(compiler));
 	reverse_buf(compiler);
 
 	/* branch handling */
-	struct sljit_label *label = compiler->labels;
-	struct sljit_jump *jump = compiler->jumps;
+	label = compiler->labels;
+	jump = compiler->jumps;
+	put_label = compiler->put_labels;
 
+	/* TODO(carenas): compiler->executable_size could be calculated
+         *                beforehand to avoid the following loop (except for
+         *                pool_size)
+         */
 	/* calculate the size of the code */
-	sljit_uw ins_size = 0;  /* instructions */
-	sljit_uw pool_size = 0; /* literal pool */
-	sljit_uw i, j = 0;
-	struct sljit_memory_fragment *buf = NULL;
 	for (buf = compiler->buf; buf != NULL; buf = buf->next) {
 		sljit_uw len = buf->used_size / sizeof(sljit_ins);
 		sljit_ins *ibuf = (sljit_ins *)buf->memory;
 		for (i = 0; i < len; ++i, ++j) {
-			/* TODO(mundaym): labels, jumps, constants... */
 			sljit_ins ins = ibuf[i];
+
+			/* TODO(carenas): instruction tag vs size/addr == j
+			 * using instruction tags for const is creative
+			 * but unlike all other architectures, and is not
+			 * done consistently for all other objects.
+			 * This might need reviewing later.
+			 */
 			if (ins & sljit_ins_const) {
-				pool_size += 8;
+				pool_size += sizeof(*pool);
 				ins &= ~sljit_ins_const;
 			}
 			if (label && label->size == j) {
@@ -1099,44 +1182,61 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compil
 					/* replace with: */
 					/*   lgrl %r1, <pool_addr> */
 					/*   bras %r14, %r1 (or bcr <mask>, %r1) */
-					pool_size += 8;
+					pool_size += sizeof(*pool);
 					ins_size += 2;
 				}
 				jump = jump->next;
 			}
+			if (put_label && put_label->addr == j) {
+				pool_size += sizeof(*pool);
+				put_label = put_label->next;
+			}
 			ins_size += sizeof_ins(ins);
 		}
-		/* emit trailing label */
-		if (label && label->size == j) {
-			label->size = ins_size;
-			label = label->next;
-		}
 	}
-	SLJIT_ASSERT(label == NULL);
-	SLJIT_ASSERT(jump == NULL);
 
-	/* pad code size to 8 bytes */
+	/* emit trailing label */
+	if (label && label->size == j) {
+		label->size = ins_size;
+		label = label->next;
+	}
+
+	SLJIT_ASSERT(!label);
+	SLJIT_ASSERT(!jump);
+	SLJIT_ASSERT(!put_label);
+
+	/* pad code size to 8 bytes so it is accessible with halfword offsets */
 	/* the literal pool needs to be doubleword aligned */
-	sljit_uw pad_size = (((ins_size)+7UL)&~7UL) - ins_size;
+	pad_size = ((ins_size + 7UL) & ~7UL) - ins_size;
 	SLJIT_ASSERT(pad_size < 8UL);
 
 	/* allocate target buffer */
-	void *code = SLJIT_MALLOC_EXEC(ins_size + pad_size + pool_size);
+	code = SLJIT_MALLOC_EXEC(ins_size + pad_size + pool_size,
+					compiler->exec_allocator_data);
 	PTR_FAIL_WITH_EXEC_IF(code);
-	void *code_ptr = code;
-	sljit_uw *pool = (sljit_uw *)((sljit_uw)(code) + ins_size + pad_size);
-	sljit_uw *pool_ptr = pool;
-	struct sljit_s390x_const *const_ = (struct sljit_s390x_const *)compiler->consts;
+	code_ptr = code;
+	executable_offset = SLJIT_EXEC_OFFSET(code);
+
+	/* TODO(carenas): the pool is optional, and the ABI recommends it be
+         *                created before the function code instead of
+         *                globally; if the generated code is too big the
+         *                offsets could exceed 32-bit words and trip the
+         *                assert()
+         */
+	pool = (sljit_uw *)((sljit_uw)code + ins_size + pad_size);
+	pool_ptr = pool;
+	const_ = (struct sljit_s390x_const *)compiler->consts;
 
 	/* update label addresses */
 	label = compiler->labels;
 	while (label) {
-		label->addr = (sljit_uw)code_ptr + label->size;
+		label->addr = (sljit_uw)SLJIT_ADD_EXEC_OFFSET(
+			(sljit_uw)code_ptr + label->size, executable_offset);
 		label = label->next;
 	}
 
 	/* reset jumps */
 	jump = compiler->jumps;
+	put_label = compiler->put_labels;
 
 	/* emit the code */
 	j = 0;
@@ -1144,19 +1244,18 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compil
 		sljit_uw len = buf->used_size / sizeof(sljit_ins);
 		sljit_ins *ibuf = (sljit_ins *)buf->memory;
 		for (i = 0; i < len; ++i, ++j) {
-			/* TODO(mundaym): labels, jumps, constants... */
 			sljit_ins ins = ibuf[i];
 			if (ins & sljit_ins_const) {
 				/* clear the const tag */
 				ins &= ~sljit_ins_const;
 
 				/* update instruction with relative address of constant */
-				sljit_uw pos = (sljit_uw)(code_ptr);
-				sljit_uw off = (sljit_uw)(pool_ptr) - pos;
-				SLJIT_ASSERT((off&1) == 0);
-				off >>= 1; /* halfword (not byte) offset */
-				SLJIT_ASSERT(is_s32(off));
-				ins |= (sljit_ins)(off&0xffffffff);
+				source = (sljit_uw)code_ptr;
+				offset = (sljit_uw)pool_ptr - source;
+				SLJIT_ASSERT(!(offset & 1));
+				offset >>= 1; /* halfword (not byte) offset */
+				SLJIT_ASSERT(is_s32(offset));
+				ins |= (sljit_ins)offset & 0xffffffff;
 
 				/* update address */
 				const_->const_.addr = (sljit_uw)pool_ptr;
@@ -1173,19 +1272,20 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compil
 					jump->addr = (sljit_uw)pool_ptr;
 
 					/* load address into tmp1 */
-					sljit_uw pos = (sljit_uw)(code_ptr);
-					sljit_uw off = (sljit_uw)(pool_ptr) - pos;
-					SLJIT_ASSERT((off&1) == 0);
-					off >>= 1; /* halfword (not byte) offset */
-					SLJIT_ASSERT(is_s32(off));
-					encode_inst(&code_ptr, lgrl(tmp1, off&0xffffffff));
+					source = (sljit_uw)SLJIT_ADD_EXEC_OFFSET(code_ptr, executable_offset);
+					offset = (sljit_uw)SLJIT_ADD_EXEC_OFFSET(pool_ptr, executable_offset) - source;
+					SLJIT_ASSERT(!(offset & 1));
+					offset >>= 1;
+					SLJIT_ASSERT(is_s32(offset));
+					encode_inst(&code_ptr,
+						lgrl(tmp1, offset & 0xffffffff));
 
 					/* store jump target into pool and update pool address */
 					*(pool_ptr++) = target;
 
 					/* branch to tmp1 */
-					sljit_ins op = (ins>>32)&0xf;
-					sljit_ins arg = (ins>>36)&0xf;
+					sljit_ins op = (ins >> 32) & 0xf;
+					sljit_ins arg = (ins >> 36) & 0xf;
 					switch (op) {
 					case 4: /* brcl -> bcr */
 						ins = bcr(arg, tmp1);
@@ -1196,11 +1296,11 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compil
 					default:
 						abort();
 					}
-				} else {
-					jump->addr = (sljit_uw)(code_ptr) + 2;
-					/* TODO(mundaym): handle executable offset? */
-					sljit_sw source = (sljit_sw)(code_ptr);
-					sljit_sw offset = target - source;
+				}
+				else {
+					jump->addr = (sljit_uw)code_ptr + 2;
+					source = (sljit_uw)SLJIT_ADD_EXEC_OFFSET(code_ptr, executable_offset);
+					offset = target - source;
 
 					/* offset must be halfword aligned */
 					SLJIT_ASSERT(!(offset & 1));
@@ -1208,22 +1308,41 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compil
 					SLJIT_ASSERT(is_s32(offset)); /* TODO(mundaym): handle arbitrary offsets */
 
 					/* patch jump target */
-					ins |= (sljit_ins)(offset)&0xffffffff;
+					ins |= (sljit_ins)offset & 0xffffffff;
 				}
 				jump = jump->next;
 			}
+			if (put_label && put_label->addr == j) {
+				source = (sljit_uw)SLJIT_ADD_EXEC_OFFSET(code_ptr, executable_offset);
+
+				SLJIT_ASSERT(put_label->label);
+				put_label->addr = (sljit_uw)code_ptr;
+
+				/* store target into pool */
+				*pool_ptr = put_label->label->addr;
+				offset = (sljit_uw)SLJIT_ADD_EXEC_OFFSET(pool_ptr, executable_offset) - source;
+				pool_ptr++;
+
+				SLJIT_ASSERT(!(offset & 1));
+				offset >>= 1;
+				SLJIT_ASSERT(is_s32(offset));
+				ins |= (sljit_ins)offset & 0xffffffff;
+
+				put_label = put_label->next;
+			}
 			encode_inst(&code_ptr, ins);
 		}
 	}
-	SLJIT_ASSERT(code + ins_size == code_ptr);
-	SLJIT_ASSERT(((void *)(pool) + pool_size) == (void *)pool_ptr);
+	SLJIT_ASSERT((sljit_u8 *)code + ins_size == code_ptr);
+	SLJIT_ASSERT((sljit_u8 *)pool + pool_size == (sljit_u8 *)pool_ptr);
 
 	compiler->error = SLJIT_ERR_COMPILED;
-	compiler->executable_offset = SLJIT_EXEC_OFFSET(code);
+	compiler->executable_offset = executable_offset;
 	compiler->executable_size = ins_size;
 	code = SLJIT_ADD_EXEC_OFFSET(code, executable_offset);
 	code_ptr = SLJIT_ADD_EXEC_OFFSET(code_ptr, executable_offset);
 	SLJIT_CACHE_FLUSH(code, code_ptr);
+	SLJIT_UPDATE_WX_FLAGS(code, code_ptr, 1);
 	return code;
 }
 
@@ -1249,27 +1368,29 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compi
 	sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds,
 	sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size)
 {
+	sljit_s32 args = get_arg_count(arg_types);
+	sljit_sw frame_size;
+
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size));
 	set_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size);
 
 	/* saved registers go in callee allocated save area */
-	compiler->local_size = (local_size+0xf)&~0xf;
-	sljit_sw frame_size = compiler->local_size + 160;
+	compiler->local_size = (local_size + 0xf) & ~0xf;
+	frame_size = compiler->local_size + SLJIT_S390X_DEFAULT_STACK_FRAME_SIZE;
 
-	FAIL_IF(push_inst(compiler, stmg(r6, r15, 48, r15))); /* save registers TODO(MGM): optimize */
+	FAIL_IF(push_inst(compiler, stmg(r6, r15, r6 * sizeof(sljit_sw), r15))); /* save registers TODO(MGM): optimize */
 	if (frame_size != 0) {
-		if (is_s16(-frame_size)) {
+		if (is_s16(-frame_size))
 			FAIL_IF(push_inst(compiler, aghi(r15, -((sljit_s16)frame_size))));
-		} else if (is_s32(-frame_size)) {
+		else if (is_s32(-frame_size))
 			FAIL_IF(push_inst(compiler, agfi(r15, -((sljit_s32)frame_size))));
-		} else {
+		else {
 			FAIL_IF(push_load_imm_inst(compiler, tmp1, -frame_size));
 			FAIL_IF(push_inst(compiler, la(r15, 0, tmp1, r15)));
 		}
 	}
 
-	sljit_s32 args = get_arg_count(arg_types);
 	if (args >= 1)
 		FAIL_IF(push_inst(compiler, lgr(gpr(SLJIT_S0), gpr(SLJIT_R0))));
 	if (args >= 2)
@@ -1296,19 +1417,24 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_set_context(struct sljit_compiler *comp
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 src, sljit_sw srcw)
 {
+	sljit_sw size;
+	sljit_gpr end;
+
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_return(compiler, op, src, srcw));
 
 	FAIL_IF(emit_mov_before_return(compiler, op, src, srcw));
 
-	sljit_sw size = compiler->local_size + 160 + 48;
-	sljit_gpr end = r15;
+	size = compiler->local_size + SLJIT_S390X_DEFAULT_STACK_FRAME_SIZE + (r6 * sizeof(sljit_sw));
 	if (!is_s20(size)) {
-		FAIL_IF(push_load_imm_inst(compiler, tmp1, compiler->local_size + 160));
+		FAIL_IF(push_load_imm_inst(compiler, tmp1, compiler->local_size + SLJIT_S390X_DEFAULT_STACK_FRAME_SIZE));
 		FAIL_IF(push_inst(compiler, la(r15, 0, tmp1, r15)));
-		size = 48;
+		size = r6 * sizeof(sljit_sw);
 		end = r14; /* r15 has been restored already */
 	}
+	else
+		end = r15;
+
 	FAIL_IF(push_inst(compiler, lmg(r6, end, size, r15))); /* restore registers TODO(MGM): optimize */
 	FAIL_IF(push_inst(compiler, br(r14))); /* return */
 
@@ -1321,12 +1447,12 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return(struct sljit_compiler *comp
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compiler, sljit_s32 op)
 {
-	CHECK_ERROR();
-	CHECK(check_sljit_emit_op0(compiler, op));
-
 	sljit_gpr arg0 = gpr(SLJIT_R0);
 	sljit_gpr arg1 = gpr(SLJIT_R1);
 
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_op0(compiler, op));
+
 	op = GET_OPCODE(op) | (op & SLJIT_I32_OP);
 	switch (op) {
 	case SLJIT_BREAKPOINT:
@@ -1356,9 +1482,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compile
 		FAIL_IF(push_inst(compiler, lr(tmp1, arg0)));
 		FAIL_IF(push_inst(compiler, dlr(tmp0, arg1)));
 		FAIL_IF(push_inst(compiler, lr(arg0, tmp1))); /* quotient */
-		if (op == SLJIT_DIVMOD_U32) {
-			FAIL_IF(push_inst(compiler, lr(arg1, tmp0))); /* remainder */
-		}
+		if (op == SLJIT_DIVMOD_U32)
+			return push_inst(compiler, lr(arg1, tmp0)); /* remainder */
+
 		return SLJIT_SUCCESS;
 	case SLJIT_DIV_S32:
 	case SLJIT_DIVMOD_S32:
@@ -1366,9 +1492,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compile
 		FAIL_IF(push_inst(compiler, lr(tmp1, arg0)));
 		FAIL_IF(push_inst(compiler, dr(tmp0, arg1)));
 		FAIL_IF(push_inst(compiler, lr(arg0, tmp1))); /* quotient */
-		if (op == SLJIT_DIVMOD_S32) {
-			FAIL_IF(push_inst(compiler, lr(arg1, tmp0))); /* remainder */
-		}
+		if (op == SLJIT_DIVMOD_S32)
+			return push_inst(compiler, lr(arg1, tmp0)); /* remainder */
+
 		return SLJIT_SUCCESS;
 	case SLJIT_DIV_UW:
 	case SLJIT_DIVMOD_UW:
@@ -1376,18 +1502,22 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compile
 		FAIL_IF(push_inst(compiler, lgr(tmp1, arg0)));
 		FAIL_IF(push_inst(compiler, dlgr(tmp0, arg1)));
 		FAIL_IF(push_inst(compiler, lgr(arg0, tmp1))); /* quotient */
-		if (op == SLJIT_DIVMOD_UW) {
-			FAIL_IF(push_inst(compiler, lgr(arg1, tmp0))); /* remainder */
-		}
+		if (op == SLJIT_DIVMOD_UW)
+			return push_inst(compiler, lgr(arg1, tmp0)); /* remainder */
+
 		return SLJIT_SUCCESS;
 	case SLJIT_DIV_SW:
 	case SLJIT_DIVMOD_SW:
 		FAIL_IF(push_inst(compiler, lgr(tmp1, arg0)));
 		FAIL_IF(push_inst(compiler, dsgr(tmp0, arg1)));
 		FAIL_IF(push_inst(compiler, lgr(arg0, tmp1))); /* quotient */
-		if (op == SLJIT_DIVMOD_SW) {
-			FAIL_IF(push_inst(compiler, lgr(arg1, tmp0))); /* remainder */
-		}
+		if (op == SLJIT_DIVMOD_SW)
+			return push_inst(compiler, lgr(arg1, tmp0)); /* remainder */
+
+		return SLJIT_SUCCESS;
+	case SLJIT_ENDBR:
+		return SLJIT_SUCCESS;
+	case SLJIT_SKIP_FRAMES_BEFORE_RETURN:
 		return SLJIT_SUCCESS;
 	default:
 		SLJIT_UNREACHABLE();
@@ -1402,12 +1532,17 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compile
         sljit_s32 dst, sljit_sw dstw,
         sljit_s32 src, sljit_sw srcw)
 {
+	sljit_ins ins;
+	struct addr mem;
+	sljit_gpr dst_r;
+	sljit_gpr src_r;
+	sljit_s32 opcode = GET_OPCODE(op);
+
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_op1(compiler, op, dst, dstw, src, srcw));
 	ADJUST_LOCAL_OFFSET(dst, dstw);
 	ADJUST_LOCAL_OFFSET(src, srcw);
 
-	sljit_s32 opcode = GET_OPCODE(op);
 	if ((dst == SLJIT_UNUSED) && !HAS_FLAGS(op)) {
 		/* TODO(mundaym): implement prefetch? */
 		return SLJIT_SUCCESS;
@@ -1415,76 +1550,127 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compile
 	if (opcode >= SLJIT_MOV && opcode <= SLJIT_MOV_P) {
 		/* LOAD REGISTER */
 		if (FAST_IS_REG(dst) && FAST_IS_REG(src)) {
-			sljit_ins ins = 0;
-			sljit_gpr dst_r = gpr(dst);
-			sljit_gpr src_r = gpr(src);
+			dst_r = gpr(dst);
+			src_r = gpr(src);
 			switch (opcode | (op & SLJIT_I32_OP)) {
 			/* 32-bit */
-			case SLJIT_MOV32_U8:  ins = llcr(dst_r, src_r); break;
-			case SLJIT_MOV32_S8:  ins =  lbr(dst_r, src_r); break;
-			case SLJIT_MOV32_U16: ins = llhr(dst_r, src_r); break;
-			case SLJIT_MOV32_S16: ins =  lhr(dst_r, src_r); break;
-			case SLJIT_MOV32:     ins =   lr(dst_r, src_r); break;
+			case SLJIT_MOV32_U8:
+				ins = llcr(dst_r, src_r);
+				break;
+			case SLJIT_MOV32_S8:
+				ins = lbr(dst_r, src_r);
+				break;
+			case SLJIT_MOV32_U16:
+				ins = llhr(dst_r, src_r);
+				break;
+			case SLJIT_MOV32_S16:
+				ins = lhr(dst_r, src_r);
+				break;
+			case SLJIT_MOV32:
+				ins = lr(dst_r, src_r);
+				break;
 			/* 64-bit */
-			case SLJIT_MOV_U8:  ins = llgcr(dst_r, src_r); break;
-			case SLJIT_MOV_S8:  ins =  lgbr(dst_r, src_r); break;
-			case SLJIT_MOV_U16: ins = llghr(dst_r, src_r); break;
-			case SLJIT_MOV_S16: ins =  lghr(dst_r, src_r); break;
-			case SLJIT_MOV_U32: ins = llgfr(dst_r, src_r); break;
-			case SLJIT_MOV_S32: ins =  lgfr(dst_r, src_r); break;
+			case SLJIT_MOV_U8:
+				ins = llgcr(dst_r, src_r);
+				break;
+			case SLJIT_MOV_S8:
+				ins = lgbr(dst_r, src_r);
+				break;
+			case SLJIT_MOV_U16:
+				ins = llghr(dst_r, src_r);
+				break;
+			case SLJIT_MOV_S16:
+				ins = lghr(dst_r, src_r);
+				break;
+			case SLJIT_MOV_U32:
+				ins = llgfr(dst_r, src_r);
+				break;
+			case SLJIT_MOV_S32:
+				ins = lgfr(dst_r, src_r);
+				break;
 			case SLJIT_MOV:
 			case SLJIT_MOV_P:
 				ins = lgr(dst_r, src_r);
 				break;
 			default:
+				ins = 0;
 				SLJIT_UNREACHABLE();
 			}
 			FAIL_IF(push_inst(compiler, ins));
 			if (HAS_FLAGS(op)) {
 				/* only handle zero flag */
 				SLJIT_ASSERT(!(op & VARIABLE_FLAG_MASK));
-				FAIL_IF(push_store_zero_flag(compiler, op, dst_r));
+				return push_store_zero_flag(compiler, op, dst_r);
 			}
 			return SLJIT_SUCCESS;
 		}
 		/* LOAD IMMEDIATE */
 		if (FAST_IS_REG(dst) && (src & SLJIT_IMM)) {
 			switch (opcode) {
-			case SLJIT_MOV_U8:  srcw = (sljit_sw)((sljit_u8)(srcw));  break;
-			case SLJIT_MOV_S8:  srcw = (sljit_sw)((sljit_s8)(srcw));  break;
-			case SLJIT_MOV_U16: srcw = (sljit_sw)((sljit_u16)(srcw)); break;
-			case SLJIT_MOV_S16: srcw = (sljit_sw)((sljit_s16)(srcw)); break;
-			case SLJIT_MOV_U32: srcw = (sljit_sw)((sljit_u32)(srcw)); break;
-			case SLJIT_MOV_S32: srcw = (sljit_sw)((sljit_s32)(srcw)); break;
+			case SLJIT_MOV_U8:
+				srcw = (sljit_sw)((sljit_u8)(srcw));
+				break;
+			case SLJIT_MOV_S8:
+				srcw = (sljit_sw)((sljit_s8)(srcw));
+				break;
+			case SLJIT_MOV_U16:
+				srcw = (sljit_sw)((sljit_u16)(srcw));
+				break;
+			case SLJIT_MOV_S16:
+				srcw = (sljit_sw)((sljit_s16)(srcw));
+				break;
+			case SLJIT_MOV_U32:
+				srcw = (sljit_sw)((sljit_u32)(srcw));
+				break;
+			case SLJIT_MOV_S32:
+				srcw = (sljit_sw)((sljit_s32)(srcw));
+				break;
 			}
 			return push_load_imm_inst(compiler, gpr(dst), srcw);
 		}
 		/* LOAD */
 		if (FAST_IS_REG(dst) && (src & SLJIT_MEM)) {
 			sljit_gpr reg = gpr(dst);
-			struct addr mem;
+
 			FAIL_IF(make_addr_bxy(compiler, &mem, src, srcw, tmp1));
-			sljit_ins ins;
 			switch (opcode | (op & SLJIT_I32_OP)) {
-			case SLJIT_MOV32_U8:  ins = llc(reg, mem.offset, mem.index, mem.base); break;
-			case SLJIT_MOV32_S8:  ins =  lb(reg, mem.offset, mem.index, mem.base); break;
-			case SLJIT_MOV32_U16: ins = llh(reg, mem.offset, mem.index, mem.base); break;
+			case SLJIT_MOV32_U8:
+				ins = llc(reg, mem.offset, mem.index, mem.base);
+				break;
+			case SLJIT_MOV32_S8:
+				ins = lb(reg, mem.offset, mem.index, mem.base);
+				break;
+			case SLJIT_MOV32_U16:
+				ins = llh(reg, mem.offset, mem.index, mem.base);
+				break;
 			case SLJIT_MOV32_S16:
 				ins = is_u12(mem.offset) ?
-					lh(reg, mem.offset, mem.index, mem.base) :
+					lh(reg, mem.offset, mem.index, mem.base):
 					lhy(reg, mem.offset, mem.index, mem.base);
 				break;
 			case SLJIT_MOV32:
 				ins = is_u12(mem.offset) ?
-					l(reg, mem.offset, mem.index, mem.base) :
+					l(reg, mem.offset, mem.index, mem.base):
 					ly(reg, mem.offset, mem.index, mem.base);
 				break;
-			case SLJIT_MOV_U8:  ins = llgc(reg, mem.offset, mem.index, mem.base); break;
-			case SLJIT_MOV_S8:  ins =  lgb(reg, mem.offset, mem.index, mem.base); break;
-			case SLJIT_MOV_U16: ins = llgh(reg, mem.offset, mem.index, mem.base); break;
-			case SLJIT_MOV_S16: ins =  lgh(reg, mem.offset, mem.index, mem.base); break;
-			case SLJIT_MOV_U32: ins = llgf(reg, mem.offset, mem.index, mem.base); break;
-			case SLJIT_MOV_S32: ins =  lgf(reg, mem.offset, mem.index, mem.base); break;
+			case SLJIT_MOV_U8:
+				ins = llgc(reg, mem.offset, mem.index, mem.base);
+				break;
+			case SLJIT_MOV_S8:
+				ins = lgb(reg, mem.offset, mem.index, mem.base);
+				break;
+			case SLJIT_MOV_U16:
+				ins = llgh(reg, mem.offset, mem.index, mem.base);
+				break;
+			case SLJIT_MOV_S16:
+				ins = lgh(reg, mem.offset, mem.index, mem.base);
+				break;
+			case SLJIT_MOV_U32:
+				ins = llgf(reg, mem.offset, mem.index, mem.base);
+				break;
+			case SLJIT_MOV_S32:
+				ins = lgf(reg, mem.offset, mem.index, mem.base);
+				break;
 			case SLJIT_MOV_P:
 			case SLJIT_MOV:
 				ins = lg(reg, mem.offset, mem.index, mem.base);
@@ -1496,13 +1682,13 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compile
 			if (HAS_FLAGS(op)) {
 				/* only handle zero flag */
 				SLJIT_ASSERT(!(op & VARIABLE_FLAG_MASK));
-				FAIL_IF(push_store_zero_flag(compiler, op, reg));
+				return push_store_zero_flag(compiler, op, reg);
 			}
 			return SLJIT_SUCCESS;
 		}
 		/* STORE and STORE IMMEDIATE */
-		if ((dst & SLJIT_MEM) &&
-			(FAST_IS_REG(src) || (src & SLJIT_IMM))) {
+		if ((dst & SLJIT_MEM)
+			&& (FAST_IS_REG(src) || (src & SLJIT_IMM))) {
 			sljit_gpr reg = FAST_IS_REG(src) ? gpr(src) : tmp0;
 			if (src & SLJIT_IMM) {
 				/* TODO(mundaym): MOVE IMMEDIATE? */
@@ -1533,7 +1719,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compile
 				if (HAS_FLAGS(op)) {
 					/* only handle zero flag */
 					SLJIT_ASSERT(!(op & VARIABLE_FLAG_MASK));
-					FAIL_IF(push_store_zero_flag(compiler, op, reg));
+					return push_store_zero_flag(compiler, op, reg);
 				}
 				return SLJIT_SUCCESS;
 			default:
@@ -1576,43 +1762,42 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compile
 				if (HAS_FLAGS(op)) {
 					/* only handle zero flag */
 					SLJIT_ASSERT(!(op & VARIABLE_FLAG_MASK));
-					FAIL_IF(push_store_zero_flag(compiler, op, tmp0));
+					return push_store_zero_flag(compiler, op, tmp0);
 				}
 				return SLJIT_SUCCESS;
 			default:
 				SLJIT_UNREACHABLE();
 			}
 		}
-		abort();
+		SLJIT_UNREACHABLE();
 	}
 
 	SLJIT_ASSERT((src & SLJIT_IMM) == 0); /* no immediates */
 
-	sljit_gpr dst_r = SLOW_IS_REG(dst) ? gpr(REG_MASK & dst) : tmp0;
-	sljit_gpr src_r = FAST_IS_REG(src) ? gpr(REG_MASK & src) : tmp0;
-	if (src & SLJIT_MEM) {
+	dst_r = SLOW_IS_REG(dst) ? gpr(REG_MASK & dst) : tmp0;
+	src_r = FAST_IS_REG(src) ? gpr(REG_MASK & src) : tmp0;
+	if (src & SLJIT_MEM)
 		FAIL_IF(load_word(compiler, src_r, src, srcw, tmp1, src & SLJIT_I32_OP));
-	}
 
 	/* TODO(mundaym): optimize loads and stores */
 	switch (opcode | (op & SLJIT_I32_OP)) {
 	case SLJIT_NOT:
 		/* emulate ~x with x^-1 */
 		FAIL_IF(push_load_imm_inst(compiler, tmp1, -1));
-		if (src_r != dst_r) {
+		if (src_r != dst_r)
 			FAIL_IF(push_inst(compiler, lgr(dst_r, src_r)));
-		}
+
 		FAIL_IF(push_inst(compiler, xgr(dst_r, tmp1)));
 		break;
 	case SLJIT_NOT32:
 		/* emulate ~x with x^-1 */
-		if (have_eimm()) {
+		if (have_eimm())
 			FAIL_IF(push_inst(compiler, xilf(dst_r, -1)));
-		} else {
+		else {
 			FAIL_IF(push_load_imm_inst(compiler, tmp1, -1));
-			if (src_r != dst_r) {
+			if (src_r != dst_r)
 				FAIL_IF(push_inst(compiler, lr(dst_r, src_r)));
-			}
+
 			FAIL_IF(push_inst(compiler, xr(dst_r, tmp1)));
 		}
 		break;
@@ -1625,9 +1810,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compile
 	case SLJIT_CLZ:
 		if (have_eimm()) {
 			FAIL_IF(push_inst(compiler, flogr(tmp0, src_r))); /* clobbers tmp1 */
-			if (dst_r != tmp0) {
+			if (dst_r != tmp0)
 				FAIL_IF(push_inst(compiler, lgr(dst_r, tmp0)));
-			}
 		} else {
 			abort(); /* TODO(mundaym): no eimm (?) */
 		}
@@ -1637,9 +1821,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compile
 			FAIL_IF(push_inst(compiler, sllg(tmp1, src_r, 32, 0)));
 			FAIL_IF(push_inst(compiler, iilf(tmp1, 0xffffffff)));
 			FAIL_IF(push_inst(compiler, flogr(tmp0, tmp1))); /* clobbers tmp1 */
-			if (dst_r != tmp0) {
+			if (dst_r != tmp0)
 				FAIL_IF(push_inst(compiler, lr(dst_r, tmp0)));
-			}
 		} else {
 			abort(); /* TODO(mundaym): no eimm (?) */
 		}
@@ -1649,23 +1832,21 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compile
 	}
 
 	/* write condition code to emulated flag register */
-	if (op & VARIABLE_FLAG_MASK) {
+	if (op & VARIABLE_FLAG_MASK)
 		FAIL_IF(push_inst(compiler, ipm(flag_r)));
-	}
 
 	/* write zero flag to emulated flag register */
-	if (op & SLJIT_SET_Z) {
+	if (op & SLJIT_SET_Z)
 		FAIL_IF(push_store_zero_flag(compiler, op, dst_r));
-	}
 
-	if ((dst != SLJIT_UNUSED) && (dst & SLJIT_MEM)) {
+	/* TODO(carenas): doesn't need FAIL_IF */
+	if ((dst != SLJIT_UNUSED) && (dst & SLJIT_MEM))
 		FAIL_IF(store_word(compiler, dst_r, dst, dstw, tmp1, op & SLJIT_I32_OP));
-	}
 
 	return SLJIT_SUCCESS;
 }
 
-static int is_commutative(sljit_s32 op)
+static SLJIT_INLINE int is_commutative(sljit_s32 op)
 {
 	switch (GET_OPCODE(op)) {
 	case SLJIT_ADD:
@@ -1679,7 +1860,7 @@ static int is_commutative(sljit_s32 op)
 	return 0;
 }
 
-static int is_shift(sljit_s32 op) {
+static SLJIT_INLINE int is_shift(sljit_s32 op) {
 	sljit_s32 v = GET_OPCODE(op);
 	return v == SLJIT_SHL ||
 		v == SLJIT_ASHR ||
@@ -1687,7 +1868,7 @@ static int is_shift(sljit_s32 op) {
 		? 1 : 0;
 }
 
-static int sets_signed_flag(sljit_s32 op)
+static SLJIT_INLINE int sets_signed_flag(sljit_s32 op)
 {
 	switch (GET_FLAG_TYPE(op)) {
 	case SLJIT_OVERFLOW:
@@ -1715,17 +1896,17 @@ static int have_op_2_imm(sljit_s32 op, sljit_sw imm) {
 	switch (GET_OPCODE(op) | (op & SLJIT_I32_OP)) {
 	case SLJIT_ADD32:
 	case SLJIT_ADD:
-		if (!HAS_FLAGS(op) || sets_signed_flag(op)) {
+		if (!HAS_FLAGS(op) || sets_signed_flag(op))
 			return have_eimm() ? is_s32(imm) : is_s16(imm);
-		}
+
 		return have_eimm() && is_u32(imm);
 	case SLJIT_MUL32:
 	case SLJIT_MUL:
 		/* TODO(mundaym): general extension check */
 		/* for ms{,g}fi */
-		if (op & VARIABLE_FLAG_MASK) {
+		if (op & VARIABLE_FLAG_MASK)
 			return 0;
-		}
+
 		return have_genext() && is_s16(imm);
 	case SLJIT_OR32:
 	case SLJIT_XOR32:
@@ -1796,16 +1977,16 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compile
 		if (FAST_IS_REG(src2) && (src1 & SLJIT_MEM)) {
 			SWAP_ARGS
 		}
+		#undef SWAP_ARGS
 	}
 
 	/* src1 must be in a register */
 	sljit_gpr src1_r = FAST_IS_REG(src1) ? gpr(src1 & REG_MASK) : tmp0;
-	if (src1 & SLJIT_IMM) {
+	if (src1 & SLJIT_IMM)
 		FAIL_IF(push_load_imm_inst(compiler, src1_r, src1w));
-	}
-	if (src1 & SLJIT_MEM) {
+
+	if (src1 & SLJIT_MEM)
 		FAIL_IF(load_word(compiler, src1_r, src1, src1w, tmp1, op & SLJIT_I32_OP));
-	}
 
 	/* emit comparison before subtract */
 	if (GET_OPCODE(op) == SLJIT_SUB && (op & VARIABLE_FLAG_MASK)) {
@@ -1825,7 +2006,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compile
 			cmp = -1; /* signed */
 			break;
 		}
-		if (cmp != 0) {
+		if (cmp) {
 			/* clear flags - no need to generate now */
 			op &= ~VARIABLE_FLAG_MASK;
 			sljit_gpr src2_r = FAST_IS_REG(src2) ? gpr(src2 & REG_MASK) : tmp1;
@@ -1835,17 +2016,20 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compile
 					FAIL_IF(push_inst(compiler, (op & SLJIT_I32_OP) ?
 						clfi(src1_r, src2w) :
 						clgfi(src1_r, src2w)));
-				} else if (cmp < 0 && is_s16(src2w)) {
+				}
+				else if (cmp < 0 && is_s16(src2w)) {
 					/* signed */
 					FAIL_IF(push_inst(compiler, (op & SLJIT_I32_OP) ?
 						chi(src1_r, src2w) :
 						cghi(src1_r, src2w)));
-				} else if (cmp < 0 && is_s32(src2w)) {
+				}
+				else if (cmp < 0 && is_s32(src2w)) {
 					/* signed */
 					FAIL_IF(push_inst(compiler, (op & SLJIT_I32_OP) ?
 						cfi(src1_r, src2w) :
 						cgfi(src1_r, src2w)));
-				} else {
+				}
+				else {
 					FAIL_IF(push_load_imm_inst(compiler, src2_r, src2w));
 					if (cmp > 0) {
 						/* unsigned */
@@ -1860,7 +2044,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compile
 							cgr(src1_r, src2_r)));
 					}
 				}
-			} else {
+			}
+			else {
 				if (src2 & SLJIT_MEM) {
 					/* TODO(mundaym): comparisons with memory */
 					/* load src2 into register */
@@ -1883,9 +2068,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compile
 		}
 	}
 
-	if (!HAS_FLAGS(op) && dst == SLJIT_UNUSED) {
+	if (!HAS_FLAGS(op) && dst == SLJIT_UNUSED)
 		return SLJIT_SUCCESS;
-	}
 
 	/* need to specify signed or logical operation */
 	int signed_flags = sets_signed_flag(op);
@@ -1894,9 +2078,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compile
 		/* handle shifts first, they have more constraints than other operations */
 		sljit_sw d = 0;
 		sljit_gpr b = FAST_IS_REG(src2) ? gpr(src2 & REG_MASK) : r0;
-		if (src2 & SLJIT_IMM) {
+		if (src2 & SLJIT_IMM)
 			d = src2w & ((op & SLJIT_I32_OP) ? 31 : 63);
-		}
+
 		if (src2 & SLJIT_MEM) {
 			/* shift amount (b) cannot be in r0 (i.e. tmp0) */
 			FAIL_IF(load_word(compiler, tmp1, src2, src2w, tmp1, op & SLJIT_I32_OP));
@@ -1932,10 +2116,11 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compile
 		default:
 			SLJIT_UNREACHABLE();
 		}
-		if (workaround_alias && dst_r != src1_r) {
+		if (workaround_alias && dst_r != src1_r)
 			FAIL_IF(push_inst(compiler, lr(dst_r, src1_r)));
-		}
-	} else if ((GET_OPCODE(op) == SLJIT_MUL) && HAS_FLAGS(op)) {
+
+	}
+	else if ((GET_OPCODE(op) == SLJIT_MUL) && HAS_FLAGS(op)) {
 		/* multiply instructions do not generally set flags so we need to manually */
 		/* detect overflow conditions */
 		/* TODO(mundaym): 64-bit overflow */
@@ -1950,11 +2135,11 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compile
 			/* load src2 into register */
 			FAIL_IF(load_word(compiler, src2_r, src2, src2w, tmp1, op & SLJIT_I32_OP));
 		}
-		if (have_misc2()) {
+		if (have_misc2())
 			FAIL_IF(push_inst(compiler, op & SLJIT_I32_OP ?
 			        msrkc(dst_r, src1_r, src2_r) :
 				msgrkc(dst_r, src1_r, src2_r)));
-		} else if (op & SLJIT_I32_OP) {
+		else if (op & SLJIT_I32_OP) {
 			op &= ~VARIABLE_FLAG_MASK;
 			FAIL_IF(push_inst(compiler, lgfr(tmp0, src1_r)));
 			FAIL_IF(push_inst(compiler, msgfr(tmp0, src2_r)));
@@ -1965,10 +2150,12 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compile
 			FAIL_IF(push_inst(compiler, nihf(tmp0, ~1U)));
 			FAIL_IF(push_inst(compiler, ipm(flag_r)));
 			FAIL_IF(push_inst(compiler, oilh(flag_r, 0x2000)));
-		} else {
-			return SLJIT_ERR_UNSUPPORTED;
 		}
-	} else if ((GET_OPCODE(op) == SLJIT_SUB) && (op & SLJIT_SET_Z) && !signed_flags) {
+		else
+			return SLJIT_ERR_UNSUPPORTED;
+
+	}
+	else if ((GET_OPCODE(op) == SLJIT_SUB) && (op & SLJIT_SET_Z) && !signed_flags) {
 		/* subtract logical instructions do not set the right flags unfortunately */
 		/* instead, negate src2 and issue an add logical */
 		/* TODO(mundaym): distinct operand facility where needed */
@@ -1992,11 +2179,12 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compile
 		FAIL_IF(push_inst(compiler, (op & SLJIT_I32_OP) ?
 			alr(src1_r, tmp1) :
 			algr(src1_r, tmp1)));
-		if (src1_r != dst_r) {
+		if (src1_r != dst_r)
 			FAIL_IF(push_inst(compiler, (op & SLJIT_I32_OP) ?
 				lr(dst_r, src1_r) : lgr(dst_r, src1_r)));
-		}
-	} else if ((src2 & SLJIT_IMM) && (src1_r == dst_r) && have_op_2_imm(op, src2w)) {
+
+	}
+	else if ((src2 & SLJIT_IMM) && (src1_r == dst_r) && have_op_2_imm(op, src2w)) {
 		switch (GET_OPCODE(op) | (op & SLJIT_I32_OP)) {
 		case SLJIT_ADD:
 			if (!HAS_FLAGS(op) || signed_flags) {
@@ -2004,9 +2192,10 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compile
 					aghi(dst_r, src2w) :
 					agfi(dst_r, src2w)
 				));
-			} else {
-				FAIL_IF(push_inst(compiler, algfi(dst_r, src2w)));
 			}
+			else
+				FAIL_IF(push_inst(compiler, algfi(dst_r, src2w)));
+
 			break;
 		case SLJIT_ADD32:
 			if (!HAS_FLAGS(op) || signed_flags) {
@@ -2014,9 +2203,10 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compile
 					ahi(dst_r, src2w) :
 					afi(dst_r, src2w)
 				));
-			} else {
-				FAIL_IF(push_inst(compiler, alfi(dst_r, src2w)));
 			}
+			else
+				FAIL_IF(push_inst(compiler, alfi(dst_r, src2w)));
+
 			break;
 		case SLJIT_MUL:
 			FAIL_IF(push_inst(compiler, mhi(dst_r, src2w)));
@@ -2036,55 +2226,87 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compile
 		default:
 			SLJIT_UNREACHABLE();
 		}
-	} else if ((src2 & SLJIT_IMM) && have_op_3_imm(op, src2w)) {
+	}
+	else if ((src2 & SLJIT_IMM) && have_op_3_imm(op, src2w)) {
 		abort(); /* TODO(mundaym): implement */
-	} else if ((src2 & SLJIT_MEM) && (dst_r == src1_r)) {
+	}
+	else if ((src2 & SLJIT_MEM) && (dst_r == src1_r)) {
 		/* most 32-bit instructions can only handle 12-bit immediate offsets */
 		int need_u12 = !have_ldisp() &&
 			(op & SLJIT_I32_OP) &&
 			(GET_OPCODE(op) != SLJIT_ADDC) &&
 			(GET_OPCODE(op) != SLJIT_SUBC);
 		struct addr mem;
-		if (need_u12) {
+		if (need_u12)
 			FAIL_IF(make_addr_bx(compiler, &mem, src2, src2w, tmp1));
-		} else {
+		else
 			FAIL_IF(make_addr_bxy(compiler, &mem, src2, src2w, tmp1));
-		}
 
 		int can_u12 = is_u12(mem.offset) ? 1 : 0;
 		sljit_ins ins = 0;
 		switch (GET_OPCODE(op) | (op & SLJIT_I32_OP)) {
 		/* 64-bit ops */
-#define EVAL(op) op(dst_r, mem.offset, mem.index, mem.base)
-		case SLJIT_ADD:  ins = signed_flags ? EVAL(ag) : EVAL(alg); break;
-		case SLJIT_SUB:  ins = signed_flags ? EVAL(sg) : EVAL(slg); break;
-		case SLJIT_ADDC: ins = EVAL(alcg); break;
-		case SLJIT_SUBC: ins = EVAL(slbg); break;
-		case SLJIT_MUL:  ins = EVAL( msg); break;
-		case SLJIT_OR:   ins = EVAL(  og); break;
-		case SLJIT_XOR:  ins = EVAL(  xg); break;
-		case SLJIT_AND:  ins = EVAL(  ng); break;
+		#define EVAL(op) op(dst_r, mem.offset, mem.index, mem.base)
+		case SLJIT_ADD:
+			ins = signed_flags ? EVAL(ag) : EVAL(alg);
+			break;
+		case SLJIT_SUB:
+			ins = signed_flags ? EVAL(sg) : EVAL(slg);
+			break;
+		case SLJIT_ADDC:
+			ins = EVAL(alcg);
+			break;
+		case SLJIT_SUBC:
+			ins = EVAL(slbg);
+			break;
+		case SLJIT_MUL:
+			ins = EVAL(msg);
+			break;
+		case SLJIT_OR:
+			ins = EVAL(og);
+			break;
+		case SLJIT_XOR:
+			ins = EVAL(xg);
+			break;
+		case SLJIT_AND:
+			ins = EVAL(ng);
+			break;
 		/* 32-bit ops */
-		case SLJIT_ADD32: ins = signed_flags ?
-			(can_u12 ? EVAL( a) : EVAL( ay)) :
-			(can_u12 ? EVAL(al) : EVAL(aly));
+		case SLJIT_ADD32:
+			ins = signed_flags ?
+				(can_u12 ? EVAL(a) : EVAL(ay)):
+				(can_u12 ? EVAL(al) : EVAL(aly));
+			break;
+		case SLJIT_SUB32:
+			ins = signed_flags ?
+				(can_u12 ? EVAL(s) : EVAL(sy)):
+				(can_u12 ? EVAL(sl) : EVAL(sly));
+			break;
+		case SLJIT_ADDC32:
+			ins = EVAL(alc);
+			break;
+		case SLJIT_SUBC32:
+			ins = EVAL(slb);
+			break;
+		case SLJIT_MUL32:
+			ins = can_u12 ? EVAL(ms) : EVAL(msy);
 			break;
-		case SLJIT_SUB32: ins = signed_flags ?
-			(can_u12 ? EVAL( s) : EVAL( sy)) :
-			(can_u12 ? EVAL(sl) : EVAL(sly));
+		case SLJIT_OR32:
+			ins = can_u12 ? EVAL(o) : EVAL(oy);
+			break;
+		case SLJIT_XOR32:
+			ins = can_u12 ? EVAL(x) : EVAL(xy);
 			break;
-		case SLJIT_ADDC32: ins = EVAL(alc); break;
-		case SLJIT_SUBC32: ins = EVAL(slb); break;
-		case SLJIT_MUL32:  ins = can_u12 ? EVAL(ms) : EVAL(msy); break;
-		case SLJIT_OR32:   ins = can_u12 ? EVAL( o) : EVAL( oy); break;
-		case SLJIT_XOR32:  ins = can_u12 ? EVAL( x) : EVAL( xy); break;
-		case SLJIT_AND32:  ins = can_u12 ? EVAL( n) : EVAL( ny); break;
-#undef EVAL
+		case SLJIT_AND32:
+			ins = can_u12 ? EVAL(n) : EVAL(ny);
+			break;
+		#undef EVAL
 		default:
 			SLJIT_UNREACHABLE();
 		}
 		FAIL_IF(push_inst(compiler, ins));
-	} else {
+	}
+	else {
 		sljit_gpr src2_r = FAST_IS_REG(src2) ? gpr(src2 & REG_MASK) : tmp1;
 		if (src2 & SLJIT_IMM) {
 			/* load src2 into register */
@@ -2111,56 +2333,110 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compile
 			sgr(src1_r, src2_r) :
 			slgr(src1_r, src2_r);
 			break;
-		case SLJIT_ADDC: ins = alcgr(src1_r, src2_r); break;
-		case SLJIT_SUBC: ins = slbgr(src1_r, src2_r); break;
-		case SLJIT_MUL:  ins =  msgr(src1_r, src2_r); break;
-		case SLJIT_AND:  ins =   ngr(src1_r, src2_r); break;
-		case SLJIT_OR:   ins =   ogr(src1_r, src2_r); break;
-		case SLJIT_XOR:  ins =   xgr(src1_r, src2_r); break;
+		case SLJIT_ADDC:
+			ins = alcgr(src1_r, src2_r);
+			break;
+		case SLJIT_SUBC:
+			ins = slbgr(src1_r, src2_r);
+			break;
+		case SLJIT_MUL:
+			ins = msgr(src1_r, src2_r);
+			break;
+		case SLJIT_AND:
+			ins = ngr(src1_r, src2_r);
+			break;
+		case SLJIT_OR:
+			ins = ogr(src1_r, src2_r);
+			break;
+		case SLJIT_XOR:
+			ins = xgr(src1_r, src2_r);
+			break;
 		/* 32-bit ops */
-		case SLJIT_ADD32: ins = signed_flags ?
-			ar(src1_r, src2_r) :
-			alr(src1_r, src2_r);
+		case SLJIT_ADD32:
+			ins = signed_flags ?
+				ar(src1_r, src2_r):
+				alr(src1_r, src2_r);
+			break;
+		case SLJIT_SUB32:
+			ins = signed_flags ?
+				sr(src1_r, src2_r):
+				slr(src1_r, src2_r);
+			break;
+		case SLJIT_ADDC32:
+			ins = alcr(src1_r, src2_r);
+			break;
+		case SLJIT_SUBC32:
+			ins = slbr(src1_r, src2_r);
+			break;
+		case SLJIT_MUL32:
+			ins = msr(src1_r, src2_r);
+			break;
+		case SLJIT_AND32:
+			ins = nr(src1_r, src2_r);
+			break;
+		case SLJIT_OR32:
+			ins = or(src1_r, src2_r);
 			break;
-		case SLJIT_SUB32: ins = signed_flags ?
-			sr(src1_r, src2_r) :
-			slr(src1_r, src2_r);
+		case SLJIT_XOR32:
+			ins = xr(src1_r, src2_r);
 			break;
-		case SLJIT_ADDC32: ins = alcr(src1_r, src2_r); break;
-		case SLJIT_SUBC32: ins = slbr(src1_r, src2_r); break;
-		case SLJIT_MUL32:  ins =  msr(src1_r, src2_r); break;
-		case SLJIT_AND32:  ins =   nr(src1_r, src2_r); break;
-		case SLJIT_OR32:   ins =   or(src1_r, src2_r); break;
-		case SLJIT_XOR32:  ins =   xr(src1_r, src2_r); break;
 		default:
 			SLJIT_UNREACHABLE();
 		}
 		FAIL_IF(push_inst(compiler, ins));
-		if (src1_r != dst_r) {
+		if (src1_r != dst_r)
 			FAIL_IF(push_inst(compiler, (op & SLJIT_I32_OP) ?
 				lr(dst_r, src1_r) : lgr(dst_r, src1_r)));
-		}
+
 	}
 
 	/* write condition code to emulated flag register */
-	if (op & VARIABLE_FLAG_MASK) {
+	if (op & VARIABLE_FLAG_MASK)
 		FAIL_IF(push_inst(compiler, ipm(flag_r)));
-	}
 
 	/* write zero flag to emulated flag register */
-	if (op & SLJIT_SET_Z) {
+	if (op & SLJIT_SET_Z)
 		FAIL_IF(push_store_zero_flag(compiler, op, dst_r));
-	}
 
 	/* finally write the result to memory if required */
 	if (dst & SLJIT_MEM) {
 		SLJIT_ASSERT(dst_r != tmp1);
+		/* TODO(carenas): s/FAIL_IF/return/ -- last emit, can return directly */
 		FAIL_IF(store_word(compiler, dst_r, dst, dstw, tmp1, op & SLJIT_I32_OP));
 	}
 
 	return SLJIT_SUCCESS;
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_src(
+	struct sljit_compiler *compiler,
+	sljit_s32 op, sljit_s32 src, sljit_sw srcw)
+{
+	sljit_gpr src_r;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_op_src(compiler, op, src, srcw));
+	ADJUST_LOCAL_OFFSET(src, srcw);
+
+	switch (op) {
+	case SLJIT_FAST_RETURN:
+		src_r = FAST_IS_REG(src) ? gpr(src) : tmp1;
+		if (src & SLJIT_MEM)
+			FAIL_IF(load_word(compiler, tmp1, src, srcw, tmp1, 0));
+
+		return push_inst(compiler, br(src_r));
+	case SLJIT_SKIP_FRAMES_BEFORE_FAST_RETURN:
+	case SLJIT_PREFETCH_L1:
+	case SLJIT_PREFETCH_L2:
+	case SLJIT_PREFETCH_L3:
+	case SLJIT_PREFETCH_ONCE:
+	default:
+		return SLJIT_SUCCESS;
+	}
+
+	return SLJIT_SUCCESS;
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_register_index(sljit_s32 reg)
 {
 	CHECK_REG_INDEX(check_sljit_get_register_index(reg));
@@ -2169,17 +2445,19 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_register_index(sljit_s32 reg)
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_float_register_index(sljit_s32 reg)
 {
+	CHECK_REG_INDEX(check_sljit_get_float_register_index(reg));
 	abort();
 }
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_custom(struct sljit_compiler *compiler,
 	void *instruction, sljit_s32 size)
 {
+	sljit_u8 *src = (sljit_u8*)instruction;
+	sljit_ins ins = 0;
+
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_op_custom(compiler, instruction, size));
 
-	sljit_u8 *src = (sljit_u8*)instruction;
-	sljit_ins ins = 0;
 	switch (size) {
 	case 2:
 		ins |= (sljit_ins)(src[0]) << 8;
@@ -2243,20 +2521,6 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fast_enter(struct sljit_compiler *
 	return store_word(compiler, fast_link_r, dst, dstw, tmp1, 0);
 }
 
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fast_return(struct sljit_compiler *compiler, sljit_s32 src, sljit_sw srcw)
-{
-	CHECK_ERROR();
-	CHECK(check_sljit_emit_fast_return(compiler, src, srcw));
-	ADJUST_LOCAL_OFFSET(src, srcw);
-
-	sljit_gpr src_r = FAST_IS_REG(src) ? gpr(src) : tmp1;
-	if (src & SLJIT_MEM) {
-	       FAIL_IF(load_word(compiler, tmp1, src, srcw, tmp1, 0));
-	}
-
-	return push_inst(compiler, br(src_r));
-}
-
 /* --------------------------------------------------------------------- */
 /*  Conditional instructions                                             */
 /* --------------------------------------------------------------------- */
@@ -2279,14 +2543,14 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_label* sljit_emit_label(struct sljit_compi
 
 SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_jump(struct sljit_compiler *compiler, sljit_s32 type)
 {
+	sljit_u8 mask = ((type & 0xff) < SLJIT_JUMP) ? get_cc(type & 0xff) : 0xf;
+
 	CHECK_ERROR_PTR();
 	CHECK_PTR(check_sljit_emit_jump(compiler, type));
 
 	/* reload condition code */
-	sljit_uw mask = ((type&0xff) < SLJIT_JUMP) ? get_cc(type&0xff) : 0xf;
-	if (mask != 0xf) {
-		PTR_FAIL_IF(push_load_cc(compiler, type&0xff));
-	}
+	if (mask != 0xf)
+		PTR_FAIL_IF(push_load_cc(compiler, type & 0xff));
 
 	/* record jump */
 	struct sljit_jump *jump = (struct sljit_jump *)
@@ -2297,11 +2561,10 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_jump(struct sljit_compile
 
 	/* emit jump instruction */
 	type &= 0xff;
-	if (type >= SLJIT_FAST_CALL) {
+	if (type >= SLJIT_FAST_CALL)
 		PTR_FAIL_IF(push_inst(compiler, brasl(type == SLJIT_FAST_CALL ? fast_link_r : link_r, 0)));
-	} else {
+	else
 		PTR_FAIL_IF(push_inst(compiler, brcl(mask, 0)));
-	}
 
 	return jump;
 }
@@ -2322,22 +2585,23 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_call(struct sljit_compile
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_ijump(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 src, sljit_sw srcw)
 {
+	sljit_gpr src_r = FAST_IS_REG(src) ? gpr(src) : tmp1;
+
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_ijump(compiler, type, src, srcw));
 	ADJUST_LOCAL_OFFSET(src, srcw);
 
-	sljit_gpr src_r = FAST_IS_REG(src) ? gpr(src) : tmp1;
 	if (src & SLJIT_IMM) {
 		SLJIT_ASSERT(!(srcw & 1)); /* target address must be even */
 		FAIL_IF(push_load_imm_inst(compiler, src_r, srcw));
-	} else if (src & SLJIT_MEM) {
-		FAIL_IF(load_word(compiler, src_r, src, srcw, tmp1, 0 /* 64-bit */));
 	}
+	else if (src & SLJIT_MEM)
+		FAIL_IF(load_word(compiler, src_r, src, srcw, tmp1, 0 /* 64-bit */));
 
 	/* emit jump instruction */
-	if (type >= SLJIT_FAST_CALL) {
+	if (type >= SLJIT_FAST_CALL)
 		return push_inst(compiler, basr(type == SLJIT_FAST_CALL ? fast_link_r : link_r, src_r));
-	}
+
 	return push_inst(compiler, br(src_r));
 }
 
@@ -2360,6 +2624,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *co
 	sljit_s32 dst, sljit_sw dstw,
 	sljit_s32 type)
 {
+	sljit_u8 mask = get_cc(type & 0xff);
+
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, type));
 
@@ -2370,9 +2636,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *co
 	case SLJIT_OR:
 	case SLJIT_XOR:
 		/* dst is also source operand */
-		if (dst & SLJIT_MEM) {
+		if (dst & SLJIT_MEM)
 			FAIL_IF(load_word(compiler, dst_r, dst, dstw, tmp1, op & SLJIT_I32_OP));
-		}
+
 		break;
 	case SLJIT_MOV:
 	case (SLJIT_MOV32 & ~SLJIT_I32_OP):
@@ -2383,10 +2649,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *co
 		SLJIT_UNREACHABLE();
 	}
 
-	sljit_uw mask = get_cc(type & 0xff);
-	if (mask != 0xf) {
+	if (mask != 0xf)
 		FAIL_IF(push_load_cc(compiler, type & 0xff));
-	}
+
 	/* TODO(mundaym): fold into cmov helper function? */
 	if (have_lscond2()) {
 		FAIL_IF(push_load_imm_inst(compiler, loc_r, 0));
@@ -2418,14 +2683,13 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *co
 	}
 
 	/* set zero flag if needed */
-	if (op & SLJIT_SET_Z) {
+	if (op & SLJIT_SET_Z)
 		FAIL_IF(push_store_zero_flag(compiler, op, dst_r));
-	}
 
 	/* store result to memory if required */
-	if (dst & SLJIT_MEM) {
+	/* TODO(carenas): s/FAIL_IF/return/ -- last emit, can return directly */
+	if (dst & SLJIT_MEM)
 		FAIL_IF(store_word(compiler, dst_r, dst, dstw, tmp1, op & SLJIT_I32_OP));
-	}
 
 	return SLJIT_SUCCESS;
 }
@@ -2434,26 +2698,25 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_cmov(struct sljit_compiler *compil
 	sljit_s32 dst_reg,
 	sljit_s32 src, sljit_sw srcw)
 {
+	sljit_u8 mask = get_cc(type & 0xff);
+	sljit_gpr dst_r = gpr(dst_reg & ~SLJIT_I32_OP);
+	sljit_gpr src_r = FAST_IS_REG(src) ? gpr(src) : tmp0;
+
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_cmov(compiler, type, dst_reg, src, srcw));
 
-	sljit_uw mask = get_cc(type & 0xff);
-	if (mask != 0xf) {
+	if (mask != 0xf)
 		FAIL_IF(push_load_cc(compiler, type & 0xff));
-	}
 
-	sljit_gpr dst_r = gpr(dst_reg & ~SLJIT_I32_OP);
-	sljit_gpr src_r = FAST_IS_REG(src) ? gpr(src) : tmp0;
 	if (src & SLJIT_IMM) {
 		/* TODO(mundaym): fast path with lscond2 */
 		FAIL_IF(push_load_imm_inst(compiler, src_r, srcw));
 	}
 
-	if (have_lscond1()) {
+	if (have_lscond1())
 		return push_inst(compiler, dst_reg & SLJIT_I32_OP ?
-			locr(dst_r, src_r, mask) :
+			locr(dst_r, src_r, mask):
 			locgr(dst_r, src_r, mask));
-	}
 
 	/* TODO(mundaym): implement */
 	return SLJIT_ERR_UNSUPPORTED;
@@ -2467,8 +2730,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_cmov(struct sljit_compiler *compil
    advantages:
 
      1. we only need one instruction in the instruction stream (LGRL)
-     2. we don't need to worry about flushing the instruction cache
-        when updating constants
+     2. we can store 64 bit addresses and use 32 bit offsets
 
    To retrofit the extra information needed to build the literal pool we
    add a new sljit_s390x_const struct that contains the initial value but
@@ -2476,44 +2738,80 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_cmov(struct sljit_compiler *compil
 
 SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw init_value)
 {
+	struct sljit_s390x_const *const_;
+	sljit_gpr dst_r;
+
 	CHECK_ERROR_PTR();
 	CHECK_PTR(check_sljit_emit_const(compiler, dst, dstw, init_value));
 
-	struct sljit_s390x_const *const_ =
-		(struct sljit_s390x_const*)ensure_abuf(compiler, sizeof(struct sljit_s390x_const));
+	const_ = (struct sljit_s390x_const*)ensure_abuf(compiler,
+					sizeof(struct sljit_s390x_const));
 	PTR_FAIL_IF(!const_);
 	set_const((struct sljit_const*)const_, compiler);
 	const_->init_value = init_value;
 
-	sljit_gpr dst_r = FAST_IS_REG(dst) ? gpr(dst & REG_MASK) : tmp0;
-	if (have_genext()) {
+	dst_r = FAST_IS_REG(dst) ? gpr(dst & REG_MASK) : tmp0;
+	if (have_genext())
 		PTR_FAIL_IF(push_inst(compiler, sljit_ins_const | lgrl(dst_r, 0)));
-	} else {
+	else {
 		PTR_FAIL_IF(push_inst(compiler, sljit_ins_const | larl(tmp1, 0)));
 		PTR_FAIL_IF(push_inst(compiler, lg(dst_r, 0, r0, tmp1)));
 	}
-	if (dst & SLJIT_MEM) {
+
+	if (dst & SLJIT_MEM)
 		PTR_FAIL_IF(store_word(compiler, dst_r, dst, dstw, tmp1, 0 /* always 64-bit */));
-	}
+
 	return (struct sljit_const*)const_;
 }
 
-SLJIT_API_FUNC_ATTRIBUTE void sljit_set_const(sljit_uw addr, sljit_sw new_constant, sljit_sw executable_offset)
+SLJIT_API_FUNC_ATTRIBUTE void sljit_set_jump_addr(sljit_uw addr, sljit_uw new_target, sljit_sw executable_offset)
 {
 	/* Update the constant pool. */
-	sljit_uw *ptr = (sljit_uw *)(SLJIT_ADD_EXEC_OFFSET((sljit_uw *)(addr), executable_offset));
-	*ptr = new_constant;
-	/* No need to flush the instruction cache as we do not modify
-	   any executable code (the constant is fetched from the data
-	   data cache). */
+	sljit_uw *ptr = (sljit_uw *)addr;
+	SLJIT_UNUSED_ARG(executable_offset);
+
+	SLJIT_UPDATE_WX_FLAGS(ptr, ptr + 1, 0);
+	*ptr = new_target;
+	SLJIT_UPDATE_WX_FLAGS(ptr, ptr + 1, 1);
+	SLJIT_CACHE_FLUSH(ptr, ptr + 1);
 }
 
-SLJIT_API_FUNC_ATTRIBUTE void sljit_set_jump_addr(sljit_uw addr, sljit_uw new_target, sljit_sw executable_offset)
+SLJIT_API_FUNC_ATTRIBUTE void sljit_set_const(sljit_uw addr, sljit_sw new_constant, sljit_sw executable_offset)
 {
-	/* Update the constant pool. */
-	sljit_uw *ptr = (sljit_uw *)(SLJIT_ADD_EXEC_OFFSET((sljit_uw *)(addr), executable_offset));
-	*ptr = new_target;
-	/* No need to flush the instruction cache as we do not modify
-	   any executable code (the constant is fetched from the data
-	   data cache). */
+	sljit_set_jump_addr(addr, new_constant, executable_offset);
 }
+
+SLJIT_API_FUNC_ATTRIBUTE struct sljit_put_label *sljit_emit_put_label(
+	struct sljit_compiler *compiler,
+	sljit_s32 dst, sljit_sw dstw)
+{
+	struct sljit_put_label *put_label;
+	sljit_gpr dst_r;
+
+	CHECK_ERROR_PTR();
+	CHECK_PTR(check_sljit_emit_put_label(compiler, dst, dstw));
+	ADJUST_LOCAL_OFFSET(dst, dstw);
+
+	put_label = (struct sljit_put_label*)ensure_abuf(compiler, sizeof(struct sljit_put_label));
+	PTR_FAIL_IF(!put_label);
+	set_put_label(put_label, compiler, 0);
+
+	dst_r = FAST_IS_REG(dst) ? gpr(dst & REG_MASK) : tmp0;
+
+	if (have_genext())
+		PTR_FAIL_IF(push_inst(compiler, lgrl(dst_r, 0)));
+	else {
+		PTR_FAIL_IF(push_inst(compiler, larl(tmp1, 0)));
+		PTR_FAIL_IF(push_inst(compiler, lg(dst_r, 0, r0, tmp1)));
+	}
+
+	if (dst & SLJIT_MEM)
+		PTR_FAIL_IF(store_word(compiler, dst_r, dst, dstw, tmp1, 0));
+
+	return put_label;
+}
+
+#undef tmp1
+#undef tmp0
+
+/* TODO(carenas): #undef other helper macros that leak out of this file, e.g. is_u12? */
diff --git a/test_src/sljitTest.c b/test_src/sljitTest.c
index 162b33835..1a7c9a3c3 100644
--- a/test_src/sljitTest.c
+++ b/test_src/sljitTest.c
@@ -3779,6 +3779,13 @@ static void test41(void)
 		| (sljit_get_register_index(SLJIT_S0) << 14)
 		| sljit_get_register_index(SLJIT_S1);
 	sljit_emit_op_custom(compiler, &inst, sizeof(sljit_u32));
+#elif (defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X)
+	/* agrk rd, rs1, rs2 */
+	inst = (0xb9e8 << 16)
+		| (sljit_get_register_index(SLJIT_RETURN_REG) << 4)
+		| (sljit_get_register_index(SLJIT_S0) << 12)
+		| sljit_get_register_index(SLJIT_S1);
+	sljit_emit_op_custom(compiler, &inst, sizeof(inst));
 #else
 	inst = 0;
 	sljit_emit_op_custom(compiler, &inst, 0);
@@ -6303,10 +6310,10 @@ static void test64(void)
 #if (defined SLJIT_64BIT_ARCHITECTURE && SLJIT_64BIT_ARCHITECTURE)
 	sljit_sw offs1 = SLJIT_W(0x123456781122);
 	sljit_sw offs2 = SLJIT_W(0x1234567811223344);
-#else
+#else /* !SLJIT_64BIT_ARCHITECTURE */
 	sljit_sw offs1 = 0x12345678;
 	sljit_sw offs2 = 0x80000000;
-#endif
+#endif /* SLJIT_64BIT_ARCHITECTURE */
 
 	if (verbose)
 		printf("Run test64\n");