sylware / nyanlinux (public) (License: AFFERO GPLv3) (since 2019-09-09) (hash sha1)
scripts for a lean, from scratch, amd hardware, linux distro

/files/stg.tar.xz (402273b4c0d5d401bcaa5c661fc6cd7b8019ab39) (143360 bytes) (mode 100644) (type blob)

./stg/0000777000175000017500000000000013707412666007647 5ustar  user./stg/a.out0000755000175000017500000002074013672200621010605 0ustar  userELF>�@ @8	@@@@��888�� �
�
 �
 Xx �
�
 �
 ��TTTDDP�td���<<Q�tdR�td�
�
 �
 ((/lib64/ld-linux-x86-64.so.2GNU GNUԟ�/�ߕ�>�Qb��?3 !O ^ r "libc.so.6calloc__cxa_finalize__libc_start_main_ITM_deregisterTMCloneTable__gmon_start___Jv_RegisterClasses_ITM_registerTMCloneTableGLIBC_2.2.5ui	��
 ��
 @( ( � � � � � �  H��H��
 H��t��H����5�
 �%�
 @�%�
 h����%�
 f�1�I��^H��H���PTL��H�
cH�=�.
 �DH�=y
 H�y
 UH)�H��H��vH��	 H��t	]��fD]�@f.�H�=9
 H�52
 UH)�H��H��H��H��?H�H�tH��	 H��t]��f�]�@f.��=�	 u'H�=�	 UH��tH�=�	 �
����H���]��	 ��@f.�H�=a H�?u�^���fDH�I	 H��t�UH���]�@���UH��SH����
���H�q	 H�j	 ��
�{���H��H��[]�f.�f�AWAVA��AUATL�%� UH�-� SI��I��L)�H��H�����H��t 1��L��L��D��A��H��H9�u�H��[]A\A]A^A_Ðf.���H��H���;8�����������T,����|������,zRx�����+zRx�$@��� FJw�?;*3$"D8���\`���DA�C
E�zD|����eB�B�E �B(�H0�H8�M@r8A0A(B BBB������@0
t�
 �
 ���o�x�
� @�	���o���o ���o�o���o�
 f( GCC: (Debian 6.3.0-18+deb9u1) 6.3.0 201705168Tt��x 	@
0P
p�t����
 �
 �
 �
 �    0 ��
 ��.@D0 S�
 z���
 �������
 ���
 ��
 ��
 �� p, �   H@ L0 &tSr�  � �( ���e�P ��+�0 ��D� �0 � "�0crtstuff.c__JCR_LIST__deregister_tm_clones__do_global_dtors_auxcompleted.6972__do_global_dtors_aux_fini_array_entryframe_dummy__frame_dummy_init_array_entrytest.c__FRAME_END____JCR_END____init_array_end_DYNAMIC__init_array_start__GNU_EH_FRAME_HDR_GLOBAL_OFFSET_TABLE___libc_csu_fini_ITM_deregisterTMCloneTablectx_edata__libc_start_main@@GLIBC_2.2.5calloc@@GLIBC_2.2.5__data_start__gmon_start____dso_handle_IO_stdin_used__libc_csu_init__bss_startmain_Jv_RegisterClasses__TMC_END___ITM_registerTMCloneTable__cxa_finalize@@GLIBC_2.2.5.symtab.strtab.shstrtab.interp.note.ABI-tag.note.gnu.build-id.gnu.hash.dynsym.dynstr.gnu.version.gnu.version_r.rela.dyn.rela.plt.init.plt.got.text.fini.rodata.eh_frame_hdr.eh_frame.init_array.fini_array.jcr.dynamic.got.plt.data.bss.comment88#TT 1tt$D���o��N���Vxx�^���ok���o   z@@��B�00�PP �pp�����tt	������<�����
 �
��
 �
��
 �
��
 �
��� �0�  �   �0 0 00-`x/	�5
./stg/local.c0000666000175000017500000013433313707305623011106 0ustar  user#ifndef STG_LOCAL_C
#define STG_LOCAL_C
/*NSPC*/
STATIC u32 rd_w_le32(struct stg_ctx_t *ctx, u32 idx)
{
	/* read one little-endian module word, convert it to host order */
	u32 raw_w;

	raw_w = ctx->input.spirv.ws[idx];
	return le32toh(raw_w);
}
/*NSPC*/
STATIC u32 rd_w_be32(struct stg_ctx_t *ctx, u32 idx)
{
	/* read one big-endian module word, convert it to host order */
	u32 raw_w;

	raw_w = ctx->input.spirv.ws[idx];
	return be32toh(raw_w);
}
/*NSPC*/
/*
 * Detect the module endianness from the spirv magic number (0x07230203)
 * and install the matching word reader in the private context.
 * FIX: the original used '||', so a single matching byte (e.g. in a
 * corrupt magic) classified the module as big endian; all four bytes
 * must match.
 */
STATIC void endianess(struct stg_ctx_t *ctx)
{
	u8 *magic_nr;
	struct stg_ctx_private_t *p;

	p = ctx->private;

	magic_nr = (u8*)ctx->input.spirv.ws;
	if (	magic_nr[0] == 0x07
	    &&	magic_nr[1] == 0x23
	    &&	magic_nr[2] == 0x02
	    &&	magic_nr[3] == 0x03)
		/* big endian */
		p->rd_w = rd_w_be32;
	else
		/* little endian */
		p->rd_w = rd_w_le32;
}
#define NOT_FOUND	0
#define FOUND		1
/*NSPC*/
/*
 * Scan the annotations section for an OpDecorate (71) that gives the
 * Location (30) decoration of 'id'. On FOUND, *location holds the
 * location number; on NOT_FOUND, *location is untouched.
 */
STATIC u8 location_lookup(struct stg_ctx_t *ctx, u32 *location, u32 id)
{
	struct stg_ctx_private_t *p;
	u32 i;

	p = ctx->private;
	i = p->layout.annotations.start_spirv_idx;
	while (i < p->layout.annotations.end_spirv_idx) {
		u32 op;

		op = RDW(i);
		if (OP_ENUMERANT(op) == 71 /* opdecorate */
				&& RDW(i + 1) == id
				&& RDW(i + 2) == 30) { /* location */
			*location = RDW(i + 3);
			return FOUND;
		}
		i += OP_WS_N(op);
	}
	return NOT_FOUND;
}
#undef NOT_FOUND
#undef FOUND
/*NSPC*/
/*
 * Find the spirv word index of the OpFunction (54) whose result id is
 * 'id'. Returns 0 when no such function exists before the end of the
 * module.
 */
STATIC u32 lookup_opfunction(struct stg_ctx_t *ctx, u32 id)
{
	struct stg_ctx_private_t *p;
	u32 i;

	p = ctx->private;
	i = p->layout.funcs.start_spirv_idx;
	while (i < ctx->input.spirv.ws_n) {
		u32 op;

		op = RDW(i);
		/* word 2 of an opfunction is its result id */
		if (OP_ENUMERANT(op) == 54 && RDW(i + 2) == id)
			return i;
		i += OP_WS_N(op);
	}
	return 0; /* not found */
}
/*NSPC*/
/*
 * Append one 32 bits little-endian word to the output hardware program.
 * FIX: the original assigned the realloc result straight to
 * ctx->output.hw; on failure realloc returns 0 and the old buffer
 * leaked (and was lost). Keep the old pointer until realloc succeeds.
 */
STATIC u8 hw_emit_w(struct stg_ctx_t *ctx, u32 w)
{
	u8 *p;
	void *new_hw;

	new_hw = realloc(ctx->output.hw, ctx->output.hw_bytes_n + 4);
	if (new_hw == 0) {
		LOG_ERR("unable to allocate more memory for hw program\n");
		return FAILURE;
	}
	ctx->output.hw = new_hw;
	p = ctx->output.hw + ctx->output.hw_bytes_n;
	*(u32*)p = htole32(w);
	ctx->output.hw_bytes_n += 4;
	return SUCCESS;
}
#define DFMT_32 4
#define NFMT_FLOAT 7
#define TBUFFER_LOAD_FORMAT_X 0
#define SOFFSET_ZERO 128
#define MTBUF 0b111010
/*NSPC*/
/*
 * Emit a MTBUF tbuffer_load_format_x: load one 32 bits float component
 * into dst_vgpr_idx from the buffer described by the 4 sgprs starting
 * at buf_desc_sgprs_idx, at 'offset' bytes plus the hardware-loaded
 * vertex idx in v0.
 */
STATIC u8 tbuffer_load_format_x(struct stg_ctx_t *ctx, u32 dst_vgpr_idx,
	u32 buf_desc_sgprs_idx, u16 offset, bool miss_l1_cache)
{
	u32 w0;
	u32 w1;
	u8 r;
	struct s_waitcnt_t wait;
	struct stg_ctx_private_t *p;
	/*--------------------------------------------------------------------*/
	/* pre-s_waitcnt */
	s_waitcnt_init(ctx, &wait);
	wait.lgkm_cnt_zero = s_waitcnt_are_sgprs_loading(ctx,
							buf_desc_sgprs_idx, 2);
	/* check vm_cnt overflow */
	p = ctx->private;
	if ((p->s_waitcnt.vm_cnt.stack.n + 1) > p->s_waitcnt.vm_cnt.n_max) {
		/* need to make room for one inst */
		wait.use_vm_cnt = true;
		wait.vm_cnt = p->s_waitcnt.vm_cnt.n_max - 1;
	}
	/* export */
	wait.exp_cnt_zero = p->vgprs.array[dst_vgpr_idx].pending_export;
	/* emmission and state update */
	r = s_waitcnt(ctx, &wait);
	if (r != SUCCESS) {
		LOG_ERR("tbuffer_load_format_x:unable to emit pre-s_waitcnt\n");
		return FAILURE;
	}
	/* loads and stores are in-order on dst_vgpr_idx, hence no pb */
	/*--------------------------------------------------------------------*/
	LOG("hw:tbuffer_load_format_x dst=v%u buf_desc.sgprs.idx=%u offset=%u miss_l1_cache=%s\n", dst_vgpr_idx, buf_desc_sgprs_idx, offset, miss_l1_cache ? "yes" : "no");
	w0 = 0;
	w0 |= (u32)offset;
	w0 |= 1 << 13; /* enable an idx in a vgpr */
	/* FIX(comment): bit 14 is the cache behavior, not idx loading */
	w0 |= (miss_l1_cache ? 1 : 0)  << 14; /* glc: miss the L1 cache */
	w0 |= TBUFFER_LOAD_FORMAT_X << 16;
	w0 |= DFMT_32 << 19;
	w0 |= NFMT_FLOAT << 23;
	w0 |= MTBUF << 26;

	w1 = 0;
	w1 |= dst_vgpr_idx << (40 - 32);
	w1 |= (buf_desc_sgprs_idx >> 2) << (48 - 32); /* aligned on 4 sgprs */
	w1 |= SOFFSET_ZERO << (56 - 32);
	/* idx is pre-loaded in v0 */

	r = hw_emit_w(ctx, w0);
	if (r != SUCCESS) {
		LOG_ERR("tbuffer_load_format_xyzw:unable to emit the first machine instruction words\n");
		return FAILURE;
	}
	r = hw_emit_w(ctx, w1);
	if (r != SUCCESS) {
		LOG_ERR("tbuffer_load_format_xyzw:unable to emit the second machine instruction words\n");
		return FAILURE;
	}
	/*--------------------------------------------------------------------*/
	s_waitcnt_post_vgpr_load(ctx, dst_vgpr_idx);
	return SUCCESS;
}
/*
 * FIX: the original #undef names (DFMT_32_32_32_32,
 * TBUFFERLOAD_FORMAT_XYZW) did not match the #defines above, leaking
 * DFMT_32, TBUFFER_LOAD_FORMAT_X and MTBUF into the rest of the file
 */
#undef DFMT_32
#undef NFMT_FLOAT
#undef TBUFFER_LOAD_FORMAT_X
#undef SOFFSET_ZERO
#undef MTBUF
/*
 * free and reset a pointer lvalue.
 * FIX: wrapped in do { } while (0) so the macro behaves as a single
 * statement (the original unbraced 'if' was a dangling-else hazard);
 * the NULL guard is dropped since free(NULL) is a no-op.
 */
#define free_zero(a) \
do { \
	free(a); \
	a = 0; \
} while (0)
/*NSPC*/
/* release every allocation owned by the private context, then the
 * context itself */
STATIC void ctx_private_del(struct stg_ctx_private_t *p)
{
	u32 i;

	free_zero(p->vgprs.array);
	free_zero(p->sgprs.array);
	free_zero(p->s_waitcnt.vm_cnt.stack.slots);
	i = 0;
	loop {
		if (i == p->ids.n)
			break;
		/* only these id types own a vgprs.idxs allocation */
		switch (p->ids.array[i].type) {
		case STG_ID_TYPE_VGPRS:
		case STG_ID_TYPE_CST:
			free_zero(p->ids.array[i].vgprs.idxs);
			break;
		default:
			break;
		}
		++i;
	}
	free(p);
}
/*NSPC*/
/*
 * Look up OpTypeVector (23) with result id 'type_id' in the
 * non-function declarations section and store its component count in
 * *cpnts_n.
 * FIX: the original executed RDW(i) BEFORE the end-of-section check,
 * reading one word out of bounds on the last iteration (out of the
 * module itself when the section ends the module).
 */
STATIC u8 type_vec_cpnts_n(struct stg_ctx_t *ctx, u32 *cpnts_n, u32 type_id)
{
	struct stg_ctx_private_t *p;
	u32 i;

	p = ctx->private;
	i = p->layout.nonfunc_decls.start_spirv_idx;
	loop {
		u32 op;

		if (i >= p->layout.nonfunc_decls.end_spirv_idx) {
			LOG_ERR("unable to find a vector type of id %u in non-function declarations section\n", type_id);
			return FAILURE;
		}
		op = RDW(i);
		if (OP_ENUMERANT(op) == 23) { /* optypevector */
			if (type_id == RDW(i + 1)) {
				*cpnts_n = RDW(i + 3); /* component count */
				return SUCCESS;
			}
		}
		i += OP_WS_N(op);
	}
	/* unreachable */
}
/*NSPC*/
/*
 * Set the per-machine limits (register file sizes, vm_cnt depth) and
 * allocate the vgpr/sgpr trackers and the vector memory access stack.
 */
STATIC u8 machine_csts(struct stg_ctx_t *ctx)
{
	struct stg_ctx_private_t *p;
	u32 i;

	p = ctx->private;
	switch (ctx->input.machine) {
	case STG_MACHINE_GFX_6:
		p->vgprs.n_max = 256;
		p->sgprs.n_max = 104;
		p->s_waitcnt.vm_cnt.n_max= 16;
		break;
	default:
		LOG_ERR("machine constants:unknown machine\n");
		return FAILURE;
	}
	/*--------------------------------------------------------------------*/
	p->vgprs.array = calloc(p->vgprs.n_max, sizeof(*p->vgprs.array));
	if (p->vgprs.array == 0) {
		LOG_ERR("machine constants:unable to alloc memory for the vgprs tracker\n");
		return FAILURE;
	}
	/* XXX book the first vgprs for the loaded-by-hardware idx, always */
	p->vgprs.array[0].free = false;
	i = 1;
	loop {
		if (i == p->vgprs.n_max)
			break;
		p->vgprs.array[i].free = true;
		++i;
	}
	/*--------------------------------------------------------------------*/
	p->sgprs.array = calloc(p->sgprs.n_max, sizeof(*p->sgprs.array));
	if (p->sgprs.array == 0) {
		LOG_ERR("machine constants:unable to alloc memory for the sgprs tracker\n");
		return FAILURE;
	}
	/*
	 * NOTE(review): this loop starts at 1 and calloc zeroed the array,
	 * so sgprs.array[0].free stays false and sgpr 0 is never
	 * allocatable; unlike the vgpr case above there is no comment
	 * booking it — confirm this is intentional
	 */
	i = 1;
	loop {
		if (i == p->sgprs.n_max)
			break;
		p->sgprs.array[i].free = true;
		++i;
	}
	/*--------------------------------------------------------------------*/
	p->s_waitcnt.vm_cnt.stack.slots = calloc( p->s_waitcnt.vm_cnt.n_max,
				sizeof(*p->s_waitcnt.vm_cnt.stack.slots));
	if (p->s_waitcnt.vm_cnt.stack.slots == 0) {
		LOG_ERR("machine constants:unable to alloc memory for the vector memory access stack\n");
		return FAILURE;
	}
	return SUCCESS;
}
/*NSPC*/
/*
 * Allocate the first free vgpr; its index is stored in *vgpr.
 * On failure *vgpr ends up equal to vgprs.n_max.
 */
STATIC u8 vgpr_alloc(struct stg_ctx_t *ctx, u32 *vgpr)
{
	struct stg_ctx_private_t *p;

	p = ctx->private;
	/* linear scan for the first free vgpr */
	for (*vgpr = 0; *vgpr < p->vgprs.n_max; ++(*vgpr)) {
		if (p->vgprs.array[*vgpr].free) {
			p->vgprs.array[*vgpr].free = false;
			return SUCCESS;
		}
	}
	LOG_ERR("vgpr_alloc:unable to find an available vgpr\n");
	return FAILURE;
}
/*NSPC*/
/*
 * Allocate n vgprs, one at a time, storing their indexes in vgprs[0..n-1].
 * FIX: log message said "unable to allocation".
 */
STATIC u8 vgprs_alloc(struct stg_ctx_t *ctx, u32 *vgprs, u32 n)
{
	u32 i;

	i = 0;
	loop {
		u8 r;

		if (i == n)
			return SUCCESS;
		r = vgpr_alloc(ctx, vgprs + i);
		if (r != SUCCESS) {
			LOG_ERR("vgprs_alloc:unable to allocate vgpr number %u\n", i);
			return FAILURE;
		}
		++i;
	}
	/* unreachable */
}
/*
 * NOTE: some spirv modules out there use a huge sparse range of ids. If I
 * understood well, ids are supposed to be compact without holes. Don't know
 * why those real-life modules don't have a sensible id management.
 */
/*NSPC*/
/*
 * Append a new slot to the id table for spirv id 'id'; *idx receives
 * its index. Only the id field is initialized here.
 * FIX: the original assigned the realloc result straight to
 * p->ids.array, losing (and leaking) the table on allocation failure.
 */
STATIC u8 id_alloc(struct stg_ctx_t *ctx, u32 *idx, u32 id)
{
	struct stg_ctx_private_t *p;
	void *new_array;

	p = ctx->private;
	new_array = realloc(p->ids.array, sizeof(*(p->ids.array))
							* (p->ids.n + 1));
	if (new_array == 0) {
		LOG_ERR("unable to allocate a new id\n");
		return FAILURE;
	}
	p->ids.array = new_array;
	*idx = p->ids.n;
	++(p->ids.n);
	p->ids.array[*idx].id = id;
	return SUCCESS;
}
/*NSPC*/
/*
 * Find the table index of spirv id 'id'; FAILURE when unknown.
 * On failure *idx ends up equal to ids.n.
 */
STATIC u8 id_idx_lookup(struct stg_ctx_t *ctx, u32 *idx, u32 id)
{
	struct stg_ctx_private_t *p;

	p = ctx->private;
	for (*idx = 0; *idx < p->ids.n; ++(*idx)) {
		if (p->ids.array[*idx].id == id)
			return SUCCESS;
	}
	return FAILURE;
}
/*NPSC*/
/*
 * Allocate the first free sgpr, skipping the hardware pre-loaded
 * "user" sgprs; the top 2 sgprs are booked for the vcc register.
 */
STATIC u8 sgpr_alloc(struct stg_ctx_t *ctx, u32 *sgpr)
{
	struct stg_ctx_private_t *p;

	p = ctx->private;
	/* book the top 2 sgprs for vcc register, see docs */
	for (*sgpr = 0; *sgpr < (p->sgprs.n_max - 2); ++(*sgpr)) {
		/* never hand out a pre-loaded "user" sgpr */
		if (*sgpr < STG_USER_SGPRS_N_MAX
					&& ctx->input.user_sgprs[*sgpr])
			continue;
		if (p->sgprs.array[*sgpr].free) {
			p->sgprs.array[*sgpr].free = false;
			return SUCCESS;
		}
	}
	LOG_ERR("sgpr_alloc:unable to find an available sgpr\n");
	return FAILURE;
}
/*NPSC*/
/* allocate n (not necessarily consecutive) sgprs into sgprs[0..n-1] */
STATIC u8 sgprs_alloc(struct stg_ctx_t *ctx, u32 *sgprs, u32 n)
{
	u32 i;

	for (i = 0; i < n; ++i) {
		if (sgpr_alloc(ctx, sgprs + i) != SUCCESS) {
			LOG_ERR("sgprs_alloc:unable to alloc sgpr nr %u\n", i);
			return FAILURE;
		}
	}
	return SUCCESS;
}
#define s p->sgprs.array
#define is_user_sgprs(s) (s < STG_USER_SGPRS_N_MAX && ctx->input.user_sgprs[s])
/*NPSC*/
/*
 * Allocate a block of n consecutive sgprs whose first index is a
 * multiple of 'align'; *sgprs receives the first index. Hardware
 * pre-loaded "user" sgprs are skipped and the top 2 sgprs stay booked
 * for the vcc register.
 */
STATIC u8 sgprs_blk_alloc(struct stg_ctx_t *ctx, u32 *sgprs, u32 n, u32 align)
{
	struct stg_ctx_private_t *p;
	u32 first;

	p = ctx->private;
	first = 0;
	/* outer loop: candidate starting sgpr */
	loop {
		/* book the top 2 sgprs for vcc register, see docs */
		if (first >= (p->sgprs.n_max - 2)) {
			LOG_ERR("sgprs allocation:unable to find a suitable starting sgpr for %u sgprs with %u alignment\n", n, align);
			return FAILURE;
		}
		/* skip the sgprs pre-loaded for the "user" */
		if (is_user_sgprs(first)) {
			++first;
			continue;
		}
		if (s[first].free) {
			if ((first % align) == 0) {
				u32 i;
				u32 end;

				i = first + 1;
				/* idx past the last one to check */
				end = first + n;
				/*
				 * inner loop checking the availability of n
				 * consecutive vgprs
				 */
				loop {
					if (i == end) { /* book the sgprs */
						i = first; /* reuse i */
						loop {
							if (i == end)
								break;
							s[i].free = false;
							++i;
						}
						*sgprs = first;
						return SUCCESS;
					}
					if (i >= (p->sgprs.n_max - 2)) {
						LOG_ERR("sgprs allocation:unable to find a long enough sequence of %u sgprs with %u alignment\n", n, align);
						return FAILURE;
					}
					/* a hole: restart from next candidate */
					if (is_user_sgprs(i) || !s[i].free)
						break;
					++i;
				}
			}
		}
		++first;
	}
	/* unreachable */
}
#undef s
#undef is_user_sgprs
#define S_MOV_B32 3
#define SOP1 0b101111101
/*NPSC*/
/* emit a SOP1 s_mov_b32 copying src_sgpr into dst_sgpr */
STATIC u8 s_mov_b32_sgpr(struct stg_ctx_t *ctx, u32 dst_sgpr, u32 src_sgpr)
{
	u32 inst;

	LOG("hw:s_mov_b32_sgpr dst=s%u src=s%u\n", dst_sgpr, src_sgpr);
	/* ssrc0 | op << 8 | sdst << 16 | encoding << 23 */
	inst = src_sgpr
		| (S_MOV_B32 << 8)
		| (dst_sgpr << 16)
		| (SOP1 << 23);

	if (hw_emit_w(ctx, inst) != SUCCESS) {
		LOG_ERR("s_mov_b32_sgpr:unable to emit the machine instruction word\n");
		return FAILURE;
	}
	return SUCCESS;
}
#undef S_MOV_B32
#undef SOP1
#define S_MOV_B32 3
#define SOP1 0b101111101
/*NPSC*/
/* emit a SOP1 s_mov_b32 loading the 32 bits immediate into dst_sgpr */
STATIC u8 s_mov_b32_imm(struct stg_ctx_t *ctx, u32 dst_sgpr, u32 imm)
{
	u32 inst;

	LOG("hw:s_mov_b32_imm dst=s%u imm=0x%08x\n", dst_sgpr, imm);
	/* ssrc0 = 255 selects the trailing literal constant word */
	inst = 255
		| (S_MOV_B32 << 8)
		| (dst_sgpr << 16)
		| (SOP1 << 23);

	if (hw_emit_w(ctx, inst) != SUCCESS) {
		LOG_ERR("s_mov_b32_imm:unable to emit the machine instruction word\n");
		return FAILURE;
	}
	/* the literal constant follows the instruction word */
	if (hw_emit_w(ctx, imm) != SUCCESS) {
		LOG_ERR("s_mov_b32_imm:unable to emit the literal constant word\n");
		return FAILURE;
	}
	return SUCCESS;
}
#undef S_MOV_B32
#undef SOP1
/*NSPC*/
/*
 * Materialize the 64 bits address of the array of buffer descriptors
 * into 2 freshly allocated consecutive sgprs: the low 32 bits are
 * copied from the hardware pre-loaded "user" sgpr, the high 32 bits
 * come from an immediate. Sets addr.ready on success.
 */
STATIC u8 buf_descs_array_addr_merge(struct stg_ctx_t *ctx)
{
	struct stg_ctx_private_t *p;
	u8 r;

	p = ctx->private;
	/*
	 * we align on a boundary of 2 words to fit requirements of hopefully
	 * all the machine insts which will use the addr
	 */
	r = sgprs_blk_alloc(ctx, &p->bindings.buf_descs_array.addr.sgprs, 2, 2);
	if (r != SUCCESS) {
		LOG_ERR("array of buffer descriptors: unable to allocate 2 sgprs for the 64bits address\n");
		return FAILURE;
	}
	/* low 32 bits: sgpr to sgpr copy */
	r = s_mov_b32_sgpr(ctx, p->bindings.buf_descs_array.addr.sgprs,
				ctx->input.bindings.addr.lo_32bits_sgpr);
	if (r != SUCCESS) {
		LOG_ERR("unable to move the low 32 bits of the address of the array of buffer descriptors from the \"user\" s%u to s%u\n", ctx->input.bindings.addr.lo_32bits_sgpr, p->bindings.buf_descs_array.addr.sgprs);
		return FAILURE;
	}
	/* high 32 bits: immediate load into the second sgpr of the pair */
	r = s_mov_b32_imm(ctx, p->bindings.buf_descs_array.addr.sgprs + 1,
					ctx->input.bindings.addr.hi_32bits);
	if (r != SUCCESS) {
		LOG_ERR("unable to load the high 32 bits of the address of the array of buffer descriptors, 0x%08x, in s%u\n", ctx->input.bindings.addr.hi_32bits, p->bindings.buf_descs_array.addr.sgprs + 1);
		return FAILURE;
	}
	p->bindings.buf_descs_array.addr.ready = true;
	return SUCCESS;
}
#define s ((struct stg_ctx_private_t*)ctx->private)->sgprs.array
#define S_LOAD_DWORDX4 2
#define SMRD 0b11000
/*NSPC*/
/*
 * Emit a SMRD s_load_dwordx4: load 4 consecutive sgprs starting at
 * dst_sgprs from the 64 bits address in the sgpr pair src_addr_sgprs
 * plus offset_ws_n words, and mark them as pending offchip loads.
 */
STATIC u8 s_load_dwordx4(struct stg_ctx_t *ctx, u32 dst_sgprs,
					u32 src_addr_sgprs, u8 offset_ws_n)
{
	u32 w;
	u8 r;
	u32 i;
	struct s_waitcnt_t wait;
	/*--------------------------------------------------------------------*/
	/* s_waitcnt prolog */
	s_waitcnt_init(ctx, &wait);
	/*
	 * NOTE(review): the address operand is an sgpr pair, yet 4 sgprs
	 * are checked for pending loads on src_addr_sgprs — confirm the
	 * second argument should not be 2
	 */
	wait.lgkm_cnt_zero = s_waitcnt_are_sgprs_loading(ctx, dst_sgprs, 4)
			|| s_waitcnt_are_sgprs_loading(ctx, src_addr_sgprs, 4);
	r = s_waitcnt(ctx, &wait);
	if (r != SUCCESS) {
		LOG_ERR("s_load_dwordx4:unable to emit required prolog s_waitcnt machine instruction\n");
		return FAILURE;
	}
	/*--------------------------------------------------------------------*/
	LOG("hw:s_load_dwordx4 dst=s[%u:4] src.addr=s[%u:2] add_offset_dws_n=%u\n", dst_sgprs, src_addr_sgprs, offset_ws_n);
	w = 0;
	w |= offset_ws_n;
	w |= 1 << 8; /* enable above w offset */
	w |= (src_addr_sgprs >> 1) << 9; /* aligned on 2 sgprs */
	w |= dst_sgprs << 15;
	w |= S_LOAD_DWORDX4 << 22;
	w |= SMRD << 27;

	r = hw_emit_w(ctx, w);
	if (r != SUCCESS) {
		LOG_ERR("s_load_dwordx4:unable to emit the machine instruction word\n");
		return FAILURE;
	}
	/*--------------------------------------------------------------------*/
	/* we must track the offchip mem load */
	i = 0;
	loop {
		if (i == 4)
			break;
		s[dst_sgprs + i].pending_offchip_mem_load = true;
		++i;
	}
	return SUCCESS;
}
#undef S_LOAD_DWORDX4
#undef s
#undef SMRD
#define binding p->bindings.array[binding_nr]
#define s p->sgprs.array
/*NSPC*/
/*
 * Return (in *sgprs) the first sgpr of the 4-sgpr buffer descriptor of
 * binding_nr, loading it from the array of buffer descriptors on first
 * use. The descriptor address is merged lazily the first time any
 * binding needs it.
 */
STATIC u8 binding_buf_desc_sgprs_get(struct stg_ctx_t *ctx, u32 *sgprs,
								u32 binding_nr)
{
	struct stg_ctx_private_t *p;
	u8 r;

	p = ctx->private;
	/* lazy: build the 64 bits base address once */
	if (!p->bindings.buf_descs_array.addr.ready) {
		r = buf_descs_array_addr_merge(ctx);
		if (r != SUCCESS) {
			LOG_ERR("unable to generate the address of the array of buffer descriptors needed for binding %u\n", binding_nr);
			return FAILURE;
		}
	}
	/* lazy: load each binding's descriptor only once */
	if (binding.buf_desc.loaded)
		goto exit;
	r = sgprs_blk_alloc(ctx, &binding.buf_desc.sgprs, 4, 4);
	if (r != SUCCESS) {
		LOG_ERR("unable to allocate 4 aligned sgprs for the buffer descriptor of binding %u\n", binding_nr);
		return FAILURE;
	}
	/* XXX: on fresh sgprs we don't need to use s_waitcnt */
	#define OFFSET_WS_N (binding_nr * 4 * 2)
	r = s_load_dwordx4(ctx, binding.buf_desc.sgprs,
			p->bindings.buf_descs_array.addr.sgprs, OFFSET_WS_N);
	#undef OFFSET_WS_N
	if (r != SUCCESS) {
		LOG_ERR("unable to load the buffer descriptor for binding %u\n", binding_nr);
		return FAILURE;
	}
	LOG("binding[%u] buffer descriptor loaded in s[%u:4]\n", binding_nr, binding.buf_desc.sgprs);
	binding.buf_desc.loaded = true;
exit:
	*sgprs = binding.buf_desc.sgprs;
	return SUCCESS;
}
#undef binding
#undef s
// XXX hardcoded for an input location, could we have anything else
// XXX could be optimized to emit tbuffer insts with more words
/* this is a big one, tons of things are happening here */
#define DO_MISS_L1 true
#define location ctx->input.locations.array
#define id p->ids.array
#define NOT_FOUND 0
/*NSPC*/
/*
 * Translate an OpLoad at spirv word index i: resolve the pointer's
 * Location decoration to a binding, lazily load that binding's buffer
 * descriptor, allocate one vgpr per destination vector component, and
 * emit one tbuffer_load_format_x per component.
 */
STATIC u8 opload(struct stg_ctx_t *ctx, u32 i)
{
	struct stg_ctx_private_t *p;
	u8 r;
	u32 dst_id;
	u32 dst_id_idx;
	u32 dst_type_id;
	u32 dst_type_cpnts_n;
	u32 ptr_id;
	u32 nr; /* location nr */
	u32 buf_desc_sgprs; /* idx of the first sgpr of the buf desc */
	u32 c;

	p = ctx->private;
	LOG("OpLoad:idx=%u\n", i);
	/*====================================================================*/
	/* location, a story of sgprs */
	ptr_id = RDW(i + 3); /* opload word 3: the pointer id */
	r = location_lookup(ctx, &nr, ptr_id);
	if (r == NOT_FOUND) {
		LOG_ERR("opload:unable to find location for %%%u\n", ptr_id);
		return FAILURE;
	}
	/* do load the location buf desc if not already done */
	r = binding_buf_desc_sgprs_get(ctx, &buf_desc_sgprs,
							location[nr].binding);
	if (r != SUCCESS) {
		LOG_ERR("opload:unable to load the buffer descriptor for binding %u\n", location[nr].binding);
		return FAILURE;
	}
	/*====================================================================*/
	/* the dst, aka the vgprs */
	/*--------------------------------------------------------------------*/
	dst_type_id = RDW(i + 1); /* opload word 1: the result type id */
	/*
	 * could be generalized to any type, but that would be really bad since
	 * registers are a scarse resource, need more perspective to proceed
	 */
	r = type_vec_cpnts_n(ctx, &dst_type_cpnts_n, dst_type_id);
	if (r != SUCCESS) {
		LOG_ERR("opload:unable to lookup for the number of components of the destination vector\n");
		return FAILURE;
	}
	/*--------------------------------------------------------------------*/
	dst_id = RDW(i + 2); /* opload word 2: the result id */
	r = id_alloc(ctx, &dst_id_idx, dst_id);
	if (r != SUCCESS) {
		LOG_ERR("opload:unable to allocate a new id\n");
		return FAILURE;
	}
	id[dst_id_idx].type = STG_ID_TYPE_VGPRS;
	id[dst_id_idx].vgprs.n = dst_type_cpnts_n;
	id[dst_id_idx].vgprs.idxs = calloc(dst_type_cpnts_n, sizeof(u32));
	if (id[dst_id_idx].vgprs.idxs == 0) {
		LOG_ERR("opload:unable to allocate memory for %u vgpr indexes related to id %u\n", dst_type_cpnts_n, dst_id);
		return FAILURE;
	}
	r = vgprs_alloc(ctx, id[dst_id_idx].vgprs.idxs, dst_type_cpnts_n);
	if (r != SUCCESS) {
		LOG_ERR("opload:unable to allocate %u vgprs for destination %%%u type %%%u\n", dst_type_cpnts_n, dst_id, dst_type_id);
		return FAILURE;
	}
	/* one tbuffer load per component, 4 bytes apart */
	c = 0;
	loop {
		if (c == dst_type_cpnts_n)
			break;
		r = tbuffer_load_format_x(ctx, id[dst_id_idx].vgprs.idxs[c],
			buf_desc_sgprs, location[nr].inner_offset + c * 4,
								DO_MISS_L1);
		if (r != SUCCESS) {
			LOG_ERR("opload:unable to generate the tbuffer_load_format_x machine instruction\n");
			return FAILURE;
		}
		++c;
	}
	return SUCCESS;
}
#undef location
#undef id
#undef DO_MISS_L1
#undef NOT_FOUND
/*NSPC*/
/*
 * Write a human readable name for an "exp" instruction target into
 * str (at most sz bytes).
 * FIX: unhandled targets (10, 11, 16..31, >63) left str untouched —
 * the caller passes an uninitialized buffer — now always terminated.
 * The tautological '0 <= target' on an unsigned was dropped.
 */
STATIC void exp_target_to_str(u8 *str, u32 sz, u8 target)
{
	if (target <= 7) {
		snprintf(str, sz, "mrt%u", target);
		return;
	}
	if (target == 8) {
		snprintf(str, sz, "z");
		return;
	}
	if (target == 9) {
		snprintf(str, sz, "null");
		return;
	}
	if (12 <= target && target <= 15) {
		snprintf(str, sz, "pos%u", target - 12);
		return;
	}
	if (32 <= target && target <= 63) {
		snprintf(str, sz, "param%u", target - 32);
		return;
	}
	snprintf(str, sz, "unknown");
}
#define id p->ids.array
/*NSPC*/
/*
 * Write a human readable value of the constant at table index
 * cst_id_idx into str (at most sz bytes); only f32 constants are
 * handled so far.
 * FIX: removed the unused local 'u8 r;'.
 */
STATIC void cst_to_str(struct stg_ctx_t *ctx, u8 *str, u32 sz, u32 cst_id_idx)
{
	u32 cst_type_id;
	struct stg_ctx_private_t *p;

	p = ctx->private;
	if (id[cst_id_idx].type != STG_ID_TYPE_CST) {
		snprintf(str, sz, "constant string conversion failure, id is not refering to a constant");
		return;
	}
	/* word 1 of an opconstant is its result type id */
	cst_type_id = RDW(id[cst_id_idx].spirv_idx + 1);
	if (cst_type_id == p->type_f32_id) {
		f32 *f;

		/* word 3 holds the raw 32 bits float value */
		f = (f32*)(ctx->input.spirv.ws + id[cst_id_idx].spirv_idx + 3);
		snprintf(str, sz, "%f", (double)(*f));
		return;
	}
	/* TODO */
	snprintf(str, sz, "constant string conversion failure, unhandled constant type");
}
#undef id
#define LITERAL		1
#define CODE		2
#define id p->ids.array
/*NSPC*/
/*
 * Convert the spirv constant at table index cst_id_idx to a hardware
 * operand: CODE when it maps to an inline constant code (stored in
 * *out), LITERAL when a literal constant word must follow (raw bits in
 * *out), FAILURE for unhandled constant types.
 * FIX: the original fell off the end of the function (UB) for non-f32
 * constant types; now returns FAILURE explicitly.
 * NOTE(review): FAILURE must differ from LITERAL (1) and CODE (2) for
 * the caller's checks to work — confirm its value.
 */
STATIC u8 cst_to_hw(struct stg_ctx_t *ctx, u32 *out, u32 cst_id_idx)
{
	struct stg_ctx_private_t *p;
	u32 cst_spirv_idx;
	u32 type_id;

	p = ctx->private;
	cst_spirv_idx = id[cst_id_idx].spirv_idx;

	type_id = RDW(cst_spirv_idx + 1);
	if (type_id == p->type_f32_id) {
		f32 *val;

		val = (f32*)(ctx->input.spirv.ws + cst_spirv_idx + 3);
		if (*val == 0.0) {
			*out = 128; /* XXX: seems to be a "all types" zero */
			return CODE;
		} else if (*val == 0.5) {
			*out = 240;
			return CODE;
		} else if (*val == -0.5) {
			*out = 241;
			return CODE;
		} else if (*val == 1.0) {
			*out = 242;
			return CODE;
		} else if (*val == -1.0) {
			*out = 243;
			return CODE;
		} else if (*val == 2.0) {
			*out = 244;
			return CODE;
		} else if (*val == -2.0) {
			*out = 245;
			return CODE;
		} else if (*val == 4.0) {
			*out = 246;
			return CODE;
		} else if (*val == -4.0) {
			*out = 247;
			return CODE;
		} else {
			/* no inline code: emit the raw bits as a literal */
			*out = *((u32*)val);
			return LITERAL;
		}
	}
	/* TODO: other constant types */
	LOG_ERR("cst_to_hw:unhandled constant type\n");
	return FAILURE;
}
#undef LITERAL
#undef CODE
#undef id
#define STR_SZ 128
#define V_MOV_B32 1
#define VOP1 0b0111111
#define LITERAL 1
/*NSPC*/
/*
 * Emit a VOP1 v_mov_b32 loading the spirv constant at table index
 * src_cst_id_idx into dst_vgpr_idx, as an inline constant code or a
 * trailing literal word depending on the value.
 * NOTE(review): cst_to_hw's return codes LITERAL (1) and CODE (2) are
 * compared against FAILURE here — they must not collide; confirm
 * FAILURE's value.
 */
STATIC u8 v_mov_b32_cst(struct stg_ctx_t *ctx,  u32 dst_vgpr_idx,
							u32 src_cst_id_idx)
{
	u8 r;
	u32 w;
	u32 cst;
	bool cst_is_literal;
	u8 src_cst_str[STR_SZ];

	r = cst_to_hw(ctx, &cst, src_cst_id_idx);
	if (r == FAILURE) {
		LOG_ERR("v_mov_b32_cst:unable to convert the spirv constant to a hardware machine constant\n");
		return FAILURE;
	}
	w = 0;
	if (r == LITERAL) {
		/* src0 = 255 selects the trailing literal constant word */
		w |= 255;
		cst_is_literal = true;
	} else { /* r == CODE */
		/* src0 is the inline constant code itself */
		w |= (u8)(cst);
		cst_is_literal = false;
	}
	w |= V_MOV_B32 << 9;
	w |= dst_vgpr_idx << 17;
	w |= VOP1 << 25;

	r = hw_emit_w(ctx, w);
	if (r != SUCCESS) {
		LOG_ERR("v_mov_b32_cst:unable to emit the machine instruction word\n");
		return FAILURE;
	}
	if (cst_is_literal) {
		r = hw_emit_w(ctx, cst);
		if (r != SUCCESS) {
			LOG_ERR("v_mov_b32_cst:unable to emit the literal constant word\n");
			return FAILURE;
		}
	}
	cst_to_str(ctx, src_cst_str, STR_SZ, src_cst_id_idx);
	LOG("hw:v_mov_b32_cst dst=v%u src=%s\n", dst_vgpr_idx, src_cst_str);
	return SUCCESS;
}
#undef STR_SZ
#undef V_MOV_B32
#undef VOP1
#undef LITERAL
#define id ((struct stg_ctx_private_t*)ctx->private)->ids.array
/*NSPC*/
/*
 * Translate an OpVectorShuffle at spirv word index i by aliasing
 * destination vgprs onto the source vectors' vgprs (no copies).
 * FIX: in the second (src vec 1) loop the LOG read
 * id[src0_id_idx].vgprs.idxs[cpnt_idx] instead of src1's — wrong log
 * and a potential out-of-bounds read through src0's (possibly shorter)
 * idxs array.
 * NOTE(review): spirv shuffle components index the CONCATENATION of
 * both source vectors; the second loop indexes src1 with the raw
 * component value instead of (value - src0.n) — confirm this matches
 * the modules this code targets.
 */
STATIC u8 opvectorshuffle(struct stg_ctx_t *ctx, u32 i)
{
	u8 r;
	u32 dst_id;
	u32 dst_type_id;
	u32 dst_id_idx;
	u32 src0_id;
	u32 src0_id_idx;
	u32 src1_id;
	u32 src1_id_idx;
	u32 op_e_idx;
	u32 cpnt_op_idx;
	u32 dst_cpnt_idx;
	u32 dst_type_cpnts_n;
	u32 src_cpnt_idx;

	LOG("OpVectorShuffle:idx=%u\n", i);
	dst_type_id = RDW(i + 1);
	r = type_vec_cpnts_n(ctx, &dst_type_cpnts_n, dst_type_id);
	if (r != SUCCESS) {
		LOG_ERR("opvectorshuffle:unable to lookup for the number of components of the destination vector\n");
		return FAILURE;
	}
	dst_id = RDW(i + 2);
	r = id_alloc(ctx, &dst_id_idx, dst_id);
	if (r != SUCCESS) {
		LOG_ERR("opvectorshuffle:unable to allocate an new id\n");
		return FAILURE;
	}
	/*--------------------------------------------------------------------*/
	id[dst_id_idx].type = STG_ID_TYPE_VGPRS;
	id[dst_id_idx].vgprs.idxs = calloc(dst_type_cpnts_n, sizeof(u32));
	if (id[dst_id_idx].vgprs.idxs == 0) {
		LOG_ERR("opvectorshuffle:unable to allocate memory for %u vgpr indexes related to id %u\n", dst_type_cpnts_n, dst_id);
		return FAILURE;
	}
	id[dst_id_idx].vgprs.n = dst_type_cpnts_n;
	/*--------------------------------------------------------------------*/
	src0_id = RDW(i + 3);
	r = id_idx_lookup(ctx, &src0_id_idx, src0_id);
	if (r != SUCCESS) {
		LOG_ERR("opvectorshuffle:unable to find the source0 id\n");
		return FAILURE;
	}
	src1_id = RDW(i + 4);
	r = id_idx_lookup(ctx, &src1_id_idx, src1_id);
	if (r != SUCCESS) {
		LOG_ERR("opvectorshuffle:unable to find the source1 id\n");
		return FAILURE;
	}
	op_e_idx=i + OP_WS_N(RDW(i)); /* idx one past the last component */
	cpnt_op_idx = i + 5; /* first component literal */
	dst_cpnt_idx = 0;
	/*--------------------------------------------------------------------*/
	/* src vec 0 */
	src_cpnt_idx = 0;
	loop {
		u32 cpnt_idx;

		if (cpnt_op_idx >= op_e_idx)
			return SUCCESS;
		if (dst_cpnt_idx >= dst_type_cpnts_n)
			return SUCCESS;
		if (src_cpnt_idx >= id[src0_id_idx].vgprs.n)
			break; /* next vec */
		cpnt_idx = RDW(cpnt_op_idx);
		if (cpnt_idx != 0xffffffff) { /* 0xffffffff = undef component */
			id[dst_id_idx].vgprs.idxs[dst_cpnt_idx] =
					id[src0_id_idx].vgprs.idxs[cpnt_idx];
			LOG("alias v%u from %%%u\n", id[src0_id_idx].vgprs.idxs[cpnt_idx], src0_id);
		}
		++cpnt_op_idx;
		++src_cpnt_idx;
		++dst_cpnt_idx;
	}
	/*--------------------------------------------------------------------*/
	/* src vec 1 */
	src_cpnt_idx = 0;
	loop {
		u32 cpnt_idx;

		if (cpnt_op_idx >= op_e_idx)
			return SUCCESS;
		if (dst_cpnt_idx >= dst_type_cpnts_n)
			return SUCCESS;
		if (src_cpnt_idx >= id[src1_id_idx].vgprs.n)
			break;
		cpnt_idx = RDW(cpnt_op_idx);
		if (cpnt_idx != 0xffffffff) {
			id[dst_id_idx].vgprs.idxs[dst_cpnt_idx] =
					id[src1_id_idx].vgprs.idxs[cpnt_idx];
			LOG("alias v%u from %%%u\n", id[src1_id_idx].vgprs.idxs[cpnt_idx], src1_id);
		}
		++cpnt_op_idx;
		++src_cpnt_idx;
		++dst_cpnt_idx;
	}
	return SUCCESS;
}
#undef id
// XXX hardcoded: lowered to first level component words
// XXX hardcoded: only making a register alias
#define id ((struct stg_ctx_private_t*)ctx->private)->ids.array
/*NSPC*/
/*
 * Translate an OpCompositeExtract at spirv word index i by aliasing
 * the destination onto one vgpr of the source composite — no machine
 * instruction is emitted.
 * NOTE(review): the alias is computed as idxs[0] + inner_idx, which
 * assumes the composite's vgprs are consecutive; vgpr_alloc hands out
 * individual registers, so confirm contiguity always holds here.
 */
STATIC u8 opcompositeextract(struct stg_ctx_t *ctx, u32 i)
{
	u8 r;
	u32 src_composite_id;
	u32 src_composite_id_idx;
	u32 src_composite_vgprs_idx;
	u32 src_composite_inner_idx;
	u32 dst_id;
	u32 dst_id_idx;

	LOG("OpCompositeExtract:idx=%u\n", i);
	src_composite_id = RDW(i + 3); /* word 3: the composite id */
	r = id_idx_lookup(ctx, &src_composite_id_idx, src_composite_id);
	if (r != SUCCESS) {
		LOG_ERR("opcompositeextract:unable to find the source composite id\n");
		return FAILURE;
	}
	dst_id = RDW(i + 2); /* word 2: the result id */
	r = id_alloc(ctx, &dst_id_idx, dst_id);
	if (r != SUCCESS) {
		LOG_ERR("opcompositeextract:unable to allocate an new id for the destination\n");
		return FAILURE;
	}
	id[dst_id_idx].type = STG_ID_TYPE_VGPRS;
	id[dst_id_idx].vgprs.n = 1;
	id[dst_id_idx].vgprs.idxs = calloc(1, sizeof(u32));
	if (id[dst_id_idx].vgprs.idxs == 0) {
		LOG_ERR("opcompositeextract:unable to allocate memory for the vgpr index\n");
		return FAILURE;
	}
	if (id[src_composite_id_idx].type != STG_ID_TYPE_VGPRS) {
		LOG_ERR("opcompositeextract:unsupported src type %%%u\n", src_composite_id);
		return FAILURE;
	}
	src_composite_vgprs_idx = id[src_composite_id_idx].vgprs.idxs[0];
	src_composite_inner_idx = RDW(i + 4); /* word 4: the first literal idx */
	id[dst_id_idx].vgprs.idxs[0] = src_composite_vgprs_idx
						+ src_composite_inner_idx;
	LOG("v%u alias\n", id[dst_id_idx].vgprs.idxs[0]);
	return SUCCESS;
}
#undef id
/* use function macro to avoid the conflict with the id field */
#define id(x) ((struct stg_ctx_private_t*)ctx->private)->ids.array[x]
/*NSPC*/
/*
 * Load the constant at table index id_idx into one freshly allocated
 * vgpr via v_mov_b32, and mark it loaded.
 * FIX: the last LOG_ERR was missing its trailing '\n', inconsistent
 * with every other log message in the file.
 */
STATIC u8 cst_load(struct stg_ctx_t *ctx, u32 id_idx)
{
	u8 r;

	/* only 1 component word of the constant is loaded */
	id(id_idx).vgprs.idxs = calloc(1, sizeof(u32));
	if (id(id_idx).vgprs.idxs == 0) {
		LOG_ERR("cst_load:unable to allocate memory for 1 vgpr index to load the constant %%%u\n", id(id_idx).id);
		return FAILURE;
	}
	r = vgpr_alloc(ctx, id(id_idx).vgprs.idxs);
	if (r != SUCCESS) {
		LOG_ERR("cst_load:unable to allocate a vgpr for constant %%%u\n", id(id_idx).id);
		return FAILURE;
	}
	id(id_idx).vgprs.n = 1;
	r = v_mov_b32_cst(ctx, id(id_idx).vgprs.idxs[0], id_idx);
	if (r != SUCCESS) {
		LOG_ERR("cst_load:unable to emit v_mov_b32 machine instruction word to load %%%u in v%u\n", id(id_idx).id, id(id_idx).vgprs.idxs[0]);
		return FAILURE;
	}
	id(id_idx).cst_loaded = true;
	return SUCCESS;
}
#undef id
#define id(x) ((struct stg_ctx_private_t*)ctx->private)->ids.array[x]
// XXX hardcoded for a vector of floats in vgprs or csts
// XXX hardcoded for vgpr aliases
/*NSPC*/
/*
 * Translate an OpCompositeConstruct at spirv word index i: the
 * destination vector is built by aliasing each component onto its
 * source's vgpr, lazily loading constant sources into vgprs.
 * FIX: removed the unused local 'u32 dst_vgprs_idx;'.
 */
STATIC u8 opcompositeconstruct(struct stg_ctx_t *ctx, u32 i)
{
	u8 r;
	u32 dst_vgprs_n;
	u32 dst_id;
	u32 dst_id_idx;
	u32 v;

	LOG("OpCompositeConstruct:idx=%u\n", i);
	/* one constituent id per word after the 3 leading op words */
	dst_vgprs_n = OP_WS_N(RDW(i)) - 3;
	dst_id = RDW(i + 2);
	/*--------------------------------------------------------------------*/
	r = id_alloc(ctx, &dst_id_idx, dst_id);
	if (r != SUCCESS) {
		LOG_ERR("opcompositeconstruct:unable to allocate an new id for the destination composite\n");
		return FAILURE;
	}
	id(dst_id_idx).type = STG_ID_TYPE_VGPRS;
	id(dst_id_idx).vgprs.n = dst_vgprs_n;
	id(dst_id_idx).vgprs.idxs = calloc(dst_vgprs_n, sizeof(u32));
	if (id(dst_id_idx).vgprs.idxs == 0) {
		LOG_ERR("opcompositeconstruct:unable to allocate memory for vgpr indexes for %%%u\n", dst_id);
		return FAILURE;
	}
	/*--------------------------------------------------------------------*/
	v = 0;
	loop {
		u32 src_id;
		u32 src_id_idx;

		if (v == dst_vgprs_n)
			break;
		src_id = RDW(i + 3 + v);
		r = id_idx_lookup(ctx, &src_id_idx, src_id);
		if (r != SUCCESS) {
			LOG_ERR("opcompositeconstruct:unable to find the source %%%u\n", src_id);
			return FAILURE;
		}
		switch (id(src_id_idx).type) {
		case STG_ID_TYPE_VGPRS:
			// XXX this is an alias
			id(dst_id_idx).vgprs.idxs[v] = id(src_id_idx).vgprs.idxs[0];
			break;
		case STG_ID_TYPE_CST:
			/* we load only 1 cpnt w from the cst */
			if(!id(src_id_idx).cst_loaded) {
				r = cst_load(ctx, src_id_idx);
				if (r != SUCCESS) {
					LOG_ERR("opcompositeconstruct:unable to load constant %%%u\n", id(src_id_idx).id);
					return FAILURE;
				}
			}
			id(dst_id_idx).vgprs.idxs[v] = id(src_id_idx).vgprs.idxs[0];
			break;
		default:
			LOG_ERR("opcompositeconstruct:unsupported source type\n");
			return FAILURE;
		}
		++v;
	}
	return SUCCESS;
}
#undef id
/*NSPC*/
/*
 * Scan the spirv annotations section for an OpDecorate (enumerant 71)
 * decorating "id" with BuiltIn (11) Position (0).
 * returns true when such a decoration exists, false otherwise.
 */
STATIC bool is_builtin_pos(struct stg_ctx_t *ctx, u32 id)
{
	struct stg_ctx_private_t *p;
	u32 w;

	p = ctx->private;

	w = p->layout.annotations.start_spirv_idx;
	loop {
		u32 op;

		if (w >= p->layout.annotations.end_spirv_idx)
			return false;
		op = RDW(w);
		/*
		 * short-circuit keeps the reads of words +2/+3 gated exactly
		 * like the original nested conditionals did
		 */
		if (OP_ENUMERANT(op) == 71	/* opdecorate */
		    && RDW(w + 1) == id		/* target id */
		    && RDW(w + 2) == 11		/* builtin */
		    && RDW(w + 3) == 0)		/* position */
			return true;
		w += OP_WS_N(op);
	}
	/* unreachable */
}
/* XXX hardcoded for a output builtin pos or output location store */
/* XXX hardcoded for an input vec of vgprs */
#define FOUND 1
#define id ((struct stg_ctx_private_t*)(ctx->private))->ids.array
#define EXP 0b111110
#define v p->vgprs.array
#define slot p->s_waitcnt.vm_cnt.stack.slots
/*NSPC*/
/*
 * Translate a spirv OpStore (enumerant 62) at word index i into a hardware
 * export (EXP) instruction: the stored object's vgprs are exported either to
 * the position target or to a parameter target derived from the output
 * location. Emits a pre-s_waitcnt to secure pending off-chip loads first.
 * returns SUCCESS or FAILURE.
 */
STATIC u8 opstore(struct stg_ctx_t *ctx, u32 i)
{
	struct stg_ctx_private_t *p;
	u8 target_str[sizeof("paramXX")];
	u32 ptr_id;
	u32 location; /* output */
	u8 r;
	u8 hw_target;
	u32 obj_id;
	u32 obj_id_idx;
	u32 w0;
	u32 w1;
	u32 j;
	u32 en;
	struct s_waitcnt_t wait;

	p = ctx->private;
	LOG("OpStore:idx=%u\n", i);
	ptr_id = RDW(i + 1); /* pointer operand */
	if (is_builtin_pos(ctx, ptr_id)) {
		/* 12: export target used for the position (see "done" below) */
		hw_target = 12;
	} else {
		r = location_lookup(ctx, &location, ptr_id);
		if (r != FOUND) {
			LOG_ERR("opstore:%%%u is neither a the position builtin or an output location\n", ptr_id);
			return FAILURE;
		}
		/* 32: base of the parameter export targets */
		hw_target = (u8)(location + 32);
	}
	obj_id = RDW(i + 2); /* object operand */
	r = id_idx_lookup(ctx, &obj_id_idx, obj_id);
	if (r != SUCCESS) {
		LOG_ERR("opstore:object %%%u is unknown\n", obj_id);
		return FAILURE;
	}
	if (id[obj_id_idx].type != STG_ID_TYPE_VGPRS) {
		LOG_ERR("opstore:object %%%u has no vgprs\n", obj_id);
		return FAILURE;
	}
	/*--------------------------------------------------------------------*/
	/* pre-s_waitcnt */
	s_waitcnt_init(ctx, &wait);
	p = ctx->private;
	/*
	 * secure any pending offchip mem load:
	 * we will have to wait on the most recent pending load inst to finish,
	 * if it exists, namely the one with the lowest position in the
	 * s_waitcnt vm_cnt stack.
	 * since loads and stores are in-order, "waiting on the most recent
	 * load/store" implies "waiting on all older ones"
	 */
	j = 0;
	loop {
		u32 exp_vgpr;
		u32 most_recent_inst;
		u32 k;

		if (j == id[obj_id_idx].vgprs.n)
			break;
		exp_vgpr = id[obj_id_idx].vgprs.idxs[j];
		if (!v[exp_vgpr].offchip_mem.pending) {
			++j;
			continue;
		}

		most_recent_inst = v[exp_vgpr].offchip_mem.last_s_waitcnt_inst;
		/* find that inst in the vm_cnt stack, keep the lowest slot */
		k = 0;
		loop {
			if (k == p->s_waitcnt.vm_cnt.stack.n)
				break;
			if (slot[k].inst != most_recent_inst
				|| slot[k].type != STG_S_WAITCNT_VM_CNT_LOAD) {
				++k;
				continue;
			}
			if (wait.use_vm_cnt == false) {
				wait.use_vm_cnt = true;
				wait.vm_cnt = k;
			} else if (k < wait.vm_cnt)
				wait.vm_cnt = k;
			++k;
		}
		++j;
	}
	r = s_waitcnt(ctx, &wait);
	if (r != SUCCESS) {
		LOG_ERR("opstore:unable to emit pre-s_waitcnt machine instruction for the upcoming export machine instruction\n");
		return FAILURE;
	}
	/*--------------------------------------------------------------------*/
	exp_target_to_str(target_str, sizeof(target_str), hw_target);
	LOG("hw:export target=%s", target_str);
	j = 0;
	loop {
		if (j == id[obj_id_idx].vgprs.n)
			break;
		LOG(" src%u=v%u", j, id[obj_id_idx].vgprs.idxs[j]);
		++j;
	}
	/* XXX: only 1 posX for now, then the last one */
	if (hw_target == 12)
		LOG(" done");
	LOG("\n");

	/* first export machine word: EN mask, target, DONE bit, EXP encoding */
	w0 = 0;
	j = 0;
	en = 0;
	/* EN: one low bit per exported vgpr component */
	loop {
		if (j >= id[obj_id_idx].vgprs.n)
			break;
		else
			en = en << 1;
		en |= 1;
		++j;
	}
	w0 |= en;
	w0 |= hw_target << 4;
	/* XXX: vtx pos is presumed the last one */
	w0 |= (hw_target == 12 ? 1 : 0) << 11;  /* DONE */
	/* XXX VM = 0, but would be wrong for a pixel shader */
	w0 |= EXP << 26;

	/* second export machine word: one source vgpr index per byte */
	w1 = 0;
	j = 0;
	loop {
		if (j >= id[obj_id_idx].vgprs.n)
			break;
		w1 |= (id[obj_id_idx].vgprs.idxs[j]) << (j * 8);
		++j;
	}

	r = hw_emit_w(ctx, w0);
	if (r != SUCCESS) {
		LOG_ERR("opstore:unable to emit hardware export machine word 0\n");
		return FAILURE;
	}
	r = hw_emit_w(ctx, w1);
	if (r != SUCCESS) {
		LOG_ERR("opstore:unable to emit hardware export machine word 1\n");
		return FAILURE;
	}
	/*--------------------------------------------------------------------*/
	/* mark the exported vgprs: must not be written until exp_cnt drains */
	j = 0;
	loop {
		u32 exp_vgpr;

		if (j == id[obj_id_idx].vgprs.n)
			break;
		exp_vgpr = id[obj_id_idx].vgprs.idxs[j];
		v[exp_vgpr].pending_export = true;
		++j;
	}
	return SUCCESS;
}
#undef FOUND
#undef id
#undef EXP
#undef v
#undef slot
/*NSPC*/
/*
 * Translate the body of the OpFunction whose first word is at index i:
 * dispatch each instruction to its translator until OpReturn (enumerant 253)
 * is reached. Unhandled opcodes are logged and skipped.
 * returns SUCCESS or FAILURE.
 */
STATIC u8 opfunction(struct stg_ctx_t *ctx, u32 i)
{
	/* skip the OpFunction instruction itself */
	i += OP_WS_N(RDW(i));
	loop {
		u8 r;
		u32 opcode;
	
		opcode = RDW(i);	
		switch (OP_ENUMERANT(opcode)) {
		case 61: /* OpLoad */
			r = opload(ctx, i);
			if (r != SUCCESS) {
				LOG_ERR("opfunction:opload processing failure\n");
				return FAILURE;
			}	
			break;
		case 62: /* OpStore */
			r = opstore(ctx, i);
			if (r != SUCCESS) {
				LOG_ERR("opfunction:opstore processing failure\n");
				return FAILURE;
			}
			break;
		case 79: /* OpVectorShuffle */
			r = opvectorshuffle(ctx, i);
			if (r != SUCCESS) {
				LOG_ERR("opfunction:opvectorshuffle processing failure\n");
				return FAILURE;
			}
			break;
		case 80: /* OpCompositeConstruct */
			r = opcompositeconstruct(ctx, i);
			if (r != SUCCESS) {
				LOG_ERR("opfunction:opcompositeconstruct processing failure\n");
				return FAILURE;
			}
			break;
		case 81: /* OpCompositeExtract */
			r = opcompositeextract(ctx, i);
			if (r != SUCCESS) {
				LOG_ERR("opfunction:opcompositeextract processing failure\n");
				return FAILURE;
			}
			break;
		case 253: /* OpReturn: the only loop exit */
			LOG("opreturn, exiting translation\n");
			return SUCCESS;
		/* TODO */
		default:
			/* ignored */
			LOG("opfunction:ignoring opcode %u\n", OP_ENUMERANT(opcode));
			break;
		}
		i += OP_WS_N(opcode);
	}
	/* NOTE(review): unreachable, the loop only exits via return */
	return SUCCESS;
}
/*NSPC*/
/* skip the section of OpCapability (enumerant 17) instructions, advancing *i past it */
STATIC u8 layout_capabilities(struct stg_ctx_t *ctx, u32 *i)
{
	LOG("logical layout, start of section of capabilities at word %u\n", *i);
	loop {
		u32 opcode;

		if (*i >= ctx->input.spirv.ws_n) {
			LOG("logical layout, end of section of capabilities at word %u, end of spirv\n", *i);
			return SUCCESS;
		}
		opcode = RDW(*i);
		if (OP_ENUMERANT(opcode) != 17 /* OpCapability */) {
			LOG("logical layout, end of section of capabilities at word %u, breaking opcode enumerant is %u\n", *i, OP_ENUMERANT(opcode));
			return SUCCESS;
		}
		*i += OP_WS_N(opcode);
	}
	/* unreachable */
}
/*NSPC*/
/*
 * Skip the optional section of OpExtension (enumerant 10) instructions,
 * advancing *i to the first word past it. always returns SUCCESS.
 */
STATIC u8 layout_extensions(struct stg_ctx_t *ctx, u32 *i)
{
	LOG("logical layout, start of optional section of extensions at word %u\n", *i);
	loop {
		u32 op;

		if (*i >= ctx->input.spirv.ws_n) {
			/* BUGFIX: log message said "capabilities" (copy-paste) */
			LOG("logical layout, end of optional section of extensions at word %u, end of spirv\n", *i);
			break;
		}
		op = RDW(*i);
		if (OP_ENUMERANT(op) != 10 /* OpExtension */) {
			/* BUGFIX: log message said "capabilities" (copy-paste) */
			LOG("logical layout, end of optional section of extensions at word %u, breaking opcode enumerant is %u\n", *i, OP_ENUMERANT(op));
			break;
		}
		*i += OP_WS_N(op);
	}
	return SUCCESS;
}
/*NSPC*/
/* skip the optional section of OpExtInstImport (enumerant 11) instructions */
STATIC u8 layout_imports_external_insts(struct stg_ctx_t *ctx, u32 *i)
{
	LOG("logical layout, start of optional section of imports of external instructions at word %u\n", *i);
	loop {
		u32 opcode;

		if (*i >= ctx->input.spirv.ws_n) {
			LOG("logical layout, end of optional section of imports of external instructions at word %u, end of spirv\n", *i);
			return SUCCESS;
		}
		opcode = RDW(*i);
		if (OP_ENUMERANT(opcode) != 11 /* OpExtInstImport */) {
			LOG("logical layout, end of optional section of imports of external instructions at word %u, breaking opcode enumerant is %u\n", *i, OP_ENUMERANT(opcode));
			return SUCCESS;
		}
		*i += OP_WS_N(opcode);
	}
	/* unreachable */
}
/*NSPC*/
/* skip the memory model section (OpMemoryModel, enumerant 14) */
STATIC u8 layout_memory_model(struct stg_ctx_t *ctx, u32 *i)
{
	LOG("logical layout, start of the memory model section at word %u\n", *i);
	loop {
		u32 opcode;

		if (*i >= ctx->input.spirv.ws_n) {
			LOG("logical layout, end of memory model section at word %u, end of spirv\n", *i);
			return SUCCESS;
		}
		opcode = RDW(*i);
		if (OP_ENUMERANT(opcode) != 14 /* OpMemoryModel */) {
			LOG("logical layout, end of memory model section at word %u, breaking opcode enumerant is %u\n", *i, OP_ENUMERANT(opcode));
			return SUCCESS;
		}
		*i += OP_WS_N(opcode);
	}
	/* unreachable */
}
/*NSPC*/
/*
 * Walk the section of OpEntryPoint (enumerant 15) instructions starting at
 * *i, recording its [start,end) word indexes in the private layout so that
 * lookup_entrypoint_id() can rescan it later. *i is left on the first word
 * past the section. always returns SUCCESS.
 */
STATIC u8 layout_entrypoints(struct stg_ctx_t *ctx, u32 *i)
{
	struct stg_ctx_private_t *p;

	p = ctx->private;
	p->layout.entrypoints.start_spirv_idx = *i;
	LOG("logical layout, start of the section of entry points at word %u\n", *i);
	loop {
		u32 op;

		if (*i >= ctx->input.spirv.ws_n) {
			LOG("logical layout, end of section of entry points at word %u, end of spirv\n", *i);
			/*
			 * BUGFIX: end_spirv_idx was not recorded on this path,
			 * leaving it uninitialized for lookup_entrypoint_id()
			 */
			p->layout.entrypoints.end_spirv_idx =
							ctx->input.spirv.ws_n;
			break;
		}
		op = RDW(*i);
		if (OP_ENUMERANT(op) != 15 /* OpEntryPoint */) {
			LOG("logical layout, end of section of entry points at word %u, breaking opcode enumerant is %u\n", *i, OP_ENUMERANT(op));
			/*
			 * BUGFIX: was set to ctx->input.spirv.ws_n, which made
			 * later scans of this section run past its actual end;
			 * record the real end like layout_annotations() does
			 */
			p->layout.entrypoints.end_spirv_idx = *i;
			break;
		} else
			/* entry point name is the string starting at word +3 */
			LOG("entry point \"%s\"\n", ctx->input.spirv.ws + *i + 3);
		*i += OP_WS_N(op);
	}
	return SUCCESS;
}
/*NSPC*/
/*
 * Skip the section of OpExecutionMode (16) / OpExecutionModeId (331)
 * instructions, advancing *i past it. always returns SUCCESS.
 */
STATIC u8 layout_execution_modes(struct stg_ctx_t *ctx, u32 *i)
{
	/* BUGFIX: log message said "exectution" */
	LOG("logical layout, start of the section of execution modes at word %u\n", *i);
	loop {
		u32 op;

		if (*i >= ctx->input.spirv.ws_n) {
			LOG("logical layout, end of section of execution modes at word %u, end of spirv\n", *i);
			break;
		}
		op = RDW(*i);
		if (   OP_ENUMERANT(op) != 16	/* OpExecutionMode */
		    && OP_ENUMERANT(op) != 331	/* OpExecutionModeId */) {
			/* BUGFIX: log message said "execution models" */
			LOG("logical layout, end of section of execution modes at word %u, breaking opcode enumerant is %u\n", *i, OP_ENUMERANT(op));
			break;
		}
		*i += OP_WS_N(op);
	}
	return SUCCESS;
}
/*NSPC*/
/*
 * skip the first debug section: OpSourceContinued(2), OpSource(3),
 * OpSourceExtension(4), OpString(7)
 */
STATIC u8 layout_debug_first(struct stg_ctx_t *ctx, u32 *i)
{
	LOG("logical layout, start of the first debug section at word %u\n", *i);
	loop {
		u32 opcode;
		u32 e;

		if (*i >= ctx->input.spirv.ws_n) {
			LOG("logical layout, end of first debug section at word %u, end of spirv\n", *i);
			return SUCCESS;
		}
		opcode = RDW(*i);
		e = OP_ENUMERANT(opcode);
		if (e != 2	/* OpSourceContinued */
		    && e != 3	/* OpSource */
		    && e != 4	/* OpSourceExtension */
		    && e != 7	/* OpString */) {
			LOG("logical layout, end of first debug section at word %u, breaking opcode enumerant is %u\n", *i, e);
			return SUCCESS;
		}
		*i += OP_WS_N(opcode);
	}
	/* unreachable */
}
/*NSPC*/
/* skip the second debug section: OpName(5), OpMemberName(6) */
STATIC u8 layout_debug_second(struct stg_ctx_t *ctx, u32 *i)
{
	LOG("logical layout, start of the second debug section at word %u\n", *i);
	loop {
		u32 opcode;
		u32 e;

		if (*i >= ctx->input.spirv.ws_n) {
			LOG("logical layout, end of second debug section at word %u, end of spirv\n", *i);
			return SUCCESS;
		}
		opcode = RDW(*i);
		e = OP_ENUMERANT(opcode);
		if (e != 5 /* OpName */ && e != 6 /* OpMemberName */) {
			LOG("logical layout, end of second debug section at word %u, breaking opcode enumerant is %u\n", *i, e);
			return SUCCESS;
		}
		*i += OP_WS_N(opcode);
	}
	/* unreachable */
}
/*NSPC*/
/* skip the third debug section: OpModuleProcessed(330) */
STATIC u8 layout_debug_third(struct stg_ctx_t *ctx, u32 *i)
{
	LOG("logical layout, start of the third debug section at word %u\n", *i);
	loop {
		u32 opcode;

		if (*i >= ctx->input.spirv.ws_n) {
			LOG("logical layout, end of third debug section at word %u, end of spirv\n", *i);
			return SUCCESS;
		}
		opcode = RDW(*i);
		if (OP_ENUMERANT(opcode) != 330 /* OpModuleProcessed */) {
			LOG("logical layout, end of third debug section at word %u, breaking opcode enumerant is %u\n", *i, OP_ENUMERANT(opcode));
			return SUCCESS;
		}
		*i += OP_WS_N(opcode);
	}
	/* unreachable */
}
/*NSPC*/
/*
 * Walk the section of annotations (OpDecorate and friends), recording its
 * [start,end) word indexes in the private layout for later rescans (e.g.
 * is_builtin_pos()). *i is left on the first word past the section.
 * always returns SUCCESS.
 */
STATIC u8 layout_annotations(struct stg_ctx_t *ctx, u32 *i)
{
	struct stg_ctx_private_t *p;

	p = ctx->private;
	p->layout.annotations.start_spirv_idx = *i;
	LOG("logical layout, start of the section of annotations at word %u\n", *i);
	loop {
		u8 r; /* NOTE(review): unused in this function */
		u32 op;

		if (*i >= ctx->input.spirv.ws_n) {
			LOG("logical layout, end of section of annotations at word %u, end of spirv\n", *i);
			p->layout.annotations.end_spirv_idx =
							ctx->input.spirv.ws_n;
			return SUCCESS;
		}
		op = RDW(*i);
		switch (OP_ENUMERANT(op)) {
		case 71: /* OpDecorate */
		case 72: /* OpMemberDecorate */
			break;
		case 74:
			LOG("WARNING:OpGroupDecorate was found and is obsolete\n");
			break;
		case 75:
			LOG("WARNING:OpGroupMemberDecorate was found and is obsolete\n");
			break;
		case 73:
			LOG("WARNING:OpDecorationGroup was found and is obsolete\n");
			break;
		default:
			/* any other opcode ends the section */
			LOG("logical layout, end of section of annotations at word %u, breaking opcode enumerant is %u\n", *i, OP_ENUMERANT(op));
			p->layout.annotations.end_spirv_idx = *i;
			return SUCCESS;
		}
		*i += OP_WS_N(op);
	}
	/* unreachable */
}
#define id ((struct stg_ctx_private_t*)(ctx->private))->ids.array
/*NSPC*/
/*
 * Record a spirv OpConstant (enumerant 43) at word index *i: allocate an id
 * of type STG_ID_TYPE_CST pointing back at its spirv word index; the value is
 * loaded into a vgpr lazily (cst_loaded starts false).
 * returns SUCCESS or FAILURE.
 */
STATIC u8 opconstant(struct stg_ctx_t *ctx, u32 *i)
{
	u32 cst_idx;
	u8 r;

	/* result id is the 3rd word of OpConstant */
	r = id_alloc(ctx, &cst_idx, RDW(*i + 2));
	if (r != SUCCESS) {
		LOG_ERR("opconstant:unable to allocate an new id for constant\n");
		return FAILURE;
	}
	id[cst_idx].type = STG_ID_TYPE_CST;
	id[cst_idx].spirv_idx = *i;
	id[cst_idx].cst_loaded = false;
	LOG("new constant %%%u\n", RDW(*i + 2));
	return SUCCESS;
}
#undef id
/*NSPC*/
/* record the id of the 32 bits float type (OpTypeFloat with width 32), if any */
STATIC u8 optypefloat(struct stg_ctx_t *ctx, u32 *i)
{
	struct stg_ctx_private_t *p;

	p = ctx->private;
	/* width operand at word +2; other widths are ignored */
	if (RDW(*i + 2) == 32) {
		p->type_f32_id = RDW(*i + 1);
		LOG("type f32 %%%u\n", p->type_f32_id);
	}
	return SUCCESS;
}
#define id(x) ((struct stg_ctx_private_t*)(ctx->private))->ids.array[x]
/*NSPC*/
/*
 * Record a spirv OpVariable (enumerant 59) at word index *i: allocate an id
 * of type STG_ID_TYPE_SPIRV_IDX pointing back at its spirv word index.
 * returns SUCCESS or FAILURE.
 */
STATIC u8 opvariable(struct stg_ctx_t *ctx, u32 *i)
{
	u32 var_id;
	u32 var_idx;
	u8 r;

	/* result id is the 3rd word of OpVariable */
	var_id = RDW(*i + 2);
	LOG("OpVariable:idx=%u:id=%u\n", *i, var_id);
	r = id_alloc(ctx, &var_idx, var_id);
	if (r != SUCCESS) {
		LOG_ERR("opvariable:unable to alloc a new id\n");
		return FAILURE;
	}
	id(var_idx).type = STG_ID_TYPE_SPIRV_IDX;
	id(var_idx).spirv_idx = *i;
	/* TODO */
	return SUCCESS;
}
#undef id
/*NSPC*/
/*
 * Walk the section of non-function declarations (types, constants,
 * module-scope variables...), recording its [start,end) word indexes in the
 * private layout and dispatching the few opcodes this translator cares about
 * (OpTypeFloat, OpConstant, OpVariable). *i is left on the first word past
 * the section. returns SUCCESS or FAILURE.
 */
STATIC u8 layout_nonfunc_decls(struct stg_ctx_t *ctx, u32 *i)
{
	struct stg_ctx_private_t *p;
	bool section_end;
	u8 r;

	p = ctx->private;
	p->layout.nonfunc_decls.start_spirv_idx = *i;
	section_end = false;
	LOG("logical layout, start of the section of non-function declarations at word %u\n", *i);
	loop {
		u32 op;

		if (*i >= ctx->input.spirv.ws_n) {
			LOG("logical layout, end of section of non-function declarations at word %u, end of spirv\n", *i);
			p->layout.nonfunc_decls.end_spirv_idx =
							ctx->input.spirv.ws_n;
			break;
		}
		op = RDW(*i);
		switch (OP_ENUMERANT(op)) {
		case 8:		/* OpLine */
		/* types */
		case 19:	/* OpTypeVoid */
		case 20:	/* OpTypeBool */
		case 21:	/* OpTypeInt */
			break;
		case 22:	/* OpTypeFloat */
			r = optypefloat(ctx, i);
			if (r != SUCCESS)
				return FAILURE;		
			break;
		case 23:	/* OpTypeVector */
		case 24:	/* OpTypeMatrix */
		case 25:	/* OpTypeImage */
		case 26:	/* OpTypeSampler */
		case 27:	/* OpTypeSampledImage */
		case 28:	/* OpTypeArray */
		case 29:	/* OpTypeRuntimeArray */
		case 30:	/* OpTypeStruct */
		case 31:	/* OpTypeOpaque */
		case 32:	/* OpTypePointer */
		case 33:	/* OpTypeFunction */
		case 34:	/* OpTypeEvent */
		case 35:	/* OpTypeDeviceEvent */
		case 36:	/* OpTypeReserveId */
		case 37:	/* OpTypeQueue */
		case 38:	/* OpTypePipe */
		case 39:	/* OpTypeForwardPointer */
		case 322:	/* OpTypePipeStorage */
		case 327:	/* OpTypeNamedBarrier */
		/* constants */
		case 41:	/* OpConstantTrue */
		case 42:	/* OpConstantFalse */
			break;
		case 43:	/* OpConstant */
			r = opconstant(ctx, i);
			if (r != SUCCESS)
				return FAILURE;
			break;
		case 44:	/* OpConstantComposite */
		case 45:	/* OpConstantSampler */
		case 46:	/* OpConstantNull */
		case 48:	/* OpSpecConstantTrue */
		case 49:	/* OpSpecConstantFalse */
		case 50:	/* OpSpecConstant */
		case 51:	/* OpSpecConstantComposite */
		case 52:	/* OpSpecConstantOp */
			break;
		/* variables and others */
		case 59:	/* OpVariable A SPIRV VARIABLE IS A POINTER */
			r = opvariable(ctx, i);
			if (r != SUCCESS)
				return FAILURE;
			break;
		case 1:		/* OpUndef */
			break;
		default:
			/* any other opcode ends the section */
			section_end = true;
			break;
		}
		if (section_end) {
			LOG("logical layout, end of section of non-function declarations at word %u, breaking opcode enumerant is %u\n", *i, OP_ENUMERANT(op));
			p->layout.nonfunc_decls.end_spirv_idx = *i;
			break;
		}
		*i += OP_WS_N(op);
	}
	return SUCCESS;
}
#define V_ADD_I32 37
/*NSPC*/
/* emit a VOP2 v_add_i32 machine word adding sgpr src_sgpr into vgpr dst_vgpr */
STATIC u8 v_add_i32_sgpr(struct stg_ctx_t *ctx, u32 dst_vgpr, u32 src_sgpr)
{
	u32 inst;

	LOG("hw:v_add_i32_sgpr dst=v%u src=s%u\n", dst_vgpr, src_sgpr);
	/* src0 in the low bits, vdst at bit 17, opcode at bit 25 */
	inst = src_sgpr | (dst_vgpr << 17) | (V_ADD_I32 << 25);
	/* inst |= VOP2(=0b0) << 31 */

	if (hw_emit_w(ctx, inst) != SUCCESS) {
		LOG_ERR("v_add_i32_sgpr:unable to emit the machine instruction word\n");
		return FAILURE;
	}
	return SUCCESS;
}
#undef V_ADD_I32
/*NSPC*/
/* emit machine words which must run before the translated shader body */
STATIC u8 shader_prolog_emit(struct stg_ctx_t *ctx)
{
	if (ctx->input.have_vtx_base_idx) {
		/* the vtx idx is pre-load per shader invocation in v0 */
		if (v_add_i32_sgpr(ctx, 0, ctx->input.vtx_base_idx_sgpr)
								!= SUCCESS) {
			LOG_ERR("shader prolog:unable to add the vertex base index to the invocation pre-loaded vertex index\n");
			return FAILURE;
		}
	}
	/* TODO: base instance idx if ever used */
	return SUCCESS;
}
/*NSPC*/
/*
 * Scan the spirv section of entry points for an OpEntryPoint whose name (the
 * string starting at word +3) matches ctx->input.entrypoint_name.
 * returns the entry point id (word +2), or 0 when not found.
 */
STATIC u32 lookup_entrypoint_id(struct stg_ctx_t *ctx)
{
	struct stg_ctx_private_t *p;
	u32 i;

	p = ctx->private;
	i = p->layout.entrypoints.start_spirv_idx;
	loop {
		u32 op;

		if (i >= p->layout.entrypoints.end_spirv_idx)
			return 0;
		/*
		 * BUGFIX: op was never assigned, so the loop advanced by
		 * OP_WS_N of an uninitialized value (undefined behavior)
		 */
		op = RDW(i);
		if (strcmp(ctx->input.entrypoint_name,
				(char *)(ctx->input.spirv.ws + i + 3)) == 0) {
			u32 entrypoint_id;

			entrypoint_id = RDW(i + 2);
			LOG("module entry point found \"%s\" %%%u\n", ctx->input.spirv.ws + i + 3, entrypoint_id);
			return entrypoint_id;
		}
		i += OP_WS_N(op);
	}
	/* unreachable */
}
#endif
./stg/public.c0000666000175000017500000000770713704341151011267 0ustar  user#ifndef STG_PUBLIC_C
#define STG_PUBLIC_C
/*
 * Allocate a zeroed translation context and its private state.
 * returns the new context, or 0 on allocation failure (BUGFIX: the calloc
 * results were not checked, dereferencing 0 on out-of-memory).
 * The caller owns the context and releases it with stg_ctx_del().
 */
STG_PUBLIC struct stg_ctx_t *stg_ctx_new(void)
{
	struct stg_ctx_t *new;
	struct stg_ctx_private_t *p;

	new = calloc(1, sizeof(*new));
	if (new == 0)
		return 0;
	p = calloc(1, sizeof(*p));
	if (p == 0) {
		free(new);
		return 0;
	}
	new->private = p;
	return new;
}
/*
 * Release a context created by stg_ctx_new() and everything it owns, then
 * null the caller's pointer.
 */
STG_PUBLIC void stg_ctx_del(struct stg_ctx_t **ctx)
{
	if ((*ctx)->private != 0)
		ctx_private_del((*ctx)->private);

	/* free(0) is a no-op: the previous null guards were redundant */
	free((*ctx)->input.entrypoint_name);
	free((*ctx)->input.desc_sets.array);
	free((*ctx)->input.locations.array);
	free((*ctx)->output.hw);
	free((*ctx)->input.spirv.ws);

	free(*ctx);
	*ctx = 0;
}
#define TRANSLATION_FAILURE	0
#define TRANSLATION_SUCCESS	1
#define END_PGM 0xbfc00000
/*
 * Translate the spirv module in ctx->input into hardware machine words in
 * ctx->output: walk the spirv logical layout sections in order, then emit the
 * shader prolog, translate the entry point function and terminate with the
 * hardware end-of-program word.
 * returns TRANSLATION_SUCCESS or a failure value.
 * NOTE(review): early paths return FAILURE while later paths return
 * TRANSLATION_FAILURE -- confirm both constants map to the same caller-visible
 * failure value.
 */
STG_PUBLIC u8 stg_translate(struct stg_ctx_t *ctx)
{
	u8 r;
	u32 i;
	u32 entrypoint_idx;
	struct stg_ctx_private_t *p;
	
	r = machine_csts(ctx);
	if (r != SUCCESS) {
		LOG_ERR("translate:unable to initialize the machine constants\n");
		return FAILURE;
	}	
	endianess(ctx);

	/* word 5: first instruction, past the spirv header */
	i = 5;
	r = layout_capabilities(ctx, &i);
	if (r != SUCCESS) {
		LOG_ERR("translate:unable to parse the section of capabilities\n");
		return FAILURE;
	}
	r = layout_extensions(ctx, &i);
	if (r != SUCCESS) {
		LOG_ERR("translate:unable to parse the optional section of extensions\n");
		return FAILURE;
	}
	r = layout_imports_external_insts(ctx, &i);
	if (r != SUCCESS) {
		LOG_ERR("translate:unable to parse the section of imports of external instructions\n");
		return FAILURE;
	}
	r = layout_memory_model(ctx, &i);
	if (r != SUCCESS) {
		LOG_ERR("translate:unable to parse the section of the memory model\n");
		return FAILURE;
	}
	r = layout_entrypoints(ctx, &i);
	if (r != SUCCESS) {
		LOG_ERR("translate:unable to parse the section of entry points\n");
		return FAILURE;
	}
	r = layout_execution_modes(ctx, &i);
	if (r != SUCCESS) {
		LOG_ERR("translate:unable to parse the section of execution modes\n");
		return FAILURE;
	}
	r = layout_debug_first(ctx, &i);
	if (r != SUCCESS) {
		LOG_ERR("translate:unable to parse the first debug section\n");
		return FAILURE;
	}
	r = layout_debug_second(ctx, &i);
	if (r != SUCCESS) {
		LOG_ERR("translate:unable to parse the second debug section\n");
		return FAILURE;
	}
	r = layout_debug_third(ctx, &i);
	if (r != SUCCESS) {
		LOG_ERR("translate:unable to parse the third debug section\n");
		return FAILURE;
	}
	r = layout_annotations(ctx, &i);
	if (r != SUCCESS) {
		LOG_ERR("translate:unable to parse the section of annotations\n");
		return FAILURE;
	}
	/* from here, in the spirv module OpLine is allowed */
	r = layout_nonfunc_decls(ctx, &i);
	if (r != SUCCESS) {
		LOG_ERR("translate:unable to parse the section of non-function declarations\n");
		return FAILURE;
	}
	/*--------------------------------------------------------------------*/
	p = ctx->private;
	p->layout.funcs.start_spirv_idx = i;
	LOG("logical layout: from here function declarations (without bodies) then definitions (with bodies)\n");

	r = shader_prolog_emit(ctx);
	if (r != SUCCESS) {
		LOG_ERR("translate:unable to emit shader prolog machine words\n");
		return TRANSLATION_FAILURE;
	}
	p->entrypoint_id = lookup_entrypoint_id(ctx);
	if (p->entrypoint_id == 0) {
		LOG_ERR("translate:unable to find entry point \"%s\" in spirv section of entry points\n", ctx->input.entrypoint_name);
		return TRANSLATION_FAILURE;
	}
	entrypoint_idx = lookup_opfunction(ctx, p->entrypoint_id);
	if (entrypoint_idx == 0) {
		LOG_ERR("translate:OpEntryPoint \"%s\" OpFunction not found:id=%%%u\n", ctx->input.entrypoint_name, p->entrypoint_id);
		return TRANSLATION_FAILURE;
	}
	LOG("OpEntryPoint \"%s\" Opfunction found:idx=%u\n", ctx->input.entrypoint_name, entrypoint_idx);
	r = opfunction(ctx, entrypoint_idx);
	if (r != SUCCESS)
		return TRANSLATION_FAILURE;
	LOG("hw:end_pgm\n");
	/* NOTE(review): hw_emit_w result is not checked on this last emission */
	hw_emit_w(ctx, END_PGM);
	return TRANSLATION_SUCCESS;
}
#undef END_PGM
#undef TRANSLATION_FAILURE
#undef TRANSLATION_SUCCESS
#endif
./stg/doc/0000755000000000000000000000000013707412621010352 5ustar  root./stg/doc/s_waitcnt_guide0000644000175000017500000001354213706355064013505 0ustar  userThis document explain how we deal with the different counters of the hardware
s_waitcnt instruction... at the time of writing and depending on how we do
understand the hardware documentation, and it is a wip (work-in-progress).
There are 3 counters: vm_cnt, exp_cnt and lgkm_cnt.

vm_cnt
======

This counter deals with vector off-chip memory (far far away, in another
galaxy) access. A vgpr content can be the target of store/load operations
to/from this memory. Those "stores" and "loads" are executed by the hardware in
order. Those "stores" and "loads" can be long to execute from the hardware
shader execution unit point of view. The s_waitcnt instruction is how a shader
can wait for such long "stores" and "loads" to finish, but only in a
coarse-grained fashion.

For instance, if the shader needs to _read_ some vgpr content, if several
"stores" are pending/in-progress on its content, no need to use the s_waitcnt
instruction to wait for those pending/in-progress stores to finish. But if the
shader needs to _write_ the vgpr content, the shader must use s_waitcnt to wait
for those stores to finish. Since a bare counter is merely a coarse-grained
control over those stores/loads, we try to find a sweet spot between
too-much-code-complexity and
the-shader-will-waste-a-ton-of-cycles-waiting-for-store/load-to-finish (this
can reach ultra-complexity very fast and be based on specific hardware
scheduling fine knowledge).

side note: in huge hardware instruction pipeline (usually CPUs since they have
die room for that), the instructions are expected to be re-ordered to some
degree with runtime information to minimize the negative inpact of such
expensive loads/stores on performance (hence the "microcode" of CPUs).

vm_cnt accounts for the number of issued instructions performing a store or
load on a set of vgprs. For instance, a hardware buffer store instruction
submit a store for several vgprs. Each instruction has an index with a
monotonic growth for unicity. Each vgpr tracks the last pending store/load
instruction, and we track the stack of the instruction indexes up to the size
of the vm_cnt counter and that _in order_. This stack has a slot for each
value of vm_cnt.

In this example, we use a single letter for an instruction index, in time:
a->b->c
instruction a: store v12,v33,v2
instruction b: load v5,v30
instruction c: store v50,v3
instruction x: other store/loads
stack slots looks like: |c|x|b|x|a|x||||||...(empty slots up to vm_cnt size)
                         ⬆ ⬆ ⬆ ⬆ ⬆ ⬆ 
                         0 1 2 3 4 5
If the shader needs to perform a write into v3, we have to expect a pending
store, instruction 'c'. we want to wait for instruction 'c' to be finished and
since those type of instructions are executed in order, we have to wait for
instruction 'c' then all previous instructions, here 'b' and 'a' (and any
filling 'x'). It means we have to empty the stack, and emit a 's_waitcnt
vm_cnt(0)' machine instruction. Then the stack will have to be entirely
flushed.

If the shader needs to perform a read on v5, we have to expect a pending load,
instruction 'b'. Then we have to wait for 'b' to finish, then all previous
instructions, here 'a' (and the filling 'x'). 'b' stack slot is 2, then we have
to emit a 's_waitcnt vm_cnt(2)' machine instruction and chop the bottom of the
stack from 'b'.
We have 6 instructions running, but we remove the oldest 4, leaving the 2 most
recent ones, hence a vm_cnt of 2: the oldest 4 will decrement vm_cnt
before the 2 most recent ones, because those instructions are in-order.

If the shader needs to perform a write on v12, we have to expect a pending
store, instruction 'a'. 'a' stack slot is 4, then we will issue 's_waitcnt
vm_cnt(4)' and chop the bottom of the stack from 'a'. It means we are going
to wait the 2 most recent instructions to finish, from 6 issued instructions
to 4.

limits
------
The stack is full? We were not able to locate in the hardware documentation
how that would be handled. Nevertheless we will apply what seems to be
"sound and safe", namely the following brutal behavior:
emit a "s_waitcnt vm_cnt(0)" before emitting the next vm_cnt incrementing
instruction, and clear the stack.

conditional branching
---------------------
each branch will need a separate vm_cnt tracking, both being a continuation
of the vm_cnt tracking right before the branching.

exp_cnt
=======
This counter seems to be handling mostly out-of-order operations. Moreover,
based on hardware documentation, this counter does account for a set of
operations which is different among hardware chip families. In older chip
families, accounting of some operations seems even shared with the vm_cnt
counter. What seems to be a shareable handling of this counter among all
hardware chip families, and not forgetting the fact it is handling mostly
out-of-order operations, is the following:
after an export instruction, the content of the exported vgprs must not be
modified before a "s_waitcnt exp_cnt(0)" instruction is issued. That to be 
sure the export operation has read its vgprs content.
What does happen in crazy shaders doing a storm of exports and overflowing
the exp_cnt counter? we were not able to locate anything about this in the
documentation. We will do an act of faith and trust shaders not to do such
thing. May do some empirical tests on own hardware though.

lgkm_cnt
========
whatever is our usage of the set of operations accounted by this counter,
only "s_waitcnt lgkm_cnt(0)" seems relevant.
Among this set of operations are the out-of-order scalar memory reads. Using
the content of sgprs from off-chip memory data must wait the data with
"s_waitcnt lgkm_cnt(0)".
What does happen in crazy shaders doing a storm of instructions overflowing
the lgkm_cnt counter? we were not able to locate anything about this in the
documentation. We will do an act of faith and trust shaders not to do such
thing. May do some empirical test on own hardware though.
./stg/doc/shader_programming_guide0000666000175000017500000004542213707303631015362 0ustar  userThe "shader programming guide" was never published by AMD. This is literaly an
ATTEMPT to reverse-engineer one from mesa vulkan code, first for the common
case of simple vertex and fragment shaders, then compute.
There is some catch on amd gpu gfx10 and above: the vertex shader is merged
into the geometry shader.

Why: near ZERO documentation on the matter. Only MASSIVE AND ULTRA COMPLEX
opensource components, some being sick c++ brainfuckage: this is obvious
open source obfuscation or acute lack of perspective/experience.

You must have all published documentation from AMD related to GPU programming.
The vulkan/spirv specs too.

Usually an hardware shader generator does output machine instruction words,
and some register related values (see SPI_SHADER_PGM_RSRC[1:3]_[VS/PS/etc] in
the 3D register reference). As input, the hardware shader generator expects
being told on the content of the user sgprs which is defined at hardware
pipeline programming time (Application Binary Interface). vgprs pre-loaded
content is mostly implicit based on shader code.

This document has 3 main parts:
 - vertex shader initial state
 - vulkan spirv mapping
 - mesa vulkan amdgpu ABI

NOTE: direct hardware programming is expected to be easier to deal with than
vulkan/spirv abstraction.

vertex shader initial state
===========================
vgprs
-----
from src/amd/vulkan/radv_shader.c/radv_postprocess_config:

"VGPR0-3: (VertexID, InstanceID / StepRate0, PrimID, InstanceID)
If PrimID is disabled. InstanceID / StepRate1 is loaded instead.
StepRate0 is set to 1. so that VGPR3 doesn't have to be loaded."
This is controlled in the SPI_SHADER_PGM_RSRC1_VS register using the
VGPR_COMP_CNT field, if it is not programmed by the hardware command engine.

sgprs
-----
up to 16 sgprs can be loaded by the spi, starting at index 0.  The count of
"user sgprs" is set in the shader 2nd resource word: SPI_SHADER_PGM_RSRC2_VS.
The values are set in the SPI_SHADER_USER_DATA_VS[0-15]. Their pre-loading is
heavily based on the ABI (Application Binary Interface) of the GPU library you
are using. See further down for the mesa vulkan ABI related to amd GPUs.

lds
---
nothing

vertex shader output
====================
per vertex, up to 4 "positions". The first "position" is the actual NDC (see
below) space position of the vertex, it must be exported.

per vertex up to 32 parameters.

The last export instruction must set its "done" bit.

WARNING: the coordinate space of the output vertexes is the NDC space or
"Normalize Device Coordinate" space. _NOT_ the "real 2D" space: 

"NDC PROJECTION" ~ "REAL 2D PROJECTION" - "PERSPECTIVE DIVIDE" + "Z INFORMATION"

or

"REAL 2D PROJECTION" ~ "NDC PROJECTION" + "PERSPECTIVE DIVIDE" - "Z INFORMATION"

fragment shader initial state
=============================
preliminary note: in most gpu some projected 2D triangle coverage of pixels
computation unit is a "quad", a block of 2x2 pixels. It is used in the lds
layout of ready for linear interpolation vertex parameters.

THE ULTIMATE PROPERTY OF "3D" PROJECTIVE MATHS
----------------------------------------------
This is first/second year college maths (remember the easy lectures), or "3D"
hobby maths for motivated teenagers.

Roughly speaking: When we "project" a 3D triangle onto a 2D surface (the
screen), keeping the Z (depth information) for the triangle vertexes is enough,
from the 2D coordinates of a point in the 2D triangle, to linear interpolate
(see right below for the definition) _exactly_ an attribute of the vertexes of
the 3D triangle. For this, we prefer the convenient barycentric coordinates.

P_2D a point with barycentric coordinates I_2D/J_2D ("linear")
 ^                                           |
 |                                           | 3D vertex Zs (depths)
 | projected                                 |
 |                                           v
P_3D a point with barycentric coordinates I_3D/J_3D ("perspective correct")

LINEAR INTERPOLATION DEFINITION IN A "3D" TRIANGLE WITH BARYCENTRIC COORDINATES
-------------------------------------------------------------------------------
For one attribute A, each vertex of a triangle has a value: A(V0), A(V1), A(V2).
Take P a point of the triangle with barycentric coordinates I/J.

A(P) = A(V0) + I * (A(V1) - A(V0)) + J * (A(V2) - A(V0))
 
If this attribute is a vector [x,y,z,w], apply the formula on each x, y, z and w.

vgprs
-----
There are various types of values. For instance, many fragment barycentric
"I/J" values which can be used for interpolations with the vertex attributes:
See in evergreen_cayman_programming_guide.pdf, section "unified interpolation".

You need to mathematically know what are and how to use those values
with the prepared vertex attribute related values loaded in lds (see below).

The "perspective correct" barycentric coordinates: those are convenient,
prepared by the hardware, values for correct linear interpolation of vertex
attributes in the 3D triangle.

sgprs
-----
up to 16 sgprs can be loaded by the spi, starting at index 0.
The count of "user sgprs" is set in the shader 2nd resource word:
SPI_SHADER_PGM_RSRC2_PS. The values are set in the
SPI_SHADER_USER_DATA_PS[0-15].

lds
---
In this section, fragment shader "vertex attributes" = vertex "positionS" +
vertex "parameters". 

lds is loaded with values based on "almost-straight" vertex output, the vertex
"parameters" related values for linear interpolation. Those related values
which could be used with the barycentric "I/J" values loaded in vgprs (see
above) for various linear interpolations (this explains the already subtracted
values: v1-v0 and v2-v0, see linear interpolation above)

"THOSE VALUES ARE PRE-SUBTRACTIONS OF VERTEX PARAMETERS".

The lds address for the vertex "parameters" values for linear interpolation is
in M0 register (see isas specs documents).
From the M0 register, "parameter" values are found using the byte/word? offset
from the SPI_PS_INPUT_CNTL[0-31] registers.
The M0 register is divided in 2 parts: upper 16bits and lower 16bits. The lower
16bits is actually a base offset in lds. The upper 16bits manages quad primitive (triangle)
selection, which will be automatically used by the linear interpolation machine
instructions. Remember, a quad is 2x2 pixels, the unit for pixel coverage.

_Without_ the primitive interleaving, the lds looks like this:
M0                :v0[0],v0[1],v0[2],v0[3],  (v1-v0)[0],(v1-v0)[1],(v1-v0)[2],(v1-v0)[3],  (v2-v0)[0],(v2-v0)[1],(v2-v0)[2],(v2-v0)[3] "parameter[0]"
M0 + 1 "parameter":v0[0],v0[1],v0[2],v0[3],  (v1-v0)[0],(v1-v0)[1],(v1-v0)[2],(v1-v0)[3],  (v2-v0)[0],(v2-v0)[1],(v2-v0)[2],(v2-v0)[3] "parameter[1]"
...

See in evergreen_cayman_programming_guide.pdf, section "unified interpolation".

fragment shader output
======================
at least 1 color, and optionally the z depth (usually linear interpolated from
the 3D triangle vertexes Zs).


********************************************************************************
********************************************************************************
**                           vulkan spirv mapping                             **
********************************************************************************
********************************************************************************

vertex shader
=============

input
-----

The vertex index, instance index, etc, which are loaded in vgprs are mapped to
spirv builtin. For instance there is a specific spirv VertexIndex builtin
decoration.
Since these are inputs to the vertex shader, they must be declared with the
spirv "input" storage class in the spirv OpEntrypoint.
Those builtins do map to pre-loaded values by the hardware in vgprs.
This is for explicit usage of such indexes. There are also implicit usage of
such indexes with vertex attributes/locations/components, see below.

Usually: v0 is loaded by the hardware with the vertex index, and the "user"
sgprs are loaded with values depending on the ABI for the GPU library used (see
further below for the mesa vulkan ABI).

The vertex attributes (position, color, etc) are defined using the
binding&location&component abstraction. In a vertex spirv shader, a variable
which is a vertex attribute must be explicitely annotated with location
decoration, component decoration and binding decoration can be implicit then
have the value 0. Keep in mind that annotations are defined in a specific
section of the shader using the "decorate" spirv opcode.

All locations are indexed to the vertex index/instance indexes. The mental
picture is LOCATION&COMPONENT DO DEFINE INNER 32BITS WORD OFFSETS INTO AN
ELEMENT(=VERTEX) OF A ARRAY(ARRAY OF VERTEXES OR VERTEX BUFFER, AKA THE
"BINDING").
The binary description is done in vulkan, see right below to how to compute
those offsets. Each binding decoration maps to the buffer of the same index.

The hardware offset of a locationX/componentY is from vulkan, see "vertex input
description" chapter in vulkan specs. In the VkVertexInputAttributeDescription,
the location member maps directly to the spirv shader location. From there,
with the VkVertexInputBindingDescription structures and with the
VkPipelineVertexInputStateCreateInfo used upon graphic pipeline creation, we
know that this location is using implicitly either the vertex index or the
instance index. The vertex index and the instance index will be preloaded in
the first vgprs by the fixed-function hardware. The connexion between actual
GPU ram is done between the VkVertexInputBindingDescription and VkBuffer with
the vkCmdBindVertexBuffers function. A VkBuffer has a GPU virtual address and
is a spirv binding. Such GPU virtual address is usually embedded in a hardware
buffer resource descriptor/constant (up to 128bits).
To pass in a more or less complex fashion those hardware buffer resource
descriptors/constants is done using "user" sgprs. From hardware isas specs:the
content of sgprs registers are shared among all invocations/executions of a
hardware shader, but the content of vgprs registers is specific to each
invocation/execution. A vkBuffer is divided in "elements", each vkBuffer
element has an internal binary structure described by the attribute
description. 

VkVertexInputAttributeDescription---------
    |-->LOCATION/COMPONENT, SPIRV<->VULKAN|
    |-->vkFormat<->size in bytes          |
    |-->offset in vkBuffer element        |
                                          + VkPipelineVertexInputStateCreateInfo
                                          |    structure
VkVertexInputBindingDescription-------------------------
    |-->BINDING, SPIRV<->VULKAN                         |
    |-->vertex index (v0) _OR_ instance index           |
    |     this is the index of the element              |
    |-->buffer stride in bytes                          |
          this the number of bytes between the first    |
          byte, included, of an element and the first   |
          byte of the next element                      |
                                                        + vkCmdBindVertexBuffers
                                                        |     function
VkBuffer------------------------------------------------
    |-->GPU virtual address (set of 2 sgprs) but usually a buffer resource
    |   descriptor/constant (set of 2 or 4 sgprs)
    |-->major offset in the buffer

Using pseudo code, the GPU virtual address of a vertex attribute is:
vkbuffer.base_virtual_address + vkbuffer.offset
	+ vkbinding.(vertex|instance)_index * vkbinding.buffer_element_stride
						+ vkattribute.offset_in_element
to simplify, for the classic vertex index case:
vkbuffer.virtual_address + "offsets" + vkbinding.vertex_index * vkbinding.stride

This can be translated in GPU machine code using basic arithmetic code but
most hardware have support for such memory complex addressing (the hardware
buffer resource descriptor/constant).

From a hardware perspective, each binding is assigned a index of a hardware
buffer resource descriptors/constants in the array of buffer resource
descriptors/constants which "32bits" address is pre-loaded in the sgpr mapped to
AC_UD_VS_VERTEX_BUFFERS (see below for the construction of the real 64bits
virtual address from this pre-loaded 32bits value).

output
------

The vertex shader must output the vertex position in "clipped coordinates"
(checkout a vulkan 3D maths guide). There is a spirv builtin decoration
"position" for this purpose. This will be mapped to the proper hardware export
instruction with the "pos0" field (see hardware isa reference). A few other
vertex shader outputs can be mapped by the hardware posX export instruction (X
= 1-3). There are spirv decoration builtins for these outputs, like the
"clipped coordinates" vertex position, for instance the "clip distance". The
hardware can map those posX export instructions (X = 1-3) to specific values
which will be used by the "fixed-function" hardware which follows an
"invocation" (a run of a the vertex shader for a specific vertex) of the vertex
shader in the hardware pipeline.

Vertexes can have additional generic attributes/parameters, like a color or
coordinates in a texture. Those must be output-ed by the hardware vertex shader
in order to let the hardware prepare them for interpolation in the hardware
fragment shader. For this, the hardware export instruction can target a
specific generic attribut/parameters slot (up to roughly 30 slots, see hardware
isa reference for the export instruction). These attributes/parameters will
have to be declared as spirv opentry outputs with the location spirv decoration
specifying the slot number. The hardware will "prepare" those
attributes/parameters for "interpolation" per fragment in special memory
banks, the lds (Local Data Store), where the hardware fragment shader
will find the "prepared for interpolation" attribute/parameter data.
In the hardware fragment shader invocation, the M0 register will be
loaded with the offset, in the lds, of the interpolation data of the slot 0.
The other slots/locations will have to be indexed from this offset. Of course,
the spirv fragment shader will have to declare as opentry inputs those
attributes/parameters with the interpolation decoration and the location
decoration (slot mapping).

fragment shader
===============

input
-----

The vertex "generic" attributes/parameters which were output by the vertex
shader must be declared as spirv OpEntry inputs (input storage class) with the
proper interpolation decoration. The mapping between vertex outputs and
fragment inputs is made with the spirv location decoration:

spirv vertex output storage class location 0 -> spirv fragment shader input
storage class, with interpolation decoration, and location 0.

Each slot/location has its interpolation data in the lds.

output
------

The hardware will output colors with an hardware export instruction. Those
export instructions must be mapped to the spirv color attachment of the spirv
fragment shader using the spirv location decoration.

hardware export color target 0 ->  spirv location 0 -> vulkan fragment shader color attachment 0
hardware export color target 1 ->  spirv location 1 -> vulkan fragment shader color attachment 1
	... (the hardware can handle up to 7)

BE CAREFUL NOT TO MIX INPUT LOCATIONS WITH OUTPUT LOCATIONS.


********************************************************************************
********************************************************************************
**                           mesa vulkan amdgpu ABI                           **
********************************************************************************
********************************************************************************

mesa vulkan amdgpu pipeline programming (referred to as the library from now on)
code will agree on 2 main groups of data with the shader machine code generator
(referred to as the generator from now on):
	 - "user" data
	 - direct descriptor sets

"user" data
===========

A user sgpr refers to a sgpr which will be loaded by the hardware pipeline
before the execution of shaders.
The C constants right below defines the content of the user sgprs, at the time
of writing (20200707):
enum radv_ud_index {
	AC_UD_SCRATCH_RING_OFFSETS = 0,
	AC_UD_PUSH_CONSTANTS = 1,
	AC_UD_INLINE_PUSH_CONSTANTS = 2,
	AC_UD_INDIRECT_DESCRIPTOR_SETS = 3,
	AC_UD_VIEW_INDEX = 4,
	AC_UD_STREAMOUT_BUFFERS = 5,
	AC_UD_NGG_GS_STATE = 6,
	AC_UD_SHADER_START = 7,
	AC_UD_VS_VERTEX_BUFFERS = AC_UD_SHADER_START,
	AC_UD_VS_BASE_VERTEX_START_INSTANCE,
	AC_UD_VS_MAX_UD,
	AC_UD_PS_MAX_UD,
	AC_UD_CS_GRID_SIZE = AC_UD_SHADER_START,
	AC_UD_CS_MAX_UD,
	AC_UD_GS_MAX_UD,
	AC_UD_TCS_MAX_UD,
	AC_UD_TES_MAX_UD,
	AC_UD_MAX_UD = AC_UD_TCS_MAX_UD,
};

example: for a vertex shader, the library and the generator will agree on which
sgprs will be loaded with the address of the array of "vertex buffers".
Namely, AC_UD_VS_VERTEX_BUFFERS (currently 7)  --> index of first user sgpr and
count of sgprs.
A bit of warning: on gfx10 and above, mesa sgpr index is actually shifted by
8 due to 8 "system sgprs" (but the first 2 are still
AC_UD_SCRATCH_RING_OFFSETS, see right below) in the case of the merged
vertex/geometry shader.

AC_UD_SCRATCH_RING_OFFSETS
--------------------------

Even if unused by the shader, it is always present. It is the only virtual
address provided as 64bits, then it consumes 2 user sgprs. Since it is often
declared first, usually the virtual address is in s[0:1].


AC_UD_VS_VERTEX_BUFFERS
-----------------------

When the library and the generator agree on AC_UD_VS_VERTEX_BUFFERS as 1 user
sgpr, this very sgpr does contain the low 32bits of the virtual address of the
array of hardware buffer resources (see hardware ISA programing manual for
their definition). An hardware buffer resource defines an array of data element
(here one vertex) and a spirv locationX/componentY is actually an offset
_inside_ such element. The data element (or vertex) is selected with the
shader execution "index". An hardware
buffer resource is 4x32bits words/128bits.
To compute the 64bits virtual address, the library and the generator must agree
on the 32bits immediate value to use as the high 32bits of the virtual
address. The shader generator will have to emit machine instructions to build
the 64bits virtual address using the sgpr containing the low 32bits of the
virtual address and the immediate value of the high 32bits of the virtual
address.

AC_UD_VS_BASE_VERTEX_START_INSTANCE
-----------------------------------

This is related to indirect draw hardware commands. See the published
programming manuals, description of draw_indirect packet descriptions,
"base_vtx_loc". This declares the sgprs which will be used to tell the shader
of any vertex index offset and/or instance index offset.

AC_UD_VIEW_INDEX
----------------

In a vertex shader, the vertex index may need a base index to start from, and
that related to a view. This will declare 1 sgpr to hold such base/view index.
It means the generator will have to add this base/view index to the hardware
loaded index (usually in v0).


direct descriptor sets
======================
TODO
./stg/s_waitcnt.c0000644000175000017500000001114413706566071012002 0ustar  user#ifndef STG_S_WAITCNT_C
#define STG_S_WAITCNT_C
#define s ((struct stg_ctx_private_t*)ctx->private)->sgprs.array
/*NSPC*/
/*
 * report whether any allocated sgpr in the range [sgprs, sgprs + sgprs_n)
 * still has an offchip memory load in flight (s_waitcnt lgkm_cnt tracking)
 */
STATIC bool s_waitcnt_are_sgprs_loading(struct stg_ctx_t *ctx, u32 sgprs,
								u32 sgprs_n)
{
	u32 idx = 0;

	while (idx != sgprs_n) {
		if (!s[sgprs + idx].free
				&& s[sgprs + idx].pending_offchip_mem_load)
			return true;
		++idx;
	}
	return false;
}
#undef s
/*NSPC*/
/*
 * reset a s_waitcnt accumulator to "nothing to wait for": all flags zeroed,
 * vm_cnt set to the hardware family maximum (= do not wait on vm_cnt)
 */
STATIC void s_waitcnt_init(struct stg_ctx_t *ctx, struct s_waitcnt_t *wait)
{
	struct stg_ctx_private_t *priv = ctx->private;

	memset(wait, 0, sizeof(*wait));
	wait->vm_cnt = priv->s_waitcnt.vm_cnt.n_max;
}
/*
 * clear only the "waited" offchip mem loads/stores, see _our_ s_waitcnt guide
 * doc
 */
#define stk p->s_waitcnt.vm_cnt.stack.slots
#define stk_n p->s_waitcnt.vm_cnt.stack.n
#define v p->vgprs.array
/*NSPC*/
STATIC void vm_cnt_post_update(struct stg_ctx_t *ctx, struct s_waitcnt_t *wait)
{
	struct stg_ctx_private_t *p;
	u32 i;

	p = ctx->private;
	/*
	 * all registers of the insts "older" that the waited vm_cnt are to be
	 * "cleared" of vm_cnt pending load/store flags
	 */
	i = wait->vm_cnt;
	loop {
		u32 j;

		if (i == stk_n)
			break;
		j = 0;
		loop {
			if (j == p->vgprs.n_max)
				break;
			if (v[j].free) {
				++j;
				continue;
			}
			if (v[j].offchip_mem.last_s_waitcnt_inst
								== stk[i].inst)
				v[j].offchip_mem.pending = false;
			++j;
		}
		++i;
	}
	/* chop the bottom of the stack */
	stk_n = wait->vm_cnt;
}
#undef stk
#undef stk_n
#undef v
/*
 * after a s_waitcnt machine instruction was emitted, update the various
 * related states, see _our_ s_waitcnt guide doc
 */
#define s p->sgprs.array
#define v p->vgprs.array
/*NSPC*/
STATIC void s_waitcnt_post_update(struct stg_ctx_t *ctx,
						struct s_waitcnt_t *wait)
{
	struct stg_ctx_private_t *p;

	p = ctx->private;
	if (wait->lgkm_cnt_zero) { /* clear pending sgpr offchip mem loads */
		u32 i;

		i = 0;
		loop {
			if (i == p->sgprs.n_max)
				break;
			if (!s[i].free) 
				s[i].pending_offchip_mem_load = false;
			++i;
		}
	}
	if (wait->exp_cnt_zero) { /* clear pending vgpr exports */
		u32 i;

		i = 0;
		loop {
			if (i == p->vgprs.n_max)
				break;
			if (!v[i].free)
				v[i].pending_export = false;			
			++i;
		}
	}
	if (wait->use_vm_cnt) /* the tricky one, see _our_ doc */
		vm_cnt_post_update(ctx, wait); 
}
#undef s
#undef v
/*NSPC*/
#define stk \
((struct stg_ctx_private_t*)ctx->private)->s_waitcnt.vm_cnt.stack.slots
#define stk_n \
(((struct stg_ctx_private_t*)ctx->private)->s_waitcnt.vm_cnt.stack.n)
/*
 * Record a new in-flight vec mem access on top of the vm_cnt stack
 * (newest entry lives at index 0, hence the shift-up).
 * NOTE(review): no guard against the stack being full -- presumably the
 * slots array is sized for the worst case; to confirm at allocation site.
 */
STATIC void s_waitcnt_push(struct stg_ctx_t *ctx, u32 inst, u8 type)
{
	/* memmove: source and destination overlap */
	memmove(stk + 1, stk, stk_n * sizeof(*stk));
	stk[0].type = type;
	stk[0].inst = inst;
	++stk_n;
}
#undef stk
#undef stk_n
#define v p->vgprs.array
/*NSPC*/
/*
 * A vec mem load targeting loaded_vgpr was just emitted: tag the vgpr with a
 * fresh monotonic inst id and push that access on the vm_cnt tracking stack.
 * NOTE: the local must be named "p" -- the v macro expands to it.
 */
STATIC void s_waitcnt_post_vgpr_load(struct stg_ctx_t *ctx, u32 loaded_vgpr)
{
	struct stg_ctx_private_t *p;
	u32 inst_id;

	p = ctx->private;
	inst_id = p->s_waitcnt.vm_cnt.next_inst++;

	/* update/overwrite: only the last access of a vgpr is significant */
	v[loaded_vgpr].offchip_mem.pending = true;
	v[loaded_vgpr].offchip_mem.last_s_waitcnt_inst = inst_id;

	s_waitcnt_push(ctx, inst_id, STG_S_WAITCNT_VM_CNT_LOAD);
}
#undef v
STATIC u8 hw_emit_w(struct stg_ctx_t *ctx, u32 w);
/* XXX: GFX6 ok, VEGA nok because vm_cnt is split in 2 set of bits */
#define LGKM_CNT_MAX	0b11111	/* 5 bits */
#define EXP_CNT_MAX	0b111	/* 3 bits */
#define S_WAITCNT	12 /* see docs */
#define SOPP		0b101111111 /* see docs */
/*NSPC*/
/*
 * Emit, if needed, one s_waitcnt machine instruction matching the
 * accumulated wait state, then update the register tracking state.
 * Returns SUCCESS, or FAILURE if the instruction word could not be emitted.
 * Fix: the informational trace used to mix LOG and LOG_ERR on the same
 * logical line, splitting it across the two user log callbacks; it now
 * uses LOG consistently (LOG_ERR is kept for the actual error path).
 */
STATIC u8 s_waitcnt(struct stg_ctx_t *ctx, struct s_waitcnt_t *wait)
{
	struct stg_ctx_private_t *p;
	u32 lgkm_cnt;
	u32 vm_cnt;
	u32 exp_cnt;
	u32 w;
	u8 r;

	/* nothing to wait for: do not emit a useless instruction */
	if (!wait->lgkm_cnt_zero && !wait->use_vm_cnt && !wait->exp_cnt_zero)
		return SUCCESS;

	p = ctx->private;
	/* "max" counter values mean "do not wait on this counter" */
	lgkm_cnt = LGKM_CNT_MAX; /* hardware family independent for now */
	vm_cnt = p->s_waitcnt.vm_cnt.n_max;
	exp_cnt = EXP_CNT_MAX;

	if (wait->lgkm_cnt_zero)
		lgkm_cnt = 0;
	if (wait->use_vm_cnt)
		vm_cnt = wait->vm_cnt;
	if (wait->exp_cnt_zero)
		exp_cnt = 0;

	w = 0;
	/* XXX: this is not valid for all hardware families see docs */
	w |= vm_cnt;
	w |= exp_cnt << 4;
	w |= lgkm_cnt << 8;
	w |= S_WAITCNT << 16;
	w |= SOPP << 23;

	r = hw_emit_w(ctx, w);
	if (r != SUCCESS) {
		LOG_ERR("s_waitcnt:unable to emit the machine instruction word\n");
		return FAILURE;
	}

	/* informational trace: only the counters actually waited on */
	LOG("hw:s_waitcnt");
	if (vm_cnt != p->s_waitcnt.vm_cnt.n_max)
		LOG(" vm_cnt(%u)", vm_cnt);
	if (exp_cnt != EXP_CNT_MAX)
		LOG(" exp_cnt(%u)", exp_cnt);
	if (lgkm_cnt != LGKM_CNT_MAX)
		LOG(" lgkm_cnt(%u)", lgkm_cnt);
	LOG("\n");

	s_waitcnt_post_update(ctx, wait);
	return SUCCESS;
}
#undef LGKM_CNT_MAX
#undef EXP_CNT_MAX
#undef S_WAITCNT
#undef SOPP
#endif
./stg/main.c0000666000000000000000000000135713706303257010712 0ustar  root#ifndef STG_MAIN_C
#define STG_MAIN_C
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
/*----------------------------------------------------------------------------*/
#include "stg/global_c_fixing.h"
#include "stg/global_macros.h"
/*----------------------------------------------------------------------------*/
#include <stg/public.h>
/*----------------------------------------------------------------------------*/
#include "stg/private.h"
#include "stg/s_waitcnt.c"
#include "stg/local.c"
#include "stg/public.c"
/*----------------------------------------------------------------------------*/
#define CLEANUP
#include "stg/global_c_fixing.h"
#include "stg/global_macros.h"
#undef CLEANUP
#endif
./stg/namespace/0000777000175000017500000000000013670730552011577 5ustar  user./stg/b0000777000175000017500000000040713675373127010021 0ustar  user#!/bin/sh
# resolve the repository top directory (parent of this stg directory)
top_dir=$(readlink -e ..)
# build into a user tmpfs; alternative destination kept below
dst_dir=/run/user/1000
#dst_dir=/run
cc=gcc
ar=ar

# single translation unit build: main.c #includes the whole library,
# then the lone object file is archived into libstg.a
$cc -DSTG_PUBLIC= -DSTATIC=static -I$top_dir/stg/include -I$top_dir -c $top_dir/stg/main.c -o $dst_dir/main.o
rm -f $dst_dir/libstg.a
$ar rcs $dst_dir/libstg.a $dst_dir/main.o
./stg/global_macros.h0000644000175000017500000000111313674663316012617 0ustar  user#ifndef CLEANUP
/* generic return values used all over the translator */
#define FAILURE 0
/* log to the user provided "out" callback, if any (gcc statement expr) */
#define LOG(fmt, ...) ({if (ctx->input.log.out != 0)\
ctx->input.log.out(fmt, ##__VA_ARGS__);})
/* log to the user provided "err" callback, if any */
#define LOG_ERR(fmt, ...) ({if (ctx->input.log.err != 0)\
ctx->input.log.err(fmt, ##__VA_ARGS__);})
/* low 16 bits of a spirv instruction first word: the opcode enumerant */
#define OP_ENUMERANT(x) (0x0000ffff & (x))
/*
 * high 16 bits of a spirv instruction first word: the word count.
 * fix: the parameter is now parenthesized -- "(x >> 16)" mis-expanded
 * for compound arguments such as OP_WS_N(a + b)
 */
#define OP_WS_N(x) ((x) >> 16)
/* read the x-th 32bits word of the input spirv via the context callback */
#define RDW(x) ((struct stg_ctx_private_t*)ctx->private)->rd_w(ctx, x)
#define SUCCESS 1
/*============================================================================*/
#else
#undef FAILURE
#undef LOG
#undef LOG_ERR
#undef OP_ENUMERANT
#undef OP_WS_N
#undef RDW
#undef SUCCESS
#endif
./stg/ABBREVIATIONS0000666000175000017500000000126513706041755011562 0ustar  useraddr : ADDRess
blk(s) : BLocK(S)
buf(s) : BUFfer(S)
clr : CLeaR
cpnt(s) : ComPoNenT(S)
cst(s) : ConSTant(S) (don't use "const" to avoid collision with the C keyword)
decl(s) : DECLaration(S)
del : DELete
desc(s) : DESCriptor(S)
dst(s) : DeSTination(S)
e : End (usually a pointer/index right past the last element of an array)
func : FUNCtion
hw : HardWare
hi : HIgh
id(s) : IDentifier(S)
idx(s) : InDeX(eS)
imm : IMMediate value
inst(s) : INSTruction(S)
lo : LOw
mem : MEMory
n : couNt
nr : NumbeR
pos : POSition
ptr(s) : PoinTeR(S)
rd : ReaD
reg(s) : REGister(S)
src(s) : SouRCe(S)
stg : Spirv To Gcn
sz : SiZe
val(s) : VALue(S)
vec(s) : VECtor(S)
vtx(s) : VerTeX(eS)
w(s) : Word(S) (32 bits)
./stg/private.h0000644000175000017500000000561013706375117011466 0ustar  user#ifndef STG_PRIVATE_H
/*
 * fix: the guard macro was "STC_PRIVATE_H" while the #ifndef above tests
 * "STG_PRIVATE_H", which left this header unguarded against double inclusion
 */
#define STG_PRIVATE_H
/* XXX into compiler constants */
/* kinds of values a spirv id can map to, see the ids array below */
#define STG_ID_TYPE_VGPRS	1
// #define STG_IDTYPE_VGPRS_ALIASES, need more perspective to proceed
#define STG_ID_TYPE_CST 	2
#define STG_ID_TYPE_SPIRV_IDX	3
/* hard limit on the count of spirv bindings the translator tracks */
#define STG_BINDINGS_N_MAX	32
/*NSPC*/
/* per-context internal state of the spirv-to-gcn translator */
struct stg_ctx_private_t {
	/* callback reading the idx-th 32bits word of the input spirv stream */
	u32 (*rd_w)(struct stg_ctx_t *ctx, u32 idx);

	/* spirv word indexes delimiting the sections of the input module */
	struct {
		struct {
			u32 start_spirv_idx;
			u32 end_spirv_idx;
		} entrypoints;
		struct {
			u32 start_spirv_idx;
			u32 end_spirv_idx;
		} annotations;
		struct {
			u32 start_spirv_idx;
			u32 end_spirv_idx;
		} nonfunc_decls;
		struct {
			u32 start_spirv_idx;
			/* until the end of the spirv data */
		} funcs;
	} layout;

	u32 entrypoint_id;
	/* presumably the spirv id of the 32bits float type -- confirm */
	u32 type_f32_id;

	/* vgpr allocation and s_waitcnt tracking state */
	struct {
		struct {
			bool free; /* if unused by the shader at all */
			/*
			 * maps to a stacked vec mem access blk in the related
			 * s_waitcnt counter state tracker below
			 */
			struct {
				bool pending;
				/*
				 * we could have several pending "stores" to
				 * vec mem but only the last one is significant
				 * for s_waitcnt vm_cnt since vec mem accesses
				 * are in-order.
				 */
				u32 last_s_waitcnt_inst;
			} offchip_mem;
			/* the content of this vgpr is being exported */
			bool pending_export;
		} *array;
		u32 n_max;
	} vgprs;
	/* sgpr allocation and s_waitcnt tracking state */
	struct {
		struct {
			bool free;
			/* this is s_waitcnt lgkm_cnt */
			bool pending_offchip_mem_load;
		} *array;
		u32 n_max;
	} sgprs;
	/* maps each tracked spirv id to its kind and backing vgprs */
	struct {
		struct {
			u32 id; /* legal is 0 < id < bound */
			u32 type;
			u32 spirv_idx;
			bool cst_loaded;
			struct {
				u32 *idxs;
				u32 n;
			} vgprs;
		} *array;
		u32 n;
	} ids;
	struct {
		struct {
			struct {
				bool ready;
				u32 sgprs; /* the sgprs with the 64bits addr */
			} addr;
		} buf_descs_array;
		struct {
			struct {
				bool loaded;
				u32 sgprs; /* the 4 sgprs holding the buf desc */
			} buf_desc;
		} array[STG_BINDINGS_N_MAX];
	} bindings;
	/*--------------------------------------------------------------------*/
	/*  s_waitcnt hw inst (see hw isa doc) */
	struct {
		/* see _our_ s_waitcnt_ guide doc */
		struct {
			u32 n_max; /* cst depending on hw family */
			#define STG_S_WAITCNT_VM_CNT_LOAD	0
			#define STG_S_WAITCNT_VM_CNT_STORE	1
			/* stack of in-flight vec mem accesses, newest at idx 0 */
			struct {
				struct {
					u8 type; /* load/store from vec mem */
					u32 inst; /* monotonic counter */
				} *slots;
				u32 n;
			} stack;
			u32 next_inst; /* monotonic or uniq ids for insts */
		} vm_cnt;
	} s_waitcnt;
};
/*
 * many hw insts involve many registers. we need to "accumulate" 
 * s_waitcnt related information along the generation of such one inst,
 * that in order to emit one s_waitcnt, if needed, consistent with the later
 * usage of the registers it will use.
 */
/*NSPC*/
/* accumulator describing the single s_waitcnt inst to (maybe) emit */
struct s_waitcnt_t {
	bool lgkm_cnt_zero; /* need to wait on lgkm_cnt being 0 */
	bool use_vm_cnt; /* do wait on the following vm_cnt */
	u32 vm_cnt; /* the vm_cnt value to wait for */
	bool exp_cnt_zero; /* need to wait on exp_cnt being 0 */
};
#endif
./stg/test/0000777000000000000000000000000013707311657010577 5ustar  root./stg/test/vtx.spirv0000644000175000017500000000160013700100267012505 0ustar  user#GLSL.std.450main	�	GL_ARB_separate_shader_objects	GL_ARB_shading_language_420packmain	gl_Positionposuvuv_incolorcolor_inG	GGGGG!  ;	 
;
+�? ; ;;;
6�=
O

QQQP>	=>=>�8./stg/test/frag.spirv0000644000175000017500000000132413700100267012606 0ustar  user#GLSL.std.450main	�	GL_ARB_separate_shader_objects	GL_ARB_shading_language_420packmain	uFragColorcolorsuvG	GG"G!G!  ;	 
;
	

 ; ;6�===W�>	�8./stg/test/main.c0000666000175000017500000001067113704341222011705 0ustar  user#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <string.h>
#include "stg/global_c_fixing.h"
/* stg interface needs "c_fixing.h" and STG_PUBLIC */
#define STG_PUBLIC
#include <stg/public.h>
static void read_whole_file(struct stg_ctx_t *stg_ctx, FILE *f, char *f_name)
{
	size_t total_read_bytes_n;
	size_t bufsiz_n;

	clearerr(f);
	bufsiz_n = 0;
	total_read_bytes_n = 0;
	loop {
		size_t read_bytes_n;
		u32 *new_spirv;

		if ((total_read_bytes_n + BUFSIZ) > (BUFSIZ * bufsiz_n)) {
			size_t alloc_bytes_n;

			++bufsiz_n;
			alloc_bytes_n = bufsiz_n * BUFSIZ;
			new_spirv = realloc(stg_ctx->input.spirv.ws,
								alloc_bytes_n);
			if (new_spirv == 0){
				fprintf(stderr, "ERROR:unable to re-allocate the spirv file buffer to %lu bytes\n", (unsigned long)alloc_bytes_n);
				exit(EXIT_FAILURE);
			}
			stg_ctx->input.spirv.ws = new_spirv;
		}
		read_bytes_n = fread((u8*)(stg_ctx->input.spirv.ws)
					+ total_read_bytes_n, 1, BUFSIZ, f);
		if (ferror(f) != 0) { 
			fprintf(stderr, "ERROR:unable to read \"%s\" spirv file\n", f_name);
			exit(EXIT_FAILURE);
		}
		total_read_bytes_n += read_bytes_n;
		if (feof(f) != 0)
			break;
	}
	/* whole file is read */
	if ((total_read_bytes_n % sizeof(u32)) != 0) {
		fprintf(stderr, "ERROR:the spirv file \"%s\" has not a size (%lu) aligned on 32 bit words\n", f_name, (unsigned long)total_read_bytes_n);
		exit(EXIT_FAILURE);
	}
	stg_ctx->input.spirv.ws_n = (u64)(total_read_bytes_n / sizeof(u32));
}
/*
 * "err" log callback handed to the stg context.
 * NOTE(review): forwards to stdout (vprintf), not stderr -- presumably
 * deliberate so LOG and LOG_ERR output interleave in order on a single
 * stream (s_waitcnt builds one logical line from both callbacks); confirm
 * before "fixing" this to stderr.
 */
static void test_log_err(u8 *fmt, ...)
{
	va_list ap;
	va_start(ap, fmt);
	vprintf(fmt, ap);
	va_end(ap);
}
/* "out" log callback handed to the stg context: plain vprintf to stdout */
static void test_log(u8 *fmt, ...)
{
	va_list ap;
	va_start(ap, fmt);
	vprintf(fmt, ap);
	va_end(ap);
}
/*
 * Dump the generated machine instruction words to stdout, one 32bits word
 * per line, prefixed with the byte offset.
 */
static void machine_instruction_ws_dump(struct stg_ctx_t *ctx)
{
	u32 i;

	i = 0;
	printf("hw:MACHINE INSTRUCTION WORDS:\n");
	loop {
		/*
		 * fix: ">=" instead of "==" -- if hw_bytes_n were not a
		 * multiple of 4, the equality test would never match and the
		 * loop would read past the end of the buffer
		 */
		if (i >= ctx->output.hw_bytes_n)
			break;
		printf("hw:0x%08x:\t0x%08x\n", i, *(u32*)(ctx->output.hw + i));
		i += 4;
	}
}
#define location(x) stg_ctx->input.locations.array[x]
#define STG_TRANSLATION_SUCCESSFUL 1
/*
 * Test driver: translate the spirv module given as argv[1] and dump the
 * resulting machine instruction words.
 * Fixes: the file name is validated before fopen (fopen(0) is undefined
 * behavior), the file is opened in binary mode ("rb": spirv is binary
 * data and the 'b' matters on non-posix platforms), the calloc result is
 * checked, and the FILE is closed once the module is buffered in memory.
 */
int main(int argc, char **argv)
{
	u8 r8;
	int ri;
	FILE *f;
	struct stg_ctx_t *stg_ctx;

	if (argc < 2 || argv[1] == 0) {
		fprintf(stderr, "ERROR:unable to open file \"%s\" for reading\n", "MISSING FILE");
		return EXIT_FAILURE;
	}
	f = fopen(argv[1], "rb");
	if (f == 0) {
		fprintf(stderr, "ERROR:unable to open file \"%s\" for reading\n", argv[1]);
		return EXIT_FAILURE;
	}
	stg_ctx = stg_ctx_new();
	if (stg_ctx == 0) {
		fprintf(stderr, "ERROR:unable to instanciate a new spirv to hw context\n");
		fclose(f);
		return EXIT_FAILURE;
	}
	read_whole_file(stg_ctx, f, argv[1]);
	fclose(f); /* the whole module is buffered in memory from here on */
	stg_ctx->input.log.err = test_log_err;
	stg_ctx->input.log.out = test_log;
	/*--------------------------------------------------------------------*/
	/* the bindings or array of hardware buf descs */
	stg_ctx->input.bindings.addr.hi_32bits = 0;
	stg_ctx->input.bindings.addr.lo_32bits_sgpr = 2;
	stg_ctx->input.user_sgprs[2] = true;
	/*--------------------------------------------------------------------*/
	/* fake locations, matching the layout of the test vertex shader */
	stg_ctx->input.locations.array = calloc(3,
				sizeof(stg_ctx->input.locations.array[0]));
	if (stg_ctx->input.locations.array == 0) {
		fprintf(stderr, "ERROR:unable to allocate the locations array\n");
		return EXIT_FAILURE;
	}
	location(0).binding = 0;
	location(0).inner_offset = 0;
	location(1).binding = 0;
	location(1).inner_offset = location(0).inner_offset + 4 * 4;
	location(2).binding = 0;
	location(2).inner_offset = location(1).inner_offset + 4 * 2;
	stg_ctx->input.locations.n = 3;
	/*--------------------------------------------------------------------*/
	/* vtx base idx and instance base idx */
	stg_ctx->input.vtx_base_idx_sgpr = 3;
	stg_ctx->input.user_sgprs[3] = true;
	stg_ctx->input.have_vtx_base_idx = true;
	stg_ctx->input.instance_base_idx_sgpr = 4;
	stg_ctx->input.user_sgprs[4] = true;
	stg_ctx->input.have_instance_base_idx = true;
	/*--------------------------------------------------------------------*/
	stg_ctx->input.machine = STG_MACHINE_GFX_6;
	/*--------------------------------------------------------------------*/
	stg_ctx->input.entrypoint_name = strdup("main");
	/*--------------------------------------------------------------------*/
	ri = EXIT_FAILURE;
	r8 = stg_translate(stg_ctx);
	if (r8 == STG_TRANSLATION_SUCCESSFUL) {
		fprintf(stdout, "SUCCESS: translation done\n");
		machine_instruction_ws_dump(stg_ctx);
		ri = EXIT_SUCCESS;
	} else
		fprintf(stderr, "ERROR:unable to translate the provided spirv module\n");
	stg_ctx_del(&stg_ctx);
	return ri;
}
#undef STG_TRANSLATION_SUCCESSFUL
#undef location
/*----------------------------------------------------------------------------*/
#undef STG_PUBLIC
#define CLEANUP
#include "stg/global_c_fixing.h"
#undef CLEANUP
./stg/test/vtx.llvm.gcn0000644000175000017500000000340713707311541013077 0ustar  userBB15_0:
	s_mov_b32 s0, s3                                                         ; BE800303
	s_mov_b32 s3, 0                                                          ; BE830380
	v_add_i32_e32 v4, vcc, s0, v0                                            ; 4A080000
	s_load_dwordx4 s[0:3], s[2:3], 0x0                                       ; C0800300
	v_mov_b32_e32 v9, 1.0                                                    ; 7E1202F2
	s_waitcnt lgkmcnt(0)                                                     ; BF8C007F
	tbuffer_load_format_xyzw v[0:3], v4, s[0:3],  dfmt:13, nfmt:7, 0 idxen   ; EBEB2000 80000004
	tbuffer_load_format_x v3, v4, s[0:3],  dfmt:4, nfmt:7, 0 idxen offset:12 ; EBA0200C 80000304
	tbuffer_load_format_x v5, v4, s[0:3],  dfmt:4, nfmt:7, 0 idxen offset:16 ; EBA02010 80000504
	tbuffer_load_format_x v6, v4, s[0:3],  dfmt:4, nfmt:7, 0 idxen offset:20 ; EBA02014 80000604
	tbuffer_load_format_x v7, v4, s[0:3],  dfmt:4, nfmt:7, 0 idxen offset:24 ; EBA02018 80000704
	tbuffer_load_format_x v8, v4, s[0:3],  dfmt:4, nfmt:7, 0 idxen offset:28 ; EBA0201C 80000804
	tbuffer_load_format_x v4, v4, s[0:3],  dfmt:4, nfmt:7, 0 idxen offset:32 ; EBA02020 80000404
	s_waitcnt vmcnt(6)                                                       ; BF8C0F76
	exp pos0 v0, v1, v2, v9 done                                             ; F80008CF 09020100
	s_waitcnt vmcnt(2)                                                       ; BF8C0F72
	exp param0 v3, v5, v6, v7                                                ; F800020F 07060503
	s_waitcnt vmcnt(0)                                                       ; BF8C0F70
	exp param1 v8, v4, off, off                                              ; F8000213 00000408
	s_endpgm                                                                 ; BF810000

./stg/test/b0000777000000000000000000000046313675373234010754 0ustar  root#!/bin/sh
top_dir=$(readlink -e ../..)
dst_dir=/run/user/1000
#dst_dir=/run
cc=gcc
ccld='gcc -B/nyan/glibc/current/lib'

$cc -I$top_dir/stg/include -I$top_dir -c main.c -o $dst_dir/test_main.o
# argument ordering does matter for symbol resolving
$ccld  $dst_dir/test_main.o -L$dst_dir -lstg -o $dst_dir/stg
./stg/test/frag.spirv.txt0000644000175000017500000000674213700100566013437 0ustar  user// magic number = 0x07230203 (good)
// version = 0x00010000
// generator = 0x00080008
// bound = 24
// reserved = 0x00000000

// section start: capabilities
		capability shader
// section end: capabilities, breaking opcode=ext_inst_import(11)

// section start: extensions
// section end: extensions, breaking opcoder=ext_inst_import(11)

// section start: extended set of instructions imports
/* %1 */	ext_inst_import id=%1 name="GLSL.std.450"
// section end: extended set of instructions imports, breaking opcode=memory_model(14)

// the only memory model instruction, if one, should be here
		memory_model addressing_model=logical memory_model=glsl450

// section start: entry points
		entry_point execution_model=fragment entry_point=%4 name="main" /*interfaces[0]=*/%9 /*interfaces[1]=*/%11 /*interfaces[2]=*/%20
// section end: entry points, breaking opcode=execution_mode(16)

// section start: execution modes
		execution_mode entry_point=%4 mode=0x00000007
// section end: execution modes, breaking opcoder=source(3)

// section start: debug

// debug first subsection start
		source glsl 0x0000008c
		source_extension "GL_ARB_separate_shader_objects"
		source_extension "GL_ARB_shading_language_420pack"
// debug first subsection end, breaking opcode=name(5)

// debug second subsection start
		name target=%4 name="main"
		name target=%9 name="uFragColor"
		name target=%11 name="color"
		name target=%16 name="s"
		name target=%20 name="uv"
// debug second subsection end, breaking opcode=decorate(71)

// debug third subsection start
// debug third subsection end, breaking opcode=decorate(71)
// section end: debug

// section start: annotations
		decorate target=%9 decoration=location 0
		decorate target=%11 decoration=location 1
		decorate target=%16 decoration=descriptor_set 0
		decorate target=%16 decoration=binding 0
		decorate target=%20 decoration=location 0
// section end: annotations, breaking opcode=type_void(19)

// section start: non function declarations
/* %2 */	type_void %2
/* %3 */	type_function id=%3 return_type=%2
/* %6 */	type_float id=%6 width=32
/* %7 */	type_vector id=%7 component_type=%6 components_n=4
/* %8 */	type_pointer id=%8 storage_class=output type=%7
/* %9 */	variable pointer_id=%9 type=%8 storage_class=output
/* %10 */	type_pointer id=%10 storage_class=input type=%7
/* %11 */	variable pointer_id=%11 type=%10 storage_class=input
/* %13 */	type_image id=%13 sampled_type=%6 dim=2d depth=no_depth_image arrayed=non_arrayed_content multisample=single_sampled sampled=sampler image_format=unknown_valid
/* %14 */	type_sampled_image id=%14 image_type=%13
/* %15 */	type_pointer id=%15 storage_class=uniform_constant type=%14
/* %16 */	variable pointer_id=%16 type=%15 storage_class=uniform_constant
/* %18 */	type_vector id=%18 component_type=%6 components_n=2
/* %19 */	type_pointer id=%19 storage_class=input type=%18
/* %20 */	variable pointer_id=%20 type=%19 storage_class=input
// section end: non function declarations, breaking opcode=function(54)

// section start: function declarations then definitions
/* %4 */	function id=%4 return_type=%2 control=none type=%3
/* %5 */		label id=%5
/* %12 */			load id=%12 type=%7 pointer=%11
/* %17 */			load id=%17 type=%14 pointer=%16
/* %21 */			load id=%21 type=%18 pointer=%20
/* %22 */			image_sample_implicit_lod id=%22 type=%7 sampled_image=%17 coordinate=%21
/* %23 */			fmul id=%23 type=%7 /*operands[0]=*/%12 /*operands[1]=*/%22
				store pointer=%9 object=%23
				return
		function_end
// section end: function declarations then definitions
./stg/test/vtx.spirv.txt0000644000175000017500000001000113700100344013312 0ustar  user// magic number = 0x07230203 (good)
// version = 0x00010000
// generator = 0x00080008
// bound = 29
// reserved = 0x00000000

// section start: capabilities
		capability shader
// section end: capabilities, breaking opcode=ext_inst_import(11)

// section start: extensions
// section end: extensions, breaking opcoder=ext_inst_import(11)

// section start: extended set of instructions imports
/* %1 */	ext_inst_import id=%1 name="GLSL.std.450"
// section end: extended set of instructions imports, breaking opcode=memory_model(14)

// the only memory model instruction, if one, should be here
		memory_model addressing_model=logical memory_model=glsl450

// section start: entry points
		entry_point execution_model=vertex entry_point=%4 name="main" /*interfaces[0]=*/%9 /*interfaces[1]=*/%11 /*interfaces[2]=*/%22 /*interfaces[3]=*/%24 /*interfaces[4]=*/%26 /*interfaces[5]=*/%27
// section end: entry points, breaking opcode=source(3)

// section start: execution modes
// section end: execution modes, breaking opcoder=source(3)

// section start: debug

// debug first subsection start
		source glsl 0x0000008c
		source_extension "GL_ARB_separate_shader_objects"
		source_extension "GL_ARB_shading_language_420pack"
// debug first subsection end, breaking opcode=name(5)

// debug second subsection start
		name target=%4 name="main"
		name target=%9 name="gl_Position"
		name target=%11 name="pos"
		name target=%22 name="uv"
		name target=%24 name="uv_in"
		name target=%26 name="color"
		name target=%27 name="color_in"
// debug second subsection end, breaking opcode=decorate(71)

// debug third subsection start
// debug third subsection end, breaking opcode=decorate(71)
// section end: debug

// section start: annotations
		decorate target=%9 decoration=builtin position
		decorate target=%11 decoration=location 0
		decorate target=%22 decoration=location 0
		decorate target=%24 decoration=location 1
		decorate target=%26 decoration=location 1
		decorate target=%27 decoration=location 2
// section end: annotations, breaking opcode=type_void(19)

// section start: non function declarations
/* %2 */	type_void %2
/* %3 */	type_function id=%3 return_type=%2
/* %6 */	type_float id=%6 width=32
/* %7 */	type_vector id=%7 component_type=%6 components_n=4
/* %8 */	type_pointer id=%8 storage_class=output type=%7
/* %9 */	variable pointer_id=%9 type=%8 storage_class=output
/* %10 */	type_pointer id=%10 storage_class=input type=%7
/* %11 */	variable pointer_id=%11 type=%10 storage_class=input
/* %12 */	type_vector id=%12 component_type=%6 components_n=3
/* %15 */	constant id=%15 type=%6 /*values[0]=*/0x3f800000
/* %20 */	type_vector id=%20 component_type=%6 components_n=2
/* %21 */	type_pointer id=%21 storage_class=output type=%20
/* %22 */	variable pointer_id=%22 type=%21 storage_class=output
/* %23 */	type_pointer id=%23 storage_class=input type=%20
/* %24 */	variable pointer_id=%24 type=%23 storage_class=input
/* %26 */	variable pointer_id=%26 type=%8 storage_class=output
/* %27 */	variable pointer_id=%27 type=%10 storage_class=input
// section end: non function declarations, breaking opcode=function(54)

// section start: function declarations then definitions
/* %4 */	function id=%4 return_type=%2 control=none type=%3
/* %5 */		label id=%5
/* %13 */			load id=%13 type=%7 pointer=%11
/* %14 */			vector_shuffle id=%14 type=%12 /*vector[0]=*/%13 /*vector[1]=*/%13 /*components[0]=*/0 /*components[1]=*/1 /*components[2]=*/2
/* %16 */			composite_extract id=%16 type=%6 composite=%14 /*indexes[0]=*/0
/* %17 */			composite_extract id=%17 type=%6 composite=%14 /*indexes[0]=*/1
/* %18 */			composite_extract id=%18 type=%6 composite=%14 /*indexes[0]=*/2
/* %19 */			composite_construct id=%19 type=%7 /*constituents[0]=*/%16 /*constituents[1]=*/%17 /*constituents[2]=*/%18 /*constituents[3]=*/%15
				store pointer=%9 object=%19
/* %25 */			load id=%25 type=%20 pointer=%24
				store pointer=%22 object=%25
/* %28 */			load id=%28 type=%7 pointer=%27
				store pointer=%26 object=%28
				return
		function_end
// section end: function declarations then definitions
./stg/test/frag.llvm.gcn0000644000175000017500000000417313707311361013176 0ustar  userBB14_0:
	s_mov_b64 s[12:13], exec                               ; BE8C047E
	s_wqm_b64 exec, exec                                   ; BEFE0A7E
	s_mov_b32 m0, s3                                       ; BEFC0303
	s_mov_b32 s3, 0                                        ; BE830380
	s_load_dwordx8 s[4:11], s[2:3], 0x0                    ; C0C20300
	s_addk_i32 s2, 0x50                                    ; B7820050
	s_load_dwordx4 s[0:3], s[2:3], 0x0                     ; C0800300
	v_interp_p1_f32 v4, v0, attr0.z                        ; C8100200
	v_interp_p1_f32 v5, v0, attr0.w                        ; C8140300
	v_interp_p1_f32 v6, v0, attr1.x                        ; C8180400
	v_interp_p1_f32 v7, v0, attr1.y                        ; C81C0500
	v_interp_p1_f32 v2, v0, attr0.x                        ; C8080000
	v_interp_p1_f32 v3, v0, attr0.y                        ; C80C0100
	v_interp_p2_f32 v4, v1, attr0.z                        ; C8110201
	v_interp_p2_f32 v5, v1, attr0.w                        ; C8150301
	v_interp_p2_f32 v6, v1, attr1.x                        ; C8190401
	v_interp_p2_f32 v7, v1, attr1.y                        ; C81D0501
	v_interp_p2_f32 v2, v1, attr0.x                        ; C8090001
	v_interp_p2_f32 v3, v1, attr0.y                        ; C80D0101
	s_waitcnt lgkmcnt(0)                                   ; BF8C007F
	s_and_b32 s0, s0, s11                                  ; 87000B00
	s_and_b64 exec, exec, s[12:13]                         ; 87FE0C7E
	image_sample v[0:3], v[2:3], s[4:11], s[0:3] dmask:0xf ; F0800F00 00010002
	s_waitcnt vmcnt(0)                                     ; BF8C0F70
	v_mul_f32_e32 v0, v4, v0                               ; 10000104
	v_mul_f32_e32 v1, v5, v1                               ; 10020305
	v_mul_f32_e32 v2, v6, v2                               ; 10040506
	v_mul_f32_e32 v3, v7, v3                               ; 10060707
	v_cvt_pkrtz_f16_f32_e32 v0, v0, v1                     ; 5E000300
	v_cvt_pkrtz_f16_f32_e32 v1, v2, v3                     ; 5E020702
	exp mrt0 v0, off, v1, off done compr vm                ; F8001C05 00000100
	s_endpgm                                               ; BF810000

./stg/README0000666000000000000000000000142113707412664010476 0ustar  rootthis project is "one compilation unit" friendly. namely, symbol name "mangling"
is performed "manually" with the C preprocessor, that without the need of
horrible and beyond sanity compilers/runtimes, like c++ and similar. doing our
name "mangling" is orders of magnitude less toxic than requiring such
compilers/runtimes.

This code would grow in an organic way: generalization would happen step by
step because it would be unreasonable to try to do it with such incomplete
perspective. For this, function tables ("vtables") may be used, "object
oriented" or other.

this project was started with near zero knowledge of generalized compiler
algorithms and the zoo of compiler data structures. the first step is trying to
acheive a correct 1 pass "register expensive" code generation.
./stg/global_c_fixing.h0000666000000000000000000000045113676620007013074 0ustar  root#ifndef CLEANUP
#define u8 uint8_t
#define u16 uint16_t
#define u32 uint32_t
#define u64 uint64_t
#define f32 float
#define loop for(;;)
/*============================================================================*/
#else
#undef u8
#undef u16
#undef u32
#undef u64
#undef f32
#undef loop
#endif
./stg/include/0000777000000000000000000000000013706331756011244 5ustar  root./stg/include/stg/0000777000175000017500000000000013706403140012051 5ustar  user./stg/include/stg/public.h0000666000175000017500000000646513704341364013522 0ustar  user#ifndef STG_PUBLIC_H
#define STG_PUBLIC_H
/* XXX: see c_fixing.h file for u8/u32/u64/etc definition */
/*
 * XXX: you must define STG_PUBLIC to the scope you intend to use. For instance
 *      it could be "static" in order to include stg in a "one compilation unit"
 *      project.
 *
 * it will take memory management ownership of what is provided as input
 * try not to use abbreviations here
 */
#include <stdbool.h>
#define STG_MACHINE_GFX_6 0x06
#define STG_USER_SGPRS_N_MAX 32
struct stg_ctx_t {
	struct {
		struct {
			u32 *ws;
			u64 ws_n;
		} spirv;
		/*------------------------------------------------------------*/
		u8 *entrypoint_name; /* will be free() by us */
		/*------------------------------------------------------------*/
		/* "user" sgprs or driver pre-loaded sgprs */
		bool user_sgprs[STG_USER_SGPRS_N_MAX];
		/*------------------------------------------------------------*/
		/*
		 * from some types of hardware draw commands: the hardware will
		 * preload 2 sgprs (consecutive or not depending on the type of
		 * the draw command) which content will be a base idx to add to
		 * the vtx idx and a base instance idx to add to the instance
		 * idx.
		 * in the hardware documentation, look for "base_vtx_loc" (here
		 * this is a hardware location, not a spirv location).
		 */
		bool have_vtx_base_idx;
		u8 vtx_base_idx_sgpr;
		bool have_instance_base_idx;
		u8 instance_base_idx_sgpr;
		/*
		 * the vulkan/spirv input binding nr is the idx in the array
		 * of buf descs which is located at the following addr:
		 */
		struct {
			struct {
				/* immediat val provided by the kernel driver */
				u32 hi_32bits;
				/* a user sgpr idx, AC_UD_VS_VERTEX_BUFFERS */
				u8 lo_32bits_sgpr;
			} addr;
		} bindings; /* actually an array of buf descs */
		/*
		 * the vulkan/spirv input location nr is the index in the array
		 * of locations
		 */
		struct {
			struct {
				/*
				 * map to the idx of a buf desc in the above
 				 * array of buf descs
				 */
				u32 binding;
				/*
				 * map the location to a byte offset inside a
				 * elt of the buf described by the buf desc
				 * which idx in the array of buf descs is the
				 * above binding
				 * (the most limiting hardware instruction
				 * seems to be 12bits)
				 */
				u16 inner_offset;
			} *array;
			u64 n;
		} locations;
		/*------------------------------------------------------------*/
		/*
		 * the vulkan/spirv descriptor set is the index in the array of
		 * descriptor sets
		 */
		struct {
			u64 addr; /* base gpu virtual address */
			/*
			 * the vulkan/spirv binding nr is the index in the
			 * descriptor set array of bindings
			 */
			struct {
				struct {
					u8 type;
					u64 n;
				} *bindings;
			} *array;
			u64 n;
		} desc_sets;
		/*------------------------------------------------------------*/
		u8 machine;
		/*------------------------------------------------------------*/
		struct {
			void (*err)(u8 *fmt, ...);
			u8 lvl;
			void (*out)(u8 *fmt, ...);
		} log;
	} input;
	/*====================================================================*/
	struct {
		u8 *hw;
		u64 hw_bytes_n;
	} output;
	/*====================================================================*/
	void *private;
};
STG_PUBLIC struct stg_ctx_t *stg_ctx_new(void);
STG_PUBLIC void stg_ctx_del(struct stg_ctx_t **ctx);
STG_PUBLIC u8 stg_translate(struct stg_ctx_t *ctx);
#endif
./stg/include/stg/ABBREVIATIONS0000666000175000017500000000004113564263104013765 0ustar  userw(s) : Word(S) (32 bits/4 bytes)
./stg/include/stg/c_fixing.h0000666000000000000000000000020713670727060013774 0ustar  root#ifndef CLEANUP
#define u8 uint8_t
#define u32 uint32_t
#define u64 uint64_t
#else
#undef u8
#undef u32
#undef u64
#undef loop
#endif



Mode Type Size Ref File
100644 blob 5 8eba6c8dd4dcaf6166bd22285ed34625f38a84ff .gitignore
100755 blob 1587 57fa4264b9ee0ae0a6f678f2527a05d3b22dda78 00-bootstrap-build.sh
100755 blob 848 a30f443bf405d56682efe3b4c5d3a19d5f7eb45d 01-re-bootstrap-build.sh
100644 blob 2142 f19c2d6b293244bb11a3f74ee77c10675cadc7d6 INSTALL
100644 blob 30 c9b735fa1332286f4b3f5f81fa10527fd7506b6e LICENSE
040000 tree - 898b86d9a71d0093e8db2f6c00ddb9542b2fe6de builders
100644 blob 1773 ef1551089a803bde37e36edc8d61bb819d06f793 conf.bootstrap.sh
100644 blob 479 8cc15efe46965ac7750fe304460f5a2b0aa4201c conf.sh
040000 tree - 7736496abef80608f40188d3f4425ef67a16375e files
100755 blob 356 8fb8be28ac72f7214b59934b9f74a682665f2d32 pkg-build
100644 blob 22800641 e9e6291054c857401f6835c728f31541dae4311e steam.tar.bz2
100644 blob 173 2047af328b22f9d146585cd9e759edbc18122250 utils.sh
040000 tree - 8e23f551092a35f82b37129dd08c35c4d313c17b x64
040000 tree - b7a22de7f5cbd97650dd45412ef7d4246e395eb8 x86
Hints:
Before first commit, do not forget to setup your git environment:
git config --global user.name "your_name_here"
git config --global user.email "your@email_here"

Clone this repository using HTTP(S):
git clone https://rocketgit.com/user/sylware/nyanlinux

Clone this repository using ssh (do not forget to upload a key first):
git clone ssh://rocketgit@ssh.rocketgit.com/user/sylware/nyanlinux

Clone this repository using git:
git clone git://git.rocketgit.com/user/sylware/nyanlinux

You are allowed to anonymously push to this repository.
This means that your pushed commits will automatically be transformed into a merge request:
... clone the repository ...
... make some changes and some commits ...
git push origin main