sylware / si-user (public) (License: Unspecified) (since 2019-09-09) (hash sha1)
old sample user level code to go with my gpu driver (has interesting code)

/tri.c (fec6dbe39290da3a33e42c89be93646569d10691) (36311 bytes) (mode 100644) (type blob)

//author Sylvain Bertrand <digital.ragnarok@gmail.com>
//Protected by GNU Affero GPL v3 with some exceptions.

//NOTES:
//This is raw, but very linear and simple radeon 3D pipeline programming.  The
//ISA (Instruction Set Architecture) documentation (see AMD web site) lets you
//understand a lot of this programming. Reading the "official" code (drm + mesa)
//is a hundredfold more difficult.  Don't be afraid, it's not that hard.  Keep
//in mind: alignment constraints, unit of work size constraints.

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <string.h>
#include <math.h>
#include <stdint.h>
#include <stdlib.h>
#include <error.h>
#include <sys/mman.h>
#include <endian.h>
#include <errno.h>

#include <linux/types.h>

#include <alga/pixel_fmts.h>
#include <alga/amd/dce6/dce6.h>
#include <alga/amd/si/ioctl.h>
#include <alga/amd/si/pkt.h>
#include <alga/amd/si/cps_regs.h>
#include <alga/amd/si/gpu_regs_cfg.h>
#include <alga/amd/si/gpu_regs_sh.h>
#include <alga/amd/si/gpu_regs_ctx.h>

//Shift v into the register bit field selected by mask and clamp it to the
//mask. mask is expected to select a contiguous run of bits.
static uint32_t set(uint32_t mask,uint32_t v)
{
        uint8_t shift;
        //guard: ffs(0) returns 0, which would make shift wrap to 255 and the
        //shift below undefined behavior
        if(!mask) return 0;
        shift=ffs(mask)-1;//bit index of the field's least significant bit
        return (v<<shift)&mask;
}

//parameters needed to build the 3d command stream: gpu virtual addresses of
//the shaders and of the framebuffer, plus the render target size in pixels
struct params_3d {
  uint64_t vs_gpu_addr;//vertex shader code (written >>8 into regs)
  uint64_t ps_gpu_addr;//pixel/fragment shader code (written >>8 into regs)
  uint64_t w;//framebuffer width in pixels
  uint64_t h;//framebuffer height in pixels
  uint64_t fb_gpu_addr;//framebuffer base (written >>8 into CB_0_COLOR_BASE)
};

//error report helper (does not exit)
#define e(m,...) error(0,0,m,##__VA_ARGS__)
//stdout log helper, appends a newline
#define o(m,...) printf(m "\n",##__VA_ARGS__)
//short type names for printf casts
#define ul unsigned long
#define ull unsigned long long

//maximum size of the ib (indirect buffer), in dwords
#define IB_DWS_N_MAX (16 * 64)

//one vertex as fetched by the vertex shader: a position and one parameter
//(used as a color), 4 floats each, i.e. a 0x20 bytes stride
struct vertex {
  float position[4];
  float param0[4];
};

#define VERTICES_N 4
//the triangle: 3 real vertices plus a 4th zeroed "null" vertex (the buffer
//resource descriptors below declare 4 records, the last one being "null")
static struct vertex vertices[VERTICES_N]={
  {
    { -0.2f, -0.9f, 0.0f, 1.0f },
    { 1.0f, 0.0f, 0.0f, 1.0f }
  },
  {
    { -0.9f, 0.9f, 0.0f, 1.0f },
    { 0.0f, 1.0f, 0.0f, 1.0f }
  },
  {
    { 0.9f, 0.9f, 0.0f, 1.0f },
    { 0.0f, 0.0f, 1.0f, 1.0f }
  },
  {
    { 0, 0, 0, 0 },
    { 0, 0, 0, 0 }
  }
};

//two 4-dword vmem buffer resource descriptors: vertex positions first, then
//vertex param 0. NOTE(review): the address dwords are left at 0 here and
//presumably patched at runtime with the vram buffer address — verify in main.
static uint32_t buf_res_descs[]={
  //init with the vram lower 32 bits vertex position buffer address
  0x00000000,
  //oring the upper 8 remaining bits of buffer address.
  //stride=0x20 (8 floats (4 position+4 color components) of 4 bytes.
  0x00200000,
  //4 records, namely 4 vertices, the last one is "null"
  0x00000004,
  //dst_sel_x=4(x) dst_sel_y=5(y) dst_sel_z=6(z) dest_sel_w=7(w)
  //nfmt=7(float) dfmt=14(32_32_32_32)
  0x00077fac,
  //----------------------------------------------------------------------------
  //init with the vram lower 32 bits vertex param 0 buffer address
  0x00000000,
  //oring the upper 8 remaining bits of buffer address.
  //stride=0x20 (8 floats (4 position+4 param 0 components) of 4 bytes.
  0x00200000,
  //4 records, namely 4 vertices, the last one is "null"
  0x00000004,
  //dst_sel_x=4(r) dst_sel_y=5(g) dst_sel_z=6(b) dst_sel_w=7(a)
  //(customary to use color terminology for params)
  //nfmt=7(float) dfmt=14(32_32_32_32)
  0x00077fac
};

//hand-assembled GCN vertex shader machine code (see the SI ISA manual):
// o USER_SGPR[3:0]<--buffer resource descriptor of the buffer of vertex
//   positions
// o USER_SGPR[7:4]<--buffer resource descriptor of the buffer of vertex
//   parameter 0 (unused here)
// note: the done bit in export instructions is only for vertex positions.
static const uint8_t vs_vgprs_n=9;//vgprs actually used by the code below
static const uint8_t vs_user_sgprs_n=8;//sgprs pre-loaded by the spi
static const uint8_t vs_sgprs_n=8;//at least vs_user_sgprs_n
static const uint8_t vs_exported_params_n=1;//only param0 (the color)
static  uint32_t vs[]={
  0xe00c2000,//buffer_load_format_xyzw idxen=1
  0x80000100,//                        soffset=128(=0) vdata=1
  0xbf8c0000,//s_waitcnt
  0xe00c2000,//buffer_load_format_xyzw idxen=1
  0x80010500,//            soffset=128(=0) srsrc=1(sgprs[4:7]) vdata=5
  0xbf8c0000,//s_waitcnt
  0xf80008cf,//export en=0b1111 done=1 tgt=12(pos0)
  0x04030201,//       vsrc0=1 vsrc1=2 vsrc2=3 vsrc3=4
  0xbf8c0000,//s_waitcnt
  0xf800020f,//export en=0b1111 tgt=32(param0)
  0x08070605,//       vsrc0=5 vsrc1=6 vsrc2=7 vsrc3=8
  0xbf8c0000,//s_waitcnt
  0xbf810000 //s_endpgm
};

//hand-assembled GCN pixel/fragment shader: outputs a constant white color
//(1.0f in all 4 components) packed to fp16 pairs for the export.
//m0 is put by the spi right after the last user pre-loaded sgprs. m0 must
//be loaded in order to index properly the parameters in lds.
//note: we don't deal with the "valid mask" for pixel in the exec register.
static const uint8_t ps_vgprs_n=4;//vgprs actually used by the code below
static const uint8_t ps_user_sgprs_n=0;//no spi pre-loaded sgprs
static const uint8_t ps_sgprs_n=0;//at least ps_user_sgprs_n
static uint32_t ps[]={
  0x7e0002f2,//v_mov_b32 src0=242(1.0f) vdst=0
  0xbf8c0000,//s_waitcnt
  0x7e0202f2,//v_mov_b32 src0=242(1.0f) vdst=1
  0xbf8c0000,//s_waitcnt
  0x7e0402f2,//v_mov_b32 src0=242(1.0f) vdst=2
  0xbf8c0000,//s_waitcnt
  0x7e0602f2,//v_mov_b32 src0=242(1.0f) vdst=3
  0xbf8c0000,//s_waitcnt
  0x5e000300,//v_cvt_pkrtz_f16_f32 vdst=0 vsrc1=1 src0=256(vgpr0)
  0x5e020702,//v_cvt_pkrtz_f16_f32 vdst=1 vsrc1=3 src0=258(vgpr2)
  0xf8001c0f,//exp vm=1 done=1 compr=1 en=0x1111
  0x01000100,//    vsrc3=1 vsrc2=0 vsrc1=1 vsrc0=0
  0xbf8c0000,//s_waitcnt
  0xbf810000 //s_endpgm
};

//return the offset aligned up to 2^order bytes: the smallest aligned offset
//equal to or above the of argument (of is returned unchanged when already
//aligned)
static uint64_t next_aligned_of(uint64_t of,uint64_t order)
{
  //use a 64 bits literal: a plain "1<<order" is a 32 bits int shift which is
  //undefined behavior for order>=31
  uint64_t blk_sz=UINT64_C(1)<<order;
  uint64_t mask=blk_sz-1;
  if(of&mask) return (of+blk_sz)&~mask;
  else return of;
}

static void cpy_htole32(uint32_t *dst, uint32_t *src, uint64_t dws_n)
{
  while(1){
    if(!dws_n) break;
    *dst++=htole32(*src++);
    dws_n--;
  }
}

//return the high dword of a 64 bits value
static uint32_t upper_32_bits(uint64_t x)
{
  return (uint32_t)(x>>32);
}

//return the low dword of a 64 bits value
static uint32_t lower_32_bits(uint64_t x)
{
  return (uint32_t)(x&UINT64_C(0xffffffff));
}

//reinterpret a float as its raw ieee754 bit pattern; punning through a union
//is well-defined in C
union f2u {
	float f;
	uint32_t u;
};
static inline uint32_t f2u(float f)
{
	return (union f2u){ .f = f }.u;
}

//append one dword to the ib (indirect buffer), converted to little endian,
//and advance the write pointer. NOTE(review): pass only side-effect free
//expressions — htole32 may be a macro that evaluates x more than once.
#define ib_wr(x) *((*ib)++)=htole32(x)

//emit the mandatory command stream prelude: cache synchronization followed by
//a context control packet
static void prelude(uint32_t **ib)
{
  //----------------------------------------------------------------------------
  //sync shader read/write caches, read/write L1/L2texture caches, read caches
  //of color blocks (we don't use the depth block).
  ib_wr(PKT3(PKT3_SURF_SYNC,4));
  //CP_COHER_CTL_0
  ib_wr(CCC_SH_ICACHE_ACTION_ENA|CCC_SH_KCACHE_ACTION_ENA|CCC_TCL2_ACTION_ENA
		                                      |CCC_CB_ACTION_ENA);
  //CP_COHER_SZ: full range
  ib_wr(0xffffffff);
  //CP_COHER_BASE
  ib_wr(0);
  ib_wr(0x0000000a);//polling interval, 0xa(10) * 16 clocks
  //----------------------------------------------------------------------------

  //----------------------------------------------------------------------------
  //seems mandatory at the start of a command stream
  ib_wr(PKT3(PKT3_CTX_CTL,2));
  ib_wr(0x80000000);
  ib_wr(0x80000000);
  //----------------------------------------------------------------------------
}

//Config reg programming, then, in theory, flushing before modifying their
//values. If same value for *ALL* accel code, should go into the linux
//module to be set once and for all.
static void cfg(uint32_t **ib)
{
  //----------------------------------------------------------------------------
  //VGT (Vertex Grouper and Tesselator block): render triangle lists
  ib_wr(PKT3(PKT3_SET_CFG_REG,2));
  ib_wr(CFG_REG_IDX(VGT_PRIM_TYPE));
  //VGT_PRIM_TYPE
  ib_wr(set(VPT_PRIM_TYPE,VPT_TRILIST));
  //----------------------------------------------------------------------------

  //----------------------------------------------------------------------------
  //PA (Primitive Assembler) CL (CLipper)
  ib_wr(PKT3(PKT3_SET_CFG_REG,2));
  ib_wr(CFG_REG_IDX(PA_CL_ENHANCE));
  //PA_CL_ENHANCE
  ib_wr(set(PCE_CLIP_SEQ_N,3)|PCE_CLIP_VTX_REORDER_ENA);
  //----------------------------------------------------------------------------
}

//zero/default-initialize the miscellaneous context registers (mostly VGT
//tessellation/geometry features which are unused here)
static void ctx_misc_init(uint32_t **ib)
{
  //basic init GPU context, XXX: not using the CLR_CTX command ???
  ib_wr(PKT3(PKT3_SET_CTX_REG,14));
  ib_wr(CTX_REG_IDX(VGT_OUTPUT_PATH_CTL));
  //VGT_OUTPUT_PATH_CTL
  ib_wr(0);
  //VGT_HOS_CTL
  ib_wr(0);
  //VGT_HOS_MAX_TESS_LVL
  ib_wr(0);
  //VGT_HOS_MIN_TESS_LVL
  ib_wr(0);
  //VGT_HOS_REUSE_DEPTH
  ib_wr(0);
  //VGT_GROUP_PRIM_TYPE
  ib_wr(0);
  //VGT_GROUP_FIRST_DECR
  ib_wr(0);
  //VGT_GROUP_DECR
  ib_wr(0);
  //VGT_GROUP_VECT_0_CTL
  ib_wr(0);
  //VGT_GROUP_VECT_1_CTL
  ib_wr(0);
  //VGT_GROUP_VECT_0_FMT_CTL
  ib_wr(0);
  //VGT_GROUP_VECT_1_FMT_CTL
  ib_wr(0);
  //VGT_GS_MODE: no geometry shader
  ib_wr(0);

  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(VGT_PRIM_ID_ENA));
  //VGT_PRIM_ID_ENA
  ib_wr(0);

  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(VGT_PRIM_ID_RESET));
  //VGT_PRIM_ID_RESET
  ib_wr(0);

  ib_wr(PKT3(PKT3_SET_CTX_REG,3));
  ib_wr(CTX_REG_IDX(VGT_STRMOUT_CFG));
  //VGT_STRMOUT_CFG: no stream-out
  ib_wr(0);
  //VGT_STRMOUT_BUF_CFG
  ib_wr(0);

  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(IA_MULTI_VGT_PARAM));
  //IA_MULTI_VGT_PARAM
  ib_wr(IMVP_SWITCH_ON_EOP | IMVP_PARTIAL_VS_WAVE_ON
                                                 | set(IMVP_PRIM_GROUP_SZ, 63));

  ib_wr(PKT3(PKT3_SET_CTX_REG,3));
  ib_wr(CTX_REG_IDX(VGT_REUSE_OFF));
  //VGT_REUSE_OFF
  ib_wr(0);
  //VGT_VTX_CNT_ENA
  ib_wr(0);

  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(VGT_SHADER_STAGES_ENA));
  //VGT_SHADER_STAGES_ENA
  ib_wr(0);

  ib_wr(PKT3(PKT3_SET_CTX_REG,3));
  ib_wr(CTX_REG_IDX(PA_SC_CENTROID_PRIORITY_0));
  //PA_SC_CENTROID_PRIORITY_0
  ib_wr(0x76543210);
  //PA_SC_CENTROID_PRIORITY_1
  ib_wr(0xfedcba98);

  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(DB_EQAA));
  //DB_EQAA
  ib_wr(0x00110000);
}

//program the VGT index bounds: accept the full index range, no offset, no
//primitive restart
static void ctx_vgt(uint32_t **ib)
{
  //VGT (Vertex Grouper and Tesselator block)
  ib_wr(PKT3(PKT3_SET_CTX_REG,5));
  ib_wr(CTX_REG_IDX(VGT_MAX_VTX_IDX));
  //VGT_MAX_VTX_IDX: all bits set, i.e. no upper bound
  ib_wr(~0);
  //VGT_MIN_VTX_IDX
  ib_wr(0);
  //VGT_IDX_OF
  ib_wr(0);
  //VGT_MULTI_PRIM_IB_RESET_IDX
  ib_wr(0);

  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(VGT_MULTI_PRIM_IB_RESET_ENA));
  //VGT_MULTI_PRIM_IB_RESET_ENA
  ib_wr(0);
}

//program the spi/sh registers specific to the vertex shader: pre-loaded user
//sgprs, program address, register allocation, and export formats
static void ctx_spi_sh_vs(uint32_t **ib,struct params_3d *params_3d)
{
  //setup specific for the vertex shader

  //Tell the spi to pre-load the buffer descriptors in user sgprs
  ib_wr(PKT3(PKT3_SET_SH_REG,9));
  ib_wr(SH_REG_IDX(SPI_SH_USER_DATA_VS_0));
  //SPI_SH_USER_DATA_VS_0
  ib_wr(buf_res_descs[0]);
  //SPI_SH_USER_DATA_VS_1
  ib_wr(buf_res_descs[1]);
  //SPI_SH_USER_DATA_VS_2
  ib_wr(buf_res_descs[2]);
  //SPI_SH_USER_DATA_VS_3
  ib_wr(buf_res_descs[3]);
  //SPI_SH_USER_DATA_VS_4
  ib_wr(buf_res_descs[4]);
  //SPI_SH_USER_DATA_VS_5
  ib_wr(buf_res_descs[5]);
  //SPI_SH_USER_DATA_VS_6
  ib_wr(buf_res_descs[6]);
  //SPI_SH_USER_DATA_VS_7
  ib_wr(buf_res_descs[7]);

  //shader program address is stored >>8, i.e. 256 bytes aligned
  ib_wr(PKT3(PKT3_SET_SH_REG,5));
  ib_wr(SH_REG_IDX(SPI_SH_PGM_LO_VS));
  //SPI_SH_PGM_LO_VS
  ib_wr(lower_32_bits(params_3d->vs_gpu_addr>>8));
  //SPI_SH_PGM_HI_VS
  ib_wr(set(SSPHV_MEM_BASE,upper_32_bits(params_3d->vs_gpu_addr>>8)));
  //SPI_SH_PGM_RSRC_VS_0: the vgprs are allocated using units of 4 vgprs,
  //sgprs using units of 8 sgprs. Don't forget to book 2 additional
  //sgprs for vcc. Both counts are minus one unit.
  ib_wr(set(SSPRV_VGPRS,((vs_vgprs_n-1)/4))
                                       | set(SSPRV_SGPRS,((vs_sgprs_n+2)-1)/8));
  //SPI_SH_PGM_RSRC_VS_1: tell the spi the count of sgprs which are not vcc.
  ib_wr(set(SSPRV_USER_SGPR,vs_user_sgprs_n));

  //our vertex shader exports only the color as parameter
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(SPI_VS_OUT_CFG));
  //SPI_VS_OUT_CFG: field is count minus one
  ib_wr(set(SVOC_VS_PARAM_EXPORT_COUNT,vs_exported_params_n-1));

  //The spi needs to be told what packing format is used by the vertex
  //shader to export the position.
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(SPI_SH_POS_FMT));
  //SPI_SH_POS_FMT
  ib_wr(set(SSPF_POS_0_EXPORT_FMT,SSPF_4COMP));
}

//program the spi/sh registers specific to the pixel/fragment shader: program
//address, register allocation, input interpolation and export formats
static void ctx_spi_sh_ps(uint32_t **ib,struct params_3d *params_3d)
{
  //setup specific for the pixel/fragment shader

  //shader program address is stored >>8, i.e. 256 bytes aligned
  ib_wr(PKT3(PKT3_SET_SH_REG,5));
  ib_wr(SH_REG_IDX(SPI_SH_PGM_LO_PS));
  //SPI_SH_PGM_LO_PS
  ib_wr(lower_32_bits(params_3d->ps_gpu_addr>>8));
  //SPI_SH_PGM_HI_PS
  ib_wr(set(SSPHP_MEM_BASE,upper_32_bits(params_3d->ps_gpu_addr>>8)));
  //SPI_SH_PGM_RSRC_PS_0: we must account 1 additional sgpr for m0,
  //which will be loaded in the sgpr right after the last user sgpr.
  ib_wr(set(SSPRP_VGPRS,((ps_vgprs_n-1)/4))
                                     | set(SSPRP_SGPRS,((ps_sgprs_n+1+2)-1)/8));
  //SPI_SH_PGM_RSRC_PS_1: same constraints as the vertex shader
  //plus the fact the spi will load the m0 in the first sgpr after the
  //last user loaded sgpr.
  //NOTE(review): this writes ps_sgprs_n (not ps_user_sgprs_n) into the
  //USER_SGPR field; both are 0 here so the value is the same — verify
  //which one is intended before changing the shaders.
  ib_wr(set(SSPRP_USER_SGPR,ps_sgprs_n));

  //tell the spi the pixel/fragment shader will need perspective center
  //interpolation data in input (mandatory or gpu hang)
  ib_wr(PKT3(PKT3_SET_CTX_REG,3));
  ib_wr(CTX_REG_IDX(SPI_PS_INPUT_ENA));
  //SPI_PS_INPUT_ENA
  ib_wr(SPIE_PERSP_CENTER_ENA);
  //SPI_PS_INPUT_ADDR
  ib_wr(SPIA_PERSP_CENTER_ENA);

  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(SPI_PS_IN_CTL));
  //SPI_PS_IN_CTL: 1 parameter to interpolate. Must have at least one
  ib_wr(set(SPIC_INTERP_N,1));

  //don't care about z depth export
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(SPI_SH_Z_FMT));
  //SPI_SH_Z_FMT
  ib_wr(set(SSZF_Z_EXPORT_FMT,SSZF_ZERO));

  //only 1 input param on 32, then only SPI_PS_INPUT_CTL_00
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(SPI_PS_INPUT_CTL_00));
  //SPI_PS_INPUT_CTL_00
  ib_wr(0);

  //The spi sends the pixel color exported by a pixel/fragment shader to
  //a cb, it needs to be told about the special color packing format the
  //shader used.
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(SPI_SH_COLOR_FMT));
  //SPI_SH_COLOR_FMT: matches the fp16 pair packing done in the ps code
  ib_wr(set(SSCF_COLOR_0_EXPORT_FMT,SSCF_FP16_ABGR));
}

//program the SH (SHader block) registers for both shader stages
static void ctx_spi_sh(uint32_t **ib,struct params_3d *params_3d)
{
  //SH (SHader block)
  ctx_spi_sh_vs(ib,params_3d);
  ctx_spi_sh_ps(ib,params_3d);
}

//program the SPI (Shader Processor Interpolator) context registers, then the
//shader-stage specific ones
static void ctx_spi(uint32_t **ib,struct params_3d *params_3d)
{
  //SPI (Shader Processor Interpolator)
  //disable the point primitive sprite
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(SPI_INTERPOL_CTL_0));
  //SPI_INTERPOL_CTL_0
  ib_wr(0);

  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(SPI_BARYC_CTL));
  //SPI_BARYC_CTL: want 0 in working sample
  ib_wr(0);

  ctx_spi_sh(ib,params_3d);
}

//program the PA SU (Setup Unit) context registers: pixel center, point/line
//defaults (unused primitives), and the triangle rasterization mode
static void ctx_pa_su(uint32_t **ib)
{
  //PA (Primitive Assembler) SU (Setup Unit)
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(PA_SU_VTX_CTL));
  //PA_SU_VTX_CTL: tells the PA (Primitive Assembler) SU (Setup Unit)
  //to place the(?) pixel at the center of the vertex?
  ib_wr(PSVC_PIX_CENTER);

  //setup for the PA (Primitive Assembler) SU (Setup Unit) for the
  //point/line primitive rendering: we do not render point
  //or line primitives.
  //Set it to 8 like in working samples
  ib_wr(PKT3(PKT3_SET_CTX_REG,4));
  ib_wr(CTX_REG_IDX(PA_SU_POINT_SZ));
  //PA_SU_POINT_SZ
  ib_wr(set(PSPS_H,8)|set(PSPS_W,8));
  //PA_SU_POINT_MINMAX
  ib_wr(set(PSPM_MIN,8)|set(PSPM_MAX,8));
  //PA_SU_LINE_CTL
  ib_wr(set(PSLC_W,8));

  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(PA_SU_POLY_OF_CLAMP));
  //PA_SU_POLY_OF_CLAMP: tell the PA (Primitive Assembler) SU
  //(Setup Unit) for polygon not to clamp something ?
  ib_wr(0);

  //related to the SC (Scan Converter)
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(PA_SU_SC_MODE_CTL));
  //PA_SU_SC_MODE_CTL: removed FACE to follow working samples
  ib_wr(set(PSSMC_POLY_MODE_FRONT_PTYPE,PSSMC_DRAW_TRIANGLES)
                          | set(PSSMC_POLY_MODE_BACK_PTYPE,PSSMC_DRAW_TRIANGLES)
                          | PSSMC_PROVOKING_VTX_LAST);

  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(PA_SU_PRIM_FILTER_CTL));
  //PA_SU_PRIM_FILTER_CTL: no primitive filtering
  ib_wr(0);
}

//program the PA CL (CLipper) context registers: disable the guard band,
//default nan/inf handling, no extra vs-output clipping, ucp mode
static void ctx_pa_cl(uint32_t **ib)
{
  //PA (Primitive Assembler) CL (CLipper)
  ib_wr(PKT3(PKT3_SET_CTX_REG,5));
  ib_wr(CTX_REG_IDX(PA_CL_GB_VERT_CLIP_ADJ));
  //disable GB (Guard Band) by setting those registers to 1.0f
  //PA_CL_GB_VERT_CLIP_ADJ
  ib_wr(f2u(1.0f));
  //PA_CL_GB_VERT_DISC_ADJ
  ib_wr(f2u(1.0f));
  //PA_CL_GB_HORZ_CLIP_ADJ
  ib_wr(f2u(1.0f));
  //PA_CL_GB_HORZ_DISC_ADJ
  ib_wr(f2u(1.0f));

  //define the way the PA (Primitive Assembler) CL (CLipper) will
  //behave regarding NAN (Not A Number) and INF (INFinity) values
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(PA_CL_NANINF_CTL));
  //PA_CL_NANINF_CTL: to hardware default behaviour
  ib_wr(0);

  //no clipping done on the input from the vertex shader
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(PA_CL_VS_OUT_CTL));
  //PA_CL_VS_OUT_CTL
  ib_wr(0);

  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(PA_CL_CLIP_CTL));
  //PA_CL_CLIP_CTL: ucp mode 3=always expand and clip as trifan
  ib_wr(set(PCCC_PS_UCP_MODE,3) | PCCC_DX_LINEAR_ATTR_CLIP_ENA);
}

//program viewport 0 transform: scale/offset mapping clip space [-1,1] to the
//w x h pixel framebuffer, plus the z range
static void ctx_pa_sc_vport_0_te(uint32_t **ib, struct params_3d *params_3d)
{
  //PA (Primitive Assembler) SC (Scan Converter) VPORT (ViewPORT) 0 TE
  //(Transform Engine)
  ib_wr(PKT3(PKT3_SET_CTX_REG,7));
  ib_wr(CTX_REG_IDX(PA_SC_VPORT_0_TE_X_SCALE));
  //PA_SC_VPORT_0_TE_X_SCALE
  ib_wr(f2u(params_3d->w/2.0f));
  //PA_SC_VPORT_0_TE_X_OF
  ib_wr(f2u(params_3d->w / 2.0f));
  //PA_SC_VPORT_0_TE_Y_SCALE
  ib_wr(f2u(params_3d->h / 2.0f));
  //PA_SC_VPORT_0_TE_Y_OF
  ib_wr(f2u(params_3d->h / 2.0f));
  //PA_SC_VPORT_0_TE_Z_SCALE: stick to working sample values
  ib_wr(f2u(0.5f));
  //PA_SC_VPORT_0_TE_Z_OF: stick to working sample values
  ib_wr(f2u(0.5f));

  ib_wr(PKT3(PKT3_SET_CTX_REG,3));
  ib_wr(CTX_REG_IDX(PA_SC_VPORT_0_TE_ZMIN));
  //PA_SC_VPORT_0_TE_ZMIN: min Z value from VPORT TE
  ib_wr(f2u(0.0f));
  //PA_SC_VPORT_0_TE_ZMAX: max Z value from VPORT TE
  ib_wr(f2u(1.0f));
}

//program viewport 0: scissor covering the whole framebuffer, then the
//viewport transform
static void ctx_pa_sc_vport_0(uint32_t **ib, struct params_3d *params_3d)
{
  //PA (Primitive Assembler) SC (Scan Converter) VPORT (ViewPORT) 0
  ib_wr(PKT3(PKT3_SET_CTX_REG,3));
  ib_wr(CTX_REG_IDX(PA_SC_VPORT_0_SCISSOR_TL));
  //PA_SC_VPORT_0_SCISSOR_TL
  ib_wr(set(PSVST_X,0)|set(PSVST_Y,0));
  //PA_SC_VPORT_0_SCISSOR_BR
  ib_wr(set(PSVSB_X,params_3d->w)|set(PSVSB_Y,params_3d->h));

  ctx_pa_sc_vport_0_te(ib,params_3d);
}

//enable all viewport transform components (x/y/z scale and offset)
static void ctx_pa_sc_vports_te(uint32_t **ib)
{
  //PA (Primitive Assembler) SC (Scan Converter) VPORT (ViewPORT) TE (Transform
  //Engine)
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(PA_SC_VPORT_TE_CTL));
  //PA_SC_VPORT_TE_CTL: no so called perspective division
  ib_wr(PSVTC_VPORT_X_SCALE_ENA|PSVTC_VPORT_X_OF_ENA|PSVTC_VPORT_Y_SCALE_ENA
              |PSVTC_VPORT_Y_OF_ENA|PSVTC_VPORT_Z_SCALE_ENA|PSVTC_VPORT_Z_OF_ENA
              |PSVTC_VTX_W0_FMT);
}

//program the viewport registers: viewport 0 then the global transform enables
static void ctx_pa_sc_vports(uint32_t **ib, struct params_3d *params_3d)
{
  //PA (Primitive Assembler) SC (Scan Converter) VPORT (ViewPORT)
  ctx_pa_sc_vport_0(ib,params_3d);
  ctx_pa_sc_vports_te(ib);
}
 
//program the PA SC (Scan Converter) context registers: edge rule, AA masks,
//clip rectangles and all the scissors (all sized to the full framebuffer)
static void ctx_pa_sc(uint32_t **ib, struct params_3d *params_3d)
{
  //PA (Primitive Assembler) SC (Scan Converter)
  ib_wr(PKT3(PKT3_SET_CTX_REG,3));
  ib_wr(CTX_REG_IDX(PA_SC_MODE_CTL_0));
  //PA_SC_MODE_CTL_0
  ib_wr(0);
  //PA_SC_MODE_CTL_1
  ib_wr(0);

  //defines how to render the edge of primitives
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(PA_SC_EDGERULE));
  //PA_SC_EDGERULE
  ib_wr(0xaaaaaaaa);

  //----------------------------------------------------------------------------
  //Anti-Aliasing... probably
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(PA_SC_AA_CFG));
  //PA_SC_AA_CFG: no multisampling
  ib_wr(0);

  //do something AA related
  ib_wr(PKT3(PKT3_SET_CTX_REG,3));
  ib_wr(CTX_REG_IDX(PA_SC_AA_MASK_X0Y0_X1Y0));
  //PA_SC_AA_MASK_X0Y0_X1Y0
  ib_wr(0xffffffff);
  //PA_SC_AA_MASK_X0Y1_X1Y1
  ib_wr(0xffffffff);
  //----------------------------------------------------------------------------

  ib_wr(PKT3(PKT3_SET_CTX_REG,10));
  ib_wr(CTX_REG_IDX(PA_SC_CLIPRECT_RULE));
  //PA_SC_CLIPRECT_RULE: no scissor required then clip rule is 0xffff (no specs
  //provided)
  ib_wr(set(PSCR_CLIP_RULE,0xffff));
  //PA_SC_CLIPRECT_0_TL
  ib_wr(0);
  //PA_SC_CLIPRECT_0_BR
  ib_wr(set(PSCB_X,params_3d->w)|set(PSCB_Y,params_3d->h));
  //PA_SC_CLIPRECT_1_TL
  ib_wr(0);
  //PA_SC_CLIPRECT_1_BR
  ib_wr(set(PSCB_X,params_3d->w)|set(PSCB_Y,params_3d->h));
  //PA_SC_CLIPRECT_2_TL
  ib_wr(0);
  //PA_SC_CLIPRECT_2_BR
  ib_wr(set(PSCB_X,params_3d->w)|set(PSCB_Y,params_3d->h));
  //PA_SC_CLIPRECT_3_TL
  ib_wr(0);
  //PA_SC_CLIPRECT_3_BR
  ib_wr(set(PSCB_X,params_3d->w)|set(PSCB_Y,params_3d->h));

  //----------------------------------------------------------------------------
  //Tells the SC (Scan Converter/rasteriser) we don't use line stipple since we
  //do not render line primitives.  XXX: ORed register? Because if not will set
  //all bits to 0!  We only want to set to 0 LINE_STIPPLE_ENA.
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(PA_SC_LINE_STIPPLE));
  //PA_SC_LINE_STIPPLE
  ib_wr(0);

  //Even if we are not rendering line primitives, tells the PA (Primitive
  //Assembler) SC (scan converter/rasteriser) to do "something" with the last
  //pixel
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(PA_SC_LINE_CTL));
  //PA_SC_LINE_CTL
  ib_wr(PSLC_LAST_PIXEL);
  //----------------------------------------------------------------------------

  //----------------------------------------------------------------------------
  //set the value of the scissors
  ib_wr(PKT3(PKT3_SET_CTX_REG,3));
  ib_wr(CTX_REG_IDX(PA_SC_GENERIC_SCISSOR_TL));
  //PA_SC_GENERIC_SCISSOR_TL
  ib_wr(set(PSGST_X,0)|set(PSGST_Y,0));
  //PA_SC_GENERIC_SCISSOR_BR
  ib_wr(set(PSGSB_X,params_3d->w)|set(PSGSB_Y,params_3d->h));

  ib_wr(PKT3(PKT3_SET_CTX_REG,3));
  ib_wr(CTX_REG_IDX(PA_SC_SCREEN_SCISSOR_TL));
  //PA_SC_SCREEN_SCISSOR_TL
  ib_wr(set(PSSST_X,0)|set(PSSST_Y,0));
  //PA_SC_SCREEN_SCISSOR_BR
  ib_wr(set(PSSSB_X,params_3d->w)|set(PSSSB_Y,params_3d->h));

  ib_wr(PKT3(PKT3_SET_CTX_REG,4));
  ib_wr(CTX_REG_IDX(PA_SC_WND_OF));
  //PA_SC_WND_OF: the window offset in the screen which can be used by many
  //scissors.
  ib_wr(0);
  //PA_SC_WND_SCISSOR_TL
  ib_wr(set(PSWST_X,0)|set(PSWST_Y,0));
  //PA_SC_WND_SCISSOR_BR
  ib_wr(set(PSWSB_X,params_3d->w)|set(PSWSB_Y,params_3d->h));
  //----------------------------------------------------------------------------

  ctx_pa_sc_vports(ib,params_3d);
}

//program all the PA (Primitive Assembler) sub-blocks: setup unit, clipper,
//scan converter
static void ctx_pa(uint32_t **ib,struct params_3d *params_3d)
{
  //PA (Primitive Assembler)
  ctx_pa_su(ib);
  ctx_pa_cl(ib);
  ctx_pa_sc(ib,params_3d);
}

//program the DB (Depth Block) context registers: depth/stencil is fully
//disabled, the remaining registers are still given clean values
static void ctx_dbs(uint32_t **ib)
{
  //DBs (Depth Blocks)
  //disable the depth stencil/z-buffer
  ib_wr(PKT3(PKT3_SET_CTX_REG,3));
  ib_wr(CTX_REG_IDX(DB_Z_INFO));
  //DB_Z_INFO
  ib_wr(0);
  //DB_STENCIL_INFO
  ib_wr(0);

  //even if disabled, setup some clean values in a few regs

  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(DB_DEPTH_CTL));
  //DB_DEPTH_CTL
  ib_wr(0);

  ib_wr(PKT3(PKT3_SET_CTX_REG,5));
  ib_wr(CTX_REG_IDX(DB_DEPTH_BOUNDS_MIN));
  //DB_DEPTH_BOUNDS_MIN
  ib_wr(0);
  //DB_DEPTH_BOUNDS_MAX
  ib_wr(0);
  //DB_STENCIL_CLR
  ib_wr(0);
  //DB_DEPTH_CLR
  ib_wr(f2u(1.0f));

  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(DB_RENDER_CTL));
  //DB_RENDER_CTL
  ib_wr(0);

  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(DB_RENDER_OVERRIDE_0));
  //DB_RENDER_OVERRIDE_0: disable the hierarchical z/stencil forcing
  ib_wr(set(DRO_FORCE_HIZ_ENA,DRO_FORCE_DIS)
                                         |set(DRO_FORCE_HIS_ENA_0,DRO_FORCE_DIS)
                                                       |set(DRO_FORCE_HIS_ENA_1,
                                                                DRO_FORCE_DIS));

  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(DB_STENCIL_CTL));
  //DB_STENCIL_CTL
  ib_wr(0);

  ib_wr(PKT3(PKT3_SET_CTX_REG,4));
  ib_wr(CTX_REG_IDX(DB_SRESULTS_CMP_STATE_0));
  //DB_SRESULTS_CMP_STATE_0
  ib_wr(0);
  //DB_SRESULTS_CMP_STATE_1
  ib_wr(0);
  //DB_PRELOAD_CTL
  ib_wr(0);

  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(DB_ALPHA_TO_MASK));
  //DB_ALPHA_TO_MASK
  ib_wr(set(DATM_ALPHA_TO_MASK_OF_0,2)|set(DATM_ALPHA_TO_MASK_OF_1,2)
                |set(DATM_ALPHA_TO_MASK_OF_2,2)|set(DATM_ALPHA_TO_MASK_OF_3,2));

  ib_wr(PKT3(PKT3_SET_CTX_REG,3));
  ib_wr(CTX_REG_IDX(DB_STENCILREFMASK));
  //DB_STENCILREFMASK
  ib_wr(0);
  //DB_STENCILREFMASK_BF
  ib_wr(0);

  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(DB_SH_CTL));
  //DB_SH_CTL
  ib_wr(set(DSC_Z_ORDER,DSC_EARLY_Z_THEN_LATE_Z));
}

//disable blending on all 8 color blocks
static void ctx_cbs_blend(uint32_t **ib)
{
  //blend blocks of CBs (Color Blocks)
  ib_wr(PKT3(PKT3_SET_CTX_REG,9));
  ib_wr(CTX_REG_IDX(CB_0_BLEND_CTL));
  //CB_0_BLEND_CTL: disable blending
  ib_wr(0);
  //CB_1_BLEND_CTL: disable blending
  ib_wr(0);
  //CB_2_BLEND_CTL: disable blending
  ib_wr(0);
  //CB_3_BLEND_CTL: disable blending
  ib_wr(0);
  //CB_4_BLEND_CTL: disable blending
  ib_wr(0);
  //CB_5_BLEND_CTL: disable blending
  ib_wr(0);
  //CB_6_BLEND_CTL: disable blending
  ib_wr(0);
  //CB_7_BLEND_CTL: disable blending
  ib_wr(0);
}

//program color block 0 as the render target: framebuffer address, pitch/slice
//in 8x8 tiles, and the pixel format
static void ctx_cb_0(uint32_t **ib,struct params_3d *params_3d)
{
  //CB 0 (Color Block 0)
  ib_wr(PKT3(PKT3_SET_CTX_REG,7));
  ib_wr(CTX_REG_IDX(CB_0_COLOR_BASE));
  //CB_0_COLOR_BASE: address stored >>8, i.e. 256 bytes aligned
  ib_wr(params_3d->fb_gpu_addr>>8);
  //CB_0_COLOR_PITCH: a thin1 tile is 8x8 pixels
  ib_wr(set(CCP_TILE_MAX,params_3d->w/8-1));
  //CB_0_COLOR_SLICE: a thin1 tile is 8x8 pixels
  ib_wr(set(CCS_TILE_MAX,params_3d->w*params_3d->h/64-1));
  //CB_0_COLOR_VIEW: 0, or last tile index for an array of slices
  ib_wr(0);
  //CB_0_COLOR_INFO: for sRGB color space, in 8 bits little endian argb, the
  //color component swap is ALT for the color components from the pixel/fragment
  //shader and value must be clamped before and after blending to mrt range.
  ib_wr(set(CCI_ENDIAN,CCI_ENDIAN_NONE)|set(CCI_FMT,CCI_COLOR_8_8_8_8)
         |set(CCI_COMP_SWAP, CCI_SWAP_ALT)|set(CCI_NUMBER_TYPE,CCI_NUMBER_UNORM)
                                                              |CCI_BLEND_CLAMP);
  //CB_0_COLOR_ATTRIB: see gpu/tiling.c
  ib_wr(set(CCA_TILE_MODE_IDX,8));
}

//program the CBs (Color Blocks): blending off, CB 0 as the render target,
//component masks, and normal color mode
static void ctx_cbs(uint32_t **ib,struct params_3d *params_3d)
{
  //CBs (Color Blocks)
  ctx_cbs_blend(ib);

  ctx_cb_0(ib,params_3d);

  //do enable all color components (RGBA) from the pixel/fragment shader to be
  //used by the CB 0 and do enable CB 0 to output all computed color components
  //to target (here our framebuffer)
  ib_wr(PKT3(PKT3_SET_CTX_REG,3));
  ib_wr(CTX_REG_IDX(CB_TGT_MASK));
  //CB_TGT_MASK
  ib_wr(set(CTM_TGT_0_ENA,CTM_TGT_RED|CTM_TGT_GREEN|CTM_TGT_BLUE
                                                               |CTM_TGT_ALPHA));
  //CB_SH_MASK
  ib_wr(set(CSM_OUTPUT_0_ENA,CSM_OUTPUT_RED|CSM_OUTPUT_GREEN|CSM_OUTPUT_BLUE
                                                            |CSM_OUTPUT_ALPHA));

  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(CB_COLOR_CTL));
  //CB_COLOR_CTL: switch normal mode for all CBs
  ib_wr(set(CCC_MODE,CCC_CB_NORMAL)|set(CCC_ROP3,CCC_0XCC));
}

//program the whole GPU context: misc init, then one sub-function per
//hardware block (VGT, SPI, PA, DB, CB)
static void ctx(uint32_t **ib,struct params_3d *params_3d)
{
  ctx_misc_init(ib);
  ctx_vgt(ib);
  ctx_spi(ib,params_3d);
  ctx_pa(ib,params_3d);
  ctx_dbs(ib);
  ctx_cbs(ib,params_3d);
}

//emit the draw packets: 16 bits indices, 1 instance, then an auto-indexed
//draw of VERTICES_N-1 (=3) vertices (the 4th "null" vertex is not drawn)
static void draw(uint32_t **ib)
{
  ib_wr(PKT3(PKT3_IDX_TYPE,1));
  ib_wr(set(PKT3_SZ,PKT3_16BITS));

  ib_wr(PKT3(PKT3_INST_N,1));
  ib_wr(1);

  ib_wr(PKT3(PKT3_DRAW_IDX_AUTO,2));
  //3 indices to generate
  ib_wr(VERTICES_N-1);
  //VGT_DRAW_INITIATOR
  ib_wr(set(VDI_SRC_SELECT,VDI_AUTO_IDX));
}

//pad the ib with PKT2 nops until its dword count is aligned on the cp pfp
//(Pre-Fetch Parser) fetch granularity
static void pfp_align(uint32_t **ib,uint32_t *ib_start)
{
  while(((*ib-ib_start)&CP_RING_PFP_DW_MASK)!=0) ib_wr(PKT2);
}

//build the full 3d ib (indirect buffer) at ib_start: prelude, config and
//context register programming, draw packets, then pfp alignment padding.
//Returns the ib size in dwords.
static uint64_t ib_3d(uint32_t *ib_start, struct params_3d *params_3d)
{
  uint32_t *ib=ib_start;
  prelude(&ib);
  //============================================================================
  //the real thing is here
  cfg(&ib);
  ctx(&ib,params_3d);
  //============================================================================
  draw(&ib);
  pfp_align(&ib,ib_start);
  return ib-ib_start;
}

//We prepare a big buffer with everything cpu side, then dma it to gpu vram, run
//it, and wait for a fence.
//We prepare a big buffer with everything cpu side, then dma it to gpu vram, run
//it, and wait for a fence.
//Usage: tri <fb_gpu_addr(hex)> <width> <height>
//Returns EXIT_SUCCESS, or EXIT_FAILURE on any error; the vram buffer is freed
//on every error path reached after its allocation.  The fd and the mmap-ed
//aperture are deliberately left to process exit for reclaim.
int main(int argc, char *argv[])
{
  int r0=0;
  int r1=0;
  //----------------------------------------------------------------------------
  //arguments
  if(argc<4){
    e("missing arguments");
    r0=EXIT_FAILURE;
    goto exit;
  }
  //strtoull: keep the full 64 bits even on ilp32 targets where unsigned long
  //(hence strtoul) is only 32 bits wide
  uint64_t fb_gpu_addr=strtoull(argv[1],NULL,16);
  uint64_t w=strtoull(argv[2],NULL,10);
  uint64_t h=strtoull(argv[3],NULL,10);
  //----------------------------------------------------------------------------

  //"ll" length modifier: the "L" modifier is only defined for long double
  //conversions, using it with x/u is undefined behavior
  o("drawing a triangle:fb=0x%016llx,w=%llu;h=%llu",(ull)fb_gpu_addr,(ull)w,
                                                                        (ull)h);

  //----------------------------------------------------------------------------
  errno=0;
  int f=open("/dev/si0",O_RDWR);
  if(f==-1){
    e("open failed:%s",strerror(errno));
    r0=EXIT_FAILURE;
    goto exit;
  }
  //----------------------------------------------------------------------------

  //----------------------------------------------------------------------------
  //pre-compute aligned offsets and the aligned size of our vram buffer

  //vertex shader must be 256 bytes aligned (order 8), 0 since we will allocate
  //a 256 bytes aligned buffer.
  uint64_t vs_of=0;
  //pixel/fragment shader must be 256 bytes aligned (order 8)
  uint64_t ps_of=next_aligned_of(sizeof(vs),8);
  //vertices are fetch by block of 4 vertices, dw aligned (order 2)
  uint64_t vertices_of=next_aligned_of(ps_of+sizeof(ps),2);
  //ib is aligned on prefetch size which is 16 dws (order 6)
  uint64_t ib_of=next_aligned_of(vertices_of+sizeof(vertices),6);

  //worst alignment is 256 bytes (order 8), then round up for allocation
  uint64_t vram_buf_sz=next_aligned_of(ib_of+(IB_DWS_N_MAX<<2),8);
  o("vs_of=0x%016llx ps_of=0x%016llx vertices_of=0x%016llx ib_of=0x%016llx vram_buf_sz=0x%016llx",
            (ull)vs_of,(ull)ps_of,(ull)vertices_of,(ull)ib_of,(ull)vram_buf_sz);
  //----------------------------------------------------------------------------
 
  //----------------------------------------------------------------------------
  o("allocating 256 bytes aligned vram buffer...");
  struct si_mem mem;
  mem.align=256;//worst alignment is vs and ps
  mem.sz=vram_buf_sz;
  errno=0;
  ul req=_IOWR('d',SI_MEM_ALLOC,mem);
  r1=ioctl(f,req,&mem);
  if(r1==-1){
    e("alloc vram buffer failed:%s",strerror(errno));
    r0=EXIT_FAILURE;
    goto exit;
  }
  o("vram_buf_gpu_addr=0x%016llx",(ull)mem.gpu_addr);
  o("allocating 256 bytes aligned vram buffer done");
  //----------------------------------------------------------------------------

  //----------------------------------------------------------------------------
  //now, we have enough info to init the parameters for the pipeline
  struct params_3d params_3d;
  params_3d.vs_gpu_addr=mem.gpu_addr+vs_of;
  params_3d.ps_gpu_addr=mem.gpu_addr+ps_of;
  params_3d.w=w;
  params_3d.h=h;
  params_3d.fb_gpu_addr=fb_gpu_addr;
  o("params_3d:vs_gpu_addr=0x%016llx ps_gpu_addr=0x%016llx w=%llu h=%llu fb_gpu_addr=0x%016llx",
            (ull)params_3d.vs_gpu_addr,(ull)params_3d.ps_gpu_addr,(ull)w,(ull)h,
                                                    (ull)params_3d.fb_gpu_addr);
  //----------------------------------------------------------------------------

  //----------------------------------------------------------------------------
  o("mmaping an aperture...");
  //get an aperture of the size of our vram buffer for dma
  errno=0;
  void *dma_buffer=mmap(NULL,vram_buf_sz,PROT_READ|PROT_WRITE,MAP_SHARED,f,0);
  if(dma_buffer==MAP_FAILED){
    e("unable to mmap an aperture buffer:%s",strerror(errno));
    r0=EXIT_FAILURE;
    goto free_vram_buf;
  }
  o("dma buffer=%p",dma_buffer);
  o("mmaping an aperture done");
  //----------------------------------------------------------------------------

  //----------------------------------------------------------------------------
  //configure buffer resources: patch the shader buffer resource descriptors
  //with the final gpu addresses of the (interleaved) vertex data
  uint64_t vtx_buf_gpu_addr=mem.gpu_addr+vertices_of;
  o("buffer resources:vtx_buf_gpu=0x%016llx",(ull)vtx_buf_gpu_addr);

  //vertex position buffer start address
  buf_res_descs[0]=lower_32_bits(vtx_buf_gpu_addr); 
  buf_res_descs[1]|=upper_32_bits(vtx_buf_gpu_addr); 
  //vertex color buffer start address
  //NOTE(review): param0 lives sizeof(float[4])==16 bytes after position inside
  //struct vertex; this equals VERTICES_N*sizeof(float) only because
  //VERTICES_N==4 — consider offsetof(struct vertex,param0) to decouple the
  //offset from the vertex count
  buf_res_descs[4]=lower_32_bits(vtx_buf_gpu_addr+VERTICES_N*sizeof(float));
  buf_res_descs[5]|=upper_32_bits(vtx_buf_gpu_addr+VERTICES_N*sizeof(float));
  //----------------------------------------------------------------------------

  //----------------------------------------------------------------------------
  o("copying static data into dma buffer...");
  //cpy_htole32 works in dwords, hence the >>2 on the byte sizes
  cpy_htole32(dma_buffer+vs_of,&vs[0],sizeof(vs)>>2);
  cpy_htole32(dma_buffer+ps_of,&ps[0],sizeof(ps)>>2);
  cpy_htole32(dma_buffer+vertices_of,(uint32_t*)&vertices[0],
                                                           sizeof(vertices)>>2);
  o("copying static data into dma buffer done");
  //----------------------------------------------------------------------------

  //============================================================================
  //there, we program the 3D pipeline
  uint64_t ib_dws_n=ib_3d(dma_buffer+ib_of,&params_3d);
  o("ib_dws_n=0x%016llx(max=0x%016llx)",(ull)ib_dws_n,(ull)IB_DWS_N_MAX);
  //============================================================================
 
  //----------------------------------------------------------------------------
  o("dma-ing the cpu buffer to vram buffer...");
  struct si_dma dma;
  struct si_dma_l2l *l2l=&dma.params.l2l;
  struct si_timeouts_info *t_info=&dma.t_info;
  dma.type=SI_DMA_TYPE_L2L;
  dma.dir=SI_DMA_TO_DEVICE;
  //we don't really care here lets put one second!
  t_info->ring.n_max=1;
  t_info->ring.us=1000000;
  t_info->fence.n_max=1;
  t_info->fence.us=1000000;
  l2l->src_addr=(uint64_t)dma_buffer;
  l2l->dst_addr=mem.gpu_addr;
  l2l->sz=vram_buf_sz;
  req=_IOW('d',SI_DMA,dma);
  errno=0;
  r1=ioctl(f,req,&dma);
  switch(r1){
  case -1:
    e("dma l2l failed:%s",strerror(errno));
    r0=EXIT_FAILURE;
    goto free_vram_buf;
  case SI_RING_TIMEOUT:
    e("dma l2l failed:ring timeout");
    r0=EXIT_FAILURE;
    goto free_vram_buf;
  case SI_FENCE_TIMEOUT:
    e("dma l2l failed:fence timeout");
    r0=EXIT_FAILURE;
    goto free_vram_buf;
  }
  o("dma-ing the cpu buffer to vram buffer done");
  //----------------------------------------------------------------------------

  //----------------------------------------------------------------------------
  o("running the ib...");
  struct si_gpu_3d_ib gpu_3d_ib;
  struct si_timeout_info *ring_t_info=&gpu_3d_ib.ring_t_info;
  //we don't really care here lets put one second!
  ring_t_info->n_max=1;
  ring_t_info->us=1000000;
  gpu_3d_ib.gpu_addr=mem.gpu_addr+ib_of;
  gpu_3d_ib.dws_n=ib_dws_n;
  req=_IOW('d',SI_GPU_3D_IB,gpu_3d_ib);
  errno=0;
  r1=ioctl(f,req,&gpu_3d_ib);
  switch(r1){
  case -1:
    e("running the GPU_3D indirecting buffer failed:%s",strerror(errno));
    r0=EXIT_FAILURE;
    goto free_vram_buf;
  case SI_RING_TIMEOUT:
    e("running the GPU_3D indirecting buffer failed:ring timeout");
    r0=EXIT_FAILURE;
    goto free_vram_buf;
  }
  o("running the ib done");
  //----------------------------------------------------------------------------

  //----------------------------------------------------------------------------
  o("fencing...");
  struct si_gpu_3d_fence gpu_3d_fence;
  t_info=&gpu_3d_fence.t_info;
  //we don't really care here lets put one second!
  t_info->ring.n_max=1;
  t_info->ring.us=1000000;
  t_info->fence.n_max=1;
  t_info->fence.us=1000000;
  req=_IOW('d',SI_GPU_3D_FENCE,gpu_3d_fence);
  errno=0;
  r1=ioctl(f,req,&gpu_3d_fence);
  switch(r1){
  case -1:
    e("waiting for fence failed:%s",strerror(errno));
    r0=EXIT_FAILURE;
    break;
  case SI_RING_TIMEOUT:
    e("waiting for fence failed:ring timeout");
    r0=EXIT_FAILURE;
    break;
  case SI_FENCE_TIMEOUT:
    e("waiting for fence failed:fence timeout");
    r0=EXIT_FAILURE;
    break;
  }
  o("fencing done");
  //----------------------------------------------------------------------------

free_vram_buf:
  //----------------------------------------------------------------------------
  o("freeing vram buffer...");
  req=_IOW('d',SI_MEM_FREE,mem.gpu_addr);
  errno=0;
  r1=ioctl(f,req,&mem.gpu_addr);
  if(r1==-1){
    e("free vram buffer failed (LEAK!):%s",strerror(errno));
    r0=EXIT_FAILURE;
  }
  o("freeing vram buffer done");
  //----------------------------------------------------------------------------
exit:
  return r0;
}


Mode Type Size Ref File
100644 blob 35123 f891bd326c39a671fddfd210c0d951c97674c7c3 LICENSE
100644 blob 405 3ed5aa544d933f7b88b648964752e5443c27df85 README.md
100644 blob 2282 1c4b0d5ba77ce6e79cbb62295b3e8d15ec7a9689 dma_l2l.c
100644 blob 1746 a4d2242bba822dc989ee13598862483e47ab101f dma_u32_fill.c
100644 blob 1260 a458be504b627da28249b6787b60d01d7194c32a dpm.c
100644 blob 1653 3650ee140eab2d08eee33d9c30072580a4e7e6e8 edid.c
100644 blob 804 7d3d87f7d43f8000784ad4bf16ae9d055e177640 makefile
100644 blob 3029 77fa08994a1b8edf40a103d2cb73250e7ef49f81 modeset.c
100644 blob 2452 9daad256ae24e196d376e6d1aa2826466be7ee8f pf.c
100644 blob 36311 fec6dbe39290da3a33e42c89be93646569d10691 tri.c
Hints:
Before first commit, do not forget to setup your git environment:
git config --global user.name "your_name_here"
git config --global user.email "your@email_here"

Clone this repository using HTTP(S):
git clone https://rocketgit.com/user/sylware/si-user

Clone this repository using ssh (do not forget to upload a key first):
git clone ssh://rocketgit@ssh.rocketgit.com/user/sylware/si-user

Clone this repository using git:
git clone git://git.rocketgit.com/user/sylware/si-user

You are allowed to anonymously push to this repository.
This means that your pushed commits will automatically be transformed into a merge request:
... clone the repository ...
... make some changes and some commits ...
git push origin main