//author Sylvain Bertrand <digital.ragnarok@gmail.com>
//Protected by GNU Affero GPL v3 with some exceptions.
//NOTES:
//This is raw, but very linear and simple radeon 3D pipeline programming. The
//ISA (Instruction Set Architecture) documentation (see AMD web site) lets you
//understand a lot of this programming. Reading the "official" code (drm +
//mesa) is a hundredfold more difficult. Don't be afraid, it's not that hard.
//Keep in mind: alignment constraints, unit of work size constraints.
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <string.h>
#include <strings.h>
#include <math.h>
#include <stdint.h>
#include <stdlib.h>
#include <error.h>
#include <sys/mman.h>
#include <endian.h>
#include <errno.h>
#include <linux/types.h>
#include <alga/pixel_fmts.h>
#include <alga/amd/dce6/dce6.h>
#include <alga/amd/si/ioctl.h>
#include <alga/amd/si/pkt.h>
#include <alga/amd/si/cps_regs.h>
#include <alga/amd/si/gpu_regs_cfg.h>
#include <alga/amd/si/gpu_regs_sh.h>
#include <alga/amd/si/gpu_regs_ctx.h>
//shift v into the field described by mask (a contiguous run of set bits) and
//clamp it to that field. E.g. set(0xff00,3)==0x0300.
//Fix: ffs(0) returns 0, so an empty mask used to produce shift=255 through
//the uint8_t wrap — an undefined-behavior shift. Return 0 for an empty mask.
//ffs() is declared in <strings.h>.
static uint32_t set(uint32_t mask,uint32_t v)
{
if(!mask) return 0;
uint8_t shift;
shift=ffs(mask)-1;
return (v<<shift)&mask;
}
//parameters threaded through all the 3D pipeline programming helpers below
struct params_3d {
uint64_t vs_gpu_addr;//vram address of the vertex shader code (256 bytes aligned)
uint64_t ps_gpu_addr;//vram address of the pixel/fragment shader code (256 bytes aligned)
uint64_t w;//framebuffer width in pixels (from the command line)
uint64_t h;//framebuffer height in pixels (from the command line)
uint64_t fb_gpu_addr;//vram address of the framebuffer (from the command line)
};
//e: report an error on stderr without exiting; o: printf with a trailing \n
#define e(m,...) error(0,0,m,##__VA_ARGS__)
#define o(m,...) printf(m "\n",##__VA_ARGS__)
#define ul unsigned long
#define ull unsigned long long
//maximum size of the indirect buffer (command stream), in dwords
#define IB_DWS_N_MAX (16 * 64)
//one vertex record as fetched by the vertex shader: 8 floats = 32 bytes,
//which is the 0x20 stride encoded in the buffer resource descriptors below
struct vertex {
float position[4];//x,y,z,w clip-space position
float param0[4];//per-vertex parameter 0, used here as an rgba color
};
//3 real triangle vertices plus one all-zero "null" record: the buffer
//resource descriptors below advertise 4 records, the last being null
#define VERTICES_N 4
static struct vertex vertices[VERTICES_N]={
{
{ -0.2f, -0.9f, 0.0f, 1.0f },
{ 1.0f, 0.0f, 0.0f, 1.0f }
},
{
{ -0.9f, 0.9f, 0.0f, 1.0f },
{ 0.0f, 1.0f, 0.0f, 1.0f }
},
{
{ 0.9f, 0.9f, 0.0f, 1.0f },
{ 0.0f, 0.0f, 1.0f, 1.0f }
},
{
{ 0, 0, 0, 0 },
{ 0, 0, 0, 0 }
}
};
//two 4-dword buffer resource descriptors (position buffer, then param 0
//buffer). The address dwords are patched at runtime in main() once the vram
//buffer gpu address is known.
static uint32_t buf_res_descs[]={
//init with the vram lower 32 bits vertex position buffer address
0x00000000,
//oring the upper 8 remaining bits of buffer address.
//stride=0x20 (8 floats (4 position+4 color components) of 4 bytes.
0x00200000,
//4 records, namely 4 vertices, the last one is "null"
0x00000004,
//dst_sel_x=4(x) dst_sel_y=5(y) dst_sel_z=6(z) dest_sel_w=7(w)
//nfmt=7(float) dfmt=14(32_32_32_32)
0x00077fac,
//----------------------------------------------------------------------------
//init with the vram lower 32 bits vertex param 0 buffer address
0x00000000,
//oring the upper 8 remaining bits of buffer address.
//stride=0x20 (8 floats (4 position+4 param 0 components) of 4 bytes.
0x00200000,
//4 records, namely 4 vertices, the last one is "null"
0x00000004,
//dst_sel_x=4(r) dst_sel_y=5(g) dst_sel_z=6(b) dst_sel_w=7(a)
//(customary to use color terminology for params)
//nfmt=7(float) dfmt=14(32_32_32_32)
0x00077fac
};
//pre-assembled vertex shader machine code and its resource requirements
// o USER_SGPR[3:0]<--buffer resource descriptor of the buffer of vertex
// positions
// o USER_SGPR[7:4]<--buffer resource descriptor of the buffer of vertex
// parameter 0 (unused here)
// note: the done bit in export instructions is only for vertex positions.
static const uint8_t vs_vgprs_n=9;
static const uint8_t vs_user_sgprs_n=8;
static const uint8_t vs_sgprs_n=8;//at least vs_user_sgprs_n
static const uint8_t vs_exported_params_n=1;
static uint32_t vs[]={
0xe00c2000,//buffer_load_format_xyzw idxen=1
0x80000100,// soffset=128(=0) vdata=1
0xbf8c0000,//s_waitcnt
0xe00c2000,//buffer_load_format_xyzw idxen=1
0x80010500,// soffset=128(=0) srsrc=1(sgprs[4:7]) vdata=5
0xbf8c0000,//s_waitcnt
0xf80008cf,//export en=0b1111 done=1 tgt=12(pos0)
0x04030201,// vsrc0=1 vsrc1=2 vsrc2=3 vsrc3=4
0xbf8c0000,//s_waitcnt
0xf800020f,//export en=0b1111 tgt=32(param0)
0x08070605,// vsrc0=5 vsrc1=6 vsrc2=7 vsrc3=8
0xbf8c0000,//s_waitcnt
0xbf810000 //s_endpgm
};
//pre-assembled pixel/fragment shader machine code: outputs a constant white
//(1,1,1,1) color, packed to fp16 pairs as announced in SPI_SH_COLOR_FMT.
//m0 is put by the spi right after the last user pre-loaded sgprs. m0 must
//be loaded in order to index properly the parameters in lds.
//note: we don't deal with the "valid mask" for pixel in exec register.
static const uint8_t ps_vgprs_n=4;
static const uint8_t ps_user_sgprs_n=0;
static const uint8_t ps_sgprs_n=0;//at least ps_user_sgprs_n
static uint32_t ps[]={
0x7e0002f2,//v_mov_b32 src0=242(1.0f) vdst=0
0xbf8c0000,//s_waitcnt
0x7e0202f2,//v_mov_b32 src0=242(1.0f) vdst=1
0xbf8c0000,//s_waitcnt
0x7e0402f2,//v_mov_b32 src0=242(1.0f) vdst=2
0xbf8c0000,//s_waitcnt
0x7e0602f2,//v_mov_b32 src0=242(1.0f) vdst=3
0xbf8c0000,//s_waitcnt
0x5e000300,//v_cvt_pkrtz_f16_f32 vdst=0 vsrc1=1 src0=256(vgpr0)
0x5e020702,//v_cvt_pkrtz_f16_f32 vdst=1 vsrc1=3 src0=258(vgpr2)
0xf8001c0f,//exp vm=1 done=1 compr=1 en=0x1111
0x01000100,// vsrc3=1 vsrc2=0 vsrc1=1 vsrc0=0
0xbf8c0000,//s_waitcnt
0xbf810000 //s_endpgm
};
//return the offset aligned on power of two order equal or above the offset
//argument, e.g. next_aligned_of(5,8)==256, next_aligned_of(256,8)==256.
//Fix: the shift is done in uint64_t. The previous "1<<order" was an int
//shift: undefined behavior for order>=31 and wrong for orders 32..63.
static uint64_t next_aligned_of(uint64_t of,uint64_t order)
{
uint64_t blk_sz=(uint64_t)1<<order;
uint64_t mask=blk_sz-1;
if(of&mask) return (of+blk_sz)&~mask;
else return of;
}
static void cpy_htole32(uint32_t *dst, uint32_t *src, uint64_t dws_n)
{
while(1){
if(!dws_n) break;
*dst++=htole32(*src++);
dws_n--;
}
}
//most significant 32 bits of a 64-bit value
static uint32_t upper_32_bits(uint64_t x)
{
return (uint32_t)(x>>32);
}
//least significant 32 bits of a 64-bit value
static uint32_t lower_32_bits(uint64_t x)
{
return (uint32_t)(x&0xffffffffu);
}
//reinterpret the bits of a float as a uint32_t; union type punning is
//well-defined in C (unlike the pointer-cast alternative)
union f2u {
float f;
uint32_t u;
};
static inline uint32_t f2u(float f)
{
return (union f2u){.f=f}.u;
}
//append one little-endian dword to the indirect buffer and advance the
//caller's write pointer
#define ib_wr(x) *((*ib)++)=htole32(x)
//emit the mandatory command-stream prologue: cache synchronization followed
//by a context control packet
static void prelude(uint32_t **ib)
{
//----------------------------------------------------------------------------
//sync shader read/write caches, read/write L1/L2texture caches, read caches
//of color blocks (we don't use the depth block).
ib_wr(PKT3(PKT3_SURF_SYNC,4));
//CP_COHER_CTL_0
ib_wr(CCC_SH_ICACHE_ACTION_ENA|CCC_SH_KCACHE_ACTION_ENA|CCC_TCL2_ACTION_ENA
|CCC_CB_ACTION_ENA);
//CP_COHER_SZ: cover the whole address space
ib_wr(0xffffffff);
//CP_COHER_BASE
ib_wr(0);
ib_wr(0x0000000a);//polling interval, 0xa(10) * 16 clocks
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
//seems mandatory at the start of a command stream
ib_wr(PKT3(PKT3_CTX_CTL,2));
ib_wr(0x80000000);
ib_wr(0x80000000);
//----------------------------------------------------------------------------
}
//Config reg programming, then, in theory, flushing before modifying their
//values. If same value for *ALL* accel code, should go into the linux
//module to be set once and for all.
static void cfg(uint32_t **ib)
{
//----------------------------------------------------------------------------
//VGT (Vertex Grouper and Tesselator block)
ib_wr(PKT3(PKT3_SET_CFG_REG,2));
ib_wr(CFG_REG_IDX(VGT_PRIM_TYPE));
//VGT_PRIM_TYPE: we only draw triangle lists
ib_wr(set(VPT_PRIM_TYPE,VPT_TRILIST));
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
//PA (Primitive Assembler) CL (CLipper)
ib_wr(PKT3(PKT3_SET_CFG_REG,2));
ib_wr(CFG_REG_IDX(PA_CL_ENHANCE));
//PA_CL_ENHANCE
ib_wr(set(PCE_CLIP_SEQ_N,3)|PCE_CLIP_VTX_REORDER_ENA);
//----------------------------------------------------------------------------
}
//zero out the context registers of pipeline features we do not use
//(tessellation, geometry shaders, stream-out...) and set a few fixed values
static void ctx_misc_init(uint32_t **ib)
{
//basic init GPU context, XXX: not using the CLR_CTX command ???
ib_wr(PKT3(PKT3_SET_CTX_REG,14));
ib_wr(CTX_REG_IDX(VGT_OUTPUT_PATH_CTL));
//VGT_OUTPUT_PATH_CTL
ib_wr(0);
//VGT_HOS_CTL
ib_wr(0);
//VGT_HOS_MAX_TESS_LVL
ib_wr(0);
//VGT_HOS_MIN_TESS_LVL
ib_wr(0);
//VGT_HOS_REUSE_DEPTH
ib_wr(0);
//VGT_GROUP_PRIM_TYPE
ib_wr(0);
//VGT_GROUP_FIRST_DECR
ib_wr(0);
//VGT_GROUP_DECR
ib_wr(0);
//VGT_GROUP_VECT_0_CTL
ib_wr(0);
//VGT_GROUP_VECT_1_CTL
ib_wr(0);
//VGT_GROUP_VECT_0_FMT_CTL
ib_wr(0);
//VGT_GROUP_VECT_1_FMT_CTL
ib_wr(0);
//VGT_GS_MODE: no geometry shader
ib_wr(0);
ib_wr(PKT3(PKT3_SET_CTX_REG,2));
ib_wr(CTX_REG_IDX(VGT_PRIM_ID_ENA));
//VGT_PRIM_ID_ENA
ib_wr(0);
ib_wr(PKT3(PKT3_SET_CTX_REG,2));
ib_wr(CTX_REG_IDX(VGT_PRIM_ID_RESET));
//VGT_PRIM_ID_RESET
ib_wr(0);
ib_wr(PKT3(PKT3_SET_CTX_REG,3));
ib_wr(CTX_REG_IDX(VGT_STRMOUT_CFG));
//VGT_STRMOUT_CFG: no stream-out
ib_wr(0);
//VGT_STRMOUT_BUF_CFG
ib_wr(0);
ib_wr(PKT3(PKT3_SET_CTX_REG,2));
ib_wr(CTX_REG_IDX(IA_MULTI_VGT_PARAM));
//IA_MULTI_VGT_PARAM
ib_wr(IMVP_SWITCH_ON_EOP | IMVP_PARTIAL_VS_WAVE_ON
| set(IMVP_PRIM_GROUP_SZ, 63));
ib_wr(PKT3(PKT3_SET_CTX_REG,3));
ib_wr(CTX_REG_IDX(VGT_REUSE_OFF));
//VGT_REUSE_OFF: 0 means vertex reuse stays enabled
ib_wr(0);
//VGT_VTX_CNT_ENA
ib_wr(0);
ib_wr(PKT3(PKT3_SET_CTX_REG,2));
ib_wr(CTX_REG_IDX(VGT_SHADER_STAGES_ENA));
//VGT_SHADER_STAGES_ENA: plain vs/ps pipeline, no extra stages
ib_wr(0);
ib_wr(PKT3(PKT3_SET_CTX_REG,3));
ib_wr(CTX_REG_IDX(PA_SC_CENTROID_PRIORITY_0));
//PA_SC_CENTROID_PRIORITY_0
ib_wr(0x76543210);
//PA_SC_CENTROID_PRIORITY_1
ib_wr(0xfedcba98);
ib_wr(PKT3(PKT3_SET_CTX_REG,2));
ib_wr(CTX_REG_IDX(DB_EQAA));
//DB_EQAA
ib_wr(0x00110000);
}
//program the VGT index bounds: full 32-bit index range, no offset, no
//primitive restart
static void ctx_vgt(uint32_t **ib)
{
//VGT (Vertex Grouper and Tesselator block)
ib_wr(PKT3(PKT3_SET_CTX_REG,5));
ib_wr(CTX_REG_IDX(VGT_MAX_VTX_IDX));
//VGT_MAX_VTX_IDX: no upper bound on vertex indices
ib_wr(~0);
//VGT_MIN_VTX_IDX
ib_wr(0);
//VGT_IDX_OF
ib_wr(0);
//VGT_MULTI_PRIM_IB_RESET_IDX
ib_wr(0);
ib_wr(PKT3(PKT3_SET_CTX_REG,2));
ib_wr(CTX_REG_IDX(VGT_MULTI_PRIM_IB_RESET_ENA));
//VGT_MULTI_PRIM_IB_RESET_ENA
ib_wr(0);
}
//program the spi for the vertex shader: user sgprs, code address, register
//counts, and the position/parameter export formats
static void ctx_spi_sh_vs(uint32_t **ib,struct params_3d *params_3d)
{
//setup specific for the vertex shader
//Tell the spi to pre-load the buffer descriptors in user sgprs
//(buf_res_descs must have been patched with the real gpu addresses first)
ib_wr(PKT3(PKT3_SET_SH_REG,9));
ib_wr(SH_REG_IDX(SPI_SH_USER_DATA_VS_0));
//SPI_SH_USER_DATA_VS_0
ib_wr(buf_res_descs[0]);
//SPI_SH_USER_DATA_VS_1
ib_wr(buf_res_descs[1]);
//SPI_SH_USER_DATA_VS_2
ib_wr(buf_res_descs[2]);
//SPI_SH_USER_DATA_VS_3
ib_wr(buf_res_descs[3]);
//SPI_SH_USER_DATA_VS_4
ib_wr(buf_res_descs[4]);
//SPI_SH_USER_DATA_VS_5
ib_wr(buf_res_descs[5]);
//SPI_SH_USER_DATA_VS_6
ib_wr(buf_res_descs[6]);
//SPI_SH_USER_DATA_VS_7
ib_wr(buf_res_descs[7]);
ib_wr(PKT3(PKT3_SET_SH_REG,5));
ib_wr(SH_REG_IDX(SPI_SH_PGM_LO_VS));
//SPI_SH_PGM_LO_VS: shader code address in 256-byte units
ib_wr(lower_32_bits(params_3d->vs_gpu_addr>>8));
//SPI_SH_PGM_HI_VS
ib_wr(set(SSPHV_MEM_BASE,upper_32_bits(params_3d->vs_gpu_addr>>8)));
//SPI_SH_PGM_RSRC_VS_0: the vgprs are allocated using units of 4 vgprs,
//sgprs using units of 8 sgprs. Don't forget to book 2 additional
//sgprs for vcc. Both counts are minus one unit.
ib_wr(set(SSPRV_VGPRS,((vs_vgprs_n-1)/4))
| set(SSPRV_SGPRS,((vs_sgprs_n+2)-1)/8));
//SPI_SH_PGM_RSRC_VS_1: tell the spi the count of sgprs which are not vcc.
ib_wr(set(SSPRV_USER_SGPR,vs_user_sgprs_n));
//our vertex shader exports only the color as parameter
ib_wr(PKT3(PKT3_SET_CTX_REG,2));
ib_wr(CTX_REG_IDX(SPI_VS_OUT_CFG));
//SPI_VS_OUT_CFG: count is minus one
ib_wr(set(SVOC_VS_PARAM_EXPORT_COUNT,vs_exported_params_n-1));
//The spi needs to be told what packing format is used by the vertex
//shader to export the position.
ib_wr(PKT3(PKT3_SET_CTX_REG,2));
ib_wr(CTX_REG_IDX(SPI_SH_POS_FMT));
//SPI_SH_POS_FMT
ib_wr(set(SSPF_POS_0_EXPORT_FMT,SSPF_4COMP));
}
//program the spi for the pixel/fragment shader: code address, register
//counts, input interpolation and the color/z export formats
static void ctx_spi_sh_ps(uint32_t **ib,struct params_3d *params_3d)
{
//setup specific for the pixel/fragment shader
ib_wr(PKT3(PKT3_SET_SH_REG,5));
ib_wr(SH_REG_IDX(SPI_SH_PGM_LO_PS));
//SPI_SH_PGM_LO_PS: shader code address in 256-byte units
ib_wr(lower_32_bits(params_3d->ps_gpu_addr>>8));
//SPI_SH_PGM_HI_PS
ib_wr(set(SSPHP_MEM_BASE,upper_32_bits(params_3d->ps_gpu_addr>>8)));
//SPI_SH_PGM_RSRC_PS_0: we must account 1 additional sgpr for m0 since
//which will be loaded in the sgpr right after the last user sgpr.
ib_wr(set(SSPRP_VGPRS,((ps_vgprs_n-1)/4))
| set(SSPRP_SGPRS,((ps_sgprs_n+1+2)-1)/8));
//SPI_SH_PGM_RSRC_PS_1: same constraints than the vertex shaders
//plus the fact the spi will load the m0 in the first sgpr after the
//last user loaded sgpr, namely sgpr6 in this case.
//NOTE(review): this writes ps_sgprs_n where the field is named USER_SGPR;
//ps_user_sgprs_n looks intended — both are 0 here so the value is the
//same, but confirm before changing either count.
ib_wr(set(SSPRP_USER_SGPR,ps_sgprs_n));
//tell the spi the pixel/fragment shader will need perspective center
//interpolation data in input (mandatory or gpu hang)
ib_wr(PKT3(PKT3_SET_CTX_REG,3));
ib_wr(CTX_REG_IDX(SPI_PS_INPUT_ENA));
//SPI_PS_INPUT_ENA
ib_wr(SPIE_PERSP_CENTER_ENA);
//SPI_PS_INPUT_ADDR
ib_wr(SPIA_PERSP_CENTER_ENA);
ib_wr(PKT3(PKT3_SET_CTX_REG,2));
ib_wr(CTX_REG_IDX(SPI_PS_IN_CTL));
//SPI_PS_IN_CTL: 1 parameter to interpolate. Must have at least one
ib_wr(set(SPIC_INTERP_N,1));
//don't care about z depth export
ib_wr(PKT3(PKT3_SET_CTX_REG,2));
ib_wr(CTX_REG_IDX(SPI_SH_Z_FMT));
//SPI_SH_Z_FMT
ib_wr(set(SSZF_Z_EXPORT_FMT,SSZF_ZERO));
//only 1 input param on 32, then only SPI_PS_INPUT_CTL_00
ib_wr(PKT3(PKT3_SET_CTX_REG,2));
ib_wr(CTX_REG_IDX(SPI_PS_INPUT_CTL_00));
//SPI_PS_INPUT_CTL_00
ib_wr(0);
//The spi sends the pixel color exported by a pixel/fragment shader to
//a cb, it needs to be told about the special color packing format the
//shader used.
ib_wr(PKT3(PKT3_SET_CTX_REG,2));
ib_wr(CTX_REG_IDX(SPI_SH_COLOR_FMT));
//SPI_SH_COLOR_FMT: matches the fp16 packing done by the ps code above
ib_wr(set(SSCF_COLOR_0_EXPORT_FMT,SSCF_FP16_ABGR));
}
//SH (SHader block): program both shader stages, vertex first then pixel
static void ctx_spi_sh(uint32_t **ib,struct params_3d *params_3d)
{
//SH (SHader block)
ctx_spi_sh_vs(ib,params_3d);
ctx_spi_sh_ps(ib,params_3d);
}
//SPI (Shader Processor Interpolator): generic spi setup, then the per-stage
//shader programming
static void ctx_spi(uint32_t **ib,struct params_3d *params_3d)
{
//SPI (Shader Processor Interpolator)
//disable the point primitive sprite
ib_wr(PKT3(PKT3_SET_CTX_REG,2));
ib_wr(CTX_REG_IDX(SPI_INTERPOL_CTL_0));
//SPI_INTERPOL_CTL_0
ib_wr(0);
ib_wr(PKT3(PKT3_SET_CTX_REG,2));
ib_wr(CTX_REG_IDX(SPI_BARYC_CTL));
//SPI_BARYC_CTL: want 0 in working sample
ib_wr(0);
ctx_spi_sh(ib,params_3d);
}
//program the PA SU (Setup Unit): pixel centering, dummy point/line sizes,
//and triangle fill mode
static void ctx_pa_su(uint32_t **ib)
{
//PA (Primitive Assembler) SU (Setup Unit)
ib_wr(PKT3(PKT3_SET_CTX_REG,2));
ib_wr(CTX_REG_IDX(PA_SU_VTX_CTL));
//PA_SU_VTX_CTL: tells the PA (Primitive Assembler) SU (Setup Unit)
//to place the(?) pixel at the center of the vertex?
ib_wr(PSVC_PIX_CENTER);
//setup for the PA (Primitive Assembler) SU (Setup Unit) for the
//point/line primitive rendering: we do not render point
//or line primitives.
//Set it to 8 like in working samples
ib_wr(PKT3(PKT3_SET_CTX_REG,4));
ib_wr(CTX_REG_IDX(PA_SU_POINT_SZ));
//PA_SU_POINT_SZ
ib_wr(set(PSPS_H,8)|set(PSPS_W,8));
//PA_SU_POINT_MINMAX
ib_wr(set(PSPM_MIN,8)|set(PSPM_MAX,8));
//PA_SU_LINE_CTL
ib_wr(set(PSLC_W,8));
ib_wr(PKT3(PKT3_SET_CTX_REG,2));
ib_wr(CTX_REG_IDX(PA_SU_POLY_OF_CLAMP));
//PA_SU_POLY_OF_CLAMP: tell the PA (Primitive Assembler) SU
//(Setup Unit) for polygon not to clamp something ?
ib_wr(0);
//related to the SC (Scan Converter)
ib_wr(PKT3(PKT3_SET_CTX_REG,2));
ib_wr(CTX_REG_IDX(PA_SU_SC_MODE_CTL));
//PA_SU_SC_MODE_CTL: removed FACE to follow working samples
//(no culling enabled, both faces drawn filled)
ib_wr(set(PSSMC_POLY_MODE_FRONT_PTYPE,PSSMC_DRAW_TRIANGLES)
| set(PSSMC_POLY_MODE_BACK_PTYPE,PSSMC_DRAW_TRIANGLES)
| PSSMC_PROVOKING_VTX_LAST);
ib_wr(PKT3(PKT3_SET_CTX_REG,2));
ib_wr(CTX_REG_IDX(PA_SU_PRIM_FILTER_CTL));
//PA_SU_PRIM_FILTER_CTL
ib_wr(0);
}
//program the PA CL (CLipper): disable the guard band, default nan/inf
//handling, and the ucp/clip mode
static void ctx_pa_cl(uint32_t **ib)
{
//PA (Primitive Assembler) CL (CLipper)
ib_wr(PKT3(PKT3_SET_CTX_REG,5));
ib_wr(CTX_REG_IDX(PA_CL_GB_VERT_CLIP_ADJ));
//disable GB (Guard Band) by setting those registers to 1.0f
//PA_CL_GB_VERT_CLIP_ADJ
ib_wr(f2u(1.0f));
//PA_CL_GB_VERT_DISC_ADJ
ib_wr(f2u(1.0f));
//PA_CL_GB_HORZ_CLIP_ADJ
ib_wr(f2u(1.0f));
//PA_CL_GB_HORZ_DISC_ADJ
ib_wr(f2u(1.0f));
//define the way the PA (Primitive Assembler) CL (CLipper) will
//behave regarding NAN (Not A Number) and INF (INFinity) values
ib_wr(PKT3(PKT3_SET_CTX_REG,2));
ib_wr(CTX_REG_IDX(PA_CL_NANINF_CTL));
//PA_CL_NANINF_CTL: to hardware default behaviour
ib_wr(0);
//no clipping done on the input from the vertex shader
ib_wr(PKT3(PKT3_SET_CTX_REG,2));
ib_wr(CTX_REG_IDX(PA_CL_VS_OUT_CTL));
//PA_CL_VS_OUT_CTL
ib_wr(0);
ib_wr(PKT3(PKT3_SET_CTX_REG,2));
ib_wr(CTX_REG_IDX(PA_CL_CLIP_CTL));
//PA_CL_CLIP_CTL: ucp mode 3=always expand and clip as trifan
ib_wr(set(PCCC_PS_UCP_MODE,3) | PCCC_DX_LINEAR_ATTR_CLIP_ENA);
}
//program viewport 0's transform engine: maps clip-space [-1,1] to the
//[0,w]x[0,h] pixel rectangle (scale=half size, offset=half size), z to [0,1]
static void ctx_pa_sc_vport_0_te(uint32_t **ib, struct params_3d *params_3d)
{
//PA (Primitive Assembler) SC (Scan Converter) VPORT (ViewPORT) 0 TE
//(Transform Engine)
ib_wr(PKT3(PKT3_SET_CTX_REG,7));
ib_wr(CTX_REG_IDX(PA_SC_VPORT_0_TE_X_SCALE));
//PA_SC_VPORT_0_TE_X_SCALE
ib_wr(f2u(params_3d->w/2.0f));
//PA_SC_VPORT_0_TE_X_OF
ib_wr(f2u(params_3d->w / 2.0f));
//PA_SC_VPORT_0_TE_Y_SCALE
ib_wr(f2u(params_3d->h / 2.0f));
//PA_SC_VPORT_0_TE_Y_OF
ib_wr(f2u(params_3d->h / 2.0f));
//PA_SC_VPORT_0_TE_Z_SCALE: stick to working sample values
ib_wr(f2u(0.5f));
//PA_SC_VPORT_0_TE_Z_OF: stick to working sample values
ib_wr(f2u(0.5f));
ib_wr(PKT3(PKT3_SET_CTX_REG,3));
ib_wr(CTX_REG_IDX(PA_SC_VPORT_0_TE_ZMIN));
//PA_SC_VPORT_0_TE_ZMIN: min Z value from VPORT TE
ib_wr(f2u(0.0f));
//PA_SC_VPORT_0_TE_ZMAX: max Z value from VPORT TE
ib_wr(f2u(1.0f));
}
//program viewport 0: full-framebuffer scissor, then its transform engine
static void ctx_pa_sc_vport_0(uint32_t **ib, struct params_3d *params_3d)
{
//PA (Primitive Assembler) SC (Scan Converter) VPORT (ViewPORT) 0
ib_wr(PKT3(PKT3_SET_CTX_REG,3));
ib_wr(CTX_REG_IDX(PA_SC_VPORT_0_SCISSOR_TL));
//PA_SC_VPORT_0_SCISSOR_TL: top-left corner (0,0)
ib_wr(set(PSVST_X,0)|set(PSVST_Y,0));
//PA_SC_VPORT_0_SCISSOR_BR: bottom-right corner (w,h)
ib_wr(set(PSVSB_X,params_3d->w)|set(PSVSB_Y,params_3d->h));
ctx_pa_sc_vport_0_te(ib,params_3d);
}
//enable all viewport transform-engine scales/offsets for x, y and z
static void ctx_pa_sc_vports_te(uint32_t **ib)
{
//PA (Primitive Assembler) SC (Scan Converter) VPORT (ViewPORT) TE (Transform
//Engine)
ib_wr(PKT3(PKT3_SET_CTX_REG,2));
ib_wr(CTX_REG_IDX(PA_SC_VPORT_TE_CTL));
//PA_SC_VPORT_TE_CTL: no so called perspective division
ib_wr(PSVTC_VPORT_X_SCALE_ENA|PSVTC_VPORT_X_OF_ENA|PSVTC_VPORT_Y_SCALE_ENA
|PSVTC_VPORT_Y_OF_ENA|PSVTC_VPORT_Z_SCALE_ENA|PSVTC_VPORT_Z_OF_ENA
|PSVTC_VTX_W0_FMT);
}
//program all the viewport state we use: viewport 0 plus the shared
//transform-engine control
static void ctx_pa_sc_vports(uint32_t **ib, struct params_3d *params_3d)
{
//PA (Primitive Assembler) SC (Scan Converter) VPORT (ViewPORT)
ctx_pa_sc_vport_0(ib,params_3d);
ctx_pa_sc_vports_te(ib);
}
//program the PA SC (Scan Converter): edge rule, AA off, clip rectangles and
//every scissor opened to the full w x h framebuffer, then the viewports
static void ctx_pa_sc(uint32_t **ib, struct params_3d *params_3d)
{
//PA (Primitive Assembler) SC (Scan Converter)
ib_wr(PKT3(PKT3_SET_CTX_REG,3));
ib_wr(CTX_REG_IDX(PA_SC_MODE_CTL_0));
//PA_SC_MODE_CTL_0
ib_wr(0);
//PA_SC_MODE_CTL_1
ib_wr(0);
//defines how to render the edge of primitives
ib_wr(PKT3(PKT3_SET_CTX_REG,2));
ib_wr(CTX_REG_IDX(PA_SC_EDGERULE));
//PA_SC_EDGERULE
ib_wr(0xaaaaaaaa);
//----------------------------------------------------------------------------
//Anti-Aliasing... probably
ib_wr(PKT3(PKT3_SET_CTX_REG,2));
ib_wr(CTX_REG_IDX(PA_SC_AA_CFG));
//PA_SC_AA_CFG: no multisampling
ib_wr(0);
//do something AA related
ib_wr(PKT3(PKT3_SET_CTX_REG,3));
ib_wr(CTX_REG_IDX(PA_SC_AA_MASK_X0Y0_X1Y0));
//PA_SC_AA_MASK_X0Y0_X1Y0
ib_wr(0xffffffff);
//PA_SC_AA_MASK_X0Y1_X1Y1
ib_wr(0xffffffff);
//----------------------------------------------------------------------------
ib_wr(PKT3(PKT3_SET_CTX_REG,10));
ib_wr(CTX_REG_IDX(PA_SC_CLIPRECT_RULE));
//PA_SC_CLIPRECT_RULE: no scissor required then clip rule is 0xffff (no specs
//provided)
ib_wr(set(PSCR_CLIP_RULE,0xffff));
//PA_SC_CLIPRECT_0_TL
ib_wr(0);
//PA_SC_CLIPRECT_0_BR
ib_wr(set(PSCB_X,params_3d->w)|set(PSCB_Y,params_3d->h));
//PA_SC_CLIPRECT_1_TL
ib_wr(0);
//PA_SC_CLIPRECT_1_BR
ib_wr(set(PSCB_X,params_3d->w)|set(PSCB_Y,params_3d->h));
//PA_SC_CLIPRECT_2_TL
ib_wr(0);
//PA_SC_CLIPRECT_2_BR
ib_wr(set(PSCB_X,params_3d->w)|set(PSCB_Y,params_3d->h));
//PA_SC_CLIPRECT_3_TL
ib_wr(0);
//PA_SC_CLIPRECT_3_BR
ib_wr(set(PSCB_X,params_3d->w)|set(PSCB_Y,params_3d->h));
//----------------------------------------------------------------------------
//Tells the SC (Scan Converter/rasteriser) we don't use line stipple since we
//do not render line primitives. XXX: ORed register? Because if not will set
//all bits to 0! We only want to set to 0 LINE_STIPPLE_ENA.
ib_wr(PKT3(PKT3_SET_CTX_REG,2));
ib_wr(CTX_REG_IDX(PA_SC_LINE_STIPPLE));
//PA_SC_LINE_STIPPLE
ib_wr(0);
//Even if we are not rendering line primitives, tells the PA (Primitive
//Assembler) SC (scan converter/rasteriser) to do "something" with the last
//pixel
ib_wr(PKT3(PKT3_SET_CTX_REG,2));
ib_wr(CTX_REG_IDX(PA_SC_LINE_CTL));
//PA_SC_LINE_CTL
ib_wr(PSLC_LAST_PIXEL);
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
//set the value of the scissors
ib_wr(PKT3(PKT3_SET_CTX_REG,3));
ib_wr(CTX_REG_IDX(PA_SC_GENERIC_SCISSOR_TL));
//PA_SC_GENERIC_SCISSOR_TL
ib_wr(set(PSGST_X,0)|set(PSGST_Y,0));
//PA_SC_GENERIC_SCISSOR_BR
ib_wr(set(PSGSB_X,params_3d->w)|set(PSGSB_Y,params_3d->h));
ib_wr(PKT3(PKT3_SET_CTX_REG,3));
ib_wr(CTX_REG_IDX(PA_SC_SCREEN_SCISSOR_TL));
//PA_SC_SCREEN_SCISSOR_TL
ib_wr(set(PSSST_X,0)|set(PSSST_Y,0));
//PA_SC_SCREEN_SCISSOR_BR
ib_wr(set(PSSSB_X,params_3d->w)|set(PSSSB_Y,params_3d->h));
ib_wr(PKT3(PKT3_SET_CTX_REG,4));
ib_wr(CTX_REG_IDX(PA_SC_WND_OF));
//PA_SC_WND_OF: the window offset in the screen which can be used by many
//scissors.
ib_wr(0);
//PA_SC_WND_SCISSOR_TL
ib_wr(set(PSWST_X,0)|set(PSWST_Y,0));
//PA_SC_WND_SCISSOR_BR
ib_wr(set(PSWSB_X,params_3d->w)|set(PSWSB_Y,params_3d->h));
//----------------------------------------------------------------------------
ctx_pa_sc_vports(ib,params_3d);
}
//program the whole PA (Primitive Assembler): setup unit, clipper, then
//scan converter
static void ctx_pa(uint32_t **ib,struct params_3d *params_3d)
{
//PA (Primitive Assembler)
ctx_pa_su(ib);
ctx_pa_cl(ib);
ctx_pa_sc(ib,params_3d);
}
//program the DBs (Depth Blocks): depth and stencil fully disabled, with
//sane values in the remaining registers
static void ctx_dbs(uint32_t **ib)
{
//DBs (Depth Blocks)
//disable the depth stencil/z-buffer
ib_wr(PKT3(PKT3_SET_CTX_REG,3));
ib_wr(CTX_REG_IDX(DB_Z_INFO));
//DB_Z_INFO
ib_wr(0);
//DB_STENCIL_INFO
ib_wr(0);
//even if disabled, setup some clean values in a few regs
ib_wr(PKT3(PKT3_SET_CTX_REG,2));
ib_wr(CTX_REG_IDX(DB_DEPTH_CTL));
//DB_DEPTH_CTL
ib_wr(0);
ib_wr(PKT3(PKT3_SET_CTX_REG,5));
ib_wr(CTX_REG_IDX(DB_DEPTH_BOUNDS_MIN));
//DB_DEPTH_BOUNDS_MIN
ib_wr(0);
//DB_DEPTH_BOUNDS_MAX
ib_wr(0);
//DB_STENCIL_CLR
ib_wr(0);
//DB_DEPTH_CLR
ib_wr(f2u(1.0f));
ib_wr(PKT3(PKT3_SET_CTX_REG,2));
ib_wr(CTX_REG_IDX(DB_RENDER_CTL));
//DB_RENDER_CTL
ib_wr(0);
ib_wr(PKT3(PKT3_SET_CTX_REG,2));
ib_wr(CTX_REG_IDX(DB_RENDER_OVERRIDE_0));
//DB_RENDER_OVERRIDE_0: force hiz/his off
ib_wr(set(DRO_FORCE_HIZ_ENA,DRO_FORCE_DIS)
|set(DRO_FORCE_HIS_ENA_0,DRO_FORCE_DIS)
|set(DRO_FORCE_HIS_ENA_1,
DRO_FORCE_DIS));
ib_wr(PKT3(PKT3_SET_CTX_REG,2));
ib_wr(CTX_REG_IDX(DB_STENCIL_CTL));
//DB_STENCIL_CTL
ib_wr(0);
ib_wr(PKT3(PKT3_SET_CTX_REG,4));
ib_wr(CTX_REG_IDX(DB_SRESULTS_CMP_STATE_0));
//DB_SRESULTS_CMP_STATE_0
ib_wr(0);
//DB_SRESULTS_CMP_STATE_1
ib_wr(0);
//DB_PRELOAD_CTL
ib_wr(0);
ib_wr(PKT3(PKT3_SET_CTX_REG,2));
ib_wr(CTX_REG_IDX(DB_ALPHA_TO_MASK));
//DB_ALPHA_TO_MASK
ib_wr(set(DATM_ALPHA_TO_MASK_OF_0,2)|set(DATM_ALPHA_TO_MASK_OF_1,2)
|set(DATM_ALPHA_TO_MASK_OF_2,2)|set(DATM_ALPHA_TO_MASK_OF_3,2));
ib_wr(PKT3(PKT3_SET_CTX_REG,3));
ib_wr(CTX_REG_IDX(DB_STENCILREFMASK));
//DB_STENCILREFMASK
ib_wr(0);
//DB_STENCILREFMASK_BF
ib_wr(0);
ib_wr(PKT3(PKT3_SET_CTX_REG,2));
ib_wr(CTX_REG_IDX(DB_SH_CTL));
//DB_SH_CTL
ib_wr(set(DSC_Z_ORDER,DSC_EARLY_Z_THEN_LATE_Z));
}
//disable blending on all 8 color blocks (1 register index + 8 values = 9
//dwords in the packet)
static void ctx_cbs_blend(uint32_t **ib)
{
//blend blocks of CBs (Color Blocks)
ib_wr(PKT3(PKT3_SET_CTX_REG,9));
ib_wr(CTX_REG_IDX(CB_0_BLEND_CTL));
//CB_0_BLEND_CTL: disable blending
ib_wr(0);
//CB_1_BLEND_CTL: disable blending
ib_wr(0);
//CB_2_BLEND_CTL: disable blending
ib_wr(0);
//CB_3_BLEND_CTL: disable blending
ib_wr(0);
//CB_4_BLEND_CTL: disable blending
ib_wr(0);
//CB_5_BLEND_CTL: disable blending
ib_wr(0);
//CB_6_BLEND_CTL: disable blending
ib_wr(0);
//CB_7_BLEND_CTL: disable blending
ib_wr(0);
}
//program CB 0 (Color Block 0) to render into the framebuffer: base address,
//pitch/slice in 8x8 tiles, and the 8888 unorm pixel format
static void ctx_cb_0(uint32_t **ib,struct params_3d *params_3d)
{
//CB 0 (Color Block 0)
ib_wr(PKT3(PKT3_SET_CTX_REG,7));
ib_wr(CTX_REG_IDX(CB_0_COLOR_BASE));
//CB_0_COLOR_BASE: framebuffer address in 256-byte units
ib_wr(params_3d->fb_gpu_addr>>8);
//CB_0_COLOR_PITCH: a thin1 tile is 8x8 pixels
ib_wr(set(CCP_TILE_MAX,params_3d->w/8-1));
//CB_0_COLOR_SLICE: a thin1 tile is 8x8 pixels
ib_wr(set(CCS_TILE_MAX,params_3d->w*params_3d->h/64-1));
//CB_0_COLOR_VIEW: 0, or last tile index for an array of slices
ib_wr(0);
//CB_0_COLOR_INFO: for sRGB color space, in 8 bits little endian argb, the
//color component swap is ALT for the color components from the pixel/fragment
//shader and value must be clamped before and after blending to mrt range.
ib_wr(set(CCI_ENDIAN,CCI_ENDIAN_NONE)|set(CCI_FMT,CCI_COLOR_8_8_8_8)
|set(CCI_COMP_SWAP, CCI_SWAP_ALT)|set(CCI_NUMBER_TYPE,CCI_NUMBER_UNORM)
|CCI_BLEND_CLAMP);
//CB_0_COLOR_ATTRIB: see gpu/tiling.c
ib_wr(set(CCA_TILE_MODE_IDX,8));
}
//program the CBs (Color Blocks): blending off, CB 0 as render target, all
//rgba components enabled, normal (copy) raster op
static void ctx_cbs(uint32_t **ib,struct params_3d *params_3d)
{
//CBs (Color Blocks)
ctx_cbs_blend(ib);
ctx_cb_0(ib,params_3d);
//do enable all color components (RGBA) from the pixel/fragment shader to be
//used by the CB 0 and do enable CB 0 to output all computed color components
//to target (here our framebuffer)
ib_wr(PKT3(PKT3_SET_CTX_REG,3));
ib_wr(CTX_REG_IDX(CB_TGT_MASK));
//CB_TGT_MASK
ib_wr(set(CTM_TGT_0_ENA,CTM_TGT_RED|CTM_TGT_GREEN|CTM_TGT_BLUE
|CTM_TGT_ALPHA));
//CB_SH_MASK
ib_wr(set(CSM_OUTPUT_0_ENA,CSM_OUTPUT_RED|CSM_OUTPUT_GREEN|CSM_OUTPUT_BLUE
|CSM_OUTPUT_ALPHA));
ib_wr(PKT3(PKT3_SET_CTX_REG,2));
ib_wr(CTX_REG_IDX(CB_COLOR_CTL));
//CB_COLOR_CTL: switch normal mode for all CBs (rop3 0xcc = src copy)
ib_wr(set(CCC_MODE,CCC_CB_NORMAL)|set(CCC_ROP3,CCC_0XCC));
}
//program the whole GPU context: misc zero-init then each pipeline block
//front to back (vgt, spi, pa, db, cb)
static void ctx(uint32_t **ib,struct params_3d *params_3d)
{
ctx_misc_init(ib);
ctx_vgt(ib);
ctx_spi(ib,params_3d);
ctx_pa(ib,params_3d);
ctx_dbs(ib);
ctx_cbs(ib,params_3d);
}
//emit the draw packets: 16-bit index type, 1 instance, auto-generated
//indices (no index buffer)
static void draw(uint32_t **ib)
{
ib_wr(PKT3(PKT3_IDX_TYPE,1));
ib_wr(set(PKT3_SZ,PKT3_16BITS));
ib_wr(PKT3(PKT3_INST_N,1));
ib_wr(1);
ib_wr(PKT3(PKT3_DRAW_IDX_AUTO,2));
//3 indices to generate (the 4th vertex record is the "null" one)
ib_wr(VERTICES_N-1);
//VGT_DRAW_INITIATOR
ib_wr(set(VDI_SRC_SELECT,VDI_AUTO_IDX));
}
//pad the indirect buffer with PKT2 no-ops until its dword count is a
//multiple of the CP pre-fetch parser granularity
static void pfp_align(uint32_t **ib,uint32_t *ib_start)
{
while(((*ib-ib_start)&CP_RING_PFP_DW_MASK)!=0) ib_wr(PKT2);
}
//build the whole indirect buffer (command stream) at ib_start and return
//its size in dwords (padded to the pre-fetch granularity)
static uint64_t ib_3d(uint32_t *ib_start, struct params_3d *params_3d)
{
uint32_t *ib=ib_start;
prelude(&ib);
//============================================================================
//the real thing is here
cfg(&ib);
ctx(&ib,params_3d);
//============================================================================
draw(&ib);
pfp_align(&ib,ib_start);
return ib-ib_start;
}
//We prepare a big buffer with everything cpu side, then dma it to gpu vram, run
//it, and wait for a fence.
int main(int argc, char *argv[])
{
int r0=0;
int r1=0;
//----------------------------------------------------------------------------
//arguments
if(argc<4){
e("missing arguments");
r0=EXIT_FAILURE;
goto exit;
}
uint64_t fb_gpu_addr=strtoul(argv[1],NULL,16);
uint64_t w=strtoul(argv[2],NULL,10);
uint64_t h=strtoul(argv[3],NULL,10);
//----------------------------------------------------------------------------
o("drawing a triangle:fb=0x%016Lx,w=%Lu;h=%Lu",(ull)fb_gpu_addr,(ull)w,
(ull)h);
//----------------------------------------------------------------------------
errno=0;
int f=open("/dev/si0",O_RDWR);
if(f==-1){
e("open failed:%s",strerror(errno));
r0=EXIT_FAILURE;
goto exit;
}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
//pre-compute aligned offsets and the aligned size of our vram buffer
//vertex shader must be 256 bytes aligned (order 8), 0 since we will allocate
//a 256 bytes aligned buffer.
uint64_t vs_of=0;
//pixel/fragment shader must be 256 bytes aligned (order 8)
uint64_t ps_of=next_aligned_of(sizeof(vs),8);
//vertices are fetch by block of 4 vertices, dw aligned (order 2)
uint64_t vertices_of=next_aligned_of(ps_of+sizeof(ps),2);
//ib is aligned on prefetch size which is 16 dws (order 6)
uint64_t ib_of=next_aligned_of(vertices_of+sizeof(vertices),6);
//worst alignment is 256 bytes (order 8), then round up for allocation
uint64_t vram_buf_sz=next_aligned_of(ib_of+(IB_DWS_N_MAX<<2),8);
o("vs_of=0x%016llx ps_of=0x%016llx vertices_of=0x%016llx ib_of=0x%016llx vram_buf_sz=0x%016llx",
(ull)vs_of,(ull)ps_of,(ull)vertices_of,(ull)ib_of,(ull)vram_buf_sz);
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
o("allocating 256 bytes aligned vram buffer...");
struct si_mem mem;
mem.align=256;//worst alignment is vs and ps
mem.sz=vram_buf_sz;
errno=0;
ul req=_IOWR('d',SI_MEM_ALLOC,mem);
r1=ioctl(f,req,&mem);
if(r1==-1){
e("alloc vram buffer failed:%s",strerror(errno));
r0=EXIT_FAILURE;
goto exit;
}
o("vram_buf_gpu_addr=0x%016llx",(ull)mem.gpu_addr);
o("allocating 256 bytes aligned vram buffer done");
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
//now, we have enough info to init the parameters for the pipeline
struct params_3d params_3d;
params_3d.vs_gpu_addr=mem.gpu_addr+vs_of;
params_3d.ps_gpu_addr=mem.gpu_addr+ps_of;
params_3d.w=w;
params_3d.h=h;
params_3d.fb_gpu_addr=fb_gpu_addr;
o("params_3d:vs_gpu_addr=0x%016llx ps_gpu_addr=0x%016llx w=%llu h=%llu fb_gpu_addr=0x%016llx",
(ull)params_3d.vs_gpu_addr,(ull)params_3d.ps_gpu_addr,(ull)w,(ull)h,
(ull)params_3d.fb_gpu_addr);
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
o("mmaping an aperture...");
//get an aperture of the size of our vram buffer for dma
errno=0;
void *dma_buffer=mmap(NULL,vram_buf_sz,PROT_READ|PROT_WRITE,MAP_SHARED,f,0);
if(dma_buffer==MAP_FAILED){
e("unable to mmap an aperture buffer:%s",strerror(errno));
r0=EXIT_FAILURE;
goto free_vram_buf;
}
o("dma buffer=%p",dma_buffer);
o("mmaping an aperture done");
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
//configure buffer resources
uint64_t vtx_buf_gpu_addr=mem.gpu_addr+vertices_of;
o("buffer resources:vtx_buf_gpu=0x%016llx",(ull)vtx_buf_gpu_addr);
//vertex position buffer start address
buf_res_descs[0]=lower_32_bits(vtx_buf_gpu_addr);
buf_res_descs[1]|=upper_32_bits(vtx_buf_gpu_addr);
//vertex color buffer start address
buf_res_descs[4]=lower_32_bits(vtx_buf_gpu_addr+VERTICES_N*sizeof(float));
buf_res_descs[5]|=upper_32_bits(vtx_buf_gpu_addr+VERTICES_N*sizeof(float));
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
o("copying static data into dma buffer...");
cpy_htole32(dma_buffer+vs_of,&vs[0],sizeof(vs)>>2);
cpy_htole32(dma_buffer+ps_of,&ps[0],sizeof(ps)>>2);
cpy_htole32(dma_buffer+vertices_of,(uint32_t*)&vertices[0],
sizeof(vertices)>>2);
o("copying static data into dma buffer done");
//----------------------------------------------------------------------------
//============================================================================
//there, we program the 3D pipeline
uint64_t ib_dws_n=ib_3d(dma_buffer+ib_of,¶ms_3d);
o("ib_dws_n=0x%016llx(max=0x%016llx)",(ull)ib_dws_n,(ull)IB_DWS_N_MAX);
//============================================================================
//----------------------------------------------------------------------------
o("dma-ing the cpu buffer to vram buffer...");
struct si_dma dma;
struct si_dma_l2l *l2l=&dma.params.l2l;
struct si_timeouts_info *t_info=&dma.t_info;
dma.type=SI_DMA_TYPE_L2L;
dma.dir=SI_DMA_TO_DEVICE;
//we don't really care here lets put one seconde!
t_info->ring.n_max=1;
t_info->ring.us=1000000;
t_info->fence.n_max=1;
t_info->fence.us=1000000;
l2l->src_addr=(uint64_t)dma_buffer;
l2l->dst_addr=mem.gpu_addr;
l2l->sz=vram_buf_sz;
req=_IOW('d',SI_DMA,dma);
errno=0;
r1=ioctl(f,req,&dma);
switch(r1){
case -1:
e("dma l2l failed:%s",strerror(errno));
r0=EXIT_FAILURE;
goto free_vram_buf;
case SI_RING_TIMEOUT:
e("dma l2l failed:ring timeout");
r0=EXIT_FAILURE;
goto free_vram_buf;
case SI_FENCE_TIMEOUT:
e("dma l2l failed:fence timeout");
r0=EXIT_FAILURE;
goto free_vram_buf;
}
o("dma-ing the cpu buffer to vram buffer done");
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
o("running the ib...");
struct si_gpu_3d_ib gpu_3d_ib;
struct si_timeout_info *ring_t_info=&gpu_3d_ib.ring_t_info;
//we don't really care here lets put one seconde!
ring_t_info->n_max=1;
ring_t_info->us=1000000;
gpu_3d_ib.gpu_addr=mem.gpu_addr+ib_of;
gpu_3d_ib.dws_n=ib_dws_n;
req=_IOW('d',SI_GPU_3D_IB,gpu_3d_ib);
errno=0;
r1=ioctl(f,req,&gpu_3d_ib);
switch(r1){
case -1:
e("running the GPU_3D indirecting buffer failed:%s",strerror(errno));
r0=EXIT_FAILURE;
goto free_vram_buf;
case SI_RING_TIMEOUT:
e("running the GPU_3D indirecting buffer failed:ring timeout");
r0=EXIT_FAILURE;
goto free_vram_buf;
}
o("running the ib done");
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
o("fencing...");
struct si_gpu_3d_fence gpu_3d_fence;
t_info=&gpu_3d_fence.t_info;
//we don't really care here lets put one seconde!
t_info->ring.n_max=1;
t_info->ring.us=1000000;
t_info->fence.n_max=1;
t_info->fence.us=1000000;
req=_IOW('d',SI_GPU_3D_FENCE,gpu_3d_fence);
errno=0;
r1=ioctl(f,req,&gpu_3d_fence);
switch(r1){
case -1:
e("waiting for fence failed:%s",strerror(errno));
r0=EXIT_FAILURE;
break;
case SI_RING_TIMEOUT:
e("waiting for fence failed:ring timeout");
r0=EXIT_FAILURE;
break;
case SI_FENCE_TIMEOUT:
e("waiting for fence failed:fence timeout");
r0=EXIT_FAILURE;
break;
}
o("fencing done");
//----------------------------------------------------------------------------
free_vram_buf:
//----------------------------------------------------------------------------
o("freeing vram buffer...");
req=_IOW('d',SI_MEM_FREE,mem.gpu_addr);
errno=0;
r1=ioctl(f,req,&mem.gpu_addr);
if(r1==-1){
e("free vram buffer failed (LEAK!):%s",strerror(errno));
r0=EXIT_FAILURE;
}
o("freeing vram buffer done");
//----------------------------------------------------------------------------
exit:
return r0;
}