Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
157 changes: 136 additions & 21 deletions src/external/rlsw.h
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,19 @@
#endif
#endif

// Fast power-of-two texture wrap (SW_REPEAT mode only)
// When defined, textures whose width/height are powers of two use a bitmask
// wrap (`x & (size-1)`) instead of `floorf`-based fractional wrap or the
// signed `%` chain in the linear sampler. Saves a software divide on Xtensa
// and a few instructions everywhere. NPOT textures keep using the original
// path via a runtime `(size & (size-1)) == 0` check, so SW_REPEAT remains
// correct for them. The only observable behavior change is for POT textures
// sampled with negative UV coordinates: bitmask wrap (two's complement) can
// differ from `sw_fract` by one texel. Off by default to keep bit-for-bit
// behavior; opt in if you control your asset UVs.
//
// #define SW_TEXTURE_REPEAT_POT_FAST
Comment thread
raysan5 marked this conversation as resolved.

//----------------------------------------------------------------------------------
// OpenGL Compatibility Types
//----------------------------------------------------------------------------------
Expand Down Expand Up @@ -844,6 +857,17 @@ SWAPI void swGetFramebufferAttachmentParameteriv(SWattachment attachment, SWatta
#endif
#endif

// ESP-DSP acceleration: ESP-IDF ships an optimized math library that includes
// `dspm_mult_4x4x4_f32` (4x4 matrix multiply) and `dspm_mult_4x4x1_f32`
// (matrix * vector). These are S3-tuned hand-vectorized kernels that beat the
// scalar versions for both throughput and code-size. Detection is opt-in to
// keep the dependency optional: define SW_USE_ESP_DSP from your build system
// (or rely on the `idf_component.yml` example shown in the rlsw docs).
#if defined(ESP_PLATFORM) && defined(SW_USE_ESP_DSP)
#define SW_HAS_ESP_DSP
#include "dspm_mult.h"
#endif

#ifdef __cplusplus
#define SW_CURLY_INIT(name) name
#else
Expand Down Expand Up @@ -1038,6 +1062,9 @@ typedef struct {
SWmatrix currentMatrixMode; // Current matrix mode (e.g., sw_MODELVIEW, sw_PROJECTION)
sw_matrix_t *currentMatrix; // Pointer to the currently used matrix according to the mode
sw_matrix_t matMVP; // Model view projection matrix, calculated and used internally
#ifdef SW_HAS_ESP_DSP
float matMVP_rm[16]; // Row-major MVP, kept in sync for esp-dsp dspm_mult_4x4x1_f32 vertex transform
#endif
bool isDirtyMVP; // Indicates if the MVP matrix should be rebuilt

sw_handle_t boundFramebufferId; // Framebuffer currently bound
Expand Down Expand Up @@ -1141,6 +1168,14 @@ static inline void sw_matrix_id(sw_matrix_t dst)

static inline void sw_matrix_mul_rst(float *SW_RESTRICT dst, const float *SW_RESTRICT left, const float *SW_RESTRICT right)
{
#ifdef SW_HAS_ESP_DSP
// dspm_mult_4x4x4_f32 treats its operands as row-major. rlsw stores matrices
// column-major, so passing them flat is equivalent to passing transposes:
// dspm_mult(L^T, R^T) computes (L^T)*(R^T) = (R*L)^T, written back into a
// flat array gives the same bit pattern as the column-major product (R*L)
// -- exactly the semantic the scalar fallback below has.
dspm_mult_4x4x4_f32(left, right, dst);
#else
float l00 = left[0], l01 = left[1], l02 = left[2], l03 = left[3];
float l10 = left[4], l11 = left[5], l12 = left[6], l13 = left[7];
float l20 = left[8], l21 = left[9], l22 = left[10], l23 = left[11];
Expand All @@ -1165,6 +1200,7 @@ static inline void sw_matrix_mul_rst(float *SW_RESTRICT dst, const float *SW_RES
dst[7] = l10*right[3] + l11*right[7] + l12*right[11] + l13*right[15];
dst[11] = l20*right[3] + l21*right[7] + l22*right[11] + l23*right[15];
dst[15] = l30*right[3] + l31*right[7] + l32*right[11] + l33*right[15];
#endif
}

static inline void sw_matrix_mul(sw_matrix_t dst, const sw_matrix_t left, const sw_matrix_t right)
Expand Down Expand Up @@ -1210,6 +1246,33 @@ static inline float sw_fract(float x)
return (x - floorf(x));
}

// Fast reciprocal: 1-ULP accurate in ~7 instructions on Xtensa using the
// hardware `recip0.s` seed + two Newton-Raphson refinement steps. All work
// stays in FPU registers — no `__divsf3` software call. Hot-path divisions
// in the rasterizer (span/triangle setup, perspective divide, etc.) call
// this. On non-Xtensa targets it transparently expands to `1.0f / x`, so
// generated code is identical to before.
#if defined(__XTENSA__)
__attribute__((always_inline))
static inline float sw_rcp(float x)
{
float result, temp;
__asm__(
"recip0.s %0, %2\n"
"const.s %1, 1\n"
"msub.s %1, %2, %0\n"
"madd.s %0, %0, %1\n"
"const.s %1, 1\n"
"msub.s %1, %2, %0\n"
"maddn.s %0, %0, %1\n"
: "=&f"(result), "=&f"(temp) : "f"(x)
);
return result;
}
#else
static inline float sw_rcp(float x) { return 1.0f/x; }
#endif

static inline uint8_t sw_luminance8(const uint8_t *color)
{
return (uint8_t)((color[0]*77 + color[1]*150 + color[2]*29) >> 8);
Expand Down Expand Up @@ -2406,11 +2469,31 @@ static inline void sw_texture_free(sw_texture_t *texture)

static inline void sw_texture_sample_nearest(float *SW_RESTRICT color, const sw_texture_t *SW_RESTRICT tex, float u, float v)
{
u = (tex->sWrap == SW_REPEAT)? sw_fract(u) : sw_saturate(u);
v = (tex->tWrap == SW_REPEAT)? sw_fract(v) : sw_saturate(v);
int x, y;

#ifdef SW_TEXTURE_REPEAT_POT_FAST
if ((tex->sWrap == SW_REPEAT) && ((tex->width & tex->wMinus1) == 0))
Comment on lines +2474 to +2475
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's fine that when SW_TEXTURE_REPEAT_POT_FAST is defined, we restrict SW_REPEAT to POT textures only, this would remove all extra branches here.

This would require updating swTexParameteri so SW_REPEAT is applied only if the texture is POT, otherwise leave it unchanged and set RLSW.errCode = SW_INVALID_OPERATION.

We could also store a bool isPOT; in sw_texture_t determined during sw_texture_alloc.

We should also explicitly set default parameters in sw_texture_alloc, like SW_CLAMP for wrapping.

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I usually try to minimize to store information that can be calculated when needed but I understand that isPOT would be useful for every texture access. I forget that we are operating one level below OpenGL...

{
x = (int)(u*tex->width) & tex->wMinus1;
}
else
#endif
{
u = (tex->sWrap == SW_REPEAT)? sw_fract(u) : sw_saturate(u);
x = (int)(u*tex->width);
}

int x = u*tex->width;
int y = v*tex->height;
#ifdef SW_TEXTURE_REPEAT_POT_FAST
if ((tex->tWrap == SW_REPEAT) && ((tex->height & tex->hMinus1) == 0))
{
y = (int)(v*tex->height) & tex->hMinus1;
}
else
#endif
{
v = (tex->tWrap == SW_REPEAT)? sw_fract(v) : sw_saturate(v);
y = (int)(v*tex->height);
}

tex->readColor(color, tex->pixels, y*tex->width + x);
}
Expand All @@ -2432,13 +2515,19 @@ static inline void sw_texture_sample_linear(float *SW_RESTRICT color, const sw_t
int x1 = x0 + 1;
int y1 = y0 + 1;

// NOTE: If the textures are POT, avoid the division for SW_REPEAT

if (tex->sWrap == SW_CLAMP)
{
x0 = (x0 > tex->wMinus1)? tex->wMinus1 : x0;
x1 = (x1 > tex->wMinus1)? tex->wMinus1 : x1;
}
#ifdef SW_TEXTURE_REPEAT_POT_FAST
else if ((tex->width & tex->wMinus1) == 0)
{
// POT fast path: bitmask wrap covers negative ints via two's complement
x0 = x0 & tex->wMinus1;
x1 = x1 & tex->wMinus1;
}
#endif
else
{
x0 = (x0%tex->width + tex->width)%tex->width;
Expand All @@ -2450,6 +2539,13 @@ static inline void sw_texture_sample_linear(float *SW_RESTRICT color, const sw_t
y0 = (y0 > tex->hMinus1)? tex->hMinus1 : y0;
y1 = (y1 > tex->hMinus1)? tex->hMinus1 : y1;
}
#ifdef SW_TEXTURE_REPEAT_POT_FAST
else if ((tex->height & tex->hMinus1) == 0)
{
y0 = y0 & tex->hMinus1;
y1 = y1 & tex->hMinus1;
}
#endif
else
{
y0 = (y0%tex->height + tex->height)%tex->height;
Expand Down Expand Up @@ -3366,7 +3462,7 @@ static void sw_triangle_clip_and_project(void)

// Calculation of the reciprocal of W for normalization
// as well as perspective-correct attributes
const float wRcp = 1.0f/v->position[3];
const float wRcp = sw_rcp(v->position[3]);
Comment thread
raysan5 marked this conversation as resolved.

// Division of XYZ coordinates by weight
v->position[0] *= wRcp;
Expand Down Expand Up @@ -3481,7 +3577,7 @@ static void sw_quad_clip_and_project(void)

// Calculation of the reciprocal of W for normalization
// as well as perspective-correct attributes
const float wRcp = 1.0f/v->position[3];
const float wRcp = sw_rcp(v->position[3]);

// Division of XYZ coordinates by weight
v->position[0] *= wRcp;
Expand Down Expand Up @@ -3659,8 +3755,8 @@ static bool sw_line_clip_and_project(sw_vertex_t *v0, sw_vertex_t *v1)
if (!sw_line_clip(v0, v1)) return false;

// Convert clip coordinates to NDC
v0->position[3] = 1.0f/v0->position[3];
v1->position[3] = 1.0f/v1->position[3];
v0->position[3] = sw_rcp(v0->position[3]);
v1->position[3] = sw_rcp(v1->position[3]);
for (int i = 0; i < 3; i++)
{
v0->position[i] *= v0->position[3];
Expand Down Expand Up @@ -3709,7 +3805,7 @@ static bool sw_point_clip_and_project(sw_vertex_t *v)
if ((v->position[i] < -v->position[3]) || (v->position[i] > v->position[3])) return false;
}

v->position[3] = 1.0f/v->position[3];
v->position[3] = sw_rcp(v->position[3]);
v->position[0] *= v->position[3];
v->position[1] *= v->position[3];
v->position[2] *= v->position[3];
Expand Down Expand Up @@ -3791,6 +3887,19 @@ static void sw_immediate_begin(SWdraw mode)
RLSW.stackModelview[RLSW.stackModelviewCounter - 1],
RLSW.stackProjection[RLSW.stackProjectionCounter - 1]);

#ifdef SW_HAS_ESP_DSP
// Pre-transpose to row-major so dspm_mult_4x4x1_f32(matMVP_rm, v, out)
// computes M*v directly in the per-vertex hot path. 16 scalar copies
// per MVP update vs. saving ~20 cycles per vertex transform.
for (int i = 0; i < 4; i++)
{
for (int j = 0; j < 4; j++)
{
RLSW.matMVP_rm[4*i + j] = RLSW.matMVP[4*j + i];
}
}
#endif

RLSW.isDirtyMVP = false;
}

Expand Down Expand Up @@ -3842,11 +3951,17 @@ static void sw_immediate_push_vertex(const float position[4])
sw_vertex_t *vertex = &RLSW.primitive.buffer[RLSW.primitive.vertexCount++];

// Calculate clip coordinates
#ifdef SW_HAS_ESP_DSP
// dspm_mult_4x4x1_f32 declares its inputs non-const; rlsw treats them as
// read-only and the cast is safe (the kernel only loads from B).
dspm_mult_4x4x1_f32(RLSW.matMVP_rm, (float *)position, vertex->position);
#else
const float *m = RLSW.matMVP;
vertex->position[0] = m[0]*position[0] + m[4]*position[1] + m[8]*position[2] + m[12]*position[3];
vertex->position[1] = m[1]*position[0] + m[5]*position[1] + m[9]*position[2] + m[13]*position[3];
vertex->position[2] = m[2]*position[0] + m[6]*position[1] + m[10]*position[2] + m[14]*position[3];
vertex->position[3] = m[3]*position[0] + m[7]*position[1] + m[11]*position[2] + m[15]*position[3];
#endif

// Copy the attributes in the current vertex
for (int i = 0; i < 4; i++) vertex->color[i] = RLSW.primitive.color[i];
Expand Down Expand Up @@ -5272,7 +5387,7 @@ static void SW_RASTER_TRIANGLE_SPAN(const sw_vertex_t *start, const sw_vertex_t
if (xStart == xEnd) return;

// Compute the inverse horizontal distance along the X axis
float dxRcp = 1.0f/(end->position[0] - start->position[0]);
float dxRcp = sw_rcp(end->position[0] - start->position[0]);

// Compute the interpolation steps along the X axis
float dWdx = (end->position[3] - start->position[3])*dxRcp;
Expand Down Expand Up @@ -5326,12 +5441,12 @@ static void SW_RASTER_TRIANGLE_SPAN(const sw_vertex_t *start, const sw_vertex_t
int blockEnd = x + SW_AFFINE_BLOCK;
if (blockEnd > xEnd) blockEnd = xEnd;
float blockLenF = (float)(blockEnd - x);
float blockLenRcp = 1.0f/blockLenF;
float blockLenRcp = sw_rcp(blockLenF);

// Only 2 '1/w' here; none inside the pixel loop
float wRcpA = 1.0f/w;
float wRcpA = sw_rcp(w);
float wB = w + dWdx*blockLenF;
float wRcpB = 1.0f/wB;
float wRcpB = sw_rcp(wB);

// Perspective-correct color at both block endpoints, then affine gradient
float srcColor[4] = {
Expand Down Expand Up @@ -5459,9 +5574,9 @@ static void SW_RASTER_TRIANGLE(const sw_vertex_t *v0, const sw_vertex_t *v1, con
if (h02 < 1e-6f) return;

// Inverse edge dy for per-edge dV/dy (scanline interpolation)
float h02Rcp = 1.0f/h02;
float h01Rcp = (h01 > 1e-6f)? 1.0f/h01 : 0.0f;
float h12Rcp = (h12 > 1e-6f)? 1.0f/h12 : 0.0f;
float h02Rcp = sw_rcp(h02);
float h01Rcp = (h01 > 1e-6f)? sw_rcp(h01) : 0.0f;
float h12Rcp = (h12 > 1e-6f)? sw_rcp(h12) : 0.0f;

// Compute gradients for each side of the triangle
sw_vertex_t dVXdy02, dVXdy01, dVXdy12;
Expand Down Expand Up @@ -5560,8 +5675,8 @@ static void SW_RASTER_QUAD(const sw_vertex_t *a, const sw_vertex_t *b,
float h = (float)(yMax - yMin);
if ((w <= 0) || (h <= 0)) return;

float wRcp = 1.0f/w;
float hRcp = 1.0f/h;
float wRcp = sw_rcp(w);
float hRcp = sw_rcp(h);

// Subpixel corrections
float xSubstep = 1.0f - sw_fract(tl->position[0]);
Expand Down Expand Up @@ -5746,7 +5861,7 @@ static void SW_RASTER_LINE(const sw_vertex_t *v0, const sw_vertex_t *v1)
// Compute per pixel increments
float xInc = dx/steps;
float yInc = dy/steps;
float stepRcp = 1.0f/steps;
float stepRcp = sw_rcp(steps);
#ifdef SW_ENABLE_DEPTH_TEST
float zInc = (v1->position[2] - v0->position[2])*stepRcp;
#endif
Expand Down