Drop custom SSE code from WindowQuads::makeInterleavedArrays

Both GCC and Clang do auto vecrization these days and testing shows that that actually produces faster code, so dropping the SSE stuff makes things both simpler and faster.
2022-07-05 11:54:36 +02:00 · 2022-07-05 11:54:36 +02:00 · a95b556868
commit a95b556868
parent 0c8d87f9d4
1 changed files with 25 additions and 94 deletions
--- a/src/libkwineffects/kwineffects.cpp
+++ b/src/libkwineffects/kwineffects.cpp
@ -25,10 +25,6 @@
 #include <kconfiggroup.h>
 #include <ksharedconfig.h>

-#if defined(__SSE2__)
-#include <emmintrin.h>
-#endif
-
 #include <optional>

 namespace KWin
@ -1055,106 +1051,41 @@ void WindowQuadList::makeInterleavedArrays(unsigned int type, GLVertex2D *vertic
    Q_ASSERT(type == GL_QUADS || type == GL_TRIANGLES);

    switch (type) {
-    case GL_QUADS:
-#if defined(__SSE2__)
-        if (!(intptr_t(vertex) & 0xf)) {
-            for (const WindowQuad &quad : *this) {
-                alignas(16) GLVertex2D v[4];
+    case GL_QUADS: {
+        for (const WindowQuad &quad : *this) {
+            for (int j = 0; j < 4; j++) {
+                const WindowVertex &wv = quad[j];

-                for (int j = 0; j < 4; j++) {
-                    const WindowVertex &wv = quad[j];
+                GLVertex2D v;
+                v.position = QVector2D(wv.x(), wv.y());
+                v.texcoord = QVector2D(wv.u(), wv.v()) * coeff + offset;

-                    v[j].position = QVector2D(wv.x(), wv.y());
-                    v[j].texcoord = QVector2D(wv.u(), wv.v()) * coeff + offset;
-                }
-
-                const __m128i *srcP = reinterpret_cast<const __m128i *>(&v);
-                __m128i *dstP = reinterpret_cast<__m128i *>(vertex);
-
-                _mm_stream_si128(&dstP[0], _mm_load_si128(&srcP[0])); // Top-left
-                _mm_stream_si128(&dstP[1], _mm_load_si128(&srcP[1])); // Top-right
-                _mm_stream_si128(&dstP[2], _mm_load_si128(&srcP[2])); // Bottom-right
-                _mm_stream_si128(&dstP[3], _mm_load_si128(&srcP[3])); // Bottom-left
-
-                vertex += 4;
-            }
-        } else
-#endif // __SSE2__
-        {
-            for (const WindowQuad &quad : *this) {
-                for (int j = 0; j < 4; j++) {
-                    const WindowVertex &wv = quad[j];
-
-                    GLVertex2D v;
-                    v.position = QVector2D(wv.x(), wv.y());
-                    v.texcoord = QVector2D(wv.u(), wv.v()) * coeff + offset;
-
-                    *(vertex++) = v;
-                }
+                *(vertex++) = v;
            }
        }
-        break;
+    } break;
+    case GL_TRIANGLES: {
+        for (const WindowQuad &quad : *this) {
+            GLVertex2D v[4]; // Four unique vertices / quad

-    case GL_TRIANGLES:
-#if defined(__SSE2__)
-        if (!(intptr_t(vertex) & 0xf)) {
-            for (const WindowQuad &quad : *this) {
-                alignas(16) GLVertex2D v[4];
+            for (int j = 0; j < 4; j++) {
+                const WindowVertex &wv = quad[j];

-                for (int j = 0; j < 4; j++) {
-                    const WindowVertex &wv = quad[j];
-
-                    v[j].position = QVector2D(wv.x(), wv.y());
-                    v[j].texcoord = QVector2D(wv.u(), wv.v()) * coeff + offset;
-                }
-
-                const __m128i *srcP = reinterpret_cast<const __m128i *>(&v);
-                __m128i *dstP = reinterpret_cast<__m128i *>(vertex);
-
-                __m128i src[4];
-                src[0] = _mm_load_si128(&srcP[0]); // Top-left
-                src[1] = _mm_load_si128(&srcP[1]); // Top-right
-                src[2] = _mm_load_si128(&srcP[2]); // Bottom-right
-                src[3] = _mm_load_si128(&srcP[3]); // Bottom-left
-
-                // First triangle
-                _mm_stream_si128(&dstP[0], src[1]); // Top-right
-                _mm_stream_si128(&dstP[1], src[0]); // Top-left
-                _mm_stream_si128(&dstP[2], src[3]); // Bottom-left
-
-                // Second triangle
-                _mm_stream_si128(&dstP[3], src[3]); // Bottom-left
-                _mm_stream_si128(&dstP[4], src[2]); // Bottom-right
-                _mm_stream_si128(&dstP[5], src[1]); // Top-right
-
-                vertex += 6;
+                v[j].position = QVector2D(wv.x(), wv.y());
+                v[j].texcoord = QVector2D(wv.u(), wv.v()) * coeff + offset;
            }
-        } else
-#endif // __SSE2__
-        {
-            for (const WindowQuad &quad : *this) {
-                GLVertex2D v[4]; // Four unique vertices / quad

-                for (int j = 0; j < 4; j++) {
-                    const WindowVertex &wv = quad[j];
+            // First triangle
+            *(vertex++) = v[1]; // Top-right
+            *(vertex++) = v[0]; // Top-left
+            *(vertex++) = v[3]; // Bottom-left

-                    v[j].position = QVector2D(wv.x(), wv.y());
-                    v[j].texcoord = QVector2D(wv.u(), wv.v()) * coeff + offset;
-                }
-
-                // First triangle
-                *(vertex++) = v[1]; // Top-right
-                *(vertex++) = v[0]; // Top-left
-                *(vertex++) = v[3]; // Bottom-left
-
-                // Second triangle
-                *(vertex++) = v[3]; // Bottom-left
-                *(vertex++) = v[2]; // Bottom-right
-                *(vertex++) = v[1]; // Top-right
-            }
+            // Second triangle
+            *(vertex++) = v[3]; // Bottom-left
+            *(vertex++) = v[2]; // Bottom-right
+            *(vertex++) = v[1]; // Top-right
        }
-        break;
-
+    } break;
    default:
        break;
    }