Drop custom SSE code from WindowQuads::makeInterleavedArrays

Both GCC and Clang do auto vecrization these days and testing shows that
that actually produces faster code, so dropping the SSE stuff makes
things both simpler and faster.
This commit is contained in:
Arjen Hiemstra 2022-07-05 11:54:36 +02:00
parent 0c8d87f9d4
commit a95b556868

View file

@ -25,10 +25,6 @@
#include <kconfiggroup.h>
#include <ksharedconfig.h>
#if defined(__SSE2__)
#include <emmintrin.h>
#endif
#include <optional>
namespace KWin
@ -1055,106 +1051,41 @@ void WindowQuadList::makeInterleavedArrays(unsigned int type, GLVertex2D *vertic
Q_ASSERT(type == GL_QUADS || type == GL_TRIANGLES);
switch (type) {
case GL_QUADS:
#if defined(__SSE2__)
if (!(intptr_t(vertex) & 0xf)) {
for (const WindowQuad &quad : *this) {
alignas(16) GLVertex2D v[4];
case GL_QUADS: {
for (const WindowQuad &quad : *this) {
for (int j = 0; j < 4; j++) {
const WindowVertex &wv = quad[j];
for (int j = 0; j < 4; j++) {
const WindowVertex &wv = quad[j];
GLVertex2D v;
v.position = QVector2D(wv.x(), wv.y());
v.texcoord = QVector2D(wv.u(), wv.v()) * coeff + offset;
v[j].position = QVector2D(wv.x(), wv.y());
v[j].texcoord = QVector2D(wv.u(), wv.v()) * coeff + offset;
}
const __m128i *srcP = reinterpret_cast<const __m128i *>(&v);
__m128i *dstP = reinterpret_cast<__m128i *>(vertex);
_mm_stream_si128(&dstP[0], _mm_load_si128(&srcP[0])); // Top-left
_mm_stream_si128(&dstP[1], _mm_load_si128(&srcP[1])); // Top-right
_mm_stream_si128(&dstP[2], _mm_load_si128(&srcP[2])); // Bottom-right
_mm_stream_si128(&dstP[3], _mm_load_si128(&srcP[3])); // Bottom-left
vertex += 4;
}
} else
#endif // __SSE2__
{
for (const WindowQuad &quad : *this) {
for (int j = 0; j < 4; j++) {
const WindowVertex &wv = quad[j];
GLVertex2D v;
v.position = QVector2D(wv.x(), wv.y());
v.texcoord = QVector2D(wv.u(), wv.v()) * coeff + offset;
*(vertex++) = v;
}
*(vertex++) = v;
}
}
break;
} break;
case GL_TRIANGLES: {
for (const WindowQuad &quad : *this) {
GLVertex2D v[4]; // Four unique vertices / quad
case GL_TRIANGLES:
#if defined(__SSE2__)
if (!(intptr_t(vertex) & 0xf)) {
for (const WindowQuad &quad : *this) {
alignas(16) GLVertex2D v[4];
for (int j = 0; j < 4; j++) {
const WindowVertex &wv = quad[j];
for (int j = 0; j < 4; j++) {
const WindowVertex &wv = quad[j];
v[j].position = QVector2D(wv.x(), wv.y());
v[j].texcoord = QVector2D(wv.u(), wv.v()) * coeff + offset;
}
const __m128i *srcP = reinterpret_cast<const __m128i *>(&v);
__m128i *dstP = reinterpret_cast<__m128i *>(vertex);
__m128i src[4];
src[0] = _mm_load_si128(&srcP[0]); // Top-left
src[1] = _mm_load_si128(&srcP[1]); // Top-right
src[2] = _mm_load_si128(&srcP[2]); // Bottom-right
src[3] = _mm_load_si128(&srcP[3]); // Bottom-left
// First triangle
_mm_stream_si128(&dstP[0], src[1]); // Top-right
_mm_stream_si128(&dstP[1], src[0]); // Top-left
_mm_stream_si128(&dstP[2], src[3]); // Bottom-left
// Second triangle
_mm_stream_si128(&dstP[3], src[3]); // Bottom-left
_mm_stream_si128(&dstP[4], src[2]); // Bottom-right
_mm_stream_si128(&dstP[5], src[1]); // Top-right
vertex += 6;
v[j].position = QVector2D(wv.x(), wv.y());
v[j].texcoord = QVector2D(wv.u(), wv.v()) * coeff + offset;
}
} else
#endif // __SSE2__
{
for (const WindowQuad &quad : *this) {
GLVertex2D v[4]; // Four unique vertices / quad
for (int j = 0; j < 4; j++) {
const WindowVertex &wv = quad[j];
// First triangle
*(vertex++) = v[1]; // Top-right
*(vertex++) = v[0]; // Top-left
*(vertex++) = v[3]; // Bottom-left
v[j].position = QVector2D(wv.x(), wv.y());
v[j].texcoord = QVector2D(wv.u(), wv.v()) * coeff + offset;
}
// First triangle
*(vertex++) = v[1]; // Top-right
*(vertex++) = v[0]; // Top-left
*(vertex++) = v[3]; // Bottom-left
// Second triangle
*(vertex++) = v[3]; // Bottom-left
*(vertex++) = v[2]; // Bottom-right
*(vertex++) = v[1]; // Top-right
}
// Second triangle
*(vertex++) = v[3]; // Bottom-left
*(vertex++) = v[2]; // Bottom-right
*(vertex++) = v[1]; // Top-right
}
break;
} break;
default:
break;
}