backends/drm: combine color operations more aggressively than ColorPipeline does

With programmable LUTs, consecutive transfer functions, inverse transfer functions and multipliers can all be combined into one LUT. This allows offloading operations in more situations and makes the operations more efficient too, as potentially fewer LUTs have to be programmed
2024-07-15 20:35:46 +02:00 · 2024-07-15 20:35:46 +02:00 · e2c8f25d31
commit e2c8f25d31
parent 3f2f3cb020
2 changed files with 60 additions and 80 deletions
--- a/src/backends/drm/drm_colorop.cpp
+++ b/src/backends/drm/drm_colorop.cpp
@ -82,29 +82,28 @@ bool DrmAbstractColorOp::matchPipeline(DrmAtomicCommit *commit, const ColorPipel
            currentOp->bypass(m_cache.get());
            currentOp = currentOp->next();
        }
-        currentOp->program(m_cache.get(), *initialOp, 1, 1);
+        currentOp->program(m_cache.get(), std::span(&*initialOp, 1), 1, 1);
        currentOp = currentOp->next();
    }
-    for (auto it = pipeline.ops.begin(); it != pipeline.ops.end(); it++) {
+    for (auto it = pipeline.ops.begin(); it != pipeline.ops.end();) {
        while (!currentOp->canBeUsedFor(*it)) {
            currentOp->bypass(m_cache.get());
            currentOp = currentOp->next();
        }
-        if (it == pipeline.ops.end() - 1) {
-            // this is the last op, we need to un-do the factor
-            // this assumes that the output is always limited range
-            currentOp->program(m_cache.get(), *it, valueScaling, 1.0);
-            valueScaling = 1.0;
-        } else if (needsLimitedRange(*it) || needsLimitedRange(*(it + 1))) {
-            // this op can only output limited range or the next op needs a limited range input,
-            // adjust the factor to make it happen
-            currentOp->program(m_cache.get(), *it, valueScaling, 1.0 / it->output.max);
-            valueScaling = 1.0 / it->output.max;
-        } else {
-            // this and the next op are both fine with extended range, set the factor to 1.0 to use all the resolution we can get
-            currentOp->program(m_cache.get(), *it, valueScaling, 1.0);
-            valueScaling = 1.0;
+        auto firstIt = it;
+        it++;
+        // combine as many operations into one hardware operation as possible
+        while (it != pipeline.ops.end() && currentOp->canBeUsedFor(*it)) {
+            it++;
        }
+        std::span operations(firstIt, it);
+        double outputScaling = 1.0;
+        if (it != pipeline.ops.end() && (needsLimitedRange(operations.front()) || needsLimitedRange(operations.back()) || needsLimitedRange(*it))) {
+            // if this or the next operation needs a limited range, or we need limited range for the output, make it happen
+            outputScaling = 1.0 / operations.back().output.max;
+        }
+        currentOp->program(m_cache.get(), operations, valueScaling, outputScaling);
+        valueScaling = outputScaling;
        currentOp = currentOp->next();
    }
    while (currentOp) {
@ -136,45 +135,31 @@ bool LegacyLutColorOp::canBeUsedFor(const ColorOp &op)
    return false;
 }

-void LegacyLutColorOp::program(DrmAtomicCommit *commit, const ColorOp &op, double inputScale, double outputScale)
+void LegacyLutColorOp::program(DrmAtomicCommit *commit, std::span<const ColorOp> operations, double inputScale, double outputScale)
 {
-    if (auto tf = std::get_if<ColorTransferFunction>(&op.operation)) {
-        for (uint32_t i = 0; i < m_maxSize; i++) {
-            const double nits = tf->tf.encodedToNits(i / double(m_maxSize - 1) / inputScale, tf->referenceLuminance);
-            const uint16_t output = std::round(std::clamp(nits * outputScale, 0.0, 1.0) * std::numeric_limits<uint16_t>::max());
-            m_components[i] = {
-                .red = output,
-                .green = output,
-                .blue = output,
-                .reserved = 0,
-            };
+    for (uint32_t i = 0; i < m_maxSize; i++) {
+        const double input = i / double(m_maxSize - 1);
+        const double scaledInput = input / inputScale;
+        QVector3D output(scaledInput, scaledInput, scaledInput);
+        for (const auto &op : operations) {
+            if (auto tf = std::get_if<ColorTransferFunction>(&op.operation)) {
+                output = tf->tf.encodedToNits(output, tf->referenceLuminance);
+            } else if (auto tf = std::get_if<InverseColorTransferFunction>(&op.operation)) {
+                output = tf->tf.nitsToEncoded(output, tf->referenceLuminance);
+            } else if (auto mult = std::get_if<ColorMultiplier>(&op.operation)) {
+                output *= mult->factors;
+            } else {
+                Q_UNREACHABLE();
+            }
        }
-        commit->addBlob(*m_prop, DrmBlob::create(m_prop->drmObject()->gpu(), m_components.data(), sizeof(drm_color_lut) * m_components.size()));
-    } else if (auto tf = std::get_if<InverseColorTransferFunction>(&op.operation)) {
-        for (uint32_t i = 0; i < m_maxSize; i++) {
-            const double nits = tf->tf.nitsToEncoded(i / double(m_maxSize - 1) / inputScale, tf->referenceLuminance);
-            const uint16_t output = std::round(std::clamp(nits * outputScale, 0.0, 1.0) * std::numeric_limits<uint16_t>::max());
-            m_components[i] = {
-                .red = output,
-                .green = output,
-                .blue = output,
-                .reserved = 0,
-            };
-        }
-        commit->addBlob(*m_prop, DrmBlob::create(m_prop->drmObject()->gpu(), m_components.data(), sizeof(drm_color_lut) * m_components.size()));
-    } else if (auto mult = std::get_if<ColorMultiplier>(&op.operation)) {
-        for (uint32_t i = 0; i < m_maxSize; i++) {
-            m_components[i] = {
-                .red = uint16_t(std::round(std::clamp(mult->factors.x() * outputScale / inputScale * i / double(m_maxSize - 1), 0.0, 1.0) * std::numeric_limits<uint16_t>::max())),
-                .green = uint16_t(std::round(std::clamp(mult->factors.y() * outputScale / inputScale * i / double(m_maxSize - 1), 0.0, 1.0) * std::numeric_limits<uint16_t>::max())),
-                .blue = uint16_t(std::round(std::clamp(mult->factors.z() * outputScale / inputScale * i / double(m_maxSize - 1), 0.0, 1.0) * std::numeric_limits<uint16_t>::max())),
-                .reserved = 0,
-            };
-        }
-        commit->addBlob(*m_prop, DrmBlob::create(m_prop->drmObject()->gpu(), m_components.data(), sizeof(drm_color_lut) * m_maxSize));
-    } else {
-        Q_ASSERT(false);
+        m_components[i] = {
+            .red = uint16_t(std::round(std::clamp(output.x() * outputScale, 0.0, 1.0) * std::numeric_limits<uint16_t>::max())),
+            .green = uint16_t(std::round(std::clamp(output.y() * outputScale, 0.0, 1.0) * std::numeric_limits<uint16_t>::max())),
+            .blue = uint16_t(std::round(std::clamp(output.z() * outputScale, 0.0, 1.0) * std::numeric_limits<uint16_t>::max())),
+            .reserved = 0,
+        };
    }
+    commit->addBlob(*m_prop, DrmBlob::create(m_prop->drmObject()->gpu(), m_components.data(), sizeof(drm_color_lut) * m_maxSize));
 }

 void LegacyLutColorOp::bypass(DrmAtomicCommit *commit)
@ -215,33 +200,27 @@ static uint64_t doubleToFixed(double value)
    return ret;
 }

-void LegacyMatrixColorOp::program(DrmAtomicCommit *commit, const ColorOp &op, double inputScale, double outputScale)
+void LegacyMatrixColorOp::program(DrmAtomicCommit *commit, std::span<const ColorOp> operations, double inputScale, double outputScale)
 {
-    if (auto matrix = std::get_if<ColorMatrix>(&op.operation)) {
-        QMatrix4x4 scaled = matrix->mat;
-        scaled.scale(outputScale / inputScale);
-        drm_color_ctm data = {
-            .matrix = {
-                doubleToFixed(scaled(0, 0)), doubleToFixed(scaled(0, 1)), doubleToFixed(scaled(0, 2)), //
-                doubleToFixed(scaled(1, 0)), doubleToFixed(scaled(1, 1)), doubleToFixed(scaled(1, 2)), //
-                doubleToFixed(scaled(2, 0)), doubleToFixed(scaled(2, 1)), doubleToFixed(scaled(2, 2)), //
-            },
-        };
-        commit->addBlob(*m_prop, DrmBlob::create(m_prop->drmObject()->gpu(), &data, sizeof(data)));
-    } else if (auto mult = std::get_if<ColorMultiplier>(&op.operation)) {
-        QVector3D scaled = mult->factors;
-        scaled *= outputScale / inputScale;
-        drm_color_ctm data = {
-            .matrix = {
-                doubleToFixed(scaled.x()), doubleToFixed(0), doubleToFixed(0), //
-                doubleToFixed(0), doubleToFixed(scaled.y()), doubleToFixed(0), //
-                doubleToFixed(0), doubleToFixed(0), doubleToFixed(scaled.z()), //
-            },
-        };
-        commit->addBlob(*m_prop, DrmBlob::create(m_prop->drmObject()->gpu(), &data, sizeof(data)));
-    } else {
-        Q_ASSERT(false);
+    QMatrix4x4 result;
+    result.scale(1.0 / inputScale);
+    for (const auto &op : operations) {
+        if (auto matrix = std::get_if<ColorMatrix>(&op.operation)) {
+            result *= matrix->mat;
+        } else if (auto mult = std::get_if<ColorMultiplier>(&op.operation)) {
+            result.scale(mult->factors.x(), mult->factors.y(), mult->factors.z());
+        } else {
+            Q_UNREACHABLE();
+        }
    }
+    drm_color_ctm data = {
+        .matrix = {
+            doubleToFixed(result(0, 0)), doubleToFixed(result(0, 1)), doubleToFixed(result(0, 2)), //
+            doubleToFixed(result(1, 0)), doubleToFixed(result(1, 1)), doubleToFixed(result(1, 2)), //
+            doubleToFixed(result(2, 0)), doubleToFixed(result(2, 1)), doubleToFixed(result(2, 2)), //
+        },
+    };
+    commit->addBlob(*m_prop, DrmBlob::create(m_prop->drmObject()->gpu(), &data, sizeof(data)));
 }

 void LegacyMatrixColorOp::bypass(DrmAtomicCommit *commit)
--- a/src/backends/drm/drm_colorop.h
+++ b/src/backends/drm/drm_colorop.h
@ -11,6 +11,7 @@

 #include <drm.h>
 #include <memory>
+#include <span>

 namespace KWin
 {
@ -27,7 +28,7 @@ public:

    bool matchPipeline(DrmAtomicCommit *commit, const ColorPipeline &pipeline);
    virtual bool canBeUsedFor(const ColorOp &op) = 0;
-    virtual void program(DrmAtomicCommit *commit, const ColorOp &op, double inputScale, double outputScale) = 0;
+    virtual void program(DrmAtomicCommit *commit, std::span<const ColorOp> operations, double inputScale, double outputScale) = 0;
    virtual void bypass(DrmAtomicCommit *commit) = 0;

    DrmAbstractColorOp *next() const;
@ -45,7 +46,7 @@ public:
    explicit LegacyLutColorOp(DrmAbstractColorOp *next, DrmProperty *prop, uint32_t maxSize);

    bool canBeUsedFor(const ColorOp &op) override;
-    void program(DrmAtomicCommit *commit, const ColorOp &op, double inputScale, double outputScale) override;
+    void program(DrmAtomicCommit *commit, std::span<const ColorOp> operations, double inputScale, double outputScale) override;
    void bypass(DrmAtomicCommit *commit) override;

 private:
@ -60,7 +61,7 @@ public:
    explicit LegacyMatrixColorOp(DrmAbstractColorOp *next, DrmProperty *prop);

    bool canBeUsedFor(const ColorOp &op) override;
-    void program(DrmAtomicCommit *commit, const ColorOp &op, double inputScale, double outputScale) override;
+    void program(DrmAtomicCommit *commit, std::span<const ColorOp> operations, double inputScale, double outputScale) override;
    void bypass(DrmAtomicCommit *commit) override;

 private: