From e783b0d4a97a67eb3380c64f3fc6d65d41a51872 Mon Sep 17 00:00:00 2001
From: GPUCode <47210458+GPUCode@users.noreply.github.com>
Date: Tue, 18 Jul 2023 17:31:31 +0300
Subject: [PATCH] rasterizer_cache: Fixes to (unaligned) texture downloads
 (#6697)

* rasterizer_cache: Header cleanup

* gl_texture_runtime: Fix incorrect stride in single scanline downloads

* texture_codec: Fix unaligned texture downloads
---
 src/video_core/CMakeLists.txt                 |  2 +
 .../rasterizer_cache/rasterizer_cache.h       |  3 +-
 .../rasterizer_cache/rasterizer_cache_base.h  |  9 +---
 src/video_core/rasterizer_cache/slot_id.h     | 21 ++++++++
 .../rasterizer_cache/texture_codec.h          | 14 ++---
 .../rasterizer_cache/texture_cube.h           | 52 +++++++++++++++++++
 src/video_core/rasterizer_cache/utils.h       | 46 ----------------
 .../renderer_opengl/gl_texture_runtime.cpp    | 32 ++++++------
 .../renderer_opengl/gl_texture_runtime.h      |  1 +
 9 files changed, 105 insertions(+), 75 deletions(-)
 create mode 100644 src/video_core/rasterizer_cache/slot_id.h
 create mode 100644 src/video_core/rasterizer_cache/texture_cube.h

diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 581bbba466..b2cc2884fa 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -42,11 +42,13 @@ add_library(video_core STATIC
     rasterizer_cache/rasterizer_cache.h
     rasterizer_cache/rasterizer_cache_base.h
     rasterizer_cache/sampler_params.h
+    rasterizer_cache/slot_id.h
     rasterizer_cache/surface_base.cpp
     rasterizer_cache/surface_base.h
     rasterizer_cache/surface_params.cpp
     rasterizer_cache/surface_params.h
     rasterizer_cache/texture_codec.h
+    rasterizer_cache/texture_cube.h
     rasterizer_cache/utils.cpp
     rasterizer_cache/utils.h
     renderer_opengl/frame_dumper_opengl.cpp
diff --git a/src/video_core/rasterizer_cache/rasterizer_cache.h b/src/video_core/rasterizer_cache/rasterizer_cache.h
index 31771d5528..1133e90b01 100644
--- a/src/video_core/rasterizer_cache/rasterizer_cache.h
+++ b/src/video_core/rasterizer_cache/rasterizer_cache.h
@@ -14,6 +14,7 @@
 #include "core/memory.h"
 #include "video_core/custom_textures/custom_tex_manager.h"
 #include "video_core/rasterizer_cache/rasterizer_cache_base.h"
+#include "video_core/rasterizer_cache/surface_base.h"
 #include "video_core/regs.h"
 #include "video_core/renderer_base.h"
 #include "video_core/texture/texture_decode.h"
@@ -1212,7 +1213,7 @@ void RasterizerCache<T>::ClearAll(bool flush) {
 
     // Remove the whole cache without really looking at it.
     cached_pages -= flush_interval;
-    dirty_regions -= SurfaceInterval(0x0, 0xFFFFFFFF);
+    dirty_regions.clear();
     page_table.clear();
     remove_surfaces.clear();
 }
diff --git a/src/video_core/rasterizer_cache/rasterizer_cache_base.h b/src/video_core/rasterizer_cache/rasterizer_cache_base.h
index 57c7473bcd..2b76f40767 100644
--- a/src/video_core/rasterizer_cache/rasterizer_cache_base.h
+++ b/src/video_core/rasterizer_cache/rasterizer_cache_base.h
@@ -11,7 +11,8 @@
 #include <boost/icl/interval_map.hpp>
 #include <tsl/robin_map.h>
 #include "video_core/rasterizer_cache/sampler_params.h"
-#include "video_core/rasterizer_cache/surface_base.h"
+#include "video_core/rasterizer_cache/surface_params.h"
+#include "video_core/rasterizer_cache/texture_cube.h"
 
 namespace Memory {
 class MemorySystem;
@@ -70,12 +71,6 @@ class RasterizerCache {
         SurfaceId depth_id;
     };
 
-    struct TextureCube {
-        SurfaceId surface_id;
-        std::array<SurfaceId, 6> face_ids;
-        std::array<u64, 6> ticks;
-    };
-
 public:
     explicit RasterizerCache(Memory::MemorySystem& memory, CustomTexManager& custom_tex_manager,
                              Runtime& runtime, Pica::Regs& regs, RendererBase& renderer);
diff --git a/src/video_core/rasterizer_cache/slot_id.h b/src/video_core/rasterizer_cache/slot_id.h
new file mode 100644
index 0000000000..b76805be94
--- /dev/null
+++ b/src/video_core/rasterizer_cache/slot_id.h
@@ -0,0 +1,21 @@
+// Copyright 2023 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "common/slot_vector.h"
+
+namespace VideoCore {
+
+using SurfaceId = Common::SlotId; ///< Slot identifier used to refer to a surface
+using SamplerId = Common::SlotId; ///< Slot identifier used to refer to a sampler
+
+/// Fake surface ID for null surfaces
+constexpr SurfaceId NULL_SURFACE_ID{0};
+/// Fake surface ID for null cube surfaces
+constexpr SurfaceId NULL_SURFACE_CUBE_ID{1};
+/// Fake sampler ID for null samplers
+constexpr SamplerId NULL_SAMPLER_ID{0};
+
+} // namespace VideoCore
diff --git a/src/video_core/rasterizer_cache/texture_codec.h b/src/video_core/rasterizer_cache/texture_codec.h
index a2a4c5425f..01e90f4617 100644
--- a/src/video_core/rasterizer_cache/texture_codec.h
+++ b/src/video_core/rasterizer_cache/texture_codec.h
@@ -3,6 +3,7 @@
 // Refer to the license.txt file included.
 
 #pragma once
+
 #include <algorithm>
 #include <bit>
 #include <span>
@@ -264,6 +265,7 @@ static constexpr void MortonCopy(u32 width, u32 height, u32 start_offset, u32 en
     const u32 aligned_down_start_offset = Common::AlignDown(start_offset, tile_size);
     const u32 aligned_start_offset = Common::AlignUp(start_offset, tile_size);
     const u32 aligned_end_offset = Common::AlignDown(end_offset, tile_size);
+    const u32 begin_pixel_index = aligned_down_start_offset * 8 / GetFormatBpp(format);
 
     ASSERT(!morton_to_linear ||
            (aligned_start_offset == start_offset && aligned_end_offset == end_offset));
@@ -271,12 +273,12 @@ static constexpr void MortonCopy(u32 width, u32 height, u32 start_offset, u32 en
     // In OpenGL the texture origin is in the bottom left corner as opposed to other
     // APIs that have it at the top left. To avoid flipping texture coordinates in
     // the shader we read/write the linear buffer from the bottom up
-    u32 linear_offset = ((height - 8) * width) * aligned_bytes_per_pixel;
+    u32 x = (begin_pixel_index % (width * 8)) / 8;
+    u32 y = (begin_pixel_index / (width * 8)) * 8;
+    u32 linear_offset = ((height - 8 - y) * width + x) * aligned_bytes_per_pixel;
     u32 tiled_offset = 0;
-    u32 x = 0;
-    u32 y = 0;
 
-    const auto LinearNextTile = [&] {
+    const auto linear_next_tile = [&] {
         x = (x + 8) % width;
         linear_offset += 8 * aligned_bytes_per_pixel;
         if (!x) {
@@ -300,7 +302,7 @@ static constexpr void MortonCopy(u32 width, u32 height, u32 start_offset, u32 en
                     std::min(aligned_start_offset, end_offset) - start_offset);
 
         tiled_offset += aligned_start_offset - start_offset;
-        LinearNextTile();
+        linear_next_tile();
     }
 
     // If the copy spans multiple tiles, copy the fully aligned tiles in between.
@@ -313,7 +315,7 @@ static constexpr void MortonCopy(u32 width, u32 height, u32 start_offset, u32 en
             auto tiled_data = tiled_buffer.subspan(tiled_offset, tile_size);
             MortonCopyTile<morton_to_linear, format, converted>(width, tiled_data, linear_data);
             tiled_offset += tile_size;
-            LinearNextTile();
+            linear_next_tile();
         }
     }
 
diff --git a/src/video_core/rasterizer_cache/texture_cube.h b/src/video_core/rasterizer_cache/texture_cube.h
new file mode 100644
index 0000000000..204dcb7f8a
--- /dev/null
+++ b/src/video_core/rasterizer_cache/texture_cube.h
@@ -0,0 +1,52 @@
+// Copyright 2023 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "common/hash.h"
+#include "video_core/rasterizer_cache/slot_id.h"
+#include "video_core/regs_texturing.h"
+
+namespace VideoCore {
+
+struct TextureCube {
+    SurfaceId surface_id;
+    std::array<SurfaceId, 6> face_ids;
+    std::array<u64, 6> ticks;
+};
+
+struct TextureCubeConfig {
+    PAddr px;
+    PAddr nx;
+    PAddr py;
+    PAddr ny;
+    PAddr pz;
+    PAddr nz;
+    u32 width;
+    u32 levels;
+    Pica::TexturingRegs::TextureFormat format;
+
+    bool operator==(const TextureCubeConfig& rhs) const {
+        return std::memcmp(this, &rhs, sizeof(TextureCubeConfig)) == 0; // assumes no padding
+    }
+
+    bool operator!=(const TextureCubeConfig& rhs) const {
+        return !(*this == rhs);
+    }
+
+    u64 Hash() const {
+        return Common::ComputeHash64(this, sizeof(TextureCubeConfig));
+    }
+};
+
+} // namespace VideoCore
+
+namespace std {
+template <>
+struct hash<VideoCore::TextureCubeConfig> {
+    std::size_t operator()(const VideoCore::TextureCubeConfig& config) const noexcept {
+        return config.Hash();
+    }
+};
+} // namespace std
diff --git a/src/video_core/rasterizer_cache/utils.h b/src/video_core/rasterizer_cache/utils.h
index 3f6ee1f971..aeb7d74513 100644
--- a/src/video_core/rasterizer_cache/utils.h
+++ b/src/video_core/rasterizer_cache/utils.h
@@ -5,24 +5,11 @@
 #pragma once
 
 #include <span>
-#include "common/hash.h"
 #include "common/math_util.h"
-#include "common/slot_vector.h"
 #include "common/vector_math.h"
-#include "video_core/regs_texturing.h"
 
 namespace VideoCore {
 
-using SurfaceId = Common::SlotId;
-using SamplerId = Common::SlotId;
-
-/// Fake surface ID for null surfaces
-constexpr SurfaceId NULL_SURFACE_ID{0};
-/// Fake surface ID for null cube surfaces
-constexpr SurfaceId NULL_SURFACE_CUBE_ID{1};
-/// Fake sampler ID for null samplers
-constexpr SamplerId NULL_SAMPLER_ID{0};
-
 struct Offset {
     u32 x = 0;
     u32 y = 0;
@@ -79,30 +66,6 @@ struct StagingData {
     std::span<u8> mapped;
 };
 
-struct TextureCubeConfig {
-    PAddr px;
-    PAddr nx;
-    PAddr py;
-    PAddr ny;
-    PAddr pz;
-    PAddr nz;
-    u32 width;
-    u32 levels;
-    Pica::TexturingRegs::TextureFormat format;
-
-    bool operator==(const TextureCubeConfig& rhs) const {
-        return std::memcmp(this, &rhs, sizeof(TextureCubeConfig)) == 0;
-    }
-
-    bool operator!=(const TextureCubeConfig& rhs) const {
-        return std::memcmp(this, &rhs, sizeof(TextureCubeConfig)) != 0;
-    }
-
-    const u64 Hash() const {
-        return Common::ComputeHash64(this, sizeof(TextureCubeConfig));
-    }
-};
-
 class SurfaceParams;
 
 u32 MipLevels(u32 width, u32 height, u32 max_level);
@@ -134,12 +97,3 @@ void DecodeTexture(const SurfaceParams& surface_info, PAddr start_addr, PAddr en
                    std::span<u8> source, std::span<u8> dest, bool convert = false);
 
 } // namespace VideoCore
-
-namespace std {
-template <>
-struct hash<VideoCore::TextureCubeConfig> {
-    std::size_t operator()(const VideoCore::TextureCubeConfig& config) const noexcept {
-        return config.Hash();
-    }
-};
-} // namespace std
diff --git a/src/video_core/renderer_opengl/gl_texture_runtime.cpp b/src/video_core/renderer_opengl/gl_texture_runtime.cpp
index 6f09de2765..2786b9d671 100644
--- a/src/video_core/renderer_opengl/gl_texture_runtime.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_runtime.cpp
@@ -484,20 +484,19 @@ void Surface::Download(const VideoCore::BufferTextureCopy& download,
 
 bool Surface::DownloadWithoutFbo(const VideoCore::BufferTextureCopy& download,
                                  const VideoCore::StagingData& staging) {
-    const bool is_full_download = download.texture_rect == GetRect();
-    const bool has_sub_image = driver->HasArbGetTextureSubImage();
-    if (driver->IsOpenGLES() || (!is_full_download && !has_sub_image)) {
+    if (driver->IsOpenGLES()) {
         return false;
     }
 
-    const GLuint old_tex = OpenGLState::GetCurState().texture_units[0].texture_2d;
     const auto& tuple = runtime->GetFormatTuple(pixel_format);
+    const u32 unscaled_width = download.texture_rect.GetWidth();
 
-    glActiveTexture(GL_TEXTURE0);
-    glPixelStorei(GL_PACK_ROW_LENGTH, static_cast<GLint>(stride));
+    glPixelStorei(GL_PACK_ROW_LENGTH, unscaled_width);
     SCOPE_EXIT({ glPixelStorei(GL_PACK_ROW_LENGTH, 0); });
 
     // Prefer glGetTextureSubImage in most cases since it's the fastest and most convenient option
+    const bool is_full_download = download.texture_rect == GetRect();
+    const bool has_sub_image = driver->HasArbGetTextureSubImage();
     if (has_sub_image) {
         const GLsizei buf_size = static_cast<GLsizei>(staging.mapped.size());
         glGetTextureSubImage(Handle(0), download.texture_level, download.texture_rect.left,
@@ -505,16 +504,19 @@ bool Surface::DownloadWithoutFbo(const VideoCore::BufferTextureCopy& download,
                              download.texture_rect.GetHeight(), 1, tuple.format, tuple.type,
                              buf_size, staging.mapped.data());
         return true;
+    } else if (is_full_download) {
+        // This should only trigger for full texture downloads in oldish intel drivers
+        // that only support up to 4.3
+        OpenGLState state = OpenGLState::GetCurState();
+        state.texture_units[0].texture_2d = Handle(0);
+        state.Apply();
+
+        glGetTexImage(GL_TEXTURE_2D, download.texture_level, tuple.format, tuple.type,
+                      staging.mapped.data());
+
+        return true;
     }
-
-    // This should only trigger for full texture downloads in oldish intel drivers
-    // that only support up to 4.3
-    glBindTexture(GL_TEXTURE_2D, Handle(0));
-    glGetTexImage(GL_TEXTURE_2D, download.texture_level, tuple.format, tuple.type,
-                  staging.mapped.data());
-    glBindTexture(GL_TEXTURE_2D, old_tex);
-
-    return true;
+    return false;
 }
 
 void Surface::Attach(GLenum target, u32 level, u32 layer, bool scaled) {
diff --git a/src/video_core/renderer_opengl/gl_texture_runtime.h b/src/video_core/renderer_opengl/gl_texture_runtime.h
index 0551394df2..654e321ebb 100644
--- a/src/video_core/renderer_opengl/gl_texture_runtime.h
+++ b/src/video_core/renderer_opengl/gl_texture_runtime.h
@@ -6,6 +6,7 @@
 
 #include "video_core/rasterizer_cache/framebuffer_base.h"
 #include "video_core/rasterizer_cache/rasterizer_cache_base.h"
+#include "video_core/rasterizer_cache/surface_base.h"
 #include "video_core/renderer_opengl/gl_blit_helper.h"
 
 namespace VideoCore {