diff --git a/.gitignore b/.gitignore
index c7bbb2808e..48ce7cc770 100644
--- a/.gitignore
+++ b/.gitignore
@@ -37,4 +37,4 @@ tools/nsc/bin/*
 */__pycache__/*
 __pycache__/*
 *.pyc
-
+tmp/*
diff --git a/.gitmodules b/.gitmodules
index 8a04f82d9d..8c03de482d 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -129,3 +129,6 @@
 [submodule "3rdparty/Vulkan-Tools"]
 	path = 3rdparty/Vulkan-Tools
 	url = git@github.com:Devsh-Graphics-Programming/Vulkan-Tools.git
+[submodule "3rdparty/fast_float"]
+	path = 3rdparty/fast_float
+	url = ../fast_float.git
diff --git a/3rdparty/CMakeLists.txt b/3rdparty/CMakeLists.txt
index 68e821dfdf..826bfa4cad 100755
--- a/3rdparty/CMakeLists.txt
+++ b/3rdparty/CMakeLists.txt
@@ -282,12 +282,31 @@ target_compile_definitions(spirv_cross PUBLIC SPIRV_CROSS_EXCEPTIONS_TO_ASSERTIO
 # note that checking if a repository is dirty may cost build time (especially a lot if like us you have a lot of submodules) - by default we run with all checks but if you want to increase build time iterations I recommend to exclude this check
 option(GIT_EXCLUDE_IS_DIRTY "Exclude IS_DIRTY from git tracking checks, will increase build time iterations at the expense of the meta information loss" OFF)
 add_subdirectory(git-version-tracking EXCLUDE_FROM_ALL)
-NBL_ADD_GIT_TRACKING_META_LIBRARY(nabla "${NBL_ROOT_PATH}")
-NBL_ADD_GIT_TRACKING_META_LIBRARY(dxc "${CMAKE_CURRENT_SOURCE_DIR}/dxc/dxc")
-nbl_install_file("${CMAKE_CURRENT_BINARY_DIR}/git-version-tracking/nabla_git_info.json")
-nbl_install_file("${CMAKE_CURRENT_BINARY_DIR}/git-version-tracking/dxc_git_info.json")
-
-NBL_GENERATE_GIT_TRACKING_META()
+NBL_CONFIGURE_GIT_TRACKING_META_RUNTIME( # NOTE(review): macro lives in the git-version-tracking submodule; argument meanings are read off their names - confirm there
+	TARGET gtml_core
+	NAMESPACE gtml
+	IGITINFO_HEADER_PATH nbl/gtml/IGitInfo.h
+	JSON_FORMATTER_HEADER_PATH nbl/gtml/SJsonFormatter.h
+)
+NBL_ADD_GIT_TRACKING_META_LIBRARY(
+	TARGET gtml
+	NAMESPACE nbl::gtml
+	HEADER_PATH nbl/git/info.h # same relative path that include/nabla.h includes
+	REPOS # name / repository directory pairs
+		nabla "${NBL_ROOT_PATH}"
+		dxc "${CMAKE_CURRENT_SOURCE_DIR}/dxc/dxc"
+)
+NBL_GET_GIT_TRACKING_META_RUNTIME_OUTPUTS( # names the two variables consumed by the install calls below
+	IGITINFO_HEADER_OUTPUT_VAR _NBL_GTML_IGITINFO_HEADER
+	JSON_FORMATTER_HEADER_OUTPUT_VAR _NBL_GTML_JSON_FORMATTER_HEADER
+)
+NBL_GET_GIT_TRACKING_META_OUTPUTS(TARGET gtml HEADER_OUTPUT_VAR _NBL_GTML_HEADER JSON_OUTPUTS_VAR _NBL_GTML_JSONS) # names the header variable and the JSON list variable used below
+nbl_install_file_spec("${_NBL_GTML_IGITINFO_HEADER}" "nbl/gtml") # runtime headers are installed under nbl/gtml
+nbl_install_file_spec("${_NBL_GTML_JSON_FORMATTER_HEADER}" "nbl/gtml")
+nbl_install_file_spec("${_NBL_GTML_HEADER}" "nbl/git") # per-repository info header is installed under nbl/git
+foreach(_NBL_GTML_JSON IN LISTS _NBL_GTML_JSONS) # one install call per entry of the JSON list
+	nbl_install_file("${_NBL_GTML_JSON}")
+endforeach()
 
 # NGFX
 include(ngfx/ngfx.cmake)
@@ -456,6 +475,8 @@ set(NBL_3RDPARTY_TARGETS
 				lz4
 				aesGladman
 				spirv_cross
+				gtml_core
+				gtml
 				png_static
 				zlibstatic
 				shaderc_util
@@ -528,14 +549,10 @@ nbl_install_dir(glm/glm)
 nbl_install_file_spec(${CMAKE_CURRENT_BINARY_DIR}/imath/config/ImathConfig.h imath)
 nbl_install_dir(imath/src/Imath)
 
-nbl_install_file(blake/c/blake3.h)
-
 nbl_install_dir(boost/superproject/libs/preprocessor/include/boost)
 
 nbl_install_file_spec(renderdoc/renderdoc_app.h renderdoc)
 
-nbl_install_file(${CMAKE_CURRENT_BINARY_DIR}/git-version-tracking/git_info.h)
-
 # parent scope exports, must be at the end of the file
 set(_NBL_3RDPARTY_TARGETS_ 
 	${NBL_3RDPARTY_TARGETS}
diff --git a/3rdparty/fast_float b/3rdparty/fast_float
new file mode 160000
index 0000000000..221a4920db
--- /dev/null
+++ b/3rdparty/fast_float
@@ -0,0 +1 @@
+Subproject commit 221a4920db7d68d33ab9794af602daef19667351
diff --git a/3rdparty/git-version-tracking b/3rdparty/git-version-tracking
index 6c3ecac5f0..b0a7450c14 160000
--- a/3rdparty/git-version-tracking
+++ b/3rdparty/git-version-tracking
@@ -1 +1 @@
-Subproject commit 6c3ecac5f0297877d1573ef4e3cdb537c5feeb62
+Subproject commit b0a7450c141e8520c0225370d7408ed9a18e8efb
diff --git a/examples_tests b/examples_tests
index 77f4b77500..187aebeaf3 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 77f4b775008a50cda066af5d611e6147a886f52e
+Subproject commit 187aebeaf3287ba5ba3a6e872bd682af988d8e85
diff --git a/include/nabla.h b/include/nabla.h
index fa231e3db7..cedf6b0ebf 100644
--- a/include/nabla.h
+++ b/include/nabla.h
@@ -64,11 +64,11 @@
 #include "SColor.h"
 
 // meta info
-#include "git_info.h"
+#include "nbl/git/info.h"
 
 namespace nbl {
-	const NBL_API2 gtml::GitInfo& getGitInfo(gtml::E_GIT_REPO_META repo);
+	const NBL_API2 ::gtml::IGitInfo& getGitInfo(gtml::E_GIT_REPO_META repo);
 }
 
 
-#endif // __NABLA_H_INCLUDED__
\ No newline at end of file
+#endif // __NABLA_H_INCLUDED__
diff --git a/include/nbl/application_templates/MonoDeviceApplication.hpp b/include/nbl/application_templates/MonoDeviceApplication.hpp
index c7a94fe332..a3399ac8f0 100644
--- a/include/nbl/application_templates/MonoDeviceApplication.hpp
+++ b/include/nbl/application_templates/MonoDeviceApplication.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2023-2023 - DevSH Graphics Programming Sp. z O.O.
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
 #ifndef _NBL_APPLICATION_TEMPLATES_MONO_DEVICE_APPLICATION_HPP_INCLUDED_
@@ -280,4 +280,4 @@ class MonoDeviceApplication : public virtual MonoSystemMonoLoggerApplication
 };
 
 }
-#endif
\ No newline at end of file
+#endif
diff --git a/include/nbl/asset/IAssetManager.h b/include/nbl/asset/IAssetManager.h
index 45b32b7c61..557aff64dc 100644
--- a/include/nbl/asset/IAssetManager.h
+++ b/include/nbl/asset/IAssetManager.h
@@ -5,6 +5,7 @@
 #define _NBL_ASSET_I_ASSET_MANAGER_H_INCLUDED_
 
 #include <array>
+#include <optional>
 #include <ostream>
 
 #include "nbl/core/declarations.h"
@@ -51,6 +52,12 @@ class NBL_API2 IAssetManager : public core::IReferenceCounted
         friend std::function<void(SAssetBundle&)> makeAssetDisposeFunc(const IAssetManager* const _mgr);
 
     public:
+        struct SWriterFlagInfo //!< pair of writer flag masks, the value type returned by `getAssetWriterFlagInfo`
+        {
+            writer_flags_t supported = EWF_NONE; //!< filled from the writer's `getSupportedFlags()`
+            writer_flags_t forced = EWF_NONE; //!< filled from the writer's `getForcedFlags()`
+        };
+
 #ifdef USE_MAPS_FOR_PATH_BASED_CACHE
         using AssetCacheType = core::CConcurrentMultiObjectCache<std::string, SAssetBundle, std::multimap>;
 #else
@@ -180,19 +187,31 @@ class NBL_API2 IAssetManager : public core::IReferenceCounted
         SAssetBundle getAssetInHierarchy_impl(const std::string& _filePath, const IAssetLoader::SAssetLoadParams& _params, uint32_t _hierarchyLevel, IAssetLoader::IAssetLoaderOverride* _override)
         {
             IAssetLoader::SAssetLoadContext ctx(_params, nullptr);
+            system::ISystem::future_t<core::smart_refctd_ptr<system::IFile>> future; // captured by reference and reused by both open attempts in the lambda below
+            const auto tryLoadAssetFromPath = [&](const system::path& path)->SAssetBundle // opens `path` and forwards the file to the IFile overload; `SAssetBundle(0)` when no file could be acquired
+            {
+                m_system->createFile(future, path, static_cast<system::IFile::E_CREATE_FLAGS>(system::IFile::ECF_READ | system::IFile::ECF_MAPPABLE)); // first attempt: read access combined with the mappable flag
+                if (auto file=future.acquire())
+                    return getAssetInHierarchy_impl(file->get(), path.string(), ctx.params, _hierarchyLevel, _override);
+                m_system->createFile(future, path, system::IFile::ECF_READ); // second attempt: plain read access, only reached when the first acquire failed
+                if (auto file=future.acquire())
+                    return getAssetInHierarchy_impl(file->get(), path.string(), ctx.params, _hierarchyLevel, _override);
+                return SAssetBundle(0); // neither attempt produced a file
+            };
 
             system::path filePath = _filePath;
             _override->getLoadFilename(filePath, m_system.get(), ctx, _hierarchyLevel);
-            if (!m_system->exists(filePath,system::IFile::ECF_READ))
+            if (auto bundle=tryLoadAssetFromPath(filePath); !bundle.getContents().empty())
+                return bundle;
+
+            auto fallbackPath = _params.workingDirectory / filePath;
+            if (fallbackPath != filePath)
             {
-                filePath = _params.workingDirectory/filePath;
+                filePath = std::move(fallbackPath);
                 _override->getLoadFilename(filePath, m_system.get(), ctx, _hierarchyLevel);
+                if (auto bundle=tryLoadAssetFromPath(filePath); !bundle.getContents().empty())
+                    return bundle;
             }
-            
-            system::ISystem::future_t<core::smart_refctd_ptr<system::IFile>> future;
-            m_system->createFile(future, filePath, system::IFile::ECF_READ);
-            if (auto file=future.acquire())
-                return getAssetInHierarchy_impl(file->get(), filePath.string(), ctx.params, _hierarchyLevel, _override);
             return SAssetBundle(0);
         }
 
@@ -350,8 +369,12 @@ class NBL_API2 IAssetManager : public core::IReferenceCounted
             if (!_override)
                 _override = &defOverride;
 
+            system::path filename = _filename;
+            if (filename.is_relative() && !_params.workingDirectory.empty())
+                filename = _params.workingDirectory / filename;
+
             system::ISystem::future_t<core::smart_refctd_ptr<system::IFile>> future;
-            m_system->createFile(future, (_params.workingDirectory.generic_string()+_filename).c_str(), system::IFile::ECF_WRITE);
+            m_system->createFile(future, std::move(filename), system::IFile::ECF_WRITE);
             if (auto file=future.acquire())
                 return writeAsset(file->get(), _params, _override);
             return false;
@@ -381,6 +404,18 @@ class NBL_API2 IAssetManager : public core::IReferenceCounted
             return writeAsset(_file, _params, nullptr);
         }
 
+        //! Supported and forced flags of the first writer registered for (`assetType`, `extension`); `std::nullopt` when no writer is registered for that pair.
+        inline std::optional<SWriterFlagInfo> getAssetWriterFlagInfo(const IAsset::E_TYPE assetType, const std::string_view extension) const
+        {
+            const auto matchingWriters = m_writers.perTypeAndFileExt.findRange({assetType, std::string(extension)});
+            if (matchingWriters.empty())
+                return std::nullopt;
+            auto* const firstWriter = matchingWriters.begin()->second;
+            const writer_flags_t supportedFlags = firstWriter->getSupportedFlags();
+            const writer_flags_t forcedFlags = firstWriter->getForcedFlags();
+            return SWriterFlagInfo{supportedFlags, forcedFlags};
+        }
+
         // Asset Loaders [FOLLOWING ARE NOT THREAD SAFE]
         uint32_t getAssetLoaderCount() { return static_cast<uint32_t>(m_loaders.vector.size()); }
 
diff --git a/include/nbl/asset/ICPUBuffer.h b/include/nbl/asset/ICPUBuffer.h
index 26f45d4ced..5cd03363ef 100644
--- a/include/nbl/asset/ICPUBuffer.h
+++ b/include/nbl/asset/ICPUBuffer.h
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
 #ifndef _NBL_ASSET_I_CPU_BUFFER_H_INCLUDED_
@@ -76,13 +76,12 @@ class ICPUBuffer final : public asset::IBuffer, public IPreHashed
         constexpr static inline auto AssetType = ET_BUFFER;
         inline IAsset::E_TYPE getAssetType() const override final { return AssetType; }
 
-        inline core::blake3_hash_t computeContentHash() const override
-        {
-            core::blake3_hasher hasher;
-            if (m_data)
-                hasher.update(m_data, m_creationParams.size);
-            return static_cast<core::blake3_hash_t>(hasher);
-        }
+        inline core::blake3_hash_t computeContentHash() const override // hash over the `m_creationParams.size` bytes at `m_data`
+        {
+            if (m_data)
+                return core::blake3_hash_buffer(m_data, m_creationParams.size);
+            return static_cast<core::blake3_hash_t>(core::blake3_hasher{}); // no data: convert a default-constructed hasher that was never fed
+        }
 
         inline bool missingContent() const override { return !m_data; }
 
@@ -149,4 +148,4 @@ class ICPUBuffer final : public asset::IBuffer, public IPreHashed
 
 } // end namespace nbl::asset
 
-#endif
\ No newline at end of file
+#endif
diff --git a/include/nbl/asset/ICPUPolygonGeometry.h b/include/nbl/asset/ICPUPolygonGeometry.h
index 2fb640e02b..e877499443 100644
--- a/include/nbl/asset/ICPUPolygonGeometry.h
+++ b/include/nbl/asset/ICPUPolygonGeometry.h
@@ -8,6 +8,7 @@
 #include "nbl/asset/IAsset.h"
 #include "nbl/asset/ICPUBuffer.h"
 #include "nbl/asset/IPolygonGeometry.h"
+#include "nbl/builtin/hlsl/shapes/AABBAccumulator.hlsl"
 
 
 namespace nbl::asset
@@ -112,7 +113,20 @@ class NBL_API2 ICPUPolygonGeometry final : public IPolygonGeometry<ICPUBuffer>
             return false;
         }
         template<typename Scalar>
-        inline bool setAABB(const hlsl::shapes::AABB<3,Scalar>& aabb) {return visitAABB([&aabb](auto&& ref)->void{ref=aabb;});}
+        inline bool setAABB(const hlsl::shapes::AABB<3,Scalar>& aabb)
+        {
+            // stays false unless the visitor runs and the assignment helper reports success
+            bool stored = false;
+            if (!visitAABB([&](auto&& target)->void {stored = hlsl::shapes::util::assignAABB(target, aabb);}))
+                return false;
+            return stored;
+        }
+        //! Same operation as `setAABB`, forwards the argument and returns its result unchanged.
+        template<typename Scalar>
+        inline bool applyAABB(const hlsl::shapes::AABB<3, Scalar>& aabb)
+        {
+            return setAABB(aabb);
+        }
 
         //
         inline bool setJointCount(const uint32_t count)
@@ -194,4 +208,4 @@ class NBL_API2 ICPUPolygonGeometry final : public IPolygonGeometry<ICPUBuffer>
 };
 
 }
-#endif
\ No newline at end of file
+#endif
diff --git a/include/nbl/asset/SBufferAdoption.h b/include/nbl/asset/SBufferAdoption.h
new file mode 100644
index 0000000000..d31e5ff95b
--- /dev/null
+++ b/include/nbl/asset/SBufferAdoption.h
@@ -0,0 +1,49 @@
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_ASSET_S_BUFFER_ADOPTION_H_INCLUDED_
+#define _NBL_ASSET_S_BUFFER_ADOPTION_H_INCLUDED_
+#include <concepts>
+#include <ranges>
+#include <type_traits>
+#include <utility>
+#include "nbl/asset/ICPUBuffer.h"
+namespace nbl::asset
+{
+namespace impl
+{
+// Constraint used by SBufferAdoption::create: a contiguous, sized range that is not a view (so e.g. std::span is rejected). References are stripped before checking.
+template<typename Storage>
+concept AdoptedBufferStorage =
+    std::ranges::contiguous_range<std::remove_reference_t<Storage>> && // elements form one contiguous block
+    std::ranges::sized_range<std::remove_reference_t<Storage>> && // element count is available through std::ranges::size
+    (!std::ranges::view<std::remove_cvref_t<Storage>>) && // views are refused; cv-qualifiers are dropped for this check only
+    requires(std::remove_reference_t<Storage>& storage)
+    {
+        typename std::ranges::range_value_t<std::remove_reference_t<Storage>>;
+        { std::ranges::data(storage) } -> std::same_as<std::ranges::range_value_t<std::remove_reference_t<Storage>>*>; // data() must be exactly `value*`, so a `const value*` result does not satisfy the concept
+    };
+}
+// Wraps owning contiguous storage (std::vector, core::vector, ...) into an ICPUBuffer that keeps the storage alive instead of copying its bytes out.
+class SBufferAdoption
+{
+    public:
+        // Returns nullptr for empty storage; otherwise the storage is forwarded into a ref-counted memory resource that the created buffer adopts.
+        template<impl::AdoptedBufferStorage Storage>
+        static inline core::smart_refctd_ptr<ICPUBuffer> create(Storage&& data)
+        {
+            using owned_t = std::remove_cvref_t<Storage>;
+            using element_t = std::ranges::range_value_t<owned_t>;
+            if (std::ranges::empty(data))
+                return nullptr;
+            // pointer and size are read from the storage held by the resource, not from `data`, which may have been moved from
+            auto owner = core::make_smart_refctd_ptr<core::adoption_memory_resource<owned_t>>(std::forward<Storage>(data));
+            auto& adopted = owner->getBacker();
+            auto* const elements = std::ranges::data(adopted);
+            const size_t byteSize = sizeof(element_t) * std::ranges::size(adopted);
+            return ICPUBuffer::create(
+                { { byteSize }, elements, core::smart_refctd_ptr<core::refctd_memory_resource>(std::move(owner)), alignof(element_t) }, core::adopt_memory);
+        }
+};
+}
+#endif
diff --git a/include/nbl/asset/format/EFormat.h b/include/nbl/asset/format/EFormat.h
index 62ce71555e..7daf5ae45c 100644
--- a/include/nbl/asset/format/EFormat.h
+++ b/include/nbl/asset/format/EFormat.h
@@ -5,6 +5,7 @@
 #ifndef __NBL_ASSET_E_FORMAT_H_INCLUDED__
 #define __NBL_ASSET_E_FORMAT_H_INCLUDED__
 
+#include <array>
 #include <cstdint>
 #include <type_traits>
 #include "BuildConfigOptions.h"
@@ -574,6 +575,64 @@ constexpr uint32_t getFormatChannelCount()
 {
 #include "nbl/asset/format/impl/EFormat_getFormatChannelCount.h"
 }
+namespace impl
+{
+//! One row of the channel-count lookup table: a single-channel `base` format and the formats listed for 1, 2, 3 and 4 channels.
+struct SStructuredFormatVariants
+{
+	E_FORMAT base;
+	std::array<E_FORMAT, 4> variants;
+};
+static inline constexpr uint32_t StructuredFormatChannelVariantCount = 4u;
+static inline constexpr auto StructuredFormatVariants = std::to_array<SStructuredFormatVariants>({
+	{EF_R8_SINT, {EF_R8_SINT, EF_R8G8_SINT, EF_R8G8B8_SINT, EF_R8G8B8A8_SINT}},
+	{EF_R8_UINT, {EF_R8_UINT, EF_R8G8_UINT, EF_R8G8B8_UINT, EF_R8G8B8A8_UINT}},
+	{EF_R16_SINT, {EF_R16_SINT, EF_R16G16_SINT, EF_R16G16B16_SINT, EF_R16G16B16A16_SINT}},
+	{EF_R16_UINT, {EF_R16_UINT, EF_R16G16_UINT, EF_R16G16B16_UINT, EF_R16G16B16A16_UINT}},
+	{EF_R32_SINT, {EF_R32_SINT, EF_R32G32_SINT, EF_R32G32B32_SINT, EF_R32G32B32A32_SINT}},
+	{EF_R32_UINT, {EF_R32_UINT, EF_R32G32_UINT, EF_R32G32B32_UINT, EF_R32G32B32A32_UINT}},
+	{EF_R32_SFLOAT, {EF_R32_SFLOAT, EF_R32G32_SFLOAT, EF_R32G32B32_SFLOAT, EF_R32G32B32A32_SFLOAT}},
+	{EF_R64_SFLOAT, {EF_R64_SFLOAT, EF_R64G64_SFLOAT, EF_R64G64B64_SFLOAT, EF_R64G64B64A64_SFLOAT}}
+});
+//! Index of the table row whose `base` equals `_fmt`, or the table size when no row matches.
+inline constexpr uint32_t getStructuredFormatVariantIndex(const E_FORMAT _fmt)
+{
+	uint32_t row = 0u;
+	while (row < StructuredFormatVariants.size() && StructuredFormatVariants[row].base != _fmt)
+		++row;
+	return row;
+}
+template<E_FORMAT _fmt>
+inline constexpr uint32_t getStructuredFormatVariantIndex() {return getStructuredFormatVariantIndex(_fmt);}
+//! Format listed for `_channelCount` channels in row `_variantIndex`; `EF_UNKNOWN` when the index is past the table or the count is outside 1..StructuredFormatChannelVariantCount.
+inline constexpr E_FORMAT getStructuredFormatVariant(const uint32_t _variantIndex, const uint32_t _channelCount)
+{
+	if (_variantIndex >= StructuredFormatVariants.size())
+		return EF_UNKNOWN;
+	if (_channelCount == 0u || _channelCount > StructuredFormatChannelVariantCount)
+		return EF_UNKNOWN;
+	return StructuredFormatVariants[_variantIndex].variants[_channelCount - 1u];
+}
+template<uint32_t _channelCount>
+inline constexpr E_FORMAT getStructuredFormatVariant(const uint32_t _variantIndex)
+{
+	return getStructuredFormatVariant(_variantIndex, _channelCount);
+}
+}
+//! Looks `_fmt` up among the single-channel `base` entries of `impl::StructuredFormatVariants` and returns that row's format for `_channelCount` channels.
+//! Yields `EF_UNKNOWN` when `_fmt` is not one of those bases or `_channelCount` is outside 1..4.
+inline constexpr E_FORMAT getFormatWithChannelCount(const E_FORMAT _fmt, const uint32_t _channelCount)
+{
+	const uint32_t row = impl::getStructuredFormatVariantIndex(_fmt);
+	return impl::getStructuredFormatVariant(row, _channelCount);
+}
+template<E_FORMAT _fmt>
+inline constexpr E_FORMAT getFormatWithChannelCount(const uint32_t _channelCount)
+{
+	return getFormatWithChannelCount(_fmt, _channelCount);
+}
+template<E_FORMAT _fmt, uint32_t _channelCount>
+inline constexpr E_FORMAT getFormatWithChannelCount() {return getFormatWithChannelCount(_fmt, _channelCount);}
 
 /*
 inline uint32_t getBitsPerChannel(asset::E_FORMAT _fmt, uint8_t _channel)
@@ -1987,4 +2046,4 @@ namespace std
     };
 }
 
-#endif
\ No newline at end of file
+#endif
diff --git a/include/nbl/asset/interchange/COBJMeshWriter.h b/include/nbl/asset/interchange/COBJMeshWriter.h
new file mode 100644
index 0000000000..5446118246
--- /dev/null
+++ b/include/nbl/asset/interchange/COBJMeshWriter.h
@@ -0,0 +1,35 @@
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_ASSET_OBJ_MESH_WRITER_H_INCLUDED_
+#define _NBL_ASSET_OBJ_MESH_WRITER_H_INCLUDED_
+#include "nbl/asset/interchange/ISceneWriter.h"
+namespace nbl::asset
+{
+/**
+	Writes OBJ from a single polygon geometry, a geometry collection, or a scene.
+	OBJ itself is still treated here as final flattened geometry data, not as a scene format.
+	Scene input is accepted only as export input: the writer bakes transforms
+	and serializes all collected polygon geometries into one OBJ stream.
+	This preserves the final shape but does not try to keep scene-only structure
+	such as hierarchy or instancing.
+	In other words `ET_SCENE -> OBJ` is supported as flattening,
+	not as round-tripping scene semantics through the OBJ format.
+*/
+class COBJMeshWriter : public ISceneWriter
+{
+	public:
+		COBJMeshWriter();
+		//! Bitfield of accepted asset types. NOTE(review): presumably polygon geometry, geometry collection and scene per the class description - confirm in the definition.
+		uint64_t getSupportedAssetTypesBitfield() const override;
+		//! Same contract as the `IAssetWriter` declaration: array of string literals terminated by nullptr.
+		const char** getAssociatedFileExtensions() const override;
+		//! Flags supported for writing modes.
+		writer_flags_t getSupportedFlags() override;
+		//! Flags forced for writing modes.
+		writer_flags_t getForcedFlags() override;
+		//! Writes into `_file` using `_params`, with `_override` optional; the definition is not part of this header.
+		bool writeAsset(system::IFile* _file, const SAssetWriteParams& _params, IAssetWriterOverride* _override = nullptr) override;
+};
+} // end namespace
+#endif
diff --git a/include/nbl/asset/interchange/IAssetLoader.h b/include/nbl/asset/interchange/IAssetLoader.h
index 5354228278..9ba1e5e14a 100644
--- a/include/nbl/asset/interchange/IAssetLoader.h
+++ b/include/nbl/asset/interchange/IAssetLoader.h
@@ -1,22 +1,17 @@
-// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
 #ifndef _NBL_ASSET_I_ASSET_LOADER_H_INCLUDED_
 #define _NBL_ASSET_I_ASSET_LOADER_H_INCLUDED_
-
-
 #include "nbl/system/declarations.h"
-
 #include "nbl/system/ISystem.h"
 #include "nbl/system/ILogger.h"
-
+#include "nbl/core/util/bitflag.h"
 #include "nbl/asset/interchange/SAssetBundle.h"
+#include "nbl/asset/interchange/SFileIOPolicy.h"
 #include "nbl/asset/utils/CGeometryCreator.h"
-
-
 namespace nbl::asset
 {
-
 class CPolygonGeometryManipulator;
 
 //! A class automating process of loading Assets from resources, eg. files
@@ -59,7 +54,6 @@ class CPolygonGeometryManipulator;
 	@see IAssetManager
 	@see IAssetWriter
 */
-
 class NBL_API2 IAssetLoader : public virtual core::IReferenceCounted
 {
 	public:
@@ -75,6 +69,7 @@ class NBL_API2 IAssetLoader : public virtual core::IReferenceCounted
 			//! meaning identical as to ECF_DUPLICATE_TOP_LEVEL but for any asset in the chain
 			ECF_DUPLICATE_REFERENCES = 0xffffffffffffffffull
 		};
+		using caching_flags_t = core::bitflag<E_CACHING_FLAGS>;
 
 		//! Parameter flags for a loader
 		/**
@@ -91,17 +86,19 @@ class NBL_API2 IAssetLoader : public virtual core::IReferenceCounted
 			ELPF_NONE = 0,									//!< default value, it doesn't do anything
 //[[deprecated]] ELPF_RIGHT_HANDED_MESHES = 0x1,	//!< specifies that a mesh will be flipped in such a way that it'll look correctly in right-handed camera system
 //[[deprecated]] ELPF_DONT_COMPILE_GLSL = 0x2,		//!< it states that GLSL won't be compiled to SPIR-V if it is loaded or generated
-			ELPF_LOAD_METADATA_ONLY = 0x4					//!< it forces the loader to not load the entire scene for performance in special cases to fetch metadata.
+			ELPF_LOAD_METADATA_ONLY = 0x4,					//!< it forces the loader to not load the entire scene for performance in special cases to fetch metadata.
+			ELPF_DONT_COMPUTE_CONTENT_HASHES = 0x8			//!< opt-out from computing content hashes of produced buffers before returning.
 		};
+		using loader_flags_t = core::bitflag<E_LOADER_PARAMETER_FLAGS>;
 
 		struct SAssetLoadParams
 		{
 			inline SAssetLoadParams(const size_t _decryptionKeyLen = 0u, const uint8_t* const _decryptionKey = nullptr,
-				const E_CACHING_FLAGS _cacheFlags = ECF_CACHE_EVERYTHING,const E_LOADER_PARAMETER_FLAGS _loaderFlags = ELPF_NONE, 
-				const system::logger_opt_ptr _logger = nullptr, const std::filesystem::path& cwd = "") :
+				const caching_flags_t _cacheFlags = ECF_CACHE_EVERYTHING, const loader_flags_t _loaderFlags = ELPF_NONE,
+				const system::logger_opt_ptr _logger = nullptr, const std::filesystem::path& cwd = "", const SFileIOPolicy& _ioPolicy = {}) :
 					decryptionKeyLen(_decryptionKeyLen), decryptionKey(_decryptionKey),
 					cacheFlags(_cacheFlags), loaderFlags(_loaderFlags),
-					logger(std::move(_logger)), workingDirectory(cwd)
+					logger(std::move(_logger)), workingDirectory(cwd), ioPolicy(_ioPolicy)
 			{
 			}
 
@@ -111,16 +108,18 @@ class NBL_API2 IAssetLoader : public virtual core::IReferenceCounted
 				cacheFlags(rhs.cacheFlags),
 				loaderFlags(rhs.loaderFlags),
 				logger(rhs.logger),
-				workingDirectory(rhs.workingDirectory)
+				workingDirectory(rhs.workingDirectory),
+				ioPolicy(rhs.ioPolicy)
 			{
 			}
 
 			size_t decryptionKeyLen;
 			const uint8_t* decryptionKey;
-			E_CACHING_FLAGS cacheFlags;
-			E_LOADER_PARAMETER_FLAGS loaderFlags;				//!< Flags having an impact on extraordinary tasks during loading process
+			caching_flags_t cacheFlags;
+			loader_flags_t loaderFlags;			//!< Flags having an impact on extraordinary tasks during loading process
 			std::filesystem::path workingDirectory = "";
 			system::logger_opt_ptr logger;
+			SFileIOPolicy ioPolicy = {};
 		};
 
 		//! Struct for keeping the state of the current loadoperation for safe threading
@@ -133,37 +132,37 @@ class NBL_API2 IAssetLoader : public virtual core::IReferenceCounted
 		};
 
 		// following could be inlined
-		static E_CACHING_FLAGS ECF_DONT_CACHE_LEVEL(uint64_t N)
+		static caching_flags_t ECF_DONT_CACHE_LEVEL(uint64_t N) // top-level DONT_CACHE pattern moved up by two bits per hierarchy level
 		{
 			N *= 2ull;
-			return (E_CACHING_FLAGS)(ECF_DONT_CACHE_TOP_LEVEL << N);
+			return caching_flags_t(N < 64ull ? (static_cast<uint64_t>(ECF_DONT_CACHE_TOP_LEVEL) << N) : uint64_t{0}); // a shift by >=64 is UB, a level past the last representable one yields no flags
 		}
-		static E_CACHING_FLAGS ECF_DUPLICATE_LEVEL(uint64_t N)
+		static caching_flags_t ECF_DUPLICATE_LEVEL(uint64_t N) // top-level DUPLICATE pattern moved up by two bits per hierarchy level
 		{
 			N *= 2ull;
-			return (E_CACHING_FLAGS)(ECF_DUPLICATE_TOP_LEVEL << N);
+			return caching_flags_t(N < 64ull ? (static_cast<uint64_t>(ECF_DUPLICATE_TOP_LEVEL) << N) : uint64_t{0}); // guarded for the same reason as ECF_DONT_CACHE_LEVEL
 		}
-		static E_CACHING_FLAGS ECF_DONT_CACHE_FROM_LEVEL(uint64_t N)
+		static caching_flags_t ECF_DONT_CACHE_FROM_LEVEL(uint64_t N) // DONT_CACHE_REFERENCES mask with the bits of the first N levels shifted out
 		{
 			// (Criss) Shouldn't be set all DONT_CACHE bits from hierarchy numbers N-1 to 32 (64==2*32) ? Same for ECF_DUPLICATE_FROM_LEVEL below
 			N *= 2ull;
-			return (E_CACHING_FLAGS)(ECF_DONT_CACHE_REFERENCES << N);
+			return caching_flags_t(N < 64ull ? (static_cast<uint64_t>(ECF_DONT_CACHE_REFERENCES) << N) : uint64_t{0}); // starting past the last level leaves nothing set, and avoids the UB shift
 		}
-		static E_CACHING_FLAGS ECF_DUPLICATE_FROM_LEVEL(uint64_t N)
+		static caching_flags_t ECF_DUPLICATE_FROM_LEVEL(uint64_t N) // DUPLICATE_REFERENCES mask with the bits of the first N levels shifted out
 		{
 			N *= 2ull;
-			return (E_CACHING_FLAGS)(ECF_DUPLICATE_REFERENCES << N);
+			return caching_flags_t(N < 64ull ? (static_cast<uint64_t>(ECF_DUPLICATE_REFERENCES) << N) : uint64_t{0}); // starting past the last level leaves nothing set, and avoids the UB shift
 		}
-		static E_CACHING_FLAGS ECF_DONT_CACHE_UNTIL_LEVEL(uint64_t N)
+		static caching_flags_t ECF_DONT_CACHE_UNTIL_LEVEL(uint64_t N) // DONT_CACHE_REFERENCES mask restricted to the low 2*N bits
 		{
 			// (Criss) is this ok? Shouldn't be set all DONT_CACHE bits from hierarchy numbers 0 to N-1? Same for ECF_DUPLICATE_UNTIL_LEVEL below
 			N = 64ull - N * 2ull;
-			return (E_CACHING_FLAGS)(ECF_DONT_CACHE_REFERENCES >> N);
+			return caching_flags_t(N < 64ull ? (static_cast<uint64_t>(ECF_DONT_CACHE_REFERENCES) >> N) : (N > 64ull ? static_cast<uint64_t>(ECF_DONT_CACHE_REFERENCES) : uint64_t{0})); // N==64: level 0, nothing set; N>64: the subtraction wrapped (level>32), whole mask; a raw shift by >=64 would be UB
 		}
-		static E_CACHING_FLAGS ECF_DUPLICATE_UNTIL_LEVEL(uint64_t N)
+		static caching_flags_t ECF_DUPLICATE_UNTIL_LEVEL(uint64_t N) // DUPLICATE_REFERENCES mask restricted to the low 2*N bits
 		{
 			N = 64ull - N * 2ull;
-			return (E_CACHING_FLAGS)(ECF_DUPLICATE_REFERENCES >> N);
+			return caching_flags_t(N < 64ull ? (static_cast<uint64_t>(ECF_DUPLICATE_REFERENCES) >> N) : (N > 64ull ? static_cast<uint64_t>(ECF_DUPLICATE_REFERENCES) : uint64_t{0})); // N==64: level 0, nothing set; N>64: the subtraction wrapped (level>32), whole mask; a raw shift by >=64 would be UB
 		}
 
 		//! Override class to facilitate changing how assets are loaded
@@ -256,6 +255,8 @@ class NBL_API2 IAssetLoader : public virtual core::IReferenceCounted
 				//! Called before loading a file to determine the correct path (could be relative or absolute)
 				inline virtual void getLoadFilename(system::path& inOutFilename, const system::ISystem* sys,  const SAssetLoadContext& ctx, const uint32_t hierarchyLevel)
 				{
+					if (inOutFilename.is_absolute() || inOutFilename.has_root_path())
+						return;
 					// try compute absolute path
 					auto absolute = ctx.params.workingDirectory/inOutFilename;
 					if (sys->exists(absolute,system::IFile::ECF_READ))
diff --git a/include/nbl/asset/interchange/IAssetWriter.h b/include/nbl/asset/interchange/IAssetWriter.h
index 694053df5e..fca8e24124 100644
--- a/include/nbl/asset/interchange/IAssetWriter.h
+++ b/include/nbl/asset/interchange/IAssetWriter.h
@@ -3,17 +3,12 @@
 // For conditions of distribution and use, see copyright notice in nabla.h
 #ifndef _NBL_ASSET_I_ASSET_WRITER_H_INCLUDED_
 #define _NBL_ASSET_I_ASSET_WRITER_H_INCLUDED_
-
-
 #include "nbl/system/IFile.h"
 #include "nbl/system/ILogger.h"
-
 #include "nbl/asset/IAsset.h"
-
-
+#include "nbl/asset/interchange/SFileIOPolicy.h"
 namespace nbl::asset
 {
-
 //! Writing flags
 /**
 	They have an impact on writing (saving) an Asset.
@@ -36,9 +31,10 @@ enum E_WRITER_FLAGS : uint32_t
     //! write in binary format rather than text if possible
     EWF_BINARY = 1u << 2u,
 
-    //!< specifies the incoming orientation of loaded mesh we want to write. Flipping will be performed if needed in dependency of format extension orientation	
+    //!< specifies the incoming orientation of loaded mesh we want to write. Flipping will be performed if needed in dependency of format extension orientation
     EWF_MESH_IS_RIGHT_HANDED = 1u << 3u
 };
+using writer_flags_t = core::bitflag<E_WRITER_FLAGS>;
 
 //! A class that defines rules during Asset-writing (saving) process
 /**
@@ -85,21 +81,22 @@ class IAssetWriter : public virtual core::IReferenceCounted
 	*/
     struct SAssetWriteParams
     {
-        SAssetWriteParams(IAsset* _asset, const E_WRITER_FLAGS& _flags = EWF_NONE, const float& _compressionLevel = 0.f, const size_t& _encryptionKeyLen = 0, const uint8_t* _encryptionKey = nullptr, const void* _userData = nullptr, const system::logger_opt_ptr _logger = nullptr, system::path cwd = "") :
+        SAssetWriteParams(IAsset* _asset, const writer_flags_t _flags = EWF_NONE, const float& _compressionLevel = 0.f, const size_t& _encryptionKeyLen = 0, const uint8_t* _encryptionKey = nullptr, const void* _userData = nullptr, const system::logger_opt_ptr _logger = nullptr, system::path cwd = "", const SFileIOPolicy& _ioPolicy = {}) :
             rootAsset(_asset), flags(_flags), compressionLevel(_compressionLevel),
             encryptionKeyLen(_encryptionKeyLen), encryptionKey(_encryptionKey),
-            userData(_userData), logger(_logger), workingDirectory(cwd)
+            userData(_userData), logger(_logger), workingDirectory(cwd), ioPolicy(_ioPolicy)
         {
         }
 
         const IAsset* rootAsset;			//!< An Asset on which entire writing process is based.
-        E_WRITER_FLAGS flags;				//!< Flags set by user that defines rules during writing process.
+        writer_flags_t flags;				//!< Flags set by user that defines rules during writing process.
         float compressionLevel;				//!< The more compression level, the more expensive (slower) compression algorithm is launched.
         size_t encryptionKeyLen;			//!< Stores a size of data in encryptionKey pointer for correct iteration.
         const uint8_t* encryptionKey;		//!< Stores an encryption key used for encryption process.
         const void* userData;				//!< Stores writer-dependets parameters. It is usually a struct provided by a writer author.
         system::logger_opt_ptr logger;
         system::path workingDirectory;
+        SFileIOPolicy ioPolicy = {};
     };
 
     //! Struct for keeping the state of the current write operation for safe threading
@@ -116,9 +113,7 @@ class IAssetWriter : public virtual core::IReferenceCounted
         const SAssetWriteParams params;
         system::IFile* outputFile;
     };
-
 public:
-
     //! Returns an array of string literals terminated by nullptr
     virtual const char** getAssociatedFileExtensions() const = 0;
 
@@ -130,10 +125,10 @@ class IAssetWriter : public virtual core::IReferenceCounted
     virtual uint64_t getSupportedAssetTypesBitfield() const { return 0; }
 
     //! Returns which flags are supported for writing modes
-    virtual uint32_t getSupportedFlags() = 0;
+    virtual writer_flags_t getSupportedFlags() = 0;
 
     //! Returns which flags are forced for writing modes, i.e. a writer can only support binary
-    virtual uint32_t getForcedFlags() = 0;
+    virtual writer_flags_t getForcedFlags() = 0;
 
     //! Override class to facilitate changing how assets are written, especially the sub-assets
 	/*
@@ -146,7 +141,7 @@ class IAssetWriter : public virtual core::IReferenceCounted
         //! The only reason these functions are not declared static is to allow stateful overrides
     public:
         //! To allow the asset writer to write different sub-assets with different flags
-        inline virtual E_WRITER_FLAGS getAssetWritingFlags(const SAssetWriteContext& ctx, const IAsset* assetToWrite, const uint32_t& hierarchyLevel)
+        inline virtual writer_flags_t getAssetWritingFlags(const SAssetWriteContext& ctx, const IAsset* assetToWrite, const uint32_t& hierarchyLevel)
         {
             return ctx.params.flags;
         }
@@ -192,4 +187,4 @@ class IAssetWriter : public virtual core::IReferenceCounted
 };
 
 } //nbl::asset
-#endif
\ No newline at end of file
+#endif
diff --git a/include/nbl/asset/interchange/IGeometryLoader.h b/include/nbl/asset/interchange/IGeometryLoader.h
index 6e6c7c4e26..4f6321d7bc 100644
--- a/include/nbl/asset/interchange/IGeometryLoader.h
+++ b/include/nbl/asset/interchange/IGeometryLoader.h
@@ -1,21 +1,16 @@
-// Copyright (C) 2025-2025 - DevSH Graphics Programming Sp. z O.O.
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
 #ifndef _NBL_ASSET_I_GEOMETRY_LOADER_H_INCLUDED_
 #define _NBL_ASSET_I_GEOMETRY_LOADER_H_INCLUDED_
-
-
 #include "nbl/core/declarations.h"
-
 #include "nbl/asset/ICPUPolygonGeometry.h"
 #include "nbl/asset/interchange/IAssetLoader.h"
 #include "nbl/asset/interchange/IImageAssetHandlerBase.h"
 #include "nbl/asset/utils/CGeometryManipulator.h"
-
-
 namespace nbl::asset
 {
-
+//! Geometry loader base shared by mesh-style interchange formats.
 class IGeometryLoader : public IAssetLoader
 {
 	public:
@@ -24,6 +19,7 @@ class IGeometryLoader : public IAssetLoader
 	protected:
 		inline IGeometryLoader() {}
 
+		//! Creates one geometry data view from caller-owned memory or copied storage.
 		template<bool AdoptMemory=false>
 		static inline IGeometry<ICPUBuffer>::SDataView createView(
 			const E_FORMAT format, const size_t elementCount, const void* data=nullptr,
@@ -51,7 +47,8 @@ class IGeometryLoader : public IAssetLoader
 			}
 			return retval;
 		}
-		// creates a View from a mapped file
+
+		//! Memory resource that keeps a mapped file alive while adopted geometry views reference it.
 		class CFileMemoryResource final : public core::refctd_memory_resource
 		{
 			public:
@@ -71,6 +68,8 @@ class IGeometryLoader : public IAssetLoader
 			protected:
 				core::smart_refctd_ptr<system::IFile> m_file;
 		};
+
+		//! Creates one geometry data view backed directly by a mapped file or by copied file contents.
 		static inline IGeometry<ICPUBuffer>::SDataView createView(const E_FORMAT format, const size_t elementCount, core::smart_refctd_ptr<system::IFile>&& file, const size_t offsetInFile)
 		{
 			if (auto* const basePtr=reinterpret_cast<const uint8_t*>(file->getMappedPointer()); basePtr)
@@ -96,7 +95,5 @@ class IGeometryLoader : public IAssetLoader
 
 	private:
 };
-
 }
-
 #endif
diff --git a/include/nbl/asset/interchange/ISceneWriter.h b/include/nbl/asset/interchange/ISceneWriter.h
new file mode 100644
index 0000000000..94e4548270
--- /dev/null
+++ b/include/nbl/asset/interchange/ISceneWriter.h
@@ -0,0 +1,21 @@
+// Copyright (C) 2025-2025 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_ASSET_I_SCENE_WRITER_H_INCLUDED_
+#define _NBL_ASSET_I_SCENE_WRITER_H_INCLUDED_
+#include "nbl/core/declarations.h"
+#include "nbl/asset/ICPUScene.h"
+#include "nbl/asset/interchange/IAssetWriter.h"
+namespace nbl::asset
+{
+//! Writer base for exporters whose root asset type is `ET_SCENE`.
+class ISceneWriter : public IAssetWriter
+{
+	public:
+		virtual inline uint64_t getSupportedAssetTypesBitfield() const override { return IAsset::ET_SCENE; } //!< Reports `IAsset::ET_SCENE` as the only supported asset type.
+	protected:
+		ISceneWriter() = default; //!< Protected, so only derived writers can construct this base.
+		virtual ~ISceneWriter() = default; //!< Protected, so outside code cannot destroy a writer directly through this type.
+};
+}
+#endif
diff --git a/include/nbl/asset/interchange/SFileIOPolicy.h b/include/nbl/asset/interchange/SFileIOPolicy.h
new file mode 100644
index 0000000000..108f35addc
--- /dev/null
+++ b/include/nbl/asset/interchange/SFileIOPolicy.h
@@ -0,0 +1,189 @@
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_ASSET_S_FILE_IO_POLICY_H_INCLUDED_
+#define _NBL_ASSET_S_FILE_IO_POLICY_H_INCLUDED_
+#include "nbl/core/util/bitflag.h"
+#include "nbl/system/to_string.h"
+#include <algorithm>
+#include <bit>
+#include <cstdint>
+#include <limits>
+#include <string>
+#include <thread>
+namespace nbl::asset
+{
+//! Requested IO strategy selected before file size and mapping constraints are resolved.
+enum class EFileIOStrategy : uint8_t
+{
+    Invalid = 0u, //!< Sentinel used when strategy resolution fails or the value is uninitialized.
+    Auto, //!< Pick whole-file or chunked dynamically based on file size, mappability, and policy limits.
+    WholeFile, //!< Force whole-file strategy. May fall back to chunked when not feasible unless the strict flag is set.
+    Chunked //!< Force chunked strategy.
+};
+
+//! Requested IO policy shared by loaders, writers, and hash stages before file constraints are resolved.
+struct SFileIOPolicy
+{
+    //! Runtime tuning knobs shared by loader parallelism and IO anomaly diagnostics.
+    struct SRuntimeTuning
+    {
+        //! Runtime tuning strategy for worker/chunk selection.
+        enum class Mode : uint8_t
+        {
+            Sequential, //!< Disable runtime tuning and force sequential execution.
+            None = Sequential, //!< Backward-compatible alias for Sequential.
+            Heuristic, //!< Use deterministic heuristics derived from input size and hardware.
+            Hybrid //!< Use heuristics and optionally refine with lightweight sampling.
+        };
+
+        Mode mode = Mode::Heuristic; //!< Runtime tuning mode.
+        float maxOverheadRatio = 0.05f; //!< Maximum acceptable tuning overhead as a fraction of estimated full workload time.
+        float samplingBudgetRatio = 0.05f; //!< Maximum sampling budget as a fraction of estimated full workload time.
+        float minExpectedGainRatio = 0.03f; //!< Minimum expected gain required to keep extra workers enabled.
+        uint16_t maxWorkers = 0u; //!< Hard cap for worker count. 0 means auto.
+        uint8_t workerHeadroom = 2u; //!< Reserved hardware threads not used by the loader. Prevents full CPU saturation.
+        uint8_t samplingMaxCandidates = 4u; //!< Maximum number of worker-count candidates tested in hybrid mode.
+        uint8_t samplingPasses = 1u; //!< Number of benchmark passes per candidate in hybrid mode.
+        uint64_t samplingMinWorkUnits = 0ull; //!< Minimum work units required before hybrid sampling is allowed. 0 means auto.
+        uint8_t targetChunksPerWorker = 4u; //!< Target chunk count assigned to each worker for loader stages.
+        uint8_t hashTaskTargetChunksPerWorker = 1u; //!< Target chunk count assigned to each worker for hash stages.
+        uint64_t hashInlineThresholdBytes = 1ull << 20; //!< Hash inlining threshold. Inputs up to this size prefer inline hash build.
+        uint64_t minSampleBytes = 4ull << 10; //!< Lower bound for sampled byte count in hybrid mode.
+        uint64_t maxSampleBytes = 128ull << 10; //!< Upper bound for sampled byte count in hybrid mode.
+        uint64_t tinyIoPayloadThresholdBytes = 1ull << 20; //!< Payload size threshold for tiny-IO anomaly detection.
+        uint64_t tinyIoAvgBytesThreshold = 1024ull; //!< Average operation size threshold for tiny-IO anomaly detection.
+        uint64_t tinyIoMinBytesThreshold = 64ull; //!< Minimum operation size threshold for tiny-IO anomaly detection.
+        uint64_t tinyIoMinCallCount = 1024ull; //!< Minimum operation count required to report tiny-IO anomaly.
+        uint8_t chunkedInFlightDepth = 0u; //!< Chunked IO requests allowed in flight. 0 means auto, 1 disables pipelining.
+    };
+
+    using Strategy = EFileIOStrategy;
+
+    //! Extra resolution flags affecting fallback behavior.
+    enum E_FLAGS : uint8_t { EF_NONE = 0u, EF_STRICT_BIT = 1u << 0u };
+
+    static inline constexpr uint64_t MIN_CHUNK_SIZE_BYTES = 64ull << 10u; //!< 64 KiB.
+    static inline constexpr uint8_t MIN_CHUNK_SIZE_LOG2 = static_cast<uint8_t>(std::bit_width(MIN_CHUNK_SIZE_BYTES) - 1u);
+    static inline constexpr uint8_t MAX_BYTE_SIZE_LOG2 = std::numeric_limits<uint64_t>::digits - 1u;
+    static inline constexpr uint64_t DEFAULT_WHOLE_FILE_THRESHOLD_BYTES = 64ull << 20u; //!< 64 MiB.
+    static inline constexpr uint64_t DEFAULT_CHUNK_SIZE_BYTES = 4ull << 20u; //!< 4 MiB.
+    static inline constexpr uint64_t DEFAULT_MAX_STAGING_BYTES = 256ull << 20u; //!< 256 MiB.
+
+    //! These defaults are stored and clamped as log2(byte_count), so the source byte values must stay powers of two.
+    static_assert(std::has_single_bit(MIN_CHUNK_SIZE_BYTES));
+    static_assert(std::has_single_bit(DEFAULT_WHOLE_FILE_THRESHOLD_BYTES));
+    static_assert(std::has_single_bit(DEFAULT_CHUNK_SIZE_BYTES));
+    static_assert(std::has_single_bit(DEFAULT_MAX_STAGING_BYTES));
+
+    static inline constexpr uint8_t clampBytesLog2(const uint8_t value, const uint8_t minValue = 0u) { return std::clamp<uint8_t>(value, minValue, MAX_BYTE_SIZE_LOG2); }
+
+    static inline constexpr uint64_t bytesFromLog2(const uint8_t value, const uint8_t minValue = 0u) { return 1ull << clampBytesLog2(value, minValue); }
+
+    Strategy strategy = Strategy::Auto; //!< Requested IO strategy. Defaults to Auto.
+    core::bitflag<E_FLAGS> flags = EF_NONE; //!< Resolution flags. Defaults to none.
+
+    //! Maximum payload size allowed for whole-file strategy in auto mode encoded as log2(bytes). Defaults to 64 MiB.
+    uint8_t wholeFileThresholdLog2 = static_cast<uint8_t>(std::bit_width(DEFAULT_WHOLE_FILE_THRESHOLD_BYTES) - 1u);
+
+    //! Chunk size used by chunked strategy encoded as log2(bytes). Defaults to 4 MiB.
+    uint8_t chunkSizeLog2 = static_cast<uint8_t>(std::bit_width(DEFAULT_CHUNK_SIZE_BYTES) - 1u);
+
+    //! Maximum staging allocation for whole-file strategy encoded as log2(bytes). Defaults to 256 MiB.
+    uint8_t maxStagingLog2 = static_cast<uint8_t>(std::bit_width(DEFAULT_MAX_STAGING_BYTES) - 1u);
+
+    SRuntimeTuning runtimeTuning = {}; //!< Runtime tuning controls used by loaders and hash stages.
+
+    inline constexpr bool strict() const { return flags.hasAnyFlag(EF_STRICT_BIT); }
+    inline constexpr uint64_t wholeFileThresholdBytes() const { return bytesFromLog2(wholeFileThresholdLog2, MIN_CHUNK_SIZE_LOG2); }
+    inline constexpr uint64_t chunkSizeBytes() const { return bytesFromLog2(chunkSizeLog2, MIN_CHUNK_SIZE_LOG2); }
+    inline constexpr uint64_t maxStagingBytes() const { return bytesFromLog2(maxStagingLog2, MIN_CHUNK_SIZE_LOG2); }
+};
+
+//! Resolved IO plan chosen from SFileIOPolicy after considering file size, mapping, and staging limits.
+struct SResolvedFileIOPolicy
+{
+    using Strategy = EFileIOStrategy;
+
+    constexpr SResolvedFileIOPolicy() = default;
+    inline constexpr SResolvedFileIOPolicy(const SFileIOPolicy& policy, const uint64_t byteCount, const bool sizeKnown = true, const bool fileMappable = false) : SResolvedFileIOPolicy(resolve(policy, byteCount, sizeKnown, fileMappable)) {}
+    Strategy strategy = Strategy::Invalid; //!< Effective strategy chosen by resolver. Invalid means strict policy resolution failed.
+
+    //! Effective chunk size encoded as log2(bytes). Also set for whole-file for telemetry consistency.
+    uint8_t chunkSizeLog2 = SFileIOPolicy::MIN_CHUNK_SIZE_LOG2;
+    uint8_t chunkedInFlightDepth = 1u; //!< Resolved chunked in-flight depth. Non-chunked strategies always keep this at 1.
+
+    const char* reason = "invalid"; //!< Resolver reason string used in logs and diagnostics.
+
+    inline constexpr bool isValid() const { return strategy != Strategy::Invalid; }
+
+    inline constexpr uint64_t chunkSizeBytes() const { return SFileIOPolicy::bytesFromLog2(chunkSizeLog2, SFileIOPolicy::MIN_CHUNK_SIZE_LOG2); }
+
+    //! Resolves `policy` for a payload of `byteCount` bytes (`sizeKnown` = the byte count is known, `fileMappable` = the file can be mapped). Returns an `Invalid` plan for an invalid requested strategy or for a strict whole-file request that is not feasible.
+    static inline constexpr SResolvedFileIOPolicy resolve(const SFileIOPolicy& policy, const uint64_t byteCount, const bool sizeKnown = true, const bool fileMappable = false)
+    {
+        const uint8_t maxStagingLog2 = SFileIOPolicy::clampBytesLog2(policy.maxStagingLog2, SFileIOPolicy::MIN_CHUNK_SIZE_LOG2);
+        const uint8_t chunkSizeLog2 = std::min<uint8_t>(SFileIOPolicy::clampBytesLog2(policy.chunkSizeLog2, SFileIOPolicy::MIN_CHUNK_SIZE_LOG2), maxStagingLog2); // a chunk never exceeds the staging budget
+        const uint64_t maxStaging = SFileIOPolicy::bytesFromLog2(maxStagingLog2, SFileIOPolicy::MIN_CHUNK_SIZE_LOG2);
+        const uint64_t wholeThreshold = policy.wholeFileThresholdBytes();
+        const uint64_t chunkSizeBytes = SFileIOPolicy::bytesFromLog2(chunkSizeLog2, SFileIOPolicy::MIN_CHUNK_SIZE_LOG2);
+        // Divide first, then round up on a non-zero remainder: `byteCount + chunkSizeBytes - 1` wraps around for byte counts close to UINT64_MAX and would collapse the chunk count.
+        const uint64_t chunkCount = chunkSizeBytes ? std::max<uint64_t>(1ull, byteCount / chunkSizeBytes + ((byteCount % chunkSizeBytes) != 0ull ? 1ull : 0ull)) : 1ull;
+        auto resolveChunkedInFlightDepth = [&](const Strategy strategy) -> uint8_t {
+            if (strategy != Strategy::Chunked || chunkCount <= 1ull)
+                return 1u;
+            if (policy.runtimeTuning.chunkedInFlightDepth > 0u) // explicit depth requested, only capped by the number of chunks
+                return static_cast<uint8_t>(std::min<uint64_t>(policy.runtimeTuning.chunkedInFlightDepth, chunkCount));
+            const uint32_t hardwareThreads = policy.runtimeTuning.maxWorkers ? policy.runtimeTuning.maxWorkers : std::thread::hardware_concurrency();
+            const uint32_t usableThreads = hardwareThreads > policy.runtimeTuning.workerHeadroom ? (hardwareThreads - policy.runtimeTuning.workerHeadroom) : 1u;
+            return static_cast<uint8_t>(std::clamp<uint64_t>(usableThreads, 1ull, std::min<uint64_t>(chunkCount, std::numeric_limits<uint8_t>::max()))); // upper bound is >= 2 here because chunkCount > 1
+        };
+        auto makeResolved = [&](const Strategy strategy, const char* const reason) -> SResolvedFileIOPolicy { SResolvedFileIOPolicy resolved = {}; resolved.strategy = strategy; resolved.chunkSizeLog2 = chunkSizeLog2; resolved.chunkedInFlightDepth = resolveChunkedInFlightDepth(strategy); resolved.reason = reason; return resolved; };
+        switch (policy.strategy)
+        {
+            case SFileIOPolicy::Strategy::Invalid:
+                return makeResolved(Strategy::Invalid, "invalid_requested_strategy");
+            case SFileIOPolicy::Strategy::WholeFile:
+            {
+                if (fileMappable || (sizeKnown && byteCount <= maxStaging))
+                    return makeResolved(Strategy::WholeFile, fileMappable ? "requested_whole_file_mappable" : "requested_whole_file");
+                if (policy.strict()) // strict requests fail instead of silently degrading to chunked
+                    return makeResolved(Strategy::Invalid, "whole_file_not_feasible_strict");
+                return makeResolved(Strategy::Chunked, sizeKnown ? "whole_file_not_feasible_fallback_chunked" : "whole_file_unknown_size_fallback_chunked");
+            }
+            case SFileIOPolicy::Strategy::Chunked:
+                return makeResolved(Strategy::Chunked, "requested_chunked");
+            case SFileIOPolicy::Strategy::Auto: default:
+            {
+                if (fileMappable)
+                    return makeResolved(Strategy::WholeFile, sizeKnown ? "auto_mappable_prefers_whole_file" : "auto_unknown_size_mappable_whole_file");
+                if (!sizeKnown)
+                    return makeResolved(Strategy::Chunked, "auto_unknown_size");
+                const uint64_t wholeLimit = std::min<uint64_t>(wholeThreshold, maxStaging);
+                if (byteCount <= wholeLimit)
+                    return makeResolved(Strategy::WholeFile, "auto_small_enough_for_whole_file");
+                return makeResolved(Strategy::Chunked, "auto_too_large_for_whole_file");
+            }
+        }
+    }
+};
+}
+namespace nbl::system::impl
+{
+template<>
+struct to_string_helper<asset::EFileIOStrategy> // Specialization converting `asset::EFileIOStrategy` values to short lowercase names.
+{
+    static inline std::string __call(const asset::EFileIOStrategy value)
+    {
+        switch (value)
+        {
+            case asset::EFileIOStrategy::Invalid: return "invalid";
+            case asset::EFileIOStrategy::Auto: return "auto";
+            case asset::EFileIOStrategy::WholeFile: return "whole";
+            case asset::EFileIOStrategy::Chunked: return "chunked";
+            default: return "unknown"; // any value outside the declared enumerators
+        }
+    }
+};
+}
+#endif
diff --git a/include/nbl/asset/interchange/SGeometryContentHash.h b/include/nbl/asset/interchange/SGeometryContentHash.h
new file mode 100644
index 0000000000..c7353dea9b
--- /dev/null
+++ b/include/nbl/asset/interchange/SGeometryContentHash.h
@@ -0,0 +1,58 @@
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_ASSET_S_GEOMETRY_CONTENT_HASH_H_INCLUDED_
+#define _NBL_ASSET_S_GEOMETRY_CONTENT_HASH_H_INCLUDED_
+#include "nbl/asset/IPreHashed.h"
+#include "nbl/asset/utils/CPolygonGeometryManipulator.h"
+#include "nbl/core/hash/blake.h"
+namespace nbl::asset
+{
+//! Geometry-content-hash helper operating on all unique buffers referenced by one polygon geometry.
+class SPolygonGeometryContentHash
+{
+    public:
+        using mode_t = CPolygonGeometryManipulator::EContentHashMode;
+
+        //! Collects all unique buffers contributing to the geometry content hash.
+        static inline void collectBuffers(const ICPUPolygonGeometry* geometry, core::vector<core::smart_refctd_ptr<ICPUBuffer>>& buffers) { CPolygonGeometryManipulator::collectUniqueBuffers(geometry, buffers); }
+
+        //! Resets all referenced buffer hashes to `INVALID_HASH`.
+        static inline void reset(ICPUPolygonGeometry* geometry)
+        {
+            core::vector<core::smart_refctd_ptr<ICPUBuffer>> buffers;
+            collectBuffers(geometry, buffers);
+            for (auto& buffer : buffers)
+                if (buffer)
+                    buffer->setContentHash(IPreHashed::INVALID_HASH);
+        }
+
+        //! Composes the geometry hash from the current content hashes of all referenced buffers.
+        static inline core::blake3_hash_t composeHashFromBufferContentHashes(const ICPUPolygonGeometry* geometry)
+        {
+            if (!geometry)
+                return IPreHashed::INVALID_HASH;
+
+            core::blake3_hasher hashBuilder = {};
+            if (const auto* indexing = geometry->getIndexingCallback(); indexing)
+            {
+                hashBuilder << indexing->degree();
+                hashBuilder << indexing->rate();
+                hashBuilder << indexing->knownTopology();
+            }
+
+            core::vector<core::smart_refctd_ptr<ICPUBuffer>> buffers;
+            collectBuffers(geometry, buffers);
+            for (const auto& buffer : buffers)
+                hashBuilder << (buffer ? buffer->getContentHash() : IPreHashed::INVALID_HASH);
+            return static_cast<core::blake3_hash_t>(hashBuilder);
+        }
+
+        //! Computes missing buffer hashes and returns the composed geometry hash.
+        static inline core::blake3_hash_t computeMissing(ICPUPolygonGeometry* geometry, const SFileIOPolicy& ioPolicy) { CPolygonGeometryManipulator::computeMissingContentHashesParallel(geometry, ioPolicy); return composeHashFromBufferContentHashes(geometry); }
+
+        //! Recomputes all buffer hashes and returns the composed geometry hash.
+        static inline core::blake3_hash_t recompute(ICPUPolygonGeometry* geometry, const SFileIOPolicy& ioPolicy) { CPolygonGeometryManipulator::recomputeContentHashesParallel(geometry, ioPolicy); return composeHashFromBufferContentHashes(geometry); }
+};
+}
+#endif
diff --git a/include/nbl/asset/interchange/SGeometryLoaderCommon.h b/include/nbl/asset/interchange/SGeometryLoaderCommon.h
new file mode 100644
index 0000000000..dd2e1cf72e
--- /dev/null
+++ b/include/nbl/asset/interchange/SGeometryLoaderCommon.h
@@ -0,0 +1,128 @@
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_ASSET_S_GEOMETRY_LOADER_COMMON_H_INCLUDED_
+#define _NBL_ASSET_S_GEOMETRY_LOADER_COMMON_H_INCLUDED_
+#include <algorithm>
+#include <cassert>
+#include <ranges>
+#include <type_traits>
+#include "nbl/asset/SBufferAdoption.h"
+#include "nbl/asset/ICPUPolygonGeometry.h"
+namespace nbl::asset
+{
+//! Shared geometry-loader helpers for adopting buffers and assembling formatted data views.
+class SGeometryLoaderCommon
+{
+	public:
+		//! Creates one formatted data view over an existing CPU buffer.
+		static inline IGeometry<ICPUBuffer>::SDataView createDataView(core::smart_refctd_ptr<ICPUBuffer>&& buffer, const size_t byteCount, const uint32_t stride, const E_FORMAT format)
+		{
+			if (!buffer || byteCount == 0ull)
+				return {};
+			return {.composed = {.stride = stride, .format = format, .rangeFormat = IGeometryBase::getMatchingAABBFormat(format)}, .src = {.offset = 0ull, .size = byteCount, .buffer = std::move(buffer)}};
+		}
+		//! Tracks the widest scalar component format and highest component index seen for one structured attribute.
+		static inline void negotiateStructuredComponent(IGeometry<ICPUBuffer>::SDataViewBase& view, const E_FORMAT componentFormat, const uint8_t component)
+		{
+			assert(getFormatChannelCount(componentFormat) != 0u);
+			if (getTexelOrBlockBytesize(componentFormat) > getTexelOrBlockBytesize(view.format))
+				view.format = componentFormat;
+			view.stride = std::max<uint32_t>(view.stride, component);
+		}
+		//! Finalizes one structured base view and invokes `onComponent(offset,stride,componentFormat)` per component slot.
+		template<typename Fn>
+		static inline void finalizeStructuredBaseView(IGeometry<ICPUBuffer>::SDataViewBase& view, Fn&& onComponent)
+		{
+			if (view.format == EF_UNKNOWN)
+				return;
+			const auto componentFormat = view.format;
+			const auto componentCount = view.stride + 1u;
+			view.format = getFormatWithChannelCount(componentFormat, componentCount);
+			view.stride = getTexelOrBlockBytesize(view.format);
+			for (uint32_t c = 0u; c < componentCount; ++c)
+				onComponent(getTexelOrBlockBytesize(componentFormat) * c, view.stride, componentFormat);
+		}
+		//! Creates one owned data view with storage sized for `elementCount` items in `format`.
+		static inline IGeometry<ICPUBuffer>::SDataView createOwnedView(const E_FORMAT format, const size_t elementCount)
+		{
+			if (format == EF_UNKNOWN || elementCount == 0ull)
+				return {};
+			const auto stride = getTexelOrBlockBytesize(format);
+			auto buffer = ICPUBuffer::create({stride * elementCount});
+			return buffer ? createDataView(std::move(buffer), stride * elementCount, stride, format) : IGeometry<ICPUBuffer>::SDataView{};
+		}
+		//! Finalizes one structured base view, calls `onComponent`, and allocates the resulting owned data view.
+		template<typename Fn>
+		static inline IGeometry<ICPUBuffer>::SDataView createStructuredView(IGeometry<ICPUBuffer>::SDataViewBase& view, const size_t elementCount, Fn&& onComponent)
+		{
+			if (view.format == EF_UNKNOWN)
+				return {};
+			finalizeStructuredBaseView(view, std::forward<Fn>(onComponent));
+			return createOwnedView(view.format, elementCount);
+		}
+		//! Finalizes one structured view, appends per-component iterator bindings, rebases them against the allocated buffer, and passes the created view to `setter`.
+		template<typename IteratorContainer, typename PushComponent, typename RebaseComponent, typename Setter>
+		static inline void attachStructuredView(IGeometry<ICPUBuffer>::SDataViewBase& baseView, const size_t elementCount, IteratorContainer& iterators, PushComponent&& pushComponent, RebaseComponent&& rebaseComponent, Setter&& setter)
+		{
+			auto beginIx = iterators.size();
+			auto view = createStructuredView(baseView, elementCount, [&](const size_t offset, const uint32_t stride, const E_FORMAT componentFormat) -> void { pushComponent(iterators, offset, stride, componentFormat); });
+			if (!view)
+				return;
+			const auto basePtr = ptrdiff_t(view.src.buffer->getPointer()) + view.src.offset;
+			for (const auto endIx = iterators.size(); beginIx != endIx; ++beginIx)
+				rebaseComponent(iterators[beginIx], basePtr);
+			setter(std::move(view));
+		}
+		//! Visits position, normal, and auxiliary attribute views for one polygon geometry.
+		template<typename Visitor>
+		static inline void visitVertexAttributeViews(const ICPUPolygonGeometry* geometry, Visitor&& visitor)
+		{
+			if (!geometry)
+				return;
+			visitor(geometry->getPositionView());
+			visitor(geometry->getNormalView());
+			for (const auto& view : geometry->getAuxAttributeViews())
+				visitor(view);
+		}
+		//! Visits all views owned by one polygon geometry, including index and skeletal data.
+		template<typename Visitor>
+		static inline void visitGeometryViews(const ICPUPolygonGeometry* geometry, Visitor&& visitor)
+		{
+			if (!geometry)
+				return;
+			visitVertexAttributeViews(geometry, visitor);
+			visitor(geometry->getIndexView());
+			for (const auto& view : geometry->getJointWeightViews())
+			{
+				visitor(view.indices);
+				visitor(view.weights);
+			}
+			if (const auto jointObb = geometry->getJointOBBView(); jointObb)
+				visitor(*jointObb);
+		}
+		//! Stores one auxiliary view at `slot`, resizing the aux array as needed. Does nothing for a null geometry, an empty view, or when the geometry hands out no mutable aux array.
+		static inline void setAuxViewAt(ICPUPolygonGeometry* geometry, const uint32_t slot, IGeometry<ICPUBuffer>::SDataView&& view)
+		{
+			auto* const auxViews = (geometry && view) ? geometry->getAuxAttributeViews() : nullptr;
+			if (!auxViews) // the mutable accessor returns a pointer, which may be null (presumably for an immutable geometry - confirm)
+				return;
+			if (auxViews->size() <= slot)
+				auxViews->resize(slot + 1u);
+			(*auxViews)[slot] = std::move(view);
+		}
+
+		//! Adopts contiguous caller-owned storage into a CPU buffer and returns it as a data view of `Format` whose stride is the size of one storage element.
+		template<E_FORMAT Format, impl::AdoptedBufferStorage Storage>
+		static inline IGeometry<ICPUBuffer>::SDataView createAdoptedView(Storage&& data)
+		{
+			using element_t = std::ranges::range_value_t<std::remove_cvref_t<Storage>>;
+			auto adopted = SBufferAdoption::create(std::forward<Storage>(data));
+			if (!adopted)
+				return {};
+			const size_t byteCount = adopted->getSize(); // read the size up front so the call below does not touch `adopted` again after moving it
+			return createDataView(std::move(adopted), byteCount, static_cast<uint32_t>(sizeof(element_t)), Format);
+		}
+};
+}
+#endif
diff --git a/include/nbl/asset/interchange/SGeometryWriterCommon.h b/include/nbl/asset/interchange/SGeometryWriterCommon.h
new file mode 100644
index 0000000000..5c2055a5e5
--- /dev/null
+++ b/include/nbl/asset/interchange/SGeometryWriterCommon.h
@@ -0,0 +1,227 @@
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_ASSET_S_GEOMETRY_WRITER_COMMON_H_INCLUDED_
+#define _NBL_ASSET_S_GEOMETRY_WRITER_COMMON_H_INCLUDED_
+#include <concepts>
+#include "nbl/asset/ICPUScene.h"
+#include "nbl/asset/ICPUGeometryCollection.h"
+#include "nbl/asset/ICPUPolygonGeometry.h"
+#include "nbl/builtin/hlsl/math/linalg/fast_affine.hlsl"
+#include <array>
+#include <charconv>
+#include <cstdio>
+#include <cstring>
+#include <limits>
+#include <system_error>
+#include <type_traits>
+namespace nbl::asset
+{
+//! Shared writer-side helpers used by geometry exporters.
+class SGeometryWriterCommon
+{
+    public:
+        //! Context attached to one emitted geometry item: its transform plus the instance, morph-target and geometry indices.
+        struct SWriteState
+        {
+            //! Transform of the emitted geometry; identity unless a collector fills it in.
+            hlsl::float32_t3x4 transform = hlsl::math::linalg::identity<hlsl::float32_t3x4>();
+            uint32_t instanceIx = ~0u; //!< Scene instance index, or `~0u` when not applicable.
+            uint32_t targetIx = ~0u; //!< Morph-target index, or `~0u` when not applicable.
+            uint32_t geometryIx = 0u; //!< Geometry index inside the current collection; defaults to 0.
+        };
+        //! One polygon geometry plus the inherited `SWriteState` context; `geometry` is a non-owning pointer that defaults to null.
+        struct SPolygonGeometryWriteItem : SWriteState { const ICPUPolygonGeometry* geometry = nullptr; };
+
+        //! Collects polygon geometry items from a geometry, geometry collection, or scene root asset; null or unsupported roots give an empty container.
+        template<typename Container = core::vector<SPolygonGeometryWriteItem>> requires requires(Container& c, const SPolygonGeometryWriteItem& item) { c.emplace_back(item); }
+        static inline Container collectPolygonGeometryWriteItems(const IAsset* rootAsset)
+        {
+            Container out = {};
+            if (!rootAsset)
+                return out;
+            const auto identity = hlsl::math::linalg::identity<hlsl::float32_t3x4>();
+            auto appendFromCollection = [&](const ICPUGeometryCollection* collection, const hlsl::float32_t3x4& transform, const uint32_t instanceIx, const uint32_t targetIx) -> void {
+                if (!collection)
+                    return;
+                const auto& geometries = collection->getGeometries();
+                for (uint32_t geometryIx = 0u; geometryIx < geometries.size(); ++geometryIx)
+                {
+                    const auto& ref = geometries[geometryIx];
+                    if (!ref.geometry || ref.geometry->getPrimitiveType() != IGeometryBase::EPrimitiveType::Polygon) // null and non-polygon entries are skipped
+                        continue;
+                    SPolygonGeometryWriteItem item = {};
+                    item.geometry = static_cast<const ICPUPolygonGeometry*>(ref.geometry.get());
+                    item.transform = hlsl::math::linalg::promoted_mul(transform, ref.hasTransform() ? ref.transform : identity); // parent transform times the per-reference one
+                    item.instanceIx = instanceIx; item.targetIx = targetIx; item.geometryIx = geometryIx;
+                    out.emplace_back(item);
+                }
+            };
+            if (rootAsset->getAssetType() == IAsset::ET_GEOMETRY)
+            {
+                const auto* geometry = static_cast<const IGeometry<ICPUBuffer>*>(rootAsset);
+                if (geometry->getPrimitiveType() == IGeometryBase::EPrimitiveType::Polygon)
+                    out.emplace_back(SPolygonGeometryWriteItem{.geometry = static_cast<const ICPUPolygonGeometry*>(rootAsset)}); // stand-alone geometry keeps the default context
+                return out;
+            }
+            if (rootAsset->getAssetType() == IAsset::ET_GEOMETRY_COLLECTION)
+            {
+                appendFromCollection(static_cast<const ICPUGeometryCollection*>(rootAsset), identity, ~0u, ~0u);
+                return out;
+            }
+            if (rootAsset->getAssetType() != IAsset::ET_SCENE)
+                return out;
+            const auto* scene = static_cast<const ICPUScene*>(rootAsset);
+            const auto& instances = scene->getInstances();
+            const auto& morphTargets = instances.getMorphTargets();
+            const auto& initialTransforms = instances.getInitialTransforms();
+            for (uint32_t instanceIx = 0u; instanceIx < morphTargets.size(); ++instanceIx)
+            {
+                const auto* targets = morphTargets[instanceIx].get();
+                if (!targets)
+                    continue;
+                const auto instanceTransform = instanceIx < initialTransforms.size() ? initialTransforms[instanceIx] : identity; // missing or short transform list falls back to identity instead of reading out of bounds
+                const auto& targetList = targets->getTargets();
+                for (uint32_t targetIx = 0u; targetIx < targetList.size(); ++targetIx)
+                    appendFromCollection(targetList[targetIx].geoCollection.get(), instanceTransform, instanceIx, targetIx);
+            }
+            return out;
+        }
+        //! Tells whether `transform` compares equal to the 3x4 identity matrix.
+        static inline bool isIdentityTransform(const hlsl::float32_t3x4& transform) { const auto identity = hlsl::math::linalg::identity<hlsl::float32_t3x4>(); return transform == identity; }
+        //! Looks up the auxiliary view at `auxViewIx`; yields null when it is out of range, unset, or fails a non-zero `requiredElementCount`.
+        static inline const ICPUPolygonGeometry::SDataView* getAuxViewAt(const ICPUPolygonGeometry* geom, const uint32_t auxViewIx, const size_t requiredElementCount = 0ull)
+        {
+            if (geom == nullptr)
+                return nullptr;
+            const auto& views = geom->getAuxAttributeViews();
+            if (views.size() <= auxViewIx)
+                return nullptr;
+            const auto& candidate = views[auxViewIx];
+            // The element count is only inspected for a set view, and only when a requirement was given.
+            const bool usable =
+                candidate &&
+                (requiredElementCount == 0ull || candidate.getElementCount() == requiredElementCount);
+            return usable ? &candidate : nullptr;
+        }
+        //! Computes the triangle count of a polygon geometry, from the index view when present and from the vertices otherwise.
+        //! `outFaceCount` is zeroed up front and only receives the count when the function returns true.
+        static inline bool getTriangleFaceCount(const ICPUPolygonGeometry* geom, size_t& outFaceCount)
+        {
+            outFaceCount = 0ull;
+            if (geom == nullptr)
+                return false;
+            // A geometry without position elements is rejected, indexed or not.
+            const size_t vertexCount = geom->getPositionView().getElementCount();
+            if (!vertexCount)
+                return false;
+            // Corners are counted from the index view if there is one, else from the vertices.
+            size_t cornerCount = vertexCount;
+            const auto& indices = geom->getIndexView();
+            if (indices)
+                cornerCount = indices.getElementCount();
+            // Only whole triangles are accepted.
+            if (cornerCount % 3ull)
+                return false;
+            outFaceCount = cornerCount / 3ull;
+            return true;
+        }
+        //! Visits triangle indices as validated `uint32_t` triplets; returns false on malformed input or when a bool-returning visitor returns false.
+        template<typename Visitor>
+        static inline bool visitTriangleIndices(const ICPUPolygonGeometry* geom, Visitor&& visitor)
+        {
+            if (!geom)
+                return false;
+            const auto& positionView = geom->getPositionView();
+            const size_t vertexCount = positionView.getElementCount();
+            if (vertexCount == 0ull)
+                return false;
+            auto visit = [&]<typename IndexT>(const IndexT i0, const IndexT i1, const IndexT i2)->bool
+            {
+                const uint32_t u0 = static_cast<uint32_t>(i0);
+                const uint32_t u1 = static_cast<uint32_t>(i1);
+                const uint32_t u2 = static_cast<uint32_t>(i2);
+                if (u0 >= vertexCount || u1 >= vertexCount || u2 >= vertexCount) // every corner must address an existing vertex
+                    return false;
+                if constexpr (std::is_same_v<std::invoke_result_t<Visitor&, uint32_t, uint32_t, uint32_t>, bool>)
+                    return visitor(u0, u1, u2);
+                else { visitor(u0, u1, u2); return true; } // non-bool visitors cannot abort the walk
+            };
+            const auto& indexView = geom->getIndexView();
+            if (!indexView)
+            {
+                if ((vertexCount % 3ull) != 0ull || vertexCount > std::numeric_limits<uint32_t>::max()) // larger counts would wrap the 32-bit counter below and never terminate
+                    return false;
+                for (uint32_t i = 0u; i < vertexCount; i += 3u)
+                    if (!visit(i + 0u, i + 1u, i + 2u))
+                        return false;
+                return true;
+            }
+            const size_t indexCount = indexView.getElementCount();
+            if ((indexCount % 3ull) != 0ull)
+                return false;
+            const void* const src = indexView.getPointer();
+            if (!src)
+                return false;
+            auto visitIndexed = [&]<typename IndexT>()->bool
+            {
+                const auto* indices = reinterpret_cast<const IndexT*>(src);
+                for (size_t i = 0ull; i < indexCount; i += 3ull)
+                    if (!visit(indices[i + 0ull], indices[i + 1ull], indices[i + 2ull]))
+                        return false;
+                return true;
+            };
+            switch (geom->getIndexType())
+            {
+                case EIT_32BIT: return visitIndexed.template operator()<uint32_t>();
+                case EIT_16BIT: return visitIndexed.template operator()<uint16_t>();
+                default: return false; // any other index type is rejected
+            }
+        }
+        //! Gives typed access to a view's storage, but only when the view is set, has `ExpectedFormat`, and its stride equals `sizeof(T)`; null otherwise.
+        template<typename T, E_FORMAT ExpectedFormat>
+        static inline const T* getTightView(const ICPUPolygonGeometry::SDataView& view) { const bool tight = view && view.composed.format == ExpectedFormat && view.composed.getStride() == sizeof(T); return tight ? reinterpret_cast<const T*>(view.getPointer()) : nullptr; }
+        //! Writes a single-precision `value` as text into `[dst, end)` through the shared floating-point backend.
+        static inline char* appendFloatToBuffer(char* dst, char* end, float value) { return appendFloatingPointToBuffer<float>(dst, end, value); }
+        //! Writes a double-precision `value` as text into `[dst, end)` through the shared floating-point backend.
+        static inline char* appendFloatToBuffer(char* dst, char* end, double value) { return appendFloatingPointToBuffer<double>(dst, end, value); }
+        //! Formats `value` as decimal digits into `[dst, end)`; returns the position after the digits, or `end` when the range is unusable or too small.
+        static inline char* appendUIntToBuffer(char* dst, char* const end, const uint32_t value)
+        {
+            if (dst == nullptr || !(dst < end))
+                return end;
+            const size_t capacity = static_cast<size_t>(end - dst);
+            if (const auto converted = std::to_chars(dst, end, value); converted.ec == std::errc())
+                return converted.ptr;
+            // `std::to_chars` reported an error: retry through `snprintf` and clamp the result to `end`.
+            const int printed = std::snprintf(dst, capacity, "%u", value);
+            if (printed <= 0)
+                return dst;
+            return (static_cast<size_t>(printed) < capacity) ? (dst + printed) : end;
+        }
+    private:
+        //! Common implementation behind both `appendFloatToBuffer` overloads; returns the position after the text, or `end` when it does not fit.
+        template<typename T>
+        static inline char* appendFloatingPointToBuffer(char* dst, char* const end, const T value)
+        {
+            static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>);
+            if (dst == nullptr || !(dst < end))
+                return end;
+            if (const auto converted = std::to_chars(dst, end, value); converted.ec == std::errc())
+                return converted.ptr;
+            // Fallback: print with `max_digits10` significant digits into a local scratch array first.
+            constexpr int SignificantDigits = std::numeric_limits<T>::max_digits10;
+            std::array<char, SignificantDigits + 9ull> printed = {};
+            const int length = std::snprintf(printed.data(), printed.size(), "%.*g", SignificantDigits, static_cast<double>(value));
+            if (length <= 0)
+                return dst;
+            // Copy only when the whole text fits into what is left of the destination range.
+            const size_t byteCount = static_cast<size_t>(length);
+            if (byteCount > static_cast<size_t>(end - dst))
+                return end;
+            std::memcpy(dst, printed.data(), byteCount);
+            return dst + byteCount;
+        }
+};
+}
+#endif
diff --git a/include/nbl/asset/interchange/SInterchangeIO.h b/include/nbl/asset/interchange/SInterchangeIO.h
new file mode 100644
index 0000000000..953e3142d5
--- /dev/null
+++ b/include/nbl/asset/interchange/SInterchangeIO.h
@@ -0,0 +1,235 @@
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_ASSET_S_INTERCHANGE_IO_H_INCLUDED_
+#define _NBL_ASSET_S_INTERCHANGE_IO_H_INCLUDED_
+#include "nbl/asset/interchange/SFileIOPolicy.h"
+#include "nbl/system/IFile.h"
+#include <algorithm>
+#include <chrono>
+#include <concepts>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <optional>
+#include <span>
+namespace nbl::asset
+{
+//! Shared read/write helpers that execute a resolved IO plan and collect simple telemetry.
+class SInterchangeIO
+{
+    public:
+        //! Accumulates per-call IO statistics (call count, byte total, smallest call) for tiny-io diagnostics.
+        struct STelemetry
+        {
+            uint64_t callCount = 0ull; //!< How many IO calls were accounted.
+            uint64_t totalBytes = 0ull; //!< Bytes processed by all accounted calls together.
+            uint64_t minBytes = std::numeric_limits<uint64_t>::max(); //!< Fewest bytes any single accounted call processed.
+
+            inline void account(const uint64_t bytes)
+            {
+                callCount += 1ull;
+                totalBytes += bytes;
+                minBytes = std::min(minBytes, bytes);
+            }
+
+            //! Both getters report zero until at least one call has been accounted.
+            inline uint64_t getMinOrZero() const { return callCount != 0ull ? minBytes : 0ull; }
+            inline uint64_t getAvgOrZero() const { return callCount != 0ull ? totalBytes / callCount : 0ull; }
+        };
+        using SReadTelemetry = STelemetry;
+        using SWriteTelemetry = STelemetry;
+        //! Flags large payloads that were served through suspiciously small IO calls; never flags when no call was recorded.
+        //! Defaults are 1 MiB, 1 KiB, 64 B, and 1024 calls.
+        static inline bool isTinyIOTelemetryLikely(const STelemetry& telemetry, const uint64_t payloadBytes, const uint64_t bigPayloadThresholdBytes = (1ull << 20), const uint64_t lowAvgBytesThreshold = 1024ull, const uint64_t tinyChunkBytesThreshold = 64ull, const uint64_t tinyChunkCallsThreshold = 1024ull)
+        {
+            if (payloadBytes <= bigPayloadThresholdBytes || telemetry.callCount == 0ull) // zero recorded calls would read as a zero average and flag falsely
+                return false;
+            const uint64_t minBytes = telemetry.getMinOrZero();
+            const uint64_t avgBytes = telemetry.getAvgOrZero();
+            return avgBytes < lowAvgBytesThreshold || (minBytes < tinyChunkBytesThreshold && telemetry.callCount > tinyChunkCallsThreshold); // low average, or many calls with at least one tiny chunk
+        }
+        //! Policy-driven form of the tiny-io heuristic: the four thresholds are read from `ioPolicy.runtimeTuning`.
+        static inline bool isTinyIOTelemetryLikely(const STelemetry& telemetry, const uint64_t payloadBytes, const SFileIOPolicy& ioPolicy) { const auto& tuning = ioPolicy.runtimeTuning; return isTinyIOTelemetryLikely(telemetry, payloadBytes, tuning.tinyIoPayloadThresholdBytes, tuning.tinyIoAvgBytesThreshold, tuning.tinyIoMinBytesThreshold, tuning.tinyIoMinCallCount); }
+        //! Performs a single read of `bytes` bytes at `offset` and succeeds only when the full byte count was processed.
+        static inline bool readFileExact(system::IFile* file, void* dst, const size_t offset, const size_t bytes, SReadTelemetry* ioTelemetry = nullptr)
+        {
+            if (file == nullptr || (dst == nullptr && bytes != 0ull)) return false;
+            if (bytes == 0ull) return true; // nothing to read, so no IO call is issued
+            system::IFile::success_t readResult;
+            file->read(readResult, dst, offset, bytes);
+            if (readResult && ioTelemetry != nullptr) ioTelemetry->account(readResult.getBytesProcessed());
+            return readResult && readResult.getBytesProcessed() == bytes;
+        }
+
+        /**
+            Reads a byte range using the resolved whole-file or chunked strategy; any non-whole-file strategy is read in chunks.
+            When `ioTime` is non-null it also reports wall time in `TimeUnit` (milliseconds by default), on failure too.
+            A zero in-flight depth in `ioPlan` is treated as one so the chunked loop always makes progress.
+        */
+        template<typename TimeUnit = std::chrono::duration<double, std::milli>>
+        requires std::same_as<TimeUnit, std::chrono::duration<typename TimeUnit::rep, typename TimeUnit::period>>
+        static inline bool readFileWithPolicy(system::IFile* file, void* dst, const size_t offset, const size_t bytes, const SResolvedFileIOPolicy& ioPlan, SReadTelemetry* ioTelemetry = nullptr, TimeUnit* ioTime = nullptr)
+        {
+            using clock_t = std::chrono::high_resolution_clock;
+            const auto ioStart = ioTime ? clock_t::now() : clock_t::time_point{};
+            auto finalize = [&](const bool ok) -> bool { if (ioTime) *ioTime = std::chrono::duration_cast<TimeUnit>(clock_t::now() - ioStart); return ok; };
+            if (!file || (!dst && bytes != 0ull))
+                return finalize(false);
+            if (bytes == 0ull)
+                return finalize(true);
+            auto* out = reinterpret_cast<uint8_t*>(dst);
+            switch (ioPlan.strategy)
+            {
+                case SResolvedFileIOPolicy::Strategy::WholeFile:
+                    return finalize(readFileExact(file, out, offset, bytes, ioTelemetry));
+                case SResolvedFileIOPolicy::Strategy::Chunked:
+                default:
+                {
+                    const size_t inFlightDepth = std::max<size_t>(ioPlan.chunkedInFlightDepth, 1ull); // depth 0 would let the loop below spin forever
+                    auto inFlight = std::make_unique<SChunkedRequest[]>(inFlightDepth); // ring of request slots
+                    size_t submitOffset = 0ull;
+                    size_t activeCount = 0ull;
+                    size_t submitIndex = 0ull;
+                    size_t drainIndex = 0ull;
+                    const uint64_t chunkSizeBytes = ioPlan.chunkSizeBytes();
+                    auto submitChunk = [&]() -> bool {
+                        if (submitOffset >= bytes || activeCount >= inFlightDepth) // nothing left to submit, or every slot is busy
+                            return false;
+                        auto& request = inFlight[submitIndex];
+                        const size_t toRead = static_cast<size_t>(std::min<uint64_t>(chunkSizeBytes, bytes - submitOffset));
+                        request.success.emplace();
+                        file->read(*request.success, out + submitOffset, offset + submitOffset, toRead);
+                        request.bytes = toRead;
+                        request.active = true;
+                        submitOffset += toRead;
+                        submitIndex = (submitIndex + 1ull) % inFlightDepth;
+                        ++activeCount;
+                        return true;
+                    };
+                    auto drainChunk = [&]() -> bool {
+                        auto& request = inFlight[drainIndex]; // requests are drained in submission order
+                        if (!request.active)
+                            return false;
+                        const bool ok = drainChunkedRequest(request, ioTelemetry);
+                        drainIndex = (drainIndex + 1ull) % inFlightDepth;
+                        --activeCount;
+                        return ok;
+                    };
+                    while (submitOffset < bytes || activeCount)
+                    {
+                        while (submitChunk()) {} // fill every free slot before draining the oldest request
+                        if (activeCount && !drainChunk())
+                            return finalize(false);
+                    }
+                    return finalize(true);
+                }
+            }
+        }
+        //! Describes one contiguous, non-owned byte range that is written as part of a larger output stream.
+        struct SBufferRange
+        {
+            const void* data = nullptr; //!< Start of the contiguous byte range; may be null only when `byteCount` is 0.
+            size_t byteCount = 0ull; //!< Number of bytes to write from `data`; zero-sized ranges are skipped by the writers.
+        };
+        //! Writes one or more buffers sequentially at `fileOffset`, advancing it after each completed buffer; a zero in-flight depth is treated as one.
+        static inline bool writeBuffersWithPolicyAtOffset(system::IFile* file, const SResolvedFileIOPolicy& ioPlan, const std::span<const SBufferRange> buffers, size_t& fileOffset, SWriteTelemetry* ioTelemetry = nullptr)
+        {
+            if (!file) return false;
+            const uint64_t chunkSizeBytes = ioPlan.chunkSizeBytes();
+            for (const auto& buffer : buffers)
+            {
+                if (!buffer.data && buffer.byteCount != 0ull) return false;
+                if (buffer.byteCount == 0ull)
+                    continue;
+                const auto* data = reinterpret_cast<const uint8_t*>(buffer.data);
+                size_t writtenTotal = 0ull;
+                if (ioPlan.strategy == SResolvedFileIOPolicy::Strategy::WholeFile)
+                {
+                    const size_t toWrite = buffer.byteCount; // the whole buffer goes out in one write call
+                    system::IFile::success_t success;
+                    file->write(success, data, fileOffset, toWrite);
+                    if (!success)
+                        return false;
+                    const size_t written = success.getBytesProcessed();
+                    if (written == 0ull)
+                        return false;
+                    if (ioTelemetry)
+                        ioTelemetry->account(written);
+                    writtenTotal += written;
+                }
+                else
+                {
+                    const size_t inFlightDepth = std::max<size_t>(ioPlan.chunkedInFlightDepth, 1ull); // depth 0 would let the loop below spin forever
+                    auto inFlight = std::make_unique<SChunkedRequest[]>(inFlightDepth); // ring of request slots
+                    size_t submitOffset = 0ull;
+                    size_t activeCount = 0ull;
+                    size_t submitIndex = 0ull;
+                    size_t drainIndex = 0ull;
+                    auto submitChunk = [&]() -> bool {
+                        if (submitOffset >= buffer.byteCount || activeCount >= inFlightDepth) // nothing left to submit, or every slot is busy
+                            return false;
+                        auto& request = inFlight[submitIndex];
+                        const size_t toWrite = static_cast<size_t>(std::min<uint64_t>(chunkSizeBytes, buffer.byteCount - submitOffset));
+                        request.success.emplace();
+                        file->write(*request.success, data + submitOffset, fileOffset + submitOffset, toWrite);
+                        request.bytes = toWrite;
+                        request.active = true;
+                        submitOffset += toWrite;
+                        submitIndex = (submitIndex + 1ull) % inFlightDepth;
+                        ++activeCount;
+                        return true;
+                    };
+                    auto drainChunk = [&]() -> bool {
+                        auto& request = inFlight[drainIndex]; // requests are drained in submission order
+                        if (!request.active)
+                            return false;
+                        const bool ok = drainChunkedRequest(request, ioTelemetry);
+                        if (ok)
+                            writtenTotal += request.bytes;
+                        drainIndex = (drainIndex + 1ull) % inFlightDepth;
+                        --activeCount;
+                        return ok;
+                    };
+                    while (submitOffset < buffer.byteCount || activeCount)
+                    {
+                        while (submitChunk()) {} // fill every free slot before draining the oldest request
+                        if (activeCount && !drainChunk())
+                            return false;
+                    }
+                }
+                fileOffset += writtenTotal; // only reached when the current buffer did not fail
+            }
+            return true;
+        }
+        //! Writes the given buffers back to back, beginning at file offset `0`; the final offset is discarded.
+        static inline bool writeBuffersWithPolicy(system::IFile* file, const SResolvedFileIOPolicy& ioPlan, const std::span<const SBufferRange> buffers, SWriteTelemetry* ioTelemetry = nullptr) { size_t cursor = 0ull; const bool ok = writeBuffersWithPolicyAtOffset(file, ioPlan, buffers, cursor, ioTelemetry); return ok; }
+        //! Wraps `data`/`byteCount` into one `SBufferRange` and forwards it to `writeBuffersWithPolicyAtOffset`.
+        static inline bool writeFileWithPolicyAtOffset(system::IFile* file, const SResolvedFileIOPolicy& ioPlan, const void* data, size_t byteCount, size_t& fileOffset, SWriteTelemetry* ioTelemetry = nullptr) { const SBufferRange single = {.data = data, .byteCount = byteCount}; return writeBuffersWithPolicyAtOffset(file, ioPlan, std::span<const SBufferRange>(&single, 1ull), fileOffset, ioTelemetry); }
+        //! Wraps `data`/`byteCount` into one `SBufferRange` and forwards it to `writeBuffersWithPolicy`.
+        static inline bool writeFileWithPolicy(system::IFile* file, const SResolvedFileIOPolicy& ioPlan, const void* data, size_t byteCount, SWriteTelemetry* ioTelemetry = nullptr) { const SBufferRange single = {.data = data, .byteCount = byteCount}; return writeBuffersWithPolicy(file, ioPlan, std::span<const SBufferRange>(&single, 1ull), ioTelemetry); }
+    private:
+        struct SChunkedRequest // one slot of the in-flight ring used by the chunked read/write paths
+        {
+            std::optional<system::IFile::success_t> success = std::nullopt; // engaged on submission, reset when the slot is drained
+            size_t bytes = 0ull; // byte count requested for this chunk
+            bool active = false; // true between submission and drain
+        };
+        static inline bool drainChunkedRequest(SChunkedRequest& request, STelemetry* ioTelemetry)
+        {
+            // Collect the outcome first, then return the slot to its idle state regardless of the result.
+            const size_t done = request.success.has_value() ? request.success->getBytesProcessed() : 0ull;
+            request.success.reset();
+            request.active = false;
+            const bool complete = done != 0ull && done == request.bytes;
+            if (complete && ioTelemetry != nullptr)
+                ioTelemetry->account(done);
+            return complete;
+        }
+};
+using SFileIOTelemetry = SInterchangeIO::STelemetry; //!< Namespace-level alias of the shared telemetry record.
+using SFileReadTelemetry = SInterchangeIO::SReadTelemetry; //!< Namespace-level alias of the read telemetry type.
+using SFileWriteTelemetry = SInterchangeIO::SWriteTelemetry; //!< Namespace-level alias of the write telemetry type.
+}
+#endif
diff --git a/include/nbl/asset/interchange/SLoaderRuntimeTuning.h b/include/nbl/asset/interchange/SLoaderRuntimeTuning.h
new file mode 100644
index 0000000000..e180325606
--- /dev/null
+++ b/include/nbl/asset/interchange/SLoaderRuntimeTuning.h
@@ -0,0 +1,303 @@
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_ASSET_S_LOADER_RUNTIME_TUNING_H_INCLUDED_
+#define _NBL_ASSET_S_LOADER_RUNTIME_TUNING_H_INCLUDED_
+#include "nbl/asset/interchange/SFileIOPolicy.h"
+#include <algorithm>
+#include <atomic>
+#include <chrono>
+#include <concepts>
+#include <cstdint>
+#include <limits>
+#include <thread>
+#include <utility>
+#include <vector>
+namespace nbl::asset
+{
+//! Input describing one loader or hash stage that needs worker and chunk sizing. Consumed by `SLoaderRuntimeTuner::tune`.
+struct SLoaderRuntimeTuningRequest
+{
+    uint64_t inputBytes = 0ull; //!< Total input bytes for the tuned stage. 0 means unknown and skips the byte-based worker cap.
+    uint64_t totalWorkUnits = 0ull; //!< Total amount of stage work in logical units. 0 makes `tune` return zero chunks.
+    uint64_t minWorkUnitsPerWorker = 1ull; //!< Minimum work units assigned to one worker. 0 is treated as 1.
+    uint64_t minBytesPerWorker = 1ull; //!< Minimum input bytes assigned to one worker. 0 is treated as 1.
+    uint32_t hardwareThreads = 0u; //!< Hardware thread count override. 0 means auto-detect.
+    uint32_t hardMaxWorkers = 0u; //!< Hard cap for workers for this request. 0 means no extra cap.
+    uint32_t targetChunksPerWorker = 0u; //!< Preferred chunk count per worker for this stage. 0 means policy default.
+    uint64_t minChunkWorkUnits = 1ull; //!< Minimum work units in one chunk. 0 is treated as 1.
+    uint64_t maxChunkWorkUnits = std::numeric_limits<uint64_t>::max(); //!< Maximum work units in one chunk. Raised to minChunkWorkUnits when smaller.
+    const uint8_t* sampleData = nullptr; //!< Pointer to representative sample bytes for hybrid sampling. Null disables sampling.
+    uint64_t sampleBytes = 0ull; //!< Number of sample bytes available at sampleData. 0 disables sampling; capped to inputBytes when that is non-zero.
+    uint64_t sampleMinWorkUnits = 0ull; //!< Minimum work units required to allow sampling. 0 means policy or auto value.
+    uint32_t samplePasses = 0u; //!< Sampling pass count override. 0 means policy default.
+    uint32_t sampleMaxCandidates = 0u; //!< Sampling candidate count override. 0 means policy default. Values below 2 are raised to 2.
+};
+//! Final worker and chunk layout selected for one stage. Produced by `SLoaderRuntimeTuner::tune`.
+struct SLoaderRuntimeTuningResult
+{
+    size_t workerCount = 1ull; //!< Selected worker count for the stage. `tune` never reports less than 1.
+    size_t chunkCount = 1ull; //!< Total chunk count for the stage. 0 when the request has no work units.
+    uint64_t chunkWorkUnits = 1ull; //!< Work units per chunk assigned by tuner. 0 when the request has no work units.
+};
+//! Stateless runtime tuner used by loaders and hash stages to size worker pools and chunking.
+struct SLoaderRuntimeTuner
+{
+    private:
+        //! Aggregated timings collected while probing one worker-count candidate. All fields stay 0 when no observation was recorded.
+        struct SBenchmarkSampleStats
+        {
+            uint64_t medianNs = 0ull; //!< Median of the recorded observations (mean of the two middle values for an even count).
+            uint64_t minNs = 0ull; //!< Fastest recorded observation.
+            uint64_t maxNs = 0ull; //!< Slowest recorded observation.
+            uint64_t totalNs = 0ull; //!< Sum of all recorded observations; the warm-up pass is not included.
+        };
+    public:
+        /**
+            Dispatches workers `1..N-1` on `std::jthread` and runs worker `0` on the caller thread; returns only after every worker finished (the jthreads join on scope exit).
+            `fn` is called concurrently with distinct indices, so it must be safe to invoke from several threads. Any value returned by `fn` is discarded.
+        */
+        template<typename Fn>
+        requires std::invocable<Fn&, size_t>
+        static void dispatchWorkers(const size_t workerCount, Fn&& fn)
+        {
+            if (workerCount <= 1ull)
+                return static_cast<void>(fn(0ull)); // the cast keeps callables with a non-void result well-formed
+            std::vector<std::jthread> workers;
+            workers.reserve(workerCount - 1ull);
+            for (size_t workerIx = 1ull; workerIx < workerCount; ++workerIx)
+                workers.emplace_back([&fn, workerIx]() { fn(workerIx); });
+            fn(0ull);
+        }
+
+        //! Integer ceil division that cannot wrap around for large operands. Callers must pass a non-zero denominator.
+        static constexpr uint64_t ceilDiv(const uint64_t numerator, const uint64_t denominator) { return numerator / denominator + ((numerator % denominator) != 0ull ? 1ull : 0ull); }
+
+        /**
+            Measures one sampled memory-touch configuration: every pass splits `sampleData` into `workerCount` contiguous slices, each worker sums the bytes of its slice,
+            and the whole `dispatchWorkers` call is timed (thread start-up and join included). Returns the summed wall time of all passes, or zero for a null/empty sample or zero workers.
+        */
+        template<typename TimeUnit = std::chrono::nanoseconds>
+        requires std::same_as<TimeUnit, std::chrono::duration<typename TimeUnit::rep, typename TimeUnit::period>>
+        static inline TimeUnit benchmarkSample(const uint8_t* const sampleData, const uint64_t sampleBytes, const size_t workerCount, const uint32_t passes)
+        {
+            if (!sampleData || sampleBytes == 0ull || workerCount == 0ull)
+                return TimeUnit::zero();
+            const uint32_t passCount = std::max<uint32_t>(1u, passes); // a pass count of 0 still runs once
+            std::vector<uint64_t> partial(workerCount, 0ull); // one accumulator slot per worker, so workers never write the same element
+            uint64_t elapsedNs = 0ull;
+            using clock_t = std::chrono::steady_clock;
+            for (uint32_t passIx = 0u; passIx < passCount; ++passIx)
+            {
+                const auto passStart = clock_t::now();
+                SLoaderRuntimeTuner::dispatchWorkers(workerCount, [&](const size_t workerIx)
+                {
+                    // Slice [begin, end) of the sample owned by this worker; slices may be empty when workerCount exceeds sampleBytes.
+                    const uint64_t begin = (sampleBytes * workerIx) / workerCount;
+                    const uint64_t end = (sampleBytes * (workerIx + 1ull)) / workerCount;
+                    const uint8_t* ptr = sampleData + begin;
+                    uint64_t local = 0ull;
+                    for (uint64_t i = 0ull, count = end - begin; i < count; ++i)
+                        local += static_cast<uint64_t>(ptr[i]);
+                    partial[workerIx] ^= local;
+                });
+                elapsedNs += static_cast<uint64_t>(std::chrono::duration_cast<std::chrono::nanoseconds>(clock_t::now() - passStart).count());
+            }
+            uint64_t reduced = 0ull;
+            for (const uint64_t v : partial)
+                reduced ^= v;
+            static std::atomic<uint64_t> sink = 0ull; // presumably keeps the byte sums observable so the touch loop is not optimized away
+            sink.fetch_xor(reduced, std::memory_order_relaxed);
+            return std::chrono::duration_cast<TimeUnit>(std::chrono::nanoseconds(elapsedNs));
+        }
+
+        //! Runs one untimed warm-up pass, then gathers up to `observations` (at least one) timings for the given worker count and summarizes them; zero-length timings are dropped.
+        static inline SBenchmarkSampleStats benchmarkSampleStats(const uint8_t* const sampleData, const uint64_t sampleBytes, const size_t workerCount, const uint32_t passes, const uint32_t observations)
+        {
+            if (!sampleData || sampleBytes == 0ull || workerCount == 0ull)
+                return SBenchmarkSampleStats{};
+            benchmarkSample(sampleData, sampleBytes, workerCount, 1u); // warm-up, result intentionally ignored
+            const uint32_t wanted = std::max<uint32_t>(1u, observations);
+            std::vector<uint64_t> timings;
+            timings.reserve(wanted);
+            uint64_t sumNs = 0ull;
+            for (uint32_t remaining = wanted; remaining != 0u; --remaining)
+            {
+                const uint64_t ns = static_cast<uint64_t>(benchmarkSample(sampleData, sampleBytes, workerCount, passes).count());
+                if (ns == 0ull)
+                    continue; // below clock resolution, carries no information
+                sumNs += ns;
+                timings.push_back(ns);
+            }
+            if (timings.empty())
+                return SBenchmarkSampleStats{};
+            std::sort(timings.begin(), timings.end());
+            const size_t mid = timings.size() / 2ull;
+            SBenchmarkSampleStats stats = {};
+            stats.totalNs = sumNs;
+            stats.minNs = timings.front();
+            stats.maxNs = timings.back();
+            stats.medianNs = (timings.size() & 1ull) != 0ull ? timings[mid] : (timings[mid - 1ull] + timings[mid]) / 2ull;
+            return stats;
+        }
+        //! Appends `candidate` to `dst` unless it is zero or already listed, so probe order follows first insertion.
+        static inline void appendCandidate(std::vector<size_t>& dst, const size_t candidate) { if (candidate == 0ull) return; if (std::find(dst.cbegin(), dst.cend(), candidate) != dst.cend()) return; dst.push_back(candidate); }
+        //! Picks the hybrid-tuning sample size: 1/64 of the known input, kept inside the policy's [min, max] sample window and never above the input size. Returns 0 for unknown input.
+        static inline uint64_t resolveSampleBytes(const SFileIOPolicy& ioPolicy, const uint64_t knownInputBytes)
+        {
+            if (knownInputBytes == 0ull)
+                return 0ull;
+            const auto& tuning = ioPolicy.runtimeTuning;
+            const uint64_t policyFloor = std::max<uint64_t>(1ull, tuning.minSampleBytes);
+            const uint64_t policyCeil = std::max<uint64_t>(policyFloor, tuning.maxSampleBytes);
+            const uint64_t lower = std::min<uint64_t>(policyFloor, knownInputBytes);
+            const uint64_t upper = std::min<uint64_t>(policyCeil, knownInputBytes);
+            return std::min<uint64_t>(std::max<uint64_t>(knownInputBytes / 64ull, lower), upper);
+        }
+        //! True when `inputBytes` does not exceed the policy's inline-hash threshold (a threshold of 0 is treated as 1), i.e. the hash build stays on the caller thread.
+        static inline bool shouldInlineHashBuild(const SFileIOPolicy& ioPolicy, const uint64_t inputBytes) { const uint64_t threshold = std::max<uint64_t>(1ull, ioPolicy.runtimeTuning.hashInlineThresholdBytes); return inputBytes <= threshold; }
+        //! Returns `requested` when non-zero, otherwise `std::thread::hardware_concurrency()`, falling back to 1 when that reports 0.
+        static inline size_t resolveHardwareThreads(const uint32_t requested = 0u) { if (requested != 0u) return static_cast<size_t>(requested); const size_t detected = static_cast<size_t>(std::thread::hardware_concurrency()); return detected != 0ull ? detected : static_cast<size_t>(1ull); }
+        //! Subtracts `workerHeadroom` from the hardware thread count, never dropping below 2 workers on parallel hardware (1 on a single thread); zero headroom returns the full count.
+        static inline size_t resolveHardMaxWorkers(const size_t hardwareThreads, const uint32_t workerHeadroom)
+        {
+            const size_t available = std::max<size_t>(1ull, hardwareThreads);
+            const size_t reserved = static_cast<size_t>(workerHeadroom);
+            if (reserved == 0ull)
+                return available;
+            const size_t floorWorkers = std::min<size_t>(available, 2ull); // 1 on single-thread hardware, otherwise 2
+            return reserved < available ? std::max<size_t>(floorWorkers, available - reserved) : floorWorkers;
+        }
+        //! Resolves worker and chunk counts for one stage: hard limits (hardware, policy, per-worker minimums) first, then a gain/overhead gate, then optional hybrid sampling that may move the count within those hard limits.
+        static inline SLoaderRuntimeTuningResult tune(const SFileIOPolicy& ioPolicy, const SLoaderRuntimeTuningRequest& request)
+        {
+            using RTMode = SFileIOPolicy::SRuntimeTuning::Mode;
+            SLoaderRuntimeTuningResult result = {};
+            if (request.totalWorkUnits == 0ull)
+                return SLoaderRuntimeTuningResult{ .workerCount = 1ull, .chunkCount = 0ull, .chunkWorkUnits = 0ull }; // nothing to split
+            const size_t hw = SLoaderRuntimeTuner::resolveHardwareThreads(request.hardwareThreads);
+            size_t maxWorkers = hw;
+            if (request.hardMaxWorkers > 0u)
+                maxWorkers = std::min(maxWorkers, static_cast<size_t>(request.hardMaxWorkers));
+            if (ioPolicy.runtimeTuning.maxWorkers > 0u)
+                maxWorkers = std::min(maxWorkers, static_cast<size_t>(ioPolicy.runtimeTuning.maxWorkers));
+            maxWorkers = std::max<size_t>(1ull, maxWorkers);
+            const uint64_t minWorkUnitsPerWorker = std::max<uint64_t>(1ull, request.minWorkUnitsPerWorker);
+            const uint64_t minBytesPerWorker = std::max<uint64_t>(1ull, request.minBytesPerWorker);
+            const size_t maxByWork = static_cast<size_t>(SLoaderRuntimeTuner::ceilDiv(request.totalWorkUnits, minWorkUnitsPerWorker));
+            const size_t maxByBytes = request.inputBytes ? static_cast<size_t>(SLoaderRuntimeTuner::ceilDiv(request.inputBytes, minBytesPerWorker)) : maxWorkers;
+            const bool heuristicEnabled = ioPolicy.runtimeTuning.mode != RTMode::Sequential;
+            const bool hybridEnabled = ioPolicy.runtimeTuning.mode == RTMode::Hybrid;
+            // Hard ceiling from hardware, policy and the per-worker minimums; the sampling stage below must not exceed it.
+            const size_t hardWorkerLimit = std::max<size_t>(1ull, std::min({ maxWorkers, maxByWork, maxByBytes }));
+            size_t workerCount = heuristicEnabled ? hardWorkerLimit : 1ull;
+            const size_t targetChunksPerWorker = std::max<size_t>(1ull, static_cast<size_t>(request.targetChunksPerWorker ? request.targetChunksPerWorker : ioPolicy.runtimeTuning.targetChunksPerWorker));
+            if (workerCount > 1ull && heuristicEnabled)
+            {
+                const double maxOverheadRatio = std::max(0.0, static_cast<double>(ioPolicy.runtimeTuning.maxOverheadRatio));
+                const double minExpectedGainRatio = std::clamp(static_cast<double>(ioPolicy.runtimeTuning.minExpectedGainRatio), 0.0, 0.99);
+                while (workerCount > 1ull)
+                {
+                    const double idealGain = 1.0 - (1.0 / static_cast<double>(workerCount));
+                    const double overheadRatio = static_cast<double>(workerCount * targetChunksPerWorker) / static_cast<double>(std::max<uint64_t>(1ull, request.totalWorkUnits));
+                    if (idealGain < minExpectedGainRatio || overheadRatio > maxOverheadRatio)
+                    {
+                        --workerCount;
+                        continue;
+                    }
+                    break;
+                }
+            }
+            const size_t heuristicWorkerCount = std::max<size_t>(1ull, workerCount);
+            if (heuristicEnabled && hybridEnabled && request.sampleData != nullptr && request.sampleBytes > 0ull && heuristicWorkerCount > 1ull && maxWorkers > 1ull)
+            {
+                const uint64_t autoMinSamplingWorkUnits = std::max<uint64_t>(static_cast<uint64_t>(targetChunksPerWorker) * 8ull, static_cast<uint64_t>(maxWorkers * targetChunksPerWorker));
+                const uint64_t minSamplingWorkUnits = request.sampleMinWorkUnits ? request.sampleMinWorkUnits : (ioPolicy.runtimeTuning.samplingMinWorkUnits ? ioPolicy.runtimeTuning.samplingMinWorkUnits : autoMinSamplingWorkUnits);
+                if (request.totalWorkUnits >= minSamplingWorkUnits)
+                {
+                    const double samplingBudgetRatio = std::clamp(static_cast<double>(ioPolicy.runtimeTuning.samplingBudgetRatio), 0.0, 0.5);
+                    uint64_t effectiveSampleBytes = request.sampleBytes;
+                    if (request.inputBytes)
+                        effectiveSampleBytes = std::min<uint64_t>(effectiveSampleBytes, request.inputBytes);
+                    if (effectiveSampleBytes > 0ull && samplingBudgetRatio > 0.0)
+                    {
+                        if (request.inputBytes > 0ull)
+                        {
+                            // Keep probing lightweight: sample fraction scales with input and parallelism.
+                            const uint64_t sampleDivisor = std::max<uint64_t>(4ull, static_cast<uint64_t>(heuristicWorkerCount) * static_cast<uint64_t>(targetChunksPerWorker));
+                            const uint64_t adaptiveSampleBytes = std::max<uint64_t>(1ull, request.inputBytes / sampleDivisor);
+                            effectiveSampleBytes = std::min<uint64_t>(effectiveSampleBytes, adaptiveSampleBytes);
+                        }
+                        const uint32_t samplePasses = request.samplePasses ? request.samplePasses : ioPolicy.runtimeTuning.samplingPasses;
+                        uint32_t maxCandidates = request.sampleMaxCandidates ? request.sampleMaxCandidates : ioPolicy.runtimeTuning.samplingMaxCandidates;
+                        maxCandidates = std::max<uint32_t>(2u, maxCandidates);
+                        std::vector<size_t> candidates;
+                        candidates.reserve(std::min<size_t>(static_cast<size_t>(maxCandidates), 5ull)); // at most five candidates are appended below, so never reserve a caller-sized amount
+                        appendCandidate(candidates, heuristicWorkerCount);
+                        appendCandidate(candidates, heuristicWorkerCount > 1ull ? (heuristicWorkerCount - 1ull) : 1ull);
+                        appendCandidate(candidates, std::min<size_t>(hardWorkerLimit, heuristicWorkerCount + 1ull));
+                        if (heuristicWorkerCount > 2ull)
+                            appendCandidate(candidates, heuristicWorkerCount - 2ull);
+                        if (heuristicWorkerCount + 2ull <= hardWorkerLimit)
+                            appendCandidate(candidates, heuristicWorkerCount + 2ull);
+                        if (candidates.size() > maxCandidates)
+                            candidates.resize(maxCandidates);
+                        // Probe heuristic first and only continue when budget can amortize additional probes.
+                        const auto heuristicStatsProbe = benchmarkSampleStats(request.sampleData, effectiveSampleBytes, heuristicWorkerCount, samplePasses, 2u);
+                        if (heuristicStatsProbe.medianNs > 0ull)
+                        {
+                            const double scale = request.inputBytes ? (static_cast<double>(request.inputBytes) / static_cast<double>(effectiveSampleBytes)) : 1.0;
+                            const uint64_t estimatedFullNs = static_cast<uint64_t>(static_cast<double>(heuristicStatsProbe.medianNs) * std::max(1.0, scale));
+                            const uint64_t samplingBudgetNs = static_cast<uint64_t>(static_cast<double>(estimatedFullNs) * samplingBudgetRatio);
+                            uint64_t spentNs = heuristicStatsProbe.totalNs;
+                            const size_t alternativeCandidates = (candidates.size() > 0ull) ? (candidates.size() - 1ull) : 0ull;
+                            if (alternativeCandidates > 0ull && spentNs < samplingBudgetNs)
+                            {
+                                const uint64_t spareBudgetNs = samplingBudgetNs - spentNs;
+                                const uint64_t estimatedEvalNs = std::max<uint64_t>(1ull, heuristicStatsProbe.medianNs);
+                                const uint64_t estimatedEvaluations = std::max<uint64_t>(1ull, spareBudgetNs / estimatedEvalNs);
+                                const uint32_t observations = static_cast<uint32_t>(std::clamp<uint64_t>(estimatedEvaluations / static_cast<uint64_t>(alternativeCandidates), 1ull, 3ull));
+                                SBenchmarkSampleStats bestStats = heuristicStatsProbe;
+                                size_t bestWorker = heuristicWorkerCount;
+                                for (const size_t candidate : candidates)
+                                {
+                                    if (candidate == heuristicWorkerCount)
+                                        continue;
+                                    if (spentNs >= samplingBudgetNs)
+                                        break;
+                                    const auto candidateStats = benchmarkSampleStats(
+                                        request.sampleData, effectiveSampleBytes, candidate, samplePasses, observations);
+                                    if (candidateStats.medianNs == 0ull)
+                                        continue;
+                                    spentNs += candidateStats.totalNs;
+                                    if (candidateStats.medianNs < bestStats.medianNs)
+                                        bestStats = candidateStats, bestWorker = candidate;
+                                }
+                                if (bestWorker != heuristicWorkerCount)
+                                {
+                                    // bestStats.medianNs is strictly below the heuristic median here, so the unsigned subtraction cannot wrap.
+                                    const double gain = static_cast<double>(heuristicStatsProbe.medianNs - bestStats.medianNs) / static_cast<double>(heuristicStatsProbe.medianNs);
+                                    const uint64_t heuristicSpan = heuristicStatsProbe.maxNs - heuristicStatsProbe.minNs;
+                                    const uint64_t bestSpan = bestStats.maxNs - bestStats.minNs;
+                                    const double heuristicNoise = static_cast<double>(heuristicSpan) / static_cast<double>(std::max<uint64_t>(1ull, heuristicStatsProbe.medianNs));
+                                    const double bestNoise = static_cast<double>(bestSpan) / static_cast<double>(std::max<uint64_t>(1ull, bestStats.medianNs)); const double requiredGain = std::max(std::clamp(static_cast<double>(ioPolicy.runtimeTuning.minExpectedGainRatio), 0.0, 0.99), std::clamp(std::max(heuristicNoise, bestNoise) * 1.25, 0.0, 0.99));
+                                    if (gain >= requiredGain)
+                                        workerCount = bestWorker;
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+            result.workerCount = std::max<size_t>(1ull, workerCount);
+            const uint64_t minChunkWorkUnits = std::max<uint64_t>(1ull, request.minChunkWorkUnits);
+            uint64_t maxChunkWorkUnits = std::max<uint64_t>(minChunkWorkUnits, request.maxChunkWorkUnits);
+            const uint64_t desiredChunkCount = static_cast<uint64_t>(std::max<size_t>(1ull, result.workerCount * targetChunksPerWorker));
+            uint64_t chunkWorkUnits = SLoaderRuntimeTuner::ceilDiv(request.totalWorkUnits, desiredChunkCount);
+            chunkWorkUnits = std::clamp<uint64_t>(chunkWorkUnits, minChunkWorkUnits, maxChunkWorkUnits);
+            result.chunkWorkUnits = chunkWorkUnits;
+            result.chunkCount = static_cast<size_t>(SLoaderRuntimeTuner::ceilDiv(request.totalWorkUnits, chunkWorkUnits));
+            return result;
+        }
+};
+}
+#endif
diff --git a/include/nbl/asset/interchange/SOBJPolygonGeometryAuxLayout.h b/include/nbl/asset/interchange/SOBJPolygonGeometryAuxLayout.h
new file mode 100644
index 0000000000..ed2743e493
--- /dev/null
+++ b/include/nbl/asset/interchange/SOBJPolygonGeometryAuxLayout.h
@@ -0,0 +1,14 @@
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_ASSET_S_OBJ_POLYGON_GEOMETRY_AUX_LAYOUT_H_INCLUDED_
+#define _NBL_ASSET_S_OBJ_POLYGON_GEOMETRY_AUX_LAYOUT_H_INCLUDED_
+namespace nbl::asset
+{
+//! Public OBJ aux-view slot ids shared by loader and writer code. NOTE(review): uses `uint32_t` but this header includes nothing - confirm `<cstdint>` always reaches it first.
+struct SOBJPolygonGeometryAuxLayout
+{
+	static inline constexpr uint32_t UV0 = 0u; //!< Slot id 0; presumably the aux view carrying the first UV set.
+};
+}
+#endif
diff --git a/include/nbl/asset/interchange/SPLYPolygonGeometryAuxLayout.h b/include/nbl/asset/interchange/SPLYPolygonGeometryAuxLayout.h
new file mode 100644
index 0000000000..66a1f3d692
--- /dev/null
+++ b/include/nbl/asset/interchange/SPLYPolygonGeometryAuxLayout.h
@@ -0,0 +1,14 @@
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_ASSET_S_PLY_POLYGON_GEOMETRY_AUX_LAYOUT_H_INCLUDED_
+#define _NBL_ASSET_S_PLY_POLYGON_GEOMETRY_AUX_LAYOUT_H_INCLUDED_
+namespace nbl::asset
+{
+//! Public PLY aux-view slot ids shared by loader and writer code. NOTE(review): uses `uint32_t` but this header includes nothing - confirm `<cstdint>` always reaches it first.
+struct SPLYPolygonGeometryAuxLayout
+{
+	static inline constexpr uint32_t UV0 = 0u; //!< Slot id 0; presumably the aux view carrying the first UV set.
+};
+}
+#endif
diff --git a/include/nbl/asset/interchange/SSTLPolygonGeometryAuxLayout.h b/include/nbl/asset/interchange/SSTLPolygonGeometryAuxLayout.h
new file mode 100644
index 0000000000..a49b16b6ee
--- /dev/null
+++ b/include/nbl/asset/interchange/SSTLPolygonGeometryAuxLayout.h
@@ -0,0 +1,14 @@
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_ASSET_S_STL_POLYGON_GEOMETRY_AUX_LAYOUT_H_INCLUDED_
+#define _NBL_ASSET_S_STL_POLYGON_GEOMETRY_AUX_LAYOUT_H_INCLUDED_
+namespace nbl::asset
+{
+//! Public STL aux-view slot ids shared by loader and writer code. NOTE(review): uses `uint32_t` but this header includes nothing - confirm `<cstdint>` always reaches it first.
+struct SSTLPolygonGeometryAuxLayout
+{
+	static inline constexpr uint32_t COLOR0 = 0u; //!< Slot id 0; presumably the aux view carrying the first color set.
+};
+}
+#endif
diff --git a/include/nbl/asset/metadata/CPLYMetadata.h b/include/nbl/asset/metadata/CPLYMetadata.h
index 39ad07561a..ec7112bd0a 100644
--- a/include/nbl/asset/metadata/CPLYMetadata.h
+++ b/include/nbl/asset/metadata/CPLYMetadata.h
@@ -6,6 +6,8 @@
 
 
 #include "nbl/asset/metadata/IAssetMetadata.h"
+#include <string>
+#include <string_view>
 
 
 namespace nbl::asset
@@ -13,12 +15,38 @@ namespace nbl::asset
 
 class CPLYMetadata final : public IAssetMetadata
 {
-    public:        
-        CPLYMetadata() : IAssetMetadata() {}
+    public:
+		class CPolygonGeometry : public IPolygonGeometryMetadata //!< Per-geometry PLY metadata: the base polygon metadata plus a list of aux attribute names.
+		{
+			public:
+				using IPolygonGeometryMetadata::IPolygonGeometryMetadata;
+				inline CPolygonGeometry& operator=(CPolygonGeometry&& other) //!< Move-assigns the base part, then exchanges the name lists, so `other` ends up holding this object's previous names.
+				{
+					IPolygonGeometryMetadata::operator=(std::move(other)); // NOTE(review): `other` is used again below - assumes the base operator only consumes the base subobject
+					std::swap(m_auxAttributeNames, other.m_auxAttributeNames);
+					return *this;
+				}
+				inline std::string_view getAuxAttributeName(const uint32_t auxViewIx) const //!< Name stored at index `auxViewIx`, or an empty view when the index is out of range.
+				{
+					return auxViewIx < m_auxAttributeNames.size() ? std::string_view(m_auxAttributeNames[auxViewIx]) : std::string_view{};
+				}
+				core::vector<std::string> m_auxAttributeNames; //!< Names looked up by `getAuxAttributeName`; the returned views stay valid only while this vector is unchanged.
+		};
+        CPLYMetadata(const uint32_t geometryCount = 0u) : IAssetMetadata(), m_geometryMetaStorage(createContainer<CPolygonGeometry>(geometryCount)) {} //!< Creates the per-geometry metadata container with `geometryCount` entries.
 
         _NBL_STATIC_INLINE_CONSTEXPR const char* LoaderName = "CPLYMeshFileLoader";
         const char* getLoaderName() const override { return LoaderName; }
+	private:
+		meta_container_t<CPolygonGeometry> m_geometryMetaStorage;
+		friend class CPLYMeshFileLoader;
+		inline void placeMeta(const uint32_t offset, const ICPUPolygonGeometry* geometry, core::vector<std::string>&& auxAttributeNames) //!< Resets storage entry `offset`, gives it the names and registers it as metadata of `geometry`; `offset` is not range-checked here.
+		{
+			auto& slot = (*m_geometryMetaStorage)[offset];
+			slot = CPolygonGeometry{};
+			slot.m_auxAttributeNames = std::move(auxAttributeNames);
+			IAssetMetadata::insertAssetSpecificMetadata(geometry, &slot);
+		}
 };
 
 }
-#endif
\ No newline at end of file
+#endif
diff --git a/include/nbl/asset/utils/CPolygonGeometryManipulator.h b/include/nbl/asset/utils/CPolygonGeometryManipulator.h
index c3bed0e49e..1d971776ec 100644
--- a/include/nbl/asset/utils/CPolygonGeometryManipulator.h
+++ b/include/nbl/asset/utils/CPolygonGeometryManipulator.h
@@ -8,9 +8,11 @@
 #include "nbl/core/declarations.h"
 
 #include "nbl/asset/ICPUPolygonGeometry.h"
+#include "nbl/asset/interchange/SFileIOPolicy.h"
 #include "nbl/asset/utils/CGeometryManipulator.h"
 #include "nbl/asset/utils/CSmoothNormalGenerator.h"
 #include "nbl/asset/utils/COBBGenerator.h"
+#include "nbl/builtin/hlsl/shapes/AABBAccumulator.hlsl"
 #include "nbl/builtin/hlsl/shapes/obb.hlsl"
 
 namespace nbl::asset
@@ -20,25 +22,44 @@ namespace nbl::asset
 class NBL_API2 CPolygonGeometryManipulator
 {
 	public:
+		enum class EContentHashMode : uint8_t //!< Mode argument of `computeContentHashesParallel`.
+		{
+			MissingOnly, //!< Default mode; the one `computeMissingContentHashesParallel` passes.
+			RecomputeAll //!< The mode `recomputeContentHashesParallel` (and through it `recomputeContentHashes`) passes.
+		};
+
+		static void collectUniqueBuffers(const ICPUPolygonGeometry* geo, core::vector<core::smart_refctd_ptr<ICPUBuffer>>& outBuffers);
+		static void computeContentHashesParallel(ICPUPolygonGeometry* geo, const SFileIOPolicy& ioPolicy, const EContentHashMode mode = EContentHashMode::MissingOnly);
+		static inline void computeMissingContentHashesParallel(ICPUPolygonGeometry* geo, const SFileIOPolicy& ioPolicy) //!< Shorthand for `computeContentHashesParallel` with `EContentHashMode::MissingOnly`.
+		{
+			computeContentHashesParallel(geo, ioPolicy, EContentHashMode::MissingOnly);
+		}
+		static inline void recomputeContentHashesParallel(ICPUPolygonGeometry* geo, const SFileIOPolicy& ioPolicy) //!< Shorthand for `computeContentHashesParallel` with `EContentHashMode::RecomputeAll`.
+		{
+			computeContentHashesParallel(geo, ioPolicy, EContentHashMode::RecomputeAll);
+		}
 
 		static inline void recomputeContentHashes(ICPUPolygonGeometry* geo)
 		{
-			if (!geo)
-				return;
-			CGeometryManipulator::recomputeContentHash(geo->getPositionView());
-			CGeometryManipulator::recomputeContentHash(geo->getIndexView());
-			CGeometryManipulator::recomputeContentHash(geo->getNormalView());
-			for (const auto& view : *geo->getJointWeightViews())
-			{
-				CGeometryManipulator::recomputeContentHash(view.indices);
-				CGeometryManipulator::recomputeContentHash(view.weights);
-			}
-			if (auto pView=geo->getJointOBBView(); pView)
-				CGeometryManipulator::recomputeContentHash(*pView);
-			for (const auto& view : *geo->getAuxAttributeViews())
-				CGeometryManipulator::recomputeContentHash(view);
+			if (geo) recomputeContentHashesParallel(geo, SFileIOPolicy{}); // null geometry stays a no-op, as it was before this delegated to the parallel path with a default policy
 		}
 
+		//! Public aliases for the generic smooth-normal accumulation core.
+		//! The default path keeps float32 positions to match current geometry storage.
+		using ESmoothNormalAccumulationMode = CSmoothNormalGenerator::EAccumulationMode;
+		using SSmoothNormalCorner = CSmoothNormalGenerator::SAccumulatedCorner<>;
+		using CSmoothNormalAccumulator = CSmoothNormalGenerator::CAccumulatedNormals<>;
+
+		//! Convenience wrapper over the incremental smooth-normal accumulator for the common
+		//! "indexed positions + generate only missing normals" case. This keeps the existing
+		//! area-weighted behaviour while reusing the generic accumulator implementation.
+		static bool generateMissingSmoothNormals(
+			core::vector<hlsl::float32_t3>& normals,
+			const core::vector<hlsl::float32_t3>& positions,
+			const core::vector<uint32_t>& indices,
+			const core::vector<uint8_t>& normalNeedsGeneration
+		);
+
 		//
 		static inline void recomputeRanges(ICPUPolygonGeometry* geo, const bool deduceRangeFormats=true)
 		{
@@ -89,6 +110,15 @@ class NBL_API2 CPolygonGeometryManipulator
 				auto addToAABB = [&](auto& aabb)->void
 				{
 					using aabb_t = std::remove_reference_t<decltype(aabb)>;
+					using point_t = typename aabb_t::point_t;
+					using component_t = std::remove_cv_t<std::remove_reference_t<decltype(point_t{}.x)>>;
+					hlsl::shapes::util::AABBAccumulator3<component_t> parsedAABB = hlsl::shapes::util::createAABBAccumulator<component_t>();
+					auto addVertexToAABB = [&](const uint32_t vertex_i)->void
+					{
+						point_t pt;
+						geo->getPositionView().decodeElement(vertex_i, pt);
+						hlsl::shapes::util::extendAABBAccumulator(parsedAABB, pt);
+					};
 					if (geo->getIndexView())
 					{
 						for (auto index_i = 0u; index_i != geo->getIndexView().getElementCount(); index_i++)
@@ -96,20 +126,17 @@ class NBL_API2 CPolygonGeometryManipulator
 							hlsl::vector<uint32_t, 1> vertex_i;
 							geo->getIndexView().decodeElement(index_i, vertex_i);
 							if (isVertexSkinned(geo, vertex_i.x)) continue;
-							typename aabb_t::point_t pt;
-							geo->getPositionView().decodeElement(vertex_i.x, pt);
-							aabb.addPoint(pt);
+							addVertexToAABB(vertex_i.x);
 						}
 					} else
 					{
 						for (auto vertex_i = 0u; vertex_i != geo->getPositionView().getElementCount(); vertex_i++)
 						{
 							if (isVertexSkinned(geo, vertex_i)) continue;
-							typename aabb_t::point_t pt;
-							geo->getPositionView().decodeElement(vertex_i, pt);
-							aabb.addPoint(pt);
+							addVertexToAABB(vertex_i);
 						}
 					}
+					hlsl::shapes::util::assignAABBFromAccumulator(aabb, parsedAABB);
 				};
 				IGeometryBase::SDataViewBase tmp = geo->getPositionView().composed;
 				tmp.resetRange();
diff --git a/include/nbl/asset/utils/SGeometryNormalCommon.h b/include/nbl/asset/utils/SGeometryNormalCommon.h
new file mode 100644
index 0000000000..8900559421
--- /dev/null
+++ b/include/nbl/asset/utils/SGeometryNormalCommon.h
@@ -0,0 +1,22 @@
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_ASSET_S_GEOMETRY_NORMAL_COMMON_H_INCLUDED_
+#define _NBL_ASSET_S_GEOMETRY_NORMAL_COMMON_H_INCLUDED_
+#include "nbl/builtin/hlsl/tgmath.hlsl"
+namespace nbl::asset
+{
+class SGeometryNormalCommon //!< Stateless helpers for normalizing vectors and computing triangle face normals.
+{
+    public:
+        static_assert(sizeof(hlsl::float32_t3) == sizeof(float[3]));
+        static_assert(alignof(hlsl::float32_t3) == alignof(float));
+        //! Returns `v` normalized, or the zero vector when its squared length is not above `epsilon` squared.
+        static inline hlsl::float32_t3 normalizeOrZero(const hlsl::float32_t3& v, const float epsilon = 0.f) { const float len2 = hlsl::dot(v, v), epsilon2 = epsilon * epsilon; return len2 <= epsilon2 ? hlsl::float32_t3(0.f, 0.f, 0.f) : hlsl::normalize(v); }
+        //! Face normal of triangle (a, b, c) as normalized `cross(b - a, c - a)`; zero vector when that cross product is no longer than `epsilon`.
+        static inline hlsl::float32_t3 computeFaceNormal(const hlsl::float32_t3& a, const hlsl::float32_t3& b, const hlsl::float32_t3& c, const float epsilon = 0.000001f) { return normalizeOrZero(hlsl::cross(b - a, c - a), epsilon); }
+        //! Raw-array overload: copies the inputs component-wise instead of reinterpreting `float[3]` storage as a vector, and writes the result into `normal`.
+        static inline void computeFaceNormal(const float a[3], const float b[3], const float c[3], float normal[3], const float epsilon = 0.000001f) { const hlsl::float32_t3 n = computeFaceNormal(hlsl::float32_t3(a[0], a[1], a[2]), hlsl::float32_t3(b[0], b[1], b[2]), hlsl::float32_t3(c[0], c[1], c[2]), epsilon); normal[0] = n.x; normal[1] = n.y; normal[2] = n.z; }
+};
+}
+#endif
diff --git a/include/nbl/builtin/hlsl/math/linalg/transform.hlsl b/include/nbl/builtin/hlsl/math/linalg/transform.hlsl
index e46dfe997b..82001770a1 100644
--- a/include/nbl/builtin/hlsl/math/linalg/transform.hlsl
+++ b/include/nbl/builtin/hlsl/math/linalg/transform.hlsl
@@ -8,6 +8,7 @@
 #include <nbl/builtin/hlsl/cpp_compat/intrinsics.hlsl>
 #include <nbl/builtin/hlsl/concepts.hlsl>
 #include <nbl/builtin/hlsl/math/linalg/basic.hlsl>
+#include <nbl/builtin/hlsl/shapes/aabb.hlsl>
 
 namespace nbl
 {
@@ -52,7 +53,19 @@ inline matrix<T, 3, 4> rhLookAt(
     r[1] = vector<T, 4>(yaxis, -hlsl::dot(yaxis, position));
     r[2] = vector<T, 4>(zaxis, -hlsl::dot(zaxis, position));
 
-    return r;
+	return r;
+}
+
+// Applies the full affine 3x4 matrix `lhs` to the AABB `rhs` and returns the enclosing AABB.
+// Needed because shapes::util::transform(matrix<T,3,4>, AABB) applies only the linear part and leaves translation out.
+template<typename T NBL_FUNC_REQUIRES(concepts::FloatingPoint<T>)
+inline shapes::AABB<3, T> pseudo_mul(NBL_CONST_REF_ARG(matrix<T, 3, 4>) lhs, NBL_CONST_REF_ARG(shapes::AABB<3, T>) rhs)
+{
+    const auto translation = hlsl::transpose(lhs)[3]; // row 3 of the transpose, i.e. the 4th column of `lhs`
+    auto retval = shapes::util::transform(lhs, rhs);
+    retval.minVx += translation;
+    retval.maxVx += translation;
+    return retval;
 }
 
 }
diff --git a/include/nbl/builtin/hlsl/shapes/AABBAccumulator.hlsl b/include/nbl/builtin/hlsl/shapes/AABBAccumulator.hlsl
new file mode 100644
index 0000000000..eda7ef536f
--- /dev/null
+++ b/include/nbl/builtin/hlsl/shapes/AABBAccumulator.hlsl
@@ -0,0 +1,122 @@
+// Copyright (C) 2018-2026 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_BUILTIN_HLSL_SHAPES_AABB_ACCUMULATOR_INCLUDED_
+#define _NBL_BUILTIN_HLSL_SHAPES_AABB_ACCUMULATOR_INCLUDED_
+
+
+#include "nbl/builtin/hlsl/shapes/aabb.hlsl"
+#include "nbl/builtin/hlsl/array_accessors.hlsl"
+#include "nbl/builtin/hlsl/concepts/vector.hlsl"
+
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace shapes
+{
+namespace util
+{
+
+template<typename Scalar = float32_t>
+struct AABBAccumulator3 // wraps an AABB<3, Scalar> that is extended one point at a time
+{
+    using scalar_t = Scalar;
+    using aabb_t = AABB<3, Scalar>;
+    using point_t = typename aabb_t::point_t;
+    // Factory: the wrapped box starts out as whatever `aabb_t::create()` returns.
+    static AABBAccumulator3 create()
+    {
+        AABBAccumulator3 accumulator;
+        accumulator.value = aabb_t::create();
+        return accumulator;
+    }
+    // True when the box is inverted (min > max) on at least one of the x, y, z axes.
+    bool empty() NBL_CONST_MEMBER_FUNC
+    {
+        const bool invertedX = value.minVx.x > value.maxVx.x;
+        const bool invertedY = value.minVx.y > value.maxVx.y;
+        const bool invertedZ = value.minVx.z > value.maxVx.z;
+        return invertedX || invertedY || invertedZ;
+    }
+    // Forwards `pt` to `aabb_t::addPoint` on the wrapped box.
+    void addPoint(NBL_CONST_REF_ARG(point_t) pt)
+    {
+        value.addPoint(pt);
+    }
+    // Component-wise convenience: packs (x, y, z) into a point_t and adds it.
+    void addXYZ(const Scalar x, const Scalar y, const Scalar z)
+    {
+        point_t packed = point_t(x, y, z);
+        addPoint(packed);
+    }
+    // The accumulated bounds; read directly by `assignAABBFromAccumulator`.
+    aabb_t value;
+};
+
+template<typename Scalar>
+inline AABBAccumulator3<Scalar> createAABBAccumulator()
+{
+    return AABBAccumulator3<Scalar>::create(); // free-function spelling of the static factory
+}
+
+template<typename Scalar>
+inline void extendAABBAccumulator(NBL_REF_ARG(AABBAccumulator3<Scalar>) aabb, const Scalar x, const Scalar y, const Scalar z)
+{
+    aabb.addXYZ(x, y, z); // free-function spelling of AABBAccumulator3::addXYZ
+}
+
+// Point overload: feeds the first three components of `pt`, each converted to `Scalar`, into the accumulator.
+template<typename Scalar, typename Point NBL_FUNC_REQUIRES(concepts::Vectorial<Point> && (vector_traits<Point>::Dimension >= 3))
+inline void extendAABBAccumulator(NBL_REF_ARG(AABBAccumulator3<Scalar>) aabb, NBL_CONST_REF_ARG(Point) pt)
+{
+    array_get<Point, typename vector_traits<Point>::scalar_type> component;
+    const Scalar x = Scalar(component(pt, 0));
+    const Scalar y = Scalar(component(pt, 1));
+    const Scalar z = Scalar(component(pt, 2));
+    // `addXYZ` packs the three scalars into a point_t and adds it to the wrapped AABB.
+    aabb.addXYZ(x, y, z);
+}
+
+// Copies x/y/z of `src` into `dst` with scalar conversion and zeroes any further `dst` components; returns false, leaving `dst` untouched, when `src` has min > max on x, y or z.
+template<int16_t DstD, typename DstScalar, int16_t SrcD, typename SrcScalar NBL_FUNC_REQUIRES(DstD >= 3 && SrcD >= 3)
+inline bool assignAABB(NBL_REF_ARG(AABB<DstD, DstScalar>) dst, NBL_CONST_REF_ARG(AABB<SrcD, SrcScalar>) src)
+{
+    array_set<typename AABB<DstD, DstScalar>::point_t, DstScalar> writer;
+    array_get<typename AABB<SrcD, SrcScalar>::point_t, SrcScalar> reader;
+    const bool inverted =
+        reader(src.minVx, 0) > reader(src.maxVx, 0) ||
+        reader(src.minVx, 1) > reader(src.maxVx, 1) ||
+        reader(src.minVx, 2) > reader(src.maxVx, 2);
+    if (inverted)
+        return false;
+    dst = AABB<DstD, DstScalar>::create();
+    NBL_UNROLL for (int16_t axis = 0; axis < 3; ++axis)
+    {
+        writer(dst.minVx, axis, DstScalar(reader(src.minVx, axis)));
+        writer(dst.maxVx, axis, DstScalar(reader(src.maxVx, axis)));
+    }
+    NBL_UNROLL for (int16_t extra = 3; extra < DstD; ++extra)
+    {
+        writer(dst.minVx, extra, DstScalar(0));
+        writer(dst.maxVx, extra, DstScalar(0));
+    }
+    return true;
+}
+
+// Copies the accumulated bounds into `dst`; returns false and leaves `dst` untouched when the accumulator is empty.
+template<int16_t D, typename DstScalar, typename SrcScalar NBL_FUNC_REQUIRES(D >= 3)
+inline bool assignAABBFromAccumulator(NBL_REF_ARG(AABB<D, DstScalar>) dst, NBL_CONST_REF_ARG(AABBAccumulator3<SrcScalar>) aabb)
+{
+    // `assignAABB` rejects a source with min > max on x, y or z, which is exactly the condition
+    // `AABBAccumulator3::empty()` tests, so delegating straight to it covers the empty case.
+    return assignAABB(dst, aabb.value);
+}
+
+}
+}
+}
+}
+
+#endif
diff --git a/include/nbl/builtin/hlsl/shapes/aabb.hlsl b/include/nbl/builtin/hlsl/shapes/aabb.hlsl
index 07219c6687..ec916f2734 100644
--- a/include/nbl/builtin/hlsl/shapes/aabb.hlsl
+++ b/include/nbl/builtin/hlsl/shapes/aabb.hlsl
@@ -66,28 +66,28 @@ namespace util
 namespace impl
 {
 template<int16_t D, typename Scalar>
-struct intersect_helper<AABB<D,Scalar>>
+struct intersect_helper<AABB<D, Scalar> >
 {
     using type = AABB<D,Scalar>;
 
     static inline type __call(NBL_CONST_REF_ARG(type) lhs, NBL_CONST_REF_ARG(type) rhs)
     {
         type retval;
-        retval.minVx = hlsl::max<type::point_t>(lhs.minVx,rhs.minVx);
-        retval.maxVx = hlsl::min<type::point_t>(lhs.maxVx,rhs.maxVx);
+        retval.minVx = hlsl::max<typename type::point_t>(lhs.minVx,rhs.minVx);
+        retval.maxVx = hlsl::min<typename type::point_t>(lhs.maxVx,rhs.maxVx);
         return retval;
     }
 };
 template<int16_t D, typename Scalar>
-struct union_helper<AABB<D,Scalar>>
+struct union_helper<AABB<D, Scalar> >
 {
     using type = AABB<D,Scalar>;
 
     static inline type __call(NBL_CONST_REF_ARG(type) lhs, NBL_CONST_REF_ARG(type) rhs)
     {
         type retval;
-        retval.minVx = hlsl::min<type::point_t>(lhs.minVx,rhs.minVx);
-        retval.maxVx = hlsl::max<type::point_t>(lhs.maxVx,rhs.maxVx);
+        retval.minVx = hlsl::min<typename type::point_t>(lhs.minVx,rhs.minVx);
+        retval.maxVx = hlsl::max<typename type::point_t>(lhs.maxVx,rhs.maxVx);
         return retval;
     }
 };
diff --git a/include/nbl/config/BuildConfigOptions.h.in b/include/nbl/config/BuildConfigOptions.h.in
index d130ff4ce2..7bd4e950f3 100644
--- a/include/nbl/config/BuildConfigOptions.h.in
+++ b/include/nbl/config/BuildConfigOptions.h.in
@@ -35,6 +35,7 @@
 #cmakedefine _NBL_COMPILE_WITH_GLTF_LOADER_
 
 // writers
+#cmakedefine _NBL_COMPILE_WITH_OBJ_WRITER_
 #cmakedefine _NBL_COMPILE_WITH_STL_WRITER_
 #cmakedefine _NBL_COMPILE_WITH_PLY_WRITER_
 #cmakedefine _NBL_COMPILE_WITH_BAW_WRITER_
@@ -95,4 +96,4 @@
     #define NBL_API2
 #endif
 
-#endif // __NBL_BUILD_CONFIG_OPTIONS_H_INCLUDED__
\ No newline at end of file
+#endif // __NBL_BUILD_CONFIG_OPTIONS_H_INCLUDED__
diff --git a/include/nbl/core/hash/blake.h b/include/nbl/core/hash/blake.h
index fb91c9969f..61d1c02d9a 100644
--- a/include/nbl/core/hash/blake.h
+++ b/include/nbl/core/hash/blake.h
@@ -1,4 +1,4 @@
-// Copyright (C) 2024-2024 - DevSH Graphics Programming Sp. z O.O.
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
 #ifndef _NBL_CORE_HASH_BLAKE3_H_INCLUDED_
@@ -6,20 +6,25 @@
 
 
 #include "nbl/config/BuildConfigOptions.h"
-#include "blake3.h"
 
+#include <cstddef>
+#include <cstdint>
 #include <span>
+#include <string_view>
+#include <type_traits>
 
 
 namespace nbl::core
 {
 struct blake3_hash_t final
 {
+	static inline constexpr size_t DigestSize = 32ull;
+
 	inline bool operator==(const blake3_hash_t&) const = default;
 
 	// could initialize this to a hash of a zero-length array,
 	// but that requires a .cpp file and a static
-	uint8_t data[BLAKE3_OUT_LEN];
+	uint8_t data[DigestSize];
 };
 
 class NBL_API2 blake3_hasher final
@@ -37,7 +42,12 @@ class NBL_API2 blake3_hasher final
 			}
 		};
 
-		::blake3_hasher m_state;
+		static inline constexpr size_t OpaqueStateSize = 1920ull;
+		static inline constexpr size_t OpaqueStateAlign = 16ull;
+
+		static void validateOpaqueStateLayout();
+
+		alignas(OpaqueStateAlign) unsigned char m_state[OpaqueStateSize];
 
 	public:
 		blake3_hasher();
@@ -55,7 +65,10 @@ class NBL_API2 blake3_hasher final
 		explicit operator blake3_hash_t() const;
 };
 
-// Useful specializations
+NBL_API2 blake3_hash_t blake3_hash_buffer(const void* data, size_t bytes);
+NBL_API2 blake3_hash_t blake3_hash_buffer_sequential(const void* data, size_t bytes);
+
+// Convenience specializations for common wrapper inputs.
 template<typename Dummy>
 struct blake3_hasher::update_impl<blake3_hash_t,Dummy>
 {
@@ -113,11 +126,11 @@ struct hash<nbl::core::blake3_hash_t>
 	{
 		auto* as_p_uint64_t = reinterpret_cast<const size_t*>(blake3.data);
 		size_t retval = as_p_uint64_t[0];
-		for (auto i=1; i<BLAKE3_OUT_LEN/sizeof(size_t); i++)
+		for (auto i=1; i<nbl::core::blake3_hash_t::DigestSize/sizeof(size_t); i++)
 			retval ^= as_p_uint64_t[i] + 0x9e3779b97f4a7c15ull + (retval << 6) + (retval >> 2);
 		return retval;
 	}
 };
 }
 
-#endif // _NBL_CORE_HASH_BLAKE3_H_INCLUDED_
\ No newline at end of file
+#endif // _NBL_CORE_HASH_BLAKE3_H_INCLUDED_
diff --git a/include/nbl/logging_macros.h b/include/nbl/logging_macros.h
index cf4f63f9bc..97cbdcc0c2 100644
--- a/include/nbl/logging_macros.h
+++ b/include/nbl/logging_macros.h
@@ -1,8 +1,8 @@
 #if defined(NBL_LOG) || defined(NBL_LOG_ERROR)
 	#error redefinition of NBL_LOG/NBL_LOG_ERROR. did you forgot to undefine logging macros somewhere? #include "nbl/undefine_logging_macros.h"
-#elif !defined(_GIT_INFO_H_INCLUDED_)
-	#error logging macros require git meta info, include "git_info.h"
+#elif !defined(_NBL_GIT_INFO_H_INCLUDED_)
+	#error logging macros require git meta info, include "nbl/git/info.h"
 #else
-	#define NBL_LOG(SEVERITY, FORMAT, ...) NBL_LOG_FUNCTION(FORMAT" [%s][%s - %s:%d]", SEVERITY __VA_OPT__(,) __VA_ARGS__, nbl::gtml::nabla_git_info.commitShortHash, __FUNCTION__, __FILE__, __LINE__);
+	#define NBL_LOG(SEVERITY, FORMAT, ...) NBL_LOG_FUNCTION(FORMAT" [%s][%s - %s:%d]", SEVERITY __VA_OPT__(,) __VA_ARGS__, nbl::gtml::nabla_git_info.commitShortHash().data(), __FUNCTION__, __FILE__, __LINE__);
 	#define NBL_LOG_ERROR(FORMAT, ...) NBL_LOG(nbl::system::ILogger::ELL_ERROR, FORMAT __VA_OPT__(,) __VA_ARGS__)
-#endif
\ No newline at end of file
+#endif
diff --git a/include/nbl/system/CGrowableMemoryFile.h b/include/nbl/system/CGrowableMemoryFile.h
new file mode 100644
index 0000000000..aea3e60a15
--- /dev/null
+++ b/include/nbl/system/CGrowableMemoryFile.h
@@ -0,0 +1,325 @@
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_SYSTEM_C_GROWABLE_MEMORY_FILE_H_INCLUDED_
+#define _NBL_SYSTEM_C_GROWABLE_MEMORY_FILE_H_INCLUDED_
+
+#include "nbl/system/IFile.h"
+
+#include <algorithm>
+#include <chrono>
+#include <cstddef>
+#include <cstring>
+#include <limits>
+#include <mutex>
+#include <utility>
+#include <vector>
+
+namespace nbl::system
+{
+
+namespace impl
+{
+
+struct CImmediateFileIoResultSetter final : ISystem::IFutureManipulator
+{
+    using ISystem::IFutureManipulator::set_result; // re-exposes the base's set_result so the in-memory file can fill futures itself
+};
+
+struct CNoopMutex // provides the lock()/unlock() pair std::lock_guard calls, without doing any synchronization
+{
+    inline void lock() {}
+    inline void unlock() {}
+};
+
+class CGrowableMemoryFileStorage // byte buffer behind the growable memory files; does no locking of its own
+{
+    public:
+        constexpr static inline size_t InitialGrowthBytes = 1ull << 20; // 1 MiB
+
+        inline size_t size() const
+        {
+            return m_storage.size();
+        }
+
+        inline size_t capacity() const
+        {
+            return m_storage.capacity();
+        }
+
+        inline void reserve(const size_t reservedSize)
+        {
+            m_storage.reserve(reservedSize);
+        }
+
+        inline void clear()
+        {
+            m_storage.clear();
+        }
+
+        inline const std::byte* data() const
+        {
+            return m_storage.empty() ? nullptr : m_storage.data(); // nullptr rather than the vector's pointer while empty
+        }
+
+        inline std::byte* data()
+        {
+            return m_storage.empty() ? nullptr : m_storage.data();
+        }
+
+        inline std::vector<std::byte> copyData() const
+        {
+            return m_storage;
+        }
+
+        inline size_t read(void* const buffer, const size_t offset, const size_t sizeToRead) const
+        {
+            if (offset >= m_storage.size())
+                return 0ull; // nothing is stored at or past `offset`
+
+            const size_t clampedRead = std::min(sizeToRead, m_storage.size() - offset);
+            std::memcpy(buffer, m_storage.data() + offset, clampedRead);
+            return clampedRead;
+        }
+
+        inline size_t write(const void* const buffer, const size_t offset, const size_t sizeToWrite)
+        {
+            if (sizeToWrite > std::numeric_limits<size_t>::max() - offset)
+                return 0ull; // `offset + sizeToWrite` would wrap; report nothing written instead of copying out of bounds
+            const size_t requiredSize = offset + sizeToWrite;
+            if (requiredSize > m_storage.capacity())
+                reserve(growCapacity(requiredSize));
+            if (requiredSize > m_storage.size())
+                m_storage.resize(requiredSize); // any gap between the old end and `offset` is value-initialized by resize
+            std::memcpy(m_storage.data() + offset, buffer, sizeToWrite);
+            return sizeToWrite;
+        }
+
+    private:
+        inline size_t growCapacity(const size_t requiredSize) const
+        {
+            size_t nextCapacity = m_storage.capacity();
+            if (nextCapacity == 0ull)
+                nextCapacity = InitialGrowthBytes;
+            while (nextCapacity < requiredSize)
+            {
+                const size_t growth = std::max(nextCapacity, InitialGrowthBytes); // at least doubles, and adds at least 1 MiB
+                if (nextCapacity > std::numeric_limits<size_t>::max() - growth)
+                    return requiredSize; // growing further would overflow; fall back to the exact size
+                nextCapacity += growth;
+            }
+            return nextCapacity;
+        }
+
+        std::vector<std::byte> m_storage;
+};
+
+template<typename MutexType>
+class IGrowableMemoryFile : public IFile
+{
+    protected:
+        using mutex_t = MutexType;
+
+        inline explicit IGrowableMemoryFile(path&& filename, const size_t reservedSize = 0ull, const time_point_t initialModified = std::chrono::utc_clock::now())
+            : IFile(std::move(filename), core::bitflag<E_CREATE_FLAGS>(E_CREATE_FLAGS::ECF_READ_WRITE), initialModified)
+        {
+            reserve(reservedSize);
+        }
+
+        template<typename Fn>
+        inline decltype(auto) withLockedStorage(Fn&& fn)
+        {
+            std::lock_guard<mutex_t> lock(m_mutex); // held until `fn` has returned
+            return std::forward<Fn>(fn)(m_storage);
+        }
+
+        template<typename Fn>
+        inline decltype(auto) withLockedStorage(Fn&& fn) const
+        {
+            std::lock_guard<mutex_t> lock(m_mutex); // m_mutex is mutable, so const callers can lock too
+            return std::forward<Fn>(fn)(m_storage);
+        }
+
+    public:
+        inline size_t getSize() const override
+        {
+            return withLockedStorage([](const CGrowableMemoryFileStorage& storage) {
+                return storage.size();
+            });
+        }
+
+        inline size_t capacity() const
+        {
+            return withLockedStorage([](const CGrowableMemoryFileStorage& storage) {
+                return storage.capacity();
+            });
+        }
+
+        //! Optional capacity hint for callers that can estimate the final serialized size.
+        /** The internal storage already uses an adaptive growth policy, so this is only a performance hint. */
+        inline void reserve(const size_t reservedSize)
+        {
+            withLockedStorage([reservedSize](CGrowableMemoryFileStorage& storage) {
+                storage.reserve(reservedSize);
+            });
+        }
+
+        inline void clear()
+        {
+            withLockedStorage([](CGrowableMemoryFileStorage& storage) {
+                storage.clear();
+            });
+            setLastWriteTime(); // runs after the storage lock has been released
+        }
+
+        inline std::vector<std::byte> copyData() const
+        {
+            return withLockedStorage([](const CGrowableMemoryFileStorage& storage) {
+                return storage.copyData();
+            });
+        }
+
+    protected:
+        inline void* getMappedPointer_impl() override
+        {
+            return nullptr; // no mapping is ever exposed for this file type
+        }
+
+        inline const void* getMappedPointer_impl() const override
+        {
+            return nullptr;
+        }
+
+        inline void unmappedRead(ISystem::future_t<size_t>& fut, void* buffer, size_t offset, size_t sizeToRead) override
+        {
+            static const CImmediateFileIoResultSetter resultSetter = {};
+            const size_t processed = withLockedStorage([buffer, offset, sizeToRead](const CGrowableMemoryFileStorage& storage) {
+                return storage.read(buffer, offset, sizeToRead);
+            });
+            resultSetter.set_result(fut, processed); // result is set right here, on the calling thread
+        }
+
+        inline void unmappedWrite(ISystem::future_t<size_t>& fut, const void* buffer, size_t offset, size_t sizeToWrite) override
+        {
+            static const CImmediateFileIoResultSetter resultSetter = {};
+            const size_t processed = withLockedStorage([buffer, offset, sizeToWrite](CGrowableMemoryFileStorage& storage) {
+                return storage.write(buffer, offset, sizeToWrite);
+            });
+            resultSetter.set_result(fut, processed); // result is set right here, on the calling thread
+        }
+
+    private:
+        mutable mutex_t m_mutex;
+        CGrowableMemoryFileStorage m_storage;
+};
+
+}
+
+//! A lightweight growable in-memory implementation of `system::IFile`.
+/**
+    This class stores file contents in a dynamically growing byte buffer while preserving the regular
+    Nabla file-oriented API. It is useful in flows that want `IFile*` interoperability without
+    forcing an obligatory round-trip through the host filesystem.
+
+    Representative use-cases include:
+    - serialization roundtrip validation
+    - benchmark or profiling harnesses that want to separate codec work from storage latency
+    - tool pipelines that need a temporary serialized representation but do not require a persistent file
+
+    The object grows on demand during writes and can later be consumed by APIs that read from
+    `system::IFile*`, for example `IAssetManager::getAsset(system::IFile*, supposedFilename, ...)`.
+
+    Allocation policy:
+    - storage growth is handled internally
+    - capacity expansion is geometric rather than exact-size-only
+    - every growth step adds at least `1 MiB`, and an empty buffer starts growing from a `1 MiB` baseline
+    - callers may still provide an explicit `reserve(...)` hint if they already know the likely output size
+
+    This keeps the common case simple for callers while reducing the amount of repeated reallocation
+    and copying that would otherwise happen during long sequential write streams.
+
+    Important notes:
+    - reads and writes are positional and operate on the current logical size
+    - `getMappedPointer()` intentionally returns `nullptr`
+      The storage is growable, so exposing a stable mapped pointer would be misleading
+    - this class is not thread-safe
+      Concurrent read, write, reserve, clear, or direct `data()` access on the same object requires external synchronization
+*/
+class CGrowableMemoryFile final : public impl::IGrowableMemoryFile<impl::CNoopMutex>
+{
+        using base_t = impl::IGrowableMemoryFile<impl::CNoopMutex>;
+
+    public:
+        using base_t::capacity;
+        using base_t::clear;
+        using base_t::copyData;
+        using base_t::reserve;
+
+        inline explicit CGrowableMemoryFile(path&& filename, const size_t reservedSize = 0ull, const time_point_t initialModified = std::chrono::utc_clock::now())
+            : base_t(std::move(filename), reservedSize, initialModified)
+        {
+        }
+
+        inline const std::byte* data() const
+        {
+            return withLockedStorage([](const impl::CGrowableMemoryFileStorage& storage) {
+                return storage.data(); // nullptr while the file is empty
+            });
+        }
+
+        inline std::byte* data()
+        {
+            return withLockedStorage([](impl::CGrowableMemoryFileStorage& storage) {
+                return storage.data(); // nullptr while the file is empty
+            });
+        }
+};
+
+//! A synchronized growable in-memory implementation of `system::IFile`.
+/**
+    This variant serializes internal operations with a mutex. It is intended for cases where the same
+    memory-backed file object may be touched from multiple threads and external synchronization is not
+    desirable or not available.
+
+    The synchronized variant intentionally does not expose raw `data()` accessors. A raw pointer would
+    not carry any lifetime relationship to the internal lock and would therefore invite accidental use
+    after another thread mutates or reallocates the storage. Callers that need to inspect the contents
+    can either:
+    - take a snapshot with `copyData()`
+    - use `withLockedData(...)` and keep any pointer or span-like view strictly inside the callback
+*/
+class CSynchronizedGrowableMemoryFile final : public impl::IGrowableMemoryFile<std::mutex>
+{
+        using base_t = impl::IGrowableMemoryFile<std::mutex>;
+
+    public:
+        using base_t::capacity;
+        using base_t::clear;
+        using base_t::copyData;
+        using base_t::reserve;
+
+        inline explicit CSynchronizedGrowableMemoryFile(path&& filename, const size_t reservedSize = 0ull, const time_point_t initialModified = std::chrono::utc_clock::now())
+            : base_t(std::move(filename), reservedSize, initialModified)
+        {
+        }
+
+        template<typename Fn>
+        inline decltype(auto) withLockedData(Fn&& fn)
+        {
+            return withLockedStorage([&fn](impl::CGrowableMemoryFileStorage& storage) -> decltype(auto) {
+                return std::forward<Fn>(fn)(storage.data(), storage.size()); // `fn` receives (pointer, byte count) while the mutex is held
+            });
+        }
+
+        template<typename Fn>
+        inline decltype(auto) withLockedData(Fn&& fn) const
+        {
+            return withLockedStorage([&fn](const impl::CGrowableMemoryFileStorage& storage) -> decltype(auto) {
+                return std::forward<Fn>(fn)(storage.data(), storage.size()); // `fn` receives (const pointer, byte count) while the mutex is held
+            });
+        }
+};
+
+}
+
+#endif
diff --git a/include/nbl/system/CSystemWin32.h b/include/nbl/system/CSystemWin32.h
index 7c73525c43..97ab6ce709 100644
--- a/include/nbl/system/CSystemWin32.h
+++ b/include/nbl/system/CSystemWin32.h
@@ -22,7 +22,7 @@ class NBL_API2 CSystemWin32 : public ISystem
             public:
                 CCaller(ISystem* _system) : ICaller(_system) {}
 
-                core::smart_refctd_ptr<ISystemFile> createFile(const std::filesystem::path& filename, const core::bitflag<IFile::E_CREATE_FLAGS> flags) override final;
+                core::smart_refctd_ptr<ISystemFile> createFile(const std::filesystem::path& filename, core::bitflag<IFile::E_CREATE_FLAGS> flags) override final;
         };
         
     public:
diff --git a/include/nbl/system/IFile.h b/include/nbl/system/IFile.h
index 0ab739ba4a..f2c615c311 100644
--- a/include/nbl/system/IFile.h
+++ b/include/nbl/system/IFile.h
@@ -81,11 +81,16 @@ class IFile : public IFileBase, private ISystem::IFutureManipulator
 		};
 		void read(success_t& fut, void* buffer, size_t offset, size_t sizeToRead)
 		{
+			// The higher-level IO helpers may queue multiple chunked operations before waiting on the futures.
+			// Backends therefore need to treat `offset` as the request-local byte position rather than relying on
+			// a mutable shared file pointer hidden inside the OS file handle.
 			read(fut.m_internalFuture,buffer,offset,sizeToRead);
 			fut.sizeToProcess = sizeToRead;
 		}
 		void write(success_t& fut, const void* buffer, size_t offset, size_t sizeToWrite)
 		{
+			// Same requirement as `read(...)`: writes are logically positional requests and must honor the explicit
+			// byte offset even when multiple operations are submitted before the caller drains their futures.
 			write(fut.m_internalFuture,buffer,offset,sizeToWrite);
 			fut.sizeToProcess = sizeToWrite;
 		}
diff --git a/include/nbl/system/ISystem.h b/include/nbl/system/ISystem.h
index 65f0351582..9ee5f0bb83 100644
--- a/include/nbl/system/ISystem.h
+++ b/include/nbl/system/ISystem.h
@@ -105,7 +105,9 @@ class NBL_API2 ISystem : public core::IReferenceCounted
         void createFile(
             future_t<core::smart_refctd_ptr<IFile>>& future, // creation may happen on a dedicated thread, so its async
             path filename, // absolute path within our virtual filesystem
-            const core::bitflag<IFileBase::E_CREATE_FLAGS> flags, // access flags (IMPORTANT: files from most archives wont open with ECF_WRITE bit)
+            const core::bitflag<IFileBase::E_CREATE_FLAGS> flags, // intended access flags (IMPORTANT: files from most archives wont open with ECF_WRITE bit)
+            // actual file flags may be downgraded when backend/archive cannot honor all requested flags
+            // for example a backend may open the file successfully but strip mapping/coherency when it cannot provide them
             const std::string_view& accessToken="" // usually password for archives, but should be SSH key for URL downloads
         );
         
@@ -148,6 +150,7 @@ class NBL_API2 ISystem : public core::IReferenceCounted
         struct SystemInfo
         {
             uint64_t cpuFrequencyHz = 0u;
+            uint32_t physicalCoreCount = 0u;
 
             // in bytes
             uint64_t totalMemory = 0u;
@@ -156,6 +159,7 @@ class NBL_API2 ISystem : public core::IReferenceCounted
             uint32_t desktopResX = 0u;
             uint32_t desktopResY = 0u;
 
+            std::string cpuName = "Unknown";
             std::string OSFullName = "Unknown";
         };
         virtual SystemInfo getSystemInfo() const = 0;
@@ -168,6 +172,7 @@ class NBL_API2 ISystem : public core::IReferenceCounted
         {
             public:
                 // each per-platform backend must override this function
+                // returned files may expose fewer flags than requested if the backend had to fall back
                 virtual core::smart_refctd_ptr<ISystemFile> createFile(const std::filesystem::path& filename, const core::bitflag<IFileBase::E_CREATE_FLAGS> flags) = 0;
 
                 // these contain some hoisted common sense checks
@@ -219,7 +224,7 @@ class NBL_API2 ISystem : public core::IReferenceCounted
             using retval_t = core::smart_refctd_ptr<IFile>;
             void operator()(core::StorageTrivializer<retval_t>* retval, ICaller* _caller);
 
-            char filename[MAX_FILENAME_LENGTH] {};
+            std::filesystem::path filename;
             IFileBase::E_CREATE_FLAGS flags;
         };
         struct SRequestParams_READ
diff --git a/include/nbl/video/IGPUCommandBuffer.h b/include/nbl/video/IGPUCommandBuffer.h
index 8f0f1fce30..b863de3030 100644
--- a/include/nbl/video/IGPUCommandBuffer.h
+++ b/include/nbl/video/IGPUCommandBuffer.h
@@ -10,7 +10,7 @@
 #include "nbl/video/IGPUCommandPool.h"
 #include "nbl/video/IQueue.h"
 
-#include "git_info.h"
+#include "nbl/git/info.h"
 #define NBL_LOG_FUNCTION m_logger.log
 #include "nbl/logging_macros.h"
 
diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h
index ae351fdecd..983f6c6b5a 100644
--- a/include/nbl/video/ILogicalDevice.h
+++ b/include/nbl/video/ILogicalDevice.h
@@ -15,7 +15,7 @@
 #include "nbl/video/CThreadSafeQueueAdapter.h"
 #include "nbl/video/CJITIncludeLoader.h"
 
-#include "git_info.h"
+#include "nbl/git/info.h"
 #define NBL_LOG_FUNCTION m_logger.log
 #include "nbl/logging_macros.h"
 
@@ -1606,4 +1606,4 @@ inline bool ILogicalDevice::validateMemoryBarrier(const uint32_t queueFamilyInde
 } // namespace nbl::video
 
 #include "nbl/undef_logging_macros.h"
-#endif //_NBL_VIDEO_I_LOGICAL_DEVICE_H_INCLUDED_
\ No newline at end of file
+#endif //_NBL_VIDEO_I_LOGICAL_DEVICE_H_INCLUDED_
diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt
index 18a25c8619..f0aabc1c95 100644
--- a/src/nbl/CMakeLists.txt
+++ b/src/nbl/CMakeLists.txt
@@ -1,5 +1,5 @@
-# Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
-# Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
+# Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
+# Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
 # This file is part of the "Nabla Engine".
 # For conditions of distribution and use, see copyright notice in nabla.h
 
@@ -50,12 +50,12 @@ include(common)
 #[[ Loaders and writers compile options available to edit by user
 	All revelant _NBL_COMPILE_WITH will be there]]
 option(_NBL_COMPILE_WITH_MTL_LOADER_ "Compile with MTL Loader" OFF) #default off until Material Compiler 2
-option(_NBL_COMPILE_WITH_OBJ_LOADER_ "Compile with OBJ Loader" OFF) #default off until Material Compiler 2
-#option(_NBL_COMPILE_WITH_OBJ_WRITER_ "Compile with OBJ Writer" ON) uncomment when writer exists
-option(_NBL_COMPILE_WITH_STL_LOADER_ "Compile with STL Loader" OFF) #default off until reimplemented
-option(_NBL_COMPILE_WITH_STL_WRITER_ "Compile with STL Writer" OFF) #default off until reimplemented
+option(_NBL_COMPILE_WITH_OBJ_LOADER_ "Compile with OBJ Loader" ON)
+option(_NBL_COMPILE_WITH_OBJ_WRITER_ "Compile with OBJ Writer" ON)
+option(_NBL_COMPILE_WITH_STL_LOADER_ "Compile with STL Loader" ON)
+option(_NBL_COMPILE_WITH_STL_WRITER_ "Compile with STL Writer" ON)
 option(_NBL_COMPILE_WITH_PLY_LOADER_ "Compile with PLY Loader" ON)
-option(_NBL_COMPILE_WITH_PLY_WRITER_ "Compile with PLY Writer" OFF) #default off until reimplemented
+option(_NBL_COMPILE_WITH_PLY_WRITER_ "Compile with PLY Writer" ON)
 option(_NBL_COMPILE_WITH_JPG_LOADER_ "Compile with JPG Loader" ON)
 option(_NBL_COMPILE_WITH_JPG_WRITER_ "Compile with JPG Writer" ON)
 option(_NBL_COMPILE_WITH_PNG_LOADER_ "Compile with PNG Loader" ON)
@@ -165,6 +165,7 @@ set(NBL_ASSET_SOURCES
 	asset/ICPUImage.cpp
 	asset/ICPUPolygonGeometry.cpp
 	asset/interchange/IAssetWriter.cpp
+	asset/interchange/IGeometryWriter.cpp
 	asset/interchange/IAssetLoader.cpp
 	
 # Shaders
@@ -199,6 +200,7 @@ set(NBL_ASSET_SOURCES
 	asset/interchange/CGLTFLoader.cpp
 
 # Mesh writers
+	asset/interchange/COBJMeshWriter.cpp
 	asset/interchange/CPLYMeshWriter.cpp
 	asset/interchange/CSTLMeshWriter.cpp
 	asset/interchange/CGLTFWriter.cpp
@@ -504,7 +506,6 @@ endif()
 
 # blake3
 add_dependencies(Nabla blake3)
-list(APPEND PUBLIC_BUILD_INCLUDE_DIRS $<TARGET_PROPERTY:blake3,INCLUDE_DIRECTORIES>)
 if(NBL_STATIC_BUILD)
 	target_link_libraries(Nabla INTERFACE blake3)
 else()
@@ -554,6 +555,8 @@ else()
 endif()
 list(APPEND PUBLIC_BUILD_INCLUDE_DIRS ${THIRD_PARTY_SOURCE_DIR}/simdjson)
 
+list(APPEND PUBLIC_BUILD_INCLUDE_DIRS ${THIRD_PARTY_SOURCE_DIR}/fast_float/include)
+
 # libjpeg
 add_dependencies(Nabla jpeg-static)
 if(NBL_STATIC_BUILD)
diff --git a/src/nbl/asset/IAssetManager.cpp b/src/nbl/asset/IAssetManager.cpp
index 29930bccd9..4d5e762aa3 100644
--- a/src/nbl/asset/IAssetManager.cpp
+++ b/src/nbl/asset/IAssetManager.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
 
@@ -8,6 +8,7 @@
 #include "nbl/asset/interchange/CHLSLLoader.h"
 #include "nbl/asset/interchange/CSPVLoader.h"
 
+#include <algorithm>
 #include <array>
 #include <nbl/core/string/StringLiteral.h>	
 
@@ -55,6 +56,10 @@
 #include "nbl/asset/interchange/CSTLMeshWriter.h"
 #endif
 
+#ifdef _NBL_COMPILE_WITH_OBJ_WRITER_
+#include "nbl/asset/interchange/COBJMeshWriter.h"
+#endif
+
 #ifdef _NBL_COMPILE_WITH_PLY_WRITER_
 #include "nbl/asset/interchange/CPLYMeshWriter.h"
 #endif
@@ -160,6 +165,9 @@ void IAssetManager::addLoadersAndWriters()
 #ifdef _NBL_COMPILE_WITH_GLTF_WRITER_
     addAssetWriter(core::make_smart_refctd_ptr<asset::CGLTFWriter>());
 #endif
+#ifdef _NBL_COMPILE_WITH_OBJ_WRITER_
+	addAssetWriter(core::make_smart_refctd_ptr<asset::COBJMeshWriter>());
+#endif
 #ifdef _NBL_COMPILE_WITH_PLY_WRITER_
 	addAssetWriter(core::make_smart_refctd_ptr<asset::CPLYMeshWriter>());
 #endif
@@ -195,22 +203,24 @@ SAssetBundle IAssetManager::getAssetInHierarchy_impl(system::IFile* _file, const
     IAssetLoader::SAssetLoadContext ctx{params,_file};
 
     std::filesystem::path filename = _file ? _file->getFileName() : std::filesystem::path(_supposedFilename);
-    auto file = _override->getLoadFile(_file, filename.string(), ctx, _hierarchyLevel);
+    auto filenameString = filename.string();
+    auto file = _override->getLoadFile(_file, filenameString, ctx, _hierarchyLevel);
 
     filename = file.get() ? file->getFileName() : std::filesystem::path(_supposedFilename);
+    filenameString = filename.string();
     // TODO: should we remove? (is a root absolute path working dir ever needed)
     if (params.workingDirectory.empty())
         params.workingDirectory = filename.parent_path();
 
-    const uint64_t levelFlags = params.cacheFlags >> ((uint64_t)_hierarchyLevel * 2ull);
+    const auto levelFlags = IAssetLoader::caching_flags_t(static_cast<uint64_t>(params.cacheFlags.value) >> ((uint64_t)_hierarchyLevel * 2ull));
 
     SAssetBundle bundle;
-    if ((levelFlags & IAssetLoader::ECF_DUPLICATE_TOP_LEVEL) != IAssetLoader::ECF_DUPLICATE_TOP_LEVEL)
+    if (!levelFlags.hasFlags(IAssetLoader::ECF_DUPLICATE_TOP_LEVEL))
     {
-        auto found = findAssets(filename.string());
+        auto found = findAssets(filenameString);
         if (found->size())
             return _override->chooseRelevantFromFound(found->begin(), found->end(), ctx, _hierarchyLevel);
-        else if (!(bundle = _override->handleSearchFail(filename.string(), ctx, _hierarchyLevel)).getContents().empty())
+        else if (!(bundle = _override->handleSearchFail(filenameString, ctx, _hierarchyLevel)).getContents().empty())
             return bundle;
     }
 
@@ -220,30 +230,36 @@ SAssetBundle IAssetManager::getAssetInHierarchy_impl(system::IFile* _file, const
 
     auto ext = system::extension_wo_dot(filename);
     auto capableLoadersRng = m_loaders.perFileExt.findRange(ext);
-    // loaders associated with the file's extension tryout
+    core::vector<IAssetLoader*> extensionLoaders;
+    extensionLoaders.reserve(8u);
     for (auto& loader : capableLoadersRng)
     {
-        if (loader.second->isALoadableFileFormat(file.get()) && !(bundle = loader.second->loadAsset(file.get(), params, _override, _hierarchyLevel)).getContents().empty())
+        auto* extensionLoader = loader.second;
+        extensionLoaders.push_back(extensionLoader);
+        if (extensionLoader->isALoadableFileFormat(file.get()) && !(bundle = extensionLoader->loadAsset(file.get(), params, _override, _hierarchyLevel)).getContents().empty())
             break;
     }
-    for (auto loaderItr = std::begin(m_loaders.vector); bundle.getContents().empty() && loaderItr != std::end(m_loaders.vector); ++loaderItr) // all loaders tryout
+    for (auto loaderItr = std::begin(m_loaders.vector); bundle.getContents().empty() && loaderItr != std::end(m_loaders.vector); ++loaderItr)
     {
-        if ((*loaderItr)->isALoadableFileFormat(file.get()) && !(bundle = (*loaderItr)->loadAsset(file.get(), params, _override, _hierarchyLevel)).getContents().empty())
+        auto* loader = loaderItr->get();
+        if (std::find(extensionLoaders.begin(), extensionLoaders.end(), loader) != extensionLoaders.end())
+            continue;
+        if (loader->isALoadableFileFormat(file.get()) && !(bundle = loader->loadAsset(file.get(), params, _override, _hierarchyLevel)).getContents().empty())
             break;
     }
 
     if (!bundle.getContents().empty() && 
-        ((levelFlags & IAssetLoader::ECF_DONT_CACHE_TOP_LEVEL) != IAssetLoader::ECF_DONT_CACHE_TOP_LEVEL) &&
-        ((levelFlags & IAssetLoader::ECF_DUPLICATE_TOP_LEVEL) != IAssetLoader::ECF_DUPLICATE_TOP_LEVEL))
+        !levelFlags.hasFlags(IAssetLoader::ECF_DONT_CACHE_TOP_LEVEL) &&
+        !levelFlags.hasFlags(IAssetLoader::ECF_DUPLICATE_TOP_LEVEL))
     {
-        _override->insertAssetIntoCache(bundle, filename.string(), ctx.params, _hierarchyLevel);
+        _override->insertAssetIntoCache(bundle, filenameString, ctx.params, _hierarchyLevel);
     }
     else if (bundle.getContents().empty())
     {
         bool addToCache;
-        bundle = _override->handleLoadFail(addToCache, file.get(), filename.string(), filename.string(), ctx, _hierarchyLevel);
+        bundle = _override->handleLoadFail(addToCache, file.get(), filenameString, filenameString, ctx, _hierarchyLevel);
         if (!bundle.getContents().empty() && addToCache)
-            _override->insertAssetIntoCache(bundle, filename.string(), ctx.params, _hierarchyLevel);
+            _override->insertAssetIntoCache(bundle, filenameString, ctx.params, _hierarchyLevel);
     }            
     return bundle;
 }
diff --git a/src/nbl/asset/ICPUImage.cpp b/src/nbl/asset/ICPUImage.cpp
index cd3f884890..1e06f4ccf7 100644
--- a/src/nbl/asset/ICPUImage.cpp
+++ b/src/nbl/asset/ICPUImage.cpp
@@ -1,4 +1,5 @@
 #include <type_traits>
+#include <memory>
 #include "nbl/asset/ICPUImage.h"
 #include "nbl/asset/filters/CMatchedSizeInOutImageFilterCommon.h"
 #include "nbl/asset/filters/CFlattenRegionsImageFilter.h"
@@ -37,7 +38,7 @@ class CFlattenRegionsStreamHashImageFilter : public CMatchedSizeInOutImageFilter
 			const auto product = parameters.mipLevels * parameters.arrayLayers;
 
 			size_t bufferSize = product * sizeof(CState::outHash);
-			bufferSize += product * sizeof(blake3_hasher);
+			bufferSize += product * sizeof(core::blake3_hasher);
 			bufferSize += getFlattenBufferSize(input);
 
 			return bufferSize;
@@ -136,9 +137,11 @@ class CFlattenRegionsStreamHashImageFilter : public CMatchedSizeInOutImageFilter
 			const auto product = parameters.mipLevels * parameters.arrayLayers;
 			
 			scratch.hashes = { static_cast<CState::hash_t*>(state->scratch.memory), product };
-			scratch.hashers = { reinterpret_cast<blake3_hasher*>(scratch.hashes.data() + scratch.hashes.size()), product };
+			scratch.hashers = { reinterpret_cast<core::blake3_hasher*>(scratch.hashes.data() + scratch.hashes.size()), product };
 			scratch.flatten = { .offset = scratch.hashes.size_bytes() + scratch.hashers.size_bytes(), .size = state->scratch.size - scratch.hashers.size_bytes() - scratch.hashes.size_bytes(), .buffer = buffer};
 		}
+		for (auto& hasher : scratch.hashers)
+			std::construct_at(&hasher);
 
 		const auto isFullyFlatten = scratch.flatten.size == 0ull;
 
@@ -225,7 +228,7 @@ class CFlattenRegionsStreamHashImageFilter : public CMatchedSizeInOutImageFilter
 				auto* const hasher = hashers + pOffset;
 				auto* const hash = hashes + pOffset;
 
-				blake3_hasher_init(hasher);
+				hasher->reset();
 
 				IImage::SSubresourceLayers subresource = { .aspectMask = static_cast<IImage::E_ASPECT_FLAGS>(0u), .mipLevel = miplevel, .baseArrayLayer = layer, .layerCount = 1u }; // stick to given mip level and single layer
 				CMatchedSizeInOutImageFilterCommon::state_type::TexelRange range = { .offset = {}, .extent = { parameters.extent.width, parameters.extent.height, parameters.extent.depth } }; // cover all texels within layer range, take 0th mip level size to not clip anything at all
@@ -233,7 +236,7 @@ class CFlattenRegionsStreamHashImageFilter : public CMatchedSizeInOutImageFilter
 
 				auto executePerTexelOrBlock = [&](uint32_t readBlockArrayOffset, core::vectorSIMDu32 readBlockPos) -> void
 				{
-					blake3_hasher_update(hasher, inData + readBlockArrayOffset, texelOrBlockByteSize);
+					hasher->update(inData + readBlockArrayOffset, texelOrBlockByteSize);
 				};
 
 				const auto regions = image->getRegions(miplevel);
@@ -242,7 +245,7 @@ class CFlattenRegionsStreamHashImageFilter : public CMatchedSizeInOutImageFilter
 				if (!performNullHash)
 					CBasicImageFilterCommon::executePerRegion(std::execution::seq, image, executePerTexelOrBlock, regions, clipFunctor); // fire the hasher for a layer, note we forcing seq policy because texels/blocks cannot be handled with par policies when we hash them
 
-				blake3_hasher_finalize(hasher, reinterpret_cast<uint8_t*>(hash), sizeof(CState::hash_t)); // finalize hash for layer + put it to heap for given mip level	
+				*hash = static_cast<CState::hash_t>(*hasher); // finalize hash for layer + put it to heap for given mip level
 			};
 
 			std::for_each(policy, layers.begin(), layers.end(), executePerLayer); // fire per layer for given given mip level with specified execution policy, yes you can use parallel policy here if you want at it will work
@@ -255,8 +258,8 @@ class CFlattenRegionsStreamHashImageFilter : public CMatchedSizeInOutImageFilter
 			time to use them and compute final hash
 		*/
 
-		blake3_hasher hasher;
-		blake3_hasher_init(&hasher);
+		core::blake3_hasher hasher;
+		hasher.reset();
 		{
 			for (auto miplevel = 0u; miplevel < parameters.mipLevels; ++miplevel)
 			{
@@ -265,11 +268,11 @@ class CFlattenRegionsStreamHashImageFilter : public CMatchedSizeInOutImageFilter
 				for (auto layer = 0u; layer < parameters.arrayLayers; ++layer)
 				{
 					auto* hash = hashes + mipOffset + layer;
-					blake3_hasher_update(&hasher, hash->data, sizeof(CState::hash_t));
+					hasher.update(hash->data, sizeof(CState::hash_t));
 				}
 			}
 
-			blake3_hasher_finalize(&hasher, reinterpret_cast<uint8_t*>(&state->outHash), sizeof(CState::hash_t)); // finalize output hash for whole image given all hashes
+			state->outHash = static_cast<CState::hash_t>(hasher); // finalize output hash for whole image given all hashes
 		}
 
 		return true;
@@ -284,7 +287,7 @@ class CFlattenRegionsStreamHashImageFilter : public CMatchedSizeInOutImageFilter
 	struct ScratchMap
 	{
 		std::span<CState::hash_t> hashes; // hashes, single hash is obtained from given miplevel & layer, full hash for an image is a hash of this hash buffer
-		std::span<blake3_hasher> hashers; // hashers, used to produce a hash
+		std::span<core::blake3_hasher> hashers; // hashers, used to produce a hash
 		asset::SBufferRange<asset::ICPUBuffer> flatten; // tightly packed texels from input, no memory gaps
 	};
 };
@@ -307,4 +310,4 @@ core::blake3_hash_t ICPUImage::computeContentHash() const
 	assert(passed); // actually this should never fail, leaving in case
 
 	return state.outHash;
-}
\ No newline at end of file
+}
diff --git a/src/nbl/asset/interchange/CGLIWriter.h b/src/nbl/asset/interchange/CGLIWriter.h
index db88583054..fccde37735 100644
--- a/src/nbl/asset/interchange/CGLIWriter.h
+++ b/src/nbl/asset/interchange/CGLIWriter.h
@@ -35,9 +35,9 @@ class CGLIWriter final : public asset::IAssetWriter
 
 		uint64_t getSupportedAssetTypesBitfield() const override { return asset::IAsset::ET_IMAGE_VIEW; }
 
-		uint32_t getSupportedFlags() override { return asset::EWF_NONE | asset::EWF_BINARY; }
+		writer_flags_t getSupportedFlags() override { return asset::EWF_BINARY; }
 
-		uint32_t getForcedFlags() override { return asset::EWF_NONE | asset::EWF_BINARY; }
+		writer_flags_t getForcedFlags() override { return asset::EWF_BINARY; }
 
 		bool writeAsset(system::IFile* _file, const SAssetWriteParams& _params, IAssetWriterOverride* _override = nullptr) override;
 
diff --git a/src/nbl/asset/interchange/CGLTFLoader.cpp b/src/nbl/asset/interchange/CGLTFLoader.cpp
index fde9552179..c7c4be034b 100644
--- a/src/nbl/asset/interchange/CGLTFLoader.cpp
+++ b/src/nbl/asset/interchange/CGLTFLoader.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2020 AnastaZIuk
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in Nabla.h
 
@@ -60,7 +60,7 @@ using namespace nbl::asset;
 
 				core::smart_refctd_ptr<const system::IFile> glslFile = loadBuiltinData(decltype(constexprStringType)::value);
 				auto glsl = asset::ICPUBuffer::create({ glslFile->getSize() });
-				memcpy(glsl->getPointer(),glslFile->getMappedPointer(),glsl->getSize());
+				memcpy(glsl->getPointer(),static_cast<const system::IFile*>(glslFile.get())->getMappedPointer(),glsl->getSize());
 
 				auto unspecializedShader = core::make_smart_refctd_ptr<asset::ICPUShader>(std::move(glsl), stage, asset::ICPUShader::E_CONTENT_TYPE::ECT_GLSL, stage != ICPUShader::ESS_VERTEX ? "?IrrlichtBAW glTFLoader FragmentShader?" : "?IrrlichtBAW glTFLoader VertexShader?");
 				if (extraDefine)
@@ -1274,26 +1274,26 @@ using namespace nbl::asset;
 													auto* packedJointsData = reinterpret_cast<JointComponentT*>(reinterpret_cast<uint8_t*>(vOverrideRepackedJointsBuffer->getPointer()) + vAttributeIx * repackJointsTexelByteSize);
 													auto* packedWeightsData = reinterpret_cast<WeightCompomentT*>(reinterpret_cast<uint8_t*>(vOverrideRepackedWeightsBuffer->getPointer()) + vAttributeIx * repackWeightsTexelByteSize);
 
-													auto quantize = [&](const core::vectorSIMDf& input, void* data, const E_FORMAT requestQuantizeFormat)
+													auto quantize = [&](const hlsl::float32_t4& input, void* data, const E_FORMAT requestQuantizeFormat)
 													{
-														return ICPUMeshBuffer::setAttribute(input, data, requestQuantizeFormat);
+														return ICPUMeshBuffer::setAttribute(&input[0], data, requestQuantizeFormat);
 													};
 
 													auto decodeQuant = [&](void* data, const E_FORMAT requestQuantizeFormat)
 													{
-														core::vectorSIMDf out;
-														ICPUMeshBuffer::getAttribute(out, data, requestQuantizeFormat);
+														hlsl::float32_t4 out = {};
+														ICPUMeshBuffer::getAttribute(&out[0], data, requestQuantizeFormat);
 														return out;
 													};
 
-													core::vectorSIMDf packedWeightsStream; //! always go with full vectorSIMDf stream, weights being not used are leaved with default vector's compoment value and are not considered
+													hlsl::float32_t4 packedWeightsStream = {}; //! always go with full float4 stream, weights being not used are leaved with default vector's compoment value and are not considered
 
 													for (uint16_t i = 0, vxSkinComponentOffset = 0; i < 4u; ++i) //! packing
 													{
 														if (unpackedWeightsData[i])
 														{
 															packedJointsData[vxSkinComponentOffset] = unpackedJointsData[i];
-															packedWeightsStream.pointer[i] = packedWeightsData[vxSkinComponentOffset] = unpackedWeightsData[i];
+															packedWeightsStream[i] = packedWeightsData[vxSkinComponentOffset] = unpackedWeightsData[i];
 
 															++vxSkinComponentOffset;
 															assert(vxSkinComponentOffset <= maxJointsPerVertex);
@@ -1309,14 +1309,14 @@ using namespace nbl::asset;
 														const E_FORMAT requestQuantFormat = std::get<E_FORMAT>(encode);
 
 														quantize(packedWeightsStream, quantBuffer, requestQuantFormat);
-														core::vectorSIMDf quantsDecoded = decodeQuant(quantBuffer, requestQuantFormat);
+														hlsl::float32_t4 quantsDecoded = decodeQuant(quantBuffer, requestQuantFormat);
 
 														for (uint16_t i = 0; i < MAX_INFLUENCE_WEIGHTS_PER_VERTEX; ++i)
 														{
-															const auto& weightInput = packedWeightsStream.pointer[i];
+															const auto weightInput = packedWeightsStream[i];
 															if (weightInput)
 															{
-																const typename QuantRequest::ERROR_TYPE& errorComponent = errorBuffer[i] = core::abs(quantsDecoded.pointer[i] - weightInput);
+																const typename QuantRequest::ERROR_TYPE& errorComponent = errorBuffer[i] = core::abs(quantsDecoded[i] - weightInput);
 
 																if (errorComponent)
 																{
@@ -1420,13 +1420,13 @@ using namespace nbl::asset;
 															const size_t quantizedVWeightsOffset = vAttributeIx * weightComponentsByteStride;
 															void* quantizedWeightsData = reinterpret_cast<uint8_t*>(vOverrideQuantizedWeightsBuffer->getPointer()) + quantizedVWeightsOffset;
 
-															core::vectorSIMDf packedWeightsStream; //! always go with full vectorSIMDf stream, weights being not used are leaved with default vector's compoment value and are not considered
+															hlsl::float32_t4 packedWeightsStream = {}; //! always go with full float4 stream, weights being not used are leaved with default vector's compoment value and are not considered
 															auto* packedWeightsData = reinterpret_cast<WeightCompomentT*>(reinterpret_cast<uint8_t*>(vOverrideRepackedWeightsBuffer->getPointer()) + vAttributeIx * repackWeightsTexelByteSize);
 
 															for (uint16_t i = 0; i < maxJointsPerVertex; ++i)
-																packedWeightsStream.pointer[i] = packedWeightsData[i];
+																packedWeightsStream[i] = packedWeightsData[i];
 
-															ICPUMeshBuffer::setAttribute(packedWeightsStream, quantizedWeightsData, weightsQuantizeFormat); //! quantize
+															ICPUMeshBuffer::setAttribute(&packedWeightsStream[0], quantizedWeightsData, weightsQuantizeFormat); //! quantize
 														}
 													}
 
diff --git a/src/nbl/asset/interchange/CGLTFWriter.h b/src/nbl/asset/interchange/CGLTFWriter.h
index 6184bc0be2..7fde5eb319 100644
--- a/src/nbl/asset/interchange/CGLTFWriter.h
+++ b/src/nbl/asset/interchange/CGLTFWriter.h
@@ -40,9 +40,9 @@ namespace nbl
 
 				uint64_t getSupportedAssetTypesBitfield() const override { return asset::IAsset::ET_MESH; }
 
-				uint32_t getSupportedFlags() override { return asset::EWF_NONE; }
+				writer_flags_t getSupportedFlags() override { return asset::EWF_NONE; }
 
-				uint32_t getForcedFlags() override { return asset::EWF_NONE; }
+				writer_flags_t getForcedFlags() override { return asset::EWF_NONE; }
 
 				bool writeAsset(system::IFile* _file, const SAssetWriteParams& _params, IAssetWriterOverride* _override = nullptr) override;
 		};
diff --git a/src/nbl/asset/interchange/CGraphicsPipelineLoaderMTL.cpp b/src/nbl/asset/interchange/CGraphicsPipelineLoaderMTL.cpp
index b538f75eb3..6521fa9775 100644
--- a/src/nbl/asset/interchange/CGraphicsPipelineLoaderMTL.cpp
+++ b/src/nbl/asset/interchange/CGraphicsPipelineLoaderMTL.cpp
@@ -9,6 +9,8 @@
 #include <utility>
 #include <regex>
 #include <filesystem>
+#include <charconv>
+#include <fast_float/fast_float.h>
 
 #include "nbl/system/CFileView.h"
 
@@ -458,10 +460,15 @@ const char* CGraphicsPipelineLoaderMTL::readTexture(const char* _bufPtr, const c
                     mapType = found->second;
             }
         }
-        else if (strncmp(_bufPtr,"-bm",3)==0)
+		else if (strncmp(_bufPtr,"-bm",3)==0)
 		{
 			_bufPtr = goAndCopyNextWord(tmpbuf, _bufPtr, WORD_BUFFER_LENGTH, _bufEnd);
-			sscanf(tmpbuf, "%f", &_currMaterial->params.bumpFactor);
+            const char* tokenEnd = tmpbuf;
+            while (*tokenEnd != '\0')
+                ++tokenEnd;
+            const auto parseResult = fast_float::from_chars(tmpbuf, tokenEnd, _currMaterial->params.bumpFactor);
+            if (!(parseResult.ec == std::errc() && parseResult.ptr == tokenEnd))
+                _currMaterial->params.bumpFactor = 0.f;
 		}
 		else
 		if (strncmp(_bufPtr,"-blendu",7)==0)
@@ -763,12 +770,15 @@ auto CGraphicsPipelineLoaderMTL::readMaterials(system::IFile* _file, const syste
     char tmpbuf[WORD_BUFFER_LENGTH]{};
 
     auto readFloat = [&tmpbuf, &bufPtr, bufEnd] {
-        float f = 0.f;
-
         bufPtr = goAndCopyNextWord(tmpbuf, bufPtr, WORD_BUFFER_LENGTH, bufEnd);
-        sscanf(tmpbuf, "%f", &f);
 
-        return f;
+        const char* tokenEnd = tmpbuf;
+        while (*tokenEnd != '\0')
+            ++tokenEnd;
+
+        float f = 0.f;
+        const auto parseResult = fast_float::from_chars(tmpbuf, tokenEnd, f);
+        return (parseResult.ec == std::errc() && parseResult.ptr == tokenEnd) ? f : 0.f;
     };
     auto readRGB = [&readFloat] {
         core::vector3df_SIMD rgb(1.f);
@@ -817,7 +827,13 @@ auto CGraphicsPipelineLoaderMTL::readMaterials(system::IFile* _file, const syste
             if (currMaterial)
             {
                 bufPtr = goAndCopyNextWord(tmpbuf, bufPtr, WORD_BUFFER_LENGTH, bufEnd);
-                currMaterial->params.extra |= (atol(tmpbuf)&0x0f);//illum values are in range [0;10]
+                const char* tokenEnd = tmpbuf;
+                while (*tokenEnd != '\0')
+                    ++tokenEnd;
+                uint32_t illum = 0u;
+                const auto parseResult = std::from_chars(tmpbuf, tokenEnd, illum, 10);
+                if (parseResult.ec == std::errc() && parseResult.ptr == tokenEnd)
+                    currMaterial->params.extra |= (illum & 0x0fu);//illum values are in range [0;10]
             }
             break;
         case 'N':
diff --git a/src/nbl/asset/interchange/CImageWriterJPG.cpp b/src/nbl/asset/interchange/CImageWriterJPG.cpp
index 3943e207ed..4557b0e3d4 100644
--- a/src/nbl/asset/interchange/CImageWriterJPG.cpp
+++ b/src/nbl/asset/interchange/CImageWriterJPG.cpp
@@ -198,16 +198,16 @@ bool CImageWriterJPG::writeAsset(system::IFile* _file, const SAssetWriteParams&
 #else
 	SAssetWriteContext ctx{ _params, _file };
 
-	auto imageView = IAsset::castDown<const ICPUImageView>(_params.rootAsset);
+    auto imageView = IAsset::castDown<const ICPUImageView>(_params.rootAsset);
 
     system::IFile* file = _override->getOutputFile(_file, ctx, { imageView, 0u});
-    const asset::E_WRITER_FLAGS flags = _override->getAssetWritingFlags(ctx, imageView, 0u);
+    const auto flags = _override->getAssetWritingFlags(ctx, imageView, 0u);
     const float comprLvl = _override->getAssetCompressionLevel(ctx, imageView, 0u);
 
-	return writeJPEGFile(file, m_system.get(), imageView, (!!(flags & asset::EWF_COMPRESSED)) * static_cast<uint32_t>((1.f-comprLvl)*100.f), _params.logger); // if quality==0, then it defaults to 75
+	return writeJPEGFile(file, m_system.get(), imageView, flags.hasAnyFlag(asset::EWF_COMPRESSED) * static_cast<uint32_t>((1.f-comprLvl)*100.f), _params.logger); // if quality==0, then it defaults to 75
 
 #endif//!defined(_NBL_COMPILE_WITH_LIBJPEG_ )
 }
 
 #undef OUTPUT_BUF_SIZE
-#endif
\ No newline at end of file
+#endif
diff --git a/src/nbl/asset/interchange/CImageWriterJPG.h b/src/nbl/asset/interchange/CImageWriterJPG.h
index 40157f0bf6..1d2b5f2963 100644
--- a/src/nbl/asset/interchange/CImageWriterJPG.h
+++ b/src/nbl/asset/interchange/CImageWriterJPG.h
@@ -33,9 +33,9 @@ class CImageWriterJPG : public asset::IAssetWriter
 
         virtual uint64_t getSupportedAssetTypesBitfield() const override { return asset::IAsset::ET_IMAGE_VIEW; }
 
-        virtual uint32_t getSupportedFlags() override { return asset::EWF_COMPRESSED; }
+        virtual writer_flags_t getSupportedFlags() override { return asset::EWF_COMPRESSED; }
 
-        virtual uint32_t getForcedFlags() { return asset::EWF_BINARY; }
+        virtual writer_flags_t getForcedFlags() override { return asset::EWF_BINARY; } // always reports EWF_BINARY; `override` makes the compiler check this against IAssetWriter
 
         virtual bool writeAsset(system::IFile* _file, const SAssetWriteParams& _params, IAssetWriterOverride* _override = nullptr) override;
 };
diff --git a/src/nbl/asset/interchange/CImageWriterOpenEXR.h b/src/nbl/asset/interchange/CImageWriterOpenEXR.h
index 37da219c64..5a2e0a1cda 100644
--- a/src/nbl/asset/interchange/CImageWriterOpenEXR.h
+++ b/src/nbl/asset/interchange/CImageWriterOpenEXR.h
@@ -33,9 +33,9 @@ class CImageWriterOpenEXR final : public IImageWriter
 
 		uint64_t getSupportedAssetTypesBitfield() const override { return asset::IAsset::ET_IMAGE_VIEW; }
 
-		uint32_t getSupportedFlags() override { return asset::EWF_BINARY; }
+		writer_flags_t getSupportedFlags() override { return asset::EWF_BINARY; }
 
-		uint32_t getForcedFlags() { return asset::EWF_BINARY; }
+		writer_flags_t getForcedFlags() override { return asset::EWF_BINARY; } // always reports EWF_BINARY; `override` makes the compiler check this against the base writer interface
 
 		bool writeAsset(system::IFile* _file, const SAssetWriteParams& _params, IAssetWriterOverride* _override = nullptr) override;
 
diff --git a/src/nbl/asset/interchange/CImageWriterPNG.h b/src/nbl/asset/interchange/CImageWriterPNG.h
index ec2f3b39ef..5111df6ac5 100644
--- a/src/nbl/asset/interchange/CImageWriterPNG.h
+++ b/src/nbl/asset/interchange/CImageWriterPNG.h
@@ -39,9 +39,9 @@ class CImageWriterPNG : public asset::IAssetWriter
     
     virtual uint64_t getSupportedAssetTypesBitfield() const override { return asset::IAsset::ET_IMAGE_VIEW; }
     
-    virtual uint32_t getSupportedFlags() override { return 0u; }
-    
-    virtual uint32_t getForcedFlags() { return asset::EWF_BINARY; }
+    virtual writer_flags_t getSupportedFlags() override { return asset::EWF_NONE; }
+
+    virtual writer_flags_t getForcedFlags() override { return asset::EWF_BINARY; } // always reports EWF_BINARY; `override` makes the compiler check this against IAssetWriter
     
     virtual bool writeAsset(system::IFile* _file, const SAssetWriteParams& _params, IAssetWriterOverride* _override = nullptr) override;
 };
diff --git a/src/nbl/asset/interchange/CImageWriterTGA.h b/src/nbl/asset/interchange/CImageWriterTGA.h
index 2341d1a910..a741898fbb 100644
--- a/src/nbl/asset/interchange/CImageWriterTGA.h
+++ b/src/nbl/asset/interchange/CImageWriterTGA.h
@@ -33,9 +33,9 @@ class CImageWriterTGA : public asset::IAssetWriter
         	return asset::IAsset::ET_IMAGE_VIEW;
         }
 
-        virtual uint32_t getSupportedFlags() override { return 0u; }
+        virtual writer_flags_t getSupportedFlags() override { return asset::EWF_NONE; }
 
-        virtual uint32_t getForcedFlags() { return asset::EWF_BINARY; }
+        virtual writer_flags_t getForcedFlags() override { return asset::EWF_BINARY; } // always reports EWF_BINARY; `override` makes the compiler check this against IAssetWriter
 
         virtual bool writeAsset(system::IFile* _file, const SAssetWriteParams& _params, IAssetWriterOverride* _override = nullptr) override;
 };
diff --git a/src/nbl/asset/interchange/COBJMeshFileLoader.cpp b/src/nbl/asset/interchange/COBJMeshFileLoader.cpp
index 69651f8061..0662628799 100644
--- a/src/nbl/asset/interchange/COBJMeshFileLoader.cpp
+++ b/src/nbl/asset/interchange/COBJMeshFileLoader.cpp
@@ -1,747 +1,958 @@
-// Copyright (C) 2019 - DevSH Graphics Programming Sp. z O.O.
+#ifdef _NBL_COMPILE_WITH_OBJ_LOADER_
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine" and was originally part of the "Irrlicht Engine"
 // For conditions of distribution and use, see copyright notice in nabla.h
 // See the original file in irrlicht source for authors
-
 #include "nbl/core/declarations.h"
-
 #include "nbl/asset/IAssetManager.h"
+#include "nbl/asset/ICPUGeometryCollection.h"
+#include "nbl/asset/ICPUMorphTargets.h"
+#include "nbl/asset/ICPUScene.h"
+#include "nbl/asset/interchange/SGeometryContentHash.h"
+#include "nbl/asset/interchange/SGeometryLoaderCommon.h"
+#include "nbl/asset/interchange/SOBJPolygonGeometryAuxLayout.h"
+#include "nbl/asset/interchange/SInterchangeIO.h"
+#include "nbl/asset/interchange/SLoaderRuntimeTuning.h"
 #include "nbl/asset/utils/CPolygonGeometryManipulator.h"
-
-#ifdef _NBL_COMPILE_WITH_OBJ_LOADER_
-
-#include "nbl/system/ISystem.h"
+#include "nbl/builtin/hlsl/shapes/AABBAccumulator.hlsl"
 #include "nbl/system/IFile.h"
-
-#include "nbl/asset/utils/CQuantNormalCache.h"
-
 #include "COBJMeshFileLoader.h"
-
-#include <filesystem>
-
-namespace nbl
-{
-namespace asset
+#include "impl/SFileAccess.h"
+#include "impl/STextParse.h"
+#include <array>
+#include <bit>
+#include <cctype>
+#include <optional>
+#include <span>
+#include <string_view>
+#include <unordered_map>
+namespace nbl::asset
 {
-
-//#ifdef _NBL_DEBUG
-#define _NBL_DEBUG_OBJ_LOADER_
-//#endif
-
-static const uint32_t WORD_BUFFER_LENGTH = 512;
-
-constexpr uint32_t POSITION = 0u;
-constexpr uint32_t UV = 2u;
-constexpr uint32_t NORMAL = 3u;
-constexpr uint32_t BND_NUM = 0u;
-
-//! Constructor
-COBJMeshFileLoader::COBJMeshFileLoader(IAssetManager* _manager) : AssetManager(_manager), System(_manager->getSystem())
+namespace
 {
-}
-
-
-//! destructor
-COBJMeshFileLoader::~COBJMeshFileLoader()
-{
-}
-
-asset::SAssetBundle COBJMeshFileLoader::loadAsset(system::IFile* _file, const asset::IAssetLoader::SAssetLoadParams& _params, asset::IAssetLoader::IAssetLoaderOverride* _override, uint32_t _hierarchyLevel)
+struct Parse
 {
-    SContext ctx(
-        asset::IAssetLoader::SAssetLoadContext{
-            _params,
-            _file
-        },
-		_hierarchyLevel,
-        _override
-    );
-
-	if (_params.meshManipulatorOverride == nullptr)
+	using Common = impl::TextParse;
+	struct VertexDedupNode { int32_t uv = -1; int32_t normal = -1; uint32_t smoothingGroup = 0u; uint32_t outIndex = 0u; int32_t next = -1; }; // uv/normal/next default to -1; NOTE(review): presumably -1 marks "absent" / end of chain - confirm against the dedup code
+	static bool resolveIndex(const int32_t rawIndex, const size_t elementCount, int32_t& resolved) // converts a raw index (positive = 1-based, otherwise an offset from elementCount) to 0-based; returns false and leaves `resolved` untouched when out of range
 	{
-		_NBL_DEBUG_BREAK_IF(true);
-		assert(false);
+		if (rawIndex > 0)
+		{
+			const uint64_t oneBased = static_cast<uint64_t>(rawIndex);
+			if (oneBased == 0ull) // NOTE(review): unreachable, rawIndex > 0 was already established above
+				return false;
+			const uint64_t zeroBased = oneBased - 1ull;
+			if (zeroBased >= elementCount)
+				return false;
+			resolved = static_cast<int32_t>(zeroBased);
+			return true;
+		}
+		const int64_t zeroBased = static_cast<int64_t>(elementCount) + static_cast<int64_t>(rawIndex); // rawIndex <= 0 here: -1 maps to the last element, 0 maps to elementCount and is rejected below
+		if (zeroBased < 0 || zeroBased >= static_cast<int64_t>(elementCount))
+			return false;
+		resolved = static_cast<int32_t>(zeroBased);
+		return true;
 	}
-
-	CQuantNormalCache* const quantNormalCache = _params.meshManipulatorOverride->getQuantNormalCache();
-
-	const long filesize = _file->getSize();
-	if (!filesize)
-        return {};
-
-	const uint32_t WORD_BUFFER_LENGTH = 512u;
-    char tmpbuf[WORD_BUFFER_LENGTH]{};
-
-	uint32_t smoothingGroup=0;
-
-	const std::filesystem::path fullName = _file->getFileName();
-	const std::string relPath = [&fullName]() -> std::string
+	static void parseSmoothingGroup(const char* linePtr, const char* const lineEnd, uint32_t& outGroup) // sets outGroup from the first token: "on" -> 1; "off", empty or unparsable -> 0; otherwise the parsed number
 	{
-		auto dir = fullName.parent_path().string();
-		return dir;
-	}();
-
-    //value_type: directory from which .mtl (pipeline) was loaded and the pipeline
-	using pipeline_meta_pair_t = std::pair<core::smart_refctd_ptr<ICPURenderpassIndependentPipeline>,const CMTLMetadata::CRenderpassIndependentPipeline*>;
-	struct hash_t
+		Common::skipInlineWhitespace(linePtr, lineEnd);
+		if (linePtr >= lineEnd)
+			return void(outGroup = 0u);
+		const char* const tokenStart = linePtr;
+		while (linePtr < lineEnd && !Common::isInlineWhitespace(*linePtr))
+			++linePtr;
+		const std::string_view token(tokenStart, static_cast<size_t>(linePtr - tokenStart)); // token bytes come straight from the file and may be >= 0x80, hence the unsigned char casts below (std::tolower is UB for negative values)
+		if (token.size() == 2u && std::tolower(static_cast<unsigned char>(token[0])) == 'o' && std::tolower(static_cast<unsigned char>(token[1])) == 'n')
+			return void(outGroup = 1u);
+		if (token.size() == 3u && std::tolower(static_cast<unsigned char>(token[0])) == 'o' && std::tolower(static_cast<unsigned char>(token[1])) == 'f' && std::tolower(static_cast<unsigned char>(token[2])) == 'f')
+			return void(outGroup = 0u);
+		uint32_t value = 0u;
+		outGroup = Common::parseExactNumber(token, value) ? value : 0u; // anything that is not a complete number falls back to group 0
+	}
+	static std::string parseIdentifier(const char* linePtr, const char* const lineEnd, const std::string_view fallback) // returns the text in [linePtr, lineEnd) without surrounding inline whitespace, or a copy of `fallback` when nothing remains
 	{
-		inline auto operator()(const pipeline_meta_pair_t& item) const
+		const char* endPtr = lineEnd;
+		Common::skipInlineWhitespace(linePtr, lineEnd); // presumably advances linePtr past leading whitespace - confirm in impl/STextParse.h
+		while (endPtr > linePtr && Common::isInlineWhitespace(endPtr[-1])) // trim trailing inline whitespace; interior whitespace is kept
+			--endPtr;
+		if (linePtr >= endPtr)
+			return std::string(fallback);
+		return std::string(linePtr, static_cast<size_t>(endPtr - linePtr));
+	}
+	static bool parseTrianglePositiveTripletLine(const char* const lineStart, const char* const lineEnd, std::array<hlsl::int32_t3, 3>& out, const size_t posCount, const size_t uvCount, const size_t normalCount) // parses exactly three "pos/uv/normal" corners into 0-based (pos, uv, normal) triplets; returns false on any other shape or an index above its element count
+	{
+		const char* ptr = lineStart;
+		auto parsePositive = [&](const size_t count, int32_t& outIx) -> bool { // reads one number at ptr and range-checks it against count
+			uint32_t value = 0u;
+			if (!Common::parseNonZeroNumber(ptr, lineEnd, value))
+				return false;
+			if (value > count)
+				return false;
+			outIx = value - 1u; // index in the line is 1-based, stored index is 0-based
+			return true;
+		};
+		for (uint32_t corner = 0u; corner < 3u; ++corner)
 		{
-			return std::hash<std::string>()(item.second->m_name);
+			Common::skipInlineWhitespace(ptr, lineEnd);
+			if (ptr >= lineEnd || !Common::isDigit(*ptr)) // a corner must start with a digit, so signed (relative) indices are rejected here
+				return false;
+			int32_t posIx = -1;
+			if (!parsePositive(posCount, posIx))
+				return false;
+			if (ptr >= lineEnd || *ptr != '/')
+				return false;
+			++ptr;
+			int32_t uvIx = -1;
+			if (!parsePositive(uvCount, uvIx))
+				return false;
+			if (ptr >= lineEnd || *ptr != '/')
+				return false;
+			++ptr;
+			int32_t normalIx = -1;
+			if (!parsePositive(normalCount, normalIx))
+				return false;
+			out[corner] = hlsl::int32_t3(posIx, uvIx, normalIx);
 		}
-	};
-	struct key_equal_t
+		Common::skipInlineWhitespace(ptr, lineEnd);
+		return ptr == lineEnd; // anything left after the third corner (presumably other than inline whitespace) fails the parse
+	}
+	static bool parseTrianglePositivePositionNormalLine(const char* const lineStart, const char* const lineEnd, std::array<hlsl::int32_t3, 3>& out, const size_t posCount, const size_t normalCount)
 	{
-		inline bool operator()(const pipeline_meta_pair_t& lhs, const pipeline_meta_pair_t& rhs) const
+		const char* ptr = lineStart;
+		auto parsePositive = [&](const size_t count, int32_t& outIx) -> bool {
+			uint32_t value = 0u;
+			if (!Common::parseNonZeroNumber(ptr, lineEnd, value))
+				return false;
+			if (value > count)
+				return false;
+			outIx = value - 1u;
+			return true;
+		};
+		for (uint32_t corner = 0u; corner < 3u; ++corner)
 		{
-			return lhs.second->m_name==rhs.second->m_name;
+			Common::skipInlineWhitespace(ptr, lineEnd);
+			if (ptr >= lineEnd || !Common::isDigit(*ptr))
+				return false;
+			int32_t posIx = -1;
+			if (!parsePositive(posCount, posIx))
+				return false;
+			if ((ptr + 1) >= lineEnd || ptr[0] != '/' || ptr[1] != '/')
+				return false;
+			ptr += 2;
+			int32_t normalIx = -1;
+			if (!parsePositive(normalCount, normalIx))
+				return false;
+			out[corner] = hlsl::int32_t3(posIx, -1, normalIx);
 		}
-	};
-    core::unordered_multiset<pipeline_meta_pair_t,hash_t,key_equal_t> pipelines;
-
-	// TODO: map the file whenever possible
-    std::string fileContents;
-    fileContents.resize(filesize);
-	char* const buf = fileContents.data();
-
-	system::IFile::success_t success;
-	_file->read(success, buf, 0, filesize);
-	if (!success)
-		return {};
-
-	const char* const bufEnd = buf+filesize;
-	// Process obj information
-	const char* bufPtr = buf;
-	std::string grpName, mtlName;
-
-	auto performActionBasedOnOrientationSystem = [&](auto performOnRightHanded, auto performOnLeftHanded)
-	{
-		if (_params.loaderFlags & E_LOADER_PARAMETER_FLAGS::ELPF_RIGHT_HANDED_MESHES)
-			performOnRightHanded();
-		else
-			performOnLeftHanded();
-	};
-
-
-    struct vec3 {
-        float data[3];
-    };
-    struct vec2 {
-        float data[2];
-    };
-    core::vector<vec3> vertexBuffer;
-    core::vector<vec3> normalsBuffer;
-    core::vector<vec2> textureCoordBuffer;
-
-    core::vector<core::smart_refctd_ptr<ICPUMeshBuffer>> submeshes;
-    core::vector<core::vector<uint32_t>> indices;
-    core::vector<SObjVertex> vertices;
-    core::map<SObjVertex, uint32_t> map_vtx2ix;
-    core::vector<bool> recalcNormals;
-    core::vector<bool> submeshWasLoadedFromCache;
-    core::vector<std::string> submeshCacheKeys;
-    core::vector<std::string> submeshMaterialNames;
-    core::vector<uint32_t> vtxSmoothGrp;
-
-	// TODO: handle failures much better!
-	constexpr const char* NO_MATERIAL_MTL_NAME = "#";
-	bool noMaterial = true;
-	bool dummyMaterialCreated = false;
-	while(bufPtr != bufEnd)
+		Common::skipInlineWhitespace(ptr, lineEnd);
+		return ptr == lineEnd;
+	}
+	static bool parseFaceVertexToken(const char*& linePtr, const char* const lineEnd, hlsl::int32_t3& idx, const size_t posCount, const size_t uvCount, const size_t normalCount)
 	{
-		switch(bufPtr[0])
+		Common::skipInlineWhitespace(linePtr, lineEnd);
+		if (linePtr >= lineEnd)
+			return false;
+		idx = hlsl::int32_t3(-1, -1, -1);
+		const char* ptr = linePtr;
+		auto parsePositive = [&](const size_t count, int32_t& outIx) -> bool {
+			uint32_t raw = 0u;
+			if (!Common::parseNonZeroNumber(ptr, lineEnd, raw))
+				return false;
+			if (raw > count)
+				return false;
+			outIx = raw - 1u;
+			return true;
+		};
+		auto parseResolved = [&](const size_t count, int32_t& outIx) -> bool {
+			int32_t raw = 0;
+			return Common::parseNonZeroNumber(ptr, lineEnd, raw) && resolveIndex(raw, count, outIx);
+		};
+		if (*ptr != '-' && *ptr != '+')
 		{
-		case 'm':	// mtllib (material)
-		{
-			if (ctx.useMaterials)
+			if (!parsePositive(posCount, idx.x))
+				return false;
+			if (ptr < lineEnd && *ptr == '/')
 			{
-				bufPtr = goAndCopyNextWord(tmpbuf, bufPtr, WORD_BUFFER_LENGTH, bufEnd);
-				_params.logger.log("Reading material _file %s", system::ILogger::ELL_DEBUG, tmpbuf);
-
-                std::string mtllib = tmpbuf;
-                std::replace(mtllib.begin(), mtllib.end(), '\\', '/');
-                SAssetLoadParams loadParams(_params);
-				loadParams.workingDirectory = _file->getFileName().parent_path();
-                auto bundle = interm_getAssetInHierarchy(AssetManager, mtllib, loadParams, _hierarchyLevel+ICPUMesh::PIPELINE_HIERARCHYLEVELS_BELOW, _override);
-                
-				if (bundle.getContents().empty())
-					break;
-
-				if (bundle.getMetadata())
+				++ptr;
+				if (ptr < lineEnd && *ptr != '/')
+				{
+					if (!parsePositive(uvCount, idx.y))
+						return false;
+				}
+				if (ptr < lineEnd && *ptr == '/')
 				{
-					auto meta = bundle.getMetadata()->selfCast<const CMTLMetadata>();
-					if (bundle.getAssetType()==IAsset::ET_RENDERPASS_INDEPENDENT_PIPELINE)
-					for (auto ass : bundle.getContents())
+					++ptr;
+					if (ptr < lineEnd && !Common::isInlineWhitespace(*ptr))
 					{
-						auto ppln = core::smart_refctd_ptr_static_cast<ICPURenderpassIndependentPipeline>(ass);
-						const auto pplnMeta = meta->getAssetSpecificMetadata(ppln.get());
-						if (!pplnMeta)
-							continue;
-
-						pipelines.emplace(std::move(ppln),pplnMeta);
+						if (!parsePositive(normalCount, idx.z))
+							return false;
 					}
 				}
+				else if (ptr < lineEnd && !Common::isInlineWhitespace(*ptr))
+					return false;
 			}
+			else if (ptr < lineEnd && !Common::isInlineWhitespace(*ptr))
+				return false;
 		}
-			break;
-
-		case 'v':               // v, vn, vt
-			//reset flags
-			noMaterial = true;
-			dummyMaterialCreated = false;
-			switch(bufPtr[1])
-			{
-			case ' ':          // vertex
-				{
-					vec3 vec;
-					bufPtr = readVec3(bufPtr, vec.data, bufEnd);
-					performActionBasedOnOrientationSystem([&]() {vec.data[0] = -vec.data[0];}, [&]() {});
-					vertexBuffer.push_back(vec);
-				}
-				break;
-
-			case 'n':       // normal
-				{
-					vec3 vec;
-					bufPtr = readVec3(bufPtr, vec.data, bufEnd);
-					performActionBasedOnOrientationSystem([&]() {vec.data[0] = -vec.data[0]; }, [&]() {});
-					normalsBuffer.push_back(vec);
-				}
-				break;
-
-			case 't':       // texcoord
-				{
-					vec2 vec;
-					bufPtr = readUV(bufPtr, vec.data, bufEnd);
-					textureCoordBuffer.push_back(vec);
-				}
-				break;
-			}
-			break;
-
-		case 'g': // group name
-            bufPtr = goAndCopyNextWord(tmpbuf, bufPtr, WORD_BUFFER_LENGTH, bufEnd);
-            grpName = tmpbuf;
-			break;
-		case 's': // smoothing can be a group or off (equiv. to 0)
-			{
-				bufPtr = goAndCopyNextWord(tmpbuf, bufPtr, WORD_BUFFER_LENGTH, bufEnd);
-				_params.logger.log("Loaded smoothing group start %s",system::ILogger::ELL_DEBUG, tmpbuf);
-				if (strcmp("off", tmpbuf)==0)
-					smoothingGroup=0u;
-				else
-                    sscanf(tmpbuf,"%u",&smoothingGroup);
-			}
-			break;
-
-		case 'u': // usemtl
-			// get name of material
-			{
-				noMaterial = false;
-				bufPtr = goAndCopyNextWord(tmpbuf, bufPtr, WORD_BUFFER_LENGTH, bufEnd);
-				_params.logger.log("Loaded material start %s", system::ILogger::ELL_DEBUG, tmpbuf);
-				mtlName=tmpbuf;
-
-                if (ctx.useMaterials && !ctx.useGroups)
-                {
-                    asset::IAsset::E_TYPE types[] {asset::IAsset::ET_SUB_MESH, (asset::IAsset::E_TYPE)0u };
-                    auto mb_bundle = _override->findCachedAsset(genKeyForMeshBuf(ctx, _file->getFileName().string(), mtlName, grpName), types, ctx.inner, _hierarchyLevel+ICPUMesh::MESHBUFFER_HIERARCHYLEVELS_BELOW);
-                    auto mbs = mb_bundle.getContents();
-					bool notempty = mbs.size()!=0ull;
-                    {
-                        auto mb = notempty ? core::smart_refctd_ptr_static_cast<ICPUMeshBuffer>(*mbs.begin()) : core::make_smart_refctd_ptr<ICPUMeshBuffer>();
-                        submeshes.push_back(std::move(mb));
-                    }
-                    indices.emplace_back();
-                    recalcNormals.push_back(false);
-                    submeshWasLoadedFromCache.push_back(notempty);
-                    //if submesh was loaded from cache - insert empty "cache key" (submesh loaded from cache won't be added to cache again)
-                    submeshCacheKeys.push_back(submeshWasLoadedFromCache.back() ? "" : genKeyForMeshBuf(ctx, _file->getFileName().string(), mtlName, grpName));
-                    submeshMaterialNames.push_back(mtlName);
-                }
-			}
-			break;
-		case 'f':               // face
+		else
 		{
-			if (noMaterial && !dummyMaterialCreated)
+			if (!parseResolved(posCount, idx.x))
+				return false;
+			if (ptr < lineEnd && *ptr == '/')
 			{
-				dummyMaterialCreated = true;
-
-				submeshes.push_back(core::make_smart_refctd_ptr<ICPUMeshBuffer>());
-				indices.emplace_back();
-				recalcNormals.push_back(false);
-				submeshWasLoadedFromCache.push_back(false);
-				submeshCacheKeys.push_back(genKeyForMeshBuf(ctx, _file->getFileName().string(), NO_MATERIAL_MTL_NAME, grpName));
-				submeshMaterialNames.push_back(NO_MATERIAL_MTL_NAME);
-			}
-
-			SObjVertex v;
-
-			// get all vertices data in this face (current line of obj _file)
-			const std::string wordBuffer = copyLine(bufPtr, bufEnd);
-			const char* linePtr = wordBuffer.c_str();
-			const char* const endPtr = linePtr + wordBuffer.size();
-
-			core::vector<uint32_t> faceCorners;
-			faceCorners.reserve(32ull);
-
-			// read in all vertices
-			linePtr = goNextWord(linePtr, endPtr);
-			while (0 != linePtr[0])
-			{
-				// Array to communicate with retrieveVertexIndices()
-				// sends the buffer sizes and gets the actual indices
-				// if index not set returns -1
-				int32_t Idx[3];
-				Idx[1] = Idx[2] = -1;
-
-				// read in next vertex's data
-				uint32_t wlength = copyWord(tmpbuf, linePtr, WORD_BUFFER_LENGTH, endPtr);
-				// this function will also convert obj's 1-based index to c++'s 0-based index
-				retrieveVertexIndices(tmpbuf, Idx, tmpbuf+wlength+1, vertexBuffer.size(), textureCoordBuffer.size(), normalsBuffer.size());
-				v.pos[0] = vertexBuffer[Idx[0]].data[0];
-				v.pos[1] = vertexBuffer[Idx[0]].data[1];
-				v.pos[2] = vertexBuffer[Idx[0]].data[2];
-				//set texcoord
-				if ( -1 != Idx[1] )
-                {
-					v.uv[0] = textureCoordBuffer[Idx[1]].data[0];
-					v.uv[1] = textureCoordBuffer[Idx[1]].data[1];
-                }
-				else
-                {
-					v.uv[0] = core::nan<float>();
-					v.uv[1] = core::nan<float>();
-                }
-                //set normal
-				if ( -1 != Idx[2] )
-                {
-					core::vectorSIMDf simdNormal;
-					simdNormal.set(normalsBuffer[Idx[2]].data);
-                    simdNormal.makeSafe3D();
-					v.normal32bit = quantNormalCache->quantize<EF_A2B10G10R10_SNORM_PACK32>(simdNormal);
-                }
-				else
+				++ptr;
+				if (ptr < lineEnd && *ptr != '/')
 				{
-					v.normal32bit = core::vectorSIMDu32(0u);
-                    recalcNormals.back() = true;
+					if (!parseResolved(uvCount, idx.y))
+						return false;
 				}
-
-				uint32_t ix;
-				auto vtx_ix = map_vtx2ix.find(v);
-				if (vtx_ix != map_vtx2ix.end() && smoothingGroup==vtxSmoothGrp[vtx_ix->second])
-					ix = vtx_ix->second;
-				else
+				if (ptr < lineEnd && *ptr == '/')
 				{
-					ix = vertices.size();
-					vertices.push_back(v);
-                    vtxSmoothGrp.push_back(smoothingGroup);
-					map_vtx2ix.insert({v, ix});
-				}
-
-				faceCorners.push_back(ix);
-
-				// go to next vertex
-				linePtr = goNextWord(linePtr, endPtr);
-			}
-
-            // triangulate the face
-            for (uint32_t i = 1u; i < faceCorners.size()-1u; ++i)
-            {
-                // Add a triangle
-                performActionBasedOnOrientationSystem
-                (
-                [&]()
-                {
-                    indices.back().push_back(faceCorners[0]);
-                    indices.back().push_back(faceCorners[i]);
-                    indices.back().push_back(faceCorners[i + 1]);
-                },
-                [&]()
-                {
-                    indices.back().push_back(faceCorners[i + 1]);
-                    indices.back().push_back(faceCorners[i]);
-                    indices.back().push_back(faceCorners[0]);
-                }
-                );
-            }
-		}
-		break;
-
-		case '#': // comment
-		default:
-			break;
-		}	// end switch(bufPtr[0])
-		// eat up rest of line
-		bufPtr = goNextLine(bufPtr, bufEnd);
-	}	// end while(bufPtr && (bufPtr-buf<filesize))
-
-	// prune out invalid empty shape groups (TODO: convert to AoS and use an erase_if)
-	for (size_t i = 0ull; i < submeshes.size(); ++i)
-	if (indices[i].size())
-		i++;
-	else
-	{
-		submeshes.erase(submeshes.begin()+i);
-		indices.erase(indices.begin()+i);
-		recalcNormals.erase(recalcNormals.begin()+i);
-		submeshWasLoadedFromCache.erase(submeshWasLoadedFromCache.begin()+i);
-		submeshCacheKeys.erase(submeshCacheKeys.begin()+i);
-		submeshMaterialNames.erase(submeshMaterialNames.begin()+i);
-	}
-	
-    core::unordered_set<pipeline_meta_pair_t,hash_t,key_equal_t> usedPipelines;
-    {
-        uint64_t ixBufOffset = 0ull;
-        for (size_t i = 0ull; i < submeshes.size(); ++i)
-        {
-            if (submeshWasLoadedFromCache[i])
-                continue;                
-
-            submeshes[i]->setIndexCount(indices[i].size());
-            submeshes[i]->setIndexType(EIT_32BIT);
-			submeshes[i]->setIndexBufferBinding({ixBufOffset,nullptr});
-            ixBufOffset += indices[i].size()*4ull;
-
-            const uint32_t hasUV = !core::isnan(vertices[indices[i][0]].uv[0]);
-			using namespace std::string_literals;
-			_params.logger.log("Has UV: "s + (hasUV ? "YES":"NO"), system::ILogger::ELL_DEBUG);
-			// search in loaded
-			pipeline_meta_pair_t pipeline;
-			{
-				CMTLMetadata::CRenderpassIndependentPipeline dummyKey;
-				dummyKey.m_name = submeshCacheKeys[i].substr(submeshCacheKeys[i].find_last_of('?')+1u);
-				pipeline_meta_pair_t dummy{nullptr,&dummyKey};
-
-				auto rng = pipelines.equal_range(dummy);
-				for (auto it=rng.first; it!=rng.second; it++)
-				if (it->second->m_hash==hasUV)
-				{
-					pipeline = *it;
-					break;
-				}
-			}
-			//if there's no pipeline for this meshbuffer, set dummy one
-			if (!pipeline.first)
-			{
-				const IAsset::E_TYPE searchTypes[] = {IAsset::ET_RENDERPASS_INDEPENDENT_PIPELINE,static_cast<IAsset::E_TYPE>(0u)};
-				auto bundle = _override->findCachedAsset("nbl/builtin/renderpass_independent_pipeline/loader/mtl/missing_material_pipeline",searchTypes,ctx.inner,_hierarchyLevel+ICPUMesh::PIPELINE_HIERARCHYLEVELS_BELOW);
-				const auto* meta = bundle.getMetadata()->selfCast<CMTLMetadata>();
-				const auto contents = bundle.getContents();
-				for (auto pplnIt=contents.begin(); pplnIt!=contents.end(); pplnIt++)
-				{
-					auto ppln = core::smart_refctd_ptr_static_cast<ICPURenderpassIndependentPipeline>(*pplnIt);
-					auto pplnMeta = meta->getAssetSpecificMetadata(ppln.get());
-					if (pplnMeta && pplnMeta->m_hash==hasUV)
+					++ptr;
+					if (ptr < lineEnd && !Common::isInlineWhitespace(*ptr))
 					{
-						pipeline = { std::move(ppln),pplnMeta };
-						break;
+						if (!parseResolved(normalCount, idx.z))
+							return false;
 					}
 				}
+				else if (ptr < lineEnd && !Common::isInlineWhitespace(*ptr))
+					return false;
 			}
-			// do some checks
-			assert(pipeline.first && pipeline.second);
-			const auto* cPpln = pipeline.first.get();
-            if (hasUV)
-            {
-                const auto& vtxParams = cPpln->getCachedCreationParams().vertexInput;
-                assert(vtxParams.attributes[POSITION].relativeOffset==offsetof(SObjVertex,pos));
-                assert(vtxParams.attributes[NORMAL].relativeOffset==offsetof(SObjVertex,normal32bit));
-                assert(vtxParams.attributes[UV].relativeOffset==offsetof(SObjVertex,uv));
-                assert(vtxParams.enabledAttribFlags&(1u<<UV));
-                assert(vtxParams.enabledBindingFlags==(1u<<BND_NUM));
-            }
-
-			const uint32_t pcoffset = cPpln->getLayout()->getPushConstantRanges().begin()[0].offset;
-			submeshes[i]->setAttachedDescriptorSet(core::smart_refctd_ptr<ICPUDescriptorSet>(pipeline.second->m_descriptorSet3));
-			memcpy(
-				submeshes[i]->getPushConstantsDataPtr()+pcoffset,
-				&pipeline.second->m_materialParams,
-				sizeof(CMTLMetadata::CRenderpassIndependentPipeline::SMaterialParameters)
-			);
-
-			usedPipelines.insert(pipeline);
-			submeshes[i]->setPipeline(std::move(pipeline.first));
-        }
-
-        core::smart_refctd_ptr<ICPUBuffer> vtxBuf = ICPUBuffer::create({ vertices.size() * sizeof(SObjVertex) });
-        memcpy(vtxBuf->getPointer(), vertices.data(), vtxBuf->getSize());
-
-        auto ixBuf = ICPUBuffer::create({ ixBufOffset });
-        for (size_t i = 0ull; i < submeshes.size(); ++i)
-        {
-            if (submeshWasLoadedFromCache[i])
-                continue;
-
-            submeshes[i]->setPositionAttributeIx(POSITION);
-			submeshes[i]->setNormalAttributeIx(NORMAL);
-			
-			submeshes[i]->setIndexBufferBinding({submeshes[i]->getIndexBufferBinding().offset,ixBuf});
-            const uint64_t offset = submeshes[i]->getIndexBufferBinding().offset;
-            memcpy(reinterpret_cast<uint8_t*>(ixBuf->getPointer())+offset, indices[i].data(), indices[i].size()*4ull);
-
-            SBufferBinding<ICPUBuffer> vtxBufBnd;
-            vtxBufBnd.offset = 0ull;
-            vtxBufBnd.buffer = vtxBuf;
-            submeshes[i]->setVertexBufferBinding(std::move(vtxBufBnd), BND_NUM);
-
-			if (recalcNormals[i])
-			{
-				auto vtxcmp = [&vtxSmoothGrp](const IMeshManipulator::SSNGVertexData& v0, const IMeshManipulator::SSNGVertexData& v1, ICPUMeshBuffer* buffer)
-				{
-					return vtxSmoothGrp[v0.indexOffset]==vtxSmoothGrp[v1.indexOffset];
-				};
-
-				auto* meshManipulator = AssetManager->getMeshManipulator();
-				meshManipulator->calculateSmoothNormals(submeshes[i].get(), false, 1.52e-5f, NORMAL, vtxcmp);
-			}
-        }
-    }
-
-    auto mesh = core::make_smart_refctd_ptr<ICPUMesh>();
-    for (auto& submesh : submeshes)
-    {
-		IMeshManipulator::recalculateBoundingBox(submesh.get());
-        mesh->getMeshBufferVector().emplace_back(std::move(submesh));
-    }
-
-	IMeshManipulator::recalculateBoundingBox(mesh.get());
-	if (mesh->getMeshBuffers().empty())
-        return {};
-    
-	//
-	auto meta = core::make_smart_refctd_ptr<COBJMetadata>(usedPipelines.size());
-	uint32_t metaOffset = 0u;
-	for (auto pipeAndMeta : usedPipelines)
-		meta->placeMeta(metaOffset++,pipeAndMeta.first.get(),*pipeAndMeta.second);
-
-    //at the very end, insert submeshes into cache
-	uint32_t i = 0u;
-	for (auto meshbuffer : mesh->getMeshBuffers())
-	{
-		auto bundle = SAssetBundle(meta,{ core::smart_refctd_ptr<ICPUMeshBuffer>(meshbuffer) });
-        _override->insertAssetIntoCache(bundle, submeshCacheKeys[i++], ctx.inner, _hierarchyLevel+ICPUMesh::MESHBUFFER_HIERARCHYLEVELS_BELOW);
-	}
-
-	return SAssetBundle(std::move(meta),{std::move(mesh)});
-}
-
-
-//! Read 3d vector of floats
-const char* COBJMeshFileLoader::readVec3(const char* bufPtr, float vec[3], const char* const bufEnd)
-{
-	const uint32_t WORD_BUFFER_LENGTH = 256;
-	char wordBuffer[WORD_BUFFER_LENGTH];
-
-	bufPtr = goAndCopyNextWord(wordBuffer, bufPtr, WORD_BUFFER_LENGTH, bufEnd);
-	sscanf(wordBuffer,"%f",vec);
-	bufPtr = goAndCopyNextWord(wordBuffer, bufPtr, WORD_BUFFER_LENGTH, bufEnd);
-	sscanf(wordBuffer,"%f",vec+1);
-	bufPtr = goAndCopyNextWord(wordBuffer, bufPtr, WORD_BUFFER_LENGTH, bufEnd);
-	sscanf(wordBuffer,"%f",vec+2);
-
-    vec[0] = -vec[0]; // change handedness
-	return bufPtr;
-}
-
-
-//! Read 2d vector of floats
-const char* COBJMeshFileLoader::readUV(const char* bufPtr, float vec[2], const char* const bufEnd)
-{
-	const uint32_t WORD_BUFFER_LENGTH = 256;
-	char wordBuffer[WORD_BUFFER_LENGTH];
-
-	bufPtr = goAndCopyNextWord(wordBuffer, bufPtr, WORD_BUFFER_LENGTH, bufEnd);
-	sscanf(wordBuffer,"%f",vec);
-	bufPtr = goAndCopyNextWord(wordBuffer, bufPtr, WORD_BUFFER_LENGTH, bufEnd);
-	sscanf(wordBuffer,"%f",vec+1);
-
-	vec[1] = 1.f-vec[1]; // change handedness
-	return bufPtr;
-}
-
-
-//! Read boolean value represented as 'on' or 'off'
-const char* COBJMeshFileLoader::readBool(const char* bufPtr, bool& tf, const char* const bufEnd)
-{
-	const uint32_t BUFFER_LENGTH = 8;
-	char tfStr[BUFFER_LENGTH];
-
-	bufPtr = goAndCopyNextWord(tfStr, bufPtr, BUFFER_LENGTH, bufEnd);
-	tf = strcmp(tfStr, "off") != 0;
-	return bufPtr;
-}
-
-//! skip space characters and stop on first non-space
-const char* COBJMeshFileLoader::goFirstWord(const char* buf, const char* const bufEnd, bool acrossNewlines)
-{
-	// skip space characters
-	if (acrossNewlines)
-		while((buf != bufEnd) && core::isspace(*buf))
-			++buf;
-	else
-		while((buf != bufEnd) && core::isspace(*buf) && (*buf != '\n'))
-			++buf;
-
-	return buf;
-}
-
-
-//! skip current word and stop at beginning of next one
-const char* COBJMeshFileLoader::goNextWord(const char* buf, const char* const bufEnd, bool acrossNewlines)
-{
-	// skip current word
-	while(( buf != bufEnd ) && !core::isspace(*buf))
-		++buf;
-
-	return goFirstWord(buf, bufEnd, acrossNewlines);
-}
-
-
-//! Read until line break is reached and stop at the next non-space character
-const char* COBJMeshFileLoader::goNextLine(const char* buf, const char* const bufEnd)
-{
-	// look for newline characters
-	while(buf != bufEnd)
-	{
-		// found it, so leave
-		if (*buf=='\n' || *buf=='\r')
-			break;
-		++buf;
+			else if (ptr < lineEnd && !Common::isInlineWhitespace(*ptr))
+				return false;
+		}
+		if (ptr < lineEnd && !Common::isInlineWhitespace(*ptr))
+			return false;
+		linePtr = ptr;
+		return true;
 	}
-	return goFirstWord(buf, bufEnd);
+};
 }
-
-
-uint32_t COBJMeshFileLoader::copyWord(char* outBuf, const char* const inBuf, uint32_t outBufLength, const char* const bufEnd)
+COBJMeshFileLoader::COBJMeshFileLoader(IAssetManager*)
 {
-	if (!outBufLength)
-		return 0;
-	if (!inBuf)
-	{
-		*outBuf = 0;
-		return 0;
-	}
-
-	uint32_t i = 0;
-	while(inBuf[i])
-	{
-		if (core::isspace(inBuf[i]) || &(inBuf[i]) == bufEnd)
-			break;
-		++i;
-	}
-
-	uint32_t length = core::min(i, outBufLength-1);
-	for (uint32_t j=0; j<length; ++j)
-		outBuf[j] = inBuf[j];
-
-	outBuf[length] = 0;
-	return length;
 }
-
-
-std::string COBJMeshFileLoader::copyLine(const char* inBuf, const char* bufEnd)
+COBJMeshFileLoader::~COBJMeshFileLoader() = default;
+bool COBJMeshFileLoader::isALoadableFileFormat(system::IFile* _file, const system::logger_opt_ptr) const
 {
-	if (!inBuf)
-		return std::string();
-
-	const char* ptr = inBuf;
-	while (ptr<bufEnd)
+	if (!_file)
+		return false;
+	const auto fileSize = _file->getSize();
+	if (fileSize <= 0)
+		return false;
+	constexpr size_t ProbeBytes = 4096ull;
+	const size_t bytesToRead = std::min<size_t>(ProbeBytes, static_cast<size_t>(fileSize));
+	std::array<char, ProbeBytes> probe = {};
+	system::IFile::success_t succ;
+	_file->read(succ, probe.data(), 0ull, bytesToRead);
+	if (!succ || bytesToRead == 0ull)
+		return false;
+	const char* ptr = probe.data();
+	const char* const end = probe.data() + bytesToRead;
+	if ((end - ptr) >= 3 && static_cast<uint8_t>(ptr[0]) == 0xEFu && static_cast<uint8_t>(ptr[1]) == 0xBBu && static_cast<uint8_t>(ptr[2]) == 0xBFu)
+		ptr += 3;
+	while (ptr < end)
 	{
-		if (*ptr=='\n' || *ptr=='\r')
+		while (ptr < end && (*ptr == ' ' || *ptr == '\t' || *ptr == '\r' || *ptr == '\n'))
+			++ptr;
+		if (ptr >= end)
 			break;
-		++ptr;
+		if (*ptr == '#')
+		{
+			while (ptr < end && *ptr != '\n')
+				++ptr;
+			continue;
+		}
+		switch (std::tolower(*ptr))
+		{
+			case 'v':
+			case 'f':
+			case 'o':
+			case 'g':
+			case 's':
+			case 'u':
+			case 'm':
+			case 'l':
+			case 'p':
+				return true;
+			default:
+				return false;
+		}
 	}
-	// we must avoid the +1 in case the array is used up
-	return std::string(inBuf, (uint32_t)(ptr-inBuf+((ptr < bufEnd) ? 1 : 0)));
+	return false;
 }
-
-
-const char* COBJMeshFileLoader::goAndCopyNextWord(char* outBuf, const char* inBuf, uint32_t outBufLength, const char* bufEnd)
+const char** COBJMeshFileLoader::getAssociatedFileExtensions() const
 {
-	inBuf = goNextWord(inBuf, bufEnd, false);
-	copyWord(outBuf, inBuf, outBufLength, bufEnd);
-	return inBuf;
+	static const char* ext[] = { "obj", nullptr };
+	return ext;
 }
-
-
-bool COBJMeshFileLoader::retrieveVertexIndices(char* vertexData, int32_t* idx, const char* bufEnd, uint32_t vbsize, uint32_t vtsize, uint32_t vnsize)
-{
-	char word[16] = "";
-	const char* p = goFirstWord(vertexData, bufEnd);
-	uint32_t idxType = 0;	// 0 = posIdx, 1 = texcoordIdx, 2 = normalIdx
-
-	uint32_t i = 0;
-	while ( p != bufEnd )
-	{
-		if ( ( core::isdigit(*p)) || (*p == '-') )
-		{
-			// build up the number
-			word[i++] = *p;
-		}
-		else if ( *p == '/' || *p == ' ' || *p == '\0' )
-		{
-			// number is completed. Convert and store it
-			word[i] = '\0';
-			// if no number was found index will become 0 and later on -1 by decrement
-			sscanf(word,"%d",idx+idxType);
-			if (idx[idxType]<0)
-			{
-				switch (idxType)
-				{
-					case 0:
-						idx[idxType] += vbsize;
-						break;
-					case 1:
-						idx[idxType] += vtsize;
-						break;
-					case 2:
-						idx[idxType] += vnsize;
-						break;
-				}
-			}
-			else
-				idx[idxType]-=1;
-
-			// reset the word
-			word[0] = '\0';
-			i = 0;
-
-			// go to the next kind of index type
-			if (*p == '/')
-			{
-				if ( ++idxType > 2 )
-				{
-					// error checking, shouldn't reach here unless file is wrong
-					idxType = 0;
-				}
-			}
-			else
-			{
-				// set all missing values to disable (=-1)
-				while (++idxType < 3)
-					idx[idxType]=-1;
-				++p;
-				break; // while
+asset::SAssetBundle COBJMeshFileLoader::loadAsset(
+    system::IFile* _file, const asset::IAssetLoader::SAssetLoadParams& _params,
+    asset::IAssetLoader::IAssetLoaderOverride* _override [[maybe_unused]],
+    uint32_t _hierarchyLevel [[maybe_unused]]) {
+    if (!_file)
+        return {};
+    uint64_t faceCount = 0u;
+    uint64_t faceFastTokenCount = 0u;
+    uint64_t faceFallbackTokenCount = 0u;
+    SFileReadTelemetry ioTelemetry = {};
+    const long filesize = _file->getSize();
+    if (filesize <= 0)
+        return {};
+    impl::SLoadSession loadSession = {};
+    if (!impl::SLoadSession::begin(_params.logger, "OBJ loader", _file, _params.ioPolicy, static_cast<uint64_t>(filesize), true, loadSession))
+        return {};
+    core::vector<uint8_t> fileContents;
+    const auto* fileData = loadSession.mapOrReadWholeFile(fileContents, &ioTelemetry);
+    if (!fileData)
+        return {};
+    const char* const buf = reinterpret_cast<const char*>(fileData);
+    const char* const bufEnd = buf + static_cast<size_t>(filesize);
+    const char* bufPtr = buf;
+    core::vector<hlsl::float32_t3> positions;
+    core::vector<hlsl::float32_t3> normals;
+    core::vector<hlsl::float32_t2> uvs;
+    const size_t estimatedAttributeCount =
+        std::max<size_t>(16ull, static_cast<size_t>(filesize) / 32ull);
+    positions.reserve(estimatedAttributeCount);
+    normals.reserve(estimatedAttributeCount);
+    uvs.reserve(estimatedAttributeCount);
+    core::vector<hlsl::float32_t3> outPositions;
+    core::vector<hlsl::float32_t3> outNormals;
+    core::vector<uint8_t> outNormalNeedsGeneration;
+    core::vector<hlsl::float32_t2> outUVs;
+    std::optional<CPolygonGeometryManipulator::CSmoothNormalAccumulator> smoothNormalAccumulator;
+    core::vector<uint32_t> indices;
+    core::vector<int32_t> dedupHeadByPos;
+    core::vector<Parse::VertexDedupNode> dedupNodes;
+    const size_t estimatedOutVertexCount = std::max<size_t>(
+        estimatedAttributeCount, static_cast<size_t>(filesize) / 20ull);
+    const size_t estimatedOutIndexCount =
+        (estimatedOutVertexCount <= (std::numeric_limits<size_t>::max() / 3ull))
+            ? (estimatedOutVertexCount * 3ull)
+            : std::numeric_limits<size_t>::max();
+    const size_t initialOutVertexCapacity =
+        std::max<size_t>(1ull, estimatedOutVertexCount);
+    const size_t initialOutIndexCapacity =
+        (estimatedOutIndexCount == std::numeric_limits<size_t>::max())
+            ? 3ull
+            : std::max<size_t>(3ull, estimatedOutIndexCount);
+    size_t outVertexWriteCount = 0ull;
+    size_t outIndexWriteCount = 0ull;
+    size_t dedupNodeCount = 0ull;
+    struct SDedupHotEntry {
+        int32_t pos = -1;
+        int32_t uv = -1;
+        int32_t normal = -1;
+        uint32_t outIndex = 0u;
+    };
+    const size_t hw = SLoaderRuntimeTuner::resolveHardwareThreads();
+    const size_t hardMaxWorkers = SLoaderRuntimeTuner::resolveHardMaxWorkers(
+        hw, _params.ioPolicy.runtimeTuning.workerHeadroom);
+    SLoaderRuntimeTuningRequest dedupTuningRequest = {};
+    dedupTuningRequest.inputBytes = static_cast<uint64_t>(filesize);
+    dedupTuningRequest.totalWorkUnits = estimatedOutVertexCount;
+    dedupTuningRequest.hardwareThreads = static_cast<uint32_t>(hw);
+    dedupTuningRequest.hardMaxWorkers = static_cast<uint32_t>(hardMaxWorkers);
+    dedupTuningRequest.targetChunksPerWorker =
+        _params.ioPolicy.runtimeTuning.targetChunksPerWorker;
+    dedupTuningRequest.sampleData = reinterpret_cast<const uint8_t*>(buf);
+    dedupTuningRequest.sampleBytes = SLoaderRuntimeTuner::resolveSampleBytes(
+        _params.ioPolicy, static_cast<uint64_t>(filesize));
+    const auto dedupTuning =
+        SLoaderRuntimeTuner::tune(_params.ioPolicy, dedupTuningRequest);
+    const size_t dedupHotSeed = std::max<size_t>(
+        16ull, estimatedOutVertexCount /
+                   std::max<size_t>(1ull, dedupTuning.workerCount * 8ull));
+    const size_t dedupHotEntryCount = std::bit_ceil(dedupHotSeed);
+    core::vector<SDedupHotEntry> dedupHotCache(dedupHotEntryCount);
+    const size_t dedupHotMask = dedupHotEntryCount - 1ull;
+    struct SLoadedGeometry {
+        core::smart_refctd_ptr<ICPUPolygonGeometry> geometry = {};
+        std::string objectName = {};
+        std::string groupName = {};
+        uint64_t faceCount = 0ull;
+        uint64_t faceFastTokenCount = 0ull;
+        uint64_t faceFallbackTokenCount = 0ull;
+    };
+    core::vector<SLoadedGeometry> loadedGeometries;
+    std::string currentObjectName = "default_object";
+    std::string currentGroupName = "default_group";
+    bool sawObjectDirective = false;
+    bool sawGroupDirective = false;
+    bool hasProvidedNormals = false;
+    bool needsNormalGeneration = false;
+    bool hasUVs = false;
+    hlsl::shapes::util::AABBAccumulator3<float> parsedAABB =
+        hlsl::shapes::util::createAABBAccumulator<float>();
+    uint64_t currentFaceCount = 0ull;
+    uint64_t currentFaceFastTokenCount = 0ull;
+    uint64_t currentFaceFallbackTokenCount = 0ull;
+    const auto resetBuilderState = [&]() -> void { // puts all per-geometry builder state back into its initial, empty configuration
+        outPositions.clear();
+        outNormals.clear();
+        outNormalNeedsGeneration.clear();
+        outUVs.clear();
+        smoothNormalAccumulator.reset();
+        indices.clear();
+        dedupNodes.clear();
+        outPositions.resize(initialOutVertexCapacity); // buffers are pre-sized; the write counters below track the used prefix
+        outNormals.resize(initialOutVertexCapacity);
+        outNormalNeedsGeneration.resize(initialOutVertexCapacity, 0u);
+        outUVs.resize(initialOutVertexCapacity);
+        indices.resize(initialOutIndexCapacity);
+        dedupHeadByPos.assign(positions.size(), -1); // -1 is an empty dedup chain; one head per position parsed so far
+        dedupNodes.resize(initialOutVertexCapacity);
+        outVertexWriteCount = 0ull;
+        outIndexWriteCount = 0ull;
+        dedupNodeCount = 0ull;
+        hasProvidedNormals = false;
+        needsNormalGeneration = false;
+        hasUVs = false;
+        parsedAABB = hlsl::shapes::util::createAABBAccumulator<float>();
+        currentFaceCount = 0ull;
+        currentFaceFastTokenCount = 0ull;
+        currentFaceFallbackTokenCount = 0ull;
+        const SDedupHotEntry emptyHotEntry = {};
+        std::fill(dedupHotCache.begin(), dedupHotCache.end(), emptyHotEntry); // cached outIndex values refer to the previous geometry, so drop them
+    };
+    const auto finalizeCurrentGeometry = [&]() -> bool { // turns the builder buffers into one ICPUPolygonGeometry appended to loadedGeometries; false on failure
+        if (outVertexWriteCount == 0ull)
+            return true; // no vertex was emitted since the last reset, so there is nothing to finalize
+        outPositions.resize(outVertexWriteCount); // trim the over-allocated buffers down to the used prefix
+        outNormals.resize(outVertexWriteCount);
+        outNormalNeedsGeneration.resize(outVertexWriteCount);
+        outUVs.resize(outVertexWriteCount);
+        indices.resize(outIndexWriteCount);
+        if (needsNormalGeneration) {
+            // OBJ smoothing groups are already encoded in the parser-side vertex
+            // split: corners that must stay sharp become different output vertices
+            // even if they share position. We therefore feed the parser-final
+            // indexed triangles into a smoothing accumulator and finalize only
+            // the normals that were missing in the source.
+            if (!smoothNormalAccumulator)
+                return false;
+            smoothNormalAccumulator->reserveVertices(outVertexWriteCount);
+            if (!smoothNormalAccumulator->finalize(
+                    std::span<hlsl::float32_t3>(outNormals.data(), outNormals.size()),
+                    std::span<const uint8_t>(outNormalNeedsGeneration.data(), outNormalNeedsGeneration.size())))
+                return false;
+        }
+        const size_t outVertexCount = outPositions.size(); // read before outPositions is moved from below
+        auto geometry = core::make_smart_refctd_ptr<ICPUPolygonGeometry>();
+        {
+            auto view = SGeometryLoaderCommon::createAdoptedView<EF_R32G32B32_SFLOAT>(
+                std::move(outPositions));
+            if (!view)
+                return false;
+            geometry->setPositionView(std::move(view));
+        }
+        const bool hasNormals = hasProvidedNormals || needsNormalGeneration;
+        if (hasNormals) {
+            auto view = SGeometryLoaderCommon::createAdoptedView<EF_R32G32B32_SFLOAT>(
+                std::move(outNormals));
+            if (!view)
+                return false;
+            geometry->setNormalView(std::move(view));
+        }
+        if (hasUVs) { // UVs are stored as an auxiliary attribute at slot SOBJPolygonGeometryAuxLayout::UV0
+            auto view = SGeometryLoaderCommon::createAdoptedView<EF_R32G32_SFLOAT>(
+                std::move(outUVs));
+            if (!view)
+                return false;
+            auto* const auxViews = geometry->getAuxAttributeViews();
+            auxViews->resize(SOBJPolygonGeometryAuxLayout::UV0 + 1u);
+            (*auxViews)[SOBJPolygonGeometryAuxLayout::UV0] = std::move(view);
+        }
+        if (!indices.empty()) {
+            geometry->setIndexing(IPolygonGeometryBase::TriangleList());
+            if (outVertexCount <= // with at most 65536 vertices every index fits in 16 bits
+                static_cast<size_t>(std::numeric_limits<uint16_t>::max()) + 1ull) {
+                core::vector<uint16_t> indices16(indices.size());
+                for (size_t i = 0u; i < indices.size(); ++i)
+                    indices16[i] = static_cast<uint16_t>(indices[i]);
+                auto view = SGeometryLoaderCommon::createAdoptedView<EF_R16_UINT>(
+                    std::move(indices16));
+                if (!view)
+                    return false;
+                geometry->setIndexView(std::move(view));
+            } else {
+                auto view = SGeometryLoaderCommon::createAdoptedView<EF_R32_UINT>(
+                    std::move(indices));
+                if (!view)
+                    return false;
+                geometry->setIndexView(std::move(view));
+            }
+        } else {
+            geometry->setIndexing(IPolygonGeometryBase::PointList()); // vertices without any face index are exposed as points
+        }
+        if (!_params.loaderFlags.hasAnyFlag(
+                IAssetLoader::ELPF_DONT_COMPUTE_CONTENT_HASHES))
+            SPolygonGeometryContentHash::computeMissing(geometry.get(),
+                                                        _params.ioPolicy);
+        if (!parsedAABB.empty()) // prefer the box accumulated while vertices were materialized
+            geometry->applyAABB(parsedAABB.value);
+        else
+            CPolygonGeometryManipulator::recomputeAABB(geometry.get());
+        loadedGeometries.push_back(SLoadedGeometry{
+            .geometry = std::move(geometry),
+            .objectName = currentObjectName,
+            .groupName = currentGroupName,
+            .faceCount = currentFaceCount,
+            .faceFastTokenCount = currentFaceFastTokenCount,
+            .faceFallbackTokenCount = currentFaceFallbackTokenCount});
+        return true;
+    };
+    resetBuilderState();
+    auto allocateOutVertex = [&](uint32_t& outIx) -> bool {
+        // Reserves the next output vertex slot, growing all per-vertex buffers (at least doubling) when full.
+        const size_t usedCount = outVertexWriteCount;
+        if (const size_t oldSize = outPositions.size(); usedCount >= oldSize) {
+            const size_t grownSize = std::max<size_t>(usedCount + 1ull, oldSize * 2ull);
+            outPositions.resize(grownSize);
+            outNormals.resize(grownSize);
+            outNormalNeedsGeneration.resize(grownSize, 0u);
+            outUVs.resize(grownSize);
+            if (smoothNormalAccumulator) {
+                smoothNormalAccumulator->reserveVertices(grownSize);
+                smoothNormalAccumulator->prepareIdentityGroups(grownSize);
+            }
+        }
+        constexpr size_t maxVertexIx = static_cast<size_t>(std::numeric_limits<uint32_t>::max());
+        if (usedCount > maxVertexIx) return false; // the slot index would not fit the 32-bit output index
+        outIx = static_cast<uint32_t>(outVertexWriteCount++);
+        return true;
+    };
+    auto appendIndex = [&](const uint32_t value) -> bool {
+        // Stores `value` at the index write cursor, growing the buffer when full; always returns true.
+        const size_t writeAt = outIndexWriteCount;
+        if (writeAt >= indices.size())
+            indices.resize(std::max<size_t>(writeAt + 1ull, indices.size() * 2ull));
+        indices[writeAt] = value;
+        ++outIndexWriteCount;
+        return true;
+    };
+    auto allocateDedupNode = [&]() -> int32_t {
+        // Hands out the next free dedup node index, or -1 when that index
+        // would no longer be representable as int32_t.
+        const size_t nodeSlot = dedupNodeCount;
+        if (const size_t poolSize = dedupNodes.size(); nodeSlot >= poolSize)
+            dedupNodes.resize(std::max<size_t>(nodeSlot + 1ull, poolSize * 2ull));
+        constexpr size_t maxNodeIx =
+            static_cast<size_t>(std::numeric_limits<int32_t>::max());
+        if (nodeSlot > maxNodeIx)
+            return -1;
+        return static_cast<int32_t>(dedupNodeCount++);
+    };
+    auto findCornerIndex =
+        [&](const int32_t posIx, const int32_t uvIx, const int32_t normalIx,
+            const uint32_t dedupSmoothingGroup, uint32_t& outIx) -> bool {
+        // Walks the dedup chain of `posIx` for a corner with the same uv / normal / smoothing group.
+        // `outIx` is written only on a hit; an out-of-range `posIx` is reported as a miss.
+        if (posIx < 0 || static_cast<size_t>(posIx) >= positions.size())
+            return false;
+        const size_t posSlot = static_cast<size_t>(posIx);
+        if (posSlot >= dedupHeadByPos.size())
+            dedupHeadByPos.resize(positions.size(), -1);
+        for (int32_t nodeIx = dedupHeadByPos[posSlot]; nodeIx >= 0; nodeIx = dedupNodes[static_cast<size_t>(nodeIx)].next) {
+            const auto& node = dedupNodes[static_cast<size_t>(nodeIx)];
+            if (node.uv != uvIx || node.normal != normalIx || node.smoothingGroup != dedupSmoothingGroup)
+                continue;
+            outIx = node.outIndex;
+            return true;
+        }
+        return false;
+    };
+    auto materializeCornerIndex =
+        [&](const int32_t posIx, const int32_t uvIx, const int32_t normalIx,
+            const uint32_t dedupSmoothingGroup, uint32_t& outIx) -> bool {
+        // Creates a new output vertex for the corner and links it at the head of
+        // the dedup chain of `posIx`. Returns false when `posIx` is out of range
+        // or when no further vertex / dedup node index can be handed out.
+        if (posIx < 0 || static_cast<size_t>(posIx) >= positions.size() || static_cast<size_t>(posIx) >= dedupHeadByPos.size())
+            return false; // findCornerIndex reports a bad posIx as a plain miss, so re-check before indexing
+        const int32_t newNodeIx = allocateOutVertex(outIx) ? allocateDedupNode() : -1;
+        if (newNodeIx < 0)
+            return false;
+        auto& node = dedupNodes[static_cast<size_t>(newNodeIx)];
+        node.uv = uvIx;
+        node.normal = normalIx;
+        node.smoothingGroup = dedupSmoothingGroup;
+        node.outIndex = outIx;
+        node.next = dedupHeadByPos[static_cast<size_t>(posIx)];
+        dedupHeadByPos[static_cast<size_t>(posIx)] = newNodeIx;
+        const auto& srcPos = positions[static_cast<size_t>(posIx)];
+        outPositions[static_cast<size_t>(outIx)] = srcPos;
+        hlsl::shapes::util::extendAABBAccumulator(parsedAABB, srcPos);
+        hlsl::float32_t2 uv(0.f, 0.f);
+        if (uvIx >= 0 && static_cast<size_t>(uvIx) < uvs.size()) {
+            uv = uvs[static_cast<size_t>(uvIx)];
+            hasUVs = true;
+        }
+        outUVs[static_cast<size_t>(outIx)] = uv;
+        const bool hasSourceNormal = normalIx >= 0 && static_cast<size_t>(normalIx) < normals.size();
+        if (hasSourceNormal)
+            hasProvidedNormals = true;
+        else
+            needsNormalGeneration = true; // a zero normal is stored for now and the vertex is flagged for generation
+        outNormalNeedsGeneration[static_cast<size_t>(outIx)] = hasSourceNormal ? 0u : 1u;
+        outNormals[static_cast<size_t>(outIx)] = hasSourceNormal ? normals[static_cast<size_t>(normalIx)] : hlsl::float32_t3(0.f, 0.f, 0.f);
+        return true;
+    };
+    auto acquireCornerIndex = [&](const hlsl::int32_t3& idx,
+                                  const uint32_t smoothingGroup,
+                                  uint32_t& outIx) -> bool {
+        // Generic corner lookup: validates the position index, then reuses an
+        // existing output vertex or creates one. The smoothing group only takes
+        // part in deduplication when the corner has no normal index (idx.z < 0).
+        if (idx.x < 0 || static_cast<size_t>(idx.x) >= positions.size())
+            return false;
+        const uint32_t groupKey = (idx.z < 0) ? smoothingGroup : 0u;
+        return findCornerIndex(idx.x, idx.y, idx.z, groupKey, outIx) ||
+               materializeCornerIndex(idx.x, idx.y, idx.z, groupKey, outIx);
+    };
+    auto acquireCornerIndexPositiveTriplet = [&](const hlsl::int32_t3& idx,
+                                                 uint32_t& outIx) -> bool {
+        // Fast-path corner lookup on the pos/uv/normal triplet with smoothing group 0: probe the
+        // hot-cache slot chosen by the hash, else use the dedup chains and refresh that slot.
+        const uint32_t hashPos = static_cast<uint32_t>(idx.x) * 73856093u;
+        const uint32_t hashUv = static_cast<uint32_t>(idx.y) * 19349663u;
+        const uint32_t hashNormal = static_cast<uint32_t>(idx.z) * 83492791u;
+        auto& cached = dedupHotCache[static_cast<size_t>(hashPos ^ hashUv ^ hashNormal) & dedupHotMask];
+        if (cached.pos == idx.x && cached.uv == idx.y && cached.normal == idx.z) {
+            outIx = cached.outIndex;
+            return true;
+        }
+        if (!findCornerIndex(idx.x, idx.y, idx.z, 0u, outIx) &&
+            !materializeCornerIndex(idx.x, idx.y, idx.z, 0u, outIx))
+            return false;
+        cached.pos = idx.x;
+        cached.uv = idx.y;
+        cached.normal = idx.z;
+        cached.outIndex = outIx;
+        return true;
+    };
+    auto acquireCornerIndexPositiveNormal = [&](const hlsl::int32_t3& idx,
+                                                uint32_t& outIx) -> bool {
+        // Fast-path corner lookup for position + normal corners: the uv index is
+        // always treated as -1 and the smoothing group as 0. Probes the hot-cache
+        // slot chosen by the hash, else uses the dedup chains and refreshes that slot.
+        const uint32_t hashPos = static_cast<uint32_t>(idx.x) * 73856093u;
+        const uint32_t hashNormal = static_cast<uint32_t>(idx.z) * 83492791u;
+        auto& cached = dedupHotCache[static_cast<size_t>(hashPos ^ hashNormal ^ 0x9e3779b9u) & dedupHotMask];
+        if (cached.pos == idx.x && cached.uv == -1 && cached.normal == idx.z) {
+            outIx = cached.outIndex;
+            return true;
+        }
+        if (!findCornerIndex(idx.x, -1, idx.z, 0u, outIx) &&
+            !materializeCornerIndex(idx.x, -1, idx.z, 0u, outIx))
+            return false;
+        cached.pos = idx.x;
+        cached.uv = -1;
+        cached.normal = idx.z;
+        cached.outIndex = outIx;
+        return true;
+    };
+    auto acquireTriangleCorners = [&](auto&& acquire, const std::array<hlsl::int32_t3, 3>& triIdx, hlsl::uint32_t3& cornerIx) -> bool {
+        return acquire(std::get<0>(triIdx), cornerIx.x) && acquire(std::get<1>(triIdx), cornerIx.y) && acquire(std::get<2>(triIdx), cornerIx.z); // stops at the first corner that fails
+    };
+    auto appendTriangle = [&](const hlsl::uint32_t3& cornerIx) -> bool {
+        // Emits the corners in z, y, x order and, when at least one of them lacks
+        // a source normal, feeds the triangle to the smoothing accumulator.
+        if (!appendIndex(cornerIx.z) || !appendIndex(cornerIx.y) || !appendIndex(cornerIx.x))
+            return false;
+        if (!needsNormalGeneration)
+            return true;
+        if (!smoothNormalAccumulator) {
+            smoothNormalAccumulator.emplace(CPolygonGeometryManipulator::ESmoothNormalAccumulationMode::AreaWeighted);
+            smoothNormalAccumulator->reserveVertices(outVertexWriteCount);
+            smoothNormalAccumulator->prepareIdentityGroups(outPositions.size());
+        }
+        const auto lacksNormal = [&](const uint32_t ix) { return outNormalNeedsGeneration[static_cast<size_t>(ix)] != 0u; };
+        if (!lacksNormal(cornerIx.x) && !lacksNormal(cornerIx.y) && !lacksNormal(cornerIx.z))
+            return true;
+        return smoothNormalAccumulator->addPreparedIdentityTriangle(cornerIx.z, outPositions[static_cast<size_t>(cornerIx.z)],
+            cornerIx.y, outPositions[static_cast<size_t>(cornerIx.y)],
+            cornerIx.x, outPositions[static_cast<size_t>(cornerIx.x)]);
+    };
+    uint32_t currentSmoothingGroup = 0u;
+    while (bufPtr < bufEnd) {
+        const char* const lineStart = bufPtr;
+        const size_t remaining = static_cast<size_t>(bufEnd - lineStart);
+        const char* lineTerminator =
+            static_cast<const char*>(std::memchr(lineStart, '\n', remaining));
+        if (!lineTerminator)
+            lineTerminator =
+                static_cast<const char*>(std::memchr(lineStart, '\r', remaining));
+        if (!lineTerminator)
+            lineTerminator = bufEnd;
+        const char* lineEnd = lineTerminator;
+        if (lineEnd > lineStart && lineEnd[-1] == '\r')
+            --lineEnd;
+        if (lineStart < lineEnd) {
+            const char lineType = std::tolower(*lineStart);
+            if (lineType == 'v') {
+				auto parseVector = [&](const char* ptr, float* values,
+									   const uint32_t count) -> bool {
+					// Reads `count` numbers from [ptr, lineEnd) into `values`, skipping inline whitespace before each one.
+					for (float* out = values; out != values + count; ++out) {
+						for (; ptr < lineEnd && Parse::Common::isInlineWhitespace(*ptr); ++ptr) {}
+						const bool parsed = ptr < lineEnd && Parse::Common::parseNumber(ptr, lineEnd, *out);
+						if (!parsed) return false;
+					}
+					return true;
+                };
+                const char subType = ((lineStart + 1) < lineEnd) // second character lower-cased, or '\0' for a one-character line
+                    ? static_cast<char>(std::tolower(static_cast<unsigned char>(lineStart[1]))) : '\0'; // unsigned char first: tolower is undefined for negative char values
+                if ((lineStart + 1) < lineEnd && Parse::Common::isInlineWhitespace(lineStart[1])) { // `v`: same separator test as vn/vt/o/g/s/f, so tab-separated positions are no longer dropped
+                    hlsl::float32_t3 vec{};
+                    if (!parseVector(lineStart + 2, &vec.x, 3u))
+                        return {};
+                    positions.push_back(vec);
+                    dedupHeadByPos.push_back(-1); // keep one (empty) dedup chain head per parsed position
+                } else if ((lineStart + 2) < lineEnd && subType == 'n' && // `vn`: three-component normal
+                           Parse::Common::isInlineWhitespace(lineStart[2])) {
+                    hlsl::float32_t3 vec{};
+                    if (!parseVector(lineStart + 3, &vec.x, 3u))
+                        return {};
+                    normals.push_back(vec);
+                } else if ((lineStart + 2) < lineEnd && subType == 't' && // `vt`: only the first two components are read
+                           Parse::Common::isInlineWhitespace(lineStart[2])) {
+                    hlsl::float32_t2 vec{};
+                    if (!parseVector(lineStart + 3, &vec.x, 2u))
+                        return {};
+                    vec.y = 1.f - vec.y; // stored with the V coordinate flipped
+                    uvs.push_back(vec);
+                }
+            } else if (lineType == 'o' && (lineStart + 1) < lineEnd &&
+                       Parse::Common::isInlineWhitespace(lineStart[1])) {
+                if (!finalizeCurrentGeometry())
+                    return {};
+                resetBuilderState();
+                currentObjectName =
+                    Parse::parseIdentifier(lineStart + 2, lineEnd, "default_object");
+                sawObjectDirective = true;
+            } else if (lineType == 'g' && (lineStart + 1) < lineEnd &&
+                       Parse::Common::isInlineWhitespace(lineStart[1])) {
+                if (!finalizeCurrentGeometry())
+                    return {};
+                resetBuilderState();
+                currentGroupName =
+                    Parse::parseIdentifier(lineStart + 2, lineEnd, "default_group");
+                sawGroupDirective = true;
+            } else if (lineType == 's' && (lineStart + 1) < lineEnd &&
+                       Parse::Common::isInlineWhitespace(lineStart[1])) {
+                Parse::parseSmoothingGroup(lineStart + 2, lineEnd,
+                                           currentSmoothingGroup);
+            } else if (lineType == 'f' && (lineStart + 1) < lineEnd &&
+                       Parse::Common::isInlineWhitespace(lineStart[1])) {
+                if (positions.empty())
+                    return {};
+                ++faceCount;
+                ++currentFaceCount;
+                const size_t posCount = positions.size();
+                const size_t uvCount = uvs.size();
+                const size_t normalCount = normals.size();
+                const char* triLinePtr = lineStart + 1;
+                std::array triIdx = {hlsl::int32_t3(-1, -1, -1),
+                                     hlsl::int32_t3(-1, -1, -1),
+                                     hlsl::int32_t3(-1, -1, -1)};
+                bool triangleFastPath = Parse::parseTrianglePositiveTripletLine(
+                    lineStart + 1, lineEnd, triIdx, posCount, uvCount, normalCount);
+                bool positiveNormalOnlyFastPath = false;
+                if (!triangleFastPath && uvCount == 0u && normalCount != 0u) {
+                    triangleFastPath = Parse::parseTrianglePositivePositionNormalLine(
+                        lineStart + 1, lineEnd, triIdx, posCount, normalCount);
+                    positiveNormalOnlyFastPath = triangleFastPath;
+                }
+                bool parsedFirstThree = triangleFastPath;
+                if (!triangleFastPath) {
+                    triLinePtr = lineStart + 1;
+                    parsedFirstThree =
+                        Parse::parseFaceVertexToken(triLinePtr, lineEnd, triIdx[0],
+                                                    posCount, uvCount, normalCount) &&
+                        Parse::parseFaceVertexToken(triLinePtr, lineEnd, triIdx[1],
+                                                    posCount, uvCount, normalCount) &&
+                        Parse::parseFaceVertexToken(triLinePtr, lineEnd, triIdx[2],
+                                                    posCount, uvCount, normalCount);
+                    triangleFastPath = parsedFirstThree;
+                    if (parsedFirstThree) {
+                        while (triLinePtr < lineEnd &&
+                               Parse::Common::isInlineWhitespace(*triLinePtr))
+                            ++triLinePtr;
+                        triangleFastPath = (triLinePtr == lineEnd);
+                    }
+                }
+                if (triangleFastPath && !positiveNormalOnlyFastPath) {
+                    const bool fullTriplet = std::all_of(
+                        triIdx.begin(), triIdx.end(), [](const hlsl::int32_t3& idx) {
+                            return hlsl::all(glm::greaterThanEqual(idx, hlsl::int32_t3(0)));
+                        });
+                    if (!fullTriplet)
+                        triangleFastPath = false;
+                }
+                if (triangleFastPath) {
+                    hlsl::uint32_t3 cornerIx = {};
+                    if (positiveNormalOnlyFastPath) {
+                        if (!acquireTriangleCorners(acquireCornerIndexPositiveNormal, triIdx, cornerIx))
+                            return {};
+                    } else if (!acquireTriangleCorners(acquireCornerIndexPositiveTriplet, triIdx, cornerIx))
+                        return {};
+                    faceFastTokenCount += 3u;
+                    currentFaceFastTokenCount += 3u;
+                    if (!appendTriangle(cornerIx))
+                        return {};
+                } else {
+                    const char* linePtr = lineStart + 1;
+                    uint32_t firstCorner = 0u;
+                    uint32_t previousCorner = 0u;
+                    uint32_t cornerCount = 0u;
+                    if (parsedFirstThree) {
+                        hlsl::uint32_t3 cornerIx = {};
+                        if (!acquireTriangleCorners([&](const hlsl::int32_t3& idx, uint32_t& outIx) { return acquireCornerIndex(idx, currentSmoothingGroup, outIx); }, triIdx, cornerIx))
+                            return {};
+                        faceFallbackTokenCount += 3u;
+                        currentFaceFallbackTokenCount += 3u;
+                        if (!appendTriangle(cornerIx))
+                            return {};
+                        firstCorner = cornerIx.x;
+                        previousCorner = cornerIx.z;
+                        cornerCount = 3u;
+                        linePtr = triLinePtr;
+                    }
+                    while (linePtr < lineEnd) {
+                        while (linePtr < lineEnd &&
+                               Parse::Common::isInlineWhitespace(*linePtr))
+                            ++linePtr;
+                        if (linePtr >= lineEnd)
+                            break;
+                        hlsl::int32_t3 idx(-1, -1, -1);
+                        if (!Parse::parseFaceVertexToken(linePtr, lineEnd, idx, posCount,
+                                                         uvCount, normalCount))
+                            return {};
+                        ++faceFallbackTokenCount;
+                        ++currentFaceFallbackTokenCount;
+                        uint32_t cornerIx = 0u;
+                        if (!acquireCornerIndex(idx, currentSmoothingGroup, cornerIx))
+                            return {};
+                        if (cornerCount == 0u) {
+                            firstCorner = cornerIx;
+                            ++cornerCount;
+                            continue;
+                        }
+                        if (cornerCount == 1u) {
+                            previousCorner = cornerIx;
+                            ++cornerCount;
+                            continue;
+                        }
+                        if (!appendTriangle(hlsl::uint32_t3(firstCorner, previousCorner, cornerIx)))
+                            return {};
+                        previousCorner = cornerIx;
+                        ++cornerCount;
+                    }
+                }
+            }
+        }
+        if (lineTerminator >= bufEnd)
+            bufPtr = bufEnd;
+        else if (*lineTerminator == '\r' && (lineTerminator + 1) < bufEnd &&
+                 lineTerminator[1] == '\n')
+            bufPtr = lineTerminator + 2;
+        else
+            bufPtr = lineTerminator + 1;
+    }
+    if (!finalizeCurrentGeometry())
+        return {};
+    if (loadedGeometries.empty())
+        return {};
+    uint64_t outVertexCount = 0ull;
+    uint64_t outIndexCount = 0ull;
+    uint64_t faceFastTokenCountSum = 0ull;
+    uint64_t faceFallbackTokenCountSum = 0ull;
+    for (const auto& loaded : loadedGeometries) {
+        const auto& posView = loaded.geometry->getPositionView();
+        outVertexCount +=
+            static_cast<uint64_t>(posView ? posView.getElementCount() : 0ull);
+        const auto& indexView = loaded.geometry->getIndexView();
+        outIndexCount +=
+            static_cast<uint64_t>(indexView ? indexView.getElementCount() : 0ull);
+        faceFastTokenCountSum += loaded.faceFastTokenCount;
+        faceFallbackTokenCountSum += loaded.faceFallbackTokenCount;
+    }
+	loadSession.logTinyIO(_params.logger, ioTelemetry);
+	core::vector<core::smart_refctd_ptr<ICPUGeometryCollection>> objectCollections;
+	objectCollections.reserve(loadedGeometries.size());
+	std::unordered_map<std::string_view, size_t> objectIndices;
+	objectIndices.reserve(loadedGeometries.size());
+	size_t currentObjectIx = ~size_t(0ull);
+	std::string_view currentCollectionObjectName;
+	for (auto& loaded : loadedGeometries) {
+		const std::string_view objectName(loaded.objectName);
+		size_t objectIx = currentObjectIx;
+		if (objectIx == ~size_t(0ull) || currentCollectionObjectName != objectName) {
+			auto [it, inserted] = objectIndices.try_emplace(objectName, objectCollections.size());
+			if (inserted) {
+				auto collection = core::make_smart_refctd_ptr<ICPUGeometryCollection>();
+				if (!collection)
+					return {};
+				objectCollections.push_back(std::move(collection));
 			}
+			objectIx = it->second;
+			currentObjectIx = objectIx;
+			currentCollectionObjectName = objectName;
 		}
-
-		// go to the next char
-		++p;
+		auto* refs = objectCollections[objectIx]->getGeometries();
+		if (!refs)
+			return {};
+        IGeometryCollection<ICPUBuffer>::SGeometryReference ref = {};
+		ref.geometry = core::smart_refctd_ptr_static_cast<IGeometry<ICPUBuffer>>(loaded.geometry);
+		refs->push_back(std::move(ref));
 	}
-
-	return true;
+	auto scene = ICPUScene::create(nullptr);
+	if (!scene)
+		return {};
+	auto& instances = scene->getInstances();
+	instances.resize(objectCollections.size());
+	auto morphTargets = instances.getMorphTargets();
+	for (size_t i = 0ull; i < objectCollections.size(); ++i) {
+		auto targets = core::make_smart_refctd_ptr<ICPUMorphTargets>();
+		if (!targets)
+			return {};
+        auto* targetList = targets->getTargets();
+        if (!targetList)
+            return {};
+        targetList->push_back({.geoCollection = std::move(objectCollections[i])});
+        morphTargets[i] = std::move(targets);
+    }
+	// Plain OBJ now loads as a flat scene so later material pairing can attach
+	// to scene instances. We keep identity transforms here and leave material
+	// tables invalid until `MTL` support lands.
+	core::vector<core::smart_refctd_ptr<IAsset>> outputAssets;
+	outputAssets.push_back(core::smart_refctd_ptr_static_cast<IAsset>(std::move(scene)));
+	const uint64_t objectCount = objectCollections.size();
+	_params.logger.log(
+		"OBJ loader stats: file=%s in(v=%llu n=%llu uv=%llu) out(v=%llu idx=%llu "
+		"faces=%llu face_fast_tokens=%llu face_fallback_tokens=%llu "
+		"geometries=%llu objects=%llu io_reads=%llu io_min_read=%llu "
+		"io_avg_read=%llu io_req=%s io_eff=%s io_chunk=%llu io_reason=%s",
+		system::ILogger::ELL_PERFORMANCE, _file->getFileName().string().c_str(),
+		static_cast<unsigned long long>(positions.size()),
+        static_cast<unsigned long long>(normals.size()),
+        static_cast<unsigned long long>(uvs.size()),
+        static_cast<unsigned long long>(outVertexCount),
+		static_cast<unsigned long long>(outIndexCount),
+		static_cast<unsigned long long>(faceCount),
+		static_cast<unsigned long long>(faceFastTokenCountSum),
+		static_cast<unsigned long long>(faceFallbackTokenCountSum),
+		static_cast<unsigned long long>(loadedGeometries.size()),
+		static_cast<unsigned long long>(objectCount),
+		static_cast<unsigned long long>(ioTelemetry.callCount),
+		static_cast<unsigned long long>(ioTelemetry.getMinOrZero()),
+        static_cast<unsigned long long>(ioTelemetry.getAvgOrZero()),
+        system::to_string(_params.ioPolicy.strategy).c_str(),
+        system::to_string(loadSession.ioPlan.strategy).c_str(),
+        static_cast<unsigned long long>(loadSession.ioPlan.chunkSizeBytes()), loadSession.ioPlan.reason);
+    return SAssetBundle(core::smart_refctd_ptr<IAssetMetadata>(),
+                        std::move(outputAssets));
 }
-
-std::string COBJMeshFileLoader::genKeyForMeshBuf(const SContext& _ctx, const std::string& _baseKey, const std::string& _mtlName, const std::string& _grpName) const
-{
-    return _baseKey + "?" + _grpName + "?" + _mtlName;
 }
-
-
-
-
-} // end namespace scene
-} // end namespace nbl
-
 #endif // _NBL_COMPILE_WITH_OBJ_LOADER_
diff --git a/src/nbl/asset/interchange/COBJMeshFileLoader.h b/src/nbl/asset/interchange/COBJMeshFileLoader.h
index c11a09e671..2af6f62bd7 100644
--- a/src/nbl/asset/interchange/COBJMeshFileLoader.h
+++ b/src/nbl/asset/interchange/COBJMeshFileLoader.h
@@ -1,136 +1,38 @@
-// Copyright (C) 2019-2025 - DevSH Graphics Programming Sp. z O.O.
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine" and was originally part of the "Irrlicht Engine"
 // For conditions of distribution and use, see copyright notice in nabla.h
 // See the original file in irrlicht source for authors
 #ifndef _NBL_ASSET_C_OBJ_MESH_FILE_LOADER_H_INCLUDED_
 #define _NBL_ASSET_C_OBJ_MESH_FILE_LOADER_H_INCLUDED_
-
 #include "nbl/core/declarations.h"
-#include "nbl/asset/ICPUPolygonGeometry.h"
-#include "nbl/asset/interchange/IAssetLoader.h"
-#include "nbl/asset/metadata/CMTLMetadata.h"
-
+#include "nbl/asset/interchange/ISceneLoader.h"
 namespace nbl::asset
 {
-
-#include "nbl/nblpack.h"
-class SObjVertex
-{
-public:
-    inline bool operator<(const SObjVertex& other) const
-    {
-        if (pos[0]==other.pos[0])
-        {
-            if (pos[1]==other.pos[1])
-            {
-                if (pos[2]==other.pos[2])
-                {
-                    if (uv[0]==other.uv[0])
-                    {
-                        if (uv[1]==other.uv[1])
-                            return normal32bit<other.normal32bit;
-
-                        return uv[1]<other.uv[1];
-                    }
-                    return uv[0]<other.uv[0];
-                }
-                return pos[2]<other.pos[2];
-            }
-            return pos[1]<other.pos[1];
-        }
-
-        return pos[0]<other.pos[0];
-    }
-    inline bool operator==(const SObjVertex& other) const
-    {
-        return pos[0]==other.pos[0]&&pos[1]==other.pos[1]&&pos[2]==other.pos[2]&&uv[0]==other.uv[0]&&uv[1]==other.uv[1]&&normal32bit==other.normal32bit;
-    }
-    float pos[3];
-    float uv[2];
-    CQuantNormalCache::value_type_t<EF_A2B10G10R10_SNORM_PACK32> normal32bit;
-} PACK_STRUCT;
-#include "nbl/nblunpack.h"
-
-//! Meshloader capable of loading obj meshes.
-class COBJMeshFileLoader : public IGeometryLoader
+/**
+	Loads plain OBJ into a flat `ICPUScene`.
+	Multiple `o` and `g` blocks become separate scene instances backed by
+	geometry collections.
+	All instance transforms stay identity here.
+	Material tables stay invalid until `MTL` support is implemented.
+
+	References:
+	- https://www.loc.gov/preservation/digital/formats/fdd/fdd000507
+	- https://www.fileformat.info/format/wavefrontobj/egff.htm
+*/
+class COBJMeshFileLoader : public ISceneLoader
 {
-    struct SContext
-    {
-        SContext(const IAssetLoader::SAssetLoadContext& _innerCtx, uint32_t _topHierarchyLevel, IAssetLoader::IAssetLoaderOverride* _override)
-														: inner(_innerCtx), topHierarchyLevel(_topHierarchyLevel), loaderOverride(_override) {}
-
-        IAssetLoader::SAssetLoadContext inner;
-		uint32_t topHierarchyLevel;
-        IAssetLoader::IAssetLoaderOverride* loaderOverride;
-
-        const bool useGroups = false;
-        const bool useMaterials = true;
-    };
-
-protected:
-	//! destructor
-	virtual ~COBJMeshFileLoader();
+	public:
+		~COBJMeshFileLoader() override;
 
-public:
-	//! Constructor
-	COBJMeshFileLoader(IAssetManager* _manager);
+		//! Constructor
+		explicit COBJMeshFileLoader(IAssetManager* _manager);
 
-    inline bool isALoadableFileFormat(system::IFile* _file, const system::logger_opt_ptr logger) const override
-    {
-        // OBJ doesn't really have any header but usually starts with a comment
-        system::IFile::success_t succ;
-        char firstChar = 0;
-        _file->read(succ, &firstChar, 0, sizeof(firstChar));
-        return succ && (firstChar =='#' || firstChar =='v');
-    }
+		bool isALoadableFileFormat(system::IFile* _file, const system::logger_opt_ptr logger) const override;
 
-    virtual const char** getAssociatedFileExtensions() const override
-    {
-        static const char* ext[]{ "obj", nullptr };
-        return ext;
-    }
+		const char** getAssociatedFileExtensions() const override;
 
-    virtual asset::SAssetBundle loadAsset(system::IFile* _file, const asset::IAssetLoader::SAssetLoadParams& _params, asset::IAssetLoader::IAssetLoaderOverride* _override = nullptr, uint32_t _hierarchyLevel = 0u) override;
-
-private:
-	// returns a pointer to the first printable character available in the buffer
-	const char* goFirstWord(const char* buf, const char* const bufEnd, bool acrossNewlines=true);
-	// returns a pointer to the first printable character after the first non-printable
-	const char* goNextWord(const char* buf, const char* const bufEnd, bool acrossNewlines=true);
-	// returns a pointer to the next printable character after the first line break
-	const char* goNextLine(const char* buf, const char* const bufEnd);
-	// copies the current word from the inBuf to the outBuf
-	uint32_t copyWord(char* outBuf, const char* inBuf, uint32_t outBufLength, const char* const pBufEnd);
-	// copies the current line from the inBuf to the outBuf
-	std::string copyLine(const char* inBuf, const char* const bufEnd);
-
-	// combination of goNextWord followed by copyWord
-	const char* goAndCopyNextWord(char* outBuf, const char* inBuf, uint32_t outBufLength, const char* const pBufEnd);
-
-	//! Read 3d vector of floats
-	const char* readVec3(const char* bufPtr, float vec[3], const char* const pBufEnd);
-	//! Read 2d vector of floats
-	const char* readUV(const char* bufPtr, float vec[2], const char* const pBufEnd);
-	//! Read boolean value represented as 'on' or 'off'
-	const char* readBool(const char* bufPtr, bool& tf, const char* const bufEnd);
-
-	// reads and convert to integer the vertex indices in a line of obj file's face statement
-	// -1 for the index if it doesn't exist
-	// indices are changed to 0-based index instead of 1-based from the obj file
-	bool retrieveVertexIndices(char* vertexData, int32_t* idx, const char* bufEnd, uint32_t vbsize, uint32_t vtsize, uint32_t vnsize);
-
-    std::string genKeyForMeshBuf(const SContext& _ctx, const std::string& _baseKey, const std::string& _mtlName, const std::string& _grpName) const;
-
-	IAssetManager* AssetManager;
-	system::ISystem* System;
-
-	template<typename aType>
-	static inline void performActionBasedOnOrientationSystem(aType& varToHandle, void (*performOnCertainOrientation)(aType& varToHandle))
-	{
-		performOnCertainOrientation(varToHandle);
-	}
+		//! Loads one OBJ asset bundle from an already opened file.
+		asset::SAssetBundle loadAsset(system::IFile* _file, const asset::IAssetLoader::SAssetLoadParams& _params, asset::IAssetLoader::IAssetLoaderOverride* _override = nullptr, uint32_t _hierarchyLevel = 0u) override;
 };
-
 } // end namespace nbl::asset
-
 #endif
diff --git a/src/nbl/asset/interchange/COBJMeshWriter.cpp b/src/nbl/asset/interchange/COBJMeshWriter.cpp
new file mode 100644
index 0000000000..ccd48e599d
--- /dev/null
+++ b/src/nbl/asset/interchange/COBJMeshWriter.cpp
@@ -0,0 +1,296 @@
+#ifdef _NBL_COMPILE_WITH_OBJ_WRITER_
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#include "nbl/asset/interchange/COBJMeshWriter.h"
+#include "nbl/asset/interchange/SGeometryViewDecode.h"
+#include "nbl/asset/interchange/SGeometryWriterCommon.h"
+#include "nbl/asset/interchange/SOBJPolygonGeometryAuxLayout.h"
+#include "nbl/asset/interchange/SInterchangeIO.h"
+#include "impl/SFileAccess.h"
+#include "nbl/builtin/hlsl/array_accessors.hlsl"
+#include "nbl/builtin/hlsl/vector_utils/vector_traits.hlsl"
+#include "nbl/system/IFile.h"
+#include <algorithm>
+#include <array>
+#include <charconv>
+#include <cstdio>
+#include <cstring>
+#include <limits>
+#include <system_error>
+namespace nbl::asset
+{
+COBJMeshWriter::COBJMeshWriter() // Default constructor; the body only does debug-build naming.
+{
+	#ifdef _NBL_DEBUG
+	setDebugName("COBJMeshWriter"); // compiled in only when _NBL_DEBUG is defined
+	#endif
+}
+uint64_t COBJMeshWriter::getSupportedAssetTypesBitfield() const // Returns the OR of the IAsset type bits this writer advertises.
+{
+	return IAsset::ET_GEOMETRY | IAsset::ET_GEOMETRY_COLLECTION | IAsset::ET_SCENE; // geometry, geometry collection and scene bits
+}
+const char** COBJMeshWriter::getAssociatedFileExtensions() const // Returns the nullptr-terminated list of handled extensions.
+{
+	static const char* extensionList[] = { "obj", nullptr }; // function-local static, so the returned pointer stays valid after the call
+	return extensionList;
+}
+writer_flags_t COBJMeshWriter::getSupportedFlags() // Returns the writer flags this writer reports as supported.
+{
+	return EWF_MESH_IS_RIGHT_HANDED; // the only flag reported; writeAsset reads it to decide whether X is negated
+}
+writer_flags_t COBJMeshWriter::getForcedFlags() // Returns the writer flags this writer reports as forced.
+{
+	return EWF_NONE; // no flag is forced
+}
+namespace // unnamed namespace: the helpers below have internal linkage
+{
+struct Parse // Static helpers for OBJ text emission and transform handling; called from COBJMeshWriter::writeAsset.
+{
+	static constexpr size_t MaxFloatTextChars = std::numeric_limits<float>::max_digits10 + 8ull; // scratch budget per float: max_digits10 plus 8 further characters
+	static constexpr size_t MaxUInt32Chars = std::numeric_limits<uint32_t>::digits10 + 1ull; // digits10 + 1 characters per uint32_t
+	static constexpr size_t MaxIndexTokenBytes = MaxUInt32Chars * 3ull + 2ull; // up to three indices plus two '/' separators
+	struct IndexStringRef { uint32_t offset = 0u; uint16_t length = 0u; }; // offset/length of one formatted index token inside a shared string
+	struct GeometryTransformState { hlsl::float32_t3x4 transform; hlsl::float32_t3x3 linear; bool identity = true; bool reverseWinding = false; hlsl::math::linalg::cofactors_base<float, 3> normalTransform; }; // per-item transform data, filled in by createTransformState
+
+	template<typename Vec>
+	static void appendVecLine(std::string& out, const char* prefix, const size_t prefixSize, const Vec& values) // Appends prefix, the N components separated by spaces, and a trailing newline to out.
+	{
+		constexpr size_t N = hlsl::vector_traits<Vec>::Dimension;
+		const size_t oldSize = out.size();
+		out.resize(oldSize + prefixSize + (N * MaxFloatTextChars) + N); // worst case: prefix + N floats + N separator/newline bytes
+		char* const lineBegin = out.data() + oldSize;
+		char* cursor = lineBegin;
+		char* const lineEnd = out.data() + out.size();
+		hlsl::array_get<Vec, float> getter;
+		std::memcpy(cursor, prefix, prefixSize); // prefixSize bytes are copied; no terminator is required or written
+		cursor += prefixSize;
+		for (size_t i = 0ull; i < N; ++i)
+		{
+			cursor = SGeometryWriterCommon::appendFloatToBuffer(cursor, lineEnd, getter(values, static_cast<uint32_t>(i))); // returns the advanced cursor
+			if (cursor < lineEnd)
+				*(cursor++) = (i + 1ull < N) ? ' ' : '\n'; // space between components, newline after the last one
+		}
+		out.resize(oldSize + static_cast<size_t>(cursor - lineBegin)); // trim back to the bytes actually written
+	}
+
+	static void appendFaceLine(std::string& out, const std::string& storage, const core::vector<IndexStringRef>& refs, const hlsl::uint32_t3& face) // Appends "f t0 t1 t2\n" using the pre-formatted tokens selected by face.x/y/z.
+	{
+		const auto& ref0 = refs[face.x]; // no bounds check here; writeAsset validates the indices before calling
+		const auto& ref1 = refs[face.y];
+		const auto& ref2 = refs[face.z];
+		const size_t oldSize = out.size();
+		const size_t lineSize = 2ull + static_cast<size_t>(ref0.length) + 1ull + static_cast<size_t>(ref1.length) + 1ull + static_cast<size_t>(ref2.length) + 1ull; // "f " + three tokens + two spaces + '\n'
+		out.resize(oldSize + lineSize); // exact size, so no trimming is needed afterwards
+		char* cursor = out.data() + oldSize;
+		*(cursor++) = 'f';
+		*(cursor++) = ' ';
+		std::memcpy(cursor, storage.data() + ref0.offset, ref0.length);
+		cursor += ref0.length;
+		*(cursor++) = ' ';
+		std::memcpy(cursor, storage.data() + ref1.offset, ref1.length);
+		cursor += ref1.length;
+		*(cursor++) = ' ';
+		std::memcpy(cursor, storage.data() + ref2.offset, ref2.length);
+		cursor += ref2.length;
+		*(cursor++) = '\n';
+	}
+
+	static void appendIndexToken(std::string& storage, core::vector<IndexStringRef>& refs, const uint32_t positionIx, const bool hasUVs, const uint32_t uvIx, const bool hasNormals, const uint32_t normalIx) // Formats one "p", "p/t", "p//n" or "p/t/n" token at the end of storage and records its location in refs.
+	{
+		IndexStringRef ref = {};
+		ref.offset = static_cast<uint32_t>(storage.size()); // token starts at the current end of storage
+		const size_t oldSize = storage.size();
+		storage.resize(oldSize + MaxIndexTokenBytes); // grow by the worst-case token size; trimmed below
+		char* const token = storage.data() + oldSize;
+		char* const tokenEnd = token + MaxIndexTokenBytes;
+		char* cursor = token;
+		cursor = SGeometryWriterCommon::appendUIntToBuffer(cursor, tokenEnd, positionIx);
+		if (hasUVs || hasNormals)
+		{
+			if (cursor < tokenEnd)
+				*(cursor++) = '/';
+			if (hasUVs) // left empty when only normals exist, which yields "p//n"
+				cursor = SGeometryWriterCommon::appendUIntToBuffer(cursor, tokenEnd, uvIx);
+			if (hasNormals)
+			{
+				if (cursor < tokenEnd)
+					*(cursor++) = '/';
+				cursor = SGeometryWriterCommon::appendUIntToBuffer(cursor, tokenEnd, normalIx);
+			}
+		}
+		storage.resize(oldSize + static_cast<size_t>(cursor - token)); // keep only the bytes actually written
+		ref.length = static_cast<uint16_t>(storage.size() - ref.offset);
+		refs.push_back(ref);
+	}
+
+	static void appendHeader(std::string& out, const SGeometryWriterCommon::SPolygonGeometryWriteItem& item) // Appends the "o <name>" object line for one write item.
+	{
+		std::array<char, 128> name = {}; // zero-initialized scratch buffer for snprintf
+		if (item.instanceIx != ~0u) // ~0u is treated as "no instance"
+			std::snprintf(name.data(), name.size(), "o instance_%u_target_%u_geometry_%u\n", item.instanceIx, item.targetIx, item.geometryIx);
+		else
+			std::snprintf(name.data(), name.size(), "o geometry_%u\n", item.geometryIx);
+		out.append(name.data());
+	}
+
+	static GeometryTransformState createTransformState(const hlsl::float32_t3x4& transform) // Precomputes the 3x3 linear part, identity flag, negative-determinant flag and normal transform.
+	{
+		const auto linear = hlsl::float32_t3x3(transform);
+		return {.transform = transform, .linear = linear, .identity = SGeometryWriterCommon::isIdentityTransform(transform), .reverseWinding = hlsl::determinant(linear) < 0.f, .normalTransform = hlsl::math::linalg::cofactors_base<float, 3>::create(linear)};
+	}
+	static hlsl::float32_t3 applyPosition(const GeometryTransformState& state, const hlsl::float32_t3& value) { return state.identity ? value : hlsl::mul(state.transform, hlsl::float32_t4(value.x, value.y, value.z, 1.f)); } // identity fast path, otherwise transform * (value, 1)
+	static hlsl::float32_t3 applyNormal(const GeometryTransformState& state, const hlsl::float32_t3& value) { return state.identity ? value : state.normalTransform.normalTransform(value); } // identity fast path, otherwise the cofactor-based normal transform
+};
+}
+bool COBJMeshWriter::writeAsset(system::IFile* _file, const SAssetWriteParams& _params, IAssetWriterOverride* _override) // Builds the whole OBJ text in memory for every collected polygon geometry, then writes it through SInterchangeIO::writeFileWithPolicy; returns false on any validation failure, otherwise the write result.
+{
+	SFileWriteTelemetry ioTelemetry = {}; // filled by the write call and reported in the log lines at the end
+	if (!_override)
+		getDefaultOverride(_override); // fall back to the default override when the caller passed none
+	if (!_file || !_params.rootAsset)
+		return false;
+	const auto items = SGeometryWriterCommon::collectPolygonGeometryWriteItems(_params.rootAsset);
+	if (items.empty())
+		return false; // nothing to write
+	SAssetWriteContext ctx = {_params, _file};
+	system::IFile* file = _override->getOutputFile(_file, ctx, {_params.rootAsset, 0u}); // the override may redirect the output file
+	if (!file)
+		return false;
+	std::string output;
+	output.append("# Nabla OBJ\n");
+	uint64_t totalVertexCount = 0ull; // totals are only used for the stats log line
+	uint64_t totalFaceCount = 0ull;
+	uint32_t positionBase = 1u; // index bases start at 1 and keep running across all objects written to the file
+	uint32_t uvBase = 1u;
+	uint32_t normalBase = 1u;
+	using SemanticDecode = SGeometryViewDecode::Prepared<SGeometryViewDecode::EMode::Semantic>;
+	for (size_t itemIx = 0u; itemIx < items.size(); ++itemIx)
+	{
+		const auto& item = items[itemIx];
+		const auto* geom = item.geometry;
+		if (!geom || !geom->valid())
+			return false;
+		const auto& positionView = geom->getPositionView();
+		if (!positionView)
+			return false; // positions are mandatory
+		const auto& normalView = geom->getNormalView();
+		const bool hasNormals = static_cast<bool>(normalView);
+		const size_t vertexCount = positionView.getElementCount();
+		const ICPUPolygonGeometry::SDataView* uvView = SGeometryWriterCommon::getAuxViewAt(geom, SOBJPolygonGeometryAuxLayout::UV0, vertexCount);
+		if (uvView && getFormatChannelCount(uvView->composed.format) != 2u)
+			uvView = nullptr; // only two-channel UV views are written; any other channel count is dropped
+		const bool hasUVs = uvView != nullptr;
+		if (vertexCount == 0ull)
+			return false;
+		if (hasNormals && normalView.getElementCount() != vertexCount)
+			return false; // one normal per position is required
+		if (hasUVs && uvView->getElementCount() != vertexCount)
+			return false; // one UV per position is required
+		const auto* indexing = geom->getIndexingCallback();
+		if (!indexing)
+			return false;
+		if (indexing->knownTopology() != E_PRIMITIVE_TOPOLOGY::EPT_TRIANGLE_LIST)
+			return false; // only triangle lists are accepted
+		size_t faceCount = 0ull;
+		if (!SGeometryWriterCommon::getTriangleFaceCount(geom, faceCount))
+			return false;
+		const auto flags = _override->getAssetWritingFlags(ctx, geom, 0u);
+		const bool flipHandedness = !flags.hasAnyFlag(E_WRITER_FLAGS::EWF_MESH_IS_RIGHT_HANDED); // X is negated below unless the right-handed flag is set
+		// Scene input is flattened here by baking transforms and writing every collected polygon geometry as its own OBJ object block.
+		const auto transformState = Parse::createTransformState(item.transform);
+		const hlsl::float32_t3* const tightPositions = SGeometryWriterCommon::getTightView<hlsl::float32_t3, EF_R32G32B32_SFLOAT>(positionView); // a non-null tight pointer is read directly; a null one falls back to the decoders below
+		const hlsl::float32_t3* const tightNormals = hasNormals ? SGeometryWriterCommon::getTightView<hlsl::float32_t3, EF_R32G32B32_SFLOAT>(normalView) : nullptr;
+		const hlsl::float32_t2* const tightUV = hasUVs ? SGeometryWriterCommon::getTightView<hlsl::float32_t2, EF_R32G32_SFLOAT>(*uvView) : nullptr;
+		const SemanticDecode positionDecode = tightPositions ? SemanticDecode{} : SGeometryViewDecode::prepare<SGeometryViewDecode::EMode::Semantic>(positionView); // a decoder is prepared only when the tight path is unavailable
+		const SemanticDecode uvDecode = (!hasUVs || tightUV) ? SemanticDecode{} : SGeometryViewDecode::prepare<SGeometryViewDecode::EMode::Semantic>(*uvView);
+		const SemanticDecode normalDecode = (!hasNormals || tightNormals) ? SemanticDecode{} : SGeometryViewDecode::prepare<SGeometryViewDecode::EMode::Semantic>(normalView);
+		if (itemIx != 0u)
+			output.push_back('\n'); // blank line between object blocks
+		Parse::appendHeader(output, item);
+		for (size_t i = 0u; i < vertexCount; ++i) // "v" lines
+		{
+			hlsl::float32_t3 vertex = {};
+			if (tightPositions)
+				vertex = tightPositions[i];
+			else if (!positionDecode.decode(i, vertex))
+				return false;
+			vertex = Parse::applyPosition(transformState, vertex); // bake the item transform
+			if (flipHandedness)
+				vertex.x = -vertex.x;
+			Parse::appendVecLine<hlsl::float32_t3>(output, "v ", sizeof("v ") - 1ull, vertex);
+		}
+		if (hasUVs)
+		{
+			for (size_t i = 0u; i < vertexCount; ++i) // "vt" lines
+			{
+				hlsl::float32_t2 uv = {};
+				if (tightUV)
+					uv = hlsl::float32_t2(tightUV[i].x, 1.f - tightUV[i].y); // V is written as 1 - v
+				else if (!uvDecode.decode(i, uv))
+					return false;
+				if (!tightUV)
+					uv.y = 1.f - uv.y; // same V flip for the decoded path
+				Parse::appendVecLine<hlsl::float32_t2>(output, "vt ", sizeof("vt ") - 1ull, uv);
+			}
+		}
+		if (hasNormals)
+		{
+			for (size_t i = 0u; i < vertexCount; ++i) // "vn" lines
+			{
+				hlsl::float32_t3 normal = {};
+				if (tightNormals)
+					normal = tightNormals[i];
+				else if (!normalDecode.decode(i, normal))
+					return false;
+				normal = Parse::applyNormal(transformState, normal);
+				if (flipHandedness)
+					normal.x = -normal.x;
+				Parse::appendVecLine<hlsl::float32_t3>(output, "vn ", sizeof("vn ") - 1ull, normal);
+			}
+		}
+		core::vector<Parse::IndexStringRef> faceIndexRefs; // one pre-formatted index token per vertex, reused by every face that references it
+		faceIndexRefs.reserve(vertexCount);
+		std::string faceIndexStorage;
+		faceIndexStorage.reserve(vertexCount * 24ull); // capacity hint only; appendIndexToken grows the string as needed
+		for (size_t i = 0u; i < vertexCount; ++i)
+		{
+			const uint32_t positionIx = positionBase + static_cast<uint32_t>(i);
+			const uint32_t uvIx = hasUVs ? (uvBase + static_cast<uint32_t>(i)) : 0u; // 0 is a placeholder that is never written when the attribute is absent
+			const uint32_t normalIx = hasNormals ? (normalBase + static_cast<uint32_t>(i)) : 0u;
+			Parse::appendIndexToken(faceIndexStorage, faceIndexRefs, positionIx, hasUVs, uvIx, hasNormals, normalIx);
+		}
+		const hlsl::uint32_t3 faceLimit(static_cast<uint32_t>(faceIndexRefs.size())); // every face index must be below the token count
+		if (!SGeometryWriterCommon::visitTriangleIndices(geom, [&](const uint32_t i0, const uint32_t i1, const uint32_t i2) -> bool { // "f" lines
+			const hlsl::uint32_t3 face(transformState.reverseWinding ? i0 : i2, i1, transformState.reverseWinding ? i2 : i0); // emits (i2,i1,i0), or (i0,i1,i2) for a negative-determinant transform; NOTE(review): this choice ignores flipHandedness - confirm it is intended when EWF_MESH_IS_RIGHT_HANDED is set
+			if (hlsl::any(glm::greaterThanEqual(face, faceLimit)))
+				return false; // out-of-range index makes the visit return false
+			Parse::appendFaceLine(output, faceIndexStorage, faceIndexRefs, face);
+			return true;
+		}))
+			return false;
+
+		positionBase += static_cast<uint32_t>(vertexCount); // advance the running bases for the next object
+		if (hasUVs)
+			uvBase += static_cast<uint32_t>(vertexCount); // uv/normal bases only move when that attribute was written
+		if (hasNormals)
+			normalBase += static_cast<uint32_t>(vertexCount);
+		totalVertexCount += vertexCount;
+		totalFaceCount += faceCount;
+	}
+	const auto ioPlan = impl::SFileAccess::resolvePlan(_params.ioPolicy, static_cast<uint64_t>(output.size()), true, file);
+	if (impl::SFileAccess::logInvalidPlan(_params.logger, "OBJ writer", file->getFileName().string().c_str(), ioPlan))
+		return false; // a true result from logInvalidPlan aborts the write
+	const bool writeOk = SInterchangeIO::writeFileWithPolicy(file, ioPlan, output.data(), output.size(), &ioTelemetry);
+	const uint64_t ioMinWrite = ioTelemetry.getMinOrZero();
+	const uint64_t ioAvgWrite = ioTelemetry.getAvgOrZero();
+	impl::SFileAccess::logTinyIO(_params.logger, "OBJ writer", file->getFileName().string().c_str(), ioTelemetry, static_cast<uint64_t>(output.size()), _params.ioPolicy, "writes");
+	_params.logger.log("OBJ writer stats: file=%s bytes=%llu vertices=%llu faces=%llu geometries=%llu io_writes=%llu io_min_write=%llu io_avg_write=%llu io_req=%s io_eff=%s io_chunk=%llu io_reason=%s",
+		system::ILogger::ELL_PERFORMANCE, file->getFileName().string().c_str(), static_cast<unsigned long long>(output.size()),
+		static_cast<unsigned long long>(totalVertexCount), static_cast<unsigned long long>(totalFaceCount), static_cast<unsigned long long>(items.size()),
+		static_cast<unsigned long long>(ioTelemetry.callCount), static_cast<unsigned long long>(ioMinWrite), static_cast<unsigned long long>(ioAvgWrite),
+		system::to_string(_params.ioPolicy.strategy).c_str(), system::to_string(ioPlan.strategy).c_str(), static_cast<unsigned long long>(ioPlan.chunkSizeBytes()), ioPlan.reason);
+	return writeOk; // stats are logged regardless of the write result
+}
+}
+#endif // _NBL_COMPILE_WITH_OBJ_WRITER_
diff --git a/src/nbl/asset/interchange/CPLYMeshFileLoader.cpp b/src/nbl/asset/interchange/CPLYMeshFileLoader.cpp
index 932a04b82c..3e009207a6 100644
--- a/src/nbl/asset/interchange/CPLYMeshFileLoader.cpp
+++ b/src/nbl/asset/interchange/CPLYMeshFileLoader.cpp
@@ -1,883 +1,1840 @@
-// Copyright (C) 2019 - DevSH Graphics Programming Sp. z O.O.
+#ifdef _NBL_COMPILE_WITH_PLY_LOADER_
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine" and was originally part of the "Irrlicht Engine"
 // For conditions of distribution and use, see copyright notice in nabla.h
 // See the original file in irrlicht source for authors
-#ifdef _NBL_COMPILE_WITH_PLY_LOADER_
-
-
 #include "CPLYMeshFileLoader.h"
-
-#include <numeric>
-
+#include "impl/SBinaryData.h"
+#include "impl/SFileAccess.h"
+#include "impl/STextParse.h"
 #include "nbl/asset/IAssetManager.h"
-
-#include "nbl/system/ISystem.h"
+#include "nbl/asset/interchange/SGeometryContentHash.h"
+#include "nbl/asset/interchange/SGeometryLoaderCommon.h"
+#include "nbl/asset/interchange/SPLYPolygonGeometryAuxLayout.h"
+#include "nbl/asset/interchange/SInterchangeIO.h"
+#include "nbl/asset/interchange/SLoaderRuntimeTuning.h"
+#include "nbl/asset/metadata/CPLYMetadata.h"
+#include "nbl/builtin/hlsl/array_accessors.hlsl"
+#include "nbl/builtin/hlsl/shapes/AABBAccumulator.hlsl"
+#include "nbl/builtin/hlsl/vector_utils/vector_traits.hlsl"
+#include "nbl/core/hash/blake.h"
 #include "nbl/system/IFile.h"
-
-//#include "nbl/asset/utils/IMeshManipulator.h"
-
-
+#include "nbl/system/ISystem.h"
+#include <chrono>
+#include <thread>
 namespace nbl::asset
 {
-
-bool CPLYMeshFileLoader::isALoadableFileFormat(system::IFile* _file, const system::logger_opt_ptr logger) const
+namespace
 {
-    char buf[40];
-
-	system::IFile::success_t success;
-	_file->read(success,buf,0,sizeof(buf));
-	if (!success)
-		return false;
-
-    char* header = buf;
-    if (strncmp(header,"ply",3u)!=0)
-        return false;
-    
-    header += 4;
-    char* lf = strstr(header,"\n");
-    if (!lf)
-        return false;
-	
-    constexpr std::array<std::string_view,3> headers = {
-        "format ascii 1.0",
-        "format binary_little_endian 1.0",
-        "format binary_big_endian 1.0"
-    };
-	return std::find(headers.begin(),headers.end(),std::string_view(header,lf))!=headers.end();
-}
-
-template<typename T>
-inline T byteswap(const T& v)
-{
-	T retval;
-	auto it = reinterpret_cast<const char*>(&v);
-	std::reverse_copy(it,it+sizeof(T),reinterpret_cast<char*>(&retval));
-	return retval;
-}
-
-struct SContext
+struct Parse
 {
-	
-	//
-	struct SProperty
+    using Binary = impl::BinaryData;
+	using Common = impl::TextParse;
+	struct ContentHashBuild // Bookkeeping for setting ICPUBuffer content hashes either on the calling thread or on one background thread.
 	{
-		static E_FORMAT getType(const char* typeString)
-		{
-			if (strcmp(typeString, "char")==0 || strcmp(typeString, "int8")==0)
-				return EF_R8_SINT;
-			else if (strcmp(typeString, "uchar")==0 || strcmp(typeString, "uint8")==0)
-				return EF_R8_UINT;
-			else if (strcmp(typeString, "short")==0 || strcmp(typeString, "int16")==0)
-				return EF_R16_SINT;
-			else if (strcmp(typeString, "ushort")==0 || strcmp(typeString, "uint16")==0)
-				return EF_R16_UINT;
-			else if (strcmp(typeString, "long")==0 || strcmp(typeString, "int")==0 || strcmp(typeString, "int16")==0)
-				return EF_R32_SINT;
-			else if (strcmp(typeString, "ulong")==0 || strcmp(typeString, "uint16")==0)
-				return EF_R32_UINT;
-			else if (strcmp(typeString, "float")==0 || strcmp(typeString, "float32")==0)
-				return EF_R32_SFLOAT;
-			else if (strcmp(typeString, "double")==0 || strcmp(typeString, "float64")==0)
-				return EF_R64_SFLOAT;
-			else
-				return EF_UNKNOWN;
-		}
-
-		inline bool isList() const {return type==EF_UNKNOWN && asset::isIntegerFormat(list.countType) && asset::isIntegerFormat(list.itemType);}
-
-		void skip(SContext& _ctx) const
+		bool enabled = false; // master switch; when false neither hashNow nor tryDefer does anything
+		bool inlineHash = false; // true selects hashNow, false selects tryDefer
+		core::vector<core::smart_refctd_ptr<ICPUBuffer>> hashedBuffers = {}; // buffers already hashed by hashNow, held by reference
+		std::jthread deferredThread = {}; // the single background hashing thread started by tryDefer
+		static inline ContentHashBuild create(const bool enabled, const bool inlineHash) { return {.enabled = enabled, .inlineHash = inlineHash}; } // remaining members keep their defaults
+		inline bool hashesInline() const { return enabled && inlineHash; }
+		inline bool hashesDeferred() const { return enabled && !inlineHash; }
+		inline void hashNow(ICPUBuffer* const buffer) // Hashes buffer on the calling thread unless inline hashing is off, buffer is null, it already has a hash, or it was already recorded.
 		{
-			if (isList())
-			{
-				int32_t count = _ctx.getInt(list.countType);
-
-				for (decltype(count) i=0; i<count; ++i)
-					_ctx.getInt(list.countType);
-			}
-			else if (_ctx.IsBinaryFile)
-				_ctx.moveForward(getTexelOrBlockBytesize(type));
-			else
-				_ctx.getNextWord();
+			if (!hashesInline() || !buffer || buffer->getContentHash() != IPreHashed::INVALID_HASH)
+				return;
+			for (const auto& hashed : hashedBuffers) // linear scan for a buffer that was already handled
+				if (hashed.get() == buffer)
+					return;
+			buffer->setContentHash(buffer->computeContentHash());
+			hashedBuffers.push_back(core::smart_refctd_ptr<ICPUBuffer>(buffer));
 		}
-
-		std::string Name;
-		E_FORMAT type;
-		struct SListTypes
+		inline void tryDefer(ICPUBuffer* const buffer) // Starts a background hash of buffer; does nothing while deferredThread is still joinable.
 		{
-			E_FORMAT countType;
-			E_FORMAT itemType;
-		} list;
-	};
-	struct SElement
-	{
-		void skipElement(SContext& _ctx) const
-		{
-			if (_ctx.IsBinaryFile)
-			{
-				if (KnownSize)
-					_ctx.moveForward(KnownSize);
-				else
-				for (auto i=0u; i<Properties.size(); ++i)
-					Properties[i].skip(_ctx);
-			}
-			else
-				_ctx.getNextLine();
+			if (!hashesDeferred() || !buffer || deferredThread.joinable() || buffer->getContentHash() != IPreHashed::INVALID_HASH) // NOTE(review): a second buffer is silently skipped until wait() has joined the first thread - confirm intended
+				return;
+			auto keepAlive = core::smart_refctd_ptr<ICPUBuffer>(buffer); // the thread owns a reference, so the buffer outlives this call
+			deferredThread = std::jthread([buffer = std::move(keepAlive)]() mutable {buffer->setContentHash(buffer->computeContentHash());});
 		}
-
-		// name of the element. We only want "vertex" and "face" elements
-		// but we have to parse the others anyway.
-		std::string Name;
-		// Properties of this element
-		core::vector<SProperty> Properties;
-		// The number of elements in the file
-		size_t Count;
-		// known size in bytes, 0 if unknown
-		uint32_t KnownSize;
+		inline void wait() { if (deferredThread.joinable()) deferredThread.join(); } // blocks until the deferred hash, if any, has finished
 	};
-
-	inline void init()
+	static std::string_view toStringView(const char* text) // Null-safe conversion of a C string to a std::string_view.
 	{
-		EndPointer = StartPointer = Buffer.data();
-		LineEndPointer = EndPointer-1;
-
-		fillBuffer();
+		return text ? std::string_view{text} : std::string_view{}; // a null pointer maps to an empty view
 	}
-
-	// gets more data from the file
-	void fillBuffer()
+	struct Context
 	{
-		if (EndOfFile)
-			return;
-		else if (fileOffset>=inner.mainFile->getSize())
+		static constexpr uint64_t ReadWindowPaddingBytes = 1ull;
+		struct SProperty
 		{
-			EndOfFile = true;
-			return;
-		}
-		
-		const auto length = std::distance(StartPointer,EndPointer);
-		auto newStart = Buffer.data();
-		// copy the remaining data to the start of the buffer
-		if (length && StartPointer!=newStart)
-			memmove(newStart,StartPointer,length);
-		// reset start position
-		StartPointer = newStart;
-		EndPointer = newStart+length;
-
-		// read data from the file
-		const size_t requestSize = Buffer.size()-length;
-		system::IFile::success_t success;
-		inner.mainFile->read(success,EndPointer,fileOffset,requestSize);
-		const size_t bytesRead = success.getBytesProcessed();
-		fileOffset += bytesRead;
-		EndPointer += bytesRead;
-
-		// if we didn't completely fill the buffer
-		if (bytesRead!=requestSize)
-		{
-			// cauterize the string
-			*EndPointer = 0;
-			EndOfFile = true;
-		}
-	}
-	// Split the string data into a line in place by terminating it instead of copying.
-	const char* getNextLine()
-	{
-		// move the start pointer along
-		StartPointer = LineEndPointer+1;
-
-		// crlf split across buffer move
-		if (*StartPointer=='\n')
-			*(StartPointer++) = '\0';
-
-		// begin at the start of the next line
-		const std::array<const char,3> Terminators = { '\0','\r','\n'};
-		auto terminator = std::find_first_of(StartPointer,EndPointer,Terminators.begin(),Terminators.end());
-		if (terminator!=EndPointer)
-			*(terminator++) = '\0';
-
-		// we have reached the end of the buffer
-		if (terminator==EndPointer)
-		{
-			// get data from the file
-			if (EndOfFile)
+			static E_FORMAT getType(const char* typeString) // Maps a PLY scalar type name to an E_FORMAT; returns EF_UNKNOWN for anything unrecognized.
 			{
-				StartPointer = EndPointer-1;
-				*StartPointer = '\0';
-				return StartPointer;
+				struct STypeAlias // one (type name, format) pair of the lookup table
+				{
+					std::string_view name;
+					E_FORMAT format;
+				};
+				constexpr std::array<STypeAlias, 16> typeAliases = {{ // 8/16/32-bit integer names plus the 32-bit float names
+					{"char", EF_R8_SINT},
+					{"int8", EF_R8_SINT},
+					{"uchar", EF_R8_UINT},
+					{"uint8", EF_R8_UINT},
+					{"short", EF_R16_SINT},
+					{"int16", EF_R16_SINT},
+					{"ushort", EF_R16_UINT},
+					{"uint16", EF_R16_UINT},
+					{"long", EF_R32_SINT},
+					{"int", EF_R32_SINT},
+					{"int32", EF_R32_SINT},
+					{"ulong", EF_R32_UINT},
+					{"uint", EF_R32_UINT},
+					{"uint32", EF_R32_UINT},
+					{"float", EF_R32_SFLOAT},
+					{"float32", EF_R32_SFLOAT}
+				}};
+				const std::string_view typeName = Parse::toStringView(typeString); // a null typeString becomes an empty view
+				for (const auto& alias : typeAliases) // exact, case-sensitive match against the table
+				{
+					if (alias.name == typeName)
+						return alias.format;
+				}
+				if (typeName == "double" || typeName == "float64") // the 64-bit float names are checked outside the table
+					return EF_R64_SFLOAT;
+				return EF_UNKNOWN; // no alias matched
+			}
-			else
+			bool isList() const // True when this property has no scalar type but has integer count and item formats.
 			{
-				fillBuffer();
-				// reset line end pointer
-				LineEndPointer = StartPointer-1;
-				if (StartPointer!=EndPointer)
-					return getNextLine();
-				else
-					return StartPointer;
+				return type == EF_UNKNOWN && asset::isIntegerFormat(list.countType) && asset::isIntegerFormat(list.itemType); // all three conditions must hold
 			}
-		}
-		else
-		{
-			LineEndPointer = terminator-1;
-			WordLength = -1;
-			// return pointer to the start of the line
-			return StartPointer;
-		}
-	}
-	// null terminate the next word on the previous line and move the next word pointer along
-	// since we already have a full line in the buffer, we never need to retrieve more data
-	const char* getNextWord()
-	{
-		// move the start pointer along
-		StartPointer += WordLength + 1;
-		if (!*StartPointer)
-			getNextLine();
-
-		if (StartPointer==LineEndPointer)
-		{
-			WordLength = -1; //
-			return LineEndPointer;
-		}
-		// process the next word
-		{
-			assert(LineEndPointer<=EndPointer);
-			const std::array<const char,3> WhiteSpace = {'\0',' ','\t'};
-			auto wordEnd = std::find_first_of(StartPointer,LineEndPointer,WhiteSpace.begin(),WhiteSpace.end());
-			// null terminate the next word
-			if (wordEnd!=LineEndPointer)
-				*(wordEnd++) = '\0';
-			// find next word
-			auto notWhiteSpace = [WhiteSpace](const char c)->bool
+			void skip(Context& _ctx) const // Advances _ctx past one value of this property without storing it.
 			{
-				return std::find(WhiteSpace.begin(),WhiteSpace.end(),c)==WhiteSpace.end();
-			};
-			auto nextWord = std::find_if(wordEnd,LineEndPointer,notWhiteSpace);
-			WordLength = std::distance(StartPointer,nextWord)-1;
-		}
-		// return pointer to the start of current word
-		return StartPointer;
-	}
-	// skips x bytes in the file, getting more data if required
-	void moveForward(const size_t bytes)
-	{
-		assert(IsBinaryFile);
-		if (StartPointer+bytes>=EndPointer)
-			fillBuffer();
-
-		if (StartPointer+bytes<EndPointer)
-			StartPointer += bytes;
-		else
-			StartPointer = EndPointer;
-	}
-
-	// read the next int from the file and move the start pointer along
-	using widest_int_t = uint32_t;
-	widest_int_t getInt(const E_FORMAT f)
-	{
-		assert(!isFloatingPointFormat(f));
-		if (IsBinaryFile)
-		{
-			if (StartPointer+sizeof(widest_int_t)>EndPointer)
-				fillBuffer();
-
-			switch (getTexelOrBlockBytesize(f))
-			{
-				case 1:
-					if (StartPointer+sizeof(int8_t)>EndPointer)
-						break;
-					return *(StartPointer++);
-				case 2:
-				{
-					if (StartPointer+sizeof(int16_t)>EndPointer)
-						break;
-					auto retval = *(reinterpret_cast<int16_t*&>(StartPointer)++);
-					if (IsWrongEndian)
-						retval = byteswap(retval);
-					return retval;
-				}
-				case 4:
+				if (isList()) // list: read the count, then read and discard that many items
 				{
-					if (StartPointer+sizeof(int32_t)>EndPointer)
-						break;
-					auto retval = *(reinterpret_cast<int32_t*&>(StartPointer)++);
-					if (IsWrongEndian)
-						retval = byteswap(retval);
-					return retval;
+					int32_t count = _ctx.getInt(list.countType); // the list length is read with countType
+					for (decltype(count) i = 0; i < count; ++i)
+						_ctx.getInt(list.itemType); // each entry is read with itemType and dropped
 				}
-				default:
-					assert(false);
-					break;
+				else if (_ctx.IsBinaryFile)
+					_ctx.moveForward(getTexelOrBlockBytesize(type)); // binary scalar: step over its byte size
+				else
+					_ctx.getNextWord(); // text scalar: consume one word
 			}
-			return 0;
-		}
-		return std::atoi(getNextWord());
-	}
-	// read the next float from the file and move the start pointer along
-	hlsl::float64_t getFloat(const E_FORMAT f)
-	{
-		assert(isFloatingPointFormat(f));
-		if (IsBinaryFile)
+			std::string Name;
+			E_FORMAT type;
+			struct SListTypes
+			{
+				E_FORMAT countType;
+				E_FORMAT itemType;
+			} list;
+		};
+		struct SElement // One element declaration: its name, properties, instance count and optional fixed byte size.
 		{
-			if (StartPointer+sizeof(hlsl::float64_t)>EndPointer)
-				fillBuffer();
-
-			switch (getTexelOrBlockBytesize(f))
+			void skipElement(Context& _ctx) const // Advances _ctx past one instance of this element.
 			{
-				case 4:
+				if (_ctx.IsBinaryFile)
 				{
-					if (StartPointer+sizeof(hlsl::float32_t)>EndPointer)
-						break;
-					auto retval = *(reinterpret_cast<hlsl::float32_t*&>(StartPointer)++);
-					if (IsWrongEndian)
-						retval = byteswap(retval);
-					return retval;
-				}
-				case 8:
-				{
-					if (StartPointer+sizeof(hlsl::float64_t)>EndPointer)
-						break;
-					auto retval = *(reinterpret_cast<hlsl::float64_t*&>(StartPointer)++);
-					if (IsWrongEndian)
-						retval = byteswap(retval);
-					return retval;
+					if (KnownSize)
+						_ctx.moveForward(KnownSize); // non-zero known size: skip the instance in one step
+					else
+						for (auto i = 0u; i < Properties.size(); ++i) // unknown size: skip property by property
+							Properties[i].skip(_ctx);
 				}
-				default:
-					assert(false);
-					break;
+				else
+					_ctx.getNextLine(); // text file: consume one line
 			}
-			return 0;
-		}
-		return std::atoi(getNextWord());
-	}
-	// read the next thing from the file and move the start pointer along
-	void getData(void* dst, const E_FORMAT f)
-	{
-		const auto size = getTexelOrBlockBytesize(f);
-		if (StartPointer+size>EndPointer)
+			std::string Name; // name of the element. We only want "vertex" and "face" elements
+			// but we have to parse the others anyway.
+			core::vector<SProperty> Properties; // Properties of this element
+			size_t Count; // The number of elements in the file
+			uint32_t KnownSize; // known size in bytes, 0 if unknown
+		};
+		static constexpr size_t DefaultIoReadWindowBytes = 50ull << 10;
+		void init(size_t _ioReadWindowSize = DefaultIoReadWindowBytes) // Sizes the read buffer, resets the cursors and performs the first fillBuffer.
 		{
+			ioReadWindowSize = std::max<size_t>(_ioReadWindowSize, DefaultIoReadWindowBytes); // requests below the default window are raised to the default
+			Buffer.resize(ioReadWindowSize + ReadWindowPaddingBytes, '\0'); // read window plus padding; fillBuffer never reads into the padding
+			EndPointer = StartPointer = Buffer.data(); // empty window at the start of the buffer
+			LineEndPointer = EndPointer - 1; // one before the buffer start; presumably so the first line scan begins at Buffer.data() - confirm against getNextLine
 			fillBuffer();
-			if (StartPointer+size>EndPointer)
-				return;
 		}
-		if (IsWrongEndian)
-			std::reverse_copy(StartPointer,StartPointer+size,reinterpret_cast<char*>(dst));
-		else
-			memcpy(dst,StartPointer,size);
-		StartPointer += size;
-	}
-	struct SVertAttrIt
-	{
-		uint8_t* ptr;
-		uint32_t stride;
-		E_FORMAT dstFmt;
-	};
-	inline void readVertex(const IAssetLoader::SAssetLoadParams& _params, const SElement& el)
-	{
-		assert(el.Name=="vertex");
-		assert(el.Properties.size()==vertAttrIts.size());
-		if (!IsBinaryFile)
-			getNextLine();
-
-		for (size_t j=0; j<el.Count; ++j)
-		for (auto i=0u; i<vertAttrIts.size(); i++)
+		void fillBuffer() // gets more data from the file: moves unread bytes to the buffer start, then reads to fill the usable window
 		{
-			const auto& prop = el.Properties[i];
-			auto& it = vertAttrIts[i];
-			if (!it.ptr)
-			{
-				prop.skip(*this);
-				continue;
-			}
-			// conversion required? 
-			if (it.dstFmt!=prop.type)
+			if (EndOfFile)
+				return;
+			if (fileOffset >= inner.mainFile->getSize()) // nothing left in the file
 			{
-				assert(isIntegerFormat(it.dstFmt)==isIntegerFormat(prop.type));
-				if (isIntegerFormat(it.dstFmt))
-				{
-					uint64_t tmp = getInt(prop.type);
-					encodePixels(it.dstFmt,it.ptr,&tmp);
-				}
-				else
-				{
-					hlsl::float64_t tmp = getFloat(prop.type);
-					encodePixels(it.dstFmt,it.ptr,&tmp);
-				}
+				EndOfFile = true;
+				return;
 			}
-			else
-				getData(it.ptr,prop.type);
-			//
-			it.ptr += it.stride;
-		}
-	}
-	bool readFace(const SElement& Element, core::vector<uint32_t>& _outIndices)
-	{
-		if (!IsBinaryFile)
-			getNextLine();
-
-		for (const auto& prop : Element.Properties)
-		{
-			if (prop.isList() && (prop.Name=="vertex_indices" || prop.Name == "vertex_index"))
+			const auto length = std::distance(StartPointer, EndPointer); // unread bytes still in the buffer
+			auto newStart = Buffer.data();
+			// copy the remaining data to the start of the buffer
+			if (length && StartPointer != newStart)
+				memmove(newStart, StartPointer, length); // memmove because source and destination may overlap
+			// reset start position
+			StartPointer = newStart;
+			EndPointer = newStart + length;
+			const size_t usableBufferSize = Buffer.size() > 0ull ? Buffer.size() - ReadWindowPaddingBytes : 0ull; // the trailing padding bytes are never read into
+			if (usableBufferSize <= length) // no room left to read into: flagged as end of file
 			{
-				const uint32_t count = getInt(prop.list.countType);
-				//_NBL_DEBUG_BREAK_IF(count != 3)
-				const auto srcIndexFmt = prop.list.itemType;
-
-				_outIndices.push_back(getInt(srcIndexFmt));
-				_outIndices.push_back(getInt(srcIndexFmt));
-				_outIndices.push_back(getInt(srcIndexFmt));
-				// TODO: handle varying vertex count faces via variable vertex count geometry collections (PLY loader should be a Geometry Collection loader)
-				for (auto j=3u; j<count; ++j)
-				{
-					// this seems to be a triangle fan ?
-					_outIndices.push_back(_outIndices.front());
-					_outIndices.push_back(_outIndices.back());
-					_outIndices.push_back(getInt(srcIndexFmt));
-				}
+				EndOfFile = true;
+				return;
 			}
-			else if (prop.Name == "intensity")
+			const size_t requestSize = usableBufferSize - length; // fill whatever remains of the usable window
+			system::IFile::success_t success;
+			// read data from the file
+			inner.mainFile->read(success, EndPointer, fileOffset, requestSize);
+			const size_t bytesRead = success.getBytesProcessed();
+			++readCallCount; // read statistics: number of calls, total bytes, smallest read
+			readBytesTotal += bytesRead;
+			if (bytesRead < readMinBytes)
+				readMinBytes = bytesRead;
+			fileOffset += bytesRead;
+			EndPointer += bytesRead;
+			// if we didn't completely fill the buffer
+			if (bytesRead != requestSize)
 			{
-				// todo: face intensity
-				prop.skip(*this);
+				// cauterize the string
+				*EndPointer = 0; // stays in bounds because reads never extend into the padding byte(s)
+				EndOfFile = true;
 			}
-			else
-				prop.skip(*this);
 		}
-		return true;
-	}
-
-	IAssetLoader::SAssetLoadContext inner;
-	uint32_t topHierarchyLevel;
-	IAssetLoader::IAssetLoaderOverride* loaderOverride;
-	// input buffer must be at least twice as long as the longest line in the file
-	std::array<char,50<<10> Buffer; // 50kb seems sane to store a line
-	core::vector<SElement> ElementList = {};
-	char* StartPointer = nullptr, *EndPointer = nullptr, *LineEndPointer = nullptr;
-	int32_t LineLength = 0;
-	int32_t WordLength = -1; // this variable is a misnomer, its really the offset to next word minus one
-	bool IsBinaryFile = false, IsWrongEndian = false, EndOfFile = false;
-	size_t fileOffset = {};
-	//
-	core::vector<SVertAttrIt> vertAttrIts;
-};
-
-//! creates/loads an animated mesh from the file.
-SAssetBundle CPLYMeshFileLoader::loadAsset(system::IFile* _file, const IAssetLoader::SAssetLoadParams& _params, IAssetLoader::IAssetLoaderOverride* _override, uint32_t _hierarchyLevel)
-{
-	using namespace nbl::core;
-	if (!_file)
-		return {};
-
-	SContext ctx = {
-		asset::IAssetLoader::SAssetLoadContext{
-			_params,
-			_file
-		},
-		_hierarchyLevel,
-		_override
-	};
-	ctx.init();
-
-	// start with empty mesh
-    auto geometry = make_smart_refctd_ptr<ICPUPolygonGeometry>();
-	uint32_t vertCount=0;
-
-	// Currently only supports ASCII or binary meshes
-	if (strcmp(ctx.getNextLine(),"ply"))
-	{
-		_params.logger.log("Not a valid PLY file %s", system::ILogger::ELL_ERROR,ctx.inner.mainFile->getFileName().string().c_str());
-		return {};
-	}
-
-	// cut the next line out
-	ctx.getNextLine();
-	// grab the word from this line
-	const char* word = ctx.getNextWord();
-	// ignore comments
-	for (; strcmp(word,"comment")==0; ctx.getNextLine())
-		word = ctx.getNextWord();
-
-	bool readingHeader = true;
-	bool continueReading = true;
-	ctx.IsBinaryFile = false;
-	ctx.IsWrongEndian= false;
-
-	do
-	{
-		if (strcmp(word,"property") == 0)
+		std::string_view getNextLine() // Cuts the next text line out of the buffer in place (its terminator is overwritten with '\0', nothing is copied) and returns a view of it; an exhausted input yields an empty view.
 		{
-			word = ctx.getNextWord();
-
-			if (ctx.ElementList.empty())
+			// resume one past the terminator of the previously returned line
+			StartPointer = LineEndPointer + 1;
+			// a '\n' sitting right at the new start (e.g. the LF of a CRLF pair whose CR ended the previous line) is blanked and skipped
+			if (StartPointer < EndPointer && *StartPointer == '\n')
+				*(StartPointer++) = '\0';
+			const char* const lineStart = StartPointer;
+			// look for the first NUL, CR or LF in the buffered range [StartPointer, EndPointer)
+			const std::array<const char, 3> Terminators = {'\0', '\r', '\n'};
+			auto terminator = std::find_first_of(StartPointer, EndPointer, Terminators.begin(), Terminators.end());
+			if (terminator != EndPointer)
 			{
-				_params.logger.log("PLY property token found before element %s", system::ILogger::ELL_WARNING, word);
+				const char* const lineEnd = terminator;
+				*(terminator++) = '\0'; // terminate the line in place
+				LineEndPointer = terminator - 1; // remember where this line's terminator sits
+				WordLength = -1; // so the first getNextWord() on this line starts at its beginning
+				return std::string_view(lineStart, static_cast<size_t>(lineEnd - lineStart));
 			}
-			else
+			// no terminator inside the buffered range
+			if (terminator == EndPointer)
 			{
-				// get element
-				auto& el = ctx.ElementList.back();
-				
-				// fill property struct
-				auto& prop = el.Properties.emplace_back();
-				prop.type = prop.getType(word);
-				if (prop.type==EF_UNKNOWN)
+				if (EndOfFile) // NOTE(review): any unterminated text left in the buffer is dropped here, not returned - confirm intended
 				{
-					el.KnownSize = false;
-
-					word = ctx.getNextWord();
-
-					prop.list.countType = prop.getType(word);
-					if (ctx.IsBinaryFile && !isIntegerFormat(prop.list.countType))
-					{
-						_params.logger.log("Cannot read binary PLY file containing data types of unknown or non integer length %s", system::ILogger::ELL_WARNING, word);
-						continueReading = false;
-					}
-					else
-					{
-						word = ctx.getNextWord();
-						prop.list.itemType = prop.getType(word);
-						if (ctx.IsBinaryFile && !isIntegerFormat(prop.list.itemType))
-						{
-							_params.logger.log("Cannot read binary PLY file containing data types of unknown or non integer length %s", system::ILogger::ELL_ERROR, word);
-							continueReading = false;
-						}
-					}
+					StartPointer = EndPointer - 1;
+					*StartPointer = '\0'; // overwrites the last buffered byte with a terminator
+					return {};
 				}
-				else if (ctx.IsBinaryFile && prop.type==EF_UNKNOWN)
+				// pull more data in; fillBuffer() may move the unread bytes and therefore StartPointer
+				fillBuffer();
+				// re-anchor so the recursive call below starts again at StartPointer
+				LineEndPointer = StartPointer - 1;
+				return StartPointer != EndPointer ? getNextLine() : std::string_view{}; // retry on the refilled buffer unless it is empty
+			}
+			return {}; // not reached: both branches above return
+		}
+		const char* getNextWord() // Advances to the next whitespace-separated token, NUL-terminates it in place and returns a pointer to it; calls getNextLine() (which may refill the buffer) once the current line is used up.
+		{
+			// step over the previous token; WordLength is the offset to the next token start minus one (-1 right after a new line)
+			StartPointer += WordLength + 1;
+			if (StartPointer >= EndPointer)
+			{
+				if (EndOfFile)
 				{
-					_params.logger.log("Cannot read binary PLY file containing data types of unknown length %s", system::ILogger::ELL_ERROR, word);
-					continueReading = false;
+					WordLength = -1;
+					return EndPointer; // nothing left to hand out; NOTE(review): callers dereference this - presumably fillBuffer() left a '\0' there, confirm
 				}
-				else
-					el.KnownSize += getTexelOrBlockBytesize(prop.type);
-
-				prop.Name = ctx.getNextWord();
+				getNextLine();
+			}
+			if (StartPointer < EndPointer && !*StartPointer) // standing on a terminator: the current line is exhausted
+				getNextLine();
+			if (StartPointer >= LineEndPointer) // no token available on this line
+			{
+				WordLength = -1;
+				return StartPointer;
 			}
+			assert(LineEndPointer <= EndPointer);
+			// find where the current token stops (NUL, space or tab) within the current line
+			const std::array<const char, 3> WhiteSpace = {'\0', ' ', '\t'};
+			auto wordEnd = std::find_first_of(StartPointer, LineEndPointer, WhiteSpace.begin(), WhiteSpace.end());
+			// terminate the token in place unless it already runs up to the line terminator
+			if (wordEnd != LineEndPointer)
+				*(wordEnd++) = '\0';
+			// skip any further separators to locate the start of the following token
+			auto nextWord = std::find_if(wordEnd, LineEndPointer, [WhiteSpace](const char c) -> bool { return std::find(WhiteSpace.begin(), WhiteSpace.end(), c) == WhiteSpace.end(); });
+			WordLength = std::distance(StartPointer, nextWord) - 1; // consumed by the `StartPointer += WordLength + 1` of the next call
+			// hand back the start of the token just terminated
+			return StartPointer;
 		}
-		else if (strcmp(word,"element")==0)
+		size_t getAbsoluteOffset(const char* ptr) const // Converts a pointer into the current buffer window to a file offset, using fileOffset as the offset that corresponds to EndPointer.
 		{
-			auto& el = ctx.ElementList.emplace_back();
-			el.Name = ctx.getNextWord();
-			el.Count = atoi(ctx.getNextWord());
-			el.KnownSize = 0;
-			if (el.Name=="vertex")
-				vertCount = el.Count;
+			if (!ptr || ptr > EndPointer) // null or past the window end: fall back to the current file offset
+				return fileOffset;
+			const size_t trailingBytes = static_cast<size_t>(EndPointer - ptr); // bytes between ptr and the window end
+			return fileOffset >= trailingBytes ? (fileOffset - trailingBytes) : 0ull; // clamp at 0 instead of wrapping around
 		}
-		else if (strcmp(word,"comment")==0)
+		void useMappedBinaryWindow(const char* data, const size_t sizeBytes) // Points the reader at an externally provided byte range [data, data + sizeBytes) instead of the internal buffer; does nothing when data is null.
 		{
-			// ignore line
+			if (!data)
+				return;
+			StartPointer = const_cast<char*>(data); // NOTE(review): constness is cast away - presumably only read through afterwards, confirm against callers
+			EndPointer = StartPointer + sizeBytes;
+			LineEndPointer = StartPointer - 1; // same "one before start" anchor getNextLine() uses after a refill
+			WordLength = -1;
+			EndOfFile = true; // marks the input as fully available so end-of-file paths are taken instead of refills
+			fileOffset = inner.mainFile ? inner.mainFile->getSize() : fileOffset; // treat the whole file as consumed when a file is attached
 		}
-		// must be `format {binary_little_endian|binary_big_endian|ascii} 1.0`
-		else if (strcmp(word,"format") == 0)
+		void moveForward(const size_t bytes) // Skips `bytes` bytes of binary input: consumes buffered data first, jumps fileOffset directly for large skips, and refills the buffer for the rest; stops early if the input runs out.
 		{
-			word = ctx.getNextWord();
-
-			if (strcmp(word, "binary_little_endian") == 0)
-			{
-				ctx.IsBinaryFile = true;
-			}
-			else if (strcmp(word, "binary_big_endian") == 0)
-			{
-				ctx.IsBinaryFile = true;
-				ctx.IsWrongEndian = true;
-			}
-			else if (strcmp(word, "ascii")==0)
-			{
-			}
-			else
+			assert(IsBinaryFile);
+			size_t remaining = bytes;
+			if (remaining == 0ull)
+				return;
+			const size_t availableInitially = EndPointer > StartPointer ? static_cast<size_t>(EndPointer - StartPointer) : 0ull; // unread bytes currently buffered
+			if (remaining > availableInitially)
 			{
-				// abort if this isn't an ascii or a binary mesh
-				_params.logger.log("Unsupported PLY mesh format %s", system::ILogger::ELL_ERROR, word);
-				continueReading = false;
+				remaining -= availableInitially; // the skip swallows everything buffered
+				StartPointer = EndPointer;
+				if (remaining > ioReadWindowSize) // large skip: advance the file offset instead of reading the bytes
+				{
+					const size_t fileSize = inner.mainFile->getSize();
+					const size_t fileRemaining = fileSize > fileOffset ? (fileSize - fileOffset) : 0ull;
+					const size_t directSkip = std::min(remaining, fileRemaining); // never move past the end of the file
+					fileOffset += directSkip;
+					remaining -= directSkip;
+				}
 			}
-
-			if (continueReading)
+			while (remaining) // consume what is left through the buffer
 			{
-				word = ctx.getNextWord();
-				if (strcmp(word, "1.0"))
+				if (StartPointer >= EndPointer)
 				{
-					_params.logger.log("Unsupported PLY mesh version %s",system::ILogger::ELL_WARNING,word);
+					fillBuffer();
+					if (StartPointer >= EndPointer) // refill produced nothing: give up with the skip incomplete
+						return;
 				}
+				const size_t available = static_cast<size_t>(EndPointer - StartPointer);
+				const size_t step = std::min(available, remaining);
+				StartPointer += step;
+				remaining -= step;
 			}
 		}
-		else if (strcmp(word,"end_header")==0)
+		using widest_int_t = uint32_t;
+		const char* getCurrentWordEnd(const char* word) const // Returns the end pointer to use when parsing the token at `word`.
 		{
-			readingHeader = false;
-			if (ctx.IsBinaryFile)
-				ctx.StartPointer = ctx.LineEndPointer+1;
+			const size_t tokenLen = WordLength >= 0 ? static_cast<size_t>(WordLength + 1) : std::char_traits<char>::length(word); // WordLength + 1 is the distance to the next token start as set by getNextWord(), so it can reach past the token's NUL; otherwise measure up to the NUL
+			return word + tokenLen;
 		}
-		else
+		inline bool ensureBytes(const size_t bytes) // Returns true when at least `bytes` unread bytes are buffered at StartPointer, refilling the buffer once if needed.
 		{
-			_params.logger.log("Unknown item in PLY file %s", system::ILogger::ELL_WARNING, word);
+			if (StartPointer + bytes > EndPointer) // not enough buffered: try a single refill
+				fillBuffer();
+			return StartPointer + bytes <= EndPointer; // still short after the refill means the request cannot be served
 		}
-
-		if (readingHeader && continueReading)
+		template<typename T> // T: scalar type to read; sizeof(T) bytes are consumed on success
+		inline T loadBinaryScalar() // Reads one T from the binary stream at StartPointer and advances past it; returns T{} without advancing when not enough bytes are available.
 		{
-			ctx.getNextLine();
-			word = ctx.getNextWord();
+			if (!ensureBytes(sizeof(T)))
+				return T{};
+			const auto retval = Binary::loadUnaligned<T>(StartPointer, IsWrongEndian); // IsWrongEndian is passed through - presumably requests a byte swap, see Binary::loadUnaligned
+			StartPointer += sizeof(T);
+			return retval;
 		}
-	}
-	while (readingHeader && continueReading);
-
-	//
-	if (!continueReading)
-		return {};
-
-	// now to read the actual data from the file
-	using index_t = uint32_t;
-	core::vector<index_t> indices = {};
-
-	// loop through each of the elements
-	bool verticesProcessed = false;
-	for (uint32_t i=0; i<ctx.ElementList.size(); ++i)
-	{
-		auto& el = ctx.ElementList[i];
-		if (el.Name=="vertex") // TODO: are multiple of these possible in a file? do we create a geometry collection then? Probably not -> https://paulbourke.net/dataformats/ply/
+		template<typename T> // T: numeric type handed to Common::parseNumber
+		inline T parseCurrentWordValue() // Fetches the next token via getNextWord() (so it advances the reader) and parses it as T; returns T{} when there is no token or nothing could be parsed.
 		{
-			if (verticesProcessed)
-			{
-				_params.logger.log("Multiple `vertex` elements not supported!", system::ILogger::ELL_ERROR);
-				return {};
-			}
-			ICPUPolygonGeometry::SDataViewBase posView = {}, normalView = {};
-			for (auto& vertexProperty : el.Properties)
+			const char* word = getNextWord();
+			if (!word)
+				return T{};
+			const char* const wordEnd = getCurrentWordEnd(word);
+			if (word == wordEnd) // empty token
+				return T{};
+			T value = {};
+			auto ptr = word; // passed to parseNumber by name and compared afterwards, i.e. expected to be advanced over what was parsed
+			if (Common::parseNumber(ptr, wordEnd, value) && ptr == wordEnd) // parse succeeded and stopped exactly at wordEnd
+				return value;
+			return ptr != word ? value : T{}; // otherwise keep `value` if the cursor moved at all, else fall back to zero
+		}
+		widest_int_t getInt(const E_FORMAT f) // Reads the next integer of format `f` (binary or ASCII, per IsBinaryFile), advances the read position and returns it converted to widest_int_t; yields 0 when the value cannot be read.
+		{
+			assert(!isFloatingPointFormat(f));
+			if (IsBinaryFile)
 			{
-				const auto& propertyName = vertexProperty.Name;
-				// only positions and normals need to be structured/canonicalized in any way
-				auto negotiateFormat = [&vertexProperty](ICPUPolygonGeometry::SDataViewBase& view, const uint8_t component)->void
+				switch (getTexelOrBlockBytesize(f)) // dispatch on the byte width of the stored integer
 				{
-					assert(getFormatChannelCount(vertexProperty.type)!=0);
-					if (getTexelOrBlockBytesize(vertexProperty.type)>getTexelOrBlockBytesize(view.format))
-						view.format = vertexProperty.type;
-					view.stride = hlsl::max<uint32_t>(view.stride,component);
-				};
-				if (propertyName=="x")
-					negotiateFormat(posView,0);
-				else if (propertyName=="y")
-					negotiateFormat(posView,1);
-				else if (propertyName=="z")
-					negotiateFormat(posView,2);
-				else if (propertyName=="nx")
-					negotiateFormat(normalView,0);
-				else if (propertyName=="ny")
-					negotiateFormat(normalView,1);
-				else if (propertyName=="nz")
-					negotiateFormat(normalView,2);
-				else
-				{
-// TODO: record the `propertyName`
-					geometry->getAuxAttributeViews()->push_back(createView(vertexProperty.type,el.Count));
+					case 1:
+						if (ensureBytes(sizeof(int8_t)))
+							return isSignedFormat(f) ? static_cast<widest_int_t>(static_cast<int8_t>(*(StartPointer++))) : static_cast<widest_int_t>(static_cast<uint8_t>(*(StartPointer++))); // widen per the declared signedness, not per the platform's `char`, so unsigned bytes >= 128 stay intact
+						break;
+					case 2: return isSignedFormat(f) ? static_cast<widest_int_t>(loadBinaryScalar<int16_t>()) : static_cast<widest_int_t>(loadBinaryScalar<uint16_t>()); // same rule as the ASCII path: only signed formats may sign-extend
+					case 4: return static_cast<widest_int_t>(loadBinaryScalar<int32_t>()); // 32-bit: conversion to widest_int_t keeps the bit pattern for signed and unsigned alike
+					default: // other widths (e.g. 8-byte integers) are not supported here
+						assert(false);
+						break;
 				}
+				return 0u; // short read or unsupported width
 			}
-			auto setFinalFormat = [&ctx](ICPUPolygonGeometry::SDataViewBase& view)->void
+			return isSignedFormat(f) ? static_cast<widest_int_t>(parseCurrentWordValue<int64_t>()) : static_cast<widest_int_t>(parseCurrentWordValue<uint64_t>()); // ASCII: parse the next token wide, then narrow to widest_int_t
+		}
+		hlsl::float64_t getFloat(const E_FORMAT f) // Reads the next floating point value of format `f` (binary or ASCII, per IsBinaryFile), advances the read position and returns it as a 64-bit float.
+		{
+			assert(isFloatingPointFormat(f));
+			if (IsBinaryFile)
 			{
-				const auto componentFormat = view.format;
-				const auto componentCount = view.stride+1;
-				// turn single channel format to multiple
-				view.format = [=]()->E_FORMAT
+				switch (getTexelOrBlockBytesize(f)) // dispatch on the byte width of the stored value
 				{
-					switch (view.format)
-					{
-						case EF_R8_SINT:
-							switch (componentCount)
-							{
-								case 1:
-									return EF_R8_SINT;
-								case 2:
-									return EF_R8G8_SINT;
-								case 3:
-									return EF_R8G8B8_SINT;
-								case 4:
-									return EF_R8G8B8A8_SINT;
-								default:
-									break;
-							}
-							break;
-						case EF_R8_UINT:
-							switch (componentCount)
-							{
-								case 1:
-									return EF_R8_UINT;
-								case 2:
-									return EF_R8G8_UINT;
-								case 3:
-									return EF_R8G8B8_UINT;
-								case 4:
-									return EF_R8G8B8A8_UINT;
-								default:
-									break;
-							}
-							break;
-						case EF_R16_SINT:
-							switch (componentCount)
-							{
-								case 1:
-									return EF_R16_SINT;
-								case 2:
-									return EF_R16G16_SINT;
-								case 3:
-									return EF_R16G16B16_SINT;
-								case 4:
-									return EF_R16G16B16A16_SINT;
-								default:
-									break;
-							}
-							break;
-						case EF_R16_UINT:
-							switch (componentCount)
-							{
-								case 1:
-									return EF_R16_UINT;
-								case 2:
-									return EF_R16G16_UINT;
-								case 3:
-									return EF_R16G16B16_UINT;
-								case 4:
-									return EF_R16G16B16A16_UINT;
-								default:
-									break;
-							}
-							break;
-						case EF_R32_SINT:
-							switch (componentCount)
-							{
-								case 1:
-									return EF_R32_SINT;
-								case 2:
-									return EF_R32G32_SINT;
-								case 3:
-									return EF_R32G32B32_SINT;
-								case 4:
-									return EF_R32G32B32A32_SINT;
-								default:
-									break;
-							}
-							break;
-						case EF_R32_UINT:
-							switch (componentCount)
-							{
-								case 1:
-									return EF_R32_UINT;
-								case 2:
-									return EF_R32G32_UINT;
-								case 3:
-									return EF_R32G32B32_UINT;
-								case 4:
-									return EF_R32G32B32A32_UINT;
-								default:
-									break;
-							}
-							break;
-						case EF_R32_SFLOAT:
-							switch (componentCount)
-							{
-								case 1:
-									return EF_R32_SFLOAT;
-								case 2:
-									return EF_R32G32_SFLOAT;
-								case 3:
-									return EF_R32G32B32_SFLOAT;
-								case 4:
-									return EF_R32G32B32A32_SFLOAT;
-								default:
-									break;
-							}
-							break;
-						case EF_R64_SFLOAT:
-							switch (componentCount)
-							{
-								case 1:
-									return EF_R64_SFLOAT;
-								case 2:
-									return EF_R64G64_SFLOAT;
-								case 3:
-									return EF_R64G64B64_SFLOAT;
-								case 4:
-									return EF_R64G64B64A64_SFLOAT;
-								default:
-									break;
-							}
-							break;
-						default:
-							break;
-					}
-					return EF_UNKNOWN;
-				}();
-				view.stride = getTexelOrBlockBytesize(view.format);
-				//
-				for (auto c=0u; c<componentCount; c++)
-				{
-					size_t offset = getTexelOrBlockBytesize(componentFormat)*c;
-					ctx.vertAttrIts.push_back({
-						.ptr = reinterpret_cast<uint8_t*>(offset),
-						.stride = view.stride,
-						.dstFmt = componentFormat
-					});
+					case 4: return loadBinaryScalar<hlsl::float32_t>(); // 32-bit value, widened by the return type
+					case 8: return loadBinaryScalar<hlsl::float64_t>();
+					default: // any other width is not handled for floating point data
+						assert(false);
+						break;
 				}
-			};
-			if (posView.format!=EF_UNKNOWN)
-			{
-				auto beginIx = ctx.vertAttrIts.size();
-				setFinalFormat(posView);
-				auto view = createView(posView.format,el.Count);
-				for (const auto size=ctx.vertAttrIts.size(); beginIx!=size; beginIx++)
-					ctx.vertAttrIts[beginIx].ptr += ptrdiff_t(view.src.buffer->getPointer())+view.src.offset;
-				geometry->setPositionView(std::move(view));
-			}
-			if (normalView.format!=EF_UNKNOWN)
-			{
-				auto beginIx = ctx.vertAttrIts.size();
-				setFinalFormat(normalView);
-				auto view = createView(normalView.format,el.Count);
-				for (const auto size=ctx.vertAttrIts.size(); beginIx!=size; beginIx++)
-					ctx.vertAttrIts[beginIx].ptr += ptrdiff_t(view.src.buffer->getPointer())+view.src.offset;
-				geometry->setNormalView(std::move(view));
+				return 0.0; // unsupported width
 			}
-			//
-			for (auto& view : *geometry->getAuxAttributeViews())
-				ctx.vertAttrIts.push_back({
-					.ptr = reinterpret_cast<uint8_t*>(view.src.buffer->getPointer())+view.src.offset,
-					.stride = getTexelOrBlockBytesize(view.composed.format),
-					.dstFmt = view.composed.format
-				});
-			// loop through vertex properties
-			ctx.readVertex(_params,el);
-			verticesProcessed = true;
+			return parseCurrentWordValue<hlsl::float64_t>(); // ASCII: parse the next token as a 64-bit float
 		}
-		else if (el.Name=="face")
+		void getData(void* dst, const E_FORMAT f) // Copies the next value of format `f` as raw bytes from the stream into `dst` and advances past it; leaves `dst` untouched when not enough bytes are available.
 		{
-			for (size_t j=0; j<el.Count; ++j)
-				ctx.readFace(el,indices);
-		}
-		else
-		{
-			// skip these elements
-			for (size_t j=0; j<el.Count; ++j)
-				el.skipElement(ctx);
+			const auto size = getTexelOrBlockBytesize(f); // number of bytes to transfer
+			if (!ensureBytes(size))
+				return;
+			if (IsWrongEndian)
+				std::reverse_copy(StartPointer, StartPointer + size, reinterpret_cast<char*>(dst)); // byte-reversed copy of the whole value
+			else
+				memcpy(dst, StartPointer, size);
+			StartPointer += size;
 		}
-	}
-
-	// do before indices so we don't compute their stuff again
-	CPolygonGeometryManipulator::recomputeContentHashes(geometry.get());
-	CPolygonGeometryManipulator::recomputeRanges(geometry.get());
-
-	if (indices.empty())
-	{
-		// no index buffer means point cloud
-		geometry->setIndexing(IPolygonGeometryBase::PointList());
-	}
-	else
-	{
-		geometry->setIndexing(IPolygonGeometryBase::TriangleList());
-		auto view = IGeometryLoader::createView(EF_R32_UINT,indices.size(),indices.data());
-		geometry->setIndexView(std::move(view));
-	}
-
-	CPolygonGeometryManipulator::recomputeAABB(geometry.get());
-
-	auto meta = core::make_smart_refctd_ptr<CPLYMetadata>();
-	return SAssetBundle(std::move(meta),{std::move(geometry)});
+        struct SVertAttrIt { // Per-component output cursor for vertex attribute data.
+            uint8_t* ptr; // address the component is written to; the fast vertex path requires it to be non-null
+            uint32_t stride; // presumably the byte distance between consecutive vertices at `ptr` - confirm against the vertex reader
+            E_FORMAT dstFmt; // format of the destination component (the fast vertex path only accepts EF_R32_SFLOAT)
+        };
+        enum class EFastVertexReadResult : uint8_t { // Outcome of readVertexElementFast().
+            NotApplicable, // preconditions of the fast path are not met (e.g. not binary, not a "vertex" element, unsupported layout)
+            Success, // presumably: the element was fully read by the fast path - confirm at the function's end
+            Error // the fast path applied but could not complete (size overflow or not enough input data)
+        };
+        EFastVertexReadResult readVertexElementFast(
+            const SElement& el,
+            hlsl::shapes::util::AABBAccumulator3<float>* parsedAABB) {
+            if (!IsBinaryFile || el.Name != "vertex")
+                return EFastVertexReadResult::NotApplicable;
+            enum class ELayoutKind : uint8_t { XYZ, XYZ_N, XYZ_N_UV };
+            auto allF32 = [&el]()->bool {
+                for (const auto& prop : el.Properties)
+                    if (prop.type != EF_R32_SFLOAT)
+                        return false;
+                return true;
+            };
+            if (!allF32())
+                return EFastVertexReadResult::NotApplicable;
+            auto matchNames = [&el](std::initializer_list<const char*> names)->bool {
+                if (el.Properties.size() != names.size())
+                    return false;
+                size_t i = 0ull;
+                for (const auto* name : names)
+                {
+                    if (el.Properties[i].Name != name)
+                        return false;
+                    ++i;
+                }
+                return true;
+            };
+            ELayoutKind layout = ELayoutKind::XYZ;
+            if (matchNames({"x", "y", "z"}))
+                layout = ELayoutKind::XYZ;
+            else if (matchNames({"x", "y", "z", "nx", "ny", "nz"}))
+                layout = ELayoutKind::XYZ_N;
+            else if (matchNames({"x", "y", "z", "nx", "ny", "nz", "u", "v"}) ||
+                     matchNames({"x", "y", "z", "nx", "ny", "nz", "s", "t"}))
+                layout = ELayoutKind::XYZ_N_UV;
+            else
+                return EFastVertexReadResult::NotApplicable;
+            const size_t floatBytes = sizeof(hlsl::float32_t);
+            auto validateTuple = [&](const size_t beginIx, const size_t componentCount, uint32_t& outStride, uint8_t*& outBase)->bool {
+                if (beginIx + componentCount > vertAttrIts.size())
+                    return false;
+                auto& first = vertAttrIts[beginIx];
+                if (!first.ptr || first.dstFmt != EF_R32_SFLOAT)
+                    return false;
+                outStride = first.stride;
+                outBase = first.ptr;
+                for (size_t c = 1ull; c < componentCount; ++c)
+                {
+                    auto& it = vertAttrIts[beginIx + c];
+                    if (!it.ptr || it.dstFmt != EF_R32_SFLOAT)
+                        return false;
+                    if (it.stride != outStride)
+                        return false;
+                    if (it.ptr != outBase + c * floatBytes)
+                        return false;
+                }
+                return true;
+            };
+            uint32_t posStride = 0u, normalStride = 0u, uvStride = 0u;
+            uint8_t* posBase = nullptr;
+            uint8_t* normalBase = nullptr;
+            uint8_t* uvBase = nullptr;
+            switch (layout)
+            {
+                case ELayoutKind::XYZ:
+                    if (vertAttrIts.size() != 3u || !validateTuple(0u, 3u, posStride, posBase))
+                        return EFastVertexReadResult::NotApplicable;
+                    break;
+                case ELayoutKind::XYZ_N:
+                    if (vertAttrIts.size() != 6u)
+                        return EFastVertexReadResult::NotApplicable;
+                    if (!validateTuple(0u, 3u, posStride, posBase) || !validateTuple(3u, 3u, normalStride, normalBase))
+                        return EFastVertexReadResult::NotApplicable;
+                    break;
+                case ELayoutKind::XYZ_N_UV:
+                    if (vertAttrIts.size() != 8u)
+                        return EFastVertexReadResult::NotApplicable;
+                    if (!validateTuple(0u, 3u, posStride, posBase) || !validateTuple(3u, 3u, normalStride, normalBase) || !validateTuple(6u, 2u, uvStride, uvBase))
+                        return EFastVertexReadResult::NotApplicable;
+                    break;
+            }
+            const size_t srcBytesPerVertex = [layout]()->size_t {
+                switch (layout)
+                {
+                    case ELayoutKind::XYZ: return sizeof(hlsl::float32_t) * 3ull;
+                    case ELayoutKind::XYZ_N: return sizeof(hlsl::float32_t) * 6ull;
+                    case ELayoutKind::XYZ_N_UV: return sizeof(hlsl::float32_t) * 8ull;
+                    default: return 0ull;
+                }
+            }();
+            if (srcBytesPerVertex == 0ull || el.Count > (std::numeric_limits<size_t>::max() / srcBytesPerVertex))
+                return EFastVertexReadResult::Error;
+            const bool trackAABB = parsedAABB != nullptr;
+            const bool needsByteSwap = IsWrongEndian;
+            auto decodeF32 = [needsByteSwap](const uint8_t* src)->float {
+                uint32_t bits = 0u;
+                std::memcpy(&bits, src, sizeof(bits));
+                if (needsByteSwap)
+                    bits = Binary::byteswap(bits);
+                float value = 0.f;
+                std::memcpy(&value, &bits, sizeof(value));
+                return value;
+            };
+            size_t remainingVertices = el.Count;
+            while (remainingVertices > 0ull)
+            {
+                if (StartPointer + srcBytesPerVertex > EndPointer)
+                    fillBuffer();
+                const size_t available = EndPointer > StartPointer ? static_cast<size_t>(EndPointer - StartPointer) : 0ull;
+                if (available < srcBytesPerVertex)
+                    return EFastVertexReadResult::Error;
+                const size_t batchVertices = std::min(remainingVertices, available / srcBytesPerVertex);
+                const uint8_t* src = reinterpret_cast<const uint8_t*>(StartPointer);
+                switch (layout)
+                {
+                    case ELayoutKind::XYZ:
+                    {
+                        if (posStride == 3ull * floatBytes)
+                        {
+                            const size_t batchBytes = batchVertices * 3ull * floatBytes;
+                            if (trackAABB && batchVertices >= (1ull << 20))
+                            {
+                                const size_t hw = SLoaderRuntimeTuner::resolveHardwareThreads();
+                                const size_t hardMaxWorkers = SLoaderRuntimeTuner::resolveHardMaxWorkers(hw, inner.params.ioPolicy.runtimeTuning.workerHeadroom);
+                                SLoaderRuntimeTuningRequest vertexTuningRequest = {};
+                                vertexTuningRequest.inputBytes = batchBytes;
+                                vertexTuningRequest.totalWorkUnits = batchVertices;
+                                vertexTuningRequest.minBytesPerWorker = 3ull * floatBytes;
+                                vertexTuningRequest.hardwareThreads = static_cast<uint32_t>(hw);
+                                vertexTuningRequest.hardMaxWorkers = static_cast<uint32_t>(hardMaxWorkers);
+                                vertexTuningRequest.targetChunksPerWorker = inner.params.ioPolicy.runtimeTuning.targetChunksPerWorker;
+                                vertexTuningRequest.sampleData = reinterpret_cast<const uint8_t*>(src);
+                                vertexTuningRequest.sampleBytes = SLoaderRuntimeTuner::resolveSampleBytes(inner.params.ioPolicy, batchBytes);
+                                const auto vertexTuning = SLoaderRuntimeTuner::tune(inner.params.ioPolicy, vertexTuningRequest);
+                                const size_t workerCount = std::min(vertexTuning.workerCount, batchVertices);
+                                if (workerCount > 1ull)
+                                {
+                                    struct SAABBRange { float minX = std::numeric_limits<float>::max(); float minY = std::numeric_limits<float>::max(); float minZ = std::numeric_limits<float>::max(); float maxX = std::numeric_limits<float>::lowest(); float maxY = std::numeric_limits<float>::lowest(); float maxZ = std::numeric_limits<float>::lowest(); };
+                                    std::vector<SAABBRange> workerRanges(workerCount);
+                                    uint8_t* dstBase = posBase;
+                                    SLoaderRuntimeTuner::dispatchWorkers(workerCount, [&](const size_t workerIx) {
+                                        const size_t begin = (batchVertices * workerIx) / workerCount;
+                                        const size_t end = (batchVertices * (workerIx + 1ull)) / workerCount;
+                                        const size_t count = end - begin;
+                                        if (count == 0ull)
+                                            return;
+                                        auto& range = workerRanges[workerIx];
+                                        const uint8_t* inBytes = src + begin * 3ull * floatBytes;
+                                        float* outFloats = reinterpret_cast<float*>(dstBase + begin * 3ull * floatBytes);
+                                        if (!needsByteSwap)
+                                        {
+                                            std::memcpy(outFloats, inBytes, count * 3ull * floatBytes);
+                                            const float* xyz = reinterpret_cast<const float*>(inBytes);
+                                            for (size_t v = 0ull; v < count; ++v)
+                                            {
+                                                const float x = xyz[v * 3ull + 0ull];
+                                                const float y = xyz[v * 3ull + 1ull];
+                                                const float z = xyz[v * 3ull + 2ull];
+                                                if (x < range.minX) range.minX = x;
+                                                if (y < range.minY) range.minY = y;
+                                                if (z < range.minZ) range.minZ = z;
+                                                if (x > range.maxX) range.maxX = x;
+                                                if (y > range.maxY) range.maxY = y;
+                                                if (z > range.maxZ) range.maxZ = z;
+                                            }
+                                        }
+                                        else
+                                        {
+                                            for (size_t v = 0ull; v < count; ++v)
+                                            {
+                                                uint32_t xb = 0u, yb = 0u, zb = 0u;
+                                                std::memcpy(&xb, inBytes + 0ull * floatBytes, sizeof(xb));
+                                                std::memcpy(&yb, inBytes + 1ull * floatBytes, sizeof(yb));
+                                                std::memcpy(&zb, inBytes + 2ull * floatBytes, sizeof(zb));
+                                                xb = Binary::byteswap(xb);
+                                                yb = Binary::byteswap(yb);
+                                                zb = Binary::byteswap(zb);
+                                                float x = 0.f, y = 0.f, z = 0.f;
+                                                std::memcpy(&x, &xb, sizeof(x));
+                                                std::memcpy(&y, &yb, sizeof(y));
+                                                std::memcpy(&z, &zb, sizeof(z));
+                                                outFloats[0] = x;
+                                                outFloats[1] = y;
+                                                outFloats[2] = z;
+                                                if (x < range.minX) range.minX = x;
+                                                if (y < range.minY) range.minY = y;
+                                                if (z < range.minZ) range.minZ = z;
+                                                if (x > range.maxX) range.maxX = x;
+                                                if (y > range.maxY) range.maxY = y;
+                                                if (z > range.maxZ) range.maxZ = z;
+                                                inBytes += 3ull * floatBytes;
+                                                outFloats += 3ull;
+                                            }
+                                        }
+                                    });
+                                    auto& aabb = parsedAABB->value;
+                                    for (const auto& range : workerRanges)
+                                    {
+                                        if (range.minX < aabb.minVx.x) aabb.minVx.x = range.minX;
+                                        if (range.minY < aabb.minVx.y) aabb.minVx.y = range.minY;
+                                        if (range.minZ < aabb.minVx.z) aabb.minVx.z = range.minZ;
+                                        if (range.maxX > aabb.maxVx.x) aabb.maxVx.x = range.maxX;
+                                        if (range.maxY > aabb.maxVx.y) aabb.maxVx.y = range.maxY;
+                                        if (range.maxZ > aabb.maxVx.z) aabb.maxVx.z = range.maxZ;
+                                    }
+                                    src += batchBytes;
+                                    posBase += batchBytes;
+                                    break;
+                                }
+                            }
+                            if (!needsByteSwap)
+                            {
+                                std::memcpy(posBase, src, batchBytes);
+                                if (trackAABB)
+                                {
+                                    const float* xyz = reinterpret_cast<const float*>(src);
+                                    auto& aabb = parsedAABB->value;
+                                    for (size_t v = 0ull; v < batchVertices; ++v)
+                                    {
+                                        const float x = xyz[v * 3ull + 0ull];
+                                        const float y = xyz[v * 3ull + 1ull];
+                                        const float z = xyz[v * 3ull + 2ull];
+                                        if (x < aabb.minVx.x) aabb.minVx.x = x;
+                                        if (y < aabb.minVx.y) aabb.minVx.y = y;
+                                        if (z < aabb.minVx.z) aabb.minVx.z = z;
+                                        if (x > aabb.maxVx.x) aabb.maxVx.x = x;
+                                        if (y > aabb.maxVx.y) aabb.maxVx.y = y;
+                                        if (z > aabb.maxVx.z) aabb.maxVx.z = z;
+                                    }
+                                }
+                                src += batchBytes;
+                                posBase += batchBytes;
+                            }
+                            else
+                            {
+                                for (size_t v = 0ull; v < batchVertices; ++v)
+                                {
+                                    const float x = decodeF32(src + 0ull * floatBytes);
+                                    const float y = decodeF32(src + 1ull * floatBytes);
+                                    const float z = decodeF32(src + 2ull * floatBytes);
+                                    reinterpret_cast<float*>(posBase)[0] = x;
+                                    reinterpret_cast<float*>(posBase)[1] = y;
+                                    reinterpret_cast<float*>(posBase)[2] = z;
+                                    if (trackAABB)
+                                        hlsl::shapes::util::extendAABBAccumulator(*parsedAABB, x, y, z);
+                                    src += 3ull * floatBytes;
+                                    posBase += posStride;
+                                }
+                            }
+                        }
+                        else
+                        {
+                            for (size_t v = 0ull; v < batchVertices; ++v)
+                            {
+                                const float x = decodeF32(src + 0ull * floatBytes);
+                                const float y = decodeF32(src + 1ull * floatBytes);
+                                const float z = decodeF32(src + 2ull * floatBytes);
+                                reinterpret_cast<float*>(posBase)[0] = x;
+                                reinterpret_cast<float*>(posBase)[1] = y;
+                                reinterpret_cast<float*>(posBase)[2] = z;
+                                if (trackAABB)
+                                    hlsl::shapes::util::extendAABBAccumulator(*parsedAABB, x, y, z);
+                                src += 3ull * floatBytes;
+                                posBase += posStride;
+                            }
+                        }
+                    }
+                    break;
+                    case ELayoutKind::XYZ_N:
+                    {
+                        for (size_t v = 0ull; v < batchVertices; ++v)
+                        {
+                            const float x = decodeF32(src + 0ull * floatBytes);
+                            const float y = decodeF32(src + 1ull * floatBytes);
+                            const float z = decodeF32(src + 2ull * floatBytes);
+                            reinterpret_cast<float*>(posBase)[0] = x;
+                            reinterpret_cast<float*>(posBase)[1] = y;
+                            reinterpret_cast<float*>(posBase)[2] = z;
+                            if (trackAABB)
+                                hlsl::shapes::util::extendAABBAccumulator(*parsedAABB, hlsl::float32_t3(x, y, z));
+                            src += 3ull * floatBytes;
+                            posBase += posStride;
+                            reinterpret_cast<float*>(normalBase)[0] = decodeF32(src + 0ull * floatBytes);
+                            reinterpret_cast<float*>(normalBase)[1] = decodeF32(src + 1ull * floatBytes);
+                            reinterpret_cast<float*>(normalBase)[2] = decodeF32(src + 2ull * floatBytes);
+                            src += 3ull * floatBytes;
+                            normalBase += normalStride;
+                        }
+                    }
+                    break;
+                    case ELayoutKind::XYZ_N_UV:
+                    {
+                        for (size_t v = 0ull; v < batchVertices; ++v)
+                        {
+                            const float x = decodeF32(src + 0ull * floatBytes);
+                            const float y = decodeF32(src + 1ull * floatBytes);
+                            const float z = decodeF32(src + 2ull * floatBytes);
+                            reinterpret_cast<float*>(posBase)[0] = x;
+                            reinterpret_cast<float*>(posBase)[1] = y;
+                            reinterpret_cast<float*>(posBase)[2] = z;
+                            if (trackAABB)
+                                hlsl::shapes::util::extendAABBAccumulator(*parsedAABB, hlsl::float32_t3(x, y, z));
+                            src += 3ull * floatBytes;
+                            posBase += posStride;
+                            reinterpret_cast<float*>(normalBase)[0] = decodeF32(src + 0ull * floatBytes);
+                            reinterpret_cast<float*>(normalBase)[1] = decodeF32(src + 1ull * floatBytes);
+                            reinterpret_cast<float*>(normalBase)[2] = decodeF32(src + 2ull * floatBytes);
+                            src += 3ull * floatBytes;
+                            normalBase += normalStride;
+                            reinterpret_cast<float*>(uvBase)[0] = decodeF32(src + 0ull * floatBytes);
+                            reinterpret_cast<float*>(uvBase)[1] = decodeF32(src + 1ull * floatBytes);
+                            src += 2ull * floatBytes;
+                            uvBase += uvStride;
+                        }
+                    }
+                }
+                const size_t consumed = batchVertices * srcBytesPerVertex;
+                StartPointer += consumed;
+                remainingVertices -= batchVertices;
+            }
+            const size_t posAdvance = el.Count * posStride;
+            vertAttrIts[0].ptr += posAdvance;
+            vertAttrIts[1].ptr += posAdvance;
+            vertAttrIts[2].ptr += posAdvance;
+            if (layout == ELayoutKind::XYZ_N || layout == ELayoutKind::XYZ_N_UV)
+            {
+                const size_t normalAdvance = el.Count * normalStride;
+                vertAttrIts[3].ptr += normalAdvance;
+                vertAttrIts[4].ptr += normalAdvance;
+                vertAttrIts[5].ptr += normalAdvance;
+            }
+            if (layout == ELayoutKind::XYZ_N_UV)
+            {
+                const size_t uvAdvance = el.Count * uvStride;
+                vertAttrIts[6].ptr += uvAdvance;
+                vertAttrIts[7].ptr += uvAdvance;
+            }
+            return EFastVertexReadResult::Success;
+        }
+        // Reads all `el.Count` records of the "vertex" element, property by property.
+        //
+        // Property `i` of the element is paired with `vertAttrIts[i]`:
+        //  - a null destination pointer means the property is not wanted and is skipped,
+        //  - ASCII input is always parsed as text and re-encoded into the destination format,
+        //  - binary input is copied with `getData` when source and destination formats match,
+        //    otherwise it is parsed and re-encoded.
+        // After each stored value the destination pointer advances by its stride.
+        // `_params` is not referenced by this function.
+        void readVertex(const IAssetLoader::SAssetLoadParams& _params,
+                        const SElement& el) {
+            assert(el.Name == "vertex");
+            assert(el.Properties.size() == vertAttrIts.size());
+            if (!IsBinaryFile)
+                getNextLine();
+
+            // Parses one value of `srcType` and writes it to `attr.ptr` in `attr.dstFmt`.
+            // Integers travel through a uint64_t, everything else through a float64_t.
+            const auto parseAndEncode = [&](auto& attr, const auto srcType, const bool asInteger) {
+                if (asInteger) {
+                    uint64_t parsed = getInt(srcType);
+                    encodePixels(attr.dstFmt, attr.ptr, &parsed);
+                } else {
+                    hlsl::float64_t parsed = getFloat(srcType);
+                    encodePixels(attr.dstFmt, attr.ptr, &parsed);
+                }
+            };
+
+            for (size_t vertexIx = 0; vertexIx < el.Count; ++vertexIx) {
+                for (size_t propIx = 0; propIx < vertAttrIts.size(); ++propIx) {
+                    const auto& prop = el.Properties[propIx];
+                    auto& attr = vertAttrIts[propIx];
+                    if (!attr.ptr) {
+                        // Nobody asked for this property: consume it from the stream and move on.
+                        prop.skip(*this);
+                        continue;
+                    }
+                    if (!IsBinaryFile) {
+                        // Text input: the integer/float decision follows the source property type.
+                        parseAndEncode(attr, prop.type, isIntegerFormat(prop.type));
+                    } else if (attr.dstFmt == prop.type) {
+                        // Binary input already in the destination format: plain copy.
+                        getData(attr.ptr, prop.type);
+                    } else {
+                        // Binary input needing conversion: the decision follows the destination
+                        // format, which must agree with the source on integer-ness.
+                        assert(isIntegerFormat(attr.dstFmt) == isIntegerFormat(prop.type));
+                        parseAndEncode(attr, prop.type, isIntegerFormat(attr.dstFmt));
+                    }
+                    attr.ptr += attr.stride;
+                }
+            }
+        }
+        // Reads one record of a "face" element and appends its triangles to `_outIndices`.
+        //
+        // Every list property named "vertex_indices" or "vertex_index" is treated as a polygon
+        // and fan-triangulated around its first index: (i0, i1, i2), (i0, i2, i3), ...
+        // Lists with fewer than three entries are consumed and produce no triangles.
+        // All other properties are skipped.
+        //
+        // Element      face element whose properties describe the record layout
+        // _outIndices  receives three indices per emitted triangle
+        // _maxIndex    raised to the largest index seen, only when `vertexCount` is 0
+        // vertexCount  when non-zero, every index must be strictly smaller than this value
+        //
+        // Returns false as soon as an index fails the `vertexCount` check (triangles emitted
+        // before the offending index stay in `_outIndices`), true otherwise.
+        bool readFace(const SElement& Element, core::vector<uint32_t>& _outIndices,
+                      uint32_t& _maxIndex, const uint32_t vertexCount) {
+            if (!IsBinaryFile)
+                getNextLine();
+            const bool hasVertexCount = vertexCount != 0u;
+            for (const auto& prop : Element.Properties) {
+                const bool isIndexList =
+                    prop.isList() &&
+                    (prop.Name == "vertex_indices" || prop.Name == "vertex_index");
+                if (!isIndexList) {
+                    // todo: face intensity
+                    prop.skip(*this);
+                    continue;
+                }
+                const uint32_t count = getInt(prop.list.countType);
+                const auto srcIndexFmt = prop.list.itemType;
+                if (count < 3u) {
+                    // Degenerate polygon: consume its indices, emit nothing.
+                    for (uint32_t j = 0u; j < count; ++j)
+                        getInt(srcIndexFmt);
+                    continue;
+                }
+                // Deliberately no `_outIndices.reserve(size + n)` per face: an exact-size
+                // reserve on every polygon makes the capacity grow linearly, so the buffer is
+                // reallocated and copied once per face. push_back's own growth is amortized
+                // O(1). It also avoids sizing an allocation from a count that comes straight
+                // from the file before a single index has been validated.
+                auto emitFan = [&_outIndices, &_maxIndex, hasVertexCount,
+                                vertexCount](auto&& readIndex,
+                                             const uint32_t faceVertexCount) -> bool {
+                    const uint32_t i0 = readIndex();
+                    const uint32_t i1 = readIndex();
+                    const uint32_t i2 = readIndex();
+                    if (hasVertexCount) {
+                        if (i0 >= vertexCount || i1 >= vertexCount || i2 >= vertexCount)
+                            return false;
+                    } else {
+                        _maxIndex = std::max(_maxIndex, std::max(i0, std::max(i1, i2)));
+                    }
+                    _outIndices.push_back(i0);
+                    _outIndices.push_back(i1);
+                    _outIndices.push_back(i2);
+                    uint32_t prev = i2;
+                    for (uint32_t j = 3u; j < faceVertexCount; ++j) {
+                        const uint32_t idx = readIndex();
+                        if (hasVertexCount) {
+                            if (idx >= vertexCount)
+                                return false;
+                        } else {
+                            _maxIndex = std::max(_maxIndex, idx);
+                        }
+                        _outIndices.push_back(i0);
+                        _outIndices.push_back(prev);
+                        _outIndices.push_back(idx);
+                        prev = idx;
+                    }
+                    return true;
+                };
+                // Result of whichever reader ends up emitting the fan.
+                bool fanOk = true;
+                // Reads the indices straight out of the buffer as native-endian `T`.
+                // Returns false only when the path could not be taken (not enough bytes);
+                // once it ran, the verdict is in `fanOk` and the caller must NOT re-parse the
+                // same bytes through getInt(), which would append the same triangles again
+                // before failing on the same index.
+                auto tryReadContiguousFan = [&]<typename T>() -> bool {
+                    const size_t bytesNeeded = static_cast<size_t>(count) * sizeof(T);
+                    if (!ensureBytes(bytesNeeded))
+                        return false;
+                    const uint8_t* ptr = reinterpret_cast<const uint8_t*>(StartPointer);
+                    auto readIndex = [&ptr]() -> uint32_t {
+                        T v = {};
+                        std::memcpy(&v, ptr, sizeof(v));
+                        ptr += sizeof(v);
+                        return static_cast<uint32_t>(v);
+                    };
+                    fanOk = emitFan(readIndex, count);
+                    if (fanOk)
+                        StartPointer = reinterpret_cast<char*>(const_cast<uint8_t*>(ptr));
+                    return true;
+                };
+                const bool canReadRaw = IsBinaryFile && !IsWrongEndian;
+                bool handled = false;
+                if (canReadRaw && srcIndexFmt == EF_R32_UINT)
+                    handled = tryReadContiguousFan.template operator()<uint32_t>();
+                else if (canReadRaw && srcIndexFmt == EF_R16_UINT)
+                    handled = tryReadContiguousFan.template operator()<uint16_t>();
+                if (!handled) {
+                    // Generic path: one getInt() per index (text, swapped or other formats).
+                    auto readIndex = [&]() -> uint32_t {
+                        return static_cast<uint32_t>(getInt(srcIndexFmt));
+                    };
+                    fanOk = emitFan(readIndex, count);
+                }
+                if (!fanOk)
+                    return false;
+            }
+            return true;
+        }
+        // Outcome reported by readFaceElementFast.
+        enum class EFastFaceReadResult : uint8_t
+        {
+            // The fast path does not cover this input (text file, unsupported property
+            // layout or index format, or a face that is not a triangle).
+            // NOTE(review): presumably the caller then falls back to readFace - confirm.
+            NotApplicable,
+            // The whole face element was consumed and its triangles were appended.
+            Success,
+            // The data is unusable: a size computation would overflow or an index is
+            // negative / out of range.
+            Error
+        };
+        EFastFaceReadResult readFaceElementFast(
+            const SElement& element, core::vector<uint32_t>& _outIndices,
+            uint32_t& _maxIndex, uint64_t& _faceCount, const uint32_t vertexCount,
+            const bool computeIndexHash, core::blake3_hash_t& outIndexHash) {
+            if (!IsBinaryFile)
+                return EFastFaceReadResult::NotApplicable;
+            if (element.Properties.size() != 1u)
+                return EFastFaceReadResult::NotApplicable;
+            const auto& prop = element.Properties[0];
+            if (!prop.isList() ||
+                (prop.Name != "vertex_indices" && prop.Name != "vertex_index"))
+                return EFastFaceReadResult::NotApplicable;
+            if (prop.list.countType != EF_R8_UINT)
+                return EFastFaceReadResult::NotApplicable;
+            const E_FORMAT srcIndexFmt = prop.list.itemType;
+            const bool isSrcU32 = srcIndexFmt == EF_R32_UINT;
+            const bool isSrcS32 = srcIndexFmt == EF_R32_SINT;
+            const bool isSrcU16 = srcIndexFmt == EF_R16_UINT;
+            const bool isSrcS16 = srcIndexFmt == EF_R16_SINT;
+            if (!isSrcU32 && !isSrcS32 && !isSrcU16 && !isSrcS16)
+                return EFastFaceReadResult::NotApplicable;
+            const bool is32Bit = isSrcU32 || isSrcS32;
+            const bool needEndianSwap = IsWrongEndian;
+            const size_t indexSize = is32Bit ? sizeof(uint32_t) : sizeof(uint16_t);
+            const bool hasVertexCount = vertexCount != 0u;
+            const bool trackMaxIndex = !hasVertexCount;
+            outIndexHash = IPreHashed::INVALID_HASH;
+            const size_t minTriangleRecordSize = sizeof(uint8_t) + indexSize * 3u;
+            if (element.Count >
+                (std::numeric_limits<size_t>::max() / minTriangleRecordSize))
+                return EFastFaceReadResult::Error;
+            const size_t minBytesNeeded = element.Count * minTriangleRecordSize;
+            if (StartPointer + minBytesNeeded <= EndPointer) {
+                if (element.Count > (std::numeric_limits<size_t>::max() / 3u))
+                    return EFastFaceReadResult::Error;
+                const size_t triIndices = element.Count * 3u;
+                if (_outIndices.size() >
+                    (std::numeric_limits<size_t>::max() - triIndices))
+                    return EFastFaceReadResult::Error;
+                const size_t oldSize = _outIndices.size();
+                const uint32_t oldMaxIndex = _maxIndex;
+                _outIndices.resize(oldSize + triIndices);
+                uint32_t* out = _outIndices.data() + oldSize;
+                const uint8_t* ptr = reinterpret_cast<const uint8_t*>(StartPointer);
+                auto readU32 = [needEndianSwap](const uint8_t* src) -> uint32_t {
+                    uint32_t value = 0u;
+                    std::memcpy(&value, src, sizeof(value));
+                    if (needEndianSwap)
+                        value = Binary::byteswap(value);
+                    return value;
+                };
+                auto readU16 = [needEndianSwap](const uint8_t* src) -> uint16_t {
+                    uint16_t value = 0u;
+                    std::memcpy(&value, src, sizeof(value));
+                    if (needEndianSwap)
+                        value = Binary::byteswap(value);
+                    return value;
+                };
+                if (is32Bit) {
+                    const size_t hw = SLoaderRuntimeTuner::resolveHardwareThreads();
+                    const size_t hardMaxWorkers =
+                        SLoaderRuntimeTuner::resolveHardMaxWorkers(
+                            hw, inner.params.ioPolicy.runtimeTuning.workerHeadroom);
+                    const size_t recordBytes = sizeof(uint8_t) + 3ull * sizeof(uint32_t);
+                    SLoaderRuntimeTuningRequest faceTuningRequest = {};
+                    faceTuningRequest.inputBytes = minBytesNeeded;
+                    faceTuningRequest.totalWorkUnits = element.Count;
+                    faceTuningRequest.minBytesPerWorker = recordBytes;
+                    faceTuningRequest.hardwareThreads = static_cast<uint32_t>(hw);
+                    faceTuningRequest.hardMaxWorkers =
+                        static_cast<uint32_t>(hardMaxWorkers);
+                    faceTuningRequest.targetChunksPerWorker =
+                        inner.params.ioPolicy.runtimeTuning.targetChunksPerWorker;
+                    faceTuningRequest.sampleData = ptr;
+                    faceTuningRequest.sampleBytes =
+                        SLoaderRuntimeTuner::resolveSampleBytes(inner.params.ioPolicy,
+                                                                minBytesNeeded);
+                    const auto faceTuning = SLoaderRuntimeTuner::tune(
+                        inner.params.ioPolicy, faceTuningRequest);
+                    size_t workerCount = std::min(faceTuning.workerCount, element.Count);
+                    if (workerCount > 1ull) {
+                        const bool needMax = trackMaxIndex;
+                        const bool validateAgainstVertexCount = hasVertexCount;
+                        std::vector<uint8_t> workerNonTriangle(workerCount, 0u);
+                        std::vector<uint8_t> workerInvalid(workerCount, 0u);
+                        std::vector<uint32_t> workerMax(needMax ? workerCount : 0ull, 0u);
+                        const bool hashInParsePipeline = computeIndexHash;
+                        std::vector<uint8_t> workerReady(
+                            hashInParsePipeline ? workerCount : 0ull, 0u);
+                        std::vector<uint8_t> workerHashable(
+                            hashInParsePipeline ? workerCount : 0ull, 1u);
+                        std::atomic_bool hashPipelineOk = true;
+                        core::blake3_hash_t parsedIndexHash = IPreHashed::INVALID_HASH;
+                        std::jthread hashThread;
+                        if (hashInParsePipeline) {
+                            hashThread = std::jthread([&]() {
+                                try {
+                                    core::blake3_hasher hasher;
+                                    for (size_t workerIx = 0ull; workerIx < workerCount;
+                                         ++workerIx) {
+                                        auto ready =
+                                            std::atomic_ref<uint8_t>(workerReady[workerIx]);
+                                        while (ready.load(std::memory_order_acquire) == 0u)
+                                            ready.wait(0u, std::memory_order_acquire);
+                                        if (workerHashable[workerIx] == 0u) {
+                                            hashPipelineOk.store(false, std::memory_order_relaxed);
+                                            return;
+                                        }
+                                        const size_t begin =
+                                            (element.Count * workerIx) / workerCount;
+                                        const size_t end =
+                                            (element.Count * (workerIx + 1ull)) / workerCount;
+                                        const size_t faceCount = end - begin;
+                                        hasher.update(out + begin * 3ull,
+                                                      faceCount * 3ull * sizeof(uint32_t));
+                                    }
+                                    parsedIndexHash = static_cast<core::blake3_hash_t>(hasher);
+                                } catch (...) {
+                                    hashPipelineOk.store(false, std::memory_order_relaxed);
+                                }
+                            });
+                        }
+                        auto parseChunk = [&](const size_t workerIx, const size_t beginFace,
+                                              const size_t endFace) -> void {
+                            const uint8_t* in = ptr + beginFace * recordBytes;
+                            uint32_t* outLocal = out + beginFace * 3ull;
+                            uint32_t localMax = 0u;
+                            for (size_t faceIx = beginFace; faceIx < endFace; ++faceIx) {
+                                if (*in != 3u) {
+                                    workerNonTriangle[workerIx] = 1u;
+                                    if (hashInParsePipeline)
+                                        workerHashable[workerIx] = 0u;
+                                    break;
+                                }
+                                ++in;
+                                const uint32_t i0 = readU32(in + 0ull * sizeof(uint32_t));
+                                const uint32_t i1 = readU32(in + 1ull * sizeof(uint32_t));
+                                const uint32_t i2 = readU32(in + 2ull * sizeof(uint32_t));
+                                outLocal[0] = i0;
+                                outLocal[1] = i1;
+                                outLocal[2] = i2;
+                                const uint32_t triOr = i0 | i1 | i2;
+                                if (isSrcS32 && (triOr & 0x80000000u)) {
+                                    workerInvalid[workerIx] = 1u;
+                                    if (hashInParsePipeline)
+                                        workerHashable[workerIx] = 0u;
+                                    break;
+                                }
+                                if (validateAgainstVertexCount) {
+                                    if (i0 >= vertexCount || i1 >= vertexCount || i2 >= vertexCount) {
+                                        workerInvalid[workerIx] = 1u;
+                                        if (hashInParsePipeline)
+                                            workerHashable[workerIx] = 0u;
+                                        break;
+                                    }
+                                } else if (needMax) {
+                                    if (i0 > localMax) localMax = i0;
+                                    if (i1 > localMax) localMax = i1;
+                                    if (i2 > localMax) localMax = i2;
+                                }
+                                in += 3ull * sizeof(uint32_t);
+                                outLocal += 3ull;
+                            }
+                            if (needMax)
+                                workerMax[workerIx] = localMax;
+                            if (hashInParsePipeline) {
+                                auto ready = std::atomic_ref<uint8_t>(workerReady[workerIx]);
+                                ready.store(1u, std::memory_order_release);
+                                ready.notify_one();
+                            }
+                        };
+                        SLoaderRuntimeTuner::dispatchWorkers(
+                            workerCount, [&](const size_t workerIx) {
+                                const size_t begin = (element.Count * workerIx) / workerCount;
+                                const size_t end =
+                                    (element.Count * (workerIx + 1ull)) / workerCount;
+                                parseChunk(workerIx, begin, end);
+                            });
+                        if (hashThread.joinable())
+                            hashThread.join();
+                        const bool anyNonTriangle =
+                            std::any_of(workerNonTriangle.begin(), workerNonTriangle.end(),
+                                        [](const uint8_t v) { return v != 0u; });
+                        if (anyNonTriangle) {
+                            _outIndices.resize(oldSize);
+                            _maxIndex = oldMaxIndex;
+                            return EFastFaceReadResult::NotApplicable;
+                        }
+                        const bool anyInvalid =
+                            std::any_of(workerInvalid.begin(), workerInvalid.end(),
+                                        [](const uint8_t v) { return v != 0u; });
+                        if (anyInvalid) {
+                            _outIndices.resize(oldSize);
+                            _maxIndex = oldMaxIndex;
+                            return EFastFaceReadResult::Error;
+                        }
+                        if (trackMaxIndex) {
+                            for (const uint32_t local : workerMax)
+                                if (local > _maxIndex)
+                                    _maxIndex = local;
+                        }
+                        if (hashInParsePipeline &&
+                            hashPipelineOk.load(std::memory_order_relaxed))
+                            outIndexHash = parsedIndexHash;
+                        StartPointer = reinterpret_cast<char*>(
+                            const_cast<uint8_t*>(ptr + element.Count * recordBytes));
+                        _faceCount += element.Count;
+                        return EFastFaceReadResult::Success;
+                    }
+                }
+                if (is32Bit)
+                {
+                    if (isSrcU32)
+                    {
+                        if (trackMaxIndex)
+                        {
+                            for (size_t j = 0u; j < element.Count; ++j)
+                            {
+                                const uint8_t c = *ptr++;
+                                if (c != 3u)
+                                    return EFastFaceReadResult::NotApplicable;
+                                out[0] = readU32(ptr + 0ull * sizeof(uint32_t));
+                                out[1] = readU32(ptr + 1ull * sizeof(uint32_t));
+                                out[2] = readU32(ptr + 2ull * sizeof(uint32_t));
+                                ptr += 3ull * sizeof(uint32_t);
+                                if (out[0] > _maxIndex) _maxIndex = out[0];
+                                if (out[1] > _maxIndex) _maxIndex = out[1];
+                                if (out[2] > _maxIndex) _maxIndex = out[2];
+                                out += 3u;
+                            }
+                        }
+                        else
+                        {
+                            for (size_t j = 0u; j < element.Count; ++j)
+                            {
+                                const uint8_t c = *ptr++;
+                                if (c != 3u)
+                                    return EFastFaceReadResult::NotApplicable;
+                                out[0] = readU32(ptr + 0ull * sizeof(uint32_t));
+                                out[1] = readU32(ptr + 1ull * sizeof(uint32_t));
+                                out[2] = readU32(ptr + 2ull * sizeof(uint32_t));
+                                ptr += 3ull * sizeof(uint32_t);
+                                if (out[0] >= vertexCount || out[1] >= vertexCount || out[2] >= vertexCount)
+                                    return EFastFaceReadResult::Error;
+                                out += 3u;
+                            }
+                        }
+                    }
+                    else
+                    {
+                        for (size_t j = 0u; j < element.Count; ++j)
+                        {
+                            const uint8_t c = *ptr++;
+                            if (c != 3u)
+                                return EFastFaceReadResult::NotApplicable;
+                            out[0] = readU32(ptr + 0ull * sizeof(uint32_t));
+                            out[1] = readU32(ptr + 1ull * sizeof(uint32_t));
+                            out[2] = readU32(ptr + 2ull * sizeof(uint32_t));
+                            ptr += 3ull * sizeof(uint32_t);
+                            if ((out[0] | out[1] | out[2]) & 0x80000000u)
+                                return EFastFaceReadResult::Error;
+                            if (trackMaxIndex)
+                            {
+                                if (out[0] > _maxIndex) _maxIndex = out[0];
+                                if (out[1] > _maxIndex) _maxIndex = out[1];
+                                if (out[2] > _maxIndex) _maxIndex = out[2];
+                            }
+                            else if (out[0] >= vertexCount || out[1] >= vertexCount || out[2] >= vertexCount)
+                                return EFastFaceReadResult::Error;
+                            out += 3u;
+                        }
+                    }
+                }
+                else
+                {
+                    if (isSrcU16)
+                    {
+                        if (trackMaxIndex)
+                        {
+                            for (size_t j = 0u; j < element.Count; ++j)
+                            {
+                                const uint8_t c = *ptr++;
+                                if (c != 3u)
+                                    return EFastFaceReadResult::NotApplicable;
+                                out[0] = readU16(ptr + 0ull * sizeof(uint16_t));
+                                out[1] = readU16(ptr + 1ull * sizeof(uint16_t));
+                                out[2] = readU16(ptr + 2ull * sizeof(uint16_t));
+                                ptr += 3ull * sizeof(uint16_t);
+                                if (out[0] > _maxIndex) _maxIndex = out[0];
+                                if (out[1] > _maxIndex) _maxIndex = out[1];
+                                if (out[2] > _maxIndex) _maxIndex = out[2];
+                                out += 3u;
+                            }
+                        }
+                        else
+                        {
+                            for (size_t j = 0u; j < element.Count; ++j)
+                            {
+                                const uint8_t c = *ptr++;
+                                if (c != 3u)
+                                    return EFastFaceReadResult::NotApplicable;
+                                out[0] = readU16(ptr + 0ull * sizeof(uint16_t));
+                                out[1] = readU16(ptr + 1ull * sizeof(uint16_t));
+                                out[2] = readU16(ptr + 2ull * sizeof(uint16_t));
+                                ptr += 3ull * sizeof(uint16_t);
+                                if (out[0] >= vertexCount || out[1] >= vertexCount || out[2] >= vertexCount)
+                                    return EFastFaceReadResult::Error;
+                                out += 3u;
+                            }
+                        }
+                    }
+                    else
+                    {
+                        for (size_t j = 0u; j < element.Count; ++j)
+                        {
+                            const uint8_t c = *ptr++;
+                            if (c != 3u)
+                                return EFastFaceReadResult::NotApplicable;
+                            out[0] = readU16(ptr + 0ull * sizeof(uint16_t));
+                            out[1] = readU16(ptr + 1ull * sizeof(uint16_t));
+                            out[2] = readU16(ptr + 2ull * sizeof(uint16_t));
+                            ptr += 3ull * sizeof(uint16_t);
+                            if ((out[0] | out[1] | out[2]) & 0x8000u)
+                                return EFastFaceReadResult::Error;
+                            if (trackMaxIndex)
+                            {
+                                if (out[0] > _maxIndex) _maxIndex = out[0];
+                                if (out[1] > _maxIndex) _maxIndex = out[1];
+                                if (out[2] > _maxIndex) _maxIndex = out[2];
+                            }
+                            else if (out[0] >= vertexCount || out[1] >= vertexCount || out[2] >= vertexCount)
+                                return EFastFaceReadResult::Error;
+                            out += 3u;
+                        }
+                    }
+                }
+                StartPointer = reinterpret_cast<char*>(const_cast<uint8_t*>(ptr));
+                _faceCount += element.Count;
+                return EFastFaceReadResult::Success;
+            }
+            if (element.Count > (std::numeric_limits<size_t>::max() / 3u))
+                return EFastFaceReadResult::Error;
+            const size_t reserveCount = element.Count * 3u;
+            if (_outIndices.size() >
+                (std::numeric_limits<size_t>::max() - reserveCount))
+                return EFastFaceReadResult::Error;
+            const size_t oldSize = _outIndices.size();
+            _outIndices.resize(oldSize + reserveCount);
+            uint32_t* out = _outIndices.data() + oldSize;
+            size_t written = 0ull;
+            auto ensureBytes = [this](const size_t bytes) -> bool {
+                if (StartPointer + bytes > EndPointer)
+                    fillBuffer();
+                return StartPointer + bytes <= EndPointer;
+            };
+            auto readCount = [&ensureBytes, this](int32_t& outCount) -> bool {
+                if (!ensureBytes(sizeof(uint8_t)))
+                    return false;
+                outCount = static_cast<uint8_t>(*StartPointer++);
+                return true;
+            };
+            auto readIndex = [&ensureBytes, this, is32Bit, isSrcU32, isSrcU16,
+                              needEndianSwap](uint32_t& out) -> bool {
+                if (is32Bit) {
+                    if (!ensureBytes(sizeof(uint32_t)))
+                        return false;
+                    if (isSrcU32) {
+                        std::memcpy(&out, StartPointer, sizeof(uint32_t));
+                        if (needEndianSwap)
+                            out = Binary::byteswap(out);
+                    } else {
+                        int32_t v = 0;
+                        std::memcpy(&v, StartPointer, sizeof(v));
+                        if (needEndianSwap)
+                            v = Binary::byteswap(v);
+                        if (v < 0)
+                            return false;
+                        out = static_cast<uint32_t>(v);
+                    }
+                    StartPointer += sizeof(uint32_t);
+                    return true;
+                }
+                if (!ensureBytes(sizeof(uint16_t)))
+                    return false;
+                if (isSrcU16) {
+                    uint16_t v = 0u;
+                    std::memcpy(&v, StartPointer, sizeof(uint16_t));
+                    if (needEndianSwap)
+                        v = Binary::byteswap(v);
+                    out = v;
+                } else {
+                    int16_t v = 0;
+                    std::memcpy(&v, StartPointer, sizeof(int16_t));
+                    if (needEndianSwap)
+                        v = Binary::byteswap(v);
+                    if (v < 0)
+                        return false;
+                    out = static_cast<uint32_t>(v);
+                }
+                StartPointer += sizeof(uint16_t);
+                return true;
+            };
+            auto readPackedU32 = [needEndianSwap](const uint8_t* src) -> uint32_t {
+                uint32_t value = 0u;
+                std::memcpy(&value, src, sizeof(value));
+                if (needEndianSwap)
+                    value = Binary::byteswap(value);
+                return value;
+            };
+            auto readPackedU16 = [needEndianSwap](const uint8_t* src) -> uint32_t {
+                uint16_t value = 0u;
+                std::memcpy(&value, src, sizeof(value));
+                if (needEndianSwap)
+                    value = Binary::byteswap(value);
+                return value;
+            };
+            for (size_t j = 0u; j < element.Count; ++j) {
+                if (is32Bit && ensureBytes(sizeof(uint8_t) + sizeof(uint32_t) * 3ull) && static_cast<uint8_t>(*StartPointer) == 3u)
+                {
+                    ++StartPointer;
+                    const uint32_t i0 = readPackedU32(reinterpret_cast<const uint8_t*>(StartPointer) + 0ull * sizeof(uint32_t));
+                    const uint32_t i1 = readPackedU32(reinterpret_cast<const uint8_t*>(StartPointer) + 1ull * sizeof(uint32_t));
+                    const uint32_t i2 = readPackedU32(reinterpret_cast<const uint8_t*>(StartPointer) + 2ull * sizeof(uint32_t));
+                    StartPointer += 3ull * sizeof(uint32_t);
+                    if (isSrcS32 && ((i0 | i1 | i2) & 0x80000000u))
+                        return EFastFaceReadResult::Error;
+                    if (trackMaxIndex)
+                    {
+                        if (i0 > _maxIndex) _maxIndex = i0;
+                        if (i1 > _maxIndex) _maxIndex = i1;
+                        if (i2 > _maxIndex) _maxIndex = i2;
+                    }
+                    else if (i0 >= vertexCount || i1 >= vertexCount || i2 >= vertexCount)
+                        return EFastFaceReadResult::Error;
+                    out[0] = i0;
+                    out[1] = i1;
+                    out[2] = i2;
+                    out += 3u;
+                    written += 3ull;
+                    ++_faceCount;
+                    continue;
+                }
+                if (!is32Bit && ensureBytes(sizeof(uint8_t) + sizeof(uint16_t) * 3ull) && static_cast<uint8_t>(*StartPointer) == 3u)
+                {
+                    ++StartPointer;
+                    const uint32_t i0 = readPackedU16(reinterpret_cast<const uint8_t*>(StartPointer) + 0ull * sizeof(uint16_t));
+                    const uint32_t i1 = readPackedU16(reinterpret_cast<const uint8_t*>(StartPointer) + 1ull * sizeof(uint16_t));
+                    const uint32_t i2 = readPackedU16(reinterpret_cast<const uint8_t*>(StartPointer) + 2ull * sizeof(uint16_t));
+                    StartPointer += 3ull * sizeof(uint16_t);
+                    if (isSrcS16 && ((i0 | i1 | i2) & 0x8000u))
+                        return EFastFaceReadResult::Error;
+                    if (trackMaxIndex)
+                    {
+                        if (i0 > _maxIndex) _maxIndex = i0;
+                        if (i1 > _maxIndex) _maxIndex = i1;
+                        if (i2 > _maxIndex) _maxIndex = i2;
+                    }
+                    else if (i0 >= vertexCount || i1 >= vertexCount || i2 >= vertexCount)
+                        return EFastFaceReadResult::Error;
+                    out[0] = i0;
+                    out[1] = i1;
+                    out[2] = i2;
+                    out += 3u;
+                    written += 3ull;
+                    ++_faceCount;
+                    continue;
+                }
+                int32_t countSigned = 0;
+                if (!readCount(countSigned))
+                    return EFastFaceReadResult::Error;
+                const uint32_t count = static_cast<uint32_t>(countSigned);
+                if (count < 3u) {
+                    uint32_t dummy = 0u;
+                    for (uint32_t k = 0u; k < count; ++k) {
+                        if (!readIndex(dummy))
+                            return EFastFaceReadResult::Error;
+                    }
+                    ++_faceCount;
+                    continue;
+                }
+                uint32_t i0 = 0u;
+                uint32_t i1 = 0u;
+                uint32_t i2 = 0u;
+                if (!readIndex(i0) || !readIndex(i1) || !readIndex(i2))
+                    return EFastFaceReadResult::Error;
+                if (trackMaxIndex) {
+                    _maxIndex = std::max(_maxIndex, std::max(i0, std::max(i1, i2)));
+                } else if (i0 >= vertexCount || i1 >= vertexCount ||
+                           i2 >= vertexCount) {
+                    return EFastFaceReadResult::Error;
+                }
+                out[0] = i0;
+                out[1] = i1;
+                out[2] = i2;
+                out += 3u;
+                written += 3ull;
+                uint32_t prev = i2;
+                for (uint32_t k = 3u; k < count; ++k) {
+                    uint32_t idx = 0u;
+                    if (!readIndex(idx))
+                        return EFastFaceReadResult::Error;
+                    if (trackMaxIndex) {
+                        _maxIndex = std::max(_maxIndex, idx);
+                    } else if (idx >= vertexCount) {
+                        return EFastFaceReadResult::Error;
+                    }
+                    if (_outIndices.size() < oldSize + written + 3ull)
+                    {
+                        const size_t outOffset = static_cast<size_t>(out - _outIndices.data());
+                        _outIndices.resize(oldSize + written + 3ull);
+                        out = _outIndices.data() + outOffset;
+                    }
+                    out[0] = i0;
+                    out[1] = prev;
+                    out[2] = idx;
+                    out += 3u;
+                    written += 3ull;
+                    prev = idx;
+                }
+                ++_faceCount;
+            }
+            _outIndices.resize(oldSize + written);
+            return EFastFaceReadResult::Success;
+        }
+        IAssetLoader::SAssetLoadContext inner; // load context; loadAsset() builds it from the load params and the source file
+        uint32_t topHierarchyLevel; // receives the _hierarchyLevel argument of loadAsset()
+        IAssetLoader::IAssetLoaderOverride* loaderOverride; // receives the _override argument of loadAsset()
+        core::vector<char> Buffer; // input buffer must be at least twice as long as the longest line in the file
+        size_t ioReadWindowSize = DefaultIoReadWindowBytes; // presumably the size of one read window pulled into Buffer - TODO confirm against init()/fillBuffer()
+        core::vector<SElement> ElementList = {}; // one entry per `element` header line, in declaration order; loadAsset() walks it to read the body
+        char *StartPointer = nullptr, *EndPointer = nullptr,
+             *LineEndPointer = nullptr; // StartPointer: current read cursor; EndPointer: bytes are readable while StartPointer + n <= EndPointer; after `end_header` binary data starts at LineEndPointer + 1
+        int32_t LineLength = 0; // presumably the length of the line last returned by getNextLine() - TODO confirm
+        int32_t WordLength = -1; // this variable is a misnomer, its really the offset to next word minus one
+        bool IsBinaryFile = false, IsWrongEndian = false, EndOfFile = false; // IsBinaryFile / IsWrongEndian come from the header `format` line (binary_big_endian sets both); EndOfFile is presumably raised by the buffer refill - TODO confirm
+        size_t fileOffset = {}; // presumably the file position of the next read - TODO confirm
+        uint64_t readCallCount = 0ull; // NOTE(review): readCallCount / readBytesTotal / readMinBytes look like I/O statistics - confirm where they are updated
+        uint64_t readBytesTotal = 0ull;
+        uint64_t readMinBytes = std::numeric_limits<uint64_t>::max();
+        core::vector<SVertAttrIt> vertAttrIts; // write iterators (ptr, stride, dstFmt) that loadAsset() pushes for the vertex attribute views
+    };
+};
+}
+CPLYMeshFileLoader::CPLYMeshFileLoader() = default; // compiler-generated default constructor: no loader-specific setup happens here
+//! Returns the file extensions associated with this loader.
+/** \return Pointer to a function-local static array holding "ply" followed by a terminating nullptr. */
+const char** CPLYMeshFileLoader::getAssociatedFileExtensions() const
+{
+	// static storage, so the returned pointer stays valid after the call returns
+	static const char* plyExtensions[] = { "ply", nullptr };
+	return plyExtensions;
+}
+//! Probes the start of a file to decide whether it is a PLY file this loader supports.
+/** Only the first 128 bytes are inspected. The first line must be `ply` (surrounding whitespace ignored) and
+	the first line starting with `format ` must match one of the three supported `1.0` format declarations exactly.
+	\param _file File to probe; a null pointer is reported as not loadable.
+	\return true only when both checks pass inside the probed window, false otherwise (including on a failed read). */
+bool CPLYMeshFileLoader::isALoadableFileFormat(system::IFile* _file, const system::logger_opt_ptr) const {
+    // loadAsset() tolerates a null file, so the probe must not dereference one either
+    if (!_file)
+        return false;
+    std::array<char, 128> buf = {};
+    system::IFile::success_t success;
+    _file->read(success, buf.data(), 0, buf.size());
+    // NOTE(review): if success_t only reports success for a full 128-byte read, PLY files shorter
+    // than the probe window are rejected here - confirm against the IFile read semantics.
+    if (!success)
+        return false;
+    const std::string_view fileHeader(buf.data(), success.getBytesProcessed());
+    Parse::Common::LineCursor lineCursor = {.cursor = fileHeader.data(), .end = fileHeader.data() + fileHeader.size()};
+    // the magic `ply` line has to come first
+    const auto firstLineOpt = lineCursor.readLine();
+    if (!firstLineOpt.has_value() || Parse::Common::trimWhitespace(*firstLineOpt) != "ply")
+        return false;
+    constexpr std::array<std::string_view, 3> headers = {
+        "format ascii 1.0", "format binary_little_endian 1.0",
+        "format binary_big_endian 1.0"};
+    // lines before the format declaration (e.g. comments) are skipped; the first `format ` line decides the outcome
+    while (const auto lineOpt = lineCursor.readLine()) {
+        const std::string_view line = Parse::Common::trimWhitespace(*lineOpt);
+        if (line.starts_with("format "))
+            return std::find(headers.begin(), headers.end(), line) != headers.end();
+    }
+    // no format declaration inside the probed window
+    return false;
+}
+//! creates/loads a polygon geometry from the PLY file.
+SAssetBundle CPLYMeshFileLoader::loadAsset(
+    system::IFile* _file, const IAssetLoader::SAssetLoadParams& _params,
+    IAssetLoader::IAssetLoaderOverride* _override, uint32_t _hierarchyLevel) {
+    using namespace nbl::core;
+    using clock_t = std::chrono::high_resolution_clock;
+    if (!_file)
+        return {};
+    const bool computeContentHashes = !_params.loaderFlags.hasAnyFlag(
+        IAssetLoader::ELPF_DONT_COMPUTE_CONTENT_HASHES);
+    uint64_t faceCount = 0u;
+    uint64_t fastFaceElementCount = 0u;
+    uint64_t fastVertexElementCount = 0u;
+    uint32_t maxIndexRead = 0u;
+    core::blake3_hash_t precomputedIndexHash = IPreHashed::INVALID_HASH;
+    const uint64_t fileSize = _file->getSize();
+    const bool hashInBuild =
+        computeContentHashes &&
+        SLoaderRuntimeTuner::shouldInlineHashBuild(_params.ioPolicy, fileSize);
+    impl::SLoadSession loadSession = {};
+    if (!impl::SLoadSession::begin(_params.logger, "PLY loader", _file, _params.ioPolicy, fileSize, true, loadSession))
+        return {};
+    Parse::Context ctx = {asset::IAssetLoader::SAssetLoadContext{_params, _file},
+                          _hierarchyLevel, _override};
+    uint64_t desiredReadWindow =
+        loadSession.isWholeFile()
+            ? (fileSize + Parse::Context::ReadWindowPaddingBytes)
+            : loadSession.ioPlan.chunkSizeBytes();
+    if (loadSession.isWholeFile()) {
+        const bool mappedInput = loadSession.mappedPointer() != nullptr;
+        if (mappedInput &&
+            fileSize > (Parse::Context::DefaultIoReadWindowBytes * 2ull))
+            desiredReadWindow = Parse::Context::DefaultIoReadWindowBytes;
+    }
+    const uint64_t safeReadWindow = std::min<uint64_t>(desiredReadWindow, static_cast<uint64_t>(std::numeric_limits<size_t>::max() - Parse::Context::ReadWindowPaddingBytes));
+    ctx.init(static_cast<size_t>(safeReadWindow));
+    // start with empty mesh
+    auto geometry = make_smart_refctd_ptr<ICPUPolygonGeometry>();
+    std::optional<core::vector<std::string>> geometryMetadata = std::nullopt;
+    hlsl::shapes::util::AABBAccumulator3<float> parsedAABB = hlsl::shapes::util::createAABBAccumulator<float>();
+    uint32_t vertCount = 0;
+    Parse::ContentHashBuild contentHashBuild = Parse::ContentHashBuild::create(computeContentHashes, hashInBuild);
+    double headerMs = 0.0, vertexMs = 0.0, faceMs = 0.0, finalizeMs = 0.0;
+    auto hashViewBufferIfNeeded = [&](const IGeometry<ICPUBuffer>::SDataView& view) -> void {
+        if (!view || !view.src.buffer)
+            return;
+        contentHashBuild.hashNow(view.src.buffer.get());
+    };
+    auto hashRemainingGeometryBuffers = [&]() -> void {
+        if (contentHashBuild.hashesInline())
+            SGeometryLoaderCommon::visitGeometryViews(geometry.get(), hashViewBufferIfNeeded);
+    };
+    auto tryLaunchDeferredHash = [&](const IGeometry<ICPUBuffer>::SDataView& view) -> void {
+        if (!view || !view.src.buffer)
+            return;
+        contentHashBuild.tryDefer(view.src.buffer.get());
+    };
+    // Currently only supports ASCII or binary meshes
+    if (Parse::Common::trimWhitespace(ctx.getNextLine()) != "ply") {
+        _params.logger.log("Not a valid PLY file %s", system::ILogger::ELL_ERROR,
+                           ctx.inner.mainFile->getFileName().string().c_str());
+        return {};
+    }
+    // cut the next line out
+    ctx.getNextLine();
+    // grab the word from this line
+    const char* word = ctx.getNextWord();
+    // ignore comments
+    for (; Parse::toStringView(word) == "comment"; ctx.getNextLine())
+        word = ctx.getNextWord();
+    bool readingHeader = true;
+    bool continueReading = true;
+    ctx.IsBinaryFile = false;
+    ctx.IsWrongEndian = false;
+    const auto headerStart = clock_t::now();
+    do {
+        const std::string_view wordView = Parse::toStringView(word);
+        if (wordView == "property") {
+            word = ctx.getNextWord();
+            if (ctx.ElementList.empty()) {
+                _params.logger.log("PLY property token found before element %s",
+                                   system::ILogger::ELL_WARNING, word);
+            } else {
+                // get element
+                auto& el = ctx.ElementList.back();
+                // fill property struct
+                auto& prop = el.Properties.emplace_back();
+                prop.type = prop.getType(word);
+                if (prop.type == EF_UNKNOWN) {
+                    el.KnownSize = false;
+                    word = ctx.getNextWord();
+                    prop.list.countType = prop.getType(word);
+                    if (ctx.IsBinaryFile && !isIntegerFormat(prop.list.countType)) {
+                        _params.logger.log("Cannot read binary PLY file containing data "
+                                           "types of unknown or non integer length %s",
+                                           system::ILogger::ELL_WARNING, word);
+                        continueReading = false;
+                    } else {
+                        word = ctx.getNextWord();
+                        prop.list.itemType = prop.getType(word);
+                        if (ctx.IsBinaryFile && !isIntegerFormat(prop.list.itemType)) {
+                            _params.logger.log("Cannot read binary PLY file containing data "
+                                               "types of unknown or non integer length %s",
+                                               system::ILogger::ELL_ERROR, word);
+                            continueReading = false;
+                        }
+                    }
+                } else if (ctx.IsBinaryFile && prop.type == EF_UNKNOWN) {
+                    _params.logger.log("Cannot read binary PLY file containing data "
+                                       "types of unknown length %s",
+                                       system::ILogger::ELL_ERROR, word);
+                    continueReading = false;
+                } else
+                    el.KnownSize += getTexelOrBlockBytesize(prop.type);
+                prop.Name = ctx.getNextWord();
+            }
+        } else if (wordView == "element") {
+            auto& el = ctx.ElementList.emplace_back();
+            el.Name = ctx.getNextWord();
+            const char* const countWord = ctx.getNextWord();
+            uint64_t parsedCount = 0ull;
+            const std::string_view countWordView = Parse::toStringView(countWord);
+            if (!countWordView.empty()) {
+                if (!Parse::Common::parseExactNumber(countWordView, parsedCount))
+                    parsedCount = 0ull;
+            }
+            el.Count = static_cast<size_t>(parsedCount);
+            el.KnownSize = 0;
+            if (el.Name == "vertex")
+                vertCount = el.Count;
+        } else if (wordView == "comment") {
+            // ignore line
+        } else if (wordView == "format") {
+            // must be `format {binary_little_endian|binary_big_endian|ascii} 1.0`
+            word = ctx.getNextWord();
+            const std::string_view formatView = Parse::toStringView(word);
+            if (formatView == "binary_little_endian") {
+                ctx.IsBinaryFile = true;
+            } else if (formatView == "binary_big_endian") {
+                ctx.IsBinaryFile = true;
+                ctx.IsWrongEndian = true;
+            } else if (formatView == "ascii") {
+            } else {
+                // abort if this isn't an ascii or a binary mesh
+                _params.logger.log("Unsupported PLY mesh format %s",
+                                   system::ILogger::ELL_ERROR, word);
+                continueReading = false;
+            }
+            if (continueReading) {
+                word = ctx.getNextWord();
+                if (Parse::toStringView(word) != "1.0") {
+                    _params.logger.log("Unsupported PLY mesh version %s",
+                                       system::ILogger::ELL_WARNING, word);
+                }
+            }
+        } else if (wordView == "end_header") {
+            readingHeader = false;
+            if (ctx.IsBinaryFile) {
+                char* const binaryStartInBuffer = ctx.LineEndPointer + 1;
+                const auto* const mappedBase = reinterpret_cast<const char*>(loadSession.mappedPointer());
+                if (mappedBase) {
+                    const size_t binaryOffset =
+                        ctx.getAbsoluteOffset(binaryStartInBuffer);
+                    const size_t remainingBytes = static_cast<size_t>(
+                        binaryOffset < fileSize ? (fileSize - binaryOffset) : 0ull);
+                    ctx.useMappedBinaryWindow(mappedBase + binaryOffset, remainingBytes);
+                } else {
+                    ctx.StartPointer = binaryStartInBuffer;
+                }
+            }
+        } else {
+            _params.logger.log("Unknown item in PLY file %s",
+                               system::ILogger::ELL_WARNING, word);
+        }
+        if (readingHeader && continueReading) {
+            ctx.getNextLine();
+            word = ctx.getNextWord();
+        }
+    } while (readingHeader && continueReading);
+    headerMs = std::chrono::duration<double, std::milli>(clock_t::now() - headerStart).count();
+    if (!continueReading)
+        return {};
+    // now to read the actual data from the file
+    using index_t = uint32_t;
+    core::vector<index_t> indices = {};
+    bool verticesProcessed = false;
+    const std::string fileName = _file->getFileName().string();
+    auto logMalformedElement = [&](const char* const elementName) -> void {
+        _params.logger.log("PLY %s fast path failed on malformed data for %s", system::ILogger::ELL_ERROR, elementName, fileName.c_str());
+    };
+    auto skipUnknownElement = [&](const Parse::Context::SElement& el) -> bool {
+        if (ctx.IsBinaryFile && el.KnownSize) {
+            const uint64_t bytesToSkip64 = static_cast<uint64_t>(el.KnownSize) * static_cast<uint64_t>(el.Count);
+            if (bytesToSkip64 > static_cast<uint64_t>(std::numeric_limits<size_t>::max()))
+                return false;
+            ctx.moveForward(static_cast<size_t>(bytesToSkip64));
+        } else {
+            for (size_t j = 0; j < el.Count; ++j)
+                el.skipElement(ctx);
+        }
+        return true;
+    };
+    auto readFaceElement = [&](const Parse::Context::SElement& el) -> bool {
+        const uint32_t vertexCount32 = vertCount <= static_cast<size_t>(std::numeric_limits<uint32_t>::max()) ? static_cast<uint32_t>(vertCount) : 0u;
+        const auto fastFaceResult = ctx.readFaceElementFast(el, indices, maxIndexRead, faceCount, vertexCount32, contentHashBuild.hashesDeferred(), precomputedIndexHash);
+        if (fastFaceResult == Parse::Context::EFastFaceReadResult::Success) {
+            ++fastFaceElementCount;
+            return true;
+        }
+        if (fastFaceResult == Parse::Context::EFastFaceReadResult::NotApplicable) {
+            indices.reserve(indices.size() + el.Count * 3u);
+            for (size_t j = 0; j < el.Count; ++j) {
+                if (!ctx.readFace(el, indices, maxIndexRead, vertexCount32))
+                    return false;
+                ++faceCount;
+            }
+            return true;
+        }
+        logMalformedElement("face");
+        return false;
+    };
+    // loop through each of the elements
+    for (uint32_t i = 0; i < ctx.ElementList.size(); ++i) {
+        auto& el = ctx.ElementList[i];
+        if (el.Name == "vertex") {
+            const auto vertexStart = clock_t::now();
+            if (verticesProcessed) {
+                // multiple vertex elements are currently treated as unsupported
+                _params.logger.log("Multiple `vertex` elements not supported!",
+                                   system::ILogger::ELL_ERROR);
+                return {};
+            }
+            ICPUPolygonGeometry::SDataViewBase posView = {}, normalView = {},
+                                               uvView = {};
+            core::vector<ICPUPolygonGeometry::SDataView> extraViews;
+            core::vector<std::string> extraViewNames;
+            for (auto& vertexProperty : el.Properties) {
+                const auto& propertyName = vertexProperty.Name;
+                if (propertyName == "x")
+                    SGeometryLoaderCommon::negotiateStructuredComponent(posView, vertexProperty.type, 0);
+                else if (propertyName == "y")
+                    SGeometryLoaderCommon::negotiateStructuredComponent(posView, vertexProperty.type, 1);
+                else if (propertyName == "z")
+                    SGeometryLoaderCommon::negotiateStructuredComponent(posView, vertexProperty.type, 2);
+                else if (propertyName == "nx")
+                    SGeometryLoaderCommon::negotiateStructuredComponent(normalView, vertexProperty.type, 0);
+                else if (propertyName == "ny")
+                    SGeometryLoaderCommon::negotiateStructuredComponent(normalView, vertexProperty.type, 1);
+                else if (propertyName == "nz")
+                    SGeometryLoaderCommon::negotiateStructuredComponent(normalView, vertexProperty.type, 2);
+                else if (propertyName == "u" || propertyName == "s")
+                    SGeometryLoaderCommon::negotiateStructuredComponent(uvView, vertexProperty.type, 0);
+                else if (propertyName == "v" || propertyName == "t")
+                    SGeometryLoaderCommon::negotiateStructuredComponent(uvView, vertexProperty.type, 1);
+                else
+                {
+                    extraViews.push_back(createView(vertexProperty.type, el.Count));
+                    extraViewNames.push_back(propertyName);
+                }
+            }
+            auto pushStructuredAttr = [](auto& iterators, const size_t offset, const uint32_t stride, const E_FORMAT componentFormat) -> void {
+                iterators.push_back({.ptr = reinterpret_cast<uint8_t*>(offset), .stride = stride, .dstFmt = componentFormat});
+            };
+            auto rebaseStructuredAttr = [](auto& iter, const ptrdiff_t basePtr) -> void {
+                iter.ptr += basePtr;
+            };
+            SGeometryLoaderCommon::attachStructuredView(posView, el.Count, ctx.vertAttrIts, pushStructuredAttr, rebaseStructuredAttr, [&](auto view) { geometry->setPositionView(std::move(view)); });
+            SGeometryLoaderCommon::attachStructuredView(normalView, el.Count, ctx.vertAttrIts, pushStructuredAttr, rebaseStructuredAttr, [&](auto view) { geometry->setNormalView(std::move(view)); });
+            SGeometryLoaderCommon::attachStructuredView(uvView, el.Count, ctx.vertAttrIts, pushStructuredAttr, rebaseStructuredAttr, [&](auto view) { SGeometryLoaderCommon::setAuxViewAt(geometry.get(), SPLYPolygonGeometryAuxLayout::UV0, std::move(view)); });
+            core::vector<std::string> auxAttributeNames;
+            const size_t extraNameOffset = geometry->getAuxAttributeViews()->size();
+            for (auto& view : extraViews)
+                ctx.vertAttrIts.push_back({.ptr = reinterpret_cast<uint8_t*>(view.src.buffer->getPointer()) + view.src.offset,
+                                           .stride = getTexelOrBlockBytesize(view.composed.format),
+                                           .dstFmt = view.composed.format});
+            for (auto& view : extraViews)
+                geometry->getAuxAttributeViews()->push_back(std::move(view));
+            if (!extraViewNames.empty())
+            {
+                auxAttributeNames.resize(geometry->getAuxAttributeViews()->size());
+                for (size_t extraIx = 0ull; extraIx < extraViewNames.size(); ++extraIx)
+                    auxAttributeNames[extraNameOffset + extraIx] = std::move(extraViewNames[extraIx]);
+            }
+            // loop through vertex properties
+            const auto fastVertexResult = ctx.readVertexElementFast(el, &parsedAABB);
+            if (fastVertexResult == Parse::Context::EFastVertexReadResult::Success) {
+                ++fastVertexElementCount;
+            } else if (fastVertexResult ==
+                       Parse::Context::EFastVertexReadResult::NotApplicable) {
+                ctx.readVertex(_params, el);
+            } else {
+                logMalformedElement("vertex");
+                return {};
+            }
+            SGeometryLoaderCommon::visitVertexAttributeViews(geometry.get(), hashViewBufferIfNeeded);
+            tryLaunchDeferredHash(geometry->getPositionView());
+            verticesProcessed = true;
+            if (!auxAttributeNames.empty())
+            {
+                geometryMetadata = std::move(auxAttributeNames);
+            }
+            vertexMs += std::chrono::duration<double, std::milli>(clock_t::now() - vertexStart).count();
+        } else if (el.Name == "face") {
+            const auto faceStart = clock_t::now();
+            if (!readFaceElement(el))
+                return {};
+            faceMs += std::chrono::duration<double, std::milli>(clock_t::now() - faceStart).count();
+        } else {
+            if (!skipUnknownElement(el))
+                return {};
+        }
+    }
+    if (!parsedAABB.empty())
+        geometry->applyAABB(parsedAABB.value);
+    else
+        CPolygonGeometryManipulator::recomputeAABB(geometry.get());
+    const uint64_t indexCount = static_cast<uint64_t>(indices.size());
+    if (indices.empty()) {
+        // no index buffer means point cloud
+        geometry->setIndexing(IPolygonGeometryBase::PointList());
+    } else {
+        if (vertCount != 0u && maxIndexRead >= vertCount) {
+            _params.logger.log("PLY indices out of range for %s",
+                               system::ILogger::ELL_ERROR,
+                               _file->getFileName().string().c_str());
+            return {};
+        }
+        geometry->setIndexing(IPolygonGeometryBase::TriangleList());
+        const bool canUseU16 =
+            (vertCount != 0u)
+                ? (vertCount <= std::numeric_limits<uint16_t>::max())
+                : (maxIndexRead <= std::numeric_limits<uint16_t>::max());
+        if (canUseU16) {
+            core::vector<uint16_t> indices16(indices.size());
+            for (size_t i = 0u; i < indices.size(); ++i)
+                indices16[i] = static_cast<uint16_t>(indices[i]);
+            auto view = SGeometryLoaderCommon::createAdoptedView<EF_R16_UINT>(
+                std::move(indices16));
+            if (!view)
+                return {};
+            geometry->setIndexView(std::move(view));
+            hashViewBufferIfNeeded(geometry->getIndexView());
+        } else {
+            auto view = SGeometryLoaderCommon::createAdoptedView<EF_R32_UINT>(
+                std::move(indices));
+            if (!view)
+                return {};
+            if (precomputedIndexHash != IPreHashed::INVALID_HASH)
+                view.src.buffer->setContentHash(precomputedIndexHash);
+            geometry->setIndexView(std::move(view));
+            hashViewBufferIfNeeded(geometry->getIndexView());
+        }
+    }
+    const auto finalizeStart = clock_t::now();
+    if (contentHashBuild.hashesDeferred()) {
+        contentHashBuild.wait();
+        SPolygonGeometryContentHash::computeMissing(geometry.get(),
+                                                    _params.ioPolicy);
+    } else {
+        hashRemainingGeometryBuffers();
+    }
+    finalizeMs = std::chrono::duration<double, std::milli>(clock_t::now() - finalizeStart).count();
+    const uint64_t ioMinRead = ctx.readCallCount ? ctx.readMinBytes : 0ull;
+    const uint64_t ioAvgRead =
+        ctx.readCallCount ? (ctx.readBytesTotal / ctx.readCallCount) : 0ull;
+    const SFileReadTelemetry ioTelemetry = {.callCount = ctx.readCallCount,
+                                            .totalBytes = ctx.readBytesTotal,
+                                            .minBytes = ctx.readMinBytes};
+    loadSession.logTinyIO(_params.logger, ioTelemetry);
+    _params.logger.log(
+        "PLY loader stats: file=%s binary=%d verts=%llu faces=%llu idx=%llu "
+        "vertex_fast=%llu face_fast=%llu io_reads=%llu io_min_read=%llu "
+        "io_avg_read=%llu io_req=%s io_eff=%s io_chunk=%llu io_reason=%s",
+        system::ILogger::ELL_PERFORMANCE, _file->getFileName().string().c_str(),
+        ctx.IsBinaryFile ? 1 : 0, static_cast<unsigned long long>(vertCount),
+        static_cast<unsigned long long>(faceCount),
+        static_cast<unsigned long long>(indexCount),
+        static_cast<unsigned long long>(fastVertexElementCount),
+        static_cast<unsigned long long>(fastFaceElementCount),
+        static_cast<unsigned long long>(ctx.readCallCount),
+        static_cast<unsigned long long>(ioMinRead),
+        static_cast<unsigned long long>(ioAvgRead),
+        system::to_string(_params.ioPolicy.strategy).c_str(),
+        system::to_string(loadSession.ioPlan.strategy).c_str(),
+        static_cast<unsigned long long>(loadSession.ioPlan.chunkSizeBytes()), loadSession.ioPlan.reason);
+    _params.logger.log("PLY loader stages: file=%s header=%.3f ms vertex=%.3f ms face=%.3f ms finalize=%.3f ms", system::ILogger::ELL_PERFORMANCE, _file->getFileName().string().c_str(), headerMs, vertexMs, faceMs, finalizeMs);
+    auto meta = core::make_smart_refctd_ptr<CPLYMetadata>(1u);
+    if (geometryMetadata)
+        meta->placeMeta(0u, geometry.get(), std::move(*geometryMetadata));
+    return SAssetBundle(std::move(meta), {std::move(geometry)});
+}
 }
-
-
-} // end namespace nbl::asset
 #endif // _NBL_COMPILE_WITH_PLY_LOADER_
diff --git a/src/nbl/asset/interchange/CPLYMeshFileLoader.h b/src/nbl/asset/interchange/CPLYMeshFileLoader.h
index 6215364466..43d57e74d7 100644
--- a/src/nbl/asset/interchange/CPLYMeshFileLoader.h
+++ b/src/nbl/asset/interchange/CPLYMeshFileLoader.h
@@ -1,39 +1,25 @@
-// Copyright (C) 2019-2025 - DevSH Graphics Programming Sp. z O.O.
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine" and was originally part of the "Irrlicht Engine"
 // For conditions of distribution and use, see copyright notice in nabla.h
 // See the original file in irrlicht source for authors
 #ifndef _NBL_ASSET_C_PLY_MESH_FILE_LOADER_H_INCLUDED_
 #define _NBL_ASSET_C_PLY_MESH_FILE_LOADER_H_INCLUDED_
-#ifdef _NBL_COMPILE_WITH_PLY_LOADER_
-
 #include "nbl/core/declarations.h"
-
 #include "nbl/asset/interchange/IGeometryLoader.h"
-
-#include "nbl/asset/ICPUPolygonGeometry.h"
-#include "nbl/asset/metadata/CPLYMetadata.h"
-
 namespace nbl::asset
 {
-
-//! Meshloader capable of loading obj meshes.
+//! Mesh loader capable of loading PLY meshes.
 class CPLYMeshFileLoader final : public IGeometryLoader
 {
 	public:
-		inline CPLYMeshFileLoader() = default;
+		CPLYMeshFileLoader(); // declaration only; replaces the inline defaulted constructor removed above
 
 		bool isALoadableFileFormat(system::IFile* _file, const system::logger_opt_ptr logger) const override;
 
-		const char** getAssociatedFileExtensions() const override
-		{
-			static const char* ext[]{ "ply", nullptr };
-			return ext;
-		}
+		const char** getAssociatedFileExtensions() const override; // declaration only; the inline body that returned { "ply", nullptr } is removed from this header
 
-		//! creates/loads an animated mesh from the file.
+		//! Loads one PLY asset bundle from an already opened file.
 		SAssetBundle loadAsset(system::IFile* _file, const IAssetLoader::SAssetLoadParams& _params, IAssetLoader::IAssetLoaderOverride* _override = nullptr, uint32_t _hierarchyLevel = 0u) override;
 };
-
 } // end namespace nbl::asset
 #endif
-#endif
diff --git a/src/nbl/asset/interchange/CPLYMeshWriter.cpp b/src/nbl/asset/interchange/CPLYMeshWriter.cpp
index fd6fa3ea9e..0d6f1e7b92 100644
--- a/src/nbl/asset/interchange/CPLYMeshWriter.cpp
+++ b/src/nbl/asset/interchange/CPLYMeshWriter.cpp
@@ -1,620 +1,721 @@
-// Copyright (C) 2019 - DevSH Graphics Programming Sp. z O.O.
+#ifdef _NBL_COMPILE_WITH_PLY_WRITER_
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine" and was originally part of the "Irrlicht Engine"
 // For conditions of distribution and use, see copyright notice in nabla.h
 // See the original file in irrlicht source for authors
-
 #include "CPLYMeshWriter.h"
-
-#ifdef _NBL_COMPILE_WITH_PLY_WRITER_
-
-#include "nbl/system/ISystem.h"
+#include "nbl/asset/interchange/SGeometryViewDecode.h"
+#include "nbl/asset/interchange/SPLYPolygonGeometryAuxLayout.h"
+#include "nbl/asset/interchange/SGeometryWriterCommon.h"
+#include "nbl/asset/interchange/SInterchangeIO.h"
+#include "impl/SFileAccess.h"
 #include "nbl/system/IFile.h"
-#include "nbl/asset/utils/CMeshManipulator.h"
-
-namespace nbl
-{
-namespace asset
-{
-
-namespace impl
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <charconv>
+#include <chrono>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <limits>
+#include <sstream>
+#include <system_error>
+namespace nbl::asset
 {
-static asset::E_FORMAT getCorrespondingIntegerFormat(asset::E_FORMAT _fmt)
-{
-    using namespace asset;
-    switch (_fmt)
-    {
-    case EF_R8_UNORM: return EF_R8_UINT;
-    case EF_R8_SNORM: return EF_R8_SINT;
-    case EF_R8G8_UNORM: return EF_R8G8_UINT;
-    case EF_R8G8_SNORM: return EF_R8G8_SINT;
-    case EF_R8G8B8_UNORM: return EF_R8G8B8_UINT;
-    case EF_R8G8B8_SNORM: return EF_R8G8B8_SINT;
-    case EF_R8G8B8A8_UNORM: return EF_R8G8B8A8_UINT;
-    case EF_R8G8B8A8_SNORM: return EF_R8G8B8A8_SINT;
-    case EF_R16_UNORM: return EF_R16_UINT;
-    case EF_R16_SNORM: return EF_R16_SINT;
-    case EF_R16G16_UNORM: return EF_R16G16_UINT;
-    case EF_R16G16_SNORM: return EF_R16G16_SINT;
-    case EF_R16G16B16_UNORM: return EF_R16G16B16_UINT;
-    case EF_R16G16B16_SNORM: return EF_R16G16B16_SINT;
-    case EF_R16G16B16A16_UNORM: return EF_R16G16B16A16_UINT;
-    case EF_R16G16B16A16_SNORM: return EF_R16G16B16A16_SINT;
-    case EF_A2B10G10R10_UNORM_PACK32: return EF_A2B10G10R10_UINT_PACK32;
-    case EF_A2B10G10R10_SNORM_PACK32: return EF_A2B10G10R10_SINT_PACK32;
-    case EF_B8G8R8A8_UNORM: return EF_R8G8B8A8_SINT;
-    case EF_A2R10G10B10_UNORM_PACK32: return EF_A2B10G10R10_UINT_PACK32;
-    case EF_A2R10G10B10_SNORM_PACK32: return EF_A2B10G10R10_SINT_PACK32;
-    default: return EF_UNKNOWN;
-    }
-}
-}
-
 CPLYMeshWriter::CPLYMeshWriter()
 {
 	#ifdef _NBL_DEBUG
 	setDebugName("CPLYMeshWriter");
 	#endif
 }
-
-//! writes a mesh
-bool CPLYMeshWriter::writeAsset(system::IFile* _file, const SAssetWriteParams& _params, IAssetWriterOverride* _override)
+const char** CPLYMeshWriter::getAssociatedFileExtensions() const
 {
-    if (!_override)
-        getDefaultOverride(_override);
-
-    SAssetWriteContext inCtx{ _params, _file };
-
-    const asset::ICPUMesh* mesh = IAsset::castDown<const ICPUMesh>(_params.rootAsset);
-    if (!mesh)
-        return false;
-
-    system::IFile* file = _override->getOutputFile(_file, inCtx, {mesh, 0u});
-
-    auto meshbuffers = mesh->getMeshBuffers();
-	if (!file || !mesh)
-		return false;
-
-    SContext context = { SAssetWriteContext{ inCtx.params, file} };
-    
-    if (meshbuffers.size() > 1)
-    {
-        #ifdef  _NBL_DEBUG
-        context.writeContext.params.logger.log("PLY WRITER WARNING (" + std::to_string(__LINE__) + " line): Only one meshbuffer input is allowed for writing! Saving first one", system::ILogger::ELL_WARNING, file->getFileName().string().c_str());
-        #endif // _NBL_DEBUG
-    }
-
-    context.writeContext.params.logger.log("Writing PLY mesh", system::ILogger::ELL_INFO, file->getFileName().string().c_str());
-
-    const asset::E_WRITER_FLAGS flags = _override->getAssetWritingFlags(context.writeContext, mesh, 0u);
-
-    auto getConvertedCpuMeshBufferWithIndexBuffer = [&]() -> core::smart_refctd_ptr<asset::ICPUMeshBuffer>
-    {
-        auto inputMeshBuffer = *meshbuffers.begin();
-        const bool doesItHaveIndexBuffer = inputMeshBuffer->getIndexBufferBinding().buffer.get();
-        const bool isItNotTriangleListsPrimitive = inputMeshBuffer->getPipeline()->getCachedCreationParams().primitiveAssembly.primitiveType != asset::EPT_TRIANGLE_LIST;
-        
-        if (doesItHaveIndexBuffer && isItNotTriangleListsPrimitive)
-        {
-            auto cpuConvertedMeshBuffer = core::smart_refctd_ptr_static_cast<asset::ICPUMeshBuffer>(inputMeshBuffer->clone());
-            IMeshManipulator::homogenizePrimitiveTypeAndIndices(&cpuConvertedMeshBuffer, &cpuConvertedMeshBuffer + 1, asset::EPT_TRIANGLE_LIST, asset::EIT_32BIT);
-            return cpuConvertedMeshBuffer;
-        }
-        else
-            return nullptr;
-    };
-
-    const auto cpuConvertedMeshBufferWithIndexBuffer = getConvertedCpuMeshBufferWithIndexBuffer();
-    const asset::ICPUMeshBuffer* rawCopyMeshBuffer = cpuConvertedMeshBufferWithIndexBuffer.get() ? cpuConvertedMeshBufferWithIndexBuffer.get() : *meshbuffers.begin();
-    const bool doesItUseIndexBufferBinding = (rawCopyMeshBuffer->getIndexBufferBinding().buffer.get() && rawCopyMeshBuffer->getIndexType() != asset::EIT_UNKNOWN);
-
-    uint32_t faceCount = {}; 
-    size_t vertexCount = {};
-
-    void* indices = nullptr;
-    {
-        auto indexCount = rawCopyMeshBuffer->getIndexCount();
-
-        indices = _NBL_ALIGNED_MALLOC(indexCount * sizeof(uint32_t), _NBL_SIMD_ALIGNMENT);
-        memcpy(indices, rawCopyMeshBuffer->getIndices(), indexCount * sizeof(uint32_t));
-        
-        IMeshManipulator::getPolyCount(faceCount, rawCopyMeshBuffer);
-        vertexCount = IMeshManipulator::upperBoundVertexID(rawCopyMeshBuffer);
-    }
-
-	// write PLY header
-    std::string header = "ply\n";
-    header += (flags & asset::EWF_BINARY) ? "format binary_little_endian 1.0" : "format ascii 1.0";
-	header += "\ncomment IrrlichtBAW ";
-	header +=  NABLA_SDK_VERSION;
-
-	// vertex definition
-	header += "\nelement vertex ";
-	header += std::to_string(vertexCount) + '\n';
-
-    bool vaidToWrite[4]{ 0, 0, 0, 0 };
-
-    const uint32_t POSITION_ATTRIBUTE = rawCopyMeshBuffer->getPositionAttributeIx();
-    constexpr uint32_t COLOR_ATTRIBUTE = 1;
-    constexpr uint32_t UV_ATTRIBUTE = 2;
-    const uint32_t NORMAL_ATTRIBUTE = rawCopyMeshBuffer->getNormalAttributeIx();
-
-    if (rawCopyMeshBuffer->getAttribBoundBuffer(POSITION_ATTRIBUTE).buffer)
-    {
-        const asset::E_FORMAT t = rawCopyMeshBuffer->getAttribFormat(POSITION_ATTRIBUTE);
-        std::string typeStr = getTypeString(t);
-        vaidToWrite[0] = true;
-        header +=
-            "property " + typeStr + " x\n" +
-            "property " + typeStr + " y\n" +
-            "property " + typeStr + " z\n";
-    }
-    if (rawCopyMeshBuffer->getAttribBoundBuffer(COLOR_ATTRIBUTE).buffer)
-    {
-        const asset::E_FORMAT t = rawCopyMeshBuffer->getAttribFormat(COLOR_ATTRIBUTE);
-        std::string typeStr = getTypeString(t);
-        vaidToWrite[1] = true;
-        header +=
-            "property " + typeStr + " red\n" +
-            "property " + typeStr + " green\n" +
-            "property " + typeStr + " blue\n";
-        if (asset::getFormatChannelCount(t) == 4u)
-        {
-            header += "property " + typeStr + " alpha\n";
-        }
-    }
-    if (rawCopyMeshBuffer->getAttribBoundBuffer(UV_ATTRIBUTE).buffer)
-    {
-        const asset::E_FORMAT t = rawCopyMeshBuffer->getAttribFormat(UV_ATTRIBUTE);
-        std::string typeStr = getTypeString(t);
-        vaidToWrite[2] = true;
-        header +=
-            "property " + typeStr + " u\n" +
-            "property " + typeStr + " v\n";
-    }
-    if (rawCopyMeshBuffer->getAttribBoundBuffer(NORMAL_ATTRIBUTE).buffer)
-    {
-        const asset::E_FORMAT t = rawCopyMeshBuffer->getAttribFormat(NORMAL_ATTRIBUTE);
-        std::string typeStr = getTypeString(t);
-        vaidToWrite[3] = true;
-        header +=
-            "property " + typeStr + " nx\n" +
-            "property " + typeStr + " ny\n" +
-            "property " + typeStr + " nz\n";
-    }    
-
-    asset::E_INDEX_TYPE idxT = asset::EIT_UNKNOWN;
-    bool forceFaces = false;
-
-    const auto primitiveType = rawCopyMeshBuffer->getPipeline()->getCachedCreationParams().primitiveAssembly.primitiveType;
-    const auto indexType = rawCopyMeshBuffer->getIndexType();
-  
-    if (primitiveType == asset::EPT_POINT_LIST)
-        faceCount = 0u;
-    else if (doesItUseIndexBufferBinding)
-    {
-        header += "element face ";
-        header += std::to_string(faceCount) + '\n';
-        idxT = indexType;
-        const std::string idxTypeStr = idxT == asset::EIT_32BIT ? "uint32" : "uint16";
-        header += "property list uchar " + idxTypeStr + " vertex_indices\n";
-    }
-    else if (primitiveType == asset::EPT_TRIANGLE_LIST)
-    {
-        forceFaces = true;
-
-        header += "element face ";
-        header += std::to_string(faceCount) + '\n';
-        idxT = vertexCount <= ((1u<<16) - 1) ? asset::EIT_16BIT : asset::EIT_32BIT;
-        const std::string idxTypeStr = idxT == asset::EIT_32BIT ? "uint32" : "uint16";
-        header += "property list uchar " + idxTypeStr + " vertex_indices\n";
-    }
-    else
-        faceCount = 0u;
-    header += "end_header\n";
-
-    {
-        system::IFile::success_t success;
-        file->write(success, header.c_str(), context.fileOffset, header.size());
-        context.fileOffset += success.getBytesProcessed();
-    }
- 
-    if (flags & asset::EWF_BINARY)
-        writeBinary(rawCopyMeshBuffer, vertexCount, faceCount, idxT, indices, forceFaces, vaidToWrite, context);
-    else
-        writeText(rawCopyMeshBuffer, vertexCount, faceCount, idxT, indices, forceFaces, vaidToWrite, context);
-
-    _NBL_ALIGNED_FREE(const_cast<void*>(indices));
-
-	return true;
+	static const char* ext[] = { "ply", nullptr };
+	return ext;
 }
-
-void CPLYMeshWriter::writeBinary(const asset::ICPUMeshBuffer* _mbuf, size_t _vtxCount, size_t _fcCount, asset::E_INDEX_TYPE _idxType, void* const _indices, bool _forceFaces, const bool _vaidToWrite[4], SContext& context) const
+writer_flags_t CPLYMeshWriter::getSupportedFlags()
 {
-    const size_t colCpa = asset::getFormatChannelCount(_mbuf->getAttribFormat(1));
-
-	bool flipVectors = (!(context.writeContext.params.flags & E_WRITER_FLAGS::EWF_MESH_IS_RIGHT_HANDED)) ? true : false;
-
-    auto mbCopy = createCopyMBuffNormalizedReplacedWithTrueInt(_mbuf);
-    for (size_t i = 0u; i < _vtxCount; ++i)
-    {
-        core::vectorSIMDf f;
-        uint32_t ui[4];
-        if (_vaidToWrite[0])
-        {
-            writeAttribBinary(context, mbCopy.get(), 0, i, 3u, flipVectors);
-        }
-        if (_vaidToWrite[1])
-        {
-            writeAttribBinary(context, mbCopy.get(), 1, i, colCpa);
-        }
-        if (_vaidToWrite[2])
-        {
-            writeAttribBinary(context, mbCopy.get(), 2, i, 2u);
-        }
-        if (_vaidToWrite[3])
-        {
-            writeAttribBinary(context, mbCopy.get(), 3, i, 3u, flipVectors);
-        }
-    }
-
-    constexpr uint8_t listSize = 3u;
-    void* indices = _indices;
-    if (_forceFaces)
-    {
-        indices = _NBL_ALIGNED_MALLOC((_idxType == asset::EIT_32BIT ? 4 : 2) * listSize * _fcCount,_NBL_SIMD_ALIGNMENT);
-        if (_idxType == asset::EIT_16BIT)
-        {
-            for (uint16_t i = 0u; i < _fcCount; ++i)
-                ((uint16_t*)indices)[i] = i;
-        }
-        else
-        {
-            for (uint32_t i = 0u; i < _fcCount; ++i)
-                ((uint32_t*)indices)[i] = i;
-        }
-    }
-    if (_idxType == asset::EIT_32BIT)
-    {
-        uint32_t* ind = (uint32_t*)indices;
-        for (size_t i = 0u; i < _fcCount; ++i)
-        {
-            {
-                system::IFile::success_t success;
-                context.writeContext.outputFile->write(success, &listSize, context.fileOffset, sizeof(listSize));
-                context.fileOffset += success.getBytesProcessed();
-            }
-
-            {
-                system::IFile::success_t success;
-                context.writeContext.outputFile->write(success, ind, context.fileOffset, listSize * 4);
-                context.fileOffset += success.getBytesProcessed();
-            }
-
-            ind += listSize;
-        }
-    }
-    else
-    {
-        uint16_t* ind = (uint16_t*)indices;
-        for (size_t i = 0u; i < _fcCount; ++i)
-        {
-            {
-                system::IFile::success_t success;
-                context.writeContext.outputFile->write(success, &listSize, context.fileOffset, sizeof(listSize));
-                context.fileOffset += success.getBytesProcessed();
-            }
-
-            {
-                system::IFile::success_t success;
-                context.writeContext.outputFile->write(success, ind, context.fileOffset, listSize * 2);
-                context.fileOffset += success.getBytesProcessed();
-            }
-            
-            ind += listSize;
-        }
-    }
-
-    if (_forceFaces)
-        _NBL_ALIGNED_FREE(indices);
+	return writer_flags_t(asset::EWF_BINARY | asset::EWF_MESH_IS_RIGHT_HANDED);
 }
-
-void CPLYMeshWriter::writeText(const asset::ICPUMeshBuffer* _mbuf, size_t _vtxCount, size_t _fcCount, asset::E_INDEX_TYPE _idxType, void* const _indices, bool _forceFaces, const bool _vaidToWrite[4], SContext& context) const
+writer_flags_t CPLYMeshWriter::getForcedFlags()
 {
-    auto mbCopy = createCopyMBuffNormalizedReplacedWithTrueInt(_mbuf);
-
-    auto writefunc = [&context, &mbCopy, this](uint32_t _vaid, size_t _ix, size_t _cpa)
-    {
-		bool flipVerteciesAndNormals = false;
-		if (!(context.writeContext.params.flags & E_WRITER_FLAGS::EWF_MESH_IS_RIGHT_HANDED))
-			if(_vaid == 0u || _vaid == 3u)
-				flipVerteciesAndNormals = true;
-
-        uint32_t ui[4];
-        core::vectorSIMDf f;
-        const asset::E_FORMAT t = mbCopy->getAttribFormat(_vaid);
-        if (asset::isScaledFormat(t) || asset::isIntegerFormat(t))
-        {
-            mbCopy->getAttribute(ui, _vaid, _ix);
-            if (!asset::isSignedFormat(t))
-                writeVectorAsText(context, ui, _cpa, flipVerteciesAndNormals);
-            else
-            {
-                int32_t ii[4];
-                memcpy(ii, ui, 4*4);
-                writeVectorAsText(context, ii, _cpa, flipVerteciesAndNormals);
-            }
-        }
-        else
-        {
-            mbCopy->getAttribute(f, _vaid, _ix);
-            writeVectorAsText(context, f.pointer, _cpa, flipVerteciesAndNormals);
-        }
-    };
-
-    const size_t colCpa = asset::getFormatChannelCount(_mbuf->getAttribFormat(1));
-
-    for (size_t i = 0u; i < _vtxCount; ++i)
-    {
-        core::vectorSIMDf f;
-        uint32_t ui[4];
-        if (_vaidToWrite[0])
-        {
-            writefunc(0, i, 3u);
-        }
-        if (_vaidToWrite[1])
-        {
-            writefunc(1, i, colCpa);
-        }
-        if (_vaidToWrite[2])
-        {
-            writefunc(2, i, 2u);
-        }
-        if (_vaidToWrite[3])
-        {
-            writefunc(3, i, 3u);
-        }
-
-        {
-            system::IFile::success_t success;
-            context.writeContext.outputFile->write(success, "\n", context.fileOffset, 1);
-            context.fileOffset += success.getBytesProcessed();
-        }
-    }
-
-    const char* listSize = "3 ";
-    void* indices = _indices;
-    if (_forceFaces)
-    {
-        indices = _NBL_ALIGNED_MALLOC((_idxType == asset::EIT_32BIT ? 4 : 2) * 3 * _fcCount,_NBL_SIMD_ALIGNMENT);
-        if (_idxType == asset::EIT_16BIT)
-        {
-            for (uint16_t i = 0u; i < _fcCount; ++i)
-                ((uint16_t*)indices)[i] = i;
-        }
-        else
-        {
-            for (uint32_t i = 0u; i < _fcCount; ++i)
-                ((uint32_t*)indices)[i] = i;
-        }
-    }
-    if (_idxType == asset::EIT_32BIT)
-    {
-        uint32_t* ind = (uint32_t*)indices;
-        for (size_t i = 0u; i < _fcCount; ++i)
-        {
-            {
-                system::IFile::success_t success;
-                context.writeContext.outputFile->write(success, listSize, context.fileOffset, 2);
-                context.fileOffset += success.getBytesProcessed();
-            }
-
-            writeVectorAsText(context, ind, 3);
-
-            {
-                system::IFile::success_t success;
-                context.writeContext.outputFile->write(success, "\n", context.fileOffset, 1);
-                context.fileOffset += success.getBytesProcessed();
-            }
-
-            ind += 3;
-        }
-    }
-    else
-    {
-        uint16_t* ind = (uint16_t*)indices;
-        for (size_t i = 0u; i < _fcCount; ++i)
-        {
-            {
-                system::IFile::success_t success;
-                context.writeContext.outputFile->write(success, listSize, context.fileOffset, 2);
-                context.fileOffset += success.getBytesProcessed();
-            }
-
-            writeVectorAsText(context, ind, 3);
-
-            {
-                system::IFile::success_t success;
-                context.writeContext.outputFile->write(success, "\n", context.fileOffset, 1);
-                context.fileOffset += success.getBytesProcessed();
-            }
-
-            ind += 3;
-        }
-    }
-
-    if (_forceFaces)
-        _NBL_ALIGNED_FREE(indices);
+	return EWF_NONE;
 }
-
-void CPLYMeshWriter::writeAttribBinary(SContext& context, asset::ICPUMeshBuffer* _mbuf, uint32_t _vaid, size_t _ix, size_t _cpa, bool flipAttribute) const
+namespace
 {
-    uint32_t ui[4];
-    core::vectorSIMDf f;
-    asset::E_FORMAT t = _mbuf->getAttribFormat(_vaid);
-
-    if (asset::isScaledFormat(t) || asset::isIntegerFormat(t))
-    {
-        _mbuf->getAttribute(ui, _vaid, _ix);
-        if (flipAttribute)
-            ui[0] = -ui[0];
-
-        const uint32_t bytesPerCh = asset::getTexelOrBlockBytesize(t)/asset::getFormatChannelCount(t);
-        if (bytesPerCh == 1u || t == asset::EF_A2B10G10R10_UINT_PACK32 || t == asset::EF_A2B10G10R10_SINT_PACK32 || t == asset::EF_A2B10G10R10_SSCALED_PACK32 || t == asset::EF_A2B10G10R10_USCALED_PACK32)
-        {
-            uint8_t a[4];
-            for (uint32_t k = 0u; k < _cpa; ++k)
-                a[k] = ui[k];
-
-            {
-                system::IFile::success_t success;
-                context.writeContext.outputFile->write(success, a, context.fileOffset, _cpa);
-                context.fileOffset += success.getBytesProcessed();
-            }
-        }
-        else if (bytesPerCh == 2u)
-        {
-            uint16_t a[4];
-            for (uint32_t k = 0u; k < _cpa; ++k)
-                a[k] = ui[k];
-
-            {
-                system::IFile::success_t success;
-                context.writeContext.outputFile->write(success, a, context.fileOffset, 2 * _cpa);
-                context.fileOffset += success.getBytesProcessed();
-            }
-        }
-        else if (bytesPerCh == 4u)
-        {
-            {
-                system::IFile::success_t success;
-                context.writeContext.outputFile->write(success, ui, context.fileOffset, 4 * _cpa);
-                context.fileOffset += success.getBytesProcessed();
-            }
-        }
-    }
-    else
-    {
-        _mbuf->getAttribute(f, _vaid, _ix);
-        if (flipAttribute)
-            f[0] = -f[0];
-
-        {
-            system::IFile::success_t success;
-            context.writeContext.outputFile->write(success, f.pointer, context.fileOffset, 4 * _cpa);
-            context.fileOffset += success.getBytesProcessed();
-        }
-    }
-}
-
-core::smart_refctd_ptr<asset::ICPUMeshBuffer> CPLYMeshWriter::createCopyMBuffNormalizedReplacedWithTrueInt(const asset::ICPUMeshBuffer* _mbuf)
+struct Parse
 {
-    auto mbCopy = core::smart_refctd_ptr_static_cast<ICPUMeshBuffer>(_mbuf->clone(2));
-
-    for (size_t i = 0; i < ICPUMeshBuffer::MAX_VERTEX_ATTRIB_COUNT; ++i)
-    {
-        auto vaid = i;
-        asset::E_FORMAT t = _mbuf->getAttribFormat(vaid);
-    
-        if (_mbuf->getAttribBoundBuffer(vaid).buffer)
-            mbCopy->getPipeline()->getCachedCreationParams().vertexInput.attributes[vaid].format = asset::isNormalizedFormat(t) ? impl::getCorrespondingIntegerFormat(t) : t;
-    }
-
-    return mbCopy;
+	enum class ScalarType : uint8_t { Int8, UInt8, Int16, UInt16, Int32, UInt32, Float32, Float64 }; // scalar kinds; getScalarMeta() maps each one to its type-name string, byte size, integer flag and signedness
+	using SemanticDecode = SGeometryViewDecode::Prepared<SGeometryViewDecode::EMode::Semantic>; // prepared decoder type for EMode::Semantic
+	using StoredDecode = SGeometryViewDecode::Prepared<SGeometryViewDecode::EMode::Stored>; // prepared decoder type for EMode::Stored
+	struct ScalarMeta { const char* name = "float32"; uint32_t byteSize = sizeof(float); bool integer = false; bool signedType = true; }; // description of one ScalarType; the defaults equal the Float32 entry returned by getScalarMeta()
+	struct ExtraAuxView { const ICPUPolygonGeometry::SDataView* view = nullptr; uint32_t components = 0u; uint32_t auxIndex = 0u; ScalarType scalarType = ScalarType::Float32; }; // one auxiliary view together with its component count, aux index and chosen scalar type
+	struct WriteInput { const ICPUPolygonGeometry* geom = nullptr; ScalarType positionScalarType = ScalarType::Float32; const ICPUPolygonGeometry::SDataView* uvView = nullptr; ScalarType uvScalarType = ScalarType::Float32; const core::vector<ExtraAuxView>* extraAuxViews = nullptr; bool writeNormals = false; ScalarType normalScalarType = ScalarType::Float32; size_t vertexCount = 0ull; const uint32_t* indices = nullptr; size_t faceCount = 0ull; bool write16BitIndices = false; bool flipVectors = false; }; // aggregate of raw geometry/view/index pointers, per-attribute scalar types, counts and flags; its consumers lie outside this excerpt
+	static constexpr size_t ApproxTextBytesPerVertex = sizeof("0.000000 0.000000 0.000000 0.000000 0.000000 0.000000\n") - 1ull; // length of the sample text line, excluding the literal's NUL terminator
+	static constexpr size_t ApproxTextBytesPerFace = sizeof("3 4294967295 4294967295 4294967295\n") - 1ull; // same for a sample face line: a list size followed by three indices written as 4294967295 (the uint32_t maximum)
+	static constexpr size_t MaxFloatTextChars = std::numeric_limits<double>::max_digits10 + 16ull; // scratch size that appendFloat() reserves for one formatted value
+	template<typename T> // Appends the text std::to_chars produces for `value` to `out`; appends nothing when to_chars reports an error.
+	static void appendIntegral(std::string& out, const T value) { std::array<char, 32> buf = {}; const auto res = std::to_chars(buf.data(), buf.data() + buf.size(), value); if (res.ec == std::errc()) out.append(buf.data(), static_cast<size_t>(res.ptr - buf.data())); }
+	static void appendFloat(std::string& out, double value) // Appends the text that SGeometryWriterCommon::appendFloatToBuffer writes for `value` to `out`.
+	{
+		const size_t oldSize = out.size();
+		out.resize(oldSize + MaxFloatTextChars); // grow by the scratch size so the helper can write directly into the string's storage
+		char* const begin = out.data() + oldSize;
+		char* const end = begin + MaxFloatTextChars;
+		char* const cursor = SGeometryWriterCommon::appendFloatToBuffer(begin, end, value); // NOTE(review): assumes the helper returns a pointer one past the last character it wrote - confirm
+		out.resize(oldSize + static_cast<size_t>(cursor - begin)); // shrink back so only the characters up to `cursor` are kept
+	}
+	static ScalarMeta getScalarMeta(const ScalarType type) // Returns the {name, byteSize, integer, signedType} description of `type`.
+	{
+		switch (type)
+		{
+			case ScalarType::Int8: return {"int8", sizeof(int8_t), true, true};
+			case ScalarType::UInt8: return {"uint8", sizeof(uint8_t), true, false};
+			case ScalarType::Int16: return {"int16", sizeof(int16_t), true, true};
+			case ScalarType::UInt16: return {"uint16", sizeof(uint16_t), true, false};
+			case ScalarType::Int32: return {"int32", sizeof(int32_t), true, true};
+			case ScalarType::UInt32: return {"uint32", sizeof(uint32_t), true, false};
+			case ScalarType::Float64: return {"float64", sizeof(double), false, true};
+			default: return {"float32", sizeof(float), false, true}; // covers ScalarType::Float32 and any value not listed above
+		}
+	}
+	struct PreparedView // A data view paired with its output component count, scalar type, flip flag and a prepared decoder.
+	{
+		const ICPUPolygonGeometry::SDataView* view = nullptr;
+		uint32_t componentCount = 0u;
+		ScalarType scalarType = ScalarType::Float32;
+		bool flipVectors = false;
+		SemanticDecode semantic = {}; // filled by create() only when the scalar type is not an integer type
+		StoredDecode stored = {}; // filled by create() only when the scalar type is an integer type
+		static inline PreparedView create(const ICPUPolygonGeometry::SDataView& view, const uint32_t componentCount, const ScalarType scalarType, const bool flipVectors) // Builds a PreparedView that stores the address of `view` and prepares exactly one of the two decoders.
+		{
+			PreparedView retval = {.view = &view, .componentCount = componentCount, .scalarType = scalarType, .flipVectors = flipVectors};
+			const auto meta = getScalarMeta(scalarType);
+			if (meta.integer) // integer outputs use the Stored-mode decoder; the other member keeps its default value
+				retval.stored = SGeometryViewDecode::prepare<SGeometryViewDecode::EMode::Stored>(view);
+			else
+				retval.semantic = SGeometryViewDecode::prepare<SGeometryViewDecode::EMode::Semantic>(view);
+			return retval;
+		}
+	};
+	static bool isSupportedScalarFormat(const E_FORMAT format) // True when `format` is known, has channels, is integer/float/normalized/scaled, and each channel occupies 1, 2, 4 or 8 whole bytes.
+	{
+		if (format == EF_UNKNOWN)
+			return false;
+		const uint32_t channels = getFormatChannelCount(format);
+		if (channels == 0u) // also keeps the modulo and division below from dividing by zero
+			return false;
+		if (!(isIntegerFormat(format) || isFloatingPointFormat(format) || isNormalizedFormat(format) || isScaledFormat(format)))
+			return false;
+		const auto bytesPerPixel = getBytesPerPixel(format);
+		if (bytesPerPixel.getDenominator() != 1u) // reject formats whose bytes-per-pixel is not a whole number
+			return false;
+		const uint32_t pixelBytes = bytesPerPixel.getNumerator();
+		if (pixelBytes == 0u || (pixelBytes % channels) != 0u) // the pixel size must split evenly across the channels
+			return false;
+		const uint32_t bytesPerChannel = pixelBytes / channels;
+		return bytesPerChannel == 1u || bytesPerChannel == 2u || bytesPerChannel == 4u || bytesPerChannel == 8u;
+	}
+	static ScalarType selectScalarType(const E_FORMAT format) // Picks the ScalarType used for `format`; every rejected or unrecognised case falls back to Float32.
+	{
+		if (!isSupportedScalarFormat(format))
+			return ScalarType::Float32;
+		if (isNormalizedFormat(format) || isScaledFormat(format)) // normalized and scaled formats are always mapped to Float32
+			return ScalarType::Float32;
+		const uint32_t channels = getFormatChannelCount(format);
+		if (channels == 0u) // NOTE(review): looks unreachable, isSupportedScalarFormat() above already rejects zero-channel formats
+		{
+			assert(format == EF_UNKNOWN);
+			return ScalarType::Float32;
+		}
+		const auto bytesPerPixel = getBytesPerPixel(format);
+		if (bytesPerPixel.getDenominator() != 1u)
+			return ScalarType::Float32;
+		const uint32_t pixelBytes = bytesPerPixel.getNumerator();
+		if (pixelBytes == 0u || (pixelBytes % channels) != 0u)
+			return ScalarType::Float32;
+		const uint32_t bytesPerChannel = pixelBytes / channels;
+		if (isIntegerFormat(format))
+		{
+			const bool signedType = isSignedFormat(format);
+			switch (bytesPerChannel)
+			{
+				case 1u: return signedType ? ScalarType::Int8 : ScalarType::UInt8;
+				case 2u: return signedType ? ScalarType::Int16 : ScalarType::UInt16;
+				case 4u: return signedType ? ScalarType::Int32 : ScalarType::UInt32;
+				default: return ScalarType::Float64; // of the widths the support check allows, only 8-byte integer channels reach here
+			}
+		}
+		if (isFloatingPointFormat(format))
+			return bytesPerChannel >= 8u ? ScalarType::Float64 : ScalarType::Float32; // floating-point channels narrower than 8 bytes map to Float32
+		return ScalarType::Float32;
+	}
+	static bool isDirectScalarFormat(const E_FORMAT format, const ScalarType scalarType, const uint32_t componentCount, uint32_t& outByteSize) // True when `format` is non-normalized, non-scaled, has at least `componentCount` channels, and its per-channel byte size and numeric class match `scalarType`; `outByteSize` gets that byte size on success and 0 on failure.
+	{
+		outByteSize = 0u; // defined result on every early-out below
+		if (format == EF_UNKNOWN || componentCount == 0u)
+			return false;
+		if (isNormalizedFormat(format) || isScaledFormat(format))
+			return false;
+		const uint32_t channels = getFormatChannelCount(format);
+		if (channels < componentCount) // componentCount is non-zero here, so this also rejects channels == 0 before the division below
+			return false;
+		const auto bytesPerPixel = getBytesPerPixel(format);
+		if (bytesPerPixel.getDenominator() != 1u)
+			return false;
+		const uint32_t pixelBytes = bytesPerPixel.getNumerator();
+		if (pixelBytes == 0u || (pixelBytes % channels) != 0u)
+			return false;
+		const uint32_t byteSize = pixelBytes / channels;
+		const auto meta = getScalarMeta(scalarType);
+		if (byteSize != meta.byteSize) // stored channel width must equal the output scalar width
+			return false;
+		switch (scalarType) // numeric class of the format must match the requested scalar type
+		{
+			case ScalarType::Float32:
+			case ScalarType::Float64:
+				if (!isFloatingPointFormat(format))
+					return false;
+				break;
+			case ScalarType::Int8:
+			case ScalarType::Int16:
+			case ScalarType::Int32:
+				if (!isIntegerFormat(format) || !isSignedFormat(format))
+					return false;
+				break;
+			case ScalarType::UInt8:
+			case ScalarType::UInt16:
+			case ScalarType::UInt32:
+				if (!isIntegerFormat(format) || isSignedFormat(format))
+					return false;
+				break;
+		}
+		outByteSize = byteSize;
+		return true;
+	}
+	static bool writeDirectBinaryView(const ICPUPolygonGeometry::SDataView& view, const size_t ix, const uint32_t componentCount, const ScalarType scalarType, const bool flipVectors, uint8_t*& dst) // Raw-copy path: memcpy's the first `componentCount` scalars of element `ix` into `dst` and advances `dst`; returns false, leaving `dst` untouched, when the view does not qualify.
+	{
+		if (flipVectors || !dst || !view.composed.isFormatted()) // a flip has to modify values, so it can never be a plain copy
+			return false;
+		uint32_t byteSize = 0u;
+		if (!isDirectScalarFormat(view.composed.format, scalarType, componentCount, byteSize))
+			return false;
+		const uint32_t pixelBytes = getBytesPerPixel(view.composed.format).getNumerator();
+		if (view.composed.getStride() != pixelBytes) // only views whose stride equals the pixel size take this path
+			return false;
+		const void* src = view.getPointer(ix);
+		if (!src)
+			return false;
+		const size_t copyBytes = static_cast<size_t>(componentCount) * byteSize; // may be fewer bytes than the whole pixel when the format has more channels than componentCount
+		std::memcpy(dst, src, copyBytes);
+		dst += copyBytes;
+		return true;
+	}
+	static bool writeTypedViewBinary(const PreparedView& prepared, const size_t ix, uint8_t*& dst) // Appends element `ix` of `prepared` to `dst` as `componentCount` scalars of `prepared.scalarType`, advancing `dst`; returns false on a null view/dst or when decoding fails.
+	{
+		if (!prepared.view || !dst)
+			return false;
+		const auto& view = *prepared.view;
+		const auto componentCount = prepared.componentCount;
+		const auto scalarType = prepared.scalarType;
+		const auto flipVectors = prepared.flipVectors;
+		if (!dst) // redundant: dst was already checked at the top and has not changed since
+			return false;
+		if (writeDirectBinaryView(view, ix, componentCount, scalarType, flipVectors, dst)) // try the raw copy first; on false dst is untouched and we decode instead
+			return true;
+		switch (scalarType)
+		{
+			case ScalarType::Float64:
+			case ScalarType::Float32:
+			{
+				std::array<double, 4> tmp = {}; // NOTE(review): tmp[c] below relies on componentCount <= 4, which is not checked in this function
+				if (!prepared.semantic.decode(ix, tmp))
+					return false;
+				for (uint32_t c = 0u; c < componentCount; ++c)
+				{
+					double value = tmp[c];
+					if (flipVectors && c == 0u) // a flip negates only the first component
+						value = -value;
+					if (scalarType == ScalarType::Float64)
+					{
+						std::memcpy(dst, &value, sizeof(value));
+						dst += sizeof(value);
+					}
+					else
+					{
+						const float typed = static_cast<float>(value); // narrow to float for Float32 output
+						std::memcpy(dst, &typed, sizeof(typed));
+						dst += sizeof(typed);
+					}
+				}
+				return true;
+			}
+			case ScalarType::Int8:
+			case ScalarType::Int16:
+			case ScalarType::Int32:
+			{
+				std::array<int64_t, 4> tmp = {}; // same componentCount <= 4 assumption as the float path
+				if (!prepared.stored.decode(ix, tmp))
+					return false;
+				for (uint32_t c = 0u; c < componentCount; ++c)
+				{
+					int64_t value = tmp[c];
+					if (flipVectors && c == 0u)
+						value = -value; // negated in 64 bits; the narrowing casts below are not range-checked
+					switch (scalarType)
+					{
+						case ScalarType::Int8:
+						{
+							const int8_t typed = static_cast<int8_t>(value);
+							std::memcpy(dst, &typed, sizeof(typed));
+							dst += sizeof(typed);
+						} break;
+						case ScalarType::Int16:
+						{
+							const int16_t typed = static_cast<int16_t>(value);
+							std::memcpy(dst, &typed, sizeof(typed));
+							dst += sizeof(typed);
+						} break;
+						default: // remaining case of the enclosing group: Int32
+						{
+							const int32_t typed = static_cast<int32_t>(value);
+							std::memcpy(dst, &typed, sizeof(typed));
+							dst += sizeof(typed);
+						} break;
+					}
+				}
+				return true;
+			}
+			case ScalarType::UInt8:
+			case ScalarType::UInt16:
+			case ScalarType::UInt32:
+			{
+				std::array<uint64_t, 4> tmp = {}; // same componentCount <= 4 assumption as the float path
+				if (!prepared.stored.decode(ix, tmp))
+					return false;
+				for (uint32_t c = 0u; c < componentCount; ++c) // flipVectors is not applied on the unsigned path
+				{
+					switch (scalarType)
+					{
+						case ScalarType::UInt8:
+						{
+							const uint8_t typed = static_cast<uint8_t>(tmp[c]);
+							std::memcpy(dst, &typed, sizeof(typed));
+							dst += sizeof(typed);
+						} break;
+						case ScalarType::UInt16:
+						{
+							const uint16_t typed = static_cast<uint16_t>(tmp[c]);
+							std::memcpy(dst, &typed, sizeof(typed));
+							dst += sizeof(typed);
+						} break;
+						default: // remaining case of the enclosing group: UInt32
+						{
+							const uint32_t typed = static_cast<uint32_t>(tmp[c]);
+							std::memcpy(dst, &typed, sizeof(typed));
+							dst += sizeof(typed);
+						} break;
+					}
+				}
+				return true;
+			}
+		}
+		return false; // scalarType matched none of the cases above
+	}
+	static bool writeTypedViewText(std::string& output, const PreparedView& prepared, const size_t ix) // Appends element `ix` of `prepared` to `output` as text, each component followed by one space; returns false on a null view or when decoding fails.
+	{
+		if (!prepared.view)
+			return false;
+		const auto componentCount = prepared.componentCount;
+		const auto scalarType = prepared.scalarType;
+		const auto flipVectors = prepared.flipVectors;
+		switch (scalarType) // only the numeric class matters here; the scalar width does not change the text output
+		{
+			case ScalarType::Float64:
+			case ScalarType::Float32:
+			{
+				std::array<double, 4> tmp = {}; // NOTE(review): tmp[c] below relies on componentCount <= 4, which is not checked in this function
+				if (!prepared.semantic.decode(ix, tmp))
+					return false;
+				for (uint32_t c = 0u; c < componentCount; ++c)
+				{
+					double value = tmp[c];
+					if (flipVectors && c == 0u) // a flip negates only the first component
+						value = -value;
+					appendFloat(output, value);
+					output.push_back(' ');
+				}
+				return true;
+			}
+			case ScalarType::Int8:
+			case ScalarType::Int16:
+			case ScalarType::Int32:
+			{
+				std::array<int64_t, 4> tmp = {};
+				if (!prepared.stored.decode(ix, tmp))
+					return false;
+				for (uint32_t c = 0u; c < componentCount; ++c)
+				{
+					int64_t value = tmp[c];
+					if (flipVectors && c == 0u)
+						value = -value;
+					appendIntegral(output, value); // written from the 64-bit value, without narrowing to the declared scalar width
+					output.push_back(' ');
+				}
+				return true;
+			}
+			case ScalarType::UInt8:
+			case ScalarType::UInt16:
+			case ScalarType::UInt32:
+			{
+				std::array<uint64_t, 4> tmp = {};
+				if (!prepared.stored.decode(ix, tmp))
+					return false;
+				for (uint32_t c = 0u; c < componentCount; ++c) // flipVectors is not applied on the unsigned path
+				{
+					appendIntegral(output, tmp[c]);
+					output.push_back(' ');
+				}
+				return true;
+			}
+		}
+		return false; // scalarType matched none of the cases above
+	}
+	static bool writeBinaryFast(const WriteInput& input, uint8_t*& dst) // Position-only fast path: bulk-copies EF_R32G32B32_SFLOAT positions, then writes the triangle lists, advancing `dst`; returns false before writing anything when the input does not qualify.
+	{
+		if (!input.geom || !input.indices || !input.extraAuxViews || !dst || input.flipVectors || input.writeNormals || input.uvView || !input.extraAuxViews->empty() || input.positionScalarType != ScalarType::Float32) // requires: no flip, no normals, no UV, no extra aux views, positions emitted as Float32
+			return false;
+		const auto& positionView = input.geom->getPositionView();
+		if (!positionView.composed.isFormatted() || positionView.composed.format != EF_R32G32B32_SFLOAT || positionView.composed.getStride() != sizeof(hlsl::float32_t3)) // source must already be float32x3 with a stride of exactly one element
+			return false;
+		const void* src = positionView.getPointer();
+		if (!src)
+			return false;
+		const size_t vertexBytes = input.vertexCount * sizeof(hlsl::float32_t3);
+		std::memcpy(dst, src, vertexBytes); // the whole vertex block in one copy
+		dst += vertexBytes;
+		for (size_t i = 0u; i < input.faceCount; ++i)
+		{
+			*dst++ = 3u; // one-byte list length: every face has three indices
+			const uint32_t* tri = input.indices + i * 3u;
+			if (input.write16BitIndices)
+			{
+				const uint16_t tri16[3] = {static_cast<uint16_t>(tri[0]), static_cast<uint16_t>(tri[1]), static_cast<uint16_t>(tri[2])}; // narrowing casts; index values are not range-checked here
+				std::memcpy(dst, tri16, sizeof(tri16));
+				dst += sizeof(tri16);
+			}
+			else
+			{
+				std::memcpy(dst, tri, sizeof(uint32_t) * 3u);
+				dst += sizeof(uint32_t) * 3u;
+			}
+		}
+		return true;
+	}
+	static bool writeBinary(const WriteInput& input, uint8_t* dst) // Fills `dst` with the binary body: per-vertex position, optional normal, optional UV and extra aux values, followed by the triangle lists. No bounds checks are done on `dst`. Returns false on missing inputs or when any element fails to encode.
+	{
+		if (!input.geom || !input.extraAuxViews || !dst)
+			return false;
+		if (writeBinaryFast(input, dst)) // on false the fast path has not moved dst, so the general path starts at the same position
+			return true;
+		const auto& positionView = input.geom->getPositionView();
+		const auto& normalView = input.geom->getNormalView();
+		const auto& extraAuxViews = *input.extraAuxViews;
+		const PreparedView preparedPosition = PreparedView::create(positionView, 3u, input.positionScalarType, input.flipVectors);
+		const PreparedView preparedNormal = input.writeNormals ? PreparedView::create(normalView, 3u, input.normalScalarType, input.flipVectors) : PreparedView{};
+		const PreparedView preparedUV = input.uvView ? PreparedView::create(*input.uvView, 2u, input.uvScalarType, false) : PreparedView{}; // UVs are never flipped
+		core::vector<PreparedView> preparedExtraAuxViews;
+		preparedExtraAuxViews.reserve(extraAuxViews.size());
+		for (const auto& extra : extraAuxViews)
+		{
+			if (!extra.view)
+				return false;
+			preparedExtraAuxViews.push_back(PreparedView::create(*extra.view, extra.components, extra.scalarType, false)); // extra aux views are never flipped
+		}
+		for (size_t i = 0u; i < input.vertexCount; ++i) // attributes are interleaved per vertex in this fixed order
+		{
+			if (!writeTypedViewBinary(preparedPosition, i, dst))
+				return false;
+			if (input.writeNormals && !writeTypedViewBinary(preparedNormal, i, dst))
+				return false;
+			if (input.uvView && !writeTypedViewBinary(preparedUV, i, dst))
+				return false;
+			for (const auto& extra : preparedExtraAuxViews)
+				if (!writeTypedViewBinary(extra, i, dst))
+					return false;
+		}
+		if (!input.indices) // only checked after the vertex data has already been written to dst
+			return false;
+		for (size_t i = 0u; i < input.faceCount; ++i)
+		{
+			const uint8_t listSize = 3u; // every face has three indices
+			*dst++ = listSize;
+			const uint32_t* tri = input.indices + i * 3u;
+			if (input.write16BitIndices)
+			{
+				const uint16_t tri16[3] = {static_cast<uint16_t>(tri[0]), static_cast<uint16_t>(tri[1]), static_cast<uint16_t>(tri[2])}; // narrowing casts; index values are not range-checked here
+				std::memcpy(dst, tri16, sizeof(tri16));
+				dst += sizeof(tri16);
+			}
+			else
+			{
+				std::memcpy(dst, tri, sizeof(uint32_t) * 3u);
+				dst += sizeof(uint32_t) * 3u;
+			}
+		}
+		return true;
+	}
+	static bool writeText(const WriteInput& input, std::string& output) // Appends the ASCII body to `output`: one line per vertex, then one "3 a b c" line per triangle. Returns false on missing inputs or when any element fails to decode; `output` may already hold partial data in that case.
+	{
+		if (!input.geom || !input.extraAuxViews)
+			return false;
+		const auto& extraAuxViews = *input.extraAuxViews;
+		const PreparedView preparedPosition = PreparedView::create(input.geom->getPositionView(), 3u, input.positionScalarType, input.flipVectors);
+		const PreparedView preparedNormal = input.writeNormals ? PreparedView::create(input.geom->getNormalView(), 3u, input.normalScalarType, input.flipVectors) : PreparedView{};
+		const PreparedView preparedUV = input.uvView ? PreparedView::create(*input.uvView, 2u, input.uvScalarType, false) : PreparedView{}; // UVs are never flipped
+		core::vector<PreparedView> preparedExtraAuxViews;
+		preparedExtraAuxViews.reserve(extraAuxViews.size());
+		for (const auto& extra : extraAuxViews)
+		{
+			if (!extra.view)
+				return false;
+			preparedExtraAuxViews.push_back(PreparedView::create(*extra.view, extra.components, extra.scalarType, false)); // extra aux views are never flipped
+		}
+		for (size_t i = 0u; i < input.vertexCount; ++i) // same attribute order as the binary path
+		{
+			if (!writeTypedViewText(output, preparedPosition, i))
+				return false;
+			if (input.writeNormals && !writeTypedViewText(output, preparedNormal, i))
+				return false;
+			if (input.uvView && !writeTypedViewText(output, preparedUV, i))
+				return false;
+			for (const auto& extra : preparedExtraAuxViews)
+				if (!writeTypedViewText(output, extra, i))
+					return false;
+			output.push_back('\n'); // each value already ends with a space, so the line has a trailing space before the newline
+		}
+		if (!input.indices) // only checked after the vertex lines have already been appended
+			return false;
+		for (size_t i = 0u; i < input.faceCount; ++i)
+		{
+			const uint32_t* tri = input.indices + i * 3u;
+			output.append("3 "); // list length: every face has three indices
+			appendIntegral(output, tri[0]);
+			output.push_back(' ');
+			appendIntegral(output, tri[1]);
+			output.push_back(' ');
+			appendIntegral(output, tri[2]);
+			output.push_back('\n');
+		}
+		return true;
+	}
+};
 }
-
-std::string CPLYMeshWriter::getTypeString(asset::E_FORMAT _t)
+bool CPLYMeshWriter::writeAsset(system::IFile* _file, const SAssetWriteParams& _params, IAssetWriterOverride* _override) // Writes the single polygon geometry found under _params.rootAsset as a PLY file (ASCII or binary little-endian, chosen by EWF_BINARY); logs an error and returns false on every validation failure.
 {
-    using namespace asset;
-
-    if (isFloatingPointFormat(_t))
-        return "float";
-
-    switch (_t)
-    {
-    case EF_R8_SNORM:
-    case EF_R8_SINT:
-    case EF_R8_SSCALED:
-    case EF_R8G8_SNORM:
-    case EF_R8G8_SINT:
-    case EF_R8G8_SSCALED:
-    case EF_R8G8B8_SNORM:
-    case EF_R8G8B8_SINT:
-    case EF_R8G8B8_SSCALED:
-    case EF_R8G8B8A8_SNORM:
-    case EF_R8G8B8A8_SINT:
-    case EF_R8G8B8A8_SSCALED:
-    case EF_B8G8R8A8_UNORM:
-    case EF_A2B10G10R10_SNORM_PACK32:
-    case EF_A2B10G10R10_SINT_PACK32:
-    case EF_A2B10G10R10_SSCALED_PACK32:
-    case EF_A2R10G10B10_SNORM_PACK32:
-        return "char";
-
-    case EF_R8_UNORM:
-    case EF_R8_UINT:
-    case EF_R8_USCALED:
-    case EF_R8G8_UNORM:
-    case EF_R8G8_UINT:
-    case EF_R8G8_USCALED:
-    case EF_R8G8B8_UNORM:
-    case EF_R8G8B8_UINT:
-    case EF_R8G8B8_USCALED:
-    case EF_R8G8B8A8_UNORM:
-    case EF_R8G8B8A8_UINT:
-    case EF_R8G8B8A8_USCALED:
-    case EF_A2R10G10B10_UNORM_PACK32:
-    case EF_A2B10G10R10_UNORM_PACK32:
-    case EF_A2B10G10R10_UINT_PACK32:
-    case EF_A2B10G10R10_USCALED_PACK32:
-        return "uchar";
-
-    case EF_R16_UNORM:
-    case EF_R16_UINT:
-    case EF_R16_USCALED:
-    case EF_R16G16_UNORM:
-    case EF_R16G16_UINT:
-    case EF_R16G16_USCALED:
-    case EF_R16G16B16_UNORM:
-    case EF_R16G16B16_UINT:
-    case EF_R16G16B16_USCALED:
-    case EF_R16G16B16A16_UNORM:
-    case EF_R16G16B16A16_UINT:
-    case EF_R16G16B16A16_USCALED:
-        return "ushort";
-
-    case EF_R16_SNORM:
-    case EF_R16_SINT:
-    case EF_R16_SSCALED:
-    case EF_R16G16_SNORM:
-    case EF_R16G16_SINT:
-    case EF_R16G16_SSCALED:
-    case EF_R16G16B16_SNORM:
-    case EF_R16G16B16_SINT:
-    case EF_R16G16B16_SSCALED:
-    case EF_R16G16B16A16_SNORM:
-    case EF_R16G16B16A16_SINT:
-    case EF_R16G16B16A16_SSCALED:
-        return "short";
-
-    case EF_R32_UINT:
-    case EF_R32G32_UINT:
-    case EF_R32G32B32_UINT:
-    case EF_R32G32B32A32_UINT:
-        return "uint";
-
-    case EF_R32_SINT:
-    case EF_R32G32_SINT:
-    case EF_R32G32B32_SINT:
-    case EF_R32G32B32A32_SINT:
-        return "int";
-
-    default:
-        return "";
-    }
+	using ScalarType = Parse::ScalarType;
+	using clock_t = std::chrono::high_resolution_clock;
+	SFileWriteTelemetry ioTelemetry = {};
+	if (!_override)
+		getDefaultOverride(_override);
+	if (!_file || !_params.rootAsset)
+		return _params.logger.log("PLY writer: missing output file or root asset.", system::ILogger::ELL_ERROR), false;
+	const auto items = SGeometryWriterCommon::collectPolygonGeometryWriteItems(_params.rootAsset);
+	if (items.size() != 1u) // this writer emits exactly one geometry per file
+		return _params.logger.log("PLY writer: expected exactly one polygon geometry to write.", system::ILogger::ELL_ERROR), false;
+	const auto& item = items.front();
+	const auto* geom = item.geometry;
+	if (!geom || !geom->valid())
+		return _params.logger.log("PLY writer: root asset is not a valid polygon geometry.", system::ILogger::ELL_ERROR), false;
+	if (!SGeometryWriterCommon::isIdentityTransform(item.transform)) // the item's transform is never applied to the data, so only identity is accepted
+		return _params.logger.log("PLY writer: transformed scene or collection export is not supported.", system::ILogger::ELL_ERROR), false;
+	SAssetWriteContext ctx = {_params, _file};
+	system::IFile* file = _override->getOutputFile(_file, ctx, {geom, 0u}); // all writing below goes to the file chosen by the override
+	if (!file)
+		return _params.logger.log("PLY writer: output override returned null file.", system::ILogger::ELL_ERROR), false;
+	const auto& positionView = geom->getPositionView();
+	const auto& normalView = geom->getNormalView();
+	const size_t vertexCount = positionView.getElementCount();
+	if (vertexCount == 0ull)
+		return _params.logger.log("PLY writer: empty position view.", system::ILogger::ELL_ERROR), false;
+	const bool writeNormals = static_cast<bool>(normalView); // normals are written whenever the geometry has a normal view
+	if (writeNormals && normalView.getElementCount() != vertexCount)
+		return _params.logger.log("PLY writer: normal vertex count mismatch.", system::ILogger::ELL_ERROR), false;
+	const ICPUPolygonGeometry::SDataView* uvView = SGeometryWriterCommon::getAuxViewAt(geom, SPLYPolygonGeometryAuxLayout::UV0, vertexCount);
+	if (uvView && getFormatChannelCount(uvView->composed.format) != 2u) // written as u/v only with exactly two channels; otherwise the loop below treats it like any other aux view
+		uvView = nullptr;
+	core::vector<Parse::ExtraAuxView> extraAuxViews;
+	const auto& auxViews = geom->getAuxAttributeViews();
+	extraAuxViews.reserve(auxViews.size());
+	for (uint32_t auxIx = 0u; auxIx < static_cast<uint32_t>(auxViews.size()); ++auxIx)
+	{
+		const auto& view = auxViews[auxIx];
+		if (!view || (uvView && auxIx == SPLYPolygonGeometryAuxLayout::UV0)) // skip empty views and the slot already emitted as u/v
+			continue;
+		if (view.getElementCount() != vertexCount) // views that are not per-vertex are skipped without a log message
+			continue;
+		const uint32_t channels = getFormatChannelCount(view.composed.format);
+		if (channels == 0u)
+			continue;
+		const uint32_t components = std::min(4u, channels); // at most four components per aux view are written
+		extraAuxViews.push_back({&view, components, auxIx, Parse::selectScalarType(view.composed.format)});
+	}
+	_params.logger.log("PLY writer input: file=%s pos_fmt=%u pos_stride=%u pos_count=%llu normal_fmt=%u normal_stride=%u normal_count=%llu uv_fmt=%u uv_stride=%u uv_count=%llu aux=%u",
+		system::ILogger::ELL_INFO, file->getFileName().string().c_str(), static_cast<uint32_t>(positionView.composed.format), positionView.composed.getStride(),
+		static_cast<unsigned long long>(positionView.getElementCount()), static_cast<uint32_t>(normalView.composed.format), normalView.composed.getStride(),
+		static_cast<unsigned long long>(normalView.getElementCount()), uvView ? static_cast<uint32_t>(uvView->composed.format) : static_cast<uint32_t>(EF_UNKNOWN),
+		uvView ? uvView->composed.getStride() : 0u, uvView ? static_cast<unsigned long long>(uvView->getElementCount()) : 0ull, static_cast<uint32_t>(extraAuxViews.size()));
+	const auto* indexing = geom->getIndexingCallback();
+	if (!indexing)
+		return _params.logger.log("PLY writer: missing indexing callback.", system::ILogger::ELL_ERROR), false;
+	if (indexing->knownTopology() != E_PRIMITIVE_TOPOLOGY::EPT_TRIANGLE_LIST)
+		return _params.logger.log("PLY writer: only triangle-list topology is supported.", system::ILogger::ELL_ERROR), false;
+	const auto& indexView = geom->getIndexView();
+	core::vector<uint32_t> indexData; // backing storage whenever the indices cannot be used in place
+	const uint32_t* indices = nullptr;
+	size_t faceCount = 0ull;
+	if (indexView)
+	{
+		const size_t indexCount = indexView.getElementCount();
+		if ((indexCount % 3u) != 0u)
+			return _params.logger.log("PLY writer: failed to validate triangle indexing.", system::ILogger::ELL_ERROR), false;
+		const void* src = indexView.getPointer();
+		if (!src)
+			return _params.logger.log("PLY writer: missing index buffer pointer.", system::ILogger::ELL_ERROR), false;
+		if (indexView.composed.format == EF_R32_UINT && indexView.composed.getStride() == sizeof(uint32_t)) // 32-bit indices with a 4-byte stride are used in place, no copy
+			indices = reinterpret_cast<const uint32_t*>(src);
+		else if (indexView.composed.format == EF_R16_UINT && indexView.composed.getStride() == sizeof(uint16_t)) // 16-bit indices with a 2-byte stride are widened into indexData
+		{
+			const auto* src16 = reinterpret_cast<const uint16_t*>(src);
+			indexData.resize(indexCount);
+			for (size_t i = 0u; i < indexCount; ++i)
+				indexData[i] = src16[i];
+			indices = indexData.data();
+		}
+		else // any other index format or stride is decoded element by element
+		{
+			indexData.resize(indexCount);
+			for (size_t i = 0u; i < indexCount; ++i)
+			{
+				hlsl::uint32_t4 decoded = {};
+				if (!indexView.decodeElement(i, decoded))
+					return _params.logger.log("PLY writer: failed to decode index view.", system::ILogger::ELL_ERROR), false;
+				indexData[i] = decoded.x;
+			}
+			indices = indexData.data();
+		}
+		faceCount = indexCount / 3u;
+	}
+	else // no index view: generate the implicit indices 0..vertexCount-1
+	{
+		if ((vertexCount % 3u) != 0u)
+			return _params.logger.log("PLY writer: failed to derive triangle indexing from positions.", system::ILogger::ELL_ERROR), false;
+		indexData.resize(vertexCount);
+		for (size_t i = 0u; i < vertexCount; ++i)
+			indexData[i] = static_cast<uint32_t>(i);
+		indices = indexData.data();
+		faceCount = vertexCount / 3u;
+	}
+	const auto flags = _override->getAssetWritingFlags(ctx, geom, 0u);
+	const bool binary = flags.hasAnyFlag(E_WRITER_FLAGS::EWF_BINARY);
+	const bool flipVectors = !flags.hasAnyFlag(E_WRITER_FLAGS::EWF_MESH_IS_RIGHT_HANDED); // vectors are flipped unless the mesh is flagged right-handed
+	const bool write16BitIndices = vertexCount <= static_cast<size_t>(std::numeric_limits<uint16_t>::max()) + 1ull; // decided from vertexCount alone (<= 65536); the index values themselves are not range-checked
+	ScalarType positionScalarType = Parse::selectScalarType(positionView.composed.format);
+	if (flipVectors && Parse::getScalarMeta(positionScalarType).integer && !Parse::getScalarMeta(positionScalarType).signedType) // an unsigned integer type cannot hold the negated component, so fall back to Float32
+		positionScalarType = ScalarType::Float32;
+	ScalarType normalScalarType = Parse::selectScalarType(normalView.composed.format);
+	if (flipVectors && Parse::getScalarMeta(normalScalarType).integer && !Parse::getScalarMeta(normalScalarType).signedType) // same fallback as for positions
+		normalScalarType = ScalarType::Float32;
+	const ScalarType uvScalarType = uvView ? Parse::selectScalarType(uvView->composed.format) : ScalarType::Float32;
+	const auto positionMeta = Parse::getScalarMeta(positionScalarType);
+	const auto normalMeta = Parse::getScalarMeta(normalScalarType);
+	const auto uvMeta = Parse::getScalarMeta(uvScalarType);
+	size_t extraAuxBytesPerVertex = 0ull; // used only for the binary body size below
+	for (const auto& extra : extraAuxViews)
+		extraAuxBytesPerVertex += static_cast<size_t>(extra.components) * Parse::getScalarMeta(extra.scalarType).byteSize;
+	std::ostringstream headerBuilder; // property order here must match the per-vertex order in Parse::writeBinary / Parse::writeText
+	headerBuilder << "ply\n";
+	headerBuilder << (binary ? "format binary_little_endian 1.0" : "format ascii 1.0");
+	headerBuilder << "\ncomment Nabla " << NABLA_SDK_VERSION;
+	headerBuilder << "\nelement vertex " << vertexCount << "\n";
+	headerBuilder << "property " << positionMeta.name << " x\n";
+	headerBuilder << "property " << positionMeta.name << " y\n";
+	headerBuilder << "property " << positionMeta.name << " z\n";
+	if (writeNormals)
+	{
+		headerBuilder << "property " << normalMeta.name << " nx\n";
+		headerBuilder << "property " << normalMeta.name << " ny\n";
+		headerBuilder << "property " << normalMeta.name << " nz\n";
+	}
+	if (uvView)
+	{
+		headerBuilder << "property " << uvMeta.name << " u\n";
+		headerBuilder << "property " << uvMeta.name << " v\n";
+	}
+	for (const auto& extra : extraAuxViews)
+	{
+		const auto extraMeta = Parse::getScalarMeta(extra.scalarType);
+		for (uint32_t component = 0u; component < extra.components; ++component)
+		{
+			headerBuilder << "property " << extraMeta.name << " aux" << extra.auxIndex; // named aux<slot>, with a _<component> suffix when the view has more than one component
+			if (extra.components > 1u)
+				headerBuilder << "_" << component;
+			headerBuilder << "\n";
+		}
+	}
+	headerBuilder << "element face " << faceCount;
+	headerBuilder << (write16BitIndices ? "\nproperty list uchar uint16 vertex_indices\n" : "\nproperty list uchar uint32 vertex_indices\n");
+	headerBuilder << "end_header\n";
+	const std::string header = headerBuilder.str();
+	const Parse::WriteInput input = {.geom = geom, .positionScalarType = positionScalarType, .uvView = uvView, .uvScalarType = uvScalarType, .extraAuxViews = &extraAuxViews, .writeNormals = writeNormals, .normalScalarType = normalScalarType, .vertexCount = vertexCount, .indices = indices, .faceCount = faceCount, .write16BitIndices = write16BitIndices, .flipVectors = flipVectors};
+	bool writeOk = false;
+	size_t outputBytes = 0ull;
+	double writeIoMs = 0.0;
+	auto writePayload = [&](const void* bodyData, const size_t bodySize) -> bool { // writes header + body through the resolved I/O plan and logs statistics; updates writeOk, outputBytes and writeIoMs
+		const size_t outputSize = header.size() + bodySize;
+		const auto ioPlan = impl::SFileAccess::resolvePlan(_params.ioPolicy, static_cast<uint64_t>(outputSize), true, file);
+		if (impl::SFileAccess::logInvalidPlan(_params.logger, "PLY writer", file->getFileName().string().c_str(), ioPlan))
+			return false;
+		outputBytes = outputSize;
+		const SInterchangeIO::SBufferRange writeBuffers[] = {{.data = header.data(), .byteCount = header.size()}, {.data = bodyData, .byteCount = bodySize}}; // header and body are passed as two ranges, without concatenating them first
+		const auto ioStart = clock_t::now();
+		writeOk = SInterchangeIO::writeBuffersWithPolicy(file, ioPlan, writeBuffers, &ioTelemetry);
+		writeIoMs = std::chrono::duration<double, std::milli>(clock_t::now() - ioStart).count();
+		const uint64_t ioMinWrite = ioTelemetry.getMinOrZero();
+		const uint64_t ioAvgWrite = ioTelemetry.getAvgOrZero();
+		impl::SFileAccess::logTinyIO(_params.logger, "PLY writer", file->getFileName().string().c_str(), ioTelemetry, static_cast<uint64_t>(outputBytes), _params.ioPolicy, "writes");
+		_params.logger.log("PLY writer stats: file=%s bytes=%llu vertices=%llu faces=%llu binary=%d io_writes=%llu io_min_write=%llu io_avg_write=%llu io_req=%s io_eff=%s io_chunk=%llu io_reason=%s",
+			system::ILogger::ELL_PERFORMANCE, file->getFileName().string().c_str(), static_cast<unsigned long long>(outputBytes),
+			static_cast<unsigned long long>(vertexCount), static_cast<unsigned long long>(faceCount), binary ? 1 : 0,
+			static_cast<unsigned long long>(ioTelemetry.callCount), static_cast<unsigned long long>(ioMinWrite), static_cast<unsigned long long>(ioAvgWrite),
+			system::to_string(_params.ioPolicy.strategy).c_str(), system::to_string(ioPlan.strategy).c_str(), static_cast<unsigned long long>(ioPlan.chunkSizeBytes()), ioPlan.reason);
+		return writeOk;
+	};
+	if (binary)
+	{
+		const size_t vertexStride = static_cast<size_t>(positionMeta.byteSize) * 3ull + (writeNormals ? static_cast<size_t>(normalMeta.byteSize) * 3ull : 0ull) + (uvView ? static_cast<size_t>(uvMeta.byteSize) * 2ull : 0ull) + extraAuxBytesPerVertex; // bytes per vertex record: 3 position + optional 3 normal + optional 2 uv scalars + extra aux
+		const size_t faceStride = sizeof(uint8_t) + (write16BitIndices ? sizeof(uint16_t) : sizeof(uint32_t)) * 3u; // one length byte plus three indices
+		const size_t bodySize = vertexCount * vertexStride + faceCount * faceStride;
+		core::vector<uint8_t> body;
+		const auto fillStart = clock_t::now();
+		body.resize(bodySize); // must be exact: Parse::writeBinary receives only a raw pointer and does no bounds checking
+		if (!Parse::writeBinary(input, body.data()))
+			return _params.logger.log("PLY writer: binary payload generation failed.", system::ILogger::ELL_ERROR), false;
+		const auto fillMs = std::chrono::duration<double, std::milli>(clock_t::now() - fillStart).count();
+		const bool ok = writePayload(body.data(), body.size());
+		_params.logger.log("PLY writer stages: file=%s header=%llu body=%llu fill=%.3f ms io=%.3f ms", system::ILogger::ELL_PERFORMANCE, file->getFileName().string().c_str(), static_cast<unsigned long long>(header.size()), static_cast<unsigned long long>(body.size()), fillMs, writeIoMs);
+		return ok;
+	}
+	std::string body; // ASCII path from here on
+	body.reserve(vertexCount * Parse::ApproxTextBytesPerVertex + faceCount * Parse::ApproxTextBytesPerFace); // capacity hint only; the string still grows if the estimate is too small
+	const auto fillStart = clock_t::now();
+	if (!Parse::writeText(input, body))
+		return _params.logger.log("PLY writer: text payload generation failed.", system::ILogger::ELL_ERROR), false;
+	const auto fillMs = std::chrono::duration<double, std::milli>(clock_t::now() - fillStart).count();
+	const bool ok = writePayload(body.data(), body.size());
+	_params.logger.log("PLY writer stages: file=%s header=%llu body=%llu fill=%.3f ms io=%.3f ms", system::ILogger::ELL_PERFORMANCE, file->getFileName().string().c_str(), static_cast<unsigned long long>(header.size()), static_cast<unsigned long long>(body.size()), fillMs, writeIoMs);
+	return ok;
+}
 }
-
-} // end namespace
-} // end namespace
-
 #endif // _NBL_COMPILE_WITH_PLY_WRITER_
-
diff --git a/src/nbl/asset/interchange/CPLYMeshWriter.h b/src/nbl/asset/interchange/CPLYMeshWriter.h
index e709ffa0fe..4adacc4c68 100644
--- a/src/nbl/asset/interchange/CPLYMeshWriter.h
+++ b/src/nbl/asset/interchange/CPLYMeshWriter.h
@@ -1,79 +1,24 @@
-// Copyright (C) 2019-2025 - DevSH Graphics Programming Sp. z O.O.
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine" and was originally part of the "Irrlicht Engine"
 // For conditions of distribution and use, see copyright notice in nabla.h
 // See the original file in irrlicht source for authors
 #ifndef _NBL_ASSET_PLY_MESH_WRITER_H_INCLUDED_
 #define _NBL_ASSET_PLY_MESH_WRITER_H_INCLUDED_
-
-
-#include "nbl/asset/ICPUPolygonGeometry.h"
 #include "nbl/asset/interchange/IGeometryWriter.h"
-
-#include <iomanip>
-
-
 namespace nbl::asset
 {
-
-//! class to write PLY mesh files
+//! Geometry writer capable of emitting PLY mesh files (ASCII or binary little-endian).
 class CPLYMeshWriter : public IGeometryWriter
 {
 	public:
 		CPLYMeshWriter();
 
-        virtual const char** getAssociatedFileExtensions() const
-        {
-            static const char* ext[]{ "ply", nullptr };
-            return ext;
-        }
-
-        virtual uint32_t getSupportedFlags() override { return asset::EWF_BINARY; }
-
-        virtual uint32_t getForcedFlags() { return 0u; }
-
-        virtual bool writeAsset(system::IFile* _file, const SAssetWriteParams& _params, IAssetWriterOverride* _override = nullptr) override;
+		const char** getAssociatedFileExtensions() const override; // no longer defined inline; NOTE(review): presumably still the null-terminated { "ply" } list of the removed version - confirm in the .cpp
 
-    private:
+		writer_flags_t getSupportedFlags() override; // return type changed from uint32_t to writer_flags_t; body no longer inline
+		writer_flags_t getForcedFlags() override; // now explicitly marked override; return type changed from uint32_t to writer_flags_t
 
-        struct SContext
-        {
-            SAssetWriteContext writeContext;
-            size_t fileOffset = 0;
-        };
-
-        void writeBinary(const ICPUPolygonGeometry* geom, size_t _vtxCount, size_t _fcCount, asset::E_INDEX_TYPE _idxType, void* const _indices, bool _forceFaces, const bool _vaidToWrite[4], SContext& context) const;
-        void writeText(const ICPUPolygonGeometry* geom, size_t _vtxCount, size_t _fcCount, asset::E_INDEX_TYPE _idxType, void* const _indices, bool _forceFaces, const bool _vaidToWrite[4], SContext& context) const;
-
-        void writeAttribBinary(SContext& context, ICPUPolygonGeometry* geom, uint32_t _vaid, size_t _ix, size_t _cpa, bool flipAttribute = false) const;
-
-        //! Creates new geometry with the same attribute buffers mapped but with normalized types changed to corresponding true integer types.
-        static core::smart_refctd_ptr<ICPUPolygonGeometry> createCopyNormalizedReplacedWithTrueInt(const ICPUPolygonGeometry* geom);
-
-        static std::string getTypeString(asset::E_FORMAT _t);
-
-        template<typename T>
-        void writeVectorAsText(SContext& context, const T* _vec, size_t _elementsToWrite, bool flipVectors = false) const
-        {
-			constexpr size_t xID = 0u;
-            std::stringstream ss;
-            ss << std::fixed;
-			bool currentFlipOnVariable = false;
-			for (size_t i = 0u; i < _elementsToWrite; ++i)
-			{
-				if (flipVectors && i == xID)
-					currentFlipOnVariable = true;
-				else
-					currentFlipOnVariable = false;
-
-					ss << std::setprecision(6) << _vec[i] * (currentFlipOnVariable ? -1 : 1) << " ";
-			}
-            auto str = ss.str();
-
-            system::IFile::success_t succ;
-            context.writeContext.outputFile->write(succ, str.c_str(), context.fileOffset, str.size());
-            context.fileOffset += succ.getBytesProcessed();
-        }
+		bool writeAsset(system::IFile* _file, const SAssetWriteParams& _params, IAssetWriterOverride* _override = nullptr) override; // returns false (after logging an error) when the input cannot be written; a null _override selects the default override
 };
-
 } // end namespace
 #endif
diff --git a/src/nbl/asset/interchange/CSTLMeshFileLoader.cpp b/src/nbl/asset/interchange/CSTLMeshFileLoader.cpp
index d00c37cf10..a92b86f839 100644
--- a/src/nbl/asset/interchange/CSTLMeshFileLoader.cpp
+++ b/src/nbl/asset/interchange/CSTLMeshFileLoader.cpp
@@ -1,437 +1,629 @@
-// Copyright (C) 2019 - DevSH Graphics Programming Sp. z O.O.
+#ifdef _NBL_COMPILE_WITH_STL_LOADER_
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine" and was originally part of the "Irrlicht Engine"
 // For conditions of distribution and use, see copyright notice in nabla.h
 // See the original file in irrlicht source for authors
-
 #include "CSTLMeshFileLoader.h"
-
-#ifdef _NBL_COMPILE_WITH_STL_LOADER_
-
+#include "impl/SFileAccess.h"
+#include "impl/STextParse.h"
 #include "nbl/asset/asset.h"
-
-#include "nbl/asset/IAssetManager.h"
-
-#include "nbl/system/ISystem.h"
+#include "nbl/asset/format/convertColor.h"
+#include "nbl/asset/interchange/SGeometryContentHash.h"
+#include "nbl/asset/interchange/SGeometryLoaderCommon.h"
+#include "nbl/asset/interchange/SSTLPolygonGeometryAuxLayout.h"
+#include "nbl/asset/interchange/SInterchangeIO.h"
+#include "nbl/asset/interchange/SLoaderRuntimeTuning.h"
+#include "nbl/asset/metadata/CSTLMetadata.h"
+#include "nbl/asset/utils/CPolygonGeometryManipulator.h"
+#include "nbl/asset/utils/SGeometryNormalCommon.h"
+#include "nbl/builtin/hlsl/shapes/AABBAccumulator.hlsl"
+#include "nbl/core/hash/blake.h"
 #include "nbl/system/IFile.h"
-
-using namespace nbl;
-using namespace nbl::asset;
-
-constexpr auto POSITION_ATTRIBUTE = 0;
-constexpr auto COLOR_ATTRIBUTE = 1;
-constexpr auto UV_ATTRIBUTE = 2;
-constexpr auto NORMAL_ATTRIBUTE = 3;
-
-CSTLMeshFileLoader::CSTLMeshFileLoader(asset::IAssetManager* _m_assetMgr)
-	: IRenderpassIndependentPipelineLoader(_m_assetMgr), m_assetMgr(_m_assetMgr)
+#include <optional>
+namespace nbl::asset
 {
-	
-}
-
-void CSTLMeshFileLoader::initialize()
+namespace
+{
+struct Parse
 {
-	IRenderpassIndependentPipelineLoader::initialize();
+	using Common = impl::TextParse;
+	struct LayoutProbe { bool hasPrefix = false; bool startsWithSolid = false; bool binaryBySize = false; uint32_t triangleCount = 0u; }; // probeLayout() result: hasPrefix = the binary prefix (header + triangle count) was available; startsWithSolid = the file begins with the 6-byte text marker; binaryBySize = prefix + triangleCount * record size equals the file size; triangleCount = value copied from the prefix
+	static hlsl::float32_t3 resolveStoredNormal(const hlsl::float32_t3& fileNormal) { const float fileLen2 = hlsl::dot(fileNormal, fileNormal); return (fileLen2 > 0.f && std::abs(fileLen2 - 1.f) < 1e-4f) ? fileNormal : SGeometryNormalCommon::normalizeOrZero(fileNormal); } // returns fileNormal untouched when its squared length is within 1e-4 of 1, otherwise whatever SGeometryNormalCommon::normalizeOrZero yields for it
+	static void pushTriangleReversed(const std::array<hlsl::float32_t3, 3>& p, core::vector<hlsl::float32_t3>& positions) { positions.push_back(p[2u]); positions.push_back(p[1u]); positions.push_back(p[0u]); } // appends the three corners to positions in reverse order: p[2], p[1], p[0]
+	static uint32_t decodeViscamColorToB8G8R8A8(const uint16_t packedColor) { std::array<const void*, 4> src = {&packedColor}; uint32_t outColor = 0u; convertColor<EF_A1R5G5B5_UNORM_PACK16, EF_B8G8R8A8_UNORM>(src.data(), &outColor, 0u, 0u); return outColor; } // converts one 16-bit EF_A1R5G5B5_UNORM_PACK16 value to EF_B8G8R8A8_UNORM through convertColor; only the first of the four source-pointer slots is set, the rest stay null
 
-	auto precomputeAndCachePipeline = [&](bool withColorAttribute)
+	struct Context // per-load state plus the byte-layout constants this loader uses for binary files
 	{
-		auto getShaderDefaultPaths = [&]() -> std::pair<std::string_view, std::string_view>
-		{
-			if (withColorAttribute)
-				return std::make_pair("nbl/builtin/material/debug/vertex_color/specialized_shader.vert", "nbl/builtin/material/debug/vertex_color/specialized_shader.frag");
-			else
-				return std::make_pair("nbl/builtin/material/debug/vertex_normal/specialized_shader.vert", "nbl/builtin/material/debug/vertex_normal/specialized_shader.frag");
-		 };
-
-		auto defaultOverride = IAssetLoaderOverride(m_assetMgr);
-		const std::string pipelineCacheHash = getPipelineCacheKey(withColorAttribute).data();
-		const uint32_t _hierarchyLevel = 0;
-		const IAssetLoader::SAssetLoadContext fakeContext(IAssetLoader::SAssetLoadParams{}, nullptr);
-
-		const asset::IAsset::E_TYPE types[]{ asset::IAsset::ET_RENDERPASS_INDEPENDENT_PIPELINE, (asset::IAsset::E_TYPE)0u };
-		auto pipelineBundle = defaultOverride.findCachedAsset(pipelineCacheHash, types, fakeContext, _hierarchyLevel + ICPURenderpassIndependentPipeline::DESC_SET_HIERARCHYLEVELS_BELOW);
-		if (pipelineBundle.getContents().empty())
-		{
-			auto mbVertexShader = core::smart_refctd_ptr<ICPUSpecializedShader>();
-			auto mbFragmentShader = core::smart_refctd_ptr<ICPUSpecializedShader>();
-			{
-				const IAsset::E_TYPE types[]{ IAsset::E_TYPE::ET_SPECIALIZED_SHADER, static_cast<IAsset::E_TYPE>(0u) };
-				const auto shaderPaths = getShaderDefaultPaths();
-
-				auto vertexShaderBundle = m_assetMgr->findAssets(shaderPaths.first.data(), types);
-				auto fragmentShaderBundle = m_assetMgr->findAssets(shaderPaths.second.data(), types);
-
-				mbVertexShader = core::smart_refctd_ptr_static_cast<ICPUSpecializedShader>(vertexShaderBundle->begin()->getContents().begin()[0]);
-				mbFragmentShader = core::smart_refctd_ptr_static_cast<ICPUSpecializedShader>(fragmentShaderBundle->begin()->getContents().begin()[0]);
-			}
-
-			auto defaultOverride = IAssetLoaderOverride(m_assetMgr);
-
-			const IAssetLoader::SAssetLoadContext fakeContext(IAssetLoader::SAssetLoadParams{}, nullptr);
-			auto mbBundlePipelineLayout = defaultOverride.findDefaultAsset<ICPUPipelineLayout>("nbl/builtin/pipeline_layout/loader/STL", fakeContext, _hierarchyLevel + ICPURenderpassIndependentPipeline::PIPELINE_LAYOUT_HIERARCHYLEVELS_BELOW);
-			auto mbPipelineLayout = mbBundlePipelineLayout.first;
-
-			auto const positionFormatByteSize = getTexelOrBlockBytesize(EF_R32G32B32_SFLOAT);
-			auto const colorFormatByteSize = withColorAttribute ? getTexelOrBlockBytesize(EF_B8G8R8A8_UNORM) : 0;
-			auto const normalFormatByteSize = getTexelOrBlockBytesize(EF_A2B10G10R10_SNORM_PACK32);
-
-			SVertexInputParams mbInputParams;
-			const auto stride = positionFormatByteSize + colorFormatByteSize + normalFormatByteSize;
-			mbInputParams.enabledBindingFlags |= core::createBitmask({ 0 });
-			mbInputParams.enabledAttribFlags |= core::createBitmask({ POSITION_ATTRIBUTE, NORMAL_ATTRIBUTE, withColorAttribute ? COLOR_ATTRIBUTE : 0 });
-			mbInputParams.bindings[0] = { stride, EVIR_PER_VERTEX };
-
-			mbInputParams.attributes[POSITION_ATTRIBUTE].format = EF_R32G32B32_SFLOAT;
-			mbInputParams.attributes[POSITION_ATTRIBUTE].relativeOffset = 0;
-			mbInputParams.attributes[POSITION_ATTRIBUTE].binding = 0;
-
-			if (withColorAttribute)
-			{
-				mbInputParams.attributes[COLOR_ATTRIBUTE].format = EF_R32G32B32_SFLOAT;
-				mbInputParams.attributes[COLOR_ATTRIBUTE].relativeOffset = positionFormatByteSize;
-				mbInputParams.attributes[COLOR_ATTRIBUTE].binding = 0;
-			}
-
-			mbInputParams.attributes[NORMAL_ATTRIBUTE].format = EF_R32G32B32_SFLOAT;
-			mbInputParams.attributes[NORMAL_ATTRIBUTE].relativeOffset = positionFormatByteSize + colorFormatByteSize;
-			mbInputParams.attributes[NORMAL_ATTRIBUTE].binding = 0;
-
-			SBlendParams blendParams;
-			SPrimitiveAssemblyParams primitiveAssemblyParams;
-			primitiveAssemblyParams.primitiveType = E_PRIMITIVE_TOPOLOGY::EPT_TRIANGLE_LIST;
+		IAssetLoader::SAssetLoadContext inner; // load params and main file; loadAsset builds it from {_params, _file}
+		SFileReadTelemetry ioTelemetry = {}; // loadAsset passes its address to the file read helpers
+		static constexpr size_t TextProbeBytes = 6ull; // number of leading bytes probeLayout compares against the text marker
+		static constexpr size_t BinaryHeaderBytes = 80ull; // bytes that precede the triangle count inside the binary prefix
+		static constexpr size_t TriangleCountBytes = sizeof(uint32_t); // the triangle count is read as a uint32_t
+		static constexpr size_t BinaryPrefixBytes = BinaryHeaderBytes + TriangleCountBytes; // header plus triangle count
+		static constexpr size_t TriangleFloatCount = 12ull; // floats per triangle record: loadAsset reads indices 0-2 as the normal and 3-11 as three vertices
+		static constexpr size_t TriangleFloatBytes = sizeof(float) * TriangleFloatCount; // byte size of the float part of a record
+		static constexpr size_t TriangleAttributeBytes = sizeof(uint16_t); // 16-bit word following the floats; loadAsset reads it as packedColor
+		static constexpr size_t TriangleRecordBytes = TriangleFloatBytes + TriangleAttributeBytes; // full size of one triangle record
+		static constexpr size_t VerticesPerTriangle = 3ull;
+		static constexpr size_t FloatChannelsPerVertex = 3ull;
+	};
 
-			SRasterizationParams rastarizationParmas;
+	static bool probeLayout(system::IFile* file, const size_t fileSize, const uint8_t* const wholeFileData, SFileReadTelemetry* const ioTelemetry, LayoutProbe& out) // fills out from the first bytes of the file; returns false when file is null, fileSize is below TextProbeBytes, or the fallback marker read fails
+	{
+		out = {}; // reset every probe field before filling them in
+		if (!file || fileSize < Context::TextProbeBytes)
+			return false;
 
-			auto mbPipeline = core::make_smart_refctd_ptr<ICPURenderpassIndependentPipeline>(std::move(mbPipelineLayout), nullptr, nullptr, mbInputParams, blendParams, primitiveAssemblyParams, rastarizationParmas);
+		if (fileSize >= Context::BinaryPrefixBytes) // only files large enough to hold the binary prefix are probed for the triangle count
+		{
+			std::array<uint8_t, Context::BinaryPrefixBytes> prefix = {};
+			out.hasPrefix = wholeFileData ? true : SInterchangeIO::readFileExact(file, prefix.data(), 0ull, Context::BinaryPrefixBytes, ioTelemetry); // in-memory data counts as available, otherwise the prefix is read from offset 0
+			if (out.hasPrefix)
 			{
-				mbPipeline->setShaderAtStage(asset::IShader::ESS_VERTEX, mbVertexShader.get());
-				mbPipeline->setShaderAtStage(asset::IShader::ESS_FRAGMENT, mbFragmentShader.get());
+				if (wholeFileData)
+					std::memcpy(prefix.data(), wholeFileData, Context::BinaryPrefixBytes);
+				out.startsWithSolid = (std::memcmp(prefix.data(), "solid ", Context::TextProbeBytes) == 0); // NOTE(review): needs a space right after the keyword, so the keyword followed directly by a newline does not match - confirm intended
+				std::memcpy(&out.triangleCount, prefix.data() + Context::BinaryHeaderBytes, sizeof(out.triangleCount)); // the count sits right after the header bytes
+				const uint64_t expectedSize = Context::BinaryPrefixBytes + static_cast<uint64_t>(out.triangleCount) * Context::TriangleRecordBytes; // count is widened to 64 bits before the multiply
+				out.binaryBySize = (expectedSize == fileSize); // an exact size match flags the file as binary
+				return true;
 			}
-
-			asset::SAssetBundle newPipelineBundle(nullptr, {core::smart_refctd_ptr<asset::ICPURenderpassIndependentPipeline>(mbPipeline)});
-			defaultOverride.insertAssetIntoCache(newPipelineBundle, pipelineCacheHash, fakeContext, _hierarchyLevel + ICPURenderpassIndependentPipeline::DESC_SET_HIERARCHYLEVELS_BELOW);
 		}
-		else
-			return;
-	};
-
-	/*
-		Pipeline permutations are cached
-	*/
-
-	precomputeAndCachePipeline(true);
-	precomputeAndCachePipeline(false);
-}
-
-SAssetBundle CSTLMeshFileLoader::loadAsset(system::IFile* _file, const IAssetLoader::SAssetLoadParams& _params, IAssetLoader::IAssetLoaderOverride* _override, uint32_t _hierarchyLevel)
-{
-	if (!_file)
-		return {};
-
-	SContext context = {
-		asset::IAssetLoader::SAssetLoadContext{
-			_params,
-			_file
-		},
-		_hierarchyLevel,
-		_override
-	};
-
-
-	const size_t filesize = context.inner.mainFile->getSize();
-	if (filesize < 6ull) // we need a header
-		return {};
-
-	bool hasColor = false;
-
-	auto mesh = core::make_smart_refctd_ptr<ICPUMesh>();
-	auto meshbuffer = core::make_smart_refctd_ptr<ICPUMeshBuffer>();
-	meshbuffer->setPositionAttributeIx(POSITION_ATTRIBUTE);
-	meshbuffer->setNormalAttributeIx(NORMAL_ATTRIBUTE);
-
-	bool binary = false;
-	std::string token;
-	if (getNextToken(&context, token) != "solid")
-		binary = hasColor = true;
-
-	core::vector<core::vectorSIMDf> positions, normals;
-	core::vector<uint32_t> colors;
-	if (binary)
-	{
-		if (_file->getSize() < 80)
-			return {};
-
-		constexpr size_t headerOffset = 80; 
-		context.fileOffset = headerOffset; //! skip header
-
-		uint32_t vertexCount = 0u;
 
-		system::IFile::success_t success;
-		context.inner.mainFile->read(success, &vertexCount, context.fileOffset, sizeof(vertexCount));
-		if (!success)
-			return {};
-		context.fileOffset += sizeof(vertexCount);
-
-		positions.reserve(3 * vertexCount);
-		normals.reserve(vertexCount);
-		colors.reserve(vertexCount);
+		char header[Context::TextProbeBytes] = {}; // reached when the file is shorter than the binary prefix or the prefix read failed
+		if (wholeFileData)
+			std::memcpy(header, wholeFileData, sizeof(header));
+		else if (!SInterchangeIO::readFileExact(file, header, 0ull, sizeof(header), ioTelemetry))
+			return false;
+		out.startsWithSolid = (std::strncmp(header, "solid ", Context::TextProbeBytes) == 0); // hasPrefix, binaryBySize and triangleCount keep the values left by the code above
+		return true;
 	}
-	else
-		goNextLine(&context); // skip header
 
-	uint16_t attrib = 0u;
-	token.reserve(32);
-	while (context.fileOffset < filesize) // TODO: check it
+	class AsciiParser
 	{
-		if (!binary)
-		{
-			if (getNextToken(&context, token) != "facet")
+		public:
+			inline AsciiParser(const char* begin, const char* end) : m_cursor(begin), m_end(end) {} // stores the cursor and end pointers that the read* members hand to the impl::TextParse helpers
+			inline std::optional<std::string_view> readToken() { return Common::readToken(m_cursor, m_end); } // forwards to Common::readToken with the stored cursor and end
+			inline std::optional<float> readFloat() // skips whitespace, then parses one number; yields nullopt when Common::parseNumber reports failure
 			{
-				if (token == "endsolid")
-					break;
-				return {};
+				Common::skipWhitespace(m_cursor, m_end);
+				float value = 0.f;
+				return Common::parseNumber(m_cursor, m_end, value) ? std::optional<float>(value) : std::nullopt;
 			}
-			if (getNextToken(&context, token) != "normal")
+			inline std::optional<hlsl::float32_t3> readVec3() // reads three floats in x, y, z order; yields nullopt if any of the three reads failed
 			{
-				return {};
+				const auto x = readFloat(), y = readFloat(), z = readFloat(); // all three reads are attempted even when an earlier one already failed
+				return x.has_value() && y.has_value() && z.has_value() ? std::optional<hlsl::float32_t3>(hlsl::float32_t3(*x, *y, *z)) : std::nullopt;
 			}
-		}
-
-		{
-			core::vectorSIMDf n;
-			getNextVector(&context, n, binary);
-			if(_params.loaderFlags & E_LOADER_PARAMETER_FLAGS::ELPF_RIGHT_HANDED_MESHES)
-				performActionBasedOnOrientationSystem<float>(n.x, [](float& varToFlip) {varToFlip = -varToFlip;});
-			normals.push_back(core::normalize(n));
-		}
+		private:
+			const char* m_cursor = nullptr;
+			const char* m_end = nullptr;
+	};
 
-		if (!binary)
-		{
-			if (getNextToken(&context, token) != "outer" || getNextToken(&context, token) != "loop")
-				return {};
-		}
+	class SplitBlockMemoryResource final : public core::refctd_memory_resource
+	{
+		public:
+			inline SplitBlockMemoryResource(core::smart_refctd_ptr<core::refctd_memory_resource>&& upstream, void* block, const size_t blockBytes, const size_t alignment) : m_upstream(std::move(upstream)), m_block(block), m_blockBytes(blockBytes), m_alignment(alignment) {} // records the upstream resource together with the block pointer, size and alignment that the destructor later passes to upstream->deallocate
+			inline void* allocate(std::size_t, std::size_t) override { assert(false); return nullptr; } // not meant to be called: asserts and returns nullptr, both arguments are ignored
 
-		{
-			core::vectorSIMDf p[3];
-			for (uint32_t i = 0u; i < 3u; ++i)
+			inline void deallocate(void* p, std::size_t bytes, std::size_t) override // releases nothing here; it only asserts that the range starting at p lies inside the block
 			{
-				if (!binary)
-				{
-					if (getNextToken(&context, token) != "vertex")
-						return {};
-				}
-				getNextVector(&context, p[i], binary);
-				if (_params.loaderFlags & E_LOADER_PARAMETER_FLAGS::ELPF_RIGHT_HANDED_MESHES)
-					performActionBasedOnOrientationSystem<float>(p[i].x, [](float& varToFlip){varToFlip = -varToFlip; });
+				const auto* const begin = reinterpret_cast<const uint8_t*>(m_block);
+				const auto* const end = begin + m_blockBytes;
+				const auto* const ptr = reinterpret_cast<const uint8_t*>(p);
+				assert(ptr >= begin && ptr <= end); // a pointer equal to end is accepted
+				assert(ptr + bytes <= end);
 			}
-			for (uint32_t i = 0u; i < 3u; ++i) // seems like in STL format vertices are ordered in clockwise manner...
-				positions.push_back(p[2u - i]);
-		}
 
-		if (!binary)
-		{
-			if (getNextToken(&context, token) != "endloop" || getNextToken(&context, token) != "endfacet")
-				return {};
-		}
-		else
-		{
-			system::IFile::success_t success;
-			context.inner.mainFile->read(success, &attrib, context.fileOffset, sizeof(attrib));
-			if (!success)
-				return {};
-			context.fileOffset += sizeof(attrib);
-		}
-
-		if (hasColor && (attrib & 0x8000u)) // assuming VisCam/SolidView non-standard trick to store color in 2 bytes of extra attribute
-		{
-			const void* srcColor[1]{ &attrib };
-			uint32_t color{};
-			convertColor<EF_A1R5G5B5_UNORM_PACK16, EF_B8G8R8A8_UNORM>(srcColor, &color, 0u, 0u);
-			colors.push_back(color);
-		}
-		else
-		{
-			hasColor = false;
-			colors.clear();
-		}
-
-		if ((normals.back() == core::vectorSIMDf()).all())
-		{
-			normals.back().set(
-				core::plane3dSIMDf(
-					*(positions.rbegin() + 2),
-					*(positions.rbegin() + 1),
-					*(positions.rbegin() + 0)).getNormal()
-			);
-		}
-	} // end while (_file->getPos() < filesize)
-
-	const size_t vtxSize = hasColor ? (3 * sizeof(float) + 4 + 4) : (3 * sizeof(float) + 4);
-	auto vertexBuf = asset::ICPUBuffer::create({ vtxSize * positions.size() });
-
-	quant_normal_t normal;
-	for (size_t i = 0u; i < positions.size(); ++i)
-	{
-		if (i % 3 == 0)
-			normal = quantNormalCache->quantize<EF_A2B10G10R10_SNORM_PACK32>(normals[i / 3]);
-		uint8_t* ptr = (reinterpret_cast<uint8_t*>(vertexBuf->getPointer())) + i * vtxSize;
-		memcpy(ptr, positions[i].pointer, 3 * 4);
-
-		*reinterpret_cast<quant_normal_t*>(ptr + 12) = normal;
-
-		if (hasColor)
-			memcpy(ptr + 16, colors.data() + i / 3, 4);
-	}
-
-	const IAssetLoader::SAssetLoadContext fakeContext(IAssetLoader::SAssetLoadParams{}, nullptr);
-	const asset::IAsset::E_TYPE types[]{ asset::IAsset::ET_RENDERPASS_INDEPENDENT_PIPELINE, (asset::IAsset::E_TYPE)0u };
-	auto pipelineBundle = _override->findCachedAsset(getPipelineCacheKey(hasColor).data(), types, fakeContext, _hierarchyLevel + ICPURenderpassIndependentPipeline::DESC_SET_HIERARCHYLEVELS_BELOW);
-	{
-		bool status = !pipelineBundle.getContents().empty();
-		assert(status);
-	}
-
-	auto mbPipeline = core::smart_refctd_ptr_static_cast<asset::ICPURenderpassIndependentPipeline>(pipelineBundle.getContents().begin()[0]);
+		protected:
+			inline ~SplitBlockMemoryResource() override { if (m_upstream && m_block) m_upstream->deallocate(m_block, m_blockBytes, m_alignment); } // hands the whole block back to upstream with the recorded size and alignment; skipped when either pointer is null
 
-	auto meta = core::make_smart_refctd_ptr<CSTLMetadata>(1u, std::move(m_basicViewParamsSemantics));
-	meta->placeMeta(0u, mbPipeline.get());
-
-	meshbuffer->setPipeline(std::move(mbPipeline));
-	meshbuffer->setIndexCount(positions.size());
-	meshbuffer->setIndexType(asset::EIT_UNKNOWN);
-
-	meshbuffer->setVertexBufferBinding({ 0ul, vertexBuf }, 0);
-	mesh->getMeshBufferVector().emplace_back(std::move(meshbuffer));
-	
-	return SAssetBundle(std::move(meta), { std::move(mesh) });
+		private:
+			core::smart_refctd_ptr<core::refctd_memory_resource> m_upstream;
+			void* m_block = nullptr;
+			size_t m_blockBytes = 0ull;
+			size_t m_alignment = 1ull;
+	};
+};
 }
-
-bool CSTLMeshFileLoader::isALoadableFileFormat(system::IFile* _file, const system::logger_opt_ptr logger) const
+CSTLMeshFileLoader::CSTLMeshFileLoader(asset::IAssetManager*) // the asset manager argument is unnamed and unused; the body is empty
 {
-	if (!_file || _file->getSize() <= 6u)
-		return false;
+}
 
-	char header[6];
-	{
-		system::IFile::success_t success;
-		_file->read(success, header, 0, sizeof(header));
-		if (!success)
-			return false;
-	}
+const char** CSTLMeshFileLoader::getAssociatedFileExtensions() const // returns a pointer to a function-local static array
+{
+	static const char* ext[] = { "stl", nullptr }; // single extension followed by a nullptr terminator
+	return ext;
+}
 
-	if (strncmp(header, "solid ", 6u) == 0)
-		return true;
-	else
-	{
-		if (_file->getSize() < 84u)
-			return false;
+SAssetBundle CSTLMeshFileLoader::loadAsset(system::IFile* _file, const IAssetLoader::SAssetLoadParams& _params, IAssetLoader::IAssetLoaderOverride* _override [[maybe_unused]], uint32_t _hierarchyLevel [[maybe_unused]])
+{
+	using Context = Parse::Context;
+	using AsciiParser = Parse::AsciiParser;
+	using SplitBlockMemoryResource = Parse::SplitBlockMemoryResource;
 
-		uint32_t triangleCount;
+	if (!_file)
+		return {};
 
-		constexpr size_t readOffset = 80;
-		system::IFile::success_t success;
-		_file->read(success, &triangleCount, readOffset, sizeof(triangleCount));
-		if (!success)
-			return false;
+	uint64_t triangleCount = 0u;
+	const char* parsePath = "unknown";
+	const bool computeContentHashes = !_params.loaderFlags.hasAnyFlag(IAssetLoader::ELPF_DONT_COMPUTE_CONTENT_HASHES);
+	bool hasTriangleColors = false;
 
-		constexpr size_t STL_TRI_SZ = 50u;
-		return _file->getSize() == (STL_TRI_SZ * triangleCount + 84u);
-	}
-}
+	Context context = {asset::IAssetLoader::SAssetLoadContext{_params, _file}, 0ull};
+	const size_t filesize = context.inner.mainFile->getSize();
+	if (filesize < Context::TextProbeBytes)
+		return {};
 
-//! Read 3d vector of floats
-void CSTLMeshFileLoader::getNextVector(SContext* context, core::vectorSIMDf& vec, bool binary) const
-{
-	if (binary)
-	{
-		{
-			system::IFile::success_t success;
-			context->inner.mainFile->read(success, &vec.X, context->fileOffset, 4);
-			context->fileOffset += success.getBytesProcessed();
-		}
-		
-		{
-			system::IFile::success_t success;
-			context->inner.mainFile->read(success, &vec.Y, context->fileOffset, 4);
-			context->fileOffset += success.getBytesProcessed();
-		}
+	impl::SLoadSession loadSession = {};
+	if (!impl::SLoadSession::begin(_params.logger, "STL loader", _file, _params.ioPolicy, static_cast<uint64_t>(filesize), true, loadSession))
+		return {};
 
-		{
-			system::IFile::success_t success;
-			context->inner.mainFile->read(success, &vec.Z, context->fileOffset, 4);
-			context->fileOffset += success.getBytesProcessed();
-		}
-	}
-	else
+	core::vector<uint8_t> wholeFilePayload;
+	const uint8_t* wholeFileData = nullptr;
+	if (loadSession.isWholeFile())
 	{
-		goNextWord(context);
-		std::string tmp;
-
-		getNextToken(context, tmp);
-		sscanf(tmp.c_str(), "%f", &vec.X);
-		getNextToken(context, tmp);
-		sscanf(tmp.c_str(), "%f", &vec.Y);
-		getNextToken(context, tmp);
-		sscanf(tmp.c_str(), "%f", &vec.Z);
+		wholeFileData = loadSession.mapOrReadWholeFile(wholeFilePayload, &context.ioTelemetry);
+		if (!wholeFileData)
+			return {};
 	}
-	vec.X = -vec.X;
-}
 
-//! Read next word
-const std::string& CSTLMeshFileLoader::getNextToken(SContext* context, std::string& token) const
-{
-	goNextWord(context);
-	char c;
-	token = "";
+	Parse::LayoutProbe layout = {};
+	if (!Parse::probeLayout(context.inner.mainFile, filesize, wholeFileData, &context.ioTelemetry, layout))
+		return {};
+	const bool binary = layout.binaryBySize || !layout.startsWithSolid;
+	const bool hasBinaryTriCountFromDetect = layout.hasPrefix;
+	const uint32_t binaryTriCountFromDetect = layout.triangleCount;
+
+	auto geometry = core::make_smart_refctd_ptr<ICPUPolygonGeometry>();
+	geometry->setIndexing(IPolygonGeometryBase::TriangleList());
+	hlsl::shapes::util::AABBAccumulator3<float> parsedAABB = hlsl::shapes::util::createAABBAccumulator<float>();
+	uint64_t vertexCount = 0ull;
+
+	if (binary) {
+		parsePath = "binary_fast";
+		if (filesize < Context::BinaryPrefixBytes)
+			return {};
 
-	while (context->fileOffset != context->inner.mainFile->getSize())
-	{
-		system::IFile::success_t success;
-		context->inner.mainFile->read(success, &c, context->fileOffset, sizeof(c));
-		context->fileOffset += success.getBytesProcessed();
-
-		// found it, so leave
-		if (core::isspace(c))
-			break;
-		token += c;
-	}
-	return token;
+        uint32_t triangleCount32 = binaryTriCountFromDetect;
+        if (!hasBinaryTriCountFromDetect && !SInterchangeIO::readFileExact(context.inner.mainFile, &triangleCount32, Context::BinaryHeaderBytes, sizeof(triangleCount32), &context.ioTelemetry))
+            return {};
+        triangleCount = triangleCount32;
+        const size_t dataSize = static_cast<size_t>(triangleCount) * Context::TriangleRecordBytes;
+        const size_t expectedSize = Context::BinaryPrefixBytes + dataSize;
+        if (filesize < expectedSize)
+            return {};
+        const uint8_t* payloadData = wholeFileData ? (wholeFileData + Context::BinaryPrefixBytes) : loadSession.readRange(Context::BinaryPrefixBytes, dataSize, wholeFilePayload, &context.ioTelemetry);
+        if (!payloadData)
+            return {};
+        vertexCount = triangleCount * Context::VerticesPerTriangle;
+        const size_t vertexCountSizeT = static_cast<size_t>(vertexCount);
+        if (vertexCountSizeT > (std::numeric_limits<size_t>::max() / sizeof(hlsl::float32_t3)))
+            return {};
+        const size_t viewByteSize = vertexCountSizeT * sizeof(hlsl::float32_t3);
+        if (viewByteSize > (std::numeric_limits<size_t>::max() - viewByteSize))
+            return {};
+        const size_t blockBytes = viewByteSize * 2ull;
+        auto upstream = core::getDefaultMemoryResource();
+        if (!upstream)
+            return {};
+        void* block = upstream->allocate(blockBytes, alignof(float));
+        if (!block)
+            return {};
+        auto blockResource = core::make_smart_refctd_ptr<SplitBlockMemoryResource>(core::smart_refctd_ptr<core::refctd_memory_resource>(std::move(upstream)), block, blockBytes, alignof(float));
+        auto posBuffer = ICPUBuffer::create({{viewByteSize}, block, core::smart_refctd_ptr<core::refctd_memory_resource>(blockResource), alignof(float)}, core::adopt_memory);
+        auto normalBuffer = ICPUBuffer::create({{viewByteSize}, reinterpret_cast<uint8_t*>(block) + viewByteSize, core::smart_refctd_ptr<core::refctd_memory_resource>(blockResource), alignof(float)}, core::adopt_memory);
+        if (!posBuffer || !normalBuffer)
+            return {};
+        ICPUPolygonGeometry::SDataView posView = {};
+        posView.composed = {.stride = sizeof(hlsl::float32_t3), .format = EF_R32G32B32_SFLOAT, .rangeFormat = IGeometryBase::getMatchingAABBFormat(EF_R32G32B32_SFLOAT)};
+        posView.src = {.offset = 0ull, .size = viewByteSize, .buffer = std::move(posBuffer)};
+        ICPUPolygonGeometry::SDataView normalView = {};
+        normalView.composed = {.stride = sizeof(hlsl::float32_t3), .format = EF_R32G32B32_SFLOAT, .rangeFormat = IGeometryBase::getMatchingAABBFormat(EF_R32G32B32_SFLOAT)};
+        normalView.src = {.offset = 0ull, .size = viewByteSize, .buffer = std::move(normalBuffer)};
+        auto* posOutFloat = reinterpret_cast<float*>(posView.getPointer());
+        auto* normalOutFloat = reinterpret_cast<float*>(normalView.getPointer());
+        if (!posOutFloat || !normalOutFloat)
+            return {};
+
+        const uint8_t* cursor = payloadData;
+        const uint8_t* const end = cursor + dataSize;
+        if (end < cursor ||
+            static_cast<size_t>(end - cursor) <
+                static_cast<size_t>(triangleCount) * Context::TriangleRecordBytes)
+            return {};
+        core::vector<uint32_t> faceColors(static_cast<size_t>(triangleCount), 0u);
+        std::atomic_bool colorValidForAllFaces = true;
+        const size_t hw = SLoaderRuntimeTuner::resolveHardwareThreads();
+        const size_t hardMaxWorkers = SLoaderRuntimeTuner::resolveHardMaxWorkers(
+            hw, _params.ioPolicy.runtimeTuning.workerHeadroom);
+        SLoaderRuntimeTuningRequest parseTuningRequest = {};
+        parseTuningRequest.inputBytes = dataSize;
+        parseTuningRequest.totalWorkUnits = triangleCount;
+        parseTuningRequest.minBytesPerWorker = Context::TriangleRecordBytes;
+        parseTuningRequest.hardwareThreads = static_cast<uint32_t>(hw);
+        parseTuningRequest.hardMaxWorkers = static_cast<uint32_t>(hardMaxWorkers);
+        parseTuningRequest.targetChunksPerWorker = _params.ioPolicy.runtimeTuning.targetChunksPerWorker;
+        parseTuningRequest.minChunkWorkUnits = 1ull;
+        parseTuningRequest.maxChunkWorkUnits = std::max<uint64_t>(1ull, triangleCount);
+        parseTuningRequest.sampleData = payloadData;
+        parseTuningRequest.sampleBytes = SLoaderRuntimeTuner::resolveSampleBytes(_params.ioPolicy, dataSize);
+        const auto parseTuning = SLoaderRuntimeTuner::tune(_params.ioPolicy, parseTuningRequest);
+        const size_t workerCount = std::max<size_t>(1ull, std::min(parseTuning.workerCount, static_cast<size_t>(std::max<uint64_t>(1ull, triangleCount))));
+        static constexpr bool ComputeAABBInParse = true;
+        struct SThreadAABB { bool has = false; float minX = 0.f; float minY = 0.f; float minZ = 0.f; float maxX = 0.f; float maxY = 0.f; float maxZ = 0.f; };
+        std::vector<SThreadAABB> threadAABBs(ComputeAABBInParse ? workerCount : 0ull);
+        const uint64_t parseChunkTriangles = std::max<uint64_t>(1ull, parseTuning.chunkWorkUnits);
+        const size_t parseChunkCount = static_cast<size_t>(SLoaderRuntimeTuner::ceilDiv(triangleCount, parseChunkTriangles));
+        const bool hashInParsePipeline = computeContentHashes;
+        std::vector<uint8_t> hashChunkReady(hashInParsePipeline ? parseChunkCount : 0ull, 0u);
+        std::atomic_bool hashPipelineOk = true;
+        core::blake3_hash_t parsedPositionHash = static_cast<core::blake3_hash_t>(core::blake3_hasher{});
+        core::blake3_hash_t parsedNormalHash = static_cast<core::blake3_hash_t>(core::blake3_hasher{});
+        auto parseRange = [&](const uint64_t beginTri, const uint64_t endTri, SThreadAABB& localAABB) -> void {
+            const uint8_t* localCursor = payloadData + beginTri * Context::TriangleRecordBytes;
+            float* posCursor = posOutFloat + beginTri * Context::VerticesPerTriangle * Context::FloatChannelsPerVertex;
+            float* normalCursor = normalOutFloat + beginTri * Context::VerticesPerTriangle * Context::FloatChannelsPerVertex;
+            for (uint64_t tri = beginTri; tri < endTri; ++tri) {
+                const uint8_t* const triRecord = localCursor;
+                localCursor += Context::TriangleRecordBytes;
+                std::array<float, Context::TriangleFloatCount> triValues = {};
+                std::memcpy(triValues.data(), triRecord, sizeof(triValues));
+                uint16_t packedColor = 0u;
+                std::memcpy(&packedColor, triRecord + Context::TriangleFloatBytes, sizeof(packedColor));
+                if (packedColor & 0x8000u)
+                    faceColors[static_cast<size_t>(tri)] = Parse::decodeViscamColorToB8G8R8A8(packedColor);
+                else
+                    colorValidForAllFaces.store(false, std::memory_order_relaxed);
+
+                float normalX = triValues[0ull];
+                float normalY = triValues[1ull];
+                float normalZ = triValues[2ull];
+
+                const float vertex0x = triValues[9ull];
+                const float vertex0y = triValues[10ull];
+                const float vertex0z = triValues[11ull];
+                const float vertex1x = triValues[6ull];
+                const float vertex1y = triValues[7ull];
+                const float vertex1z = triValues[8ull];
+                const float vertex2x = triValues[3ull];
+                const float vertex2y = triValues[4ull];
+                const float vertex2z = triValues[5ull];
+
+                posCursor[0ull] = vertex0x;
+                posCursor[1ull] = vertex0y;
+                posCursor[2ull] = vertex0z;
+                posCursor[3ull] = vertex1x;
+                posCursor[4ull] = vertex1y;
+                posCursor[5ull] = vertex1z;
+                posCursor[6ull] = vertex2x;
+                posCursor[7ull] = vertex2y;
+                posCursor[8ull] = vertex2z;
+                if constexpr (ComputeAABBInParse) {
+                    if (!localAABB.has) {
+                        localAABB.has = true;
+                        localAABB.minX = vertex0x;
+                        localAABB.minY = vertex0y;
+                        localAABB.minZ = vertex0z;
+                        localAABB.maxX = vertex0x;
+                        localAABB.maxY = vertex0y;
+                        localAABB.maxZ = vertex0z;
+                    }
+                    if (vertex0x < localAABB.minX)
+                        localAABB.minX = vertex0x;
+                    if (vertex0y < localAABB.minY)
+                        localAABB.minY = vertex0y;
+                    if (vertex0z < localAABB.minZ)
+                        localAABB.minZ = vertex0z;
+                    if (vertex0x > localAABB.maxX)
+                        localAABB.maxX = vertex0x;
+                    if (vertex0y > localAABB.maxY)
+                        localAABB.maxY = vertex0y;
+                    if (vertex0z > localAABB.maxZ)
+                        localAABB.maxZ = vertex0z;
+                    if (vertex1x < localAABB.minX)
+                        localAABB.minX = vertex1x;
+                    if (vertex1y < localAABB.minY)
+                        localAABB.minY = vertex1y;
+                    if (vertex1z < localAABB.minZ)
+                        localAABB.minZ = vertex1z;
+                    if (vertex1x > localAABB.maxX)
+                        localAABB.maxX = vertex1x;
+                    if (vertex1y > localAABB.maxY)
+                        localAABB.maxY = vertex1y;
+                    if (vertex1z > localAABB.maxZ)
+                        localAABB.maxZ = vertex1z;
+                    if (vertex2x < localAABB.minX)
+                        localAABB.minX = vertex2x;
+                    if (vertex2y < localAABB.minY)
+                        localAABB.minY = vertex2y;
+                    if (vertex2z < localAABB.minZ)
+                        localAABB.minZ = vertex2z;
+                    if (vertex2x > localAABB.maxX)
+                        localAABB.maxX = vertex2x;
+                    if (vertex2y > localAABB.maxY)
+                        localAABB.maxY = vertex2y;
+                    if (vertex2z > localAABB.maxZ)
+                        localAABB.maxZ = vertex2z;
+                }
+                if (normalX == 0.f && normalY == 0.f && normalZ == 0.f) {
+                    const float edge10x = vertex1x - vertex0x;
+                    const float edge10y = vertex1y - vertex0y;
+                    const float edge10z = vertex1z - vertex0z;
+                    const float edge20x = vertex2x - vertex0x;
+                    const float edge20y = vertex2y - vertex0y;
+                    const float edge20z = vertex2z - vertex0z;
+
+                    normalX = edge10y * edge20z - edge10z * edge20y;
+                    normalY = edge10z * edge20x - edge10x * edge20z;
+                    normalZ = edge10x * edge20y - edge10y * edge20x;
+                    const float planeLen2 =
+                        normalX * normalX + normalY * normalY + normalZ * normalZ;
+                    if (planeLen2 > 0.f) {
+                        const float invLen = 1.f / std::sqrt(planeLen2);
+                        normalX *= invLen;
+                        normalY *= invLen;
+                        normalZ *= invLen;
+                    } else {
+                        normalX = 0.f;
+                        normalY = 0.f;
+                        normalZ = 0.f;
+                    }
+                }
+                normalCursor[0ull] = normalX;
+                normalCursor[1ull] = normalY;
+                normalCursor[2ull] = normalZ;
+                normalCursor[3ull] = normalX;
+                normalCursor[4ull] = normalY;
+                normalCursor[5ull] = normalZ;
+                normalCursor[6ull] = normalX;
+                normalCursor[7ull] = normalY;
+                normalCursor[8ull] = normalZ;
+                posCursor +=
+                    Context::VerticesPerTriangle * Context::FloatChannelsPerVertex;
+                normalCursor +=
+                    Context::VerticesPerTriangle * Context::FloatChannelsPerVertex;
+            }
+        };
+        std::jthread positionHashThread;
+        std::jthread normalHashThread;
+        if (hashInParsePipeline) {
+            auto launchHashThread =
+                [&](const float* srcFloat,
+                    core::blake3_hash_t& outHash) -> std::jthread {
+                return std::jthread([&, srcFloat, outHashPtr = &outHash]() {
+                    try {
+                        core::blake3_hasher hasher;
+                        size_t chunkIx = 0ull;
+                        while (chunkIx < parseChunkCount) {
+                            auto ready = std::atomic_ref<uint8_t>(hashChunkReady[chunkIx]);
+                            while (ready.load(std::memory_order_acquire) == 0u)
+                                ready.wait(0u, std::memory_order_acquire);
+
+                            size_t runEnd = chunkIx + 1ull;
+                            while (runEnd < parseChunkCount) {
+                                const auto runReady =
+                                    std::atomic_ref<uint8_t>(hashChunkReady[runEnd])
+                                        .load(std::memory_order_acquire);
+                                if (runReady == 0u)
+                                    break;
+                                ++runEnd;
+                            }
+
+                            const uint64_t begin =
+                                static_cast<uint64_t>(chunkIx) * parseChunkTriangles;
+                            const uint64_t endTri = std::min<uint64_t>(
+                                static_cast<uint64_t>(runEnd) * parseChunkTriangles,
+                                triangleCount);
+                            const size_t runTriangles = static_cast<size_t>(endTri - begin);
+                            const size_t runBytes =
+                                runTriangles * Context::VerticesPerTriangle *
+                                Context::FloatChannelsPerVertex * sizeof(float);
+                            hasher.update(srcFloat + begin * Context::VerticesPerTriangle *
+                                                         Context::FloatChannelsPerVertex,
+                                          runBytes);
+                            chunkIx = runEnd;
+                        }
+                        *outHashPtr = static_cast<core::blake3_hash_t>(hasher);
+                    } catch (...) {
+                        hashPipelineOk.store(false, std::memory_order_relaxed);
+                    }
+                });
+            };
+            positionHashThread = launchHashThread(posOutFloat, parsedPositionHash);
+            normalHashThread = launchHashThread(normalOutFloat, parsedNormalHash);
+        }
+        std::atomic_size_t nextChunkIx = 0ull;
+        auto parseWorker = [&](const size_t workerIx) -> void {
+            SThreadAABB localAABB = {};
+            while (true) {
+                const size_t chunkIx =
+                    nextChunkIx.fetch_add(1ull, std::memory_order_relaxed);
+                if (chunkIx >= parseChunkCount)
+                    break;
+                const uint64_t begin =
+                    static_cast<uint64_t>(chunkIx) * parseChunkTriangles;
+                const uint64_t endTri =
+                    std::min<uint64_t>(begin + parseChunkTriangles, triangleCount);
+                parseRange(begin, endTri, localAABB);
+                if (hashInParsePipeline) {
+                    auto ready = std::atomic_ref<uint8_t>(hashChunkReady[chunkIx]);
+                    ready.store(1u, std::memory_order_release);
+                    ready.notify_all();
+                }
+            }
+            if constexpr (ComputeAABBInParse)
+                threadAABBs[workerIx] = localAABB;
+        };
+        SLoaderRuntimeTuner::dispatchWorkers(workerCount, parseWorker);
+        if (positionHashThread.joinable())
+            positionHashThread.join();
+        if (normalHashThread.joinable())
+            normalHashThread.join();
+        if (hashInParsePipeline) {
+            if (!hashPipelineOk.load(std::memory_order_relaxed))
+                return {};
+            posView.src.buffer->setContentHash(parsedPositionHash);
+            normalView.src.buffer->setContentHash(parsedNormalHash);
+        }
+        if constexpr (ComputeAABBInParse) {
+            for (const auto& localAABB : threadAABBs) {
+                if (!localAABB.has)
+                    continue;
+                hlsl::shapes::util::extendAABBAccumulator(
+                    parsedAABB, localAABB.minX, localAABB.minY, localAABB.minZ);
+                hlsl::shapes::util::extendAABBAccumulator(
+                    parsedAABB, localAABB.maxX, localAABB.maxY, localAABB.maxZ);
+            }
+        }
+        geometry->setPositionView(std::move(posView));
+        geometry->setNormalView(std::move(normalView));
+        if (colorValidForAllFaces.load(std::memory_order_relaxed)) {
+            core::vector<uint32_t> vertexColors(vertexCountSizeT);
+            for (size_t triIx = 0ull; triIx < static_cast<size_t>(triangleCount);
+                 ++triIx) {
+                const uint32_t triColor = faceColors[triIx];
+                const size_t baseIx = triIx * Context::VerticesPerTriangle;
+                vertexColors[baseIx + 0ull] = triColor;
+                vertexColors[baseIx + 1ull] = triColor;
+                vertexColors[baseIx + 2ull] = triColor;
+            }
+            auto colorView =
+                SGeometryLoaderCommon::createAdoptedView<EF_B8G8R8A8_UNORM>(
+                    std::move(vertexColors));
+            if (!colorView)
+                return {};
+            auto* const auxViews = geometry->getAuxAttributeViews();
+            auxViews->resize(SSTLPolygonGeometryAuxLayout::COLOR0 + 1u);
+            (*auxViews)[SSTLPolygonGeometryAuxLayout::COLOR0] = std::move(colorView);
+            hasTriangleColors = true;
+        }
+    } else {
+        parsePath = "ascii_fallback";
+        if (!wholeFileData)
+        {
+            wholeFileData = loadSession.mapOrReadWholeFile(wholeFilePayload, &context.ioTelemetry);
+            if (!wholeFileData)
+                return {};
+        }
+
+        const char* const begin = reinterpret_cast<const char*>(wholeFileData);
+        const char* const end = begin + filesize;
+        AsciiParser parser(begin, end);
+        core::vector<hlsl::float32_t3> positions;
+        core::vector<hlsl::float32_t3> normals;
+        const auto firstToken = parser.readToken();
+        if (!firstToken.has_value() || *firstToken != std::string_view("solid"))
+            return {};
+
+        for (;;) {
+            const auto maybeToken = parser.readToken();
+            if (!maybeToken.has_value())
+                break;
+            const std::string_view textToken = *maybeToken;
+            if (textToken == std::string_view("endsolid"))
+                break;
+            if (textToken != std::string_view("facet"))
+                continue;
+
+            const auto normalKeyword = parser.readToken();
+            if (!normalKeyword.has_value() ||
+                *normalKeyword != std::string_view("normal"))
+                return {};
+
+            const auto fileNormal = parser.readVec3();
+            if (!fileNormal.has_value())
+                return {};
+
+            const auto outerKeyword = parser.readToken();
+            if (!outerKeyword.has_value() ||
+                *outerKeyword != std::string_view("outer"))
+                return {};
+            const auto loopKeyword = parser.readToken();
+            if (!loopKeyword.has_value() || *loopKeyword != std::string_view("loop"))
+                return {};
+
+            std::array<hlsl::float32_t3, 3> p = {};
+            for (uint32_t i = 0u; i < 3u; ++i) {
+                const auto vertexKeyword = parser.readToken();
+                if (!vertexKeyword.has_value() ||
+                    *vertexKeyword != std::string_view("vertex"))
+                    return {};
+                const auto vertex = parser.readVec3();
+                if (!vertex.has_value())
+                    return {};
+                p[i] = *vertex;
+            }
+
+            Parse::pushTriangleReversed(p, positions);
+            hlsl::float32_t3 faceNormal = Parse::resolveStoredNormal(*fileNormal);
+            if (hlsl::dot(faceNormal, faceNormal) <= 0.f)
+                faceNormal =
+                    SGeometryNormalCommon::computeFaceNormal(p[2u], p[1u], p[0u]);
+            normals.push_back(faceNormal);
+            normals.push_back(faceNormal);
+            normals.push_back(faceNormal);
+            hlsl::shapes::util::extendAABBAccumulator(parsedAABB, p[2u]);
+            hlsl::shapes::util::extendAABBAccumulator(parsedAABB, p[1u]);
+            hlsl::shapes::util::extendAABBAccumulator(parsedAABB, p[0u]);
+
+            const auto endLoopKeyword = parser.readToken();
+            if (!endLoopKeyword.has_value() || *endLoopKeyword != std::string_view("endloop"))
+                return {};
+            const auto endFacetKeyword = parser.readToken();
+            if (!endFacetKeyword.has_value() || *endFacetKeyword != std::string_view("endfacet"))
+                return {};
+        }
+        if (positions.empty())
+            return {};
+
+        triangleCount = positions.size() / Context::VerticesPerTriangle;
+        vertexCount = positions.size();
+        auto posView = SGeometryLoaderCommon::createAdoptedView<EF_R32G32B32_SFLOAT>(std::move(positions));
+        auto normalView = SGeometryLoaderCommon::createAdoptedView<EF_R32G32B32_SFLOAT>(std::move(normals));
+        if (!posView || !normalView)
+            return {};
+        geometry->setPositionView(std::move(posView));
+        geometry->setNormalView(std::move(normalView));
+    }
+
+    if (vertexCount == 0ull)
+        return {};
+    if (computeContentHashes)
+        SPolygonGeometryContentHash::computeMissing(geometry.get(), _params.ioPolicy);
+    if (!parsedAABB.empty())
+        geometry->applyAABB(parsedAABB.value);
+    else
+        CPolygonGeometryManipulator::recomputeAABB(geometry.get());
+    const uint64_t ioMinRead = context.ioTelemetry.getMinOrZero();
+    const uint64_t ioAvgRead = context.ioTelemetry.getAvgOrZero();
+    loadSession.logTinyIO(_params.logger, context.ioTelemetry);
+    _params.logger.log(
+        "STL loader stats: file=%s binary=%d parse_path=%s triangles=%llu "
+        "vertices=%llu colors=%d io_reads=%llu io_min_read=%llu io_avg_read=%llu "
+        "io_req=%s io_eff=%s io_chunk=%llu io_reason=%s",
+        system::ILogger::ELL_PERFORMANCE, _file->getFileName().string().c_str(),
+        binary ? 1 : 0, parsePath, static_cast<unsigned long long>(triangleCount),
+        static_cast<unsigned long long>(vertexCount), hasTriangleColors ? 1 : 0,
+        static_cast<unsigned long long>(context.ioTelemetry.callCount),
+        static_cast<unsigned long long>(ioMinRead),
+        static_cast<unsigned long long>(ioAvgRead),
+        system::to_string(_params.ioPolicy.strategy).c_str(),
+        system::to_string(loadSession.ioPlan.strategy).c_str(),
+        static_cast<unsigned long long>(loadSession.ioPlan.chunkSizeBytes()), loadSession.ioPlan.reason);
+    auto meta = core::make_smart_refctd_ptr<CSTLMetadata>();
+    return SAssetBundle(std::move(meta), {std::move(geometry)});
 }
 
-//! skip to next word
-void CSTLMeshFileLoader::goNextWord(SContext* context) const
-{
-	uint8_t c;
-	while (context->fileOffset != context->inner.mainFile->getSize()) // TODO: check it
-	{
-		system::IFile::success_t success;
-		context->inner.mainFile->read(success, &c, context->fileOffset, sizeof(c));
-		context->fileOffset += success.getBytesProcessed();
-
-		// found it, so leave
-		if (!core::isspace(c))
-		{
-			context->fileOffset -= success.getBytesProcessed();
-			break;
-		}
-	}
+bool CSTLMeshFileLoader::isALoadableFileFormat( // Reports whether _file is accepted as an STL file; the logger parameter is unnamed and unused here.
+    system::IFile* _file, const system::logger_opt_ptr) const {
+	using Context = Parse::Context;
+	if (!_file || _file->getSize() <= Context::TextProbeBytes) // reject a null file or one no larger than TextProbeBytes
+		return false;
+	Parse::LayoutProbe layout = {};
+	if (!Parse::probeLayout(_file, _file->getSize(), nullptr, nullptr, layout)) // a failed probe also rejects the file
+		return false;
+	return layout.startsWithSolid || layout.binaryBySize; // accept when the probe set either flag
+}
-
-//! Read until line break is reached and stop at the next non-space character
-void CSTLMeshFileLoader::goNextLine(SContext* context) const
-{
-	uint8_t c;
-	// look for newline characters
-	while (context->fileOffset != context->inner.mainFile->getSize()) // TODO: check it
-	{
-		system::IFile::success_t success;
-		context->inner.mainFile->read(success, &c, context->fileOffset, sizeof(c));
-		context->fileOffset += success.getBytesProcessed();
-
-		// found it, so leave
-		if (c == '\n' || c == '\r')
-			break;
-	}
 }
-
-
 #endif // _NBL_COMPILE_WITH_STL_LOADER_
diff --git a/src/nbl/asset/interchange/CSTLMeshFileLoader.h b/src/nbl/asset/interchange/CSTLMeshFileLoader.h
index f7020ab292..dadfb1ca7f 100644
--- a/src/nbl/asset/interchange/CSTLMeshFileLoader.h
+++ b/src/nbl/asset/interchange/CSTLMeshFileLoader.h
@@ -1,64 +1,26 @@
-// Copyright (C) 2019-2025 - DevSH Graphics Programming Sp. z O.O.
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine" and was originally part of the "Irrlicht Engine"
 // For conditions of distribution and use, see copyright notice in nabla.h
 // See the original file in irrlicht source for authors
 #ifndef _NBL_ASSET_C_STL_MESH_FILE_LOADER_H_INCLUDED_
 #define _NBL_ASSET_C_STL_MESH_FILE_LOADER_H_INCLUDED_
-
-
 #include "nbl/core/declarations.h"
-
 #include "nbl/asset/interchange/IGeometryLoader.h"
-#include "nbl/asset/metadata/CSTLMetadata.h"
-
-
 namespace nbl::asset
 {
-
-//! Meshloader capable of loading STL meshes.
+//! Mesh loader capable of loading STL meshes.
 class CSTLMeshFileLoader final : public IGeometryLoader
 {
 	public:
+		explicit CSTLMeshFileLoader(asset::IAssetManager* _assetManager); //!< explicit: no implicit conversion from IAssetManager*; this class itself declares no data members
 
-		CSTLMeshFileLoader(asset::IAssetManager* _m_assetMgr);
-
+		//! Loads one STL asset bundle from an already opened file.
 		asset::SAssetBundle loadAsset(system::IFile* _file, const IAssetLoader::SAssetLoadParams& _params, IAssetLoader::IAssetLoaderOverride* _override = nullptr, uint32_t _hierarchyLevel = 0u) override;
 
 		bool isALoadableFileFormat(system::IFile* _file, const system::logger_opt_ptr logger) const override;
 
-		const char** getAssociatedFileExtensions() const override
-		{
-			static const char* ext[]{ "stl", nullptr };
-			return ext;
-		}
-
-	private:
-		struct SContext
-		{
-			IAssetLoader::SAssetLoadContext inner;
-			uint32_t topHierarchyLevel;
-			IAssetLoader::IAssetLoaderOverride* loaderOverride;
-
-			size_t fileOffset = {};
-		};
-
-		virtual void initialize() override;
-
-		const std::string_view getPipelineCacheKey(bool withColorAttribute) { return withColorAttribute ? "nbl/builtin/pipeline/loader/STL/color_attribute" : "nbl/builtin/pipeline/loader/STL/no_color_attribute"; }
-
-		// skips to the first non-space character available
-		void goNextWord(SContext* context) const;
-		// returns the next word
-
-		const std::string& getNextToken(SContext* context, std::string& token) const;
-		// skip to next printable character after the first line break
-		void goNextLine(SContext* context) const;
-		//! Read 3d vector of floats
-		void getNextVector(SContext* context, core::vectorSIMDf& vec, bool binary) const;
-
-		asset::IAssetManager* m_assetMgr;
+		const char** getAssociatedFileExtensions() const override; //!< declaration only; the definition is no longer inline in this header
 };
-
-}	// end namespace nbl::scene
+} // end namespace nbl::asset
 #endif
 
diff --git a/src/nbl/asset/interchange/CSTLMeshWriter.cpp b/src/nbl/asset/interchange/CSTLMeshWriter.cpp
index 45c7c1f939..db2b70aef4 100644
--- a/src/nbl/asset/interchange/CSTLMeshWriter.cpp
+++ b/src/nbl/asset/interchange/CSTLMeshWriter.cpp
@@ -1,474 +1,553 @@
-// Copyright (C) 2019 - DevSH Graphics Programming Sp. z O.O.
+#ifdef _NBL_COMPILE_WITH_STL_WRITER_
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine" and was originally part of the "Irrlicht Engine"
 // For conditions of distribution and use, see copyright notice in nabla.h
 // See the original file in irrlicht source for authors
-#include "nbl/system/ISystem.h"
 #include "nbl/system/IFile.h"
-
 #include "CSTLMeshWriter.h"
-#include "SColor.h"
-
-using namespace nbl;
-using namespace nbl::asset;
-
-#ifdef _NBL_COMPILE_WITH_STL_WRITER_
-constexpr auto POSITION_ATTRIBUTE = 0;
-constexpr auto COLOR_ATTRIBUTE = 1;
-constexpr auto UV_ATTRIBUTE = 2;
-constexpr auto NORMAL_ATTRIBUTE = 3;
-
-CSTLMeshWriter::CSTLMeshWriter()
+#include "impl/SFileAccess.h"
+#include "nbl/asset/format/convertColor.h"
+#include "nbl/asset/interchange/SSTLPolygonGeometryAuxLayout.h"
+#include "nbl/asset/interchange/SGeometryWriterCommon.h"
+#include "nbl/asset/interchange/SInterchangeIO.h"
+#include <algorithm>
+#include <array>
+#include <cmath>
+#include <cstring>
+#include <limits>
+#include <memory>
+#include <new>
+#include <string_view>
+namespace nbl::asset
 {
-	#ifdef _NBL_DEBUG
-	setDebugName("CSTLMeshWriter");
-	#endif
-}
-
-
-CSTLMeshWriter::~CSTLMeshWriter()
-{
-}
-
-//! writes a mesh
-bool CSTLMeshWriter::writeAsset(system::IFile* _file, const SAssetWriteParams& _params, IAssetWriterOverride* _override)
-{
-    if (!_override)
-        getDefaultOverride(_override);
-
-    SAssetWriteContext inCtx{_params, _file};
-
-    const asset::ICPUMesh* mesh =
-#   ifndef _NBL_DEBUG
-        static_cast<const asset::ICPUMesh*>(_params.rootAsset);
-#   else
-        dynamic_cast<const asset::ICPUMesh*>(_params.rootAsset);
-#   endif
-    assert(mesh);
-
-	system::IFile* file = _override->getOutputFile(_file, inCtx, {mesh, 0u});
-
-	if (!file)
-		return false;
-
-	SContext context = { SAssetWriteContext{ inCtx.params, file} };
-
-	_params.logger.log("WRITING STL: writing the file %s", system::ILogger::ELL_INFO, file->getFileName().string().c_str());
-
-    const asset::E_WRITER_FLAGS flags = _override->getAssetWritingFlags(context.writeContext, mesh, 0u);
-	if (flags & asset::EWF_BINARY)
-		return writeMeshBinary(mesh, &context);
-	else
-		return writeMeshASCII(mesh, &context);
-}
-
 namespace
 {
-template <class I>
-inline void writeFacesBinary(const asset::ICPUMeshBuffer* buffer, const bool& noIndices, system::IFile* file, uint32_t _colorVaid, IAssetWriter::SAssetWriteContext* context, size_t* fileOffset)
+struct Parse
 {
-	auto& inputParams = buffer->getPipeline()->getCachedCreationParams().vertexInput;
-	bool hasColor = inputParams.enabledAttribFlags & core::createBitmask({ COLOR_ATTRIBUTE });
-    const asset::E_FORMAT colorType = static_cast<asset::E_FORMAT>(hasColor ? inputParams.attributes[COLOR_ATTRIBUTE].format : asset::EF_UNKNOWN);
-
-    const uint32_t indexCount = buffer->getIndexCount();
-    for (uint32_t j = 0u; j < indexCount; j += 3u)
-    {
-        I idx[3];
-        for (uint32_t i = 0u; i < 3u; ++i)
-        {
-            if (noIndices)
-                idx[i] = j + i;
-            else
-                idx[i] = ((I*)buffer->getIndices())[j + i];
-        }
-
-        core::vectorSIMDf v[3];
-        for (uint32_t i = 0u; i < 3u; ++i)
-            v[i] = buffer->getPosition(idx[i]);
-
-        uint16_t color = 0u;
-        if (hasColor)
-        {
-            if (asset::isIntegerFormat(colorType))
-            {
-                uint32_t res[4];
-                for (uint32_t i = 0u; i < 3u; ++i)
-                {
-                    uint32_t d[4];
-                    buffer->getAttribute(d, _colorVaid, idx[i]);
-                    res[0] += d[0]; res[1] += d[1]; res[2] += d[2];
-                }
-                color = video::RGB16(res[0]/3, res[1]/3, res[2]/3);
-            }
-            else
-            {
-                core::vectorSIMDf res;
-                for (uint32_t i = 0u; i < 3u; ++i)
-                {
-                    core::vectorSIMDf d;
-                    buffer->getAttribute(d, _colorVaid, idx[i]);
-                    res += d;
-                }
-                res /= 3.f;
-                color = video::RGB16(res.X, res.Y, res.Z);
-            }
-        }
-
-		core::vectorSIMDf normal = core::plane3dSIMDf(v[0], v[1], v[2]).getNormal();
-		core::vectorSIMDf vertex1 = v[2];
-		core::vectorSIMDf vertex2 = v[1];
-		core::vectorSIMDf vertex3 = v[0];
-
-		auto flipVectors = [&]()
-		{
-			vertex1.X = -vertex1.X;
-			vertex2.X = -vertex2.X;
-			vertex3.X = -vertex3.X;
-			normal = core::plane3dSIMDf(vertex1, vertex2, vertex3).getNormal();
-		};
-
-		if (!(context->params.flags & E_WRITER_FLAGS::EWF_MESH_IS_RIGHT_HANDED))
-			flipVectors();
-
-		{
-			system::IFile::success_t success;;
-			file->write(success, &normal, *fileOffset, 12);
-	
-			*fileOffset += success.getBytesProcessed();
-		}
-
-		{
-			system::IFile::success_t success;;
-			file->write(success, &vertex1, *fileOffset, 12);
-	
-			*fileOffset += success.getBytesProcessed();
-		}
-
-		{
-			system::IFile::success_t success;;
-			file->write(success, &vertex2, *fileOffset, 12);
-	
-			*fileOffset += success.getBytesProcessed();
-		}
-
-		{
-			system::IFile::success_t success;;
-			file->write(success, &vertex3, *fileOffset, 12);
-	
-			*fileOffset += success.getBytesProcessed();
-		}
-
-		{
-			system::IFile::success_t success;;
-			file->write(success, &color, *fileOffset, 2); // saving color using non-standard VisCAM/SolidView trick
-	
-			*fileOffset += success.getBytesProcessed();
-		}
-    }
-}
-}
-
-bool CSTLMeshWriter::writeMeshBinary(const asset::ICPUMesh* mesh, SContext* context)
-{
-	// write STL MESH header
-    const char headerTxt[] = "Irrlicht-baw Engine";
-    constexpr size_t HEADER_SIZE = 80u;
-
-	{
-		system::IFile::success_t success;;
-		context->writeContext.outputFile->write(success, headerTxt, context->fileOffset, sizeof(headerTxt));
-
-		context->fileOffset += success.getBytesProcessed();
-	}
-
-	const std::string name = context->writeContext.outputFile->getFileName().filename().replace_extension().string(); // TODO: check it
-	const int32_t sizeleft = HEADER_SIZE - sizeof(headerTxt) - name.size();
-
-	if (sizeleft < 0)
+	struct Context // Buffered output state for one write: pending bytes, the running file offset and write telemetry.
 	{
-		system::IFile::success_t success;;
-		context->writeContext.outputFile->write(success, name.c_str(), context->fileOffset, HEADER_SIZE - sizeof(headerTxt));
-
-		context->fileOffset += success.getBytesProcessed();
-	}
-	else
-	{
-		const char buf[80] = {0};
-
+		IAssetWriter::SAssetWriteContext writeContext;
+		SResolvedFileIOPolicy ioPlan = {};
+		core::vector<uint8_t> ioBuffer = {}; // bytes accepted by write() but not yet sent to outputFile
+		size_t fileOffset = 0ull; // offset in outputFile at which the next flush() starts writing
+		SFileWriteTelemetry writeTelemetry = {};
+		bool flush() // Writes all of ioBuffer at fileOffset; returns false if a write fails or processes zero bytes.
 		{
-			system::IFile::success_t success;;
-			context->writeContext.outputFile->write(success, name.c_str(), context->fileOffset, name.size());
-	
-			context->fileOffset += success.getBytesProcessed();
+			if (ioBuffer.empty())
+				return true;
+			size_t bytesWritten = 0ull;
+			const size_t totalBytes = ioBuffer.size();
+			while (bytesWritten < totalBytes)
+			{
+				system::IFile::success_t success;
+				writeContext.outputFile->write(success, ioBuffer.data() + bytesWritten, fileOffset + bytesWritten, totalBytes - bytesWritten); // resume after the bytes already processed
+				if (!success)
+					return false;
+				const size_t processed = success.getBytesProcessed();
+				if (processed == 0ull) // a write that makes no progress counts as failure, so the loop cannot spin
+					return false;
+				writeTelemetry.account(processed);
+				bytesWritten += processed;
+			}
+			fileOffset += totalBytes; // advanced only once the whole buffer went out
+			ioBuffer.clear();
+			return true;
 		}
-
+		bool write(const void* data, size_t size) // Queues size bytes from data; false on null data with nonzero size or when a flush fails.
 		{
-			system::IFile::success_t success;;
-			context->writeContext.outputFile->write(success, buf, context->fileOffset, sizeleft);
-	
-			context->fileOffset += success.getBytesProcessed();
+			if (!data && size != 0ull)
+				return false;
+			if (size == 0ull)
+				return true;
+			const uint8_t* src = reinterpret_cast<const uint8_t*>(data);
+			switch (ioPlan.strategy)
+			{
+				case SResolvedFileIOPolicy::Strategy::WholeFile: // accumulate everything; this branch never calls flush()
+				{
+					const size_t oldSize = ioBuffer.size();
+					ioBuffer.resize(oldSize + size);
+					std::memcpy(ioBuffer.data() + oldSize, src, size);
+					return true;
+				}
+				case SResolvedFileIOPolicy::Strategy::Chunked:
+				default: // any other strategy value is handled like Chunked
+				{
+					const size_t chunkSize = static_cast<size_t>(ioPlan.chunkSizeBytes());
+					size_t remaining = size;
+					while (remaining > 0ull)
+					{
+						const size_t freeSpace = chunkSize - ioBuffer.size(); // NOTE(review): assumes chunkSize != 0 and ioBuffer.size() <= chunkSize; with chunkSize == 0 this loop would not terminate - TODO confirm chunkSizeBytes() is never 0
+						const size_t toCopy = std::min(freeSpace, remaining);
+						const size_t oldSize = ioBuffer.size();
+						ioBuffer.resize(oldSize + toCopy);
+						std::memcpy(ioBuffer.data() + oldSize, src, toCopy);
+						src += toCopy;
+						remaining -= toCopy;
+						if (ioBuffer.size() == chunkSize && !flush()) // flush as soon as the buffer holds exactly one chunk
+							return false;
+					}
+					return true;
+				}
+			}
 		}
-	}
-
-	uint32_t facenum = 0;
-	for (auto& mb : mesh->getMeshBuffers())
-		facenum += mb->getIndexCount()/3;
+	};
+	struct TriangleData { hlsl::float32_t3 normal = {}; hlsl::float32_t3 vertex1 = {}; hlsl::float32_t3 vertex2 = {}; hlsl::float32_t3 vertex3 = {}; }; // Per-triangle values: one normal and three vertex positions, all value-initialized.
+	static constexpr size_t BinaryHeaderBytes = 80ull;
+	static constexpr size_t BinaryTriangleCountBytes = sizeof(uint32_t);
+	static constexpr size_t BinaryTriangleFloatCount = 12ull;
+	static constexpr size_t BinaryTriangleFloatBytes = sizeof(float) * BinaryTriangleFloatCount;
+	static constexpr size_t BinaryTriangleAttributeBytes = sizeof(uint16_t);
+	static constexpr size_t BinaryTriangleRecordBytes = BinaryTriangleFloatBytes + BinaryTriangleAttributeBytes;
+	static constexpr size_t BinaryPrefixBytes = BinaryHeaderBytes + BinaryTriangleCountBytes;
+	static constexpr size_t IoFallbackReserveBytes = 1ull << 20;
+	static constexpr size_t AsciiFaceTextMaxBytes = 1024ull;
+	static constexpr char AsciiSolidPrefix[] = "solid ";
+	static constexpr char AsciiEndSolidPrefix[] = "endsolid ";
+	static constexpr char AsciiDefaultName[] = "nabla_mesh";
+	static_assert(BinaryTriangleRecordBytes == 50ull);
+	static bool appendLiteral(char*& cursor, char* const end, const char* text, const size_t textSize) // Copies textSize bytes of text to cursor and advances cursor; returns false, writing nothing, when they do not fit before end.
 	{
-		system::IFile::success_t success;;
-		context->writeContext.outputFile->write(success, &facenum, context->fileOffset, sizeof(facenum));
-
-		context->fileOffset += success.getBytesProcessed();
+		if (!cursor || cursor > end || textSize > static_cast<size_t>(end - cursor)) // compare sizes: forming cursor + textSize beyond end would be undefined behaviour
+			return false;
+		std::memcpy(cursor, text, textSize);
+		cursor += textSize;
+		return true;
 	}
-	// write mesh buffers
-
-	for (auto& buffer : mesh->getMeshBuffers())
-	if (buffer)
+	static bool appendVectorAsAsciiLine(char*& cursor, char* const end, const hlsl::float32_t3& v) // Appends v.x, v.y, v.z separated by single spaces and ended by '\n'; on false, cursor may already have advanced.
 	{
-        asset::E_INDEX_TYPE type = buffer->getIndexType();
-		if (!buffer->getIndexBufferBinding().buffer)
-            type = asset::EIT_UNKNOWN;
-
-		if (type== asset::EIT_16BIT)
-            writeFacesBinary<uint16_t>(buffer, false, context->writeContext.outputFile, COLOR_ATTRIBUTE, &context->writeContext, &context->fileOffset);
-		else if (type== asset::EIT_32BIT)
-            writeFacesBinary<uint32_t>(buffer, false, context->writeContext.outputFile, COLOR_ATTRIBUTE, &context->writeContext, &context->fileOffset);
-		else
-            writeFacesBinary<uint16_t>(buffer, true, context->writeContext.outputFile, COLOR_ATTRIBUTE, &context->writeContext, &context->fileOffset); //template param doesn't matter if there's no indices
+		cursor = SGeometryWriterCommon::appendFloatToBuffer(cursor, end, v.x);
+		if (cursor >= end) // one free byte is needed for the separator written next
+			return false;
+		*(cursor++) = ' ';
+		cursor = SGeometryWriterCommon::appendFloatToBuffer(cursor, end, v.y);
+		if (cursor >= end)
+			return false;
+		*(cursor++) = ' ';
+		cursor = SGeometryWriterCommon::appendFloatToBuffer(cursor, end, v.z);
+		if (cursor >= end) // same check, this time for the terminating newline
+			return false;
+		*(cursor++) = '\n';
+		return true;
 	}
-	return true;
-}
-
-bool CSTLMeshWriter::writeMeshASCII(const asset::ICPUMesh* mesh, SContext* context)
-{
-	// write STL MESH header
-    const char headerTxt[] = "Irrlicht-baw Engine ";
-
+	static bool decodeTriangle(const ICPUPolygonGeometry* geom, const IPolygonGeometryBase::IIndexingCallback* indexing, const ICPUPolygonGeometry::SDataView& posView, uint32_t primIx, hlsl::float32_t3& out0, hlsl::float32_t3& out1, hlsl::float32_t3& out2, hlsl::uint32_t3* outIdx) // Resolves the three vertex indices of primitive primIx through indexing and decodes their positions from posView into out0..out2.
 	{
-		system::IFile::success_t success;;
-		context->writeContext.outputFile->write(success, "solid ", context->fileOffset, 6);
-
-		context->fileOffset += success.getBytesProcessed();
+		hlsl::uint32_t3 idx(0u);
+		const auto& indexView = geom->getIndexView(); // NOTE(review): geom and indexing are dereferenced without a null check - presumably guaranteed by callers
+		const void* indexBuffer = indexView ? indexView.getPointer() : nullptr; // without an index view the callback gets a null buffer and index size 0
+		const uint64_t indexSize = indexView ? indexView.composed.getStride() : 0u;
+		IPolygonGeometryBase::IIndexingCallback::SContext<uint32_t> ctx = {.indexBuffer = indexBuffer, .indexSize = indexSize, .beginPrimitive = primIx, .endPrimitive = primIx + 1u, .out = &idx.x};
+		indexing->operator()(ctx);
+		if (outIdx) // optional output; it receives the indices even if position decoding fails below
+			*outIdx = idx;
+		std::array<hlsl::float32_t3, 3> positions = {};
+		if (!decodeIndexedTriple(idx, [&posView](const uint32_t vertexIx, hlsl::float32_t3& out) -> bool { return posView.decodeElement(vertexIx, out); }, positions.data()))
+			return false;
+		out0 = positions[0]; // outputs are written only after all three decodes succeeded
+		out1 = positions[1];
+		out2 = positions[2];
+		return true;
 	}
-
-
+	template<typename DecodeFn, typename T>
+	static bool decodeIndexedTriple(const hlsl::uint32_t3& idx, DecodeFn&& decode, T* out) // Calls decode(index, out[k]) for idx.x, idx.y, idx.z in that order; false if out is null or any call returns false.
 	{
-		system::IFile::success_t success;;
-		context->writeContext.outputFile->write(success, headerTxt, context->fileOffset, sizeof(headerTxt) - 1);
-
-		context->fileOffset += success.getBytesProcessed();
+		return out && decode(idx.x, out[0]) && decode(idx.y, out[1]) && decode(idx.z, out[2]); // && short-circuits, so decoding stops at the first failure
 	}
-
-	const std::string name = context->writeContext.outputFile->getFileName().filename().replace_extension().string();
-
+	static bool decodeTriangleNormal(const ICPUPolygonGeometry::SDataView& normalView, const hlsl::uint32_t3& idx, hlsl::float32_t3& outNormal)
 	{
-		system::IFile::success_t success;;
-		context->writeContext.outputFile->write(success, name.c_str(), context->fileOffset, name.size());
-
-		context->fileOffset += success.getBytesProcessed();
+		if (!normalView)
+			return false;
+		std::array<hlsl::float32_t3, 3> normals = {};
+		if (!decodeIndexedTriple(idx, [&normalView](const uint32_t vertexIx, hlsl::float32_t3& out) -> bool { return normalView.decodeElement(vertexIx, out); }, normals.data()))
+			return false;
+		return selectFirstValidNormal(normals.data(), static_cast<uint32_t>(normals.size()), outNormal);
 	}
-
-
+	static bool selectFirstValidNormal(const hlsl::float32_t3* const normals, const uint32_t count, hlsl::float32_t3& outNormal) // Copies the first of `count` entries whose squared length is > 0 into `outNormal`; returns whether one was found.
 	{
-		system::IFile::success_t success;;
-		context->writeContext.outputFile->write(success, "\n", context->fileOffset, 1);
-
-		context->fileOffset += success.getBytesProcessed();
+		if (!normals || count == 0u) // nothing to scan
+			return false;
+		for (uint32_t i = 0u; i < count; ++i)
+			if (hlsl::dot(normals[i], normals[i]) > 0.f) // a NaN component makes this comparison false, so such entries are skipped as well
+				return outNormal = normals[i], true; // comma expression: assign the result, then yield true
+		return false; // `outNormal` is left untouched when no entry qualifies
 	}
-
-	// write mesh buffers
-	for (auto& buffer : mesh->getMeshBuffers())
-	if (buffer)
+	static void prepareVertices(const hlsl::float32_t3& p0, const hlsl::float32_t3& p1, const hlsl::float32_t3& p2, const bool flipHandedness, hlsl::float32_t3& vertex1, hlsl::float32_t3& vertex2, hlsl::float32_t3& vertex3)
 	{
-        asset::E_INDEX_TYPE type = buffer->getIndexType();
-		if (!buffer->getIndexBufferBinding().buffer)
-            type = asset::EIT_UNKNOWN;
-		const uint32_t indexCount = buffer->getIndexCount();
-		if (type==asset::EIT_16BIT)
-		{
-            //os::Printer::log("Writing mesh with 16bit indices");
-            for (uint32_t j=0; j<indexCount; j+=3)
-            {
-                writeFaceText(
-                    buffer->getPosition(((uint16_t*)buffer->getIndices())[j]),
-                    buffer->getPosition(((uint16_t*)buffer->getIndices())[j+1]),
-                    buffer->getPosition(((uint16_t*)buffer->getIndices())[j+2]),
-					context
-                );
-            }
-		}
-		else if (type==asset::EIT_32BIT)
-		{
-            //os::Printer::log("Writing mesh with 32bit indices");
-            for (uint32_t j=0; j<indexCount; j+=3)
-            {
-                writeFaceText(
-                    buffer->getPosition(((uint32_t*)buffer->getIndices())[j]),
-                    buffer->getPosition(((uint32_t*)buffer->getIndices())[j+1]),
-                    buffer->getPosition(((uint32_t*)buffer->getIndices())[j+2]),
-					context
-                );
-            }
-		}
-		else
-        {
-            //os::Printer::log("Writing mesh with no indices");
-            for (uint32_t j=0; j<indexCount; j+=3)
-            {
-                writeFaceText(
-                    buffer->getPosition(j),
-                    buffer->getPosition(j+1ul),
-                    buffer->getPosition(j+2ul),
-					context
-                );
-            }
-        }
-
+		vertex1 = p2;
+		vertex2 = p1;
+		vertex3 = p0;
+		if (flipHandedness)
 		{
-			system::IFile::success_t success;;
-			context->writeContext.outputFile->write(success, "\n", context->fileOffset, 1);
-	
-			context->fileOffset += success.getBytesProcessed();
+			vertex1.x = -vertex1.x;
+			vertex2.x = -vertex2.x;
+			vertex3.x = -vertex3.x;
 		}
 	}
-
+	static hlsl::float32_t3 computePlaneNormal(const hlsl::float32_t3& vertex1, const hlsl::float32_t3& vertex2, const hlsl::float32_t3& vertex3, float* const planeNormalLen2 = nullptr) // Cross-product normal of the triangle; the return value is raw or normalized depending on `planeNormalLen2` (see below).
 	{
-		system::IFile::success_t success;;
-		context->writeContext.outputFile->write(success, "endsolid ", context->fileOffset, 9);
-
-		context->fileOffset += success.getBytesProcessed();
+		const hlsl::float32_t3 planeNormal = hlsl::cross(vertex2 - vertex1, vertex3 - vertex1);
+		const float len2 = hlsl::dot(planeNormal, planeNormal); // squared length of the raw cross product
+		if (planeNormalLen2)
+			return *planeNormalLen2 = len2, planeNormal; // out-pointer given: report the squared length and return the UN-normalized vector; the caller normalizes if needed
+		return len2 > 0.f ? hlsl::normalize(planeNormal) : hlsl::float32_t3(0.f, 0.f, 0.f); // no out-pointer: return a unit normal, or the zero vector for a zero-length cross product
 	}
-
+	static hlsl::float32_t3 resolveTriangleNormal(const hlsl::float32_t3& planeNormal, const float planeNormalLen2, const hlsl::float32_t3* const attrNormals, const uint32_t attrNormalCount, const bool flipHandedness, const bool alignToPlane) // Picks the facet normal: the first non-zero attribute normal if any, otherwise the normalized plane normal.
 	{
-		system::IFile::success_t success;;
-		context->writeContext.outputFile->write(success, headerTxt, context->fileOffset, sizeof(headerTxt) - 1);
-
-		context->fileOffset += success.getBytesProcessed();
+		hlsl::float32_t3 attrNormal = {};
+		if (selectFirstValidNormal(attrNormals, attrNormalCount, attrNormal))
+		{
+			if (flipHandedness) // mirror X, matching the X negation prepareVertices applies to positions
+				attrNormal.x = -attrNormal.x;
+			if (alignToPlane && planeNormalLen2 > 0.f && hlsl::dot(attrNormal, planeNormal) < 0.f) // attribute normal points away from the geometric normal: flip it
+				attrNormal = -attrNormal;
+			return attrNormal; // returned with its stored length; no renormalization happens here
+		}
+		return planeNormalLen2 > 0.f ? hlsl::normalize(planeNormal) : hlsl::float32_t3(0.f, 0.f, 0.f); // fallback: unit plane normal, or zero for a degenerate triangle
 	}
-
+	static void buildTriangle(const hlsl::float32_t3& p0, const hlsl::float32_t3& p1, const hlsl::float32_t3& p2, const hlsl::float32_t3* const attrNormals, const uint32_t attrNormalCount, const bool flipHandedness, const bool alignToPlane, TriangleData& triangle) // Fills `triangle` with the output vertices and the resolved facet normal for one input triangle.
 	{
-		system::IFile::success_t success;;
-		context->writeContext.outputFile->write(success, name.c_str(), context->fileOffset, name.size());
-
-		context->fileOffset += success.getBytesProcessed();
+		prepareVertices(p0, p1, p2, flipHandedness, triangle.vertex1, triangle.vertex2, triangle.vertex3); // prepareVertices stores the corners in reverse order (vertex1 = p2 ... vertex3 = p0) and negates X when flipping
+		float planeNormalLen2 = 0.f;
+		const hlsl::float32_t3 planeNormal = computePlaneNormal(triangle.vertex1, triangle.vertex2, triangle.vertex3, &planeNormalLen2); // out-pointer form: `planeNormal` is not normalized here
+		triangle.normal = resolveTriangleNormal(planeNormal, planeNormalLen2, attrNormals, attrNormalCount, flipHandedness, alignToPlane);
 	}
-
-	return true;
-}
-
-void CSTLMeshWriter::getVectorAsStringLine(const core::vectorSIMDf& v, std::string& s) const
-{
-    std::ostringstream tmp;
-    tmp << v.X << " " << v.Y << " " << v.Z << "\n";
-    s = std::string(tmp.str().c_str());
-}
-
-void CSTLMeshWriter::writeFaceText(
-		const core::vectorSIMDf& v1,
-		const core::vectorSIMDf& v2,
-		const core::vectorSIMDf& v3,
-		SContext* context)
-{
-	core::vectorSIMDf vertex1 = v3;
-	core::vectorSIMDf vertex2 = v2;
-	core::vectorSIMDf vertex3 = v1;
-	core::vectorSIMDf normal = core::plane3dSIMDf(vertex1, vertex2, vertex3).getNormal();
-	std::string tmp;
-
-	auto flipVectors = [&]()
-	{
-		vertex1.X = -vertex1.X;
-		vertex2.X = -vertex2.X;
-		vertex3.X = -vertex3.X;
-		normal = core::plane3dSIMDf(vertex1, vertex2, vertex3).getNormal();
-	};
-	
-	if (!(context->writeContext.params.flags & E_WRITER_FLAGS::EWF_MESH_IS_RIGHT_HANDED))
-		flipVectors();
-	
-	{
-		system::IFile::success_t success;;
-		context->writeContext.outputFile->write(success, "facet normal ", context->fileOffset, 13);
-
-		context->fileOffset += success.getBytesProcessed();
-	}
-
-	getVectorAsStringLine(normal, tmp);
-
+	static double normalizeColorComponentToUnit(double value) // Maps one decoded color channel into [0, 1].
 	{
-		system::IFile::success_t success;;
-		context->writeContext.outputFile->write(success, tmp.c_str(), context->fileOffset, tmp.size());
-
-		context->fileOffset += success.getBytesProcessed();
+		if (!std::isfinite(value)) // NaN and +/-infinity become 0
+			return 0.0;
+		if (value > 1.0) // heuristic: anything above 1 is divided by 255 -- NOTE(review): presumably treats it as a 0..255 byte value; confirm against the formats callers pass
+			value /= 255.0;
+		return std::clamp(value, 0.0, 1.0); // negatives clamp to 0; scaled values still above 1 clamp to 1
 	}
-
+	struct PackedColor
 	{
-		system::IFile::success_t success;;
-		context->writeContext.outputFile->write(success, "  outer loop\n", context->fileOffset, 13);
-
-		context->fileOffset += success.getBytesProcessed();
-	}
-
+		uint32_t value = 0u;
+		E_FORMAT format = EF_B8G8R8A8_UNORM;
+	};
+	static uint16_t packViscamColorFromB8G8R8A8(const uint32_t color) // Converts a packed B8G8R8A8 UNORM color into a 16-bit A1R5G5B5 value with the top bit forced on.
 	{
-		system::IFile::success_t success;;
-		context->writeContext.outputFile->write(success, "    vertex ", context->fileOffset, 11);
-
-		context->fileOffset += success.getBytesProcessed();
+		const void* src[4] = {&color, nullptr, nullptr, nullptr}; // only the first source pointer is populated
+		uint16_t packed = 0u;
+		convertColor<EF_B8G8R8A8_UNORM, EF_A1R5G5B5_UNORM_PACK16>(src, &packed, 0u, 0u);
+		return packed | 0x8000u; // bit 15 is always set regardless of source alpha -- NOTE(review): presumably the "color valid" flag of the VisCAM STL attribute convention; confirm
 	}
-
-	getVectorAsStringLine(vertex1, tmp);
-
+	static const ICPUPolygonGeometry::SDataView* getColorView(const ICPUPolygonGeometry* geom, const size_t vertexCount)
 	{
-		system::IFile::success_t success;;
-		context->writeContext.outputFile->write(success, tmp.c_str(), context->fileOffset, tmp.size());
-
-		context->fileOffset += success.getBytesProcessed();
+		const auto* view = SGeometryWriterCommon::getAuxViewAt(geom, SSTLPolygonGeometryAuxLayout::COLOR0, vertexCount);
+		return view && getFormatChannelCount(view->composed.format) >= 3u ? view : nullptr;
 	}
-
+	static bool decodeColorB8G8R8A8(const ICPUPolygonGeometry::SDataView& colorView, const uint32_t ix, PackedColor& outColor)
 	{
-		system::IFile::success_t success;;
-		context->writeContext.outputFile->write(success, "    vertex ", context->fileOffset, 11);
-
-		context->fileOffset += success.getBytesProcessed();
+		if ((colorView.composed.format == EF_B8G8R8A8_UNORM || colorView.composed.format == EF_B8G8R8A8_SRGB) && colorView.composed.getStride() == sizeof(uint32_t))
+		{
+			const auto* const ptr = reinterpret_cast<const uint8_t*>(colorView.getPointer());
+			if (!ptr)
+				return false;
+			std::memcpy(&outColor.value, ptr + static_cast<size_t>(ix) * sizeof(uint32_t), sizeof(outColor.value));
+			outColor.format = colorView.composed.format;
+			return true;
+		}
+		hlsl::float32_t4 decoded = {};
+		if (!colorView.decodeElement(ix, decoded))
+			return false;
+		const double rgbaUnit[4] = {normalizeColorComponentToUnit(decoded.x), normalizeColorComponentToUnit(decoded.y), normalizeColorComponentToUnit(decoded.z), getFormatChannelCount(colorView.composed.format) >= 4u ? normalizeColorComponentToUnit(decoded.w) : 1.0};
+		encodePixels<EF_B8G8R8A8_UNORM, double>(&outColor.value, rgbaUnit);
+		outColor.format = EF_B8G8R8A8_UNORM;
+		return true;
 	}
-
-	getVectorAsStringLine(vertex2, tmp);
-
+	static void decodeColorUnitRGBAFromB8G8R8A8(const PackedColor& color, double* const outRGBA)
 	{
-		system::IFile::success_t success;;
-		context->writeContext.outputFile->write(success, tmp.c_str(), context->fileOffset, tmp.size());
-
-		context->fileOffset += success.getBytesProcessed();
+		const void* src[4] = {&color.value, nullptr, nullptr, nullptr};
+		if (color.format == EF_B8G8R8A8_SRGB)
+			decodePixels<EF_B8G8R8A8_SRGB, double>(src, outRGBA, 0u, 0u);
+		else
+			decodePixels<EF_B8G8R8A8_UNORM, double>(src, outRGBA, 0u, 0u);
 	}
-
+	static bool writeMeshBinary(const asset::ICPUPolygonGeometry* geom, Context* context)
 	{
-		system::IFile::success_t success;;
-		context->writeContext.outputFile->write(success, "    vertex ", context->fileOffset, 11);
-
-		context->fileOffset += success.getBytesProcessed();
+		if (!geom || !context || !context->writeContext.outputFile)
+			return false;
+		const auto& posView = geom->getPositionView();
+		if (!posView)
+			return false;
+		const bool flipHandedness = !context->writeContext.params.flags.hasAnyFlag(E_WRITER_FLAGS::EWF_MESH_IS_RIGHT_HANDED);
+		const size_t vertexCount = posView.getElementCount();
+		if (vertexCount == 0ull)
+			return false;
+		size_t faceCount = 0ull;
+		if (!SGeometryWriterCommon::getTriangleFaceCount(geom, faceCount))
+			return false;
+		if (faceCount > static_cast<size_t>(std::numeric_limits<uint32_t>::max()))
+			return false;
+		const uint32_t facenum = static_cast<uint32_t>(faceCount);
+		const size_t outputSize = BinaryPrefixBytes + static_cast<size_t>(facenum) * BinaryTriangleRecordBytes;
+		std::unique_ptr<uint8_t[]> output(new (std::nothrow) uint8_t[outputSize]);
+		if (!output)
+			return false;
+		uint8_t* dst = output.get();
+		std::memset(dst, 0, BinaryHeaderBytes);
+		dst += BinaryHeaderBytes;
+		std::memcpy(dst, &facenum, sizeof(facenum));
+		dst += sizeof(facenum);
+		const auto& normalView = geom->getNormalView();
+		const bool hasNormals = static_cast<bool>(normalView);
+		const auto* const colorView = getColorView(geom, vertexCount);
+		const hlsl::float32_t3* const tightPositions = SGeometryWriterCommon::getTightView<hlsl::float32_t3, EF_R32G32B32_SFLOAT>(posView);
+		const hlsl::float32_t3* const tightNormals = hasNormals ? SGeometryWriterCommon::getTightView<hlsl::float32_t3, EF_R32G32B32_SFLOAT>(normalView) : nullptr;
+		const bool hasFastTightPath = !geom->getIndexView() && tightPositions && (!hasNormals || tightNormals);
+		const float handednessSign = flipHandedness ? -1.f : 1.f;
+		auto decodePosition = [&](const uint32_t ix, hlsl::float32_t3& out) -> bool { return tightPositions ? (out = tightPositions[ix], true) : posView.decodeElement(ix, out); };
+		auto decodeNormal = [&](const uint32_t ix, hlsl::float32_t3& out) -> bool { return hasNormals && (tightNormals ? (out = tightNormals[ix], true) : normalView.decodeElement(ix, out)); };
+		auto computeFaceColor = [&](const hlsl::uint32_t3& idx, uint16_t& outColor) -> bool {
+			outColor = 0u;
+			if (!colorView)
+				return true;
+			const std::array<uint32_t, 3> vertexIx = {idx.x, idx.y, idx.z};
+			std::array<double, 4> rgbaAvg = {};
+			for (uint32_t corner = 0u; corner < vertexIx.size(); ++corner)
+			{
+				PackedColor color = {};
+				if (!decodeColorB8G8R8A8(*colorView, vertexIx[corner], color))
+					return false;
+				std::array<double, 4> rgba = {};
+				decodeColorUnitRGBAFromB8G8R8A8(color, rgba.data());
+				rgbaAvg[0] += rgba[0];
+				rgbaAvg[1] += rgba[1];
+				rgbaAvg[2] += rgba[2];
+			}
+			rgbaAvg[0] /= 3.0;
+			rgbaAvg[1] /= 3.0;
+			rgbaAvg[2] /= 3.0;
+			rgbaAvg[3] = 1.0;
+			uint32_t avgColor = 0u;
+			encodePixels<EF_B8G8R8A8_UNORM, double>(&avgColor, rgbaAvg.data());
+			outColor = packViscamColorFromB8G8R8A8(avgColor);
+			return true;
+		};
+		auto writeRecord = [&dst](const hlsl::float32_t3& normal, const hlsl::float32_t3& vertex1, const hlsl::float32_t3& vertex2, const hlsl::float32_t3& vertex3, const uint16_t attribute) -> void {
+			const float payload[BinaryTriangleFloatCount] = {normal.x, normal.y, normal.z, vertex1.x, vertex1.y, vertex1.z, vertex2.x, vertex2.y, vertex2.z, vertex3.x, vertex3.y, vertex3.z};
+			std::memcpy(dst, payload, BinaryTriangleFloatBytes);
+			dst += BinaryTriangleFloatBytes;
+			std::memcpy(dst, &attribute, BinaryTriangleAttributeBytes);
+			dst += BinaryTriangleAttributeBytes;
+		};
+		auto emitTriangle = [&](const hlsl::float32_t3& p0, const hlsl::float32_t3& p1, const hlsl::float32_t3& p2, const hlsl::uint32_t3& idx, const hlsl::float32_t3* const attrNormals, const uint32_t attrNormalCount, const bool alignToPlane) -> bool {
+			uint16_t faceColor = 0u;
+			if (!computeFaceColor(idx, faceColor))
+				return false;
+			TriangleData triangle = {};
+			buildTriangle(p0, p1, p2, attrNormals, attrNormalCount, flipHandedness, alignToPlane, triangle);
+			writeRecord(triangle.normal, triangle.vertex1, triangle.vertex2, triangle.vertex3, faceColor);
+			return true;
+		};
+		if (hasFastTightPath && hasNormals)
+		{
+			const hlsl::float32_t3* posTri = tightPositions;
+			const hlsl::float32_t3* nrmTri = tightNormals;
+			bool allFastNormalsNonZero = true;
+			for (size_t i = 0ull, normalCount = static_cast<size_t>(facenum) * 3ull; i < normalCount; ++i)
+			{
+				const auto& n = tightNormals[i];
+				if (n.x == 0.f && n.y == 0.f && n.z == 0.f)
+				{
+					allFastNormalsNonZero = false;
+					break;
+				}
+			}
+			for (uint32_t primIx = 0u; primIx < facenum; ++primIx, posTri += 3u, nrmTri += 3u)
+			{
+				const hlsl::uint32_t3 idx(primIx * 3u + 0u, primIx * 3u + 1u, primIx * 3u + 2u);
+				uint16_t faceColor = 0u;
+				if (!computeFaceColor(idx, faceColor))
+					return false;
+				hlsl::float32_t3 vertex1 = posTri[2u];
+				hlsl::float32_t3 vertex2 = posTri[1u];
+				hlsl::float32_t3 vertex3 = posTri[0u];
+				vertex1.x *= handednessSign;
+				vertex2.x *= handednessSign;
+				vertex3.x *= handednessSign;
+				hlsl::float32_t3 normal = {};
+				if (allFastNormalsNonZero)
+				{
+					normal = nrmTri[0u];
+					if (flipHandedness)
+						normal.x = -normal.x;
+				}
+				else if (selectFirstValidNormal(nrmTri, 3u, normal))
+				{
+					if (flipHandedness)
+						normal.x = -normal.x;
+				}
+				else
+				{
+					float planeNormalLen2 = 0.f;
+					const hlsl::float32_t3 planeNormal = computePlaneNormal(vertex1, vertex2, vertex3, &planeNormalLen2);
+					normal = planeNormalLen2 > 0.f ? hlsl::normalize(planeNormal) : hlsl::float32_t3(0.f, 0.f, 0.f);
+				}
+				writeRecord(normal, vertex1, vertex2, vertex3, faceColor);
+			}
+		}
+		else if (hasFastTightPath)
+		{
+			const hlsl::float32_t3* posTri = tightPositions;
+			for (uint32_t primIx = 0u; primIx < facenum; ++primIx, posTri += 3u)
+			{
+				const hlsl::uint32_t3 idx(primIx * 3u + 0u, primIx * 3u + 1u, primIx * 3u + 2u);
+				uint16_t faceColor = 0u;
+				if (!computeFaceColor(idx, faceColor))
+					return false;
+				hlsl::float32_t3 vertex1 = posTri[2u];
+				hlsl::float32_t3 vertex2 = posTri[1u];
+				hlsl::float32_t3 vertex3 = posTri[0u];
+				vertex1.x *= handednessSign;
+				vertex2.x *= handednessSign;
+				vertex3.x *= handednessSign;
+				float planeNormalLen2 = 0.f;
+				const hlsl::float32_t3 planeNormal = computePlaneNormal(vertex1, vertex2, vertex3, &planeNormalLen2);
+				const hlsl::float32_t3 normal = planeNormalLen2 > 0.f ? hlsl::normalize(planeNormal) : hlsl::float32_t3(0.f, 0.f, 0.f);
+				writeRecord(normal, vertex1, vertex2, vertex3, faceColor);
+			}
+		}
+		else if (!SGeometryWriterCommon::visitTriangleIndices(geom, [&](const uint32_t i0, const uint32_t i1, const uint32_t i2) -> bool {
+			const hlsl::uint32_t3 idx(i0, i1, i2);
+			std::array<hlsl::float32_t3, 3> positions = {};
+			if (!decodeIndexedTriple(idx, decodePosition, positions.data()))
+				return false;
+			std::array<hlsl::float32_t3, 3> normals = {};
+			if (hasNormals && !decodeIndexedTriple(idx, decodeNormal, normals.data()))
+				return false;
+			return emitTriangle(positions[0], positions[1], positions[2], idx, hasNormals ? normals.data() : nullptr, hasNormals ? 3u : 0u, true);
+		}))
+			return false;
+		const bool writeOk = SInterchangeIO::writeFileWithPolicy(context->writeContext.outputFile, context->ioPlan, output.get(), outputSize, &context->writeTelemetry);
+		if (writeOk)
+			context->fileOffset += outputSize;
+		return writeOk;
 	}
-
-	getVectorAsStringLine(vertex3, tmp);
-
+	static bool writeMeshASCII(const asset::ICPUPolygonGeometry* geom, Context* context) // Emits `geom` as ASCII STL text through `context`; returns false on invalid input or at the first failed decode/write.
 	{
-		system::IFile::success_t success;;
-		context->writeContext.outputFile->write(success, tmp.c_str(), context->fileOffset, tmp.size());
-
-		context->fileOffset += success.getBytesProcessed();
+		if (!geom || !context || !context->writeContext.outputFile) // same guards as writeMeshBinary: `context` and its output file are dereferenced below
+			return false;
+		const auto* indexing = geom->getIndexingCallback();
+		if (!indexing || indexing->degree() != 3u) // only triangle primitives can be written
+			return false;
+		const auto& posView = geom->getPositionView();
+		if (!posView)
+			return false;
+		const auto& normalView = geom->getNormalView();
+		const bool flipHandedness = !context->writeContext.params.flags.hasAnyFlag(E_WRITER_FLAGS::EWF_MESH_IS_RIGHT_HANDED);
+		const std::string name = context->writeContext.outputFile->getFileName().filename().replace_extension().string(); // solid name = output file name without its extension
+		const std::string_view solidName = name.empty() ? std::string_view(AsciiDefaultName) : std::string_view(name);
+		if (!context->write(AsciiSolidPrefix, sizeof(AsciiSolidPrefix) - 1ull) || !context->write(solidName.data(), solidName.size()) || !context->write("\n", sizeof("\n") - 1ull))
+			return false;
+		const uint32_t faceCount = static_cast<uint32_t>(geom->getPrimitiveCount());
+		for (uint32_t primIx = 0u; primIx < faceCount; ++primIx)
+		{
+			hlsl::float32_t3 v0 = {};
+			hlsl::float32_t3 v1 = {};
+			hlsl::float32_t3 v2 = {};
+			hlsl::uint32_t3 idx(0u);
+			if (!decodeTriangle(geom, indexing, posView, primIx, v0, v1, v2, &idx))
+				return false;
+			if (!writeFaceText(v0, v1, v2, idx, normalView, flipHandedness, context))
+				return false;
+			if (!context->write("\n", sizeof("\n") - 1ull)) // one extra newline after every facet record
+				return false;
+		}
+		return context->write(AsciiEndSolidPrefix, sizeof(AsciiEndSolidPrefix) - 1ull) && context->write(solidName.data(), solidName.size()); // footer repeats the solid name; no trailing newline is written
 	}
-
+	static bool writeFaceText(const hlsl::float32_t3& v1, const hlsl::float32_t3& v2, const hlsl::float32_t3& v3, const hlsl::uint32_t3& idx, const asset::ICPUPolygonGeometry::SDataView& normalView, const bool flipHandedness, Context* context)
 	{
-		system::IFile::success_t success;;
-		context->writeContext.outputFile->write(success, "  endloop\n", context->fileOffset, 10);
-
-		context->fileOffset += success.getBytesProcessed();
+		hlsl::float32_t3 attrNormal = {};
+		TriangleData triangle = {};
+		const hlsl::float32_t3* const attrNormalPtr = decodeTriangleNormal(normalView, idx, attrNormal) ? &attrNormal : nullptr;
+		buildTriangle(v1, v2, v3, attrNormalPtr, attrNormalPtr ? 1u : 0u, flipHandedness, true, triangle);
+		std::array<char, AsciiFaceTextMaxBytes> faceText = {};
+		char* cursor = faceText.data();
+		char* const end = faceText.data() + faceText.size();
+		const std::array vertices = {triangle.vertex1, triangle.vertex2, triangle.vertex3};
+		if (!appendLiteral(cursor, end, "facet normal ", sizeof("facet normal ") - 1ull))
+			return false;
+		if (!appendVectorAsAsciiLine(cursor, end, triangle.normal))
+			return false;
+		if (!appendLiteral(cursor, end, "  outer loop\n", sizeof("  outer loop\n") - 1ull))
+			return false;
+		for (const auto& vertex : vertices)
+		{
+			if (!appendLiteral(cursor, end, "    vertex ", sizeof("    vertex ") - 1ull))
+				return false;
+			if (!appendVectorAsAsciiLine(cursor, end, vertex))
+				return false;
+		}
+		if (!appendLiteral(cursor, end, "  endloop\n", sizeof("  endloop\n") - 1ull))
+			return false;
+		if (!appendLiteral(cursor, end, "endfacet\n", sizeof("endfacet\n") - 1ull))
+			return false;
+		return context->write(faceText.data(), static_cast<size_t>(cursor - faceText.data()));
 	}
-
+};
+}
+CSTLMeshWriter::CSTLMeshWriter()
+{
+	#ifdef _NBL_DEBUG
+	setDebugName("CSTLMeshWriter");
+	#endif
+}
+CSTLMeshWriter::~CSTLMeshWriter()
+{
+}
+const char** CSTLMeshWriter::getAssociatedFileExtensions() const
+{
+	static const char* ext[] = { "stl", nullptr };
+	return ext;
+}
+writer_flags_t CSTLMeshWriter::getSupportedFlags()
+{
+	return writer_flags_t(asset::EWF_BINARY | asset::EWF_MESH_IS_RIGHT_HANDED);
+}
+writer_flags_t CSTLMeshWriter::getForcedFlags()
+{
+	return EWF_NONE;
+}
+bool CSTLMeshWriter::writeAsset(system::IFile* _file, const SAssetWriteParams& _params, IAssetWriterOverride* _override) // Writes the root asset of `_params` (must be an ICPUPolygonGeometry) as binary or ASCII STL; returns false on any failure.
+{
+	using Context = Parse::Context;
+	if (!_override)
+		getDefaultOverride(_override); // presumably fills `_override` with the writer's default override -- confirm in IAssetWriter
+	IAssetWriter::SAssetWriteContext inCtx{_params, _file};
+	const asset::ICPUPolygonGeometry* geom = IAsset::castDown<const asset::ICPUPolygonGeometry>(_params.rootAsset);
+	if (!geom) // root asset is not a polygon geometry
+		return false;
+	system::IFile* file = _override->getOutputFile(_file, inCtx, {geom, 0u}); // the override may substitute a different output file
+	if (!file)
+		return false;
+	Context context = {IAssetWriter::SAssetWriteContext{inCtx.params, file}};
+	_params.logger.log("WRITING STL: writing the file %s", system::ILogger::ELL_INFO, file->getFileName().string().c_str());
+	const auto flags = _override->getAssetWritingFlags(context.writeContext, geom, 0u);
+	const bool binary = flags.hasAnyFlag(asset::EWF_BINARY);
+	uint64_t expectedSize = 0ull;
+	bool sizeKnown = false; // the output size is only computed up front for the binary layout
+	if (binary)
 	{
-		system::IFile::success_t success;;
-		context->writeContext.outputFile->write(success, "endfacet\n", context->fileOffset, 9);
-
-		context->fileOffset += success.getBytesProcessed();
+		expectedSize = Parse::BinaryPrefixBytes + static_cast<uint64_t>(geom->getPrimitiveCount()) * Parse::BinaryTriangleRecordBytes; // NOTE(review): writeMeshBinary sizes its buffer from SGeometryWriterCommon::getTriangleFaceCount -- confirm both counts always agree
+		sizeKnown = true;
 	}
+	context.ioPlan = impl::SFileAccess::resolvePlan(_params.ioPolicy, expectedSize, sizeKnown, file);
+	if (impl::SFileAccess::logInvalidPlan(_params.logger, "STL writer", file->getFileName().string().c_str(), context.ioPlan)) // a true return aborts the write
+		return false;
+	context.ioBuffer.reserve(static_cast<size_t>(context.ioPlan.strategy == SResolvedFileIOPolicy::Strategy::WholeFile && sizeKnown ? expectedSize : std::min<uint64_t>(context.ioPlan.chunkSizeBytes(), Parse::IoFallbackReserveBytes))); // whole-file plan with known size: reserve it all; otherwise reserve one chunk capped at IoFallbackReserveBytes
+	const bool written = binary ? Parse::writeMeshBinary(geom, &context) : Parse::writeMeshASCII(geom, &context);
+	if (!written)
+		return false;
+	if (!context.flush()) // presumably pushes any data still buffered in `context` to the file -- confirm in Parse::Context
+		return false;
+	const uint64_t ioMinWrite = context.writeTelemetry.getMinOrZero();
+	const uint64_t ioAvgWrite = context.writeTelemetry.getAvgOrZero();
+	impl::SFileAccess::logTinyIO(_params.logger, "STL writer", file->getFileName().string().c_str(), context.writeTelemetry, context.fileOffset, _params.ioPolicy, "writes");
+	_params.logger.log("STL writer stats: file=%s bytes=%llu binary=%d io_writes=%llu io_min_write=%llu io_avg_write=%llu io_req=%s io_eff=%s io_chunk=%llu io_reason=%s",
+		system::ILogger::ELL_PERFORMANCE, file->getFileName().string().c_str(), static_cast<unsigned long long>(context.fileOffset), binary ? 1 : 0,
+		static_cast<unsigned long long>(context.writeTelemetry.callCount), static_cast<unsigned long long>(ioMinWrite), static_cast<unsigned long long>(ioAvgWrite),
+		system::to_string(_params.ioPolicy.strategy).c_str(), system::to_string(context.ioPlan.strategy).c_str(), static_cast<unsigned long long>(context.ioPlan.chunkSizeBytes()), context.ioPlan.reason);
+	return true;
+}
 }
-
 #endif
diff --git a/src/nbl/asset/interchange/CSTLMeshWriter.h b/src/nbl/asset/interchange/CSTLMeshWriter.h
index a25a84534c..e06e5c5b65 100644
--- a/src/nbl/asset/interchange/CSTLMeshWriter.h
+++ b/src/nbl/asset/interchange/CSTLMeshWriter.h
@@ -1,59 +1,25 @@
-// Copyright (C) 2019-2025 - DevSH Graphics Programming Sp. z O.O.
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine" and was originally part of the "Irrlicht Engine"
 // For conditions of distribution and use, see copyright notice in nabla.h
 // See the original file in irrlicht source for authors
 #ifndef _NBL_ASSET_STL_MESH_WRITER_H_INCLUDED_
 #define _NBL_ASSET_STL_MESH_WRITER_H_INCLUDED_
-
-
-#include "nbl/asset/ICPUPolygonGeometry.h"
 #include "nbl/asset/interchange/IGeometryWriter.h"
-
-
 namespace nbl::asset
 {
-
-//! class to write meshes, implementing a STL writer
+//! Geometry writer capable of emitting STL mesh files.
 class CSTLMeshWriter : public IGeometryWriter
 {
-    protected:
-        virtual ~CSTLMeshWriter();
-
     public:
         CSTLMeshWriter();
+        ~CSTLMeshWriter() override;
 
-        virtual const char** getAssociatedFileExtensions() const
-        {
-            static const char* ext[]{ "stl", nullptr };
-            return ext;
-        }
-
-        virtual uint32_t getSupportedFlags() override { return asset::EWF_BINARY; }
+        const char** getAssociatedFileExtensions() const override;
 
-        virtual uint32_t getForcedFlags() { return 0u; }
+        writer_flags_t getSupportedFlags() override;
+        writer_flags_t getForcedFlags() override;
 
-        virtual bool writeAsset(system::IFile* _file, const SAssetWriteParams& _params, IAssetWriterOverride* _override = nullptr) override;
-
-    private:
-
-        struct SContext
-        {
-            SAssetWriteContext writeContext;
-            size_t fileOffset;
-        };
-
-        // write binary format
-        bool writeMeshBinary(const ICPUPolygonGeometry* geom, SContext* context);
-
-        // write text format
-        bool writeMeshASCII(const ICPUPolygonGeometry* geom, SContext* context);
-
-        // create vector output with line end into string
-        void getVectorAsStringLine(const core::vectorSIMDf& v, std::string& s) const;
-
-        // write face information to file
-        void writeFaceText(const core::vectorSIMDf& v1, const core::vectorSIMDf& v2, const core::vectorSIMDf& v3, SContext* context);
+        bool writeAsset(system::IFile* _file, const SAssetWriteParams& _params, IAssetWriterOverride* _override = nullptr) override;
 };
-
 } // end namespace
 #endif
diff --git a/src/nbl/asset/interchange/IAssetLoader.cpp b/src/nbl/asset/interchange/IAssetLoader.cpp
index 4a9a8f0378..98f579257d 100644
--- a/src/nbl/asset/interchange/IAssetLoader.cpp
+++ b/src/nbl/asset/interchange/IAssetLoader.cpp
@@ -16,8 +16,8 @@ IAssetLoader::IAssetLoaderOverride::IAssetLoaderOverride(SCreationParams&& param
 
 SAssetBundle IAssetLoader::IAssetLoaderOverride::findCachedAsset(const std::string& inSearchKey, const IAsset::E_TYPE* inAssetTypes, const SAssetLoadContext& ctx, const uint32_t hierarchyLevel)
 {
-    auto levelFlag = ctx.params.cacheFlags >> (uint64_t(hierarchyLevel) * 2ull);
-    if ((levelFlag & ECF_DUPLICATE_TOP_LEVEL) == ECF_DUPLICATE_TOP_LEVEL)
+    const auto levelFlags = caching_flags_t(static_cast<uint64_t>(ctx.params.cacheFlags.value) >> (uint64_t(hierarchyLevel) * 2ull));
+    if (levelFlags.hasFlags(ECF_DUPLICATE_TOP_LEVEL))
         return {};
 
     auto found = getManager()->findAssets(inSearchKey, inAssetTypes);
@@ -30,8 +30,8 @@ void IAssetLoader::IAssetLoaderOverride::insertAssetIntoCache(SAssetBundle& asse
 {
 	getManager()->changeAssetKey(asset, supposedKey);
 
-    auto levelFlag = _params.cacheFlags >> (uint64_t(hierarchyLevel) * 2ull);
-    if (!(levelFlag&ECF_DONT_CACHE_TOP_LEVEL))
+    const auto levelFlags = caching_flags_t(static_cast<uint64_t>(_params.cacheFlags.value) >> (uint64_t(hierarchyLevel) * 2ull));
+    if (!levelFlags.hasAnyFlag(ECF_DONT_CACHE_TOP_LEVEL))
 		getManager()->insertAssetIntoCache(asset,ASSET_MUTABILITY_ON_CACHE_INSERT);
 }
 
@@ -126,4 +126,4 @@ smart_refctd_ptr<ICPUImageView> IAssetLoader::createDefaultImageView(core::smart
 		.viewType = viewType,
 		.format = imageParams.format
 	});
-}
\ No newline at end of file
+}
diff --git a/src/nbl/asset/interchange/IGeometryWriter.cpp b/src/nbl/asset/interchange/IGeometryWriter.cpp
new file mode 100644
index 0000000000..795241e539
--- /dev/null
+++ b/src/nbl/asset/interchange/IGeometryWriter.cpp
@@ -0,0 +1,8 @@
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#include "nbl/asset/interchange/IGeometryWriter.h"
+namespace nbl::asset
+{
+IGeometryWriter::~IGeometryWriter() = default;
+}
diff --git a/src/nbl/asset/interchange/SGeometryViewDecode.h b/src/nbl/asset/interchange/SGeometryViewDecode.h
new file mode 100644
index 0000000000..3e8d72adba
--- /dev/null
+++ b/src/nbl/asset/interchange/SGeometryViewDecode.h
@@ -0,0 +1,113 @@
+// Internal src-only header. Do not include from public headers.
+#ifndef _NBL_ASSET_S_GEOMETRY_VIEW_DECODE_H_INCLUDED_
+#define _NBL_ASSET_S_GEOMETRY_VIEW_DECODE_H_INCLUDED_
+#include "nbl/asset/ICPUPolygonGeometry.h"
+#include "nbl/asset/format/decodePixels.h"
+#include "nbl/builtin/hlsl/concepts.hlsl"
+#include "nbl/builtin/hlsl/vector_utils/vector_traits.hlsl"
+#include <algorithm>
+#include <array>
+#include <type_traits>
+namespace nbl::asset
+{
+//! Shared decode helper for geometry `SDataView` read paths used by writers.
+class SGeometryViewDecode
+{
+	public:
+		//! Chooses the value domain that decoded components are reported in.
+		enum class EMode : uint8_t
+		{
+			Semantic, //!< Normalized formats decoded to floating-point outputs are remapped through the view's range.
+			Stored //!< Components are passed through as `decodePixels` produced them, with no range remap.
+		};
+
+		//! Per-view decode parameters, gathered once so that per-element calls do not re-query the view.
+		template<EMode Mode>
+		struct Prepared
+		{
+			const uint8_t* data = nullptr; //!< Start of the view payload.
+			uint32_t stride = 0u; //!< Byte distance from one element to the next.
+			E_FORMAT format = EF_UNKNOWN; //!< Format handed to `decodePixels`.
+			uint32_t channels = 0u; //!< Channel count of `format`.
+			bool normalized = false; //!< Set by `prepare` only in semantic mode, when `format` is normalized.
+
+			//! Range used to remap normalized components in semantic mode.
+			hlsl::shapes::AABB<4, hlsl::float64_t> range = hlsl::shapes::AABB<4, hlsl::float64_t>::create();
+			inline explicit operator bool() const { return !(data == nullptr || stride == 0u || format == EF_UNKNOWN || channels == 0u); }
+
+			//! Zero-fills `out`, then decodes element `ix` into it. Returns false when decoding fails.
+			template<typename T, size_t N>
+			inline bool decode(const size_t ix, std::array<T, N>& out) const { out.fill(T{}); return SGeometryViewDecode::template decodePrepared<Mode>(*this, ix, out.data(), static_cast<uint32_t>(N)); }
+
+			//! Resets `out` to `V{}`, then decodes element `ix` into an HLSL vector.
+			template<typename V> requires hlsl::concepts::Vector<V>
+			inline bool decode(const size_t ix, V& out) const { out = V{}; return SGeometryViewDecode::template decodePrepared<Mode>(*this, ix, out); }
+		};
+
+		//! Builds the reusable decode state for `view`. The result tests false for unformatted views and views without a pointer.
+		template<EMode Mode>
+		static inline Prepared<Mode> prepare(const ICPUPolygonGeometry::SDataView& view)
+		{
+			Prepared<Mode> state = {};
+			if (!view.composed.isFormatted())
+				return {};
+			state.data = reinterpret_cast<const uint8_t*>(view.getPointer());
+			if (!state.data)
+				return {};
+			state.stride = view.composed.getStride();
+			state.format = view.composed.format;
+			state.channels = getFormatChannelCount(state.format);
+			if constexpr (Mode == EMode::Semantic)
+				state.normalized = isNormalizedFormat(state.format);
+			if (state.normalized)
+				state.range = view.composed.getRange<hlsl::shapes::AABB<4, hlsl::float64_t>>();
+			return state;
+		}
+
+		//! Convenience path for a single element: prepares `view` and decodes element `ix` in one call.
+		template<typename Out, EMode Mode = EMode::Semantic>
+		static inline bool decodeElement(const ICPUPolygonGeometry::SDataView& view, const size_t ix, Out& out) { return prepare<Mode>(view).decode(ix, out); }
+	private:
+		//! Decodes element `ix` into at most `outDim` scalars at `out`; both `decodePrepared` overloads end up here.
+		template<EMode Mode, typename T>
+		static inline bool decodePreparedComponents(const Prepared<Mode>& prepared, const size_t ix, T* out, const uint32_t outDim)
+		{
+			if (!(prepared && out != nullptr && outDim != 0u))
+				return false;
+			using wide_t = std::conditional_t<std::is_floating_point_v<T>, hlsl::float64_t, std::conditional_t<std::is_signed_v<T>, int64_t, uint64_t>>;
+			std::array<wide_t, 4> decoded = {};
+			// Only the first source pointer is populated; the remaining entries stay null.
+			const void* sources[4] = {prepared.data + ix * prepared.stride, nullptr};
+			if (!decodePixels<wide_t>(prepared.format, sources, decoded.data(), 0u, 0u))
+				return false;
+			// Never exceed the caller's capacity, the format's channel count, or the 4-wide scratch array.
+			const uint32_t count = std::min({prepared.channels, outDim, 4u});
+			if constexpr (Mode == EMode::Semantic && std::is_floating_point_v<wide_t>)
+				if (prepared.normalized)
+					for (uint32_t c = 0u; c < count; ++c)
+						decoded[c] = static_cast<wide_t>(decoded[c] * (prepared.range.maxVx[c] - prepared.range.minVx[c]) + prepared.range.minVx[c]);
+			for (uint32_t c = 0u; c < count; ++c)
+				out[c] = static_cast<T>(decoded[c]);
+			return true;
+		}
+
+		//! Vector flavour: decodes into a scalar scratch array, then copies lane by lane into `out`.
+		template<EMode Mode, typename V> requires hlsl::concepts::Vector<V>
+		static inline bool decodePrepared(const Prepared<Mode>& prepared, const size_t ix, V& out)
+		{
+			using scalar_t = typename hlsl::vector_traits<V>::scalar_type;
+			constexpr uint32_t Dimension = hlsl::vector_traits<V>::Dimension;
+			std::array<scalar_t, Dimension> lanes = {};
+			if (!decodePreparedComponents(prepared, ix, lanes.data(), Dimension))
+				return false;
+			for (uint32_t lane = 0u; lane < Dimension; ++lane)
+				out[lane] = lanes[lane];
+			return true;
+		}
+
+		//! Raw-pointer flavour used by the `std::array` decode path.
+		template<EMode Mode, typename T>
+		static inline bool decodePrepared(const Prepared<Mode>& prepared, const size_t ix, T* out, const uint32_t outDim) { return decodePreparedComponents(prepared, ix, out, outDim); }
+};
+}
+#endif
diff --git a/src/nbl/asset/interchange/impl/SBinaryData.h b/src/nbl/asset/interchange/impl/SBinaryData.h
new file mode 100644
index 0000000000..370f1a383d
--- /dev/null
+++ b/src/nbl/asset/interchange/impl/SBinaryData.h
@@ -0,0 +1,36 @@
+// Internal src-only header. Do not include from public headers.
+#ifndef _NBL_ASSET_IMPL_S_BINARY_DATA_H_INCLUDED_
+#define _NBL_ASSET_IMPL_S_BINARY_DATA_H_INCLUDED_
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+namespace nbl::asset::impl
+{
+//! Binary helpers for endian conversion and unaligned loads/stores.
+struct BinaryData
+{
+	//! Produces a copy of `value` whose bytes are in the opposite order.
+	template<typename T>
+	static inline T byteswap(const T value) { T swapped = value; auto* const bytes = reinterpret_cast<char*>(&swapped); std::reverse(bytes, bytes + sizeof(T)); return swapped; }
+
+	//! Copies `sizeof(T)` bytes from `src` (any alignment) into a `T`, byte-swapping when `swapEndian` is set.
+	//! A null `src` yields a value-initialized `T`.
+	template<typename T>
+	static inline T loadUnaligned(const void* src, const bool swapEndian = false)
+	{
+		T loaded = {};
+		if (src != nullptr)
+			std::memcpy(&loaded, src, sizeof(T));
+		return (src != nullptr && swapEndian) ? byteswap(loaded) : loaded;
+	}
+
+	//! Copies the bytes of `value` to `dst`, which may have any alignment.
+	template<typename T>
+	static inline void storeUnaligned(void* dst, const T& value) { std::memcpy(dst, &value, sizeof(T)); }
+
+	//! Writes `value` at `dst`, then moves `dst` forward by `sizeof(T)` bytes.
+	template<typename T>
+	static inline void storeUnalignedAdvance(uint8_t*& dst, const T& value) { std::memcpy(dst, &value, sizeof(T)); dst += sizeof(T); }
+};
+}
+#endif
diff --git a/src/nbl/asset/interchange/impl/SFileAccess.h b/src/nbl/asset/interchange/impl/SFileAccess.h
new file mode 100644
index 0000000000..b1e15010f7
--- /dev/null
+++ b/src/nbl/asset/interchange/impl/SFileAccess.h
@@ -0,0 +1,105 @@
+// Internal src-only header. Do not include from public headers.
+#ifndef _NBL_ASSET_IMPL_S_FILE_ACCESS_H_INCLUDED_
+#define _NBL_ASSET_IMPL_S_FILE_ACCESS_H_INCLUDED_
+#include "nbl/core/declarations.h"
+#include "nbl/asset/interchange/SInterchangeIO.h"
+#include "nbl/system/ILogger.h"
+#include <string>
+namespace nbl::asset::impl
+{
+//! Small file access helper shared by interchange loaders.
+class SFileAccess
+{
+	public:
+		//! Returns true when `file` is non-null and its flags include `ECF_MAPPABLE`.
+		static inline bool isMappable(const system::IFile* file) { return file && core::bitflag<system::IFile::E_CREATE_FLAGS>(file->getFlags()).hasAnyFlag(system::IFile::ECF_MAPPABLE); }
+		//! Builds the resolved IO plan from the requested policy, the payload size, and whether `file` is mappable.
+		static inline SResolvedFileIOPolicy resolvePlan(const SFileIOPolicy& ioPolicy, const uint64_t payloadBytes, const bool sizeKnown, const system::IFile* file) { return SResolvedFileIOPolicy(ioPolicy, payloadBytes, sizeKnown, isMappable(file)); }
+
+		//! Logs an error when `ioPlan` is invalid. Returns true in that case (the caller should abort) and false otherwise.
+		template<typename Logger>
+		static inline bool logInvalidPlan(Logger& logger, const char* const owner, const char* const fileName, const SResolvedFileIOPolicy& ioPlan)
+		{
+			if (ioPlan.isValid())
+				return false;
+			logger.log("%s: invalid io policy for %s reason=%s", system::ILogger::ELL_ERROR, owner, fileName, ioPlan.reason);
+			return true;
+		}
+
+		//! Emits the shared tiny-IO warning when `SInterchangeIO::isTinyIOTelemetryLikely` flags the telemetry; otherwise does nothing.
+		template<typename Logger>
+		static inline void logTinyIO(Logger& logger, const char* const owner, const char* const fileName, const SInterchangeIO::STelemetry& telemetry, const uint64_t payloadBytes, const SFileIOPolicy& ioPolicy, const char* const opName)
+		{
+			if (!SInterchangeIO::isTinyIOTelemetryLikely(telemetry, payloadBytes, ioPolicy))
+				return;
+			logger.log("%s tiny-io guard: file=%s %s=%llu min=%llu avg=%llu", system::ILogger::ELL_WARNING, owner, fileName, opName, static_cast<unsigned long long>(telemetry.callCount), static_cast<unsigned long long>(telemetry.getMinOrZero()), static_cast<unsigned long long>(telemetry.getAvgOrZero()));
+		}
+		//! Reads `bytes` bytes at `offset` into `storage` (resized to fit, plus one zero byte when `zeroTerminate`). Returns `storage.data()`, or `nullptr` when the read fails.
+		static inline const uint8_t* readRange(system::IFile* file, const size_t offset, const size_t bytes, core::vector<uint8_t>& storage, const SResolvedFileIOPolicy& ioPlan, SFileReadTelemetry* ioTelemetry = nullptr, const bool zeroTerminate = false)
+		{
+			storage.resize(bytes + (zeroTerminate ? 1ull : 0ull), 0u);
+			if (!SInterchangeIO::readFileWithPolicy(file, storage.data(), offset, bytes, ioPlan, ioTelemetry))
+				return nullptr;
+			if (zeroTerminate)
+				storage[bytes] = 0u;
+			return storage.data();
+		}
+		//! Whole-file mode returns the file's mapped pointer when one exists; every other case falls back to `readRange`. NOTE(review): the mapped pointer is returned as-is, so `zeroTerminate` only takes effect on the fallback - callers that need a terminator should check `wasMapped`.
+		static inline const uint8_t* mapOrReadWholeFile(system::IFile* file, const size_t bytes, core::vector<uint8_t>& storage, const SResolvedFileIOPolicy& ioPlan, SFileReadTelemetry* ioTelemetry = nullptr, bool* wasMapped = nullptr, const bool zeroTerminate = false)
+		{
+			if (wasMapped)
+				*wasMapped = false;
+			if (file && ioPlan.strategy == SResolvedFileIOPolicy::Strategy::WholeFile) // a null file must not be dereferenced; it takes the readRange path instead
+			{
+				const auto* mapped = reinterpret_cast<const uint8_t*>(static_cast<const system::IFile*>(file)->getMappedPointer());
+				if (mapped)
+				{
+					if (ioTelemetry) ioTelemetry->account(bytes);
+					if (wasMapped) *wasMapped = true;
+					return mapped;
+				}
+			}
+			return readRange(file, 0ull, bytes, storage, ioPlan, ioTelemetry, zeroTerminate);
+		}
+};
+//! Per-load session state shared across the loader entry points.
+class SLoadSession
+{
+	public:
+		system::IFile* file = nullptr; //!< File the loader is reading.
+		const SFileIOPolicy* requestedPolicy = nullptr; //!< Points at the policy passed to `begin`, before resolution.
+		SResolvedFileIOPolicy ioPlan = {}; //!< Plan resolved from the requested policy for this payload.
+		uint64_t payloadBytes = 0ull; //!< Payload size the plan was resolved for.
+		const char* owner = nullptr; //!< Loader name used as the log prefix.
+		std::string fileName = {}; //!< File name captured in `begin` for diagnostics.
+
+		//! Resets `out`, fills it for `file`, and resolves the IO plan. Returns false for a null file or an invalid plan.
+		template<typename Logger>
+		static inline bool begin(Logger& logger, const char* const owner, system::IFile* file, const SFileIOPolicy& ioPolicy, const uint64_t payloadBytes, const bool sizeKnown, SLoadSession& out)
+		{
+			out = {};
+			if (file == nullptr)
+				return false;
+			out.owner = owner;
+			out.file = file;
+			out.payloadBytes = payloadBytes;
+			out.requestedPolicy = &ioPolicy;
+			out.ioPlan = SFileAccess::resolvePlan(ioPolicy, payloadBytes, sizeKnown, file);
+			out.fileName = file->getFileName().string();
+			return !SFileAccess::logInvalidPlan(logger, owner, out.fileName.c_str(), out.ioPlan);
+		}
+		//! True when the resolved plan's strategy is `WholeFile`.
+		inline bool isWholeFile() const { return SResolvedFileIOPolicy::Strategy::WholeFile == ioPlan.strategy; }
+		//! Mapped pointer of `file` in whole-file mode; `nullptr` without a file or in any other mode.
+		inline const uint8_t* mappedPointer() const { if (!file || !isWholeFile()) return nullptr; return reinterpret_cast<const uint8_t*>(static_cast<const system::IFile*>(file)->getMappedPointer()); }
+		//! Forwards to `SFileAccess::readRange` with this session's file and plan.
+		inline const uint8_t* readRange(const size_t offset, const size_t bytes, core::vector<uint8_t>& storage, SFileReadTelemetry* const ioTelemetry = nullptr, const bool zeroTerminate = false) const { return SFileAccess::readRange(file, offset, bytes, storage, ioPlan, ioTelemetry, zeroTerminate); }
+		//! Forwards to `SFileAccess::mapOrReadWholeFile`, using `payloadBytes` as the byte count.
+		inline const uint8_t* mapOrReadWholeFile(core::vector<uint8_t>& storage, SFileReadTelemetry* const ioTelemetry = nullptr, bool* const wasMapped = nullptr, const bool zeroTerminate = false) const { return SFileAccess::mapOrReadWholeFile(file, static_cast<size_t>(payloadBytes), storage, ioPlan, ioTelemetry, wasMapped, zeroTerminate); }
+
+		//! Forwards to `SFileAccess::logTinyIO`; does nothing when no requested policy was recorded.
+		template<typename Logger, typename Telemetry>
+		inline void logTinyIO(Logger& logger, const Telemetry& telemetry, const char* const opName = "reads") const { if (!requestedPolicy) return; SFileAccess::logTinyIO(logger, owner, fileName.c_str(), telemetry, payloadBytes, *requestedPolicy, opName); }
+};
+}
+#endif
diff --git a/src/nbl/asset/interchange/impl/STextParse.h b/src/nbl/asset/interchange/impl/STextParse.h
new file mode 100644
index 0000000000..ac4ed2d9b2
--- /dev/null
+++ b/src/nbl/asset/interchange/impl/STextParse.h
@@ -0,0 +1,208 @@
+// Internal src-only header. Do not include from public headers.
+#ifndef _NBL_ASSET_IMPL_S_TEXT_PARSE_H_INCLUDED_
+#define _NBL_ASSET_IMPL_S_TEXT_PARSE_H_INCLUDED_
+#include "nbl/core/string/stringutil.h"
+#include <charconv>
+#include <cstdint>
+#include <iterator>
+#include <optional>
+#include <string_view>
+#include <system_error>
+#include <type_traits>
+#include <fast_float/fast_float.h>
+namespace nbl::asset::impl
+{
+//! Text token and numeric parsing helpers shared by interchange text formats.
+struct TextParse
+{
+	//! Cursor over a contiguous text buffer that hands out one line view per `readLine` call.
+	struct LineCursor
+	{
+		const char* cursor = nullptr;
+		const char* end = nullptr;
+		inline std::optional<std::string_view> readLine()
+		{
+			if (!cursor || cursor >= end)
+				return std::nullopt;
+			const char* lineEnd = cursor;
+			while (lineEnd < end && *lineEnd != '\0' && *lineEnd != '\r' && *lineEnd != '\n')
+				++lineEnd;
+			const std::string_view line(cursor, static_cast<size_t>(lineEnd - cursor));
+			if (lineEnd < end && *lineEnd == '\r')
+				++lineEnd;
+			if (lineEnd < end && *lineEnd == '\n')
+				++lineEnd;
+			else if (lineEnd < end && *lineEnd == '\0')
+				++lineEnd;
+			cursor = lineEnd;
+			return line;
+		}
+	};
+	static inline bool isDigit(const char c) { return c >= '0' && c <= '9'; }
+	//! Hands a floating-point token to fast_float. fast_float rejects a leading '+', which the plain-decimal
+	//! path of `parseNumber` accepts, so one '+' is skipped first to keep both paths consistent.
+	template<typename T>
+	static inline bool parseFloatFallback(const char*& ptr, const char* const end, T& out)
+	{
+		const char* first = ptr;
+		if (first < end && *first == '+')
+			++first;
+		if (first != ptr && first < end && *first == '-')
+			return false; // "+-1" is not a number
+		const auto parseResult = fast_float::from_chars(first, end, out);
+		if (parseResult.ec != std::errc() || parseResult.ptr == first)
+			return false;
+		ptr = parseResult.ptr;
+		return true;
+	}
+
+	//! Parses one arithmetic token at `ptr` and advances `ptr` past it on success; `ptr` is left unchanged on failure.
+	template<typename T>
+	static inline bool parseNumber(const char*& ptr, const char* const end, T& out)
+	{
+		static_assert(std::is_arithmetic_v<T>);
+		if constexpr (std::is_floating_point_v<T>)
+		{
+			const char* p = ptr;
+			if (p >= end)
+				return false;
+			bool negative = false;
+			if (*p == '-' || *p == '+')
+			{
+				negative = (*p == '-');
+				++p;
+				if (p >= end)
+					return false;
+			}
+			// Fast path for plain decimals (optional sign, digits, optional fraction, no exponent), in the spirit of
+			// RapidJSON's StrtodFast. Exponents, over-long digit runs and other spellings are delegated to fast_float.
+			if (isDigit(*p))
+			{
+				// Up to 19 decimal digits always fit in uint64_t; a longer integer part would wrap, so it is delegated.
+				constexpr uint32_t MaxIntegerDigits = 19u;
+				uint64_t integerPart = 0ull;
+				uint32_t integerDigits = 0u;
+				while (p < end && isDigit(*p))
+				{
+					if (++integerDigits > MaxIntegerDigits)
+						return parseFloatFallback(ptr, end, out);
+					integerPart = integerPart * 10ull + static_cast<uint64_t>(*p - '0');
+					++p;
+				}
+				double value = static_cast<double>(integerPart);
+				if (p < end && *p == '.')
+				{
+					const char* const dot = p;
+					// Unrolled case for exactly six fractional digits; the distance check avoids forming a pointer past `end`.
+					if ((end - dot) >= 7)
+					{
+						const char d0 = dot[1], d1 = dot[2], d2 = dot[3];
+						const char d3 = dot[4], d4 = dot[5], d5 = dot[6];
+						if (isDigit(d0) && isDigit(d1) && isDigit(d2) && isDigit(d3) && isDigit(d4) && isDigit(d5))
+						{
+							const char next = ((end - dot) > 7) ? dot[7] : '\0';
+							if (!isDigit(next) && next != 'e' && next != 'E')
+							{
+								const uint32_t frac =
+									static_cast<uint32_t>(d0 - '0') * 100000u +
+									static_cast<uint32_t>(d1 - '0') * 10000u +
+									static_cast<uint32_t>(d2 - '0') * 1000u +
+									static_cast<uint32_t>(d3 - '0') * 100u +
+									static_cast<uint32_t>(d4 - '0') * 10u +
+									static_cast<uint32_t>(d5 - '0');
+								value += static_cast<double>(frac) * 1e-6;
+								ptr = dot + 7;
+								out = static_cast<T>(negative ? -value : value);
+								return true;
+							}
+						}
+					}
+					static constexpr double InvPow10[] = {
+						1.0, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9,
+						1e-10, 1e-11, 1e-12, 1e-13, 1e-14, 1e-15, 1e-16, 1e-17, 1e-18
+					};
+					++p;
+					uint64_t fractionPart = 0ull;
+					uint32_t fractionDigits = 0u;
+					while (p < end && isDigit(*p))
+					{
+						if (fractionDigits >= (std::size(InvPow10) - 1u))
+							break;
+						fractionPart = fractionPart * 10ull + static_cast<uint64_t>(*p - '0');
+						++fractionDigits;
+						++p;
+					}
+					if (fractionDigits)
+						value += static_cast<double>(fractionPart) * InvPow10[fractionDigits];
+					// More fractional digits than the table covers: let the full parser handle the whole token.
+					if (p < end && isDigit(*p))
+						return parseFloatFallback(ptr, end, out);
+				}
+				if (p < end && (*p == 'e' || *p == 'E'))
+					return parseFloatFallback(ptr, end, out);
+				ptr = p;
+				out = static_cast<T>(negative ? -value : value);
+				return true;
+			}
+			// The token does not start with a digit after the optional sign: leave it to the full parser.
+			return parseFloatFallback(ptr, end, out);
+		}
+		else
+		{
+			// Non-floating-point types go straight to std::from_chars.
+			const auto parseResult = std::from_chars(ptr, end, out);
+			if (parseResult.ec != std::errc() || parseResult.ptr == ptr)
+				return false;
+			ptr = parseResult.ptr;
+			return true;
+		}
+	}
+
+	//! Parses one arithmetic token and succeeds only when it spans the whole `[begin, end)` range.
+	template<typename T>
+	static inline bool parseExactNumber(const char* const begin, const char* const end, T& out) { auto ptr = begin; return parseNumber(ptr, end, out) && ptr == end; }
+
+	//! `std::string_view` convenience wrapper over `parseExactNumber(begin,end,...)`.
+	template<typename T>
+	static inline bool parseExactNumber(const std::string_view token, T& out) { return parseExactNumber(token.data(), token.data() + token.size(), out); }
+
+	//! Parses one arithmetic token and fails when the parsed value compares equal to zero.
+	template<typename T>
+	static inline bool parseNonZeroNumber(const char*& ptr, const char* const end, T& out) { return parseNumber(ptr, end, out) && out != static_cast<T>(0); }
+
+	//! Returns true for space, tab, vertical tab and form feed.
+	static inline bool isInlineWhitespace(const char c) { return c == ' ' || c == '\t' || c == '\v' || c == '\f'; }
+	//! Advances `ptr` over inline whitespace (see `isInlineWhitespace`), stopping at `end`.
+	static inline void skipInlineWhitespace(const char*& ptr, const char* const end) { while (ptr < end && isInlineWhitespace(*ptr)) ++ptr; }
+	//! Advances `ptr` over whitespace as classified by `core::isspace`, stopping at `end`.
+	static inline void skipWhitespace(const char*& ptr, const char* const end) { while (ptr < end && core::isspace(*ptr)) ++ptr; }
+	//! Trims leading and trailing whitespace (per `core::isspace`) from a token view.
+	static inline std::string_view trimWhitespace(std::string_view token)
+	{
+		while (!token.empty() && core::isspace(token.front())) token.remove_prefix(1ull);
+		while (!token.empty() && core::isspace(token.back())) token.remove_suffix(1ull);
+		return token;
+	}
+	//! Skips leading whitespace, then returns the next whitespace-delimited token and moves `cursor` past it; `std::nullopt` at `end`.
+	static inline std::optional<std::string_view> readToken(const char*& cursor, const char* const end)
+	{
+		skipWhitespace(cursor, end);
+		if (cursor >= end)
+			return std::nullopt;
+		const auto* tokenEnd = cursor;
+		while (tokenEnd < end && !core::isspace(*tokenEnd))
+			++tokenEnd;
+		const std::string_view token(cursor, static_cast<size_t>(tokenEnd - cursor));
+		return cursor = tokenEnd, token;
+	}
+	//! Reads one line view from a contiguous text buffer via `LineCursor` and writes the advanced position back to `cursor`.
+	static inline std::optional<std::string_view> readLine(const char*& cursor, const char* const end)
+	{
+		LineCursor lineCursor = {.cursor = cursor, .end = end};
+		auto line = lineCursor.readLine();
+		cursor = lineCursor.cursor;
+		return line;
+	}
+};
+}
+#endif
diff --git a/src/nbl/asset/pch_asset.h b/src/nbl/asset/pch_asset.h
index 361df786f1..d24252be24 100644
--- a/src/nbl/asset/pch_asset.h
+++ b/src/nbl/asset/pch_asset.h
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
 #ifndef _NBL_ASSET_PCH_ASSET_H_INCLUDED_
@@ -37,6 +37,7 @@
 #include "nbl/asset/interchange/CPLYMeshFileLoader.h"
 #include "nbl/asset/interchange/CSTLMeshFileLoader.h"
 // writers
+#include "nbl/asset/interchange/COBJMeshWriter.h"
 #include "nbl/asset/interchange/CPLYMeshWriter.h"
 #include "nbl/asset/interchange/CSTLMeshWriter.h"
 // manipulation
diff --git a/src/nbl/asset/utils/CGeometryCreator.cpp b/src/nbl/asset/utils/CGeometryCreator.cpp
index 3750a37a70..9950f8b997 100644
--- a/src/nbl/asset/utils/CGeometryCreator.cpp
+++ b/src/nbl/asset/utils/CGeometryCreator.cpp
@@ -4,6 +4,7 @@
 
 
 #include "nbl/asset/utils/CGeometryCreator.h"
+#include "nbl/asset/utils/SGeometryNormalCommon.h"
 #include "nbl/builtin/hlsl/tgmath.hlsl"
 #include "nbl/builtin/hlsl/math/linalg/transform.hlsl"
 #include "nbl/builtin/hlsl/math/quaternions.hlsl"
@@ -946,44 +947,6 @@ class Icosphere
 
 private:
 
-	/*
-		return face normal (4th param) of a triangle v1-v2-v3
-		if a triangle has no surface (normal length = 0), then return a zero vector
-	*/
-
-	static inline void computeFaceNormal(const float v1[3], const float v2[3], const float v3[3], float normal[3])
-	{
-		constexpr float EPSILON = 0.000001f;
-
-		// default return value (0, 0, 0)
-		normal[0] = normal[1] = normal[2] = 0;
-
-		// find 2 edge vectors: v1-v2, v1-v3
-		float ex1 = v2[0] - v1[0];
-		float ey1 = v2[1] - v1[1];
-		float ez1 = v2[2] - v1[2];
-		float ex2 = v3[0] - v1[0];
-		float ey2 = v3[1] - v1[1];
-		float ez2 = v3[2] - v1[2];
-
-		// cross product: e1 x e2
-		float nx, ny, nz;
-		nx = ey1 * ez2 - ez1 * ey2;
-		ny = ez1 * ex2 - ex1 * ez2;
-		nz = ex1 * ey2 - ey1 * ex2;
-
-		// normalize only if the length is > 0
-		float length = sqrtf(nx * nx + ny * ny + nz * nz);
-		if (length > EPSILON)
-		{
-			// normalize
-			float lengthInv = 1.0f / length;
-			normal[0] = nx * lengthInv;
-			normal[1] = ny * lengthInv;
-			normal[2] = nz * lengthInv;
-		}
-	}
-
 	/*
 		return vertex normal (2nd param) by mormalizing the vertex vector
 	*/
@@ -1229,27 +1192,27 @@ class Icosphere
 			t11[0] = 2 * i * S_STEP;         t11[1] = T_STEP * 3;
 
 			// add a triangle in 1st row
-			Icosphere::computeFaceNormal(v0, v1, v2, n);
+			SGeometryNormalCommon::computeFaceNormal(v0, v1, v2, n);
 			addVertices(v0, v1, v2);
 			addNormals(n, n, n);
 			addTexCoords(t0, t1, t2);
 			addIndices(index, index + 1, index + 2);
 
 			// add 2 triangles in 2nd row
-			Icosphere::computeFaceNormal(v1, v3, v2, n);
+			SGeometryNormalCommon::computeFaceNormal(v1, v3, v2, n);
 			addVertices(v1, v3, v2);
 			addNormals(n, n, n);
 			addTexCoords(t1, t3, t2);
 			addIndices(index + 3, index + 4, index + 5);
 
-			Icosphere::computeFaceNormal(v2, v3, v4, n);
+			SGeometryNormalCommon::computeFaceNormal(v2, v3, v4, n);
 			addVertices(v2, v3, v4);
 			addNormals(n, n, n);
 			addTexCoords(t2, t3, t4);
 			addIndices(index + 6, index + 7, index + 8);
 
 			// add a triangle in 3rd row
-			Icosphere::computeFaceNormal(v3, v11, v4, n);
+			SGeometryNormalCommon::computeFaceNormal(v3, v11, v4, n);
 			addVertices(v3, v11, v4);
 			addNormals(n, n, n);
 			addTexCoords(t3, t11, t4);
@@ -1562,25 +1525,25 @@ class Icosphere
 				// add 4 new triangles
 				addVertices(v1, newV1, newV3);
 				addTexCoords(t1, newT1, newT3);
-				computeFaceNormal(v1, newV1, newV3, normal);
+				SGeometryNormalCommon::computeFaceNormal(v1, newV1, newV3, normal);
 				addNormals(normal, normal, normal);
 				addIndices(index, index + 1, index + 2);
 
 				addVertices(newV1, v2, newV2);
 				addTexCoords(newT1, t2, newT2);
-				computeFaceNormal(newV1, v2, newV2, normal);
+				SGeometryNormalCommon::computeFaceNormal(newV1, v2, newV2, normal);
 				addNormals(normal, normal, normal);
 				addIndices(index + 3, index + 4, index + 5);
 
 				addVertices(newV1, newV2, newV3);
 				addTexCoords(newT1, newT2, newT3);
-				computeFaceNormal(newV1, newV2, newV3, normal);
+				SGeometryNormalCommon::computeFaceNormal(newV1, newV2, newV3, normal);
 				addNormals(normal, normal, normal);
 				addIndices(index + 6, index + 7, index + 8);
 
 				addVertices(newV3, newV2, v3);
 				addTexCoords(newT3, newT2, t3);
-				computeFaceNormal(newV3, newV2, v3, normal);
+				SGeometryNormalCommon::computeFaceNormal(newV3, newV2, v3, normal);
 				addNormals(normal, normal, normal);
 				addIndices(index + 9, index + 10, index + 11);
 
diff --git a/src/nbl/asset/utils/CPolygonGeometryManipulator.cpp b/src/nbl/asset/utils/CPolygonGeometryManipulator.cpp
index 818751052b..f83fb3c3e0 100644
--- a/src/nbl/asset/utils/CPolygonGeometryManipulator.cpp
+++ b/src/nbl/asset/utils/CPolygonGeometryManipulator.cpp
@@ -7,18 +7,171 @@
 
 #include <functional>
 #include <algorithm>
+#include <span>
 
 #include "nbl/asset/utils/CPolygonGeometryManipulator.h"
+#include "nbl/asset/interchange/SLoaderRuntimeTuning.h"
 #include "nbl/asset/utils/CVertexWelder.h"
 #include "nbl/asset/utils/CSmoothNormalGenerator.h"
 #include "nbl/asset/utils/CForsythVertexCacheOptimizer.h"
 #include "nbl/asset/utils/COverdrawPolygonGeometryOptimizer.h"
 #include "nbl/asset/utils/COBBGenerator.h"
+#include "nbl/asset/IPreHashed.h"
 
 
 namespace nbl::asset
 {
 
+// Gathers every distinct `ICPUBuffer` referenced by the geometry's views into `outBuffers`, in first-seen order.
+// `outBuffers` is always cleared first, so a null `geo` simply leaves it empty.
+void CPolygonGeometryManipulator::collectUniqueBuffers(const ICPUPolygonGeometry* geo, core::vector<core::smart_refctd_ptr<ICPUBuffer>>& outBuffers)
+{
+	outBuffers.clear();
+	if (!geo)
+		return;
+
+	// Appends the buffer behind `view` unless that same buffer object is already listed.
+	const auto pushIfNew = [&outBuffers](const IGeometry<ICPUBuffer>::SDataView& view)->void
+	{
+		if (!view || !view.src.buffer)
+			return;
+		const auto* const candidate = view.src.buffer.get();
+		const auto holdsCandidate = [candidate](const core::smart_refctd_ptr<ICPUBuffer>& held)->bool { return held.get() == candidate; };
+		if (std::none_of(outBuffers.begin(), outBuffers.end(), holdsCandidate))
+			outBuffers.push_back(core::smart_refctd_ptr<ICPUBuffer>(view.src.buffer));
+	};
+
+	// Position, index and normal views are visited first, then auxiliary attributes and joint data.
+	pushIfNew(geo->getPositionView());
+	pushIfNew(geo->getIndexView());
+	pushIfNew(geo->getNormalView());
+	for (const auto& auxView : geo->getAuxAttributeViews())
+		pushIfNew(auxView);
+	for (const auto& jointView : geo->getJointWeightViews())
+	{
+		pushIfNew(jointView.indices);
+		pushIfNew(jointView.weights);
+	}
+	// The joint OBB view may be absent, so it is only visited when present.
+	if (auto obbView = geo->getJointOBBView(); obbView)
+		pushIfNew(*obbView);
+}
+
+void CPolygonGeometryManipulator::computeContentHashesParallel(ICPUPolygonGeometry* geo, const SFileIOPolicy& ioPolicy, const EContentHashMode mode)
+{
+	if (geo == nullptr)
+		return;
+
+	core::vector<core::smart_refctd_ptr<ICPUBuffer>> uniqueBuffers;
+	collectUniqueBuffers(geo, uniqueBuffers);
+	if (uniqueBuffers.empty())
+		return;
+
+	// `todo` holds indices into `uniqueBuffers` of the buffers that will be hashed; `bytesToHash` is their combined size.
+	core::vector<size_t> todo;
+	todo.reserve(uniqueBuffers.size());
+	uint64_t bytesToHash = 0ull;
+	for (size_t bufferIx = 0ull; bufferIx < uniqueBuffers.size(); ++bufferIx)
+	{
+		auto& candidate = uniqueBuffers[bufferIx];
+		// In MissingOnly mode, buffers that already carry a valid hash are left untouched.
+		if (!candidate || (mode == EContentHashMode::MissingOnly && candidate->getContentHash() != IPreHashed::INVALID_HASH))
+			continue;
+		bytesToHash += static_cast<uint64_t>(candidate->getSize());
+		todo.push_back(bufferIx);
+	}
+	if (todo.empty())
+		return;
+
+	// Recomputes and stores the content hash for the buffers listed in todo[first, last).
+	const auto hashSlice = [&](const size_t first, const size_t last) -> void
+	{
+		for (size_t slot = first; slot < last; ++slot)
+		{
+			auto& target = uniqueBuffers[todo[slot]];
+			target->setContentHash(target->computeContentHash());
+		}
+	};
+
+	if (ioPolicy.runtimeTuning.mode == SFileIOPolicy::SRuntimeTuning::Mode::Sequential)
+	{
+		hashSlice(0ull, todo.size());
+		return;
+	}
+
+	const size_t hardwareThreads = SLoaderRuntimeTuner::resolveHardwareThreads();
+	const uint8_t* sampleData = nullptr;
+	uint64_t sampleBytes = 0ull;
+	for (const auto todoIx : todo)
+	{
+		auto& source = uniqueBuffers[todoIx];
+		const auto* bytes = reinterpret_cast<const uint8_t*>(source->getPointer());
+		if (!bytes)
+			continue;
+		sampleData = bytes;
+		sampleBytes = SLoaderRuntimeTuner::resolveSampleBytes(ioPolicy, static_cast<uint64_t>(source->getSize()));
+		if (sampleBytes > 0ull)
+			break;
+	}
+
+	SLoaderRuntimeTuningRequest request = {};
+	request.inputBytes = bytesToHash;
+	request.totalWorkUnits = todo.size();
+	request.minBytesPerWorker = std::max<uint64_t>(1ull, SLoaderRuntimeTuner::ceilDiv(bytesToHash, static_cast<uint64_t>(todo.size())));
+	request.hardwareThreads = static_cast<uint32_t>(hardwareThreads);
+	const size_t workerCap = SLoaderRuntimeTuner::resolveHardMaxWorkers(hardwareThreads, ioPolicy.runtimeTuning.workerHeadroom);
+	request.hardMaxWorkers = static_cast<uint32_t>(std::min(todo.size(), workerCap));
+	request.targetChunksPerWorker = ioPolicy.runtimeTuning.hashTaskTargetChunksPerWorker;
+	request.sampleData = sampleData;
+	request.sampleBytes = sampleBytes;
+	const auto tuning = SLoaderRuntimeTuner::tune(ioPolicy, request);
+	const size_t workers = std::min(tuning.workerCount, todo.size());
+
+	if (workers <= 1ull)
+	{
+		hashSlice(0ull, todo.size());
+		return;
+	}
+	SLoaderRuntimeTuner::dispatchWorkers(workers, [&](const size_t workerIx)
+	{
+		const size_t first = (todo.size() * workerIx) / workers;
+		const size_t last = (todo.size() * (workerIx + 1ull)) / workers;
+		hashSlice(first, last);
+	});
+}
+
+bool CPolygonGeometryManipulator::generateMissingSmoothNormals(
+	core::vector<hlsl::float32_t3>& normals,
+	const core::vector<hlsl::float32_t3>& positions,
+	const core::vector<uint32_t>& indices,
+	const core::vector<uint8_t>& normalNeedsGeneration
+)
+{
+	// The three per-vertex arrays must all have the same length.
+	const size_t vertexCount = positions.size();
+	if (normals.size() != vertexCount || normalNeedsGeneration.size() != vertexCount)
+		return false;
+
+	CSmoothNormalAccumulator accumulator(ESmoothNormalAccumulationMode::AreaWeighted);
+	accumulator.reserveVertices(vertexCount);
+	accumulator.prepareIdentityGroups(vertexCount);
+	const size_t usedIndexCount = (indices.size() / 3ull) * 3ull; // indices past the last whole triangle are ignored
+	for (size_t base = 0ull; base < usedIndexCount; base += 3ull)
+	{
+		const uint32_t a = indices[base + 0ull];
+		const uint32_t b = indices[base + 1ull];
+		const uint32_t c = indices[base + 2ull];
+		// A triangle referencing a vertex outside `positions` is skipped rather than treated as an error.
+		if (a >= vertexCount || b >= vertexCount || c >= vertexCount)
+			continue;
+		if (!accumulator.addPreparedIdentityTriangle(a, positions[a], b, positions[b], c, positions[c]))
+			return false;
+	}
+	return accumulator.finalize(
+		std::span<hlsl::float32_t3>(normals.data(), normals.size()),
+		std::span<const uint8_t>(normalNeedsGeneration.data(), normalNeedsGeneration.size()));
+}
+
 
 core::smart_refctd_ptr<ICPUPolygonGeometry> CPolygonGeometryManipulator::createUnweldedList(const ICPUPolygonGeometry* inGeo, const bool reverse, const bool recomputeHash)
 {
diff --git a/src/nbl/asset/utils/CSmoothNormalGenerator.h b/src/nbl/asset/utils/CSmoothNormalGenerator.h
index 7c9bf5358f..8bce90f864 100644
--- a/src/nbl/asset/utils/CSmoothNormalGenerator.h
+++ b/src/nbl/asset/utils/CSmoothNormalGenerator.h
@@ -5,18 +5,283 @@
 #define _NBL_ASSET_C_SMOOTH_NORMAL_GENERATOR_H_INCLUDED_
 
 #include "nbl/asset/utils/CVertexHashGrid.h"
+#include "nbl/builtin/hlsl/shapes/triangle.hlsl"
+
+#include <array>
+#include <concepts>
+#include <limits>
+#include <span>
 
 
 namespace nbl::asset 
 {
 
-// TODO: implement a class template that take position type(either float32_t3 or float64_t3 as template argument
+template<typename PositionT>
+concept SmoothNormalPosition = std::same_as<PositionT, hlsl::float32_t3> || std::same_as<PositionT, hlsl::float64_t3>;
+
+//! Generic smooth-normal accumulation utilities. The core accepts triangles incrementally,
+//! supports indexed inputs, optional caller-defined grouping, and finalizes into a caller-owned
+//! normal buffer. Parsing and authoring of any format-specific grouping rules stay outside.
 class CSmoothNormalGenerator final
 {
 	public:
 		CSmoothNormalGenerator() = delete;
 		~CSmoothNormalGenerator() = delete;
 
+		//! AreaWeighted matches the existing behaviour used by current loaders. AngleWeighted
+		//! is available for future callers that need angle-based smoothing without changing the API.
+		enum class EAccumulationMode : uint8_t
+		{
+			AreaWeighted,
+			AngleWeighted
+		};
+
+		//! One triangle corner to be accumulated. `vertexIx` points at the output vertex whose
+		//! normal will be written on finalize. `accumulationGroup` controls which corners smooth
+		//! together. This is the generic equivalent of format-specific smoothing-group semantics.
+		//! Callers can keep it equal to `vertexIx` for identity grouping or map it to any other
+		//! stable grouping key when corners that share a position must stay sharp.
+		template<SmoothNormalPosition PositionT = hlsl::float32_t3>
+		struct SAccumulatedCorner
+		{
+			uint32_t vertexIx = 0u;
+			uint32_t accumulationGroup = 0u;
+			PositionT position = PositionT(0.f, 0.f, 0.f);
+		};
+
+		//! Incremental smooth-normal accumulator. Callers feed triangles through `addTriangle(...)`
+		//! and then materialize results with `finalize(...)`. Grouping is provided entirely by
+		//! the caller through `accumulationGroup`.
+		template<SmoothNormalPosition PositionT = hlsl::float32_t3>
+		class CAccumulatedNormals final
+		{
+			public:
+				using vector_t = PositionT;
+
+				explicit CAccumulatedNormals(const EAccumulationMode mode = EAccumulationMode::AreaWeighted) : m_mode(mode) {}
+
+				//! Raises the tracked output vertex count to at least `count` (it never shrinks); `finalize(...)`
+				//! rejects output buffers smaller than that count. The vertex-to-group map is only pre-reserved once it is in use.
+				NBL_FORCE_INLINE void reserveVertices(const size_t count)
+				{
+					if (count > m_vertexCount)
+						m_vertexCount = count;
+					if (count > m_groupsByVertex.capacity() && !m_groupsByVertex.empty())
+						m_groupsByVertex.reserve(growSize(count));
+				}
+
+				//! Reserves accumulation storage for explicit grouping. Callers that know they will
+				//! feed many non-identity groups can use this to avoid repeated reallocations.
+				NBL_FORCE_INLINE void reserveGroups(const size_t count)
+				{
+					if (count > m_accumulatedNormals.capacity())
+						m_accumulatedNormals.reserve(growSize(count)); // capacity only; slots are created by ensureGroupStorage()
+				}
+
+				//! Prepares the common identity-group case (`accumulationGroup == vertexIx`) up front.
+				//! This enables a lighter hot path where `addPreparedIdentityTriangle(...)` can skip
+				//! per-corner registration and write straight into pre-sized accumulation slots.
+				NBL_FORCE_INLINE void prepareIdentityGroups(const size_t count)
+				{
+					if (!m_groupsByVertex.empty())
+						return; // explicit grouping is already active, so there is no identity layout to prepare
+					ensureGroupStorage(count); // creates zero-initialised slots, rounded up by growSize()
+				}
+
+				//! Generic triangle submission path. Use this when the caller needs custom grouping.
+				//! In particular, callers can encode smoothing-group-like semantics by assigning
+				//! the same `accumulationGroup` to corners that should share a smooth normal and a
+				//! different one to corners that must stay sharp.
+				NBL_FORCE_INLINE bool addTriangle(const std::array<SAccumulatedCorner<PositionT>, 3>& corners)
+				{
+					if (canUseIdentityFastPath(corners)) // no explicit mapping yet and every corner uses its own vertex index as group
+						return addTriangle(corners[0].vertexIx, corners[0].position, corners[1].vertexIx, corners[1].position, corners[2].vertexIx, corners[2].position);
+					for (const auto& corner : corners)
+					{
+						if (!registerCorner(corner))
+							return false; // the vertex is already bound to a different group; corners registered before it stay registered
+					}
+					return accumulateTriangle(corners, [](const SAccumulatedCorner<PositionT>& corner) { return corner.accumulationGroup; });
+				}
+
+				NBL_FORCE_INLINE bool addTriangle(const uint32_t i0, const PositionT& p0, const uint32_t i1, const PositionT& p1, const uint32_t i2, const PositionT& p2)
+				{ // Identity-group overload: every corner accumulates into the group equal to its own vertex index.
+					const size_t maxIx = std::max(static_cast<size_t>(i0), std::max(static_cast<size_t>(i1), static_cast<size_t>(i2)));
+					const size_t requiredCount = maxIx + 1ull;
+					if (requiredCount > m_vertexCount)
+						m_vertexCount = requiredCount;
+					ensureGroupStorage(requiredCount); // grows the slot array on demand, unlike addPreparedIdentityTriangle()
+					if (m_groupsByVertex.empty())
+						return accumulateTriangle(p0, p1, p2, i0, i1, i2);
+					return addTriangle({{ // explicit mapping active: go through registerCorner() so group conflicts are detected
+						{.vertexIx = i0, .accumulationGroup = i0, .position = p0},
+						{.vertexIx = i1, .accumulationGroup = i1, .position = p1},
+						{.vertexIx = i2, .accumulationGroup = i2, .position = p2}
+					}});
+				}
+
+				//! Hot path for already-prepared identity grouping (`vertexIx == accumulationGroup`): this is the same
+				//! triangle accumulation, minus the generic per-corner registration. Returns false, leaving the accumulator
+				//! untouched, when explicit grouping is active or an index lies outside the already allocated accumulation slots.
+				NBL_FORCE_INLINE bool addPreparedIdentityTriangle(const uint32_t i0, const PositionT& p0, const uint32_t i1, const PositionT& p1, const uint32_t i2, const PositionT& p2)
+				{
+					if (!m_groupsByVertex.empty())
+						return false;
+					const size_t requiredCount = std::max(static_cast<size_t>(i0), std::max(static_cast<size_t>(i1), static_cast<size_t>(i2))) + 1ull;
+					if (requiredCount > m_accumulatedNormals.size()) // validate first: a rejected triangle must not inflate m_vertexCount
+						return false;
+					if (requiredCount > m_vertexCount)
+						m_vertexCount = requiredCount;
+					return accumulateTriangle(p0, p1, p2, i0, i1, i2);
+				}
+
+				//! Writes normalized accumulated normals into the caller-owned buffer, using `fallback` where the sum is (near) zero.
+				//! If `normalNeedsGeneration` is supplied, only entries marked non-zero are overwritten ("fill only the missing ones").
+				//! Returns false when the mask size differs from `normals` or `normals` is shorter than the tracked vertex count.
+				template<typename NormalT = hlsl::float32_t3>
+				NBL_FORCE_INLINE bool finalize(const std::span<NormalT> normals, const std::span<const uint8_t> normalNeedsGeneration = {}, const NormalT& fallback = NormalT(0.f, 0.f, 1.f)) const
+				{
+					if (!normalNeedsGeneration.empty() && normalNeedsGeneration.size() != normals.size())
+						return false;
+					if (normals.size() < m_vertexCount)
+						return false;
+
+					if (m_groupsByVertex.empty()) // identity grouping: slot index == vertex index
+					{
+						for (size_t vertexIx = 0ull; vertexIx < m_vertexCount; ++vertexIx)
+						{
+							if (!normalNeedsGeneration.empty() && normalNeedsGeneration[vertexIx] == 0u)
+								continue;
+							const auto normal = vertexIx < m_accumulatedNormals.size() ? m_accumulatedNormals[vertexIx] : vector_t(0.f, 0.f, 0.f);
+							const auto lenSq = hlsl::dot(normal, normal);
+							normals[vertexIx] = (lenSq > 1e-20f) ? (normal * hlsl::rsqrt(lenSq)) : fallback;
+						}
+						return true; // entries at or beyond m_vertexCount are never written
+					}
+
+					for (size_t vertexIx = 0ull; vertexIx < m_vertexCount; ++vertexIx)
+					{
+						if (!normalNeedsGeneration.empty() && normalNeedsGeneration[vertexIx] == 0u)
+							continue;
+						const uint32_t group = resolveGroup(static_cast<uint32_t>(vertexIx)); // vertices without an explicit mapping resolve to themselves
+						if (group == InvalidGroup)
+							return false;
+
+						const auto normal = group < m_accumulatedNormals.size() ? m_accumulatedNormals[group] : vector_t(0.f, 0.f, 0.f);
+						const auto lenSq = hlsl::dot(normal, normal);
+						normals[vertexIx] = (lenSq > 1e-20f) ? (normal * hlsl::rsqrt(lenSq)) : fallback;
+					}
+					return true;
+				}
+
+			private:
+				static inline constexpr uint32_t InvalidGroup = std::numeric_limits<uint32_t>::max();
+
+				static NBL_FORCE_INLINE size_t growSize(const size_t required)
+				{ // Storage growth policy: the smallest power of two that is >= `required`, and never less than 1.
+					return required > 1ull ? std::bit_ceil(required) : 1ull;
+				}
+
+				template<typename GroupFn>
+				NBL_FORCE_INLINE bool accumulateTriangle(const std::array<SAccumulatedCorner<PositionT>, 3>& corners, GroupFn&& groupFn)
+				{ // Adapter: maps each corner to its accumulation slot through `groupFn` and forwards to the positional overload.
+					return accumulateTriangle(
+						corners[0].position, corners[1].position, corners[2].position,
+						groupFn(corners[0]), groupFn(corners[1]), groupFn(corners[2])
+					);
+				}
+
+				NBL_FORCE_INLINE void ensureGroupStorage(const size_t requiredCount)
+				{ // Guarantees at least `requiredCount` accumulation slots; new slots start as zero vectors.
+					if (requiredCount <= m_accumulatedNormals.size())
+						return;
+					const size_t grownCount = growSize(requiredCount); // over-allocate to a power of two to amortise regrowth
+					if (requiredCount > m_accumulatedNormals.capacity())
+						m_accumulatedNormals.reserve(grownCount);
+					m_accumulatedNormals.resize(grownCount, vector_t(0.f, 0.f, 0.f));
+				}
+
+				NBL_FORCE_INLINE bool accumulateTriangle(const PositionT& p0, const PositionT& p1, const PositionT& p2, const uint32_t g0, const uint32_t g1, const uint32_t g2)
+				{ // Adds one triangle's weighted face normal to slots g0/g1/g2. No bounds checks here: callers size the storage first.
+					const auto edge10 = p1 - p0;
+					const auto edge20 = p2 - p0;
+					const auto faceNormal = hlsl::cross(edge10, edge20); // its length is twice the triangle area
+					const auto faceLenSq = hlsl::dot(faceNormal, faceNormal);
+					if (faceLenSq <= 1e-20f)
+						return true; // degenerate triangle: contributes nothing, but is not an error
+
+					if (m_mode == EAccumulationMode::AreaWeighted)
+					{
+						m_accumulatedNormals[g0] += faceNormal; // unnormalized on purpose: larger faces weigh more
+						m_accumulatedNormals[g1] += faceNormal;
+						m_accumulatedNormals[g2] += faceNormal;
+						return true;
+					}
+
+					const auto weights = hlsl::shapes::util::anglesFromTriangleEdges(p2 - p1, p0 - p2, p1 - p0); // NOTE(review): presumably the corner angles at p0, p1, p2 in that order - confirm against the helper
+					const auto unitNormal = faceNormal * hlsl::rsqrt(faceLenSq);
+					m_accumulatedNormals[g0] += unitNormal * weights.x;
+					m_accumulatedNormals[g1] += unitNormal * weights.y;
+					m_accumulatedNormals[g2] += unitNormal * weights.z;
+					return true;
+				}
+
+				NBL_FORCE_INLINE bool canUseIdentityFastPath(const std::array<SAccumulatedCorner<PositionT>, 3>& corners) const
+				{ // True when no explicit vertex-to-group mapping exists yet and all three corners use their own vertex index as group.
+					if (!m_groupsByVertex.empty())
+						return false;
+					for (const auto& corner : corners)
+					{
+						if (corner.vertexIx != corner.accumulationGroup)
+							return false;
+					}
+					return true;
+				}
+
+				NBL_FORCE_INLINE uint32_t resolveGroup(const uint32_t vertexIx) const
+				{ // Maps a vertex to its accumulation slot; InvalidGroup is returned only for vertices outside the tracked count.
+					if (vertexIx >= m_vertexCount)
+						return InvalidGroup;
+					if (m_groupsByVertex.empty())
+						return vertexIx;
+					if (vertexIx >= m_groupsByVertex.size())
+						return vertexIx; // beyond the explicit mapping: identity
+					const uint32_t mapped = m_groupsByVertex[vertexIx];
+					return mapped == InvalidGroup ? vertexIx : mapped; // unmapped entries also fall back to identity
+				}
+
+				NBL_FORCE_INLINE bool registerCorner(const SAccumulatedCorner<PositionT>& corner)
+				{ // Binds corner.vertexIx to corner.accumulationGroup; returns false if the vertex is already bound to another group.
+					if ((static_cast<size_t>(corner.vertexIx) + 1ull) > m_vertexCount)
+						m_vertexCount = static_cast<size_t>(corner.vertexIx) + 1ull;
+					ensureGroupStorage(static_cast<size_t>(corner.accumulationGroup) + 1ull);
+					if (m_groupsByVertex.empty())
+					{
+						if (corner.vertexIx == corner.accumulationGroup)
+							return true; // identity needs no mapping entry while the map is still unused
+						m_groupsByVertex.reserve(growSize(m_vertexCount));
+					}
+					else if (corner.vertexIx >= m_groupsByVertex.size())
+						m_groupsByVertex.reserve(growSize(m_vertexCount));
+					if (corner.vertexIx >= m_groupsByVertex.size())
+						m_groupsByVertex.resize(growSize(static_cast<size_t>(corner.vertexIx) + 1ull), InvalidGroup);
+					auto& group = m_groupsByVertex[corner.vertexIx];
+					if (group == InvalidGroup)
+					{
+						if (corner.vertexIx == corner.accumulationGroup)
+							return true; // NOTE(review): identity is never recorded, so a later different group for this vertex is accepted (the reverse order is rejected) - confirm this asymmetry is intended
+						group = corner.accumulationGroup;
+						return true;
+					}
+					return group == corner.accumulationGroup;
+				}
+
+				EAccumulationMode m_mode;
+				size_t m_vertexCount = 0ull;
+				core::vector<uint32_t> m_groupsByVertex;
+				core::vector<vector_t> m_accumulatedNormals;
+		};
+
 		struct VertexData
 		{
 			//offset of the vertex into index buffer
@@ -31,9 +296,9 @@ class CSmoothNormalGenerator final
 				return position;
 			}
 
-			void setHash(uint32_t hash)
+			void setHash(uint32_t newHash)
 			{
-				this->hash = hash;
+				hash = newHash;
 			}
 
 			uint32_t getHash() const
@@ -60,4 +325,4 @@ class CSmoothNormalGenerator final
 };
 
 }
-#endif
\ No newline at end of file
+#endif
diff --git a/src/nbl/core/hash/blake.cpp b/src/nbl/core/hash/blake.cpp
index 043c28698d..5e6ee253e9 100644
--- a/src/nbl/core/hash/blake.cpp
+++ b/src/nbl/core/hash/blake.cpp
@@ -1,29 +1,570 @@
 #include "nbl/core/hash/blake.h"
 
+#include <algorithm>
+#include <atomic>
+#include <cassert>
+#include <cstring>
+#include <future>
+#include <thread>
+
+extern "C"
+{
+#include "blake3.h"
+#include "blake3_impl.h"
+}
+
+/*
+	BLAKE3 is tree-based and explicitly designed for parallel processing. The tree mode
+	(chunks and parent-node reduction) is part of the specification, so a parallel
+	implementation can be done without changing hash semantics.
+
+	Why this local implementation exists:
+	- Nabla needs a multithreaded hash path integrated with its own runtime policy and
+	  standard C++ threading.
+	- Upstream C API exposes a single-threaded update path and an optional oneTBB path
+	  (`blake3_hasher_update_tbb`) which requires building with `BLAKE3_USE_TBB`.
+	- Here we keep the same algorithmic rules and final digest, while using only C++20
+	  standard facilities (`std::async`, `std::thread`) and no oneTBB dependency.
+	- The local helpers below are adapted from upstream tree-processing internals used
+	  in `c/blake3.c` and the oneTBB integration path.
+
+	Primary references:
+	- BLAKE3 spec repository (paper): https://github.com/BLAKE3-team/BLAKE3-specs
+	- C2SP BLAKE3 specification: https://c2sp.org/BLAKE3
+	- Upstream BLAKE3 C API notes (`update_tbb`): https://github.com/BLAKE3-team/BLAKE3/blob/master/c/README.md
+*/
+
 namespace nbl::core
 {
 
+namespace
+{
+
+struct output_t // Captured arguments of a pending blake3_compress_in_place() call (see output_chaining_value_local).
+{
+	uint32_t input_cv[8]; // chaining value the compression starts from
+	uint64_t counter;
+	uint8_t block[BLAKE3_BLOCK_LEN]; // message block to compress
+	uint8_t block_len; // number of meaningful bytes in `block`
+	uint8_t flags;
+};
+
+INLINE void chunk_state_init_local(blake3_chunk_state* self, const uint32_t key[8], uint8_t flags)
+{ // Starts chunk 0: the chaining value is seeded from `key`, the block buffer is zeroed and `flags` is stored.
+	std::memcpy(self->cv, key, BLAKE3_KEY_LEN);
+	self->chunk_counter = 0;
+	std::memset(self->buf, 0, BLAKE3_BLOCK_LEN);
+	self->buf_len = 0;
+	self->blocks_compressed = 0;
+	self->flags = flags;
+}
+
+INLINE void chunk_state_reset_local(blake3_chunk_state* self, const uint32_t key[8], uint64_t chunk_counter)
+{ // Re-arms the state for the chunk numbered `chunk_counter`; unlike init, `flags` is left as it was.
+	std::memcpy(self->cv, key, BLAKE3_KEY_LEN);
+	self->chunk_counter = chunk_counter;
+	self->blocks_compressed = 0;
+	std::memset(self->buf, 0, BLAKE3_BLOCK_LEN);
+	self->buf_len = 0;
+}
+
+INLINE size_t chunk_state_len_local(const blake3_chunk_state* self)
+{ // Bytes absorbed into the current chunk so far: the compressed blocks plus the buffered tail.
+	return (BLAKE3_BLOCK_LEN * static_cast<size_t>(self->blocks_compressed)) + static_cast<size_t>(self->buf_len);
+}
+
+INLINE size_t chunk_state_fill_buf_local(blake3_chunk_state* self, const uint8_t* input, size_t input_len)
+{ // Copies as much input as fits into the free part of the block buffer and returns the number of bytes taken.
+	size_t take = BLAKE3_BLOCK_LEN - static_cast<size_t>(self->buf_len);
+	if (take > input_len)
+		take = input_len;
+	auto* const dest = self->buf + static_cast<size_t>(self->buf_len);
+	std::memcpy(dest, input, take);
+	self->buf_len += static_cast<uint8_t>(take); // cannot overflow: take is at most BLAKE3_BLOCK_LEN - buf_len
+	return take;
+}
+
+INLINE uint8_t chunk_state_maybe_start_flag_local(const blake3_chunk_state* self)
+{ // CHUNK_START applies only while no block of this chunk has been compressed yet.
+	return self->blocks_compressed == 0 ? CHUNK_START : 0;
+}
+
+INLINE output_t make_output_local(const uint32_t input_cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
+{ // Bundles the inputs of a compression into an output_t by value; nothing is compressed here.
+	output_t ret = {};
+	std::memcpy(ret.input_cv, input_cv, 32);
+	std::memcpy(ret.block, block, BLAKE3_BLOCK_LEN); // always copies the whole block, regardless of block_len
+	ret.block_len = block_len;
+	ret.counter = counter;
+	ret.flags = flags;
+	return ret;
+}
+
+INLINE void output_chaining_value_local(const output_t* self, uint8_t cv[32])
+{ // Runs the captured compression and stores the resulting chaining value as 32 bytes in `cv`.
+	uint32_t cv_words[8];
+	std::memcpy(cv_words, self->input_cv, 32); // work on a copy so `self` stays untouched
+	blake3_compress_in_place(cv_words, self->block, self->block_len, self->counter, self->flags);
+	store_cv_words(cv, cv_words);
+}
+
+INLINE void chunk_state_update_local(blake3_chunk_state* self, const uint8_t* input, size_t input_len)
+{ // Absorbs input into the current chunk; the most recent block always stays buffered for chunk_state_output_local().
+	if (self->buf_len > 0)
+	{
+		size_t take = chunk_state_fill_buf_local(self, input, input_len); // top up the partially filled buffer first
+		input += take;
+		input_len -= take;
+		if (input_len > 0) // compress the buffer only when more input follows it
+		{
+			blake3_compress_in_place(
+				self->cv,
+				self->buf,
+				BLAKE3_BLOCK_LEN,
+				self->chunk_counter,
+				self->flags | chunk_state_maybe_start_flag_local(self));
+			self->blocks_compressed += 1;
+			self->buf_len = 0;
+			std::memset(self->buf, 0, BLAKE3_BLOCK_LEN);
+		}
+	}
+
+	while (input_len > BLAKE3_BLOCK_LEN) // strictly greater: a final full block is buffered, not compressed
+	{
+		blake3_compress_in_place(
+			self->cv,
+			input,
+			BLAKE3_BLOCK_LEN,
+			self->chunk_counter,
+			self->flags | chunk_state_maybe_start_flag_local(self));
+		self->blocks_compressed += 1;
+		input += BLAKE3_BLOCK_LEN;
+		input_len -= BLAKE3_BLOCK_LEN;
+	}
+
+	(void)chunk_state_fill_buf_local(self, input, input_len);
+}
+
+INLINE output_t chunk_state_output_local(const blake3_chunk_state* self)
+{ // Describes the compression of the buffered last block of the chunk, flagged CHUNK_END (plus CHUNK_START if it is the only block).
+	const uint8_t block_flags = self->flags | chunk_state_maybe_start_flag_local(self) | CHUNK_END;
+	return make_output_local(self->cv, self->buf, self->buf_len, self->chunk_counter, block_flags);
+}
+
+INLINE output_t parent_output_local(const uint8_t block[BLAKE3_BLOCK_LEN], const uint32_t key[8], uint8_t flags)
+{ // Describes a parent-node compression: `block` is compressed under `key` with counter 0 and the PARENT flag.
+	return make_output_local(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT);
+}
+
+INLINE size_t left_len_local(size_t content_len)
+{ // Byte length of the left subtree: the largest power-of-two number of whole chunks that still leaves input for the right side.
+	const size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN; // the only visible caller passes more than one chunk, so no underflow
+	return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN;
+}
+
+INLINE size_t compress_chunks_parallel_local(
+	const uint8_t* input,
+	size_t input_len,
+	const uint32_t key[8],
+	uint64_t chunk_counter,
+	uint8_t flags,
+	uint8_t* out)
+{ // Hashes all whole chunks in one blake3_hash_many() batch plus an optional partial chunk; returns the number of CVs written to `out`.
+	const uint8_t* chunks_array[MAX_SIMD_DEGREE]; // no bounds check below: callers must not pass more than MAX_SIMD_DEGREE whole chunks
+	size_t input_position = 0;
+	size_t chunks_array_len = 0;
+	while (input_len - input_position >= BLAKE3_CHUNK_LEN)
+	{
+		chunks_array[chunks_array_len] = &input[input_position];
+		input_position += BLAKE3_CHUNK_LEN;
+		chunks_array_len += 1;
+	}
+
+	blake3_hash_many(
+		chunks_array,
+		chunks_array_len,
+		BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN,
+		key,
+		chunk_counter,
+		true,
+		flags,
+		CHUNK_START,
+		CHUNK_END,
+		out);
+
+	if (input_len > input_position) // a trailing partial chunk goes through the scalar chunk-state path
+	{
+		const uint64_t counter = chunk_counter + static_cast<uint64_t>(chunks_array_len);
+		blake3_chunk_state chunk_state = {};
+		chunk_state_init_local(&chunk_state, key, flags);
+		chunk_state.chunk_counter = counter;
+		chunk_state_update_local(&chunk_state, &input[input_position], input_len - input_position);
+		const auto output = chunk_state_output_local(&chunk_state);
+		output_chaining_value_local(&output, &out[chunks_array_len * BLAKE3_OUT_LEN]);
+		return chunks_array_len + 1;
+	}
+
+	return chunks_array_len;
+}
+
+INLINE size_t compress_parents_parallel_local(
+	const uint8_t* child_chaining_values,
+	size_t num_chaining_values,
+	const uint32_t key[8],
+	uint8_t flags,
+	uint8_t* out)
+{ // Compresses adjacent CV pairs into parent CVs in one batch; an odd CV out is copied through. Returns the number of CVs in `out`.
+	const uint8_t* parents_array[MAX_SIMD_DEGREE_OR_2];
+	size_t parents_array_len = 0;
+	while (num_chaining_values - (2 * parents_array_len) >= 2)
+	{
+		parents_array[parents_array_len] =
+			&child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN];
+		parents_array_len += 1;
+	}
+
+	blake3_hash_many(
+		parents_array,
+		parents_array_len,
+		1,
+		key,
+		0,
+		false,
+		flags | PARENT,
+		0,
+		0,
+		out);
+
+	if (num_chaining_values > 2 * parents_array_len) // one unpaired CV is left over
+	{
+		std::memcpy(
+			&out[parents_array_len * BLAKE3_OUT_LEN],
+			&child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN],
+			BLAKE3_OUT_LEN);
+		return parents_array_len + 1;
+	}
+
+	return parents_array_len;
+}
+
+constexpr size_t ParallelMinInputBytes = 1ull << 20; // 1 MiB: pick_parallel_budget() returns 1 for smaller inputs
+constexpr size_t ParallelThreadGranularityBytes = 768ull << 10; // 768 KiB of input per budgeted thread
+constexpr size_t ParallelSpawnMinSubtreeBytes = 512ull << 10; // 512 KiB: both halves must be at least this large before a task is spawned
+constexpr uint32_t ParallelMaxThreads = 8u; // upper bound on the thread budget of a single call
+std::atomic_uint32_t g_parallelHashCalls = 0u; // number of SParallelCallGuard instances currently alive
+
+class SParallelCallGuard final // Scope guard counting in-flight parallel hash calls in g_parallelHashCalls.
+{
+	public:
+		SParallelCallGuard() : m_active(g_parallelHashCalls.fetch_add(1u, std::memory_order_relaxed) + 1u) {}
+		SParallelCallGuard(const SParallelCallGuard&) = delete; // a copy would decrement the counter a second time
+		SParallelCallGuard& operator=(const SParallelCallGuard&) = delete;
+
+		~SParallelCallGuard()
+		{
+			g_parallelHashCalls.fetch_sub(1u, std::memory_order_relaxed);
+		}
+
+		inline uint32_t activeCalls() const // guards alive when this one was constructed, itself included
+		{
+			return m_active;
+		}
+
+	private:
+		uint32_t m_active = 1u;
+};
+
+size_t compress_subtree_wide_mt(
+	const uint8_t* input,
+	size_t input_len,
+	const uint32_t key[8],
+	uint64_t chunk_counter,
+	uint8_t flags,
+	uint8_t* out,
+	uint32_t threadBudget);
+
+INLINE void compress_subtree_to_parent_node_mt(
+	const uint8_t* input,
+	size_t input_len,
+	const uint32_t key[8],
+	uint64_t chunk_counter,
+	uint8_t flags,
+	uint8_t out[2 * BLAKE3_OUT_LEN],
+	uint32_t threadBudget)
+{ // Hashes a subtree and reduces its chaining values until exactly two remain, which are written to `out`.
+	uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
+	size_t num_cvs = compress_subtree_wide_mt(input, input_len, key, chunk_counter, flags, cv_array, threadBudget);
+	assert(num_cvs <= MAX_SIMD_DEGREE_OR_2);
+
+#if MAX_SIMD_DEGREE_OR_2 > 2
+	uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2];
+	while (num_cvs > 2) // each pass roughly halves the number of chaining values
+	{
+		num_cvs = compress_parents_parallel_local(cv_array, num_cvs, key, flags, out_array);
+		std::memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN);
+	}
+#endif
+
+	std::memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
+}
+
+size_t compress_subtree_wide_mt(
+	const uint8_t* input,
+	size_t input_len,
+	const uint32_t key[8],
+	uint64_t chunk_counter,
+	uint8_t flags,
+	uint8_t* out,
+	uint32_t threadBudget)
+{ // Recursive subtree hashing; with enough budget the right half runs on a std::async task. Returns the number of CVs written to `out`.
+	if (input_len <= blake3_simd_degree() * BLAKE3_CHUNK_LEN) // base case: fits a single SIMD batch
+		return compress_chunks_parallel_local(input, input_len, key, chunk_counter, flags, out);
+
+	const size_t left_input_len = left_len_local(input_len);
+	const size_t right_input_len = input_len - left_input_len;
+	const uint8_t* const right_input = &input[left_input_len];
+	const uint64_t right_chunk_counter = chunk_counter + static_cast<uint64_t>(left_input_len / BLAKE3_CHUNK_LEN);
+
+	uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
+	size_t degree = blake3_simd_degree();
+	if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1)
+		degree = 2;
+	uint8_t* const right_cvs = &cv_array[degree * BLAKE3_OUT_LEN]; // the right half writes after the slots reserved for the left half
+
+	size_t left_n = 0;
+	size_t right_n = 0;
+	bool spawned = false;
+	if (
+		threadBudget > 1u &&
+		left_input_len >= ParallelSpawnMinSubtreeBytes &&
+		right_input_len >= ParallelSpawnMinSubtreeBytes)
+	{
+		try
+		{
+			uint32_t leftBudget = threadBudget / 2u; // the budget is split between the two halves
+			if (leftBudget == 0u)
+				leftBudget = 1u;
+			uint32_t rightBudget = threadBudget - leftBudget;
+			if (rightBudget == 0u)
+				rightBudget = 1u;
+
+			auto rightFuture = std::async(std::launch::async, [right_input, right_input_len, key, right_chunk_counter, flags, right_cvs, rightBudget]() -> size_t
+			{
+				return compress_subtree_wide_mt(right_input, right_input_len, key, right_chunk_counter, flags, right_cvs, rightBudget);
+			});
+			left_n = compress_subtree_wide_mt(input, left_input_len, key, chunk_counter, flags, cv_array, leftBudget); // left half on the calling thread
+			right_n = rightFuture.get();
+			spawned = true;
+		}
+		catch (...)
+		{
+			spawned = false; // deliberate best effort: any failure falls through to the sequential path below
+		}
+	}
+
+	if (!spawned)
+	{
+		left_n = compress_subtree_wide_mt(input, left_input_len, key, chunk_counter, flags, cv_array, 1u);
+		right_n = compress_subtree_wide_mt(right_input, right_input_len, key, right_chunk_counter, flags, right_cvs, 1u);
+	}
+
+	if (left_n == 1) // one CV on the left: hand back both CVs uncompressed so the caller receives two
+	{
+		std::memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
+		return 2;
+	}
+
+	const size_t num_chaining_values = left_n + right_n;
+	return compress_parents_parallel_local(cv_array, num_chaining_values, key, flags, out);
+}
+
+INLINE void hasher_merge_cv_stack_local(::blake3_hasher* self, uint64_t total_len)
+{ // Merges the top two stack entries into a parent CV until the stack holds popcnt(total_len) entries.
+	const size_t post_merge_stack_len = static_cast<size_t>(popcnt(total_len));
+	while (self->cv_stack_len > post_merge_stack_len)
+	{
+		auto* const parent_node = &self->cv_stack[(self->cv_stack_len - 2) * BLAKE3_OUT_LEN];
+		const auto output = parent_output_local(parent_node, self->key, self->chunk.flags);
+		output_chaining_value_local(&output, parent_node); // the parent CV overwrites the left child in place
+		self->cv_stack_len -= 1;
+	}
+}
+
+INLINE void hasher_push_cv_local(::blake3_hasher* self, uint8_t new_cv[BLAKE3_OUT_LEN], uint64_t chunk_counter)
+{ // Pushes one chaining value, merging completed subtrees first so the stack holds popcnt(chunk_counter) entries before the push.
+	hasher_merge_cv_stack_local(self, chunk_counter);
+	std::memcpy(&self->cv_stack[self->cv_stack_len * BLAKE3_OUT_LEN], new_cv, BLAKE3_OUT_LEN);
+	self->cv_stack_len += 1;
+}
+
+void hasher_update_parallel(::blake3_hasher* self, const uint8_t* input_bytes, size_t input_len, uint32_t threadBudget)
+{ // Absorbs input into *self, hashing whole power-of-two subtrees via compress_subtree_to_parent_node_mt with `threadBudget`.
+	if (input_len == 0)
+		return;
+
+	if (chunk_state_len_local(&self->chunk) > 0) // a partially filled chunk is pending: finish it first
+	{
+		size_t take = BLAKE3_CHUNK_LEN - chunk_state_len_local(&self->chunk);
+		if (take > input_len)
+			take = input_len;
+		chunk_state_update_local(&self->chunk, input_bytes, take);
+		input_bytes += take;
+		input_len -= take;
+		if (input_len > 0) // finalize the chunk only when more input follows it
+		{
+			const auto output = chunk_state_output_local(&self->chunk);
+			uint8_t chunk_cv[BLAKE3_OUT_LEN];
+			output_chaining_value_local(&output, chunk_cv);
+			hasher_push_cv_local(self, chunk_cv, self->chunk.chunk_counter);
+			chunk_state_reset_local(&self->chunk, self->key, self->chunk.chunk_counter + 1);
+		}
+		else
+		{
+			return;
+		}
+	}
+
+	while (input_len > BLAKE3_CHUNK_LEN) // strictly greater: the last chunk is always left for the chunk state below
+	{
+		size_t subtree_len = round_down_to_power_of_2(input_len);
+		const uint64_t count_so_far = self->chunk.chunk_counter * BLAKE3_CHUNK_LEN;
+		while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) // shrink until subtree_len evenly divides the bytes hashed so far
+			subtree_len /= 2;
+
+		const uint64_t subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN;
+		if (subtree_len <= BLAKE3_CHUNK_LEN) // single chunk: hash it directly and push one CV
+		{
+			blake3_chunk_state chunk_state = {};
+			chunk_state_init_local(&chunk_state, self->key, self->chunk.flags);
+			chunk_state.chunk_counter = self->chunk.chunk_counter;
+			chunk_state_update_local(&chunk_state, input_bytes, subtree_len);
+			const auto output = chunk_state_output_local(&chunk_state);
+			uint8_t cv[BLAKE3_OUT_LEN];
+			output_chaining_value_local(&output, cv);
+			hasher_push_cv_local(self, cv, chunk_state.chunk_counter);
+		}
+		else // multi-chunk subtree: reduced to two CVs, which are pushed separately
+		{
+			uint8_t cv_pair[2 * BLAKE3_OUT_LEN];
+			compress_subtree_to_parent_node_mt(
+				input_bytes,
+				subtree_len,
+				self->key,
+				self->chunk.chunk_counter,
+				self->chunk.flags,
+				cv_pair,
+				threadBudget);
+			hasher_push_cv_local(self, cv_pair, self->chunk.chunk_counter);
+			hasher_push_cv_local(self, &cv_pair[BLAKE3_OUT_LEN], self->chunk.chunk_counter + (subtree_chunks / 2));
+		}
+		self->chunk.chunk_counter += subtree_chunks;
+		input_bytes += subtree_len;
+		input_len -= subtree_len;
+	}
+
+	if (input_len > 0) // at most one chunk remains; it stays buffered in the chunk state
+	{
+		chunk_state_update_local(&self->chunk, input_bytes, input_len);
+		hasher_merge_cv_stack_local(self, self->chunk.chunk_counter);
+	}
+}
+
+INLINE uint32_t pick_parallel_budget(const size_t bytes)
+{ // Thread budget for hashing `bytes`: min(hardware threads, ParallelMaxThreads, bytes / granularity), never below 1.
+	const uint32_t hw = std::thread::hardware_concurrency(); // may be 0 when the value is not computable
+	if (hw <= 1u || bytes < ParallelMinInputBytes)
+		return 1u;
+
+	const uint32_t maxBySize = static_cast<uint32_t>(std::max<size_t>(1ull, bytes / ParallelThreadGranularityBytes));
+	uint32_t budget = std::min<uint32_t>(hw, ParallelMaxThreads);
+	budget = std::min<uint32_t>(budget, maxBySize);
+	return std::max<uint32_t>(1u, budget);
+}
+
+}
+
+void blake3_hasher::validateOpaqueStateLayout()
+{
+	// The wrapper keeps a small inline storage margin so the real vendor hasher
+	// stays out of the public API. The margin gives us a safe footprint reserve
+	// for ABI or platform differences and only increases the wrapper size slightly.
+	static_assert(sizeof(::blake3_hasher) <= OpaqueStateSize); // compile-time checks only: this function does nothing at run time
+	static_assert(alignof(::blake3_hasher) <= OpaqueStateAlign);
+}
+
 blake3_hasher::blake3_hasher()
 {
-    ::blake3_hasher_init(&m_state);
+	validateOpaqueStateLayout(); // only static_asserts inside: no run-time work
+	::blake3_hasher_init(reinterpret_cast<::blake3_hasher*>(m_state)); // m_state is opaque storage holding the vendor hasher
 }
 
 blake3_hasher& blake3_hasher::update(const void* data, const size_t bytes)
 {
-    ::blake3_hasher_update(&m_state, data, bytes);
-    return *this;
+	if (bytes == 0ull)
+		return *this; // nothing to absorb; `data` may legitimately be null here
+
+	assert(data != nullptr);
+	if (!data)
+		return *this; // release builds: ignore the invalid call instead of dereferencing null
+
+	::blake3_hasher_update(reinterpret_cast<::blake3_hasher*>(m_state), data, bytes);
+	return *this;
 }
 
-void blake3_hasher::reset() {
-  ::blake3_hasher_reset(&m_state);
+void blake3_hasher::reset()
+{
+	::blake3_hasher_init(reinterpret_cast<::blake3_hasher*>(m_state)); // re-initialise exactly as the constructor does
 }
 
 blake3_hasher::operator blake3_hash_t() const
 {
-    blake3_hash_t retval;
-    // the blake3 docs say that the hasher can be finalized multiple times
-    ::blake3_hasher_finalize(&m_state, retval.data, sizeof(retval));
-    return retval;
+	blake3_hash_t retval = {};
+	const auto* const state = reinterpret_cast<const ::blake3_hasher*>(m_state);
+	::blake3_hasher stateCopy = *state; // finalize a copy, so the stored state is never touched by this const conversion
+	::blake3_hasher_finalize(&stateCopy, retval.data, blake3_hash_t::DigestSize);
+	return retval;
+}
+
+blake3_hash_t blake3_hash_buffer(const void* data, size_t bytes)
+{ // One-shot hash of a buffer; uses the multithreaded path only when size and current load grant a budget above one thread.
+	if (!data && bytes != 0ull)
+		return {}; // invalid input: a value-initialised hash is returned
+	if (bytes == 0ull)
+		return static_cast<blake3_hash_t>(blake3_hasher{}); // digest of a freshly initialised hasher, i.e. of the empty input
+
+	uint32_t threadBudget = pick_parallel_budget(bytes);
+	if (threadBudget <= 1u)
+		return blake3_hash_buffer_sequential(data, bytes);
+
+	SParallelCallGuard guard; // counts this call among the concurrently running parallel hashes
+	const uint32_t activeCalls = std::max<uint32_t>(1u, guard.activeCalls());
+	const uint32_t hw = std::max<uint32_t>(1u, std::thread::hardware_concurrency());
+	const uint32_t hwShare = std::max<uint32_t>(1u, hw / activeCalls); // hardware threads divided among the concurrent calls
+	threadBudget = std::min(threadBudget, hwShare);
+	if (threadBudget <= 1u)
+		return blake3_hash_buffer_sequential(data, bytes);
+
+	::blake3_hasher hasherState = {};
+	::blake3_hasher_init(&hasherState);
+	hasher_update_parallel(&hasherState, reinterpret_cast<const uint8_t*>(data), bytes, threadBudget);
+	blake3_hash_t retval = {};
+	::blake3_hasher_finalize(&hasherState, retval.data, blake3_hash_t::DigestSize);
+	return retval;
+}
+
+blake3_hash_t blake3_hash_buffer_sequential(const void* data, size_t bytes)
+{ // One-shot hash of a buffer on the calling thread through the vendor init/update/finalize API.
+	if (!data && bytes != 0ull)
+		return {}; // invalid input: a value-initialised hash is returned
+
+	::blake3_hasher hasher = {};
+	::blake3_hasher_init(&hasher);
+	if (bytes != 0ull)
+		::blake3_hasher_update(&hasher, data, bytes);
+
+	blake3_hash_t retval = {};
+	::blake3_hasher_finalize(&hasher, retval.data, blake3_hash_t::DigestSize);
+	return retval;
 }
 
 }
diff --git a/src/nbl/gtml.cpp b/src/nbl/gtml.cpp
index 2829c03c07..f1f9b1d0fe 100644
--- a/src/nbl/gtml.cpp
+++ b/src/nbl/gtml.cpp
@@ -1,7 +1,7 @@
-#include "git_info.h"
+#include "nbl/git/info.h"
 
 namespace nbl {
-	const gtml::GitInfo& getGitInfo(gtml::E_GIT_REPO_META repo) {
-		return gtml::gitMeta[repo];
+	const ::gtml::IGitInfo& getGitInfo(gtml::E_GIT_REPO_META repo) { // returns the ::gtml::IGitInfo of the requested repository
+		return *gtml::gitMeta[repo]; // entries are dereferenced; `repo` is not range-checked here
 	}
-}
\ No newline at end of file
+}
diff --git a/src/nbl/system/CFilePOSIX.cpp b/src/nbl/system/CFilePOSIX.cpp
index 1f78d5befa..2eb9e62ed2 100644
--- a/src/nbl/system/CFilePOSIX.cpp
+++ b/src/nbl/system/CFilePOSIX.cpp
@@ -28,13 +28,13 @@ CFilePOSIX::~CFilePOSIX()
 
 size_t CFilePOSIX::asyncRead(void* buffer, size_t offset, size_t sizeToRead)
 {
-	lseek(m_native, offset, SEEK_SET);
-	return ::read(m_native, buffer, sizeToRead);
+	const auto processed = pread(m_native, buffer, sizeToRead, static_cast<off_t>(offset)); // positional read: no separate seek on the descriptor
+	return processed > 0 ? static_cast<size_t>(processed):0ull; // errors (-1) and end of file (0) are both reported as 0 bytes
 }
 
 size_t CFilePOSIX::asyncWrite(const void* buffer, size_t offset, size_t sizeToWrite)
 {
-	lseek(m_native, offset, SEEK_SET);
-	return ::write(m_native, buffer, sizeToWrite);
+	const auto processed = pwrite(m_native, buffer, sizeToWrite, static_cast<off_t>(offset)); // positional write: no separate seek on the descriptor
+	return processed > 0 ? static_cast<size_t>(processed):0ull; // a failed write (-1) is reported as 0 bytes
 }
-#endif
\ No newline at end of file
+#endif
diff --git a/src/nbl/system/CFileWin32.cpp b/src/nbl/system/CFileWin32.cpp
index ae888e0d9b..ffe9d9c6b0 100644
--- a/src/nbl/system/CFileWin32.cpp
+++ b/src/nbl/system/CFileWin32.cpp
@@ -57,17 +57,19 @@ inline size_t CFileWin32::getSize() const
 
 size_t CFileWin32::asyncRead(void* buffer, size_t offset, size_t sizeToRead)
 {
-	seek(offset);
+	OVERLAPPED overlapped = {}; // carries the file position for this call, replacing the earlier seek()
+	overlapped.Offset = LODWORD(offset); // low 32 bits of the byte offset
+	overlapped.OffsetHigh = HIDWORD(offset); // high 32 bits of the byte offset
 	DWORD numOfBytesRead;
-	ReadFile(m_native, buffer, sizeToRead, &numOfBytesRead, nullptr);
-	return numOfBytesRead;
+	return ReadFile(m_native, buffer, sizeToRead, &numOfBytesRead, &overlapped) ? numOfBytesRead:0ull; // failure reports 0 bytes; numOfBytesRead is only read when ReadFile succeeds
 }
 size_t CFileWin32::asyncWrite(const void* buffer, size_t offset, size_t sizeToWrite)
 {
-	seek(offset);
+	OVERLAPPED overlapped = {}; // carries the file position for this call, replacing the earlier seek()
+	overlapped.Offset = LODWORD(offset); // low 32 bits of the byte offset
+	overlapped.OffsetHigh = HIDWORD(offset); // high 32 bits of the byte offset
 	DWORD numOfBytesWritten;
-	WriteFile(m_native, buffer, sizeToWrite, &numOfBytesWritten, nullptr);
-	return numOfBytesWritten;
+	return WriteFile(m_native, buffer, sizeToWrite, &numOfBytesWritten, &overlapped) ? numOfBytesWritten:0ull; // failure reports 0 bytes; numOfBytesWritten is only read when WriteFile succeeds
 }
 
 
@@ -76,4 +78,4 @@ void CFileWin32::seek(size_t position)
 	LONG hiDword = HIDWORD(position);
 	SetFilePointer(m_native,position,&hiDword,FILE_BEGIN);
 }
-#endif
\ No newline at end of file
+#endif
diff --git a/src/nbl/system/CSystemLinux.cpp b/src/nbl/system/CSystemLinux.cpp
index 58aaeeb51b..8a47ac09b3 100644
--- a/src/nbl/system/CSystemLinux.cpp
+++ b/src/nbl/system/CSystemLinux.cpp
@@ -5,10 +5,29 @@ using namespace nbl::system;
 
 #ifdef _NBL_PLATFORM_LINUX_
 
+#include <algorithm>
+#include <cctype>
+#include <fstream>
+#include <string>
+#include <unordered_set>
 #include <sys/sysinfo.h>
 #include <sys/stat.h>
 #include <fcntl.h>
 #include <unistd.h>
+
+namespace
+{
+
+std::string trimCopy(std::string value) // returns `value` with leading and trailing whitespace removed
+{
+    auto notSpace = [](unsigned char ch) { return !std::isspace(ch); }; // unsigned char parameter keeps negative char values away from std::isspace
+    value.erase(value.begin(), std::find_if(value.begin(), value.end(), notSpace)); // drop leading whitespace
+    value.erase(std::find_if(value.rbegin(), value.rend(), notSpace).base(), value.end()); // drop trailing whitespace
+    return value;
+}
+
+}
+
 ISystem::SystemInfo CSystemLinux::getSystemInfo() const
 {
     SystemInfo info;
@@ -27,6 +46,53 @@ ISystem::SystemInfo CSystemLinux::getSystemInfo() const
     info.desktopResX = 0xdeadbeefu;
     info.desktopResY = 0xdeadbeefu;
 
+    std::ifstream cpuInfo("/proc/cpuinfo");
+    std::unordered_set<std::string> uniquePhysicalCores;
+    std::string currentPhysicalId;
+    std::string currentCoreId;
+    auto flushCurrentCore = [&]()
+    {
+        if (!currentPhysicalId.empty() || !currentCoreId.empty())
+            uniquePhysicalCores.insert(currentPhysicalId + ":" + currentCoreId);
+        currentPhysicalId.clear();
+        currentCoreId.clear();
+    };
+
+    for (std::string line; std::getline(cpuInfo, line);)
+    {
+        if (line.empty())
+        {
+            flushCurrentCore();
+            continue;
+        }
+
+        if (line.starts_with("model name"))
+        {
+            const auto separator = line.find(':');
+            if (separator != std::string::npos && info.cpuName == "Unknown")
+                info.cpuName = trimCopy(line.substr(separator + 1u));
+            continue;
+        }
+
+        if (line.starts_with("physical id"))
+        {
+            const auto separator = line.find(':');
+            if (separator != std::string::npos)
+                currentPhysicalId = trimCopy(line.substr(separator + 1u));
+            continue;
+        }
+
+        if (line.starts_with("core id"))
+        {
+            const auto separator = line.find(':');
+            if (separator != std::string::npos)
+                currentCoreId = trimCopy(line.substr(separator + 1u));
+            continue;
+        }
+    }
+    flushCurrentCore();
+    info.physicalCoreCount = static_cast<uint32_t>(uniquePhysicalCores.size());
+
     return info;
 }
 
@@ -62,4 +128,4 @@ bool isDebuggerAttached()
    return false;
 }
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/nbl/system/CSystemWin32.cpp b/src/nbl/system/CSystemWin32.cpp
index 2798b4fb27..49aaf2e3ac 100644
--- a/src/nbl/system/CSystemWin32.cpp
+++ b/src/nbl/system/CSystemWin32.cpp
@@ -1,11 +1,70 @@
 #include "nbl/system/CSystemWin32.h"
 #include "nbl/system/CFileWin32.h"
+#include "nbl/system/SWin32PathUtilities.h"
 
 using namespace nbl;
 using namespace nbl::system;
 
 #ifdef _NBL_PLATFORM_WINDOWS_
+#include <algorithm>
+#include <vector>
 #include <powerbase.h>
+#include <intrin.h>
+#include <array>
+#include <cstring>
+
+namespace
+{
+std::string queryCpuName() // CPU brand string from CPUID leaves 0x80000002..0x80000004, trimmed; "Unknown" when unavailable or empty
+{
+    int cpuInfo[4] = {};
+    __cpuid(cpuInfo, 0x80000000); // EAX of this leaf is the highest supported extended leaf
+    const auto maxExtendedLeaf = static_cast<uint32_t>(cpuInfo[0]);
+    if (maxExtendedLeaf < 0x80000004u)
+        return "Unknown";
+
+    std::array<char, 49> brandString = {}; // 3 leaves * 16 bytes plus a zero terminator
+    char* cursor = brandString.data(); // byte cursor: the char array carries no int alignment guarantee
+    for (uint32_t leaf = 0x80000002u; leaf <= 0x80000004u; ++leaf)
+    {
+        __cpuid(cpuInfo, static_cast<int>(leaf));
+        std::memcpy(cursor, cpuInfo, sizeof(cpuInfo));
+        cursor += sizeof(cpuInfo);
+    }
+
+    std::string result = brandString.data();
+    auto notSpace = [](unsigned char ch) { return !std::isspace(ch); };
+    result.erase(result.begin(), std::find_if(result.begin(), result.end(), notSpace)); // drop leading padding
+    result.erase(std::find_if(result.rbegin(), result.rend(), notSpace).base(), result.end()); // drop trailing padding
+    return result.empty() ? std::string("Unknown") : result;
+}
+
+uint32_t queryPhysicalCoreCount() // number of RelationProcessorCore records reported by the OS; 0 when either query fails
+{
+    DWORD bufferSize = 0u;
+    GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &bufferSize); // sizing call: null buffer, the required byte count comes back in bufferSize
+    if (bufferSize == 0u)
+        return 0u;
+
+    std::vector<uint8_t> buffer(bufferSize);
+    auto* info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data());
+    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, info, &bufferSize)) // second call fills the buffer
+        return 0u;
+
+    uint32_t coreCount = 0u;
+    auto* current = reinterpret_cast<uint8_t*>(info);
+    const auto* end = current + bufferSize;
+    while (current < end)
+    {
+        auto* entry = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(current);
+        if (entry->Relationship == RelationProcessorCore)
+            ++coreCount;
+        current += entry->Size; // records are walked by their own Size field rather than a fixed stride
+    }
+    return coreCount;
+}
+
+}
 
 //LOL the struct definition wasn't added to winapi headers do they ask to declare them yourself
 typedef struct _PROCESSOR_POWER_INFORMATION {
@@ -34,12 +93,14 @@ ISystem::SystemInfo CSystemWin32::getSystemInfo() const
 
     info.desktopResX = GetSystemMetrics(SM_CXSCREEN);
     info.desktopResY = GetSystemMetrics(SM_CYSCREEN);
+    info.cpuName = queryCpuName();
+    info.physicalCoreCount = queryPhysicalCoreCount();
 
     return info;
 }
 
 
-core::smart_refctd_ptr<ISystemFile> CSystemWin32::CCaller::createFile(const std::filesystem::path& filename, const core::bitflag<IFile::E_CREATE_FLAGS> flags)
+core::smart_refctd_ptr<ISystemFile> CSystemWin32::CCaller::createFile(const std::filesystem::path& filename, core::bitflag<IFile::E_CREATE_FLAGS> flags)
 {
     const bool writeAccess = flags.value&IFile::ECF_WRITE;
 	const DWORD fileAccess = ((flags.value&IFile::ECF_READ) ? FILE_GENERIC_READ:0)|(writeAccess ? FILE_GENERIC_WRITE:0);
@@ -52,12 +113,11 @@ core::smart_refctd_ptr<ISystemFile> CSystemWin32::CCaller::createFile(const std:
 	SECURITY_ATTRIBUTES secAttribs{ sizeof(SECURITY_ATTRIBUTES), nullptr, FALSE };
 	
 	system::path p = filename;
-	if (p.is_absolute()) 
-		p.make_preferred(); // Replace "/" separators with "\"
+	const auto nativePath = impl::makeLongPathAwareWindowsPath(p);
 
     // only write access should create new files if they don't exist
 	const auto creationDisposition = writeAccess ? OPEN_ALWAYS : OPEN_EXISTING;
-	HANDLE _native = CreateFileA(p.string().data(), fileAccess, shareMode, &secAttribs, creationDisposition, FILE_ATTRIBUTE_NORMAL, nullptr);
+	HANDLE _native = CreateFileW(nativePath.c_str(), fileAccess, shareMode, &secAttribs, creationDisposition, FILE_ATTRIBUTE_NORMAL, nullptr);
     if (_native==INVALID_HANDLE_VALUE)
     {
         auto e = GetLastError();
@@ -73,36 +133,37 @@ core::smart_refctd_ptr<ISystemFile> CSystemWin32::CCaller::createFile(const std:
         For now it equals the size of a file so it'll work fine for archive reading, but if we try to
         write outside those boungs, things will go bad.
         */
-        _fileMappingObj = CreateFileMappingA(_native,nullptr,writeAccess ? PAGE_READWRITE:PAGE_READONLY, 0, 0, filename.string().c_str());
+        _fileMappingObj = CreateFileMappingA(_native,nullptr,writeAccess ? PAGE_READWRITE:PAGE_READONLY, 0, 0, nullptr);
         if (!_fileMappingObj)
         {
-            CloseHandle(_native);
-            return nullptr;
+            // backend fallback: file opens successfully but mapping-related flags are removed
+            flags.value &= ~(IFile::ECF_COHERENT | IFile::ECF_MAPPABLE);
         }
-        DWORD hi = 0;
-        size_t size = GetFileSize(_native,&hi);
-        size |= size_t(hi) << 32ull;
-        switch (flags.value&IFile::ECF_READ_WRITE)
+		else
         {
-            case IFile::ECF_READ:
-                _mappedPtr = MapViewOfFile(_fileMappingObj,FILE_MAP_READ,0,0,size);
-                break;
-            case IFile::ECF_WRITE:
-                _mappedPtr = MapViewOfFile(_fileMappingObj,FILE_MAP_WRITE,0,0,size);
-                break;
-            case IFile::ECF_READ_WRITE:
-                _mappedPtr = MapViewOfFile(_fileMappingObj,FILE_MAP_ALL_ACCESS,0,0,size);
-                break;
-            default:
-                assert(false); // should never happen
-                break;
-        }
-        if (!_mappedPtr)
-        {
-            CloseHandle(_native);
-            CloseHandle(_fileMappingObj);
-            return nullptr;
-        }
+            switch (flags.value&IFile::ECF_READ_WRITE)
+            {
+                case IFile::ECF_READ:
+                    _mappedPtr = MapViewOfFile(_fileMappingObj,FILE_MAP_READ,0,0,0);
+                    break;
+                case IFile::ECF_WRITE:
+                    _mappedPtr = MapViewOfFile(_fileMappingObj,FILE_MAP_WRITE,0,0,0);
+                    break;
+                case IFile::ECF_READ_WRITE:
+                    _mappedPtr = MapViewOfFile(_fileMappingObj,FILE_MAP_ALL_ACCESS,0,0,0);
+                    break;
+                default:
+                    assert(false); // should never happen
+                    break;
+            }
+            if (!_mappedPtr)
+            {
+                CloseHandle(_fileMappingObj);
+                _fileMappingObj = nullptr;
+                // backend fallback: file opens successfully but mapping-related flags are removed
+                flags.value &= ~(IFile::ECF_COHERENT | IFile::ECF_MAPPABLE);
+            }
+		}
     }
     return core::make_smart_refctd_ptr<CFileWin32>(core::smart_refctd_ptr<ISystem>(m_system),path(filename),flags,_mappedPtr,_native,_fileMappingObj);
 }
diff --git a/src/nbl/system/ISystem.cpp b/src/nbl/system/ISystem.cpp
index 6b25471f8d..f1b3dec85e 100644
--- a/src/nbl/system/ISystem.cpp
+++ b/src/nbl/system/ISystem.cpp
@@ -11,6 +11,14 @@
 #include "nbl/system/CArchiveLoaderZip.h"
 #include "nbl/system/CArchiveLoaderTar.h"
 #include "nbl/system/CMountDirectoryArchive.h"
+#include "nbl/system/SWin32PathUtilities.h"
+
+#ifdef _NBL_PLATFORM_WINDOWS_
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif
+#include <windows.h>
+#endif
 
 using namespace nbl;
 using namespace nbl::system;
@@ -37,15 +45,12 @@ ISystem::ISystem(core::smart_refctd_ptr<ISystem::ICaller>&& caller) : m_dispatch
 bool ISystem::exists(const system::path& filename, const core::bitflag<IFile::E_CREATE_FLAGS> flags) const
 {
     const bool writeUsage = flags.value&IFile::ECF_WRITE;
-    
-    // filename too long
-    if (filename.string().size() >= sizeof(SRequestParams_CREATE_FILE::filename))
-        return false;
-    // archive file
-    if (!writeUsage && findFileInArchive(filename).archive)
-        return true;
     // regular file
-    return std::filesystem::exists(filename);
+    std::error_code fsEc; // error_code overload below: filesystem failures do not throw
+    if (std::filesystem::exists(filename, fsEc) && !fsEc) // a filesystem error falls through to the archive lookup
+        return true;
+    // archive file
+    return !writeUsage && findFileInArchive(filename).archive; // archives are consulted only when write access is not requested
 }
 
 bool ISystem::isPathReadOnly(const system::path& p) const
@@ -122,10 +127,18 @@ bool ISystem::deleteDirectory(const system::path& p)
 
 bool nbl::system::ISystem::deleteFile(const system::path& p)
 {
+#ifdef _NBL_PLATFORM_WINDOWS_
+    const auto nativePath = impl::makeLongPathAwareWindowsPath(p); // pass the path itself: a round trip through p.string() narrows to the ANSI code page and can lose or reject Unicode names
+    const DWORD attributes = GetFileAttributesW(nativePath.c_str());
+    if (attributes == INVALID_FILE_ATTRIBUTES || (attributes & FILE_ATTRIBUTE_DIRECTORY)) // missing path or a directory: not a deletable file
+        return false;
+    return DeleteFileW(nativePath.c_str()) != FALSE; // explicit BOOL -> bool
+#else
     if (std::filesystem::exists(p) && !std::filesystem::is_directory(p))
         return std::filesystem::remove(p);
     else
         return false;
+#endif
 }
 
 std::error_code ISystem::moveFileOrDirectory(const system::path& oldPath, const system::path& newPath)
@@ -193,14 +206,34 @@ bool ISystem::copy(const system::path& from, const system::path& to)
 
 void ISystem::createFile(future_t<core::smart_refctd_ptr<IFile>>& future, std::filesystem::path filename, const core::bitflag<IFileBase::E_CREATE_FLAGS> flags, const std::string_view& accessToken)
 {
-    // canonicalize
-    if (std::filesystem::exists(filename))
-        filename = std::filesystem::canonical(filename);
+    std::error_code fsEc;
+    const bool writeUsage = flags.value&IFile::ECF_WRITE;
+    const bool absoluteInput = filename.is_absolute();
+    bool pathExists = false;
+    if (!writeUsage)
+    {
+        fsEc.clear();
+        pathExists = std::filesystem::exists(filename, fsEc) && !fsEc;
+        if (pathExists && !absoluteInput)
+        {
+            fsEc.clear();
+            const auto absolute = std::filesystem::absolute(filename, fsEc);
+            if (!fsEc)
+                filename = absolute;
+        }
+    }
 
     // try archives (readonly, for now)
-    if (!(flags.value&IFile::ECF_WRITE))
+    if (!writeUsage && !pathExists)
     {
-        const auto found = findFileInArchive(filename);
+        auto found = findFileInArchive(filename);
+        if (!found.archive && !absoluteInput)
+        {
+            fsEc.clear();
+            const auto absolute = std::filesystem::absolute(filename, fsEc);
+            if (!fsEc)
+                found = findFileInArchive(absolute);
+        }
         if (found.archive)
         {
             auto file = found.archive->getFile(found.pathRelativeToArchive,flags,accessToken);
@@ -213,17 +246,8 @@ void ISystem::createFile(future_t<core::smart_refctd_ptr<IFile>>& future, std::f
     }
 
     //
-    if (std::filesystem::exists(filename))
-        filename = std::filesystem::absolute(filename).generic_string();
-    if (filename.string().size()>=MAX_FILENAME_LENGTH)
-    {
-        future.set_result(nullptr);
-        return;
-    }
-
-
     SRequestParams_CREATE_FILE params;
-    strcpy(params.filename,filename.string().c_str());
+    params.filename = std::move(filename);
     params.flags = flags.value;
     m_dispatcher.request(&future,params);
 }
@@ -255,26 +279,69 @@ core::smart_refctd_ptr<IFileArchive> ISystem::openFileArchive(core::smart_refctd
 
 ISystem::FoundArchiveFile ISystem::findFileInArchive(const system::path& absolutePath) const
 {
-    system::path path = std::filesystem::exists(absolutePath) ? std::filesystem::canonical(absolutePath.parent_path()):absolutePath.parent_path();
-    // going up the directory tree
-    while (!path.empty() && path.parent_path()!=path)
+    std::error_code fsEc;
+    const system::path normalizedAbsolutePath = absolutePath.lexically_normal(); // purely lexical cleanup, no filesystem access
+    system::path normalizedAbsoluteFallback = {};
+    bool hasAbsoluteFallback = false;
+    if (!normalizedAbsolutePath.is_absolute()) // despite the parameter name, a relative input is tolerated
     {
-        path = std::filesystem::exists(path) ? std::filesystem::canonical(path):path;
+        const auto absoluteCandidate = std::filesystem::absolute(normalizedAbsolutePath, fsEc);
+        if (!fsEc)
+        {
+            normalizedAbsoluteFallback = absoluteCandidate.lexically_normal(); // second spelling of the same file, tried after the original one
+            hasAbsoluteFallback = true;
+        }
+    }
 
-        const auto archives = m_cachedArchiveFiles.findRange(path);
-        for (auto& archive : archives)
+    auto tryMatchAtPath = [&](const system::path& archivePath) -> FoundArchiveFile // looks for the file in archives cached under `archivePath`
+    {
+        auto tryMatchSingle = [&](const system::path& normalizedPath) -> FoundArchiveFile // one candidate spelling against one archive path
         {
-            const auto relative = std::filesystem::relative(absolutePath,path);
-            const auto items = static_cast<IFileArchive::SFileList::range_t>(archive.second->listAssets());
+            std::error_code relativeEc;
+            const auto relative = std::filesystem::relative(normalizedPath, archivePath, relativeEc);
+            if (relativeEc) // no usable relative path: report "not found" instead of throwing
+                return { nullptr, {} };
 
-            const IFileArchive::SFileList::SEntry itemToFind = { relative };
-            auto found = std::lower_bound(items.begin(), items.end(), itemToFind);
-            if (found!=items.end() && found->pathRelativeToArchive==relative)
-                return {archive.second.get(),relative};
+            const auto archives = m_cachedArchiveFiles.findRange(archivePath);
+            for (auto& archive : archives)
+            {
+                const auto items = static_cast<IFileArchive::SFileList::range_t>(archive.second->listAssets());
+                const IFileArchive::SFileList::SEntry itemToFind = { relative };
+                auto found = std::lower_bound(items.begin(), items.end(), itemToFind); // binary search; presumably the asset list is sorted - TODO confirm
+                if (found != items.end() && found->pathRelativeToArchive == relative)
+                    return { archive.second.get(), relative };
+            }
+            return { nullptr, {} };
+        };
+
+        if (auto found = tryMatchSingle(normalizedAbsolutePath); found.archive) // the input spelling wins over the absolute fallback
+            return found;
+        if (hasAbsoluteFallback)
+            return tryMatchSingle(normalizedAbsoluteFallback);
+        return { nullptr, {} };
+    };
+
+    system::path path = normalizedAbsolutePath.parent_path().lexically_normal();
+    while (!path.empty() && path.parent_path() != path) // walk up the directory tree; stops before a path that is its own parent
+    {
+        if (auto found = tryMatchAtPath(path); found.archive) // lexical spelling first
+            return found;
+
+        fsEc.clear();
+        if (std::filesystem::exists(path, fsEc) && !fsEc)
+        {
+            fsEc.clear();
+            const auto canonicalPath = std::filesystem::canonical(path, fsEc);
+            if (!fsEc && canonicalPath != path) // retry with the canonical spelling only when it differs
+            {
+                if (auto found = tryMatchAtPath(canonicalPath); found.archive)
+                    return found;
+            }
         }
+
         path = path.parent_path();
     }
-    return { nullptr,{} };
+    return { nullptr, {} };
 }
 
 
@@ -394,4 +461,4 @@ bool ISystem::isDebuggerAttached()
 
     return false;
 }
-#endif
\ No newline at end of file
+#endif
diff --git a/src/nbl/system/SWin32PathUtilities.h b/src/nbl/system/SWin32PathUtilities.h
new file mode 100644
index 0000000000..0f2ae33f24
--- /dev/null
+++ b/src/nbl/system/SWin32PathUtilities.h
@@ -0,0 +1,43 @@
+// Internal src-only header. Do not include from public headers.
+#ifndef _NBL_SYSTEM_S_WIN32_PATH_UTILITIES_H_INCLUDED_
+#define _NBL_SYSTEM_S_WIN32_PATH_UTILITIES_H_INCLUDED_
+
+#ifdef _NBL_PLATFORM_WINDOWS_
+
+#include <filesystem>
+#include <string>
+#include <string_view>
+#include <system_error>
+
+namespace nbl::system::impl
+{
+
+inline std::wstring makeLongPathAwareWindowsPath(std::filesystem::path path) // absolute, backslash-separated wide path carrying the \\?\ (or \\?\UNC\) extended-length prefix
+{
+    path = path.lexically_normal(); // the extended prefix disables Win32 path normalization, so "." and ".." are folded here
+    if (!path.is_absolute())
+    {
+        std::error_code ec;
+        const auto absolutePath = std::filesystem::absolute(path, ec);
+        if (!ec)
+            path = absolutePath.lexically_normal();
+    }
+    path.make_preferred();
+
+    std::wstring native = path.native();
+    constexpr std::wstring_view ExtendedPrefix = LR"(\\?\)";
+    constexpr std::wstring_view DevicePrefix = LR"(\\.\)";
+    constexpr std::wstring_view UncPrefix = LR"(\\)";
+    constexpr std::wstring_view ExtendedUncPrefix = LR"(\\?\UNC\)";
+    if (native.starts_with(ExtendedPrefix) || native.starts_with(DevicePrefix) || !path.is_absolute()) // already extended, a device path, or still relative because absolute() failed: a prefix would make the name invalid
+        return native;
+    if (native.starts_with(UncPrefix)) // \\server\share\... becomes \\?\UNC\server\share\...
+        return std::wstring(ExtendedUncPrefix) + native.substr(2u);
+    return std::wstring(ExtendedPrefix) + native;
+}
+
+}
+
+#endif
+
+#endif
diff --git a/src/nbl/video/ILogicalDevice.cpp b/src/nbl/video/ILogicalDevice.cpp
index a98deff5c7..c24f7c1950 100644
--- a/src/nbl/video/ILogicalDevice.cpp
+++ b/src/nbl/video/ILogicalDevice.cpp
@@ -1,6 +1,6 @@
 #include "nbl/video/IPhysicalDevice.h"
 
-#include "git_info.h"
+#include "nbl/git/info.h"
 #define NBL_LOG_FUNCTION m_logger.log
 #include "nbl/logging_macros.h"
 
@@ -1147,4 +1147,4 @@ bool ILogicalDevice::createRayTracingPipelines(IGPUPipelineCache* const pipeline
     }
     return retval;
 }
-#include "nbl/undef_logging_macros.h"
\ No newline at end of file
+#include "nbl/undef_logging_macros.h"
diff --git a/src/nbl/video/IQueue.cpp b/src/nbl/video/IQueue.cpp
index 70acecffca..4e24a0e7e9 100644
--- a/src/nbl/video/IQueue.cpp
+++ b/src/nbl/video/IQueue.cpp
@@ -3,7 +3,7 @@
 #include "nbl/video/ILogicalDevice.h"
 #include "nbl/video/TimelineEventHandlers.h"
 
-#include "git_info.h"
+#include "nbl/git/info.h"
 #define NBL_LOG_FUNCTION logger->log
 #include "nbl/logging_macros.h"
 
@@ -245,4 +245,4 @@ void IQueue::DeferredSubmitCallback::operator()()
 
 } // namespace nbl::video
 
-#include "nbl/undef_logging_macros.h"
\ No newline at end of file
+#include "nbl/undef_logging_macros.h"
diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp
index d7f2d7dbbc..4be0913ebe 100644
--- a/src/nbl/video/utilities/CAssetConverter.cpp
+++ b/src/nbl/video/utilities/CAssetConverter.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2024-2024 - DevSH Graphics Programming Sp. z O.O.
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 #include "nbl/video/utilities/CAssetConverter.h"
 
@@ -1183,7 +1183,10 @@ bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_t<ICPUBuffer> loo
 	auto patchedParams = lookup.asset->getCreationParams();
 	assert(lookup.patch->usage.hasFlags(patchedParams.usage));
 	patchedParams.usage = lookup.patch->usage;
-	hasher.update(&patchedParams,sizeof(patchedParams)) << lookup.asset->getContentHash();
+	const auto contentHash = lookup.asset->getContentHash();
+	if (contentHash==NoContentHash)
+		return false;
+	hasher.update(&patchedParams,sizeof(patchedParams)) << contentHash;
 	return true;
 }
 bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_t<ICPUBottomLevelAccelerationStructure> lookup)
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
index 57f66ad44b..8d78ea75db 100644
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@@ -1,8 +1,9 @@
 add_subdirectory(nsc)
 add_subdirectory(xxHash256)
+add_subdirectory(hcp)
 
 if(NBL_BUILD_IMGUI)
 	add_subdirectory(nite EXCLUDE_FROM_ALL)
 endif()
 
-NBL_ADJUST_FOLDERS(tools)
\ No newline at end of file
+NBL_ADJUST_FOLDERS(tools)
diff --git a/tools/hcp/CMakeLists.txt b/tools/hcp/CMakeLists.txt
new file mode 100644
index 0000000000..456b0f3e1b
--- /dev/null
+++ b/tools/hcp/CMakeLists.txt
@@ -0,0 +1,31 @@
+nbl_create_executable_project("" "" "" "")
+add_dependencies(${EXECUTABLE_NAME} argparse)
+target_include_directories(${EXECUTABLE_NAME} PRIVATE $<TARGET_PROPERTY:argparse,INTERFACE_INCLUDE_DIRECTORIES>) # argparse headers come from the target's interface include directories
+
+enable_testing()
+
+set(NBL_HCP_CI_ARGS
+	--buffer-bytes 67108864 # 64 MiB
+	--seed 12345
+)
+
+function(nbl_hcp_add_ci_test mode) # registers one CTest case named NBL_HCP_<MODE> that runs the tool with --runtime-tuning <mode>
+	string(TOUPPER "${mode}" mode_upper)
+	add_test(NAME "NBL_HCP_${mode_upper}"
+		COMMAND "$<TARGET_FILE:${EXECUTABLE_NAME}>" --runtime-tuning "${mode}" ${NBL_HCP_CI_ARGS}
+		WORKING_DIRECTORY "$<TARGET_FILE_DIR:${EXECUTABLE_NAME}>"
+		COMMAND_EXPAND_LISTS
+	)
+endfunction()
+
+nbl_hcp_add_ci_test(sequential)
+nbl_hcp_add_ci_test(heuristic)
+nbl_hcp_add_ci_test(hybrid)
+
+set_tests_properties( # the same labels are attached to all three tests
+	NBL_HCP_SEQUENTIAL
+	NBL_HCP_HEURISTIC
+	NBL_HCP_HYBRID
+	PROPERTIES
+		LABELS "hash;ci"
+)
diff --git a/tools/hcp/README.md b/tools/hcp/README.md
new file mode 100644
index 0000000000..e1a11fffb7
--- /dev/null
+++ b/tools/hcp/README.md
@@ -0,0 +1,22 @@
+# hcp
+
+Headless parity checker for polygon geometry content hashing.
+
+## What it checks
+- input geometry buffers are generated as deterministic dummy blobs from `--seed`
+- `recompute(..., sequential)` as baseline
+- `recompute(..., <selected mode>)` equals baseline hash
+- `computeMissing(..., <selected mode>)` preserves pre-set hashes and equals baseline hash
+- confirms `BLAKE3` content hashing parity independent of runtime tuning mode
+- timing logs for baseline, recompute and computeMissing
+
+## Args
+- `--runtime-tuning <sequential|heuristic|hybrid>` (alias: `none` -> `sequential`, default: `heuristic`)
+- `--buffer-bytes <N>` (minimum: `2097152`)
+- `--seed <U64>` (deterministic payload seed)
+
+## Example
+`./hcp_d.exe --runtime-tuning heuristic --buffer-bytes 67108864 --seed 12345`
+
+## CTest
+`ctest --output-on-failure -C Debug -R NBL_HCP`
diff --git a/tools/hcp/main.cpp b/tools/hcp/main.cpp
new file mode 100644
index 0000000000..afc8373280
--- /dev/null
+++ b/tools/hcp/main.cpp
@@ -0,0 +1,359 @@
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#include "nabla.h"
+#include "nbl/system/IApplicationFramework.h"
+#include "nbl/system/CStdoutLogger.h"
+
+#include "nbl/asset/interchange/SFileIOPolicy.h"
+#include "nbl/asset/interchange/SGeometryContentHash.h"
+#include "nbl/core/hash/blake.h"
+#include "argparse/argparse.hpp"
+
+#include <chrono>
+#include <cstdint>
+#include <limits>
+#include <optional>
+#include <string>
+#include <vector>
+
+using namespace nbl;
+using namespace nbl::asset;
+using namespace nbl::system;
+
+constexpr size_t kMinBufferBytes = 2ull * 1024ull * 1024ull;
+constexpr uint64_t kDefaultSeed = 0x6a09e667f3bcc909ull;
+
+enum class RuntimeMode : uint8_t // tool-side mode selector; makePolicy maps it onto SFileIOPolicy::SRuntimeTuning::Mode
+{
+    Sequential,
+    Heuristic,
+    Hybrid
+};
+
+struct Options // command-line settings produced by parseOptions
+{
+    RuntimeMode mode = RuntimeMode::Heuristic;
+    size_t bufferBytes = kMinBufferBytes; // the default equals the minimum that parseOptions accepts
+    uint64_t seed = kDefaultSeed;
+};
+
+static const char* modeName(RuntimeMode mode) // printable name for `mode`; the strings match the --runtime-tuning values parseOptions accepts
+{
+    if (mode == RuntimeMode::Sequential)
+        return "sequential";
+    if (mode == RuntimeMode::Hybrid)
+        return "hybrid";
+    return "heuristic"; // Heuristic and any other value end up here
+}
+
+static SFileIOPolicy makePolicy(RuntimeMode mode) // value-initialized policy where only runtimeTuning.mode is set from `mode`
+{
+    SFileIOPolicy policy = {};
+    if (mode == RuntimeMode::Sequential)
+        policy.runtimeTuning.mode = SFileIOPolicy::SRuntimeTuning::Mode::Sequential;
+    else if (mode == RuntimeMode::Hybrid)
+        policy.runtimeTuning.mode = SFileIOPolicy::SRuntimeTuning::Mode::Hybrid;
+    else // Heuristic and any other value
+        policy.runtimeTuning.mode = SFileIOPolicy::SRuntimeTuning::Mode::Heuristic;
+    return policy;
+}
+
+static uint64_t nextRand(uint64_t& state) // xorshift64* step: advances `state` in place and returns the multiplied output
+{
+    state ^= state >> 12u;
+    state ^= state << 25u;
+    state ^= state >> 27u;
+    return state * 2685821657736338717ull; // output scrambling multiply; a zero state stays zero, so callers must seed non-zero
+}
+
+static std::vector<uint8_t> makeRandomBytes(const size_t byteCount, const uint64_t seed, const uint64_t stream) // deterministic pseudo-random blob of `byteCount` bytes for a (seed, stream) pair
+{
+    std::vector<uint8_t> data(byteCount);
+    uint64_t state = seed ^ (stream * 0x9e3779b97f4a7c15ull); // fold the stream index into the seed so each stream gets its own starting state
+    if (state == 0ull) // nextRand cannot leave the all-zero state
+        state = kDefaultSeed ^ stream;
+    for (auto& byte : data)
+        byte = static_cast<uint8_t>(nextRand(state) & 0xffull); // one generator step per byte, low 8 bits kept
+    return data;
+}
+
+static std::optional<Options> parseOptions(const core::vector<std::string>& args)
+{
+    // Parses the command line; std::nullopt on any unknown, malformed or out-of-range argument.
+    argparse::ArgumentParser parser("hcp");
+    parser.add_argument("--runtime-tuning").default_value(std::string("heuristic"));
+    parser.add_argument("--buffer-bytes").default_value(std::to_string(kMinBufferBytes));
+    parser.add_argument("--seed").default_value(std::to_string(kDefaultSeed));
+
+    try
+    {
+        parser.parse_args({ args.data(), args.data() + args.size() });
+    }
+    catch (const std::exception&)
+    {
+        return std::nullopt;
+    }
+
+    // std::stoull alone accepts a leading '-' (the value wraps), leading whitespace and
+    // trailing garbage ("12abc" -> 12), so only plain decimal digit strings are let through.
+    auto parseU64 = [](const std::string& v) -> std::optional<uint64_t>
+    {
+        if (v.empty() || v.find_first_not_of("0123456789") != std::string::npos)
+            return std::nullopt;
+        try { return std::stoull(v, nullptr, 10); } catch (...) { return std::nullopt; } // still throws std::out_of_range for too many digits
+    };
+    auto parseSize = [&parseU64](const std::string& v) -> std::optional<size_t>
+    {
+        const auto parsed = parseU64(v);
+        // reject values that do not fit size_t (only possible where size_t is narrower than 64 bits)
+        if (!parsed.has_value() || *parsed > std::numeric_limits<size_t>::max())
+            return std::nullopt;
+        return static_cast<size_t>(*parsed);
+    };
+
+    Options options = {};
+    // "none" is accepted as an alias for the sequential mode
+    const auto mode = parser.get<std::string>("--runtime-tuning");
+    if (mode == "sequential" || mode == "none")
+        options.mode = RuntimeMode::Sequential;
+    else if (mode == "heuristic")
+        options.mode = RuntimeMode::Heuristic;
+    else if (mode == "hybrid")
+        options.mode = RuntimeMode::Hybrid;
+    else
+        return std::nullopt;
+
+    const auto bytes = parseSize(parser.get<std::string>("--buffer-bytes"));
+    const auto seed = parseU64(parser.get<std::string>("--seed"));
+    if (!bytes.has_value() || !seed.has_value() || *bytes < kMinBufferBytes)
+        return std::nullopt;
+
+    options.bufferBytes = *bytes;
+    options.seed = *seed;
+    return options;
+}
+
+static core::smart_refctd_ptr<ICPUPolygonGeometry> createGeometry(const Options& options) // builds a triangle-list geometry whose buffers are seeded pseudo-random blobs; nullptr if any buffer creation fails
+{
+    constexpr E_FORMAT positionFormat = EF_R32G32B32_SFLOAT;
+    constexpr E_FORMAT normalFormat = EF_R32G32B32_SFLOAT;
+    constexpr E_FORMAT indexFormat = EF_R32_UINT;
+    constexpr E_FORMAT colorFormat = EF_R8G8B8A8_UNORM;
+
+    const uint32_t positionStride = getTexelOrBlockBytesize(positionFormat);
+    const uint32_t normalStride = getTexelOrBlockBytesize(normalFormat);
+    const uint32_t indexStride = getTexelOrBlockBytesize(indexFormat);
+    const uint32_t colorStride = getTexelOrBlockBytesize(colorFormat);
+    const auto alignDown = [&](uint32_t stride) -> size_t { return options.bufferBytes - (options.bufferBytes % stride); }; // largest multiple of `stride` not above the requested size
+
+    auto makeBuffer = [&](size_t bytes, core::bitflag<IBuffer::E_USAGE_FLAGS> usage, uint64_t stream) -> core::smart_refctd_ptr<ICPUBuffer>
+    {
+        auto data = makeRandomBytes(bytes, options.seed, stream);
+
+        ICPUBuffer::SCreationParams params = {};
+        params.size = data.size();
+        params.usage = usage;
+        params.data = data.data(); // NOTE(review): `data` dies when this lambda returns; presumably ICPUBuffer::create copies from params.data - confirm
+        return ICPUBuffer::create(std::move(params));
+    };
+
+    auto makeView = [](const core::smart_refctd_ptr<ICPUBuffer>& buffer, E_FORMAT format, uint32_t stride) -> ICPUPolygonGeometry::SDataView // view spanning the whole buffer from offset 0
+    {
+        ICPUPolygonGeometry::SDataView view = {};
+        view.composed.format = format;
+        view.composed.stride = stride;
+        view.composed.rangeFormat = IGeometryBase::getMatchingAABBFormat(format);
+        view.composed.resetRange();
+        view.src.offset = 0ull;
+        view.src.size = buffer ? buffer->getSize() : 0ull; // a null buffer yields an empty view
+        view.src.buffer = buffer;
+        return view;
+    };
+
+    auto positionBuffer = makeBuffer(alignDown(positionStride), IBuffer::EUF_VERTEX_BUFFER_BIT, 1ull); // the last argument is the random stream index, distinct per buffer
+    auto normalBuffer = makeBuffer(alignDown(normalStride), IBuffer::EUF_VERTEX_BUFFER_BIT, 2ull);
+    auto indexBuffer = makeBuffer(alignDown(indexStride), IBuffer::EUF_INDEX_BUFFER_BIT, 3ull);
+    auto colorBuffer = makeBuffer(alignDown(colorStride), IBuffer::EUF_VERTEX_BUFFER_BIT, 4ull);
+    if (!positionBuffer || !normalBuffer || !indexBuffer || !colorBuffer)
+        return nullptr;
+
+    auto geometry = core::make_smart_refctd_ptr<ICPUPolygonGeometry>();
+    geometry->setIndexing(IPolygonGeometryBase::TriangleList());
+    geometry->setPositionView(makeView(positionBuffer, positionFormat, positionStride));
+    geometry->setNormalView(makeView(normalBuffer, normalFormat, normalStride));
+    geometry->setIndexView(makeView(indexBuffer, indexFormat, indexStride));
+    geometry->getAuxAttributeViews()->push_back(makeView(colorBuffer, colorFormat, colorStride));
+    geometry->getAuxAttributeViews()->push_back(makeView(colorBuffer, colorFormat, colorStride)); // same color buffer registered a second time; presumably to cover a buffer shared by several views - confirm
+    return geometry;
+}
+
+// Hashes one random byte payload three ways -- sequential BLAKE3, blake3_hash_buffer, and
+// ICPUBuffer::computeContentHash -- and requires the latter two digests to match the sequential one.
+// Logs the failure reason and returns false on the first problem; otherwise logs timings and returns true.
+static bool runStandaloneBufferParityCheck(const Options& options, ILogger* logger)
+{
+    using hr_clock = std::chrono::high_resolution_clock;
+    const auto elapsedMs = [](hr_clock::time_point since) { return std::chrono::duration<double, std::milli>(hr_clock::now() - since).count(); };
+    const auto mebibytes = [](size_t byteCount) { return static_cast<double>(byteCount) / (1024.0 * 1024.0); };
+    const auto mibPerSecond = [&](size_t byteCount, double ms) { return ms > 0.0 ? mebibytes(byteCount) * 1000.0 / ms : 0.0; };
+    const auto fail = [logger](const char* reason) { logger->log(reason, ILogger::ELL_ERROR); return false; };
+
+    // Payload comes from makeRandomBytes, fed with options.seed and a fixed stream id.
+    auto payload = makeRandomBytes(options.bufferBytes, options.seed, 0x11ull);
+    ICPUBuffer::SCreationParams creation = {};
+    creation.size = payload.size();
+    creation.usage = IBuffer::EUF_TRANSFER_SRC_BIT;
+    creation.data = payload.data();
+    auto cpuBuffer = ICPUBuffer::create(std::move(creation));
+    if (!cpuBuffer)
+        return fail("Failed to create standalone buffer.");
+
+    // Reference digest and timing: the sequential BLAKE3 routine over the raw payload.
+    const auto referenceBegin = hr_clock::now();
+    const auto referenceHash = core::blake3_hash_buffer_sequential(payload.data(), payload.size());
+    const double referenceMs = elapsedMs(referenceBegin);
+
+    // The same bytes through blake3_hash_buffer.
+    const auto directBegin = hr_clock::now();
+    const auto directHash = core::blake3_hash_buffer(payload.data(), payload.size());
+    const double directMs = elapsedMs(directBegin);
+    if (directHash != referenceHash)
+        return fail("Direct BLAKE3 hash mismatch.");
+
+    // The hash as computed by the ICPUBuffer that was created from the payload.
+    const auto apiBegin = hr_clock::now();
+    const auto apiHash = cpuBuffer->computeContentHash();
+    const double apiMs = elapsedMs(apiBegin);
+    if (apiHash != referenceHash)
+        return fail("ICPUBuffer::computeContentHash mismatch.");
+
+    const size_t byteCount = payload.size();
+    logger->log("HCP single-buffer bytes=%llu mib=%.3f", ILogger::ELL_INFO, static_cast<unsigned long long>(byteCount), mebibytes(byteCount));
+    logger->log("HCP single-buffer legacy ms=%.3f mib_s=%.3f", ILogger::ELL_INFO, referenceMs, mibPerSecond(byteCount, referenceMs));
+    logger->log("HCP single-buffer direct ms=%.3f mib_s=%.3f", ILogger::ELL_INFO, directMs, mibPerSecond(byteCount, directMs));
+    logger->log("HCP single-buffer api ms=%.3f mib_s=%.3f", ILogger::ELL_INFO, apiMs, mibPerSecond(byteCount, apiMs));
+    return true;
+}
+
+// Creates the dummy geometry, hashes it with the Sequential policy as a baseline, and then requires the
+// same digest from recompute() under options.mode and from computeMissing() with one buffer hash pre-set.
+// Logs the failure reason and returns false on the first problem; otherwise logs timings and returns true.
+static bool runGeometryParityCheck(const Options& options, ILogger* logger)
+{
+    using hr_clock = std::chrono::high_resolution_clock;
+    const auto elapsedMs = [](hr_clock::time_point since)
+    {
+        return std::chrono::duration<double, std::milli>(hr_clock::now() - since).count();
+    };
+    const auto mebibytes = [](size_t byteCount) { return static_cast<double>(byteCount) / (1024.0 * 1024.0); };
+    const auto mibPerSecond = [&](size_t byteCount, double ms) { return ms > 0.0 ? mebibytes(byteCount) * 1000.0 / ms : 0.0; };
+    const auto fail = [logger](const char* reason) { logger->log(reason, ILogger::ELL_ERROR); return false; };
+
+    auto geometry = createGeometry(options);
+    if (!geometry)
+        return fail("Failed to create dummy geometry.");
+    auto* const geo = geometry.get();
+
+    core::vector<core::smart_refctd_ptr<ICPUBuffer>> collected;
+    SPolygonGeometryContentHash::collectBuffers(geo, collected);
+    if (collected.empty())
+        return fail("No buffers collected from geometry.");
+
+    // Total size of every collected buffer; null entries contribute nothing.
+    size_t totalBytes = 0ull;
+    for (const auto& entry : collected)
+    {
+        if (entry)
+            totalBytes += entry->getSize();
+    }
+    if (totalBytes == 0ull)
+        return fail("Collected zero-sized buffers.");
+
+    // Baseline digest and timing: reset(), then recompute() with the Sequential policy.
+    const auto sequentialPolicy = makePolicy(RuntimeMode::Sequential);
+    SPolygonGeometryContentHash::reset(geo);
+    const auto baselineBegin = hr_clock::now();
+    const auto baselineHash = SPolygonGeometryContentHash::recompute(geo, sequentialPolicy);
+    const double baselineMs = elapsedMs(baselineBegin);
+
+    // Full recompute again, this time with the policy built from options.mode.
+    SPolygonGeometryContentHash::reset(geo);
+    const auto recomputeBegin = hr_clock::now();
+    const auto recomputeHash = SPolygonGeometryContentHash::recompute(geo, makePolicy(options.mode));
+    const double recomputeMs = elapsedMs(recomputeBegin);
+    if (recomputeHash != baselineHash)
+        return fail("recompute hash mismatch.");
+
+    auto& first = collected.front();
+    if (!first)
+        return fail("First geometry buffer is null.");
+
+    // Carry the first buffer's hash across reset() so that it is already set when computeMissing() runs;
+    // missingBytes is the total minus that buffer's size and is what the throughput figure is based on.
+    const auto keptHash = first->getContentHash();
+    const size_t missingBytes = totalBytes - first->getSize();
+    SPolygonGeometryContentHash::reset(geo);
+    first->setContentHash(keptHash);
+    const auto missingBegin = hr_clock::now();
+    const auto missingHash = SPolygonGeometryContentHash::computeMissing(geo, makePolicy(options.mode));
+    const double missingMs = elapsedMs(missingBegin);
+    // computeMissing() must leave the pre-set hash untouched and still produce the baseline digest.
+    if (first->getContentHash() != keptHash)
+        return fail("computeMissing overwrote pre-set hash.");
+    if (missingHash != baselineHash)
+        return fail("computeMissing hash mismatch.");
+
+    logger->log("HCP mode=%s buffers=%llu total_mib=%.3f", ILogger::ELL_INFO,
+        modeName(options.mode), static_cast<unsigned long long>(collected.size()), mebibytes(totalBytes));
+    logger->log("HCP legacy ms=%.3f mib_s=%.3f", ILogger::ELL_INFO, baselineMs, mibPerSecond(totalBytes, baselineMs));
+    logger->log("HCP recompute ms=%.3f mib_s=%.3f", ILogger::ELL_INFO, recomputeMs, mibPerSecond(totalBytes, recomputeMs));
+    logger->log("HCP computeMissing ms=%.3f mib_s=%.3f missing_mib=%.3f", ILogger::ELL_INFO,
+        missingMs, mibPerSecond(missingBytes, missingMs), mebibytes(missingBytes));
+    return true;
+}
+
+// Runs the standalone-buffer check and then, only if it passed, the geometry check.
+static bool runParityCheck(const Options& options, ILogger* logger)
+{
+    const bool standaloneOk = runStandaloneBufferParityCheck(options, logger);
+    return standaloneOk && runGeometryParityCheck(options, logger);
+}
+
+// Application class for the parity tool: all work happens in onAppInitialized(); workLoopBody() is empty and keepRunning() returns false.
+class HashContentParityApp final : public IApplicationFramework
+{
+public:
+    using IApplicationFramework::IApplicationFramework;
+
+    // Creates the stdout logger, parses argv and runs the parity checks (the ISystem argument is unused); true only if every step succeeds.
+    bool onAppInitialized(core::smart_refctd_ptr<ISystem>&&) override
+    {
+        m_logger = core::make_smart_refctd_ptr<CStdoutLogger>(ILogger::DefaultLogMask());
+        if (!isAPILoaded())
+        {
+            m_logger->log("Could not load Nabla API.", ILogger::ELL_ERROR);
+            return false;
+        }
+        const auto parsed = parseOptions(argv);
+        if (parsed.has_value())
+        {
+            const bool passed = runParityCheck(*parsed, m_logger.get());
+            if (passed)
+                m_logger->log("OK", ILogger::ELL_INFO);
+            return passed;
+        }
+        m_logger->log("Usage: hcp [--runtime-tuning sequential|heuristic|hybrid] [--buffer-bytes N] [--seed U64]", ILogger::ELL_ERROR);
+        m_logger->log("Constraint: --buffer-bytes must be >= %llu", ILogger::ELL_ERROR, static_cast<unsigned long long>(kMinBufferBytes));
+        return false;
+    }
+
+    void workLoopBody() override {}
+    bool keepRunning() override { return false; }
+
+private:
+    core::smart_refctd_ptr<ILogger> m_logger;
+};
+
+NBL_MAIN_FUNC(HashContentParityApp) // NOTE(review): presumably expands to the program entry point for HashContentParityApp -- confirm against the macro definition
diff --git a/tools/nsc/main.cpp b/tools/nsc/main.cpp
index 203aa6ce8c..9745a17299 100644
--- a/tools/nsc/main.cpp
+++ b/tools/nsc/main.cpp
@@ -1,4 +1,5 @@
 #include "nabla.h"
+#include "nbl/gtml/SJsonFormatter.h"
 #include "nbl/system/IApplicationFramework.h"
 #include <iostream>
 #include <cstdlib>
@@ -418,27 +419,8 @@ class ShaderCompiler final : public IApplicationFramework
     {
         ::json j;
         auto& modules = j["modules"];
-
-        auto serialize = [&](const gtml::GitInfo& info, std::string_view target)
-        {
-            auto& s = modules[target.data()];
-            s["isPopulated"] = info.isPopulated;
-            s["hasUncommittedChanges"] = info.hasUncommittedChanges.has_value() ? ::json(info.hasUncommittedChanges.value()) : ::json("UNKNOWN, BUILT WITHOUT DIRTY-CHANGES CAPTURE");
-            s["commitAuthorName"] = info.commitAuthorName;
-            s["commitAuthorEmail"] = info.commitAuthorEmail;
-            s["commitHash"] = info.commitHash;
-            s["commitShortHash"] = info.commitShortHash;
-            s["commitDate"] = info.commitDate;
-            s["commitSubject"] = info.commitSubject;
-            s["commitBody"] = info.commitBody;
-            s["describe"] = info.describe;
-            s["branchName"] = info.branchName;
-            s["latestTag"] = info.latestTag;
-            s["latestTagName"] = info.latestTagName;
-        };
-
-        serialize(gtml::nabla_git_info, "nabla");
-        serialize(gtml::dxc_git_info, "dxc");
+        modules["nabla"] = ::json::parse(::gtml::SJsonFormatter::toString(nbl::gtml::nabla_git_info));
+        modules["dxc"] = ::json::parse(::gtml::SJsonFormatter::toString(nbl::gtml::dxc_git_info));
 
         const auto pretty = j.dump(4);
         std::cout << pretty << std::endl;