diff --git a/Components/MeshLodGenerator/include/OgreLod0Stripifier.h b/Components/MeshLodGenerator/include/OgreLod0Stripifier.h index 43c88d6e6fd..4992664cf8d 100644 --- a/Components/MeshLodGenerator/include/OgreLod0Stripifier.h +++ b/Components/MeshLodGenerator/include/OgreLod0Stripifier.h @@ -207,7 +207,6 @@ namespace Ogre vertexData->vertexStart = 0; vertexData->vertexCount = remapInfo.usedCount; - vertexData->hardwareShadowVolWBuffer = HardwareVertexBufferSharedPtr(); // TODO: check this vertexData->hwAnimationDataList.clear(); // TODO: check this vertexData->hwAnimDataItemsUsed = 0; // TODO: check this } diff --git a/Media/Main/ShadowExtrudeDirLight.vert b/Media/Main/ShadowExtrudeDirLight.vert index 60809865c14..effcd0e3318 100644 --- a/Media/Main/ShadowExtrudeDirLight.vert +++ b/Media/Main/ShadowExtrudeDirLight.vert @@ -5,14 +5,13 @@ uniform mat4 worldviewproj_matrix; uniform vec4 light_position_object_space; // homogenous, object space MAIN_PARAMETERS -IN(vec4 uv0, TEXCOORD0) IN(vec4 position, POSITION) MAIN_DECLARATION { // Extrusion in object space // Vertex unmodified if w==1, extruded if w==0 vec4 newpos = - (uv0.xxxx * (position + light_position_object_space)) - light_position_object_space; + (position.wwww * (position + light_position_object_space)) - light_position_object_space; gl_Position = mul(worldviewproj_matrix, newpos); } \ No newline at end of file diff --git a/Media/Main/ShadowExtrudeDirLightFinite.vert b/Media/Main/ShadowExtrudeDirLightFinite.vert index 16f758c74e6..5861eabdee1 100644 --- a/Media/Main/ShadowExtrudeDirLightFinite.vert +++ b/Media/Main/ShadowExtrudeDirLightFinite.vert @@ -6,7 +6,6 @@ uniform vec4 light_position_object_space; // homogenous, object space uniform float shadow_extrusion_distance; // how far to extrude MAIN_PARAMETERS -IN(vec4 uv0, TEXCOORD0) IN(vec4 position, POSITION) MAIN_DECLARATION { @@ -16,7 +15,7 @@ MAIN_DECLARATION extrusionDir = normalize(extrusionDir); vec4 newpos = vec4(position.xyz + - ((1.0 - uv0.x) * 
shadow_extrusion_distance * extrusionDir), 1.0); + ((1.0 - position.w) * shadow_extrusion_distance * extrusionDir), 1.0); gl_Position = mul(worldviewproj_matrix, newpos); } \ No newline at end of file diff --git a/Media/Main/ShadowExtrudePointLight.vert b/Media/Main/ShadowExtrudePointLight.vert index 9f4ddf4d035..e79948eb5df 100644 --- a/Media/Main/ShadowExtrudePointLight.vert +++ b/Media/Main/ShadowExtrudePointLight.vert @@ -5,14 +5,13 @@ uniform mat4 worldviewproj_matrix; uniform vec4 light_position_object_space; // homogenous, object space MAIN_PARAMETERS -IN(vec4 uv0, TEXCOORD0) IN(vec4 position, POSITION) MAIN_DECLARATION { // Extrusion in object space // Vertex unmodified if w==1, extruded if w==0 vec4 newpos = - (uv0.xxxx * light_position_object_space) + + (position.wwww * light_position_object_space) + vec4(position.xyz - light_position_object_space.xyz, 0.0); gl_Position = mul(worldviewproj_matrix, newpos); diff --git a/Media/Main/ShadowExtrudePointLightFinite.vert b/Media/Main/ShadowExtrudePointLightFinite.vert index 1e7134b49b8..c4cf490a5f8 100644 --- a/Media/Main/ShadowExtrudePointLightFinite.vert +++ b/Media/Main/ShadowExtrudePointLightFinite.vert @@ -6,7 +6,6 @@ uniform vec4 light_position_object_space; // homogenous, object space uniform float shadow_extrusion_distance; // how far to extrude MAIN_PARAMETERS -IN(vec4 uv0, TEXCOORD0) IN(vec4 position, POSITION) MAIN_DECLARATION { @@ -16,7 +15,7 @@ MAIN_DECLARATION extrusionDir = normalize(extrusionDir); vec4 newpos = vec4(position.xyz + - ((1.0 - uv0.x) * shadow_extrusion_distance * extrusionDir), 1.0); + ((1.0 - position.w) * shadow_extrusion_distance * extrusionDir), 1.0); gl_Position = mul(worldviewproj_matrix, newpos); } \ No newline at end of file diff --git a/OgreMain/include/OgreEntity.h b/OgreMain/include/OgreEntity.h index a621b706db8..653b5d768e2 100644 --- a/OgreMain/include/OgreEntity.h +++ b/OgreMain/include/OgreEntity.h @@ -324,8 +324,6 @@ namespace Ogre { Entity* mParent; /// Shared link 
to position buffer. HardwareVertexBufferSharedPtr mPositionBuffer; - /// Shared link to w-coord buffer (optional). - HardwareVertexBufferSharedPtr mWBuffer; /// Link to current vertex data used to bind (maybe changes). const VertexData* mCurrentVertexData; /// Link to SubEntity, only present if SubEntity has it's own geometry. @@ -343,7 +341,6 @@ namespace Ogre { void _createSeparateLightCap(); void getWorldTransforms(Matrix4* xform) const override; HardwareVertexBufferSharedPtr getPositionBuffer(void) { return mPositionBuffer; } - HardwareVertexBufferSharedPtr getWBuffer(void) { return mWBuffer; } /// Rebind the source positions (for temp buffer users). void rebindPositionBuffer(const VertexData* vertexData, bool force); bool isVisible(void) const override; diff --git a/OgreMain/include/OgreManualObject.h b/OgreMain/include/OgreManualObject.h index d1f7fd14ac8..399f5c113f8 100644 --- a/OgreMain/include/OgreManualObject.h +++ b/OgreMain/include/OgreManualObject.h @@ -623,8 +623,6 @@ namespace Ogre ManualObject* mParent; // Shared link to position buffer HardwareVertexBufferSharedPtr mPositionBuffer; - // Shared link to w-coord buffer (optional) - HardwareVertexBufferSharedPtr mWBuffer; public: ManualObjectSectionShadowRenderable(ManualObject* parent, @@ -633,7 +631,6 @@ namespace Ogre ~ManualObjectSectionShadowRenderable(); void getWorldTransforms(Matrix4* xform) const override; HardwareVertexBufferSharedPtr getPositionBuffer(void) { return mPositionBuffer; } - HardwareVertexBufferSharedPtr getWBuffer(void) { return mWBuffer; } virtual void rebindIndexBuffer(const HardwareIndexBufferSharedPtr& indexBuffer) override; diff --git a/OgreMain/include/OgreOptimisedUtil.h b/OgreMain/include/OgreOptimisedUtil.h index f01c502e5ff..5dad138a894 100644 --- a/OgreMain/include/OgreOptimisedUtil.h +++ b/OgreMain/include/OgreOptimisedUtil.h @@ -155,12 +155,14 @@ namespace Ogre { @param faceNormals The array of Vector4 used to store triangles face normal, Must be aligned to SIMD 
alignment. @param numTriangles Number of triangles to calculate face normal. + @param components Number of components in positions buffer. Must be 3 or 4 */ virtual void calculateFaceNormals( const float *positions, const EdgeData::Triangle *triangles, Vector4 *faceNormals, - size_t numTriangles) = 0; + size_t numTriangles, + int components = 3) = 0; /** Calculate the light facing state of the triangle's face normals @remarks diff --git a/OgreMain/include/OgreStaticGeometry.h b/OgreMain/include/OgreStaticGeometry.h index 5bee7e43d3a..367638ce72c 100644 --- a/OgreMain/include/OgreStaticGeometry.h +++ b/OgreMain/include/OgreStaticGeometry.h @@ -292,9 +292,6 @@ namespace Ogre { LODBucket* mParent; // Shared link to position buffer HardwareVertexBufferSharedPtr mPositionBuffer; - // Shared link to w-coord buffer (optional) - HardwareVertexBufferSharedPtr mWBuffer; - public: LODShadowRenderable(LODBucket* parent, HardwareIndexBufferSharedPtr* indexBuffer, const VertexData* vertexData, @@ -302,7 +299,6 @@ namespace Ogre { ~LODShadowRenderable(); void getWorldTransforms(Matrix4* xform) const override; HardwareVertexBufferSharedPtr getPositionBuffer(void) { return mPositionBuffer; } - HardwareVertexBufferSharedPtr getWBuffer(void) { return mWBuffer; } virtual void rebindIndexBuffer(const HardwareIndexBufferSharedPtr& indexBuffer) override; }; diff --git a/OgreMain/include/OgreVertexIndexData.h b/OgreMain/include/OgreVertexIndexData.h index 53d2514b691..763ab54da8c 100644 --- a/OgreMain/include/OgreVertexIndexData.h +++ b/OgreMain/include/OgreVertexIndexData.h @@ -144,22 +144,6 @@ namespace Ogre { */ void prepareForShadowVolume(void); - /** Additional shadow volume vertex buffer storage. - @remarks - This additional buffer is only used where we have prepared this VertexData for - use in shadow volume construction, and where the current render system supports - vertex programs. 
This buffer contains the 'w' vertex position component which will - be used by that program to differentiate between extruded and non-extruded vertices. - This 'w' component cannot be included in the original position buffer because - DirectX does not allow 4-component positions in the fixed-function pipeline, and the original - position buffer must still be usable for fixed-function rendering. - @par - Note that we don't store any vertex declaration or vertex buffer binding here because this - can be reused in the shadow algorithm. - */ - HardwareVertexBufferSharedPtr hardwareShadowVolWBuffer; - - /** Reorganises the data in the vertex buffers according to the new vertex declaration passed in. Note that new vertex buffers are created and written to, so if the buffers being referenced diff --git a/OgreMain/src/OgreEdgeListBuilder.cpp b/OgreMain/src/OgreEdgeListBuilder.cpp index ed8b6f23b81..5a5fb5f6615 100644 --- a/OgreMain/src/OgreEdgeListBuilder.cpp +++ b/OgreMain/src/OgreEdgeListBuilder.cpp @@ -414,11 +414,11 @@ namespace Ogre { void EdgeData::updateFaceNormals(size_t vertexSet, const HardwareVertexBufferSharedPtr& positionBuffer) { - assert (positionBuffer->getVertexSize() == sizeof(float) * 3 - && "Position buffer should contain only positions!"); + OgreAssert(positionBuffer->getVertexSize() == sizeof(float) * 4, + "Position buffer should contain only positions!"); // Triangle face normals should be 1:1 with triangles - assert(triangleFaceNormals.size() == triangles.size()); + OgreAssert(triangleFaceNormals.size() == triangles.size(), "size mismatch"); // Calculate triangles which are using this vertex set const EdgeData::EdgeGroup& eg = edgeGroups[vertexSet]; @@ -429,7 +429,8 @@ namespace Ogre { static_cast(positionsLock.pData), &triangles[eg.triStart], &triangleFaceNormals[eg.triStart], - eg.triCount); + eg.triCount, + 4); } } //--------------------------------------------------------------------- diff --git a/OgreMain/src/OgreEntity.cpp 
b/OgreMain/src/OgreEntity.cpp index ac7db3e6792..159532a9815 100644 --- a/OgreMain/src/OgreEntity.cpp +++ b/OgreMain/src/OgreEntity.cpp @@ -1987,8 +1987,10 @@ namespace Ogre { // Lock, we'll be locking the (suppressed hardware update) shadow buffer HardwareBufferLockGuard posLock(esrPositionBuffer, HardwareBuffer::HBL_NORMAL); float* pSrc = static_cast(posLock.pData); - float* pDest = pSrc + (egi->vertexData->vertexCount * 3); - memcpy(pDest, pSrc, sizeof(float) * 3 * egi->vertexData->vertexCount); + float* pDest = pSrc + (egi->vertexData->vertexCount * 4); + memcpy(pDest, pSrc, sizeof(float) * 4 * egi->vertexData->vertexCount); + for (size_t i = 0; i < egi->vertexData->vertexCount; i++) + pDest[i * 4 + 3] = 0; // second part needs w=0 } if (egi->vertexData == mMesh->sharedVertexData) { @@ -2121,18 +2123,11 @@ namespace Ogre { // Create vertex data which just references position component (and 2 component) mRenderOp.vertexData = OGRE_NEW VertexData(); // Map in position data - mRenderOp.vertexData->vertexDeclaration->addElement(0,0,VET_FLOAT3, VES_POSITION); + mRenderOp.vertexData->vertexDeclaration->addElement(0,0,VET_FLOAT4, VES_POSITION); mOriginalPosBufferBinding = vertexData->vertexDeclaration->findElementBySemantic(VES_POSITION)->getSource(); mPositionBuffer = vertexData->vertexBufferBinding->getBuffer(mOriginalPosBufferBinding); mRenderOp.vertexData->vertexBufferBinding->setBinding(0, mPositionBuffer); - // Map in w-coord buffer (if present) - if(vertexData->hardwareShadowVolWBuffer) - { - mRenderOp.vertexData->vertexDeclaration->addElement(1,0,VET_FLOAT1, VES_TEXTURE_COORDINATES, 0); - mWBuffer = vertexData->hardwareShadowVolWBuffer; - mRenderOp.vertexData->vertexBufferBinding->setBinding(1, mWBuffer); - } // Use same vertex start as input mRenderOp.vertexData->vertexStart = vertexData->vertexStart; diff --git a/OgreMain/src/OgreHardwareBufferManager.cpp b/OgreMain/src/OgreHardwareBufferManager.cpp index 1accfcf0fc1..35fe4184115 100644 --- 
a/OgreMain/src/OgreHardwareBufferManager.cpp +++ b/OgreMain/src/OgreHardwareBufferManager.cpp @@ -476,7 +476,7 @@ namespace Ogre { if (positions && !destPositionBuffer) { destPositionBuffer = srcPositionBuffer->getManager()->allocateVertexBufferCopy(srcPositionBuffer, - HardwareBufferManagerBase::BLT_AUTOMATIC_RELEASE, this); + HardwareBufferManagerBase::BLT_AUTOMATIC_RELEASE, this, true); // copy contents to keep W-coord for stencil shadows } if (normals && !posNormalShareBuffer && srcNormalBuffer && !destNormalBuffer) { diff --git a/OgreMain/src/OgreManualObject.cpp b/OgreMain/src/OgreManualObject.cpp index 81849026708..447d492a507 100644 --- a/OgreMain/src/OgreManualObject.cpp +++ b/OgreMain/src/OgreManualObject.cpp @@ -773,18 +773,11 @@ ManualObject::ManualObject(const String& name) // Create vertex data which just references position component (and 2 component) mRenderOp.vertexData = OGRE_NEW VertexData(); // Map in position data - mRenderOp.vertexData->vertexDeclaration->addElement(0,0,VET_FLOAT3, VES_POSITION); + mRenderOp.vertexData->vertexDeclaration->addElement(0,0,VET_FLOAT4, VES_POSITION); ushort origPosBind = vertexData->vertexDeclaration->findElementBySemantic(VES_POSITION)->getSource(); mPositionBuffer = vertexData->vertexBufferBinding->getBuffer(origPosBind); mRenderOp.vertexData->vertexBufferBinding->setBinding(0, mPositionBuffer); - // Map in w-coord buffer (if present) - if(vertexData->hardwareShadowVolWBuffer) - { - mRenderOp.vertexData->vertexDeclaration->addElement(1,0,VET_FLOAT1, VES_TEXTURE_COORDINATES, 0); - mWBuffer = vertexData->hardwareShadowVolWBuffer; - mRenderOp.vertexData->vertexBufferBinding->setBinding(1, mWBuffer); - } // Use same vertex start as input mRenderOp.vertexData->vertexStart = vertexData->vertexStart; diff --git a/OgreMain/src/OgreMesh.cpp b/OgreMain/src/OgreMesh.cpp index 56cfa50c8b8..25c6876bd67 100644 --- a/OgreMain/src/OgreMesh.cpp +++ b/OgreMain/src/OgreMesh.cpp @@ -1993,10 +1993,8 @@ namespace Ogre { // Lock 
destination buffers for writing - HardwareBufferLockGuard destPosLock(destPosBuf, - (destNormBuf != destPosBuf && destPosBuf->getVertexSize() == destElemPos->getSize()) || - (destNormBuf == destPosBuf && destPosBuf->getVertexSize() == destElemPos->getSize() + destElemNorm->getSize()) ? - HardwareBuffer::HBL_DISCARD : HardwareBuffer::HBL_NORMAL); + // HBL_NORMAL to keep W-coord for stencil shadows + HardwareBufferLockGuard destPosLock(destPosBuf, HardwareBuffer::HBL_NORMAL); destElemPos->baseVertexPointerToElement(destPosLock.pData, &pDestPos); HardwareBufferLockGuard destNormLock; if (includeNormals) diff --git a/OgreMain/src/OgreOptimisedUtil.cpp b/OgreMain/src/OgreOptimisedUtil.cpp index db6d62ddf52..90bbd8cb7e7 100644 --- a/OgreMain/src/OgreOptimisedUtil.cpp +++ b/OgreMain/src/OgreOptimisedUtil.cpp @@ -202,7 +202,8 @@ namespace Ogre { const float *positions, const EdgeData::Triangle *triangles, Vector4 *faceNormals, - size_t numTriangles) + size_t numTriangles, + int components) { static ProfileItems results; static size_t index; @@ -215,7 +216,8 @@ namespace Ogre { positions, triangles, faceNormals, - numTriangles); + numTriangles, + components); profile.end(); // diff --git a/OgreMain/src/OgreOptimisedUtilGeneral.cpp b/OgreMain/src/OgreOptimisedUtilGeneral.cpp index 390cade2bb6..1dcb579c047 100644 --- a/OgreMain/src/OgreOptimisedUtilGeneral.cpp +++ b/OgreMain/src/OgreOptimisedUtilGeneral.cpp @@ -75,7 +75,8 @@ namespace Ogre { const float *positions, const EdgeData::Triangle *triangles, Vector4 *faceNormals, - size_t numTriangles); + size_t numTriangles, + int components); /// @copydoc OptimisedUtil::calculateLightFacing virtual void calculateLightFacing( @@ -286,20 +287,21 @@ namespace Ogre { const float *positions, const EdgeData::Triangle *triangles, Vector4 *faceNormals, - size_t numTriangles) + size_t numTriangles, + int components) { for ( ; numTriangles; --numTriangles) { const EdgeData::Triangle& t = *triangles++; size_t offset; - offset = 
t.vertIndex[0] * 3; + offset = t.vertIndex[0] * components; Vector3 v1(positions[offset+0], positions[offset+1], positions[offset+2]); - offset = t.vertIndex[1] * 3; + offset = t.vertIndex[1] * components; Vector3 v2(positions[offset+0], positions[offset+1], positions[offset+2]); - offset = t.vertIndex[2] * 3; + offset = t.vertIndex[2] * components; Vector3 v3(positions[offset+0], positions[offset+1], positions[offset+2]); *faceNormals++ = Math::calculateFaceNormalWithoutNormalize(v1, v2, v3); @@ -341,6 +343,7 @@ namespace Ogre { *pDestPos++ = *pSrcPos++ + extrusionDir.x; *pDestPos++ = *pSrcPos++ + extrusionDir.y; *pDestPos++ = *pSrcPos++ + extrusionDir.z; + pDestPos++, pSrcPos++; } } else @@ -360,6 +363,7 @@ namespace Ogre { *pDestPos++ = *pSrcPos++ + extrusionDir.x; *pDestPos++ = *pSrcPos++ + extrusionDir.y; *pDestPos++ = *pSrcPos++ + extrusionDir.z; + pDestPos++, pSrcPos++; } } } diff --git a/OgreMain/src/OgreOptimisedUtilSSE.cpp b/OgreMain/src/OgreOptimisedUtilSSE.cpp index 095ecb45ce9..d112d2e8b22 100644 --- a/OgreMain/src/OgreOptimisedUtilSSE.cpp +++ b/OgreMain/src/OgreOptimisedUtilSSE.cpp @@ -113,7 +113,8 @@ namespace Ogre { const float *positions, const EdgeData::Triangle *triangles, Vector4 *faceNormals, - size_t numTriangles); + size_t numTriangles, + int components); /// @copydoc OptimisedUtil::calculateLightFacing virtual void __OGRE_SIMD_ALIGN_ATTRIBUTE calculateLightFacing( @@ -224,7 +225,8 @@ namespace Ogre { const float *positions, const EdgeData::Triangle *triangles, Vector4 *faceNormals, - size_t numTriangles) + size_t numTriangles, + int components) { __OGRE_SIMD_ALIGN_STACK(); @@ -232,7 +234,8 @@ namespace Ogre { positions, triangles, faceNormals, - numTriangles); + numTriangles, + components); } /// @copydoc OptimisedUtil::calculateLightFacing @@ -1583,7 +1586,8 @@ namespace Ogre { const float *positions, const EdgeData::Triangle *triangles, Vector4 *faceNormals, - size_t numTriangles) + size_t numTriangles, + int components) { 
__OGRE_CHECK_STACK_ALIGNED_FOR_SSE(); @@ -1628,24 +1632,24 @@ namespace Ogre { // Load vertex 0 of four triangles, packed as component-major format: xxxx yyyy zzzz __LOAD_FOUR_VECTOR3(x0, y0, z0, - positions + triangles[0].vertIndex[0] * 3, - positions + triangles[1].vertIndex[0] * 3, - positions + triangles[2].vertIndex[0] * 3, - positions + triangles[3].vertIndex[0] * 3); + positions + triangles[0].vertIndex[0] * components, + positions + triangles[1].vertIndex[0] * components, + positions + triangles[2].vertIndex[0] * components, + positions + triangles[3].vertIndex[0] * components); // Load vertex 1 of four triangles, packed as component-major format: xxxx yyyy zzzz __LOAD_FOUR_VECTOR3(x1, y1, z1, - positions + triangles[0].vertIndex[1] * 3, - positions + triangles[1].vertIndex[1] * 3, - positions + triangles[2].vertIndex[1] * 3, - positions + triangles[3].vertIndex[1] * 3); + positions + triangles[0].vertIndex[1] * components, + positions + triangles[1].vertIndex[1] * components, + positions + triangles[2].vertIndex[1] * components, + positions + triangles[3].vertIndex[1] * components); // Load vertex 2 of four triangles, packed as component-major format: xxxx yyyy zzzz __LOAD_FOUR_VECTOR3(x2, y2, z2, - positions + triangles[0].vertIndex[2] * 3, - positions + triangles[1].vertIndex[2] * 3, - positions + triangles[2].vertIndex[2] * 3, - positions + triangles[3].vertIndex[2] * 3); + positions + triangles[0].vertIndex[2] * components, + positions + triangles[1].vertIndex[2] * components, + positions + triangles[2].vertIndex[2] * components, + positions + triangles[3].vertIndex[2] * components); triangles += 4; @@ -1688,9 +1692,9 @@ namespace Ogre { for (size_t j = 0; j < numTriangles; ++j) { // Load vertices of the triangle - __m128 v0 = __LOAD_VECTOR3(positions + triangles->vertIndex[0] * 3); - __m128 v1 = __LOAD_VECTOR3(positions + triangles->vertIndex[1] * 3); - __m128 v2 = __LOAD_VECTOR3(positions + triangles->vertIndex[2] * 3); + __m128 v0 = 
__LOAD_VECTOR3(positions + triangles->vertIndex[0] * components); + __m128 v1 = __LOAD_VECTOR3(positions + triangles->vertIndex[1] * components); + __m128 v2 = __LOAD_VECTOR3(positions + triangles->vertIndex[2] * components); ++triangles; // Calculate face normal @@ -1896,14 +1900,10 @@ namespace Ogre { // Looks like VC7.1 generate a bit inefficient code for 'rsqrtss', so use 'rsqrtps' instead tmp = _mm_mul_ss(_mm_rsqrt_ps(tmp), _mm_load_ss(&extrudeDist)); __m128 dir = _mm_mul_ps(lp, __MM_SELECT(tmp, 0)); // X Y Z - + dir = _mm_movelh_ps(dir, _mm_unpackhi_ps(dir, _mm_set1_ps(1.0f))); // X Y Z 1: keeps X intact, source w (1) minus dir[3] (1) gives w = 0 in result - // Prepare extrude direction for extruding 4 vertices parallelly - __m128 dir0 = _mm_shuffle_ps(dir, dir, _MM_SHUFFLE(0,2,1,0)); // X Y Z X - __m128 dir1 = _mm_shuffle_ps(dir, dir, _MM_SHUFFLE(1,0,2,1)); // Y Z X Y - __m128 dir2 = _mm_shuffle_ps(dir, dir, _MM_SHUFFLE(2,1,0,2)); // Z X Y Z - - __m128 s0, s1, s2; - __m128 d0, d1, d2; + __m128 s0, s1, s2, s3; + __m128 d0, d1, d2, d3; size_t numIterations = numVertices / 4; numVertices &= 3; @@ -1914,63 +1914,31 @@ namespace Ogre { s0 = SrcAccessor::load(pSrcPos + 0); s1 = SrcAccessor::load(pSrcPos + 4); s2 = SrcAccessor::load(pSrcPos + 8); - pSrcPos += 12; + s3 = SrcAccessor::load(pSrcPos + 12); + pSrcPos += 16; // The extrusion direction is inverted, use subtract instruction here - d0 = _mm_sub_ps(s0, dir0); // X0 Y0 Z0 X1 - d1 = _mm_sub_ps(s1, dir1); // Y1 Z1 X2 Y2 - d2 = _mm_sub_ps(s2, dir2); // Z2 X3 Y3 Z3 + d0 = _mm_sub_ps(s0, dir); // X0 Y0 Z0 W0 + d1 = _mm_sub_ps(s1, dir); // X1 Y1 Z1 W1 + d2 = _mm_sub_ps(s2, dir); // X2 Y2 Z2 W2 + d3 = _mm_sub_ps(s3, dir); // X3 Y3 Z3 W3 DestAccessor::store(pDestPos + 0, d0); DestAccessor::store(pDestPos + 4, d1); DestAccessor::store(pDestPos + 8, d2); - pDestPos += 12; + DestAccessor::store(pDestPos + 12, d3); + pDestPos += 16; } - // Dealing with remaining vertices - switch (numVertices) + for (size_t vert = 0; vert < numVertices; ++vert) { - case 3: - // 9 floating-point values s0 = 
SrcAccessor::load(pSrcPos + 0); - s1 = SrcAccessor::load(pSrcPos + 4); - s2 = _mm_load_ss(pSrcPos + 8); - // The extrusion direction is inverted, use subtract instruction here - d0 = _mm_sub_ps(s0, dir0); // X0 Y0 Z0 X1 - d1 = _mm_sub_ps(s1, dir1); // Y1 Z1 X2 Y2 - d2 = _mm_sub_ss(s2, dir2); // Z2 -- -- -- - + d0 = _mm_sub_ps(s0, dir); DestAccessor::store(pDestPos + 0, d0); - DestAccessor::store(pDestPos + 4, d1); - _mm_store_ss(pDestPos + 8, d2); - break; - case 2: - // 6 floating-point values - s0 = SrcAccessor::load(pSrcPos + 0); - s1 = _mm_loadl_pi(dir1, (const __m64*)(pSrcPos + 4)); // dir1 is meaningless here - - // The extrusion direction is inverted, use subtract instruction here - d0 = _mm_sub_ps(s0, dir0); // X0 Y0 Z0 X1 - d1 = _mm_sub_ps(s1, dir1); // Y1 Z1 -- -- - - DestAccessor::store(pDestPos + 0, d0); - _mm_storel_pi((__m64*)(pDestPos + 4), d1); - break; - - case 1: - // 3 floating-point values - s0 = _mm_loadl_pi(dir0, (const __m64*)(pSrcPos + 0)); // dir0 is meaningless here - s1 = _mm_load_ss(pSrcPos + 2); - - // The extrusion direction is inverted, use subtract instruction here - d0 = _mm_sub_ps(s0, dir0); // X0 Y0 -- -- - d1 = _mm_sub_ss(s1, dir2); // Z0 -- -- -- - - _mm_storel_pi((__m64*)(pDestPos + 0), d0); - _mm_store_ss(pDestPos + 2, d1); - break; + pSrcPos += 4; + pDestPos += 4; } } }; @@ -2004,13 +1972,14 @@ namespace Ogre { for (size_t i = 0; i < numIterations; ++i) { // Load source positions - __m128 s0 = SrcAccessor::load(pSrcPos + 0); // x0 y0 z0 x1 - __m128 s1 = SrcAccessor::load(pSrcPos + 4); // y1 z1 x2 y2 - __m128 s2 = SrcAccessor::load(pSrcPos + 8); // z2 x3 y3 z3 - pSrcPos += 12; + __m128 s0 = SrcAccessor::load(pSrcPos + 0); // x0 y0 z0 w0 + __m128 s1 = SrcAccessor::load(pSrcPos + 4); // x1 y1 z1 w1 + __m128 s2 = SrcAccessor::load(pSrcPos + 8); // x2 y2 z2 w2 + __m128 s3 = SrcAccessor::load(pSrcPos + 12); // x3 y3 z3 w3 + pSrcPos += 16; - // Arrange to 3x4 component-major for batches calculate - __MM_TRANSPOSE4x3_PS(s0, s1, s2); 
+ // Arrange to 4x4 component-major for batches calculate + __MM_TRANSPOSE4x4_PS(s0, s1, s2, s3); // Calculate unnormalised extrusion direction __m128 dx = _mm_sub_ps(s0, __MM_SELECT(lp, 0)); // X0 X1 X2 X3 @@ -2028,15 +1997,17 @@ namespace Ogre { __m128 d0 = _mm_add_ps(dx, s0); __m128 d1 = _mm_add_ps(dy, s1); __m128 d2 = _mm_add_ps(dz, s2); + __m128 d3 = _mm_setzero_ps(); - // Arrange back to 4x3 continuous format for store results - __MM_TRANSPOSE3x4_PS(d0, d1, d2); + // Arrange back to 4x4 continuous format for store results + __MM_TRANSPOSE4x4_PS(d0, d1, d2, d3); // Store extruded positions DestAccessor::store(pDestPos + 0, d0); DestAccessor::store(pDestPos + 4, d1); DestAccessor::store(pDestPos + 8, d2); - pDestPos += 12; + DestAccessor::store(pDestPos + 12,d3); + pDestPos += 16; } // Dealing with remaining vertices @@ -2044,7 +2015,7 @@ namespace Ogre { { // Load source position __m128 src = _mm_loadh_pi(_mm_load_ss(pSrcPos + 0), (const __m64*)(pSrcPos + 1)); // x 0 y z - pSrcPos += 3; + pSrcPos += 4; // Calculate unnormalised extrusion direction __m128 dir = _mm_sub_ps(src, _mm_shuffle_ps(lp, lp, _MM_SHUFFLE(2,1,3,0))); // X 1 Y Z @@ -2062,7 +2033,7 @@ namespace Ogre { // Store extruded position _mm_store_ss(pDestPos + 0, dst); _mm_storeh_pi((__m64*)(pDestPos + 1), dst); - pDestPos += 3; + pDestPos += 4; } } }; diff --git a/OgreMain/src/OgreShadowCaster.cpp b/OgreMain/src/OgreShadowCaster.cpp index 4211443b13b..08b43edf27a 100644 --- a/OgreMain/src/OgreShadowCaster.cpp +++ b/OgreMain/src/OgreShadowCaster.cpp @@ -465,7 +465,7 @@ namespace Ogre { // TODO: We should add extra (ununsed) vertices ensure source and // destination buffer have same alignment for slight performance gain. 
- float* pDest = pSrc + originalVertexCount * 3; + float* pDest = pSrc + originalVertexCount * 4; OptimisedUtil::getImplementation()->extrudeVertices( light, extrudeDist, diff --git a/OgreMain/src/OgreStaticGeometry.cpp b/OgreMain/src/OgreStaticGeometry.cpp index f59822637ac..ad53563c3f5 100644 --- a/OgreMain/src/OgreStaticGeometry.cpp +++ b/OgreMain/src/OgreStaticGeometry.cpp @@ -941,18 +941,11 @@ namespace Ogre { // Create vertex data which just references position component (and 2 component) mRenderOp.vertexData = OGRE_NEW VertexData(); // Map in position data - mRenderOp.vertexData->vertexDeclaration->addElement(0,0,VET_FLOAT3, VES_POSITION); + mRenderOp.vertexData->vertexDeclaration->addElement(0,0,VET_FLOAT4, VES_POSITION); ushort origPosBind = vertexData->vertexDeclaration->findElementBySemantic(VES_POSITION)->getSource(); mPositionBuffer = vertexData->vertexBufferBinding->getBuffer(origPosBind); mRenderOp.vertexData->vertexBufferBinding->setBinding(0, mPositionBuffer); - // Map in w-coord buffer (if present) - if(vertexData->hardwareShadowVolWBuffer) - { - mRenderOp.vertexData->vertexDeclaration->addElement(1,0,VET_FLOAT1, VES_TEXTURE_COORDINATES, 0); - mWBuffer = vertexData->hardwareShadowVolWBuffer; - mRenderOp.vertexData->vertexBufferBinding->setBinding(1, mWBuffer); - } // Use same vertex start as input mRenderOp.vertexData->vertexStart = vertexData->vertexStart; @@ -1495,6 +1488,11 @@ namespace Ogre { } void StaticGeometry::GeometryBucket::build(bool stencilShadows) { + // Need to double the vertex count for the position buffer + // if we're doing stencil shadows + OgreAssert(!stencilShadows || mVertexData->vertexCount * 2 <= mMaxVertexIndex, + "Index range exceeded when using stencil shadows, consider reducing your region size or " + "reducing poly count"); // Ok, here's where we transfer the vertices and indexes to the shared // buffers // Shortcuts @@ -1511,22 +1509,12 @@ namespace Ogre { uint16* p16Dest = static_cast(dstIndexLock.pData); // create 
all vertex buffers, and lock ushort b; - ushort posBufferIdx = dcl->findElementBySemantic(VES_POSITION)->getSource(); std::vector destBufferLocks; std::vector bufferElements; for (b = 0; b < binds->getBufferCount(); ++b) { size_t vertexCount = mVertexData->vertexCount; - // Need to double the vertex count for the position buffer - // if we're doing stencil shadows - if (stencilShadows && b == posBufferIdx) - { - vertexCount = vertexCount * 2; - assert(vertexCount <= mMaxVertexIndex && - "Index range exceeded when using stencil shadows, consider " - "reducing your region size or reducing poly count"); - } HardwareVertexBufferSharedPtr vbuf = HardwareBufferManager::getSingleton().createVertexBuffer( dcl->getVertexSize(b), @@ -1659,44 +1647,6 @@ namespace Ogre { { binds->getBuffer(b)->unlock(); } - - // If we're dealing with stencil shadows, copy the position data from - // the early half of the buffer to the latter part - if (stencilShadows) - { - HardwareVertexBufferSharedPtr buf = binds->getBuffer(posBufferIdx); - HardwareBufferLockGuard bufLock(buf, HardwareBuffer::HBL_NORMAL); - void* pSrc = bufLock.pData; - // Point dest at second half (remember vertexcount is original count) - void* pDest = static_cast(pSrc) + - buf->getVertexSize() * mVertexData->vertexCount; - memcpy(pDest, pSrc, buf->getVertexSize() * mVertexData->vertexCount); - bufLock.unlock(); - - // Also set up hardware W buffer if appropriate - RenderSystem* rend = Root::getSingleton().getRenderSystem(); - if (rend) - { - buf = HardwareBufferManager::getSingleton().createVertexBuffer( - sizeof(float), mVertexData->vertexCount * 2, - HardwareBuffer::HBU_STATIC_WRITE_ONLY, false); - // Fill the first half with 1.0, second half with 0.0 - bufLock.lock(buf, HardwareBuffer::HBL_DISCARD); - float *pW = static_cast(bufLock.pData); - size_t v; - for (v = 0; v < mVertexData->vertexCount; ++v) - { - *pW++ = 1.0f; - } - for (v = 0; v < mVertexData->vertexCount; ++v) - { - *pW++ = 0.0f; - } - bufLock.unlock(); - 
mVertexData->hardwareShadowVolWBuffer = buf; - } - } - } //-------------------------------------------------------------------------- void StaticGeometry::GeometryBucket::dump(std::ofstream& of) const diff --git a/OgreMain/src/OgreVertexIndexData.cpp b/OgreMain/src/OgreVertexIndexData.cpp index 40a1767265a..052b76ea142 100644 --- a/OgreMain/src/OgreVertexIndexData.cpp +++ b/OgreMain/src/OgreVertexIndexData.cpp @@ -118,9 +118,6 @@ namespace Ogre { ei->getIndex() ); } - // Copy reference to hardware shadow buffer, no matter whether copy data or not - dest->hardwareShadowVolWBuffer = hardwareShadowVolWBuffer; - // copy anim data dest->hwAnimationDataList = hwAnimationDataList; dest->hwAnimDataItemsUsed = hwAnimDataItemsUsed; @@ -131,104 +128,84 @@ namespace Ogre { //----------------------------------------------------------------------- void VertexData::prepareForShadowVolume(void) { - /* NOTE - I would dearly, dearly love to just use a 4D position buffer in order to - store the extra 'w' value I need to differentiate between extruded and - non-extruded sections of the buffer, so that vertex programs could use that. - Hey, it works fine for GL. However, D3D9 in it's infinite stupidity, does not - support 4d position vertices in the fixed-function pipeline. If you use them, - you just see nothing. Since we can't know whether the application is going to use - fixed function or vertex programs, we have to stick to 3d position vertices and - store the 'w' in a separate 1D texture coordinate buffer, which is only used - when rendering the shadow. 
- */ - - // Upfront, lets check whether we have vertex program capability - RenderSystem* rend = Root::getSingleton().getRenderSystem(); - bool useVertexPrograms = rend; - - // Look for a position element const VertexElement* posElem = vertexDeclaration->findElementBySemantic(VES_POSITION); if (posElem) { size_t v; unsigned short posOldSource = posElem->getSource(); + auto posOldSize = posElem->getSize(); HardwareVertexBufferSharedPtr vbuf = vertexBufferBinding->getBuffer(posOldSource); - bool wasSharedBuffer = false; // Are there other elements in the buffer except for the position? - if (vbuf->getVertexSize() > posElem->getSize()) - { - // We need to create another buffer to contain the remaining elements - // Most drivers don't like gaps in the declaration, and in any case it's waste - wasSharedBuffer = true; - } + bool wasSharedBuffer = vbuf->getVertexSize() > posElem->getSize(); + HardwareVertexBufferSharedPtr newPosBuffer, newRemainderBuffer; if (wasSharedBuffer) { + // We need to create another buffer to contain the remaining elements + // Most drivers don't like gaps in the declaration, and in any case it's waste newRemainderBuffer = vbuf->getManager()->createVertexBuffer( vbuf->getVertexSize() - posElem->getSize(), vbuf->getNumVertices(), vbuf->getUsage(), vbuf->hasShadowBuffer()); } - // Allocate new position buffer, will be FLOAT3 and 2x the size + // Allocate new position buffer, will be FLOAT4 and 2x the size size_t oldVertexCount = vbuf->getNumVertices(); size_t newVertexCount = oldVertexCount * 2; newPosBuffer = vbuf->getManager()->createVertexBuffer( - VertexElement::getTypeSize(VET_FLOAT3), newVertexCount, vbuf->getUsage(), + VertexElement::getTypeSize(VET_FLOAT4), newVertexCount, vbuf->getUsage(), vbuf->hasShadowBuffer()); // Iterate over the old buffer, copying the appropriate elements and initialising the rest float* pSrc; - unsigned char *pBaseSrc = static_cast( - vbuf->lock(HardwareBuffer::HBL_READ_ONLY)); + auto* pBaseSrc = 
static_cast(vbuf->lock(HardwareBuffer::HBL_READ_ONLY)); // Point first destination pointer at the start of the new position buffer, // the other one half way along float *pDest = static_cast(newPosBuffer->lock(HardwareBuffer::HBL_DISCARD)); - float* pDest2 = pDest + oldVertexCount * 3; + float* pDest2 = pDest + oldVertexCount * 4; // Precalculate any dimensions of vertex areas outside the position size_t prePosVertexSize = 0; - unsigned char *pBaseDestRem = 0; - if (wasSharedBuffer) + uchar *pBaseDestRem = 0; + + size_t postPosVertexSize, postPosVertexOffset; + prePosVertexSize = posElem->getOffset(); + postPosVertexOffset = prePosVertexSize + posElem->getSize(); + postPosVertexSize = vbuf->getVertexSize() - postPosVertexOffset; + + if(wasSharedBuffer) { - size_t postPosVertexSize, postPosVertexOffset; - pBaseDestRem = static_cast( - newRemainderBuffer->lock(HardwareBuffer::HBL_DISCARD)); - prePosVertexSize = posElem->getOffset(); - postPosVertexOffset = prePosVertexSize + posElem->getSize(); - postPosVertexSize = vbuf->getVertexSize() - postPosVertexOffset; // the 2 separate bits together should be the same size as the remainder buffer vertex assert (newRemainderBuffer->getVertexSize() == prePosVertexSize + postPosVertexSize); - - // Iterate over the vertices - for (v = 0; v < oldVertexCount; ++v) - { - // Copy position, into both buffers - posElem->baseVertexPointerToElement(pBaseSrc, &pSrc); - *pDest++ = *pDest2++ = *pSrc++; - *pDest++ = *pDest2++ = *pSrc++; - *pDest++ = *pDest2++ = *pSrc++; - - // now deal with any other elements - // Basically we just memcpy the vertex excluding the position - if (prePosVertexSize > 0) - memcpy(pBaseDestRem, pBaseSrc, prePosVertexSize); - if (postPosVertexSize > 0) - memcpy(pBaseDestRem + prePosVertexSize, - pBaseSrc + postPosVertexOffset, postPosVertexSize); - pBaseDestRem += newRemainderBuffer->getVertexSize(); - - pBaseSrc += vbuf->getVertexSize(); - - } // next vertex + pBaseDestRem = 
static_cast(newRemainderBuffer->lock(HardwareBuffer::HBL_DISCARD)); } - else + + // Iterate over the vertices; pBaseSrc is stepped in the loop header so the early 'continue' taken for an unshared buffer still advances to the next source vertex + for (v = 0; v < oldVertexCount; ++v, pBaseSrc += vbuf->getVertexSize()) { - // Unshared buffer, can block copy the whole thing - memcpy(pDest, pBaseSrc, vbuf->getSizeInBytes()); - memcpy(pDest2, pBaseSrc, vbuf->getSizeInBytes()); - } + // Copy position, into both buffers + posElem->baseVertexPointerToElement(pBaseSrc, &pSrc); + *pDest++ = *pDest2++ = *pSrc++; + *pDest++ = *pDest2++ = *pSrc++; + *pDest++ = *pDest2++ = *pSrc++; + // Fill the first half with w=1.0, second half with w=0.0 + *pDest++ = 1; *pDest2++ = 0; + + if (!wasSharedBuffer) + continue; + + // now deal with any other elements + // Basically we just memcpy the vertex excluding the position + if (prePosVertexSize > 0) + memcpy(pBaseDestRem, pBaseSrc, prePosVertexSize); + if (postPosVertexSize > 0) + memcpy(pBaseDestRem + prePosVertexSize, + pBaseSrc + postPosVertexOffset, postPosVertexSize); + pBaseDestRem += newRemainderBuffer->getVertexSize(); + + // pBaseSrc is advanced by the loop header, not here + + } // next vertex vbuf->unlock(); newPosBuffer->unlock(); @@ -239,25 +216,6 @@ namespace Ogre { // So we should force the deallocation of any temporary copies vbuf->getManager()->_forceReleaseBufferCopies(vbuf); - if (useVertexPrograms) - { - // Now it's time to set up the w buffer - hardwareShadowVolWBuffer = vbuf->getManager()->createVertexBuffer( - sizeof(float), newVertexCount, HardwareBuffer::HBU_STATIC_WRITE_ONLY, false); - // Fill the first half with 1.0, second half with 0.0 - pDest = static_cast( - hardwareShadowVolWBuffer->lock(HardwareBuffer::HBL_DISCARD)); - for (v = 0; v < oldVertexCount; ++v) - { - *pDest++ = 1.0f; - } - for (v = 0; v < oldVertexCount; ++v) - { - *pDest++ = 0.0f; - } - hardwareShadowVolWBuffer->unlock(); - } - unsigned short newPosBufferSource; if (wasSharedBuffer) { @@ -302,7 +260,7 @@ namespace Ogre { vertexDeclaration->modifyElement( idx, posOldSource, // same old source - elemi->getOffset() -
posElem->getSize(), // less offset now + elemi->getOffset() - posOldSize, // less offset now elemi->getType(), elemi->getSemantic(), elemi->getIndex()); diff --git a/OgreMain/src/SSE2NEON.h b/OgreMain/src/SSE2NEON.h index 08047aedda5..55875b4d96c 100644 --- a/OgreMain/src/SSE2NEON.h +++ b/OgreMain/src/SSE2NEON.h @@ -471,6 +471,11 @@ FORCE_INLINE __m128 _mm_load_ss(const float * p) return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0)); } +// Converts the signed integer b to a single-precision floating-point value, stores it in the low word of the result and copies the upper three words from a. https://msdn.microsoft.com/en-us/library/k2204k44%28v=vs.90%29.aspx +FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b) +{ + return vreinterpretq_m128_f32(vsetq_lane_f32((float)b, a, 0)); +} // ****************************************** // Logic/Binary operations