PCSX2 · TJnotJT · Feb 2, 2025 · Feb 5, 2025
diff --git a/pcsx2/GS/GSState.cpp b/pcsx2/GS/GSState.cpp
@@ -1595,13 +1595,6 @@ inline bool GSState::TestDrawChanged()
 	return false;
 }
 
-u32 GSState::CalcMask(int exp, int max_exp)
-{
-	const int amount = 9 + (max_exp - exp);
-
-	return (1 << std::min(amount, 23)) - 1;
-}
-
 void GSState::FlushPrim()
 {
 	if (m_index.tail > 0)
@@ -1676,49 +1669,17 @@ void GSState::FlushPrim()
 #endif
 
 		m_vt.Update(m_vertex.buff, m_index.buff, m_vertex.tail, m_index.tail, GSUtil::GetPrimClass(PRIM->PRIM));
+
+		// Fix huge or nan ST coordinates
+		if (PRIM->TME && !PRIM->FST)
+		{
+			FixHugeSTCoords();	
+		}
 
-		// Texel coordinate rounding
-		// Helps Manhunt (lights shining through objects).
-		// Can help with some alignment issues when upscaling too, and is for both Software and Hardware renderers.
-		// Sometimes hardware doesn't get affected, likely due to the difference in how GPU's handle textures (Persona minimap).
-		if (PRIM->TME && (GSUtil::GetPrimClass(PRIM->PRIM) == GS_PRIM_CLASS::GS_SPRITE_CLASS || m_vt.m_eq.z))
+		// Round fractional parts of ST coords
+		if (PRIM->TME && !PRIM->FST && (GSUtil::GetPrimClass(PRIM->PRIM) == GS_PRIM_CLASS::GS_SPRITE_CLASS || m_vt.m_eq.z))
 		{
-			if (!PRIM->FST) // STQ's
-			{
-				const bool is_sprite = GSUtil::GetPrimClass(PRIM->PRIM) == GS_PRIM_CLASS::GS_SPRITE_CLASS;
-				// ST's have the lowest 9 bits (or greater depending on exponent difference) rounding down (from hardware tests).
-				for (int i = m_index.tail - 1; i >= 0; i--)
-				{
-					GSVertex* v = &m_vertex.buff[m_index.buff[i]];
-
-					// Only Q on the second vertex is valid
-					if (!(i & 1) && is_sprite)
-						v->RGBAQ.Q = m_vertex.buff[m_index.buff[i + 1]].RGBAQ.Q;
-
-					int T = std::bit_cast<int>(v->ST.T);
-					int Q = std::bit_cast<int>(v->RGBAQ.Q);
-					int S = std::bit_cast<int>(v->ST.S);
-					const int expS = (S >> 23) & 0xff;
-					const int expT = (T >> 23) & 0xff;
-					const int expQ = (Q >> 23) & 0xff;
-					int max_exp = std::max(expS, expQ);
-
-					u32 mask = CalcMask(expS, max_exp);
-					S &= ~mask;
-					v->ST.S = std::bit_cast<float>(S);
-					max_exp = std::max(expT, expQ);
-					mask = CalcMask(expT, max_exp);
-					T &= ~mask;
-					v->ST.T = std::bit_cast<float>(T);
-					Q &= ~0xff;
-
-					if (!is_sprite || (i & 1))
-						v->RGBAQ.Q = std::bit_cast<float>(Q);
-
-					m_vt.m_min.t.x = std::min(m_vt.m_min.t.x, (v->ST.S / v->RGBAQ.Q) * (1 << m_context->TEX0.TW));
-					m_vt.m_min.t.y = std::min(m_vt.m_min.t.y, (v->ST.T / v->RGBAQ.Q) * (1 << m_context->TEX0.TH));
-				}
-			}
+			RoundSTCoords();
 		}
 
 		// Skip draw if Z test is enabled, but set to fail all pixels.
@@ -3831,8 +3792,8 @@ GSState::TextureMinMaxResult GSState::GetTextureMinMax(GIFRegTEX0 TEX0, GIFRegCL
 
 	u8 uses_border = 0;
 
-	if (m_vt.m_max.t.x >= FLT_MAX || m_vt.m_min.t.x <= -FLT_MAX ||
-		m_vt.m_max.t.y >= FLT_MAX || m_vt.m_min.t.y <= -FLT_MAX)
+	if (m_vt.m_max.t.x >= 2047.0f || m_vt.m_min.t.x <= -2047.0f ||
+		m_vt.m_max.t.y >= 2047.0f || m_vt.m_min.t.y <= -2047.0f)
 	{
 		// If any of the min/max values are +-FLT_MAX we can't rely on them
 		// so just assume full texture.
@@ -4009,6 +3970,208 @@ GSState::TextureMinMaxResult GSState::GetTextureMinMax(GIFRegTEX0 TEX0, GIFRegCL
 	return { vr, uses_border };
 }
 
+// ST coordinate rounding
+// Helps Manhunt (lights shining through objects).
+// Can help with some alignment issues when upscaling too, and is for both Software and Hardware renderers.
+// Sometimes hardware doesn't get affected, likely due to the difference in how GPUs handle textures (Persona minimap).
+void GSState::RoundSTCoords()
+{
+	// ST's have the lowest 9 bits (or greater depending on exponent difference) rounded down (from hardware tests).
+	// This gives the bitmask for the lower 9 (or more) bits.
+	auto LowerBitsMask = [](int exp, int max_exp)
+	{
+		const int amount = 9 + (max_exp - exp); // 9 bits, plus one more for each step exp lies below max_exp
+		return (1 << std::min(amount, 23)) - 1; // Capped at 23 bits, i.e. everything below the exponent field
+	};
+
+	for (int i = m_index.tail - 1; i >= 0; i--) // Visit every index, last to first
+	{
+		GSVertex* v = &m_vertex.buff[m_index.buff[i]];
+
+		if (m_vt.m_primclass == GS_SPRITE_CLASS && (i & 1))
+		{
+			// FIXME: Remove this once done debugging
+			pxAssertMsg(m_vertex.buff[m_index.buff[i]].RGBAQ.Q == m_vertex.buff[m_index.buff[i - 1]].RGBAQ.Q, "Sprite Qs different");
+		}
+
+		int S = std::bit_cast<int>(v->ST.S); // Raw float bit patterns, so low bits can be masked off directly
+		int T = std::bit_cast<int>(v->ST.T);
+		int Q = std::bit_cast<int>(v->RGBAQ.Q);
+
+		const int expS = (S >> 23) & 0xff; // 8-bit exponent field of each value
+		const int expT = (T >> 23) & 0xff;
+		const int expQ = (Q >> 23) & 0xff;
+
+		S &= ~LowerBitsMask(expS, std::max(expS, expQ)); // More bits are dropped when Q's exponent exceeds S's
+		T &= ~LowerBitsMask(expT, std::max(expT, expQ));
+		Q &= ~0xff; // Q gets truncated less than ST by hardware tests
+
+		v->ST.S = std::bit_cast<float>(S); // Only S and T are stored back into the vertex
+		v->ST.T = std::bit_cast<float>(T);
+
+		const float U = (v->ST.S / v->RGBAQ.Q) * (1 << m_context->TEX0.TW); // Divides by the untruncated Q
+		const float V = (v->ST.T / v->RGBAQ.Q) * (1 << m_context->TEX0.TH);
+		const float Qf = std::bit_cast<float>(Q); // NOTE(review): truncated Q only feeds the min/max below and is not stored in v->RGBAQ.Q - confirm intended
+
+		const GSVector4 uvq(U, V, Qf, Qf);
+
+		// Do min/max with only those values that are not NaN
+		m_vt.m_min.t = m_vt.m_min.t.blend32(m_vt.m_min.t.min(uvq), uvq.notnan());
+		m_vt.m_max.t = m_vt.m_max.t.blend32(m_vt.m_max.t.max(uvq), uvq.notnan());
+	}
+
+	// Clamp the min/max UV values to the min/max valid UV values (xyzw() presumably keeps the unclamped z/w lanes - confirm).
+	m_vt.m_min.t = m_vt.m_min.t.min(GSVector4(2047.0f)).max(GSVector4(-2047.0f)).xyzw(m_vt.m_min.t);
+	m_vt.m_max.t = m_vt.m_max.t.min(GSVector4(2047.0f)).max(GSVector4(-2047.0f)).xyzw(m_vt.m_max.t);
+}
+
+// Handles huge or NaN ST coords by either culling the affected primitives or
+// replacing them with primitives that have valid coordinates.
+// This is based on hardware tests that seem to show that ST coordinates get clamped to +/- 2047
+// before applying repeat or region repeat.
+// Note that the huge texture coords may be a symptom of floating point issues in the EE and
+// it would be better to have them fixed there.
+void GSState::FixHugeSTCoords()
+{
+	switch (GSUtil::GetClassVertexCount(GSUtil::GetPrimClass(PRIM->PRIM))) // Dispatch on the vertex count of the current primitive class
+	{
+		case 1:
+			FixHugeSTCoordsImpl<1, false>(); // cull = false: ambiguous primitives are kept and get replacement coordinates
+			break;
+		case 2:
+			FixHugeSTCoordsImpl<2, false>();
+			break;
+		case 3:
+			FixHugeSTCoordsImpl<3, false>();
+			break;
+		default:
+			pxFail("Impossible"); // Only vertex counts 1, 2 and 3 are handled
+	}
+}
+
+template <u32 n, bool cull> // n: vertices per primitive; cull: drop ambiguous primitives instead of zeroing their coordinates
+void GSState::FixHugeSTCoordsImpl() // Rebuilds the index list, redirecting primitives with huge/NaN S/Q or T/Q to clamped vertex copies
+{
+	// Bind by reference instead of caching the pointers: GrowVertexBuffer() below may replace both buffers.
+	auto& vertex = m_vertex.buff;
+	auto& index = m_index.buff;
+	u32 new_index_tail = 0;
+
+	constexpr float huge = 1e10f; // Arbitrary large value
+
+	const float tex_width = 1 << m_context->TEX0.TW;
+	const float tex_height = 1 << m_context->TEX0.TH;
+
+	bool new_prims = false; // Did we generate new primitives?
+
+	for (u32 i = 0; i < m_index.tail; i += n)
+	{
+		bool nan_s = false;
+		bool nan_t = false;
+		bool huge_pos_s = false;
+		bool huge_neg_s = false;
+		bool huge_pos_t = false;
+		bool huge_neg_t = false;
+
+		if (m_vt.m_primclass == GS_SPRITE_CLASS)
+		{
+			// FIXME: Remove this once done debugging
+			pxAssertMsg(vertex[index[i + 0]].RGBAQ.Q == vertex[index[i + 1]].RGBAQ.Q, "Sprite Qs different");
+		}
+
+		for (u32 j = 0; j < n; j++)
+		{
+			const float s = vertex[index[i + j]].ST.S / vertex[index[i + j]].RGBAQ.Q;
+			const float t = vertex[index[i + j]].ST.T / vertex[index[i + j]].RGBAQ.Q;
+			nan_s |= std::isnan(s);
+			nan_t |= std::isnan(t);
+			huge_pos_s |= s > huge;
+			huge_pos_t |= t > huge;
+			huge_neg_s |= s < -huge;
+			huge_neg_t |= t < -huge;
+		}
+
+		// ambiguous = true would probably result in NaN in the SW rasterizer or something undefined in HW.
+		// PS2 does not have NaN so there is no really accurate way to emulate this.
+		// huge = true and ambiguous = false seems to have well-defined behavior on the PS2:
+		// it clamps huge values to +/-2047 in UV coordinates space. We try to approximate this by
+		// giving ST the values that would result in exactly +/-2047 across the primitive.
+		// For ambiguous values either cull the primitive or replace coordinates by 0.
+		const bool ambiguous_s = nan_s || (huge_pos_s && huge_neg_s);
+		const bool ambiguous_t = nan_t || (huge_pos_t && huge_neg_t);
+
+		if ((ambiguous_s || ambiguous_t) && cull)
+		{
+			// Cull the primitive by not saving the indices
+			continue;
+		}
+
+		if (huge_pos_s || huge_pos_t || huge_neg_s || huge_neg_t || ambiguous_s || ambiguous_t)
+		{
+			// Add new vertices to replace the primitive with another primitive with clamped values.
+			new_prims = true;
+
+			// Copy old values to tail of vertex buffer.
+			// The vertex buffer is allocated so that there is always at least room for 3 new vertices at the end.
+			for (u32 j = 0; j < n; j++)
+				vertex[m_vertex.tail + j] = vertex[index[i + j]];
+
+			// NAN means "leave this axis untouched".
+			const float new_u_val = ambiguous_s ? 0.0f :
+									huge_pos_s ? 2047.0f :
+									huge_neg_s ? -2047.0f : NAN;
+			const float new_v_val = ambiguous_t ? 0.0f :
+									huge_pos_t ? 2047.0f :
+									huge_neg_t ? -2047.0f : NAN;
+			const bool replace_u = !std::isnan(new_u_val);
+			const bool replace_v = !std::isnan(new_v_val);
+
+			// If we are replacing both S and T, replace Q by 1.0f
+			if (replace_u && replace_v)
+			{
+				for (u32 j = 0; j < n; j++)
+					vertex[m_vertex.tail + j].RGBAQ.Q = 1.0f;
+			}
+
+			// Try to replace huge/ambiguous values so that we get constant U or V across the entire primitive after interpolation
+			if (replace_u)
+			{
+				for (u32 j = 0; j < n; j++)
+					vertex[m_vertex.tail + j].ST.S = new_u_val * vertex[m_vertex.tail + j].RGBAQ.Q / tex_width;
+			}
+
+			if (replace_v)
+			{
+				for (u32 j = 0; j < n; j++)
+					vertex[m_vertex.tail + j].ST.T = new_v_val * vertex[m_vertex.tail + j].RGBAQ.Q / tex_height; // V is normalized by the height, not the width
+			}
+
+			// Make new indices point to new vertices
+			for (u32 j = 0; j < n; j++)
+				index[new_index_tail + j] = m_vertex.tail + j;
+
+			// Advance tail since we pushed new vertices
+			m_vertex.tail += n;
+
+			if (m_vertex.tail >= m_vertex.maxcount)
+				GrowVertexBuffer(); // vertex/index are references, so they follow any reallocation done here
+		}
+		else if (new_index_tail < i) // If new_index_tail == i, don't update indices since no primitives have been culled
+		{
+			// Keep the same primitive so shift indices down
+			for (u32 j = 0; j < n; j++)
+				index[new_index_tail + j] = index[i + j];
+		}
+
+		new_index_tail += n;
+	}
+
+	m_index.tail = new_index_tail;
+
+	// If we appended new vertices at the end of the buffer, update head and next as well
+	if (new_prims)
+		m_vertex.head = m_vertex.next = m_vertex.tail;
+}
+
 void GSState::CalcAlphaMinMax(const int tex_alpha_min, const int tex_alpha_max)
 {
 	if (m_vt.m_alpha.valid && tex_alpha_min == 0 && tex_alpha_max == 255)

diff --git a/pcsx2/GS/GSState.h b/pcsx2/GS/GSState.h
@@ -190,6 +190,10 @@ class GSState : public GSAlignedClass<32>
 	bool IsCoverageAlpha();
 	void CalcAlphaMinMax(const int tex_min, const int tex_max);
 	void CorrectATEAlphaMinMax(const u32 atst, const int aref);
+	void RoundSTCoords();
+	void FixHugeSTCoords();
+	template <u32 n, bool cull>
+	void FixHugeSTCoordsImpl();
 
 public:
 	struct GSUploadQueue

diff --git a/pcsx2/GS/GSVector4.h b/pcsx2/GS/GSVector4.h
@@ -267,6 +267,16 @@ class alignas(16) GSVector4
 		return round<Round_PosInf>();
 	}
 
+	__forceinline GSVector4 notnan() const // Per-lane mask: all bits set where the lane is not NaN, zero where it is
+	{
+		return GSVector4(_mm_cmpord_ps(m, m)); // Ordered compare of a lane with itself fails only for NaN
+	}
+
+	__forceinline GSVector4 isnan() const // Per-lane mask: all bits set where the lane is NaN, zero otherwise
+	{
+		return GSVector4(_mm_cmpunord_ps(m, m)); // Unordered compare of a lane with itself succeeds only for NaN
+	}
+
 	// http://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
 
 #define LOG_POLY0(x, c0) GSVector4(c0)
@@ -656,6 +666,11 @@ class alignas(16) GSVector4
 		return neg();
 	}
 
+	__forceinline GSVector4 operator~() const // Bitwise complement, not an arithmetic operation
+	{
+		return cast(~GSVector4i::cast(*this)); // Complement is taken via GSVector4i; cast() presumably reinterprets the bits - confirm
+	}
+
 	__forceinline void operator+=(const GSVector4& v)
 	{
 		m = _mm_add_ps(m, v);

diff --git a/pcsx2/GS/GSVector4_arm64.h b/pcsx2/GS/GSVector4_arm64.h
@@ -241,6 +241,16 @@ class alignas(16) GSVector4
 		return GSVector4(vrndpq_f32(v4s));
 	}
 
+	__forceinline GSVector4 notnan() const // Intended as a per-lane "not NaN" mask
+	{
+		return *this == *this; // NaN never compares equal to itself; assumes operator== yields a per-lane mask - confirm
+	}
+
+	__forceinline GSVector4 isnan() const // Intended as a per-lane "is NaN" mask
+	{
+		return *this != *this; // Only NaN compares unequal to itself; assumes operator!= yields a per-lane mask - confirm
+	}
+
 	// http://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
 
 #define LOG_POLY0(x, c0) GSVector4(c0)
@@ -560,6 +570,11 @@ class alignas(16) GSVector4
 		return neg();
 	}
 
+	__forceinline GSVector4 operator~() const // Bitwise complement, not an arithmetic operation
+	{
+		return cast(~GSVector4i::cast(*this)); // Complement is taken via GSVector4i; cast() presumably reinterprets the bits - confirm
+	}
+
 	__forceinline void operator+=(const GSVector4& v)
 	{
 		v4s = vaddq_f32(v4s, v.v4s);

diff --git a/pcsx2/GS/Renderers/Common/GSVertexTrace.cpp b/pcsx2/GS/Renderers/Common/GSVertexTrace.cpp
@@ -20,11 +20,12 @@ void GSVertexTrace::Update(const void* vertex, const u16* index, int v_count, in
 
 	m_primclass = primclass;
 
-	u32 iip = m_state->PRIM->IIP;
-	u32 tme = m_state->PRIM->TME;
-	u32 fst = m_state->PRIM->FST;
-	u32 color = !(m_state->PRIM->TME && m_state->m_context->TEX0.TFX == TFX_DECAL && m_state->m_context->TEX0.TCC);
+	const u32 iip = m_state->PRIM->IIP;
+	const u32 tme = m_state->PRIM->TME;
+	const u32 fst = m_state->PRIM->FST;
+	const u32 color = !(m_state->PRIM->TME && m_state->m_context->TEX0.TFX == TFX_DECAL && m_state->m_context->TEX0.TCC);
 
+	// Call the correct function to find the min/max values
 	m_fmm[color][fst][tme][iip][primclass](*this, vertex, index, i_count);
 
 	// Potential float overflow detected. Better uses the slower division instead