diff --git a/src/Effects/Anime4K/Anime4K_Upscale_Denoise_VL.hlsl b/src/Effects/Anime4K/Anime4K_Upscale_Denoise_VL.hlsl index 202939f90..d2a1bbd11 100644 --- a/src/Effects/Anime4K/Anime4K_Upscale_Denoise_VL.hlsl +++ b/src/Effects/Anime4K/Anime4K_Upscale_Denoise_VL.hlsl @@ -4,6 +4,9 @@ //!MAGPIE EFFECT //!VERSION 4 //!SORT_NAME Anime4K_Upscale_Denoise_2 +//!USE FP16, MulAdd + +#include "..\StubDefs.hlsli" //!TEXTURE @@ -125,22 +128,22 @@ void Pass1(uint2 blockStart, uint3 threadId) { uint i, j; - float3 src[4][4]; + MF3 src[4][4]; [unroll] for (i = 0; i <= 2; i += 2) { [unroll] for (j = 0; j <= 2; j += 2) { float2 tpos = (gxy + uint2(i, j)) * inputPt; - const float4 sr = INPUT.GatherRed(sam, tpos); - const float4 sg = INPUT.GatherGreen(sam, tpos); - const float4 sb = INPUT.GatherBlue(sam, tpos); + const MF4 sr = INPUT.GatherRed(sam, tpos); + const MF4 sg = INPUT.GatherGreen(sam, tpos); + const MF4 sb = INPUT.GatherBlue(sam, tpos); // w z // x y - src[i][j] = float3(sr.w, sg.w, sb.w); - src[i][j + 1] = float3(sr.x, sg.x, sb.x); - src[i + 1][j] = float3(sr.z, sg.z, sb.z); - src[i + 1][j + 1] = float3(sr.y, sg.y, sb.y); + src[i][j] = MF3(sr.w, sg.w, sb.w); + src[i][j + 1] = MF3(sr.x, sg.x, sb.x); + src[i + 1][j] = MF3(sr.z, sg.z, sb.z); + src[i + 1][j + 1] = MF3(sr.y, sg.y, sb.y); } } @@ -156,27 +159,27 @@ void Pass1(uint2 blockStart, uint3 threadId) { } } - float4 target1 = mul(src[i - 1][j - 1], float3x4(0.28296316, -0.020139743, 0.1038232, 0.09352482, -0.16964972, 0.07910997, -0.049914766, -0.10661066, -0.121037185, -0.029087039, -0.02511847, -0.078911744)); - target1 += mul(src[i - 1][j], float3x4(-0.3927183, 0.01805193, -0.031168332, -0.13300525, 0.20814548, 0.118818566, 0.1655351, 0.095023684, 0.17600809, -0.03928444, -0.014350658, 0.08458312)); - target1 += mul(src[i - 1][j + 1], float3x4(0.079089314, -0.0421829, 0.05452305, -0.22055493, 0.013279097, -0.12875281, 0.02452735, -0.101503745, -0.085946664, 0.05539176, 0.022408713, 0.14837204)); - target1 += mul(src[i][j - 1], float3x4(-0.102643915, -0.011254746, 0.1478563, 0.1030208, 0.12396588, 0.0016621432, 0.2551224, -0.10399001, -0.01068436, 0.07155532, -0.104522154, 0.026937222)); - target1 += mul(src[i][j], float3x4(-0.8789423, 0.35707328, -0.29964274, -0.064913996, 0.4962815, 0.26001287, -0.9511284, 0.49574667, 0.39539725, 0.16308042, 0.119878456, -0.30259115)); - target1 += mul(src[i][j + 1], float3x4(-0.08852938, -0.32612664, -0.006712046, 0.28693515, 0.06320871, -0.3322611, 0.04651086, -0.11020996, 0.01821082, -0.22851005, -0.07803438, 0.021527015)); - target1 += mul(src[i + 1][j - 1], float3x4(0.12295851, -0.011285535, 0.015859747, 0.04005441, -0.018136669, 0.03171969, -0.0406123, -0.10731229, -0.12117574, 0.005033036, 0.047838476, 0.026843475)); - target1 += mul(src[i + 1][j], float3x4(0.4655988, 0.05519082, 0.039515793, 0.28410903, -0.36144528, 0.13039446, 0.11338478, -0.2141387, -0.10026682, -0.07903024, -0.09410254, 0.043833878)); - target1 += mul(src[i + 1][j + 1], float3x4(0.110124744, -0.024725702, 0.028102143, -0.09493807, -0.06455328, -0.15164614, 0.04425987, 0.15483347, -0.045039337, 0.07210396, -0.005390788, -0.03832707)); - target1 += float4(0.007907974, -0.035503313, 0.057224784, -0.19763541); - - float4 target2 = mul(src[i - 1][j - 1], float3x4(-0.012326053, 0.050769784, 0.1278702, -0.100782245, 0.14329414, -0.054558773, 0.023473471, 0.056829426, 0.048292916, 0.0046510273, -0.11478287, 0.0011030561)); - target2 += mul(src[i - 1][j], float3x4(0.29542983, -0.55061895, -0.068554066, 0.1433222, -0.072878316, 0.30201668, -0.2223378, -0.06704077, 0.16955832, 0.3279914, 0.17619601, -0.1276919)); - target2 += mul(src[i - 1][j + 1], float3x4(0.09623417, 0.30559412, 0.094622105, -0.076706685, 0.07943858, -0.084815115, 0.12472551, 0.079850115, -0.13044213, -0.21300878, -0.095747225, 0.13412355)); - target2 += mul(src[i][j - 1], float3x4(0.21291664, 0.17195296, -0.20080926, 0.1064855, 0.10228669, -0.09580175, -0.11217631, -0.09740562, -0.0033135475, -0.053094357, 0.2983595, 0.035281878)); - target2 += mul(src[i][j], float3x4(-0.08955812, -0.45707774, -0.4606922, -0.5754473, -0.11395895, 0.33530128, 0.29705846, -0.18877256, -0.43502945, 0.114171304, -0.3750776, -0.081597246)); - target2 += mul(src[i][j + 1], float3x4(-0.26109028, 0.02662961, -0.10441071, 0.11199392, -0.12038989, -0.09642296, -0.061320662, -0.33058178, 0.20212512, 0.00840794, 0.14357455, -0.038080238)); - target2 += mul(src[i + 1][j - 1], float3x4(-0.09533881, -0.13644339, 0.068756215, 0.079305276, -0.053370547, 0.19572955, 0.0682981, 0.14469264, 0.15582883, -0.057183057, -0.13919263, -0.016394936)); - target2 += mul(src[i + 1][j], float3x4(-0.041189935, 0.39878023, 0.028704925, 0.30194348, -0.04486593, -0.33899093, -0.103968106, 0.21802065, -0.077099144, -0.07389541, 0.18069103, 0.18894517)); - target2 += mul(src[i + 1][j + 1], float3x4(-0.12399862, 0.19246885, 0.034825478, -0.0044787163, 0.13121822, -0.13573012, -0.030162754, 0.1899518, 0.102326415, -0.061512686, -0.005647928, -0.0937634)); - target2 += float4(0.019286277, -0.033644073, 0.08196311, 0.0054393094); + MF4 target1 = { 0.007907974, -0.035503313, 0.057224784, -0.19763541 }; + target1 = MulAdd(src[i - 1][j - 1], MF3x4(0.28296316, -0.020139743, 0.1038232, 0.09352482, -0.16964972, 0.07910997, -0.049914766, -0.10661066, -0.121037185, -0.029087039, -0.02511847, -0.078911744), target1); + target1 = MulAdd(src[i - 1][j], MF3x4(-0.3927183, 0.01805193, -0.031168332, -0.13300525, 0.20814548, 0.118818566, 0.1655351, 0.095023684, 0.17600809, -0.03928444, -0.014350658, 0.08458312), target1); + target1 = MulAdd(src[i - 1][j + 1], MF3x4(0.079089314, -0.0421829, 0.05452305, -0.22055493, 0.013279097, -0.12875281, 0.02452735, -0.101503745, -0.085946664, 0.05539176, 0.022408713, 0.14837204), target1); + target1 = MulAdd(src[i][j - 1], MF3x4(-0.102643915, -0.011254746, 0.1478563, 0.1030208, 0.12396588, 0.0016621432, 0.2551224, -0.10399001, -0.01068436, 0.07155532, -0.104522154, 0.026937222), target1); + target1 = MulAdd(src[i][j], MF3x4(-0.8789423, 0.35707328, -0.29964274, -0.064913996, 0.4962815, 0.26001287, -0.9511284, 0.49574667, 0.39539725, 0.16308042, 0.119878456, -0.30259115), target1); + target1 = MulAdd(src[i][j + 1], MF3x4(-0.08852938, -0.32612664, -0.006712046, 0.28693515, 0.06320871, -0.3322611, 0.04651086, -0.11020996, 0.01821082, -0.22851005, -0.07803438, 0.021527015), target1); + target1 = MulAdd(src[i + 1][j - 1], MF3x4(0.12295851, -0.011285535, 0.015859747, 0.04005441, -0.018136669, 0.03171969, -0.0406123, -0.10731229, -0.12117574, 0.005033036, 0.047838476, 0.026843475), target1); + target1 = MulAdd(src[i + 1][j], MF3x4(0.4655988, 0.05519082, 0.039515793, 0.28410903, -0.36144528, 0.13039446, 0.11338478, -0.2141387, -0.10026682, -0.07903024, -0.09410254, 0.043833878), target1); + target1 = MulAdd(src[i + 1][j + 1], MF3x4(0.110124744, -0.024725702, 0.028102143, -0.09493807, -0.06455328, -0.15164614, 0.04425987, 0.15483347, -0.045039337, 0.07210396, -0.005390788, -0.03832707), target1); + + MF4 target2 = { 0.019286277, -0.033644073, 0.08196311, 0.0054393094 }; + target2 = MulAdd(src[i - 1][j - 1], MF3x4(-0.012326053, 0.050769784, 0.1278702, -0.100782245, 0.14329414, -0.054558773, 0.023473471, 0.056829426, 0.048292916, 0.0046510273, -0.11478287, 0.0011030561), target2); + target2 = MulAdd(src[i - 1][j], MF3x4(0.29542983, -0.55061895, -0.068554066, 0.1433222, -0.072878316, 0.30201668, -0.2223378, -0.06704077, 0.16955832, 0.3279914, 0.17619601, -0.1276919), target2); + target2 = MulAdd(src[i - 1][j + 1], MF3x4(0.09623417, 0.30559412, 0.094622105, -0.076706685, 0.07943858, -0.084815115, 0.12472551, 0.079850115, -0.13044213, -0.21300878, -0.095747225, 0.13412355), target2); + target2 = MulAdd(src[i][j - 1], MF3x4(0.21291664, 0.17195296, -0.20080926, 0.1064855, 0.10228669, -0.09580175, -0.11217631, -0.09740562, -0.0033135475, -0.053094357, 0.2983595, 0.035281878), target2); + target2 = MulAdd(src[i][j], MF3x4(-0.08955812, -0.45707774, -0.4606922, -0.5754473, -0.11395895, 0.33530128, 0.29705846, -0.18877256, -0.43502945, 0.114171304, -0.3750776, -0.081597246), target2); + target2 = MulAdd(src[i][j + 1], MF3x4(-0.26109028, 0.02662961, -0.10441071, 0.11199392, -0.12038989, -0.09642296, -0.061320662, -0.33058178, 0.20212512, 0.00840794, 0.14357455, -0.038080238), target2); + target2 = MulAdd(src[i + 1][j - 1], MF3x4(-0.09533881, -0.13644339, 0.068756215, 0.079305276, -0.053370547, 0.19572955, 0.0682981, 0.14469264, 0.15582883, -0.057183057, -0.13919263, -0.016394936), target2); + target2 = MulAdd(src[i + 1][j], MF3x4(-0.041189935, 0.39878023, 0.028704925, 0.30194348, -0.04486593, -0.33899093, -0.103968106, 0.21802065, -0.077099144, -0.07389541, 0.18069103, 0.18894517), target2); + target2 = MulAdd(src[i + 1][j + 1], MF3x4(-0.12399862, 0.19246885, 0.034825478, -0.0044787163, 0.13121822, -0.13573012, -0.030162754, 0.1899518, 0.102326415, -0.061512686, -0.005647928, -0.0937634), target2); conv2d_tf[destPos] = target1; conv2d_tf1[destPos] = target2; @@ -204,25 +207,25 @@ void Pass2(uint2 blockStart, uint3 threadId) { // [ a, d, g ] // [ b, e, h ] // [ c, f, i ] - float4 a1 = conv2d_tf.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0); - float4 b1 = conv2d_tf.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); - float4 c1 = conv2d_tf.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); - float4 d1 = conv2d_tf.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); - float4 e1 = conv2d_tf.SampleLevel(sam, pos, 0); - float4 f1 = conv2d_tf.SampleLevel(sam, pos + float2(0, inputPt.y), 0); - float4 g1 = conv2d_tf.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); - float4 h1 = conv2d_tf.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); - float4 i1 = conv2d_tf.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0); - - float4 na1 = max(-a1, 0); - float4 nb1 = max(-b1, 0); - float4 nc1 = max(-c1, 0); - float4 nd1 = max(-d1, 0); - float4 ne1 = max(-e1, 0); - float4 nf1 = max(-f1, 0); - float4 ng1 = max(-g1, 0); - float4 nh1 = max(-h1, 0); - float4 ni1 = max(-i1, 0); + MF4 a1 = conv2d_tf.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0); + MF4 b1 = conv2d_tf.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); + MF4 c1 = conv2d_tf.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); + MF4 d1 = conv2d_tf.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); + MF4 e1 = conv2d_tf.SampleLevel(sam, pos, 0); + MF4 f1 = conv2d_tf.SampleLevel(sam, pos + float2(0, inputPt.y), 0); + MF4 g1 = conv2d_tf.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); + MF4 h1 = conv2d_tf.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); + MF4 i1 = conv2d_tf.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0); + + MF4 na1 = max(-a1, 0); + MF4 nb1 = max(-b1, 0); + MF4 nc1 = max(-c1, 0); + MF4 nd1 = max(-d1, 0); + MF4 ne1 = max(-e1, 0); + MF4 nf1 = max(-f1, 0); + MF4 ng1 = max(-g1, 0); + MF4 nh1 = max(-h1, 0); + MF4 ni1 = max(-i1, 0); a1 = max(a1, 0); b1 = max(b1, 0); @@ -234,25 +237,25 @@ void Pass2(uint2 blockStart, uint3 threadId) { h1 = max(h1, 0); i1 = max(i1, 0); - float4 a2 = conv2d_tf1.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0); - float4 b2 = conv2d_tf1.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); - float4 c2 = conv2d_tf1.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); - float4 d2 = conv2d_tf1.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); - float4 e2 = conv2d_tf1.SampleLevel(sam, pos, 0); - float4 f2 = conv2d_tf1.SampleLevel(sam, pos + float2(0, inputPt.y), 0); - float4 g2 = conv2d_tf1.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); - float4 h2 = conv2d_tf1.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); - float4 i2 = conv2d_tf1.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0); - - float4 na2 = max(-a2, 0); - float4 nb2 = max(-b2, 0); - float4 nc2 = max(-c2, 0); - float4 nd2 = max(-d2, 0); - float4 ne2 = max(-e2, 0); - float4 nf2 = max(-f2, 0); - float4 ng2 = max(-g2, 0); - float4 nh2 = max(-h2, 0); - float4 ni2 = max(-i2, 0); + MF4 a2 = conv2d_tf1.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0); + MF4 b2 = conv2d_tf1.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); + MF4 c2 = conv2d_tf1.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); + MF4 d2 = conv2d_tf1.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); + MF4 e2 = conv2d_tf1.SampleLevel(sam, pos, 0); + MF4 f2 = conv2d_tf1.SampleLevel(sam, pos + float2(0, inputPt.y), 0); + MF4 g2 = conv2d_tf1.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); + MF4 h2 = conv2d_tf1.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); + MF4 i2 = conv2d_tf1.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0); + + MF4 na2 = max(-a2, 0); + MF4 nb2 = max(-b2, 0); + MF4 nc2 = max(-c2, 0); + MF4 nd2 = max(-d2, 0); + MF4 ne2 = max(-e2, 0); + MF4 nf2 = max(-f2, 0); + MF4 ng2 = max(-g2, 0); + MF4 nh2 = max(-h2, 0); + MF4 ni2 = max(-i2, 0); a2 = max(a2, 0); b2 = max(b2, 0); @@ -264,81 +267,81 @@ void Pass2(uint2 blockStart, uint3 threadId) { h2 = max(h2, 0); i2 = max(i2, 0); - float4 target1 = mul(a1, float4x4(-0.04088509, -0.06585775, -0.3094732, 0.12059048, 0.041417453, -0.06144871, -0.06655134, 0.03308842, 0.09287731, 0.010969216, 0.10343026, -0.11185897, 0.05685865, -0.09490512, 0.040908635, 0.03501189)); - target1 += mul(b1, float4x4(-0.04854754, -0.098667145, 0.67147833, -0.11299351, -0.022114437, -0.029202767, 0.014179382, 0.26027945, 0.22076549, -0.16490546, -0.0010764733, 0.08405975, 0.11849154, -0.19072372, -0.35719597, -0.059621073)); - target1 += mul(c1, float4x4(0.079224996, 0.0669873, -0.1718969, -0.05002573, 0.044926763, -0.02904369, 0.017489236, 0.01144465, 0.059109706, 0.064998455, 0.14725484, -0.23879208, 0.039234288, -0.027365638, 0.26172164, -0.094598554)); - target1 += mul(d1, float4x4(-0.07159218, -0.03181544, 0.113837324, -0.053089984, -0.098298974, -0.29500297, 0.1608509, -0.044355504, 0.050882854, -0.19417204, -0.080487266, -0.00879743, 0.0007914453, 0.16640955, 0.21786706, 0.16398212)); - target1 += mul(e1, float4x4(0.16324541, 0.19753313, -0.46597233, -0.2593132, -0.2781038, -0.21973547, -0.024623038, -0.16348499, 0.3628299, 0.044888914, 0.04054647, -0.63605887, 0.02099492, 0.060544077, -0.49359834, 0.36336297)); - target1 += mul(f1, float4x4(-0.16885692, 0.31907207, 0.020906802, 0.13290039, -0.037330728, -0.022859452, -0.020451576, -0.113437995, -0.085683934, 0.054102756, -0.044824492, 0.061346747, -0.038684413, 0.098444365, -0.06734984, -0.17084897)); - target1 += mul(g1, float4x4(-0.015821548, -0.119599186, 0.1614827, 0.08383641, -0.07933593, 0.12528986, -0.06300182, 0.09286327, -0.10199266, 0.02419403, 0.0028411683, -0.09028338, 0.07962534, -0.08676035, -0.19237737, -0.115502626)); - target1 += mul(h1, float4x4(0.09471972, 0.21153462, -0.14393018, 0.055180002, 0.1817461, 0.016607309, -0.0771979, 0.11181317, -0.5491086, -0.102757886, -0.20952754, 0.022466583, -0.075119644, -0.14725658, 0.38451517, 0.12920731)); - target1 += mul(i1, float4x4(0.0867803, 0.114654355, 0.21199988, -0.15367955, -0.01803536, 0.056378633, 0.0018388306, 0.024613786, -0.13306658, 0.017211098, 0.073351346, -0.12064064, -0.10484361, -0.067748636, 0.033206712, -0.13061953)); - target1 += mul(a2, float4x4(-0.002236411, -0.022144757, -0.04586377, 0.101181075, -0.03511624, 0.08440529, 0.18544284, -0.22786349, -0.042184375, 0.015734851, -0.038622506, 0.038529944, -0.09170703, 0.034527462, -0.07817406, 0.10547265)); - target1 += mul(b2, float4x4(-0.12135524, -0.07412039, -0.04979351, -0.082267545, 0.13343571, 0.29196215, -0.4364121, -0.10226428, 0.060835477, -0.23307934, -0.018231759, 0.15550235, 0.09095689, 0.18164484, 0.1322021, -0.022567045)); - target1 += mul(c2, float4x4(-0.0054531163, -0.039762255, -0.030490747, 0.04779882, -0.15290286, 0.056712102, -0.0776974, 0.04114215, 0.15946816, -0.03882117, 0.16770308, -0.026126247, -0.027203865, -0.064107865, -0.13670811, 0.1556276)); - target1 += mul(d2, float4x4(-0.092548385, -0.027285473, 0.084179096, 0.014961629, 0.2564254, 0.07626849, 0.28534448, 0.2588713, -0.018600503, -0.2433456, 0.041392803, -0.045712482, 0.26388907, -0.053502295, 0.14522223, 0.032808404)); - target1 += mul(e2, float4x4(-0.0013780193, 0.3482449, 0.071003586, -0.30707207, -0.05122194, -0.2833618, 0.07910779, 0.051078696, 0.021535402, 0.13021478, 0.022049015, -0.533547, 0.57265025, -0.12843914, -0.14913581, -0.1433724)); - target1 += mul(f2, float4x4(0.07382619, -0.12152924, 0.13364957, 0.181974, 0.15804219, -0.10126773, 0.3029618, -0.12874149, 0.13743348, -0.23245592, -0.20119278, 0.029547188, 0.042436857, 0.04213892, -0.07975374, 0.023821082)); - target1 += mul(g2, float4x4(0.022782229, -0.08359311, -0.060623147, 0.06565042, 0.09828792, 0.044808697, -0.28872305, -0.00092168007, 0.021737702, -0.08698349, 0.1950025, 0.07931995, 0.040952396, -0.07443172, -0.021157127, 0.0056698937)); - target1 += mul(h2, float4x4(-0.09995892, -0.2047294, 0.1414849, 0.062335726, -0.22492298, 0.05269799, -0.029233055, -0.050517935, -0.12534393, -0.12194023, -0.07035469, -0.070764475, 0.18903446, 0.07691209, 0.06153371, 0.011280912)); - target1 += mul(i2, float4x4(-0.036189888, -0.07586571, -0.05888163, 0.010425367, -0.028375402, -0.18870986, -0.19146784, 0.19274063, -0.18856238, 0.0064240266, -0.14537223, -0.06971656, 0.0852742, -0.04866623, -0.031686075, 0.031702038)); - target1 += mul(na1, float4x4(0.0618941, 0.100858234, 0.2628019, -0.048507668, -0.051001363, -0.03195978, 0.035452217, -0.001991919, -0.09649028, -0.047445696, -0.09221298, 0.07602656, -0.02382384, -0.119645916, 0.085616075, -0.07076033)); - target1 += mul(nb1, float4x4(0.019222878, -0.0491929, -0.4902266, 0.18501294, 0.014529614, -0.077125326, 0.011563931, -0.20236616, -0.101982154, -0.021150962, -0.07537948, -0.1540349, 0.028949164, -0.06827332, 0.0067634755, 0.09582376)); - target1 += mul(nc1, float4x4(-0.05995539, -0.031138182, 0.01334257, 0.06827176, -0.030762246, 0.006615233, -0.03562788, 0.016249394, -0.14797118, 0.014671043, -0.09325859, 0.25653747, -0.11474991, 0.05436232, 0.031051394, 0.04179694)); - target1 += mul(nd1, float4x4(0.032279838, -0.030521005, 0.0029688699, 0.005165139, 0.15907808, -0.20421815, -0.07713175, 0.067530625, -0.08619395, 0.026114263, 0.08821273, 0.011591694, 0.018677557, 0.057708874, -0.25859246, -0.18693781)); - target1 += mul(ne1, float4x4(0.10823143, -0.31875235, -0.24394153, -0.0025489891, 0.016761065, -0.19857498, -0.07858479, -0.07811158, -0.38551694, -0.049090322, -0.050053325, 0.23398961, 0.014974165, 0.17498055, 0.29105362, -0.353647)); - target1 += mul(nf1, float4x4(0.05621677, -0.19492444, 0.460332, 0.055917628, -0.06404381, -0.06684098, 0.053624872, 0.057300456, -0.019248677, -0.15110065, 0.032379635, -0.12673225, 0.0068658157, -0.13001235, -0.017716292, 0.064182095)); - target1 += mul(ng1, float4x4(-0.06764552, 0.004707433, -0.13827331, -0.21957871, -0.03789028, -0.04962028, 0.022955444, -0.058468018, 0.13735814, -0.031270552, -0.018490225, 0.0063876202, -0.052979283, -0.030049473, -0.004811771, -0.0044099926)); - target1 += mul(nh1, float4x4(-0.028652798, -0.027029367, 0.62600744, 0.0900562, 0.03869923, -0.20111556, 0.095930666, -0.13164565, 0.5562579, 0.011937122, 0.22882107, 0.030288015, 0.09856272, 0.04736032, -0.077492185, -0.10207275)); - target1 += mul(ni1, float4x4(-0.10581002, -0.16504957, -0.5688921, 0.0414545, 0.04749444, -0.052849945, -0.011017121, -0.025284614, 0.14316759, -0.08547362, -0.09654446, 0.08682504, 0.050776027, 0.0678741, -0.04913651, 0.07527552)); - target1 += mul(na2, float4x4(0.04126091, 0.0048704315, 0.041699376, -0.05820725, -0.09664279, 0.07648305, -0.17979898, 0.11698985, -0.025436765, 0.023232851, 0.010656572, 0.08157569, 0.19584864, -0.022928072, 0.053339157, 0.0039929505)); - target1 += mul(nb2, float4x4(0.040733483, 0.12260473, 0.08071146, 0.07257762, -0.016945919, -0.31637576, -0.24281953, -0.0038469466, -0.10203634, 0.13631973, 0.06505259, -0.13119389, -0.09723076, -0.139551, -0.07504509, 0.08645985)); - target1 += mul(nc2, float4x4(0.017005404, 0.049066268, -0.007544932, -0.04884536, 0.09984347, -0.04447364, 0.4902235, -0.062780835, -0.18389583, 0.07305648, -0.22014385, 0.08004685, 0.0992568, -0.08569604, 0.093966395, -0.07047139)); - target1 += mul(nd2, float4x4(0.0017705248, 0.020553982, -0.09167042, 0.0036356782, -0.11867446, -0.07055574, 0.40252638, 0.09657129, 0.0888632, 0.1031708, -0.022127641, -0.023769693, -0.0861388, 0.13420185, -0.11774454, 0.038774434)); - target1 += mul(ne2, float4x4(-0.15173717, -0.13590458, -0.0891863, 0.12289548, 0.13942605, 0.22152588, -0.19292432, 0.14169839, 0.010543665, 0.07648361, -0.057333756, 0.09535759, -0.053601623, -0.026824495, 0.09365424, 0.17476946)); - target1 += mul(nf2, float4x4(-0.070416056, -0.061970036, -0.039723337, -0.18874651, -0.07098426, -0.019835865, -0.5612458, 0.060437083, -0.03774378, 0.18536821, 0.28587544, 0.035555754, 0.15771326, -0.13527197, 0.13342534, -0.06564073)); - target1 += mul(ng2, float4x4(-0.10967661, 0.025388904, 0.09003177, -0.04087592, 0.09531671, -0.11809294, -0.41613623, 0.038198076, 0.01019813, -0.018864965, -0.18400626, -0.038704176, 0.0105671035, 0.024449013, -0.008989595, -0.027171193)); - target1 += mul(nh2, float4x4(0.16193569, -0.21445285, -0.20130903, -0.13498883, -0.008031679, 0.050757203, 0.78938776, -0.03749514, 0.11998137, 0.19368882, 0.12328945, 0.0058578993, -0.13852382, -0.033867255, -0.018267661, 0.036348555)); - target1 += mul(ni2, float4x4(-0.06254118, 0.087295115, 0.031116437, 0.0416281, 0.061828617, 0.34479564, -0.15537797, -0.17144552, 0.13989387, -0.13792284, 0.056215156, 0.12714528, -0.0198865, 0.04927947, 0.013614583, -0.041810013)); - target1 += float4(-0.044073943, 0.12072677, -0.0022342638, -0.24414532); + MF4 target1 = { -0.044073943, 0.12072677, -0.0022342638, -0.24414532 }; + target1 = MulAdd(a1, MF4x4(-0.04088509, -0.06585775, -0.3094732, 0.12059048, 0.041417453, -0.06144871, -0.06655134, 0.03308842, 0.09287731, 0.010969216, 0.10343026, -0.11185897, 0.05685865, -0.09490512, 0.040908635, 0.03501189), target1); + target1 = MulAdd(b1, MF4x4(-0.04854754, -0.098667145, 0.67147833, -0.11299351, -0.022114437, -0.029202767, 0.014179382, 0.26027945, 0.22076549, -0.16490546, -0.0010764733, 0.08405975, 0.11849154, -0.19072372, -0.35719597, -0.059621073), target1); + target1 = MulAdd(c1, MF4x4(0.079224996, 0.0669873, -0.1718969, -0.05002573, 0.044926763, -0.02904369, 0.017489236, 0.01144465, 0.059109706, 0.064998455, 0.14725484, -0.23879208, 0.039234288, -0.027365638, 0.26172164, -0.094598554), target1); + target1 = MulAdd(d1, MF4x4(-0.07159218, -0.03181544, 0.113837324, -0.053089984, -0.098298974, -0.29500297, 0.1608509, -0.044355504, 0.050882854, -0.19417204, -0.080487266, -0.00879743, 0.0007914453, 0.16640955, 0.21786706, 0.16398212), target1); + target1 = MulAdd(e1, MF4x4(0.16324541, 0.19753313, -0.46597233, -0.2593132, -0.2781038, -0.21973547, -0.024623038, -0.16348499, 0.3628299, 0.044888914, 0.04054647, -0.63605887, 0.02099492, 0.060544077, -0.49359834, 0.36336297), target1); + target1 = MulAdd(f1, MF4x4(-0.16885692, 0.31907207, 0.020906802, 0.13290039, -0.037330728, -0.022859452, -0.020451576, -0.113437995, -0.085683934, 0.054102756, -0.044824492, 0.061346747, -0.038684413, 0.098444365, -0.06734984, -0.17084897), target1); + target1 = MulAdd(g1, MF4x4(-0.015821548, -0.119599186, 0.1614827, 0.08383641, -0.07933593, 0.12528986, -0.06300182, 0.09286327, -0.10199266, 0.02419403, 0.0028411683, -0.09028338, 0.07962534, -0.08676035, -0.19237737, -0.115502626), target1); + target1 = MulAdd(h1, MF4x4(0.09471972, 0.21153462, -0.14393018, 0.055180002, 0.1817461, 0.016607309, -0.0771979, 0.11181317, -0.5491086, -0.102757886, -0.20952754, 0.022466583, -0.075119644, -0.14725658, 0.38451517, 0.12920731), target1); + target1 = MulAdd(i1, MF4x4(0.0867803, 0.114654355, 0.21199988, -0.15367955, -0.01803536, 0.056378633, 0.0018388306, 0.024613786, -0.13306658, 0.017211098, 0.073351346, -0.12064064, -0.10484361, -0.067748636, 0.033206712, -0.13061953), target1); + target1 = MulAdd(a2, MF4x4(-0.002236411, -0.022144757, -0.04586377, 0.101181075, -0.03511624, 0.08440529, 0.18544284, -0.22786349, -0.042184375, 0.015734851, -0.038622506, 0.038529944, -0.09170703, 0.034527462, -0.07817406, 0.10547265), target1); + target1 = MulAdd(b2, MF4x4(-0.12135524, -0.07412039, -0.04979351, -0.082267545, 0.13343571, 0.29196215, -0.4364121, -0.10226428, 0.060835477, -0.23307934, -0.018231759, 0.15550235, 0.09095689, 0.18164484, 0.1322021, -0.022567045), target1); + target1 = MulAdd(c2, MF4x4(-0.0054531163, -0.039762255, -0.030490747, 0.04779882, -0.15290286, 0.056712102, -0.0776974, 0.04114215, 0.15946816, -0.03882117, 0.16770308, -0.026126247, -0.027203865, -0.064107865, -0.13670811, 0.1556276), target1); + target1 = MulAdd(d2, MF4x4(-0.092548385, -0.027285473, 0.084179096, 0.014961629, 0.2564254, 0.07626849, 0.28534448, 0.2588713, -0.018600503, -0.2433456, 0.041392803, -0.045712482, 0.26388907, -0.053502295, 0.14522223, 0.032808404), target1); + target1 = MulAdd(e2, MF4x4(-0.0013780193, 0.3482449, 0.071003586, -0.30707207, -0.05122194, -0.2833618, 0.07910779, 0.051078696, 0.021535402, 0.13021478, 0.022049015, -0.533547, 0.57265025, -0.12843914, -0.14913581, -0.1433724), target1); + target1 = MulAdd(f2, MF4x4(0.07382619, -0.12152924, 0.13364957, 0.181974, 0.15804219, -0.10126773, 0.3029618, -0.12874149, 0.13743348, -0.23245592, -0.20119278, 0.029547188, 0.042436857, 0.04213892, -0.07975374, 0.023821082), target1); + target1 = MulAdd(g2, MF4x4(0.022782229, -0.08359311, -0.060623147, 0.06565042, 0.09828792, 0.044808697, -0.28872305, -0.00092168007, 0.021737702, -0.08698349, 0.1950025, 0.07931995, 0.040952396, -0.07443172, -0.021157127, 0.0056698937), target1); + target1 = MulAdd(h2, MF4x4(-0.09995892, -0.2047294, 0.1414849, 0.062335726, -0.22492298, 0.05269799, -0.029233055, -0.050517935, -0.12534393, -0.12194023, -0.07035469, -0.070764475, 0.18903446, 0.07691209, 0.06153371, 0.011280912), target1); + target1 = MulAdd(i2, MF4x4(-0.036189888, -0.07586571, -0.05888163, 0.010425367, -0.028375402, -0.18870986, -0.19146784, 0.19274063, -0.18856238, 0.0064240266, -0.14537223, -0.06971656, 0.0852742, -0.04866623, -0.031686075, 0.031702038), target1); + target1 = MulAdd(na1, MF4x4(0.0618941, 0.100858234, 0.2628019, -0.048507668, -0.051001363, -0.03195978, 0.035452217, -0.001991919, -0.09649028, -0.047445696, -0.09221298, 0.07602656, -0.02382384, -0.119645916, 0.085616075, -0.07076033), target1); + target1 = MulAdd(nb1, MF4x4(0.019222878, -0.0491929, -0.4902266, 0.18501294, 0.014529614, -0.077125326, 0.011563931, -0.20236616, -0.101982154, -0.021150962, -0.07537948, -0.1540349, 0.028949164, -0.06827332, 0.0067634755, 0.09582376), target1); + target1 = MulAdd(nc1, MF4x4(-0.05995539, -0.031138182, 0.01334257, 0.06827176, -0.030762246, 0.006615233, -0.03562788, 0.016249394, -0.14797118, 0.014671043, -0.09325859, 0.25653747, -0.11474991, 0.05436232, 0.031051394, 0.04179694), target1); + target1 = MulAdd(nd1, MF4x4(0.032279838, -0.030521005, 0.0029688699, 0.005165139, 0.15907808, -0.20421815, -0.07713175, 0.067530625, -0.08619395, 0.026114263, 0.08821273, 0.011591694, 0.018677557, 0.057708874, -0.25859246, -0.18693781), target1); + target1 = MulAdd(ne1, MF4x4(0.10823143, -0.31875235, -0.24394153, -0.0025489891, 0.016761065, -0.19857498, -0.07858479, -0.07811158, -0.38551694, -0.049090322, -0.050053325, 0.23398961, 0.014974165, 0.17498055, 0.29105362, -0.353647), target1); + target1 = MulAdd(nf1, MF4x4(0.05621677, -0.19492444, 0.460332, 0.055917628, -0.06404381, -0.06684098, 0.053624872, 0.057300456, -0.019248677, -0.15110065, 0.032379635, -0.12673225, 0.0068658157, -0.13001235, -0.017716292, 0.064182095), target1); + target1 = MulAdd(ng1, MF4x4(-0.06764552, 0.004707433, -0.13827331, -0.21957871, -0.03789028, -0.04962028, 0.022955444, -0.058468018, 0.13735814, -0.031270552, -0.018490225, 0.0063876202, -0.052979283, -0.030049473, -0.004811771, -0.0044099926), target1); + target1 = MulAdd(nh1, MF4x4(-0.028652798, -0.027029367, 0.62600744, 0.0900562, 0.03869923, -0.20111556, 0.095930666, -0.13164565, 0.5562579, 0.011937122, 0.22882107, 0.030288015, 0.09856272, 0.04736032, -0.077492185, -0.10207275), target1); + target1 = MulAdd(ni1, MF4x4(-0.10581002, -0.16504957, -0.5688921, 0.0414545, 0.04749444, -0.052849945, -0.011017121, -0.025284614, 0.14316759, -0.08547362, -0.09654446, 0.08682504, 0.050776027, 0.0678741, -0.04913651, 0.07527552), target1); + target1 = MulAdd(na2, MF4x4(0.04126091, 0.0048704315, 0.041699376, -0.05820725, -0.09664279, 0.07648305, -0.17979898, 0.11698985, -0.025436765, 0.023232851, 0.010656572, 0.08157569, 0.19584864, -0.022928072, 0.053339157, 0.0039929505), target1); + target1 = MulAdd(nb2, MF4x4(0.040733483, 0.12260473, 0.08071146, 0.07257762, -0.016945919, -0.31637576, -0.24281953, -0.0038469466, -0.10203634, 0.13631973, 0.06505259, -0.13119389, -0.09723076, -0.139551, -0.07504509, 0.08645985), target1); + target1 = MulAdd(nc2, MF4x4(0.017005404, 0.049066268, -0.007544932, -0.04884536, 0.09984347, -0.04447364, 0.4902235, -0.062780835, -0.18389583, 0.07305648, -0.22014385, 0.08004685, 0.0992568, -0.08569604, 0.093966395, -0.07047139), target1); + target1 = MulAdd(nd2, MF4x4(0.0017705248, 0.020553982, -0.09167042, 0.0036356782, -0.11867446, -0.07055574, 0.40252638, 0.09657129, 0.0888632, 0.1031708, -0.022127641, -0.023769693, -0.0861388, 0.13420185, -0.11774454, 0.038774434), target1); + target1 = MulAdd(ne2, MF4x4(-0.15173717, -0.13590458, -0.0891863, 0.12289548, 0.13942605, 0.22152588, -0.19292432, 0.14169839, 0.010543665, 0.07648361, -0.057333756, 0.09535759, -0.053601623, -0.026824495, 0.09365424, 0.17476946), target1); + target1 = MulAdd(nf2, MF4x4(-0.070416056, -0.061970036, -0.039723337, -0.18874651, -0.07098426, -0.019835865, -0.5612458, 0.060437083, -0.03774378, 0.18536821, 0.28587544, 0.035555754, 0.15771326, -0.13527197, 0.13342534, -0.06564073), target1); + target1 = MulAdd(ng2, MF4x4(-0.10967661, 0.025388904, 0.09003177, -0.04087592, 0.09531671, -0.11809294, -0.41613623, 0.038198076, 0.01019813, -0.018864965, -0.18400626, -0.038704176, 0.0105671035, 0.024449013, -0.008989595, -0.027171193), target1); + target1 = MulAdd(nh2, MF4x4(0.16193569, -0.21445285, -0.20130903, -0.13498883, -0.008031679, 0.050757203, 0.78938776, -0.03749514, 0.11998137, 0.19368882, 0.12328945, 0.0058578993, -0.13852382, -0.033867255, -0.018267661, 0.036348555), target1); + target1 = MulAdd(ni2, MF4x4(-0.06254118, 0.087295115, 0.031116437, 0.0416281, 0.061828617, 0.34479564, -0.15537797, -0.17144552, 0.13989387, -0.13792284, 0.056215156, 0.12714528, -0.0198865, 0.04927947, 0.013614583, -0.041810013), target1); - float4 target2 = mul(a1, float4x4(0.07115729, 0.01065505, 0.19167988, -0.02504489, -0.15064801, 0.079008736, 0.05437936, 0.027479589, -0.021383656, 0.032731537, -0.06657876, 0.022649521, -0.06501893, -0.02335689, 0.010445489, -0.05430297)); - target2 += mul(b1, float4x4(-0.1178601, 0.07425715, 0.063272275, -0.18308601, -0.13955134, 0.005166404, -0.022591779, -0.016827974, -0.024990188, -0.13372071, -0.056342285, 0.12489847, 0.081861794, -0.07083351, 0.021897513, 0.0629395)); - target2 += mul(c1, float4x4(0.051357627, -0.13874975, -0.09887168, -0.011908862, 0.03639772, -0.13195883, -0.05321156, 0.03913229, -0.08160194, -0.07128151, 0.043625016, 0.11966009, 0.03162217, 0.018834392, -0.0625129, 0.10726711)); - target2 += mul(d1, float4x4(-0.15922394, -0.043482754, -0.22571066, 0.009280428, -0.3882705, 0.08418719, 0.15329506, -0.028419001, -0.011272379, 0.15897545, 0.041217074, -0.0143014155, 0.09451862, -0.056342427, -0.14568482, 0.05556279)); - target2 += mul(e1, float4x4(0.13879324, -0.23339099, -0.24573983, -0.09575104, 0.03823306, 0.4752516, -0.1696623, -0.18472373, -0.1510259, 0.23040254, 0.4196143, 0.3462817, 0.035172507, 0.18228662, 0.22475636, -0.19945027)); - target2 += mul(f1, float4x4(-0.08876766, 0.19567333, 0.25174314, -0.09637879, -0.007957943, 0.13510521, 0.030193076, -0.0018362573, -0.006884444, -0.41804117, -0.1026309, -0.053339038, -0.1283198, -0.03033918, 0.055674326, 0.094377995)); - target2 += mul(g1, float4x4(0.06780768, -0.07774435, -0.0616546, -0.046531744, -0.11723141, 0.10792474, 0.013314576, -0.031451598, -0.009870351, 0.10215877, -0.13101454, -0.19878799, -0.09712651, 0.10423937, 0.14170039, -0.03359521)); - target2 += mul(h1, float4x4(-0.020114673, -0.015194169, 0.03657608, 0.17162928, 0.070458665, -0.08041664, 0.14067306, 0.19699603, -0.28763783, -0.033556152, -0.6588468, -0.48221052, -0.123711474, -0.080758795, -0.3187303, 0.121004865)); - target2 += mul(i1, float4x4(-0.074900605, 0.09297913, -0.08621144, 0.116730206, -0.034766622, -0.10381484, 0.060793545, -0.014790814, -0.123858415, -0.0010626495, 0.20547503, -0.07206306, -0.17324795, 0.023932874, 0.017495958, -0.09924652)); - target2 += mul(a2, float4x4(-0.015568068, 0.005394868, 0.15463537, 0.06416607, -0.045670815, -0.013540727, -0.12960619, 0.0006581649, 0.09432853, 0.05575682, -0.022219105, 0.022416297, 0.0148129435, -0.067619696, 0.022989385, -0.09695771)); - target2 += mul(b2, float4x4(-0.107209, 0.07072438, -0.10235772, -0.12078849, -0.02751833, -0.043195058, -0.17197154, 0.120612316, -0.17310137, -0.09429793, 0.06511165, 0.18072544, -0.21168593, 0.16383737, 0.25012484, -0.089589044)); - target2 += mul(c2, float4x4(0.005439779, 0.0028433986, -0.09885586, -0.06572956, -0.0061691296, 0.15485546, -0.23724958, 0.004232802, 0.07794742, -0.012552598, 0.07554404, 0.10843201, -0.013223918, -0.08705092, -0.23228747, 0.03599732)); - target2 += mul(d2, float4x4(-0.043396916, -0.10680695, -0.019935586, -0.06703658, -0.30075943, -0.010179525, 0.30197874, 0.04888297, 0.00779067, 0.22583807, 0.2039884, -0.0074303118, -0.19240093, -0.024718538, 0.057117213, 0.19431825)); - target2 += mul(e2, float4x4(-0.37633005, 0.043971814, -0.21423087, 0.118503235, -0.15058799, 0.115756795, -0.13719647, 0.020510519, 0.1123193, 0.14797291, 0.05467349, 0.2039607, -0.31973588, 0.1667847, -0.017739004, -0.11280262)); - target2 += mul(f2, float4x4(-0.0084394775, -0.1281101, -0.20841378, 0.01986435, -0.04122467, -0.21089631, -0.08062371, 0.11315133, 0.05693114, -0.23773515, 0.03792205, -0.008872407, 0.04554895, -0.10683658, 0.10683206, 0.06875721)); - target2 += mul(g2, float4x4(-0.103948504, -0.007483217, -0.12571928, 0.054868475, -0.030646881, -0.010098222, 0.019018777, -0.07072212, -0.10689893, 0.16498323, 0.048089568, -0.10912806, -0.027318537, -0.025491163, 0.012588013, 0.072701246)); - target2 += mul(h2, float4x4(0.14094622, -0.028118243, 0.016804086, -0.18000692, 0.33351874, 0.14980756, -0.07135749, -0.16573106, -0.17243773, 0.054617904, -0.2933543, -0.12602285, 0.08480712, -0.05704333, 0.22336398, 0.026583148)); - target2 += mul(i2, float4x4(0.046759557, -0.03100408, 0.40000245, -0.08521555, 0.19592628, -0.15150753, 0.25288078, -0.061794683, -0.047818147, -0.12249124, 0.020410215, -0.11503924, 0.046108168, 0.030459814, -0.14096366, 0.09120256)); - target2 += mul(na1, float4x4(-0.087491795, -0.024289595, -0.09060237, 0.020922959, 0.09557061, -0.08556962, -0.0503455, -0.010846053, 0.0030694185, -0.008256591, 0.08290225, -0.034981687, 0.07342003, -0.021816112, -0.13905519, -0.06265962)); - target2 += mul(nb1, float4x4(-0.08126147, -0.05866924, -0.015698025, 0.093630895, -0.02379264, 0.115918085, 0.19431724, 0.041815966, -0.051647816, 0.15277039, -0.03721037, -0.085520886, 0.041766718, 0.104392216, 0.0559556, 0.0049254233)); - target2 += mul(nc1, float4x4(-0.11176419, 0.112272635, 0.1367475, -0.010482275, -0.06719008, 0.064003386, -0.08132314, 0.015465676, 0.052741583, 0.06779717, 0.038533892, -0.16428822, 0.040990274, 0.002559234, 0.097567044, -0.058192518)); - target2 += mul(nd1, float4x4(0.17228632, 0.008296625, 0.009418271, 0.037103783, -0.0601486, 0.04531715, 0.19613501, 0.112170085, -0.02256726, -0.093685195, -0.1341531, -0.038480807, 0.109840475, 0.062418167, 0.15140085, 0.050787117)); - target2 += mul(ne1, float4x4(0.15433665, 0.2104034, 0.12395812, 0.13799714, 0.14945604, 0.67457545, 0.27575177, -0.047493283, 0.24992993, -0.5305435, 0.0131732905, -0.36911693, 0.14442082, -0.18583177, -0.2861722, 0.19419897)); - target2 += mul(nf1, float4x4(0.040242445, -0.13234852, 0.10056324, 0.055854917, 0.07447713, -0.023067042, 0.00021051937, -0.0495407, -0.22037992, 0.68047297, 0.05774606, -0.012461005, 0.104557075, 0.04832623, 0.010292581, -0.050617047)); - target2 += mul(ng1, float4x4(-0.060079176, 0.086553656, 0.0060872175, -0.012576339, 0.025149338, -0.07379716, -0.18048704, -0.007130346, 0.007826557, -0.095655076, -0.0032888134, 0.21027069, -0.09868755, -0.1180311, 0.0081250835, -0.05078016)); - target2 += mul(nh1, float4x4(0.19124818, -0.05949092, -0.36762074, -0.08203597, -0.10276991, 0.111005515, -0.2845309, 0.113985784, 0.07206471, -0.026585411, 0.20032002, 0.5691625, -0.0460136, 0.03874166, 0.09858682, -0.15913802)); - target2 += mul(ni1, float4x4(-0.00397842, -0.014763085, 0.080231026, -0.09142265, 0.03637215, 0.064106315, -0.030963007, 0.0557953, 0.04173885, -0.024534896, -0.2092259, 0.06913638, 0.08103145, -0.0033994897, -0.10903093, 0.062850125)); - target2 += mul(na2, float4x4(0.01206918, 0.024855271, -0.051995132, 0.013999821, -0.021517826, 0.06216198, -0.050853133, -0.064136736, -0.047408275, -0.07858566, 0.074464396, -0.038218755, -0.13216262, 0.008905726, 0.10333, 0.03049554)); - target2 += mul(nb2, float4x4(-0.027152343, -0.069046065, -0.013017797, 0.0763, -0.08611993, -0.020867927, 0.012807627, -0.11971997, 0.025972975, 0.095127404, -0.070044935, -0.21399231, -0.22536097, -0.028828809, 0.123399965, -0.15967365)); - target2 += mul(nc2, float4x4(0.038314234, -0.014114242, 0.012115026, 0.05505015, 0.11785298, -0.08772618, 0.034408223, 0.09134674, -0.04727011, 0.020709611, -0.01780165, -0.14374214, -0.30412516, -0.011123043, -0.024216317, -0.007538433)); - target2 += mul(nd2, float4x4(-0.17673545, 0.077738725, 0.056153737, 0.028693894, 0.05688375, 0.021928595, 0.014585902, 0.019364892, 0.029056642, -0.2072201, -0.17548367, 0.085471265, 0.16439342, -0.0052957633, 0.22321554, -0.19246858)); - target2 += mul(ne2, float4x4(0.1914782, -0.15620962, -0.16686897, -0.04141303, 0.07696967, -0.013115313, -0.057627093, -0.13849305, 0.08699377, -0.07339016, -0.053074118, -0.059418138, 0.19988623, -0.23852244, -0.12574267, -0.29139704)); - target2 += mul(nf2, float4x4(-0.017691063, 0.18901291, 0.16250716, -0.11039392, 0.056900974, 0.036662772, -0.13399602, -0.11378214, -0.10924602, 0.2130181, -0.042094063, -0.012445028, 0.013168919, 0.119448364, -0.014406005, 0.0054324497)); - target2 += mul(ng2, float4x4(0.11552786, 0.090796515, -0.11559005, -0.035706047, -0.044022456, -0.027642358, 0.08824298, 0.035067793, 0.18125483, -0.15502097, 0.094219126, 0.07493505, 0.022493582, 0.038250685, -0.076567575, -0.059311453)); - target2 += mul(nh2, float4x4(-0.08612596, 0.016376335, -0.0023271537, 0.32511148, 0.03789289, 0.13106889, 0.22370385, 0.21145949, 0.1844514, -0.0766592, 0.093758754, 0.13821359, -0.062405586, 0.0028724174, -0.13588348, 0.00024406122)); - target2 += mul(ni2, float4x4(-0.08991004, 0.074423954, -0.020964831, -0.070288494, -0.1192369, -0.015506713, -0.28136373, 0.042911243, 0.08215164, 0.11065419, -0.006201638, 0.057742044, 0.0014476188, -0.01443158, 0.22631277, -0.06787264)); - target2 += float4(-0.07235962, -0.019149294, 0.05072898, 0.03962245); + MF4 target2 = { -0.07235962, -0.019149294, 0.05072898, 0.03962245 }; + target2 = MulAdd(a1, MF4x4(0.07115729, 0.01065505, 0.19167988, -0.02504489, -0.15064801, 0.079008736, 0.05437936, 0.027479589, -0.021383656, 0.032731537, -0.06657876, 0.022649521, -0.06501893, -0.02335689, 0.010445489, -0.05430297), target2); + target2 = MulAdd(b1, MF4x4(-0.1178601, 0.07425715, 0.063272275, -0.18308601, -0.13955134, 0.005166404, -0.022591779, -0.016827974, -0.024990188, -0.13372071, -0.056342285, 0.12489847, 0.081861794, -0.07083351, 0.021897513, 0.0629395), target2); + target2 = MulAdd(c1, MF4x4(0.051357627, -0.13874975, -0.09887168, -0.011908862, 0.03639772, -0.13195883, -0.05321156, 0.03913229, -0.08160194, -0.07128151, 0.043625016, 0.11966009, 0.03162217, 0.018834392, -0.0625129, 0.10726711), target2); + target2 = MulAdd(d1, MF4x4(-0.15922394, -0.043482754, -0.22571066, 0.009280428, -0.3882705, 0.08418719, 0.15329506, -0.028419001, -0.011272379, 0.15897545, 0.041217074, -0.0143014155, 0.09451862, -0.056342427, -0.14568482, 0.05556279), target2); + target2 = MulAdd(e1, MF4x4(0.13879324, -0.23339099, -0.24573983, -0.09575104, 0.03823306, 0.4752516, -0.1696623, -0.18472373, -0.1510259, 0.23040254, 0.4196143, 0.3462817, 0.035172507, 0.18228662, 0.22475636, -0.19945027), target2); + target2 = MulAdd(f1, MF4x4(-0.08876766, 0.19567333, 0.25174314, -0.09637879, -0.007957943, 0.13510521, 0.030193076, -0.0018362573, -0.006884444, -0.41804117, -0.1026309, -0.053339038, -0.1283198, -0.03033918, 0.055674326, 0.094377995), target2); + target2 = MulAdd(g1, MF4x4(0.06780768, -0.07774435, -0.0616546, -0.046531744, -0.11723141, 0.10792474, 0.013314576, -0.031451598, -0.009870351, 0.10215877, -0.13101454, -0.19878799, -0.09712651, 0.10423937, 0.14170039, -0.03359521), target2); + target2 = MulAdd(h1, MF4x4(-0.020114673, -0.015194169, 0.03657608, 0.17162928, 0.070458665, -0.08041664, 0.14067306, 0.19699603, -0.28763783, -0.033556152, -0.6588468, -0.48221052, -0.123711474, -0.080758795, -0.3187303, 0.121004865), target2); + target2 = MulAdd(i1, MF4x4(-0.074900605, 0.09297913, -0.08621144, 0.116730206, -0.034766622, -0.10381484, 0.060793545, -0.014790814, -0.123858415, -0.0010626495, 0.20547503, -0.07206306, -0.17324795, 0.023932874, 0.017495958, -0.09924652), target2); + target2 = MulAdd(a2, MF4x4(-0.015568068, 0.005394868, 0.15463537, 0.06416607, -0.045670815, -0.013540727, -0.12960619, 0.0006581649, 0.09432853, 0.05575682, -0.022219105, 0.022416297, 0.0148129435, -0.067619696, 0.022989385, -0.09695771), target2); + target2 = MulAdd(b2, MF4x4(-0.107209, 0.07072438, -0.10235772, -0.12078849, -0.02751833, -0.043195058, -0.17197154, 0.120612316, -0.17310137, -0.09429793, 0.06511165, 0.18072544, -0.21168593, 0.16383737, 0.25012484, -0.089589044), target2); + target2 = MulAdd(c2, MF4x4(0.005439779, 0.0028433986, -0.09885586, -0.06572956, -0.0061691296, 0.15485546, -0.23724958, 0.004232802, 0.07794742, -0.012552598, 0.07554404, 0.10843201, -0.013223918, -0.08705092, -0.23228747, 0.03599732), target2); + target2 = MulAdd(d2, MF4x4(-0.043396916, -0.10680695, -0.019935586, -0.06703658, -0.30075943, -0.010179525, 0.30197874, 0.04888297, 0.00779067, 0.22583807, 0.2039884, -0.0074303118, -0.19240093, -0.024718538, 0.057117213, 0.19431825), target2); + target2 = MulAdd(e2, MF4x4(-0.37633005, 0.043971814, -0.21423087, 0.118503235, -0.15058799, 0.115756795, -0.13719647, 0.020510519, 0.1123193, 0.14797291, 0.05467349, 0.2039607, -0.31973588, 0.1667847, -0.017739004, -0.11280262), target2); + target2 = MulAdd(f2, MF4x4(-0.0084394775, -0.1281101, -0.20841378, 0.01986435, -0.04122467, -0.21089631, -0.08062371, 0.11315133, 0.05693114, -0.23773515, 0.03792205, -0.008872407, 0.04554895, -0.10683658, 0.10683206, 0.06875721), target2); + target2 = MulAdd(g2, MF4x4(-0.103948504, -0.007483217, -0.12571928, 0.054868475, -0.030646881, -0.010098222, 0.019018777, -0.07072212, -0.10689893, 0.16498323, 0.048089568, -0.10912806, -0.027318537, -0.025491163, 0.012588013, 0.072701246), target2); + target2 = MulAdd(h2, MF4x4(0.14094622, -0.028118243, 0.016804086, -0.18000692, 0.33351874, 0.14980756, -0.07135749, -0.16573106, -0.17243773, 0.054617904, -0.2933543, -0.12602285, 0.08480712, -0.05704333, 0.22336398, 0.026583148), target2); + target2 = MulAdd(i2, MF4x4(0.046759557, -0.03100408, 0.40000245, -0.08521555, 0.19592628, -0.15150753, 0.25288078, -0.061794683, -0.047818147, -0.12249124, 0.020410215, -0.11503924, 0.046108168, 0.030459814, -0.14096366, 0.09120256), target2); + target2 = MulAdd(na1, MF4x4(-0.087491795, -0.024289595, -0.09060237, 0.020922959, 0.09557061, -0.08556962, -0.0503455, -0.010846053, 0.0030694185, -0.008256591, 0.08290225, -0.034981687, 0.07342003, -0.021816112, -0.13905519, -0.06265962), target2); + target2 = MulAdd(nb1, MF4x4(-0.08126147, -0.05866924, -0.015698025, 0.093630895, -0.02379264, 0.115918085, 0.19431724, 0.041815966, -0.051647816, 0.15277039, -0.03721037, -0.085520886, 0.041766718, 0.104392216, 0.0559556, 0.0049254233), target2); + target2 = MulAdd(nc1, MF4x4(-0.11176419, 0.112272635, 0.1367475, -0.010482275, -0.06719008, 0.064003386, -0.08132314, 0.015465676, 0.052741583, 0.06779717, 0.038533892, -0.16428822, 0.040990274, 0.002559234, 0.097567044, -0.058192518), target2); + target2 = MulAdd(nd1, MF4x4(0.17228632, 0.008296625, 0.009418271, 0.037103783, -0.0601486, 0.04531715, 0.19613501, 0.112170085, -0.02256726, -0.093685195, -0.1341531, -0.038480807, 0.109840475, 0.062418167, 0.15140085, 0.050787117), target2); + target2 = MulAdd(ne1, MF4x4(0.15433665, 0.2104034, 0.12395812, 0.13799714, 0.14945604, 0.67457545, 0.27575177, -0.047493283, 0.24992993, -0.5305435, 0.0131732905, -0.36911693, 0.14442082, -0.18583177, -0.2861722, 0.19419897), target2); + target2 = MulAdd(nf1, MF4x4(0.040242445, -0.13234852, 0.10056324, 0.055854917, 0.07447713, -0.023067042, 0.00021051937, -0.0495407, -0.22037992, 0.68047297, 0.05774606, -0.012461005, 0.104557075, 0.04832623, 0.010292581, -0.050617047), target2); + target2 = MulAdd(ng1, MF4x4(-0.060079176, 0.086553656, 0.0060872175, -0.012576339, 0.025149338, -0.07379716, -0.18048704, -0.007130346, 0.007826557, -0.095655076, -0.0032888134, 0.21027069, -0.09868755, -0.1180311, 0.0081250835, -0.05078016), target2); + target2 = MulAdd(nh1, MF4x4(0.19124818, -0.05949092, -0.36762074, -0.08203597, -0.10276991, 0.111005515, -0.2845309, 0.113985784, 0.07206471, -0.026585411, 0.20032002, 0.5691625, -0.0460136, 0.03874166, 0.09858682, -0.15913802), target2); + target2 = MulAdd(ni1, MF4x4(-0.00397842, -0.014763085, 0.080231026, -0.09142265, 0.03637215, 0.064106315, -0.030963007, 0.0557953, 0.04173885, -0.024534896, -0.2092259, 0.06913638, 0.08103145, -0.0033994897, -0.10903093, 0.062850125), target2); + target2 = MulAdd(na2, MF4x4(0.01206918, 0.024855271, -0.051995132, 0.013999821, -0.021517826, 0.06216198, -0.050853133, -0.064136736, -0.047408275, -0.07858566, 0.074464396, -0.038218755, -0.13216262, 0.008905726, 0.10333, 0.03049554), target2); + target2 = MulAdd(nb2, MF4x4(-0.027152343, -0.069046065, -0.013017797, 0.0763, -0.08611993, -0.020867927, 0.012807627, -0.11971997, 0.025972975, 0.095127404, -0.070044935, -0.21399231, -0.22536097, -0.028828809, 0.123399965, -0.15967365), target2); + target2 = MulAdd(nc2, MF4x4(0.038314234, -0.014114242, 0.012115026, 0.05505015, 0.11785298, -0.08772618, 0.034408223, 0.09134674, -0.04727011, 0.020709611, -0.01780165, -0.14374214, -0.30412516, -0.011123043, -0.024216317, -0.007538433), target2); + target2 = MulAdd(nd2, MF4x4(-0.17673545, 0.077738725, 0.056153737, 0.028693894, 0.05688375, 0.021928595, 0.014585902, 0.019364892, 0.029056642, -0.2072201, -0.17548367, 0.085471265, 0.16439342, -0.0052957633, 0.22321554, -0.19246858), target2); + target2 = MulAdd(ne2, MF4x4(0.1914782, -0.15620962, -0.16686897, -0.04141303, 0.07696967, -0.013115313, -0.057627093, -0.13849305, 0.08699377, -0.07339016, -0.053074118, -0.059418138, 0.19988623, -0.23852244, -0.12574267, -0.29139704), target2); + target2 = MulAdd(nf2, MF4x4(-0.017691063, 0.18901291, 0.16250716, -0.11039392, 0.056900974, 0.036662772, -0.13399602, -0.11378214, -0.10924602, 0.2130181, -0.042094063, -0.012445028, 0.013168919, 0.119448364, -0.014406005, 0.0054324497), target2); + target2 = MulAdd(ng2, MF4x4(0.11552786, 0.090796515, -0.11559005, -0.035706047, -0.044022456, -0.027642358, 0.08824298, 0.035067793, 0.18125483, -0.15502097, 0.094219126, 0.07493505, 0.022493582, 0.038250685, -0.076567575, -0.059311453), target2); + target2 = MulAdd(nh2, MF4x4(-0.08612596, 0.016376335, -0.0023271537, 0.32511148, 0.03789289, 0.13106889, 0.22370385, 0.21145949, 0.1844514, -0.0766592, 0.093758754, 0.13821359, -0.062405586, 0.0028724174, -0.13588348, 0.00024406122), target2); + target2 = MulAdd(ni2, MF4x4(-0.08991004, 0.074423954, -0.020964831, -0.070288494, -0.1192369, -0.015506713, -0.28136373, 0.042911243, 0.08215164, 0.11065419, -0.006201638, 0.057742044, 0.0014476188, -0.01443158, 0.22631277, -0.06787264), target2); conv2d_1_tf[gxy] = target1; conv2d_1_tf1[gxy] = target2; @@ -364,25 +367,25 @@ void Pass3(uint2 blockStart, uint3 threadId) { // [ a, d, g ] // [ b, e, h ] // [ c, f, i ] - float4 a1 = conv2d_1_tf.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0); - float4 b1 = conv2d_1_tf.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); - float4 c1 = conv2d_1_tf.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); - float4 d1 = conv2d_1_tf.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); - float4 e1 = conv2d_1_tf.SampleLevel(sam, pos, 0); - float4 f1 = conv2d_1_tf.SampleLevel(sam, pos + float2(0, inputPt.y), 0); - float4 g1 = conv2d_1_tf.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); - float4 h1 = conv2d_1_tf.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); - float4 i1 = conv2d_1_tf.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0); - - float4 na1 = max(-a1, 0); - float4 nb1 = max(-b1, 0); - float4 nc1 = max(-c1, 0); - float4 nd1 = max(-d1, 0); - float4 ne1 = max(-e1, 0); - float4 nf1 = max(-f1, 0); - float4 ng1 = max(-g1, 0); - float4 nh1 = max(-h1, 0); - float4 ni1 = max(-i1, 0); + MF4 a1 = conv2d_1_tf.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0); + MF4 b1 = conv2d_1_tf.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); + MF4 c1 = conv2d_1_tf.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); + MF4 d1 = conv2d_1_tf.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); + MF4 e1 = conv2d_1_tf.SampleLevel(sam, pos, 0); + MF4 f1 = conv2d_1_tf.SampleLevel(sam, pos + float2(0, inputPt.y), 0); + MF4 g1 = conv2d_1_tf.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); + MF4 h1 = conv2d_1_tf.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); + MF4 i1 = conv2d_1_tf.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0); + + MF4 na1 = max(-a1, 0); + MF4 nb1 = max(-b1, 0); + MF4 nc1 = max(-c1, 0); + MF4 nd1 = max(-d1, 0); + MF4 ne1 = max(-e1, 0); + MF4 nf1 = max(-f1, 0); + MF4 ng1 = max(-g1, 0); + MF4 nh1 = max(-h1, 0); + MF4 ni1 = max(-i1, 0); a1 = max(a1, 0); b1 = max(b1, 0); @@ -394,25 +397,25 @@ void Pass3(uint2 blockStart, uint3 threadId) { h1 = max(h1, 0); i1 = max(i1, 0); - float4 a2 = conv2d_1_tf1.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0); - float4 b2 = conv2d_1_tf1.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); - float4 c2 = conv2d_1_tf1.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); - float4 d2 = conv2d_1_tf1.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); - float4 e2 = conv2d_1_tf1.SampleLevel(sam, pos, 0); - float4 f2 = conv2d_1_tf1.SampleLevel(sam, pos + float2(0, inputPt.y), 0); - float4 g2 = conv2d_1_tf1.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); - float4 h2 = conv2d_1_tf1.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); - float4 i2 = conv2d_1_tf1.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0); - - float4 na2 = max(-a2, 0); - float4 nb2 = max(-b2, 0); - float4 nc2 = max(-c2, 0); - float4 nd2 = max(-d2, 0); - float4 ne2 = max(-e2, 0); - float4 nf2 = max(-f2, 0); - float4 ng2 = max(-g2, 0); - float4 nh2 = max(-h2, 0); - float4 ni2 = max(-i2, 0); + MF4 a2 = conv2d_1_tf1.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0); + MF4 b2 = conv2d_1_tf1.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); + MF4 c2 = conv2d_1_tf1.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); + MF4 d2 = conv2d_1_tf1.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); + MF4 e2 = conv2d_1_tf1.SampleLevel(sam, pos, 0); + MF4 f2 = conv2d_1_tf1.SampleLevel(sam, pos + float2(0, inputPt.y), 0); + MF4 g2 = conv2d_1_tf1.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); + MF4 h2 = conv2d_1_tf1.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); + MF4 i2 = conv2d_1_tf1.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0); + + MF4 na2 = max(-a2, 0); + MF4 nb2 = max(-b2, 0); + MF4 nc2 = max(-c2, 0); + MF4 nd2 = max(-d2, 0); + MF4 ne2 = max(-e2, 0); + MF4 nf2 = max(-f2, 0); + MF4 ng2 = max(-g2, 0); + MF4 nh2 = max(-h2, 0); + MF4 ni2 = max(-i2, 0); a2 = max(a2, 0); b2 = max(b2, 0); @@ -424,81 +427,81 @@ void Pass3(uint2 blockStart, uint3 threadId) { h2 = max(h2, 0); i2 = max(i2, 0); - float4 target1 = mul(a1, float4x4(0.14315613, -0.031299837, -0.011195234, 0.0073360316, 0.07264984, -0.110979274, 0.06560588, -0.040463638, 0.28964168, -0.05644335, -0.060729366, -0.15811591, 0.028339373, 0.027486937, 0.0360574, 0.05856459)); - target1 += mul(b1, float4x4(0.16211128, 0.20672597, -0.30374205, -0.056202736, -0.10893948, 0.053066984, -0.18297112, 0.028844962, 0.22754766, -0.07141921, 0.07142953, -0.1357581, 0.008053467, 0.04668908, 0.17258649, 0.22506891)); - target1 += mul(c1, float4x4(0.07014762, 0.032112304, 0.028849715, 0.09427007, 0.008323501, -0.085777245, 0.083501115, -0.16150802, 0.24127382, -0.1305689, -0.027557204, -0.15057805, 0.09748757, 0.08182083, -0.107643455, 0.020552907)); - target1 += mul(d1, float4x4(-0.04630706, -0.056070503, 0.058440026, -0.005662525, 0.08736355, 0.08821088, -0.049539115, 0.08171937, 0.28466523, -0.025859421, -0.0026971614, -0.15181617, -0.022231927, 0.3566104, -0.024887348, 0.12051598)); - target1 += mul(e1, float4x4(-0.20976813, -0.23778942, 0.28854275, -0.27583683, -0.27604774, -0.15861328, 0.09581984, 0.06572128, 0.092306405, -0.06962751, -0.042226445, 0.035234913, 0.084891975, -0.03846841, -0.1473667, 0.2810354)); - target1 += mul(f1, float4x4(0.028011162, 0.08945262, 0.15859836, 0.18426442, 0.10649845, -0.0918649, -0.12257575, -0.00914911, 0.23701023, -0.030067213, -0.01938559, -0.11026175, -0.5953985, 0.28875506, -0.035278864, -0.05043055)); - target1 += mul(g1, float4x4(-0.14445779, -0.06907616, 0.13078876, -0.0089114, -0.110637166, -0.123719245, -0.094949, 0.046267383, 0.4727523, 0.0073195575, -0.014788787, -0.14922102, -0.021974785, -0.10706751, 0.00049029186, 0.09215345)); - target1 += mul(h1, float4x4(-0.20936993, -0.22377276, -0.07697398, 0.039161056, 0.044213686, 0.037542075, -0.06600642, 0.017124292, 0.3406197, 0.011907687, 0.019732054, -0.22745137, -0.22178015, 0.49051985, -0.03707166, 0.14849792)); - target1 += mul(i1, float4x4(0.07833466, 0.10888627, 0.16015877, -0.049263358, 0.29002127, -0.010949114, 0.013081097, -0.071674205, 0.3532135, 0.013165473, -0.05282189, -0.16688257, 0.009552089, -0.2740816, 0.04927233, -0.37047002)); - target1 += mul(a2, float4x4(0.23682123, -0.027914839, 0.02372468, -0.07127212, 0.053436097, 0.057737537, -0.008556659, -0.025973454, 0.06468388, 0.18805866, -0.08180048, 0.058999106, -0.3058321, -0.06642967, -0.092997625, 0.10527466)); - target1 += mul(b2, float4x4(-0.1353085, -0.016593851, 0.21518163, -0.10272456, 0.14382689, 0.05056661, -0.27799338, 0.11351653, 0.05838342, 0.28104934, -0.03777824, 0.003435516, 0.057915565, -0.17574134, -0.24437475, 0.13420977)); - target1 += mul(c2, float4x4(0.13400255, -0.056437124, 0.11310834, 0.040429913, 0.098928474, -0.020769242, -0.079605736, 0.0494632, 0.0660877, 0.098982334, -0.055884495, -0.046533633, 0.17815505, 0.027310565, -0.24176653, -0.025550256)); - target1 += mul(d2, float4x4(0.03637618, -0.012618673, 0.11865397, 0.19804053, -0.03522831, 0.24310908, -0.056454524, -0.44885796, 0.02212509, -0.20253624, 0.038810212, -0.17396528, 0.08970355, 0.005103078, 0.061075203, 0.44292897)); - target1 += mul(e2, float4x4(-0.25074747, -0.0015575301, -0.685015, 0.07345307, -0.08419402, 0.06640714, 0.43799296, -0.17571151, 0.0049855476, 0.09024738, 0.055744022, 0.018739637, 0.34734032, 0.114896655, 0.0404696, -0.11327049)); - target1 += mul(f2, float4x4(-0.12284062, -0.31131, -0.14712588, -0.18645866, 0.17581487, 0.1357234, 0.09913364, 0.005298711, -0.056155153, 0.042429443, 0.039454732, -0.04111384, 0.2623163, 0.09701166, 0.022825675, 0.050480727)); - target1 += mul(g2, float4x4(0.058734808, 0.038528245, -0.042670116, -0.15190329, -0.028179986, -0.05362995, 0.017090468, -0.24449602, -0.08240927, -0.033122182, 0.009938243, -0.0052937623, 0.2171439, 0.06879817, -0.10361997, 0.018995138)); - target1 += mul(h2, float4x4(0.027555468, 0.016337285, 0.19074728, 0.26690376, -0.088713005, -0.0021182299, -0.23062791, -0.32101163, -0.0040022335, 0.16835448, 0.05424022, -0.02156396, 0.24163729, 0.10243619, -0.04331782, -0.014350939)); - target1 += mul(i2, float4x4(-0.13836963, 0.053369813, 0.036432605, 0.062288612, -0.06264361, -0.049093347, -0.0315955, -0.11237456, -0.064744405, -0.0151798045, 0.044210885, 0.010166375, -0.038355727, -0.05203739, -0.075036794, 0.1664177)); - target1 += mul(na1, float4x4(-0.08583114, 0.08268218, -0.05771351, 0.10195048, -0.10128163, 0.10874855, -0.02580701, 0.028834302, 0.1950179, -0.0130183315, 0.0092119705, -0.060479227, 0.117747106, 0.061403573, -0.0028475628, -0.032362986)); - target1 += mul(nb1, float4x4(-0.05310153, -0.061091065, 0.19438389, -0.10475873, 0.00045120303, -0.24876194, 0.017168125, -0.050173752, 0.012073283, 0.035660096, -0.017562328, -0.110271364, -0.015546384, 0.17965329, 0.10068208, -0.014481325)); - target1 += mul(nc1, float4x4(0.085558474, -0.0007109211, 0.20868625, 0.150163, -0.19283043, 0.025976779, 0.08384698, 0.031011146, 0.17268184, 0.008871077, -0.04097794, -0.12868725, 0.01336166, -0.038823843, 0.1703644, -0.067780636)); - target1 += mul(nd1, float4x4(0.06480841, -0.44256654, -0.19949587, -0.030677497, -0.27930573, -0.041867044, -0.15648738, 0.11573067, 0.28664824, 0.009770385, -0.058617204, -0.06607673, -0.038160402, 0.009497089, 0.03303058, -0.079379834)); - target1 += mul(ne1, float4x4(0.17752203, 0.10979527, -0.058749028, -0.30194217, 0.30484176, -0.20980492, -0.05234784, -0.2590473, 0.23003183, 0.21903595, -0.024891363, -0.14337292, -0.02971356, -0.29613075, -0.045642294, 0.23826689)); - target1 += mul(nf1, float4x4(0.018211683, -0.005840598, -0.19021381, -0.096696235, 0.39998052, -0.34746838, -0.039627917, 0.087701194, 0.15526368, -0.008095372, -0.044220537, -0.08634815, -0.121496454, -0.06792033, -0.14959472, 0.078917444)); - target1 += mul(ng1, float4x4(0.33109078, 0.012287281, -0.034155898, -0.04840956, 0.068748444, 0.006142039, 0.06598935, 0.024775596, 0.22379673, 0.056089353, -0.006119644, -0.018509025, 0.10084137, 0.15556572, -0.041211523, -0.21550669)); - target1 += mul(nh1, float4x4(-0.058160853, 0.08899222, -0.17401625, -0.1449813, -0.015872562, -0.03780256, 0.15702572, 0.34013954, 0.1580772, 0.074823864, 0.035488904, -0.01627819, -0.15551315, -0.3638866, -0.09833458, 0.15037175)); - target1 += mul(ni1, float4x4(-0.12707977, -0.19947061, -0.11524648, 0.09216174, -0.07161296, 0.05675567, 0.06843247, 0.2803306, 0.25222927, -0.044076066, 0.053775772, -0.09939824, 0.16903089, 0.11475717, -0.07015584, -0.036021322)); - target1 += mul(na2, float4x4(-0.12290332, -0.05469477, 0.02696626, 0.051133692, -0.05541504, -0.2811521, -0.13008943, 0.031793896, -0.32529324, -0.01663752, -0.0658181, 0.17300756, 0.22281154, -0.11001508, 0.09578194, -0.055437982)); - target1 += mul(nb2, float4x4(0.083753526, -0.048933715, -0.13912897, 0.10929772, -0.1789828, -0.1586524, -0.10964165, -0.08210391, -0.11568187, -0.04813496, -0.2417861, 0.24446528, 0.13570863, -0.26869404, 0.3013413, 0.11678686)); - target1 += mul(nc2, float4x4(0.21105368, 0.15749952, -0.18983693, -0.023642758, -0.1633653, 0.10107988, 0.052329395, -0.080253236, 0.15375629, -0.045091413, 0.05070866, 0.12416106, 0.16600485, -0.10412354, 0.061849747, -0.084013924)); - target1 += mul(nd2, float4x4(0.03863923, 0.03690167, -0.053106382, -0.07523278, -0.04214836, 0.53898096, 0.15308584, 0.22835171, -0.24771535, 0.1402687, 0.1000896, -0.08719167, 0.0886567, 0.15255097, 0.14695966, -0.06659865)); - target1 += mul(ne2, float4x4(0.110334344, -0.12696493, 0.24256139, 0.02536166, 0.08322421, 0.022147777, -0.35030407, 0.13734557, 0.053133942, 0.43650532, -0.30170345, 0.08751837, 0.012917502, 0.27496436, 0.11422729, 0.15508565)); - target1 += mul(nf2, float4x4(0.16684863, 0.26743406, 0.15951683, 0.033597723, -0.044719726, 0.1127182, 0.007923161, 0.06415458, -0.07269362, -0.07828715, 0.09216738, 0.11528897, -0.13371283, -0.124177165, 0.14804523, 0.14156726)); - target1 += mul(ng2, float4x4(-0.041141883, 0.023617791, 0.11484465, 0.13131519, -0.14753738, 0.17067687, -0.017538434, 0.24042644, -0.058103643, 0.3143255, 0.02476919, -0.0024666793, -0.26759955, -0.06099211, 0.006415725, 0.10394301)); - target1 += mul(nh2, float4x4(-0.04198037, 0.03277123, -0.25069895, -0.21043587, -0.27417016, 0.08047665, 0.29731026, 0.07629813, -0.15695353, -0.14299184, 0.026618432, 0.13265325, 0.07727133, 0.12872085, 0.13887435, 0.1347057)); - target1 += mul(ni2, float4x4(0.039232086, 0.117847264, -0.071643315, -0.040677182, -0.029160816, -0.06968689, 0.12880929, 0.037579957, -0.036671028, -0.022678757, -0.069731854, 0.10590314, 0.028034678, -0.015759282, 0.047180142, -0.16366881)); - target1 += float4(-0.079253934, 0.001511763, 0.100159355, 0.01585197); + MF4 target1 = { -0.079253934, 0.001511763, 0.100159355, 0.01585197 }; + target1 = MulAdd(a1, MF4x4(0.14315613, -0.031299837, -0.011195234, 0.0073360316, 0.07264984, -0.110979274, 0.06560588, -0.040463638, 0.28964168, -0.05644335, -0.060729366, -0.15811591, 0.028339373, 0.027486937, 0.0360574, 0.05856459), target1); + target1 = MulAdd(b1, MF4x4(0.16211128, 0.20672597, -0.30374205, -0.056202736, -0.10893948, 0.053066984, -0.18297112, 0.028844962, 0.22754766, -0.07141921, 0.07142953, -0.1357581, 0.008053467, 0.04668908, 0.17258649, 0.22506891), target1); + target1 = MulAdd(c1, MF4x4(0.07014762, 0.032112304, 0.028849715, 0.09427007, 0.008323501, -0.085777245, 0.083501115, -0.16150802, 0.24127382, -0.1305689, -0.027557204, -0.15057805, 0.09748757, 0.08182083, -0.107643455, 0.020552907), target1); + target1 = MulAdd(d1, MF4x4(-0.04630706, -0.056070503, 0.058440026, -0.005662525, 0.08736355, 0.08821088, -0.049539115, 0.08171937, 0.28466523, -0.025859421, -0.0026971614, -0.15181617, -0.022231927, 0.3566104, -0.024887348, 0.12051598), target1); + target1 = MulAdd(e1, MF4x4(-0.20976813, -0.23778942, 0.28854275, -0.27583683, -0.27604774, -0.15861328, 0.09581984, 0.06572128, 0.092306405, -0.06962751, -0.042226445, 0.035234913, 0.084891975, -0.03846841, -0.1473667, 0.2810354), target1); + target1 = MulAdd(f1, MF4x4(0.028011162, 0.08945262, 0.15859836, 0.18426442, 0.10649845, -0.0918649, -0.12257575, -0.00914911, 0.23701023, -0.030067213, -0.01938559, -0.11026175, -0.5953985, 0.28875506, -0.035278864, -0.05043055), target1); + target1 = MulAdd(g1, MF4x4(-0.14445779, -0.06907616, 0.13078876, -0.0089114, -0.110637166, -0.123719245, -0.094949, 0.046267383, 0.4727523, 0.0073195575, -0.014788787, -0.14922102, -0.021974785, -0.10706751, 0.00049029186, 0.09215345), target1); + target1 = MulAdd(h1, MF4x4(-0.20936993, -0.22377276, -0.07697398, 0.039161056, 0.044213686, 0.037542075, -0.06600642, 0.017124292, 0.3406197, 0.011907687, 0.019732054, -0.22745137, -0.22178015, 0.49051985, -0.03707166, 0.14849792), target1); + target1 = MulAdd(i1, MF4x4(0.07833466, 0.10888627, 0.16015877, -0.049263358, 0.29002127, -0.010949114, 0.013081097, -0.071674205, 0.3532135, 0.013165473, -0.05282189, -0.16688257, 0.009552089, -0.2740816, 0.04927233, -0.37047002), target1); + target1 = MulAdd(a2, MF4x4(0.23682123, -0.027914839, 0.02372468, -0.07127212, 0.053436097, 0.057737537, -0.008556659, -0.025973454, 0.06468388, 0.18805866, -0.08180048, 0.058999106, -0.3058321, -0.06642967, -0.092997625, 0.10527466), target1); + target1 = MulAdd(b2, MF4x4(-0.1353085, -0.016593851, 0.21518163, -0.10272456, 0.14382689, 0.05056661, -0.27799338, 0.11351653, 0.05838342, 0.28104934, -0.03777824, 0.003435516, 0.057915565, -0.17574134, -0.24437475, 0.13420977), target1); + target1 = MulAdd(c2, MF4x4(0.13400255, -0.056437124, 0.11310834, 0.040429913, 0.098928474, -0.020769242, -0.079605736, 0.0494632, 0.0660877, 0.098982334, -0.055884495, -0.046533633, 0.17815505, 0.027310565, -0.24176653, -0.025550256), target1); + target1 = MulAdd(d2, MF4x4(0.03637618, -0.012618673, 0.11865397, 0.19804053, -0.03522831, 0.24310908, -0.056454524, -0.44885796, 0.02212509, -0.20253624, 0.038810212, -0.17396528, 0.08970355, 0.005103078, 0.061075203, 0.44292897), target1); + target1 = MulAdd(e2, MF4x4(-0.25074747, -0.0015575301, -0.685015, 0.07345307, -0.08419402, 0.06640714, 0.43799296, -0.17571151, 0.0049855476, 0.09024738, 0.055744022, 0.018739637, 0.34734032, 0.114896655, 0.0404696, -0.11327049), target1); + target1 = MulAdd(f2, MF4x4(-0.12284062, -0.31131, -0.14712588, -0.18645866, 0.17581487, 0.1357234, 0.09913364, 0.005298711, -0.056155153, 0.042429443, 0.039454732, -0.04111384, 0.2623163, 0.09701166, 0.022825675, 0.050480727), target1); + target1 = MulAdd(g2, MF4x4(0.058734808, 0.038528245, -0.042670116, -0.15190329, -0.028179986, -0.05362995, 0.017090468, -0.24449602, -0.08240927, -0.033122182, 0.009938243, -0.0052937623, 0.2171439, 0.06879817, -0.10361997, 0.018995138), target1); + target1 = MulAdd(h2, MF4x4(0.027555468, 0.016337285, 0.19074728, 0.26690376, -0.088713005, -0.0021182299, -0.23062791, -0.32101163, -0.0040022335, 0.16835448, 0.05424022, -0.02156396, 0.24163729, 0.10243619, -0.04331782, -0.014350939), target1); + target1 = MulAdd(i2, MF4x4(-0.13836963, 0.053369813, 0.036432605, 0.062288612, -0.06264361, -0.049093347, -0.0315955, -0.11237456, -0.064744405, -0.0151798045, 0.044210885, 0.010166375, -0.038355727, -0.05203739, -0.075036794, 0.1664177), target1); + target1 = MulAdd(na1, MF4x4(-0.08583114, 0.08268218, -0.05771351, 0.10195048, -0.10128163, 0.10874855, -0.02580701, 0.028834302, 0.1950179, -0.0130183315, 0.0092119705, -0.060479227, 0.117747106, 0.061403573, -0.0028475628, -0.032362986), target1); + target1 = MulAdd(nb1, MF4x4(-0.05310153, -0.061091065, 0.19438389, -0.10475873, 0.00045120303, -0.24876194, 0.017168125, -0.050173752, 0.012073283, 0.035660096, -0.017562328, -0.110271364, -0.015546384, 0.17965329, 0.10068208, -0.014481325), target1); + target1 = MulAdd(nc1, MF4x4(0.085558474, -0.0007109211, 0.20868625, 0.150163, -0.19283043, 0.025976779, 0.08384698, 0.031011146, 0.17268184, 0.008871077, -0.04097794, -0.12868725, 0.01336166, -0.038823843, 0.1703644, -0.067780636), target1); + target1 = MulAdd(nd1, MF4x4(0.06480841, -0.44256654, -0.19949587, -0.030677497, -0.27930573, -0.041867044, -0.15648738, 0.11573067, 0.28664824, 0.009770385, -0.058617204, -0.06607673, -0.038160402, 0.009497089, 0.03303058, -0.079379834), target1); + target1 = MulAdd(ne1, MF4x4(0.17752203, 0.10979527, -0.058749028, -0.30194217, 0.30484176, -0.20980492, -0.05234784, -0.2590473, 0.23003183, 0.21903595, -0.024891363, -0.14337292, -0.02971356, -0.29613075, -0.045642294, 0.23826689), target1); + target1 = MulAdd(nf1, MF4x4(0.018211683, -0.005840598, -0.19021381, -0.096696235, 0.39998052, -0.34746838, -0.039627917, 0.087701194, 0.15526368, -0.008095372, -0.044220537, -0.08634815, -0.121496454, -0.06792033, -0.14959472, 0.078917444), target1); + target1 = MulAdd(ng1, MF4x4(0.33109078, 0.012287281, -0.034155898, -0.04840956, 0.068748444, 0.006142039, 0.06598935, 0.024775596, 0.22379673, 0.056089353, -0.006119644, -0.018509025, 0.10084137, 0.15556572, -0.041211523, -0.21550669), target1); + target1 = MulAdd(nh1, MF4x4(-0.058160853, 0.08899222, -0.17401625, -0.1449813, -0.015872562, -0.03780256, 0.15702572, 0.34013954, 0.1580772, 0.074823864, 0.035488904, -0.01627819, -0.15551315, -0.3638866, -0.09833458, 0.15037175), target1); + target1 = MulAdd(ni1, MF4x4(-0.12707977, -0.19947061, -0.11524648, 0.09216174, -0.07161296, 0.05675567, 0.06843247, 0.2803306, 0.25222927, -0.044076066, 0.053775772, -0.09939824, 0.16903089, 0.11475717, -0.07015584, -0.036021322), target1); + target1 = MulAdd(na2, MF4x4(-0.12290332, -0.05469477, 0.02696626, 0.051133692, -0.05541504, -0.2811521, -0.13008943, 0.031793896, -0.32529324, -0.01663752, -0.0658181, 0.17300756, 0.22281154, -0.11001508, 0.09578194, -0.055437982), target1); + target1 = MulAdd(nb2, MF4x4(0.083753526, -0.048933715, -0.13912897, 0.10929772, -0.1789828, -0.1586524, -0.10964165, -0.08210391, -0.11568187, -0.04813496, -0.2417861, 0.24446528, 0.13570863, -0.26869404, 0.3013413, 0.11678686), target1); + target1 = MulAdd(nc2, MF4x4(0.21105368, 0.15749952, -0.18983693, -0.023642758, -0.1633653, 0.10107988, 0.052329395, -0.080253236, 0.15375629, -0.045091413, 0.05070866, 0.12416106, 0.16600485, -0.10412354, 0.061849747, -0.084013924), target1); + target1 = MulAdd(nd2, MF4x4(0.03863923, 0.03690167, -0.053106382, -0.07523278, -0.04214836, 0.53898096, 0.15308584, 0.22835171, -0.24771535, 0.1402687, 0.1000896, -0.08719167, 0.0886567, 0.15255097, 0.14695966, -0.06659865), target1); + target1 = MulAdd(ne2, MF4x4(0.110334344, -0.12696493, 0.24256139, 0.02536166, 0.08322421, 0.022147777, -0.35030407, 0.13734557, 0.053133942, 0.43650532, -0.30170345, 0.08751837, 0.012917502, 0.27496436, 0.11422729, 0.15508565), target1); + target1 = MulAdd(nf2, MF4x4(0.16684863, 0.26743406, 0.15951683, 0.033597723, -0.044719726, 0.1127182, 0.007923161, 0.06415458, -0.07269362, -0.07828715, 0.09216738, 0.11528897, -0.13371283, -0.124177165, 0.14804523, 0.14156726), target1); + target1 = MulAdd(ng2, MF4x4(-0.041141883, 0.023617791, 0.11484465, 0.13131519, -0.14753738, 0.17067687, -0.017538434, 0.24042644, -0.058103643, 0.3143255, 0.02476919, -0.0024666793, -0.26759955, -0.06099211, 0.006415725, 0.10394301), target1); + target1 = MulAdd(nh2, MF4x4(-0.04198037, 0.03277123, -0.25069895, -0.21043587, -0.27417016, 0.08047665, 0.29731026, 0.07629813, -0.15695353, -0.14299184, 0.026618432, 0.13265325, 0.07727133, 0.12872085, 0.13887435, 0.1347057), target1); + target1 = MulAdd(ni2, MF4x4(0.039232086, 0.117847264, -0.071643315, -0.040677182, -0.029160816, -0.06968689, 0.12880929, 0.037579957, -0.036671028, -0.022678757, -0.069731854, 0.10590314, 0.028034678, -0.015759282, 0.047180142, -0.16366881), target1); - float4 target2 = mul(a1, float4x4(0.024126908, 0.01737047, 0.04563732, 0.08303721, -0.21339902, 0.00025652428, -0.09666459, -0.07654246, -0.01201168, 0.14373912, 0.22268519, 0.049181588, -0.0751725, 0.006847365, -0.025867194, 0.19233267)); - target2 += mul(b1, float4x4(-0.25251204, -0.34213448, -0.0022676045, 0.29270738, 0.08876456, 0.067294724, 0.2865476, -0.009144941, -0.074606106, 0.14566834, 0.14162645, 0.10980335, -0.7958991, -0.15410729, 0.038512416, -0.17033637)); - target2 += mul(c1, float4x4(-0.115404196, -0.11004134, 0.13174473, -0.0006875606, 0.0051814034, 0.058522645, -0.0795437, 0.0011465811, -0.019500278, 0.12752724, 0.16985136, -0.054932587, 0.16734739, -0.04686017, -0.072241016, 0.054562975)); - target2 += mul(d1, float4x4(-0.07528159, -0.113516726, 0.2081102, 0.009942251, 0.08256535, 0.050133914, 0.012745932, 0.13902397, 0.009369715, 0.083261885, 0.17366019, 0.069754004, 0.030654406, -0.045856245, -0.055254143, 0.16265897)); - target2 += mul(e1, float4x4(-0.14366727, 0.24948351, 0.12160293, 0.10929859, -0.116071545, -0.11725494, -0.13926856, -0.026759636, 0.12723772, 0.1938045, -0.02745115, -0.0644584, -0.23854719, 0.059308372, -0.446269, -0.06978486)); - target2 += mul(f1, float4x4(0.21108554, -0.1717225, 0.066633105, 0.15418948, -0.08902029, 0.047925282, 0.15817304, -0.080941506, 0.007364865, 0.10506626, 0.20205018, -0.078695655, 0.14004812, -0.3195092, 0.19157887, -0.12697977)); - target2 += mul(g1, float4x4(-0.08145032, -0.14292753, 0.066565305, -0.061348185, -0.08738346, 0.011608093, -0.0024047727, -0.024741996, -0.11547277, 0.10013328, 0.21730538, 0.05598899, -0.17741105, 0.075944185, 0.027434295, -0.2550598)); - target2 += mul(h1, float4x4(-0.026223006, 0.11214396, -0.133987, 0.1303522, 9.32011e-05, -0.14755996, -0.14002979, -0.039624512, 0.045111652, 0.17618611, 0.17764348, 0.104528464, 0.20592515, 0.07240335, -0.27604735, 0.038880046)); - target2 += mul(i1, float4x4(0.17734227, -0.002935363, 0.07505682, -0.029969893, -0.024536638, 0.11236127, 0.119374484, 0.08002781, -0.003541722, 0.1428466, 0.1729824, 0.055412393, -0.04790376, 0.18020035, 0.05376964, -0.1520942)); - target2 += mul(a2, float4x4(-0.11352182, -0.019249126, 0.10782615, 0.03079928, 0.020381734, -0.08998433, -0.09211494, -0.054406203, 0.1828849, -0.07692097, 0.004733955, -0.026685018, -0.08044814, -0.071961075, 0.029184176, -0.22562811)); - target2 += mul(b2, float4x4(-0.34489468, -0.07447471, 0.026422959, 0.33550653, 0.22130035, 0.059709545, -0.07646962, -0.18386386, 0.33911958, -0.07534871, 0.040870134, 0.051136248, 0.32681262, 0.20612194, -0.1609581, -0.70460784)); - target2 += mul(c2, float4x4(0.27617922, 0.09758603, 0.05103887, -0.09281693, -0.007143339, 0.006635712, -0.055270564, -0.022629099, -0.13023081, -0.013819027, -0.038695697, 0.047280338, -0.13964762, 0.09852924, -0.10056262, -0.084967695)); - target2 += mul(d2, float4x4(0.1370323, 0.030904075, -0.033860117, 0.08926374, -0.14616281, -0.29926816, -0.23738252, -0.21374625, -0.14039646, 0.11503669, 0.082101606, -0.061717354, 0.021357644, -0.10676707, 0.03214661, 0.029967157)); - target2 += mul(e2, float4x4(-0.29881296, -0.22195289, -0.3512607, -0.2277441, 0.033705913, -0.23267402, -0.119738854, -0.18925253, 0.068823405, -0.15160555, 0.2585695, 0.10484223, -0.012574211, 0.38808516, 0.2599094, -0.4991424)); - target2 += mul(f2, float4x4(-0.07474731, 0.22742131, 0.014462262, 0.08409484, 0.09579643, -0.0519534, 0.0007793075, -0.044820115, -0.010144471, -0.040506937, 0.0056340825, 0.057767954, -0.14988829, -0.05099549, 0.007204364, -0.07094934)); - target2 += mul(g2, float4x4(-0.05736621, 0.12072876, -0.02037183, 0.05012334, -0.1173538, -0.10062993, -0.0033958228, 0.0142556345, -0.011005385, -0.0066177617, -0.058390465, 0.048240293, 0.09835053, 0.17917523, -0.06466951, 0.017518612)); - target2 += mul(h2, float4x4(0.1413101, -0.30268928, -0.17851736, -0.10797371, -0.01964573, 0.14356858, -0.06759965, 0.17416531, 0.13905385, -0.017476829, 0.06541924, -0.044690568, -0.080723755, -0.08610206, 0.095347285, -0.09233214)); - target2 += mul(i2, float4x4(-0.07254187, -0.091158785, 0.018472971, 0.03514051, 0.018888336, 0.107934274, -0.018830854, 0.10007211, -0.053966418, -0.035646267, -0.031214178, -0.05980228, -0.13045661, -0.011743741, -0.03325275, 0.071065165)); - target2 += mul(na1, float4x4(-0.037697386, 0.054388218, -0.010934479, 0.2266702, 0.049999133, 0.017648092, -0.044225454, 0.21611899, -0.03805845, 0.054236397, -0.018563407, -0.060588073, -0.031215845, 0.075081706, 0.07333242, -0.09651128)); - target2 += mul(nb1, float4x4(-0.32236508, -0.0026381002, -0.30787975, 0.2963127, -0.13276175, 0.1058753, -0.12744896, 0.09749292, -0.02683677, -0.0041124597, 0.006103888, -0.09997201, 0.092101686, -0.08375288, 0.09641652, 0.053333007)); - target2 += mul(nc1, float4x4(0.027999232, -0.060004722, -0.009207874, -0.0952888, -0.038418446, -0.13316345, 0.099323496, 0.048450433, 0.0443969, 0.056023613, 0.1156147, 0.018980766, 0.040020484, 0.07555044, 0.0039174426, -0.044098593)); - target2 += mul(nd1, float4x4(-0.101029314, 0.33333415, -0.22052327, -0.035329416, 0.17229559, 0.12564908, -0.07879576, -0.09248896, -0.03239869, 0.022611454, 0.05610472, -0.02181683, -0.06347532, -0.077292696, 0.02005389, -0.078899406)); - target2 += mul(ne1, float4x4(-0.028139396, -0.04349171, -0.019393284, 0.42110333, 0.37065667, 0.5282552, 0.43816927, 0.19155908, 0.051832534, 0.02050813, 0.030795977, 0.023960136, -0.27617985, 0.19165507, -0.005492024, -0.13349663)); - target2 += mul(nf1, float4x4(5.0700226e-05, 0.21293098, -0.39902148, -0.058406413, -0.06766975, 0.1129277, -0.012398328, 0.025031524, 0.03519656, 0.06486415, 0.15710293, 0.014098051, 0.057754945, 0.116186336, -0.14429826, 0.051864166)); - target2 += mul(ng1, float4x4(-0.012280755, 0.043744788, -0.06420968, 0.012739398, 0.043073926, 0.031230433, 0.00036492705, -0.039208546, -0.09329152, 0.06928111, 0.11622664, -0.009106846, 0.111528054, -0.020315262, 0.036427997, 0.15881014)); - target2 += mul(nh1, float4x4(-0.066635534, 0.13901882, 0.0885122, 0.1030835, 0.08539728, -0.015466482, 0.0706688, -0.1611047, 0.02179479, -0.00048529037, 0.08708685, -0.00894464, -0.13046473, -0.21456988, -0.20666413, 0.049039323)); - target2 += mul(ni1, float4x4(-0.100800075, -0.03772198, -0.095183305, -0.15150243, -0.08743059, -0.24299338, -0.019315414, -0.1574107, -0.013610722, 0.064871654, 0.058439128, 0.008972897, 0.10339555, -0.027356634, 0.07666196, 0.048524544)); - target2 += mul(na2, float4x4(0.046309173, -0.03858991, -0.13260359, 0.0017626585, 0.1453724, 0.1402359, -0.079240486, 0.13017912, 0.0629575, -0.15448172, -0.1856442, -0.044694453, -0.17226808, -0.08065212, -0.008038736, -0.15994963)); - target2 += mul(nb2, float4x4(0.18369722, 0.03849556, -0.035185467, -0.20205377, 0.03879293, 0.02712859, -0.051278092, 0.14862835, 0.10261192, 0.18085574, -0.025982017, -0.029160796, 0.5301373, 0.09614058, 0.35518438, -0.014906588)); - target2 += mul(nc2, float4x4(-0.31154996, -0.06868871, -0.012681131, 0.028093819, -0.37321633, -0.14738804, 0.06060776, 0.050054748, 0.013779029, -0.020390315, -0.12487434, -0.0029474346, -0.274524, -0.09142805, 0.0132142445, 0.1577639)); - target2 += mul(nd2, float4x4(-0.02177336, -0.020817943, -0.0111796055, -0.0046033757, 0.45033064, 0.3573757, 0.55279994, 0.602122, -0.05536106, -0.33642644, -0.1851379, -0.052192084, 0.03683446, 0.13613251, 0.20098919, -0.090587094)); - target2 += mul(ne2, float4x4(0.1520822, 0.37173554, -0.061298244, 0.0019386727, 0.44656134, 0.13406622, 0.39018136, 0.5722051, -0.13074401, 0.012778576, -0.2837446, 0.16098566, 0.100189455, -0.40386122, 0.17464107, -0.17862785)); - target2 += mul(nf2, float4x4(-0.01217905, -0.24295084, 0.08192982, -0.14160301, -0.05936872, -0.003312342, -0.07542139, 0.13488367, -0.21560493, -0.14342502, -0.19195864, -0.09448305, -0.1038431, -0.075766176, 0.03226791, 0.06455397)); - target2 += mul(ng2, float4x4(-0.076916575, -0.10891301, 0.032635316, 0.03848802, 0.15750243, 0.48169684, 0.5410635, 0.017279895, 0.012730932, -0.0059071835, 0.030766146, -0.0225503, -0.030178519, -0.05866621, 0.033593398, -0.00033098995)); - target2 += mul(nh2, float4x4(-0.10757409, 0.2644168, -0.025696747, -0.0077012815, 0.31728277, 0.29771668, 0.2443613, -0.047722775, -0.083712585, -0.12742844, -0.3138776, -0.059888497, 0.12291351, -0.14435866, 0.051414594, -0.11889901)); - target2 += mul(ni2, float4x4(-0.063888945, 0.002844068, -0.06129518, 0.03381495, 0.10176077, -0.11625004, -0.10745763, -0.20636752, -0.03820934, 0.01926402, -0.20310643, 0.09767577, -0.00776684, 0.13453315, -0.036967937, 0.09780335)); - target2 += float4(0.019374544, -0.050425697, -0.005817216, -0.0059976326); + MF4 target2 = { 0.019374544, -0.050425697, -0.005817216, -0.0059976326 }; + target2 = MulAdd(a1, MF4x4(0.024126908, 0.01737047, 0.04563732, 0.08303721, -0.21339902, 0.00025652428, -0.09666459, -0.07654246, -0.01201168, 0.14373912, 0.22268519, 0.049181588, -0.0751725, 0.006847365, -0.025867194, 0.19233267), target2); + target2 = MulAdd(b1, MF4x4(-0.25251204, -0.34213448, -0.0022676045, 0.29270738, 0.08876456, 0.067294724, 0.2865476, -0.009144941, -0.074606106, 0.14566834, 0.14162645, 0.10980335, -0.7958991, -0.15410729, 0.038512416, -0.17033637), target2); + target2 = MulAdd(c1, MF4x4(-0.115404196, -0.11004134, 0.13174473, -0.0006875606, 0.0051814034, 0.058522645, -0.0795437, 0.0011465811, -0.019500278, 0.12752724, 0.16985136, -0.054932587, 0.16734739, -0.04686017, -0.072241016, 0.054562975), target2); + target2 = MulAdd(d1, MF4x4(-0.07528159, -0.113516726, 0.2081102, 0.009942251, 0.08256535, 0.050133914, 0.012745932, 0.13902397, 0.009369715, 0.083261885, 0.17366019, 0.069754004, 0.030654406, -0.045856245, -0.055254143, 0.16265897), target2); + target2 = MulAdd(e1, MF4x4(-0.14366727, 0.24948351, 0.12160293, 0.10929859, -0.116071545, -0.11725494, -0.13926856, -0.026759636, 0.12723772, 0.1938045, -0.02745115, -0.0644584, -0.23854719, 0.059308372, -0.446269, -0.06978486), target2); + target2 = MulAdd(f1, MF4x4(0.21108554, -0.1717225, 0.066633105, 0.15418948, -0.08902029, 0.047925282, 0.15817304, -0.080941506, 0.007364865, 0.10506626, 0.20205018, -0.078695655, 0.14004812, -0.3195092, 0.19157887, -0.12697977), target2); + target2 = MulAdd(g1, MF4x4(-0.08145032, -0.14292753, 0.066565305, -0.061348185, -0.08738346, 0.011608093, -0.0024047727, -0.024741996, -0.11547277, 0.10013328, 0.21730538, 0.05598899, -0.17741105, 0.075944185, 0.027434295, -0.2550598), target2); + target2 = MulAdd(h1, MF4x4(-0.026223006, 0.11214396, -0.133987, 0.1303522, 9.32011e-05, -0.14755996, -0.14002979, -0.039624512, 0.045111652, 0.17618611, 0.17764348, 0.104528464, 0.20592515, 0.07240335, -0.27604735, 0.038880046), target2); + target2 = MulAdd(i1, MF4x4(0.17734227, -0.002935363, 0.07505682, -0.029969893, -0.024536638, 0.11236127, 0.119374484, 0.08002781, -0.003541722, 0.1428466, 0.1729824, 0.055412393, -0.04790376, 0.18020035, 0.05376964, -0.1520942), target2); + target2 = MulAdd(a2, MF4x4(-0.11352182, -0.019249126, 0.10782615, 0.03079928, 0.020381734, -0.08998433, -0.09211494, -0.054406203, 0.1828849, -0.07692097, 0.004733955, -0.026685018, -0.08044814, -0.071961075, 0.029184176, -0.22562811), target2); + target2 = MulAdd(b2, MF4x4(-0.34489468, -0.07447471, 0.026422959, 0.33550653, 0.22130035, 0.059709545, -0.07646962, -0.18386386, 0.33911958, -0.07534871, 0.040870134, 0.051136248, 0.32681262, 0.20612194, -0.1609581, -0.70460784), target2); + target2 = MulAdd(c2, MF4x4(0.27617922, 0.09758603, 0.05103887, -0.09281693, -0.007143339, 0.006635712, -0.055270564, -0.022629099, -0.13023081, -0.013819027, -0.038695697, 0.047280338, -0.13964762, 0.09852924, -0.10056262, -0.084967695), target2); + target2 = MulAdd(d2, MF4x4(0.1370323, 0.030904075, -0.033860117, 0.08926374, -0.14616281, -0.29926816, -0.23738252, -0.21374625, -0.14039646, 0.11503669, 0.082101606, -0.061717354, 0.021357644, -0.10676707, 0.03214661, 0.029967157), target2); + target2 = MulAdd(e2, MF4x4(-0.29881296, -0.22195289, -0.3512607, -0.2277441, 0.033705913, -0.23267402, -0.119738854, -0.18925253, 0.068823405, -0.15160555, 0.2585695, 0.10484223, -0.012574211, 0.38808516, 0.2599094, -0.4991424), target2); + target2 = MulAdd(f2, MF4x4(-0.07474731, 0.22742131, 0.014462262, 0.08409484, 0.09579643, -0.0519534, 0.0007793075, -0.044820115, -0.010144471, -0.040506937, 0.0056340825, 0.057767954, -0.14988829, -0.05099549, 0.007204364, -0.07094934), target2); + target2 = MulAdd(g2, MF4x4(-0.05736621, 0.12072876, -0.02037183, 0.05012334, -0.1173538, -0.10062993, -0.0033958228, 0.0142556345, -0.011005385, -0.0066177617, -0.058390465, 0.048240293, 0.09835053, 0.17917523, -0.06466951, 0.017518612), target2); + target2 = MulAdd(h2, MF4x4(0.1413101, -0.30268928, -0.17851736, -0.10797371, -0.01964573, 0.14356858, -0.06759965, 0.17416531, 0.13905385, -0.017476829, 0.06541924, -0.044690568, -0.080723755, -0.08610206, 0.095347285, -0.09233214), target2); + target2 = MulAdd(i2, MF4x4(-0.07254187, -0.091158785, 0.018472971, 0.03514051, 0.018888336, 0.107934274, -0.018830854, 0.10007211, -0.053966418, -0.035646267, -0.031214178, -0.05980228, -0.13045661, -0.011743741, -0.03325275, 0.071065165), target2); + target2 = MulAdd(na1, MF4x4(-0.037697386, 0.054388218, -0.010934479, 0.2266702, 0.049999133, 0.017648092, -0.044225454, 0.21611899, -0.03805845, 0.054236397, -0.018563407, -0.060588073, -0.031215845, 0.075081706, 0.07333242, -0.09651128), target2); + target2 = MulAdd(nb1, MF4x4(-0.32236508, -0.0026381002, -0.30787975, 0.2963127, -0.13276175, 0.1058753, -0.12744896, 0.09749292, -0.02683677, -0.0041124597, 0.006103888, -0.09997201, 0.092101686, -0.08375288, 0.09641652, 0.053333007), target2); + target2 = MulAdd(nc1, MF4x4(0.027999232, -0.060004722, -0.009207874, -0.0952888, -0.038418446, -0.13316345, 0.099323496, 0.048450433, 0.0443969, 0.056023613, 0.1156147, 0.018980766, 0.040020484, 0.07555044, 0.0039174426, -0.044098593), target2); + target2 = MulAdd(nd1, MF4x4(-0.101029314, 0.33333415, -0.22052327, -0.035329416, 0.17229559, 0.12564908, -0.07879576, -0.09248896, -0.03239869, 0.022611454, 0.05610472, -0.02181683, -0.06347532, -0.077292696, 0.02005389, -0.078899406), target2); + target2 = MulAdd(ne1, MF4x4(-0.028139396, -0.04349171, -0.019393284, 0.42110333, 0.37065667, 0.5282552, 0.43816927, 0.19155908, 0.051832534, 0.02050813, 0.030795977, 0.023960136, -0.27617985, 0.19165507, -0.005492024, -0.13349663), target2); + target2 = MulAdd(nf1, MF4x4(5.0700226e-05, 0.21293098, -0.39902148, -0.058406413, -0.06766975, 0.1129277, -0.012398328, 0.025031524, 0.03519656, 0.06486415, 0.15710293, 0.014098051, 0.057754945, 0.116186336, -0.14429826, 0.051864166), target2); + target2 = MulAdd(ng1, MF4x4(-0.012280755, 0.043744788, -0.06420968, 0.012739398, 0.043073926, 0.031230433, 0.00036492705, -0.039208546, -0.09329152, 0.06928111, 0.11622664, -0.009106846, 0.111528054, -0.020315262, 0.036427997, 0.15881014), target2); + target2 = MulAdd(nh1, MF4x4(-0.066635534, 0.13901882, 0.0885122, 0.1030835, 0.08539728, -0.015466482, 0.0706688, -0.1611047, 0.02179479, -0.00048529037, 0.08708685, -0.00894464, -0.13046473, -0.21456988, -0.20666413, 0.049039323), target2); + target2 = MulAdd(ni1, MF4x4(-0.100800075, -0.03772198, -0.095183305, -0.15150243, -0.08743059, -0.24299338, -0.019315414, -0.1574107, -0.013610722, 0.064871654, 0.058439128, 0.008972897, 0.10339555, -0.027356634, 0.07666196, 0.048524544), target2); + target2 = MulAdd(na2, MF4x4(0.046309173, -0.03858991, -0.13260359, 0.0017626585, 0.1453724, 0.1402359, -0.079240486, 0.13017912, 0.0629575, -0.15448172, -0.1856442, -0.044694453, -0.17226808, -0.08065212, -0.008038736, -0.15994963), target2); + target2 = MulAdd(nb2, MF4x4(0.18369722, 0.03849556, -0.035185467, -0.20205377, 0.03879293, 0.02712859, -0.051278092, 0.14862835, 0.10261192, 0.18085574, -0.025982017, -0.029160796, 0.5301373, 0.09614058, 0.35518438, -0.014906588), target2); + target2 = MulAdd(nc2, MF4x4(-0.31154996, -0.06868871, -0.012681131, 0.028093819, -0.37321633, -0.14738804, 0.06060776, 0.050054748, 0.013779029, -0.020390315, -0.12487434, -0.0029474346, -0.274524, -0.09142805, 0.0132142445, 0.1577639), target2); + target2 = MulAdd(nd2, MF4x4(-0.02177336, -0.020817943, -0.0111796055, -0.0046033757, 0.45033064, 0.3573757, 0.55279994, 0.602122, -0.05536106, -0.33642644, -0.1851379, -0.052192084, 0.03683446, 0.13613251, 0.20098919, -0.090587094), target2); + target2 = MulAdd(ne2, MF4x4(0.1520822, 0.37173554, -0.061298244, 0.0019386727, 0.44656134, 0.13406622, 0.39018136, 0.5722051, -0.13074401, 0.012778576, -0.2837446, 0.16098566, 0.100189455, -0.40386122, 0.17464107, -0.17862785), target2); + target2 = MulAdd(nf2, MF4x4(-0.01217905, -0.24295084, 0.08192982, -0.14160301, -0.05936872, -0.003312342, -0.07542139, 0.13488367, -0.21560493, -0.14342502, -0.19195864, -0.09448305, -0.1038431, -0.075766176, 0.03226791, 0.06455397), target2); + target2 = MulAdd(ng2, MF4x4(-0.076916575, -0.10891301, 0.032635316, 0.03848802, 0.15750243, 0.48169684, 0.5410635, 0.017279895, 0.012730932, -0.0059071835, 0.030766146, -0.0225503, -0.030178519, -0.05866621, 0.033593398, -0.00033098995), target2); + target2 = MulAdd(nh2, MF4x4(-0.10757409, 0.2644168, -0.025696747, -0.0077012815, 0.31728277, 0.29771668, 0.2443613, -0.047722775, -0.083712585, -0.12742844, -0.3138776, -0.059888497, 0.12291351, -0.14435866, 0.051414594, -0.11889901), target2); + target2 = MulAdd(ni2, MF4x4(-0.063888945, 0.002844068, -0.06129518, 0.03381495, 0.10176077, -0.11625004, -0.10745763, -0.20636752, -0.03820934, 0.01926402, -0.20310643, 0.09767577, -0.00776684, 0.13453315, -0.036967937, 0.09780335), target2); conv2d_2_tf[gxy] = target1; conv2d_2_tf1[gxy] = target2; @@ -524,25 +527,25 @@ void Pass4(uint2 blockStart, uint3 threadId) { // [ a, d, g ] // [ b, e, h ] // [ c, f, i ] - float4 a1 = conv2d_2_tf.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0); - float4 b1 = conv2d_2_tf.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); - float4 c1 = conv2d_2_tf.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); - float4 d1 = conv2d_2_tf.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); - float4 e1 = conv2d_2_tf.SampleLevel(sam, pos, 0); - float4 f1 = conv2d_2_tf.SampleLevel(sam, pos + float2(0, inputPt.y), 0); - float4 g1 = conv2d_2_tf.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); - float4 h1 = conv2d_2_tf.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); - float4 i1 = conv2d_2_tf.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0); - - float4 na1 = max(-a1, 0); - float4 nb1 = max(-b1, 0); - float4 nc1 = max(-c1, 0); - float4 nd1 = max(-d1, 0); - float4 ne1 = max(-e1, 0); - float4 nf1 = max(-f1, 0); - float4 ng1 = max(-g1, 0); - float4 nh1 = max(-h1, 0); - float4 ni1 = max(-i1, 0); + MF4 a1 = conv2d_2_tf.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0); + MF4 b1 = conv2d_2_tf.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); + MF4 c1 = conv2d_2_tf.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); + MF4 d1 = conv2d_2_tf.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); + MF4 e1 = conv2d_2_tf.SampleLevel(sam, pos, 0); + MF4 f1 = conv2d_2_tf.SampleLevel(sam, pos + float2(0, inputPt.y), 0); + MF4 g1 = conv2d_2_tf.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); + MF4 h1 = conv2d_2_tf.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); + MF4 i1 = conv2d_2_tf.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0); + + MF4 na1 = max(-a1, 0); + MF4 nb1 = max(-b1, 0); + MF4 nc1 = max(-c1, 0); + MF4 nd1 = max(-d1, 0); + MF4 ne1 = max(-e1, 0); + MF4 nf1 = max(-f1, 0); + MF4 ng1 = max(-g1, 0); + MF4 nh1 = max(-h1, 0); + MF4 ni1 = max(-i1, 0); a1 = max(a1, 0); b1 = max(b1, 0); @@ -554,25 +557,25 @@ void Pass4(uint2 blockStart, uint3 threadId) { h1 = max(h1, 0); i1 = max(i1, 0); - float4 a2 = conv2d_2_tf1.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0); - float4 b2 = conv2d_2_tf1.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); - float4 c2 = conv2d_2_tf1.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); - float4 d2 = conv2d_2_tf1.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); - float4 e2 = conv2d_2_tf1.SampleLevel(sam, pos, 0); - float4 f2 = conv2d_2_tf1.SampleLevel(sam, pos + float2(0, inputPt.y), 0); - float4 g2 = conv2d_2_tf1.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); - float4 h2 = conv2d_2_tf1.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); - float4 i2 = conv2d_2_tf1.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0); - - float4 na2 = max(-a2, 0); - float4 nb2 = max(-b2, 0); - float4 nc2 = max(-c2, 0); - float4 nd2 = max(-d2, 0); - float4 ne2 = max(-e2, 0); - float4 nf2 = max(-f2, 0); - float4 ng2 = max(-g2, 0); - float4 nh2 = max(-h2, 0); - float4 ni2 = max(-i2, 0); + MF4 a2 = conv2d_2_tf1.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0); + MF4 b2 = conv2d_2_tf1.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); + MF4 c2 = conv2d_2_tf1.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); + MF4 d2 = conv2d_2_tf1.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); + MF4 e2 = conv2d_2_tf1.SampleLevel(sam, pos, 0); + MF4 f2 = conv2d_2_tf1.SampleLevel(sam, pos + float2(0, inputPt.y), 0); + MF4 g2 = conv2d_2_tf1.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); + MF4 h2 = conv2d_2_tf1.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); + MF4 i2 = conv2d_2_tf1.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0); + + MF4 na2 = max(-a2, 0); + MF4 nb2 = max(-b2, 0); + MF4 nc2 = max(-c2, 0); + MF4 nd2 = max(-d2, 0); + MF4 ne2 = max(-e2, 0); + MF4 nf2 = max(-f2, 0); + MF4 ng2 = max(-g2, 0); + MF4 nh2 = max(-h2, 0); + MF4 ni2 = max(-i2, 0); a2 = max(a2, 0); b2 = max(b2, 0); @@ -584,81 +587,81 @@ void Pass4(uint2 blockStart, uint3 threadId) { h2 = max(h2, 0); i2 = max(i2, 0); - float4 target1 = mul(a1, float4x4(-0.028328063, 0.038015317, 0.14893384, 0.10103896, 0.028176744, -0.02067147, -0.10970998, 0.015726602, -0.07402682, -0.075281784, -0.012586929, 0.053476278, 0.14823362, 0.20312984, 0.24213, 0.039118115)); - target1 += mul(b1, float4x4(0.009731573, -0.019011121, 0.016360838, -0.0073153526, 0.14594506, -0.0427664, -0.094225354, -0.013891855, -0.037061375, 0.024959227, -0.12289382, -0.21792257, -0.33579424, 0.052678566, 0.04346115, 0.07943186)); - target1 += mul(c1, float4x4(0.0022269129, 0.013298362, -0.045071404, -0.007918287, 0.010860651, -0.073057, -0.0042394064, 0.03340809, 0.04938919, -0.024218693, -0.008147567, 0.08848061, -0.06840333, 0.10077341, -0.272586, -0.06542769)); - target1 += mul(d1, float4x4(0.15929016, -0.1415167, 0.057084452, 0.06830724, 0.0046992986, 0.068573505, 0.22142749, -0.18493174, -0.1006019, -0.11373546, 0.17520057, -0.12888812, 0.05176946, -0.14703397, -0.20610721, 0.16611591)); - target1 += mul(e1, float4x4(-0.0069309813, 0.22358349, -0.18569049, 0.13456121, -0.21528137, 0.04618922, -0.47261322, -0.09682613, 0.5402922, 0.15818685, 0.05288464, -0.09949312, 0.21833964, 0.06652228, -0.2694682, 0.58216536)); - target1 += mul(f1, float4x4(0.040808782, 0.023110595, 0.12678777, -0.09057271, 0.03159572, 0.044006016, -0.10090222, 0.09940838, -0.08454473, -0.118349984, -0.053009644, 0.24352531, -0.103818566, 0.12536442, -0.17832974, 0.25161982)); - target1 += mul(g1, float4x4(-0.026323501, -0.14911288, -0.0073903934, 0.06902844, 0.07188603, -0.05006621, 0.06539817, -0.048083752, -0.08032579, -0.07449341, -0.015944218, 0.032426495, 0.069349505, -0.07441237, 0.055614363, 0.065174624)); - target1 += mul(h1, float4x4(-0.046432327, -0.051616143, 0.017791865, -0.047294978, 0.025944458, -0.0020909954, 0.083794415, -0.055740435, -0.3720184, 0.06654654, 0.1944378, 0.07806658, 0.00870193, 0.005404396, -0.059417505, -0.06270168)); - target1 += mul(i1, float4x4(-0.011699918, -0.03260685, 0.016413182, -0.02199741, -0.042898953, -0.018734168, -0.12387174, 0.06405199, -0.050764065, 0.07050078, 0.006969675, 0.05508108, -0.079687595, 0.12154926, 0.071177684, 0.046873443)); - target1 += mul(a2, float4x4(-0.2158498, 0.03612371, -0.05268691, -0.065594874, 0.06997431, -0.07327132, -0.03323361, -0.23306306, -0.00011140713, -0.1891967, -0.017328357, 0.15796778, -0.061359044, 0.008202449, -0.031317197, -0.020873578)); - target1 += mul(b2, float4x4(-0.022816254, -0.014594548, 0.0064240466, 0.07976367, -0.0070318123, -0.07651652, -0.111756384, -0.2788498, 0.16634429, -0.1583179, -0.10245271, 0.10480152, 0.345051, -0.07809675, 0.046080578, -0.32139245)); - target1 += mul(c2, float4x4(0.020630263, 0.032152038, 0.0019161701, 0.05435833, 0.078139454, -0.10090956, 0.14244889, 0.017286595, 0.0039871824, -0.026395446, 0.14158171, 0.0010112645, 0.17055373, -0.08093189, -0.049234428, -0.33473247)); - target1 += mul(d2, float4x4(-0.10982378, 0.029386539, -0.15483, -0.04345961, -0.16869037, -0.30862433, 0.060743757, -0.032285906, 0.017884266, -0.09846199, -0.090971105, -0.1693697, -0.063690096, -0.08489718, 0.18247683, -0.19921213)); - target1 += mul(e2, float4x4(0.1898742, 0.22187345, -0.28495324, -0.42578775, 0.12833633, -0.2251503, -0.025917793, 0.6011678, -0.36586264, 0.23302059, -0.072634645, 0.0064221635, 0.56792957, -0.4684677, 0.05015159, 0.30121225)); - target1 += mul(f2, float4x4(0.10837159, 0.14743729, -0.03639783, -0.34797576, -0.18306817, -0.07957882, -0.111433275, 0.30104128, -0.102763996, -0.01020151, 0.016333267, -0.012390819, 0.11835027, -0.12597388, -0.006298998, 0.0513052)); - target1 += mul(g2, float4x4(-0.23662986, 0.23325302, -0.046104953, 0.36488137, 0.06990537, -0.06887873, -0.012611426, -0.02618366, -0.05296669, 0.195254, 0.016366778, 0.01693462, -0.08488424, -0.24656284, -0.035283253, -0.15318634)); - target1 += mul(h2, float4x4(0.061704446, -0.26930714, -0.24598889, 0.27657726, 0.05046488, -0.341884, 0.10704377, -0.15971762, 0.072999336, -0.2005826, -0.05874223, -0.053938035, -0.08284583, -0.22792995, 0.1027033, -0.012932447)); - target1 += mul(i2, float4x4(-0.029079054, 0.14774945, 0.026151389, 0.12380946, 0.08926635, -0.08387116, -0.17018612, -0.09304499, 0.086990625, -0.27579373, 0.003823722, -0.024723161, 0.08762848, -0.10080674, -0.012214886, -0.30239874)); - target1 += mul(na1, float4x4(-0.25756493, 0.2537789, 0.21723714, 0.0017929028, -0.014724892, 0.086692676, 0.11934202, -0.025869334, 0.008618976, -0.0046638376, -0.06863114, -0.07598151, -0.17309345, 0.009138105, -0.099874064, 0.07377463)); - target1 += mul(nb1, float4x4(-0.39971545, 0.16774859, 0.13102596, 0.30735064, -0.060374007, -0.036933452, 0.14408773, 0.06479284, 0.03806265, 0.045560133, 0.043136165, -0.019244662, 0.17573427, -0.11398941, -0.0751098, 0.041702736)); - target1 += mul(nc1, float4x4(-0.074492976, 0.18349282, -0.009050458, 0.0869807, -0.23123743, -0.015426683, -0.14346392, 0.005445149, -0.05322614, 0.10287576, 0.16083732, -0.09557319, -0.13891783, -0.13752605, -0.023572346, 0.13608918)); - target1 += mul(nd1, float4x4(-0.31140685, 0.40130782, 0.07704675, 0.27509958, 0.09711739, -0.18293281, -0.14500841, -0.15334702, 0.098314695, 0.22749798, 0.006017282, -0.013669673, 0.07147038, 0.022289474, -0.036797456, -0.0013958871)); - target1 += mul(ne1, float4x4(0.0547557, -0.03036202, 0.65113044, 0.10668893, 0.304707, -0.1456157, 0.27668485, 0.2279428, -0.42439902, -0.0073047588, 0.045635667, 0.271637, -0.19595222, -0.28107607, 0.3905438, -0.29898256)); - target1 += mul(nf1, float4x4(0.076843366, 0.037181348, 0.08652873, 0.1756985, 0.03728033, -0.22783624, 0.16810594, -0.022009399, 0.16058537, 0.24559903, 0.05266939, -0.13929726, 0.15964857, 0.0013167082, 0.015017631, 0.101646364)); - target1 += mul(ng1, float4x4(-0.3022452, 0.20052882, 0.13433233, 0.04250016, -0.15248592, 0.014216527, -0.23489903, 0.13919333, 0.22891816, -0.0053335144, -0.05567782, -0.12769286, -0.05337762, -0.11429989, -0.00882089, -0.030790573)); - target1 += mul(nh1, float4x4(-0.11763547, 0.1073185, 0.15810886, 0.013149736, -0.028268294, -0.24712053, 0.08592036, 0.075742744, 0.19626461, -0.10880887, -0.22599675, -0.37207767, -0.032548983, -0.011045266, -0.035218395, 0.099996395)); - target1 += mul(ni1, float4x4(0.05631665, 0.029538663, 0.043909863, 0.13720988, 0.10980592, -0.047748722, 0.080308706, -0.06828442, 0.1144396, -0.12510885, -0.067976676, 0.030742755, 0.07134681, -0.06652312, -0.0642328, -0.034490924)); - target1 += mul(na2, float4x4(0.019588284, -0.15197967, -0.16797094, -0.026324488, 0.014429439, -0.028491383, 0.059453625, 0.23443304, 0.02504347, 0.08872467, 0.032782357, -0.085310735, 0.013040259, -0.09837991, 0.073533125, -0.03544458)); - target1 += mul(nb2, float4x4(0.02198588, -0.09614766, 0.024655875, 0.025384603, 0.012162857, 0.065071434, 0.018112874, 0.19828922, -0.33289856, 0.011323505, 0.13696423, 0.31772846, -0.06587399, -0.05569957, -0.16469179, -0.22545892)); - target1 += mul(nc2, float4x4(-0.009093827, 0.086783886, 0.060070645, 0.049957857, 0.041628215, 0.082412794, 0.117729135, -0.178277, 0.08326062, -0.07120824, 0.1788718, 0.050748438, -0.08952197, -0.14609487, 0.05515471, 0.14784457)); - target1 += mul(nd2, float4x4(-0.10823147, -0.05108019, 0.092807196, -0.13899301, 0.19123949, -0.044189975, 0.0030145745, 0.08935499, -0.10338727, 0.01996205, 0.15671325, -0.08229972, 0.05603653, 0.043324884, 0.13562247, -0.11487494)); - target1 += mul(ne2, float4x4(-0.18872134, -0.07302765, 0.030137405, 0.30928415, -0.07689583, 0.045998566, 0.45554903, -0.1653404, 0.14705873, -0.10649596, 0.46833125, 0.17608039, -0.43967086, 0.056812476, -0.17908083, -0.40455228)); - target1 += mul(nf2, float4x4(-0.08093384, 0.032636635, 0.124594346, 0.13655491, 0.16780408, -1.4671803e-05, 0.13044862, -0.397665, -0.013273644, 0.08253894, 0.16302188, -0.052874118, 0.04073075, -0.18063635, -0.00838661, -0.31084144)); - target1 += mul(ng2, float4x4(0.06804371, -0.14755388, -0.12055216, -0.00437858, -0.044694718, 0.22744909, 0.012434794, 0.06245207, 0.00560859, -0.15815294, -0.19711316, 0.07711764, 0.03078979, -0.09560189, 0.10509056, 0.010651465)); - target1 += mul(nh2, float4x4(-0.026342146, 0.13919179, -0.0030414977, 0.06607403, 0.071292974, 0.065464914, -0.027091878, 0.10620255, -0.052090824, 0.06840278, -0.08457357, 0.08867469, 0.2976581, -0.6702739, -0.15472057, -0.3066263)); - target1 += mul(ni2, float4x4(-0.00072869845, 0.046573937, -0.08363707, 0.07867379, 0.038065, 0.01228845, 0.031746328, -0.024448024, -0.065555945, 0.1220454, 0.032151606, -0.022336006, -0.0010816467, -0.026455112, 0.112422734, -0.10285581)); - target1 += float4(0.052450567, 0.10404023, -0.059578225, 0.009724121); + MF4 target1 = { 0.052450567, 0.10404023, -0.059578225, 0.009724121 }; + target1 = MulAdd(a1, MF4x4(-0.028328063, 0.038015317, 0.14893384, 0.10103896, 0.028176744, -0.02067147, -0.10970998, 0.015726602, -0.07402682, -0.075281784, -0.012586929, 0.053476278, 0.14823362, 0.20312984, 0.24213, 0.039118115), target1); + target1 = MulAdd(b1, MF4x4(0.009731573, -0.019011121, 0.016360838, -0.0073153526, 0.14594506, -0.0427664, -0.094225354, -0.013891855, -0.037061375, 0.024959227, -0.12289382, -0.21792257, -0.33579424, 0.052678566, 0.04346115, 0.07943186), target1); + target1 = MulAdd(c1, MF4x4(0.0022269129, 0.013298362, -0.045071404, -0.007918287, 0.010860651, -0.073057, -0.0042394064, 0.03340809, 0.04938919, -0.024218693, -0.008147567, 0.08848061, -0.06840333, 0.10077341, -0.272586, -0.06542769), target1); + target1 = MulAdd(d1, MF4x4(0.15929016, -0.1415167, 0.057084452, 0.06830724, 0.0046992986, 0.068573505, 0.22142749, -0.18493174, -0.1006019, -0.11373546, 0.17520057, -0.12888812, 0.05176946, -0.14703397, -0.20610721, 0.16611591), target1); + target1 = MulAdd(e1, MF4x4(-0.0069309813, 0.22358349, -0.18569049, 0.13456121, -0.21528137, 0.04618922, -0.47261322, -0.09682613, 0.5402922, 0.15818685, 0.05288464, -0.09949312, 0.21833964, 0.06652228, -0.2694682, 0.58216536), target1); + target1 = MulAdd(f1, MF4x4(0.040808782, 0.023110595, 0.12678777, -0.09057271, 0.03159572, 0.044006016, -0.10090222, 0.09940838, -0.08454473, -0.118349984, -0.053009644, 0.24352531, -0.103818566, 0.12536442, -0.17832974, 0.25161982), target1); + target1 = MulAdd(g1, MF4x4(-0.026323501, -0.14911288, -0.0073903934, 0.06902844, 0.07188603, -0.05006621, 0.06539817, -0.048083752, -0.08032579, -0.07449341, -0.015944218, 0.032426495, 0.069349505, -0.07441237, 0.055614363, 0.065174624), target1); + target1 = MulAdd(h1, MF4x4(-0.046432327, -0.051616143, 0.017791865, -0.047294978, 0.025944458, -0.0020909954, 0.083794415, -0.055740435, -0.3720184, 0.06654654, 0.1944378, 0.07806658, 0.00870193, 0.005404396, -0.059417505, -0.06270168), target1); + target1 = MulAdd(i1, MF4x4(-0.011699918, -0.03260685, 0.016413182, -0.02199741, -0.042898953, -0.018734168, -0.12387174, 0.06405199, -0.050764065, 0.07050078, 0.006969675, 0.05508108, -0.079687595, 0.12154926, 0.071177684, 0.046873443), target1); + target1 = MulAdd(a2, MF4x4(-0.2158498, 0.03612371, -0.05268691, -0.065594874, 0.06997431, -0.07327132, -0.03323361, -0.23306306, -0.00011140713, -0.1891967, -0.017328357, 0.15796778, -0.061359044, 0.008202449, -0.031317197, -0.020873578), target1); + target1 = MulAdd(b2, MF4x4(-0.022816254, -0.014594548, 0.0064240466, 0.07976367, -0.0070318123, -0.07651652, -0.111756384, -0.2788498, 0.16634429, -0.1583179, -0.10245271, 0.10480152, 0.345051, -0.07809675, 0.046080578, -0.32139245), target1); + target1 = MulAdd(c2, MF4x4(0.020630263, 0.032152038, 0.0019161701, 0.05435833, 0.078139454, -0.10090956, 0.14244889, 0.017286595, 0.0039871824, -0.026395446, 0.14158171, 0.0010112645, 0.17055373, -0.08093189, -0.049234428, -0.33473247), target1); + target1 = MulAdd(d2, MF4x4(-0.10982378, 0.029386539, -0.15483, -0.04345961, -0.16869037, -0.30862433, 0.060743757, -0.032285906, 0.017884266, -0.09846199, -0.090971105, -0.1693697, -0.063690096, -0.08489718, 0.18247683, -0.19921213), target1); + target1 = MulAdd(e2, MF4x4(0.1898742, 0.22187345, -0.28495324, -0.42578775, 0.12833633, -0.2251503, -0.025917793, 0.6011678, -0.36586264, 0.23302059, -0.072634645, 0.0064221635, 0.56792957, -0.4684677, 0.05015159, 0.30121225), target1); + target1 = MulAdd(f2, MF4x4(0.10837159, 0.14743729, -0.03639783, -0.34797576, -0.18306817, -0.07957882, -0.111433275, 0.30104128, -0.102763996, -0.01020151, 0.016333267, -0.012390819, 0.11835027, -0.12597388, -0.006298998, 0.0513052), target1); + target1 = MulAdd(g2, MF4x4(-0.23662986, 0.23325302, -0.046104953, 0.36488137, 0.06990537, -0.06887873, -0.012611426, -0.02618366, -0.05296669, 0.195254, 0.016366778, 0.01693462, -0.08488424, -0.24656284, -0.035283253, -0.15318634), target1); + target1 = MulAdd(h2, MF4x4(0.061704446, -0.26930714, -0.24598889, 0.27657726, 0.05046488, -0.341884, 0.10704377, -0.15971762, 0.072999336, -0.2005826, -0.05874223, -0.053938035, -0.08284583, -0.22792995, 0.1027033, -0.012932447), target1); + target1 = MulAdd(i2, MF4x4(-0.029079054, 0.14774945, 0.026151389, 0.12380946, 0.08926635, -0.08387116, -0.17018612, -0.09304499, 0.086990625, -0.27579373, 0.003823722, -0.024723161, 0.08762848, -0.10080674, -0.012214886, -0.30239874), target1); + target1 = MulAdd(na1, MF4x4(-0.25756493, 0.2537789, 0.21723714, 0.0017929028, -0.014724892, 0.086692676, 0.11934202, -0.025869334, 0.008618976, -0.0046638376, -0.06863114, -0.07598151, -0.17309345, 0.009138105, -0.099874064, 0.07377463), target1); + target1 = MulAdd(nb1, MF4x4(-0.39971545, 0.16774859, 0.13102596, 0.30735064, -0.060374007, -0.036933452, 0.14408773, 0.06479284, 0.03806265, 0.045560133, 0.043136165, -0.019244662, 0.17573427, -0.11398941, -0.0751098, 0.041702736), target1); + target1 = MulAdd(nc1, MF4x4(-0.074492976, 0.18349282, -0.009050458, 0.0869807, -0.23123743, -0.015426683, -0.14346392, 0.005445149, -0.05322614, 0.10287576, 0.16083732, -0.09557319, -0.13891783, -0.13752605, -0.023572346, 0.13608918), target1); + target1 = MulAdd(nd1, MF4x4(-0.31140685, 0.40130782, 0.07704675, 0.27509958, 0.09711739, -0.18293281, -0.14500841, -0.15334702, 0.098314695, 0.22749798, 0.006017282, -0.013669673, 0.07147038, 0.022289474, -0.036797456, -0.0013958871), target1); + target1 = MulAdd(ne1, MF4x4(0.0547557, -0.03036202, 0.65113044, 0.10668893, 0.304707, -0.1456157, 0.27668485, 0.2279428, -0.42439902, -0.0073047588, 0.045635667, 0.271637, -0.19595222, -0.28107607, 0.3905438, -0.29898256), target1); + target1 = MulAdd(nf1, MF4x4(0.076843366, 0.037181348, 0.08652873, 0.1756985, 0.03728033, -0.22783624, 0.16810594, -0.022009399, 0.16058537, 0.24559903, 0.05266939, -0.13929726, 0.15964857, 0.0013167082, 0.015017631, 0.101646364), target1); + target1 = MulAdd(ng1, MF4x4(-0.3022452, 0.20052882, 0.13433233, 0.04250016, -0.15248592, 0.014216527, -0.23489903, 0.13919333, 0.22891816, -0.0053335144, -0.05567782, -0.12769286, -0.05337762, -0.11429989, -0.00882089, -0.030790573), target1); + target1 = MulAdd(nh1, MF4x4(-0.11763547, 0.1073185, 0.15810886, 0.013149736, -0.028268294, -0.24712053, 0.08592036, 0.075742744, 0.19626461, -0.10880887, -0.22599675, -0.37207767, -0.032548983, -0.011045266, -0.035218395, 0.099996395), target1); + target1 = MulAdd(ni1, MF4x4(0.05631665, 0.029538663, 0.043909863, 0.13720988, 0.10980592, -0.047748722, 0.080308706, -0.06828442, 0.1144396, -0.12510885, -0.067976676, 0.030742755, 0.07134681, -0.06652312, -0.0642328, -0.034490924), target1); + target1 = MulAdd(na2, MF4x4(0.019588284, -0.15197967, -0.16797094, -0.026324488, 0.014429439, -0.028491383, 0.059453625, 0.23443304, 0.02504347, 0.08872467, 0.032782357, -0.085310735, 0.013040259, -0.09837991, 0.073533125, -0.03544458), target1); + target1 = MulAdd(nb2, MF4x4(0.02198588, -0.09614766, 0.024655875, 0.025384603, 0.012162857, 0.065071434, 0.018112874, 0.19828922, -0.33289856, 0.011323505, 0.13696423, 0.31772846, -0.06587399, -0.05569957, -0.16469179, -0.22545892), target1); + target1 = MulAdd(nc2, MF4x4(-0.009093827, 0.086783886, 0.060070645, 0.049957857, 0.041628215, 0.082412794, 0.117729135, -0.178277, 0.08326062, -0.07120824, 0.1788718, 0.050748438, -0.08952197, -0.14609487, 0.05515471, 0.14784457), target1); + target1 = MulAdd(nd2, MF4x4(-0.10823147, -0.05108019, 0.092807196, -0.13899301, 0.19123949, -0.044189975, 0.0030145745, 0.08935499, -0.10338727, 0.01996205, 0.15671325, -0.08229972, 0.05603653, 0.043324884, 0.13562247, -0.11487494), target1); + target1 = MulAdd(ne2, MF4x4(-0.18872134, -0.07302765, 0.030137405, 0.30928415, -0.07689583, 0.045998566, 0.45554903, -0.1653404, 0.14705873, -0.10649596, 0.46833125, 0.17608039, -0.43967086, 0.056812476, -0.17908083, -0.40455228), target1); + target1 = MulAdd(nf2, MF4x4(-0.08093384, 0.032636635, 0.124594346, 0.13655491, 0.16780408, -1.4671803e-05, 0.13044862, -0.397665, -0.013273644, 0.08253894, 0.16302188, -0.052874118, 0.04073075, -0.18063635, -0.00838661, -0.31084144), target1); + target1 = MulAdd(ng2, MF4x4(0.06804371, -0.14755388, -0.12055216, -0.00437858, -0.044694718, 0.22744909, 0.012434794, 0.06245207, 0.00560859, -0.15815294, -0.19711316, 0.07711764, 0.03078979, -0.09560189, 0.10509056, 0.010651465), target1); + target1 = MulAdd(nh2, MF4x4(-0.026342146, 0.13919179, -0.0030414977, 0.06607403, 0.071292974, 0.065464914, -0.027091878, 0.10620255, -0.052090824, 0.06840278, -0.08457357, 0.08867469, 0.2976581, -0.6702739, -0.15472057, -0.3066263), target1); + target1 = MulAdd(ni2, MF4x4(-0.00072869845, 0.046573937, -0.08363707, 0.07867379, 0.038065, 0.01228845, 0.031746328, -0.024448024, -0.065555945, 0.1220454, 0.032151606, -0.022336006, -0.0010816467, -0.026455112, 0.112422734, -0.10285581), target1); - float4 target2 = mul(a1, float4x4(-0.037506457, -0.06573841, -0.087879084, -0.06359248, -0.0017873603, -0.009097742, 0.010108622, 0.026364084, 0.012306545, 0.12607974, -0.088268295, 0.14034338, 0.24951904, 0.0983314, 0.03635719, -0.047059253)); - target2 += mul(b1, float4x4(-0.05570699, 0.11044774, 0.04827364, -0.03185735, -0.032498132, -0.062959515, 0.2933071, 0.22244357, 0.061075654, 0.0064111133, 0.011452209, 0.11576761, 0.13969804, 0.20502032, 0.1114938, 0.022496287)); - target2 += mul(c1, float4x4(-0.054194342, 0.000389916, -0.039589155, -0.018707246, -0.036095835, -0.06873059, -0.077109694, 0.028726012, -0.08820959, -0.109247595, -0.05745309, 0.043230128, 0.033671502, 0.16398554, 0.030398889, -0.17000203)); - target2 += mul(d1, float4x4(-0.09218165, -0.12813722, -0.040984686, -0.016605416, 0.054269493, 0.12971285, -0.013961638, -0.17803082, -0.014683587, 0.2502267, -0.14249405, -0.025687713, -0.097426265, -0.30111355, -0.21776466, 0.008809217)); - target2 += mul(e1, float4x4(0.21033873, 0.15221386, 0.18138756, -0.08248389, -0.10091519, -0.06940753, -0.014009188, -0.3009861, -0.02452202, -0.08800422, -0.36376888, 0.18485394, 0.35076657, -0.13293292, 0.24624826, 0.39373755)); - target2 += mul(f1, float4x4(0.014170062, -0.029623963, 0.057001226, 0.09269898, -0.14630881, -0.16557585, 0.06735037, -0.015008042, -0.27238864, 0.081130914, -0.07869508, 0.098087415, 0.11217335, 0.48223323, 0.18613088, -0.035602476)); - target2 += mul(g1, float4x4(-0.21623239, -0.1125095, -0.09964635, 0.101452544, 0.11877652, 0.13471957, -0.10402355, 0.0077938605, 0.030518647, 0.22309083, -0.2115206, 0.017967062, -0.042780407, 0.099759325, -0.10465051, -0.033807248)); - target2 += mul(h1, float4x4(-0.059608232, 0.06684556, 0.00039066386, 0.08542961, 0.097183906, -0.1868667, 0.07778909, -0.06172202, 0.0021662437, -0.05387577, -0.4077133, -0.028940776, 0.110816136, -0.04154161, 0.030078325, 0.072834246)); - target2 += mul(i1, float4x4(-0.01881586, -0.06384429, -0.054874837, -0.016731417, -0.06570834, -0.13579571, 0.0033891131, -0.059161015, -0.11559389, 0.02149361, -0.08791608, -0.008113861, 0.08313892, -0.07327947, -0.013473171, 0.13254371)); - target2 += mul(a2, float4x4(-0.11458958, -0.08827364, -0.025030116, 0.12626298, 0.0070429775, 0.0337767, 0.051719055, -0.09654129, -0.04867615, -0.03609001, -0.06522421, -0.044131942, -0.048825134, 0.10652733, -0.015310965, -0.07341175)); - target2 += mul(b2, float4x4(0.05782829, 0.014247012, 0.12126171, 0.100055166, 0.24079333, -0.20155986, 0.1640186, -0.12158374, -0.153708, -0.24445893, -0.10536192, 0.12758626, -0.19430119, -0.019024884, -0.080120996, -0.29866305)); - target2 += mul(c2, float4x4(-0.017357074, 0.04390695, 0.12889594, 0.11451521, 0.03333342, -0.16417275, 0.10196121, 0.13059081, 0.09948873, 0.15007107, 0.22664218, 0.35449567, -0.089776486, 0.025239054, 0.12463201, -0.13109131)); - target2 += mul(d2, float4x4(0.064875744, 0.40551752, 0.11903257, 0.14822967, 0.14993542, -0.12758526, 0.23159283, -0.06080246, -0.084577255, 0.14307548, -0.02186462, 0.05793564, -0.050965074, 0.23895216, -0.07796932, -0.1624384)); - target2 += mul(e2, float4x4(-0.15942748, 0.07191155, 0.42204422, 0.35219797, 0.23286703, -0.283381, -0.2749432, 0.25922084, 0.10494953, 0.14575887, -0.19649154, -0.14563714, -0.03709703, 0.023375817, -0.05610175, -0.32548484)); - target2 += mul(f2, float4x4(-0.04872624, -0.3592348, -0.027413938, 0.0836858, 0.046842758, -0.35193914, 0.06154142, 0.05559191, -0.22538327, -0.097689696, -0.21317257, -0.033945527, -0.23628096, -0.016477302, 0.027297588, -0.04105733)); - target2 += mul(g2, float4x4(0.11543502, -0.043297376, 0.118703, 0.15013209, 0.03191795, 0.014122794, 0.05156918, 0.023102578, 0.0808462, -0.06445798, 0.15860644, -0.062393136, -0.018691704, -0.00032888897, 0.01196705, -0.025045555)); - target2 += mul(h2, float4x4(0.08301664, 0.12298539, 0.20151077, 0.2993159, 0.16968682, -0.18196446, -0.13322797, -0.13693243, -0.0048389523, -0.057406515, 0.21409932, -0.060822334, -0.08554752, -0.19363636, -0.35241908, -0.32256603)); - target2 += mul(i2, float4x4(-0.0523748, 0.17082025, 0.08556144, 0.19181536, -0.2445756, -0.3616732, -0.01641404, -0.078599006, 0.23907976, 0.025989126, 0.07574993, -0.06859337, -0.06667767, -0.022847861, -0.037942342, -0.21112117)); - target2 += mul(na1, float4x4(0.15098672, 0.024212115, -0.19068481, -0.22606348, -0.15221487, -0.032165635, -0.06244531, -0.043535717, -0.07398802, -0.06088507, -0.013834592, -0.10145823, 0.06901983, -0.0862135, -0.05545454, 0.15514566)); - target2 += mul(nb1, float4x4(0.044767097, -0.07583697, -0.17739761, -0.25538698, 0.0966659, -0.0013492911, -0.23315248, -0.21652249, -0.14381947, 0.017784966, -0.15960035, -0.13297895, 0.009810349, -0.041348267, 0.05443229, 0.17781278)); - target2 += mul(nc1, float4x4(-0.0052824756, 0.087268956, -0.022167318, -0.09450279, 0.1254372, 0.075806946, 0.028893303, -0.09019378, 0.03488572, 0.046265777, 0.026162563, 0.003914548, -0.0632334, -0.19494742, -0.03602023, 0.113897055)); - target2 += mul(nd1, float4x4(-0.11311528, 0.2616239, 0.12303548, 0.13427438, -0.26537886, 0.015112677, -0.03641703, -0.014114427, -0.023280613, 0.03626403, 0.12833157, 0.19168468, 0.2119137, -0.02374797, 0.117919676, 0.07794395)); - target2 += mul(ne1, float4x4(-0.13746078, 0.25739196, 0.008431936, -0.053867325, -0.13228695, -0.20661803, 0.026474724, 0.3205188, -0.41819036, 0.42812085, 0.17249924, -0.15810613, 0.39602605, -0.10873597, 0.1457145, -0.060503867)); - target2 += mul(nf1, float4x4(0.03706167, -0.036211733, 0.06519942, -0.2123978, 0.019934088, 0.17494182, -0.017252771, -0.067341134, -0.15416612, -0.114118524, -0.00028491023, -0.08172238, -0.11722721, -0.2647645, 0.13316637, 0.13562322)); - target2 += mul(ng1, float4x4(0.11832847, 0.22822993, 0.020318847, 0.0734738, -0.025950216, -0.072782144, 0.11133989, 0.18845533, -0.004584898, -0.10486471, 0.054522812, -0.14136603, 0.01940983, -0.039433163, 0.008390286, 0.013686628)); - target2 += mul(nh1, float4x4(-0.042335663, 0.0035399816, -0.1813205, -0.25639042, 0.1042524, 0.07707001, -0.04922454, 0.18140413, -0.22322963, 0.030809738, -0.11041754, -0.040288754, 0.09431559, -0.08017892, -0.18317147, -0.019331435)); - target2 += mul(ni1, float4x4(-0.061776266, 0.0069793356, 0.019964112, -0.14504445, -0.00070097746, -0.027107855, 0.030182542, -0.05625612, -0.04958449, 0.123165295, 0.0013953283, 0.017912487, 0.031161329, -0.31798717, 0.018331604, 0.030411277)); - target2 += mul(na2, float4x4(-0.0530594, -0.07933117, 0.024755973, 0.004785411, 0.045512546, 0.12833083, 0.023195961, -0.018028054, 0.014223835, 0.102213494, 0.052169293, -0.020509718, 0.017905682, 0.021354724, -0.0410789, -0.066523656)); - target2 += mul(nb2, float4x4(0.017061293, -0.08770806, -0.04889939, 0.01825556, -0.03228951, -0.06838898, -0.09249373, 0.18103507, 0.087000825, 0.04175679, -0.09305919, -0.2792485, 0.03405797, 0.062147446, -0.04757652, -0.021603985)); - target2 += mul(nc2, float4x4(-0.04115162, 0.02547615, 0.07033616, 0.09814065, 0.2597489, -0.0335038, 0.14097647, 0.047022782, 0.1374654, -0.27390274, 0.02080897, -0.15251215, -0.025431091, 0.08871465, -0.22243279, -0.07792812)); - target2 += mul(nd2, float4x4(-0.061674852, -0.051326606, -0.04885301, 0.08548189, -0.07100394, 0.044875987, -0.19810183, -0.09841128, -0.06628199, -0.041564234, 0.1111919, -0.044448826, 0.06980301, 0.00046094303, -0.045978926, -0.20736355)); - target2 += mul(ne2, float4x4(-0.18405268, -0.28115878, -0.33536536, 0.0753763, 0.028309148, 0.0014874876, 0.28369543, -0.2133985, 0.16520546, 0.29562506, 0.109781906, 0.028433772, -0.02691105, -0.39038795, -0.12942268, -0.080103286)); - target2 += mul(nf2, float4x4(-0.05387814, -0.04672615, 0.046064686, 0.2791977, 0.11359623, -0.204098, -0.018091407, 0.13550591, 0.04216003, -0.1631328, -0.043013666, -0.045698896, 0.032403514, 0.010206319, -0.25789943, -0.36328712)); - target2 += mul(ng2, float4x4(0.11280466, 0.11671405, -0.02122692, 0.021664057, -0.07836575, 0.014747725, 0.030007286, -0.10128616, -0.13695373, -0.10353946, -0.043571353, 0.05922437, -0.11293257, 0.0828006, -0.07322761, -0.08197273)); - target2 += mul(nh2, float4x4(-0.0010509897, -0.1674067, 0.08191839, 0.056608744, 0.061343428, 0.19574693, 0.05302967, -0.006813754, -0.016064182, 0.22949885, -0.06631832, 0.034382205, 0.12674272, 0.06583508, 0.19319807, 0.011400221)); - target2 += mul(ni2, float4x4(-0.032175347, -0.021227444, -0.027698517, 0.067299634, 0.23929007, 0.20669897, 0.004856941, 0.0009404045, 0.04919408, 0.020296812, 0.012571405, -0.16185577, -0.012276781, 0.16609742, -0.15718406, -0.20344186)); - target2 += float4(0.022815697, 0.012251767, 0.045309987, -0.0879881); + MF4 target2 = { 0.022815697, 0.012251767, 0.045309987, -0.0879881 }; + target2 = MulAdd(a1, MF4x4(-0.037506457, -0.06573841, -0.087879084, -0.06359248, -0.0017873603, -0.009097742, 0.010108622, 0.026364084, 0.012306545, 0.12607974, -0.088268295, 0.14034338, 0.24951904, 0.0983314, 0.03635719, -0.047059253), target2); + target2 = MulAdd(b1, MF4x4(-0.05570699, 0.11044774, 0.04827364, -0.03185735, -0.032498132, -0.062959515, 0.2933071, 0.22244357, 0.061075654, 0.0064111133, 0.011452209, 0.11576761, 0.13969804, 0.20502032, 0.1114938, 0.022496287), target2); + target2 = MulAdd(c1, MF4x4(-0.054194342, 0.000389916, -0.039589155, -0.018707246, -0.036095835, -0.06873059, -0.077109694, 0.028726012, -0.08820959, -0.109247595, -0.05745309, 0.043230128, 0.033671502, 0.16398554, 0.030398889, -0.17000203), target2); + target2 = MulAdd(d1, MF4x4(-0.09218165, -0.12813722, -0.040984686, -0.016605416, 0.054269493, 0.12971285, -0.013961638, -0.17803082, -0.014683587, 0.2502267, -0.14249405, -0.025687713, -0.097426265, -0.30111355, -0.21776466, 0.008809217), target2); + target2 = MulAdd(e1, MF4x4(0.21033873, 0.15221386, 0.18138756, -0.08248389, -0.10091519, -0.06940753, -0.014009188, -0.3009861, -0.02452202, -0.08800422, -0.36376888, 0.18485394, 0.35076657, -0.13293292, 0.24624826, 0.39373755), target2); + target2 = MulAdd(f1, MF4x4(0.014170062, -0.029623963, 0.057001226, 0.09269898, -0.14630881, -0.16557585, 0.06735037, -0.015008042, -0.27238864, 0.081130914, -0.07869508, 0.098087415, 0.11217335, 0.48223323, 0.18613088, -0.035602476), target2); + target2 = MulAdd(g1, MF4x4(-0.21623239, -0.1125095, -0.09964635, 0.101452544, 0.11877652, 0.13471957, -0.10402355, 0.0077938605, 0.030518647, 0.22309083, -0.2115206, 0.017967062, -0.042780407, 0.099759325, -0.10465051, -0.033807248), target2); + target2 = MulAdd(h1, MF4x4(-0.059608232, 0.06684556, 0.00039066386, 0.08542961, 0.097183906, -0.1868667, 0.07778909, -0.06172202, 0.0021662437, -0.05387577, -0.4077133, -0.028940776, 0.110816136, -0.04154161, 0.030078325, 0.072834246), target2); + target2 = MulAdd(i1, MF4x4(-0.01881586, -0.06384429, -0.054874837, -0.016731417, -0.06570834, -0.13579571, 0.0033891131, -0.059161015, -0.11559389, 0.02149361, -0.08791608, -0.008113861, 0.08313892, -0.07327947, -0.013473171, 0.13254371), target2); + target2 = MulAdd(a2, MF4x4(-0.11458958, -0.08827364, -0.025030116, 0.12626298, 0.0070429775, 0.0337767, 0.051719055, -0.09654129, -0.04867615, -0.03609001, -0.06522421, -0.044131942, -0.048825134, 0.10652733, -0.015310965, -0.07341175), target2); + target2 = MulAdd(b2, MF4x4(0.05782829, 0.014247012, 0.12126171, 0.100055166, 0.24079333, -0.20155986, 0.1640186, -0.12158374, -0.153708, -0.24445893, -0.10536192, 0.12758626, -0.19430119, -0.019024884, -0.080120996, -0.29866305), target2); + target2 = MulAdd(c2, MF4x4(-0.017357074, 0.04390695, 0.12889594, 0.11451521, 0.03333342, -0.16417275, 0.10196121, 0.13059081, 0.09948873, 0.15007107, 0.22664218, 0.35449567, -0.089776486, 0.025239054, 0.12463201, -0.13109131), target2); + target2 = MulAdd(d2, MF4x4(0.064875744, 0.40551752, 0.11903257, 0.14822967, 0.14993542, -0.12758526, 0.23159283, -0.06080246, -0.084577255, 0.14307548, -0.02186462, 0.05793564, -0.050965074, 0.23895216, -0.07796932, -0.1624384), target2); + target2 = MulAdd(e2, MF4x4(-0.15942748, 0.07191155, 0.42204422, 0.35219797, 0.23286703, -0.283381, -0.2749432, 0.25922084, 0.10494953, 0.14575887, -0.19649154, -0.14563714, -0.03709703, 0.023375817, -0.05610175, -0.32548484), target2); + target2 = MulAdd(f2, MF4x4(-0.04872624, -0.3592348, -0.027413938, 0.0836858, 0.046842758, -0.35193914, 0.06154142, 0.05559191, -0.22538327, -0.097689696, -0.21317257, -0.033945527, -0.23628096, -0.016477302, 0.027297588, -0.04105733), target2); + target2 = MulAdd(g2, MF4x4(0.11543502, -0.043297376, 0.118703, 0.15013209, 0.03191795, 0.014122794, 0.05156918, 0.023102578, 0.0808462, -0.06445798, 0.15860644, -0.062393136, -0.018691704, -0.00032888897, 0.01196705, -0.025045555), target2); + target2 = MulAdd(h2, MF4x4(0.08301664, 0.12298539, 0.20151077, 0.2993159, 0.16968682, -0.18196446, -0.13322797, -0.13693243, -0.0048389523, -0.057406515, 0.21409932, -0.060822334, -0.08554752, -0.19363636, -0.35241908, -0.32256603), target2); + target2 = MulAdd(i2, MF4x4(-0.0523748, 0.17082025, 0.08556144, 0.19181536, -0.2445756, -0.3616732, -0.01641404, -0.078599006, 0.23907976, 0.025989126, 0.07574993, -0.06859337, -0.06667767, -0.022847861, -0.037942342, -0.21112117), target2); + target2 = MulAdd(na1, MF4x4(0.15098672, 0.024212115, -0.19068481, -0.22606348, -0.15221487, -0.032165635, -0.06244531, -0.043535717, -0.07398802, -0.06088507, -0.013834592, -0.10145823, 0.06901983, -0.0862135, -0.05545454, 0.15514566), target2); + target2 = MulAdd(nb1, MF4x4(0.044767097, -0.07583697, -0.17739761, -0.25538698, 0.0966659, -0.0013492911, -0.23315248, -0.21652249, -0.14381947, 0.017784966, -0.15960035, -0.13297895, 0.009810349, -0.041348267, 0.05443229, 0.17781278), target2); + target2 = MulAdd(nc1, MF4x4(-0.0052824756, 0.087268956, -0.022167318, -0.09450279, 0.1254372, 0.075806946, 0.028893303, -0.09019378, 0.03488572, 0.046265777, 0.026162563, 0.003914548, -0.0632334, -0.19494742, -0.03602023, 0.113897055), target2); + target2 = MulAdd(nd1, MF4x4(-0.11311528, 0.2616239, 0.12303548, 0.13427438, -0.26537886, 0.015112677, -0.03641703, -0.014114427, -0.023280613, 0.03626403, 0.12833157, 0.19168468, 0.2119137, -0.02374797, 0.117919676, 0.07794395), target2); + target2 = MulAdd(ne1, MF4x4(-0.13746078, 0.25739196, 0.008431936, -0.053867325, -0.13228695, -0.20661803, 0.026474724, 0.3205188, -0.41819036, 0.42812085, 0.17249924, -0.15810613, 0.39602605, -0.10873597, 0.1457145, -0.060503867), target2); + target2 = MulAdd(nf1, MF4x4(0.03706167, -0.036211733, 0.06519942, -0.2123978, 0.019934088, 0.17494182, -0.017252771, -0.067341134, -0.15416612, -0.114118524, -0.00028491023, -0.08172238, -0.11722721, -0.2647645, 0.13316637, 0.13562322), target2); + target2 = MulAdd(ng1, MF4x4(0.11832847, 0.22822993, 0.020318847, 0.0734738, -0.025950216, -0.072782144, 0.11133989, 0.18845533, -0.004584898, -0.10486471, 0.054522812, -0.14136603, 0.01940983, -0.039433163, 0.008390286, 0.013686628), target2); + target2 = MulAdd(nh1, MF4x4(-0.042335663, 0.0035399816, -0.1813205, -0.25639042, 0.1042524, 0.07707001, -0.04922454, 0.18140413, -0.22322963, 0.030809738, -0.11041754, -0.040288754, 0.09431559, -0.08017892, -0.18317147, -0.019331435), target2); + target2 = MulAdd(ni1, MF4x4(-0.061776266, 0.0069793356, 0.019964112, -0.14504445, -0.00070097746, -0.027107855, 0.030182542, -0.05625612, -0.04958449, 0.123165295, 0.0013953283, 0.017912487, 0.031161329, -0.31798717, 0.018331604, 0.030411277), target2); + target2 = MulAdd(na2, MF4x4(-0.0530594, -0.07933117, 0.024755973, 0.004785411, 0.045512546, 0.12833083, 0.023195961, -0.018028054, 0.014223835, 0.102213494, 0.052169293, -0.020509718, 0.017905682, 0.021354724, -0.0410789, -0.066523656), target2); + target2 = MulAdd(nb2, MF4x4(0.017061293, -0.08770806, -0.04889939, 0.01825556, -0.03228951, -0.06838898, -0.09249373, 0.18103507, 0.087000825, 0.04175679, -0.09305919, -0.2792485, 0.03405797, 0.062147446, -0.04757652, -0.021603985), target2); + target2 = MulAdd(nc2, MF4x4(-0.04115162, 0.02547615, 0.07033616, 0.09814065, 0.2597489, -0.0335038, 0.14097647, 0.047022782, 0.1374654, -0.27390274, 0.02080897, -0.15251215, -0.025431091, 0.08871465, -0.22243279, -0.07792812), target2); + target2 = MulAdd(nd2, MF4x4(-0.061674852, -0.051326606, -0.04885301, 0.08548189, -0.07100394, 0.044875987, -0.19810183, -0.09841128, -0.06628199, -0.041564234, 0.1111919, -0.044448826, 0.06980301, 0.00046094303, -0.045978926, -0.20736355), target2); + target2 = MulAdd(ne2, MF4x4(-0.18405268, -0.28115878, -0.33536536, 0.0753763, 0.028309148, 0.0014874876, 0.28369543, -0.2133985, 0.16520546, 0.29562506, 0.109781906, 0.028433772, -0.02691105, -0.39038795, -0.12942268, -0.080103286), target2); + target2 = MulAdd(nf2, MF4x4(-0.05387814, -0.04672615, 0.046064686, 0.2791977, 0.11359623, -0.204098, -0.018091407, 0.13550591, 0.04216003, -0.1631328, -0.043013666, -0.045698896, 0.032403514, 0.010206319, -0.25789943, -0.36328712), target2); + target2 = MulAdd(ng2, MF4x4(0.11280466, 0.11671405, -0.02122692, 0.021664057, -0.07836575, 0.014747725, 0.030007286, -0.10128616, -0.13695373, -0.10353946, -0.043571353, 0.05922437, -0.11293257, 0.0828006, -0.07322761, -0.08197273), target2); + target2 = MulAdd(nh2, MF4x4(-0.0010509897, -0.1674067, 0.08191839, 0.056608744, 0.061343428, 0.19574693, 0.05302967, -0.006813754, -0.016064182, 0.22949885, -0.06631832, 0.034382205, 0.12674272, 0.06583508, 0.19319807, 0.011400221), target2); + target2 = MulAdd(ni2, MF4x4(-0.032175347, -0.021227444, -0.027698517, 0.067299634, 0.23929007, 0.20669897, 0.004856941, 0.0009404045, 0.04919408, 0.020296812, 0.012571405, -0.16185577, -0.012276781, 0.16609742, -0.15718406, -0.20344186), target2); conv2d_3_tf[gxy] = target1; conv2d_3_tf1[gxy] = target2; @@ -684,25 +687,25 @@ void Pass5(uint2 blockStart, uint3 threadId) { // [ a, d, g ] // [ b, e, h ] // [ c, f, i ] - float4 a1 = conv2d_3_tf.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0); - float4 b1 = conv2d_3_tf.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); - float4 c1 = conv2d_3_tf.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); - float4 d1 = conv2d_3_tf.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); - float4 e1 = conv2d_3_tf.SampleLevel(sam, pos, 0); - float4 f1 = conv2d_3_tf.SampleLevel(sam, pos + float2(0, inputPt.y), 0); - float4 g1 = conv2d_3_tf.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); - float4 h1 = conv2d_3_tf.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); - float4 i1 = conv2d_3_tf.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0); - - float4 na1 = max(-a1, 0); - float4 nb1 = max(-b1, 0); - float4 nc1 = max(-c1, 0); - float4 nd1 = max(-d1, 0); - float4 ne1 = max(-e1, 0); - float4 nf1 = max(-f1, 0); - float4 ng1 = max(-g1, 0); - float4 nh1 = max(-h1, 0); - float4 ni1 = max(-i1, 0); + MF4 a1 = conv2d_3_tf.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0); + MF4 b1 = conv2d_3_tf.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); + MF4 c1 = conv2d_3_tf.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); + MF4 d1 = conv2d_3_tf.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); + MF4 e1 = conv2d_3_tf.SampleLevel(sam, pos, 0); + MF4 f1 = conv2d_3_tf.SampleLevel(sam, pos + float2(0, inputPt.y), 0); + MF4 g1 = conv2d_3_tf.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); + MF4 h1 = conv2d_3_tf.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); + MF4 i1 = conv2d_3_tf.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0); + + MF4 na1 = max(-a1, 0); + MF4 nb1 = max(-b1, 0); + MF4 nc1 = max(-c1, 0); + MF4 nd1 = max(-d1, 0); + MF4 ne1 = max(-e1, 0); + MF4 nf1 = max(-f1, 0); + MF4 ng1 = max(-g1, 0); + MF4 nh1 = max(-h1, 0); + MF4 ni1 = max(-i1, 0); a1 = max(a1, 0); b1 = max(b1, 0); @@ -714,25 +717,25 @@ void Pass5(uint2 blockStart, uint3 threadId) { h1 = max(h1, 0); i1 = max(i1, 0); - float4 a2 = conv2d_3_tf1.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0); - float4 b2 = conv2d_3_tf1.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); - float4 c2 = conv2d_3_tf1.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); - float4 d2 = conv2d_3_tf1.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); - float4 e2 = conv2d_3_tf1.SampleLevel(sam, pos, 0); - float4 f2 = conv2d_3_tf1.SampleLevel(sam, pos + float2(0, inputPt.y), 0); - float4 g2 = conv2d_3_tf1.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); - float4 h2 = conv2d_3_tf1.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); - float4 i2 = conv2d_3_tf1.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0); - - float4 na2 = max(-a2, 0); - float4 nb2 = max(-b2, 0); - float4 nc2 = max(-c2, 0); - float4 nd2 = max(-d2, 0); - float4 ne2 = max(-e2, 0); - float4 nf2 = max(-f2, 0); - float4 ng2 = max(-g2, 0); - float4 nh2 = max(-h2, 0); - float4 ni2 = max(-i2, 0); + MF4 a2 = conv2d_3_tf1.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0); + MF4 b2 = conv2d_3_tf1.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); + MF4 c2 = conv2d_3_tf1.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); + MF4 d2 = conv2d_3_tf1.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); + MF4 e2 = conv2d_3_tf1.SampleLevel(sam, pos, 0); + MF4 f2 = conv2d_3_tf1.SampleLevel(sam, pos + float2(0, inputPt.y), 0); + MF4 g2 = conv2d_3_tf1.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); + MF4 h2 = conv2d_3_tf1.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); + MF4 i2 = conv2d_3_tf1.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0); + + MF4 na2 = max(-a2, 0); + MF4 nb2 = max(-b2, 0); + MF4 nc2 = max(-c2, 0); + MF4 nd2 = max(-d2, 0); + MF4 ne2 = max(-e2, 0); + MF4 nf2 = max(-f2, 0); + MF4 ng2 = max(-g2, 0); + MF4 nh2 = max(-h2, 0); + MF4 ni2 = max(-i2, 0); a2 = max(a2, 0); b2 = max(b2, 0); @@ -744,81 +747,81 @@ void Pass5(uint2 blockStart, uint3 threadId) { h2 = max(h2, 0); i2 = max(i2, 0); - float4 target1 = mul(a1, float4x4(0.010501252, -0.046741538, -0.0017120431, -0.04840009, 0.20547974, 0.3366821, -0.10182207, 0.17451541, -0.03404171, -0.15138055, 0.16771653, -0.07168161, 0.102572344, 0.08266354, 0.20205829, 0.13429944)); - target1 += mul(b1, float4x4(0.05584234, 0.06844309, 0.025430907, 0.124140054, 0.36385667, 0.12099467, -0.41671994, 0.085477844, 0.19748127, -0.21473993, 0.005037813, -0.3973761, 0.04669592, -0.100342326, -0.09403772, -0.034248166)); - target1 += mul(c1, float4x4(-0.17654696, 0.009085064, 0.028360577, 0.033909567, 0.09377573, 0.27896938, 0.103994116, 0.0008595595, 0.064523555, 0.040994007, -0.06337235, 0.05662917, 0.0037455747, 0.017608117, -0.14610702, 1.2175746e-05)); - target1 += mul(d1, float4x4(-0.04631749, -0.14251712, -0.16420849, -0.16259338, 0.46187812, 0.17576592, 0.00049142196, 0.029193122, -0.003925961, -0.11218227, 0.007026237, -0.20583045, -0.0010964901, 0.19355829, 0.2221649, 0.1187224)); - target1 += mul(e1, float4x4(-0.041567978, -0.31510913, 0.01618704, 0.04979329, 0.101294376, 0.16356954, 0.21361789, 0.20735294, 0.1900854, -0.4151726, -0.30471593, -0.59483325, 0.033624128, 0.11495109, -0.15194787, 0.4920959)); - target1 += mul(f1, float4x4(-0.18910064, -0.06516878, -0.20508374, -0.063928686, 0.7289614, 0.26674315, 0.2929481, 0.4026098, -0.033123735, -0.090371035, -0.029094126, -0.15197921, -0.08723726, -0.060160585, -0.07908409, -0.08826931)); - target1 += mul(g1, float4x4(-0.08321312, -0.09749648, -0.08783197, -0.23072585, 0.24343425, 0.10888949, 0.17419606, 0.04136083, 0.0066000987, -0.06112787, -0.12176007, -0.20907228, -0.0008522778, -0.054704696, -0.07197735, -0.0877179)); - target1 += mul(h1, float4x4(-0.40559706, -0.3801705, 0.05970925, -0.6157092, 0.28944594, 0.1252121, 0.403247, -0.122819394, -0.096336536, -0.2324694, 0.05980106, -0.19970767, -0.16646989, -0.10164633, -0.09282806, -0.08897996)); - target1 += mul(i1, float4x4(-0.14336498, -0.12967408, -0.016268672, -0.021431219, -0.0850116, 0.37105832, -0.04093888, 0.08540873, 0.035717323, -0.07282701, -0.009123291, -0.0036565473, -0.02508944, -0.087611906, 0.03604423, -0.00089080486)); - target1 += mul(a2, float4x4(0.1373875, 0.05283984, -0.11992707, 0.102294855, 0.3305128, 0.044920854, 0.31622922, -0.04711731, 0.001336024, 0.022799017, -0.062343203, 0.017140022, -0.07556853, -0.12864219, -0.25721326, -0.20741239)); - target1 += mul(b2, float4x4(0.22062224, 0.09266222, 0.22466063, 0.18527372, -0.06940306, 0.1317168, 0.019784274, -0.07422301, 0.04061616, 0.0022494853, 0.21723995, 0.24732308, 0.14088804, 0.0116154915, 0.102064446, 0.020701224)); - target1 += mul(c2, float4x4(-0.025154127, 0.045180723, -0.05877639, -0.099235624, 0.13630918, 0.24653725, -0.05723323, -0.022995364, -0.10826078, 0.049667366, 0.12618053, 0.1557369, 0.037487056, -0.22215757, 0.005912914, -0.20549043)); - target1 += mul(d2, float4x4(0.09641055, 0.098845296, -0.08192096, -0.03691394, -0.18450394, 0.29955688, -0.082493715, -0.06268039, -0.0754319, 0.21018648, -0.016580105, -0.1810546, 0.13857666, -0.0327626, 0.03161804, -0.32589525)); - target1 += mul(e2, float4x4(-0.18272439, -0.17595461, 0.047229152, 0.14596708, 0.40453747, 0.5658558, -0.17969102, 0.21557859, -0.34232348, 0.40355968, 0.53874254, 0.0012561383, 0.28154096, -0.06745097, -0.13049632, 0.42997465)); - target1 += mul(f2, float4x4(0.081179485, -0.0041369614, -0.12001932, -0.102107175, -0.050293338, 0.29165673, 0.08062538, 0.22925815, 0.19389379, 0.28463286, -0.057207666, 0.23133168, -0.07545728, 0.06729763, -0.103593476, 0.014468794)); - target1 += mul(g2, float4x4(0.069821335, -0.010299579, 0.069458775, 0.03894593, -0.054688405, 0.32758355, 0.13935772, 0.37506017, 0.24083133, -0.06105339, 0.25636867, 0.09627044, 0.08939188, 0.006728639, 0.10629504, 0.07887502)); - target1 += mul(h2, float4x4(0.10563019, 0.077379815, 0.045456886, 0.09303406, 0.11326298, 0.28762257, -0.35142374, 0.10285745, 0.28762287, 0.3592446, 0.23816557, 0.22676824, 0.030372012, -0.028023086, -0.30956736, -0.27588373)); - target1 += mul(i2, float4x4(0.110499, 0.009828844, 0.086689755, 0.1839749, 0.16656482, 0.083707325, 0.19506347, -0.01547141, 0.13804145, 0.2206598, -0.16484791, -0.0021595939, -0.06844408, -0.07861768, 0.040771082, -0.13347322)); - target1 += mul(na1, float4x4(0.02667995, 0.019265587, -0.18211095, -0.102116466, -0.042541366, -0.07700912, -0.020587347, -0.03532171, 0.14816427, -0.1672272, -0.17522137, -0.04657808, 0.013430233, -0.0021270285, 0.109880306, 0.004838907)); - target1 += mul(nb1, float4x4(0.14285165, -0.1364756, 0.017568532, -0.27690783, -0.015461915, 0.045437083, 0.018187419, 0.12473493, 0.17991658, -0.15642665, 0.10009151, -0.19040193, 0.1734127, -0.13817501, 0.0710856, -0.12921426)); - target1 += mul(nc1, float4x4(-0.14114712, -0.18893671, 0.16121174, 0.035988737, 0.17872387, -0.106395856, -0.23183517, 0.012380416, 0.043066982, -0.28539032, -0.049011275, -0.21125022, -0.11976977, -0.015564958, 0.18880925, -0.0034812456)); - target1 += mul(nd1, float4x4(-0.05894521, 0.17266215, -0.0458901, 0.08049924, 0.0156061025, -0.0047465423, 0.09714626, 0.045990974, -0.08786066, -0.37803304, -0.19629405, -0.08546443, 0.014874948, 0.16931784, 0.24799919, 0.06316819)); - target1 += mul(ne1, float4x4(-0.28352743, 0.29973608, -0.014540065, 0.2865005, 0.048086923, 0.18976144, 0.22969759, 0.1643124, -0.11259408, -0.107592925, 0.184308, 0.30998367, -1.0860825, -0.29118305, -0.51242536, -0.38492215)); - target1 += mul(nf1, float4x4(-0.17199941, -0.14274743, -0.14213641, -0.1691383, -0.17294803, -0.013992068, -0.12135059, 0.082377024, -0.11255549, -0.124990575, -0.32526177, -0.08199375, -0.25591666, 0.1882329, 0.07895415, 0.22012262)); - target1 += mul(ng1, float4x4(0.026025832, -0.07267515, 0.09738688, 0.074536435, -0.060470507, -0.037861936, 0.0507819, -0.054857653, 0.0043173633, -0.18107842, -0.02996759, 0.04072402, -0.012617744, 0.061665237, 0.0013981885, 0.08679919)); - target1 += mul(nh1, float4x4(0.27913737, 0.39656082, 0.1579819, 0.2774727, -0.007996453, 0.08704765, -0.016933938, 0.07066135, 0.12361742, -0.20802726, -0.13705719, -0.18794124, 0.037409827, -0.03351758, -0.2970392, -0.11001984)); - target1 += mul(ni1, float4x4(-0.027419567, 0.043236237, -0.19843115, -0.056489736, -0.017010912, 0.070949584, -0.14881176, -0.0780235, 0.0039477753, -0.16772608, -0.009547604, -0.14060417, 0.0103197545, 0.07129672, 0.034949142, 0.014112084)); - target1 += mul(na2, float4x4(-0.06467971, 0.084101565, 0.26296136, 0.08878442, -0.11232121, -0.054373942, -0.17263442, 0.046408508, 0.032239515, 0.042490713, 0.036938053, -0.034339923, -0.07139367, 0.032505415, 0.0045828503, 0.24428385)); - target1 += mul(nb2, float4x4(0.053585388, -0.08175568, -0.04787236, 0.06061965, -0.0740297, 0.11113596, -0.12467945, 0.08229154, -0.01941305, 0.12903687, 0.09095716, -0.13062255, -0.0102068605, 0.107291475, 0.030279635, 0.07464777)); - target1 += mul(nc2, float4x4(0.11041978, -0.0123585425, 0.11147018, 0.07380536, 0.06632908, 0.011784447, 0.029638765, -0.01566135, 0.009105331, 0.05252663, -0.17972581, 0.01210126, -0.10749957, -0.028144639, -0.105761215, 0.083784826)); - target1 += mul(nd2, float4x4(-0.058018316, 0.15083058, 0.2725673, 0.024263225, -0.067711554, 0.051117413, -0.31144425, -0.15761986, 0.017503206, -0.14361219, -0.38261738, -0.20354146, -0.04211545, 0.12921454, -0.01319619, 0.35809723)); - target1 += mul(ne2, float4x4(-0.107978396, 0.3230084, -0.13806303, 0.12903036, 0.039864987, -0.006241628, 0.18701774, -0.10785807, 0.30056882, -0.3092082, -0.4273322, 0.3784662, -0.026107281, 0.23165871, 0.35258314, -0.06654702)); - target1 += mul(nf2, float4x4(-0.15840323, 0.15210885, 0.04086692, 0.19169305, 0.11847602, 0.0009038581, 0.095951624, 0.043941673, 0.1512248, 0.0749449, -0.027045414, -0.19729601, 0.08265063, -0.045218006, -0.10732461, 0.05197371)); - target1 += mul(ng2, float4x4(0.13637526, 0.28841978, 0.10298119, -0.005948496, 0.020897362, -0.02186902, -0.16207378, -0.021084815, 0.029192554, 0.07076991, -0.07210881, -0.06752328, 0.0006557475, 0.08986717, -0.29430988, 0.21411087)); - target1 += mul(nh2, float4x4(0.18667863, 0.3117322, -0.0859705, -0.038189936, 0.10214859, -0.11244034, 0.2680223, -0.072901204, -0.07434324, -0.17855306, 0.23134363, -0.055360887, -0.020968167, 0.0858459, 0.078975916, 0.13254759)); - target1 += mul(ni2, float4x4(-0.15676941, 0.03476677, -0.09922334, -0.15847856, -0.0033982224, 0.020932984, 0.12874377, 0.048792202, 0.06521213, 0.12456798, 0.15958112, 0.15981804, 0.07657683, 0.1759313, 0.012727211, 0.120304115)); - target1 += float4(0.08911729, -0.027969634, -0.010653148, -0.08001697); + MF4 target1 = { 0.08911729, -0.027969634, -0.010653148, -0.08001697 }; + target1 = MulAdd(a1, MF4x4(0.010501252, -0.046741538, -0.0017120431, -0.04840009, 0.20547974, 0.3366821, -0.10182207, 0.17451541, -0.03404171, -0.15138055, 0.16771653, -0.07168161, 0.102572344, 0.08266354, 0.20205829, 0.13429944), target1); + target1 = MulAdd(b1, MF4x4(0.05584234, 0.06844309, 0.025430907, 0.124140054, 0.36385667, 0.12099467, -0.41671994, 0.085477844, 0.19748127, -0.21473993, 0.005037813, -0.3973761, 0.04669592, -0.100342326, -0.09403772, -0.034248166), target1); + target1 = MulAdd(c1, MF4x4(-0.17654696, 0.009085064, 0.028360577, 0.033909567, 0.09377573, 0.27896938, 0.103994116, 0.0008595595, 0.064523555, 0.040994007, -0.06337235, 0.05662917, 0.0037455747, 0.017608117, -0.14610702, 1.2175746e-05), target1); + target1 = MulAdd(d1, MF4x4(-0.04631749, -0.14251712, -0.16420849, -0.16259338, 0.46187812, 0.17576592, 0.00049142196, 0.029193122, -0.003925961, -0.11218227, 0.007026237, -0.20583045, -0.0010964901, 0.19355829, 0.2221649, 0.1187224), target1); + target1 = MulAdd(e1, MF4x4(-0.041567978, -0.31510913, 0.01618704, 0.04979329, 0.101294376, 0.16356954, 0.21361789, 0.20735294, 0.1900854, -0.4151726, -0.30471593, -0.59483325, 0.033624128, 0.11495109, -0.15194787, 0.4920959), target1); + target1 = MulAdd(f1, MF4x4(-0.18910064, -0.06516878, -0.20508374, -0.063928686, 0.7289614, 0.26674315, 0.2929481, 0.4026098, -0.033123735, -0.090371035, -0.029094126, -0.15197921, -0.08723726, -0.060160585, -0.07908409, -0.08826931), target1); + target1 = MulAdd(g1, MF4x4(-0.08321312, -0.09749648, -0.08783197, -0.23072585, 0.24343425, 0.10888949, 0.17419606, 0.04136083, 0.0066000987, -0.06112787, -0.12176007, -0.20907228, -0.0008522778, -0.054704696, -0.07197735, -0.0877179), target1); + target1 = MulAdd(h1, MF4x4(-0.40559706, -0.3801705, 0.05970925, -0.6157092, 0.28944594, 0.1252121, 0.403247, -0.122819394, -0.096336536, -0.2324694, 0.05980106, -0.19970767, -0.16646989, -0.10164633, -0.09282806, -0.08897996), target1); + target1 = MulAdd(i1, MF4x4(-0.14336498, -0.12967408, -0.016268672, -0.021431219, -0.0850116, 0.37105832, -0.04093888, 0.08540873, 0.035717323, -0.07282701, -0.009123291, -0.0036565473, -0.02508944, -0.087611906, 0.03604423, -0.00089080486), target1); + target1 = MulAdd(a2, MF4x4(0.1373875, 0.05283984, -0.11992707, 0.102294855, 0.3305128, 0.044920854, 0.31622922, -0.04711731, 0.001336024, 0.022799017, -0.062343203, 0.017140022, -0.07556853, -0.12864219, -0.25721326, -0.20741239), target1); + target1 = MulAdd(b2, MF4x4(0.22062224, 0.09266222, 0.22466063, 0.18527372, -0.06940306, 0.1317168, 0.019784274, -0.07422301, 0.04061616, 0.0022494853, 0.21723995, 0.24732308, 0.14088804, 0.0116154915, 0.102064446, 0.020701224), target1); + target1 = MulAdd(c2, MF4x4(-0.025154127, 0.045180723, -0.05877639, -0.099235624, 0.13630918, 0.24653725, -0.05723323, -0.022995364, -0.10826078, 0.049667366, 0.12618053, 0.1557369, 0.037487056, -0.22215757, 0.005912914, -0.20549043), target1); + target1 = MulAdd(d2, MF4x4(0.09641055, 0.098845296, -0.08192096, -0.03691394, -0.18450394, 0.29955688, -0.082493715, -0.06268039, -0.0754319, 0.21018648, -0.016580105, -0.1810546, 0.13857666, -0.0327626, 0.03161804, -0.32589525), target1); + target1 = MulAdd(e2, MF4x4(-0.18272439, -0.17595461, 0.047229152, 0.14596708, 0.40453747, 0.5658558, -0.17969102, 0.21557859, -0.34232348, 0.40355968, 0.53874254, 0.0012561383, 0.28154096, -0.06745097, -0.13049632, 0.42997465), target1); + target1 = MulAdd(f2, MF4x4(0.081179485, -0.0041369614, -0.12001932, -0.102107175, -0.050293338, 0.29165673, 0.08062538, 0.22925815, 0.19389379, 0.28463286, -0.057207666, 0.23133168, -0.07545728, 0.06729763, -0.103593476, 0.014468794), target1); + target1 = MulAdd(g2, MF4x4(0.069821335, -0.010299579, 0.069458775, 0.03894593, -0.054688405, 0.32758355, 0.13935772, 0.37506017, 0.24083133, -0.06105339, 0.25636867, 0.09627044, 0.08939188, 0.006728639, 0.10629504, 0.07887502), target1); + target1 = MulAdd(h2, MF4x4(0.10563019, 0.077379815, 0.045456886, 0.09303406, 0.11326298, 0.28762257, -0.35142374, 0.10285745, 0.28762287, 0.3592446, 0.23816557, 0.22676824, 0.030372012, -0.028023086, -0.30956736, -0.27588373), target1); + target1 = MulAdd(i2, MF4x4(0.110499, 0.009828844, 0.086689755, 0.1839749, 0.16656482, 0.083707325, 0.19506347, -0.01547141, 0.13804145, 0.2206598, -0.16484791, -0.0021595939, -0.06844408, -0.07861768, 0.040771082, -0.13347322), target1); + target1 = MulAdd(na1, MF4x4(0.02667995, 0.019265587, -0.18211095, -0.102116466, -0.042541366, -0.07700912, -0.020587347, -0.03532171, 0.14816427, -0.1672272, -0.17522137, -0.04657808, 0.013430233, -0.0021270285, 0.109880306, 0.004838907), target1); + target1 = MulAdd(nb1, MF4x4(0.14285165, -0.1364756, 0.017568532, -0.27690783, -0.015461915, 0.045437083, 0.018187419, 0.12473493, 0.17991658, -0.15642665, 0.10009151, -0.19040193, 0.1734127, -0.13817501, 0.0710856, -0.12921426), target1); + target1 = MulAdd(nc1, MF4x4(-0.14114712, -0.18893671, 0.16121174, 0.035988737, 0.17872387, -0.106395856, -0.23183517, 0.012380416, 0.043066982, -0.28539032, -0.049011275, -0.21125022, -0.11976977, -0.015564958, 0.18880925, -0.0034812456), target1); + target1 = MulAdd(nd1, MF4x4(-0.05894521, 0.17266215, -0.0458901, 0.08049924, 0.0156061025, -0.0047465423, 0.09714626, 0.045990974, -0.08786066, -0.37803304, -0.19629405, -0.08546443, 0.014874948, 0.16931784, 0.24799919, 0.06316819), target1); + target1 = MulAdd(ne1, MF4x4(-0.28352743, 0.29973608, -0.014540065, 0.2865005, 0.048086923, 0.18976144, 0.22969759, 0.1643124, -0.11259408, -0.107592925, 0.184308, 0.30998367, -1.0860825, -0.29118305, -0.51242536, -0.38492215), target1); + target1 = MulAdd(nf1, MF4x4(-0.17199941, -0.14274743, -0.14213641, -0.1691383, -0.17294803, -0.013992068, -0.12135059, 0.082377024, -0.11255549, -0.124990575, -0.32526177, -0.08199375, -0.25591666, 0.1882329, 0.07895415, 0.22012262), target1); + target1 = MulAdd(ng1, MF4x4(0.026025832, -0.07267515, 0.09738688, 0.074536435, -0.060470507, -0.037861936, 0.0507819, -0.054857653, 0.0043173633, -0.18107842, -0.02996759, 0.04072402, -0.012617744, 0.061665237, 0.0013981885, 0.08679919), target1); + target1 = MulAdd(nh1, MF4x4(0.27913737, 0.39656082, 0.1579819, 0.2774727, -0.007996453, 0.08704765, -0.016933938, 0.07066135, 0.12361742, -0.20802726, -0.13705719, -0.18794124, 0.037409827, -0.03351758, -0.2970392, -0.11001984), target1); + target1 = MulAdd(ni1, MF4x4(-0.027419567, 0.043236237, -0.19843115, -0.056489736, -0.017010912, 0.070949584, -0.14881176, -0.0780235, 0.0039477753, -0.16772608, -0.009547604, -0.14060417, 0.0103197545, 0.07129672, 0.034949142, 0.014112084), target1); + target1 = MulAdd(na2, MF4x4(-0.06467971, 0.084101565, 0.26296136, 0.08878442, -0.11232121, -0.054373942, -0.17263442, 0.046408508, 0.032239515, 0.042490713, 0.036938053, -0.034339923, -0.07139367, 0.032505415, 0.0045828503, 0.24428385), target1); + target1 = MulAdd(nb2, MF4x4(0.053585388, -0.08175568, -0.04787236, 0.06061965, -0.0740297, 0.11113596, -0.12467945, 0.08229154, -0.01941305, 0.12903687, 0.09095716, -0.13062255, -0.0102068605, 0.107291475, 0.030279635, 0.07464777), target1); + target1 = MulAdd(nc2, MF4x4(0.11041978, -0.0123585425, 0.11147018, 0.07380536, 0.06632908, 0.011784447, 0.029638765, -0.01566135, 0.009105331, 0.05252663, -0.17972581, 0.01210126, -0.10749957, -0.028144639, -0.105761215, 0.083784826), target1); + target1 = MulAdd(nd2, MF4x4(-0.058018316, 0.15083058, 0.2725673, 0.024263225, -0.067711554, 0.051117413, -0.31144425, -0.15761986, 0.017503206, -0.14361219, -0.38261738, -0.20354146, -0.04211545, 0.12921454, -0.01319619, 0.35809723), target1); + target1 = MulAdd(ne2, MF4x4(-0.107978396, 0.3230084, -0.13806303, 0.12903036, 0.039864987, -0.006241628, 0.18701774, -0.10785807, 0.30056882, -0.3092082, -0.4273322, 0.3784662, -0.026107281, 0.23165871, 0.35258314, -0.06654702), target1); + target1 = MulAdd(nf2, MF4x4(-0.15840323, 0.15210885, 0.04086692, 0.19169305, 0.11847602, 0.0009038581, 0.095951624, 0.043941673, 0.1512248, 0.0749449, -0.027045414, -0.19729601, 0.08265063, -0.045218006, -0.10732461, 0.05197371), target1); + target1 = MulAdd(ng2, MF4x4(0.13637526, 0.28841978, 0.10298119, -0.005948496, 0.020897362, -0.02186902, -0.16207378, -0.021084815, 0.029192554, 0.07076991, -0.07210881, -0.06752328, 0.0006557475, 0.08986717, -0.29430988, 0.21411087), target1); + target1 = MulAdd(nh2, MF4x4(0.18667863, 0.3117322, -0.0859705, -0.038189936, 0.10214859, -0.11244034, 0.2680223, -0.072901204, -0.07434324, -0.17855306, 0.23134363, -0.055360887, -0.020968167, 0.0858459, 0.078975916, 0.13254759), target1); + target1 = MulAdd(ni2, MF4x4(-0.15676941, 0.03476677, -0.09922334, -0.15847856, -0.0033982224, 0.020932984, 0.12874377, 0.048792202, 0.06521213, 0.12456798, 0.15958112, 0.15981804, 0.07657683, 0.1759313, 0.012727211, 0.120304115), target1); - float4 target2 = mul(a1, float4x4(0.003206617, 0.04896987, 0.049652386, 0.10869342, 0.36313584, -0.070666805, 0.93581825, -0.52484274, -0.14278883, 0.064016834, -0.05534331, 0.02961736, -0.1319316, 0.05740655, 0.2405951, -0.12313382)); - target2 += mul(b1, float4x4(0.014092832, 0.07058761, -0.07887866, -0.27478936, -0.31456405, -0.31036922, -0.18380909, -0.11277979, -0.034889866, -0.37914017, -0.056245584, 0.24008954, -0.03414483, -0.023189066, -0.010568316, -0.004604883)); - target2 += mul(c1, float4x4(0.15443979, -0.050161768, -0.012300917, -0.08834887, 0.082193285, 0.06878423, 0.1478042, -0.3774468, -0.18659878, 0.14238152, 0.033605397, 0.13560006, -0.032682173, -0.024561955, 0.05656941, -0.034246165)); - target2 += mul(d1, float4x4(0.04691462, 0.064624496, -0.15950382, 0.16081297, -0.1417951, -0.109690994, -0.021205869, 0.19361454, -0.006306647, 0.3401972, -0.00014070333, 0.11619607, -0.13437814, 0.05464789, 0.37712076, -0.12470751)); - target2 += mul(e1, float4x4(-0.40016884, 0.010666597, -0.005395378, 0.51084363, -0.009875391, 0.3969395, 0.47768033, -0.3392299, -0.1509509, -0.057620626, -0.1834601, -0.09998148, 0.10095897, -0.2213528, 0.02546703, -0.28506726)); - target2 += mul(f1, float4x4(0.26652217, -0.106772706, -0.12609608, -0.0949661, -0.10869194, -0.55331933, -0.011515521, -0.27978876, -0.2124893, 0.03954004, 0.1691768, 0.05590268, 0.1539662, 0.10703386, -0.027286088, 0.2168544)); - target2 += mul(g1, float4x4(-0.04862511, 0.06919758, -0.12962708, 0.016036907, -0.030030789, -0.20159967, 0.0013158675, -0.07799172, -0.032236706, -0.0035921712, -0.085437834, -0.025374755, -0.06251374, -0.009269627, -0.07519051, -0.01884611)); - target2 += mul(h1, float4x4(0.23940067, -0.19496065, -0.05494683, 0.11601073, -0.074225076, 0.24976431, 0.41665986, 0.12029472, 0.16815041, -0.115868434, 0.06333614, 0.032145746, 0.15990137, -0.14886795, 0.034102913, -0.07727595)); - target2 += mul(i1, float4x4(0.14702639, -0.013711502, 0.011437429, -0.11201445, -0.2582659, 0.34539905, 0.058082145, -0.18346462, 0.0027891365, 0.072565466, 0.12716974, 0.050636146, 0.092657596, 0.08541754, -0.1266164, 0.027881607)); - target2 += mul(a2, float4x4(0.043362036, 0.020758621, 0.09906072, -0.22401148, -0.19104514, -0.25774476, 0.074128486, 0.08558291, -0.075419895, 0.20380639, 0.06398196, 0.015925938, 0.089786015, -0.100721814, -0.1374862, 0.26110905)); - target2 += mul(b2, float4x4(-0.12547149, 0.08151811, -0.15953775, -0.33995447, -0.50784314, 0.46155545, 0.24986996, 0.03404644, -0.047789436, -0.12438347, -0.14143273, -0.17951359, -0.08057819, 0.023863006, -0.008539273, -0.06775414)); - target2 += mul(c2, float4x4(0.1430169, 0.056971863, -0.021576611, -0.045342956, -0.22356391, -0.15344621, -0.0467977, -0.22970036, -0.0125351725, 0.16957329, -0.0069183917, -0.013949834, -0.048609708, 0.05261722, 0.023262242, 0.2123519)); - target2 += mul(d2, float4x4(-0.019523792, 0.008228363, -0.04616012, -0.14341992, -0.19307113, 0.005937241, 0.24048887, -0.04279845, 0.022574252, 0.15558265, -0.035000063, 0.18318397, -0.05392528, -0.26044658, -0.13493988, 0.056433514)); - target2 += mul(e2, float4x4(-0.28926027, -0.17381874, 0.07685766, -0.0061521684, -0.47455552, -0.49213487, 0.36924496, 0.29042044, 0.201094, -0.14280887, -0.4531411, -0.52902204, -0.28123, 0.1401882, 0.32054895, -0.11357518)); - target2 += mul(f2, float4x4(0.14173324, -0.12069898, -0.07242415, 0.105665006, 0.017373435, -0.056042343, 0.07270201, 0.022111928, -0.01106541, 0.01666006, 0.013564169, -0.36628693, -0.25450787, -0.28179473, -0.04721874, -0.21912882)); - target2 += mul(g2, float4x4(-0.09464695, -0.027919646, 0.13088459, 0.17504032, -0.101641014, 0.29687008, 0.08832321, 0.020538324, -0.15108941, -0.21930224, -0.026915176, -0.07078217, 0.10723033, 0.034364715, 0.18183397, -0.119012214)); - target2 += mul(h2, float4x4(-0.21713468, -0.0846604, 0.046551514, -0.14989382, 0.08672032, -0.07933831, 0.08093595, -0.064147756, -0.15980323, 0.50000644, -0.091568656, 0.03201994, -0.1848647, -0.0646309, 0.03288009, 0.046442386)); - target2 += mul(i2, float4x4(0.053532355, -0.054523747, -0.040242642, -0.31438905, 0.06452703, -0.18785381, -0.14987698, -0.067642935, -0.19892459, -0.057256676, 0.05943023, -0.17331842, 0.02588534, 0.13134238, -0.07121775, 0.23446162)); - target2 += mul(na1, float4x4(0.20633182, 0.01686198, 0.17934167, -0.02063493, 0.042606052, -0.05289458, 0.031508356, 0.00082803797, 0.0756423, -0.047548845, 0.01456339, 0.15910533, -0.20119642, 0.029213727, 0.111036316, -0.047010012)); - target2 += mul(nb1, float4x4(0.09258436, -0.27904224, -0.086695746, 0.33095327, -0.20126075, -0.050745636, -0.048944805, -0.10536587, -0.012995092, 0.07926994, 0.15071853, -0.13644052, -0.05188447, -0.06750699, -0.14227037, 0.028751127)); - target2 += mul(nc1, float4x4(-0.18562223, 0.10250865, -0.17573993, 0.20434102, -0.05187468, -0.06441594, -0.052127104, -0.01925564, 0.02927959, -0.12711872, 0.059629507, 0.15696885, -0.010168965, 0.09971862, -0.03177664, -0.022744441)); - target2 += mul(nd1, float4x4(0.21474063, -0.15679085, 0.09609374, 0.109079376, -0.049934637, -0.07393633, 0.16688468, -0.018888129, 0.04240162, -0.31895876, -0.106516436, 0.20008606, -0.054410245, 0.028970616, -0.18008347, -0.013362003)); - target2 += mul(ne1, float4x4(0.37891293, 0.042730846, -0.24735828, -0.5234527, -0.3681344, -0.06609157, -0.14993733, -0.020316398, 0.123008475, 0.29632482, 0.32149333, 0.35999274, -0.18967044, 0.46154186, -0.016041815, 0.097378336)); - target2 += mul(nf1, float4x4(-0.14873263, 0.07600569, -0.051758345, 0.1803135, -0.23121934, 0.13574593, 0.043973465, -0.13992754, -0.061972607, -0.124083005, -0.049196843, -0.07700431, 0.21572952, -0.25241727, 0.1218322, -0.07773728)); - target2 += mul(ng1, float4x4(0.040287063, 0.024240922, 0.021917762, -0.050616946, -0.023174169, 0.05977014, 0.018892275, 0.04014965, 0.11715485, 0.062129, 0.024620812, 0.013617107, 0.075699426, 0.1858111, -0.11769179, -0.08085602)); - target2 += mul(nh1, float4x4(-0.3194255, 0.08695645, -0.09453595, 0.2564516, 0.02192303, 0.08167247, -0.06257352, 0.043801844, 0.04392246, 0.2020571, 0.045180902, 0.18857521, 0.1835961, -0.043788187, -0.08768916, -0.14755538)); - target2 += mul(ni1, float4x4(-0.22074097, 0.13768476, -0.16183749, 0.059949517, -0.011375954, 0.08581876, 0.004800447, 0.019403988, 0.014646056, 0.07363176, -0.058036458, 0.0706421, 0.08082624, 0.17740329, -0.05484784, 0.050796065)); - target2 += mul(na2, float4x4(-0.032330472, -0.067666024, 0.18980837, -0.19077848, 0.1111905, 0.03855666, -0.11272314, -0.00577739, 0.17697452, -0.053044144, -0.07510145, 0.061853852, -0.024240626, 0.14846492, 0.14804313, -0.20275854)); - target2 += mul(nb2, float4x4(0.17133904, -0.16356844, 0.1978664, 0.13877816, 0.28208038, 0.031539194, 0.11313891, -0.0014802719, 0.0033749861, 0.046372313, 0.054808807, -0.0024151779, 0.0068782056, -0.16414621, -0.07545907, -0.2521294)); - target2 += mul(nc2, float4x4(-0.1746992, -0.037628956, -0.0044012754, -0.004390821, 0.0050341445, -0.112742625, 0.051241755, 0.01984483, 0.0003531837, 0.043500375, 0.030881992, 0.003503799, 0.13611782, -0.02509031, -0.007503557, -0.009321301)); - target2 += mul(nd2, float4x4(0.087250136, 0.12374122, 0.2959519, 0.11314702, 0.22080182, 0.106726184, -0.29768205, 0.14931595, 0.23356548, -0.008709153, -0.0797829, 0.046940215, -0.07027616, 0.20533602, 0.0723021, -0.1963585)); - target2 += mul(ne2, float4x4(0.00609982, 0.35277408, -0.22781096, -0.28912535, 0.42393112, -0.07654207, 0.12636793, 0.049337976, -0.0967726, -0.19349189, 0.36800626, 0.09745645, 0.47663373, 0.03876107, -0.042987954, 0.016161885)); - target2 += mul(nf2, float4x4(-0.047490966, -0.05823166, 0.036158644, 0.025337253, -0.046618905, 0.108276576, -0.024148034, 0.0026794411, 0.1497962, -0.09328474, -0.03160641, 0.24351281, -0.05198027, 0.030720685, 0.00014528916, -0.2224931)); - target2 += mul(ng2, float4x4(-0.007338369, 0.18710312, 0.14617369, -0.0070655346, 0.10464997, -0.029674934, -0.11842202, -0.09114357, 0.08524458, -0.08082762, 0.06479597, -0.023760766, 0.07523641, 0.0067315935, 0.101266846, -0.2780903)); - target2 += mul(nh2, float4x4(0.14181875, -0.19523518, 0.1068169, -0.10284853, 0.11634046, -0.117397435, 0.09113022, 0.009371062, -0.022120507, -0.1127032, 0.092574745, -0.021989716, -0.088107705, -0.13541982, 0.08130504, -0.0678927)); - target2 += mul(ni2, float4x4(0.09948295, 0.23699793, -0.042369924, 0.16744529, -0.10045506, -0.045623623, 0.04871897, -0.0023967526, 0.02602692, -0.089873284, -0.050681606, -0.09332558, -0.09596149, -0.06988313, 0.0007193808, -0.11936899)); - target2 += float4(-0.04928105, -0.003357327, -0.03886671, 0.076106146); + MF4 target2 = { -0.04928105, -0.003357327, -0.03886671, 0.076106146 }; + target2 = MulAdd(a1, MF4x4(0.003206617, 0.04896987, 0.049652386, 0.10869342, 0.36313584, -0.070666805, 0.93581825, -0.52484274, -0.14278883, 0.064016834, -0.05534331, 0.02961736, -0.1319316, 0.05740655, 0.2405951, -0.12313382), target2); + target2 = MulAdd(b1, MF4x4(0.014092832, 0.07058761, -0.07887866, -0.27478936, -0.31456405, -0.31036922, -0.18380909, -0.11277979, -0.034889866, -0.37914017, -0.056245584, 0.24008954, -0.03414483, -0.023189066, -0.010568316, -0.004604883), target2); + target2 = MulAdd(c1, MF4x4(0.15443979, -0.050161768, -0.012300917, -0.08834887, 0.082193285, 0.06878423, 0.1478042, -0.3774468, -0.18659878, 0.14238152, 0.033605397, 0.13560006, -0.032682173, -0.024561955, 0.05656941, -0.034246165), target2); + target2 = MulAdd(d1, MF4x4(0.04691462, 0.064624496, -0.15950382, 0.16081297, -0.1417951, -0.109690994, -0.021205869, 0.19361454, -0.006306647, 0.3401972, -0.00014070333, 0.11619607, -0.13437814, 0.05464789, 0.37712076, -0.12470751), target2); + target2 = MulAdd(e1, MF4x4(-0.40016884, 0.010666597, -0.005395378, 0.51084363, -0.009875391, 0.3969395, 0.47768033, -0.3392299, -0.1509509, -0.057620626, -0.1834601, -0.09998148, 0.10095897, -0.2213528, 0.02546703, -0.28506726), target2); + target2 = MulAdd(f1, MF4x4(0.26652217, -0.106772706, -0.12609608, -0.0949661, -0.10869194, -0.55331933, -0.011515521, -0.27978876, -0.2124893, 0.03954004, 0.1691768, 0.05590268, 0.1539662, 0.10703386, -0.027286088, 0.2168544), target2); + target2 = MulAdd(g1, MF4x4(-0.04862511, 0.06919758, -0.12962708, 0.016036907, -0.030030789, -0.20159967, 0.0013158675, -0.07799172, -0.032236706, -0.0035921712, -0.085437834, -0.025374755, -0.06251374, -0.009269627, -0.07519051, -0.01884611), target2); + target2 = MulAdd(h1, MF4x4(0.23940067, -0.19496065, -0.05494683, 0.11601073, -0.074225076, 0.24976431, 0.41665986, 0.12029472, 0.16815041, -0.115868434, 0.06333614, 0.032145746, 0.15990137, -0.14886795, 0.034102913, -0.07727595), target2); + target2 = MulAdd(i1, MF4x4(0.14702639, -0.013711502, 0.011437429, -0.11201445, -0.2582659, 0.34539905, 0.058082145, -0.18346462, 0.0027891365, 0.072565466, 0.12716974, 0.050636146, 0.092657596, 0.08541754, -0.1266164, 0.027881607), target2); + target2 = MulAdd(a2, MF4x4(0.043362036, 0.020758621, 0.09906072, -0.22401148, -0.19104514, -0.25774476, 0.074128486, 0.08558291, -0.075419895, 0.20380639, 0.06398196, 0.015925938, 0.089786015, -0.100721814, -0.1374862, 0.26110905), target2); + target2 = MulAdd(b2, MF4x4(-0.12547149, 0.08151811, -0.15953775, -0.33995447, -0.50784314, 0.46155545, 0.24986996, 0.03404644, -0.047789436, -0.12438347, -0.14143273, -0.17951359, -0.08057819, 0.023863006, -0.008539273, -0.06775414), target2); + target2 = MulAdd(c2, MF4x4(0.1430169, 0.056971863, -0.021576611, -0.045342956, -0.22356391, -0.15344621, -0.0467977, -0.22970036, -0.0125351725, 0.16957329, -0.0069183917, -0.013949834, -0.048609708, 0.05261722, 0.023262242, 0.2123519), target2); + target2 = MulAdd(d2, MF4x4(-0.019523792, 0.008228363, -0.04616012, -0.14341992, -0.19307113, 0.005937241, 0.24048887, -0.04279845, 0.022574252, 0.15558265, -0.035000063, 0.18318397, -0.05392528, -0.26044658, -0.13493988, 0.056433514), target2); + target2 = MulAdd(e2, MF4x4(-0.28926027, -0.17381874, 0.07685766, -0.0061521684, -0.47455552, -0.49213487, 0.36924496, 0.29042044, 0.201094, -0.14280887, -0.4531411, -0.52902204, -0.28123, 0.1401882, 0.32054895, -0.11357518), target2); + target2 = MulAdd(f2, MF4x4(0.14173324, -0.12069898, -0.07242415, 0.105665006, 0.017373435, -0.056042343, 0.07270201, 0.022111928, -0.01106541, 0.01666006, 0.013564169, -0.36628693, -0.25450787, -0.28179473, -0.04721874, -0.21912882), target2); + target2 = MulAdd(g2, MF4x4(-0.09464695, -0.027919646, 0.13088459, 0.17504032, -0.101641014, 0.29687008, 0.08832321, 0.020538324, -0.15108941, -0.21930224, -0.026915176, -0.07078217, 0.10723033, 0.034364715, 0.18183397, -0.119012214), target2); + target2 = MulAdd(h2, MF4x4(-0.21713468, -0.0846604, 0.046551514, -0.14989382, 0.08672032, -0.07933831, 0.08093595, -0.064147756, -0.15980323, 0.50000644, -0.091568656, 0.03201994, -0.1848647, -0.0646309, 0.03288009, 0.046442386), target2); + target2 = MulAdd(i2, MF4x4(0.053532355, -0.054523747, -0.040242642, -0.31438905, 0.06452703, -0.18785381, -0.14987698, -0.067642935, -0.19892459, -0.057256676, 0.05943023, -0.17331842, 0.02588534, 0.13134238, -0.07121775, 0.23446162), target2); + target2 = MulAdd(na1, MF4x4(0.20633182, 0.01686198, 0.17934167, -0.02063493, 0.042606052, -0.05289458, 0.031508356, 0.00082803797, 0.0756423, -0.047548845, 0.01456339, 0.15910533, -0.20119642, 0.029213727, 0.111036316, -0.047010012), target2); + target2 = MulAdd(nb1, MF4x4(0.09258436, -0.27904224, -0.086695746, 0.33095327, -0.20126075, -0.050745636, -0.048944805, -0.10536587, -0.012995092, 0.07926994, 0.15071853, -0.13644052, -0.05188447, -0.06750699, -0.14227037, 0.028751127), target2); + target2 = MulAdd(nc1, MF4x4(-0.18562223, 0.10250865, -0.17573993, 0.20434102, -0.05187468, -0.06441594, -0.052127104, -0.01925564, 0.02927959, -0.12711872, 0.059629507, 0.15696885, -0.010168965, 0.09971862, -0.03177664, -0.022744441), target2); + target2 = MulAdd(nd1, MF4x4(0.21474063, -0.15679085, 0.09609374, 0.109079376, -0.049934637, -0.07393633, 0.16688468, -0.018888129, 0.04240162, -0.31895876, -0.106516436, 0.20008606, -0.054410245, 0.028970616, -0.18008347, -0.013362003), target2); + target2 = MulAdd(ne1, MF4x4(0.37891293, 0.042730846, -0.24735828, -0.5234527, -0.3681344, -0.06609157, -0.14993733, -0.020316398, 0.123008475, 0.29632482, 0.32149333, 0.35999274, -0.18967044, 0.46154186, -0.016041815, 0.097378336), target2); + target2 = MulAdd(nf1, MF4x4(-0.14873263, 0.07600569, -0.051758345, 0.1803135, -0.23121934, 0.13574593, 0.043973465, -0.13992754, -0.061972607, -0.124083005, -0.049196843, -0.07700431, 0.21572952, -0.25241727, 0.1218322, -0.07773728), target2); + target2 = MulAdd(ng1, MF4x4(0.040287063, 0.024240922, 0.021917762, -0.050616946, -0.023174169, 0.05977014, 0.018892275, 0.04014965, 0.11715485, 0.062129, 0.024620812, 0.013617107, 0.075699426, 0.1858111, -0.11769179, -0.08085602), target2); + target2 = MulAdd(nh1, MF4x4(-0.3194255, 0.08695645, -0.09453595, 0.2564516, 0.02192303, 0.08167247, -0.06257352, 0.043801844, 0.04392246, 0.2020571, 0.045180902, 0.18857521, 0.1835961, -0.043788187, -0.08768916, -0.14755538), target2); + target2 = MulAdd(ni1, MF4x4(-0.22074097, 0.13768476, -0.16183749, 0.059949517, -0.011375954, 0.08581876, 0.004800447, 0.019403988, 0.014646056, 0.07363176, -0.058036458, 0.0706421, 0.08082624, 0.17740329, -0.05484784, 0.050796065), target2); + target2 = MulAdd(na2, MF4x4(-0.032330472, -0.067666024, 0.18980837, -0.19077848, 0.1111905, 0.03855666, -0.11272314, -0.00577739, 0.17697452, -0.053044144, -0.07510145, 0.061853852, -0.024240626, 0.14846492, 0.14804313, -0.20275854), target2); + target2 = MulAdd(nb2, MF4x4(0.17133904, -0.16356844, 0.1978664, 0.13877816, 0.28208038, 0.031539194, 0.11313891, -0.0014802719, 0.0033749861, 0.046372313, 0.054808807, -0.0024151779, 0.0068782056, -0.16414621, -0.07545907, -0.2521294), target2); + target2 = MulAdd(nc2, MF4x4(-0.1746992, -0.037628956, -0.0044012754, -0.004390821, 0.0050341445, -0.112742625, 0.051241755, 0.01984483, 0.0003531837, 0.043500375, 0.030881992, 0.003503799, 0.13611782, -0.02509031, -0.007503557, -0.009321301), target2); + target2 = MulAdd(nd2, MF4x4(0.087250136, 0.12374122, 0.2959519, 0.11314702, 0.22080182, 0.106726184, -0.29768205, 0.14931595, 0.23356548, -0.008709153, -0.0797829, 0.046940215, -0.07027616, 0.20533602, 0.0723021, -0.1963585), target2); + target2 = MulAdd(ne2, MF4x4(0.00609982, 0.35277408, -0.22781096, -0.28912535, 0.42393112, -0.07654207, 0.12636793, 0.049337976, -0.0967726, -0.19349189, 0.36800626, 0.09745645, 0.47663373, 0.03876107, -0.042987954, 0.016161885), target2); + target2 = MulAdd(nf2, MF4x4(-0.047490966, -0.05823166, 0.036158644, 0.025337253, -0.046618905, 0.108276576, -0.024148034, 0.0026794411, 0.1497962, -0.09328474, -0.03160641, 0.24351281, -0.05198027, 0.030720685, 0.00014528916, -0.2224931), target2); + target2 = MulAdd(ng2, MF4x4(-0.007338369, 0.18710312, 0.14617369, -0.0070655346, 0.10464997, -0.029674934, -0.11842202, -0.09114357, 0.08524458, -0.08082762, 0.06479597, -0.023760766, 0.07523641, 0.0067315935, 0.101266846, -0.2780903), target2); + target2 = MulAdd(nh2, MF4x4(0.14181875, -0.19523518, 0.1068169, -0.10284853, 0.11634046, -0.117397435, 0.09113022, 0.009371062, -0.022120507, -0.1127032, 0.092574745, -0.021989716, -0.088107705, -0.13541982, 0.08130504, -0.0678927), target2); + target2 = MulAdd(ni2, MF4x4(0.09948295, 0.23699793, -0.042369924, 0.16744529, -0.10045506, -0.045623623, 0.04871897, -0.0023967526, 0.02602692, -0.089873284, -0.050681606, -0.09332558, -0.09596149, -0.06988313, 0.0007193808, -0.11936899), target2); conv2d_4_tf[gxy] = target1; conv2d_4_tf1[gxy] = target2; @@ -844,25 +847,25 @@ void Pass6(uint2 blockStart, uint3 threadId) { // [ a, d, g ] // [ b, e, h ] // [ c, f, i ] - float4 a1 = conv2d_4_tf.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0); - float4 b1 = conv2d_4_tf.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); - float4 c1 = conv2d_4_tf.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); - float4 d1 = conv2d_4_tf.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); - float4 e1 = conv2d_4_tf.SampleLevel(sam, pos, 0); - float4 f1 = conv2d_4_tf.SampleLevel(sam, pos + float2(0, inputPt.y), 0); - float4 g1 = conv2d_4_tf.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); - float4 h1 = conv2d_4_tf.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); - float4 i1 = conv2d_4_tf.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0); - - float4 na1 = max(-a1, 0); - float4 nb1 = max(-b1, 0); - float4 nc1 = max(-c1, 0); - float4 nd1 = max(-d1, 0); - float4 ne1 = max(-e1, 0); - float4 nf1 = max(-f1, 0); - float4 ng1 = max(-g1, 0); - float4 nh1 = max(-h1, 0); - float4 ni1 = max(-i1, 0); + MF4 a1 = conv2d_4_tf.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0); + MF4 b1 = conv2d_4_tf.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); + MF4 c1 = conv2d_4_tf.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); + MF4 d1 = conv2d_4_tf.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); + MF4 e1 = conv2d_4_tf.SampleLevel(sam, pos, 0); + MF4 f1 = conv2d_4_tf.SampleLevel(sam, pos + float2(0, inputPt.y), 0); + MF4 g1 = conv2d_4_tf.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); + MF4 h1 = conv2d_4_tf.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); + MF4 i1 = conv2d_4_tf.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0); + + MF4 na1 = max(-a1, 0); + MF4 nb1 = max(-b1, 0); + MF4 nc1 = max(-c1, 0); + MF4 nd1 = max(-d1, 0); + MF4 ne1 = max(-e1, 0); + MF4 nf1 = max(-f1, 0); + MF4 ng1 = max(-g1, 0); + MF4 nh1 = max(-h1, 0); + MF4 ni1 = max(-i1, 0); a1 = max(a1, 0); b1 = max(b1, 0); @@ -874,25 +877,25 @@ void Pass6(uint2 blockStart, uint3 threadId) { h1 = max(h1, 0); i1 = max(i1, 0); - float4 a2 = conv2d_4_tf1.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0); - float4 b2 = conv2d_4_tf1.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); - float4 c2 = conv2d_4_tf1.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); - float4 d2 = conv2d_4_tf1.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); - float4 e2 = conv2d_4_tf1.SampleLevel(sam, pos, 0); - float4 f2 = conv2d_4_tf1.SampleLevel(sam, pos + float2(0, inputPt.y), 0); - float4 g2 = conv2d_4_tf1.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); - float4 h2 = conv2d_4_tf1.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); - float4 i2 = conv2d_4_tf1.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0); - - float4 na2 = max(-a2, 0); - float4 nb2 = max(-b2, 0); - float4 nc2 = max(-c2, 0); - float4 nd2 = max(-d2, 0); - float4 ne2 = max(-e2, 0); - float4 nf2 = max(-f2, 0); - float4 ng2 = max(-g2, 0); - float4 nh2 = max(-h2, 0); - float4 ni2 = max(-i2, 0); + MF4 a2 = conv2d_4_tf1.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0); + MF4 b2 = conv2d_4_tf1.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); + MF4 c2 = conv2d_4_tf1.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); + MF4 d2 = conv2d_4_tf1.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); + MF4 e2 = conv2d_4_tf1.SampleLevel(sam, pos, 0); + MF4 f2 = conv2d_4_tf1.SampleLevel(sam, pos + float2(0, inputPt.y), 0); + MF4 g2 = conv2d_4_tf1.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); + MF4 h2 = conv2d_4_tf1.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); + MF4 i2 = conv2d_4_tf1.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0); + + MF4 na2 = max(-a2, 0); + MF4 nb2 = max(-b2, 0); + MF4 nc2 = max(-c2, 0); + MF4 nd2 = max(-d2, 0); + MF4 ne2 = max(-e2, 0); + MF4 nf2 = max(-f2, 0); + MF4 ng2 = max(-g2, 0); + MF4 nh2 = max(-h2, 0); + MF4 ni2 = max(-i2, 0); a2 = max(a2, 0); b2 = max(b2, 0); @@ -904,81 +907,81 @@ void Pass6(uint2 blockStart, uint3 threadId) { h2 = max(h2, 0); i2 = max(i2, 0); - float4 target1 = mul(a1, float4x4(-0.13425097, -0.23487093, 0.2480183, -0.2806276, -0.041303713, 0.100773126, -0.110890545, 0.036205858, -0.331331, -0.12929262, 0.16300063, 0.3776673, -0.20316373, -0.011239426, 0.10650887, -0.027857736)); - target1 += mul(b1, float4x4(0.09517376, -0.3004956, 0.05033304, -0.07464521, 0.009204248, -0.23034886, 0.30492118, -0.1215848, 0.15728685, -0.10430078, 0.04038878, 0.08034804, 0.04320418, -0.2929594, -0.018968396, 0.02542387)); - target1 += mul(c1, float4x4(-0.10651935, -0.2736715, 0.19267319, -0.033337504, -0.06697293, 0.028424729, 0.047814637, 0.44929537, 0.02565344, -0.253426, -0.040931404, -0.05018104, 0.032979824, -0.035349697, -0.039578713, -0.3116414)); - target1 += mul(d1, float4x4(0.09176126, 0.031713437, 0.24861507, 0.31351718, 0.36284143, 0.3622709, 0.16165464, 0.07319267, -0.6303202, -0.21209712, -0.02169929, 0.037275597, -0.1295319, 0.033090707, -0.029330662, 0.054679472)); - target1 += mul(e1, float4x4(0.15021572, -0.15177831, 0.1318225, 0.46864823, 0.059443284, 0.07404233, 0.22612074, 0.21105285, 0.319694, 0.09397257, 0.14277866, -0.0235649, -0.037205156, -0.40715128, -0.18572816, 0.058741573)); - target1 += mul(f1, float4x4(-0.122751735, -0.20926422, 0.2099333, -0.11627138, 0.04171681, 0.0669586, -0.03831368, 0.27334675, 0.0492008, 0.12854317, 0.03308622, 0.45236585, 0.03122829, 0.13853219, 0.05084764, -0.3965012)); - target1 += mul(g1, float4x4(-0.0019293908, -0.15562099, 0.12418126, 0.0045440597, 0.05442391, -0.15613738, 0.14828286, -0.17687118, -0.053517755, -0.33350968, -0.062314924, -0.31358472, -0.09670371, 0.043190923, 0.008150662, 0.09928506)); - target1 += mul(h1, float4x4(-0.06698031, -0.099411525, 0.24259582, -0.1073659, 0.06762824, 0.059605874, -0.20944163, -0.1598055, 0.32746908, -0.17759447, 0.2859796, -0.1274256, 0.30796206, -0.00791448, 0.114059694, 0.14775705)); - target1 += mul(i1, float4x4(0.16291203, -0.14958477, 0.14716864, 0.2056065, -0.019337546, 0.032286238, 0.0030445335, -0.08208513, -0.14208078, 0.13601872, -0.23367858, -0.19092909, -0.20207883, -0.016950991, 0.009309007, 0.1376546)); - target1 += mul(a2, float4x4(-0.11093091, -0.32362202, -0.041845415, 0.029758021, -0.07261404, -0.048653398, 0.19167832, 0.09343212, 0.030472826, -0.15078579, -0.0056376588, 0.0045257527, -0.24521805, -0.10473077, 0.11163019, -0.1724187)); - target1 += mul(b2, float4x4(-0.08601668, 0.16612485, -0.07751539, 0.07261594, -0.19028407, 0.23896623, -0.10416726, 0.23500614, 0.1955228, 0.08699591, -0.049277775, 0.13447775, 0.19434914, -0.11481196, 0.088043146, 0.13352895)); - target1 += mul(c2, float4x4(-0.013221233, 0.07521129, 0.042819552, -0.11163175, 0.066080205, -0.25043094, -0.010348969, -0.013258202, 0.09444396, 0.29623637, 0.025016114, 0.050744686, -0.12219596, -0.0735393, -0.024817836, -0.06897588)); - target1 += mul(d2, float4x4(-0.25720942, 0.19861753, -0.18535058, 0.12190362, -0.33756095, -0.0038898317, 0.09739055, 0.41227046, -0.10030796, 0.025445882, -0.23542109, 0.08677691, 0.08140194, -0.22716106, 0.14016968, -0.0927231)); - target1 += mul(e2, float4x4(0.58745646, -0.12533307, 0.30129984, 0.08898194, -0.07972344, -0.37581098, 0.06863413, -0.13185541, 0.21801205, 0.31779078, -0.3804784, -0.3200699, 0.14534226, 0.05912262, 0.07938948, -0.34869507)); - target1 += mul(f2, float4x4(0.024675166, -0.067802526, 0.030065436, 0.06509131, 0.14367498, 0.022554757, 0.014991865, -0.029914752, 0.5123549, -0.012557206, -0.13014166, -0.34184244, -0.09080884, 0.13782553, -0.018931886, -0.35642785)); - target1 += mul(g2, float4x4(-0.37336427, -0.02705006, 0.14392053, 0.024049882, -0.024705589, 0.14556128, -0.12120506, -0.06275598, -0.1284325, 0.11409197, -0.08397436, -0.075944416, 0.056465942, 0.04016099, 0.096723564, -0.08359723)); - target1 += mul(h2, float4x4(0.20243345, -0.09287934, -0.11676041, 0.005206654, -0.2879361, 0.41677123, -0.16924824, 0.22429213, 0.082279116, -0.1780833, 0.20209241, 0.12970525, -0.030272234, -0.19200714, 0.0015769673, -0.1389732)); - target1 += mul(i2, float4x4(0.04211243, 0.07331798, -0.055724114, 0.04086206, -0.04635456, 0.027212424, 0.021861525, 0.12424812, 0.43009162, 0.021664696, 0.20828371, 0.11859106, 0.07390811, -0.1861182, 0.034559406, 0.18561925)); - target1 += mul(na1, float4x4(0.22596797, 0.025346763, -0.056839246, 0.09137385, 0.07363095, -0.12382036, 0.08911783, -0.012355983, -0.07869761, 0.051298574, 0.00816572, -0.044984274, 0.07962154, -0.2254524, -0.007821531, -0.04936664)); - target1 += mul(nb1, float4x4(0.06265961, -0.17783198, 0.11678783, -0.12965304, 0.014506855, -0.17513473, -0.23593299, 0.14054537, 0.1580306, 0.31872272, -0.0042505316, -0.070422255, -0.01316396, 0.0058355615, 0.062464185, -0.06086727)); - target1 += mul(nc1, float4x4(-0.079526044, 0.23932967, -0.1139716, 0.15888569, 0.06526993, -0.06958436, -0.04070066, -0.12081254, 0.026716579, 0.014887845, 0.0061467467, 0.127956, 0.040913627, -0.0032820841, 0.086145625, 0.22520025)); - target1 += mul(nd1, float4x4(0.25577608, 0.02553098, -0.14822578, -0.11907723, -0.09787419, -0.03544863, -0.08098151, -0.01305555, 0.20404844, 0.11294246, 0.10096346, 0.15795162, 0.2554626, 0.09361069, 0.001985862, -0.0051444587)); - target1 += mul(ne1, float4x4(-0.24454486, -0.014714279, -0.2954907, -0.39995646, -0.15907967, 0.30107877, -0.34781745, 0.095281735, -0.12492393, -0.28375402, -0.16872306, 0.2531788, -0.52085644, 0.35986066, 0.07716912, 0.09565738)); - target1 += mul(nf1, float4x4(0.2493129, 0.06395661, -0.09491958, 0.19702488, 0.109871864, -0.051376317, 0.15404263, -0.21282886, 0.1188967, 0.07824094, -0.016752928, -0.14027214, 0.10949832, -0.27629098, 0.081909016, 0.1354018)); - target1 += mul(ng1, float4x4(0.18950915, -0.034574565, -0.10378051, -0.15800652, -0.06835184, -0.06987467, 0.035007782, 0.04686656, 0.054061133, 0.014833506, -0.0035361175, 0.016156103, 0.120767444, -0.10196722, 0.10668838, -0.09058739)); - target1 += mul(nh1, float4x4(-0.032248627, 0.056413256, 0.042716432, 0.06681831, 0.047605485, -0.07629479, 0.14311917, -0.06909803, 0.10640394, 0.10701861, -0.0051839007, -0.15133362, -0.32146424, -0.039978918, -0.12280021, 0.0048507582)); - target1 += mul(ni1, float4x4(-0.1954503, -0.09257865, 0.11023244, -0.01817947, -0.0035485283, -0.015536726, 0.0071826433, 0.042538714, -0.015454641, 0.079593316, -0.07242554, 0.031178504, 0.2319168, -0.10519467, 0.013837495, -0.040088437)); - target1 += mul(na2, float4x4(0.12625901, 0.04531166, 0.038758352, -0.05790713, -0.10029771, -0.118265375, -0.23944628, 0.11955388, 0.070732996, 0.19404806, -0.019913414, 0.04609079, 0.06262817, 0.022330387, -0.029681094, 0.03719176)); - target1 += mul(nb2, float4x4(-0.07737922, 0.0024623116, -0.037666153, -0.19271135, -0.015002153, -0.0059966356, 0.0024538909, -0.0401021, -0.18540399, -0.11140236, -0.11102473, -0.06390247, 0.016754225, 0.35000673, -0.19139731, 0.07363001)); - target1 += mul(nc2, float4x4(0.02150171, -0.2311761, -0.025124706, 0.16819553, -0.0013348719, 0.32091036, -0.061826598, 0.12579474, -0.036611024, -0.018266583, -0.11280143, 0.11073158, 0.050171874, -0.14706045, 0.029553955, 0.0052631944)); - target1 += mul(nd2, float4x4(0.19249865, -0.22854832, 0.09472751, 0.014705341, 0.059496958, 0.13427268, -0.06309558, -0.07153743, -0.31890163, -0.0657967, -0.040345218, 0.09544393, 0.07359761, 0.11245483, 0.00033233972, 0.031550154)); - target1 += mul(ne2, float4x4(-0.24668917, -0.37181908, -0.50614715, -0.101197146, -0.1569055, 0.27734125, 0.17144768, -0.04336267, 0.03658949, 0.06747124, 0.30720958, 0.56301194, -0.11314631, -0.29258573, 0.16256689, 0.5221001)); - target1 += mul(nf2, float4x4(-0.022761503, 0.13063031, 0.002526217, -0.03466151, -0.15225072, 0.40217137, -0.089131154, 0.19195192, -0.1379853, -0.04640692, 0.104670234, 0.12268618, -0.012009209, -0.20534724, 0.028777445, 0.22195113)); - target1 += mul(ng2, float4x4(0.23697586, 0.08793654, -0.10565018, 0.013993297, -0.025932996, -0.13859354, 0.14333159, -0.099132575, -0.049601994, -0.0917448, -0.0021633878, -0.009032609, -0.034750953, -0.30761167, 0.058994945, -0.19427797)); - target1 += mul(nh2, float4x4(-0.26944515, 0.30523893, -0.17787015, 0.10827742, 0.06457236, -0.12202401, 0.15371302, 0.011699893, -0.06253491, -0.10976804, -0.37283847, -0.23996784, -0.2750512, -0.024101513, -0.094127975, -0.17462716)); - target1 += mul(ni2, float4x4(-0.026286924, 0.06250577, 0.095423855, -0.02849258, -0.12916361, -0.10954709, -0.05825132, -0.102924265, -0.19550376, -0.11730307, 0.032346163, -0.17682706, 0.16651174, 0.031927045, -0.004800601, -0.06323844)); - target1 += float4(0.0095873345, 0.04959374, -0.15246227, 0.0044831373); + MF4 target1 = { 0.0095873345, 0.04959374, -0.15246227, 0.0044831373 }; + target1 = MulAdd(a1, MF4x4(-0.13425097, -0.23487093, 0.2480183, -0.2806276, -0.041303713, 0.100773126, -0.110890545, 0.036205858, -0.331331, -0.12929262, 0.16300063, 0.3776673, -0.20316373, -0.011239426, 0.10650887, -0.027857736), target1); + target1 = MulAdd(b1, MF4x4(0.09517376, -0.3004956, 0.05033304, -0.07464521, 0.009204248, -0.23034886, 0.30492118, -0.1215848, 0.15728685, -0.10430078, 0.04038878, 0.08034804, 0.04320418, -0.2929594, -0.018968396, 0.02542387), target1); + target1 = MulAdd(c1, MF4x4(-0.10651935, -0.2736715, 0.19267319, -0.033337504, -0.06697293, 0.028424729, 0.047814637, 0.44929537, 0.02565344, -0.253426, -0.040931404, -0.05018104, 0.032979824, -0.035349697, -0.039578713, -0.3116414), target1); + target1 = MulAdd(d1, MF4x4(0.09176126, 0.031713437, 0.24861507, 0.31351718, 0.36284143, 0.3622709, 0.16165464, 0.07319267, -0.6303202, -0.21209712, -0.02169929, 0.037275597, -0.1295319, 0.033090707, -0.029330662, 0.054679472), target1); + target1 = MulAdd(e1, MF4x4(0.15021572, -0.15177831, 0.1318225, 0.46864823, 0.059443284, 0.07404233, 0.22612074, 0.21105285, 0.319694, 0.09397257, 0.14277866, -0.0235649, -0.037205156, -0.40715128, -0.18572816, 0.058741573), target1); + target1 = MulAdd(f1, MF4x4(-0.122751735, -0.20926422, 0.2099333, -0.11627138, 0.04171681, 0.0669586, -0.03831368, 0.27334675, 0.0492008, 0.12854317, 0.03308622, 0.45236585, 0.03122829, 0.13853219, 0.05084764, -0.3965012), target1); + target1 = MulAdd(g1, MF4x4(-0.0019293908, -0.15562099, 0.12418126, 0.0045440597, 0.05442391, -0.15613738, 0.14828286, -0.17687118, -0.053517755, -0.33350968, -0.062314924, -0.31358472, -0.09670371, 0.043190923, 0.008150662, 0.09928506), target1); + target1 = MulAdd(h1, MF4x4(-0.06698031, -0.099411525, 0.24259582, -0.1073659, 0.06762824, 0.059605874, -0.20944163, -0.1598055, 0.32746908, -0.17759447, 0.2859796, -0.1274256, 0.30796206, -0.00791448, 0.114059694, 0.14775705), target1); + target1 = MulAdd(i1, MF4x4(0.16291203, -0.14958477, 0.14716864, 0.2056065, -0.019337546, 0.032286238, 0.0030445335, -0.08208513, -0.14208078, 0.13601872, -0.23367858, -0.19092909, -0.20207883, -0.016950991, 0.009309007, 0.1376546), target1); + target1 = MulAdd(a2, MF4x4(-0.11093091, -0.32362202, -0.041845415, 0.029758021, -0.07261404, -0.048653398, 0.19167832, 0.09343212, 0.030472826, -0.15078579, -0.0056376588, 0.0045257527, -0.24521805, -0.10473077, 0.11163019, -0.1724187), target1); + target1 = MulAdd(b2, MF4x4(-0.08601668, 0.16612485, -0.07751539, 0.07261594, -0.19028407, 0.23896623, -0.10416726, 0.23500614, 0.1955228, 0.08699591, -0.049277775, 0.13447775, 0.19434914, -0.11481196, 0.088043146, 0.13352895), target1); + target1 = MulAdd(c2, MF4x4(-0.013221233, 0.07521129, 0.042819552, -0.11163175, 0.066080205, -0.25043094, -0.010348969, -0.013258202, 0.09444396, 0.29623637, 0.025016114, 0.050744686, -0.12219596, -0.0735393, -0.024817836, -0.06897588), target1); + target1 = MulAdd(d2, MF4x4(-0.25720942, 0.19861753, -0.18535058, 0.12190362, -0.33756095, -0.0038898317, 0.09739055, 0.41227046, -0.10030796, 0.025445882, -0.23542109, 0.08677691, 0.08140194, -0.22716106, 0.14016968, -0.0927231), target1); + target1 = MulAdd(e2, MF4x4(0.58745646, -0.12533307, 0.30129984, 0.08898194, -0.07972344, -0.37581098, 0.06863413, -0.13185541, 0.21801205, 0.31779078, -0.3804784, -0.3200699, 0.14534226, 0.05912262, 0.07938948, -0.34869507), target1); + target1 = MulAdd(f2, MF4x4(0.024675166, -0.067802526, 0.030065436, 0.06509131, 0.14367498, 0.022554757, 0.014991865, -0.029914752, 0.5123549, -0.012557206, -0.13014166, -0.34184244, -0.09080884, 0.13782553, -0.018931886, -0.35642785), target1); + target1 = MulAdd(g2, MF4x4(-0.37336427, -0.02705006, 0.14392053, 0.024049882, -0.024705589, 0.14556128, -0.12120506, -0.06275598, -0.1284325, 0.11409197, -0.08397436, -0.075944416, 0.056465942, 0.04016099, 0.096723564, -0.08359723), target1); + target1 = MulAdd(h2, MF4x4(0.20243345, -0.09287934, -0.11676041, 0.005206654, -0.2879361, 0.41677123, -0.16924824, 0.22429213, 0.082279116, -0.1780833, 0.20209241, 0.12970525, -0.030272234, -0.19200714, 0.0015769673, -0.1389732), target1); + target1 = MulAdd(i2, MF4x4(0.04211243, 0.07331798, -0.055724114, 0.04086206, -0.04635456, 0.027212424, 0.021861525, 0.12424812, 0.43009162, 0.021664696, 0.20828371, 0.11859106, 0.07390811, -0.1861182, 0.034559406, 0.18561925), target1); + target1 = MulAdd(na1, MF4x4(0.22596797, 0.025346763, -0.056839246, 0.09137385, 0.07363095, -0.12382036, 0.08911783, -0.012355983, -0.07869761, 0.051298574, 0.00816572, -0.044984274, 0.07962154, -0.2254524, -0.007821531, -0.04936664), target1); + target1 = MulAdd(nb1, MF4x4(0.06265961, -0.17783198, 0.11678783, -0.12965304, 0.014506855, -0.17513473, -0.23593299, 0.14054537, 0.1580306, 0.31872272, -0.0042505316, -0.070422255, -0.01316396, 0.0058355615, 0.062464185, -0.06086727), target1); + target1 = MulAdd(nc1, MF4x4(-0.079526044, 0.23932967, -0.1139716, 0.15888569, 0.06526993, -0.06958436, -0.04070066, -0.12081254, 0.026716579, 0.014887845, 0.0061467467, 0.127956, 0.040913627, -0.0032820841, 0.086145625, 0.22520025), target1); + target1 = MulAdd(nd1, MF4x4(0.25577608, 0.02553098, -0.14822578, -0.11907723, -0.09787419, -0.03544863, -0.08098151, -0.01305555, 0.20404844, 0.11294246, 0.10096346, 0.15795162, 0.2554626, 0.09361069, 0.001985862, -0.0051444587), target1); + target1 = MulAdd(ne1, MF4x4(-0.24454486, -0.014714279, -0.2954907, -0.39995646, -0.15907967, 0.30107877, -0.34781745, 0.095281735, -0.12492393, -0.28375402, -0.16872306, 0.2531788, -0.52085644, 0.35986066, 0.07716912, 0.09565738), target1); + target1 = MulAdd(nf1, MF4x4(0.2493129, 0.06395661, -0.09491958, 0.19702488, 0.109871864, -0.051376317, 0.15404263, -0.21282886, 0.1188967, 0.07824094, -0.016752928, -0.14027214, 0.10949832, -0.27629098, 0.081909016, 0.1354018), target1); + target1 = MulAdd(ng1, MF4x4(0.18950915, -0.034574565, -0.10378051, -0.15800652, -0.06835184, -0.06987467, 0.035007782, 0.04686656, 0.054061133, 0.014833506, -0.0035361175, 0.016156103, 0.120767444, -0.10196722, 0.10668838, -0.09058739), target1); + target1 = MulAdd(nh1, MF4x4(-0.032248627, 0.056413256, 0.042716432, 0.06681831, 0.047605485, -0.07629479, 0.14311917, -0.06909803, 0.10640394, 0.10701861, -0.0051839007, -0.15133362, -0.32146424, -0.039978918, -0.12280021, 0.0048507582), target1); + target1 = MulAdd(ni1, MF4x4(-0.1954503, -0.09257865, 0.11023244, -0.01817947, -0.0035485283, -0.015536726, 0.0071826433, 0.042538714, -0.015454641, 0.079593316, -0.07242554, 0.031178504, 0.2319168, -0.10519467, 0.013837495, -0.040088437), target1); + target1 = MulAdd(na2, MF4x4(0.12625901, 0.04531166, 0.038758352, -0.05790713, -0.10029771, -0.118265375, -0.23944628, 0.11955388, 0.070732996, 0.19404806, -0.019913414, 0.04609079, 0.06262817, 0.022330387, -0.029681094, 0.03719176), target1); + target1 = MulAdd(nb2, MF4x4(-0.07737922, 0.0024623116, -0.037666153, -0.19271135, -0.015002153, -0.0059966356, 0.0024538909, -0.0401021, -0.18540399, -0.11140236, -0.11102473, -0.06390247, 0.016754225, 0.35000673, -0.19139731, 0.07363001), target1); + target1 = MulAdd(nc2, MF4x4(0.02150171, -0.2311761, -0.025124706, 0.16819553, -0.0013348719, 0.32091036, -0.061826598, 0.12579474, -0.036611024, -0.018266583, -0.11280143, 0.11073158, 0.050171874, -0.14706045, 0.029553955, 0.0052631944), target1); + target1 = MulAdd(nd2, MF4x4(0.19249865, -0.22854832, 0.09472751, 0.014705341, 0.059496958, 0.13427268, -0.06309558, -0.07153743, -0.31890163, -0.0657967, -0.040345218, 0.09544393, 0.07359761, 0.11245483, 0.00033233972, 0.031550154), target1); + target1 = MulAdd(ne2, MF4x4(-0.24668917, -0.37181908, -0.50614715, -0.101197146, -0.1569055, 0.27734125, 0.17144768, -0.04336267, 0.03658949, 0.06747124, 0.30720958, 0.56301194, -0.11314631, -0.29258573, 0.16256689, 0.5221001), target1); + target1 = MulAdd(nf2, MF4x4(-0.022761503, 0.13063031, 0.002526217, -0.03466151, -0.15225072, 0.40217137, -0.089131154, 0.19195192, -0.1379853, -0.04640692, 0.104670234, 0.12268618, -0.012009209, -0.20534724, 0.028777445, 0.22195113), target1); + target1 = MulAdd(ng2, MF4x4(0.23697586, 0.08793654, -0.10565018, 0.013993297, -0.025932996, -0.13859354, 0.14333159, -0.099132575, -0.049601994, -0.0917448, -0.0021633878, -0.009032609, -0.034750953, -0.30761167, 0.058994945, -0.19427797), target1); + target1 = MulAdd(nh2, MF4x4(-0.26944515, 0.30523893, -0.17787015, 0.10827742, 0.06457236, -0.12202401, 0.15371302, 0.011699893, -0.06253491, -0.10976804, -0.37283847, -0.23996784, -0.2750512, -0.024101513, -0.094127975, -0.17462716), target1); + target1 = MulAdd(ni2, MF4x4(-0.026286924, 0.06250577, 0.095423855, -0.02849258, -0.12916361, -0.10954709, -0.05825132, -0.102924265, -0.19550376, -0.11730307, 0.032346163, -0.17682706, 0.16651174, 0.031927045, -0.004800601, -0.06323844), target1); - float4 target2 = mul(a1, float4x4(-0.021453971, -0.108874515, 0.0005208881, -0.09774453, -0.0053757126, 0.20114918, 0.24454592, 0.04932251, -0.0037210248, -0.0240578, -0.07736935, 0.27604944, -0.12430849, -0.13093218, -0.014840212, 0.13450128)); - target2 += mul(b1, float4x4(-0.19143668, -0.23023333, -0.10232715, 0.24396868, 0.056112397, 0.14535592, -0.25882182, -0.26274678, -0.23119931, 0.07735849, -0.14785223, -0.21026523, -0.2064457, -0.34512606, -0.17808662, 0.30146623)); - target2 += mul(c1, float4x4(0.0072161015, -0.013303738, 0.07591899, 0.027883789, 0.210858, 0.1422139, -0.027882019, 0.2618474, -0.048504543, 0.07377317, -0.05427271, -0.10014041, -0.12974857, -0.13140713, -0.02249253, 0.08203184)); - target2 += mul(d1, float4x4(0.07855138, -0.13984342, 0.10037151, -0.056781758, 0.24686107, -0.0048190085, -0.2693424, 0.31722167, -0.28716075, -0.06422215, -0.06738793, -0.06723655, -0.08194382, -0.007975044, 0.20108353, -0.13338897)); - target2 += mul(e1, float4x4(0.35129568, 0.27930936, 0.024239251, -0.10712293, 0.48684034, -0.04380574, -0.0064479653, 0.03754327, -0.13139078, -0.44939983, -1.0460628, -0.016004754, -0.14476573, -0.07113434, 0.515311, -0.400374)); - target2 += mul(f1, float4x4(0.13104302, -0.23410062, 0.091530964, -0.003652217, 0.16696814, 0.16406855, -0.08138474, 0.047526445, 0.25358474, 0.37850454, 0.0362802, -0.046476766, -0.093869686, -0.4143772, 0.08641024, 0.115896136)); - target2 += mul(g1, float4x4(-0.04416574, -0.052188106, 0.05141859, -0.008132604, -0.013658864, 0.1021097, 0.19391364, -0.09257973, 0.15225394, -0.16920799, -0.16172324, 0.41466942, -0.07087308, 0.08632938, -0.07496043, -0.023530172)); - target2 += mul(h1, float4x4(0.09337352, 0.062108494, -0.219173, -0.046151914, 0.22507025, -0.08966131, -0.123690315, 0.08666376, -0.10731867, -0.08518657, 0.024199447, 0.17898631, 0.120247275, 0.089923285, -0.08756211, 0.1775775)); - target2 += mul(i1, float4x4(0.20326594, -0.060535498, -0.061659336, 0.113954924, -0.073462196, 0.15917051, 0.11728326, -0.072256014, -0.0752342, 0.06265616, -0.19494365, -0.25413772, -0.06641352, -0.015642308, 0.16825356, 0.0027654327)); - target2 += mul(a2, float4x4(-0.17029639, -0.05388927, -0.13159063, 0.0795609, 0.00501164, -0.0703107, -0.08229201, 0.07546247, 0.092942156, 0.059050936, -0.07987315, 0.010874322, 0.037708692, -0.0017377702, -0.030414931, 0.28946167)); - target2 += mul(b2, float4x4(-0.2692667, 0.2258295, 0.062060453, 0.1934921, -0.023051793, -0.038611185, 0.21473692, 0.33520013, 0.029885106, 0.103782356, 0.05217351, -0.13349791, -0.034186684, -0.3015818, 0.033423528, 0.21218027)); - target2 += mul(c2, float4x4(-0.013587494, 0.021273775, -0.022650799, -0.011939531, -0.11202949, 0.09365859, -0.042938907, -0.009910716, 0.27254924, 0.07752608, 0.029586637, 0.024899973, 0.04375618, 0.31453863, -0.006775175, 0.008228053)); - target2 += mul(d2, float4x4(-0.49562672, -0.12472124, -0.13618441, 0.09660054, -0.2275429, -0.0902811, 0.18311924, 0.11677185, -0.13325182, -0.061613016, -0.011462703, -0.12538978, 0.054934092, 0.06742866, 0.25515345, 0.35692096)); - target2 += mul(e2, float4x4(0.5266911, -0.09655596, -0.41069564, -0.3174325, 0.1431904, -0.17732115, -0.36320353, 0.37975433, -0.5158582, -0.21019879, 0.06852925, -0.06648648, -0.18956456, -0.018139647, 0.35707653, 0.07378416)); - target2 += mul(f2, float4x4(0.04151976, -0.037361674, 0.06936584, -0.10462262, -0.22264048, -0.043842267, -0.12742832, -0.21778631, 0.0715335, -0.17921853, -0.3856251, -0.16335362, 0.21045755, -0.5026229, 0.14405337, 0.23096423)); - target2 += mul(g2, float4x4(-0.32437655, 0.07860345, -0.0021187086, 0.123870686, -0.16616751, 0.11004699, 0.04754715, -0.0075211064, -0.08026408, 0.04284957, -0.018143758, 0.032623176, 0.06614686, -0.035856936, 0.13667971, -0.15696613)); - target2 += mul(h2, float4x4(0.11260625, 0.03274457, -0.033769324, -0.11558525, -0.35377702, 0.0019119612, 0.24906515, -0.06853208, 0.0009843144, -0.0050376705, 0.063123666, 0.009872904, 0.19592324, 0.0028321196, -0.114693984, 0.16404222)); - target2 += mul(i2, float4x4(-0.03699667, 0.011842293, -0.12273219, 0.04081692, 0.008484447, -0.052331816, 0.07151068, 0.018538639, 0.077749036, 0.07189092, 0.22443593, -0.2436085, 0.023654116, -0.05127411, 0.27350748, 0.12180999)); - target2 += mul(na1, float4x4(0.16090482, 0.059198547, 0.04856637, -0.19173436, 0.12747662, -0.079715036, -0.20203276, -0.13818277, -0.123076215, -0.07168488, 0.0644838, 0.03524764, 0.0005124138, -0.06789178, 0.048645556, -0.098922126)); - target2 += mul(nb1, float4x4(0.29220074, 0.25197285, 0.09825887, 0.030363245, -0.033246458, -0.08370418, -0.12231589, -0.023000835, 0.082732, -0.16907515, -0.052518822, 0.07991363, 0.06222654, -0.06747275, -0.18931144, -0.42009747)); - target2 += mul(nc1, float4x4(0.02667354, 0.03842717, -0.012755562, 0.061840586, 0.01060547, -0.29081437, 0.010907111, 0.07930905, 0.12273201, 0.017574295, 0.051024225, 0.019036688, 0.07671181, 0.049130872, -0.09734168, -0.070569195)); - target2 += mul(nd1, float4x4(0.08517651, 0.0767222, -0.15657257, 0.18501835, -0.13749431, -0.2833894, 0.109219365, 0.033763003, 0.18988928, 0.13461404, -0.036578514, -0.13256857, -0.097819485, -0.17316358, -0.06512401, 0.1937444)); - target2 += mul(ne1, float4x4(-0.32173568, -0.072075866, 0.13004705, -0.15507852, -0.23741087, -0.29364398, 0.10723945, -0.11976219, 0.20620506, 0.17970093, 0.24463713, -0.12555319, -0.021192182, -0.1374317, 0.5359718, 0.59974134)); - target2 += mul(nf1, float4x4(-0.01101575, 0.040466793, -0.009630791, 0.13422947, -0.13290837, -0.24789505, -0.061713737, -0.07838521, 0.05379315, -0.14643523, -0.09155805, -0.049997047, 0.06696885, 0.20043123, -0.07542329, -0.08041673)); - target2 += mul(ng1, float4x4(0.022160506, 0.01611432, -0.10189221, -0.022767285, -0.06682965, 0.047138248, 0.06860934, -0.012574086, 0.04010214, -0.041280016, -0.034621384, -0.018262599, 0.09731754, -0.059062295, 0.14786182, -0.15185094)); - target2 += mul(nh1, float4x4(-0.052484483, 0.06899427, 0.18380043, -0.058414727, 0.07685985, -0.07206598, -0.101362616, -0.012002652, 0.008517392, 0.079471916, -0.30394664, 0.028600946, -0.03270232, -0.23564856, 0.045065008, -0.0034684737)); - target2 += mul(ni1, float4x4(-0.049757, 0.07614825, 0.16394803, 0.027053174, 0.0451278, -0.09351286, -0.0042182617, 0.12332257, -0.025281021, -0.03843008, 0.12857373, -0.07611989, -0.0062898803, 0.022618141, -0.13122174, -0.03328411)); - target2 += mul(na2, float4x4(0.12251631, 0.047008447, 0.027589995, -0.12207328, -0.1510795, 0.06724553, 0.17066906, 0.16992114, -0.0026905634, -0.035480864, 0.033738773, 0.018674552, 0.028614907, -0.019945908, -0.0156899, -0.09562145)); - target2 += mul(nb2, float4x4(0.116588935, 0.14205505, 0.099545434, -0.045527786, -0.049273346, 0.20760757, 0.053965498, -0.12198069, -0.14654607, 0.041820496, 0.038068503, 0.24565905, 0.09786504, 0.18309233, 0.23802327, -0.085740186)); - target2 += mul(nc2, float4x4(-0.1262052, -0.011846116, -0.058820397, -0.019373653, -0.09569547, -0.08265971, -0.05178388, -0.020502446, -0.17525336, -0.22874829, 0.0075891856, -0.189923, 0.09809122, 0.109637566, -0.0005973885, -0.06477921)); - target2 += mul(nd2, float4x4(0.28209856, 0.11276813, 0.054377034, -0.00891202, -0.095922634, 0.071109876, -0.039932176, -0.047409832, -0.06504704, 0.11923986, 0.0013364811, -0.122095086, -0.20282102, -0.022717483, -0.115474045, 0.020858249)); - target2 += mul(ne2, float4x4(-0.16130303, 0.072821185, -0.021358958, -0.11687897, -0.15543966, 0.05783285, 0.10317231, -0.12240756, 0.053357504, -0.090291016, -0.21943556, 0.46947235, 0.19072579, 0.017349033, -0.55443907, -0.10510661)); - target2 += mul(nf2, float4x4(-0.4155687, 0.019206723, -0.20055711, 0.028732464, -0.1981807, 0.20637372, 0.03305817, -0.17949893, -0.21051097, 0.21483344, 0.0061496794, -0.48980987, -0.26750582, 0.09230394, -0.117223755, -0.07636286)); - target2 += mul(ng2, float4x4(0.20611528, -0.00095511036, -0.21555157, -0.07065484, 0.06880411, 0.068082534, -0.10104979, 0.16050354, -0.07437897, -0.13145325, -0.017651044, 0.055096775, -0.05443345, -0.018634815, -0.011232755, -0.10835)); - target2 += mul(nh2, float4x4(-0.2637829, 0.07681072, 0.015995527, 0.004554211, 0.07495561, 0.18873464, -0.14303622, 0.25786543, -0.14117226, -0.008715274, -0.17176823, -0.0006595096, -0.06566383, -0.19184378, -0.18945406, 0.20968987)); - target2 += mul(ni2, float4x4(-0.03293623, 0.003399063, 0.08051177, -0.0072856937, -0.07375858, 0.075319655, -0.10791501, -0.002204552, -0.093564905, -0.122712255, -0.10658267, -0.015067637, -0.033247817, 0.09952069, -0.13724248, 0.068189256)); - target2 += float4(-0.001935585, 0.05018077, -0.0154469935, -0.034524206); + MF4 target2 = { -0.001935585, 0.05018077, -0.0154469935, -0.034524206 }; + target2 = MulAdd(a1, MF4x4(-0.021453971, -0.108874515, 0.0005208881, -0.09774453, -0.0053757126, 0.20114918, 0.24454592, 0.04932251, -0.0037210248, -0.0240578, -0.07736935, 0.27604944, -0.12430849, -0.13093218, -0.014840212, 0.13450128), target2); + target2 = MulAdd(b1, MF4x4(-0.19143668, -0.23023333, -0.10232715, 0.24396868, 0.056112397, 0.14535592, -0.25882182, -0.26274678, -0.23119931, 0.07735849, -0.14785223, -0.21026523, -0.2064457, -0.34512606, -0.17808662, 0.30146623), target2); + target2 = MulAdd(c1, MF4x4(0.0072161015, -0.013303738, 0.07591899, 0.027883789, 0.210858, 0.1422139, -0.027882019, 0.2618474, -0.048504543, 0.07377317, -0.05427271, -0.10014041, -0.12974857, -0.13140713, -0.02249253, 0.08203184), target2); + target2 = MulAdd(d1, MF4x4(0.07855138, -0.13984342, 0.10037151, -0.056781758, 0.24686107, -0.0048190085, -0.2693424, 0.31722167, -0.28716075, -0.06422215, -0.06738793, -0.06723655, -0.08194382, -0.007975044, 0.20108353, -0.13338897), target2); + target2 = MulAdd(e1, MF4x4(0.35129568, 0.27930936, 0.024239251, -0.10712293, 0.48684034, -0.04380574, -0.0064479653, 0.03754327, -0.13139078, -0.44939983, -1.0460628, -0.016004754, -0.14476573, -0.07113434, 0.515311, -0.400374), target2); + target2 = MulAdd(f1, MF4x4(0.13104302, -0.23410062, 0.091530964, -0.003652217, 0.16696814, 0.16406855, -0.08138474, 0.047526445, 0.25358474, 0.37850454, 0.0362802, -0.046476766, -0.093869686, -0.4143772, 0.08641024, 0.115896136), target2); + target2 = MulAdd(g1, MF4x4(-0.04416574, -0.052188106, 0.05141859, -0.008132604, -0.013658864, 0.1021097, 0.19391364, -0.09257973, 0.15225394, -0.16920799, -0.16172324, 0.41466942, -0.07087308, 0.08632938, -0.07496043, -0.023530172), target2); + target2 = MulAdd(h1, MF4x4(0.09337352, 0.062108494, -0.219173, -0.046151914, 0.22507025, -0.08966131, -0.123690315, 0.08666376, -0.10731867, -0.08518657, 0.024199447, 0.17898631, 0.120247275, 0.089923285, -0.08756211, 0.1775775), target2); + target2 = MulAdd(i1, MF4x4(0.20326594, -0.060535498, -0.061659336, 0.113954924, -0.073462196, 0.15917051, 0.11728326, -0.072256014, -0.0752342, 0.06265616, -0.19494365, -0.25413772, -0.06641352, -0.015642308, 0.16825356, 0.0027654327), target2); + target2 = MulAdd(a2, MF4x4(-0.17029639, -0.05388927, -0.13159063, 0.0795609, 0.00501164, -0.0703107, -0.08229201, 0.07546247, 0.092942156, 0.059050936, -0.07987315, 0.010874322, 0.037708692, -0.0017377702, -0.030414931, 0.28946167), target2); + target2 = MulAdd(b2, MF4x4(-0.2692667, 0.2258295, 0.062060453, 0.1934921, -0.023051793, -0.038611185, 0.21473692, 0.33520013, 0.029885106, 0.103782356, 0.05217351, -0.13349791, -0.034186684, -0.3015818, 0.033423528, 0.21218027), target2); + target2 = MulAdd(c2, MF4x4(-0.013587494, 0.021273775, -0.022650799, -0.011939531, -0.11202949, 0.09365859, -0.042938907, -0.009910716, 0.27254924, 0.07752608, 0.029586637, 0.024899973, 0.04375618, 0.31453863, -0.006775175, 0.008228053), target2); + target2 = MulAdd(d2, MF4x4(-0.49562672, -0.12472124, -0.13618441, 0.09660054, -0.2275429, -0.0902811, 0.18311924, 0.11677185, -0.13325182, -0.061613016, -0.011462703, -0.12538978, 0.054934092, 0.06742866, 0.25515345, 0.35692096), target2); + target2 = MulAdd(e2, MF4x4(0.5266911, -0.09655596, -0.41069564, -0.3174325, 0.1431904, -0.17732115, -0.36320353, 0.37975433, -0.5158582, -0.21019879, 0.06852925, -0.06648648, -0.18956456, -0.018139647, 0.35707653, 0.07378416), target2); + target2 = MulAdd(f2, MF4x4(0.04151976, -0.037361674, 0.06936584, -0.10462262, -0.22264048, -0.043842267, -0.12742832, -0.21778631, 0.0715335, -0.17921853, -0.3856251, -0.16335362, 0.21045755, -0.5026229, 0.14405337, 0.23096423), target2); + target2 = MulAdd(g2, MF4x4(-0.32437655, 0.07860345, -0.0021187086, 0.123870686, -0.16616751, 0.11004699, 0.04754715, -0.0075211064, -0.08026408, 0.04284957, -0.018143758, 0.032623176, 0.06614686, -0.035856936, 0.13667971, -0.15696613), target2); + target2 = MulAdd(h2, MF4x4(0.11260625, 0.03274457, -0.033769324, -0.11558525, -0.35377702, 0.0019119612, 0.24906515, -0.06853208, 0.0009843144, -0.0050376705, 0.063123666, 0.009872904, 0.19592324, 0.0028321196, -0.114693984, 0.16404222), target2); + target2 = MulAdd(i2, MF4x4(-0.03699667, 0.011842293, -0.12273219, 0.04081692, 0.008484447, -0.052331816, 0.07151068, 0.018538639, 0.077749036, 0.07189092, 0.22443593, -0.2436085, 0.023654116, -0.05127411, 0.27350748, 0.12180999), target2); + target2 = MulAdd(na1, MF4x4(0.16090482, 0.059198547, 0.04856637, -0.19173436, 0.12747662, -0.079715036, -0.20203276, -0.13818277, -0.123076215, -0.07168488, 0.0644838, 0.03524764, 0.0005124138, -0.06789178, 0.048645556, -0.098922126), target2); + target2 = MulAdd(nb1, MF4x4(0.29220074, 0.25197285, 0.09825887, 0.030363245, -0.033246458, -0.08370418, -0.12231589, -0.023000835, 0.082732, -0.16907515, -0.052518822, 0.07991363, 0.06222654, -0.06747275, -0.18931144, -0.42009747), target2); + target2 = MulAdd(nc1, MF4x4(0.02667354, 0.03842717, -0.012755562, 0.061840586, 0.01060547, -0.29081437, 0.010907111, 0.07930905, 0.12273201, 0.017574295, 0.051024225, 0.019036688, 0.07671181, 0.049130872, -0.09734168, -0.070569195), target2); + target2 = MulAdd(nd1, MF4x4(0.08517651, 0.0767222, -0.15657257, 0.18501835, -0.13749431, -0.2833894, 0.109219365, 0.033763003, 0.18988928, 0.13461404, -0.036578514, -0.13256857, -0.097819485, -0.17316358, -0.06512401, 0.1937444), target2); + target2 = MulAdd(ne1, MF4x4(-0.32173568, -0.072075866, 0.13004705, -0.15507852, -0.23741087, -0.29364398, 0.10723945, -0.11976219, 0.20620506, 0.17970093, 0.24463713, -0.12555319, -0.021192182, -0.1374317, 0.5359718, 0.59974134), target2); + target2 = MulAdd(nf1, MF4x4(-0.01101575, 0.040466793, -0.009630791, 0.13422947, -0.13290837, -0.24789505, -0.061713737, -0.07838521, 0.05379315, -0.14643523, -0.09155805, -0.049997047, 0.06696885, 0.20043123, -0.07542329, -0.08041673), target2); + target2 = MulAdd(ng1, MF4x4(0.022160506, 0.01611432, -0.10189221, -0.022767285, -0.06682965, 0.047138248, 0.06860934, -0.012574086, 0.04010214, -0.041280016, -0.034621384, -0.018262599, 0.09731754, -0.059062295, 0.14786182, -0.15185094), target2); + target2 = MulAdd(nh1, MF4x4(-0.052484483, 0.06899427, 0.18380043, -0.058414727, 0.07685985, -0.07206598, -0.101362616, -0.012002652, 0.008517392, 0.079471916, -0.30394664, 0.028600946, -0.03270232, -0.23564856, 0.045065008, -0.0034684737), target2); + target2 = MulAdd(ni1, MF4x4(-0.049757, 0.07614825, 0.16394803, 0.027053174, 0.0451278, -0.09351286, -0.0042182617, 0.12332257, -0.025281021, -0.03843008, 0.12857373, -0.07611989, -0.0062898803, 0.022618141, -0.13122174, -0.03328411), target2); + target2 = MulAdd(na2, MF4x4(0.12251631, 0.047008447, 0.027589995, -0.12207328, -0.1510795, 0.06724553, 0.17066906, 0.16992114, -0.0026905634, -0.035480864, 0.033738773, 0.018674552, 0.028614907, -0.019945908, -0.0156899, -0.09562145), target2); + target2 = MulAdd(nb2, MF4x4(0.116588935, 0.14205505, 0.099545434, -0.045527786, -0.049273346, 0.20760757, 0.053965498, -0.12198069, -0.14654607, 0.041820496, 0.038068503, 0.24565905, 0.09786504, 0.18309233, 0.23802327, -0.085740186), target2); + target2 = MulAdd(nc2, MF4x4(-0.1262052, -0.011846116, -0.058820397, -0.019373653, -0.09569547, -0.08265971, -0.05178388, -0.020502446, -0.17525336, -0.22874829, 0.0075891856, -0.189923, 0.09809122, 0.109637566, -0.0005973885, -0.06477921), target2); + target2 = MulAdd(nd2, MF4x4(0.28209856, 0.11276813, 0.054377034, -0.00891202, -0.095922634, 0.071109876, -0.039932176, -0.047409832, -0.06504704, 0.11923986, 0.0013364811, -0.122095086, -0.20282102, -0.022717483, -0.115474045, 0.020858249), target2); + target2 = MulAdd(ne2, MF4x4(-0.16130303, 0.072821185, -0.021358958, -0.11687897, -0.15543966, 0.05783285, 0.10317231, -0.12240756, 0.053357504, -0.090291016, -0.21943556, 0.46947235, 0.19072579, 0.017349033, -0.55443907, -0.10510661), target2); + target2 = MulAdd(nf2, MF4x4(-0.4155687, 0.019206723, -0.20055711, 0.028732464, -0.1981807, 0.20637372, 0.03305817, -0.17949893, -0.21051097, 0.21483344, 0.0061496794, -0.48980987, -0.26750582, 0.09230394, -0.117223755, -0.07636286), target2); + target2 = MulAdd(ng2, MF4x4(0.20611528, -0.00095511036, -0.21555157, -0.07065484, 0.06880411, 0.068082534, -0.10104979, 0.16050354, -0.07437897, -0.13145325, -0.017651044, 0.055096775, -0.05443345, -0.018634815, -0.011232755, -0.10835), target2); + target2 = MulAdd(nh2, MF4x4(-0.2637829, 0.07681072, 0.015995527, 0.004554211, 0.07495561, 0.18873464, -0.14303622, 0.25786543, -0.14117226, -0.008715274, -0.17176823, -0.0006595096, -0.06566383, -0.19184378, -0.18945406, 0.20968987), target2); + target2 = MulAdd(ni2, MF4x4(-0.03293623, 0.003399063, 0.08051177, -0.0072856937, -0.07375858, 0.075319655, -0.10791501, -0.002204552, -0.093564905, -0.122712255, -0.10658267, -0.015067637, -0.033247817, 0.09952069, -0.13724248, 0.068189256), target2); conv2d_5_tf[gxy] = target1; conv2d_5_tf1[gxy] = target2; @@ -1004,25 +1007,25 @@ void Pass7(uint2 blockStart, uint3 threadId) { // [ a, d, g ] // [ b, e, h ] // [ c, f, i ] - float4 a1 = conv2d_5_tf.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0); - float4 b1 = conv2d_5_tf.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); - float4 c1 = conv2d_5_tf.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); - float4 d1 = conv2d_5_tf.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); - float4 e1 = conv2d_5_tf.SampleLevel(sam, pos, 0); - float4 f1 = conv2d_5_tf.SampleLevel(sam, pos + float2(0, inputPt.y), 0); - float4 g1 = conv2d_5_tf.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); - float4 h1 = conv2d_5_tf.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); - float4 i1 = conv2d_5_tf.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0); - - float4 na1 = max(-a1, 0); - float4 nb1 = max(-b1, 0); - float4 nc1 = max(-c1, 0); - float4 nd1 = max(-d1, 0); - float4 ne1 = max(-e1, 0); - float4 nf1 = max(-f1, 0); - float4 ng1 = max(-g1, 0); - float4 nh1 = max(-h1, 0); - float4 ni1 = max(-i1, 0); + MF4 a1 = conv2d_5_tf.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0); + MF4 b1 = conv2d_5_tf.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); + MF4 c1 = conv2d_5_tf.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); + MF4 d1 = conv2d_5_tf.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); + MF4 e1 = conv2d_5_tf.SampleLevel(sam, pos, 0); + MF4 f1 = conv2d_5_tf.SampleLevel(sam, pos + float2(0, inputPt.y), 0); + MF4 g1 = conv2d_5_tf.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); + MF4 h1 = conv2d_5_tf.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); + MF4 i1 = conv2d_5_tf.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0); + + MF4 na1 = max(-a1, 0); + MF4 nb1 = max(-b1, 0); + MF4 nc1 = max(-c1, 0); + MF4 nd1 = max(-d1, 0); + MF4 ne1 = max(-e1, 0); + MF4 nf1 = max(-f1, 0); + MF4 ng1 = max(-g1, 0); + MF4 nh1 = max(-h1, 0); + MF4 ni1 = max(-i1, 0); a1 = max(a1, 0); b1 = max(b1, 0); @@ -1034,25 +1037,25 @@ void Pass7(uint2 blockStart, uint3 threadId) { h1 = max(h1, 0); i1 = max(i1, 0); - float4 a2 = conv2d_5_tf1.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0); - float4 b2 = conv2d_5_tf1.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); - float4 c2 = conv2d_5_tf1.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); - float4 d2 = conv2d_5_tf1.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); - float4 e2 = conv2d_5_tf1.SampleLevel(sam, pos, 0); - float4 f2 = conv2d_5_tf1.SampleLevel(sam, pos + float2(0, inputPt.y), 0); - float4 g2 = conv2d_5_tf1.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); - float4 h2 = conv2d_5_tf1.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); - float4 i2 = conv2d_5_tf1.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0); - - float4 na2 = max(-a2, 0); - float4 nb2 = max(-b2, 0); - float4 nc2 = max(-c2, 0); - float4 nd2 = max(-d2, 0); - float4 ne2 = max(-e2, 0); - float4 nf2 = max(-f2, 0); - float4 ng2 = max(-g2, 0); - float4 nh2 = max(-h2, 0); - float4 ni2 = max(-i2, 0); + MF4 a2 = conv2d_5_tf1.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0); + MF4 b2 = conv2d_5_tf1.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); + MF4 c2 = conv2d_5_tf1.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); + MF4 d2 = conv2d_5_tf1.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); + MF4 e2 = conv2d_5_tf1.SampleLevel(sam, pos, 0); + MF4 f2 = conv2d_5_tf1.SampleLevel(sam, pos + float2(0, inputPt.y), 0); + MF4 g2 = conv2d_5_tf1.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); + MF4 h2 = conv2d_5_tf1.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); + MF4 i2 = conv2d_5_tf1.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0); + + MF4 na2 = max(-a2, 0); + MF4 nb2 = max(-b2, 0); + MF4 nc2 = max(-c2, 0); + MF4 nd2 = max(-d2, 0); + MF4 ne2 = max(-e2, 0); + MF4 nf2 = max(-f2, 0); + MF4 ng2 = max(-g2, 0); + MF4 nh2 = max(-h2, 0); + MF4 ni2 = max(-i2, 0); a2 = max(a2, 0); b2 = max(b2, 0); @@ -1064,81 +1067,81 @@ void Pass7(uint2 blockStart, uint3 threadId) { h2 = max(h2, 0); i2 = max(i2, 0); - float4 target1 = mul(a1, float4x4(0.0053346683, 0.010174534, -0.050979972, -0.06134544, -0.007238652, -0.012790015, 0.036398683, -0.09181499, 0.11328388, -0.14236617, -0.17519625, -0.34661606, 0.008069393, -0.028871074, -0.02980949, -0.0853359)); - target1 += mul(b1, float4x4(-0.05187267, -0.09381704, 0.035209883, 0.29482442, -0.0018002815, -0.029504262, 0.2609028, -0.09480671, -0.0737553, -0.070559524, 0.081991084, 0.1513024, 0.048344653, -0.09336617, 0.0034569732, 0.10530542)); - target1 += mul(c1, float4x4(-0.06749591, 0.0065624053, 0.013237342, 0.14225115, 0.27183163, -0.15656447, 0.031672053, 0.009592649, -0.0202286, 0.26220062, 0.19387855, -0.18505628, 0.040554795, 0.07295961, -0.13291295, -0.12600344)); - target1 += mul(d1, float4x4(0.039192002, 0.0846215, -0.06593224, 0.28147796, 0.06301313, 0.26323164, -0.16742979, 0.22004774, -0.17470881, 0.060716614, 0.15430811, 0.18970133, 0.08858931, -0.027321626, -0.037833836, 0.07344837)); - target1 += mul(e1, float4x4(0.0633813, 0.35046157, -0.101075254, 0.015974075, 0.19010352, -0.7135035, -0.24324696, -0.42072615, 0.06825536, -0.052808974, 0.28965715, -0.0015640302, -0.27062586, 0.4279925, 0.035623744, 0.46321228)); - target1 += mul(f1, float4x4(0.02639867, 0.26469797, -0.09086266, 0.07440796, -0.192054, 0.1010368, -0.04398074, 0.056824226, -0.27057743, -0.20455118, 0.19338831, -0.21843775, 0.20736177, -0.26259273, -0.07667085, -0.19504389)); - target1 += mul(g1, float4x4(-0.007056104, 0.04284205, 0.01933733, 0.07267832, 0.0012616975, -0.30140647, -0.019223223, -0.046687007, -0.037844718, -0.014929125, 0.022630794, 0.046716493, 0.057279173, -0.08055539, -0.027891241, 0.019557232)); - target1 += mul(h1, float4x4(0.035518404, -0.10087327, 0.0011048123, -0.123707846, 0.37190285, 0.43751532, -0.022599256, -0.041709043, 0.11357196, -0.029839104, -0.056960747, -0.17228557, 0.08558022, 0.046361133, 0.021548864, 0.24297418)); - target1 += mul(i1, float4x4(-0.043598346, -0.09812348, 0.056599542, -0.09833163, -0.07193007, 0.015760094, -0.053177495, -0.015448543, 0.035163186, -0.03889347, 0.121799015, 0.15738566, -0.115644835, 0.043310717, 0.060173217, -0.059635755)); - target1 += mul(a2, float4x4(-0.111604795, 0.1678389, 0.049967546, 0.045353863, -0.013896185, 0.035128903, 0.040686198, -0.16442506, 0.1149577, -0.14343217, -0.08858, 0.02656137, 0.059526477, -0.13914491, 0.12757027, 0.034920372)); - target1 += mul(b2, float4x4(0.15849945, 0.12067003, -0.1579611, 0.30790725, -0.041249942, 0.03948043, -0.12535375, -0.02566875, 0.3150059, 0.027081972, -0.026308673, -0.25326517, 0.016824603, -0.13551097, 0.1412756, 0.037750524)); - target1 += mul(c2, float4x4(0.1562541, -0.041948073, -0.14951487, 0.119380556, -0.21773878, -0.019281754, 0.08185942, 0.09982689, 0.017187534, -0.18181366, -0.09270861, 0.08527679, 0.051988564, 0.08686172, -0.12665209, -0.07205808)); - target1 += mul(d2, float4x4(0.08860466, -0.17931758, 0.10191625, -0.47623265, 0.1562338, -0.2960855, 0.013664795, 0.29452285, 0.1463958, 0.17562817, -0.41623253, -0.196999, -0.049113005, 0.0556021, 0.054452494, 0.14073615)); - target1 += mul(e2, float4x4(-0.5345973, -0.069205046, 0.37001884, 0.6955835, 0.22635284, -0.09021557, -0.04693607, -0.4458824, 0.25049326, -0.06503396, 0.07584689, 0.5394811, 0.33387923, -0.010540017, 0.038980547, -0.13371105)); - target1 += mul(f2, float4x4(-0.04414677, -0.22056313, 0.05580458, 0.11914465, 0.19864987, -0.1025625, -0.010050287, 0.15919746, -0.40589634, 0.4966349, -0.47632688, -0.022637444, 0.17247641, -0.51093113, 0.21157944, -0.2890017)); - target1 += mul(g2, float4x4(-0.034673482, -0.0075900992, -0.061077584, -0.03859898, 0.32444152, -0.14619137, -0.1375446, -0.030322462, 0.029679669, 0.079344586, -0.03862258, -0.05766807, 0.104488336, 0.006179548, -0.1168102, 0.069729604)); - target1 += mul(h2, float4x4(0.08504003, 0.042162962, -0.17509954, -0.06258286, -0.45796555, -0.061748773, 0.25438437, -0.02988987, -0.06897794, 0.105180845, -0.08879189, -0.120605074, -0.1478659, -0.13201937, -0.01755498, 0.020606143)); - target1 += mul(i2, float4x4(0.08932581, 0.1453785, -0.12802933, 0.049442187, 0.045360584, 0.16079827, -0.14142223, 0.10168491, 0.20244479, -0.17981426, 0.19759466, 0.05217847, 0.04889828, 0.06941533, -0.111836776, -0.08046399)); - target1 += mul(na1, float4x4(-0.011953735, 0.11362504, -0.122588776, -0.10408559, 0.051712614, -0.05161036, -0.068698496, -0.015663281, -0.06346889, 0.06561636, 0.03783044, 0.02756004, -0.036310352, -0.16962235, -0.062494226, 0.0069608325)); - target1 += mul(nb1, float4x4(-0.16857432, -0.17322211, 0.15971284, 0.19980437, -0.007965961, -0.015480705, 0.036090557, 0.07414387, -0.2941106, -0.24430539, 0.01070864, 0.22401866, -0.34321144, 0.09537491, -0.08020218, 0.45404655)); - target1 += mul(nc1, float4x4(-0.021609096, -0.11348408, -0.01450652, -0.063170746, 0.06990935, -0.035983834, -0.038010992, -0.10578655, 0.29232737, 0.048835874, 0.054028947, -0.12924139, -0.03058583, 0.028469706, 0.09563202, 0.085674495)); - target1 += mul(nd1, float4x4(-0.01894022, 0.037628658, -0.102314636, -0.28041583, 0.07495663, -0.058895253, 0.16422969, -0.07163792, 0.039416216, -0.13800906, -0.039811566, -0.10612402, -0.047593113, -0.28491783, 0.41632858, 0.15253194)); - target1 += mul(ne1, float4x4(0.26240867, -0.05335849, -0.014135048, 0.055749495, -0.020126658, 0.2952794, -0.015241771, 0.36143306, 0.43075684, 0.1921996, -0.4329065, 0.5114495, 0.7326109, -0.054901246, -0.076693356, -0.26104695)); - target1 += mul(nf1, float4x4(0.14548428, 0.14578429, 0.17193514, -0.07973242, 0.011952286, -0.047767498, 0.025101405, 0.0016503566, -0.26948047, -0.16503395, -0.061791085, 0.030557185, 0.15400517, -0.054951698, -0.14611247, 0.3550633)); - target1 += mul(ng1, float4x4(-0.05926111, -0.083442695, 0.046579204, -0.017723244, 0.12846185, 0.018434443, -0.17914511, -0.077696435, 0.060048338, 0.02956987, -0.11914462, 0.057770032, -0.054673657, -0.005353606, -0.39014184, 0.08306877)); - target1 += mul(nh1, float4x4(0.07357362, 0.23051825, -0.22640751, 0.080715515, -0.14467078, 0.009734264, 0.054320686, 0.24534328, -0.16038458, 0.06575425, 0.058553413, 0.17755087, 0.08184439, 0.17078212, 0.148369, -0.09309279)); - target1 += mul(ni1, float4x4(-0.11160211, -0.07590204, -0.01676188, -0.062253337, 0.016433533, 0.0146132, -0.040350936, 0.06749202, -0.031521842, 0.1441664, -0.09916073, 0.050578352, -0.06560962, -0.31174552, 0.056873083, -0.077912)); - target1 += mul(na2, float4x4(0.09344025, 0.075936995, -0.1627903, -0.04781558, -0.01878236, 0.045879602, -0.11507387, -0.025356822, -0.09113391, 0.07263937, 0.08232447, 0.08727616, -0.024921807, 0.051639438, 0.006532631, -0.018751068)); - target1 += mul(nb2, float4x4(0.022455849, -0.12924309, 0.26318657, -0.32464805, -0.09627585, 0.04496843, -0.09630052, -0.025761643, -0.090804085, 0.24410398, -0.03162944, -0.1961483, 0.14065808, -0.064709485, -0.0040163463, 0.05445074)); - target1 += mul(nc2, float4x4(-0.020935195, -0.1028065, 0.0012804621, 0.02302866, -0.00924972, -0.0041193594, 0.0060590385, -0.003394384, -0.23241943, -0.023235107, 0.08077456, 0.15720141, 0.06568382, -0.09971436, 0.09056065, 0.04271102)); - target1 += mul(nd2, float4x4(-0.20997737, -0.12892777, 0.4658528, 0.13622813, -0.2867294, -0.09359254, 0.18821026, 0.25550604, -0.18562363, 0.080713026, 0.13463654, 0.045504905, -0.013133853, -0.1316404, 0.08379897, -0.00047142128)); - target1 += mul(ne2, float4x4(0.3276134, 0.21952826, -0.80777377, -0.69810224, 0.34190908, -0.09293263, 0.33313555, -0.27255502, -0.24287084, -0.07741488, 0.06090265, -0.10161252, -0.37684909, 0.4678029, 0.13506591, 0.42470258)); - target1 += mul(nf2, float4x4(0.080790855, -0.09707547, -0.05506975, 0.027011644, -0.1434346, 0.01363872, 0.12616752, 0.16789167, 0.1656414, -0.11586835, 0.059612263, -0.074029386, -0.19813071, 0.46032718, -0.03935981, 0.0067143585)); - target1 += mul(ng2, float4x4(0.10322512, 0.0822636, -0.16766444, 0.041008063, -0.027768405, 0.23103505, 0.06737122, 0.15258405, 0.04557388, -0.18179403, 0.12489025, -0.09759324, -0.05925805, 0.04869987, 0.07329833, -0.09738542)); - target1 += mul(nh2, float4x4(-0.10823879, -0.403376, 0.3264802, -0.16503738, -0.057512645, -0.20902547, -0.14862378, -0.3192005, -0.046263676, 0.12744917, -0.019174274, -0.02318789, -0.085088454, -0.05723332, 0.0039772973, 0.07991316)); - target1 += mul(ni2, float4x4(0.10313916, 0.04410904, 0.03286652, 0.059946325, 0.019948404, 0.070217304, -0.017572487, 0.20332281, 0.06776308, 0.029285522, -0.14116238, -0.05864782, -0.18382367, -0.06568212, 0.11855615, 0.101256005)); - target1 += float4(-0.036374483, 0.029420665, 0.04437756, -0.04474691); + MF4 target1 = { -0.036374483, 0.029420665, 0.04437756, -0.04474691 }; + target1 = MulAdd(a1, MF4x4(0.0053346683, 0.010174534, -0.050979972, -0.06134544, -0.007238652, -0.012790015, 0.036398683, -0.09181499, 0.11328388, -0.14236617, -0.17519625, -0.34661606, 0.008069393, -0.028871074, -0.02980949, -0.0853359), target1); + target1 = MulAdd(b1, MF4x4(-0.05187267, -0.09381704, 0.035209883, 0.29482442, -0.0018002815, -0.029504262, 0.2609028, -0.09480671, -0.0737553, -0.070559524, 0.081991084, 0.1513024, 0.048344653, -0.09336617, 0.0034569732, 0.10530542), target1); + target1 = MulAdd(c1, MF4x4(-0.06749591, 0.0065624053, 0.013237342, 0.14225115, 0.27183163, -0.15656447, 0.031672053, 0.009592649, -0.0202286, 0.26220062, 0.19387855, -0.18505628, 0.040554795, 0.07295961, -0.13291295, -0.12600344), target1); + target1 = MulAdd(d1, MF4x4(0.039192002, 0.0846215, -0.06593224, 0.28147796, 0.06301313, 0.26323164, -0.16742979, 0.22004774, -0.17470881, 0.060716614, 0.15430811, 0.18970133, 0.08858931, -0.027321626, -0.037833836, 0.07344837), target1); + target1 = MulAdd(e1, MF4x4(0.0633813, 0.35046157, -0.101075254, 0.015974075, 0.19010352, -0.7135035, -0.24324696, -0.42072615, 0.06825536, -0.052808974, 0.28965715, -0.0015640302, -0.27062586, 0.4279925, 0.035623744, 0.46321228), target1); + target1 = MulAdd(f1, MF4x4(0.02639867, 0.26469797, -0.09086266, 0.07440796, -0.192054, 0.1010368, -0.04398074, 0.056824226, -0.27057743, -0.20455118, 0.19338831, -0.21843775, 0.20736177, -0.26259273, -0.07667085, -0.19504389), target1); + target1 = MulAdd(g1, MF4x4(-0.007056104, 0.04284205, 0.01933733, 0.07267832, 0.0012616975, -0.30140647, -0.019223223, -0.046687007, -0.037844718, -0.014929125, 0.022630794, 0.046716493, 0.057279173, -0.08055539, -0.027891241, 0.019557232), target1); + target1 = MulAdd(h1, MF4x4(0.035518404, -0.10087327, 0.0011048123, -0.123707846, 0.37190285, 0.43751532, -0.022599256, -0.041709043, 0.11357196, -0.029839104, -0.056960747, -0.17228557, 0.08558022, 0.046361133, 0.021548864, 0.24297418), target1); + target1 = MulAdd(i1, MF4x4(-0.043598346, -0.09812348, 0.056599542, -0.09833163, -0.07193007, 0.015760094, -0.053177495, -0.015448543, 0.035163186, -0.03889347, 0.121799015, 0.15738566, -0.115644835, 0.043310717, 0.060173217, -0.059635755), target1); + target1 = MulAdd(a2, MF4x4(-0.111604795, 0.1678389, 0.049967546, 0.045353863, -0.013896185, 0.035128903, 0.040686198, -0.16442506, 0.1149577, -0.14343217, -0.08858, 0.02656137, 0.059526477, -0.13914491, 0.12757027, 0.034920372), target1); + target1 = MulAdd(b2, MF4x4(0.15849945, 0.12067003, -0.1579611, 0.30790725, -0.041249942, 0.03948043, -0.12535375, -0.02566875, 0.3150059, 0.027081972, -0.026308673, -0.25326517, 0.016824603, -0.13551097, 0.1412756, 0.037750524), target1); + target1 = MulAdd(c2, MF4x4(0.1562541, -0.041948073, -0.14951487, 0.119380556, -0.21773878, -0.019281754, 0.08185942, 0.09982689, 0.017187534, -0.18181366, -0.09270861, 0.08527679, 0.051988564, 0.08686172, -0.12665209, -0.07205808), target1); + target1 = MulAdd(d2, MF4x4(0.08860466, -0.17931758, 0.10191625, -0.47623265, 0.1562338, -0.2960855, 0.013664795, 0.29452285, 0.1463958, 0.17562817, -0.41623253, -0.196999, -0.049113005, 0.0556021, 0.054452494, 0.14073615), target1); + target1 = MulAdd(e2, MF4x4(-0.5345973, -0.069205046, 0.37001884, 0.6955835, 0.22635284, -0.09021557, -0.04693607, -0.4458824, 0.25049326, -0.06503396, 0.07584689, 0.5394811, 0.33387923, -0.010540017, 0.038980547, -0.13371105), target1); + target1 = MulAdd(f2, MF4x4(-0.04414677, -0.22056313, 0.05580458, 0.11914465, 0.19864987, -0.1025625, -0.010050287, 0.15919746, -0.40589634, 0.4966349, -0.47632688, -0.022637444, 0.17247641, -0.51093113, 0.21157944, -0.2890017), target1); + target1 = MulAdd(g2, MF4x4(-0.034673482, -0.0075900992, -0.061077584, -0.03859898, 0.32444152, -0.14619137, -0.1375446, -0.030322462, 0.029679669, 0.079344586, -0.03862258, -0.05766807, 0.104488336, 0.006179548, -0.1168102, 0.069729604), target1); + target1 = MulAdd(h2, MF4x4(0.08504003, 0.042162962, -0.17509954, -0.06258286, -0.45796555, -0.061748773, 0.25438437, -0.02988987, -0.06897794, 0.105180845, -0.08879189, -0.120605074, -0.1478659, -0.13201937, -0.01755498, 0.020606143), target1); + target1 = MulAdd(i2, MF4x4(0.08932581, 0.1453785, -0.12802933, 0.049442187, 0.045360584, 0.16079827, -0.14142223, 0.10168491, 0.20244479, -0.17981426, 0.19759466, 0.05217847, 0.04889828, 0.06941533, -0.111836776, -0.08046399), target1); + target1 = MulAdd(na1, MF4x4(-0.011953735, 0.11362504, -0.122588776, -0.10408559, 0.051712614, -0.05161036, -0.068698496, -0.015663281, -0.06346889, 0.06561636, 0.03783044, 0.02756004, -0.036310352, -0.16962235, -0.062494226, 0.0069608325), target1); + target1 = MulAdd(nb1, MF4x4(-0.16857432, -0.17322211, 0.15971284, 0.19980437, -0.007965961, -0.015480705, 0.036090557, 0.07414387, -0.2941106, -0.24430539, 0.01070864, 0.22401866, -0.34321144, 0.09537491, -0.08020218, 0.45404655), target1); + target1 = MulAdd(nc1, MF4x4(-0.021609096, -0.11348408, -0.01450652, -0.063170746, 0.06990935, -0.035983834, -0.038010992, -0.10578655, 0.29232737, 0.048835874, 0.054028947, -0.12924139, -0.03058583, 0.028469706, 0.09563202, 0.085674495), target1); + target1 = MulAdd(nd1, MF4x4(-0.01894022, 0.037628658, -0.102314636, -0.28041583, 0.07495663, -0.058895253, 0.16422969, -0.07163792, 0.039416216, -0.13800906, -0.039811566, -0.10612402, -0.047593113, -0.28491783, 0.41632858, 0.15253194), target1); + target1 = MulAdd(ne1, MF4x4(0.26240867, -0.05335849, -0.014135048, 0.055749495, -0.020126658, 0.2952794, -0.015241771, 0.36143306, 0.43075684, 0.1921996, -0.4329065, 0.5114495, 0.7326109, -0.054901246, -0.076693356, -0.26104695), target1); + target1 = MulAdd(nf1, MF4x4(0.14548428, 0.14578429, 0.17193514, -0.07973242, 0.011952286, -0.047767498, 0.025101405, 0.0016503566, -0.26948047, -0.16503395, -0.061791085, 0.030557185, 0.15400517, -0.054951698, -0.14611247, 0.3550633), target1); + target1 = MulAdd(ng1, MF4x4(-0.05926111, -0.083442695, 0.046579204, -0.017723244, 0.12846185, 0.018434443, -0.17914511, -0.077696435, 0.060048338, 0.02956987, -0.11914462, 0.057770032, -0.054673657, -0.005353606, -0.39014184, 0.08306877), target1); + target1 = MulAdd(nh1, MF4x4(0.07357362, 0.23051825, -0.22640751, 0.080715515, -0.14467078, 0.009734264, 0.054320686, 0.24534328, -0.16038458, 0.06575425, 0.058553413, 0.17755087, 0.08184439, 0.17078212, 0.148369, -0.09309279), target1); + target1 = MulAdd(ni1, MF4x4(-0.11160211, -0.07590204, -0.01676188, -0.062253337, 0.016433533, 0.0146132, -0.040350936, 0.06749202, -0.031521842, 0.1441664, -0.09916073, 0.050578352, -0.06560962, -0.31174552, 0.056873083, -0.077912), target1); + target1 = MulAdd(na2, MF4x4(0.09344025, 0.075936995, -0.1627903, -0.04781558, -0.01878236, 0.045879602, -0.11507387, -0.025356822, -0.09113391, 0.07263937, 0.08232447, 0.08727616, -0.024921807, 0.051639438, 0.006532631, -0.018751068), target1); + target1 = MulAdd(nb2, MF4x4(0.022455849, -0.12924309, 0.26318657, -0.32464805, -0.09627585, 0.04496843, -0.09630052, -0.025761643, -0.090804085, 0.24410398, -0.03162944, -0.1961483, 0.14065808, -0.064709485, -0.0040163463, 0.05445074), target1); + target1 = MulAdd(nc2, MF4x4(-0.020935195, -0.1028065, 0.0012804621, 0.02302866, -0.00924972, -0.0041193594, 0.0060590385, -0.003394384, -0.23241943, -0.023235107, 0.08077456, 0.15720141, 0.06568382, -0.09971436, 0.09056065, 0.04271102), target1); + target1 = MulAdd(nd2, MF4x4(-0.20997737, -0.12892777, 0.4658528, 0.13622813, -0.2867294, -0.09359254, 0.18821026, 0.25550604, -0.18562363, 0.080713026, 0.13463654, 0.045504905, -0.013133853, -0.1316404, 0.08379897, -0.00047142128), target1); + target1 = MulAdd(ne2, MF4x4(0.3276134, 0.21952826, -0.80777377, -0.69810224, 0.34190908, -0.09293263, 0.33313555, -0.27255502, -0.24287084, -0.07741488, 0.06090265, -0.10161252, -0.37684909, 0.4678029, 0.13506591, 0.42470258), target1); + target1 = MulAdd(nf2, MF4x4(0.080790855, -0.09707547, -0.05506975, 0.027011644, -0.1434346, 0.01363872, 0.12616752, 0.16789167, 0.1656414, -0.11586835, 0.059612263, -0.074029386, -0.19813071, 0.46032718, -0.03935981, 0.0067143585), target1); + target1 = MulAdd(ng2, MF4x4(0.10322512, 0.0822636, -0.16766444, 0.041008063, -0.027768405, 0.23103505, 0.06737122, 0.15258405, 0.04557388, -0.18179403, 0.12489025, -0.09759324, -0.05925805, 0.04869987, 0.07329833, -0.09738542), target1); + target1 = MulAdd(nh2, MF4x4(-0.10823879, -0.403376, 0.3264802, -0.16503738, -0.057512645, -0.20902547, -0.14862378, -0.3192005, -0.046263676, 0.12744917, -0.019174274, -0.02318789, -0.085088454, -0.05723332, 0.0039772973, 0.07991316), target1); + target1 = MulAdd(ni2, MF4x4(0.10313916, 0.04410904, 0.03286652, 0.059946325, 0.019948404, 0.070217304, -0.017572487, 0.20332281, 0.06776308, 0.029285522, -0.14116238, -0.05864782, -0.18382367, -0.06568212, 0.11855615, 0.101256005), target1); - float4 target2 = mul(a1, float4x4(0.059325468, 0.10884231, 0.018158086, 0.031802185, 0.10368743, -0.06776637, 0.048326045, -0.06312353, -0.0025675546, 0.09309577, -0.025533969, 0.029684044, 0.017237723, 0.062099144, 0.047039766, 0.050348036)); - target2 += mul(b1, float4x4(-0.04767078, -0.06409279, 0.112965874, 0.04621161, -0.28172916, -0.13897015, -0.022806352, 0.26966885, 0.02019569, -0.10707113, -0.43058416, -0.14103983, -0.13225646, -0.020053176, -0.17319782, -0.009653082)); - target2 += mul(c1, float4x4(0.0031349238, -0.060933832, 0.107986666, -0.019791966, -0.23946726, -0.18045186, 0.18286318, -0.05431065, 0.11742379, -0.019123906, 0.33327517, 0.07455424, -0.035427105, 0.18659347, -0.050884776, 0.019193258)); - target2 += mul(d1, float4x4(-0.22954239, 0.011265787, -0.026520751, -0.12629737, -0.07009803, 0.44925988, -0.15938939, 0.11956771, 0.11535644, -0.1302371, 0.1235775, 0.16483483, 0.022965495, 0.110546246, 0.00064579415, -0.12753843)); - target2 += mul(e1, float4x4(0.047553673, 0.16213869, 0.7510964, 0.21228868, 0.40994287, 0.61919236, 0.3982374, -0.016163021, 0.3291035, 0.1134356, 0.12384387, -0.31114763, 0.21338554, -0.04721641, 0.122114286, 0.2717476)); - target2 += mul(f1, float4x4(-0.06529201, -0.08936482, 0.031857736, -0.02372691, 0.0416097, 0.28484538, -0.38181338, -0.05129518, 0.40150553, -0.01970737, 0.1043854, 0.11986372, -0.2267319, 0.0014845231, -0.035269983, 0.11712099)); - target2 += mul(g1, float4x4(0.079867415, -0.09982735, 0.10313241, 0.055490237, -0.42685422, -0.3431141, -0.06037366, 0.17539841, -0.010511819, -0.09743252, 0.050748866, 0.11064108, -0.09785722, -0.10230299, -0.04106169, -0.016831731)); - target2 += mul(h1, float4x4(-0.06847075, -0.026447225, -0.123430386, 0.063637204, -0.37617612, -0.09615662, -0.26226708, -0.008175561, -0.08101131, 0.11093525, -0.13149206, -0.06363292, -0.0482858, -0.2771799, 0.10528571, 0.119109035)); - target2 += mul(i1, float4x4(0.09151277, 0.029019276, 0.041349206, -0.011239478, 0.035083957, 0.05281079, -0.0742173, -0.018509442, -0.17175299, -0.4226507, -0.118186444, -0.0771296, 0.107038856, 0.0819975, 0.12445646, 0.07091557)); - target2 += mul(a2, float4x4(0.1275357, -0.097659886, -0.0114354445, 0.023900568, -0.02511702, 0.005830569, -0.010882143, -0.04046068, -0.08638482, 0.08664022, -0.15654318, 0.03333846, -0.12521335, -0.11987078, 0.028556254, -0.020760164)); - target2 += mul(b2, float4x4(-0.38474286, -0.15288061, 0.04925842, 0.050009686, 0.23555282, 0.054784663, -0.0971203, 0.017791113, -0.35539824, -0.08806168, 0.08992579, 0.22714761, -0.047685623, -0.17510797, 0.1137738, -0.069451034)); - target2 += mul(c2, float4x4(-0.16623408, -0.08202571, -0.03291826, 0.0016267949, 0.20682698, 0.08788948, 0.10241089, 0.019209227, -0.14802241, 0.091788374, -0.238735, -0.06633396, 0.02360112, 0.1521805, -0.022510838, -0.08931379)); - target2 += mul(d2, float4x4(0.034280665, -0.12431295, 0.092791, 0.15279225, -0.43373865, 0.20077267, -0.15919733, -0.27969292, -0.26948065, 0.19652127, -0.27456176, 0.04137772, 0.006545539, 0.0031402514, 0.03849979, -0.10978278)); - target2 += mul(e2, float4x4(0.62025917, -0.32462567, 0.2817292, -0.18380783, -0.3338593, -0.49056754, 0.32645953, 0.4146035, 0.3773462, 0.54346967, -0.032203436, -0.14506778, -0.30044907, 0.40134314, 0.24155408, 0.24397472)); - target2 += mul(f2, float4x4(0.089335114, -0.05529855, -0.18364899, -0.153323, -0.18347202, -0.060125064, -0.29216367, -0.2717291, 0.10592963, 0.38889876, 0.25363386, 0.33723134, -0.103703365, 0.14922962, -0.21206948, -0.20289616)); - target2 += mul(g2, float4x4(-0.035760924, 0.18820894, -0.12723185, -0.018780319, 0.124459654, 0.28909087, -0.2763883, -0.45110545, 0.098143585, 0.16052029, -0.055098705, -0.14840914, -0.0019514654, 0.07090622, -0.055036955, -0.0035953245)); - target2 += mul(h2, float4x4(-0.124669634, 0.23131305, -0.05750295, -0.056296032, 0.35691026, 0.2640789, 0.49912274, 0.26795143, -0.26460487, -0.026896512, -0.07179325, 0.17373477, -0.13186656, 0.0021319336, -0.016407885, 0.3014283)); - target2 += mul(i2, float4x4(-0.09491939, 0.11503968, -0.14077829, -0.043197304, -0.061866064, -0.1574549, 0.0054375776, 0.066160634, -0.17686372, -0.26767558, -0.038844116, 0.122724466, -0.05043839, 0.063884266, 0.0064002997, -0.13583377)); - target2 += mul(na1, float4x4(0.031301867, -0.02947819, -0.0016769855, 0.12952408, -0.025022922, 0.065425046, -0.072289295, -0.071249105, 0.14579567, -0.09058119, 0.12663712, 0.1515388, 0.44767743, 0.02971349, 0.015892735, -0.08058422)); - target2 += mul(nb1, float4x4(-0.2868111, -0.10812653, -0.29182926, -0.38444322, -0.0875354, -0.07220258, 0.05978065, 0.093328245, 0.058548283, -0.013913258, -0.20954674, -0.16400063, 0.3185215, 0.068897314, 0.15869021, 0.022877626)); - target2 += mul(nc1, float4x4(0.116845705, -0.12729645, 0.056697316, -0.21263942, -0.07000074, 0.073977455, -0.09006404, -0.029770354, -0.20823102, -0.20088868, 0.15658094, 0.24306639, -0.0453592, -0.16011035, 0.08521533, -0.032264974)); - target2 += mul(nd1, float4x4(0.1114789, -0.1083731, 0.10465276, -0.08903837, -0.06455987, 0.040030345, -0.07937248, -0.20654759, -0.26873547, -0.19390975, -0.039021965, -0.025602374, -0.5575801, -0.08876011, -0.19116728, -0.2401055)); - target2 += mul(ne1, float4x4(0.37626424, -0.0912155, -0.6153361, -0.71465075, 0.018208932, -0.14997734, 0.23627761, 0.20832567, 0.07427123, -0.37869486, -0.26574427, 0.187582, -0.37201726, 0.17809474, -0.02568795, 0.23900814)); - target2 += mul(nf1, float4x4(-0.085337594, -0.50634587, 0.30636734, -0.2760558, 0.01893911, -0.08425695, -0.023656169, 0.021421626, 0.16813251, -0.039550815, 0.21165498, -0.027628547, -0.123874225, 0.013802332, -0.2732087, -0.09419671)); - target2 += mul(ng1, float4x4(-0.07190724, -0.019237598, 0.020249542, 0.07541295, -0.03817686, 0.09266451, -0.12214172, -0.01344174, 0.03281797, 0.057655178, -0.059896503, 0.014948791, -0.13952477, 0.18810949, -0.19016883, 0.06842416)); - target2 += mul(nh1, float4x4(-0.13111524, 0.14539744, -0.10212538, -0.2169032, 0.13810973, -0.12576458, 0.124372825, 0.04992259, 0.21758182, -0.22160134, 0.24321079, 0.017698256, 0.39995426, 0.074034885, 0.120019354, -0.15522505)); - target2 += mul(ni1, float4x4(0.023914235, 0.1424257, 0.010302871, 0.15150794, -0.040021677, 0.015862139, 0.14459212, 0.08632827, 0.04257336, 0.055059638, 0.0030461506, 0.011985334, -0.049230937, 0.07851301, -0.05119983, -0.111701734)); - target2 += mul(na2, float4x4(0.04485158, 0.116597414, 0.00014909732, -0.012128512, 0.15801767, 0.18273115, -0.033926453, 0.05170487, -0.040683754, -0.18606974, 0.08324687, 0.069539666, 0.07098698, -0.014132968, 0.029499048, -0.07263477)); - target2 += mul(nb2, float4x4(0.04309544, 0.089722805, -0.018306322, 0.29061043, 0.15191254, 0.15917647, 0.0073858183, 0.039199475, 0.42514518, -0.053955313, 0.10820046, -0.09134685, -0.3087313, -0.16339037, -0.05226669, 0.044995327)); - target2 += mul(nc2, float4x4(0.008636428, 0.029086163, -0.09151674, -0.36466715, -0.0128008155, 0.018820466, -0.02700147, -0.0064047636, 0.28287655, 0.02709404, -0.05233492, -0.08967187, -0.042183813, -0.13990502, -0.005085154, -0.028511493)); - target2 += mul(nd2, float4x4(0.00022532263, -0.09108507, 0.0089569865, 0.052016005, -0.19314727, -0.355347, 0.08082937, 0.2134498, 0.21036889, -0.10165983, 0.20334485, 0.14575538, 0.017676214, -0.13149881, -0.018741794, -0.019599862)); - target2 += mul(ne2, float4x4(-0.20513605, 0.47578803, -0.18631598, 0.2535432, -0.049522053, -0.37224755, 0.11227206, -0.37000927, 0.19969453, -0.47287735, -0.07506754, -0.0957071, 0.82927394, -0.54057014, 0.5800732, 0.08937558)); - target2 += mul(nf2, float4x4(-0.022189412, 0.14622113, -0.4772564, -0.31178755, 0.10667427, -0.07335338, 0.06144331, 0.00056827103, -0.08263861, -0.009126272, -0.22802618, -0.20760304, 0.12688845, -0.061324466, 0.33361357, 0.38350767)); - target2 += mul(ng2, float4x4(0.021188622, 0.1151918, -0.10654739, -0.03341855, 0.24870358, -0.06689332, 0.11881217, -0.0045951125, -0.039464932, -0.030190004, 0.014174111, -0.025356272, 0.07469406, -0.0059695644, 0.008267219, -0.0991054)); - target2 += mul(nh2, float4x4(-0.009981438, -0.36484948, 0.04801225, 0.22368562, -0.055985868, 0.229039, -0.10823553, 0.1477355, -0.0091677625, 0.06279847, 0.034393013, 0.031901076, 0.28783056, 0.086422645, 0.20860936, 0.054018307)); - target2 += mul(ni2, float4x4(-0.08720452, -0.07756267, 0.018853918, -0.014108689, -0.019337144, 0.021249043, -0.05633926, -0.109904505, -0.088990815, 0.16876367, -0.13149975, -0.054357648, 0.08588134, -0.10262266, 0.12052009, 0.05154292)); - target2 += float4(-0.010602045, 0.053976092, 0.008913503, 0.0011945717); + MF4 target2 = { -0.010602045, 0.053976092, 0.008913503, 0.0011945717 }; + target2 = MulAdd(a1, MF4x4(0.059325468, 0.10884231, 0.018158086, 0.031802185, 0.10368743, -0.06776637, 0.048326045, -0.06312353, -0.0025675546, 0.09309577, -0.025533969, 0.029684044, 0.017237723, 0.062099144, 0.047039766, 0.050348036), target2); + target2 = MulAdd(b1, MF4x4(-0.04767078, -0.06409279, 0.112965874, 0.04621161, -0.28172916, -0.13897015, -0.022806352, 0.26966885, 0.02019569, -0.10707113, -0.43058416, -0.14103983, -0.13225646, -0.020053176, -0.17319782, -0.009653082), target2); + target2 = MulAdd(c1, MF4x4(0.0031349238, -0.060933832, 0.107986666, -0.019791966, -0.23946726, -0.18045186, 0.18286318, -0.05431065, 0.11742379, -0.019123906, 0.33327517, 0.07455424, -0.035427105, 0.18659347, -0.050884776, 0.019193258), target2); + target2 = MulAdd(d1, MF4x4(-0.22954239, 0.011265787, -0.026520751, -0.12629737, -0.07009803, 0.44925988, -0.15938939, 0.11956771, 0.11535644, -0.1302371, 0.1235775, 0.16483483, 0.022965495, 0.110546246, 0.00064579415, -0.12753843), target2); + target2 = MulAdd(e1, MF4x4(0.047553673, 0.16213869, 0.7510964, 0.21228868, 0.40994287, 0.61919236, 0.3982374, -0.016163021, 0.3291035, 0.1134356, 0.12384387, -0.31114763, 0.21338554, -0.04721641, 0.122114286, 0.2717476), target2); + target2 = MulAdd(f1, MF4x4(-0.06529201, -0.08936482, 0.031857736, -0.02372691, 0.0416097, 0.28484538, -0.38181338, -0.05129518, 0.40150553, -0.01970737, 0.1043854, 0.11986372, -0.2267319, 0.0014845231, -0.035269983, 0.11712099), target2); + target2 = MulAdd(g1, MF4x4(0.079867415, -0.09982735, 0.10313241, 0.055490237, -0.42685422, -0.3431141, -0.06037366, 0.17539841, -0.010511819, -0.09743252, 0.050748866, 0.11064108, -0.09785722, -0.10230299, -0.04106169, -0.016831731), target2); + target2 = MulAdd(h1, MF4x4(-0.06847075, -0.026447225, -0.123430386, 0.063637204, -0.37617612, -0.09615662, -0.26226708, -0.008175561, -0.08101131, 0.11093525, -0.13149206, -0.06363292, -0.0482858, -0.2771799, 0.10528571, 0.119109035), target2); + target2 = MulAdd(i1, MF4x4(0.09151277, 0.029019276, 0.041349206, -0.011239478, 0.035083957, 0.05281079, -0.0742173, -0.018509442, -0.17175299, -0.4226507, -0.118186444, -0.0771296, 0.107038856, 0.0819975, 0.12445646, 0.07091557), target2); + target2 = MulAdd(a2, MF4x4(0.1275357, -0.097659886, -0.0114354445, 0.023900568, -0.02511702, 0.005830569, -0.010882143, -0.04046068, -0.08638482, 0.08664022, -0.15654318, 0.03333846, -0.12521335, -0.11987078, 0.028556254, -0.020760164), target2); + target2 = MulAdd(b2, MF4x4(-0.38474286, -0.15288061, 0.04925842, 0.050009686, 0.23555282, 0.054784663, -0.0971203, 0.017791113, -0.35539824, -0.08806168, 0.08992579, 0.22714761, -0.047685623, -0.17510797, 0.1137738, -0.069451034), target2); + target2 = MulAdd(c2, MF4x4(-0.16623408, -0.08202571, -0.03291826, 0.0016267949, 0.20682698, 0.08788948, 0.10241089, 0.019209227, -0.14802241, 0.091788374, -0.238735, -0.06633396, 0.02360112, 0.1521805, -0.022510838, -0.08931379), target2); + target2 = MulAdd(d2, MF4x4(0.034280665, -0.12431295, 0.092791, 0.15279225, -0.43373865, 0.20077267, -0.15919733, -0.27969292, -0.26948065, 0.19652127, -0.27456176, 0.04137772, 0.006545539, 0.0031402514, 0.03849979, -0.10978278), target2); + target2 = MulAdd(e2, MF4x4(0.62025917, -0.32462567, 0.2817292, -0.18380783, -0.3338593, -0.49056754, 0.32645953, 0.4146035, 0.3773462, 0.54346967, -0.032203436, -0.14506778, -0.30044907, 0.40134314, 0.24155408, 0.24397472), target2); + target2 = MulAdd(f2, MF4x4(0.089335114, -0.05529855, -0.18364899, -0.153323, -0.18347202, -0.060125064, -0.29216367, -0.2717291, 0.10592963, 0.38889876, 0.25363386, 0.33723134, -0.103703365, 0.14922962, -0.21206948, -0.20289616), target2); + target2 = MulAdd(g2, MF4x4(-0.035760924, 0.18820894, -0.12723185, -0.018780319, 0.124459654, 0.28909087, -0.2763883, -0.45110545, 0.098143585, 0.16052029, -0.055098705, -0.14840914, -0.0019514654, 0.07090622, -0.055036955, -0.0035953245), target2); + target2 = MulAdd(h2, MF4x4(-0.124669634, 0.23131305, -0.05750295, -0.056296032, 0.35691026, 0.2640789, 0.49912274, 0.26795143, -0.26460487, -0.026896512, -0.07179325, 0.17373477, -0.13186656, 0.0021319336, -0.016407885, 0.3014283), target2); + target2 = MulAdd(i2, MF4x4(-0.09491939, 0.11503968, -0.14077829, -0.043197304, -0.061866064, -0.1574549, 0.0054375776, 0.066160634, -0.17686372, -0.26767558, -0.038844116, 0.122724466, -0.05043839, 0.063884266, 0.0064002997, -0.13583377), target2); + target2 = MulAdd(na1, MF4x4(0.031301867, -0.02947819, -0.0016769855, 0.12952408, -0.025022922, 0.065425046, -0.072289295, -0.071249105, 0.14579567, -0.09058119, 0.12663712, 0.1515388, 0.44767743, 0.02971349, 0.015892735, -0.08058422), target2); + target2 = MulAdd(nb1, MF4x4(-0.2868111, -0.10812653, -0.29182926, -0.38444322, -0.0875354, -0.07220258, 0.05978065, 0.093328245, 0.058548283, -0.013913258, -0.20954674, -0.16400063, 0.3185215, 0.068897314, 0.15869021, 0.022877626), target2); + target2 = MulAdd(nc1, MF4x4(0.116845705, -0.12729645, 0.056697316, -0.21263942, -0.07000074, 0.073977455, -0.09006404, -0.029770354, -0.20823102, -0.20088868, 0.15658094, 0.24306639, -0.0453592, -0.16011035, 0.08521533, -0.032264974), target2); + target2 = MulAdd(nd1, MF4x4(0.1114789, -0.1083731, 0.10465276, -0.08903837, -0.06455987, 0.040030345, -0.07937248, -0.20654759, -0.26873547, -0.19390975, -0.039021965, -0.025602374, -0.5575801, -0.08876011, -0.19116728, -0.2401055), target2); + target2 = MulAdd(ne1, MF4x4(0.37626424, -0.0912155, -0.6153361, -0.71465075, 0.018208932, -0.14997734, 0.23627761, 0.20832567, 0.07427123, -0.37869486, -0.26574427, 0.187582, -0.37201726, 0.17809474, -0.02568795, 0.23900814), target2); + target2 = MulAdd(nf1, MF4x4(-0.085337594, -0.50634587, 0.30636734, -0.2760558, 0.01893911, -0.08425695, -0.023656169, 0.021421626, 0.16813251, -0.039550815, 0.21165498, -0.027628547, -0.123874225, 0.013802332, -0.2732087, -0.09419671), target2); + target2 = MulAdd(ng1, MF4x4(-0.07190724, -0.019237598, 0.020249542, 0.07541295, -0.03817686, 0.09266451, -0.12214172, -0.01344174, 0.03281797, 0.057655178, -0.059896503, 0.014948791, -0.13952477, 0.18810949, -0.19016883, 0.06842416), target2); + target2 = MulAdd(nh1, MF4x4(-0.13111524, 0.14539744, -0.10212538, -0.2169032, 0.13810973, -0.12576458, 0.124372825, 0.04992259, 0.21758182, -0.22160134, 0.24321079, 0.017698256, 0.39995426, 0.074034885, 0.120019354, -0.15522505), target2); + target2 = MulAdd(ni1, MF4x4(0.023914235, 0.1424257, 0.010302871, 0.15150794, -0.040021677, 0.015862139, 0.14459212, 0.08632827, 0.04257336, 0.055059638, 0.0030461506, 0.011985334, -0.049230937, 0.07851301, -0.05119983, -0.111701734), target2); + target2 = MulAdd(na2, MF4x4(0.04485158, 0.116597414, 0.00014909732, -0.012128512, 0.15801767, 0.18273115, -0.033926453, 0.05170487, -0.040683754, -0.18606974, 0.08324687, 0.069539666, 0.07098698, -0.014132968, 0.029499048, -0.07263477), target2); + target2 = MulAdd(nb2, MF4x4(0.04309544, 0.089722805, -0.018306322, 0.29061043, 0.15191254, 0.15917647, 0.0073858183, 0.039199475, 0.42514518, -0.053955313, 0.10820046, -0.09134685, -0.3087313, -0.16339037, -0.05226669, 0.044995327), target2); + target2 = MulAdd(nc2, MF4x4(0.008636428, 0.029086163, -0.09151674, -0.36466715, -0.0128008155, 0.018820466, -0.02700147, -0.0064047636, 0.28287655, 0.02709404, -0.05233492, -0.08967187, -0.042183813, -0.13990502, -0.005085154, -0.028511493), target2); + target2 = MulAdd(nd2, MF4x4(0.00022532263, -0.09108507, 0.0089569865, 0.052016005, -0.19314727, -0.355347, 0.08082937, 0.2134498, 0.21036889, -0.10165983, 0.20334485, 0.14575538, 0.017676214, -0.13149881, -0.018741794, -0.019599862), target2); + target2 = MulAdd(ne2, MF4x4(-0.20513605, 0.47578803, -0.18631598, 0.2535432, -0.049522053, -0.37224755, 0.11227206, -0.37000927, 0.19969453, -0.47287735, -0.07506754, -0.0957071, 0.82927394, -0.54057014, 0.5800732, 0.08937558), target2); + target2 = MulAdd(nf2, MF4x4(-0.022189412, 0.14622113, -0.4772564, -0.31178755, 0.10667427, -0.07335338, 0.06144331, 0.00056827103, -0.08263861, -0.009126272, -0.22802618, -0.20760304, 0.12688845, -0.061324466, 0.33361357, 0.38350767), target2); + target2 = MulAdd(ng2, MF4x4(0.021188622, 0.1151918, -0.10654739, -0.03341855, 0.24870358, -0.06689332, 0.11881217, -0.0045951125, -0.039464932, -0.030190004, 0.014174111, -0.025356272, 0.07469406, -0.0059695644, 0.008267219, -0.0991054), target2); + target2 = MulAdd(nh2, MF4x4(-0.009981438, -0.36484948, 0.04801225, 0.22368562, -0.055985868, 0.229039, -0.10823553, 0.1477355, -0.0091677625, 0.06279847, 0.034393013, 0.031901076, 0.28783056, 0.086422645, 0.20860936, 0.054018307), target2); + target2 = MulAdd(ni2, MF4x4(-0.08720452, -0.07756267, 0.018853918, -0.014108689, -0.019337144, 0.021249043, -0.05633926, -0.109904505, -0.088990815, 0.16876367, -0.13149975, -0.054357648, 0.08588134, -0.10262266, 0.12052009, 0.05154292), target2); conv2d_6_tf[gxy] = target1; conv2d_6_tf1[gxy] = target2; @@ -1162,35 +1165,35 @@ void Pass8(uint2 blockStart, uint3 threadId) { float2 inputPt = GetInputPt(); float2 pos = ((gxy >> 1) + 0.5f) * inputPt; - float4 g0 = conv2d_tf.SampleLevel(sam, pos, 0); - float4 g1 = conv2d_tf1.SampleLevel(sam, pos, 0); - float4 g2 = conv2d_1_tf.SampleLevel(sam, pos, 0); - float4 g3 = conv2d_1_tf1.SampleLevel(sam, pos, 0); - float4 g4 = conv2d_2_tf.SampleLevel(sam, pos, 0); - float4 g5 = conv2d_2_tf1.SampleLevel(sam, pos, 0); - float4 g6 = conv2d_3_tf.SampleLevel(sam, pos, 0); - float4 g7 = conv2d_3_tf1.SampleLevel(sam, pos, 0); - float4 g8 = conv2d_4_tf.SampleLevel(sam, pos, 0); - float4 g9 = conv2d_4_tf1.SampleLevel(sam, pos, 0); - float4 g10 = conv2d_5_tf.SampleLevel(sam, pos, 0); - float4 g11 = conv2d_5_tf1.SampleLevel(sam, pos, 0); - float4 g12 = conv2d_6_tf.SampleLevel(sam, pos, 0); - float4 g13 = conv2d_6_tf1.SampleLevel(sam, pos, 0); - - float4 ng0 = max(-g0, 0); - float4 ng1 = max(-g1, 0); - float4 ng2 = max(-g2, 0); - float4 ng3 = max(-g3, 0); - float4 ng4 = max(-g4, 0); - float4 ng5 = max(-g5, 0); - float4 ng6 = max(-g6, 0); - float4 ng7 = max(-g7, 0); - float4 ng8 = max(-g8, 0); - float4 ng9 = max(-g9, 0); - float4 ng10 = max(-g10, 0); - float4 ng11 = max(-g11, 0); - float4 ng12 = max(-g12, 0); - float4 ng13 = max(-g13, 0); + MF4 g0 = conv2d_tf.SampleLevel(sam, pos, 0); + MF4 g1 = conv2d_tf1.SampleLevel(sam, pos, 0); + MF4 g2 = conv2d_1_tf.SampleLevel(sam, pos, 0); + MF4 g3 = conv2d_1_tf1.SampleLevel(sam, pos, 0); + MF4 g4 = conv2d_2_tf.SampleLevel(sam, pos, 0); + MF4 g5 = conv2d_2_tf1.SampleLevel(sam, pos, 0); + MF4 g6 = conv2d_3_tf.SampleLevel(sam, pos, 0); + MF4 g7 = conv2d_3_tf1.SampleLevel(sam, pos, 0); + MF4 g8 = conv2d_4_tf.SampleLevel(sam, pos, 0); + MF4 g9 = conv2d_4_tf1.SampleLevel(sam, pos, 0); + MF4 g10 = conv2d_5_tf.SampleLevel(sam, pos, 0); + MF4 g11 = conv2d_5_tf1.SampleLevel(sam, pos, 0); + MF4 g12 = conv2d_6_tf.SampleLevel(sam, pos, 0); + MF4 g13 = conv2d_6_tf1.SampleLevel(sam, pos, 0); + + MF4 ng0 = max(-g0, 0); + MF4 ng1 = max(-g1, 0); + MF4 ng2 = max(-g2, 0); + MF4 ng3 = max(-g3, 0); + MF4 ng4 = max(-g4, 0); + MF4 ng5 = max(-g5, 0); + MF4 ng6 = max(-g6, 0); + MF4 ng7 = max(-g7, 0); + MF4 ng8 = max(-g8, 0); + MF4 ng9 = max(-g9, 0); + MF4 ng10 = max(-g10, 0); + MF4 ng11 = max(-g11, 0); + MF4 ng12 = max(-g12, 0); + MF4 ng13 = max(-g13, 0); g0 = max(g0, 0); g1 = max(g1, 0); @@ -1207,110 +1210,110 @@ void Pass8(uint2 blockStart, uint3 threadId) { g12 = max(g12, 0); g13 = max(g13, 0); - float4 target1 = mul(g0, float4x4(-0.105475314, -0.07022547, -0.16326137, -0.12503424, -0.004623021, -0.0143323885, 0.042996034, 0.03422294, -0.38310882, -0.4431925, -0.28772846, -0.3213578, -0.018014904, 0.02429277, -0.07177951, -0.04458822)); - target1 += mul(g1, float4x4(-0.0973233, -0.032439478, -0.08420249, -0.054693196, 0.012960555, 0.06929602, 0.004247494, 0.061315402, -0.09607745, -0.16862066, 0.01537482, -0.038459156, 0.019662246, 0.059920583, -0.1071646, -0.06478967)); - target1 += mul(ng0, float4x4(0.15711947, 0.0754732, 0.17891979, 0.098270796, 0.14122486, 0.14893766, 0.12408279, 0.14845194, 0.16199848, 0.14090912, 0.13496809, 0.1119815, 0.03974558, -0.057513904, 0.09213575, -0.0012252429)); - target1 += mul(ng1, float4x4(-0.011343602, -0.02488338, 0.07799659, 0.06503721, 0.06380687, 0.048929837, -0.05555838, -0.050519127, 0.14673206, 0.18085165, 0.07261422, 0.09738158, 0.07395791, 0.005573146, -0.05454926, -0.13565786)); - target1 += mul(g2, float4x4(-0.08591514, -0.05664865, 0.23980616, 0.24876402, 0.19052829, 0.011938714, 0.21487322, 0.058656186, 0.036630988, 0.14918756, 0.013127693, 0.13092093, -0.37889576, -0.4068804, -0.27258882, -0.30605716)); - target1 += mul(g3, float4x4(-0.25149816, -0.21979512, -0.24949454, -0.20483162, -0.10972783, -0.17315808, -0.08562763, -0.16086778, 0.044681527, 0.050807394, -0.019424994, -0.022418005, 0.10039492, -0.013666552, -0.22373566, -0.34493732)); - target1 += mul(ng2, float4x4(0.1419155, 0.081392206, -0.18103191, -0.2122926, -0.1445937, -0.015969204, -0.12368782, -0.0044421684, -0.09534078, -0.14815839, -0.1052107, -0.16341865, 0.3050403, 0.34488317, 0.16171226, 0.18700944)); - target1 += mul(ng3, float4x4(0.12444696, 0.08712589, 0.06266247, 0.031022022, 0.17707655, 0.24904409, 0.20961654, 0.2610619, -0.099262595, -0.06900819, -0.034567446, -0.020191457, -0.1468561, -0.04683958, 0.14910224, 0.244686)); - target1 += mul(g4, float4x4(-0.002428158, -0.012889509, 0.0006541127, -0.0058380975, 0.096147396, 0.07791617, 0.119144954, 0.11699654, -0.024602454, -0.07894611, -0.00021709128, -0.03979557, 0.0028512406, -0.015790012, 0.0082511455, 0.029357092)); - target1 += mul(g5, float4x4(-0.01410329, -0.004162405, -0.09005045, -0.07753674, 0.004509965, -0.024188736, 0.13799691, 0.10589621, -0.023018798, 0.0064198375, -0.103344224, -0.07463909, -0.060048997, -0.071094714, -0.13042289, -0.14482167)); - target1 += mul(ng4, float4x4(-0.009015246, 0.01581748, -0.035448726, -0.012348933, -0.101627484, -0.05530413, -0.14063041, -0.121775225, 0.074719116, 0.033839386, 0.045573987, -0.006698053, 0.0015141299, 0.003634417, 0.017102007, 0.0074890694)); - target1 += mul(ng5, float4x4(0.0042357175, 0.018735386, 0.058959343, 0.057424515, -0.021633089, -0.037194982, -0.14109972, -0.1506368, 0.004357002, -0.006871023, 0.05337361, 0.039684236, 0.087463334, 0.07772685, 0.12278512, 0.1224218)); - target1 += mul(g6, float4x4(0.018359886, 0.046934873, -0.008225237, 0.020650858, -0.03961538, -0.014779162, -0.04161338, -0.00953579, 0.0017313146, 0.0068857935, -0.0024282748, 0.0047545764, 0.02635904, 0.027336216, 0.02701322, 0.029939381)); - target1 += mul(g7, float4x4(-0.00067966996, 0.024480496, -0.015218739, -0.010472019, -0.03994461, -0.052318517, -0.04450191, -0.043226667, -0.03166469, -0.03799331, 0.015428865, -0.018422252, 0.00040845043, 0.03558268, -0.0099401595, -0.00054432114)); - target1 += mul(ng6, float4x4(-0.0032104475, 0.019604867, -0.02486679, 0.002134673, 0.014368818, -0.0013395248, 0.017318068, 0.0021403218, -0.02198377, 0.010297547, -0.041619625, -0.02740482, -0.067249276, -0.03040953, -0.021304253, -0.009557115)); - target1 += mul(ng7, float4x4(-0.019099236, -0.037010793, 0.013720462, 0.023708181, 0.016356282, -0.00028589502, -0.010570909, -0.009186907, 0.03493662, 0.055599142, -0.017043956, 0.004204044, -0.013573257, -0.013537684, 0.008151195, 0.0074913655)); - target1 += mul(g8, float4x4(0.009309031, -0.0014795153, 0.025114728, -0.0066442797, -0.012085473, -0.0030560147, 0.002144206, 0.0009732741, 0.022301642, -0.0091133695, 0.0011837826, -0.020275833, -0.021349607, -0.011693419, -0.018912962, -0.022418445)); - target1 += mul(g9, float4x4(-0.0045772395, 0.031085191, 0.01215795, 0.023887333, 0.023408212, 0.0005998807, 0.011254428, -0.004634461, 0.016601006, 0.046663348, 0.031117432, 0.04910873, -0.113230005, -0.035702843, -0.058746565, -0.053893737)); - target1 += mul(ng8, float4x4(-0.020218112, 0.056803435, -0.0037077996, 0.05123925, -0.016713811, -0.05551032, -0.005916611, -0.037839632, -0.007671626, -0.009099201, -0.0010055836, 0.003332688, 0.020744357, 0.01957675, 0.057906736, 0.041446246)); - target1 += mul(ng9, float4x4(0.022438819, 0.04616756, 0.035925094, 0.0639705, 0.0009332198, 0.020964272, -0.010805394, 0.031757344, 0.051255573, 0.032838948, 0.00055445684, -0.03195623, 0.04753827, 0.016436901, 0.04788274, 0.022093765)); - target1 += mul(g10, float4x4(0.03479086, 0.035946105, 0.04343359, 0.04015664, 0.06081792, 0.061758887, 0.10128842, 0.007471392, -0.027261607, -0.01290544, -0.029938918, -0.050834358, -0.015550162, 0.0072828676, -0.04580556, -0.029642029)); - target1 += mul(g11, float4x4(0.011150116, 0.029789668, -0.00354488, 0.045047592, -0.018265083, -0.020843878, 0.015457328, 0.0053232997, 0.0791804, -0.028661052, 0.079342775, -0.039631505, 0.14613943, 0.08323415, 0.049641483, 0.047863442)); - target1 += mul(ng10, float4x4(-0.103034586, -0.107580125, 0.00044325445, 0.007830247, -0.017059505, 0.010152936, -0.02845979, -0.01841766, -0.10722863, -0.025262646, -0.07402096, -0.025055556, 0.0013303137, 0.12574737, -0.0161103, 0.06077798)); - target1 += mul(ng11, float4x4(-0.0420636, -0.062703885, -0.06476972, -0.10516001, 0.018120673, 0.024305122, -0.013997766, 0.015815413, -0.06317691, -0.03968166, -0.054052643, -0.016300509, -0.08255892, -0.01612941, -0.04194852, -0.012637189)); - target1 += mul(g12, float4x4(0.042659573, -0.10762496, -0.077143244, 0.12583935, -0.022020226, -0.0042312425, -0.016734738, 0.027007964, -0.06609771, -0.056038737, -0.0058528963, 0.035508137, -0.019722374, -0.055094264, 0.010977759, -0.009833099)); - target1 += mul(g13, float4x4(0.063830875, -0.019885639, 0.055574782, 0.039456647, 0.01576898, -0.1389799, 0.063411795, -0.11600623, -0.013968303, -0.03318867, -0.06806915, -0.09373464, -0.022723546, -0.03329239, 0.014282872, 0.027576538)); - target1 += mul(ng12, float4x4(-0.018100513, 0.06204485, 0.010761461, -0.045085587, 0.009286288, 0.02310671, 0.10633246, -0.090849996, 0.13112675, -0.01639808, 0.0022725316, -0.076779045, 0.11831251, 0.1460306, -0.10849466, -0.07749171)); - target1 += mul(ng13, float4x4(-0.15850247, 0.118011266, -0.10121594, -0.007109052, 0.071873754, 0.06954878, 0.0377852, 0.044174008, -0.062925555, -0.01758927, 0.1416964, 0.17206357, -0.035632525, -0.04652215, 0.061932907, 0.034339)); - target1 += float4(-0.11952045, -0.10779418, -0.0626279, -0.042614873); - - float4 target2 = mul(g0, float4x4(-0.009000901, -0.018048609, 0.013095594, 0.002321373, 0.0004716619, 0.00504148, -0.016826658, -0.014922383, 0.15059204, 0.16593806, 0.115392484, 0.12520894, 0.05049829, 0.060210057, 0.086421266, 0.07242362)); - target2 += mul(g1, float4x4(0.06268658, 0.030466434, 0.07876877, 0.04129863, 0.04142328, 0.009963961, 0.051785357, 0.012811113, 0.1295883, 0.139931, 0.07733839, 0.08014211, 0.07156476, 0.0342396, 0.051614303, 0.043559864)); - target2 += mul(ng0, float4x4(0.00041542648, 0.016051646, -0.011512418, 0.013076814, 0.03734479, 0.02791584, 0.012426691, 0.022044811, -0.034128398, -0.027107332, -0.021998279, -0.012139807, -0.033177473, -0.016310865, -0.078221664, -0.041203145)); - target2 += mul(ng1, float4x4(-0.008398536, -0.010332053, -0.050231732, -0.039691273, -0.042082537, -0.030281143, -0.014039778, -0.0020190612, -0.11956351, -0.13638765, -0.09794402, -0.10228069, -0.08344795, -0.07944541, -0.004189214, -0.028206991)); - target2 += mul(g2, float4x4(0.0002908945, -0.00831185, -0.06870294, -0.083311856, -0.024992501, 0.0038247898, -0.049389005, -0.020098582, -0.0135326125, -0.040408995, -0.012083491, -0.042174604, 0.16112538, 0.13720983, 0.13937058, 0.10870099)); - target2 += mul(g3, float4x4(0.078961425, 0.082619205, 0.06910667, 0.06579004, -0.0077012256, -0.00038692637, 0.00015553503, -0.012561662, 0.00053048285, -0.01461681, 0.02600344, 0.024862211, -0.06958201, -0.048246548, 0.058762506, 0.036662634)); - target2 += mul(ng2, float4x4(-0.023527982, -0.0028001352, 0.047800142, 0.09616409, 0.049143843, 0.030836122, 0.057244994, 0.025672587, 0.027565151, 0.039868724, 0.045296676, 0.04623187, -0.124759234, -0.14106254, -0.06337279, -0.076839216)); - target2 += mul(ng3, float4x4(-0.0911771, -0.064436875, -0.05308137, -0.022082496, -0.0040269364, 0.0014464161, -0.0029555515, 0.016098293, -0.026650434, -0.014081368, -0.06747348, -0.05481826, 0.097423114, 0.08620988, -0.01607732, -0.015440677)); - target2 += mul(g4, float4x4(-0.014001735, -0.015001655, -0.013250577, -0.009930805, 0.04885879, 0.07092224, 0.025783395, 0.03792237, -0.04332465, -0.06244993, -0.046748653, -0.07132349, -0.0053951666, -0.016514057, 0.023807624, 0.044013456)); - target2 += mul(g5, float4x4(-0.009097996, -0.016898679, -0.05043909, -0.063178614, -0.016210863, -0.02157998, -0.02654472, -0.042961173, 0.012103852, 0.019015301, 0.02492281, 0.03389976, 0.015276502, 0.009577683, 0.04132527, -0.00070621347)); - target2 += mul(ng4, float4x4(-0.0057500796, 0.00728164, -0.003422421, 0.0038979584, -0.03127353, -0.019125199, -0.012988815, -0.031890683, 0.09352588, 0.019210607, 0.09824038, 0.016637104, 0.010692808, 0.022393884, 0.008312123, 0.014120716)); - target2 += mul(ng5, float4x4(0.013895599, 0.023097904, 0.009370535, 0.014099512, 0.0124661345, -0.015076684, 0.03287286, 0.005912471, -0.03944815, -0.020340785, -0.06822037, -0.059383288, 0.03634978, 0.007832939, -0.007142306, -0.0061968984)); - target2 += mul(g6, float4x4(0.033002097, 0.0516016, -0.021056438, 0.005715988, -0.02223013, -0.007962324, -0.024417123, -0.0014790733, 0.002167189, 0.00043749413, -0.007284963, -0.0027283782, 0.026238248, 0.01756047, 0.008969755, 0.014201024)); - target2 += mul(g7, float4x4(0.011576685, 0.02087598, 0.0026766327, -0.0041780816, -0.05277701, -0.05412841, -0.05958835, -0.050426245, -0.00662945, -0.021645393, 0.03423904, -0.0064581474, -0.030403355, 0.018391011, -0.026089542, -0.0051510665)); - target2 += mul(ng6, float4x4(-0.046202097, -0.0066081425, -0.03698851, 0.0034165455, -0.011859245, -0.020945566, -0.0028196946, -0.010053285, -0.011400397, 0.030595876, -0.018915813, 0.006780077, -0.060040582, -0.009586898, -0.004477886, 0.011279908)); - target2 += mul(ng7, float4x4(-0.028692413, -0.032535568, 0.0017473884, 0.02207169, 0.0192618, 0.008956797, -0.0033381556, 0.006326402, 0.0169569, 0.041449737, -0.02611751, 0.0006410355, 0.006233776, 0.0008467914, 0.011884985, 0.009222136)); - target2 += mul(g8, float4x4(0.017076496, -0.0045380928, 0.03444613, -0.009804047, -0.004829834, -0.004889702, 0.0057807956, 0.0015014127, 0.03458368, -0.0035773432, -0.007769679, -0.032449644, -0.021396799, -0.017612215, -0.012764735, -0.025224172)); - target2 += mul(g9, float4x4(-0.011824532, 0.02335273, 0.00764845, 0.019215155, 0.022186808, 0.0066053392, 0.0071694753, -0.0036117272, 0.032144524, 0.05025988, 0.03982363, 0.052400436, -0.10555114, -0.03809396, -0.05334183, -0.05524487)); - target2 += mul(ng8, float4x4(-0.024599254, 0.058805298, 0.00069874676, 0.06263439, -0.018460508, -0.053566024, -0.0022889362, -0.035818785, -0.0135854995, -0.015712813, 0.0012080368, 0.005957637, 0.009450094, 0.03186346, 0.059969924, 0.057706963)); - target2 += mul(ng9, float4x4(0.026783831, 0.05475865, 0.027565574, 0.06032707, -0.0015639095, 0.024381682, -0.010199071, 0.037544634, 0.039889377, 0.03318851, -0.016529158, -0.0343188, 0.045666486, 0.021665907, 0.042189375, 0.02444145)); - target2 += mul(g10, float4x4(0.03791853, 0.043746054, 0.056224477, 0.05098111, 0.075256795, 0.074653305, 0.116220035, 0.01853866, -0.04133627, -0.009134169, -0.0420953, -0.05210053, -0.021748418, 0.004422131, -0.05422814, -0.035721727)); - target2 += mul(g11, float4x4(0.013814317, 0.03149986, -0.004971173, 0.04782029, -0.01693027, -0.017984565, 0.019328078, 0.008521426, 0.0845641, -0.027555496, 0.08150416, -0.04623306, 0.16494128, 0.09300831, 0.074097835, 0.0627848)); - target2 += mul(ng10, float4x4(-0.10307174, -0.112654425, -0.005589254, -0.0062108496, -0.012491583, 0.011512013, -0.03142282, -0.023683488, -0.099848576, -0.031290524, -0.07236223, -0.037460987, 0.008760208, 0.1473594, -0.009216949, 0.07251379)); - target2 += mul(ng11, float4x4(-0.04915367, -0.07121096, -0.06572174, -0.10967046, 0.019548079, 0.023992533, -0.019842865, 0.012366459, -0.07207817, -0.04237792, -0.054463565, -0.015374731, -0.092071235, -0.020860313, -0.054475963, -0.02303954)); - target2 += mul(g12, float4x4(0.04160816, -0.118427366, -0.08661791, 0.12787233, -0.01990174, 0.0012960634, -0.016121056, 0.031429946, -0.06830865, -0.057132352, -0.0022302791, 0.03845933, -0.026981276, -0.063532256, 0.011805961, -0.009616678)); - target2 += mul(g13, float4x4(0.07094465, -0.022284096, 0.060676746, 0.042626668, 0.011207256, -0.14960343, 0.05866539, -0.12742221, -0.021092903, -0.039463162, -0.07879986, -0.10232898, -0.026127055, -0.038111385, 0.019167708, 0.032637425)); - target2 += mul(ng12, float4x4(-0.014270794, 0.07157703, 0.013714203, -0.047801998, 0.0060221693, 0.022788104, 0.10630103, -0.09606649, 0.12690987, -0.017079826, -0.0077072172, -0.082584485, 0.13256006, 0.16012523, -0.10966099, -0.07927409)); - target2 += mul(ng13, float4x4(-0.17171615, 0.12114435, -0.10746857, -0.0074188868, 0.07854815, 0.07759372, 0.04310874, 0.051465522, -0.05963588, -0.014506484, 0.15522978, 0.18746643, -0.03845241, -0.0489534, 0.05837339, 0.032978524)); - target2 += float4(0.05825913, 0.051491056, 0.038389463, 0.03321517); - - float4 target3 = mul(g0, float4x4(0.2006987, 0.17832398, 0.26342955, 0.23500517, -0.012297829, -0.009008417, -0.039950736, -0.039973143, 0.34800097, 0.32196492, 0.30505183, 0.29214156, -0.21410535, -0.21166423, -0.16437815, -0.19172792)); - target3 += mul(g1, float4x4(-0.008804151, -0.07085123, 0.013577994, -0.05192605, -0.08981402, -0.14702585, -0.09145975, -0.14835288, -0.15882517, -0.14785844, -0.2381482, -0.22867912, 0.010898514, 0.031957507, 0.040597558, 0.078252345)); - target3 += mul(ng0, float4x4(-0.21658613, -0.1803885, -0.25954962, -0.20839214, -0.09597461, -0.09222647, -0.03909875, -0.03456191, -0.19723509, -0.16976605, -0.2041716, -0.1751425, 0.22901416, 0.24922715, 0.1800083, 0.23346905)); - target3 += mul(ng1, float4x4(0.110020064, 0.103858806, 0.052446555, 0.061105963, 0.032901537, 0.07140097, 0.11518793, 0.13860466, 0.13930707, 0.12712196, 0.19071707, 0.18399614, -0.08036458, -0.07349171, 0.021504594, 0.0024937368)); - target3 += mul(g2, float4x4(0.059065036, 0.00698257, -0.099622436, -0.15676253, -0.10942482, -0.04869624, -0.13654554, -0.07341863, -0.014169851, -0.06390744, 0.016093008, -0.04540248, 0.29041344, 0.24451919, 0.26292154, 0.22136512)); - target3 += mul(g3, float4x4(0.107946776, 0.097849295, 0.10266876, 0.09360328, 0.08931344, 0.08896482, 0.046495322, 0.044040844, -0.020361643, -0.030911373, 0.026598722, 0.019815676, -0.072677925, -0.042410247, 0.14127749, 0.13434973)); - target3 += mul(ng2, float4x4(-0.08809133, -0.03476601, 0.06420393, 0.14691353, 0.09296839, 0.06162562, 0.10992992, 0.0615685, 0.0168736, 0.06520281, 0.020010693, 0.046929173, -0.2219495, -0.21249783, -0.14622301, -0.14599061)); - target3 += mul(ng3, float4x4(-0.13251069, -0.08977477, -0.08930347, -0.045490693, -0.10980218, -0.09510885, -0.07299872, -0.064053826, 0.011365247, 0.014091111, -0.054976214, -0.056936122, 0.10148144, 0.07451642, -0.08138598, -0.10161657)); - target3 += mul(g4, float4x4(-0.0075518745, -0.005738622, -0.007577811, -0.00032088626, 0.032614008, 0.04858922, 0.00054855715, 0.011565026, -0.022675224, -0.034442738, -0.03580643, -0.05069376, -0.0020376542, -0.01505518, 0.019388825, 0.03746554)); - target3 += mul(g5, float4x4(-0.011413172, -0.016877454, -0.048923567, -0.055012885, -0.007709447, -0.016109072, -0.047132388, -0.07146396, 0.002604099, 0.00014681708, 0.03429465, 0.043265607, 0.029014807, 0.03337814, 0.07582056, 0.041660666)); - target3 += mul(ng4, float4x4(-0.020768544, -0.014378527, -0.01999972, -0.01385916, -0.012264676, -0.009959511, 0.0119015165, -0.016787319, 0.07266734, -0.0029914333, 0.08549183, 0.004367342, 0.008236065, 0.020370748, 0.0043428927, 0.007301017)); - target3 += mul(ng5, float4x4(0.011654731, 0.025318999, -0.0029306612, 0.007426217, -0.00010868774, -0.020845588, 0.041991003, 0.024147986, -0.030741083, -0.012328637, -0.06617428, -0.06103115, 0.010491518, -0.013338451, -0.04666634, -0.046481613)); - target3 += mul(g6, float4x4(0.0268538, 0.043785956, -0.01799385, 0.008743307, -0.013197458, -0.015049436, -0.017189734, -0.0047999253, -0.00059730676, -0.0008936153, -0.016006093, -0.0073406673, 0.014875853, 0.011491735, 9.819833e-05, 0.0073417514)); - target3 += mul(g7, float4x4(0.019930955, 0.027112626, 0.01307941, 0.005268897, -0.060213763, -0.050415818, -0.069006495, -0.051405095, 0.0036414433, -0.008606397, 0.037427194, -0.0018103109, -0.037434716, 0.010187546, -0.026227329, -0.0033639795)); - target3 += mul(ng6, float4x4(-0.03634798, 0.0007093891, -0.026819145, 0.009025687, -0.01750318, -0.020098133, -0.0063864207, -0.006606755, 0.0008565766, 0.028647956, -0.0024974607, 0.015250743, -0.048884176, -0.004310685, -0.0010757383, 0.00974984)); - target3 += mul(ng7, float4x4(-0.031253602, -0.031743724, -0.009083253, 0.0145388115, 0.02048611, 0.0058071036, -0.0038228377, 0.00049654936, 0.0059105973, 0.03437731, -0.025785418, 0.004187733, 0.009980489, -4.08268e-05, 0.009384428, 0.0019492983)); - target3 += mul(g8, float4x4(0.012587245, -0.0032654977, 0.029739188, -0.009440694, -0.0018237908, -0.0080032, 0.010499013, 0.0012466761, 0.03461923, -0.0060207327, -0.008771263, -0.034545746, -0.015023473, -0.008901684, -0.011490296, -0.01976464)); - target3 += mul(g9, float4x4(-0.009444331, 0.020809013, 0.009985801, 0.020350901, 0.013234775, 0.004382635, -0.0007761826, -0.005247294, 0.034115106, 0.05190378, 0.039124765, 0.050993033, -0.0898732, -0.030428126, -0.044204578, -0.052484997)); - target3 += mul(ng8, float4x4(-0.020434443, 0.053520404, 0.0007571144, 0.05895061, -0.018991265, -0.043982152, -0.004035192, -0.025452444, -0.012197152, -0.013770753, 0.0012919102, 0.003996682, 0.0056104586, 0.025686186, 0.05128293, 0.05105745)); - target3 += mul(ng9, float4x4(0.030201769, 0.052521482, 0.029641917, 0.05559941, 0.0018870027, 0.020112835, -0.0043867202, 0.035877172, 0.02961142, 0.02163634, -0.027972858, -0.040669747, 0.03393723, 0.013455979, 0.03313782, 0.01968004)); - target3 += mul(g10, float4x4(0.034817442, 0.04045217, 0.054816365, 0.05092461, 0.06600807, 0.062576495, 0.09923777, 0.006663677, -0.039958935, -0.010009866, -0.041522443, -0.04959681, -0.020962957, 0.003845031, -0.04910384, -0.03233655)); - target3 += mul(g11, float4x4(0.015433112, 0.028965838, -0.0055138534, 0.042267464, -0.012690953, -0.009424165, 0.017896382, 0.01186686, 0.07231686, -0.038834292, 0.07033086, -0.052548733, 0.15721905, 0.09334892, 0.07676042, 0.06701375)); - target3 += mul(ng10, float4x4(-0.09797534, -0.11201098, -0.0037222446, -0.008105951, -0.01152357, 0.012165641, -0.029051905, -0.021293389, -0.09600697, -0.028819272, -0.069084235, -0.035421908, 0.0054322914, 0.14168966, -0.0200274, 0.06505187)); - target3 += mul(ng11, float4x4(-0.05034882, -0.06622497, -0.062471002, -0.100628324, 0.018115615, 0.019867867, -0.018836644, 0.007562053, -0.06317378, -0.034458403, -0.047243826, -0.009989589, -0.08270121, -0.018645251, -0.05160367, -0.023690399)); - target3 += mul(g12, float4x4(0.03897899, -0.10862529, -0.081805214, 0.1202324, -0.021866674, -0.00041882638, -0.018235246, 0.027227063, -0.0656312, -0.053432237, -0.0029235696, 0.03549672, -0.022848906, -0.057047505, 0.013400545, -0.0072439364)); - target3 += mul(g13, float4x4(0.06879516, -0.018637763, 0.058062725, 0.041045032, 0.011702424, -0.13693465, 0.05674195, -0.11679955, -0.022940686, -0.03856922, -0.07531371, -0.09692582, -0.019870926, -0.032572743, 0.026138868, 0.037639033)); - target3 += mul(ng12, float4x4(-0.015270301, 0.06478719, 0.011016518, -0.04533957, 0.00688319, 0.024815995, 0.10159924, -0.08467507, 0.11939162, -0.01939453, -0.0058689644, -0.077881604, 0.118726775, 0.14489114, -0.10831982, -0.07972515)); - target3 += mul(ng13, float4x4(-0.16734359, 0.10685446, -0.102714166, -0.010225307, 0.07306756, 0.07014447, 0.040464073, 0.04688462, -0.05489714, -0.01525318, 0.14690581, 0.17514132, -0.03250648, -0.03688211, 0.05047889, 0.03078089)); - target3 += float4(0.06614842, 0.045779686, 0.032838725, 0.017085627); + MF4 target1 = { -0.11952045, -0.10779418, -0.0626279, -0.042614873 }; + target1 = MulAdd(g0, MF4x4(-0.105475314, -0.07022547, -0.16326137, -0.12503424, -0.004623021, -0.0143323885, 0.042996034, 0.03422294, -0.38310882, -0.4431925, -0.28772846, -0.3213578, -0.018014904, 0.02429277, -0.07177951, -0.04458822), target1); + target1 = MulAdd(g1, MF4x4(-0.0973233, -0.032439478, -0.08420249, -0.054693196, 0.012960555, 0.06929602, 0.004247494, 0.061315402, -0.09607745, -0.16862066, 0.01537482, -0.038459156, 0.019662246, 0.059920583, -0.1071646, -0.06478967), target1); + target1 = MulAdd(ng0, MF4x4(0.15711947, 0.0754732, 0.17891979, 0.098270796, 0.14122486, 0.14893766, 0.12408279, 0.14845194, 0.16199848, 0.14090912, 0.13496809, 0.1119815, 0.03974558, -0.057513904, 0.09213575, -0.0012252429), target1); + target1 = MulAdd(ng1, MF4x4(-0.011343602, -0.02488338, 0.07799659, 0.06503721, 0.06380687, 0.048929837, -0.05555838, -0.050519127, 0.14673206, 0.18085165, 0.07261422, 0.09738158, 0.07395791, 0.005573146, -0.05454926, -0.13565786), target1); + target1 = MulAdd(g2, MF4x4(-0.08591514, -0.05664865, 0.23980616, 0.24876402, 0.19052829, 0.011938714, 0.21487322, 0.058656186, 0.036630988, 0.14918756, 0.013127693, 0.13092093, -0.37889576, -0.4068804, -0.27258882, -0.30605716), target1); + target1 = MulAdd(g3, MF4x4(-0.25149816, -0.21979512, -0.24949454, -0.20483162, -0.10972783, -0.17315808, -0.08562763, -0.16086778, 0.044681527, 0.050807394, -0.019424994, -0.022418005, 0.10039492, -0.013666552, -0.22373566, -0.34493732), target1); + target1 = MulAdd(ng2, MF4x4(0.1419155, 0.081392206, -0.18103191, -0.2122926, -0.1445937, -0.015969204, -0.12368782, -0.0044421684, -0.09534078, -0.14815839, -0.1052107, -0.16341865, 0.3050403, 0.34488317, 0.16171226, 0.18700944), target1); + target1 = MulAdd(ng3, MF4x4(0.12444696, 0.08712589, 0.06266247, 0.031022022, 0.17707655, 0.24904409, 0.20961654, 0.2610619, -0.099262595, -0.06900819, -0.034567446, -0.020191457, -0.1468561, -0.04683958, 0.14910224, 0.244686), target1); + target1 = MulAdd(g4, MF4x4(-0.002428158, -0.012889509, 0.0006541127, -0.0058380975, 0.096147396, 0.07791617, 0.119144954, 0.11699654, -0.024602454, -0.07894611, -0.00021709128, -0.03979557, 0.0028512406, -0.015790012, 0.0082511455, 0.029357092), target1); + target1 = MulAdd(g5, MF4x4(-0.01410329, -0.004162405, -0.09005045, -0.07753674, 0.004509965, -0.024188736, 0.13799691, 0.10589621, -0.023018798, 0.0064198375, -0.103344224, -0.07463909, -0.060048997, -0.071094714, -0.13042289, -0.14482167), target1); + target1 = MulAdd(ng4, MF4x4(-0.009015246, 0.01581748, -0.035448726, -0.012348933, -0.101627484, -0.05530413, -0.14063041, -0.121775225, 0.074719116, 0.033839386, 0.045573987, -0.006698053, 0.0015141299, 0.003634417, 0.017102007, 0.0074890694), target1); + target1 = MulAdd(ng5, MF4x4(0.0042357175, 0.018735386, 0.058959343, 0.057424515, -0.021633089, -0.037194982, -0.14109972, -0.1506368, 0.004357002, -0.006871023, 0.05337361, 0.039684236, 0.087463334, 0.07772685, 0.12278512, 0.1224218), target1); + target1 = MulAdd(g6, MF4x4(0.018359886, 0.046934873, -0.008225237, 0.020650858, -0.03961538, -0.014779162, -0.04161338, -0.00953579, 0.0017313146, 0.0068857935, -0.0024282748, 0.0047545764, 0.02635904, 0.027336216, 0.02701322, 0.029939381), target1); + target1 = MulAdd(g7, MF4x4(-0.00067966996, 0.024480496, -0.015218739, -0.010472019, -0.03994461, -0.052318517, -0.04450191, -0.043226667, -0.03166469, -0.03799331, 0.015428865, -0.018422252, 0.00040845043, 0.03558268, -0.0099401595, -0.00054432114), target1); + target1 = MulAdd(ng6, MF4x4(-0.0032104475, 0.019604867, -0.02486679, 0.002134673, 0.014368818, -0.0013395248, 0.017318068, 0.0021403218, -0.02198377, 0.010297547, -0.041619625, -0.02740482, -0.067249276, -0.03040953, -0.021304253, -0.009557115), target1); + target1 = MulAdd(ng7, MF4x4(-0.019099236, -0.037010793, 0.013720462, 0.023708181, 0.016356282, -0.00028589502, -0.010570909, -0.009186907, 0.03493662, 0.055599142, -0.017043956, 0.004204044, -0.013573257, -0.013537684, 0.008151195, 0.0074913655), target1); + target1 = MulAdd(g8, MF4x4(0.009309031, -0.0014795153, 0.025114728, -0.0066442797, -0.012085473, -0.0030560147, 0.002144206, 0.0009732741, 0.022301642, -0.0091133695, 0.0011837826, -0.020275833, -0.021349607, -0.011693419, -0.018912962, -0.022418445), target1); + target1 = MulAdd(g9, MF4x4(-0.0045772395, 0.031085191, 0.01215795, 0.023887333, 0.023408212, 0.0005998807, 0.011254428, -0.004634461, 0.016601006, 0.046663348, 0.031117432, 0.04910873, -0.113230005, -0.035702843, -0.058746565, -0.053893737), target1); + target1 = MulAdd(ng8, MF4x4(-0.020218112, 0.056803435, -0.0037077996, 0.05123925, -0.016713811, -0.05551032, -0.005916611, -0.037839632, -0.007671626, -0.009099201, -0.0010055836, 0.003332688, 0.020744357, 0.01957675, 0.057906736, 0.041446246), target1); + target1 = MulAdd(ng9, MF4x4(0.022438819, 0.04616756, 0.035925094, 0.0639705, 0.0009332198, 0.020964272, -0.010805394, 0.031757344, 0.051255573, 0.032838948, 0.00055445684, -0.03195623, 0.04753827, 0.016436901, 0.04788274, 0.022093765), target1); + target1 = MulAdd(g10, MF4x4(0.03479086, 0.035946105, 0.04343359, 0.04015664, 0.06081792, 0.061758887, 0.10128842, 0.007471392, -0.027261607, -0.01290544, -0.029938918, -0.050834358, -0.015550162, 0.0072828676, -0.04580556, -0.029642029), target1); + target1 = MulAdd(g11, MF4x4(0.011150116, 0.029789668, -0.00354488, 0.045047592, -0.018265083, -0.020843878, 0.015457328, 0.0053232997, 0.0791804, -0.028661052, 0.079342775, -0.039631505, 0.14613943, 0.08323415, 0.049641483, 0.047863442), target1); + target1 = MulAdd(ng10, MF4x4(-0.103034586, -0.107580125, 0.00044325445, 0.007830247, -0.017059505, 0.010152936, -0.02845979, -0.01841766, -0.10722863, -0.025262646, -0.07402096, -0.025055556, 0.0013303137, 0.12574737, -0.0161103, 0.06077798), target1); + target1 = MulAdd(ng11, MF4x4(-0.0420636, -0.062703885, -0.06476972, -0.10516001, 0.018120673, 0.024305122, -0.013997766, 0.015815413, -0.06317691, -0.03968166, -0.054052643, -0.016300509, -0.08255892, -0.01612941, -0.04194852, -0.012637189), target1); + target1 = MulAdd(g12, MF4x4(0.042659573, -0.10762496, -0.077143244, 0.12583935, -0.022020226, -0.0042312425, -0.016734738, 0.027007964, -0.06609771, -0.056038737, -0.0058528963, 0.035508137, -0.019722374, -0.055094264, 0.010977759, -0.009833099), target1); + target1 = MulAdd(g13, MF4x4(0.063830875, -0.019885639, 0.055574782, 0.039456647, 0.01576898, -0.1389799, 0.063411795, -0.11600623, -0.013968303, -0.03318867, -0.06806915, -0.09373464, -0.022723546, -0.03329239, 0.014282872, 0.027576538), target1); + target1 = MulAdd(ng12, MF4x4(-0.018100513, 0.06204485, 0.010761461, -0.045085587, 0.009286288, 0.02310671, 0.10633246, -0.090849996, 0.13112675, -0.01639808, 0.0022725316, -0.076779045, 0.11831251, 0.1460306, -0.10849466, -0.07749171), target1); + target1 = MulAdd(ng13, MF4x4(-0.15850247, 0.118011266, -0.10121594, -0.007109052, 0.071873754, 0.06954878, 0.0377852, 0.044174008, -0.062925555, -0.01758927, 0.1416964, 0.17206357, -0.035632525, -0.04652215, 0.061932907, 0.034339), target1); + + MF4 target2 = { 0.05825913, 0.051491056, 0.038389463, 0.03321517 }; + target2 = MulAdd(g0, MF4x4(-0.009000901, -0.018048609, 0.013095594, 0.002321373, 0.0004716619, 0.00504148, -0.016826658, -0.014922383, 0.15059204, 0.16593806, 0.115392484, 0.12520894, 0.05049829, 0.060210057, 0.086421266, 0.07242362), target2); + target2 = MulAdd(g1, MF4x4(0.06268658, 0.030466434, 0.07876877, 0.04129863, 0.04142328, 0.009963961, 0.051785357, 0.012811113, 0.1295883, 0.139931, 0.07733839, 0.08014211, 0.07156476, 0.0342396, 0.051614303, 0.043559864), target2); + target2 = MulAdd(ng0, MF4x4(0.00041542648, 0.016051646, -0.011512418, 0.013076814, 0.03734479, 0.02791584, 0.012426691, 0.022044811, -0.034128398, -0.027107332, -0.021998279, -0.012139807, -0.033177473, -0.016310865, -0.078221664, -0.041203145), target2); + target2 = MulAdd(ng1, MF4x4(-0.008398536, -0.010332053, -0.050231732, -0.039691273, -0.042082537, -0.030281143, -0.014039778, -0.0020190612, -0.11956351, -0.13638765, -0.09794402, -0.10228069, -0.08344795, -0.07944541, -0.004189214, -0.028206991), target2); + target2 = MulAdd(g2, MF4x4(0.0002908945, -0.00831185, -0.06870294, -0.083311856, -0.024992501, 0.0038247898, -0.049389005, -0.020098582, -0.0135326125, -0.040408995, -0.012083491, -0.042174604, 0.16112538, 0.13720983, 0.13937058, 0.10870099), target2); + target2 = MulAdd(g3, MF4x4(0.078961425, 0.082619205, 0.06910667, 0.06579004, -0.0077012256, -0.00038692637, 0.00015553503, -0.012561662, 0.00053048285, -0.01461681, 0.02600344, 0.024862211, -0.06958201, -0.048246548, 0.058762506, 0.036662634), target2); + target2 = MulAdd(ng2, MF4x4(-0.023527982, -0.0028001352, 0.047800142, 0.09616409, 0.049143843, 0.030836122, 0.057244994, 0.025672587, 0.027565151, 0.039868724, 0.045296676, 0.04623187, -0.124759234, -0.14106254, -0.06337279, -0.076839216), target2); + target2 = MulAdd(ng3, MF4x4(-0.0911771, -0.064436875, -0.05308137, -0.022082496, -0.0040269364, 0.0014464161, -0.0029555515, 0.016098293, -0.026650434, -0.014081368, -0.06747348, -0.05481826, 0.097423114, 0.08620988, -0.01607732, -0.015440677), target2); + target2 = MulAdd(g4, MF4x4(-0.014001735, -0.015001655, -0.013250577, -0.009930805, 0.04885879, 0.07092224, 0.025783395, 0.03792237, -0.04332465, -0.06244993, -0.046748653, -0.07132349, -0.0053951666, -0.016514057, 0.023807624, 0.044013456), target2); + target2 = MulAdd(g5, MF4x4(-0.009097996, -0.016898679, -0.05043909, -0.063178614, -0.016210863, -0.02157998, -0.02654472, -0.042961173, 0.012103852, 0.019015301, 0.02492281, 0.03389976, 0.015276502, 0.009577683, 0.04132527, -0.00070621347), target2); + target2 = MulAdd(ng4, MF4x4(-0.0057500796, 0.00728164, -0.003422421, 0.0038979584, -0.03127353, -0.019125199, -0.012988815, -0.031890683, 0.09352588, 0.019210607, 0.09824038, 0.016637104, 0.010692808, 0.022393884, 0.008312123, 0.014120716), target2); + target2 = MulAdd(ng5, MF4x4(0.013895599, 0.023097904, 0.009370535, 0.014099512, 0.0124661345, -0.015076684, 0.03287286, 0.005912471, -0.03944815, -0.020340785, -0.06822037, -0.059383288, 0.03634978, 0.007832939, -0.007142306, -0.0061968984), target2); + target2 = MulAdd(g6, MF4x4(0.033002097, 0.0516016, -0.021056438, 0.005715988, -0.02223013, -0.007962324, -0.024417123, -0.0014790733, 0.002167189, 0.00043749413, -0.007284963, -0.0027283782, 0.026238248, 0.01756047, 0.008969755, 0.014201024), target2); + target2 = MulAdd(g7, MF4x4(0.011576685, 0.02087598, 0.0026766327, -0.0041780816, -0.05277701, -0.05412841, -0.05958835, -0.050426245, -0.00662945, -0.021645393, 0.03423904, -0.0064581474, -0.030403355, 0.018391011, -0.026089542, -0.0051510665), target2); + target2 = MulAdd(ng6, MF4x4(-0.046202097, -0.0066081425, -0.03698851, 0.0034165455, -0.011859245, -0.020945566, -0.0028196946, -0.010053285, -0.011400397, 0.030595876, -0.018915813, 0.006780077, -0.060040582, -0.009586898, -0.004477886, 0.011279908), target2); + target2 = MulAdd(ng7, MF4x4(-0.028692413, -0.032535568, 0.0017473884, 0.02207169, 0.0192618, 0.008956797, -0.0033381556, 0.006326402, 0.0169569, 0.041449737, -0.02611751, 0.0006410355, 0.006233776, 0.0008467914, 0.011884985, 0.009222136), target2); + target2 = MulAdd(g8, MF4x4(0.017076496, -0.0045380928, 0.03444613, -0.009804047, -0.004829834, -0.004889702, 0.0057807956, 0.0015014127, 0.03458368, -0.0035773432, -0.007769679, -0.032449644, -0.021396799, -0.017612215, -0.012764735, -0.025224172), target2); + target2 = MulAdd(g9, MF4x4(-0.011824532, 0.02335273, 0.00764845, 0.019215155, 0.022186808, 0.0066053392, 0.0071694753, -0.0036117272, 0.032144524, 0.05025988, 0.03982363, 0.052400436, -0.10555114, -0.03809396, -0.05334183, -0.05524487), target2); + target2 = MulAdd(ng8, MF4x4(-0.024599254, 0.058805298, 0.00069874676, 0.06263439, -0.018460508, -0.053566024, -0.0022889362, -0.035818785, -0.0135854995, -0.015712813, 0.0012080368, 0.005957637, 0.009450094, 0.03186346, 0.059969924, 0.057706963), target2); + target2 = MulAdd(ng9, MF4x4(0.026783831, 0.05475865, 0.027565574, 0.06032707, -0.0015639095, 0.024381682, -0.010199071, 0.037544634, 0.039889377, 0.03318851, -0.016529158, -0.0343188, 0.045666486, 0.021665907, 0.042189375, 0.02444145), target2); + target2 = MulAdd(g10, MF4x4(0.03791853, 0.043746054, 0.056224477, 0.05098111, 0.075256795, 0.074653305, 0.116220035, 0.01853866, -0.04133627, -0.009134169, -0.0420953, -0.05210053, -0.021748418, 0.004422131, -0.05422814, -0.035721727), target2); + target2 = MulAdd(g11, MF4x4(0.013814317, 0.03149986, -0.004971173, 0.04782029, -0.01693027, -0.017984565, 0.019328078, 0.008521426, 0.0845641, -0.027555496, 0.08150416, -0.04623306, 0.16494128, 0.09300831, 0.074097835, 0.0627848), target2); + target2 = MulAdd(ng10, MF4x4(-0.10307174, -0.112654425, -0.005589254, -0.0062108496, -0.012491583, 0.011512013, -0.03142282, -0.023683488, -0.099848576, -0.031290524, -0.07236223, -0.037460987, 0.008760208, 0.1473594, -0.009216949, 0.07251379), target2); + target2 = MulAdd(ng11, MF4x4(-0.04915367, -0.07121096, -0.06572174, -0.10967046, 0.019548079, 0.023992533, -0.019842865, 0.012366459, -0.07207817, -0.04237792, -0.054463565, -0.015374731, -0.092071235, -0.020860313, -0.054475963, -0.02303954), target2); + target2 = MulAdd(g12, MF4x4(0.04160816, -0.118427366, -0.08661791, 0.12787233, -0.01990174, 0.0012960634, -0.016121056, 0.031429946, -0.06830865, -0.057132352, -0.0022302791, 0.03845933, -0.026981276, -0.063532256, 0.011805961, -0.009616678), target2); + target2 = MulAdd(g13, MF4x4(0.07094465, -0.022284096, 0.060676746, 0.042626668, 0.011207256, -0.14960343, 0.05866539, -0.12742221, -0.021092903, -0.039463162, -0.07879986, -0.10232898, -0.026127055, -0.038111385, 0.019167708, 0.032637425), target2); + target2 = MulAdd(ng12, MF4x4(-0.014270794, 0.07157703, 0.013714203, -0.047801998, 0.0060221693, 0.022788104, 0.10630103, -0.09606649, 0.12690987, -0.017079826, -0.0077072172, -0.082584485, 0.13256006, 0.16012523, -0.10966099, -0.07927409), target2); + target2 = MulAdd(ng13, MF4x4(-0.17171615, 0.12114435, -0.10746857, -0.0074188868, 0.07854815, 0.07759372, 0.04310874, 0.051465522, -0.05963588, -0.014506484, 0.15522978, 0.18746643, -0.03845241, -0.0489534, 0.05837339, 0.032978524), target2); + + MF4 target3 = { 0.06614842, 0.045779686, 0.032838725, 0.017085627 }; + target3 = MulAdd(g0, MF4x4(0.2006987, 0.17832398, 0.26342955, 0.23500517, -0.012297829, -0.009008417, -0.039950736, -0.039973143, 0.34800097, 0.32196492, 0.30505183, 0.29214156, -0.21410535, -0.21166423, -0.16437815, -0.19172792), target3); + target3 = MulAdd(g1, MF4x4(-0.008804151, -0.07085123, 0.013577994, -0.05192605, -0.08981402, -0.14702585, -0.09145975, -0.14835288, -0.15882517, -0.14785844, -0.2381482, -0.22867912, 0.010898514, 0.031957507, 0.040597558, 0.078252345), target3); + target3 = MulAdd(ng0, MF4x4(-0.21658613, -0.1803885, -0.25954962, -0.20839214, -0.09597461, -0.09222647, -0.03909875, -0.03456191, -0.19723509, -0.16976605, -0.2041716, -0.1751425, 0.22901416, 0.24922715, 0.1800083, 0.23346905), target3); + target3 = MulAdd(ng1, MF4x4(0.110020064, 0.103858806, 0.052446555, 0.061105963, 0.032901537, 0.07140097, 0.11518793, 0.13860466, 0.13930707, 0.12712196, 0.19071707, 0.18399614, -0.08036458, -0.07349171, 0.021504594, 0.0024937368), target3); + target3 = MulAdd(g2, MF4x4(0.059065036, 0.00698257, -0.099622436, -0.15676253, -0.10942482, -0.04869624, -0.13654554, -0.07341863, -0.014169851, -0.06390744, 0.016093008, -0.04540248, 0.29041344, 0.24451919, 0.26292154, 0.22136512), target3); + target3 = MulAdd(g3, MF4x4(0.107946776, 0.097849295, 0.10266876, 0.09360328, 0.08931344, 0.08896482, 0.046495322, 0.044040844, -0.020361643, -0.030911373, 0.026598722, 0.019815676, -0.072677925, -0.042410247, 0.14127749, 0.13434973), target3); + target3 = MulAdd(ng2, MF4x4(-0.08809133, -0.03476601, 0.06420393, 0.14691353, 0.09296839, 0.06162562, 0.10992992, 0.0615685, 0.0168736, 0.06520281, 0.020010693, 0.046929173, -0.2219495, -0.21249783, -0.14622301, -0.14599061), target3); + target3 = MulAdd(ng3, MF4x4(-0.13251069, -0.08977477, -0.08930347, -0.045490693, -0.10980218, -0.09510885, -0.07299872, -0.064053826, 0.011365247, 0.014091111, -0.054976214, -0.056936122, 0.10148144, 0.07451642, -0.08138598, -0.10161657), target3); + target3 = MulAdd(g4, MF4x4(-0.0075518745, -0.005738622, -0.007577811, -0.00032088626, 0.032614008, 0.04858922, 0.00054855715, 0.011565026, -0.022675224, -0.034442738, -0.03580643, -0.05069376, -0.0020376542, -0.01505518, 0.019388825, 0.03746554), target3); + target3 = MulAdd(g5, MF4x4(-0.011413172, -0.016877454, -0.048923567, -0.055012885, -0.007709447, -0.016109072, -0.047132388, -0.07146396, 0.002604099, 0.00014681708, 0.03429465, 0.043265607, 0.029014807, 0.03337814, 0.07582056, 0.041660666), target3); + target3 = MulAdd(ng4, MF4x4(-0.020768544, -0.014378527, -0.01999972, -0.01385916, -0.012264676, -0.009959511, 0.0119015165, -0.016787319, 0.07266734, -0.0029914333, 0.08549183, 0.004367342, 0.008236065, 0.020370748, 0.0043428927, 0.007301017), target3); + target3 = MulAdd(ng5, MF4x4(0.011654731, 0.025318999, -0.0029306612, 0.007426217, -0.00010868774, -0.020845588, 0.041991003, 0.024147986, -0.030741083, -0.012328637, -0.06617428, -0.06103115, 0.010491518, -0.013338451, -0.04666634, -0.046481613), target3); + target3 = MulAdd(g6, MF4x4(0.0268538, 0.043785956, -0.01799385, 0.008743307, -0.013197458, -0.015049436, -0.017189734, -0.0047999253, -0.00059730676, -0.0008936153, -0.016006093, -0.0073406673, 0.014875853, 0.011491735, 9.819833e-05, 0.0073417514), target3); + target3 = MulAdd(g7, MF4x4(0.019930955, 0.027112626, 0.01307941, 0.005268897, -0.060213763, -0.050415818, -0.069006495, -0.051405095, 0.0036414433, -0.008606397, 0.037427194, -0.0018103109, -0.037434716, 0.010187546, -0.026227329, -0.0033639795), target3); + target3 = MulAdd(ng6, MF4x4(-0.03634798, 0.0007093891, -0.026819145, 0.009025687, -0.01750318, -0.020098133, -0.0063864207, -0.006606755, 0.0008565766, 0.028647956, -0.0024974607, 0.015250743, -0.048884176, -0.004310685, -0.0010757383, 0.00974984), target3); + target3 = MulAdd(ng7, MF4x4(-0.031253602, -0.031743724, -0.009083253, 0.0145388115, 0.02048611, 0.0058071036, -0.0038228377, 0.00049654936, 0.0059105973, 0.03437731, -0.025785418, 0.004187733, 0.009980489, -4.08268e-05, 0.009384428, 0.0019492983), target3); + target3 = MulAdd(g8, MF4x4(0.012587245, -0.0032654977, 0.029739188, -0.009440694, -0.0018237908, -0.0080032, 0.010499013, 0.0012466761, 0.03461923, -0.0060207327, -0.008771263, -0.034545746, -0.015023473, -0.008901684, -0.011490296, -0.01976464), target3); + target3 = MulAdd(g9, MF4x4(-0.009444331, 0.020809013, 0.009985801, 0.020350901, 0.013234775, 0.004382635, -0.0007761826, -0.005247294, 0.034115106, 0.05190378, 0.039124765, 0.050993033, -0.0898732, -0.030428126, -0.044204578, -0.052484997), target3); + target3 = MulAdd(ng8, MF4x4(-0.020434443, 0.053520404, 0.0007571144, 0.05895061, -0.018991265, -0.043982152, -0.004035192, -0.025452444, -0.012197152, -0.013770753, 0.0012919102, 0.003996682, 0.0056104586, 0.025686186, 0.05128293, 0.05105745), target3); + target3 = MulAdd(ng9, MF4x4(0.030201769, 0.052521482, 0.029641917, 0.05559941, 0.0018870027, 0.020112835, -0.0043867202, 0.035877172, 0.02961142, 0.02163634, -0.027972858, -0.040669747, 0.03393723, 0.013455979, 0.03313782, 0.01968004), target3); + target3 = MulAdd(g10, MF4x4(0.034817442, 0.04045217, 0.054816365, 0.05092461, 0.06600807, 0.062576495, 0.09923777, 0.006663677, -0.039958935, -0.010009866, -0.041522443, -0.04959681, -0.020962957, 0.003845031, -0.04910384, -0.03233655), target3); + target3 = MulAdd(g11, MF4x4(0.015433112, 0.028965838, -0.0055138534, 0.042267464, -0.012690953, -0.009424165, 0.017896382, 0.01186686, 0.07231686, -0.038834292, 0.07033086, -0.052548733, 0.15721905, 0.09334892, 0.07676042, 0.06701375), target3); + target3 = MulAdd(ng10, MF4x4(-0.09797534, -0.11201098, -0.0037222446, -0.008105951, -0.01152357, 0.012165641, -0.029051905, -0.021293389, -0.09600697, -0.028819272, -0.069084235, -0.035421908, 0.0054322914, 0.14168966, -0.0200274, 0.06505187), target3); + target3 = MulAdd(ng11, MF4x4(-0.05034882, -0.06622497, -0.062471002, -0.100628324, 0.018115615, 0.019867867, -0.018836644, 0.007562053, -0.06317378, -0.034458403, -0.047243826, -0.009989589, -0.08270121, -0.018645251, -0.05160367, -0.023690399), target3); + target3 = MulAdd(g12, MF4x4(0.03897899, -0.10862529, -0.081805214, 0.1202324, -0.021866674, -0.00041882638, -0.018235246, 0.027227063, -0.0656312, -0.053432237, -0.0029235696, 0.03549672, -0.022848906, -0.057047505, 0.013400545, -0.0072439364), target3); + target3 = MulAdd(g13, MF4x4(0.06879516, -0.018637763, 0.058062725, 0.041045032, 0.011702424, -0.13693465, 0.05674195, -0.11679955, -0.022940686, -0.03856922, -0.07531371, -0.09692582, -0.019870926, -0.032572743, 0.026138868, 0.037639033), target3); + target3 = MulAdd(ng12, MF4x4(-0.015270301, 0.06478719, 0.011016518, -0.04533957, 0.00688319, 0.024815995, 0.10159924, -0.08467507, 0.11939162, -0.01939453, -0.0058689644, -0.077881604, 0.118726775, 0.14489114, -0.10831982, -0.07972515), target3); + target3 = MulAdd(ng13, MF4x4(-0.16734359, 0.10685446, -0.102714166, -0.010225307, 0.07306756, 0.07014447, 0.040464073, 0.04688462, -0.05489714, -0.01525318, 0.14690581, 0.17514132, -0.03250648, -0.03688211, 0.05047889, 0.03078089), target3); float2 outputPt = GetOutputPt(); pos -= 0.5f * outputPt; - OUTPUT[gxy] = float4(float3(target1.x, target2.x, target3.x) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1); + OUTPUT[gxy] = MF4(MF3(target1.x, target2.x, target3.x) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1); ++gxy.x; pos.x += outputPt.x; - OUTPUT[gxy] = float4(float3(target1.y, target2.y, target3.y) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1); + OUTPUT[gxy] = MF4(MF3(target1.y, target2.y, target3.y) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1); ++gxy.y; pos.y += outputPt.y; - OUTPUT[gxy] = float4(float3(target1.w, target2.w, target3.w) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1); + OUTPUT[gxy] = MF4(MF3(target1.w, target2.w, target3.w) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1); --gxy.x; pos.x -= outputPt.x; - OUTPUT[gxy] = float4(float3(target1.z, target2.z, target3.z) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1); + OUTPUT[gxy] = MF4(MF3(target1.z, target2.z, target3.z) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1); } diff --git a/src/Effects/Anime4K/Anime4K_Upscale_GAN_x2_M.hlsl b/src/Effects/Anime4K/Anime4K_Upscale_GAN_x2_M.hlsl index 6ab5ce8bf..022af5495 100644 --- a/src/Effects/Anime4K/Anime4K_Upscale_GAN_x2_M.hlsl +++ b/src/Effects/Anime4K/Anime4K_Upscale_GAN_x2_M.hlsl @@ -4,6 +4,9 @@ //!MAGPIE EFFECT //!VERSION 4 //!SORT_NAME Anime4K_Upscale_GAN_x2_2 +//!USE FP16, MulAdd + +#include "..\StubDefs.hlsli" // 圆括号内的输入只被采样一次 // INPUT -> tf, tf1 @@ -108,22 +111,22 @@ void Pass1(uint2 blockStart, uint3 threadId) { uint i, j; - min16float3 src[4][4]; + MF3 src[4][4]; [unroll] for (i = 0; i <= 2; i += 2) { [unroll] for (j = 0; j <= 2; j += 2) { float2 tpos = (gxy + uint2(i, j)) * inputPt; - const min16float4 sr = INPUT.GatherRed(sam, tpos); - const min16float4 sg = INPUT.GatherGreen(sam, tpos); - const min16float4 sb = INPUT.GatherBlue(sam, tpos); + const MF4 sr = INPUT.GatherRed(sam, tpos); + const MF4 sg = INPUT.GatherGreen(sam, tpos); + const MF4 sb = INPUT.GatherBlue(sam, tpos); // w z // x y - src[i][j] = min16float3(sr.w, sg.w, sb.w); - src[i][j + 1] = min16float3(sr.x, sg.x, sb.x); - src[i + 1][j] = min16float3(sr.z, sg.z, sb.z); - src[i + 1][j + 1] = min16float3(sr.y, sg.y, sb.y); + src[i][j] = MF3(sr.w, sg.w, sb.w); + src[i][j + 1] = MF3(sr.x, sg.x, sb.x); + src[i + 1][j] = MF3(sr.z, sg.z, sb.z); + src[i + 1][j + 1] = MF3(sr.y, sg.y, sb.y); } } @@ -139,28 +142,28 @@ void Pass1(uint2 blockStart, uint3 threadId) { } } - min16float4 result = mul(src[i - 1][j - 1], min16float3x4(-0.17498326, -0.14677401, -0.43065637, 0.10841958, 0.24096319, -0.008683959, -0.29844064, 0.3567803, 0.43360776, 0.11304715, -0.0802437, 0.190904)); - result += mul(src[i - 1][j], min16float3x4(0.24688073, 0.086462855, 0.05716678, -0.1739644, 0.3236298, 0.23382919, 0.20481811, -0.022618154, -0.336325, -0.21624258, -0.18736486, -0.14936537)); - result += mul(src[i - 1][j + 1], min16float3x4(0.38230455, 0.410552, 0.34809712, 0.2510045, 0.30689523, 0.09889703, -0.26991332, 0.1108426, 0.5083409, 0.2854462, -0.1912902, 0.40354714)); - result += mul(src[i][j - 1], min16float3x4(0.46870667, -0.03530456, 0.13705169, -0.11884997, -0.0772201, 0.17073877, 0.03287621, -0.14975251, -0.18155691, 0.14545092, -0.1584816, 0.051269397)); - result += mul(src[i][j], min16float3x4(-0.5830986, -0.009166566, 0.54358304, -0.4545001, -0.27541155, 0.6697277, -0.29205534, -0.61038095, -0.64781004, 0.32052672, 0.14704794, -0.6479083)); - result += mul(src[i][j + 1], min16float3x4(-0.04402336, 0.05461938, -0.18035333, 0.5464947, 0.21475682, -0.6899343, 0.49390903, 0.62440956, 0.75365967, -0.26500008, 0.59187347, 0.10037025)); - result += mul(src[i + 1][j - 1], min16float3x4(-0.25319895, -0.1764162, -0.22574338, 0.03075524, -0.29618785, -0.491323, 0.008427114, -0.363144, -0.17214127, -0.11891048, -0.19321653, -0.13424487)); - result += mul(src[i + 1][j], min16float3x4(0.17425235, 0.07049646, -0.1759216, 0.05697634, -0.39496303, 0.35450256, -0.09984144, 0.15470548, -0.03375828, 0.06442114, 0.14598753, 0.46114844)); - result += mul(src[i + 1][j + 1], min16float3x4(-0.19262458, -0.17141157, -0.11393742, -0.07778959, -0.006366565, -0.16713034, 0.2135569, 0.23494779, -0.37996295, -0.2767951, -0.1515432, -0.110363424)); - result += min16float4(0.010385515, 0.011541315, -0.002942497, -0.00020902864); + MF4 result = MF4(0.010385515, 0.011541315, -0.002942497, -0.00020902864); + result = MulAdd(src[i - 1][j - 1], MF3x4(-0.17498326, -0.14677401, -0.43065637, 0.10841958, 0.24096319, -0.008683959, -0.29844064, 0.3567803, 0.43360776, 0.11304715, -0.0802437, 0.190904), result); + result = MulAdd(src[i - 1][j], MF3x4(0.24688073, 0.086462855, 0.05716678, -0.1739644, 0.3236298, 0.23382919, 0.20481811, -0.022618154, -0.336325, -0.21624258, -0.18736486, -0.14936537), result); + result = MulAdd(src[i - 1][j + 1], MF3x4(0.38230455, 0.410552, 0.34809712, 0.2510045, 0.30689523, 0.09889703, -0.26991332, 0.1108426, 0.5083409, 0.2854462, -0.1912902, 0.40354714), result); + result = MulAdd(src[i][j - 1], MF3x4(0.46870667, -0.03530456, 0.13705169, -0.11884997, -0.0772201, 0.17073877, 0.03287621, -0.14975251, -0.18155691, 0.14545092, -0.1584816, 0.051269397), result); + result = MulAdd(src[i][j], MF3x4(-0.5830986, -0.009166566, 0.54358304, -0.4545001, -0.27541155, 0.6697277, -0.29205534, -0.61038095, -0.64781004, 0.32052672, 0.14704794, -0.6479083), result); + result = MulAdd(src[i][j + 1], MF3x4(-0.04402336, 0.05461938, -0.18035333, 0.5464947, 0.21475682, -0.6899343, 0.49390903, 0.62440956, 0.75365967, -0.26500008, 0.59187347, 0.10037025), result); + result = MulAdd(src[i + 1][j - 1], MF3x4(-0.25319895, -0.1764162, -0.22574338, 0.03075524, -0.29618785, -0.491323, 0.008427114, -0.363144, -0.17214127, -0.11891048, -0.19321653, -0.13424487), result); + result = MulAdd(src[i + 1][j], MF3x4(0.17425235, 0.07049646, -0.1759216, 0.05697634, -0.39496303, 0.35450256, -0.09984144, 0.15470548, -0.03375828, 0.06442114, 0.14598753, 0.46114844), result); + result = MulAdd(src[i + 1][j + 1], MF3x4(-0.19262458, -0.17141157, -0.11393742, -0.07778959, -0.006366565, -0.16713034, 0.2135569, 0.23494779, -0.37996295, -0.2767951, -0.1515432, -0.110363424), result); tex1[destPos] = result; - result = mul(src[i - 1][j - 1], min16float3x4(0.8031736, -0.1500194, -0.23398483, -0.060760673, 0.5049785, -0.099199474, -0.035531044, 0.0310586, -0.0310334, 0.15932913, 0.08973915, 0.08766925)); - result += mul(src[i - 1][j], min16float3x4(-0.2187303, 0.20974335, 0.016500302, 0.15386087, 0.2381243, -0.176845, -0.003643712, 0.08195259, 0.18417378, -0.18228108, 0.19170114, -0.3758241)); - result += mul(src[i - 1][j + 1], min16float3x4(0.4429508, -0.025832538, -0.021855514, 0.11322045, -0.08459551, -0.17815724, -0.19924322, -0.03736318, -0.22390507, -0.50430673, -0.13770194, 0.03014482)); - result += mul(src[i][j - 1], min16float3x4(-0.15976174, 0.31052437, 0.2498092, -0.29137832, -0.10121105, 0.35164458, 0.4901633, -0.35297948, -0.2569739, -0.14258477, 0.12585007, -0.2552164)); - result += mul(src[i][j], min16float3x4(-0.5260107, -0.8547037, 0.92173797, 0.37817466, -0.4162576, 0.10989847, 0.26875922, 0.8614761, 0.069195434, 0.045593478, 0.03790176, 0.7332446)); - result += mul(src[i][j + 1], min16float3x4(0.14287843, -0.283008, -0.28487602, -0.13313514, -0.019538656, -0.02361782, 0.28037757, -0.10543745, 0.1586713, 0.12037641, 0.24249536, 0.2524637)); - result += mul(src[i + 1][j - 1], min16float3x4(-0.037178896, 0.23858358, -0.18704462, -0.13747689, 0.07629898, 0.2710832, -0.71619016, -0.09074896, 0.30446374, -0.0052702115, -0.27990812, -0.1392364)); - result += mul(src[i + 1][j], min16float3x4(-0.086045384, 0.695562, -0.23519892, -0.23438415, 0.16208446, 0.2172693, -0.16647956, -0.3718635, 0.024940055, 0.5650778, 0.20409326, -0.13530363)); - result += mul(src[i + 1][j + 1], min16float3x4(-0.19389555, -0.028506106, -0.35060602, 0.22244014, 0.055054635, -0.17651209, -0.19871834, -0.02667603, -0.1402023, -0.02455308, -0.57856905, -0.2174221)); - result += min16float4(0.02648044, -0.0017647704, -0.016136197, 0.0011179475); + result = MF4(0.02648044, -0.0017647704, -0.016136197, 0.0011179475); + result = MulAdd(src[i - 1][j - 1], MF3x4(0.8031736, -0.1500194, -0.23398483, -0.060760673, 0.5049785, -0.099199474, -0.035531044, 0.0310586, -0.0310334, 0.15932913, 0.08973915, 0.08766925), result); + result = MulAdd(src[i - 1][j], MF3x4(-0.2187303, 0.20974335, 0.016500302, 0.15386087, 0.2381243, -0.176845, -0.003643712, 0.08195259, 0.18417378, -0.18228108, 0.19170114, -0.3758241), result); + result = MulAdd(src[i - 1][j + 1], MF3x4(0.4429508, -0.025832538, -0.021855514, 0.11322045, -0.08459551, -0.17815724, -0.19924322, -0.03736318, -0.22390507, -0.50430673, -0.13770194, 0.03014482), result); + result = MulAdd(src[i][j - 1], MF3x4(-0.15976174, 0.31052437, 0.2498092, -0.29137832, -0.10121105, 0.35164458, 0.4901633, -0.35297948, -0.2569739, -0.14258477, 0.12585007, -0.2552164), result); + result = MulAdd(src[i][j], MF3x4(-0.5260107, -0.8547037, 0.92173797, 0.37817466, -0.4162576, 0.10989847, 0.26875922, 0.8614761, 0.069195434, 0.045593478, 0.03790176, 0.7332446), result); + result = MulAdd(src[i][j + 1], MF3x4(0.14287843, -0.283008, -0.28487602, -0.13313514, -0.019538656, -0.02361782, 0.28037757, -0.10543745, 0.1586713, 0.12037641, 0.24249536, 0.2524637), result); + result = MulAdd(src[i + 1][j - 1], MF3x4(-0.037178896, 0.23858358, -0.18704462, -0.13747689, 0.07629898, 0.2710832, -0.71619016, -0.09074896, 0.30446374, -0.0052702115, -0.27990812, -0.1392364), result); + result = MulAdd(src[i + 1][j], MF3x4(-0.086045384, 0.695562, -0.23519892, -0.23438415, 0.16208446, 0.2172693, -0.16647956, -0.3718635, 0.024940055, 0.5650778, 0.20409326, -0.13530363), result); + result = MulAdd(src[i + 1][j + 1], MF3x4(-0.19389555, -0.028506106, -0.35060602, 0.22244014, 0.055054635, -0.17651209, -0.19871834, -0.02667603, -0.1402023, -0.02455308, -0.57856905, -0.2174221), result); tex2[destPos] = result; } } @@ -187,25 +190,25 @@ void Pass2(uint2 blockStart, uint3 threadId) { // [ a, d, g ] // [ b, e, h ] // [ c, f, i ] - min16float4 a1 = tex1.SampleLevel(sam, pos - inputPt, 0); - min16float4 b1 = tex1.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); - min16float4 c1 = tex1.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); - min16float4 d1 = tex1.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); - min16float4 e1 = tex1.SampleLevel(sam, pos, 0); - min16float4 f1 = tex1.SampleLevel(sam, pos + float2(0, inputPt.y), 0); - min16float4 g1 = tex1.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); - min16float4 h1 = tex1.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); - min16float4 i1 = tex1.SampleLevel(sam, pos + inputPt, 0); - - min16float4 na1 = max(-a1, 0); - min16float4 nb1 = max(-b1, 0); - min16float4 nc1 = max(-c1, 0); - min16float4 nd1 = max(-d1, 0); - min16float4 ne1 = max(-e1, 0); - min16float4 nf1 = max(-f1, 0); - min16float4 ng1 = max(-g1, 0); - min16float4 nh1 = max(-h1, 0); - min16float4 ni1 = max(-i1, 0); + MF4 a1 = tex1.SampleLevel(sam, pos - inputPt, 0); + MF4 b1 = tex1.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); + MF4 c1 = tex1.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); + MF4 d1 = tex1.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); + MF4 e1 = tex1.SampleLevel(sam, pos, 0); + MF4 f1 = tex1.SampleLevel(sam, pos + float2(0, inputPt.y), 0); + MF4 g1 = tex1.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); + MF4 h1 = tex1.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); + MF4 i1 = tex1.SampleLevel(sam, pos + inputPt, 0); + + MF4 na1 = max(-a1, 0); + MF4 nb1 = max(-b1, 0); + MF4 nc1 = max(-c1, 0); + MF4 nd1 = max(-d1, 0); + MF4 ne1 = max(-e1, 0); + MF4 nf1 = max(-f1, 0); + MF4 ng1 = max(-g1, 0); + MF4 nh1 = max(-h1, 0); + MF4 ni1 = max(-i1, 0); a1 = max(a1, 0); b1 = max(b1, 0); @@ -217,25 +220,25 @@ void Pass2(uint2 blockStart, uint3 threadId) { h1 = max(h1, 0); i1 = max(i1, 0); - min16float4 a2 = tex2.SampleLevel(sam, pos - inputPt, 0); - min16float4 b2 = tex2.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); - min16float4 c2 = tex2.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); - min16float4 d2 = tex2.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); - min16float4 e2 = tex2.SampleLevel(sam, pos, 0); - min16float4 f2 = tex2.SampleLevel(sam, pos + float2(0, inputPt.y), 0); - min16float4 g2 = tex2.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); - min16float4 h2 = tex2.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); - min16float4 i2 = tex2.SampleLevel(sam, pos + inputPt, 0); - - min16float4 na2 = max(-a2, 0); - min16float4 nb2 = max(-b2, 0); - min16float4 nc2 = max(-c2, 0); - min16float4 nd2 = max(-d2, 0); - min16float4 ne2 = max(-e2, 0); - min16float4 nf2 = max(-f2, 0); - min16float4 ng2 = max(-g2, 0); - min16float4 nh2 = max(-h2, 0); - min16float4 ni2 = max(-i2, 0); + MF4 a2 = tex2.SampleLevel(sam, pos - inputPt, 0); + MF4 b2 = tex2.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); + MF4 c2 = tex2.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); + MF4 d2 = tex2.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); + MF4 e2 = tex2.SampleLevel(sam, pos, 0); + MF4 f2 = tex2.SampleLevel(sam, pos + float2(0, inputPt.y), 0); + MF4 g2 = tex2.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); + MF4 h2 = tex2.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); + MF4 i2 = tex2.SampleLevel(sam, pos + inputPt, 0); + + MF4 na2 = max(-a2, 0); + MF4 nb2 = max(-b2, 0); + MF4 nc2 = max(-c2, 0); + MF4 nd2 = max(-d2, 0); + MF4 ne2 = max(-e2, 0); + MF4 nf2 = max(-f2, 0); + MF4 ng2 = max(-g2, 0); + MF4 nh2 = max(-h2, 0); + MF4 ni2 = max(-i2, 0); a2 = max(a2, 0); b2 = max(b2, 0); @@ -247,107 +250,107 @@ void Pass2(uint2 blockStart, uint3 threadId) { h2 = max(h2, 0); i2 = max(i2, 0); - min16float4 conv2d_2_tf = mul(a1, min16float4x4(-0.14656883, -0.044076134, -0.40314636, -0.08023388, 0.12564746, -0.21633625, -0.0210282, -0.19231434, -0.019945038, 0.020343186, -0.007134301, 0.013607319, 0.07334655, -0.050848506, 0.0011201366, 0.26975143)); - conv2d_2_tf += mul(b1, min16float4x4(-0.043205153, -0.13764456, -0.5368405, -0.04096279, 0.009450832, 0.23953767, -0.022408254, -0.124040656, 0.53450584, 0.02690831, 0.39857075, 0.42423433, 0.014167992, 0.055189077, -0.038074926, 0.12800713)); - conv2d_2_tf += mul(c1, min16float4x4(-0.05354771, -0.06626498, 0.0092389295, 0.100637995, 0.05051714, -0.0033487207, -0.0076860636, 0.013058279, 0.10727092, -0.31131467, 0.058990292, 0.46365786, 0.08736531, 0.038865663, -0.008022449, -0.067517675)); - conv2d_2_tf += mul(d1, min16float4x4(-0.25327486, -0.0041089035, 0.04877498, -0.36375836, 0.0003920389, -0.09273049, 0.016388323, 0.11530572, -0.14216854, 0.07370458, -0.27584067, -0.34536567, 0.0848517, -0.1954229, -0.22656868, -0.13531597)); - conv2d_2_tf += mul(e1, min16float4x4(0.7035245, 0.1131446, 0.1833189, 0.63726306, -0.20649737, 0.14149575, -0.084267326, 0.020898562, -0.026810758, -0.17932594, -0.08032681, 0.07790513, -0.01148237, -0.19930641, 0.33902612, -0.013703277)); - conv2d_2_tf += mul(f1, min16float4x4(-0.2862842, 0.01491211, -0.30474076, 0.19604082, 0.21565811, 0.29193363, 0.024934597, -0.17113955, 0.26136434, -0.12819171, 0.3874644, -0.30533502, 0.004006889, -0.07340657, -0.04817435, -0.013651047)); - conv2d_2_tf += mul(g1, min16float4x4(-0.14331155, -0.09605764, -0.06941299, -0.09740676, 0.0059936745, -0.27215815, -0.31393203, 0.17594862, 0.045626156, 0.060231503, 0.10607796, -0.030635068, 0.15021041, -0.19662435, -0.14648037, 0.028361326)); - conv2d_2_tf += mul(h1, min16float4x4(0.25090003, -0.2845429, -0.30109838, -0.070956856, -0.08051349, -0.07526823, 0.13524723, 0.14151429, -0.1232367, 0.08824123, 0.28804728, 0.31701297, 0.014839836, -0.09193038, 0.30188346, -0.07903937)); - conv2d_2_tf += mul(i1, min16float4x4(0.21150468, 0.13863595, -0.2624825, 0.1652623, -0.026336774, -0.45599234, 0.015901498, 0.23009071, 0.19700526, -0.11013044, 0.19850798, -0.19702299, 0.060643747, -0.03162518, -0.18272553, 0.24863112)); - conv2d_2_tf += mul(a2, min16float4x4(0.16361383, 0.0028921412, 0.18107067, 0.0720563, 0.06378758, -0.09442821, -0.3054202, 0.06843394, 0.20913927, -0.17700543, 0.14682317, 0.21683829, 0.02948067, -0.34866366, -0.04474257, -0.011365872)); - conv2d_2_tf += mul(b2, min16float4x4(0.008512374, 0.19717449, 0.4456541, -0.15356806, -0.24209222, 0.12543896, -0.18232138, 0.012759448, 0.052473016, 0.17268041, 0.25826934, -0.16848944, 0.10150518, -0.30244592, 0.38495708, -0.2090818)); - conv2d_2_tf += mul(c2, min16float4x4(-0.07227807, -0.10066125, -0.1090768, 0.13579647, 0.023154313, 0.079166815, -0.20014893, -0.21884407, 0.09634875, -0.22551452, 0.20771019, 0.16381831, -0.23455033, 0.12578821, -0.43342614, -0.23609087)); - conv2d_2_tf += mul(d2, min16float4x4(-0.11084086, -0.03875876, -0.17912252, -0.24158017, 0.070904315, 0.21862641, 0.02659038, -0.36572614, 0.06265698, 0.32029516, 0.12044166, 0.18424052, 0.050192874, 0.15095103, 0.13794746, -0.111053675)); - conv2d_2_tf += mul(e2, min16float4x4(-0.11362966, 0.5249116, 0.27814335, -0.023295242, 0.022581467, 0.3195408, -0.06865207, -0.13818301, 0.18826036, 0.21182717, -0.30241874, 0.02916674, -0.19999875, 0.8222055, -0.2981789, -0.31122693)); - conv2d_2_tf += mul(f2, min16float4x4(0.058648925, -0.39456168, -0.36158726, 0.4050607, 0.0609484, 0.01624418, -0.2699451, 0.25976416, 0.31131884, 0.18382475, 0.12856431, 0.3285595, 0.4798488, -0.26074353, 0.78901637, -0.071622506)); - conv2d_2_tf += mul(g2, min16float4x4(-0.038631868, -0.20723929, 0.045573164, 0.10398485, 0.20236868, 0.14958549, 0.18842755, -0.23352885, 0.18624173, 0.2800279, 0.23280786, -0.12909916, -0.037398554, 0.1557195, -0.04866289, -0.13633357)); - conv2d_2_tf += mul(h2, min16float4x4(-0.15441336, 0.0968205, -0.32649723, -0.021546176, -0.10667603, 0.18065608, 0.017242601, 0.027690934, -0.23079967, 0.093206555, -0.11170116, 0.19002458, -0.352287, 0.008375842, 0.2459501, -0.09389683)); - conv2d_2_tf += mul(i2, min16float4x4(0.2130623, -0.4781421, -0.53600657, 0.44947717, -0.018234696, -0.17257519, -0.063182175, 0.22729957, -0.037309792, 0.13939567, -0.013829814, -0.20586358, 0.052985236, -0.04452726, 0.1880475, 0.096934296)); - conv2d_2_tf += mul(na1, min16float4x4(0.026266143, -0.03171053, 0.2277772, 0.01093641, -0.007701242, 0.115488306, 0.029304042, 0.33619022, 0.14467055, 0.075788446, -0.076583475, -0.051929206, 0.02211152, 0.031270072, -0.075583085, -0.20198274)); - conv2d_2_tf += mul(nb1, min16float4x4(-0.010648877, 0.21413183, 0.24339998, -0.22960022, -0.16156821, -0.45364898, -0.105244495, -0.07713787, -0.31945667, -0.097204186, -0.2457385, 0.04241939, -0.16228637, 0.13461526, 0.009693403, -0.13537757)); - conv2d_2_tf += mul(nc1, min16float4x4(0.058250688, 0.007912516, -0.071061306, 0.01889538, -0.14592043, -0.10374761, 0.07840785, 0.008756123, -0.045008816, 0.05261628, -0.2615482, -0.01929421, -0.23048545, 0.010220507, -0.16385053, 0.031251106)); - conv2d_2_tf += mul(nd1, min16float4x4(-0.03350765, 0.0737811, -0.09780837, -0.031780828, -0.1919008, 0.36382285, 0.19377235, -0.2797014, -0.12267188, 0.023496462, 0.38848102, -0.010005188, -0.09733866, 0.51535326, 0.47232744, 0.0073942994)); - conv2d_2_tf += mul(ne1, min16float4x4(-0.27284998, 0.14916854, -0.25612846, -0.029941292, 0.18539569, -0.43832946, -0.119871736, 0.044226155, -0.106426276, 0.05740293, -0.046056107, -0.17616963, -0.52316684, 0.33400205, -0.08133327, 0.0948221)); - conv2d_2_tf += mul(nf1, min16float4x4(0.32683802, -0.26026967, 0.19948171, -0.011760837, -0.30256173, -0.45944482, 0.051236197, 0.84710604, -0.08078167, 0.2675028, -0.27241448, 0.27764642, 0.13335843, 0.068502, -0.033614077, 0.19930291)); - conv2d_2_tf += mul(ng1, min16float4x4(0.07075588, 0.029963106, 0.055358, 0.042518128, -0.1441339, 0.42236832, 0.1387107, -0.40421516, 0.02318193, -0.36765453, -0.21558793, 0.21393713, 0.31122518, -0.3358225, -0.4967671, 0.46344024)); - conv2d_2_tf += mul(nh1, min16float4x4(-0.28364134, 0.19475235, 0.42310834, 0.060645495, -0.14013693, -0.049322303, -0.09870014, 0.23229486, -0.033104394, -0.37716264, -0.18488638, 0.17441164, -0.24427529, -0.26787207, -0.16721556, -0.10374529)); - conv2d_2_tf += mul(ni1, min16float4x4(-0.3376618, -0.09682554, 0.3423445, 0.047880173, 0.3354013, -0.21854481, -0.40352795, 0.1841921, 0.008460585, -0.03459756, -0.22880521, 0.35112804, -0.01764322, -0.16448145, 0.107058726, -0.28482538)); - conv2d_2_tf += mul(na2, min16float4x4(-0.032480888, 0.0034003556, -0.032999255, 0.16414961, 0.098690405, 0.0887987, 0.32215804, -0.002440519, -0.16814353, 0.0029867117, -0.28380692, 0.060728613, 0.15944195, 0.16642234, 0.110365815, 0.22413619)); - conv2d_2_tf += mul(nb2, min16float4x4(-0.088509634, 0.047311794, -0.30038288, -0.27227867, 0.41235012, 0.23889793, 0.7280631, 0.13555974, -0.08230139, 0.09955461, -0.13654864, 0.0314745, -0.275061, -0.10253638, -0.34706068, 0.03781376)); - conv2d_2_tf += mul(nc2, min16float4x4(0.09819424, -0.017704371, -0.031446967, 0.061441656, -0.110502265, -0.19236599, 0.2783733, 0.12798637, -0.047672354, -0.018956421, -0.17555775, -0.018790504, 0.43967727, -0.62039405, 0.08790998, 0.4353703)); - conv2d_2_tf += mul(nd2, min16float4x4(-0.019217307, 0.14623284, 0.015177701, 0.15983194, -0.106374666, -0.0131188845, 0.033161264, 0.41326195, 0.052029386, -0.11639186, -0.026856689, -0.020853983, -0.024652582, -0.12368135, -0.39344305, 0.17345576)); - conv2d_2_tf += mul(ne2, min16float4x4(-0.047131967, -0.28568837, 0.4201909, -0.28901812, -0.13973507, 0.03312194, -0.16265458, -0.10710893, 0.21189946, -0.32837728, 0.12424836, -0.30587387, 0.036961686, -0.8623908, 0.3661179, -0.1692949)); - conv2d_2_tf += mul(nf2, min16float4x4(0.1143412, 0.07707313, 0.3981437, -0.17059685, -0.094056316, -0.27234176, 0.12281097, -0.16966031, -0.1512859, -0.0524175, 0.1654043, 0.13700214, -0.3156236, -0.27636334, -0.52670264, 0.9250529)); - conv2d_2_tf += mul(ng2, min16float4x4(0.16162306, -0.15842794, -0.06699449, 0.059618954, 0.06798694, -0.060685594, -0.14878511, 0.17194197, -0.05110082, -0.12152871, -0.2020905, 0.09337634, 0.0602552, -0.07327089, 0.07043988, 0.15926042)); - conv2d_2_tf += mul(nh2, min16float4x4(-0.10312201, -0.13890414, -0.07694594, -0.29262447, 0.0597966, -0.228, -0.00046558332, 0.09373052, 0.2520174, -0.2992283, -0.01796473, -0.052195024, 0.09554047, -0.25678295, -0.38657847, 0.16130428)); - conv2d_2_tf += mul(ni2, min16float4x4(0.21114396, -0.64854, -0.52819866, -0.67061704, 0.05760163, -0.121914886, 0.05448798, -0.1352843, 0.007051261, 0.065677196, -0.09763541, 0.032613076, -0.17908493, -0.7194699, -0.6342276, 0.031814635)); - conv2d_2_tf += min16float4(0.051319666, 0.019196881, 0.0759832, 0.050857317); - min16float4 nconv2d_2_tf = max(-conv2d_2_tf, 0); + MF4 conv2d_2_tf = MF4(0.051319666, 0.019196881, 0.0759832, 0.050857317); + conv2d_2_tf = MulAdd(a1, MF4x4(-0.14656883, -0.044076134, -0.40314636, -0.08023388, 0.12564746, -0.21633625, -0.0210282, -0.19231434, -0.019945038, 0.020343186, -0.007134301, 0.013607319, 0.07334655, -0.050848506, 0.0011201366, 0.26975143), conv2d_2_tf); + conv2d_2_tf = MulAdd(b1, MF4x4(-0.043205153, -0.13764456, -0.5368405, -0.04096279, 0.009450832, 0.23953767, -0.022408254, -0.124040656, 0.53450584, 0.02690831, 0.39857075, 0.42423433, 0.014167992, 0.055189077, -0.038074926, 0.12800713), conv2d_2_tf); + conv2d_2_tf = MulAdd(c1, MF4x4(-0.05354771, -0.06626498, 0.0092389295, 0.100637995, 0.05051714, -0.0033487207, -0.0076860636, 0.013058279, 0.10727092, -0.31131467, 0.058990292, 0.46365786, 0.08736531, 0.038865663, -0.008022449, -0.067517675), conv2d_2_tf); + conv2d_2_tf = MulAdd(d1, MF4x4(-0.25327486, -0.0041089035, 0.04877498, -0.36375836, 0.0003920389, -0.09273049, 0.016388323, 0.11530572, -0.14216854, 0.07370458, -0.27584067, -0.34536567, 0.0848517, -0.1954229, -0.22656868, -0.13531597), conv2d_2_tf); + conv2d_2_tf = MulAdd(e1, MF4x4(0.7035245, 0.1131446, 0.1833189, 0.63726306, -0.20649737, 0.14149575, -0.084267326, 0.020898562, -0.026810758, -0.17932594, -0.08032681, 0.07790513, -0.01148237, -0.19930641, 0.33902612, -0.013703277), conv2d_2_tf); + conv2d_2_tf = MulAdd(f1, MF4x4(-0.2862842, 0.01491211, -0.30474076, 0.19604082, 0.21565811, 0.29193363, 0.024934597, -0.17113955, 0.26136434, -0.12819171, 0.3874644, -0.30533502, 0.004006889, -0.07340657, -0.04817435, -0.013651047), conv2d_2_tf); + conv2d_2_tf = MulAdd(g1, MF4x4(-0.14331155, -0.09605764, -0.06941299, -0.09740676, 0.0059936745, -0.27215815, -0.31393203, 0.17594862, 0.045626156, 0.060231503, 0.10607796, -0.030635068, 0.15021041, -0.19662435, -0.14648037, 0.028361326), conv2d_2_tf); + conv2d_2_tf = MulAdd(h1, MF4x4(0.25090003, -0.2845429, -0.30109838, -0.070956856, -0.08051349, -0.07526823, 0.13524723, 0.14151429, -0.1232367, 0.08824123, 0.28804728, 0.31701297, 0.014839836, -0.09193038, 0.30188346, -0.07903937), conv2d_2_tf); + conv2d_2_tf = MulAdd(i1, MF4x4(0.21150468, 0.13863595, -0.2624825, 0.1652623, -0.026336774, -0.45599234, 0.015901498, 0.23009071, 0.19700526, -0.11013044, 0.19850798, -0.19702299, 0.060643747, -0.03162518, -0.18272553, 0.24863112), conv2d_2_tf); + conv2d_2_tf = MulAdd(a2, MF4x4(0.16361383, 0.0028921412, 0.18107067, 0.0720563, 0.06378758, -0.09442821, -0.3054202, 0.06843394, 0.20913927, -0.17700543, 0.14682317, 0.21683829, 0.02948067, -0.34866366, -0.04474257, -0.011365872), conv2d_2_tf); + conv2d_2_tf = MulAdd(b2, MF4x4(0.008512374, 0.19717449, 0.4456541, -0.15356806, -0.24209222, 0.12543896, -0.18232138, 0.012759448, 0.052473016, 0.17268041, 0.25826934, -0.16848944, 0.10150518, -0.30244592, 0.38495708, -0.2090818), conv2d_2_tf); + conv2d_2_tf = MulAdd(c2, MF4x4(-0.07227807, -0.10066125, -0.1090768, 0.13579647, 0.023154313, 0.079166815, -0.20014893, -0.21884407, 0.09634875, -0.22551452, 0.20771019, 0.16381831, -0.23455033, 0.12578821, -0.43342614, -0.23609087), conv2d_2_tf); + conv2d_2_tf = MulAdd(d2, MF4x4(-0.11084086, -0.03875876, -0.17912252, -0.24158017, 0.070904315, 0.21862641, 0.02659038, -0.36572614, 0.06265698, 0.32029516, 0.12044166, 0.18424052, 0.050192874, 0.15095103, 0.13794746, -0.111053675), conv2d_2_tf); + conv2d_2_tf = MulAdd(e2, MF4x4(-0.11362966, 0.5249116, 0.27814335, -0.023295242, 0.022581467, 0.3195408, -0.06865207, -0.13818301, 0.18826036, 0.21182717, -0.30241874, 0.02916674, -0.19999875, 0.8222055, -0.2981789, -0.31122693), conv2d_2_tf); + conv2d_2_tf = MulAdd(f2, MF4x4(0.058648925, -0.39456168, -0.36158726, 0.4050607, 0.0609484, 0.01624418, -0.2699451, 0.25976416, 0.31131884, 0.18382475, 0.12856431, 0.3285595, 0.4798488, -0.26074353, 0.78901637, -0.071622506), conv2d_2_tf); + conv2d_2_tf = MulAdd(g2, MF4x4(-0.038631868, -0.20723929, 0.045573164, 0.10398485, 0.20236868, 0.14958549, 0.18842755, -0.23352885, 0.18624173, 0.2800279, 0.23280786, -0.12909916, -0.037398554, 0.1557195, -0.04866289, -0.13633357), conv2d_2_tf); + conv2d_2_tf = MulAdd(h2, MF4x4(-0.15441336, 0.0968205, -0.32649723, -0.021546176, -0.10667603, 0.18065608, 0.017242601, 0.027690934, -0.23079967, 0.093206555, -0.11170116, 0.19002458, -0.352287, 0.008375842, 0.2459501, -0.09389683), conv2d_2_tf); + conv2d_2_tf = MulAdd(i2, MF4x4(0.2130623, -0.4781421, -0.53600657, 0.44947717, -0.018234696, -0.17257519, -0.063182175, 0.22729957, -0.037309792, 0.13939567, -0.013829814, -0.20586358, 0.052985236, -0.04452726, 0.1880475, 0.096934296), conv2d_2_tf); + conv2d_2_tf = MulAdd(na1, MF4x4(0.026266143, -0.03171053, 0.2277772, 0.01093641, -0.007701242, 0.115488306, 0.029304042, 0.33619022, 0.14467055, 0.075788446, -0.076583475, -0.051929206, 0.02211152, 0.031270072, -0.075583085, -0.20198274), conv2d_2_tf); + conv2d_2_tf = MulAdd(nb1, MF4x4(-0.010648877, 0.21413183, 0.24339998, -0.22960022, -0.16156821, -0.45364898, -0.105244495, -0.07713787, -0.31945667, -0.097204186, -0.2457385, 0.04241939, -0.16228637, 0.13461526, 0.009693403, -0.13537757), conv2d_2_tf); + conv2d_2_tf = MulAdd(nc1, MF4x4(0.058250688, 0.007912516, -0.071061306, 0.01889538, -0.14592043, -0.10374761, 0.07840785, 0.008756123, -0.045008816, 0.05261628, -0.2615482, -0.01929421, -0.23048545, 0.010220507, -0.16385053, 0.031251106), conv2d_2_tf); + conv2d_2_tf = MulAdd(nd1, MF4x4(-0.03350765, 0.0737811, -0.09780837, -0.031780828, -0.1919008, 0.36382285, 0.19377235, -0.2797014, -0.12267188, 0.023496462, 0.38848102, -0.010005188, -0.09733866, 0.51535326, 0.47232744, 0.0073942994), conv2d_2_tf); + conv2d_2_tf = MulAdd(ne1, MF4x4(-0.27284998, 0.14916854, -0.25612846, -0.029941292, 0.18539569, -0.43832946, -0.119871736, 0.044226155, -0.106426276, 0.05740293, -0.046056107, -0.17616963, -0.52316684, 0.33400205, -0.08133327, 0.0948221), conv2d_2_tf); + conv2d_2_tf = MulAdd(nf1, MF4x4(0.32683802, -0.26026967, 0.19948171, -0.011760837, -0.30256173, -0.45944482, 0.051236197, 0.84710604, -0.08078167, 0.2675028, -0.27241448, 0.27764642, 0.13335843, 0.068502, -0.033614077, 0.19930291), conv2d_2_tf); + conv2d_2_tf = MulAdd(ng1, MF4x4(0.07075588, 0.029963106, 0.055358, 0.042518128, -0.1441339, 0.42236832, 0.1387107, -0.40421516, 0.02318193, -0.36765453, -0.21558793, 0.21393713, 0.31122518, -0.3358225, -0.4967671, 0.46344024), conv2d_2_tf); + conv2d_2_tf = MulAdd(nh1, MF4x4(-0.28364134, 0.19475235, 0.42310834, 0.060645495, -0.14013693, -0.049322303, -0.09870014, 0.23229486, -0.033104394, -0.37716264, -0.18488638, 0.17441164, -0.24427529, -0.26787207, -0.16721556, -0.10374529), conv2d_2_tf); + conv2d_2_tf = MulAdd(ni1, MF4x4(-0.3376618, -0.09682554, 0.3423445, 0.047880173, 0.3354013, -0.21854481, -0.40352795, 0.1841921, 0.008460585, -0.03459756, -0.22880521, 0.35112804, -0.01764322, -0.16448145, 0.107058726, -0.28482538), conv2d_2_tf); + conv2d_2_tf = MulAdd(na2, MF4x4(-0.032480888, 0.0034003556, -0.032999255, 0.16414961, 0.098690405, 0.0887987, 0.32215804, -0.002440519, -0.16814353, 0.0029867117, -0.28380692, 0.060728613, 0.15944195, 0.16642234, 0.110365815, 0.22413619), conv2d_2_tf); + conv2d_2_tf = MulAdd(nb2, MF4x4(-0.088509634, 0.047311794, -0.30038288, -0.27227867, 0.41235012, 0.23889793, 0.7280631, 0.13555974, -0.08230139, 0.09955461, -0.13654864, 0.0314745, -0.275061, -0.10253638, -0.34706068, 0.03781376), conv2d_2_tf); + conv2d_2_tf = MulAdd(nc2, MF4x4(0.09819424, -0.017704371, -0.031446967, 0.061441656, -0.110502265, -0.19236599, 0.2783733, 0.12798637, -0.047672354, -0.018956421, -0.17555775, -0.018790504, 0.43967727, -0.62039405, 0.08790998, 0.4353703), conv2d_2_tf); + conv2d_2_tf = MulAdd(nd2, MF4x4(-0.019217307, 0.14623284, 0.015177701, 0.15983194, -0.106374666, -0.0131188845, 0.033161264, 0.41326195, 0.052029386, -0.11639186, -0.026856689, -0.020853983, -0.024652582, -0.12368135, -0.39344305, 0.17345576), conv2d_2_tf); + conv2d_2_tf = MulAdd(ne2, MF4x4(-0.047131967, -0.28568837, 0.4201909, -0.28901812, -0.13973507, 0.03312194, -0.16265458, -0.10710893, 0.21189946, -0.32837728, 0.12424836, -0.30587387, 0.036961686, -0.8623908, 0.3661179, -0.1692949), conv2d_2_tf); + conv2d_2_tf = MulAdd(nf2, MF4x4(0.1143412, 0.07707313, 0.3981437, -0.17059685, -0.094056316, -0.27234176, 0.12281097, -0.16966031, -0.1512859, -0.0524175, 0.1654043, 0.13700214, -0.3156236, -0.27636334, -0.52670264, 0.9250529), conv2d_2_tf); + conv2d_2_tf = MulAdd(ng2, MF4x4(0.16162306, -0.15842794, -0.06699449, 0.059618954, 0.06798694, -0.060685594, -0.14878511, 0.17194197, -0.05110082, -0.12152871, -0.2020905, 0.09337634, 0.0602552, -0.07327089, 0.07043988, 0.15926042), conv2d_2_tf); + conv2d_2_tf = MulAdd(nh2, MF4x4(-0.10312201, -0.13890414, -0.07694594, -0.29262447, 0.0597966, -0.228, -0.00046558332, 0.09373052, 0.2520174, -0.2992283, -0.01796473, -0.052195024, 0.09554047, -0.25678295, -0.38657847, 0.16130428), conv2d_2_tf); + conv2d_2_tf = MulAdd(ni2, MF4x4(0.21114396, -0.64854, -0.52819866, -0.67061704, 0.05760163, -0.121914886, 0.05448798, -0.1352843, 0.007051261, 0.065677196, -0.09763541, 0.032613076, -0.17908493, -0.7194699, -0.6342276, 0.031814635), conv2d_2_tf); + MF4 nconv2d_2_tf = max(-conv2d_2_tf, 0); conv2d_2_tf = max(conv2d_2_tf, 0); - min16float4 conv2d_1_tf = mul(a1, min16float4x4(0.10187621, 0.11053595, 0.14810364, -0.18582201, 0.16617906, -0.011798966, 0.09280227, 0.13307849, -0.044728525, 0.10914104, 0.075626835, -0.10416733, -0.094498746, -0.06870642, -0.07571491, 0.04897303)); - conv2d_1_tf += mul(b1, min16float4x4(0.33485547, 0.03678466, -0.29866266, -0.048795477, -0.010474432, -0.10252797, 0.036609326, -0.013254512, -0.14475596, 0.011886287, 0.11828754, -0.13557065, -0.28870094, -0.17330378, 0.044048756, -0.019826433)); - conv2d_1_tf += mul(c1, min16float4x4(0.105582856, -0.039765045, 0.0818729, 0.09955303, 0.023201315, 0.09243788, 0.07389467, -0.012808492, 0.0492865, 0.19755632, -0.06548781, 0.08533675, -0.013952, 0.017339202, -0.20518751, -0.054678205)); - conv2d_1_tf += mul(d1, min16float4x4(-0.26653445, 0.04810761, -0.23108084, -0.19818014, 0.23671885, 0.016349426, 0.0045669116, 0.077428445, -0.140711, 0.11972277, 0.101062275, -0.18716832, -0.190941, -0.34035257, -0.09143259, 0.04359683)); - conv2d_1_tf += mul(e1, min16float4x4(-0.14573975, 0.23356283, -0.3772715, -0.22460096, -0.053278442, 0.069576025, 0.05169695, 0.17249753, 0.028048603, -0.25471392, -0.09931249, 0.2095619, 0.22173007, 0.38787642, -0.30738685, 0.01936576)); - conv2d_1_tf += mul(f1, min16float4x4(0.081078954, -0.16813248, 0.1542311, 0.17158946, -0.15383756, 0.025605323, 0.2360881, -0.14753577, -0.016537111, 0.048651446, -0.35849985, 0.01651406, 0.17044473, 0.13180882, 0.324054, -0.18812656)); - conv2d_1_tf += mul(g1, min16float4x4(-0.15537027, -0.08164218, 0.049979087, -0.31885874, -0.15126401, -0.14352658, 0.18948728, 0.020951044, 0.054829888, -0.18936221, -0.22699763, 0.14384085, 0.055476833, -0.011031805, -0.23653851, 0.02768591)); - conv2d_1_tf += mul(h1, min16float4x4(-0.34108123, -0.28492066, 0.50347435, 0.0034134283, 0.041766707, 0.12375689, -0.08600751, 0.22726676, 0.10521852, 0.16621545, 0.038216297, 0.029870255, 0.07065742, -0.03542451, 0.38924676, -0.117774665)); - conv2d_1_tf += mul(i1, min16float4x4(-0.19437145, -0.01827461, 0.15408134, -0.14991991, 0.13832837, 0.0668659, 0.092678316, 0.05341174, 0.21633142, 0.09575402, -0.111060366, -0.00874764, -0.21256353, -0.052944425, 0.16459747, 0.07091838)); - conv2d_1_tf += mul(a2, min16float4x4(0.022236984, 0.19067548, 0.049743406, 0.05148808, 0.23003219, 0.08688227, 0.030773275, -0.059972208, -0.039038613, 0.21701579, -0.11092254, -0.10850967, -0.17777155, -0.20399293, -0.006843039, 0.24139926)); - conv2d_1_tf += mul(b2, min16float4x4(-0.07928885, -0.011657496, -0.03982505, -0.031084592, -0.09403157, -0.13860224, 0.15166754, 0.1279725, -0.084909394, 0.18945958, 0.055481352, -0.24365151, -0.04130202, 0.105171725, -0.47306657, -0.2218246)); - conv2d_1_tf += mul(c2, min16float4x4(-0.06171395, 0.0029490888, 0.055825688, -0.01362009, 0.045571987, -0.04197536, -0.024671398, -0.11600467, 0.02611751, -0.06675449, 0.38841903, 0.109969236, 0.1846224, -0.22673915, -0.11488994, -0.18271959)); - conv2d_1_tf += mul(d2, min16float4x4(-0.08073766, -0.1512685, 0.09596278, 0.061552938, -0.23016383, 0.044725727, -0.1058148, -0.09081257, 0.25391936, 0.13075152, 0.1153331, 0.035533328, 0.14628118, 0.053434838, -0.061957166, -0.11092296)); - conv2d_1_tf += mul(e2, min16float4x4(0.004972408, 0.26720062, -0.0014180156, -0.15569477, 0.08964792, 0.39218047, -0.113748655, -0.20653862, -0.0182982, -0.009456181, 0.096566215, 0.19871894, -0.45192167, -0.19494532, 0.5282211, -0.033234302)); - conv2d_1_tf += mul(f2, min16float4x4(0.11633487, 0.055492207, -0.09550419, 0.019721292, 0.05191187, 0.110391244, 0.13541168, 0.108687185, -0.3231262, -0.071254596, 0.12103068, -0.063508354, 0.16086432, 0.22202429, -0.2793211, -0.059888415)); - conv2d_1_tf += mul(g2, min16float4x4(0.09845572, -0.11364447, -0.06817361, 0.20479278, 0.008171668, -0.10222864, -0.12512983, 0.11285637, 0.2092848, 0.12593135, -0.054839488, 0.1560058, 0.109415986, -0.04229047, -0.21525817, 0.10153635)); - conv2d_1_tf += mul(h2, min16float4x4(-0.26443723, 0.18267378, 0.2874903, -0.15007962, 0.23901714, -0.039331976, -0.4055973, 0.18869716, 0.060133275, -0.030050457, -0.16689767, -0.024223989, 0.43243858, -0.004281818, -0.5925553, 0.08473984)); - conv2d_1_tf += mul(i2, min16float4x4(-0.11769163, -0.6005158, -0.0700652, 0.0062212353, -0.022391787, 0.08070833, 0.10332995, 0.100591965, 0.1680161, 0.1209537, -0.11606606, -0.0032385625, -0.30508906, -0.11541758, 0.27825746, 0.18774803)); - conv2d_1_tf += mul(na1, min16float4x4(-0.06629365, -0.14032914, -0.2580204, 0.18303558, -0.1916567, 0.029803488, -0.12213443, -0.07165115, 0.012936617, -0.11358297, -0.19138688, 0.10422416, 0.18062063, 0.14369549, 0.10535131, -0.036331207)); - conv2d_1_tf += mul(nb1, min16float4x4(-0.23739359, -0.14102252, 0.16535138, -0.055494435, 0.11510639, -0.02530117, 0.13571805, -0.11962709, 0.14311576, -0.11346015, -0.053082045, 0.23039193, 0.2412315, 0.34595123, -0.057626486, 0.1273758)); - conv2d_1_tf += mul(nc1, min16float4x4(-0.031894613, 0.04056866, -0.14806709, -0.061261263, -0.05113628, -0.150074, -0.05885426, 0.025318084, -0.028839143, -0.14976048, -0.061418023, -0.10849576, 0.10669465, 0.025044547, 0.13002798, 0.033596892)); - conv2d_1_tf += mul(nd1, min16float4x4(0.31830126, -0.109857574, 0.022382054, 0.19084917, -0.21992075, -0.06509279, 0.04586319, -0.10979886, 0.07565896, 0.008375114, -0.025531407, 0.112079956, 0.32532254, 0.39258766, 0.15983114, -0.047324624)); - conv2d_1_tf += mul(ne1, min16float4x4(0.06333816, -0.43997836, 0.28480944, -0.037927028, -0.16247569, 0.14209846, -0.5309942, -0.23058164, -0.18387268, 0.3324917, 0.010288075, -0.2516956, -0.42476243, -0.19866063, 0.32058033, 0.052254338)); - conv2d_1_tf += mul(nf1, min16float4x4(-0.019851776, 0.17185202, -0.14713249, -0.1373522, 0.23155597, -0.009191596, -0.15395427, 0.24423079, -0.11106813, -0.034888845, 0.17169674, -0.08786573, -0.08697707, -0.28842747, -0.25445274, 0.13578549)); - conv2d_1_tf += mul(ng1, min16float4x4(0.2099323, 0.09262897, -0.08977398, 0.30791095, 0.12376861, 0.24654338, -0.097672515, 0.008614657, 0.006388779, 0.076170854, 0.25119394, -0.12392118, 0.3138793, -0.015998395, 0.15131904, -0.3009305)); - conv2d_1_tf += mul(nh1, min16float4x4(0.33982292, 0.26557416, -0.3754559, -0.110353656, 0.08402225, -0.053171434, 0.051136248, -0.2696132, -0.14568366, -0.048726343, 0.06216166, 0.018804165, -0.084439, 0.15103953, -0.020082679, 0.15082058)); - conv2d_1_tf += mul(ni1, min16float4x4(0.14522389, -0.0462971, -0.10824406, 0.14163211, -0.08392773, -0.22920173, -0.23795773, -0.2580316, -0.22207144, -0.15956368, 0.12665017, -0.08286834, 0.09581649, 0.12603259, -0.15513468, -0.010735423)); - conv2d_1_tf += mul(na2, min16float4x4(0.00818024, -0.15539199, -0.011369519, 0.05717366, -0.25330603, -0.018393422, 0.027386196, 0.121692196, 0.059138533, -0.1631142, 0.10282322, 0.08011751, 0.10027271, 0.255391, 0.010682224, -0.3095357)); - conv2d_1_tf += mul(nb2, min16float4x4(0.117767766, 0.120644994, 0.09232613, -0.018057318, -0.038398392, 0.14537762, -0.016560853, -0.08958423, 0.06743331, -0.23562634, -0.123906426, 0.028323429, -0.09386831, -0.16833909, 0.019829117, -0.08108203)); - conv2d_1_tf += mul(nc2, min16float4x4(0.05462869, -0.031615634, -0.121678494, 0.05315917, -0.012636353, -0.13374922, 0.18577711, 0.0005971412, -0.099537544, -0.060773082, -0.28754288, -0.20077203, -0.15873533, -0.11387871, -0.17841183, -0.120239034)); - conv2d_1_tf += mul(nd2, min16float4x4(0.13845754, 0.223389, -0.20315485, -0.03479761, 0.1806296, 0.057029717, 0.010771242, 0.15245064, -0.0040082, 0.015283898, -0.34807077, 0.078581005, 0.026417086, -0.058825746, 0.07728649, 0.066044815)); - conv2d_1_tf += mul(ne2, min16float4x4(-0.13820273, -0.050027788, 0.061389934, 0.11189863, 0.008062022, -0.17326912, 0.18159898, 0.08510656, 0.22065656, 0.3918094, -0.05124615, -0.22959533, 0.85480285, 0.5621734, -0.817405, 0.065126896)); - conv2d_1_tf += mul(nf2, min16float4x4(-0.15309735, 0.1396192, 0.16662036, -0.10952867, -0.03473452, -0.08712044, -0.2422528, -0.19236326, 0.49887487, 0.2615184, -0.076631024, 0.16010238, -0.09836315, -0.27126545, 0.17968613, -0.21053861)); - conv2d_1_tf += mul(ng2, min16float4x4(-0.18809205, 0.050410215, 0.1418759, -0.2876976, -0.13414268, 0.07458343, 0.096421175, -0.060676426, -0.17345451, -0.13678914, -0.06512698, -0.102106765, -0.12989639, 0.09089589, 0.07377932, -0.07263102)); - conv2d_1_tf += mul(nh2, min16float4x4(0.45035192, 0.2393797, -0.045452517, -0.04553052, -0.26037264, -0.021321824, 0.24618645, -0.108074926, -0.030116243, 0.04612789, 0.2273845, -0.07468269, -0.48789972, 0.12628402, 1.0130231, -0.14672706)); - conv2d_1_tf += mul(ni2, min16float4x4(0.5591947, -0.0326075, 0.12768768, -0.7916967, 0.023168698, -0.042015456, -0.12410894, -0.033611402, -0.14815444, -0.124497496, 0.08198418, -0.014488041, 0.4252749, -0.20253694, 0.042329047, -0.50953263)); - conv2d_1_tf += min16float4(-0.048558664, 0.11006767, -0.074099846, -0.016021004); + MF4 conv2d_1_tf = MF4(-0.048558664, 0.11006767, -0.074099846, -0.016021004); + conv2d_1_tf = MulAdd(a1, MF4x4(0.10187621, 0.11053595, 0.14810364, -0.18582201, 0.16617906, -0.011798966, 0.09280227, 0.13307849, -0.044728525, 0.10914104, 0.075626835, -0.10416733, -0.094498746, -0.06870642, -0.07571491, 0.04897303), conv2d_1_tf); + conv2d_1_tf = MulAdd(b1, MF4x4(0.33485547, 0.03678466, -0.29866266, -0.048795477, -0.010474432, -0.10252797, 0.036609326, -0.013254512, -0.14475596, 0.011886287, 0.11828754, -0.13557065, -0.28870094, -0.17330378, 0.044048756, -0.019826433), conv2d_1_tf); + conv2d_1_tf = MulAdd(c1, MF4x4(0.105582856, -0.039765045, 0.0818729, 0.09955303, 0.023201315, 0.09243788, 0.07389467, -0.012808492, 0.0492865, 0.19755632, -0.06548781, 0.08533675, -0.013952, 0.017339202, -0.20518751, -0.054678205), conv2d_1_tf); + conv2d_1_tf = MulAdd(d1, MF4x4(-0.26653445, 0.04810761, -0.23108084, -0.19818014, 0.23671885, 0.016349426, 0.0045669116, 0.077428445, -0.140711, 0.11972277, 0.101062275, -0.18716832, -0.190941, -0.34035257, -0.09143259, 0.04359683), conv2d_1_tf); + conv2d_1_tf = MulAdd(e1, MF4x4(-0.14573975, 0.23356283, -0.3772715, -0.22460096, -0.053278442, 0.069576025, 0.05169695, 0.17249753, 0.028048603, -0.25471392, -0.09931249, 0.2095619, 0.22173007, 0.38787642, -0.30738685, 0.01936576), conv2d_1_tf); + conv2d_1_tf = MulAdd(f1, MF4x4(0.081078954, -0.16813248, 0.1542311, 0.17158946, -0.15383756, 0.025605323, 0.2360881, -0.14753577, -0.016537111, 0.048651446, -0.35849985, 0.01651406, 0.17044473, 0.13180882, 0.324054, -0.18812656), conv2d_1_tf); + conv2d_1_tf = MulAdd(g1, MF4x4(-0.15537027, -0.08164218, 0.049979087, -0.31885874, -0.15126401, -0.14352658, 0.18948728, 0.020951044, 0.054829888, -0.18936221, -0.22699763, 0.14384085, 0.055476833, -0.011031805, -0.23653851, 0.02768591), conv2d_1_tf); + conv2d_1_tf = MulAdd(h1, MF4x4(-0.34108123, -0.28492066, 0.50347435, 0.0034134283, 0.041766707, 0.12375689, -0.08600751, 0.22726676, 0.10521852, 0.16621545, 0.038216297, 0.029870255, 0.07065742, -0.03542451, 0.38924676, -0.117774665), conv2d_1_tf); + conv2d_1_tf = MulAdd(i1, MF4x4(-0.19437145, -0.01827461, 0.15408134, -0.14991991, 0.13832837, 0.0668659, 0.092678316, 0.05341174, 0.21633142, 0.09575402, -0.111060366, -0.00874764, -0.21256353, -0.052944425, 0.16459747, 0.07091838), conv2d_1_tf); + conv2d_1_tf = MulAdd(a2, MF4x4(0.022236984, 0.19067548, 0.049743406, 0.05148808, 0.23003219, 0.08688227, 0.030773275, -0.059972208, -0.039038613, 0.21701579, -0.11092254, -0.10850967, -0.17777155, -0.20399293, -0.006843039, 0.24139926), conv2d_1_tf); + conv2d_1_tf = MulAdd(b2, MF4x4(-0.07928885, -0.011657496, -0.03982505, -0.031084592, -0.09403157, -0.13860224, 0.15166754, 0.1279725, -0.084909394, 0.18945958, 0.055481352, -0.24365151, -0.04130202, 0.105171725, -0.47306657, -0.2218246), conv2d_1_tf); + conv2d_1_tf = MulAdd(c2, MF4x4(-0.06171395, 0.0029490888, 0.055825688, -0.01362009, 0.045571987, -0.04197536, -0.024671398, -0.11600467, 0.02611751, -0.06675449, 0.38841903, 0.109969236, 0.1846224, -0.22673915, -0.11488994, -0.18271959), conv2d_1_tf); + conv2d_1_tf = MulAdd(d2, MF4x4(-0.08073766, -0.1512685, 0.09596278, 0.061552938, -0.23016383, 0.044725727, -0.1058148, -0.09081257, 0.25391936, 0.13075152, 0.1153331, 0.035533328, 0.14628118, 0.053434838, -0.061957166, -0.11092296), conv2d_1_tf); + conv2d_1_tf = MulAdd(e2, MF4x4(0.004972408, 0.26720062, -0.0014180156, -0.15569477, 0.08964792, 0.39218047, -0.113748655, -0.20653862, -0.0182982, -0.009456181, 0.096566215, 0.19871894, -0.45192167, -0.19494532, 0.5282211, -0.033234302), conv2d_1_tf); + conv2d_1_tf = MulAdd(f2, MF4x4(0.11633487, 0.055492207, -0.09550419, 0.019721292, 0.05191187, 0.110391244, 0.13541168, 0.108687185, -0.3231262, -0.071254596, 0.12103068, -0.063508354, 0.16086432, 0.22202429, -0.2793211, -0.059888415), conv2d_1_tf); + conv2d_1_tf = MulAdd(g2, MF4x4(0.09845572, -0.11364447, -0.06817361, 0.20479278, 0.008171668, -0.10222864, -0.12512983, 0.11285637, 0.2092848, 0.12593135, -0.054839488, 0.1560058, 0.109415986, -0.04229047, -0.21525817, 0.10153635), conv2d_1_tf); + conv2d_1_tf = MulAdd(h2, MF4x4(-0.26443723, 0.18267378, 0.2874903, -0.15007962, 0.23901714, -0.039331976, -0.4055973, 0.18869716, 0.060133275, -0.030050457, -0.16689767, -0.024223989, 0.43243858, -0.004281818, -0.5925553, 0.08473984), conv2d_1_tf); + conv2d_1_tf = MulAdd(i2, MF4x4(-0.11769163, -0.6005158, -0.0700652, 0.0062212353, -0.022391787, 0.08070833, 0.10332995, 0.100591965, 0.1680161, 0.1209537, -0.11606606, -0.0032385625, -0.30508906, -0.11541758, 0.27825746, 0.18774803), conv2d_1_tf); + conv2d_1_tf = MulAdd(na1, MF4x4(-0.06629365, -0.14032914, -0.2580204, 0.18303558, -0.1916567, 0.029803488, -0.12213443, -0.07165115, 0.012936617, -0.11358297, -0.19138688, 0.10422416, 0.18062063, 0.14369549, 0.10535131, -0.036331207), conv2d_1_tf); + conv2d_1_tf = MulAdd(nb1, MF4x4(-0.23739359, -0.14102252, 0.16535138, -0.055494435, 0.11510639, -0.02530117, 0.13571805, -0.11962709, 0.14311576, -0.11346015, -0.053082045, 0.23039193, 0.2412315, 0.34595123, -0.057626486, 0.1273758), conv2d_1_tf); + conv2d_1_tf = MulAdd(nc1, MF4x4(-0.031894613, 0.04056866, -0.14806709, -0.061261263, -0.05113628, -0.150074, -0.05885426, 0.025318084, -0.028839143, -0.14976048, -0.061418023, -0.10849576, 0.10669465, 0.025044547, 0.13002798, 0.033596892), conv2d_1_tf); + conv2d_1_tf = MulAdd(nd1, MF4x4(0.31830126, -0.109857574, 0.022382054, 0.19084917, -0.21992075, -0.06509279, 0.04586319, -0.10979886, 0.07565896, 0.008375114, -0.025531407, 0.112079956, 0.32532254, 0.39258766, 0.15983114, -0.047324624), conv2d_1_tf); + conv2d_1_tf = MulAdd(ne1, MF4x4(0.06333816, -0.43997836, 0.28480944, -0.037927028, -0.16247569, 0.14209846, -0.5309942, -0.23058164, -0.18387268, 0.3324917, 0.010288075, -0.2516956, -0.42476243, -0.19866063, 0.32058033, 0.052254338), conv2d_1_tf); + conv2d_1_tf = MulAdd(nf1, MF4x4(-0.019851776, 0.17185202, -0.14713249, -0.1373522, 0.23155597, -0.009191596, -0.15395427, 0.24423079, -0.11106813, -0.034888845, 0.17169674, -0.08786573, -0.08697707, -0.28842747, -0.25445274, 0.13578549), conv2d_1_tf); + conv2d_1_tf = MulAdd(ng1, MF4x4(0.2099323, 0.09262897, -0.08977398, 0.30791095, 0.12376861, 0.24654338, -0.097672515, 0.008614657, 0.006388779, 0.076170854, 0.25119394, -0.12392118, 0.3138793, -0.015998395, 0.15131904, -0.3009305), conv2d_1_tf); + conv2d_1_tf = MulAdd(nh1, MF4x4(0.33982292, 0.26557416, -0.3754559, -0.110353656, 0.08402225, -0.053171434, 0.051136248, -0.2696132, -0.14568366, -0.048726343, 0.06216166, 0.018804165, -0.084439, 0.15103953, -0.020082679, 0.15082058), conv2d_1_tf); + conv2d_1_tf = MulAdd(ni1, MF4x4(0.14522389, -0.0462971, -0.10824406, 0.14163211, -0.08392773, -0.22920173, -0.23795773, -0.2580316, -0.22207144, -0.15956368, 0.12665017, -0.08286834, 0.09581649, 0.12603259, -0.15513468, -0.010735423), conv2d_1_tf); + conv2d_1_tf = MulAdd(na2, MF4x4(0.00818024, -0.15539199, -0.011369519, 0.05717366, -0.25330603, -0.018393422, 0.027386196, 0.121692196, 0.059138533, -0.1631142, 0.10282322, 0.08011751, 0.10027271, 0.255391, 0.010682224, -0.3095357), conv2d_1_tf); + conv2d_1_tf = MulAdd(nb2, MF4x4(0.117767766, 0.120644994, 0.09232613, -0.018057318, -0.038398392, 0.14537762, -0.016560853, -0.08958423, 0.06743331, -0.23562634, -0.123906426, 0.028323429, -0.09386831, -0.16833909, 0.019829117, -0.08108203), conv2d_1_tf); + conv2d_1_tf = MulAdd(nc2, MF4x4(0.05462869, -0.031615634, -0.121678494, 0.05315917, -0.012636353, -0.13374922, 0.18577711, 0.0005971412, -0.099537544, -0.060773082, -0.28754288, -0.20077203, -0.15873533, -0.11387871, -0.17841183, -0.120239034), conv2d_1_tf); + conv2d_1_tf = MulAdd(nd2, MF4x4(0.13845754, 0.223389, -0.20315485, -0.03479761, 0.1806296, 0.057029717, 0.010771242, 0.15245064, -0.0040082, 0.015283898, -0.34807077, 0.078581005, 0.026417086, -0.058825746, 0.07728649, 0.066044815), conv2d_1_tf); + conv2d_1_tf = MulAdd(ne2, MF4x4(-0.13820273, -0.050027788, 0.061389934, 0.11189863, 0.008062022, -0.17326912, 0.18159898, 0.08510656, 0.22065656, 0.3918094, -0.05124615, -0.22959533, 0.85480285, 0.5621734, -0.817405, 0.065126896), conv2d_1_tf); + conv2d_1_tf = MulAdd(nf2, MF4x4(-0.15309735, 0.1396192, 0.16662036, -0.10952867, -0.03473452, -0.08712044, -0.2422528, -0.19236326, 0.49887487, 0.2615184, -0.076631024, 0.16010238, -0.09836315, -0.27126545, 0.17968613, -0.21053861), conv2d_1_tf); + conv2d_1_tf = MulAdd(ng2, MF4x4(-0.18809205, 0.050410215, 0.1418759, -0.2876976, -0.13414268, 0.07458343, 0.096421175, -0.060676426, -0.17345451, -0.13678914, -0.06512698, -0.102106765, -0.12989639, 0.09089589, 0.07377932, -0.07263102), conv2d_1_tf); + conv2d_1_tf = MulAdd(nh2, MF4x4(0.45035192, 0.2393797, -0.045452517, -0.04553052, -0.26037264, -0.021321824, 0.24618645, -0.108074926, -0.030116243, 0.04612789, 0.2273845, -0.07468269, -0.48789972, 0.12628402, 1.0130231, -0.14672706), conv2d_1_tf); + conv2d_1_tf = MulAdd(ni2, MF4x4(0.5591947, -0.0326075, 0.12768768, -0.7916967, 0.023168698, -0.042015456, -0.12410894, -0.033611402, -0.14815444, -0.124497496, 0.08198418, -0.014488041, 0.4252749, -0.20253694, 0.042329047, -0.50953263), conv2d_1_tf); tex3[gxy] = conv2d_1_tf; - min16float4 nconv2d_1_tf = max(-conv2d_1_tf, 0); + MF4 nconv2d_1_tf = max(-conv2d_1_tf, 0); conv2d_1_tf = max(conv2d_1_tf, 0); - min16float4 target = mul(e1, min16float4x4(-0.26519376, -0.45442572, -0.24128473, 0.56122154, 0.45048368, 0.32492852, -0.14123245, -0.027976234, -0.11764467, -0.47563952, -0.09401533, 0.024141679, -0.19278349, -0.5169275, -0.26203018, 0.04326379)); - target += mul(e2, min16float4x4(-0.14198317, 0.18704857, -0.20165806, 0.3868074, 0.26532957, 0.13556235, -0.5872983, 0.13357028, 0.48151335, -0.3750496, 0.020972235, -0.32213062, -0.46967435, 0.10506199, 0.24039303, -0.3906582)); - target += mul(ne1, min16float4x4(0.10981934, -0.0040414287, -0.0025180888, -0.23061854, -0.6781062, -0.27331296, -0.1538456, 0.31020573, -0.05341261, 0.45214307, 0.23456645, 0.3261386, -0.020520406, 0.46579385, 0.57791334, 0.441774)); - target += mul(ne2, min16float4x4(0.11475315, 0.18062253, 0.21255025, -0.1963313, -0.22190428, -0.19369084, 0.5878038, -0.051808596, -0.39728877, -0.044071846, 0.0066692936, -0.0066007506, 0.03501876, 0.27602142, 0.11396466, 0.81461775)); - target += mul(conv2d_2_tf, min16float4x4(-0.44411597, -0.11377309, 0.16160126, 0.47119814, 0.22932883, -0.43011594, 0.01986201, 0.01446102, -0.2783236, -0.07647468, -0.5016725, 0.4227215, 0.31808656, 0.23829709, -0.12855907, -0.15950239)); - target += mul(nconv2d_2_tf, min16float4x4(-0.4784548, -0.042179376, -0.4882858, -0.046462137, -0.21421364, -0.35029694, -0.15496174, 0.11386904, 0.22592051, 0.1590684, 0.49690887, -0.37077406, -0.48519966, -0.14407466, 0.24836525, 0.38462397)); - target += mul(conv2d_1_tf, min16float4x4(-0.043213595, -0.004892144, 0.29046863, 0.57064444, 0.37136674, -0.5603234, -0.30733815, 0.26740906, 0.016959883, -0.26567596, 0.101653986, 0.34387913, -0.13222592, -0.34239995, 0.32046688, 0.023962379)); - target += mul(nconv2d_1_tf, min16float4x4(-0.2955613, 0.44671535, 0.056253802, -0.6011664, -0.30715483, 0.16890973, 0.041257784, -0.1544008, 0.4653661, -0.22183, -0.23155628, -0.063779, 0.10350268, 0.02045104, -0.22509801, 0.14633855)); - target += min16float4(-0.00089101185, -0.038285345, 0.023986168, -0.122330956); + MF4 target = MF4(-0.00089101185, -0.038285345, 0.023986168, -0.122330956); + target = MulAdd(e1, MF4x4(-0.26519376, -0.45442572, -0.24128473, 0.56122154, 0.45048368, 0.32492852, -0.14123245, -0.027976234, -0.11764467, -0.47563952, -0.09401533, 0.024141679, -0.19278349, -0.5169275, -0.26203018, 0.04326379), target); + target = MulAdd(e2, MF4x4(-0.14198317, 0.18704857, -0.20165806, 0.3868074, 0.26532957, 0.13556235, -0.5872983, 0.13357028, 0.48151335, -0.3750496, 0.020972235, -0.32213062, -0.46967435, 0.10506199, 0.24039303, -0.3906582), target); + target = MulAdd(ne1, MF4x4(0.10981934, -0.0040414287, -0.0025180888, -0.23061854, -0.6781062, -0.27331296, -0.1538456, 0.31020573, -0.05341261, 0.45214307, 0.23456645, 0.3261386, -0.020520406, 0.46579385, 0.57791334, 0.441774), target); + target = MulAdd(ne2, MF4x4(0.11475315, 0.18062253, 0.21255025, -0.1963313, -0.22190428, -0.19369084, 0.5878038, -0.051808596, -0.39728877, -0.044071846, 0.0066692936, -0.0066007506, 0.03501876, 0.27602142, 0.11396466, 0.81461775), target); + target = MulAdd(conv2d_2_tf, MF4x4(-0.44411597, -0.11377309, 0.16160126, 0.47119814, 0.22932883, -0.43011594, 0.01986201, 0.01446102, -0.2783236, -0.07647468, -0.5016725, 0.4227215, 0.31808656, 0.23829709, -0.12855907, -0.15950239), target); + target = MulAdd(nconv2d_2_tf, MF4x4(-0.4784548, -0.042179376, -0.4882858, -0.046462137, -0.21421364, -0.35029694, -0.15496174, 0.11386904, 0.22592051, 0.1590684, 0.49690887, -0.37077406, -0.48519966, -0.14407466, 0.24836525, 0.38462397), target); + target = MulAdd(conv2d_1_tf, MF4x4(-0.043213595, -0.004892144, 0.29046863, 0.57064444, 0.37136674, -0.5603234, -0.30733815, 0.26740906, 0.016959883, -0.26567596, 0.101653986, 0.34387913, -0.13222592, -0.34239995, 0.32046688, 0.023962379), target); + target = MulAdd(nconv2d_1_tf, MF4x4(-0.2955613, 0.44671535, 0.056253802, -0.6011664, -0.30715483, 0.16890973, 0.041257784, -0.1544008, 0.4653661, -0.22183, -0.23155628, -0.063779, 0.10350268, 0.02045104, -0.22509801, 0.14633855), target); tex4[gxy] = target; - target = mul(e1, min16float4x4(-0.6336626, -0.23328744, 0.054100014, -0.6572063, 0.22899812, 0.47125596, 0.087406546, 0.5788615, -0.24324284, -0.17465535, 0.23223022, -0.4417298, -0.1195797, -0.14119461, -0.2301777, -0.1748931)); - target += mul(e2, min16float4x4(0.2554768, -0.0835268, 0.13054265, 0.033940453, -0.22754695, 0.053536188, -0.10300488, -0.10146903, 0.3104604, -0.5024146, 0.089460805, -0.20216464, 0.6033507, 0.12908716, -0.29953086, 0.292064)); - target += mul(ne1, min16float4x4(0.09586759, -0.037499018, -0.23253569, 0.63889295, 0.18920106, -0.6646685, 0.07218118, -0.61459464, -0.16397415, 0.3131906, -0.39399612, 0.36777702, 0.39545253, 0.030677503, 0.29420745, -0.02527333)); - target += mul(ne2, min16float4x4(-0.2464485, -0.117239855, -0.13390337, 0.43170166, 0.10044111, -0.13811369, -0.007668335, 0.06387773, -0.11786689, 0.23223364, 0.12805769, 0.06410502, -0.2818576, 0.21286973, 0.17026524, -0.22247931)); - target += mul(conv2d_2_tf, min16float4x4(0.12590794, 0.25101408, -0.014941272, -0.06091461, -0.106272854, -0.23196393, 0.64016813, 0.0025616125, 0.16706267, 0.008579063, 0.04476896, -0.5403641, -0.011274305, -0.014704461, -0.068788156, 0.47190762)); - target += mul(nconv2d_2_tf, min16float4x4(0.10427173, -0.11386145, -0.6048206, -0.20245847, -0.011730377, -0.0119483, 0.06255473, -0.5017671, -0.07181296, -0.08626898, -0.035322662, 0.42718327, 0.041101683, 0.017210655, -0.07089471, -0.6541289)); - target += mul(conv2d_1_tf, min16float4x4(-0.43911383, -0.099413894, -0.22120018, -0.3121928, -0.32394376, 0.1159015, 0.04434728, 0.014404674, 0.040322874, 0.06727233, -0.046662346, -0.066591434, -0.004613069, -0.6566657, -0.13442427, -0.081967555)); - target += mul(nconv2d_1_tf, min16float4x4(0.7393613, 0.059159152, 0.21900342, 0.26184326, 0.15656939, -0.05151207, -0.02730003, -0.055701576, -0.50296444, 0.09566756, -0.10248052, -0.39747316, 0.5877897, 0.83397114, -0.07968032, -0.3097048)); - target += min16float4(-0.010642331, -0.050244823, -0.009665539, 0.26457447); + target = MF4(-0.010642331, -0.050244823, -0.009665539, 0.26457447); + target = MulAdd(e1, MF4x4(-0.6336626, -0.23328744, 0.054100014, -0.6572063, 0.22899812, 0.47125596, 0.087406546, 0.5788615, -0.24324284, -0.17465535, 0.23223022, -0.4417298, -0.1195797, -0.14119461, -0.2301777, -0.1748931), target); + target = MulAdd(e2, MF4x4(0.2554768, -0.0835268, 0.13054265, 0.033940453, -0.22754695, 0.053536188, -0.10300488, -0.10146903, 0.3104604, -0.5024146, 0.089460805, -0.20216464, 0.6033507, 0.12908716, -0.29953086, 0.292064), target); + target = MulAdd(ne1, MF4x4(0.09586759, -0.037499018, -0.23253569, 0.63889295, 0.18920106, -0.6646685, 0.07218118, -0.61459464, -0.16397415, 0.3131906, -0.39399612, 0.36777702, 0.39545253, 0.030677503, 0.29420745, -0.02527333), target); + target = MulAdd(ne2, MF4x4(-0.2464485, -0.117239855, -0.13390337, 0.43170166, 0.10044111, -0.13811369, -0.007668335, 0.06387773, -0.11786689, 0.23223364, 0.12805769, 0.06410502, -0.2818576, 0.21286973, 0.17026524, -0.22247931), target); + target = MulAdd(conv2d_2_tf, MF4x4(0.12590794, 0.25101408, -0.014941272, -0.06091461, -0.106272854, -0.23196393, 0.64016813, 0.0025616125, 0.16706267, 0.008579063, 0.04476896, -0.5403641, -0.011274305, -0.014704461, -0.068788156, 0.47190762), target); + target = MulAdd(nconv2d_2_tf, MF4x4(0.10427173, -0.11386145, -0.6048206, -0.20245847, -0.011730377, -0.0119483, 0.06255473, -0.5017671, -0.07181296, -0.08626898, -0.035322662, 0.42718327, 0.041101683, 0.017210655, -0.07089471, -0.6541289), target); + target = MulAdd(conv2d_1_tf, MF4x4(-0.43911383, -0.099413894, -0.22120018, -0.3121928, -0.32394376, 0.1159015, 0.04434728, 0.014404674, 0.040322874, 0.06727233, -0.046662346, -0.066591434, -0.004613069, -0.6566657, -0.13442427, -0.081967555), target); + target = MulAdd(nconv2d_1_tf, MF4x4(0.7393613, 0.059159152, 0.21900342, 0.26184326, 0.15656939, -0.05151207, -0.02730003, -0.055701576, -0.50296444, 0.09566756, -0.10248052, -0.39747316, 0.5877897, 0.83397114, -0.07968032, -0.3097048), target); tex5[gxy] = target; } @@ -372,25 +375,25 @@ void Pass3(uint2 blockStart, uint3 threadId) { // [ a, d, g ] // [ b, e, h ] // [ c, f, i ] - min16float4 a1 = tex4.SampleLevel(sam, pos - inputPt, 0); - min16float4 b1 = tex4.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); - min16float4 c1 = tex4.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); - min16float4 d1 = tex4.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); - min16float4 e1 = tex4.SampleLevel(sam, pos, 0); - min16float4 f1 = tex4.SampleLevel(sam, pos + float2(0, inputPt.y), 0); - min16float4 g1 = tex4.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); - min16float4 h1 = tex4.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); - min16float4 i1 = tex4.SampleLevel(sam, pos + inputPt, 0); - - min16float4 na1 = max(-a1, 0); - min16float4 nb1 = max(-b1, 0); - min16float4 nc1 = max(-c1, 0); - min16float4 nd1 = max(-d1, 0); - min16float4 ne1 = max(-e1, 0); - min16float4 nf1 = max(-f1, 0); - min16float4 ng1 = max(-g1, 0); - min16float4 nh1 = max(-h1, 0); - min16float4 ni1 = max(-i1, 0); + MF4 a1 = tex4.SampleLevel(sam, pos - inputPt, 0); + MF4 b1 = tex4.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); + MF4 c1 = tex4.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); + MF4 d1 = tex4.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); + MF4 e1 = tex4.SampleLevel(sam, pos, 0); + MF4 f1 = tex4.SampleLevel(sam, pos + float2(0, inputPt.y), 0); + MF4 g1 = tex4.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); + MF4 h1 = tex4.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); + MF4 i1 = tex4.SampleLevel(sam, pos + inputPt, 0); + + MF4 na1 = max(-a1, 0); + MF4 nb1 = max(-b1, 0); + MF4 nc1 = max(-c1, 0); + MF4 nd1 = max(-d1, 0); + MF4 ne1 = max(-e1, 0); + MF4 nf1 = max(-f1, 0); + MF4 ng1 = max(-g1, 0); + MF4 nh1 = max(-h1, 0); + MF4 ni1 = max(-i1, 0); a1 = max(a1, 0); b1 = max(b1, 0); @@ -402,25 +405,25 @@ void Pass3(uint2 blockStart, uint3 threadId) { h1 = max(h1, 0); i1 = max(i1, 0); - min16float4 a2 = tex5.SampleLevel(sam, pos - inputPt, 0); - min16float4 b2 = tex5.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); - min16float4 c2 = tex5.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); - min16float4 d2 = tex5.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); - min16float4 e2 = tex5.SampleLevel(sam, pos, 0); - min16float4 f2 = tex5.SampleLevel(sam, pos + float2(0, inputPt.y), 0); - min16float4 g2 = tex5.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); - min16float4 h2 = tex5.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); - min16float4 i2 = tex5.SampleLevel(sam, pos + inputPt, 0); - - min16float4 na2 = max(-a2, 0); - min16float4 nb2 = max(-b2, 0); - min16float4 nc2 = max(-c2, 0); - min16float4 nd2 = max(-d2, 0); - min16float4 ne2 = max(-e2, 0); - min16float4 nf2 = max(-f2, 0); - min16float4 ng2 = max(-g2, 0); - min16float4 nh2 = max(-h2, 0); - min16float4 ni2 = max(-i2, 0); + MF4 a2 = tex5.SampleLevel(sam, pos - inputPt, 0); + MF4 b2 = tex5.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); + MF4 c2 = tex5.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); + MF4 d2 = tex5.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); + MF4 e2 = tex5.SampleLevel(sam, pos, 0); + MF4 f2 = tex5.SampleLevel(sam, pos + float2(0, inputPt.y), 0); + MF4 g2 = tex5.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); + MF4 h2 = tex5.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); + MF4 i2 = tex5.SampleLevel(sam, pos + inputPt, 0); + + MF4 na2 = max(-a2, 0); + MF4 nb2 = max(-b2, 0); + MF4 nc2 = max(-c2, 0); + MF4 nd2 = max(-d2, 0); + MF4 ne2 = max(-e2, 0); + MF4 nf2 = max(-f2, 0); + MF4 ng2 = max(-g2, 0); + MF4 nh2 = max(-h2, 0); + MF4 ni2 = max(-i2, 0); a2 = max(a2, 0); b2 = max(b2, 0); @@ -432,115 +435,115 @@ void Pass3(uint2 blockStart, uint3 threadId) { h2 = max(h2, 0); i2 = max(i2, 0); - min16float4 conv2d_1_tf = tex3.SampleLevel(sam, pos, 0); - min16float4 nconv2d_1_tf = max(-conv2d_1_tf, 0); + MF4 conv2d_1_tf = tex3.SampleLevel(sam, pos, 0); + MF4 nconv2d_1_tf = max(-conv2d_1_tf, 0); conv2d_1_tf = max(conv2d_1_tf, 0); - min16float4 conv2d_5_tf = mul(a1, min16float4x4(-0.11527973, 0.18487021, 0.0010509634, -0.3687562, -0.012861112, -0.37319645, -0.31061935, -0.051598914, 0.061436053, -0.2643697, -0.032551475, 0.59398615, 0.17265628, 0.1634019, 0.026527049, -0.0040123775)); - conv2d_5_tf += mul(b1, min16float4x4(-0.19826698, -0.29437867, 0.15727736, 0.44590214, 0.27655315, 0.28220633, 0.12990361, -0.09000104, -0.26396993, -0.53520125, 0.40639028, 0.7958488, 0.043264065, -0.08110669, -0.28618547, 0.12722827)); - conv2d_5_tf += mul(c1, min16float4x4(0.26455724, -0.36315665, -0.22116943, 0.049996275, 0.28526706, -0.0045478707, -0.20538875, 0.03192557, 0.04443011, -0.48084733, -0.32755423, 0.0075373487, 0.34481105, 0.04272154, -0.11092845, -0.07401724)); - conv2d_5_tf += mul(d1, min16float4x4(0.28374255, 0.13204694, 0.041846596, -0.57726663, 1.0038753, 0.42640173, -0.045806255, 0.3795911, 0.52897507, -0.2522673, 0.37759414, 0.158503, 0.111165345, -0.033814687, -0.37906894, 0.14007671)); - conv2d_5_tf += mul(e1, min16float4x4(0.30553007, -0.032092307, 0.6779135, -0.32720757, 0.29837027, 0.13522549, 0.21653146, 0.4553826, -0.22200927, -0.20921928, 0.36475468, 0.27989116, 0.6222863, -0.37027213, 0.06746388, 0.16675332)); - conv2d_5_tf += mul(f1, min16float4x4(0.31677073, -0.37482786, 0.4029838, 0.43627468, 0.32849845, -0.3442297, 0.1752726, 0.37502408, -0.1561963, -0.17489041, 0.7141825, -0.13179696, 0.17682795, 0.052273672, 0.07300372, 0.20322469)); - conv2d_5_tf += mul(g1, min16float4x4(0.07722791, 0.51997215, 0.2052519, -0.6162976, 0.07318059, -0.16653596, -0.0609372, -0.13199529, -0.011298448, -0.066250905, 0.11658636, -0.07317175, -0.068134755, 0.032443475, -0.27242857, 0.26479205)); - conv2d_5_tf += mul(h1, min16float4x4(-0.46400046, 0.34256476, -0.074927844, -0.082626544, 0.38616362, 0.10320202, 0.7306549, -0.41960227, -0.33295953, -0.35537082, 0.040369444, 0.18173583, 0.36835003, -0.078561984, -0.13071333, -0.06847678)); - conv2d_5_tf += mul(i1, min16float4x4(0.0951899, -0.21144655, 0.12174552, 0.09496668, -0.17025085, -0.36465582, 0.20724316, 0.07027979, 0.17988989, -0.16671456, -0.15068638, 0.26715076, 0.022114933, 0.14284599, -0.06316286, 0.017598677)); - conv2d_5_tf += mul(a2, min16float4x4(0.22179046, -0.19104601, 0.10500515, 0.22017653, -0.065115064, -0.027006533, -0.21086605, 0.00932852, -0.6196575, 0.04396425, 0.52487534, 0.61164427, 0.15172893, 0.219877, 0.103516005, -0.103571504)); - conv2d_5_tf += mul(b2, min16float4x4(0.122733794, 0.19491453, 0.22410785, -0.17341182, -0.18816754, 0.22092234, -0.055087283, -0.14617631, 0.4338981, -0.45366564, -1.4062341, 0.19594707, 0.2178627, 0.016837195, -0.2226328, 0.079190396)); - conv2d_5_tf += mul(c2, min16float4x4(0.16418308, 0.14917587, 0.35162288, 0.04064204, -0.037038237, 0.06579139, -0.08464511, -0.2156906, 0.22791082, -1.1695892, 0.53665465, -0.77753544, 0.0065266103, 0.15857838, 0.010236925, 0.14953533)); - conv2d_5_tf += mul(d2, min16float4x4(0.64548135, -0.02291521, -0.14370848, 0.049308565, 0.13637903, 0.14568083, -0.1488358, -0.0038734428, 0.0809154, -0.15466721, -0.06614126, -0.047732286, 0.311668, 0.22075401, 0.26094854, -0.27763176)); - conv2d_5_tf += mul(e2, min16float4x4(0.12075334, -0.23920162, 0.19115442, -0.33920774, 0.15199614, 0.27974042, -0.05022236, -0.15280685, 0.37271795, -0.76389724, -0.56503266, 1.4975219, 0.24002175, -0.12661129, 0.045953903, 0.2102559)); - conv2d_5_tf += mul(f2, min16float4x4(-0.02855315, -0.16729961, -0.27380818, -0.08810453, 0.061245166, 0.27268958, 0.2282609, 0.072155826, -0.65736717, -0.46307757, -0.5473049, 0.50772667, -0.1581774, 0.28763455, -0.1870661, -0.16523343)); - conv2d_5_tf += mul(g2, min16float4x4(0.23464368, 0.25850806, -0.054024473, -0.13788947, -0.24835043, -0.028147692, -0.23022775, 0.11494646, 0.31069988, -0.21450949, 0.40749013, -0.073832974, -0.16241223, 0.15673774, 0.23648019, -0.34203738)); - conv2d_5_tf += mul(h2, min16float4x4(-0.10198349, -0.052500926, 0.02638934, 0.19718044, -0.09078705, 0.07717591, 0.44648582, -0.30146563, -0.10124157, 0.12145466, -0.2133955, 0.16855773, -0.12310728, 0.35327804, -0.44273457, 0.20639896)); - conv2d_5_tf += mul(i2, min16float4x4(0.08033835, 0.0977811, 0.007069267, -0.110171854, -0.008568571, -0.10922981, 0.12048108, -0.0835261, 0.019930357, -0.12652875, 0.02870121, 0.12214532, -0.024486745, 0.3588685, -0.16501926, 0.11914434)); - conv2d_5_tf += mul(na1, min16float4x4(0.24003507, -0.040643565, -0.4267142, 0.34356147, -0.2618635, -0.1550601, -0.18566506, 0.33267352, -0.17584917, -0.24971883, 0.167064, -0.20808934, 0.3197215, 0.19626021, -0.16993162, -0.16976681)); - conv2d_5_tf += mul(nb1, min16float4x4(0.159248, -0.33713767, -0.37823528, 0.25286102, -0.6171255, 0.01159639, 0.08387377, -0.0796005, -0.18405017, -0.11881008, -0.03026552, 0.030733835, 0.17692643, 0.17118043, 0.23938146, -0.40504465)); - conv2d_5_tf += mul(nc1, min16float4x4(0.11274836, -0.023647472, 0.083114825, 0.5222033, -0.07415273, -0.3251913, -0.034298245, -0.07125199, 0.09593269, -0.23062208, -0.3168607, -0.13040248, -0.41249517, 0.39030293, 0.47400078, -0.109306306)); - conv2d_5_tf += mul(nd1, min16float4x4(-0.49999082, 0.012254524, -0.035179958, 0.212335, -0.10354367, -0.19730526, 0.092015326, -0.07317916, -0.21900047, -0.13948579, -0.3228226, -0.22363624, -0.06421761, 0.16125691, 0.38075948, -0.31371582)); - conv2d_5_tf += mul(ne1, min16float4x4(-1.0006356, -0.13763155, -0.8414047, -0.051852856, -0.44105098, 0.526086, 0.23091859, -0.6621191, -0.015348964, 0.37972412, -0.24986422, 0.13964157, -0.03184678, 0.25394693, -0.051659737, -0.34171197)); - conv2d_5_tf += mul(nf1, min16float4x4(0.14520285, 0.1346628, 0.047271203, 0.64346415, -0.25639483, 0.052174076, 0.28681588, -0.32156095, 0.014350296, 0.028580237, 0.33776954, 0.06681965, -0.27312553, 0.44097883, -0.16519593, -0.7293824)); - conv2d_5_tf += mul(ng1, min16float4x4(-0.65626615, -0.20801732, -0.18783297, 0.27998376, -0.51550066, -0.23272751, -0.3744558, 0.11267917, -0.1879591, 0.043539204, -0.17665562, 0.28546363, -0.20627682, 0.33176526, 0.34412766, -0.4310386)); - conv2d_5_tf += mul(nh1, min16float4x4(0.51410156, -0.08615402, -0.2396778, -0.027256064, 0.11491742, -0.20842157, 0.3855824, -0.19823207, 0.0062098945, -0.2629099, 0.13158852, -0.08746773, -0.46980307, 0.57169086, -0.13392213, 0.13375558)); - conv2d_5_tf += mul(ni1, min16float4x4(0.09988252, 0.19396676, -0.011215926, 0.2714918, 0.07985461, 0.30587563, 0.21915142, -0.14004244, -0.336268, 0.023702772, 0.15740578, -0.06307948, 0.06453276, 0.26978606, 0.45891464, -0.35511568)); - conv2d_5_tf += mul(na2, min16float4x4(-0.33263445, -0.13086738, -0.30128893, 0.03720744, 0.46366304, -0.13430476, -0.26493385, 0.14521147, -0.025578065, -0.043376725, 0.055235144, -0.08467402, 0.12879072, 0.2621278, -0.030150373, -0.079033755)); - conv2d_5_tf += mul(nb2, min16float4x4(-0.15686864, 0.06962337, -0.24032803, 0.05093969, 0.12118379, 0.2144539, 0.21314697, -0.15564163, -0.15193312, -0.15797225, 0.061610706, 0.06689548, 0.42354256, 0.24339569, 0.14413804, -0.08890708)); - conv2d_5_tf += mul(nc2, min16float4x4(0.021830576, -0.0682399, -0.25052184, 0.035374403, -0.0022370394, 0.23796171, 0.40747103, -0.14309348, -0.22325014, 0.12337428, -0.0727028, 0.12374459, -0.24148722, 0.34091887, 0.5052561, -0.13712624)); - conv2d_5_tf += mul(nd2, min16float4x4(-0.583754, -0.10253819, -0.26736188, -0.084894784, 0.7130811, 0.5888696, 0.24837445, 0.20670207, 0.08242887, -0.03090308, 0.24002716, -0.04146999, 0.33550006, -0.006085788, -0.2078999, 0.016955601)); - conv2d_5_tf += mul(ne2, min16float4x4(-0.23921615, 1.0534316, -0.29723012, -0.06626253, 0.022887046, -0.6139072, 0.22857629, 0.4203786, -0.02951169, 0.0501039, -0.054740574, -0.15496075, 0.9533812, 0.21038955, 0.33969748, 0.18853404)); - conv2d_5_tf += mul(nf2, min16float4x4(-0.13571729, -0.045776337, 0.23663524, 0.1457326, -0.23159564, -0.44608104, -0.35497522, -0.14684997, 0.042379193, 0.16966693, 0.2560789, -0.07091574, 0.010749883, -0.26966086, -0.16322245, 0.095426805)); - conv2d_5_tf += mul(ng2, min16float4x4(-0.027934154, -0.25037688, 0.19623838, 0.16128206, 0.21479255, 0.4066385, -0.06756232, -0.19681008, 0.09168842, 0.46935177, -0.059632402, -0.3419115, 0.2789002, 0.012714867, 0.15322958, 0.05255599)); - conv2d_5_tf += mul(nh2, min16float4x4(0.2074098, -0.19564646, 0.21713807, -0.29207307, -0.08546043, 0.122562535, -0.5150736, 0.5190804, -0.116998374, 0.17080544, -0.29132518, 0.47585255, -0.14625762, -0.026589578, -0.13111407, 0.03473621)); - conv2d_5_tf += mul(ni2, min16float4x4(-0.3399405, 0.063775875, -0.0121724615, 0.13809827, -0.1575877, 0.13529225, -0.28708464, -0.063552216, 0.08623843, 0.034867074, 0.25082812, -0.038863987, 0.08048017, -0.43998414, -0.05038377, -0.20123458)); - conv2d_5_tf += min16float4(0.19016464, 0.19431238, -0.073604904, 0.101166695); - min16float4 nconv2d_5_tf = max(-conv2d_5_tf, 0); + MF4 conv2d_5_tf = MF4(0.19016464, 0.19431238, -0.073604904, 0.101166695); + conv2d_5_tf = MulAdd(a1, MF4x4(-0.11527973, 0.18487021, 0.0010509634, -0.3687562, -0.012861112, -0.37319645, -0.31061935, -0.051598914, 0.061436053, -0.2643697, -0.032551475, 0.59398615, 0.17265628, 0.1634019, 0.026527049, -0.0040123775), conv2d_5_tf); + conv2d_5_tf = MulAdd(b1, MF4x4(-0.19826698, -0.29437867, 0.15727736, 0.44590214, 0.27655315, 0.28220633, 0.12990361, -0.09000104, -0.26396993, -0.53520125, 0.40639028, 0.7958488, 0.043264065, -0.08110669, -0.28618547, 0.12722827), conv2d_5_tf); + conv2d_5_tf = MulAdd(c1, MF4x4(0.26455724, -0.36315665, -0.22116943, 0.049996275, 0.28526706, -0.0045478707, -0.20538875, 0.03192557, 0.04443011, -0.48084733, -0.32755423, 0.0075373487, 0.34481105, 0.04272154, -0.11092845, -0.07401724), conv2d_5_tf); + conv2d_5_tf = MulAdd(d1, MF4x4(0.28374255, 0.13204694, 0.041846596, -0.57726663, 1.0038753, 0.42640173, -0.045806255, 0.3795911, 0.52897507, -0.2522673, 0.37759414, 0.158503, 0.111165345, -0.033814687, -0.37906894, 0.14007671), conv2d_5_tf); + conv2d_5_tf = MulAdd(e1, MF4x4(0.30553007, -0.032092307, 0.6779135, -0.32720757, 0.29837027, 0.13522549, 0.21653146, 0.4553826, -0.22200927, -0.20921928, 0.36475468, 0.27989116, 0.6222863, -0.37027213, 0.06746388, 0.16675332), conv2d_5_tf); + conv2d_5_tf = MulAdd(f1, MF4x4(0.31677073, -0.37482786, 0.4029838, 0.43627468, 0.32849845, -0.3442297, 0.1752726, 0.37502408, -0.1561963, -0.17489041, 0.7141825, -0.13179696, 0.17682795, 0.052273672, 0.07300372, 0.20322469), conv2d_5_tf); + conv2d_5_tf = MulAdd(g1, MF4x4(0.07722791, 0.51997215, 0.2052519, -0.6162976, 0.07318059, -0.16653596, -0.0609372, -0.13199529, -0.011298448, -0.066250905, 0.11658636, -0.07317175, -0.068134755, 0.032443475, -0.27242857, 0.26479205), conv2d_5_tf); + conv2d_5_tf = MulAdd(h1, MF4x4(-0.46400046, 0.34256476, -0.074927844, -0.082626544, 0.38616362, 0.10320202, 0.7306549, -0.41960227, -0.33295953, -0.35537082, 0.040369444, 0.18173583, 0.36835003, -0.078561984, -0.13071333, -0.06847678), conv2d_5_tf); + conv2d_5_tf = MulAdd(i1, MF4x4(0.0951899, -0.21144655, 0.12174552, 0.09496668, -0.17025085, -0.36465582, 0.20724316, 0.07027979, 0.17988989, -0.16671456, -0.15068638, 0.26715076, 0.022114933, 0.14284599, -0.06316286, 0.017598677), conv2d_5_tf); + conv2d_5_tf = MulAdd(a2, MF4x4(0.22179046, -0.19104601, 0.10500515, 0.22017653, -0.065115064, -0.027006533, -0.21086605, 0.00932852, -0.6196575, 0.04396425, 0.52487534, 0.61164427, 0.15172893, 0.219877, 0.103516005, -0.103571504), conv2d_5_tf); + conv2d_5_tf = MulAdd(b2, MF4x4(0.122733794, 0.19491453, 0.22410785, -0.17341182, -0.18816754, 0.22092234, -0.055087283, -0.14617631, 0.4338981, -0.45366564, -1.4062341, 0.19594707, 0.2178627, 0.016837195, -0.2226328, 0.079190396), conv2d_5_tf); + conv2d_5_tf = MulAdd(c2, MF4x4(0.16418308, 0.14917587, 0.35162288, 0.04064204, -0.037038237, 0.06579139, -0.08464511, -0.2156906, 0.22791082, -1.1695892, 0.53665465, -0.77753544, 0.0065266103, 0.15857838, 0.010236925, 0.14953533), conv2d_5_tf); + conv2d_5_tf = MulAdd(d2, MF4x4(0.64548135, -0.02291521, -0.14370848, 0.049308565, 0.13637903, 0.14568083, -0.1488358, -0.0038734428, 0.0809154, -0.15466721, -0.06614126, -0.047732286, 0.311668, 0.22075401, 0.26094854, -0.27763176), conv2d_5_tf); + conv2d_5_tf = MulAdd(e2, MF4x4(0.12075334, -0.23920162, 0.19115442, -0.33920774, 0.15199614, 0.27974042, -0.05022236, -0.15280685, 0.37271795, -0.76389724, -0.56503266, 1.4975219, 0.24002175, -0.12661129, 0.045953903, 0.2102559), conv2d_5_tf); + conv2d_5_tf = MulAdd(f2, MF4x4(-0.02855315, -0.16729961, -0.27380818, -0.08810453, 0.061245166, 0.27268958, 0.2282609, 0.072155826, -0.65736717, -0.46307757, -0.5473049, 0.50772667, -0.1581774, 0.28763455, -0.1870661, -0.16523343), conv2d_5_tf); + conv2d_5_tf = MulAdd(g2, MF4x4(0.23464368, 0.25850806, -0.054024473, -0.13788947, -0.24835043, -0.028147692, -0.23022775, 0.11494646, 0.31069988, -0.21450949, 0.40749013, -0.073832974, -0.16241223, 0.15673774, 0.23648019, -0.34203738), conv2d_5_tf); + conv2d_5_tf = MulAdd(h2, MF4x4(-0.10198349, -0.052500926, 0.02638934, 0.19718044, -0.09078705, 0.07717591, 0.44648582, -0.30146563, -0.10124157, 0.12145466, -0.2133955, 0.16855773, -0.12310728, 0.35327804, -0.44273457, 0.20639896), conv2d_5_tf); + conv2d_5_tf = MulAdd(i2, MF4x4(0.08033835, 0.0977811, 0.007069267, -0.110171854, -0.008568571, -0.10922981, 0.12048108, -0.0835261, 0.019930357, -0.12652875, 0.02870121, 0.12214532, -0.024486745, 0.3588685, -0.16501926, 0.11914434), conv2d_5_tf); + conv2d_5_tf = MulAdd(na1, MF4x4(0.24003507, -0.040643565, -0.4267142, 0.34356147, -0.2618635, -0.1550601, -0.18566506, 0.33267352, -0.17584917, -0.24971883, 0.167064, -0.20808934, 0.3197215, 0.19626021, -0.16993162, -0.16976681), conv2d_5_tf); + conv2d_5_tf = MulAdd(nb1, MF4x4(0.159248, -0.33713767, -0.37823528, 0.25286102, -0.6171255, 0.01159639, 0.08387377, -0.0796005, -0.18405017, -0.11881008, -0.03026552, 0.030733835, 0.17692643, 0.17118043, 0.23938146, -0.40504465), conv2d_5_tf); + conv2d_5_tf = MulAdd(nc1, MF4x4(0.11274836, -0.023647472, 0.083114825, 0.5222033, -0.07415273, -0.3251913, -0.034298245, -0.07125199, 0.09593269, -0.23062208, -0.3168607, -0.13040248, -0.41249517, 0.39030293, 0.47400078, -0.109306306), conv2d_5_tf); + conv2d_5_tf = MulAdd(nd1, MF4x4(-0.49999082, 0.012254524, -0.035179958, 0.212335, -0.10354367, -0.19730526, 0.092015326, -0.07317916, -0.21900047, -0.13948579, -0.3228226, -0.22363624, -0.06421761, 0.16125691, 0.38075948, -0.31371582), conv2d_5_tf); + conv2d_5_tf = MulAdd(ne1, MF4x4(-1.0006356, -0.13763155, -0.8414047, -0.051852856, -0.44105098, 0.526086, 0.23091859, -0.6621191, -0.015348964, 0.37972412, -0.24986422, 0.13964157, -0.03184678, 0.25394693, -0.051659737, -0.34171197), conv2d_5_tf); + conv2d_5_tf = MulAdd(nf1, MF4x4(0.14520285, 0.1346628, 0.047271203, 0.64346415, -0.25639483, 0.052174076, 0.28681588, -0.32156095, 0.014350296, 0.028580237, 0.33776954, 0.06681965, -0.27312553, 0.44097883, -0.16519593, -0.7293824), conv2d_5_tf); + conv2d_5_tf = MulAdd(ng1, MF4x4(-0.65626615, -0.20801732, -0.18783297, 0.27998376, -0.51550066, -0.23272751, -0.3744558, 0.11267917, -0.1879591, 0.043539204, -0.17665562, 0.28546363, -0.20627682, 0.33176526, 0.34412766, -0.4310386), conv2d_5_tf); + conv2d_5_tf = MulAdd(nh1, MF4x4(0.51410156, -0.08615402, -0.2396778, -0.027256064, 0.11491742, -0.20842157, 0.3855824, -0.19823207, 0.0062098945, -0.2629099, 0.13158852, -0.08746773, -0.46980307, 0.57169086, -0.13392213, 0.13375558), conv2d_5_tf); + conv2d_5_tf = MulAdd(ni1, MF4x4(0.09988252, 0.19396676, -0.011215926, 0.2714918, 0.07985461, 0.30587563, 0.21915142, -0.14004244, -0.336268, 0.023702772, 0.15740578, -0.06307948, 0.06453276, 0.26978606, 0.45891464, -0.35511568), conv2d_5_tf); + conv2d_5_tf = MulAdd(na2, MF4x4(-0.33263445, -0.13086738, -0.30128893, 0.03720744, 0.46366304, -0.13430476, -0.26493385, 0.14521147, -0.025578065, -0.043376725, 0.055235144, -0.08467402, 0.12879072, 0.2621278, -0.030150373, -0.079033755), conv2d_5_tf); + conv2d_5_tf = MulAdd(nb2, MF4x4(-0.15686864, 0.06962337, -0.24032803, 0.05093969, 0.12118379, 0.2144539, 0.21314697, -0.15564163, -0.15193312, -0.15797225, 0.061610706, 0.06689548, 0.42354256, 0.24339569, 0.14413804, -0.08890708), conv2d_5_tf); + conv2d_5_tf = MulAdd(nc2, MF4x4(0.021830576, -0.0682399, -0.25052184, 0.035374403, -0.0022370394, 0.23796171, 0.40747103, -0.14309348, -0.22325014, 0.12337428, -0.0727028, 0.12374459, -0.24148722, 0.34091887, 0.5052561, -0.13712624), conv2d_5_tf); + conv2d_5_tf = MulAdd(nd2, MF4x4(-0.583754, -0.10253819, -0.26736188, -0.084894784, 0.7130811, 0.5888696, 0.24837445, 0.20670207, 0.08242887, -0.03090308, 0.24002716, -0.04146999, 0.33550006, -0.006085788, -0.2078999, 0.016955601), conv2d_5_tf); + conv2d_5_tf = MulAdd(ne2, MF4x4(-0.23921615, 1.0534316, -0.29723012, -0.06626253, 0.022887046, -0.6139072, 0.22857629, 0.4203786, -0.02951169, 0.0501039, -0.054740574, -0.15496075, 0.9533812, 0.21038955, 0.33969748, 0.18853404), conv2d_5_tf); + conv2d_5_tf = MulAdd(nf2, MF4x4(-0.13571729, -0.045776337, 0.23663524, 0.1457326, -0.23159564, -0.44608104, -0.35497522, -0.14684997, 0.042379193, 0.16966693, 0.2560789, -0.07091574, 0.010749883, -0.26966086, -0.16322245, 0.095426805), conv2d_5_tf); + conv2d_5_tf = MulAdd(ng2, MF4x4(-0.027934154, -0.25037688, 0.19623838, 0.16128206, 0.21479255, 0.4066385, -0.06756232, -0.19681008, 0.09168842, 0.46935177, -0.059632402, -0.3419115, 0.2789002, 0.012714867, 0.15322958, 0.05255599), conv2d_5_tf); + conv2d_5_tf = MulAdd(nh2, MF4x4(0.2074098, -0.19564646, 0.21713807, -0.29207307, -0.08546043, 0.122562535, -0.5150736, 0.5190804, -0.116998374, 0.17080544, -0.29132518, 0.47585255, -0.14625762, -0.026589578, -0.13111407, 0.03473621), conv2d_5_tf); + conv2d_5_tf = MulAdd(ni2, MF4x4(-0.3399405, 0.063775875, -0.0121724615, 0.13809827, -0.1575877, 0.13529225, -0.28708464, -0.063552216, 0.08623843, 0.034867074, 0.25082812, -0.038863987, 0.08048017, -0.43998414, -0.05038377, -0.20123458), conv2d_5_tf); + MF4 nconv2d_5_tf = max(-conv2d_5_tf, 0); conv2d_5_tf = max(conv2d_5_tf, 0); - min16float4 conv2d_4_tf = mul(a1, min16float4x4(0.259803, 0.14121838, -0.3216694, 0.16912009, -0.24997918, -0.024859427, 0.07951931, -0.17898253, 0.14770418, -0.38608834, 0.7155576, -0.008749993, 0.106385805, -0.08190305, 0.06277034, 0.05247095)); - conv2d_4_tf += mul(b1, min16float4x4(-0.10331291, 0.29847905, -0.20864278, -0.34607938, -0.0629403, 0.24202278, 0.15617771, 0.09471163, 0.29827452, -0.5237911, 0.8446165, -0.038001515, 0.085504964, -0.012998129, -0.12903701, -0.068084855)); - conv2d_4_tf += mul(c1, min16float4x4(-0.028803846, 0.117718086, 0.11924323, -0.23554896, -0.31169716, 0.2164557, 0.054745417, -0.2886858, 0.34304592, -0.15872054, 0.21533915, 0.23624876, -0.02507208, 0.16001348, -0.14645866, -0.013143789)); - conv2d_4_tf += mul(d1, min16float4x4(0.12311184, 0.16843726, -0.5478087, 0.036556758, -0.0024939126, -0.12264501, 0.090127975, -0.14638199, -0.33366996, 0.1817309, 0.018728942, -0.025097579, -0.00233696, 0.15182042, -0.072947, -0.15065937)); - conv2d_4_tf += mul(e1, min16float4x4(0.3238381, 0.19316678, 0.23307748, -0.10455285, -0.35405514, -0.06559013, 0.4206979, 0.08059919, -0.26130152, -0.23416454, -0.21285532, 0.07799376, 0.12372864, -0.3774056, 0.022239799, 0.22356819)); - conv2d_4_tf += mul(f1, min16float4x4(0.066345, 0.20370135, -0.01601085, 0.014701113, 0.27098605, 0.25511372, -0.048403386, -0.014162313, 0.11301996, -0.09638182, 0.12047054, -0.010323633, 0.21627729, 0.18377618, -0.12752205, -0.0668105)); - conv2d_4_tf += mul(g1, min16float4x4(0.18890683, -0.21100806, -0.38314816, 0.12188494, -0.09069559, 0.1785706, -0.19502263, -0.22853898, -0.096488185, 0.18105212, -0.0045291157, -0.018952737, 0.14934972, -0.17416078, 0.05363704, -0.17642738)); - conv2d_4_tf += mul(h1, min16float4x4(-0.15392087, 0.13997103, -0.12765433, -0.054465868, 0.0061383434, 0.03424787, -0.08585949, -0.10249745, -0.055375032, -0.047258787, -0.10105776, 0.09468892, 0.32030013, -0.14938186, 0.18287018, 0.007592655)); - conv2d_4_tf += mul(i1, min16float4x4(0.109669484, 0.02212132, 0.038995523, -0.0041161263, -0.12115841, -0.048061926, 0.06674463, -0.33846095, 0.04251217, -0.05917749, 0.17834029, 0.010219928, 0.2690458, 0.09282476, 0.077470005, -0.07310091)); - conv2d_4_tf += mul(a2, min16float4x4(0.4314233, 0.035379685, 0.27331847, 0.19597715, -0.09619968, -0.055907905, 0.07898602, 0.031254813, -0.09366987, -0.37436283, 0.061305135, -0.32644534, -0.16999187, 0.06906536, -0.1228417, -0.09826574)); - conv2d_4_tf += mul(b2, min16float4x4(0.6059936, -0.10060162, -0.18080838, 0.26205355, 0.033052504, -0.10625297, -0.0038814575, 0.026052764, 0.19484659, -0.24242568, 0.8054419, -0.3437365, -0.010305425, -0.079504244, 0.11879563, -0.14375582)); - conv2d_4_tf += mul(c2, min16float4x4(0.23313539, -0.026485069, 0.13332158, 0.28462213, -0.19786534, 0.048259735, 0.024113638, 0.23403068, -1.0330093, 0.0059400625, 0.23721488, -1.379481, 0.12166913, -0.07133997, 0.060898513, 0.092720084)); - conv2d_4_tf += mul(d2, min16float4x4(0.16513251, 0.013819962, -0.009859532, -0.037474833, 0.25651336, -0.131653, 0.03145131, -0.27886832, 0.27808505, -0.099978246, -0.11189488, 0.053313572, 0.11455811, 0.10826371, 0.0017301271, -0.041959)); - conv2d_4_tf += mul(e2, min16float4x4(-0.037442397, 0.061722398, 0.099159, -0.18970016, -0.13042277, 0.16767356, -0.028342545, 0.18715699, 0.22246139, 0.3154743, -0.39717823, 0.26053482, -0.012097491, 0.1746896, 0.3899962, -0.13013846)); - conv2d_4_tf += mul(f2, min16float4x4(-0.14552362, -0.26800197, 0.09035887, 0.24266347, -0.14494316, 0.033814326, -0.06647855, -0.16609156, 0.30540654, 0.037082594, 0.14951941, 0.12753695, -0.045153987, -0.28476146, 0.37640104, -0.04667195)); - conv2d_4_tf += mul(g2, min16float4x4(0.2071077, -0.09297775, -0.04906301, -0.24280597, 0.15925987, -0.05631783, 0.08169953, -0.20124075, 0.23060048, -0.05786468, 0.23959383, 0.1620485, 0.14333409, -0.12757483, -0.1424963, 0.13118197)); - conv2d_4_tf += mul(h2, min16float4x4(-0.101942524, -0.02240319, 0.11718157, -0.13591368, 0.11223302, -0.042933583, -0.07766777, 0.01667011, 0.07462998, 0.020704709, -0.04329035, -0.01358702, 0.13569939, 0.015980164, -0.08001042, 0.13890027)); - conv2d_4_tf += mul(i2, min16float4x4(0.01755685, -0.047599614, 0.06456479, -0.08004052, 0.08108282, 0.06789228, -0.14048836, -0.020240005, 0.039701223, 0.023405846, 0.06305444, -0.046804685, 0.040620867, 0.013529182, -0.094961315, 0.02959053)); - conv2d_4_tf += mul(na1, min16float4x4(-0.053775985, -0.0060494044, 0.14724614, 0.07248909, -0.056616947, 0.0004714896, -0.18737504, -0.15240799, -0.030883765, -0.007487297, -0.0044565946, 0.15024893, -0.16870505, 0.09338804, -0.21873595, -0.14493267)); - conv2d_4_tf += mul(nb1, min16float4x4(-0.045113027, -0.2153715, 0.04520989, 0.26561612, -0.12634845, -0.10975088, -0.3677834, -0.4343602, -0.34146985, 0.29135808, 0.026339425, -0.0995021, 0.012693227, 0.07312179, 0.21671581, 0.11961088)); - conv2d_4_tf += mul(nc1, min16float4x4(0.19766524, -0.31538734, 0.35708517, 0.33092737, 0.027086282, 0.024219114, -0.15289012, -0.18128034, -0.16041638, 0.057314564, 0.079830885, -0.08828221, 0.11828446, -0.13336371, -0.078453206, 0.21232514)); - conv2d_4_tf += mul(nd1, min16float4x4(-0.13100033, -0.24849984, 0.3087074, 0.017271562, -0.17455627, -0.014364008, 0.077686995, -0.015820628, 0.18584616, -0.16705278, -0.3169503, 0.09107534, -0.04958684, -0.008202742, 0.024148908, -0.04654239)); - conv2d_4_tf += mul(ne1, min16float4x4(-0.16020702, -0.18623418, -0.29434547, 0.5008317, 0.23796988, -0.11154579, -0.5167728, -0.14195764, 0.15495163, -0.028505204, -0.2105556, 0.22491512, -0.11658545, 0.31665426, 0.35085753, -0.40148884)); - conv2d_4_tf += mul(nf1, min16float4x4(0.24866697, -0.3752738, 0.8472619, 0.16663249, -0.25808626, -0.037561346, -0.1440471, -0.107407264, 0.016663626, 0.1599037, -0.31926402, 0.15272903, -0.14700623, -0.05275371, 0.061130624, 0.084672675)); - conv2d_4_tf += mul(ng1, min16float4x4(-0.24184473, -0.016008917, 0.040023588, 0.1517675, -0.1339458, 0.009985992, 0.15634708, -0.07649679, 0.0021696684, -0.07027257, -0.07509208, -0.27060902, -0.21299353, 0.12154156, -0.3159698, 0.2511261)); - conv2d_4_tf += mul(nh1, min16float4x4(0.19845779, 0.023986215, -0.073409855, 0.0812208, 0.013382121, -0.049414996, -0.12990347, 0.052681953, -0.12787153, -0.100129806, -0.036296804, -0.13915883, -0.24022135, 0.167096, -0.15128131, 0.17779276)); - conv2d_4_tf += mul(ni1, min16float4x4(-0.05787442, -0.19698323, 0.13090582, 0.1501304, -0.09954089, -0.008470983, -0.095334776, 0.114635326, -0.16330223, -0.046815667, -0.086304545, -0.15729928, -0.1982723, 0.10607274, -0.25540838, 0.09633669)); - conv2d_4_tf += mul(na2, min16float4x4(-0.25680968, -0.18444876, 0.053333476, 0.10470261, 0.17798793, -0.108659215, 0.1787569, -0.027407814, 0.12637395, -0.038193744, -0.16185284, 0.14068736, 0.092281684, 0.022276353, 0.013779975, 0.026369803)); - conv2d_4_tf += mul(nb2, min16float4x4(-0.17329752, 0.21632285, -0.036964342, 0.30856085, 0.015225849, 0.04158692, -0.010607313, 0.16295516, 0.18873654, 0.24728407, 0.09787, -0.14381099, -0.091119304, 0.12914585, -0.039659716, -0.10700463)); - conv2d_4_tf += mul(nc2, min16float4x4(-0.037163302, 0.05201725, -0.149489, -0.05682234, -0.022634465, -0.074764505, -0.010783339, 0.028970495, -0.045976285, -0.1923207, -0.037494432, -0.13024884, -0.1957353, 0.013454359, -0.30236122, -0.078870796)); - conv2d_4_tf += mul(nd2, min16float4x4(-0.17753989, -0.1549664, 0.08087595, 0.046868976, -0.09354348, 0.22648604, 0.002651186, 0.11890617, -0.0073132347, 0.05030891, -0.08128038, 0.14395374, -0.001108739, -0.030957213, -0.03568773, 0.055131156)); - conv2d_4_tf += mul(ne2, min16float4x4(-0.029484594, -0.013036961, -0.31721568, 0.11611545, -0.24111903, -0.33007705, 0.5950326, -0.070911475, -0.04757172, -0.037676062, -0.14590797, 0.076822214, -0.1672743, -0.41848892, 0.39202756, -0.30958134)); - conv2d_4_tf += mul(nf2, min16float4x4(0.17605461, 0.12216047, -0.02412872, -0.14132546, -0.052373543, 0.08169531, 0.18497281, 0.074685514, -0.055427983, 0.14018987, -0.11671619, 0.108945735, -0.032986425, 0.11385016, 0.05801377, -0.1457665)); - conv2d_4_tf += mul(ng2, min16float4x4(-0.27222672, -0.0074164676, 0.35768685, 0.0074552484, 0.16729778, 0.14860032, -0.3657366, 0.24510175, -0.0621289, -0.0137252435, -0.26145887, 0.0556681, -0.07332952, 0.13122542, -0.020396946, 0.113705456)); - conv2d_4_tf += mul(nh2, min16float4x4(0.08118381, -0.06442098, 0.00044297878, 0.13279027, -0.20708169, 0.11252618, -0.033728387, -0.0105973175, -0.2138218, 0.34612998, -0.15597765, 0.18179017, -0.007853463, -0.045547944, 0.22064093, 0.0548327)); - conv2d_4_tf += mul(ni2, min16float4x4(-0.10656318, -0.014200068, 0.062040597, -0.037210476, -0.07271065, -0.027337732, -0.14988437, -0.14711551, -0.028843492, -0.0046596485, -0.15023676, 0.08530336, -0.016875269, -0.024734195, 0.055177588, 0.010381644)); - conv2d_4_tf += min16float4(-0.021330277, -0.09496422, -0.1339419, 0.012216251); + MF4 conv2d_4_tf = MF4(-0.021330277, -0.09496422, -0.1339419, 0.012216251); + conv2d_4_tf = MulAdd(a1, MF4x4(0.259803, 0.14121838, -0.3216694, 0.16912009, -0.24997918, -0.024859427, 0.07951931, -0.17898253, 0.14770418, -0.38608834, 0.7155576, -0.008749993, 0.106385805, -0.08190305, 0.06277034, 0.05247095), conv2d_4_tf); + conv2d_4_tf = MulAdd(b1, MF4x4(-0.10331291, 0.29847905, -0.20864278, -0.34607938, -0.0629403, 0.24202278, 0.15617771, 0.09471163, 0.29827452, -0.5237911, 0.8446165, -0.038001515, 0.085504964, -0.012998129, -0.12903701, -0.068084855), conv2d_4_tf); + conv2d_4_tf = MulAdd(c1, MF4x4(-0.028803846, 0.117718086, 0.11924323, -0.23554896, -0.31169716, 0.2164557, 0.054745417, -0.2886858, 0.34304592, -0.15872054, 0.21533915, 0.23624876, -0.02507208, 0.16001348, -0.14645866, -0.013143789), conv2d_4_tf); + conv2d_4_tf = MulAdd(d1, MF4x4(0.12311184, 0.16843726, -0.5478087, 0.036556758, -0.0024939126, -0.12264501, 0.090127975, -0.14638199, -0.33366996, 0.1817309, 0.018728942, -0.025097579, -0.00233696, 0.15182042, -0.072947, -0.15065937), conv2d_4_tf); + conv2d_4_tf = MulAdd(e1, MF4x4(0.3238381, 0.19316678, 0.23307748, -0.10455285, -0.35405514, -0.06559013, 0.4206979, 0.08059919, -0.26130152, -0.23416454, -0.21285532, 0.07799376, 0.12372864, -0.3774056, 0.022239799, 0.22356819), conv2d_4_tf); + conv2d_4_tf = MulAdd(f1, MF4x4(0.066345, 0.20370135, -0.01601085, 0.014701113, 0.27098605, 0.25511372, -0.048403386, -0.014162313, 0.11301996, -0.09638182, 0.12047054, -0.010323633, 0.21627729, 0.18377618, -0.12752205, -0.0668105), conv2d_4_tf); + conv2d_4_tf = MulAdd(g1, MF4x4(0.18890683, -0.21100806, -0.38314816, 0.12188494, -0.09069559, 0.1785706, -0.19502263, -0.22853898, -0.096488185, 0.18105212, -0.0045291157, -0.018952737, 0.14934972, -0.17416078, 0.05363704, -0.17642738), conv2d_4_tf); + conv2d_4_tf = MulAdd(h1, MF4x4(-0.15392087, 0.13997103, -0.12765433, -0.054465868, 0.0061383434, 0.03424787, -0.08585949, -0.10249745, -0.055375032, -0.047258787, -0.10105776, 0.09468892, 0.32030013, -0.14938186, 0.18287018, 0.007592655), conv2d_4_tf); + conv2d_4_tf = MulAdd(i1, MF4x4(0.109669484, 0.02212132, 0.038995523, -0.0041161263, -0.12115841, -0.048061926, 0.06674463, -0.33846095, 0.04251217, -0.05917749, 0.17834029, 0.010219928, 0.2690458, 0.09282476, 0.077470005, -0.07310091), conv2d_4_tf); + conv2d_4_tf = MulAdd(a2, MF4x4(0.4314233, 0.035379685, 0.27331847, 0.19597715, -0.09619968, -0.055907905, 0.07898602, 0.031254813, -0.09366987, -0.37436283, 0.061305135, -0.32644534, -0.16999187, 0.06906536, -0.1228417, -0.09826574), conv2d_4_tf); + conv2d_4_tf = MulAdd(b2, MF4x4(0.6059936, -0.10060162, -0.18080838, 0.26205355, 0.033052504, -0.10625297, -0.0038814575, 0.026052764, 0.19484659, -0.24242568, 0.8054419, -0.3437365, -0.010305425, -0.079504244, 0.11879563, -0.14375582), conv2d_4_tf); + conv2d_4_tf = MulAdd(c2, MF4x4(0.23313539, -0.026485069, 0.13332158, 0.28462213, -0.19786534, 0.048259735, 0.024113638, 0.23403068, -1.0330093, 0.0059400625, 0.23721488, -1.379481, 0.12166913, -0.07133997, 0.060898513, 0.092720084), conv2d_4_tf); + conv2d_4_tf = MulAdd(d2, MF4x4(0.16513251, 0.013819962, -0.009859532, -0.037474833, 0.25651336, -0.131653, 0.03145131, -0.27886832, 0.27808505, -0.099978246, -0.11189488, 0.053313572, 0.11455811, 0.10826371, 0.0017301271, -0.041959), conv2d_4_tf); + conv2d_4_tf = MulAdd(e2, MF4x4(-0.037442397, 0.061722398, 0.099159, -0.18970016, -0.13042277, 0.16767356, -0.028342545, 0.18715699, 0.22246139, 0.3154743, -0.39717823, 0.26053482, -0.012097491, 0.1746896, 0.3899962, -0.13013846), conv2d_4_tf); + conv2d_4_tf = MulAdd(f2, MF4x4(-0.14552362, -0.26800197, 0.09035887, 0.24266347, -0.14494316, 0.033814326, -0.06647855, -0.16609156, 0.30540654, 0.037082594, 0.14951941, 0.12753695, -0.045153987, -0.28476146, 0.37640104, -0.04667195), conv2d_4_tf); + conv2d_4_tf = MulAdd(g2, MF4x4(0.2071077, -0.09297775, -0.04906301, -0.24280597, 0.15925987, -0.05631783, 0.08169953, -0.20124075, 0.23060048, -0.05786468, 0.23959383, 0.1620485, 0.14333409, -0.12757483, -0.1424963, 0.13118197), conv2d_4_tf); + conv2d_4_tf = MulAdd(h2, MF4x4(-0.101942524, -0.02240319, 0.11718157, -0.13591368, 0.11223302, -0.042933583, -0.07766777, 0.01667011, 0.07462998, 0.020704709, -0.04329035, -0.01358702, 0.13569939, 0.015980164, -0.08001042, 0.13890027), conv2d_4_tf); + conv2d_4_tf = MulAdd(i2, MF4x4(0.01755685, -0.047599614, 0.06456479, -0.08004052, 0.08108282, 0.06789228, -0.14048836, -0.020240005, 0.039701223, 0.023405846, 0.06305444, -0.046804685, 0.040620867, 0.013529182, -0.094961315, 0.02959053), conv2d_4_tf); + conv2d_4_tf = MulAdd(na1, MF4x4(-0.053775985, -0.0060494044, 0.14724614, 0.07248909, -0.056616947, 0.0004714896, -0.18737504, -0.15240799, -0.030883765, -0.007487297, -0.0044565946, 0.15024893, -0.16870505, 0.09338804, -0.21873595, -0.14493267), conv2d_4_tf); + conv2d_4_tf = MulAdd(nb1, MF4x4(-0.045113027, -0.2153715, 0.04520989, 0.26561612, -0.12634845, -0.10975088, -0.3677834, -0.4343602, -0.34146985, 0.29135808, 0.026339425, -0.0995021, 0.012693227, 0.07312179, 0.21671581, 0.11961088), conv2d_4_tf); + conv2d_4_tf = MulAdd(nc1, MF4x4(0.19766524, -0.31538734, 0.35708517, 0.33092737, 0.027086282, 0.024219114, -0.15289012, -0.18128034, -0.16041638, 0.057314564, 0.079830885, -0.08828221, 0.11828446, -0.13336371, -0.078453206, 0.21232514), conv2d_4_tf); + conv2d_4_tf = MulAdd(nd1, MF4x4(-0.13100033, -0.24849984, 0.3087074, 0.017271562, -0.17455627, -0.014364008, 0.077686995, -0.015820628, 0.18584616, -0.16705278, -0.3169503, 0.09107534, -0.04958684, -0.008202742, 0.024148908, -0.04654239), conv2d_4_tf); + conv2d_4_tf = MulAdd(ne1, MF4x4(-0.16020702, -0.18623418, -0.29434547, 0.5008317, 0.23796988, -0.11154579, -0.5167728, -0.14195764, 0.15495163, -0.028505204, -0.2105556, 0.22491512, -0.11658545, 0.31665426, 0.35085753, -0.40148884), conv2d_4_tf); + conv2d_4_tf = MulAdd(nf1, MF4x4(0.24866697, -0.3752738, 0.8472619, 0.16663249, -0.25808626, -0.037561346, -0.1440471, -0.107407264, 0.016663626, 0.1599037, -0.31926402, 0.15272903, -0.14700623, -0.05275371, 0.061130624, 0.084672675), conv2d_4_tf); + conv2d_4_tf = MulAdd(ng1, MF4x4(-0.24184473, -0.016008917, 0.040023588, 0.1517675, -0.1339458, 0.009985992, 0.15634708, -0.07649679, 0.0021696684, -0.07027257, -0.07509208, -0.27060902, -0.21299353, 0.12154156, -0.3159698, 0.2511261), conv2d_4_tf); + conv2d_4_tf = MulAdd(nh1, MF4x4(0.19845779, 0.023986215, -0.073409855, 0.0812208, 0.013382121, -0.049414996, -0.12990347, 0.052681953, -0.12787153, -0.100129806, -0.036296804, -0.13915883, -0.24022135, 0.167096, -0.15128131, 0.17779276), conv2d_4_tf); + conv2d_4_tf = MulAdd(ni1, MF4x4(-0.05787442, -0.19698323, 0.13090582, 0.1501304, -0.09954089, -0.008470983, -0.095334776, 0.114635326, -0.16330223, -0.046815667, -0.086304545, -0.15729928, -0.1982723, 0.10607274, -0.25540838, 0.09633669), conv2d_4_tf); + conv2d_4_tf = MulAdd(na2, MF4x4(-0.25680968, -0.18444876, 0.053333476, 0.10470261, 0.17798793, -0.108659215, 0.1787569, -0.027407814, 0.12637395, -0.038193744, -0.16185284, 0.14068736, 0.092281684, 0.022276353, 0.013779975, 0.026369803), conv2d_4_tf); + conv2d_4_tf = MulAdd(nb2, MF4x4(-0.17329752, 0.21632285, -0.036964342, 0.30856085, 0.015225849, 0.04158692, -0.010607313, 0.16295516, 0.18873654, 0.24728407, 0.09787, -0.14381099, -0.091119304, 0.12914585, -0.039659716, -0.10700463), conv2d_4_tf); + conv2d_4_tf = MulAdd(nc2, MF4x4(-0.037163302, 0.05201725, -0.149489, -0.05682234, -0.022634465, -0.074764505, -0.010783339, 0.028970495, -0.045976285, -0.1923207, -0.037494432, -0.13024884, -0.1957353, 0.013454359, -0.30236122, -0.078870796), conv2d_4_tf); + conv2d_4_tf = MulAdd(nd2, MF4x4(-0.17753989, -0.1549664, 0.08087595, 0.046868976, -0.09354348, 0.22648604, 0.002651186, 0.11890617, -0.0073132347, 0.05030891, -0.08128038, 0.14395374, -0.001108739, -0.030957213, -0.03568773, 0.055131156), conv2d_4_tf); + conv2d_4_tf = MulAdd(ne2, MF4x4(-0.029484594, -0.013036961, -0.31721568, 0.11611545, -0.24111903, -0.33007705, 0.5950326, -0.070911475, -0.04757172, -0.037676062, -0.14590797, 0.076822214, -0.1672743, -0.41848892, 0.39202756, -0.30958134), conv2d_4_tf); + conv2d_4_tf = MulAdd(nf2, MF4x4(0.17605461, 0.12216047, -0.02412872, -0.14132546, -0.052373543, 0.08169531, 0.18497281, 0.074685514, -0.055427983, 0.14018987, -0.11671619, 0.108945735, -0.032986425, 0.11385016, 0.05801377, -0.1457665), conv2d_4_tf); + conv2d_4_tf = MulAdd(ng2, MF4x4(-0.27222672, -0.0074164676, 0.35768685, 0.0074552484, 0.16729778, 0.14860032, -0.3657366, 0.24510175, -0.0621289, -0.0137252435, -0.26145887, 0.0556681, -0.07332952, 0.13122542, -0.020396946, 0.113705456), conv2d_4_tf); + conv2d_4_tf = MulAdd(nh2, MF4x4(0.08118381, -0.06442098, 0.00044297878, 0.13279027, -0.20708169, 0.11252618, -0.033728387, -0.0105973175, -0.2138218, 0.34612998, -0.15597765, 0.18179017, -0.007853463, -0.045547944, 0.22064093, 0.0548327), conv2d_4_tf); + conv2d_4_tf = MulAdd(ni2, MF4x4(-0.10656318, -0.014200068, 0.062040597, -0.037210476, -0.07271065, -0.027337732, -0.14988437, -0.14711551, -0.028843492, -0.0046596485, -0.15023676, 0.08530336, -0.016875269, -0.024734195, 0.055177588, 0.010381644), conv2d_4_tf); tex6[gxy] = conv2d_4_tf; - min16float4 nconv2d_4_tf = max(-conv2d_4_tf, 0); + MF4 nconv2d_4_tf = max(-conv2d_4_tf, 0); conv2d_4_tf = max(conv2d_4_tf, 0); - min16float4 target = mul(e1, min16float4x4(-0.4756803, -0.16041027, 0.30747655, 0.27719444, 0.33626345, -0.093426555, -0.08751585, -0.025898175, 0.12469858, 0.162526, 0.071950376, 0.36727026, -0.26165214, 0.17652564, -0.081568465, 0.17669047)); - target += mul(e2, min16float4x4(0.10045615, -0.47277164, 0.13970673, -0.036603283, 0.10723418, -0.0733819, 0.07046736, 0.04479655, -0.5100679, 0.4051206, -0.3043826, 0.07709692, 0.25090587, -0.5827475, 0.27195984, 0.42297873)); - target += mul(ne1, min16float4x4(-0.34415862, -0.056642354, -0.32332316, 0.049897127, 0.08399151, 0.683046, -0.16349371, -0.4878456, -0.097749546, 0.7214421, -0.2821467, -0.16691755, 0.3712332, -0.71557045, 0.40365914, 0.37325174)); - target += mul(ne2, min16float4x4(-0.333854, 0.11971563, -0.26533902, -0.033346854, 0.09896302, -0.19311592, -0.006087015, -0.104003794, 0.05347405, -0.16057043, 0.15876219, 0.1538847, -0.07954591, 0.24062383, -0.025401022, -0.33599105)); - target += mul(conv2d_5_tf, min16float4x4(0.11794056, -0.0031797416, 0.08360105, 0.12222232, -0.16638078, 0.26014742, -0.047267277, -0.27900735, 0.17616066, -0.12788172, 0.22856903, -0.39034957, -0.36313176, 0.12272574, 0.2235959, -0.31102005)); - target += mul(nconv2d_5_tf, min16float4x4(0.03297161, 0.19597028, -0.068131894, -0.059938233, 0.18935929, -0.12004069, 0.08705267, 0.26411813, -0.021374375, 0.24630849, -0.08980925, 0.15982057, 0.3533297, -0.15414584, -0.19008748, 0.11310849)); - target += mul(conv2d_1_tf, min16float4x4(-0.4622819, 0.31923467, -0.38989246, 0.5539857, -0.035433546, -0.12729715, -0.0669769, -0.048216928, -0.32078394, 0.26958883, 0.08897814, -0.31043166, 0.26743132, 0.38835636, -0.30535862, -0.22241123)); - target += mul(nconv2d_1_tf, min16float4x4(0.47431698, -0.755935, -0.075302646, 0.27771655, 0.052087527, -0.17221431, 0.0008429987, 0.15527548, -0.04587466, -0.11802989, 0.39905685, -0.07758683, -0.11415051, 0.004637339, -0.19803126, 0.19956517)); - target += mul(conv2d_4_tf, min16float4x4(0.36277947, -0.13364364, 0.18459712, -0.1705512, -0.46083033, 0.43629453, 0.112646095, -0.18511245, 0.037818372, 0.1220617, -0.22268273, -0.11983507, -0.5432721, -0.2102279, -0.014456884, 0.16428374)); - target += mul(nconv2d_4_tf, min16float4x4(0.22811654, 0.16262956, 0.18411161, 0.49102694, -0.15078211, -0.6144134, -0.11632199, 0.2740543, -0.11322067, -0.16751853, 0.18453367, 0.14305107, 0.36418238, -0.34248996, -0.055178564, 0.37168074)); - target += min16float4(0.07878663, -0.045328207, -0.07142425, -0.006036755); + MF4 target = MF4(0.07878663, -0.045328207, -0.07142425, -0.006036755); + target = MulAdd(e1, MF4x4(-0.4756803, -0.16041027, 0.30747655, 0.27719444, 0.33626345, -0.093426555, -0.08751585, -0.025898175, 0.12469858, 0.162526, 0.071950376, 0.36727026, -0.26165214, 0.17652564, -0.081568465, 0.17669047), target); + target = MulAdd(e2, MF4x4(0.10045615, -0.47277164, 0.13970673, -0.036603283, 0.10723418, -0.0733819, 0.07046736, 0.04479655, -0.5100679, 0.4051206, -0.3043826, 0.07709692, 0.25090587, -0.5827475, 0.27195984, 0.42297873), target); + target = MulAdd(ne1, MF4x4(-0.34415862, -0.056642354, -0.32332316, 0.049897127, 0.08399151, 0.683046, -0.16349371, -0.4878456, -0.097749546, 0.7214421, -0.2821467, -0.16691755, 0.3712332, -0.71557045, 0.40365914, 0.37325174), target); + target = MulAdd(ne2, MF4x4(-0.333854, 0.11971563, -0.26533902, -0.033346854, 0.09896302, -0.19311592, -0.006087015, -0.104003794, 0.05347405, -0.16057043, 0.15876219, 0.1538847, -0.07954591, 0.24062383, -0.025401022, -0.33599105), target); + target = MulAdd(conv2d_5_tf, MF4x4(0.11794056, -0.0031797416, 0.08360105, 0.12222232, -0.16638078, 0.26014742, -0.047267277, -0.27900735, 0.17616066, -0.12788172, 0.22856903, -0.39034957, -0.36313176, 0.12272574, 0.2235959, -0.31102005), target); + target = MulAdd(nconv2d_5_tf, MF4x4(0.03297161, 0.19597028, -0.068131894, -0.059938233, 0.18935929, -0.12004069, 0.08705267, 0.26411813, -0.021374375, 0.24630849, -0.08980925, 0.15982057, 0.3533297, -0.15414584, -0.19008748, 0.11310849), target); + target = MulAdd(conv2d_1_tf, MF4x4(-0.4622819, 0.31923467, -0.38989246, 0.5539857, -0.035433546, -0.12729715, -0.0669769, -0.048216928, -0.32078394, 0.26958883, 0.08897814, -0.31043166, 0.26743132, 0.38835636, -0.30535862, -0.22241123), target); + target = MulAdd(nconv2d_1_tf, MF4x4(0.47431698, -0.755935, -0.075302646, 0.27771655, 0.052087527, -0.17221431, 0.0008429987, 0.15527548, -0.04587466, -0.11802989, 0.39905685, -0.07758683, -0.11415051, 0.004637339, -0.19803126, 0.19956517), target); + target = MulAdd(conv2d_4_tf, MF4x4(0.36277947, -0.13364364, 0.18459712, -0.1705512, -0.46083033, 0.43629453, 0.112646095, -0.18511245, 0.037818372, 0.1220617, -0.22268273, -0.11983507, -0.5432721, -0.2102279, -0.014456884, 0.16428374), target); + target = MulAdd(nconv2d_4_tf, MF4x4(0.22811654, 0.16262956, 0.18411161, 0.49102694, -0.15078211, -0.6144134, -0.11632199, 0.2740543, -0.11322067, -0.16751853, 0.18453367, 0.14305107, 0.36418238, -0.34248996, -0.055178564, 0.37168074), target); tex1[gxy] = target; - target = mul(e1, min16float4x4(-0.35645446, -0.01804877, -0.53608185, 0.32968932, 0.13975728, -0.1716116, 0.09503091, -0.12088551, 0.30239868, 0.9217966, 0.016221086, -0.26894137, -0.0047026747, 0.54764843, -0.2826915, 0.0016894634)); - target += mul(e2, min16float4x4(-0.15123259, 0.2014175, 0.05961645, -0.32386652, -0.25275725, 0.3658508, -0.104193784, -0.02756655, 0.2696138, 0.17608197, 0.17685752, 0.6808081, -0.40293297, 0.48387393, 0.25278264, 0.28291366)); - target += mul(ne1, min16float4x4(-0.18928573, -0.18908137, 0.47045723, 0.5454373, 0.31339395, -0.0064702537, -0.37307036, -0.37479213, 0.2235379, -0.370863, 0.02827034, 0.024350066, -0.32538193, -0.33686417, 0.8949382, 0.3324315)); - target += mul(ne2, min16float4x4(-0.17215039, -0.14995, -0.4451278, 0.30758965, 0.21607, 0.08995007, 0.09553425, -0.21233945, -0.14442022, 0.09295349, -0.29228872, -0.3875935, 0.11704046, -0.4206096, 0.35226774, -0.08189522)); - target += mul(conv2d_5_tf, min16float4x4(-0.12517966, 0.060051568, -0.38888076, 0.08354471, 0.17010468, -0.34286287, -0.06961373, 0.032387406, -0.025718998, -0.1661844, -0.075671494, 0.10289619, -0.28309906, -0.14461538, 0.22726184, 0.4752376)); - target += mul(nconv2d_5_tf, min16float4x4(0.15411675, 0.17533994, 0.3406641, -0.0597274, -0.21072194, 0.1517182, 0.032032263, 0.18653658, 0.20970167, -0.10793765, -0.05335404, -0.095203936, 0.2917104, -0.1170929, -0.11652503, -0.46912733)); - target += mul(conv2d_1_tf, min16float4x4(-0.272871, 0.07467413, 0.16981912, 0.57318956, 0.35038894, -0.06679483, 0.3777534, -0.01522816, 0.2588504, -0.008976239, 0.31769443, 0.07070477, 0.059302222, 0.28855336, -0.14700443, -0.08605704)); - target += mul(nconv2d_1_tf, min16float4x4(-0.27067363, -0.2191635, -0.2377148, -1.0028448, -0.25673935, 0.10997322, -0.39032057, 0.06524818, 0.5248202, 0.40049195, 0.6711809, 0.2878331, 0.19606547, -0.092196286, 0.27838528, 0.03120515)); - target += mul(conv2d_4_tf, min16float4x4(0.3029178, -0.027027214, 0.13855064, -0.16550988, 0.2354576, -0.1715326, 0.12981784, 0.5013446, 0.24411377, -0.13030572, -0.08595908, -0.104394995, 0.16794646, -0.044388745, 0.2807999, 0.39108425)); - target += mul(nconv2d_4_tf, min16float4x4(-0.05535261, -0.15662162, 0.14935054, 0.10706811, 0.026958441, -0.15323113, -0.19261432, -0.24361719, -0.2607876, 0.038486157, -0.04509224, 0.18722118, -0.14478058, 0.03614682, -0.12608361, -0.5203596)); - target += min16float4(-0.17363991, 0.071162574, -0.09289675, 0.013446863); + target = MF4(-0.17363991, 0.071162574, -0.09289675, 0.013446863); + target = MulAdd(e1, MF4x4(-0.35645446, -0.01804877, -0.53608185, 0.32968932, 0.13975728, -0.1716116, 0.09503091, -0.12088551, 0.30239868, 0.9217966, 0.016221086, -0.26894137, -0.0047026747, 0.54764843, -0.2826915, 0.0016894634), target); + target = MulAdd(e2, MF4x4(-0.15123259, 0.2014175, 0.05961645, -0.32386652, -0.25275725, 0.3658508, -0.104193784, -0.02756655, 0.2696138, 0.17608197, 0.17685752, 0.6808081, -0.40293297, 0.48387393, 0.25278264, 0.28291366), target); + target = MulAdd(ne1, MF4x4(-0.18928573, -0.18908137, 0.47045723, 0.5454373, 0.31339395, -0.0064702537, -0.37307036, -0.37479213, 0.2235379, -0.370863, 0.02827034, 0.024350066, -0.32538193, -0.33686417, 0.8949382, 0.3324315), target); + target = MulAdd(ne2, MF4x4(-0.17215039, -0.14995, -0.4451278, 0.30758965, 0.21607, 0.08995007, 0.09553425, -0.21233945, -0.14442022, 0.09295349, -0.29228872, -0.3875935, 0.11704046, -0.4206096, 0.35226774, -0.08189522), target); + target = MulAdd(conv2d_5_tf, MF4x4(-0.12517966, 0.060051568, -0.38888076, 0.08354471, 0.17010468, -0.34286287, -0.06961373, 0.032387406, -0.025718998, -0.1661844, -0.075671494, 0.10289619, -0.28309906, -0.14461538, 0.22726184, 0.4752376), target); + target = MulAdd(nconv2d_5_tf, MF4x4(0.15411675, 0.17533994, 0.3406641, -0.0597274, -0.21072194, 0.1517182, 0.032032263, 0.18653658, 0.20970167, -0.10793765, -0.05335404, -0.095203936, 0.2917104, -0.1170929, -0.11652503, -0.46912733), target); + target = MulAdd(conv2d_1_tf, MF4x4(-0.272871, 0.07467413, 0.16981912, 0.57318956, 0.35038894, -0.06679483, 0.3777534, -0.01522816, 0.2588504, -0.008976239, 0.31769443, 0.07070477, 0.059302222, 0.28855336, -0.14700443, -0.08605704), target); + target = MulAdd(nconv2d_1_tf, MF4x4(-0.27067363, -0.2191635, -0.2377148, -1.0028448, -0.25673935, 0.10997322, -0.39032057, 0.06524818, 0.5248202, 0.40049195, 0.6711809, 0.2878331, 0.19606547, -0.092196286, 0.27838528, 0.03120515), target); + target = MulAdd(conv2d_4_tf, MF4x4(0.3029178, -0.027027214, 0.13855064, -0.16550988, 0.2354576, -0.1715326, 0.12981784, 0.5013446, 0.24411377, -0.13030572, -0.08595908, -0.104394995, 0.16794646, -0.044388745, 0.2807999, 0.39108425), target); + target = MulAdd(nconv2d_4_tf, MF4x4(-0.05535261, -0.15662162, 0.14935054, 0.10706811, 0.026958441, -0.15323113, -0.19261432, -0.24361719, -0.2607876, 0.038486157, -0.04509224, 0.18722118, -0.14478058, 0.03614682, -0.12608361, -0.5203596), target); tex2[gxy] = target; } @@ -565,25 +568,25 @@ void Pass4(uint2 blockStart, uint3 threadId) { // [ a, d, g ] // [ b, e, h ] // [ c, f, i ] - min16float4 a1 = tex1.SampleLevel(sam, pos - inputPt, 0); - min16float4 b1 = tex1.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); - min16float4 c1 = tex1.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); - min16float4 d1 = tex1.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); - min16float4 e1 = tex1.SampleLevel(sam, pos, 0); - min16float4 f1 = tex1.SampleLevel(sam, pos + float2(0, inputPt.y), 0); - min16float4 g1 = tex1.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); - min16float4 h1 = tex1.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); - min16float4 i1 = tex1.SampleLevel(sam, pos + inputPt, 0); - - min16float4 na1 = max(-a1, 0); - min16float4 nb1 = max(-b1, 0); - min16float4 nc1 = max(-c1, 0); - min16float4 nd1 = max(-d1, 0); - min16float4 ne1 = max(-e1, 0); - min16float4 nf1 = max(-f1, 0); - min16float4 ng1 = max(-g1, 0); - min16float4 nh1 = max(-h1, 0); - min16float4 ni1 = max(-i1, 0); + MF4 a1 = tex1.SampleLevel(sam, pos - inputPt, 0); + MF4 b1 = tex1.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); + MF4 c1 = tex1.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); + MF4 d1 = tex1.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); + MF4 e1 = tex1.SampleLevel(sam, pos, 0); + MF4 f1 = tex1.SampleLevel(sam, pos + float2(0, inputPt.y), 0); + MF4 g1 = tex1.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); + MF4 h1 = tex1.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); + MF4 i1 = tex1.SampleLevel(sam, pos + inputPt, 0); + + MF4 na1 = max(-a1, 0); + MF4 nb1 = max(-b1, 0); + MF4 nc1 = max(-c1, 0); + MF4 nd1 = max(-d1, 0); + MF4 ne1 = max(-e1, 0); + MF4 nf1 = max(-f1, 0); + MF4 ng1 = max(-g1, 0); + MF4 nh1 = max(-h1, 0); + MF4 ni1 = max(-i1, 0); a1 = max(a1, 0); b1 = max(b1, 0); @@ -595,25 +598,25 @@ void Pass4(uint2 blockStart, uint3 threadId) { h1 = max(h1, 0); i1 = max(i1, 0); - min16float4 a2 = tex2.SampleLevel(sam, pos - inputPt, 0); - min16float4 b2 = tex2.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); - min16float4 c2 = tex2.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); - min16float4 d2 = tex2.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); - min16float4 e2 = tex2.SampleLevel(sam, pos, 0); - min16float4 f2 = tex2.SampleLevel(sam, pos + float2(0, inputPt.y), 0); - min16float4 g2 = tex2.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); - min16float4 h2 = tex2.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); - min16float4 i2 = tex2.SampleLevel(sam, pos + inputPt, 0); - - min16float4 na2 = max(-a2, 0); - min16float4 nb2 = max(-b2, 0); - min16float4 nc2 = max(-c2, 0); - min16float4 nd2 = max(-d2, 0); - min16float4 ne2 = max(-e2, 0); - min16float4 nf2 = max(-f2, 0); - min16float4 ng2 = max(-g2, 0); - min16float4 nh2 = max(-h2, 0); - min16float4 ni2 = max(-i2, 0); + MF4 a2 = tex2.SampleLevel(sam, pos - inputPt, 0); + MF4 b2 = tex2.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); + MF4 c2 = tex2.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); + MF4 d2 = tex2.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); + MF4 e2 = tex2.SampleLevel(sam, pos, 0); + MF4 f2 = tex2.SampleLevel(sam, pos + float2(0, inputPt.y), 0); + MF4 g2 = tex2.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); + MF4 h2 = tex2.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); + MF4 i2 = tex2.SampleLevel(sam, pos + inputPt, 0); + + MF4 na2 = max(-a2, 0); + MF4 nb2 = max(-b2, 0); + MF4 nc2 = max(-c2, 0); + MF4 nd2 = max(-d2, 0); + MF4 ne2 = max(-e2, 0); + MF4 nf2 = max(-f2, 0); + MF4 ng2 = max(-g2, 0); + MF4 nh2 = max(-h2, 0); + MF4 ni2 = max(-i2, 0); a2 = max(a2, 0); b2 = max(b2, 0); @@ -625,123 +628,123 @@ void Pass4(uint2 blockStart, uint3 threadId) { h2 = max(h2, 0); i2 = max(i2, 0); - min16float4 conv2d_1_tf = tex3.SampleLevel(sam, pos, 0); - min16float4 nconv2d_1_tf = max(-conv2d_1_tf, 0); + MF4 conv2d_1_tf = tex3.SampleLevel(sam, pos, 0); + MF4 nconv2d_1_tf = max(-conv2d_1_tf, 0); conv2d_1_tf = max(conv2d_1_tf, 0); - min16float4 conv2d_4_tf = tex6.SampleLevel(sam, pos, 0); - min16float4 nconv2d_4_tf = max(-conv2d_4_tf, 0); + MF4 conv2d_4_tf = tex6.SampleLevel(sam, pos, 0); + MF4 nconv2d_4_tf = max(-conv2d_4_tf, 0); conv2d_4_tf = max(conv2d_4_tf, 0); - min16float4 conv2d_8_tf = mul(a1, min16float4x4(-0.162897, -0.21250516, -0.11219427, 0.30969706, 0.078927204, -0.14922144, 0.5486932, 0.2884913, 0.07018745, 0.45946357, -0.23759702, -0.18914284, 0.19762751, 0.56881535, -0.2141465, 0.27216902)); - conv2d_8_tf += mul(b1, min16float4x4(-0.17507325, -0.577772, -0.46351492, 0.09431303, 0.023881523, -0.068162896, -0.0029204858, -0.076631226, -0.07645065, 0.28997856, -0.0349899, 0.087704636, 0.29194608, 0.7767595, 0.17478088, -0.295144)); - conv2d_8_tf += mul(c1, min16float4x4(0.23039296, -0.000184939, -0.032427344, 0.0926983, -0.4264918, -0.44138262, 0.39098918, -0.0102598835, 0.066287994, 0.15478721, -0.062338993, 0.15079321, 0.120016515, 0.7005824, -0.12260436, 0.090042405)); - conv2d_8_tf += mul(d1, min16float4x4(0.014269367, 0.21645544, -0.4203915, 0.0077638677, -0.18618487, 0.30165052, 0.6985895, -0.014695781, -0.071353786, -0.49996287, -0.79902583, -0.06343025, 0.291085, 0.28801495, 0.46547806, 0.1311194)); - conv2d_8_tf += mul(e1, min16float4x4(0.17163453, 0.21760814, -0.67896426, 0.1487859, 0.05881719, -0.08391752, 0.44536906, 0.24853623, -0.7685656, 0.45705163, -1.0204223, 0.1884743, -0.3251896, -0.49221343, 0.38116506, -0.09428967)); - conv2d_8_tf += mul(f1, min16float4x4(0.2759429, 0.22141403, -0.13269989, 0.06833041, -0.29562923, -0.26589182, -0.34452415, 0.36388424, -0.3184807, 0.08254464, -0.15477169, 0.03237491, -0.34190834, -0.2777646, -0.15713428, -0.22231084)); - conv2d_8_tf += mul(g1, min16float4x4(-0.36887082, -0.34607458, 0.5719879, 0.09200919, 0.12724651, 0.20281908, 0.39280674, 0.09216231, 0.3126475, -0.0075341803, -0.046779484, 0.10883631, 0.20136468, 0.70330596, -0.024237871, -0.061087623)); - conv2d_8_tf += mul(h1, min16float4x4(-0.08114617, -0.02456657, 0.1287709, 0.5486885, -0.10143632, -0.39116892, 0.24008204, 5.8133483e-05, -0.36640543, -0.52113515, 0.3836287, 0.09541327, 0.01115865, -0.2044513, 0.07734024, -0.29509112)); - conv2d_8_tf += mul(i1, min16float4x4(0.25962162, -0.05327207, -0.28945914, 0.320823, 0.049143463, 0.011241379, 0.004193257, 0.3872085, -0.47137228, -0.44385332, -0.2591483, -0.20189615, 0.08729277, -0.14813553, -0.29911307, 0.0067013856)); - conv2d_8_tf += mul(a2, min16float4x4(0.13526323, 0.6637005, 0.09491454, -0.038491655, -0.5078187, -0.5782128, -1.0748478, 0.18678135, 0.16064858, 0.0795754, 0.116070546, 0.06408978, 0.085641995, -0.39126787, -0.16669247, -0.024058852)); - conv2d_8_tf += mul(b2, min16float4x4(-0.30658495, -0.08933112, 0.38358685, -0.048228927, 0.050148476, -0.08750905, -0.5015779, 0.4012965, -0.068299964, 0.08993712, 0.03617703, -0.030809006, 0.05144756, -0.7659615, -0.33359128, 0.0054376507)); - conv2d_8_tf += mul(c2, min16float4x4(-0.24894494, 0.08617524, -0.095747314, 0.14710969, -0.09528072, 0.19016005, 0.060339417, -0.059556015, 0.01127292, -0.021082405, 0.5204205, 0.23831797, -0.035384487, 0.001653611, -0.28902745, -0.0060615037)); - conv2d_8_tf += mul(d2, min16float4x4(0.2978602, 0.2580722, 0.11472323, -0.06937241, -0.45087403, -0.35747236, -0.38835877, 0.22520676, 0.09162963, 0.50932664, -0.41183934, -0.08526183, -0.043625794, -0.27782285, -0.4119391, -0.339948)); - conv2d_8_tf += mul(e2, min16float4x4(-0.005034612, 0.010024151, 0.55194247, -0.16040643, 0.0072234212, -0.047230296, 0.011222393, -0.017184192, 0.2156304, 0.02429907, 0.41669923, -0.06159069, -0.39241523, 0.009254305, 0.35784644, -0.45379582)); - conv2d_8_tf += mul(f2, min16float4x4(-0.18008694, -0.35366225, 0.12110043, -0.102665015, 0.2763678, -0.61502653, 0.3051717, -0.23991431, 0.6076138, -1.142571, 1.2579885, 0.15013893, -0.13282573, -0.16185799, -0.26278257, 0.044563264)); - conv2d_8_tf += mul(g2, min16float4x4(-0.043284204, -0.1374118, -0.6523209, -0.1682561, -0.002918912, 0.2768846, 0.045174655, -0.046218265, 0.10686049, -0.004872297, 0.04118156, -0.07015327, -0.3329307, 0.19972506, -0.38307762, 0.11627049)); - conv2d_8_tf += mul(h2, min16float4x4(0.09306764, -0.5036807, -0.25358048, -0.033543527, 0.07199686, -0.28982875, -0.022885432, -0.078454, -0.0836088, -0.08261633, 0.38759607, 0.021209864, 0.09516953, -0.1896164, -0.12284774, 0.16532375)); - conv2d_8_tf += mul(i2, min16float4x4(0.27196047, -0.6199637, 0.12209493, -0.0055379267, -0.08997175, -0.0025996822, -0.20710677, 0.15223576, -0.07073166, -0.20732503, -0.044538528, 0.35751408, 0.33849528, -0.14603287, 0.19472563, 0.20992133)); - conv2d_8_tf += mul(na1, min16float4x4(0.018979501, 0.030001618, 0.09530055, -0.22441792, -0.11513775, -0.05383842, 0.042144198, -0.2824055, 0.20338169, 0.9622458, -0.20780474, 0.5217952, 0.11518432, 0.24126045, -0.046675194, -0.07326568)); - conv2d_8_tf += mul(nb1, min16float4x4(-0.13768856, 0.17157272, -0.32123035, -0.08968111, 0.011915078, -0.08129057, -0.71480066, 0.24239756, 0.5093838, 0.29058817, -0.07181868, -0.22533971, 0.24244072, -0.2716092, 0.03331018, 0.008624937)); - conv2d_8_tf += mul(nc1, min16float4x4(0.21304299, 0.5180637, 0.40324917, -0.078679435, 0.17033757, -0.4813804, -0.47702515, -0.017285354, -0.054009005, -0.5853617, -0.5427995, 0.13533083, 0.12440328, -0.6455633, 0.0012186684, 0.031838413)); - conv2d_8_tf += mul(nd1, min16float4x4(0.04057183, -0.27768528, -0.07563423, 0.13400203, -0.03429928, -0.32794374, -0.085426375, -0.3724642, -0.19195397, 0.1349262, -0.2909766, -0.43096116, 0.056601644, 0.5106557, -0.267059, -0.046354882)); - conv2d_8_tf += mul(ne1, min16float4x4(0.14343774, -0.29267886, -0.2406526, -0.30307195, -0.10270894, 0.008828463, -1.5378821, 0.017785087, 0.48302534, -0.310974, 1.5381073, 0.08598342, 0.82111055, -0.0049781636, 0.4820726, 0.301231)); - conv2d_8_tf += mul(nf1, min16float4x4(0.012052944, -0.090234, 0.52199095, -0.3329521, 0.110252894, 0.2897882, -0.37447298, 0.17326026, 0.18148576, -0.23976558, 0.1848407, 0.5042414, 0.33321953, 0.2712571, 0.18124644, 0.20849751)); - conv2d_8_tf += mul(ng1, min16float4x4(0.066107936, 0.035174694, -0.1587501, -0.22672103, 0.012212267, -0.05451626, -0.6004301, 0.013387352, -0.04113352, 0.53583735, -0.15342614, -0.0018758774, 0.09947345, -0.18213694, 0.02965846, -0.044368513)); - conv2d_8_tf += mul(nh1, min16float4x4(0.099831305, 0.2666737, -0.12301129, -0.113591194, 0.018106552, 0.290373, 0.1480011, 0.032558106, 0.0024403003, 0.11745559, 0.7669008, -0.18195944, 0.21291047, 0.49549788, -0.04361018, 0.6138144)); - conv2d_8_tf += mul(ni1, min16float4x4(-0.24335642, -0.023037815, -0.22853605, -0.49450716, 0.04834612, 0.040727314, 0.36239302, -0.076259434, -0.08173315, 0.14689375, 0.3357786, 0.34003472, -0.11701219, -0.35594055, 0.55640507, 0.3573448)); - conv2d_8_tf += mul(na2, min16float4x4(0.039600838, -0.08580259, -0.25375724, -0.41294497, 0.052295998, 0.34286344, 0.23627926, 0.08080187, 0.0015981429, 0.37459275, -0.11763548, 0.027264152, 0.11372706, 0.34742436, 0.30963847, -0.2995273)); - conv2d_8_tf += mul(nb2, min16float4x4(-0.035936117, -0.42153218, -0.40176156, 0.20363232, 0.22382015, 0.48679677, 0.07365761, -0.20890754, 0.22791456, -0.28418672, -0.17189962, 0.0968373, -0.70834696, -0.41918173, -0.13482817, 0.037949625)); - conv2d_8_tf += mul(nc2, min16float4x4(0.11910686, 0.0473921, 0.37869528, 0.17928337, 0.17311068, 0.21572089, 0.34996882, -0.26002827, -0.014036688, -0.6574892, -0.14409806, -0.06467717, -0.33688435, -0.18185017, 0.04036214, 0.06086553)); - conv2d_8_tf += mul(nd2, min16float4x4(-0.15657301, -0.05661294, -0.36640826, -0.13215317, 0.060342815, 0.19098124, 0.18715985, -0.40765548, 0.090474375, -0.07720432, -0.016231487, 0.0885778, 0.0272616, 0.22065723, 0.1691866, -0.19491237)); - conv2d_8_tf += mul(ne2, min16float4x4(-0.13054666, 0.3278881, -1.3170725, -0.4575742, -0.061401486, 0.15868792, 0.2789515, 0.13829961, 0.09607008, -0.7175924, 0.01804374, 0.41284522, 0.044577077, 0.04847126, -0.25607756, -0.02249741)); - conv2d_8_tf += mul(nf2, min16float4x4(0.22145797, 0.8114419, -0.17527157, 0.09274125, -0.25224185, 0.2955128, -0.37553602, -0.17377761, -0.9684024, 0.42457148, -0.64265996, 0.10394252, -0.11231096, 0.064703405, 0.42858216, -0.21214609)); - conv2d_8_tf += mul(ng2, min16float4x4(0.1910386, -0.0065560606, 0.18119961, -0.026436953, 0.07887997, 0.15127628, -0.11523928, -0.0679343, 0.031198656, 0.16947536, 0.05943052, 0.060350783, 0.32215032, -0.1347014, 0.017390233, -0.06527528)); - conv2d_8_tf += mul(nh2, min16float4x4(-0.19811153, -0.033103824, 0.0053317053, 0.008003428, -0.020805335, 0.17872533, -0.3161484, -0.11559199, -0.24902378, -0.2596549, 0.034520704, -0.006125487, 0.13173361, -0.10967251, -0.7860965, -0.035326626)); - conv2d_8_tf += mul(ni2, min16float4x4(-0.124631934, 0.21335506, 0.375809, -0.13598146, 0.047685858, 0.14553228, -0.068173625, -0.117949426, 0.07296198, 0.08935096, -0.26368606, 0.29653412, -0.27378097, 0.060699224, -0.09753418, -0.08484599)); - conv2d_8_tf += min16float4(-0.009278051, 0.62221414, 0.22868732, 0.14880095); - min16float4 nconv2d_8_tf = max(-conv2d_8_tf, 0); + MF4 conv2d_8_tf = MF4(-0.009278051, 0.62221414, 0.22868732, 0.14880095); + conv2d_8_tf = MulAdd(a1, MF4x4(-0.162897, -0.21250516, -0.11219427, 0.30969706, 0.078927204, -0.14922144, 0.5486932, 0.2884913, 0.07018745, 0.45946357, -0.23759702, -0.18914284, 0.19762751, 0.56881535, -0.2141465, 0.27216902), conv2d_8_tf); + conv2d_8_tf = MulAdd(b1, MF4x4(-0.17507325, -0.577772, -0.46351492, 0.09431303, 0.023881523, -0.068162896, -0.0029204858, -0.076631226, -0.07645065, 0.28997856, -0.0349899, 0.087704636, 0.29194608, 0.7767595, 0.17478088, -0.295144), conv2d_8_tf); + conv2d_8_tf = MulAdd(c1, MF4x4(0.23039296, -0.000184939, -0.032427344, 0.0926983, -0.4264918, -0.44138262, 0.39098918, -0.0102598835, 0.066287994, 0.15478721, -0.062338993, 0.15079321, 0.120016515, 0.7005824, -0.12260436, 0.090042405), conv2d_8_tf); + conv2d_8_tf = MulAdd(d1, MF4x4(0.014269367, 0.21645544, -0.4203915, 0.0077638677, -0.18618487, 0.30165052, 0.6985895, -0.014695781, -0.071353786, -0.49996287, -0.79902583, -0.06343025, 0.291085, 0.28801495, 0.46547806, 0.1311194), conv2d_8_tf); + conv2d_8_tf = MulAdd(e1, MF4x4(0.17163453, 0.21760814, -0.67896426, 0.1487859, 0.05881719, -0.08391752, 0.44536906, 0.24853623, -0.7685656, 0.45705163, -1.0204223, 0.1884743, -0.3251896, -0.49221343, 0.38116506, -0.09428967), conv2d_8_tf); + conv2d_8_tf = MulAdd(f1, MF4x4(0.2759429, 0.22141403, -0.13269989, 0.06833041, -0.29562923, -0.26589182, -0.34452415, 0.36388424, -0.3184807, 0.08254464, -0.15477169, 0.03237491, -0.34190834, -0.2777646, -0.15713428, -0.22231084), conv2d_8_tf); + conv2d_8_tf = MulAdd(g1, MF4x4(-0.36887082, -0.34607458, 0.5719879, 0.09200919, 0.12724651, 0.20281908, 0.39280674, 0.09216231, 0.3126475, -0.0075341803, -0.046779484, 0.10883631, 0.20136468, 0.70330596, -0.024237871, -0.061087623), conv2d_8_tf); + conv2d_8_tf = MulAdd(h1, MF4x4(-0.08114617, -0.02456657, 0.1287709, 0.5486885, -0.10143632, -0.39116892, 0.24008204, 5.8133483e-05, -0.36640543, -0.52113515, 0.3836287, 0.09541327, 0.01115865, -0.2044513, 0.07734024, -0.29509112), conv2d_8_tf); + conv2d_8_tf = MulAdd(i1, MF4x4(0.25962162, -0.05327207, -0.28945914, 0.320823, 0.049143463, 0.011241379, 0.004193257, 0.3872085, -0.47137228, -0.44385332, -0.2591483, -0.20189615, 0.08729277, -0.14813553, -0.29911307, 0.0067013856), conv2d_8_tf); + conv2d_8_tf = MulAdd(a2, MF4x4(0.13526323, 0.6637005, 0.09491454, -0.038491655, -0.5078187, -0.5782128, -1.0748478, 0.18678135, 0.16064858, 0.0795754, 0.116070546, 0.06408978, 0.085641995, -0.39126787, -0.16669247, -0.024058852), conv2d_8_tf); + conv2d_8_tf = MulAdd(b2, MF4x4(-0.30658495, -0.08933112, 0.38358685, -0.048228927, 0.050148476, -0.08750905, -0.5015779, 0.4012965, -0.068299964, 0.08993712, 0.03617703, -0.030809006, 0.05144756, -0.7659615, -0.33359128, 0.0054376507), conv2d_8_tf); + conv2d_8_tf = MulAdd(c2, MF4x4(-0.24894494, 0.08617524, -0.095747314, 0.14710969, -0.09528072, 0.19016005, 0.060339417, -0.059556015, 0.01127292, -0.021082405, 0.5204205, 0.23831797, -0.035384487, 0.001653611, -0.28902745, -0.0060615037), conv2d_8_tf); + conv2d_8_tf = MulAdd(d2, MF4x4(0.2978602, 0.2580722, 0.11472323, -0.06937241, -0.45087403, -0.35747236, -0.38835877, 0.22520676, 0.09162963, 0.50932664, -0.41183934, -0.08526183, -0.043625794, -0.27782285, -0.4119391, -0.339948), conv2d_8_tf); + conv2d_8_tf = MulAdd(e2, MF4x4(-0.005034612, 0.010024151, 0.55194247, -0.16040643, 0.0072234212, -0.047230296, 0.011222393, -0.017184192, 0.2156304, 0.02429907, 0.41669923, -0.06159069, -0.39241523, 0.009254305, 0.35784644, -0.45379582), conv2d_8_tf); + conv2d_8_tf = MulAdd(f2, MF4x4(-0.18008694, -0.35366225, 0.12110043, -0.102665015, 0.2763678, -0.61502653, 0.3051717, -0.23991431, 0.6076138, -1.142571, 1.2579885, 0.15013893, -0.13282573, -0.16185799, -0.26278257, 0.044563264), conv2d_8_tf); + conv2d_8_tf = MulAdd(g2, MF4x4(-0.043284204, -0.1374118, -0.6523209, -0.1682561, -0.002918912, 0.2768846, 0.045174655, -0.046218265, 0.10686049, -0.004872297, 0.04118156, -0.07015327, -0.3329307, 0.19972506, -0.38307762, 0.11627049), conv2d_8_tf); + conv2d_8_tf = MulAdd(h2, MF4x4(0.09306764, -0.5036807, -0.25358048, -0.033543527, 0.07199686, -0.28982875, -0.022885432, -0.078454, -0.0836088, -0.08261633, 0.38759607, 0.021209864, 0.09516953, -0.1896164, -0.12284774, 0.16532375), conv2d_8_tf); + conv2d_8_tf = MulAdd(i2, MF4x4(0.27196047, -0.6199637, 0.12209493, -0.0055379267, -0.08997175, -0.0025996822, -0.20710677, 0.15223576, -0.07073166, -0.20732503, -0.044538528, 0.35751408, 0.33849528, -0.14603287, 0.19472563, 0.20992133), conv2d_8_tf); + conv2d_8_tf = MulAdd(na1, MF4x4(0.018979501, 0.030001618, 0.09530055, -0.22441792, -0.11513775, -0.05383842, 0.042144198, -0.2824055, 0.20338169, 0.9622458, -0.20780474, 0.5217952, 0.11518432, 0.24126045, -0.046675194, -0.07326568), conv2d_8_tf); + conv2d_8_tf = MulAdd(nb1, MF4x4(-0.13768856, 0.17157272, -0.32123035, -0.08968111, 0.011915078, -0.08129057, -0.71480066, 0.24239756, 0.5093838, 0.29058817, -0.07181868, -0.22533971, 0.24244072, -0.2716092, 0.03331018, 0.008624937), conv2d_8_tf); + conv2d_8_tf = MulAdd(nc1, MF4x4(0.21304299, 0.5180637, 0.40324917, -0.078679435, 0.17033757, -0.4813804, -0.47702515, -0.017285354, -0.054009005, -0.5853617, -0.5427995, 0.13533083, 0.12440328, -0.6455633, 0.0012186684, 0.031838413), conv2d_8_tf); + conv2d_8_tf = MulAdd(nd1, MF4x4(0.04057183, -0.27768528, -0.07563423, 0.13400203, -0.03429928, -0.32794374, -0.085426375, -0.3724642, -0.19195397, 0.1349262, -0.2909766, -0.43096116, 0.056601644, 0.5106557, -0.267059, -0.046354882), conv2d_8_tf); + conv2d_8_tf = MulAdd(ne1, MF4x4(0.14343774, -0.29267886, -0.2406526, -0.30307195, -0.10270894, 0.008828463, -1.5378821, 0.017785087, 0.48302534, -0.310974, 1.5381073, 0.08598342, 0.82111055, -0.0049781636, 0.4820726, 0.301231), conv2d_8_tf); + conv2d_8_tf = MulAdd(nf1, MF4x4(0.012052944, -0.090234, 0.52199095, -0.3329521, 0.110252894, 0.2897882, -0.37447298, 0.17326026, 0.18148576, -0.23976558, 0.1848407, 0.5042414, 0.33321953, 0.2712571, 0.18124644, 0.20849751), conv2d_8_tf); + conv2d_8_tf = MulAdd(ng1, MF4x4(0.066107936, 0.035174694, -0.1587501, -0.22672103, 0.012212267, -0.05451626, -0.6004301, 0.013387352, -0.04113352, 0.53583735, -0.15342614, -0.0018758774, 0.09947345, -0.18213694, 0.02965846, -0.044368513), conv2d_8_tf); + conv2d_8_tf = MulAdd(nh1, MF4x4(0.099831305, 0.2666737, -0.12301129, -0.113591194, 0.018106552, 0.290373, 0.1480011, 0.032558106, 0.0024403003, 0.11745559, 0.7669008, -0.18195944, 0.21291047, 0.49549788, -0.04361018, 0.6138144), conv2d_8_tf); + conv2d_8_tf = MulAdd(ni1, MF4x4(-0.24335642, -0.023037815, -0.22853605, -0.49450716, 0.04834612, 0.040727314, 0.36239302, -0.076259434, -0.08173315, 0.14689375, 0.3357786, 0.34003472, -0.11701219, -0.35594055, 0.55640507, 0.3573448), conv2d_8_tf); + conv2d_8_tf = MulAdd(na2, MF4x4(0.039600838, -0.08580259, -0.25375724, -0.41294497, 0.052295998, 0.34286344, 0.23627926, 0.08080187, 0.0015981429, 0.37459275, -0.11763548, 0.027264152, 0.11372706, 0.34742436, 0.30963847, -0.2995273), conv2d_8_tf); + conv2d_8_tf = MulAdd(nb2, MF4x4(-0.035936117, -0.42153218, -0.40176156, 0.20363232, 0.22382015, 0.48679677, 0.07365761, -0.20890754, 0.22791456, -0.28418672, -0.17189962, 0.0968373, -0.70834696, -0.41918173, -0.13482817, 0.037949625), conv2d_8_tf); + conv2d_8_tf = MulAdd(nc2, MF4x4(0.11910686, 0.0473921, 0.37869528, 0.17928337, 0.17311068, 0.21572089, 0.34996882, -0.26002827, -0.014036688, -0.6574892, -0.14409806, -0.06467717, -0.33688435, -0.18185017, 0.04036214, 0.06086553), conv2d_8_tf); + conv2d_8_tf = MulAdd(nd2, MF4x4(-0.15657301, -0.05661294, -0.36640826, -0.13215317, 0.060342815, 0.19098124, 0.18715985, -0.40765548, 0.090474375, -0.07720432, -0.016231487, 0.0885778, 0.0272616, 0.22065723, 0.1691866, -0.19491237), conv2d_8_tf); + conv2d_8_tf = MulAdd(ne2, MF4x4(-0.13054666, 0.3278881, -1.3170725, -0.4575742, -0.061401486, 0.15868792, 0.2789515, 0.13829961, 0.09607008, -0.7175924, 0.01804374, 0.41284522, 0.044577077, 0.04847126, -0.25607756, -0.02249741), conv2d_8_tf); + conv2d_8_tf = MulAdd(nf2, MF4x4(0.22145797, 0.8114419, -0.17527157, 0.09274125, -0.25224185, 0.2955128, -0.37553602, -0.17377761, -0.9684024, 0.42457148, -0.64265996, 0.10394252, -0.11231096, 0.064703405, 0.42858216, -0.21214609), conv2d_8_tf); + conv2d_8_tf = MulAdd(ng2, MF4x4(0.1910386, -0.0065560606, 0.18119961, -0.026436953, 0.07887997, 0.15127628, -0.11523928, -0.0679343, 0.031198656, 0.16947536, 0.05943052, 0.060350783, 0.32215032, -0.1347014, 0.017390233, -0.06527528), conv2d_8_tf); + conv2d_8_tf = MulAdd(nh2, MF4x4(-0.19811153, -0.033103824, 0.0053317053, 0.008003428, -0.020805335, 0.17872533, -0.3161484, -0.11559199, -0.24902378, -0.2596549, 0.034520704, -0.006125487, 0.13173361, -0.10967251, -0.7860965, -0.035326626), conv2d_8_tf); + conv2d_8_tf = MulAdd(ni2, MF4x4(-0.124631934, 0.21335506, 0.375809, -0.13598146, 0.047685858, 0.14553228, -0.068173625, -0.117949426, 0.07296198, 0.08935096, -0.26368606, 0.29653412, -0.27378097, 0.060699224, -0.09753418, -0.08484599), conv2d_8_tf); + MF4 nconv2d_8_tf = max(-conv2d_8_tf, 0); conv2d_8_tf = max(conv2d_8_tf, 0); - min16float4 conv2d_7_tf = mul(a1, min16float4x4(0.018128054, -0.14104486, -0.027475944, 0.22669935, -2.7264505e-05, 0.14775783, 0.13441783, 0.11450963, -0.09942102, 0.29735768, 0.04839269, -0.14066552, -0.024448555, 0.3104163, -0.03636913, 0.002947356)); - conv2d_7_tf += mul(b1, min16float4x4(-0.20438337, 0.35419708, 0.037506625, 0.100693576, -0.074241616, -0.15304284, 0.0054191337, -0.12816934, 0.028913809, -0.098240785, 0.5653599, -0.38662913, 0.018716848, 0.0021957273, 0.061397206, -0.111899704)); - conv2d_7_tf += mul(c1, min16float4x4(-0.18681246, -0.23609419, 0.21475013, 0.051762715, 0.04889926, -0.033886652, 0.26262638, -0.27322114, 0.049140245, 0.3380464, -0.13617653, -0.05796957, 0.080669545, 0.21348572, -0.10067047, -0.0016244814)); - conv2d_7_tf += mul(d1, min16float4x4(0.025566151, -0.027286734, -0.10856872, 0.108885765, -0.07635088, 0.13037659, 0.2892404, -0.2160093, -0.30649704, 0.34650138, -0.021391464, 0.08717436, -0.02000013, 0.027722841, 0.43060175, -0.04844848)); - conv2d_7_tf += mul(e1, min16float4x4(0.09925131, -0.11167345, -0.14262813, -0.21267861, -0.15972298, -0.1823657, -0.073309824, 0.15542479, 0.005081145, -0.40594074, 0.24862696, 0.19943975, -0.36283687, -0.38990027, 0.4759463, 0.45561194)); - conv2d_7_tf += mul(f1, min16float4x4(-0.13126811, 0.24284562, 0.06109369, -0.15402594, 0.016967572, -0.08234942, -0.053873185, 0.026438333, 0.13412815, -0.10839792, -0.345438, 0.0720746, 0.21260333, -0.15989558, -0.012461376, 0.20363508)); - conv2d_7_tf += mul(g1, min16float4x4(0.09231617, 0.17787862, 0.22783166, 0.09095521, -0.0935426, -0.22921127, 0.2591894, -0.19451278, -0.0046325484, -0.60839254, 0.061737422, -0.024267042, -0.04048761, 0.2450175, 0.14390652, 0.07999217)); - conv2d_7_tf += mul(h1, min16float4x4(-0.09204067, -0.05434134, 0.32136026, -0.053413626, 0.044170942, 0.10284346, 0.10827547, -0.03207593, -0.036979157, -0.37019014, -0.07072617, 0.07745549, 0.026007036, 0.13402742, 0.22873925, -0.09879518)); - conv2d_7_tf += mul(i1, min16float4x4(-0.039409183, -0.15304323, 0.110744946, 0.04479048, 0.073402554, -0.31955537, 0.13518381, 0.09020946, 0.21437532, -0.08866372, 0.062359575, -0.08147204, -0.012339588, 0.038986444, -0.059496317, 0.04353628)); - conv2d_7_tf += mul(a2, min16float4x4(-0.029447578, 0.18052183, 0.026130654, -0.18024941, -0.2357611, 0.92272073, -0.40873498, 0.3829195, -0.049990416, -0.2626007, 0.07313907, -0.20231684, 0.23846717, 0.06304234, -0.072538964, 0.34895507)); - conv2d_7_tf += mul(b2, min16float4x4(-0.21427542, 0.33398184, 0.19135003, -0.079177245, -0.047564022, 0.25006044, 0.19287021, -0.07119212, -0.0064072064, 0.14020945, -0.15136649, -0.04587045, -0.113710366, 0.05126853, -0.084781885, 0.1418395)); - conv2d_7_tf += mul(c2, min16float4x4(0.04655672, -0.010115347, 0.18253572, 0.017085062, -0.04543099, 0.08404545, 0.07929449, 0.17069206, -0.045596916, 0.12133366, 0.12615037, -0.11942128, -0.07431312, -0.0975234, 0.17188828, -0.021951154)); - conv2d_7_tf += mul(d2, min16float4x4(0.013333504, -0.22424631, -0.25461286, -0.09366057, -0.24168679, -0.1413706, -0.084172204, 0.1557298, 0.023721283, 0.18159337, -0.029377997, -0.12690134, -0.07779016, 0.49728185, 0.060146395, 0.17318316)); - conv2d_7_tf += mul(e2, min16float4x4(0.08302447, 0.86936367, -0.17584775, -0.2508983, 0.16770333, 0.106514744, 0.056097895, -0.1516464, -0.04237734, 0.3350473, 0.08797126, 0.053822745, 0.36157215, -0.04365805, -0.20060433, -0.23983552)); - conv2d_7_tf += mul(f2, min16float4x4(0.09215062, 0.0729301, 0.2564446, -0.09456067, -0.04279617, 0.009632537, -0.067693666, 0.07115211, -0.58410543, 0.7954688, -0.6856004, -0.0039867237, 0.05259691, -0.19899113, 0.34015554, -0.1301164)); - conv2d_7_tf += mul(g2, min16float4x4(-0.08229732, 0.22852908, -0.17944984, -0.053203765, 0.01401186, -0.01731911, -0.017196467, 0.017660033, -0.06473575, 0.11841842, -0.09651762, 0.08812678, 0.15789783, 0.41068667, -0.17433365, 0.112683386)); - conv2d_7_tf += mul(h2, min16float4x4(0.19192256, -0.048173536, -0.27452058, -0.086614236, 0.03459962, -0.076093, -0.13129567, 0.10529364, -0.003243667, -0.11558274, 0.15014142, -0.11415493, -0.058378108, -0.23308878, 0.016655494, -0.06092205)); - conv2d_7_tf += mul(i2, min16float4x4(0.053656723, -0.2520498, -0.06450468, 0.14063323, -0.07785553, 0.06996582, 0.043691944, -0.09447727, -0.19854756, 0.08710172, 0.103271045, -0.20072943, -0.10393605, -0.19852036, -0.01656043, 0.19936512)); - conv2d_7_tf += mul(na1, min16float4x4(-0.043692272, -0.15573448, -0.07609012, -0.25906095, 0.042468645, 0.06499704, 0.021691361, -0.14418614, 0.007778065, -0.04098781, 0.16854198, 0.1880123, -0.0024735837, -0.38171276, 0.29813913, -0.13975172)); - conv2d_7_tf += mul(nb1, min16float4x4(0.0786739, -0.13743922, -0.16762766, 0.0551441, -0.16237186, 0.47069517, -0.16434868, 0.38760075, 0.29262593, 0.21078295, 0.1564407, -0.19921672, -0.07819381, 0.045407712, 0.25388238, 0.12049804)); - conv2d_7_tf += mul(nc1, min16float4x4(0.13686253, 0.15139718, -0.14193471, -0.037212268, 0.017021572, -0.13029522, -0.07875422, 0.22883393, -0.117323294, -0.11999564, 0.074406326, 0.029792523, 0.071242705, 0.04940517, 0.27540857, 0.094216466)); - conv2d_7_tf += mul(nd1, min16float4x4(0.05651692, -0.09319446, -0.15223487, -0.16004439, 0.09602424, 0.114855476, 0.13851804, 0.11632249, -0.15697844, -0.03465572, -0.6334014, 0.0043645306, -0.13810518, -0.24692737, -0.13962403, -0.17288178)); - conv2d_7_tf += mul(ne1, min16float4x4(-0.1125169, 0.2582768, 0.14571975, 0.3412717, 0.046649273, 0.053606547, -0.5402628, -0.14801335, -0.12299524, 0.79026186, -0.3587726, -0.040698707, 0.18239951, 0.18461016, -0.13213885, -0.6929199)); - conv2d_7_tf += mul(nf1, min16float4x4(-0.009360833, 0.22758053, -0.334423, 0.35250792, 0.05025162, -0.1640276, 0.21909785, -0.12123492, -0.33830088, -0.26451996, 0.09280175, -0.18673559, -0.20446195, 0.13918248, 0.09164517, -0.20213476)); - conv2d_7_tf += mul(ng1, min16float4x4(-0.03443797, -0.25032473, -0.0018426777, -0.065064386, 0.03455914, 0.022166712, -0.2954429, 0.012212829, -0.0223488, 0.1161553, -0.106024936, 0.028343895, 0.15230536, -0.5538007, -0.24089493, 0.06740007)); - conv2d_7_tf += mul(nh1, min16float4x4(0.09501347, -0.0845406, -0.13952151, 0.031915456, 0.05118853, -0.25089842, -0.113984115, 0.08745874, 0.14493734, 0.17449388, 0.037183553, 0.060414817, 0.045083977, -0.50209135, -0.25451177, 0.23309624)); - conv2d_7_tf += mul(ni1, min16float4x4(0.08991499, 0.14019197, -0.12056033, -0.05024532, -0.07585356, 0.073596515, 0.017992107, -0.0009288775, -0.17292187, 0.07525249, 0.14620323, -0.058494095, 0.09669742, -0.28342497, 0.10102461, 0.0075472025)); - conv2d_7_tf += mul(na2, min16float4x4(-0.059322756, 0.07296391, -0.22688308, 0.17183779, 0.0921908, -0.18311407, -0.10553935, -0.2998603, -0.05373476, -0.08882287, 0.009316159, -0.09303765, 0.08415284, -0.044707574, 0.07481887, 0.06931905)); - conv2d_7_tf += mul(nb2, min16float4x4(-0.26374707, 0.17429374, -0.54841083, 0.23039351, 0.1550329, -0.0991982, -0.07031106, -0.23306605, -0.076208115, 0.058818877, 0.48602778, -0.116065495, 0.13632986, 0.5399192, -0.088733315, -0.04031161)); - conv2d_7_tf += mul(nc2, min16float4x4(-0.118198454, -0.04607605, -0.10619185, 0.034395956, 0.0023600461, 0.1470174, -0.21100855, -0.024570175, -0.0016899678, 0.1612513, -0.03985272, 0.01355469, 0.30949214, -0.056687307, 0.1295898, 0.031099077)); - conv2d_7_tf += mul(nd2, min16float4x4(-0.37869355, 0.06961967, 0.2779311, 0.3090361, 0.23564096, -0.014765556, -0.097406775, -0.08233581, -0.05444356, -0.056364074, -0.13940345, -0.1710778, 0.053456437, -0.5668305, -0.21371025, -0.11354647)); - conv2d_7_tf += mul(ne2, min16float4x4(-0.2009931, -0.46823156, 0.04674297, -0.33720648, -0.48212242, -0.022402052, 0.4083246, 0.3498801, -0.12801081, 0.080993176, 0.12559398, 0.30281347, -0.36876208, -0.19425368, 0.040795308, 0.4358033)); - conv2d_7_tf += mul(nf2, min16float4x4(-0.008429336, -0.007929484, -0.21348138, 0.19799937, -0.0032136212, -0.037011284, 0.060586747, -0.012355498, 0.37488303, -0.626778, 0.45391387, -0.030982537, 0.26613617, -0.027296683, -0.094556324, 0.03054091)); - conv2d_7_tf += mul(ng2, min16float4x4(-0.0032568173, -0.3056237, 0.0007252052, 0.052250773, -0.05099108, 0.23182255, -0.044636346, 0.08786388, -0.12470104, -0.16238213, 0.16018245, -0.11313074, -0.044513255, -0.2792024, 0.13793966, -0.20955163)); - conv2d_7_tf += mul(nh2, min16float4x4(-0.14750522, -0.022307748, -0.15649515, 0.15537989, -0.061475005, 0.19822353, 0.0671258, -0.06628393, -0.04068137, 0.22010179, 0.12955783, -0.0517817, 0.02655539, 0.17269138, -0.1296634, 0.030146338)); - conv2d_7_tf += mul(ni2, min16float4x4(0.061146796, 0.31339607, 0.034430694, 0.10376425, 0.03029668, -0.0401898, -0.1825413, 0.06257798, 0.08390942, -0.31551626, 0.010347497, -0.0031549276, 0.21435012, -0.13221692, -0.021980911, -0.1482502)); - conv2d_7_tf += min16float4(0.039428633, 0.032666046, 0.16482623, -0.016402772); + MF4 conv2d_7_tf = MF4(0.039428633, 0.032666046, 0.16482623, -0.016402772); + conv2d_7_tf = MulAdd(a1, MF4x4(0.018128054, -0.14104486, -0.027475944, 0.22669935, -2.7264505e-05, 0.14775783, 0.13441783, 0.11450963, -0.09942102, 0.29735768, 0.04839269, -0.14066552, -0.024448555, 0.3104163, -0.03636913, 0.002947356), conv2d_7_tf); + conv2d_7_tf = MulAdd(b1, MF4x4(-0.20438337, 0.35419708, 0.037506625, 0.100693576, -0.074241616, -0.15304284, 0.0054191337, -0.12816934, 0.028913809, -0.098240785, 0.5653599, -0.38662913, 0.018716848, 0.0021957273, 0.061397206, -0.111899704), conv2d_7_tf); + conv2d_7_tf = MulAdd(c1, MF4x4(-0.18681246, -0.23609419, 0.21475013, 0.051762715, 0.04889926, -0.033886652, 0.26262638, -0.27322114, 0.049140245, 0.3380464, -0.13617653, -0.05796957, 0.080669545, 0.21348572, -0.10067047, -0.0016244814), conv2d_7_tf); + conv2d_7_tf = MulAdd(d1, MF4x4(0.025566151, -0.027286734, -0.10856872, 0.108885765, -0.07635088, 0.13037659, 0.2892404, -0.2160093, -0.30649704, 0.34650138, -0.021391464, 0.08717436, -0.02000013, 0.027722841, 0.43060175, -0.04844848), conv2d_7_tf); + conv2d_7_tf = MulAdd(e1, MF4x4(0.09925131, -0.11167345, -0.14262813, -0.21267861, -0.15972298, -0.1823657, -0.073309824, 0.15542479, 0.005081145, -0.40594074, 0.24862696, 0.19943975, -0.36283687, -0.38990027, 0.4759463, 0.45561194), conv2d_7_tf); + conv2d_7_tf = MulAdd(f1, MF4x4(-0.13126811, 0.24284562, 0.06109369, -0.15402594, 0.016967572, -0.08234942, -0.053873185, 0.026438333, 0.13412815, -0.10839792, -0.345438, 0.0720746, 0.21260333, -0.15989558, -0.012461376, 0.20363508), conv2d_7_tf); + conv2d_7_tf = MulAdd(g1, MF4x4(0.09231617, 0.17787862, 0.22783166, 0.09095521, -0.0935426, -0.22921127, 0.2591894, -0.19451278, -0.0046325484, -0.60839254, 0.061737422, -0.024267042, -0.04048761, 0.2450175, 0.14390652, 0.07999217), conv2d_7_tf); + conv2d_7_tf = MulAdd(h1, MF4x4(-0.09204067, -0.05434134, 0.32136026, -0.053413626, 0.044170942, 0.10284346, 0.10827547, -0.03207593, -0.036979157, -0.37019014, -0.07072617, 0.07745549, 0.026007036, 0.13402742, 0.22873925, -0.09879518), conv2d_7_tf); + conv2d_7_tf = MulAdd(i1, MF4x4(-0.039409183, -0.15304323, 0.110744946, 0.04479048, 0.073402554, -0.31955537, 0.13518381, 0.09020946, 0.21437532, -0.08866372, 0.062359575, -0.08147204, -0.012339588, 0.038986444, -0.059496317, 0.04353628), conv2d_7_tf); + conv2d_7_tf = MulAdd(a2, MF4x4(-0.029447578, 0.18052183, 0.026130654, -0.18024941, -0.2357611, 0.92272073, -0.40873498, 0.3829195, -0.049990416, -0.2626007, 0.07313907, -0.20231684, 0.23846717, 0.06304234, -0.072538964, 0.34895507), conv2d_7_tf); + conv2d_7_tf = MulAdd(b2, MF4x4(-0.21427542, 0.33398184, 0.19135003, -0.079177245, -0.047564022, 0.25006044, 0.19287021, -0.07119212, -0.0064072064, 0.14020945, -0.15136649, -0.04587045, -0.113710366, 0.05126853, -0.084781885, 0.1418395), conv2d_7_tf); + conv2d_7_tf = MulAdd(c2, MF4x4(0.04655672, -0.010115347, 0.18253572, 0.017085062, -0.04543099, 0.08404545, 0.07929449, 0.17069206, -0.045596916, 0.12133366, 0.12615037, -0.11942128, -0.07431312, -0.0975234, 0.17188828, -0.021951154), conv2d_7_tf); + conv2d_7_tf = MulAdd(d2, MF4x4(0.013333504, -0.22424631, -0.25461286, -0.09366057, -0.24168679, -0.1413706, -0.084172204, 0.1557298, 0.023721283, 0.18159337, -0.029377997, -0.12690134, -0.07779016, 0.49728185, 0.060146395, 0.17318316), conv2d_7_tf); + conv2d_7_tf = MulAdd(e2, MF4x4(0.08302447, 0.86936367, -0.17584775, -0.2508983, 0.16770333, 0.106514744, 0.056097895, -0.1516464, -0.04237734, 0.3350473, 0.08797126, 0.053822745, 0.36157215, -0.04365805, -0.20060433, -0.23983552), conv2d_7_tf); + conv2d_7_tf = MulAdd(f2, MF4x4(0.09215062, 0.0729301, 0.2564446, -0.09456067, -0.04279617, 0.009632537, -0.067693666, 0.07115211, -0.58410543, 0.7954688, -0.6856004, -0.0039867237, 0.05259691, -0.19899113, 0.34015554, -0.1301164), conv2d_7_tf); + conv2d_7_tf = MulAdd(g2, MF4x4(-0.08229732, 0.22852908, -0.17944984, -0.053203765, 0.01401186, -0.01731911, -0.017196467, 0.017660033, -0.06473575, 0.11841842, -0.09651762, 0.08812678, 0.15789783, 0.41068667, -0.17433365, 0.112683386), conv2d_7_tf); + conv2d_7_tf = MulAdd(h2, MF4x4(0.19192256, -0.048173536, -0.27452058, -0.086614236, 0.03459962, -0.076093, -0.13129567, 0.10529364, -0.003243667, -0.11558274, 0.15014142, -0.11415493, -0.058378108, -0.23308878, 0.016655494, -0.06092205), conv2d_7_tf); + conv2d_7_tf = MulAdd(i2, MF4x4(0.053656723, -0.2520498, -0.06450468, 0.14063323, -0.07785553, 0.06996582, 0.043691944, -0.09447727, -0.19854756, 0.08710172, 0.103271045, -0.20072943, -0.10393605, -0.19852036, -0.01656043, 0.19936512), conv2d_7_tf); + conv2d_7_tf = MulAdd(na1, MF4x4(-0.043692272, -0.15573448, -0.07609012, -0.25906095, 0.042468645, 0.06499704, 0.021691361, -0.14418614, 0.007778065, -0.04098781, 0.16854198, 0.1880123, -0.0024735837, -0.38171276, 0.29813913, -0.13975172), conv2d_7_tf); + conv2d_7_tf = MulAdd(nb1, MF4x4(0.0786739, -0.13743922, -0.16762766, 0.0551441, -0.16237186, 0.47069517, -0.16434868, 0.38760075, 0.29262593, 0.21078295, 0.1564407, -0.19921672, -0.07819381, 0.045407712, 0.25388238, 0.12049804), conv2d_7_tf); + conv2d_7_tf = MulAdd(nc1, MF4x4(0.13686253, 0.15139718, -0.14193471, -0.037212268, 0.017021572, -0.13029522, -0.07875422, 0.22883393, -0.117323294, -0.11999564, 0.074406326, 0.029792523, 0.071242705, 0.04940517, 0.27540857, 0.094216466), conv2d_7_tf); + conv2d_7_tf = MulAdd(nd1, MF4x4(0.05651692, -0.09319446, -0.15223487, -0.16004439, 0.09602424, 0.114855476, 0.13851804, 0.11632249, -0.15697844, -0.03465572, -0.6334014, 0.0043645306, -0.13810518, -0.24692737, -0.13962403, -0.17288178), conv2d_7_tf); + conv2d_7_tf = MulAdd(ne1, MF4x4(-0.1125169, 0.2582768, 0.14571975, 0.3412717, 0.046649273, 0.053606547, -0.5402628, -0.14801335, -0.12299524, 0.79026186, -0.3587726, -0.040698707, 0.18239951, 0.18461016, -0.13213885, -0.6929199), conv2d_7_tf); + conv2d_7_tf = MulAdd(nf1, MF4x4(-0.009360833, 0.22758053, -0.334423, 0.35250792, 0.05025162, -0.1640276, 0.21909785, -0.12123492, -0.33830088, -0.26451996, 0.09280175, -0.18673559, -0.20446195, 0.13918248, 0.09164517, -0.20213476), conv2d_7_tf); + conv2d_7_tf = MulAdd(ng1, MF4x4(-0.03443797, -0.25032473, -0.0018426777, -0.065064386, 0.03455914, 0.022166712, -0.2954429, 0.012212829, -0.0223488, 0.1161553, -0.106024936, 0.028343895, 0.15230536, -0.5538007, -0.24089493, 0.06740007), conv2d_7_tf); + conv2d_7_tf = MulAdd(nh1, MF4x4(0.09501347, -0.0845406, -0.13952151, 0.031915456, 0.05118853, -0.25089842, -0.113984115, 0.08745874, 0.14493734, 0.17449388, 0.037183553, 0.060414817, 0.045083977, -0.50209135, -0.25451177, 0.23309624), conv2d_7_tf); + conv2d_7_tf = MulAdd(ni1, MF4x4(0.08991499, 0.14019197, -0.12056033, -0.05024532, -0.07585356, 0.073596515, 0.017992107, -0.0009288775, -0.17292187, 0.07525249, 0.14620323, -0.058494095, 0.09669742, -0.28342497, 0.10102461, 0.0075472025), conv2d_7_tf); + conv2d_7_tf = MulAdd(na2, MF4x4(-0.059322756, 0.07296391, -0.22688308, 0.17183779, 0.0921908, -0.18311407, -0.10553935, -0.2998603, -0.05373476, -0.08882287, 0.009316159, -0.09303765, 0.08415284, -0.044707574, 0.07481887, 0.06931905), conv2d_7_tf); + conv2d_7_tf = MulAdd(nb2, MF4x4(-0.26374707, 0.17429374, -0.54841083, 0.23039351, 0.1550329, -0.0991982, -0.07031106, -0.23306605, -0.076208115, 0.058818877, 0.48602778, -0.116065495, 0.13632986, 0.5399192, -0.088733315, -0.04031161), conv2d_7_tf); + conv2d_7_tf = MulAdd(nc2, MF4x4(-0.118198454, -0.04607605, -0.10619185, 0.034395956, 0.0023600461, 0.1470174, -0.21100855, -0.024570175, -0.0016899678, 0.1612513, -0.03985272, 0.01355469, 0.30949214, -0.056687307, 0.1295898, 0.031099077), conv2d_7_tf); + conv2d_7_tf = MulAdd(nd2, MF4x4(-0.37869355, 0.06961967, 0.2779311, 0.3090361, 0.23564096, -0.014765556, -0.097406775, -0.08233581, -0.05444356, -0.056364074, -0.13940345, -0.1710778, 0.053456437, -0.5668305, -0.21371025, -0.11354647), conv2d_7_tf); + conv2d_7_tf = MulAdd(ne2, MF4x4(-0.2009931, -0.46823156, 0.04674297, -0.33720648, -0.48212242, -0.022402052, 0.4083246, 0.3498801, -0.12801081, 0.080993176, 0.12559398, 0.30281347, -0.36876208, -0.19425368, 0.040795308, 0.4358033), conv2d_7_tf); + conv2d_7_tf = MulAdd(nf2, MF4x4(-0.008429336, -0.007929484, -0.21348138, 0.19799937, -0.0032136212, -0.037011284, 0.060586747, -0.012355498, 0.37488303, -0.626778, 0.45391387, -0.030982537, 0.26613617, -0.027296683, -0.094556324, 0.03054091), conv2d_7_tf); + conv2d_7_tf = MulAdd(ng2, MF4x4(-0.0032568173, -0.3056237, 0.0007252052, 0.052250773, -0.05099108, 0.23182255, -0.044636346, 0.08786388, -0.12470104, -0.16238213, 0.16018245, -0.11313074, -0.044513255, -0.2792024, 0.13793966, -0.20955163), conv2d_7_tf); + conv2d_7_tf = MulAdd(nh2, MF4x4(-0.14750522, -0.022307748, -0.15649515, 0.15537989, -0.061475005, 0.19822353, 0.0671258, -0.06628393, -0.04068137, 0.22010179, 0.12955783, -0.0517817, 0.02655539, 0.17269138, -0.1296634, 0.030146338), conv2d_7_tf); + conv2d_7_tf = MulAdd(ni2, MF4x4(0.061146796, 0.31339607, 0.034430694, 0.10376425, 0.03029668, -0.0401898, -0.1825413, 0.06257798, 0.08390942, -0.31551626, 0.010347497, -0.0031549276, 0.21435012, -0.13221692, -0.021980911, -0.1482502), conv2d_7_tf); tex7[gxy] = conv2d_7_tf; - min16float4 nconv2d_7_tf = max(-conv2d_7_tf, 0); + MF4 nconv2d_7_tf = max(-conv2d_7_tf, 0); conv2d_7_tf = max(conv2d_7_tf, 0); - min16float4 target = mul(e1, min16float4x4(0.13591515, 0.21395922, 0.040862843, 0.3054825, -0.088837944, -0.6928339, -0.15643471, 0.13081591, 0.07604966, 0.37446347, -0.34723157, -0.17870799, -0.2037286, -0.106576756, 0.25523958, -0.13762575)); - target += mul(e2, min16float4x4(0.21503459, 0.0373132, -0.008046219, -0.18440363, -0.09729587, 0.043958187, 0.23459528, -0.044009138, 0.1686642, -0.1615934, -0.13173419, -0.079085656, -0.07647595, -0.37286422, -0.06148421, 0.015342882)); - target += mul(ne1, min16float4x4(-0.14785692, -0.2707874, -0.017647093, -0.2908642, 0.5612585, 0.4271698, -0.48191005, 0.11905855, -0.21741737, -0.2821245, 0.29278705, -0.20538986, 0.03150152, 0.03138199, 0.10423793, -0.045527548)); - target += mul(ne2, min16float4x4(0.31277063, 0.07915742, -0.34087706, 0.39680582, -0.022496004, -0.33672526, -0.111507386, 0.025953399, -0.15757395, 0.11465282, 0.28329894, 0.12420795, -0.36261007, 0.46334505, 0.30303243, -0.03249052)); - target += mul(conv2d_8_tf, min16float4x4(0.57927984, 0.06878386, -0.24236098, 0.31338137, 0.10464923, -0.07153124, 0.13588428, -0.02373762, -0.19124955, -0.1138502, 0.17388438, 0.01707623, -0.24228282, 0.04736911, 0.6398566, -0.32334659)); - target += mul(nconv2d_8_tf, min16float4x4(-0.54402775, -0.24674532, 0.11212342, -0.09593871, -0.17339998, 0.1323692, -0.1680261, 0.025882099, -0.19121705, 0.1832492, -0.08548955, -0.14068407, 0.13255714, 0.10409962, -0.01394588, 0.22216345)); - target += mul(conv2d_1_tf, min16float4x4(0.2702694, -0.56255573, -0.5357781, 0.05541389, 0.070275396, -0.08012564, -0.13473864, -0.113696516, 0.06642909, 0.23810093, 0.0728827, -0.17656006, 0.48172018, -0.25749484, -0.1752313, 0.33768335)); - target += mul(nconv2d_1_tf, min16float4x4(0.46950498, 0.059317388, -0.09860531, -0.006304164, -0.4128484, -0.049649406, 0.2954393, -0.190237, -0.20938443, 0.034176145, 0.063109055, 0.07802573, -0.20652357, -0.23180202, -0.11936575, 0.2589604)); - target += mul(conv2d_4_tf, min16float4x4(0.3843954, -0.08686217, 0.18839231, 0.01876761, -0.03335079, -0.12043262, -0.42323095, -0.02321388, -0.22252762, -0.049455926, 0.2268798, 0.082169, 0.2473631, 0.23347862, 0.002254042, 0.2757807)); - target += mul(nconv2d_4_tf, min16float4x4(0.1020188, -0.037612554, -0.33062017, 0.1570476, 0.19851524, 0.35976177, -0.016449552, 0.22057539, 0.20401593, 0.07004227, -0.062413715, -0.10547836, 0.14671406, -0.3905135, -0.038352408, -0.28926837)); - target += mul(conv2d_7_tf, min16float4x4(0.4110517, 0.06280497, 0.16709873, -0.49500167, -0.10045096, -0.2238529, 0.012172345, 0.19666891, -0.16135901, 0.017100533, 0.35809904, 0.35188627, 0.20347194, -0.14602524, 0.71737736, 0.14195462)); - target += mul(nconv2d_7_tf, min16float4x4(-0.5236819, 0.4352016, -0.4066126, -0.04252335, 0.1086945, 0.145471, 0.21984594, -0.24670586, -0.07109616, -0.2711473, -0.89353126, -0.3953869, 0.17096898, 0.12978637, -0.42527854, -0.019720567)); - target += min16float4(-0.027689768, -0.16386859, -0.009289161, 0.09287236); + MF4 target = MF4(-0.027689768, -0.16386859, -0.009289161, 0.09287236); + target = MulAdd(e1, MF4x4(0.13591515, 0.21395922, 0.040862843, 0.3054825, -0.088837944, -0.6928339, -0.15643471, 0.13081591, 0.07604966, 0.37446347, -0.34723157, -0.17870799, -0.2037286, -0.106576756, 0.25523958, -0.13762575), target); + target = MulAdd(e2, MF4x4(0.21503459, 0.0373132, -0.008046219, -0.18440363, -0.09729587, 0.043958187, 0.23459528, -0.044009138, 0.1686642, -0.1615934, -0.13173419, -0.079085656, -0.07647595, -0.37286422, -0.06148421, 0.015342882), target); + target = MulAdd(ne1, MF4x4(-0.14785692, -0.2707874, -0.017647093, -0.2908642, 0.5612585, 0.4271698, -0.48191005, 0.11905855, -0.21741737, -0.2821245, 0.29278705, -0.20538986, 0.03150152, 0.03138199, 0.10423793, -0.045527548), target); + target = MulAdd(ne2, MF4x4(0.31277063, 0.07915742, -0.34087706, 0.39680582, -0.022496004, -0.33672526, -0.111507386, 0.025953399, -0.15757395, 0.11465282, 0.28329894, 0.12420795, -0.36261007, 0.46334505, 0.30303243, -0.03249052), target); + target = MulAdd(conv2d_8_tf, MF4x4(0.57927984, 0.06878386, -0.24236098, 0.31338137, 0.10464923, -0.07153124, 0.13588428, -0.02373762, -0.19124955, -0.1138502, 0.17388438, 0.01707623, -0.24228282, 0.04736911, 0.6398566, -0.32334659), target); + target = MulAdd(nconv2d_8_tf, MF4x4(-0.54402775, -0.24674532, 0.11212342, -0.09593871, -0.17339998, 0.1323692, -0.1680261, 0.025882099, -0.19121705, 0.1832492, -0.08548955, -0.14068407, 0.13255714, 0.10409962, -0.01394588, 0.22216345), target); + target = MulAdd(conv2d_1_tf, MF4x4(0.2702694, -0.56255573, -0.5357781, 0.05541389, 0.070275396, -0.08012564, -0.13473864, -0.113696516, 0.06642909, 0.23810093, 0.0728827, -0.17656006, 0.48172018, -0.25749484, -0.1752313, 0.33768335), target); + target = MulAdd(nconv2d_1_tf, MF4x4(0.46950498, 0.059317388, -0.09860531, -0.006304164, -0.4128484, -0.049649406, 0.2954393, -0.190237, -0.20938443, 0.034176145, 0.063109055, 0.07802573, -0.20652357, -0.23180202, -0.11936575, 0.2589604), target); + target = MulAdd(conv2d_4_tf, MF4x4(0.3843954, -0.08686217, 0.18839231, 0.01876761, -0.03335079, -0.12043262, -0.42323095, -0.02321388, -0.22252762, -0.049455926, 0.2268798, 0.082169, 0.2473631, 0.23347862, 0.002254042, 0.2757807), target); + target = MulAdd(nconv2d_4_tf, MF4x4(0.1020188, -0.037612554, -0.33062017, 0.1570476, 0.19851524, 0.35976177, -0.016449552, 0.22057539, 0.20401593, 0.07004227, -0.062413715, -0.10547836, 0.14671406, -0.3905135, -0.038352408, -0.28926837), target); + target = MulAdd(conv2d_7_tf, MF4x4(0.4110517, 0.06280497, 0.16709873, -0.49500167, -0.10045096, -0.2238529, 0.012172345, 0.19666891, -0.16135901, 0.017100533, 0.35809904, 0.35188627, 0.20347194, -0.14602524, 0.71737736, 0.14195462), target); + target = MulAdd(nconv2d_7_tf, MF4x4(-0.5236819, 0.4352016, -0.4066126, -0.04252335, 0.1086945, 0.145471, 0.21984594, -0.24670586, -0.07109616, -0.2711473, -0.89353126, -0.3953869, 0.17096898, 0.12978637, -0.42527854, -0.019720567), target); tex4[gxy] = target; - target = mul(e1, min16float4x4(0.19380243, 0.020101497, 0.021015864, 0.40521726, 0.038862754, -0.3473658, 0.22289194, -0.2075226, -0.15960178, 0.20686232, -0.19066268, -0.24524036, -0.19289994, -0.6356018, 0.040245753, -0.22887161)); - target += mul(e2, min16float4x4(-0.06837712, -0.59243137, 0.08107887, -0.18099897, 0.08890105, -0.20113088, 0.0076543097, -0.28404838, -0.39403212, 0.124420464, 0.07661543, -0.16511264, 0.440653, 0.17841326, -0.40957427, -0.055862557)); - target += mul(ne1, min16float4x4(-0.052128255, -0.17906874, -0.0063690864, -0.3027001, -0.12118662, 0.5986499, -0.35075194, 0.11334461, -0.13089949, 0.48732534, 0.31238684, 0.0636065, 0.21470545, -0.12680373, 0.20702313, -0.14277203)); - target += mul(ne2, min16float4x4(-0.13521394, 0.5266374, -0.4765612, 0.32102558, -0.07704129, -0.26604977, 0.36475307, 0.27245706, 0.16729634, -0.04975267, 0.18763311, 0.07594951, -0.20137721, 0.07614109, -0.056586545, 0.35838535)); - target += mul(conv2d_8_tf, min16float4x4(0.22150421, -0.023909386, -0.30742592, 0.54860467, 0.038963366, -0.47929683, 0.001491465, -0.2016597, 0.14891255, -0.12298715, 0.12770613, 0.16882578, 0.52988553, -0.34417477, -0.11196754, 0.038432673)); - target += mul(nconv2d_8_tf, min16float4x4(0.10892675, 0.15687913, 0.4061297, -0.2549851, -0.12231971, 0.7066191, -0.038577385, 0.1871752, -0.23520122, 0.6384404, -0.04857454, -0.23879313, -0.26810166, -0.08090798, 0.3287431, 0.15214305)); - target += mul(conv2d_1_tf, min16float4x4(0.16076286, 0.08942198, 0.79264593, -0.5107746, -0.10051664, -0.18325275, 0.31161344, 0.023725776, 0.09911152, 0.1552438, -0.22447744, -0.2995641, 0.27984253, -1.107023, 0.010454479, 0.6606262)); - target += mul(nconv2d_1_tf, min16float4x4(0.041668475, 0.16935597, -0.11855577, 0.2013473, 0.2991738, -0.38238418, 0.17906274, -0.27559698, -0.4381387, 0.39814267, -0.40905684, 0.57992136, 0.2830281, 0.12482517, -0.30402762, 0.47808015)); - target += mul(conv2d_4_tf, min16float4x4(0.05201121, 0.3396993, -0.04965309, -0.25744373, -0.13495848, -0.120026626, 0.15645088, -0.20658544, 0.414069, -0.03110071, 0.070210315, 0.028046172, -0.17324251, 0.14329922, -0.14353131, 0.028436944)); - target += mul(nconv2d_4_tf, min16float4x4(-0.15607943, 0.98266315, -0.15506491, 0.34884667, -0.16584046, 0.07532187, 0.0062847883, 0.8719761, -0.30521882, -0.34961814, -0.055313803, 0.041199762, 0.2634066, 0.31106153, 0.029962108, -0.017541675)); - target += mul(conv2d_7_tf, min16float4x4(0.1285044, 0.41011113, 0.16163284, -0.40202442, 0.33554438, -0.2626098, 0.18437132, 0.06627138, 0.26390168, -0.23918642, -0.17191365, -0.16348109, 0.30074367, -0.99079835, 0.60264456, 0.050881945)); - target += mul(nconv2d_7_tf, min16float4x4(0.3971443, -0.034655187, 0.11870823, 0.39984652, -0.45068088, -0.054210827, -0.27554438, -0.16074227, -0.14983663, 0.35434055, 0.42479035, 0.07799301, -0.4260275, 0.66214204, -0.095251344, 0.09080398)); - target += min16float4(-0.012729538, -0.13335368, 0.14840336, 0.025965473); + target = MF4(-0.012729538, -0.13335368, 0.14840336, 0.025965473); + target = MulAdd(e1, MF4x4(0.19380243, 0.020101497, 0.021015864, 0.40521726, 0.038862754, -0.3473658, 0.22289194, -0.2075226, -0.15960178, 0.20686232, -0.19066268, -0.24524036, -0.19289994, -0.6356018, 0.040245753, -0.22887161), target); + target = MulAdd(e2, MF4x4(-0.06837712, -0.59243137, 0.08107887, -0.18099897, 0.08890105, -0.20113088, 0.0076543097, -0.28404838, -0.39403212, 0.124420464, 0.07661543, -0.16511264, 0.440653, 0.17841326, -0.40957427, -0.055862557), target); + target = MulAdd(ne1, MF4x4(-0.052128255, -0.17906874, -0.0063690864, -0.3027001, -0.12118662, 0.5986499, -0.35075194, 0.11334461, -0.13089949, 0.48732534, 0.31238684, 0.0636065, 0.21470545, -0.12680373, 0.20702313, -0.14277203), target); + target = MulAdd(ne2, MF4x4(-0.13521394, 0.5266374, -0.4765612, 0.32102558, -0.07704129, -0.26604977, 0.36475307, 0.27245706, 0.16729634, -0.04975267, 0.18763311, 0.07594951, -0.20137721, 0.07614109, -0.056586545, 0.35838535), target); + target = MulAdd(conv2d_8_tf, MF4x4(0.22150421, -0.023909386, -0.30742592, 0.54860467, 0.038963366, -0.47929683, 0.001491465, -0.2016597, 0.14891255, -0.12298715, 0.12770613, 0.16882578, 0.52988553, -0.34417477, -0.11196754, 0.038432673), target); + target = MulAdd(nconv2d_8_tf, MF4x4(0.10892675, 0.15687913, 0.4061297, -0.2549851, -0.12231971, 0.7066191, -0.038577385, 0.1871752, -0.23520122, 0.6384404, -0.04857454, -0.23879313, -0.26810166, -0.08090798, 0.3287431, 0.15214305), target); + target = MulAdd(conv2d_1_tf, MF4x4(0.16076286, 0.08942198, 0.79264593, -0.5107746, -0.10051664, -0.18325275, 0.31161344, 0.023725776, 0.09911152, 0.1552438, -0.22447744, -0.2995641, 0.27984253, -1.107023, 0.010454479, 0.6606262), target); + target = MulAdd(nconv2d_1_tf, MF4x4(0.041668475, 0.16935597, -0.11855577, 0.2013473, 0.2991738, -0.38238418, 0.17906274, -0.27559698, -0.4381387, 0.39814267, -0.40905684, 0.57992136, 0.2830281, 0.12482517, -0.30402762, 0.47808015), target); + target = MulAdd(conv2d_4_tf, MF4x4(0.05201121, 0.3396993, -0.04965309, -0.25744373, -0.13495848, -0.120026626, 0.15645088, -0.20658544, 0.414069, -0.03110071, 0.070210315, 0.028046172, -0.17324251, 0.14329922, -0.14353131, 0.028436944), target); + target = MulAdd(nconv2d_4_tf, MF4x4(-0.15607943, 0.98266315, -0.15506491, 0.34884667, -0.16584046, 0.07532187, 0.0062847883, 0.8719761, -0.30521882, -0.34961814, -0.055313803, 0.041199762, 0.2634066, 0.31106153, 0.029962108, -0.017541675), target); + target = MulAdd(conv2d_7_tf, MF4x4(0.1285044, 0.41011113, 0.16163284, -0.40202442, 0.33554438, -0.2626098, 0.18437132, 0.06627138, 0.26390168, -0.23918642, -0.17191365, -0.16348109, 0.30074367, -0.99079835, 0.60264456, 0.050881945), target); + target = MulAdd(nconv2d_7_tf, MF4x4(0.3971443, -0.034655187, 0.11870823, 0.39984652, -0.45068088, -0.054210827, -0.27554438, -0.16074227, -0.14983663, 0.35434055, 0.42479035, 0.07799301, -0.4260275, 0.66214204, -0.095251344, 0.09080398), target); tex5[gxy] = target; } @@ -766,25 +769,25 @@ void Pass5(uint2 blockStart, uint3 threadId) { // [ a, d, g ] // [ b, e, h ] // [ c, f, i ] - min16float4 a1 = tex4.SampleLevel(sam, pos - inputPt, 0); - min16float4 b1 = tex4.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); - min16float4 c1 = tex4.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); - min16float4 d1 = tex4.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); - min16float4 e1 = tex4.SampleLevel(sam, pos, 0); - min16float4 f1 = tex4.SampleLevel(sam, pos + float2(0, inputPt.y), 0); - min16float4 g1 = tex4.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); - min16float4 h1 = tex4.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); - min16float4 i1 = tex4.SampleLevel(sam, pos + inputPt, 0); - - min16float4 na1 = max(-a1, 0); - min16float4 nb1 = max(-b1, 0); - min16float4 nc1 = max(-c1, 0); - min16float4 nd1 = max(-d1, 0); - min16float4 ne1 = max(-e1, 0); - min16float4 nf1 = max(-f1, 0); - min16float4 ng1 = max(-g1, 0); - min16float4 nh1 = max(-h1, 0); - min16float4 ni1 = max(-i1, 0); + MF4 a1 = tex4.SampleLevel(sam, pos - inputPt, 0); + MF4 b1 = tex4.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); + MF4 c1 = tex4.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); + MF4 d1 = tex4.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); + MF4 e1 = tex4.SampleLevel(sam, pos, 0); + MF4 f1 = tex4.SampleLevel(sam, pos + float2(0, inputPt.y), 0); + MF4 g1 = tex4.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); + MF4 h1 = tex4.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); + MF4 i1 = tex4.SampleLevel(sam, pos + inputPt, 0); + + MF4 na1 = max(-a1, 0); + MF4 nb1 = max(-b1, 0); + MF4 nc1 = max(-c1, 0); + MF4 nd1 = max(-d1, 0); + MF4 ne1 = max(-e1, 0); + MF4 nf1 = max(-f1, 0); + MF4 ng1 = max(-g1, 0); + MF4 nh1 = max(-h1, 0); + MF4 ni1 = max(-i1, 0); a1 = max(a1, 0); b1 = max(b1, 0); @@ -796,25 +799,25 @@ void Pass5(uint2 blockStart, uint3 threadId) { h1 = max(h1, 0); i1 = max(i1, 0); - min16float4 a2 = tex5.SampleLevel(sam, pos - inputPt, 0); - min16float4 b2 = tex5.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); - min16float4 c2 = tex5.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); - min16float4 d2 = tex5.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); - min16float4 e2 = tex5.SampleLevel(sam, pos, 0); - min16float4 f2 = tex5.SampleLevel(sam, pos + float2(0, inputPt.y), 0); - min16float4 g2 = tex5.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); - min16float4 h2 = tex5.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); - min16float4 i2 = tex5.SampleLevel(sam, pos + inputPt, 0); - - min16float4 na2 = max(-a2, 0); - min16float4 nb2 = max(-b2, 0); - min16float4 nc2 = max(-c2, 0); - min16float4 nd2 = max(-d2, 0); - min16float4 ne2 = max(-e2, 0); - min16float4 nf2 = max(-f2, 0); - min16float4 ng2 = max(-g2, 0); - min16float4 nh2 = max(-h2, 0); - min16float4 ni2 = max(-i2, 0); + MF4 a2 = tex5.SampleLevel(sam, pos - inputPt, 0); + MF4 b2 = tex5.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); + MF4 c2 = tex5.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); + MF4 d2 = tex5.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); + MF4 e2 = tex5.SampleLevel(sam, pos, 0); + MF4 f2 = tex5.SampleLevel(sam, pos + float2(0, inputPt.y), 0); + MF4 g2 = tex5.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); + MF4 h2 = tex5.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); + MF4 i2 = tex5.SampleLevel(sam, pos + inputPt, 0); + + MF4 na2 = max(-a2, 0); + MF4 nb2 = max(-b2, 0); + MF4 nc2 = max(-c2, 0); + MF4 nd2 = max(-d2, 0); + MF4 ne2 = max(-e2, 0); + MF4 nf2 = max(-f2, 0); + MF4 ng2 = max(-g2, 0); + MF4 nh2 = max(-h2, 0); + MF4 ni2 = max(-i2, 0); a2 = max(a2, 0); b2 = max(b2, 0); @@ -826,132 +829,132 @@ void Pass5(uint2 blockStart, uint3 threadId) { h2 = max(h2, 0); i2 = max(i2, 0); - min16float4 conv2d_1_tf = tex3.SampleLevel(sam, pos, 0); - min16float4 nconv2d_1_tf = max(-conv2d_1_tf, 0); + MF4 conv2d_1_tf = tex3.SampleLevel(sam, pos, 0); + MF4 nconv2d_1_tf = max(-conv2d_1_tf, 0); conv2d_1_tf = max(conv2d_1_tf, 0); - min16float4 conv2d_4_tf = tex6.SampleLevel(sam, pos, 0); - min16float4 nconv2d_4_tf = max(-conv2d_4_tf, 0); + MF4 conv2d_4_tf = tex6.SampleLevel(sam, pos, 0); + MF4 nconv2d_4_tf = max(-conv2d_4_tf, 0); conv2d_4_tf = max(conv2d_4_tf, 0); - min16float4 conv2d_7_tf = tex7.SampleLevel(sam, pos, 0); - min16float4 nconv2d_7_tf = max(-conv2d_7_tf, 0); + MF4 conv2d_7_tf = tex7.SampleLevel(sam, pos, 0); + MF4 nconv2d_7_tf = max(-conv2d_7_tf, 0); conv2d_7_tf = max(conv2d_7_tf, 0); - min16float4 conv2d_11_tf = mul(a1, min16float4x4(-0.22341304, 0.26908797, 0.04134543, 0.06961319, 0.32176727, 0.07702703, 0.03751845, -0.13761088, -0.09979559, 0.06891045, -0.01716057, -0.031486046, -0.016294012, 0.0262252, 0.012725462, -0.054174248)); - conv2d_11_tf += mul(b1, min16float4x4(0.0758998, 0.044578414, -0.058127478, -0.04941571, 0.1685694, 0.9547572, 0.3217995, 0.04913146, 0.08628588, -0.49687696, 0.05530926, -0.19010891, 0.0077229803, 0.3938303, 0.18076055, -0.048131783)); - conv2d_11_tf += mul(c1, min16float4x4(0.03656385, 0.23112705, 0.13059878, 0.16223684, -0.2766845, 0.053392846, 0.06446786, 0.19696166, -0.14884388, -0.23103243, -0.07006061, -0.021727445, 0.026394684, -0.31138313, -0.0976933, -0.062459927)); - conv2d_11_tf += mul(d1, min16float4x4(-0.36985022, -0.3396681, 0.035750575, 0.019713784, 0.10074354, -0.34114882, -0.01150834, -0.1436701, -0.36870074, -0.3272402, -0.03879516, -0.094077155, 0.016875539, 0.23895474, -0.14396004, -0.06785279)); - conv2d_11_tf += mul(e1, min16float4x4(0.057131216, -0.5966212, -0.13011967, -0.3684052, 0.6414469, 0.45823926, 0.043126952, -0.12702179, 0.029217511, 0.43957123, 0.06747733, 0.35508418, -0.13576074, 0.28117993, 0.1785782, 0.20060769)); - conv2d_11_tf += mul(f1, min16float4x4(0.112133466, 0.2773932, -0.047416527, -0.06561597, 0.093935706, 0.032524325, 0.02208551, 0.10400939, -0.0062363064, 0.20578235, 0.124429, 0.045867924, 0.024913216, -0.07508951, -0.1506746, -0.07368737)); - conv2d_11_tf += mul(g1, min16float4x4(0.029188056, 0.13675697, -0.10047892, -0.15162368, 0.11152231, 0.17758776, 0.04638467, -0.15375991, -0.08195171, -0.00092798605, -0.11137887, -0.20476487, -0.06701632, -0.38742077, 0.10833869, 0.07575963)); - conv2d_11_tf += mul(h1, min16float4x4(0.12579612, -0.13082299, 0.022704111, -0.049295194, 0.02813974, 0.06766161, 0.021488592, -0.22899324, -0.13967377, -0.42789128, -0.15561862, -0.13880157, -0.31957027, -0.051553562, -0.15501565, -0.17607704)); - conv2d_11_tf += mul(i1, min16float4x4(-0.014785312, -0.3358245, 0.09859993, 0.17852743, 0.06758491, 0.040827237, -0.014897847, -0.027630018, -0.041637477, -0.10967412, -0.10507281, 0.058183335, -0.01929858, 0.09047934, -0.19679205, -0.16896065)); - conv2d_11_tf += mul(a2, min16float4x4(-0.19059956, 0.059083544, -0.07367043, 0.10374235, -0.12928921, 0.16821185, 0.03542259, 0.07853399, -0.029948441, 0.045060057, 0.10522493, 0.15548709, 0.13417992, 0.12784965, 0.068737574, 0.024369959)); - conv2d_11_tf += mul(b2, min16float4x4(-0.2539489, -0.15361321, -0.024794202, 0.23387837, -0.021986792, 0.035640705, -0.053465687, 0.041275553, -0.12349385, 0.11599216, -0.12158652, -0.0016647653, 0.03552641, 0.15126309, 0.10521408, 0.022221778)); - conv2d_11_tf += mul(c2, min16float4x4(-0.09391041, 0.21640098, 0.06468435, 0.021124857, -0.017427467, 0.14731239, 0.0888631, 0.06669842, 0.16802992, -0.042000934, -0.007442969, -0.17762569, -0.106376246, -0.007006815, 0.048836768, 0.07634349)); - conv2d_11_tf += mul(d2, min16float4x4(-0.08242374, -0.35055616, 0.11752318, 0.06287576, -0.08078838, 0.015269983, 0.07802465, 0.036515962, -0.047435157, -0.23535018, 0.10882656, 0.00760307, 0.20816213, 0.16291322, -0.17480974, -0.09656055)); - conv2d_11_tf += mul(e2, min16float4x4(0.3776239, 0.48836887, 0.046571143, -0.0005301381, 0.111404456, -0.2056147, 0.0976322, -0.07087254, -0.23208277, 0.64508325, 0.029519977, -0.32163903, 0.12203931, 1.2488136, 0.0713469, -0.12589021)); - conv2d_11_tf += mul(f2, min16float4x4(-0.1458724, -0.2927259, -0.11825573, 0.050236594, 0.005908592, 0.009147886, 0.014676971, -0.09960781, -0.031219782, 0.0008116867, -0.16999915, -0.08393424, -0.017762119, 0.15271363, 0.17894958, 0.104973435)); - conv2d_11_tf += mul(g2, min16float4x4(0.15102111, -0.017580042, -0.009878415, 0.09603493, -0.14158034, 0.01766169, 0.026301328, 0.14016923, 0.07513633, 0.12250821, 0.14139763, 0.119470306, 0.056335848, 0.011718554, -0.051952817, -0.1087701)); - conv2d_11_tf += mul(h2, min16float4x4(0.12267096, 0.22258927, -0.23374331, -0.336529, -0.03149633, -0.26095635, 0.00365308, 0.048830956, 0.035902984, -0.04686918, -0.08079191, -0.17013429, 0.0254567, -0.05592242, 0.0968047, 0.07426071)); - conv2d_11_tf += mul(i2, min16float4x4(-0.16953564, 0.074455656, 0.0029755495, 0.20576377, -0.050961535, 0.060958825, 0.014226229, 0.104992926, 0.06942283, 0.29077423, 0.040234245, 0.12337425, -0.012045997, -0.11109262, 0.020255094, 0.08945579)); - conv2d_11_tf += mul(na1, min16float4x4(0.2978639, -0.24613461, -0.083074145, -0.2367985, -0.13995647, -0.21201506, -0.16809967, -0.08163256, 0.22451796, -0.21319884, 0.097241744, 0.17276905, 0.059754357, -0.21800114, 0.016986718, 0.059852242)); - conv2d_11_tf += mul(nb1, min16float4x4(0.10399378, 0.016165858, 0.006949626, -0.00957426, -0.07206657, 0.85400176, -0.069736175, 0.11563255, -0.15550873, 0.21035826, -0.09730208, 0.21803263, -0.029731166, 0.07174115, -0.075019605, 0.06605764)); - conv2d_11_tf += mul(nc1, min16float4x4(0.008660154, -0.1689362, -0.13275097, -0.14157207, -0.06571528, 0.2641335, 0.17738026, 0.016201235, -0.058384545, -0.089386165, -0.10691102, 0.03380599, 0.07696467, 0.010921241, -0.05858657, 0.044599395)); - conv2d_11_tf += mul(nd1, min16float4x4(0.29438433, 0.39757052, -0.12448894, -0.14726874, 0.054101802, 0.19893955, 0.0081761405, -0.030686913, -0.09465847, -0.09517581, 0.0046200817, 0.2743172, 0.18768987, 0.2577441, 0.3185588, -0.0043636197)); - conv2d_11_tf += mul(ne1, min16float4x4(0.30364004, 0.45719072, -0.002478791, -0.25550374, 0.044718135, 0.9974692, 0.27661783, 0.38724384, 0.20643012, -0.36335453, 0.04044719, -0.15773767, 0.019318745, -0.015368104, -0.13033883, -0.21446472)); - conv2d_11_tf += mul(nf1, min16float4x4(0.17225221, -0.2870429, -0.11031537, -0.20985241, -0.1813215, 0.47034717, 0.19177493, 0.1565604, -0.22090979, -0.1778559, -0.15998572, 0.20591277, -0.27751637, -0.17734572, -0.22385214, 0.2001247)); - conv2d_11_tf += mul(ng1, min16float4x4(0.09103924, 0.012440279, -0.11811386, -0.28955194, -0.024203198, -0.014690502, -0.041423846, 0.0062359073, 0.06732812, -0.040848043, -0.0807372, -0.06598595, -0.020464217, 0.35617942, 0.054869782, -0.06990699)); - conv2d_11_tf += mul(nh1, min16float4x4(-0.22022852, -0.30250633, -0.008539953, -0.17535509, 0.048545327, -0.06961757, 0.1520779, 0.15551318, 0.145789, 0.41386685, 0.19608185, 0.02285933, 0.19650589, 0.1140758, 0.058065582, 0.06438903)); - conv2d_11_tf += mul(ni1, min16float4x4(0.17500387, 0.009752107, -0.08735754, -0.40322778, -0.04718948, -0.1520063, 0.015334469, 0.055586398, -0.06315823, 0.01381341, 0.06333497, 0.20780154, -0.14789844, 0.008873181, 0.20424104, 0.18570045)); - conv2d_11_tf += mul(na2, min16float4x4(0.17809622, -0.054737452, 0.045792647, -0.05761767, 0.1530876, -0.058534857, -0.008100565, 0.036446143, 0.27693272, 0.3004126, -0.1283306, -0.50103384, -0.3350802, 0.09919993, -0.10481551, 0.059236333)); - conv2d_11_tf += mul(nb2, min16float4x4(0.08178473, 0.01796507, 0.045470674, -0.1395204, -0.07053285, -0.15308544, -0.016434597, 0.09957456, 0.07303232, 0.5558379, 0.1058254, -0.12340164, -0.37540868, 0.20688659, 0.11254531, 0.08988308)); - conv2d_11_tf += mul(nc2, min16float4x4(-0.115479395, -0.04145597, -0.02444945, -0.0012505532, -0.016777854, -0.21254961, -0.11969028, -0.10986302, 0.34061527, 0.35168666, 0.19457188, -0.25304377, 0.089430355, -0.13593785, -0.03715568, -0.07161111)); - conv2d_11_tf += mul(nd2, min16float4x4(0.135465, 0.16024914, -0.16819438, -0.076060556, 0.14722055, -0.12402309, -0.091675736, -0.11345004, 0.3370019, 0.21161243, 0.08165217, 0.26650387, 0.11799823, 1.1248134, 0.031586587, 0.40626523)); - conv2d_11_tf += mul(ne2, min16float4x4(-0.3881156, 0.075572714, -0.2955678, -0.04820779, -0.14431494, 0.17108414, -0.031334974, 0.14272547, 0.10431918, -0.92185026, -0.550305, -0.09849551, -0.19279402, 0.47034186, 0.38574138, 0.5469418)); - conv2d_11_tf += mul(nf2, min16float4x4(0.07301299, -0.1655295, 0.0851716, 0.0349889, 0.037978686, -0.34476924, -0.09894407, -0.09279173, -0.017504893, 0.16626996, 0.23299451, -0.29538614, -0.035250418, 0.102075204, 0.014679606, 0.05283856)); - conv2d_11_tf += mul(ng2, min16float4x4(0.082496785, -0.047353677, -0.1036778, -0.014507561, 0.091381975, -0.07229443, -0.03069601, -0.07463806, 0.2173226, 0.061551273, 0.01672064, 0.065622196, 0.1645865, 0.08651663, 0.18979368, 0.2012662)); - conv2d_11_tf += mul(nh2, min16float4x4(-0.2116467, -0.26988897, -0.049475558, 0.18609211, -0.08837133, -0.219245, 0.05900789, -0.007832284, -0.028579885, 0.20587349, -0.07297767, -0.19551088, 0.052455146, -0.24630548, 0.12438646, -0.017073039)); - conv2d_11_tf += mul(ni2, min16float4x4(0.15815273, -0.13286865, -0.036927793, -0.118895106, 0.06876401, -0.08193885, -0.073907554, -0.17851423, 0.025570622, -0.05206693, 0.0054880823, -0.14550385, 0.031355973, -0.0617539, -0.09522895, 0.007602468)); - conv2d_11_tf += min16float4(0.10656278, 0.12657918, 0.16990805, -0.12699938); + MF4 conv2d_11_tf = MF4(0.10656278, 0.12657918, 0.16990805, -0.12699938); + conv2d_11_tf = MulAdd(a1, MF4x4(-0.22341304, 0.26908797, 0.04134543, 0.06961319, 0.32176727, 0.07702703, 0.03751845, -0.13761088, -0.09979559, 0.06891045, -0.01716057, -0.031486046, -0.016294012, 0.0262252, 0.012725462, -0.054174248), conv2d_11_tf); + conv2d_11_tf = MulAdd(b1, MF4x4(0.0758998, 0.044578414, -0.058127478, -0.04941571, 0.1685694, 0.9547572, 0.3217995, 0.04913146, 0.08628588, -0.49687696, 0.05530926, -0.19010891, 0.0077229803, 0.3938303, 0.18076055, -0.048131783), conv2d_11_tf); + conv2d_11_tf = MulAdd(c1, MF4x4(0.03656385, 0.23112705, 0.13059878, 0.16223684, -0.2766845, 0.053392846, 0.06446786, 0.19696166, -0.14884388, -0.23103243, -0.07006061, -0.021727445, 0.026394684, -0.31138313, -0.0976933, -0.062459927), conv2d_11_tf); + conv2d_11_tf = MulAdd(d1, MF4x4(-0.36985022, -0.3396681, 0.035750575, 0.019713784, 0.10074354, -0.34114882, -0.01150834, -0.1436701, -0.36870074, -0.3272402, -0.03879516, -0.094077155, 0.016875539, 0.23895474, -0.14396004, -0.06785279), conv2d_11_tf); + conv2d_11_tf = MulAdd(e1, MF4x4(0.057131216, -0.5966212, -0.13011967, -0.3684052, 0.6414469, 0.45823926, 0.043126952, -0.12702179, 0.029217511, 0.43957123, 0.06747733, 0.35508418, -0.13576074, 0.28117993, 0.1785782, 0.20060769), conv2d_11_tf); + conv2d_11_tf = MulAdd(f1, MF4x4(0.112133466, 0.2773932, -0.047416527, -0.06561597, 0.093935706, 0.032524325, 0.02208551, 0.10400939, -0.0062363064, 0.20578235, 0.124429, 0.045867924, 0.024913216, -0.07508951, -0.1506746, -0.07368737), conv2d_11_tf); + conv2d_11_tf = MulAdd(g1, MF4x4(0.029188056, 0.13675697, -0.10047892, -0.15162368, 0.11152231, 0.17758776, 0.04638467, -0.15375991, -0.08195171, -0.00092798605, -0.11137887, -0.20476487, -0.06701632, -0.38742077, 0.10833869, 0.07575963), conv2d_11_tf); + conv2d_11_tf = MulAdd(h1, MF4x4(0.12579612, -0.13082299, 0.022704111, -0.049295194, 0.02813974, 0.06766161, 0.021488592, -0.22899324, -0.13967377, -0.42789128, -0.15561862, -0.13880157, -0.31957027, -0.051553562, -0.15501565, -0.17607704), conv2d_11_tf); + conv2d_11_tf = MulAdd(i1, MF4x4(-0.014785312, -0.3358245, 0.09859993, 0.17852743, 0.06758491, 0.040827237, -0.014897847, -0.027630018, -0.041637477, -0.10967412, -0.10507281, 0.058183335, -0.01929858, 0.09047934, -0.19679205, -0.16896065), conv2d_11_tf); + conv2d_11_tf = MulAdd(a2, MF4x4(-0.19059956, 0.059083544, -0.07367043, 0.10374235, -0.12928921, 0.16821185, 0.03542259, 0.07853399, -0.029948441, 0.045060057, 0.10522493, 0.15548709, 0.13417992, 0.12784965, 0.068737574, 0.024369959), conv2d_11_tf); + conv2d_11_tf = MulAdd(b2, MF4x4(-0.2539489, -0.15361321, -0.024794202, 0.23387837, -0.021986792, 0.035640705, -0.053465687, 0.041275553, -0.12349385, 0.11599216, -0.12158652, -0.0016647653, 0.03552641, 0.15126309, 0.10521408, 0.022221778), conv2d_11_tf); + conv2d_11_tf = MulAdd(c2, MF4x4(-0.09391041, 0.21640098, 0.06468435, 0.021124857, -0.017427467, 0.14731239, 0.0888631, 0.06669842, 0.16802992, -0.042000934, -0.007442969, -0.17762569, -0.106376246, -0.007006815, 0.048836768, 0.07634349), conv2d_11_tf); + conv2d_11_tf = MulAdd(d2, MF4x4(-0.08242374, -0.35055616, 0.11752318, 0.06287576, -0.08078838, 0.015269983, 0.07802465, 0.036515962, -0.047435157, -0.23535018, 0.10882656, 0.00760307, 0.20816213, 0.16291322, -0.17480974, -0.09656055), conv2d_11_tf); + conv2d_11_tf = MulAdd(e2, MF4x4(0.3776239, 0.48836887, 0.046571143, -0.0005301381, 0.111404456, -0.2056147, 0.0976322, -0.07087254, -0.23208277, 0.64508325, 0.029519977, -0.32163903, 0.12203931, 1.2488136, 0.0713469, -0.12589021), conv2d_11_tf); + conv2d_11_tf = MulAdd(f2, MF4x4(-0.1458724, -0.2927259, -0.11825573, 0.050236594, 0.005908592, 0.009147886, 0.014676971, -0.09960781, -0.031219782, 0.0008116867, -0.16999915, -0.08393424, -0.017762119, 0.15271363, 0.17894958, 0.104973435), conv2d_11_tf); + conv2d_11_tf = MulAdd(g2, MF4x4(0.15102111, -0.017580042, -0.009878415, 0.09603493, -0.14158034, 0.01766169, 0.026301328, 0.14016923, 0.07513633, 0.12250821, 0.14139763, 0.119470306, 0.056335848, 0.011718554, -0.051952817, -0.1087701), conv2d_11_tf); + conv2d_11_tf = MulAdd(h2, MF4x4(0.12267096, 0.22258927, -0.23374331, -0.336529, -0.03149633, -0.26095635, 0.00365308, 0.048830956, 0.035902984, -0.04686918, -0.08079191, -0.17013429, 0.0254567, -0.05592242, 0.0968047, 0.07426071), conv2d_11_tf); + conv2d_11_tf = MulAdd(i2, MF4x4(-0.16953564, 0.074455656, 0.0029755495, 0.20576377, -0.050961535, 0.060958825, 0.014226229, 0.104992926, 0.06942283, 0.29077423, 0.040234245, 0.12337425, -0.012045997, -0.11109262, 0.020255094, 0.08945579), conv2d_11_tf); + conv2d_11_tf = MulAdd(na1, MF4x4(0.2978639, -0.24613461, -0.083074145, -0.2367985, -0.13995647, -0.21201506, -0.16809967, -0.08163256, 0.22451796, -0.21319884, 0.097241744, 0.17276905, 0.059754357, -0.21800114, 0.016986718, 0.059852242), conv2d_11_tf); + conv2d_11_tf = MulAdd(nb1, MF4x4(0.10399378, 0.016165858, 0.006949626, -0.00957426, -0.07206657, 0.85400176, -0.069736175, 0.11563255, -0.15550873, 0.21035826, -0.09730208, 0.21803263, -0.029731166, 0.07174115, -0.075019605, 0.06605764), conv2d_11_tf); + conv2d_11_tf = MulAdd(nc1, MF4x4(0.008660154, -0.1689362, -0.13275097, -0.14157207, -0.06571528, 0.2641335, 0.17738026, 0.016201235, -0.058384545, -0.089386165, -0.10691102, 0.03380599, 0.07696467, 0.010921241, -0.05858657, 0.044599395), conv2d_11_tf); + conv2d_11_tf = MulAdd(nd1, MF4x4(0.29438433, 0.39757052, -0.12448894, -0.14726874, 0.054101802, 0.19893955, 0.0081761405, -0.030686913, -0.09465847, -0.09517581, 0.0046200817, 0.2743172, 0.18768987, 0.2577441, 0.3185588, -0.0043636197), conv2d_11_tf); + conv2d_11_tf = MulAdd(ne1, MF4x4(0.30364004, 0.45719072, -0.002478791, -0.25550374, 0.044718135, 0.9974692, 0.27661783, 0.38724384, 0.20643012, -0.36335453, 0.04044719, -0.15773767, 0.019318745, -0.015368104, -0.13033883, -0.21446472), conv2d_11_tf); + conv2d_11_tf = MulAdd(nf1, MF4x4(0.17225221, -0.2870429, -0.11031537, -0.20985241, -0.1813215, 0.47034717, 0.19177493, 0.1565604, -0.22090979, -0.1778559, -0.15998572, 0.20591277, -0.27751637, -0.17734572, -0.22385214, 0.2001247), conv2d_11_tf); + conv2d_11_tf = MulAdd(ng1, MF4x4(0.09103924, 0.012440279, -0.11811386, -0.28955194, -0.024203198, -0.014690502, -0.041423846, 0.0062359073, 0.06732812, -0.040848043, -0.0807372, -0.06598595, -0.020464217, 0.35617942, 0.054869782, -0.06990699), conv2d_11_tf); + conv2d_11_tf = MulAdd(nh1, MF4x4(-0.22022852, -0.30250633, -0.008539953, -0.17535509, 0.048545327, -0.06961757, 0.1520779, 0.15551318, 0.145789, 0.41386685, 0.19608185, 0.02285933, 0.19650589, 0.1140758, 0.058065582, 0.06438903), conv2d_11_tf); + conv2d_11_tf = MulAdd(ni1, MF4x4(0.17500387, 0.009752107, -0.08735754, -0.40322778, -0.04718948, -0.1520063, 0.015334469, 0.055586398, -0.06315823, 0.01381341, 0.06333497, 0.20780154, -0.14789844, 0.008873181, 0.20424104, 0.18570045), conv2d_11_tf); + conv2d_11_tf = MulAdd(na2, MF4x4(0.17809622, -0.054737452, 0.045792647, -0.05761767, 0.1530876, -0.058534857, -0.008100565, 0.036446143, 0.27693272, 0.3004126, -0.1283306, -0.50103384, -0.3350802, 0.09919993, -0.10481551, 0.059236333), conv2d_11_tf); + conv2d_11_tf = MulAdd(nb2, MF4x4(0.08178473, 0.01796507, 0.045470674, -0.1395204, -0.07053285, -0.15308544, -0.016434597, 0.09957456, 0.07303232, 0.5558379, 0.1058254, -0.12340164, -0.37540868, 0.20688659, 0.11254531, 0.08988308), conv2d_11_tf); + conv2d_11_tf = MulAdd(nc2, MF4x4(-0.115479395, -0.04145597, -0.02444945, -0.0012505532, -0.016777854, -0.21254961, -0.11969028, -0.10986302, 0.34061527, 0.35168666, 0.19457188, -0.25304377, 0.089430355, -0.13593785, -0.03715568, -0.07161111), conv2d_11_tf); + conv2d_11_tf = MulAdd(nd2, MF4x4(0.135465, 0.16024914, -0.16819438, -0.076060556, 0.14722055, -0.12402309, -0.091675736, -0.11345004, 0.3370019, 0.21161243, 0.08165217, 0.26650387, 0.11799823, 1.1248134, 0.031586587, 0.40626523), conv2d_11_tf); + conv2d_11_tf = MulAdd(ne2, MF4x4(-0.3881156, 0.075572714, -0.2955678, -0.04820779, -0.14431494, 0.17108414, -0.031334974, 0.14272547, 0.10431918, -0.92185026, -0.550305, -0.09849551, -0.19279402, 0.47034186, 0.38574138, 0.5469418), conv2d_11_tf); + conv2d_11_tf = MulAdd(nf2, MF4x4(0.07301299, -0.1655295, 0.0851716, 0.0349889, 0.037978686, -0.34476924, -0.09894407, -0.09279173, -0.017504893, 0.16626996, 0.23299451, -0.29538614, -0.035250418, 0.102075204, 0.014679606, 0.05283856), conv2d_11_tf); + conv2d_11_tf = MulAdd(ng2, MF4x4(0.082496785, -0.047353677, -0.1036778, -0.014507561, 0.091381975, -0.07229443, -0.03069601, -0.07463806, 0.2173226, 0.061551273, 0.01672064, 0.065622196, 0.1645865, 0.08651663, 0.18979368, 0.2012662), conv2d_11_tf); + conv2d_11_tf = MulAdd(nh2, MF4x4(-0.2116467, -0.26988897, -0.049475558, 0.18609211, -0.08837133, -0.219245, 0.05900789, -0.007832284, -0.028579885, 0.20587349, -0.07297767, -0.19551088, 0.052455146, -0.24630548, 0.12438646, -0.017073039), conv2d_11_tf); + conv2d_11_tf = MulAdd(ni2, MF4x4(0.15815273, -0.13286865, -0.036927793, -0.118895106, 0.06876401, -0.08193885, -0.073907554, -0.17851423, 0.025570622, -0.05206693, 0.0054880823, -0.14550385, 0.031355973, -0.0617539, -0.09522895, 0.007602468), conv2d_11_tf); tex8[gxy] = conv2d_11_tf; - min16float4 nconv2d_11_tf = max(-conv2d_11_tf, 0); + MF4 nconv2d_11_tf = max(-conv2d_11_tf, 0); conv2d_11_tf = max(conv2d_11_tf, 0); - min16float4 conv2d_10_tf = mul(a1, min16float4x4(-0.07384766, -0.027958225, 0.37361667, -0.082532816, 0.14156812, 0.02939518, 0.22737388, 0.19935979, -0.090212055, 0.04403584, 0.18456662, -0.026585983, 0.22868252, 0.09938934, -0.08726494, -0.115827106)); - conv2d_10_tf += mul(b1, min16float4x4(-0.09788985, -0.3116416, 0.35298944, -0.08990593, 0.16181462, -0.22193117, -0.5422943, 0.23932208, 0.15739329, -0.06103239, 0.7953177, -0.047183976, 0.21341586, 0.19858226, 0.0016054768, 0.054749873)); - conv2d_10_tf += mul(c1, min16float4x4(-0.026696216, 0.061291914, -0.35742328, 0.00082715444, 0.10632543, -0.09428293, -0.12645036, -0.043706786, 0.09915236, 0.13788143, 0.15950204, -0.089837976, 0.04461279, -0.054954246, 0.04740199, 0.07014664)); - conv2d_10_tf += mul(d1, min16float4x4(-0.12016896, 0.16669498, 0.26552972, -0.35876223, 0.045097463, -0.15016092, -0.0988156, -0.416339, -0.0101760905, 0.26459762, 0.31927487, -0.16307381, 0.12096833, -0.06770049, -0.017283063, 0.013299284)); - conv2d_10_tf += mul(e1, min16float4x4(0.15951112, 0.14506923, 0.6747884, -0.24716964, -0.3413045, -0.2017185, -0.9612693, 0.5421329, -0.16023788, 0.32216108, 0.062496744, 0.21633703, 0.004581572, 0.2359334, -0.35295007, 0.09726352)); - conv2d_10_tf += mul(f1, min16float4x4(0.13874753, -0.0063067, -0.14469895, 0.11554976, -0.019183924, -0.04544159, -0.29430693, -0.10431769, 0.15769906, 0.00601582, -0.454376, -0.11790236, 0.16000259, 0.29670846, -0.9759625, 0.31053123)); - conv2d_10_tf += mul(g1, min16float4x4(0.014491841, 0.0074491766, -0.09696308, -0.09127842, -0.03579932, -0.20163259, -0.21284793, -0.261139, 0.24359487, 0.14113441, 0.23983651, -0.16634561, -0.09547295, 0.10859189, 0.13468629, 0.33521304)); - conv2d_10_tf += mul(h1, min16float4x4(0.008276171, 0.12959969, 0.5093179, 0.002464717, 0.016199486, -0.03156574, -0.4428472, -0.10885838, -0.049632378, 0.2476587, 0.07033375, -0.20044556, 0.04982328, 0.19631135, -0.33776414, -0.6421577)); - conv2d_10_tf += mul(i1, min16float4x4(-0.04192616, 0.06393284, 0.07120974, 0.076716706, -0.09867013, -0.13239172, 0.012114291, -0.038557116, 0.029985918, 0.022090917, 0.07777519, 0.008410333, 0.0034299784, 0.062100925, -0.38884223, -0.01593217)); - conv2d_10_tf += mul(a2, min16float4x4(-0.013629574, -0.06545711, 0.14423661, -0.03981215, -0.052800525, -0.058425374, -0.05814048, -0.11337634, 0.05479856, -0.010584571, -0.22650285, 0.056241333, -0.1396656, -0.0010838923, -0.30166936, 0.040658727)); - conv2d_10_tf += mul(b2, min16float4x4(0.045267094, -0.086306006, -0.05226326, 0.1539859, -0.02723665, -0.13326567, 0.22143897, -0.018399606, 0.12181383, 0.1452545, -0.3973738, -0.10285705, -0.15147118, -0.28072536, 0.4379245, -0.06340889)); - conv2d_10_tf += mul(c2, min16float4x4(0.14590915, 0.034363795, -0.02217679, 0.15465777, -0.020056443, 0.06256286, 0.00068213895, -0.004845135, 0.10313473, 0.13895464, -0.0957288, 0.10452721, -0.06313026, -0.06739777, 0.16052145, -0.115432285)); - conv2d_10_tf += mul(d2, min16float4x4(-0.083468825, 0.15143521, 0.19880214, -0.0054416056, -0.1074472, 0.027439727, -0.16624895, -0.026701076, -0.046576414, -0.061388403, 0.34304553, -0.08921803, 0.09399348, -0.043658186, -1.3050584, -0.07285428)); - conv2d_10_tf += mul(e2, min16float4x4(-0.2544287, -0.38059148, 0.7181705, -0.44567156, 0.10387618, 0.06472145, 0.08178852, -0.016514499, -0.1630076, -0.16066378, -0.19193888, -0.24423774, -0.14821364, -0.28755048, -0.1322022, 0.25716448)); - conv2d_10_tf += mul(f2, min16float4x4(0.13228743, 0.24624044, 0.10462062, 0.26341802, 0.035913363, 0.09206641, 0.044785645, 0.010443224, 0.05206244, 0.008345797, -0.32408288, -0.2484674, -0.027154556, 0.0006338974, 0.09008037, 0.027416239)); - conv2d_10_tf += mul(g2, min16float4x4(-0.061936356, -0.07008738, -0.22344092, 0.20339371, 0.03216865, 0.103117235, 0.10232644, 0.10809929, 0.08320763, 0.058004253, -0.06520991, 0.038012277, -0.12916973, -0.1150849, -0.03713365, -0.0886423)); - conv2d_10_tf += mul(h2, min16float4x4(0.3213531, 0.1826207, 0.022152286, 0.025484305, -0.054090437, 0.08160166, 0.13491987, -0.06896833, 0.10781034, 0.08944192, -0.34036443, -0.018937334, -0.18917687, -0.13239872, 0.11581373, -0.038915917)); - conv2d_10_tf += mul(i2, min16float4x4(-0.20916902, 0.08310064, 0.19347866, 0.29880634, -0.007023385, 0.005319598, -0.06649972, 0.03248317, -0.04066817, -0.06176127, -0.41747397, 0.14132817, -0.021392342, -0.021360394, 0.101215124, -0.05375729)); - conv2d_10_tf += mul(na1, min16float4x4(-0.008702178, -0.03840238, 0.13321695, 0.065163925, -0.062342774, -0.030948557, 0.0069512874, -0.2634128, -0.09415655, 0.02985776, 0.021763485, 0.27137864, -0.21608604, -0.19126832, -0.37335086, -0.16941321)); - conv2d_10_tf += mul(nb1, min16float4x4(0.04631249, 0.33492458, -0.6266605, 0.20180638, 0.039800193, -0.14341171, -0.8203481, 0.04878081, 0.008235832, 0.15065777, -0.32971388, 0.1828355, -0.1510293, -0.17637968, 0.125366, -0.06719769)); - conv2d_10_tf += mul(nc1, min16float4x4(-0.014685718, -0.04156494, 0.2728874, -0.106735535, -0.1312142, -0.05991217, 0.15173748, -0.09276527, 0.027946949, 0.12980466, 0.017537035, 0.058945708, -0.11254791, -0.06708247, -0.28308856, -0.058375884)); - conv2d_10_tf += mul(nd1, min16float4x4(0.2220684, -0.19030218, -0.1259754, 0.09647918, -0.20530927, -0.16737363, -0.055208467, -0.067288965, 0.1428622, 0.08903465, 0.494294, 0.28669015, -0.17464463, -0.2190753, 0.13515279, 0.24887499)); - conv2d_10_tf += mul(ne1, min16float4x4(-0.24211104, -0.11129136, 0.03340221, 0.49835417, -0.11755811, -0.732711, -0.3876752, 0.6178176, 0.1437329, -0.05131951, -0.16705558, -0.3823752, -0.23198022, -0.27967533, 0.7223488, -0.5565778)); - conv2d_10_tf += mul(nf1, min16float4x4(-0.04738433, -0.14606567, 0.22317784, 0.0055712103, -0.064653076, -0.16446865, -0.10802961, -0.10179589, 0.060855757, 0.22762765, -0.037358448, 0.24772792, -0.15458576, -0.0770241, 0.43480682, 0.008342627)); - conv2d_10_tf += mul(ng1, min16float4x4(0.117756896, -0.06760757, 0.12629354, -0.13241243, -0.05329636, 0.031004142, 0.19809054, 0.1504123, -0.024029436, -0.011011192, -0.014698134, 0.12855798, 0.027526522, -0.102618076, -0.2597635, -0.23887417)); - conv2d_10_tf += mul(nh1, min16float4x4(-0.012681944, 0.088339254, 0.58977854, 0.020116867, -0.30643263, -0.11593101, 0.2829653, -0.060883448, 0.027514484, -0.19997032, -0.12530403, 0.3302542, -0.10344085, -0.0644199, -0.11374762, 0.38778695)); - conv2d_10_tf += mul(ni1, min16float4x4(0.073869206, -0.059440095, -0.016326021, -0.08571949, -0.04171866, 0.042949438, 0.13984677, -0.15829174, -0.025245706, 0.0059198164, -0.0432442, 0.20765327, -0.058762096, 0.11539401, 0.036120266, 0.24331446)); - conv2d_10_tf += mul(na2, min16float4x4(0.012567978, 0.07251118, -0.12190053, 0.10283353, 0.088345066, 0.0017397653, -0.2381744, 0.101314925, 0.022791719, -0.043069735, -0.15024713, -0.072577685, 0.19976862, -0.059844784, 0.38824072, 0.0020866133)); - conv2d_10_tf += mul(nb2, min16float4x4(0.27314463, 0.0739519, 0.08960633, 0.03709254, 0.032681584, 0.22859, -0.41635752, -0.07382896, 0.13144481, -0.24017848, 0.07981319, 0.15370876, 0.059314378, 0.29214182, -0.39464346, -0.13867916)); - conv2d_10_tf += mul(nc2, min16float4x4(-0.005685388, -0.039528795, -0.055917054, -0.06578973, 0.020702876, -0.00709528, 0.08486715, -0.0075865295, 0.05714374, -0.27417144, 0.4555885, 0.013780273, 0.05096835, 0.159233, -0.05228782, 0.15794256)); - conv2d_10_tf += mul(nd2, min16float4x4(-0.0010807351, -0.022064442, 0.13078515, 0.11357431, 0.11269685, 0.029679844, 0.14385091, 0.10241993, 0.030162932, -0.016101424, 0.20761637, 0.4683215, 0.03091817, -0.58406824, -0.3438075, 0.3653469)); - conv2d_10_tf += mul(ne2, min16float4x4(-0.016927537, 0.13944507, -0.38772225, -0.11645372, -0.1683389, -0.081295304, 0.271328, 0.14980802, 0.47266555, 0.04091753, 0.006903156, -0.00832747, -0.056511678, 0.06924621, -1.0780094, 0.1268596)); - conv2d_10_tf += mul(nf2, min16float4x4(-0.21017683, -0.077091806, 0.28906518, 0.022843512, -0.062092084, -0.017447937, 0.25115407, -0.1367289, 0.0021664056, 0.0034106125, 0.5305142, -0.029012429, -0.014483031, 0.05575314, -0.35784876, -0.09252365)); - conv2d_10_tf += mul(ng2, min16float4x4(0.008859689, 0.06481962, 0.09483335, 0.18473764, 0.0015982646, -0.06144117, 0.054042596, -0.19934553, -0.20250106, 0.096015476, 0.21697922, 0.6265738, -0.16049659, -0.33120447, 0.27775142, 0.14459921)); - conv2d_10_tf += mul(nh2, min16float4x4(-0.11195867, 0.21663944, 0.5021048, 0.04712746, 0.08637696, 0.07792573, 0.23626573, -0.075164914, 0.06574307, -0.16795279, 0.06829719, -0.027584063, -0.015064924, -0.057976205, 0.14589287, -0.15683101)); - conv2d_10_tf += mul(ni2, min16float4x4(0.07626267, -0.03523683, 0.106941625, -0.15825523, 0.032598946, 0.038718563, -0.016688785, -0.054390162, 0.05544311, 0.13933052, 0.078817375, -0.10183935, 0.041770034, 0.032732744, 0.062236354, 0.0068387473)); - conv2d_10_tf += min16float4(-0.11589812, -0.123082116, -0.003926807, -0.15363532); + MF4 conv2d_10_tf = MF4(-0.11589812, -0.123082116, -0.003926807, -0.15363532); + conv2d_10_tf = MulAdd(a1, MF4x4(-0.07384766, -0.027958225, 0.37361667, -0.082532816, 0.14156812, 0.02939518, 0.22737388, 0.19935979, -0.090212055, 0.04403584, 0.18456662, -0.026585983, 0.22868252, 0.09938934, -0.08726494, -0.115827106), conv2d_10_tf); + conv2d_10_tf = MulAdd(b1, MF4x4(-0.09788985, -0.3116416, 0.35298944, -0.08990593, 0.16181462, -0.22193117, -0.5422943, 0.23932208, 0.15739329, -0.06103239, 0.7953177, -0.047183976, 0.21341586, 0.19858226, 0.0016054768, 0.054749873), conv2d_10_tf); + conv2d_10_tf = MulAdd(c1, MF4x4(-0.026696216, 0.061291914, -0.35742328, 0.00082715444, 0.10632543, -0.09428293, -0.12645036, -0.043706786, 0.09915236, 0.13788143, 0.15950204, -0.089837976, 0.04461279, -0.054954246, 0.04740199, 0.07014664), conv2d_10_tf); + conv2d_10_tf = MulAdd(d1, MF4x4(-0.12016896, 0.16669498, 0.26552972, -0.35876223, 0.045097463, -0.15016092, -0.0988156, -0.416339, -0.0101760905, 0.26459762, 0.31927487, -0.16307381, 0.12096833, -0.06770049, -0.017283063, 0.013299284), conv2d_10_tf); + conv2d_10_tf = MulAdd(e1, MF4x4(0.15951112, 0.14506923, 0.6747884, -0.24716964, -0.3413045, -0.2017185, -0.9612693, 0.5421329, -0.16023788, 0.32216108, 0.062496744, 0.21633703, 0.004581572, 0.2359334, -0.35295007, 0.09726352), conv2d_10_tf); + conv2d_10_tf = MulAdd(f1, MF4x4(0.13874753, -0.0063067, -0.14469895, 0.11554976, -0.019183924, -0.04544159, -0.29430693, -0.10431769, 0.15769906, 0.00601582, -0.454376, -0.11790236, 0.16000259, 0.29670846, -0.9759625, 0.31053123), conv2d_10_tf); + conv2d_10_tf = MulAdd(g1, MF4x4(0.014491841, 0.0074491766, -0.09696308, -0.09127842, -0.03579932, -0.20163259, -0.21284793, -0.261139, 0.24359487, 0.14113441, 0.23983651, -0.16634561, -0.09547295, 0.10859189, 0.13468629, 0.33521304), conv2d_10_tf); + conv2d_10_tf = MulAdd(h1, MF4x4(0.008276171, 0.12959969, 0.5093179, 0.002464717, 0.016199486, -0.03156574, -0.4428472, -0.10885838, -0.049632378, 0.2476587, 0.07033375, -0.20044556, 0.04982328, 0.19631135, -0.33776414, -0.6421577), conv2d_10_tf); + conv2d_10_tf = MulAdd(i1, MF4x4(-0.04192616, 0.06393284, 0.07120974, 0.076716706, -0.09867013, -0.13239172, 0.012114291, -0.038557116, 0.029985918, 0.022090917, 0.07777519, 0.008410333, 0.0034299784, 0.062100925, -0.38884223, -0.01593217), conv2d_10_tf); + conv2d_10_tf = MulAdd(a2, MF4x4(-0.013629574, -0.06545711, 0.14423661, -0.03981215, -0.052800525, -0.058425374, -0.05814048, -0.11337634, 0.05479856, -0.010584571, -0.22650285, 0.056241333, -0.1396656, -0.0010838923, -0.30166936, 0.040658727), conv2d_10_tf); + conv2d_10_tf = MulAdd(b2, MF4x4(0.045267094, -0.086306006, -0.05226326, 0.1539859, -0.02723665, -0.13326567, 0.22143897, -0.018399606, 0.12181383, 0.1452545, -0.3973738, -0.10285705, -0.15147118, -0.28072536, 0.4379245, -0.06340889), conv2d_10_tf); + conv2d_10_tf = MulAdd(c2, MF4x4(0.14590915, 0.034363795, -0.02217679, 0.15465777, -0.020056443, 0.06256286, 0.00068213895, -0.004845135, 0.10313473, 0.13895464, -0.0957288, 0.10452721, -0.06313026, -0.06739777, 0.16052145, -0.115432285), conv2d_10_tf); + conv2d_10_tf = MulAdd(d2, MF4x4(-0.083468825, 0.15143521, 0.19880214, -0.0054416056, -0.1074472, 0.027439727, -0.16624895, -0.026701076, -0.046576414, -0.061388403, 0.34304553, -0.08921803, 0.09399348, -0.043658186, -1.3050584, -0.07285428), conv2d_10_tf); + conv2d_10_tf = MulAdd(e2, MF4x4(-0.2544287, -0.38059148, 0.7181705, -0.44567156, 0.10387618, 0.06472145, 0.08178852, -0.016514499, -0.1630076, -0.16066378, -0.19193888, -0.24423774, -0.14821364, -0.28755048, -0.1322022, 0.25716448), conv2d_10_tf); + conv2d_10_tf = MulAdd(f2, MF4x4(0.13228743, 0.24624044, 0.10462062, 0.26341802, 0.035913363, 0.09206641, 0.044785645, 0.010443224, 0.05206244, 0.008345797, -0.32408288, -0.2484674, -0.027154556, 0.0006338974, 0.09008037, 0.027416239), conv2d_10_tf); + conv2d_10_tf = MulAdd(g2, MF4x4(-0.061936356, -0.07008738, -0.22344092, 0.20339371, 0.03216865, 0.103117235, 0.10232644, 0.10809929, 0.08320763, 0.058004253, -0.06520991, 0.038012277, -0.12916973, -0.1150849, -0.03713365, -0.0886423), conv2d_10_tf); + conv2d_10_tf = MulAdd(h2, MF4x4(0.3213531, 0.1826207, 0.022152286, 0.025484305, -0.054090437, 0.08160166, 0.13491987, -0.06896833, 0.10781034, 0.08944192, -0.34036443, -0.018937334, -0.18917687, -0.13239872, 0.11581373, -0.038915917), conv2d_10_tf); + conv2d_10_tf = MulAdd(i2, MF4x4(-0.20916902, 0.08310064, 0.19347866, 0.29880634, -0.007023385, 0.005319598, -0.06649972, 0.03248317, -0.04066817, -0.06176127, -0.41747397, 0.14132817, -0.021392342, -0.021360394, 0.101215124, -0.05375729), conv2d_10_tf); + conv2d_10_tf = MulAdd(na1, MF4x4(-0.008702178, -0.03840238, 0.13321695, 0.065163925, -0.062342774, -0.030948557, 0.0069512874, -0.2634128, -0.09415655, 0.02985776, 0.021763485, 0.27137864, -0.21608604, -0.19126832, -0.37335086, -0.16941321), conv2d_10_tf); + conv2d_10_tf = MulAdd(nb1, MF4x4(0.04631249, 0.33492458, -0.6266605, 0.20180638, 0.039800193, -0.14341171, -0.8203481, 0.04878081, 0.008235832, 0.15065777, -0.32971388, 0.1828355, -0.1510293, -0.17637968, 0.125366, -0.06719769), conv2d_10_tf); + conv2d_10_tf = MulAdd(nc1, MF4x4(-0.014685718, -0.04156494, 0.2728874, -0.106735535, -0.1312142, -0.05991217, 0.15173748, -0.09276527, 0.027946949, 0.12980466, 0.017537035, 0.058945708, -0.11254791, -0.06708247, -0.28308856, -0.058375884), conv2d_10_tf); + conv2d_10_tf = MulAdd(nd1, MF4x4(0.2220684, -0.19030218, -0.1259754, 0.09647918, -0.20530927, -0.16737363, -0.055208467, -0.067288965, 0.1428622, 0.08903465, 0.494294, 0.28669015, -0.17464463, -0.2190753, 0.13515279, 0.24887499), conv2d_10_tf); + conv2d_10_tf = MulAdd(ne1, MF4x4(-0.24211104, -0.11129136, 0.03340221, 0.49835417, -0.11755811, -0.732711, -0.3876752, 0.6178176, 0.1437329, -0.05131951, -0.16705558, -0.3823752, -0.23198022, -0.27967533, 0.7223488, -0.5565778), conv2d_10_tf); + conv2d_10_tf = MulAdd(nf1, MF4x4(-0.04738433, -0.14606567, 0.22317784, 0.0055712103, -0.064653076, -0.16446865, -0.10802961, -0.10179589, 0.060855757, 0.22762765, -0.037358448, 0.24772792, -0.15458576, -0.0770241, 0.43480682, 0.008342627), conv2d_10_tf); + conv2d_10_tf = MulAdd(ng1, MF4x4(0.117756896, -0.06760757, 0.12629354, -0.13241243, -0.05329636, 0.031004142, 0.19809054, 0.1504123, -0.024029436, -0.011011192, -0.014698134, 0.12855798, 0.027526522, -0.102618076, -0.2597635, -0.23887417), conv2d_10_tf); + conv2d_10_tf = MulAdd(nh1, MF4x4(-0.012681944, 0.088339254, 0.58977854, 0.020116867, -0.30643263, -0.11593101, 0.2829653, -0.060883448, 0.027514484, -0.19997032, -0.12530403, 0.3302542, -0.10344085, -0.0644199, -0.11374762, 0.38778695), conv2d_10_tf); + conv2d_10_tf = MulAdd(ni1, MF4x4(0.073869206, -0.059440095, -0.016326021, -0.08571949, -0.04171866, 0.042949438, 0.13984677, -0.15829174, -0.025245706, 0.0059198164, -0.0432442, 0.20765327, -0.058762096, 0.11539401, 0.036120266, 0.24331446), conv2d_10_tf); + conv2d_10_tf = MulAdd(na2, MF4x4(0.012567978, 0.07251118, -0.12190053, 0.10283353, 0.088345066, 0.0017397653, -0.2381744, 0.101314925, 0.022791719, -0.043069735, -0.15024713, -0.072577685, 0.19976862, -0.059844784, 0.38824072, 0.0020866133), conv2d_10_tf); + conv2d_10_tf = MulAdd(nb2, MF4x4(0.27314463, 0.0739519, 0.08960633, 0.03709254, 0.032681584, 0.22859, -0.41635752, -0.07382896, 0.13144481, -0.24017848, 0.07981319, 0.15370876, 0.059314378, 0.29214182, -0.39464346, -0.13867916), conv2d_10_tf); + conv2d_10_tf = MulAdd(nc2, MF4x4(-0.005685388, -0.039528795, -0.055917054, -0.06578973, 0.020702876, -0.00709528, 0.08486715, -0.0075865295, 0.05714374, -0.27417144, 0.4555885, 0.013780273, 0.05096835, 0.159233, -0.05228782, 0.15794256), conv2d_10_tf); + conv2d_10_tf = MulAdd(nd2, MF4x4(-0.0010807351, -0.022064442, 0.13078515, 0.11357431, 0.11269685, 0.029679844, 0.14385091, 0.10241993, 0.030162932, -0.016101424, 0.20761637, 0.4683215, 0.03091817, -0.58406824, -0.3438075, 0.3653469), conv2d_10_tf); + conv2d_10_tf = MulAdd(ne2, MF4x4(-0.016927537, 0.13944507, -0.38772225, -0.11645372, -0.1683389, -0.081295304, 0.271328, 0.14980802, 0.47266555, 0.04091753, 0.006903156, -0.00832747, -0.056511678, 0.06924621, -1.0780094, 0.1268596), conv2d_10_tf); + conv2d_10_tf = MulAdd(nf2, MF4x4(-0.21017683, -0.077091806, 0.28906518, 0.022843512, -0.062092084, -0.017447937, 0.25115407, -0.1367289, 0.0021664056, 0.0034106125, 0.5305142, -0.029012429, -0.014483031, 0.05575314, -0.35784876, -0.09252365), conv2d_10_tf); + conv2d_10_tf = MulAdd(ng2, MF4x4(0.008859689, 0.06481962, 0.09483335, 0.18473764, 0.0015982646, -0.06144117, 0.054042596, -0.19934553, -0.20250106, 0.096015476, 0.21697922, 0.6265738, -0.16049659, -0.33120447, 0.27775142, 0.14459921), conv2d_10_tf); + conv2d_10_tf = MulAdd(nh2, MF4x4(-0.11195867, 0.21663944, 0.5021048, 0.04712746, 0.08637696, 0.07792573, 0.23626573, -0.075164914, 0.06574307, -0.16795279, 0.06829719, -0.027584063, -0.015064924, -0.057976205, 0.14589287, -0.15683101), conv2d_10_tf); + conv2d_10_tf = MulAdd(ni2, MF4x4(0.07626267, -0.03523683, 0.106941625, -0.15825523, 0.032598946, 0.038718563, -0.016688785, -0.054390162, 0.05544311, 0.13933052, 0.078817375, -0.10183935, 0.041770034, 0.032732744, 0.062236354, 0.0068387473), conv2d_10_tf); tex9[gxy] = conv2d_10_tf; - min16float4 nconv2d_10_tf = max(-conv2d_10_tf, 0); + MF4 nconv2d_10_tf = max(-conv2d_10_tf, 0); conv2d_10_tf = max(conv2d_10_tf, 0); - min16float4 target = mul(e1, min16float4x4(-0.25229862, 0.22394362, 0.0050771693, -0.07544911, -0.11078993, -0.14940143, 0.009394699, 0.0110528935, 0.044721916, 0.26324025, -0.046336185, 0.38099283, 0.053437576, -0.07238376, -0.090147175, 0.5568665)); - target += mul(e2, min16float4x4(0.036739275, -0.2334262, 0.032853063, 0.24364692, -0.122930475, 0.1975849, -0.01315444, -0.13528247, -0.014283123, 0.057573725, 0.058717266, 0.16260214, 0.03097313, -0.11750414, -0.18610783, -0.23006414)); - target += mul(ne1, min16float4x4(0.37318927, -0.26915783, 0.035015646, 0.2676218, 0.1748369, 0.094052985, -0.11020892, -0.14514406, 0.004877109, -0.26225975, 0.13958913, -0.16787122, 0.06908459, -0.10446216, -0.028498875, -0.28281447)); - target += mul(ne2, min16float4x4(0.1980342, 0.021963626, -0.03271427, 0.28889674, 0.043385092, -0.16916741, -0.008713317, 0.00013464666, 0.0819348, 0.0152427135, -0.14862345, -0.15659885, -0.050634, 0.04153691, 0.042288564, 0.00585241)); - target += mul(conv2d_11_tf, min16float4x4(-0.17560056, 0.3521319, 0.20137301, -0.25535235, 0.030570813, 0.2411823, 0.053508975, -0.34454364, 0.22279017, -0.41471666, -0.15029109, 0.22158626, -0.08751699, -0.09357398, 0.20704596, -0.20073438)); - target += mul(nconv2d_11_tf, min16float4x4(0.15419295, 0.31318265, 0.004593545, 0.78029615, -0.16751337, -0.32214537, -0.44051525, 0.22405408, -0.0064655836, 0.36599794, -0.26032063, 0.1850997, 0.13661511, -0.49070612, -0.34533858, 0.16373816)); - target += mul(conv2d_1_tf, min16float4x4(0.09806042, 0.36764845, 0.11531638, 0.073847674, -0.16854957, -0.19408809, -0.16800502, -0.12827317, -0.5168489, 0.030958507, -0.03509507, 0.086487584, 0.01842899, -0.10123225, -0.17940263, -0.028054722)); - target += mul(nconv2d_1_tf, min16float4x4(0.21619087, -0.05322262, -0.31423846, 0.37783054, 0.20402598, 0.53124064, -0.012658878, 0.20003271, -0.17958061, -0.37326333, -0.24583863, 0.057008818, -0.13031931, -0.031875104, -0.2130229, 0.44612458)); - target += mul(conv2d_4_tf, min16float4x4(0.25865164, -0.28258085, 0.09512834, 0.054259088, 0.25939894, 0.38799945, -0.33007956, 0.6692063, -0.22719514, 0.16910313, 0.056874167, 0.016987909, -0.19956954, -0.20683451, -0.19937307, -0.41771019)); - target += mul(nconv2d_4_tf, min16float4x4(0.23592101, -0.15792374, -0.06965535, 0.30855724, -0.22757038, 0.12033792, 0.3199687, 0.2674324, 0.112318985, -0.14153072, -0.13629095, 0.13337436, 0.09185144, 0.24124412, 0.028630963, 0.22709718)); - target += mul(conv2d_7_tf, min16float4x4(0.44043523, 0.32490492, -0.117098905, 0.38431495, 0.07962198, 0.1517891, 0.22628377, 0.13990402, 0.38505656, -0.014830039, 0.20684186, 0.065970615, -0.054330014, -0.046108313, 0.49422976, 0.13082288)); - target += mul(nconv2d_7_tf, min16float4x4(-0.08174229, -0.013488396, -0.09494761, 0.31210786, -0.14530393, -0.22510533, -0.30971226, -0.17040919, -0.64233893, -0.07164386, -0.20537859, -0.17981663, -0.0060102916, -0.10167985, -0.24380594, 0.36305648)); - target += mul(conv2d_10_tf, min16float4x4(-0.23301682, -0.19649999, -0.0016176507, 0.7897105, -0.68460715, -0.06446943, -0.5841334, -0.17928797, 0.021772655, 0.46175778, 0.36450028, 0.27175686, -0.03546283, -0.19889158, -0.24603742, -0.090037055)); - target += mul(nconv2d_10_tf, min16float4x4(0.1085313, 0.04249687, 0.13247591, 0.09551512, -0.37197208, 0.3261908, -0.13848339, -0.13538006, 0.13875476, -0.3748712, -0.21430004, 0.09772982, -0.35635203, 0.13196826, -0.09840773, -0.21841893)); - target += min16float4(0.062238827, 0.069814906, -0.107347876, 0.64385885); + MF4 target = MF4(0.062238827, 0.069814906, -0.107347876, 0.64385885); + target = MulAdd(e1, MF4x4(-0.25229862, 0.22394362, 0.0050771693, -0.07544911, -0.11078993, -0.14940143, 0.009394699, 0.0110528935, 0.044721916, 0.26324025, -0.046336185, 0.38099283, 0.053437576, -0.07238376, -0.090147175, 0.5568665), target); + target = MulAdd(e2, MF4x4(0.036739275, -0.2334262, 0.032853063, 0.24364692, -0.122930475, 0.1975849, -0.01315444, -0.13528247, -0.014283123, 0.057573725, 0.058717266, 0.16260214, 0.03097313, -0.11750414, -0.18610783, -0.23006414), target); + target = MulAdd(ne1, MF4x4(0.37318927, -0.26915783, 0.035015646, 0.2676218, 0.1748369, 0.094052985, -0.11020892, -0.14514406, 0.004877109, -0.26225975, 0.13958913, -0.16787122, 0.06908459, -0.10446216, -0.028498875, -0.28281447), target); + target = MulAdd(ne2, MF4x4(0.1980342, 0.021963626, -0.03271427, 0.28889674, 0.043385092, -0.16916741, -0.008713317, 0.00013464666, 0.0819348, 0.0152427135, -0.14862345, -0.15659885, -0.050634, 0.04153691, 0.042288564, 0.00585241), target); + target = MulAdd(conv2d_11_tf, MF4x4(-0.17560056, 0.3521319, 0.20137301, -0.25535235, 0.030570813, 0.2411823, 0.053508975, -0.34454364, 0.22279017, -0.41471666, -0.15029109, 0.22158626, -0.08751699, -0.09357398, 0.20704596, -0.20073438), target); + target = MulAdd(nconv2d_11_tf, MF4x4(0.15419295, 0.31318265, 0.004593545, 0.78029615, -0.16751337, -0.32214537, -0.44051525, 0.22405408, -0.0064655836, 0.36599794, -0.26032063, 0.1850997, 0.13661511, -0.49070612, -0.34533858, 0.16373816), target); + target = MulAdd(conv2d_1_tf, MF4x4(0.09806042, 0.36764845, 0.11531638, 0.073847674, -0.16854957, -0.19408809, -0.16800502, -0.12827317, -0.5168489, 0.030958507, -0.03509507, 0.086487584, 0.01842899, -0.10123225, -0.17940263, -0.028054722), target); + target = MulAdd(nconv2d_1_tf, MF4x4(0.21619087, -0.05322262, -0.31423846, 0.37783054, 0.20402598, 0.53124064, -0.012658878, 0.20003271, -0.17958061, -0.37326333, -0.24583863, 0.057008818, -0.13031931, -0.031875104, -0.2130229, 0.44612458), target); + target = MulAdd(conv2d_4_tf, MF4x4(0.25865164, -0.28258085, 0.09512834, 0.054259088, 0.25939894, 0.38799945, -0.33007956, 0.6692063, -0.22719514, 0.16910313, 0.056874167, 0.016987909, -0.19956954, -0.20683451, -0.19937307, -0.41771019), target); + target = MulAdd(nconv2d_4_tf, MF4x4(0.23592101, -0.15792374, -0.06965535, 0.30855724, -0.22757038, 0.12033792, 0.3199687, 0.2674324, 0.112318985, -0.14153072, -0.13629095, 0.13337436, 0.09185144, 0.24124412, 0.028630963, 0.22709718), target); + target = MulAdd(conv2d_7_tf, MF4x4(0.44043523, 0.32490492, -0.117098905, 0.38431495, 0.07962198, 0.1517891, 0.22628377, 0.13990402, 0.38505656, -0.014830039, 0.20684186, 0.065970615, -0.054330014, -0.046108313, 0.49422976, 0.13082288), target); + target = MulAdd(nconv2d_7_tf, MF4x4(-0.08174229, -0.013488396, -0.09494761, 0.31210786, -0.14530393, -0.22510533, -0.30971226, -0.17040919, -0.64233893, -0.07164386, -0.20537859, -0.17981663, -0.0060102916, -0.10167985, -0.24380594, 0.36305648), target); + target = MulAdd(conv2d_10_tf, MF4x4(-0.23301682, -0.19649999, -0.0016176507, 0.7897105, -0.68460715, -0.06446943, -0.5841334, -0.17928797, 0.021772655, 0.46175778, 0.36450028, 0.27175686, -0.03546283, -0.19889158, -0.24603742, -0.090037055), target); + target = MulAdd(nconv2d_10_tf, MF4x4(0.1085313, 0.04249687, 0.13247591, 0.09551512, -0.37197208, 0.3261908, -0.13848339, -0.13538006, 0.13875476, -0.3748712, -0.21430004, 0.09772982, -0.35635203, 0.13196826, -0.09840773, -0.21841893), target); tex1[gxy] = target; - target = mul(e1, min16float4x4(0.22607668, 0.021170171, -0.06774968, -0.019062893, -0.029051676, 0.029224426, 0.097410545, 0.07505055, 0.17470665, -0.025774082, -0.041022647, 0.07615996, 0.031361237, -0.18075092, -0.01981288, 0.30251572)); - target += mul(e2, min16float4x4(-0.2228827, -0.18372375, 0.17952546, 0.031262513, 0.10978829, 0.095414534, -0.11202218, -0.017824037, 0.13419671, -0.056704585, 0.086960495, 0.089463, 0.0436869, 0.1987542, -0.24825421, -0.14668585)); - target += mul(ne1, min16float4x4(-0.2848745, -0.09242928, 0.24002336, -0.06059541, -0.0066300016, 0.050746392, -0.26092768, -0.060129635, -0.2699064, -0.13927452, 0.3134039, -0.21668927, 0.0028670141, 0.044556674, 0.040246494, -0.26040232)); - target += mul(ne2, min16float4x4(0.08408219, -0.038882803, -0.08522774, 0.1714629, -0.03067602, -0.10863579, 0.072058044, -0.012343554, -0.0076697394, 0.17840211, -0.2823912, 0.11976201, -0.05657313, 0.092938855, -0.060931504, 0.06991858)); - target += mul(conv2d_11_tf, min16float4x4(0.09868284, 0.054261737, 0.13327791, -0.14897001, -0.06348394, 0.11385057, 0.09684055, -0.084950894, -0.3038146, -0.08645148, 0.035114545, -0.07148952, -0.15862693, 0.26620075, -0.018059343, 0.35772058)); - target += mul(nconv2d_11_tf, min16float4x4(-0.4964452, -0.32340884, 0.5129584, -0.090460144, 0.28658384, -0.117274396, 0.25311428, 0.119918026, 0.27442876, -0.19332558, -0.40261742, -0.0627285, -0.36318043, -0.07865861, -0.11114984, -0.1290027)); - target += mul(conv2d_1_tf, min16float4x4(0.42158237, -0.032889403, 0.034080755, 0.25719455, -0.18799819, 0.0981468, 0.22785765, -0.07262642, 0.22532979, -0.09519116, -0.1005627, 0.1767603, -0.100850165, -0.06818755, 0.0059797456, -0.0718568)); - target += mul(nconv2d_1_tf, min16float4x4(0.12787001, -0.20670003, 0.0034799385, -0.024907416, 0.04423561, -0.13276835, -0.102332935, 0.14673741, 0.08700579, 0.08124997, -0.009865786, 0.041748982, -0.076119795, 0.09744985, 0.13542135, 0.12240728)); - target += mul(conv2d_4_tf, min16float4x4(-0.1702021, 0.18497302, 0.06786661, -0.09040049, 0.15212716, 0.055503774, 0.020584844, 0.24927403, 0.23556694, -0.1571619, -0.02012801, 0.08423509, -0.114376806, -0.04171382, 0.040876187, -0.116261706)); - target += mul(nconv2d_4_tf, min16float4x4(-0.0854133, -0.023111762, 0.3320211, -0.21760856, -0.169973, 0.22671382, 0.4513697, 0.35962802, -0.1499719, 0.24696982, -0.29979527, 0.006662296, 0.20241787, -0.2276791, 0.059445832, 0.18853071)); - target += mul(conv2d_7_tf, min16float4x4(-0.026398154, 0.124663144, 0.20381314, 0.2053697, 0.010302614, -0.050437275, 0.033807695, 0.014369258, -0.20720173, 0.05919782, 0.008449617, -0.31949872, 0.011598942, -0.0432789, 0.12732887, 0.049919438)); - target += mul(nconv2d_7_tf, min16float4x4(-0.06617085, 0.023928246, 0.1698239, 0.19584818, 0.022199618, -0.0040151025, -0.14364237, -0.06734091, 0.49634683, 0.40206975, -0.023004102, 0.16953272, 0.13243976, -0.47359994, 0.18358715, -0.15007599)); - target += mul(conv2d_10_tf, min16float4x4(0.03754883, -0.84370553, -0.0057923268, -0.06449944, 0.09488198, -0.09577232, 0.31362334, -0.09768442, 0.15369056, -0.16346063, 0.41194627, 0.10364933, -0.2073915, -0.15944852, -0.57649344, 0.1580545)); - target += mul(nconv2d_10_tf, min16float4x4(-0.3224099, -0.17332473, 0.12429976, -0.12284861, 0.32270268, 0.2888736, -0.20192772, 0.15415959, -0.10240418, 0.09524166, -0.14117688, -0.1239787, 0.0015336396, 0.10390812, 0.20461708, -0.12672688)); - target += min16float4(0.01866206, -0.01430976, -0.04231479, 0.06331023); + target = MF4(0.01866206, -0.01430976, -0.04231479, 0.06331023); + target = MulAdd(e1, MF4x4(0.22607668, 0.021170171, -0.06774968, -0.019062893, -0.029051676, 0.029224426, 0.097410545, 0.07505055, 0.17470665, -0.025774082, -0.041022647, 0.07615996, 0.031361237, -0.18075092, -0.01981288, 0.30251572), target); + target = MulAdd(e2, MF4x4(-0.2228827, -0.18372375, 0.17952546, 0.031262513, 0.10978829, 0.095414534, -0.11202218, -0.017824037, 0.13419671, -0.056704585, 0.086960495, 0.089463, 0.0436869, 0.1987542, -0.24825421, -0.14668585), target); + target = MulAdd(ne1, MF4x4(-0.2848745, -0.09242928, 0.24002336, -0.06059541, -0.0066300016, 0.050746392, -0.26092768, -0.060129635, -0.2699064, -0.13927452, 0.3134039, -0.21668927, 0.0028670141, 0.044556674, 0.040246494, -0.26040232), target); + target = MulAdd(ne2, MF4x4(0.08408219, -0.038882803, -0.08522774, 0.1714629, -0.03067602, -0.10863579, 0.072058044, -0.012343554, -0.0076697394, 0.17840211, -0.2823912, 0.11976201, -0.05657313, 0.092938855, -0.060931504, 0.06991858), target); + target = MulAdd(conv2d_11_tf, MF4x4(0.09868284, 0.054261737, 0.13327791, -0.14897001, -0.06348394, 0.11385057, 0.09684055, -0.084950894, -0.3038146, -0.08645148, 0.035114545, -0.07148952, -0.15862693, 0.26620075, -0.018059343, 0.35772058), target); + target = MulAdd(nconv2d_11_tf, MF4x4(-0.4964452, -0.32340884, 0.5129584, -0.090460144, 0.28658384, -0.117274396, 0.25311428, 0.119918026, 0.27442876, -0.19332558, -0.40261742, -0.0627285, -0.36318043, -0.07865861, -0.11114984, -0.1290027), target); + target = MulAdd(conv2d_1_tf, MF4x4(0.42158237, -0.032889403, 0.034080755, 0.25719455, -0.18799819, 0.0981468, 0.22785765, -0.07262642, 0.22532979, -0.09519116, -0.1005627, 0.1767603, -0.100850165, -0.06818755, 0.0059797456, -0.0718568), target); + target = MulAdd(nconv2d_1_tf, MF4x4(0.12787001, -0.20670003, 0.0034799385, -0.024907416, 0.04423561, -0.13276835, -0.102332935, 0.14673741, 0.08700579, 0.08124997, -0.009865786, 0.041748982, -0.076119795, 0.09744985, 0.13542135, 0.12240728), target); + target = MulAdd(conv2d_4_tf, MF4x4(-0.1702021, 0.18497302, 0.06786661, -0.09040049, 0.15212716, 0.055503774, 0.020584844, 0.24927403, 0.23556694, -0.1571619, -0.02012801, 0.08423509, -0.114376806, -0.04171382, 0.040876187, -0.116261706), target); + target = MulAdd(nconv2d_4_tf, MF4x4(-0.0854133, -0.023111762, 0.3320211, -0.21760856, -0.169973, 0.22671382, 0.4513697, 0.35962802, -0.1499719, 0.24696982, -0.29979527, 0.006662296, 0.20241787, -0.2276791, 0.059445832, 0.18853071), target); + target = MulAdd(conv2d_7_tf, MF4x4(-0.026398154, 0.124663144, 0.20381314, 0.2053697, 0.010302614, -0.050437275, 0.033807695, 0.014369258, -0.20720173, 0.05919782, 0.008449617, -0.31949872, 0.011598942, -0.0432789, 0.12732887, 0.049919438), target); + target = MulAdd(nconv2d_7_tf, MF4x4(-0.06617085, 0.023928246, 0.1698239, 0.19584818, 0.022199618, -0.0040151025, -0.14364237, -0.06734091, 0.49634683, 0.40206975, -0.023004102, 0.16953272, 0.13243976, -0.47359994, 0.18358715, -0.15007599), target); + target = MulAdd(conv2d_10_tf, MF4x4(0.03754883, -0.84370553, -0.0057923268, -0.06449944, 0.09488198, -0.09577232, 0.31362334, -0.09768442, 0.15369056, -0.16346063, 0.41194627, 0.10364933, -0.2073915, -0.15944852, -0.57649344, 0.1580545), target); + target = MulAdd(nconv2d_10_tf, MF4x4(-0.3224099, -0.17332473, 0.12429976, -0.12284861, 0.32270268, 0.2888736, -0.20192772, 0.15415959, -0.10240418, 0.09524166, -0.14117688, -0.1239787, 0.0015336396, 0.10390812, 0.20461708, -0.12672688), target); tex2[gxy] = target; } @@ -975,25 +978,25 @@ void Pass6(uint2 blockStart, uint3 threadId) { // [ a, d, g ] // [ b, e, h ] // [ c, f, i ] - min16float4 a1 = tex1.SampleLevel(sam, pos - inputPt, 0); - min16float4 b1 = tex1.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); - min16float4 c1 = tex1.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); - min16float4 d1 = tex1.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); - min16float4 e1 = tex1.SampleLevel(sam, pos, 0); - min16float4 f1 = tex1.SampleLevel(sam, pos + float2(0, inputPt.y), 0); - min16float4 g1 = tex1.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); - min16float4 h1 = tex1.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); - min16float4 i1 = tex1.SampleLevel(sam, pos + inputPt, 0); - - min16float4 na1 = max(-a1, 0); - min16float4 nb1 = max(-b1, 0); - min16float4 nc1 = max(-c1, 0); - min16float4 nd1 = max(-d1, 0); - min16float4 ne1 = max(-e1, 0); - min16float4 nf1 = max(-f1, 0); - min16float4 ng1 = max(-g1, 0); - min16float4 nh1 = max(-h1, 0); - min16float4 ni1 = max(-i1, 0); + MF4 a1 = tex1.SampleLevel(sam, pos - inputPt, 0); + MF4 b1 = tex1.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); + MF4 c1 = tex1.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); + MF4 d1 = tex1.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); + MF4 e1 = tex1.SampleLevel(sam, pos, 0); + MF4 f1 = tex1.SampleLevel(sam, pos + float2(0, inputPt.y), 0); + MF4 g1 = tex1.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); + MF4 h1 = tex1.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); + MF4 i1 = tex1.SampleLevel(sam, pos + inputPt, 0); + + MF4 na1 = max(-a1, 0); + MF4 nb1 = max(-b1, 0); + MF4 nc1 = max(-c1, 0); + MF4 nd1 = max(-d1, 0); + MF4 ne1 = max(-e1, 0); + MF4 nf1 = max(-f1, 0); + MF4 ng1 = max(-g1, 0); + MF4 nh1 = max(-h1, 0); + MF4 ni1 = max(-i1, 0); a1 = max(a1, 0); b1 = max(b1, 0); @@ -1005,25 +1008,25 @@ void Pass6(uint2 blockStart, uint3 threadId) { h1 = max(h1, 0); i1 = max(i1, 0); - min16float4 a2 = tex2.SampleLevel(sam, pos - inputPt, 0); - min16float4 b2 = tex2.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); - min16float4 c2 = tex2.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); - min16float4 d2 = tex2.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); - min16float4 e2 = tex2.SampleLevel(sam, pos, 0); - min16float4 f2 = tex2.SampleLevel(sam, pos + float2(0, inputPt.y), 0); - min16float4 g2 = tex2.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); - min16float4 h2 = tex2.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); - min16float4 i2 = tex2.SampleLevel(sam, pos + inputPt, 0); - - min16float4 na2 = max(-a2, 0); - min16float4 nb2 = max(-b2, 0); - min16float4 nc2 = max(-c2, 0); - min16float4 nd2 = max(-d2, 0); - min16float4 ne2 = max(-e2, 0); - min16float4 nf2 = max(-f2, 0); - min16float4 ng2 = max(-g2, 0); - min16float4 nh2 = max(-h2, 0); - min16float4 ni2 = max(-i2, 0); + MF4 a2 = tex2.SampleLevel(sam, pos - inputPt, 0); + MF4 b2 = tex2.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); + MF4 c2 = tex2.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); + MF4 d2 = tex2.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); + MF4 e2 = tex2.SampleLevel(sam, pos, 0); + MF4 f2 = tex2.SampleLevel(sam, pos + float2(0, inputPt.y), 0); + MF4 g2 = tex2.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); + MF4 h2 = tex2.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); + MF4 i2 = tex2.SampleLevel(sam, pos + inputPt, 0); + + MF4 na2 = max(-a2, 0); + MF4 nb2 = max(-b2, 0); + MF4 nc2 = max(-c2, 0); + MF4 nd2 = max(-d2, 0); + MF4 ne2 = max(-e2, 0); + MF4 nf2 = max(-f2, 0); + MF4 ng2 = max(-g2, 0); + MF4 nh2 = max(-h2, 0); + MF4 ni2 = max(-i2, 0); a2 = max(a2, 0); b2 = max(b2, 0); @@ -1035,121 +1038,121 @@ void Pass6(uint2 blockStart, uint3 threadId) { h2 = max(h2, 0); i2 = max(i2, 0); - min16float4 conv2d_11_tf = tex8.SampleLevel(sam, pos, 0); - min16float4 nconv2d_11_tf = max(-conv2d_11_tf, 0); + MF4 conv2d_11_tf = tex8.SampleLevel(sam, pos, 0); + MF4 nconv2d_11_tf = max(-conv2d_11_tf, 0); conv2d_11_tf = max(conv2d_11_tf, 0); - min16float4 conv2d_1_tf = tex3.SampleLevel(sam, pos, 0); - min16float4 nconv2d_1_tf = max(-conv2d_1_tf, 0); + MF4 conv2d_1_tf = tex3.SampleLevel(sam, pos, 0); + MF4 nconv2d_1_tf = max(-conv2d_1_tf, 0); conv2d_1_tf = max(conv2d_1_tf, 0); - min16float4 conv2d_4_tf = tex6.SampleLevel(sam, pos, 0); - min16float4 nconv2d_4_tf = max(-conv2d_4_tf, 0); + MF4 conv2d_4_tf = tex6.SampleLevel(sam, pos, 0); + MF4 nconv2d_4_tf = max(-conv2d_4_tf, 0); conv2d_4_tf = max(conv2d_4_tf, 0); - min16float4 conv2d_7_tf = tex7.SampleLevel(sam, pos, 0); - min16float4 nconv2d_7_tf = max(-conv2d_7_tf, 0); + MF4 conv2d_7_tf = tex7.SampleLevel(sam, pos, 0); + MF4 nconv2d_7_tf = max(-conv2d_7_tf, 0); conv2d_7_tf = max(conv2d_7_tf, 0); - min16float4 conv2d_10_tf = tex9.SampleLevel(sam, pos, 0); - min16float4 nconv2d_10_tf = max(-conv2d_10_tf, 0); + MF4 conv2d_10_tf = tex9.SampleLevel(sam, pos, 0); + MF4 nconv2d_10_tf = max(-conv2d_10_tf, 0); conv2d_10_tf = max(conv2d_10_tf, 0); - min16float4 conv2d_13_tf = mul(a1, min16float4x4(0.09638616, 0.041973136, 0.032690834, 0.0017506832, 0.035889357, 0.046528358, 0.06497702, 0.06353481, -0.07129311, -0.027845494, 0.003971696, 0.015161773, -0.016153565, -0.02228567, -0.011083082, 0.037676543)); - conv2d_13_tf += mul(b1, min16float4x4(0.2134379, 0.26289365, 0.1335757, 0.13036838, -0.08787389, -0.106764, -0.048054244, 0.17788094, -0.15528837, -0.11408854, -0.06642222, -0.07838564, -0.09646518, -0.116988175, -0.22729287, -0.11145718)); - conv2d_13_tf += mul(c1, min16float4x4(0.09568265, 0.006643416, 0.11656759, -0.049414653, 0.14153476, -0.04269765, 0.09150523, 0.26861703, 0.16641477, -0.1080059, 0.22390138, -0.08730618, -0.01928994, -0.06351, -0.0022028533, 0.04301657)); - conv2d_13_tf += mul(d1, min16float4x4(-0.11731019, -0.040432923, -0.1977298, -0.17696093, -0.09182833, 0.071209684, -0.120773874, 0.021507429, -0.016429326, 0.04448132, 0.0681032, 0.044070866, -0.14647268, 0.008662263, -0.06507026, -0.075289875)); - conv2d_13_tf += mul(e1, min16float4x4(0.5694518, -0.6138523, 0.28939885, -0.06047394, 0.11681902, -0.7026379, 0.20342608, 0.07128985, 0.06697409, 0.2678358, 1.1430641, 0.20436136, -1.6117494, 0.2799715, -0.01652429, -0.16711035)); - conv2d_13_tf += mul(f1, min16float4x4(0.15830286, 0.16772346, -0.03232187, 0.029600514, -0.18494213, -0.25623813, 0.15487063, 0.06255487, -0.058094956, 0.19903323, 0.4756497, 0.6381142, -0.036022857, -0.09470495, 0.046093524, 0.031300675)); - conv2d_13_tf += mul(g1, min16float4x4(-0.38466138, 0.16052443, -0.13819315, -0.059899956, 0.14069949, -0.1297194, 0.105595976, 0.13371274, 0.06298681, -0.038837492, 0.08675327, 0.1501906, 0.031129224, 0.029751344, -0.06775066, -0.047534525)); - conv2d_13_tf += mul(h1, min16float4x4(0.097809926, -0.14269543, -0.14661346, -0.1819761, -0.023082452, 0.19019675, -0.15678905, -0.07669464, -0.07322769, -0.30472377, 0.33603573, 0.22620338, 0.05328552, 0.030486144, -0.037603505, -0.081246674)); - conv2d_13_tf += mul(i1, min16float4x4(-0.15090303, -0.0650902, 0.11741429, -0.003369476, 0.043803368, 0.13717425, -0.038966697, -0.05230889, -0.0042353314, -0.017051768, 0.102879845, 0.044044945, -0.012893164, 0.0152335, 0.015073082, 0.08049258)); - conv2d_13_tf += mul(a2, min16float4x4(-0.07802851, -0.07544602, -0.0039040581, -0.03915584, 0.115673676, -0.024907975, -0.011459969, 0.026098263, 0.043594692, 0.10627707, 0.027093858, 0.051561285, 0.071452856, -0.1758179, 0.28485832, 0.28952092)); - conv2d_13_tf += mul(b2, min16float4x4(-0.052147392, 0.18546684, 0.19015399, -0.053752594, -0.29468048, 0.010600442, -0.09287294, -0.09246605, 0.17687573, -0.04858957, 0.06478161, -0.0035372626, 0.5927226, 0.38359696, 0.33155236, 0.13010578)); - conv2d_13_tf += mul(c2, min16float4x4(0.04136322, 0.11806175, 0.19966072, 0.07308716, -0.09563447, -0.064514905, -0.0077517326, 0.11964638, -0.1460613, 0.02240298, 0.014256963, -0.0123070385, 0.1897282, -0.0058207656, 0.040057864, -0.49406672)); - conv2d_13_tf += mul(d2, min16float4x4(-0.43775788, -0.25118434, -0.3468631, -0.30180287, -0.27033472, -0.0023914252, 0.053275872, -0.021835659, 0.02879347, 0.036559265, 0.044093054, 0.12771723, 0.2702892, -0.2581491, -0.059361164, -0.046974897)); - conv2d_13_tf += mul(e2, min16float4x4(-0.03310008, -0.5622936, 0.5419483, -0.3599514, 0.2634039, 0.3500813, 0.4152074, 0.24876466, -0.2629078, -0.18554081, -0.76194984, -0.54471385, 0.72921526, 0.3316481, -0.20936906, -0.16736485)); - conv2d_13_tf += mul(f2, min16float4x4(0.07884802, 0.16494922, 0.2734585, -0.09396988, -0.14178166, -0.105561115, 0.006780099, 0.063054875, 0.12384575, -0.163967, -0.19682601, -0.1647527, 0.59927565, 0.24755491, -0.29760644, -0.074884824)); - conv2d_13_tf += mul(g2, min16float4x4(-0.186745, 0.21136905, 0.027726538, 0.08498169, 0.009122279, 0.01566938, -0.051473126, 0.014151464, 0.04580383, 0.02071651, 0.14929157, 0.17253524, -0.034080226, 0.07048439, -0.11602547, -0.12655921)); - conv2d_13_tf += mul(h2, min16float4x4(-0.2831727, -0.21816732, -0.37266397, -0.26041594, -0.18912914, -0.13482115, -0.10902061, -0.110694066, -0.20758803, -0.07158453, 0.14401175, 0.1590672, 0.27700564, -0.3202948, -0.23177631, 0.060082316)); - conv2d_13_tf += mul(i2, min16float4x4(0.16861005, -0.13237478, -0.12109852, -0.16306286, 0.032467425, 0.009778175, -0.05084063, 0.02528882, -0.028993038, -0.06119019, 0.0124081755, -0.0819979, -0.2308113, -0.23910572, 0.3170529, 0.22742116)); - conv2d_13_tf += mul(na1, min16float4x4(-0.19654512, 0.037653327, -0.015190324, 0.038381096, 0.034783594, -0.16242851, 0.07052334, 0.0019672879, 0.08069976, 0.090035714, 0.12597767, -0.00065050717, -0.10528094, 0.015088367, -0.045706235, -0.14849594)); - conv2d_13_tf += mul(nb1, min16float4x4(-0.0981129, -0.0044483114, 0.00918156, 0.28903985, 0.23872024, 0.11113565, 0.23359483, 0.21115206, 0.2144387, 0.106830046, 0.03875094, -0.14864162, 0.19366172, 0.21310017, 0.06280982, -0.0581721)); - conv2d_13_tf += mul(nc1, min16float4x4(-0.22814496, -0.08812413, -0.25392863, -0.02752917, 0.05930787, 0.08304853, -0.04027662, -0.010756739, 0.034590207, 0.070662424, 0.15285444, 0.058270697, -0.022838322, 0.024096202, 0.01309858, -0.10489201)); - conv2d_13_tf += mul(nd1, min16float4x4(0.17219496, -0.0066256993, 0.1442649, -0.07291206, 0.34312358, -0.24952441, 0.040031537, 0.18302973, 0.0015231773, 0.24825755, -0.01807878, -0.037405558, 0.21687117, 0.02481246, -0.08312088, -0.14397743)); - conv2d_13_tf += mul(ne1, min16float4x4(0.2859165, 0.6145777, 0.060804237, 0.22117847, -0.25534254, 0.3753605, 0.4193899, 0.06387241, -0.13308842, 0.0012660836, -0.055252563, -0.2552111, 0.8831952, -0.16249466, 0.76958305, 0.3658401)); - conv2d_13_tf += mul(nf1, min16float4x4(-0.14865848, -0.13086087, 0.17719927, 0.2801542, 0.3776111, 0.20903045, 0.1710449, 0.25524843, 0.11910105, 0.034738105, -0.12101939, -0.22116004, 0.11605619, 0.16838482, -0.07223086, -0.15225673)); - conv2d_13_tf += mul(ng1, min16float4x4(0.101802975, -0.12683764, -0.21380596, -0.19243564, 0.017763488, 0.0076850834, -0.0107422285, 0.058099743, 0.03071978, 0.02958345, 0.09209252, -0.012379192, -0.058930825, -0.07321041, -0.09178575, -0.09764888)); - conv2d_13_tf += mul(nh1, min16float4x4(0.2205578, -0.053928245, -0.14290524, -0.18790527, 0.002521159, -0.23389481, 0.11274272, 0.17174199, 0.2128134, 0.14586388, 0.08666812, 0.052028902, 0.024853414, -0.027658377, 0.033780072, -0.0045349374)); - conv2d_13_tf += mul(ni1, min16float4x4(-0.053073518, 0.12716359, 0.008456044, 0.014315154, 0.01918925, -0.13495505, 0.08007481, 0.08627198, 0.024612406, 0.0021514448, 0.04478567, -0.034171678, 0.0027070146, 0.0149149615, -0.15999815, -0.1866448)); - conv2d_13_tf += mul(na2, min16float4x4(0.040357295, -0.12759757, 0.03543834, -0.029329961, -0.078925595, 0.07807751, 0.08971355, -0.05469623, -0.08630596, -0.11219292, -0.08082983, -0.020131797, -0.04191703, 0.22003745, -0.28878415, -0.132956)); - conv2d_13_tf += mul(nb2, min16float4x4(0.021098461, 0.048261415, -0.121181525, -0.24724431, 0.32716268, 0.03046708, -0.28138334, -0.22871564, -0.15983087, 0.10721642, -0.14833531, -0.115366876, -0.393837, -0.62930757, -0.29534766, 0.02588463)); - conv2d_13_tf += mul(nc2, min16float4x4(-0.03972534, -0.051577512, -0.04452277, -0.12650263, 0.15491997, -0.026459083, 0.009715449, -0.20551588, -0.042652152, 0.119186826, -0.13313279, -0.13183416, -0.20730016, 0.003008999, -0.19962612, 0.1760052)); - conv2d_13_tf += mul(nd2, min16float4x4(0.1724579, -0.3179752, 0.18908302, 0.40730157, 0.44569418, -0.038390577, -0.13144472, -0.18369946, -0.1654486, -0.2106428, -0.084723935, 0.10262653, -0.26097777, 0.15257284, -0.36599034, -0.30871773)); - conv2d_13_tf += mul(ne2, min16float4x4(-0.21338613, 0.680362, 0.079820015, 0.6081361, -0.9754953, -0.33735132, -1.2323227, -0.17950675, -0.31327835, 0.4732144, 0.22757599, 0.23051551, -0.8099572, -0.49106973, 0.96547806, 0.30975753)); - conv2d_13_tf += mul(nf2, min16float4x4(0.16933723, 0.17994887, -0.38310486, -0.4208871, 0.373761, 0.20749316, -0.080664486, -0.26229286, -0.04797456, 0.28605196, -0.040223103, -0.034632236, -0.5650002, -0.38834664, 0.14565933, 0.1488285)); - conv2d_13_tf += mul(ng2, min16float4x4(0.32558438, -0.18572666, 0.049500592, 0.2319145, -0.23547912, 0.2740939, 0.027905073, -0.022077003, 0.10860379, -0.15617043, -0.097419575, -0.11391895, -0.4266203, 0.060962453, -0.12154808, -0.19734453)); - conv2d_13_tf += mul(nh2, min16float4x4(-0.07880791, -0.2247225, 0.445858, 0.3889803, 0.14111102, 0.378859, 0.040187526, -0.021096235, 0.04169405, -0.075737596, 0.046068836, 0.11624106, 0.08169536, 0.3022304, -0.24427707, -0.34422734)); - conv2d_13_tf += mul(ni2, min16float4x4(0.13501012, -0.07389663, -0.010668981, -0.069029465, 0.06960202, -0.067375034, 0.08431378, 0.04207825, -0.121635035, -0.051126126, -0.1546829, 0.00073073455, -0.20674464, 0.27346626, -0.15771666, -0.024096)); - conv2d_13_tf += min16float4(-0.17614856, -0.14261112, 0.14600825, 0.20389698); - min16float4 nconv2d_13_tf = max(-conv2d_13_tf, 0); + MF4 conv2d_13_tf = MF4(-0.17614856, -0.14261112, 0.14600825, 0.20389698); + conv2d_13_tf = MulAdd(a1, MF4x4(0.09638616, 0.041973136, 0.032690834, 0.0017506832, 0.035889357, 0.046528358, 0.06497702, 0.06353481, -0.07129311, -0.027845494, 0.003971696, 0.015161773, -0.016153565, -0.02228567, -0.011083082, 0.037676543), conv2d_13_tf); + conv2d_13_tf = MulAdd(b1, MF4x4(0.2134379, 0.26289365, 0.1335757, 0.13036838, -0.08787389, -0.106764, -0.048054244, 0.17788094, -0.15528837, -0.11408854, -0.06642222, -0.07838564, -0.09646518, -0.116988175, -0.22729287, -0.11145718), conv2d_13_tf); + conv2d_13_tf = MulAdd(c1, MF4x4(0.09568265, 0.006643416, 0.11656759, -0.049414653, 0.14153476, -0.04269765, 0.09150523, 0.26861703, 0.16641477, -0.1080059, 0.22390138, -0.08730618, -0.01928994, -0.06351, -0.0022028533, 0.04301657), conv2d_13_tf); + conv2d_13_tf = MulAdd(d1, MF4x4(-0.11731019, -0.040432923, -0.1977298, -0.17696093, -0.09182833, 0.071209684, -0.120773874, 0.021507429, -0.016429326, 0.04448132, 0.0681032, 0.044070866, -0.14647268, 0.008662263, -0.06507026, -0.075289875), conv2d_13_tf); + conv2d_13_tf = MulAdd(e1, MF4x4(0.5694518, -0.6138523, 0.28939885, -0.06047394, 0.11681902, -0.7026379, 0.20342608, 0.07128985, 0.06697409, 0.2678358, 1.1430641, 0.20436136, -1.6117494, 0.2799715, -0.01652429, -0.16711035), conv2d_13_tf); + conv2d_13_tf = MulAdd(f1, MF4x4(0.15830286, 0.16772346, -0.03232187, 0.029600514, -0.18494213, -0.25623813, 0.15487063, 0.06255487, -0.058094956, 0.19903323, 0.4756497, 0.6381142, -0.036022857, -0.09470495, 0.046093524, 0.031300675), conv2d_13_tf); + conv2d_13_tf = MulAdd(g1, MF4x4(-0.38466138, 0.16052443, -0.13819315, -0.059899956, 0.14069949, -0.1297194, 0.105595976, 0.13371274, 0.06298681, -0.038837492, 0.08675327, 0.1501906, 0.031129224, 0.029751344, -0.06775066, -0.047534525), conv2d_13_tf); + conv2d_13_tf = MulAdd(h1, MF4x4(0.097809926, -0.14269543, -0.14661346, -0.1819761, -0.023082452, 0.19019675, -0.15678905, -0.07669464, -0.07322769, -0.30472377, 0.33603573, 0.22620338, 0.05328552, 0.030486144, -0.037603505, -0.081246674), conv2d_13_tf); + conv2d_13_tf = MulAdd(i1, MF4x4(-0.15090303, -0.0650902, 0.11741429, -0.003369476, 0.043803368, 0.13717425, -0.038966697, -0.05230889, -0.0042353314, -0.017051768, 0.102879845, 0.044044945, -0.012893164, 0.0152335, 0.015073082, 0.08049258), conv2d_13_tf); + conv2d_13_tf = MulAdd(a2, MF4x4(-0.07802851, -0.07544602, -0.0039040581, -0.03915584, 0.115673676, -0.024907975, -0.011459969, 0.026098263, 0.043594692, 0.10627707, 0.027093858, 0.051561285, 0.071452856, -0.1758179, 0.28485832, 0.28952092), conv2d_13_tf); + conv2d_13_tf = MulAdd(b2, MF4x4(-0.052147392, 0.18546684, 0.19015399, -0.053752594, -0.29468048, 0.010600442, -0.09287294, -0.09246605, 0.17687573, -0.04858957, 0.06478161, -0.0035372626, 0.5927226, 0.38359696, 0.33155236, 0.13010578), conv2d_13_tf); + conv2d_13_tf = MulAdd(c2, MF4x4(0.04136322, 0.11806175, 0.19966072, 0.07308716, -0.09563447, -0.064514905, -0.0077517326, 0.11964638, -0.1460613, 0.02240298, 0.014256963, -0.0123070385, 0.1897282, -0.0058207656, 0.040057864, -0.49406672), conv2d_13_tf); + conv2d_13_tf = MulAdd(d2, MF4x4(-0.43775788, -0.25118434, -0.3468631, -0.30180287, -0.27033472, -0.0023914252, 0.053275872, -0.021835659, 0.02879347, 0.036559265, 0.044093054, 0.12771723, 0.2702892, -0.2581491, -0.059361164, -0.046974897), conv2d_13_tf); + conv2d_13_tf = MulAdd(e2, MF4x4(-0.03310008, -0.5622936, 0.5419483, -0.3599514, 0.2634039, 0.3500813, 0.4152074, 0.24876466, -0.2629078, -0.18554081, -0.76194984, -0.54471385, 0.72921526, 0.3316481, -0.20936906, -0.16736485), conv2d_13_tf); + conv2d_13_tf = MulAdd(f2, MF4x4(0.07884802, 0.16494922, 0.2734585, -0.09396988, -0.14178166, -0.105561115, 0.006780099, 0.063054875, 0.12384575, -0.163967, -0.19682601, -0.1647527, 0.59927565, 0.24755491, -0.29760644, -0.074884824), conv2d_13_tf); + conv2d_13_tf = MulAdd(g2, MF4x4(-0.186745, 0.21136905, 0.027726538, 0.08498169, 0.009122279, 0.01566938, -0.051473126, 0.014151464, 0.04580383, 0.02071651, 0.14929157, 0.17253524, -0.034080226, 0.07048439, -0.11602547, -0.12655921), conv2d_13_tf); + conv2d_13_tf = MulAdd(h2, MF4x4(-0.2831727, -0.21816732, -0.37266397, -0.26041594, -0.18912914, -0.13482115, -0.10902061, -0.110694066, -0.20758803, -0.07158453, 0.14401175, 0.1590672, 0.27700564, -0.3202948, -0.23177631, 0.060082316), conv2d_13_tf); + conv2d_13_tf = MulAdd(i2, MF4x4(0.16861005, -0.13237478, -0.12109852, -0.16306286, 0.032467425, 0.009778175, -0.05084063, 0.02528882, -0.028993038, -0.06119019, 0.0124081755, -0.0819979, -0.2308113, -0.23910572, 0.3170529, 0.22742116), conv2d_13_tf); + conv2d_13_tf = MulAdd(na1, MF4x4(-0.19654512, 0.037653327, -0.015190324, 0.038381096, 0.034783594, -0.16242851, 0.07052334, 0.0019672879, 0.08069976, 0.090035714, 0.12597767, -0.00065050717, -0.10528094, 0.015088367, -0.045706235, -0.14849594), conv2d_13_tf); + conv2d_13_tf = MulAdd(nb1, MF4x4(-0.0981129, -0.0044483114, 0.00918156, 0.28903985, 0.23872024, 0.11113565, 0.23359483, 0.21115206, 0.2144387, 0.106830046, 0.03875094, -0.14864162, 0.19366172, 0.21310017, 0.06280982, -0.0581721), conv2d_13_tf); + conv2d_13_tf = MulAdd(nc1, MF4x4(-0.22814496, -0.08812413, -0.25392863, -0.02752917, 0.05930787, 0.08304853, -0.04027662, -0.010756739, 0.034590207, 0.070662424, 0.15285444, 0.058270697, -0.022838322, 0.024096202, 0.01309858, -0.10489201), conv2d_13_tf); + conv2d_13_tf = MulAdd(nd1, MF4x4(0.17219496, -0.0066256993, 0.1442649, -0.07291206, 0.34312358, -0.24952441, 0.040031537, 0.18302973, 0.0015231773, 0.24825755, -0.01807878, -0.037405558, 0.21687117, 0.02481246, -0.08312088, -0.14397743), conv2d_13_tf); + conv2d_13_tf = MulAdd(ne1, MF4x4(0.2859165, 0.6145777, 0.060804237, 0.22117847, -0.25534254, 0.3753605, 0.4193899, 0.06387241, -0.13308842, 0.0012660836, -0.055252563, -0.2552111, 0.8831952, -0.16249466, 0.76958305, 0.3658401), conv2d_13_tf); + conv2d_13_tf = MulAdd(nf1, MF4x4(-0.14865848, -0.13086087, 0.17719927, 0.2801542, 0.3776111, 0.20903045, 0.1710449, 0.25524843, 0.11910105, 0.034738105, -0.12101939, -0.22116004, 0.11605619, 0.16838482, -0.07223086, -0.15225673), conv2d_13_tf); + conv2d_13_tf = MulAdd(ng1, MF4x4(0.101802975, -0.12683764, -0.21380596, -0.19243564, 0.017763488, 0.0076850834, -0.0107422285, 0.058099743, 0.03071978, 0.02958345, 0.09209252, -0.012379192, -0.058930825, -0.07321041, -0.09178575, -0.09764888), conv2d_13_tf); + conv2d_13_tf = MulAdd(nh1, MF4x4(0.2205578, -0.053928245, -0.14290524, -0.18790527, 0.002521159, -0.23389481, 0.11274272, 0.17174199, 0.2128134, 0.14586388, 0.08666812, 0.052028902, 0.024853414, -0.027658377, 0.033780072, -0.0045349374), conv2d_13_tf); + conv2d_13_tf = MulAdd(ni1, MF4x4(-0.053073518, 0.12716359, 0.008456044, 0.014315154, 0.01918925, -0.13495505, 0.08007481, 0.08627198, 0.024612406, 0.0021514448, 0.04478567, -0.034171678, 0.0027070146, 0.0149149615, -0.15999815, -0.1866448), conv2d_13_tf); + conv2d_13_tf = MulAdd(na2, MF4x4(0.040357295, -0.12759757, 0.03543834, -0.029329961, -0.078925595, 0.07807751, 0.08971355, -0.05469623, -0.08630596, -0.11219292, -0.08082983, -0.020131797, -0.04191703, 0.22003745, -0.28878415, -0.132956), conv2d_13_tf); + conv2d_13_tf = MulAdd(nb2, MF4x4(0.021098461, 0.048261415, -0.121181525, -0.24724431, 0.32716268, 0.03046708, -0.28138334, -0.22871564, -0.15983087, 0.10721642, -0.14833531, -0.115366876, -0.393837, -0.62930757, -0.29534766, 0.02588463), conv2d_13_tf); + conv2d_13_tf = MulAdd(nc2, MF4x4(-0.03972534, -0.051577512, -0.04452277, -0.12650263, 0.15491997, -0.026459083, 0.009715449, -0.20551588, -0.042652152, 0.119186826, -0.13313279, -0.13183416, -0.20730016, 0.003008999, -0.19962612, 0.1760052), conv2d_13_tf); + conv2d_13_tf = MulAdd(nd2, MF4x4(0.1724579, -0.3179752, 0.18908302, 0.40730157, 0.44569418, -0.038390577, -0.13144472, -0.18369946, -0.1654486, -0.2106428, -0.084723935, 0.10262653, -0.26097777, 0.15257284, -0.36599034, -0.30871773), conv2d_13_tf); + conv2d_13_tf = MulAdd(ne2, MF4x4(-0.21338613, 0.680362, 0.079820015, 0.6081361, -0.9754953, -0.33735132, -1.2323227, -0.17950675, -0.31327835, 0.4732144, 0.22757599, 0.23051551, -0.8099572, -0.49106973, 0.96547806, 0.30975753), conv2d_13_tf); + conv2d_13_tf = MulAdd(nf2, MF4x4(0.16933723, 0.17994887, -0.38310486, -0.4208871, 0.373761, 0.20749316, -0.080664486, -0.26229286, -0.04797456, 0.28605196, -0.040223103, -0.034632236, -0.5650002, -0.38834664, 0.14565933, 0.1488285), conv2d_13_tf); + conv2d_13_tf = MulAdd(ng2, MF4x4(0.32558438, -0.18572666, 0.049500592, 0.2319145, -0.23547912, 0.2740939, 0.027905073, -0.022077003, 0.10860379, -0.15617043, -0.097419575, -0.11391895, -0.4266203, 0.060962453, -0.12154808, -0.19734453), conv2d_13_tf); + conv2d_13_tf = MulAdd(nh2, MF4x4(-0.07880791, -0.2247225, 0.445858, 0.3889803, 0.14111102, 0.378859, 0.040187526, -0.021096235, 0.04169405, -0.075737596, 0.046068836, 0.11624106, 0.08169536, 0.3022304, -0.24427707, -0.34422734), conv2d_13_tf); + conv2d_13_tf = MulAdd(ni2, MF4x4(0.13501012, -0.07389663, -0.010668981, -0.069029465, 0.06960202, -0.067375034, 0.08431378, 0.04207825, -0.121635035, -0.051126126, -0.1546829, 0.00073073455, -0.20674464, 0.27346626, -0.15771666, -0.024096), conv2d_13_tf); + MF4 nconv2d_13_tf = max(-conv2d_13_tf, 0); conv2d_13_tf = max(conv2d_13_tf, 0); - min16float4 target = mul(e1, min16float4x4(-0.3378193, 0.013861057, 0.19208853, -0.05050854, 0.08691835, 0.16724123, 0.10351982, -0.40157926, -0.055889476, -0.040115904, -0.13351472, -0.7937818, 0.18700145, 0.109559685, -0.119053595, -0.12651901)); - target += mul(e2, min16float4x4(0.05863214, -0.011048432, 0.22007701, -0.21624403, -0.06139813, -0.06766812, 0.022506371, 0.17585056, -0.37994936, -0.018394569, 0.5127985, -0.19700864, -0.07880973, 0.15687309, -0.12574019, -0.19570859)); - target += mul(ne1, min16float4x4(0.5059051, -0.010676642, -0.47922808, -0.017590942, -0.20583269, -0.10777252, -0.33185184, -0.0025075034, -0.1518394, 0.14268444, 0.005011664, 0.09016961, -0.46011007, -0.09428751, 0.34915137, 0.13334215)); - target += mul(ne2, min16float4x4(-0.15615676, 0.09427065, 0.006016912, -0.0003997069, 0.16170138, 0.09666374, 0.14158808, -0.23772424, 0.39373854, 0.004074768, -0.28073287, 0.0032489141, 0.23473479, -0.12678933, -0.24589436, -0.21988034)); - target += mul(conv2d_11_tf, min16float4x4(-0.12682347, 0.033012364, 0.18928578, 0.12523666, 0.12809147, 0.008567846, -0.10653368, -0.03712133, 0.075765386, -0.042196997, 0.039182812, 0.17273012, 0.21258987, 0.039698593, -0.0018848967, -0.07930902)); - target += mul(nconv2d_11_tf, min16float4x4(0.013454855, -0.18023406, -0.49323913, -0.032017395, 0.11903338, -0.043025218, -0.46579728, 0.21894619, -0.21387324, -0.13455649, 0.30638975, 0.3472243, 0.09305909, -0.015791988, 0.071368046, -0.038680866)); - target += mul(conv2d_1_tf, min16float4x4(0.012506262, 0.09754124, -0.092920735, 0.23061672, 0.08051618, -0.38472125, 0.17626029, 0.009075537, -0.18316247, -0.1338181, 0.2650675, 0.0516641, 0.080453254, 0.22033659, -0.13004474, -0.07781194)); - target += mul(nconv2d_1_tf, min16float4x4(-0.12412428, -0.11978811, 0.06780084, -0.1710261, -0.09355731, 0.31283846, -0.022725523, -0.16437142, -0.11865966, 0.10907317, 0.22463441, 0.017325362, 0.02512185, -0.49577957, 0.2016018, 0.14196795)); - target += mul(conv2d_4_tf, min16float4x4(0.02570746, 0.22231244, -0.10168496, -0.21518417, -0.0054759895, -0.32655567, -0.34048972, 0.11826245, -0.002854444, -0.11257602, -0.09318273, -0.10332744, 0.078923725, -0.11612356, -0.030546617, -0.12474622)); - target += mul(nconv2d_4_tf, min16float4x4(-0.11420135, -0.24489257, 0.15446539, 0.12646616, -0.07092042, 0.110105604, 0.054362826, 0.07867222, -0.15557991, 0.071640015, 0.21894808, 0.24164975, 0.0062167975, 0.10681122, -0.32373384, 0.06931269)); - target += mul(conv2d_7_tf, min16float4x4(0.0769479, -0.09528171, -0.38724712, 0.010703831, -0.016925508, -0.018486671, 0.035855293, -0.17932071, -0.078450575, -0.036463127, 0.20942347, 0.060895607, -0.16549253, -0.008952913, 0.20420915, -0.009001661)); - target += mul(nconv2d_7_tf, min16float4x4(0.074243605, 0.015648128, -0.05003613, 0.10121142, -0.0218682, 0.006933849, 0.101385176, 0.16132122, 0.0013466089, 0.14042993, -0.25816667, -0.040413387, -0.19570185, -0.08637437, 0.17934911, 0.24961887)); - target += mul(conv2d_10_tf, min16float4x4(-0.40401492, -0.16131033, 0.454142, 0.56882274, -0.013024656, -0.04423676, -0.023137214, 0.36117804, -0.0901519, -0.03237353, 0.010538879, -0.033432953, 0.105834074, -0.0549062, 0.05576519, -0.092626475)); - target += mul(nconv2d_10_tf, min16float4x4(-0.0017419134, -0.022569131, 0.027351622, -0.1289159, -0.0823291, -0.020735232, -0.28244564, -0.21001048, -0.048950948, 0.022033915, 0.14678808, -0.010097721, -0.06839686, 0.031720705, 0.11333891, 0.05049834)); - target += mul(conv2d_13_tf, min16float4x4(-0.2191025, -0.005935159, 0.24627906, 0.058490098, -0.011270337, -0.019233467, -0.17698613, -0.0052346545, 0.2288101, -2.5289672e-05, 0.267102, -0.026019678, -0.17386179, -0.017672652, -0.35420522, 0.2836498)); - target += mul(nconv2d_13_tf, min16float4x4(0.19294678, 0.011570707, -0.34666267, -0.09040537, 0.18127288, 0.10182209, 0.08549184, -0.48737645, -0.040560674, 0.20645715, -0.68665904, -1.3146902, 0.18629448, 0.09806124, 0.09953519, -0.5450951)); - target += min16float4(-0.24792486, -0.09899526, 0.3761066, 0.022595163); + MF4 target = MF4(-0.24792486, -0.09899526, 0.3761066, 0.022595163); + target = MulAdd(e1, MF4x4(-0.3378193, 0.013861057, 0.19208853, -0.05050854, 0.08691835, 0.16724123, 0.10351982, -0.40157926, -0.055889476, -0.040115904, -0.13351472, -0.7937818, 0.18700145, 0.109559685, -0.119053595, -0.12651901), target); + target = MulAdd(e2, MF4x4(0.05863214, -0.011048432, 0.22007701, -0.21624403, -0.06139813, -0.06766812, 0.022506371, 0.17585056, -0.37994936, -0.018394569, 0.5127985, -0.19700864, -0.07880973, 0.15687309, -0.12574019, -0.19570859), target); + target = MulAdd(ne1, MF4x4(0.5059051, -0.010676642, -0.47922808, -0.017590942, -0.20583269, -0.10777252, -0.33185184, -0.0025075034, -0.1518394, 0.14268444, 0.005011664, 0.09016961, -0.46011007, -0.09428751, 0.34915137, 0.13334215), target); + target = MulAdd(ne2, MF4x4(-0.15615676, 0.09427065, 0.006016912, -0.0003997069, 0.16170138, 0.09666374, 0.14158808, -0.23772424, 0.39373854, 0.004074768, -0.28073287, 0.0032489141, 0.23473479, -0.12678933, -0.24589436, -0.21988034), target); + target = MulAdd(conv2d_11_tf, MF4x4(-0.12682347, 0.033012364, 0.18928578, 0.12523666, 0.12809147, 0.008567846, -0.10653368, -0.03712133, 0.075765386, -0.042196997, 0.039182812, 0.17273012, 0.21258987, 0.039698593, -0.0018848967, -0.07930902), target); + target = MulAdd(nconv2d_11_tf, MF4x4(0.013454855, -0.18023406, -0.49323913, -0.032017395, 0.11903338, -0.043025218, -0.46579728, 0.21894619, -0.21387324, -0.13455649, 0.30638975, 0.3472243, 0.09305909, -0.015791988, 0.071368046, -0.038680866), target); + target = MulAdd(conv2d_1_tf, MF4x4(0.012506262, 0.09754124, -0.092920735, 0.23061672, 0.08051618, -0.38472125, 0.17626029, 0.009075537, -0.18316247, -0.1338181, 0.2650675, 0.0516641, 0.080453254, 0.22033659, -0.13004474, -0.07781194), target); + target = MulAdd(nconv2d_1_tf, MF4x4(-0.12412428, -0.11978811, 0.06780084, -0.1710261, -0.09355731, 0.31283846, -0.022725523, -0.16437142, -0.11865966, 0.10907317, 0.22463441, 0.017325362, 0.02512185, -0.49577957, 0.2016018, 0.14196795), target); + target = MulAdd(conv2d_4_tf, MF4x4(0.02570746, 0.22231244, -0.10168496, -0.21518417, -0.0054759895, -0.32655567, -0.34048972, 0.11826245, -0.002854444, -0.11257602, -0.09318273, -0.10332744, 0.078923725, -0.11612356, -0.030546617, -0.12474622), target); + target = MulAdd(nconv2d_4_tf, MF4x4(-0.11420135, -0.24489257, 0.15446539, 0.12646616, -0.07092042, 0.110105604, 0.054362826, 0.07867222, -0.15557991, 0.071640015, 0.21894808, 0.24164975, 0.0062167975, 0.10681122, -0.32373384, 0.06931269), target); + target = MulAdd(conv2d_7_tf, MF4x4(0.0769479, -0.09528171, -0.38724712, 0.010703831, -0.016925508, -0.018486671, 0.035855293, -0.17932071, -0.078450575, -0.036463127, 0.20942347, 0.060895607, -0.16549253, -0.008952913, 0.20420915, -0.009001661), target); + target = MulAdd(nconv2d_7_tf, MF4x4(0.074243605, 0.015648128, -0.05003613, 0.10121142, -0.0218682, 0.006933849, 0.101385176, 0.16132122, 0.0013466089, 0.14042993, -0.25816667, -0.040413387, -0.19570185, -0.08637437, 0.17934911, 0.24961887), target); + target = MulAdd(conv2d_10_tf, MF4x4(-0.40401492, -0.16131033, 0.454142, 0.56882274, -0.013024656, -0.04423676, -0.023137214, 0.36117804, -0.0901519, -0.03237353, 0.010538879, -0.033432953, 0.105834074, -0.0549062, 0.05576519, -0.092626475), target); + target = MulAdd(nconv2d_10_tf, MF4x4(-0.0017419134, -0.022569131, 0.027351622, -0.1289159, -0.0823291, -0.020735232, -0.28244564, -0.21001048, -0.048950948, 0.022033915, 0.14678808, -0.010097721, -0.06839686, 0.031720705, 0.11333891, 0.05049834), target); + target = MulAdd(conv2d_13_tf, MF4x4(-0.2191025, -0.005935159, 0.24627906, 0.058490098, -0.011270337, -0.019233467, -0.17698613, -0.0052346545, 0.2288101, -2.5289672e-05, 0.267102, -0.026019678, -0.17386179, -0.017672652, -0.35420522, 0.2836498), target); + target = MulAdd(nconv2d_13_tf, MF4x4(0.19294678, 0.011570707, -0.34666267, -0.09040537, 0.18127288, 0.10182209, 0.08549184, -0.48737645, -0.040560674, 0.20645715, -0.68665904, -1.3146902, 0.18629448, 0.09806124, 0.09953519, -0.5450951), target); tex4[gxy] = target; - target = mul(e1, min16float4x4(0.15938057, -0.23559119, -0.28445953, 0.05912659, 0.5229142, -0.02843545, -0.004113748, -0.056947608, 0.1367782, -0.026573306, -0.0056468234, 0.2564603, 0.25593445, 0.08957574, 0.26139608, -0.053708326)); - target += mul(e2, min16float4x4(0.1382045, -0.103480555, 0.05831098, 0.000735441, 0.20176832, -0.087079, -0.07839967, -0.0750771, -0.31373122, -0.27509713, -0.23071732, -0.2560584, 0.110963896, -0.052200988, 0.0015331429, -0.30707568)); - target += mul(ne1, min16float4x4(-0.056460302, 0.2147989, 0.40628514, -0.058157466, -0.17940372, -0.033689886, -0.022241283, -0.0018471872, 0.26578268, -0.098452985, -0.01501511, -0.35676336, -0.07152056, -0.07245194, -0.32194778, 0.03888747)); - target += mul(ne2, min16float4x4(0.09541087, 0.24680884, -0.045627397, -0.08557985, 0.08790337, 0.10179883, 0.3007415, 0.044102084, 0.1064372, 0.2994135, 0.15280741, 0.2683849, 0.24750276, -0.021364288, -0.004039902, 0.28266376)); - target += mul(conv2d_11_tf, min16float4x4(-0.26525706, -0.08389754, -0.10918147, -0.06878537, -0.080960914, 0.03737948, 0.107663736, -0.0025957434, -0.10748625, 0.03004828, 0.03505711, 0.075969726, 0.06360464, -0.02740913, 0.025467616, 0.017698402)); - target += mul(nconv2d_11_tf, min16float4x4(-0.2370006, -0.07687027, 0.015225365, 0.17986605, 0.37507248, 0.2088343, 0.17946883, 0.2379337, -0.25194344, 0.035336476, -0.15362923, -0.008527836, 0.045963865, 0.025127884, 0.06973296, 0.063168526)); - target += mul(conv2d_1_tf, min16float4x4(0.09583503, 0.15350054, -0.15248272, 0.045916792, -0.18339546, -0.29747355, 0.027330166, -0.39461568, 0.095963046, -0.1775004, -0.19221638, -0.15368307, 0.056089737, 0.18232727, 0.03182419, 0.30851522)); - target += mul(nconv2d_1_tf, min16float4x4(-0.053062204, -0.0018095247, -0.04514637, 0.05689337, 0.07561519, 0.17035827, -0.0048587993, 0.38348997, -0.063476466, 0.09454219, 0.03969728, 0.11693653, -0.0012066896, -0.25955358, -0.14428577, -0.19967856)); - target += mul(conv2d_4_tf, min16float4x4(0.034378257, 0.16030714, 0.05160261, 0.21927983, -0.14469208, 0.041181874, 0.034202367, 0.07983977, 0.22149332, -0.08595994, -0.102985874, -0.07265774, -0.123233125, -0.12819915, 0.08662329, -0.12866889)); - target += mul(nconv2d_4_tf, min16float4x4(-0.1511104, -0.056531575, -0.023363205, -0.1909304, -0.15387732, 0.0671428, -0.15435332, 0.32735124, -0.3293996, 0.055349957, -0.043602336, 0.08102016, 0.200238, 0.13393362, 0.0044564987, 0.16932343)); - target += mul(conv2d_7_tf, min16float4x4(-0.09768015, 0.09503259, 0.12768175, 0.109941825, 0.006567291, -0.102840215, -0.05611706, -0.06865725, -0.2605998, 0.00585688, -0.035119556, -0.06810342, -0.090756536, -0.079376444, -0.22370447, -0.05727839)); - target += mul(nconv2d_7_tf, min16float4x4(-0.101120085, 0.028628688, 0.07296149, 0.15868604, 0.047761433, 0.07732842, -0.016735386, 0.049528413, 0.45619023, 0.062347047, -0.026208224, 0.046785966, -0.05715451, 0.04459997, -0.13676195, 0.07778552)); - target += mul(conv2d_10_tf, min16float4x4(-0.051393595, -0.12524572, -0.36763692, 0.039426118, 0.0349489, 0.07154008, -0.12969223, 0.30249006, -0.15237582, -0.06685149, -0.042049125, -0.0065471376, 0.017375907, -0.07143284, -0.018227521, -0.02778629)); - target += mul(nconv2d_10_tf, min16float4x4(-0.048270147, -0.07275859, 0.05502608, -0.034233145, 0.12822276, -0.02580663, -0.035358194, 0.05195595, 0.044340245, 0.04435722, 0.017985033, 0.007126749, -0.052825354, -0.059360538, -0.09412195, 0.060212586)); - target += mul(conv2d_13_tf, min16float4x4(-0.18645881, -0.04506676, -0.035483524, 0.0063163475, -0.13747677, -0.046985928, 0.0015511635, 0.019160518, -0.4315584, -0.06979354, -0.001936674, 0.0034739177, 0.3490474, 0.15375568, -0.0085117165, 0.017511753)); - target += mul(nconv2d_13_tf, min16float4x4(0.20412005, 0.017221482, 0.08719384, -0.016668927, 0.10308073, -0.1013255, 0.087567665, -0.1004404, 0.9800944, -0.25387812, 0.36526182, -0.21970014, 0.36388537, -0.111629054, 0.21855496, -0.10375334)); - target += min16float4(-0.14657217, -0.04252579, -0.24773599, 0.13271233); + target = MF4(-0.14657217, -0.04252579, -0.24773599, 0.13271233); + target = MulAdd(e1, MF4x4(0.15938057, -0.23559119, -0.28445953, 0.05912659, 0.5229142, -0.02843545, -0.004113748, -0.056947608, 0.1367782, -0.026573306, -0.0056468234, 0.2564603, 0.25593445, 0.08957574, 0.26139608, -0.053708326), target); + target = MulAdd(e2, MF4x4(0.1382045, -0.103480555, 0.05831098, 0.000735441, 0.20176832, -0.087079, -0.07839967, -0.0750771, -0.31373122, -0.27509713, -0.23071732, -0.2560584, 0.110963896, -0.052200988, 0.0015331429, -0.30707568), target); + target = MulAdd(ne1, MF4x4(-0.056460302, 0.2147989, 0.40628514, -0.058157466, -0.17940372, -0.033689886, -0.022241283, -0.0018471872, 0.26578268, -0.098452985, -0.01501511, -0.35676336, -0.07152056, -0.07245194, -0.32194778, 0.03888747), target); + target = MulAdd(ne2, MF4x4(0.09541087, 0.24680884, -0.045627397, -0.08557985, 0.08790337, 0.10179883, 0.3007415, 0.044102084, 0.1064372, 0.2994135, 0.15280741, 0.2683849, 0.24750276, -0.021364288, -0.004039902, 0.28266376), target); + target = MulAdd(conv2d_11_tf, MF4x4(-0.26525706, -0.08389754, -0.10918147, -0.06878537, -0.080960914, 0.03737948, 0.107663736, -0.0025957434, -0.10748625, 0.03004828, 0.03505711, 0.075969726, 0.06360464, -0.02740913, 0.025467616, 0.017698402), target); + target = MulAdd(nconv2d_11_tf, MF4x4(-0.2370006, -0.07687027, 0.015225365, 0.17986605, 0.37507248, 0.2088343, 0.17946883, 0.2379337, -0.25194344, 0.035336476, -0.15362923, -0.008527836, 0.045963865, 0.025127884, 0.06973296, 0.063168526), target); + target = MulAdd(conv2d_1_tf, MF4x4(0.09583503, 0.15350054, -0.15248272, 0.045916792, -0.18339546, -0.29747355, 0.027330166, -0.39461568, 0.095963046, -0.1775004, -0.19221638, -0.15368307, 0.056089737, 0.18232727, 0.03182419, 0.30851522), target); + target = MulAdd(nconv2d_1_tf, MF4x4(-0.053062204, -0.0018095247, -0.04514637, 0.05689337, 0.07561519, 0.17035827, -0.0048587993, 0.38348997, -0.063476466, 0.09454219, 0.03969728, 0.11693653, -0.0012066896, -0.25955358, -0.14428577, -0.19967856), target); + target = MulAdd(conv2d_4_tf, MF4x4(0.034378257, 0.16030714, 0.05160261, 0.21927983, -0.14469208, 0.041181874, 0.034202367, 0.07983977, 0.22149332, -0.08595994, -0.102985874, -0.07265774, -0.123233125, -0.12819915, 0.08662329, -0.12866889), target); + target = MulAdd(nconv2d_4_tf, MF4x4(-0.1511104, -0.056531575, -0.023363205, -0.1909304, -0.15387732, 0.0671428, -0.15435332, 0.32735124, -0.3293996, 0.055349957, -0.043602336, 0.08102016, 0.200238, 0.13393362, 0.0044564987, 0.16932343), target); + target = MulAdd(conv2d_7_tf, MF4x4(-0.09768015, 0.09503259, 0.12768175, 0.109941825, 0.006567291, -0.102840215, -0.05611706, -0.06865725, -0.2605998, 0.00585688, -0.035119556, -0.06810342, -0.090756536, -0.079376444, -0.22370447, -0.05727839), target); + target = MulAdd(nconv2d_7_tf, MF4x4(-0.101120085, 0.028628688, 0.07296149, 0.15868604, 0.047761433, 0.07732842, -0.016735386, 0.049528413, 0.45619023, 0.062347047, -0.026208224, 0.046785966, -0.05715451, 0.04459997, -0.13676195, 0.07778552), target); + target = MulAdd(conv2d_10_tf, MF4x4(-0.051393595, -0.12524572, -0.36763692, 0.039426118, 0.0349489, 0.07154008, -0.12969223, 0.30249006, -0.15237582, -0.06685149, -0.042049125, -0.0065471376, 0.017375907, -0.07143284, -0.018227521, -0.02778629), target); + target = MulAdd(nconv2d_10_tf, MF4x4(-0.048270147, -0.07275859, 0.05502608, -0.034233145, 0.12822276, -0.02580663, -0.035358194, 0.05195595, 0.044340245, 0.04435722, 0.017985033, 0.007126749, -0.052825354, -0.059360538, -0.09412195, 0.060212586), target); + target = MulAdd(conv2d_13_tf, MF4x4(-0.18645881, -0.04506676, -0.035483524, 0.0063163475, -0.13747677, -0.046985928, 0.0015511635, 0.019160518, -0.4315584, -0.06979354, -0.001936674, 0.0034739177, 0.3490474, 0.15375568, -0.0085117165, 0.017511753), target); + target = MulAdd(nconv2d_13_tf, MF4x4(0.20412005, 0.017221482, 0.08719384, -0.016668927, 0.10308073, -0.1013255, 0.087567665, -0.1004404, 0.9800944, -0.25387812, 0.36526182, -0.21970014, 0.36388537, -0.111629054, 0.21855496, -0.10375334), target); tex5[gxy] = target; - target = mul(e1, min16float4x4(-0.22553514, -0.086349756, -0.07735866, 0.48776403, -0.33010843, 0.28214008, -0.2242988, -0.11439686, -0.14720698, 0.2391116, 0.017813087, 0.4352493, -0.16412133, -0.12791261, -0.019643517, 0.19420698)); - target += mul(e2, min16float4x4(-0.9178235, -0.6335296, 0.11146894, -0.0759723, -0.4519685, -0.3007054, 0.014501872, 0.49081457, 0.10673664, 0.035011876, 0.10259641, 0.106546804, 0.5186602, 0.44900152, 0.20597687, -0.39562696)); - target += mul(ne1, min16float4x4(-0.11399027, -0.19542706, 0.087422565, -0.70140034, -0.41029623, -0.049330976, 0.19682989, 0.22516033, -0.22858454, -0.12200487, -0.14852463, -0.40852943, -0.035900578, 0.1886829, 0.019452838, -0.16703403)); - target += mul(ne2, min16float4x4(0.077843145, 0.7323388, -0.022324003, 0.09445821, 0.026166735, -0.1790519, 0.086004496, -0.40011314, 0.01210975, -0.053515363, -0.2501869, 0.06671936, -0.71530163, -0.57196116, -0.38604704, 0.5024949)); - target += mul(conv2d_11_tf, min16float4x4(0.30748057, 0.12223383, 0.059069566, 0.18568543, 0.008148904, 0.009438993, 0.053996127, -0.19665428, 0.38345802, 0.20945628, 0.01368962, -0.2834185, -0.15974379, -0.4628119, -0.18307796, 0.22361058)); - target += mul(nconv2d_11_tf, min16float4x4(0.00833237, -0.10446639, -0.028896136, -0.18917766, -0.24016596, -0.034934085, -0.013062447, 0.079293504, -0.16635038, -0.11056953, 0.2618598, 0.07227063, 0.057050053, 0.013885738, 0.09385356, -0.27068567)); - target += mul(conv2d_1_tf, min16float4x4(-0.5675842, 0.13328329, -0.0252242, 0.34746942, 0.34712863, 0.13635597, 0.02356317, -0.1617803, -0.16861948, -0.018621348, 0.02680753, 0.30408886, -0.034069773, 0.08948961, -0.057724215, 0.111602895)); - target += mul(nconv2d_1_tf, min16float4x4(-0.03835732, -0.11742271, 0.025922403, 0.24378933, -0.36450952, -0.15091905, 0.1214089, 0.21004228, 0.28717628, 0.17053549, 0.10836553, -0.08449643, 0.17507422, -0.03195037, -0.03947606, 0.050725944)); - target += mul(conv2d_4_tf, min16float4x4(-0.21257977, -0.0043600267, -0.12929972, -0.233982, -0.26728988, -0.21511734, 0.07835361, -0.24275993, -0.359975, -0.23956355, -0.07852281, 0.40282407, 0.17184453, 0.11672362, 0.0433819, -0.032416925)); - target += mul(nconv2d_4_tf, min16float4x4(0.20235331, 0.16114245, 0.015931258, -0.17612378, 0.2449233, 0.0031623375, -0.2784109, 0.3347522, 0.46005112, 0.20291579, 0.13030154, -0.23390344, -0.39526668, -0.09738018, 0.013237711, 0.15512206)); - target += mul(conv2d_7_tf, min16float4x4(-0.1434995, -0.12447443, 0.095140964, -0.08841888, -0.05424789, -0.11747197, -0.097216785, 0.12958516, 0.34194428, 0.111434594, -0.02794559, -0.22843723, -0.043816507, -0.16116165, -0.29044297, 0.33768278)); - target += mul(nconv2d_7_tf, min16float4x4(0.39615574, 0.05410518, -0.07885892, -0.22024721, 0.011598219, 0.1446308, 0.11650995, -0.020602686, -0.51892537, 0.14221898, -0.01697185, 0.05188913, 0.07683384, 0.122416414, 0.02296055, 0.2932525)); - target += mul(conv2d_10_tf, min16float4x4(-0.058334768, -0.12389275, -0.02024463, 0.46323973, 0.17553197, 0.35435143, 0.19796194, 0.06836581, 0.15947883, -0.056819815, -0.091066726, 0.22499265, -0.21629064, -0.22203816, 0.053594038, 0.09816408)); - target += mul(nconv2d_10_tf, min16float4x4(-0.016514458, -0.14323495, 0.017527288, -0.19750872, -0.47891942, -0.073656894, -0.086305656, 0.38173944, 0.1016976, 0.15224999, 0.048396923, -0.19529565, 0.13985658, 0.07292602, 0.06549534, 0.210662)); - target += mul(conv2d_13_tf, min16float4x4(0.3459035, 0.0071707424, -0.019186711, 0.2527976, 0.29675815, 0.35949966, -0.06114439, -0.02610484, 0.5475115, -0.13828747, 0.019238133, 0.101953685, -0.52718824, 0.017254699, 0.08887026, -0.19507161)); - target += mul(nconv2d_13_tf, min16float4x4(-0.3064509, -0.031613164, 0.040971015, -0.24252266, -0.21725285, -0.35069898, 0.0951283, -0.065222666, -0.98867434, 0.08824426, 0.06094605, -0.21000125, -0.72066385, -0.34141323, 0.049487203, 0.0690126)); - target += min16float4(0.25545248, -0.112931795, -0.073284395, 0.29349956); + target = MF4(0.25545248, -0.112931795, -0.073284395, 0.29349956); + target = MulAdd(e1, MF4x4(-0.22553514, -0.086349756, -0.07735866, 0.48776403, -0.33010843, 0.28214008, -0.2242988, -0.11439686, -0.14720698, 0.2391116, 0.017813087, 0.4352493, -0.16412133, -0.12791261, -0.019643517, 0.19420698), target); + target = MulAdd(e2, MF4x4(-0.9178235, -0.6335296, 0.11146894, -0.0759723, -0.4519685, -0.3007054, 0.014501872, 0.49081457, 0.10673664, 0.035011876, 0.10259641, 0.106546804, 0.5186602, 0.44900152, 0.20597687, -0.39562696), target); + target = MulAdd(ne1, MF4x4(-0.11399027, -0.19542706, 0.087422565, -0.70140034, -0.41029623, -0.049330976, 0.19682989, 0.22516033, -0.22858454, -0.12200487, -0.14852463, -0.40852943, -0.035900578, 0.1886829, 0.019452838, -0.16703403), target); + target = MulAdd(ne2, MF4x4(0.077843145, 0.7323388, -0.022324003, 0.09445821, 0.026166735, -0.1790519, 0.086004496, -0.40011314, 0.01210975, -0.053515363, -0.2501869, 0.06671936, -0.71530163, -0.57196116, -0.38604704, 0.5024949), target); + target = MulAdd(conv2d_11_tf, MF4x4(0.30748057, 0.12223383, 0.059069566, 0.18568543, 0.008148904, 0.009438993, 0.053996127, -0.19665428, 0.38345802, 0.20945628, 0.01368962, -0.2834185, -0.15974379, -0.4628119, -0.18307796, 0.22361058), target); + target = MulAdd(nconv2d_11_tf, MF4x4(0.00833237, -0.10446639, -0.028896136, -0.18917766, -0.24016596, -0.034934085, -0.013062447, 0.079293504, -0.16635038, -0.11056953, 0.2618598, 0.07227063, 0.057050053, 0.013885738, 0.09385356, -0.27068567), target); + target = MulAdd(conv2d_1_tf, MF4x4(-0.5675842, 0.13328329, -0.0252242, 0.34746942, 0.34712863, 0.13635597, 0.02356317, -0.1617803, -0.16861948, -0.018621348, 0.02680753, 0.30408886, -0.034069773, 0.08948961, -0.057724215, 0.111602895), target); + target = MulAdd(nconv2d_1_tf, MF4x4(-0.03835732, -0.11742271, 0.025922403, 0.24378933, -0.36450952, -0.15091905, 0.1214089, 0.21004228, 0.28717628, 0.17053549, 0.10836553, -0.08449643, 0.17507422, -0.03195037, -0.03947606, 0.050725944), target); + target = MulAdd(conv2d_4_tf, MF4x4(-0.21257977, -0.0043600267, -0.12929972, -0.233982, -0.26728988, -0.21511734, 0.07835361, -0.24275993, -0.359975, -0.23956355, -0.07852281, 0.40282407, 0.17184453, 0.11672362, 0.0433819, -0.032416925), target); + target = MulAdd(nconv2d_4_tf, MF4x4(0.20235331, 0.16114245, 0.015931258, -0.17612378, 0.2449233, 0.0031623375, -0.2784109, 0.3347522, 0.46005112, 0.20291579, 0.13030154, -0.23390344, -0.39526668, -0.09738018, 0.013237711, 0.15512206), target); + target = MulAdd(conv2d_7_tf, MF4x4(-0.1434995, -0.12447443, 0.095140964, -0.08841888, -0.05424789, -0.11747197, -0.097216785, 0.12958516, 0.34194428, 0.111434594, -0.02794559, -0.22843723, -0.043816507, -0.16116165, -0.29044297, 0.33768278), target); + target = MulAdd(nconv2d_7_tf, MF4x4(0.39615574, 0.05410518, -0.07885892, -0.22024721, 0.011598219, 0.1446308, 0.11650995, -0.020602686, -0.51892537, 0.14221898, -0.01697185, 0.05188913, 0.07683384, 0.122416414, 0.02296055, 0.2932525), target); + target = MulAdd(conv2d_10_tf, MF4x4(-0.058334768, -0.12389275, -0.02024463, 0.46323973, 0.17553197, 0.35435143, 0.19796194, 0.06836581, 0.15947883, -0.056819815, -0.091066726, 0.22499265, -0.21629064, -0.22203816, 0.053594038, 0.09816408), target); + target = MulAdd(nconv2d_10_tf, MF4x4(-0.016514458, -0.14323495, 0.017527288, -0.19750872, -0.47891942, -0.073656894, -0.086305656, 0.38173944, 0.1016976, 0.15224999, 0.048396923, -0.19529565, 0.13985658, 0.07292602, 0.06549534, 0.210662), target); + target = MulAdd(conv2d_13_tf, MF4x4(0.3459035, 0.0071707424, -0.019186711, 0.2527976, 0.29675815, 0.35949966, -0.06114439, -0.02610484, 0.5475115, -0.13828747, 0.019238133, 0.101953685, -0.52718824, 0.017254699, 0.08887026, -0.19507161), target); + target = MulAdd(nconv2d_13_tf, MF4x4(-0.3064509, -0.031613164, 0.040971015, -0.24252266, -0.21725285, -0.35069898, 0.0951283, -0.065222666, -0.98867434, 0.08824426, 0.06094605, -0.21000125, -0.72066385, -0.34141323, 0.049487203, 0.0690126), target); tex10[gxy] = target; } @@ -1174,25 +1177,25 @@ void Pass7(uint2 blockStart, uint3 threadId) { // [ a, d, g ] // [ b, e, h ] // [ c, f, i ] - min16float4 a1 = tex4.SampleLevel(sam1, pos - outputPt, 0); - min16float4 b1 = tex4.SampleLevel(sam1, pos + float2(-outputPt.x, 0), 0); - min16float4 c1 = tex4.SampleLevel(sam1, pos + float2(-outputPt.x, outputPt.y), 0); - min16float4 d1 = tex4.SampleLevel(sam1, pos + float2(0, -outputPt.y), 0); - min16float4 e1 = tex4.SampleLevel(sam1, pos, 0); - min16float4 f1 = tex4.SampleLevel(sam1, pos + float2(0, outputPt.y), 0); - min16float4 g1 = tex4.SampleLevel(sam1, pos + float2(outputPt.x, -outputPt.y), 0); - min16float4 h1 = tex4.SampleLevel(sam1, pos + float2(outputPt.x, 0), 0); - min16float4 i1 = tex4.SampleLevel(sam1, pos + outputPt, 0); - - min16float4 na1 = max(-a1, 0); - min16float4 nb1 = max(-b1, 0); - min16float4 nc1 = max(-c1, 0); - min16float4 nd1 = max(-d1, 0); - min16float4 ne1 = max(-e1, 0); - min16float4 nf1 = max(-f1, 0); - min16float4 ng1 = max(-g1, 0); - min16float4 nh1 = max(-h1, 0); - min16float4 ni1 = max(-i1, 0); + MF4 a1 = tex4.SampleLevel(sam1, pos - outputPt, 0); + MF4 b1 = tex4.SampleLevel(sam1, pos + float2(-outputPt.x, 0), 0); + MF4 c1 = tex4.SampleLevel(sam1, pos + float2(-outputPt.x, outputPt.y), 0); + MF4 d1 = tex4.SampleLevel(sam1, pos + float2(0, -outputPt.y), 0); + MF4 e1 = tex4.SampleLevel(sam1, pos, 0); + MF4 f1 = tex4.SampleLevel(sam1, pos + float2(0, outputPt.y), 0); + MF4 g1 = tex4.SampleLevel(sam1, pos + float2(outputPt.x, -outputPt.y), 0); + MF4 h1 = tex4.SampleLevel(sam1, pos + float2(outputPt.x, 0), 0); + MF4 i1 = tex4.SampleLevel(sam1, pos + outputPt, 0); + + MF4 na1 = max(-a1, 0); + MF4 nb1 = max(-b1, 0); + MF4 nc1 = max(-c1, 0); + MF4 nd1 = max(-d1, 0); + MF4 ne1 = max(-e1, 0); + MF4 nf1 = max(-f1, 0); + MF4 ng1 = max(-g1, 0); + MF4 nh1 = max(-h1, 0); + MF4 ni1 = max(-i1, 0); a1 = max(a1, 0); b1 = max(b1, 0); @@ -1204,25 +1207,25 @@ void Pass7(uint2 blockStart, uint3 threadId) { h1 = max(h1, 0); i1 = max(i1, 0); - min16float4 a2 = tex5.SampleLevel(sam1, pos - outputPt, 0); - min16float4 b2 = tex5.SampleLevel(sam1, pos + float2(-outputPt.x, 0), 0); - min16float4 c2 = tex5.SampleLevel(sam1, pos + float2(-outputPt.x, outputPt.y), 0); - min16float4 d2 = tex5.SampleLevel(sam1, pos + float2(0, -outputPt.y), 0); - min16float4 e2 = tex5.SampleLevel(sam1, pos, 0); - min16float4 f2 = tex5.SampleLevel(sam1, pos + float2(0, outputPt.y), 0); - min16float4 g2 = tex5.SampleLevel(sam1, pos + float2(outputPt.x, -outputPt.y), 0); - min16float4 h2 = tex5.SampleLevel(sam1, pos + float2(outputPt.x, 0), 0); - min16float4 i2 = tex5.SampleLevel(sam1, pos + outputPt, 0); - - min16float4 na2 = max(-a2, 0); - min16float4 nb2 = max(-b2, 0); - min16float4 nc2 = max(-c2, 0); - min16float4 nd2 = max(-d2, 0); - min16float4 ne2 = max(-e2, 0); - min16float4 nf2 = max(-f2, 0); - min16float4 ng2 = max(-g2, 0); - min16float4 nh2 = max(-h2, 0); - min16float4 ni2 = max(-i2, 0); + MF4 a2 = tex5.SampleLevel(sam1, pos - outputPt, 0); + MF4 b2 = tex5.SampleLevel(sam1, pos + float2(-outputPt.x, 0), 0); + MF4 c2 = tex5.SampleLevel(sam1, pos + float2(-outputPt.x, outputPt.y), 0); + MF4 d2 = tex5.SampleLevel(sam1, pos + float2(0, -outputPt.y), 0); + MF4 e2 = tex5.SampleLevel(sam1, pos, 0); + MF4 f2 = tex5.SampleLevel(sam1, pos + float2(0, outputPt.y), 0); + MF4 g2 = tex5.SampleLevel(sam1, pos + float2(outputPt.x, -outputPt.y), 0); + MF4 h2 = tex5.SampleLevel(sam1, pos + float2(outputPt.x, 0), 0); + MF4 i2 = tex5.SampleLevel(sam1, pos + outputPt, 0); + + MF4 na2 = max(-a2, 0); + MF4 nb2 = max(-b2, 0); + MF4 nc2 = max(-c2, 0); + MF4 nd2 = max(-d2, 0); + MF4 ne2 = max(-e2, 0); + MF4 nf2 = max(-f2, 0); + MF4 ng2 = max(-g2, 0); + MF4 nh2 = max(-h2, 0); + MF4 ni2 = max(-i2, 0); a2 = max(a2, 0); b2 = max(b2, 0); @@ -1234,25 +1237,25 @@ void Pass7(uint2 blockStart, uint3 threadId) { h2 = max(h2, 0); i2 = max(i2, 0); - min16float4 a3 = tex10.SampleLevel(sam1, pos - outputPt, 0); - min16float4 b3 = tex10.SampleLevel(sam1, pos + float2(-outputPt.x, 0), 0); - min16float4 c3 = tex10.SampleLevel(sam1, pos + float2(-outputPt.x, outputPt.y), 0); - min16float4 d3 = tex10.SampleLevel(sam1, pos + float2(0, -outputPt.y), 0); - min16float4 e3 = tex10.SampleLevel(sam1, pos, 0); - min16float4 f3 = tex10.SampleLevel(sam1, pos + float2(0, outputPt.y), 0); - min16float4 g3 = tex10.SampleLevel(sam1, pos + float2(outputPt.x, -outputPt.y), 0); - min16float4 h3 = tex10.SampleLevel(sam1, pos + float2(outputPt.x, 0), 0); - min16float4 i3 = tex10.SampleLevel(sam1, pos + outputPt, 0); - - min16float4 na3 = max(-a3, 0); - min16float4 nb3 = max(-b3, 0); - min16float4 nc3 = max(-c3, 0); - min16float4 nd3 = max(-d3, 0); - min16float4 ne3 = max(-e3, 0); - min16float4 nf3 = max(-f3, 0); - min16float4 ng3 = max(-g3, 0); - min16float4 nh3 = max(-h3, 0); - min16float4 ni3 = max(-i3, 0); + MF4 a3 = tex10.SampleLevel(sam1, pos - outputPt, 0); + MF4 b3 = tex10.SampleLevel(sam1, pos + float2(-outputPt.x, 0), 0); + MF4 c3 = tex10.SampleLevel(sam1, pos + float2(-outputPt.x, outputPt.y), 0); + MF4 d3 = tex10.SampleLevel(sam1, pos + float2(0, -outputPt.y), 0); + MF4 e3 = tex10.SampleLevel(sam1, pos, 0); + MF4 f3 = tex10.SampleLevel(sam1, pos + float2(0, outputPt.y), 0); + MF4 g3 = tex10.SampleLevel(sam1, pos + float2(outputPt.x, -outputPt.y), 0); + MF4 h3 = tex10.SampleLevel(sam1, pos + float2(outputPt.x, 0), 0); + MF4 i3 = tex10.SampleLevel(sam1, pos + outputPt, 0); + + MF4 na3 = max(-a3, 0); + MF4 nb3 = max(-b3, 0); + MF4 nc3 = max(-c3, 0); + MF4 nd3 = max(-d3, 0); + MF4 ne3 = max(-e3, 0); + MF4 nf3 = max(-f3, 0); + MF4 ng3 = max(-g3, 0); + MF4 nh3 = max(-h3, 0); + MF4 ni3 = max(-i3, 0); a3 = max(a3, 0); b3 = max(b3, 0); @@ -1264,61 +1267,61 @@ void Pass7(uint2 blockStart, uint3 threadId) { h3 = max(h3, 0); i3 = max(i3, 0); - min16float3 target = mul(a1, min16float4x3(0.009331738, 0.018572107, 0.022010602, 0.0039357482, -0.016444422, -0.02944063, -0.03631314, -0.056094132, -0.050672945, 0.0077923858, -0.023002634, 0.021950275)); - target += mul(b1, min16float4x3(-0.015352033, -0.018134398, -0.031076321, 0.09254242, 0.07433854, 0.094745025, 0.09154548, 0.10833595, 0.084574744, -0.06755486, 0.022037052, -0.09424632)); - target += mul(c1, min16float4x3(0.019884977, 0.021337362, 0.026944455, 0.11712925, 0.021360623, -0.017487818, -0.14924358, -0.1149652, -0.12671575, 0.012104617, -0.039750118, -0.002691512)); - target += mul(d1, min16float4x3(0.00344861, -0.0071971808, -0.011530234, 0.039175995, 0.12297611, 0.15838134, 0.033669177, 0.018021118, -0.010552058, -0.048705686, 0.03920792, -0.00043378037)); - target += mul(e1, min16float4x3(-0.09026871, -0.09393277, -0.0849584, -0.16249315, -0.15300421, -0.1383744, -0.10384136, -0.04767781, 0.022754611, 0.14949107, 0.06619118, 0.016498014)); - target += mul(f1, min16float4x3(0.0138111375, 0.0033931104, 0.010171692, -0.037168514, -0.029690385, -0.045251988, 0.074186325, 0.056937214, 0.06968052, -0.057218343, -0.060974367, -0.030270662)); - target += mul(g1, min16float4x3(-0.0028436058, 0.010812401, 0.012844112, 0.050941236, -0.035253663, -0.061899442, -0.01614737, 0.01752726, -0.023620276, -0.04961744, -0.06673698, 0.039417736)); - target += mul(h1, min16float4x3(0.042587858, 0.03904053, 0.028782798, -0.09807107, -0.01929461, -0.034585416, 0.029584344, -0.053522006, 0.0068953806, -0.04451219, 0.018451538, -0.029895604)); - target += mul(i1, min16float4x3(-0.0041629653, 0.0070575047, 0.004515914, 0.043267716, 0.0020689464, 0.010954458, -0.0013374113, 0.009222025, -0.0272451, 0.00645634, -0.037133303, -0.03209227)); - target += mul(a2, min16float4x3(-0.010575585, -0.00065620174, -0.009598815, -0.068592854, -0.06461729, -0.05058234, 0.03790364, 0.044340994, 0.049410254, -0.009466368, 0.081484325, 0.07265021)); - target += mul(b2, min16float4x3(-0.01598744, -0.025267042, -0.010857686, 0.0771284, 0.081469566, 0.07138724, -0.00555409, -0.006099002, -0.02123016, -0.29761449, -0.10614364, -0.1027762)); - target += mul(c2, min16float4x3(0.02664693, 0.027294884, 0.019080907, 0.010511018, 0.01179118, 0.02403106, 0.05436632, 0.07234358, 0.08310484, 0.03146414, -0.02122628, -0.021377526)); - target += mul(d2, min16float4x3(0.027889153, 0.018621879, 0.025370836, -0.14017807, -0.14772555, -0.14436993, -0.017539013, -0.028932836, -0.06139342, 0.0007456944, -0.00086823467, -0.05282406)); - target += mul(e2, min16float4x3(-0.0017060362, 0.00777287, 0.003343087, 0.20926197, 0.21706305, 0.23307496, -0.16601992, -0.183019, -0.139133, 0.13933188, -0.013340946, -0.021960167)); - target += mul(f2, min16float4x3(-0.018459205, -0.023415336, -0.0173199, 0.08558963, 0.10207333, 0.06444232, -2.5721886e-06, -0.015806457, -0.036833573, -0.20488425, -0.009690944, 0.020323949)); - target += mul(g2, min16float4x3(0.010601256, 0.007344732, 0.0056538777, 0.021578439, 0.017345639, 0.0032158173, 0.031785835, 0.04436094, 0.05920955, 0.23948166, -0.06085234, -0.14597872)); - target += mul(h2, min16float4x3(0.00777581, 0.012557825, 0.0123206265, -0.0691877, -0.0861206, -0.077578135, -0.018104369, -0.024902673, -0.036656447, 0.10611258, 0.09515675, 0.118361965)); - target += mul(i2, min16float4x3(0.0021278602, 0.003906813, 0.0016891633, -0.06379228, -0.060215514, -0.051921096, 0.039505195, 0.052035928, 0.05059492, -0.047328927, -0.0066980706, 0.09447027)); - target += mul(a3, min16float4x3(0.18920127, -0.045531996, -0.044905778, 0.013732142, 0.019208554, 0.011500921, -0.0040531917, -0.02001873, -0.0023935249, -0.033091005, -0.017751431, -0.009764133)); - target += mul(b3, min16float4x3(0.15241088, -0.13676398, -0.01825122, -0.003517022, -0.004041717, 0.003177141, 0.011362495, 0.03685609, 0.008397426, -0.08597375, -0.111830845, -0.110682696)); - target += mul(c3, min16float4x3(-0.046171717, 0.23827009, -0.119844295, 0.005446854, 0.00826863, 0.002206898, -0.11165099, -0.14702465, -0.1203897, 0.12169146, 0.11585612, 0.10473949)); - target += mul(d3, min16float4x3(-0.18456058, 0.13293917, 0.06901046, 0.010084839, -0.0006403412, -0.011852079, -0.062180433, -0.06781299, -0.08111614, -0.02218764, -0.015271581, -0.019768957)); - target += mul(e3, min16float4x3(0.034135204, -0.20479187, 0.27587336, -0.058966126, -0.065613195, -0.056132246, 0.07697151, 0.0706985, 0.098771244, 0.06747748, 0.10971204, 0.13186967)); - target += mul(f3, min16float4x3(0.017322296, -0.06730298, 0.07034802, 0.013449086, 0.007968637, 0.012679429, 0.0902275, 0.11269024, 0.08805874, -0.06179092, -0.06705483, -0.13040404)); - target += mul(g3, min16float4x3(-0.052505482, -0.018989135, 0.03388015, -0.068704374, -0.05350174, -0.057223134, 0.011537428, 0.017847707, 0.0270268, -0.008713432, -0.02698126, -0.017463546)); - target += mul(h3, min16float4x3(0.15220639, -0.05387876, -0.08352881, 0.026893694, 0.027608246, 0.025959803, 0.035518423, 0.035180617, 0.01858579, -0.021064412, -0.014214504, -0.0051168953)); - target += mul(i3, min16float4x3(-0.11906418, 0.13103563, -0.06997703, 0.005664134, 0.0075536724, 0.009519002, -0.025366528, -0.013528652, -0.015087253, 0.0071858848, -0.027586544, 0.016723866)); - target += mul(na1, min16float4x3(0.015307254, 0.02070064, 0.012568325, 0.06845904, -0.033312738, -0.0058661965, -0.016281582, -0.01631146, -0.021667928, -0.012522515, -0.020992521, -0.015833912)); - target += mul(nb1, min16float4x3(0.04937768, 0.0405066, 0.041023023, 0.05503905, -0.13230717, -0.14439866, 0.01618014, 0.0122084245, 0.016226485, 0.0014116488, 0.011495032, 0.002382562)); - target += mul(nc1, min16float4x3(-0.04847043, -0.050508745, -0.041216835, -0.067119725, -0.0448592, -0.011477939, -0.035635237, -0.037191708, -0.034170575, -0.016549444, -0.027191242, -0.017883684)); - target += mul(nd1, min16float4x3(0.034498286, 0.026938718, 0.052970096, -0.10511612, -0.13200648, -0.09493861, -0.0018118658, -0.0072637545, 0.0043198126, -0.038338073, -0.031448375, -0.035546694)); - target += mul(ne1, min16float4x3(0.048043568, 0.057704087, 0.06386534, 0.04542113, 0.20604704, 0.2598609, 0.049180254, 0.064697154, 0.05789202, 0.08370016, 0.08105142, 0.08807082)); - target += mul(nf1, min16float4x3(-0.018156562, 0.008306473, -0.014604633, 0.18912326, 0.024388695, -0.08006485, 0.009333483, 0.011596536, 0.0056475243, 0.027749287, 0.039271932, 0.02655462)); - target += mul(ng1, min16float4x3(-0.030157864, -0.035259083, -0.05771176, -0.22293729, 0.0768592, 0.14670776, -0.013287718, -0.011300663, -0.01670879, -0.009928094, -0.016364388, -0.013879692)); - target += mul(nh1, min16float4x3(-0.013415757, -0.013257486, -0.01940959, 0.014077903, 0.05088362, 0.04006286, -0.0033998038, -0.0062313867, -0.00833104, 0.015246904, 0.017004015, 0.01802002)); - target += mul(ni1, min16float4x3(-0.0016801689, -0.022088053, 0.0031654288, 0.027371893, -0.007083684, -0.10904292, -0.015408179, -0.01793058, -0.010933266, -0.023707654, -0.026440954, -0.025527867)); - target += mul(na2, min16float4x3(0.009003153, 0.0078040734, 0.037757806, 0.054483943, 0.058831017, 0.060899608, -0.011133613, -0.01601666, -0.007977876, -0.07686641, -0.049250316, -0.045481566)); - target += mul(nb2, min16float4x3(0.04344093, 0.07054628, 0.037604738, -0.0914579, -0.105631486, -0.108511426, 0.04426105, 0.0492282, 0.048829302, 0.14961997, 0.16839094, 0.16053638)); - target += mul(nc2, min16float4x3(-0.0032967671, -0.019857304, -0.014145445, -0.013525817, 0.001614058, -0.009782301, -0.044629153, -0.07325184, -0.07655591, -0.08667146, 0.024955297, 0.04591592)); - target += mul(nd2, min16float4x3(0.04816059, 0.030722216, 0.032487474, 0.09684092, 0.10024655, 0.101904154, 0.08137448, 0.092595905, 0.1118598, 0.0796932, 0.009548236, 0.0013610915)); - target += mul(ne2, min16float4x3(-0.17208904, -0.19137467, -0.17717223, -0.10827683, -0.11960323, -0.1204814, -0.030430049, -0.019306151, -0.05230355, -0.021787236, -0.015395303, -0.093210146)); - target += mul(nf2, min16float4x3(0.04527227, 0.057978027, 0.10569097, -0.1015645, -0.12595437, -0.097537845, 0.060087565, 0.09157804, 0.060251515, 0.05170573, 0.042533275, 0.08233745)); - target += mul(ng2, min16float4x3(-0.01908824, 0.0039797956, -0.015060464, 0.008187719, 0.013936167, 0.008152853, -0.02618239, -0.056918032, -0.0504624, -0.083657, 0.02122987, 0.022906482)); - target += mul(nh2, min16float4x3(0.058020473, 0.08750743, 0.032107625, 0.021999976, 0.030119067, 0.03513493, 0.06583862, 0.08137626, 0.09867312, -0.0021064964, -0.1227668, -0.0912879)); - target += mul(ni2, min16float4x3(0.022279112, -0.012710205, -0.0011416139, 0.05606448, 0.066590145, 0.061043978, -0.008292685, -0.019583363, -0.006212003, -0.053282585, -0.029954918, -0.021437356)); - target += mul(na3, min16float4x3(0.019198919, 0.020138288, 0.02048463, -0.012281223, -0.01964347, -0.010557296, 0.00830553, 0.02714052, 0.016606145, -0.0047117253, -0.0060619717, 0.0015284229)); - target += mul(nb3, min16float4x3(-0.01620369, -0.018634152, -0.018486649, -0.0037721654, -0.005256878, -0.0032221128, 0.048627518, 0.033200823, 0.05459796, 0.0064762663, 0.005607537, 0.0014544157)); - target += mul(nc3, min16float4x3(-0.0049319286, -0.003757374, -0.008033526, -0.009529666, -0.01023788, -0.011724289, 0.08779079, 0.11368912, 0.10699827, 0.014564745, 0.017019482, 0.018130492)); - target += mul(nd3, min16float4x3(-0.018128838, -0.020529313, -0.021291668, 0.022232227, 0.032956265, 0.030233478, 0.057042982, 0.052126013, 0.039634123, 0.04395578, 0.042147905, 0.047779605)); - target += mul(ne3, min16float4x3(-0.008916549, -0.011398656, -0.006473247, 0.07594334, 0.07910866, 0.0726948, -0.1670962, -0.17030263, -0.18856722, 0.0067814733, 0.01550948, 0.002108076)); - target += mul(nf3, min16float4x3(-0.0020052418, -0.0015789939, 0.0024248413, -0.018381692, -0.012541983, -0.016114611, -0.054943718, -0.08546223, -0.045788202, -0.02116913, -0.02479526, -0.02281286)); - target += mul(ng3, min16float4x3(0.004089441, 0.004577225, 0.009165186, -0.023352642, -0.03344756, -0.03359231, 0.051127084, 0.055484984, 0.06788994, -0.009284511, -0.0026670755, -0.011205212)); - target += mul(nh3, min16float4x3(-0.008048874, -0.003658728, -0.011127851, 0.0034879802, 0.014905489, 0.016252292, -0.07353042, -0.0754597, -0.09509333, 0.009990113, -0.0003871956, 0.0049740863)); - target += mul(ni3, min16float4x3(0.009073377, 0.006138898, 0.006741848, -0.009877169, -0.019738095, -0.015525384, 0.057441086, 0.06538757, 0.053950094, -0.0011834118, 0.0010558038, 0.004649949)); - target += min16float3(-0.008654677, -0.008960475, -0.009207461); + MF3 target = MF3(-0.008654677, -0.008960475, -0.009207461); + target = MulAdd(a1, MF4x3(0.009331738, 0.018572107, 0.022010602, 0.0039357482, -0.016444422, -0.02944063, -0.03631314, -0.056094132, -0.050672945, 0.0077923858, -0.023002634, 0.021950275), target); + target = MulAdd(b1, MF4x3(-0.015352033, -0.018134398, -0.031076321, 0.09254242, 0.07433854, 0.094745025, 0.09154548, 0.10833595, 0.084574744, -0.06755486, 0.022037052, -0.09424632), target); + target = MulAdd(c1, MF4x3(0.019884977, 0.021337362, 0.026944455, 0.11712925, 0.021360623, -0.017487818, -0.14924358, -0.1149652, -0.12671575, 0.012104617, -0.039750118, -0.002691512), target); + target = MulAdd(d1, MF4x3(0.00344861, -0.0071971808, -0.011530234, 0.039175995, 0.12297611, 0.15838134, 0.033669177, 0.018021118, -0.010552058, -0.048705686, 0.03920792, -0.00043378037), target); + target = MulAdd(e1, MF4x3(-0.09026871, -0.09393277, -0.0849584, -0.16249315, -0.15300421, -0.1383744, -0.10384136, -0.04767781, 0.022754611, 0.14949107, 0.06619118, 0.016498014), target); + target = MulAdd(f1, MF4x3(0.0138111375, 0.0033931104, 0.010171692, -0.037168514, -0.029690385, -0.045251988, 0.074186325, 0.056937214, 0.06968052, -0.057218343, -0.060974367, -0.030270662), target); + target = MulAdd(g1, MF4x3(-0.0028436058, 0.010812401, 0.012844112, 0.050941236, -0.035253663, -0.061899442, -0.01614737, 0.01752726, -0.023620276, -0.04961744, -0.06673698, 0.039417736), target); + target = MulAdd(h1, MF4x3(0.042587858, 0.03904053, 0.028782798, -0.09807107, -0.01929461, -0.034585416, 0.029584344, -0.053522006, 0.0068953806, -0.04451219, 0.018451538, -0.029895604), target); + target = MulAdd(i1, MF4x3(-0.0041629653, 0.0070575047, 0.004515914, 0.043267716, 0.0020689464, 0.010954458, -0.0013374113, 0.009222025, -0.0272451, 0.00645634, -0.037133303, -0.03209227), target); + target = MulAdd(a2, MF4x3(-0.010575585, -0.00065620174, -0.009598815, -0.068592854, -0.06461729, -0.05058234, 0.03790364, 0.044340994, 0.049410254, -0.009466368, 0.081484325, 0.07265021), target); + target = MulAdd(b2, MF4x3(-0.01598744, -0.025267042, -0.010857686, 0.0771284, 0.081469566, 0.07138724, -0.00555409, -0.006099002, -0.02123016, -0.29761449, -0.10614364, -0.1027762), target); + target = MulAdd(c2, MF4x3(0.02664693, 0.027294884, 0.019080907, 0.010511018, 0.01179118, 0.02403106, 0.05436632, 0.07234358, 0.08310484, 0.03146414, -0.02122628, -0.021377526), target); + target = MulAdd(d2, MF4x3(0.027889153, 0.018621879, 0.025370836, -0.14017807, -0.14772555, -0.14436993, -0.017539013, -0.028932836, -0.06139342, 0.0007456944, -0.00086823467, -0.05282406), target); + target = MulAdd(e2, MF4x3(-0.0017060362, 0.00777287, 0.003343087, 0.20926197, 0.21706305, 0.23307496, -0.16601992, -0.183019, -0.139133, 0.13933188, -0.013340946, -0.021960167), target); + target = MulAdd(f2, MF4x3(-0.018459205, -0.023415336, -0.0173199, 0.08558963, 0.10207333, 0.06444232, -2.5721886e-06, -0.015806457, -0.036833573, -0.20488425, -0.009690944, 0.020323949), target); + target = MulAdd(g2, MF4x3(0.010601256, 0.007344732, 0.0056538777, 0.021578439, 0.017345639, 0.0032158173, 0.031785835, 0.04436094, 0.05920955, 0.23948166, -0.06085234, -0.14597872), target); + target = MulAdd(h2, MF4x3(0.00777581, 0.012557825, 0.0123206265, -0.0691877, -0.0861206, -0.077578135, -0.018104369, -0.024902673, -0.036656447, 0.10611258, 0.09515675, 0.118361965), target); + target = MulAdd(i2, MF4x3(0.0021278602, 0.003906813, 0.0016891633, -0.06379228, -0.060215514, -0.051921096, 0.039505195, 0.052035928, 0.05059492, -0.047328927, -0.0066980706, 0.09447027), target); + target = MulAdd(a3, MF4x3(0.18920127, -0.045531996, -0.044905778, 0.013732142, 0.019208554, 0.011500921, -0.0040531917, -0.02001873, -0.0023935249, -0.033091005, -0.017751431, -0.009764133), target); + target = MulAdd(b3, MF4x3(0.15241088, -0.13676398, -0.01825122, -0.003517022, -0.004041717, 0.003177141, 0.011362495, 0.03685609, 0.008397426, -0.08597375, -0.111830845, -0.110682696), target); + target = MulAdd(c3, MF4x3(-0.046171717, 0.23827009, -0.119844295, 0.005446854, 0.00826863, 0.002206898, -0.11165099, -0.14702465, -0.1203897, 0.12169146, 0.11585612, 0.10473949), target); + target = MulAdd(d3, MF4x3(-0.18456058, 0.13293917, 0.06901046, 0.010084839, -0.0006403412, -0.011852079, -0.062180433, -0.06781299, -0.08111614, -0.02218764, -0.015271581, -0.019768957), target); + target = MulAdd(e3, MF4x3(0.034135204, -0.20479187, 0.27587336, -0.058966126, -0.065613195, -0.056132246, 0.07697151, 0.0706985, 0.098771244, 0.06747748, 0.10971204, 0.13186967), target); + target = MulAdd(f3, MF4x3(0.017322296, -0.06730298, 0.07034802, 0.013449086, 0.007968637, 0.012679429, 0.0902275, 0.11269024, 0.08805874, -0.06179092, -0.06705483, -0.13040404), target); + target = MulAdd(g3, MF4x3(-0.052505482, -0.018989135, 0.03388015, -0.068704374, -0.05350174, -0.057223134, 0.011537428, 0.017847707, 0.0270268, -0.008713432, -0.02698126, -0.017463546), target); + target = MulAdd(h3, MF4x3(0.15220639, -0.05387876, -0.08352881, 0.026893694, 0.027608246, 0.025959803, 0.035518423, 0.035180617, 0.01858579, -0.021064412, -0.014214504, -0.0051168953), target); + target = MulAdd(i3, MF4x3(-0.11906418, 0.13103563, -0.06997703, 0.005664134, 0.0075536724, 0.009519002, -0.025366528, -0.013528652, -0.015087253, 0.0071858848, -0.027586544, 0.016723866), target); + target = MulAdd(na1, MF4x3(0.015307254, 0.02070064, 0.012568325, 0.06845904, -0.033312738, -0.0058661965, -0.016281582, -0.01631146, -0.021667928, -0.012522515, -0.020992521, -0.015833912), target); + target = MulAdd(nb1, MF4x3(0.04937768, 0.0405066, 0.041023023, 0.05503905, -0.13230717, -0.14439866, 0.01618014, 0.0122084245, 0.016226485, 0.0014116488, 0.011495032, 0.002382562), target); + target = MulAdd(nc1, MF4x3(-0.04847043, -0.050508745, -0.041216835, -0.067119725, -0.0448592, -0.011477939, -0.035635237, -0.037191708, -0.034170575, -0.016549444, -0.027191242, -0.017883684), target); + target = MulAdd(nd1, MF4x3(0.034498286, 0.026938718, 0.052970096, -0.10511612, -0.13200648, -0.09493861, -0.0018118658, -0.0072637545, 0.0043198126, -0.038338073, -0.031448375, -0.035546694), target); + target = MulAdd(ne1, MF4x3(0.048043568, 0.057704087, 0.06386534, 0.04542113, 0.20604704, 0.2598609, 0.049180254, 0.064697154, 0.05789202, 0.08370016, 0.08105142, 0.08807082), target); + target = MulAdd(nf1, MF4x3(-0.018156562, 0.008306473, -0.014604633, 0.18912326, 0.024388695, -0.08006485, 0.009333483, 0.011596536, 0.0056475243, 0.027749287, 0.039271932, 0.02655462), target); + target = MulAdd(ng1, MF4x3(-0.030157864, -0.035259083, -0.05771176, -0.22293729, 0.0768592, 0.14670776, -0.013287718, -0.011300663, -0.01670879, -0.009928094, -0.016364388, -0.013879692), target); + target = MulAdd(nh1, MF4x3(-0.013415757, -0.013257486, -0.01940959, 0.014077903, 0.05088362, 0.04006286, -0.0033998038, -0.0062313867, -0.00833104, 0.015246904, 0.017004015, 0.01802002), target); + target = MulAdd(ni1, MF4x3(-0.0016801689, -0.022088053, 0.0031654288, 0.027371893, -0.007083684, -0.10904292, -0.015408179, -0.01793058, -0.010933266, -0.023707654, -0.026440954, -0.025527867), target); + target = MulAdd(na2, MF4x3(0.009003153, 0.0078040734, 0.037757806, 0.054483943, 0.058831017, 0.060899608, -0.011133613, -0.01601666, -0.007977876, -0.07686641, -0.049250316, -0.045481566), target); + target = MulAdd(nb2, MF4x3(0.04344093, 0.07054628, 0.037604738, -0.0914579, -0.105631486, -0.108511426, 0.04426105, 0.0492282, 0.048829302, 0.14961997, 0.16839094, 0.16053638), target); + target = MulAdd(nc2, MF4x3(-0.0032967671, -0.019857304, -0.014145445, -0.013525817, 0.001614058, -0.009782301, -0.044629153, -0.07325184, -0.07655591, -0.08667146, 0.024955297, 0.04591592), target); + target = MulAdd(nd2, MF4x3(0.04816059, 0.030722216, 0.032487474, 0.09684092, 0.10024655, 0.101904154, 0.08137448, 0.092595905, 0.1118598, 0.0796932, 0.009548236, 0.0013610915), target); + target = MulAdd(ne2, MF4x3(-0.17208904, -0.19137467, -0.17717223, -0.10827683, -0.11960323, -0.1204814, -0.030430049, -0.019306151, -0.05230355, -0.021787236, -0.015395303, -0.093210146), target); + target = MulAdd(nf2, MF4x3(0.04527227, 0.057978027, 0.10569097, -0.1015645, -0.12595437, -0.097537845, 0.060087565, 0.09157804, 0.060251515, 0.05170573, 0.042533275, 0.08233745), target); + target = MulAdd(ng2, MF4x3(-0.01908824, 0.0039797956, -0.015060464, 0.008187719, 0.013936167, 0.008152853, -0.02618239, -0.056918032, -0.0504624, -0.083657, 0.02122987, 0.022906482), target); + target = MulAdd(nh2, MF4x3(0.058020473, 0.08750743, 0.032107625, 0.021999976, 0.030119067, 0.03513493, 0.06583862, 0.08137626, 0.09867312, -0.0021064964, -0.1227668, -0.0912879), target); + target = MulAdd(ni2, MF4x3(0.022279112, -0.012710205, -0.0011416139, 0.05606448, 0.066590145, 0.061043978, -0.008292685, -0.019583363, -0.006212003, -0.053282585, -0.029954918, -0.021437356), target); + target = MulAdd(na3, MF4x3(0.019198919, 0.020138288, 0.02048463, -0.012281223, -0.01964347, -0.010557296, 0.00830553, 0.02714052, 0.016606145, -0.0047117253, -0.0060619717, 0.0015284229), target); + target = MulAdd(nb3, MF4x3(-0.01620369, -0.018634152, -0.018486649, -0.0037721654, -0.005256878, -0.0032221128, 0.048627518, 0.033200823, 0.05459796, 0.0064762663, 0.005607537, 0.0014544157), target); + target = MulAdd(nc3, MF4x3(-0.0049319286, -0.003757374, -0.008033526, -0.009529666, -0.01023788, -0.011724289, 0.08779079, 0.11368912, 0.10699827, 0.014564745, 0.017019482, 0.018130492), target); + target = MulAdd(nd3, MF4x3(-0.018128838, -0.020529313, -0.021291668, 0.022232227, 0.032956265, 0.030233478, 0.057042982, 0.052126013, 0.039634123, 0.04395578, 0.042147905, 0.047779605), target); + target = MulAdd(ne3, MF4x3(-0.008916549, -0.011398656, -0.006473247, 0.07594334, 0.07910866, 0.0726948, -0.1670962, -0.17030263, -0.18856722, 0.0067814733, 0.01550948, 0.002108076), target); + target = MulAdd(nf3, MF4x3(-0.0020052418, -0.0015789939, 0.0024248413, -0.018381692, -0.012541983, -0.016114611, -0.054943718, -0.08546223, -0.045788202, -0.02116913, -0.02479526, -0.02281286), target); + target = MulAdd(ng3, MF4x3(0.004089441, 0.004577225, 0.009165186, -0.023352642, -0.03344756, -0.03359231, 0.051127084, 0.055484984, 0.06788994, -0.009284511, -0.0026670755, -0.011205212), target); + target = MulAdd(nh3, MF4x3(-0.008048874, -0.003658728, -0.011127851, 0.0034879802, 0.014905489, 0.016252292, -0.07353042, -0.0754597, -0.09509333, 0.009990113, -0.0003871956, 0.0049740863), target); + target = MulAdd(ni3, MF4x3(0.009073377, 0.006138898, 0.006741848, -0.009877169, -0.019738095, -0.015525384, 0.057441086, 0.06538757, 0.053950094, -0.0011834118, 0.0010558038, 0.004649949), target); - OUTPUT[gxy] = float4(target + INPUT.SampleLevel(sam1, pos, 0).rgb, 1); + OUTPUT[gxy] = MF4(target + INPUT.SampleLevel(sam1, pos, 0).rgb, 1); } diff --git a/src/Magpie.Core/ComputeShader.hlsl b/src/Magpie.Core/ComputeShader.hlsl deleted file mode 100644 index cfe3ea33f..000000000 --- a/src/Magpie.Core/ComputeShader.hlsl +++ /dev/null @@ -1,239 +0,0 @@ -#define MF float -#define MF1 float1 -#define MF2 float2 -#define MF3 float3 -#define MF4 float4 -#define MF1x1 float1x1 -#define MF1x2 float1x2 -#define MF1x3 float1x3 -#define MF1x4 float1x4 -#define MF2x1 float2x1 -#define MF2x2 float2x2 -#define MF2x3 float2x3 -#define MF2x4 float2x4 -#define MF3x1 float3x1 -#define MF3x2 float3x2 -#define MF3x3 float3x3 -#define MF3x4 float3x4 -#define MF4x1 float4x1 -#define MF4x2 float4x2 -#define MF4x3 float4x3 -#define MF4x4 float4x4 - -#define MP_BLOCK_WIDTH 8 -#define MP_BLOCK_HEIGHT 8 -#define MP_NUM_THREADS_X 64 -#define MP_NUM_THREADS_Y 1 -#define MP_NUM_THREADS_Z 1 - -cbuffer __CB1 : register(b0) { - uint2 __inputSize; - uint2 __outputSize; - float2 __inputPt; - float2 __outputPt; - float2 __scale; -}; - -Texture2D tex3 : register(t0); -Texture2D tex4 : register(t1); -Texture2D tex6 : register(t2); -RWTexture2D tex1 : register(u0); -RWTexture2D tex2 : register(u1); -RWTexture2D tex5 : register(u2); -SamplerState sam : register(s0); - -uint __Bfe(uint src, uint off, uint bits) { uint mask = (1u << bits) - 1; return (src >> off) & mask; } -uint __BfiM(uint src, uint ins, uint bits) { uint mask = (1u << bits) - 1; return (ins & mask) | (src & (~mask)); } -uint2 Rmp8x8(uint a) { return uint2(__Bfe(a, 1u, 3u), __BfiM(__Bfe(a, 3u, 3u), a, 1u)); } -uint2 GetInputSize() { return __inputSize; } -float2 GetInputPt() { return __inputPt; } -uint2 GetOutputSize() { return __outputSize; } -float2 GetOutputPt() { return __outputPt; } -float2 GetScale() { return __scale; } -MF2 MulAdd(MF2 x, MF2x2 y, MF2 a) { - MF2 result = a; - result = mad(x.x, y._m00_m01, result); - result = mad(x.y, y._m10_m11, result); - return result; -} -MF3 MulAdd(MF2 x, MF2x3 y, MF3 a) { - MF3 result = a; - result = mad(x.x, y._m00_m01_m02, result); - result = mad(x.y, y._m10_m11_m12, result); - return result; -} -MF4 MulAdd(MF2 x, MF2x4 y, MF4 a) { - MF4 result = a; - result = mad(x.x, y._m00_m01_m02_m03, result); - result = mad(x.y, y._m10_m11_m12_m13, result); - return result; -} -MF2 MulAdd(MF3 x, MF3x2 y, MF2 a) { - MF2 result = a; - result = mad(x.x, y._m00_m01, result); - result = mad(x.y, y._m10_m11, result); - result = mad(x.z, y._m20_m21, result); - return result; -} -MF3 MulAdd(MF3 x, MF3x3 y, MF3 a) { - MF3 result = a; - result = mad(x.x, y._m00_m01_m02, result); - result = mad(x.y, y._m10_m11_m12, result); - result = mad(x.z, y._m20_m21_m22, result); - return result; -} -MF4 MulAdd(MF3 x, MF3x4 y, MF4 a) { - MF4 result = a; - result = mad(x.x, y._m00_m01_m02_m03, result); - result = mad(x.y, y._m10_m11_m12_m13, result); - result = mad(x.z, y._m20_m21_m22_m23, result); - return result; -} -MF2 MulAdd(MF4 x, MF4x2 y, MF2 a) { - MF2 result = a; - result = mad(x.x, y._m00_m01, result); - result = mad(x.y, y._m10_m11, result); - result = mad(x.z, y._m20_m21, result); - result = mad(x.w, y._m30_m31, result); - return result; -} -MF3 MulAdd(MF4 x, MF4x3 y, MF3 a) { - MF3 result = a; - result = mad(x.x, y._m00_m01_m02, result); - result = mad(x.y, y._m10_m11_m12, result); - result = mad(x.z, y._m20_m21_m22, result); - result = mad(x.w, y._m30_m31_m32, result); - return result; -} -MF4 MulAdd(MF4 x, MF4x4 y, MF4 a) { - MF4 result = a; - result = mad(x.x, y._m00_m01_m02_m03, result); - result = mad(x.y, y._m10_m11_m12_m13, result); - result = mad(x.z, y._m20_m21_m22_m23, result); - result = mad(x.w, y._m30_m31_m32_m33, result); - return result; -} - -void Pass5(uint2 blockStart, uint3 threadId) { - uint2 gxy = Rmp8x8(threadId.x) + blockStart; - uint2 inputSize = GetInputSize(); - if (gxy.x >= inputSize.x || gxy.y >= inputSize.y) { - return; - } - - float2 inputPt = GetInputPt(); - float2 pos = (gxy + 0.5f) * inputPt; - - - - - MF4 a1 = tex3.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0); - MF4 b1 = tex3.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); - MF4 c1 = tex3.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); - MF4 d1 = tex3.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); - MF4 e1 = tex3.SampleLevel(sam, pos, 0); - MF4 f1 = tex3.SampleLevel(sam, pos + float2(0, inputPt.y), 0); - MF4 g1 = tex3.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); - MF4 h1 = tex3.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); - MF4 i1 = tex3.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0); - - MF4 a2 = tex4.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0); - MF4 b2 = tex4.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0); - MF4 c2 = tex4.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0); - MF4 d2 = tex4.SampleLevel(sam, pos + float2(0, -inputPt.y), 0); - MF4 e2 = tex4.SampleLevel(sam, pos, 0); - MF4 f2 = tex4.SampleLevel(sam, pos + float2(0, inputPt.y), 0); - MF4 g2 = tex4.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0); - MF4 h2 = tex4.SampleLevel(sam, pos + float2(inputPt.x, 0), 0); - MF4 i2 = tex4.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0); - - MF4 target = { -0.034743138, 0.012946433, -0.082333155, 0.07721756 }; - target = MulAdd(max(a1, 0), MF4x4(-0.06738501, 0.034009207, -0.21538448, 0.14296548, 0.12896985, -0.23526315, -0.08848608, 0.019602662, 0.14937137, 0.11353096, 0.11884168, -0.016765572, 0.030985225, 0.046430565, 0.06614828, -0.19202724), target); - target = MulAdd(max(b1, 0), MF4x4(-0.10326068, 0.11014975, 0.17069744, -0.21474148, 0.16761585, 0.13434832, -0.101021074, 0.006307025, 0.07478008, -0.1060066, 0.035315692, 0.033488914, -0.24906659, 0.06269967, 0.11120735, -0.040928528), target); - target = MulAdd(max(c1, 0), MF4x4(0.09334615, 0.057705753, 0.12213245, -0.06402275, 0.30694544, 0.034585163, 0.20345578, 0.07489286, 0.07483618, -0.14240396, 0.034846418, -0.03811241, 0.010882573, 0.13204294, 0.017563924, -0.047203008), target); - target = MulAdd(max(d1, 0), MF4x4(-0.21673942, -0.024010994, -0.10238504, -0.041160326, 0.06838163, -0.20950818, 0.06526309, -0.079094924, 0.02208821, -0.28130978, 0.086275116, -0.089067616, 0.12133826, -0.062600106, -0.020521903, -0.07654401), target); - target = MulAdd(max(e1, 0), MF4x4(-0.03055029, -0.15683146, -0.20331301, -0.06252028, 0.13350682, 0.20338707, 0.038425338, 0.1581342, -0.27322498, -0.14999662, -0.16681097, 0.0971585, -0.20014858, -0.081635274, -0.0781877, -0.20625232), target); - target = MulAdd(max(f1, 0), MF4x4(0.38375977, -0.019825654, 0.1886721, 0.22616312, 0.3402173, 0.1825304, -0.05531195, 0.30973226, -0.2676023, 0.14413352, 0.021706983, 0.01732799, 0.23466855, -0.13805965, 0.22570935, 0.018103868), target); - target = MulAdd(max(g1, 0), MF4x4(-0.15169825, 0.0270689, -0.2503316, 0.17289825, -0.16437647, 0.039233048, -0.35572487, -0.048393793, 0.19270042, 0.24260359, 0.12041881, -0.0009793913, 0.11656858, 0.11007414, -0.0757491, 0.047933612), target); - target = MulAdd(max(h1, 0), MF4x4(-0.18657999, -0.11252566, -0.05237504, -0.07368097, 0.13882741, -0.13710637, -0.006996468, -0.062354874, 0.23452504, 0.15333645, -0.0022776406, -0.17910439, 0.03629509, -0.16264829, -0.010011833, -0.15313338), target); - target = MulAdd(max(i1, 0), MF4x4(-0.060544558, -0.04913478, -0.061717357, 0.02323648, 0.28739056, -0.07434013, 0.19110644, 0.100050166, 0.0073363045, 0.08185653, -0.024797903, -0.14424153, -0.20838726, 0.16154376, -0.048517212, -0.025453888), target); - target = MulAdd(max(a2, 0), MF4x4(0.14975396, -0.13142908, 0.36210674, -0.054021083, -0.10632155, 0.045697935, -0.18946633, 0.02228141, -0.08919603, 0.09800842, -0.17634438, 0.09512711, -0.03425503, -0.12298555, -0.05354435, -0.17112055), target); - target = MulAdd(max(b2, 0), MF4x4(0.09958265, -0.057276618, -0.16262266, -0.06415915, 0.14579074, -0.36784375, 0.08034197, -0.04537706, 0.005460582, 0.22313322, 0.07382161, 0.014990379, 0.044636846, -0.2811128, -0.22621547, -0.06044004), target); - target = MulAdd(max(c2, 0), MF4x4(0.10569276, -0.03738662, 0.16100396, 0.058593616, -0.048862137, -0.08796426, 0.20101094, -0.11039573, 0.17196764, -0.04601554, 0.008571281, -0.073729075, 0.051433694, -0.051276565, 0.087334655, -0.0360379), target); - target = MulAdd(max(d2, 0), MF4x4(0.011119538, -0.28781965, 0.28637868, -0.1742508, -0.07121849, 0.10379717, 0.012615981, -0.029563965, -0.18678424, 0.05291095, 0.039143506, -0.028248642, -0.014103922, 0.029155696, 0.10433492, 0.16305852), target); - target = MulAdd(max(e2, 0), MF4x4(-0.2231037, -0.13697462, -0.29124337, 0.08519773, 0.15893684, -0.17763218, 0.06950923, 0.34361118, -0.024844287, 0.044008408, -0.033844844, -0.086971916, -0.07884748, 0.2543499, 0.056884114, 0.10068364), target); - target = MulAdd(max(f2, 0), MF4x4(-0.07710048, -0.23218372, 0.04346047, 0.21769643, 0.06473219, -0.18066105, -0.2511205, 0.15309611, 0.04535977, 0.16450433, 0.10846344, 0.0016952346, -0.010874939, 0.28966382, -0.121990964, 0.12956186), target); - target = MulAdd(max(g2, 0), MF4x4(-0.007910202, 0.17766511, 0.14364475, 0.1016258, 0.0051045395, 0.18691733, 0.005813767, -0.0070582186, 0.019418601, -0.1604435, 0.016088275, -0.18265302, -0.15719391, -0.17369832, -0.036745597, -0.19647408), target); - target = MulAdd(max(h2, 0), MF4x4(0.08938396, -0.0073808245, 0.11225727, -0.012303106, 0.096785046, 0.030483445, 0.027719889, -0.052584838, -0.14887555, -0.03422243, 0.12646855, -0.1722482, 0.010239037, 0.06406088, -0.20053658, 0.01964698), target); - target = MulAdd(max(i2, 0), MF4x4(-0.120734036, -0.12450362, -0.06582111, 0.1639675, -0.19787048, -0.08049789, -0.014257596, 0.058436662, -0.0009387449, -0.08698089, -0.017400503, 0.06295286, 0.09890349, -0.057190523, -0.103520766, -0.04207548), target); - target = MulAdd(max(-a1, 0), MF4x4(-0.0118413875, -0.031288836, 0.09749554, -0.012266401, -0.07998591, 0.22615653, -0.06207416, 0.03257896, -0.076378696, -0.079426095, -0.13968349, -0.15423697, -0.1091681, -0.02893125, -0.032659534, -0.063735925), target); - target = MulAdd(max(-b1, 0), MF4x4(0.119372696, 0.013176554, -0.029381052, 0.21919228, 0.045041792, 0.24844484, 0.26363325, 0.08480674, 0.087083444, 0.11984778, -0.088715754, 0.06421046, 0.05225977, -0.05140334, -0.055052705, -0.049854077), target); - target = MulAdd(max(-c1, 0), MF4x4(0.0035781674, 0.0861361, -0.07675145, -0.056479637, 0.16973391, -0.12113791, 0.10729832, -0.03773517, 0.058618728, 0.12148276, 0.17260705, -0.06968724, 0.076358154, -0.15307103, 0.17700425, -0.13467014), target); - target = MulAdd(max(-d1, 0), MF4x4(-0.02752418, -0.06366472, -0.025610954, 0.0013539721, -0.06465272, 0.0806373, -0.07336035, 0.10114861, 0.0041146413, 0.15878421, -0.044668555, -0.12150811, -0.1071482, -0.05086587, 0.18589285, 0.05065092), target); - target = MulAdd(max(-e1, 0), MF4x4(0.07200056, 0.021739854, 0.29476613, -0.08475931, 0.15018553, -0.07886365, 0.36336347, -0.020576432, 0.25866082, -0.059272554, 0.054249667, -0.17822553, 0.1755872, 0.3244387, -0.39173844, 0.33894604), target); - target = MulAdd(max(-f1, 0), MF4x4(-0.11570926, 0.1342677, -0.19511898, 0.0075454637, -0.01890476, -0.14239742, 0.18921931, 0.033990458, 0.31306365, -0.006998358, 0.029190077, -0.005679954, -0.15341778, 0.07766778, -0.25691047, -0.0964161), target); - target = MulAdd(max(-g1, 0), MF4x4(0.019746238, 0.0021332854, -0.00879096, -0.1338671, -0.0001600663, -0.29465106, 0.0867611, -0.114963025, 0.07874301, -0.012734178, -0.11124061, -0.010926616, -0.04941506, -0.07516841, 0.116663, -0.29018974), target); - target = MulAdd(max(-h1, 0), MF4x4(-0.01651721, 0.05955898, 0.023618208, 0.098695934, 0.018553663, -0.054378513, 0.1436929, 0.1693743, -0.27483663, 0.029127488, 0.09619316, -0.06109113, -0.08619361, 0.09315214, -0.02478657, 0.18544984), target); - target = MulAdd(max(-i1, 0), MF4x4(0.09570196, -0.016528936, -0.1559397, 0.14312246, 0.04029428, 0.08773151, -0.043646842, 0.17894371, -0.082413055, 0.0027082344, -0.100171275, 0.01547501, 0.18122818, -0.11933676, 0.26404107, -0.3169703), target); - target = MulAdd(max(-a2, 0), MF4x4(-0.12073344, 0.08683522, -0.09249099, 0.058786053, -0.14480567, -0.121013954, 0.033335857, 0.009353379, -0.055087596, -0.13002734, 0.08890566, 0.05508963, -0.0075715426, -0.15936922, -0.03968994, -0.1690259), target); - target = MulAdd(max(-b2, 0), MF4x4(0.2011206, 0.23898427, 0.23656492, 0.1287573, 0.14850396, 0.40532517, -0.107408255, 0.40119782, 0.099813245, -0.03830304, 0.101520434, -0.026478073, -0.048469637, 0.106440455, 0.056632314, -0.17825997), target); - target = MulAdd(max(-c2, 0), MF4x4(-0.076735444, 0.05965795, -0.0052469415, -0.21785147, 0.11887833, 0.067560315, 0.051149055, 0.23626682, -0.1297049, -0.035512198, 0.20352256, -0.025064934, 0.04958706, 0.0454198, 0.0113334535, 0.0417486), target); - target = MulAdd(max(-d2, 0), MF4x4(-0.09055751, 0.033915352, -0.21836667, 0.22006813, -0.099022895, 0.11720966, -0.15686816, -0.13586599, -0.094427735, -0.08831514, -0.06182928, 0.09213704, -0.03642064, 0.18129414, -0.012926811, 0.12179882), target); - target = MulAdd(max(-e2, 0), MF4x4(0.19389409, 0.09512252, 0.14768016, -0.16623649, -0.031052284, -0.026814984, 0.106168024, -0.2026781, -0.04581419, -0.0016849053, -0.04101923, 0.038959503, -0.011938445, 0.20096186, -0.26666564, 0.4824324), target); - target = MulAdd(max(-f2, 0), MF4x4(0.17727576, 0.07309147, 0.12131863, -0.163096, 0.17225246, 0.26256254, 0.27685758, 0.09094053, 0.029605515, -0.20217367, 0.047564875, 0.043115832, 0.15089568, -0.09670934, 0.24131384, 0.03337442), target); - target = MulAdd(max(-g2, 0), MF4x4(-0.34192136, 0.12063195, -0.31159517, 0.04170889, -0.30147067, -0.21330686, -0.1514457, -0.121126845, 0.04409098, 9.2206596e-05, 0.027680017, 0.03230512, -0.27993527, -0.093485355, 0.07568645, -0.23585452), target); - target = MulAdd(max(-h2, 0), MF4x4(0.0537712, -0.20847629, 0.1740093, -0.013894753, -0.32719997, -0.059484575, -0.006098233, -0.10336451, -0.14706188, -0.07424865, -0.07045905, 0.17093194, -0.22147557, 0.09086218, -0.11033544, -0.05306482), target); - target = MulAdd(max(-i2, 0), MF4x4(0.00489003, -0.11509064, -0.021005848, 0.16637677, -0.089347586, 0.17545725, -0.17313693, 0.13742085, -0.14577347, 0.07951095, -0.092139855, 0.017118992, -0.053472433, 0.079414465, 0.0330263, -0.11189824), target); - tex1[gxy] = target; - - target = MF4(0.08895955, -0.027667087, 0.20500831, 0.00037762933); - target = MulAdd(max(a1, 0), MF4x4(-0.25835788, 0.050451655, -0.1845038, -0.07232528, 0.1323318, 0.26276684, 0.10842882, -0.083056524, 0.17426784, -0.3594826, 0.2728965, 0.08388844, -0.004007842, 0.020535901, -0.051425606, 0.07750436), target); - target = MulAdd(max(b1, 0), MF4x4(-0.11410436, 0.014572361, -0.27057216, -0.023974562, 0.05234827, 0.15328228, -0.17502303, -0.3199359, 0.12188045, -0.095813684, 0.024145132, 0.0856916, -0.027453909, -0.043129764, 0.16971985, 0.021623038), target); - target = MulAdd(max(c1, 0), MF4x4(0.06611095, 0.038625732, -0.13717118, -0.04497733, 0.15213469, 0.04770935, 0.0729271, -0.062052976, 0.004571303, 0.035141192, -0.059409596, 0.044652313, 0.17520894, 0.09665589, -0.1479193, 0.06528058), target); - target = MulAdd(max(d1, 0), MF4x4(-0.1845968, 0.091479465, -0.09394898, -0.13545018, -0.029501775, -0.21426639, 0.09255898, 0.1257644, 0.20256902, 0.06267267, 0.10378081, 0.13494423, 0.058310498, 0.03642236, -0.16268995, -0.048100803), target); - target = MulAdd(max(e1, 0), MF4x4(0.2155119, -0.3683131, 0.049449228, -0.20559964, -0.11761922, -0.2518804, -0.020712897, 0.12895772, -0.07543782, 0.5805017, -0.11301444, -0.038493153, -0.06710986, -0.09321189, 0.108671665, -0.03259695), target); - target = MulAdd(max(f1, 0), MF4x4(0.035307787, 0.108389005, -0.27493554, 0.27029404, 0.25523573, -0.28636125, -0.20766719, -0.008661457, -0.004480811, -0.046390545, -0.16221444, 0.008979624, -0.061375532, 0.035076566, -0.018924266, 0.01380219), target); - target = MulAdd(max(g1, 0), MF4x4(-0.051922515, -0.12463486, -0.10383422, 0.02220095, -0.1573033, 0.13980615, 0.13248625, -0.16803266, -0.0692132, -0.21552645, 0.13744529, 0.23034313, 0.0052666534, 0.028977966, 0.07720251, -0.06477756), target); - target = MulAdd(max(h1, 0), MF4x4(-0.14097473, 0.2770271, -0.172289, -0.03000696, -0.028684044, 0.040578447, -0.2290285, 0.082329154, -0.042402364, -0.20926563, 0.08233207, 0.11862443, -0.07038536, -0.02273004, 0.091550544, -0.065856494), target); - target = MulAdd(max(i1, 0), MF4x4(0.14879914, -0.023923844, -0.23569296, 0.20306346, 0.17502785, 0.28776234, -0.2788995, 0.10012439, -0.05635638, -0.025840463, 0.09222198, 0.118032, 0.08057015, 0.1286071, 0.060189806, -0.052669708), target); - target = MulAdd(max(a2, 0), MF4x4(0.07076086, -0.15111323, -0.07427972, 0.008372168, -0.17791592, -0.16254742, 0.013961132, -0.0944912, -0.23380096, 0.17377278, -0.09683394, 0.019931393, -0.12042098, 0.0016406325, 0.09393333, -0.06882231), target); - target = MulAdd(max(b2, 0), MF4x4(0.21465093, 0.04142968, 0.06840044, -0.37831602, -0.05549571, 0.044905066, -0.07873589, -0.026804, -0.34764197, 0.022487951, -0.077293746, 0.089457795, -0.110094436, 0.24233972, 0.06285107, -0.10851744), target); - target = MulAdd(max(c2, 0), MF4x4(0.093270175, 0.084138945, 0.03938272, 0.063565865, -0.010733802, 0.13554469, -0.06650261, 0.033002816, 0.011187271, -0.12821455, 0.20785914, -0.030438649, -0.124710515, -0.022294303, 0.09732408, 0.057609864), target); - target = MulAdd(max(d2, 0), MF4x4(-0.12833868, 0.021577539, -0.02700365, 0.11799592, -0.03655647, -0.04225167, 0.11049353, -0.16036157, 0.049277548, -0.033842396, 0.10020137, 0.095509745, 0.08060231, -0.09237418, -0.035598125, -0.035926737), target); - target = MulAdd(max(e2, 0), MF4x4(-0.32829186, 0.3492363, 0.030671779, -0.12606762, 0.010437313, 0.2757115, -0.21517593, -0.15800527, -0.12592544, -0.20578934, 0.10444053, 0.12993255, -0.046079267, 0.03834173, -0.19277227, -0.22124454), target); - target = MulAdd(max(f2, 0), MF4x4(-0.052546192, 0.026082167, 0.13831234, 0.10982424, 0.012946818, -0.12439852, 0.10134106, -0.10050398, -0.04472338, -0.14325236, -0.20579574, 0.0044005127, 0.22013672, -0.32955512, 0.12404084, -0.008160738), target); - target = MulAdd(max(g2, 0), MF4x4(-0.10774314, -0.31650826, -0.06601711, 0.19635755, -0.12622592, -0.06396423, 0.13856032, 0.16540553, 0.021387719, 0.23377723, -0.053738154, -0.1000186, -0.08338395, -0.052813534, 0.008122962, 0.13732094), target); - target = MulAdd(max(h2, 0), MF4x4(-0.18270823, 0.06966014, -0.17788303, -0.27303055, -0.077971615, 0.013978423, -0.02039098, 0.12715338, -0.11924171, 0.18900296, -0.085199654, 0.215198, 0.18587974, -0.009749325, 0.0173584, -0.12018259), target); - target = MulAdd(max(i2, 0), MF4x4(0.052129295, -0.107416354, 0.12711766, 0.03708665, -0.14369462, -0.055359814, -0.16639823, -0.045143317, -0.06925672, -0.040696755, 0.01999809, -0.016040625, -0.02484878, 0.07417094, 0.050875198, 0.2145528), target); - target = MulAdd(max(-a1, 0), MF4x4(0.055696912, -0.16680926, -0.021987487, 0.024941636, -0.0927883, 0.022136632, 0.033782948, -0.10646058, -0.14944647, 0.25457275, 0.046682496, -0.022462368, -0.07886781, 0.08165927, 0.06848105, 0.0063734027), target); - target = MulAdd(max(-b1, 0), MF4x4(0.037053242, 0.033215813, 0.18291366, 0.12340375, 0.08491059, -0.28442004, -0.0127422465, -0.039834313, -0.23321372, 0.26676926, -0.05636355, -0.15672484, -0.12891728, -0.15486577, -0.032004442, -0.092745155), target); - target = MulAdd(max(-c1, 0), MF4x4(0.015779478, -0.18457565, 0.24996394, 0.036197674, 0.15694007, 0.15863103, -0.07332398, 0.0016235278, -0.15536517, -0.056062788, 0.14102836, 0.16915025, -0.08001087, 0.07073164, 0.13796777, 0.123867124), target); - target = MulAdd(max(-d1, 0), MF4x4(0.045792986, -0.15135059, -0.1354885, -0.043678258, -0.35655212, 0.51232076, -0.12816145, -0.046569496, -0.014127674, -0.06282611, -0.098873, -0.06359104, -0.0919222, 0.11822437, 0.079254694, 0.00579688), target); - target = MulAdd(max(-e1, 0), MF4x4(-0.15683417, 0.61610246, -0.3024612, 0.12917964, -0.09303367, 0.23612969, -0.40842506, -0.12374661, -0.07572449, -0.2613284, -0.09970177, -0.015227848, 0.106239066, -0.21411185, 0.051998455, -0.1364518), target); - target = MulAdd(max(-f1, 0), MF4x4(0.23850034, -0.14394449, -0.0031468747, -0.2380617, -0.027200876, -0.041352056, -0.01864445, 0.033848196, -0.12064239, -0.110480845, 0.08450956, -0.22328654, 0.17664163, 0.22268307, 0.050886698, -0.17475672), target); - target = MulAdd(max(-g1, 0), MF4x4(-0.17808256, 0.010803805, 0.03315186, 0.033143792, -0.14205995, 0.25039625, -0.08784382, -0.13454252, 0.19576813, 0.10755282, 0.22821628, 0.019456752, -0.0422955, -0.016182603, -0.12066697, 0.0548465), target); - target = MulAdd(max(-h1, 0), MF4x4(0.11563777, -0.257929, 0.0010403778, 0.080267854, -0.0025255163, 0.2855168, -0.060352214, -0.07816255, -0.00090574916, 0.049510725, 0.03720483, 0.059250016, -0.08674136, 0.20522198, -0.28694284, 0.1299507), target); - target = MulAdd(max(-i1, 0), MF4x4(-0.14638457, 0.04063328, 0.03139636, -0.007934521, 0.07689684, -0.09467145, 0.10607347, 0.054510128, 0.003306194, 0.05347124, 0.062762424, -0.041480847, -0.07677865, -0.139573, 0.010972524, 0.21957156), target); - target = MulAdd(max(-a2, 0), MF4x4(-0.026845628, -0.043439507, 0.034738723, 0.07281683, 0.14474197, 0.031586993, -0.22767854, -0.0707655, 0.105201736, -0.28805482, 0.008668302, -0.16329518, 0.06157049, 0.3803886, 0.26345953, -0.011096537), target); - target = MulAdd(max(-b2, 0), MF4x4(-0.23328833, 0.085731484, -0.07755016, 0.33559516, 0.07704345, 0.115106605, -0.24114038, -0.44630137, 0.2726737, -0.32170138, -0.009236524, -0.11666051, 0.0457048, 0.07876708, 0.13134004, -0.035318643), target); - target = MulAdd(max(-c2, 0), MF4x4(-0.05140272, 0.011605703, 0.13899171, -0.05071015, 0.18413687, -0.31413674, -0.13043414, -0.15118152, -0.15326938, -0.10720126, -0.23738635, 0.13481396, 0.25115076, -0.009316611, -0.2584441, -0.14389823), target); - target = MulAdd(max(-d2, 0), MF4x4(-0.039723795, -0.14869407, -0.1692942, 0.026501274, -0.10685166, -0.121267825, -0.08584318, -0.09580693, -0.10626739, -0.068417974, 0.11321909, -0.13664317, 0.061380867, -0.2587898, 0.14850819, 0.008178645), target); - target = MulAdd(max(-e2, 0), MF4x4(0.06912782, 0.24230564, -0.048150286, 0.2203717, -0.17417085, 0.105546735, -0.16648416, -0.0045053074, 0.09764028, 0.37122592, -0.1939995, -0.27899942, -0.088152565, -0.53869057, 0.21676709, -0.08056594), target); - target = MulAdd(max(-f2, 0), MF4x4(0.07651754, 0.03704878, -0.0197015, 0.1660726, 0.07002748, -0.11820414, -0.23360898, 0.1481592, 0.029847002, 0.054057185, 0.013176299, 0.06552942, -0.13865773, -0.20105527, -0.37550658, 0.005769631), target); - target = MulAdd(max(-g2, 0), MF4x4(-0.22697811, -0.17426412, 0.10148018, 0.008134666, 0.10771455, 0.16943407, -0.016319012, -0.40176705, -0.06854668, -0.049045276, 0.20919096, 0.13240765, -0.050125647, 0.14902508, 0.052697595, -0.13817468), target); - target = MulAdd(max(-h2, 0), MF4x4(0.04301619, 0.23184754, -0.023551717, 0.3768405, 0.028999053, 0.06709736, -0.05993663, -0.059861984, 0.15499207, -0.22217415, 0.111131504, -0.09082529, -0.19389243, 0.024621522, -0.15305442, 0.010799284), target); - target = MulAdd(max(-i2, 0), MF4x4(-0.035496738, 0.010802548, -0.028718363, 0.19263634, 0.16900502, -0.16661702, -0.027631328, 0.18309957, -0.015860107, -0.03309961, -0.091390446, 0.14000848, -0.0036591904, 0.47659522, -0.09373507, -0.29020965), target); - tex2[gxy] = target; - - target.rgb = tex6.SampleLevel(sam, pos, 0).rgb; - target.rgb = MulAdd(max(e1, 0), MF4x3(0.03094887, -0.008734403, 0.00042712069, 0.053891554, 0.05837673, 0.06200635, 0.09071558, -0.04202184, -0.046172567, -0.0425916, 0.04905093, 0.020835675), target.rgb); - target.rgb = MulAdd(max(e2, 0), MF4x3(0.096628904, -0.037792254, -0.043241944, -0.011923947, -0.025950424, -0.031381752, -0.060941868, -0.07859433, -0.07535451, -0.026777223, 0.08604982, 0.07829908), target.rgb); - target.rgb = MulAdd(max(-e1, 0), MF4x3(-0.06435972, 0.0036599538, 0.00786578, -0.061972067, -0.05681472, -0.06667608, -0.106890626, 0.007406496, 0.029977169, -0.20519382, -0.044860814, 0.0021225857), target.rgb); - target.rgb = MulAdd(max(-e2, 0), MF4x3(-0.16876474, 0.012789643, 0.026692612, 0.017817136, 0.026935097, 0.02227043, 0.01690181, 0.07716103, 0.086527, 0.07923805, -0.10443151, -0.10859543), target.rgb); - tex5[gxy] = MF4(target.rgb, 1); -} - -[numthreads(64, 1, 1)] -void main(uint3 tid : SV_GroupThreadID, uint3 gid : SV_GroupID) { - Pass5((gid.xy << 3), tid); -}