diff --git a/src/plugins/intel_cpu/src/graph_optimizer.cpp b/src/plugins/intel_cpu/src/graph_optimizer.cpp
index 6b7f4042c589ec..765b6a1ddd1629 100644
--- a/src/plugins/intel_cpu/src/graph_optimizer.cpp
+++ b/src/plugins/intel_cpu/src/graph_optimizer.cpp
@@ -91,6 +91,10 @@ void GraphOptimizer::ApplyCommonGraphOptimizations(Graph &graph) {
     FuseFCAndConvertOnWeights(graph);
     graph.RemoveDroppedNodes();
 
+    OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseFCAndTransposeOnWeights");
+    FuseFCAndTransposeOnWeights(graph);
+    graph.RemoveDroppedNodes();
+
     OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseDeconvolutionAndSimpleOperation");
     FuseDeconvolutionAndSimpleOperation(graph);
     graph.RemoveDroppedNodes();
@@ -792,13 +796,13 @@ void GraphOptimizer::MergeConvertAndScaleShift(Graph& graph) {
 }
 
 void GraphOptimizer::FuseFCAndConvertOnWeights(Graph& graph) {
-    // This optimization fuses Convert (fp16 -> bf16/fp32) on weights directly to FC input to allow precision conversion handling based on internal logic
+    // This optimization fuses Convert (fp16/u8 -> bf16/fp32) on weights directly to FC input to allow precision conversion handling based on internal logic
     // (e.g. fuse conversion with weights reordering)
     auto& graphNodes = graph.GetNodes();
 
     for (auto parent : graphNodes) {
         if (parent->getType() == Type::Convert && parent->isConstant() && parent->getChildEdgeAt(0)->getChild()->getType() == Type::FullyConnected
-            && parent->getOriginalInputPrecisionAtPort(0) == Precision::FP16
+            && one_of(parent->getOriginalInputPrecisionAtPort(0), Precision::FP16, Precision::U8)
             && one_of(parent->getOriginalOutputPrecisionAtPort(0), Precision::FP32, Precision::BF16)) {
             auto childNode = parent->getChildEdgeAt(0)->getChild();
             // set correct weight precision
@@ -808,6 +812,21 @@ void GraphOptimizer::FuseFCAndConvertOnWeights(Graph& graph) {
     }
 }
 
+void GraphOptimizer::FuseFCAndTransposeOnWeights(Graph& graph) {
+    // This optimization lets us skip executing a constant Transpose on the weights and instead transpose them directly along with the reordering in the FC node
+    auto& graphNodes = graph.GetNodes();
+
+    for (auto parent : graphNodes) {
+        if (parent->getType() == Type::Transpose && parent->isConstant() && parent->getChildEdgeAt(0)->getChild()->getType() == Type::FullyConnected
+            && parent->getOutputShapeAtPort(0).getRank() == 2) {
+            auto fcNode = std::dynamic_pointer_cast<FullyConnected>(parent->getChildEdgeAt(0)->getChild());
+            fcNode->setTransposeWeights(true);
+            auto transposeNode = std::dynamic_pointer_cast<Transpose>(parent);
+            transposeNode->setFakeTranspose(true);
+        }
+    }
+}
+
 void GraphOptimizer::FuseConvolutionAndZeroPoints(Graph &graph) {
     auto& graphNodes = graph.GetNodes();
 
diff --git a/src/plugins/intel_cpu/src/graph_optimizer.h b/src/plugins/intel_cpu/src/graph_optimizer.h
index fe4991deb86fe6..bb6494758f38be 100644
--- a/src/plugins/intel_cpu/src/graph_optimizer.h
+++ b/src/plugins/intel_cpu/src/graph_optimizer.h
@@ -27,6 +27,7 @@ class GraphOptimizer {
     void FuseMultiplyAndAdd(Graph &graph);
     void MergeConvertAndScaleShift(Graph& graph);
     void FuseFCAndConvertOnWeights(Graph& graph);
+    void FuseFCAndTransposeOnWeights(Graph& graph);
     void FuseFullyConnectedAndSimpleOperation(Graph &graph);
     void FuseMatMulAndSimpleOperation(Graph &graph);
     void FuseConvolutionAndSimpleOperationThroughMaxPool(Graph &graph);
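Note on the fusion above: FC weights get reordered into the executor's preferred layout anyway, so a constant 2D Transpose feeding the weights input can be folded into that reorder instead of being executed as a separate node. A minimal standalone sketch of the idea (plain C++, not plugin code): reading the source buffer with swapped strides while copying produces the transposed layout in the same single pass.

```cpp
#include <cstdio>
#include <vector>

int main() {
    const int K = 2, N = 3;
    std::vector<float> w = {1, 2, 3,
                            4, 5, 6};      // row-major [K x N] source
    std::vector<float> wt(N * K);          // destination: [N x K]
    // single pass: the "reorder" (copy) and the transpose happen together
    for (int n = 0; n < N; ++n)
        for (int k = 0; k < K; ++k)
            wt[n * K + k] = w[k * N + n];
    for (int n = 0; n < N; ++n, std::printf("\n"))
        for (int k = 0; k < K; ++k)
            std::printf("%g ", wt[n * K + k]);  // prints: 1 4 / 2 5 / 3 6
}
```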
diff --git a/src/plugins/intel_cpu/src/node.h b/src/plugins/intel_cpu/src/node.h
index 4cfe9c7d708660..e2ceba0d0eadd3 100644
--- a/src/plugins/intel_cpu/src/node.h
+++ b/src/plugins/intel_cpu/src/node.h
@@ -694,6 +694,14 @@ class Node {
     std::shared_ptr<IShapeInfer> shapeInference;
 
+    // we cannot rely on the per-NUMA weightCache for caching weights because:
+    // 1. it may not exist (in a single-stream configuration)
+    // 2. it only holds weak references; the life-cycle of a cached item
+    //    is still controlled by strong references outside of the cache.
+    // privateWeightCache holds strong references to constant weight
+    // copies of the same content with different layouts.
+    std::unordered_map<std::string, MemoryPtr> privateWeightCache;
+
 private:
     std::vector<EdgeWeakPtr> parentEdges;
     std::vector<EdgeWeakPtr> childEdges;
@@ -723,13 +731,6 @@ class Node {
     ConstantType checkConstant(LOOK look, std::vector<NodePtr>& checkNodes);
     // Hold output scales
     std::vector<float> DQScales;
-    // we cannot rely on the per-NUMA weightCache for caching weights because:
-    // 1. it may not exist (in a single-stream configuration)
-    // 2. it only holds weak references; the life-cycle of a cached item
-    //    is still controlled by strong references outside of the cache.
-    // privateWeightCache holds strong references to constant weight
-    // copies of the same content with different layouts.
-    std::unordered_map<std::string, MemoryPtr> privateWeightCache;
 
     CPU_DEBUG_CAP_ENABLE(friend class Verbose);
 };
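The relocated comment is the key design note here: the shared weight cache only stores weak references, so it cannot by itself keep a reordered weight copy alive, and the member moves from the private to the protected section so derived nodes (FullyConnected below) can use it directly. A small self-contained sketch of that life-cycle difference, with made-up names and an int payload standing in for weight memory:

```cpp
#include <cassert>
#include <memory>
#include <string>
#include <unordered_map>

int main() {
    // A weak_ptr cache (like the shared weightCache) cannot keep entries alive:
    std::unordered_map<std::string, std::weak_ptr<int>> sharedCache;
    {
        auto weights = std::make_shared<int>(42);
        sharedCache["fc1_weights"] = weights;
    }                                       // last strong reference dies here
    assert(sharedCache["fc1_weights"].expired());

    // Holding a strong reference alongside keeps the cached copy valid:
    std::unordered_map<std::string, std::shared_ptr<int>> privateCache;
    privateCache["fc1_weights"] = std::make_shared<int>(42);
    assert(privateCache["fc1_weights"] != nullptr);
}
```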
"F" : "T", N, K, ldb, weightPtr, prepackedDst); return _ptr; }; @@ -513,7 +515,11 @@ void FullyConnected::prepareParams() { } if (!prevExecPtr || !execPtr->getWeightDesc()->isCompatible(*(prevExecPtr->getWeightDesc()))) { - primArgs[DNNL_ARG_WEIGHTS] = prepareWeightMemory(execPtr->getWeightDesc())->getPrimitive(); + if (transposeWeights) { + primArgs[DNNL_ARG_WEIGHTS] = prepareTransposedWeightMemory(execPtr->getWeightDesc())->getPrimitive(); + } else { + primArgs[DNNL_ARG_WEIGHTS] = prepareWeightMemory(execPtr->getWeightDesc())->getPrimitive(); + } } // changed shapes may also cause the kernel type changed selected_pd->setImplementationType(execPtr->getImplementationType()); @@ -1106,6 +1112,50 @@ void FullyConnected::fuseDecompressionConstant(const NodePtr& constData, std::ve Precision::FP32, elementsCount); } + +MemoryPtr FullyConnected::prepareTransposedWeightMemory(DnnlMemoryDescPtr weightDesc) { + if (!getParentEdgeAt(1)->getParent()->isConstant()) + IE_THROW() << "Weight input is not const for node " << getName() << "."; + auto edgeMem = getParentEdgeAt(1)->getMemoryPtr(); + if (!edgeMem) + IE_THROW() << "Cannot get const weights edgeMem for node " << getName() << "."; + + auto constDnnlMemOutDesc = edgeMem->getDescWithType(); + auto weightSrcDesc = constDnnlMemOutDesc->getDnnlDesc(); + weightSrcDesc = {weightSrcDesc.get_dims(), weightSrcDesc.get_data_type(), memory::format_tag::ba}; + weightSrcDesc = weightSrcDesc.reshape(weightDesc->getDnnlDesc().get_dims()); + + auto create = [&] () { + auto newSrcDesc = DnnlExtensionUtils::makeDescriptor(weightSrcDesc); + + Memory srcMemory{ getEngine(), newSrcDesc, edgeMem->getData() }; + MemoryPtr _ptr = std::make_shared(getEngine(), weightDesc); + node::Reorder::reorderData(srcMemory, *_ptr, context->getParamsCache()); + + return _ptr; + }; + + MemoryPtr ptr; + const auto& format = weightDesc->serializeFormat(); + auto itr = privateWeightCache.find(format); + if (privateWeightCache.end() != itr) { + ptr = itr->second; + } else { + auto weightCache = context->getWeightsCache(); + if (weightCache != nullptr) { + const std::string string_hash = getName() + "_" + format + + "_" + std::to_string(edgeMem->getSize()) + + "_" + std::to_string(reinterpret_cast(edgeMem->getData())); + + ptr = *weightCache->findOrCreate(string_hash, create); + } else { + ptr = create(); + } + privateWeightCache[format] = ptr; + } + + return ptr; +} } // namespace node } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.h b/src/plugins/intel_cpu/src/nodes/fullyconnected.h index 2f9a11e5b57005..70ace5e4308bdb 100644 --- a/src/plugins/intel_cpu/src/nodes/fullyconnected.h +++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.h @@ -56,6 +56,9 @@ class FullyConnected : public Node { void prepareParams() override; void executeDynamicImpl(dnnl::stream strm) override; bool canBeExecutedInInt8() const override; + void setTransposeWeights(bool transpose) { + transposeWeights = transpose; + } void fuseDecompressionMultiply(const NodePtr& constData); const std::vector& getDecompressionMultiply() const { return decompressionMultiply; } @@ -117,6 +120,11 @@ class FullyConnected : public Node { std::vector decompressionSubtract; std::vector decompressionMultiply; + + // FC with transposed weights + bool transposeWeights = false; + // this method is using to prepare transposed memory + MemoryPtr prepareTransposedWeightMemory(DnnlMemoryDescPtr weightDesc); }; } // namespace node diff --git a/src/plugins/intel_cpu/src/nodes/transpose.cpp 
diff --git a/src/plugins/intel_cpu/src/nodes/transpose.cpp b/src/plugins/intel_cpu/src/nodes/transpose.cpp
index 5bc502410163dc..835115ff337044 100644
--- a/src/plugins/intel_cpu/src/nodes/transpose.cpp
+++ b/src/plugins/intel_cpu/src/nodes/transpose.cpp
@@ -140,7 +140,7 @@ void Transpose::initSupportedPrimitiveDescriptors() {
         config.inConfs[INPUT_ORDER_IDX].constant(isInputOrderConst);
         config.inConfs[INPUT_ORDER_IDX].setMemDesc(creatorsMap.at(LayoutType::ncsp)->createSharedDesc(
             Precision::I32, getInputShapeAtPort(INPUT_ORDER_IDX)));
-        config.outConfs[0].inPlace(-1);
+        config.outConfs[0].inPlace(fakeTranspose ? 0 : -1);
         config.outConfs[0].constant(false);
         transpose_context = std::make_shared<ExecutorContext>(context, getImplPriority());
@@ -287,6 +287,8 @@ void Transpose::createPrimitive() {
 }
 
 void Transpose::execute(dnnl::stream strm) {
+    if (fakeTranspose)
+        return;
     if (prim) {
         prim.execute(strm, primArgs);
     } else if (execPtr) {
diff --git a/src/plugins/intel_cpu/src/nodes/transpose.h b/src/plugins/intel_cpu/src/nodes/transpose.h
index 5fb7e9f76570bf..a09c929d58c7c9 100644
--- a/src/plugins/intel_cpu/src/nodes/transpose.h
+++ b/src/plugins/intel_cpu/src/nodes/transpose.h
@@ -38,6 +38,10 @@ class Transpose : public Node {
     bool needPrepareParams() const override;
     void prepareParams() override;
 
+    void setFakeTranspose(bool fake) {
+        fakeTranspose = fake;
+    }
+
 protected:
     void executeDynamicImpl(dnnl::stream strm) override;
     std::shared_ptr<ExecutorContext> transpose_context;
@@ -56,6 +60,7 @@ class Transpose : public Node {
     static constexpr size_t INPUT_ORDER_IDX = 1lu;
 
     bool performAsReorder = false;
+    bool fakeTranspose = false;
 };
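After these two changes, a Transpose that was fused into the downstream FC becomes a pure pass-through: its output is declared in-place with input 0 and `execute()` returns immediately, so consumers read the original weight buffer while the actual transposition happens during FC weight reordering. A toy illustration of that contract (simplified, hypothetical type, not plugin code):

```cpp
#include <cassert>
#include <vector>

// Toy stand-in for an in-place, no-op node: output memory is the input memory.
struct PassThroughNode {
    const std::vector<float>* input = nullptr;
    const std::vector<float>* output() const { return input; }  // like inPlace(0)
    void execute() { /* no-op, like Transpose::execute with fakeTranspose */ }
};

int main() {
    std::vector<float> weights = {1, 2, 3, 4};
    PassThroughNode node;
    node.input = &weights;
    node.execute();
    assert(node.output() == &weights);  // same buffer, zero copies
}
```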
diff --git a/src/plugins/intel_cpu/tests/functional/subgraph_tests/src/matmul_decompress_convert.cpp b/src/plugins/intel_cpu/tests/functional/subgraph_tests/src/matmul_decompress_convert.cpp
index 81a7def84adaa6..cf6a97e3f7fb71 100644
--- a/src/plugins/intel_cpu/tests/functional/subgraph_tests/src/matmul_decompress_convert.cpp
+++ b/src/plugins/intel_cpu/tests/functional/subgraph_tests/src/matmul_decompress_convert.cpp
@@ -14,6 +14,7 @@ using namespace ov::test;
 
 namespace SubgraphTestsDefinitions {
 
+// The test is extended to cover u8 weights decompression and fusing of a constant Transpose on the weights into FC.
 /* This test checks that the ConvertMatMulToFC transformation should work and the MatMul node is converted to the FC node.
  * The Convert node should be removed on the CPU plugin side.
@@ -49,9 +50,11 @@ namespace SubgraphTestsDefinitions {
 */
 
 using MatMulDecompressConvertParams = std::tuple<
-    std::vector<InputShape>,            // input shapes
-    std::pair<bool, bool>,              // transposeA, transposeB
-    std::map<std::string, std::string>  // additional config
+    std::vector<InputShape>,             // input shapes
+    std::pair<bool, bool>,               // transposeA, transposeB
+    ElementType,                         // weights precision
+    std::map<std::string, std::string>,  // additional config
+    CPUSpecificParams
 >;
 
 class MatMulDecompressConvertTest : public testing::WithParamInterface<MatMulDecompressConvertParams>,
@@ -60,9 +63,11 @@ class MatMulDecompressConvertTest : public testing::WithParamInterface<MatMulDecompressConvertParams>,
     static std::string getTestCaseName(testing::TestParamInfo<MatMulDecompressConvertParams> obj) {
         std::vector<InputShape> inputShapes;
         std::pair<bool, bool> transpose;
+        ElementType weiElemType;
         std::map<std::string, std::string> additionalConfig;
+        CPUSpecificParams cpuParams;
 
-        std::tie(inputShapes, transpose, additionalConfig) = obj.param;
+        std::tie(inputShapes, transpose, weiElemType, additionalConfig, cpuParams) = obj.param;
 
         std::ostringstream result;
         for (const auto& shape : inputShapes) {
@@ -82,12 +87,16 @@ class MatMulDecompressConvertTest : public testing::WithParamInterface<MatMulDecompressConvertParams>,
         result << ")";
+        result << "_weiElemType=" << weiElemType;
+
+        result << CPUTestsBase::getTestCaseName(cpuParams);
 
         return result.str();
     }
 
 protected:
-    void CheckConstFP16() const {
+    void CheckFCWeightsPrecision() const {
         auto getExecValue = [](const ov::Node::RTMap& rtInfo, const std::string &paramName) -> std::string {
             auto it = rtInfo.find(paramName);
             IE_ASSERT(rtInfo.end() != it);
@@ -110,8 +119,8 @@ class MatMulDecompressConvertTest : public testing::WithParamInterface<MatMulDecompressConvertParams>,
         for (const auto &fcNode : execFunction->get_ops()) {
             if (getExecValue(fcNode->get_rt_info(), ExecGraphInfoSerialization::LAYER_TYPE) == "FullyConnected") {
                 const auto &constNode = fcNode->get_input_node_shared_ptr(1);
-                ASSERT_EQ(getExecValue(constNode->get_rt_info(), ExecGraphInfoSerialization::LAYER_TYPE), "Const");
-                ASSERT_EQ(getExecValue(constNode->get_rt_info(), ExecGraphInfoSerialization::OUTPUT_PRECISIONS), "FP16");
+                element::Type expectedType(getExecValue(constNode->get_rt_info(), ExecGraphInfoSerialization::OUTPUT_PRECISIONS));
+                ASSERT_EQ(expectedType, weiConstElemType);
             }
         }
     }
@@ -122,14 +131,19 @@ class MatMulDecompressConvertTest : public testing::WithParamInterface<MatMulDecompressConvertParams>,
     void SetUp() override {
         std::vector<InputShape> inputShapes;
         std::pair<bool, bool> transpose;
        std::map<std::string, std::string> additionalConfig;
+        CPUSpecificParams cpuParams;
 
-        std::tie(inputShapes, transpose, additionalConfig) = this->GetParam();
+        std::tie(inputShapes, transpose, weiConstElemType, additionalConfig, cpuParams) = this->GetParam();
+        std::tie(inFmts, outFmts, priority, selectedType) = cpuParams;
 
         init_input_shapes(inputShapes);
 
         bool transpA = transpose.first;
         bool transpB = transpose.second;
 
+        if (transpA) transposesCount++;
+        if (!transpB) transposesCount++;
+
         if (transpA) {
             transposeShape(inputDynamicShapes[0]);
             for (auto& shapes : targetStaticShapes) {
@@ -148,32 +162,47 @@ class MatMulDecompressConvertTest : public testing::WithParamInterface<MatMulDecompressConvertParams>,
         ElementType netType = ElementType::f32;
+        ElementType convertOutType = ElementType::f32;
         if (additionalConfig[PluginConfigParams::KEY_ENFORCE_BF16] == PluginConfigParams::YES) {
-            inType = outType = netType = ElementType::bf16;
+            convertOutType = inType = outType = netType = ElementType::bf16;
+            weiConstElemType = (weiConstElemType != ElementType::f32) ? weiConstElemType : netType;
         } else {
             inType = outType = netType;
         }
 
         std::string cpuNodeType = "FullyConnected";
+        selectedType = makeSelectedTypeStr(selectedType, outType);
 
         auto paramOuts = helpers::convert2OutputVector(helpers::castOps2Nodes<opset1::Parameter>(params));
-
-        auto matrixB = ngraph::builder::makeConstant(element::f16, inShapeB.get_shape(), {}, true);
-        auto convert = std::make_shared<opset1::Convert>(matrixB, inType);
-        mark_as_decompression(convert);
-        auto matMul = builder::makeMatMul(paramOuts[0], convert, transpA, transpB);
+        std::shared_ptr<Node> inputB = builder::makeConstant(weiConstElemType, inShapeB.get_shape(), {}, true);
+        if (weiConstElemType == ElementType::f16 || weiConstElemType == ElementType::u8) {
+            inputB = std::make_shared<opset1::Convert>(inputB, convertOutType);
+            mark_as_decompression(inputB);
+        }
+        auto matMul = builder::makeMatMul(paramOuts[0], inputB, transpA, transpB);
 
         function = CPUTestsBase::makeNgraphFunction(netType, params, matMul, cpuNodeType);
     }
+
+    void CheckExecutionGraph() {
+        CheckPluginRelatedResults(compiledModel, "FullyConnected");
+        CheckNumberOfNodesWithType(compiledModel, "FullyConnected", 1);
+        CheckNumberOfNodesWithType(compiledModel, "Transpose", transposesCount);
+        CheckNumberOfNodesWithType(compiledModel, "Convert", 0);
+        CheckNumberOfNodesWithType(compiledModel, "Reorder", 0);
+        CheckFCWeightsPrecision();
+    }
+
+    size_t transposesCount = 0;
+    ElementType weiConstElemType = ElementType::f32;
 };
 
 TEST_P(MatMulDecompressConvertTest, CompareWithRefs) {
+    SKIP_IF_CURRENT_TEST_IS_DISABLED();
     run();
-    CheckNumberOfNodesWithType(compiledModel, "FullyConnected", 1);
-    CheckNumberOfNodesWithType(compiledModel, "Convert", 0);
-    CheckNumberOfNodesWithType(compiledModel, "Reorder", 0);
-    CheckConstFP16();
+    CheckExecutionGraph();
 }
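A note on `transposesCount` in `SetUp()` above: it is the number of Transpose nodes expected to remain in the execution graph. `transpose_a` requires a real Transpose on the activations, while `transpose_b == false` means ConvertMatMulToFC must insert a Transpose on the weights, which this patch turns into a no-op node that is nevertheless still counted. Restated as a tiny helper (illustrative only, not part of the test):

```cpp
#include <cstddef>

// Expected number of Transpose nodes left in the execution graph,
// mirroring the counting in SetUp().
std::size_t expectedTransposes(bool transpA, bool transpB) {
    return (transpA ? 1 : 0) + (!transpB ? 1 : 0);
}
```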
 
 namespace {
 
@@ -185,32 +214,148 @@ const std::vector<std::pair<bool, bool>> transposeParams = {
     {true, true},
 };
 
+const std::vector<std::vector<InputShape>> inputShapes2D = {
+    static_shapes_to_test_representation({{2, 3}, {3, 4}}),
+    {
+        {{-1, -1}, {{2, 3}, {5, 3}}},
+        {{3, 4}, {{3, 4}, {3, 4}}}
+    },
+};
+
+const std::vector<std::vector<InputShape>> inputShapes3D = {
+    static_shapes_to_test_representation({{2, 2, 3}, {3, 4}}),
+    static_shapes_to_test_representation({{2, 3}, {1, 3, 4}}),
+    static_shapes_to_test_representation({{1, 2, 3}, {1, 3, 4}}),
+    {
+        {{-1, -1, -1}, {{2, 2, 3}, {3, 5, 3}}},
+        {{3, 4}, {{3, 4}, {3, 4}}}
+    },
+    {
+        {{-1, -1}, {{2, 3}, {5, 3}}},
+        {{1, 3, 4}, {{1, 3, 4}, {1, 3, 4}}}
+    },
+    {
+        {{-1, -1, -1}, {{1, 2, 3}, {1, 5, 3}}},
+        {{1, 3, 4}, {{1, 3, 4}, {1, 3, 4}}}
+    },
+};
+
+
 std::vector<std::map<std::string, std::string>> filterAdditionalConfig() {
     std::vector<std::map<std::string, std::string>> additionalConfig;
+#ifndef OV_CPU_WITH_MLAS
     additionalConfig.push_back(std::map<std::string, std::string>{/* empty config */});
+#endif
+    return additionalConfig;
+}
+
+std::vector<std::map<std::string, std::string>> filterAdditionalConfig_BF16() {
+    std::vector<std::map<std::string, std::string>> additionalConfig;
     if (with_cpu_x86_avx512_core()) {
         additionalConfig.push_back({{PluginConfigParams::KEY_ENFORCE_BF16, PluginConfigParams::YES}});
     }
+    return additionalConfig;
+}
 
+std::vector<std::map<std::string, std::string>> filterAdditionalConfig_MLAS() {
+    std::vector<std::map<std::string, std::string>> additionalConfig;
+    additionalConfig.push_back(std::map<std::string, std::string>{/* empty config */});
     return additionalConfig;
 }
 
+std::vector<CPUSpecificParams> filterSpecificParams() {
+    std::vector<CPUSpecificParams> specificParams;
+    if (with_cpu_x86_avx512_core()) {
+        specificParams.push_back(CPUSpecificParams{{}, {}, {"brgemm_avx512"}, "brgemm_avx512"});
+    } else if (with_cpu_x86_avx2()) {
+        specificParams.push_back(CPUSpecificParams{{}, {}, {"brgemm_avx2"}, "brgemm_avx2"});
+    }
+    return specificParams;
+}
+
+
+std::vector<CPUSpecificParams> filterSpecificParams_BF16() {
+    std::vector<CPUSpecificParams> specificParams;
+    specificParams.push_back(CPUSpecificParams{{}, {}, {"jit_gemm"}, "jit_gemm"});
+    return specificParams;
+}
+
+
+std::vector<CPUSpecificParams> filterSpecificParams_MLAS() {
+    std::vector<CPUSpecificParams> specificParams;
+    specificParams.push_back(CPUSpecificParams{{}, {}, {"gemm_mlas"}, "gemm_mlas"});
+    return specificParams;
+}
+
+
+#ifdef OV_CPU_WITH_MLAS
+const auto testParams2D_MLAS_smoke = ::testing::Combine(
+    ::testing::ValuesIn(inputShapes2D),
+    ::testing::ValuesIn(transposeParams),
+    ::testing::Values(ElementType::f32),
+    ::testing::ValuesIn(filterAdditionalConfig_MLAS()),
+    ::testing::ValuesIn(filterSpecificParams_MLAS()));
+
+INSTANTIATE_TEST_SUITE_P(smoke_FC_2D_MLAS, MatMulDecompressConvertTest, testParams2D_MLAS_smoke,
+                         MatMulDecompressConvertTest::getTestCaseName);
+#endif
+
+
 const auto testParams2D_smoke = ::testing::Combine(
-    ::testing::Values(static_shapes_to_test_representation({{2, 3}, {3, 4}})),
-    ::testing::ValuesIn(transposeParams),
-    ::testing::ValuesIn(filterAdditionalConfig()));
+    ::testing::ValuesIn(inputShapes2D),
+    ::testing::ValuesIn(transposeParams),
+    ::testing::Values(ElementType::f32, ElementType::f16, ElementType::u8),
+    ::testing::ValuesIn(filterAdditionalConfig()),
+    ::testing::ValuesIn(filterSpecificParams()));
 
 INSTANTIATE_TEST_SUITE_P(smoke_FC_2D, MatMulDecompressConvertTest, testParams2D_smoke,
-    MatMulDecompressConvertTest::getTestCaseName);
+                         MatMulDecompressConvertTest::getTestCaseName);
+
+
+const auto testParams2D_BF16_smoke = ::testing::Combine(
+    ::testing::ValuesIn(inputShapes2D),
+    ::testing::ValuesIn(transposeParams),
+    ::testing::Values(ElementType::f32, ElementType::f16, ElementType::u8),
+    ::testing::ValuesIn(filterAdditionalConfig_BF16()),
+    ::testing::ValuesIn(filterSpecificParams_BF16()));
+
+INSTANTIATE_TEST_SUITE_P(smoke_FC_2D_BF16, MatMulDecompressConvertTest, testParams2D_BF16_smoke,
+                         MatMulDecompressConvertTest::getTestCaseName);
+
+
+#ifdef OV_CPU_WITH_MLAS
+const auto testParams3D_MLAS_smoke = ::testing::Combine(
+    ::testing::ValuesIn(inputShapes3D),
+    ::testing::ValuesIn(transposeParams),
+    ::testing::Values(ElementType::f32),
+    ::testing::ValuesIn(filterAdditionalConfig_MLAS()),
+    ::testing::ValuesIn(filterSpecificParams_MLAS()));
+
+INSTANTIATE_TEST_SUITE_P(smoke_FC_3D_MLAS, MatMulDecompressConvertTest, testParams3D_MLAS_smoke,
+                         MatMulDecompressConvertTest::getTestCaseName);
+#endif
+
 
 const auto testParams3D_smoke = ::testing::Combine(
-    ::testing::Values(static_shapes_to_test_representation({{1, 2, 3}, {3, 4}}),
-                      static_shapes_to_test_representation({{2, 3}, {1, 3, 4}})),
-    ::testing::ValuesIn(transposeParams),
-    ::testing::ValuesIn(filterAdditionalConfig()));
+    ::testing::ValuesIn(inputShapes3D),
+    ::testing::ValuesIn(transposeParams),
+    ::testing::Values(ElementType::f32, ElementType::f16, ElementType::u8),
+    ::testing::ValuesIn(filterAdditionalConfig()),
+    ::testing::ValuesIn(filterSpecificParams()));
 
 INSTANTIATE_TEST_SUITE_P(smoke_FC_3D, MatMulDecompressConvertTest, testParams3D_smoke,
-    MatMulDecompressConvertTest::getTestCaseName);
+                         MatMulDecompressConvertTest::getTestCaseName);
+
+
+const auto testParams3D_BF16_smoke = ::testing::Combine(
+    ::testing::ValuesIn(inputShapes3D),
+    ::testing::ValuesIn(transposeParams),
+    ::testing::Values(ElementType::f32, ElementType::f16, ElementType::u8),
+    ::testing::ValuesIn(filterAdditionalConfig_BF16()),
+    ::testing::ValuesIn(filterSpecificParams_BF16()));
+
+INSTANTIATE_TEST_SUITE_P(smoke_FC_3D_BF16, MatMulDecompressConvertTest, testParams3D_BF16_smoke,
+                         MatMulDecompressConvertTest::getTestCaseName);
 
 } // namespace