Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add intrinsics for FEAT_SME_MOP4 #381

Draft
wants to merge 4 commits into
base: main
Choose a base branch
from
Draft
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
314 changes: 314 additions & 0 deletions main/acle.md
Original file line number Diff line number Diff line change
Expand Up @@ -444,6 +444,8 @@ Armv8.4-A [[ARMARMv84]](#ARMARMv84). Support is added for the Dot Product intrin
* Added `svdot[_n_f16_mf8]_fpm` and `svdot[_n_f32_mf8]_fpm`.
* Added Guarded Control Stack (GCS) at
[**Beta**](#current-status-and-anticipated-changes) quality level.
* Added [**Beta**](#current-status-and-anticipated-changes) support
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remember to change to alpha

for quarter-tile outer product intrinsics.

### References

Expand Down Expand Up @@ -2370,6 +2372,17 @@ support for the SME double precision floating-point outer product
(FEAT_SME_F64F64) instructions and if their associated intrinsics are
available. This implies that `__ARM_FEATURE_SME` is nonzero.

#### Quarter-tile outer product intrinsics

The specification for SME is in
[**Beta** state](#current-status-and-anticipated-changes) and may change or be
extended in the future.

`__ARM_FEATURE_SME_MOP4` is defined to `1` if there is hardware
support for the SME quarter-tile outer product(FEAT_SME_MOP4)
instructions and if their associated intrinsics are
available. This implies that `__ARM_FEATURE_SME2` is nonzero.

## Floating-point model

These macros test the floating-point model implemented by the compiler
Expand Down Expand Up @@ -2566,6 +2579,7 @@ be found in [[BA]](#BA).
| [`__ARM_FEATURE_SME_F8F16`](#modal-8-bit-floating-point-extensions) | Modal 8-bit floating-point extensions | 1 |
| [`__ARM_FEATURE_SME_F8F32`](#modal-8-bit-floating-point-extensions) | Modal 8-bit floating-point extensions | 1 |
| [`__ARM_FEATURE_SME_I16I64`](#16-bit-to-64-bit-integer-widening-outer-product-intrinsics) | 16-bit to 64-bit integer widening outer product intrinsics (FEAT_SME_I16I64) | 1 |
| [`__ARM_FEATURE_SME_MOP4`](#quarter-tile-outer-product-intrinsics) | quarter-tile outer product intrinsics (FEAT_SME_MOP4) | 1 |
| [`__ARM_FEATURE_SME_LOCALLY_STREAMING`](#scalable-matrix-extension-sme) | Support for the `arm_locally_streaming` attribute | 1 |
| [`__ARM_FEATURE_SME_LUTv2`](#lookup-table-extensions) | Lookup table extensions (FEAT_SME_LUTv2) | 1 |
| [`__ARM_FEATURE_SSVE_FP8DOT2`](#modal-8-bit-floating-point-extensions) | Modal 8-bit floating-point extensions | 1 |
Expand Down Expand Up @@ -11411,6 +11425,306 @@ Bitwise exclusive NOR population count outer product and accumulate/subtract
__arm_streaming __arm_inout("za");
```

#### FMOP4A (non-FP8), BFMOP4A, SMOP4A, UMOP4A

``` c
// Only if __ARM_FEATURE_SME_MOP4 != 0
// Variants are also available for:
// _za16[_f16_f16] (only if __ARM_FEATURE_SME_F16F16 != 0)
// _za16[_bf16_bf16] (only if __ARM_FEATURE_SME_B16B16 != 0)
// _za32[_f16_f16]
// _za32[_bf16_bf16]
// _za32[_s16_s16]
// _za32[_u16_u16]
// _za32[_s8_s8]
// _za32[_u8_u8]
// _za64[_f64_f64] (only if __ARM_FEATURE_SME_F64F64 != 0)
// _za64[_s16_s16] (only if __ARM_FEATURE_SME_I16I64 != 0)
void svmopa_za32[_f32_f32](uint64_t tile, svfloat32_t zn,
svfloat32_t zm)
__arm_streaming __arm_inout("za");

// Only if __ARM_FEATURE_SME_MOP4 != 0
// Variants are also available for:
// _za16[_f16_f16] (only if __ARM_FEATURE_SME_F16F16 != 0)
// _za16[_bf16_bf16] (only if __ARM_FEATURE_SME_B16B16 != 0)
// _za32[_f16_f16]
// _za32[_bf16_bf16]
// _za32[_s16_s16]
// _za32[_u16_u16]
// _za32[_s8_s8]
// _za32[_u8_u8]
// _za64[_f64_f64] (only if __ARM_FEATURE_SME_F64F64 != 0)
// _za64[_s16_s16] (only if __ARM_FEATURE_SME_I16I64 != 0)
void svmopa_za32[_f32_f32](uint64_t tile, svfloat32x2_t zn,
svfloat32x2_t zm)
__arm_streaming __arm_inout("za");

// Only if __ARM_FEATURE_SME_MOP4 != 0
// Variants are also available for:
// _za16[_f16_f16] (only if __ARM_FEATURE_SME_F16F16 != 0)
// _za16[_bf16_bf16] (only if __ARM_FEATURE_SME_B16B16 != 0)
// _za32[_f16_f16]
// _za32[_bf16_bf16]
// _za32[_s16_s16]
// _za32[_u16_u16]
// _za32[_s8_s8]
// _za32[_u8_u8]
// _za64[_f64_f64] (only if __ARM_FEATURE_SME_F64F64 != 0)
// _za64[_s16_s16] (only if __ARM_FEATURE_SME_I16I64 != 0)
void svmopa[_single]_za32[_f32_f32](uint64_t tile, svfloat32x2_t zn,
svfloat32_t zm)
__arm_streaming __arm_inout("za");

// Only if __ARM_FEATURE_SME_MOP4 != 0
// Variants are also available for:
// _za16[_f16_f16] (only if __ARM_FEATURE_SME_F16F16 != 0)
// _za16[_bf16_bf16] (only if __ARM_FEATURE_SME_B16B16 != 0)
// _za32[_f16_f16]
// _za32[_bf16_bf16]
// _za32[_s16_s16]
// _za32[_u16_u16]
// _za32[_s8_s8]
// _za32[_u8_u8]
// _za64[_f64_f64] (only if __ARM_FEATURE_SME_F64F64 != 0)
// _za64[_s16_s16] (only if __ARM_FEATURE_SME_I16I64 != 0)
void svmopa[_single]_za32[_f32_f32](uint64_t tile, svfloat32_t zn,
svfloat32x2_t zm)
__arm_streaming __arm_inout("za");
```

#### FMOP4S (non-FP8), BFMOP4S, SMOP4S, UMOP4S

``` c
// Only if __ARM_FEATURE_SME_MOP4 != 0
// Variants are also available for:
// _za16[_f16_f16] (only if __ARM_FEATURE_SME_F16F16 != 0)
// _za16[_bf16_bf16] (only if __ARM_FEATURE_SME_B16B16 != 0)
// _za32[_f16_f16]
// _za32[_bf16_bf16]
// _za32[_s16_s16]
// _za32[_u16_u16]
// _za32[_s8_s8]
// _za32[_u8_u8]
// _za64[_f64_f64] (only if __ARM_FEATURE_SME_F64F64 != 0)
// _za64[_s16_s16] (only if __ARM_FEATURE_SME_I16I64 != 0)
void svmops_za32[_f32_f32](uint64_t tile, svfloat32_t zn,
svfloat32_t zm)
__arm_streaming __arm_inout("za");

// Only if __ARM_FEATURE_SME_MOP4 != 0
// Variants are also available for:
// _za16[_f16_f16] (only if __ARM_FEATURE_SME_F16F16 != 0)
// _za16[_bf16_bf16] (only if __ARM_FEATURE_SME_B16B16 != 0)
// _za32[_f16_f16]
// _za32[_bf16_bf16]
// _za32[_s16_s16]
// _za32[_u16_u16]
// _za32[_s8_s8]
// _za32[_u8_u8]
// _za64[_f64_f64] (only if __ARM_FEATURE_SME_F64F64 != 0)
// _za64[_s16_s16] (only if __ARM_FEATURE_SME_I16I64 != 0)
void svmops_za32[_f32_f32](uint64_t tile, svfloat32x2_t zn,
svfloat32x2_t zm)
__arm_streaming __arm_inout("za");

// Only if __ARM_FEATURE_SME_MOP4 != 0
// Variants are also available for:
// _za16[_f16_f16] (only if __ARM_FEATURE_SME_F16F16 != 0)
// _za16[_bf16_bf16] (only if __ARM_FEATURE_SME_B16B16 != 0)
// _za32[_f16_f16]
// _za32[_bf16_bf16]
// _za32[_s16_s16]
// _za32[_u16_u16]
// _za32[_s8_s8]
// _za32[_u8_u8]
// _za64[_f64_f64] (only if __ARM_FEATURE_SME_F64F64 != 0)
// _za64[_s16_s16] (only if __ARM_FEATURE_SME_I16I64 != 0)
void svmops[_single]_za32[_f32_f32](uint64_t tile, svfloat32x2_t zn,
svfloat32_t zm)
__arm_streaming __arm_inout("za");

// Only if __ARM_FEATURE_SME_MOP4 != 0
// Variants are also available for:
// _za16[_f16_f16] (only if __ARM_FEATURE_SME_F16F16 != 0)
// _za16[_bf16_bf16] (only if __ARM_FEATURE_SME_B16B16 != 0)
// _za32[_f16_f16]
// _za32[_bf16_bf16]
// _za32[_s16_s16]
// _za32[_u16_u16]
// _za32[_s8_s8]
// _za32[_u8_u8]
// _za64[_f64_f64] (only if __ARM_FEATURE_SME_F64F64 != 0)
// _za64[_s16_s16] (only if __ARM_FEATURE_SME_I16I64 != 0)
void svmops[_single]_za32[_f32_f32](uint64_t tile, svfloat32_t zn,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Line 11543 should be different from 11559.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not sure what this comment means. The lines reffered are not about mop4a intrinsics ?

svfloat32x2_t zm)
__arm_streaming __arm_inout("za");
```

#### SUMOP4A

``` c
// Only if __ARM_FEATURE_SME_MOP4 != 0
// Variants are also available for:
// _za64[_s16_u16] (only if __ARM_FEATURE_SME_I16I64 != 0)
void svmopa_za32[_s8_u8](uint64_t tile, svint8_t zn,
svuint8_t zm)
__arm_streaming __arm_inout("za");

// Only if __ARM_FEATURE_SME_MOP4 != 0
// Variants are also available for:
// _za64[_s16_u16] (only if __ARM_FEATURE_SME_I16I64 != 0)
void svmopa_za32[_s8_u8](uint64_t tile, svint8x2_t zn,
svuint8x2_t zm)
__arm_streaming __arm_inout("za");

// Only if __ARM_FEATURE_SME_MOP4 != 0
// Variants are also available for:
// _za64[_s16_u16] (only if __ARM_FEATURE_SME_I16I64 != 0)
void svmopa[_single]_za32[_s8_u8](uint64_t tile, svint8x2_t zn,
svuint8__t zm)
__arm_streaming __arm_inout("za");

// Only if __ARM_FEATURE_SME_MOP4 != 0
// Variants are also available for:
// _za64[_s16_u16] (only if __ARM_FEATURE_SME_I16I64 != 0)
void svmopa[_single]_za32[_s8_u8](uint64_t tile, svint8_t zn,
svuint8x2_t zm)
__arm_streaming __arm_inout("za");
```

#### SUMOP4S

``` c
// Only if __ARM_FEATURE_SME_MOP4 != 0
// Variants are also available for:
// _za64[_s16_u16] (only if __ARM_FEATURE_SME_I16I64 != 0)
void svmops_za32[_s8_u8](uint64_t tile, svint8_t zn,
svuint8_t zm)
__arm_streaming __arm_inout("za");

// Only if __ARM_FEATURE_SME_MOP4 != 0
// Variants are also available for:
// _za64[_s16_u16] (only if __ARM_FEATURE_SME_I16I64 != 0)
void svmops_za32[_s8_u8](uint64_t tile, svint8x2_t zn,
svuint8x2_t zm)
__arm_streaming __arm_inout("za");

// Only if __ARM_FEATURE_SME_MOP4 != 0
// Variants are also available for:
// _za64[_s16_u16] (only if __ARM_FEATURE_SME_I16I64 != 0)
void svmops[_single]_za32[_s8_u8](uint64_t tile, svint8x2_t zn,
svuint8__t zm)
__arm_streaming __arm_inout("za");

// Only if __ARM_FEATURE_SME_MOP4 != 0
// Variants are also available for:
// _za64[_s16_u16] (only if __ARM_FEATURE_SME_I16I64 != 0)
void svmops[_single]_za32[_s8_u8](uint64_t tile, svint8_t zn,
svuint8x2_t zm)
__arm_streaming __arm_inout("za");
```

#### USMOP4A

``` c
// Only if __ARM_FEATURE_SME_MOP4 != 0
// Variants are also available for:
// _za64[_u16_s16] (only if __ARM_FEATURE_SME_I16I64 != 0)
void svmopa_za32[_u8_s8](uint64_t tile, svuint8_t zn,
svint8_t zm)
__arm_streaming __arm_inout("za");

// Only if __ARM_FEATURE_SME_MOP4 != 0
// Variants are also available for:
// _za64[_u16_s16] (only if __ARM_FEATURE_SME_I16I64 != 0)
void svmopa_za32[_u8_s8](uint64_t tile, svuint8x2_t zn,
svint8x2_t zm)
__arm_streaming __arm_inout("za");

// Only if __ARM_FEATURE_SME_MOP4 != 0
// Variants are also available for:
// _za64[_u16_s16] (only if __ARM_FEATURE_SME_I16I64 != 0)
void svmopa[_single]_za32[_u8_s8](uint64_t tile, svuint8x2_t zn,
svint8__t zm)
__arm_streaming __arm_inout("za");

// Only if __ARM_FEATURE_SME_MOP4 != 0
// Variants are also available for:
// _za64[_u16_s16] (only if __ARM_FEATURE_SME_I16I64 != 0)
void svmopa[_single]_za32[_u8_s8](uint64_t tile, svuint8_t zn,
svint8x2_t zm)
__arm_streaming __arm_inout("za");
```

#### USMOP4S

``` c
// Only if __ARM_FEATURE_SME_MOP4 != 0
// Variants are also available for:
// _za64[_u16_s16] (only if __ARM_FEATURE_SME_I16I64 != 0)
void svmops_za32[_u8_s8](uint64_t tile, svuint8_t zn,
svint8_t zm)
__arm_streaming __arm_inout("za");

// Only if __ARM_FEATURE_SME_MOP4 != 0
// Variants are also available for:
// _za64[_u16_s16] (only if __ARM_FEATURE_SME_I16I64 != 0)
void svmops_za32[_u8_s8](uint64_t tile, svuint8x2_t zn,
svint8x2_t zm)
__arm_streaming __arm_inout("za");

// Only if __ARM_FEATURE_SME_MOP4 != 0
// Variants are also available for:
// _za64[_u16_s16] (only if __ARM_FEATURE_SME_I16I64 != 0)
void svmops[_single]_za32[_u8_s8](uint64_t tile, svuint8x2_t zn,
svint8__t zm)
__arm_streaming __arm_inout("za");

// Only if __ARM_FEATURE_SME_MOP4 != 0
// Variants are also available for:
// _za64[_u16_s16] (only if __ARM_FEATURE_SME_I16I64 != 0)
void svmops[_single]_za32[_u8_s8](uint64_t tile, svuint8_t zn,
svint8x2_t zm)
__arm_streaming __arm_inout("za");
```

#### FMOP4A (FP8)

``` c
// Only if __ARM_FEATURE_SME_MOP4 != 0
// Variants are also available for:
// _za16[_f8] (only if __ARM_FEATURE_SME_F8F16 != 0)
// _za32[_f8] (only if __ARM_FEATURE_SME_F8F32 != 0)
void svmopa_za32[_f8](uint64_t tile, svmfloat8_t zn,
svmfloat8_t zm, fpm_t fpm)
__arm_streaming __arm_inout("za");

// Only if __ARM_FEATURE_SME_MOP4 != 0
// Variants are also available for:
// _za16[_f8] (only if __ARM_FEATURE_SME_F8F16 != 0)
// _za32[_f8] (only if __ARM_FEATURE_SME_F8F32 != 0)
void svmopa_za32[_f8](uint64_t tile, svmfloat8x2_t zn,
svmfloat8x2_t zm, fpm_t fpm)
__arm_streaming __arm_inout("za");

// Only if __ARM_FEATURE_SME_MOP4 != 0
// Variants are also available for:
// _za16[_f8] (only if __ARM_FEATURE_SME_F8F16 != 0)
// _za32[_f8] (only if __ARM_FEATURE_SME_F8F32 != 0)
void svmopa[_single]_za32[_f8](uint64_t tile, svmfloat8x2_t zn,
svmfloat8_t zm, fpm_t fpm)
__arm_streaming __arm_inout("za");

// Only if __ARM_FEATURE_SME_MOP4 != 0
// Variants are also available for:
// _za16[_f8] (only if __ARM_FEATURE_SME_F8F16 != 0)
// _za32[_f8] (only if __ARM_FEATURE_SME_F8F32 != 0)
void svmopa[_single]_za32[_f8](uint64_t tile, svmfloat8_t zn,
svmfloat8x2_t zm, fpm_t fpm)
__arm_streaming __arm_inout("za");
```

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: remove that

#### BFMLA, BFMLS, FMLA, FMLS (single)

Multi-vector floating-point fused multiply-add/subtract
Expand Down
Loading