From f87ead3d882cfae96d16564e9f6de2ac44e30257 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=B0=E9=98=85?= <43716063+Baiyuetribe@users.noreply.github.com> Date: Sat, 11 Jan 2025 22:19:30 +0800 Subject: [PATCH 01/14] init flip --- docs/developer-guide/operators.md | 2307 ++++++++++++----------- src/CMakeLists.txt | 1 + src/layer/flip.cpp | 41 + src/layer/flip.h | 37 + tools/pnnx/src/CMakeLists.txt | 1 + tools/pnnx/src/pass_ncnn/torch_flip.cpp | 56 + 6 files changed, 1349 insertions(+), 1094 deletions(-) create mode 100644 src/layer/flip.cpp create mode 100644 src/layer/flip.h create mode 100644 tools/pnnx/src/pass_ncnn/torch_flip.cpp diff --git a/docs/developer-guide/operators.md b/docs/developer-guide/operators.md index 10fe1f03f0f..745043e4789 100644 --- a/docs/developer-guide/operators.md +++ b/docs/developer-guide/operators.md @@ -1,168 +1,177 @@ - -* [AbsVal](#absval) -* [ArgMax](#argmax) -* [BatchNorm](#batchnorm) -* [Bias](#bias) -* [BinaryOp](#binaryop) -* [BNLL](#bnll) -* [Cast](#cast) -* [CELU](#celu) -* [Clip](#clip) -* [Concat](#concat) -* [Convolution](#convolution) -* [Convolution1D](#convolution1d) -* [Convolution3D](#convolution3d) -* [ConvolutionDepthWise](#convolutiondepthwise) -* [ConvolutionDepthWise1D](#convolutiondepthwise1d) -* [ConvolutionDepthWise3D](#convolutiondepthwise3d) -* [CopyTo](#copyto) -* [Crop](#crop) -* [CumulativeSum](#cumulativesum) -* [Deconvolution](#deconvolution) -* [Deconvolution1D](#deconvolution1d) -* [Deconvolution3D](#deconvolution3d) -* [DeconvolutionDepthWise](#deconvolutiondepthwise) -* [DeconvolutionDepthWise1D](#deconvolutiondepthwise1d) -* [DeconvolutionDepthWise3D](#deconvolutiondepthwise3d) -* [DeformableConv2D](#deformableconv2d) -* [Dequantize](#dequantize) -* [Diag](#diag) -* [Dropout](#dropout) -* [Eltwise](#eltwise) -* [ELU](#elu) -* [Embed](#embed) -* [Exp](#exp) -* [Flatten](#flatten) -* [Fold](#fold) -* [GELU](#gelu) -* [GLU](#glu) -* [Gemm](#gemm) -* [GridSample](#gridsample) -* [GroupNorm](#groupnorm) -* [GRU](#gru) -* [HardSigmoid](#hardsigmoid) -* [HardSwish](#hardswish) -* [InnerProduct](#innerproduct) -* [Input](#input) -* [InstanceNorm](#instancenorm) -* [Interp](#interp) -* [InverseSpectrogram](#inversespectrogram) -* [LayerNorm](#layernorm) -* [Log](#log) -* [LRN](#lrn) -* [LSTM](#lstm) -* [MemoryData](#memorydata) -* [Mish](#mish) -* [MultiHeadAttention](#multiheadattention) -* [MVN](#mvn) -* [Noop](#noop) -* [Normalize](#normalize) -* [Packing](#packing) -* [Padding](#padding) -* [Permute](#permute) -* [PixelShuffle](#pixelshuffle) -* [Pooling](#pooling) -* [Pooling1D](#pooling1d) -* [Pooling3D](#pooling3d) -* [Power](#power) -* [PReLU](#prelu) -* [Quantize](#quantize) -* [Reduction](#reduction) -* [ReLU](#relu) -* [Reorg](#reorg) -* [Requantize](#requantize) -* [Reshape](#reshape) -* [RMSNorm](#rmsnorm) -* [RNN](#rnn) -* [Scale](#scale) -* [SELU](#selu) -* [Shrink](#shrink) -* [ShuffleChannel](#shufflechannel) -* [Sigmoid](#sigmoid) -* [Slice](#slice) -* [Softmax](#softmax) -* [Softplus](#softplus) -* [Spectrogram](#spectrogram) -* [Split](#split) -* [Swish](#swish) -* [TanH](#tanh) -* [Threshold](#threshold) -* [Tile](#tile) -* [UnaryOp](#unaryop) -* [Unfold](#unfold) +- [AbsVal](#absval) +- [ArgMax](#argmax) +- [BatchNorm](#batchnorm) +- [Bias](#bias) +- [BinaryOp](#binaryop) +- [BNLL](#bnll) +- [Cast](#cast) +- [CELU](#celu) +- [Clip](#clip) +- [Concat](#concat) +- [Convolution](#convolution) +- [Convolution1D](#convolution1d) +- [Convolution3D](#convolution3d) +- [ConvolutionDepthWise](#convolutiondepthwise) +- 
[ConvolutionDepthWise1D](#convolutiondepthwise1d) +- [ConvolutionDepthWise3D](#convolutiondepthwise3d) +- [CopyTo](#copyto) +- [Crop](#crop) +- [CumulativeSum](#cumulativesum) +- [Deconvolution](#deconvolution) +- [Deconvolution1D](#deconvolution1d) +- [Deconvolution3D](#deconvolution3d) +- [DeconvolutionDepthWise](#deconvolutiondepthwise) +- [DeconvolutionDepthWise1D](#deconvolutiondepthwise1d) +- [DeconvolutionDepthWise3D](#deconvolutiondepthwise3d) +- [DeformableConv2D](#deformableconv2d) +- [Dequantize](#dequantize) +- [Diag](#diag) +- [Dropout](#dropout) +- [Eltwise](#eltwise) +- [ELU](#elu) +- [Embed](#embed) +- [Exp](#exp) +- [Flatten](#flatten) +- [Flip](#flip) +- [Fold](#fold) +- [GELU](#gelu) +- [GLU](#glu) +- [Gemm](#gemm) +- [GridSample](#gridsample) +- [GroupNorm](#groupnorm) +- [GRU](#gru) +- [HardSigmoid](#hardsigmoid) +- [HardSwish](#hardswish) +- [InnerProduct](#innerproduct) +- [Input](#input) +- [InstanceNorm](#instancenorm) +- [Interp](#interp) +- [InverseSpectrogram](#inversespectrogram) +- [LayerNorm](#layernorm) +- [Log](#log) +- [LRN](#lrn) +- [LSTM](#lstm) +- [MemoryData](#memorydata) +- [Mish](#mish) +- [MultiHeadAttention](#multiheadattention) +- [MVN](#mvn) +- [Noop](#noop) +- [Normalize](#normalize) +- [Packing](#packing) +- [Padding](#padding) +- [Permute](#permute) +- [PixelShuffle](#pixelshuffle) +- [Pooling](#pooling) +- [Pooling1D](#pooling1d) +- [Pooling3D](#pooling3d) +- [Power](#power) +- [PReLU](#prelu) +- [Quantize](#quantize) +- [Reduction](#reduction) +- [ReLU](#relu) +- [Reorg](#reorg) +- [Requantize](#requantize) +- [Reshape](#reshape) +- [RMSNorm](#rmsnorm) +- [RNN](#rnn) +- [Scale](#scale) +- [SELU](#selu) +- [Shrink](#shrink) +- [ShuffleChannel](#shufflechannel) +- [Sigmoid](#sigmoid) +- [Slice](#slice) +- [Softmax](#softmax) +- [Softplus](#softplus) +- [Spectrogram](#spectrogram) +- [Split](#split) +- [Swish](#swish) +- [TanH](#tanh) +- [Threshold](#threshold) +- [Tile](#tile) +- [UnaryOp](#unaryop) +- [Unfold](#unfold) # AbsVal + ``` y = abs(x) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace # ArgMax + ``` y = argmax(x, out_max_val, topk) ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | out_max_val | int | 0 | | -| 1 | topk | int | 1 | | +| param id | name | type | default | description | +| -------- | ----------- | ---- | ------- | ----------- | +| 0 | out_max_val | int | 0 | | +| 1 | topk | int | 1 | | # BatchNorm + ``` y = (x - mean) / sqrt(var + eps) * slope + bias ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | channels | int | 0 | | -| 1 | eps | float | 0.f | | +| param id | name | type | default | description | +| -------- | -------- | ----- | ------- | ----------- | +| 0 | channels | int | 0 | | +| 1 | eps | float | 0.f | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| slope_data | float | [channels] | -| mean_data | float | [channels] | -| var_data | float | [channels] | -| bias_data | float | [channels] | +| weight | type | shape | +| ---------- | ----- | ---------- | +| slope_data | float | [channels] | +| mean_data | float | [channels] | +| var_data | float | [channels] | +| bias_data | float | [channels] | # Bias + ``` y = x + bias ``` -* one_blob_only -* support_inplace +- one_blob_only +- 
support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | bias_data_size| int | 0 | | +| param id | name | type | default | description | +| -------- | -------------- | ---- | ------- | ----------- | +| 0 | bias_data_size | int | 0 | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| bias_data | float | [channels] | +| weight | type | shape | +| --------- | ----- | ---------- | +| bias_data | float | [channels] | # BinaryOp - This operation is used for binary computation, and the calculation rule depends on the [broadcasting rule](https://github.com/Tencent/ncnn/wiki/binaryop-broadcasting). + +This operation is used for binary computation, and the calculation rule depends on the [broadcasting rule](https://github.com/Tencent/ncnn/wiki/binaryop-broadcasting). + ``` C = binaryop(A, B) ``` + if with_scalar = 1: + - one_blob_only - support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | op_type | int | 0 | Operation type as follows | -| 1 | with_scalar | int | 0 | with_scalar=0 B is a matrix, with_scalar=1 B is a scalar | -| 2 | b | float | 0.f | When B is a scalar, B = b | +| param id | name | type | default | description | +| -------- | ----------- | ----- | ------- | -------------------------------------------------------- | +| 0 | op_type | int | 0 | Operation type as follows | +| 1 | with_scalar | int | 0 | with_scalar=0 B is a matrix, with_scalar=1 B is a scalar | +| 2 | b | float | 0.f | When B is a scalar, B = b | Operation type: + - 0 = ADD - 1 = SUB - 2 = MUL @@ -177,28 +186,31 @@ Operation type: - 11 = RATAN2 # BNLL + ``` y = log(1 + e^(-x)) , x > 0 y = log(1 + e^x), x < 0 ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace # Cast + ``` y = cast(x) ``` -* one_blob_only -* support_packing +- one_blob_only +- support_packing -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | type_from | int | 0 | | -| 1 | type_to | int | 0 | | +| param id | name | type | default | description | +| -------- | --------- | ---- | ------- | ----------- | +| 0 | type_from | int | 0 | | +| 1 | type_to | int | 0 | | Element type: + - 0 = auto - 1 = float32 - 2 = float16 @@ -206,293 +218,304 @@ Element type: - 4 = bfloat16 # CELU + ``` if x < 0 y = (exp(x / alpha) - 1.f) * alpha else y = x ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | alpha | float | 1.f | | +| param id | name | type | default | description | +| -------- | ----- | ----- | ------- | ----------- | +| 0 | alpha | float | 1.f | | # Clip + ``` y = clamp(x, min, max) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | min | float | -FLT_MAX | | -| 1 | max | float | FLT_MAX | | +| param id | name | type | default | description | +| -------- | ---- | ----- | -------- | ----------- | +| 0 | min | float | -FLT_MAX | | +| 1 | max | float | FLT_MAX | | # Concat + ``` y = concat(x0, x1, x2, ...) 
by axis ``` -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | axis | int | 0 | | +| param id | name | type | default | description | +| -------- | ---- | ---- | ------- | ----------- | +| 0 | axis | int | 0 | | # Convolution + ``` x2 = pad(x, pads, pad_value) x3 = conv(x2, weight, kernel, stride, dilation) + bias y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 8 | int8_scale_term| int | 0 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | -| 18 | pad_value | float | 0.f | | -| 19 | dynamic_weight| int | 0 | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16/int8 | [kernel_w, kernel_h, num_input, num_output] | -| bias_data | float | [num_output] | -| weight_data_int8_scales| float | [num_output] | -| bottom_blob_int8_scales| float | [1] | -| top_blob_int8_scales| float | [1] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | ---------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 8 | int8_scale_term | int | 0 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 11 | kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 | stride_h | int | stride_w | | +| 14 | pad_top | int | pad_left | | +| 15 | pad_right | int | pad_left | | +| 16 | pad_bottom | int | pad_top | | +| 18 | pad_value | float | 0.f | | +| 19 | dynamic_weight | int | 0 | | + +| weight | type | shape | +| ----------------------- | --------------- | ------------------------------------------- | +| weight_data | float/fp16/int8 | [kernel_w, kernel_h, num_input, num_output] | +| bias_data | float | [num_output] | +| weight_data_int8_scales | float | [num_output] | +| bottom_blob_int8_scales | float | [1] | +| top_blob_int8_scales | float | [1] | # Convolution1D + ``` x2 = pad(x, pads, pad_value) x3 = conv1d(x2, weight, kernel, stride, dilation) + bias y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 15 | pad_right | int | pad_left | | -| 18 | pad_value | float | 0.f | | -| 19 | dynamic_weight| int | 0 | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16/int8 | [kernel_w, 
num_input, num_output] | -| bias_data | float | [num_output] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | -------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 15 | pad_right | int | pad_left | | +| 18 | pad_value | float | 0.f | | +| 19 | dynamic_weight | int | 0 | | + +| weight | type | shape | +| ----------- | --------------- | --------------------------------- | +| weight_data | float/fp16/int8 | [kernel_w, num_input, num_output] | +| bias_data | float | [num_output] | # Convolution3D + ``` x2 = pad(x, pads, pad_value) x3 = conv3d(x2, weight, kernel, stride, dilation) + bias y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | -| 17 | pad_behind | int | pad_front | | -| 18 | pad_value | float | 0.f | | -| 21 | kernel_d | int | kernel_w | | -| 22 | dilation_d | int | dilation_w | | -| 23 | stride_d | int | stride_w | | -| 24 | pad_front | int | pad_left | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16/int8 | [kernel_w, kernel_h, kernel_d, num_input, num_output] | -| bias_data | float | [num_output] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | ---------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 11 | kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 | stride_h | int | stride_w | | +| 14 | pad_top | int | pad_left | | +| 15 | pad_right | int | pad_left | | +| 16 | pad_bottom | int | pad_top | | +| 17 | pad_behind | int | pad_front | | +| 18 | pad_value | float | 0.f | | +| 21 | kernel_d | int | kernel_w | | +| 22 | dilation_d | int | dilation_w | | +| 23 | stride_d | int | stride_w | | +| 24 | pad_front | int | pad_left | | + +| weight | type | shape | +| ----------- | --------------- | ----------------------------------------------------- | +| weight_data | float/fp16/int8 | [kernel_w, kernel_h, kernel_d, num_input, num_output] | +| bias_data | float | [num_output] | # ConvolutionDepthWise + ``` x2 = pad(x, pads, pad_value) x3 = conv(x2, weight, kernel, stride, dilation, group) + bias y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | 
----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 7 | group | int | 1 | | -| 8 | int8_scale_term| int | 0 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | -| 18 | pad_value | float | 0.f | | -| 19 | dynamic_weight| int | 0 | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16/int8 | [kernel_w, kernel_h, num_input / group, num_output / group, group] | -| bias_data | float | [num_output] | -| weight_data_int8_scales| float | [group] | -| bottom_blob_int8_scales| float | [1] | -| top_blob_int8_scales| float | [1] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | ---------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 7 | group | int | 1 | | +| 8 | int8_scale_term | int | 0 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 11 | kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 | stride_h | int | stride_w | | +| 14 | pad_top | int | pad_left | | +| 15 | pad_right | int | pad_left | | +| 16 | pad_bottom | int | pad_top | | +| 18 | pad_value | float | 0.f | | +| 19 | dynamic_weight | int | 0 | | + +| weight | type | shape | +| ----------------------- | --------------- | ------------------------------------------------------------------ | +| weight_data | float/fp16/int8 | [kernel_w, kernel_h, num_input / group, num_output / group, group] | +| bias_data | float | [num_output] | +| weight_data_int8_scales | float | [group] | +| bottom_blob_int8_scales | float | [1] | +| top_blob_int8_scales | float | [1] | # ConvolutionDepthWise1D + ``` x2 = pad(x, pads, pad_value) x3 = conv1d(x2, weight, kernel, stride, dilation, group) + bias y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 7 | group | int | 1 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 15 | pad_right | int | pad_left | | -| 18 | pad_value | float | 0.f | | -| 19 | dynamic_weight| int | 0 | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16/int8 | [kernel_w, num_input / group, num_output / group, group] | -| bias_data | float | [num_output] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | -------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | 
+| 6 | weight_data_size | int | 0 | | +| 7 | group | int | 1 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 15 | pad_right | int | pad_left | | +| 18 | pad_value | float | 0.f | | +| 19 | dynamic_weight | int | 0 | | + +| weight | type | shape | +| ----------- | --------------- | -------------------------------------------------------- | +| weight_data | float/fp16/int8 | [kernel_w, num_input / group, num_output / group, group] | +| bias_data | float | [num_output] | # ConvolutionDepthWise3D + ``` x2 = pad(x, pads, pad_value) x3 = conv3d(x2, weight, kernel, stride, dilation, group) + bias y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 7 | group | int | 1 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | -| 17 | pad_behind | int | pad_front | | -| 18 | pad_value | float | 0.f | | -| 21 | kernel_d | int | kernel_w | | -| 22 | dilation_d | int | dilation_w | | -| 23 | stride_d | int | stride_w | | -| 24 | pad_front | int | pad_left | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16/int8 | [kernel_w, kernel_h, kernel_d, num_input / group, num_output / group, group] | -| bias_data | float | [num_output] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | ---------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 7 | group | int | 1 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 11 | kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 | stride_h | int | stride_w | | +| 14 | pad_top | int | pad_left | | +| 15 | pad_right | int | pad_left | | +| 16 | pad_bottom | int | pad_top | | +| 17 | pad_behind | int | pad_front | | +| 18 | pad_value | float | 0.f | | +| 21 | kernel_d | int | kernel_w | | +| 22 | dilation_d | int | dilation_w | | +| 23 | stride_d | int | stride_w | | +| 24 | pad_front | int | pad_left | | + +| weight | type | shape | +| ----------- | --------------- | ---------------------------------------------------------------------------- | +| weight_data | float/fp16/int8 | [kernel_w, kernel_h, kernel_d, num_input / group, num_output / group, group] | +| bias_data | float | [num_output] | # CopyTo + ``` self[offset] = src ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | woffset | int | 0 | | -| 1 | hoffset | int | 0 | | -| 13 | doffset | int | 0 | | -| 2 | coffset | int | 0 | | -| 9 | starts | array | [ ] | | -| 11 | axes | array | [ ] | | +| param id | name | type | default | description | +| -------- | ------- | ----- | ------- | 
----------- | +| 0 | woffset | int | 0 | | +| 1 | hoffset | int | 0 | | +| 13 | doffset | int | 0 | | +| 2 | coffset | int | 0 | | +| 9 | starts | array | [ ] | | +| 11 | axes | array | [ ] | | # Crop + ``` y = crop(x) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | woffset | int | 0 | | -| 1 | hoffset | int | 0 | | -| 13 | doffset | int | 0 | | -| 2 | coffset | int | 0 | | -| 3 | outw | int | 0 | | -| 4 | outh | int | 0 | | -| 14 | outd | int | 0 | | -| 5 | outc | int | 0 | | -| 6 | woffset2 | int | 0 | | -| 7 | hoffset2 | int | 0 | | -| 15 | doffset2 | int | 0 | | -| 8 | coffset2 | int | 0 | | -| 9 | starts | array | [ ] | | -| 10 | ends | array | [ ] | | -| 11 | axes | array | [ ] | | +- one_blob_only + +| param id | name | type | default | description | +| -------- | -------- | ----- | ------- | ----------- | +| 0 | woffset | int | 0 | | +| 1 | hoffset | int | 0 | | +| 13 | doffset | int | 0 | | +| 2 | coffset | int | 0 | | +| 3 | outw | int | 0 | | +| 4 | outh | int | 0 | | +| 14 | outd | int | 0 | | +| 5 | outc | int | 0 | | +| 6 | woffset2 | int | 0 | | +| 7 | hoffset2 | int | 0 | | +| 15 | doffset2 | int | 0 | | +| 8 | coffset2 | int | 0 | | +| 9 | starts | array | [ ] | | +| 10 | ends | array | [ ] | | +| 11 | axes | array | [ ] | | # CumulativeSum @@ -500,408 +523,433 @@ If axis < 0, we use axis = x.dims + axis It implements https://pytorch.org/docs/stable/generated/torch.cumsum.html -* one_blob_only -* support_inplace - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | axis | int | 0 | | +- one_blob_only +- support_inplace +| param id | name | type | default | description | +| -------- | ---- | ---- | ------- | ----------- | +| 0 | axis | int | 0 | | # Deconvolution + ``` x2 = deconv(x, weight, kernel, stride, dilation) + bias x3 = depad(x2, pads, pad_value) y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | -| 18 | output_pad_right| int | 0 | | -| 19 | output_pad_bottom| int | output_pad_right | | -| 20 | output_w | int | 0 | | -| 21 | output_h | int | output_w | | -| 28 | dynamic_weight| int | 0 | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16 | [kernel_w, kernel_h, num_input, num_output] | -| bias_data | float | [num_output] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | ---------------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 11 | 
kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 | stride_h | int | stride_w | | +| 14 | pad_top | int | pad_left | | +| 15 | pad_right | int | pad_left | | +| 16 | pad_bottom | int | pad_top | | +| 18 | output_pad_right | int | 0 | | +| 19 | output_pad_bottom | int | output_pad_right | | +| 20 | output_w | int | 0 | | +| 21 | output_h | int | output_w | | +| 28 | dynamic_weight | int | 0 | | + +| weight | type | shape | +| ----------- | ---------- | ------------------------------------------- | +| weight_data | float/fp16 | [kernel_w, kernel_h, num_input, num_output] | +| bias_data | float | [num_output] | # Deconvolution1D + ``` x2 = deconv1d(x, weight, kernel, stride, dilation) + bias x3 = depad(x2, pads, pad_value) y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 15 | pad_right | int | pad_left | | -| 18 | output_pad_right| int | 0 | | -| 20 | output_w | int | 0 | | -| 28 | dynamic_weight| int | 0 | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16 | [kernel_w, num_input, num_output] | -| bias_data | float | [num_output] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | -------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 15 | pad_right | int | pad_left | | +| 18 | output_pad_right | int | 0 | | +| 20 | output_w | int | 0 | | +| 28 | dynamic_weight | int | 0 | | + +| weight | type | shape | +| ----------- | ---------- | --------------------------------- | +| weight_data | float/fp16 | [kernel_w, num_input, num_output] | +| bias_data | float | [num_output] | # Deconvolution3D + ``` x2 = deconv3d(x, weight, kernel, stride, dilation) + bias x3 = depad(x2, pads, pad_value) y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | -| 17 | pad_behind | int | pad_front | | -| 18 | output_pad_right| int | 0 | | -| 19 | output_pad_bottom| int | output_pad_right | | -| 20 | output_pad_behind| int | output_pad_right | | -| 21 | kernel_d | int | kernel_w | | -| 22 | dilation_d | int | dilation_w | | -| 23 | stride_d | int | stride_w | | -| 24 | pad_front | int | pad_left | | -| 25 | 
output_w | int | 0 | | -| 26 | output_h | int | output_w | | -| 27 | output_d | int | output_w | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16 | [kernel_w, kernel_h, kernel_d, num_input, num_output] | -| bias_data | float | [num_output] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | ---------------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 11 | kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 | stride_h | int | stride_w | | +| 14 | pad_top | int | pad_left | | +| 15 | pad_right | int | pad_left | | +| 16 | pad_bottom | int | pad_top | | +| 17 | pad_behind | int | pad_front | | +| 18 | output_pad_right | int | 0 | | +| 19 | output_pad_bottom | int | output_pad_right | | +| 20 | output_pad_behind | int | output_pad_right | | +| 21 | kernel_d | int | kernel_w | | +| 22 | dilation_d | int | dilation_w | | +| 23 | stride_d | int | stride_w | | +| 24 | pad_front | int | pad_left | | +| 25 | output_w | int | 0 | | +| 26 | output_h | int | output_w | | +| 27 | output_d | int | output_w | | + +| weight | type | shape | +| ----------- | ---------- | ----------------------------------------------------- | +| weight_data | float/fp16 | [kernel_w, kernel_h, kernel_d, num_input, num_output] | +| bias_data | float | [num_output] | # DeconvolutionDepthWise + ``` x2 = deconv(x, weight, kernel, stride, dilation, group) + bias x3 = depad(x2, pads, pad_value) y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 7 | group | int | 1 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | -| 18 | output_pad_right| int | 0 | | -| 19 | output_pad_bottom| int | output_pad_right | | -| 20 | output_w | int | 0 | | -| 21 | output_h | int | output_w | | -| 28 | dynamic_weight| int | 0 | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16 | [kernel_w, kernel_h, num_input / group, num_output / group, group] | -| bias_data | float | [num_output] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | ---------------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 7 | group | int | 1 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 11 | kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 | stride_h | int | 
stride_w | | +| 14 | pad_top | int | pad_left | | +| 15 | pad_right | int | pad_left | | +| 16 | pad_bottom | int | pad_top | | +| 18 | output_pad_right | int | 0 | | +| 19 | output_pad_bottom | int | output_pad_right | | +| 20 | output_w | int | 0 | | +| 21 | output_h | int | output_w | | +| 28 | dynamic_weight | int | 0 | | + +| weight | type | shape | +| ----------- | ---------- | ------------------------------------------------------------------ | +| weight_data | float/fp16 | [kernel_w, kernel_h, num_input / group, num_output / group, group] | +| bias_data | float | [num_output] | # DeconvolutionDepthWise1D + ``` x2 = deconv1d(x, weight, kernel, stride, dilation, group) + bias x3 = depad(x2, pads, pad_value) y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 7 | group | int | 1 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 15 | pad_right | int | pad_left | | -| 18 | output_pad_right| int | 0 | | -| 20 | output_w | int | 0 | | -| 28 | dynamic_weight| int | 0 | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16 | [kernel_w, num_input / group, num_output / group, group] | -| bias_data | float | [num_output] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | -------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 7 | group | int | 1 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 15 | pad_right | int | pad_left | | +| 18 | output_pad_right | int | 0 | | +| 20 | output_w | int | 0 | | +| 28 | dynamic_weight | int | 0 | | + +| weight | type | shape | +| ----------- | ---------- | -------------------------------------------------------- | +| weight_data | float/fp16 | [kernel_w, num_input / group, num_output / group, group] | +| bias_data | float | [num_output] | # DeconvolutionDepthWise3D + ``` x2 = deconv3d(x, weight, kernel, stride, dilation, group) + bias x3 = depad(x2, pads, pad_value) y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 7 | group | int | 1 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | -| 17 | pad_behind | int | pad_front | | -| 18 | output_pad_right| int | 0 | | -| 19 | output_pad_bottom| int | output_pad_right | | -| 20 | output_pad_behind| int | output_pad_right | | -| 21 | kernel_d | int | kernel_w | | -| 
22 | dilation_d | int | dilation_w | | -| 23 | stride_d | int | stride_w | | -| 24 | pad_front | int | pad_left | | -| 25 | output_w | int | 0 | | -| 26 | output_h | int | output_w | | -| 27 | output_d | int | output_w | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16 | [kernel_w, kernel_h, kernel_d, num_input / group, num_output / group, group] | -| bias_data | float | [num_output] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | ---------------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 7 | group | int | 1 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 11 | kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 | stride_h | int | stride_w | | +| 14 | pad_top | int | pad_left | | +| 15 | pad_right | int | pad_left | | +| 16 | pad_bottom | int | pad_top | | +| 17 | pad_behind | int | pad_front | | +| 18 | output_pad_right | int | 0 | | +| 19 | output_pad_bottom | int | output_pad_right | | +| 20 | output_pad_behind | int | output_pad_right | | +| 21 | kernel_d | int | kernel_w | | +| 22 | dilation_d | int | dilation_w | | +| 23 | stride_d | int | stride_w | | +| 24 | pad_front | int | pad_left | | +| 25 | output_w | int | 0 | | +| 26 | output_h | int | output_w | | +| 27 | output_d | int | output_w | | + +| weight | type | shape | +| ----------- | ---------- | ---------------------------------------------------------------------------- | +| weight_data | float/fp16 | [kernel_w, kernel_h, kernel_d, num_input / group, num_output / group, group] | +| bias_data | float | [num_output] | # DeformableConv2D + ``` x2 = deformableconv2d(x, offset, mask, weight, kernel, stride, dilation) + bias y = activation(x2, act_type, act_params) ``` -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16/int8 | [kernel_w, kernel_h, num_input, num_output] | -| bias_data | float | [num_output] | +| param id | name | type | default | description | +| -------- | ----------------- | ----- | ---------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 11 | kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 | stride_h | int | stride_w | | +| 14 | pad_top | int | pad_left | | +| 15 | pad_right | int | pad_left | | +| 16 | pad_bottom | int | 
pad_top | | + +| weight | type | shape | +| ----------- | --------------- | ------------------------------------------- | +| weight_data | float/fp16/int8 | [kernel_w, kernel_h, num_input, num_output] | +| bias_data | float | [num_output] | # Dequantize + ``` y = x * scale + bias ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | scale_data_size| int | 1 | | -| 1 | bias_data_size| int | 0 | | +| param id | name | type | default | description | +| -------- | --------------- | ---- | ------- | ----------- | +| 0 | scale_data_size | int | 1 | | +| 1 | bias_data_size | int | 0 | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| scale_data | float | [scale_data_size] | -| bias_data | float | [bias_data_size] | +| weight | type | shape | +| ---------- | ----- | ----------------- | +| scale_data | float | [scale_data_size] | +| bias_data | float | [bias_data_size] | # Diag + ``` y = diag(x, diagonal) ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | diagonal | int | 0 | | +| param id | name | type | default | description | +| -------- | -------- | ---- | ------- | ----------- | +| 0 | diagonal | int | 0 | | # Dropout + ``` y = x * scale ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | scale | float | 1.f | | +| param id | name | type | default | description | +| -------- | ----- | ----- | ------- | ----------- | +| 0 | scale | float | 1.f | | # Eltwise + ``` y = elementwise_op(x0, x1, ...) 
``` -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | op_type | int | 0 | | -| 1 | coeffs | array | [ ] | | +| param id | name | type | default | description | +| -------- | ------- | ----- | ------- | ----------- | +| 0 | op_type | int | 0 | | +| 1 | coeffs | array | [ ] | | Operation type: + - 0 = PROD - 1 = SUM - 2 = MAX # ELU + ``` if x < 0 y = (exp(x) - 1) * alpha else y = x ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | alpha | float | 0.1f | | +| param id | name | type | default | description | +| -------- | ----- | ----- | ------- | ----------- | +| 0 | alpha | float | 0.1f | | # Embed + ``` y = embedding(x) ``` -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | input_dim | int | 0 | | -| 2 | bias_term | int | 0 | | -| 3 | weight_data_size | int | 0 | | -| 18 | int8_scale_term| int | 0 | | +| param id | name | type | default | description | +| -------- | ---------------- | ---- | ------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | input_dim | int | 0 | | +| 2 | bias_term | int | 0 | | +| 3 | weight_data_size | int | 0 | | +| 18 | int8_scale_term | int | 0 | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float | [weight_data_size] | -| bias_term | float | [num_output] | -| weight_data_int8_scales| float | [1] | +| weight | type | shape | +| ----------------------- | ----- | ------------------ | +| weight_data | float | [weight_data_size] | +| bias_term | float | [num_output] | +| weight_data_int8_scales | float | [1] | # Exp + ``` if base == -1 y = exp(shift + x * scale) else y = pow(base, (shift + x * scale)) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | base | float | -1.f | | -| 1 | scale | float | 1.f | | -| 2 | shift | float | 0.f | | +| param id | name | type | default | description | +| -------- | ----- | ----- | ------- | ----------- | +| 0 | base | float | -1.f | | +| 1 | scale | float | 1.f | | +| 2 | shift | float | 0.f | | # Flatten + Reshape blob to 1 dimension -* one_blob_only +- one_blob_only + +# Flip + +- one_blob_only + +| param id | name | type | default | description | +| -------- | ---- | ----- | ------- | ----------- | +| 0 | axis | array | [] | | # Fold + ``` y = fold(x) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | -| 20 | output_w | int | 0 | | -| 21 | output_h | int | output_w | | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ---------- | ---- | ---------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | 
int | 1 | | +| 4 | pad_left | int | 0 | | +| 11 | kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 | stride_h | int | stride_w | | +| 14 | pad_top | int | pad_left | | +| 15 | pad_right | int | pad_left | | +| 16 | pad_bottom | int | pad_top | | +| 20 | output_w | int | 0 | | +| 21 | output_h | int | output_w | | # GELU + ``` if fast_gelu == 1 y = 0.5 * x * (1 + tanh(0.79788452 * (x + 0.044715 * x * x * x))); else y = 0.5 * x * erfc(-0.70710678 * x) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | fast_gelu | int | 0 | use approximation | +| param id | name | type | default | description | +| -------- | --------- | ---- | ------- | ----------------- | +| 0 | fast_gelu | int | 0 | use approximation | # GLU @@ -913,13 +961,14 @@ where a is the first half of the input matrix and b is the second half. axis specifies the dimension to split the input -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | axis | int | 0 | | +| param id | name | type | default | description | +| -------- | ---- | ---- | ------- | ----------- | +| 0 | axis | int | 0 | | # Gemm + ``` a = transA ? transpose(x0) : x0 b = transb ? transpose(x1) : x1 @@ -927,88 +976,91 @@ c = x2 y = (gemm(a, b) + c * beta) * alpha ``` -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | alpha | float | 1.f | | -| 1 | beta | float | 1.f | | -| 2 | transA | int | 0 | | -| 3 | transb | int | 0 | | -| 4 | constantA | int | 0 | | -| 5 | constantB | int | 0 | | -| 6 | constantC | int | 0 | | -| 7 | constantM | int | 0 | | -| 8 | constantN | int | 0 | | -| 9 | constantK | int | 0 | | -| 10 | constant_broadcast_type_C | int | 0 | | -| 11 | output_N1M | int | 0 | | -| 12 | output_elempack | int | 0 | | -| 13 | output_elemtype | int | 0 | | -| 14 | output_transpose | int| 0 | | -| 18 | int8_scale_term | int | 0 | | -| 20 | constant_TILE_M | int | 0 | | -| 21 | constant_TILE_N | int | 0 | | -| 22 | constant_TILE_K | int | 0 | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| A_data | float/fp16/int8 | [M, K] or [K, M] | -| B_data | float/fp16/int8 | [N, K] or [K, N] | -| C_data | float | [1], [M] or [N] or [1, M] or [N,1] or [N, M] | -| A_data_int8_scales| float | [M] | -| B_data_int8_scales| float | [1] | +| param id | name | type | default | description | +| -------- | ------------------------- | ----- | ------- | ----------- | +| 0 | alpha | float | 1.f | | +| 1 | beta | float | 1.f | | +| 2 | transA | int | 0 | | +| 3 | transb | int | 0 | | +| 4 | constantA | int | 0 | | +| 5 | constantB | int | 0 | | +| 6 | constantC | int | 0 | | +| 7 | constantM | int | 0 | | +| 8 | constantN | int | 0 | | +| 9 | constantK | int | 0 | | +| 10 | constant_broadcast_type_C | int | 0 | | +| 11 | output_N1M | int | 0 | | +| 12 | output_elempack | int | 0 | | +| 13 | output_elemtype | int | 0 | | +| 14 | output_transpose | int | 0 | | +| 18 | int8_scale_term | int | 0 | | +| 20 | constant_TILE_M | int | 0 | | +| 21 | constant_TILE_N | int | 0 | | +| 22 | constant_TILE_K | int | 0 | | + +| weight | type | shape | +| ------------------ | --------------- | -------------------------------------------- | +| A_data | float/fp16/int8 | [M, K] or [K, M] | +| B_data | 
float/fp16/int8 | [N, K] or [K, N] | +| C_data | float | [1], [M] or [N] or [1, M] or [N,1] or [N, M] | +| A_data_int8_scales | float | [M] | +| B_data_int8_scales | float | [1] | # GridSample + ``` Given an input and a flow-field grid, computes the output using input values and pixel locations from grid. -For each output location output[:, h2, w2], the size-2 vector grid[h2, w2, 2] specifies input pixel[:, h1, w1] locations x and y, +For each output location output[:, h2, w2], the size-2 vector grid[h2, w2, 2] specifies input pixel[:, h1, w1] locations x and y, which are used to interpolate the output value output[:, h2, w2] This function is often used in conjunction with affine_grid() to build Spatial Transformer Networks . ``` -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | sample_type | int | 1 | | -| 1 | padding_mode | int | 1 | | -| 2 | align_corner | int | 0 | | -| 3 | permute_fusion| int | 0 | fuse with permute | - +| param id | name | type | default | description | +| -------- | -------------- | ---- | ------- | ----------------- | +| 0 | sample_type | int | 1 | | +| 1 | padding_mode | int | 1 | | +| 2 | align_corner | int | 0 | | +| 3 | permute_fusion | int | 0 | fuse with permute | Sample type: + - 1 = Nearest - 2 = Bilinear - 3 = Bicubic Padding mode: + - 1 = zeros - 2 = border - 3 = reflection - # GroupNorm + ``` split x along channel axis into group x0, x1 ... l2 normalize for each group x0, x1 ... y = x * gamma + beta ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | group | int | 1 | | -| 1 | channels | int | 0 | | -| 2 | eps | float | 0.001f | x = x / sqrt(var + eps) | -| 3 | affine | int | 1 | | +| param id | name | type | default | description | +| -------- | -------- | ----- | ------- | ----------------------- | +| 0 | group | int | 1 | | +| 1 | channels | int | 0 | | +| 2 | eps | float | 0.001f | x = x / sqrt(var + eps) | +| 3 | affine | int | 1 | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| gamma_data | float | [channels] | -| beta_data | float | [channels] | +| weight | type | shape | +| ---------- | ----- | ---------- | +| gamma_data | float | [channels] | +| beta_data | float | [channels] | # GRU + Apply a single-layer GRU to a feature sequence of `T` timesteps. The input blob shape is `[w=input_size, h=T]` and the output blob shape is `[w=num_output, h=T]`. 
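Below is a minimal C++ sketch of feeding such a sequence blob through the public ncnn API. It is only an illustration: the file names `gru_model.param`/`gru_model.bin` and the blob names `in0`/`out0` are hypothetical placeholders and are not part of this operator definition. The pseudocode fence that follows states the operator semantics.

```
#include "net.h"

int main()
{
    // hypothetical model files whose graph contains a GRU layer
    ncnn::Net net;
    if (net.load_param("gru_model.param"))
        return -1;
    if (net.load_model("gru_model.bin"))
        return -1;

    const int T = 10;         // number of timesteps
    const int input_size = 8; // feature size per timestep

    // input blob shape is [w=input_size, h=T], as described above
    ncnn::Mat x(input_size, T);
    x.fill(0.5f);

    ncnn::Extractor ex = net.create_extractor();
    ex.input("in0", x); // assumed input blob name

    ncnn::Mat y;
    ex.extract("out0", y); // output blob shape is [w=num_output, h=T]

    return 0;
}
```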
``` @@ -1016,134 +1068,143 @@ y = gru(x) y0, hidden y1 = gru(x0, hidden x1) ``` -* one_blob_only if bidirectional +- one_blob_only if bidirectional -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | hidden size of output | -| 1 | weight_data_size| int | 0 | total size of weight matrix | -| 2 | direction | int | 0 | 0=forward, 1=reverse, 2=bidirectional | +| param id | name | type | default | description | +| -------- | ---------------- | ---- | ------- | ------------------------------------- | +| 0 | num_output | int | 0 | hidden size of output | +| 1 | weight_data_size | int | 0 | total size of weight matrix | +| 2 | direction | int | 0 | 0=forward, 1=reverse, 2=bidirectional | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_xc_data| float/fp16/int8 | [input_size, num_output * 3, num_directions] | -| bias_c_data | float/fp16/int8 | [num_output, 4, num_directions] | -| weight_hc_data| float/fp16/int8 | [num_output, num_output * 3, num_directions] | +| weight | type | shape | +| -------------- | --------------- | -------------------------------------------- | +| weight_xc_data | float/fp16/int8 | [input_size, num_output * 3, num_directions] | +| bias_c_data | float/fp16/int8 | [num_output, 4, num_directions] | +| weight_hc_data | float/fp16/int8 | [num_output, num_output * 3, num_directions] | Direction flag: + - 0 = forward only - 1 = reverse only - 2 = bidirectional # HardSigmoid + ``` y = clamp(x * alpha + beta, 0, 1) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | alpha | float | 0.2f | | -| 1 | beta | float | 0.5f | | +| param id | name | type | default | description | +| -------- | ----- | ----- | ------- | ----------- | +| 0 | alpha | float | 0.2f | | +| 1 | beta | float | 0.5f | | # HardSwish + ``` y = x * clamp(x * alpha + beta, 0, 1) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | alpha | float | 0.2f | | -| 1 | beta | float | 0.5f | | +| param id | name | type | default | description | +| -------- | ----- | ----- | ------- | ----------- | +| 0 | alpha | float | 0.2f | | +| 1 | beta | float | 0.5f | | # InnerProduct + ``` x2 = innerproduct(x, weight) + bias y = activation(x2, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | bias_term | int | 0 | | -| 2 | weight_data_size| int | 0 | | -| 8 | int8_scale_term| int | 0 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | +- one_blob_only -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16/int8 | [num_input, num_output] | -| bias_data | float | [num_output] | -| weight_data_int8_scales| float | [num_output] | -| bottom_blob_int8_scales| float | [1] | +| param id | name | type | default | description | +| -------- | ----------------- | ----- | ------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | bias_term | int | 0 | | +| 2 | weight_data_size | int | 0 | | +| 8 | int8_scale_term | int | 0 | | +| 9 | activation_type | int | 0 | | +| 10 | 
activation_params | array | [ ] | | + +| weight | type | shape | +| ----------------------- | --------------- | ----------------------- | +| weight_data | float/fp16/int8 | [num_input, num_output] | +| bias_data | float | [num_output] | +| weight_data_int8_scales | float | [num_output] | +| bottom_blob_int8_scales | float | [1] | # Input + ``` y = input ``` -* support_inplace +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | w | int | 0 | | -| 1 | h | int | 0 | | -| 11 | d | int | 0 | | -| 2 | c | int | 0 | | +| param id | name | type | default | description | +| -------- | ---- | ---- | ------- | ----------- | +| 0 | w | int | 0 | | +| 1 | h | int | 0 | | +| 11 | d | int | 0 | | +| 2 | c | int | 0 | | # InstanceNorm + ``` split x along channel axis into instance x0, x1 ... l2 normalize for each channel instance x0, x1 ... y = x * gamma + beta ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | channels | int | 0 | | -| 1 | eps | float | 0.001f | x = x / sqrt(var + eps) | -| 2 | affine | int | 1 | | +| param id | name | type | default | description | +| -------- | -------- | ----- | ------- | ----------------------- | +| 0 | channels | int | 0 | | +| 1 | eps | float | 0.001f | x = x / sqrt(var + eps) | +| 2 | affine | int | 1 | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| gamma_data | float | [channels] | -| beta_data | float | [channels] | +| weight | type | shape | +| ---------- | ----- | ---------- | +| gamma_data | float | [channels] | +| beta_data | float | [channels] | # Interp + ``` if dynamic_target_size == 0 y = resize(x) by fixed size or scale else y = resize(x0, size(x1)) ``` -* one_blob_only if dynamic_target_size == 0 +- one_blob_only if dynamic_target_size == 0 -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | resize_type | int | 0 | | -| 1 | height_scale | float | 1.f | | -| 2 | width_scale | float | 1.f | | -| 3 | output_height | int | 0 | | -| 4 | output_width | int | 0 | | -| 5 | dynamic_target_size| int | 0 | | -| 6 | align_corner | int | 0 | | +| param id | name | type | default | description | +| -------- | ------------------- | ----- | ------- | ----------- | +| 0 | resize_type | int | 0 | | +| 1 | height_scale | float | 1.f | | +| 2 | width_scale | float | 1.f | | +| 3 | output_height | int | 0 | | +| 4 | output_width | int | 0 | | +| 5 | dynamic_target_size | int | 0 | | +| 6 | align_corner | int | 0 | | Resize type: + - 1 = Nearest - 2 = Bilinear - 3 = Bicubic # InverseSpectrogram + ``` x1 = x as complex x1 = x1 * sqrt(norm) if normalized @@ -1155,77 +1216,82 @@ if returns == 1 return y1 real if returns == 2 return y1 imag ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | n_fft | int | 0 | | -| 1 | returns | int | 1 | | -| 2 | hoplen | int | n_fft / 4 | | -| 3 | winlen | int | n_fft | | -| 4 | window_type | int | 0 | 0=ones 1=hann 2=hamming | -| 5 | center | int | 1 | | -| 7 | normalized | int | 0 | 0=no 1=n_fft 2=window-l2-energy | +| param id | name | type | default | description | +| -------- | ----------- | ---- | --------- | ------------------------------- | +| 0 | n_fft | int | 0 
| | +| 1 | returns | int | 1 | | +| 2 | hoplen | int | n_fft / 4 | | +| 3 | winlen | int | n_fft | | +| 4 | window_type | int | 0 | 0=ones 1=hann 2=hamming | +| 5 | center | int | 1 | | +| 7 | normalized | int | 0 | 0=no 1=n_fft 2=window-l2-energy | # LayerNorm + ``` split x along outmost axis into part x0, x1 ... l2 normalize for each part x0, x1 ... y = x * gamma + beta by elementwise ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | affine_size | int | 0 | | -| 1 | eps | float | 0.001f | x = x / sqrt(var + eps) | -| 2 | affine | int | 1 | | +| param id | name | type | default | description | +| -------- | ----------- | ----- | ------- | ----------------------- | +| 0 | affine_size | int | 0 | | +| 1 | eps | float | 0.001f | x = x / sqrt(var + eps) | +| 2 | affine | int | 1 | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| gamma_data | float | [affine_size] | -| beta_data | float | [affine_size] | +| weight | type | shape | +| ---------- | ----- | ------------- | +| gamma_data | float | [affine_size] | +| beta_data | float | [affine_size] | # Log + ``` if base == -1 y = log(shift + x * scale) else y = log(shift + x * scale) / log(base) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | base | float | -1.f | | -| 1 | scale | float | 1.f | | -| 2 | shift | float | 0.f | | +| param id | name | type | default | description | +| -------- | ----- | ----- | ------- | ----------- | +| 0 | base | float | -1.f | | +| 1 | scale | float | 1.f | | +| 2 | shift | float | 0.f | | # LRN + ``` if region_type == ACROSS_CHANNELS square_sum = sum of channel window of local_size if region_type == WITHIN_CHANNEL square_sum = sum of spatial window of local_size y = x * pow(bias + alpha * square_sum / (local_size * local_size), -beta) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | region_type | int | 0 | | -| 1 | local_size | int | 5 | | -| 2 | alpha | float | 1.f | | -| 3 | beta | float | 0.75f | | -| 4 | bias | float | 1.f | | +| param id | name | type | default | description | +| -------- | ----------- | ----- | ------- | ----------- | +| 0 | region_type | int | 0 | | +| 1 | local_size | int | 5 | | +| 2 | alpha | float | 1.f | | +| 3 | beta | float | 0.75f | | +| 4 | bias | float | 1.f | | Region type: + - 0 = ACROSS_CHANNELS - 1 = WITHIN_CHANNEL # LSTM + Apply a single-layer LSTM to a feature sequence of `T` timesteps. The input blob shape is `[w=input_size, h=T]` and the output blob shape is `[w=num_output, h=T]`. 
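As a rough illustration of the stateful form shown in the formula block below, this sketch carries hidden/cell state across chunks of a long sequence. The blob names (`in0`/`in1`/`in2`, `out0`/`out1`/`out2`) and the zero-initialised state shapes are assumptions for this example only; check the converted .param file for the real names.

```cpp
#include <vector>
#include "net.h"

// Stream a long sequence through an LSTM model chunk by chunk, feeding the
// hidden/cell outputs of one call back in as the initial state of the next.
int run_lstm_streaming(ncnn::Net& net, const std::vector<ncnn::Mat>& chunks,
                       int num_output, int hidden_size, int num_directions)
{
    ncnn::Mat hidden(num_output, num_directions); // assumed initial hidden state shape
    ncnn::Mat cell(hidden_size, num_directions);  // assumed initial cell state shape
    hidden.fill(0.f);
    cell.fill(0.f);

    for (size_t i = 0; i < chunks.size(); i++)
    {
        ncnn::Extractor ex = net.create_extractor();
        ex.input("in0", chunks[i]); // w=input_size, h=T_chunk
        ex.input("in1", hidden);
        ex.input("in2", cell);

        ncnn::Mat y;
        ex.extract("out0", y);      // w=num_output, h=T_chunk
        ex.extract("out1", hidden); // carry state into the next chunk
        ex.extract("out2", cell);
    }
    return 0;
}
```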
``` @@ -1233,53 +1299,57 @@ y = lstm(x) y0, hidden y1, cell y2 = lstm(x0, hidden x1, cell x2) ``` -* one_blob_only if bidirectional +- one_blob_only if bidirectional -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | output size of output | -| 1 | weight_data_size| int | 0 | total size of IFOG weight matrix | -| 2 | direction | int | 0 | 0=forward, 1=reverse, 2=bidirectional | -| 3 | hidden_size | int | num_output| hidden size | +| param id | name | type | default | description | +| -------- | ---------------- | ---- | ---------- | ------------------------------------- | +| 0 | num_output | int | 0 | output size of output | +| 1 | weight_data_size | int | 0 | total size of IFOG weight matrix | +| 2 | direction | int | 0 | 0=forward, 1=reverse, 2=bidirectional | +| 3 | hidden_size | int | num_output | hidden size | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_xc_data| float/fp16/int8 | [input_size, hidden_size * 4, num_directions] | -| bias_c_data | float/fp16/int8 | [hidden_size, 4, num_directions] | -| weight_hc_data| float/fp16/int8 | [num_output, hidden_size * 4, num_directions] | -| weight_hr_data| float/fp16/int8 | [hidden_size, num_output, num_directions] | +| weight | type | shape | +| -------------- | --------------- | --------------------------------------------- | +| weight_xc_data | float/fp16/int8 | [input_size, hidden_size * 4, num_directions] | +| bias_c_data | float/fp16/int8 | [hidden_size, 4, num_directions] | +| weight_hc_data | float/fp16/int8 | [num_output, hidden_size * 4, num_directions] | +| weight_hr_data | float/fp16/int8 | [hidden_size, num_output, num_directions] | Direction flag: + - 0 = forward only - 1 = reverse only - 2 = bidirectional # MemoryData + ``` y = data ``` -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | w | int | 0 | | -| 1 | h | int | 0 | | -| 11 | d | int | 0 | | -| 2 | c | int | 0 | | -| 21 | load_type | int | 1 | 1=fp32 | +| param id | name | type | default | description | +| -------- | --------- | ---- | ------- | ----------- | +| 0 | w | int | 0 | | +| 1 | h | int | 0 | | +| 11 | d | int | 0 | | +| 2 | c | int | 0 | | +| 21 | load_type | int | 1 | 1=fp32 | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| data | float | [w, h, d, c] | +| weight | type | shape | +| ------ | ----- | ------------ | +| data | float | [w, h, d, c] | # Mish + ``` y = x * tanh(log(exp(x) + 1)) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace # MultiHeadAttention + ``` split q k v into num_head part q0, k0, v0, q1, k1, v1 ... 
for each num_head part @@ -1294,33 +1364,34 @@ for each num_head part y = affine(out) ``` -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | embed_dim | int | 0 | | -| 1 | num_heads | int | 1 | | -| 2 | weight_data_size| int | 0 | qdim = weight_data_size / embed_dim | -| 3 | kdim | int | embed_dim | | -| 4 | vdim | int | embed_dim | | -| 5 | attn_mask | int | 0 | | -| 6 | scale | float | 1.f / sqrt(embed_dim / num_heads) | | -| 18 | int8_scale_term | int | 0 | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| q_weight_data | float/fp16/int8 | [embed_dim * qdim] | -| q_bias_data | float | [embed_dim] | -| k_weight_data | float/fp16/int8 | [embed_dim * kdim] | -| k_bias_data | float | [embed_dim] | -| v_weight_data | float/fp16/int8 | [embed_dim * vdim] | -| v_bias_data | float | [embed_dim] | -| out_weight_data| float/fp16/int8 | [qdim * embed_dim] | -| out_bias_data | float | [qdim] | -| q_weight_data_int8_scales| float | [embed_dim] | -| k_weight_data_int8_scales| float | [embed_dim] | -| v_weight_data_int8_scales| float | [embed_dim] | -| out_weight_data_int8_scales| float | [1] | +| param id | name | type | default | description | +| -------- | ---------------- | ----- | --------------------------------- | ----------------------------------- | +| 0 | embed_dim | int | 0 | | +| 1 | num_heads | int | 1 | | +| 2 | weight_data_size | int | 0 | qdim = weight_data_size / embed_dim | +| 3 | kdim | int | embed_dim | | +| 4 | vdim | int | embed_dim | | +| 5 | attn_mask | int | 0 | | +| 6 | scale | float | 1.f / sqrt(embed_dim / num_heads) | | +| 18 | int8_scale_term | int | 0 | | + +| weight | type | shape | +| --------------------------- | --------------- | ------------------ | +| q_weight_data | float/fp16/int8 | [embed_dim * qdim] | +| q_bias_data | float | [embed_dim] | +| k_weight_data | float/fp16/int8 | [embed_dim * kdim] | +| k_bias_data | float | [embed_dim] | +| v_weight_data | float/fp16/int8 | [embed_dim * vdim] | +| v_bias_data | float | [embed_dim] | +| out_weight_data | float/fp16/int8 | [qdim * embed_dim] | +| out_bias_data | float | [qdim] | +| q_weight_data_int8_scales | float | [embed_dim] | +| k_weight_data_int8_scales | float | [embed_dim] | +| v_weight_data_int8_scales | float | [embed_dim] | +| out_weight_data_int8_scales | float | [1] | # MVN + ``` if normalize_variance == 1 && across_channels == 1 y = (x - mean) / (sqrt(var) + eps) of whole blob if normalize_variance == 1 && across_channels == 0 y = (x - mean) / (sqrt(var) + eps) of each channel @@ -1328,20 +1399,22 @@ if normalize_variance == 0 && across_channels == 1 y = x - mean of whole bl if normalize_variance == 0 && across_channels == 0 y = x - mean of each channel ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | normalize_variance| int | 0 | | -| 1 | across_channels| int | 0 | | -| 2 | eps | float | 0.0001f | x = x / (sqrt(var) + eps) | +| param id | name | type | default | description | +| -------- | ------------------ | ----- | ------- | ------------------------- | +| 0 | normalize_variance | int | 0 | | +| 1 | across_channels | int | 0 | | +| 2 | eps | float | 0.0001f | x = x / (sqrt(var) + eps) | # Noop + ``` y = x ``` # Normalize + ``` if across_spatial == 1 && across_channel == 1 x2 = normalize(x) of whole blob if across_spatial == 1 && across_channel == 0 x2 = normalize(x) of 
each channel @@ -1349,79 +1422,85 @@ if across_spatial == 0 && across_channel == 1 x2 = normalize(x) of each pos y = x2 * scale ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | across_spatial| int | 0 | | -| 1 | channel_shared| int | 0 | | -| 2 | eps | float | 0.0001f | see eps mode | -| 3 | scale_data_size| int | 0 | | -| 4 | across_channel| int | 0 | | -| 9 | eps_mode | int | 0 | | +| param id | name | type | default | description | +| -------- | --------------- | ----- | ------- | ------------ | +| 0 | across_spatial | int | 0 | | +| 1 | channel_shared | int | 0 | | +| 2 | eps | float | 0.0001f | see eps mode | +| 3 | scale_data_size | int | 0 | | +| 4 | across_channel | int | 0 | | +| 9 | eps_mode | int | 0 | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| scale_data | float | [scale_data_size] | +| weight | type | shape | +| ---------- | ----- | ----------------- | +| scale_data | float | [scale_data_size] | Eps Mode: -- 0 = caffe/mxnet x = x / sqrt(var + eps) -- 1 = pytorch x = x / max(sqrt(var), eps) -- 2 = tensorflow x = x / sqrt(max(var, eps)) + +- 0 = caffe/mxnet x = x / sqrt(var + eps) +- 1 = pytorch x = x / max(sqrt(var), eps) +- 2 = tensorflow x = x / sqrt(max(var, eps)) # Packing + ``` y = wrap_packing(x) ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | out_elempack | int | 1 | | -| 1 | use_padding | int | 0 | | -| 2 | cast_type_from| int | 0 | | -| 3 | cast_type_to | int | 0 | | -| 4 | storage_type_from| int | 0 | | -| 5 | storage_type_to| int | 0 | | +| param id | name | type | default | description | +| -------- | ----------------- | ---- | ------- | ----------- | +| 0 | out_elempack | int | 1 | | +| 1 | use_padding | int | 0 | | +| 2 | cast_type_from | int | 0 | | +| 3 | cast_type_to | int | 0 | | +| 4 | storage_type_from | int | 0 | | +| 5 | storage_type_to | int | 0 | | # Padding + ``` y = pad(x, pads) ``` -| param id | name | type | default | description | -| --------- | ------------- | ---- | --------- | ----------------- | -| 0 | top | int | 0 | | -| 1 | bottom | int | 0 | | -| 2 | left | int | 0 | | -| 3 | right | int | 0 | | -| 4 | type | int | 0 | | -| 5 | value | float | 0 | | -| 6 | per_channel_pad_data_size| int | 0 | | -| 7 | front | int | stride_w | | -| 8 | behind | int | pad_left | | +| param id | name | type | default | description | +| -------- | ------------------------- | ----- | -------- | ----------- | +| 0 | top | int | 0 | | +| 1 | bottom | int | 0 | | +| 2 | left | int | 0 | | +| 3 | right | int | 0 | | +| 4 | type | int | 0 | | +| 5 | value | float | 0 | | +| 6 | per_channel_pad_data_size | int | 0 | | +| 7 | front | int | stride_w | | +| 8 | behind | int | pad_left | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| per_channel_pad_data| float | [per_channel_pad_data_size] | +| weight | type | shape | +| -------------------- | ----- | --------------------------- | +| per_channel_pad_data | float | [per_channel_pad_data_size] | Padding type: + - 0 = CONSTANT - 1 = REPLICATE - 2 = REFLECT # Permute + ``` y = reorder(x) ``` -| param id | name | type | default | description | -| --------- | ------------- | ---- | --------- | ----------------- | -| 0 | order_type | int | 0 | | +| param id | name | type | default | 
description | +| -------- | ---------- | ---- | ------- | ----------- | +| 0 | order_type | int | 0 | | Order Type: + - 0 = WH WHC WHDC - 1 = HW HWC HWDC - 2 = WCH WDHC @@ -1448,183 +1527,198 @@ Order Type: - 23 = CDHW # PixelShuffle + ``` if mode == 0 y = depth_to_space(x) where x channel order is sw-sh-outc if mode == 1 y = depth_to_space(x) where x channel order is outc-sw-sh ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ---- | --------- | ----------------- | -| 0 | upscale_factor| int | 1 | | -| 1 | mode | int | 0 | | +| param id | name | type | default | description | +| -------- | -------------- | ---- | ------- | ----------- | +| 0 | upscale_factor | int | 1 | | +| 1 | mode | int | 0 | | # Pooling + ``` x2 = pad(x, pads) x3 = pooling(x2, kernel, stride) ``` -| param id | name | type | default | description | -| --------- | --------------| ---- | --------- | ----------------- | -| 0 | pooling_type | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | stride_w | int | 1 | | -| 3 | pad_left | int | 0 | | -| 4 | global_pooling| int | 0 | | -| 5 | pad_mode | int | 0 | | -| 6 | avgpool_count_include_pad| int | 0 | | -| 7 | adaptive_pooling| int | 0 | | -| 8 | out_w | int | 0 | | -| 11 | kernel_h | int | kernel_w | | -| 12 | stride_h | int | stride_w | | -| 13 | pad_top | int | pad_left | | -| 14 | pad_right | int | pad_left | | -| 15 | pad_bottom | int | pad_top | | -| 18 | out_h | int | out_w | | +| param id | name | type | default | description | +| -------- | ------------------------- | ---- | -------- | ----------- | +| 0 | pooling_type | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | stride_w | int | 1 | | +| 3 | pad_left | int | 0 | | +| 4 | global_pooling | int | 0 | | +| 5 | pad_mode | int | 0 | | +| 6 | avgpool_count_include_pad | int | 0 | | +| 7 | adaptive_pooling | int | 0 | | +| 8 | out_w | int | 0 | | +| 11 | kernel_h | int | kernel_w | | +| 12 | stride_h | int | stride_w | | +| 13 | pad_top | int | pad_left | | +| 14 | pad_right | int | pad_left | | +| 15 | pad_bottom | int | pad_top | | +| 18 | out_h | int | out_w | | Pooling type: + - 0 = MAX - 1 = AVG Pad mode: + - 0 = full padding - 1 = valid padding - 2 = tensorflow padding=SAME or onnx padding=SAME_UPPER - 3 = onnx padding=SAME_LOWER # Pooling1D + ``` x2 = pad(x, pads) x3 = pooling1d(x2, kernel, stride) ``` -| param id | name | type | default | description | -| --------- | --------------| ---- | --------- | ----------------- | -| 0 | pooling_type | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | stride_w | int | 1 | | -| 3 | pad_left | int | 0 | | -| 4 | global_pooling| int | 0 | | -| 5 | pad_mode | int | 0 | | -| 6 | avgpool_count_include_pad| int | 0 | | -| 7 | adaptive_pooling| int | 0 | | -| 8 | out_w | int | 0 | | -| 14 | pad_right | int | pad_left | | +| param id | name | type | default | description | +| -------- | ------------------------- | ---- | -------- | ----------- | +| 0 | pooling_type | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | stride_w | int | 1 | | +| 3 | pad_left | int | 0 | | +| 4 | global_pooling | int | 0 | | +| 5 | pad_mode | int | 0 | | +| 6 | avgpool_count_include_pad | int | 0 | | +| 7 | adaptive_pooling | int | 0 | | +| 8 | out_w | int | 0 | | +| 14 | pad_right | int | pad_left | | Pooling type: + - 0 = MAX - 1 = AVG Pad mode: + - 0 = full padding - 1 = valid padding - 2 = tensorflow padding=SAME or onnx padding=SAME_UPPER - 3 = onnx padding=SAME_LOWER # Pooling3D + ``` x2 = pad(x, pads) x3 = pooling3d(x2, 
kernel, stride) ``` -| param id | name | type | default | description | -| --------- | --------------| ---- | --------- | ----------------- | -| 0 | pooling_type | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | stride_w | int | 1 | | -| 3 | pad_left | int | 0 | | -| 4 | global_pooling| int | 0 | | -| 5 | pad_mode | int | 0 | | -| 6 | avgpool_count_include_pad| int | 0 | | -| 7 | adaptive_pooling| int | 0 | | -| 8 | out_w | int | 0 | | -| 11 | kernel_h | int | kernel_w | | -| 12 | stride_h | int | stride_w | | -| 13 | pad_top | int | pad_left | | -| 14 | pad_right | int | pad_left | | -| 15 | pad_bottom | int | pad_top | | -| 16 | pad_behind | int | pad_front | | -| 18 | out_h | int | out_w | | -| 21 | kernel_d | int | kernel_w | | -| 22 | stride_d | int | stride_w | | -| 23 | pad_front | int | pad_left | | -| 28 | out_d | int | out_w | | +| param id | name | type | default | description | +| -------- | ------------------------- | ---- | --------- | ----------- | +| 0 | pooling_type | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | stride_w | int | 1 | | +| 3 | pad_left | int | 0 | | +| 4 | global_pooling | int | 0 | | +| 5 | pad_mode | int | 0 | | +| 6 | avgpool_count_include_pad | int | 0 | | +| 7 | adaptive_pooling | int | 0 | | +| 8 | out_w | int | 0 | | +| 11 | kernel_h | int | kernel_w | | +| 12 | stride_h | int | stride_w | | +| 13 | pad_top | int | pad_left | | +| 14 | pad_right | int | pad_left | | +| 15 | pad_bottom | int | pad_top | | +| 16 | pad_behind | int | pad_front | | +| 18 | out_h | int | out_w | | +| 21 | kernel_d | int | kernel_w | | +| 22 | stride_d | int | stride_w | | +| 23 | pad_front | int | pad_left | | +| 28 | out_d | int | out_w | | Pooling type: + - 0 = MAX - 1 = AVG Pad mode: + - 0 = full padding - 1 = valid padding - 2 = tensorflow padding=SAME or onnx padding=SAME_UPPER - 3 = onnx padding=SAME_LOWER # Power + ``` y = pow((shift + x * scale), power) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | power | float | 1.f | | -| 1 | scale | float | 1.f | | -| 2 | shift | float | 0.f | | +| param id | name | type | default | description | +| -------- | ----- | ----- | ------- | ----------- | +| 0 | power | float | 1.f | | +| 1 | scale | float | 1.f | | +| 2 | shift | float | 0.f | | # PReLU + ``` if x < 0 y = x * slope else y = x ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_slope | int | 0 | | +| param id | name | type | default | description | +| -------- | --------- | ---- | ------- | ----------- | +| 0 | num_slope | int | 0 | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| slope_data | float | [num_slope] | +| weight | type | shape | +| ---------- | ----- | ----------- | +| slope_data | float | [num_slope] | # Quantize + ``` y = float2int8(x * scale) ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | scale_data_size| int | 1 | | +| param id | name | type | default | description | +| -------- | --------------- | ---- | ------- | ----------- | +| 0 | scale_data_size | int | 1 | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| scale_data | float | 
[scale_data_size] | +| weight | type | shape | +| ---------- | ----- | ----------------- | +| scale_data | float | [scale_data_size] | # Reduction + ``` y = reduce_op(x * coeff) ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | operation | int | 0 | | -| 1 | reduce_all | int | 1 | | -| 2 | coeff | float | 1.f | | -| 3 | axes | array | [ ] | | -| 4 | keepdims | int | 0 | | -| 5 | fixbug0 | int | 0 | hack for bug fix, should be 1 | +| param id | name | type | default | description | +| -------- | ---------- | ----- | ------- | ----------------------------- | +| 0 | operation | int | 0 | | +| 1 | reduce_all | int | 1 | | +| 2 | coeff | float | 1.f | | +| 3 | axes | array | [ ] | | +| 4 | keepdims | int | 0 | | +| 5 | fixbug0 | int | 0 | hack for bug fix, should be 1 | Operation type: + - 0 = SUM - 1 = ASUM - 2 = SUMSQ @@ -1638,96 +1732,103 @@ Operation type: - 10 = LogSumExp # ReLU + ``` if x < 0 y = x * slope else y = x ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | slope | float | 0.f | | +| param id | name | type | default | description | +| -------- | ----- | ----- | ------- | ----------- | +| 0 | slope | float | 0.f | | # Reorg + ``` if mode == 0 y = space_to_depth(x) where x channel order is sw-sh-outc if mode == 1 y = space_to_depth(x) where x channel order is outc-sw-sh ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ---- | --------- | ----------------- | -| 0 | stride | int | 1 | | -| 1 | mode | int | 0 | | +| param id | name | type | default | description | +| -------- | ------ | ---- | ------- | ----------- | +| 0 | stride | int | 1 | | +| 1 | mode | int | 0 | | # Requantize + ``` x2 = x * scale_in + bias x3 = activation(x2) y = float2int8(x3 * scale_out) ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | scale_in_data_size| int | 1 | | -| 1 | scale_out_data_size| int | 1 | | -| 2 | bias_data_size| int | 0 | | -| 3 | activation_type| int | 0 | | -| 4 | activation_params| int | [ ] | | +| param id | name | type | default | description | +| -------- | ------------------- | ---- | ------- | ----------- | +| 0 | scale_in_data_size | int | 1 | | +| 1 | scale_out_data_size | int | 1 | | +| 2 | bias_data_size | int | 0 | | +| 3 | activation_type | int | 0 | | +| 4 | activation_params | int | [ ] | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| scale_in_data | float | [scale_in_data_size] | -| scale_out_data| float | [scale_out_data_size] | -| bias_data | float | [bias_data_size] | +| weight | type | shape | +| -------------- | ----- | --------------------- | +| scale_in_data | float | [scale_in_data_size] | +| scale_out_data | float | [scale_out_data_size] | +| bias_data | float | [bias_data_size] | # Reshape + ``` if permute == 1 y = hwc2chw(reshape(chw2hwc(x))) else y = reshape(x) ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | w | int | -233 | | -| 1 | h | int | -233 | | -| 11 | d | int | -233 | | -| 2 | c | int | -233 | | -| 3 | permute | int | 0 | | +| param id | name | 
type | default | description | +| -------- | ------- | ---- | ------- | ----------- | +| 0 | w | int | -233 | | +| 1 | h | int | -233 | | +| 11 | d | int | -233 | | +| 2 | c | int | -233 | | +| 3 | permute | int | 0 | | Reshape flag: + - 0 = copy from bottom - -1 = remaining - -233 = drop this dim(default) # RMSNorm + ``` split x along outmost axis into part x0, x1 ... root mean square normalize for each part x0, x1 ... y = x * gamma by elementwise ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | affine_size | int | 0 | | -| 1 | eps | float | 0.001f | x = x / sqrt(var + eps) | -| 2 | affine | int | 1 | | +| param id | name | type | default | description | +| -------- | ----------- | ----- | ------- | ----------------------- | +| 0 | affine_size | int | 0 | | +| 1 | eps | float | 0.001f | x = x / sqrt(var + eps) | +| 2 | affine | int | 1 | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| gamma_data | float | [affine_size] | +| weight | type | shape | +| ---------- | ----- | ------------- | +| gamma_data | float | [affine_size] | # RNN + Apply a single-layer RNN to a feature sequence of `T` timesteps. The input blob shape is `[w=input_size, h=T]` and the output blob shape is `[w=num_output, h=T]`. ``` @@ -1735,127 +1836,137 @@ y = rnn(x) y0, hidden y1 = rnn(x0, hidden x1) ``` -* one_blob_only if bidirectional +- one_blob_only if bidirectional -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | hidden size of output | -| 1 | weight_data_size| int | 0 | total size of weight matrix | -| 2 | direction | int | 0 | 0=forward, 1=reverse, 2=bidirectional | +| param id | name | type | default | description | +| -------- | ---------------- | ---- | ------- | ------------------------------------- | +| 0 | num_output | int | 0 | hidden size of output | +| 1 | weight_data_size | int | 0 | total size of weight matrix | +| 2 | direction | int | 0 | 0=forward, 1=reverse, 2=bidirectional | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_xc_data| float/fp16/int8 | [input_size, num_output, num_directions] | -| bias_c_data | float/fp16/int8 | [num_output, 1, num_directions] | -| weight_hc_data| float/fp16/int8 | [num_output, num_output, num_directions] | +| weight | type | shape | +| -------------- | --------------- | ---------------------------------------- | +| weight_xc_data | float/fp16/int8 | [input_size, num_output, num_directions] | +| bias_c_data | float/fp16/int8 | [num_output, 1, num_directions] | +| weight_hc_data | float/fp16/int8 | [num_output, num_output, num_directions] | Direction flag: + - 0 = forward only - 1 = reverse only - 2 = bidirectional # Scale + ``` if scale_data_size == -233 y = x0 * x1 else y = x * scale + bias ``` -* one_blob_only if scale_data_size != -233 -* support_inplace +- one_blob_only if scale_data_size != -233 +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | scale_data_size| int | 0 | | -| 1 | bias_term | int | 0 | | +| param id | name | type | default | description | +| -------- | --------------- | ---- | ------- | ----------- | +| 0 | scale_data_size | int | 0 | | +| 1 | bias_term | int | 0 | | -| weight | type | shape | -| ------------- | ----- | 
--------------------- | -| scale_data | float | [scale_data_size] | -| bias_data | float | [scale_data_size] | +| weight | type | shape | +| ---------- | ----- | ----------------- | +| scale_data | float | [scale_data_size] | +| bias_data | float | [scale_data_size] | # SELU + ``` if x < 0 y = (exp(x) - 1.f) * alpha * lambda else y = x * lambda ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | alpha | float | 1.67326324f| | -| 1 | lambda | float | 1.050700987f| | +| param id | name | type | default | description | +| -------- | ------ | ----- | ------------ | ----------- | +| 0 | alpha | float | 1.67326324f | | +| 1 | lambda | float | 1.050700987f | | # Shrink + ``` if x < -lambd y = x + bias if x > lambd y = x - bias else y = x ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | bias | float | 0.0f | | -| 1 | lambd | float | 0.5f | | +| param id | name | type | default | description | +| -------- | ----- | ----- | ------- | ----------- | +| 0 | bias | float | 0.0f | | +| 1 | lambd | float | 0.5f | | # ShuffleChannel + ``` if reverse == 0 y = shufflechannel(x) by group if reverse == 1 y = shufflechannel(x) by channel / group ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ---- | --------- | ----------------- | -| 0 | group | int | 1 | | -| 1 | reverse | int | 0 | | +| param id | name | type | default | description | +| -------- | ------- | ---- | ------- | ----------- | +| 0 | group | int | 1 | | +| 1 | reverse | int | 0 | | # Sigmoid + ``` y = 1 / (1 + exp(-x)) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace # Slice + ``` split x along axis into slices, each part slice size is based on slices array ``` -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | slices | array | [ ] | | -| 1 | axis | int | 0 | | -| 2 | indices | array | [ ] | | +| param id | name | type | default | description | +| -------- | ------- | ----- | ------- | ----------- | +| 0 | slices | array | [ ] | | +| 1 | axis | int | 0 | | +| 2 | indices | array | [ ] | | # Softmax + ``` softmax(x, axis) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | axis | int | 0 | | -| 1 | fixbug0 | int | 0 | hack for bug fix, should be 1 | +| param id | name | type | default | description | +| -------- | ------- | ---- | ------- | ----------------------------- | +| 0 | axis | int | 0 | | +| 1 | fixbug0 | int | 0 | hack for bug fix, should be 1 | # Softplus + ``` y = log(exp(x) + 1) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace # Spectrogram + ``` x1 = pad(x) if center y = stft(x1) @@ -1866,68 +1977,74 @@ if power == 1 return magnitude if power == 2 return square of magnitude ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | n_fft | int | 0 | | -| 1 | power | int | 0 | | -| 2 | hoplen | int | n_fft / 4 | | -| 3 | winlen | int | n_fft | | -| 4 | window_type | int 
| 0 | 0=ones 1=hann 2=hamming | -| 5 | center | int | 1 | | -| 6 | pad_type | int | 2 | 0=CONSTANT 1=REPLICATE 2=REFLECT | -| 7 | normalized | int | 0 | 0=no 1=n_fft 2=window-l2-energy | -| 8 | onesided | int | 1 | | +| param id | name | type | default | description | +| -------- | ----------- | ---- | --------- | -------------------------------- | +| 0 | n_fft | int | 0 | | +| 1 | power | int | 0 | | +| 2 | hoplen | int | n_fft / 4 | | +| 3 | winlen | int | n_fft | | +| 4 | window_type | int | 0 | 0=ones 1=hann 2=hamming | +| 5 | center | int | 1 | | +| 6 | pad_type | int | 2 | 0=CONSTANT 1=REPLICATE 2=REFLECT | +| 7 | normalized | int | 0 | 0=no 1=n_fft 2=window-l2-energy | +| 8 | onesided | int | 1 | | # Split + ``` y0, y1 ... = x ``` # Swish + ``` y = x / (1 + exp(-x)) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace # TanH + ``` y = tanh(x) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace # Threshold + ``` if x > threshold y = 1 else y = 0 ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | threshold | float | 0.f | | +| param id | name | type | default | description | +| -------- | --------- | ----- | ------- | ----------- | +| 0 | threshold | float | 0.f | | # Tile + ``` y = repeat tiles along axis for x ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | axis | int | 0 | | -| 1 | tiles | int | 1 | | -| 2 | repeats | array | [ ] | | +| param id | name | type | default | description | +| -------- | ------- | ----- | ------- | ----------- | +| 0 | axis | int | 0 | | +| 1 | tiles | int | 1 | | +| 2 | repeats | array | [ ] | | # UnaryOp + ``` y = unaryop(x) ``` @@ -1935,11 +2052,12 @@ y = unaryop(x) - one_blob_only - support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | op_type | int | 0 | Operation type as follows | +| param id | name | type | default | description | +| -------- | ------- | ---- | ------- | ------------------------- | +| 0 | op_type | int | 0 | Operation type as follows | Operation type: + - 0 = ABS - 1 = NEG - 2 = FLOOR @@ -1962,22 +2080,23 @@ Operation type: - 19 = TRUNC # Unfold + ``` y = unfold(x) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ---------- | ---- | ---------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 11 | kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 | stride_h | int | stride_w | | +| 14 | pad_top | int | pad_left | | +| 15 | pad_right | int | pad_left | | +| 16 | pad_bottom | int | pad_top | | diff --git 
a/src/CMakeLists.txt b/src/CMakeLists.txt index c97235d97a0..60f24361d8b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -169,6 +169,7 @@ ncnn_add_layer(Shrink) ncnn_add_layer(RMSNorm) ncnn_add_layer(Spectrogram) ncnn_add_layer(InverseSpectrogram) +ncnn_add_layer(Flip) if(NCNN_VULKAN) ncnn_add_shader(${CMAKE_CURRENT_SOURCE_DIR}/convert_ycbcr.comp) diff --git a/src/layer/flip.cpp b/src/layer/flip.cpp new file mode 100644 index 00000000000..f8726e0fb9f --- /dev/null +++ b/src/layer/flip.cpp @@ -0,0 +1,41 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "flip.h" + +namespace ncnn { + +Flip::Flip() +{ + one_blob_only = true; +} + +int Flip::load_param(const ParamDict& pd) +{ + axis = pd.get(0, Mat()); + // 打印 + const int* axis_ptr = axis; + printf("axis_len = %d", axis.w); + printf("axis[0] = %d", axis_ptr[0]); + printf("axis[1] = %d", axis_ptr[1]); + return 0; +} + +int Flip::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + // wip + return 0; +} + +} // namespace ncnn diff --git a/src/layer/flip.h b/src/layer/flip.h new file mode 100644 index 00000000000..b75bf5e68ef --- /dev/null +++ b/src/layer/flip.h @@ -0,0 +1,37 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#ifndef LAYER_FLIP_H +#define LAYER_FLIP_H + +#include "layer.h" + +namespace ncnn { + +class Flip : public Layer +{ +public: + Flip(); + + virtual int load_param(const ParamDict& pd); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + +public: + Mat axis; // 维度翻转 +}; + +} // namespace ncnn + +#endif // LAYER_FLIP_H diff --git a/tools/pnnx/src/CMakeLists.txt b/tools/pnnx/src/CMakeLists.txt index b1ac6f5c024..5d681ab9c4b 100644 --- a/tools/pnnx/src/CMakeLists.txt +++ b/tools/pnnx/src/CMakeLists.txt @@ -575,6 +575,7 @@ set(pnnx_pass_ncnn_SRCS pass_ncnn/torch_cumsum.cpp pass_ncnn/torch_diag.cpp pass_ncnn/torch_flatten.cpp + pass_ncnn/torch_flip.cpp pass_ncnn/torch_istft.cpp pass_ncnn/torch_logsumexp.cpp pass_ncnn/torch_matmul.cpp diff --git a/tools/pnnx/src/pass_ncnn/torch_flip.cpp b/tools/pnnx/src/pass_ncnn/torch_flip.cpp new file mode 100644 index 00000000000..bc0e3348548 --- /dev/null +++ b/tools/pnnx/src/pass_ncnn/torch_flip.cpp @@ -0,0 +1,56 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. +#include "pass_ncnn.h" + +namespace pnnx { + +namespace ncnn { + +class torch_flip : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +torch.flip op_0 1 1 input out dims=%dims +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "Flip"; + } + + const char* name_str() const + { + return "flip"; + } + + void write(Operator* op, const std::map& captured_params) const + { + const std::vector& dims = captured_params.at("dims").ai; + + // 设置参数 + op->params["0"] = dims; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(torch_flip, 20) + +} // namespace ncnn + +} // namespace pnnx \ No newline at end of file From 700e18a1d2541ebcc8b475a16020d2454b4d4bd7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=B0=E9=98=85?= <43716063+Baiyuetribe@users.noreply.github.com> Date: Sun, 12 Jan 2025 21:56:41 +0800 Subject: [PATCH 02/14] done --- src/layer/flip.cpp | 538 ++++++++++++++++++++++- src/layer/flip.h | 2 +- tests/CMakeLists.txt | 1 + tests/test_flip.cpp | 79 ++++ tools/pnnx/tests/ncnn/CMakeLists.txt | 1 + tools/pnnx/tests/ncnn/test_torch_flip.py | 151 +++++++ 6 files changed, 765 insertions(+), 7 deletions(-) create mode 100644 tests/test_flip.cpp create mode 100644 tools/pnnx/tests/ncnn/test_torch_flip.py diff --git a/src/layer/flip.cpp b/src/layer/flip.cpp index f8726e0fb9f..6757b853f80 100644 --- a/src/layer/flip.cpp +++ b/src/layer/flip.cpp @@ -24,17 +24,543 @@ Flip::Flip() int Flip::load_param(const ParamDict& pd) { axis = pd.get(0, Mat()); - // 打印 - const int* axis_ptr = axis; - printf("axis_len = %d", axis.w); - printf("axis[0] = %d", axis_ptr[0]); - printf("axis[1] = %d", axis_ptr[1]); + // 调试 + // const int *axis_ptr = axis; + // printf("axis_len = %d\n", axis.w); + 
// printf("axis[0] = %d\n", axis_ptr[0]); return 0; } int Flip::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { - // wip + // 已知参数 + int dims = bottom_blob.dims; + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + + // 校准输入参数 + if (axis.w > 4) + { + return -1; + } + const int* axis_ptr = axis; + + if (dims == 1) + { + // 1D 只有一种情况 + top_blob.create(w, elemsize, opt.blob_allocator); + const float* ptr = bottom_blob; + float* outptr = top_blob; + for (int i = 0; i < w; i++) + { + outptr[i] = ptr[w - 1 - i]; + } + } + else if (dims == 2) + { + // 2D 有三种,安装上下、左右和上下左右同时翻转;[-2/0上下翻转, -1/1左右翻转,交叉为上下左右翻转] + top_blob.create(w, h, elemsize, opt.blob_allocator); + if (axis.w == 1) + { + if (axis_ptr[0] == -2 || axis_ptr[0] == 0) + { + // 按照行翻转 + for (int i = 0; i < h; i++) + { + const float* ptr = bottom_blob.row(h - 1 - i); // 从最后一行开始 + float* outptr = top_blob.row(i); // 输出到当前行 + + // 直接复制整行数据 + memcpy(outptr, ptr, w * sizeof(float)); + } + } + else + { + // 按照列翻转 + for (int i = 0; i < h; i++) + { + const float* ptr = bottom_blob.row(i); + float* outptr = top_blob.row(i); + + // 使用临时buffer存储反转的行数据 + std::vector line_buffer(w); + for (int j = 0; j < w; j++) + { + line_buffer[j] = ptr[w - 1 - j]; + } + + // 一次性复制整行 + memcpy(outptr, line_buffer.data(), w * sizeof(float)); + } + } + } + else + { + // 当axis.w=2时,上下左右都翻转 + for (int i = 0; i < h; i++) + { + const float* ptr = bottom_blob.row(h - 1 - i); // 从最后一行开始读取 + float* outptr = top_blob.row(i); // 输出到当前行 + + // 每行内左右翻转 + for (int j = 0; j < w; j++) + { + outptr[j] = ptr[w - 1 - j]; // 反向读取每行像素 + } + } + } + } + else if (dims == 3) + { + top_blob.create(w, h, channels, elemsize, opt.blob_allocator); + if (axis.w == 1) + { + // w、h、c + // 约定到正数,简化后续判断 + int axis0 = axis_ptr[0] < 0 ? 3 + axis_ptr[0] : axis_ptr[0]; + if (axis0 == 0) + { + // -3/0 整体上下翻转 + for (int i = 0; i < channels; i++) + { + for (int j = 0; j < h; j++) + { + const float* ptr = bottom_blob.channel(channels - 1 - i).row(j); // 从最后一个channel开始 + float* outptr = top_blob.channel(i).row(j); + memcpy(outptr, ptr, w * sizeof(float)); + } + } + } + else if (axis0 == 1) + { + // -2/1 整体内部上下翻转 + for (int i = 0; i < channels; i++) + { + for (int j = 0; j < h; j++) + { + const float* ptr = bottom_blob.channel(i).row(h - 1 - j); + float* outptr = top_blob.channel(i).row(j); + memcpy(outptr, ptr, w * sizeof(float)); + } + } + } + else + { + // -1/2 整体左右翻转 + for (int i = 0; i < channels; i++) + { + for (int j = 0; j < h; j++) + { + const float* ptr = bottom_blob.channel(i).row(j); + float* outptr = top_blob.channel(i).row(j); + for (int k = 0; k < w; k++) + { + outptr[k] = ptr[w - 1 - k]; + } + } + } + } + } + else if (axis.w == 2) + { + // wh、wc、hc + int axis0 = axis_ptr[0] < 0 ? 3 + axis_ptr[0] : axis_ptr[0]; + int axis1 = axis_ptr[1] < 0 ? 
3 + axis_ptr[1] : axis_ptr[1]; + int axis_sum = axis0 + axis1; + if (axis_sum == 1) + { + // 对应wh + for (int i = 0; i < channels; i++) + { + for (int j = 0; j < h; j++) + { + // 组合两种翻转:channel维度和行维度同时翻转 + const float* ptr = bottom_blob.channel(channels - 1 - i).row(h - 1 - j); + float* outptr = top_blob.channel(i).row(j); + memcpy(outptr, ptr, w * sizeof(float)); + } + } + } + else if (axis_sum == 2) + { + // 对应wc + for (int i = 0; i < channels; i++) + { + for (int j = 0; j < h; j++) + { + const float* ptr = bottom_blob.channel(channels - 1 - i).row(j); + float* outptr = top_blob.channel(i).row(j); + for (int k = 0; k < w; k++) + { + outptr[k] = ptr[w - 1 - k]; + } + } + } + } + else if (axis_sum == 3) + { + // 对应hc + for (int i = 0; i < channels; i++) + { + for (int j = 0; j < h; j++) + { + const float* ptr = bottom_blob.channel(i).row(h - 1 - j); + float* outptr = top_blob.channel(i).row(j); + + // 增加左右翻转 + for (int k = 0; k < w; k++) + { + outptr[k] = ptr[w - 1 - k]; + } + } + } + } + } + else + { + // whc + for (int i = 0; i < channels; i++) + { + for (int j = 0; j < h; j++) + { + const float* ptr = bottom_blob.channel(channels - 1 - i).row(h - 1 - j); + float* outptr = top_blob.channel(i).row(j); + + // 左右翻转实现完全倒序 + for (int k = 0; k < w; k++) + { + outptr[k] = ptr[w - 1 - k]; + } + } + } + } + } + else if (dims == 4) + { + top_blob.create(w, h, d, channels, elemsize, opt.blob_allocator); + if (axis.w == 1) + { + // w、h、d、c + int axis0 = axis_ptr[0] < 0 ? 4 + axis_ptr[0] : axis_ptr[0]; + if (axis0 == 0) + { + // -4/0 整体上下翻转 torch中按c维度翻转 + for (int c = 0; c < channels; c++) // 遍历channels=3 + { + int flipped_c = channels - 1 - c; // 计算channels翻转位置 + + for (int z = 0; z < d; z++) // 遍历d=2维度 + { + for (int j = 0; j < h; j++) // 遍历行 + { + const float* ptr = bottom_blob.channel(c).row(z * h + j); + float* outptr = const_cast(top_blob.channel(flipped_c).row(z * h + j)); + memcpy(outptr, ptr, w * sizeof(float)); + } + } + } + } + else if (axis0 == 1) + { + // -3/1 torh中按d维度内部上下翻转 + for (int i = 0; i < channels; i++) // 遍历channels + { + for (int z = 0; z < d; z++) // 遍历d维度 + { + for (int j = 0; j < h; j++) // 遍历h维度 + { + // 翻转d维度的数据读取位置 + const float* ptr = bottom_blob.channel(i).row((d - 1 - z) * h + j); + float* outptr = const_cast(top_blob.channel(i).row(z * h + j)); + // 逐行复制w元素 + memcpy(outptr, ptr, w * sizeof(float)); + } + } + } + } + else if (axis0 == 2) + { + // -2/2 按torch中H维度翻转 上下 + for (int i = 0; i < channels; i++) + { + for (int z = 0; z < d; z++) + { + for (int j = 0; j < h; j++) + { + const float* ptr = bottom_blob.channel(i).row(z * h + (h - 1 - j)); + float* outptr = top_blob.channel(i).row(z * h + j); + memcpy(outptr, ptr, w * sizeof(float)); + } + } + } + } + else + { + // -1/3 按torch中W维度翻转 左右 + for (int i = 0; i < channels; i++) + { + for (int z = 0; z < d; z++) + { + for (int j = 0; j < h; j++) + { + const float* ptr = bottom_blob.channel(i).row(z * h + j); + float* outptr = top_blob.channel(i).row(z * h + j); + for (int k = 0; k < w; k++) + { + outptr[k] = ptr[w - 1 - k]; + } + } + } + } + } + } + else if (axis.w == 2) + { + // dc1、dh2、dw3、ch3、cw4、hw5 + int axis0 = axis_ptr[0] < 0 ? 4 + axis_ptr[0] : axis_ptr[0]; + int axis1 = axis_ptr[1] < 0 ? 
4 + axis_ptr[1] : axis_ptr[1]; + int axis_sum = axis0 + axis1; + if (axis_sum == 1) + { + // 对应dc + for (int c = 0; c < channels; c++) // 遍历channels + { + int flipped_c = channels - 1 - c; // 翻转后的channel位置 + + for (int z = 0; z < d; z++) // 遍历d维度 + { + int flipped_d = d - 1 - z; // 翻转后的d位置 + + for (int j = 0; j < h; j++) // 遍历行 + { + const float* ptr = bottom_blob.channel(c).row(z * h + j); + float* outptr = const_cast(top_blob.channel(flipped_c).row(flipped_d * h + j)); + memcpy(outptr, ptr, w * sizeof(float)); + } + } + } + } + else if (axis_sum == 2) + { + // 对应dh + for (int c = 0; c < channels; c++) // 遍历 channels=2 维度 + { + int flipped_c = channels - 1 - c; // 计算 c 维度翻转位置 (0→1, 1→0) + + for (int z = 0; z < d; z++) // 遍历 d=3 维度 + { + // 按翻转顺序逐行复制 h 维度数据 + for (int i = 0; i < h; i++) + { + const float* ptr = bottom_blob.channel(c).row(z * h + i); + float* outptr = const_cast(top_blob.channel(flipped_c).row(z * h + (h - 1 - i))); // 保持z维度顺序,翻转h维度 + memcpy(outptr, ptr, w * sizeof(float)); // 按行复制,保持 w 维度顺序 + } + } + } + } + else if (axis_sum == 3) + { + // 对应dw;有一个为0或3 + if (axis0 == 0 || axis0 == 3) + { + // 对应dw + for (int c = 0; c < channels; c++) + { + int flipped_c = channels - 1 - c; // 翻转c维度 + + for (int z = 0; z < d; z++) // d维度保持不变 + { + for (int j = 0; j < h; j++) // h维度保持不变 + { + const float* ptr = bottom_blob.channel(c).row(z * h + j); + float* outptr = const_cast(top_blob.channel(flipped_c).row(z * h + j)); + + // 翻转w维度 + for (int k = 0; k < w; k++) + { + outptr[k] = ptr[w - 1 - k]; + } + } + } + } + } + else + { + // 对应ch + for (int c = 0; c < channels; c++) + { + for (int z = 0; z < d; z++) + { + int flipped_d = d - 1 - z; + + for (int j = 0; j < h; j++) + { + int flipped_h = h - 1 - j; + // 读取源数据 + const float* ptr = bottom_blob.channel(c).row(z * h + j); + float* outptr = const_cast(top_blob.channel(c).row(flipped_d * h + flipped_h)); + memcpy(outptr, ptr, w * sizeof(float)); + } + } + } + } + } + else if (axis_sum == 4) + { + // 对应cw + for (int c = 0; c < channels; c++) + { + for (int z = 0; z < d; z++) + { + int flipped_d = d - 1 - z; // 翻转 d 维度 + + for (int j = 0; j < h; j++) + { + const float* ptr = bottom_blob.channel(c).row(z * h + j); + float* outptr = const_cast(top_blob.channel(c).row(flipped_d * h + j)); // c维度保持不变 + + // 翻转 w 维度 + for (int k = 0; k < w; k++) + { + outptr[k] = ptr[w - 1 - k]; + } + } + } + } + } + else + { + // 对应hw + for (int c = 0; c < channels; c++) + { + for (int z = 0; z < d; z++) + { + for (int j = 0; j < h; j++) + { + const float* ptr = bottom_blob.channel(c).row(z * h + j); + float* outptr = const_cast(top_blob.channel(c).row(z * h + (h - 1 - j))); // 翻转 h 维度 + + // 翻转 w 维度 + for (int k = 0; k < w; k++) + { + outptr[k] = ptr[w - 1 - k]; + } + } + } + } + } + } + else if (axis.w == 3) + { + // dch3、dcw4、chw6 + int axis0 = axis_ptr[0] < 0 ? 4 + axis_ptr[0] : axis_ptr[0]; + int axis1 = axis_ptr[1] < 0 ? 4 + axis_ptr[1] : axis_ptr[1]; + int axis2 = axis_ptr[2] < 0 ? 
4 + axis_ptr[2] : axis_ptr[2]; + int axis_sum = axis0 + axis1 + axis2; + if (axis_sum == 3) + { + // 对应dch,除w外,其余全翻转 + for (int c = 0; c < channels; c++) + { + int flipped_c = channels - 1 - c; // 翻转c维度 + + for (int z = 0; z < d; z++) + { + int flipped_d = d - 1 - z; // 翻转d维度 + + for (int i = 0; i < h; i++) + { + const float* ptr = bottom_blob.channel(c).row(z * h + i); + float* outptr = const_cast(top_blob.channel(flipped_c).row(flipped_d * h + (h - 1 - i))); // 翻转h维度 + memcpy(outptr, ptr, w * sizeof(float)); // w维度保持不变 + } + } + } + } + else if (axis_sum == 4) + { + // 对应dcw,除h外,其余全翻转 + for (int c = 0; c < channels; c++) + { + int flipped_c = channels - 1 - c; // 翻转c维度 + + for (int z = 0; z < d; z++) + { + int flipped_d = d - 1 - z; // 翻转d维度 + + for (int i = 0; i < h; i++) + { + const float* ptr = bottom_blob.channel(c).row(z * h + i); + float* outptr = const_cast(top_blob.channel(flipped_c).row(flipped_d * h + i)); // h维度保持不变 + + // 翻转w维度 + for (int k = 0; k < w; k++) + { + outptr[k] = ptr[w - 1 - k]; + } + } + } + } + } + else if (axis_sum == 6) + { + // 对应chw,除了c外全翻转 + for (int c = 0; c < channels; c++) // c维度保持不变 + { + for (int z = 0; z < d; z++) + { + int flipped_d = d - 1 - z; // 翻转d维度 + + for (int i = 0; i < h; i++) + { + const float* ptr = bottom_blob.channel(c).row(z * h + i); + float* outptr = const_cast(top_blob.channel(c).row(flipped_d * h + (h - 1 - i))); // 翻转h维度 + + // 翻转w维度 + for (int k = 0; k < w; k++) + { + outptr[k] = ptr[w - 1 - k]; + } + } + } + } + } + } + else + { + // dchw全部翻转 + for (int c = 0; c < channels; c++) + { + int flipped_c = channels - 1 - c; // 翻转c维度 + + for (int z = 0; z < d; z++) + { + int flipped_d = d - 1 - z; // 翻转d维度 + + for (int i = 0; i < h; i++) + { + const float* ptr = bottom_blob.channel(c).row(z * h + i); + float* outptr = const_cast(top_blob.channel(flipped_c).row(flipped_d * h + (h - 1 - i))); // 翻转h维度 + + // 翻转w维度 + for (int k = 0; k < w; k++) + { + outptr[k] = ptr[w - 1 - k]; + } + } + } + } + } + } + else + { + return -1; + } + return 0; } diff --git a/src/layer/flip.h b/src/layer/flip.h index b75bf5e68ef..61a05d4538a 100644 --- a/src/layer/flip.h +++ b/src/layer/flip.h @@ -29,7 +29,7 @@ class Flip : public Layer virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; public: - Mat axis; // 维度翻转 + Mat axis; // 翻转维度 }; } // namespace ncnn diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index f55859e736e..48853470d3f 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -105,6 +105,7 @@ ncnn_add_layer_test(Embed) ncnn_add_layer_test(Erf) ncnn_add_layer_test(ExpandDims) ncnn_add_layer_test(Flatten) +ncnn_add_layer_test(Flip) ncnn_add_layer_test(Fold) ncnn_add_layer_test(GELU) ncnn_add_layer_test(GLU) diff --git a/tests/test_flip.cpp b/tests/test_flip.cpp new file mode 100644 index 00000000000..55795caadc1 --- /dev/null +++ b/tests/test_flip.cpp @@ -0,0 +1,79 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "layer.h"
+#include "testutil.h"
+
+static int test_flip(const ncnn::Mat& a, std::vector<int> axis)
+{
+    ncnn::Mat axis_mat(axis.size());
+    for (size_t i = 0; i < axis.size(); i++)
+    {
+        axis_mat[i] = axis[i];
+    }
+    ncnn::ParamDict pd;
+    pd.set(0, axis_mat); // axis
+
+    std::vector<ncnn::Mat> weights(0);
+
+    int ret = test_layer("Flip", pd, weights, a);
+    if (ret != 0)
+    {
+        fprintf(stderr, "test_flip failed a.dims=%d a=(%d %d %d %d)\n", a.dims, a.w, a.h, a.d, a.c);
+    }
+
+    return ret;
+}
+
+static int test_flip_0()
+{
+    return 0
+           || test_flip(RandomMat(3, 2, 6, 7), {0})
+           || test_flip(RandomMat(3, 2, 6, 7), {0, 1})
+           || test_flip(RandomMat(3, 2, 6, 7), {0, 2})
+           || test_flip(RandomMat(3, 2, 6, 7), {0, 3});
+}
+
+static int test_flip_1()
+{
+    return 0
+           || test_flip(RandomMat(2, 3, 5), {0})
+           || test_flip(RandomMat(4, 2, 5), {0, 1})
+           || test_flip(RandomMat(3, 4, 2), {0, 1, 2});
+}
+
+static int test_flip_2()
+{
+    return 0
+           || test_flip(RandomMat(8, 2), {-2})
+           || test_flip(RandomMat(16, 3), {-2, -1});
+}
+
+static int test_flip_3()
+{
+    return 0
+           || test_flip(RandomMat(16), {-1})
+           || test_flip(RandomMat(32), {0});
+}
+
+int main()
+{
+    SRAND(7767517);
+
+    return 0
+           || test_flip_0()
+           || test_flip_1()
+           || test_flip_2()
+           || test_flip_3();
+}
\ No newline at end of file
diff --git a/tools/pnnx/tests/ncnn/CMakeLists.txt b/tools/pnnx/tests/ncnn/CMakeLists.txt
index 42c3bed32e0..54c8896ef77 100644
--- a/tools/pnnx/tests/ncnn/CMakeLists.txt
+++ b/tools/pnnx/tests/ncnn/CMakeLists.txt
@@ -188,6 +188,7 @@ pnnx_ncnn_add_test(torch_clamp)
 pnnx_ncnn_add_test(torch_cos)
 pnnx_ncnn_add_test(torch_exp)
 pnnx_ncnn_add_test(torch_floor)
+pnnx_ncnn_add_test(torch_flip)
 pnnx_ncnn_add_test(torch_log)
 pnnx_ncnn_add_test(torch_log10)
 pnnx_ncnn_add_test(torch_maximum)
diff --git a/tools/pnnx/tests/ncnn/test_torch_flip.py b/tools/pnnx/tests/ncnn/test_torch_flip.py
new file mode 100644
index 00000000000..4c9702cc505
--- /dev/null
+++ b/tools/pnnx/tests/ncnn/test_torch_flip.py
@@ -0,0 +1,151 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations under the License. +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z, d): + # 1D + x0 = torch.flip(x, [0]) + # 2D + y0 = torch.flip(y, [0]) + y1 = torch.flip(y, [1]) + y2 = torch.flip(y, [-2, -1]) + # 3D + z0 = torch.flip(z, [0]) + z1 = torch.flip(z, [1]) + z2 = torch.flip(z, [2]) + z3 = torch.flip(z, [0, 1]) + z4 = torch.flip(z, [0, 2]) + z5 = torch.flip(z, [1, 2]) + # 4D + d0 = torch.flip(d, [-1]) + d1 = torch.flip(d, [-2]) + d2 = torch.flip(d, [-3]) + d3 = torch.flip(d, [-4]) + d4 = torch.flip(d, [0, 1]) + d5 = torch.flip(d, [0, 2]) + d6 = torch.flip(d, [0, 3]) + d7 = torch.flip(d, [1, 2]) + d8 = torch.flip(d, [1, 3]) + d9 = torch.flip(d, [2, 3]) + d10 = torch.flip(d, [0, 1, 2]) + d11 = torch.flip(d, [0, 1, 3]) + d12 = torch.flip(d, [1, 2, 3]) + d13 = torch.flip(d, [0, 1, 2, 3]) + + return ( + x0, + y0, + y1, + y2, + z0, + z1, + z2, + z3, + z4, + z5, + d0, + d1, + d2, + d3, + d4, + d5, + d6, + d7, + d8, + d9, + d10, + d11, + d12, + d13, + ) + + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(36) # 1D + y = torch.rand(4, 7) # 2D + z = torch.rand(3, 4, 5) # 3D + d = torch.rand(4, 2, 6, 7) # 4D + + a = net(x, y, z, d) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z, d)) + mod.save("test_torch_flip.pt") + + # torchscript to pnnx + import os + + os.system( + "../../src/pnnx test_torch_flip.pt inputshape=[36],[4,7],[3,4,5],[4,2,6,7]" + ) + + # pnnx inference + import test_torch_flip_ncnn + + b = test_torch_flip_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-3, 1e-3): + return False + return True + + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) From 3cce5b4850d13cce49ae7ce4962d3ad629e25d5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=B0=E9=98=85?= <43716063+Baiyuetribe@users.noreply.github.com> Date: Mon, 13 Jan 2025 13:20:13 +0800 Subject: [PATCH 03/14] test_ctest --- .github/workflows/linux-x64-cpu-gcc.yml | 220 ++++++++++++------------ src/layer/flip.cpp | 3 +- 2 files changed, 112 insertions(+), 111 deletions(-) diff --git a/.github/workflows/linux-x64-cpu-gcc.yml b/.github/workflows/linux-x64-cpu-gcc.yml index ab2185be3e7..580000b498e 100644 --- a/.github/workflows/linux-x64-cpu-gcc.yml +++ b/.github/workflows/linux-x64-cpu-gcc.yml @@ -1,33 +1,33 @@ name: linux-x64-cpu-gcc on: push: - branches: [master] + # branches: [master] paths: - - '.github/workflows/linux-x64-cpu-gcc.yml' - - 'toolchains/host-c.gcc.toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - - 'tests/**' - - 'tools/**' - - '!tools/pnnx/**' - - 
'examples/**' + - ".github/workflows/linux-x64-cpu-gcc.yml" + - "toolchains/host-c.gcc.toolchain.cmake" + - "CMakeLists.txt" + - "cmake/**" + - "src/*" + - "src/layer/*" + - "src/layer/x86/**" + - "tests/**" + - "tools/**" + - "!tools/pnnx/**" + - "examples/**" pull_request: - branches: [master] + # branches: [master] paths: - - '.github/workflows/linux-x64-cpu-gcc.yml' - - 'toolchains/host-c.gcc.toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - - 'tests/**' - - 'tools/**' - - '!tools/pnnx/**' - - 'examples/**' + - ".github/workflows/linux-x64-cpu-gcc.yml" + - "toolchains/host-c.gcc.toolchain.cmake" + - "CMakeLists.txt" + - "cmake/**" + - "src/*" + - "src/layer/*" + - "src/layer/x86/**" + - "tests/**" + - "tools/**" + - "!tools/pnnx/**" + - "examples/**" concurrency: group: linux-x64-cpu-gcc-${{ github.ref }} cancel-in-progress: true @@ -38,97 +38,97 @@ jobs: linux-gcc: runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v4 - - name: update - run: sudo apt-get update - - name: protobuf - run: sudo apt-get install libprotobuf-dev protobuf-compiler libopencv-dev - - name: build-sse2 - run: | - mkdir build-sse2 && cd build-sse2 - cmake -DNCNN_AVX=OFF -DNCNN_AVX2=OFF -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j $(nproc) - - name: test-sse2 - run: cd build-sse2 && ctest --output-on-failure -j $(nproc) - - name: build-shared - run: | - mkdir build-shared && cd build-shared - cmake -DNCNN_AVX2=ON -DNCNN_SHARED_LIB=ON .. - cmake --build . -j $(nproc) - - name: build-avx2 - run: | - mkdir build-avx2 && cd build-avx2 - cmake -DNCNN_AVX2=ON -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j $(nproc) - - name: test-avx2 - run: cd build-avx2 && ctest --output-on-failure -j $(nproc) - - name: build-avx - run: | - mkdir build-avx && cd build-avx - cmake -DNCNN_AVX2=OFF -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j $(nproc) - - name: test-avx - run: cd build-avx && ctest --output-on-failure -j $(nproc) - - name: build-avx1-2 - run: | - mkdir build-avx1-2 && cd build-avx1-2 - cmake -DNCNN_AVX2=ON -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j $(nproc) - - name: test-avx1-2 - run: cd build-avx1-2 && ctest --output-on-failure -j $(nproc) - - name: build-noint8 - run: | - mkdir build-noint8 && cd build-noint8 - cmake -DNCNN_INT8=OFF -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j $(nproc) - - name: test-noint8 - run: cd build-noint8 && ctest --output-on-failure -j $(nproc) + - uses: actions/checkout@v4 + - name: update + run: sudo apt-get update + - name: protobuf + run: sudo apt-get install libprotobuf-dev protobuf-compiler libopencv-dev + - name: build-sse2 + run: | + mkdir build-sse2 && cd build-sse2 + cmake -DNCNN_AVX=OFF -DNCNN_AVX2=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j $(nproc) + - name: test-sse2 + run: cd build-sse2 && ctest --output-on-failure -j $(nproc) + - name: build-shared + run: | + mkdir build-shared && cd build-shared + cmake -DNCNN_AVX2=ON -DNCNN_SHARED_LIB=ON .. + cmake --build . -j $(nproc) + - name: build-avx2 + run: | + mkdir build-avx2 && cd build-avx2 + cmake -DNCNN_AVX2=ON -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j $(nproc) + - name: test-avx2 + run: cd build-avx2 && ctest --output-on-failure -j $(nproc) + - name: build-avx + run: | + mkdir build-avx && cd build-avx + cmake -DNCNN_AVX2=OFF -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON .. + cmake --build . 
-j $(nproc) + - name: test-avx + run: cd build-avx && ctest --output-on-failure -j $(nproc) + - name: build-avx1-2 + run: | + mkdir build-avx1-2 && cd build-avx1-2 + cmake -DNCNN_AVX2=ON -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j $(nproc) + - name: test-avx1-2 + run: cd build-avx1-2 && ctest --output-on-failure -j $(nproc) + - name: build-noint8 + run: | + mkdir build-noint8 && cd build-noint8 + cmake -DNCNN_INT8=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j $(nproc) + - name: test-noint8 + run: cd build-noint8 && ctest --output-on-failure -j $(nproc) linux-gcc-cpp03-nostdio-nostring-simplestl: runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v4 - - name: build-nostdio - run: | - mkdir build-nostdio && cd build-nostdio - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-c++03.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. - cmake --build . -j $(nproc) - - name: test-nostdio - run: cd build-nostdio && ctest --output-on-failure -j $(nproc) - - name: build-nostdio-nostring - run: | - mkdir build-nostdio-nostring && cd build-nostdio-nostring - cmake -DNCNN_STDIO=OFF -DNCNN_STRING=OFF -DNCNN_BUILD_TESTS=OFF -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. - cmake --build . -j $(nproc) - - name: build-simplestl - run: | - mkdir build-simplestl && cd build-simplestl - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.gcc.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. - cmake --build . -j $(nproc) - - name: test-simplestl - run: cd build-simplestl && ctest --output-on-failure -j $(nproc) - - name: build-simplestl-simpleomp - run: | - mkdir build-simplestl-simpleomp && cd build-simplestl-simpleomp - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.gcc.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_SIMPLEOMP=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. - cmake --build . -j $(nproc) - - name: test-simplestl-simpleomp - run: cd build-simplestl-simpleomp && ctest --output-on-failure -j $(nproc) + - uses: actions/checkout@v4 + - name: build-nostdio + run: | + mkdir build-nostdio && cd build-nostdio + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-c++03.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . -j $(nproc) + - name: test-nostdio + run: cd build-nostdio && ctest --output-on-failure -j $(nproc) + - name: build-nostdio-nostring + run: | + mkdir build-nostdio-nostring && cd build-nostdio-nostring + cmake -DNCNN_STDIO=OFF -DNCNN_STRING=OFF -DNCNN_BUILD_TESTS=OFF -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . -j $(nproc) + - name: build-simplestl + run: | + mkdir build-simplestl && cd build-simplestl + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.gcc.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . 
-j $(nproc) + - name: test-simplestl + run: cd build-simplestl && ctest --output-on-failure -j $(nproc) + - name: build-simplestl-simpleomp + run: | + mkdir build-simplestl-simpleomp && cd build-simplestl-simpleomp + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.gcc.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_SIMPLEOMP=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . -j $(nproc) + - name: test-simplestl-simpleomp + run: cd build-simplestl-simpleomp && ctest --output-on-failure -j $(nproc) linux-gcc-avx512: runs-on: [self-hosted, linux, t4] steps: - - uses: actions/checkout@v4 - - name: build - env: - CC: gcc - CXX: g++ - LD_LIBRARY_PATH: /data/action/install/lib64 - run: | - mkdir build && cd build - cmake -DNCNN_AVX2=ON -DNCNN_AVX512=ON -DNCNN_AVX512VNNI=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. - cmake --build . -j 4 - - name: test - env: - LD_LIBRARY_PATH: /data/action/install/lib64 - run: cd build && ctest --output-on-failure -j 4 + - uses: actions/checkout@v4 + - name: build + env: + CC: gcc + CXX: g++ + LD_LIBRARY_PATH: /data/action/install/lib64 + run: | + mkdir build && cd build + cmake -DNCNN_AVX2=ON -DNCNN_AVX512=ON -DNCNN_AVX512VNNI=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . -j 4 + - name: test + env: + LD_LIBRARY_PATH: /data/action/install/lib64 + run: cd build && ctest --output-on-failure -j 4 diff --git a/src/layer/flip.cpp b/src/layer/flip.cpp index 6757b853f80..15ddc7e05a8 100644 --- a/src/layer/flip.cpp +++ b/src/layer/flip.cpp @@ -180,7 +180,8 @@ int Flip::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons // 组合两种翻转:channel维度和行维度同时翻转 const float* ptr = bottom_blob.channel(channels - 1 - i).row(h - 1 - j); float* outptr = top_blob.channel(i).row(j); - memcpy(outptr, ptr, w * sizeof(float)); + // memcpy(outptr, ptr, w * sizeof(float)); ctest修复测试 + memcpy(outptr, ptr, w * elemsize); } } } From eae435d27e60866972162c4645af94a24a2b0a29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=B0=E9=98=85?= <43716063+Baiyuetribe@users.noreply.github.com> Date: Mon, 13 Jan 2025 13:47:56 +0800 Subject: [PATCH 04/14] ctest char --- src/layer/flip.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/layer/flip.cpp b/src/layer/flip.cpp index 15ddc7e05a8..86475d8b9b7 100644 --- a/src/layer/flip.cpp +++ b/src/layer/flip.cpp @@ -166,28 +166,28 @@ int Flip::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons } else if (axis.w == 2) { - // wh、wc、hc + // ch、cw、hw int axis0 = axis_ptr[0] < 0 ? 3 + axis_ptr[0] : axis_ptr[0]; int axis1 = axis_ptr[1] < 0 ? 
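// Note: for 3D blobs a negative axis is wrapped by adding the number of
// dimensions, so -3 maps to 0 (channels), -2 to 1 (rows) and -1 to 2 (columns).
// After normalization the pair of flipped axes is identified by its sum, which
// is unique among two-element subsets of {0, 1, 2}:
//   {0, 1} -> axis_sum 1 : flip channels and rows
//   {0, 2} -> axis_sum 2 : flip channels and columns
//   {1, 2} -> axis_sum 3 : flip rows and columns
// A minimal sketch of the same wrap-around, using a hypothetical helper name:
//   static int normalize_axis(int a, int dims) { return a < 0 ? a + dims : a; }
//   // normalize_axis(-2, 3) == 1, normalize_axis(-1, 3) == 2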
3 + axis_ptr[1] : axis_ptr[1]; int axis_sum = axis0 + axis1; if (axis_sum == 1) { - // 对应wh + // 对应ch for (int i = 0; i < channels; i++) { for (int j = 0; j < h; j++) { // 组合两种翻转:channel维度和行维度同时翻转 - const float* ptr = bottom_blob.channel(channels - 1 - i).row(h - 1 - j); - float* outptr = top_blob.channel(i).row(j); - // memcpy(outptr, ptr, w * sizeof(float)); ctest修复测试 + const unsigned char* ptr = bottom_blob.channel(channels - 1 - i).row(h - 1 - j); + unsigned char* outptr = top_blob.channel(i).row(j); + // memcpy(outptr, ptr, w * sizeof(float)); memcpy(outptr, ptr, w * elemsize); } } } else if (axis_sum == 2) { - // 对应wc + // 对应cw for (int i = 0; i < channels; i++) { for (int j = 0; j < h; j++) @@ -203,7 +203,7 @@ int Flip::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons } else if (axis_sum == 3) { - // 对应hc + // 对应hw for (int i = 0; i < channels; i++) { for (int j = 0; j < h; j++) From d8424f8ab8a7d31f7e649aff4a2bbcb2c0910a53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=B0=E9=98=85?= <43716063+Baiyuetribe@users.noreply.github.com> Date: Mon, 13 Jan 2025 14:14:51 +0800 Subject: [PATCH 05/14] ctest 2 --- src/layer/flip.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/layer/flip.cpp b/src/layer/flip.cpp index 86475d8b9b7..4cf4dcb9f46 100644 --- a/src/layer/flip.cpp +++ b/src/layer/flip.cpp @@ -178,10 +178,12 @@ int Flip::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons for (int j = 0; j < h; j++) { // 组合两种翻转:channel维度和行维度同时翻转 - const unsigned char* ptr = bottom_blob.channel(channels - 1 - i).row(h - 1 - j); - unsigned char* outptr = top_blob.channel(i).row(j); + const float* ptr = bottom_blob.channel(channels - 1 - i).row(h - 1 - j); + float* outptr = top_blob.channel(i).row(j); // memcpy(outptr, ptr, w * sizeof(float)); - memcpy(outptr, ptr, w * elemsize); + // memcpy(outptr, ptr, w * elemsize); + for (int x = 0; x < w; x++) + outptr[x] = ptr[x]; } } } From 5be32e11b7427b7d0c6073962b59464ff363bdd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=B0=E9=98=85?= <43716063+Baiyuetribe@users.noreply.github.com> Date: Mon, 13 Jan 2025 21:00:39 +0800 Subject: [PATCH 06/14] ctest 3 --- src/layer/flip.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/layer/flip.cpp b/src/layer/flip.cpp index 4cf4dcb9f46..ad8deac089a 100644 --- a/src/layer/flip.cpp +++ b/src/layer/flip.cpp @@ -179,11 +179,8 @@ int Flip::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons { // 组合两种翻转:channel维度和行维度同时翻转 const float* ptr = bottom_blob.channel(channels - 1 - i).row(h - 1 - j); - float* outptr = top_blob.channel(i).row(j); - // memcpy(outptr, ptr, w * sizeof(float)); - // memcpy(outptr, ptr, w * elemsize); - for (int x = 0; x < w; x++) - outptr[x] = ptr[x]; + float* outptr = const_cast(top_blob.channel(i).row(j)); + memcpy(outptr, ptr, w * sizeof(float)); } } } From 0a3aa28493177e26b9fc468441abd6293c0ae5ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=B0=E9=98=85?= <43716063+Baiyuetribe@users.noreply.github.com> Date: Mon, 13 Jan 2025 21:20:09 +0800 Subject: [PATCH 07/14] ctest 4 --- src/layer/flip.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/layer/flip.cpp b/src/layer/flip.cpp index ad8deac089a..5ec94816247 100644 --- a/src/layer/flip.cpp +++ b/src/layer/flip.cpp @@ -116,6 +116,8 @@ int Flip::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons else if (dims == 3) { top_blob.create(w, h, channels, elemsize, opt.blob_allocator); + if (top_blob.empty()) + 
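// Note: create() leaves the Mat empty when allocation fails, so this guard
// returns -100, the error code used throughout ncnn layers for a failed blob
// allocation, instead of writing through null row pointers further down.
// A caller sees it as a nonzero forward() result, e.g. (illustrative only,
// flip_layer is a placeholder name):
//   int ret = flip_layer->forward(bottom, top, opt);
//   if (ret != 0) { /* -100 here means top could not be allocated */ }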
return -100; if (axis.w == 1) { // w、h、c From 3ac68cd6e3bbe7f98de966c7a9010b59dc03249d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=B0=E9=98=85?= <43716063+Baiyuetribe@users.noreply.github.com> Date: Mon, 13 Jan 2025 21:58:29 +0800 Subject: [PATCH 08/14] fix less c++11 --- .github/workflows/linux-x64-cpu-gcc.yml | 2 +- tests/test_flip.cpp | 93 +++++++++++++++++++------ 2 files changed, 74 insertions(+), 21 deletions(-) diff --git a/.github/workflows/linux-x64-cpu-gcc.yml b/.github/workflows/linux-x64-cpu-gcc.yml index 580000b498e..31abbe47c25 100644 --- a/.github/workflows/linux-x64-cpu-gcc.yml +++ b/.github/workflows/linux-x64-cpu-gcc.yml @@ -15,7 +15,7 @@ on: - "!tools/pnnx/**" - "examples/**" pull_request: - # branches: [master] + branches: [master] paths: - ".github/workflows/linux-x64-cpu-gcc.yml" - "toolchains/host-c.gcc.toolchain.cmake" diff --git a/tests/test_flip.cpp b/tests/test_flip.cpp index 55795caadc1..080de2c16b0 100644 --- a/tests/test_flip.cpp +++ b/tests/test_flip.cpp @@ -15,22 +15,61 @@ #include "layer.h" #include "testutil.h" -static int test_flip(const ncnn::Mat& a, std::vector axis) +// 为兼容低于c++11弃用如下实现 +// ncnn::Mat axis_mat(axis.size()); +// for (size_t i = 0; i < axis.size(); i++) +// { +// axis_mat[i] = axis[i]; +// } +static ncnn::Mat IntArrayMat(int a0) +{ + ncnn::Mat m(1); + int* p = m; + p[0] = a0; + return m; +} + +static ncnn::Mat IntArrayMat(int a0, int a1) +{ + ncnn::Mat m(2); + int* p = m; + p[0] = a0; + p[1] = a1; + return m; +} + +static ncnn::Mat IntArrayMat(int a0, int a1, int a2) +{ + ncnn::Mat m(3); + int* p = m; + p[0] = a0; + p[1] = a1; + p[2] = a2; + return m; +} + +static ncnn::Mat IntArrayMat(int a0, int a1, int a2, int a3) +{ + ncnn::Mat m(4); + int* p = m; + p[0] = a0; + p[1] = a1; + p[2] = a2; + p[3] = a3; + return m; +} + +static int test_flip(const ncnn::Mat& a, const ncnn::Mat& axis) { - ncnn::Mat axis_mat(axis.size()); - for (size_t i = 0; i < axis.size(); i++) - { - axis_mat[i] = axis[i]; - } ncnn::ParamDict pd; - pd.set(0, axis_mat); // axis + pd.set(0, axis); std::vector weights(0); int ret = test_layer("Flip", pd, weights, a); if (ret != 0) { - fprintf(stderr, "test_flip failed a.dims=%d a=(%d %d %d) axis=", a.dims, a.w, a.h, a.c); + fprintf(stderr, "test_flip failed a.dims=%d a=(%d %d %d) axis_w=%d", a.dims, a.w, a.h, a.c, axis.w); } return ret; @@ -39,38 +78,52 @@ static int test_flip(const ncnn::Mat& a, std::vector axis) static int test_flip_0() { return 0 - || test_flip(RandomMat(3, 2, 6, 7), {0}) - || test_flip(RandomMat(3, 2, 6, 7), {0, 1}) - || test_flip(RandomMat(3, 2, 6, 7), {0, 2}) - || test_flip(RandomMat(3, 2, 6, 7), {0, 3}); + || test_flip(RandomMat(2, 3, 4, 5), IntArrayMat(0)) + || test_flip(RandomMat(3, 2, 4, 5), IntArrayMat(1)) + || test_flip(RandomMat(4, 3, 2, 5), IntArrayMat(2)) + || test_flip(RandomMat(2, 3, 1, 5), IntArrayMat(3)) + || test_flip(RandomMat(6, 3, 4, 5), IntArrayMat(0, 1)) + || test_flip(RandomMat(2, 3, 1, 6), IntArrayMat(0, 2)) + || test_flip(RandomMat(5, 1, 2, 5), IntArrayMat(0, 3)) + || test_flip(RandomMat(5, 2, 1, 5), IntArrayMat(1, 2)) + || test_flip(RandomMat(4, 5, 2, 3), IntArrayMat(1, 3)) + || test_flip(RandomMat(2, 6, 4, 5), IntArrayMat(2, 3)) + || test_flip(RandomMat(6, 1, 4, 5), IntArrayMat(0, 1, 2)) + || test_flip(RandomMat(5, 2, 1, 5), IntArrayMat(0, 1, 3)) + || test_flip(RandomMat(4, 3, 3, 5), IntArrayMat(0, 2, 3)) + || test_flip(RandomMat(4, 3, 4, 5), IntArrayMat(1, 2, 3)) + || test_flip(RandomMat(6, 3, 3, 2), IntArrayMat(0, 1, 2, 3)); } static int test_flip_1() { return 0 - || 
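// Note: the IntArrayMat overloads above replace the brace-initialized
// std::vector arguments used by the first version of this test, because
// initializer lists need C++11 and the CI also builds the tests with the
// host.gcc-c++03 toolchain. Each overload packs 1 to 4 ints into a 1 x n
// ncnn::Mat, which ParamDict stores as param id 0 (axis). Usage sketch,
// mirroring the calls in test_flip_0:
//   ncnn::ParamDict pd;
//   pd.set(0, IntArrayMat(0, 2)); // flip along axes 0 and 2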
test_flip(RandomMat(2, 3, 5), {0}) - || test_flip(RandomMat(4, 2, 5), {0, 1}) - || test_flip(RandomMat(3, 4, 2), {0, 1, 2}); + || test_flip(RandomMat(2, 3, 5), IntArrayMat(0)) + || test_flip(RandomMat(3, 3, 5), IntArrayMat(1)) + || test_flip(RandomMat(4, 3, 5), IntArrayMat(2)) + || test_flip(RandomMat(3, 1, 5), IntArrayMat(0, 1)) + || test_flip(RandomMat(3, 2, 5), IntArrayMat(0, 2)) + || test_flip(RandomMat(3, 3, 4), IntArrayMat(1, 2)) + || test_flip(RandomMat(4, 3, 2), IntArrayMat(0, 1, 2)); } static int test_flip_2() { return 0 - || test_flip(RandomMat(8, 2), {-2}) - || test_flip(RandomMat(16, 3), {-2, -1}); + || test_flip(RandomMat(8, 2), IntArrayMat(-2)) + || test_flip(RandomMat(16, 3), IntArrayMat(-1)) + || test_flip(RandomMat(7, 2), IntArrayMat(-2, -1)); } static int test_flip_3() { return 0 - || test_flip(RandomMat(16), {-1}) - || test_flip(RandomMat(32), {0}); + || test_flip(RandomMat(18), IntArrayMat(-1)); } int main() { SRAND(7767517); - return 0 || test_flip_0() || test_flip_1() From daf95a0646dae24c6c46ab86d61fb70c49796aa5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=B0=E9=98=85?= <43716063+Baiyuetribe@users.noreply.github.com> Date: Mon, 13 Jan 2025 22:16:30 +0800 Subject: [PATCH 09/14] ctest 5 --- src/layer/flip.cpp | 1 + tests/test_flip.cpp | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/layer/flip.cpp b/src/layer/flip.cpp index 5ec94816247..1013a72a580 100644 --- a/src/layer/flip.cpp +++ b/src/layer/flip.cpp @@ -458,6 +458,7 @@ int Flip::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons } else if (axis.w == 3) { + return 0; // 在线debug // dch3、dcw4、chw6 int axis0 = axis_ptr[0] < 0 ? 4 + axis_ptr[0] : axis_ptr[0]; int axis1 = axis_ptr[1] < 0 ? 4 + axis_ptr[1] : axis_ptr[1]; diff --git a/tests/test_flip.cpp b/tests/test_flip.cpp index 080de2c16b0..7ebf787a462 100644 --- a/tests/test_flip.cpp +++ b/tests/test_flip.cpp @@ -15,7 +15,7 @@ #include "layer.h" #include "testutil.h" -// 为兼容低于c++11弃用如下实现 +// 为兼容低于c++11 // ncnn::Mat axis_mat(axis.size()); // for (size_t i = 0; i < axis.size(); i++) // { @@ -69,7 +69,7 @@ static int test_flip(const ncnn::Mat& a, const ncnn::Mat& axis) int ret = test_layer("Flip", pd, weights, a); if (ret != 0) { - fprintf(stderr, "test_flip failed a.dims=%d a=(%d %d %d) axis_w=%d", a.dims, a.w, a.h, a.c, axis.w); + fprintf(stderr, "test_flip failed a.dims=%d a=(%d %d %d %d) axis_w=%d\n", a.dims, a.w, a.h, a.d, a.c, axis.w); } return ret; From 8376eb7d3d890342a163e44290a1d4e7941fbfe0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=B0=E9=98=85?= <43716063+Baiyuetribe@users.noreply.github.com> Date: Mon, 13 Jan 2025 22:39:59 +0800 Subject: [PATCH 10/14] ctest 6 --- src/layer/flip.cpp | 20 ++++++++++++-------- tests/test_flip.cpp | 42 +++++++++++++++++++++++++++++++++++++----- 2 files changed, 49 insertions(+), 13 deletions(-) diff --git a/src/layer/flip.cpp b/src/layer/flip.cpp index 1013a72a580..7c571ea7e2e 100644 --- a/src/layer/flip.cpp +++ b/src/layer/flip.cpp @@ -458,7 +458,6 @@ int Flip::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons } else if (axis.w == 3) { - return 0; // 在线debug // dch3、dcw4、chw6 int axis0 = axis_ptr[0] < 0 ? 4 + axis_ptr[0] : axis_ptr[0]; int axis1 = axis_ptr[1] < 0 ? 
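// Note: for 4D blobs negative axes are wrapped by adding 4, and the sum of the
// three normalized axes is unique among three-element subsets of {0, 1, 2, 3},
// so a single integer selects the branch; the sum effectively names the one
// axis that is left untouched:
//   {0, 1, 2} -> axis_sum 3 : axis 3 (w) unchanged
//   {0, 1, 3} -> axis_sum 4 : axis 2 (h) unchanged
//   {0, 2, 3} -> axis_sum 5 : axis 1 unchanged (handled by a later commit in this series)
//   {1, 2, 3} -> axis_sum 6 : axis 0 unchanged
// Reading the loops, axis 0 walks channels, axis 1 the depth extent, axis 2
// rows and axis 3 columns.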
4 + axis_ptr[1] : axis_ptr[1]; @@ -469,17 +468,19 @@ int Flip::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons // 对应dch,除w外,其余全翻转 for (int c = 0; c < channels; c++) { - int flipped_c = channels - 1 - c; // 翻转c维度 + int flipped_c = channels - 1 - c; for (int z = 0; z < d; z++) { - int flipped_d = d - 1 - z; // 翻转d维度 + int flipped_d = d - 1 - z; for (int i = 0; i < h; i++) { - const float* ptr = bottom_blob.channel(c).row(z * h + i); - float* outptr = const_cast(top_blob.channel(flipped_c).row(flipped_d * h + (h - 1 - i))); // 翻转h维度 - memcpy(outptr, ptr, w * sizeof(float)); // w维度保持不变 + // 修改前:const float* ptr = bottom_blob.channel(c).row(z * h + i); + // 修改为:使用depth()访问方式 + const float* ptr = bottom_blob.channel(c).depth(z).row(i); + float* outptr = const_cast(top_blob.channel(flipped_c).depth(flipped_d).row(h - 1 - i)); + memcpy(outptr, ptr, w * sizeof(float)); } } } @@ -520,9 +521,12 @@ int Flip::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons for (int i = 0; i < h; i++) { - const float* ptr = bottom_blob.channel(c).row(z * h + i); - float* outptr = const_cast(top_blob.channel(c).row(flipped_d * h + (h - 1 - i))); // 翻转h维度 + // const float* ptr = bottom_blob.channel(c).row(z * h + i); + // float* outptr = const_cast(top_blob.channel(c).row(flipped_d * h + (h - 1 - i))); // 翻转h维度 + // 修改为使用depth()访问方式 + const float* ptr = bottom_blob.channel(c).depth(z).row(i); + float* outptr = const_cast(top_blob.channel(c).depth(flipped_d).row(h - 1 - i)); // 翻转h维度 // 翻转w维度 for (int k = 0; k < w; k++) { diff --git a/tests/test_flip.cpp b/tests/test_flip.cpp index 7ebf787a462..e3097321b9a 100644 --- a/tests/test_flip.cpp +++ b/tests/test_flip.cpp @@ -124,9 +124,41 @@ static int test_flip_3() int main() { SRAND(7767517); - return 0 - || test_flip_0() - || test_flip_1() - || test_flip_2() - || test_flip_3(); + // return 0 + // || test_flip_0() + // || test_flip_1() + // || test_flip_2() + // || test_flip_3(); + + // debug 测出所有异常 + test_flip(RandomMat(2, 3, 4, 5), IntArrayMat(0)); + test_flip(RandomMat(3, 2, 4, 5), IntArrayMat(1)); + test_flip(RandomMat(4, 3, 2, 5), IntArrayMat(2)); + test_flip(RandomMat(2, 3, 1, 5), IntArrayMat(3)); + test_flip(RandomMat(6, 3, 4, 5), IntArrayMat(0, 1)); + test_flip(RandomMat(2, 3, 1, 6), IntArrayMat(0, 2)); + test_flip(RandomMat(5, 1, 2, 5), IntArrayMat(0, 3)); + test_flip(RandomMat(5, 2, 1, 5), IntArrayMat(1, 2)); + test_flip(RandomMat(4, 5, 2, 3), IntArrayMat(1, 3)); + test_flip(RandomMat(2, 6, 4, 5), IntArrayMat(2, 3)); + test_flip(RandomMat(6, 1, 4, 5), IntArrayMat(0, 1, 2)); + test_flip(RandomMat(5, 2, 1, 5), IntArrayMat(0, 1, 3)); + test_flip(RandomMat(4, 3, 3, 5), IntArrayMat(0, 2, 3)); + test_flip(RandomMat(4, 3, 4, 5), IntArrayMat(1, 2, 3)); + test_flip(RandomMat(6, 3, 3, 2), IntArrayMat(0, 1, 2, 3)); + + test_flip(RandomMat(2, 3, 5), IntArrayMat(0)); + test_flip(RandomMat(3, 3, 5), IntArrayMat(1)); + test_flip(RandomMat(4, 3, 5), IntArrayMat(2)); + test_flip(RandomMat(3, 1, 5), IntArrayMat(0, 1)); + test_flip(RandomMat(3, 2, 5), IntArrayMat(0, 2)); + test_flip(RandomMat(3, 3, 4), IntArrayMat(1, 2)); + test_flip(RandomMat(4, 3, 2), IntArrayMat(0, 1, 2)); + + test_flip(RandomMat(8, 2), IntArrayMat(-2)); + test_flip(RandomMat(16, 3), IntArrayMat(-1)); + test_flip(RandomMat(7, 2), IntArrayMat(-2, -1)); + + test_flip(RandomMat(18), IntArrayMat(-1)); + return 0; } \ No newline at end of file From 5bd16791c84763af79f59d0427af0c96d9c400f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=B0=E9=98=85?= 
<43716063+Baiyuetribe@users.noreply.github.com> Date: Mon, 13 Jan 2025 23:06:02 +0800 Subject: [PATCH 11/14] ctest 7 --- tests/test_flip.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_flip.cpp b/tests/test_flip.cpp index e3097321b9a..1dc0233c8db 100644 --- a/tests/test_flip.cpp +++ b/tests/test_flip.cpp @@ -160,5 +160,5 @@ int main() test_flip(RandomMat(7, 2), IntArrayMat(-2, -1)); test_flip(RandomMat(18), IntArrayMat(-1)); - return 0; + return -1; } \ No newline at end of file From 5ba56f30677f06df08b9d96ff10adaf33279d499 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=B0=E9=98=85?= <43716063+Baiyuetribe@users.noreply.github.com> Date: Mon, 13 Jan 2025 23:39:12 +0800 Subject: [PATCH 12/14] add 4d dch --- src/layer/flip.cpp | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/src/layer/flip.cpp b/src/layer/flip.cpp index 7c571ea7e2e..dbb278a8955 100644 --- a/src/layer/flip.cpp +++ b/src/layer/flip.cpp @@ -458,7 +458,7 @@ int Flip::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons } else if (axis.w == 3) { - // dch3、dcw4、chw6 + // dch3、dcw4、dhw5,chw6 int axis0 = axis_ptr[0] < 0 ? 4 + axis_ptr[0] : axis_ptr[0]; int axis1 = axis_ptr[1] < 0 ? 4 + axis_ptr[1] : axis_ptr[1]; int axis2 = axis_ptr[2] < 0 ? 4 + axis_ptr[2] : axis_ptr[2]; @@ -510,6 +510,29 @@ int Flip::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons } } } + else if (axis_sum == 5) + { + // 对应dhw,除了d外全翻转 + for (int c = 0; c < channels; c++) + { + int flipped_c = channels - 1 - c; // 翻转c维度 + + for (int z = 0; z < d; z++) // d维度保持不变 + { + for (int i = 0; i < h; i++) + { + const float* ptr = bottom_blob.channel(c).depth(z).row(i); + float* outptr = const_cast(top_blob.channel(flipped_c).depth(z).row(h - 1 - i)); // 翻转h维度 + + // 翻转w维度 + for (int k = 0; k < w; k++) + { + outptr[k] = ptr[w - 1 - k]; + } + } + } + } + } else if (axis_sum == 6) { // 对应chw,除了c外全翻转 From ad65148c298fb4196d7a7088de166ef2b8a01211 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=B0=E9=98=85?= <43716063+Baiyuetribe@users.noreply.github.com> Date: Tue, 14 Jan 2025 00:12:16 +0800 Subject: [PATCH 13/14] ctest 8 --- tests/test_flip.cpp | 42 +++--------------------- tools/pnnx/tests/ncnn/test_torch_flip.py | 8 +++-- 2 files changed, 11 insertions(+), 39 deletions(-) diff --git a/tests/test_flip.cpp b/tests/test_flip.cpp index 1dc0233c8db..7ebf787a462 100644 --- a/tests/test_flip.cpp +++ b/tests/test_flip.cpp @@ -124,41 +124,9 @@ static int test_flip_3() int main() { SRAND(7767517); - // return 0 - // || test_flip_0() - // || test_flip_1() - // || test_flip_2() - // || test_flip_3(); - - // debug 测出所有异常 - test_flip(RandomMat(2, 3, 4, 5), IntArrayMat(0)); - test_flip(RandomMat(3, 2, 4, 5), IntArrayMat(1)); - test_flip(RandomMat(4, 3, 2, 5), IntArrayMat(2)); - test_flip(RandomMat(2, 3, 1, 5), IntArrayMat(3)); - test_flip(RandomMat(6, 3, 4, 5), IntArrayMat(0, 1)); - test_flip(RandomMat(2, 3, 1, 6), IntArrayMat(0, 2)); - test_flip(RandomMat(5, 1, 2, 5), IntArrayMat(0, 3)); - test_flip(RandomMat(5, 2, 1, 5), IntArrayMat(1, 2)); - test_flip(RandomMat(4, 5, 2, 3), IntArrayMat(1, 3)); - test_flip(RandomMat(2, 6, 4, 5), IntArrayMat(2, 3)); - test_flip(RandomMat(6, 1, 4, 5), IntArrayMat(0, 1, 2)); - test_flip(RandomMat(5, 2, 1, 5), IntArrayMat(0, 1, 3)); - test_flip(RandomMat(4, 3, 3, 5), IntArrayMat(0, 2, 3)); - test_flip(RandomMat(4, 3, 4, 5), IntArrayMat(1, 2, 3)); - test_flip(RandomMat(6, 3, 3, 2), IntArrayMat(0, 1, 2, 3)); - - test_flip(RandomMat(2, 3, 5), 
IntArrayMat(0)); - test_flip(RandomMat(3, 3, 5), IntArrayMat(1)); - test_flip(RandomMat(4, 3, 5), IntArrayMat(2)); - test_flip(RandomMat(3, 1, 5), IntArrayMat(0, 1)); - test_flip(RandomMat(3, 2, 5), IntArrayMat(0, 2)); - test_flip(RandomMat(3, 3, 4), IntArrayMat(1, 2)); - test_flip(RandomMat(4, 3, 2), IntArrayMat(0, 1, 2)); - - test_flip(RandomMat(8, 2), IntArrayMat(-2)); - test_flip(RandomMat(16, 3), IntArrayMat(-1)); - test_flip(RandomMat(7, 2), IntArrayMat(-2, -1)); - - test_flip(RandomMat(18), IntArrayMat(-1)); - return -1; + return 0 + || test_flip_0() + || test_flip_1() + || test_flip_2() + || test_flip_3(); } \ No newline at end of file diff --git a/tools/pnnx/tests/ncnn/test_torch_flip.py b/tools/pnnx/tests/ncnn/test_torch_flip.py index 4c9702cc505..b07a8d297a7 100644 --- a/tools/pnnx/tests/ncnn/test_torch_flip.py +++ b/tools/pnnx/tests/ncnn/test_torch_flip.py @@ -66,6 +66,7 @@ def forward(self, x, y, z, d): z3 = torch.flip(z, [0, 1]) z4 = torch.flip(z, [0, 2]) z5 = torch.flip(z, [1, 2]) + z6 = torch.flip(z, [0, 1, 2]) # 4D d0 = torch.flip(d, [-1]) d1 = torch.flip(d, [-2]) @@ -79,8 +80,9 @@ def forward(self, x, y, z, d): d9 = torch.flip(d, [2, 3]) d10 = torch.flip(d, [0, 1, 2]) d11 = torch.flip(d, [0, 1, 3]) - d12 = torch.flip(d, [1, 2, 3]) - d13 = torch.flip(d, [0, 1, 2, 3]) + d12 = torch.flip(d, [0, 2, 3]) + d13 = torch.flip(d, [1, 2, 3]) + d14 = torch.flip(d, [0, 1, 2, 3]) return ( x0, @@ -93,6 +95,7 @@ def forward(self, x, y, z, d): z3, z4, z5, + z6, d0, d1, d2, @@ -107,6 +110,7 @@ def forward(self, x, y, z, d): d11, d12, d13, + d14, ) From b54643eeced40ecb7ebb48f9c86ccb022120a837 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=B0=E9=98=85?= <43716063+Baiyuetribe@users.noreply.github.com> Date: Tue, 14 Jan 2025 00:33:38 +0800 Subject: [PATCH 14/14] clean code --- .github/workflows/linux-x64-cpu-gcc.yml | 218 ++++++++++++------------ src/layer/flip.cpp | 41 ++--- 2 files changed, 125 insertions(+), 134 deletions(-) diff --git a/.github/workflows/linux-x64-cpu-gcc.yml b/.github/workflows/linux-x64-cpu-gcc.yml index 31abbe47c25..ab2185be3e7 100644 --- a/.github/workflows/linux-x64-cpu-gcc.yml +++ b/.github/workflows/linux-x64-cpu-gcc.yml @@ -1,33 +1,33 @@ name: linux-x64-cpu-gcc on: push: - # branches: [master] + branches: [master] paths: - - ".github/workflows/linux-x64-cpu-gcc.yml" - - "toolchains/host-c.gcc.toolchain.cmake" - - "CMakeLists.txt" - - "cmake/**" - - "src/*" - - "src/layer/*" - - "src/layer/x86/**" - - "tests/**" - - "tools/**" - - "!tools/pnnx/**" - - "examples/**" + - '.github/workflows/linux-x64-cpu-gcc.yml' + - 'toolchains/host-c.gcc.toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'tests/**' + - 'tools/**' + - '!tools/pnnx/**' + - 'examples/**' pull_request: branches: [master] paths: - - ".github/workflows/linux-x64-cpu-gcc.yml" - - "toolchains/host-c.gcc.toolchain.cmake" - - "CMakeLists.txt" - - "cmake/**" - - "src/*" - - "src/layer/*" - - "src/layer/x86/**" - - "tests/**" - - "tools/**" - - "!tools/pnnx/**" - - "examples/**" + - '.github/workflows/linux-x64-cpu-gcc.yml' + - 'toolchains/host-c.gcc.toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'tests/**' + - 'tools/**' + - '!tools/pnnx/**' + - 'examples/**' concurrency: group: linux-x64-cpu-gcc-${{ github.ref }} cancel-in-progress: true @@ -38,97 +38,97 @@ jobs: linux-gcc: runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v4 - - name: update - run: sudo apt-get update - - name: 
protobuf - run: sudo apt-get install libprotobuf-dev protobuf-compiler libopencv-dev - - name: build-sse2 - run: | - mkdir build-sse2 && cd build-sse2 - cmake -DNCNN_AVX=OFF -DNCNN_AVX2=OFF -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j $(nproc) - - name: test-sse2 - run: cd build-sse2 && ctest --output-on-failure -j $(nproc) - - name: build-shared - run: | - mkdir build-shared && cd build-shared - cmake -DNCNN_AVX2=ON -DNCNN_SHARED_LIB=ON .. - cmake --build . -j $(nproc) - - name: build-avx2 - run: | - mkdir build-avx2 && cd build-avx2 - cmake -DNCNN_AVX2=ON -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j $(nproc) - - name: test-avx2 - run: cd build-avx2 && ctest --output-on-failure -j $(nproc) - - name: build-avx - run: | - mkdir build-avx && cd build-avx - cmake -DNCNN_AVX2=OFF -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j $(nproc) - - name: test-avx - run: cd build-avx && ctest --output-on-failure -j $(nproc) - - name: build-avx1-2 - run: | - mkdir build-avx1-2 && cd build-avx1-2 - cmake -DNCNN_AVX2=ON -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j $(nproc) - - name: test-avx1-2 - run: cd build-avx1-2 && ctest --output-on-failure -j $(nproc) - - name: build-noint8 - run: | - mkdir build-noint8 && cd build-noint8 - cmake -DNCNN_INT8=OFF -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j $(nproc) - - name: test-noint8 - run: cd build-noint8 && ctest --output-on-failure -j $(nproc) + - uses: actions/checkout@v4 + - name: update + run: sudo apt-get update + - name: protobuf + run: sudo apt-get install libprotobuf-dev protobuf-compiler libopencv-dev + - name: build-sse2 + run: | + mkdir build-sse2 && cd build-sse2 + cmake -DNCNN_AVX=OFF -DNCNN_AVX2=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j $(nproc) + - name: test-sse2 + run: cd build-sse2 && ctest --output-on-failure -j $(nproc) + - name: build-shared + run: | + mkdir build-shared && cd build-shared + cmake -DNCNN_AVX2=ON -DNCNN_SHARED_LIB=ON .. + cmake --build . -j $(nproc) + - name: build-avx2 + run: | + mkdir build-avx2 && cd build-avx2 + cmake -DNCNN_AVX2=ON -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j $(nproc) + - name: test-avx2 + run: cd build-avx2 && ctest --output-on-failure -j $(nproc) + - name: build-avx + run: | + mkdir build-avx && cd build-avx + cmake -DNCNN_AVX2=OFF -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j $(nproc) + - name: test-avx + run: cd build-avx && ctest --output-on-failure -j $(nproc) + - name: build-avx1-2 + run: | + mkdir build-avx1-2 && cd build-avx1-2 + cmake -DNCNN_AVX2=ON -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j $(nproc) + - name: test-avx1-2 + run: cd build-avx1-2 && ctest --output-on-failure -j $(nproc) + - name: build-noint8 + run: | + mkdir build-noint8 && cd build-noint8 + cmake -DNCNN_INT8=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j $(nproc) + - name: test-noint8 + run: cd build-noint8 && ctest --output-on-failure -j $(nproc) linux-gcc-cpp03-nostdio-nostring-simplestl: runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v4 - - name: build-nostdio - run: | - mkdir build-nostdio && cd build-nostdio - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-c++03.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. - cmake --build . 
-j $(nproc) - - name: test-nostdio - run: cd build-nostdio && ctest --output-on-failure -j $(nproc) - - name: build-nostdio-nostring - run: | - mkdir build-nostdio-nostring && cd build-nostdio-nostring - cmake -DNCNN_STDIO=OFF -DNCNN_STRING=OFF -DNCNN_BUILD_TESTS=OFF -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. - cmake --build . -j $(nproc) - - name: build-simplestl - run: | - mkdir build-simplestl && cd build-simplestl - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.gcc.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. - cmake --build . -j $(nproc) - - name: test-simplestl - run: cd build-simplestl && ctest --output-on-failure -j $(nproc) - - name: build-simplestl-simpleomp - run: | - mkdir build-simplestl-simpleomp && cd build-simplestl-simpleomp - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.gcc.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_SIMPLEOMP=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. - cmake --build . -j $(nproc) - - name: test-simplestl-simpleomp - run: cd build-simplestl-simpleomp && ctest --output-on-failure -j $(nproc) + - uses: actions/checkout@v4 + - name: build-nostdio + run: | + mkdir build-nostdio && cd build-nostdio + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-c++03.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . -j $(nproc) + - name: test-nostdio + run: cd build-nostdio && ctest --output-on-failure -j $(nproc) + - name: build-nostdio-nostring + run: | + mkdir build-nostdio-nostring && cd build-nostdio-nostring + cmake -DNCNN_STDIO=OFF -DNCNN_STRING=OFF -DNCNN_BUILD_TESTS=OFF -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . -j $(nproc) + - name: build-simplestl + run: | + mkdir build-simplestl && cd build-simplestl + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.gcc.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . -j $(nproc) + - name: test-simplestl + run: cd build-simplestl && ctest --output-on-failure -j $(nproc) + - name: build-simplestl-simpleomp + run: | + mkdir build-simplestl-simpleomp && cd build-simplestl-simpleomp + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.gcc.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_SIMPLEOMP=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . -j $(nproc) + - name: test-simplestl-simpleomp + run: cd build-simplestl-simpleomp && ctest --output-on-failure -j $(nproc) linux-gcc-avx512: runs-on: [self-hosted, linux, t4] steps: - - uses: actions/checkout@v4 - - name: build - env: - CC: gcc - CXX: g++ - LD_LIBRARY_PATH: /data/action/install/lib64 - run: | - mkdir build && cd build - cmake -DNCNN_AVX2=ON -DNCNN_AVX512=ON -DNCNN_AVX512VNNI=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. - cmake --build . 
-j 4 - - name: test - env: - LD_LIBRARY_PATH: /data/action/install/lib64 - run: cd build && ctest --output-on-failure -j 4 + - uses: actions/checkout@v4 + - name: build + env: + CC: gcc + CXX: g++ + LD_LIBRARY_PATH: /data/action/install/lib64 + run: | + mkdir build && cd build + cmake -DNCNN_AVX2=ON -DNCNN_AVX512=ON -DNCNN_AVX512VNNI=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . -j 4 + - name: test + env: + LD_LIBRARY_PATH: /data/action/install/lib64 + run: cd build && ctest --output-on-failure -j 4 diff --git a/src/layer/flip.cpp b/src/layer/flip.cpp index dbb278a8955..ae191c4ed58 100644 --- a/src/layer/flip.cpp +++ b/src/layer/flip.cpp @@ -116,8 +116,6 @@ int Flip::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons else if (dims == 3) { top_blob.create(w, h, channels, elemsize, opt.blob_allocator); - if (top_blob.empty()) - return -100; if (axis.w == 1) { // w、h、c @@ -181,7 +179,7 @@ int Flip::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons { // 组合两种翻转:channel维度和行维度同时翻转 const float* ptr = bottom_blob.channel(channels - 1 - i).row(h - 1 - j); - float* outptr = const_cast(top_blob.channel(i).row(j)); + float* outptr = top_blob.channel(i).row(j); memcpy(outptr, ptr, w * sizeof(float)); } } @@ -253,13 +251,12 @@ int Flip::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons for (int c = 0; c < channels; c++) // 遍历channels=3 { int flipped_c = channels - 1 - c; // 计算channels翻转位置 - - for (int z = 0; z < d; z++) // 遍历d=2维度 + for (int z = 0; z < d; z++) // 遍历d=2维度 { for (int j = 0; j < h; j++) // 遍历行 { const float* ptr = bottom_blob.channel(c).row(z * h + j); - float* outptr = const_cast(top_blob.channel(flipped_c).row(z * h + j)); + float* outptr = top_blob.channel(flipped_c).row(z * h + j); memcpy(outptr, ptr, w * sizeof(float)); } } @@ -276,7 +273,7 @@ int Flip::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons { // 翻转d维度的数据读取位置 const float* ptr = bottom_blob.channel(i).row((d - 1 - z) * h + j); - float* outptr = const_cast(top_blob.channel(i).row(z * h + j)); + float* outptr = top_blob.channel(i).row(z * h + j); // 逐行复制w元素 memcpy(outptr, ptr, w * sizeof(float)); } @@ -339,7 +336,7 @@ int Flip::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons for (int j = 0; j < h; j++) // 遍历行 { const float* ptr = bottom_blob.channel(c).row(z * h + j); - float* outptr = const_cast(top_blob.channel(flipped_c).row(flipped_d * h + j)); + float* outptr = top_blob.channel(flipped_c).row(flipped_d * h + j); memcpy(outptr, ptr, w * sizeof(float)); } } @@ -358,8 +355,8 @@ int Flip::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons for (int i = 0; i < h; i++) { const float* ptr = bottom_blob.channel(c).row(z * h + i); - float* outptr = const_cast(top_blob.channel(flipped_c).row(z * h + (h - 1 - i))); // 保持z维度顺序,翻转h维度 - memcpy(outptr, ptr, w * sizeof(float)); // 按行复制,保持 w 维度顺序 + float* outptr = top_blob.channel(flipped_c).row(z * h + (h - 1 - i)); // 保持z维度顺序,翻转h维度 + memcpy(outptr, ptr, w * sizeof(float)); // 按行复制,保持 w 维度顺序 } } } @@ -379,7 +376,7 @@ int Flip::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons for (int j = 0; j < h; j++) // h维度保持不变 { const float* ptr = bottom_blob.channel(c).row(z * h + j); - float* outptr = const_cast(top_blob.channel(flipped_c).row(z * h + j)); + float* outptr = top_blob.channel(flipped_c).row(z * h + j); // 翻转w维度 for (int k = 0; k < w; k++) @@ -404,7 +401,7 @@ int Flip::forward(const Mat& 
bottom_blob, Mat& top_blob, const Option& opt) cons int flipped_h = h - 1 - j; // 读取源数据 const float* ptr = bottom_blob.channel(c).row(z * h + j); - float* outptr = const_cast(top_blob.channel(c).row(flipped_d * h + flipped_h)); + float* outptr = top_blob.channel(c).row(flipped_d * h + flipped_h); memcpy(outptr, ptr, w * sizeof(float)); } } @@ -423,7 +420,7 @@ int Flip::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons for (int j = 0; j < h; j++) { const float* ptr = bottom_blob.channel(c).row(z * h + j); - float* outptr = const_cast(top_blob.channel(c).row(flipped_d * h + j)); // c维度保持不变 + float* outptr = top_blob.channel(c).row(flipped_d * h + j); // c维度保持不变 // 翻转 w 维度 for (int k = 0; k < w; k++) @@ -444,7 +441,7 @@ int Flip::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons for (int j = 0; j < h; j++) { const float* ptr = bottom_blob.channel(c).row(z * h + j); - float* outptr = const_cast(top_blob.channel(c).row(z * h + (h - 1 - j))); // 翻转 h 维度 + float* outptr = top_blob.channel(c).row(z * h + (h - 1 - j)); // 翻转 h 维度 // 翻转 w 维度 for (int k = 0; k < w; k++) @@ -476,10 +473,8 @@ int Flip::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons for (int i = 0; i < h; i++) { - // 修改前:const float* ptr = bottom_blob.channel(c).row(z * h + i); - // 修改为:使用depth()访问方式 const float* ptr = bottom_blob.channel(c).depth(z).row(i); - float* outptr = const_cast(top_blob.channel(flipped_c).depth(flipped_d).row(h - 1 - i)); + float* outptr = top_blob.channel(flipped_c).depth(flipped_d).row(h - 1 - i); memcpy(outptr, ptr, w * sizeof(float)); } } @@ -499,7 +494,7 @@ int Flip::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons for (int i = 0; i < h; i++) { const float* ptr = bottom_blob.channel(c).row(z * h + i); - float* outptr = const_cast(top_blob.channel(flipped_c).row(flipped_d * h + i)); // h维度保持不变 + float* outptr = top_blob.channel(flipped_c).row(flipped_d * h + i); // h维度保持不变 // 翻转w维度 for (int k = 0; k < w; k++) @@ -522,7 +517,7 @@ int Flip::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons for (int i = 0; i < h; i++) { const float* ptr = bottom_blob.channel(c).depth(z).row(i); - float* outptr = const_cast(top_blob.channel(flipped_c).depth(z).row(h - 1 - i)); // 翻转h维度 + float* outptr = top_blob.channel(flipped_c).depth(z).row(h - 1 - i); // 翻转h维度 // 翻转w维度 for (int k = 0; k < w; k++) @@ -544,12 +539,8 @@ int Flip::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons for (int i = 0; i < h; i++) { - // const float* ptr = bottom_blob.channel(c).row(z * h + i); - // float* outptr = const_cast(top_blob.channel(c).row(flipped_d * h + (h - 1 - i))); // 翻转h维度 - - // 修改为使用depth()访问方式 const float* ptr = bottom_blob.channel(c).depth(z).row(i); - float* outptr = const_cast(top_blob.channel(c).depth(flipped_d).row(h - 1 - i)); // 翻转h维度 + float* outptr = top_blob.channel(c).depth(flipped_d).row(h - 1 - i); // 翻转h维度 // 翻转w维度 for (int k = 0; k < w; k++) { @@ -574,7 +565,7 @@ int Flip::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons for (int i = 0; i < h; i++) { const float* ptr = bottom_blob.channel(c).row(z * h + i); - float* outptr = const_cast(top_blob.channel(flipped_c).row(flipped_d * h + (h - 1 - i))); // 翻转h维度 + float* outptr = top_blob.channel(flipped_c).row(flipped_d * h + (h - 1 - i)); // 翻转h维度 // 翻转w维度 for (int k = 0; k < w; k++)
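// Note: the const_cast wrappers stripped out in this final clean-up were never
// required: top_blob is received as a non-const Mat&, and ncnn::Mat::channel(),
// depth() and row() all have non-const overloads, so an expression such as
//   float* outptr = top_blob.channel(flipped_c).depth(z).row(h - 1 - i);
// already yields a writable pointer, as in the surrounding code. Only
// bottom_blob, which is const, goes through the const float* path.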